diff --git a/.github/workflows/circleci.yml b/.github/workflows/circleci.yml
deleted file mode 100644
index 1347ba3a0f7c..000000000000
--- a/.github/workflows/circleci.yml
+++ /dev/null
@@ -1,25 +0,0 @@
-# To enable this workflow on a fork, comment out:
-#
-# if: github.repository == 'numpy/numpy'
-
-name: CircleCI artifact redirector
-
-on: [status]
-
-permissions: read-all
-
-jobs:
-  circleci_artifacts_redirector_job:
-    runs-on: ubuntu-latest
-    if: "github.repository == 'numpy/numpy' && !contains(github.event.head_commit.message, '[circle skip]') && !contains(github.event.head_commit.message, '[skip circle]')  && github.event.context == 'ci/circleci: build'"
-    name: Run CircleCI artifacts redirector
-    permissions:
-      statuses: write
-    steps:
-      - name: GitHub Action step
-        uses: larsoner/circleci-artifacts-redirector-action@328d16f501600fcb4535e1024a538077cd333ea8 # master
-        with:
-          repo-token: ${{ secrets.GITHUB_TOKEN }}
-          api-token: ${{ secrets.CIRCLE_TOKEN }}
-          artifact-path: 0/doc/build/html/index.html
-          circleci-jobs: build
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
deleted file mode 100644
index 08bcf63fdb75..000000000000
--- a/.github/workflows/codeql.yml
+++ /dev/null
@@ -1,73 +0,0 @@
-# For most projects, this workflow file will not need changing; you simply need
-# to commit it to your repository.
-#
-# You may wish to alter this file to override the set of languages analyzed,
-# or to provide custom queries or build logic.
-#
-# ******** NOTE ********
-# We have attempted to detect the languages in your repository. Please check
-# the `language` matrix defined below to confirm you have the correct set of
-# supported CodeQL languages.
-#
-name: "CodeQL"
-
-on:
-  push:
-    branches: ["main"]
-  pull_request:
-    # The branches below must be a subset of the branches above
-    branches: ["main"]
-  schedule:
-    - cron: "0 0 * * 1"
-
-permissions:
-  contents: read
-
-jobs:
-  analyze:
-    name: Analyze
-    runs-on: ubuntu-latest
-    permissions:
-      actions: read
-      contents: read
-      security-events: write
-
-    strategy:
-      fail-fast: false
-      matrix:
-        language: ["python"]
-        # CodeQL supports [ $supported-codeql-languages ]
-        # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
-
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0
-
-      # Initializes the CodeQL tools for scanning.
-      - name: Initialize CodeQL
-        uses: github/codeql-action/init@cdcdbb579706841c47f7063dda365e292e5cad7a # v2.13.4
-        with:
-          languages: ${{ matrix.language }}
-          # If you wish to specify custom queries, you can do so here or in a config file.
-          # By default, queries listed here will override any specified in a config file.
-          # Prefix the list here with "+" to use these queries and those in the config file.
-
-      # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
-      # If this step fails, then you should remove it and run the build manually (see below)
-      - name: Autobuild
-        uses: github/codeql-action/autobuild@cdcdbb579706841c47f7063dda365e292e5cad7a # v2.13.4
-
-      # ℹ️ Command-line programs to run using the OS shell.
-      # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
-
-      #   If the Autobuild fails above, remove it and uncomment the following three lines.
-      #   modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
-
-      # - run: |
-      #   echo "Run, Build Application using script"
-      #   ./location_of_script_within_repo/buildscript.sh
-
-      - name: Perform CodeQL Analysis
-        uses: github/codeql-action/analyze@cdcdbb579706841c47f7063dda365e292e5cad7a # v2.13.4
-        with:
-          category: "/language:${{matrix.language}}"
diff --git a/.github/workflows/cygwin.yml b/.github/workflows/cygwin.yml
deleted file mode 100644
index b8226171db3f..000000000000
--- a/.github/workflows/cygwin.yml
+++ /dev/null
@@ -1,82 +0,0 @@
-# To enable this workflow on a fork, comment out:
-#
-# if: github.repository == 'numpy/numpy'
-name: Test on Cygwin
-on:
-  pull_request:
-    branches:
-      - main
-      - maintenance/**
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read # to fetch code (actions/checkout)
-
-jobs:
-  cygwin_build_test:
-    runs-on: windows-latest
-    if: "github.repository == 'numpy/numpy'"
-    steps:
-      - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-        with:
-          submodules: recursive
-          fetch-depth: 0
-      - name: Install Cygwin
-        uses: egor-tensin/setup-cygwin@d2c752bab416d4b0662591bd366fc2686297c82d   #v4
-        with:
-          platform: x86_64
-          install-dir: 'C:\tools\cygwin'
-          packages: >-
-            python39=3.9.16-1 python39-devel=3.9.16-1 python39-pip python-pip-wheel
-            python-setuptools-wheel liblapack-devel liblapack0 gcc-fortran
-            gcc-g++ git dash cmake ninja
-      - name: Set Windows PATH
-        uses: egor-tensin/cleanup-path@f04bc953e6823bf491cc0bdcff959c630db1b458 # v4.0.1
-        with:
-          dirs: 'C:\tools\cygwin\bin;C:\tools\cygwin\lib\lapack'
-      - name: Verify that bash is Cygwin bash
-        run: |
-          command bash
-          bash -c "uname -svrmo"
-      - name: Tell Cygwin's git about this repository.
-        run: |
-          dash -c "which git; /usr/bin/git config --system --add safe.directory /cygdrive/d/a/numpy/numpy"
-      - name: Verify python version
-        # Make sure it's the Cygwin one, not a Windows one
-        run: |
-          dash -c "which python3.9; /usr/bin/python3.9 --version -V"
-      - name: Build NumPy wheel
-        run: |
-          dash -c "/usr/bin/python3.9 -m pip install build pytest hypothesis pytest-xdist Cython meson"
-          dash -c "/usr/bin/python3.9 -m build . --wheel -Csetup-args=-Dblas=blas -Csetup-args=-Dlapack=lapack -Csetup-args=-Dcpu-dispatch=none -Csetup-args=-Dcpu-baseline=native"
-      - name: Install NumPy from wheel
-        run: |
-          bash -c "/usr/bin/python3.9 -m pip install dist/numpy-*cp39*.whl"
-      - name: Rebase NumPy compiled extensions
-        run: |
-          dash "tools/rebase_installed_dlls_cygwin.sh" 3.9
-      - name: Run NumPy test suite
-        shell: "C:\\tools\\cygwin\\bin\\bash.exe -o igncr -eo pipefail {0}"
-        run: |
-          cd tools
-          /usr/bin/python3.9 -m pytest --pyargs numpy -n2 -m "not slow"
-      - name: Upload wheel if tests fail
-        uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0
-        if: failure()
-        with:
-          name: numpy-cygwin-wheel
-          path: dist/numpy-*cp39*.whl
-      - name: Check the extension modules on failure
-        if: failure()
-        run: |
-          dash -c "/usr/bin/python3.9 -m pip show numpy"
-          dash -c "/usr/bin/python3.9 -m pip show -f numpy | grep .dll"
-          dash -c "/bin/tr -d '\r' <tools/list_installed_dll_dependencies_cygwin.sh >list_dlls_unix.sh"
-          dash "list_dlls_unix.sh" 3.9
-      - name: Print installed package versions on failure
-        if: failure()
-        run: |
-          cygcheck -c
diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml
deleted file mode 100644
index 07127a7f6cb5..000000000000
--- a/.github/workflows/dependency-review.yml
+++ /dev/null
@@ -1,20 +0,0 @@
-# Dependency Review Action
-#
-# This Action will scan dependency manifest files that change as part of a Pull Request, surfacing known-vulnerable versions of the packages declared or updated in the PR. Once installed, if the workflow run is marked as required, PRs introducing known-vulnerable packages will be blocked from merging.
-#
-# Source repository: https://github.com/actions/dependency-review-action
-# Public documentation: https://docs.github.com/en/code-security/supply-chain-security/understanding-your-software-supply-chain/about-dependency-review#dependency-review-enforcement
-name: 'Dependency Review'
-on: [pull_request]
-
-permissions:
-  contents: read
-
-jobs:
-  dependency-review:
-    runs-on: ubuntu-latest
-    steps:
-      - name: 'Checkout Repository'
-        uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0
-      - name: 'Dependency Review'
-        uses: actions/dependency-review-action@1360a344ccb0ab6e9475edef90ad2f46bf8003b1 # v3.0.6
diff --git a/.github/workflows/emscripten.yml b/.github/workflows/emscripten.yml
deleted file mode 100644
index 94e4809074c1..000000000000
--- a/.github/workflows/emscripten.yml
+++ /dev/null
@@ -1,80 +0,0 @@
-# To enable this workflow on a fork, comment out:
-#
-# if: github.repository == 'numpy/numpy'
-name: Test Emscripten/Pyodide build
-
-on:
-  pull_request:
-    branches:
-      - main
-      - maintenance/**
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read # to fetch code (actions/checkout)
-
-jobs:
-  build-wasm-emscripten:
-    runs-on: ubuntu-22.04
-    if: "github.repository == 'numpy/numpy'"
-    env:
-      PYODIDE_VERSION: 0.23.1
-      # PYTHON_VERSION and EMSCRIPTEN_VERSION are determined by PYODIDE_VERSION.
-      # The appropriate versions can be found in the Pyodide repodata.json
-      # "info" field, or in Makefile.envs:
-      # https://github.com/pyodide/pyodide/blob/main/Makefile.envs#L2
-      PYTHON_VERSION: 3.11.2
-      EMSCRIPTEN_VERSION: 3.1.32
-      NODE_VERSION: 18
-    steps:
-      - name: Checkout numpy
-        uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0
-        with:
-          submodules: true
-          # versioneer.py requires the latest tag to be reachable. Here we
-          # fetch the complete history to get access to the tags.
-          # A shallow clone can work when the following issue is resolved:
-          # https://github.com/actions/checkout/issues/338
-          fetch-depth: 0
-
-      - name: set up python
-        id: setup-python
-        uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
-        with:
-          python-version: ${{ env.PYTHON_VERSION }}
-
-      - uses: mymindstorm/setup-emsdk@ab889da2abbcbb280f91ec4c215d3bb4f3a8f775 # v12
-        with:
-          version: ${{ env.EMSCRIPTEN_VERSION }}
-          actions-cache-folder: emsdk-cache
-
-      - name: Install pyodide-build
-        run: pip install "pydantic<2" pyodide-build==$PYODIDE_VERSION
-
-      - name: Build
-        run: |
-          # Pyodide is still in the process of adding better/easier support for
-          # non-setup.py based builds.
-          cp pyproject.toml.setuppy pyproject.toml
-          CFLAGS=-g2 LDFLAGS=-g2 pyodide build
-
-      - name: set up node
-        uses: actions/setup-node@b39b52d1213e96004bfcb1c61a8a6fa8ab84f3e8 # v4.0.1
-        with:
-          node-version: ${{ env.NODE_VERSION }}
-
-      - name: Set up Pyodide virtual environment
-        run: |
-          pyodide venv .venv-pyodide
-          source .venv-pyodide/bin/activate
-          pip install dist/*.whl
-          python -c "import sys; print(sys.platform)"
-          pip install -r test_requirements.txt
-      - name: Test
-        run: |
-          source .venv-pyodide/bin/activate
-          cd ..
-          python numpy/runtests.py -n -vv
diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml
deleted file mode 100644
index 9ceafebb711a..000000000000
--- a/.github/workflows/labeler.yml
+++ /dev/null
@@ -1,19 +0,0 @@
-name: "Pull Request Labeler"
-on:
-  pull_request_target:
-    types: [opened]
-
-permissions: {}
-
-jobs:
-  pr-labeler:
-    runs-on: ubuntu-latest
-    permissions: 
-      pull-requests: write  # to add labels
-    steps:
-    - name: Label the PR
-      uses: gerrymanoim/pr-prefix-labeler@c8062327f6de59a9ae1c19f7f07cacd0b976b6fa # v3
-      continue-on-error: true
-      env:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      if: github.repository == 'numpy/numpy'
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
deleted file mode 100644
index ebceec365d7c..000000000000
--- a/.github/workflows/linux.yml
+++ /dev/null
@@ -1,239 +0,0 @@
-name: Linux tests
-
-# This file is meant for testing across supported Python versions, build types
-# and interpreters (PyPy, python-dbg, a pre-release Python in summer time),
-# build-via-sdist, run benchmarks, measure code coverage, and other build
-# options like relaxed-strides.
-
-on:
-  push:
-    branches:
-      # coverage comparison in the "full" step needs to run on main after merges
-      - main
-  pull_request:
-    branches:
-      - main
-      - maintenance/**
-
-defaults:
-  run:
-    shell: bash
-
-env:
-  DOWNLOAD_OPENBLAS: 1
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read # to fetch code (actions/checkout)
-
-jobs:
-  lint:
-    if: github.repository == 'numpy/numpy' && github.event_name != 'push'
-    runs-on: ubuntu-latest
-    continue-on-error: true
-    steps:
-    - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
-      with:
-        python-version: '3.9'
-    - name: Install linter requirements
-      run:
-        python -m pip install -r linter_requirements.txt
-    - name: Run linter on PR diff
-      run:
-        python tools/linter.py --branch origin/${{ github.base_ref }}
-
-  smoke_test:
-    if: "github.repository == 'numpy/numpy'"
-    runs-on: ubuntu-latest
-    env:
-      MESON_ARGS: "-Dallow-noblas=true -Dcpu-baseline=none -Dcpu-dispatch=none"
-    steps:
-    - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
-      with:
-        python-version: '3.9'
-    - uses: ./.github/meson_actions
-
-  basic:
-    needs: [smoke_test]
-    runs-on: ubuntu-latest
-    if: github.event_name != 'push'
-    strategy:
-      matrix:
-        python-version: ["3.9", "pypy3.9-v7.3.12"]
-    env:
-      EXPECT_CPU_FEATURES: "SSE SSE2 SSE3 SSSE3 SSE41 POPCNT SSE42 AVX F16C FMA3 AVX2 AVX512F AVX512CD AVX512_KNL AVX512_KNM AVX512_SKX AVX512_CLX AVX512_CNL AVX512_ICL"
-    steps:
-    - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
-      with:
-        python-version: ${{ matrix.python-version }}
-    - uses: ./.github/actions
-
-  debug:
-    needs: [smoke_test]
-    runs-on: ubuntu-latest
-    if: github.event_name != 'push'
-    env:
-      USE_DEBUG: 1
-    steps:
-    - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
-      with:
-        python-version: '3.11'
-
-    - uses: ./.github/actions
-
-  full:
-    # Build a wheel, install it, then run the full test suite with code coverage
-    needs: [smoke_test]
-    runs-on: ubuntu-22.04
-    steps:
-    - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
-      with:
-        python-version: '3.9'
-    - name: Install build and test dependencies from PyPI
-      run: |
-        pip install -r build_requirements.txt
-        pip install -r test_requirements.txt
-    - name: Install gfortran and OpenBLAS (MacPython build)
-      run: |
-        set -xe
-        sudo apt install gfortran libgfortran5
-        target=$(python tools/openblas_support.py)
-        sudo cp -r $target/lib/* /usr/lib
-        sudo cp $target/include/* /usr/include
-    - name: Build a wheel
-      run: |
-        python -m build --wheel --no-isolation --skip-dependency-check
-        pip install dist/numpy*.whl
-    - name: Run full test suite
-      run: |
-        cd tools
-        pytest --pyargs numpy --cov-report=html:build/coverage
-        # TODO: gcov
-
-  benchmark:
-    needs: [smoke_test]
-    runs-on: ubuntu-latest
-    if: github.event_name != 'push'
-    env:
-      PYTHONOPTIMIZE: 2
-      BLAS: None
-      LAPACK: None
-      ATLAS: None
-      NPY_BLAS_ORDER: mkl,blis,openblas,atlas,blas
-      NPY_LAPACK_ORDER: MKL,OPENBLAS,ATLAS,LAPACK
-      USE_ASV: 1
-    steps:
-    - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
-      with:
-        python-version: '3.9'
-    - uses: ./.github/actions
-
-  relaxed_strides_debug:
-    needs: [smoke_test]
-    runs-on: ubuntu-latest
-    if: github.event_name != 'push'
-    env:
-      CHECK_BLAS: 1
-      NPY_USE_BLAS_ILP64: 1
-      NPY_RELAXED_STRIDES_DEBUG: 1
-    steps:
-    - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
-      with:
-        python-version: '3.11'
-    - uses: ./.github/actions
-
-  sdist:
-    needs: [smoke_test]
-    runs-on: ubuntu-latest
-    if: github.event_name != 'push'
-    steps:
-    - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
-      with:
-        python-version: '3.11'
-    - name: Install gfortran and OpenBLAS (MacPython build)
-      run: |
-        set -xe
-        sudo apt install gfortran libgfortran5
-        target=$(python tools/openblas_support.py)
-        sudo cp -r $target/lib/* /usr/lib
-        sudo cp $target/include/* /usr/include
-    - name: Build a wheel via an sdist
-      run: |
-        pip install build
-        python -m build
-        pip install dist/numpy*.whl
-    - name: Install test dependencies
-      run: |
-        pip install -r test_requirements.txt
-        pip install ninja
-    - name: Run test suite
-      run: |
-        cd tools
-        pytest --pyargs numpy -m "not slow"
-
-  custom_checks:
-    needs: [smoke_test]
-    runs-on: ubuntu-latest
-    if: github.event_name != 'push'
-    steps:
-    - uses: actions/checkout@f43a0e5ff2bd294095638e18286ca9a3d1956744 # v3.6.0
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
-      with:
-        python-version: '3.11'
-    - name: Install build and test dependencies from PyPI
-      run: |
-        pip install -r build_requirements.txt
-        pip install -r test_requirements.txt
-        pip install vulture
-    - name: Build and install NumPy
-      run: |
-        # Install using the fastests way to build (no BLAS, no SIMD)
-        spin build -j2 -- -Dallow-noblas=true -Dcpu-baseline=none -Dcpu-dispatch=none
-    - name: Check build-internal dependencies
-      run: |
-        ninja -C build -t missingdeps
-    - name: Check installed test and stub files
-      run: |
-        python tools/check_installed_files.py $(find ./build-install -path '*/site-packages/numpy')
-    - name: Check for unreachable code paths in Python modules
-      run: |
-        # Need the explicit `bash -c` here because `grep` returns exit code 1 for no matches
-        bash -c "! vulture . --min-confidence 100 --exclude doc/,numpy/distutils/,vendored-meson/ | grep 'unreachable'"
diff --git a/.github/workflows/linux_blas.yml b/.github/workflows/linux_blas.yml
deleted file mode 100644
index aac37d205e9d..000000000000
--- a/.github/workflows/linux_blas.yml
+++ /dev/null
@@ -1,395 +0,0 @@
-name: BLAS tests (Linux)
-
-# This file is meant for testing different BLAS/LAPACK flavors and build
-# options on Linux. All other yml files for Linux will only test without BLAS
-# (mostly because that's easier and faster to build) or with the same 64-bit
-# OpenBLAS build that is used in the wheel jobs.
-#
-# Jobs and their purpose:
-#
-#   - openblas64_setuppy:
-#         This job uses the default 64-bit build of OpenBLAS with the
-#         `numpy.distutils`-based build. It can be removed once we remove
-#         support for those builds.
-#   - openblas32_stable_nightly:
-#         Uses the 32-bit OpenBLAS builds, both the latest stable release
-#         and a nightly build.
-#   - openblas_no_pkgconfig_fedora:
-#         Test OpenBLAS on Fedora. Fedora doesn't ship .pc files for OpenBLAS,
-#         hence this exercises the "system dependency" detection method.
-#   - flexiblas_fedora:
-#         Tests FlexiBLAS (the default on Fedora for its own packages), via
-#         pkg-config. FlexiBLAS allows runtime switching of BLAS/LAPACK
-#         libraries, which is a useful capability (not tested in this job).
-#   - openblas_cmake:
-#         Tests whether OpenBLAS LP64 is detected correctly when only CMake
-#         and not pkg-config is installed.
-#   - netlib-debian:
-#         Installs libblas/liblapack, which in Debian contains libcblas within
-#         libblas.
-#   - netlib-split:
-#         Installs vanilla Netlib blas/lapack with separate libcblas, which is
-#         the last option tried in auto-detection.
-#   - mkl:
-#         Tests MKL installed from PyPI (because easiest/fastest, if broken) in
-#         3 ways: both LP64 and ILP64 via pkg-config, and then using the
-#         Single Dynamic Library (SDL, or `libmkl_rt`).
-#   - blis:
-#         Simple test for LP64 via pkg-config
-#   - atlas:
-#         Simple test for LP64 via pkg-config
-
-on:
-  pull_request:
-    branches:
-      - main
-      - maintenance/**
-
-defaults:
-  run:
-    shell: bash
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read # to fetch code (actions/checkout)
-
-jobs:
-  openblas32_stable_nightly:
-    if: "github.repository == 'numpy/numpy'"
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        USE_NIGHTLY_OPENBLAS: [false, true]
-    env:
-      USE_NIGHTLY_OPENBLAS: ${{ matrix.USE_NIGHTLY_OPENBLAS }}
-    name: "Test Linux (${{ matrix.USE_NIGHTLY_OPENBLAS && 'nightly' || 'stable' }} OpenBLAS)"
-    steps:
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # v4.7.1
-      with:
-        python-version: '3.11'
-
-    - name: Install dependencies
-      run: |
-        pip install -r build_requirements.txt
-        # Install OpenBLAS
-        set -xe
-        if [[ $USE_NIGHTLY_OPENBLAS == "true" ]]; then
-          target=$(python tools/openblas_support.py --nightly)
-        else
-          target=$(python tools/openblas_support.py)
-        fi
-        sudo cp -r $target/lib/* /usr/lib
-        sudo cp $target/include/* /usr/include
-
-    - name: Build
-      shell: 'script -q -e -c "bash --noprofile --norc -eo pipefail {0}"'
-      env:
-        TERM: xterm-256color
-      run:
-        spin build -- --werror -Dallow-noblas=false
-
-    - name: Check build-internal dependencies
-      run:
-        ninja -C build -t missingdeps
-
-    - name: Check installed test and stub files
-      run:
-        python tools/check_installed_files.py $(find ./build-install -path '*/site-packages/numpy')
-
-    - name: Test
-      shell: 'script -q -e -c "bash --noprofile --norc -eo pipefail {0}"'
-      env:
-        TERM: xterm-256color
-        LD_LIBRARY_PATH: "/usr/local/lib/"  # to find libopenblas.so.0
-
-      run: |
-        pip install pytest pytest-xdist hypothesis typing_extensions
-        spin test -j auto
-
-
-  openblas_no_pkgconfig_fedora:
-    if: "github.repository == 'numpy/numpy'"
-    runs-on: ubuntu-latest
-    container: fedora:39
-    name: "OpenBLAS (Fedora, no pkg-config, LP64/ILP64)"
-    steps:
-    - name: Install system dependencies
-      run: |
-        dnf install git gcc-gfortran g++ python3-devel openblas-devel -y
-
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-
-    - name: Install dependencies
-      run: |
-        pip install -r build_requirements.txt
-        pip install pytest hypothesis typing_extensions
-
-    - name: Build (LP64)
-      run: spin build -- -Dblas=openblas -Dlapack=openblas -Ddisable-optimization=true -Dallow-noblas=false
-
-    - name: Test
-      run: spin test -- numpy/linalg
-
-    - name: Build (ILP64)
-      run: |
-        rm -rf build
-        spin build -- -Duse-ilp64=true -Ddisable-optimization=true -Dallow-noblas=false
-
-    - name: Test
-      run: spin test -- numpy/linalg
-
-
-  flexiblas_fedora:
-    if: "github.repository == 'numpy/numpy'"
-    runs-on: ubuntu-latest
-    container: fedora:39
-    name: "FlexiBLAS (LP64, ILP64 on Fedora)"
-    steps:
-    - name: Install system dependencies
-      run: |
-        dnf install git gcc-gfortran g++ python3-devel flexiblas-devel -y
-
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-
-    - name: Install dependencies
-      run: |
-        pip install -r build_requirements.txt
-        pip install pytest hypothesis typing_extensions
-
-    - name: Build
-      run: spin build -- -Ddisable-optimization=true -Dallow-noblas=false
-
-    - name: Test
-      run: spin test -- numpy/linalg
-
-    - name: Build (ILP64)
-      run: |
-        rm -rf build
-        spin build -- -Ddisable-optimization=true -Duse-ilp64=true -Dallow-noblas=false
-
-    - name: Test (ILP64)
-      run: spin test -- numpy/linalg
-
-
-  openblas_cmake:
-    if: "github.repository == 'numpy/numpy'"
-    runs-on: ubuntu-latest
-    name: "OpenBLAS with CMake"
-    steps:
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # v4.7.1
-      with:
-        python-version: '3.11'
-
-    - name: Install dependencies
-      run: |
-        pip install -r build_requirements.txt
-        pip install pytest pytest-xdist hypothesis typing_extensions
-        sudo apt-get install libopenblas-dev cmake
-        sudo apt-get remove pkg-config
-
-    - name: Build
-      run: spin build -- -Ddisable-optimization=true -Dallow-noblas=false
-
-    - name: Test
-      run: spin test -j auto -- numpy/linalg
-
- 
-  netlib-debian:
-    if: "github.repository == 'numpy/numpy'"
-    runs-on: ubuntu-latest
-    name: "Debian libblas/liblapack"
-    steps:
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # v4.7.1
-      with:
-        python-version: '3.11'
-
-    - name: Install dependencies
-      run: |
-        pip install -r build_requirements.txt
-        sudo apt-get install liblapack-dev pkg-config
-
-    - name: Build
-      run: |
-        spin build -- -Ddisable-optimization=true -Dallow-noblas=false
-
-    - name: Test
-      run: |
-        pip install pytest pytest-xdist hypothesis typing_extensions
-        spin test -j auto -- numpy/linalg
-
-
-  netlib-split:
-    if: "github.repository == 'numpy/numpy'"
-    runs-on: ubuntu-latest
-    container: opensuse/tumbleweed
-    name: "OpenSUSE Netlib BLAS/LAPACK"
-    steps:
-    - name: Install system dependencies
-      run: |
-        # No blas.pc on OpenSUSE as of Nov 2023, so no need to install pkg-config.
-        # If it is needed in the future, use install name `pkgconf-pkg-config`
-        zypper install -y git gcc-c++ python3-pip python3-devel blas cblas lapack
-
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-
-    - name: Install PyPI dependencies
-      run: |
-        pip install --break-system-packages -r build_requirements.txt
-
-    - name: Build
-      run: |
-        spin build -- -Dblas=blas -Dlapack=lapack -Ddisable-optimization=true -Dallow-noblas=false
-
-    - name: Test
-      run: |
-        pip install --break-system-packages pytest pytest-xdist hypothesis typing_extensions
-        spin test -j auto -- numpy/linalg
-
-
-  mkl:
-    if: "github.repository == 'numpy/numpy'"
-    runs-on: ubuntu-latest
-    name: "MKL (LP64, ILP64, SDL)"
-    steps:
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # v4.7.1
-      with:
-        python-version: '3.11'
-
-    - name: Install dependencies
-      run: |
-        pip install -r build_requirements.txt
-        pip install pytest pytest-xdist hypothesis typing_extensions
-        pip install mkl mkl-devel
-
-    - name: Repair MKL pkg-config files and symlinks
-      run: |
-        # MKL 2023.2 works when installed from conda-forge (except for `-iomp`
-        # and `-tbb` pkg-config files), Spack, or with the standalone Intel
-        # installer. The standalone installer is the worst option, since it's
-        # large and clumsy to install and requires running a setvars.sh script
-        # before things work. The PyPI MKL packages are broken and need the
-        # fixes in this step. For details, see
-        # https://github.com/conda-forge/intel_repack-feedstock/issues/34
-        cd $Python3_ROOT_DIR/lib/pkgconfig
-        sed -i 's/\/intel64//g' mkl*.pc
-        # add the expected .so -> .so.2 symlinks to fix linking
-        cd ..
-        for i in $( ls libmkl*.so.2 ); do ln -s $i ${i%.*}; done
-
-    - name: Build with defaults (LP64)
-      run: |
-        pkg-config --libs mkl-dynamic-lp64-seq  # check link flags
-        spin build -- -Ddisable-optimization=true -Dallow-noblas=false
-
-    - name: Test
-      run: spin test -- numpy/linalg
-
-    - name: Build with ILP64
-      run: |
-        git clean -xdf > /dev/null
-        pkg-config --libs mkl-dynamic-ilp64-seq
-        spin build -- -Duse-ilp64=true -Ddisable-optimization=true -Dallow-noblas=false
-
-    - name: Test
-      run: spin test -- numpy/linalg
-
-    - name: Build without pkg-config (default options, SDL)
-      run: |
-        git clean -xdf > /dev/null
-        pushd $Python3_ROOT_DIR/lib/pkgconfig
-        rm mkl*.pc
-        popd
-        export MKLROOT=$Python3_ROOT_DIR
-        spin build -- -Ddisable-optimization=true -Dallow-noblas=false
-
-    - name: Test
-      run: spin test -- numpy/linalg
-
-  blis:
-    if: "github.repository == 'numpy/numpy'"
-    runs-on: ubuntu-latest
-    name: "BLIS"
-    steps:
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # v4.7.1
-      with:
-        python-version: '3.11'
-
-    - name: Install dependencies
-      run: |
-        pip install -r build_requirements.txt
-        pip install pytest pytest-xdist hypothesis typing_extensions
-        sudo apt-get install libblis-dev libopenblas-dev pkg-config
-
-    - name: Add BLIS pkg-config file
-      run: |
-        # Needed because blis.pc missing in Debian:
-        # https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=989076
-        # The alternative here would be to use another distro or Miniforge
-        sudo cp tools/ci/_blis_debian.pc /usr/lib/x86_64-linux-gnu/pkgconfig/blis.pc
-        # Check if the patch works:
-        pkg-config --libs blis
-        pkg-config --cflags blis
-
-    - name: Build
-      run: spin build -- -Dblas=blis -Ddisable-optimization=true -Dallow-noblas=false
-
-    - name: Test
-      run: spin test -- numpy/linalg
-
-  atlas:
-    if: "github.repository == 'numpy/numpy'"
-    runs-on: ubuntu-latest
-    name: "ATLAS"
-    steps:
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # v4.7.1
-      with:
-        python-version: '3.11'
-
-    - name: Install dependencies
-      run: |
-        pip install -r build_requirements.txt
-        pip install pytest pytest-xdist hypothesis typing_extensions
-        sudo apt-get install libatlas-base-dev pkg-config
-
-    - name: Build
-      run: spin build -- -Dblas=blas-atlas -Dlapack=lapack-atlas -Ddisable-optimization=true -Dallow-noblas=false
-
-    - name: Test
-      run: spin test -- numpy/linalg
-
diff --git a/.github/workflows/linux_musl.yml b/.github/workflows/linux_musl.yml
deleted file mode 100644
index 5c65a2b2e8b9..000000000000
--- a/.github/workflows/linux_musl.yml
+++ /dev/null
@@ -1,69 +0,0 @@
-name: Test musllinux_x86_64
-
-on:
-  pull_request:
-    branches:
-      - main
-      - maintenance/**
-
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-
-permissions:
-  contents: read # to fetch code (actions/checkout)
-
-
-jobs:
-  musllinux_x86_64:
-    runs-on: ubuntu-latest
-    if: "github.repository == 'numpy/numpy'"
-    container:
-      # Use container used for building musllinux wheels
-      # it has git installed, all the pythons, etc
-      image: quay.io/pypa/musllinux_1_1_x86_64
-
-    steps:
-    - name: setup
-      run: |
-        apk update --quiet
-
-        # using git commands to clone because versioneer doesn't work when
-        # actions/checkout is used for the clone step in a container
-
-        git config --global --add safe.directory $PWD 
-        
-        if [ $GITHUB_EVENT_NAME != pull_request ]; then
-            git clone --recursive --branch=$GITHUB_REF_NAME https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE
-            git reset --hard $GITHUB_SHA
-        else        
-            git clone --recursive https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE
-            git fetch origin $GITHUB_REF:my_ref_name
-            git checkout $GITHUB_BASE_REF
-            git -c user.email="you@example.com" merge --no-commit my_ref_name
-        fi
-        git submodule update --init
-
-        ln -s /usr/local/bin/python3.10 /usr/local/bin/python
-
-    - name: test-musllinux_x86_64
-      env:
-        PKG_CONFIG_PATH: ${{ github.workspace }}/.openblas
-      run: |
-        python -m venv test_env
-        source test_env/bin/activate
-
-        pip install scipy-openblas64
-
-        pip install -r build_requirements.txt -r test_requirements.txt
-
-        # use meson to build and test 
-        spin build --with-scipy-openblas=64 -- -Duse-ilp64=true
-        spin test -j auto
-
-    - name: Meson Log
-      shell: bash
-      run: |
-        cat build/meson-logs/meson-log.txt
diff --git a/.github/workflows/linux_qemu.yml b/.github/workflows/linux_qemu.yml
deleted file mode 100644
index 31ceab851553..000000000000
--- a/.github/workflows/linux_qemu.yml
+++ /dev/null
@@ -1,157 +0,0 @@
-# Meson's Python module doesn't support crosscompiling,
-# and python dependencies may be another potential hurdle.
-# There might also be a need to run runtime tests during configure time.
-#
-# The recommended practice is to rely on Docker to provide the x86_64 crosscompile toolchain,
-# enabling native execution via binfmt.
-#
-# In simpler terms, everything except the crosscompile toolchain will be emulated.
-
-name: Linux Qemu tests
-
-on:
-  pull_request:
-    branches:
-      - main
-      - maintenance/**
-
-defaults:
-  run:
-    shell: bash
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read
-
-jobs:
-  linux_qemu:
-    if: "github.repository == 'numpy/numpy'"
-    runs-on: ubuntu-22.04
-    continue-on-error: true
-    strategy:
-      matrix:
-        BUILD_PROP:
-          - [
-              "armhf",
-              "arm-linux-gnueabihf",
-              "arm32v7/ubuntu:22.04",
-              "-Dallow-noblas=true",
-              # test_unary_spurious_fpexception is currently skipped
-              # FIXME(@seiko2plus): Requires confirmation for the following issue:
-              # The presence of an FP invalid exception caused by sqrt. Unsure if this is a qemu bug or not.
-              "(test_kind or test_multiarray or test_simd or test_umath or test_ufunc) and not test_unary_spurious_fpexception"
-          ]
-          - [
-              "ppc64le",
-              "powerpc64le-linux-gnu",
-              "ppc64le/ubuntu:22.04",
-              "-Dallow-noblas=true",
-              "test_kind or test_multiarray or test_simd or test_umath or test_ufunc",
-          ]
-          - [
-              "ppc64le - baseline(Power9)",
-              "powerpc64le-linux-gnu",
-              "ppc64le/ubuntu:22.04",
-              "-Dallow-noblas=true -Dcpu-baseline=vsx3",
-              "test_kind or test_multiarray or test_simd or test_umath or test_ufunc",
-          ]
-          - [
-              "s390x",
-              "s390x-linux-gnu",
-              "s390x/ubuntu:22.04",
-              "-Dallow-noblas=true",
-              # Skipping TestRationalFunctions.test_gcd_overflow test
-              # because of a possible qemu bug that appears to be related to int64 overflow in absolute operation.
-              # TODO(@seiko2plus): Confirm the bug and provide a minimal reproducer, then report it to upstream.
-              "(test_kind or test_multiarray or test_simd or test_umath or test_ufunc) and not test_gcd_overflow"
-          ]
-          - [
-              "s390x - baseline(Z13)",
-              "s390x-linux-gnu",
-              "s390x/ubuntu:22.04",
-              "-Dallow-noblas=true -Dcpu-baseline=vx",
-              "(test_kind or test_multiarray or test_simd or test_umath or test_ufunc) and not test_gcd_overflow"
-          ]
-    env:
-      TOOLCHAIN_NAME: ${{ matrix.BUILD_PROP[1] }}
-      DOCKER_CONTAINER: ${{ matrix.BUILD_PROP[2] }}
-      MESON_OPTIONS: ${{ matrix.BUILD_PROP[3] }}
-      RUNTIME_TEST_FILTER: ${{ matrix.BUILD_PROP[4] }}
-      TERM: xterm-256color
-
-    name: "${{ matrix.BUILD_PROP[0] }}"
-    steps:
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-
-    - name: Initialize binfmt_misc for qemu-user-static
-      run: |
-        docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
-
-    - name: Install GCC cross-compilers
-      run: |
-        sudo apt update
-        sudo apt install -y ninja-build gcc-${TOOLCHAIN_NAME} g++-${TOOLCHAIN_NAME} gfortran-${TOOLCHAIN_NAME}
-
-    - name: Cache docker container
-      uses: actions/cache@v3
-      id: container-cache
-      with:
-        path: ~/docker_${{ matrix.BUILD_PROP[1] }}
-        key: container-${{ runner.os }}-${{ matrix.BUILD_PROP[1] }}-${{ matrix.BUILD_PROP[2] }}-${{ hashFiles('build_requirements.txt') }}
-
-    - name: Creates new container
-      if: steps.container-cache.outputs.cache-hit != 'true'
-      run: |
-        docker run --name the_container --interactive -v /:/host -v $(pwd):/numpy ${DOCKER_CONTAINER} /bin/bash -c "
-          apt update &&
-          apt install -y cmake git python3 python-is-python3 python3-dev python3-pip &&
-          mkdir -p /lib64 && ln -s /host/lib64/ld-* /lib64/ &&
-          ln -s /host/lib/x86_64-linux-gnu /lib/x86_64-linux-gnu &&
-          rm -rf /usr/${TOOLCHAIN_NAME} && ln -s /host/usr/${TOOLCHAIN_NAME} /usr/${TOOLCHAIN_NAME} &&
-          rm -rf /usr/lib/gcc/${TOOLCHAIN_NAME} && ln -s /host/usr/lib/gcc-cross/${TOOLCHAIN_NAME} /usr/lib/gcc/${TOOLCHAIN_NAME} &&
-          rm -f /usr/bin/gcc && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-gcc /usr/bin/gcc &&
-          rm -f /usr/bin/g++ && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-g++ /usr/bin/g++ &&
-          rm -f /usr/bin/gfortran && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-gfortran /usr/bin/gfortran &&
-          rm -f /usr/bin/ar && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-ar /usr/bin/ar &&
-          rm -f /usr/bin/as && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-as /usr/bin/as &&
-          rm -f /usr/bin/ld && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-ld /usr/bin/ld &&
-          rm -f /usr/bin/ld.bfd && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-ld.bfd /usr/bin/ld.bfd &&
-          rm -f /usr/bin/ninja && ln -s /host/usr/bin/ninja /usr/bin/ninja &&
-          git config --global --add safe.directory /numpy &&
-          python -m pip install -r /numpy/build_requirements.txt &&
-          python -m pip install pytest pytest-xdist hypothesis typing_extensions &&
-          rm -f /usr/local/bin/ninja && mkdir -p /usr/local/bin && ln -s /host/usr/bin/ninja /usr/local/bin/ninja
-        "
-        docker commit the_container the_container
-        mkdir -p "~/docker_${TOOLCHAIN_NAME}"
-        docker save -o "~/docker_${TOOLCHAIN_NAME}/the_container.tar" the_container
-
-    - name: Load container from cache
-      if: steps.container-cache.outputs.cache-hit == 'true'
-      run: docker load -i "~/docker_${TOOLCHAIN_NAME}/the_container.tar"
-
-    - name: Meson Build
-      run: |
-        docker run --rm -e "TERM=xterm-256color" -v $(pwd):/numpy -v /:/host the_container \
-        /bin/script -e -q -c "/bin/bash --noprofile --norc -eo pipefail -c '
-          cd /numpy && spin build --clean -- ${MESON_OPTIONS}
-        '"
-
-    - name: Meson Log
-      if: always()
-      run: 'cat build/meson-logs/meson-log.txt'
-
-    - name: Run Tests
-      run: |
-        docker run --rm -e "TERM=xterm-256color" -v $(pwd):/numpy -v /:/host the_container \
-        /bin/script -e -q -c "/bin/bash --noprofile --norc -eo pipefail -c '
-          export F90=/usr/bin/gfortran
-          cd /numpy && spin test -- -k \"${RUNTIME_TEST_FILTER}\"
-        '"
-
diff --git a/.github/workflows/linux_simd.yml b/.github/workflows/linux_simd.yml
deleted file mode 100644
index 0f13dadad456..000000000000
--- a/.github/workflows/linux_simd.yml
+++ /dev/null
@@ -1,256 +0,0 @@
-name: Linux SIMD tests
-
-# This file is meant for testing different SIMD-related build options and
-# optimization levels. See `meson_options.txt` for the available build options.
-#
-# Jobs and their purposes:
-#
-# - baseline_only:
-#   Focuses on completing as quickly as possible and acts as a filter for other, more resource-intensive jobs.
-#   Utilizes only the default baseline targets (e.g., SSE3 on X86_64) without enabling any runtime dispatched features.
-#
-# - old_gcc:
-#   Tests the oldest supported GCC version with default CPU/baseline/dispatch settings.
-#
-# - without_optimizations:
-#   Completely disables all SIMD optimizations and other compiler optimizations such as loop unrolling.
-#
-# - native:
-#   Tests against the host CPU features set as the baseline without enabling any runtime dispatched features.
-#   Intended to assess the entire NumPy codebase against host flags, even for code sections lacking handwritten SIMD intrincis.
-#
-# - without_avx512/avx2/fma3:
-#   Uses runtime SIMD dispatching but disables AVX2, FMA3, and AVX512.
-#   Intended to evaluate 128-bit SIMD extensions without FMA support.
-#
-# - without_avx512:
-#   Uses runtime SIMD dispatching but disables AVX512.
-#   Intended to evaluate 128-bit/256-bit SIMD extensions.
-#
-# - intel_sde:
-#   Executes only the SIMD tests for various AVX512 SIMD extensions under the Intel Software Development Emulator (SDE).
-#
-on:
-  pull_request:
-    branches:
-      - main
-      - maintenance/**
-
-defaults:
-  run:
-    shell: 'script -q -e -c "bash --noprofile --norc -eo pipefail {0}"'
-
-env:
-  TERM: xterm-256color
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read # to fetch code (actions/checkout)
-
-jobs:
-  baseline_only:
-    if: "github.repository == 'numpy/numpy'"
-    runs-on: ubuntu-latest
-    env:
-      MESON_ARGS: "-Dallow-noblas=true -Dcpu-dispatch=none"
-    steps:
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # v4.7.1
-      with:
-        python-version: '3.9'
-    - uses: ./.github/meson_actions
-      name: Build/Test
-
-  old_gcc:
-    if: github.event_name != 'push'
-    needs: [baseline_only]
-    runs-on: ubuntu-latest
-    env:
-      MESON_ARGS: "-Dallow-noblas=true"
-    steps:
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # v4.7.1
-      with:
-        python-version: '3.9'
-
-    - name: Install GCC/8/9
-      run: |
-        echo "deb http://archive.ubuntu.com/ubuntu focal main universe" | sudo tee /etc/apt/sources.list.d/focal.list
-        sudo apt update
-        sudo apt install -y g++-8 g++-9
-
-    - name: Enable gcc-8
-      run: |
-        sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 1
-        sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-8 1
-
-    - uses: ./.github/meson_actions
-      name: Build/Test against gcc-8
-
-    - name: Enable gcc-9
-      run: |
-        sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 2
-        sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 2
-
-    - uses: ./.github/meson_actions
-      name: Build/Test against gcc-9
-
-  specialize:
-    needs: [baseline_only]
-    runs-on: ubuntu-latest
-    if: github.event_name != 'push'
-    continue-on-error: true
-    strategy:
-      matrix:
-        BUILD_PROP:
-          #- [
-            #"without optimizations",
-            #"-Dallow-noblas=true -Ddisable-optimization=true",
-            #"3.12"
-          #]
-          - [
-            "native",
-            "-Dallow-noblas=true -Dcpu-baseline=native -Dcpu-dispatch=none",
-            "3.11"
-          ]
-          - [
-            "without avx512",
-            "-Dallow-noblas=true -Dcpu-dispatch=SSSE3,SSE41,POPCNT,SSE42,AVX,F16C,AVX2,FMA3",
-            "3.10"
-          ]
-          - [
-            "without avx512/avx2/fma3",
-            "-Dallow-noblas=true -Dcpu-dispatch=SSSE3,SSE41,POPCNT,SSE42,AVX,F16C",
-            "3.9"
-          ]
-
-    env:
-      MESON_ARGS: ${{ matrix.BUILD_PROP[1] }}
-
-    name: "${{ matrix.BUILD_PROP[0] }}"
-    steps:
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # v4.7.1
-      with:
-        python-version: "${{ matrix.BUILD_PROP[2] }}"
-    - uses: ./.github/meson_actions
-      name: Build/Test
-
-  intel_sde_avx512:
-    needs: [baseline_only]
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # v4.7.1
-      with:
-        python-version: '3.11'
-
-    - name: Install Intel SDE
-      run: |
-        curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/788820/sde-external-9.27.0-2023-09-13-lin.tar.xz
-        mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
-        sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
-
-    - name: Install dependencies
-      run: |
-        sudo apt update
-        sudo apt install -y g++-13
-        sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 1
-        sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-13 1
-        python -m pip install -r build_requirements.txt
-        python -m pip install pytest pytest-xdist hypothesis typing_extensions
-
-    - name: Build
-      run: spin build -- -Dallow-noblas=true -Dcpu-baseline=avx512f -Dtest-simd='BASELINE,AVX512_KNL,AVX512_KNM,AVX512_SKX,AVX512_CLX,AVX512_CNL,AVX512_ICL,AVX512_SPR'
-
-    - name: Meson Log
-      if: always()
-      run: cat build/meson-logs/meson-log.txt
-
-    - name: SIMD tests (KNM)
-      run: |
-        export NUMPY_SITE=$(realpath build-install/usr/lib/python*/site-packages/)
-        export PYTHONPATH="$PYTHONPATH:$NUMPY_SITE"
-        cd build-install &&
-        sde -knm -- python -c "import numpy; numpy.show_config()" &&
-        sde -knm -- python -m pytest $NUMPY_SITE/numpy/core/tests/test_simd*
-
-    - name: linalg/ufunc/umath tests (TGL)
-      run: |
-        export NUMPY_SITE=$(realpath build-install/usr/lib/python*/site-packages/)
-        export PYTHONPATH="$PYTHONPATH:$NUMPY_SITE"
-        cd build-install &&
-        sde -tgl -- python -c "import numpy; numpy.show_config()" &&
-        sde -tgl -- python -m pytest $NUMPY_SITE/numpy/core/tests/test_umath* \
-                                     $NUMPY_SITE/numpy/core/tests/test_ufunc.py \
-                                     $NUMPY_SITE/numpy/core/tests/test_multiarray.py \
-                                     $NUMPY_SITE/numpy/linalg/tests/test_*
-
-
-  intel_sde_spr:
-    needs: [baseline_only]
-    runs-on: ubuntu-latest
-    steps:
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
-      with:
-        python-version: '3.11'
-
-    - name: Install Intel SDE
-      run: |
-        curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/788820/sde-external-9.27.0-2023-09-13-lin.tar.xz
-        mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
-        sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
-
-    - name: Install dependencies
-      run: |
-        sudo apt update
-        sudo apt install -y g++-13
-        sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-13 1
-        sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-13 1
-        python -m pip install -r build_requirements.txt
-        python -m pip install pytest pytest-xdist hypothesis typing_extensions
-
-    - name: Build
-      run: spin build -- -Dallow-noblas=true -Dcpu-baseline=avx512_spr
-
-    - name: Meson Log
-      if: always()
-      run: cat build/meson-logs/meson-log.txt
-
-    - name: SIMD tests (SPR)
-      run: |
-        export NUMPY_SITE=$(realpath build-install/usr/lib/python*/site-packages/)
-        export PYTHONPATH="$PYTHONPATH:$NUMPY_SITE"
-        cd build-install &&
-        sde -spr -- python -c "import numpy; numpy.show_config()" &&
-        sde -spr -- python -m pytest $NUMPY_SITE/numpy/core/tests/test_simd*
-
-    - name: linalg/ufunc/umath tests on Intel SPR
-      run: |
-        export NUMPY_SITE=$(realpath build-install/usr/lib/python*/site-packages/)
-        export PYTHONPATH="$PYTHONPATH:$NUMPY_SITE"
-        cd build-install &&
-        sde -spr -- python -c "import numpy; numpy.show_config()" &&
-        sde -spr -- python -m pytest $NUMPY_SITE/numpy/core/tests/test_umath* \
-                                     $NUMPY_SITE/numpy/core/tests/test_ufunc.py \
-                                     $NUMPY_SITE/numpy/core/tests/test_multiarray.py \
-                                     $NUMPY_SITE/numpy/linalg/tests/test_*
diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml
deleted file mode 100644
index 5a7ebbecc80e..000000000000
--- a/.github/workflows/macos.yml
+++ /dev/null
@@ -1,136 +0,0 @@
-name: macOS tests (meson)
-
-on:
-  pull_request:
-    branches:
-      - main
-      - maintenance/**
-
-permissions:
-   contents: read  # to fetch code (actions/checkout)
-
-env:
-  CCACHE_DIR: "${{ github.workspace }}/.ccache"
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-jobs:
-  x86_conda:
-    name: macOS x86-64 conda
-    if: "github.repository == 'numpy/numpy'"
-    runs-on: macos-latest
-    strategy:
-      matrix:
-        python-version: ["3.11"]
-
-    steps:
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-
-    - name:  Prepare cache dirs and timestamps
-      id:    prep-ccache
-      shell: bash -l {0}
-      run: |
-        mkdir -p "${CCACHE_DIR}"
-        echo "dir=$CCACHE_DIR" >> $GITHUB_OUTPUT
-        NOW=$(date -u +"%F-%T")
-        echo "timestamp=${NOW}" >> $GITHUB_OUTPUT
-        echo "today=$(/bin/date -u '+%Y%m%d')" >> $GITHUB_OUTPUT
-
-    - name: Setup compiler cache
-      uses: actions/cache@88522ab9f39a2ea568f7027eddc7d8d8bc9d59c8 # v3.3.1
-      id:    cache-ccache
-      with:
-        path: ${{ steps.prep-ccache.outputs.dir }}
-        key:  ${{ github.workflow }}-${{ matrix.python-version }}-ccache-macos-${{ steps.prep-ccache.outputs.timestamp }}
-        restore-keys: |
-          ${{ github.workflow }}-${{ matrix.python-version }}-ccache-macos-
-
-    - name: Setup Mambaforge
-      uses: conda-incubator/setup-miniconda@11b562958363ec5770fef326fe8ef0366f8cbf8a # v3.0.1
-      with:
-        python-version: ${{ matrix.python-version }}
-        channels: conda-forge
-        channel-priority: true
-        activate-environment: numpy-dev
-        use-only-tar-bz2: false
-        miniforge-variant: Mambaforge
-        miniforge-version: latest
-        use-mamba: true
-
-    # Updates if `environment.yml` or the date changes. The latter is needed to
-    # ensure we re-solve once a day (since we don't lock versions). Could be
-    # replaced by a conda-lock based approach in the future.
-    - name: Cache conda environment
-      uses: actions/cache@88522ab9f39a2ea568f7027eddc7d8d8bc9d59c8 # v3.3.1
-      env:
-        # Increase this value to reset cache if environment.yml has not changed
-        CACHE_NUMBER: 1
-      with:
-        path: ${{ env.CONDA }}/envs/numpy-dev
-        key:
-          ${{ runner.os }}--${{ steps.prep-ccache.outputs.today }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('environment.yml') }}
-      id: envcache
-
-    - name: Update Conda Environment
-      run: mamba env update -n numpy-dev -f environment.yml
-      if: steps.envcache.outputs.cache-hit != 'true'
-
-    - name: Build and Install NumPy
-      shell: bash -l {0}
-      run: |
-        conda activate numpy-dev
-        CC="ccache $CC" spin build -j2 -- -Dallow-noblas=false
-
-    - name: Run test suite (full)
-      shell: bash -l {0}
-      run: |
-        conda activate numpy-dev
-        export OMP_NUM_THREADS=2
-        spin test -j2 -m full
-
-    - name: Ccache statistics
-      shell: bash -l {0}
-      run: |
-        conda activate numpy-dev
-        ccache -s
-
-  accelerate:
-    name: Accelerate (LP64, ILP64)
-    if: "github.repository == 'numpy/numpy'"
-    runs-on: macos-13
-    steps:
-    - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-
-    - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
-      with:
-        python-version: '3.10'
-
-    - uses: maxim-lobanov/setup-xcode@9a697e2b393340c3cacd97468baa318e4c883d98 # v1.5.1
-      with:
-        xcode-version: '14.3'
-
-    - name: Install dependencies
-      run: |
-        pip install -r build_requirements.txt
-        pip install pytest pytest-xdist hypothesis
-
-    - name: Build against Accelerate (LP64)
-      run: spin build -- -Ddisable-optimization=true -Dallow-noblas=false
-
-    - name: Test (linalg only)
-      run: spin test -j2 -- numpy/linalg
-
-    - name: Build NumPy against Accelerate (ILP64)
-      run: |
-        spin build -- -Duse-ilp64=true -Dallow-noblas=false
-
-    - name: Test (fast tests)
-      run: spin test -j2
diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml
deleted file mode 100644
index c89b8e11ec95..000000000000
--- a/.github/workflows/mypy.yml
+++ /dev/null
@@ -1,67 +0,0 @@
-name: Run MyPy
-
-# Mypy is too slow to run as part of regular CI. The purpose of the jobs in
-# this file is to cover running Mypy across:
-#
-#   - OSes: Linux, Windows and macOS
-#   - Python versions: lowest/highest supported versions, and an intermediate one
-#
-# The build matrix aims for sparse coverage across those two dimensions.
-# Use of BLAS/LAPACK and SIMD is disabled on purpose, because those things
-# don't matter for static typing and this speeds up the builds.
-#
-# This is a separate job file so it's easy to trigger by hand.
-
-on:
-  pull_request:
-    branches:
-      - main
-      - maintenance/**
-    paths-ignore:
-      - 'benchmarks/'
-      - '.circlecl/'
-      - 'docs/'
-      - 'meson_cpu/'
-      - 'tools/'
-  workflow_dispatch:
-
-defaults:
-  run:
-    shell: bash
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read # to fetch code (actions/checkout)
-
-jobs:
-  mypy:
-    if: "github.repository == 'numpy/numpy'"
-    name: "MyPy"
-    runs-on: ${{ matrix.os_python[0] }}
-    strategy:
-      matrix:
-        os_python:
-          - [ubuntu-latest, '3.10']  # switch to 3.12-dev after mypy is upgraded (see gh-23764)
-          - [windows-2019, '3.11']
-          - [macos-12, '3.9']
-    steps:
-    - uses: actions/checkout@c85c95e3d7251135ab7dc9ce3241c5835cc595a9 # v3.5.3
-      with:
-        submodules: recursive
-        fetch-depth: 0
-    - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
-      with:
-        python-version: ${{ matrix.os_python[1] }}
-    - name: Install dependencies
-      run: |
-        pip install -r build_requirements.txt
-        pip install -r test_requirements.txt
-    - name: Build
-      run: |
-        spin build -j2 -- -Dallow-noblas=true -Ddisable-optimization=true --vsenv
-    - name: Run Mypy
-      run: |
-        spin mypy
diff --git a/.github/workflows/nanvix-ci.yml b/.github/workflows/nanvix-ci.yml
index 9366ec0d91eb..0ce155a7549f 100644
--- a/.github/workflows/nanvix-ci.yml
+++ b/.github/workflows/nanvix-ci.yml
@@ -1,294 +1,56 @@
+# Copyright(c) The Maintainers of Nanvix.
+# Licensed under the MIT License.
+
 name: Nanvix CI
 
 on:
   schedule:
-    - cron: "0 0 * * *"
+    - cron: "0 13 * * *"
   push:
-    branches:
-      - nanvix/**
+    branches: ["nanvix/**"]
   pull_request:
-    branches:
-      - nanvix/**
+    branches: ["nanvix/**"]
   workflow_dispatch:
-  repository_dispatch:
-    types: [nanvix-minor-release, nanvix-major-release, cpython-release]
 
 permissions:
   contents: write
   actions: write
   issues: write
+  pull-requests: write
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref_name || github.ref || 'default' }}
-  cancel-in-progress: ${{ github.event_name == 'repository_dispatch' }}
+  cancel-in-progress: true
 
 jobs:
-  get-nanvix-info:
-    runs-on: ubuntu-latest
-    outputs:
-      sha: ${{ steps.extract.outputs.sha }}
-      sha_full: ${{ steps.extract.outputs.sha_full }}
-    steps:
-      - name: Download Nanvix Release Info
-        id: extract
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          set -euo pipefail
-          curl -fsSL https://raw.githubusercontent.com/nanvix/nanvix/refs/heads/dev/scripts/get-nanvix.sh | bash -s -- --force nanvix-artifacts
-          ARTIFACT_FILE=$(find nanvix-artifacts -maxdepth 1 -type f -name "*.tar.bz2" | head -1)
-          if [[ -z "$ARTIFACT_FILE" ]]; then
-            echo "::error::No Nanvix artifact found"
-            exit 1
-          fi
-          NANVIX_SHA_FULL=$(basename "$ARTIFACT_FILE" | sed -E 's/.*-([a-f0-9]{40})\.tar\.bz2$/\1/')
-          NANVIX_SHA="${NANVIX_SHA_FULL::7}"
-          echo "sha=$NANVIX_SHA" >> "$GITHUB_OUTPUT"
-          echo "sha_full=$NANVIX_SHA_FULL" >> "$GITHUB_OUTPUT"
-
-  build:
-    needs: get-nanvix-info
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        platform: [hyperlight, microvm]
-        process-mode: [multi-process, single-process, standalone]
-        memory: [128mb]
-    name: ${{ matrix.platform }}-${{ matrix.process-mode }}-${{ matrix.memory }}
-    container:
-      image: nanvix/toolchain:latest-minimal
-      options: --device /dev/kvm
-    defaults:
-      run:
-        shell: bash
-    env:
-      NANVIX_SHA: ${{ needs.get-nanvix-info.outputs.sha }}
-      NANVIX_SHA_FULL: ${{ needs.get-nanvix-info.outputs.sha_full }}
-      USER: runner
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: recursive
-
-      - name: Download and Extract Nanvix Release
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          set -euo pipefail
-          curl -fsSL https://raw.githubusercontent.com/nanvix/nanvix/refs/heads/dev/scripts/get-nanvix.sh | bash -s -- --force nanvix-artifacts
-          ARTIFACT_PATTERN="${{ matrix.platform }}.*${{ matrix.process-mode }}.*${{ matrix.memory }}.*\.tar\.bz2"
-          ARTIFACT_FILE=$(find nanvix-artifacts -maxdepth 1 -type f -name "*.tar.bz2" | grep -E "$ARTIFACT_PATTERN" | head -1)
-          if [[ -z "$ARTIFACT_FILE" ]]; then
-            echo "::error::No ${{ matrix.platform }} ${{ matrix.process-mode }} artifact found"
-            exit 1
-          fi
-          mkdir -p nanvix-artifacts/extracted
-          tar -xjf "$ARTIFACT_FILE" -C nanvix-artifacts/extracted
-          NANVIX_HOME=$(find nanvix-artifacts/extracted -maxdepth 2 -type d -name "bin" -exec dirname {} \; | head -1)
-          echo "NANVIX_HOME=$NANVIX_HOME" >> "$GITHUB_ENV"
-
-      - name: Install Host Build Dependencies
-        run: |
-          apt-get update -qq && apt-get install -y -qq python3-pip python3-dev cython3 > /dev/null 2>&1 || true
-          pip3 install --break-system-packages cython meson meson-python 2>&1 || pip3 install cython meson meson-python 2>&1 || true
-          which cython3 || which cython || echo "WARNING: cython not found"
-
-      - name: Install CPython Headers
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          set -euo pipefail
-          CPYTHON_ARTIFACT="cpython-microvm-standalone-128mb.tar.bz2"
-          mkdir -p cpython-dep
-          curl -fsSL -o "cpython-dep/${CPYTHON_ARTIFACT}" \
-            "https://github.com/nanvix/cpython/releases/latest/download/${CPYTHON_ARTIFACT}"
-          tar -xjf "cpython-dep/${CPYTHON_ARTIFACT}" -C cpython-dep
-          CPYTHON_DIR=$(find cpython-dep -mindepth 1 -maxdepth 1 -type d \( -name "sysroot" -o -name "cpython-*" \) | head -1)
-          if [[ -n "$CPYTHON_DIR" ]] && [[ -d "$CPYTHON_DIR/include" ]]; then
-            mkdir -p "$NANVIX_HOME/include"
-            cp -rf "$CPYTHON_DIR"/include/* "$NANVIX_HOME/include/" 2>/dev/null || true
-            echo "CPython headers installed from standalone release"
-          fi
-
-          # Create pkg-config file for Meson
-          mkdir -p "$NANVIX_HOME/lib/pkgconfig"
-          cat > "$NANVIX_HOME/lib/pkgconfig/python-3.12.pc" << PKGEOF
-          prefix=$NANVIX_HOME
-          includedir=\${prefix}/include/python3.12
-          Name: Python
-          Description: Embed Python into an application
-          Version: 3.12.3
-          Cflags: -I\${includedir}
-          PKGEOF
-          cp "$NANVIX_HOME/lib/pkgconfig/python-3.12.pc" "$NANVIX_HOME/lib/pkgconfig/python-3.12-embed.pc"
-          echo "pkg-config files created"
-
-      - name: Build
-        run: |
-          make -f Makefile.nanvix CONFIG_NANVIX=y NANVIX_HOME="$NANVIX_HOME" NANVIX_TOOLCHAIN="/opt/nanvix" all
-
-      - name: Test
-        run: |
-          make -f Makefile.nanvix CONFIG_NANVIX=y NANVIX_HOME="$NANVIX_HOME" NANVIX_TOOLCHAIN="/opt/nanvix" test
-
-      - name: Package Artifacts
-        run: |
-          set -euo pipefail
-          ARTIFACT_NAME="numpy-${{ matrix.platform }}-${{ matrix.process-mode }}-${{ matrix.memory }}"
-          DIST_DIR="dist/${ARTIFACT_NAME}"
-          mkdir -p "${DIST_DIR}/lib/numpy"
-          if [ -d "$NANVIX_HOME/lib/numpy" ]; then
-            cp -f "$NANVIX_HOME"/lib/numpy/*.a "${DIST_DIR}/lib/numpy/"
-          else
-            echo "::warning::$NANVIX_HOME/lib/numpy not found, searching for .a files"
-            find . -name "*.a" -path "*/numpy/*" -exec cp -f {} "${DIST_DIR}/lib/numpy/" \;
-          fi
-          tar -cjf "dist/${ARTIFACT_NAME}.tar.bz2" -C dist "${ARTIFACT_NAME}"
-          echo "ARTIFACT_TARBALL=dist/${ARTIFACT_NAME}.tar.bz2" >> "$GITHUB_ENV"
-
-      - name: Upload Build Artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: numpy-${{ matrix.platform }}-${{ matrix.process-mode }}-${{ matrix.memory }}
-          path: ${{ env.ARTIFACT_TARBALL }}
-          retention-days: 7
-
-  release:
-    needs: [get-nanvix-info, build]
-    if: ${{ !cancelled() && github.event_name != 'pull_request' }}
-    runs-on: ubuntu-latest
-    env:
-      NANVIX_SHA: ${{ needs.get-nanvix-info.outputs.sha }}
-      NANVIX_SHA_FULL: ${{ needs.get-nanvix-info.outputs.sha_full }}
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Download All Artifacts
-        id: download-artifacts
-        uses: actions/download-artifact@v4
-        continue-on-error: true
-        with:
-          path: release-artifacts
-          pattern: numpy-*
-
-      - name: Prepare Release Assets
-        run: |
-          set -euo pipefail
-          mkdir -p release-assets
-          if [[ "${{ steps.download-artifacts.outcome }}" == "failure" ]]; then
-            echo "::warning::Artifact download step failed — this may indicate an API or network issue"
-          fi
-          find release-artifacts -name "*.tar.bz2" -exec cp {} release-assets/ \; 2>/dev/null || true
-          ASSET_COUNT=$(find release-assets -name "*.tar.bz2" 2>/dev/null | wc -l)
-          echo "Found $ASSET_COUNT release asset(s)"
-          if [[ "$ASSET_COUNT" -eq 0 ]]; then
-            echo "::error::No release assets found — all matrix builds may have failed or artifact retrieval encountered an error"
-            exit 1
-          fi
-          ls -la release-assets/
-
-      - name: Get Nanvix Release Info
-        id: nanvix-release
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: |
-          set -euo pipefail
-          API_URL="https://api.github.com/repos/nanvix/nanvix/releases/tags/latest"
-          RELEASE_INFO=$(curl -fsSL -H "Authorization: Bearer ${GH_TOKEN}" "$API_URL")
-          NANVIX_NAME=$(echo "$RELEASE_INFO" | jq -r '.name // "latest"')
-          NANVIX_PUBLISHED=$(echo "$RELEASE_INFO" | jq -r '.published_at')
-          echo "name=${NANVIX_NAME}" >> "$GITHUB_OUTPUT"
-          echo "published=${NANVIX_PUBLISHED}" >> "$GITHUB_OUTPUT"
-
-      - name: Create Latest Release
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          NANVIX_NAME: ${{ steps.nanvix-release.outputs.name }}
-          NANVIX_PUBLISHED: ${{ steps.nanvix-release.outputs.published }}
-        run: |
-          RELEASE_TAG="${GITHUB_SHA::7}-nanvix-${NANVIX_SHA}"
-          if gh release view "$RELEASE_TAG" &>/dev/null; then
-            gh release delete "$RELEASE_TAG" --yes --cleanup-tag || true
-          fi
-          gh release create "$RELEASE_TAG" \
-            --title "Build ${GITHUB_SHA::7}" \
-            --notes "Automated build from branch ${{ github.ref_name }} at commit ${{ github.sha }}.
-
-          **Build Information:**
-          - Workflow Run: [#${{ github.run_number }}](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
-          - Commit: [${{ github.sha }}](${{ github.server_url }}/${{ github.repository }}/commit/${{ github.sha }})
-          - Date: $(date -u +"%Y-%m-%d %H:%M:%S UTC")
-
-          **Nanvix Release:**
-          - Name: ${NANVIX_NAME}
-          - Published: ${NANVIX_PUBLISHED}
-          - Commit: [${NANVIX_SHA_FULL}](https://github.com/nanvix/nanvix/commit/${NANVIX_SHA_FULL})
-
-          **Dependencies:**
-          - CPython headers (nanvix/cpython)
-          - OpenBLAS (nanvix/OpenBLAS)" \
-            --latest \
-            release-assets/*.tar.bz2
-
-      # Trigger dependent workflows (nanvix-python integration)
-      - name: Trigger Dependent Workflows
-        if: success()
-        env:
-          GH_TOKEN: ${{ secrets.DISPATCH_TOKEN }}
-        run: |
-          RELEASE_TAG="${GITHUB_SHA::7}-nanvix-${NANVIX_SHA}"
-          echo "Triggering nanvix-python workflow..."
-          gh api repos/nanvix/nanvix-python/dispatches \
-            -X POST \
-            -H "Accept: application/vnd.github+json" \
-            -f event_type="numpy-release" \
-            -f "client_payload[nanvix_sha]=${NANVIX_SHA}" \
-            -f "client_payload[numpy_tag]=${RELEASE_TAG}" || echo "::warning::Failed to trigger nanvix-python workflow"
-
-  report-failure:
-    needs: [build, release]
-    if: >-
-      ${{ always() &&
-          (github.event_name == 'repository_dispatch' || github.event_name == 'schedule' || github.event_name == 'push') &&
-          needs.build.result == 'failure' &&
-          needs.release.result == 'failure' }}
-    runs-on: ubuntu-latest
-    steps:
-      - name: Report failure issue
-        uses: actions/github-script@v7
-        env:
-          BUILD_RESULT: ${{ needs.build.result }}
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          script: |
-            const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
-            const owner = context.repo.owner;
-            const repo = context.repo.repo;
-            const title = `CI failure`;
-            const body = [
-              `The numpy CI workflow failed.`,
-              `- Trigger: ${context.eventName}`,
-              `- Run: [#${context.runNumber}](${runUrl})`,
-              `- SHA: ${context.sha}`,
-              `- build: ${process.env.BUILD_RESULT}`,
-              '',
-              'Please investigate and take any corrective actions.'
-            ].join('\n');
-
-            const { data: search } = await github.rest.search.issuesAndPullRequests({
-              q: `repo:${owner}/${repo} is:issue is:open in:title "CI failure"`
-            });
-            const existing = search.items.find(i => i.title === title);
-
-            if (existing) {
-              await github.rest.issues.createComment({ owner, repo, issue_number: existing.number, body });
-              const currentAssignees = existing.assignees.map(a => a.login);
-              const desired = ['ppenna', 'danbugs'];
-              const missing = desired.filter(a => !currentAssignees.includes(a));
-              if (missing.length > 0) {
-                await github.rest.issues.addAssignees({ owner, repo, issue_number: existing.number, assignees: missing });
-              }
-            } else {
-              await github.rest.issues.create({ owner, repo, title, body, assignees: ['ppenna', 'danbugs'] });
-            }
+  ci:  # push / pull_request runs; cron and manual runs are handled by ci-scheduled
+    if: github.event_name != 'schedule' && github.event_name != 'workflow_dispatch'
+    uses: nanvix/workflows/.github/workflows/nanvix-ci.yml@v2.0.0
+    with:
+      zutil-version: "v0.8.2"
+      docker-image: "ghcr.io/nanvix/toolchain-python:latest"
+      platforms: '["microvm"]'
+      memory-sizes: '["256mb"]'
+      windows-matrix-exclude: '[]'
+      skip-full-test-modes: '[]'
+      caller-event-name: ${{ github.event_name }}  # forward the actual trigger name unchanged
+      windows-test: false
+    secrets:
+      GH_TOKEN: ${{ secrets.GH_TOKEN || secrets.GITHUB_TOKEN }}  # falls back to the default token when GH_TOKEN is unset
+      DISPATCH_TOKEN: ${{ secrets.DISPATCH_TOKEN }}
+
+  ci-scheduled:  # same reusable workflow and inputs as `ci`, selected for cron and manual runs
+    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
+    uses: nanvix/workflows/.github/workflows/nanvix-ci.yml@v2.0.0
+    with:
+      zutil-version: "v0.8.2"
+      docker-image: "ghcr.io/nanvix/toolchain-python:latest"
+      platforms: '["microvm"]'
+      memory-sizes: '["256mb"]'
+      windows-matrix-exclude: '[]'
+      skip-full-test-modes: '[]'
+      caller-event-name: 'schedule'  # hard-coded: workflow_dispatch runs are reported as 'schedule' too
+      windows-test: false
+    secrets:
+      GH_TOKEN: ${{ secrets.GH_TOKEN || secrets.GITHUB_TOKEN }}  # falls back to the default token when GH_TOKEN is unset
+      DISPATCH_TOKEN: ${{ secrets.DISPATCH_TOKEN }}
diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml
deleted file mode 100644
index 6e9dfa0a445f..000000000000
--- a/.github/workflows/scorecards.yml
+++ /dev/null
@@ -1,55 +0,0 @@
-name: Scorecards supply-chain security
-on:
-  # For Branch-Protection check. Only the default branch is supported. See
-  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
-  branch_protection_rule:
-  # To guarantee Maintained check is occasionally updated. See
-  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained
-  schedule:
-    - cron: "19 23 * * 5"
-  push:
-    branches: ["main"]
-
-# Declare default permissions as read only.
-permissions: read-all
-
-jobs:
-  analysis:
-    name: Scorecards analysis
-    runs-on: ubuntu-latest
-    permissions:
-      # Needed to upload the results to code-scanning dashboard.
-      security-events: write
-      # Needed to publish results and get a badge (see publish_results below).
-      id-token: write
-
-    steps:
-      - name: "Checkout code"
-        uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0
-        with:
-          persist-credentials: false
-
-      - name: "Run analysis"
-        uses: ossf/scorecard-action@08b4669551908b1024bb425080c797723083c031 # v2.2.0
-        with:
-          results_file: results.sarif
-          results_format: sarif
-          # Publish results to OpenSSF REST API for easy access by consumers.
-          # Allows the repository to include the Scorecard badge.
-          # See https://github.com/ossf/scorecard-action#publishing-results.
-          publish_results: true
-
-      # Upload the results as artifacts (optional). Commenting out will disable
-      # uploads of run results in SARIF format to the repository Actions tab.
-      - name: "Upload artifact"
-        uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
-        with:
-          name: SARIF file
-          path: results.sarif
-          retention-days: 5
-
-      # Upload the results to GitHub's code scanning dashboard.
-      - name: "Upload to code-scanning"
-        uses: github/codeql-action/upload-sarif@cdcdbb579706841c47f7063dda365e292e5cad7a # v2.1.27
-        with:
-          sarif_file: results.sarif
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
deleted file mode 100644
index fbc82c03aaa8..000000000000
--- a/.github/workflows/wheels.yml
+++ /dev/null
@@ -1,247 +0,0 @@
-# Workflow to build and test wheels.
-# To work on the wheel building infrastructure on a fork, comment out:
-#
-# if: github.repository == 'numpy/numpy'
-#
-# in the get_commit_message job. Be sure to include [wheel build] in your commit
-# message to trigger the build. All files related to wheel building are located
-# at tools/wheels/
-# Alternatively, you can add labels to the pull request in order to trigger wheel
-# builds.
-# The labels that trigger builds are:
-# 36 - Build(for changes to the building process,
-# 14 - Release(ensure wheels build before release)
-name: Wheel builder
-
-on:
-  schedule:
-    #        ┌───────────── minute (0 - 59)
-    #        │  ┌───────────── hour (0 - 23)
-    #        │  │ ┌───────────── day of the month (1 - 31)
-    #        │  │ │ ┌───────────── month (1 - 12 or JAN-DEC)
-    #        │  │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
-    #        │  │ │ │ │
-    - cron: "42 2 * * SUN,WED"
-  push:
-  pull_request:
-    types: [labeled, opened, synchronize, reopened]
-  workflow_dispatch:
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read # to fetch code (actions/checkout)
-
-jobs:
-  get_commit_message:
-    name: Get commit message
-    runs-on: ubuntu-latest
-    if: "github.repository == 'numpy/numpy'"
-    outputs:
-      message: ${{ steps.commit_message.outputs.message }}
-    steps:
-      - name: Checkout numpy
-        uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0
-        # Gets the correct commit message for pull request
-        with:
-          ref: ${{ github.event.pull_request.head.sha }}
-      - name: Get commit message
-        id: commit_message
-        run: |
-          set -xe
-          COMMIT_MSG=$(git log --no-merges -1 --oneline)
-          echo "message=$COMMIT_MSG" >> $GITHUB_OUTPUT
-          echo github.ref ${{ github.ref }}
-
-  build_wheels:
-    name: Build wheel for ${{ matrix.python }}-${{ matrix.buildplat[1] }}
-    needs: get_commit_message
-    if: >-
-      contains(needs.get_commit_message.outputs.message, '[wheel build]') ||
-      github.event_name == 'schedule' ||
-      github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'pull_request' &&
-      (contains(github.event.pull_request.labels.*.name, '36 - Build') ||
-      contains(github.event.pull_request.labels.*.name, '14 - Release'))) ||
-      (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0')))
-    runs-on: ${{ matrix.buildplat[0] }}
-    strategy:
-      # Ensure that a wheel builder finishes even if another fails
-      fail-fast: false
-      matrix:
-        # Github Actions doesn't support pairing matrix values together, let's improvise
-        # https://github.com/github/feedback/discussions/7835#discussioncomment-1769026
-        buildplat:
-          - [ubuntu-20.04, manylinux_x86_64]
-          - [ubuntu-20.04, musllinux_x86_64]
-          - [macos-12, macosx_x86_64]
-          - [windows-2019, win_amd64]
-          - [windows-2019, win32]
-        python: ["cp39", "cp310", "cp311", "cp312", "pp39"]
-        exclude:
-          # Don't build PyPy 32-bit windows
-          - buildplat: [windows-2019, win32]
-            python: "pp39"
-          - buildplat: [ ubuntu-20.04, musllinux_x86_64 ]
-            python: "pp39"
-    env:
-      IS_32_BIT: ${{ matrix.buildplat[1] == 'win32' }}
-      IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
-      IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
-    steps:
-      - name: Checkout numpy
-        uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0
-        with:
-          submodules: true
-          # versioneer.py requires the latest tag to be reachable. Here we
-          # fetch the complete history to get access to the tags.
-          # A shallow clone can work when the following issue is resolved:
-          # https://github.com/actions/checkout/issues/338
-          fetch-depth: 0
-
-      - name: Setup MSVC (32-bit)
-        if: ${{ matrix.buildplat[1] == 'win32' }}
-        uses: bus1/cabuild/action/msdevshell@e22aba57d6e74891d059d66501b6b5aed8123c4d  # v1
-        with:
-          architecture: 'x86'
-
-      - name: pkg-config-for-win
-        run: |
-          choco install -y --checksum 6004DF17818F5A6DBF19CB335CC92702 pkgconfiglite
-        if: runner.os == 'windows'
-
-      # Used to push the built wheels
-      - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
-        with:
-          python-version: "3.x"
-
-      - name: Build wheels
-        uses: pypa/cibuildwheel@ce3fb7832089eb3e723a0a99cab7f3eaccf074fd  # v2.16.5
-        env:
-          CIBW_PRERELEASE_PYTHONS: True
-          CIBW_BUILD: ${{ matrix.python }}-${{ matrix.buildplat[1] }}
-
-      - uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
-        with:
-          name: ${{ matrix.python }}-${{ startsWith(matrix.buildplat[1], 'macosx') && 'macosx' || matrix.buildplat[1] }}
-          path: ./wheelhouse/*.whl
-
-      - uses: conda-incubator/setup-miniconda@11b562958363ec5770fef326fe8ef0366f8cbf8a # v3.0.1
-        with:
-          # for installation of anaconda-client, required for upload to
-          # anaconda.org
-          # default (and activated) environment name is test
-          # Note that this step is *after* specific pythons have been used to
-          # build and test the wheel
-          auto-update-conda: true
-          python-version: "3.10"
-
-      - name: Upload wheels
-        if: success()
-        shell: bash -el {0}
-        # see https://github.com/marketplace/actions/setup-miniconda for why
-        # `-el {0}` is required.
-        env:
-          NUMPY_STAGING_UPLOAD_TOKEN: ${{ secrets.NUMPY_STAGING_UPLOAD_TOKEN }}
-          NUMPY_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.NUMPY_NIGHTLY_UPLOAD_TOKEN }}
-        run: |
-          #conda install -y anaconda-client
-          # pin urllib3 until anaconda-client is fixed upstream
-          conda install -y anaconda-client 'urllib3<2.0.0'
-          source tools/wheels/upload_wheels.sh
-          set_upload_vars
-          # trigger an upload to
-          # https://anaconda.org/scientific-python-nightly-wheels/numpy
-          # for cron jobs or "Run workflow" (restricted to main branch).
-          # Tags will upload to
-          # https://anaconda.org/multibuild-wheels-staging/numpy
-          # The tokens were originally generated at anaconda.org
-          upload_wheels
-
-  build_sdist:
-    name: Build sdist
-    needs: get_commit_message
-    if: >-
-      contains(needs.get_commit_message.outputs.message, '[wheel build]') ||
-      github.event_name == 'schedule' ||
-      github.event_name == 'workflow_dispatch' ||
-      (github.event_name == 'pull_request' &&
-      (contains(github.event.pull_request.labels.*.name, '36 - Build') ||
-      contains(github.event.pull_request.labels.*.name, '14 - Release'))) ||
-      (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0')))
-    runs-on: ubuntu-latest
-    env:
-      IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }}
-      # commented out so the sdist doesn't upload to nightly
-      # IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
-    steps:
-      - name: Checkout numpy
-        uses: actions/checkout@3df4ab11eba7bda6032a0b82a6bb43b11571feac # v4.0.0
-        with:
-          submodules: true
-          # versioneer.py requires the latest tag to be reachable. Here we
-          # fetch the complete history to get access to the tags.
-          # A shallow clone can work when the following issue is resolved:
-          # https://github.com/actions/checkout/issues/338
-          fetch-depth: 0
-      # Used to push the built wheels
-      - uses: actions/setup-python@61a6322f88396a6271a6ee3565807d608ecaddd1 # v4.7.0
-        with:
-          # Build sdist on lowest supported Python
-          python-version: "3.9"
-      - name: Build sdist
-        run: |
-          python -m pip install -U pip build
-          python -m build --sdist -Csetup-args=-Dallow-noblas=true
-      - name: Test the sdist
-        run: |
-          # TODO: Don't run test suite, and instead build wheels from sdist
-          # Depends on pypa/cibuildwheel#1020
-          python -m pip install dist/*.gz -Csetup-args=-Dallow-noblas=true
-          pip install ninja
-          pip install -r test_requirements.txt
-          cd .. # Can't import numpy within numpy src directory
-          python -c "import numpy, sys; print(numpy.__version__); sys.exit(numpy.test() is False)"
-
-      - name: Check README rendering for PyPI
-        run: |
-          python -mpip install twine
-          twine check dist/*
-
-      - uses: actions/upload-artifact@0b7f8abb1508181956e8e162db84b466c27e18ce # v3.1.2
-        with:
-          name: sdist
-          path: ./dist/*
-
-      - uses: conda-incubator/setup-miniconda@11b562958363ec5770fef326fe8ef0366f8cbf8a # v3.0.1
-        with:
-          # for installation of anaconda-client, required for upload to
-          # anaconda.org
-          # default (and activated) environment name is test
-          # Note that this step is *after* specific pythons have been used to
-          # build and test
-          auto-update-conda: true
-          python-version: "3.10"
-
-      - name: Upload sdist
-        if: success()
-        shell: bash -el {0}
-        env:
-          NUMPY_STAGING_UPLOAD_TOKEN: ${{ secrets.NUMPY_STAGING_UPLOAD_TOKEN }}
-          # commented out so the sdist doesn't upload to nightly
-          # NUMPY_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.NUMPY_NIGHTLY_UPLOAD_TOKEN }}
-        run: |
-          #conda install -y anaconda-client
-          # pin urllib3 until anaconda-client is fixed upstream
-          conda install -y anaconda-client 'urllib3<2.0.0'
-          source tools/wheels/upload_wheels.sh
-          set_upload_vars
-          # trigger an upload to
-          # https://anaconda.org/scientific-python-nightly-wheels/numpy
-          # for cron jobs or "Run workflow" (restricted to main branch).
-          # Tags will upload to
-          # https://anaconda.org/multibuild-wheels-staging/numpy
-          # The tokens were originally generated at anaconda.org
-          upload_wheels
diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml
deleted file mode 100644
index 988a55b761db..000000000000
--- a/.github/workflows/windows.yml
+++ /dev/null
@@ -1,114 +0,0 @@
-name: Windows tests
-
-on:
-  pull_request:
-    branches:
-      - main
-      - maintenance/**
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
-  cancel-in-progress: true
-
-permissions:
-  contents: read # to fetch code (actions/checkout)
-
-jobs:
-  python64bit_openblas:
-    name: x86-64, LP64 OpenBLAS
-    runs-on: windows-2019
-    if: "github.repository == 'numpy/numpy'"
-    strategy:
-      matrix:
-        compiler: ["MSVC", "Clang-cl"]
-    steps:
-    - name: Checkout
-      uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-      with:
-        submodules: recursive
-        fetch-depth: 0
-
-    - name: Setup Python
-      uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # v4.7.1
-      with:
-        python-version: '3.11'
-
-    - name: Install build dependencies from PyPI
-      run: |
-        python -m pip install spin Cython
-
-    - name: Install pkg-config
-      run: |
-        choco install -y --checksum 6004DF17818F5A6DBF19CB335CC92702 pkgconfiglite
-
-    - name: Install Clang-cl
-      if: matrix.compiler == 'Clang-cl'
-      run: |
-        choco install llvm -y --version=16.0.6
-
-    - name: Install NumPy (MSVC)
-      if: matrix.compiler == 'MSVC'
-      env:
-        PKG_CONFIG_PATH: ${{ github.workspace }}/.openblas
-      run: |
-        python -m pip install scipy-openblas32
-        spin build --with-scipy-openblas=32 -j2 -- --vsenv
-
-    - name: Install NumPy (Clang-cl)
-      if: matrix.compiler == 'Clang-cl'
-      env:
-        PKG_CONFIG_PATH: ${{ github.workspace }}/.openblas
-      run: |
-        "[binaries]","c = 'clang-cl'","cpp = 'clang-cl'","ar = 'llvm-lib'","c_ld = 'lld-link'","cpp_ld = 'lld-link'" | Out-File $PWD/clang-cl-build.ini -Encoding ascii
-        python -m pip install scipy-openblas32
-        spin build --with-scipy-openblas=32 -j2 -- --vsenv --native-file=$PWD/clang-cl-build.ini
-
-    - name: Meson Log
-      shell: bash
-      if: ${{ failure()  }}
-      run: |
-        cat build/meson-logs/meson-log.txt
-
-    - name: Install test dependencies
-      run: |
-        python -m pip install -r test_requirements.txt
-        python -m pip install threadpoolctl
-
-    - name: Run test suite
-      run: |
-        spin test
-
-  msvc_32bit_python_no_openblas:
-    name: MSVC, 32-bit Python, no BLAS
-    runs-on: windows-2019
-    if: "github.repository == 'numpy/numpy'"
-    steps:
-      - name: Checkout
-        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
-        with:
-          submodules: recursive
-          fetch-depth: 0
-
-      - name: Setup Python (32-bit)
-        uses: actions/setup-python@65d7f2d534ac1bc67fcd62888c5f4f3d2cb2b236 # v4.7.1
-        with:
-          python-version: '3.10'
-          architecture: 'x86'
-
-      - name: Setup MSVC (32-bit)
-        uses: bus1/cabuild/action/msdevshell@e22aba57d6e74891d059d66501b6b5aed8123c4d  # v1
-        with:
-          architecture: 'x86'
-
-      - name: Build and install
-        run: |
-          python -m pip install . -v -Ccompile-args="-j2" -Csetup-args="-Dallow-noblas=true"
-
-      - name: Install test dependencies
-        run: |
-          python -m pip install -r test_requirements.txt
-
-      - name: Run test suite (fast)
-        run: |
-          cd tools
-          python -m pytest --pyargs numpy -m "not slow" -n2
diff --git a/.nanvix/nanvix.toml b/.nanvix/nanvix.toml
new file mode 100644
index 000000000000..7aec9d6c8be3
--- /dev/null
+++ b/.nanvix/nanvix.toml
@@ -0,0 +1,10 @@
+[package]
+name = "numpy"
+version = "1.26.4"
+nanvix-version = "0.12.552"
+
+[builds]
+[builds.matrix]
+platforms = ["microvm"]
+modes = ["standalone"]
+memory = ["256mb"]
diff --git a/.nanvix/z.py b/.nanvix/z.py
new file mode 100644
index 000000000000..7ccfe8f21b29
--- /dev/null
+++ b/.nanvix/z.py
@@ -0,0 +1,90 @@
+# Copyright(c) The Maintainers of Nanvix.
+# Licensed under the MIT License.
+
+"""Nanvix build script for numpy C extensions.
+
+Usage:
+    ./z setup      # Download Nanvix sysroot and CPython headers
+    ./z build      # Cross-compile numpy multiarray_umath module
+    ./z test       # (no-op — tested via nanvix-python)
+    ./z release    # Package libnumpy_core.a release tarball
+    ./z clean      # Remove build artifacts
+    ./z distclean  # Deep clean
+"""
+
+import os
+import shutil
+import sys
+from pathlib import Path
+
+from nanvix_zutil import ZScript, log
+
+
+class NumpyBuild(ZScript):
+    """Build script for nanvix/numpy."""
+
+    SYSROOT_REQUIRED_FILES: tuple[str, ...] = (
+        "lib/libposix.a",
+        "lib/user.ld",
+    )
+
+    @property
+    def _nanvix_port_dir(self) -> Path:
+        return self.repo_root / "nanvix-port"
+
+    @property
+    def _dist_dir(self) -> Path:
+        return self.repo_root / "dist" / "obj"
+
+    def setup(self) -> bool:
+        ok = super().setup()
+        if not ok:
+            return False
+        log.info("setup complete")
+        return True
+
+    def build(self) -> None:
+        log.info("cross-compiling numpy C extensions...")
+        script = self._nanvix_port_dir / "build-nanvix.sh"
+        if not script.is_file():
+            log.error(f"build script not found: {script}")
+            sys.exit(1)
+
+        self.run("bash", "nanvix-port/build-nanvix.sh")
+
+        lib = self._dist_dir / "libnumpy_core.a"
+        if lib.is_file():
+            log.info(f"build complete: {lib} ({lib.stat().st_size // 1024} KB)")
+        else:
+            log.error("build failed: libnumpy_core.a not found")
+            sys.exit(1)
+
+    def test(self) -> None:
+        log.info("numpy extensions are tested via nanvix-python — skipping")
+
+    def release(self) -> None:
+        import tarfile
+
+        lib = self._dist_dir / "libnumpy_core.a"
+        if not lib.is_file():
+            log.error("libnumpy_core.a not found — run ./z build first")
+            sys.exit(1)
+
+        platform = os.environ.get("NANVIX_MACHINE", "microvm")
+        memory = os.environ.get("NANVIX_MEMORY_SIZE", "256mb")
+        tag = f"numpy-{platform}-standalone-{memory}"
+        tarball = self._dist_dir / f"{tag}.tar.gz"
+
+        with tarfile.open(tarball, "w:gz") as tf:
+            tf.add(lib, arcname=f"{tag}/lib/libnumpy_core.a")
+
+        log.info(f"release: {tarball} ({tarball.stat().st_size // 1024} KB)")
+
+    def clean(self) -> None:
+        if self._dist_dir.is_dir():
+            shutil.rmtree(self._dist_dir)
+            log.info("cleaned dist/")
+        builddir = self.repo_root / "builddir"
+        if builddir.is_dir():
+            shutil.rmtree(builddir)
+            log.info("cleaned builddir/")
diff --git a/nanvix-port/build-nanvix.sh b/nanvix-port/build-nanvix.sh
new file mode 100755
index 000000000000..089e18f47451
--- /dev/null
+++ b/nanvix-port/build-nanvix.sh
@@ -0,0 +1,184 @@
+#!/bin/bash
+# Cross-compile numpy _multiarray_umath for Nanvix (i686)
+# Run inside Docker: ghcr.io/nanvix/toolchain-gcc:latest
+set -e  # abort on the first unhandled command failure (compile_* helpers return 1 on error)
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"  # absolute path of the directory holding this script
+NUMPY_ROOT="$(dirname "$SCRIPT_DIR")"  # parent of SCRIPT_DIR
+GENDIR="$NUMPY_ROOT/numpy/core/src/_generated"  # where compile_gen looks up its sources
+CFGDIR="$NUMPY_ROOT/nanvix-port/generated-headers"
+APIDIR="$NUMPY_ROOT/nanvix-port/generated-headers"  # same path as CFGDIR; both are added to INCLUDES
+PYINC="$NUMPY_ROOT/nanvix-port/cpython-headers/python3.12"  # CPython headers shipped with the port
+OUTDIR="$NUMPY_ROOT/dist/obj"  # receives every .o file and libnumpy_core.a
+
+CC="i686-nanvix-gcc"
+CXX="i686-nanvix-g++"
+AR="i686-nanvix-ar"
+
+CFLAGS="-m32 -march=pentiumpro -Os -fdata-sections -ffunction-sections \
+  -DNDEBUG -DNPY_DISABLE_OPTIMIZATION -DNPY_NO_SMP -DNPY_NO_SIGNAL \
+  -DNPY_INTERNAL_BUILD=1 \
+  -D_MULTIARRAYMODULE -D_UMATHMODULE \
+  -Wno-unused-function -Wno-deprecated-declarations -Wno-unused-variable \
+  -Wno-sign-compare -Wno-unused-but-set-variable -Wno-missing-field-initializers"
+
+CXXFLAGS="-m32 -march=pentiumpro -Os -fdata-sections -ffunction-sections \
+  -std=c++17 -DNDEBUG -DNPY_DISABLE_OPTIMIZATION -DNPY_NO_SMP -DNPY_NO_SIGNAL \
+  -DNPY_INTERNAL_BUILD=1 \
+  -D_MULTIARRAYMODULE -D_UMATHMODULE \
+  -Wno-unused-function -Wno-deprecated-declarations -Wno-unused-variable \
+  -Wno-sign-compare -Wno-missing-field-initializers"
+
+INCLUDES="-I$PYINC \
+  -I$NUMPY_ROOT/numpy/core/include \
+  -I$NUMPY_ROOT/numpy/core/src/common \
+  -I$NUMPY_ROOT/numpy/core/src/multiarray \
+  -I$NUMPY_ROOT/numpy/core/src/npymath \
+  -I$NUMPY_ROOT/numpy/core/src/umath \
+  -I$CFGDIR \
+  -I$APIDIR \
+  -I$GENDIR"
+
+mkdir -p "$OUTDIR"  # create the output directory if it does not exist yet
+
+compile_c() {  # compile_c <file.c>: build one C source into $OUTDIR/<name>.o
+    local src="$1"
+    local obj="$OUTDIR/$(basename "${src%.*}").o"
+    echo "  CC  $(basename "$src")"  # quoted: a path with spaces must not be word-split
+    $CC $CFLAGS $INCLUDES -c "$src" -o "$obj" 2>&1 || { echo "FAILED: $src"; return 1; }
+}  # returns 1 (after printing FAILED) when the compiler exits non-zero
+
+compile_cxx() {  # compile_cxx <file.cpp>: build one C++ source into $OUTDIR/<name>.o
+    local src="$1"
+    local obj="$OUTDIR/$(basename "${src%.*}").o"
+    echo "  CXX $(basename "$src")"  # quoted: a path with spaces must not be word-split
+    $CXX $CXXFLAGS $INCLUDES -c "$src" -o "$obj" 2>&1 || { echo "FAILED: $src"; return 1; }
+}  # returns 1 (after printing FAILED) when the compiler exits non-zero
+
+compile_gen() {
+    local src="$GENDIR/$1"
+    local obj="$OUTDIR/${1%.*}.o"
+    echo "  CC  $1 (generated)"
+    $CC $CFLAGS $INCLUDES -c "$src" -o "$obj" 2>&1 || { echo "FAILED: $src"; return 1; }
+}
+
+echo "=== Phase 1: npymath library ==="
+SRC="$NUMPY_ROOT/numpy/core/src/npymath"
+compile_cxx "$SRC/halffloat.cpp"
+compile_c "$SRC/npy_math.c"
+compile_cxx "$SRC/ieee754.cpp"
+compile_gen "npy_math_complex.c"
+
+echo ""
+echo "=== Phase 2: common sources ==="
+SRC="$NUMPY_ROOT/numpy/core/src/common"
+compile_c "$SRC/array_assign.c"
+compile_c "$SRC/mem_overlap.c"
+compile_c "$SRC/npy_argparse.c"
+compile_c "$SRC/npy_hashtable.c"
+compile_c "$SRC/npy_longdouble.c"
+compile_c "$SRC/ucsnarrow.c"
+compile_c "$SRC/ufunc_override.c"
+compile_c "$SRC/numpyos.c"
+compile_c "$SRC/npy_cpu_features.c"
+
+echo ""
+echo "=== Phase 3: multiarray sources ==="
+SRC="$NUMPY_ROOT/numpy/core/src/multiarray"
+for f in \
+    abstractdtypes.c alloc.c arrayobject.c array_coercion.c array_method.c \
+    array_assign_scalar.c array_assign_array.c arrayfunction_override.c \
+    buffer.c calculation.c compiled_base.c common.c common_dtype.c \
+    convert.c convert_datatype.c conversion_utils.c ctors.c \
+    datetime.c datetime_strings.c datetime_busday.c datetime_busdaycal.c \
+    descriptor.c dlpack.c dtypemeta.c dragon4.c dtype_transfer.c dtype_traversal.c \
+    experimental_public_dtype_api.c flagsobject.c getset.c hashdescr.c \
+    item_selection.c iterators.c legacy_dtype_implementation.c \
+    mapping.c methods.c multiarraymodule.c \
+    nditer_api.c nditer_constr.c nditer_pywrap.c \
+    number.c refcount.c sequence.c shape.c scalarapi.c \
+    strfuncs.c temp_elide.c typeinfo.c usertypes.c vdot.c
+do
+    compile_c "$SRC/$f"
+done
+
+# Template-generated multiarray files
+for f in arraytypes.c einsum.c einsum_sumprod.c lowlevel_strided_loops.c \
+         nditer_templ.c scalartypes.c
+do
+    compile_gen "$f"
+done
+
+# Textreading sources
+SRC="$NUMPY_ROOT/numpy/core/src/multiarray/textreading"
+for f in conversions.c field_types.c growth.c readtext.c rows.c \
+         stream_pyobject.c str_to_int.c
+do
+    compile_c "$SRC/$f"
+done
+compile_cxx "$SRC/tokenize.cpp"
+
+echo ""
+echo "=== Phase 4: npysort C++ sources ==="
+SRC="$NUMPY_ROOT/numpy/core/src/npysort"
+for f in quicksort.cpp mergesort.cpp timsort.cpp heapsort.cpp \
+         radixsort.cpp selection.cpp binsearch.cpp
+do
+    compile_cxx "$SRC/$f"
+done
+
+echo ""
+echo "=== Phase 5: umath sources ==="
+SRC="$NUMPY_ROOT/numpy/core/src/umath"
+for f in ufunc_type_resolution.c dispatching.c extobj.c \
+         legacy_array_method.c override.c reduction.c \
+         ufunc_object.c umathmodule.c wrapping_array_method.c \
+         _scaled_float_dtype.c
+do
+    compile_c "$SRC/$f"
+done
+compile_cxx "$SRC/clip.cpp"
+compile_cxx "$SRC/string_ufuncs.cpp"
+
+# Template-generated umath files
+for f in funcs.inc loops.c matmul.c scalarmath.c
+do
+    # funcs.inc needs special handling - it's included, not compiled directly
+    if [ "$f" = "funcs.inc" ]; then
+        continue
+    fi
+    compile_gen "$f"
+done
+
+echo ""
+echo "=== Phase 6: dispatch-able sources (baseline only) ==="
+# These .dispatch.c files are compiled with NPY_DISABLE_OPTIMIZATION
+# so they only produce baseline implementations
+for f in argfunc.dispatch.c \
+         loops_arithm_fp.dispatch.c loops_arithmetic.dispatch.c \
+         loops_autovec.dispatch.c loops_comparison.dispatch.c \
+         loops_exponent_log.dispatch.c loops_hyperbolic.dispatch.c \
+         loops_logical.dispatch.c loops_minmax.dispatch.c \
+         loops_modulo.dispatch.c loops_trigonometric.dispatch.c \
+         loops_umath_fp.dispatch.c loops_unary.dispatch.c \
+         loops_unary_complex.dispatch.c loops_unary_fp.dispatch.c \
+         loops_unary_fp_le.dispatch.c
+do
+    compile_gen "$f"
+done
+
+echo ""
+echo "=== Phase 7: Additional sources ==="
+# __multiarray_api.c and __ufunc_api.c are #included inside
+# multiarraymodule.c and umathmodule.c respectively, not compiled separately.
+
+# arm64 exports stub (needed for symbol)
+compile_c "$NUMPY_ROOT/numpy/core/src/npymath/arm64_exports.c"
+
+echo ""
+echo "=== Phase 8: Creating static archive ==="
+$AR rcs "$OUTDIR/libnumpy_core.a" "$OUTDIR"/*.o
+echo "Created: $OUTDIR/libnumpy_core.a ($(du -h "$OUTDIR/libnumpy_core.a" | cut -f1))"
+echo "Objects: $(ls "$OUTDIR"/*.o | wc -l) files"
+echo ""
+echo "=== BUILD COMPLETE ==="
diff --git a/nanvix-port/cpython-headers/python3.12/Python.h b/nanvix-port/cpython-headers/python3.12/Python.h
new file mode 100644
index 000000000000..5eddda633616
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/Python.h
@@ -0,0 +1,109 @@
+// Entry point of the Python C API.
+// C extensions should only #include <Python.h>, and not include directly
+// the other Python header files included by <Python.h>.
+
+#ifndef Py_PYTHON_H
+#define Py_PYTHON_H
+
+// Since this is a "meta-include" file, no #ifdef __cplusplus / extern "C" {
+
+// Include Python header files
+#include "patchlevel.h"
+#include "pyconfig.h"
+#include "pymacconfig.h"
+
+#if defined(__sgi) && !defined(_SGI_MP_SOURCE)
+#  define _SGI_MP_SOURCE
+#endif
+
+// stdlib.h, stdio.h, errno.h and string.h headers are not used by Python
+// headers, but kept for backward compatibility. They are excluded from the
+// limited C API of Python 3.11.
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
+#  include <stdlib.h>
+#  include <stdio.h>              // FILE*
+#  include <errno.h>              // errno
+#  include <string.h>             // memcpy()
+#endif
+#ifndef MS_WINDOWS
+#  include <unistd.h>
+#endif
+#ifdef HAVE_STDDEF_H
+#  include <stddef.h>             // size_t
+#endif
+
+#include <assert.h>               // assert()
+#include <wchar.h>                // wchar_t
+
+#include "pyport.h"
+#include "pymacro.h"
+#include "pymath.h"
+#include "pymem.h"
+#include "pytypedefs.h"
+#include "pybuffer.h"
+#include "object.h"
+#include "objimpl.h"
+#include "typeslots.h"
+#include "pyhash.h"
+#include "cpython/pydebug.h"
+#include "bytearrayobject.h"
+#include "bytesobject.h"
+#include "unicodeobject.h"
+#include "cpython/initconfig.h"
+#include "pystate.h"
+#include "pyerrors.h"
+#include "longobject.h"
+#include "cpython/longintrepr.h"
+#include "boolobject.h"
+#include "floatobject.h"
+#include "complexobject.h"
+#include "rangeobject.h"
+#include "memoryobject.h"
+#include "tupleobject.h"
+#include "listobject.h"
+#include "dictobject.h"
+#include "cpython/odictobject.h"
+#include "enumobject.h"
+#include "setobject.h"
+#include "methodobject.h"
+#include "moduleobject.h"
+#include "cpython/funcobject.h"
+#include "cpython/classobject.h"
+#include "fileobject.h"
+#include "pycapsule.h"
+#include "cpython/code.h"
+#include "pyframe.h"
+#include "traceback.h"
+#include "sliceobject.h"
+#include "cpython/cellobject.h"
+#include "iterobject.h"
+#include "cpython/genobject.h"
+#include "descrobject.h"
+#include "genericaliasobject.h"
+#include "warnings.h"
+#include "weakrefobject.h"
+#include "structseq.h"
+#include "cpython/picklebufobject.h"
+#include "cpython/pytime.h"
+#include "codecs.h"
+#include "pythread.h"
+#include "cpython/context.h"
+#include "modsupport.h"
+#include "compile.h"
+#include "pythonrun.h"
+#include "pylifecycle.h"
+#include "ceval.h"
+#include "sysmodule.h"
+#include "osmodule.h"
+#include "intrcheck.h"
+#include "import.h"
+#include "abstract.h"
+#include "bltinmodule.h"
+#include "cpython/pyctype.h"
+#include "pystrtod.h"
+#include "pystrcmp.h"
+#include "fileutils.h"
+#include "cpython/pyfpe.h"
+#include "tracemalloc.h"
+
+#endif /* !Py_PYTHON_H */
diff --git a/nanvix-port/cpython-headers/python3.12/abstract.h b/nanvix-port/cpython-headers/python3.12/abstract.h
new file mode 100644
index 000000000000..064b0300b51e
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/abstract.h
@@ -0,0 +1,899 @@
+/* Abstract Object Interface (many thanks to Jim Fulton) */
+
+#ifndef Py_ABSTRACTOBJECT_H
+#define Py_ABSTRACTOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* === Object Protocol ================================================== */
+
+/* Implemented elsewhere:
+
+   int PyObject_Print(PyObject *o, FILE *fp, int flags);
+
+   Print an object 'o' on file 'fp'.  Returns -1 on error. The flags argument
+   is used to enable certain printing options. The only option currently
+   supported is Py_PRINT_RAW. By default (flags=0), PyObject_Print() formats
+   the object by calling PyObject_Repr(). If flags equals to Py_PRINT_RAW, it
+   formats the object by calling PyObject_Str(). */
+
+
+/* Implemented elsewhere:
+
+   int PyObject_HasAttrString(PyObject *o, const char *attr_name);
+
+   Returns 1 if object 'o' has the attribute attr_name, and 0 otherwise.
+
+   This is equivalent to the Python expression: hasattr(o,attr_name).
+
+   This function always succeeds. */
+
+
+/* Implemented elsewhere:
+
+   PyObject* PyObject_GetAttrString(PyObject *o, const char *attr_name);
+
+   Retrieve an attribute named attr_name from object o.
+   Returns the attribute value on success, or NULL on failure.
+
+   This is the equivalent of the Python expression: o.attr_name. */
+
+
+/* Implemented elsewhere:
+
+   int PyObject_HasAttr(PyObject *o, PyObject *attr_name);
+
+   Returns 1 if o has the attribute attr_name, and 0 otherwise.
+
+   This is equivalent to the Python expression: hasattr(o,attr_name).
+
+   This function always succeeds. */
+
+/* Implemented elsewhere:
+
+   PyObject* PyObject_GetAttr(PyObject *o, PyObject *attr_name);
+
+   Retrieve an attribute named 'attr_name' from object 'o'.
+   Returns the attribute value on success, or NULL on failure.
+
+   This is the equivalent of the Python expression: o.attr_name. */
+
+
+/* Implemented elsewhere:
+
+   int PyObject_SetAttrString(PyObject *o, const char *attr_name, PyObject *v);
+
+   Set the value of the attribute named attr_name, for object 'o',
+   to the value 'v'. Raise an exception and return -1 on failure; return 0 on
+   success.
+
+   This is the equivalent of the Python statement o.attr_name=v. */
+
+
+/* Implemented elsewhere:
+
+   int PyObject_SetAttr(PyObject *o, PyObject *attr_name, PyObject *v);
+
+   Set the value of the attribute named attr_name, for object 'o', to the value
+   'v'. Raise an exception and return -1 on failure; return 0 on success.
+
+   This is the equivalent of the Python statement o.attr_name=v. */
+
+/* Implemented as a macro:
+
+   int PyObject_DelAttrString(PyObject *o, const char *attr_name);
+
+   Delete attribute named attr_name, for object o. Returns
+   -1 on failure.
+
+   This is the equivalent of the Python statement: del o.attr_name. */
+#define PyObject_DelAttrString(O, A) PyObject_SetAttrString((O), (A), NULL)
+
+
+/* Implemented as a macro:
+
+   int PyObject_DelAttr(PyObject *o, PyObject *attr_name);
+
+   Delete attribute named attr_name, for object o. Returns -1
+   on failure.  This is the equivalent of the Python
+   statement: del o.attr_name. */
+#define  PyObject_DelAttr(O, A) PyObject_SetAttr((O), (A), NULL)
+
+
+/* Implemented elsewhere:
+
+   PyObject *PyObject_Repr(PyObject *o);
+
+   Compute the string representation of object 'o'.  Returns the
+   string representation on success, NULL on failure.
+
+   This is the equivalent of the Python expression: repr(o).
+
+   Called by the repr() built-in function. */
+
+
+/* Implemented elsewhere:
+
+   PyObject *PyObject_Str(PyObject *o);
+
+   Compute the string representation of object, o.  Returns the
+   string representation on success, NULL on failure.
+
+   This is the equivalent of the Python expression: str(o).
+
+   Called by the str() and print() built-in functions. */
+
+
+/* Declared elsewhere
+
+   PyAPI_FUNC(int) PyCallable_Check(PyObject *o);
+
+   Determine if the object, o, is callable.  Return 1 if the object is callable
+   and 0 otherwise.
+
+   This function always succeeds. */
+
+
+#ifdef PY_SSIZE_T_CLEAN
+#  define PyObject_CallFunction _PyObject_CallFunction_SizeT
+#  define PyObject_CallMethod _PyObject_CallMethod_SizeT
+#endif
+
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03090000
+/* Call a callable Python object without any arguments */
+PyAPI_FUNC(PyObject *) PyObject_CallNoArgs(PyObject *func);
+#endif
+
+
+/* Call a callable Python object 'callable' with arguments given by the
+   tuple 'args' and keywords arguments given by the dictionary 'kwargs'.
+
+   'args' must not be NULL, use an empty tuple if no arguments are
+   needed. If no named arguments are needed, 'kwargs' can be NULL.
+
+   This is the equivalent of the Python expression:
+   callable(*args, **kwargs). */
+PyAPI_FUNC(PyObject *) PyObject_Call(PyObject *callable,
+                                     PyObject *args, PyObject *kwargs);
+
+
+/* Call a callable Python object 'callable', with arguments given by the
+   tuple 'args'.  If no arguments are needed, then 'args' can be NULL.
+
+   Returns the result of the call on success, or NULL on failure.
+
+   This is the equivalent of the Python expression:
+   callable(*args). */
+PyAPI_FUNC(PyObject *) PyObject_CallObject(PyObject *callable,
+                                           PyObject *args);
+
+/* Call a callable Python object, callable, with a variable number of C
+   arguments. The C arguments are described using a mkvalue-style format
+   string.
+
+   The format may be NULL, indicating that no arguments are provided.
+
+   Returns the result of the call on success, or NULL on failure.
+
+   This is the equivalent of the Python expression:
+   callable(arg1, arg2, ...). */
+PyAPI_FUNC(PyObject *) PyObject_CallFunction(PyObject *callable,
+                                             const char *format, ...);
+
+/* Call the method named 'name' of object 'obj' with a variable number of
+   C arguments.  The C arguments are described by a mkvalue format string.
+
+   The format can be NULL, indicating that no arguments are provided.
+
+   Returns the result of the call on success, or NULL on failure.
+
+   This is the equivalent of the Python expression:
+   obj.name(arg1, arg2, ...). */
+PyAPI_FUNC(PyObject *) PyObject_CallMethod(PyObject *obj,
+                                           const char *name,
+                                           const char *format, ...);
+
+PyAPI_FUNC(PyObject *) _PyObject_CallFunction_SizeT(PyObject *callable,
+                                                    const char *format,
+                                                    ...);
+
+PyAPI_FUNC(PyObject *) _PyObject_CallMethod_SizeT(PyObject *obj,
+                                                  const char *name,
+                                                  const char *format,
+                                                  ...);
+
+/* Call a callable Python object 'callable' with a variable number of C
+   arguments. The C arguments are provided as PyObject* values, terminated
+   by a NULL.
+
+   Returns the result of the call on success, or NULL on failure.
+
+   This is the equivalent of the Python expression:
+   callable(arg1, arg2, ...). */
+PyAPI_FUNC(PyObject *) PyObject_CallFunctionObjArgs(PyObject *callable,
+                                                    ...);
+
+/* Call the method named 'name' of object 'obj' with a variable number of
+   C arguments.  The C arguments are provided as PyObject* values, terminated
+   by NULL.
+
+   Returns the result of the call on success, or NULL on failure.
+
+   This is the equivalent of the Python expression: obj.name(*args). */
+
+PyAPI_FUNC(PyObject *) PyObject_CallMethodObjArgs(
+    PyObject *obj,
+    PyObject *name,
+    ...);
+
+/* Given a vectorcall nargsf argument, return the actual number of arguments.
+ * (For use outside the limited API, this is re-defined as a static inline
+ * function in cpython/abstract.h)
+ */
+PyAPI_FUNC(Py_ssize_t) PyVectorcall_NARGS(size_t nargsf);
+
+/* Call "callable" (which must support vectorcall) with positional arguments
+   "tuple" and keyword arguments "dict". "dict" may also be NULL */
+PyAPI_FUNC(PyObject *) PyVectorcall_Call(PyObject *callable, PyObject *tuple, PyObject *dict);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030C0000
+#define PY_VECTORCALL_ARGUMENTS_OFFSET \
+    (_Py_STATIC_CAST(size_t, 1) << (8 * sizeof(size_t) - 1))
+
+/* Perform a PEP 590-style vector call on 'callable' */
+PyAPI_FUNC(PyObject *) PyObject_Vectorcall(
+    PyObject *callable,
+    PyObject *const *args,
+    size_t nargsf,
+    PyObject *kwnames);
+
+/* Call the method 'name' on args[0] with arguments in args[1..nargsf-1]. */
+PyAPI_FUNC(PyObject *) PyObject_VectorcallMethod(
+    PyObject *name, PyObject *const *args,
+    size_t nargsf, PyObject *kwnames);
+#endif
+
+/* Implemented elsewhere:
+
+   Py_hash_t PyObject_Hash(PyObject *o);
+
+   Compute and return the hash, hash_value, of an object, o.  On
+   failure, return -1.
+
+   This is the equivalent of the Python expression: hash(o). */
+
+
+/* Implemented elsewhere:
+
+   int PyObject_IsTrue(PyObject *o);
+
+   Returns 1 if the object, o, is considered to be true, 0 if o is
+   considered to be false and -1 on failure.
+
+   This is equivalent to the Python expression: not not o. */
+
+
+/* Implemented elsewhere:
+
+   int PyObject_Not(PyObject *o);
+
+   Returns 0 if the object, o, is considered to be true, 1 if o is
+   considered to be false and -1 on failure.
+
+   This is equivalent to the Python expression: not o. */
+
+
+/* Get the type of an object.
+
+   On success, returns a type object corresponding to the object type of object
+   'o'. On failure, returns NULL.
+
+   This is equivalent to the Python expression: type(o) */
+PyAPI_FUNC(PyObject *) PyObject_Type(PyObject *o);
+
+
+/* Return the size of object 'o'.  If the object 'o' provides both sequence and
+   mapping protocols, the sequence size is returned.
+
+   On error, -1 is returned.
+
+   This is the equivalent to the Python expression: len(o) */
+PyAPI_FUNC(Py_ssize_t) PyObject_Size(PyObject *o);
+
+
+/* For DLL compatibility */
+#undef PyObject_Length
+PyAPI_FUNC(Py_ssize_t) PyObject_Length(PyObject *o);
+#define PyObject_Length PyObject_Size
+
+/* Return element of 'o' corresponding to the object 'key'. Return NULL
+  on failure.
+
+  This is the equivalent of the Python expression: o[key] */
+PyAPI_FUNC(PyObject *) PyObject_GetItem(PyObject *o, PyObject *key);
+
+
+/* Map the object 'key' to the value 'v' into 'o'.
+
+   Raise an exception and return -1 on failure; return 0 on success.
+
+   This is the equivalent of the Python statement: o[key]=v. */
+PyAPI_FUNC(int) PyObject_SetItem(PyObject *o, PyObject *key, PyObject *v);
+
+/* Remove the mapping for the string 'key' from the object 'o'.
+   Returns -1 on failure.
+
+   This is equivalent to the Python statement: del o[key]. */
+PyAPI_FUNC(int) PyObject_DelItemString(PyObject *o, const char *key);
+
+/* Delete the mapping for the object 'key' from the object 'o'.
+   Returns -1 on failure.
+
+   This is the equivalent of the Python statement: del o[key]. */
+PyAPI_FUNC(int) PyObject_DelItem(PyObject *o, PyObject *key);
+
+
+/* === Old Buffer API ============================================ */
+
+/* FIXME:  usage of these should all be replaced in Python itself
+   but for backwards compatibility we will implement them.
+   Their usage without a corresponding "unlock" mechanism
+   may create issues (but they would already be there). */
+
+/* Takes an arbitrary object which must support the (character, single segment)
+   buffer interface and returns a pointer to a read-only memory location
+   usable as character based input for subsequent processing.
+
+   Return 0 on success.  buffer and buffer_len are only set in case no error
+   occurs. Otherwise, -1 is returned and an exception set. */
+Py_DEPRECATED(3.0)
+PyAPI_FUNC(int) PyObject_AsCharBuffer(PyObject *obj,
+                                      const char **buffer,
+                                      Py_ssize_t *buffer_len);
+
+/* Checks whether an arbitrary object supports the (character, single segment)
+   buffer interface.
+
+   Returns 1 on success, 0 on failure. */
+Py_DEPRECATED(3.0) PyAPI_FUNC(int) PyObject_CheckReadBuffer(PyObject *obj);
+
+/* Same as PyObject_AsCharBuffer() except that this API expects (readable,
+   single segment) buffer interface and returns a pointer to a read-only memory
+   location which can contain arbitrary data.
+
+   0 is returned on success.  buffer and buffer_len are only set in case no
+   error occurs.  Otherwise, -1 is returned and an exception set. */
+Py_DEPRECATED(3.0)
+PyAPI_FUNC(int) PyObject_AsReadBuffer(PyObject *obj,
+                                      const void **buffer,
+                                      Py_ssize_t *buffer_len);
+
+/* Takes an arbitrary object which must support the (writable, single segment)
+   buffer interface and returns a pointer to a writable memory location in
+   buffer of size 'buffer_len'.
+
+   Return 0 on success.  buffer and buffer_len are only set in case no error
+   occurs. Otherwise, -1 is returned and an exception set. */
+Py_DEPRECATED(3.0)
+PyAPI_FUNC(int) PyObject_AsWriteBuffer(PyObject *obj,
+                                       void **buffer,
+                                       Py_ssize_t *buffer_len);
+
+
+/* === New Buffer API ============================================ */
+
+/* Takes an arbitrary object and returns the result of calling
+   obj.__format__(format_spec). */
+PyAPI_FUNC(PyObject *) PyObject_Format(PyObject *obj,
+                                       PyObject *format_spec);
+
+
+/* ==== Iterators ================================================ */
+
+/* Takes an object and returns an iterator for it.
+   This is typically a new iterator but if the argument is an iterator, this
+   returns itself. */
+PyAPI_FUNC(PyObject *) PyObject_GetIter(PyObject *);
+
+/* Takes an AsyncIterable object and returns an AsyncIterator for it.
+   This is typically a new iterator but if the argument is an AsyncIterator,
+   this returns itself. */
+PyAPI_FUNC(PyObject *) PyObject_GetAIter(PyObject *);
+
+/* Returns non-zero if the object 'obj' provides iterator protocols, and 0 otherwise.
+
+   This function always succeeds. */
+PyAPI_FUNC(int) PyIter_Check(PyObject *);
+
+/* Returns non-zero if the object 'obj' provides AsyncIterator protocols, and 0 otherwise.
+
+   This function always succeeds. */
+PyAPI_FUNC(int) PyAIter_Check(PyObject *);
+
+/* Takes an iterator object and calls its tp_iternext slot,
+   returning the next value.
+
+   If the iterator is exhausted, this returns NULL without setting an
+   exception.
+
+   NULL with an exception means an error occurred. */
+PyAPI_FUNC(PyObject *) PyIter_Next(PyObject *);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
+
+/* Takes generator, coroutine or iterator object and sends the value into it.
+   Returns:
+   - PYGEN_RETURN (0) if generator has returned.
+     'result' parameter is filled with return value
+   - PYGEN_ERROR (-1) if exception was raised.
+     'result' parameter is NULL
+   - PYGEN_NEXT (1) if generator has yielded.
+     'result' parameter is filled with yielded value. */
+PyAPI_FUNC(PySendResult) PyIter_Send(PyObject *, PyObject *, PyObject **);
+#endif
+
+
+/* === Number Protocol ================================================== */
+
+/* Returns 1 if the object 'o' provides numeric protocols, and 0 otherwise.
+
+   This function always succeeds. */
+PyAPI_FUNC(int) PyNumber_Check(PyObject *o);
+
+/* Returns the result of adding o1 and o2, or NULL on failure.
+
+   This is the equivalent of the Python expression: o1 + o2. */
+PyAPI_FUNC(PyObject *) PyNumber_Add(PyObject *o1, PyObject *o2);
+
+/* Returns the result of subtracting o2 from o1, or NULL on failure.
+
+   This is the equivalent of the Python expression: o1 - o2. */
+PyAPI_FUNC(PyObject *) PyNumber_Subtract(PyObject *o1, PyObject *o2);
+
+/* Returns the result of multiplying o1 and o2, or NULL on failure.
+
+   This is the equivalent of the Python expression: o1 * o2. */
+PyAPI_FUNC(PyObject *) PyNumber_Multiply(PyObject *o1, PyObject *o2);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+/* This is the equivalent of the Python expression: o1 @ o2. */
+PyAPI_FUNC(PyObject *) PyNumber_MatrixMultiply(PyObject *o1, PyObject *o2);
+#endif
+
+/* Returns the result of dividing o1 by o2 giving an integral result,
+   or NULL on failure.
+
+   This is the equivalent of the Python expression: o1 // o2. */
+PyAPI_FUNC(PyObject *) PyNumber_FloorDivide(PyObject *o1, PyObject *o2);
+
+/* Returns the result of dividing o1 by o2 giving a float result, or NULL on
+   failure.
+
+   This is the equivalent of the Python expression: o1 / o2. */
+PyAPI_FUNC(PyObject *) PyNumber_TrueDivide(PyObject *o1, PyObject *o2);
+
+/* Returns the remainder of dividing o1 by o2, or NULL on failure.
+
+   This is the equivalent of the Python expression: o1 % o2. */
+PyAPI_FUNC(PyObject *) PyNumber_Remainder(PyObject *o1, PyObject *o2);
+
+/* See the built-in function divmod.
+
+   Returns NULL on failure.
+
+   This is the equivalent of the Python expression: divmod(o1, o2). */
+PyAPI_FUNC(PyObject *) PyNumber_Divmod(PyObject *o1, PyObject *o2);
+
+/* See the built-in function pow. Returns NULL on failure.
+
+   This is the equivalent of the Python expression: pow(o1, o2, o3),
+   where o3 is optional. */
+PyAPI_FUNC(PyObject *) PyNumber_Power(PyObject *o1, PyObject *o2,
+                                      PyObject *o3);
+
+/* Returns the negation of o on success, or NULL on failure.
+
+ This is the equivalent of the Python expression: -o. */
+PyAPI_FUNC(PyObject *) PyNumber_Negative(PyObject *o);
+
+/* Returns the positive of o on success, or NULL on failure.
+
+   This is the equivalent of the Python expression: +o. */
+PyAPI_FUNC(PyObject *) PyNumber_Positive(PyObject *o);
+
+/* Returns the absolute value of 'o', or NULL on failure.
+
+   This is the equivalent of the Python expression: abs(o). */
+PyAPI_FUNC(PyObject *) PyNumber_Absolute(PyObject *o);
+
+/* Returns the bitwise negation of 'o' on success, or NULL on failure.
+
+   This is the equivalent of the Python expression: ~o. */
+PyAPI_FUNC(PyObject *) PyNumber_Invert(PyObject *o);
+
+/* Returns the result of left shifting o1 by o2 on success, or NULL on failure.
+
+   This is the equivalent of the Python expression: o1 << o2. */
+PyAPI_FUNC(PyObject *) PyNumber_Lshift(PyObject *o1, PyObject *o2);
+
+/* Returns the result of right shifting o1 by o2 on success, or NULL on
+   failure.
+
+   This is the equivalent of the Python expression: o1 >> o2. */
+PyAPI_FUNC(PyObject *) PyNumber_Rshift(PyObject *o1, PyObject *o2);
+
+/* Returns the result of bitwise and of o1 and o2 on success, or NULL on
+   failure.
+
+   This is the equivalent of the Python expression: o1 & o2. */
+PyAPI_FUNC(PyObject *) PyNumber_And(PyObject *o1, PyObject *o2);
+
+/* Returns the bitwise exclusive or of o1 by o2 on success, or NULL on failure.
+
+   This is the equivalent of the Python expression: o1 ^ o2. */
+PyAPI_FUNC(PyObject *) PyNumber_Xor(PyObject *o1, PyObject *o2);
+
+/* Returns the result of bitwise or on o1 and o2 on success, or NULL on
+   failure.
+
+   This is the equivalent of the Python expression: o1 | o2. */
+PyAPI_FUNC(PyObject *) PyNumber_Or(PyObject *o1, PyObject *o2);
+
+/* Returns 1 if obj is an index integer (has the nb_index slot of the
+   tp_as_number structure filled in), and 0 otherwise. */
+PyAPI_FUNC(int) PyIndex_Check(PyObject *);
+
+/* Returns the object 'o' converted to a Python int, or NULL with an exception
+   raised on failure. */
+PyAPI_FUNC(PyObject *) PyNumber_Index(PyObject *o);
+
+/* Returns the object 'o' converted to Py_ssize_t by going through
+   PyNumber_Index() first.
+
+   If an overflow error occurs while converting the int to Py_ssize_t, then the
+   second argument 'exc' is the error-type to return.  If it is NULL, then the
+   overflow error is cleared and the value is clipped. */
+PyAPI_FUNC(Py_ssize_t) PyNumber_AsSsize_t(PyObject *o, PyObject *exc);
+
+/* Returns the object 'o' converted to an integer object on success, or NULL
+   on failure.
+
+   This is the equivalent of the Python expression: int(o). */
+PyAPI_FUNC(PyObject *) PyNumber_Long(PyObject *o);
+
+/* Returns the object 'o' converted to a float object on success, or NULL
+  on failure.
+
+  This is the equivalent of the Python expression: float(o). */
+PyAPI_FUNC(PyObject *) PyNumber_Float(PyObject *o);
+
+
+/* --- In-place variants of (some of) the above number protocol functions -- */
+
+/* Returns the result of adding o2 to o1, possibly in-place, or NULL
+   on failure.
+
+   This is the equivalent of the Python expression: o1 += o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceAdd(PyObject *o1, PyObject *o2);
+
+/* Returns the result of subtracting o2 from o1, possibly in-place or
+   NULL on failure.
+
+   This is the equivalent of the Python expression: o1 -= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceSubtract(PyObject *o1, PyObject *o2);
+
+/* Returns the result of multiplying o1 by o2, possibly in-place, or NULL on
+   failure.
+
+   This is the equivalent of the Python expression: o1 *= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceMultiply(PyObject *o1, PyObject *o2);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+/* This is the equivalent of the Python expression: o1 @= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceMatrixMultiply(PyObject *o1, PyObject *o2);
+#endif
+
+/* Returns the result of dividing o1 by o2 giving an integral result, possibly
+   in-place, or NULL on failure.
+
+   This is the equivalent of the Python expression: o1 //= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceFloorDivide(PyObject *o1,
+                                                   PyObject *o2);
+
+/* Returns the result of dividing o1 by o2 giving a float result, possibly
+   in-place, or NULL on failure.
+
+   This is the equivalent of the Python expression: o1 /= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceTrueDivide(PyObject *o1,
+                                                  PyObject *o2);
+
+/* Returns the remainder of dividing o1 by o2, possibly in-place, or NULL on
+   failure.
+
+   This is the equivalent of the Python expression: o1 %= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceRemainder(PyObject *o1, PyObject *o2);
+
+/* Returns the result of raising o1 to the power of o2, possibly in-place,
+   or NULL on failure.
+
+   This is the equivalent of the Python expression: o1 **= o2,
+   or o1 = pow(o1, o2, o3) if o3 is present. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlacePower(PyObject *o1, PyObject *o2,
+                                             PyObject *o3);
+
+/* Returns the result of left shifting o1 by o2, possibly in-place, or NULL
+   on failure.
+
+   This is the equivalent of the Python expression: o1 <<= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceLshift(PyObject *o1, PyObject *o2);
+
+/* Returns the result of right shifting o1 by o2, possibly in-place or NULL
+   on failure.
+
+   This is the equivalent of the Python expression: o1 >>= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceRshift(PyObject *o1, PyObject *o2);
+
+/* Returns the result of bitwise and of o1 and o2, possibly in-place, or NULL
+   on failure.
+
+   This is the equivalent of the Python expression: o1 &= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceAnd(PyObject *o1, PyObject *o2);
+
+/* Returns the bitwise exclusive or of o1 by o2, possibly in-place, or NULL
+   on failure.
+
+   This is the equivalent of the Python expression: o1 ^= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceXor(PyObject *o1, PyObject *o2);
+
+/* Returns the result of bitwise or of o1 and o2, possibly in-place,
+   or NULL on failure.
+
+   This is the equivalent of the Python expression: o1 |= o2. */
+PyAPI_FUNC(PyObject *) PyNumber_InPlaceOr(PyObject *o1, PyObject *o2);
+
+/* Returns the integer n converted to a string with a base, with a base
+   marker of 0b, 0o or 0x prefixed if applicable.
+
+   If n is not an int object, it is converted with PyNumber_Index first. */
+PyAPI_FUNC(PyObject *) PyNumber_ToBase(PyObject *n, int base);
+
+
+/* === Sequence protocol ================================================ */
+
+/* Return 1 if the object provides sequence protocol, and zero
+   otherwise.
+
+   This function always succeeds. */
+PyAPI_FUNC(int) PySequence_Check(PyObject *o);
+
+/* Return the size of sequence object o, or -1 on failure. */
+PyAPI_FUNC(Py_ssize_t) PySequence_Size(PyObject *o);
+
+/* For DLL compatibility */
+#undef PySequence_Length
+PyAPI_FUNC(Py_ssize_t) PySequence_Length(PyObject *o);
+#define PySequence_Length PySequence_Size
+
+
+/* Return the concatenation of o1 and o2 on success, and NULL on failure.
+
+   This is the equivalent of the Python expression: o1 + o2. */
+PyAPI_FUNC(PyObject *) PySequence_Concat(PyObject *o1, PyObject *o2);
+
+/* Return the result of repeating sequence object 'o' 'count' times,
+  or NULL on failure.
+
+  This is the equivalent of the Python expression: o * count. */
+PyAPI_FUNC(PyObject *) PySequence_Repeat(PyObject *o, Py_ssize_t count);
+
+/* Return the ith element of o, or NULL on failure.
+
+   This is the equivalent of the Python expression: o[i]. */
+PyAPI_FUNC(PyObject *) PySequence_GetItem(PyObject *o, Py_ssize_t i);
+
+/* Return the slice of sequence object o between i1 and i2, or NULL on failure.
+
+   This is the equivalent of the Python expression: o[i1:i2]. */
+PyAPI_FUNC(PyObject *) PySequence_GetSlice(PyObject *o, Py_ssize_t i1, Py_ssize_t i2);
+
+/* Assign object 'v' to the ith element of the sequence 'o'. Raise an exception
+   and return -1 on failure; return 0 on success.
+
+   This is the equivalent of the Python statement o[i] = v. */
+PyAPI_FUNC(int) PySequence_SetItem(PyObject *o, Py_ssize_t i, PyObject *v);
+
+/* Delete the 'i'-th element of the sequence 'o'. Returns -1 on failure.
+
+   This is the equivalent of the Python statement: del o[i]. */
+PyAPI_FUNC(int) PySequence_DelItem(PyObject *o, Py_ssize_t i);
+
+/* Assign the sequence object 'v' to the slice in sequence object 'o',
+   from 'i1' to 'i2'. Returns -1 on failure.
+
+   This is the equivalent of the Python statement: o[i1:i2] = v. */
+PyAPI_FUNC(int) PySequence_SetSlice(PyObject *o, Py_ssize_t i1, Py_ssize_t i2,
+                                    PyObject *v);
+
+/* Delete the slice in sequence object 'o' from 'i1' to 'i2'.
+   Returns -1 on failure.
+
+   This is the equivalent of the Python statement: del o[i1:i2]. */
+PyAPI_FUNC(int) PySequence_DelSlice(PyObject *o, Py_ssize_t i1, Py_ssize_t i2);
+
+/* Returns the sequence 'o' as a tuple on success, and NULL on failure.
+
+   This is equivalent to the Python expression: tuple(o). */
+PyAPI_FUNC(PyObject *) PySequence_Tuple(PyObject *o);
+
+/* Returns the sequence 'o' as a list on success, and NULL on failure.
+   This is equivalent to the Python expression: list(o) */
+PyAPI_FUNC(PyObject *) PySequence_List(PyObject *o);
+
+/* Return the sequence 'o' as a list, unless it's already a tuple or list.
+
+   Use PySequence_Fast_GET_ITEM to access the members of this list, and
+   PySequence_Fast_GET_SIZE to get its length.
+
+   Returns NULL on failure.  If the object does not support iteration, raises a
+   TypeError exception with 'm' as the message text. */
+PyAPI_FUNC(PyObject *) PySequence_Fast(PyObject *o, const char* m);
+
+/* Return the size of the sequence 'o', assuming that 'o' was returned by
+   PySequence_Fast and is not NULL. */
+#define PySequence_Fast_GET_SIZE(o) \
+    (PyList_Check(o) ? PyList_GET_SIZE(o) : PyTuple_GET_SIZE(o))
+
+/* Return the 'i'-th element of the sequence 'o', assuming that o was returned
+   by PySequence_Fast, and that i is within bounds. */
+#define PySequence_Fast_GET_ITEM(o, i)\
+     (PyList_Check(o) ? PyList_GET_ITEM((o), (i)) : PyTuple_GET_ITEM((o), (i)))
+
+/* Return a pointer to the underlying item array for
+   an object returned by PySequence_Fast */
+#define PySequence_Fast_ITEMS(sf) \
+    (PyList_Check(sf) ? ((PyListObject *)(sf))->ob_item \
+                      : ((PyTupleObject *)(sf))->ob_item)
+
+/* Return the number of occurrences of 'value' in 'o', that is, return
+   the number of keys for which o[key] == value.
+
+   On failure, return -1.  This is equivalent to the Python expression:
+   o.count(value). */
+PyAPI_FUNC(Py_ssize_t) PySequence_Count(PyObject *o, PyObject *value);
+
+/* Return 1 if 'ob' is in the sequence 'seq'; 0 if 'ob' is not in the sequence
+   'seq'; -1 on error.
+
+   Use __contains__ if possible, else _PySequence_IterSearch(). */
+PyAPI_FUNC(int) PySequence_Contains(PyObject *seq, PyObject *ob);
+
+/* For DLL-level backwards compatibility */
+#undef PySequence_In
+/* Determine if the sequence 'o' contains 'value'. If an item in 'o' is equal
+   to 'value', return 1, otherwise return 0. On error, return -1.
+
+   This is equivalent to the Python expression: value in o. */
+PyAPI_FUNC(int) PySequence_In(PyObject *o, PyObject *value);
+
+/* For source-level backwards compatibility */
+#define PySequence_In PySequence_Contains
+
+
+/* Return the first index for which o[i] == value.
+   On error, return -1.
+
+   This is equivalent to the Python expression: o.index(value). */
+PyAPI_FUNC(Py_ssize_t) PySequence_Index(PyObject *o, PyObject *value);
+
+
+/* --- In-place versions of some of the above Sequence functions --- */
+
+/* Append sequence 'o2' to sequence 'o1', in-place when possible. Return the
+   resulting object, which could be 'o1', or NULL on failure.
+
+  This is the equivalent of the Python expression: o1 += o2. */
+PyAPI_FUNC(PyObject *) PySequence_InPlaceConcat(PyObject *o1, PyObject *o2);
+
+/* Repeat sequence 'o' by 'count', in-place when possible. Return the resulting
+   object, which could be 'o', or NULL on failure.
+
+   This is the equivalent of the Python expression: o *= count.  */
+PyAPI_FUNC(PyObject *) PySequence_InPlaceRepeat(PyObject *o, Py_ssize_t count);
+
+
+/* === Mapping protocol ================================================= */
+
+/* Return 1 if the object provides mapping protocol, and 0 otherwise.
+
+   This function always succeeds. */
+PyAPI_FUNC(int) PyMapping_Check(PyObject *o);
+
+/* Returns the number of keys in mapping object 'o' on success, and -1 on
+  failure. This is equivalent to the Python expression: len(o). */
+PyAPI_FUNC(Py_ssize_t) PyMapping_Size(PyObject *o);
+
+/* For DLL compatibility */
+#undef PyMapping_Length
+PyAPI_FUNC(Py_ssize_t) PyMapping_Length(PyObject *o);
+#define PyMapping_Length PyMapping_Size
+
+
+/* Implemented as a macro:
+
+   int PyMapping_DelItemString(PyObject *o, const char *key);
+
+   Remove the mapping for the string 'key' from the mapping 'o'. Returns -1 on
+   failure.
+
+   This is equivalent to the Python statement: del o[key]. */
+#define PyMapping_DelItemString(O, K) PyObject_DelItemString((O), (K))
+
+/* Implemented as a macro:
+
+   int PyMapping_DelItem(PyObject *o, PyObject *key);
+
+   Remove the mapping for the object 'key' from the mapping object 'o'.
+   Returns -1 on failure.
+
+   This is equivalent to the Python statement: del o[key]. */
+#define PyMapping_DelItem(O, K) PyObject_DelItem((O), (K))
+
+/* On success, return 1 if the mapping object 'o' has the key 'key',
+   and 0 otherwise.
+
+   This is equivalent to the Python expression: key in o.
+
+   This function always succeeds. */
+PyAPI_FUNC(int) PyMapping_HasKeyString(PyObject *o, const char *key);
+
+/* Return 1 if the mapping object has the key 'key', and 0 otherwise.
+
+   This is equivalent to the Python expression: key in o.
+
+   This function always succeeds. */
+PyAPI_FUNC(int) PyMapping_HasKey(PyObject *o, PyObject *key);
+
+/* On success, return a list or tuple of the keys in mapping object 'o'.
+   On failure, return NULL. */
+PyAPI_FUNC(PyObject *) PyMapping_Keys(PyObject *o);
+
+/* On success, return a list or tuple of the values in mapping object 'o'.
+   On failure, return NULL. */
+PyAPI_FUNC(PyObject *) PyMapping_Values(PyObject *o);
+
+/* On success, return a list or tuple of the items in mapping object 'o',
+   where each item is a tuple containing a key-value pair. On failure, return
+   NULL. */
+PyAPI_FUNC(PyObject *) PyMapping_Items(PyObject *o);
+
+/* Return element of 'o' corresponding to the string 'key' or NULL on failure.
+
+   This is the equivalent of the Python expression: o[key]. */
+PyAPI_FUNC(PyObject *) PyMapping_GetItemString(PyObject *o,
+                                               const char *key);
+
+/* Map the string 'key' to the value 'v' in the mapping 'o'.
+   Returns -1 on failure.
+
+   This is the equivalent of the Python statement: o[key]=v. */
+PyAPI_FUNC(int) PyMapping_SetItemString(PyObject *o, const char *key,
+                                        PyObject *value);
+
+/* isinstance(object, typeorclass) */
+PyAPI_FUNC(int) PyObject_IsInstance(PyObject *object, PyObject *typeorclass);
+
+/* issubclass(object, typeorclass) */
+PyAPI_FUNC(int) PyObject_IsSubclass(PyObject *object, PyObject *typeorclass);
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_ABSTRACTOBJECT_H
+#  include "cpython/abstract.h"
+#  undef Py_CPYTHON_ABSTRACTOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* Py_ABSTRACTOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/bltinmodule.h b/nanvix-port/cpython-headers/python3.12/bltinmodule.h
new file mode 100644
index 000000000000..868c9e6443bf
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/bltinmodule.h
@@ -0,0 +1,14 @@
+#ifndef Py_BLTINMODULE_H
+#define Py_BLTINMODULE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_DATA(PyTypeObject) PyFilter_Type;
+PyAPI_DATA(PyTypeObject) PyMap_Type;
+PyAPI_DATA(PyTypeObject) PyZip_Type;
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_BLTINMODULE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/boolobject.h b/nanvix-port/cpython-headers/python3.12/boolobject.h
new file mode 100644
index 000000000000..19aef5b1b87c
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/boolobject.h
@@ -0,0 +1,42 @@
+/* Boolean object interface */
+
+#ifndef Py_BOOLOBJECT_H
+#define Py_BOOLOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+// PyBool_Type is declared by object.h
+
+#define PyBool_Check(x) Py_IS_TYPE((x), &PyBool_Type)
+
+/* Py_False and Py_True are the only two bools in existence. */
+
+/* Don't use these directly */
+PyAPI_DATA(PyLongObject) _Py_FalseStruct;
+PyAPI_DATA(PyLongObject) _Py_TrueStruct;
+
+/* Use these macros */
+#define Py_False _PyObject_CAST(&_Py_FalseStruct)
+#define Py_True _PyObject_CAST(&_Py_TrueStruct)
+
+// Test if an object is the True singleton, the same as "x is True" in Python.
+PyAPI_FUNC(int) Py_IsTrue(PyObject *x);
+#define Py_IsTrue(x) Py_Is((x), Py_True)
+
+// Test if an object is the False singleton, the same as "x is False" in Python.
+PyAPI_FUNC(int) Py_IsFalse(PyObject *x);
+#define Py_IsFalse(x) Py_Is((x), Py_False)
+
+/* Macros for returning Py_True or Py_False, respectively */
+#define Py_RETURN_TRUE return Py_True
+#define Py_RETURN_FALSE return Py_False
+
+/* Function to return a bool from a C long */
+PyAPI_FUNC(PyObject *) PyBool_FromLong(long);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_BOOLOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/bytearrayobject.h b/nanvix-port/cpython-headers/python3.12/bytearrayobject.h
new file mode 100644
index 000000000000..3d53fdba6432
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/bytearrayobject.h
@@ -0,0 +1,44 @@
+/* ByteArray object interface */
+
+#ifndef Py_BYTEARRAYOBJECT_H
+#define Py_BYTEARRAYOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Type PyByteArrayObject represents a mutable array of bytes.
+ * The Python API is that of a sequence;
+ * the bytes are mapped to ints in [0, 256).
+ * Bytes are not characters; they may be used to encode characters.
+ * The only way to go between bytes and str/unicode is via encoding
+ * and decoding.
+ * For the convenience of C programmers, the bytes type is considered
+ * to contain a char pointer, not an unsigned char pointer.
+ */
+
+/* Type object */
+PyAPI_DATA(PyTypeObject) PyByteArray_Type;
+PyAPI_DATA(PyTypeObject) PyByteArrayIter_Type;
+
+/* Type check macros */
+#define PyByteArray_Check(self) PyObject_TypeCheck((self), &PyByteArray_Type)
+#define PyByteArray_CheckExact(self) Py_IS_TYPE((self), &PyByteArray_Type)
+
+/* Direct API functions */
+PyAPI_FUNC(PyObject *) PyByteArray_FromObject(PyObject *);
+PyAPI_FUNC(PyObject *) PyByteArray_Concat(PyObject *, PyObject *);
+PyAPI_FUNC(PyObject *) PyByteArray_FromStringAndSize(const char *, Py_ssize_t);
+PyAPI_FUNC(Py_ssize_t) PyByteArray_Size(PyObject *);
+PyAPI_FUNC(char *) PyByteArray_AsString(PyObject *);
+PyAPI_FUNC(int) PyByteArray_Resize(PyObject *, Py_ssize_t);
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_BYTEARRAYOBJECT_H
+#  include "cpython/bytearrayobject.h"
+#  undef Py_CPYTHON_BYTEARRAYOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_BYTEARRAYOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/bytesobject.h b/nanvix-port/cpython-headers/python3.12/bytesobject.h
new file mode 100644
index 000000000000..ee448cd02bda
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/bytesobject.h
@@ -0,0 +1,69 @@
+
+/* Bytes object interface */
+
+#ifndef Py_BYTESOBJECT_H
+#define Py_BYTESOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdarg.h>               // va_list
+
+/*
+Type PyBytesObject represents a byte string.  An extra zero byte is
+reserved at the end to ensure it is zero-terminated, but a size is
+present so strings with null bytes in them can be represented.  This
+is an immutable object type.
+
+There are functions to create new bytes objects, to test
+an object for bytes-ness, and to get the
+byte string value.  The latter function returns a null pointer
+if the object is not of the proper type.
+There is a variant that takes an explicit size as well as a
+variant that assumes a zero-terminated string.  Note that none of the
+functions should be applied to NULL pointer.
+*/
+
+PyAPI_DATA(PyTypeObject) PyBytes_Type;
+PyAPI_DATA(PyTypeObject) PyBytesIter_Type;
+
+#define PyBytes_Check(op) \
+                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_BYTES_SUBCLASS)
+#define PyBytes_CheckExact(op) Py_IS_TYPE((op), &PyBytes_Type)
+
+PyAPI_FUNC(PyObject *) PyBytes_FromStringAndSize(const char *, Py_ssize_t);
+PyAPI_FUNC(PyObject *) PyBytes_FromString(const char *);
+PyAPI_FUNC(PyObject *) PyBytes_FromObject(PyObject *);
+PyAPI_FUNC(PyObject *) PyBytes_FromFormatV(const char*, va_list)
+                                Py_GCC_ATTRIBUTE((format(printf, 1, 0)));
+PyAPI_FUNC(PyObject *) PyBytes_FromFormat(const char*, ...)
+                                Py_GCC_ATTRIBUTE((format(printf, 1, 2)));
+PyAPI_FUNC(Py_ssize_t) PyBytes_Size(PyObject *);
+PyAPI_FUNC(char *) PyBytes_AsString(PyObject *);
+PyAPI_FUNC(PyObject *) PyBytes_Repr(PyObject *, int);
+PyAPI_FUNC(void) PyBytes_Concat(PyObject **, PyObject *);
+PyAPI_FUNC(void) PyBytes_ConcatAndDel(PyObject **, PyObject *);
+PyAPI_FUNC(PyObject *) PyBytes_DecodeEscape(const char *, Py_ssize_t,
+                                            const char *, Py_ssize_t,
+                                            const char *);
+
+/* Provides access to the internal data buffer and size of a bytes object.
+   Passing NULL as len parameter will force the string buffer to be
+   0-terminated (passing a string with embedded NUL characters will
+   cause an exception).  */
+PyAPI_FUNC(int) PyBytes_AsStringAndSize(
+    PyObject *obj,      /* bytes object */
+    char **s,           /* pointer to buffer variable */
+    Py_ssize_t *len     /* pointer to length variable or NULL */
+    );
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_BYTESOBJECT_H
+#  include "cpython/bytesobject.h"
+#  undef Py_CPYTHON_BYTESOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_BYTESOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/ceval.h b/nanvix-port/cpython-headers/python3.12/ceval.h
new file mode 100644
index 000000000000..ad4d909d6f2b
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/ceval.h
@@ -0,0 +1,168 @@
+/* Interface to random parts in ceval.c */
+
+#ifndef Py_CEVAL_H
+#define Py_CEVAL_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+PyAPI_FUNC(PyObject *) PyEval_EvalCode(PyObject *, PyObject *, PyObject *);
+
+PyAPI_FUNC(PyObject *) PyEval_EvalCodeEx(PyObject *co,
+                                         PyObject *globals,
+                                         PyObject *locals,
+                                         PyObject *const *args, int argc,
+                                         PyObject *const *kwds, int kwdc,
+                                         PyObject *const *defs, int defc,
+                                         PyObject *kwdefs, PyObject *closure);
+
+/* PyEval_CallObjectWithKeywords(), PyEval_CallObject(), PyEval_CallFunction
+ * and PyEval_CallMethod are deprecated. Since they are officially part of the
+ * stable ABI (PEP 384), they must be kept for backward compatibility.
+ * PyObject_Call(), PyObject_CallFunction() and PyObject_CallMethod() are
+ * recommended to call a callable object.
+ */
+
+Py_DEPRECATED(3.9) PyAPI_FUNC(PyObject *) PyEval_CallObjectWithKeywords(
+    PyObject *callable,
+    PyObject *args,
+    PyObject *kwargs);
+
+/* Deprecated since PyEval_CallObjectWithKeywords is deprecated */
+#define PyEval_CallObject(callable, arg) \
+    PyEval_CallObjectWithKeywords((callable), (arg), _PyObject_CAST(_Py_NULL))
+
+Py_DEPRECATED(3.9) PyAPI_FUNC(PyObject *) PyEval_CallFunction(
+    PyObject *callable, const char *format, ...);
+Py_DEPRECATED(3.9) PyAPI_FUNC(PyObject *) PyEval_CallMethod(
+    PyObject *obj, const char *name, const char *format, ...);
+
+PyAPI_FUNC(PyObject *) PyEval_GetBuiltins(void);
+PyAPI_FUNC(PyObject *) PyEval_GetGlobals(void);
+PyAPI_FUNC(PyObject *) PyEval_GetLocals(void);
+PyAPI_FUNC(PyFrameObject *) PyEval_GetFrame(void);
+
+PyAPI_FUNC(int) Py_AddPendingCall(int (*func)(void *), void *arg);
+PyAPI_FUNC(int) Py_MakePendingCalls(void);
+
+/* Protection against deeply nested recursive calls
+
+   In Python 3.0, this protection has two levels:
+   * normal anti-recursion protection is triggered when the recursion level
+     exceeds the current recursion limit. It raises a RecursionError, and sets
+     the "overflowed" flag in the thread state structure. This flag
+     temporarily *disables* the normal protection; this allows cleanup code
+     to potentially outgrow the recursion limit while processing the
+     RecursionError.
+   * "last chance" anti-recursion protection is triggered when the recursion
+     level exceeds "current recursion limit + 50". By construction, this
+     protection can only be triggered when the "overflowed" flag is set. It
+     means the cleanup code has itself gone into an infinite loop, or the
+     RecursionError has been mistakenly ignored. When this protection is
+     triggered, the interpreter aborts with a Fatal Error.
+
+   In addition, the "overflowed" flag is automatically reset when the
+   recursion level drops below "current recursion limit - 50". This heuristic
+   is meant to ensure that the normal anti-recursion protection doesn't get
+   disabled too long.
+
+   Please note: this scheme has its own limitations. See:
+   http://mail.python.org/pipermail/python-dev/2008-August/082106.html
+   for some observations.
+*/
+PyAPI_FUNC(void) Py_SetRecursionLimit(int);
+PyAPI_FUNC(int) Py_GetRecursionLimit(void);
+
+PyAPI_FUNC(int) Py_EnterRecursiveCall(const char *where);
+PyAPI_FUNC(void) Py_LeaveRecursiveCall(void);
+
+PyAPI_FUNC(const char *) PyEval_GetFuncName(PyObject *);
+PyAPI_FUNC(const char *) PyEval_GetFuncDesc(PyObject *);
+
+PyAPI_FUNC(PyObject *) PyEval_EvalFrame(PyFrameObject *);
+PyAPI_FUNC(PyObject *) PyEval_EvalFrameEx(PyFrameObject *f, int exc);
+
+/* Interface for threads.
+
+   A module that plans to do a blocking system call (or something else
+   that lasts a long time and doesn't touch Python data) can allow other
+   threads to run as follows:
+
+    ...preparations here...
+    Py_BEGIN_ALLOW_THREADS
+    ...blocking system call here...
+    Py_END_ALLOW_THREADS
+    ...interpret result here...
+
+   The Py_BEGIN_ALLOW_THREADS/Py_END_ALLOW_THREADS pair expands to a
+   {}-surrounded block.
+   To leave the block in the middle (e.g., with return), you must insert
+   a line containing Py_BLOCK_THREADS before the return, e.g.
+
+    if (...premature_exit...) {
+        Py_BLOCK_THREADS
+        PyErr_SetFromErrno(PyExc_OSError);
+        return NULL;
+    }
+
+   An alternative is:
+
+    Py_BLOCK_THREADS
+    if (...premature_exit...) {
+        PyErr_SetFromErrno(PyExc_OSError);
+        return NULL;
+    }
+    Py_UNBLOCK_THREADS
+
+   For convenience, the value of 'errno' is restored across
+   Py_END_ALLOW_THREADS and Py_BLOCK_THREADS.
+
+   WARNING: NEVER NEST CALLS TO Py_BEGIN_ALLOW_THREADS AND
+   Py_END_ALLOW_THREADS!!!
+
+   Note that not all candidates have yet been converted to use this
+   mechanism!
+*/
+
+PyAPI_FUNC(PyThreadState *) PyEval_SaveThread(void);
+PyAPI_FUNC(void) PyEval_RestoreThread(PyThreadState *);
+
+Py_DEPRECATED(3.9) PyAPI_FUNC(int) PyEval_ThreadsInitialized(void);
+Py_DEPRECATED(3.9) PyAPI_FUNC(void) PyEval_InitThreads(void);
+/* PyEval_AcquireLock() and PyEval_ReleaseLock() are part of stable ABI.
+ * They will be removed from this header file in a future version,
+ * but they will remain in the ABI until Python 4.0.
+ */
+Py_DEPRECATED(3.2) PyAPI_FUNC(void) PyEval_AcquireLock(void);
+Py_DEPRECATED(3.2) PyAPI_FUNC(void) PyEval_ReleaseLock(void);
+PyAPI_FUNC(void) PyEval_AcquireThread(PyThreadState *tstate);
+PyAPI_FUNC(void) PyEval_ReleaseThread(PyThreadState *tstate);
+
+#define Py_BEGIN_ALLOW_THREADS { \
+                        PyThreadState *_save; \
+                        _save = PyEval_SaveThread();
+#define Py_BLOCK_THREADS        PyEval_RestoreThread(_save);
+#define Py_UNBLOCK_THREADS      _save = PyEval_SaveThread();
+#define Py_END_ALLOW_THREADS    PyEval_RestoreThread(_save); \
+                 }
+
+/* Masks and values used by FORMAT_VALUE opcode. */
+#define FVC_MASK      0x3
+#define FVC_NONE      0x0
+#define FVC_STR       0x1
+#define FVC_REPR      0x2
+#define FVC_ASCII     0x3
+#define FVS_MASK      0x4
+#define FVS_HAVE_SPEC 0x4
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_CEVAL_H
+#  include "cpython/ceval.h"
+#  undef Py_CPYTHON_CEVAL_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_CEVAL_H */
diff --git a/nanvix-port/cpython-headers/python3.12/codecs.h b/nanvix-port/cpython-headers/python3.12/codecs.h
new file mode 100644
index 000000000000..37ecfb4ab757
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/codecs.h
@@ -0,0 +1,248 @@
+#ifndef Py_CODECREGISTRY_H
+#define Py_CODECREGISTRY_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* ------------------------------------------------------------------------
+
+   Python Codec Registry and support functions
+
+
+Written by Marc-Andre Lemburg (mal@lemburg.com).
+
+Copyright (c) Corporation for National Research Initiatives.
+
+   ------------------------------------------------------------------------ */
+
+/* Register a new codec search function.
+
+   As side effect, this tries to load the encodings package, if not
+   yet done, to make sure that it is always first in the list of
+   search functions.
+
+   The search_function's refcount is incremented by this function. */
+
+PyAPI_FUNC(int) PyCodec_Register(
+       PyObject *search_function
+       );
+
+/* Unregister a codec search function and clear the registry's cache.
+   If the search function is not registered, do nothing.
+   Return 0 on success. Raise an exception and return -1 on error. */
+
+PyAPI_FUNC(int) PyCodec_Unregister(
+       PyObject *search_function
+       );
+
+/* Codec registry lookup API.
+
+   Looks up the given encoding and returns a CodecInfo object with
+   function attributes which implement the different aspects of
+   processing the encoding.
+
+   The encoding string is looked up converted to all lower-case
+   characters. This makes encodings looked up through this mechanism
+   effectively case-insensitive.
+
+   If no codec is found, a LookupError is set and NULL returned.
+
+   As side effect, this tries to load the encodings package, if not
+   yet done. This is part of the lazy load strategy for the encodings
+   package.
+
+ */
+
+#ifndef Py_LIMITED_API
+PyAPI_FUNC(PyObject *) _PyCodec_Lookup(
+       const char *encoding
+       );
+
+PyAPI_FUNC(int) _PyCodec_Forget(
+       const char *encoding
+       );
+#endif
+
+/* Codec registry encoding check API.
+
+   Returns 1/0 depending on whether there is a registered codec for
+   the given encoding.
+
+*/
+
+PyAPI_FUNC(int) PyCodec_KnownEncoding(
+       const char *encoding
+       );
+
+/* Generic codec based encoding API.
+
+   object is passed through the encoder function found for the given
+   encoding using the error handling method defined by errors. errors
+   may be NULL to use the default method defined for the codec.
+
+   Raises a LookupError in case no encoder can be found.
+
+ */
+
+PyAPI_FUNC(PyObject *) PyCodec_Encode(
+       PyObject *object,
+       const char *encoding,
+       const char *errors
+       );
+
+/* Generic codec based decoding API.
+
+   object is passed through the decoder function found for the given
+   encoding using the error handling method defined by errors. errors
+   may be NULL to use the default method defined for the codec.
+
+   Raises a LookupError in case no decoder can be found.
+
+ */
+
+PyAPI_FUNC(PyObject *) PyCodec_Decode(
+       PyObject *object,
+       const char *encoding,
+       const char *errors
+       );
+
+#ifndef Py_LIMITED_API
+/* Text codec specific encoding and decoding API.
+
+   Checks the encoding against a list of codecs which do not
+   implement a str<->bytes encoding before attempting the
+   operation.
+
+   Please note that these APIs are internal and should not
+   be used in Python C extensions.
+
+   XXX (ncoghlan): should we make these, or something like them, public
+   in Python 3.5+?
+
+ */
+PyAPI_FUNC(PyObject *) _PyCodec_LookupTextEncoding(
+       const char *encoding,
+       const char *alternate_command
+       );
+
+PyAPI_FUNC(PyObject *) _PyCodec_EncodeText(
+       PyObject *object,
+       const char *encoding,
+       const char *errors
+       );
+
+PyAPI_FUNC(PyObject *) _PyCodec_DecodeText(
+       PyObject *object,
+       const char *encoding,
+       const char *errors
+       );
+
+/* These two aren't actually text encoding specific, but _io.TextIOWrapper
+ * is the only current API consumer.
+ */
+PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalDecoder(
+       PyObject *codec_info,
+       const char *errors
+       );
+
+PyAPI_FUNC(PyObject *) _PyCodecInfo_GetIncrementalEncoder(
+       PyObject *codec_info,
+       const char *errors
+       );
+#endif
+
+
+
+/* --- Codec Lookup APIs --------------------------------------------------
+
+   All APIs return a codec object with incremented refcount and are
+   based on _PyCodec_Lookup().  The same comments w/r to the encoding
+   name also apply to these APIs.
+
+*/
+
+/* Get an encoder function for the given encoding. */
+
+PyAPI_FUNC(PyObject *) PyCodec_Encoder(
+       const char *encoding
+       );
+
+/* Get a decoder function for the given encoding. */
+
+PyAPI_FUNC(PyObject *) PyCodec_Decoder(
+       const char *encoding
+       );
+
+/* Get an IncrementalEncoder object for the given encoding. */
+
+PyAPI_FUNC(PyObject *) PyCodec_IncrementalEncoder(
+       const char *encoding,
+       const char *errors
+       );
+
+/* Get an IncrementalDecoder object for the given encoding. */
+
+PyAPI_FUNC(PyObject *) PyCodec_IncrementalDecoder(
+       const char *encoding,
+       const char *errors
+       );
+
+/* Get a StreamReader factory function for the given encoding. */
+
+PyAPI_FUNC(PyObject *) PyCodec_StreamReader(
+       const char *encoding,
+       PyObject *stream,
+       const char *errors
+       );
+
+/* Get a StreamWriter factory function for the given encoding. */
+
+PyAPI_FUNC(PyObject *) PyCodec_StreamWriter(
+       const char *encoding,
+       PyObject *stream,
+       const char *errors
+       );
+
+/* Unicode encoding error handling callback registry API */
+
+/* Register the error handling callback function error under the given
+   name. This function will be called by the codec when it encounters
+   unencodable characters/undecodable bytes and doesn't know the
+   callback name, when name is specified as the error parameter
+   in the call to the encode/decode function.
+   Return 0 on success, -1 on error */
+PyAPI_FUNC(int) PyCodec_RegisterError(const char *name, PyObject *error);
+
+/* Lookup the error handling callback function registered under the given
+   name. As a special case NULL can be passed, in which case
+   the error handling callback for "strict" will be returned. */
+PyAPI_FUNC(PyObject *) PyCodec_LookupError(const char *name);
+
+/* raise exc as an exception */
+PyAPI_FUNC(PyObject *) PyCodec_StrictErrors(PyObject *exc);
+
+/* ignore the unicode error, skipping the faulty input */
+PyAPI_FUNC(PyObject *) PyCodec_IgnoreErrors(PyObject *exc);
+
+/* replace the unicode encode error with ? or U+FFFD */
+PyAPI_FUNC(PyObject *) PyCodec_ReplaceErrors(PyObject *exc);
+
+/* replace the unicode encode error with XML character references */
+PyAPI_FUNC(PyObject *) PyCodec_XMLCharRefReplaceErrors(PyObject *exc);
+
+/* replace the unicode encode error with backslash escapes (\x, \u and \U) */
+PyAPI_FUNC(PyObject *) PyCodec_BackslashReplaceErrors(PyObject *exc);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+/* replace the unicode encode error with backslash escapes (\N, \x, \u and \U) */
+PyAPI_FUNC(PyObject *) PyCodec_NameReplaceErrors(PyObject *exc);
+#endif
+
+#ifndef Py_LIMITED_API
+PyAPI_DATA(const char *) Py_hexdigits;
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_CODECREGISTRY_H */
diff --git a/nanvix-port/cpython-headers/python3.12/compile.h b/nanvix-port/cpython-headers/python3.12/compile.h
new file mode 100644
index 000000000000..52d0bc76c9fc
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/compile.h
@@ -0,0 +1,22 @@
+#ifndef Py_COMPILE_H
+#define Py_COMPILE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* These definitions must match corresponding definitions in graminit.h. */
+#define Py_single_input 256
+#define Py_file_input 257
+#define Py_eval_input 258
+#define Py_func_type_input 345
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_COMPILE_H
+#  include "cpython/compile.h"
+#  undef Py_CPYTHON_COMPILE_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_COMPILE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/complexobject.h b/nanvix-port/cpython-headers/python3.12/complexobject.h
new file mode 100644
index 000000000000..ebe49a832f74
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/complexobject.h
@@ -0,0 +1,30 @@
+/* Complex number structure */
+
+#ifndef Py_COMPLEXOBJECT_H
+#define Py_COMPLEXOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Complex object interface */
+
+PyAPI_DATA(PyTypeObject) PyComplex_Type;
+
+#define PyComplex_Check(op) PyObject_TypeCheck((op), &PyComplex_Type)
+#define PyComplex_CheckExact(op) Py_IS_TYPE((op), &PyComplex_Type)
+
+PyAPI_FUNC(PyObject *) PyComplex_FromDoubles(double real, double imag);
+
+PyAPI_FUNC(double) PyComplex_RealAsDouble(PyObject *op);
+PyAPI_FUNC(double) PyComplex_ImagAsDouble(PyObject *op);
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_COMPLEXOBJECT_H
+#  include "cpython/complexobject.h"
+#  undef Py_CPYTHON_COMPLEXOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_COMPLEXOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/abstract.h b/nanvix-port/cpython-headers/python3.12/cpython/abstract.h
new file mode 100644
index 000000000000..3b27aab2fc47
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/abstract.h
@@ -0,0 +1,206 @@
+#ifndef Py_CPYTHON_ABSTRACTOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+/* === Object Protocol ================================================== */
+
+#ifdef PY_SSIZE_T_CLEAN
+#  define _PyObject_CallMethodId _PyObject_CallMethodId_SizeT
+#endif
+
+/* Convert keyword arguments from the FASTCALL (stack: C array, kwnames: tuple)
+   format to a Python dictionary ("kwargs" dict).
+
+   The type of kwnames keys is not checked. The final function getting
+   arguments is responsible to check if all keys are strings, for example using
+   PyArg_ParseTupleAndKeywords() or PyArg_ValidateKeywordArguments().
+
+   Duplicate keys are merged using the last value. If duplicate keys must raise
+   an exception, the caller is responsible for implementing an explicit check
+   of the keys in kwnames. */
+PyAPI_FUNC(PyObject *) _PyStack_AsDict(
+    PyObject *const *values,
+    PyObject *kwnames);
+
+/* Suggested size (number of positional arguments) for arrays of PyObject*
+   allocated on the C stack to avoid allocating memory on the heap. Such an
+   array is used to pass positional arguments to call functions of the
+   PyObject_Vectorcall() family.
+
+   The size is chosen to not abuse the C stack and so limit the risk of stack
+   overflow. The size is also chosen to allow using the small stack for most
+   function calls of the Python standard library. On 64-bit CPU, it allocates
+   40 bytes on the stack. */
+#define _PY_FASTCALL_SMALL_STACK 5
+
+PyAPI_FUNC(PyObject *) _Py_CheckFunctionResult(
+    PyThreadState *tstate,
+    PyObject *callable,
+    PyObject *result,
+    const char *where);
+
+/* === Vectorcall protocol (PEP 590) ============================= */
+
+/* Call callable using tp_call. Arguments are like PyObject_Vectorcall()
+   or PyObject_FastCallDict() (both forms are supported),
+   except that nargs is plainly the number of arguments without flags. */
+PyAPI_FUNC(PyObject *) _PyObject_MakeTpCall(
+    PyThreadState *tstate,
+    PyObject *callable,
+    PyObject *const *args, Py_ssize_t nargs,
+    PyObject *keywords);
+
+// PyVectorcall_NARGS() is exported as a function for the stable ABI.
+// Here (when we are not using the stable ABI), the name is overridden to
+// call a static inline function for best performance.
+#define PyVectorcall_NARGS(n) _PyVectorcall_NARGS(n)
+static inline Py_ssize_t
+_PyVectorcall_NARGS(size_t n)
+{
+    return n & ~PY_VECTORCALL_ARGUMENTS_OFFSET;  /* strip the offset flag */
+}
+
+PyAPI_FUNC(vectorcallfunc) PyVectorcall_Function(PyObject *callable);
+
+// Backwards compatibility aliases for API that was provisional in Python 3.8
+#define _PyObject_Vectorcall PyObject_Vectorcall
+#define _PyObject_VectorcallMethod PyObject_VectorcallMethod
+#define _PyObject_FastCallDict PyObject_VectorcallDict
+#define _PyVectorcall_Function PyVectorcall_Function
+#define _PyObject_CallOneArg PyObject_CallOneArg
+#define _PyObject_CallMethodNoArgs PyObject_CallMethodNoArgs
+#define _PyObject_CallMethodOneArg PyObject_CallMethodOneArg
+
+/* Same as PyObject_Vectorcall except that keyword arguments are passed as
+   dict, which may be NULL if there are no keyword arguments. */
+PyAPI_FUNC(PyObject *) PyObject_VectorcallDict(
+    PyObject *callable,
+    PyObject *const *args,
+    size_t nargsf,
+    PyObject *kwargs);
+
+// Same as PyObject_Vectorcall(), except without keyword arguments
+PyAPI_FUNC(PyObject *) _PyObject_FastCall(
+    PyObject *func,
+    PyObject *const *args,
+    Py_ssize_t nargs);
+
+PyAPI_FUNC(PyObject *) PyObject_CallOneArg(PyObject *func, PyObject *arg);
+
+static inline PyObject *
+PyObject_CallMethodNoArgs(PyObject *self, PyObject *name)
+{
+    return PyObject_VectorcallMethod(  /* self is the only argument */
+        name, &self, 1 | PY_VECTORCALL_ARGUMENTS_OFFSET, _Py_NULL);
+}
+
+static inline PyObject *
+PyObject_CallMethodOneArg(PyObject *self, PyObject *name, PyObject *arg)
+{
+    PyObject *stack[2] = {self, arg};  /* self first, then the argument */
+    assert(arg != NULL);
+    return PyObject_VectorcallMethod(
+        name, stack, 2 | PY_VECTORCALL_ARGUMENTS_OFFSET, _Py_NULL);
+}
+
+PyAPI_FUNC(PyObject *) _PyObject_CallMethod(PyObject *obj,
+                                            PyObject *name,
+                                            const char *format, ...);
+
+/* Like PyObject_CallMethod(), but expect a _Py_Identifier*
+   as the method name. */
+PyAPI_FUNC(PyObject *) _PyObject_CallMethodId(PyObject *obj,
+                                              _Py_Identifier *name,
+                                              const char *format, ...);
+
+PyAPI_FUNC(PyObject *) _PyObject_CallMethodId_SizeT(PyObject *obj,
+                                                    _Py_Identifier *name,
+                                                    const char *format,
+                                                    ...);
+
+PyAPI_FUNC(PyObject *) _PyObject_CallMethodIdObjArgs(
+    PyObject *obj,
+    _Py_Identifier *name,
+    ...);
+
+static inline PyObject *
+_PyObject_VectorcallMethodId(
+    _Py_Identifier *name, PyObject *const *args,
+    size_t nargsf, PyObject *kwnames)
+{
+    PyObject *method_name = _PyUnicode_FromId(name);  /* borrowed */
+    if (method_name != _Py_NULL) {
+        return PyObject_VectorcallMethod(method_name, args, nargsf, kwnames);
+    }
+    return _Py_NULL;  /* identifier lookup failed */
+}
+
+static inline PyObject *
+_PyObject_CallMethodIdNoArgs(PyObject *self, _Py_Identifier *name)
+{
+    return _PyObject_VectorcallMethodId(  /* self is the only argument */
+        name, &self, 1 | PY_VECTORCALL_ARGUMENTS_OFFSET, _Py_NULL);
+}
+
+static inline PyObject *
+_PyObject_CallMethodIdOneArg(PyObject *self, _Py_Identifier *name, PyObject *arg)
+{
+    PyObject *stack[2] = {self, arg};  /* self first, then the argument */
+    assert(arg != NULL);
+    return _PyObject_VectorcallMethodId(
+        name, stack, 2 | PY_VECTORCALL_ARGUMENTS_OFFSET, _Py_NULL);
+}
+
+PyAPI_FUNC(int) _PyObject_HasLen(PyObject *o);
+
+/* Guess the size of object 'o' using len(o) or o.__length_hint__().
+   If neither of those return a non-negative value, then return the default
+   value.  If one of the calls fails, this function returns -1. */
+PyAPI_FUNC(Py_ssize_t) PyObject_LengthHint(PyObject *o, Py_ssize_t);
+
+/* === Sequence protocol ================================================ */
+
+/* Assume tp_as_sequence and sq_item exist and that 'i' does not
+   need to be corrected for a negative index. */
+#define PySequence_ITEM(o, i)\
+    ( Py_TYPE(o)->tp_as_sequence->sq_item((o), (i)) )
+
+#define PY_ITERSEARCH_COUNT    1
+#define PY_ITERSEARCH_INDEX    2
+#define PY_ITERSEARCH_CONTAINS 3
+
+/* Iterate over seq.
+
+   Result depends on the operation:
+
+   PY_ITERSEARCH_COUNT:  return # of times obj appears in seq; -1 if
+     error.
+   PY_ITERSEARCH_INDEX:  return 0-based index of first occurrence of
+     obj in seq; set ValueError and return -1 if none found;
+     also return -1 on error.
+   PY_ITERSEARCH_CONTAINS:  return 1 if obj in seq, else 0; -1 on
+     error. */
+PyAPI_FUNC(Py_ssize_t) _PySequence_IterSearch(PyObject *seq,
+                                              PyObject *obj, int operation);
+
+/* === Mapping protocol ================================================= */
+
+PyAPI_FUNC(int) _PyObject_RealIsInstance(PyObject *inst, PyObject *cls);
+
+PyAPI_FUNC(int) _PyObject_RealIsSubclass(PyObject *derived, PyObject *cls);
+
+PyAPI_FUNC(char *const *) _PySequence_BytesToCharpArray(PyObject* self);
+
+PyAPI_FUNC(void) _Py_FreeCharPArray(char *const array[]);
+
+/* For internal use by buffer API functions */
+PyAPI_FUNC(void) _Py_add_one_to_index_F(int nd, Py_ssize_t *index,
+                                        const Py_ssize_t *shape);
+PyAPI_FUNC(void) _Py_add_one_to_index_C(int nd, Py_ssize_t *index,
+                                        const Py_ssize_t *shape);
+
+/* Convert Python int to Py_ssize_t. Do nothing if the argument is None. */
+PyAPI_FUNC(int) _Py_convert_optional_to_ssize_t(PyObject *, void *);
+
+/* Same as PyNumber_Index but can return an instance of a subclass of int. */
+PyAPI_FUNC(PyObject *) _PyNumber_Index(PyObject *o);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/bytearrayobject.h b/nanvix-port/cpython-headers/python3.12/cpython/bytearrayobject.h
new file mode 100644
index 000000000000..9ba176eb2d3a
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/bytearrayobject.h
@@ -0,0 +1,34 @@
+#ifndef Py_CPYTHON_BYTEARRAYOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+/* Object layout */
+typedef struct {
+    PyObject_VAR_HEAD
+    Py_ssize_t ob_alloc;   /* How many bytes allocated in ob_bytes */
+    char *ob_bytes;        /* Physical backing buffer */
+    char *ob_start;        /* Logical start inside ob_bytes */
+    Py_ssize_t ob_exports; /* How many buffer exports */
+} PyByteArrayObject;
+
+PyAPI_DATA(char) _PyByteArray_empty_string[];
+
+/* Macros and static inline functions, trading safety for speed */
+#define _PyByteArray_CAST(op) \
+    (assert(PyByteArray_Check(op)), _Py_CAST(PyByteArrayObject*, op))
+
+static inline char* PyByteArray_AS_STRING(PyObject *op)
+{
+    /* A zero-size bytearray yields the shared empty-string buffer;
+       otherwise the logical start of the object's own buffer is returned. */
+    PyByteArrayObject *ba = _PyByteArray_CAST(op);
+    return (Py_SIZE(ba) != 0) ? ba->ob_start
+                              : _PyByteArray_empty_string;
+}
+#define PyByteArray_AS_STRING(self) PyByteArray_AS_STRING(_PyObject_CAST(self))
+
+static inline Py_ssize_t PyByteArray_GET_SIZE(PyObject *op) {
+    /* The cast macro asserts that op is a bytearray. */
+    return Py_SIZE(_PyByteArray_CAST(op));
+}
+#define PyByteArray_GET_SIZE(self) PyByteArray_GET_SIZE(_PyObject_CAST(self))
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/bytesobject.h b/nanvix-port/cpython-headers/python3.12/cpython/bytesobject.h
new file mode 100644
index 000000000000..e982031c107d
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/bytesobject.h
@@ -0,0 +1,129 @@
+#ifndef Py_CPYTHON_BYTESOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+typedef struct {
+    PyObject_VAR_HEAD
+    Py_DEPRECATED(3.11) Py_hash_t ob_shash;
+    char ob_sval[1];
+
+    /* Invariants:
+     *     ob_sval contains space for 'ob_size+1' elements.
+     *     ob_sval[ob_size] == 0.
+     *     ob_shash is the hash of the byte string or -1 if not computed yet.
+     */
+} PyBytesObject;
+
+PyAPI_FUNC(int) _PyBytes_Resize(PyObject **, Py_ssize_t);
+PyAPI_FUNC(PyObject*) _PyBytes_FormatEx(
+    const char *format,
+    Py_ssize_t format_len,
+    PyObject *args,
+    int use_bytearray);
+PyAPI_FUNC(PyObject*) _PyBytes_FromHex(
+    PyObject *string,
+    int use_bytearray);
+
+/* Helper for PyBytes_DecodeEscape that detects invalid escape chars. */
+PyAPI_FUNC(PyObject *) _PyBytes_DecodeEscape(const char *, Py_ssize_t,
+                                             const char *, const char **);
+
+/* Macros and static inline functions, trading safety for speed */
+#define _PyBytes_CAST(op) \
+    (assert(PyBytes_Check(op)), _Py_CAST(PyBytesObject*, op))
+
+static inline char* PyBytes_AS_STRING(PyObject *op) {
+    PyBytesObject *bytes = _PyBytes_CAST(op);  /* asserts PyBytes_Check(op) */
+    return bytes->ob_sval;
+}
+#define PyBytes_AS_STRING(op) PyBytes_AS_STRING(_PyObject_CAST(op))
+
+static inline Py_ssize_t PyBytes_GET_SIZE(PyObject *op) {
+    /* The cast macro asserts that op is a bytes object. */
+    return Py_SIZE(_PyBytes_CAST(op));
+}
+#define PyBytes_GET_SIZE(self) PyBytes_GET_SIZE(_PyObject_CAST(self))
+
+/* _PyBytes_Join(sep, x) is like sep.join(x).  sep must be PyBytesObject*,
+   x must be an iterable object. */
+PyAPI_FUNC(PyObject *) _PyBytes_Join(PyObject *sep, PyObject *x);
+
+
+/* The _PyBytesWriter structure is big: it contains an embedded "stack buffer".
+   A _PyBytesWriter variable must be declared at the end of variables in a
+   function to optimize the memory allocation on the stack. */
+typedef struct {
+    /* bytes, bytearray or NULL (when the small buffer is used) */
+    PyObject *buffer;
+
+    /* Number of allocated bytes. */
+    Py_ssize_t allocated;
+
+    /* Minimum number of allocated bytes,
+       incremented by _PyBytesWriter_Prepare() */
+    Py_ssize_t min_size;
+
+    /* If non-zero, use a bytearray instead of a bytes object for buffer. */
+    int use_bytearray;
+
+    /* If non-zero, overallocate the buffer (default: 0).
+       This flag must be zero if use_bytearray is non-zero. */
+    int overallocate;
+
+    /* Stack buffer */
+    int use_small_buffer;
+    char small_buffer[512];
+} _PyBytesWriter;
+
+/* Initialize a bytes writer
+
+   By default, the overallocation is disabled. Set the overallocate attribute
+   to control the allocation of the buffer. */
+PyAPI_FUNC(void) _PyBytesWriter_Init(_PyBytesWriter *writer);
+
+/* Get the buffer content and reset the writer.
+   Return a bytes object, or a bytearray object if use_bytearray is non-zero.
+   Raise an exception and return NULL on error. */
+PyAPI_FUNC(PyObject *) _PyBytesWriter_Finish(_PyBytesWriter *writer,
+    void *str);
+
+/* Deallocate memory of a writer (clear its internal buffer). */
+PyAPI_FUNC(void) _PyBytesWriter_Dealloc(_PyBytesWriter *writer);
+
+/* Allocate the buffer to write size bytes.
+   Return the pointer to the beginning of buffer data.
+   Raise an exception and return NULL on error. */
+PyAPI_FUNC(void*) _PyBytesWriter_Alloc(_PyBytesWriter *writer,
+    Py_ssize_t size);
+
+/* Ensure that the buffer is large enough to write *size* bytes.
+   Add size to the writer minimum size (min_size attribute).
+
+   str is the current pointer inside the buffer.
+   Return the updated current pointer inside the buffer.
+   Raise an exception and return NULL on error. */
+PyAPI_FUNC(void*) _PyBytesWriter_Prepare(_PyBytesWriter *writer,
+    void *str,
+    Py_ssize_t size);
+
+/* Resize the buffer to make it larger.
+   The new buffer may be larger than size bytes because of overallocation.
+   Return the updated current pointer inside the buffer.
+   Raise an exception and return NULL on error.
+
+   Note: size must be greater than the number of allocated bytes in the writer.
+
+   This function doesn't use the writer minimum size (min_size attribute).
+
+   See also _PyBytesWriter_Prepare().
+   */
+PyAPI_FUNC(void*) _PyBytesWriter_Resize(_PyBytesWriter *writer,
+    void *str,
+    Py_ssize_t size);
+
+/* Write bytes.
+   Raise an exception and return NULL on error. */
+PyAPI_FUNC(void*) _PyBytesWriter_WriteBytes(_PyBytesWriter *writer,
+    void *str,
+    const void *bytes,
+    Py_ssize_t size);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/cellobject.h b/nanvix-port/cpython-headers/python3.12/cpython/cellobject.h
new file mode 100644
index 000000000000..47a6a491497e
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/cellobject.h
@@ -0,0 +1,44 @@
+/* Cell object interface */
+
+#ifndef Py_LIMITED_API
+#ifndef Py_CELLOBJECT_H
+#define Py_CELLOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    PyObject_HEAD
+    /* Content of the cell or NULL when empty */
+    PyObject *ob_ref;
+} PyCellObject;
+
+PyAPI_DATA(PyTypeObject) PyCell_Type;
+
+#define PyCell_Check(op) Py_IS_TYPE((op), &PyCell_Type)
+
+PyAPI_FUNC(PyObject *) PyCell_New(PyObject *);
+PyAPI_FUNC(PyObject *) PyCell_Get(PyObject *);
+PyAPI_FUNC(int) PyCell_Set(PyObject *, PyObject *);
+
+static inline PyObject* PyCell_GET(PyObject *op) {
+    /* Unchecked accessor: the type is only verified by the assertion.
+       The result is NULL when the cell is empty. */
+    assert(PyCell_Check(op));
+    return _Py_CAST(PyCellObject*, op)->ob_ref;
+}
+#define PyCell_GET(op) PyCell_GET(_PyObject_CAST(op))
+
+static inline void PyCell_SET(PyObject *op, PyObject *value) {
+    /* Plain pointer store: the type is only verified by the assertion,
+       and no reference counts are adjusted here. */
+    assert(PyCell_Check(op));
+    _Py_CAST(PyCellObject*, op)->ob_ref = value;
+}
+#define PyCell_SET(op, value) PyCell_SET(_PyObject_CAST(op), (value))
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_CELLOBJECT_H */
+#endif /* !Py_LIMITED_API */
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/ceval.h b/nanvix-port/cpython-headers/python3.12/cpython/ceval.h
new file mode 100644
index 000000000000..a9616bd6a4f5
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/ceval.h
@@ -0,0 +1,35 @@
+#ifndef Py_CPYTHON_CEVAL_H
+#  error "this header file must not be included directly"
+#endif
+
+PyAPI_FUNC(void) PyEval_SetProfile(Py_tracefunc, PyObject *);
+PyAPI_FUNC(void) PyEval_SetProfileAllThreads(Py_tracefunc, PyObject *);
+PyAPI_FUNC(int) _PyEval_SetProfile(PyThreadState *tstate, Py_tracefunc func, PyObject *arg);
+PyAPI_FUNC(void) PyEval_SetTrace(Py_tracefunc, PyObject *);
+PyAPI_FUNC(void) PyEval_SetTraceAllThreads(Py_tracefunc, PyObject *);
+PyAPI_FUNC(int) _PyEval_SetTrace(PyThreadState *tstate, Py_tracefunc func, PyObject *arg);
+
+/* Helper to look up a builtin object */
+PyAPI_FUNC(PyObject *) _PyEval_GetBuiltin(PyObject *);
+PyAPI_FUNC(PyObject *) _PyEval_GetBuiltinId(_Py_Identifier *);
+/* Look at the current frame's (if any) code's co_flags, and turn on
+   the corresponding compiler flags in cf->cf_flags.  Return 1 if any
+   flag was set, else return 0. */
+PyAPI_FUNC(int) PyEval_MergeCompilerFlags(PyCompilerFlags *cf);
+
+PyAPI_FUNC(PyObject *) _PyEval_EvalFrameDefault(PyThreadState *tstate, struct _PyInterpreterFrame *f, int exc);
+
+PyAPI_FUNC(void) _PyEval_SetSwitchInterval(unsigned long microseconds);
+PyAPI_FUNC(unsigned long) _PyEval_GetSwitchInterval(void);
+
+PyAPI_FUNC(int) _PyEval_MakePendingCalls(PyThreadState *);
+
+PyAPI_FUNC(Py_ssize_t) PyUnstable_Eval_RequestCodeExtraIndex(freefunc);
+// Old name -- remove when this API changes:
+_Py_DEPRECATED_EXTERNALLY(3.12) static inline Py_ssize_t
+_PyEval_RequestCodeExtraIndex(freefunc f) {
+    return PyUnstable_Eval_RequestCodeExtraIndex(f);  /* forward to the new name */
+}
+
+PyAPI_FUNC(int) _PyEval_SliceIndex(PyObject *, Py_ssize_t *);
+PyAPI_FUNC(int) _PyEval_SliceIndexNotNone(PyObject *, Py_ssize_t *);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/classobject.h b/nanvix-port/cpython-headers/python3.12/cpython/classobject.h
new file mode 100644
index 000000000000..d7c9ddd1336c
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/classobject.h
@@ -0,0 +1,71 @@
+/* Former class object interface -- now only bound methods are here  */
+
+/* Revealing some structures (not for general use) */
+
+#ifndef Py_LIMITED_API
+#ifndef Py_CLASSOBJECT_H
+#define Py_CLASSOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    PyObject_HEAD
+    PyObject *im_func;   /* The callable object implementing the method */
+    PyObject *im_self;   /* The instance it is bound to */
+    PyObject *im_weakreflist; /* List of weak references */
+    vectorcallfunc vectorcall;
+} PyMethodObject;
+
+PyAPI_DATA(PyTypeObject) PyMethod_Type;
+
+#define PyMethod_Check(op) Py_IS_TYPE((op), &PyMethod_Type)
+
+PyAPI_FUNC(PyObject *) PyMethod_New(PyObject *, PyObject *);
+
+PyAPI_FUNC(PyObject *) PyMethod_Function(PyObject *);
+PyAPI_FUNC(PyObject *) PyMethod_Self(PyObject *);
+
+#define _PyMethod_CAST(meth) \
+    (assert(PyMethod_Check(meth)), _Py_CAST(PyMethodObject*, meth))
+
+/* Static inline functions for direct access to these values.
+   Type checks are *not* done, so use with care. */
+static inline PyObject* PyMethod_GET_FUNCTION(PyObject *meth) {
+    return _PyMethod_CAST(meth)->im_func;  /* callable implementing the method */
+}
+#define PyMethod_GET_FUNCTION(meth) PyMethod_GET_FUNCTION(_PyObject_CAST(meth))
+
+static inline PyObject* PyMethod_GET_SELF(PyObject *meth) {
+    return _PyMethod_CAST(meth)->im_self;  /* instance the method is bound to */
+}
+#define PyMethod_GET_SELF(meth) PyMethod_GET_SELF(_PyObject_CAST(meth))
+
+typedef struct {
+    PyObject_HEAD
+    PyObject *func;
+} PyInstanceMethodObject;
+
+PyAPI_DATA(PyTypeObject) PyInstanceMethod_Type;
+
+#define PyInstanceMethod_Check(op) Py_IS_TYPE((op), &PyInstanceMethod_Type)
+
+PyAPI_FUNC(PyObject *) PyInstanceMethod_New(PyObject *);
+PyAPI_FUNC(PyObject *) PyInstanceMethod_Function(PyObject *);
+
+#define _PyInstanceMethod_CAST(meth) \
+    (assert(PyInstanceMethod_Check(meth)), \
+     _Py_CAST(PyInstanceMethodObject*, meth))
+
+/* Static inline function for direct access to these values.
+   Type checks are *not* done, so use with care. */
+static inline PyObject* PyInstanceMethod_GET_FUNCTION(PyObject *meth) {
+    return _PyInstanceMethod_CAST(meth)->func;  /* stored pointer, refcount untouched */
+}
+#define PyInstanceMethod_GET_FUNCTION(meth) PyInstanceMethod_GET_FUNCTION(_PyObject_CAST(meth))
+
+#ifdef __cplusplus
+}
+#endif
+#endif   // !Py_CLASSOBJECT_H
+#endif   // !Py_LIMITED_API
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/code.h b/nanvix-port/cpython-headers/python3.12/cpython/code.h
new file mode 100644
index 000000000000..311cffec113f
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/code.h
@@ -0,0 +1,389 @@
+/* Definitions for bytecode */
+
+#ifndef Py_LIMITED_API
+#ifndef Py_CODE_H
+#define Py_CODE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Count of all local monitoring events */
+#define  _PY_MONITORING_LOCAL_EVENTS 10
+/* Count of all "real" monitoring events (not derived from other events) */
+#define _PY_MONITORING_UNGROUPED_EVENTS 15
+/* Count of all monitoring events */
+#define _PY_MONITORING_EVENTS 17
+
+/* Tables of which tools are active for each monitored event. */
+/* For 3.12 ABI compatibility this is oversized */
+typedef struct _Py_LocalMonitors {
+    /* Only _PY_MONITORING_LOCAL_EVENTS of these are used */
+    uint8_t tools[_PY_MONITORING_UNGROUPED_EVENTS];
+} _Py_LocalMonitors;
+
+typedef struct _Py_GlobalMonitors {
+    uint8_t tools[_PY_MONITORING_UNGROUPED_EVENTS];
+} _Py_GlobalMonitors;
+
+/* Each instruction in a code object is a fixed-width value,
+ * currently 2 bytes: 1-byte opcode + 1-byte oparg.  The EXTENDED_ARG
+ * opcode allows for larger values but the current limit is 3 uses
+ * of EXTENDED_ARG (see Python/compile.c), for a maximum
+ * 32-bit value.  This aligns with the note in Python/compile.c
+ * (compiler_addop_i_line) indicating that the max oparg value is
+ * 2**32 - 1, rather than INT_MAX.
+ */
+
+typedef union {
+    uint16_t cache;          /* the whole 2-byte unit viewed as a single value */
+    struct {
+        uint8_t code;        /* 1-byte opcode */
+        uint8_t arg;         /* 1-byte oparg */
+    } op;
+} _Py_CODEUNIT;
+
+
+/* These macros only remain defined for compatibility. */
+#define _Py_OPCODE(word) ((word).op.code)
+#define _Py_OPARG(word) ((word).op.arg)
+
+static inline _Py_CODEUNIT
+_py_make_codeunit(uint8_t opcode, uint8_t oparg)
+{
+    // Fields are assigned one by one: no designated initialisers because of C++ compat
+    _Py_CODEUNIT word;
+    word.op.code = opcode;
+    word.op.arg = oparg;
+    return word;
+}
+
+static inline void
+_py_set_opcode(_Py_CODEUNIT *word, uint8_t opcode)
+{
+    word->op.code = opcode;  // only the opcode byte changes; op.arg is left as is
+}
+
+#define _Py_MAKE_CODEUNIT(opcode, oparg) _py_make_codeunit((opcode), (oparg))
+#define _Py_SET_OPCODE(word, opcode) _py_set_opcode(&(word), (opcode))
+
+
+typedef struct {
+    PyObject *_co_code;
+    PyObject *_co_varnames;
+    PyObject *_co_cellvars;
+    PyObject *_co_freevars;
+} _PyCoCached;
+
+/* Ancillary data structure used for instrumentation.
+   Line instrumentation creates an array of
+   these. One entry per code unit.*/
+typedef struct {
+    uint8_t original_opcode;
+    int8_t line_delta;
+} _PyCoLineInstrumentationData;
+
+/* Main data structure used for instrumentation.
+ * This is allocated when needed for instrumentation
+ */
+typedef struct {
+    /* Monitoring specific to this code object */
+    _Py_LocalMonitors local_monitors;
+    /* Monitoring that is active on this code object */
+    _Py_LocalMonitors active_monitors;
+    /* The tools that are to be notified for events for the matching code unit */
+    uint8_t *tools;
+    /* Information to support line events */
+    _PyCoLineInstrumentationData *lines;
+    /* The tools that are to be notified for line events for the matching code unit */
+    uint8_t *line_tools;
+    /* Information to support instruction events */
+    /* The underlying instructions, which can themselves be instrumented */
+    uint8_t *per_instruction_opcodes;
+    /* The tools that are to be notified for instruction events for the matching code unit */
+    uint8_t *per_instruction_tools;
+} _PyCoMonitoringData;
+
+// To avoid repeating ourselves in deepfreeze.py, all PyCodeObject members are
+// defined in this macro:
+#define _PyCode_DEF(SIZE) {                                                    \
+    PyObject_VAR_HEAD                                                          \
+                                                                               \
+    /* Note only the following fields are used in hash and/or comparisons      \
+     *                                                                         \
+     * - co_name                                                               \
+     * - co_argcount                                                           \
+     * - co_posonlyargcount                                                    \
+     * - co_kwonlyargcount                                                     \
+     * - co_nlocals                                                            \
+     * - co_stacksize                                                          \
+     * - co_flags                                                              \
+     * - co_firstlineno                                                        \
+     * - co_consts                                                             \
+     * - co_names                                                              \
+     * - co_localsplusnames                                                    \
+     * This is done to preserve the name and line number for tracebacks        \
+     * and debuggers; otherwise, constant de-duplication would collapse        \
+     * identical functions/lambdas defined on different lines.                 \
+     */                                                                        \
+                                                                               \
+    /* These fields are set with provided values on new code objects. */       \
+                                                                               \
+    /* The hottest fields (in the eval loop) are grouped here at the top. */   \
+    PyObject *co_consts;           /* list (constants used) */                 \
+    PyObject *co_names;            /* list of strings (names used) */          \
+    PyObject *co_exceptiontable;   /* Byte string encoding exception handling  \
+                                      table */                                 \
+    int co_flags;                  /* CO_..., see below */                     \
+                                                                               \
+    /* The rest are not so impactful on performance. */                        \
+    int co_argcount;              /* #arguments, except *args */               \
+    int co_posonlyargcount;       /* #positional only arguments */             \
+    int co_kwonlyargcount;        /* #keyword only arguments */                \
+    int co_stacksize;             /* #entries needed for evaluation stack */   \
+    int co_firstlineno;           /* first source line number */               \
+                                                                               \
+    /* redundant values (derived from co_localsplusnames and                   \
+       co_localspluskinds) */                                                  \
+    int co_nlocalsplus;           /* number of local + cell + free variables */ \
+    int co_framesize;             /* Size of frame in words */                 \
+    int co_nlocals;               /* number of local variables */              \
+    int co_ncellvars;             /* total number of cell variables */         \
+    int co_nfreevars;             /* number of free variables */               \
+    uint32_t co_version;          /* version number */                         \
+                                                                               \
+    PyObject *co_localsplusnames; /* tuple mapping offsets to names */         \
+    PyObject *co_localspluskinds; /* Bytes mapping to local kinds (one byte    \
+                                     per variable) */                          \
+    PyObject *co_filename;        /* unicode (where it was loaded from) */     \
+    PyObject *co_name;            /* unicode (name, for reference) */          \
+    PyObject *co_qualname;        /* unicode (qualname, for reference) */      \
+    PyObject *co_linetable;       /* bytes object that holds location info */  \
+    PyObject *co_weakreflist;     /* to support weakrefs to code objects */    \
+    _PyCoCached *_co_cached;      /* cached co_* attributes */                 \
+    uint64_t _co_instrumentation_version; /* current instrumentation version */  \
+    _PyCoMonitoringData *_co_monitoring; /* Monitoring data */                 \
+    int _co_firsttraceable;       /* index of first traceable instruction */   \
+    /* Scratch space for extra data relating to the code object.               \
+       Type is a void* to keep the format private in codeobject.c to force     \
+       people to go through the proper APIs. */                                \
+    void *co_extra;                                                            \
+    char co_code_adaptive[(SIZE)];                                             \
+}
+
+/* Bytecode object */
+struct PyCodeObject _PyCode_DEF(1);
+
+/* Masks for co_flags above */
+#define CO_OPTIMIZED    0x0001
+#define CO_NEWLOCALS    0x0002
+#define CO_VARARGS      0x0004
+#define CO_VARKEYWORDS  0x0008
+#define CO_NESTED       0x0010
+#define CO_GENERATOR    0x0020
+
+/* The CO_COROUTINE flag is set for coroutine functions (defined with
+   ``async def`` keywords) */
+#define CO_COROUTINE            0x0080
+#define CO_ITERABLE_COROUTINE   0x0100
+#define CO_ASYNC_GENERATOR      0x0200
+
+/* bpo-39562: These constant values are changed in Python 3.9
+   to prevent collision with compiler flags. CO_FUTURE_ and PyCF_
+   constants must be kept unique. PyCF_ constants can use bits from
+   0x0100 to 0x10000. CO_FUTURE_ constants use bits starting at 0x20000. */
+#define CO_FUTURE_DIVISION      0x20000
+#define CO_FUTURE_ABSOLUTE_IMPORT 0x40000 /* do absolute imports by default */
+#define CO_FUTURE_WITH_STATEMENT  0x80000
+#define CO_FUTURE_PRINT_FUNCTION  0x100000
+#define CO_FUTURE_UNICODE_LITERALS 0x200000
+
+#define CO_FUTURE_BARRY_AS_BDFL  0x400000
+#define CO_FUTURE_GENERATOR_STOP  0x800000
+#define CO_FUTURE_ANNOTATIONS    0x1000000
+
+/* This should be defined if a future statement modifies the syntax.
+   For example, when a keyword is added.
+*/
+#define PY_PARSER_REQUIRES_FUTURE_KEYWORD
+
+#define CO_MAXBLOCKS 20 /* Max static block nesting within a function */
+
+PyAPI_DATA(PyTypeObject) PyCode_Type;
+
+#define PyCode_Check(op) Py_IS_TYPE((op), &PyCode_Type)
+
+static inline Py_ssize_t PyCode_GetNumFree(PyCodeObject *op) {
+    assert(PyCode_Check(op));
+    return op->co_nfreevars;  /* int field, widened to Py_ssize_t on return */
+}
+
+static inline int PyCode_GetFirstFree(PyCodeObject *op) {
+    assert(PyCode_Check(op));
+    return op->co_nlocalsplus - op->co_nfreevars;  /* local+cell+free count minus the free ones */
+}
+
+#define _PyCode_CODE(CO) _Py_RVALUE((_Py_CODEUNIT *)(CO)->co_code_adaptive)
+#define _PyCode_NBYTES(CO) (Py_SIZE(CO) * (Py_ssize_t)sizeof(_Py_CODEUNIT))
+
+/* Unstable public interface */
+PyAPI_FUNC(PyCodeObject *) PyUnstable_Code_New(
+        int, int, int, int, int, PyObject *, PyObject *,
+        PyObject *, PyObject *, PyObject *, PyObject *,
+        PyObject *, PyObject *, PyObject *, int, PyObject *,
+        PyObject *);
+
+PyAPI_FUNC(PyCodeObject *) PyUnstable_Code_NewWithPosOnlyArgs(
+        int, int, int, int, int, int, PyObject *, PyObject *,
+        PyObject *, PyObject *, PyObject *, PyObject *,
+        PyObject *, PyObject *, PyObject *, int, PyObject *,
+        PyObject *);
+        /* same as struct above */
+// Old names -- remove when this API changes:
+_Py_DEPRECATED_EXTERNALLY(3.12) static inline PyCodeObject *
+PyCode_New(
+        int a, int b, int c, int d, int e, PyObject *f, PyObject *g,
+        PyObject *h, PyObject *i, PyObject *j, PyObject *k,
+        PyObject *l, PyObject *m, PyObject *n, int o, PyObject *p,
+        PyObject *q)
+{
+    return PyUnstable_Code_New(
+        a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q);
+}
+_Py_DEPRECATED_EXTERNALLY(3.12) static inline PyCodeObject *
+PyCode_NewWithPosOnlyArgs(
+        int a, int poac, int b, int c, int d, int e, PyObject *f, PyObject *g,
+        PyObject *h, PyObject *i, PyObject *j, PyObject *k,
+        PyObject *l, PyObject *m, PyObject *n, int o, PyObject *p,
+        PyObject *q)
+{
+    return PyUnstable_Code_NewWithPosOnlyArgs(
+        a, poac, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q);
+}
+
+/* Creates a new empty code object with the specified source location. */
+PyAPI_FUNC(PyCodeObject *)
+PyCode_NewEmpty(const char *filename, const char *funcname, int firstlineno);
+
+/* Return the line number associated with the specified bytecode index
+   in this code object.  If you just need the line number of a frame,
+   use PyFrame_GetLineNumber() instead. */
+PyAPI_FUNC(int) PyCode_Addr2Line(PyCodeObject *, int);
+
+PyAPI_FUNC(int) PyCode_Addr2Location(PyCodeObject *, int, int *, int *, int *, int *);
+
+#define PY_FOREACH_CODE_EVENT(V) \
+    V(CREATE)                 \
+    V(DESTROY)
+
+typedef enum {
+    #define PY_DEF_EVENT(op) PY_CODE_EVENT_##op,
+    PY_FOREACH_CODE_EVENT(PY_DEF_EVENT)  /* -> PY_CODE_EVENT_CREATE, PY_CODE_EVENT_DESTROY, */
+    #undef PY_DEF_EVENT
+} PyCodeEvent;
+
+
+/*
+ * A callback that is invoked for different events in a code object's lifecycle.
+ *
+ * The callback is invoked with a borrowed reference to co, after it is
+ * created and before it is destroyed.
+ *
+ * If the callback sets an exception, it must return -1. Otherwise
+ * it should return 0.
+ */
+typedef int (*PyCode_WatchCallback)(
+  PyCodeEvent event,
+  PyCodeObject* co);
+
+/*
+ * Register a per-interpreter callback that will be invoked for code object
+ * lifecycle events.
+ *
+ * Returns a handle that may be passed to PyCode_ClearWatcher on success,
+ * or -1 and sets an error if no more handles are available.
+ */
+PyAPI_FUNC(int) PyCode_AddWatcher(PyCode_WatchCallback callback);
+
+/*
+ * Clear the watcher associated with the watcher_id handle.
+ *
+ * Returns 0 on success or -1 if no watcher exists for the provided id.
+ */
+PyAPI_FUNC(int) PyCode_ClearWatcher(int watcher_id);
+
+/* for internal use only */
+struct _opaque {
+    int computed_line;
+    const uint8_t *lo_next;
+    const uint8_t *limit;
+};
+
+typedef struct _line_offsets {
+    int ar_start;
+    int ar_end;
+    int ar_line;
+    struct _opaque opaque;
+} PyCodeAddressRange;
+
+/* Update *bounds to describe the first and one-past-the-last instructions in the
+   same line as lasti.  Return the number of that line.
+*/
+PyAPI_FUNC(int) _PyCode_CheckLineNumber(int lasti, PyCodeAddressRange *bounds);
+
+/* Create a comparable key used to compare constants taking into account the
+ * object type. It is used to make sure types are not coerced (e.g., float and
+ * complex) _and_ to distinguish 0.0 from -0.0 e.g. on IEEE platforms
+ *
+ * Return (type(obj), obj, ...): a tuple with variable size (at least 2 items)
+ * depending on the type and the value. The type is the first item to not
+ * compare bytes and str which can raise a BytesWarning exception. */
+PyAPI_FUNC(PyObject*) _PyCode_ConstantKey(PyObject *obj);
+
+PyAPI_FUNC(PyObject*) PyCode_Optimize(PyObject *code, PyObject* consts,
+                                      PyObject *names, PyObject *lnotab);
+
+PyAPI_FUNC(int) PyUnstable_Code_GetExtra(
+    PyObject *code, Py_ssize_t index, void **extra);
+PyAPI_FUNC(int) PyUnstable_Code_SetExtra(
+    PyObject *code, Py_ssize_t index, void *extra);
+// Old names -- remove when this API changes:
+_Py_DEPRECATED_EXTERNALLY(3.12) static inline int
+_PyCode_GetExtra(PyObject *code, Py_ssize_t index, void **extra)
+{
+    return PyUnstable_Code_GetExtra(code, index, extra);
+}
+_Py_DEPRECATED_EXTERNALLY(3.12) static inline int
+_PyCode_SetExtra(PyObject *code, Py_ssize_t index, void *extra)
+{
+    return PyUnstable_Code_SetExtra(code, index, extra);
+}
+
+/* Equivalent to getattr(code, 'co_code') in Python.
+   Returns a strong reference to a bytes object. */
+PyAPI_FUNC(PyObject *) PyCode_GetCode(PyCodeObject *code);
+/* Equivalent to getattr(code, 'co_varnames') in Python. */
+PyAPI_FUNC(PyObject *) PyCode_GetVarnames(PyCodeObject *code);
+/* Equivalent to getattr(code, 'co_cellvars') in Python. */
+PyAPI_FUNC(PyObject *) PyCode_GetCellvars(PyCodeObject *code);
+/* Equivalent to getattr(code, 'co_freevars') in Python. */
+PyAPI_FUNC(PyObject *) PyCode_GetFreevars(PyCodeObject *code);
+
+typedef enum _PyCodeLocationInfoKind {
+    /* short forms are 0 to 9 */
+    PY_CODE_LOCATION_INFO_SHORT0 = 0,
+    /* one-line forms are 10 to 12 */
+    PY_CODE_LOCATION_INFO_ONE_LINE0 = 10,
+    PY_CODE_LOCATION_INFO_ONE_LINE1 = 11,
+    PY_CODE_LOCATION_INFO_ONE_LINE2 = 12,
+
+    PY_CODE_LOCATION_INFO_NO_COLUMNS = 13,
+    PY_CODE_LOCATION_INFO_LONG = 14,
+    PY_CODE_LOCATION_INFO_NONE = 15
+} _PyCodeLocationInfoKind;
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // !Py_CODE_H
+#endif  // !Py_LIMITED_API
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/compile.h b/nanvix-port/cpython-headers/python3.12/cpython/compile.h
new file mode 100644
index 000000000000..f5a62a8ec6dd
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/compile.h
@@ -0,0 +1,69 @@
+#ifndef Py_CPYTHON_COMPILE_H
+#  error "this header file must not be included directly"
+#endif
+
+/* Public interface */
+#define PyCF_MASK (CO_FUTURE_DIVISION | CO_FUTURE_ABSOLUTE_IMPORT | \
+                   CO_FUTURE_WITH_STATEMENT | CO_FUTURE_PRINT_FUNCTION | \
+                   CO_FUTURE_UNICODE_LITERALS | CO_FUTURE_BARRY_AS_BDFL | \
+                   CO_FUTURE_GENERATOR_STOP | CO_FUTURE_ANNOTATIONS)
+#define PyCF_MASK_OBSOLETE (CO_NESTED)
+
+/* bpo-39562: CO_FUTURE_ and PyCF_ constants must be kept unique.
+   PyCF_ constants can use bits from 0x0100 to 0x10000.
+   CO_FUTURE_ constants use bits starting at 0x20000. */
+#define PyCF_SOURCE_IS_UTF8  0x0100
+#define PyCF_DONT_IMPLY_DEDENT 0x0200
+#define PyCF_ONLY_AST 0x0400
+#define PyCF_IGNORE_COOKIE 0x0800
+#define PyCF_TYPE_COMMENTS 0x1000
+#define PyCF_ALLOW_TOP_LEVEL_AWAIT 0x2000
+#define PyCF_ALLOW_INCOMPLETE_INPUT 0x4000
+#define PyCF_COMPILE_MASK (PyCF_ONLY_AST | PyCF_ALLOW_TOP_LEVEL_AWAIT | \
+                           PyCF_TYPE_COMMENTS | PyCF_DONT_IMPLY_DEDENT | \
+                           PyCF_ALLOW_INCOMPLETE_INPUT)
+
+typedef struct {
+    int cf_flags;  /* bitmask of CO_xxx flags relevant to future */
+    int cf_feature_version;  /* minor Python version (PyCF_ONLY_AST) */
+} PyCompilerFlags;
+
+#define _PyCompilerFlags_INIT \
+    (PyCompilerFlags){.cf_flags = 0, .cf_feature_version = PY_MINOR_VERSION}
+
+/* source location information */
+typedef struct {
+    int lineno;
+    int end_lineno;
+    int col_offset;
+    int end_col_offset;
+} _PyCompilerSrcLocation;
+
+#define SRC_LOCATION_FROM_AST(n) \
+    (_PyCompilerSrcLocation){ \
+               .lineno = (n)->lineno, \
+               .end_lineno = (n)->end_lineno, \
+               .col_offset = (n)->col_offset, \
+               .end_col_offset = (n)->end_col_offset }
+
+/* Future feature support */
+
+typedef struct {
+    int ff_features;                    /* flags set by future statements */
+    _PyCompilerSrcLocation ff_location; /* location of last future statement */
+} PyFutureFeatures;
+
+#define FUTURE_NESTED_SCOPES "nested_scopes"
+#define FUTURE_GENERATORS "generators"
+#define FUTURE_DIVISION "division"
+#define FUTURE_ABSOLUTE_IMPORT "absolute_import"
+#define FUTURE_WITH_STATEMENT "with_statement"
+#define FUTURE_PRINT_FUNCTION "print_function"
+#define FUTURE_UNICODE_LITERALS "unicode_literals"
+#define FUTURE_BARRY_AS_BDFL "barry_as_FLUFL"
+#define FUTURE_GENERATOR_STOP "generator_stop"
+#define FUTURE_ANNOTATIONS "annotations"
+
+#define PY_INVALID_STACK_EFFECT INT_MAX
+PyAPI_FUNC(int) PyCompile_OpcodeStackEffect(int opcode, int oparg);
+PyAPI_FUNC(int) PyCompile_OpcodeStackEffectWithJump(int opcode, int oparg, int jump);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/complexobject.h b/nanvix-port/cpython-headers/python3.12/cpython/complexobject.h
new file mode 100644
index 000000000000..b7d7283ae889
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/complexobject.h
@@ -0,0 +1,44 @@
+#ifndef Py_CPYTHON_COMPLEXOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+typedef struct {
+    double real;
+    double imag;
+} Py_complex;
+
+/* Operations on complex numbers from complexmodule.c */
+
+PyAPI_FUNC(Py_complex) _Py_c_sum(Py_complex, Py_complex);
+PyAPI_FUNC(Py_complex) _Py_c_diff(Py_complex, Py_complex);
+PyAPI_FUNC(Py_complex) _Py_c_neg(Py_complex);
+PyAPI_FUNC(Py_complex) _Py_c_prod(Py_complex, Py_complex);
+PyAPI_FUNC(Py_complex) _Py_c_quot(Py_complex, Py_complex);
+PyAPI_FUNC(Py_complex) _Py_c_pow(Py_complex, Py_complex);
+PyAPI_FUNC(double) _Py_c_abs(Py_complex);
+
+/* Complex object interface */
+
+/*
+PyComplexObject represents a complex number with double-precision
+real and imaginary parts.
+*/
+typedef struct {
+    PyObject_HEAD
+    Py_complex cval;
+} PyComplexObject;
+
+PyAPI_FUNC(PyObject *) PyComplex_FromCComplex(Py_complex);
+
+PyAPI_FUNC(Py_complex) PyComplex_AsCComplex(PyObject *op);
+
+#ifdef Py_BUILD_CORE
+/* Format the object based on the format_spec, as defined in PEP 3101
+   (Advanced String Formatting). */
+extern int _PyComplex_FormatAdvancedWriter(
+    _PyUnicodeWriter *writer,
+    PyObject *obj,
+    PyObject *format_spec,
+    Py_ssize_t start,
+    Py_ssize_t end);
+#endif  // Py_BUILD_CORE
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/context.h b/nanvix-port/cpython-headers/python3.12/cpython/context.h
new file mode 100644
index 000000000000..9879fc7192eb
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/context.h
@@ -0,0 +1,78 @@
+#ifndef Py_LIMITED_API
+#ifndef Py_CONTEXT_H
+#define Py_CONTEXT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_DATA(PyTypeObject) PyContext_Type;
+typedef struct _pycontextobject PyContext;
+
+PyAPI_DATA(PyTypeObject) PyContextVar_Type;
+typedef struct _pycontextvarobject PyContextVar;
+
+PyAPI_DATA(PyTypeObject) PyContextToken_Type;
+typedef struct _pycontexttokenobject PyContextToken;
+
+
+#define PyContext_CheckExact(o) Py_IS_TYPE((o), &PyContext_Type)
+#define PyContextVar_CheckExact(o) Py_IS_TYPE((o), &PyContextVar_Type)
+#define PyContextToken_CheckExact(o) Py_IS_TYPE((o), &PyContextToken_Type)
+
+
+PyAPI_FUNC(PyObject *) PyContext_New(void);
+PyAPI_FUNC(PyObject *) PyContext_Copy(PyObject *);
+PyAPI_FUNC(PyObject *) PyContext_CopyCurrent(void);
+
+PyAPI_FUNC(int) PyContext_Enter(PyObject *);
+PyAPI_FUNC(int) PyContext_Exit(PyObject *);
+
+
+/* Create a new context variable.
+
+   default_value can be NULL.
+*/
+PyAPI_FUNC(PyObject *) PyContextVar_New(
+    const char *name, PyObject *default_value);
+
+
+/* Get a value for the variable.
+
+   Returns -1 if an error occurred during lookup.
+
+   Returns 0 if value either was or was not found.
+
+   If value was found, *value will point to it.
+   If not, it will point to:
+
+   - default_value, if not NULL;
+   - the default value of "var", if not NULL;
+   - NULL.
+
+   '*value' will be a new ref, if not NULL.
+*/
+PyAPI_FUNC(int) PyContextVar_Get(
+    PyObject *var, PyObject *default_value, PyObject **value);
+
+
+/* Set a new value for the variable.
+   Returns NULL if an error occurs.
+*/
+PyAPI_FUNC(PyObject *) PyContextVar_Set(PyObject *var, PyObject *value);
+
+
+/* Reset a variable to its previous value.
+   Returns 0 on success, -1 on error.
+*/
+PyAPI_FUNC(int) PyContextVar_Reset(PyObject *var, PyObject *token);
+
+
+/* This method is exposed only for CPython tests. Do not use it. */
+PyAPI_FUNC(PyObject *) _PyContext_NewHamtForTests(void);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_CONTEXT_H */
+#endif /* !Py_LIMITED_API */
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/descrobject.h b/nanvix-port/cpython-headers/python3.12/cpython/descrobject.h
new file mode 100644
index 000000000000..e2ea1b9a2d30
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/descrobject.h
@@ -0,0 +1,64 @@
+#ifndef Py_CPYTHON_DESCROBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+typedef PyObject *(*wrapperfunc)(PyObject *self, PyObject *args,
+                                 void *wrapped);
+
+typedef PyObject *(*wrapperfunc_kwds)(PyObject *self, PyObject *args,
+                                      void *wrapped, PyObject *kwds);
+
+struct wrapperbase {
+    const char *name;
+    int offset;
+    void *function;
+    wrapperfunc wrapper;
+    const char *doc;
+    int flags;
+    PyObject *name_strobj;
+};
+
+/* Flags for above struct */
+#define PyWrapperFlag_KEYWORDS 1 /* wrapper function takes keyword args */
+
+/* Various kinds of descriptor objects */
+
+typedef struct {
+    PyObject_HEAD
+    PyTypeObject *d_type;
+    PyObject *d_name;
+    PyObject *d_qualname;
+} PyDescrObject;
+
+#define PyDescr_COMMON PyDescrObject d_common
+
+#define PyDescr_TYPE(x) (((PyDescrObject *)(x))->d_type)
+#define PyDescr_NAME(x) (((PyDescrObject *)(x))->d_name)
+
+typedef struct {
+    PyDescr_COMMON;
+    PyMethodDef *d_method;
+    vectorcallfunc vectorcall;
+} PyMethodDescrObject;
+
+typedef struct {
+    PyDescr_COMMON;
+    PyMemberDef *d_member;
+} PyMemberDescrObject;
+
+typedef struct {
+    PyDescr_COMMON;
+    PyGetSetDef *d_getset;
+} PyGetSetDescrObject;
+
+typedef struct {
+    PyDescr_COMMON;
+    struct wrapperbase *d_base;
+    void *d_wrapped; /* This can be any function pointer */
+} PyWrapperDescrObject;
+
+PyAPI_DATA(PyTypeObject) _PyMethodWrapper_Type;
+
+PyAPI_FUNC(PyObject *) PyDescr_NewWrapper(PyTypeObject *,
+                                                struct wrapperbase *, void *);
+PyAPI_FUNC(int) PyDescr_IsData(PyObject *);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/dictobject.h b/nanvix-port/cpython-headers/python3.12/cpython/dictobject.h
new file mode 100644
index 000000000000..ddada922020a
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/dictobject.h
@@ -0,0 +1,118 @@
+#ifndef Py_CPYTHON_DICTOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+typedef struct _dictkeysobject PyDictKeysObject;
+typedef struct _dictvalues PyDictValues;
+
+/* The ma_values pointer is NULL for a combined table
+ * or points to an array of PyObject* for a split table
+ */
+typedef struct {
+    PyObject_HEAD
+
+    /* Number of items in the dictionary */
+    Py_ssize_t ma_used;
+
+    /* Dictionary version: globally unique, the value changes each time
+       the dictionary is modified */
+#ifdef Py_BUILD_CORE
+    uint64_t ma_version_tag;
+#else
+    Py_DEPRECATED(3.12) uint64_t ma_version_tag;
+#endif
+
+    PyDictKeysObject *ma_keys;
+
+    /* If ma_values is NULL, the table is "combined": keys and values
+       are stored in ma_keys.
+
+       If ma_values is not NULL, the table is split:
+       keys are stored in ma_keys and values are stored in ma_values */
+    PyDictValues *ma_values;
+} PyDictObject;
+
+PyAPI_FUNC(PyObject *) _PyDict_GetItem_KnownHash(PyObject *mp, PyObject *key,
+                                       Py_hash_t hash);
+PyAPI_FUNC(PyObject *) _PyDict_GetItemWithError(PyObject *dp, PyObject *key);
+PyAPI_FUNC(PyObject *) _PyDict_GetItemIdWithError(PyObject *dp,
+                                                  _Py_Identifier *key);
+PyAPI_FUNC(PyObject *) _PyDict_GetItemStringWithError(PyObject *, const char *);
+PyAPI_FUNC(PyObject *) PyDict_SetDefault(
+    PyObject *mp, PyObject *key, PyObject *defaultobj);
+PyAPI_FUNC(int) _PyDict_SetItem_KnownHash(PyObject *mp, PyObject *key,
+                                          PyObject *item, Py_hash_t hash);
+PyAPI_FUNC(int) _PyDict_DelItem_KnownHash(PyObject *mp, PyObject *key,
+                                          Py_hash_t hash);
+PyAPI_FUNC(int) _PyDict_DelItemIf(PyObject *mp, PyObject *key,
+                                  int (*predicate)(PyObject *value));
+PyAPI_FUNC(int) _PyDict_Next(
+    PyObject *mp, Py_ssize_t *pos, PyObject **key, PyObject **value, Py_hash_t *hash);
+
+/* Get the number of items of a dictionary (ma_used); op is only assert()-checked. */
+static inline Py_ssize_t PyDict_GET_SIZE(PyObject *op) {
+    PyDictObject *mp;
+    assert(PyDict_Check(op));
+    mp = _Py_CAST(PyDictObject*, op);
+    return mp->ma_used;
+}
+#define PyDict_GET_SIZE(op) PyDict_GET_SIZE(_PyObject_CAST(op))
+
+PyAPI_FUNC(int) _PyDict_Contains_KnownHash(PyObject *, PyObject *, Py_hash_t);
+PyAPI_FUNC(int) _PyDict_ContainsId(PyObject *, _Py_Identifier *);
+PyAPI_FUNC(PyObject *) _PyDict_NewPresized(Py_ssize_t minused);
+PyAPI_FUNC(void) _PyDict_MaybeUntrack(PyObject *mp);
+PyAPI_FUNC(int) _PyDict_HasOnlyStringKeys(PyObject *mp);
+PyAPI_FUNC(Py_ssize_t) _PyDict_SizeOf(PyDictObject *);
+PyAPI_FUNC(PyObject *) _PyDict_Pop(PyObject *, PyObject *, PyObject *);
+#define _PyDict_HasSplitTable(d) ((d)->ma_values != NULL)
+
+/* Like PyDict_Merge, but override can be 0, 1 or 2.  If override is 0,
+   the first occurrence of a key wins, if override is 1, the last occurrence
+   of a key wins, if override is 2, a KeyError with conflicting key as
+   argument is raised.
+*/
+PyAPI_FUNC(int) _PyDict_MergeEx(PyObject *mp, PyObject *other, int override);
+PyAPI_FUNC(int) _PyDict_SetItemId(PyObject *dp, _Py_Identifier *key, PyObject *item);
+
+PyAPI_FUNC(int) _PyDict_DelItemId(PyObject *mp, _Py_Identifier *key);
+PyAPI_FUNC(void) _PyDict_DebugMallocStats(FILE *out);
+
+/* _PyDictView */
+
+typedef struct {
+    PyObject_HEAD
+    PyDictObject *dv_dict;
+} _PyDictViewObject;
+
+PyAPI_FUNC(PyObject *) _PyDictView_New(PyObject *, PyTypeObject *);
+PyAPI_FUNC(PyObject *) _PyDictView_Intersect(PyObject* self, PyObject *other);
+
+/* Dictionary watchers */
+
+#define PY_FOREACH_DICT_EVENT(V) \
+    V(ADDED)                     \
+    V(MODIFIED)                  \
+    V(DELETED)                   \
+    V(CLONED)                    \
+    V(CLEARED)                   \
+    V(DEALLOCATED)
+
+typedef enum {
+    #define PY_DEF_EVENT(EVENT) PyDict_EVENT_##EVENT,
+    PY_FOREACH_DICT_EVENT(PY_DEF_EVENT)  /* -> PyDict_EVENT_ADDED, ..., PyDict_EVENT_DEALLOCATED, */
+    #undef PY_DEF_EVENT
+} PyDict_WatchEvent;
+
+// Callback to be invoked when a watched dict is cleared, deallocated, or modified.
+// In clear/dealloc case, key and new_value will be NULL. Otherwise, new_value will be the
+// new value for key, NULL if key is being deleted.
+typedef int(*PyDict_WatchCallback)(PyDict_WatchEvent event, PyObject* dict, PyObject* key, PyObject* new_value);
+
+// Register/unregister a dict-watcher callback
+PyAPI_FUNC(int) PyDict_AddWatcher(PyDict_WatchCallback callback);
+PyAPI_FUNC(int) PyDict_ClearWatcher(int watcher_id);
+
+// Mark given dictionary as "watched" (callback will be called if it is modified)
+PyAPI_FUNC(int) PyDict_Watch(int watcher_id, PyObject* dict);
+PyAPI_FUNC(int) PyDict_Unwatch(int watcher_id, PyObject* dict);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/fileobject.h b/nanvix-port/cpython-headers/python3.12/cpython/fileobject.h
new file mode 100644
index 000000000000..b70ec318986d
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/fileobject.h
@@ -0,0 +1,19 @@
+#ifndef Py_CPYTHON_FILEOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+PyAPI_FUNC(char *) Py_UniversalNewlineFgets(char *, int, FILE*, PyObject *);
+PyAPI_FUNC(char *) _Py_UniversalNewlineFgetsWithSize(char *, int, FILE*, PyObject *, size_t*);
+
+/* The std printer acts as a preliminary sys.stderr until the new io
+   infrastructure is in place. */
+PyAPI_FUNC(PyObject *) PyFile_NewStdPrinter(int);
+PyAPI_DATA(PyTypeObject) PyStdPrinter_Type;
+
+typedef PyObject * (*Py_OpenCodeHookFunction)(PyObject *, void *);
+
+PyAPI_FUNC(PyObject *) PyFile_OpenCode(const char *utf8path);
+PyAPI_FUNC(PyObject *) PyFile_OpenCodeObject(PyObject *path);
+PyAPI_FUNC(int) PyFile_SetOpenCodeHook(Py_OpenCodeHookFunction hook, void *userData);
+
+PyAPI_FUNC(int) _PyLong_FileDescriptor_Converter(PyObject *, void *);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/fileutils.h b/nanvix-port/cpython-headers/python3.12/cpython/fileutils.h
new file mode 100644
index 000000000000..b386ad107bde
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/fileutils.h
@@ -0,0 +1,8 @@
+#ifndef Py_CPYTHON_FILEUTILS_H
+#  error "this header file must not be included directly"
+#endif
+
+// Used by _testcapi which must not use the internal C API
+PyAPI_FUNC(FILE*) _Py_fopen_obj(
+    PyObject *path,
+    const char *mode);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/floatobject.h b/nanvix-port/cpython-headers/python3.12/cpython/floatobject.h
new file mode 100644
index 000000000000..127093098bfe
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/floatobject.h
@@ -0,0 +1,27 @@
+#ifndef Py_CPYTHON_FLOATOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+typedef struct {
+    PyObject_HEAD
+    double ob_fval;
+} PyFloatObject;
+
+#define _PyFloat_CAST(op) \
+    (assert(PyFloat_Check(op)), _Py_CAST(PyFloatObject*, op))
+
+// Static inline version of PyFloat_AsDouble() trading safety for speed.
+// It only assert()s that op is a float object; there is no runtime type check.
+static inline double PyFloat_AS_DOUBLE(PyObject *op) {
+    return _PyFloat_CAST(op)->ob_fval;
+}
+#define PyFloat_AS_DOUBLE(op) PyFloat_AS_DOUBLE(_PyObject_CAST(op))
+
+
+PyAPI_FUNC(int) PyFloat_Pack2(double x, char *p, int le);
+PyAPI_FUNC(int) PyFloat_Pack4(double x, char *p, int le);
+PyAPI_FUNC(int) PyFloat_Pack8(double x, char *p, int le);
+
+PyAPI_FUNC(double) PyFloat_Unpack2(const char *p, int le);
+PyAPI_FUNC(double) PyFloat_Unpack4(const char *p, int le);
+PyAPI_FUNC(double) PyFloat_Unpack8(const char *p, int le);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/frameobject.h b/nanvix-port/cpython-headers/python3.12/cpython/frameobject.h
new file mode 100644
index 000000000000..4e19535c656f
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/frameobject.h
@@ -0,0 +1,29 @@
+/* Frame object interface */
+
+#ifndef Py_CPYTHON_FRAMEOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+/* Standard object interface */
+
+PyAPI_FUNC(PyFrameObject *) PyFrame_New(PyThreadState *, PyCodeObject *,
+                                        PyObject *, PyObject *);
+
+/* The rest of the interface is specific for frame objects */
+
+/* Conversions between "fast locals" and locals in dictionary */
+
+PyAPI_FUNC(void) PyFrame_LocalsToFast(PyFrameObject *, int);
+
+/* -- Caveat emptor --
+ * The concept of entry frames is an implementation detail of the CPython
+ * interpreter. This API is considered unstable and is provided for the
+ * convenience of debuggers, profilers and state-inspecting tools. Notice that
+ * this API can be changed in future minor versions if the underlying frame
+ * mechanism changes or the concept of an 'entry frame' or its semantics becomes
+ * obsolete or outdated. */
+
+PyAPI_FUNC(int) _PyFrame_IsEntryFrame(PyFrameObject *frame);
+
+PyAPI_FUNC(int) PyFrame_FastToLocalsWithError(PyFrameObject *f);
+PyAPI_FUNC(void) PyFrame_FastToLocals(PyFrameObject *);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/funcobject.h b/nanvix-port/cpython-headers/python3.12/cpython/funcobject.h
new file mode 100644
index 000000000000..6f78f5868d01
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/funcobject.h
@@ -0,0 +1,188 @@
+/* Function object interface */
+
+#ifndef Py_LIMITED_API
+#ifndef Py_FUNCOBJECT_H
+#define Py_FUNCOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#define COMMON_FIELDS(PREFIX) \
+    PyObject *PREFIX ## globals; \
+    PyObject *PREFIX ## builtins; \
+    PyObject *PREFIX ## name; \
+    PyObject *PREFIX ## qualname; \
+    PyObject *PREFIX ## code;        /* A code object, the __code__ attribute */ \
+    PyObject *PREFIX ## defaults;    /* NULL or a tuple */ \
+    PyObject *PREFIX ## kwdefaults;  /* NULL or a dict */ \
+    PyObject *PREFIX ## closure;     /* NULL or a tuple of cell objects */
+
+typedef struct {
+    COMMON_FIELDS(fc_)
+} PyFrameConstructor;
+
+/* Function objects and code objects should not be confused with each other:
+ *
+ * Function objects are created by the execution of the 'def' statement.
+ * They reference a code object in their __code__ attribute, which is a
+ * purely syntactic object, i.e. nothing more than a compiled version of some
+ * source code lines.  There is one code object per source code "fragment",
+ * but each code object can be referenced by zero or many function objects
+ * depending only on how many times the 'def' statement in the source was
+ * executed so far.
+ */
+
+typedef struct {
+    PyObject_HEAD
+    COMMON_FIELDS(func_)
+    PyObject *func_doc;         /* The __doc__ attribute, can be anything */
+    PyObject *func_dict;        /* The __dict__ attribute, a dict or NULL */
+    PyObject *func_weakreflist; /* List of weak references */
+    PyObject *func_module;      /* The __module__ attribute, can be anything */
+    PyObject *func_annotations; /* Annotations, a dict or NULL */
+    PyObject *func_typeparams;  /* Tuple of active type variables or NULL */
+    vectorcallfunc vectorcall;
+    /* Version number for use by specializer.
+     * Can set to non-zero when we want to specialize.
+     * Will be set to zero if any of these change:
+     *     defaults
+     *     kwdefaults (only if the object changes, not the contents of the dict)
+     *     code
+     *     annotations
+     *     vectorcall function pointer */
+    uint32_t func_version;
+
+    /* Invariant:
+     *     func_closure contains the bindings for func_code->co_freevars, so
+     *     PyTuple_Size(func_closure) == PyCode_GetNumFree(func_code)
+     *     (func_closure may be NULL if PyCode_GetNumFree(func_code) == 0).
+     */
+} PyFunctionObject;
+
+PyAPI_DATA(PyTypeObject) PyFunction_Type;
+
+#define PyFunction_Check(op) Py_IS_TYPE((op), &PyFunction_Type)
+
+PyAPI_FUNC(PyObject *) PyFunction_New(PyObject *, PyObject *);
+PyAPI_FUNC(PyObject *) PyFunction_NewWithQualName(PyObject *, PyObject *, PyObject *);
+PyAPI_FUNC(PyObject *) PyFunction_GetCode(PyObject *);
+PyAPI_FUNC(PyObject *) PyFunction_GetGlobals(PyObject *);
+PyAPI_FUNC(PyObject *) PyFunction_GetModule(PyObject *);
+PyAPI_FUNC(PyObject *) PyFunction_GetDefaults(PyObject *);
+PyAPI_FUNC(int) PyFunction_SetDefaults(PyObject *, PyObject *);
+PyAPI_FUNC(void) PyFunction_SetVectorcall(PyFunctionObject *, vectorcallfunc);
+PyAPI_FUNC(PyObject *) PyFunction_GetKwDefaults(PyObject *);
+PyAPI_FUNC(int) PyFunction_SetKwDefaults(PyObject *, PyObject *);
+PyAPI_FUNC(PyObject *) PyFunction_GetClosure(PyObject *);
+PyAPI_FUNC(int) PyFunction_SetClosure(PyObject *, PyObject *);
+PyAPI_FUNC(PyObject *) PyFunction_GetAnnotations(PyObject *);
+PyAPI_FUNC(int) PyFunction_SetAnnotations(PyObject *, PyObject *);
+
+PyAPI_FUNC(PyObject *) _PyFunction_Vectorcall(
+    PyObject *func,
+    PyObject *const *stack,
+    size_t nargsf,
+    PyObject *kwnames);
+
+#define _PyFunction_CAST(func) \
+    (assert(PyFunction_Check(func)), _Py_CAST(PyFunctionObject*, func))
+
+/* Static inline functions for direct access to these values.
+   Type checks are *not* done, so use with care. */
+static inline PyObject* PyFunction_GET_CODE(PyObject *func) {
+    return _PyFunction_CAST(func)->func_code;
+}
+#define PyFunction_GET_CODE(func) PyFunction_GET_CODE(_PyObject_CAST(func))
+
+static inline PyObject* PyFunction_GET_GLOBALS(PyObject *func) {
+    return _PyFunction_CAST(func)->func_globals;
+}
+#define PyFunction_GET_GLOBALS(func) PyFunction_GET_GLOBALS(_PyObject_CAST(func))
+
+static inline PyObject* PyFunction_GET_MODULE(PyObject *func) {
+    return _PyFunction_CAST(func)->func_module;
+}
+#define PyFunction_GET_MODULE(func) PyFunction_GET_MODULE(_PyObject_CAST(func))
+
+static inline PyObject* PyFunction_GET_DEFAULTS(PyObject *func) {
+    return _PyFunction_CAST(func)->func_defaults;
+}
+#define PyFunction_GET_DEFAULTS(func) PyFunction_GET_DEFAULTS(_PyObject_CAST(func))
+
+static inline PyObject* PyFunction_GET_KW_DEFAULTS(PyObject *func) {
+    return _PyFunction_CAST(func)->func_kwdefaults;
+}
+#define PyFunction_GET_KW_DEFAULTS(func) PyFunction_GET_KW_DEFAULTS(_PyObject_CAST(func))
+
+static inline PyObject* PyFunction_GET_CLOSURE(PyObject *func) {
+    return _PyFunction_CAST(func)->func_closure;
+}
+#define PyFunction_GET_CLOSURE(func) PyFunction_GET_CLOSURE(_PyObject_CAST(func))
+
+static inline PyObject* PyFunction_GET_ANNOTATIONS(PyObject *func) {
+    return _PyFunction_CAST(func)->func_annotations;
+}
+#define PyFunction_GET_ANNOTATIONS(func) PyFunction_GET_ANNOTATIONS(_PyObject_CAST(func))
+
+/* The classmethod and staticmethod types live here, too */
+PyAPI_DATA(PyTypeObject) PyClassMethod_Type;
+PyAPI_DATA(PyTypeObject) PyStaticMethod_Type;
+
+PyAPI_FUNC(PyObject *) PyClassMethod_New(PyObject *);
+PyAPI_FUNC(PyObject *) PyStaticMethod_New(PyObject *);
+
+#define PY_FOREACH_FUNC_EVENT(V) \
+    V(CREATE)                    \
+    V(DESTROY)                   \
+    V(MODIFY_CODE)               \
+    V(MODIFY_DEFAULTS)           \
+    V(MODIFY_KWDEFAULTS)
+
+typedef enum {
+    #define PY_DEF_EVENT(EVENT) PyFunction_EVENT_##EVENT,
+    PY_FOREACH_FUNC_EVENT(PY_DEF_EVENT)
+    #undef PY_DEF_EVENT
+} PyFunction_WatchEvent;
+
+/*
+ * A callback that is invoked for different events in a function's lifecycle.
+ *
+ * The callback is invoked with a borrowed reference to func, after it is
+ * created and before it is modified or destroyed. The callback should not
+ * modify func.
+ *
+ * When a function's code object, defaults, or kwdefaults are modified the
+ * callback will be invoked with the respective event and new_value will
+ * contain a borrowed reference to the new value that is about to be stored in
+ * the function. Otherwise the third argument is NULL.
+ *
+ * If the callback returns with an exception set, it must return -1. Otherwise
+ * it should return 0.
+ */
+typedef int (*PyFunction_WatchCallback)(
+  PyFunction_WatchEvent event,
+  PyFunctionObject *func,
+  PyObject *new_value);
+
+/*
+ * Register a per-interpreter callback that will be invoked for function lifecycle
+ * events.
+ *
+ * Returns a handle that may be passed to PyFunction_ClearWatcher on success,
+ * or -1 and sets an error if no more handles are available.
+ */
+PyAPI_FUNC(int) PyFunction_AddWatcher(PyFunction_WatchCallback callback);
+
+/*
+ * Clear the watcher associated with the watcher_id handle.
+ *
+ * Returns 0 on success or -1 if no watcher exists for the supplied id.
+ */
+PyAPI_FUNC(int) PyFunction_ClearWatcher(int watcher_id);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_FUNCOBJECT_H */
+#endif /* Py_LIMITED_API */
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/genobject.h b/nanvix-port/cpython-headers/python3.12/cpython/genobject.h
new file mode 100644
index 000000000000..7856481b5db3
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/genobject.h
@@ -0,0 +1,89 @@
+/* Generator object interface */
+
+#ifndef Py_LIMITED_API
+#ifndef Py_GENOBJECT_H
+#define Py_GENOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* --- Generators --------------------------------------------------------- */
+
+/* _PyGenObject_HEAD defines the initial segment of generator
+   and coroutine objects. */
+#define _PyGenObject_HEAD(prefix)                                           \
+    PyObject_HEAD                                                           \
+    /* List of weak references. */                                          \
+    PyObject *prefix##_weakreflist;                                         \
+    /* Name of the generator. */                                            \
+    PyObject *prefix##_name;                                                \
+    /* Qualified name of the generator. */                                  \
+    PyObject *prefix##_qualname;                                            \
+    _PyErr_StackItem prefix##_exc_state;                                    \
+    PyObject *prefix##_origin_or_finalizer;                                 \
+    char prefix##_hooks_inited;                                             \
+    char prefix##_closed;                                                   \
+    char prefix##_running_async;                                            \
+    /* The frame */                                                         \
+    int8_t prefix##_frame_state;                                            \
+    PyObject *prefix##_iframe[1];                                           \
+
+typedef struct {
+    /* The gi_ prefix is intended to remind of generator-iterator. */
+    _PyGenObject_HEAD(gi)
+} PyGenObject;
+
+PyAPI_DATA(PyTypeObject) PyGen_Type;
+
+#define PyGen_Check(op) PyObject_TypeCheck((op), &PyGen_Type)
+#define PyGen_CheckExact(op) Py_IS_TYPE((op), &PyGen_Type)
+
+PyAPI_FUNC(PyObject *) PyGen_New(PyFrameObject *);
+PyAPI_FUNC(PyObject *) PyGen_NewWithQualName(PyFrameObject *,
+    PyObject *name, PyObject *qualname);
+PyAPI_FUNC(int) _PyGen_SetStopIterationValue(PyObject *);
+PyAPI_FUNC(int) _PyGen_FetchStopIterationValue(PyObject **);
+PyAPI_FUNC(void) _PyGen_Finalize(PyObject *self);
+PyAPI_FUNC(PyCodeObject *) PyGen_GetCode(PyGenObject *gen);
+
+
+/* --- PyCoroObject ------------------------------------------------------- */
+
+typedef struct {
+    _PyGenObject_HEAD(cr)
+} PyCoroObject;
+
+PyAPI_DATA(PyTypeObject) PyCoro_Type;
+PyAPI_DATA(PyTypeObject) _PyCoroWrapper_Type;
+
+#define PyCoro_CheckExact(op) Py_IS_TYPE((op), &PyCoro_Type)
+PyAPI_FUNC(PyObject *) PyCoro_New(PyFrameObject *,
+    PyObject *name, PyObject *qualname);
+
+
+/* --- Asynchronous Generators -------------------------------------------- */
+
+typedef struct {
+    _PyGenObject_HEAD(ag)
+} PyAsyncGenObject;
+
+PyAPI_DATA(PyTypeObject) PyAsyncGen_Type;
+PyAPI_DATA(PyTypeObject) _PyAsyncGenASend_Type;
+PyAPI_DATA(PyTypeObject) _PyAsyncGenWrappedValue_Type;
+PyAPI_DATA(PyTypeObject) _PyAsyncGenAThrow_Type;
+
+PyAPI_FUNC(PyObject *) PyAsyncGen_New(PyFrameObject *,
+    PyObject *name, PyObject *qualname);
+
+#define PyAsyncGen_CheckExact(op) Py_IS_TYPE((op), &PyAsyncGen_Type)
+
+#define PyAsyncGenASend_CheckExact(op) Py_IS_TYPE((op), &_PyAsyncGenASend_Type)
+
+
+#undef _PyGenObject_HEAD
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_GENOBJECT_H */
+#endif /* Py_LIMITED_API */
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/import.h b/nanvix-port/cpython-headers/python3.12/cpython/import.h
new file mode 100644
index 000000000000..2bca4ade4c4f
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/import.h
@@ -0,0 +1,46 @@
+#ifndef Py_CPYTHON_IMPORT_H
+#  error "this header file must not be included directly"
+#endif
+
+PyMODINIT_FUNC PyInit__imp(void);
+
+PyAPI_FUNC(int) _PyImport_IsInitialized(PyInterpreterState *);
+
+PyAPI_FUNC(PyObject *) _PyImport_GetModuleId(_Py_Identifier *name);
+PyAPI_FUNC(int) _PyImport_SetModule(PyObject *name, PyObject *module);
+PyAPI_FUNC(int) _PyImport_SetModuleString(const char *name, PyObject* module);
+
+PyAPI_FUNC(void) _PyImport_AcquireLock(PyInterpreterState *interp);
+PyAPI_FUNC(int) _PyImport_ReleaseLock(PyInterpreterState *interp);
+
+PyAPI_FUNC(int) _PyImport_FixupBuiltin(
+    PyObject *mod,
+    const char *name,            /* UTF-8 encoded string */
+    PyObject *modules
+    );
+PyAPI_FUNC(int) _PyImport_FixupExtensionObject(PyObject*, PyObject *,
+                                               PyObject *, PyObject *);
+
+struct _inittab {
+    const char *name;           /* ASCII encoded string */
+    PyObject* (*initfunc)(void);
+};
+// This is not used after Py_Initialize() is called.
+PyAPI_DATA(struct _inittab *) PyImport_Inittab;
+PyAPI_FUNC(int) PyImport_ExtendInittab(struct _inittab *newtab);
+
+struct _frozen {
+    const char *name;                 /* ASCII encoded string */
+    const unsigned char *code;
+    int size;
+    int is_package;
+    PyObject *(*get_code)(void);
+};
+
+/* Embedding apps may change this pointer to point to their favorite
+   collection of frozen modules: */
+
+PyAPI_DATA(const struct _frozen *) PyImport_FrozenModules;
+
+PyAPI_FUNC(PyObject *) _PyImport_GetModuleAttr(PyObject *, PyObject *);
+PyAPI_FUNC(PyObject *) _PyImport_GetModuleAttrString(const char *, const char *);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/initconfig.h b/nanvix-port/cpython-headers/python3.12/cpython/initconfig.h
new file mode 100644
index 000000000000..cbae97f12f53
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/initconfig.h
@@ -0,0 +1,256 @@
+#ifndef Py_PYCORECONFIG_H
+#define Py_PYCORECONFIG_H
+#ifndef Py_LIMITED_API
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* --- PyStatus ----------------------------------------------- */
+
+typedef struct {
+    enum {
+        _PyStatus_TYPE_OK=0,
+        _PyStatus_TYPE_ERROR=1,
+        _PyStatus_TYPE_EXIT=2
+    } _type;
+    const char *func;
+    const char *err_msg;
+    int exitcode;
+} PyStatus;
+
+PyAPI_FUNC(PyStatus) PyStatus_Ok(void);
+PyAPI_FUNC(PyStatus) PyStatus_Error(const char *err_msg);
+PyAPI_FUNC(PyStatus) PyStatus_NoMemory(void);
+PyAPI_FUNC(PyStatus) PyStatus_Exit(int exitcode);
+PyAPI_FUNC(int) PyStatus_IsError(PyStatus err);
+PyAPI_FUNC(int) PyStatus_IsExit(PyStatus err);
+PyAPI_FUNC(int) PyStatus_Exception(PyStatus err);
+PyAPI_FUNC(PyObject *) _PyErr_SetFromPyStatus(PyStatus status);
+
+/* --- PyWideStringList ------------------------------------------------ */
+
+typedef struct {
+    /* If length is greater than zero, items must be non-NULL
+       and all item strings must be non-NULL */
+    Py_ssize_t length;
+    wchar_t **items;
+} PyWideStringList;
+
+PyAPI_FUNC(PyStatus) PyWideStringList_Append(PyWideStringList *list,
+    const wchar_t *item);
+PyAPI_FUNC(PyStatus) PyWideStringList_Insert(PyWideStringList *list,
+    Py_ssize_t index,
+    const wchar_t *item);
+
+
+/* --- PyPreConfig ----------------------------------------------- */
+
+typedef struct PyPreConfig {
+    int _config_init;     /* _PyConfigInitEnum value */
+
+    /* Parse Py_PreInitializeFromBytesArgs() arguments?
+       See PyConfig.parse_argv */
+    int parse_argv;
+
+    /* If greater than 0, enable isolated mode: sys.path contains
+       neither the script's directory nor the user's site-packages directory.
+
+       Set to 1 by the -I command line option. If set to -1 (default), inherit
+       Py_IsolatedFlag value. */
+    int isolated;
+
+    /* If greater than 0: use environment variables.
+       Set to 0 by -E command line option. If set to -1 (default), it is
+       set to !Py_IgnoreEnvironmentFlag. */
+    int use_environment;
+
+    /* Set the LC_CTYPE locale to the user preferred locale? If equal to 0,
+       set coerce_c_locale and coerce_c_locale_warn to 0. */
+    int configure_locale;
+
+    /* Coerce the LC_CTYPE locale if it's equal to "C"? (PEP 538)
+
+       Set to 0 by PYTHONCOERCECLOCALE=0. Set to 1 by PYTHONCOERCECLOCALE=1.
+       Set to 2 if the user preferred LC_CTYPE locale is "C".
+
+       If it is equal to 1, LC_CTYPE locale is read to decide if it should be
+       coerced or not (ex: PYTHONCOERCECLOCALE=1). Internally, it is set to 2
+       if the LC_CTYPE locale must be coerced.
+
+       Disabled by default (set to 0). Set it to -1 to let Python decide if it
+       should be enabled or not. */
+    int coerce_c_locale;
+
+    /* Emit a warning if the LC_CTYPE locale is coerced?
+
+       Set to 1 by PYTHONCOERCECLOCALE=warn.
+
+       Disabled by default (set to 0). Set it to -1 to let Python decide if it
+       should be enabled or not. */
+    int coerce_c_locale_warn;
+
+#ifdef MS_WINDOWS
+    /* If greater than 0, use the "mbcs" encoding instead of the UTF-8
+       encoding for the filesystem encoding.
+
+       Set to 1 if the PYTHONLEGACYWINDOWSFSENCODING environment variable is
+       set to a non-empty string. If set to -1 (default), inherit
+       Py_LegacyWindowsFSEncodingFlag value.
+
+       See PEP 529 for more details. */
+    int legacy_windows_fs_encoding;
+#endif
+
+    /* Enable UTF-8 mode? (PEP 540)
+
+       Disabled by default (equal to 0).
+
+       Set to 1 by "-X utf8" and "-X utf8=1" command line options.
+       Set to 1 by PYTHONUTF8=1 environment variable.
+
+       Set to 0 by "-X utf8=0" and PYTHONUTF8=0.
+
+       If equal to -1, it is set to 1 if the LC_CTYPE locale is "C" or
+       "POSIX", otherwise it is set to 0. Inherit Py_UTF8Mode value. */
+    int utf8_mode;
+
+    /* If non-zero, enable the Python Development Mode.
+
+       Set to 1 by the -X dev command line option. Set by the PYTHONDEVMODE
+       environment variable. */
+    int dev_mode;
+
+    /* Memory allocator: PYTHONMALLOC env var.
+       See PyMemAllocatorName for valid values. */
+    int allocator;
+} PyPreConfig;
+
+PyAPI_FUNC(void) PyPreConfig_InitPythonConfig(PyPreConfig *config);
+PyAPI_FUNC(void) PyPreConfig_InitIsolatedConfig(PyPreConfig *config);
+
+
+/* --- PyConfig ---------------------------------------------- */
+
+/* This structure is best documented in the Doc/c-api/init_config.rst file. */
+typedef struct PyConfig {
+    int _config_init;     /* _PyConfigInitEnum value */
+
+    int isolated;
+    int use_environment;
+    int dev_mode;
+    int install_signal_handlers;
+    int use_hash_seed;
+    unsigned long hash_seed;
+    int faulthandler;
+    int tracemalloc;
+    int perf_profiling;
+    int import_time;
+    int code_debug_ranges;
+    int show_ref_count;
+    int dump_refs;
+    wchar_t *dump_refs_file;
+    int malloc_stats;
+    wchar_t *filesystem_encoding;
+    wchar_t *filesystem_errors;
+    wchar_t *pycache_prefix;
+    int parse_argv;
+    PyWideStringList orig_argv;
+    PyWideStringList argv;
+    PyWideStringList xoptions;
+    PyWideStringList warnoptions;
+    int site_import;
+    int bytes_warning;
+    int warn_default_encoding;
+    int inspect;
+    int interactive;
+    int optimization_level;
+    int parser_debug;
+    int write_bytecode;
+    int verbose;
+    int quiet;
+    int user_site_directory;
+    int configure_c_stdio;
+    int buffered_stdio;
+    wchar_t *stdio_encoding;
+    wchar_t *stdio_errors;
+#ifdef MS_WINDOWS
+    int legacy_windows_stdio;
+#endif
+    wchar_t *check_hash_pycs_mode;
+    int use_frozen_modules;
+    int safe_path;
+    int int_max_str_digits;
+
+    /* --- Path configuration inputs ------------ */
+    int pathconfig_warnings;
+    wchar_t *program_name;
+    wchar_t *pythonpath_env;
+    wchar_t *home;
+    wchar_t *platlibdir;
+
+    /* --- Path configuration outputs ----------- */
+    int module_search_paths_set;
+    PyWideStringList module_search_paths;
+    wchar_t *stdlib_dir;
+    wchar_t *executable;
+    wchar_t *base_executable;
+    wchar_t *prefix;
+    wchar_t *base_prefix;
+    wchar_t *exec_prefix;
+    wchar_t *base_exec_prefix;
+
+    /* --- Parameter only used by Py_Main() ---------- */
+    int skip_source_first_line;
+    wchar_t *run_command;
+    wchar_t *run_module;
+    wchar_t *run_filename;
+
+    /* --- Private fields ---------------------------- */
+
+    // Install importlib? If equal to 0, importlib is not initialized at all.
+    // Needed by freeze_importlib.
+    int _install_importlib;
+
+    // If equal to 0, stop Python initialization before the "main" phase.
+    int _init_main;
+
+    // If non-zero, we believe we're running from a source tree.
+    int _is_python_build;
+} PyConfig;
+
+PyAPI_FUNC(void) PyConfig_InitPythonConfig(PyConfig *config);
+PyAPI_FUNC(void) PyConfig_InitIsolatedConfig(PyConfig *config);
+PyAPI_FUNC(void) PyConfig_Clear(PyConfig *);
+PyAPI_FUNC(PyStatus) PyConfig_SetString(
+    PyConfig *config,
+    wchar_t **config_str,
+    const wchar_t *str);
+PyAPI_FUNC(PyStatus) PyConfig_SetBytesString(
+    PyConfig *config,
+    wchar_t **config_str,
+    const char *str);
+PyAPI_FUNC(PyStatus) PyConfig_Read(PyConfig *config);
+PyAPI_FUNC(PyStatus) PyConfig_SetBytesArgv(
+    PyConfig *config,
+    Py_ssize_t argc,
+    char * const *argv);
+PyAPI_FUNC(PyStatus) PyConfig_SetArgv(PyConfig *config,
+    Py_ssize_t argc,
+    wchar_t * const *argv);
+PyAPI_FUNC(PyStatus) PyConfig_SetWideStringList(PyConfig *config,
+    PyWideStringList *list,
+    Py_ssize_t length, wchar_t **items);
+
+
+/* --- Helper functions --------------------------------------- */
+
+/* Get the original command line arguments, before Python modified them.
+
+   See also PyConfig.orig_argv. */
+PyAPI_FUNC(void) Py_GetArgcArgv(int *argc, wchar_t ***argv);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_LIMITED_API */
+#endif /* !Py_PYCORECONFIG_H */
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/interpreteridobject.h b/nanvix-port/cpython-headers/python3.12/cpython/interpreteridobject.h
new file mode 100644
index 000000000000..5076584209b9
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/interpreteridobject.h
@@ -0,0 +1,11 @@
+#ifndef Py_CPYTHON_INTERPRETERIDOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+/* Interpreter ID Object */
+
+PyAPI_DATA(PyTypeObject) _PyInterpreterID_Type;
+
+PyAPI_FUNC(PyObject *) _PyInterpreterID_New(int64_t);
+PyAPI_FUNC(PyObject *) _PyInterpreterState_GetIDObject(PyInterpreterState *);
+PyAPI_FUNC(PyInterpreterState *) _PyInterpreterID_LookUp(PyObject *);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/listobject.h b/nanvix-port/cpython-headers/python3.12/cpython/listobject.h
new file mode 100644
index 000000000000..8fa82122d8d2
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/listobject.h
@@ -0,0 +1,47 @@
+#ifndef Py_CPYTHON_LISTOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+typedef struct {
+    PyObject_VAR_HEAD
+    /* Vector of pointers to list elements.  list[0] is ob_item[0], etc. */
+    PyObject **ob_item;
+
+    /* ob_item contains space for 'allocated' elements.  The number
+     * currently in use is ob_size.
+     * Invariants:
+     *     0 <= ob_size <= allocated
+     *     len(list) == ob_size
+     *     ob_item == NULL implies ob_size == allocated == 0
+     * list.sort() temporarily sets allocated to -1 to detect mutations.
+     *
+     * Items must normally not be NULL, except during construction when
+     * the list is not yet visible outside the function that builds it.
+     */
+    Py_ssize_t allocated;
+} PyListObject;
+
+PyAPI_FUNC(PyObject *) _PyList_Extend(PyListObject *, PyObject *);
+PyAPI_FUNC(void) _PyList_DebugMallocStats(FILE *out);
+
+/* Cast argument to PyListObject* type. */
+#define _PyList_CAST(op) \
+    (assert(PyList_Check(op)), _Py_CAST(PyListObject*, (op)))
+
+// Macros and static inline functions, trading safety for speed
+
+static inline Py_ssize_t PyList_GET_SIZE(PyObject *op) {
+    /* No runtime type check: _PyList_CAST() only asserts PyList_Check(). */
+    return Py_SIZE(_PyList_CAST(op));
+}
+#define PyList_GET_SIZE(op) PyList_GET_SIZE(_PyObject_CAST(op))
+
+#define PyList_GET_ITEM(op, index) (_PyList_CAST(op)->ob_item[(index)])
+
+static inline void
+PyList_SET_ITEM(PyObject *op, Py_ssize_t index, PyObject *value) {
+    /* Plain store into the slot: whatever was there before is not released. */
+    _PyList_CAST(op)->ob_item[index] = value;
+}
+#define PyList_SET_ITEM(op, index, value) \
+    PyList_SET_ITEM(_PyObject_CAST(op), (index), _PyObject_CAST(value))
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/longintrepr.h b/nanvix-port/cpython-headers/python3.12/cpython/longintrepr.h
new file mode 100644
index 000000000000..78ac79a7cb88
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/longintrepr.h
@@ -0,0 +1,133 @@
+#ifndef Py_LIMITED_API
+#ifndef Py_LONGINTREPR_H
+#define Py_LONGINTREPR_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* This is published for the benefit of "friends" marshal.c and _decimal.c. */
+
+/* Parameters of the integer representation.  There are two different
+   sets of parameters: one set for 30-bit digits, stored in an unsigned 32-bit
+   integer type, and one set for 15-bit digits with each digit stored in an
+   unsigned short.  The value of PYLONG_BITS_IN_DIGIT, defined either at
+   configure time or in pyport.h, is used to decide which digit size to use.
+
+   Type 'digit' should be able to hold 2*PyLong_BASE-1, and type 'twodigits'
+   should be an unsigned integer type able to hold all integers up to
+   PyLong_BASE*PyLong_BASE-1.  x_sub assumes that 'digit' is an unsigned type,
+   and that overflow is handled by taking the result modulo 2**N for some N >
+   PyLong_SHIFT.  The majority of the code doesn't care about the precise
+   value of PyLong_SHIFT, but there are some notable exceptions:
+
+   - PyLong_{As,From}ByteArray require that PyLong_SHIFT be at least 8
+
+   - long_hash() requires that PyLong_SHIFT is *strictly* less than the number
+     of bits in an unsigned long, as do the PyLong <-> long (or unsigned long)
+     conversion functions
+
+   - the Python int <-> size_t/Py_ssize_t conversion functions expect that
+     PyLong_SHIFT is strictly less than the number of bits in a size_t
+
+   - the marshal code currently expects that PyLong_SHIFT is a multiple of 15
+
+   - NSMALLNEGINTS and NSMALLPOSINTS should be small enough to fit in a single
+     digit; with the current values this forces PyLong_SHIFT >= 9
+
+  The values 15 and 30 should fit all of the above requirements, on any
+  platform.
+*/
+
+#if PYLONG_BITS_IN_DIGIT == 30
+typedef uint32_t digit;
+typedef int32_t sdigit; /* signed variant of digit */
+typedef uint64_t twodigits;
+typedef int64_t stwodigits; /* signed variant of twodigits */
+#define PyLong_SHIFT    30
+#define _PyLong_DECIMAL_SHIFT   9 /* max(e such that 10**e fits in a digit) */
+#define _PyLong_DECIMAL_BASE    ((digit)1000000000) /* 10 ** DECIMAL_SHIFT */
+#elif PYLONG_BITS_IN_DIGIT == 15
+typedef unsigned short digit;
+typedef short sdigit; /* signed variant of digit */
+typedef unsigned long twodigits;
+typedef long stwodigits; /* signed variant of twodigits */
+#define PyLong_SHIFT    15
+#define _PyLong_DECIMAL_SHIFT   4 /* max(e such that 10**e fits in a digit) */
+#define _PyLong_DECIMAL_BASE    ((digit)10000) /* 10 ** DECIMAL_SHIFT */
+#else
+#error "PYLONG_BITS_IN_DIGIT should be 15 or 30"
+#endif
+#define PyLong_BASE     ((digit)1 << PyLong_SHIFT)
+#define PyLong_MASK     ((digit)(PyLong_BASE - 1))
+
+/* Long integer representation.
+   The absolute value of a number is equal to
+        SUM(for i=0 through abs(ob_size)-1) ob_digit[i] * 2**(SHIFT*i)
+   Negative numbers are represented with ob_size < 0;
+   zero is represented by ob_size == 0.
+   In a normalized number, ob_digit[abs(ob_size)-1] (the most significant
+   digit) is never zero.  Also, in all cases, for all valid i,
+        0 <= ob_digit[i] <= MASK.
+   The allocation function takes care of allocating extra memory
+   so that ob_digit[0] ... ob_digit[abs(ob_size)-1] are actually available.
+   We always allocate memory for at least one digit, so accessing ob_digit[0]
+   is always safe. However, in the case ob_size == 0, the contents of
+   ob_digit[0] may be undefined.
+
+   CAUTION:  Generic code manipulating subtypes of PyVarObject has to
+   be aware that ints abuse ob_size's sign bit.
+*/
+
+typedef struct _PyLongValue {
+    uintptr_t lv_tag; /* Number of digits, sign and flags */
+    digit ob_digit[1];
+} _PyLongValue;
+
+struct _longobject {
+    PyObject_HEAD
+    _PyLongValue long_value;
+};
+
+PyAPI_FUNC(PyLongObject *) _PyLong_New(Py_ssize_t);
+
+/* Return a copy of src. */
+PyAPI_FUNC(PyObject *) _PyLong_Copy(PyLongObject *src);
+
+PyAPI_FUNC(PyLongObject *)
+_PyLong_FromDigits(int negative, Py_ssize_t digit_count, digit *digits);
+
+
+/* Inline some internals for speed. These should be in pycore_long.h
+ * if user code didn't need them inlined. */
+
+#define _PyLong_SIGN_MASK 3
+#define _PyLong_NON_SIZE_BITS 3
+
+
+static inline int
+_PyLong_IsCompact(const PyLongObject* op) {
+    assert(PyType_HasFeature((op)->ob_base.ob_type, Py_TPFLAGS_LONG_SUBCLASS));
+    return op->long_value.lv_tag < (2 << _PyLong_NON_SIZE_BITS);
+}
+
+#define PyUnstable_Long_IsCompact _PyLong_IsCompact
+
+static inline Py_ssize_t
+_PyLong_CompactValue(const PyLongObject *op)
+{
+    Py_ssize_t sign;  /* declared ahead of the asserts, assigned after them */
+    assert(PyType_HasFeature((op)->ob_base.ob_type, Py_TPFLAGS_LONG_SUBCLASS));
+    assert(PyUnstable_Long_IsCompact(op));
+    sign = 1 - (op->long_value.lv_tag & _PyLong_SIGN_MASK);  /* low tag bits 0/1/2 -> +1/0/-1 */
+    return sign * (Py_ssize_t)op->long_value.ob_digit[0];  /* first digit scaled by the sign */
+}
+
+#define PyUnstable_Long_CompactValue _PyLong_CompactValue
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_LONGINTREPR_H */
+#endif /* Py_LIMITED_API */
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/longobject.h b/nanvix-port/cpython-headers/python3.12/cpython/longobject.h
new file mode 100644
index 000000000000..90cc0f267ae8
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/longobject.h
@@ -0,0 +1,100 @@
+#ifndef Py_CPYTHON_LONGOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+PyAPI_FUNC(int) _PyLong_AsInt(PyObject *);
+
+PyAPI_FUNC(int) _PyLong_UnsignedShort_Converter(PyObject *, void *);
+PyAPI_FUNC(int) _PyLong_UnsignedInt_Converter(PyObject *, void *);
+PyAPI_FUNC(int) _PyLong_UnsignedLong_Converter(PyObject *, void *);
+PyAPI_FUNC(int) _PyLong_UnsignedLongLong_Converter(PyObject *, void *);
+PyAPI_FUNC(int) _PyLong_Size_t_Converter(PyObject *, void *);
+
+/* _PyLong_Frexp returns a double x and an exponent e such that the
+   true value is approximately equal to x * 2**e.  e is >= 0.  x is
+   0.0 if and only if the input is 0 (in which case, e and x are both
+   zeroes); otherwise, 0.5 <= abs(x) < 1.0.  On overflow, which is
+   possible if the number of bits doesn't fit into a Py_ssize_t, sets
+   OverflowError and returns -1.0 for x, 0 for e. */
+PyAPI_FUNC(double) _PyLong_Frexp(PyLongObject *a, Py_ssize_t *e);
+
+PyAPI_FUNC(PyObject *) PyLong_FromUnicodeObject(PyObject *u, int base);
+PyAPI_FUNC(PyObject *) _PyLong_FromBytes(const char *, Py_ssize_t, int);
+
+/* _PyLong_Sign.  Return 0 if v is 0, -1 if v < 0, +1 if v > 0.
+   v must not be NULL, and must be a normalized long.
+   There are no error cases.
+*/
+PyAPI_FUNC(int) _PyLong_Sign(PyObject *v);
+
+/* _PyLong_NumBits.  Return the number of bits needed to represent the
+   absolute value of a long.  For example, this returns 1 for 1 and -1, 2
+   for 2 and -2, and 2 for 3 and -3.  It returns 0 for 0.
+   v must not be NULL, and must be a normalized long.
+   (size_t)-1 is returned and OverflowError set if the true result doesn't
+   fit in a size_t.
+*/
+PyAPI_FUNC(size_t) _PyLong_NumBits(PyObject *v);
+
+/* _PyLong_DivmodNear.  Given integers a and b, compute the nearest
+   integer q to the exact quotient a / b, rounding to the nearest even integer
+   in the case of a tie.  Return (q, r), where r = a - q*b.  The remainder r
+   will satisfy abs(r) <= abs(b)/2, with equality possible only if q is
+   even.
+*/
+PyAPI_FUNC(PyObject *) _PyLong_DivmodNear(PyObject *, PyObject *);
+
+/* _PyLong_FromByteArray:  View the n unsigned bytes as a binary integer in
+   base 256, and return a Python int with the same numeric value.
+   If n is 0, the integer is 0.  Else:
+   If little_endian is 1/true, bytes[n-1] is the MSB and bytes[0] the LSB;
+   else (little_endian is 0/false) bytes[0] is the MSB and bytes[n-1] the
+   LSB.
+   If is_signed is 0/false, view the bytes as a non-negative integer.
+   If is_signed is 1/true, view the bytes as a 2's-complement integer,
+   non-negative if bit 0x80 of the MSB is clear, negative if set.
+   Error returns:
+   + Return NULL with the appropriate exception set if there's not
+     enough memory to create the Python int.
+*/
+PyAPI_FUNC(PyObject *) _PyLong_FromByteArray(
+    const unsigned char* bytes, size_t n,
+    int little_endian, int is_signed);
+
+/* _PyLong_AsByteArray: Convert the least-significant 8*n bits of long
+   v to a base-256 integer, stored in array bytes.  Normally return 0,
+   return -1 on error.
+   If little_endian is 1/true, store the MSB at bytes[n-1] and the LSB at
+   bytes[0]; else (little_endian is 0/false) store the MSB at bytes[0] and
+   the LSB at bytes[n-1].
+   If is_signed is 0/false, it's an error if v < 0; else (v >= 0) n bytes
+   are filled and there's nothing special about bit 0x80 of the MSB.
+   If is_signed is 1/true, bytes is filled with the 2's-complement
+   representation of v's value.  Bit 0x80 of the MSB is the sign bit.
+   Error returns (-1):
+   + is_signed is 0 and v < 0.  TypeError is set in this case, and bytes
+     isn't altered.
+   + n isn't big enough to hold the full mathematical value of v.  For
+     example, if is_signed is 0 and there are more digits in v than
+     fit in n; or if is_signed is 1, v < 0, and n is just 1 bit shy of
+     being large enough to hold a sign bit.  OverflowError is set in this
+     case, but bytes holds the least-significant n bytes of the true value.
+*/
+PyAPI_FUNC(int) _PyLong_AsByteArray(PyLongObject* v,
+    unsigned char* bytes, size_t n,
+    int little_endian, int is_signed);
+
+/* _PyLong_Format: Convert the long to a string object with given base,
+   prepending a base prefix of 0[box] if base is 2, 8 or 16. */
+PyAPI_FUNC(PyObject *) _PyLong_Format(PyObject *obj, int base);
+
+/* For use by the gcd function in mathmodule.c */
+PyAPI_FUNC(PyObject *) _PyLong_GCD(PyObject *, PyObject *);
+
+PyAPI_FUNC(PyObject *) _PyLong_Rshift(PyObject *, size_t);
+PyAPI_FUNC(PyObject *) _PyLong_Lshift(PyObject *, size_t);
+
+
+PyAPI_FUNC(int) PyUnstable_Long_IsCompact(const PyLongObject* op);
+PyAPI_FUNC(Py_ssize_t) PyUnstable_Long_CompactValue(const PyLongObject* op);
+
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/memoryobject.h b/nanvix-port/cpython-headers/python3.12/cpython/memoryobject.h
new file mode 100644
index 000000000000..3837fa8c6ab5
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/memoryobject.h
@@ -0,0 +1,52 @@
+#ifndef Py_CPYTHON_MEMORYOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+PyAPI_DATA(PyTypeObject) _PyManagedBuffer_Type;
+
+/* The structs are declared here so that macros can work, but they shouldn't
+   be considered public. Don't access their fields directly, use the macros
+   and functions instead! */
+#define _Py_MANAGED_BUFFER_RELEASED    0x001  /* access to exporter blocked */
+#define _Py_MANAGED_BUFFER_FREE_FORMAT 0x002  /* free format */
+
+typedef struct {
+    PyObject_HEAD
+    int flags;          /* state flags */
+    Py_ssize_t exports; /* number of direct memoryview exports */
+    Py_buffer master; /* snapshot buffer obtained from the original exporter */
+} _PyManagedBufferObject;
+
+
+/* memoryview state flags */
+#define _Py_MEMORYVIEW_RELEASED    0x001  /* access to master buffer blocked */
+#define _Py_MEMORYVIEW_C           0x002  /* C-contiguous layout */
+#define _Py_MEMORYVIEW_FORTRAN     0x004  /* Fortran contiguous layout */
+#define _Py_MEMORYVIEW_SCALAR      0x008  /* scalar: ndim = 0 */
+#define _Py_MEMORYVIEW_PIL         0x010  /* PIL-style layout */
+#define _Py_MEMORYVIEW_RESTRICTED  0x020  /* Disallow new references to the memoryview's buffer */
+
+typedef struct {
+    PyObject_VAR_HEAD
+    _PyManagedBufferObject *mbuf; /* managed buffer */
+    Py_hash_t hash;               /* hash value for read-only views */
+    int flags;                    /* state flags */
+    Py_ssize_t exports;           /* number of buffer re-exports */
+    Py_buffer view;               /* private copy of the exporter's view */
+    PyObject *weakreflist;
+    Py_ssize_t ob_array[1];       /* shape, strides, suboffsets */
+} PyMemoryViewObject;
+
+#define _PyMemoryView_CAST(op) _Py_CAST(PyMemoryViewObject*, op)
+
+/* Get a pointer to the memoryview's private copy of the exporter's buffer. */
+static inline Py_buffer* PyMemoryView_GET_BUFFER(PyObject *op) {
+    return (&_PyMemoryView_CAST(op)->view);
+}
+#define PyMemoryView_GET_BUFFER(op) PyMemoryView_GET_BUFFER(_PyObject_CAST(op))
+
+/* Get a pointer to the exporting object (this may be NULL!). */
+static inline PyObject* PyMemoryView_GET_BASE(PyObject *op) {
+    return _PyMemoryView_CAST(op)->view.obj;
+}
+#define PyMemoryView_GET_BASE(op) PyMemoryView_GET_BASE(_PyObject_CAST(op))
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/methodobject.h b/nanvix-port/cpython-headers/python3.12/cpython/methodobject.h
new file mode 100644
index 000000000000..d541e1549480
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/methodobject.h
@@ -0,0 +1,66 @@
+#ifndef Py_CPYTHON_METHODOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+// PyCFunctionObject structure
+
+typedef struct {
+    PyObject_HEAD
+    PyMethodDef *m_ml; /* Description of the C function to call */
+    PyObject    *m_self; /* Passed as 'self' arg to the C func, can be NULL */
+    PyObject    *m_module; /* The __module__ attribute, can be anything */
+    PyObject    *m_weakreflist; /* List of weak references */
+    vectorcallfunc vectorcall;
+} PyCFunctionObject;
+
+#define _PyCFunctionObject_CAST(func) \
+    (assert(PyCFunction_Check(func)), \
+     _Py_CAST(PyCFunctionObject*, (func)))
+
+
+// PyCMethodObject structure
+
+typedef struct {
+    PyCFunctionObject func;
+    PyTypeObject *mm_class; /* Class that defines this method */
+} PyCMethodObject;
+
+#define _PyCMethodObject_CAST(func) \
+    (assert(PyCMethod_Check(func)), \
+     _Py_CAST(PyCMethodObject*, (func)))
+
+PyAPI_DATA(PyTypeObject) PyCMethod_Type;
+
+#define PyCMethod_CheckExact(op) Py_IS_TYPE((op), &PyCMethod_Type)
+#define PyCMethod_Check(op) PyObject_TypeCheck((op), &PyCMethod_Type)
+
+
+/* Static inline functions for direct access to these values.
+   Type checks are *not* done, so use with care. */
+static inline PyCFunction PyCFunction_GET_FUNCTION(PyObject *func) {
+    return _PyCFunctionObject_CAST(func)->m_ml->ml_meth;
+}
+#define PyCFunction_GET_FUNCTION(func) PyCFunction_GET_FUNCTION(_PyObject_CAST(func))
+
+static inline PyObject* PyCFunction_GET_SELF(PyObject *func_obj) {
+    PyCFunctionObject *func = _PyCFunctionObject_CAST(func_obj);
+    if (func->m_ml->ml_flags & METH_STATIC) {
+        return _Py_NULL;
+    }
+    return func->m_self;
+}
+#define PyCFunction_GET_SELF(func) PyCFunction_GET_SELF(_PyObject_CAST(func))
+
+static inline int PyCFunction_GET_FLAGS(PyObject *func) {
+    return _PyCFunctionObject_CAST(func)->m_ml->ml_flags;
+}
+#define PyCFunction_GET_FLAGS(func) PyCFunction_GET_FLAGS(_PyObject_CAST(func))
+
+static inline PyTypeObject* PyCFunction_GET_CLASS(PyObject *func_obj) {
+    PyCFunctionObject *func = _PyCFunctionObject_CAST(func_obj);
+    if (func->m_ml->ml_flags & METH_METHOD) {
+        return _PyCMethodObject_CAST(func)->mm_class;
+    }
+    return _Py_NULL;
+}
+#define PyCFunction_GET_CLASS(func) PyCFunction_GET_CLASS(_PyObject_CAST(func))
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/modsupport.h b/nanvix-port/cpython-headers/python3.12/cpython/modsupport.h
new file mode 100644
index 000000000000..2259291aff67
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/modsupport.h
@@ -0,0 +1,109 @@
+#ifndef Py_CPYTHON_MODSUPPORT_H
+#  error "this header file must not be included directly"
+#endif
+
+/* If PY_SSIZE_T_CLEAN is defined, each function treats the #-specifier
+   to mean Py_ssize_t */
+#ifdef PY_SSIZE_T_CLEAN
+#define _Py_VaBuildStack                _Py_VaBuildStack_SizeT
+#else
+PyAPI_FUNC(PyObject *) _Py_VaBuildValue_SizeT(const char *, va_list);
+PyAPI_FUNC(PyObject **) _Py_VaBuildStack_SizeT(
+    PyObject **small_stack,
+    Py_ssize_t small_stack_len,
+    const char *format,
+    va_list va,
+    Py_ssize_t *p_nargs);
+#endif
+
+PyAPI_FUNC(int) _PyArg_UnpackStack(
+    PyObject *const *args,
+    Py_ssize_t nargs,
+    const char *name,
+    Py_ssize_t min,
+    Py_ssize_t max,
+    ...);
+
+PyAPI_FUNC(int) _PyArg_NoKeywords(const char *funcname, PyObject *kwargs);
+PyAPI_FUNC(int) _PyArg_NoKwnames(const char *funcname, PyObject *kwnames);
+PyAPI_FUNC(int) _PyArg_NoPositional(const char *funcname, PyObject *args);
+#define _PyArg_NoKeywords(funcname, kwargs) \
+    ((kwargs) == NULL || _PyArg_NoKeywords((funcname), (kwargs)))
+#define _PyArg_NoKwnames(funcname, kwnames) \
+    ((kwnames) == NULL || _PyArg_NoKwnames((funcname), (kwnames)))
+#define _PyArg_NoPositional(funcname, args) \
+    ((args) == NULL || _PyArg_NoPositional((funcname), (args)))
+
+#define _Py_ANY_VARARGS(n) ((n) == PY_SSIZE_T_MAX)
+
+PyAPI_FUNC(void) _PyArg_BadArgument(const char *, const char *, const char *, PyObject *);
+PyAPI_FUNC(int) _PyArg_CheckPositional(const char *, Py_ssize_t,
+                                       Py_ssize_t, Py_ssize_t);
+#define _PyArg_CheckPositional(funcname, nargs, min, max) \
+    ((!_Py_ANY_VARARGS(max) && (min) <= (nargs) && (nargs) <= (max)) \
+     || _PyArg_CheckPositional((funcname), (nargs), (min), (max)))
+
+PyAPI_FUNC(PyObject **) _Py_VaBuildStack(
+    PyObject **small_stack,
+    Py_ssize_t small_stack_len,
+    const char *format,
+    va_list va,
+    Py_ssize_t *p_nargs);
+
+typedef struct _PyArg_Parser {
+    int initialized;
+    const char *format;
+    const char * const *keywords;
+    const char *fname;
+    const char *custom_msg;
+    int pos;            /* number of positional-only arguments */
+    int min;            /* minimal number of arguments */
+    int max;            /* maximal number of positional arguments */
+    PyObject *kwtuple;  /* tuple of keyword parameter names */
+    struct _PyArg_Parser *next;
+} _PyArg_Parser;
+
+#ifdef PY_SSIZE_T_CLEAN
+#define _PyArg_ParseTupleAndKeywordsFast  _PyArg_ParseTupleAndKeywordsFast_SizeT
+#define _PyArg_ParseStack  _PyArg_ParseStack_SizeT
+#define _PyArg_ParseStackAndKeywords  _PyArg_ParseStackAndKeywords_SizeT
+#define _PyArg_VaParseTupleAndKeywordsFast  _PyArg_VaParseTupleAndKeywordsFast_SizeT
+#endif
+
+PyAPI_FUNC(int) _PyArg_ParseTupleAndKeywordsFast(PyObject *, PyObject *,
+                                                 struct _PyArg_Parser *, ...);
+PyAPI_FUNC(int) _PyArg_ParseStack(
+    PyObject *const *args,
+    Py_ssize_t nargs,
+    const char *format,
+    ...);
+PyAPI_FUNC(int) _PyArg_ParseStackAndKeywords(
+    PyObject *const *args,
+    Py_ssize_t nargs,
+    PyObject *kwnames,
+    struct _PyArg_Parser *,
+    ...);
+PyAPI_FUNC(int) _PyArg_VaParseTupleAndKeywordsFast(PyObject *, PyObject *,
+                                                   struct _PyArg_Parser *, va_list);
+PyAPI_FUNC(PyObject * const *) _PyArg_UnpackKeywords(
+        PyObject *const *args, Py_ssize_t nargs,
+        PyObject *kwargs, PyObject *kwnames,
+        struct _PyArg_Parser *parser,
+        int minpos, int maxpos, int minkw,
+        PyObject **buf);
+
+PyAPI_FUNC(PyObject * const *) _PyArg_UnpackKeywordsWithVararg(
+        PyObject *const *args, Py_ssize_t nargs,
+        PyObject *kwargs, PyObject *kwnames,
+        struct _PyArg_Parser *parser,
+        int minpos, int maxpos, int minkw,
+        int vararg, PyObject **buf);
+
+#define _PyArg_UnpackKeywords(args, nargs, kwargs, kwnames, parser, minpos, maxpos, minkw, buf) \
+    (((minkw) == 0 && (kwargs) == NULL && (kwnames) == NULL && \
+      (minpos) <= (nargs) && (nargs) <= (maxpos) && (args) != NULL) ? (args) : \
+     _PyArg_UnpackKeywords((args), (nargs), (kwargs), (kwnames), (parser), \
+                           (minpos), (maxpos), (minkw), (buf)))
+
+PyAPI_FUNC(PyObject *) _PyModule_CreateInitialized(PyModuleDef*, int apiver);
+PyAPI_FUNC(int) _PyModule_Add(PyObject *, const char *, PyObject *);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/object.h b/nanvix-port/cpython-headers/python3.12/cpython/object.h
new file mode 100644
index 000000000000..ae7f780a9318
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/object.h
@@ -0,0 +1,575 @@
+#ifndef Py_CPYTHON_OBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+PyAPI_FUNC(void) _Py_NewReference(PyObject *op);
+PyAPI_FUNC(void) _Py_NewReferenceNoTotal(PyObject *op);
+
+#ifdef Py_TRACE_REFS
+/* Py_TRACE_REFS is such major surgery that we call external routines. */
+PyAPI_FUNC(void) _Py_ForgetReference(PyObject *);
+#endif
+
+#ifdef Py_REF_DEBUG
+/* These are useful as debugging aids when chasing down refleaks. */
+PyAPI_FUNC(Py_ssize_t) _Py_GetGlobalRefTotal(void);
+#  define _Py_GetRefTotal() _Py_GetGlobalRefTotal()
+PyAPI_FUNC(Py_ssize_t) _Py_GetLegacyRefTotal(void);
+PyAPI_FUNC(Py_ssize_t) _PyInterpreterState_GetRefTotal(PyInterpreterState *);
+#endif
+
+
+/********************* String Literals ****************************************/
+/* This structure helps manage static strings. The basic usage goes like this:
+   Instead of doing
+
+       r = PyObject_CallMethod(o, "foo", "args", ...);
+
+   do
+
+       _Py_IDENTIFIER(foo);
+       ...
+       r = _PyObject_CallMethodId(o, &PyId_foo, "args", ...);
+
+   PyId_foo is a static variable, either on block level or file level. On first
+   usage, the string "foo" is interned, and the structures are linked. On interpreter
+   shutdown, all strings are released.
+
+   Alternatively, _Py_static_string allows choosing the variable name.
+   _PyUnicode_FromId returns a borrowed reference to the interned string.
+   _PyObject_{Get,Set,Has}AttrId are __getattr__ versions using _Py_Identifier*.
+*/
+typedef struct _Py_Identifier {
+    const char* string;
+    // Index in PyInterpreterState.unicode.ids.array. It is process-wide
+    // unique and must be initialized to -1.
+    Py_ssize_t index;
+} _Py_Identifier;
+
+#ifndef Py_BUILD_CORE
+// For now we are keeping _Py_IDENTIFIER for continued use
+// in non-builtin extensions (and naughty PyPI modules).
+
+#define _Py_static_string_init(value) { .string = (value), .index = -1 }
+#define _Py_static_string(varname, value)  static _Py_Identifier varname = _Py_static_string_init(value)
+#define _Py_IDENTIFIER(varname) _Py_static_string(PyId_##varname, #varname)
+
+#endif /* !Py_BUILD_CORE */
+
+typedef struct {
+    /* Number implementations must check *both*
+       arguments for proper type and implement the necessary conversions
+       in the slot functions themselves. */
+
+    binaryfunc nb_add;
+    binaryfunc nb_subtract;
+    binaryfunc nb_multiply;
+    binaryfunc nb_remainder;
+    binaryfunc nb_divmod;
+    ternaryfunc nb_power;
+    unaryfunc nb_negative;
+    unaryfunc nb_positive;
+    unaryfunc nb_absolute;
+    inquiry nb_bool;
+    unaryfunc nb_invert;
+    binaryfunc nb_lshift;
+    binaryfunc nb_rshift;
+    binaryfunc nb_and;
+    binaryfunc nb_xor;
+    binaryfunc nb_or;
+    unaryfunc nb_int;
+    void *nb_reserved;  /* the slot formerly known as nb_long */
+    unaryfunc nb_float;
+
+    binaryfunc nb_inplace_add;
+    binaryfunc nb_inplace_subtract;
+    binaryfunc nb_inplace_multiply;
+    binaryfunc nb_inplace_remainder;
+    ternaryfunc nb_inplace_power;
+    binaryfunc nb_inplace_lshift;
+    binaryfunc nb_inplace_rshift;
+    binaryfunc nb_inplace_and;
+    binaryfunc nb_inplace_xor;
+    binaryfunc nb_inplace_or;
+
+    binaryfunc nb_floor_divide;
+    binaryfunc nb_true_divide;
+    binaryfunc nb_inplace_floor_divide;
+    binaryfunc nb_inplace_true_divide;
+
+    unaryfunc nb_index;
+
+    binaryfunc nb_matrix_multiply;
+    binaryfunc nb_inplace_matrix_multiply;
+} PyNumberMethods;
+
+typedef struct {
+    lenfunc sq_length;
+    binaryfunc sq_concat;
+    ssizeargfunc sq_repeat;
+    ssizeargfunc sq_item;
+    void *was_sq_slice;
+    ssizeobjargproc sq_ass_item;
+    void *was_sq_ass_slice;
+    objobjproc sq_contains;
+
+    binaryfunc sq_inplace_concat;
+    ssizeargfunc sq_inplace_repeat;
+} PySequenceMethods;
+
+typedef struct {
+    lenfunc mp_length;
+    binaryfunc mp_subscript;
+    objobjargproc mp_ass_subscript;
+} PyMappingMethods;
+
+typedef PySendResult (*sendfunc)(PyObject *iter, PyObject *value, PyObject **result);
+
+typedef struct {
+    unaryfunc am_await;
+    unaryfunc am_aiter;
+    unaryfunc am_anext;
+    sendfunc am_send;
+} PyAsyncMethods;
+
+typedef struct {
+     getbufferproc bf_getbuffer;
+     releasebufferproc bf_releasebuffer;
+} PyBufferProcs;
+
+/* Allow printfunc in the tp_vectorcall_offset slot for
+ * backwards-compatibility */
+typedef Py_ssize_t printfunc;
+
+// If this structure is modified, Doc/includes/typestruct.h should be updated
+// as well.
+struct _typeobject {
+    PyObject_VAR_HEAD
+    const char *tp_name; /* For printing, in format "<module>.<name>" */
+    Py_ssize_t tp_basicsize, tp_itemsize; /* For allocation */
+
+    /* Methods to implement standard operations */
+
+    destructor tp_dealloc;
+    Py_ssize_t tp_vectorcall_offset;
+    getattrfunc tp_getattr;
+    setattrfunc tp_setattr;
+    PyAsyncMethods *tp_as_async; /* formerly known as tp_compare (Python 2)
+                                    or tp_reserved (Python 3) */
+    reprfunc tp_repr;
+
+    /* Method suites for standard classes */
+
+    PyNumberMethods *tp_as_number;
+    PySequenceMethods *tp_as_sequence;
+    PyMappingMethods *tp_as_mapping;
+
+    /* More standard operations (here for binary compatibility) */
+
+    hashfunc tp_hash;
+    ternaryfunc tp_call;
+    reprfunc tp_str;
+    getattrofunc tp_getattro;
+    setattrofunc tp_setattro;
+
+    /* Functions to access object as input/output buffer */
+    PyBufferProcs *tp_as_buffer;
+
+    /* Flags to define presence of optional/expanded features */
+    unsigned long tp_flags;
+
+    const char *tp_doc; /* Documentation string */
+
+    /* Assigned meaning in release 2.0 */
+    /* call function for all accessible objects */
+    traverseproc tp_traverse;
+
+    /* delete references to contained objects */
+    inquiry tp_clear;
+
+    /* Assigned meaning in release 2.1 */
+    /* rich comparisons */
+    richcmpfunc tp_richcompare;
+
+    /* weak reference enabler */
+    Py_ssize_t tp_weaklistoffset;
+
+    /* Iterators */
+    getiterfunc tp_iter;
+    iternextfunc tp_iternext;
+
+    /* Attribute descriptor and subclassing stuff */
+    PyMethodDef *tp_methods;
+    PyMemberDef *tp_members;
+    PyGetSetDef *tp_getset;
+    // Strong reference on a heap type, borrowed reference on a static type
+    PyTypeObject *tp_base;
+    PyObject *tp_dict;
+    descrgetfunc tp_descr_get;
+    descrsetfunc tp_descr_set;
+    Py_ssize_t tp_dictoffset;
+    initproc tp_init;
+    allocfunc tp_alloc;
+    newfunc tp_new;
+    freefunc tp_free; /* Low-level free-memory routine */
+    inquiry tp_is_gc; /* For PyObject_IS_GC */
+    PyObject *tp_bases;
+    PyObject *tp_mro; /* method resolution order */
+    PyObject *tp_cache; /* no longer used */
+    void *tp_subclasses;  /* for static builtin types this is an index */
+    PyObject *tp_weaklist; /* not used for static builtin types */
+    destructor tp_del;
+
+    /* Type attribute cache version tag. Added in version 2.6 */
+    unsigned int tp_version_tag;
+
+    destructor tp_finalize;
+    vectorcallfunc tp_vectorcall;
+
+    /* bitset of which type-watchers care about this type */
+    unsigned char tp_watched;
+};
+
+/* This struct is used by the specializer.
+ * It should be treated as an opaque blob
+ * by code other than the specializer and interpreter. */
+struct _specialization_cache {
+    // In order to avoid bloating the bytecode with lots of inline caches, the
+    // members of this structure have a somewhat unique contract. They are set
+    // by the specialization machinery, and are invalidated by PyType_Modified.
+    // The rules for using them are as follows:
+    // - If getitem is non-NULL, then it is the same Python function that
+    //   PyType_Lookup(cls, "__getitem__") would return.
+    // - If getitem is NULL, then getitem_version is meaningless.
+    // - If getitem->func_version == getitem_version, then getitem can be called
+    //   with two positional arguments and no keyword arguments, and has neither
+    //   *args nor **kwargs (as required by BINARY_SUBSCR_GETITEM):
+    PyObject *getitem;
+    uint32_t getitem_version;
+};
+
+/* The *real* layout of a type object when allocated on the heap */
+typedef struct _heaptypeobject {
+    /* Note: there's a dependency on the order of these members
+       in slotptr() in typeobject.c . */
+    PyTypeObject ht_type;
+    PyAsyncMethods as_async;
+    PyNumberMethods as_number;
+    PyMappingMethods as_mapping;
+    PySequenceMethods as_sequence; /* as_sequence comes after as_mapping,
+                                      so that the mapping wins when both
+                                      the mapping and the sequence define
+                                      a given operator (e.g. __getitem__).
+                                      see add_operators() in typeobject.c . */
+    PyBufferProcs as_buffer;
+    PyObject *ht_name, *ht_slots, *ht_qualname;
+    struct _dictkeysobject *ht_cached_keys;
+    PyObject *ht_module;
+    char *_ht_tpname;  // Storage for "tp_name"; see PyType_FromModuleAndSpec
+    struct _specialization_cache _spec_cache; // For use by the specializer.
+    /* here are optional user slots, followed by the members. */
+} PyHeapTypeObject;
+
+PyAPI_FUNC(const char *) _PyType_Name(PyTypeObject *);
+PyAPI_FUNC(PyObject *) _PyType_Lookup(PyTypeObject *, PyObject *);
+PyAPI_FUNC(PyObject *) _PyType_LookupId(PyTypeObject *, _Py_Identifier *);
+PyAPI_FUNC(PyObject *) _PyObject_LookupSpecialId(PyObject *, _Py_Identifier *);
+#ifndef Py_BUILD_CORE
+// Backward compatibility for 3rd-party extensions
+// that may be using the old name.
+#define _PyObject_LookupSpecial _PyObject_LookupSpecialId
+#endif
+PyAPI_FUNC(PyTypeObject *) _PyType_CalculateMetaclass(PyTypeObject *, PyObject *);
+PyAPI_FUNC(PyObject *) _PyType_GetDocFromInternalDoc(const char *, const char *);
+PyAPI_FUNC(PyObject *) _PyType_GetTextSignatureFromInternalDoc(const char *, const char *);
+PyAPI_FUNC(PyObject *) PyType_GetModuleByDef(PyTypeObject *, PyModuleDef *);
+PyAPI_FUNC(PyObject *) PyType_GetDict(PyTypeObject *);
+
+PyAPI_FUNC(int) PyObject_Print(PyObject *, FILE *, int);
+PyAPI_FUNC(void) _Py_BreakPoint(void);
+PyAPI_FUNC(void) _PyObject_Dump(PyObject *);
+PyAPI_FUNC(int) _PyObject_IsFreed(PyObject *);
+
+PyAPI_FUNC(int) _PyObject_IsAbstract(PyObject *);
+PyAPI_FUNC(PyObject *) _PyObject_GetAttrId(PyObject *, _Py_Identifier *);
+PyAPI_FUNC(int) _PyObject_SetAttrId(PyObject *, _Py_Identifier *, PyObject *);
+/* Replacements of PyObject_GetAttr() and _PyObject_GetAttrId() which
+   don't raise AttributeError.
+
+   Return 1 and set *result != NULL if an attribute is found.
+   Return 0 and set *result == NULL if an attribute is not found;
+   an AttributeError is silenced.
+   Return -1 and set *result == NULL if an error other than AttributeError
+   is raised.
+*/
+PyAPI_FUNC(int) _PyObject_LookupAttr(PyObject *, PyObject *, PyObject **);
+PyAPI_FUNC(int) _PyObject_LookupAttrId(PyObject *, _Py_Identifier *, PyObject **);
+
+PyAPI_FUNC(int) _PyObject_GetMethod(PyObject *obj, PyObject *name, PyObject **method);
+
+PyAPI_FUNC(PyObject **) _PyObject_GetDictPtr(PyObject *);
+PyAPI_FUNC(PyObject *) _PyObject_NextNotImplemented(PyObject *);
+PyAPI_FUNC(void) PyObject_CallFinalizer(PyObject *);
+PyAPI_FUNC(int) PyObject_CallFinalizerFromDealloc(PyObject *);
+
+/* Same as PyObject_Generic{Get,Set}Attr, but passing the attributes
+   dict as the last parameter. */
+PyAPI_FUNC(PyObject *)
+_PyObject_GenericGetAttrWithDict(PyObject *, PyObject *, PyObject *, int);
+PyAPI_FUNC(int)
+_PyObject_GenericSetAttrWithDict(PyObject *, PyObject *,
+                                 PyObject *, PyObject *);
+
+PyAPI_FUNC(PyObject *) _PyObject_FunctionStr(PyObject *);
+
+/* Safely decref `dst` and set `dst` to `src`.
+ *
+ * As in case of Py_CLEAR "the obvious" code can be deadly:
+ *
+ *     Py_DECREF(dst);
+ *     dst = src;
+ *
+ * The safe way is:
+ *
+ *      Py_SETREF(dst, src);
+ *
+ * That arranges to set `dst` to `src` _before_ decref'ing, so that any code
+ * triggered as a side-effect of `dst` getting torn down no longer believes
+ * `dst` points to a valid object.
+ *
+ * Temporary variables are used to only evaluate macro arguments once and so
+ * avoid the duplication of side effects. _Py_TYPEOF() or memcpy() is used to
+ * avoid a miscompilation caused by type punning. See Py_CLEAR() comment for
+ * implementation details about type punning.
+ *
+ * The memcpy() implementation does not emit a compiler warning if 'src' does
+ * not have the same type as 'dst': any pointer type is accepted for 'src'.
+ */
+#ifdef _Py_TYPEOF
+#define Py_SETREF(dst, src) \
+    do { \
+        _Py_TYPEOF(dst)* _tmp_dst_ptr = &(dst); \
+        _Py_TYPEOF(dst) _tmp_old_dst = (*_tmp_dst_ptr); \
+        *_tmp_dst_ptr = (src); \
+        Py_DECREF(_tmp_old_dst); \
+    } while (0)
+#else
+#define Py_SETREF(dst, src) \
+    do { \
+        PyObject **_tmp_dst_ptr = _Py_CAST(PyObject**, &(dst)); \
+        PyObject *_tmp_old_dst = (*_tmp_dst_ptr); \
+        PyObject *_tmp_src = _PyObject_CAST(src); \
+        memcpy(_tmp_dst_ptr, &_tmp_src, sizeof(PyObject*)); \
+        Py_DECREF(_tmp_old_dst); \
+    } while (0)
+#endif
+
+/* Py_XSETREF() is a variant of Py_SETREF() that uses Py_XDECREF() instead of
+ * Py_DECREF().
+ */
+#ifdef _Py_TYPEOF
+#define Py_XSETREF(dst, src) \
+    do { \
+        _Py_TYPEOF(dst)* _tmp_dst_ptr = &(dst); \
+        _Py_TYPEOF(dst) _tmp_old_dst = (*_tmp_dst_ptr); \
+        *_tmp_dst_ptr = (src); \
+        Py_XDECREF(_tmp_old_dst); \
+    } while (0)
+#else
+#define Py_XSETREF(dst, src) \
+    do { \
+        PyObject **_tmp_dst_ptr = _Py_CAST(PyObject**, &(dst)); \
+        PyObject *_tmp_old_dst = (*_tmp_dst_ptr); \
+        PyObject *_tmp_src = _PyObject_CAST(src); \
+        memcpy(_tmp_dst_ptr, &_tmp_src, sizeof(PyObject*)); \
+        Py_XDECREF(_tmp_old_dst); \
+    } while (0)
+#endif
+
+
+PyAPI_DATA(PyTypeObject) _PyNone_Type;
+PyAPI_DATA(PyTypeObject) _PyNotImplemented_Type;
+
+/* Maps Py_LT to Py_GT, ..., Py_GE to Py_LE.
+ * Defined in object.c.
+ */
+PyAPI_DATA(int) _Py_SwappedOp[];
+
+PyAPI_FUNC(void)
+_PyDebugAllocatorStats(FILE *out, const char *block_name, int num_blocks,
+                       size_t sizeof_block);
+PyAPI_FUNC(void)
+_PyObject_DebugTypeStats(FILE *out);
+
+/* Define a set of assertion macros:
+   _PyObject_ASSERT_FROM(), _PyObject_ASSERT_WITH_MSG() and _PyObject_ASSERT().
+
+   These work like the regular C assert(), in that they will abort the
+   process with a message on stderr if the given condition fails to hold,
+   but compile away to nothing if NDEBUG is defined.
+
+   However, before aborting, Python will also try to call _PyObject_Dump() on
+   the given object.  This may be of use when investigating bugs in which a
+   particular object is corrupt (e.g. a buggy tp_traverse method in an extension
+   module breaking the garbage collector), to help locate the broken objects.
+
+   The WITH_MSG variant allows you to supply an additional message that Python
+   will attempt to print to stderr, after the object dump. */
+#ifdef NDEBUG
+   /* No debugging: compile away the assertions: */
+#  define _PyObject_ASSERT_FROM(obj, expr, msg, filename, lineno, func) \
+    ((void)0)
+#else
+   /* With debugging: generate checks: */
+#  define _PyObject_ASSERT_FROM(obj, expr, msg, filename, lineno, func) \
+    ((expr) \
+      ? (void)(0) \
+      : _PyObject_AssertFailed((obj), Py_STRINGIFY(expr), \
+                               (msg), (filename), (lineno), (func)))
+#endif
+
+#define _PyObject_ASSERT_WITH_MSG(obj, expr, msg) \
+    _PyObject_ASSERT_FROM((obj), expr, (msg), __FILE__, __LINE__, __func__)
+#define _PyObject_ASSERT(obj, expr) \
+    _PyObject_ASSERT_WITH_MSG((obj), expr, NULL)
+
+#define _PyObject_ASSERT_FAILED_MSG(obj, msg) \
+    _PyObject_AssertFailed((obj), NULL, (msg), __FILE__, __LINE__, __func__)
+
+/* Declare and define _PyObject_AssertFailed() even when NDEBUG is defined,
+   to avoid causing compiler/linker errors when building extensions without
+   NDEBUG against a Python built with NDEBUG defined.
+
+   msg, expr and function can be NULL. */
+PyAPI_FUNC(void) _Py_NO_RETURN _PyObject_AssertFailed(
+    PyObject *obj,
+    const char *expr,
+    const char *msg,
+    const char *file,
+    int line,
+    const char *function);
+
+/* Check if an object is consistent. For example, ensure that the reference
+   counter is greater than or equal to 1, and ensure that ob_type is not NULL.
+
+   Call _PyObject_AssertFailed() if the object is inconsistent.
+
+   If check_content is zero, only check header fields: reduce the overhead.
+
+   The function always returns 1. The return value is just here to be able to
+   write:
+
+   assert(_PyObject_CheckConsistency(obj, 1)); */
+PyAPI_FUNC(int) _PyObject_CheckConsistency(
+    PyObject *op,
+    int check_content);
+
+
+/* Trashcan mechanism, thanks to Christian Tismer.
+
+When deallocating a container object, it's possible to trigger an unbounded
+chain of deallocations, as each Py_DECREF in turn drops the refcount on "the
+next" object in the chain to 0.  This can easily lead to stack overflows,
+especially in threads (which typically have less stack space to work with).
+
+A container object can avoid this by bracketing the body of its tp_dealloc
+function with a pair of macros:
+
+static void
+mytype_dealloc(mytype *p)
+{
+    ... declarations go here ...
+
+    PyObject_GC_UnTrack(p);        // must untrack first
+    Py_TRASHCAN_BEGIN(p, mytype_dealloc)
+    ... The body of the deallocator goes here, including all calls ...
+    ... to Py_DECREF on contained objects.                         ...
+    Py_TRASHCAN_END                // there should be no code after this
+}
+
+CAUTION:  Never return from the middle of the body!  If the body needs to
+"get out early", put a label immediately before the Py_TRASHCAN_END
+call, and goto it.  Else the call-depth counter (see below) will stay
+above 0 forever, and the trashcan will never get emptied.
+
+How it works:  The BEGIN macro increments a call-depth counter.  So long
+as this counter is small, the body of the deallocator is run directly without
+further ado.  But if the counter gets large, it instead adds p to a list of
+objects to be deallocated later, skips the body of the deallocator, and
+resumes execution after the END macro.  The tp_dealloc routine then returns
+without deallocating anything (and so unbounded call-stack depth is avoided).
+
+When the call stack finishes unwinding again, code generated by the END macro
+notices this, and calls another routine to deallocate all the objects that
+may have been added to the list of deferred deallocations.  In effect, a
+chain of N deallocations is broken into (N-1)/(_PyTrash_UNWIND_LEVEL-1) pieces,
+with the call stack never exceeding a depth of _PyTrash_UNWIND_LEVEL.
+
+Since the tp_dealloc of a subclass typically calls the tp_dealloc of the base
+class, we need to ensure that the trashcan is only triggered on the tp_dealloc
+of the actual class being deallocated. Otherwise we might end up with a
+partially-deallocated object. To check this, the tp_dealloc function must be
+passed as second argument to Py_TRASHCAN_BEGIN().
+*/
+
+/* Python 3.9 private API, invoked by the macros below. */
+PyAPI_FUNC(int) _PyTrash_begin(PyThreadState *tstate, PyObject *op);
+PyAPI_FUNC(void) _PyTrash_end(PyThreadState *tstate);
+/* Python 3.10 private API, invoked by the Py_TRASHCAN_BEGIN(). */
+PyAPI_FUNC(int) _PyTrash_cond(PyObject *op, destructor dealloc);
+
+#define Py_TRASHCAN_BEGIN_CONDITION(op, cond) \
+    do { \
+        PyThreadState *_tstate = NULL; \
+        /* If "cond" is false, then _tstate remains NULL and the deallocator \
+         * is run normally without involving the trashcan */ \
+        if (cond) { \
+            _tstate = _PyThreadState_UncheckedGet(); \
+            if (_PyTrash_begin(_tstate, _PyObject_CAST(op))) { \
+                break; \
+            } \
+        }
+        /* Deallocator body goes here; the "break" above jumps past it and END. */
+#define Py_TRASHCAN_END \
+        if (_tstate) { \
+            _PyTrash_end(_tstate); \
+        } \
+    } while (0);
+
+#define Py_TRASHCAN_BEGIN(op, dealloc) \
+    Py_TRASHCAN_BEGIN_CONDITION((op), \
+        _PyTrash_cond(_PyObject_CAST(op), (destructor)(dealloc)))
+
+/* The following two macros, Py_TRASHCAN_SAFE_BEGIN and
+ * Py_TRASHCAN_SAFE_END, are deprecated since version 3.11 and
+ * will be removed in the future.
+ * Use Py_TRASHCAN_BEGIN and Py_TRASHCAN_END instead.
+ */
+Py_DEPRECATED(3.11) typedef int UsingDeprecatedTrashcanMacro;
+#define Py_TRASHCAN_SAFE_BEGIN(op) \
+    do { \
+        UsingDeprecatedTrashcanMacro cond=1; \
+        Py_TRASHCAN_BEGIN_CONDITION((op), cond);
+#define Py_TRASHCAN_SAFE_END(op) \
+        Py_TRASHCAN_END; \
+    } while(0);
+
+PyAPI_FUNC(void *) PyObject_GetItemData(PyObject *obj);
+
+PyAPI_FUNC(int) _PyObject_VisitManagedDict(PyObject *obj, visitproc visit, void *arg);
+PyAPI_FUNC(void) _PyObject_ClearManagedDict(PyObject *obj);
+
+#define TYPE_MAX_WATCHERS 8
+
+typedef int(*PyType_WatchCallback)(PyTypeObject *);
+PyAPI_FUNC(int) PyType_AddWatcher(PyType_WatchCallback callback);
+PyAPI_FUNC(int) PyType_ClearWatcher(int watcher_id);
+PyAPI_FUNC(int) PyType_Watch(int watcher_id, PyObject *type);
+PyAPI_FUNC(int) PyType_Unwatch(int watcher_id, PyObject *type);
+
+/* Attempt to assign a version tag to the given type.
+ *
+ * Returns 1 if the type already had a valid version tag or a new one was
+ * assigned, or 0 if a new tag could not be assigned.
+ */
+PyAPI_FUNC(int) PyUnstable_Type_AssignVersionTag(PyTypeObject *type);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/objimpl.h b/nanvix-port/cpython-headers/python3.12/cpython/objimpl.h
new file mode 100644
index 000000000000..5a8cdd57c784
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/objimpl.h
@@ -0,0 +1,95 @@
+#ifndef Py_CPYTHON_OBJIMPL_H
+#  error "this header file must not be included directly"
+#endif
+
+static inline size_t _PyObject_SIZE(PyTypeObject *type) {
+    return _Py_STATIC_CAST(size_t, type->tp_basicsize);  /* the type's tp_basicsize, as size_t */
+}
+
+/* _PyObject_VAR_SIZE returns the number of bytes (as size_t) allocated for a
+   vrbl-size object with nitems items, exclusive of gc overhead (if any).  The
+   value is rounded up to the closest multiple of sizeof(void *), in order to
+   ensure that pointer fields at the end of the object are correctly aligned
+   for the platform (this is of special importance for subclasses of, e.g.,
+   str or int, so that pointers can be stored after the embedded data).
+
+   Note that there's no memory wastage in doing this, as malloc has to
+   return (at worst) pointer-aligned memory anyway.
+*/
+#if ((SIZEOF_VOID_P - 1) & SIZEOF_VOID_P) != 0
+#   error "_PyObject_VAR_SIZE requires SIZEOF_VOID_P be a power of 2"
+#endif
+
+static inline size_t _PyObject_VAR_SIZE(PyTypeObject *type, Py_ssize_t nitems) {
+    const size_t items_bytes = _Py_STATIC_CAST(size_t, nitems) * _Py_STATIC_CAST(size_t, type->tp_itemsize);
+    const size_t unaligned = _Py_STATIC_CAST(size_t, type->tp_basicsize) + items_bytes;
+    return _Py_SIZE_ROUND_UP(unaligned, SIZEOF_VOID_P);  /* round up to a multiple of SIZEOF_VOID_P */
+}
+
+
+/* This example code implements an object constructor with a custom
+   allocator, where PyObject_New is inlined, and shows the important
+   distinction between two steps (at least):
+       1) the actual allocation of the object storage;
+       2) the initialization of the Python specific fields
+      in this storage with PyObject_{Init, InitVar}.
+
+   PyObject *
+   YourObject_New(...)
+   {
+       PyObject *op;
+
+       op = (PyObject *) Your_Allocator(_PyObject_SIZE(YourTypeStruct));
+       if (op == NULL) {
+           return PyErr_NoMemory();
+       }
+
+       PyObject_Init(op, &YourTypeStruct);
+
+       op->ob_field = value;
+       ...
+       return op;
+   }
+
+   Note that in C++, the use of the new operator usually implies that
+   the 1st step is performed automatically for you, so in a C++ class
+   constructor you would start directly with PyObject_Init/InitVar. */
+
+
+typedef struct {
+    /* user context passed as the first argument to the 2 functions */
+    void *ctx;
+
+    /* allocate an arena of size bytes */
+    void* (*alloc) (void *ctx, size_t size);
+
+    /* free an arena */
+    void (*free) (void *ctx, void *ptr, size_t size);
+} PyObjectArenaAllocator;
+
+/* Get the arena allocator. */
+PyAPI_FUNC(void) PyObject_GetArenaAllocator(PyObjectArenaAllocator *allocator);
+
+/* Set the arena allocator. */
+PyAPI_FUNC(void) PyObject_SetArenaAllocator(PyObjectArenaAllocator *allocator);
+
+
+/* Test if an object implements the garbage collector protocol */
+PyAPI_FUNC(int) PyObject_IS_GC(PyObject *obj);
+
+
+/* Code built with Py_BUILD_CORE must include pycore_gc.h instead which
+   defines a different _PyGC_FINALIZED() macro. */
+#ifndef Py_BUILD_CORE
+   // Kept for backward compatibility with Python 3.8
+#  define _PyGC_FINALIZED(o) PyObject_GC_IsFinalized(o)
+#endif
+
+
+// Test if a type supports weak references
+PyAPI_FUNC(int) PyType_SUPPORTS_WEAKREFS(PyTypeObject *type);
+
+PyAPI_FUNC(PyObject **) PyObject_GET_WEAKREFS_LISTPTR(PyObject *op);
+
+PyAPI_FUNC(PyObject *) PyUnstable_Object_GC_NewWithExtraData(PyTypeObject *,
+                                                             size_t);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/odictobject.h b/nanvix-port/cpython-headers/python3.12/cpython/odictobject.h
new file mode 100644
index 000000000000..3822d554868c
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/odictobject.h
@@ -0,0 +1,43 @@
+#ifndef Py_ODICTOBJECT_H
+#define Py_ODICTOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* OrderedDict */
+/* This API is optional and mostly redundant. */
+
+#ifndef Py_LIMITED_API
+
+typedef struct _odictobject PyODictObject;
+
+PyAPI_DATA(PyTypeObject) PyODict_Type;
+PyAPI_DATA(PyTypeObject) PyODictIter_Type;
+PyAPI_DATA(PyTypeObject) PyODictKeys_Type;
+PyAPI_DATA(PyTypeObject) PyODictItems_Type;
+PyAPI_DATA(PyTypeObject) PyODictValues_Type;
+
+#define PyODict_Check(op) PyObject_TypeCheck((op), &PyODict_Type)
+#define PyODict_CheckExact(op) Py_IS_TYPE((op), &PyODict_Type)
+#define PyODict_SIZE(op) PyDict_GET_SIZE((op))
+
+PyAPI_FUNC(PyObject *) PyODict_New(void);
+PyAPI_FUNC(int) PyODict_SetItem(PyObject *od, PyObject *key, PyObject *item);
+PyAPI_FUNC(int) PyODict_DelItem(PyObject *od, PyObject *key);
+
+/* wrappers around PyDict* functions */
+#define PyODict_GetItem(od, key) PyDict_GetItem(_PyObject_CAST(od), (key))
+#define PyODict_GetItemWithError(od, key) \
+    PyDict_GetItemWithError(_PyObject_CAST(od), (key))
+#define PyODict_Contains(od, key) PyDict_Contains(_PyObject_CAST(od), (key))
+#define PyODict_Size(od) PyDict_Size(_PyObject_CAST(od))
+#define PyODict_GetItemString(od, key) \
+    PyDict_GetItemString(_PyObject_CAST(od), (key))
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_ODICTOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/picklebufobject.h b/nanvix-port/cpython-headers/python3.12/cpython/picklebufobject.h
new file mode 100644
index 000000000000..f3cbaeef9195
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/picklebufobject.h
@@ -0,0 +1,31 @@
+/* PickleBuffer object. This is built-in for ease of use from third-party
+ * C extensions.
+ */
+
+#ifndef Py_PICKLEBUFOBJECT_H
+#define Py_PICKLEBUFOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_LIMITED_API
+
+PyAPI_DATA(PyTypeObject) PyPickleBuffer_Type;
+
+#define PyPickleBuffer_Check(op) Py_IS_TYPE((op), &PyPickleBuffer_Type)
+
+/* Create a PickleBuffer redirecting to the given buffer-enabled object */
+PyAPI_FUNC(PyObject *) PyPickleBuffer_FromObject(PyObject *);
+/* Get the PickleBuffer's underlying view to the original object
+ * (NULL if released)
+ */
+PyAPI_FUNC(const Py_buffer *) PyPickleBuffer_GetBuffer(PyObject *);
+/* Release the PickleBuffer.  Returns 0 on success, -1 on error. */
+PyAPI_FUNC(int) PyPickleBuffer_Release(PyObject *);
+
+#endif /* !Py_LIMITED_API */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_PICKLEBUFOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/pthread_stubs.h b/nanvix-port/cpython-headers/python3.12/cpython/pthread_stubs.h
new file mode 100644
index 000000000000..d95ee03d8308
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/pthread_stubs.h
@@ -0,0 +1,88 @@
+#ifndef Py_CPYTHON_PTRHEAD_STUBS_H
+#define Py_CPYTHON_PTRHEAD_STUBS_H
+
+#if !defined(HAVE_PTHREAD_STUBS)
+#  error "this header file requires stubbed pthreads."
+#endif
+
+#ifndef _POSIX_THREADS
+#  define _POSIX_THREADS 1
+#endif
+
+/* Minimal pthread stubs for CPython.
+ *
+ * The stubs implement the minimum pthread API for CPython.
+ * - pthread_create() fails.
+ * - pthread_exit() calls exit(0).
+ * - pthread_key_*() functions implement minimal TSS without destructor.
+ * - all other functions do nothing and return 0.
+ */
+
+#ifdef __wasi__
+// WASI's bits/alltypes.h provides type definitions when __NEED_ is set.
+// The header file can be included multiple times.
+#  define __NEED_pthread_cond_t 1
+#  define __NEED_pthread_condattr_t 1
+#  define __NEED_pthread_mutex_t 1
+#  define __NEED_pthread_mutexattr_t 1
+#  define __NEED_pthread_key_t 1
+#  define __NEED_pthread_t 1
+#  define __NEED_pthread_attr_t 1
+#  include <bits/alltypes.h>
+#else
+typedef struct { void *__x; } pthread_cond_t;
+typedef struct { unsigned __attr; } pthread_condattr_t;
+typedef struct { void *__x; } pthread_mutex_t;
+typedef struct { unsigned __attr; } pthread_mutexattr_t;
+typedef unsigned pthread_key_t;
+typedef unsigned pthread_t;
+typedef struct { unsigned __attr; } pthread_attr_t;
+#endif
+
+// mutex
+PyAPI_FUNC(int) pthread_mutex_init(pthread_mutex_t *restrict mutex,
+                                   const pthread_mutexattr_t *restrict attr);
+PyAPI_FUNC(int) pthread_mutex_destroy(pthread_mutex_t *mutex);
+PyAPI_FUNC(int) pthread_mutex_trylock(pthread_mutex_t *mutex);
+PyAPI_FUNC(int) pthread_mutex_lock(pthread_mutex_t *mutex);
+PyAPI_FUNC(int) pthread_mutex_unlock(pthread_mutex_t *mutex);
+
+// condition
+PyAPI_FUNC(int) pthread_cond_init(pthread_cond_t *restrict cond,
+                                  const pthread_condattr_t *restrict attr);
+PyAPI_FUNC(int) pthread_cond_destroy(pthread_cond_t *cond);
+PyAPI_FUNC(int) pthread_cond_wait(pthread_cond_t *restrict cond,
+                                  pthread_mutex_t *restrict mutex);
+PyAPI_FUNC(int) pthread_cond_timedwait(pthread_cond_t *restrict cond,
+                                       pthread_mutex_t *restrict mutex,
+                                       const struct timespec *restrict abstime);
+PyAPI_FUNC(int) pthread_cond_signal(pthread_cond_t *cond);
+PyAPI_FUNC(int) pthread_condattr_init(pthread_condattr_t *attr);
+PyAPI_FUNC(int) pthread_condattr_setclock(
+    pthread_condattr_t *attr, clockid_t clock_id);
+
+// pthread
+PyAPI_FUNC(int) pthread_create(pthread_t *restrict thread,
+                               const pthread_attr_t *restrict attr,
+                               void *(*start_routine)(void *),
+                               void *restrict arg);
+PyAPI_FUNC(int) pthread_detach(pthread_t thread);
+PyAPI_FUNC(pthread_t) pthread_self(void);
+PyAPI_FUNC(int) pthread_exit(void *retval) __attribute__ ((__noreturn__));
+PyAPI_FUNC(int) pthread_attr_init(pthread_attr_t *attr);
+PyAPI_FUNC(int) pthread_attr_setstacksize(pthread_attr_t *attr, size_t stacksize);
+PyAPI_FUNC(int) pthread_attr_destroy(pthread_attr_t *attr);
+
+
+// pthread_key
+#ifndef PTHREAD_KEYS_MAX
+#  define PTHREAD_KEYS_MAX 128
+#endif
+
+PyAPI_FUNC(int) pthread_key_create(pthread_key_t *key,
+                                   void (*destr_function)(void *));
+PyAPI_FUNC(int) pthread_key_delete(pthread_key_t key);
+PyAPI_FUNC(void *) pthread_getspecific(pthread_key_t key);
+PyAPI_FUNC(int) pthread_setspecific(pthread_key_t key, const void *value);
+
+#endif // Py_CPYTHON_PTRHEAD_STUBS_H
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/pyctype.h b/nanvix-port/cpython-headers/python3.12/cpython/pyctype.h
new file mode 100644
index 000000000000..729d93275e6c
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/pyctype.h
@@ -0,0 +1,39 @@
+#ifndef Py_LIMITED_API
+#ifndef PYCTYPE_H
+#define PYCTYPE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define PY_CTF_LOWER  0x01
+#define PY_CTF_UPPER  0x02
+#define PY_CTF_ALPHA  (PY_CTF_LOWER|PY_CTF_UPPER)
+#define PY_CTF_DIGIT  0x04
+#define PY_CTF_ALNUM  (PY_CTF_ALPHA|PY_CTF_DIGIT)
+#define PY_CTF_SPACE  0x08
+#define PY_CTF_XDIGIT 0x10
+
+PyAPI_DATA(const unsigned int) _Py_ctype_table[256];
+
+/* Unlike their C counterparts, the following macros are not meant to
+ * handle an int with any of the values [EOF, 0-UCHAR_MAX]. The argument
+ * must be a signed/unsigned char. */
+#define Py_ISLOWER(c)  (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_LOWER)
+#define Py_ISUPPER(c)  (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_UPPER)
+#define Py_ISALPHA(c)  (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_ALPHA)
+#define Py_ISDIGIT(c)  (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_DIGIT)
+#define Py_ISXDIGIT(c) (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_XDIGIT)
+#define Py_ISALNUM(c)  (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_ALNUM)
+#define Py_ISSPACE(c)  (_Py_ctype_table[Py_CHARMASK(c)] & PY_CTF_SPACE)
+
+PyAPI_DATA(const unsigned char) _Py_ctype_tolower[256];
+PyAPI_DATA(const unsigned char) _Py_ctype_toupper[256];
+
+#define Py_TOLOWER(c) (_Py_ctype_tolower[Py_CHARMASK(c)])
+#define Py_TOUPPER(c) (_Py_ctype_toupper[Py_CHARMASK(c)])
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !PYCTYPE_H */
+#endif /* !Py_LIMITED_API */
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/pydebug.h b/nanvix-port/cpython-headers/python3.12/cpython/pydebug.h
new file mode 100644
index 000000000000..f6ebd99ed7e2
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/pydebug.h
@@ -0,0 +1,38 @@
+#ifndef Py_LIMITED_API
+#ifndef Py_PYDEBUG_H
+#define Py_PYDEBUG_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_DebugFlag;
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_VerboseFlag;
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_QuietFlag;
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_InteractiveFlag;
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_InspectFlag;
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_OptimizeFlag;
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_NoSiteFlag;
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_BytesWarningFlag;
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_FrozenFlag;
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_IgnoreEnvironmentFlag;
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_DontWriteBytecodeFlag;
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_NoUserSiteDirectory;
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_UnbufferedStdioFlag;
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_HashRandomizationFlag;
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_IsolatedFlag;
+
+#ifdef MS_WINDOWS
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_LegacyWindowsFSEncodingFlag;
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_LegacyWindowsStdioFlag;
+#endif
+
+/* this is a wrapper around getenv() that pays attention to
+   Py_IgnoreEnvironmentFlag.  It should be used for getting variables like
+   PYTHONPATH and PYTHONHOME from the environment */
+PyAPI_FUNC(char*) Py_GETENV(const char *name);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_PYDEBUG_H */
+#endif /* Py_LIMITED_API */
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/pyerrors.h b/nanvix-port/cpython-headers/python3.12/cpython/pyerrors.h
new file mode 100644
index 000000000000..156665cbdb1b
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/pyerrors.h
@@ -0,0 +1,178 @@
+#ifndef Py_CPYTHON_ERRORS_H
+#  error "this header file must not be included directly"
+#endif
+
+/* Error objects */
+
+/* PyException_HEAD defines the initial segment of every exception class. */
+#define PyException_HEAD PyObject_HEAD PyObject *dict;\
+             PyObject *args; PyObject *notes; PyObject *traceback;\
+             PyObject *context; PyObject *cause;\
+             char suppress_context;
+
+typedef struct {
+    PyException_HEAD
+} PyBaseExceptionObject;
+
+typedef struct {
+    PyException_HEAD
+    PyObject *msg;
+    PyObject *excs;
+} PyBaseExceptionGroupObject;
+
+typedef struct {
+    PyException_HEAD
+    PyObject *msg;
+    PyObject *filename;
+    PyObject *lineno;
+    PyObject *offset;
+    PyObject *end_lineno;
+    PyObject *end_offset;
+    PyObject *text;
+    PyObject *print_file_and_line;
+} PySyntaxErrorObject;
+
+typedef struct {
+    PyException_HEAD
+    PyObject *msg;
+    PyObject *name;
+    PyObject *path;
+    PyObject *name_from;
+} PyImportErrorObject;
+
+typedef struct {
+    PyException_HEAD
+    PyObject *encoding;
+    PyObject *object;
+    Py_ssize_t start;
+    Py_ssize_t end;
+    PyObject *reason;
+} PyUnicodeErrorObject;
+
+typedef struct {
+    PyException_HEAD
+    PyObject *code;
+} PySystemExitObject;
+
+typedef struct {
+    PyException_HEAD
+    PyObject *myerrno;
+    PyObject *strerror;
+    PyObject *filename;
+    PyObject *filename2;
+#ifdef MS_WINDOWS
+    PyObject *winerror;
+#endif
+    Py_ssize_t written;   /* only for BlockingIOError, -1 otherwise */
+} PyOSErrorObject;
+
+typedef struct {
+    PyException_HEAD
+    PyObject *value;
+} PyStopIterationObject;
+
+typedef struct {
+    PyException_HEAD
+    PyObject *name;
+} PyNameErrorObject;
+
+typedef struct {
+    PyException_HEAD
+    PyObject *obj;
+    PyObject *name;
+} PyAttributeErrorObject;
+
+/* Compatibility typedefs */
+typedef PyOSErrorObject PyEnvironmentErrorObject;
+#ifdef MS_WINDOWS
+typedef PyOSErrorObject PyWindowsErrorObject;
+#endif
+
+/* Error handling definitions */
+
+PyAPI_FUNC(void) _PyErr_SetKeyError(PyObject *);
+PyAPI_FUNC(_PyErr_StackItem*) _PyErr_GetTopmostException(PyThreadState *tstate);
+PyAPI_FUNC(PyObject*) _PyErr_GetHandledException(PyThreadState *);
+PyAPI_FUNC(void) _PyErr_SetHandledException(PyThreadState *, PyObject *);
+PyAPI_FUNC(void) _PyErr_GetExcInfo(PyThreadState *, PyObject **, PyObject **, PyObject **);
+
+/* Context manipulation (PEP 3134) */
+
+Py_DEPRECATED(3.12) PyAPI_FUNC(void) _PyErr_ChainExceptions(PyObject *, PyObject *, PyObject *);
+PyAPI_FUNC(void) _PyErr_ChainExceptions1(PyObject *);
+
+/* Like PyErr_Format(), but saves current exception as __context__ and
+   __cause__.
+ */
+PyAPI_FUNC(PyObject *) _PyErr_FormatFromCause(
+    PyObject *exception,
+    const char *format,   /* ASCII-encoded string  */
+    ...
+    );
+
+/* In exceptions.c */
+
+PyAPI_FUNC(int) _PyException_AddNote(
+     PyObject *exc,
+     PyObject *note);
+
+PyAPI_FUNC(PyObject*) PyUnstable_Exc_PrepReraiseStar(
+     PyObject *orig,
+     PyObject *excs);
+
+/* In signalmodule.c */
+
+int PySignal_SetWakeupFd(int fd);
+PyAPI_FUNC(int) _PyErr_CheckSignals(void);
+
+/* Support for adding program text to SyntaxErrors */
+
+PyAPI_FUNC(void) PyErr_SyntaxLocationObject(
+    PyObject *filename,
+    int lineno,
+    int col_offset);
+
+PyAPI_FUNC(void) PyErr_RangedSyntaxLocationObject(
+    PyObject *filename,
+    int lineno,
+    int col_offset,
+    int end_lineno,
+    int end_col_offset);
+
+PyAPI_FUNC(PyObject *) PyErr_ProgramTextObject(
+    PyObject *filename,
+    int lineno);
+
+PyAPI_FUNC(PyObject *) _PyErr_ProgramDecodedTextObject(
+    PyObject *filename,
+    int lineno,
+    const char* encoding);
+
+PyAPI_FUNC(PyObject *) _PyUnicodeTranslateError_Create(
+    PyObject *object,
+    Py_ssize_t start,
+    Py_ssize_t end,
+    const char *reason          /* UTF-8 encoded string */
+    );
+
+PyAPI_FUNC(void) _PyErr_WriteUnraisableMsg(
+    const char *err_msg,
+    PyObject *obj);
+
+PyAPI_FUNC(void) _Py_NO_RETURN _Py_FatalErrorFunc(
+    const char *func,
+    const char *message);
+
+PyAPI_FUNC(void) _Py_NO_RETURN _Py_FatalErrorFormat(
+    const char *func,
+    const char *format,
+    ...);
+
+extern PyObject *_PyErr_SetImportErrorWithNameFrom(
+        PyObject *,
+        PyObject *,
+        PyObject *,
+        PyObject *);
+
+
+#define Py_FatalError(message) _Py_FatalErrorFunc(__func__, (message))
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/pyfpe.h b/nanvix-port/cpython-headers/python3.12/cpython/pyfpe.h
new file mode 100644
index 000000000000..cc2def63aa55
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/pyfpe.h
@@ -0,0 +1,15 @@
+#ifndef Py_PYFPE_H
+#define Py_PYFPE_H
+/* Header excluded from the stable API */
+#ifndef Py_LIMITED_API
+
+/* These macros used to do something when Python was built with --with-fpectl,
+ * but support for that was dropped in 3.7. We continue to define them though,
+ * to avoid breaking API users.
+ */
+
+#define PyFPE_START_PROTECT(err_string, leave_stmt)
+#define PyFPE_END_PROTECT(v)
+
+#endif /* !defined(Py_LIMITED_API) */
+#endif /* !Py_PYFPE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/pyframe.h b/nanvix-port/cpython-headers/python3.12/cpython/pyframe.h
new file mode 100644
index 000000000000..0e2afff925e3
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/pyframe.h
@@ -0,0 +1,35 @@
+#ifndef Py_CPYTHON_PYFRAME_H
+#  error "this header file must not be included directly"
+#endif
+
+PyAPI_DATA(PyTypeObject) PyFrame_Type;
+
+#define PyFrame_Check(op) Py_IS_TYPE((op), &PyFrame_Type)
+
+PyAPI_FUNC(PyFrameObject *) PyFrame_GetBack(PyFrameObject *frame);
+PyAPI_FUNC(PyObject *) PyFrame_GetLocals(PyFrameObject *frame);
+
+PyAPI_FUNC(PyObject *) PyFrame_GetGlobals(PyFrameObject *frame);
+PyAPI_FUNC(PyObject *) PyFrame_GetBuiltins(PyFrameObject *frame);
+
+PyAPI_FUNC(PyObject *) PyFrame_GetGenerator(PyFrameObject *frame);
+PyAPI_FUNC(int) PyFrame_GetLasti(PyFrameObject *frame);
+PyAPI_FUNC(PyObject*) PyFrame_GetVar(PyFrameObject *frame, PyObject *name);
+PyAPI_FUNC(PyObject*) PyFrame_GetVarString(PyFrameObject *frame, const char *name);
+
+/* The following functions are for use by debuggers and other tools
+ * implementing custom frame evaluators with PEP 523. */
+
+struct _PyInterpreterFrame;
+
+/* Returns the code object of the frame (strong reference).
+ * Does not raise an exception. */
+PyAPI_FUNC(PyObject *) PyUnstable_InterpreterFrame_GetCode(struct _PyInterpreterFrame *frame);
+
+/* Returns a byte offset into the last executed instruction.
+ * Does not raise an exception. */
+PyAPI_FUNC(int) PyUnstable_InterpreterFrame_GetLasti(struct _PyInterpreterFrame *frame);
+
+/* Returns the currently executing line number, or -1 if there is no line number.
+ * Does not raise an exception. */
+PyAPI_FUNC(int) PyUnstable_InterpreterFrame_GetLine(struct _PyInterpreterFrame *frame);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/pylifecycle.h b/nanvix-port/cpython-headers/python3.12/cpython/pylifecycle.h
new file mode 100644
index 000000000000..4daea33bf801
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/pylifecycle.h
@@ -0,0 +1,111 @@
+#ifndef Py_CPYTHON_PYLIFECYCLE_H
+#  error "this header file must not be included directly"
+#endif
+
+/* Py_FrozenMain is kept out of the Limited API until documented and present
+   in all builds of Python */
+PyAPI_FUNC(int) Py_FrozenMain(int argc, char **argv);
+
+/* Only used by applications that embed the interpreter and need to
+ * override the standard encoding determination mechanism
+ */
+Py_DEPRECATED(3.11) PyAPI_FUNC(int) Py_SetStandardStreamEncoding(
+    const char *encoding,
+    const char *errors);
+
+/* PEP 432 Multi-phase initialization API (Private while provisional!) */
+
+PyAPI_FUNC(PyStatus) Py_PreInitialize(
+    const PyPreConfig *src_config);
+PyAPI_FUNC(PyStatus) Py_PreInitializeFromBytesArgs(
+    const PyPreConfig *src_config,
+    Py_ssize_t argc,
+    char **argv);
+PyAPI_FUNC(PyStatus) Py_PreInitializeFromArgs(
+    const PyPreConfig *src_config,
+    Py_ssize_t argc,
+    wchar_t **argv);
+
+PyAPI_FUNC(int) _Py_IsCoreInitialized(void);
+
+
+/* Initialization and finalization */
+
+PyAPI_FUNC(PyStatus) Py_InitializeFromConfig(
+    const PyConfig *config);
+PyAPI_FUNC(PyStatus) _Py_InitializeMain(void);
+
+PyAPI_FUNC(int) Py_RunMain(void);
+
+
+PyAPI_FUNC(void) _Py_NO_RETURN Py_ExitStatusException(PyStatus err);
+
+/* Restore signals that the interpreter has called SIG_IGN on to SIG_DFL. */
+PyAPI_FUNC(void) _Py_RestoreSignals(void);
+
+PyAPI_FUNC(int) Py_FdIsInteractive(FILE *, const char *);
+PyAPI_FUNC(int) _Py_FdIsInteractive(FILE *fp, PyObject *filename);
+
+Py_DEPRECATED(3.11) PyAPI_FUNC(void) _Py_SetProgramFullPath(const wchar_t *);
+
+PyAPI_FUNC(const char *) _Py_gitidentifier(void);
+PyAPI_FUNC(const char *) _Py_gitversion(void);
+
+PyAPI_FUNC(int) _Py_IsFinalizing(void);
+PyAPI_FUNC(int) _Py_IsInterpreterFinalizing(PyInterpreterState *interp);
+
+/* Random */
+PyAPI_FUNC(int) _PyOS_URandom(void *buffer, Py_ssize_t size);
+PyAPI_FUNC(int) _PyOS_URandomNonblock(void *buffer, Py_ssize_t size);
+
+/* Legacy locale support */
+PyAPI_FUNC(int) _Py_CoerceLegacyLocale(int warn);
+PyAPI_FUNC(int) _Py_LegacyLocaleDetected(int warn);
+PyAPI_FUNC(char *) _Py_SetLocaleFromEnv(int category);
+
+/* --- PyInterpreterConfig ------------------------------------ */
+
+#define PyInterpreterConfig_DEFAULT_GIL (0)
+#define PyInterpreterConfig_SHARED_GIL (1)
+#define PyInterpreterConfig_OWN_GIL (2)
+
+typedef struct {
+    // XXX "allow_object_sharing"?  "own_objects"?
+    int use_main_obmalloc;
+    int allow_fork;
+    int allow_exec;
+    int allow_threads;
+    int allow_daemon_threads;
+    int check_multi_interp_extensions;
+    int gil;  // one of the PyInterpreterConfig_*_GIL values defined above
+} PyInterpreterConfig;
+
+#define _PyInterpreterConfig_INIT \
+    { \
+        .use_main_obmalloc = 0, \
+        .allow_fork = 0, \
+        .allow_exec = 0, \
+        .allow_threads = 1, \
+        .allow_daemon_threads = 0, \
+        .check_multi_interp_extensions = 1, \
+        .gil = PyInterpreterConfig_OWN_GIL, \
+    }
+
+#define _PyInterpreterConfig_LEGACY_INIT \
+    { \
+        .use_main_obmalloc = 1, \
+        .allow_fork = 1, \
+        .allow_exec = 1, \
+        .allow_threads = 1, \
+        .allow_daemon_threads = 1, \
+        .check_multi_interp_extensions = 0, \
+        .gil = PyInterpreterConfig_SHARED_GIL, \
+    }
+
+PyAPI_FUNC(PyStatus) Py_NewInterpreterFromConfig(
+    PyThreadState **tstate_p,
+    const PyInterpreterConfig *config);
+
+typedef void (*atexit_datacallbackfunc)(void *);
+PyAPI_FUNC(int) _Py_AtExit(
+        PyInterpreterState *, atexit_datacallbackfunc, void *);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/pymem.h b/nanvix-port/cpython-headers/python3.12/cpython/pymem.h
new file mode 100644
index 000000000000..d1054d76520b
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/pymem.h
@@ -0,0 +1,98 @@
+#ifndef Py_CPYTHON_PYMEM_H
+#  error "this header file must not be included directly"
+#endif
+
+PyAPI_FUNC(void *) PyMem_RawMalloc(size_t size);
+PyAPI_FUNC(void *) PyMem_RawCalloc(size_t nelem, size_t elsize);
+PyAPI_FUNC(void *) PyMem_RawRealloc(void *ptr, size_t new_size);
+PyAPI_FUNC(void) PyMem_RawFree(void *ptr);
+
+/* Try to get the allocator's name set by _PyMem_SetupAllocators(). */
+PyAPI_FUNC(const char*) _PyMem_GetCurrentAllocatorName(void);
+
+/* strdup() using PyMem_RawMalloc() */
+PyAPI_FUNC(char *) _PyMem_RawStrdup(const char *str);
+
+/* strdup() using PyMem_Malloc() */
+PyAPI_FUNC(char *) _PyMem_Strdup(const char *str);
+
+/* wcsdup() using PyMem_RawMalloc() */
+PyAPI_FUNC(wchar_t*) _PyMem_RawWcsdup(const wchar_t *str);
+
+
+typedef enum {
+    /* PyMem_RawMalloc(), PyMem_RawRealloc() and PyMem_RawFree() */
+    PYMEM_DOMAIN_RAW,
+
+    /* PyMem_Malloc(), PyMem_Realloc() and PyMem_Free() */
+    PYMEM_DOMAIN_MEM,
+
+    /* PyObject_Malloc(), PyObject_Realloc() and PyObject_Free() */
+    PYMEM_DOMAIN_OBJ
+} PyMemAllocatorDomain;
+
+typedef enum {
+    PYMEM_ALLOCATOR_NOT_SET = 0,
+    PYMEM_ALLOCATOR_DEFAULT = 1,
+    PYMEM_ALLOCATOR_DEBUG = 2,
+    PYMEM_ALLOCATOR_MALLOC = 3,
+    PYMEM_ALLOCATOR_MALLOC_DEBUG = 4,
+#ifdef WITH_PYMALLOC
+    PYMEM_ALLOCATOR_PYMALLOC = 5,
+    PYMEM_ALLOCATOR_PYMALLOC_DEBUG = 6,
+#endif
+} PyMemAllocatorName;
+
+
+typedef struct {
+    /* user context passed as the first argument to the 4 functions */
+    void *ctx;
+
+    /* allocate a memory block */
+    void* (*malloc) (void *ctx, size_t size);
+
+    /* allocate a memory block initialized by zeros */
+    void* (*calloc) (void *ctx, size_t nelem, size_t elsize);
+
+    /* allocate or resize a memory block */
+    void* (*realloc) (void *ctx, void *ptr, size_t new_size);
+
+    /* release a memory block */
+    void (*free) (void *ctx, void *ptr);
+} PyMemAllocatorEx;
+
+/* Get the memory block allocator of the specified domain. */
+PyAPI_FUNC(void) PyMem_GetAllocator(PyMemAllocatorDomain domain,
+                                    PyMemAllocatorEx *allocator);
+
+/* Set the memory block allocator of the specified domain.
+
+   The new allocator must return a distinct non-NULL pointer when requesting
+   zero bytes.
+
+   For the PYMEM_DOMAIN_RAW domain, the allocator must be thread-safe: the GIL
+   is not held when the allocator is called.
+
+   If the new allocator is not a hook (don't call the previous allocator), the
+   PyMem_SetupDebugHooks() function must be called to reinstall the debug hooks
+   on top of the new allocator. */
+PyAPI_FUNC(void) PyMem_SetAllocator(PyMemAllocatorDomain domain,
+                                    PyMemAllocatorEx *allocator);
+
+/* Setup hooks to detect bugs in the following Python memory allocator
+   functions:
+
+   - PyMem_RawMalloc(), PyMem_RawRealloc(), PyMem_RawFree()
+   - PyMem_Malloc(), PyMem_Realloc(), PyMem_Free()
+   - PyObject_Malloc(), PyObject_Realloc() and PyObject_Free()
+
+   Newly allocated memory is filled with the byte 0xCB, freed memory is filled
+   with the byte 0xDB. Additional checks:
+
+   - detect API violations, ex: PyObject_Free() called on a buffer allocated
+     by PyMem_Malloc()
+   - detect write before the start of the buffer (buffer underflow)
+   - detect write after the end of the buffer (buffer overflow)
+
+   The function does nothing if Python is not compiled in debug mode. */
+PyAPI_FUNC(void) PyMem_SetupDebugHooks(void);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/pystate.h b/nanvix-port/cpython-headers/python3.12/cpython/pystate.h
new file mode 100644
index 000000000000..95fad893786d
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/pystate.h
@@ -0,0 +1,456 @@
+#ifndef Py_CPYTHON_PYSTATE_H
+#  error "this header file must not be included directly"
+#endif
+
+
+/*
+Runtime Feature Flags
+
+Each flag indicates whether or not a specific runtime feature
+is available in a given context.  For example, forking the process
+might not be allowed in the current interpreter (i.e. os.fork() would fail).
+*/
+
+/* Set if the interpreter shares obmalloc runtime state
+   with the main interpreter. */
+#define Py_RTFLAGS_USE_MAIN_OBMALLOC (1UL << 5)
+
+/* Set if import should check a module for subinterpreter support. */
+#define Py_RTFLAGS_MULTI_INTERP_EXTENSIONS (1UL << 8)
+
+/* Set if threads are allowed. */
+#define Py_RTFLAGS_THREADS (1UL << 10)
+
+/* Set if daemon threads are allowed. */
+#define Py_RTFLAGS_DAEMON_THREADS (1UL << 11)
+
+/* Set if os.fork() is allowed. */
+#define Py_RTFLAGS_FORK (1UL << 15)
+
+/* Set if os.exec*() is allowed. */
+#define Py_RTFLAGS_EXEC (1UL << 16)
+
+
+PyAPI_FUNC(int) _PyInterpreterState_HasFeature(PyInterpreterState *interp,
+                                               unsigned long feature);
+
+
+/* private interpreter helpers */
+
+PyAPI_FUNC(int) _PyInterpreterState_RequiresIDRef(PyInterpreterState *);
+PyAPI_FUNC(void) _PyInterpreterState_RequireIDRef(PyInterpreterState *, int);
+
+PyAPI_FUNC(PyObject *) _PyInterpreterState_GetMainModule(PyInterpreterState *);
+
+
+/* State unique per thread */
+
+/* Py_tracefunc returns -1 when raising an exception, or 0 for success. */
+typedef int (*Py_tracefunc)(PyObject *, PyFrameObject *, int, PyObject *);
+
+/* The following values are used for 'what' for tracefunc functions
+ *
+ * To add a new kind of trace event, also update "trace_init" in
+ * Python/sysmodule.c to define the Python level event name
+ */
+#define PyTrace_CALL 0
+#define PyTrace_EXCEPTION 1
+#define PyTrace_LINE 2
+#define PyTrace_RETURN 3
+#define PyTrace_C_CALL 4
+#define PyTrace_C_EXCEPTION 5
+#define PyTrace_C_RETURN 6
+#define PyTrace_OPCODE 7
+
+// Internal structure: you should not use it directly, but use public functions
+// like PyThreadState_EnterTracing() and PyThreadState_LeaveTracing().
+typedef struct _PyCFrame {
+    /* This struct will be threaded through the C stack
+     * allowing fast access to per-thread state that needs
+     * to be accessed quickly by the interpreter, but can
+     * be modified outside of the interpreter.
+     *
+     * WARNING: This makes data on the C stack accessible from
+     * heap objects. Care must be taken to maintain stack
+     * discipline and make sure that instances of this struct cannot
+     * be accessed outside of their lifetime.
+     */
+    /* Pointer to the currently executing frame (it can be NULL) */
+    struct _PyInterpreterFrame *current_frame;
+    struct _PyCFrame *previous;  /* presumably the enclosing (older) _PyCFrame -- TODO confirm */
+} _PyCFrame;
+
+typedef struct _err_stackitem {
+    /* This struct represents a single execution context where we might
+     * be currently handling an exception.  It is a per-coroutine state
+     * (coroutine in the computer science sense, including the thread
+     * and generators).
+     *
+     * This is used as an entry on the exception stack, where each
+     * entry indicates if it is currently handling an exception.
+     * This ensures that the exception state is not impacted
+     * by "yields" from an except handler.  The thread
+     * always has an entry (the bottom-most one).
+     */
+
+    /* The exception currently being handled in this context, if any. */
+    PyObject *exc_value;
+
+    struct _err_stackitem *previous_item;  /* link to another exception-stack entry; presumably the next-older one -- TODO confirm */
+
+} _PyErr_StackItem;
+
+typedef struct _stack_chunk {
+    struct _stack_chunk *previous;  /* link to another chunk; presumably the one allocated before this -- TODO confirm */
+    size_t size;  /* NOTE(review): units (bytes vs. slots) are not visible here -- confirm */
+    size_t top;  /* NOTE(review): presumably a position within data[] -- confirm */
+    PyObject * data[1]; /* Variable sized */
+} _PyStackChunk;
+
+struct _py_trashcan {  /* embedded in struct _ts as the 'trash' field */
+    int delete_nesting;  /* presumably the current deallocation nesting depth -- TODO confirm */
+    PyObject *delete_later;  /* presumably objects whose deallocation was deferred -- TODO confirm */
+};
+
+struct _ts {
+    /* See Python/ceval.c for comments explaining most fields */
+
+    PyThreadState *prev;
+    PyThreadState *next;
+    PyInterpreterState *interp;
+
+    struct {
+        /* Has been initialized to a safe state.
+
+           In order to be effective, this must be set to 0 during or right
+           after allocation. */
+        unsigned int initialized:1;
+
+        /* Has been bound to an OS thread. */
+        unsigned int bound:1;
+        /* Has been unbound from its OS thread. */
+        unsigned int unbound:1;
+        /* Has been bound as current for the GILState API. */
+        unsigned int bound_gilstate:1;
+        /* Currently in use (maybe holds the GIL). */
+        unsigned int active:1;
+
+        /* various stages of finalization */
+        unsigned int finalizing:1;
+        unsigned int cleared:1;
+        unsigned int finalized:1;
+
+        /* padding to align to 4 bytes */
+        unsigned int :24;
+    } _status;
+
+    int py_recursion_remaining;
+    int py_recursion_limit;
+
+    int c_recursion_remaining;
+    int recursion_headroom; /* Allow 50 more calls to handle any errors. */
+
+    /* 'tracing' keeps track of the execution depth when tracing/profiling.
+       This is to prevent the actual trace/profile code from being recorded in
+       the trace/profile. */
+    int tracing;
+    int what_event; /* The event currently being monitored, if any. */
+
+    /* Pointer to current _PyCFrame in the C stack frame of the currently,
+     * or most recently, executing _PyEval_EvalFrameDefault. */
+    _PyCFrame *cframe;
+
+    Py_tracefunc c_profilefunc;
+    Py_tracefunc c_tracefunc;
+    PyObject *c_profileobj;
+    PyObject *c_traceobj;
+
+    /* The exception currently being raised */
+    PyObject *current_exception;
+
+    /* Pointer to the top of the exception stack for the exceptions
+     * we may be currently handling.  (See _PyErr_StackItem above.)
+     * This is never NULL. */
+    _PyErr_StackItem *exc_info;
+
+    PyObject *dict;  /* Stores per-thread state */
+
+    int gilstate_counter;
+
+    PyObject *async_exc; /* Asynchronous exception to raise */
+    unsigned long thread_id; /* Thread id where this tstate was created */
+
+    /* Native thread id where this tstate was created. This will be 0 except on
+     * those platforms that have the notion of native thread id, for which the
+     * macro PY_HAVE_THREAD_NATIVE_ID is then defined.
+     */
+    unsigned long native_thread_id;
+
+    struct _py_trashcan trash;  /* see struct _py_trashcan above */
+
+    /* Called when a thread state is deleted normally, but not when it
+     * is destroyed after fork().
+     * Pain:  to prevent rare but fatal shutdown errors (issue 18808),
+     * Thread.join() must wait for the join'ed thread's tstate to be unlinked
+     * from the tstate chain.  That happens at the end of a thread's life,
+     * in pystate.c.
+     * The obvious way doesn't quite work:  create a lock which the tstate
+     * unlinking code releases, and have Thread.join() wait to acquire that
+     * lock.  The problem is that we _are_ at the end of the thread's life:
+     * if the thread holds the last reference to the lock, decref'ing the
+     * lock will delete the lock, and that may trigger arbitrary Python code
+     * if there's a weakref, with a callback, to the lock.  But by this time
+     * _PyRuntime.gilstate.tstate_current is already NULL, so only the simplest
+     * of C code can be allowed to run (in particular it must not be possible to
+     * release the GIL).
+     * So instead of holding the lock directly, the tstate holds a weakref to
+     * the lock:  that's the value of on_delete_data below.  Decref'ing a
+     * weakref is harmless.
+     * on_delete points to _threadmodule.c's static release_sentinel() function.
+     * After the tstate is unlinked, release_sentinel is called with the
+     * weakref-to-lock (on_delete_data) argument, and release_sentinel releases
+     * the indirectly held lock.
+     */
+    void (*on_delete)(void *);
+    void *on_delete_data;
+
+    int coroutine_origin_tracking_depth;
+
+    PyObject *async_gen_firstiter;
+    PyObject *async_gen_finalizer;
+
+    PyObject *context;
+    uint64_t context_ver;
+
+    /* Unique thread state id. */
+    uint64_t id;
+
+    _PyStackChunk *datastack_chunk;  /* see _PyStackChunk above */
+    PyObject **datastack_top;
+    PyObject **datastack_limit;
+    /* XXX signal handlers should also be here */
+
+    /* The following fields are here to avoid allocation during init.
+       The data is exposed through PyThreadState pointer fields.
+       These fields should not be accessed directly outside of init.
+       This is indicated by an underscore prefix on the field names.
+
+       All other PyThreadState pointer fields are populated when
+       needed and default to NULL.
+       */
+       // Note some fields do not have a leading underscore for backward
+       // compatibility.  See https://bugs.python.org/issue45953#msg412046.
+
+    /* The thread's exception stack entry.  (Always the last entry.) */
+    _PyErr_StackItem exc_state;
+
+    /* The bottom-most frame on the stack. */
+    _PyCFrame root_cframe;
+};
+
+/* WASI has a limited call stack. Python's recursion limit depends on code
+   layout, optimization, and WASI runtime. Wasmtime can handle about 700
+   recursions, sometimes less. 500 is a more conservative limit. */
+#ifdef Py_DEBUG
+#  if defined(__wasi__)
+#    define C_RECURSION_LIMIT 150
+#  else
+#    define C_RECURSION_LIMIT 500
+#  endif
+#else  /* !Py_DEBUG */
+#  if defined(__wasi__)
+#    define C_RECURSION_LIMIT 500
+#  elif defined(__s390x__)
+#    define C_RECURSION_LIMIT 800
+#  elif defined(_WIN32)
+#    define C_RECURSION_LIMIT 3000
+#  elif defined(_Py_ADDRESS_SANITIZER)
+#    define C_RECURSION_LIMIT 4000
+#  else
+     // This value is duplicated in Lib/test/support/__init__.py
+#    define C_RECURSION_LIMIT 10000
+#  endif
+#endif  /* Py_DEBUG */
+
+/* other API */
+
+// Alias for backward compatibility with Python 3.8
+#define _PyInterpreterState_Get PyInterpreterState_Get
+
+/* An alias for the internal _PyThreadState_New(),
+   kept for stable ABI compatibility. */
+PyAPI_FUNC(PyThreadState *) _PyThreadState_Prealloc(PyInterpreterState *);
+
+/* Similar to PyThreadState_Get(), but don't issue a fatal error
+ * if it is NULL. */
+PyAPI_FUNC(PyThreadState *) _PyThreadState_UncheckedGet(void);
+
+PyAPI_FUNC(PyObject *) _PyThreadState_GetDict(PyThreadState *tstate);
+
+// Disable tracing and profiling.
+PyAPI_FUNC(void) PyThreadState_EnterTracing(PyThreadState *tstate);
+
+// Reset tracing and profiling: enable them if a trace function or a profile
+// function is set, otherwise disable them.
+PyAPI_FUNC(void) PyThreadState_LeaveTracing(PyThreadState *tstate);
+
+/* PyGILState */
+
+/* Helper/diagnostic function - return 1 if the current thread
+   currently holds the GIL, 0 otherwise.
+
+   The function returns 1 if _PyGILState_check_enabled is non-zero. */
+PyAPI_FUNC(int) PyGILState_Check(void);
+
+/* Get the single PyInterpreterState used by this process' GILState
+   implementation.
+
+   This function doesn't check for error. Return NULL before _PyGILState_Init()
+   is called and after _PyGILState_Fini() is called.
+
+   See also _PyInterpreterState_Get() and _PyInterpreterState_GET(). */
+PyAPI_FUNC(PyInterpreterState *) _PyGILState_GetInterpreterStateUnsafe(void);
+
+/* The implementation of sys._current_frames().  Returns a dict mapping
+   thread id to that thread's current frame.
+*/
+PyAPI_FUNC(PyObject *) _PyThread_CurrentFrames(void);
+
+/* The implementation of sys._current_exceptions().  Returns a dict mapping
+   thread id to that thread's current exception.
+*/
+PyAPI_FUNC(PyObject *) _PyThread_CurrentExceptions(void);
+
+/* Routines for advanced debuggers, requested by David Beazley.
+   Don't use unless you know what you are doing! */
+PyAPI_FUNC(PyInterpreterState *) PyInterpreterState_Main(void);
+PyAPI_FUNC(PyInterpreterState *) PyInterpreterState_Head(void);
+PyAPI_FUNC(PyInterpreterState *) PyInterpreterState_Next(PyInterpreterState *);
+PyAPI_FUNC(PyThreadState *) PyInterpreterState_ThreadHead(PyInterpreterState *);
+PyAPI_FUNC(PyThreadState *) PyThreadState_Next(PyThreadState *);
+PyAPI_FUNC(void) PyThreadState_DeleteCurrent(void);
+
+/* Frame evaluation API */
+
+typedef PyObject* (*_PyFrameEvalFunction)(PyThreadState *tstate, struct _PyInterpreterFrame *, int);
+
+PyAPI_FUNC(_PyFrameEvalFunction) _PyInterpreterState_GetEvalFrameFunc(
+    PyInterpreterState *interp);
+PyAPI_FUNC(void) _PyInterpreterState_SetEvalFrameFunc(
+    PyInterpreterState *interp,
+    _PyFrameEvalFunction eval_frame);
+
+PyAPI_FUNC(const PyConfig*) _PyInterpreterState_GetConfig(PyInterpreterState *interp);
+
+/* Get a copy of the current interpreter configuration.
+
+   Return 0 on success. Raise an exception and return -1 on error.
+
+   The caller must initialize 'config', using PyConfig_InitPythonConfig()
+   for example.
+
+   Python must be preinitialized to call this method.
+   The caller must hold the GIL.
+
+   Once done with the configuration, PyConfig_Clear() must be called to clear
+   it. */
+PyAPI_FUNC(int) _PyInterpreterState_GetConfigCopy(
+    struct PyConfig *config);
+
+/* Set the configuration of the current interpreter.
+
+   This function should be called during or just after the Python
+   initialization.
+
+   Update the sys module with the new configuration. If the sys module was
+   modified directly after the Python initialization, these changes are lost.
+
+   Some configuration like faulthandler or warnoptions can be updated in the
+   configuration, but don't reconfigure Python (don't enable/disable
+   faulthandler and don't reconfigure warnings filters).
+
+   Return 0 on success. Raise an exception and return -1 on error.
+
+   The configuration should come from _PyInterpreterState_GetConfigCopy(). */
+PyAPI_FUNC(int) _PyInterpreterState_SetConfig(
+    const struct PyConfig *config);
+
+// Get the configuration of the current interpreter.
+// The caller must hold the GIL.
+PyAPI_FUNC(const PyConfig*) _Py_GetConfig(void);
+
+
+/* cross-interpreter data */
+
+// _PyCrossInterpreterData is similar to Py_buffer as an effectively
+// opaque struct that holds data outside the object machinery.  This
+// is necessary to pass safely between interpreters in the same process.
+typedef struct _xid _PyCrossInterpreterData;
+
+typedef PyObject *(*xid_newobjectfunc)(_PyCrossInterpreterData *);
+typedef void (*xid_freefunc)(void *);
+
+struct _xid {  // aliased as _PyCrossInterpreterData by the typedef above
+    // data is the cross-interpreter-safe derivation of a Python object
+    // (see _PyObject_GetCrossInterpreterData).  It will be NULL if the
+    // new_object func (below) encodes the data.
+    void *data;
+    // obj is the Python object from which the data was derived.  This
+    // is non-NULL only if the data remains bound to the object in some
+    // way, such that the object must be "released" (via a decref) when
+    // the data is released.  In that case the code that sets the field,
+    // likely a registered "crossinterpdatafunc", is responsible for
+    // ensuring it owns the reference (i.e. incref).
+    PyObject *obj;
+    // interp is the ID of the owning interpreter of the original
+    // object.  It corresponds to the active interpreter when
+    // _PyObject_GetCrossInterpreterData() was called.  This should only
+    // be set by the cross-interpreter machinery.
+    //
+    // We use the ID rather than the PyInterpreterState to avoid issues
+    // with deleted interpreters.  Note that IDs are never re-used, so
+    // each one will always correspond to a specific interpreter
+    // (whether still alive or not).
+    int64_t interp;
+    // new_object is a function that returns a new object in the current
+    // interpreter given the data.  The resulting object (a new
+    // reference) will be equivalent to the original object.  This field
+    // is required.
+    xid_newobjectfunc new_object;
+    // free is called when the data is released.  If it is NULL then
+    // nothing will be done to free the data.  For some types this is
+    // okay (e.g. bytes) and for those types this field should be set
+    // to NULL.  However, for most the data was allocated just for
+    // cross-interpreter use, so it must be freed when
+    // _PyCrossInterpreterData_Release is called or the memory will
+    // leak.  In that case, at the very least this field should be set
+    // to PyMem_RawFree (the default if not explicitly set to NULL).
+    // The call will happen with the original interpreter activated.
+    xid_freefunc free;
+};
+
+PyAPI_FUNC(void) _PyCrossInterpreterData_Init(
+        _PyCrossInterpreterData *data,
+        PyInterpreterState *interp, void *shared, PyObject *obj,
+        xid_newobjectfunc new_object);
+PyAPI_FUNC(int) _PyCrossInterpreterData_InitWithSize(
+        _PyCrossInterpreterData *,
+        PyInterpreterState *interp, const size_t, PyObject *,
+        xid_newobjectfunc);
+PyAPI_FUNC(void) _PyCrossInterpreterData_Clear(
+        PyInterpreterState *, _PyCrossInterpreterData *);
+
+PyAPI_FUNC(int) _PyObject_GetCrossInterpreterData(PyObject *, _PyCrossInterpreterData *);
+PyAPI_FUNC(PyObject *) _PyCrossInterpreterData_NewObject(_PyCrossInterpreterData *);
+PyAPI_FUNC(int) _PyCrossInterpreterData_Release(_PyCrossInterpreterData *);
+
+PyAPI_FUNC(int) _PyObject_CheckCrossInterpreterData(PyObject *);
+
+/* cross-interpreter data registry */
+
+typedef int (*crossinterpdatafunc)(PyThreadState *tstate, PyObject *,
+                                   _PyCrossInterpreterData *);
+
+PyAPI_FUNC(int) _PyCrossInterpreterData_RegisterClass(PyTypeObject *, crossinterpdatafunc);
+PyAPI_FUNC(int) _PyCrossInterpreterData_UnregisterClass(PyTypeObject *);
+PyAPI_FUNC(crossinterpdatafunc) _PyCrossInterpreterData_Lookup(PyObject *);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/pythonrun.h b/nanvix-port/cpython-headers/python3.12/cpython/pythonrun.h
new file mode 100644
index 000000000000..fb6176553740
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/pythonrun.h
@@ -0,0 +1,121 @@
+#ifndef Py_CPYTHON_PYTHONRUN_H
+#  error "this header file must not be included directly"
+#endif
+
+PyAPI_FUNC(int) PyRun_SimpleStringFlags(const char *, PyCompilerFlags *);
+PyAPI_FUNC(int) _PyRun_SimpleFileObject(
+    FILE *fp,
+    PyObject *filename,
+    int closeit,
+    PyCompilerFlags *flags);
+PyAPI_FUNC(int) PyRun_AnyFileExFlags(
+    FILE *fp,
+    const char *filename,       /* decoded from the filesystem encoding */
+    int closeit,
+    PyCompilerFlags *flags);
+PyAPI_FUNC(int) _PyRun_AnyFileObject(
+    FILE *fp,
+    PyObject *filename,
+    int closeit,
+    PyCompilerFlags *flags);
+PyAPI_FUNC(int) PyRun_SimpleFileExFlags(
+    FILE *fp,
+    const char *filename,       /* decoded from the filesystem encoding */
+    int closeit,
+    PyCompilerFlags *flags);
+PyAPI_FUNC(int) PyRun_InteractiveOneFlags(
+    FILE *fp,
+    const char *filename,       /* decoded from the filesystem encoding */
+    PyCompilerFlags *flags);
+PyAPI_FUNC(int) PyRun_InteractiveOneObject(
+    FILE *fp,
+    PyObject *filename,
+    PyCompilerFlags *flags);
+PyAPI_FUNC(int) PyRun_InteractiveLoopFlags(
+    FILE *fp,
+    const char *filename,       /* decoded from the filesystem encoding */
+    PyCompilerFlags *flags);
+PyAPI_FUNC(int) _PyRun_InteractiveLoopObject(
+    FILE *fp,
+    PyObject *filename,
+    PyCompilerFlags *flags);
+
+
+PyAPI_FUNC(PyObject *) PyRun_StringFlags(const char *, int, PyObject *,
+                                         PyObject *, PyCompilerFlags *);
+
+PyAPI_FUNC(PyObject *) PyRun_FileExFlags(
+    FILE *fp,
+    const char *filename,       /* decoded from the filesystem encoding */
+    int start,
+    PyObject *globals,
+    PyObject *locals,
+    int closeit,
+    PyCompilerFlags *flags);
+
+
+PyAPI_FUNC(PyObject *) Py_CompileStringExFlags(
+    const char *str,
+    const char *filename,       /* decoded from the filesystem encoding */
+    int start,
+    PyCompilerFlags *flags,
+    int optimize);
+PyAPI_FUNC(PyObject *) Py_CompileStringObject(
+    const char *str,
+    PyObject *filename, int start,
+    PyCompilerFlags *flags,
+    int optimize);
+
+#define Py_CompileString(str, p, s) Py_CompileStringExFlags((str), (p), (s), NULL, -1)
+#define Py_CompileStringFlags(str, p, s, f) Py_CompileStringExFlags((str), (p), (s), (f), -1)
+
+
+PyAPI_FUNC(const char *) _Py_SourceAsString(
+    PyObject *cmd,
+    const char *funcname,
+    const char *what,
+    PyCompilerFlags *cf,
+    PyObject **cmd_copy);
+
+
+/* A function flavor is also exported by libpython. It is required when
+    libpython is accessed directly rather than using header files which define
+    macros below. On Windows, for example, PyAPI_FUNC() uses dllexport to
+    export functions in pythonXX.dll. */
+PyAPI_FUNC(PyObject *) PyRun_String(const char *str, int s, PyObject *g, PyObject *l);
+PyAPI_FUNC(int) PyRun_AnyFile(FILE *fp, const char *name);
+PyAPI_FUNC(int) PyRun_AnyFileEx(FILE *fp, const char *name, int closeit);
+PyAPI_FUNC(int) PyRun_AnyFileFlags(FILE *, const char *, PyCompilerFlags *);
+PyAPI_FUNC(int) PyRun_SimpleString(const char *s);
+PyAPI_FUNC(int) PyRun_SimpleFile(FILE *f, const char *p);
+PyAPI_FUNC(int) PyRun_SimpleFileEx(FILE *f, const char *p, int c);
+PyAPI_FUNC(int) PyRun_InteractiveOne(FILE *f, const char *p);
+PyAPI_FUNC(int) PyRun_InteractiveLoop(FILE *f, const char *p);
+PyAPI_FUNC(PyObject *) PyRun_File(FILE *fp, const char *p, int s, PyObject *g, PyObject *l);
+PyAPI_FUNC(PyObject *) PyRun_FileEx(FILE *fp, const char *p, int s, PyObject *g, PyObject *l, int c);
+PyAPI_FUNC(PyObject *) PyRun_FileFlags(FILE *fp, const char *p, int s, PyObject *g, PyObject *l, PyCompilerFlags *flags);
+
+/* Use macros for a bunch of old variants */
+#define PyRun_String(str, s, g, l) PyRun_StringFlags((str), (s), (g), (l), NULL)
+#define PyRun_AnyFile(fp, name) PyRun_AnyFileExFlags((fp), (name), 0, NULL)
+#define PyRun_AnyFileEx(fp, name, closeit) \
+    PyRun_AnyFileExFlags((fp), (name), (closeit), NULL)
+#define PyRun_AnyFileFlags(fp, name, flags) \
+    PyRun_AnyFileExFlags((fp), (name), 0, (flags))
+#define PyRun_SimpleString(s) PyRun_SimpleStringFlags((s), NULL)
+#define PyRun_SimpleFile(f, p) PyRun_SimpleFileExFlags((f), (p), 0, NULL)
+#define PyRun_SimpleFileEx(f, p, c) PyRun_SimpleFileExFlags((f), (p), (c), NULL)
+#define PyRun_InteractiveOne(f, p) PyRun_InteractiveOneFlags((f), (p), NULL)
+#define PyRun_InteractiveLoop(f, p) PyRun_InteractiveLoopFlags((f), (p), NULL)
+#define PyRun_File(fp, p, s, g, l) \
+    PyRun_FileExFlags((fp), (p), (s), (g), (l), 0, NULL)
+#define PyRun_FileEx(fp, p, s, g, l, c) \
+    PyRun_FileExFlags((fp), (p), (s), (g), (l), (c), NULL)
+#define PyRun_FileFlags(fp, p, s, g, l, flags) \
+    PyRun_FileExFlags((fp), (p), (s), (g), (l), 0, (flags))
+
+
+/* Stuff with no proper home (yet) */
+PyAPI_FUNC(char *) PyOS_Readline(FILE *, FILE *, const char *);
+PyAPI_DATA(PyThreadState*) _PyOS_ReadlineTState;
+PyAPI_DATA(char) *(*PyOS_ReadlineFunctionPointer)(FILE *, FILE *, const char *);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/pythread.h b/nanvix-port/cpython-headers/python3.12/cpython/pythread.h
new file mode 100644
index 000000000000..ce4ec8f65b15
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/pythread.h
@@ -0,0 +1,42 @@
+#ifndef Py_CPYTHON_PYTHREAD_H
+#  error "this header file must not be included directly"
+#endif
+
+#define PYTHREAD_INVALID_THREAD_ID ((unsigned long)-1)
+
+#ifdef HAVE_FORK
+/* Private function to reinitialize a lock at fork in the child process.
+   Reset the lock to the unlocked state.
+   Return 0 on success, return -1 on error. */
+PyAPI_FUNC(int) _PyThread_at_fork_reinit(PyThread_type_lock *lock);
+#endif  /* HAVE_FORK */
+
+#ifdef HAVE_PTHREAD_H
+    /* Darwin needs pthread.h to know the type name pthread_key_t. */
+#   include <pthread.h>
+#   define NATIVE_TSS_KEY_T     pthread_key_t
+#elif defined(NT_THREADS)
+    /* On Windows, the native TSS key type is DWORD,
+       but unsigned long is hardcoded here to avoid errors for the include directive.
+    */
+#   define NATIVE_TSS_KEY_T     unsigned long
+#elif defined(HAVE_PTHREAD_STUBS)
+#   include "cpython/pthread_stubs.h"
+#   define NATIVE_TSS_KEY_T     pthread_key_t
+#else
+#   error "Require native threads. See https://bugs.python.org/issue31370"
+#endif  /* NATIVE_TSS_KEY_T selection */
+
+/* When Py_LIMITED_API is not defined, the type layout of Py_tss_t is
+   exposed to allow static allocation in the API clients.  Even in this case,
+   you must handle TSS keys through API functions due to compatibility.
+*/
+struct _Py_tss_t {
+    int _is_initialized;  /* 0 under static init with Py_tss_NEEDS_INIT ({0}); presumably set once the key is created -- TODO confirm */
+    NATIVE_TSS_KEY_T _key;  /* native TSS key; its type is chosen by the NATIVE_TSS_KEY_T #if chain above */
+};
+
+#undef NATIVE_TSS_KEY_T
+
+/* When using static allocation, you must initialize with Py_tss_NEEDS_INIT. */
+#define Py_tss_NEEDS_INIT   {0}
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/pytime.h b/nanvix-port/cpython-headers/python3.12/cpython/pytime.h
new file mode 100644
index 000000000000..16d88d191e9e
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/pytime.h
@@ -0,0 +1,331 @@
+// The _PyTime_t API is written to use timestamp and timeout values stored in
+// various formats and to read clocks.
+//
+// The _PyTime_t type is an integer to support directly common arithmetic
+// operations like t1 + t2.
+//
+// The _PyTime_t API supports a resolution of 1 nanosecond. The _PyTime_t type
+// is signed to support negative timestamps. The supported range is around
+// [-292.3 years; +292.3 years]. Using the Unix epoch (January 1st, 1970), the
+// supported date range is around [1677-09-21; 2262-04-11].
+//
+// Formats:
+//
+// * seconds
+// * seconds as a floating point number (C double)
+// * milliseconds (10^-3 seconds)
+// * microseconds (10^-6 seconds)
+// * 100 nanoseconds (10^-7 seconds)
+// * nanoseconds (10^-9 seconds)
+// * timeval structure, 1 microsecond resolution (10^-6 seconds)
+// * timespec structure, 1 nanosecond resolution (10^-9 seconds)
+//
+// Integer overflows are detected and raise OverflowError. Conversion to a
+// resolution worse than 1 nanosecond is rounded correctly with the requested
+// rounding mode. There are 4 rounding modes: floor (towards -inf), ceiling
+// (towards +inf), half even and up (away from zero).
+//
+// Some functions clamp the result in the range [_PyTime_MIN; _PyTime_MAX], so
+// the caller doesn't have to handle errors and doesn't need to hold the GIL.
+// For example, _PyTime_Add(t1, t2) computes t1+t2 and clamps the result on
+// overflow.
+//
+// Clocks:
+//
+// * System clock
+// * Monotonic clock
+// * Performance counter
+//
+// Operations like (t * k / q) with integers are implemented in a way to reduce
+// the risk of integer overflow. Such an operation is used to convert a clock
+// value expressed in ticks with a frequency to _PyTime_t, like
+// QueryPerformanceCounter() with QueryPerformanceFrequency().
+
+#ifndef Py_LIMITED_API
+#ifndef Py_PYTIME_H
+#define Py_PYTIME_H
+
+/**************************************************************************
+Symbols and macros to supply platform-independent interfaces to time related
+functions and constants
+**************************************************************************/
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __clang__
+struct timeval;
+#endif
+
+/* _PyTime_t: Python timestamp with subsecond precision. It can be used to
+   store a duration, and so indirectly a date (related to another date, like
+   UNIX epoch). */
+typedef int64_t _PyTime_t;
+// _PyTime_MIN nanoseconds is around -292.3 years
+#define _PyTime_MIN INT64_MIN
+// _PyTime_MAX nanoseconds is around +292.3 years
+#define _PyTime_MAX INT64_MAX
+#define _SIZEOF_PYTIME_T 8
+
+typedef enum {
+    /* Round towards minus infinity (-inf).
+       For example, used to read a clock. */
+    _PyTime_ROUND_FLOOR=0,
+    /* Round towards infinity (+inf).
+       For example, used for timeout to wait "at least" N seconds. */
+    _PyTime_ROUND_CEILING=1,
+    /* Round to nearest with ties going to nearest even integer.
+       For example, used to round from a Python float. */
+    _PyTime_ROUND_HALF_EVEN=2,
+    /* Round away from zero.
+       For example, used for timeout. _PyTime_ROUND_CEILING rounds
+       -1e-9 to 0 milliseconds which causes the bpo-31786 issue.
+       _PyTime_ROUND_UP rounds -1e-9 to -1 millisecond which keeps
+       the timeout sign as expected. select.poll(timeout) must block
+       for negative values. */
+    _PyTime_ROUND_UP=3,
+    /* _PyTime_ROUND_TIMEOUT (an alias for _PyTime_ROUND_UP) should be
+       used for timeouts. */
+    _PyTime_ROUND_TIMEOUT = _PyTime_ROUND_UP
+} _PyTime_round_t;
+
+
+/* Convert a time_t to a PyLong. */
+PyAPI_FUNC(PyObject *) _PyLong_FromTime_t(
+    time_t sec);
+
+/* Convert a PyLong to a time_t. */
+PyAPI_FUNC(time_t) _PyLong_AsTime_t(
+    PyObject *obj);
+
+/* Convert a number of seconds, int or float, to time_t. */
+PyAPI_FUNC(int) _PyTime_ObjectToTime_t(
+    PyObject *obj,
+    time_t *sec,
+    _PyTime_round_t);
+
+/* Convert a number of seconds, int or float, to a timeval structure.
+   usec is in the range [0; 999999] and rounded towards zero.
+   For example, -1.2 is converted to (-2, 800000). */
+PyAPI_FUNC(int) _PyTime_ObjectToTimeval(
+    PyObject *obj,
+    time_t *sec,
+    long *usec,
+    _PyTime_round_t);
+
+/* Convert a number of seconds, int or float, to a timespec structure.
+   nsec is in the range [0; 999999999] and rounded towards zero.
+   For example, -1.2 is converted to (-2, 800000000). */
+PyAPI_FUNC(int) _PyTime_ObjectToTimespec(
+    PyObject *obj,
+    time_t *sec,
+    long *nsec,
+    _PyTime_round_t);
+
+
+/* Create a timestamp from a number of seconds. */
+PyAPI_FUNC(_PyTime_t) _PyTime_FromSeconds(int seconds);
+
+/* Macro to create a timestamp from a number of seconds, no integer overflow.
+   Only use the macro for small values, prefer _PyTime_FromSeconds(). */
+#define _PYTIME_FROMSECONDS(seconds) \
+            ((_PyTime_t)(seconds) * (1000 * 1000 * 1000))
+
+/* Create a timestamp from a number of nanoseconds. */
+PyAPI_FUNC(_PyTime_t) _PyTime_FromNanoseconds(_PyTime_t ns);
+
+/* Create a timestamp from a number of microseconds.
+ * Clamp to [_PyTime_MIN; _PyTime_MAX] on overflow. */
+PyAPI_FUNC(_PyTime_t) _PyTime_FromMicrosecondsClamp(_PyTime_t us);
+
+/* Create a timestamp from nanoseconds (Python int). */
+PyAPI_FUNC(int) _PyTime_FromNanosecondsObject(_PyTime_t *t,
+    PyObject *obj);
+
+/* Convert a number of seconds (Python float or int) to a timestamp.
+   Raise an exception and return -1 on error, return 0 on success. */
+PyAPI_FUNC(int) _PyTime_FromSecondsObject(_PyTime_t *t,
+    PyObject *obj,
+    _PyTime_round_t round);
+
+/* Convert a number of milliseconds (Python float or int, 10^-3) to a timestamp.
+   Raise an exception and return -1 on error, return 0 on success. */
+PyAPI_FUNC(int) _PyTime_FromMillisecondsObject(_PyTime_t *t,
+    PyObject *obj,
+    _PyTime_round_t round);
+
+/* Convert a timestamp to a number of seconds as a C double. */
+PyAPI_FUNC(double) _PyTime_AsSecondsDouble(_PyTime_t t);
+
+/* Convert timestamp to a number of milliseconds (10^-3 seconds). */
+PyAPI_FUNC(_PyTime_t) _PyTime_AsMilliseconds(_PyTime_t t,
+    _PyTime_round_t round);
+
+/* Convert timestamp to a number of microseconds (10^-6 seconds). */
+PyAPI_FUNC(_PyTime_t) _PyTime_AsMicroseconds(_PyTime_t t,
+    _PyTime_round_t round);
+
+/* Convert timestamp to a number of nanoseconds (10^-9 seconds). */
+PyAPI_FUNC(_PyTime_t) _PyTime_AsNanoseconds(_PyTime_t t);
+
+#ifdef MS_WINDOWS
+// Convert timestamp to a number of 100 nanoseconds (10^-7 seconds).
+PyAPI_FUNC(_PyTime_t) _PyTime_As100Nanoseconds(_PyTime_t t,
+    _PyTime_round_t round);
+#endif
+
+/* Convert timestamp to a number of nanoseconds (10^-9 seconds) as a Python int
+   object. */
+PyAPI_FUNC(PyObject *) _PyTime_AsNanosecondsObject(_PyTime_t t);
+
+#ifndef MS_WINDOWS
+/* Create a timestamp from a timeval structure.
+   Raise an exception and return -1 on overflow, return 0 on success. */
+PyAPI_FUNC(int) _PyTime_FromTimeval(_PyTime_t *tp, struct timeval *tv);
+#endif
+
+/* Convert a timestamp to a timeval structure (microsecond resolution).
+   tv_usec is always positive.
+   Raise an exception and return -1 if the conversion overflowed,
+   return 0 on success. */
+PyAPI_FUNC(int) _PyTime_AsTimeval(_PyTime_t t,
+    struct timeval *tv,
+    _PyTime_round_t round);
+
+/* Similar to _PyTime_AsTimeval() but don't raise an exception on overflow.
+   On overflow, clamp tv_sec to _PyTime_t min/max. */
+PyAPI_FUNC(void) _PyTime_AsTimeval_clamp(_PyTime_t t,
+    struct timeval *tv,
+    _PyTime_round_t round);
+
+/* Convert a timestamp to a number of seconds (secs) and microseconds (us).
+   us is always positive. This function is similar to _PyTime_AsTimeval()
+   except that secs is always a time_t type, whereas the timeval structure
+   uses a C long for tv_sec on Windows.
+   Raise an exception and return -1 if the conversion overflowed,
+   return 0 on success. */
+PyAPI_FUNC(int) _PyTime_AsTimevalTime_t(
+    _PyTime_t t,
+    time_t *secs,
+    int *us,
+    _PyTime_round_t round);
+
+#if defined(HAVE_CLOCK_GETTIME) || defined(HAVE_KQUEUE)
+/* Create a timestamp from a timespec structure.
+   Raise an exception and return -1 on overflow, return 0 on success. */
+PyAPI_FUNC(int) _PyTime_FromTimespec(_PyTime_t *tp, struct timespec *ts);
+
+/* Convert a timestamp to a timespec structure (nanosecond resolution).
+   tv_nsec is always positive.
+   Raise an exception and return -1 on error, return 0 on success. */
+PyAPI_FUNC(int) _PyTime_AsTimespec(_PyTime_t t, struct timespec *ts);
+
+/* Similar to _PyTime_AsTimespec() but don't raise an exception on overflow.
+   On overflow, clamp tv_sec to _PyTime_t min/max. */
+PyAPI_FUNC(void) _PyTime_AsTimespec_clamp(_PyTime_t t, struct timespec *ts);
+#endif
+
+
+// Compute t1 + t2. Clamp to [_PyTime_MIN; _PyTime_MAX] on overflow.
+PyAPI_FUNC(_PyTime_t) _PyTime_Add(_PyTime_t t1, _PyTime_t t2);
+
+/* Compute ticks * mul / div.
+   Clamp to [_PyTime_MIN; _PyTime_MAX] on overflow.
+   The caller must ensure that ((div - 1) * mul) cannot overflow. */
+PyAPI_FUNC(_PyTime_t) _PyTime_MulDiv(_PyTime_t ticks,
+    _PyTime_t mul,
+    _PyTime_t div);
+
+/* Clock description for time.get_clock_info(), filled by *WithInfo() calls. */
+typedef struct {
+    const char *implementation;
+    int monotonic;
+    int adjustable;
+    double resolution;
+} _Py_clock_info_t;
+
+/* Get the current time from the system clock.
+
+   If the internal clock fails, silently ignore the error and return 0.
+   On integer overflow, silently ignore the overflow and clamp the clock to
+   [_PyTime_MIN; _PyTime_MAX].
+
+   Use _PyTime_GetSystemClockWithInfo() to check for failure. */
+PyAPI_FUNC(_PyTime_t) _PyTime_GetSystemClock(void);
+
+/* Get the current time from the system clock.
+ * On success, set *t and *info (if not NULL), and return 0.
+ * On error, raise an exception and return -1.
+ */
+PyAPI_FUNC(int) _PyTime_GetSystemClockWithInfo(
+    _PyTime_t *t,
+    _Py_clock_info_t *info);
+
+/* Get the time of a monotonic clock, i.e. a clock that cannot go backwards.
+   The clock is not affected by system clock updates. The reference point of
+   the returned value is undefined, so that only the difference between the
+   results of consecutive calls is valid.
+
+   If the internal clock fails, silently ignore the error and return 0.
+   On integer overflow, silently ignore the overflow and clamp the clock to
+   [_PyTime_MIN; _PyTime_MAX].
+
+   Use _PyTime_GetMonotonicClockWithInfo() to check for failure. */
+PyAPI_FUNC(_PyTime_t) _PyTime_GetMonotonicClock(void);
+
+/* Get the time of a monotonic clock, i.e. a clock that cannot go backwards.
+   The clock is not affected by system clock updates. The reference point of
+   the returned value is undefined, so that only the difference between the
+   results of consecutive calls is valid.
+
+   Fill info (if set) with information about the function used to get the time.
+
+   Return 0 on success, raise an exception and return -1 on error. */
+PyAPI_FUNC(int) _PyTime_GetMonotonicClockWithInfo(
+    _PyTime_t *t,
+    _Py_clock_info_t *info);
+
+
+/* Converts a timestamp to the Gregorian time, using the local time zone.
+   Return 0 on success, raise an exception and return -1 on error. */
+PyAPI_FUNC(int) _PyTime_localtime(time_t t, struct tm *tm);
+
+/* Converts a timestamp to the Gregorian time, assuming UTC.
+   Return 0 on success, raise an exception and return -1 on error. */
+PyAPI_FUNC(int) _PyTime_gmtime(time_t t, struct tm *tm);
+
+/* Get the performance counter: clock with the highest available resolution to
+   measure a short duration.
+
+   If the internal clock fails, silently ignore the error and return 0.
+   On integer overflow, silently ignore the overflow and clamp the clock to
+   [_PyTime_MIN; _PyTime_MAX].
+
+   Use _PyTime_GetPerfCounterWithInfo() to check for failure. */
+PyAPI_FUNC(_PyTime_t) _PyTime_GetPerfCounter(void);
+
+/* Get the performance counter: clock with the highest available resolution to
+   measure a short duration.
+
+   Fill info (if set) with information about the function used to get the time.
+
+   Return 0 on success, raise an exception and return -1 on error. */
+PyAPI_FUNC(int) _PyTime_GetPerfCounterWithInfo(
+    _PyTime_t *t,
+    _Py_clock_info_t *info);
+
+
+// Create a deadline.
+// Pseudo code: _PyTime_GetMonotonicClock() + timeout.
+PyAPI_FUNC(_PyTime_t) _PyDeadline_Init(_PyTime_t timeout);
+
+// Get remaining time from a deadline.
+// Pseudo code: deadline - _PyTime_GetMonotonicClock().
+PyAPI_FUNC(_PyTime_t) _PyDeadline_Get(_PyTime_t deadline);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* Py_PYTIME_H */
+#endif /* Py_LIMITED_API */
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/setobject.h b/nanvix-port/cpython-headers/python3.12/cpython/setobject.h
new file mode 100644
index 000000000000..20fd63eaae56
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/setobject.h
@@ -0,0 +1,72 @@
+#ifndef Py_CPYTHON_SETOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+/* There are three kinds of entries in the table:
+
+1. Unused:  key == NULL and hash == 0
+2. Dummy:   key == dummy and hash == -1
+3. Active:  key != NULL and key != dummy and hash != -1
+
+The hash field of Unused slots is always zero.
+
+The hash field of Dummy slots is set to -1
+meaning that dummy entries can be detected by
+either entry->key==dummy or by entry->hash==-1.
+*/
+
+#define PySet_MINSIZE 8
+
+typedef struct {
+    PyObject *key;
+    Py_hash_t hash;             /* Cached hash code of the key */
+} setentry;
+
+/* The SetObject data structure is shared by set and frozenset objects.
+
+Invariant for sets:
+ - hash is -1
+
+Invariants for frozensets:
+ - data is immutable.
+ - hash is the hash of the frozenset or -1 if not computed yet.
+
+*/
+
+typedef struct {
+    PyObject_HEAD
+
+    Py_ssize_t fill;            /* Number of active and dummy entries */
+    Py_ssize_t used;            /* Number of active entries */
+
+    /* The table contains mask + 1 slots, and that's a power of 2.
+     * We store the mask instead of the size because the mask is more
+     * frequently needed.
+     */
+    Py_ssize_t mask;
+
+    /* The table points to a fixed-size smalltable for small tables
+     * or to additional malloc'ed memory for bigger tables.
+     * The table pointer is never NULL which saves us from repeated
+     * runtime null-tests.
+     */
+    setentry *table;
+    Py_hash_t hash;             /* Only used by frozenset objects */
+    Py_ssize_t finger;          /* Search finger for pop() */
+
+    setentry smalltable[PySet_MINSIZE];
+    PyObject *weakreflist;      /* List of weak references */
+} PySetObject;
+
+#define _PySet_CAST(so) \
+    (assert(PyAnySet_Check(so)), _Py_CAST(PySetObject*, so))
+
+static inline Py_ssize_t PySet_GET_SIZE(PyObject *so) {
+    return _PySet_CAST(so)->used;
+}
+#define PySet_GET_SIZE(so) PySet_GET_SIZE(_PyObject_CAST(so))
+
+PyAPI_DATA(PyObject *) _PySet_Dummy;
+
+PyAPI_FUNC(int) _PySet_NextEntry(PyObject *set, Py_ssize_t *pos, PyObject **key, Py_hash_t *hash);
+PyAPI_FUNC(int) _PySet_Update(PyObject *set, PyObject *iterable);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/sysmodule.h b/nanvix-port/cpython-headers/python3.12/cpython/sysmodule.h
new file mode 100644
index 000000000000..19d9dddc344a
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/sysmodule.h
@@ -0,0 +1,16 @@
+#ifndef Py_CPYTHON_SYSMODULE_H
+#  error "this header file must not be included directly"
+#endif
+
+PyAPI_FUNC(PyObject *) _PySys_GetAttr(PyThreadState *tstate,
+                                      PyObject *name);
+
+PyAPI_FUNC(size_t) _PySys_GetSizeOf(PyObject *);
+
+typedef int(*Py_AuditHookFunction)(const char *, PyObject *, void *);
+
+PyAPI_FUNC(int) PySys_Audit(
+    const char *event,
+    const char *argFormat,
+    ...);
+PyAPI_FUNC(int) PySys_AddAuditHook(Py_AuditHookFunction, void*);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/traceback.h b/nanvix-port/cpython-headers/python3.12/cpython/traceback.h
new file mode 100644
index 000000000000..a4e087b2b4ec
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/traceback.h
@@ -0,0 +1,16 @@
+#ifndef Py_CPYTHON_TRACEBACK_H
+#  error "this header file must not be included directly"
+#endif
+
+typedef struct _traceback PyTracebackObject;
+
+struct _traceback {
+    PyObject_HEAD
+    PyTracebackObject *tb_next;
+    PyFrameObject *tb_frame;
+    int tb_lasti;
+    int tb_lineno;
+};
+
+PyAPI_FUNC(int) _Py_DisplaySourceLine(PyObject *, PyObject *, int, int, int *, PyObject **);
+PyAPI_FUNC(void) _PyTraceback_Add(const char *, const char *, int);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/tupleobject.h b/nanvix-port/cpython-headers/python3.12/cpython/tupleobject.h
new file mode 100644
index 000000000000..f6a1f076e033
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/tupleobject.h
@@ -0,0 +1,39 @@
+#ifndef Py_CPYTHON_TUPLEOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+typedef struct {
+    PyObject_VAR_HEAD
+    /* ob_item contains space for 'ob_size' elements.
+       Items must normally not be NULL, except during construction when
+       the tuple is not yet visible outside the function that builds it. */
+    PyObject *ob_item[1];
+} PyTupleObject;
+
+PyAPI_FUNC(int) _PyTuple_Resize(PyObject **, Py_ssize_t);
+PyAPI_FUNC(void) _PyTuple_MaybeUntrack(PyObject *);
+
+/* Cast argument to PyTupleObject* type. */
+#define _PyTuple_CAST(op) \
+    (assert(PyTuple_Check(op)), _Py_CAST(PyTupleObject*, (op)))
+
+// Macros and static inline functions, trading safety for speed
+
+static inline Py_ssize_t PyTuple_GET_SIZE(PyObject *op) {
+    PyTupleObject *tuple = _PyTuple_CAST(op);
+    return Py_SIZE(tuple);
+}
+#define PyTuple_GET_SIZE(op) PyTuple_GET_SIZE(_PyObject_CAST(op))
+
+#define PyTuple_GET_ITEM(op, index) (_PyTuple_CAST(op)->ob_item[(index)])
+
+/* Raw store into ob_item[index]; *only* for filling in brand new tuples. */
+static inline void
+PyTuple_SET_ITEM(PyObject *op, Py_ssize_t index, PyObject *value) {
+    PyObject **slots = _PyTuple_CAST(op)->ob_item;
+    slots[index] = value;
+}
+#define PyTuple_SET_ITEM(op, index, value) \
+    PyTuple_SET_ITEM(_PyObject_CAST(op), (index), _PyObject_CAST(value))
+
+PyAPI_FUNC(void) _PyTuple_DebugMallocStats(FILE *out);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/unicodeobject.h b/nanvix-port/cpython-headers/python3.12/cpython/unicodeobject.h
new file mode 100644
index 000000000000..f177cd9e2af9
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/unicodeobject.h
@@ -0,0 +1,963 @@
+#ifndef Py_CPYTHON_UNICODEOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+/* Py_UNICODE was the native Unicode storage format (code unit) used by
+   Python and represents a single Unicode element in the Unicode type.
+   With PEP 393, Py_UNICODE is deprecated and replaced with a
+   typedef to wchar_t. */
+#define PY_UNICODE_TYPE wchar_t
+/* Py_DEPRECATED(3.3) */ typedef wchar_t Py_UNICODE;
+
+/* --- Internal Unicode Operations ---------------------------------------- */
+
+// Static inline functions to work with surrogates
+static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) {
+    return (ch >= 0xD800 && ch <= 0xDFFF);  // any surrogate: U+D800..U+DFFF
+}
+static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) {
+    return (ch >= 0xD800 && ch <= 0xDBFF);  // high half: U+D800..U+DBFF
+}
+static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) {
+    return (ch >= 0xDC00 && ch <= 0xDFFF);  // low half: U+DC00..U+DFFF
+}
+
+// Combine a high/low surrogate pair into the single Py_UCS4 value it encodes.
+static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low) {
+    assert(Py_UNICODE_IS_HIGH_SURROGATE(high));
+    assert(Py_UNICODE_IS_LOW_SURROGATE(low));
+    return (((high & 0x03FF) << 10) | (low & 0x03FF)) + 0x10000;
+}
+
+// High surrogate: 0xD800 plus the top 10 bits of the offset above U+10000.
+// The character must be in the range [U+10000; U+10ffff].
+static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) {
+    assert(0x10000 <= ch && ch <= 0x10ffff);
+    return ((ch >> 10) + 0xD800 - (0x10000 >> 10));
+}
+
+// Low surrogate: 0xDC00 with the bottom 10 bits of ch placed in its low bits.
+// The character must be in the range [U+10000; U+10ffff].
+static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
+    assert(0x10000 <= ch && ch <= 0x10ffff);
+    return (0xDC00 | (ch & 0x3FF));
+}
+
+/* --- Unicode Type ------------------------------------------------------- */
+
+/* ASCII-only strings created through PyUnicode_New use the PyASCIIObject
+   structure. state.ascii and state.compact are set, and the data
+   immediately follow the structure. utf8_length can be found
+   in the length field; the utf8 pointer is equal to the data pointer. */
+typedef struct {
+    /* There are 4 forms of Unicode strings:
+
+       - compact ascii:
+
+         * structure = PyASCIIObject
+         * test: PyUnicode_IS_COMPACT_ASCII(op)
+         * kind = PyUnicode_1BYTE_KIND
+         * compact = 1
+         * ascii = 1
+         * (length is the length of the utf8)
+         * (data starts just after the structure)
+         * (since ASCII is decoded from UTF-8, the utf8 string is the data)
+
+       - compact:
+
+         * structure = PyCompactUnicodeObject
+         * test: PyUnicode_IS_COMPACT(op) && !PyUnicode_IS_ASCII(op)
+         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
+           PyUnicode_4BYTE_KIND
+         * compact = 1
+         * ascii = 0
+         * utf8 is not shared with data
+         * utf8_length = 0 if utf8 is NULL
+         * (data starts just after the structure)
+
+       - legacy string:
+
+         * structure = PyUnicodeObject structure
+         * test: !PyUnicode_IS_COMPACT(op)
+         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
+           PyUnicode_4BYTE_KIND
+         * compact = 0
+         * data.any is not NULL
+         * if ascii = 1, utf8 is shared with data.any and utf8_length = length
+         * utf8_length = 0 if utf8 is NULL
+
+       Compact strings use only one memory block (structure + characters),
+       whereas legacy strings use one block for the structure and one block
+       for characters.
+
+       Legacy strings are created by subclasses of Unicode.
+
+       See also _PyUnicode_CheckConsistency().
+    */
+    PyObject_HEAD
+    Py_ssize_t length;          /* Number of code points in the string */
+    Py_hash_t hash;             /* Hash value; -1 if not set */
+    struct {
+        /* If interned is non-zero, the two references from the
+           dictionary to this object are *not* counted in ob_refcnt.
+           The possible values here are:
+               0: Not Interned
+               1: Interned
+               2: Interned and Immortal
+               3: Interned, Immortal, and Static
+           This categorization allows the runtime to determine the right
+           cleanup mechanism at runtime shutdown. */
+        unsigned int interned:2;
+        /* Character size:
+
+           - PyUnicode_1BYTE_KIND (1):
+
+             * character type = Py_UCS1 (8 bits, unsigned)
+             * all characters are in the range U+0000-U+00FF (latin1)
+             * if ascii is set, all characters are in the range U+0000-U+007F
+               (ASCII), otherwise at least one character is in the range
+               U+0080-U+00FF
+
+           - PyUnicode_2BYTE_KIND (2):
+
+             * character type = Py_UCS2 (16 bits, unsigned)
+             * all characters are in the range U+0000-U+FFFF (BMP)
+             * at least one character is in the range U+0100-U+FFFF
+
+           - PyUnicode_4BYTE_KIND (4):
+
+             * character type = Py_UCS4 (32 bits, unsigned)
+             * all characters are in the range U+0000-U+10FFFF
+             * at least one character is in the range U+10000-U+10FFFF
+         */
+        unsigned int kind:3;
+        /* Compact is with respect to the allocation scheme. Compact unicode
+           objects only require one memory block while non-compact objects use
+           one block for the PyUnicodeObject struct and another for its data
+           buffer. */
+        unsigned int compact:1;
+        /* The string only contains characters in the range U+0000-U+007F (ASCII)
+           and the kind is PyUnicode_1BYTE_KIND. If ascii is set and compact is
+           set, use the PyASCIIObject structure. */
+        unsigned int ascii:1;
+        /* The object is statically allocated. */
+        unsigned int statically_allocated:1;
+        /* Padding to ensure that PyUnicode_DATA() is always aligned to
+           4 bytes (see issue #19537 on m68k). */
+        unsigned int :24;
+    } state;
+} PyASCIIObject;
+
+/* Non-ASCII strings allocated through PyUnicode_New use the
+   PyCompactUnicodeObject structure. state.compact is set, and the data
+   immediately follow the structure. */
+typedef struct {
+    PyASCIIObject _base;
+    Py_ssize_t utf8_length;     /* Number of bytes in utf8, excluding the
+                                 * terminating \0. */
+    char *utf8;                 /* UTF-8 representation (null-terminated) */
+} PyCompactUnicodeObject;
+
+/* Object format for Unicode subclasses. */
+typedef struct {
+    PyCompactUnicodeObject _base;
+    union {
+        void *any;
+        Py_UCS1 *latin1;
+        Py_UCS2 *ucs2;
+        Py_UCS4 *ucs4;
+    } data;                     /* Canonical, smallest-form Unicode buffer */
+} PyUnicodeObject;
+
+PyAPI_FUNC(int) _PyUnicode_CheckConsistency(
+    PyObject *op,
+    int check_content);
+
+
+#define _PyASCIIObject_CAST(op) \
+    (assert(PyUnicode_Check(op)), \
+     _Py_CAST(PyASCIIObject*, (op)))
+#define _PyCompactUnicodeObject_CAST(op) \
+    (assert(PyUnicode_Check(op)), \
+     _Py_CAST(PyCompactUnicodeObject*, (op)))
+#define _PyUnicodeObject_CAST(op) \
+    (assert(PyUnicode_Check(op)), \
+     _Py_CAST(PyUnicodeObject*, (op)))
+
+
+/* --- Flexible String Representation Helper Macros (PEP 393) -------------- */
+
+/* Values for PyASCIIObject.state: */
+
+/* Interning state. */
+#define SSTATE_NOT_INTERNED 0
+#define SSTATE_INTERNED_MORTAL 1
+#define SSTATE_INTERNED_IMMORTAL 2
+#define SSTATE_INTERNED_IMMORTAL_STATIC 3
+
+/* Use only if you know it's a string */
+static inline unsigned int PyUnicode_CHECK_INTERNED(PyObject *op) {
+    return _PyASCIIObject_CAST(op)->state.interned;
+}
+#define PyUnicode_CHECK_INTERNED(op) PyUnicode_CHECK_INTERNED(_PyObject_CAST(op))
+
+/* Kept for backward compatibility: always returns 1. */
+static inline unsigned int PyUnicode_IS_READY(PyObject* Py_UNUSED(op)) {
+    return 1;
+}
+#define PyUnicode_IS_READY(op) PyUnicode_IS_READY(_PyObject_CAST(op))
+
+/* Return true if the string contains only ASCII characters, or 0 if not. The
+   string may be compact (PyUnicode_IS_COMPACT_ASCII) or not, but must be
+   ready. */
+static inline unsigned int PyUnicode_IS_ASCII(PyObject *op) {
+    return _PyASCIIObject_CAST(op)->state.ascii;
+}
+#define PyUnicode_IS_ASCII(op) PyUnicode_IS_ASCII(_PyObject_CAST(op))
+
+/* Return true if the string is compact or 0 if not.
+   No type checks or Ready calls are performed. */
+static inline unsigned int PyUnicode_IS_COMPACT(PyObject *op) {
+    return _PyASCIIObject_CAST(op)->state.compact;
+}
+#define PyUnicode_IS_COMPACT(op) PyUnicode_IS_COMPACT(_PyObject_CAST(op))
+
+/* Return true if the string is both ASCII-only and compact (i.e. it uses the
+   PyASCIIObject structure), or 0 if not.  No type checks or Ready calls. */
+static inline int PyUnicode_IS_COMPACT_ASCII(PyObject *op) {
+    return (PyUnicode_IS_ASCII(op) && PyUnicode_IS_COMPACT(op));
+}
+#define PyUnicode_IS_COMPACT_ASCII(op) PyUnicode_IS_COMPACT_ASCII(_PyObject_CAST(op))
+
+enum PyUnicode_Kind {
+/* Return values of the PyUnicode_KIND() function: */
+    PyUnicode_1BYTE_KIND = 1,
+    PyUnicode_2BYTE_KIND = 2,
+    PyUnicode_4BYTE_KIND = 4
+};
+
+// PyUnicode_KIND(): Return one of the PyUnicode_*_KIND values defined above.
+//
+// gh-89653: Converting this macro to a static inline function would introduce
+// new compiler warnings on "kind < PyUnicode_KIND(str)" (compare signed and
+// unsigned numbers) where kind type is an int or on
+// "unsigned int kind = PyUnicode_KIND(str)" (cast signed to unsigned).
+#define PyUnicode_KIND(op) _Py_RVALUE(_PyASCIIObject_CAST(op)->state.kind)
+
+/* Return the buffer located just past the object header, as a void pointer. */
+static inline void* _PyUnicode_COMPACT_DATA(PyObject *op) {
+    if (!PyUnicode_IS_ASCII(op)) {
+        return _Py_STATIC_CAST(void*, (_PyCompactUnicodeObject_CAST(op) + 1));
+    }
+    return _Py_STATIC_CAST(void*, (_PyASCIIObject_CAST(op) + 1));
+}
+
+static inline void* _PyUnicode_NONCOMPACT_DATA(PyObject *op) {
+    void *data;
+    assert(!PyUnicode_IS_COMPACT(op));
+    data = _PyUnicodeObject_CAST(op)->data.any;
+    assert(data != NULL);
+    return data;
+}
+
+static inline void* PyUnicode_DATA(PyObject *op) {
+    if (!PyUnicode_IS_COMPACT(op)) {
+        return _PyUnicode_NONCOMPACT_DATA(op);  // separately allocated buffer
+    }
+    return _PyUnicode_COMPACT_DATA(op);         // buffer follows the struct
+}
+#define PyUnicode_DATA(op) PyUnicode_DATA(_PyObject_CAST(op))
+
+/* Return pointers to the canonical representation cast to unsigned char,
+   Py_UCS2, or Py_UCS4 for direct character access.
+   No checks are performed, use PyUnicode_KIND() before to ensure
+   these will work correctly. */
+
+#define PyUnicode_1BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS1*, PyUnicode_DATA(op))
+#define PyUnicode_2BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS2*, PyUnicode_DATA(op))
+#define PyUnicode_4BYTE_DATA(op) _Py_STATIC_CAST(Py_UCS4*, PyUnicode_DATA(op))
+
+/* Returns the length of the unicode string. */
+static inline Py_ssize_t PyUnicode_GET_LENGTH(PyObject *op) {
+    return _PyASCIIObject_CAST(op)->length;
+}
+#define PyUnicode_GET_LENGTH(op) PyUnicode_GET_LENGTH(_PyObject_CAST(op))
+
+/* Write into the canonical representation.  Apart from assertions, this
+   function performs no sanity checks and is intended for usage in loops.
+   The caller should cache the kind and data pointers obtained from other
+   function calls.  index is the index in the string (starts at 0) and value
+   is the new code point value which should be written to that location. */
+static inline void PyUnicode_WRITE(int kind, void *data,
+                                   Py_ssize_t index, Py_UCS4 value)
+{
+    assert(index >= 0);
+    if (kind == PyUnicode_1BYTE_KIND) {
+        assert(value <= 0xffU);
+        _Py_STATIC_CAST(Py_UCS1*, data)[index] = _Py_STATIC_CAST(Py_UCS1, value);
+    }
+    else if (kind == PyUnicode_2BYTE_KIND) {
+        assert(value <= 0xffffU);
+        _Py_STATIC_CAST(Py_UCS2*, data)[index] = _Py_STATIC_CAST(Py_UCS2, value);
+    }
+    else {
+        assert(kind == PyUnicode_4BYTE_KIND);
+        assert(value <= 0x10ffffU);
+        _Py_STATIC_CAST(Py_UCS4*, data)[index] = value;
+    }
+}
+#define PyUnicode_WRITE(kind, data, index, value) \
+    PyUnicode_WRITE(_Py_STATIC_CAST(int, kind), _Py_CAST(void*, data), \
+                    (index), _Py_STATIC_CAST(Py_UCS4, value))
+
+/* Read the code point at index from a buffer of the given kind.  Apart from
+   assertions, no checks or ready calls are performed. */
+static inline Py_UCS4 PyUnicode_READ(int kind,
+                                     const void *data, Py_ssize_t index)
+{
+    assert(index >= 0);
+    if (kind == PyUnicode_1BYTE_KIND) {
+        return _Py_STATIC_CAST(const Py_UCS1*, data)[index];
+    }
+    if (kind == PyUnicode_2BYTE_KIND) {
+        return _Py_STATIC_CAST(const Py_UCS2*, data)[index];
+    }
+    assert(kind == PyUnicode_4BYTE_KIND);
+    return _Py_STATIC_CAST(const Py_UCS4*, data)[index];
+}
+#define PyUnicode_READ(kind, data, index) \
+    PyUnicode_READ(_Py_STATIC_CAST(int, kind), \
+                   _Py_STATIC_CAST(const void*, data), \
+                   (index))
+
+/* PyUnicode_READ_CHAR() is less efficient than PyUnicode_READ() because it
+   fetches the kind and the data pointer on every call.  For single reads,
+   use PyUnicode_READ_CHAR; for multiple consecutive reads, callers should
+   cache kind and data and use PyUnicode_READ instead. */
+static inline Py_UCS4 PyUnicode_READ_CHAR(PyObject *unicode, Py_ssize_t index)
+{
+    int kind;
+
+    assert(index >= 0);
+    // Tolerate reading the NUL character at str[len(str)]
+    assert(index <= PyUnicode_GET_LENGTH(unicode));
+
+    kind = PyUnicode_KIND(unicode);
+    if (kind == PyUnicode_1BYTE_KIND) {
+        return PyUnicode_1BYTE_DATA(unicode)[index];
+    }
+    if (kind == PyUnicode_2BYTE_KIND) {
+        return PyUnicode_2BYTE_DATA(unicode)[index];
+    }
+    assert(kind == PyUnicode_4BYTE_KIND);
+    return PyUnicode_4BYTE_DATA(unicode)[index];
+}
+#define PyUnicode_READ_CHAR(unicode, index) \
+    PyUnicode_READ_CHAR(_PyObject_CAST(unicode), (index))
+
+/* Return an upper bound on the code points in op, suitable as the maxchar
+   of a new string derived from op.  The bound comes from the ascii flag and
+   the kind alone: an approximation, but cheaper than scanning the string. */
+static inline Py_UCS4 PyUnicode_MAX_CHAR_VALUE(PyObject *op)
+{
+    int kind;
+
+    if (PyUnicode_IS_ASCII(op)) {
+        return 0x7fU;           // ASCII
+    }
+
+    kind = PyUnicode_KIND(op);
+    if (kind == PyUnicode_1BYTE_KIND) {
+        return 0xffU;           // latin1
+    }
+    if (kind == PyUnicode_2BYTE_KIND) {
+        return 0xffffU;         // BMP
+    }
+    assert(kind == PyUnicode_4BYTE_KIND);
+    return 0x10ffffU;           // full Unicode range
+}
+#define PyUnicode_MAX_CHAR_VALUE(op) \
+    PyUnicode_MAX_CHAR_VALUE(_PyObject_CAST(op))
+
+/* === Public API ========================================================= */
+
+/* --- Plain Py_UNICODE --------------------------------------------------- */
+
+/* With PEP 393, this is the recommended way to allocate a new unicode object.
+   This function will allocate the object and its buffer in a single memory
+   block.  Objects created using this function are not resizable. */
+PyAPI_FUNC(PyObject*) PyUnicode_New(
+    Py_ssize_t size,            /* Number of code points in the new string */
+    Py_UCS4 maxchar             /* maximum code point value in the string */
+    );
+
+/* Kept for backward compatibility: does nothing and always returns 0. */
+static inline int PyUnicode_READY(PyObject* Py_UNUSED(op))
+{
+    return 0;
+}
+#define PyUnicode_READY(op) PyUnicode_READY(_PyObject_CAST(op))
+
+/* Get a copy of a Unicode string. */
+PyAPI_FUNC(PyObject*) _PyUnicode_Copy(
+    PyObject *unicode
+    );
+
+/* Copy characters from one unicode object into another; this function performs
+   character conversion when necessary and falls back to memcpy() if possible.
+
+   Fail if to is too small (smaller than *how_many* or smaller than
+   len(from)-from_start), or if kind(from[from_start:from_start+how_many]) >
+   kind(to), or if *to* has more than 1 reference.
+
+   Return the number of characters written, or return -1 and raise an exception
+   on error.
+
+   Pseudo-code:
+
+       how_many = min(how_many, len(from) - from_start)
+       to[to_start:to_start+how_many] = from[from_start:from_start+how_many]
+       return how_many
+
+   Note: The function doesn't write a terminating null character.
+   */
+PyAPI_FUNC(Py_ssize_t) PyUnicode_CopyCharacters(
+    PyObject *to,
+    Py_ssize_t to_start,
+    PyObject *from,
+    Py_ssize_t from_start,
+    Py_ssize_t how_many
+    );
+
+/* Unsafe version of PyUnicode_CopyCharacters(): don't check arguments and so
+   may crash if parameters are invalid (e.g. if the output string
+   is too short). */
+PyAPI_FUNC(void) _PyUnicode_FastCopyCharacters(
+    PyObject *to,
+    Py_ssize_t to_start,
+    PyObject *from,
+    Py_ssize_t from_start,
+    Py_ssize_t how_many
+    );
+
+/* Fill a string with a character: write fill_char into
+   unicode[start:start+length].
+
+   Fail if fill_char is bigger than the string maximum character, or if the
+   string has more than 1 reference.
+
+   Return the number of characters written, or return -1 and raise an exception
+   on error. */
+PyAPI_FUNC(Py_ssize_t) PyUnicode_Fill(
+    PyObject *unicode,
+    Py_ssize_t start,
+    Py_ssize_t length,
+    Py_UCS4 fill_char
+    );
+
+/* Unsafe version of PyUnicode_Fill(): don't check arguments and so may crash
+   if parameters are invalid (e.g. if length is longer than the string). */
+PyAPI_FUNC(void) _PyUnicode_FastFill(
+    PyObject *unicode,
+    Py_ssize_t start,
+    Py_ssize_t length,
+    Py_UCS4 fill_char
+    );
+
+/* Create a new string from a buffer of Py_UCS1, Py_UCS2 or Py_UCS4 characters.
+   Scan the string to find the maximum character. */
+PyAPI_FUNC(PyObject*) PyUnicode_FromKindAndData(
+    int kind,
+    const void *buffer,
+    Py_ssize_t size);
+
+/* Create a new string from a buffer of ASCII characters.
+   WARNING: Don't check if the string contains any non-ASCII character. */
+PyAPI_FUNC(PyObject*) _PyUnicode_FromASCII(
+    const char *buffer,
+    Py_ssize_t size);
+
+/* Compute the maximum character of the substring unicode[start:end].
+   Return 127 for an empty string. */
+PyAPI_FUNC(Py_UCS4) _PyUnicode_FindMaxChar (
+    PyObject *unicode,
+    Py_ssize_t start,
+    Py_ssize_t end);
+
+/* --- _PyUnicodeWriter API ----------------------------------------------- */
+
+typedef struct {
+    PyObject *buffer;
+    void *data;
+    int kind;
+    Py_UCS4 maxchar;
+    Py_ssize_t size;
+    Py_ssize_t pos;
+
+    /* minimum number of allocated characters (default: 0) */
+    Py_ssize_t min_length;
+
+    /* minimum character (default: 127, ASCII) */
+    Py_UCS4 min_char;
+
+    /* If non-zero, overallocate the buffer (default: 0). */
+    unsigned char overallocate;
+
+    /* If readonly is 1, buffer is a shared string (cannot be modified)
+       and size is set to 0. */
+    unsigned char readonly;
+} _PyUnicodeWriter ;
+
+/* Initialize a Unicode writer.
+ *
+ * By default, the minimum buffer size is 0 characters and overallocation is
+ * disabled. Set min_length, min_char and overallocate attributes to control
+ * the allocation of the buffer. */
+PyAPI_FUNC(void)
+_PyUnicodeWriter_Init(_PyUnicodeWriter *writer);
+
+/* Prepare the buffer to write 'length' characters
+   with the specified maximum character.
+
+   Return 0 on success, raise an exception and return -1 on error. */
+#define _PyUnicodeWriter_Prepare(WRITER, LENGTH, MAXCHAR)             \
+    (((MAXCHAR) <= (WRITER)->maxchar                                  \
+      && (LENGTH) <= (WRITER)->size - (WRITER)->pos)                  \
+     ? 0                                                              \
+     : (((LENGTH) == 0)                                               \
+        ? 0                                                           \
+        : _PyUnicodeWriter_PrepareInternal((WRITER), (LENGTH), (MAXCHAR))))
+
+/* Don't call this function directly, use the _PyUnicodeWriter_Prepare() macro
+   instead. */
+PyAPI_FUNC(int)
+_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
+                                 Py_ssize_t length, Py_UCS4 maxchar);
+
+/* Prepare the buffer to have at least the kind KIND.
+   For example, kind=PyUnicode_2BYTE_KIND ensures that the writer will
+   support characters in range U+0000-U+FFFF.
+
+   Return 0 on success, raise an exception and return -1 on error. */
+#define _PyUnicodeWriter_PrepareKind(WRITER, KIND)                    \
+    ((KIND) <= (WRITER)->kind                                         \
+     ? 0                                                              \
+     : _PyUnicodeWriter_PrepareKindInternal((WRITER), (KIND)))
+
+/* Don't call this function directly, use the _PyUnicodeWriter_PrepareKind()
+   macro instead. */
+PyAPI_FUNC(int)
+_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
+                                     int kind);
+
+/* Append a Unicode character.
+   Return 0 on success, raise an exception and return -1 on error. */
+PyAPI_FUNC(int)
+_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer,
+    Py_UCS4 ch
+    );
+
+/* Append a Unicode string.
+   Return 0 on success, raise an exception and return -1 on error. */
+PyAPI_FUNC(int)
+_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer,
+    PyObject *str               /* Unicode string */
+    );
+
+/* Append a substring of a Unicode string.
+   Return 0 on success, raise an exception and return -1 on error. */
+PyAPI_FUNC(int)
+_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer,
+    PyObject *str,              /* Unicode string */
+    Py_ssize_t start,
+    Py_ssize_t end
+    );
+
+/* Append an ASCII-encoded byte string.
+   Return 0 on success, raise an exception and return -1 on error. */
+PyAPI_FUNC(int)
+_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
+    const char *str,           /* ASCII-encoded byte string */
+    Py_ssize_t len             /* number of bytes, or -1 if unknown */
+    );
+
+/* Append a latin1-encoded byte string.
+   Return 0 on success, raise an exception and return -1 on error. */
+PyAPI_FUNC(int)
+_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
+    const char *str,           /* latin1-encoded byte string */
+    Py_ssize_t len             /* length in bytes */
+    );
+
+/* Get the value of the writer as a Unicode string. Clear the
+   buffer of the writer. Raise an exception and return NULL
+   on error. */
+PyAPI_FUNC(PyObject *)
+_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer);
+
+/* Deallocate memory of a writer (clear its internal buffer). */
+PyAPI_FUNC(void)
+_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer);
+
+
+/* Format the object based on the format_spec, as defined in PEP 3101
+   (Advanced String Formatting). */
+PyAPI_FUNC(int) _PyUnicode_FormatAdvancedWriter(
+    _PyUnicodeWriter *writer,
+    PyObject *obj,
+    PyObject *format_spec,
+    Py_ssize_t start,
+    Py_ssize_t end);
+
+/* --- Manage the default encoding ---------------------------------------- */
+
+/* Returns a pointer to the default encoding (UTF-8) of the
+   Unicode object unicode.
+
+   Like PyUnicode_AsUTF8AndSize(), this also caches the UTF-8 representation
+   in the unicodeobject.
+
+   _PyUnicode_AsString is a #define for PyUnicode_AsUTF8 to
+   support the previous internal function with the same behaviour.
+
+   Use of this API is DEPRECATED since no size information can be
+   extracted from the returned data.
+*/
+
+PyAPI_FUNC(const char *) PyUnicode_AsUTF8(PyObject *unicode);
+
+#define _PyUnicode_AsString PyUnicode_AsUTF8
+
+/* --- UTF-7 Codecs ------------------------------------------------------- */
+
+PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF7(
+    PyObject *unicode,          /* Unicode object */
+    int base64SetO,             /* Encode RFC2152 Set O characters in base64 */
+    int base64WhiteSpace,       /* Encode whitespace (sp, ht, nl, cr) in base64 */
+    const char *errors          /* error handling */
+    );
+
+/* --- UTF-8 Codecs ------------------------------------------------------- */
+
+PyAPI_FUNC(PyObject*) _PyUnicode_AsUTF8String(
+    PyObject *unicode,
+    const char *errors);
+
+/* --- UTF-32 Codecs ------------------------------------------------------ */
+
+PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF32(
+    PyObject *object,           /* Unicode object */
+    const char *errors,         /* error handling */
+    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
+    );
+
+/* --- UTF-16 Codecs ------------------------------------------------------ */
+
+/* Returns a Python string object holding the UTF-16 encoded value of
+   the Unicode data.
+
+   The output is written according to the following
+   byte order:
+
+   byteorder == -1: little endian
+   byteorder == 0:  native byte order (writes a BOM mark)
+   byteorder == 1:  big endian
+
+   If byteorder is 0, the output string will always start with the
+   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
+   prepended.
+*/
+PyAPI_FUNC(PyObject*) _PyUnicode_EncodeUTF16(
+    PyObject* unicode,          /* Unicode object */
+    const char *errors,         /* error handling */
+    int byteorder               /* byteorder to use 0=BOM+native;-1=LE,1=BE */
+    );
+
+/* --- Unicode-Escape Codecs ---------------------------------------------- */
+
+/* Variant of PyUnicode_DecodeUnicodeEscape that supports partial decoding. */
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeStateful(
+        const char *string,     /* Unicode-Escape encoded string */
+        Py_ssize_t length,      /* size of string */
+        const char *errors,     /* error handling */
+        Py_ssize_t *consumed    /* bytes consumed */
+);
+/* Helper for PyUnicode_DecodeUnicodeEscape that detects invalid escape
+   chars. */
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
+        const char *string,     /* Unicode-Escape encoded string */
+        Py_ssize_t length,      /* size of string */
+        const char *errors,     /* error handling */
+        Py_ssize_t *consumed,   /* bytes consumed */
+        const char **first_invalid_escape  /* on return, points to first
+                                              invalid escaped char in
+                                              string. */
+);
+
+/* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
+
+/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. */
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
+        const char *string,     /* Raw-Unicode-Escape encoded string */
+        Py_ssize_t length,      /* size of string */
+        const char *errors,     /* error handling */
+        Py_ssize_t *consumed    /* bytes consumed */
+);
+
+/* --- Latin-1 Codecs ----------------------------------------------------- */
+
+PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
+    PyObject* unicode,
+    const char* errors);
+
+/* --- ASCII Codecs ------------------------------------------------------- */
+
+PyAPI_FUNC(PyObject*) _PyUnicode_AsASCIIString(
+    PyObject* unicode,
+    const char* errors);
+
+/* --- Character Map Codecs ----------------------------------------------- */
+
+/* Translate a Unicode object by applying a character mapping table to
+   it and return the resulting Unicode object.
+
+   The mapping table must map Unicode ordinal integers to Unicode strings,
+   Unicode ordinal integers or None (causing deletion of the character).
+
+   Mapping tables may be dictionaries or sequences. Unmapped character
+   ordinals (ones which cause a LookupError) are left untouched and
+   are copied as-is.
+*/
+PyAPI_FUNC(PyObject*) _PyUnicode_EncodeCharmap(
+    PyObject *unicode,          /* Unicode object */
+    PyObject *mapping,          /* encoding mapping */
+    const char *errors          /* error handling */
+    );
+
+/* --- Decimal Encoder ---------------------------------------------------- */
+
+/* Converts a Unicode object holding a decimal value to an ASCII string
+   for using in int, float and complex parsers.
+   Transforms code points that have decimal digit property to the
+   corresponding ASCII digit code points.  Transforms spaces to ASCII.
+   Transforms code points starting from the first non-ASCII code point that
+   is neither a decimal digit nor a space to the end into '?'. */
+
+PyAPI_FUNC(PyObject*) _PyUnicode_TransformDecimalAndSpaceToASCII(
+    PyObject *unicode           /* Unicode object */
+    );
+
+/* --- Methods & Slots ---------------------------------------------------- */
+
+PyAPI_FUNC(PyObject *) _PyUnicode_JoinArray(
+    PyObject *separator,
+    PyObject *const *items,
+    Py_ssize_t seqlen
+    );
+
+/* Test whether a unicode is equal to an ASCII identifier.  Return 1 if true,
+   0 otherwise.  The right argument must be an ASCII identifier.
+   Any error that occurs inside is cleared before returning. */
+PyAPI_FUNC(int) _PyUnicode_EqualToASCIIId(
+    PyObject *left,             /* Left string */
+    _Py_Identifier *right       /* Right identifier */
+    );
+
+/* Test whether a unicode is equal to an ASCII string.  Return 1 if true,
+   0 otherwise.  The right argument must be an ASCII-encoded string.
+   Any error that occurs inside is cleared before returning. */
+PyAPI_FUNC(int) _PyUnicode_EqualToASCIIString(
+    PyObject *left,
+    const char *right           /* ASCII-encoded string */
+    );
+
+/* Externally visible for str.strip(unicode) */
+PyAPI_FUNC(PyObject *) _PyUnicode_XStrip(
+    PyObject *self,
+    int striptype,
+    PyObject *sepobj
+    );
+
+/* Using explicit passed-in values, insert the thousands grouping
+   into the string pointed to by buffer.  For the argument descriptions,
+   see Objects/stringlib/localeutil.h */
+PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(
+    _PyUnicodeWriter *writer,
+    Py_ssize_t n_buffer,
+    PyObject *digits,
+    Py_ssize_t d_pos,
+    Py_ssize_t n_digits,
+    Py_ssize_t min_width,
+    const char *grouping,
+    PyObject *thousands_sep,
+    Py_UCS4 *maxchar);
+
+/* === Characters Type APIs =============================================== */
+
+/* These should not be used directly. Use the Py_UNICODE_IS* and
+   Py_UNICODE_TO* macros instead.
+
+   These APIs are implemented in Objects/unicodectype.c.
+
+*/
+
+PyAPI_FUNC(int) _PyUnicode_IsLowercase(
+    Py_UCS4 ch       /* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_IsUppercase(
+    Py_UCS4 ch       /* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_IsTitlecase(
+    Py_UCS4 ch       /* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_IsXidStart(
+    Py_UCS4 ch       /* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_IsXidContinue(
+    Py_UCS4 ch       /* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_IsWhitespace(
+    const Py_UCS4 ch         /* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_IsLinebreak(
+    const Py_UCS4 ch         /* Unicode character */
+    );
+
+/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToLowercase(
+    Py_UCS4 ch       /* Unicode character */
+    );
+
+/* Py_DEPRECATED(3.3) */ PyAPI_FUNC(Py_UCS4) _PyUnicode_ToUppercase(
+    Py_UCS4 ch       /* Unicode character */
+    );
+
+Py_DEPRECATED(3.3) PyAPI_FUNC(Py_UCS4) _PyUnicode_ToTitlecase(
+    Py_UCS4 ch       /* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_ToLowerFull(
+    Py_UCS4 ch,       /* Unicode character */
+    Py_UCS4 *res
+    );
+
+PyAPI_FUNC(int) _PyUnicode_ToTitleFull(
+    Py_UCS4 ch,       /* Unicode character */
+    Py_UCS4 *res
+    );
+
+PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
+    Py_UCS4 ch,       /* Unicode character */
+    Py_UCS4 *res
+    );
+
+PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
+    Py_UCS4 ch,       /* Unicode character */
+    Py_UCS4 *res
+    );
+
+PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
+    Py_UCS4 ch         /* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_IsCased(
+    Py_UCS4 ch         /* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_ToDecimalDigit(
+    Py_UCS4 ch       /* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_ToDigit(
+    Py_UCS4 ch       /* Unicode character */
+    );
+
+PyAPI_FUNC(double) _PyUnicode_ToNumeric(
+    Py_UCS4 ch       /* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_IsDecimalDigit(
+    Py_UCS4 ch       /* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_IsDigit(
+    Py_UCS4 ch       /* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_IsNumeric(
+    Py_UCS4 ch       /* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_IsPrintable(
+    Py_UCS4 ch       /* Unicode character */
+    );
+
+PyAPI_FUNC(int) _PyUnicode_IsAlpha(
+    Py_UCS4 ch       /* Unicode character */
+    );
+
+// Helper array used by Py_UNICODE_ISSPACE().
+PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];
+
+// Whitespace splitting is a hot path and the whitespace involved is
+// almost always plain ASCII, so code points below 128 are answered by
+// a direct lookup in the _Py_ascii_whitespace table (declared above);
+// only other code points go through _PyUnicode_IsWhitespace().
+static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) {
+    // ASCII range: table lookup.  Everything else: the generic
+    // whitespace predicate.
+    return (ch < 128) ? _Py_ascii_whitespace[ch]
+                      : _PyUnicode_IsWhitespace(ch);
+}
+
+#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
+#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
+#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
+#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)
+
+#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
+#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
+#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)
+
+#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
+#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
+#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
+#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
+
+#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
+#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
+#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
+
+#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
+
+static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) {
+    // Alphanumeric: alphabetic, decimal, digit or numeric (short-circuits).
+    int alpha_or_decimal = Py_UNICODE_ISALPHA(ch) || Py_UNICODE_ISDECIMAL(ch);
+    return (alpha_or_decimal
+            || Py_UNICODE_ISDIGIT(ch) || Py_UNICODE_ISNUMERIC(ch));
+}
+
+
+/* === Misc functions ===================================================== */
+
+PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);
+
+/* Return an interned Unicode object for an Identifier; may fail if there is no memory. */
+PyAPI_FUNC(PyObject*) _PyUnicode_FromId(_Py_Identifier*);
+
+/* Fast equality check when the inputs are known to be exact unicode types
+   and where the hash values are equal (i.e. a very probable match) */
+PyAPI_FUNC(int) _PyUnicode_EQ(PyObject *, PyObject *);
+
+/* Equality check. */
+PyAPI_FUNC(int) _PyUnicode_Equal(PyObject *, PyObject *);
+
+PyAPI_FUNC(int) _PyUnicode_WideCharString_Converter(PyObject *, void *);
+PyAPI_FUNC(int) _PyUnicode_WideCharString_Opt_Converter(PyObject *, void *);
+
+PyAPI_FUNC(Py_ssize_t) _PyUnicode_ScanIdentifier(PyObject *);
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/warnings.h b/nanvix-port/cpython-headers/python3.12/cpython/warnings.h
new file mode 100644
index 000000000000..4e3eb88e8ff4
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/warnings.h
@@ -0,0 +1,20 @@
+#ifndef Py_CPYTHON_WARNINGS_H
+#  error "this header file must not be included directly"
+#endif
+
+PyAPI_FUNC(int) PyErr_WarnExplicitObject(
+    PyObject *category,
+    PyObject *message,
+    PyObject *filename,
+    int lineno,
+    PyObject *module,
+    PyObject *registry);
+
+PyAPI_FUNC(int) PyErr_WarnExplicitFormat(
+    PyObject *category,
+    const char *filename, int lineno,
+    const char *module, PyObject *registry,
+    const char *format, ...);
+
+// DEPRECATED: Use PyErr_WarnEx() instead.
+#define PyErr_Warn(category, msg) PyErr_WarnEx((category), (msg), 1)
diff --git a/nanvix-port/cpython-headers/python3.12/cpython/weakrefobject.h b/nanvix-port/cpython-headers/python3.12/cpython/weakrefobject.h
new file mode 100644
index 000000000000..fd79fdc2dcc4
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/cpython/weakrefobject.h
@@ -0,0 +1,56 @@
+#ifndef Py_CPYTHON_WEAKREFOBJECT_H
+#  error "this header file must not be included directly"
+#endif
+
+/* PyWeakReference is the base struct for the Python ReferenceType, ProxyType,
+ * and CallableProxyType.
+ */
+struct _PyWeakReference {
+    PyObject_HEAD
+
+    /* The object to which this is a weak reference, or Py_None if none.
+     * Note that this is a stealth reference:  wr_object's refcount is
+     * not incremented to reflect this pointer.
+     */
+    PyObject *wr_object;
+
+    /* A callable to invoke when wr_object dies, or NULL if none. */
+    PyObject *wr_callback;
+
+    /* A cache for wr_object's hash code.  As usual for hashes, this is -1
+     * if the hash code isn't known yet.
+     */
+    Py_hash_t hash;
+
+    /* If wr_object is weakly referenced, wr_object has a doubly-linked NULL-
+     * terminated list of weak references to it.  These are the list pointers.
+     * If wr_object goes away, wr_object is set to Py_None, and these pointers
+     * have no meaning then.
+     */
+    PyWeakReference *wr_prev;
+    PyWeakReference *wr_next;
+    vectorcallfunc vectorcall;
+};
+
+PyAPI_FUNC(Py_ssize_t) _PyWeakref_GetWeakrefCount(PyWeakReference *head);
+
+PyAPI_FUNC(void) _PyWeakref_ClearRef(PyWeakReference *self);
+
+static inline PyObject* PyWeakref_GET_OBJECT(PyObject *ref_obj) {
+    PyWeakReference *ref;
+    PyObject *obj;
+    assert(PyWeakref_Check(ref_obj));  // only checked when assertions are enabled
+    ref = _Py_CAST(PyWeakReference*, ref_obj);
+    obj = ref->wr_object;
+    // Explanation for the Py_REFCNT() check: when a weakref's target is part
+    // of a long chain of deallocations which triggers the trashcan mechanism,
+    // clearing the weakrefs can be delayed long after the target's refcount
+    // has dropped to zero.  In the meantime, code accessing the weakref will
+    // be able to "see" the target object even though it is supposed to be
+    // unreachable.  See issue gh-60806.
+    if (Py_REFCNT(obj) > 0) {
+        return obj;  // returned as-is: no Py_INCREF is performed here
+    }
+    return Py_None;  // refcount is not positive: report the target as gone
+}
+#define PyWeakref_GET_OBJECT(ref) PyWeakref_GET_OBJECT(_PyObject_CAST(ref))
diff --git a/nanvix-port/cpython-headers/python3.12/datetime.h b/nanvix-port/cpython-headers/python3.12/datetime.h
new file mode 100644
index 000000000000..b78cc0e8e2e5
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/datetime.h
@@ -0,0 +1,267 @@
+/*  datetime.h
+ */
+#ifndef Py_LIMITED_API
+#ifndef DATETIME_H
+#define DATETIME_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Fields are packed into successive bytes, each viewed as unsigned and
+ * big-endian, unless otherwise noted:
+ *
+ * byte offset
+ *  0           year     2 bytes, 1-9999
+ *  2           month    1 byte, 1-12
+ *  3           day      1 byte, 1-31
+ *  4           hour     1 byte, 0-23
+ *  5           minute   1 byte, 0-59
+ *  6           second   1 byte, 0-59
+ *  7           usecond  3 bytes, 0-999999
+ * 10
+ */
+
+/* # of bytes for year, month, and day. */
+#define _PyDateTime_DATE_DATASIZE 4
+
+/* # of bytes for hour, minute, second, and usecond. */
+#define _PyDateTime_TIME_DATASIZE 6
+
+/* # of bytes for year, month, day, hour, minute, second, and usecond. */
+#define _PyDateTime_DATETIME_DATASIZE 10
+
+
+typedef struct
+{
+    PyObject_HEAD
+    Py_hash_t hashcode;         /* -1 when unknown */
+    int days;                   /* -MAX_DELTA_DAYS <= days <= MAX_DELTA_DAYS */
+    int seconds;                /* 0 <= seconds < 24*3600 is invariant */
+    int microseconds;           /* 0 <= microseconds < 1000000 is invariant */
+} PyDateTime_Delta;
+
+typedef struct
+{
+    PyObject_HEAD               /* a pure abstract base class */
+} PyDateTime_TZInfo;
+
+
+/* The datetime and time types have hashcodes, and an optional tzinfo member,
+ * present if and only if hastzinfo is true.
+ */
+#define _PyTZINFO_HEAD          \
+    PyObject_HEAD               \
+    Py_hash_t hashcode;         \
+    char hastzinfo;             /* boolean flag */
+
+/* No _PyDateTime_BaseTZInfo is allocated; it's just to have something
+ * convenient to cast to, when getting at the hastzinfo member of objects
+ * starting with _PyTZINFO_HEAD.
+ */
+typedef struct
+{
+    _PyTZINFO_HEAD
+} _PyDateTime_BaseTZInfo;
+
+/* All time objects are of PyDateTime_TimeType, but that can be allocated
+ * in two ways, with or without a tzinfo member.  Without is the same as
+ * tzinfo == None, but consumes less memory.  _PyDateTime_BaseTime is an
+ * internal struct used to allocate the right amount of space for the
+ * "without" case.
+ */
+#define _PyDateTime_TIMEHEAD    \
+    _PyTZINFO_HEAD              \
+    unsigned char data[_PyDateTime_TIME_DATASIZE];
+
+typedef struct
+{
+    _PyDateTime_TIMEHEAD
+} _PyDateTime_BaseTime;         /* hastzinfo false */
+
+typedef struct
+{
+    _PyDateTime_TIMEHEAD
+    unsigned char fold;
+    PyObject *tzinfo;
+} PyDateTime_Time;              /* hastzinfo true */
+
+
+/* All datetime objects are of PyDateTime_DateTimeType, but that can be
+ * allocated in two ways too, just like for time objects above.  In addition,
+ * the plain date type is a base class for datetime, so it must also have
+ * a hastzinfo member (although it's unused there).
+ */
+typedef struct
+{
+    _PyTZINFO_HEAD
+    unsigned char data[_PyDateTime_DATE_DATASIZE];
+} PyDateTime_Date;
+
+#define _PyDateTime_DATETIMEHEAD        \
+    _PyTZINFO_HEAD                      \
+    unsigned char data[_PyDateTime_DATETIME_DATASIZE];
+
+typedef struct
+{
+    _PyDateTime_DATETIMEHEAD
+} _PyDateTime_BaseDateTime;     /* hastzinfo false */
+
+typedef struct
+{
+    _PyDateTime_DATETIMEHEAD
+    unsigned char fold;
+    PyObject *tzinfo;
+} PyDateTime_DateTime;          /* hastzinfo true */
+
+
+/* Apply for date and datetime instances. */
+
+// o is a pointer to a time or a datetime object.
+#define _PyDateTime_HAS_TZINFO(o)  (((_PyDateTime_BaseTZInfo *)(o))->hastzinfo)
+
+#define PyDateTime_GET_YEAR(o)     ((((PyDateTime_Date*)(o))->data[0] << 8) | \
+                     ((PyDateTime_Date*)(o))->data[1])
+#define PyDateTime_GET_MONTH(o)    (((PyDateTime_Date*)(o))->data[2])
+#define PyDateTime_GET_DAY(o)      (((PyDateTime_Date*)(o))->data[3])
+
+#define PyDateTime_DATE_GET_HOUR(o)        (((PyDateTime_DateTime*)(o))->data[4])
+#define PyDateTime_DATE_GET_MINUTE(o)      (((PyDateTime_DateTime*)(o))->data[5])
+#define PyDateTime_DATE_GET_SECOND(o)      (((PyDateTime_DateTime*)(o))->data[6])
+#define PyDateTime_DATE_GET_MICROSECOND(o)              \
+    ((((PyDateTime_DateTime*)(o))->data[7] << 16) |       \
+     (((PyDateTime_DateTime*)(o))->data[8] << 8)  |       \
+      ((PyDateTime_DateTime*)(o))->data[9])
+#define PyDateTime_DATE_GET_FOLD(o)        (((PyDateTime_DateTime*)(o))->fold)
+#define PyDateTime_DATE_GET_TZINFO(o)      (_PyDateTime_HAS_TZINFO((o)) ? \
+    ((PyDateTime_DateTime *)(o))->tzinfo : Py_None)
+
+/* Apply for time instances. */
+#define PyDateTime_TIME_GET_HOUR(o)        (((PyDateTime_Time*)(o))->data[0])
+#define PyDateTime_TIME_GET_MINUTE(o)      (((PyDateTime_Time*)(o))->data[1])
+#define PyDateTime_TIME_GET_SECOND(o)      (((PyDateTime_Time*)(o))->data[2])
+#define PyDateTime_TIME_GET_MICROSECOND(o)              \
+    ((((PyDateTime_Time*)(o))->data[3] << 16) |           \
+     (((PyDateTime_Time*)(o))->data[4] << 8)  |           \
+      ((PyDateTime_Time*)(o))->data[5])
+#define PyDateTime_TIME_GET_FOLD(o)        (((PyDateTime_Time*)(o))->fold)
+#define PyDateTime_TIME_GET_TZINFO(o)      (_PyDateTime_HAS_TZINFO(o) ? \
+    ((PyDateTime_Time *)(o))->tzinfo : Py_None)
+
+/* Apply for time delta instances */
+#define PyDateTime_DELTA_GET_DAYS(o)         (((PyDateTime_Delta*)(o))->days)
+#define PyDateTime_DELTA_GET_SECONDS(o)      (((PyDateTime_Delta*)(o))->seconds)
+#define PyDateTime_DELTA_GET_MICROSECONDS(o)            \
+    (((PyDateTime_Delta*)(o))->microseconds)
+
+
+/* Define structure for C API. */
+typedef struct {
+    /* type objects */
+    PyTypeObject *DateType;
+    PyTypeObject *DateTimeType;
+    PyTypeObject *TimeType;
+    PyTypeObject *DeltaType;
+    PyTypeObject *TZInfoType;
+
+    /* singletons */
+    PyObject *TimeZone_UTC;
+
+    /* constructors */
+    PyObject *(*Date_FromDate)(int, int, int, PyTypeObject*);
+    PyObject *(*DateTime_FromDateAndTime)(int, int, int, int, int, int, int,
+        PyObject*, PyTypeObject*);
+    PyObject *(*Time_FromTime)(int, int, int, int, PyObject*, PyTypeObject*);
+    PyObject *(*Delta_FromDelta)(int, int, int, int, PyTypeObject*);
+    PyObject *(*TimeZone_FromTimeZone)(PyObject *offset, PyObject *name);
+
+    /* constructors for the DB API */
+    PyObject *(*DateTime_FromTimestamp)(PyObject*, PyObject*, PyObject*);
+    PyObject *(*Date_FromTimestamp)(PyObject*, PyObject*);
+
+    /* PEP 495 constructors */
+    PyObject *(*DateTime_FromDateAndTimeAndFold)(int, int, int, int, int, int, int,
+        PyObject*, int, PyTypeObject*);
+    PyObject *(*Time_FromTimeAndFold)(int, int, int, int, PyObject*, int, PyTypeObject*);
+
+} PyDateTime_CAPI;
+
+#define PyDateTime_CAPSULE_NAME "datetime.datetime_CAPI"
+
+
+/* This block is only used as part of the public API and should not be
+ * included in _datetimemodule.c, which does not use the C API capsule.
+ * See bpo-35081 for more details.
+ * */
+#ifndef _PY_DATETIME_IMPL
+/* Define global variable for the C API and a macro for setting it. */
+static PyDateTime_CAPI *PyDateTimeAPI = NULL;
+
+#define PyDateTime_IMPORT \
+    PyDateTimeAPI = (PyDateTime_CAPI *)PyCapsule_Import(PyDateTime_CAPSULE_NAME, 0)
+
+/* Macro for access to the UTC singleton */
+#define PyDateTime_TimeZone_UTC PyDateTimeAPI->TimeZone_UTC
+
+/* Macros for type checking when not building the Python core. */
+#define PyDate_Check(op) PyObject_TypeCheck((op), PyDateTimeAPI->DateType)
+#define PyDate_CheckExact(op) Py_IS_TYPE((op), PyDateTimeAPI->DateType)
+
+#define PyDateTime_Check(op) PyObject_TypeCheck((op), PyDateTimeAPI->DateTimeType)
+#define PyDateTime_CheckExact(op) Py_IS_TYPE((op), PyDateTimeAPI->DateTimeType)
+
+#define PyTime_Check(op) PyObject_TypeCheck((op), PyDateTimeAPI->TimeType)
+#define PyTime_CheckExact(op) Py_IS_TYPE((op), PyDateTimeAPI->TimeType)
+
+#define PyDelta_Check(op) PyObject_TypeCheck((op), PyDateTimeAPI->DeltaType)
+#define PyDelta_CheckExact(op) Py_IS_TYPE((op), PyDateTimeAPI->DeltaType)
+
+#define PyTZInfo_Check(op) PyObject_TypeCheck((op), PyDateTimeAPI->TZInfoType)
+#define PyTZInfo_CheckExact(op) Py_IS_TYPE((op), PyDateTimeAPI->TZInfoType)
+
+
+/* Macros for accessing constructors in a simplified fashion. */
+#define PyDate_FromDate(year, month, day) \
+    PyDateTimeAPI->Date_FromDate((year), (month), (day), PyDateTimeAPI->DateType)
+
+#define PyDateTime_FromDateAndTime(year, month, day, hour, min, sec, usec) \
+    PyDateTimeAPI->DateTime_FromDateAndTime((year), (month), (day), (hour), \
+        (min), (sec), (usec), Py_None, PyDateTimeAPI->DateTimeType)
+
+#define PyDateTime_FromDateAndTimeAndFold(year, month, day, hour, min, sec, usec, fold) \
+    PyDateTimeAPI->DateTime_FromDateAndTimeAndFold((year), (month), (day), (hour), \
+        (min), (sec), (usec), Py_None, (fold), PyDateTimeAPI->DateTimeType)
+
+#define PyTime_FromTime(hour, minute, second, usecond) \
+    PyDateTimeAPI->Time_FromTime((hour), (minute), (second), (usecond), \
+        Py_None, PyDateTimeAPI->TimeType)
+
+#define PyTime_FromTimeAndFold(hour, minute, second, usecond, fold) \
+    PyDateTimeAPI->Time_FromTimeAndFold((hour), (minute), (second), (usecond), \
+        Py_None, (fold), PyDateTimeAPI->TimeType)
+
+#define PyDelta_FromDSU(days, seconds, useconds) \
+    PyDateTimeAPI->Delta_FromDelta((days), (seconds), (useconds), 1, \
+        PyDateTimeAPI->DeltaType)
+
+#define PyTimeZone_FromOffset(offset) \
+    PyDateTimeAPI->TimeZone_FromTimeZone((offset), NULL)
+
+#define PyTimeZone_FromOffsetAndName(offset, name) \
+    PyDateTimeAPI->TimeZone_FromTimeZone((offset), (name))
+
+/* Macros supporting the DB API. */
+#define PyDateTime_FromTimestamp(args) \
+    PyDateTimeAPI->DateTime_FromTimestamp( \
+        (PyObject*) (PyDateTimeAPI->DateTimeType), (args), NULL)
+
+#define PyDate_FromTimestamp(args) \
+    PyDateTimeAPI->Date_FromTimestamp( \
+        (PyObject*) (PyDateTimeAPI->DateType), (args))
+
+#endif   /* !defined(_PY_DATETIME_IMPL) */
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+#endif /* !Py_LIMITED_API */
diff --git a/nanvix-port/cpython-headers/python3.12/descrobject.h b/nanvix-port/cpython-headers/python3.12/descrobject.h
new file mode 100644
index 000000000000..fd66d17b497a
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/descrobject.h
@@ -0,0 +1,100 @@
+/* Descriptors */
+#ifndef Py_DESCROBJECT_H
+#define Py_DESCROBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef PyObject *(*getter)(PyObject *, void *);
+typedef int (*setter)(PyObject *, PyObject *, void *);
+
+struct PyGetSetDef {
+    const char *name;
+    getter get;
+    setter set;
+    const char *doc;
+    void *closure;
+};
+
+PyAPI_DATA(PyTypeObject) PyClassMethodDescr_Type;
+PyAPI_DATA(PyTypeObject) PyGetSetDescr_Type;
+PyAPI_DATA(PyTypeObject) PyMemberDescr_Type;
+PyAPI_DATA(PyTypeObject) PyMethodDescr_Type;
+PyAPI_DATA(PyTypeObject) PyWrapperDescr_Type;
+PyAPI_DATA(PyTypeObject) PyDictProxy_Type;
+PyAPI_DATA(PyTypeObject) PyProperty_Type;
+
+PyAPI_FUNC(PyObject *) PyDescr_NewMethod(PyTypeObject *, PyMethodDef *);
+PyAPI_FUNC(PyObject *) PyDescr_NewClassMethod(PyTypeObject *, PyMethodDef *);
+PyAPI_FUNC(PyObject *) PyDescr_NewMember(PyTypeObject *, PyMemberDef *);
+PyAPI_FUNC(PyObject *) PyDescr_NewGetSet(PyTypeObject *, PyGetSetDef *);
+
+PyAPI_FUNC(PyObject *) PyDictProxy_New(PyObject *);
+PyAPI_FUNC(PyObject *) PyWrapper_New(PyObject *, PyObject *);
+
+
+/* An array of PyMemberDef structures defines the name, type and offset
+   of selected members of a C structure.  These can be read by
+   PyMember_GetOne() and set by PyMember_SetOne() (except if their READONLY
+   flag is set).  The array must be terminated with an entry whose name
+   pointer is NULL. */
+struct PyMemberDef {
+    const char *name;
+    int type;
+    Py_ssize_t offset;
+    int flags;
+    const char *doc;
+};
+
+// These constants used to be in structmember.h, not prefixed by Py_.
+// (structmember.h now has aliases to the new names.)
+
+/* Types */
+#define Py_T_SHORT     0
+#define Py_T_INT       1
+#define Py_T_LONG      2
+#define Py_T_FLOAT     3
+#define Py_T_DOUBLE    4
+#define Py_T_STRING    5
+#define _Py_T_OBJECT   6  // Deprecated, use Py_T_OBJECT_EX instead
+/* the ordering here is weird for binary compatibility */
+#define Py_T_CHAR      7   /* 1-character string */
+#define Py_T_BYTE      8   /* 8-bit signed int */
+/* unsigned variants: */
+#define Py_T_UBYTE     9
+#define Py_T_USHORT    10
+#define Py_T_UINT      11
+#define Py_T_ULONG     12
+
+/* Added by Jack: strings contained in the structure */
+#define Py_T_STRING_INPLACE    13
+
+/* Added by Lillo: bools contained in the structure (assumed char) */
+#define Py_T_BOOL      14
+
+#define Py_T_OBJECT_EX 16
+#define Py_T_LONGLONG  17
+#define Py_T_ULONGLONG 18
+
+#define Py_T_PYSSIZET  19      /* Py_ssize_t */
+#define _Py_T_NONE     20 // Deprecated. Value is always None.
+
+/* Flags */
+#define Py_READONLY            1
+#define Py_AUDIT_READ          2 // Added in 3.10, harmless no-op before that
+#define _Py_WRITE_RESTRICTED   4 // Deprecated, no-op. Do not reuse the value.
+#define Py_RELATIVE_OFFSET     8
+
+PyAPI_FUNC(PyObject *) PyMember_GetOne(const char *, PyMemberDef *);
+PyAPI_FUNC(int) PyMember_SetOne(char *, PyMemberDef *, PyObject *);
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_DESCROBJECT_H
+#  include "cpython/descrobject.h"
+#  undef Py_CPYTHON_DESCROBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_DESCROBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/dictobject.h b/nanvix-port/cpython-headers/python3.12/dictobject.h
new file mode 100644
index 000000000000..e7fcb44d0cf9
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/dictobject.h
@@ -0,0 +1,97 @@
+#ifndef Py_DICTOBJECT_H
+#define Py_DICTOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Dictionary object type -- mapping from hashable object to object */
+
+/* The distribution includes a separate file, Objects/dictnotes.txt,
+   describing explorations into dictionary design and optimization.
+   It covers typical dictionary use patterns, the parameters for
+   tuning dictionaries, and several ideas for possible optimizations.
+*/
+
+PyAPI_DATA(PyTypeObject) PyDict_Type;  /* type object referenced by PyDict_CheckExact below */
+
+#define PyDict_Check(op) \
+                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_DICT_SUBCLASS)  /* tests the Py_TPFLAGS_DICT_SUBCLASS flag on op's type */
+#define PyDict_CheckExact(op) Py_IS_TYPE((op), &PyDict_Type)  /* presumably true only when op's type is PyDict_Type itself -- confirm */
+
+PyAPI_FUNC(PyObject *) PyDict_New(void);
+PyAPI_FUNC(PyObject *) PyDict_GetItem(PyObject *mp, PyObject *key);
+PyAPI_FUNC(PyObject *) PyDict_GetItemWithError(PyObject *mp, PyObject *key);
+PyAPI_FUNC(int) PyDict_SetItem(PyObject *mp, PyObject *key, PyObject *item);
+PyAPI_FUNC(int) PyDict_DelItem(PyObject *mp, PyObject *key);
+PyAPI_FUNC(void) PyDict_Clear(PyObject *mp);
+PyAPI_FUNC(int) PyDict_Next(
+    PyObject *mp, Py_ssize_t *pos, PyObject **key, PyObject **value);  /* NOTE(review): pos/key/value are pointers, presumably written by the call -- confirm */
+PyAPI_FUNC(PyObject *) PyDict_Keys(PyObject *mp);
+PyAPI_FUNC(PyObject *) PyDict_Values(PyObject *mp);
+PyAPI_FUNC(PyObject *) PyDict_Items(PyObject *mp);
+PyAPI_FUNC(Py_ssize_t) PyDict_Size(PyObject *mp);
+PyAPI_FUNC(PyObject *) PyDict_Copy(PyObject *mp);
+PyAPI_FUNC(int) PyDict_Contains(PyObject *mp, PyObject *key);
+
+/* PyDict_Update(mp, other) is equivalent to PyDict_Merge(mp, other, 1). */
+PyAPI_FUNC(int) PyDict_Update(PyObject *mp, PyObject *other);
+
+/* PyDict_Merge updates/merges from a mapping object (an object that
+   supports PyMapping_Keys() and PyObject_GetItem()).  If override is true,
+   the last occurrence of a key wins, else the first.  The Python
+   dict.update(other) is equivalent to PyDict_Merge(dict, other, 1).
+*/
+PyAPI_FUNC(int) PyDict_Merge(PyObject *mp,
+                             PyObject *other,
+                             int override);
+
+/* PyDict_MergeFromSeq2 updates/merges from an iterable object producing
+   iterable objects of length 2.  If override is true, the last occurrence
+   of a key wins, else the first.  The Python dict constructor dict(seq2)
+   is equivalent to dict={}; PyDict_MergeFromSeq2(dict, seq2, 1).
+*/
+PyAPI_FUNC(int) PyDict_MergeFromSeq2(PyObject *d,
+                                     PyObject *seq2,
+                                     int override);
+
+PyAPI_FUNC(PyObject *) PyDict_GetItemString(PyObject *dp, const char *key);  /* the *String variants take the key as a const char * */
+PyAPI_FUNC(int) PyDict_SetItemString(PyObject *dp, const char *key, PyObject *item);
+PyAPI_FUNC(int) PyDict_DelItemString(PyObject *dp, const char *key);
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
+PyAPI_FUNC(PyObject *) PyObject_GenericGetDict(PyObject *, void *);
+#endif /* !Py_LIMITED_API || Py_LIMITED_API >= 0x030A0000 (presumably version 3.10 -- confirm) */
+
+/* Dictionary (keys, values, items) views */
+
+PyAPI_DATA(PyTypeObject) PyDictKeys_Type;
+PyAPI_DATA(PyTypeObject) PyDictValues_Type;
+PyAPI_DATA(PyTypeObject) PyDictItems_Type;
+
+#define PyDictKeys_Check(op) PyObject_TypeCheck((op), &PyDictKeys_Type)
+#define PyDictValues_Check(op) PyObject_TypeCheck((op), &PyDictValues_Type)
+#define PyDictItems_Check(op) PyObject_TypeCheck((op), &PyDictItems_Type)
+/* True for keys views and items views.  This excludes Values, since they are not sets. */
+# define PyDictViewSet_Check(op) \
+    (PyDictKeys_Check(op) || PyDictItems_Check(op))
+
+/* Dictionary (key, value, item) iterator types, followed by the PyDictRevIter* variants */
+
+PyAPI_DATA(PyTypeObject) PyDictIterKey_Type;
+PyAPI_DATA(PyTypeObject) PyDictIterValue_Type;
+PyAPI_DATA(PyTypeObject) PyDictIterItem_Type;
+
+PyAPI_DATA(PyTypeObject) PyDictRevIterKey_Type;
+PyAPI_DATA(PyTypeObject) PyDictRevIterItem_Type;
+PyAPI_DATA(PyTypeObject) PyDictRevIterValue_Type;
+
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_DICTOBJECT_H
+#  include "cpython/dictobject.h"
+#  undef Py_CPYTHON_DICTOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_DICTOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/dynamic_annotations.h b/nanvix-port/cpython-headers/python3.12/dynamic_annotations.h
new file mode 100644
index 000000000000..4d4def9bf898
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/dynamic_annotations.h
@@ -0,0 +1,499 @@
+/* Copyright (c) 2008-2009, Google Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * ---
+ * Author: Kostya Serebryany
+ * Copied to CPython by Jeffrey Yasskin, with all macros renamed to
+ * start with _Py_ to avoid colliding with users embedding Python, and
+ * with deprecated macros removed.
+ */
+
+/* This file defines dynamic annotations for use with dynamic analysis
+   tools such as valgrind, PIN, etc.
+
+   Dynamic annotation is a source code annotation that affects
+   the generated code (that is, the annotation is not a comment).
+   Each such annotation is attached to a particular
+   instruction and/or to a particular object (address) in the program.
+
+   The annotations that should be used by users are macros in all upper-case
+   (e.g., _Py_ANNOTATE_NEW_MEMORY).
+
+   Actual implementation of these macros may differ depending on the
+   dynamic analysis tool being used.
+
+   See https://code.google.com/p/data-race-test/  for more information.
+
+   This file supports the following dynamic analysis tools:
+   - None (DYNAMIC_ANNOTATIONS_ENABLED is not defined or zero).
+      Macros are defined empty.
+   - ThreadSanitizer, Helgrind, DRD (DYNAMIC_ANNOTATIONS_ENABLED is 1).
+      Macros are defined as calls to non-inlinable empty functions
+      that are intercepted by Valgrind. */
+
+#ifndef __DYNAMIC_ANNOTATIONS_H__
+#define __DYNAMIC_ANNOTATIONS_H__
+
+#ifndef DYNAMIC_ANNOTATIONS_ENABLED
+# define DYNAMIC_ANNOTATIONS_ENABLED 0  /* default: the annotation macros below are defined empty */
+#endif
+
+#if DYNAMIC_ANNOTATIONS_ENABLED != 0
+
+  /* -------------------------------------------------------------
+     Annotations useful when implementing condition variables such as CondVar,
+     using conditional critical sections (Await/LockWhen) and when constructing
+     user-defined synchronization mechanisms.
+
+     The annotations _Py_ANNOTATE_HAPPENS_BEFORE() and
+     _Py_ANNOTATE_HAPPENS_AFTER() can be used to define happens-before arcs in
+     user-defined synchronization mechanisms: the race detector will infer an
+     arc from the former to the latter when they share the same argument
+     pointer.
+
+     Example 1 (reference counting):
+
+     void Unref() {
+       _Py_ANNOTATE_HAPPENS_BEFORE(&refcount_);
+       if (AtomicDecrementByOne(&refcount_) == 0) {
+         _Py_ANNOTATE_HAPPENS_AFTER(&refcount_);
+         delete this;
+       }
+     }
+
+     Example 2 (message queue):
+
+     void MyQueue::Put(Type *e) {
+       MutexLock lock(&mu_);
+       _Py_ANNOTATE_HAPPENS_BEFORE(e);
+       PutElementIntoMyQueue(e);
+     }
+
+     Type *MyQueue::Get() {
+       MutexLock lock(&mu_);
+       Type *e = GetElementFromMyQueue();
+       _Py_ANNOTATE_HAPPENS_AFTER(e);
+       return e;
+     }
+
+     Note: when possible, please use the existing reference counting and message
+     queue implementations instead of inventing new ones. */
+
+  /* Report that wait on the condition variable at address "cv" has succeeded
+     and the lock at address "lock" is held. */
+#define _Py_ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) \
+    AnnotateCondVarWait(__FILE__, __LINE__, cv, lock)
+
+  /* Report that wait on the condition variable at "cv" has succeeded.  Variant
+     w/o lock: passes NULL as the lock argument of AnnotateCondVarWait. */
+#define _Py_ANNOTATE_CONDVAR_WAIT(cv) \
+    AnnotateCondVarWait(__FILE__, __LINE__, cv, NULL)
+
+  /* Report that we are about to signal on the condition variable at address
+     "cv". */
+#define _Py_ANNOTATE_CONDVAR_SIGNAL(cv) \
+    AnnotateCondVarSignal(__FILE__, __LINE__, cv)
+
+  /* Report that we are about to signal_all on the condition variable at "cv". */
+#define _Py_ANNOTATE_CONDVAR_SIGNAL_ALL(cv) \
+    AnnotateCondVarSignalAll(__FILE__, __LINE__, cv)
+
+  /* Annotations for user-defined synchronization mechanisms; aliases of CONDVAR_SIGNAL / CONDVAR_WAIT. */
+#define _Py_ANNOTATE_HAPPENS_BEFORE(obj) _Py_ANNOTATE_CONDVAR_SIGNAL(obj)
+#define _Py_ANNOTATE_HAPPENS_AFTER(obj)  _Py_ANNOTATE_CONDVAR_WAIT(obj)
+
+  /* Report that the bytes in the range [pointer, pointer+size) are about
+     to be published safely. The race checker will create a happens-before
+     arc from the call _Py_ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size) to
+     subsequent accesses to this memory.
+     Note: this annotation may not work properly if the race detector uses
+     sampling, i.e. does not observe all memory accesses.
+     Expands to a call to AnnotatePublishMemoryRange. */
+#define _Py_ANNOTATE_PUBLISH_MEMORY_RANGE(pointer, size) \
+    AnnotatePublishMemoryRange(__FILE__, __LINE__, pointer, size)
+
+  /* Instruct the tool to create a happens-before arc between mu->Unlock() and
+     mu->Lock(). This annotation may slow down the race detector and hide real
+     races. Normally it is used only when it would be difficult to annotate each
+     of the mutex's critical sections individually using the annotations above.
+     This annotation makes sense only for hybrid race detectors. For pure
+     happens-before detectors this is a no-op. For more details see
+     https://code.google.com/p/data-race-test/wiki/PureHappensBeforeVsHybrid . */
+#define _Py_ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(mu) \
+    AnnotateMutexIsUsedAsCondVar(__FILE__, __LINE__, mu)
+
+  /* -------------------------------------------------------------
+     Annotations useful when defining memory allocators, or when memory that
+     was protected in one way starts to be protected in another. */
+
+  /* Report that new memory at "address" of size "size" has been allocated.
+     This might be used when the memory has been retrieved from a free list and
+     is about to be reused, or when the locking discipline for a variable
+     changes. */
+#define _Py_ANNOTATE_NEW_MEMORY(address, size) \
+    AnnotateNewMemory(__FILE__, __LINE__, address, size)
+
+  /* -------------------------------------------------------------
+     Annotations useful when defining FIFO queues that transfer data between
+     threads. */
+
+  /* Report that the producer-consumer queue (such as ProducerConsumerQueue) at
+     address "pcq" has been created.  The _Py_ANNOTATE_PCQ_* annotations should
+     be used only for FIFO queues.  For non-FIFO queues use
+     _Py_ANNOTATE_HAPPENS_BEFORE (for put) and _Py_ANNOTATE_HAPPENS_AFTER (for
+     get). */
+#define _Py_ANNOTATE_PCQ_CREATE(pcq) \
+    AnnotatePCQCreate(__FILE__, __LINE__, pcq)
+
+  /* Report that the queue at address "pcq" is about to be destroyed. */
+#define _Py_ANNOTATE_PCQ_DESTROY(pcq) \
+    AnnotatePCQDestroy(__FILE__, __LINE__, pcq)
+
+  /* Report that we are about to put an element into a FIFO queue at address
+     "pcq". */
+#define _Py_ANNOTATE_PCQ_PUT(pcq) \
+    AnnotatePCQPut(__FILE__, __LINE__, pcq)
+
+  /* Report that we have just got an element from a FIFO queue at address "pcq". */
+#define _Py_ANNOTATE_PCQ_GET(pcq) \
+    AnnotatePCQGet(__FILE__, __LINE__, pcq)
+
+  /* -------------------------------------------------------------
+     Annotations that suppress errors.  It is usually better to express the
+     program's synchronization using the other annotations, but these can
+     be used when all else fails. */
+
+  /* Report that we may have a benign race at "pointer", with size
+     "sizeof(*(pointer))". "pointer" must be a non-void* pointer.  Insert at the
+     point where "pointer" has been allocated, preferably close to the point
+     where the race happens.  See also _Py_ANNOTATE_BENIGN_RACE_STATIC. */
+#define _Py_ANNOTATE_BENIGN_RACE(pointer, description) \
+    AnnotateBenignRaceSized(__FILE__, __LINE__, pointer, \
+                            sizeof(*(pointer)), description)
+
+  /* Same as _Py_ANNOTATE_BENIGN_RACE(address, description), but applies to
+     the memory range [address, address+size). */
+#define _Py_ANNOTATE_BENIGN_RACE_SIZED(address, size, description) \
+    AnnotateBenignRaceSized(__FILE__, __LINE__, address, size, description)
+
+  /* Request the analysis tool to ignore all reads in the current thread
+     until _Py_ANNOTATE_IGNORE_READS_END is called.
+     Useful to ignore intentional racy reads, while still checking
+     other reads and all writes.
+     See also _Py_ANNOTATE_UNPROTECTED_READ. */
+#define _Py_ANNOTATE_IGNORE_READS_BEGIN() \
+    AnnotateIgnoreReadsBegin(__FILE__, __LINE__)
+
+  /* Stop ignoring reads. */
+#define _Py_ANNOTATE_IGNORE_READS_END() \
+    AnnotateIgnoreReadsEnd(__FILE__, __LINE__)
+
+  /* Similar to _Py_ANNOTATE_IGNORE_READS_BEGIN, but ignore writes. */
+#define _Py_ANNOTATE_IGNORE_WRITES_BEGIN() \
+    AnnotateIgnoreWritesBegin(__FILE__, __LINE__)
+
+  /* Stop ignoring writes. */
+#define _Py_ANNOTATE_IGNORE_WRITES_END() \
+    AnnotateIgnoreWritesEnd(__FILE__, __LINE__)
+
+  /* Start ignoring all memory accesses (reads and writes); expands to one statement, caller adds ';'. */
+#define _Py_ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() \
+    do {\
+      _Py_ANNOTATE_IGNORE_READS_BEGIN();\
+      _Py_ANNOTATE_IGNORE_WRITES_BEGIN();\
+    }while(0)
+
+  /* Stop ignoring all memory accesses; ends in the reverse order (writes, then reads). */
+#define _Py_ANNOTATE_IGNORE_READS_AND_WRITES_END() \
+    do {\
+      _Py_ANNOTATE_IGNORE_WRITES_END();\
+      _Py_ANNOTATE_IGNORE_READS_END();\
+    }while(0)
+
+  /* Similar to _Py_ANNOTATE_IGNORE_READS_BEGIN, but ignore synchronization events:
+     RWLOCK* and CONDVAR*.  NOTE(review): AnnotateIgnoreSyncBegin/End have no prototype in this header. */
+#define _Py_ANNOTATE_IGNORE_SYNC_BEGIN() \
+    AnnotateIgnoreSyncBegin(__FILE__, __LINE__)
+
+  /* Stop ignoring sync events (counterpart of _Py_ANNOTATE_IGNORE_SYNC_BEGIN). */
+#define _Py_ANNOTATE_IGNORE_SYNC_END() \
+    AnnotateIgnoreSyncEnd(__FILE__, __LINE__)
+
+
+  /* Enable (enable!=0) or disable (enable==0) race detection for all threads.
+     This annotation could be useful if you want to skip expensive race analysis
+     during some period of program execution, e.g. during initialization. */
+#define _Py_ANNOTATE_ENABLE_RACE_DETECTION(enable) \
+    AnnotateEnableRaceDetection(__FILE__, __LINE__, enable)
+
+  /* -------------------------------------------------------------
+     Annotations useful for debugging. */
+
+  /* Request to trace every access to "address". */
+#define _Py_ANNOTATE_TRACE_MEMORY(address) \
+    AnnotateTraceMemory(__FILE__, __LINE__, address)
+
+  /* Report the current thread name to a race detector. */
+#define _Py_ANNOTATE_THREAD_NAME(name) \
+    AnnotateThreadName(__FILE__, __LINE__, name)
+
+  /* -------------------------------------------------------------
+     Annotations useful when implementing locks.  They are not
+     normally needed by modules that merely use locks.
+     The "lock" argument is a pointer to the lock object. */
+
+  /* Report that a lock has been created at address "lock". */
+#define _Py_ANNOTATE_RWLOCK_CREATE(lock) \
+    AnnotateRWLockCreate(__FILE__, __LINE__, lock)
+
+  /* Report that the lock at address "lock" is about to be destroyed. */
+#define _Py_ANNOTATE_RWLOCK_DESTROY(lock) \
+    AnnotateRWLockDestroy(__FILE__, __LINE__, lock)
+
+  /* Report that the lock at address "lock" has been acquired.
+     is_w=1 for writer lock, is_w=0 for reader lock. */
+#define _Py_ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) \
+    AnnotateRWLockAcquired(__FILE__, __LINE__, lock, is_w)
+
+  /* Report that the lock at address "lock" is about to be released (is_w presumably as for ..._ACQUIRED). */
+#define _Py_ANNOTATE_RWLOCK_RELEASED(lock, is_w) \
+    AnnotateRWLockReleased(__FILE__, __LINE__, lock, is_w)
+
+  /* -------------------------------------------------------------
+     Annotations useful when implementing barriers.  They are not
+     normally needed by modules that merely use barriers.
+     The "barrier" argument is a pointer to the barrier object. */
+
+  /* Report that the "barrier" has been initialized with initial "count".
+     If 'reinitialization_allowed' is true, initialization is allowed to happen
+     multiple times w/o calling barrier_destroy(). */
+#define _Py_ANNOTATE_BARRIER_INIT(barrier, count, reinitialization_allowed) \
+    AnnotateBarrierInit(__FILE__, __LINE__, barrier, count, \
+                        reinitialization_allowed)
+
+  /* Report that we are about to enter barrier_wait("barrier"). */
+#define _Py_ANNOTATE_BARRIER_WAIT_BEFORE(barrier) \
+    AnnotateBarrierWaitBefore(__FILE__, __LINE__, barrier)
+
+  /* Report that we just exited barrier_wait("barrier"). */
+#define _Py_ANNOTATE_BARRIER_WAIT_AFTER(barrier) \
+    AnnotateBarrierWaitAfter(__FILE__, __LINE__, barrier)
+
+  /* Report that the "barrier" has been destroyed. */
+#define _Py_ANNOTATE_BARRIER_DESTROY(barrier) \
+    AnnotateBarrierDestroy(__FILE__, __LINE__, barrier)
+
+  /* -------------------------------------------------------------
+     Annotations useful for testing race detectors. */
+
+  /* Report that we expect a race on the variable at "address".
+     Use only in unit tests for a race detector. */
+#define _Py_ANNOTATE_EXPECT_RACE(address, description) \
+    AnnotateExpectRace(__FILE__, __LINE__, address, description)
+
+  /* A no-op. Insert where you like to test the interceptors. */
+#define _Py_ANNOTATE_NO_OP(arg) \
+    AnnotateNoOp(__FILE__, __LINE__, arg)
+
+  /* Force the race detector to flush its state. The actual effect depends on
+   * the implementation of the detector. */
+#define _Py_ANNOTATE_FLUSH_STATE() \
+    AnnotateFlushState(__FILE__, __LINE__)
+
+
+#else  /* DYNAMIC_ANNOTATIONS_ENABLED == 0 */
+
+#define _Py_ANNOTATE_RWLOCK_CREATE(lock) /* empty */
+#define _Py_ANNOTATE_RWLOCK_DESTROY(lock) /* empty */
+#define _Py_ANNOTATE_RWLOCK_ACQUIRED(lock, is_w) /* empty */
+#define _Py_ANNOTATE_RWLOCK_RELEASED(lock, is_w) /* empty */
+#define _Py_ANNOTATE_BARRIER_INIT(barrier, count, reinitialization_allowed) /* empty */
+#define _Py_ANNOTATE_BARRIER_WAIT_BEFORE(barrier) /* empty */
+#define _Py_ANNOTATE_BARRIER_WAIT_AFTER(barrier) /* empty */
+#define _Py_ANNOTATE_BARRIER_DESTROY(barrier) /* empty */
+#define _Py_ANNOTATE_CONDVAR_LOCK_WAIT(cv, lock) /* empty */
+#define _Py_ANNOTATE_CONDVAR_WAIT(cv) /* empty */
+#define _Py_ANNOTATE_CONDVAR_SIGNAL(cv) /* empty */
+#define _Py_ANNOTATE_CONDVAR_SIGNAL_ALL(cv) /* empty */
+#define _Py_ANNOTATE_HAPPENS_BEFORE(obj) /* empty */
+#define _Py_ANNOTATE_HAPPENS_AFTER(obj) /* empty */
+#define _Py_ANNOTATE_PUBLISH_MEMORY_RANGE(address, size) /* empty */
+#define _Py_ANNOTATE_UNPUBLISH_MEMORY_RANGE(address, size)  /* empty; NOTE(review): no enabled-branch counterpart in this header */
+#define _Py_ANNOTATE_SWAP_MEMORY_RANGE(address, size)  /* empty; NOTE(review): no enabled-branch counterpart in this header */
+#define _Py_ANNOTATE_PCQ_CREATE(pcq) /* empty */
+#define _Py_ANNOTATE_PCQ_DESTROY(pcq) /* empty */
+#define _Py_ANNOTATE_PCQ_PUT(pcq) /* empty */
+#define _Py_ANNOTATE_PCQ_GET(pcq) /* empty */
+#define _Py_ANNOTATE_NEW_MEMORY(address, size) /* empty */
+#define _Py_ANNOTATE_EXPECT_RACE(address, description) /* empty */
+#define _Py_ANNOTATE_BENIGN_RACE(address, description) /* empty */
+#define _Py_ANNOTATE_BENIGN_RACE_SIZED(address, size, description) /* empty */
+#define _Py_ANNOTATE_PURE_HAPPENS_BEFORE_MUTEX(mu) /* empty */
+#define _Py_ANNOTATE_MUTEX_IS_USED_AS_CONDVAR(mu) /* empty; NOTE(review): no enabled-branch counterpart in this header */
+#define _Py_ANNOTATE_TRACE_MEMORY(arg) /* empty */
+#define _Py_ANNOTATE_THREAD_NAME(name) /* empty */
+#define _Py_ANNOTATE_IGNORE_READS_BEGIN() /* empty */
+#define _Py_ANNOTATE_IGNORE_READS_END() /* empty */
+#define _Py_ANNOTATE_IGNORE_WRITES_BEGIN() /* empty */
+#define _Py_ANNOTATE_IGNORE_WRITES_END() /* empty */
+#define _Py_ANNOTATE_IGNORE_READS_AND_WRITES_BEGIN() /* empty */
+#define _Py_ANNOTATE_IGNORE_READS_AND_WRITES_END() /* empty */
+#define _Py_ANNOTATE_IGNORE_SYNC_BEGIN() /* empty */
+#define _Py_ANNOTATE_IGNORE_SYNC_END() /* empty */
+#define _Py_ANNOTATE_ENABLE_RACE_DETECTION(enable) /* empty */
+#define _Py_ANNOTATE_NO_OP(arg) /* empty */
+#define _Py_ANNOTATE_FLUSH_STATE() /* empty */
+
+#endif  /* DYNAMIC_ANNOTATIONS_ENABLED */
+
+/* Use the macros above rather than using these functions directly. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+void AnnotateRWLockCreate(const char *file, int line,
+                          const volatile void *lock);
+void AnnotateRWLockDestroy(const char *file, int line,
+                           const volatile void *lock);
+void AnnotateRWLockAcquired(const char *file, int line,
+                            const volatile void *lock, long is_w);
+void AnnotateRWLockReleased(const char *file, int line,
+                            const volatile void *lock, long is_w);
+void AnnotateBarrierInit(const char *file, int line,
+                         const volatile void *barrier, long count,
+                         long reinitialization_allowed);
+void AnnotateBarrierWaitBefore(const char *file, int line,
+                               const volatile void *barrier);
+void AnnotateBarrierWaitAfter(const char *file, int line,
+                              const volatile void *barrier);
+void AnnotateBarrierDestroy(const char *file, int line,
+                            const volatile void *barrier);
+void AnnotateCondVarWait(const char *file, int line,
+                         const volatile void *cv,
+                         const volatile void *lock);
+void AnnotateCondVarSignal(const char *file, int line,
+                           const volatile void *cv);
+void AnnotateCondVarSignalAll(const char *file, int line,
+                              const volatile void *cv);
+void AnnotatePublishMemoryRange(const char *file, int line,
+                                const volatile void *address,
+                                long size);
+void AnnotateUnpublishMemoryRange(const char *file, int line,
+                                  const volatile void *address,
+                                  long size);  /* no macro in this header calls this function */
+void AnnotatePCQCreate(const char *file, int line,
+                       const volatile void *pcq);
+void AnnotatePCQDestroy(const char *file, int line,
+                        const volatile void *pcq);
+void AnnotatePCQPut(const char *file, int line,
+                    const volatile void *pcq);
+void AnnotatePCQGet(const char *file, int line,
+                    const volatile void *pcq);
+void AnnotateNewMemory(const char *file, int line,
+                       const volatile void *address,
+                       long size);
+void AnnotateExpectRace(const char *file, int line,
+                        const volatile void *address,
+                        const char *description);
+void AnnotateBenignRace(const char *file, int line,
+                        const volatile void *address,
+                        const char *description);  /* no macro in this header calls this; BENIGN_RACE uses the Sized variant */
+void AnnotateBenignRaceSized(const char *file, int line,
+                        const volatile void *address,
+                        long size,
+                        const char *description);
+void AnnotateMutexIsUsedAsCondVar(const char *file, int line,
+                                  const volatile void *mu);
+void AnnotateTraceMemory(const char *file, int line,
+                         const volatile void *arg);
+void AnnotateThreadName(const char *file, int line,
+                        const char *name);
+void AnnotateIgnoreReadsBegin(const char *file, int line);
+void AnnotateIgnoreReadsEnd(const char *file, int line);
+void AnnotateIgnoreWritesBegin(const char *file, int line);
+void AnnotateIgnoreWritesEnd(const char *file, int line);
+void AnnotateEnableRaceDetection(const char *file, int line, int enable);
+void AnnotateNoOp(const char *file, int line,
+                  const volatile void *arg);
+void AnnotateFlushState(const char *file, int line);
+
+/* Return a non-zero value if running under valgrind.
+
+  If "valgrind.h" is included into dynamic_annotations.c,
+  the regular valgrind mechanism will be used.
+  See http://valgrind.org/docs/manual/manual-core-adv.html about
+  RUNNING_ON_VALGRIND and other valgrind "client requests".
+  The file "valgrind.h" may be obtained by doing
+     svn co svn://svn.valgrind.org/valgrind/trunk/include
+
+  If for some reason you can't use "valgrind.h" or want to fake valgrind,
+  there are two ways to make this function return non-zero:
+    - Use environment variable: export RUNNING_ON_VALGRIND=1
+    - Make your tool intercept the function RunningOnValgrind() and
+      change its return value.
+ */
+int RunningOnValgrind(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#if DYNAMIC_ANNOTATIONS_ENABLED != 0 && defined(__cplusplus)
+
+  /* _Py_ANNOTATE_UNPROTECTED_READ is the preferred way to annotate racy reads.
+
+     Instead of doing
+        _Py_ANNOTATE_IGNORE_READS_BEGIN();
+        ... = x;
+        _Py_ANNOTATE_IGNORE_READS_END();
+     one can use
+        ... = _Py_ANNOTATE_UNPROTECTED_READ(x); */
+  template <class T>
+  inline T _Py_ANNOTATE_UNPROTECTED_READ(const volatile T &x) {
+    _Py_ANNOTATE_IGNORE_READS_BEGIN();
+    T res = x;
+    _Py_ANNOTATE_IGNORE_READS_END();
+    return res;
+  }
+  /* Apply _Py_ANNOTATE_BENIGN_RACE_SIZED to a static variable. */
+#define _Py_ANNOTATE_BENIGN_RACE_STATIC(static_var, description)        \
+    namespace {                                                       \
+      class static_var ## _annotator {                                \
+       public:                                                        \
+        static_var ## _annotator() {                                  \
+          _Py_ANNOTATE_BENIGN_RACE_SIZED(&static_var,                     \
+                                      sizeof(static_var),             \
+            # static_var ": " description);                           \
+        }                                                             \
+      };                                                              \
+      static static_var ## _annotator the ## static_var ## _annotator;\
+    }
+#else /* DYNAMIC_ANNOTATIONS_ENABLED == 0 || !defined(__cplusplus) */
+
+#define _Py_ANNOTATE_UNPROTECTED_READ(x) (x)
+#define _Py_ANNOTATE_BENIGN_RACE_STATIC(static_var, description)  /* empty */
+
+#endif /* DYNAMIC_ANNOTATIONS_ENABLED != 0 && defined(__cplusplus) */
+
+#endif  /* __DYNAMIC_ANNOTATIONS_H__ */
diff --git a/nanvix-port/cpython-headers/python3.12/enumobject.h b/nanvix-port/cpython-headers/python3.12/enumobject.h
new file mode 100644
index 000000000000..c14dbfc8c37e
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/enumobject.h
@@ -0,0 +1,17 @@
+#ifndef Py_ENUMOBJECT_H
+#define Py_ENUMOBJECT_H
+
+/* Enumerate Object */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_DATA(PyTypeObject) PyEnum_Type;      /* presumably the type object for enumerate objects -- confirm */
+PyAPI_DATA(PyTypeObject) PyReversed_Type;  /* presumably the type object for reversed objects -- confirm */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !Py_ENUMOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/errcode.h b/nanvix-port/cpython-headers/python3.12/errcode.h
new file mode 100644
index 000000000000..bd9066bb4151
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/errcode.h
@@ -0,0 +1,38 @@
+#ifndef Py_ERRCODE_H
+#define Py_ERRCODE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Error codes passed around between file input, tokenizer, parser and
+   interpreter.  This is necessary so we can turn them into Python
+   exceptions at a higher level.  Note that some errors have a
+   slightly different meaning when passed from the tokenizer to the
+   parser than when passed from the parser to the interpreter; e.g.
+   the parser only returns E_EOF when it hits EOF immediately, and it
+   never returns E_OK. */
+
+#define E_OK             10      /* No error */
+#define E_EOF            11      /* End Of File */
+#define E_INTR           12      /* Interrupted */
+#define E_TOKEN          13      /* Bad token */
+#define E_SYNTAX         14      /* Syntax error */
+#define E_NOMEM          15      /* Ran out of memory */
+#define E_DONE           16      /* Parsing complete */
+#define E_ERROR          17      /* Execution error */
+#define E_TABSPACE       18      /* Inconsistent mixing of tabs and spaces */
+#define E_OVERFLOW       19      /* Node had too many children */
+#define E_TOODEEP        20      /* Too many indentation levels */
+#define E_DEDENT         21      /* No matching outer block for dedent */
+#define E_DECODE         22      /* Error in decoding into Unicode */
+#define E_EOFS           23      /* EOF in triple-quoted string */
+#define E_EOLS           24      /* EOL in single-quoted string */
+#define E_LINECONT       25      /* Unexpected characters after a line continuation */
+#define E_BADSINGLE      27      /* Ill-formed single statement input (26 is not assigned in this list) */
+#define E_INTERACT_STOP  28      /* Interactive mode stopped tokenization */
+#define E_COLUMNOVERFLOW 29      /* Column offset overflow */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_ERRCODE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/exports.h b/nanvix-port/cpython-headers/python3.12/exports.h
new file mode 100644
index 000000000000..59373c39ff75
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/exports.h
@@ -0,0 +1,36 @@
+#ifndef Py_EXPORTS_H
+#define Py_EXPORTS_H
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+    #if defined(Py_ENABLE_SHARED)
+        #define Py_IMPORTED_SYMBOL __declspec(dllimport)
+        #define Py_EXPORTED_SYMBOL __declspec(dllexport)
+        #define Py_LOCAL_SYMBOL
+    #else  /* !Py_ENABLE_SHARED: all three macros expand to nothing */
+        #define Py_IMPORTED_SYMBOL
+        #define Py_EXPORTED_SYMBOL
+        #define Py_LOCAL_SYMBOL
+    #endif /* Py_ENABLE_SHARED */
+#else  /* !(_WIN32 || __CYGWIN__) */
+/*
+ * If we only ever used gcc >= 5, we could use __has_attribute(visibility)
+ * as a cross-platform way to determine if visibility is supported. However,
+ * we may still need to support gcc >= 4, as some Ubuntu LTS and Centos versions
+ * have 4 < gcc < 5.
+ */
+    #ifndef __has_attribute
+      #define __has_attribute(x) 0  // Compatibility with non-clang compilers.
+    #endif
+    #if (defined(__GNUC__) && (__GNUC__ >= 4)) ||\
+        (defined(__clang__) && __has_attribute(visibility))
+        #define Py_IMPORTED_SYMBOL __attribute__ ((visibility ("default")))
+        #define Py_EXPORTED_SYMBOL __attribute__ ((visibility ("default")))
+        #define Py_LOCAL_SYMBOL  __attribute__ ((visibility ("hidden")))
+    #else  /* neither test above matched: all three macros expand to nothing */
+        #define Py_IMPORTED_SYMBOL
+        #define Py_EXPORTED_SYMBOL
+        #define Py_LOCAL_SYMBOL
+    #endif
+#endif /* _WIN32 || __CYGWIN__ */
+
+#endif /* Py_EXPORTS_H */
diff --git a/nanvix-port/cpython-headers/python3.12/fileobject.h b/nanvix-port/cpython-headers/python3.12/fileobject.h
new file mode 100644
index 000000000000..2deef544d667
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/fileobject.h
@@ -0,0 +1,49 @@
+/* File object interface (what's left of it -- see io.py) */
+
+#ifndef Py_FILEOBJECT_H
+#define Py_FILEOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define PY_STDIOTEXTMODE "b"
+
+PyAPI_FUNC(PyObject *) PyFile_FromFd(int, const char *, const char *, int,
+                                     const char *, const char *,
+                                     const char *, int);
+PyAPI_FUNC(PyObject *) PyFile_GetLine(PyObject *, int);
+PyAPI_FUNC(int) PyFile_WriteObject(PyObject *, PyObject *, int);
+PyAPI_FUNC(int) PyFile_WriteString(const char *, PyObject *);
+PyAPI_FUNC(int) PyObject_AsFileDescriptor(PyObject *);
+
+/* The default encoding used by the platform file system APIs.
+   If non-NULL, this is different than the default encoding for strings
+*/
+Py_DEPRECATED(3.12) PyAPI_DATA(const char *) Py_FileSystemDefaultEncoding;
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03060000
+Py_DEPRECATED(3.12) PyAPI_DATA(const char *) Py_FileSystemDefaultEncodeErrors;
+#endif
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_HasFileSystemDefaultEncoding;
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000
+Py_DEPRECATED(3.12) PyAPI_DATA(int) Py_UTF8Mode;
+#endif
+
+/* A routine to check if a file descriptor can be select()-ed; evaluates to non-zero when FD is usable. */
+#ifdef _MSC_VER
+    /* On Windows, any socket fd can be select()-ed, no matter how high */
+    #define _PyIsSelectable_fd(FD) (1)
+#else  /* FD must be below FD_SETSIZE; the unsigned casts make a negative FD compare as a very large value */
+    #define _PyIsSelectable_fd(FD) ((unsigned int)(FD) < (unsigned int)FD_SETSIZE)
+#endif  /* _MSC_VER */
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_FILEOBJECT_H
+#  include "cpython/fileobject.h"
+#  undef Py_CPYTHON_FILEOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_FILEOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/fileutils.h b/nanvix-port/cpython-headers/python3.12/fileutils.h
new file mode 100644
index 000000000000..ba5acc84fcb1
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/fileutils.h
@@ -0,0 +1,26 @@
+#ifndef Py_FILEUTILS_H
+#define Py_FILEUTILS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+PyAPI_FUNC(wchar_t *) Py_DecodeLocale(
+    const char *arg,
+    size_t *size);
+
+PyAPI_FUNC(char*) Py_EncodeLocale(
+    const wchar_t *text,
+    size_t *error_pos);
+#endif
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_FILEUTILS_H
+#  include "cpython/fileutils.h"
+#  undef Py_CPYTHON_FILEUTILS_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_FILEUTILS_H */
diff --git a/nanvix-port/cpython-headers/python3.12/floatobject.h b/nanvix-port/cpython-headers/python3.12/floatobject.h
new file mode 100644
index 000000000000..999441ac536e
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/floatobject.h
@@ -0,0 +1,54 @@
+
+/* Float object interface */
+
+/*
+PyFloatObject represents a (double precision) floating point number.
+*/
+
+#ifndef Py_FLOATOBJECT_H
+#define Py_FLOATOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_DATA(PyTypeObject) PyFloat_Type;
+
+#define PyFloat_Check(op) PyObject_TypeCheck(op, &PyFloat_Type)
+#define PyFloat_CheckExact(op) Py_IS_TYPE((op), &PyFloat_Type)
+
+#define Py_RETURN_NAN return PyFloat_FromDouble(Py_NAN)
+
+#define Py_RETURN_INF(sign)                          \
+    do {                                             \
+        if (copysign(1., sign) == 1.) {              \
+            return PyFloat_FromDouble(Py_HUGE_VAL);  \
+        }                                            \
+        else {                                       \
+            return PyFloat_FromDouble(-Py_HUGE_VAL); \
+        }                                            \
+    } while(0)  /* expands to `return` statements: yields PyFloat_FromDouble(+Py_HUGE_VAL) when copysign(1., sign) == 1., else the negated value */
+
+PyAPI_FUNC(double) PyFloat_GetMax(void);
+PyAPI_FUNC(double) PyFloat_GetMin(void);
+PyAPI_FUNC(PyObject*) PyFloat_GetInfo(void);
+
+/* Return Python float from string PyObject. */
+PyAPI_FUNC(PyObject*) PyFloat_FromString(PyObject*);
+
+/* Return Python float from C double. */
+PyAPI_FUNC(PyObject*) PyFloat_FromDouble(double);
+
+/* Extract C double from Python float.  The macro version trades safety for
+   speed. */
+PyAPI_FUNC(double) PyFloat_AsDouble(PyObject*);
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_FLOATOBJECT_H
+#  include "cpython/floatobject.h"
+#  undef Py_CPYTHON_FLOATOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_FLOATOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/frameobject.h b/nanvix-port/cpython-headers/python3.12/frameobject.h
new file mode 100644
index 000000000000..adb628f6314f
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/frameobject.h
@@ -0,0 +1,20 @@
+/* Frame object interface */
+
+#ifndef Py_FRAMEOBJECT_H
+#define Py_FRAMEOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "pyframe.h"
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_FRAMEOBJECT_H
+#  include "cpython/frameobject.h"
+#  undef Py_CPYTHON_FRAMEOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_FRAMEOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/genericaliasobject.h b/nanvix-port/cpython-headers/python3.12/genericaliasobject.h
new file mode 100644
index 000000000000..cf002976b27c
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/genericaliasobject.h
@@ -0,0 +1,14 @@
+// Implementation of PEP 585: support list[int] etc.
+#ifndef Py_GENERICALIASOBJECT_H
+#define Py_GENERICALIASOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_FUNC(PyObject *) Py_GenericAlias(PyObject *, PyObject *);
+PyAPI_DATA(PyTypeObject) Py_GenericAliasType;
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_GENERICALIASOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/import.h b/nanvix-port/cpython-headers/python3.12/import.h
new file mode 100644
index 000000000000..5d5f3425b8e7
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/import.h
@@ -0,0 +1,98 @@
+/* Module definition and import interface */
+
+#ifndef Py_IMPORT_H
+#define Py_IMPORT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_FUNC(long) PyImport_GetMagicNumber(void);
+PyAPI_FUNC(const char *) PyImport_GetMagicTag(void);
+PyAPI_FUNC(PyObject *) PyImport_ExecCodeModule(
+    const char *name,           /* UTF-8 encoded string */
+    PyObject *co
+    );
+PyAPI_FUNC(PyObject *) PyImport_ExecCodeModuleEx(
+    const char *name,           /* UTF-8 encoded string */
+    PyObject *co,
+    const char *pathname        /* decoded from the filesystem encoding */
+    );
+PyAPI_FUNC(PyObject *) PyImport_ExecCodeModuleWithPathnames(
+    const char *name,           /* UTF-8 encoded string */
+    PyObject *co,
+    const char *pathname,       /* decoded from the filesystem encoding */
+    const char *cpathname       /* decoded from the filesystem encoding */
+    );
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+PyAPI_FUNC(PyObject *) PyImport_ExecCodeModuleObject(
+    PyObject *name,
+    PyObject *co,
+    PyObject *pathname,
+    PyObject *cpathname
+    );
+#endif
+PyAPI_FUNC(PyObject *) PyImport_GetModuleDict(void);
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000
+PyAPI_FUNC(PyObject *) PyImport_GetModule(PyObject *name);
+#endif
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+PyAPI_FUNC(PyObject *) PyImport_AddModuleObject(
+    PyObject *name
+    );
+#endif
+PyAPI_FUNC(PyObject *) PyImport_AddModule(
+    const char *name            /* UTF-8 encoded string */
+    );
+PyAPI_FUNC(PyObject *) PyImport_ImportModule(
+    const char *name            /* UTF-8 encoded string */
+    );
+PyAPI_FUNC(PyObject *) PyImport_ImportModuleNoBlock(
+    const char *name            /* UTF-8 encoded string */
+    );
+PyAPI_FUNC(PyObject *) PyImport_ImportModuleLevel(
+    const char *name,           /* UTF-8 encoded string */
+    PyObject *globals,
+    PyObject *locals,
+    PyObject *fromlist,
+    int level
+    );
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+PyAPI_FUNC(PyObject *) PyImport_ImportModuleLevelObject(
+    PyObject *name,
+    PyObject *globals,
+    PyObject *locals,
+    PyObject *fromlist,
+    int level
+    );
+#endif
+
+#define PyImport_ImportModuleEx(n, g, l, f) \
+    PyImport_ImportModuleLevel((n), (g), (l), (f), 0)  /* same call as PyImport_ImportModuleLevel with the level argument fixed at 0 */
+
+PyAPI_FUNC(PyObject *) PyImport_GetImporter(PyObject *path);
+PyAPI_FUNC(PyObject *) PyImport_Import(PyObject *name);
+PyAPI_FUNC(PyObject *) PyImport_ReloadModule(PyObject *m);
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+PyAPI_FUNC(int) PyImport_ImportFrozenModuleObject(
+    PyObject *name
+    );
+#endif
+PyAPI_FUNC(int) PyImport_ImportFrozenModule(
+    const char *name            /* UTF-8 encoded string */
+    );
+
+PyAPI_FUNC(int) PyImport_AppendInittab(
+    const char *name,           /* ASCII encoded string */
+    PyObject* (*initfunc)(void)
+    );
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_IMPORT_H
+#  include "cpython/import.h"
+#  undef Py_CPYTHON_IMPORT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_IMPORT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_abstract.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_abstract.h
new file mode 100644
index 000000000000..b1afb2dc7be6
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_abstract.h
@@ -0,0 +1,25 @@
+#ifndef Py_INTERNAL_ABSTRACT_H
+#define Py_INTERNAL_ABSTRACT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+// Fast inlined version of PyIndex_Check(). obj is dereferenced via Py_TYPE(), so it must not be NULL.
+static inline int
+_PyIndex_Check(PyObject *obj)
+{
+    PyNumberMethods *tp_as_number = Py_TYPE(obj)->tp_as_number;  // NULL when the type has no number-methods table
+    return (tp_as_number != NULL && tp_as_number->nb_index != NULL);  // 1 only if the nb_index slot is filled in
+}
+
+PyObject *_PyNumber_PowerNoMod(PyObject *lhs, PyObject *rhs);
+PyObject *_PyNumber_InPlacePowerNoMod(PyObject *lhs, PyObject *rhs);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_ABSTRACT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_asdl.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_asdl.h
new file mode 100644
index 000000000000..afeada88d13e
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_asdl.h
@@ -0,0 +1,112 @@
+#ifndef Py_INTERNAL_ASDL_H
+#define Py_INTERNAL_ASDL_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_pyarena.h"       // _PyArena_Malloc()
+
+typedef PyObject * identifier;
+typedef PyObject * string;
+typedef PyObject * object;
+typedef PyObject * constant;
+
+/* It would be nice if the code generated by asdl_c.py was completely
+   independent of Python, but it is a goal that requires too much work
+   at this stage.  So, for example, I'll represent identifiers as
+   interned Python strings.
+*/
+
+#define _ASDL_SEQ_HEAD \
+    Py_ssize_t size;   \
+    void **elements;  /* common header: element count plus an untyped pointer to the element storage */
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+} asdl_seq;  /* header-only view; the typed sequence structs below start with the same header */
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    void *typed_elements[1];  /* first slot only; GENERATE_ASDL_SEQ_CONSTRUCTOR allocates room for the remaining ones */
+} asdl_generic_seq;
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    PyObject *typed_elements[1];  /* same layout as above, with PyObject * slots */
+} asdl_identifier_seq;
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    int typed_elements[1];  /* slots are plain ints here, not pointers */
+} asdl_int_seq;
+
+asdl_generic_seq *_Py_asdl_generic_seq_new(Py_ssize_t size, PyArena *arena);
+asdl_identifier_seq *_Py_asdl_identifier_seq_new(Py_ssize_t size, PyArena *arena);
+asdl_int_seq *_Py_asdl_int_seq_new(Py_ssize_t size, PyArena *arena);
+
+
+#define GENERATE_ASDL_SEQ_CONSTRUCTOR(NAME, TYPE) \
+asdl_ ## NAME ## _seq *_Py_asdl_ ## NAME ## _seq_new(Py_ssize_t size, PyArena *arena) \
+{ \
+    asdl_ ## NAME ## _seq *seq = NULL; \
+    size_t n; \
+    /* check size is sane: reject negative sizes and sizes too large to express in bytes */ \
+    if (size < 0 || \
+        (size && (((size_t)size - 1) > (SIZE_MAX / sizeof(void *))))) { \
+        PyErr_NoMemory(); \
+        return NULL; \
+    } \
+    n = (size ? (sizeof(TYPE *) * (size - 1)) : 0); \
+    /* n counts only the (size - 1) extra slots; check the struct size can be added safely */ \
+    if (n > SIZE_MAX - sizeof(asdl_ ## NAME ## _seq)) { \
+        PyErr_NoMemory(); \
+        return NULL; \
+    } \
+    n += sizeof(asdl_ ## NAME ## _seq); \
+    seq = (asdl_ ## NAME ## _seq *)_PyArena_Malloc(arena, n); \
+    if (!seq) { \
+        PyErr_NoMemory(); \
+        return NULL; \
+    } \
+    memset(seq, 0, n); \
+    seq->size = size; \
+    seq->elements = (void**)seq->typed_elements; \
+    return seq; \
+}
+
+#define asdl_seq_GET_UNTYPED(S, I) _Py_RVALUE((S)->elements[(I)])  /* read slot I through the untyped elements pointer */
+#define asdl_seq_GET(S, I) _Py_RVALUE((S)->typed_elements[(I)])  /* read slot I through the typed array */
+#define asdl_seq_LEN(S) _Py_RVALUE(((S) == NULL ? 0 : (S)->size))  /* NULL-safe: a NULL sequence reports length 0 */
+
+#ifdef Py_DEBUG  /* debug builds assert S != NULL and 0 <= I < size before storing */
+#  define asdl_seq_SET(S, I, V) \
+    do { \
+        Py_ssize_t _asdl_i = (I); \
+        assert((S) != NULL); \
+        assert(0 <= _asdl_i && _asdl_i < (S)->size); \
+        (S)->typed_elements[_asdl_i] = (V); \
+    } while (0)
+#else  /* !Py_DEBUG: unchecked store into the typed array */
+#  define asdl_seq_SET(S, I, V) _Py_RVALUE((S)->typed_elements[(I)] = (V))
+#endif  /* Py_DEBUG */
+
+#ifdef Py_DEBUG  /* same checks as asdl_seq_SET, storing through the untyped elements pointer */
+#  define asdl_seq_SET_UNTYPED(S, I, V) \
+    do { \
+        Py_ssize_t _asdl_i = (I); \
+        assert((S) != NULL); \
+        assert(0 <= _asdl_i && _asdl_i < (S)->size); \
+        (S)->elements[_asdl_i] = (V); \
+    } while (0)
+#else  /* !Py_DEBUG: unchecked store through the untyped elements pointer */
+#  define asdl_seq_SET_UNTYPED(S, I, V) _Py_RVALUE((S)->elements[(I)] = (V))
+#endif  /* Py_DEBUG */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_ASDL_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_ast.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_ast.h
new file mode 100644
index 000000000000..b568902bb1e3
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_ast.h
@@ -0,0 +1,922 @@
+// File automatically generated by Parser/asdl_c.py.
+
+#ifndef Py_INTERNAL_AST_H
+#define Py_INTERNAL_AST_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_asdl.h"
+
+typedef struct _mod *mod_ty;
+
+typedef struct _stmt *stmt_ty;
+
+typedef struct _expr *expr_ty;
+
+typedef enum _expr_context { Load=1, Store=2, Del=3 } expr_context_ty;
+
+typedef enum _boolop { And=1, Or=2 } boolop_ty;
+
+typedef enum _operator { Add=1, Sub=2, Mult=3, MatMult=4, Div=5, Mod=6, Pow=7,
+                         LShift=8, RShift=9, BitOr=10, BitXor=11, BitAnd=12,
+                         FloorDiv=13 } operator_ty;
+
+typedef enum _unaryop { Invert=1, Not=2, UAdd=3, USub=4 } unaryop_ty;
+
+typedef enum _cmpop { Eq=1, NotEq=2, Lt=3, LtE=4, Gt=5, GtE=6, Is=7, IsNot=8,
+                      In=9, NotIn=10 } cmpop_ty;
+
+typedef struct _comprehension *comprehension_ty;
+
+typedef struct _excepthandler *excepthandler_ty;
+
+typedef struct _arguments *arguments_ty;
+
+typedef struct _arg *arg_ty;
+
+typedef struct _keyword *keyword_ty;
+
+typedef struct _alias *alias_ty;
+
+typedef struct _withitem *withitem_ty;
+
+typedef struct _match_case *match_case_ty;
+
+typedef struct _pattern *pattern_ty;
+
+typedef struct _type_ignore *type_ignore_ty;
+
+typedef struct _type_param *type_param_ty;
+
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    mod_ty typed_elements[1];
+} asdl_mod_seq;
+
+asdl_mod_seq *_Py_asdl_mod_seq_new(Py_ssize_t size, PyArena *arena);
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    stmt_ty typed_elements[1];
+} asdl_stmt_seq;
+
+asdl_stmt_seq *_Py_asdl_stmt_seq_new(Py_ssize_t size, PyArena *arena);
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    expr_ty typed_elements[1];
+} asdl_expr_seq;
+
+asdl_expr_seq *_Py_asdl_expr_seq_new(Py_ssize_t size, PyArena *arena);
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    comprehension_ty typed_elements[1];
+} asdl_comprehension_seq;
+
+asdl_comprehension_seq *_Py_asdl_comprehension_seq_new(Py_ssize_t size, PyArena
+                                                       *arena);
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    excepthandler_ty typed_elements[1];
+} asdl_excepthandler_seq;
+
+asdl_excepthandler_seq *_Py_asdl_excepthandler_seq_new(Py_ssize_t size, PyArena
+                                                       *arena);
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    arguments_ty typed_elements[1];
+} asdl_arguments_seq;
+
+asdl_arguments_seq *_Py_asdl_arguments_seq_new(Py_ssize_t size, PyArena *arena);
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    arg_ty typed_elements[1];
+} asdl_arg_seq;
+
+asdl_arg_seq *_Py_asdl_arg_seq_new(Py_ssize_t size, PyArena *arena);
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    keyword_ty typed_elements[1];
+} asdl_keyword_seq;
+
+asdl_keyword_seq *_Py_asdl_keyword_seq_new(Py_ssize_t size, PyArena *arena);
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    alias_ty typed_elements[1];
+} asdl_alias_seq;
+
+asdl_alias_seq *_Py_asdl_alias_seq_new(Py_ssize_t size, PyArena *arena);
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    withitem_ty typed_elements[1];
+} asdl_withitem_seq;
+
+asdl_withitem_seq *_Py_asdl_withitem_seq_new(Py_ssize_t size, PyArena *arena);
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    match_case_ty typed_elements[1];
+} asdl_match_case_seq;
+
+asdl_match_case_seq *_Py_asdl_match_case_seq_new(Py_ssize_t size, PyArena
+                                                 *arena);
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    pattern_ty typed_elements[1];
+} asdl_pattern_seq;
+
+asdl_pattern_seq *_Py_asdl_pattern_seq_new(Py_ssize_t size, PyArena *arena);
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    type_ignore_ty typed_elements[1];
+} asdl_type_ignore_seq;
+
+asdl_type_ignore_seq *_Py_asdl_type_ignore_seq_new(Py_ssize_t size, PyArena
+                                                   *arena);
+
+typedef struct {
+    _ASDL_SEQ_HEAD
+    type_param_ty typed_elements[1];
+} asdl_type_param_seq;
+
+asdl_type_param_seq *_Py_asdl_type_param_seq_new(Py_ssize_t size, PyArena
+                                                 *arena);
+
+
+enum _mod_kind {Module_kind=1, Interactive_kind=2, Expression_kind=3,
+                 FunctionType_kind=4};
+struct _mod {
+    enum _mod_kind kind;
+    union {
+        struct {
+            asdl_stmt_seq *body;
+            asdl_type_ignore_seq *type_ignores;
+        } Module;
+
+        struct {
+            asdl_stmt_seq *body;
+        } Interactive;
+
+        struct {
+            expr_ty body;
+        } Expression;
+
+        struct {
+            asdl_expr_seq *argtypes;
+            expr_ty returns;
+        } FunctionType;
+
+    } v;
+};
+
+enum _stmt_kind {FunctionDef_kind=1, AsyncFunctionDef_kind=2, ClassDef_kind=3,
+                  Return_kind=4, Delete_kind=5, Assign_kind=6,
+                  TypeAlias_kind=7, AugAssign_kind=8, AnnAssign_kind=9,
+                  For_kind=10, AsyncFor_kind=11, While_kind=12, If_kind=13,
+                  With_kind=14, AsyncWith_kind=15, Match_kind=16,
+                  Raise_kind=17, Try_kind=18, TryStar_kind=19, Assert_kind=20,
+                  Import_kind=21, ImportFrom_kind=22, Global_kind=23,
+                  Nonlocal_kind=24, Expr_kind=25, Pass_kind=26, Break_kind=27,
+                  Continue_kind=28};
+struct _stmt {
+    enum _stmt_kind kind;
+    union {
+        struct {
+            identifier name;
+            arguments_ty args;
+            asdl_stmt_seq *body;
+            asdl_expr_seq *decorator_list;
+            expr_ty returns;
+            string type_comment;
+            asdl_type_param_seq *type_params;
+        } FunctionDef;
+
+        struct {
+            identifier name;
+            arguments_ty args;
+            asdl_stmt_seq *body;
+            asdl_expr_seq *decorator_list;
+            expr_ty returns;
+            string type_comment;
+            asdl_type_param_seq *type_params;
+        } AsyncFunctionDef;
+
+        struct {
+            identifier name;
+            asdl_expr_seq *bases;
+            asdl_keyword_seq *keywords;
+            asdl_stmt_seq *body;
+            asdl_expr_seq *decorator_list;
+            asdl_type_param_seq *type_params;
+        } ClassDef;
+
+        struct {
+            expr_ty value;
+        } Return;
+
+        struct {
+            asdl_expr_seq *targets;
+        } Delete;
+
+        struct {
+            asdl_expr_seq *targets;
+            expr_ty value;
+            string type_comment;
+        } Assign;
+
+        struct {
+            expr_ty name;
+            asdl_type_param_seq *type_params;
+            expr_ty value;
+        } TypeAlias;
+
+        struct {
+            expr_ty target;
+            operator_ty op;
+            expr_ty value;
+        } AugAssign;
+
+        struct {
+            expr_ty target;
+            expr_ty annotation;
+            expr_ty value;
+            int simple;
+        } AnnAssign;
+
+        struct {
+            expr_ty target;
+            expr_ty iter;
+            asdl_stmt_seq *body;
+            asdl_stmt_seq *orelse;
+            string type_comment;
+        } For;
+
+        struct {
+            expr_ty target;
+            expr_ty iter;
+            asdl_stmt_seq *body;
+            asdl_stmt_seq *orelse;
+            string type_comment;
+        } AsyncFor;
+
+        struct {
+            expr_ty test;
+            asdl_stmt_seq *body;
+            asdl_stmt_seq *orelse;
+        } While;
+
+        struct {
+            expr_ty test;
+            asdl_stmt_seq *body;
+            asdl_stmt_seq *orelse;
+        } If;
+
+        struct {
+            asdl_withitem_seq *items;
+            asdl_stmt_seq *body;
+            string type_comment;
+        } With;
+
+        struct {
+            asdl_withitem_seq *items;
+            asdl_stmt_seq *body;
+            string type_comment;
+        } AsyncWith;
+
+        struct {
+            expr_ty subject;
+            asdl_match_case_seq *cases;
+        } Match;
+
+        struct {
+            expr_ty exc;
+            expr_ty cause;
+        } Raise;
+
+        struct {
+            asdl_stmt_seq *body;
+            asdl_excepthandler_seq *handlers;
+            asdl_stmt_seq *orelse;
+            asdl_stmt_seq *finalbody;
+        } Try;
+
+        struct {
+            asdl_stmt_seq *body;
+            asdl_excepthandler_seq *handlers;
+            asdl_stmt_seq *orelse;
+            asdl_stmt_seq *finalbody;
+        } TryStar;
+
+        struct {
+            expr_ty test;
+            expr_ty msg;
+        } Assert;
+
+        struct {
+            asdl_alias_seq *names;
+        } Import;
+
+        struct {
+            identifier module;
+            asdl_alias_seq *names;
+            int level;
+        } ImportFrom;
+
+        struct {
+            asdl_identifier_seq *names;
+        } Global;
+
+        struct {
+            asdl_identifier_seq *names;
+        } Nonlocal;
+
+        struct {
+            expr_ty value;
+        } Expr;
+
+    } v;
+    int lineno;
+    int col_offset;
+    int end_lineno;
+    int end_col_offset;
+};
+
+enum _expr_kind {BoolOp_kind=1, NamedExpr_kind=2, BinOp_kind=3, UnaryOp_kind=4,
+                  Lambda_kind=5, IfExp_kind=6, Dict_kind=7, Set_kind=8,
+                  ListComp_kind=9, SetComp_kind=10, DictComp_kind=11,
+                  GeneratorExp_kind=12, Await_kind=13, Yield_kind=14,
+                  YieldFrom_kind=15, Compare_kind=16, Call_kind=17,
+                  FormattedValue_kind=18, JoinedStr_kind=19, Constant_kind=20,
+                  Attribute_kind=21, Subscript_kind=22, Starred_kind=23,
+                  Name_kind=24, List_kind=25, Tuple_kind=26, Slice_kind=27};
+struct _expr {
+    enum _expr_kind kind;
+    union {
+        struct {
+            boolop_ty op;
+            asdl_expr_seq *values;
+        } BoolOp;
+
+        struct {
+            expr_ty target;
+            expr_ty value;
+        } NamedExpr;
+
+        struct {
+            expr_ty left;
+            operator_ty op;
+            expr_ty right;
+        } BinOp;
+
+        struct {
+            unaryop_ty op;
+            expr_ty operand;
+        } UnaryOp;
+
+        struct {
+            arguments_ty args;
+            expr_ty body;
+        } Lambda;
+
+        struct {
+            expr_ty test;
+            expr_ty body;
+            expr_ty orelse;
+        } IfExp;
+
+        struct {
+            asdl_expr_seq *keys;
+            asdl_expr_seq *values;
+        } Dict;
+
+        struct {
+            asdl_expr_seq *elts;
+        } Set;
+
+        struct {
+            expr_ty elt;
+            asdl_comprehension_seq *generators;
+        } ListComp;
+
+        struct {
+            expr_ty elt;
+            asdl_comprehension_seq *generators;
+        } SetComp;
+
+        struct {
+            expr_ty key;
+            expr_ty value;
+            asdl_comprehension_seq *generators;
+        } DictComp;
+
+        struct {
+            expr_ty elt;
+            asdl_comprehension_seq *generators;
+        } GeneratorExp;
+
+        struct {
+            expr_ty value;
+        } Await;
+
+        struct {
+            expr_ty value;
+        } Yield;
+
+        struct {
+            expr_ty value;
+        } YieldFrom;
+
+        struct {
+            expr_ty left;
+            asdl_int_seq *ops;
+            asdl_expr_seq *comparators;
+        } Compare;
+
+        struct {
+            expr_ty func;
+            asdl_expr_seq *args;
+            asdl_keyword_seq *keywords;
+        } Call;
+
+        struct {
+            expr_ty value;
+            int conversion;
+            expr_ty format_spec;
+        } FormattedValue;
+
+        struct {
+            asdl_expr_seq *values;
+        } JoinedStr;
+
+        struct {
+            constant value;
+            string kind;
+        } Constant;
+
+        struct {
+            expr_ty value;
+            identifier attr;
+            expr_context_ty ctx;
+        } Attribute;
+
+        struct {
+            expr_ty value;
+            expr_ty slice;
+            expr_context_ty ctx;
+        } Subscript;
+
+        struct {
+            expr_ty value;
+            expr_context_ty ctx;
+        } Starred;
+
+        struct {
+            identifier id;
+            expr_context_ty ctx;
+        } Name;
+
+        struct {
+            asdl_expr_seq *elts;
+            expr_context_ty ctx;
+        } List;
+
+        struct {
+            asdl_expr_seq *elts;
+            expr_context_ty ctx;
+        } Tuple;
+
+        struct {
+            expr_ty lower;
+            expr_ty upper;
+            expr_ty step;
+        } Slice;
+
+    } v;
+    int lineno;
+    int col_offset;
+    int end_lineno;
+    int end_col_offset;
+};
+
+struct _comprehension {
+    expr_ty target;
+    expr_ty iter;
+    asdl_expr_seq *ifs;
+    int is_async;
+};
+
+enum _excepthandler_kind {ExceptHandler_kind=1};
+struct _excepthandler {
+    enum _excepthandler_kind kind;
+    union {
+        struct {
+            expr_ty type;
+            identifier name;
+            asdl_stmt_seq *body;
+        } ExceptHandler;
+
+    } v;
+    int lineno;
+    int col_offset;
+    int end_lineno;
+    int end_col_offset;
+};
+
+struct _arguments {
+    asdl_arg_seq *posonlyargs;
+    asdl_arg_seq *args;
+    arg_ty vararg;
+    asdl_arg_seq *kwonlyargs;
+    asdl_expr_seq *kw_defaults;
+    arg_ty kwarg;
+    asdl_expr_seq *defaults;
+};
+
+struct _arg {
+    identifier arg;
+    expr_ty annotation;
+    string type_comment;
+    int lineno;
+    int col_offset;
+    int end_lineno;
+    int end_col_offset;
+};
+
+struct _keyword {
+    identifier arg;
+    expr_ty value;
+    int lineno;
+    int col_offset;
+    int end_lineno;
+    int end_col_offset;
+};
+
+struct _alias {
+    identifier name;
+    identifier asname;
+    int lineno;
+    int col_offset;
+    int end_lineno;
+    int end_col_offset;
+};
+
+struct _withitem {
+    expr_ty context_expr;
+    expr_ty optional_vars;
+};
+
+struct _match_case {
+    pattern_ty pattern;
+    expr_ty guard;
+    asdl_stmt_seq *body;
+};
+
+enum _pattern_kind {MatchValue_kind=1, MatchSingleton_kind=2,
+                     MatchSequence_kind=3, MatchMapping_kind=4,
+                     MatchClass_kind=5, MatchStar_kind=6, MatchAs_kind=7,
+                     MatchOr_kind=8};
+struct _pattern {
+    enum _pattern_kind kind;
+    union {
+        struct {
+            expr_ty value;
+        } MatchValue;
+
+        struct {
+            constant value;
+        } MatchSingleton;
+
+        struct {
+            asdl_pattern_seq *patterns;
+        } MatchSequence;
+
+        struct {
+            asdl_expr_seq *keys;
+            asdl_pattern_seq *patterns;
+            identifier rest;
+        } MatchMapping;
+
+        struct {
+            expr_ty cls;
+            asdl_pattern_seq *patterns;
+            asdl_identifier_seq *kwd_attrs;
+            asdl_pattern_seq *kwd_patterns;
+        } MatchClass;
+
+        struct {
+            identifier name;
+        } MatchStar;
+
+        struct {
+            pattern_ty pattern;
+            identifier name;
+        } MatchAs;
+
+        struct {
+            asdl_pattern_seq *patterns;
+        } MatchOr;
+
+    } v;
+    int lineno;
+    int col_offset;
+    int end_lineno;
+    int end_col_offset;
+};
+
+enum _type_ignore_kind {TypeIgnore_kind=1};
+struct _type_ignore {
+    enum _type_ignore_kind kind;
+    union {
+        struct {
+            int lineno;
+            string tag;
+        } TypeIgnore;
+
+    } v;
+};
+
+enum _type_param_kind {TypeVar_kind=1, ParamSpec_kind=2, TypeVarTuple_kind=3};
+struct _type_param {
+    enum _type_param_kind kind;
+    union {
+        struct {
+            identifier name;
+            expr_ty bound;
+        } TypeVar;
+
+        struct {
+            identifier name;
+        } ParamSpec;
+
+        struct {
+            identifier name;
+        } TypeVarTuple;
+
+    } v;
+    int lineno;
+    int col_offset;
+    int end_lineno;
+    int end_col_offset;
+};
+
+
+// Note: these macros affect function definitions, not only call sites.
+mod_ty _PyAST_Module(asdl_stmt_seq * body, asdl_type_ignore_seq * type_ignores,
+                     PyArena *arena);
+mod_ty _PyAST_Interactive(asdl_stmt_seq * body, PyArena *arena);
+mod_ty _PyAST_Expression(expr_ty body, PyArena *arena);
+mod_ty _PyAST_FunctionType(asdl_expr_seq * argtypes, expr_ty returns, PyArena
+                           *arena);
+stmt_ty _PyAST_FunctionDef(identifier name, arguments_ty args, asdl_stmt_seq *
+                           body, asdl_expr_seq * decorator_list, expr_ty
+                           returns, string type_comment, asdl_type_param_seq *
+                           type_params, int lineno, int col_offset, int
+                           end_lineno, int end_col_offset, PyArena *arena);
+stmt_ty _PyAST_AsyncFunctionDef(identifier name, arguments_ty args,
+                                asdl_stmt_seq * body, asdl_expr_seq *
+                                decorator_list, expr_ty returns, string
+                                type_comment, asdl_type_param_seq *
+                                type_params, int lineno, int col_offset, int
+                                end_lineno, int end_col_offset, PyArena *arena);
+stmt_ty _PyAST_ClassDef(identifier name, asdl_expr_seq * bases,
+                        asdl_keyword_seq * keywords, asdl_stmt_seq * body,
+                        asdl_expr_seq * decorator_list, asdl_type_param_seq *
+                        type_params, int lineno, int col_offset, int
+                        end_lineno, int end_col_offset, PyArena *arena);
+stmt_ty _PyAST_Return(expr_ty value, int lineno, int col_offset, int
+                      end_lineno, int end_col_offset, PyArena *arena);
+stmt_ty _PyAST_Delete(asdl_expr_seq * targets, int lineno, int col_offset, int
+                      end_lineno, int end_col_offset, PyArena *arena);
+stmt_ty _PyAST_Assign(asdl_expr_seq * targets, expr_ty value, string
+                      type_comment, int lineno, int col_offset, int end_lineno,
+                      int end_col_offset, PyArena *arena);
+stmt_ty _PyAST_TypeAlias(expr_ty name, asdl_type_param_seq * type_params,
+                         expr_ty value, int lineno, int col_offset, int
+                         end_lineno, int end_col_offset, PyArena *arena);
+stmt_ty _PyAST_AugAssign(expr_ty target, operator_ty op, expr_ty value, int
+                         lineno, int col_offset, int end_lineno, int
+                         end_col_offset, PyArena *arena);
+stmt_ty _PyAST_AnnAssign(expr_ty target, expr_ty annotation, expr_ty value, int
+                         simple, int lineno, int col_offset, int end_lineno,
+                         int end_col_offset, PyArena *arena);
+stmt_ty _PyAST_For(expr_ty target, expr_ty iter, asdl_stmt_seq * body,
+                   asdl_stmt_seq * orelse, string type_comment, int lineno, int
+                   col_offset, int end_lineno, int end_col_offset, PyArena
+                   *arena);
+stmt_ty _PyAST_AsyncFor(expr_ty target, expr_ty iter, asdl_stmt_seq * body,
+                        asdl_stmt_seq * orelse, string type_comment, int
+                        lineno, int col_offset, int end_lineno, int
+                        end_col_offset, PyArena *arena);
+stmt_ty _PyAST_While(expr_ty test, asdl_stmt_seq * body, asdl_stmt_seq *
+                     orelse, int lineno, int col_offset, int end_lineno, int
+                     end_col_offset, PyArena *arena);
+stmt_ty _PyAST_If(expr_ty test, asdl_stmt_seq * body, asdl_stmt_seq * orelse,
+                  int lineno, int col_offset, int end_lineno, int
+                  end_col_offset, PyArena *arena);
+stmt_ty _PyAST_With(asdl_withitem_seq * items, asdl_stmt_seq * body, string
+                    type_comment, int lineno, int col_offset, int end_lineno,
+                    int end_col_offset, PyArena *arena);
+stmt_ty _PyAST_AsyncWith(asdl_withitem_seq * items, asdl_stmt_seq * body,
+                         string type_comment, int lineno, int col_offset, int
+                         end_lineno, int end_col_offset, PyArena *arena);
+stmt_ty _PyAST_Match(expr_ty subject, asdl_match_case_seq * cases, int lineno,
+                     int col_offset, int end_lineno, int end_col_offset,
+                     PyArena *arena);
+stmt_ty _PyAST_Raise(expr_ty exc, expr_ty cause, int lineno, int col_offset,
+                     int end_lineno, int end_col_offset, PyArena *arena);
+stmt_ty _PyAST_Try(asdl_stmt_seq * body, asdl_excepthandler_seq * handlers,
+                   asdl_stmt_seq * orelse, asdl_stmt_seq * finalbody, int
+                   lineno, int col_offset, int end_lineno, int end_col_offset,
+                   PyArena *arena);
+stmt_ty _PyAST_TryStar(asdl_stmt_seq * body, asdl_excepthandler_seq * handlers,
+                       asdl_stmt_seq * orelse, asdl_stmt_seq * finalbody, int
+                       lineno, int col_offset, int end_lineno, int
+                       end_col_offset, PyArena *arena);
+stmt_ty _PyAST_Assert(expr_ty test, expr_ty msg, int lineno, int col_offset,
+                      int end_lineno, int end_col_offset, PyArena *arena);
+stmt_ty _PyAST_Import(asdl_alias_seq * names, int lineno, int col_offset, int
+                      end_lineno, int end_col_offset, PyArena *arena);
+stmt_ty _PyAST_ImportFrom(identifier module, asdl_alias_seq * names, int level,
+                          int lineno, int col_offset, int end_lineno, int
+                          end_col_offset, PyArena *arena);
+stmt_ty _PyAST_Global(asdl_identifier_seq * names, int lineno, int col_offset,
+                      int end_lineno, int end_col_offset, PyArena *arena);
+stmt_ty _PyAST_Nonlocal(asdl_identifier_seq * names, int lineno, int
+                        col_offset, int end_lineno, int end_col_offset, PyArena
+                        *arena);
+stmt_ty _PyAST_Expr(expr_ty value, int lineno, int col_offset, int end_lineno,
+                    int end_col_offset, PyArena *arena);
+stmt_ty _PyAST_Pass(int lineno, int col_offset, int end_lineno, int
+                    end_col_offset, PyArena *arena);
+stmt_ty _PyAST_Break(int lineno, int col_offset, int end_lineno, int
+                     end_col_offset, PyArena *arena);
+stmt_ty _PyAST_Continue(int lineno, int col_offset, int end_lineno, int
+                        end_col_offset, PyArena *arena);
+expr_ty _PyAST_BoolOp(boolop_ty op, asdl_expr_seq * values, int lineno, int
+                      col_offset, int end_lineno, int end_col_offset, PyArena
+                      *arena);
+expr_ty _PyAST_NamedExpr(expr_ty target, expr_ty value, int lineno, int
+                         col_offset, int end_lineno, int end_col_offset,
+                         PyArena *arena);
+expr_ty _PyAST_BinOp(expr_ty left, operator_ty op, expr_ty right, int lineno,
+                     int col_offset, int end_lineno, int end_col_offset,
+                     PyArena *arena);
+expr_ty _PyAST_UnaryOp(unaryop_ty op, expr_ty operand, int lineno, int
+                       col_offset, int end_lineno, int end_col_offset, PyArena
+                       *arena);
+expr_ty _PyAST_Lambda(arguments_ty args, expr_ty body, int lineno, int
+                      col_offset, int end_lineno, int end_col_offset, PyArena
+                      *arena);
+expr_ty _PyAST_IfExp(expr_ty test, expr_ty body, expr_ty orelse, int lineno,
+                     int col_offset, int end_lineno, int end_col_offset,
+                     PyArena *arena);
+expr_ty _PyAST_Dict(asdl_expr_seq * keys, asdl_expr_seq * values, int lineno,
+                    int col_offset, int end_lineno, int end_col_offset, PyArena
+                    *arena);
+expr_ty _PyAST_Set(asdl_expr_seq * elts, int lineno, int col_offset, int
+                   end_lineno, int end_col_offset, PyArena *arena);
+expr_ty _PyAST_ListComp(expr_ty elt, asdl_comprehension_seq * generators, int
+                        lineno, int col_offset, int end_lineno, int
+                        end_col_offset, PyArena *arena);
+expr_ty _PyAST_SetComp(expr_ty elt, asdl_comprehension_seq * generators, int
+                       lineno, int col_offset, int end_lineno, int
+                       end_col_offset, PyArena *arena);
+expr_ty _PyAST_DictComp(expr_ty key, expr_ty value, asdl_comprehension_seq *
+                        generators, int lineno, int col_offset, int end_lineno,
+                        int end_col_offset, PyArena *arena);
+expr_ty _PyAST_GeneratorExp(expr_ty elt, asdl_comprehension_seq * generators,
+                            int lineno, int col_offset, int end_lineno, int
+                            end_col_offset, PyArena *arena);
+expr_ty _PyAST_Await(expr_ty value, int lineno, int col_offset, int end_lineno,
+                     int end_col_offset, PyArena *arena);
+expr_ty _PyAST_Yield(expr_ty value, int lineno, int col_offset, int end_lineno,
+                     int end_col_offset, PyArena *arena);
+expr_ty _PyAST_YieldFrom(expr_ty value, int lineno, int col_offset, int
+                         end_lineno, int end_col_offset, PyArena *arena);
+expr_ty _PyAST_Compare(expr_ty left, asdl_int_seq * ops, asdl_expr_seq *
+                       comparators, int lineno, int col_offset, int end_lineno,
+                       int end_col_offset, PyArena *arena);
+expr_ty _PyAST_Call(expr_ty func, asdl_expr_seq * args, asdl_keyword_seq *
+                    keywords, int lineno, int col_offset, int end_lineno, int
+                    end_col_offset, PyArena *arena);
+expr_ty _PyAST_FormattedValue(expr_ty value, int conversion, expr_ty
+                              format_spec, int lineno, int col_offset, int
+                              end_lineno, int end_col_offset, PyArena *arena);
+expr_ty _PyAST_JoinedStr(asdl_expr_seq * values, int lineno, int col_offset,
+                         int end_lineno, int end_col_offset, PyArena *arena);
+expr_ty _PyAST_Constant(constant value, string kind, int lineno, int
+                        col_offset, int end_lineno, int end_col_offset, PyArena
+                        *arena);
+expr_ty _PyAST_Attribute(expr_ty value, identifier attr, expr_context_ty ctx,
+                         int lineno, int col_offset, int end_lineno, int
+                         end_col_offset, PyArena *arena);
+expr_ty _PyAST_Subscript(expr_ty value, expr_ty slice, expr_context_ty ctx, int
+                         lineno, int col_offset, int end_lineno, int
+                         end_col_offset, PyArena *arena);
+expr_ty _PyAST_Starred(expr_ty value, expr_context_ty ctx, int lineno, int
+                       col_offset, int end_lineno, int end_col_offset, PyArena
+                       *arena);
+expr_ty _PyAST_Name(identifier id, expr_context_ty ctx, int lineno, int
+                    col_offset, int end_lineno, int end_col_offset, PyArena
+                    *arena);
+expr_ty _PyAST_List(asdl_expr_seq * elts, expr_context_ty ctx, int lineno, int
+                    col_offset, int end_lineno, int end_col_offset, PyArena
+                    *arena);
+expr_ty _PyAST_Tuple(asdl_expr_seq * elts, expr_context_ty ctx, int lineno, int
+                     col_offset, int end_lineno, int end_col_offset, PyArena
+                     *arena);
+expr_ty _PyAST_Slice(expr_ty lower, expr_ty upper, expr_ty step, int lineno,
+                     int col_offset, int end_lineno, int end_col_offset,
+                     PyArena *arena);
+comprehension_ty _PyAST_comprehension(expr_ty target, expr_ty iter,
+                                      asdl_expr_seq * ifs, int is_async,
+                                      PyArena *arena);
+excepthandler_ty _PyAST_ExceptHandler(expr_ty type, identifier name,
+                                      asdl_stmt_seq * body, int lineno, int
+                                      col_offset, int end_lineno, int
+                                      end_col_offset, PyArena *arena);
+arguments_ty _PyAST_arguments(asdl_arg_seq * posonlyargs, asdl_arg_seq * args,
+                              arg_ty vararg, asdl_arg_seq * kwonlyargs,
+                              asdl_expr_seq * kw_defaults, arg_ty kwarg,
+                              asdl_expr_seq * defaults, PyArena *arena);
+arg_ty _PyAST_arg(identifier arg, expr_ty annotation, string type_comment, int
+                  lineno, int col_offset, int end_lineno, int end_col_offset,
+                  PyArena *arena);
+keyword_ty _PyAST_keyword(identifier arg, expr_ty value, int lineno, int
+                          col_offset, int end_lineno, int end_col_offset,
+                          PyArena *arena);
+alias_ty _PyAST_alias(identifier name, identifier asname, int lineno, int
+                      col_offset, int end_lineno, int end_col_offset, PyArena
+                      *arena);
+withitem_ty _PyAST_withitem(expr_ty context_expr, expr_ty optional_vars,
+                            PyArena *arena);
+match_case_ty _PyAST_match_case(pattern_ty pattern, expr_ty guard,
+                                asdl_stmt_seq * body, PyArena *arena);
+pattern_ty _PyAST_MatchValue(expr_ty value, int lineno, int col_offset, int
+                             end_lineno, int end_col_offset, PyArena *arena);
+pattern_ty _PyAST_MatchSingleton(constant value, int lineno, int col_offset,
+                                 int end_lineno, int end_col_offset, PyArena
+                                 *arena);
+pattern_ty _PyAST_MatchSequence(asdl_pattern_seq * patterns, int lineno, int
+                                col_offset, int end_lineno, int end_col_offset,
+                                PyArena *arena);
+pattern_ty _PyAST_MatchMapping(asdl_expr_seq * keys, asdl_pattern_seq *
+                               patterns, identifier rest, int lineno, int
+                               col_offset, int end_lineno, int end_col_offset,
+                               PyArena *arena);
+pattern_ty _PyAST_MatchClass(expr_ty cls, asdl_pattern_seq * patterns,
+                             asdl_identifier_seq * kwd_attrs, asdl_pattern_seq
+                             * kwd_patterns, int lineno, int col_offset, int
+                             end_lineno, int end_col_offset, PyArena *arena);
+pattern_ty _PyAST_MatchStar(identifier name, int lineno, int col_offset, int
+                            end_lineno, int end_col_offset, PyArena *arena);
+pattern_ty _PyAST_MatchAs(pattern_ty pattern, identifier name, int lineno, int
+                          col_offset, int end_lineno, int end_col_offset,
+                          PyArena *arena);
+pattern_ty _PyAST_MatchOr(asdl_pattern_seq * patterns, int lineno, int
+                          col_offset, int end_lineno, int end_col_offset,
+                          PyArena *arena);
+type_ignore_ty _PyAST_TypeIgnore(int lineno, string tag, PyArena *arena);
+type_param_ty _PyAST_TypeVar(identifier name, expr_ty bound, int lineno, int
+                             col_offset, int end_lineno, int end_col_offset,
+                             PyArena *arena);
+type_param_ty _PyAST_ParamSpec(identifier name, int lineno, int col_offset, int
+                               end_lineno, int end_col_offset, PyArena *arena);
+type_param_ty _PyAST_TypeVarTuple(identifier name, int lineno, int col_offset,
+                                  int end_lineno, int end_col_offset, PyArena
+                                  *arena);
+
+
+PyObject* PyAST_mod2obj(mod_ty t);  /* NOTE(review): presumably converts a C mod_ty tree to a Python object -- confirm */
+mod_ty PyAST_obj2mod(PyObject* ast, PyArena* arena, int mode);  /* NOTE(review): presumably the inverse; meaning of `mode` is not visible here */
+int PyAST_Check(PyObject* obj);  /* NOTE(review): presumably a type test on obj; return convention not visible here */
+
+extern int _PyAST_Validate(mod_ty);
+
+/* _PyAST_ExprAsUnicode is defined in ast_unparse.c */
+extern PyObject* _PyAST_ExprAsUnicode(expr_ty);
+
+/* Return a borrowed reference to the first literal string in the
+   sequence of statements, or NULL if the sequence does not start with a
+   literal string.  Does not set an exception. */
+extern PyObject* _PyAST_GetDocString(asdl_stmt_seq *);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_AST_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_ast_state.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_ast_state.h
new file mode 100644
index 000000000000..863c73b0d6e4
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_ast_state.h
@@ -0,0 +1,265 @@
+// File automatically generated by Parser/asdl_c.py.
+
+#ifndef Py_INTERNAL_AST_STATE_H
+#define Py_INTERNAL_AST_STATE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+struct ast_state {  /* Flat table of PyObject* slots; this header is generated by Parser/asdl_c.py. */
+    int initialized;  /* NOTE(review): presumably nonzero once the slots below are filled -- confirm */
+    int unused_recursion_depth;  /* named "unused"; NOTE(review): presumably kept only to preserve layout -- confirm */
+    int unused_recursion_limit;  /* named "unused"; see the note on the previous field */
+    PyObject *AST_type;  /* Capitalized slots follow in ASCII name order: <Name>_type, plus <Name>_singleton for some names */
+    PyObject *Add_singleton;  /* NOTE(review): presumably a shared instance of the matching *_type -- confirm */
+    PyObject *Add_type;
+    PyObject *And_singleton;
+    PyObject *And_type;
+    PyObject *AnnAssign_type;
+    PyObject *Assert_type;
+    PyObject *Assign_type;
+    PyObject *AsyncFor_type;
+    PyObject *AsyncFunctionDef_type;
+    PyObject *AsyncWith_type;
+    PyObject *Attribute_type;
+    PyObject *AugAssign_type;
+    PyObject *Await_type;
+    PyObject *BinOp_type;
+    PyObject *BitAnd_singleton;
+    PyObject *BitAnd_type;
+    PyObject *BitOr_singleton;
+    PyObject *BitOr_type;
+    PyObject *BitXor_singleton;
+    PyObject *BitXor_type;
+    PyObject *BoolOp_type;
+    PyObject *Break_type;
+    PyObject *Call_type;
+    PyObject *ClassDef_type;
+    PyObject *Compare_type;
+    PyObject *Constant_type;
+    PyObject *Continue_type;
+    PyObject *Del_singleton;
+    PyObject *Del_type;
+    PyObject *Delete_type;
+    PyObject *DictComp_type;
+    PyObject *Dict_type;
+    PyObject *Div_singleton;
+    PyObject *Div_type;
+    PyObject *Eq_singleton;
+    PyObject *Eq_type;
+    PyObject *ExceptHandler_type;
+    PyObject *Expr_type;
+    PyObject *Expression_type;
+    PyObject *FloorDiv_singleton;
+    PyObject *FloorDiv_type;
+    PyObject *For_type;
+    PyObject *FormattedValue_type;
+    PyObject *FunctionDef_type;
+    PyObject *FunctionType_type;
+    PyObject *GeneratorExp_type;
+    PyObject *Global_type;
+    PyObject *GtE_singleton;
+    PyObject *GtE_type;
+    PyObject *Gt_singleton;
+    PyObject *Gt_type;
+    PyObject *IfExp_type;
+    PyObject *If_type;
+    PyObject *ImportFrom_type;
+    PyObject *Import_type;
+    PyObject *In_singleton;
+    PyObject *In_type;
+    PyObject *Interactive_type;
+    PyObject *Invert_singleton;
+    PyObject *Invert_type;
+    PyObject *IsNot_singleton;
+    PyObject *IsNot_type;
+    PyObject *Is_singleton;
+    PyObject *Is_type;
+    PyObject *JoinedStr_type;
+    PyObject *LShift_singleton;
+    PyObject *LShift_type;
+    PyObject *Lambda_type;
+    PyObject *ListComp_type;
+    PyObject *List_type;
+    PyObject *Load_singleton;
+    PyObject *Load_type;
+    PyObject *LtE_singleton;
+    PyObject *LtE_type;
+    PyObject *Lt_singleton;
+    PyObject *Lt_type;
+    PyObject *MatMult_singleton;
+    PyObject *MatMult_type;
+    PyObject *MatchAs_type;
+    PyObject *MatchClass_type;
+    PyObject *MatchMapping_type;
+    PyObject *MatchOr_type;
+    PyObject *MatchSequence_type;
+    PyObject *MatchSingleton_type;
+    PyObject *MatchStar_type;
+    PyObject *MatchValue_type;
+    PyObject *Match_type;
+    PyObject *Mod_singleton;
+    PyObject *Mod_type;
+    PyObject *Module_type;
+    PyObject *Mult_singleton;
+    PyObject *Mult_type;
+    PyObject *Name_type;
+    PyObject *NamedExpr_type;
+    PyObject *Nonlocal_type;
+    PyObject *NotEq_singleton;
+    PyObject *NotEq_type;
+    PyObject *NotIn_singleton;
+    PyObject *NotIn_type;
+    PyObject *Not_singleton;
+    PyObject *Not_type;
+    PyObject *Or_singleton;
+    PyObject *Or_type;
+    PyObject *ParamSpec_type;
+    PyObject *Pass_type;
+    PyObject *Pow_singleton;
+    PyObject *Pow_type;
+    PyObject *RShift_singleton;
+    PyObject *RShift_type;
+    PyObject *Raise_type;
+    PyObject *Return_type;
+    PyObject *SetComp_type;
+    PyObject *Set_type;
+    PyObject *Slice_type;
+    PyObject *Starred_type;
+    PyObject *Store_singleton;
+    PyObject *Store_type;
+    PyObject *Sub_singleton;
+    PyObject *Sub_type;
+    PyObject *Subscript_type;
+    PyObject *TryStar_type;
+    PyObject *Try_type;
+    PyObject *Tuple_type;
+    PyObject *TypeAlias_type;
+    PyObject *TypeIgnore_type;
+    PyObject *TypeVarTuple_type;
+    PyObject *TypeVar_type;
+    PyObject *UAdd_singleton;
+    PyObject *UAdd_type;
+    PyObject *USub_singleton;
+    PyObject *USub_type;
+    PyObject *UnaryOp_type;
+    PyObject *While_type;
+    PyObject *With_type;
+    PyObject *YieldFrom_type;
+    PyObject *Yield_type;
+    PyObject *__dict__;  /* Dunder and lowercase slots start here; NOTE(review): presumably name strings plus lowercase *_type slots -- confirm */
+    PyObject *__doc__;
+    PyObject *__match_args__;
+    PyObject *__module__;
+    PyObject *_attributes;
+    PyObject *_fields;
+    PyObject *alias_type;
+    PyObject *annotation;
+    PyObject *arg;
+    PyObject *arg_type;
+    PyObject *args;
+    PyObject *argtypes;
+    PyObject *arguments_type;
+    PyObject *asname;
+    PyObject *ast;
+    PyObject *attr;
+    PyObject *bases;
+    PyObject *body;
+    PyObject *boolop_type;
+    PyObject *bound;
+    PyObject *cases;
+    PyObject *cause;
+    PyObject *cls;
+    PyObject *cmpop_type;
+    PyObject *col_offset;
+    PyObject *comparators;
+    PyObject *comprehension_type;
+    PyObject *context_expr;
+    PyObject *conversion;
+    PyObject *ctx;
+    PyObject *decorator_list;
+    PyObject *defaults;
+    PyObject *elt;
+    PyObject *elts;
+    PyObject *end_col_offset;
+    PyObject *end_lineno;
+    PyObject *exc;
+    PyObject *excepthandler_type;
+    PyObject *expr_context_type;
+    PyObject *expr_type;
+    PyObject *finalbody;
+    PyObject *format_spec;
+    PyObject *func;
+    PyObject *generators;
+    PyObject *guard;
+    PyObject *handlers;
+    PyObject *id;
+    PyObject *ifs;
+    PyObject *is_async;
+    PyObject *items;
+    PyObject *iter;
+    PyObject *key;
+    PyObject *keys;
+    PyObject *keyword_type;
+    PyObject *keywords;
+    PyObject *kind;
+    PyObject *kw_defaults;
+    PyObject *kwarg;
+    PyObject *kwd_attrs;
+    PyObject *kwd_patterns;
+    PyObject *kwonlyargs;
+    PyObject *left;
+    PyObject *level;
+    PyObject *lineno;
+    PyObject *lower;
+    PyObject *match_case_type;
+    PyObject *mod_type;
+    PyObject *module;
+    PyObject *msg;
+    PyObject *name;
+    PyObject *names;
+    PyObject *op;
+    PyObject *operand;
+    PyObject *operator_type;
+    PyObject *ops;
+    PyObject *optional_vars;
+    PyObject *orelse;
+    PyObject *pattern;
+    PyObject *pattern_type;
+    PyObject *patterns;
+    PyObject *posonlyargs;
+    PyObject *rest;
+    PyObject *returns;
+    PyObject *right;
+    PyObject *simple;
+    PyObject *slice;
+    PyObject *step;
+    PyObject *stmt_type;
+    PyObject *subject;
+    PyObject *tag;
+    PyObject *target;
+    PyObject *targets;
+    PyObject *test;
+    PyObject *type;
+    PyObject *type_comment;
+    PyObject *type_ignore_type;
+    PyObject *type_ignores;
+    PyObject *type_param_type;
+    PyObject *type_params;
+    PyObject *unaryop_type;
+    PyObject *upper;
+    PyObject *value;
+    PyObject *values;
+    PyObject *vararg;
+    PyObject *withitem_type;
+};
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_AST_STATE_H */
+
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_atexit.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_atexit.h
new file mode 100644
index 000000000000..63a2cd5d507d
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_atexit.h
@@ -0,0 +1,57 @@
+#ifndef Py_INTERNAL_ATEXIT_H
+#define Py_INTERNAL_ATEXIT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+//###############
+// runtime atexit
+
+typedef void (*atexit_callbackfunc)(void);  /* pointer to a function taking no arguments and returning nothing */
+
+struct _atexit_runtime_state {  /* runtime-wide atexit table: a fixed-size array of atexit_callbackfunc */
+    PyThread_type_lock mutex;  /* NOTE(review): presumably guards callbacks[]/ncallbacks -- confirm at the use sites */
+#define NEXITFUNCS 32  /* capacity of callbacks[] below */
+    atexit_callbackfunc callbacks[NEXITFUNCS];
+    int ncallbacks;  /* NOTE(review): presumably the number of slots in use (<= NEXITFUNCS) -- confirm */
+};
+
+
+//###################
+// interpreter atexit
+
+struct atexit_callback;  /* forward declaration; the struct is defined and typedef'd on the next lines */
+typedef struct atexit_callback {  /* one node: a callback, an opaque data pointer and a link to another node */
+    atexit_datacallbackfunc func;  /* type is not declared in this header; it must come from an earlier include */
+    void *data;  /* NOTE(review): presumably the argument handed to func -- confirm */
+    struct atexit_callback *next;  /* pointer to another node of the same type */
+} atexit_callback;
+
+typedef struct {  /* record of three PyObject* slots; NOTE(review): presumably invoked as func(*args, **kwargs) -- confirm */
+    PyObject *func;
+    PyObject *args;
+    PyObject *kwargs;
+} atexit_py_callback;
+
+struct atexit_state {  /* per-interpreter atexit state (see the "interpreter atexit" section banner above) */
+    atexit_callback *ll_callbacks;  /* NOTE(review): presumably head of the low-level callback chain linked via ->next -- confirm */
+    atexit_callback *last_ll_callback;  /* NOTE(review): presumably the tail of that chain -- confirm */
+
+    // XXX The rest of the state could be moved to the atexit module state
+    // and a low-level callback added for it during module exec.
+    // For the moment we leave it here.
+    atexit_py_callback **callbacks;  /* array of pointers to atexit_py_callback records */
+    int ncallbacks;  /* NOTE(review): presumably the number of entries in use -- confirm */
+    int callback_len;  /* NOTE(review): presumably the allocated capacity of callbacks -- confirm */
+};
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_ATEXIT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_atomic.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_atomic.h
new file mode 100644
index 000000000000..425d69f868b5
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_atomic.h
@@ -0,0 +1,557 @@
+#ifndef Py_ATOMIC_H
+#define Py_ATOMIC_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "dynamic_annotations.h"   /* _Py_ANNOTATE_MEMORY_ORDER */
+#include "pyconfig.h"
+
+#ifdef HAVE_STD_ATOMIC
+#  include <stdatomic.h>
+#endif
+
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#if defined(_M_IX86) || defined(_M_X64)
+#  include <immintrin.h>
+#endif
+#endif
+
+/* This is modeled after the atomics interface from C1x (the draft that
+ * was published as C11), according to the draft at
+ * http://www.open-std.org/JTC1/SC22/wg14/www/docs/n1425.pdf.
+ * Operations and types are named the same, except with a _Py_ prefix,
+ * and have the same semantics.
+ *
+ * Beware: the implementations here are deep magic.
+ */
+
+#if defined(HAVE_STD_ATOMIC)
+
+typedef enum _Py_memory_order {  /* _Py_ names bound to the <stdatomic.h> memory_order_* values */
+    _Py_memory_order_relaxed = memory_order_relaxed,
+    _Py_memory_order_acquire = memory_order_acquire,
+    _Py_memory_order_release = memory_order_release,
+    _Py_memory_order_acq_rel = memory_order_acq_rel,
+    _Py_memory_order_seq_cst = memory_order_seq_cst
+} _Py_memory_order;
+
+typedef struct _Py_atomic_address {  /* pointer-sized atomic cell; the macros in this branch access it via ->_value */
+    atomic_uintptr_t _value;
+} _Py_atomic_address;
+
+typedef struct _Py_atomic_int {  /* int-sized atomic cell; the macros in this branch access it via ->_value */
+    atomic_int _value;
+} _Py_atomic_int;
+
+#define _Py_atomic_signal_fence(/*memory_order*/ ORDER) \
+    atomic_signal_fence(ORDER)  /* forwards unchanged to the <stdatomic.h> function */
+
+#define _Py_atomic_thread_fence(/*memory_order*/ ORDER) \
+    atomic_thread_fence(ORDER)  /* forwards unchanged to the <stdatomic.h> function */
+
+#define _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, ORDER) \
+    atomic_store_explicit(&((ATOMIC_VAL)->_value), NEW_VAL, ORDER)  /* ATOMIC_VAL is a pointer to one of the wrapper structs */
+
+#define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER) \
+    atomic_load_explicit(&((ATOMIC_VAL)->_value), ORDER)  /* yields the loaded value of ->_value */
+
+// Use builtin atomic operations in GCC >= 4.7 and clang
+#elif defined(HAVE_BUILTIN_ATOMIC)
+
+typedef enum _Py_memory_order {  /* _Py_ names bound to the compiler's __ATOMIC_* constants */
+    _Py_memory_order_relaxed = __ATOMIC_RELAXED,
+    _Py_memory_order_acquire = __ATOMIC_ACQUIRE,
+    _Py_memory_order_release = __ATOMIC_RELEASE,
+    _Py_memory_order_acq_rel = __ATOMIC_ACQ_REL,
+    _Py_memory_order_seq_cst = __ATOMIC_SEQ_CST
+} _Py_memory_order;
+
+typedef struct _Py_atomic_address {  /* plain uintptr_t cell; this branch applies the __atomic_* builtins to &_value */
+    uintptr_t _value;
+} _Py_atomic_address;
+
+typedef struct _Py_atomic_int {  /* plain int cell; this branch applies the __atomic_* builtins to &_value */
+    int _value;
+} _Py_atomic_int;
+
+#define _Py_atomic_signal_fence(/*memory_order*/ ORDER) \
+    __atomic_signal_fence(ORDER)  /* forwards unchanged to the compiler builtin */
+
+#define _Py_atomic_thread_fence(/*memory_order*/ ORDER) \
+    __atomic_thread_fence(ORDER)  /* forwards unchanged to the compiler builtin */
+
+#define _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, ORDER) \
+    (assert((ORDER) == __ATOMIC_RELAXED                       \
+            || (ORDER) == __ATOMIC_SEQ_CST                    \
+            || (ORDER) == __ATOMIC_RELEASE),                  \
+     __atomic_store_n(&((ATOMIC_VAL)->_value), NEW_VAL, ORDER))  /* assert() admits only relaxed/seq_cst/release; the comma operator then performs the store */
+
+#define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER)           \
+    (assert((ORDER) == __ATOMIC_RELAXED                       \
+            || (ORDER) == __ATOMIC_SEQ_CST                    \
+            || (ORDER) == __ATOMIC_ACQUIRE                    \
+            || (ORDER) == __ATOMIC_CONSUME),                  \
+     __atomic_load_n(&((ATOMIC_VAL)->_value), ORDER))  /* assert() admits only relaxed/seq_cst/acquire/consume; the expression's value is the loaded value */
+
+/* Only support GCC (for expression statements) and x86 (for simple
+ * atomic semantics) and MSVC x86/x64/ARM */
+#elif defined(__GNUC__) && (defined(__i386__) || defined(__amd64))
+typedef enum _Py_memory_order {  /* no explicit values here: enumerators are 0..4 in declaration order */
+    _Py_memory_order_relaxed,
+    _Py_memory_order_acquire,
+    _Py_memory_order_release,
+    _Py_memory_order_acq_rel,
+    _Py_memory_order_seq_cst
+} _Py_memory_order;
+
+typedef struct _Py_atomic_address {  /* plain uintptr_t cell; this branch's macros read/write it through a volatile pointer */
+    uintptr_t _value;
+} _Py_atomic_address;
+
+typedef struct _Py_atomic_int {  /* plain int cell; this branch's macros read/write it through a volatile pointer */
+    int _value;
+} _Py_atomic_int;
+
+
+static __inline__ void  /* compiler-level fence for every order except relaxed */
+_Py_atomic_signal_fence(_Py_memory_order order)
+{
+    if (order != _Py_memory_order_relaxed)
+        __asm__ volatile("":::"memory");  /* empty asm body with a "memory" clobber: no instruction is named */
+}
+
+static __inline__ void  /* hardware fence for every order except relaxed */
+_Py_atomic_thread_fence(_Py_memory_order order)
+{
+    if (order != _Py_memory_order_relaxed)
+        __asm__ volatile("mfence":::"memory");  /* x86 mfence instruction plus a "memory" clobber */
+}
+
+/* Tell the race checker about this operation's effects. */
+static __inline__ void
+_Py_ANNOTATE_MEMORY_ORDER(const volatile void *address, _Py_memory_order order)
+{
+    (void)address;              /* shut up -Wunused-parameter */
+    switch(order) {  /* first pass: orders with release semantics get a HAPPENS_BEFORE annotation */
+    case _Py_memory_order_release:
+    case _Py_memory_order_acq_rel:
+    case _Py_memory_order_seq_cst:
+        _Py_ANNOTATE_HAPPENS_BEFORE(address);
+        break;
+    case _Py_memory_order_relaxed:
+    case _Py_memory_order_acquire:
+        break;  /* listed explicitly so every enumerator has a case */
+    }
+    switch(order) {  /* second pass: orders with acquire semantics get a HAPPENS_AFTER annotation */
+    case _Py_memory_order_acquire:
+    case _Py_memory_order_acq_rel:
+    case _Py_memory_order_seq_cst:
+        _Py_ANNOTATE_HAPPENS_AFTER(address);
+        break;
+    case _Py_memory_order_relaxed:
+    case _Py_memory_order_release:
+        break;  /* acq_rel and seq_cst therefore receive both annotations */
+    }
+}
+
+#define _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, ORDER) \
+    __extension__ ({ /* GNU statement expression; each macro argument is evaluated once into a local */ \
+        __typeof__(ATOMIC_VAL) atomic_val = ATOMIC_VAL; \
+        __typeof__(atomic_val->_value) new_val = NEW_VAL;\
+        volatile __typeof__(new_val) *volatile_data = &atomic_val->_value; \
+        _Py_memory_order order = ORDER; \
+        _Py_ANNOTATE_MEMORY_ORDER(atomic_val, order); \
+        \
+        /* Perform the operation. */ \
+        _Py_ANNOTATE_IGNORE_WRITES_BEGIN(); \
+        switch(order) { \
+        case _Py_memory_order_release: \
+            _Py_atomic_signal_fence(_Py_memory_order_release); \
+            /* fallthrough: compiler fence first, then the plain store below */ \
+        case _Py_memory_order_relaxed: \
+            *volatile_data = new_val; /* ordinary store through the volatile pointer */ \
+            break; \
+        \
+        case _Py_memory_order_acquire: \
+        case _Py_memory_order_acq_rel: \
+        case _Py_memory_order_seq_cst: \
+            __asm__ volatile("xchg %0, %1" \
+                         : "+r"(new_val) \
+                         : "m"(atomic_val->_value) /* NOTE(review): xchg also writes this operand although it is listed as an input */ \
+                         : "memory"); /* the value swapped back into new_val is not used afterwards */ \
+            break; \
+        } \
+        _Py_ANNOTATE_IGNORE_WRITES_END(); \
+    })
+
+#define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER) \
+    __extension__ ({  /* GNU statement expression; its value is `result` on the last line */ \
+        __typeof__(ATOMIC_VAL) atomic_val = ATOMIC_VAL; \
+        __typeof__(atomic_val->_value) result; \
+        volatile __typeof__(result) *volatile_data = &atomic_val->_value; \
+        _Py_memory_order order = ORDER; \
+        _Py_ANNOTATE_MEMORY_ORDER(atomic_val, order); \
+        \
+        /* Perform the operation. */ \
+        _Py_ANNOTATE_IGNORE_READS_BEGIN(); \
+        switch(order) { /* fence, if any, issued before the read */ \
+        case _Py_memory_order_release: \
+        case _Py_memory_order_acq_rel: \
+        case _Py_memory_order_seq_cst: \
+            /* Loads on x86 are not releases by default, so need a */ \
+            /* thread fence. */ \
+            _Py_atomic_thread_fence(_Py_memory_order_release); \
+            break; \
+        default: \
+            /* No fence */ \
+            break; \
+        } \
+        result = *volatile_data; /* the read itself, through the volatile pointer */ \
+        switch(order) { /* fence, if any, issued after the read */ \
+        case _Py_memory_order_acquire: \
+        case _Py_memory_order_acq_rel: \
+        case _Py_memory_order_seq_cst: \
+            /* Loads on x86 are automatically acquire operations so */ \
+            /* can get by with just a compiler fence. */ \
+            _Py_atomic_signal_fence(_Py_memory_order_acquire); \
+            break; \
+        default: \
+            /* No fence */ \
+            break; \
+        } \
+        _Py_ANNOTATE_IGNORE_READS_END(); \
+        result; \
+    })
+
+#elif defined(_MSC_VER)
+/*  _Interlocked* functions provide a full memory barrier and are therefore
+    enough for acq_rel and seq_cst. If the HLE variants aren't available
+    in hardware they will fall back to a full memory barrier as well.
+
+    This might affect performance, but likely only in some very specific and
+    hard-to-measure scenarios.
+*/
+#if defined(_M_IX86) || defined(_M_X64)
+typedef enum _Py_memory_order {  /* no explicit values here: enumerators are 0..4 in declaration order */
+    _Py_memory_order_relaxed,
+    _Py_memory_order_acquire,
+    _Py_memory_order_release,
+    _Py_memory_order_acq_rel,
+    _Py_memory_order_seq_cst
+} _Py_memory_order;
+
+typedef struct _Py_atomic_address {  /* pointer-sized cell; in this branch the field itself is volatile */
+    volatile uintptr_t _value;
+} _Py_atomic_address;
+
+typedef struct _Py_atomic_int {  /* int-sized cell; in this branch the field itself is volatile */
+    volatile int _value;
+} _Py_atomic_int;
+
+
+#if defined(_M_X64)  /* 64-bit store is only implemented for x64 */
+#define _Py_atomic_store_64bit(ATOMIC_VAL, NEW_VAL, ORDER) \
+    switch (ORDER) { \
+    case _Py_memory_order_acquire: \
+      _InterlockedExchange64_HLEAcquire((__int64 volatile*)&((ATOMIC_VAL)->_value), (__int64)(NEW_VAL)); \
+      break; \
+    case _Py_memory_order_release: \
+      _InterlockedExchange64_HLERelease((__int64 volatile*)&((ATOMIC_VAL)->_value), (__int64)(NEW_VAL)); \
+      break; \
+    default: \
+      _InterlockedExchange64((__int64 volatile*)&((ATOMIC_VAL)->_value), (__int64)(NEW_VAL)); \
+      break; \
+  }  /* expands to a bare switch statement (not an expression); the exchange's return value is ignored */
+#else
+#define _Py_atomic_store_64bit(ATOMIC_VAL, NEW_VAL, ORDER) ((void)0);  /* no-op elsewhere; note the ';' is part of the expansion */
+#endif
+
+#define _Py_atomic_store_32bit(ATOMIC_VAL, NEW_VAL, ORDER) \
+  switch (ORDER) { /* acquire and release pick the HLE variants; every other order takes the default */ \
+  case _Py_memory_order_acquire: \
+    _InterlockedExchange_HLEAcquire((volatile long*)&((ATOMIC_VAL)->_value), (int)(NEW_VAL)); \
+    break; \
+  case _Py_memory_order_release: \
+    _InterlockedExchange_HLERelease((volatile long*)&((ATOMIC_VAL)->_value), (int)(NEW_VAL)); \
+    break; \
+  default: \
+    _InterlockedExchange((volatile long*)&((ATOMIC_VAL)->_value), (int)(NEW_VAL)); \
+    break; \
+  }  /* expands to a bare switch statement (not an expression); the exchange's return value is ignored */
+
+#if defined(_M_X64)
+/*  This has to be an intptr_t for now.
+    gil_created() uses -1 as a sentinel value, if this returns
+    a uintptr_t it will do an unsigned compare and crash
+*/
+inline intptr_t _Py_atomic_load_64bit_impl(volatile uintptr_t* value, int order) {
+    __int64 old;  /* snapshot of *value that is returned to the caller */
+    switch (order) {
+    case _Py_memory_order_acquire:
+    {
+      do {  /* retry until a compare-exchange of old with old succeeds */
+        old = *value;
+      } while(_InterlockedCompareExchange64_HLEAcquire((volatile __int64*)value, old, old) != old);
+      break;
+    }
+    case _Py_memory_order_release:
+    {
+      do {  /* same retry loop, HLERelease flavour of the intrinsic */
+        old = *value;
+      } while(_InterlockedCompareExchange64_HLERelease((volatile __int64*)value, old, old) != old);
+      break;
+    }
+    case _Py_memory_order_relaxed:
+      old = *value;  /* relaxed: a plain volatile read, no interlocked call */
+      break;
+    default:  /* acq_rel, seq_cst and any other value */
+    {
+      do {
+        old = *value;
+      } while(_InterlockedCompareExchange64((volatile __int64*)value, old, old) != old);
+      break;
+    }
+    }
+    return old;
+}
+
+#define _Py_atomic_load_64bit(ATOMIC_VAL, ORDER) /* forwards &_value to the helper above */ \
+    _Py_atomic_load_64bit_impl((volatile uintptr_t*)&((ATOMIC_VAL)->_value), (ORDER))
+
+#else
+#define _Py_atomic_load_64bit(ATOMIC_VAL, ORDER) ((ATOMIC_VAL)->_value)  /* plain read; ORDER is ignored */
+#endif
+
+inline int _Py_atomic_load_32bit_impl(volatile int* value, int order) {
+    long old;  /* snapshot of *value that is returned to the caller */
+    switch (order) {
+    case _Py_memory_order_acquire:
+    {
+      do {  /* retry until a compare-exchange of old with old succeeds */
+        old = *value;
+      } while(_InterlockedCompareExchange_HLEAcquire((volatile long*)value, old, old) != old);
+      break;
+    }
+    case _Py_memory_order_release:
+    {
+      do {  /* same retry loop, HLERelease flavour of the intrinsic */
+        old = *value;
+      } while(_InterlockedCompareExchange_HLERelease((volatile long*)value, old, old) != old);
+      break;
+    }
+    case _Py_memory_order_relaxed:
+      old = *value;  /* relaxed: a plain volatile read, no interlocked call */
+      break;
+    default:  /* acq_rel, seq_cst and any other value */
+    {
+      do {
+        old = *value;
+      } while(_InterlockedCompareExchange((volatile long*)value, old, old) != old);
+      break;
+    }
+    }
+    return old;
+}
+
+#define _Py_atomic_load_32bit(ATOMIC_VAL, ORDER) /* forwards &_value to the helper above */ \
+    _Py_atomic_load_32bit_impl((volatile int*)&((ATOMIC_VAL)->_value), (ORDER))
+
+#define _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, ORDER) /* dispatch on sizeof(_value) */ \
+  if (sizeof((ATOMIC_VAL)->_value) == 8) { \
+    _Py_atomic_store_64bit((ATOMIC_VAL), (NEW_VAL), (ORDER)) } else { \
+    _Py_atomic_store_32bit((ATOMIC_VAL), (NEW_VAL), (ORDER)) }
+
+#define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER) /* dispatch on sizeof(_value) */ \
+  ( \
+    sizeof((ATOMIC_VAL)->_value) == 8 ? \
+    _Py_atomic_load_64bit((ATOMIC_VAL), (ORDER)) : /* arguments are forwarded parenthesized */ \
+    _Py_atomic_load_32bit((ATOMIC_VAL), (ORDER)) \
+  )
+#elif defined(_M_ARM) || defined(_M_ARM64)
+typedef enum _Py_memory_order {  /* ORDER argument understood by the macros below */
+    _Py_memory_order_relaxed,
+    _Py_memory_order_acquire,
+    _Py_memory_order_release,
+    _Py_memory_order_acq_rel,
+    _Py_memory_order_seq_cst
+} _Py_memory_order;
+
+typedef struct _Py_atomic_address {  /* uintptr_t-sized atomic cell */
+    volatile uintptr_t _value;  /* read/written through the _Py_atomic_* macros */
+} _Py_atomic_address;
+
+typedef struct _Py_atomic_int {  /* int-sized atomic cell */
+    volatile int _value;  /* read/written through the _Py_atomic_* macros */
+} _Py_atomic_int;
+
+
+#if defined(_M_ARM64)  /* real 64-bit store on ARM64; the #else branch is a no-op */
+#define _Py_atomic_store_64bit(ATOMIC_VAL, NEW_VAL, ORDER) \
+    switch (ORDER) { \
+    case _Py_memory_order_acquire: /* exchange through the _acq intrinsic */ \
+      _InterlockedExchange64_acq((__int64 volatile*)&((ATOMIC_VAL)->_value), (__int64)(NEW_VAL)); \
+      break; \
+    case _Py_memory_order_release: /* exchange through the _rel intrinsic */ \
+      _InterlockedExchange64_rel((__int64 volatile*)&((ATOMIC_VAL)->_value), (__int64)(NEW_VAL)); \
+      break; \
+    default: /* every other ordering: plain interlocked exchange */ \
+      _InterlockedExchange64((__int64 volatile*)&((ATOMIC_VAL)->_value), (__int64)(NEW_VAL)); \
+      break; \
+  }
+#else
+#define _Py_atomic_store_64bit(ATOMIC_VAL, NEW_VAL, ORDER) ((void)0);
+#endif
+
+#define _Py_atomic_store_32bit(ATOMIC_VAL, NEW_VAL, ORDER) /* 32-bit store selected by ORDER */ \
+  switch (ORDER) { \
+  case _Py_memory_order_acquire: /* exchange through the _acq intrinsic */ \
+    _InterlockedExchange_acq((volatile long*)&((ATOMIC_VAL)->_value), (int)(NEW_VAL)); \
+    break; \
+  case _Py_memory_order_release: /* exchange through the _rel intrinsic */ \
+    _InterlockedExchange_rel((volatile long*)&((ATOMIC_VAL)->_value), (int)(NEW_VAL)); \
+    break; \
+  default: /* every other ordering: plain interlocked exchange */ \
+    _InterlockedExchange((volatile long*)&((ATOMIC_VAL)->_value), (int)(NEW_VAL)); \
+    break; \
+  }
+
+#if defined(_M_ARM64)
+/*  This has to be an intptr_t for now.
+    gil_created() uses -1 as a sentinel value, if this returns
+    a uintptr_t it will do an unsigned compare and crash
+*/
+inline intptr_t _Py_atomic_load_64bit_impl(volatile uintptr_t* value, int order) {
+    uintptr_t old;  /* snapshot of *value that is returned to the caller */
+    switch (order) {
+    case _Py_memory_order_acquire:
+    {
+      do {  /* retry until a compare-exchange of old with old succeeds */
+        old = *value;
+      } while(_InterlockedCompareExchange64_acq((volatile __int64*)value, old, old) != old);
+      break;
+    }
+    case _Py_memory_order_release:
+    {
+      do {  /* same retry loop, _rel flavour of the intrinsic */
+        old = *value;
+      } while(_InterlockedCompareExchange64_rel((volatile __int64*)value, old, old) != old);
+      break;
+    }
+    case _Py_memory_order_relaxed:
+      old = *value;  /* relaxed: a plain volatile read, no interlocked call */
+      break;
+    default:  /* acq_rel, seq_cst and any other value */
+    {
+      do {  /* pointer is cast to the intrinsic's __int64 type, as in the x64 variant */
+        old = *value;
+      } while(_InterlockedCompareExchange64((volatile __int64*)value, old, old) != old);
+      break;
+    }
+    }
+    return old;
+}
+
+#define _Py_atomic_load_64bit(ATOMIC_VAL, ORDER) \
+    _Py_atomic_load_64bit_impl((volatile uintptr_t*)&((ATOMIC_VAL)->_value), (ORDER))
+
+#else
+#define _Py_atomic_load_64bit(ATOMIC_VAL, ORDER) ((ATOMIC_VAL)->_value)
+#endif
+
+inline int _Py_atomic_load_32bit_impl(volatile int* value, int order) {
+    int old;  /* snapshot of *value that is returned to the caller */
+    switch (order) {
+    case _Py_memory_order_acquire:
+    {
+      do {  /* retry until a compare-exchange of old with old succeeds */
+        old = *value;
+      } while(_InterlockedCompareExchange_acq((volatile long*)value, old, old) != old);
+      break;
+    }
+    case _Py_memory_order_release:
+    {
+      do {  /* same retry loop, _rel flavour of the intrinsic */
+        old = *value;
+      } while(_InterlockedCompareExchange_rel((volatile long*)value, old, old) != old);
+      break;
+    }
+    case _Py_memory_order_relaxed:
+      old = *value;  /* relaxed: a plain volatile read, no interlocked call */
+      break;
+    default:  /* acq_rel, seq_cst and any other value */
+    {
+      do {  /* pointer is cast to the intrinsic's long type, as in the x86 variant */
+        old = *value;
+      } while(_InterlockedCompareExchange((volatile long*)value, old, old) != old);
+      break;
+    }
+    }
+    return old;
+}
+
+#define _Py_atomic_load_32bit(ATOMIC_VAL, ORDER) \
+    _Py_atomic_load_32bit_impl((volatile int*)&((ATOMIC_VAL)->_value), (ORDER))
+
+#define _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, ORDER) /* dispatch on sizeof(_value) */ \
+  if (sizeof((ATOMIC_VAL)->_value) == 8) { \
+    _Py_atomic_store_64bit((ATOMIC_VAL), (NEW_VAL), (ORDER)) } else { \
+    _Py_atomic_store_32bit((ATOMIC_VAL), (NEW_VAL), (ORDER)) }
+
+#define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER) /* dispatch on sizeof(_value) */ \
+  ( \
+    sizeof((ATOMIC_VAL)->_value) == 8 ? \
+    _Py_atomic_load_64bit((ATOMIC_VAL), (ORDER)) : /* 8-byte _value */ \
+    _Py_atomic_load_32bit((ATOMIC_VAL), (ORDER)) /* any other size */ \
+  )
+#endif
+#else  /* !gcc x86  !_msc_ver */
+typedef enum _Py_memory_order {  /* accepted as ORDER by the fallback macros, which ignore it */
+    _Py_memory_order_relaxed,
+    _Py_memory_order_acquire,
+    _Py_memory_order_release,
+    _Py_memory_order_acq_rel,
+    _Py_memory_order_seq_cst
+} _Py_memory_order;
+
+typedef struct _Py_atomic_address {  /* uintptr_t-sized cell */
+    uintptr_t _value;  /* NOTE(review): not declared volatile in this fallback */
+} _Py_atomic_address;
+
+typedef struct _Py_atomic_int {  /* int-sized cell */
+    int _value;  /* NOTE(review): not declared volatile in this fallback */
+} _Py_atomic_int;
+/* Fall back to other compilers and processors by assuming that simple
+   accesses are atomic.  This is false, so people should port this.
+   ORDER is ignored here and both fences expand to nothing. */
+#define _Py_atomic_signal_fence(/*memory_order*/ ORDER) ((void)0)
+#define _Py_atomic_thread_fence(/*memory_order*/ ORDER) ((void)0)
+#define _Py_atomic_store_explicit(ATOMIC_VAL, NEW_VAL, ORDER) \
+    ((ATOMIC_VAL)->_value = (NEW_VAL))
+#define _Py_atomic_load_explicit(ATOMIC_VAL, ORDER) \
+    ((ATOMIC_VAL)->_value)
+#endif
+
+/* Standardized shortcuts: store/load with _Py_memory_order_seq_cst. */
+#define _Py_atomic_store(ATOMIC_VAL, NEW_VAL) \
+    _Py_atomic_store_explicit((ATOMIC_VAL), (NEW_VAL), _Py_memory_order_seq_cst)
+#define _Py_atomic_load(ATOMIC_VAL) \
+    _Py_atomic_load_explicit((ATOMIC_VAL), _Py_memory_order_seq_cst)
+
+/* Python-local extensions: store/load with _Py_memory_order_relaxed. */
+
+#define _Py_atomic_store_relaxed(ATOMIC_VAL, NEW_VAL) \
+    _Py_atomic_store_explicit((ATOMIC_VAL), (NEW_VAL), _Py_memory_order_relaxed)
+#define _Py_atomic_load_relaxed(ATOMIC_VAL) \
+    _Py_atomic_load_explicit((ATOMIC_VAL), _Py_memory_order_relaxed)
+
+#ifdef __cplusplus
+}
+#endif
+#endif  /* Py_ATOMIC_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_atomic_funcs.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_atomic_funcs.h
new file mode 100644
index 000000000000..a708789cea73
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_atomic_funcs.h
@@ -0,0 +1,94 @@
+/* Atomic functions: similar to pycore_atomic.h, but don't need
+   to declare variables as atomic.
+
+   Py_ssize_t type:
+
+   * value = _Py_atomic_size_get(&var)
+   * _Py_atomic_size_set(&var, value)
+
+   Use sequentially-consistent ordering (__ATOMIC_SEQ_CST memory order):
+   enforce total ordering with all other atomic functions.
+*/
+#ifndef Py_ATOMIC_FUNC_H
+#define Py_ATOMIC_FUNC_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#if defined(_MSC_VER)
+#  include <intrin.h>             // _InterlockedExchange()
+#endif
+
+
+// Use builtin atomic operations in GCC >= 4.7 and clang
+#ifdef HAVE_BUILTIN_ATOMIC
+
+static inline Py_ssize_t _Py_atomic_size_get(Py_ssize_t *var)
+{   // Atomic read of *var through the compiler builtin, __ATOMIC_SEQ_CST order.
+    return __atomic_load_n(var, __ATOMIC_SEQ_CST);
+}
+
+static inline void _Py_atomic_size_set(Py_ssize_t *var, Py_ssize_t value)
+{   // Atomic write of value to *var through the builtin, __ATOMIC_SEQ_CST order.
+    __atomic_store_n(var, value, __ATOMIC_SEQ_CST);
+}
+
+#elif defined(_MSC_VER)
+
+static inline Py_ssize_t _Py_atomic_size_get(Py_ssize_t *var)
+{   // Read *var using the MSVC interlocked compare-exchange intrinsics.
+#if SIZEOF_VOID_P == 8
+    Py_BUILD_ASSERT(sizeof(__int64) == sizeof(*var));  // the cast below needs equal sizes
+    volatile __int64 *volatile_var = (volatile __int64 *)var;
+    __int64 old;
+    do {  // retry until a compare-exchange of old with old succeeds
+        old = *volatile_var;
+    } while(_InterlockedCompareExchange64(volatile_var, old, old) != old);
+#else
+    Py_BUILD_ASSERT(sizeof(long) == sizeof(*var));  // the cast below needs equal sizes
+    volatile long *volatile_var = (volatile long *)var;
+    long old;
+    do {  // retry until a compare-exchange of old with old succeeds
+        old = *volatile_var;
+    } while(_InterlockedCompareExchange(volatile_var, old, old) != old);
+#endif
+    return old;
+}
+
+static inline void _Py_atomic_size_set(Py_ssize_t *var, Py_ssize_t value)
+{   // Write value to *var using the MSVC interlocked exchange intrinsics.
+#if SIZEOF_VOID_P == 8
+    Py_BUILD_ASSERT(sizeof(__int64) == sizeof(*var));  // the cast below needs equal sizes
+    volatile __int64 *volatile_var = (volatile __int64 *)var;
+    _InterlockedExchange64(volatile_var, value);  // the returned previous value is unused
+#else
+    Py_BUILD_ASSERT(sizeof(long) == sizeof(*var));  // the cast below needs equal sizes
+    volatile long *volatile_var = (volatile long *)var;
+    _InterlockedExchange(volatile_var, value);  // the returned previous value is unused
+#endif
+}
+
+#else
+// Fallback implementation using volatile
+
+static inline Py_ssize_t _Py_atomic_size_get(Py_ssize_t *var)
+{   // Fallback: read *var through a volatile-qualified pointer.
+    volatile Py_ssize_t *volatile_var = (volatile Py_ssize_t *)var;
+    return *volatile_var;
+}
+
+static inline void _Py_atomic_size_set(Py_ssize_t *var, Py_ssize_t value)
+{   // Fallback: write value to *var through a volatile-qualified pointer.
+    volatile Py_ssize_t *volatile_var = (volatile Py_ssize_t *)var;
+    *volatile_var = value;
+}
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif  /* Py_ATOMIC_FUNC_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_bitutils.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_bitutils.h
new file mode 100644
index 000000000000..e6bf61ef425b
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_bitutils.h
@@ -0,0 +1,186 @@
+/* Bit and bytes utilities.
+
+   Bytes swap functions, reverse order of bytes:
+
+   - _Py_bswap16(uint16_t)
+   - _Py_bswap32(uint32_t)
+   - _Py_bswap64(uint64_t)
+*/
+
+#ifndef Py_INTERNAL_BITUTILS_H
+#define Py_INTERNAL_BITUTILS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#if defined(__GNUC__) \
+      && ((__GNUC__ >= 5) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8))
+   /* __builtin_bswap16() is available since GCC 4.8,
+      __builtin_bswap32() is available since GCC 4.3,
+      __builtin_bswap64() is available since GCC 4.3. */
+#  define _PY_HAVE_BUILTIN_BSWAP
+#endif
+
+#ifdef _MSC_VER
+   /* Get _byteswap_ushort(), _byteswap_ulong(), _byteswap_uint64() */
+#  include <intrin.h>
+#endif
+
+static inline uint16_t
+_Py_bswap16(uint16_t word)
+{
+#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap16)
+    return __builtin_bswap16(word);
+#elif defined(_MSC_VER)
+    Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned short));
+    return _byteswap_ushort(word);
+#else
+    // Portable fallback: exchange the low and the high byte by hand.
+    const uint16_t low_to_high = (uint16_t)((word & UINT16_C(0x00FF)) << 8);
+    return (uint16_t)(low_to_high | ((word & UINT16_C(0xFF00)) >> 8));
+#endif
+}
+
+static inline uint32_t
+_Py_bswap32(uint32_t word)
+{
+#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap32)
+    return __builtin_bswap32(word);
+#elif defined(_MSC_VER)
+    Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned long));
+    return _byteswap_ulong(word);
+#else
+    // Portable fallback: swap the outer byte pair, then the inner byte pair.
+    const uint32_t outer = ((word & UINT32_C(0x000000FF)) << 24)
+                         | ((word & UINT32_C(0xFF000000)) >> 24);
+    return outer | ((word & UINT32_C(0x0000FF00)) << 8)
+                 | ((word & UINT32_C(0x00FF0000)) >> 8);
+#endif
+}
+
+static inline uint64_t
+_Py_bswap64(uint64_t word)
+{
+#if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap64)
+    return __builtin_bswap64(word);
+#elif defined(_MSC_VER)
+    return _byteswap_uint64(word);
+#else
+    // Portable implementation which doesn't rely on circular bit shift:
+    // byte-swap each 32-bit half with _Py_bswap32(), then let the two
+    // swapped halves trade places.
+    const uint32_t low_half = (uint32_t)(word & UINT64_C(0x00000000FFFFFFFF));
+    const uint32_t high_half = (uint32_t)(word >> 32);
+    const uint64_t new_high = (uint64_t)_Py_bswap32(low_half) << 32;
+    const uint64_t new_low = (uint64_t)_Py_bswap32(high_half);
+
+    return new_high | new_low;
+#endif
+}
+
+
+// Population count: count the number of 1's in 'x'
+// (number of bits set to 1), also known as the hamming weight.
+//
+// Implementation note. CPUID is not used, to test if x86 POPCNT instruction
+// can be used, to keep the implementation simple. For example, Visual Studio
+// __popcnt() is not used for this reason. The clang and GCC builtin function can
+// use the x86 POPCNT instruction if the target architecture has SSE4a or
+// newer.
+static inline int
+_Py_popcount32(uint32_t x)
+{
+#if (defined(__clang__) || defined(__GNUC__))
+
+#if SIZEOF_INT >= 4  // unsigned int can hold x: use the int-sized builtin
+    Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned int));
+    return __builtin_popcount(x);
+#else
+    // The C standard guarantees that unsigned long will always be big enough
+    // to hold a uint32_t value without losing information.
+    Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned long));
+    return __builtin_popcountl(x);
+#endif
+
+#else
+    // 32-bit SWAR (SIMD Within A Register) popcount
+
+    // Binary: 0 1 0 1 ...
+    const uint32_t M1 = 0x55555555;
+    // Binary: 00 11 00 11 ...
+    const uint32_t M2 = 0x33333333;
+    // Binary: 0000 1111 0000 1111 ...
+    const uint32_t M4 = 0x0F0F0F0F;
+
+    // Put count of each 2 bits into those 2 bits
+    x = x - ((x >> 1) & M1);
+    // Put count of each 4 bits into those 4 bits
+    x = (x & M2) + ((x >> 2) & M2);
+    // Put count of each 8 bits into those 8 bits
+    x = (x + (x >> 4)) & M4;
+    // Sum of the 4 byte counts.
+    // Take care when considering changes to the next line. Portability and
+    // correctness are delicate here, thanks to C's "integer promotions" (C99
+    // §6.3.1.1p2). On machines where the `int` type has width greater than 32
+    // bits, `x` will be promoted to an `int`, and following C's "usual
+    // arithmetic conversions" (C99 §6.3.1.8), the multiplication will be
+    // performed as a multiplication of two `unsigned int` operands. In this
+    // case it's critical that we cast back to `uint32_t` in order to keep only
+    // the least significant 32 bits. On machines where the `int` type has
+    // width no greater than 32, the multiplication is of two 32-bit unsigned
+    // integer types, and the (uint32_t) cast is a no-op. In both cases, we
+    // avoid the risk of undefined behaviour due to overflow of a
+    // multiplication of signed integer types.
+    return (uint32_t)(x * 0x01010101U) >> 24;  // the sum ends up in the top byte
+#endif
+}
+
+
+// Return the index of the most significant 1 bit in 'x'. This is the smallest
+// integer k such that x < 2**k. Equivalent to floor(log2(x)) + 1 for x != 0.
+static inline int
+_Py_bit_length(unsigned long x)
+{
+#if (defined(__clang__) || defined(__GNUC__))
+    // __builtin_clzl() (available since GCC 3.4) has undefined behavior for
+    // x == 0, so that input is answered before the builtin is reached.
+    if (x == 0) {
+        return 0;
+    }
+    const int total_bits = (int)sizeof(unsigned long) * 8;
+    const int leading_zeros = __builtin_clzl(x);
+    return total_bits - leading_zeros;
+#elif defined(_MSC_VER)
+    // _BitScanReverse() is documented to search 32 bits.
+    Py_BUILD_ASSERT(sizeof(unsigned long) <= 4);
+    unsigned long top_index;
+    // A zero return value is handled as "no bit set": the length is 0.
+    // Otherwise top_index holds the position that was found.
+    if (!_BitScanReverse(&top_index, x)) {
+        return 0;
+    }
+    return (int)top_index + 1;
+#else
+    // Bit length of every value below 32.
+    const int small_lengths[32] = {
+        0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+        5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5
+    };
+    // Strip 6 bits per round until the remainder fits the table.
+    int length = 0;
+    for (; x >= 32; x >>= 6) {
+        length += 6;
+    }
+    return length + small_lengths[x];
+#endif
+}
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_BITUTILS_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_blocks_output_buffer.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_blocks_output_buffer.h
new file mode 100644
index 000000000000..28cf6fba4eeb
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_blocks_output_buffer.h
@@ -0,0 +1,317 @@
+/*
+   _BlocksOutputBuffer is used to maintain an output buffer
+   that has unpredictable size. Suitable for compression/decompression
+   API (bz2/lzma/zlib) that has stream->next_out and stream->avail_out:
+
+        stream->next_out:  point to the next output position.
+        stream->avail_out: the number of available bytes left in the buffer.
+
+   It maintains a list of bytes objects, so there is no overhead of resizing
+   the buffer.
+
+   Usage:
+
+   1, Initialize the struct instance like this:
+        _BlocksOutputBuffer buffer = {.list = NULL};
+      Set .list to NULL for _BlocksOutputBuffer_OnError()
+
+   2, Initialize the buffer using one of these functions:
+        _BlocksOutputBuffer_InitAndGrow()
+        _BlocksOutputBuffer_InitWithSize()
+
+   3, If (avail_out == 0), grow the buffer:
+        _BlocksOutputBuffer_Grow()
+
+   4, Get the current outputted data size:
+        _BlocksOutputBuffer_GetDataSize()
+
+   5, Finish the buffer, and return a bytes object:
+        _BlocksOutputBuffer_Finish()
+
+   6, Clean up the buffer when an error occurred:
+        _BlocksOutputBuffer_OnError()
+*/
+
+#ifndef Py_INTERNAL_BLOCKS_OUTPUT_BUFFER_H
+#define Py_INTERNAL_BLOCKS_OUTPUT_BUFFER_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "Python.h"
+
+typedef struct {
+    // List of bytes objects, one per allocated block (NULL before init).
+    PyObject *list;
+    // Total size of all blocks allocated so far.
+    Py_ssize_t allocated;
+    // Max length of the buffer, negative number means unlimited length.
+    Py_ssize_t max_length;
+} _BlocksOutputBuffer;
+
+static const char unable_allocate_msg[] = "Unable to allocate output buffer.";
+
+/* In a 32-bit build, the max block size should be <= INT32_MAX. */
+#define OUTPUT_BUFFER_MAX_BLOCK_SIZE (256*1024*1024)
+
+/* Block size sequence */
+#define KB (1024)
+#define MB (1024*1024)
+static const Py_ssize_t BUFFER_BLOCK_SIZE[] =
+    { 32*KB, 64*KB, 256*KB, 1*MB, 4*MB, 8*MB, 16*MB, 16*MB,
+      32*MB, 32*MB, 32*MB, 32*MB, 64*MB, 64*MB, 128*MB, 128*MB,
+      OUTPUT_BUFFER_MAX_BLOCK_SIZE };
+#undef KB
+#undef MB
+
+/* According to the block sizes defined by BUFFER_BLOCK_SIZE, the whole
+   allocated size growth step is:
+    1   32 KB       +32 KB
+    2   96 KB       +64 KB
+    3   352 KB      +256 KB
+    4   1.34 MB     +1 MB
+    5   5.34 MB     +4 MB
+    6   13.34 MB    +8 MB
+    7   29.34 MB    +16 MB
+    8   45.34 MB    +16 MB
+    9   77.34 MB    +32 MB
+    10  109.34 MB   +32 MB
+    11  141.34 MB   +32 MB
+    12  173.34 MB   +32 MB
+    13  237.34 MB   +64 MB
+    14  301.34 MB   +64 MB
+    15  429.34 MB   +128 MB
+    16  557.34 MB   +128 MB
+    17  813.34 MB   +256 MB
+    18  1069.34 MB  +256 MB
+    19  1325.34 MB  +256 MB
+    20  1581.34 MB  +256 MB
+    21  1837.34 MB  +256 MB
+    22  2093.34 MB  +256 MB
+    ...
+*/
+
+/* Initialize the buffer, and grow the buffer.
+
+   max_length: Max length of the buffer, -1 for unlimited length.
+
+   On success, return allocated size (>=0)
+   On failure, return -1
+*/
+static inline Py_ssize_t
+_BlocksOutputBuffer_InitAndGrow(_BlocksOutputBuffer *buffer,
+                                const Py_ssize_t max_length,
+                                void **next_out)
+{
+    // Precondition: the caller left .list set to NULL.
+    assert(buffer->list == NULL);
+
+    // Size of the first block: the first entry of BUFFER_BLOCK_SIZE, unless
+    // a non-negative max_length smaller than that entry caps it.
+    Py_ssize_t first_size = BUFFER_BLOCK_SIZE[0];
+    if (max_length >= 0 && max_length < first_size) {
+        first_size = max_length;
+    }
+
+    // Allocate the first block.
+    PyObject *first_block = PyBytes_FromStringAndSize(NULL, first_size);
+    if (first_block == NULL) {
+        return -1;
+    }
+
+    // Create the one-element list and store the block in slot 0; on
+    // failure the freshly allocated block is released again.
+    buffer->list = PyList_New(1);
+    if (buffer->list == NULL) {
+        Py_DECREF(first_block);
+        return -1;
+    }
+    PyList_SET_ITEM(buffer->list, 0, first_block);
+
+    // Record the bookkeeping state.
+    buffer->max_length = max_length;
+    buffer->allocated = first_size;
+
+    // Hand the start of the block's storage back to the caller and
+    // report how many bytes are available there.
+    *next_out = PyBytes_AS_STRING(first_block);
+    return first_size;
+}
+
+/* Initialize the buffer, with an initial size.
+
+   Check block size limit in the outer wrapper function. For example, some libs
+   accept UINT32_MAX as the maximum block size, then init_size should be <= it.
+
+   On success, return allocated size (>=0)
+   On failure, return -1
+*/
+static inline Py_ssize_t
+_BlocksOutputBuffer_InitWithSize(_BlocksOutputBuffer *buffer,
+                                 const Py_ssize_t init_size,
+                                 void **next_out)
+{
+    // Precondition: the caller left .list set to NULL.
+    assert(buffer->list == NULL);
+
+    // Allocate the single initial block with exactly the requested size.
+    PyObject *first_block = PyBytes_FromStringAndSize(NULL, init_size);
+    if (first_block == NULL) {
+        PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
+        return -1;
+    }
+
+    // Create the one-element list and store the block in slot 0; on
+    // failure the freshly allocated block is released again.
+    buffer->list = PyList_New(1);
+    if (buffer->list == NULL) {
+        Py_DECREF(first_block);
+        return -1;
+    }
+    PyList_SET_ITEM(buffer->list, 0, first_block);
+
+    // Record the bookkeeping state; -1 marks the length as unlimited.
+    buffer->max_length = -1;
+    buffer->allocated = init_size;
+
+    // Hand the start of the block's storage back to the caller.
+    *next_out = PyBytes_AS_STRING(first_block);
+    return init_size;
+}
+
+/* Grow the buffer. The avail_out must be 0, please check it before calling.
+
+   On success, return allocated size (>=0)
+   On failure, return -1
+*/
+static inline Py_ssize_t
+_BlocksOutputBuffer_Grow(_BlocksOutputBuffer *buffer,
+                         void **next_out,
+                         const Py_ssize_t avail_out)
+{
+    const Py_ssize_t n_blocks = Py_SIZE(buffer->list);
+    const Py_ssize_t n_sizes = (Py_ssize_t) Py_ARRAY_LENGTH(BUFFER_BLOCK_SIZE);
+
+    // Growing while output space is still available would leave a gap in
+    // the data, so that is reported as an error.
+    if (avail_out != 0) {
+        PyErr_SetString(PyExc_SystemError,
+                        "avail_out is non-zero in _BlocksOutputBuffer_Grow().");
+        return -1;
+    }
+
+    // Pick the next entry of the block size sequence; once the sequence is
+    // exhausted, keep using its last entry.
+    Py_ssize_t grow_by =
+        BUFFER_BLOCK_SIZE[n_blocks < n_sizes ? n_blocks : n_sizes - 1];
+
+    // With a length limit, never allocate past max_length.
+    if (buffer->max_length >= 0) {
+        // A remaining capacity of 0 means the buffer should not be grown.
+        const Py_ssize_t remaining = buffer->max_length - buffer->allocated;
+        assert(remaining > 0);
+
+        if (remaining < grow_by) {
+            grow_by = remaining;
+        }
+    }
+
+    // Refuse to grow if buffer->allocated would overflow.
+    if (grow_by > PY_SSIZE_T_MAX - buffer->allocated) {
+        PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
+        return -1;
+    }
+
+    // Allocate the new block.
+    PyObject *new_block = PyBytes_FromStringAndSize(NULL, grow_by);
+    if (new_block == NULL) {
+        PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
+        return -1;
+    }
+
+    // Append it to the list; the local reference is dropped on both the
+    // success and the failure path.
+    const int append_status = PyList_Append(buffer->list, new_block);
+    Py_DECREF(new_block);
+    if (append_status < 0) {
+        return -1;
+    }
+
+    // Account for the new block and expose its storage to the caller.
+    buffer->allocated += grow_by;
+    *next_out = PyBytes_AS_STRING(new_block);
+
+    return grow_by;
+}
+
+/* Return the current outputted data size. */
+static inline Py_ssize_t
+_BlocksOutputBuffer_GetDataSize(_BlocksOutputBuffer *buffer,
+                                const Py_ssize_t avail_out)
+{   // Data size = total allocated size minus the still-unused avail_out bytes.
+    return buffer->allocated - avail_out;
+}
+
+/* Finish the buffer.
+
+   Return a bytes object on success
+   Return NULL on failure
+*/
+static inline PyObject *
+_BlocksOutputBuffer_Finish(_BlocksOutputBuffer *buffer,
+                           const Py_ssize_t avail_out)
+{
+    const Py_ssize_t n_blocks = Py_SIZE(buffer->list);
+
+    // Fast path: all the data lives in the first block, either because it is
+    // the only block and is full, or because the second block is unused.
+    int first_block_only = 0;
+    if (n_blocks == 1) {
+        first_block_only = (avail_out == 0);
+    }
+    else if (n_blocks == 2) {
+        PyObject *second = PyList_GET_ITEM(buffer->list, 1);
+        first_block_only = (Py_SIZE(second) == avail_out);
+    }
+    if (first_block_only) {
+        PyObject *first = PyList_GET_ITEM(buffer->list, 0);
+        Py_INCREF(first);
+        Py_CLEAR(buffer->list);
+        return first;
+    }
+
+    // Allocate the final bytes object with the exact data size.
+    PyObject *joined = PyBytes_FromStringAndSize(NULL,
+                                                 buffer->allocated - avail_out);
+    if (joined == NULL) {
+        PyErr_SetString(PyExc_MemoryError, unable_allocate_msg);
+        return NULL;
+    }
+
+    // Copy the blocks back to back; the last block only contributes the
+    // bytes that were actually written (its size minus avail_out).
+    char *dest = PyBytes_AS_STRING(joined);
+    for (Py_ssize_t idx = 0; idx < n_blocks; idx++) {
+        PyObject *chunk = PyList_GET_ITEM(buffer->list, idx);
+        const Py_ssize_t unused = (idx == n_blocks - 1) ? avail_out : 0;
+        const Py_ssize_t n_bytes = Py_SIZE(chunk) - unused;
+        memcpy(dest, PyBytes_AS_STRING(chunk), n_bytes);
+        dest += n_bytes;
+    }
+    assert(n_blocks > 0 || Py_SIZE(joined) == 0);  // no blocks => empty result
+
+    Py_CLEAR(buffer->list);
+    return joined;
+}
+
+/* Clean up the buffer when an error occurred. */
+static inline void
+_BlocksOutputBuffer_OnError(_BlocksOutputBuffer *buffer)
+{   // Py_CLEAR() on .list: this is why .list must start out as NULL.
+    Py_CLEAR(buffer->list);
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* Py_INTERNAL_BLOCKS_OUTPUT_BUFFER_H */
\ No newline at end of file
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_bytes_methods.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_bytes_methods.h
new file mode 100644
index 000000000000..11e8ab20e913
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_bytes_methods.h
@@ -0,0 +1,73 @@
+#ifndef Py_LIMITED_API
+#ifndef Py_BYTES_CTYPE_H
+#define Py_BYTES_CTYPE_H
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+/*
+ * The internal implementation behind PyBytes (bytes) and PyByteArray (bytearray)
+ * methods of the given names, they operate on ASCII byte strings.
+ */
+extern PyObject* _Py_bytes_isspace(const char *cptr, Py_ssize_t len);
+extern PyObject* _Py_bytes_isalpha(const char *cptr, Py_ssize_t len);
+extern PyObject* _Py_bytes_isalnum(const char *cptr, Py_ssize_t len);
+extern PyObject* _Py_bytes_isascii(const char *cptr, Py_ssize_t len);
+extern PyObject* _Py_bytes_isdigit(const char *cptr, Py_ssize_t len);
+extern PyObject* _Py_bytes_islower(const char *cptr, Py_ssize_t len);
+extern PyObject* _Py_bytes_isupper(const char *cptr, Py_ssize_t len);
+extern PyObject* _Py_bytes_istitle(const char *cptr, Py_ssize_t len);
+
+/* These store their len sized answer in the given preallocated *result arg. */
+extern void _Py_bytes_lower(char *result, const char *cptr, Py_ssize_t len);
+extern void _Py_bytes_upper(char *result, const char *cptr, Py_ssize_t len);
+extern void _Py_bytes_title(char *result, const char *s, Py_ssize_t len);
+extern void _Py_bytes_capitalize(char *result, const char *s, Py_ssize_t len);
+extern void _Py_bytes_swapcase(char *result, const char *s, Py_ssize_t len);
+
+extern PyObject *_Py_bytes_find(const char *str, Py_ssize_t len, PyObject *args);
+extern PyObject *_Py_bytes_index(const char *str, Py_ssize_t len, PyObject *args);
+extern PyObject *_Py_bytes_rfind(const char *str, Py_ssize_t len, PyObject *args);
+extern PyObject *_Py_bytes_rindex(const char *str, Py_ssize_t len, PyObject *args);
+extern PyObject *_Py_bytes_count(const char *str, Py_ssize_t len, PyObject *args);
+extern int _Py_bytes_contains(const char *str, Py_ssize_t len, PyObject *arg);
+extern PyObject *_Py_bytes_startswith(const char *str, Py_ssize_t len, PyObject *args);
+extern PyObject *_Py_bytes_endswith(const char *str, Py_ssize_t len, PyObject *args);
+
+/* The maketrans() static method. */
+extern PyObject* _Py_bytes_maketrans(Py_buffer *frm, Py_buffer *to);
+
+/* Shared __doc__ strings. */
+extern const char _Py_isspace__doc__[];
+extern const char _Py_isalpha__doc__[];
+extern const char _Py_isalnum__doc__[];
+extern const char _Py_isascii__doc__[];
+extern const char _Py_isdigit__doc__[];
+extern const char _Py_islower__doc__[];
+extern const char _Py_isupper__doc__[];
+extern const char _Py_istitle__doc__[];
+extern const char _Py_lower__doc__[];
+extern const char _Py_upper__doc__[];
+extern const char _Py_title__doc__[];
+extern const char _Py_capitalize__doc__[];
+extern const char _Py_swapcase__doc__[];
+extern const char _Py_count__doc__[];
+extern const char _Py_find__doc__[];
+extern const char _Py_index__doc__[];
+extern const char _Py_rfind__doc__[];
+extern const char _Py_rindex__doc__[];
+extern const char _Py_startswith__doc__[];
+extern const char _Py_endswith__doc__[];
+extern const char _Py_maketrans__doc__[];
+extern const char _Py_expandtabs__doc__[];
+extern const char _Py_ljust__doc__[];
+extern const char _Py_rjust__doc__[];
+extern const char _Py_center__doc__[];
+extern const char _Py_zfill__doc__[];
+
+/* this is needed because some docs are shared from the .o, not static */
+#define PyDoc_STRVAR_shared(name,str) const char name[] = PyDoc_STR(str)
+
+#endif /* !Py_BYTES_CTYPE_H */
+#endif /* !Py_LIMITED_API */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_bytesobject.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_bytesobject.h
new file mode 100644
index 000000000000..d36fa9569d64
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_bytesobject.h
@@ -0,0 +1,47 @@
+#ifndef Py_INTERNAL_BYTESOBJECT_H
+#define Py_INTERNAL_BYTESOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+/* Substring Search.
+
+   Returns the index of the first occurrence of
+   a substring ("needle") in a larger text ("haystack").
+   If the needle is not found, return -1.
+   If the needle is found, add offset to the index.
+*/
+
+PyAPI_FUNC(Py_ssize_t)
+_PyBytes_Find(const char *haystack, Py_ssize_t len_haystack,
+              const char *needle, Py_ssize_t len_needle,
+              Py_ssize_t offset);
+
+/* Same as above, but search right-to-left */
+PyAPI_FUNC(Py_ssize_t)
+_PyBytes_ReverseFind(const char *haystack, Py_ssize_t len_haystack,
+                     const char *needle, Py_ssize_t len_needle,
+                     Py_ssize_t offset);
+
+
+/** Helper function to implement the repeat and inplace repeat methods on a buffer
+ *
+ * len_dest is assumed to be an integer multiple of len_src.
+ * If src equals dest, then assume the operation is inplace.
+ *
+ * This method repeatedly doubles the number of bytes copied to reduce
+ * the number of invocations of memcpy.
+ */
+PyAPI_FUNC(void)
+_PyBytes_Repeat(char* dest, Py_ssize_t len_dest,
+    const char* src, Py_ssize_t len_src);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_BYTESOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_call.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_call.h
new file mode 100644
index 000000000000..5d9342b562b0
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_call.h
@@ -0,0 +1,133 @@
+#ifndef Py_INTERNAL_CALL_H
+#define Py_INTERNAL_CALL_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_pystate.h"       // _PyThreadState_GET()
+
+PyAPI_FUNC(PyObject *) _PyObject_Call_Prepend(
+    PyThreadState *tstate,
+    PyObject *callable,
+    PyObject *obj,
+    PyObject *args,
+    PyObject *kwargs);
+
+PyAPI_FUNC(PyObject *) _PyObject_FastCallDictTstate(
+    PyThreadState *tstate,
+    PyObject *callable,
+    PyObject *const *args,
+    size_t nargsf,
+    PyObject *kwargs);
+
+PyAPI_FUNC(PyObject *) _PyObject_Call(
+    PyThreadState *tstate,
+    PyObject *callable,
+    PyObject *args,
+    PyObject *kwargs);
+
+extern PyObject * _PyObject_CallMethodFormat(
+        PyThreadState *tstate, PyObject *callable, const char *format, ...);
+
+
+// Inline counterpart of the public PyVectorcall_Function(); NULL if unsupported.
+static inline vectorcallfunc
+_PyVectorcall_FunctionInline(PyObject *callable)
+{
+    assert(callable != NULL);
+    PyTypeObject *type = Py_TYPE(callable);
+    if (!PyType_HasFeature(type, Py_TPFLAGS_HAVE_VECTORCALL)) {
+        // The type does not advertise vectorcall support.
+        return NULL;
+    }
+    assert(PyCallable_Check(callable));
+
+    // The function pointer lives inside the instance at tp_vectorcall_offset.
+    const Py_ssize_t slot_offset = type->tp_vectorcall_offset;
+    assert(slot_offset > 0);
+    vectorcallfunc slot_value;
+    memcpy(&slot_value, (char *) callable + slot_offset, sizeof(slot_value));
+    return slot_value;
+}
+
+
+/* Call the callable object 'callable' with the "vectorcall" calling
+   convention.
+
+   args is a C array for positional arguments.
+
+   nargsf is the number of positional arguments plus optionally the flag
+   PY_VECTORCALL_ARGUMENTS_OFFSET which means that the caller is allowed to
+   modify args[-1].
+
+   kwnames is a tuple of keyword names. The values of the keyword arguments
+   are stored in "args" after the positional arguments (note that the number
+   of keyword arguments does not change nargsf). kwnames can also be NULL if
+   there are no keyword arguments.
+
+   keywords must only contain strings and all keys must be unique.
+
+   Return the result on success. Raise an exception and return NULL on
+   error. */
+static inline PyObject *
+_PyObject_VectorcallTstate(PyThreadState *tstate, PyObject *callable,
+                           PyObject *const *args, size_t nargsf,
+                           PyObject *kwnames)
+{
+    assert(kwnames == NULL || PyTuple_Check(kwnames));
+    assert(args != NULL || PyVectorcall_NARGS(nargsf) == 0);
+
+    vectorcallfunc vc = _PyVectorcall_FunctionInline(callable);
+    if (vc != NULL) {
+        // Vectorcall slot present: call it, then pass the result to the checker.
+        PyObject *result = vc(callable, args, nargsf, kwnames);
+        return _Py_CheckFunctionResult(tstate, callable, result, NULL);
+    }
+
+    // No vectorcall slot: fall back to _PyObject_MakeTpCall() with the plain count.
+    return _PyObject_MakeTpCall(tstate, callable, args,
+                                PyVectorcall_NARGS(nargsf), kwnames);
+}
+
+
+static inline PyObject *
+_PyObject_CallNoArgsTstate(PyThreadState *tstate, PyObject *func) {  // zero-argument call
+    return _PyObject_VectorcallTstate(tstate, func, NULL, 0, NULL);  // args=NULL, nargsf=0, kwnames=NULL
+}
+
+
+// Private inline twin of the public PyObject_CallNoArgs(): records the call
+// statistic, then calls 'func' with no arguments via _PyThreadState_GET().
+static inline PyObject *
+_PyObject_CallNoArgs(PyObject *func) {
+    EVAL_CALL_STAT_INC_IF_FUNCTION(EVAL_CALL_API, func);
+    return _PyObject_CallNoArgsTstate(_PyThreadState_GET(), func);
+}
+
+
+static inline PyObject *  /* positional-only call of 'func' with args[0..nargs-1] */
+_PyObject_FastCallTstate(PyThreadState *tstate, PyObject *func, PyObject *const *args, Py_ssize_t nargs)
+{
+    EVAL_CALL_STAT_INC_IF_FUNCTION(EVAL_CALL_API, func);
+    return _PyObject_VectorcallTstate(tstate, func, args, (size_t)nargs, NULL);  /* nargs passed as nargsf, kwnames=NULL */
+}
+
+PyObject *const *
+_PyStack_UnpackDict(PyThreadState *tstate,
+    PyObject *const *args, Py_ssize_t nargs,
+    PyObject *kwargs, PyObject **p_kwnames);
+
+void
+_PyStack_UnpackDict_Free(PyObject *const *stack, Py_ssize_t nargs,
+    PyObject *kwnames);
+
+void _PyStack_UnpackDict_FreeNoDecRef(PyObject *const *stack, PyObject *kwnames);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_CALL_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_ceval.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_ceval.h
new file mode 100644
index 000000000000..921b1cfcd3ac
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_ceval.h
@@ -0,0 +1,164 @@
+#ifndef Py_INTERNAL_CEVAL_H
+#define Py_INTERNAL_CEVAL_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+/* Forward declarations */
+struct pyruntimestate;
+struct _ceval_runtime_state;
+
+#ifndef Py_DEFAULT_RECURSION_LIMIT
+#  define Py_DEFAULT_RECURSION_LIMIT 1000
+#endif
+
+#include "pycore_interp.h"        // PyInterpreterState.eval_frame
+#include "pycore_pystate.h"       // _PyThreadState_GET()
+
+
+extern void _Py_FinishPendingCalls(PyThreadState *tstate);
+extern void _PyEval_InitState(PyInterpreterState *, PyThread_type_lock);
+extern void _PyEval_FiniState(struct _ceval_state *ceval);
+PyAPI_FUNC(void) _PyEval_SignalReceived(PyInterpreterState *interp);
+PyAPI_FUNC(int) _PyEval_AddPendingCall(
+    PyInterpreterState *interp,
+    int (*func)(void *),
+    void *arg,
+    int mainthreadonly);
+PyAPI_FUNC(void) _PyEval_SignalAsyncExc(PyInterpreterState *interp);
+#ifdef HAVE_FORK
+extern PyStatus _PyEval_ReInitThreads(PyThreadState *tstate);
+#endif
+
+// Used by sys.call_tracing()
+extern PyObject* _PyEval_CallTracing(PyObject *func, PyObject *args);
+
+// Used by sys.get_asyncgen_hooks()
+extern PyObject* _PyEval_GetAsyncGenFirstiter(void);
+extern PyObject* _PyEval_GetAsyncGenFinalizer(void);
+
+// Used by sys.set_asyncgen_hooks()
+extern int _PyEval_SetAsyncGenFirstiter(PyObject *);
+extern int _PyEval_SetAsyncGenFinalizer(PyObject *);
+
+// Used by sys.get_coroutine_origin_tracking_depth()
+// and sys.set_coroutine_origin_tracking_depth()
+extern int _PyEval_GetCoroutineOriginTrackingDepth(void);
+extern int _PyEval_SetCoroutineOriginTrackingDepth(int depth);
+
+extern void _PyEval_Fini(void);
+
+
+extern PyObject* _PyEval_GetBuiltins(PyThreadState *tstate);
+extern PyObject* _PyEval_BuiltinsFromGlobals(
+    PyThreadState *tstate,
+    PyObject *globals);
+
+// Trampoline API
+
+typedef struct {
+    // Callback to initialize the trampoline state
+    void* (*init_state)(void);
+    // Callback to register every trampoline being created
+    void (*write_state)(void* state, const void *code_addr,
+                        unsigned int code_size, PyCodeObject* code);
+    // Callback to free the trampoline state
+    int (*free_state)(void* state);
+} _PyPerf_Callbacks;
+
+extern int _PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *);
+extern void _PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *);
+extern int _PyPerfTrampoline_Init(int activate);
+extern int _PyPerfTrampoline_Fini(void);
+extern void _PyPerfTrampoline_FreeArenas(void);
+extern int _PyIsPerfTrampolineActive(void);
+extern PyStatus _PyPerfTrampoline_AfterFork_Child(void);
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+extern _PyPerf_Callbacks _Py_perfmap_callbacks;
+#endif
+
+static inline PyObject*
+_PyEval_EvalFrame(PyThreadState *tstate, struct _PyInterpreterFrame *frame, int throwflag)
+{
+    // Use the interpreter's eval_frame hook if set, else _PyEval_EvalFrameDefault().
+    EVAL_CALL_STAT_INC(EVAL_CALL_TOTAL);
+    return (tstate->interp->eval_frame != NULL)
+        ? tstate->interp->eval_frame(tstate, frame, throwflag)
+        : _PyEval_EvalFrameDefault(tstate, frame, throwflag);
+}
+
+extern PyObject*
+_PyEval_Vector(PyThreadState *tstate,
+            PyFunctionObject *func, PyObject *locals,
+            PyObject* const* args, size_t argcount,
+            PyObject *kwnames);
+
+extern int _PyEval_ThreadsInitialized(void);
+extern PyStatus _PyEval_InitGIL(PyThreadState *tstate, int own_gil);
+extern void _PyEval_FiniGIL(PyInterpreterState *interp);
+
+extern void _PyEval_AcquireLock(PyThreadState *tstate);
+extern void _PyEval_ReleaseLock(PyInterpreterState *, PyThreadState *);
+extern PyThreadState * _PyThreadState_SwapNoGIL(PyThreadState *);
+
+extern void _PyEval_DeactivateOpCache(void);
+
+
+/* --- _Py_EnterRecursiveCall() ----------------------------------------- */
+
+#ifdef USE_STACKCHECK
+/* With USE_STACKCHECK macro defined, trigger stack checks in
+   _Py_CheckRecursiveCall() on every 64th call to _Py_EnterRecursiveCall. */
+static inline int _Py_MakeRecCheck(PyThreadState *tstate)  {
+    return (tstate->c_recursion_remaining-- <= 0  /* compares the value before the decrement */
+            || (tstate->c_recursion_remaining & 63) == 0);  /* decremented value is a multiple of 64 */
+}
+#else  /* !USE_STACKCHECK: nonzero only when the pre-decrement counter was <= 0 */
+static inline int _Py_MakeRecCheck(PyThreadState *tstate) {
+    return tstate->c_recursion_remaining-- <= 0;
+}
+#endif
+
+PyAPI_FUNC(int) _Py_CheckRecursiveCall(
+    PyThreadState *tstate,
+    const char *where);
+
+int _Py_CheckRecursiveCallPy(
+    PyThreadState *tstate);
+
+// Nonzero only when a check was due and _Py_CheckRecursiveCall() was nonzero.
+static inline int _Py_EnterRecursiveCallTstate(PyThreadState *tstate,
+                                               const char *where) {
+    return _Py_MakeRecCheck(tstate) ? (_Py_CheckRecursiveCall(tstate, where) != 0) : 0;
+}
+
+static inline int _Py_EnterRecursiveCall(const char *where) {
+    return _Py_EnterRecursiveCallTstate(_PyThreadState_GET(), where);
+}
+
+// Hand back the unit that _Py_MakeRecCheck() took from c_recursion_remaining.
+static inline void _Py_LeaveRecursiveCallTstate(PyThreadState *tstate)  {
+    tstate->c_recursion_remaining += 1;
+}
+
+static inline void _Py_LeaveRecursiveCall(void)  {
+    _Py_LeaveRecursiveCallTstate(_PyThreadState_GET());
+}
+
+extern struct _PyInterpreterFrame* _PyEval_GetFrame(void);
+
+extern PyObject* _Py_MakeCoro(PyFunctionObject *func);
+
+extern int _Py_HandlePending(PyThreadState *tstate);
+
+extern PyObject * _PyEval_GetFrameLocals(void);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_CEVAL_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_ceval_state.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_ceval_state.h
new file mode 100644
index 000000000000..e56e43c6e0c6
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_ceval_state.h
@@ -0,0 +1,103 @@
+#ifndef Py_INTERNAL_CEVAL_STATE_H
+#define Py_INTERNAL_CEVAL_STATE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+#include "pycore_atomic.h"          /* _Py_atomic_address */
+#include "pycore_gil.h"             // struct _gil_runtime_state
+
+
+struct _pending_calls {
+    int busy;
+    PyThread_type_lock lock;
+    /* Request for running pending calls. */
+    _Py_atomic_int calls_to_do;
+    /* Request for looking at the `async_exc` field of the current
+       thread state.
+       Guarded by the GIL. */
+    int async_exc;
+#define NPENDINGCALLS 32
+    struct _pending_call {
+        int (*func)(void *);
+        void *arg;
+    } calls[NPENDINGCALLS];
+    int first;
+    int last;
+};
+
+typedef enum {
+    PERF_STATUS_FAILED = -1,  // Perf trampoline is in an invalid state
+    PERF_STATUS_NO_INIT = 0,  // Perf trampoline is not initialized
+    PERF_STATUS_OK = 1,       // Perf trampoline is ready to be executed
+} perf_status_t;
+
+
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+struct code_arena_st;
+
+struct trampoline_api_st {
+    void* (*init_state)(void);
+    void (*write_state)(void* state, const void *code_addr,
+                        unsigned int code_size, PyCodeObject* code);
+    int (*free_state)(void* state);
+    void *state;
+};
+#endif
+
+struct _ceval_runtime_state {
+    struct {
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+        perf_status_t status;
+        Py_ssize_t extra_code_index;
+        struct code_arena_st *code_arena;
+        struct trampoline_api_st trampoline_api;
+        FILE *map_file;
+#else
+        int _not_used;
+#endif
+    } perf;
+    /* Request for checking signals. It is shared by all interpreters (see
+       bpo-40513). Any thread of any interpreter can receive a signal, but only
+       the main thread of the main interpreter can handle signals: see
+       _Py_ThreadCanHandleSignals(). */
+    _Py_atomic_int signals_pending;
+    /* Pending calls to be made only on the main thread. */
+    struct _pending_calls pending_mainthread;
+};
+
+#ifdef PY_HAVE_PERF_TRAMPOLINE
+# define _PyEval_RUNTIME_PERF_INIT \
+    { \
+        .status = PERF_STATUS_NO_INIT, \
+        .extra_code_index = -1, \
+    }
+#else
+# define _PyEval_RUNTIME_PERF_INIT {0}
+#endif
+
+
+struct _ceval_state {
+    /* This single variable consolidates all requests to break out of
+       the fast path in the eval loop. */
+    _Py_atomic_int eval_breaker;
+    /* Request for dropping the GIL */
+    _Py_atomic_int gil_drop_request;
+    int recursion_limit;
+    struct _gil_runtime_state *gil;
+    int own_gil;
+    /* The GC is ready to be executed */
+    _Py_atomic_int gc_scheduled;
+    struct _pending_calls pending;
+};
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_CEVAL_STATE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_code.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_code.h
new file mode 100644
index 000000000000..92e0a8bbd394
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_code.h
@@ -0,0 +1,496 @@
+#ifndef Py_INTERNAL_CODE_H
+#define Py_INTERNAL_CODE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define CODE_MAX_WATCHERS 8
+
+/* PEP 659
+ * Specialization and quickening structs and helper functions
+ */
+
+
+// Inline caches. If you change the number of cache entries for an instruction,
+// you must *also* update the number of cache entries in Lib/opcode.py and bump
+// the magic number in Lib/importlib/_bootstrap_external.py!
+
+#define CACHE_ENTRIES(cache) (sizeof(cache)/sizeof(_Py_CODEUNIT))
+
+typedef struct {
+    uint16_t counter;
+    uint16_t index;
+    uint16_t module_keys_version;
+    uint16_t builtin_keys_version;
+} _PyLoadGlobalCache;
+
+#define INLINE_CACHE_ENTRIES_LOAD_GLOBAL CACHE_ENTRIES(_PyLoadGlobalCache)
+
+typedef struct {
+    uint16_t counter;
+} _PyBinaryOpCache;
+
+#define INLINE_CACHE_ENTRIES_BINARY_OP CACHE_ENTRIES(_PyBinaryOpCache)
+
+typedef struct {
+    uint16_t counter;
+} _PyUnpackSequenceCache;
+
+#define INLINE_CACHE_ENTRIES_UNPACK_SEQUENCE \
+    CACHE_ENTRIES(_PyUnpackSequenceCache)
+
+typedef struct {
+    uint16_t counter;
+} _PyCompareOpCache;
+
+#define INLINE_CACHE_ENTRIES_COMPARE_OP CACHE_ENTRIES(_PyCompareOpCache)
+
+typedef struct {
+    uint16_t counter;
+} _PyBinarySubscrCache;
+
+#define INLINE_CACHE_ENTRIES_BINARY_SUBSCR CACHE_ENTRIES(_PyBinarySubscrCache)
+
+typedef struct {
+    uint16_t counter;
+} _PySuperAttrCache;
+
+#define INLINE_CACHE_ENTRIES_LOAD_SUPER_ATTR CACHE_ENTRIES(_PySuperAttrCache)
+
+typedef struct {
+    uint16_t counter;
+    uint16_t version[2];
+    uint16_t index;
+} _PyAttrCache;
+
+typedef struct {
+    uint16_t counter;
+    uint16_t type_version[2];
+    uint16_t keys_version[2];
+    uint16_t descr[4];
+} _PyLoadMethodCache;
+
+
+// MUST be the max(_PyAttrCache, _PyLoadMethodCache)
+#define INLINE_CACHE_ENTRIES_LOAD_ATTR CACHE_ENTRIES(_PyLoadMethodCache)
+
+#define INLINE_CACHE_ENTRIES_STORE_ATTR CACHE_ENTRIES(_PyAttrCache)
+
+typedef struct {
+    uint16_t counter;
+    uint16_t func_version[2];
+} _PyCallCache;
+
+#define INLINE_CACHE_ENTRIES_CALL CACHE_ENTRIES(_PyCallCache)
+
+typedef struct {
+    uint16_t counter;
+} _PyStoreSubscrCache;
+
+#define INLINE_CACHE_ENTRIES_STORE_SUBSCR CACHE_ENTRIES(_PyStoreSubscrCache)
+
+typedef struct {
+    uint16_t counter;
+} _PyForIterCache;
+
+#define INLINE_CACHE_ENTRIES_FOR_ITER CACHE_ENTRIES(_PyForIterCache)
+
+typedef struct {
+    uint16_t counter;
+} _PySendCache;
+
+#define INLINE_CACHE_ENTRIES_SEND CACHE_ENTRIES(_PySendCache)
+
+// Borrowed references to common callables:
+struct callable_cache {
+    PyObject *isinstance;
+    PyObject *len;
+    PyObject *list_append;
+    PyObject *object__getattribute__;
+};
+
+/* "Locals plus" for a code object is the set of locals + cell vars +
+ * free vars.  This relates to variable names as well as offsets into
+ * the "fast locals" storage array of execution frames.  The compiler
+ * builds the list of names, their offsets, and the corresponding
+ * kind of local.
+ *
+ * Those kinds represent the source of the initial value and the
+ * variable's scope (as related to closures).  A "local" is an
+ * argument or other variable defined in the current scope.  A "free"
+ * variable is one that is defined in an outer scope and comes from
+ * the function's closure.  A "cell" variable is a local that escapes
+ * into an inner function as part of a closure, and thus must be
+ * wrapped in a cell.  Any "local" can also be a "cell", but the
+ * "free" kind is mutually exclusive with both.
+ */
+
+// Note that these all fit within a byte, as do combinations.
+// Later, we will use the smaller numbers to differentiate the different
+// kinds of locals (e.g. pos-only arg, varkwargs, local-only).
+#define CO_FAST_HIDDEN  0x10
+#define CO_FAST_LOCAL   0x20
+#define CO_FAST_CELL    0x40
+#define CO_FAST_FREE    0x80
+
+typedef unsigned char _PyLocals_Kind;
+
+static inline _PyLocals_Kind
+_PyLocals_GetKind(PyObject *kinds, int i)
+{
+    // 'kinds' is a bytes object; entry i is returned as an unsigned byte.
+    assert(PyBytes_Check(kinds));
+    assert(0 <= i && i < PyBytes_GET_SIZE(kinds));
+    return (_PyLocals_Kind)(PyBytes_AS_STRING(kinds)[i]);
+}
+
+static inline void
+_PyLocals_SetKind(PyObject *kinds, int i, _PyLocals_Kind kind)
+{
+    // Overwrite entry i of the bytes object 'kinds' in place.
+    assert(PyBytes_Check(kinds));
+    assert(0 <= i && i < PyBytes_GET_SIZE(kinds));
+    PyBytes_AS_STRING(kinds)[i] = (char) kind;
+}
+
+
+struct _PyCodeConstructor {
+    /* metadata */
+    PyObject *filename;
+    PyObject *name;
+    PyObject *qualname;
+    int flags;
+
+    /* the code */
+    PyObject *code;
+    int firstlineno;
+    PyObject *linetable;
+
+    /* used by the code */
+    PyObject *consts;
+    PyObject *names;
+
+    /* mapping frame offsets to information */
+    PyObject *localsplusnames;  // Tuple of strings
+    PyObject *localspluskinds;  // Bytes object, one byte per variable
+
+    /* args (within varnames) */
+    int argcount;
+    int posonlyargcount;
+    // XXX Replace argcount with posorkwargcount (argcount - posonlyargcount).
+    int kwonlyargcount;
+
+    /* needed to create the frame */
+    int stacksize;
+
+    /* used by the eval loop */
+    PyObject *exceptiontable;
+};
+
+// Using an "arguments struct" like this is helpful for maintainability
+// in a case such as this with many parameters.  It does bear a risk:
+// if the struct changes and callers are not updated properly then the
+// compiler will not catch problems (like a missing argument).  This can
+// cause hard-to-debug problems.  The risk is mitigated by the use of
+// check_code() in codeobject.c.  However, we may decide to switch
+// back to a regular function signature.  Regardless, this approach
+// wouldn't be appropriate if this weren't a strictly internal API.
+// (See the comments in https://github.com/python/cpython/pull/26258.)
+PyAPI_FUNC(int) _PyCode_Validate(struct _PyCodeConstructor *);
+PyAPI_FUNC(PyCodeObject *) _PyCode_New(struct _PyCodeConstructor *);
+
+
+/* Private API */
+
+/* Getters for internal PyCodeObject data. */
+extern PyObject* _PyCode_GetVarnames(PyCodeObject *);
+extern PyObject* _PyCode_GetCellvars(PyCodeObject *);
+extern PyObject* _PyCode_GetFreevars(PyCodeObject *);
+extern PyObject* _PyCode_GetCode(PyCodeObject *);
+
+/** API for initializing the line number tables. */
+extern int _PyCode_InitAddressRange(PyCodeObject* co, PyCodeAddressRange *bounds);
+
+/** Out of process API for initializing the location table. */
+extern void _PyLineTable_InitAddressRange(
+    const char *linetable,
+    Py_ssize_t length,
+    int firstlineno,
+    PyCodeAddressRange *range);
+
+/** API for traversing the line number table. */
+extern int _PyLineTable_NextAddressRange(PyCodeAddressRange *range);
+extern int _PyLineTable_PreviousAddressRange(PyCodeAddressRange *range);
+
+/* Specialization functions */
+
+extern void _Py_Specialize_LoadSuperAttr(PyObject *global_super, PyObject *cls,
+                                         _Py_CODEUNIT *instr, int load_method);
+extern void _Py_Specialize_LoadAttr(PyObject *owner, _Py_CODEUNIT *instr,
+                                    PyObject *name);
+extern void _Py_Specialize_StoreAttr(PyObject *owner, _Py_CODEUNIT *instr,
+                                     PyObject *name);
+extern void _Py_Specialize_LoadGlobal(PyObject *globals, PyObject *builtins,
+                                      _Py_CODEUNIT *instr, PyObject *name);
+extern void _Py_Specialize_BinarySubscr(PyObject *sub, PyObject *container,
+                                        _Py_CODEUNIT *instr);
+extern void _Py_Specialize_StoreSubscr(PyObject *container, PyObject *sub,
+                                       _Py_CODEUNIT *instr);
+extern void _Py_Specialize_Call(PyObject *callable, _Py_CODEUNIT *instr,
+                                int nargs, PyObject *kwnames);
+extern void _Py_Specialize_BinaryOp(PyObject *lhs, PyObject *rhs, _Py_CODEUNIT *instr,
+                                    int oparg, PyObject **locals);
+extern void _Py_Specialize_CompareOp(PyObject *lhs, PyObject *rhs,
+                                     _Py_CODEUNIT *instr, int oparg);
+extern void _Py_Specialize_UnpackSequence(PyObject *seq, _Py_CODEUNIT *instr,
+                                          int oparg);
+extern void _Py_Specialize_ForIter(PyObject *iter, _Py_CODEUNIT *instr, int oparg);
+extern void _Py_Specialize_Send(PyObject *receiver, _Py_CODEUNIT *instr);
+
+/* Finalizer function for static codeobjects used in deepfreeze.py */
+extern void _PyStaticCode_Fini(PyCodeObject *co);
+/* Function to intern strings of codeobjects and quicken the bytecode */
+extern int _PyStaticCode_Init(PyCodeObject *co);
+
+#ifdef Py_STATS
+// Each macro below updates one counter reached through _py_stats and does
+// nothing while _py_stats is NULL. 'cond' is parenthesized so any expression works.
+#define STAT_INC(opname, name) do { if (_py_stats) _py_stats->opcode_stats[opname].specialization.name++; } while (0)
+#define STAT_DEC(opname, name) do { if (_py_stats) _py_stats->opcode_stats[opname].specialization.name--; } while (0)
+#define OPCODE_EXE_INC(opname) do { if (_py_stats) _py_stats->opcode_stats[opname].execution_count++; } while (0)
+#define CALL_STAT_INC(name) do { if (_py_stats) _py_stats->call_stats.name++; } while (0)
+#define OBJECT_STAT_INC(name) do { if (_py_stats) _py_stats->object_stats.name++; } while (0)
+#define OBJECT_STAT_INC_COND(name, cond) \
+    do { if (_py_stats && (cond)) _py_stats->object_stats.name++; } while (0)
+#define EVAL_CALL_STAT_INC(name) do { if (_py_stats) _py_stats->call_stats.eval_calls[name]++; } while (0)
+#define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) \
+    do { if (_py_stats && PyFunction_Check(callable)) _py_stats->call_stats.eval_calls[name]++; } while (0)
+
+// Used by the _opcode extension which is built as a shared library
+PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void);
+
+#else
+#define STAT_INC(opname, name) ((void)0)
+#define STAT_DEC(opname, name) ((void)0)
+#define OPCODE_EXE_INC(opname) ((void)0)
+#define CALL_STAT_INC(name) ((void)0)
+#define OBJECT_STAT_INC(name) ((void)0)
+#define OBJECT_STAT_INC_COND(name, cond) ((void)0)
+#define EVAL_CALL_STAT_INC(name) ((void)0)
+#define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) ((void)0)
+#endif  // !Py_STATS
+
+// Utility functions for reading/writing 32/64-bit values in the inline caches.
+// Great care should be taken to ensure that these functions remain correct and
+// performant! They should compile to just "move" instructions on all supported
+// compilers and platforms.
+
+// We use memcpy to let the C compiler handle unaligned accesses and endianness
+// issues for us. It also seems to produce better code than manual copying for
+// most compilers (see https://blog.regehr.org/archives/959 for more info).
+
+// Store 'val' across the two consecutive 16-bit entries starting at p.
+static inline void write_u32(uint16_t *p, uint32_t val)
+{
+    memcpy(p, &val, sizeof(uint32_t));
+}
+
+// Store 'val' across the four consecutive 16-bit entries starting at p.
+static inline void write_u64(uint16_t *p, uint64_t val)
+{
+    memcpy(p, &val, sizeof(uint64_t));
+}
+
+// Store the pointer value 'val' itself (not the object) starting at p.
+static inline void write_obj(uint16_t *p, PyObject *val)
+{
+    memcpy(p, &val, sizeof(PyObject *));
+}
+
+// Load the single 16-bit entry at p.
+static inline uint16_t read_u16(uint16_t *p)
+{
+    return p[0];
+}
+
+// Reassemble a 32-bit value from the entries starting at p.
+static inline uint32_t read_u32(uint16_t *p)
+{
+    uint32_t out;
+    memcpy(&out, p, sizeof(out));
+    return out;
+}
+
+// Reassemble a 64-bit value from the entries starting at p.
+static inline uint64_t read_u64(uint16_t *p)
+{
+    uint64_t out;
+    memcpy(&out, p, sizeof(out));
+    return out;
+}
+
+// Reassemble the pointer value stored in the entries starting at p.
+static inline PyObject *read_obj(uint16_t *p)
+{
+    PyObject *out;
+    memcpy(&out, p, sizeof(out));
+    return out;
+}
+
+/* Decode one varint; see Objects/exception_handling_notes.txt for details. */
+static inline unsigned char *
+parse_varint(unsigned char *p, int *result) {
+    unsigned char byte = *p++;
+    int acc = byte & 63;
+    while (byte & 64) {
+        byte = *p++;
+        acc = (acc << 6) | (byte & 63);
+    }
+    *result = acc;
+    return p;
+}
+
+// Emit 'val' in 6-bit groups, least significant first; returns bytes written.
+static inline int
+write_varint(uint8_t *ptr, unsigned int val)
+{
+    uint8_t *cursor = ptr;
+    for (; val >= 64; val >>= 6) {
+        // bit 6 flags that another group follows
+        *cursor++ = 64 | (val & 63);
+    }
+    *cursor++ = (uint8_t)val;
+    return (int)(cursor - ptr);
+}
+
+static inline int
+write_signed_varint(uint8_t *ptr, int val)
+{
+    // Sign goes in bit 0, magnitude in the bits above it.
+    unsigned int magnitude = (unsigned int)val;
+    unsigned int sign = 0;
+    if (val < 0) {
+        // Negate in unsigned arithmetic: -val would be undefined for INT_MIN.
+        magnitude = 0 - magnitude;
+        sign = 1;
+    }
+    return write_varint(ptr, (magnitude << 1) | sign);
+}
+
+static inline int  /* always returns 1, matching the single byte stored at ptr */
+write_location_entry_start(uint8_t *ptr, int code, int length)
+{
+    assert((code & 15) == code);  /* code must fit in 4 bits */
+    *ptr = 128 | (uint8_t)(code << 3) | (uint8_t)(length - 1);  /* bit 7 set, code from bit 3 up, length-1 ORed in */
+    return 1;
+}
+
+
+/** Counters
+ * The first 16-bit value in each inline cache is a counter.
+ * When counting misses, the counter is treated as a simple unsigned value.
+ *
+ * When counting executions until the next specialization attempt,
+ * exponential backoff is used to reduce the number of specialization failures.
+ * The high 12 bits store the counter, the low 4 bits store the backoff exponent.
+ * On a specialization failure, the backoff exponent is incremented and the
+ * counter set to (2**backoff - 1).
+ * Backoff == 6 -> starting counter == 63, backoff == 10 -> starting counter == 1023.
+ */
+
+/* With a 16-bit counter, we have 12 bits for the counter value, and 4 bits for the backoff */
+#define ADAPTIVE_BACKOFF_BITS 4
+
+// A value of 1 means that we attempt to specialize the *second* time each
+// instruction is executed. Executing twice is a much better indicator of
+// "hotness" than executing once, but additional warmup delays only prevent
+// specialization. Most types stabilize by the second execution, too:
+#define ADAPTIVE_WARMUP_VALUE 1
+#define ADAPTIVE_WARMUP_BACKOFF 1
+
+// A value of 52 means that we attempt to re-specialize after 53 misses (a prime
+// number, useful for avoiding artifacts if every nth value is a different type
+// or something). Setting the backoff to 0 means that the counter is reset to
+// the same state as a warming-up instruction (value == 1, backoff == 1) after
+// deoptimization. This isn't strictly necessary, but it is a bit easier to reason
+// about when thinking about the opcode transitions as a state machine:
+#define ADAPTIVE_COOLDOWN_VALUE 52
+#define ADAPTIVE_COOLDOWN_BACKOFF 0
+
+#define MAX_BACKOFF_VALUE (16 - ADAPTIVE_BACKOFF_BITS)
+
+
+static inline uint16_t
+adaptive_counter_bits(uint16_t value, uint16_t backoff) {
+    const int low_mask = (1 << ADAPTIVE_BACKOFF_BITS) - 1;
+    return (value << ADAPTIVE_BACKOFF_BITS) | (backoff & low_mask);
+}
+
+static inline uint16_t
+adaptive_counter_warmup(void) {
+    return adaptive_counter_bits(ADAPTIVE_WARMUP_VALUE,
+                                 ADAPTIVE_WARMUP_BACKOFF);
+}
+
+static inline uint16_t
+adaptive_counter_cooldown(void) {
+    return adaptive_counter_bits(ADAPTIVE_COOLDOWN_VALUE,
+                                 ADAPTIVE_COOLDOWN_BACKOFF);
+}
+
+static inline uint16_t
+adaptive_counter_backoff(uint16_t counter) {
+    // Bump the exponent held in the low bits, saturating at MAX_BACKOFF_VALUE.
+    uint16_t exponent = (counter & ((1 << ADAPTIVE_BACKOFF_BITS) - 1)) + 1;
+    if (exponent > MAX_BACKOFF_VALUE) {
+        exponent = MAX_BACKOFF_VALUE;
+    }
+    // Restart the count at 2**exponent - 1.
+    return adaptive_counter_bits((uint16_t)(1 << exponent) - 1, exponent);
+}
+
+
+/* Line array cache for tracing */
+
+typedef struct _PyShimCodeDef {
+    const uint8_t *code;
+    int codelen;
+    int stacksize;
+    const char *cname;
+} _PyShimCodeDef;
+
+extern PyCodeObject *
+_Py_MakeShimCode(const _PyShimCodeDef *code);
+
+extern uint32_t _Py_next_func_version;
+
+
+/* Comparison bit masks. */
+
+/* Note this evaluates its arguments twice each */
+#define COMPARISON_BIT(x, y) (1 << (2 * ((x) >= (y)) + ((x) <= (y))))
+
+/*
+ * The following bits are chosen so that the value of
+ * COMPARISON_BIT(left, right)
+ * masked by the values below will be non-zero if the
+ * comparison is true, and zero if it is false */
+
+/* This is for values that are unordered, ie. NaN, not types that are unordered, e.g. sets */
+#define COMPARISON_UNORDERED 1
+
+#define COMPARISON_LESS_THAN 2
+#define COMPARISON_GREATER_THAN 4
+#define COMPARISON_EQUALS 8
+
+#define COMPARISON_NOT_EQUALS (COMPARISON_UNORDERED | COMPARISON_LESS_THAN | COMPARISON_GREATER_THAN)
+
+extern int _Py_Instrument(PyCodeObject *co, PyInterpreterState *interp);
+
+extern int _Py_GetBaseOpcode(PyCodeObject *code, int offset);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_CODE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_compile.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_compile.h
new file mode 100644
index 000000000000..80a637e5bf9a
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_compile.h
@@ -0,0 +1,118 @@
+#ifndef Py_INTERNAL_COMPILE_H
+#define Py_INTERNAL_COMPILE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+struct _arena;   // Type defined in pycore_pyarena.h
+struct _mod;     // Type defined in pycore_ast.h
+
+// Export the symbol for test_peg_generator (built as a library)
+PyAPI_FUNC(PyCodeObject*) _PyAST_Compile(
+    struct _mod *mod,
+    PyObject *filename,
+    PyCompilerFlags *flags,
+    int optimize,
+    struct _arena *arena);
+
+static const _PyCompilerSrcLocation NO_LOCATION = {-1, -1, -1, -1};
+
+typedef struct {
+    int optimize;
+    int ff_features;
+
+    int recursion_depth;            /* current recursion depth */
+    int recursion_limit;            /* recursion limit */
+} _PyASTOptimizeState;
+
+extern int _PyAST_Optimize(
+    struct _mod *,
+    struct _arena *arena,
+    _PyASTOptimizeState *state);
+
+typedef struct {
+    int h_offset;
+    int h_startdepth;
+    int h_preserve_lasti;
+} _PyCompile_ExceptHandlerInfo;
+
+typedef struct {
+    int i_opcode;
+    int i_oparg;
+    _PyCompilerSrcLocation i_loc;
+    _PyCompile_ExceptHandlerInfo i_except_handler_info;
+} _PyCompile_Instruction;
+
+typedef struct {
+    _PyCompile_Instruction *s_instrs;
+    int s_allocated;
+    int s_used;
+
+    int *s_labelmap;       /* label id --> instr offset */
+    int s_labelmap_size;
+    int s_next_free_label; /* next free label id */
+} _PyCompile_InstructionSequence;
+
+typedef struct {
+    PyObject *u_name;
+    PyObject *u_qualname;  /* dot-separated qualified name (lazy) */
+
+    /* The following fields are dicts that map objects to
+       the index of them in co_XXX.      The index is used as
+       the argument for opcodes that refer to those collections.
+    */
+    PyObject *u_consts;    /* all constants */
+    PyObject *u_names;     /* all names */
+    PyObject *u_varnames;  /* local variables */
+    PyObject *u_cellvars;  /* cell variables */
+    PyObject *u_freevars;  /* free variables */
+    PyObject *u_fasthidden; /* dict; keys are names that are fast-locals only
+                               temporarily within an inlined comprehension. When
+                               value is True, treat as fast-local. */
+
+    Py_ssize_t u_argcount;        /* number of arguments for block */
+    Py_ssize_t u_posonlyargcount;        /* number of positional only arguments for block */
+    Py_ssize_t u_kwonlyargcount; /* number of keyword only arguments for block */
+
+    int u_firstlineno; /* the first lineno of the block */
+} _PyCompile_CodeUnitMetadata;
+
+
+/* Utility for a number of growing arrays used in the compiler */
+int _PyCompile_EnsureArrayLargeEnough(
+        int idx,
+        void **array,
+        int *alloc,
+        int default_alloc,
+        size_t item_size);
+
+int _PyCompile_ConstCacheMergeOne(PyObject *const_cache, PyObject **obj);
+
+int _PyCompile_InstrSize(int opcode, int oparg);
+
+/* Access compiler internals for unit testing */
+
+PyAPI_FUNC(PyObject*) _PyCompile_CodeGen(
+        PyObject *ast,
+        PyObject *filename,
+        PyCompilerFlags *flags,
+        int optimize,
+        int compile_mode);
+
+PyAPI_FUNC(PyObject*) _PyCompile_OptimizeCfg(
+        PyObject *instructions,
+        PyObject *consts,
+        int nlocals);
+
+PyAPI_FUNC(PyCodeObject*)
+_PyCompile_Assemble(_PyCompile_CodeUnitMetadata *umd, PyObject *filename,
+                    PyObject *instructions);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_COMPILE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_condvar.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_condvar.h
new file mode 100644
index 000000000000..981c962bf7df
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_condvar.h
@@ -0,0 +1,97 @@
+#ifndef Py_INTERNAL_CONDVAR_H
+#define Py_INTERNAL_CONDVAR_H
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#ifndef _POSIX_THREADS
+/* This means pthreads are not implemented in libc headers, hence the macro
+   not present in unistd.h. But they still can be implemented as an external
+   library (e.g. gnu pth in pthread emulation) */
+# ifdef HAVE_PTHREAD_H
+#  include <pthread.h> /* _POSIX_THREADS */
+# endif
+#endif
+
+#ifdef _POSIX_THREADS
+/*
+ * POSIX support
+ */
+#define Py_HAVE_CONDVAR
+
+#ifdef HAVE_PTHREAD_H
+#  include <pthread.h>
+#endif
+
+#define PyMUTEX_T pthread_mutex_t
+#define PyCOND_T pthread_cond_t
+
+#elif defined(NT_THREADS)
+/*
+ * Windows (XP, 2003 server and later, as well as (hopefully) CE) support
+ *
+ * Emulated condition variables that work with XP and later, plus
+ * example native support on Vista and onwards.
+ */
+#define Py_HAVE_CONDVAR
+
+/* include windows if it hasn't been done before */
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+
+/* options */
+/* non-emulated condition variables are provided for those that want
+ * to target Windows Vista.  Modify this macro to enable them.
+ */
+#ifndef _PY_EMULATED_WIN_CV
+#define _PY_EMULATED_WIN_CV 1  /* use emulated condition variables */
+#endif
+
+/* fall back to emulation if not targeting Vista */
+#if !defined NTDDI_VISTA || NTDDI_VERSION < NTDDI_VISTA
+#undef _PY_EMULATED_WIN_CV
+#define _PY_EMULATED_WIN_CV 1
+#endif
+
+#if _PY_EMULATED_WIN_CV
+
+typedef CRITICAL_SECTION PyMUTEX_T;
+
+/* The ConditionVariable object.  From XP onwards it is easily emulated
+   with a Semaphore.
+   Semaphores are available on Windows XP (2003 server) and later.
+   We use a Semaphore rather than an auto-reset event, because although
+   an auto-reset event might appear to solve the lost-wakeup bug (race
+   condition between releasing the outer lock and waiting) because it
+   maintains state even though a wait hasn't happened, there is still
+   a lost wakeup problem if more than one thread is interrupted in the
+   critical place.  A semaphore solves that, because its state is
+   counted, not Boolean.
+   Because it is ok to signal a condition variable with no one
+   waiting, we need to keep track of the number of
+   waiting threads.  Otherwise, the semaphore's state could rise
+   without bound.  This also helps reduce the number of "spurious wakeups"
+   that would otherwise happen.
+ */
+
+typedef struct _PyCOND_T
+{
+    HANDLE sem;
+    int waiting; /* to allow PyCOND_SIGNAL to be a no-op */
+} PyCOND_T;
+
+#else /* !_PY_EMULATED_WIN_CV */
+
+/* Use native Vista primitives if the build target is Vista or higher */
+
+/* SRWLOCK is faster and better than CriticalSection */
+typedef SRWLOCK PyMUTEX_T;
+
+typedef CONDITION_VARIABLE  PyCOND_T;
+
+#endif /* _PY_EMULATED_WIN_CV */
+
+#endif /* _POSIX_THREADS, NT_THREADS */
+
+#endif /* Py_INTERNAL_CONDVAR_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_context.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_context.h
new file mode 100644
index 000000000000..52dfe3ef2338
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_context.h
@@ -0,0 +1,71 @@
+#ifndef Py_INTERNAL_CONTEXT_H
+#define Py_INTERNAL_CONTEXT_H
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_hamt.h"   /* PyHamtObject */
+
+
+extern PyTypeObject _PyContextTokenMissing_Type;
+
+/* runtime lifecycle */
+
+PyStatus _PyContext_Init(PyInterpreterState *);
+void _PyContext_Fini(PyInterpreterState *);
+
+
+/* other API */
+
+typedef struct {
+    PyObject_HEAD
+} _PyContextTokenMissing;
+
+#ifndef WITH_FREELISTS
+// without freelists
+#  define PyContext_MAXFREELIST 0
+#endif
+
+#ifndef PyContext_MAXFREELIST
+#  define PyContext_MAXFREELIST 255
+#endif
+
+struct _Py_context_state {
+#if PyContext_MAXFREELIST > 0
+    // List of free PyContext objects
+    PyContext *freelist;
+    int numfree;
+#endif
+};
+
+struct _pycontextobject {
+    PyObject_HEAD
+    PyContext *ctx_prev;
+    PyHamtObject *ctx_vars;
+    PyObject *ctx_weakreflist;
+    int ctx_entered;
+};
+
+
+struct _pycontextvarobject {
+    PyObject_HEAD
+    PyObject *var_name;
+    PyObject *var_default;
+    PyObject *var_cached;
+    uint64_t var_cached_tsid;
+    uint64_t var_cached_tsver;
+    Py_hash_t var_hash;
+};
+
+
+struct _pycontexttokenobject {
+    PyObject_HEAD
+    PyContext *tok_ctx;
+    PyContextVar *tok_var;
+    PyObject *tok_oldval;
+    int tok_used;
+};
+
+
+#endif /* !Py_INTERNAL_CONTEXT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_descrobject.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_descrobject.h
new file mode 100644
index 000000000000..76378569df90
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_descrobject.h
@@ -0,0 +1,26 @@
+#ifndef Py_INTERNAL_DESCROBJECT_H
+#define Py_INTERNAL_DESCROBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+typedef struct {
+    PyObject_HEAD
+    PyObject *prop_get;
+    PyObject *prop_set;
+    PyObject *prop_del;
+    PyObject *prop_doc;
+    PyObject *prop_name;
+    int getter_doc;
+} propertyobject;
+
+typedef propertyobject _PyPropertyObject;
+
+#ifdef __cplusplus
+}
+#endif
+#endif   /* !Py_INTERNAL_DESCROBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_dict.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_dict.h
new file mode 100644
index 000000000000..6253e0841ad3
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_dict.h
@@ -0,0 +1,199 @@
+
+#ifndef Py_INTERNAL_DICT_H
+#define Py_INTERNAL_DICT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_dict_state.h"
+#include "pycore_runtime.h"         // _PyRuntime
+
+
+/* runtime lifecycle */
+
+extern void _PyDict_Fini(PyInterpreterState *interp);
+
+
+/* other API */
+
+typedef struct {
+    /* Cached hash code of me_key. */
+    Py_hash_t me_hash;
+    PyObject *me_key;
+    PyObject *me_value; /* This field is only meaningful for combined tables */
+} PyDictKeyEntry;
+
+typedef struct {
+    PyObject *me_key;   /* The key must be Unicode and have hash. */
+    PyObject *me_value; /* This field is only meaningful for combined tables */
+} PyDictUnicodeEntry;
+
+extern PyDictKeysObject *_PyDict_NewKeysForClass(void);
+extern PyObject *_PyDict_FromKeys(PyObject *, PyObject *, PyObject *);
+
+/* Gets a version number unique to the current state of the keys of dict, if possible.
+ * Returns the version number, or zero if it was not possible to get a version number. */
+extern uint32_t _PyDictKeys_GetVersionForCurrentState(
+        PyInterpreterState *interp, PyDictKeysObject *dictkeys);
+
+extern size_t _PyDict_KeysSize(PyDictKeysObject *keys);
+
+/* _Py_dict_lookup() returns index of entry which can be used like DK_ENTRIES(dk)[index].
+ * -1 when no entry found, -3 when compare raises error.
+ */
+extern Py_ssize_t _Py_dict_lookup(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject **value_addr);
+
+extern Py_ssize_t _PyDict_LookupIndex(PyDictObject *, PyObject *);
+extern Py_ssize_t _PyDictKeys_StringLookup(PyDictKeysObject* dictkeys, PyObject *key);
+extern PyObject *_PyDict_LoadGlobal(PyDictObject *, PyDictObject *, PyObject *);
+
+/* Consumes references to key and value */
+extern int _PyDict_SetItem_Take2(PyDictObject *op, PyObject *key, PyObject *value);
+extern int _PyObjectDict_SetItem(PyTypeObject *tp, PyObject **dictptr, PyObject *name, PyObject *value);
+
+extern PyObject *_PyDict_Pop_KnownHash(PyObject *, PyObject *, Py_hash_t, PyObject *);
+
+#define DKIX_EMPTY (-1)
+#define DKIX_DUMMY (-2)  /* Used internally */
+#define DKIX_ERROR (-3)
+#define DKIX_KEY_CHANGED (-4) /* Used internally */
+
+typedef enum {
+    DICT_KEYS_GENERAL = 0,
+    DICT_KEYS_UNICODE = 1,
+    DICT_KEYS_SPLIT = 2
+} DictKeysKind;
+
+/* See dictobject.c for actual layout of DictKeysObject */
+struct _dictkeysobject {
+    Py_ssize_t dk_refcnt;
+
+    /* log2 of the size of the hash table (dk_indices); see DK_SIZE(). */
+    uint8_t dk_log2_size;
+
+    /* log2 of the size of the hash table (dk_indices) in bytes. */
+    uint8_t dk_log2_index_bytes;
+
+    /* Kind of keys: one of the DictKeysKind values */
+    uint8_t dk_kind;
+
+    /* Version number -- Reset to 0 by any modification to keys */
+    uint32_t dk_version;
+
+    /* Number of usable entries in dk_entries. */
+    Py_ssize_t dk_usable;
+
+    /* Number of used entries in dk_entries. */
+    Py_ssize_t dk_nentries;
+
+    /* Actual hash table of DK_SIZE(dk) entries. It holds indices in dk_entries,
+       or DKIX_EMPTY(-1) or DKIX_DUMMY(-2).
+
+       Indices must be: 0 <= index < USABLE_FRACTION(DK_SIZE(dk)).
+
+       The size in bytes of an index depends on DK_SIZE(dk):
+
+       - 1 byte if DK_SIZE(dk) <= 0xff (char*)
+       - 2 bytes if DK_SIZE(dk) <= 0xffff (int16_t*)
+       - 4 bytes if DK_SIZE(dk) <= 0xffffffff (int32_t*)
+       - 8 bytes otherwise (int64_t*)
+
+       Dynamically sized, SIZEOF_VOID_P is minimum. */
+    char dk_indices[];  /* char is required to avoid strict aliasing. */
+
+    /* "PyDictKeyEntry or PyDictUnicodeEntry dk_entries[USABLE_FRACTION(DK_SIZE(dk))];" array follows:
+       see the DK_ENTRIES() and DK_UNICODE_ENTRIES() accessors */
+};
+
+/* This must be no more than 250, for the prefix size to fit in one byte. */
+#define SHARED_KEYS_MAX_SIZE 30
+#define NEXT_LOG2_SHARED_KEYS_MAX_SIZE 6
+
+/* Layout of dict values:
+ *
+ * The PyObject *values are preceded by an array of bytes holding
+ * the insertion order and size.
+ * [-1] = prefix size. [-2] = used size. size[-2-n...] = insertion order.
+ */
+struct _dictvalues {
+    PyObject *values[1];
+};
+
+#define DK_LOG_SIZE(dk)  _Py_RVALUE((dk)->dk_log2_size)
+#if SIZEOF_VOID_P > 4
+#define DK_SIZE(dk)      (((int64_t)1)<<DK_LOG_SIZE(dk))
+#else
+#define DK_SIZE(dk)      (1<<DK_LOG_SIZE(dk))
+#endif
+
+/* Start of the entries array, which follows the dk_indices bytes. */
+static inline void* _DK_ENTRIES(PyDictKeysObject *dk) {
+    size_t index_bytes = (size_t)1 << dk->dk_log2_index_bytes;
+    return (int8_t*)(dk->dk_indices) + index_bytes;
+}
+static inline PyDictKeyEntry* DK_ENTRIES(PyDictKeysObject *dk) {
+    assert(dk->dk_kind == DICT_KEYS_GENERAL);
+    return (PyDictKeyEntry*)_DK_ENTRIES(dk);
+}
+static inline PyDictUnicodeEntry* DK_UNICODE_ENTRIES(PyDictKeysObject *dk) {
+    assert(dk->dk_kind != DICT_KEYS_GENERAL);
+    return (PyDictUnicodeEntry*)_DK_ENTRIES(dk);
+}
+
+#define DK_IS_UNICODE(dk) ((dk)->dk_kind != DICT_KEYS_GENERAL)
+
+#define DICT_VERSION_INCREMENT (1 << DICT_MAX_WATCHERS)
+#define DICT_VERSION_MASK (DICT_VERSION_INCREMENT - 1)
+
+#define DICT_NEXT_VERSION(INTERP) \
+    ((INTERP)->dict_state.global_version += DICT_VERSION_INCREMENT)
+
+void
+_PyDict_SendEvent(int watcher_bits,
+                  PyDict_WatchEvent event,
+                  PyDictObject *mp,
+                  PyObject *key,
+                  PyObject *value);
+
+static inline uint64_t
+_PyDict_NotifyEvent(PyInterpreterState *interp,
+                    PyDict_WatchEvent event,
+                    PyDictObject *mp,
+                    PyObject *key,
+                    PyObject *value)
+{
+    assert(Py_REFCNT((PyObject*)mp) > 0);
+    /* Watcher bits are the low DICT_MAX_WATCHERS bits of the version tag. */
+    int watchers = mp->ma_version_tag & DICT_VERSION_MASK;
+    if (watchers != 0) {
+        _PyDict_SendEvent(watchers, event, mp, key, value);
+    }
+    return DICT_NEXT_VERSION(interp) | watchers;
+}
+
+extern PyObject *_PyObject_MakeDictFromInstanceAttributes(PyObject *obj, PyDictValues *values);
+extern PyObject *_PyDict_FromItems(
+        PyObject *const *keys, Py_ssize_t keys_offset,
+        PyObject *const *values, Py_ssize_t values_offset,
+        Py_ssize_t length);
+
+static inline void
+_PyDictValues_AddToInsertionOrder(PyDictValues *values, Py_ssize_t ix)
+{
+    assert(ix < SHARED_KEYS_MAX_SIZE);
+    /* Bytes before the values: [-1] prefix size, [-2] used size. */
+    uint8_t *prefix = (uint8_t *)values;
+    assert(prefix[-2] + 2 < prefix[-1]);
+    int used = prefix[-2] + 1;
+    prefix[-2 - used] = (uint8_t)ix;
+    prefix[-2] = used;
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif   /* !Py_INTERNAL_DICT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_dict_state.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_dict_state.h
new file mode 100644
index 000000000000..ece0f10ca251
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_dict_state.h
@@ -0,0 +1,50 @@
+#ifndef Py_INTERNAL_DICT_STATE_H
+#define Py_INTERNAL_DICT_STATE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+#ifndef WITH_FREELISTS
+// without freelists
+#  define PyDict_MAXFREELIST 0
+#endif
+
+#ifndef PyDict_MAXFREELIST
+#  define PyDict_MAXFREELIST 80
+#endif
+
+#define DICT_MAX_WATCHERS 8
+
+struct _Py_dict_state {
+    /* Global counter used to set ma_version_tag field of dictionary.
+     * It is incremented each time that a dictionary is created and each
+     * time that a dictionary is modified. */
+    uint64_t global_version;
+    uint32_t next_keys_version;
+
+#if PyDict_MAXFREELIST > 0
+    /* Dictionary reuse scheme to save calls to malloc and free */
+    PyDictObject *free_list[PyDict_MAXFREELIST];
+    PyDictKeysObject *keys_free_list[PyDict_MAXFREELIST];
+    int numfree;
+    int keys_numfree;
+#endif
+
+    PyDict_WatchCallback watchers[DICT_MAX_WATCHERS];
+};
+
+#define _dict_state_INIT \
+    { \
+        .next_keys_version = 2, \
+    }
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif   /* !Py_INTERNAL_DICT_STATE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_dtoa.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_dtoa.h
new file mode 100644
index 000000000000..4d9681d59a64
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_dtoa.h
@@ -0,0 +1,73 @@
+#ifndef Py_INTERNAL_DTOA_H
+#define Py_INTERNAL_DTOA_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_pymath.h"        // _PY_SHORT_FLOAT_REPR
+
+
+#if _PY_SHORT_FLOAT_REPR == 1
+
+typedef uint32_t ULong;
+
+struct
+Bigint {
+    struct Bigint *next;
+    int k, maxwds, sign, wds;
+    ULong x[1];
+};
+
+#ifdef Py_USING_MEMORY_DEBUGGER
+
+struct _dtoa_state {
+    int _not_used;
+};
+#define _dtoa_state_INIT(INTERP) {0}  /* same macro name as the #else branch */
+#define _dtoa_interp_state_INIT(INTERP) _dtoa_state_INIT(INTERP)  /* old name */
+
+#else  // !Py_USING_MEMORY_DEBUGGER
+
+/* The size of the Bigint freelist */
+#define Bigint_Kmax 7
+
+#ifndef PRIVATE_MEM
+#define PRIVATE_MEM 2304
+#endif
+#define Bigint_PREALLOC_SIZE \
+    ((PRIVATE_MEM+sizeof(double)-1)/sizeof(double))
+
+struct _dtoa_state {
+    /* p5s is a linked list of powers of 5 of the form 5**(2**i), i >= 2 */
+    // XXX This should be freed during runtime fini.
+    struct Bigint *p5s;
+    struct Bigint *freelist[Bigint_Kmax+1];
+    double preallocated[Bigint_PREALLOC_SIZE];
+    double *preallocated_next;
+};
+#define _dtoa_state_INIT(INTERP) \
+    { \
+        .preallocated_next = (INTERP)->dtoa.preallocated, \
+    }
+
+#endif  // !Py_USING_MEMORY_DEBUGGER
+
+
+/* These functions are used by modules compiled as C extension like math:
+   they must be exported. */
+
+PyAPI_FUNC(double) _Py_dg_strtod(const char *str, char **ptr);
+PyAPI_FUNC(char *) _Py_dg_dtoa(double d, int mode, int ndigits,
+                        int *decpt, int *sign, char **rve);
+PyAPI_FUNC(void) _Py_dg_freedtoa(char *s);
+
+#endif // _PY_SHORT_FLOAT_REPR == 1
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_DTOA_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_emscripten_signal.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_emscripten_signal.h
new file mode 100644
index 000000000000..8b3287d85da4
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_emscripten_signal.h
@@ -0,0 +1,25 @@
+#ifndef Py_EMSCRIPTEN_SIGNAL_H
+#define Py_EMSCRIPTEN_SIGNAL_H
+
+#if defined(__EMSCRIPTEN__)
+
+void
+_Py_CheckEmscriptenSignals(void);
+
+void
+_Py_CheckEmscriptenSignalsPeriodically(void);
+
+#define _Py_CHECK_EMSCRIPTEN_SIGNALS() _Py_CheckEmscriptenSignals()
+
+#define _Py_CHECK_EMSCRIPTEN_SIGNALS_PERIODICALLY() _Py_CheckEmscriptenSignalsPeriodically()
+
+extern int Py_EMSCRIPTEN_SIGNAL_HANDLING;
+
+#else
+
+#define _Py_CHECK_EMSCRIPTEN_SIGNALS()
+#define _Py_CHECK_EMSCRIPTEN_SIGNALS_PERIODICALLY()
+
+#endif // defined(__EMSCRIPTEN__)
+
+#endif // ndef Py_EMSCRIPTEN_SIGNAL_H
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_exceptions.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_exceptions.h
new file mode 100644
index 000000000000..4a9df7091319
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_exceptions.h
@@ -0,0 +1,37 @@
+#ifndef Py_INTERNAL_EXCEPTIONS_H
+#define Py_INTERNAL_EXCEPTIONS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+/* runtime lifecycle */
+
+extern PyStatus _PyExc_InitState(PyInterpreterState *);
+extern PyStatus _PyExc_InitGlobalObjects(PyInterpreterState *);
+extern int _PyExc_InitTypes(PyInterpreterState *);
+extern void _PyExc_Fini(PyInterpreterState *);
+
+
+/* other API */
+
+struct _Py_exc_state {
+    // The dict mapping from errno codes to OSError subclasses
+    PyObject *errnomap;
+    PyBaseExceptionObject *memerrors_freelist;
+    int memerrors_numfree;
+    // The ExceptionGroup type
+    PyObject *PyExc_ExceptionGroup;
+};
+
+extern void _PyExc_ClearExceptionGroupType(PyInterpreterState *);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_EXCEPTIONS_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_faulthandler.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_faulthandler.h
new file mode 100644
index 000000000000..e6aec7745a64
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_faulthandler.h
@@ -0,0 +1,99 @@
+#ifndef Py_INTERNAL_FAULTHANDLER_H
+#define Py_INTERNAL_FAULTHANDLER_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#ifdef HAVE_SIGACTION
+#  include <signal.h>
+#endif
+
+
+#ifndef MS_WINDOWS
+   /* register() is useless on Windows, because only SIGSEGV, SIGABRT and
+      SIGILL can be handled by the process, and these signals can only be used
+      with enable(), not using register() */
+#  define FAULTHANDLER_USER
+#endif
+
+
+#ifdef HAVE_SIGACTION
+/* Using an alternative stack requires sigaltstack()
+   and sigaction() SA_ONSTACK */
+#  ifdef HAVE_SIGALTSTACK
+#    define FAULTHANDLER_USE_ALT_STACK
+#  endif
+typedef struct sigaction _Py_sighandler_t;
+#else
+typedef PyOS_sighandler_t _Py_sighandler_t;
+#endif  // HAVE_SIGACTION
+
+
+#ifdef FAULTHANDLER_USER
+struct faulthandler_user_signal {
+    int enabled;
+    PyObject *file;
+    int fd;
+    int all_threads;
+    int chain;
+    _Py_sighandler_t previous;
+    PyInterpreterState *interp;
+};
+#endif /* FAULTHANDLER_USER */
+
+
+struct _faulthandler_runtime_state {
+    struct {
+        int enabled;
+        PyObject *file;
+        int fd;
+        int all_threads;
+        PyInterpreterState *interp;
+#ifdef MS_WINDOWS
+        void *exc_handler;
+#endif
+    } fatal_error;
+
+    struct {
+        PyObject *file;
+        int fd;
+        PY_TIMEOUT_T timeout_us;   /* timeout in microseconds */
+        int repeat;
+        PyInterpreterState *interp;
+        int exit;
+        char *header;
+        size_t header_len;
+        /* The main thread always holds this lock. It is only released when
+           faulthandler_thread() is interrupted before this thread exits, or at
+           Python exit. */
+        PyThread_type_lock cancel_event;
+        /* released by child thread when joined */
+        PyThread_type_lock running;
+    } thread;
+
+#ifdef FAULTHANDLER_USER
+    struct faulthandler_user_signal *user_signals;
+#endif
+
+#ifdef FAULTHANDLER_USE_ALT_STACK
+    stack_t stack;
+    stack_t old_stack;
+#endif
+};
+
+#define _faulthandler_runtime_state_INIT \
+    { \
+        .fatal_error = { \
+            .fd = -1, \
+        }, \
+    }
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_FAULTHANDLER_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_fileutils.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_fileutils.h
new file mode 100644
index 000000000000..7c2b6ec0bffe
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_fileutils.h
@@ -0,0 +1,292 @@
+#ifndef Py_INTERNAL_FILEUTILS_H
+#define Py_INTERNAL_FILEUTILS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "Py_BUILD_CORE must be defined to include this header"
+#endif
+
+#include <locale.h>   /* struct lconv */
+
+
+struct _fileutils_state {
+    int force_ascii;
+};
+
+typedef enum {
+    _Py_ERROR_UNKNOWN=0,
+    _Py_ERROR_STRICT,
+    _Py_ERROR_SURROGATEESCAPE,
+    _Py_ERROR_REPLACE,
+    _Py_ERROR_IGNORE,
+    _Py_ERROR_BACKSLASHREPLACE,
+    _Py_ERROR_SURROGATEPASS,
+    _Py_ERROR_XMLCHARREFREPLACE,
+    _Py_ERROR_OTHER
+} _Py_error_handler;
+
+PyAPI_FUNC(_Py_error_handler) _Py_GetErrorHandler(const char *errors);
+
+PyAPI_FUNC(int) _Py_DecodeLocaleEx(
+    const char *arg,
+    wchar_t **wstr,
+    size_t *wlen,
+    const char **reason,
+    int current_locale,
+    _Py_error_handler errors);
+
+PyAPI_FUNC(int) _Py_EncodeLocaleEx(
+    const wchar_t *text,
+    char **str,
+    size_t *error_pos,
+    const char **reason,
+    int current_locale,
+    _Py_error_handler errors);
+
+PyAPI_FUNC(char*) _Py_EncodeLocaleRaw(
+    const wchar_t *text,
+    size_t *error_pos);
+
+PyAPI_FUNC(PyObject *) _Py_device_encoding(int);
+
+#if defined(MS_WINDOWS) || defined(__APPLE__)
+    /* On Windows, the count parameter of read() is an int (bpo-9015, bpo-9611).
+       On macOS 10.13, read() and write() with more than INT_MAX bytes
+       fail with EINVAL (bpo-24658). */
+#   define _PY_READ_MAX  INT_MAX
+#   define _PY_WRITE_MAX INT_MAX
+#else
+    /* read() and write() should truncate the input to PY_SSIZE_T_MAX bytes,
+       but it's safer to do it ourselves to have a portable behaviour */
+#   define _PY_READ_MAX  PY_SSIZE_T_MAX
+#   define _PY_WRITE_MAX PY_SSIZE_T_MAX
+#endif
+
+#ifdef MS_WINDOWS
+struct _Py_stat_struct {
+    uint64_t st_dev;
+    uint64_t st_ino;
+    unsigned short st_mode;
+    int st_nlink;
+    int st_uid;
+    int st_gid;
+    unsigned long st_rdev;
+    __int64 st_size;
+    time_t st_atime;
+    int st_atime_nsec;
+    time_t st_mtime;
+    int st_mtime_nsec;
+    time_t st_ctime;
+    int st_ctime_nsec;
+    time_t st_birthtime;
+    int st_birthtime_nsec;
+    unsigned long st_file_attributes;
+    unsigned long st_reparse_tag;
+    uint64_t st_ino_high;
+};
+#else
+#  define _Py_stat_struct stat
+#endif
+
+PyAPI_FUNC(int) _Py_fstat(
+    int fd,
+    struct _Py_stat_struct *status);
+
+PyAPI_FUNC(int) _Py_fstat_noraise(
+    int fd,
+    struct _Py_stat_struct *status);
+
+PyAPI_FUNC(int) _Py_stat(
+    PyObject *path,
+    struct stat *status);
+
+PyAPI_FUNC(int) _Py_open(
+    const char *pathname,
+    int flags);
+
+PyAPI_FUNC(int) _Py_open_noraise(
+    const char *pathname,
+    int flags);
+
+PyAPI_FUNC(FILE *) _Py_wfopen(
+    const wchar_t *path,
+    const wchar_t *mode);
+
+PyAPI_FUNC(Py_ssize_t) _Py_read(
+    int fd,
+    void *buf,
+    size_t count);
+
+PyAPI_FUNC(Py_ssize_t) _Py_write(
+    int fd,
+    const void *buf,
+    size_t count);
+
+PyAPI_FUNC(Py_ssize_t) _Py_write_noraise(
+    int fd,
+    const void *buf,
+    size_t count);
+
+#ifdef HAVE_READLINK
+PyAPI_FUNC(int) _Py_wreadlink(
+    const wchar_t *path,
+    wchar_t *buf,
+    /* Number of characters of 'buf' buffer
+       including the trailing NUL character */
+    size_t buflen);
+#endif
+
+#ifdef HAVE_REALPATH
+PyAPI_FUNC(wchar_t*) _Py_wrealpath(
+    const wchar_t *path,
+    wchar_t *resolved_path,
+    /* Number of characters of 'resolved_path' buffer
+       including the trailing NUL character */
+    size_t resolved_path_len);
+#endif
+
+PyAPI_FUNC(wchar_t*) _Py_wgetcwd(
+    wchar_t *buf,
+    /* Number of characters of 'buf' buffer
+       including the trailing NUL character */
+    size_t buflen);
+
+PyAPI_FUNC(int) _Py_get_inheritable(int fd);
+
+PyAPI_FUNC(int) _Py_set_inheritable(int fd, int inheritable,
+                                    int *atomic_flag_works);
+
+PyAPI_FUNC(int) _Py_set_inheritable_async_safe(int fd, int inheritable,
+                                               int *atomic_flag_works);
+
+PyAPI_FUNC(int) _Py_dup(int fd);
+
+PyAPI_FUNC(int) _Py_get_blocking(int fd);
+
+PyAPI_FUNC(int) _Py_set_blocking(int fd, int blocking);
+
+#ifdef MS_WINDOWS
+PyAPI_FUNC(void*) _Py_get_osfhandle_noraise(int fd);
+
+PyAPI_FUNC(void*) _Py_get_osfhandle(int fd);
+
+PyAPI_FUNC(int) _Py_open_osfhandle_noraise(void *handle, int flags);
+
+PyAPI_FUNC(int) _Py_open_osfhandle(void *handle, int flags);
+#endif  /* MS_WINDOWS */
+
+// Used after Py_DecodeLocale() returns NULL: decode error if LEN is (size_t)-2, else no memory.
+#define DECODE_LOCALE_ERR(NAME, LEN) \
+    (((LEN) == (size_t)-2) \
+     ? _PyStatus_ERR("cannot decode " NAME) \
+     : _PyStatus_NO_MEMORY())
+
+PyAPI_DATA(int) _Py_HasFileSystemDefaultEncodeErrors;
+
+PyAPI_FUNC(int) _Py_DecodeUTF8Ex(
+    const char *arg,
+    Py_ssize_t arglen,
+    wchar_t **wstr,
+    size_t *wlen,
+    const char **reason,
+    _Py_error_handler errors);
+
+PyAPI_FUNC(int) _Py_EncodeUTF8Ex(
+    const wchar_t *text,
+    char **str,
+    size_t *error_pos,
+    const char **reason,
+    int raw_malloc,
+    _Py_error_handler errors);
+
+PyAPI_FUNC(wchar_t*) _Py_DecodeUTF8_surrogateescape(
+    const char *arg,
+    Py_ssize_t arglen,
+    size_t *wlen);
+
+extern int
+_Py_wstat(const wchar_t *, struct stat *);
+
+PyAPI_FUNC(int) _Py_GetForceASCII(void);
+
+/* Reset "force ASCII" mode (if it was initialized).
+
+   This function should be called when Python changes the LC_CTYPE locale,
+   so the "force ASCII" mode can be detected again on the new locale
+   encoding. */
+PyAPI_FUNC(void) _Py_ResetForceASCII(void);
+
+
+PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(
+    struct lconv *lc,
+    PyObject **decimal_point,
+    PyObject **thousands_sep);
+
+PyAPI_FUNC(void) _Py_closerange(int first, int last);
+
+PyAPI_FUNC(wchar_t*) _Py_GetLocaleEncoding(void);
+PyAPI_FUNC(PyObject*) _Py_GetLocaleEncodingObject(void);
+
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+extern int _Py_LocaleUsesNonUnicodeWchar(void);
+
+extern wchar_t* _Py_DecodeNonUnicodeWchar(
+    const wchar_t* native,
+    Py_ssize_t size);
+
+extern int _Py_EncodeNonUnicodeWchar_InPlace(
+    wchar_t* unicode,
+    Py_ssize_t size);
+#endif
+
+extern int _Py_isabs(const wchar_t *path);
+extern int _Py_abspath(const wchar_t *path, wchar_t **abspath_p);
+#ifdef MS_WINDOWS
+extern int _PyOS_getfullpathname(const wchar_t *path, wchar_t **abspath_p);
+#endif
+extern wchar_t * _Py_join_relfile(const wchar_t *dirname,
+                                  const wchar_t *relfile);
+extern int _Py_add_relfile(wchar_t *dirname,
+                           const wchar_t *relfile,
+                           size_t bufsize);
+extern size_t _Py_find_basename(const wchar_t *filename);
+PyAPI_FUNC(wchar_t*) _Py_normpath(wchar_t *path, Py_ssize_t size);
+extern wchar_t *_Py_normpath_and_size(wchar_t *path, Py_ssize_t size, Py_ssize_t *length);
+
+// The Windows Games API family does not provide these functions
+// so provide our own implementations. Remove them in case they get added
+// to the Games API family
+#if defined(MS_WINDOWS_GAMES) && !defined(MS_WINDOWS_DESKTOP)
+#include <winerror.h>
+
+extern HRESULT PathCchSkipRoot(const wchar_t *pszPath, const wchar_t **ppszRootEnd);
+#endif /* defined(MS_WINDOWS_GAMES) && !defined(MS_WINDOWS_DESKTOP) */
+
+// Macros to protect CRT calls against instant termination when passed an
+// invalid parameter (bpo-23524). IPH stands for Invalid Parameter Handler.
+// Usage:
+//
+//      _Py_BEGIN_SUPPRESS_IPH
+//      ...
+//      _Py_END_SUPPRESS_IPH
+#if defined _MSC_VER && _MSC_VER >= 1900
+
+#  include <stdlib.h>   // _set_thread_local_invalid_parameter_handler()
+
+   extern _invalid_parameter_handler _Py_silent_invalid_parameter_handler;
+#  define _Py_BEGIN_SUPPRESS_IPH \
+    { _invalid_parameter_handler _Py_old_handler = \
+      _set_thread_local_invalid_parameter_handler(_Py_silent_invalid_parameter_handler);
+#  define _Py_END_SUPPRESS_IPH \
+    _set_thread_local_invalid_parameter_handler(_Py_old_handler); }
+#else
+#  define _Py_BEGIN_SUPPRESS_IPH
+#  define _Py_END_SUPPRESS_IPH
+#endif /* _MSC_VER >= 1900 */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_FILEUTILS_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_fileutils_windows.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_fileutils_windows.h
new file mode 100644
index 000000000000..e804d385e767
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_fileutils_windows.h
@@ -0,0 +1,98 @@
+#ifndef Py_INTERNAL_FILEUTILS_WINDOWS_H
+#define Py_INTERNAL_FILEUTILS_WINDOWS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "Py_BUILD_CORE must be defined to include this header"
+#endif
+
+#ifdef MS_WINDOWS
+
+#if !defined(NTDDI_WIN10_NI) || !(NTDDI_VERSION >= NTDDI_WIN10_NI)
+typedef struct _FILE_STAT_BASIC_INFORMATION {
+    LARGE_INTEGER FileId;
+    LARGE_INTEGER CreationTime;
+    LARGE_INTEGER LastAccessTime;
+    LARGE_INTEGER LastWriteTime;
+    LARGE_INTEGER ChangeTime;
+    LARGE_INTEGER AllocationSize;
+    LARGE_INTEGER EndOfFile;
+    ULONG FileAttributes;
+    ULONG ReparseTag;
+    ULONG NumberOfLinks;
+    ULONG DeviceType;
+    ULONG DeviceCharacteristics;
+    ULONG Reserved;
+    LARGE_INTEGER VolumeSerialNumber;
+    FILE_ID_128 FileId128;
+} FILE_STAT_BASIC_INFORMATION;
+
+typedef enum _FILE_INFO_BY_NAME_CLASS {
+    FileStatByNameInfo,
+    FileStatLxByNameInfo,
+    FileCaseSensitiveByNameInfo,
+    FileStatBasicByNameInfo,
+    MaximumFileInfoByNameClass
+} FILE_INFO_BY_NAME_CLASS;
+#endif
+
+typedef BOOL (WINAPI *PGetFileInformationByName)(
+    PCWSTR FileName,
+    FILE_INFO_BY_NAME_CLASS FileInformationClass,
+    PVOID FileInfoBuffer,
+    ULONG FileInfoBufferSize
+);
+
+static inline BOOL _Py_GetFileInformationByName(
+    PCWSTR FileName,
+    FILE_INFO_BY_NAME_CLASS FileInformationClass,
+    PVOID FileInfoBuffer,
+    ULONG FileInfoBufferSize
+) {
+    static PGetFileInformationByName GetFileInformationByName = NULL;
+    static int GetFileInformationByName_init = -1;
+
+    if (GetFileInformationByName_init < 0) {
+        HMODULE hMod = LoadLibraryW(L"api-ms-win-core-file-l2-1-4");
+        GetFileInformationByName_init = 0;
+        if (hMod) {
+            GetFileInformationByName = (PGetFileInformationByName)GetProcAddress(
+                hMod, "GetFileInformationByName");
+            if (GetFileInformationByName) {
+                GetFileInformationByName_init = 1;
+            } else {
+                FreeLibrary(hMod);
+            }
+        }
+    }
+
+    if (GetFileInformationByName_init <= 0) {
+        SetLastError(ERROR_NOT_SUPPORTED);
+        return FALSE;
+    }
+    return GetFileInformationByName(FileName, FileInformationClass, FileInfoBuffer, FileInfoBufferSize);
+}
+
+static inline BOOL _Py_GetFileInformationByName_ErrorIsTrustworthy(int error)
+{
+    switch(error) {  /* TRUE only for the error codes listed below */
+        case ERROR_FILE_NOT_FOUND:
+        case ERROR_PATH_NOT_FOUND:
+        case ERROR_NOT_READY:
+        case ERROR_BAD_NET_NAME:
+        case ERROR_BAD_NETPATH:
+        case ERROR_BAD_PATHNAME:
+        case ERROR_INVALID_NAME:
+        case ERROR_FILENAME_EXCED_RANGE:
+            return TRUE;
+        default:  /* every other code, including ERROR_NOT_SUPPORTED */
+            return FALSE;
+    }
+}
+#endif /* MS_WINDOWS */
+#ifdef __cplusplus
+}  /* closes the extern "C" block opened at the top of this header */
+#endif
+#endif /* !Py_INTERNAL_FILEUTILS_WINDOWS_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_floatobject.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_floatobject.h
new file mode 100644
index 000000000000..27c63bc87f3e
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_floatobject.h
@@ -0,0 +1,71 @@
+#ifndef Py_INTERNAL_FLOATOBJECT_H
+#define Py_INTERNAL_FLOATOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+/* runtime lifecycle */
+
+extern void _PyFloat_InitState(PyInterpreterState *);
+extern PyStatus _PyFloat_InitTypes(PyInterpreterState *);
+extern void _PyFloat_Fini(PyInterpreterState *);
+extern void _PyFloat_FiniType(PyInterpreterState *);
+
+
+/* other API */
+
+enum _py_float_format_type {
+    _py_float_format_unknown,
+    _py_float_format_ieee_big_endian,
+    _py_float_format_ieee_little_endian,
+};
+
+struct _Py_float_runtime_state {
+    enum _py_float_format_type float_format;
+    enum _py_float_format_type double_format;
+};
+
+
+#ifndef WITH_FREELISTS
+// without freelists
+#  define PyFloat_MAXFREELIST 0
+#endif
+
+#ifndef PyFloat_MAXFREELIST
+#  define PyFloat_MAXFREELIST   100
+#endif
+
+struct _Py_float_state {
+#if PyFloat_MAXFREELIST > 0
+    /* Special free list
+       free_list is a singly-linked list of available PyFloatObjects,
+       linked via abuse of their ob_type members. */
+    int numfree;
+    PyFloatObject *free_list;
+#endif
+};
+
+void _PyFloat_ExactDealloc(PyObject *op);
+
+
+PyAPI_FUNC(void) _PyFloat_DebugMallocStats(FILE* out);
+
+
+/* Format the object based on the format_spec, as defined in PEP 3101
+   (Advanced String Formatting). */
+PyAPI_FUNC(int) _PyFloat_FormatAdvancedWriter(
+    _PyUnicodeWriter *writer,
+    PyObject *obj,
+    PyObject *format_spec,
+    Py_ssize_t start,
+    Py_ssize_t end);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_FLOATOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_flowgraph.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_flowgraph.h
new file mode 100644
index 000000000000..98d3374a7522
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_flowgraph.h
@@ -0,0 +1,120 @@
+#ifndef Py_INTERNAL_CFG_H
+#define Py_INTERNAL_CFG_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_opcode_utils.h"
+#include "pycore_compile.h"
+
+
+typedef struct {
+    int i_opcode;
+    int i_oparg;
+    _PyCompilerSrcLocation i_loc;
+    struct _PyCfgBasicblock_ *i_target; /* target block (if jump instruction) */
+    struct _PyCfgBasicblock_ *i_except; /* target block when exception is raised */
+} _PyCfgInstruction;
+
+typedef struct {
+    int id;
+} _PyCfgJumpTargetLabel;
+
+
+typedef struct {
+    struct _PyCfgBasicblock_ *handlers[CO_MAXBLOCKS+2];
+    int depth;
+} _PyCfgExceptStack;
+
+typedef struct _PyCfgBasicblock_ {
+    /* Each basicblock in a compilation unit is linked via b_list in the
+       reverse order that the blocks are allocated.  b_list points to the next
+       block in this list, not to be confused with b_next, which is next by
+       control flow. */
+    struct _PyCfgBasicblock_ *b_list;
+    /* The label of this block if it is a jump target, -1 otherwise */
+    _PyCfgJumpTargetLabel b_label;
+    /* Exception stack at start of block, used by assembler to create the exception handling table */
+    _PyCfgExceptStack *b_exceptstack;
+    /* pointer to an array of instructions, initially NULL */
+    _PyCfgInstruction *b_instr;
+    /* If b_next is non-NULL, it is a pointer to the next
+       block reached by normal control flow. */
+    struct _PyCfgBasicblock_ *b_next;
+    /* number of instructions used */
+    int b_iused;
+    /* length of instruction array (b_instr) */
+    int b_ialloc;
+    /* Used by add_checks_for_loads_of_unknown_variables */
+    uint64_t b_unsafe_locals_mask;
+    /* Number of predecessors that a block has. */
+    int b_predecessors;
+    /* depth of stack upon entry of block, computed by stackdepth() */
+    int b_startdepth;
+    /* instruction offset for block, computed by assemble_jump_offsets() */
+    int b_offset;
+    /* Basic block is an exception handler that preserves lasti */
+    unsigned b_preserve_lasti : 1;
+    /* Used by compiler passes to mark whether they have visited a basic block. */
+    unsigned b_visited : 1;
+    /* b_except_handler is used by the cold-detection algorithm to mark exception targets */
+    unsigned b_except_handler : 1;
+    /* b_cold is true if this block is not perf critical (like an exception handler) */
+    unsigned b_cold : 1;
+    /* b_warm is used by the cold-detection algorithm to mark blocks which are definitely not cold */
+    unsigned b_warm : 1;
+} _PyCfgBasicblock;
+
+int _PyBasicblock_InsertInstruction(_PyCfgBasicblock *block, int pos, _PyCfgInstruction *instr);
+
+typedef struct cfg_builder_ {
+    /* The entryblock, at which control flow begins. All blocks of the
+       CFG are reachable through the b_next links */
+    _PyCfgBasicblock *g_entryblock;
+    /* Pointer to the most recently allocated block.  By following
+       b_list links, you can reach all allocated blocks. */
+    _PyCfgBasicblock *g_block_list;
+    /* pointer to the block currently being constructed */
+    _PyCfgBasicblock *g_curblock;
+    /* label for the next instruction to be placed */
+    _PyCfgJumpTargetLabel g_current_label;
+} _PyCfgBuilder;
+
+int _PyCfgBuilder_UseLabel(_PyCfgBuilder *g, _PyCfgJumpTargetLabel lbl);
+int _PyCfgBuilder_Addop(_PyCfgBuilder *g, int opcode, int oparg, _PyCompilerSrcLocation loc);
+
+int _PyCfgBuilder_Init(_PyCfgBuilder *g);
+void _PyCfgBuilder_Fini(_PyCfgBuilder *g);
+
+_PyCfgInstruction* _PyCfg_BasicblockLastInstr(const _PyCfgBasicblock *b);
+int _PyCfg_OptimizeCodeUnit(_PyCfgBuilder *g, PyObject *consts, PyObject *const_cache,
+                            int code_flags, int nlocals, int nparams, int firstlineno);
+int _PyCfg_Stackdepth(_PyCfgBasicblock *entryblock, int code_flags);
+void _PyCfg_ConvertPseudoOps(_PyCfgBasicblock *entryblock);
+int _PyCfg_ResolveJumps(_PyCfgBuilder *g);
+
+
+static inline int  /* nonzero iff b's last instruction is a scope exit or an unconditional jump */
+basicblock_nofallthrough(const _PyCfgBasicblock *b) {
+    _PyCfgInstruction *last = _PyCfg_BasicblockLastInstr(b);
+    return (last &&  /* a NULL last instruction yields 0, so such a block counts as falling through */
+            (IS_SCOPE_EXIT_OPCODE(last->i_opcode) ||
+             IS_UNCONDITIONAL_JUMP_OPCODE(last->i_opcode)));
+}
+
+#define BB_NO_FALLTHROUGH(B) (basicblock_nofallthrough(B))
+#define BB_HAS_FALLTHROUGH(B) (!basicblock_nofallthrough(B))
+
+PyCodeObject *
+_PyAssemble_MakeCodeObject(_PyCompile_CodeUnitMetadata *u, PyObject *const_cache,
+                           PyObject *consts, int maxdepth, _PyCompile_InstructionSequence *instrs,
+                           int nlocalsplus, int code_flags, PyObject *filename);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_CFG_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_format.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_format.h
new file mode 100644
index 000000000000..1b8d57539ca5
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_format.h
@@ -0,0 +1,27 @@
+#ifndef Py_INTERNAL_FORMAT_H
+#define Py_INTERNAL_FORMAT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+/* Format codes
+ * F_LJUST      '-'
+ * F_SIGN       '+'
+ * F_BLANK      ' '
+ * F_ALT        '#'
+ * F_ZERO       '0'
+ */
+#define F_LJUST (1<<0)
+#define F_SIGN  (1<<1)
+#define F_BLANK (1<<2)
+#define F_ALT   (1<<3)
+#define F_ZERO  (1<<4)
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_FORMAT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_frame.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_frame.h
new file mode 100644
index 000000000000..bfe4a759bac0
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_frame.h
@@ -0,0 +1,283 @@
+#ifndef Py_INTERNAL_FRAME_H
+#define Py_INTERNAL_FRAME_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdbool.h>
+#include <stddef.h>
+#include "pycore_code.h"         // STATS
+
+/* See Objects/frame_layout.md for an explanation of the frame stack
+ * including explanation of the PyFrameObject and _PyInterpreterFrame
+ * structs. */
+
+
+struct _frame {
+    PyObject_HEAD
+    PyFrameObject *f_back;      /* previous frame, or NULL */
+    struct _PyInterpreterFrame *f_frame; /* points to the frame data */
+    PyObject *f_trace;          /* Trace function */
+    int f_lineno;               /* Current line number. Only valid if non-zero */
+    char f_trace_lines;         /* Emit per-line trace events? */
+    char f_trace_opcodes;       /* Emit per-opcode trace events? */
+    char f_fast_as_locals;      /* Have the fast locals of this frame been converted to a dict? */
+    /* The frame data, if this frame object owns the frame */
+    PyObject *_f_frame_data[1];
+};
+
+extern PyFrameObject* _PyFrame_New_NoTrack(PyCodeObject *code);
+
+
+/* other API */
+
+typedef enum _framestate {
+    FRAME_CREATED = -2,
+    FRAME_SUSPENDED = -1,
+    FRAME_EXECUTING = 0,
+    FRAME_COMPLETED = 1,
+    FRAME_CLEARED = 4
+} PyFrameState;
+
+#define FRAME_STATE_FINISHED(S) ((S) >= FRAME_COMPLETED)
+
+enum _frameowner {
+    FRAME_OWNED_BY_THREAD = 0,
+    FRAME_OWNED_BY_GENERATOR = 1,
+    FRAME_OWNED_BY_FRAME_OBJECT = 2,
+    FRAME_OWNED_BY_CSTACK = 3,
+};
+
+typedef struct _PyInterpreterFrame {
+    PyCodeObject *f_code; /* Strong reference */
+    struct _PyInterpreterFrame *previous;
+    PyObject *f_funcobj; /* Strong reference. Only valid if not on C stack */
+    PyObject *f_globals; /* Borrowed reference. Only valid if not on C stack */
+    PyObject *f_builtins; /* Borrowed reference. Only valid if not on C stack */
+    PyObject *f_locals; /* Strong reference, may be NULL. Only valid if not on C stack */
+    PyFrameObject *frame_obj; /* Strong reference, may be NULL. Only valid if not on C stack */
+    // NOTE: This is not necessarily the last instruction started in the given
+    // frame. Rather, it is the code unit *prior to* the *next* instruction. For
+    // example, it may be an inline CACHE entry, an instruction we just jumped
+    // over, or (in the case of a newly-created frame) a totally invalid value:
+    _Py_CODEUNIT *prev_instr;
+    int stacktop;  /* Offset of TOS from localsplus  */
+    /* The return_offset determines where a `RETURN` should go in the caller,
+     * relative to `prev_instr`.
+     * It is only meaningful to the callee,
+     * so it needs to be set in any CALL (to a Python function)
+     * or SEND (to a coroutine or generator).
+     * If there is no callee, then it is meaningless. */
+    uint16_t return_offset;
+    char owner;
+    /* Locals and stack */
+    PyObject *localsplus[1];
+} _PyInterpreterFrame;
+
+#define _PyInterpreterFrame_LASTI(IF) \
+    ((int)((IF)->prev_instr - _PyCode_CODE((IF)->f_code)))
+
+static inline PyObject **_PyFrame_Stackbase(_PyInterpreterFrame *f) {
+    return f->localsplus + f->f_code->co_nlocalsplus;
+}
+
+static inline PyObject *_PyFrame_StackPeek(_PyInterpreterFrame *f) {
+    assert(f->stacktop > f->f_code->co_nlocalsplus);
+    assert(f->localsplus[f->stacktop-1] != NULL);
+    return f->localsplus[f->stacktop-1];
+}
+
+static inline PyObject *_PyFrame_StackPop(_PyInterpreterFrame *f) {
+    assert(f->stacktop > f->f_code->co_nlocalsplus);
+    f->stacktop--;
+    return f->localsplus[f->stacktop];
+}
+
+static inline void _PyFrame_StackPush(_PyInterpreterFrame *f, PyObject *value) {
+    f->localsplus[f->stacktop] = value;
+    f->stacktop++;
+}
+
+#define FRAME_SPECIALS_SIZE ((int)((sizeof(_PyInterpreterFrame)-1)/sizeof(PyObject *)))
+
+static inline int
+_PyFrame_NumSlotsForCodeObject(PyCodeObject *code)
+{
+    /* This function needs to remain in sync with the calculation of
+     * co_framesize in Tools/build/deepfreeze.py */
+    assert(code->co_framesize >= FRAME_SPECIALS_SIZE);
+    return code->co_framesize - FRAME_SPECIALS_SIZE;
+}
+
+void _PyFrame_Copy(_PyInterpreterFrame *src, _PyInterpreterFrame *dest);
+
+/* Consumes reference to func and locals.
+   Does not initialize frame->previous, which happens
+   when frame is linked into the frame stack.
+ */
+static inline void
+_PyFrame_Initialize(
+    _PyInterpreterFrame *frame, PyFunctionObject *func,
+    PyObject *locals, PyCodeObject *code, int null_locals_from)
+{
+    frame->f_funcobj = (PyObject *)func;
+    frame->f_code = (PyCodeObject *)Py_NewRef(code);
+    frame->f_builtins = func->func_builtins;
+    frame->f_globals = func->func_globals;
+    frame->f_locals = locals;
+    frame->stacktop = code->co_nlocalsplus;
+    frame->frame_obj = NULL;
+    frame->prev_instr = _PyCode_CODE(code) - 1;
+    frame->return_offset = 0;
+    frame->owner = FRAME_OWNED_BY_THREAD;
+
+    for (int i = null_locals_from; i < code->co_nlocalsplus; i++) {
+        frame->localsplus[i] = NULL;
+    }
+}
+
+/* Gets the pointer to the locals array,
+ * i.e. the frame's own localsplus member.
+ */
+static inline PyObject**
+_PyFrame_GetLocalsArray(_PyInterpreterFrame *frame)
+{
+    return frame->localsplus;
+}
+
+/* Fetches the stack pointer, and sets stacktop to -1.
+   Having stacktop <= 0 ensures that invalid
+   values are not visible to the cycle GC.
+   We choose -1 rather than 0 to assist debugging. */
+static inline PyObject**
+_PyFrame_GetStackPointer(_PyInterpreterFrame *frame)
+{
+    PyObject **sp = frame->localsplus + frame->stacktop;
+    frame->stacktop = -1;
+    return sp;
+}
+
+static inline void
+_PyFrame_SetStackPointer(_PyInterpreterFrame *frame, PyObject **stack_pointer)
+{
+    frame->stacktop = (int)(stack_pointer - frame->localsplus);
+}
+
+/* Determine whether a frame is incomplete.
+ * A frame is incomplete if it is part way through
+ * creating cell objects or a generator or coroutine.
+ *
+ * Frames on the frame stack are incomplete until the
+ * first RESUME instruction.
+ * Frames owned by a generator are always complete.
+ */
+static inline bool
+_PyFrame_IsIncomplete(_PyInterpreterFrame *frame)
+{
+    return frame->owner != FRAME_OWNED_BY_GENERATOR &&
+    frame->prev_instr < _PyCode_CODE(frame->f_code) + frame->f_code->_co_firsttraceable;
+}
+
+static inline _PyInterpreterFrame *
+_PyFrame_GetFirstComplete(_PyInterpreterFrame *frame)
+{
+    while (frame && _PyFrame_IsIncomplete(frame)) {
+        frame = frame->previous;
+    }
+    return frame;
+}
+
+static inline _PyInterpreterFrame *
+_PyThreadState_GetFrame(PyThreadState *tstate)
+{
+    return _PyFrame_GetFirstComplete(tstate->cframe->current_frame);
+}
+
+/* For use by _PyFrame_GetFrameObject
+  Do not call directly. */
+PyFrameObject *
+_PyFrame_MakeAndSetFrameObject(_PyInterpreterFrame *frame);
+
+/* Gets the PyFrameObject for this frame, lazily
+ * creating it if necessary.
+ * Returns a borrowed reference */
+static inline PyFrameObject *
+_PyFrame_GetFrameObject(_PyInterpreterFrame *frame)
+{
+
+    assert(!_PyFrame_IsIncomplete(frame));
+    PyFrameObject *res =  frame->frame_obj;
+    if (res != NULL) {
+        return res;
+    }
+    return _PyFrame_MakeAndSetFrameObject(frame);
+}
+
+/* Clears all references in the frame.
+ * The _PyInterpreterFrame frame may be transferred
+ * to the frame object it references instead of being
+ * cleared. Either way the caller no longer owns the
+ * references in the frame.
+ * NOTE(review): earlier wording described a `take`
+ * argument (1 for heap allocated frames such as those in
+ * generators and coroutines); the declaration below has none.
+ */
+void
+_PyFrame_ClearExceptCode(_PyInterpreterFrame * frame);
+
+int
+_PyFrame_Traverse(_PyInterpreterFrame *frame, visitproc visit, void *arg);
+
+PyObject *
+_PyFrame_GetLocals(_PyInterpreterFrame *frame, int include_hidden);
+
+int
+_PyFrame_FastToLocalsWithError(_PyInterpreterFrame *frame);
+
+void
+_PyFrame_LocalsToFast(_PyInterpreterFrame *frame, int clear);
+
+static inline bool
+_PyThreadState_HasStackSpace(PyThreadState *tstate, int size)
+{  /* true iff a data stack exists and `size` is strictly less than the room left before datastack_limit */
+    assert(
+        (tstate->datastack_top == NULL && tstate->datastack_limit == NULL)
+        ||
+        (tstate->datastack_top != NULL && tstate->datastack_limit != NULL)
+    );  /* invariant: top and limit are either both NULL or both non-NULL */
+    return tstate->datastack_top != NULL &&  /* no data stack at all: report no space */
+        size < tstate->datastack_limit - tstate->datastack_top;  /* strict '<': an exact fit is rejected */
+}
+
+extern _PyInterpreterFrame *
+_PyThreadState_PushFrame(PyThreadState *tstate, size_t size);
+
+void _PyThreadState_PopFrame(PyThreadState *tstate, _PyInterpreterFrame *frame);
+
+/* Pushes a frame without checking for space.
+ * Must be guarded by _PyThreadState_HasStackSpace()
+ * Consumes reference to func. */
+static inline _PyInterpreterFrame *
+_PyFrame_PushUnchecked(PyThreadState *tstate, PyFunctionObject *func, int null_locals_from)
+{
+    CALL_STAT_INC(frames_pushed);
+    PyCodeObject *code = (PyCodeObject *)func->func_code;
+    _PyInterpreterFrame *new_frame = (_PyInterpreterFrame *)tstate->datastack_top;
+    tstate->datastack_top += code->co_framesize;
+    assert(tstate->datastack_top < tstate->datastack_limit);
+    _PyFrame_Initialize(new_frame, func, NULL, code, null_locals_from);
+    return new_frame;
+}
+
+static inline
+PyGenObject *_PyFrame_GetGenerator(_PyInterpreterFrame *frame)
+{  /* maps a generator-owned frame back to the PyGenObject whose gi_iframe member it is */
+    assert(frame->owner == FRAME_OWNED_BY_GENERATOR);  /* only meaningful for frames owned by a generator */
+    size_t offset_in_gen = offsetof(PyGenObject, gi_iframe);
+    return (PyGenObject *)(((char *)frame) - offset_in_gen);  /* step back from gi_iframe to the object start */
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_FRAME_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_function.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_function.h
new file mode 100644
index 000000000000..ecbb7001e7d8
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_function.h
@@ -0,0 +1,26 @@
+#ifndef Py_INTERNAL_FUNCTION_H
+#define Py_INTERNAL_FUNCTION_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#define FUNC_MAX_WATCHERS 8
+
+struct _py_func_state {
+    uint32_t next_version;
+};
+
+extern PyFunctionObject* _PyFunction_FromConstructor(PyFrameConstructor *constr);
+
+extern uint32_t _PyFunction_GetVersionForCurrentState(PyFunctionObject *func);
+extern PyObject *_Py_set_function_type_params(
+    PyThreadState* unused, PyObject *func, PyObject *type_params);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_FUNCTION_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_gc.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_gc.h
new file mode 100644
index 000000000000..b3abe2030a03
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_gc.h
@@ -0,0 +1,211 @@
+#ifndef Py_INTERNAL_GC_H
+#define Py_INTERNAL_GC_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+/* GC information is stored BEFORE the object structure. */
+typedef struct {
+    // Pointer to next object in the list.
+    // 0 means the object is not tracked
+    uintptr_t _gc_next;
+
+    // Pointer to previous object in the list.
+    // Lowest two bits are used for flags documented later.
+    uintptr_t _gc_prev;
+} PyGC_Head;
+
+static inline PyGC_Head* _Py_AS_GC(PyObject *op) {
+    return (_Py_CAST(PyGC_Head*, op) - 1);
+}
+#define _PyGC_Head_UNUSED PyGC_Head
+
+/* True if the object is currently tracked by the GC. */
+static inline int _PyObject_GC_IS_TRACKED(PyObject *op) {
+    PyGC_Head *gc = _Py_AS_GC(op);
+    return (gc->_gc_next != 0);
+}
+#define _PyObject_GC_IS_TRACKED(op) _PyObject_GC_IS_TRACKED(_Py_CAST(PyObject*, op))
+
+/* True if the object may be tracked by the GC in the future, or already is.
+   This can be useful to implement some optimizations. */
+static inline int _PyObject_GC_MAY_BE_TRACKED(PyObject *obj) {
+    if (!PyObject_IS_GC(obj)) {
+        return 0;
+    }
+    if (PyTuple_CheckExact(obj)) {
+        return _PyObject_GC_IS_TRACKED(obj);
+    }
+    return 1;
+}
+
+
+/* Bit flags for _gc_prev */
+/* Bit 0 is set when tp_finalize is called */
+#define _PyGC_PREV_MASK_FINALIZED  (1)
+/* Bit 1 is set when the object is in generation which is GCed currently. */
+#define _PyGC_PREV_MASK_COLLECTING (2)
+/* The (N-2) most significant bits contain the real address. */
+#define _PyGC_PREV_SHIFT           (2)
+#define _PyGC_PREV_MASK            (((uintptr_t) -1) << _PyGC_PREV_SHIFT)
+
+// Lowest bit of _gc_next is used for flags only in GC.
+// But it is always 0 for normal code.
+static inline PyGC_Head* _PyGCHead_NEXT(PyGC_Head *gc) {
+    uintptr_t next = gc->_gc_next;
+    return _Py_CAST(PyGC_Head*, next);
+}
+static inline void _PyGCHead_SET_NEXT(PyGC_Head *gc, PyGC_Head *next) {
+    gc->_gc_next = _Py_CAST(uintptr_t, next);
+}
+
+// Lowest two bits of _gc_prev are used for _PyGC_PREV_MASK_* flags.
+static inline PyGC_Head* _PyGCHead_PREV(PyGC_Head *gc) {
+    uintptr_t prev = (gc->_gc_prev & _PyGC_PREV_MASK);
+    return _Py_CAST(PyGC_Head*, prev);
+}
+static inline void _PyGCHead_SET_PREV(PyGC_Head *gc, PyGC_Head *prev) {
+    uintptr_t uprev = _Py_CAST(uintptr_t, prev);
+    assert((uprev & ~_PyGC_PREV_MASK) == 0);
+    gc->_gc_prev = ((gc->_gc_prev & ~_PyGC_PREV_MASK) | uprev);
+}
+
+static inline int _PyGCHead_FINALIZED(PyGC_Head *gc) {
+    return ((gc->_gc_prev & _PyGC_PREV_MASK_FINALIZED) != 0);
+}
+static inline void _PyGCHead_SET_FINALIZED(PyGC_Head *gc) {
+    gc->_gc_prev |= _PyGC_PREV_MASK_FINALIZED;
+}
+
+static inline int _PyGC_FINALIZED(PyObject *op) {
+    PyGC_Head *gc = _Py_AS_GC(op);
+    return _PyGCHead_FINALIZED(gc);
+}
+static inline void _PyGC_SET_FINALIZED(PyObject *op) {
+    PyGC_Head *gc = _Py_AS_GC(op);
+    _PyGCHead_SET_FINALIZED(gc);
+}
+
+
+/* GC runtime state */
+
+/* If we change this, we need to change the default value in the
+   signature of gc.collect. */
+#define NUM_GENERATIONS 3
+/*
+   NOTE: about untracking of mutable objects.
+
+   Certain types of container cannot participate in a reference cycle, and
+   so do not need to be tracked by the garbage collector. Untracking these
+   objects reduces the cost of garbage collections. However, determining
+   which objects may be untracked is not free, and the costs must be
+   weighed against the benefits for garbage collection.
+
+   There are two possible strategies for when to untrack a container:
+
+   i) When the container is created.
+   ii) When the container is examined by the garbage collector.
+
+   Tuples containing only immutable objects (integers, strings etc, and
+   recursively, tuples of immutable objects) do not need to be tracked.
+   The interpreter creates a large number of tuples, many of which will
+   not survive until garbage collection. It is therefore not worthwhile
+   to untrack eligible tuples at creation time.
+
+   Instead, all tuples except the empty tuple are tracked when created.
+   During garbage collection it is determined whether any surviving tuples
+   can be untracked. A tuple can be untracked if all of its contents are
+   already not tracked. Tuples are examined for untracking in all garbage
+   collection cycles. It may take more than one cycle to untrack a tuple.
+
+   Dictionaries containing only immutable objects also do not need to be
+   tracked. Dictionaries are untracked when created. If a tracked item is
+   inserted into a dictionary (either as a key or value), the dictionary
+   becomes tracked. During a full garbage collection (all generations),
+   the collector will untrack any dictionaries whose contents are not
+   tracked.
+
+   The module provides the python function is_tracked(obj), which returns
+   the CURRENT tracking status of the object. Subsequent garbage
+   collections may change the tracking status of the object.
+
+   Untracking of certain containers was introduced in issue #4688, and
+   the algorithm was refined in response to issue #14775.
+*/
+
+struct gc_generation {
+    PyGC_Head head;
+    int threshold; /* collection threshold */
+    int count; /* count of allocations or collections of younger
+                  generations */
+};
+
+/* Running stats per generation */
+struct gc_generation_stats {
+    /* total number of collections */
+    Py_ssize_t collections;
+    /* total number of collected objects */
+    Py_ssize_t collected;
+    /* total number of uncollectable objects (put into gc.garbage) */
+    Py_ssize_t uncollectable;
+};
+
+struct _gc_runtime_state {
+    /* List of objects that still need to be cleaned up, singly linked
+     * via their gc headers' _gc_prev pointers.  */
+    PyObject *trash_delete_later;
+    /* Current call-stack depth of tp_dealloc calls. */
+    int trash_delete_nesting;
+
+    /* Is automatic collection enabled? */
+    int enabled;
+    int debug;
+    /* linked lists of container objects */
+    struct gc_generation generations[NUM_GENERATIONS];
+    PyGC_Head *generation0;
+    /* a permanent generation which won't be collected */
+    struct gc_generation permanent_generation;
+    struct gc_generation_stats generation_stats[NUM_GENERATIONS];
+    /* true if we are currently running the collector */
+    int collecting;
+    /* list of uncollectable objects */
+    PyObject *garbage;
+    /* a list of callbacks to be invoked when collection is performed */
+    PyObject *callbacks;
+    /* This is the number of objects that survived the last full
+       collection. It approximates the number of long lived objects
+       tracked by the GC.
+
+       (by "full collection", we mean a collection of the oldest
+       generation). */
+    Py_ssize_t long_lived_total;
+    /* This is the number of objects that survived all "non-full"
+       collections, and are waiting to undergo a full collection for
+       the first time. */
+    Py_ssize_t long_lived_pending;
+};
+
+
+extern void _PyGC_InitState(struct _gc_runtime_state *);
+
+extern Py_ssize_t _PyGC_CollectNoFail(PyThreadState *tstate);
+
+
+// Functions to clear types free lists
+extern void _PyTuple_ClearFreeList(PyInterpreterState *interp);
+extern void _PyFloat_ClearFreeList(PyInterpreterState *interp);
+extern void _PyList_ClearFreeList(PyInterpreterState *interp);
+extern void _PyDict_ClearFreeList(PyInterpreterState *interp);
+extern void _PyAsyncGen_ClearFreeLists(PyInterpreterState *interp);
+extern void _PyContext_ClearFreeList(PyInterpreterState *interp);
+extern void _Py_ScheduleGC(PyInterpreterState *interp);
+extern void _Py_RunGC(PyThreadState *tstate);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_GC_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_genobject.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_genobject.h
new file mode 100644
index 000000000000..dc60b4ca7051
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_genobject.h
@@ -0,0 +1,49 @@
+#ifndef Py_INTERNAL_GENOBJECT_H
+#define Py_INTERNAL_GENOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+extern PyObject *_PyGen_yf(PyGenObject *);
+extern PyObject *_PyCoro_GetAwaitableIter(PyObject *o);
+extern PyObject *_PyAsyncGenValueWrapperNew(PyThreadState *state, PyObject *);
+
+/* runtime lifecycle */
+
+extern void _PyAsyncGen_Fini(PyInterpreterState *);
+
+
+/* other API */
+
+#ifndef WITH_FREELISTS
+// without freelists
+#  define _PyAsyncGen_MAXFREELIST 0
+#endif
+
+#ifndef _PyAsyncGen_MAXFREELIST
+#  define _PyAsyncGen_MAXFREELIST 80
+#endif
+
+struct _Py_async_gen_state {
+#if _PyAsyncGen_MAXFREELIST > 0
+    /* Freelists boost performance 6-10%; they also reduce memory
+       fragmentation, as _PyAsyncGenWrappedValue and PyAsyncGenASend
+       are short-living objects that are instantiated for every
+       __anext__() call. */
+    struct _PyAsyncGenWrappedValue* value_freelist[_PyAsyncGen_MAXFREELIST];
+    int value_numfree;
+
+    struct PyAsyncGenASend* asend_freelist[_PyAsyncGen_MAXFREELIST];
+    int asend_numfree;
+#endif
+};
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_GENOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_getopt.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_getopt.h
new file mode 100644
index 000000000000..7f0dd13ae577
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_getopt.h
@@ -0,0 +1,22 @@
+#ifndef Py_INTERNAL_PYGETOPT_H
+#define Py_INTERNAL_PYGETOPT_H
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+extern int _PyOS_opterr;
+extern Py_ssize_t _PyOS_optind;
+extern const wchar_t *_PyOS_optarg;
+
+extern void _PyOS_ResetGetOpt(void);
+
+typedef struct {
+    const wchar_t *name;
+    int has_arg;
+    int val;
+} _PyOS_LongOption;
+
+extern int _PyOS_GetOpt(Py_ssize_t argc, wchar_t * const *argv, int *longindex);
+
+#endif /* !Py_INTERNAL_PYGETOPT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_gil.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_gil.h
new file mode 100644
index 000000000000..8ebad37b686c
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_gil.h
@@ -0,0 +1,50 @@
+#ifndef Py_INTERNAL_GIL_H
+#define Py_INTERNAL_GIL_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_atomic.h"    /* _Py_atomic_address */
+#include "pycore_condvar.h"   /* PyCOND_T */
+
+#ifndef Py_HAVE_CONDVAR
+#  error You need either a POSIX-compatible or a Windows system!
+#endif
+
+/* Enable if you want to force the switching of threads at least
+   every `interval`. */
+#undef FORCE_SWITCHING
+#define FORCE_SWITCHING
+
+struct _gil_runtime_state {
+    /* microseconds (the Python API uses seconds, though) */
+    unsigned long interval;
+    /* Last PyThreadState holding / having held the GIL. This helps us
+       know whether anyone else was scheduled after we dropped the GIL. */
+    _Py_atomic_address last_holder;
+    /* Whether the GIL is already taken (-1 if uninitialized). This is
+       atomic because it can be read without any lock taken in ceval.c. */
+    _Py_atomic_int locked;
+    /* Number of GIL switches since the beginning. */
+    unsigned long switch_number;
+    /* This condition variable allows one or several threads to wait
+       until the GIL is released. In addition, the mutex also protects
+       the above variables. */
+    PyCOND_T cond;
+    PyMUTEX_T mutex;
+#ifdef FORCE_SWITCHING
+    /* This condition variable helps the GIL-releasing thread wait for
+       a GIL-awaiting thread to be scheduled and take the GIL. */
+    PyCOND_T switch_cond;
+    PyMUTEX_T switch_mutex;
+#endif
+};
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_GIL_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_global_objects.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_global_objects.h
new file mode 100644
index 000000000000..442f8516278b
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_global_objects.h
@@ -0,0 +1,105 @@
+#ifndef Py_INTERNAL_GLOBAL_OBJECTS_H
+#define Py_INTERNAL_GLOBAL_OBJECTS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_hashtable.h"       // _Py_hashtable_t
+#include "pycore_gc.h"              // PyGC_Head
+#include "pycore_global_strings.h"  // struct _Py_global_strings
+#include "pycore_hamt.h"            // PyHamtNode_Bitmap
+#include "pycore_context.h"         // _PyContextTokenMissing
+#include "pycore_typeobject.h"      // pytype_slotdef
+
+
+// These would be in pycore_long.h if it weren't for an include cycle.
+#define _PY_NSMALLPOSINTS           257
+#define _PY_NSMALLNEGINTS           5
+
+
+// Only immutable objects should be considered runtime-global.
+// All others must be per-interpreter.
+
+#define _Py_GLOBAL_OBJECT(NAME) \
+    _PyRuntime.static_objects.NAME
+#define _Py_SINGLETON(NAME) \
+    _Py_GLOBAL_OBJECT(singletons.NAME)
+
+struct _Py_cached_objects {  // Cached objects; currently only the interned-strings table.
+    // XXX We could statically allocate the hashtable.
+    _Py_hashtable_t *interned_strings;  // pointer only; the table itself is allocated elsewhere
+};
+
+struct _Py_static_objects {  // Layout addressed by _Py_SINGLETON(NAME): static_objects.singletons.NAME
+    struct {
+        /* Small integers are preallocated in this array so that they
+         * can be shared.  The preallocated range runs from
+         * -_PY_NSMALLNEGINTS (inclusive) to _PY_NSMALLPOSINTS (exclusive);
+         * the integer v is stored at index _PY_NSMALLNEGINTS + v.
+         */
+        PyLongObject small_ints[_PY_NSMALLNEGINTS + _PY_NSMALLPOSINTS];
+
+        PyBytesObject bytes_empty;
+        struct {
+            PyBytesObject ob;
+            char eos;  // presumably the end-of-string byte after ob's data -- confirm
+        } bytes_characters[256];  // 256 entries; presumably one per byte value -- confirm
+
+        struct _Py_global_strings strings;
+
+        _PyGC_Head_UNUSED _tuple_empty_gc_not_used;  // unused slot declared just before tuple_empty
+        PyTupleObject tuple_empty;
+
+        _PyGC_Head_UNUSED _hamt_bitmap_node_empty_gc_not_used;  // same pattern as for tuple_empty
+        PyHamtNode_Bitmap hamt_bitmap_node_empty;
+        _PyContextTokenMissing context_token_missing;
+    } singletons;
+};
+
+#define _Py_INTERP_CACHED_OBJECT(interp, NAME) \
+    (interp)->cached_objects.NAME
+
+struct _Py_interp_cached_objects {  // Object pointers cached together, grouped by the feature using them.
+    PyObject *interned_strings;  // a PyObject * here, unlike the hashtable in _Py_cached_objects
+
+    /* AST */
+    PyObject *str_replace_inf;
+
+    /* object.__reduce__ */
+    PyObject *objreduce;
+    PyObject *type_slots_pname;
+    pytype_slotdef *type_slots_ptrs[MAX_EQUIV];  // MAX_EQUIV is defined outside this header
+
+    /* TypeVar and related types */
+    PyTypeObject *generic_type;
+    PyTypeObject *typevar_type;
+    PyTypeObject *typevartuple_type;
+    PyTypeObject *paramspec_type;
+    PyTypeObject *paramspecargs_type;
+    PyTypeObject *paramspeckwargs_type;
+};
+
+#define _Py_INTERP_STATIC_OBJECT(interp, NAME) \
+    (interp)->static_objects.NAME
+#define _Py_INTERP_SINGLETON(interp, NAME) \
+    _Py_INTERP_STATIC_OBJECT(interp, singletons.NAME)
+
+struct _Py_interp_static_objects {  // Layout addressed by _Py_INTERP_SINGLETON(interp, NAME)
+    struct {
+        int _not_used;  // placeholder member; nothing in this header reads it
+        // hamt_empty is here instead of global because of its weakreflist.
+        _PyGC_Head_UNUSED _hamt_empty_gc_not_used;  // unused slot declared just before hamt_empty
+        PyHamtObject hamt_empty;
+        PyBaseExceptionObject last_resort_memory_error;
+    } singletons;
+};
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_GLOBAL_OBJECTS_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_global_objects_fini_generated.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_global_objects_fini_generated.h
new file mode 100644
index 000000000000..439f47a263df
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_global_objects_fini_generated.h
@@ -0,0 +1,1531 @@
+#ifndef Py_INTERNAL_GLOBAL_OBJECTS_FINI_GENERATED_INIT_H
+#define Py_INTERNAL_GLOBAL_OBJECTS_FINI_GENERATED_INIT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#ifdef Py_DEBUG
+static inline void
+_PyStaticObject_CheckRefcnt(PyObject *obj) {
+    /* Debug-build check: report a static object whose refcount fell below immortal. */
+    if (Py_REFCNT(obj) >= _Py_IMMORTAL_REFCNT) { return; }
+    fprintf(stderr, "Immortal Object has less refcnt than expected.\n");
+    _PyObject_Dump(obj);
+}
+#endif
+
+/* The following is auto-generated by Tools/build/generate_global_objects.py. */
+#ifdef Py_DEBUG
+static inline void
+_PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) {
+    /* generated runtime-global */
+    // (see pycore_runtime_init_generated.h)
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + -5]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + -4]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + -3]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + -2]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + -1]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 0]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 1]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 2]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 3]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 4]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 5]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 6]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 7]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 8]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 9]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 10]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 11]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 12]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 13]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 14]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 15]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 16]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 17]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 18]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 19]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 20]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 21]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 22]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 23]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 24]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 25]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 26]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 27]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 28]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 29]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 30]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 31]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 32]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 33]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 34]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 35]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 36]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 37]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 38]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 39]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 40]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 41]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 42]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 43]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 44]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 45]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 46]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 47]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 48]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 49]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 50]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 51]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 52]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 53]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 54]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 55]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 56]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 57]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 58]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 59]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 60]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 61]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 62]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 63]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 64]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 65]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 66]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 67]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 68]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 69]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 70]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 71]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 72]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 73]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 74]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 75]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 76]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 77]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 78]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 79]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 80]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 81]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 82]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 83]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 84]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 85]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 86]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 87]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 88]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 89]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 90]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 91]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 92]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 93]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 94]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 95]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 96]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 97]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 98]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 99]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 100]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 101]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 102]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 103]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 104]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 105]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 106]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 107]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 108]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 109]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 110]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 111]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 112]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 113]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 114]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 115]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 116]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 117]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 118]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 119]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 120]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 121]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 122]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 123]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 124]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 125]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 126]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 127]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 129]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 130]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 131]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 132]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 133]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 134]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 135]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 136]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 137]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 138]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 139]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 140]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 141]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 142]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 143]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 144]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 145]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 146]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 147]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 148]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 149]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 150]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 151]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 152]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 153]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 154]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 155]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 156]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 157]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 158]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 159]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 160]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 161]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 162]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 163]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 164]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 165]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 166]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 167]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 168]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 169]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 170]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 171]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 172]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 173]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 174]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 175]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 176]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 177]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 178]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 179]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 180]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 181]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 182]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 183]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 184]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 185]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 186]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 187]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 188]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 189]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 190]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 191]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 192]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 193]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 194]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 195]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 196]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 197]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 198]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 199]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 200]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 201]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 202]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 203]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 204]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 205]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 206]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 207]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 208]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 209]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 210]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 211]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 212]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 213]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 214]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 215]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 216]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 217]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 218]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 219]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 220]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 221]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 222]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 223]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 224]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 225]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 226]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 227]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 228]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 229]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 230]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 231]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 232]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 233]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 234]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 235]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 236]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 237]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 238]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 239]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 240]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 241]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 242]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 243]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 244]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 245]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 246]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 247]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 248]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 249]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 250]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 251]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 252]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 253]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 254]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 255]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + 256]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[0]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[1]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[2]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[3]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[4]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[5]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[6]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[7]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[8]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[9]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[10]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[11]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[12]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[13]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[14]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[15]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[16]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[17]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[18]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[19]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[20]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[21]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[22]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[23]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[24]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[25]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[26]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[27]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[28]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[29]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[30]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[31]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[32]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[33]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[34]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[35]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[36]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[37]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[38]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[39]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[40]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[41]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[42]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[43]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[44]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[45]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[46]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[47]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[48]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[49]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[50]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[51]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[52]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[53]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[54]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[55]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[56]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[57]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[58]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[59]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[60]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[61]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[62]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[63]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[64]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[65]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[66]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[67]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[68]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[69]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[70]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[71]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[72]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[73]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[74]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[75]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[76]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[77]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[78]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[79]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[80]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[81]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[82]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[83]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[84]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[85]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[86]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[87]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[88]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[89]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[90]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[91]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[92]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[93]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[94]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[95]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[96]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[97]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[98]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[99]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[100]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[101]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[102]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[103]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[104]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[105]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[106]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[107]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[108]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[109]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[110]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[111]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[112]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[113]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[114]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[115]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[116]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[117]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[118]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[119]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[120]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[121]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[122]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[123]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[124]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[125]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[126]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[127]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[129]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[130]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[131]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[132]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[133]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[134]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[135]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[136]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[137]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[138]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[139]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[140]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[141]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[142]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[143]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[144]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[145]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[146]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[147]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[148]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[149]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[150]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[151]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[152]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[153]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[154]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[155]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[156]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[157]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[158]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[159]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[160]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[161]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[162]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[163]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[164]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[165]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[166]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[167]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[168]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[169]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[170]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[171]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[172]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[173]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[174]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[175]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[176]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[177]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[178]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[179]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[180]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[181]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[182]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[183]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[184]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[185]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[186]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[187]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[188]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[189]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[190]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[191]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[192]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[193]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[194]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[195]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[196]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[197]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[198]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[199]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[200]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[201]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[202]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[203]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[204]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[205]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[206]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[207]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[208]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[209]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[210]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[211]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[212]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[213]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[214]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[215]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[216]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[217]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[218]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[219]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[220]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[221]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[222]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[223]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[224]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[225]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[226]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[227]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[228]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[229]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[230]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[231]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[232]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[233]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[234]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[235]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[236]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[237]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[238]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[239]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[240]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[241]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[242]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[243]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[244]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[245]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[246]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[247]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[248]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[249]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[250]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[251]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[252]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[253]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[254]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_characters)[255]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_dictcomp));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_genexpr));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_lambda));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_listcomp));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_module));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_setcomp));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_string));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(anon_unknown));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(close_br));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(dbl_close_br));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(dbl_open_br));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(dbl_percent));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(defaults));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(dot));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(dot_locals));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(empty));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(generic_base));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(json_decoder));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(kwdefaults));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(list_err));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(newline));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(open_br));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(percent));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(shim_name));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(type_params));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_STR(utf_8));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(CANCELLED));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(FINISHED));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(False));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(JSONDecodeError));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(PENDING));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(Py_Repr));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(TextIOWrapper));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(True));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(WarningMessage));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_WindowsConsoleIO));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__IOBase_closed));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__abc_tpflags__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__abs__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__abstractmethods__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__add__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__aenter__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__aexit__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__aiter__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__all__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__and__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__anext__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__annotations__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__args__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__asyncio_running_event_loop__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__await__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__bases__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__bool__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__buffer__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__build_class__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__builtins__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__bytes__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__call__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__cantrace__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__class__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__class_getitem__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__classcell__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__classdict__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__classdictcell__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__complex__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__contains__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__copy__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ctypes_from_outparam__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__del__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__delattr__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__delete__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__delitem__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__dict__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__dictoffset__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__dir__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__divmod__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__doc__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__enter__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__eq__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__exit__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__file__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__float__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__floordiv__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__format__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__fspath__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ge__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__get__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__getattr__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__getattribute__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__getinitargs__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__getitem__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__getnewargs__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__getnewargs_ex__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__getstate__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__gt__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__hash__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__iadd__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__iand__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ifloordiv__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ilshift__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__imatmul__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__imod__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__import__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__imul__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__index__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__init__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__init_subclass__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__instancecheck__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__int__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__invert__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ior__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ipow__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__irshift__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__isabstractmethod__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__isub__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__iter__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__itruediv__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ixor__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__le__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__len__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__length_hint__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__lltrace__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__loader__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__lshift__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__lt__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__main__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__matmul__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__missing__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__mod__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__module__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__mro_entries__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__mul__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__name__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ne__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__neg__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__new__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__newobj__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__newobj_ex__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__next__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__notes__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__or__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__orig_class__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__origin__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__package__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__parameters__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__path__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__pos__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__pow__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__prepare__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__qualname__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__radd__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rand__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rdivmod__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__reduce__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__reduce_ex__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__release_buffer__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__repr__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__reversed__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rfloordiv__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rlshift__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rmatmul__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rmod__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rmul__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__ror__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__round__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rpow__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rrshift__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rshift__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rsub__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rtruediv__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__rxor__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__set__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__set_name__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__setattr__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__setitem__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__setstate__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__sizeof__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__slotnames__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__slots__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__spec__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__str__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__sub__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__subclasscheck__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__subclasshook__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__truediv__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__trunc__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__type_params__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__typing_is_unpacked_typevartuple__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__typing_prepare_subst__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__typing_subst__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__typing_unpacked_tuple_args__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__warningregistry__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__weaklistoffset__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__weakref__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(__xor__));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_abc_impl));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_abstract_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_active));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_annotation));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_anonymous_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_argtypes_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_as_parameter_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_asyncio_future_blocking));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_blksize));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_bootstrap));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_check_retval_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_dealloc_warn));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_feature_version));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_fields_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_finalizing));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_find_and_load));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_fix_up_module));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_flags_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_get_sourcefile));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_handle_fromlist));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_initializing));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_io));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_is_text_encoding));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_length_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_limbo));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_lock_unlock_module));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_loop));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_needs_com_addref_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_pack_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_restype_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_showwarnmsg));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_shutdown));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_slotnames));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_strptime_datetime));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_swappedbytes_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_type_));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_uninitialized_submodules));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_warn_unawaited_coroutine));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(_xoptions));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(a));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(abs_tol));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(access));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(add));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(add_done_callback));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(after_in_child));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(after_in_parent));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(aggregate_class));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(alias));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(append));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(arg));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(argdefs));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(args));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(arguments));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(argv));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(as_integer_ratio));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ast));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(attribute));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(authorizer_callback));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(autocommit));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(b));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(backtick));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(base));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(before));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(big));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(binary_form));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(block));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(bound));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(buffer));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(buffer_callback));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(buffer_size));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(buffering));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(buffers));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(bufsize));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(builtins));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(byteorder));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(bytes));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(bytes_per_sep));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_call));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_exception));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(c_return));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cached_statements));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cadata));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cafile));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(call));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(call_exception_handler));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(call_soon));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cancel));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(capath));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(category));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cb_type));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(certfile));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(check_same_thread));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(clear));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(close));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(closed));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(closefd));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(closure));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_argcount));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_cellvars));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_code));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_consts));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_exceptiontable));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_filename));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_firstlineno));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_flags));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_freevars));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_kwonlyargcount));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_linetable));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_name));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_names));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_nlocals));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_posonlyargcount));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_qualname));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_stacksize));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(co_varnames));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(code));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(command));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(comment_factory));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(compile_mode));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(consts));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(context));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(contravariant));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cookie));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(copy));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(copyreg));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(coro));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(count));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(covariant));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(cwd));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(d));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(data));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(database));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(decode));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(decoder));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(default));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(defaultaction));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(delete));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(depth));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(detect_types));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(deterministic));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(device));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(dict));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(dictcomp));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(difference_update));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(digest));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(digest_size));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(digestmod));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(dir_fd));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(discard));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(dispatch_table));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(displayhook));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(dklen));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(doc));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(dont_inherit));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(dst));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(dst_dir_fd));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(duration));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(e));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(eager_start));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(effective_ids));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(element_factory));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(encode));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(encoding));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(end));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(end_lineno));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(end_offset));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(endpos));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(entrypoint));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(env));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(errors));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(event));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(eventmask));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_type));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exc_value));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(excepthook));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exception));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(existing_file_name));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(exp));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(extend));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(extra_tokens));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(facility));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(factory));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(false));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(family));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fanout));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fd));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fd2));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fdel));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fget));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(file));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(file_actions));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(filename));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fileno));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(filepath));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fillvalue));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(filters));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(final));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(find_class));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fix_imports));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(flags));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(flush));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(follow_symlinks));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(format));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(frequency));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(from_param));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fromlist));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fromtimestamp));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fromutc));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(fset));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(func));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(future));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(generation));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(genexpr));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(get));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(get_debug));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(get_event_loop));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(get_loop));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(get_source));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(getattr));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(getstate));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(gid));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(globals));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(groupindex));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(groups));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(handle));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(hash_name));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(header));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(headers));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(hi));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(hook));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(id));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ident));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ignore));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(imag));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(importlib));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(in_fd));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(incoming));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(indexgroup));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(inf));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(infer_variance));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(inheritable));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(initial));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(initial_bytes));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(initial_value));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(initval));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(inner_size));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(input));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(insert_comments));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(insert_pis));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(instructions));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(intern));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(intersection));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(is_running));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(isatty));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(isinstance));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(isoformat));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(isolation_level));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(istext));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(item));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(items));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(iter));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(iterable));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(iterations));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(join));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(jump));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(keepends));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(key));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(keyfile));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(keys));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(kind));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(kw));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(kw1));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(kw2));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(lambda));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(last));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(last_exc));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(last_node));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(last_traceback));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(last_type));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(last_value));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(latin1));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(leaf_size));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(len));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(length));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(level));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(limit));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(line));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(line_buffering));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(lineno));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(listcomp));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(little));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(lo));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(locale));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(locals));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(logoption));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(loop));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(mapping));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(match));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(max_length));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(maxdigits));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(maxevents));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(maxmem));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(maxsplit));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(maxvalue));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(memLevel));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(memlimit));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(message));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(metaclass));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(metadata));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(method));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(mod));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(mode));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(module));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(module_globals));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(modules));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(mro));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(msg));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(mycmp));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(n));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(n_arg));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(n_fields));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(n_sequence_fields));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(n_unnamed_fields));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(name));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(name_from));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(namespace_separator));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(namespaces));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(narg));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ndigits));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(new_file_name));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(new_limit));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(newline));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(newlines));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(next));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(nlocals));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(node_depth));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(node_offset));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ns));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(nstype));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(nt));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(null));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(number));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(obj));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(object));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(offset));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(offset_dst));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(offset_src));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(on_type_read));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(onceregistry));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(only_keys));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(oparg));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(opcode));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(open));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(opener));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(operation));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(optimize));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(options));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(order));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(origin));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(out_fd));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(outgoing));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(overlapped));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(owner));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(p));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pages));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(parent));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(password));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(path));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pattern));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(peek));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(persistent_id));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(persistent_load));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(person));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pi_factory));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pid));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(policy));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pos));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pos1));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(pos2));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(posix));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(print_file_and_line));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(priority));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress_handler));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(progress_routine));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(proto));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(protocol));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ps1));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(ps2));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(query));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(quotetabs));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(r));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(raw));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(read));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(read1));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(readable));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(readall));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(readinto));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(readinto1));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(readline));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(readonly));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(real));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reducer_override));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(registry));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(rel_tol));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(release));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reload));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(repl));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(replace));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reserved));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reset));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(resetids));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(return));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reverse));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(reversed));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(s));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(salt));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sched_priority));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(scheduler));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(seek));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(seekable));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(selectors));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(self));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(send));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sep));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sequence));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(server_hostname));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(server_side));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(session));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(setcomp));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(setpgroup));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(setsid));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(setsigdef));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(setsigmask));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(setstate));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(shape));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(show_cmd));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(signed));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(size));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sizehint));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(skip_file_prefixes));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sleep));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sock));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sort));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sound));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(source));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(source_traceback));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(src));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(src_dir_fd));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stacklevel));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(start));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(statement));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(status));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stderr));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stdin));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(stdout));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(step));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(steps));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(store_name));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(strategy));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(strftime));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(strict));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(strict_mode));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(string));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(sub_key));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(symmetric_difference_update));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tabsize));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tag));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(target));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(target_is_directory));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(task));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tb_frame));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tb_lasti));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tb_lineno));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tb_next));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tell));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(template));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(term));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(text));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(threading));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(throw));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timeout));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(times));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(timetuple));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(top));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(trace_callback));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(traceback));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(trailers));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(translate));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(true));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(truncate));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(twice));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(txt));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(type));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(type_params));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tz));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(tzname));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(uid));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(unlink));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(unraisablehook));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(uri));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(usedforsecurity));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(value));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(values));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(version));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(volume));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(warnings));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(warnoptions));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(wbits));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(week));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(weekday));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(which));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(who));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(withdata));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(writable));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(write));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(write_through));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(x));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(year));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(zdict));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[0]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[1]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[2]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[3]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[4]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[5]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[6]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[7]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[8]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[9]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[10]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[11]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[12]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[13]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[14]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[15]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[16]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[17]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[18]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[19]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[20]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[21]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[22]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[23]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[24]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[25]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[26]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[27]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[28]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[29]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[30]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[31]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[32]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[33]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[34]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[35]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[36]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[37]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[38]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[39]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[40]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[41]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[42]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[43]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[44]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[45]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[46]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[47]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[48]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[49]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[50]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[51]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[52]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[53]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[54]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[55]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[56]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[57]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[58]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[59]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[60]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[61]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[62]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[63]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[64]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[65]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[66]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[67]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[68]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[69]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[70]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[71]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[72]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[73]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[74]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[75]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[76]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[77]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[78]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[79]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[80]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[81]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[82]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[83]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[84]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[85]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[86]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[87]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[88]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[89]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[90]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[91]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[92]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[93]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[94]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[95]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[96]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[97]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[98]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[99]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[100]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[101]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[102]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[103]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[104]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[105]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[106]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[107]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[108]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[109]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[110]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[111]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[112]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[113]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[114]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[115]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[116]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[117]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[118]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[119]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[120]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[121]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[122]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[123]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[124]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[125]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[126]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).ascii[127]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[128 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[129 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[130 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[131 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[132 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[133 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[134 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[135 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[136 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[137 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[138 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[139 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[140 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[141 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[142 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[143 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[144 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[145 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[146 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[147 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[148 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[149 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[150 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[151 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[152 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[153 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[154 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[155 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[156 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[157 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[158 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[159 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[160 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[161 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[162 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[163 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[164 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[165 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[166 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[167 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[168 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[169 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[170 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[171 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[172 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[173 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[174 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[175 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[176 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[177 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[178 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[179 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[180 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[181 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[182 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[183 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[184 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[185 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[186 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[187 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[188 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[189 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[190 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[191 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[192 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[193 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[194 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[195 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[196 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[197 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[198 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[199 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[200 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[201 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[202 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[203 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[204 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[205 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[206 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[207 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[208 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[209 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[210 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[211 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[212 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[213 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[214 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[215 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[216 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[217 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[218 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[219 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[220 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[221 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[222 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[223 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[224 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[225 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[226 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[227 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[228 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[229 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[230 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[231 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[232 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[233 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[234 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[235 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[236 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[237 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[238 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[239 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[240 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[241 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[242 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[243 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[244 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[245 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[246 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[247 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[248 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[249 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[250 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[251 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[252 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[253 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[254 - 128]);
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(strings).latin1[255 - 128]);
+    /* non-generated */
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(bytes_empty));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(tuple_empty));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(hamt_bitmap_node_empty));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_INTERP_SINGLETON(interp, hamt_empty));
+    _PyStaticObject_CheckRefcnt((PyObject *)&_Py_SINGLETON(context_token_missing));
+}
+#endif  // Py_DEBUG
+/* End auto-generated code */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_GLOBAL_OBJECTS_FINI_GENERATED_INIT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_global_strings.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_global_strings.h
new file mode 100644
index 000000000000..0c84999cbf81
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_global_strings.h
@@ -0,0 +1,787 @@
+#ifndef Py_INTERNAL_GLOBAL_STRINGS_H
+#define Py_INTERNAL_GLOBAL_STRINGS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+// The data structure & init here are inspired by Tools/build/deepfreeze.py.
+
+// All field names generated by ASCII_STR() have a common prefix,
+// to help avoid collisions with keywords, macros, etc.
+
+#define STRUCT_FOR_ASCII_STR(LITERAL) \
+    struct { \
+        PyASCIIObject _ascii; \
+        uint8_t _data[sizeof(LITERAL)]; /* sizeof counts the literal's trailing NUL */ \
+    }
+#define STRUCT_FOR_STR(NAME, LITERAL) \
+    STRUCT_FOR_ASCII_STR(LITERAL) _py_ ## NAME;  /* declares member _py_<NAME> */
+#define STRUCT_FOR_ID(NAME) \
+    STRUCT_FOR_ASCII_STR(#NAME) _py_ ## NAME;  /* the text is NAME itself, stringized */
+
+// XXX Order by frequency of use?
+
+/* The following is auto-generated by Tools/build/generate_global_objects.py. */
+struct _Py_global_strings {
+    struct {  /* strings whose text is spelled out explicitly next to the member name */
+        STRUCT_FOR_STR(anon_dictcomp, "<dictcomp>")
+        STRUCT_FOR_STR(anon_genexpr, "<genexpr>")
+        STRUCT_FOR_STR(anon_lambda, "<lambda>")
+        STRUCT_FOR_STR(anon_listcomp, "<listcomp>")
+        STRUCT_FOR_STR(anon_module, "<module>")
+        STRUCT_FOR_STR(anon_setcomp, "<setcomp>")
+        STRUCT_FOR_STR(anon_string, "<string>")
+        STRUCT_FOR_STR(anon_unknown, "<unknown>")
+        STRUCT_FOR_STR(close_br, "}")
+        STRUCT_FOR_STR(dbl_close_br, "}}")
+        STRUCT_FOR_STR(dbl_open_br, "{{")
+        STRUCT_FOR_STR(dbl_percent, "%%")
+        STRUCT_FOR_STR(defaults, ".defaults")
+        STRUCT_FOR_STR(dot, ".")
+        STRUCT_FOR_STR(dot_locals, ".<locals>")
+        STRUCT_FOR_STR(empty, "")
+        STRUCT_FOR_STR(generic_base, ".generic_base")
+        STRUCT_FOR_STR(json_decoder, "json.decoder")
+        STRUCT_FOR_STR(kwdefaults, ".kwdefaults")
+        STRUCT_FOR_STR(list_err, "list index out of range")
+        STRUCT_FOR_STR(newline, "\n")
+        STRUCT_FOR_STR(open_br, "{")
+        STRUCT_FOR_STR(percent, "%")
+        STRUCT_FOR_STR(shim_name, "<shim>")
+        STRUCT_FOR_STR(type_params, ".type_params")
+        STRUCT_FOR_STR(utf_8, "utf-8")
+    } literals;  /* entries are reached through _Py_STR(name) */
+
+    struct {
+        STRUCT_FOR_ID(CANCELLED)
+        STRUCT_FOR_ID(FINISHED)
+        STRUCT_FOR_ID(False)
+        STRUCT_FOR_ID(JSONDecodeError)
+        STRUCT_FOR_ID(PENDING)
+        STRUCT_FOR_ID(Py_Repr)
+        STRUCT_FOR_ID(TextIOWrapper)
+        STRUCT_FOR_ID(True)
+        STRUCT_FOR_ID(WarningMessage)
+        STRUCT_FOR_ID(_)
+        STRUCT_FOR_ID(_WindowsConsoleIO)
+        STRUCT_FOR_ID(__IOBase_closed)
+        STRUCT_FOR_ID(__abc_tpflags__)
+        STRUCT_FOR_ID(__abs__)
+        STRUCT_FOR_ID(__abstractmethods__)
+        STRUCT_FOR_ID(__add__)
+        STRUCT_FOR_ID(__aenter__)
+        STRUCT_FOR_ID(__aexit__)
+        STRUCT_FOR_ID(__aiter__)
+        STRUCT_FOR_ID(__all__)
+        STRUCT_FOR_ID(__and__)
+        STRUCT_FOR_ID(__anext__)
+        STRUCT_FOR_ID(__annotations__)
+        STRUCT_FOR_ID(__args__)
+        STRUCT_FOR_ID(__asyncio_running_event_loop__)
+        STRUCT_FOR_ID(__await__)
+        STRUCT_FOR_ID(__bases__)
+        STRUCT_FOR_ID(__bool__)
+        STRUCT_FOR_ID(__buffer__)
+        STRUCT_FOR_ID(__build_class__)
+        STRUCT_FOR_ID(__builtins__)
+        STRUCT_FOR_ID(__bytes__)
+        STRUCT_FOR_ID(__call__)
+        STRUCT_FOR_ID(__cantrace__)
+        STRUCT_FOR_ID(__class__)
+        STRUCT_FOR_ID(__class_getitem__)
+        STRUCT_FOR_ID(__classcell__)
+        STRUCT_FOR_ID(__classdict__)
+        STRUCT_FOR_ID(__classdictcell__)
+        STRUCT_FOR_ID(__complex__)
+        STRUCT_FOR_ID(__contains__)
+        STRUCT_FOR_ID(__copy__)
+        STRUCT_FOR_ID(__ctypes_from_outparam__)
+        STRUCT_FOR_ID(__del__)
+        STRUCT_FOR_ID(__delattr__)
+        STRUCT_FOR_ID(__delete__)
+        STRUCT_FOR_ID(__delitem__)
+        STRUCT_FOR_ID(__dict__)
+        STRUCT_FOR_ID(__dictoffset__)
+        STRUCT_FOR_ID(__dir__)
+        STRUCT_FOR_ID(__divmod__)
+        STRUCT_FOR_ID(__doc__)
+        STRUCT_FOR_ID(__enter__)
+        STRUCT_FOR_ID(__eq__)
+        STRUCT_FOR_ID(__exit__)
+        STRUCT_FOR_ID(__file__)
+        STRUCT_FOR_ID(__float__)
+        STRUCT_FOR_ID(__floordiv__)
+        STRUCT_FOR_ID(__format__)
+        STRUCT_FOR_ID(__fspath__)
+        STRUCT_FOR_ID(__ge__)
+        STRUCT_FOR_ID(__get__)
+        STRUCT_FOR_ID(__getattr__)
+        STRUCT_FOR_ID(__getattribute__)
+        STRUCT_FOR_ID(__getinitargs__)
+        STRUCT_FOR_ID(__getitem__)
+        STRUCT_FOR_ID(__getnewargs__)
+        STRUCT_FOR_ID(__getnewargs_ex__)
+        STRUCT_FOR_ID(__getstate__)
+        STRUCT_FOR_ID(__gt__)
+        STRUCT_FOR_ID(__hash__)
+        STRUCT_FOR_ID(__iadd__)
+        STRUCT_FOR_ID(__iand__)
+        STRUCT_FOR_ID(__ifloordiv__)
+        STRUCT_FOR_ID(__ilshift__)
+        STRUCT_FOR_ID(__imatmul__)
+        STRUCT_FOR_ID(__imod__)
+        STRUCT_FOR_ID(__import__)
+        STRUCT_FOR_ID(__imul__)
+        STRUCT_FOR_ID(__index__)
+        STRUCT_FOR_ID(__init__)
+        STRUCT_FOR_ID(__init_subclass__)
+        STRUCT_FOR_ID(__instancecheck__)
+        STRUCT_FOR_ID(__int__)
+        STRUCT_FOR_ID(__invert__)
+        STRUCT_FOR_ID(__ior__)
+        STRUCT_FOR_ID(__ipow__)
+        STRUCT_FOR_ID(__irshift__)
+        STRUCT_FOR_ID(__isabstractmethod__)
+        STRUCT_FOR_ID(__isub__)
+        STRUCT_FOR_ID(__iter__)
+        STRUCT_FOR_ID(__itruediv__)
+        STRUCT_FOR_ID(__ixor__)
+        STRUCT_FOR_ID(__le__)
+        STRUCT_FOR_ID(__len__)
+        STRUCT_FOR_ID(__length_hint__)
+        STRUCT_FOR_ID(__lltrace__)
+        STRUCT_FOR_ID(__loader__)
+        STRUCT_FOR_ID(__lshift__)
+        STRUCT_FOR_ID(__lt__)
+        STRUCT_FOR_ID(__main__)
+        STRUCT_FOR_ID(__matmul__)
+        STRUCT_FOR_ID(__missing__)
+        STRUCT_FOR_ID(__mod__)
+        STRUCT_FOR_ID(__module__)
+        STRUCT_FOR_ID(__mro_entries__)
+        STRUCT_FOR_ID(__mul__)
+        STRUCT_FOR_ID(__name__)
+        STRUCT_FOR_ID(__ne__)
+        STRUCT_FOR_ID(__neg__)
+        STRUCT_FOR_ID(__new__)
+        STRUCT_FOR_ID(__newobj__)
+        STRUCT_FOR_ID(__newobj_ex__)
+        STRUCT_FOR_ID(__next__)
+        STRUCT_FOR_ID(__notes__)
+        STRUCT_FOR_ID(__or__)
+        STRUCT_FOR_ID(__orig_class__)
+        STRUCT_FOR_ID(__origin__)
+        STRUCT_FOR_ID(__package__)
+        STRUCT_FOR_ID(__parameters__)
+        STRUCT_FOR_ID(__path__)
+        STRUCT_FOR_ID(__pos__)
+        STRUCT_FOR_ID(__pow__)
+        STRUCT_FOR_ID(__prepare__)
+        STRUCT_FOR_ID(__qualname__)
+        STRUCT_FOR_ID(__radd__)
+        STRUCT_FOR_ID(__rand__)
+        STRUCT_FOR_ID(__rdivmod__)
+        STRUCT_FOR_ID(__reduce__)
+        STRUCT_FOR_ID(__reduce_ex__)
+        STRUCT_FOR_ID(__release_buffer__)
+        STRUCT_FOR_ID(__repr__)
+        STRUCT_FOR_ID(__reversed__)
+        STRUCT_FOR_ID(__rfloordiv__)
+        STRUCT_FOR_ID(__rlshift__)
+        STRUCT_FOR_ID(__rmatmul__)
+        STRUCT_FOR_ID(__rmod__)
+        STRUCT_FOR_ID(__rmul__)
+        STRUCT_FOR_ID(__ror__)
+        STRUCT_FOR_ID(__round__)
+        STRUCT_FOR_ID(__rpow__)
+        STRUCT_FOR_ID(__rrshift__)
+        STRUCT_FOR_ID(__rshift__)
+        STRUCT_FOR_ID(__rsub__)
+        STRUCT_FOR_ID(__rtruediv__)
+        STRUCT_FOR_ID(__rxor__)
+        STRUCT_FOR_ID(__set__)
+        STRUCT_FOR_ID(__set_name__)
+        STRUCT_FOR_ID(__setattr__)
+        STRUCT_FOR_ID(__setitem__)
+        STRUCT_FOR_ID(__setstate__)
+        STRUCT_FOR_ID(__sizeof__)
+        STRUCT_FOR_ID(__slotnames__)
+        STRUCT_FOR_ID(__slots__)
+        STRUCT_FOR_ID(__spec__)
+        STRUCT_FOR_ID(__str__)
+        STRUCT_FOR_ID(__sub__)
+        STRUCT_FOR_ID(__subclasscheck__)
+        STRUCT_FOR_ID(__subclasshook__)
+        STRUCT_FOR_ID(__truediv__)
+        STRUCT_FOR_ID(__trunc__)
+        STRUCT_FOR_ID(__type_params__)
+        STRUCT_FOR_ID(__typing_is_unpacked_typevartuple__)
+        STRUCT_FOR_ID(__typing_prepare_subst__)
+        STRUCT_FOR_ID(__typing_subst__)
+        STRUCT_FOR_ID(__typing_unpacked_tuple_args__)
+        STRUCT_FOR_ID(__warningregistry__)
+        STRUCT_FOR_ID(__weaklistoffset__)
+        STRUCT_FOR_ID(__weakref__)
+        STRUCT_FOR_ID(__xor__)
+        STRUCT_FOR_ID(_abc_impl)
+        STRUCT_FOR_ID(_abstract_)
+        STRUCT_FOR_ID(_active)
+        STRUCT_FOR_ID(_annotation)
+        STRUCT_FOR_ID(_anonymous_)
+        STRUCT_FOR_ID(_argtypes_)
+        STRUCT_FOR_ID(_as_parameter_)
+        STRUCT_FOR_ID(_asyncio_future_blocking)
+        STRUCT_FOR_ID(_blksize)
+        STRUCT_FOR_ID(_bootstrap)
+        STRUCT_FOR_ID(_check_retval_)
+        STRUCT_FOR_ID(_dealloc_warn)
+        STRUCT_FOR_ID(_feature_version)
+        STRUCT_FOR_ID(_fields_)
+        STRUCT_FOR_ID(_finalizing)
+        STRUCT_FOR_ID(_find_and_load)
+        STRUCT_FOR_ID(_fix_up_module)
+        STRUCT_FOR_ID(_flags_)
+        STRUCT_FOR_ID(_get_sourcefile)
+        STRUCT_FOR_ID(_handle_fromlist)
+        STRUCT_FOR_ID(_initializing)
+        STRUCT_FOR_ID(_io)
+        STRUCT_FOR_ID(_is_text_encoding)
+        STRUCT_FOR_ID(_length_)
+        STRUCT_FOR_ID(_limbo)
+        STRUCT_FOR_ID(_lock_unlock_module)
+        STRUCT_FOR_ID(_loop)
+        STRUCT_FOR_ID(_needs_com_addref_)
+        STRUCT_FOR_ID(_pack_)
+        STRUCT_FOR_ID(_restype_)
+        STRUCT_FOR_ID(_showwarnmsg)
+        STRUCT_FOR_ID(_shutdown)
+        STRUCT_FOR_ID(_slotnames)
+        STRUCT_FOR_ID(_strptime_datetime)
+        STRUCT_FOR_ID(_swappedbytes_)
+        STRUCT_FOR_ID(_type_)
+        STRUCT_FOR_ID(_uninitialized_submodules)
+        STRUCT_FOR_ID(_warn_unawaited_coroutine)
+        STRUCT_FOR_ID(_xoptions)
+        STRUCT_FOR_ID(a)
+        STRUCT_FOR_ID(abs_tol)
+        STRUCT_FOR_ID(access)
+        STRUCT_FOR_ID(add)
+        STRUCT_FOR_ID(add_done_callback)
+        STRUCT_FOR_ID(after_in_child)
+        STRUCT_FOR_ID(after_in_parent)
+        STRUCT_FOR_ID(aggregate_class)
+        STRUCT_FOR_ID(alias)
+        STRUCT_FOR_ID(append)
+        STRUCT_FOR_ID(arg)
+        STRUCT_FOR_ID(argdefs)
+        STRUCT_FOR_ID(args)
+        STRUCT_FOR_ID(arguments)
+        STRUCT_FOR_ID(argv)
+        STRUCT_FOR_ID(as_integer_ratio)
+        STRUCT_FOR_ID(ast)
+        STRUCT_FOR_ID(attribute)
+        STRUCT_FOR_ID(authorizer_callback)
+        STRUCT_FOR_ID(autocommit)
+        STRUCT_FOR_ID(b)
+        STRUCT_FOR_ID(backtick)
+        STRUCT_FOR_ID(base)
+        STRUCT_FOR_ID(before)
+        STRUCT_FOR_ID(big)
+        STRUCT_FOR_ID(binary_form)
+        STRUCT_FOR_ID(block)
+        STRUCT_FOR_ID(bound)
+        STRUCT_FOR_ID(buffer)
+        STRUCT_FOR_ID(buffer_callback)
+        STRUCT_FOR_ID(buffer_size)
+        STRUCT_FOR_ID(buffering)
+        STRUCT_FOR_ID(buffers)
+        STRUCT_FOR_ID(bufsize)
+        STRUCT_FOR_ID(builtins)
+        STRUCT_FOR_ID(byteorder)
+        STRUCT_FOR_ID(bytes)
+        STRUCT_FOR_ID(bytes_per_sep)
+        STRUCT_FOR_ID(c)
+        STRUCT_FOR_ID(c_call)
+        STRUCT_FOR_ID(c_exception)
+        STRUCT_FOR_ID(c_return)
+        STRUCT_FOR_ID(cached_statements)
+        STRUCT_FOR_ID(cadata)
+        STRUCT_FOR_ID(cafile)
+        STRUCT_FOR_ID(call)
+        STRUCT_FOR_ID(call_exception_handler)
+        STRUCT_FOR_ID(call_soon)
+        STRUCT_FOR_ID(cancel)
+        STRUCT_FOR_ID(capath)
+        STRUCT_FOR_ID(category)
+        STRUCT_FOR_ID(cb_type)
+        STRUCT_FOR_ID(certfile)
+        STRUCT_FOR_ID(check_same_thread)
+        STRUCT_FOR_ID(clear)
+        STRUCT_FOR_ID(close)
+        STRUCT_FOR_ID(closed)
+        STRUCT_FOR_ID(closefd)
+        STRUCT_FOR_ID(closure)
+        STRUCT_FOR_ID(co_argcount)
+        STRUCT_FOR_ID(co_cellvars)
+        STRUCT_FOR_ID(co_code)
+        STRUCT_FOR_ID(co_consts)
+        STRUCT_FOR_ID(co_exceptiontable)
+        STRUCT_FOR_ID(co_filename)
+        STRUCT_FOR_ID(co_firstlineno)
+        STRUCT_FOR_ID(co_flags)
+        STRUCT_FOR_ID(co_freevars)
+        STRUCT_FOR_ID(co_kwonlyargcount)
+        STRUCT_FOR_ID(co_linetable)
+        STRUCT_FOR_ID(co_name)
+        STRUCT_FOR_ID(co_names)
+        STRUCT_FOR_ID(co_nlocals)
+        STRUCT_FOR_ID(co_posonlyargcount)
+        STRUCT_FOR_ID(co_qualname)
+        STRUCT_FOR_ID(co_stacksize)
+        STRUCT_FOR_ID(co_varnames)
+        STRUCT_FOR_ID(code)
+        STRUCT_FOR_ID(command)
+        STRUCT_FOR_ID(comment_factory)
+        STRUCT_FOR_ID(compile_mode)
+        STRUCT_FOR_ID(consts)
+        STRUCT_FOR_ID(context)
+        STRUCT_FOR_ID(contravariant)
+        STRUCT_FOR_ID(cookie)
+        STRUCT_FOR_ID(copy)
+        STRUCT_FOR_ID(copyreg)
+        STRUCT_FOR_ID(coro)
+        STRUCT_FOR_ID(count)
+        STRUCT_FOR_ID(covariant)
+        STRUCT_FOR_ID(cwd)
+        STRUCT_FOR_ID(d)
+        STRUCT_FOR_ID(data)
+        STRUCT_FOR_ID(database)
+        STRUCT_FOR_ID(decode)
+        STRUCT_FOR_ID(decoder)
+        STRUCT_FOR_ID(default)
+        STRUCT_FOR_ID(defaultaction)
+        STRUCT_FOR_ID(delete)
+        STRUCT_FOR_ID(depth)
+        STRUCT_FOR_ID(detect_types)
+        STRUCT_FOR_ID(deterministic)
+        STRUCT_FOR_ID(device)
+        STRUCT_FOR_ID(dict)
+        STRUCT_FOR_ID(dictcomp)
+        STRUCT_FOR_ID(difference_update)
+        STRUCT_FOR_ID(digest)
+        STRUCT_FOR_ID(digest_size)
+        STRUCT_FOR_ID(digestmod)
+        STRUCT_FOR_ID(dir_fd)
+        STRUCT_FOR_ID(discard)
+        STRUCT_FOR_ID(dispatch_table)
+        STRUCT_FOR_ID(displayhook)
+        STRUCT_FOR_ID(dklen)
+        STRUCT_FOR_ID(doc)
+        STRUCT_FOR_ID(dont_inherit)
+        STRUCT_FOR_ID(dst)
+        STRUCT_FOR_ID(dst_dir_fd)
+        STRUCT_FOR_ID(duration)
+        STRUCT_FOR_ID(e)
+        STRUCT_FOR_ID(eager_start)
+        STRUCT_FOR_ID(effective_ids)
+        STRUCT_FOR_ID(element_factory)
+        STRUCT_FOR_ID(encode)
+        STRUCT_FOR_ID(encoding)
+        STRUCT_FOR_ID(end)
+        STRUCT_FOR_ID(end_lineno)
+        STRUCT_FOR_ID(end_offset)
+        STRUCT_FOR_ID(endpos)
+        STRUCT_FOR_ID(entrypoint)
+        STRUCT_FOR_ID(env)
+        STRUCT_FOR_ID(errors)
+        STRUCT_FOR_ID(event)
+        STRUCT_FOR_ID(eventmask)
+        STRUCT_FOR_ID(exc_type)
+        STRUCT_FOR_ID(exc_value)
+        STRUCT_FOR_ID(excepthook)
+        STRUCT_FOR_ID(exception)
+        STRUCT_FOR_ID(existing_file_name)
+        STRUCT_FOR_ID(exp)
+        STRUCT_FOR_ID(extend)
+        STRUCT_FOR_ID(extra_tokens)
+        STRUCT_FOR_ID(facility)
+        STRUCT_FOR_ID(factory)
+        STRUCT_FOR_ID(false)
+        STRUCT_FOR_ID(family)
+        STRUCT_FOR_ID(fanout)
+        STRUCT_FOR_ID(fd)
+        STRUCT_FOR_ID(fd2)
+        STRUCT_FOR_ID(fdel)
+        STRUCT_FOR_ID(fget)
+        STRUCT_FOR_ID(file)
+        STRUCT_FOR_ID(file_actions)
+        STRUCT_FOR_ID(filename)
+        STRUCT_FOR_ID(fileno)
+        STRUCT_FOR_ID(filepath)
+        STRUCT_FOR_ID(fillvalue)
+        STRUCT_FOR_ID(filters)
+        STRUCT_FOR_ID(final)
+        STRUCT_FOR_ID(find_class)
+        STRUCT_FOR_ID(fix_imports)
+        STRUCT_FOR_ID(flags)
+        STRUCT_FOR_ID(flush)
+        STRUCT_FOR_ID(follow_symlinks)
+        STRUCT_FOR_ID(format)
+        STRUCT_FOR_ID(frequency)
+        STRUCT_FOR_ID(from_param)
+        STRUCT_FOR_ID(fromlist)
+        STRUCT_FOR_ID(fromtimestamp)
+        STRUCT_FOR_ID(fromutc)
+        STRUCT_FOR_ID(fset)
+        STRUCT_FOR_ID(func)
+        STRUCT_FOR_ID(future)
+        STRUCT_FOR_ID(generation)
+        STRUCT_FOR_ID(genexpr)
+        STRUCT_FOR_ID(get)
+        STRUCT_FOR_ID(get_debug)
+        STRUCT_FOR_ID(get_event_loop)
+        STRUCT_FOR_ID(get_loop)
+        STRUCT_FOR_ID(get_source)
+        STRUCT_FOR_ID(getattr)
+        STRUCT_FOR_ID(getstate)
+        STRUCT_FOR_ID(gid)
+        STRUCT_FOR_ID(globals)
+        STRUCT_FOR_ID(groupindex)
+        STRUCT_FOR_ID(groups)
+        STRUCT_FOR_ID(handle)
+        STRUCT_FOR_ID(hash_name)
+        STRUCT_FOR_ID(header)
+        STRUCT_FOR_ID(headers)
+        STRUCT_FOR_ID(hi)
+        STRUCT_FOR_ID(hook)
+        STRUCT_FOR_ID(id)
+        STRUCT_FOR_ID(ident)
+        STRUCT_FOR_ID(ignore)
+        STRUCT_FOR_ID(imag)
+        STRUCT_FOR_ID(importlib)
+        STRUCT_FOR_ID(in_fd)
+        STRUCT_FOR_ID(incoming)
+        STRUCT_FOR_ID(indexgroup)
+        STRUCT_FOR_ID(inf)
+        STRUCT_FOR_ID(infer_variance)
+        STRUCT_FOR_ID(inheritable)
+        STRUCT_FOR_ID(initial)
+        STRUCT_FOR_ID(initial_bytes)
+        STRUCT_FOR_ID(initial_value)
+        STRUCT_FOR_ID(initval)
+        STRUCT_FOR_ID(inner_size)
+        STRUCT_FOR_ID(input)
+        STRUCT_FOR_ID(insert_comments)
+        STRUCT_FOR_ID(insert_pis)
+        STRUCT_FOR_ID(instructions)
+        STRUCT_FOR_ID(intern)
+        STRUCT_FOR_ID(intersection)
+        STRUCT_FOR_ID(is_running)
+        STRUCT_FOR_ID(isatty)
+        STRUCT_FOR_ID(isinstance)
+        STRUCT_FOR_ID(isoformat)
+        STRUCT_FOR_ID(isolation_level)
+        STRUCT_FOR_ID(istext)
+        STRUCT_FOR_ID(item)
+        STRUCT_FOR_ID(items)
+        STRUCT_FOR_ID(iter)
+        STRUCT_FOR_ID(iterable)
+        STRUCT_FOR_ID(iterations)
+        STRUCT_FOR_ID(join)
+        STRUCT_FOR_ID(jump)
+        STRUCT_FOR_ID(keepends)
+        STRUCT_FOR_ID(key)
+        STRUCT_FOR_ID(keyfile)
+        STRUCT_FOR_ID(keys)
+        STRUCT_FOR_ID(kind)
+        STRUCT_FOR_ID(kw)
+        STRUCT_FOR_ID(kw1)
+        STRUCT_FOR_ID(kw2)
+        STRUCT_FOR_ID(lambda)
+        STRUCT_FOR_ID(last)
+        STRUCT_FOR_ID(last_exc)
+        STRUCT_FOR_ID(last_node)
+        STRUCT_FOR_ID(last_traceback)
+        STRUCT_FOR_ID(last_type)
+        STRUCT_FOR_ID(last_value)
+        STRUCT_FOR_ID(latin1)
+        STRUCT_FOR_ID(leaf_size)
+        STRUCT_FOR_ID(len)
+        STRUCT_FOR_ID(length)
+        STRUCT_FOR_ID(level)
+        STRUCT_FOR_ID(limit)
+        STRUCT_FOR_ID(line)
+        STRUCT_FOR_ID(line_buffering)
+        STRUCT_FOR_ID(lineno)
+        STRUCT_FOR_ID(listcomp)
+        STRUCT_FOR_ID(little)
+        STRUCT_FOR_ID(lo)
+        STRUCT_FOR_ID(locale)
+        STRUCT_FOR_ID(locals)
+        STRUCT_FOR_ID(logoption)
+        STRUCT_FOR_ID(loop)
+        STRUCT_FOR_ID(mapping)
+        STRUCT_FOR_ID(match)
+        STRUCT_FOR_ID(max_length)
+        STRUCT_FOR_ID(maxdigits)
+        STRUCT_FOR_ID(maxevents)
+        STRUCT_FOR_ID(maxmem)
+        STRUCT_FOR_ID(maxsplit)
+        STRUCT_FOR_ID(maxvalue)
+        STRUCT_FOR_ID(memLevel)
+        STRUCT_FOR_ID(memlimit)
+        STRUCT_FOR_ID(message)
+        STRUCT_FOR_ID(metaclass)
+        STRUCT_FOR_ID(metadata)
+        STRUCT_FOR_ID(method)
+        STRUCT_FOR_ID(mod)
+        STRUCT_FOR_ID(mode)
+        STRUCT_FOR_ID(module)
+        STRUCT_FOR_ID(module_globals)
+        STRUCT_FOR_ID(modules)
+        STRUCT_FOR_ID(mro)
+        STRUCT_FOR_ID(msg)
+        STRUCT_FOR_ID(mycmp)
+        STRUCT_FOR_ID(n)
+        STRUCT_FOR_ID(n_arg)
+        STRUCT_FOR_ID(n_fields)
+        STRUCT_FOR_ID(n_sequence_fields)
+        STRUCT_FOR_ID(n_unnamed_fields)
+        STRUCT_FOR_ID(name)
+        STRUCT_FOR_ID(name_from)
+        STRUCT_FOR_ID(namespace_separator)
+        STRUCT_FOR_ID(namespaces)
+        STRUCT_FOR_ID(narg)
+        STRUCT_FOR_ID(ndigits)
+        STRUCT_FOR_ID(new_file_name)
+        STRUCT_FOR_ID(new_limit)
+        STRUCT_FOR_ID(newline)
+        STRUCT_FOR_ID(newlines)
+        STRUCT_FOR_ID(next)
+        STRUCT_FOR_ID(nlocals)
+        STRUCT_FOR_ID(node_depth)
+        STRUCT_FOR_ID(node_offset)
+        STRUCT_FOR_ID(ns)
+        STRUCT_FOR_ID(nstype)
+        STRUCT_FOR_ID(nt)
+        STRUCT_FOR_ID(null)
+        STRUCT_FOR_ID(number)
+        STRUCT_FOR_ID(obj)
+        STRUCT_FOR_ID(object)
+        STRUCT_FOR_ID(offset)
+        STRUCT_FOR_ID(offset_dst)
+        STRUCT_FOR_ID(offset_src)
+        STRUCT_FOR_ID(on_type_read)
+        STRUCT_FOR_ID(onceregistry)
+        STRUCT_FOR_ID(only_keys)
+        STRUCT_FOR_ID(oparg)
+        STRUCT_FOR_ID(opcode)
+        STRUCT_FOR_ID(open)
+        STRUCT_FOR_ID(opener)
+        STRUCT_FOR_ID(operation)
+        STRUCT_FOR_ID(optimize)
+        STRUCT_FOR_ID(options)
+        STRUCT_FOR_ID(order)
+        STRUCT_FOR_ID(origin)
+        STRUCT_FOR_ID(out_fd)
+        STRUCT_FOR_ID(outgoing)
+        STRUCT_FOR_ID(overlapped)
+        STRUCT_FOR_ID(owner)
+        STRUCT_FOR_ID(p)
+        STRUCT_FOR_ID(pages)
+        STRUCT_FOR_ID(parent)
+        STRUCT_FOR_ID(password)
+        STRUCT_FOR_ID(path)
+        STRUCT_FOR_ID(pattern)
+        STRUCT_FOR_ID(peek)
+        STRUCT_FOR_ID(persistent_id)
+        STRUCT_FOR_ID(persistent_load)
+        STRUCT_FOR_ID(person)
+        STRUCT_FOR_ID(pi_factory)
+        STRUCT_FOR_ID(pid)
+        STRUCT_FOR_ID(policy)
+        STRUCT_FOR_ID(pos)
+        STRUCT_FOR_ID(pos1)
+        STRUCT_FOR_ID(pos2)
+        STRUCT_FOR_ID(posix)
+        STRUCT_FOR_ID(print_file_and_line)
+        STRUCT_FOR_ID(priority)
+        STRUCT_FOR_ID(progress)
+        STRUCT_FOR_ID(progress_handler)
+        STRUCT_FOR_ID(progress_routine)
+        STRUCT_FOR_ID(proto)
+        STRUCT_FOR_ID(protocol)
+        STRUCT_FOR_ID(ps1)
+        STRUCT_FOR_ID(ps2)
+        STRUCT_FOR_ID(query)
+        STRUCT_FOR_ID(quotetabs)
+        STRUCT_FOR_ID(r)
+        STRUCT_FOR_ID(raw)
+        STRUCT_FOR_ID(read)
+        STRUCT_FOR_ID(read1)
+        STRUCT_FOR_ID(readable)
+        STRUCT_FOR_ID(readall)
+        STRUCT_FOR_ID(readinto)
+        STRUCT_FOR_ID(readinto1)
+        STRUCT_FOR_ID(readline)
+        STRUCT_FOR_ID(readonly)
+        STRUCT_FOR_ID(real)
+        STRUCT_FOR_ID(reducer_override)
+        STRUCT_FOR_ID(registry)
+        STRUCT_FOR_ID(rel_tol)
+        STRUCT_FOR_ID(release)
+        STRUCT_FOR_ID(reload)
+        STRUCT_FOR_ID(repl)
+        STRUCT_FOR_ID(replace)
+        STRUCT_FOR_ID(reserved)
+        STRUCT_FOR_ID(reset)
+        STRUCT_FOR_ID(resetids)
+        STRUCT_FOR_ID(return)
+        STRUCT_FOR_ID(reverse)
+        STRUCT_FOR_ID(reversed)
+        STRUCT_FOR_ID(s)
+        STRUCT_FOR_ID(salt)
+        STRUCT_FOR_ID(sched_priority)
+        STRUCT_FOR_ID(scheduler)
+        STRUCT_FOR_ID(seek)
+        STRUCT_FOR_ID(seekable)
+        STRUCT_FOR_ID(selectors)
+        STRUCT_FOR_ID(self)
+        STRUCT_FOR_ID(send)
+        STRUCT_FOR_ID(sep)
+        STRUCT_FOR_ID(sequence)
+        STRUCT_FOR_ID(server_hostname)
+        STRUCT_FOR_ID(server_side)
+        STRUCT_FOR_ID(session)
+        STRUCT_FOR_ID(setcomp)
+        STRUCT_FOR_ID(setpgroup)
+        STRUCT_FOR_ID(setsid)
+        STRUCT_FOR_ID(setsigdef)
+        STRUCT_FOR_ID(setsigmask)
+        STRUCT_FOR_ID(setstate)
+        STRUCT_FOR_ID(shape)
+        STRUCT_FOR_ID(show_cmd)
+        STRUCT_FOR_ID(signed)
+        STRUCT_FOR_ID(size)
+        STRUCT_FOR_ID(sizehint)
+        STRUCT_FOR_ID(skip_file_prefixes)
+        STRUCT_FOR_ID(sleep)
+        STRUCT_FOR_ID(sock)
+        STRUCT_FOR_ID(sort)
+        STRUCT_FOR_ID(sound)
+        STRUCT_FOR_ID(source)
+        STRUCT_FOR_ID(source_traceback)
+        STRUCT_FOR_ID(src)
+        STRUCT_FOR_ID(src_dir_fd)
+        STRUCT_FOR_ID(stacklevel)
+        STRUCT_FOR_ID(start)
+        STRUCT_FOR_ID(statement)
+        STRUCT_FOR_ID(status)
+        STRUCT_FOR_ID(stderr)
+        STRUCT_FOR_ID(stdin)
+        STRUCT_FOR_ID(stdout)
+        STRUCT_FOR_ID(step)
+        STRUCT_FOR_ID(steps)
+        STRUCT_FOR_ID(store_name)
+        STRUCT_FOR_ID(strategy)
+        STRUCT_FOR_ID(strftime)
+        STRUCT_FOR_ID(strict)
+        STRUCT_FOR_ID(strict_mode)
+        STRUCT_FOR_ID(string)
+        STRUCT_FOR_ID(sub_key)
+        STRUCT_FOR_ID(symmetric_difference_update)
+        STRUCT_FOR_ID(tabsize)
+        STRUCT_FOR_ID(tag)
+        STRUCT_FOR_ID(target)
+        STRUCT_FOR_ID(target_is_directory)
+        STRUCT_FOR_ID(task)
+        STRUCT_FOR_ID(tb_frame)
+        STRUCT_FOR_ID(tb_lasti)
+        STRUCT_FOR_ID(tb_lineno)
+        STRUCT_FOR_ID(tb_next)
+        STRUCT_FOR_ID(tell)
+        STRUCT_FOR_ID(template)
+        STRUCT_FOR_ID(term)
+        STRUCT_FOR_ID(text)
+        STRUCT_FOR_ID(threading)
+        STRUCT_FOR_ID(throw)
+        STRUCT_FOR_ID(timeout)
+        STRUCT_FOR_ID(times)
+        STRUCT_FOR_ID(timetuple)
+        STRUCT_FOR_ID(top)
+        STRUCT_FOR_ID(trace_callback)
+        STRUCT_FOR_ID(traceback)
+        STRUCT_FOR_ID(trailers)
+        STRUCT_FOR_ID(translate)
+        STRUCT_FOR_ID(true)
+        STRUCT_FOR_ID(truncate)
+        STRUCT_FOR_ID(twice)
+        STRUCT_FOR_ID(txt)
+        STRUCT_FOR_ID(type)
+        STRUCT_FOR_ID(type_params)
+        STRUCT_FOR_ID(tz)
+        STRUCT_FOR_ID(tzname)
+        STRUCT_FOR_ID(uid)
+        STRUCT_FOR_ID(unlink)
+        STRUCT_FOR_ID(unraisablehook)
+        STRUCT_FOR_ID(uri)
+        STRUCT_FOR_ID(usedforsecurity)
+        STRUCT_FOR_ID(value)
+        STRUCT_FOR_ID(values)
+        STRUCT_FOR_ID(version)
+        STRUCT_FOR_ID(volume)
+        STRUCT_FOR_ID(warnings)
+        STRUCT_FOR_ID(warnoptions)
+        STRUCT_FOR_ID(wbits)
+        STRUCT_FOR_ID(week)
+        STRUCT_FOR_ID(weekday)
+        STRUCT_FOR_ID(which)
+        STRUCT_FOR_ID(who)
+        STRUCT_FOR_ID(withdata)
+        STRUCT_FOR_ID(writable)
+        STRUCT_FOR_ID(write)
+        STRUCT_FOR_ID(write_through)
+        STRUCT_FOR_ID(x)
+        STRUCT_FOR_ID(year)
+        STRUCT_FOR_ID(zdict)
+    } identifiers;
+    struct {
+        PyASCIIObject _ascii;
+        uint8_t _data[2];  /* presumably one character plus a NUL -- TODO confirm */
+    } ascii[128];  /* 128 preallocated entries; presumably indexed by character value -- confirm */
+    struct {
+        PyCompactUnicodeObject _latin1;
+        uint8_t _data[2];  /* presumably one character plus a NUL -- TODO confirm */
+    } latin1[128];  /* the generated fini code indexes this as latin1[c - 128], e.g. latin1[255 - 128] */
+};
+/* End auto-generated code */
+
+#undef ID   /* NOTE(review): this header defines no ID or STR macro, so these look like */
+#undef STR  /* leftovers; STRUCT_FOR_ASCII_STR/_STR/_ID above stay defined -- confirm intent */
+
+
+#define _Py_ID(NAME) \
+     (_Py_SINGLETON(strings.identifiers._py_ ## NAME._ascii.ob_base))  /* the member made by STRUCT_FOR_ID(NAME) */
+#define _Py_STR(NAME) \
+     (_Py_SINGLETON(strings.literals._py_ ## NAME._ascii.ob_base))  /* the member made by STRUCT_FOR_STR(NAME, ...) */
+
+/* _Py_DECLARE_STR() should precede all uses of _Py_STR() in a function.
+
+   This is true even if the same string has already been declared
+   elsewhere, even in the same file.  Mismatched duplicates are detected
+   by Tools/build/generate_global_objects.py.
+
+   Pairing _Py_DECLARE_STR() with every use of _Py_STR() makes sure the
+   string keeps working even if the declaration is removed somewhere
+   else.  It also makes it clear what the actual string is at every
+   place it is being used. */
+#define _Py_DECLARE_STR(name, str)  /* empty replacement list: expands to nothing when compiled */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_GLOBAL_STRINGS_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_hamt.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_hamt.h
new file mode 100644
index 000000000000..d8742c7cb635
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_hamt.h
@@ -0,0 +1,134 @@
+#ifndef Py_INTERNAL_HAMT_H
+#define Py_INTERNAL_HAMT_H
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+/*
+HAMT tree is shaped by hashes of keys. Every group of 5 bits of a hash denotes
+the exact position of the key in one level of the tree. Since we're using
+32 bit hashes, we can have at most 7 such levels. Although if there are
+two distinct keys with equal hashes, they will have to occupy the same
+cell in the 7th level of the tree -- so we'd put them in a "collision" node.
+Which brings the total possible tree depth to 8. Read more about the actual
+layout of the HAMT tree in `hamt.c`.
+
+This constant is used to define a data structure for storing iteration state.
+*/
+#define _Py_HAMT_MAX_TREE_DEPTH 8  /* sizes i_nodes[] and i_pos[] in PyHamtIteratorState */
+
+
+extern PyTypeObject _PyHamt_Type;  /* the type PyHamt_Check() tests against */
+extern PyTypeObject _PyHamt_ArrayNode_Type;
+extern PyTypeObject _PyHamt_BitmapNode_Type;
+extern PyTypeObject _PyHamt_CollisionNode_Type;
+extern PyTypeObject _PyHamtKeys_Type;  /* Keys/Values/Items: presumably the three iterator types -- confirm */
+extern PyTypeObject _PyHamtValues_Type;
+extern PyTypeObject _PyHamtItems_Type;
+
+
+/* other API */
+
+#define PyHamt_Check(o) Py_IS_TYPE((o), &_PyHamt_Type)  /* Py_IS_TYPE is an exact type comparison, not a subclass test */
+
+
+/* Abstract tree node. */
+typedef struct {
+    PyObject_HEAD  /* no fields beyond the object header */
+} PyHamtNode;
+
+
+/* An HAMT immutable mapping collection. */
+typedef struct {
+    PyObject_HEAD
+    PyHamtNode *h_root;  /* presumably the root node of the tree -- confirm in hamt.c */
+    PyObject *h_weakreflist;  /* presumably the weak-reference list slot -- confirm */
+    Py_ssize_t h_count;  /* presumably the size reported by _PyHamt_Len() -- confirm */
+} PyHamtObject;
+
+
+typedef struct {  /* NOTE(review): undocumented here; the node layout is described in hamt.c */
+    PyObject_VAR_HEAD
+    uint32_t b_bitmap;
+    PyObject *b_array[1];  /* declared with one element under a VAR_HEAD; presumably over-allocated -- confirm */
+} PyHamtNode_Bitmap;
+
+
+/* A struct to hold the state of depth-first traverse of the tree.
+
+   HAMT is an immutable collection.  Iterators will hold a strong reference
+   to it, and every node in the HAMT has strong references to its children.
+
+   So for iterators, we can implement zero allocations and zero reference
+   inc/dec depth-first iteration.
+
+   - i_nodes: an array of _Py_HAMT_MAX_TREE_DEPTH pointers to tree nodes
+   - i_level: the current node in i_nodes (an int8_t, so presumably its index)
+   - i_pos: an array of positions within nodes in i_nodes.
+*/
+typedef struct {
+    PyHamtNode *i_nodes[_Py_HAMT_MAX_TREE_DEPTH];
+    Py_ssize_t i_pos[_Py_HAMT_MAX_TREE_DEPTH];
+    int8_t i_level;
+} PyHamtIteratorState;
+
+
+/* Base iterator object.
+
+   Contains the iteration state, a pointer to the HAMT tree,
+   and a pointer to the 'yield function'.  The latter is a simple
+   function that returns a key/value tuple for the 'Items' iterator,
+   just a key for the 'Keys' iterator, and a value for the 'Values'
+   iterator.
+*/
+typedef struct {
+    PyObject_HEAD
+    PyHamtObject *hi_obj;  /* the HAMT tree being iterated */
+    PyHamtIteratorState hi_iter;  /* the iteration state */
+    binaryfunc hi_yield;  /* the 'yield function' described above */
+} PyHamtIterator;
+
+
+/* Create a new HAMT immutable mapping. */
+PyHamtObject * _PyHamt_New(void);
+
+/* Return a new collection based on "o", but with an additional
+   key/val pair. */
+PyHamtObject * _PyHamt_Assoc(PyHamtObject *o, PyObject *key, PyObject *val);
+
+/* Return a new collection based on "o", but without "key". */
+PyHamtObject * _PyHamt_Without(PyHamtObject *o, PyObject *key);
+
+/* Find "key" in the "o" collection.
+
+   Return:
+   - -1: An error occurred.
+   - 0: "key" wasn't found in "o".
+   - 1: "key" is in "o"; "*val" is set to its value (a borrowed ref).
+*/
+int _PyHamt_Find(PyHamtObject *o, PyObject *key, PyObject **val);
+
+/* Check if "v" is equal to "w".
+
+   Return:
+   - 0: v != w
+   - 1: v == w
+   - -1: An error occurred.
+*/
+int _PyHamt_Eq(PyHamtObject *v, PyHamtObject *w);
+
+/* Return the size of "o"; equivalent of "len(o)". */
+Py_ssize_t _PyHamt_Len(PyHamtObject *o);
+
+/* Return a Keys iterator over "o". */
+PyObject * _PyHamt_NewIterKeys(PyHamtObject *o);
+
+/* Return a Values iterator over "o". */
+PyObject * _PyHamt_NewIterValues(PyHamtObject *o);
+
+/* Return an Items iterator over "o". */
+PyObject * _PyHamt_NewIterItems(PyHamtObject *o);
+
+#endif /* !Py_INTERNAL_HAMT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_hashtable.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_hashtable.h
new file mode 100644
index 000000000000..f57978a8d614
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_hashtable.h
@@ -0,0 +1,149 @@
+#ifndef Py_INTERNAL_HASHTABLE_H
+#define Py_INTERNAL_HASHTABLE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+/* Single linked list */
+
+typedef struct _Py_slist_item_s {
+    struct _Py_slist_item_s *next;
+} _Py_slist_item_t;
+
+typedef struct {
+    _Py_slist_item_t *head;
+} _Py_slist_t;
+
+#define _Py_SLIST_ITEM_NEXT(ITEM) _Py_RVALUE(((_Py_slist_item_t *)(ITEM))->next)
+
+#define _Py_SLIST_HEAD(SLIST) _Py_RVALUE(((_Py_slist_t *)(SLIST))->head)
+
+
+/* _Py_hashtable: table entry */
+
+typedef struct {
+    /* used by _Py_hashtable_t.buckets to link entries */
+    _Py_slist_item_t _Py_slist_item;
+
+    Py_uhash_t key_hash;
+    void *key;
+    void *value;
+} _Py_hashtable_entry_t;
+
+
+/* _Py_hashtable: prototypes */
+
+/* Forward declaration */
+struct _Py_hashtable_t;
+typedef struct _Py_hashtable_t _Py_hashtable_t;
+
+typedef Py_uhash_t (*_Py_hashtable_hash_func) (const void *key);
+typedef int (*_Py_hashtable_compare_func) (const void *key1, const void *key2);
+typedef void (*_Py_hashtable_destroy_func) (void *key);
+typedef _Py_hashtable_entry_t* (*_Py_hashtable_get_entry_func)(_Py_hashtable_t *ht,
+                                                               const void *key);
+
+typedef struct {
+    // Allocate a memory block
+    void* (*malloc) (size_t size);
+
+    // Release a memory block
+    void (*free) (void *ptr);
+} _Py_hashtable_allocator_t;
+
+
+/* _Py_hashtable: table */
+struct _Py_hashtable_t {
+    size_t nentries; // Total number of entries in the table
+    size_t nbuckets;
+    _Py_slist_t *buckets;
+
+    _Py_hashtable_get_entry_func get_entry_func;
+    _Py_hashtable_hash_func hash_func;
+    _Py_hashtable_compare_func compare_func;
+    _Py_hashtable_destroy_func key_destroy_func;
+    _Py_hashtable_destroy_func value_destroy_func;
+    _Py_hashtable_allocator_t alloc;
+};
+
+/* Hash a pointer (void*) */
+PyAPI_FUNC(Py_uhash_t) _Py_hashtable_hash_ptr(const void *key);
+
+/* Comparison using memcmp() */
+PyAPI_FUNC(int) _Py_hashtable_compare_direct(
+    const void *key1,
+    const void *key2);
+
+PyAPI_FUNC(_Py_hashtable_t *) _Py_hashtable_new(
+    _Py_hashtable_hash_func hash_func,
+    _Py_hashtable_compare_func compare_func);
+
+PyAPI_FUNC(_Py_hashtable_t *) _Py_hashtable_new_full(
+    _Py_hashtable_hash_func hash_func,
+    _Py_hashtable_compare_func compare_func,
+    _Py_hashtable_destroy_func key_destroy_func,
+    _Py_hashtable_destroy_func value_destroy_func,
+    _Py_hashtable_allocator_t *allocator);
+
+PyAPI_FUNC(void) _Py_hashtable_destroy(_Py_hashtable_t *ht);
+
+PyAPI_FUNC(void) _Py_hashtable_clear(_Py_hashtable_t *ht);
+
+typedef int (*_Py_hashtable_foreach_func) (_Py_hashtable_t *ht,
+                                           const void *key, const void *value,
+                                           void *user_data);
+
+/* Call func() on each entry of the hashtable.
+   Iteration stops as soon as func() returns non-zero, and that value is
+   returned. Otherwise, the function returns 0. */
+PyAPI_FUNC(int) _Py_hashtable_foreach(
+    _Py_hashtable_t *ht,
+    _Py_hashtable_foreach_func func,
+    void *user_data);
+
+PyAPI_FUNC(size_t) _Py_hashtable_size(const _Py_hashtable_t *ht);
+PyAPI_FUNC(size_t) _Py_hashtable_len(const _Py_hashtable_t *ht);
+
+/* Add a new entry to the hash table. The key must not already be present.
+   Return 0 on success, -1 on memory error. */
+PyAPI_FUNC(int) _Py_hashtable_set(
+    _Py_hashtable_t *ht,
+    const void *key,
+    void *value);
+
+
+/* Look up the entry for "key" by calling the table's get_entry_func hook.
+   Return NULL if the key does not exist. */
+static inline _Py_hashtable_entry_t *
+_Py_hashtable_get_entry(_Py_hashtable_t *ht, const void *key)
+{
+    return ht->get_entry_func(ht, key);
+}
+
+
+/* Get value from an entry.
+   Return NULL if the entry is not found.
+
+   Use _Py_hashtable_get_entry() to distinguish entry value equal to NULL
+   and entry not found. */
+PyAPI_FUNC(void*) _Py_hashtable_get(_Py_hashtable_t *ht, const void *key);
+
+
+/* Remove a key and its associated value without calling key and value destroy
+   functions.
+
+   Return the removed value if the key was found.
+   Return NULL if the key was not found. */
+PyAPI_FUNC(void*) _Py_hashtable_steal(
+    _Py_hashtable_t *ht,
+    const void *key);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif   /* !Py_INTERNAL_HASHTABLE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_import.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_import.h
new file mode 100644
index 000000000000..376957bdc998
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_import.h
@@ -0,0 +1,183 @@
+#ifndef Py_LIMITED_API
+#ifndef Py_INTERNAL_IMPORT_H
+#define Py_INTERNAL_IMPORT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "pycore_hashtable.h"     // _Py_hashtable_t
+#include "pycore_time.h"          // _PyTime_t
+
+
+struct _import_runtime_state {
+    /* The builtin modules (defined in config.c). */
+    struct _inittab *inittab;
+    /* The most recent value assigned to a PyModuleDef.m_base.m_index.
+       This is incremented each time PyModuleDef_Init() is called,
+       which is just about every time an extension module is imported.
+       See PyInterpreterState.modules_by_index for more info. */
+    Py_ssize_t last_module_index;
+    struct {
+        /* A lock to guard the cache. */
+        PyThread_type_lock mutex;
+        /* The actual cache of (filename, name, PyModuleDef) for modules.
+           Only legacy (single-phase init) extension modules are added
+           and only if they support multiple initialization (m_size >= 0)
+           or are imported in the main interpreter.
+           This is initialized lazily in _PyImport_FixupExtensionObject().
+           Modules are added there and looked up in _imp.find_extension(). */
+        _Py_hashtable_t *hashtable;
+    } extensions;
+    /* Package context -- the full module name for package imports */
+    const char * pkgcontext;
+};
+
+struct _import_state {
+    /* cached sys.modules dictionary */
+    PyObject *modules;
+    /* This is the list of module objects for all legacy (single-phase init)
+       extension modules ever loaded in this process (i.e. imported
+       in this interpreter or in any other).  Py_None stands in for
+       modules that haven't actually been imported in this interpreter.
+
+       A module's index (PyModuleDef.m_base.m_index) is used to look up
+       the corresponding module object for this interpreter, if any.
+       (See PyState_FindModule().)  When any extension module
+       is initialized during import, its moduledef gets initialized by
+       PyModuleDef_Init(), and the first time that happens for each
+       PyModuleDef, its index gets set to the current value of
+       a global counter (see _PyRuntimeState.imports.last_module_index).
+       The entry for that index in this interpreter remains unset until
+       the module is actually imported here.  (Py_None is used as
+       a placeholder.)  Note that multi-phase init modules always get
+       an index for which there will never be a module set.
+
+       This is initialized lazily in PyState_AddModule(), which is also
+       where modules get added. */
+    PyObject *modules_by_index;
+    /* importlib module._bootstrap */
+    PyObject *importlib;
+    /* override for config->use_frozen_modules (for tests)
+       (-1: "off", 1: "on", 0: no override) */
+    int override_frozen_modules;
+    int override_multi_interp_extensions_check;
+#ifdef HAVE_DLOPEN
+    int dlopenflags;
+#endif
+    PyObject *import_func;
+    /* The global import lock. */
+    struct {
+        PyThread_type_lock mutex;
+        unsigned long thread;
+        int level;
+    } lock;
+    /* diagnostic info in PyImport_ImportModuleLevelObject() */
+    struct {
+        int import_level;
+        _PyTime_t accumulated;
+        int header;
+    } find_and_load;
+};
+
+#ifdef HAVE_DLOPEN
+#  include <dlfcn.h>
+#  if HAVE_DECL_RTLD_NOW
+#    define _Py_DLOPEN_FLAGS RTLD_NOW
+#  else
+#    define _Py_DLOPEN_FLAGS RTLD_LAZY
+#  endif
+#  define DLOPENFLAGS_INIT .dlopenflags = _Py_DLOPEN_FLAGS,
+#else
+#  define _Py_DLOPEN_FLAGS 0
+#  define DLOPENFLAGS_INIT
+#endif
+
+#define IMPORTS_INIT \
+    { \
+        DLOPENFLAGS_INIT \
+        .lock = { \
+            .mutex = NULL, \
+            .thread = PYTHREAD_INVALID_THREAD_ID, \
+            .level = 0, \
+        }, \
+        .find_and_load = { \
+            .header = 1, \
+        }, \
+    }
+
+extern void _PyImport_ClearCore(PyInterpreterState *interp);
+
+extern Py_ssize_t _PyImport_GetNextModuleIndex(void);
+extern const char * _PyImport_ResolveNameWithPackageContext(const char *name);
+extern const char * _PyImport_SwapPackageContext(const char *newcontext);
+
+extern int _PyImport_GetDLOpenFlags(PyInterpreterState *interp);
+extern void _PyImport_SetDLOpenFlags(PyInterpreterState *interp, int new_val);
+
+extern PyObject * _PyImport_InitModules(PyInterpreterState *interp);
+extern PyObject * _PyImport_GetModules(PyInterpreterState *interp);
+extern void _PyImport_ClearModules(PyInterpreterState *interp);
+
+extern void _PyImport_ClearModulesByIndex(PyInterpreterState *interp);
+
+extern int _PyImport_InitDefaultImportFunc(PyInterpreterState *interp);
+extern int _PyImport_IsDefaultImportFunc(
+        PyInterpreterState *interp,
+        PyObject *func);
+
+extern PyObject * _PyImport_GetImportlibLoader(
+        PyInterpreterState *interp,
+        const char *loader_name);
+extern PyObject * _PyImport_GetImportlibExternalLoader(
+        PyInterpreterState *interp,
+        const char *loader_name);
+extern PyObject * _PyImport_BlessMyLoader(
+        PyInterpreterState *interp,
+        PyObject *module_globals);
+extern PyObject * _PyImport_ImportlibModuleRepr(
+        PyInterpreterState *interp,
+        PyObject *module);
+
+
+extern PyStatus _PyImport_Init(void);
+extern void _PyImport_Fini(void);
+extern void _PyImport_Fini2(void);
+
+extern PyStatus _PyImport_InitCore(
+        PyThreadState *tstate,
+        PyObject *sysmod,
+        int importlib);
+extern PyStatus _PyImport_InitExternal(PyThreadState *tstate);
+extern void _PyImport_FiniCore(PyInterpreterState *interp);
+extern void _PyImport_FiniExternal(PyInterpreterState *interp);
+
+
+#ifdef HAVE_FORK
+extern PyStatus _PyImport_ReInitLock(PyInterpreterState *interp);
+#endif
+
+
+extern PyObject* _PyImport_GetBuiltinModuleNames(void);
+
+struct _module_alias {
+    const char *name;                 /* ASCII encoded string */
+    const char *orig;                 /* ASCII encoded string */
+};
+
+PyAPI_DATA(const struct _frozen *) _PyImport_FrozenBootstrap;
+PyAPI_DATA(const struct _frozen *) _PyImport_FrozenStdlib;
+PyAPI_DATA(const struct _frozen *) _PyImport_FrozenTest;
+extern const struct _module_alias * _PyImport_FrozenAliases;
+
+PyAPI_FUNC(int) _PyImport_CheckSubinterpIncompatibleExtensionAllowed(
+    const char *name);
+
+
+// for testing
+PyAPI_FUNC(int) _PyImport_ClearExtension(PyObject *name, PyObject *filename);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_IMPORT_H */
+#endif /* !Py_LIMITED_API */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_initconfig.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_initconfig.h
new file mode 100644
index 000000000000..4cbd14a61d45
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_initconfig.h
@@ -0,0 +1,179 @@
+#ifndef Py_INTERNAL_CORECONFIG_H
+#define Py_INTERNAL_CORECONFIG_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+/* Forward declaration */
+struct pyruntimestate;
+
+/* --- PyStatus ----------------------------------------------- */
+
+/* Almost all errors causing Python initialization to fail */
+#ifdef _MSC_VER
+   /* Visual Studio 2015 doesn't implement C99 __func__ in C */
+#  define _PyStatus_GET_FUNC() __FUNCTION__
+#else
+#  define _PyStatus_GET_FUNC() __func__
+#endif
+
+#define _PyStatus_OK() \
+    (PyStatus){._type = _PyStatus_TYPE_OK,}
+    /* other fields are set to 0 */
+#define _PyStatus_ERR(ERR_MSG) \
+    (PyStatus){ \
+        ._type = _PyStatus_TYPE_ERROR, \
+        .func = _PyStatus_GET_FUNC(), \
+        .err_msg = (ERR_MSG)}
+        /* other fields are set to 0 */
+#define _PyStatus_NO_MEMORY() _PyStatus_ERR("memory allocation failed")
+#define _PyStatus_EXIT(EXITCODE) \
+    (PyStatus){ \
+        ._type = _PyStatus_TYPE_EXIT, \
+        .exitcode = (EXITCODE)}
+#define _PyStatus_IS_ERROR(err) \
+    ((err)._type == _PyStatus_TYPE_ERROR)
+#define _PyStatus_IS_EXIT(err) \
+    ((err)._type == _PyStatus_TYPE_EXIT)
+#define _PyStatus_EXCEPTION(err) \
+    ((err)._type != _PyStatus_TYPE_OK)
+#define _PyStatus_UPDATE_FUNC(err) \
+    do { (err).func = _PyStatus_GET_FUNC(); } while (0)
+
+/* --- PyWideStringList ------------------------------------------------ */
+
+#define _PyWideStringList_INIT (PyWideStringList){.length = 0, .items = NULL}
+
+#ifndef NDEBUG
+PyAPI_FUNC(int) _PyWideStringList_CheckConsistency(const PyWideStringList *list);
+#endif
+PyAPI_FUNC(void) _PyWideStringList_Clear(PyWideStringList *list);
+PyAPI_FUNC(int) _PyWideStringList_Copy(PyWideStringList *list,
+    const PyWideStringList *list2);
+PyAPI_FUNC(PyStatus) _PyWideStringList_Extend(PyWideStringList *list,
+    const PyWideStringList *list2);
+PyAPI_FUNC(PyObject*) _PyWideStringList_AsList(const PyWideStringList *list);
+
+
+/* --- _PyArgv ---------------------------------------------------- */
+
+typedef struct _PyArgv {
+    Py_ssize_t argc;
+    int use_bytes_argv;
+    char * const *bytes_argv;
+    wchar_t * const *wchar_argv;
+} _PyArgv;
+
+PyAPI_FUNC(PyStatus) _PyArgv_AsWstrList(const _PyArgv *args,
+    PyWideStringList *list);
+
+
+/* --- Helper functions ------------------------------------------- */
+
+PyAPI_FUNC(int) _Py_str_to_int(
+    const char *str,
+    int *result);
+PyAPI_FUNC(const wchar_t*) _Py_get_xoption(
+    const PyWideStringList *xoptions,
+    const wchar_t *name);
+PyAPI_FUNC(const char*) _Py_GetEnv(
+    int use_environment,
+    const char *name);
+PyAPI_FUNC(void) _Py_get_env_flag(
+    int use_environment,
+    int *flag,
+    const char *name);
+
+/* Py_GetArgcArgv() helper */
+PyAPI_FUNC(void) _Py_ClearArgcArgv(void);
+
+
+/* --- _PyPreCmdline ------------------------------------------------- */
+
+typedef struct {
+    PyWideStringList argv;
+    PyWideStringList xoptions;     /* "-X value" option */
+    int isolated;             /* -I option */
+    int use_environment;      /* -E option */
+    int dev_mode;             /* -X dev and PYTHONDEVMODE */
+    int warn_default_encoding;     /* -X warn_default_encoding and PYTHONWARNDEFAULTENCODING */
+} _PyPreCmdline;
+
+#define _PyPreCmdline_INIT \
+    (_PyPreCmdline){ \
+        .use_environment = -1, \
+        .isolated = -1, \
+        .dev_mode = -1}
+/* Note: _PyPreCmdline_INIT sets other fields to 0/NULL */
+
+extern void _PyPreCmdline_Clear(_PyPreCmdline *cmdline);
+extern PyStatus _PyPreCmdline_SetArgv(_PyPreCmdline *cmdline,
+    const _PyArgv *args);
+extern PyStatus _PyPreCmdline_SetConfig(
+    const _PyPreCmdline *cmdline,
+    PyConfig *config);
+extern PyStatus _PyPreCmdline_Read(_PyPreCmdline *cmdline,
+    const PyPreConfig *preconfig);
+
+
+/* --- PyPreConfig ----------------------------------------------- */
+
+PyAPI_FUNC(void) _PyPreConfig_InitCompatConfig(PyPreConfig *preconfig);
+extern void _PyPreConfig_InitFromConfig(
+    PyPreConfig *preconfig,
+    const PyConfig *config);
+extern PyStatus _PyPreConfig_InitFromPreConfig(
+    PyPreConfig *preconfig,
+    const PyPreConfig *config2);
+extern PyObject* _PyPreConfig_AsDict(const PyPreConfig *preconfig);
+extern void _PyPreConfig_GetConfig(PyPreConfig *preconfig,
+    const PyConfig *config);
+extern PyStatus _PyPreConfig_Read(PyPreConfig *preconfig,
+    const _PyArgv *args);
+extern PyStatus _PyPreConfig_Write(const PyPreConfig *preconfig);
+
+
+/* --- PyConfig ---------------------------------------------- */
+
+typedef enum {
+    /* Py_Initialize() API: backward compatibility with Python 3.6 and 3.7 */
+    _PyConfig_INIT_COMPAT = 1,
+    _PyConfig_INIT_PYTHON = 2,
+    _PyConfig_INIT_ISOLATED = 3
+} _PyConfigInitEnum;
+
+PyAPI_FUNC(void) _PyConfig_InitCompatConfig(PyConfig *config);
+extern PyStatus _PyConfig_Copy(
+    PyConfig *config,
+    const PyConfig *config2);
+extern PyStatus _PyConfig_InitPathConfig(
+    PyConfig *config,
+    int compute_path_config);
+extern PyStatus _PyConfig_InitImportConfig(PyConfig *config);
+extern PyStatus _PyConfig_Read(PyConfig *config, int compute_path_config);
+extern PyStatus _PyConfig_Write(const PyConfig *config,
+    struct pyruntimestate *runtime);
+extern PyStatus _PyConfig_SetPyArgv(
+    PyConfig *config,
+    const _PyArgv *args);
+
+PyAPI_FUNC(PyObject*) _PyConfig_AsDict(const PyConfig *config);
+PyAPI_FUNC(int) _PyConfig_FromDict(PyConfig *config, PyObject *dict);
+
+extern void _Py_DumpPathConfig(PyThreadState *tstate);
+
+PyAPI_FUNC(PyObject*) _Py_Get_Getpath_CodeObject(void);
+
+
+/* --- Function used for testing ---------------------------------- */
+
+PyAPI_FUNC(PyObject*) _Py_GetConfigsAsDict(void);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_CORECONFIG_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_instruments.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_instruments.h
new file mode 100644
index 000000000000..b8591563d76b
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_instruments.h
@@ -0,0 +1,106 @@
+
+#ifndef Py_INTERNAL_INSTRUMENT_H
+#define Py_INTERNAL_INSTRUMENT_H
+
+
+#include "pycore_bitutils.h"      // _Py_popcount32
+#include "pycore_frame.h"
+
+#include "cpython/code.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define PY_MONITORING_TOOL_IDS 8
+
+/* Local events.
+ * These require bytecode instrumentation */
+
+#define PY_MONITORING_EVENT_PY_START 0
+#define PY_MONITORING_EVENT_PY_RESUME 1
+#define PY_MONITORING_EVENT_PY_RETURN 2
+#define PY_MONITORING_EVENT_PY_YIELD 3
+#define PY_MONITORING_EVENT_CALL 4
+#define PY_MONITORING_EVENT_LINE 5
+#define PY_MONITORING_EVENT_INSTRUCTION 6
+#define PY_MONITORING_EVENT_JUMP 7
+#define PY_MONITORING_EVENT_BRANCH 8
+#define PY_MONITORING_EVENT_STOP_ITERATION 9
+
+#define PY_MONITORING_IS_INSTRUMENTED_EVENT(ev) \
+    ((ev) < _PY_MONITORING_LOCAL_EVENTS)
+
+/* Other events, mainly exceptions */
+
+#define PY_MONITORING_EVENT_RAISE 10
+#define PY_MONITORING_EVENT_EXCEPTION_HANDLED 11
+#define PY_MONITORING_EVENT_PY_UNWIND 12
+#define PY_MONITORING_EVENT_PY_THROW 13
+#define PY_MONITORING_EVENT_RERAISE 14
+
+
+/* Ancillary events */
+
+#define PY_MONITORING_EVENT_C_RETURN 15
+#define PY_MONITORING_EVENT_C_RAISE 16
+
+
+typedef uint32_t _PyMonitoringEventSet;
+
+/* Tool IDs */
+
+/* These are defined in PEP 669 for convenience to avoid clashes */
+#define PY_MONITORING_DEBUGGER_ID 0
+#define PY_MONITORING_COVERAGE_ID 1
+#define PY_MONITORING_PROFILER_ID 2
+#define PY_MONITORING_OPTIMIZER_ID 5
+
+/* Internal IDs used to support sys.setprofile() and sys.settrace() */
+#define PY_MONITORING_SYS_PROFILE_ID 6
+#define PY_MONITORING_SYS_TRACE_ID 7
+
+
+PyObject *_PyMonitoring_RegisterCallback(int tool_id, int event_id, PyObject *obj);
+
+int _PyMonitoring_SetEvents(int tool_id, _PyMonitoringEventSet events);
+
+extern int
+_Py_call_instrumentation(PyThreadState *tstate, int event,
+    _PyInterpreterFrame *frame, _Py_CODEUNIT *instr);
+
+extern int
+_Py_call_instrumentation_line(PyThreadState *tstate, _PyInterpreterFrame* frame,
+                              _Py_CODEUNIT *instr, _Py_CODEUNIT *prev);
+
+extern int
+_Py_call_instrumentation_instruction(
+    PyThreadState *tstate, _PyInterpreterFrame* frame, _Py_CODEUNIT *instr);
+
+_Py_CODEUNIT *
+_Py_call_instrumentation_jump(
+    PyThreadState *tstate, int event,
+    _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, _Py_CODEUNIT *target);
+
+extern int
+_Py_call_instrumentation_arg(PyThreadState *tstate, int event,
+    _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, PyObject *arg);
+
+extern int
+_Py_call_instrumentation_2args(PyThreadState *tstate, int event,
+    _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, PyObject *arg0, PyObject *arg1);
+
+extern void
+_Py_call_instrumentation_exc2(PyThreadState *tstate, int event,
+    _PyInterpreterFrame *frame, _Py_CODEUNIT *instr, PyObject *arg0, PyObject *arg1);
+
+extern int
+_Py_Instrumentation_GetLine(PyCodeObject *code, int index);
+
+extern PyObject _PyInstrumentation_MISSING;
+extern PyObject _PyInstrumentation_DISABLE;
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_INSTRUMENT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_interp.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_interp.h
new file mode 100644
index 000000000000..37cc88ed081b
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_interp.h
@@ -0,0 +1,275 @@
+#ifndef Py_INTERNAL_INTERP_H
+#define Py_INTERNAL_INTERP_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include <stdbool.h>
+
+#include "pycore_ast_state.h"     // struct ast_state
+#include "pycore_atexit.h"        // struct atexit_state
+#include "pycore_atomic.h"        // _Py_atomic_address
+#include "pycore_ceval_state.h"   // struct _ceval_state
+#include "pycore_code.h"          // struct callable_cache
+#include "pycore_context.h"       // struct _Py_context_state
+#include "pycore_dict_state.h"    // struct _Py_dict_state
+#include "pycore_dtoa.h"          // struct _dtoa_state
+#include "pycore_exceptions.h"    // struct _Py_exc_state
+#include "pycore_floatobject.h"   // struct _Py_float_state
+#include "pycore_function.h"      // FUNC_MAX_WATCHERS
+#include "pycore_genobject.h"     // struct _Py_async_gen_state
+#include "pycore_gc.h"            // struct _gc_runtime_state
+#include "pycore_global_objects.h"  // struct _Py_interp_static_objects
+#include "pycore_import.h"        // struct _import_state
+#include "pycore_instruments.h"   // _PY_MONITORING_EVENTS
+#include "pycore_list.h"          // struct _Py_list_state
+#include "pycore_object_state.h"   // struct _py_object_state
+#include "pycore_obmalloc.h"      // struct obmalloc_state
+#include "pycore_tuple.h"         // struct _Py_tuple_state
+#include "pycore_typeobject.h"    // struct type_cache
+#include "pycore_unicodeobject.h" // struct _Py_unicode_state
+#include "pycore_warnings.h"      // struct _warnings_runtime_state
+
+
+struct _Py_long_state {
+    int max_str_digits;
+};
+
+
+/* cross-interpreter data registry */
+
+/* For now we use a global registry of shareable classes.  An
+   alternative would be to add a tp_* slot for a class's
+   crossinterpdatafunc. It would be simpler and more efficient. */
+
+struct _xidregitem;
+
+struct _xidregitem {
+    struct _xidregitem *prev;
+    struct _xidregitem *next;
+    /* This can be a dangling pointer, but only if weakref is set. */
+    PyTypeObject *cls;
+    /* This is NULL for builtin types. */
+    PyObject *weakref;
+    size_t refcount;
+    crossinterpdatafunc getdata;
+};
+
+struct _xidregistry {
+    PyThread_type_lock mutex;
+    struct _xidregitem *head;
+};
+
+
+/* interpreter state */
+
+/* PyInterpreterState holds the global state for one of the runtime's
+   interpreters.  Typically the initial (main) interpreter is the only one.
+
+   The PyInterpreterState typedef is in Include/pytypedefs.h.
+   */
+struct _is {
+
+    PyInterpreterState *next;
+
+    int64_t id;
+    int64_t id_refcount;
+    int requires_idref;
+    PyThread_type_lock id_mutex;
+
+    /* Has been initialized to a safe state.
+
+       In order to be effective, this must be set to 0 during or right
+       after allocation. */
+    int _initialized;
+    int finalizing;
+
+    uint64_t monitoring_version;
+    uint64_t last_restart_version;
+    struct pythreads {
+        uint64_t next_unique_id;
+        /* The linked list of threads, newest first. */
+        PyThreadState *head;
+        /* Used in Modules/_threadmodule.c. */
+        long count;
+        /* Support for runtime thread stack size tuning.
+           A value of 0 means using the platform's default stack size
+           or the size specified by the THREAD_STACK_SIZE macro. */
+        /* Used in Python/thread.c. */
+        size_t stacksize;
+    } threads;
+
+    /* Reference to the _PyRuntime global variable. This field exists
+       to not have to pass runtime in addition to tstate to a function.
+       Get runtime from tstate: tstate->interp->runtime. */
+    struct pyruntimestate *runtime;
+
+    /* Set by Py_EndInterpreter().
+
+       Use _PyInterpreterState_GetFinalizing()
+       and _PyInterpreterState_SetFinalizing()
+       to access it, don't access it directly. */
+    _Py_atomic_address _finalizing;
+
+    struct _gc_runtime_state gc;
+
+    /* The following fields are here to avoid allocation during init.
+       The data is exposed through PyInterpreterState pointer fields.
+       These fields should not be accessed directly outside of init.
+
+       All other PyInterpreterState pointer fields are populated when
+       needed and default to NULL.
+
+       For now there are some exceptions to that rule, which require
+       allocation during init.  These will be addressed on a case-by-case
+       basis.  Also see _PyRuntimeState regarding the various mutex fields.
+       */
+
+    // Dictionary of the sys module
+    PyObject *sysdict;
+
+    // Dictionary of the builtins module
+    PyObject *builtins;
+
+    struct _ceval_state ceval;
+
+    struct _import_state imports;
+
+    /* The per-interpreter GIL, which might not be used. */
+    struct _gil_runtime_state _gil;
+
+     /* ---------- IMPORTANT ---------------------------
+     The fields above this line are declared as early as
+     possible to facilitate out-of-process observability
+     tools. */
+
+    PyObject *codec_search_path;
+    PyObject *codec_search_cache;
+    PyObject *codec_error_registry;
+    int codecs_initialized;
+
+    PyConfig config;
+    unsigned long feature_flags;
+
+    PyObject *dict;  /* Stores per-interpreter state */
+
+    PyObject *sysdict_copy;
+    PyObject *builtins_copy;
+    // Initialized to _PyEval_EvalFrameDefault().
+    _PyFrameEvalFunction eval_frame;
+
+    PyFunction_WatchCallback func_watchers[FUNC_MAX_WATCHERS];
+    // One bit is set for each non-NULL entry in func_watchers
+    uint8_t active_func_watchers;
+
+    Py_ssize_t co_extra_user_count;
+    freefunc co_extra_freefuncs[MAX_CO_EXTRA_USERS];
+
+#ifdef HAVE_FORK
+    PyObject *before_forkers;
+    PyObject *after_forkers_parent;
+    PyObject *after_forkers_child;
+#endif
+
+    struct _warnings_runtime_state warnings;
+    struct atexit_state atexit;
+
+    struct _obmalloc_state obmalloc;
+
+    PyObject *audit_hooks;
+    PyType_WatchCallback type_watchers[TYPE_MAX_WATCHERS];
+    PyCode_WatchCallback code_watchers[CODE_MAX_WATCHERS];
+    // One bit is set for each non-NULL entry in code_watchers
+    uint8_t active_code_watchers;
+
+    struct _py_object_state object_state;
+    struct _Py_unicode_state unicode;
+    struct _Py_float_state float_state;
+    struct _Py_long_state long_state;
+    struct _dtoa_state dtoa;
+    struct _py_func_state func_state;
+    /* Using a cache is very effective since typically only a single slice is
+       created and then deleted again. */
+    PySliceObject *slice_cache;
+
+    struct _Py_tuple_state tuple;
+    struct _Py_list_state list;
+    struct _Py_dict_state dict_state;
+    struct _Py_async_gen_state async_gen;
+    struct _Py_context_state context;
+    struct _Py_exc_state exc_state;
+
+    struct ast_state ast;
+    struct types_state types;
+    struct callable_cache callable_cache;
+    PyCodeObject *interpreter_trampoline;
+
+    _Py_GlobalMonitors monitors;
+    bool f_opcode_trace_set;
+    bool sys_profile_initialized;
+    bool sys_trace_initialized;
+    Py_ssize_t sys_profiling_threads; /* Count of threads with c_profilefunc set */
+    Py_ssize_t sys_tracing_threads; /* Count of threads with c_tracefunc set */
+    PyObject *monitoring_callables[PY_MONITORING_TOOL_IDS][_PY_MONITORING_EVENTS];
+    PyObject *monitoring_tool_names[PY_MONITORING_TOOL_IDS];
+
+    struct _Py_interp_cached_objects cached_objects;
+    struct _Py_interp_static_objects static_objects;
+
+    // XXX Remove this field once we have a tp_* slot.
+    struct _xidregistry xidregistry;
+    /* The thread currently executing in the __main__ module, if any. */
+    PyThreadState *threads_main;
+    /* The ID of the OS thread in which we are finalizing.
+       We use _Py_atomic_address instead of adding a new _Py_atomic_ulong. */
+    _Py_atomic_address _finalizing_id;
+
+   /* the initial PyInterpreterState.threads.head */
+    PyThreadState _initial_thread;
+};
+
+
+/* other API */
+
+extern void _PyInterpreterState_Clear(PyThreadState *tstate);
+
+
+static inline PyThreadState*
+_PyInterpreterState_GetFinalizing(PyInterpreterState *interp) {
+    return (PyThreadState*)_Py_atomic_load_relaxed(&interp->_finalizing);
+}
+
+static inline unsigned long
+_PyInterpreterState_GetFinalizingID(PyInterpreterState *interp) {
+    return (unsigned long)_Py_atomic_load_relaxed(&interp->_finalizing_id);
+}
+
+static inline void
+_PyInterpreterState_SetFinalizing(PyInterpreterState *interp, PyThreadState *tstate) {
+    _Py_atomic_store_relaxed(&interp->_finalizing, (uintptr_t)tstate);  // record the finalizing thread state (NULL clears it)
+    if (tstate == NULL) {
+        _Py_atomic_store_relaxed(&interp->_finalizing_id, 0);  // no finalizing thread: reset the recorded thread id to 0
+    }
+    else {
+        // XXX Re-enable this assert once gh-109860 is fixed.
+        //assert(tstate->thread_id == PyThread_get_thread_ident());
+        _Py_atomic_store_relaxed(&interp->_finalizing_id,  // second, independent relaxed store: mirror tstate->thread_id
+                                 (uintptr_t)tstate->thread_id);
+    }
+}
+
+
+PyAPI_FUNC(PyInterpreterState*) _PyInterpreterState_LookUpID(int64_t);
+
+PyAPI_FUNC(int) _PyInterpreterState_IDInitref(PyInterpreterState *);
+PyAPI_FUNC(int) _PyInterpreterState_IDIncref(PyInterpreterState *);
+PyAPI_FUNC(void) _PyInterpreterState_IDDecref(PyInterpreterState *);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_INTERP_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_intrinsics.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_intrinsics.h
new file mode 100644
index 000000000000..39f15681b7b2
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_intrinsics.h
@@ -0,0 +1,32 @@
+// Auto-generated by Tools/build/generate_opcode_h.py from Lib/opcode.py
+
+/* Unary Functions: */
+#define INTRINSIC_1_INVALID                      0
+#define INTRINSIC_PRINT                          1
+#define INTRINSIC_IMPORT_STAR                    2
+#define INTRINSIC_STOPITERATION_ERROR            3
+#define INTRINSIC_ASYNC_GEN_WRAP                 4
+#define INTRINSIC_UNARY_POSITIVE                 5
+#define INTRINSIC_LIST_TO_TUPLE                  6
+#define INTRINSIC_TYPEVAR                        7
+#define INTRINSIC_PARAMSPEC                      8
+#define INTRINSIC_TYPEVARTUPLE                   9
+#define INTRINSIC_SUBSCRIPT_GENERIC             10
+#define INTRINSIC_TYPEALIAS                     11
+
+#define MAX_INTRINSIC_1                         11
+
+
+/* Binary Functions: */
+#define INTRINSIC_2_INVALID                      0
+#define INTRINSIC_PREP_RERAISE_STAR              1
+#define INTRINSIC_TYPEVAR_WITH_BOUND             2
+#define INTRINSIC_TYPEVAR_WITH_CONSTRAINTS       3
+#define INTRINSIC_SET_FUNCTION_TYPE_PARAMS       4
+
+#define MAX_INTRINSIC_2                          4
+
+typedef PyObject *(*instrinsic_func1)(PyThreadState* tstate, PyObject *value);
+typedef PyObject *(*instrinsic_func2)(PyThreadState* tstate, PyObject *value1, PyObject *value2);
+extern const instrinsic_func1 _PyIntrinsics_UnaryFunctions[];
+extern const instrinsic_func2 _PyIntrinsics_BinaryFunctions[];
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_list.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_list.h
new file mode 100644
index 000000000000..2fcbe12cd655
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_list.h
@@ -0,0 +1,83 @@
+#ifndef Py_INTERNAL_LIST_H
+#define Py_INTERNAL_LIST_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "listobject.h"           // _PyList_CAST()
+
+
+/* runtime lifecycle */
+
+extern void _PyList_Fini(PyInterpreterState *);
+
+
+/* other API */
+
+#ifndef WITH_FREELISTS
+// without freelists
+#  define PyList_MAXFREELIST 0
+#endif
+
+/* Empty list reuse scheme to save calls to malloc and free */
+#ifndef PyList_MAXFREELIST
+#  define PyList_MAXFREELIST 80
+#endif
+
+struct _Py_list_state {  // list free-list storage; has no members when PyList_MAXFREELIST is 0
+#if PyList_MAXFREELIST > 0
+    PyListObject *free_list[PyList_MAXFREELIST];  // up to PyList_MAXFREELIST list objects kept for reuse
+    int numfree;  // presumably the number of valid entries in free_list -- confirm in listobject.c
+#endif
+};
+
+#define _PyList_ITEMS(op) _Py_RVALUE(_PyList_CAST(op)->ob_item)
+
+extern int
+_PyList_AppendTakeRefListResize(PyListObject *self, PyObject *newitem);
+
+static inline int
+_PyList_AppendTakeRef(PyListObject *self, PyObject *newitem)
+{
+    // Store newitem in the spare slot if one exists; otherwise defer to the resize helper.
+    assert(self != NULL && newitem != NULL);
+    assert(PyList_Check(self));
+    const Py_ssize_t len = PyList_GET_SIZE(self);
+    assert((size_t)len + 1 < PY_SSIZE_T_MAX);
+    if (self->allocated <= len) {
+        return _PyList_AppendTakeRefListResize(self, newitem);
+    }
+    PyList_SET_ITEM(self, len, newitem);
+    Py_SET_SIZE(self, len + 1);
+    return 0;
+}
+
+// Fill dest[0..len_dest) by repeating its first len_src bytes; the copied prefix grows each pass
+static inline void
+_Py_memory_repeat(char* dest, Py_ssize_t len_dest, Py_ssize_t len_src)
+{
+    assert(len_src > 0);
+    for (Py_ssize_t filled = len_src; filled < len_dest; ) {
+        const Py_ssize_t remaining = len_dest - filled;
+        const Py_ssize_t chunk = Py_MIN(filled, remaining);
+        memcpy(dest + filled, dest, chunk);
+        filled += chunk;
+    }
+}
+
+typedef struct {  // object layout of a list iterator
+    PyObject_HEAD
+    Py_ssize_t it_index;  /* presumably the next index to visit in it_seq -- confirm */
+    PyListObject *it_seq; /* Set to NULL when iterator is exhausted */
+} _PyListIterObject;
+
+extern PyObject *_PyList_FromArraySteal(PyObject *const *src, Py_ssize_t n);
+
+#ifdef __cplusplus
+}
+#endif
+#endif   /* !Py_INTERNAL_LIST_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_long.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_long.h
new file mode 100644
index 000000000000..64c00cb14754
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_long.h
@@ -0,0 +1,258 @@
+#ifndef Py_INTERNAL_LONG_H
+#define Py_INTERNAL_LONG_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_global_objects.h"  // _PY_NSMALLNEGINTS
+#include "pycore_runtime.h"       // _PyRuntime
+
+/*
+ * Default int base conversion size limitation: Denial of Service prevention.
+ *
+ * Chosen such that this isn't wildly slow on modern hardware and so that
+ * everyone's existing deployed numpy test suite passes before
+ * https://github.com/numpy/numpy/issues/22098 is widely available.
+ *
+ * $ python -m timeit -s 's = "1"*4300' 'int(s)'
+ * 2000 loops, best of 5: 125 usec per loop
+ * $ python -m timeit -s 's = "1"*4300; v = int(s)' 'str(v)'
+ * 1000 loops, best of 5: 311 usec per loop
+ * (zen2 cloud VM)
+ *
+ * 4300 decimal digits fits a ~14284 bit number.
+ */
+#define _PY_LONG_DEFAULT_MAX_STR_DIGITS 4300
+/*
+ * Threshold for max digits check.  For performance reasons int() and
+ * int.__str__() don't check values that are smaller than this
+ * threshold.  Acts as a guaranteed minimum size limit for bignums that
+ * applications can expect from CPython.
+ *
+ * % python -m timeit -s 's = "1"*640; v = int(s)' 'str(int(s))'
+ * 20000 loops, best of 5: 12 usec per loop
+ *
+ * "640 digits should be enough for anyone." - gps
+ * fits a ~2126 bit decimal number.
+ */
+#define _PY_LONG_MAX_STR_DIGITS_THRESHOLD 640
+
+#if ((_PY_LONG_DEFAULT_MAX_STR_DIGITS != 0) && \
+   (_PY_LONG_DEFAULT_MAX_STR_DIGITS < _PY_LONG_MAX_STR_DIGITS_THRESHOLD))
+# error "_PY_LONG_DEFAULT_MAX_STR_DIGITS smaller than threshold."
+#endif
+
+
+/* runtime lifecycle */
+
+extern PyStatus _PyLong_InitTypes(PyInterpreterState *);
+extern void _PyLong_FiniTypes(PyInterpreterState *interp);
+
+
+/* other API */
+
+#define _PyLong_SMALL_INTS _Py_SINGLETON(small_ints)
+
+// _PyLong_GetZero() and _PyLong_GetOne() must always be available
+// _PyLong_FromUnsignedChar must always be available
+#if _PY_NSMALLPOSINTS < 257
+#  error "_PY_NSMALLPOSINTS must be greater than or equal to 257"
+#endif
+
+// Borrowed reference to the cached int 0, the small-int table entry at
+// index _PY_NSMALLNEGINTS; never NULL.
+static inline PyObject* _PyLong_GetZero(void)
+{ return (PyObject *)(_PyLong_SMALL_INTS + _PY_NSMALLNEGINTS); }
+
+// Borrowed reference to the cached int 1, the small-int table entry at
+// index _PY_NSMALLNEGINTS + 1; never NULL.
+static inline PyObject* _PyLong_GetOne(void)
+{ return (PyObject *)(_PyLong_SMALL_INTS + _PY_NSMALLNEGINTS + 1); }
+
+static inline PyObject* _PyLong_FromUnsignedChar(unsigned char i)
+{   // New reference to the cached small int for i; _PY_NSMALLPOSINTS >= 257 is enforced above
+    return Py_NewRef((PyObject *)(_PyLong_SMALL_INTS + _PY_NSMALLNEGINTS + i));
+}
+
+PyObject *_PyLong_Add(PyLongObject *left, PyLongObject *right);
+PyObject *_PyLong_Multiply(PyLongObject *left, PyLongObject *right);
+PyObject *_PyLong_Subtract(PyLongObject *left, PyLongObject *right);
+
+/* Used by Python/mystrtoul.c, _PyBytes_FromHex(),
+   _PyBytes_DecodeEscape(), etc. */
+PyAPI_DATA(unsigned char) _PyLong_DigitValue[256];
+
+/* Format the object based on the format_spec, as defined in PEP 3101
+   (Advanced String Formatting). */
+PyAPI_FUNC(int) _PyLong_FormatAdvancedWriter(
+    _PyUnicodeWriter *writer,
+    PyObject *obj,
+    PyObject *format_spec,
+    Py_ssize_t start,
+    Py_ssize_t end);
+
+PyAPI_FUNC(int) _PyLong_FormatWriter(
+    _PyUnicodeWriter *writer,
+    PyObject *obj,
+    int base,
+    int alternate);
+
+PyAPI_FUNC(char*) _PyLong_FormatBytesWriter(
+    _PyBytesWriter *writer,
+    char *str,
+    PyObject *obj,
+    int base,
+    int alternate);
+
+/* Long value tag bits:
+ * 0-1: Sign bits value = (1-sign), ie. negative=2, positive=0, zero=1.
+ * 2: Reserved for immortality bit
+ * 3+ Unsigned digit count
+ */
+#define SIGN_MASK 3
+#define SIGN_ZERO 1
+#define SIGN_NEGATIVE 2
+#define NON_SIZE_BITS 3
+
+/* The functions _PyLong_IsCompact and _PyLong_CompactValue are defined
+ * in Include/cpython/longobject.h, since they need to be inline.
+ *
+ * "Compact" values have at least one bit to spare,
+ * so that addition and subtraction can be performed on the values
+ * without risk of overflow.
+ *
+ * The inline functions need tag bits.
+ * For readability, rather than do `#define SIGN_MASK _PyLong_SIGN_MASK`
+ * we define them to the numbers in both places and then assert that
+ * they're the same.
+ */
+static_assert(SIGN_MASK == _PyLong_SIGN_MASK, "SIGN_MASK does not match _PyLong_SIGN_MASK");
+static_assert(NON_SIZE_BITS == _PyLong_NON_SIZE_BITS, "NON_SIZE_BITS does not match _PyLong_NON_SIZE_BITS");
+
+/* All "compact" values are guaranteed to fit into
+ * a Py_ssize_t with at least one bit to spare.
+ * In other words, for 64 bit machines, compact
+ * will be signed 63 (or fewer) bit values
+ */
+
+/* Return 1 if the argument is a non-negative compact int (zero, or positive with one digit) */
+static inline int
+_PyLong_IsNonNegativeCompact(const PyLongObject* op) {
+    assert(PyLong_Check(op));
+    return op->long_value.lv_tag <= (1 << NON_SIZE_BITS);  // zero has tag 1; a positive one-digit value has tag 1 << NON_SIZE_BITS
+}
+
+
+static inline int
+_PyLong_BothAreCompact(const PyLongObject* a, const PyLongObject* b) {
+    assert(PyLong_Check(a));
+    assert(PyLong_Check(b));
+    return (a->long_value.lv_tag | b->long_value.lv_tag) < (2 << NON_SIZE_BITS);  // true only if neither tag has a digit count above 1
+}
+
+static inline bool
+_PyLong_IsZero(const PyLongObject *op)
+{
+    return (op->long_value.lv_tag & SIGN_MASK) == SIGN_ZERO;  // sign bits equal to 1 encode zero
+}
+
+static inline bool
+_PyLong_IsNegative(const PyLongObject *op)
+{
+    return (op->long_value.lv_tag & SIGN_MASK) == SIGN_NEGATIVE;  // sign bits equal to 2 encode a negative value
+}
+
+static inline bool
+_PyLong_IsPositive(const PyLongObject *op)
+{
+    return (op->long_value.lv_tag & SIGN_MASK) == 0;  // sign bits equal to 0 encode a strictly positive value
+}
+
+static inline Py_ssize_t
+_PyLong_DigitCount(const PyLongObject *op)
+{
+    assert(PyLong_Check(op));
+    return op->long_value.lv_tag >> NON_SIZE_BITS;  // shift out the sign/reserved bits, leaving the unsigned digit count
+}
+
+/* Equivalent to _PyLong_DigitCount(op) * _PyLong_NonCompactSign(op) */
+static inline Py_ssize_t
+_PyLong_SignedDigitCount(const PyLongObject *op)
+{
+    assert(PyLong_Check(op));
+    Py_ssize_t sign = 1 - (op->long_value.lv_tag & SIGN_MASK);  // sign bits 0/1/2 become +1/0/-1
+    return sign * (Py_ssize_t)(op->long_value.lv_tag >> NON_SIZE_BITS);  // digit count carrying the value's sign
+}
+
+static inline int
+_PyLong_CompactSign(const PyLongObject *op)
+{
+    assert(PyLong_Check(op));
+    assert(_PyLong_IsCompact(op));  // only valid for compact values
+    return 1 - (op->long_value.lv_tag & SIGN_MASK);  // +1, 0 or -1 for positive, zero, negative
+}
+
+static inline int
+_PyLong_NonCompactSign(const PyLongObject *op)
+{
+    assert(PyLong_Check(op));
+    assert(!_PyLong_IsCompact(op));  // only valid for non-compact values
+    return 1 - (op->long_value.lv_tag & SIGN_MASK);  // same formula as _PyLong_CompactSign()
+}
+
+/* Do a and b have the same sign? Zero counts as its own sign, distinct from positive and negative. */
+static inline int
+_PyLong_SameSign(const PyLongObject *a, const PyLongObject *b)
+{
+    return (a->long_value.lv_tag & SIGN_MASK) == (b->long_value.lv_tag & SIGN_MASK);  // compare only the sign bits
+}
+
+#define TAG_FROM_SIGN_AND_SIZE(sign, size) ((1 - (sign)) | ((size) << NON_SIZE_BITS))  /* sign is -1, 0 or 1; size is the digit count */
+
+static inline void
+_PyLong_SetSignAndDigitCount(PyLongObject *op, int sign, Py_ssize_t size)
+{   // Overwrite op's tag with the given sign (-1, 0 or 1) and digit count.
+    assert(size >= 0);
+    assert(-1 <= sign && sign <= 1);
+    assert(sign != 0 || size == 0);  // a zero value must have no digits
+    op->long_value.lv_tag = TAG_FROM_SIGN_AND_SIZE(sign, (size_t)size);  // replaces every bit of the tag
+}
+
+static inline void
+_PyLong_SetDigitCount(PyLongObject *op, Py_ssize_t size)
+{   // Replace the digit count and keep the current sign bits.
+    assert(size >= 0);
+    op->long_value.lv_tag = (((size_t)size) << NON_SIZE_BITS) | (op->long_value.lv_tag & SIGN_MASK);  // the reserved bit 2 is not carried over
+}
+
+#define NON_SIZE_MASK (~((1 << NON_SIZE_BITS) - 1))  /* keeps the digit-count bits, clears the low NON_SIZE_BITS bits */
+
+static inline void
+_PyLong_FlipSign(PyLongObject *op) {
+    unsigned int flipped_sign = 2 - (op->long_value.lv_tag & SIGN_MASK);  // 0 <-> 2 (positive <-> negative); 1 (zero) stays 1
+    op->long_value.lv_tag &= NON_SIZE_MASK;  // clear the low NON_SIZE_BITS bits, keep the digit count
+    op->long_value.lv_tag |= flipped_sign;
+}
+
+#define _PyLong_DIGIT_INIT(val) /* static initializer for an int object holding zero or a single digit */ \
+    { \
+        .ob_base = _PyObject_HEAD_INIT(&PyLong_Type) /* the macro supplies its own trailing ',' */ \
+        .long_value  = { \
+            .lv_tag = TAG_FROM_SIGN_AND_SIZE( \
+                (val) == 0 ? 0 : ((val) < 0 ? -1 : 1), /* sign */ \
+                (val) == 0 ? 0 : 1), /* digit count: 0 for zero, otherwise 1 */ \
+            { ((val) >= 0 ? (val) : -(val)) }, /* the one digit stores the absolute value */ \
+        } \
+    }
+
+#define _PyLong_FALSE_TAG TAG_FROM_SIGN_AND_SIZE(0, 0)  /* tag of a zero value: sign 0, no digits */
+#define _PyLong_TRUE_TAG TAG_FROM_SIGN_AND_SIZE(1, 1)   /* tag of a positive one-digit value */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_LONG_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_memoryobject.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_memoryobject.h
new file mode 100644
index 000000000000..fe19e3f9611a
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_memoryobject.h
@@ -0,0 +1,18 @@
+#ifndef Py_INTERNAL_MEMORYOBJECT_H
+#define Py_INTERNAL_MEMORYOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+PyObject *
+_PyMemoryView_FromBufferProc(PyObject *v, int flags,
+                             getbufferproc bufferproc);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_MEMORYOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_moduleobject.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_moduleobject.h
new file mode 100644
index 000000000000..15a1bcb6ae51
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_moduleobject.h
@@ -0,0 +1,45 @@
+#ifndef Py_INTERNAL_MODULEOBJECT_H
+#define Py_INTERNAL_MODULEOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+typedef struct {  // object layout of a module
+    PyObject_HEAD
+    PyObject *md_dict;      // returned by _PyModule_GetDict(), which asserts it is non-NULL
+    PyModuleDef *md_def;    // returned by _PyModule_GetDef()
+    void *md_state;         // returned by _PyModule_GetState()
+    PyObject *md_weaklist;  // presumably the head of the module's weak-reference list -- confirm
+    // for logging purposes after md_dict is cleared
+    PyObject *md_name;
+} PyModuleObject;
+
+static inline PyModuleDef* _PyModule_GetDef(PyObject *mod) {
+    assert(PyModule_Check(mod));  // the cast below is only valid for module objects
+    return ((PyModuleObject *)mod)->md_def;  // direct field read, no error path
+}
+
+static inline void* _PyModule_GetState(PyObject* mod) {
+    assert(PyModule_Check(mod));  // the cast below is only valid for module objects
+    return ((PyModuleObject *)mod)->md_state;  // direct field read, no error path
+}
+
+static inline PyObject* _PyModule_GetDict(PyObject *mod) {
+    assert(PyModule_Check(mod));
+    PyModuleObject *module = (PyModuleObject *)mod;
+    PyObject *dict = module->md_dict;
+    assert(dict != NULL);  // _PyModule_GetDict(mod) must not be used after calling module_clear(mod)
+    return dict;
+}
+
+PyObject* _Py_module_getattro_impl(PyModuleObject *m, PyObject *name, int suppress);
+PyObject* _Py_module_getattro(PyModuleObject *m, PyObject *name);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_MODULEOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_namespace.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_namespace.h
new file mode 100644
index 000000000000..cb76f040693d
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_namespace.h
@@ -0,0 +1,20 @@
+// Simple namespace object interface
+
+#ifndef Py_INTERNAL_NAMESPACE_H
+#define Py_INTERNAL_NAMESPACE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+PyAPI_DATA(PyTypeObject) _PyNamespace_Type;
+
+PyAPI_FUNC(PyObject *) _PyNamespace_New(PyObject *kwds);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // !Py_INTERNAL_NAMESPACE_H
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_object.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_object.h
new file mode 100644
index 000000000000..7a2f13a21bda
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_object.h
@@ -0,0 +1,443 @@
+#ifndef Py_INTERNAL_OBJECT_H
+#define Py_INTERNAL_OBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include <stdbool.h>
+#include "pycore_gc.h"            // _PyObject_GC_IS_TRACKED()
+#include "pycore_interp.h"        // PyInterpreterState.gc
+#include "pycore_pystate.h"       // _PyInterpreterState_GET()
+#include "pycore_runtime.h"       // _PyRuntime
+
+/* We need to maintain an internal copy of Py{Var}Object_HEAD_INIT to avoid
+   designated initializer conflicts in C++20. If we use the definition in
+   object.h, we will be mixing designated and non-designated initializers in
+   pycore objects, which is forbidden in C++20. However, if we then use
+   designated initializers in object.h, extensions that do not use them break.
+   Furthermore, we can't use designated initializers in extensions since these
+   are not supported pre-C++20. Thus, keeping an internal copy here is the most
+   backwards compatible solution. */
+#define _PyObject_HEAD_INIT(type)         \
+    {                                     \
+        _PyObject_EXTRA_INIT              \
+        .ob_refcnt = _Py_IMMORTAL_REFCNT, /* initialized as immortal */ \
+        .ob_type = (type)                 \
+    }, /* the trailing ',' is part of the expansion: users must not add another */
+#define _PyVarObject_HEAD_INIT(type, size)    \
+    {                                         \
+        .ob_base = _PyObject_HEAD_INIT(type)  \
+        .ob_size = size                       \
+    }, /* same trailing-',' convention as _PyObject_HEAD_INIT */
+
+PyAPI_FUNC(void) _Py_NO_RETURN _Py_FatalRefcountErrorFunc(
+    const char *func,
+    const char *message);
+
+#define _Py_FatalRefcountError(message) \
+    _Py_FatalRefcountErrorFunc(__func__, (message))
+
+
+#ifdef Py_REF_DEBUG
+/* The symbol is only exposed in the API for the sake of extensions
+   built against the pre-3.12 stable ABI. */
+PyAPI_DATA(Py_ssize_t) _Py_RefTotal;
+
+extern void _Py_AddRefTotal(PyInterpreterState *, Py_ssize_t);
+extern void _Py_IncRefTotal(PyInterpreterState *);
+extern void _Py_DecRefTotal(PyInterpreterState *);
+
+#  define _Py_DEC_REFTOTAL(interp) \
+    interp->object_state.reftotal--
+#endif
+
+// Add n to op's reference count; immortal objects are left untouched
+static inline void _Py_RefcntAdd(PyObject* op, Py_ssize_t n)
+{
+    if (!_Py_IsImmortal(op)) {
+#ifdef Py_REF_DEBUG
+        _Py_AddRefTotal(_PyInterpreterState_GET(), n);
+#endif
+        op->ob_refcnt += n;
+    }
+}
+// Wrapper macro: casts the argument so any object pointer type is accepted
+#define _Py_RefcntAdd(op, n) _Py_RefcntAdd(_PyObject_CAST(op), n)
+
+static inline void _Py_SetImmortal(PyObject *op)
+{   // Mark op immortal by overwriting its refcount; a NULL op is ignored.
+    if (op) {
+        op->ob_refcnt = _Py_IMMORTAL_REFCNT;
+    }
+}
+#define _Py_SetImmortal(op) _Py_SetImmortal(_PyObject_CAST(op))  /* casts so any object pointer type is accepted */
+
+/* _Py_ClearImmortal() should only be used during runtime finalization. */
+static inline void _Py_ClearImmortal(PyObject *op)
+{   // Undo immortality and release the object; a NULL op is ignored.
+    if (op) {
+        assert(op->ob_refcnt == _Py_IMMORTAL_REFCNT);
+        op->ob_refcnt = 1;  // back to a single ordinary reference...
+        Py_DECREF(op);      // ...which is dropped right away
+    }
+}
+#define _Py_ClearImmortal(op) \
+    do { \
+        _Py_ClearImmortal(_PyObject_CAST(op)); \
+        op = NULL; /* also clears the caller's variable, so op must be an lvalue */ \
+    } while (0)
+
+static inline void
+_Py_DECREF_SPECIALIZED(PyObject *op, const destructor destruct)
+{   // Decref that calls the supplied destructor when the count reaches zero.
+    if (_Py_IsImmortal(op)) {
+        return;  // immortal objects are never decremented
+    }
+    _Py_DECREF_STAT_INC();
+#ifdef Py_REF_DEBUG
+    _Py_DEC_REFTOTAL(_PyInterpreterState_GET());
+#endif
+    if (--op->ob_refcnt == 0) {
+#ifdef Py_TRACE_REFS
+        _Py_ForgetReference(op);
+#endif
+        destruct(op);
+    }
+    else {
+        assert(op->ob_refcnt > 0);
+    }
+}
+
+static inline void
+_Py_DECREF_NO_DEALLOC(PyObject *op)
+{   // Decref that never deallocates: the count is expected to stay positive.
+    if (_Py_IsImmortal(op)) {
+        return;  // immortal objects are never decremented
+    }
+    _Py_DECREF_STAT_INC();
+#ifdef Py_REF_DEBUG
+    _Py_DEC_REFTOTAL(_PyInterpreterState_GET());
+#endif
+    op->ob_refcnt--;
+#ifdef Py_DEBUG
+    if (op->ob_refcnt <= 0) {  // checked in Py_DEBUG builds only
+        _Py_FatalRefcountError("Expected a positive remaining refcount");
+    }
+#endif
+}
+
+#ifdef Py_REF_DEBUG
+#  undef _Py_DEC_REFTOTAL
+#endif
+
+
+PyAPI_FUNC(int) _PyType_CheckConsistency(PyTypeObject *type);
+PyAPI_FUNC(int) _PyDict_CheckConsistency(PyObject *mp, int check_content);
+
+/* Update the Python traceback of an object. This function must be called
+   when a memory block is reused from a free list.
+
+   Internal function called by _Py_NewReference(). */
+extern int _PyTraceMalloc_NewReference(PyObject *op);
+
+// Inline equivalent of PyType_HasFeature(): 1 if any bit of `feature` is set in tp_flags, else 0
+static inline int
+_PyType_HasFeature(PyTypeObject *type, unsigned long feature) {
+    return (type->tp_flags & feature) ? 1 : 0;
+}
+
+extern void _PyType_InitCache(PyInterpreterState *interp);
+
+extern void _PyObject_InitState(PyInterpreterState *interp);
+
+/* Inline functions trading binary compatibility for speed:
+   _PyObject_Init() is the fast version of PyObject_Init(), and
+   _PyObject_InitVar() is the fast version of PyObject_InitVar().
+
+   These inline functions must not be called with op=NULL. */
+static inline void
+_PyObject_Init(PyObject *op, PyTypeObject *typeobj)
+{   // Set op's type, then hand op to _Py_NewReference(); op must not be NULL.
+    assert(op != NULL);
+    Py_SET_TYPE(op, typeobj);
+    if (_PyType_HasFeature(typeobj, Py_TPFLAGS_HEAPTYPE)) {
+        Py_INCREF(typeobj);  // only instances of heap types take a reference to their type
+    }
+    _Py_NewReference(op);
+}
+
+static inline void
+_PyObject_InitVar(PyVarObject *op, PyTypeObject *typeobj, Py_ssize_t size)
+{   // _PyObject_Init() plus setting the variable-size field; op must not be NULL.
+    assert(op != NULL);
+    assert(typeobj != &PyLong_Type);  // NOTE(review): presumably because ints keep their size in lv_tag -- confirm
+    _PyObject_Init((PyObject *)op, typeobj);
+    Py_SET_SIZE(op, size);
+}
+
+
+/* Tell the GC to track this object.
+ *
+ * The object must not be tracked by the GC.
+ *
+ * NB: While the object is tracked by the collector, it must be safe to call the
+ * ob_traverse method.
+ *
+ * Internal note: interp->gc.generation0->_gc_prev doesn't have any bit flags
+ * because it's not object header.  So we don't use _PyGCHead_PREV() and
+ * _PyGCHead_SET_PREV() for it to avoid unnecessary bitwise operations.
+ *
+ * See also the public PyObject_GC_Track() function.
+ */
+static inline void _PyObject_GC_TRACK(
+// The preprocessor removes _PyObject_ASSERT_FROM() calls if NDEBUG is defined
+#ifndef NDEBUG
+    const char *filename, int lineno,
+#endif
+    PyObject *op)
+{   // Link op at the tail of the current interpreter's generation0 list.
+    _PyObject_ASSERT_FROM(op, !_PyObject_GC_IS_TRACKED(op),
+                          "object already tracked by the garbage collector",
+                          filename, lineno, __func__);
+
+    PyGC_Head *gc = _Py_AS_GC(op);
+    _PyObject_ASSERT_FROM(op,
+                          (gc->_gc_prev & _PyGC_PREV_MASK_COLLECTING) == 0,
+                          "object is in generation which is garbage collected",
+                          filename, lineno, __func__);
+
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    PyGC_Head *generation0 = interp->gc.generation0;
+    PyGC_Head *last = (PyGC_Head*)(generation0->_gc_prev);  // current tail; raw read because the list head's _gc_prev has no flag bits
+    _PyGCHead_SET_NEXT(last, gc);         // old tail -> gc
+    _PyGCHead_SET_PREV(gc, last);         // gc -> old tail
+    _PyGCHead_SET_NEXT(gc, generation0);  // gc is the new tail, so its next is the list head
+    generation0->_gc_prev = (uintptr_t)gc;  // the head's prev now points at gc
+}
+
+/* Tell the GC to stop tracking this object.
+ *
+ * Internal note: This may be called during GC. So _PyGC_PREV_MASK_COLLECTING
+ * must be cleared. But _PyGC_PREV_MASK_FINALIZED bit is kept.
+ *
+ * The object must be tracked by the GC.
+ *
+ * See also the public PyObject_GC_UnTrack() which accepts an object which is
+ * not tracked.
+ */
+static inline void _PyObject_GC_UNTRACK(
+// The preprocessor removes _PyObject_ASSERT_FROM() calls if NDEBUG is defined
+#ifndef NDEBUG
+    const char *filename, int lineno,
+#endif
+    PyObject *op)
+{   // Unlink op from the GC list it is currently on.
+    _PyObject_ASSERT_FROM(op, _PyObject_GC_IS_TRACKED(op),
+                          "object not tracked by the garbage collector",
+                          filename, lineno, __func__);
+
+    PyGC_Head *gc = _Py_AS_GC(op);
+    PyGC_Head *prev = _PyGCHead_PREV(gc);
+    PyGC_Head *next = _PyGCHead_NEXT(gc);
+    _PyGCHead_SET_NEXT(prev, next);  // bypass gc going forward
+    _PyGCHead_SET_PREV(next, prev);  // bypass gc going backward
+    gc->_gc_next = 0;  // NOTE(review): a zero _gc_next presumably marks "untracked" -- confirm in pycore_gc.h
+    gc->_gc_prev &= _PyGC_PREV_MASK_FINALIZED;  // keep only the FINALIZED flag; COLLECTING and the prev pointer are cleared
+}
+
+// Macros to accept any type for the parameter, and to automatically pass
+// the filename and the line number (if NDEBUG is not defined) where the macro
+// is called.
+#ifdef NDEBUG
+#  define _PyObject_GC_TRACK(op) \
+        _PyObject_GC_TRACK(_PyObject_CAST(op))
+#  define _PyObject_GC_UNTRACK(op) \
+        _PyObject_GC_UNTRACK(_PyObject_CAST(op))
+#else
+#  define _PyObject_GC_TRACK(op) \
+        _PyObject_GC_TRACK(__FILE__, __LINE__, _PyObject_CAST(op))
+#  define _PyObject_GC_UNTRACK(op) \
+        _PyObject_GC_UNTRACK(__FILE__, __LINE__, _PyObject_CAST(op))
+#endif
+
+#ifdef Py_REF_DEBUG
+extern void _PyInterpreterState_FinalizeRefTotal(PyInterpreterState *);
+extern void _Py_FinalizeRefTotal(_PyRuntimeState *);
+extern void _PyDebug_PrintTotalRefs(void);
+#endif
+
+#ifdef Py_TRACE_REFS
+extern void _Py_AddToAllObjects(PyObject *op, int force);
+extern void _Py_PrintReferences(PyInterpreterState *, FILE *);
+extern void _Py_PrintReferenceAddresses(PyInterpreterState *, FILE *);
+#endif
+
+
+/* Return the *address* of the object's weaklist.  The address may be
+ * dereferenced to get the current head of the weaklist.  This is useful
+ * for iterating over the linked list of weakrefs, especially when the
+ * list is being modified externally (e.g. refs getting removed).
+ *
+ * The returned pointer should not be used to change the head of the list
+ * nor should it be used to add, remove, or swap any refs in the list.
+ * That is the sole responsibility of the code in weakrefobject.c.
+ */
+static inline PyObject **
+_PyObject_GET_WEAKREFS_LISTPTR(PyObject *op)
+{
+    if (!PyType_Check(op) ||
+            !(((PyTypeObject *)op)->tp_flags & _Py_TPFLAGS_STATIC_BUILTIN)) {
+        // Ordinary case, essentially _PyObject_GET_WEAKREFS_LISTPTR_FROM_OFFSET()
+        return (PyObject **)((char *)op + Py_TYPE(op)->tp_weaklistoffset);
+    }
+    // Static builtin type: look the list up through the current interpreter's state
+    PyTypeObject *type = (PyTypeObject *)op;
+    static_builtin_state *state = _PyStaticType_GetState(
+                                        _PyInterpreterState_GET(), type);
+    return _PyStaticType_GET_WEAKREFS_LISTPTR(state);
+}
+
+/* This is a special case of _PyObject_GET_WEAKREFS_LISTPTR().
+ * Only the most fundamental lookup path is used.
+ * Consequently, static types should not be used.
+ *
+ * For static builtin types the returned pointer will always point
+ * to a NULL tp_weaklist.  This is fine for any deallocation cases,
+ * since static types are never deallocated and static builtin types
+ * are only finalized at the end of runtime finalization.
+ *
+ * If the weaklist for static types is actually needed then use
+ * _PyObject_GET_WEAKREFS_LISTPTR().
+ */
+static inline PyWeakReference **
+_PyObject_GET_WEAKREFS_LISTPTR_FROM_OFFSET(PyObject *op)
+{
+    assert(!PyType_Check(op) ||
+            ((PyTypeObject *)op)->tp_flags & Py_TPFLAGS_HEAPTYPE);
+    // The weaklist slot is at byte offset tp_weaklistoffset relative to op
+    return (PyWeakReference **)((char *)op + Py_TYPE(op)->tp_weaklistoffset);
+}
+
+
+// Inline PyObject_IS_GC(): the type must be GC-enabled and tp_is_gc, when set, must agree
+static inline int
+_PyObject_IS_GC(PyObject *obj)
+{
+    PyTypeObject *type = Py_TYPE(obj);
+    return PyType_IS_GC(type)
+           && (type->tp_is_gc == NULL || type->tp_is_gc(obj));
+}
+
+// Fast inlined version of PyType_IS_GC()
+#define _PyType_IS_GC(t) _PyType_HasFeature((t), Py_TPFLAGS_HAVE_GC)
+
+static inline size_t
+_PyType_PreHeaderSize(PyTypeObject *tp)
+{
+    size_t nbytes = _PyType_IS_GC(tp) ? sizeof(PyGC_Head) : 0;  // GC header, for GC-enabled types only
+    return nbytes + (_PyType_HasFeature(tp, Py_TPFLAGS_PREHEADER) ? 2 * sizeof(PyObject *) : 0);
+}
+
+void _PyObject_GC_Link(PyObject *op);
+
+// Usage: assert(_Py_CheckSlotResult(obj, "__getitem__", result != NULL));
+extern int _Py_CheckSlotResult(
+    PyObject *obj,
+    const char *slot_name,
+    int success);
+
+// A type supports weak references iff its tp_weaklistoffset is nonzero
+static inline int _PyType_SUPPORTS_WEAKREFS(PyTypeObject *type) {
+    return (type->tp_weaklistoffset == 0) ? 0 : 1;
+}
+
+extern PyObject* _PyType_AllocNoTrack(PyTypeObject *type, Py_ssize_t nitems);
+
+extern int _PyObject_InitializeDict(PyObject *obj);
+extern int _PyObject_StoreInstanceAttribute(PyObject *obj, PyDictValues *values,
+                                          PyObject *name, PyObject *value);
+PyObject * _PyObject_GetInstanceAttribute(PyObject *obj, PyDictValues *values,
+                                        PyObject *name);
+
+typedef union {  // one slot holding either a dict pointer or an offset values pointer
+    PyObject *dict;
+    /* Use a char* to generate a warning if directly assigning a PyDictValues */
+    char *values;  // stored as (values pointer - 1); see _PyDictOrValues_SetValues()
+} PyDictOrValues;
+
+static inline PyDictOrValues *
+_PyObject_DictOrValuesPointer(PyObject *obj)
+{   // Address of the dict-or-values slot, which lies before the object itself.
+    assert(Py_TYPE(obj)->tp_flags & Py_TPFLAGS_MANAGED_DICT);
+    return ((PyDictOrValues *)obj)-3;  // three PyDictOrValues-sized slots below obj
+}
+
+static inline int
+_PyDictOrValues_IsValues(PyDictOrValues dorv)
+{
+    return ((uintptr_t)dorv.values) & 1;  // low bit set means the slot holds values rather than a dict
+}
+
+static inline PyDictValues *
+_PyDictOrValues_GetValues(PyDictOrValues dorv)
+{
+    assert(_PyDictOrValues_IsValues(dorv));
+    return (PyDictValues *)(dorv.values + 1);  // undo the -1 applied by _PyDictOrValues_SetValues()
+}
+
+static inline PyObject *
+_PyDictOrValues_GetDict(PyDictOrValues dorv)
+{
+    assert(!_PyDictOrValues_IsValues(dorv));  // only valid when the slot is in dict form
+    return dorv.dict;  // returned as stored; this function does not rule out NULL
+}
+
+static inline void
+_PyDictOrValues_SetValues(PyDictOrValues *ptr, PyDictValues *values)
+{
+    ptr->values = ((char *)values) - 1;  // assumes values is at least 2-byte aligned, so the stored low bit is 1 -- TODO confirm
+}
+
+#define MANAGED_WEAKREF_OFFSET (((Py_ssize_t)sizeof(PyObject *))*-4)
+
+extern PyObject ** _PyObject_ComputedDictPointer(PyObject *);
+extern void _PyObject_FreeInstanceAttributes(PyObject *obj);
+extern int _PyObject_IsInstanceDictEmpty(PyObject *);
+
+PyAPI_FUNC(PyObject *) _PyObject_LookupSpecial(PyObject *, PyObject *);
+
+/* C function call trampolines to mitigate bad function pointer casts.
+ *
+ * Typical native ABIs ignore additional arguments or fill in missing
+ * values with 0/NULL in function pointer cast. Compilers do not show
+ * warnings when a function pointer is explicitly casted to an
+ * incompatible type.
+ *
+ * Bad fpcasts are an issue in WebAssembly. WASM's indirect_call has strict
+ * function signature checks. Argument count, types, and return type must
+ * match.
+ *
+ * Third party code unintentionally rely on problematic fpcasts. The call
+ * trampoline mitigates common occurrences of bad fpcasts on Emscripten.
+ */
+#if defined(__EMSCRIPTEN__) && defined(PY_CALL_TRAMPOLINE)
+#define _PyCFunction_TrampolineCall(meth, self, args) \
+    _PyCFunctionWithKeywords_TrampolineCall( \
+        (*(PyCFunctionWithKeywords)(void(*)(void))(meth)), (self), (args), NULL)
+extern PyObject* _PyCFunctionWithKeywords_TrampolineCall(
+    PyCFunctionWithKeywords meth, PyObject *, PyObject *, PyObject *);
+#else
+#define _PyCFunction_TrampolineCall(meth, self, args) \
+    (meth)((self), (args))
+#define _PyCFunctionWithKeywords_TrampolineCall(meth, self, args, kw) \
+    (meth)((self), (args), (kw))
+#endif // __EMSCRIPTEN__ && PY_CALL_TRAMPOLINE
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_OBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_object_state.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_object_state.h
new file mode 100644
index 000000000000..65feb5af969f
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_object_state.h
@@ -0,0 +1,36 @@
+#ifndef Py_INTERNAL_OBJECT_STATE_H
+#define Py_INTERNAL_OBJECT_STATE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+struct _py_object_runtime_state {
+#ifdef Py_REF_DEBUG
+    Py_ssize_t interpreter_leaks;  // Py_REF_DEBUG builds only; presumably references leaked by finalized interpreters -- confirm
+#endif
+    int _not_used;  // presumably keeps the struct non-empty when Py_REF_DEBUG is off -- confirm
+};
+
+struct _py_object_state {
+#ifdef Py_REF_DEBUG
+    Py_ssize_t reftotal;  // Py_REF_DEBUG builds only; decremented by the _Py_DEC_REFTOTAL() macro in pycore_object.h
+#endif
+#ifdef Py_TRACE_REFS
+    /* Head of circular doubly-linked list of all objects.  These are linked
+     * together via the _ob_prev and _ob_next members of a PyObject, which
+     * exist only in a Py_TRACE_REFS build.
+     */
+    PyObject refchain;
+#endif
+    int _not_used;  // presumably keeps the struct non-empty when both debug options are off -- confirm
+};
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_OBJECT_STATE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_obmalloc.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_obmalloc.h
new file mode 100644
index 000000000000..b1c00654ac1c
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_obmalloc.h
@@ -0,0 +1,700 @@
+#ifndef Py_INTERNAL_OBMALLOC_H
+#define Py_INTERNAL_OBMALLOC_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+typedef unsigned int pymem_uint;  /* assuming >= 16 bits */
+
+#undef  uint
+#define uint pymem_uint
+
+
+/* An object allocator for Python.
+
+   Here is an introduction to the layers of the Python memory architecture,
+   showing where the object allocator is actually used (layer +2). It is
+   called for every object allocation and deallocation (PyObject_New/Del),
+   unless the object-specific allocators implement a proprietary allocation
+   scheme (ex.: ints use a simple free list). This is also the place where
+   the cyclic garbage collector operates selectively on container objects.
+
+
+    Object-specific allocators
+    _____   ______   ______       ________
+   [ int ] [ dict ] [ list ] ... [ string ]       Python core         |
++3 | <----- Object-specific memory -----> | <-- Non-object memory --> |
+    _______________________________       |                           |
+   [   Python's object allocator   ]      |                           |
++2 | ####### Object memory ####### | <------ Internal buffers ------> |
+    ______________________________________________________________    |
+   [          Python's raw memory allocator (PyMem_ API)          ]   |
++1 | <----- Python memory (under PyMem manager's control) ------> |   |
+    __________________________________________________________________
+   [    Underlying general-purpose allocator (ex: C library malloc)   ]
+ 0 | <------ Virtual memory allocated for the python process -------> |
+
+   =========================================================================
+    _______________________________________________________________________
+   [                OS-specific Virtual Memory Manager (VMM)               ]
+-1 | <--- Kernel dynamic storage allocation & management (page-based) ---> |
+    __________________________________   __________________________________
+   [                                  ] [                                  ]
+-2 | <-- Physical memory: ROM/RAM --> | | <-- Secondary storage (swap) --> |
+
+*/
+/*==========================================================================*/
+
+/* A fast, special-purpose memory allocator for small blocks, to be used
+   on top of a general-purpose malloc -- heavily based on previous art. */
+
+/* Vladimir Marangozov -- August 2000 */
+
+/*
+ * "Memory management is where the rubber meets the road -- if we do the wrong
+ * thing at any level, the results will not be good. And if we don't make the
+ * levels work well together, we are in serious trouble." (1)
+ *
+ * (1) Paul R. Wilson, Mark S. Johnstone, Michael Neely, and David Boles,
+ *    "Dynamic Storage Allocation: A Survey and Critical Review",
+ *    in Proc. 1995 Int'l. Workshop on Memory Management, September 1995.
+ */
+
+/* #undef WITH_MEMORY_LIMITS */         /* disable mem limit checks  */
+
+/*==========================================================================*/
+
+/*
+ * Allocation strategy abstract:
+ *
+ * For small requests, the allocator sub-allocates <Big> blocks of memory.
+ * Requests greater than SMALL_REQUEST_THRESHOLD bytes are routed to the
+ * system's allocator.
+ *
+ * Small requests are grouped in size classes spaced ALIGNMENT (8 or 16) bytes
+ * apart, due to the required valid alignment of the returned address. Requests
+ * of a particular size are serviced from pools of POOL_SIZE (4K or 16K) bytes.
+ * Pools are fragmented on demand and contain free lists of blocks of one
+ * particular size class. In other words, there is a fixed-size allocator
+ * for each size class. Free pools are shared by the different allocators
+ * thus minimizing the space reserved for a particular size class.
+ *
+ * This allocation strategy is a variant of what is known as "simple
+ * segregated storage based on array of free lists". The main drawback of
+ * simple segregated storage is that we might end up with a lot of reserved
+ * memory for the different free lists, which degenerate in time. To avoid
+ * this, we partition each free list in pools and we share dynamically the
+ * reserved space between all free lists. This technique is quite efficient
+ * for memory intensive programs which allocate mainly small-sized blocks.
+ *
+ * For small requests we have the following table (shown for ALIGNMENT == 8):
+ *
+ * Request in bytes     Size of allocated block      Size class idx
+ * ----------------------------------------------------------------
+ *        1-8                     8                       0
+ *        9-16                   16                       1
+ *       17-24                   24                       2
+ *       25-32                   32                       3
+ *       33-40                   40                       4
+ *       41-48                   48                       5
+ *       49-56                   56                       6
+ *       57-64                   64                       7
+ *       65-72                   72                       8
+ *        ...                   ...                     ...
+ *      497-504                 504                      62
+ *      505-512                 512                      63
+ *
+ *      0, SMALL_REQUEST_THRESHOLD + 1 and up: routed to the underlying
+ *      allocator.
+ */
+
+/*==========================================================================*/
+
+/*
+ * -- Main tunable settings section --
+ */
+
+/*
+ * Alignment of addresses returned to the user. 8-bytes alignment works
+ * on most current architectures (with 32-bit or 64-bit address buses).
+ * The alignment value is also used for grouping small requests in size
+ * classes spaced ALIGNMENT bytes apart.
+ *
+ * You shouldn't change this unless you know what you are doing.
+ */
+
+#if SIZEOF_VOID_P > 4
+#define ALIGNMENT              16               /* must be 2^N */
+#define ALIGNMENT_SHIFT         4
+#else
+#define ALIGNMENT               8               /* must be 2^N */
+#define ALIGNMENT_SHIFT         3
+#endif
+
+/* Return the number of bytes in size class I, as a uint. */
+#define INDEX2SIZE(I) (((pymem_uint)(I) + 1) << ALIGNMENT_SHIFT)
+
+/*
+ * Max size threshold below which malloc requests are considered to be
+ * small enough in order to use preallocated memory pools. You can tune
+ * this value according to your application behaviour and memory needs.
+ *
+ * Note: a size threshold of 512 guarantees that newly created dictionaries
+ * will be allocated from preallocated memory pools on 64-bit.
+ *
+ * The following invariants must hold:
+ *      1) ALIGNMENT <= SMALL_REQUEST_THRESHOLD <= 512
+ *      2) SMALL_REQUEST_THRESHOLD is evenly divisible by ALIGNMENT
+ *
+ * Although not required, for better performance and space efficiency,
+ * it is recommended that SMALL_REQUEST_THRESHOLD is set to a power of 2.
+ */
+#define SMALL_REQUEST_THRESHOLD 512
+#define NB_SMALL_SIZE_CLASSES   (SMALL_REQUEST_THRESHOLD / ALIGNMENT)
+
+/*
+ * The system's VMM page size can be obtained on most unices with a
+ * getpagesize() call or deduced from various header files. To make
+ * things simpler, we assume that it is 4K, which is OK for most systems.
+ * It is probably better if this is the native page size, but it doesn't
+ * have to be.  In theory, if SYSTEM_PAGE_SIZE is larger than the native page
+ * size, then `POOL_ADDR(p)->arenaindex' could rarely cause a segmentation
+ * violation fault.  4K is apparently OK for all the platforms that python
+ * currently targets.
+ */
+#define SYSTEM_PAGE_SIZE        (4 * 1024)
+
+/*
+ * Maximum amount of memory managed by the allocator for small requests.
+ */
+#ifdef WITH_MEMORY_LIMITS
+#ifndef SMALL_MEMORY_LIMIT
+#define SMALL_MEMORY_LIMIT      (64 * 1024 * 1024)      /* 64 MB -- more? */
+#endif
+#endif
+
+#if !defined(WITH_PYMALLOC_RADIX_TREE)
+/* Use radix-tree to track arena memory regions, for address_in_range().
+ * Enable by default since it allows larger pool sizes.  Can be disabled
+ * using -DWITH_PYMALLOC_RADIX_TREE=0 */
+#define WITH_PYMALLOC_RADIX_TREE 1
+#endif
+
+#if SIZEOF_VOID_P > 4
+/* on 64-bit platforms use larger pools and arenas if we can */
+#define USE_LARGE_ARENAS
+#if WITH_PYMALLOC_RADIX_TREE
+/* large pools only supported if radix-tree is enabled */
+#define USE_LARGE_POOLS
+#endif
+#endif
+
+/*
+ * The allocator sub-allocates <Big> blocks of memory (called arenas) aligned
+ * on a page boundary. This is a reserved virtual address space for the
+ * current process (obtained through a malloc()/mmap() call). In no way this
+ * means that the memory arenas will be used entirely. A malloc(<Big>) is
+ * usually an address range reservation for <Big> bytes, unless all pages within
+ * this space are referenced subsequently. So malloc'ing big blocks and not
+ * using them does not mean "wasting memory". It's an addressable range
+ * wastage...
+ *
+ * Arenas are allocated with mmap() on systems supporting anonymous memory
+ * mappings to reduce heap fragmentation.
+ */
+#ifdef USE_LARGE_ARENAS
+#define ARENA_BITS              20                    /* 1 MiB */
+#else
+#define ARENA_BITS              18                    /* 256 KiB */
+#endif
+#define ARENA_SIZE              (1 << ARENA_BITS)
+#define ARENA_SIZE_MASK         (ARENA_SIZE - 1)
+
+#ifdef WITH_MEMORY_LIMITS
+#define MAX_ARENAS              (SMALL_MEMORY_LIMIT / ARENA_SIZE)
+#endif
+
+/*
+ * Size of the pools used for small blocks.  Must be a power of 2.
+ */
+#ifdef USE_LARGE_POOLS
+#define POOL_BITS               14                  /* 16 KiB */
+#else
+#define POOL_BITS               12                  /* 4 KiB */
+#endif
+#define POOL_SIZE               (1 << POOL_BITS)
+#define POOL_SIZE_MASK          (POOL_SIZE - 1)
+
+#if !WITH_PYMALLOC_RADIX_TREE
+#if POOL_SIZE != SYSTEM_PAGE_SIZE
+#   error "pool size must be equal to system page size"
+#endif
+#endif
+
+#define MAX_POOLS_IN_ARENA  (ARENA_SIZE / POOL_SIZE)
+#if MAX_POOLS_IN_ARENA * POOL_SIZE != ARENA_SIZE
+#   error "arena size not an exact multiple of pool size"
+#endif
+
+/*
+ * -- End of tunable settings section --
+ */
+
+/*==========================================================================*/
+
+/* When you say memory, my mind reasons in terms of (pointers to) blocks */
+typedef uint8_t pymem_block;
+
+/* Pool for small blocks.  POOL_ADDR() yields a pointer to this header. */
+struct pool_header {
+    union { pymem_block *_padding;      /* makes ref pointer-sized       */
+            uint count; } ref;          /* number of allocated blocks    */
+    pymem_block *freeblock;             /* pool's free list head         */
+    struct pool_header *nextpool;       /* next pool of this size class  */
+    struct pool_header *prevpool;       /* previous pool of this size class */
+    uint arenaindex;                    /* index into arenas of base address */
+    uint szidx;                         /* block size class index        */
+    uint nextoffset;                    /* bytes from pool header to next virgin block */
+    uint maxnextoffset;                 /* largest valid nextoffset      */
+};
+
+typedef struct pool_header *poolp;
+
+/* Record keeping for arenas. */
+struct arena_object {
+    /* The address of the arena, as returned by malloc.  Note that 0
+     * will never be returned by a successful malloc, and is used
+     * here to mark an arena_object that doesn't correspond to an
+     * allocated arena.
+     */
+    uintptr_t address;
+
+    /* Pool-aligned pointer to the next pool to be carved off. */
+    pymem_block* pool_address;
+
+    /* The number of available pools in the arena:  free pools + never-
+     * allocated pools.
+     */
+    uint nfreepools;
+
+    /* The total number of pools in the arena, whether or not available. */
+    uint ntotalpools;
+
+    /* Singly-linked list of available pools. */
+    struct pool_header* freepools;
+
+    /* Whenever this arena_object is not associated with an allocated
+     * arena, the nextarena member is used to link all unassociated
+     * arena_objects in the singly-linked `unused_arena_objects` list.
+     * The prevarena member is unused in this case.
+     *
+     * When this arena_object is associated with an allocated arena
+     * with at least one available pool, both members are used in the
+     * doubly-linked `usable_arenas` list, which is maintained in
+     * increasing order of `nfreepools` values.
+     *
+     * Else this arena_object is associated with an allocated arena
+     * all of whose pools are in use.  `nextarena` and `prevarena`
+     * are both meaningless in this case.
+     */
+    struct arena_object* nextarena;
+    struct arena_object* prevarena;
+};
+
+#define POOL_OVERHEAD   _Py_SIZE_ROUND_UP(sizeof(struct pool_header), ALIGNMENT)
+
+#define DUMMY_SIZE_IDX          0xffff  /* size class of newly cached pools */
+
+/* Round pointer P down to the closest pool-aligned address <= P, as a poolp */
+#define POOL_ADDR(P) ((poolp)_Py_ALIGN_DOWN((P), POOL_SIZE))
+
+/* Return total number of blocks in pool of size index I, as a uint. */
+#define NUMBLOCKS(I) ((pymem_uint)(POOL_SIZE - POOL_OVERHEAD) / INDEX2SIZE(I))
+
+/*==========================================================================*/
+
+/*
+ * Pool table -- headed, circular, doubly-linked lists of partially used pools.
+
+This is involved.  For an index i, usedpools[i+i] is the header for a list of
+all partially used pools holding small blocks with "size class idx" i. So
+usedpools[0] corresponds to blocks of size 8, usedpools[2] to blocks of size
+16, and so on:  index 2*i <-> blocks of size (i+1)<<ALIGNMENT_SHIFT.
+
+Pools are carved off an arena's highwater mark (an arena_object's pool_address
+member) as needed.  Once carved off, a pool is in one of three states forever
+after:
+
+used == partially used, neither empty nor full
+    At least one block in the pool is currently allocated, and at least one
+    block in the pool is not currently allocated (note this implies a pool
+    has room for at least two blocks).
+    This is a pool's initial state, as a pool is created only when malloc
+    needs space.
+    The pool holds blocks of a fixed size, and is in the circular list headed
+    at usedpools[i] (see above).  It's linked to the other used pools of the
+    same size class via the pool_header's nextpool and prevpool members.
+    If all but one block is currently allocated, a malloc can cause a
+    transition to the full state.  If all but one block is not currently
+    allocated, a free can cause a transition to the empty state.
+
+full == all the pool's blocks are currently allocated
+    On transition to full, a pool is unlinked from its usedpools[] list.
+    It's not linked to from anything then anymore, and its nextpool and
+    prevpool members are meaningless until it transitions back to used.
+    A free of a block in a full pool puts the pool back in the used state.
+    Then it's linked in at the front of the appropriate usedpools[] list, so
+    that the next allocation for its size class will reuse the freed block.
+
+empty == all the pool's blocks are currently available for allocation
+    On transition to empty, a pool is unlinked from its usedpools[] list,
+    and linked to the front of its arena_object's singly-linked freepools list,
+    via its nextpool member.  The prevpool member has no meaning in this case.
+    Empty pools have no inherent size class:  the next time a malloc finds
+    an empty list in usedpools[], it takes the first pool off of freepools.
+    If the size class needed happens to be the same as the size class the pool
+    last had, some pool initialization can be skipped.
+
+
+Block Management
+
+Blocks within pools are again carved out as needed.  pool->freeblock points to
+the start of a singly-linked list of free blocks within the pool.  When a
+block is freed, it's inserted at the front of its pool's freeblock list.  Note
+that the available blocks in a pool are *not* linked all together when a pool
+is initialized.  Instead only "the first two" (lowest addresses) blocks are
+set up, returning the first such block, and setting pool->freeblock to a
+one-block list holding the second such block.  This is consistent with that
+pymalloc strives at all levels (arena, pool, and block) never to touch a piece
+of memory until it's actually needed.
+
+So long as a pool is in the used state, we're certain there *is* a block
+available for allocating, and pool->freeblock is not NULL.  If pool->freeblock
+points to the end of the free list before we've carved the entire pool into
+blocks, that means we simply haven't yet gotten to one of the higher-address
+blocks.  The offset from the pool_header to the start of "the next" virgin
+block is stored in the pool_header nextoffset member, and the largest value
+of nextoffset that makes sense is stored in the maxnextoffset member when a
+pool is initialized.  All the blocks in a pool have been passed out at least
+once when and only when nextoffset > maxnextoffset.
+
+
+Major obscurity:  While the usedpools vector is declared to have poolp
+entries, it doesn't really.  It really contains two pointers per (conceptual)
+poolp entry, the nextpool and prevpool members of a pool_header.  The
+excruciating initialization code below fools C so that
+
+    usedpools[i+i]
+
+"acts like" a genuine poolp, but only so long as you only reference its
+nextpool and prevpool members.  The "- 2*sizeof(pymem_block *)" gibberish is
+compensating for that a pool_header's nextpool and prevpool members
+immediately follow a pool_header's first two members:
+
+    union { pymem_block *_padding;
+            uint count; } ref;
+    pymem_block *freeblock;
+
+each of which consume sizeof(pymem_block *) bytes.  So what usedpools[i+i] really
+contains is a fudged-up pointer p such that *if* C believes it's a poolp
+pointer, then p->nextpool and p->prevpool are both p (meaning that the headed
+circular list is empty).
+
+It's unclear why the usedpools setup is so convoluted.  It could be to
+minimize the amount of cache required to hold this heavily-referenced table
+(which only *needs* the two interpool pointer members of a pool_header). OTOH,
+referencing code has to remember to "double the index" and doing so isn't
+free, usedpools[0] isn't a strictly legal pointer, and we're crucially relying
+on that C doesn't insert any padding anywhere in a pool_header at or before
+the prevpool member.
+**************************************************************************** */
+
+#define OBMALLOC_USED_POOLS_SIZE (2 * ((NB_SMALL_SIZE_CLASSES + 7) / 8) * 8)
+
+struct _obmalloc_pools {
+    poolp used[OBMALLOC_USED_POOLS_SIZE];
+};
+
+
+/*==========================================================================
+Arena management.
+
+`arenas` is a vector of arena_objects.  It contains maxarenas entries, some of
+which may not be currently used (== they're arena_objects that aren't
+currently associated with an allocated arena).  Note that arenas proper are
+separately malloc'ed.
+
+Prior to Python 2.5, arenas were never free()'ed.  Starting with Python 2.5,
+we do try to free() arenas, and use some mild heuristic strategies to increase
+the likelihood that arenas eventually can be freed.
+
+unused_arena_objects
+
+    This is a singly-linked list of the arena_objects that are currently not
+    being used (no arena is associated with them).  Objects are taken off the
+    head of the list in new_arena(), and are pushed on the head of the list in
+    PyObject_Free() when the arena is empty.  Key invariant:  an arena_object
+    is on this list if and only if its .address member is 0.
+
+usable_arenas
+
+    This is a doubly-linked list of the arena_objects associated with arenas
+    that have pools available.  These pools are either waiting to be reused,
+    or have not been used before.  The list is sorted to have the most-
+    allocated arenas first (ascending order based on the nfreepools member).
+    This means that the next allocation will come from a heavily used arena,
+    which gives the nearly empty arenas a chance to be returned to the system.
+    In my unscientific tests this dramatically improved the number of arenas
+    that could be freed.
+
+Note that an arena_object associated with an arena all of whose pools are
+currently in use isn't on either list.
+
+Changed in Python 3.8:  keeping usable_arenas sorted by number of free pools
+used to be done by one-at-a-time linear search when an arena's number of
+free pools changed.  That could, overall, consume time quadratic in the
+number of arenas.  That didn't really matter when there were only a few
+hundred arenas (typical!), but could be a timing disaster when there were
+hundreds of thousands.  See bpo-37029.
+
+Now we have a vector of "search fingers" to eliminate the need to search:
+nfp2lasta[nfp] returns the last ("rightmost") arena in usable_arenas
+with nfp free pools.  This is NULL if and only if there is no arena with
+nfp free pools in usable_arenas.
+*/
+
+/* How many arena_objects do we initially allocate?
+ * 16 = can allocate 16 arenas = 16 * ARENA_SIZE (4 MiB with 256 KiB arenas,
+ * 16 MiB with 1 MiB arenas) before growing the `arenas` vector.
+ */
+#define INITIAL_ARENA_OBJECTS 16
+
+struct _obmalloc_mgmt {
+    /* Array of objects used to track chunks of memory (arenas). */
+    struct arena_object* arenas;
+    /* Number of slots currently allocated in the `arenas` vector. */
+    uint maxarenas;
+
+    /* The head of the singly-linked, NULL-terminated list of available
+     * arena_objects.
+     */
+    struct arena_object* unused_arena_objects;
+
+    /* The head of the doubly-linked, NULL-terminated at each end, list of
+     * arena_objects associated with arenas that have pools available.
+     */
+    struct arena_object* usable_arenas;
+
+    /* nfp2lasta[nfp] is the last arena in usable_arenas with nfp free pools, or NULL if none */
+    struct arena_object* nfp2lasta[MAX_POOLS_IN_ARENA + 1];
+
+    /* Number of arenas allocated that haven't been free()'d. */
+    size_t narenas_currently_allocated;
+
+    /* Total number of times malloc() called to allocate an arena. */
+    size_t ntimes_arena_allocated;
+    /* High water mark (max value ever seen) for narenas_currently_allocated. */
+    size_t narenas_highwater;
+
+    Py_ssize_t raw_allocated_blocks;  /* NOTE(review): presumably a count of blocks served outside the pools -- confirm */
+};
+
+
+#if WITH_PYMALLOC_RADIX_TREE
+/*==========================================================================*/
+/* radix tree for tracking arena usage.  If enabled, used to implement
+   address_in_range().
+
+   memory address bit allocation for keys
+
+   64-bit pointers, IGNORE_BITS=0 and 2^20 arena size:
+     15 -> MAP_TOP_BITS
+     15 -> MAP_MID_BITS
+     14 -> MAP_BOT_BITS
+     20 -> ideal aligned arena
+   ----
+     64
+
+   64-bit pointers, IGNORE_BITS=16, and 2^20 arena size:
+     16 -> IGNORE_BITS
+     10 -> MAP_TOP_BITS
+     10 -> MAP_MID_BITS
+      8 -> MAP_BOT_BITS
+     20 -> ideal aligned arena
+   ----
+     64
+
+   32-bit pointers and 2^18 arena size:
+     14 -> MAP_BOT_BITS
+     18 -> ideal aligned arena
+   ----
+     32
+
+*/
+
+#if SIZEOF_VOID_P == 8
+
+/* number of bits in a pointer */
+#define POINTER_BITS 64
+
+/* High bits of memory addresses that will be ignored when indexing into the
+ * radix tree.  Setting this to zero is the safe default.  For most 64-bit
+ * machines, setting this to 16 would be safe.  The kernel would not give
+ * user-space virtual memory addresses that have significant information in
+ * those high bits.  The main advantage to setting IGNORE_BITS > 0 is that less
+ * virtual memory will be used for the top and middle radix tree arrays.  Those
+ * arrays are allocated in the BSS segment and so will typically consume real
+ * memory only if actually accessed.
+ */
+#define IGNORE_BITS 0
+
+/* use the top and mid layers of the radix tree */
+#define USE_INTERIOR_NODES
+
+#elif SIZEOF_VOID_P == 4
+
+#define POINTER_BITS 32
+#define IGNORE_BITS 0
+
+#else
+
+ /* Currently this code works for 64-bit or 32-bit pointers only.  */
+#error "obmalloc radix tree requires 64-bit or 32-bit pointers."
+
+#endif /* SIZEOF_VOID_P */
+
+/* arena_coverage_t members require this to be true  */
+#if ARENA_BITS >= 32
+#   error "arena size must be < 2^32"
+#endif
+
+/* the lower bits of the address that are not ignored */
+#define ADDRESS_BITS (POINTER_BITS - IGNORE_BITS)
+
+#ifdef USE_INTERIOR_NODES
+/* number of bits used for MAP_TOP and MAP_MID nodes */
+#define INTERIOR_BITS ((ADDRESS_BITS - ARENA_BITS + 2) / 3)
+#else
+#define INTERIOR_BITS 0
+#endif
+
+#define MAP_TOP_BITS INTERIOR_BITS
+#define MAP_TOP_LENGTH (1 << MAP_TOP_BITS)
+#define MAP_TOP_MASK (MAP_TOP_LENGTH - 1)
+
+#define MAP_MID_BITS INTERIOR_BITS
+#define MAP_MID_LENGTH (1 << MAP_MID_BITS)
+#define MAP_MID_MASK (MAP_MID_LENGTH - 1)
+
+#define MAP_BOT_BITS (ADDRESS_BITS - ARENA_BITS - 2*INTERIOR_BITS)
+#define MAP_BOT_LENGTH (1 << MAP_BOT_BITS)
+#define MAP_BOT_MASK (MAP_BOT_LENGTH - 1)
+
+#define MAP_BOT_SHIFT ARENA_BITS
+#define MAP_MID_SHIFT (MAP_BOT_BITS + MAP_BOT_SHIFT)
+#define MAP_TOP_SHIFT (MAP_MID_BITS + MAP_MID_SHIFT)
+
+#define AS_UINT(p) ((uintptr_t)(p))
+#define MAP_BOT_INDEX(p) ((AS_UINT(p) >> MAP_BOT_SHIFT) & MAP_BOT_MASK)
+#define MAP_MID_INDEX(p) ((AS_UINT(p) >> MAP_MID_SHIFT) & MAP_MID_MASK)
+#define MAP_TOP_INDEX(p) ((AS_UINT(p) >> MAP_TOP_SHIFT) & MAP_TOP_MASK)
+
+#if IGNORE_BITS > 0
+/* Return the ignored part of the pointer address.  Those bits should be the same
+ * for all valid pointers if IGNORE_BITS is set correctly.
+ */
+#define HIGH_BITS(p) (AS_UINT(p) >> ADDRESS_BITS)
+#else
+#define HIGH_BITS(p) 0
+#endif
+
+
+/* This is the leaf of the radix tree.  See arena_map_mark_used() for the
+ * meaning of these members. */
+typedef struct {
+    int32_t tail_hi;
+    int32_t tail_lo;
+} arena_coverage_t;
+
+typedef struct arena_map_bot {
+    /* The members tail_hi and tail_lo are accessed together.  So, it is
+     * better to have them as an array of structs, rather than two
+     * arrays.
+     */
+    arena_coverage_t arenas[MAP_BOT_LENGTH];  /* one entry per MAP_BOT_INDEX value */
+} arena_map_bot_t;
+
+#ifdef USE_INTERIOR_NODES
+typedef struct arena_map_mid {
+    struct arena_map_bot *ptrs[MAP_MID_LENGTH];
+} arena_map_mid_t;
+
+typedef struct arena_map_top {
+    struct arena_map_mid *ptrs[MAP_TOP_LENGTH];
+} arena_map_top_t;
+#endif
+
+struct _obmalloc_usage {
+    /* The root of the radix tree.  Note that by initializing like this, the
+     * memory should be in the BSS.  The OS will only memory map pages as the
+     * MAP_MID nodes get used (OS pages are demand loaded as needed).
+     */
+#ifdef USE_INTERIOR_NODES
+    arena_map_top_t arena_map_root;  /* three levels: top -> mid -> bot */
+    /* accounting for number of used interior nodes */
+    int arena_map_mid_count;
+    int arena_map_bot_count;
+#else
+    arena_map_bot_t arena_map_root;  /* single level: the root is the leaf array itself */
+#endif
+};
+
+#endif /* WITH_PYMALLOC_RADIX_TREE */
+
+
+struct _obmalloc_global_state {
+    int dump_debug_stats;  /* set to -1 by _obmalloc_global_state_INIT */
+    Py_ssize_t interpreter_leaks;  /* omitted from _obmalloc_global_state_INIT, so zero when that initializer is used */
+};
+
+struct _obmalloc_state {
+    struct _obmalloc_pools pools;
+    struct _obmalloc_mgmt mgmt;
+#if WITH_PYMALLOC_RADIX_TREE
+    struct _obmalloc_usage usage;
+#endif
+};
+
+
+#undef  uint
+
+
+/* Allocate memory directly from the O/S virtual memory system,
+ * where supported. Otherwise fallback on malloc */
+void *_PyObject_VirtualAlloc(size_t size);
+void _PyObject_VirtualFree(void *, size_t size);
+
+
+/* This function returns the number of allocated memory blocks, regardless of size */
+extern Py_ssize_t _Py_GetGlobalAllocatedBlocks(void);
+#define _Py_GetAllocatedBlocks() \
+    _Py_GetGlobalAllocatedBlocks()
+extern Py_ssize_t _PyInterpreterState_GetAllocatedBlocks(PyInterpreterState *);
+extern void _PyInterpreterState_FinalizeAllocatedBlocks(PyInterpreterState *);
+
+
+#ifdef WITH_PYMALLOC
+// Export the symbol for the 3rd party guppy3 project
+PyAPI_FUNC(int) _PyObject_DebugMallocStats(FILE *out);
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // !Py_INTERNAL_OBMALLOC_H
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_obmalloc_init.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_obmalloc_init.h
new file mode 100644
index 000000000000..8ee72ff2d412
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_obmalloc_init.h
@@ -0,0 +1,73 @@
+#ifndef Py_INTERNAL_OBMALLOC_INIT_H
+#define Py_INTERNAL_OBMALLOC_INIT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+/****************************************************/
+/* the default object allocator's state initializer */
+
+#define PTA(pools, x) \
+    ((poolp )((uint8_t *)&(pools.used[2*(x)]) - 2*sizeof(pymem_block *)))
+#define PT(p, x)   PTA(p, x), PTA(p, x)
+
+#define PT_8(p, start) \
+    PT(p, start), \
+    PT(p, start+1), \
+    PT(p, start+2), \
+    PT(p, start+3), \
+    PT(p, start+4), \
+    PT(p, start+5), \
+    PT(p, start+6), \
+    PT(p, start+7)
+
+#if NB_SMALL_SIZE_CLASSES <= 8
+#  define _obmalloc_pools_INIT(p) \
+    { PT_8(p, 0) }
+#elif NB_SMALL_SIZE_CLASSES <= 16
+#  define _obmalloc_pools_INIT(p) \
+    { PT_8(p, 0), PT_8(p, 8) }
+#elif NB_SMALL_SIZE_CLASSES <= 24
+#  define _obmalloc_pools_INIT(p) \
+    { PT_8(p, 0), PT_8(p, 8), PT_8(p, 16) }
+#elif NB_SMALL_SIZE_CLASSES <= 32
+#  define _obmalloc_pools_INIT(p) \
+    { PT_8(p, 0), PT_8(p, 8), PT_8(p, 16), PT_8(p, 24) }
+#elif NB_SMALL_SIZE_CLASSES <= 40
+#  define _obmalloc_pools_INIT(p) \
+    { PT_8(p, 0), PT_8(p, 8), PT_8(p, 16), PT_8(p, 24), PT_8(p, 32) }
+#elif NB_SMALL_SIZE_CLASSES <= 48
+#  define _obmalloc_pools_INIT(p) \
+    { PT_8(p, 0), PT_8(p, 8), PT_8(p, 16), PT_8(p, 24), PT_8(p, 32), PT_8(p, 40) }
+#elif NB_SMALL_SIZE_CLASSES <= 56
+#  define _obmalloc_pools_INIT(p) \
+    { PT_8(p, 0), PT_8(p, 8), PT_8(p, 16), PT_8(p, 24), PT_8(p, 32), PT_8(p, 40), PT_8(p, 48) }
+#elif NB_SMALL_SIZE_CLASSES <= 64
+#  define _obmalloc_pools_INIT(p) \
+    { PT_8(p, 0), PT_8(p, 8), PT_8(p, 16), PT_8(p, 24), PT_8(p, 32), PT_8(p, 40), PT_8(p, 48), PT_8(p, 56) }
+#else
+#  error "NB_SMALL_SIZE_CLASSES should be less than 64"
+#endif
+
+#define _obmalloc_global_state_INIT \
+    { \
+        .dump_debug_stats = -1, \
+    }
+
+#define _obmalloc_state_INIT(obmalloc) \
+    { \
+        .pools = { \
+            .used = _obmalloc_pools_INIT(obmalloc.pools), \
+        }, \
+    }
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // !Py_INTERNAL_OBMALLOC_INIT_H
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_opcode.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_opcode.h
new file mode 100644
index 000000000000..15d96503830f
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_opcode.h
@@ -0,0 +1,587 @@
+// Auto-generated by Tools/build/generate_opcode_h.py from Lib/opcode.py
+
+#ifndef Py_INTERNAL_OPCODE_H
+#define Py_INTERNAL_OPCODE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "opcode.h"
+
+extern const uint32_t _PyOpcode_Jump[9];
+
+extern const uint8_t _PyOpcode_Caches[256];
+
+extern const uint8_t _PyOpcode_Deopt[256];
+
+#ifdef NEED_OPCODE_TABLES
+const uint32_t _PyOpcode_Jump[9] = {
+    0U,
+    0U,
+    536870912U,
+    135020544U,
+    4163U,
+    0U,
+    0U,
+    0U,
+    48U,
+};
+
+const uint8_t _PyOpcode_Caches[256] = {
+    [BINARY_SUBSCR] = 1,
+    [STORE_SUBSCR] = 1,
+    [UNPACK_SEQUENCE] = 1,
+    [FOR_ITER] = 1,
+    [STORE_ATTR] = 4,
+    [LOAD_ATTR] = 9,
+    [COMPARE_OP] = 1,
+    [LOAD_GLOBAL] = 4,
+    [BINARY_OP] = 1,
+    [SEND] = 1,
+    [LOAD_SUPER_ATTR] = 1,
+    [CALL] = 3,
+};
+
+const uint8_t _PyOpcode_Deopt[256] = {
+    [BEFORE_ASYNC_WITH] = BEFORE_ASYNC_WITH,
+    [BEFORE_WITH] = BEFORE_WITH,
+    [BINARY_OP] = BINARY_OP,
+    [BINARY_OP_ADD_FLOAT] = BINARY_OP,
+    [BINARY_OP_ADD_INT] = BINARY_OP,
+    [BINARY_OP_ADD_UNICODE] = BINARY_OP,
+    [BINARY_OP_INPLACE_ADD_UNICODE] = BINARY_OP,
+    [BINARY_OP_MULTIPLY_FLOAT] = BINARY_OP,
+    [BINARY_OP_MULTIPLY_INT] = BINARY_OP,
+    [BINARY_OP_SUBTRACT_FLOAT] = BINARY_OP,
+    [BINARY_OP_SUBTRACT_INT] = BINARY_OP,
+    [BINARY_SLICE] = BINARY_SLICE,
+    [BINARY_SUBSCR] = BINARY_SUBSCR,
+    [BINARY_SUBSCR_DICT] = BINARY_SUBSCR,
+    [BINARY_SUBSCR_GETITEM] = BINARY_SUBSCR,
+    [BINARY_SUBSCR_LIST_INT] = BINARY_SUBSCR,
+    [BINARY_SUBSCR_TUPLE_INT] = BINARY_SUBSCR,
+    [BUILD_CONST_KEY_MAP] = BUILD_CONST_KEY_MAP,
+    [BUILD_LIST] = BUILD_LIST,
+    [BUILD_MAP] = BUILD_MAP,
+    [BUILD_SET] = BUILD_SET,
+    [BUILD_SLICE] = BUILD_SLICE,
+    [BUILD_STRING] = BUILD_STRING,
+    [BUILD_TUPLE] = BUILD_TUPLE,
+    [CACHE] = CACHE,
+    [CALL] = CALL,
+    [CALL_BOUND_METHOD_EXACT_ARGS] = CALL,
+    [CALL_BUILTIN_CLASS] = CALL,
+    [CALL_BUILTIN_FAST_WITH_KEYWORDS] = CALL,
+    [CALL_FUNCTION_EX] = CALL_FUNCTION_EX,
+    [CALL_INTRINSIC_1] = CALL_INTRINSIC_1,
+    [CALL_INTRINSIC_2] = CALL_INTRINSIC_2,
+    [CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS] = CALL,
+    [CALL_NO_KW_BUILTIN_FAST] = CALL,
+    [CALL_NO_KW_BUILTIN_O] = CALL,
+    [CALL_NO_KW_ISINSTANCE] = CALL,
+    [CALL_NO_KW_LEN] = CALL,
+    [CALL_NO_KW_LIST_APPEND] = CALL,
+    [CALL_NO_KW_METHOD_DESCRIPTOR_FAST] = CALL,
+    [CALL_NO_KW_METHOD_DESCRIPTOR_NOARGS] = CALL,
+    [CALL_NO_KW_METHOD_DESCRIPTOR_O] = CALL,
+    [CALL_NO_KW_STR_1] = CALL,
+    [CALL_NO_KW_TUPLE_1] = CALL,
+    [CALL_NO_KW_TYPE_1] = CALL,
+    [CALL_PY_EXACT_ARGS] = CALL,
+    [CALL_PY_WITH_DEFAULTS] = CALL,
+    [CHECK_EG_MATCH] = CHECK_EG_MATCH,
+    [CHECK_EXC_MATCH] = CHECK_EXC_MATCH,
+    [CLEANUP_THROW] = CLEANUP_THROW,
+    [COMPARE_OP] = COMPARE_OP,
+    [COMPARE_OP_FLOAT] = COMPARE_OP,
+    [COMPARE_OP_INT] = COMPARE_OP,
+    [COMPARE_OP_STR] = COMPARE_OP,
+    [CONTAINS_OP] = CONTAINS_OP,
+    [COPY] = COPY,
+    [COPY_FREE_VARS] = COPY_FREE_VARS,
+    [DELETE_ATTR] = DELETE_ATTR,
+    [DELETE_DEREF] = DELETE_DEREF,
+    [DELETE_FAST] = DELETE_FAST,
+    [DELETE_GLOBAL] = DELETE_GLOBAL,
+    [DELETE_NAME] = DELETE_NAME,
+    [DELETE_SUBSCR] = DELETE_SUBSCR,
+    [DICT_MERGE] = DICT_MERGE,
+    [DICT_UPDATE] = DICT_UPDATE,
+    [END_ASYNC_FOR] = END_ASYNC_FOR,
+    [END_FOR] = END_FOR,
+    [END_SEND] = END_SEND,
+    [EXTENDED_ARG] = EXTENDED_ARG,
+    [FORMAT_VALUE] = FORMAT_VALUE,
+    [FOR_ITER] = FOR_ITER,
+    [FOR_ITER_GEN] = FOR_ITER,
+    [FOR_ITER_LIST] = FOR_ITER,
+    [FOR_ITER_RANGE] = FOR_ITER,
+    [FOR_ITER_TUPLE] = FOR_ITER,
+    [GET_AITER] = GET_AITER,
+    [GET_ANEXT] = GET_ANEXT,
+    [GET_AWAITABLE] = GET_AWAITABLE,
+    [GET_ITER] = GET_ITER,
+    [GET_LEN] = GET_LEN,
+    [GET_YIELD_FROM_ITER] = GET_YIELD_FROM_ITER,
+    [IMPORT_FROM] = IMPORT_FROM,
+    [IMPORT_NAME] = IMPORT_NAME,
+    [INSTRUMENTED_CALL] = INSTRUMENTED_CALL,
+    [INSTRUMENTED_CALL_FUNCTION_EX] = INSTRUMENTED_CALL_FUNCTION_EX,
+    [INSTRUMENTED_END_FOR] = INSTRUMENTED_END_FOR,
+    [INSTRUMENTED_END_SEND] = INSTRUMENTED_END_SEND,
+    [INSTRUMENTED_FOR_ITER] = INSTRUMENTED_FOR_ITER,
+    [INSTRUMENTED_INSTRUCTION] = INSTRUMENTED_INSTRUCTION,
+    [INSTRUMENTED_JUMP_BACKWARD] = INSTRUMENTED_JUMP_BACKWARD,
+    [INSTRUMENTED_JUMP_FORWARD] = INSTRUMENTED_JUMP_FORWARD,
+    [INSTRUMENTED_LINE] = INSTRUMENTED_LINE,
+    [INSTRUMENTED_LOAD_SUPER_ATTR] = INSTRUMENTED_LOAD_SUPER_ATTR,
+    [INSTRUMENTED_POP_JUMP_IF_FALSE] = INSTRUMENTED_POP_JUMP_IF_FALSE,
+    [INSTRUMENTED_POP_JUMP_IF_NONE] = INSTRUMENTED_POP_JUMP_IF_NONE,
+    [INSTRUMENTED_POP_JUMP_IF_NOT_NONE] = INSTRUMENTED_POP_JUMP_IF_NOT_NONE,
+    [INSTRUMENTED_POP_JUMP_IF_TRUE] = INSTRUMENTED_POP_JUMP_IF_TRUE,
+    [INSTRUMENTED_RESUME] = INSTRUMENTED_RESUME,
+    [INSTRUMENTED_RETURN_CONST] = INSTRUMENTED_RETURN_CONST,
+    [INSTRUMENTED_RETURN_VALUE] = INSTRUMENTED_RETURN_VALUE,
+    [INSTRUMENTED_YIELD_VALUE] = INSTRUMENTED_YIELD_VALUE,
+    [INTERPRETER_EXIT] = INTERPRETER_EXIT,
+    [IS_OP] = IS_OP,
+    [JUMP_BACKWARD] = JUMP_BACKWARD,
+    [JUMP_BACKWARD_NO_INTERRUPT] = JUMP_BACKWARD_NO_INTERRUPT,
+    [JUMP_FORWARD] = JUMP_FORWARD,
+    [KW_NAMES] = KW_NAMES,
+    [LIST_APPEND] = LIST_APPEND,
+    [LIST_EXTEND] = LIST_EXTEND,
+    [LOAD_ASSERTION_ERROR] = LOAD_ASSERTION_ERROR,
+    [LOAD_ATTR] = LOAD_ATTR,
+    [LOAD_ATTR_CLASS] = LOAD_ATTR,
+    [LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN] = LOAD_ATTR,
+    [LOAD_ATTR_INSTANCE_VALUE] = LOAD_ATTR,
+    [LOAD_ATTR_METHOD_LAZY_DICT] = LOAD_ATTR,
+    [LOAD_ATTR_METHOD_NO_DICT] = LOAD_ATTR,
+    [LOAD_ATTR_METHOD_WITH_VALUES] = LOAD_ATTR,
+    [LOAD_ATTR_MODULE] = LOAD_ATTR,
+    [LOAD_ATTR_PROPERTY] = LOAD_ATTR,
+    [LOAD_ATTR_SLOT] = LOAD_ATTR,
+    [LOAD_ATTR_WITH_HINT] = LOAD_ATTR,
+    [LOAD_BUILD_CLASS] = LOAD_BUILD_CLASS,
+    [LOAD_CLOSURE] = LOAD_CLOSURE,
+    [LOAD_CONST] = LOAD_CONST,
+    [LOAD_CONST__LOAD_FAST] = LOAD_CONST,
+    [LOAD_DEREF] = LOAD_DEREF,
+    [LOAD_FAST] = LOAD_FAST,
+    [LOAD_FAST_AND_CLEAR] = LOAD_FAST_AND_CLEAR,
+    [LOAD_FAST_CHECK] = LOAD_FAST_CHECK,
+    [LOAD_FAST__LOAD_CONST] = LOAD_FAST,
+    [LOAD_FAST__LOAD_FAST] = LOAD_FAST,
+    [LOAD_FROM_DICT_OR_DEREF] = LOAD_FROM_DICT_OR_DEREF,
+    [LOAD_FROM_DICT_OR_GLOBALS] = LOAD_FROM_DICT_OR_GLOBALS,
+    [LOAD_GLOBAL] = LOAD_GLOBAL,
+    [LOAD_GLOBAL_BUILTIN] = LOAD_GLOBAL,
+    [LOAD_GLOBAL_MODULE] = LOAD_GLOBAL,
+    [LOAD_LOCALS] = LOAD_LOCALS,
+    [LOAD_NAME] = LOAD_NAME,
+    [LOAD_SUPER_ATTR] = LOAD_SUPER_ATTR,
+    [LOAD_SUPER_ATTR_ATTR] = LOAD_SUPER_ATTR,
+    [LOAD_SUPER_ATTR_METHOD] = LOAD_SUPER_ATTR,
+    [MAKE_CELL] = MAKE_CELL,
+    [MAKE_FUNCTION] = MAKE_FUNCTION,
+    [MAP_ADD] = MAP_ADD,
+    [MATCH_CLASS] = MATCH_CLASS,
+    [MATCH_KEYS] = MATCH_KEYS,
+    [MATCH_MAPPING] = MATCH_MAPPING,
+    [MATCH_SEQUENCE] = MATCH_SEQUENCE,
+    [NOP] = NOP,
+    [POP_EXCEPT] = POP_EXCEPT,
+    [POP_JUMP_IF_FALSE] = POP_JUMP_IF_FALSE,
+    [POP_JUMP_IF_NONE] = POP_JUMP_IF_NONE,
+    [POP_JUMP_IF_NOT_NONE] = POP_JUMP_IF_NOT_NONE,
+    [POP_JUMP_IF_TRUE] = POP_JUMP_IF_TRUE,
+    [POP_TOP] = POP_TOP,
+    [PUSH_EXC_INFO] = PUSH_EXC_INFO,
+    [PUSH_NULL] = PUSH_NULL,
+    [RAISE_VARARGS] = RAISE_VARARGS,
+    [RERAISE] = RERAISE,
+    [RESERVED] = RESERVED,
+    [RESUME] = RESUME,
+    [RETURN_CONST] = RETURN_CONST,
+    [RETURN_GENERATOR] = RETURN_GENERATOR,
+    [RETURN_VALUE] = RETURN_VALUE,
+    [SEND] = SEND,
+    [SEND_GEN] = SEND,
+    [SETUP_ANNOTATIONS] = SETUP_ANNOTATIONS,
+    [SET_ADD] = SET_ADD,
+    [SET_UPDATE] = SET_UPDATE,
+    [STORE_ATTR] = STORE_ATTR,
+    [STORE_ATTR_INSTANCE_VALUE] = STORE_ATTR,
+    [STORE_ATTR_SLOT] = STORE_ATTR,
+    [STORE_ATTR_WITH_HINT] = STORE_ATTR,
+    [STORE_DEREF] = STORE_DEREF,
+    [STORE_FAST] = STORE_FAST,
+    [STORE_FAST__LOAD_FAST] = STORE_FAST,
+    [STORE_FAST__STORE_FAST] = STORE_FAST,
+    [STORE_GLOBAL] = STORE_GLOBAL,
+    [STORE_NAME] = STORE_NAME,
+    [STORE_SLICE] = STORE_SLICE,
+    [STORE_SUBSCR] = STORE_SUBSCR,
+    [STORE_SUBSCR_DICT] = STORE_SUBSCR,
+    [STORE_SUBSCR_LIST_INT] = STORE_SUBSCR,
+    [SWAP] = SWAP,
+    [UNARY_INVERT] = UNARY_INVERT,
+    [UNARY_NEGATIVE] = UNARY_NEGATIVE,
+    [UNARY_NOT] = UNARY_NOT,
+    [UNPACK_EX] = UNPACK_EX,
+    [UNPACK_SEQUENCE] = UNPACK_SEQUENCE,
+    [UNPACK_SEQUENCE_LIST] = UNPACK_SEQUENCE,
+    [UNPACK_SEQUENCE_TUPLE] = UNPACK_SEQUENCE,
+    [UNPACK_SEQUENCE_TWO_TUPLE] = UNPACK_SEQUENCE,
+    [WITH_EXCEPT_START] = WITH_EXCEPT_START,
+    [YIELD_VALUE] = YIELD_VALUE,
+};
+#endif   // NEED_OPCODE_TABLES
+
+#ifdef Py_DEBUG
+static const char *const _PyOpcode_OpName[267] = {
+    [CACHE] = "CACHE",
+    [POP_TOP] = "POP_TOP",
+    [PUSH_NULL] = "PUSH_NULL",
+    [INTERPRETER_EXIT] = "INTERPRETER_EXIT",
+    [END_FOR] = "END_FOR",
+    [END_SEND] = "END_SEND",
+    [BINARY_OP_ADD_FLOAT] = "BINARY_OP_ADD_FLOAT",
+    [BINARY_OP_ADD_INT] = "BINARY_OP_ADD_INT",
+    [BINARY_OP_ADD_UNICODE] = "BINARY_OP_ADD_UNICODE",
+    [NOP] = "NOP",
+    [BINARY_OP_INPLACE_ADD_UNICODE] = "BINARY_OP_INPLACE_ADD_UNICODE",
+    [UNARY_NEGATIVE] = "UNARY_NEGATIVE",
+    [UNARY_NOT] = "UNARY_NOT",
+    [BINARY_OP_MULTIPLY_FLOAT] = "BINARY_OP_MULTIPLY_FLOAT",
+    [BINARY_OP_MULTIPLY_INT] = "BINARY_OP_MULTIPLY_INT",
+    [UNARY_INVERT] = "UNARY_INVERT",
+    [BINARY_OP_SUBTRACT_FLOAT] = "BINARY_OP_SUBTRACT_FLOAT",
+    [RESERVED] = "RESERVED",
+    [BINARY_OP_SUBTRACT_INT] = "BINARY_OP_SUBTRACT_INT",
+    [BINARY_SUBSCR_DICT] = "BINARY_SUBSCR_DICT",
+    [BINARY_SUBSCR_GETITEM] = "BINARY_SUBSCR_GETITEM",
+    [BINARY_SUBSCR_LIST_INT] = "BINARY_SUBSCR_LIST_INT",
+    [BINARY_SUBSCR_TUPLE_INT] = "BINARY_SUBSCR_TUPLE_INT",
+    [CALL_PY_EXACT_ARGS] = "CALL_PY_EXACT_ARGS",
+    [CALL_PY_WITH_DEFAULTS] = "CALL_PY_WITH_DEFAULTS",
+    [BINARY_SUBSCR] = "BINARY_SUBSCR",
+    [BINARY_SLICE] = "BINARY_SLICE",
+    [STORE_SLICE] = "STORE_SLICE",
+    [CALL_BOUND_METHOD_EXACT_ARGS] = "CALL_BOUND_METHOD_EXACT_ARGS",
+    [CALL_BUILTIN_CLASS] = "CALL_BUILTIN_CLASS",
+    [GET_LEN] = "GET_LEN",
+    [MATCH_MAPPING] = "MATCH_MAPPING",
+    [MATCH_SEQUENCE] = "MATCH_SEQUENCE",
+    [MATCH_KEYS] = "MATCH_KEYS",
+    [CALL_BUILTIN_FAST_WITH_KEYWORDS] = "CALL_BUILTIN_FAST_WITH_KEYWORDS",
+    [PUSH_EXC_INFO] = "PUSH_EXC_INFO",
+    [CHECK_EXC_MATCH] = "CHECK_EXC_MATCH",
+    [CHECK_EG_MATCH] = "CHECK_EG_MATCH",
+    [CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS] = "CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS",
+    [CALL_NO_KW_BUILTIN_FAST] = "CALL_NO_KW_BUILTIN_FAST",
+    [CALL_NO_KW_BUILTIN_O] = "CALL_NO_KW_BUILTIN_O",
+    [CALL_NO_KW_ISINSTANCE] = "CALL_NO_KW_ISINSTANCE",
+    [CALL_NO_KW_LEN] = "CALL_NO_KW_LEN",
+    [CALL_NO_KW_LIST_APPEND] = "CALL_NO_KW_LIST_APPEND",
+    [CALL_NO_KW_METHOD_DESCRIPTOR_FAST] = "CALL_NO_KW_METHOD_DESCRIPTOR_FAST",
+    [CALL_NO_KW_METHOD_DESCRIPTOR_NOARGS] = "CALL_NO_KW_METHOD_DESCRIPTOR_NOARGS",
+    [CALL_NO_KW_METHOD_DESCRIPTOR_O] = "CALL_NO_KW_METHOD_DESCRIPTOR_O",
+    [CALL_NO_KW_STR_1] = "CALL_NO_KW_STR_1",
+    [CALL_NO_KW_TUPLE_1] = "CALL_NO_KW_TUPLE_1",
+    [WITH_EXCEPT_START] = "WITH_EXCEPT_START",
+    [GET_AITER] = "GET_AITER",
+    [GET_ANEXT] = "GET_ANEXT",
+    [BEFORE_ASYNC_WITH] = "BEFORE_ASYNC_WITH",
+    [BEFORE_WITH] = "BEFORE_WITH",
+    [END_ASYNC_FOR] = "END_ASYNC_FOR",
+    [CLEANUP_THROW] = "CLEANUP_THROW",
+    [CALL_NO_KW_TYPE_1] = "CALL_NO_KW_TYPE_1",
+    [COMPARE_OP_FLOAT] = "COMPARE_OP_FLOAT",
+    [COMPARE_OP_INT] = "COMPARE_OP_INT",
+    [COMPARE_OP_STR] = "COMPARE_OP_STR",
+    [STORE_SUBSCR] = "STORE_SUBSCR",
+    [DELETE_SUBSCR] = "DELETE_SUBSCR",
+    [FOR_ITER_LIST] = "FOR_ITER_LIST",
+    [FOR_ITER_TUPLE] = "FOR_ITER_TUPLE",
+    [FOR_ITER_RANGE] = "FOR_ITER_RANGE",
+    [FOR_ITER_GEN] = "FOR_ITER_GEN",
+    [LOAD_SUPER_ATTR_ATTR] = "LOAD_SUPER_ATTR_ATTR",
+    [LOAD_SUPER_ATTR_METHOD] = "LOAD_SUPER_ATTR_METHOD",
+    [GET_ITER] = "GET_ITER",
+    [GET_YIELD_FROM_ITER] = "GET_YIELD_FROM_ITER",
+    [LOAD_ATTR_CLASS] = "LOAD_ATTR_CLASS",
+    [LOAD_BUILD_CLASS] = "LOAD_BUILD_CLASS",
+    [LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN] = "LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN",
+    [LOAD_ATTR_INSTANCE_VALUE] = "LOAD_ATTR_INSTANCE_VALUE",
+    [LOAD_ASSERTION_ERROR] = "LOAD_ASSERTION_ERROR",
+    [RETURN_GENERATOR] = "RETURN_GENERATOR",
+    [LOAD_ATTR_MODULE] = "LOAD_ATTR_MODULE",
+    [LOAD_ATTR_PROPERTY] = "LOAD_ATTR_PROPERTY",
+    [LOAD_ATTR_SLOT] = "LOAD_ATTR_SLOT",
+    [LOAD_ATTR_WITH_HINT] = "LOAD_ATTR_WITH_HINT",
+    [LOAD_ATTR_METHOD_LAZY_DICT] = "LOAD_ATTR_METHOD_LAZY_DICT",
+    [LOAD_ATTR_METHOD_NO_DICT] = "LOAD_ATTR_METHOD_NO_DICT",
+    [LOAD_ATTR_METHOD_WITH_VALUES] = "LOAD_ATTR_METHOD_WITH_VALUES",
+    [RETURN_VALUE] = "RETURN_VALUE",
+    [LOAD_CONST__LOAD_FAST] = "LOAD_CONST__LOAD_FAST",
+    [SETUP_ANNOTATIONS] = "SETUP_ANNOTATIONS",
+    [LOAD_FAST__LOAD_CONST] = "LOAD_FAST__LOAD_CONST",
+    [LOAD_LOCALS] = "LOAD_LOCALS",
+    [LOAD_FAST__LOAD_FAST] = "LOAD_FAST__LOAD_FAST",
+    [POP_EXCEPT] = "POP_EXCEPT",
+    [STORE_NAME] = "STORE_NAME",
+    [DELETE_NAME] = "DELETE_NAME",
+    [UNPACK_SEQUENCE] = "UNPACK_SEQUENCE",
+    [FOR_ITER] = "FOR_ITER",
+    [UNPACK_EX] = "UNPACK_EX",
+    [STORE_ATTR] = "STORE_ATTR",
+    [DELETE_ATTR] = "DELETE_ATTR",
+    [STORE_GLOBAL] = "STORE_GLOBAL",
+    [DELETE_GLOBAL] = "DELETE_GLOBAL",
+    [SWAP] = "SWAP",
+    [LOAD_CONST] = "LOAD_CONST",
+    [LOAD_NAME] = "LOAD_NAME",
+    [BUILD_TUPLE] = "BUILD_TUPLE",
+    [BUILD_LIST] = "BUILD_LIST",
+    [BUILD_SET] = "BUILD_SET",
+    [BUILD_MAP] = "BUILD_MAP",
+    [LOAD_ATTR] = "LOAD_ATTR",
+    [COMPARE_OP] = "COMPARE_OP",
+    [IMPORT_NAME] = "IMPORT_NAME",
+    [IMPORT_FROM] = "IMPORT_FROM",
+    [JUMP_FORWARD] = "JUMP_FORWARD",
+    [LOAD_GLOBAL_BUILTIN] = "LOAD_GLOBAL_BUILTIN",
+    [LOAD_GLOBAL_MODULE] = "LOAD_GLOBAL_MODULE",
+    [STORE_ATTR_INSTANCE_VALUE] = "STORE_ATTR_INSTANCE_VALUE",
+    [POP_JUMP_IF_FALSE] = "POP_JUMP_IF_FALSE",
+    [POP_JUMP_IF_TRUE] = "POP_JUMP_IF_TRUE",
+    [LOAD_GLOBAL] = "LOAD_GLOBAL",
+    [IS_OP] = "IS_OP",
+    [CONTAINS_OP] = "CONTAINS_OP",
+    [RERAISE] = "RERAISE",
+    [COPY] = "COPY",
+    [RETURN_CONST] = "RETURN_CONST",
+    [BINARY_OP] = "BINARY_OP",
+    [SEND] = "SEND",
+    [LOAD_FAST] = "LOAD_FAST",
+    [STORE_FAST] = "STORE_FAST",
+    [DELETE_FAST] = "DELETE_FAST",
+    [LOAD_FAST_CHECK] = "LOAD_FAST_CHECK",
+    [POP_JUMP_IF_NOT_NONE] = "POP_JUMP_IF_NOT_NONE",
+    [POP_JUMP_IF_NONE] = "POP_JUMP_IF_NONE",
+    [RAISE_VARARGS] = "RAISE_VARARGS",
+    [GET_AWAITABLE] = "GET_AWAITABLE",
+    [MAKE_FUNCTION] = "MAKE_FUNCTION",
+    [BUILD_SLICE] = "BUILD_SLICE",
+    [JUMP_BACKWARD_NO_INTERRUPT] = "JUMP_BACKWARD_NO_INTERRUPT",
+    [MAKE_CELL] = "MAKE_CELL",
+    [LOAD_CLOSURE] = "LOAD_CLOSURE",
+    [LOAD_DEREF] = "LOAD_DEREF",
+    [STORE_DEREF] = "STORE_DEREF",
+    [DELETE_DEREF] = "DELETE_DEREF",
+    [JUMP_BACKWARD] = "JUMP_BACKWARD",
+    [LOAD_SUPER_ATTR] = "LOAD_SUPER_ATTR",
+    [CALL_FUNCTION_EX] = "CALL_FUNCTION_EX",
+    [LOAD_FAST_AND_CLEAR] = "LOAD_FAST_AND_CLEAR",
+    [EXTENDED_ARG] = "EXTENDED_ARG",
+    [LIST_APPEND] = "LIST_APPEND",
+    [SET_ADD] = "SET_ADD",
+    [MAP_ADD] = "MAP_ADD",
+    [STORE_ATTR_SLOT] = "STORE_ATTR_SLOT",
+    [COPY_FREE_VARS] = "COPY_FREE_VARS",
+    [YIELD_VALUE] = "YIELD_VALUE",
+    [RESUME] = "RESUME",
+    [MATCH_CLASS] = "MATCH_CLASS",
+    [STORE_ATTR_WITH_HINT] = "STORE_ATTR_WITH_HINT",
+    [STORE_FAST__LOAD_FAST] = "STORE_FAST__LOAD_FAST",
+    [FORMAT_VALUE] = "FORMAT_VALUE",
+    [BUILD_CONST_KEY_MAP] = "BUILD_CONST_KEY_MAP",
+    [BUILD_STRING] = "BUILD_STRING",
+    [STORE_FAST__STORE_FAST] = "STORE_FAST__STORE_FAST",
+    [STORE_SUBSCR_DICT] = "STORE_SUBSCR_DICT",
+    [STORE_SUBSCR_LIST_INT] = "STORE_SUBSCR_LIST_INT",
+    [UNPACK_SEQUENCE_LIST] = "UNPACK_SEQUENCE_LIST",
+    [LIST_EXTEND] = "LIST_EXTEND",
+    [SET_UPDATE] = "SET_UPDATE",
+    [DICT_MERGE] = "DICT_MERGE",
+    [DICT_UPDATE] = "DICT_UPDATE",
+    [UNPACK_SEQUENCE_TUPLE] = "UNPACK_SEQUENCE_TUPLE",
+    [UNPACK_SEQUENCE_TWO_TUPLE] = "UNPACK_SEQUENCE_TWO_TUPLE",
+    [SEND_GEN] = "SEND_GEN",
+    [169] = "<169>",
+    [170] = "<170>",
+    [CALL] = "CALL",
+    [KW_NAMES] = "KW_NAMES",
+    [CALL_INTRINSIC_1] = "CALL_INTRINSIC_1",
+    [CALL_INTRINSIC_2] = "CALL_INTRINSIC_2",
+    [LOAD_FROM_DICT_OR_GLOBALS] = "LOAD_FROM_DICT_OR_GLOBALS",
+    [LOAD_FROM_DICT_OR_DEREF] = "LOAD_FROM_DICT_OR_DEREF",
+    [177] = "<177>",
+    [178] = "<178>",
+    [179] = "<179>",
+    [180] = "<180>",
+    [181] = "<181>",
+    [182] = "<182>",
+    [183] = "<183>",
+    [184] = "<184>",
+    [185] = "<185>",
+    [186] = "<186>",
+    [187] = "<187>",
+    [188] = "<188>",
+    [189] = "<189>",
+    [190] = "<190>",
+    [191] = "<191>",
+    [192] = "<192>",
+    [193] = "<193>",
+    [194] = "<194>",
+    [195] = "<195>",
+    [196] = "<196>",
+    [197] = "<197>",
+    [198] = "<198>",
+    [199] = "<199>",
+    [200] = "<200>",
+    [201] = "<201>",
+    [202] = "<202>",
+    [203] = "<203>",
+    [204] = "<204>",
+    [205] = "<205>",
+    [206] = "<206>",
+    [207] = "<207>",
+    [208] = "<208>",
+    [209] = "<209>",
+    [210] = "<210>",
+    [211] = "<211>",
+    [212] = "<212>",
+    [213] = "<213>",
+    [214] = "<214>",
+    [215] = "<215>",
+    [216] = "<216>",
+    [217] = "<217>",
+    [218] = "<218>",
+    [219] = "<219>",
+    [220] = "<220>",
+    [221] = "<221>",
+    [222] = "<222>",
+    [223] = "<223>",
+    [224] = "<224>",
+    [225] = "<225>",
+    [226] = "<226>",
+    [227] = "<227>",
+    [228] = "<228>",
+    [229] = "<229>",
+    [230] = "<230>",
+    [231] = "<231>",
+    [232] = "<232>",
+    [233] = "<233>",
+    [234] = "<234>",
+    [235] = "<235>",
+    [236] = "<236>",
+    [INSTRUMENTED_LOAD_SUPER_ATTR] = "INSTRUMENTED_LOAD_SUPER_ATTR",
+    [INSTRUMENTED_POP_JUMP_IF_NONE] = "INSTRUMENTED_POP_JUMP_IF_NONE",
+    [INSTRUMENTED_POP_JUMP_IF_NOT_NONE] = "INSTRUMENTED_POP_JUMP_IF_NOT_NONE",
+    [INSTRUMENTED_RESUME] = "INSTRUMENTED_RESUME",
+    [INSTRUMENTED_CALL] = "INSTRUMENTED_CALL",
+    [INSTRUMENTED_RETURN_VALUE] = "INSTRUMENTED_RETURN_VALUE",
+    [INSTRUMENTED_YIELD_VALUE] = "INSTRUMENTED_YIELD_VALUE",
+    [INSTRUMENTED_CALL_FUNCTION_EX] = "INSTRUMENTED_CALL_FUNCTION_EX",
+    [INSTRUMENTED_JUMP_FORWARD] = "INSTRUMENTED_JUMP_FORWARD",
+    [INSTRUMENTED_JUMP_BACKWARD] = "INSTRUMENTED_JUMP_BACKWARD",
+    [INSTRUMENTED_RETURN_CONST] = "INSTRUMENTED_RETURN_CONST",
+    [INSTRUMENTED_FOR_ITER] = "INSTRUMENTED_FOR_ITER",
+    [INSTRUMENTED_POP_JUMP_IF_FALSE] = "INSTRUMENTED_POP_JUMP_IF_FALSE",
+    [INSTRUMENTED_POP_JUMP_IF_TRUE] = "INSTRUMENTED_POP_JUMP_IF_TRUE",
+    [INSTRUMENTED_END_FOR] = "INSTRUMENTED_END_FOR",
+    [INSTRUMENTED_END_SEND] = "INSTRUMENTED_END_SEND",
+    [INSTRUMENTED_INSTRUCTION] = "INSTRUMENTED_INSTRUCTION",
+    [INSTRUMENTED_LINE] = "INSTRUMENTED_LINE",
+    [255] = "<255>",
+    [SETUP_FINALLY] = "SETUP_FINALLY",
+    [SETUP_CLEANUP] = "SETUP_CLEANUP",
+    [SETUP_WITH] = "SETUP_WITH",
+    [POP_BLOCK] = "POP_BLOCK",
+    [JUMP] = "JUMP",
+    [JUMP_NO_INTERRUPT] = "JUMP_NO_INTERRUPT",
+    [LOAD_METHOD] = "LOAD_METHOD",
+    [LOAD_SUPER_METHOD] = "LOAD_SUPER_METHOD",
+    [LOAD_ZERO_SUPER_METHOD] = "LOAD_ZERO_SUPER_METHOD",
+    [LOAD_ZERO_SUPER_ATTR] = "LOAD_ZERO_SUPER_ATTR",
+    [STORE_FAST_MAYBE_NULL] = "STORE_FAST_MAYBE_NULL",
+};
+#endif
+
+#define EXTRA_CASES \
+    case 169: \
+    case 170: \
+    case 177: \
+    case 178: \
+    case 179: \
+    case 180: \
+    case 181: \
+    case 182: \
+    case 183: \
+    case 184: \
+    case 185: \
+    case 186: \
+    case 187: \
+    case 188: \
+    case 189: \
+    case 190: \
+    case 191: \
+    case 192: \
+    case 193: \
+    case 194: \
+    case 195: \
+    case 196: \
+    case 197: \
+    case 198: \
+    case 199: \
+    case 200: \
+    case 201: \
+    case 202: \
+    case 203: \
+    case 204: \
+    case 205: \
+    case 206: \
+    case 207: \
+    case 208: \
+    case 209: \
+    case 210: \
+    case 211: \
+    case 212: \
+    case 213: \
+    case 214: \
+    case 215: \
+    case 216: \
+    case 217: \
+    case 218: \
+    case 219: \
+    case 220: \
+    case 221: \
+    case 222: \
+    case 223: \
+    case 224: \
+    case 225: \
+    case 226: \
+    case 227: \
+    case 228: \
+    case 229: \
+    case 230: \
+    case 231: \
+    case 232: \
+    case 233: \
+    case 234: \
+    case 235: \
+    case 236: \
+    case 255: \
+        ;
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // !Py_INTERNAL_OPCODE_H
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_opcode_utils.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_opcode_utils.h
new file mode 100644
index 000000000000..1d5ff988290b
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_opcode_utils.h
@@ -0,0 +1,92 @@
+#ifndef Py_INTERNAL_OPCODE_UTILS_H
+#define Py_INTERNAL_OPCODE_UTILS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_opcode.h"        // _PyOpcode_Jump
+
+
+#define MAX_REAL_OPCODE 254  /* upper bound checked by IS_WITHIN_OPCODE_RANGE */
+
+#define IS_WITHIN_OPCODE_RANGE(opcode)  /* 0..MAX_REAL_OPCODE, or a pseudo opcode */ \
+        (((opcode) >= 0 && (opcode) <= MAX_REAL_OPCODE) || \
+         IS_PSEUDO_OPCODE(opcode))
+
+#define IS_JUMP_OPCODE(opcode)  /* bit `opcode` is set in _PyOpcode_Jump */ \
+         is_bit_set_in_table(_PyOpcode_Jump, opcode)
+
+#define IS_BLOCK_PUSH_OPCODE(opcode)  /* one of the SETUP_* opcodes listed here */ \
+        ((opcode) == SETUP_FINALLY || \
+         (opcode) == SETUP_WITH || \
+         (opcode) == SETUP_CLEANUP)
+
+#define HAS_TARGET(opcode)  /* jump opcode or block-push opcode */ \
+        (IS_JUMP_OPCODE(opcode) || IS_BLOCK_PUSH_OPCODE(opcode))
+
+/* opcodes that must be last in the basicblock */
+#define IS_TERMINATOR_OPCODE(opcode) \
+        (IS_JUMP_OPCODE(opcode) || IS_SCOPE_EXIT_OPCODE(opcode))
+
+/* opcodes which are not emitted in codegen stage, only by the assembler */
+#define IS_ASSEMBLER_OPCODE(opcode) \
+        ((opcode) == JUMP_FORWARD || \
+         (opcode) == JUMP_BACKWARD || \
+         (opcode) == JUMP_BACKWARD_NO_INTERRUPT)
+
+#define IS_BACKWARDS_JUMP_OPCODE(opcode)  /* either JUMP_BACKWARD variant */ \
+        ((opcode) == JUMP_BACKWARD || \
+         (opcode) == JUMP_BACKWARD_NO_INTERRUPT)
+
+#define IS_UNCONDITIONAL_JUMP_OPCODE(opcode)  /* any of the five JUMP* opcodes below */ \
+        ((opcode) == JUMP || \
+         (opcode) == JUMP_NO_INTERRUPT || \
+         (opcode) == JUMP_FORWARD || \
+         (opcode) == JUMP_BACKWARD || \
+         (opcode) == JUMP_BACKWARD_NO_INTERRUPT)
+
+#define IS_SCOPE_EXIT_OPCODE(opcode)  /* the return and raise opcodes below */ \
+        ((opcode) == RETURN_VALUE || \
+         (opcode) == RETURN_CONST || \
+         (opcode) == RAISE_VARARGS || \
+         (opcode) == RERAISE)
+
+#define IS_SUPERINSTRUCTION_OPCODE(opcode)  /* the five X__Y opcodes below */ \
+        ((opcode) == LOAD_FAST__LOAD_FAST || \
+         (opcode) == LOAD_FAST__LOAD_CONST || \
+         (opcode) == LOAD_CONST__LOAD_FAST || \
+         (opcode) == STORE_FAST__LOAD_FAST || \
+         (opcode) == STORE_FAST__STORE_FAST)
+
+
+#define LOG_BITS_PER_INT 5
+#define MASK_LOW_LOG_BITS 31
+
+/* Return 1 if bit `bitindex` is set in the bit table `table`, else 0.
+ * `table` must hold at least 9 32-bit words (288 bits), which is the size
+ * of _PyOpcode_Jump; any index outside 0..287 yields 0. */
+static inline int
+is_bit_set_in_table(const uint32_t *table, int bitindex) {
+    /* Bound the index by the 9-word table size so that the lookup below
+     * can never read past the end of the array. */
+    if (bitindex < 0 || bitindex >= (9 << LOG_BITS_PER_INT)) {
+        return 0;
+    }
+    /* Word is indexed by (bitindex >> log2(bits per word)); the bit within
+     * the word is given by the low bits of bitindex. */
+    uint32_t word = table[bitindex >> LOG_BITS_PER_INT];
+    return (word >> (bitindex & MASK_LOW_LOG_BITS)) & 1;
+}
+
+#undef LOG_BITS_PER_INT
+#undef MASK_LOW_LOG_BITS
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_OPCODE_UTILS_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_parser.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_parser.h
new file mode 100644
index 000000000000..dd51b92801ae
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_parser.h
@@ -0,0 +1,66 @@
+#ifndef Py_INTERNAL_PARSER_H
+#define Py_INTERNAL_PARSER_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+#include "pycore_ast.h"             // struct _expr
+#include "pycore_global_strings.h"  // _Py_DECLARE_STR()
+#include "pycore_pyarena.h"         // PyArena
+
+
+#ifdef Py_DEBUG
+#define _PYPEGEN_NSTATISTICS 2000  /* length of memo_statistics below */
+#endif
+
+struct _parser_runtime_state {
+#ifdef Py_DEBUG
+    long memo_statistics[_PYPEGEN_NSTATISTICS];  /* present in Py_DEBUG builds only */
+#else
+    int _not_used;  /* takes the place of memo_statistics when Py_DEBUG is off */
+#endif
+    struct _expr dummy_name;  /* filled in by _parser_runtime_state_INIT */
+};
+
+_Py_DECLARE_STR(empty, "")
+#define _parser_runtime_state_INIT \
+    { /* static initializer: sets up dummy_name only */ \
+        .dummy_name = { /* a Name node whose id is the empty string, Load context */ \
+            .kind = Name_kind, \
+            .v.Name.id = &_Py_STR(empty), \
+            .v.Name.ctx = Load, \
+            .lineno = 1, /* location: line 1, column 0 for both start and end */ \
+            .col_offset = 0, \
+            .end_lineno = 1, \
+            .end_col_offset = 0, \
+        }, \
+    }
+
+extern struct _mod* _PyParser_ASTFromString(
+    const char *str,
+    PyObject* filename,
+    int mode,
+    PyCompilerFlags *flags,
+    PyArena *arena);
+
+extern struct _mod* _PyParser_ASTFromFile(
+    FILE *fp,
+    PyObject *filename_ob,
+    const char *enc,
+    int mode,
+    const char *ps1,
+    const char *ps2,
+    PyCompilerFlags *flags,
+    int *errcode,
+    PyArena *arena);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_PARSER_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_pathconfig.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_pathconfig.h
new file mode 100644
index 000000000000..b8deaa0c3eb0
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_pathconfig.h
@@ -0,0 +1,24 @@
+#ifndef Py_INTERNAL_PATHCONFIG_H
+#define Py_INTERNAL_PATHCONFIG_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+PyAPI_FUNC(void) _PyPathConfig_ClearGlobal(void);
+extern PyStatus _PyPathConfig_ReadGlobal(PyConfig *config);
+extern PyStatus _PyPathConfig_UpdateGlobal(const PyConfig *config);
+extern const wchar_t * _PyPathConfig_GetGlobalModuleSearchPath(void);
+
+extern int _PyPathConfig_ComputeSysPath0(
+    const PyWideStringList *argv,
+    PyObject **path0);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_PATHCONFIG_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_pyarena.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_pyarena.h
new file mode 100644
index 000000000000..d78972a88ca2
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_pyarena.h
@@ -0,0 +1,64 @@
+/* An arena-like memory interface for the compiler.
+ */
+
+#ifndef Py_INTERNAL_PYARENA_H
+#define Py_INTERNAL_PYARENA_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+typedef struct _arena PyArena;
+
+/* _PyArena_New() and _PyArena_Free() create a new arena and free it,
+   respectively.  Once an arena has been created, it can be used
+   to allocate memory via _PyArena_Malloc().  Pointers to PyObject can
+   also be registered with the arena via _PyArena_AddPyObject(), and the
+   arena will ensure that the PyObjects stay alive at least until
+   _PyArena_Free() is called.  When an arena is freed, all the memory it
+   allocated is freed, the arena releases internal references to registered
+   PyObject*, and none of its pointers are valid.
+   XXX (tim) What does "none of its pointers are valid" mean?  Does it
+   XXX mean that pointers previously obtained via _PyArena_Malloc() are
+   XXX no longer valid?  (That's clearly true, but not sure that's what
+   XXX the text is trying to say.)
+
+   _PyArena_New() returns an arena pointer.  On error, it
+   returns a negative number and sets an exception.
+   XXX (tim):  Not true.  On error, _PyArena_New() actually returns NULL,
+   XXX and looks like it may or may not set an exception (e.g., if the
+   XXX internal PyList_New(0) returns NULL, _PyArena_New() passes that on
+   XXX and an exception is set; OTOH, if the internal
+   XXX block_new(DEFAULT_BLOCK_SIZE) returns NULL, that's passed on but
+   XXX an exception is not set in that case).
+*/
+PyAPI_FUNC(PyArena*) _PyArena_New(void);
+PyAPI_FUNC(void) _PyArena_Free(PyArena *);
+
+/* Mostly like malloc(), return the address of a block of memory spanning
+ * `size` bytes, or return NULL (without setting an exception) if enough
+ * new memory can't be obtained.  Unlike malloc(0), _PyArena_Malloc() with
+ * size=0 does not guarantee to return a unique pointer (the pointer
+ * returned may equal one or more other pointers obtained from
+ * _PyArena_Malloc()).
+ * Note that pointers obtained via _PyArena_Malloc() must never be passed to
+ * the system free() or realloc(), or to any of Python's similar memory-
+ * management functions.  _PyArena_Malloc()-obtained pointers remain valid
+ * until _PyArena_Free(ar) is called, at which point all pointers obtained
+ * from the arena `ar` become invalid simultaneously.
+ */
+PyAPI_FUNC(void*) _PyArena_Malloc(PyArena *, size_t size);
+
+/* This routine isn't a proper arena allocation routine.  It takes
+ * a PyObject* and records it so that it can be DECREFed when the
+ * arena is freed.
+ */
+PyAPI_FUNC(int) _PyArena_AddPyObject(PyArena *, PyObject *);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_PYARENA_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_pyerrors.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_pyerrors.h
new file mode 100644
index 000000000000..4620a2696449
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_pyerrors.h
@@ -0,0 +1,117 @@
+#ifndef Py_INTERNAL_PYERRORS_H
+#define Py_INTERNAL_PYERRORS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+/* runtime lifecycle */
+
+extern PyStatus _PyErr_InitTypes(PyInterpreterState *);
+extern void _PyErr_FiniTypes(PyInterpreterState *);
+
+
+/* other API */
+
+/* Type of the exception stored in tstate->current_exception, or NULL when
+ * that field is NULL.  tstate itself must not be NULL (asserted). */
+static inline PyObject* _PyErr_Occurred(PyThreadState *tstate)
+{
+    assert(tstate != NULL);
+    PyObject *raised = tstate->current_exception;
+    return (raised == NULL) ? NULL : (PyObject *)Py_TYPE(raised);
+}
+
+static inline void _PyErr_ClearExcState(_PyErr_StackItem *exc_state)
+{   /* Clear the exception value held by the stack item exc_state. */
+    Py_CLEAR(exc_state->exc_value);  /* exc_value is the only field touched */
+}
+
+PyAPI_FUNC(PyObject*) _PyErr_StackItemToExcInfoTuple(
+    _PyErr_StackItem *err_info);
+
+PyAPI_FUNC(void) _PyErr_Fetch(
+    PyThreadState *tstate,
+    PyObject **type,
+    PyObject **value,
+    PyObject **traceback);
+
+extern PyObject *
+_PyErr_GetRaisedException(PyThreadState *tstate);
+
+PyAPI_FUNC(int) _PyErr_ExceptionMatches(
+    PyThreadState *tstate,
+    PyObject *exc);
+
+void
+_PyErr_SetRaisedException(PyThreadState *tstate, PyObject *exc);
+
+PyAPI_FUNC(void) _PyErr_Restore(
+    PyThreadState *tstate,
+    PyObject *type,
+    PyObject *value,
+    PyObject *traceback);
+
+PyAPI_FUNC(void) _PyErr_SetObject(
+    PyThreadState *tstate,
+    PyObject *type,
+    PyObject *value);
+
+PyAPI_FUNC(void) _PyErr_ChainStackItem(
+    _PyErr_StackItem *exc_info);
+
+PyAPI_FUNC(void) _PyErr_Clear(PyThreadState *tstate);
+
+PyAPI_FUNC(void) _PyErr_SetNone(PyThreadState *tstate, PyObject *exception);
+
+PyAPI_FUNC(PyObject *) _PyErr_NoMemory(PyThreadState *tstate);
+
+PyAPI_FUNC(void) _PyErr_SetString(
+    PyThreadState *tstate,
+    PyObject *exception,
+    const char *string);
+
+PyAPI_FUNC(PyObject *) _PyErr_Format(
+    PyThreadState *tstate,
+    PyObject *exception,
+    const char *format,
+    ...);
+
+PyAPI_FUNC(void) _PyErr_NormalizeException(
+    PyThreadState *tstate,
+    PyObject **exc,
+    PyObject **val,
+    PyObject **tb);
+
+PyAPI_FUNC(PyObject *) _PyErr_FormatFromCauseTstate(
+    PyThreadState *tstate,
+    PyObject *exception,
+    const char *format,
+    ...);
+
+PyAPI_FUNC(PyObject *) _PyExc_CreateExceptionGroup(
+    const char *msg,
+    PyObject *excs);
+
+PyAPI_FUNC(PyObject *) _PyExc_PrepReraiseStar(
+    PyObject *orig,
+    PyObject *excs);
+
+PyAPI_FUNC(int) _PyErr_CheckSignalsTstate(PyThreadState *tstate);
+
+PyAPI_FUNC(void) _Py_DumpExtensionModules(int fd, PyInterpreterState *interp);
+
+extern PyObject* _Py_Offer_Suggestions(PyObject* exception);
+PyAPI_FUNC(Py_ssize_t) _Py_UTF8_Edit_Cost(PyObject *str_a, PyObject *str_b,
+                                          Py_ssize_t max_cost);
+
+void _PyErr_FormatNote(const char *format, ...);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_PYERRORS_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_pyhash.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_pyhash.h
new file mode 100644
index 000000000000..34dfa5377128
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_pyhash.h
@@ -0,0 +1,40 @@
+#ifndef Py_INTERNAL_HASH_H
+#define Py_INTERNAL_HASH_H
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+struct pyhash_runtime_state {
+    struct {
+#ifndef MS_WINDOWS
+        int fd;
+        dev_t st_dev;
+        ino_t st_ino;
+#else
+    // This is a placeholder so the struct isn't empty on Windows.
+    int _not_used;
+#endif
+    } urandom_cache;
+};
+
+#ifndef MS_WINDOWS
+# define _py_urandom_cache_INIT \
+    { \
+        .fd = -1, \
+    }
+#else
+# define _py_urandom_cache_INIT {0}
+#endif
+
+#define pyhash_state_INIT \
+    { \
+        .urandom_cache = _py_urandom_cache_INIT, \
+    }
+
+
+uint64_t _Py_KeyedHash(uint64_t, const char *, Py_ssize_t);
+
+
+#endif  // Py_INTERNAL_HASH_H
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_pylifecycle.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_pylifecycle.h
new file mode 100644
index 000000000000..7cd998a704c8
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_pylifecycle.h
@@ -0,0 +1,99 @@
+#ifndef Py_INTERNAL_LIFECYCLE_H
+#define Py_INTERNAL_LIFECYCLE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_runtime.h"       // _PyRuntimeState
+
+/* Forward declarations */
+struct _PyArgv;
+struct pyruntimestate;
+
+extern int _Py_SetFileSystemEncoding(
+    const char *encoding,
+    const char *errors);
+extern void _Py_ClearFileSystemEncoding(void);
+extern PyStatus _PyUnicode_InitEncodings(PyThreadState *tstate);
+#ifdef MS_WINDOWS
+extern int _PyUnicode_EnableLegacyWindowsFSEncoding(void);
+#endif
+
+PyAPI_FUNC(void) _Py_ClearStandardStreamEncoding(void);
+
+PyAPI_FUNC(int) _Py_IsLocaleCoercionTarget(const char *ctype_loc);
+
+/* Various one-time initializers */
+
+extern void _Py_InitVersion(void);
+extern PyStatus _PyFaulthandler_Init(int enable);
+extern PyObject * _PyBuiltin_Init(PyInterpreterState *interp);
+extern PyStatus _PySys_Create(
+    PyThreadState *tstate,
+    PyObject **sysmod_p);
+extern PyStatus _PySys_ReadPreinitWarnOptions(PyWideStringList *options);
+extern PyStatus _PySys_ReadPreinitXOptions(PyConfig *config);
+extern int _PySys_UpdateConfig(PyThreadState *tstate);
+extern void _PySys_FiniTypes(PyInterpreterState *interp);
+extern int _PyBuiltins_AddExceptions(PyObject * bltinmod);
+extern PyStatus _Py_HashRandomization_Init(const PyConfig *);
+
+extern PyStatus _PyTime_Init(void);
+extern PyStatus _PyGC_Init(PyInterpreterState *interp);
+extern PyStatus _PyAtExit_Init(PyInterpreterState *interp);
+extern int _Py_Deepfreeze_Init(void);
+
+/* Various internal finalizers */
+
+extern int _PySignal_Init(int install_signal_handlers);
+extern void _PySignal_Fini(void);
+
+extern void _PyGC_Fini(PyInterpreterState *interp);
+extern void _Py_HashRandomization_Fini(void);
+extern void _PyFaulthandler_Fini(void);
+extern void _PyHash_Fini(void);
+extern void _PyTraceMalloc_Fini(void);
+extern void _PyWarnings_Fini(PyInterpreterState *interp);
+extern void _PyAST_Fini(PyInterpreterState *interp);
+extern void _PyAtExit_Fini(PyInterpreterState *interp);
+extern void _PyThread_FiniType(PyInterpreterState *interp);
+extern void _Py_Deepfreeze_Fini(void);
+extern void _PyArg_Fini(void);
+extern void _Py_FinalizeAllocatedBlocks(_PyRuntimeState *);
+
+extern PyStatus _PyGILState_Init(PyInterpreterState *interp);
+extern PyStatus _PyGILState_SetTstate(PyThreadState *tstate);
+extern void _PyGILState_Fini(PyInterpreterState *interp);
+
+PyAPI_FUNC(void) _PyGC_DumpShutdownStats(PyInterpreterState *interp);
+
+PyAPI_FUNC(PyStatus) _Py_PreInitializeFromPyArgv(
+    const PyPreConfig *src_config,
+    const struct _PyArgv *args);
+PyAPI_FUNC(PyStatus) _Py_PreInitializeFromConfig(
+    const PyConfig *config,
+    const struct _PyArgv *args);
+
+PyAPI_FUNC(wchar_t *) _Py_GetStdlibDir(void);
+
+PyAPI_FUNC(int) _Py_HandleSystemExit(int *exitcode_p);
+
+PyAPI_FUNC(PyObject*) _PyErr_WriteUnraisableDefaultHook(PyObject *unraisable);
+
+PyAPI_FUNC(void) _PyErr_Print(PyThreadState *tstate);
+PyAPI_FUNC(void) _PyErr_Display(PyObject *file, PyObject *exception,
+                                PyObject *value, PyObject *tb);
+PyAPI_FUNC(void) _PyErr_DisplayException(PyObject *file, PyObject *exc);
+
+PyAPI_FUNC(void) _PyThreadState_DeleteCurrent(PyThreadState *tstate);
+
+extern void _PyAtExit_Call(PyInterpreterState *interp);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_LIFECYCLE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_pymath.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_pymath.h
new file mode 100644
index 000000000000..7a4e1c1eb714
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_pymath.h
@@ -0,0 +1,205 @@
+#ifndef Py_INTERNAL_PYMATH_H
+#define Py_INTERNAL_PYMATH_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+/* _Py_ADJUST_ERANGE1(x)
+ * _Py_ADJUST_ERANGE2(x, y)
+ * Set errno to 0 before calling a libm function, and invoke one of these
+ * helpers after, passing the function result(s) (_Py_ADJUST_ERANGE2 is useful
+ * for functions returning complex results).  This makes two kinds of
+ * adjustments to errno:  (A) If it looks like the platform libm set
+ * errno=ERANGE due to underflow, clear errno. (B) If it looks like the
+ * platform libm overflowed but didn't set errno, force errno to ERANGE.  In
+ * effect, we're trying to force a useful implementation of C89 errno
+ * behavior.
+ * Caution:
+ *    This isn't reliable.  C99 no longer requires libm to set errno under
+ *        any exceptional condition, but does require +- HUGE_VAL return
+ *        values on overflow.  A 754 box *probably* maps HUGE_VAL to a
+ *        double infinity, and we're cool if that's so, unless the input
+ *        was an infinity and an infinity is the expected result.  A C89
+ *        system sets errno to ERANGE, so we check for that too.  We're
+ *        out of luck if a C99 754 box doesn't map HUGE_VAL to +Inf, or
+ *        if the returned result is a NaN, or if a C89 box returns HUGE_VAL
+ *        in non-overflow cases.
+ */
+static inline void _Py_ADJUST_ERANGE1(double x)
+{
+    if (errno == 0) {
+        if (x == Py_HUGE_VAL || x == -Py_HUGE_VAL) {
+            errno = ERANGE;
+        }
+    }
+    else if (errno == ERANGE && x == 0.0) {
+        errno = 0;
+    }
+}
+
+static inline void _Py_ADJUST_ERANGE2(double x, double y)
+{
+    if (x == Py_HUGE_VAL || x == -Py_HUGE_VAL ||
+        y == Py_HUGE_VAL || y == -Py_HUGE_VAL)
+    {
+        if (errno == 0) {
+            errno = ERANGE;
+        }
+    }
+    else if (errno == ERANGE) {
+        errno = 0;
+    }
+}
+
+
+//--- HAVE_PY_SET_53BIT_PRECISION macro ------------------------------------
+//
+// The functions _Py_dg_strtod() and _Py_dg_dtoa() in Python/dtoa.c (which are
+// required to support the short float repr introduced in Python 3.1) require
+// that the floating-point unit that's being used for arithmetic operations on
+// C doubles is set to use 53-bit precision.  It also requires that the FPU
+// rounding mode is round-half-to-even, but that's less often an issue.
+//
+// If your FPU isn't already set to 53-bit precision/round-half-to-even, and
+// you want to make use of _Py_dg_strtod() and _Py_dg_dtoa(), then you should:
+//
+//     #define HAVE_PY_SET_53BIT_PRECISION 1
+//
+// and also give appropriate definitions for the following three macros:
+//
+// * _Py_SET_53BIT_PRECISION_HEADER: any variable declarations needed to
+//   use the two macros below.
+// * _Py_SET_53BIT_PRECISION_START: store original FPU settings, and
+//   set FPU to 53-bit precision/round-half-to-even
+// * _Py_SET_53BIT_PRECISION_END: restore original FPU settings
+//
+// The macros are designed to be used within a single C function: see
+// Python/pystrtod.c for an example of their use.
+
+
+// Get and set x87 control word for gcc/x86
+#ifdef HAVE_GCC_ASM_FOR_X87
+#define HAVE_PY_SET_53BIT_PRECISION 1
+
+// Functions defined in Python/pymath.c
+extern unsigned short _Py_get_387controlword(void);
+extern void _Py_set_387controlword(unsigned short);
+
+#define _Py_SET_53BIT_PRECISION_HEADER                                  \
+    unsigned short old_387controlword, new_387controlword
+#define _Py_SET_53BIT_PRECISION_START                                   \
+    do {                                                                \
+        old_387controlword = _Py_get_387controlword();                  \
+        new_387controlword = (old_387controlword & ~0x0f00) | 0x0200;   \
+        if (new_387controlword != old_387controlword) {                 \
+            _Py_set_387controlword(new_387controlword);                 \
+        }                                                               \
+    } while (0)
+#define _Py_SET_53BIT_PRECISION_END                                     \
+    do {                                                                \
+        if (new_387controlword != old_387controlword) {                 \
+            _Py_set_387controlword(old_387controlword);                 \
+        }                                                               \
+    } while (0)
+#endif
+
+// Get and set x87 control word for VisualStudio/x86.
+// x87 is not supported in 64-bit or ARM.
+#if defined(_MSC_VER) && !defined(_WIN64) && !defined(_M_ARM)
+#define HAVE_PY_SET_53BIT_PRECISION 1
+
+#include <float.h>                // __control87_2()
+
+#define _Py_SET_53BIT_PRECISION_HEADER \
+    unsigned int old_387controlword, new_387controlword, out_387controlword
+    // We use the __control87_2 function to set only the x87 control word.
+    // The SSE control word is unaffected.
+#define _Py_SET_53BIT_PRECISION_START                                   \
+    do {                                                                \
+        __control87_2(0, 0, &old_387controlword, NULL);                 \
+        new_387controlword =                                            \
+          (old_387controlword & ~(_MCW_PC | _MCW_RC)) | (_PC_53 | _RC_NEAR); \
+        if (new_387controlword != old_387controlword) {                 \
+            __control87_2(new_387controlword, _MCW_PC | _MCW_RC,        \
+                          &out_387controlword, NULL);                   \
+        }                                                               \
+    } while (0)
+#define _Py_SET_53BIT_PRECISION_END                                     \
+    do {                                                                \
+        if (new_387controlword != old_387controlword) {                 \
+            __control87_2(old_387controlword, _MCW_PC | _MCW_RC,        \
+                          &out_387controlword, NULL);                   \
+        }                                                               \
+    } while (0)
+#endif
+
+
+// MC68881
+#ifdef HAVE_GCC_ASM_FOR_MC68881
+#define HAVE_PY_SET_53BIT_PRECISION 1
+#define _Py_SET_53BIT_PRECISION_HEADER \
+    unsigned int old_fpcr, new_fpcr
+#define _Py_SET_53BIT_PRECISION_START                                   \
+    do {                                                                \
+        __asm__ ("fmove.l %%fpcr,%0" : "=g" (old_fpcr));                \
+        /* Set double precision / round to nearest.  */                 \
+        new_fpcr = (old_fpcr & ~0xf0) | 0x80;                           \
+        if (new_fpcr != old_fpcr) {                                     \
+              __asm__ volatile ("fmove.l %0,%%fpcr" : : "g" (new_fpcr));\
+        }                                                               \
+    } while (0)
+#define _Py_SET_53BIT_PRECISION_END                                     \
+    do {                                                                \
+        if (new_fpcr != old_fpcr) {                                     \
+            __asm__ volatile ("fmove.l %0,%%fpcr" : : "g" (old_fpcr));  \
+        }                                                               \
+    } while (0)
+#endif
+
+// Default definitions are empty
+#ifndef _Py_SET_53BIT_PRECISION_HEADER
+#  define _Py_SET_53BIT_PRECISION_HEADER
+#  define _Py_SET_53BIT_PRECISION_START
+#  define _Py_SET_53BIT_PRECISION_END
+#endif
+
+
+//--- _PY_SHORT_FLOAT_REPR macro -------------------------------------------
+
+// If we can't guarantee 53-bit precision, don't use the code
+// in Python/dtoa.c, but fall back to standard code.  This
+// means that repr of a float will be long (17 significant digits).
+//
+// Realistically, there are two things that could go wrong:
+//
+// (1) doubles aren't IEEE 754 doubles, or
+// (2) we're on x86 with the rounding precision set to 64-bits
+//     (extended precision), and we don't know how to change
+//     the rounding precision.
+#if !defined(DOUBLE_IS_LITTLE_ENDIAN_IEEE754) && \
+    !defined(DOUBLE_IS_BIG_ENDIAN_IEEE754) && \
+    !defined(DOUBLE_IS_ARM_MIXED_ENDIAN_IEEE754)
+#  define _PY_SHORT_FLOAT_REPR 0
+#endif
+
+// Double rounding is symptomatic of use of extended precision on x86.
+// If we're seeing double rounding, and we don't have any mechanism available
+// for changing the FPU rounding precision, then don't use Python/dtoa.c.
+#if defined(X87_DOUBLE_ROUNDING) && !defined(HAVE_PY_SET_53BIT_PRECISION)
+#  define _PY_SHORT_FLOAT_REPR 0
+#endif
+
+#ifndef _PY_SHORT_FLOAT_REPR
+#  define _PY_SHORT_FLOAT_REPR 1
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_PYMATH_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_pymem.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_pymem.h
new file mode 100644
index 000000000000..81a707a0a5dd
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_pymem.h
@@ -0,0 +1,98 @@
+#ifndef Py_INTERNAL_PYMEM_H
+#define Py_INTERNAL_PYMEM_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pymem.h"      // PyMemAllocatorName
+
+
+typedef struct {
+    /* We tag each block with an API ID in order to tag API violations */
+    char api_id;
+    PyMemAllocatorEx alloc;
+} debug_alloc_api_t;
+
+struct _pymem_allocators {
+    PyThread_type_lock mutex;
+    struct {
+        PyMemAllocatorEx raw;
+        PyMemAllocatorEx mem;
+        PyMemAllocatorEx obj;
+    } standard;
+    struct {
+        debug_alloc_api_t raw;
+        debug_alloc_api_t mem;
+        debug_alloc_api_t obj;
+    } debug;
+    PyObjectArenaAllocator obj_arena;
+};
+
+
+/* Set the memory allocator of the specified domain to the default.
+   Save the old allocator into *old_alloc if it's non-NULL.
+   Return 0 on success, or return -1 if the domain is unknown. */
+PyAPI_FUNC(int) _PyMem_SetDefaultAllocator(
+    PyMemAllocatorDomain domain,
+    PyMemAllocatorEx *old_alloc);
+
+/* Special bytes broadcast into debug memory blocks at appropriate times.
+   Strings of these are unlikely to be valid addresses, floats, ints or
+   7-bit ASCII.
+
+   - PYMEM_CLEANBYTE: clean (newly allocated) memory
+   - PYMEM_DEADBYTE: dead (newly freed) memory
+   - PYMEM_FORBIDDENBYTE: untouchable bytes at each end of a block
+
+   Byte patterns 0xCB, 0xDB and 0xFB have been replaced with 0xCD, 0xDD and
+   0xFD to use the same values as Windows CRT debug malloc() and free().
+   If modified, _PyMem_IsPtrFreed() should be updated as well. */
+#define PYMEM_CLEANBYTE      0xCD
+#define PYMEM_DEADBYTE       0xDD
+#define PYMEM_FORBIDDENBYTE  0xFD
+
+/* Heuristic checking if a pointer value is newly allocated
+   (uninitialized), newly freed or NULL (is equal to zero).
+
+   The pointer is not dereferenced, only the pointer value is checked.
+
+   The heuristic relies on the debug hooks on Python memory allocators which
+   fill newly allocated memory with CLEANBYTE (0xCD) and newly freed memory
+   with DEADBYTE (0xDD). It also detects "untouchable bytes" marked
+   with FORBIDDENBYTE (0xFD). */
+static inline int _PyMem_IsPtrFreed(const void *ptr)
+{
+    uintptr_t value = (uintptr_t)ptr;
+#if SIZEOF_VOID_P == 8
+    return (value == 0
+            || value == (uintptr_t)0xCDCDCDCDCDCDCDCD
+            || value == (uintptr_t)0xDDDDDDDDDDDDDDDD
+            || value == (uintptr_t)0xFDFDFDFDFDFDFDFD);
+#elif SIZEOF_VOID_P == 4
+    return (value == 0
+            || value == (uintptr_t)0xCDCDCDCD
+            || value == (uintptr_t)0xDDDDDDDD
+            || value == (uintptr_t)0xFDFDFDFD);
+#else
+#  error "unknown pointer size"
+#endif
+}
+
+PyAPI_FUNC(int) _PyMem_GetAllocatorName(
+    const char *name,
+    PyMemAllocatorName *allocator);
+
+/* Configure the Python memory allocators.
+   Pass PYMEM_ALLOCATOR_DEFAULT to use default allocators.
+   PYMEM_ALLOCATOR_NOT_SET does nothing. */
+PyAPI_FUNC(int) _PyMem_SetupAllocators(PyMemAllocatorName allocator);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_PYMEM_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_pymem_init.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_pymem_init.h
new file mode 100644
index 000000000000..78232738cb09
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_pymem_init.h
@@ -0,0 +1,85 @@
+#ifndef Py_INTERNAL_PYMEM_INIT_H
+#define Py_INTERNAL_PYMEM_INIT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_pymem.h"
+
+
+/********************************/
+/* the allocators' initializers */
+
+extern void * _PyMem_RawMalloc(void *, size_t);
+extern void * _PyMem_RawCalloc(void *, size_t, size_t);
+extern void * _PyMem_RawRealloc(void *, void *, size_t);
+extern void _PyMem_RawFree(void *, void *);
+#define PYRAW_ALLOC {NULL, _PyMem_RawMalloc, _PyMem_RawCalloc, _PyMem_RawRealloc, _PyMem_RawFree}
+
+#ifdef WITH_PYMALLOC
+extern void* _PyObject_Malloc(void *, size_t);
+extern void* _PyObject_Calloc(void *, size_t, size_t);
+extern void _PyObject_Free(void *, void *);
+extern void* _PyObject_Realloc(void *, void *, size_t);
+#  define PYOBJ_ALLOC {NULL, _PyObject_Malloc, _PyObject_Calloc, _PyObject_Realloc, _PyObject_Free}
+#else
+# define PYOBJ_ALLOC PYRAW_ALLOC
+#endif  // WITH_PYMALLOC
+
+#define PYMEM_ALLOC PYOBJ_ALLOC
+
+extern void* _PyMem_DebugRawMalloc(void *, size_t);
+extern void* _PyMem_DebugRawCalloc(void *, size_t, size_t);
+extern void* _PyMem_DebugRawRealloc(void *, void *, size_t);
+extern void _PyMem_DebugRawFree(void *, void *);
+
+extern void* _PyMem_DebugMalloc(void *, size_t);
+extern void* _PyMem_DebugCalloc(void *, size_t, size_t);
+extern void* _PyMem_DebugRealloc(void *, void *, size_t);
+extern void _PyMem_DebugFree(void *, void *);
+
+#define PYDBGRAW_ALLOC(runtime) \
+    {&(runtime).allocators.debug.raw, _PyMem_DebugRawMalloc, _PyMem_DebugRawCalloc, _PyMem_DebugRawRealloc, _PyMem_DebugRawFree}
+#define PYDBGMEM_ALLOC(runtime) \
+    {&(runtime).allocators.debug.mem, _PyMem_DebugMalloc, _PyMem_DebugCalloc, _PyMem_DebugRealloc, _PyMem_DebugFree}
+#define PYDBGOBJ_ALLOC(runtime) \
+    {&(runtime).allocators.debug.obj, _PyMem_DebugMalloc, _PyMem_DebugCalloc, _PyMem_DebugRealloc, _PyMem_DebugFree}
+
+extern void * _PyMem_ArenaAlloc(void *, size_t);
+extern void _PyMem_ArenaFree(void *, void *, size_t);
+
+#ifdef Py_DEBUG
+# define _pymem_allocators_standard_INIT(runtime) \
+    { \
+        PYDBGRAW_ALLOC(runtime), \
+        PYDBGMEM_ALLOC(runtime), \
+        PYDBGOBJ_ALLOC(runtime), \
+    }
+#else
+# define _pymem_allocators_standard_INIT(runtime) \
+    { \
+        PYRAW_ALLOC, \
+        PYMEM_ALLOC, \
+        PYOBJ_ALLOC, \
+    }
+#endif
+
+#define _pymem_allocators_debug_INIT \
+   { \
+       {'r', PYRAW_ALLOC}, \
+       {'m', PYMEM_ALLOC}, \
+       {'o', PYOBJ_ALLOC}, \
+   }
+
+#  define _pymem_allocators_obj_arena_INIT \
+    { NULL, _PyMem_ArenaAlloc, _PyMem_ArenaFree }
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // !Py_INTERNAL_PYMEM_INIT_H
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_pystate.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_pystate.h
new file mode 100644
index 000000000000..fba08ae55235
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_pystate.h
@@ -0,0 +1,180 @@
+#ifndef Py_INTERNAL_PYSTATE_H
+#define Py_INTERNAL_PYSTATE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_runtime.h"   /* PyRuntimeState */
+
+
+/* Check if the current thread is the main thread.
+   Use _Py_IsMainInterpreter() to check if it's the main interpreter. */
+static inline int
+_Py_IsMainThread(void)
+{
+    unsigned long thread = PyThread_get_thread_ident();
+    return (thread == _PyRuntime.main_thread);
+}
+
+
+static inline PyInterpreterState *
+_PyInterpreterState_Main(void)
+{
+    return _PyRuntime.interpreters.main;
+}
+
+static inline int
+_Py_IsMainInterpreter(PyInterpreterState *interp)
+{
+    return (interp == _PyInterpreterState_Main());
+}
+
+static inline int
+_Py_IsMainInterpreterFinalizing(PyInterpreterState *interp)
+{
+    /* bpo-39877: Access _PyRuntime directly rather than using
+       tstate->interp->runtime to support calls from Python daemon threads.
+       After Py_Finalize() has been called, tstate can be a dangling pointer:
+       point to PyThreadState freed memory. */
+    return (_PyRuntimeState_GetFinalizing(&_PyRuntime) != NULL &&
+            interp == &_PyRuntime._main_interpreter);
+}
+
+// Export for _xxsubinterpreters module.
+PyAPI_FUNC(int) _PyInterpreterState_SetRunningMain(PyInterpreterState *);
+PyAPI_FUNC(void) _PyInterpreterState_SetNotRunningMain(PyInterpreterState *);
+PyAPI_FUNC(int) _PyInterpreterState_IsRunningMain(PyInterpreterState *);
+
+
+static inline const PyConfig *
+_Py_GetMainConfig(void)
+{
+    PyInterpreterState *interp = _PyInterpreterState_Main();
+    if (interp == NULL) {
+        return NULL;
+    }
+    return _PyInterpreterState_GetConfig(interp);
+}
+
+
+/* Only handle signals on the main thread of the main interpreter. */
+static inline int
+_Py_ThreadCanHandleSignals(PyInterpreterState *interp)
+{
+    return (_Py_IsMainThread() && _Py_IsMainInterpreter(interp));
+}
+
+
+/* Variable and static inline functions for in-line access to current thread
+   and interpreter state */
+
+#if defined(HAVE_THREAD_LOCAL) && !defined(Py_BUILD_CORE_MODULE)
+extern _Py_thread_local PyThreadState *_Py_tss_tstate;
+#endif
+PyAPI_FUNC(PyThreadState *) _PyThreadState_GetCurrent(void);  // function export; fallback used by _PyThreadState_GET()
+
+#ifndef NDEBUG
+extern int _PyThreadState_CheckConsistency(PyThreadState *tstate);
+#endif
+
+extern int _PyThreadState_MustExit(PyThreadState *tstate);
+
+/* Get the current Python thread state.
+
+   This function is unsafe: it does not check for error and it can return NULL.
+
+   The caller must hold the GIL.
+
+   See also PyThreadState_Get() and _PyThreadState_UncheckedGet(). */
+static inline PyThreadState*
+_PyThreadState_GET(void)
+{
+#if defined(HAVE_THREAD_LOCAL) && !defined(Py_BUILD_CORE_MODULE)
+    return _Py_tss_tstate;
+#else
+    return _PyThreadState_GetCurrent();
+#endif
+}
+
+
+static inline void
+_Py_EnsureFuncTstateNotNULL(const char *func, PyThreadState *tstate)
+{
+    if (tstate == NULL) {
+        _Py_FatalErrorFunc(func,
+            "the function must be called with the GIL held, "
+            "after Python initialization and before Python finalization, "
+            "but the GIL is released (the current Python thread state is NULL)");
+    }
+}
+
+// Call Py_FatalError() if tstate is NULL
+#define _Py_EnsureTstateNotNULL(tstate) \
+    _Py_EnsureFuncTstateNotNULL(__func__, (tstate))
+
+
+/* Get the current interpreter state.
+
+   The function is unsafe: it does not check for error and it can return NULL.
+
+   The caller must hold the GIL.
+
+   See also _PyInterpreterState_Get()
+   and _PyGILState_GetInterpreterStateUnsafe(). */
+static inline PyInterpreterState* _PyInterpreterState_GET(void) {
+    PyThreadState *tstate = _PyThreadState_GET();
+#ifdef Py_DEBUG
+    _Py_EnsureTstateNotNULL(tstate);
+#endif
+    return tstate->interp;
+}
+
+
+// PyThreadState functions
+
+PyAPI_FUNC(PyThreadState *) _PyThreadState_New(PyInterpreterState *interp);
+PyAPI_FUNC(void) _PyThreadState_Bind(PyThreadState *tstate);
+// We keep this around exclusively for stable ABI compatibility.
+PyAPI_FUNC(void) _PyThreadState_Init(
+    PyThreadState *tstate);
+PyAPI_FUNC(void) _PyThreadState_DeleteExcept(PyThreadState *tstate);
+
+
+/* Other */
+
+PyAPI_FUNC(PyThreadState *) _PyThreadState_Swap(
+    _PyRuntimeState *runtime,
+    PyThreadState *newts);
+
+PyAPI_FUNC(PyStatus) _PyInterpreterState_Enable(_PyRuntimeState *runtime);
+
+#ifdef HAVE_FORK
+extern PyStatus _PyInterpreterState_DeleteExceptMain(_PyRuntimeState *runtime);
+extern void _PySignal_AfterFork(void);
+#endif
+
+PyAPI_FUNC(int) _PyCrossInterpreterData_ReleaseAndRawFree(_PyCrossInterpreterData *);
+
+
+PyAPI_FUNC(int) _PyState_AddModule(
+    PyThreadState *tstate,
+    PyObject* module,
+    PyModuleDef* def);
+
+
+PyAPI_FUNC(int) _PyOS_InterruptOccurred(PyThreadState *tstate);
+
+#define HEAD_LOCK(runtime) \
+    PyThread_acquire_lock((runtime)->interpreters.mutex, WAIT_LOCK)
+#define HEAD_UNLOCK(runtime) \
+    PyThread_release_lock((runtime)->interpreters.mutex)
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_PYSTATE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_pythread.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_pythread.h
new file mode 100644
index 000000000000..f53921494c15
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_pythread.h
@@ -0,0 +1,81 @@
+#ifndef Py_INTERNAL_PYTHREAD_H
+#define Py_INTERNAL_PYTHREAD_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+#ifndef _POSIX_THREADS
+/* This means pthreads are not implemented in libc headers, hence the macro
+   is not present in unistd.h. But they can still be implemented as an
+   external library (e.g. GNU Pth in pthread emulation). */
+# ifdef HAVE_PTHREAD_H
+#  include <pthread.h> /* _POSIX_THREADS */
+# endif
+# ifndef _POSIX_THREADS
+/* Check if we're running on HP-UX and _SC_THREADS is defined. If so, then
+   enough of the Posix threads package is implemented to support python
+   threads.
+
+   This is valid for HP-UX 11.23 running on an ia64 system. If needed, add
+   a check of __ia64 to verify that we're running on an ia64 system instead
+   of a pa-risc system.
+*/
+#  ifdef __hpux
+#   ifdef _SC_THREADS
+#    define _POSIX_THREADS
+#   endif
+#  endif
+# endif /* _POSIX_THREADS */
+#endif /* _POSIX_THREADS */
+
+#if defined(_POSIX_THREADS) || defined(HAVE_PTHREAD_STUBS)
+# define _USE_PTHREADS
+#endif
+
+#if defined(_USE_PTHREADS) && defined(HAVE_PTHREAD_CONDATTR_SETCLOCK) && defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_MONOTONIC)
+// monotonic is supported statically.  It doesn't mean it works at runtime.
+# define CONDATTR_MONOTONIC
+#endif
+
+
+#if defined(HAVE_PTHREAD_STUBS)
+// pthread_key
+struct py_stub_tls_entry {
+    bool in_use;
+    void *value;
+};
+#endif
+
+struct _pythread_runtime_state {
+    int initialized;
+
+#ifdef _USE_PTHREADS
+    // This matches when thread_pthread.h is used.
+    struct {
+        /* NULL when pthread_condattr_setclock(CLOCK_MONOTONIC) is not supported. */
+        pthread_condattr_t *ptr;
+# ifdef CONDATTR_MONOTONIC
+        /* The value to which condattr_monotonic is set. */
+        pthread_condattr_t val;
+# endif
+    } _condattr_monotonic;
+
+#endif  // _USE_PTHREADS
+
+#if defined(HAVE_PTHREAD_STUBS)
+    struct {
+        struct py_stub_tls_entry tls_entries[PTHREAD_KEYS_MAX];
+    } stubs;
+#endif
+};
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_PYTHREAD_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_range.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_range.h
new file mode 100644
index 000000000000..bf045ec4fd83
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_range.h
@@ -0,0 +1,21 @@
+#ifndef Py_INTERNAL_RANGE_H
+#define Py_INTERNAL_RANGE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+typedef struct {
+    PyObject_HEAD
+    long start;
+    long step;
+    long len;
+} _PyRangeIterObject;
+
+#ifdef __cplusplus
+}
+#endif
+#endif   /* !Py_INTERNAL_RANGE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_runtime.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_runtime.h
new file mode 100644
index 000000000000..99c4b0760bfb
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_runtime.h
@@ -0,0 +1,235 @@
+#ifndef Py_INTERNAL_RUNTIME_H
+#define Py_INTERNAL_RUNTIME_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_atexit.h"          // struct atexit_runtime_state
+#include "pycore_atomic.h"          /* _Py_atomic_address */
+#include "pycore_ceval_state.h"     // struct _ceval_runtime_state
+#include "pycore_floatobject.h"     // struct _Py_float_runtime_state
+#include "pycore_faulthandler.h"    // struct _faulthandler_runtime_state
+#include "pycore_global_objects.h"  // struct _Py_global_objects
+#include "pycore_import.h"          // struct _import_runtime_state
+#include "pycore_interp.h"          // PyInterpreterState
+#include "pycore_object_state.h"    // struct _py_object_runtime_state
+#include "pycore_parser.h"          // struct _parser_runtime_state
+#include "pycore_pymem.h"           // struct _pymem_allocators
+#include "pycore_pyhash.h"          // struct pyhash_runtime_state
+#include "pycore_pythread.h"        // struct _pythread_runtime_state
+#include "pycore_signal.h"          // struct _signals_runtime_state
+#include "pycore_time.h"            // struct _time_runtime_state
+#include "pycore_tracemalloc.h"     // struct _tracemalloc_runtime_state
+#include "pycore_typeobject.h"      // struct types_runtime_state
+#include "pycore_unicodeobject.h"   // struct _Py_unicode_runtime_ids
+
+struct _getargs_runtime_state {  /* embedded in _PyRuntimeState as the 'getargs' member */
+    PyThread_type_lock mutex;  /* NOTE(review): presumably guards static_parsers -- confirm in getargs.c */
+    struct _PyArg_Parser *static_parsers;  /* NOTE(review): looks like the head of a parser list -- confirm */
+};
+
+/* GIL state */
+
+struct _gilstate_runtime_state {
+    /* bpo-26558: Flag controlling PyGILState_Check(); _PyRuntimeState_INIT sets it to 1.
+       NOTE(review): presumably PyGILState_Check() always returns 1 when this is zero -- confirm in pystate.c. */
+    int check_enabled;
+    /* The single PyInterpreterState used by this process'
+       GILState implementation
+    */
+    /* TODO: Given interp_main, it may be possible to kill this ref */
+    PyInterpreterState *autoInterpreterState;
+};
+
+/* Runtime audit hook state */
+
+typedef struct _Py_AuditHookEntry {  /* node type of the list headed by _PyRuntimeState.audit_hooks.head */
+    struct _Py_AuditHookEntry *next;  /* following node in the singly linked list */
+    Py_AuditHookFunction hookCFunction;  /* the hook callback */
+    void *userData;  /* NOTE(review): presumably handed back to hookCFunction when it is called -- confirm */
+} _Py_AuditHookEntry;
+
+/* Full Python runtime state */
+
+/* _PyRuntimeState holds the global state for the CPython runtime.
+   That data is exposed in the internal API as a static variable (_PyRuntime).
+   */
+typedef struct pyruntimestate {
+    /* Has been initialized to a safe state.
+
+       In order to be effective, this must be set to 0 during or right
+       after allocation. */
+    int _initialized;
+
+    /* Is running Py_PreInitialize()? */
+    int preinitializing;
+
+    /* Is Python preinitialized? Set to 1 by Py_PreInitialize() */
+    int preinitialized;
+
+    /* Is Python core initialized? Set to 1 by _Py_InitializeCore() */
+    int core_initialized;
+
+    /* Is Python fully initialized? Set to 1 by Py_Initialize() */
+    int initialized;
+
+    /* Set by Py_FinalizeEx(). Only reset to NULL if Py_Initialize()
+       is called again.
+
+       Use _PyRuntimeState_GetFinalizing() and _PyRuntimeState_SetFinalizing()
+       to access it, don't access it directly. */
+    _Py_atomic_address _finalizing;
+
+    struct pyinterpreters {
+        PyThread_type_lock mutex;
+        /* The linked list of interpreters, newest first. */
+        PyInterpreterState *head;
+        /* The runtime's initial interpreter, which has a special role
+           in the operation of the runtime.  It is also often the only
+           interpreter. */
+        PyInterpreterState *main;
+        /* next_id is an auto-numbered sequence of small
+           integers.  It gets initialized in _PyInterpreterState_Enable(),
+           which is called in Py_Initialize(), and used in
+           PyInterpreterState_New().  A negative interpreter ID
+           indicates an error occurred.  The main interpreter will
+           always have an ID of 0.  Overflow results in a RuntimeError.
+           If that becomes a problem later then we can adjust, e.g. by
+           using a Python int. */
+        int64_t next_id;
+    } interpreters;
+
+    unsigned long main_thread;
+
+    /* ---------- IMPORTANT ---------------------------
+     The fields above this line are declared as early as
+     possible to facilitate out-of-process observability
+     tools. */
+
+    // XXX Remove this field once we have a tp_* slot.
+    struct _xidregistry xidregistry;
+
+    struct _pymem_allocators allocators;
+    struct _obmalloc_global_state obmalloc;
+    struct pyhash_runtime_state pyhash_state;
+    struct _time_runtime_state time;
+    struct _pythread_runtime_state threads;
+    struct _signals_runtime_state signals;
+
+    /* Used for the thread state bound to the current thread. */
+    Py_tss_t autoTSSkey;
+
+    /* Used instead of PyThreadState.trash when there is no current tstate. */
+    Py_tss_t trashTSSkey;
+
+    PyWideStringList orig_argv;
+
+    struct _parser_runtime_state parser;
+
+    struct _atexit_runtime_state atexit;
+
+    struct _import_runtime_state imports;
+    struct _ceval_runtime_state ceval;
+    struct _gilstate_runtime_state gilstate;
+    struct _getargs_runtime_state getargs;
+    struct _fileutils_state fileutils;
+    struct _faulthandler_runtime_state faulthandler;
+    struct _tracemalloc_runtime_state tracemalloc;
+
+    PyPreConfig preconfig;
+
+    // Audit values must be preserved when Py_Initialize()/Py_Finalize()
+    // is called multiple times.
+    Py_OpenCodeHookFunction open_code_hook;
+    void *open_code_userdata;
+    struct {
+        PyThread_type_lock mutex;
+        _Py_AuditHookEntry *head;
+    } audit_hooks;
+
+    struct _py_object_runtime_state object_state;
+    struct _Py_float_runtime_state float_state;
+    struct _Py_unicode_runtime_state unicode_state;
+    struct _types_runtime_state types;
+
+    /* All the objects that are shared by the runtime's interpreters. */
+    struct _Py_static_objects static_objects;
+    struct _Py_cached_objects cached_objects;
+
+    /* The ID of the OS thread in which we are finalizing.
+       We use _Py_atomic_address instead of adding a new _Py_atomic_ulong. */
+    _Py_atomic_address _finalizing_id;
+    /* The value to use for sys.path[0] in new subinterpreters.
+       Normally this would be part of the PyConfig struct.  However,
+       we cannot add it there in 3.12 since that's an ABI change. */
+    wchar_t *sys_path_0;
+
+    /* The following fields are here to avoid allocation during init.
+       The data is exposed through _PyRuntimeState pointer fields.
+       These fields should not be accessed directly outside of init.
+
+       All other _PyRuntimeState pointer fields are populated when
+       needed and default to NULL.
+
+       For now there are some exceptions to that rule, which require
+       allocation during init.  These will be addressed on a case-by-case
+       basis.  Most notably, we don't pre-allocate the several mutex
+       (PyThread_type_lock) fields, because on Windows we only ever get
+       a pointer type.
+       */
+
+    /* _PyRuntimeState.interpreters.main */
+    PyInterpreterState _main_interpreter;
+} _PyRuntimeState;
+
+
+/* other API */
+
+PyAPI_DATA(_PyRuntimeState) _PyRuntime;
+
+PyAPI_FUNC(PyStatus) _PyRuntimeState_Init(_PyRuntimeState *runtime);
+PyAPI_FUNC(void) _PyRuntimeState_Fini(_PyRuntimeState *runtime);
+
+#ifdef HAVE_FORK
+extern PyStatus _PyRuntimeState_ReInitThreads(_PyRuntimeState *runtime);
+#endif
+
+/* Initialize _PyRuntimeState.
+   Return a PyStatus reporting success or failure. */
+PyAPI_FUNC(PyStatus) _PyRuntime_Initialize(void);
+
+PyAPI_FUNC(void) _PyRuntime_Finalize(void);
+
+
+static inline PyThreadState*  // reads runtime->_finalizing via _Py_atomic_load_relaxed
+_PyRuntimeState_GetFinalizing(_PyRuntimeState *runtime) {
+    return (PyThreadState*)_Py_atomic_load_relaxed(&runtime->_finalizing);  // stored as an address; cast back to a tstate pointer
+}
+
+static inline unsigned long  // reads runtime->_finalizing_id via _Py_atomic_load_relaxed
+_PyRuntimeState_GetFinalizingID(_PyRuntimeState *runtime) {
+    return (unsigned long)_Py_atomic_load_relaxed(&runtime->_finalizing_id);  // 0 after SetFinalizing(runtime, NULL)
+}
+
+static inline void  // stores tstate into _finalizing, then its thread id (0 for NULL) into _finalizing_id
+_PyRuntimeState_SetFinalizing(_PyRuntimeState *runtime, PyThreadState *tstate) {
+    _Py_atomic_store_relaxed(&runtime->_finalizing, (uintptr_t)tstate);  // first of two separate relaxed stores
+    if (tstate == NULL) {  // clearing: also reset the recorded thread id to 0
+        _Py_atomic_store_relaxed(&runtime->_finalizing_id, 0);
+    }
+    else {  // setting: record tstate->thread_id alongside the tstate pointer
+        // XXX Re-enable this assert once gh-109860 is fixed.
+        //assert(tstate->thread_id == PyThread_get_thread_ident());
+        _Py_atomic_store_relaxed(&runtime->_finalizing_id,
+                                 (uintptr_t)tstate->thread_id);
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_RUNTIME_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_runtime_init.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_runtime_init.h
new file mode 100644
index 000000000000..e5f9e17efff2
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_runtime_init.h
@@ -0,0 +1,195 @@
+#ifndef Py_INTERNAL_RUNTIME_INIT_H
+#define Py_INTERNAL_RUNTIME_INIT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_long.h"
+#include "pycore_object.h"
+#include "pycore_parser.h"
+#include "pycore_pymem_init.h"
+#include "pycore_obmalloc_init.h"
+
+
+extern PyTypeObject _PyExc_MemoryError;
+
+
+/* The static initializers defined here should only be used
+   in the runtime init code (in pystate.c and pylifecycle.c). */
+
+
+#define _PyRuntimeState_INIT(runtime) \
+    { /* designated initializer for the _PyRuntimeState object named by 'runtime' */ \
+        .allocators = { \
+            .standard = _pymem_allocators_standard_INIT(runtime), \
+            .debug = _pymem_allocators_debug_INIT, \
+            .obj_arena = _pymem_allocators_obj_arena_INIT, \
+        }, \
+        .obmalloc = _obmalloc_global_state_INIT, \
+        .pyhash_state = pyhash_state_INIT, \
+        .signals = _signals_RUNTIME_INIT, \
+        .interpreters = { \
+            /* This prevents interpreters from getting created \
+              until _PyInterpreterState_Enable() is called. */ \
+            .next_id = -1, \
+        }, \
+        /* A TSS key must be initialized with Py_tss_NEEDS_INIT \
+           in accordance with the specification. */ \
+        .autoTSSkey = Py_tss_NEEDS_INIT, \
+        .parser = _parser_runtime_state_INIT, \
+        .ceval = { \
+            .perf = _PyEval_RUNTIME_PERF_INIT, \
+        }, \
+        .gilstate = { \
+            .check_enabled = 1, \
+        }, \
+        .fileutils = { \
+            .force_ascii = -1, \
+        }, \
+        .faulthandler = _faulthandler_runtime_state_INIT, \
+        .tracemalloc = _tracemalloc_runtime_state_INIT, \
+        .float_state = { \
+            .float_format = _py_float_format_unknown, \
+            .double_format = _py_float_format_unknown, \
+        }, \
+        .types = { \
+            .next_version_tag = 1, \
+        }, \
+        .static_objects = { \
+            .singletons = { \
+                .small_ints = _Py_small_ints_INIT, \
+                .bytes_empty = _PyBytes_SIMPLE_INIT(0, 0), \
+                .bytes_characters = _Py_bytes_characters_INIT, \
+                .strings = { \
+                    .literals = _Py_str_literals_INIT, \
+                    .identifiers = _Py_str_identifiers_INIT, \
+                    .ascii = _Py_str_ascii_INIT, \
+                    .latin1 = _Py_str_latin1_INIT, \
+                }, \
+                .tuple_empty = { \
+                    .ob_base = _PyVarObject_HEAD_INIT(&PyTuple_Type, 0) \
+                }, \
+                .hamt_bitmap_node_empty = { \
+                    .ob_base = _PyVarObject_HEAD_INIT(&_PyHamt_BitmapNode_Type, 0) \
+                }, \
+                .context_token_missing = { \
+                    .ob_base = _PyObject_HEAD_INIT(&_PyContextTokenMissing_Type) \
+                }, \
+            }, \
+        }, \
+        /* 'runtime' is used with '.', so the argument must be the object itself, not a pointer. */ \
+        ._main_interpreter = _PyInterpreterState_INIT(runtime._main_interpreter), \
+    }
+
+#define _PyInterpreterState_INIT(INTERP) \
+    { /* designated initializer for the interpreter state INTERP (an lvalue: used as INTERP.x and &(INTERP)) */ \
+        .id_refcount = -1, \
+        .imports = IMPORTS_INIT, \
+        .obmalloc = _obmalloc_state_INIT(INTERP.obmalloc), \
+        .ceval = { \
+            .recursion_limit = Py_DEFAULT_RECURSION_LIMIT, \
+        }, \
+        .gc = { \
+            .enabled = 1, \
+            .generations = { \
+                /* .head is set in _PyGC_InitState(). */ \
+                { .threshold = 700, }, \
+                { .threshold = 10, }, \
+                { .threshold = 10, }, \
+            }, \
+        }, \
+        .object_state = _py_object_state_INIT(INTERP), \
+        .dtoa = _dtoa_state_INIT(&(INTERP)), \
+        .dict_state = _dict_state_INIT, \
+        .func_state = { \
+            .next_version = 1, \
+        }, \
+        .types = { \
+            .next_version_tag = _Py_TYPE_BASE_VERSION_TAG, \
+        }, \
+        .static_objects = { \
+            .singletons = { \
+                ._not_used = 1, \
+                .hamt_empty = { \
+                    .ob_base = _PyObject_HEAD_INIT(&_PyHamt_Type) /* NOTE(review): no comma here -- presumably supplied by the macro; confirm */ \
+                    .h_root = (PyHamtNode*)&_Py_SINGLETON(hamt_bitmap_node_empty), \
+                }, \
+                .last_resort_memory_error = { \
+                    _PyObject_HEAD_INIT(&_PyExc_MemoryError) /* NOTE(review): no comma here either -- same assumption; confirm */ \
+                    .args = (PyObject*)&_Py_SINGLETON(tuple_empty) \
+                }, \
+            }, \
+        }, \
+        ._initial_thread = _PyThreadState_INIT, \
+    }
+
+#define _PyThreadState_INIT \
+    { /* initializer used for _PyInterpreterState_INIT's ._initial_thread member */ \
+        .py_recursion_limit = Py_DEFAULT_RECURSION_LIMIT, \
+        .context_ver = 1, \
+    }
+
+#ifdef Py_TRACE_REFS
+# define _py_object_state_INIT(INTERP) \
+    { /* both refchain initializers are the address of INTERP's own refchain */ \
+        .refchain = {&INTERP.object_state.refchain, &INTERP.object_state.refchain}, \
+    }
+#else
+# define _py_object_state_INIT(INTERP) /* INTERP is unused without Py_TRACE_REFS */ \
+    { 0 }
+#endif
+
+
+// global objects
+
+#define _PyBytes_SIMPLE_INIT(CH, LEN) \
+    { /* static bytes object: size LEN, first byte of ob_sval set to CH */ \
+        _PyVarObject_HEAD_INIT(&PyBytes_Type, (LEN)) /* NOTE(review): no comma -- presumably supplied by the macro; confirm */ \
+        .ob_shash = -1, \
+        .ob_sval = { (CH) }, \
+    }
+#define _PyBytes_CHAR_INIT(CH) \
+    { /* wraps _PyBytes_SIMPLE_INIT((CH), 1) in one more brace level */ \
+        _PyBytes_SIMPLE_INIT((CH), 1) \
+    }
+
+#define _PyUnicode_ASCII_BASE_INIT(LITERAL, ASCII) \
+    { /* header fields shared by _PyASCIIObject_INIT (ASCII=1) and _PyUnicode_LATIN1_INIT (ASCII=0) */ \
+        .ob_base = _PyObject_HEAD_INIT(&PyUnicode_Type) /* NOTE(review): no comma -- presumably supplied by the macro; confirm */ \
+        .length = sizeof(LITERAL) - 1, /* sizeof counts the terminating NUL, so LITERAL must be a string literal, not a pointer */ \
+        .hash = -1, \
+        .state = { \
+            .kind = 1, \
+            .compact = 1, \
+            .ascii = (ASCII), \
+            .statically_allocated = 1, \
+        }, \
+    }
+#define _PyASCIIObject_INIT(LITERAL) \
+    { /* ASCII header (ascii flag = 1) plus the literal itself as _data */ \
+        ._ascii = _PyUnicode_ASCII_BASE_INIT((LITERAL), 1), \
+        ._data = (LITERAL) \
+    }
+#define INIT_STR(NAME, LITERAL) /* initializes member _py_<NAME> with the given string literal */ \
+    ._py_ ## NAME = _PyASCIIObject_INIT(LITERAL)
+#define INIT_ID(NAME) /* like INIT_STR, but the string is NAME itself, stringized with # */ \
+    ._py_ ## NAME = _PyASCIIObject_INIT(#NAME)
+#define _PyUnicode_LATIN1_INIT(LITERAL, UTF8) \
+    { /* non-ASCII variant: base header with ascii flag = 0, plus a separate UTF8 form */ \
+        ._latin1 = { \
+            ._base = _PyUnicode_ASCII_BASE_INIT((LITERAL), 0), \
+            .utf8 = (UTF8), \
+            .utf8_length = sizeof(UTF8) - 1, /* sizeof counts the terminating NUL, so UTF8 must be a string literal */ \
+        }, \
+        ._data = (LITERAL), \
+    }
+
+#include "pycore_runtime_init_generated.h"
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_RUNTIME_INIT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_runtime_init_generated.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_runtime_init_generated.h
new file mode 100644
index 000000000000..07f237b29058
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_runtime_init_generated.h
@@ -0,0 +1,1525 @@
+#ifndef Py_INTERNAL_RUNTIME_INIT_GENERATED_H
+#define Py_INTERNAL_RUNTIME_INIT_GENERATED_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+/* The following is auto-generated by Tools/build/generate_global_objects.py. */
+#define _Py_small_ints_INIT { \
+    _PyLong_DIGIT_INIT(-5), \
+    _PyLong_DIGIT_INIT(-4), \
+    _PyLong_DIGIT_INIT(-3), \
+    _PyLong_DIGIT_INIT(-2), \
+    _PyLong_DIGIT_INIT(-1), \
+    _PyLong_DIGIT_INIT(0), \
+    _PyLong_DIGIT_INIT(1), \
+    _PyLong_DIGIT_INIT(2), \
+    _PyLong_DIGIT_INIT(3), \
+    _PyLong_DIGIT_INIT(4), \
+    _PyLong_DIGIT_INIT(5), \
+    _PyLong_DIGIT_INIT(6), \
+    _PyLong_DIGIT_INIT(7), \
+    _PyLong_DIGIT_INIT(8), \
+    _PyLong_DIGIT_INIT(9), \
+    _PyLong_DIGIT_INIT(10), \
+    _PyLong_DIGIT_INIT(11), \
+    _PyLong_DIGIT_INIT(12), \
+    _PyLong_DIGIT_INIT(13), \
+    _PyLong_DIGIT_INIT(14), \
+    _PyLong_DIGIT_INIT(15), \
+    _PyLong_DIGIT_INIT(16), \
+    _PyLong_DIGIT_INIT(17), \
+    _PyLong_DIGIT_INIT(18), \
+    _PyLong_DIGIT_INIT(19), \
+    _PyLong_DIGIT_INIT(20), \
+    _PyLong_DIGIT_INIT(21), \
+    _PyLong_DIGIT_INIT(22), \
+    _PyLong_DIGIT_INIT(23), \
+    _PyLong_DIGIT_INIT(24), \
+    _PyLong_DIGIT_INIT(25), \
+    _PyLong_DIGIT_INIT(26), \
+    _PyLong_DIGIT_INIT(27), \
+    _PyLong_DIGIT_INIT(28), \
+    _PyLong_DIGIT_INIT(29), \
+    _PyLong_DIGIT_INIT(30), \
+    _PyLong_DIGIT_INIT(31), \
+    _PyLong_DIGIT_INIT(32), \
+    _PyLong_DIGIT_INIT(33), \
+    _PyLong_DIGIT_INIT(34), \
+    _PyLong_DIGIT_INIT(35), \
+    _PyLong_DIGIT_INIT(36), \
+    _PyLong_DIGIT_INIT(37), \
+    _PyLong_DIGIT_INIT(38), \
+    _PyLong_DIGIT_INIT(39), \
+    _PyLong_DIGIT_INIT(40), \
+    _PyLong_DIGIT_INIT(41), \
+    _PyLong_DIGIT_INIT(42), \
+    _PyLong_DIGIT_INIT(43), \
+    _PyLong_DIGIT_INIT(44), \
+    _PyLong_DIGIT_INIT(45), \
+    _PyLong_DIGIT_INIT(46), \
+    _PyLong_DIGIT_INIT(47), \
+    _PyLong_DIGIT_INIT(48), \
+    _PyLong_DIGIT_INIT(49), \
+    _PyLong_DIGIT_INIT(50), \
+    _PyLong_DIGIT_INIT(51), \
+    _PyLong_DIGIT_INIT(52), \
+    _PyLong_DIGIT_INIT(53), \
+    _PyLong_DIGIT_INIT(54), \
+    _PyLong_DIGIT_INIT(55), \
+    _PyLong_DIGIT_INIT(56), \
+    _PyLong_DIGIT_INIT(57), \
+    _PyLong_DIGIT_INIT(58), \
+    _PyLong_DIGIT_INIT(59), \
+    _PyLong_DIGIT_INIT(60), \
+    _PyLong_DIGIT_INIT(61), \
+    _PyLong_DIGIT_INIT(62), \
+    _PyLong_DIGIT_INIT(63), \
+    _PyLong_DIGIT_INIT(64), \
+    _PyLong_DIGIT_INIT(65), \
+    _PyLong_DIGIT_INIT(66), \
+    _PyLong_DIGIT_INIT(67), \
+    _PyLong_DIGIT_INIT(68), \
+    _PyLong_DIGIT_INIT(69), \
+    _PyLong_DIGIT_INIT(70), \
+    _PyLong_DIGIT_INIT(71), \
+    _PyLong_DIGIT_INIT(72), \
+    _PyLong_DIGIT_INIT(73), \
+    _PyLong_DIGIT_INIT(74), \
+    _PyLong_DIGIT_INIT(75), \
+    _PyLong_DIGIT_INIT(76), \
+    _PyLong_DIGIT_INIT(77), \
+    _PyLong_DIGIT_INIT(78), \
+    _PyLong_DIGIT_INIT(79), \
+    _PyLong_DIGIT_INIT(80), \
+    _PyLong_DIGIT_INIT(81), \
+    _PyLong_DIGIT_INIT(82), \
+    _PyLong_DIGIT_INIT(83), \
+    _PyLong_DIGIT_INIT(84), \
+    _PyLong_DIGIT_INIT(85), \
+    _PyLong_DIGIT_INIT(86), \
+    _PyLong_DIGIT_INIT(87), \
+    _PyLong_DIGIT_INIT(88), \
+    _PyLong_DIGIT_INIT(89), \
+    _PyLong_DIGIT_INIT(90), \
+    _PyLong_DIGIT_INIT(91), \
+    _PyLong_DIGIT_INIT(92), \
+    _PyLong_DIGIT_INIT(93), \
+    _PyLong_DIGIT_INIT(94), \
+    _PyLong_DIGIT_INIT(95), \
+    _PyLong_DIGIT_INIT(96), \
+    _PyLong_DIGIT_INIT(97), \
+    _PyLong_DIGIT_INIT(98), \
+    _PyLong_DIGIT_INIT(99), \
+    _PyLong_DIGIT_INIT(100), \
+    _PyLong_DIGIT_INIT(101), \
+    _PyLong_DIGIT_INIT(102), \
+    _PyLong_DIGIT_INIT(103), \
+    _PyLong_DIGIT_INIT(104), \
+    _PyLong_DIGIT_INIT(105), \
+    _PyLong_DIGIT_INIT(106), \
+    _PyLong_DIGIT_INIT(107), \
+    _PyLong_DIGIT_INIT(108), \
+    _PyLong_DIGIT_INIT(109), \
+    _PyLong_DIGIT_INIT(110), \
+    _PyLong_DIGIT_INIT(111), \
+    _PyLong_DIGIT_INIT(112), \
+    _PyLong_DIGIT_INIT(113), \
+    _PyLong_DIGIT_INIT(114), \
+    _PyLong_DIGIT_INIT(115), \
+    _PyLong_DIGIT_INIT(116), \
+    _PyLong_DIGIT_INIT(117), \
+    _PyLong_DIGIT_INIT(118), \
+    _PyLong_DIGIT_INIT(119), \
+    _PyLong_DIGIT_INIT(120), \
+    _PyLong_DIGIT_INIT(121), \
+    _PyLong_DIGIT_INIT(122), \
+    _PyLong_DIGIT_INIT(123), \
+    _PyLong_DIGIT_INIT(124), \
+    _PyLong_DIGIT_INIT(125), \
+    _PyLong_DIGIT_INIT(126), \
+    _PyLong_DIGIT_INIT(127), \
+    _PyLong_DIGIT_INIT(128), \
+    _PyLong_DIGIT_INIT(129), \
+    _PyLong_DIGIT_INIT(130), \
+    _PyLong_DIGIT_INIT(131), \
+    _PyLong_DIGIT_INIT(132), \
+    _PyLong_DIGIT_INIT(133), \
+    _PyLong_DIGIT_INIT(134), \
+    _PyLong_DIGIT_INIT(135), \
+    _PyLong_DIGIT_INIT(136), \
+    _PyLong_DIGIT_INIT(137), \
+    _PyLong_DIGIT_INIT(138), \
+    _PyLong_DIGIT_INIT(139), \
+    _PyLong_DIGIT_INIT(140), \
+    _PyLong_DIGIT_INIT(141), \
+    _PyLong_DIGIT_INIT(142), \
+    _PyLong_DIGIT_INIT(143), \
+    _PyLong_DIGIT_INIT(144), \
+    _PyLong_DIGIT_INIT(145), \
+    _PyLong_DIGIT_INIT(146), \
+    _PyLong_DIGIT_INIT(147), \
+    _PyLong_DIGIT_INIT(148), \
+    _PyLong_DIGIT_INIT(149), \
+    _PyLong_DIGIT_INIT(150), \
+    _PyLong_DIGIT_INIT(151), \
+    _PyLong_DIGIT_INIT(152), \
+    _PyLong_DIGIT_INIT(153), \
+    _PyLong_DIGIT_INIT(154), \
+    _PyLong_DIGIT_INIT(155), \
+    _PyLong_DIGIT_INIT(156), \
+    _PyLong_DIGIT_INIT(157), \
+    _PyLong_DIGIT_INIT(158), \
+    _PyLong_DIGIT_INIT(159), \
+    _PyLong_DIGIT_INIT(160), \
+    _PyLong_DIGIT_INIT(161), \
+    _PyLong_DIGIT_INIT(162), \
+    _PyLong_DIGIT_INIT(163), \
+    _PyLong_DIGIT_INIT(164), \
+    _PyLong_DIGIT_INIT(165), \
+    _PyLong_DIGIT_INIT(166), \
+    _PyLong_DIGIT_INIT(167), \
+    _PyLong_DIGIT_INIT(168), \
+    _PyLong_DIGIT_INIT(169), \
+    _PyLong_DIGIT_INIT(170), \
+    _PyLong_DIGIT_INIT(171), \
+    _PyLong_DIGIT_INIT(172), \
+    _PyLong_DIGIT_INIT(173), \
+    _PyLong_DIGIT_INIT(174), \
+    _PyLong_DIGIT_INIT(175), \
+    _PyLong_DIGIT_INIT(176), \
+    _PyLong_DIGIT_INIT(177), \
+    _PyLong_DIGIT_INIT(178), \
+    _PyLong_DIGIT_INIT(179), \
+    _PyLong_DIGIT_INIT(180), \
+    _PyLong_DIGIT_INIT(181), \
+    _PyLong_DIGIT_INIT(182), \
+    _PyLong_DIGIT_INIT(183), \
+    _PyLong_DIGIT_INIT(184), \
+    _PyLong_DIGIT_INIT(185), \
+    _PyLong_DIGIT_INIT(186), \
+    _PyLong_DIGIT_INIT(187), \
+    _PyLong_DIGIT_INIT(188), \
+    _PyLong_DIGIT_INIT(189), \
+    _PyLong_DIGIT_INIT(190), \
+    _PyLong_DIGIT_INIT(191), \
+    _PyLong_DIGIT_INIT(192), \
+    _PyLong_DIGIT_INIT(193), \
+    _PyLong_DIGIT_INIT(194), \
+    _PyLong_DIGIT_INIT(195), \
+    _PyLong_DIGIT_INIT(196), \
+    _PyLong_DIGIT_INIT(197), \
+    _PyLong_DIGIT_INIT(198), \
+    _PyLong_DIGIT_INIT(199), \
+    _PyLong_DIGIT_INIT(200), \
+    _PyLong_DIGIT_INIT(201), \
+    _PyLong_DIGIT_INIT(202), \
+    _PyLong_DIGIT_INIT(203), \
+    _PyLong_DIGIT_INIT(204), \
+    _PyLong_DIGIT_INIT(205), \
+    _PyLong_DIGIT_INIT(206), \
+    _PyLong_DIGIT_INIT(207), \
+    _PyLong_DIGIT_INIT(208), \
+    _PyLong_DIGIT_INIT(209), \
+    _PyLong_DIGIT_INIT(210), \
+    _PyLong_DIGIT_INIT(211), \
+    _PyLong_DIGIT_INIT(212), \
+    _PyLong_DIGIT_INIT(213), \
+    _PyLong_DIGIT_INIT(214), \
+    _PyLong_DIGIT_INIT(215), \
+    _PyLong_DIGIT_INIT(216), \
+    _PyLong_DIGIT_INIT(217), \
+    _PyLong_DIGIT_INIT(218), \
+    _PyLong_DIGIT_INIT(219), \
+    _PyLong_DIGIT_INIT(220), \
+    _PyLong_DIGIT_INIT(221), \
+    _PyLong_DIGIT_INIT(222), \
+    _PyLong_DIGIT_INIT(223), \
+    _PyLong_DIGIT_INIT(224), \
+    _PyLong_DIGIT_INIT(225), \
+    _PyLong_DIGIT_INIT(226), \
+    _PyLong_DIGIT_INIT(227), \
+    _PyLong_DIGIT_INIT(228), \
+    _PyLong_DIGIT_INIT(229), \
+    _PyLong_DIGIT_INIT(230), \
+    _PyLong_DIGIT_INIT(231), \
+    _PyLong_DIGIT_INIT(232), \
+    _PyLong_DIGIT_INIT(233), \
+    _PyLong_DIGIT_INIT(234), \
+    _PyLong_DIGIT_INIT(235), \
+    _PyLong_DIGIT_INIT(236), \
+    _PyLong_DIGIT_INIT(237), \
+    _PyLong_DIGIT_INIT(238), \
+    _PyLong_DIGIT_INIT(239), \
+    _PyLong_DIGIT_INIT(240), \
+    _PyLong_DIGIT_INIT(241), \
+    _PyLong_DIGIT_INIT(242), \
+    _PyLong_DIGIT_INIT(243), \
+    _PyLong_DIGIT_INIT(244), \
+    _PyLong_DIGIT_INIT(245), \
+    _PyLong_DIGIT_INIT(246), \
+    _PyLong_DIGIT_INIT(247), \
+    _PyLong_DIGIT_INIT(248), \
+    _PyLong_DIGIT_INIT(249), \
+    _PyLong_DIGIT_INIT(250), \
+    _PyLong_DIGIT_INIT(251), \
+    _PyLong_DIGIT_INIT(252), \
+    _PyLong_DIGIT_INIT(253), \
+    _PyLong_DIGIT_INIT(254), \
+    _PyLong_DIGIT_INIT(255), \
+    _PyLong_DIGIT_INIT(256), \
+}
+
+#define _Py_bytes_characters_INIT { \
+    _PyBytes_CHAR_INIT(0), \
+    _PyBytes_CHAR_INIT(1), \
+    _PyBytes_CHAR_INIT(2), \
+    _PyBytes_CHAR_INIT(3), \
+    _PyBytes_CHAR_INIT(4), \
+    _PyBytes_CHAR_INIT(5), \
+    _PyBytes_CHAR_INIT(6), \
+    _PyBytes_CHAR_INIT(7), \
+    _PyBytes_CHAR_INIT(8), \
+    _PyBytes_CHAR_INIT(9), \
+    _PyBytes_CHAR_INIT(10), \
+    _PyBytes_CHAR_INIT(11), \
+    _PyBytes_CHAR_INIT(12), \
+    _PyBytes_CHAR_INIT(13), \
+    _PyBytes_CHAR_INIT(14), \
+    _PyBytes_CHAR_INIT(15), \
+    _PyBytes_CHAR_INIT(16), \
+    _PyBytes_CHAR_INIT(17), \
+    _PyBytes_CHAR_INIT(18), \
+    _PyBytes_CHAR_INIT(19), \
+    _PyBytes_CHAR_INIT(20), \
+    _PyBytes_CHAR_INIT(21), \
+    _PyBytes_CHAR_INIT(22), \
+    _PyBytes_CHAR_INIT(23), \
+    _PyBytes_CHAR_INIT(24), \
+    _PyBytes_CHAR_INIT(25), \
+    _PyBytes_CHAR_INIT(26), \
+    _PyBytes_CHAR_INIT(27), \
+    _PyBytes_CHAR_INIT(28), \
+    _PyBytes_CHAR_INIT(29), \
+    _PyBytes_CHAR_INIT(30), \
+    _PyBytes_CHAR_INIT(31), \
+    _PyBytes_CHAR_INIT(32), \
+    _PyBytes_CHAR_INIT(33), \
+    _PyBytes_CHAR_INIT(34), \
+    _PyBytes_CHAR_INIT(35), \
+    _PyBytes_CHAR_INIT(36), \
+    _PyBytes_CHAR_INIT(37), \
+    _PyBytes_CHAR_INIT(38), \
+    _PyBytes_CHAR_INIT(39), \
+    _PyBytes_CHAR_INIT(40), \
+    _PyBytes_CHAR_INIT(41), \
+    _PyBytes_CHAR_INIT(42), \
+    _PyBytes_CHAR_INIT(43), \
+    _PyBytes_CHAR_INIT(44), \
+    _PyBytes_CHAR_INIT(45), \
+    _PyBytes_CHAR_INIT(46), \
+    _PyBytes_CHAR_INIT(47), \
+    _PyBytes_CHAR_INIT(48), \
+    _PyBytes_CHAR_INIT(49), \
+    _PyBytes_CHAR_INIT(50), \
+    _PyBytes_CHAR_INIT(51), \
+    _PyBytes_CHAR_INIT(52), \
+    _PyBytes_CHAR_INIT(53), \
+    _PyBytes_CHAR_INIT(54), \
+    _PyBytes_CHAR_INIT(55), \
+    _PyBytes_CHAR_INIT(56), \
+    _PyBytes_CHAR_INIT(57), \
+    _PyBytes_CHAR_INIT(58), \
+    _PyBytes_CHAR_INIT(59), \
+    _PyBytes_CHAR_INIT(60), \
+    _PyBytes_CHAR_INIT(61), \
+    _PyBytes_CHAR_INIT(62), \
+    _PyBytes_CHAR_INIT(63), \
+    _PyBytes_CHAR_INIT(64), \
+    _PyBytes_CHAR_INIT(65), \
+    _PyBytes_CHAR_INIT(66), \
+    _PyBytes_CHAR_INIT(67), \
+    _PyBytes_CHAR_INIT(68), \
+    _PyBytes_CHAR_INIT(69), \
+    _PyBytes_CHAR_INIT(70), \
+    _PyBytes_CHAR_INIT(71), \
+    _PyBytes_CHAR_INIT(72), \
+    _PyBytes_CHAR_INIT(73), \
+    _PyBytes_CHAR_INIT(74), \
+    _PyBytes_CHAR_INIT(75), \
+    _PyBytes_CHAR_INIT(76), \
+    _PyBytes_CHAR_INIT(77), \
+    _PyBytes_CHAR_INIT(78), \
+    _PyBytes_CHAR_INIT(79), \
+    _PyBytes_CHAR_INIT(80), \
+    _PyBytes_CHAR_INIT(81), \
+    _PyBytes_CHAR_INIT(82), \
+    _PyBytes_CHAR_INIT(83), \
+    _PyBytes_CHAR_INIT(84), \
+    _PyBytes_CHAR_INIT(85), \
+    _PyBytes_CHAR_INIT(86), \
+    _PyBytes_CHAR_INIT(87), \
+    _PyBytes_CHAR_INIT(88), \
+    _PyBytes_CHAR_INIT(89), \
+    _PyBytes_CHAR_INIT(90), \
+    _PyBytes_CHAR_INIT(91), \
+    _PyBytes_CHAR_INIT(92), \
+    _PyBytes_CHAR_INIT(93), \
+    _PyBytes_CHAR_INIT(94), \
+    _PyBytes_CHAR_INIT(95), \
+    _PyBytes_CHAR_INIT(96), \
+    _PyBytes_CHAR_INIT(97), \
+    _PyBytes_CHAR_INIT(98), \
+    _PyBytes_CHAR_INIT(99), \
+    _PyBytes_CHAR_INIT(100), \
+    _PyBytes_CHAR_INIT(101), \
+    _PyBytes_CHAR_INIT(102), \
+    _PyBytes_CHAR_INIT(103), \
+    _PyBytes_CHAR_INIT(104), \
+    _PyBytes_CHAR_INIT(105), \
+    _PyBytes_CHAR_INIT(106), \
+    _PyBytes_CHAR_INIT(107), \
+    _PyBytes_CHAR_INIT(108), \
+    _PyBytes_CHAR_INIT(109), \
+    _PyBytes_CHAR_INIT(110), \
+    _PyBytes_CHAR_INIT(111), \
+    _PyBytes_CHAR_INIT(112), \
+    _PyBytes_CHAR_INIT(113), \
+    _PyBytes_CHAR_INIT(114), \
+    _PyBytes_CHAR_INIT(115), \
+    _PyBytes_CHAR_INIT(116), \
+    _PyBytes_CHAR_INIT(117), \
+    _PyBytes_CHAR_INIT(118), \
+    _PyBytes_CHAR_INIT(119), \
+    _PyBytes_CHAR_INIT(120), \
+    _PyBytes_CHAR_INIT(121), \
+    _PyBytes_CHAR_INIT(122), \
+    _PyBytes_CHAR_INIT(123), \
+    _PyBytes_CHAR_INIT(124), \
+    _PyBytes_CHAR_INIT(125), \
+    _PyBytes_CHAR_INIT(126), \
+    _PyBytes_CHAR_INIT(127), \
+    _PyBytes_CHAR_INIT(128), \
+    _PyBytes_CHAR_INIT(129), \
+    _PyBytes_CHAR_INIT(130), \
+    _PyBytes_CHAR_INIT(131), \
+    _PyBytes_CHAR_INIT(132), \
+    _PyBytes_CHAR_INIT(133), \
+    _PyBytes_CHAR_INIT(134), \
+    _PyBytes_CHAR_INIT(135), \
+    _PyBytes_CHAR_INIT(136), \
+    _PyBytes_CHAR_INIT(137), \
+    _PyBytes_CHAR_INIT(138), \
+    _PyBytes_CHAR_INIT(139), \
+    _PyBytes_CHAR_INIT(140), \
+    _PyBytes_CHAR_INIT(141), \
+    _PyBytes_CHAR_INIT(142), \
+    _PyBytes_CHAR_INIT(143), \
+    _PyBytes_CHAR_INIT(144), \
+    _PyBytes_CHAR_INIT(145), \
+    _PyBytes_CHAR_INIT(146), \
+    _PyBytes_CHAR_INIT(147), \
+    _PyBytes_CHAR_INIT(148), \
+    _PyBytes_CHAR_INIT(149), \
+    _PyBytes_CHAR_INIT(150), \
+    _PyBytes_CHAR_INIT(151), \
+    _PyBytes_CHAR_INIT(152), \
+    _PyBytes_CHAR_INIT(153), \
+    _PyBytes_CHAR_INIT(154), \
+    _PyBytes_CHAR_INIT(155), \
+    _PyBytes_CHAR_INIT(156), \
+    _PyBytes_CHAR_INIT(157), \
+    _PyBytes_CHAR_INIT(158), \
+    _PyBytes_CHAR_INIT(159), \
+    _PyBytes_CHAR_INIT(160), \
+    _PyBytes_CHAR_INIT(161), \
+    _PyBytes_CHAR_INIT(162), \
+    _PyBytes_CHAR_INIT(163), \
+    _PyBytes_CHAR_INIT(164), \
+    _PyBytes_CHAR_INIT(165), \
+    _PyBytes_CHAR_INIT(166), \
+    _PyBytes_CHAR_INIT(167), \
+    _PyBytes_CHAR_INIT(168), \
+    _PyBytes_CHAR_INIT(169), \
+    _PyBytes_CHAR_INIT(170), \
+    _PyBytes_CHAR_INIT(171), \
+    _PyBytes_CHAR_INIT(172), \
+    _PyBytes_CHAR_INIT(173), \
+    _PyBytes_CHAR_INIT(174), \
+    _PyBytes_CHAR_INIT(175), \
+    _PyBytes_CHAR_INIT(176), \
+    _PyBytes_CHAR_INIT(177), \
+    _PyBytes_CHAR_INIT(178), \
+    _PyBytes_CHAR_INIT(179), \
+    _PyBytes_CHAR_INIT(180), \
+    _PyBytes_CHAR_INIT(181), \
+    _PyBytes_CHAR_INIT(182), \
+    _PyBytes_CHAR_INIT(183), \
+    _PyBytes_CHAR_INIT(184), \
+    _PyBytes_CHAR_INIT(185), \
+    _PyBytes_CHAR_INIT(186), \
+    _PyBytes_CHAR_INIT(187), \
+    _PyBytes_CHAR_INIT(188), \
+    _PyBytes_CHAR_INIT(189), \
+    _PyBytes_CHAR_INIT(190), \
+    _PyBytes_CHAR_INIT(191), \
+    _PyBytes_CHAR_INIT(192), \
+    _PyBytes_CHAR_INIT(193), \
+    _PyBytes_CHAR_INIT(194), \
+    _PyBytes_CHAR_INIT(195), \
+    _PyBytes_CHAR_INIT(196), \
+    _PyBytes_CHAR_INIT(197), \
+    _PyBytes_CHAR_INIT(198), \
+    _PyBytes_CHAR_INIT(199), \
+    _PyBytes_CHAR_INIT(200), \
+    _PyBytes_CHAR_INIT(201), \
+    _PyBytes_CHAR_INIT(202), \
+    _PyBytes_CHAR_INIT(203), \
+    _PyBytes_CHAR_INIT(204), \
+    _PyBytes_CHAR_INIT(205), \
+    _PyBytes_CHAR_INIT(206), \
+    _PyBytes_CHAR_INIT(207), \
+    _PyBytes_CHAR_INIT(208), \
+    _PyBytes_CHAR_INIT(209), \
+    _PyBytes_CHAR_INIT(210), \
+    _PyBytes_CHAR_INIT(211), \
+    _PyBytes_CHAR_INIT(212), \
+    _PyBytes_CHAR_INIT(213), \
+    _PyBytes_CHAR_INIT(214), \
+    _PyBytes_CHAR_INIT(215), \
+    _PyBytes_CHAR_INIT(216), \
+    _PyBytes_CHAR_INIT(217), \
+    _PyBytes_CHAR_INIT(218), \
+    _PyBytes_CHAR_INIT(219), \
+    _PyBytes_CHAR_INIT(220), \
+    _PyBytes_CHAR_INIT(221), \
+    _PyBytes_CHAR_INIT(222), \
+    _PyBytes_CHAR_INIT(223), \
+    _PyBytes_CHAR_INIT(224), \
+    _PyBytes_CHAR_INIT(225), \
+    _PyBytes_CHAR_INIT(226), \
+    _PyBytes_CHAR_INIT(227), \
+    _PyBytes_CHAR_INIT(228), \
+    _PyBytes_CHAR_INIT(229), \
+    _PyBytes_CHAR_INIT(230), \
+    _PyBytes_CHAR_INIT(231), \
+    _PyBytes_CHAR_INIT(232), \
+    _PyBytes_CHAR_INIT(233), \
+    _PyBytes_CHAR_INIT(234), \
+    _PyBytes_CHAR_INIT(235), \
+    _PyBytes_CHAR_INIT(236), \
+    _PyBytes_CHAR_INIT(237), \
+    _PyBytes_CHAR_INIT(238), \
+    _PyBytes_CHAR_INIT(239), \
+    _PyBytes_CHAR_INIT(240), \
+    _PyBytes_CHAR_INIT(241), \
+    _PyBytes_CHAR_INIT(242), \
+    _PyBytes_CHAR_INIT(243), \
+    _PyBytes_CHAR_INIT(244), \
+    _PyBytes_CHAR_INIT(245), \
+    _PyBytes_CHAR_INIT(246), \
+    _PyBytes_CHAR_INIT(247), \
+    _PyBytes_CHAR_INIT(248), \
+    _PyBytes_CHAR_INIT(249), \
+    _PyBytes_CHAR_INIT(250), \
+    _PyBytes_CHAR_INIT(251), \
+    _PyBytes_CHAR_INIT(252), \
+    _PyBytes_CHAR_INIT(253), \
+    _PyBytes_CHAR_INIT(254), \
+    _PyBytes_CHAR_INIT(255), \
+}
+
+#define _Py_str_literals_INIT { /* one INIT_STR(name, text) entry per string literal, sorted by name */ \
+    INIT_STR(anon_dictcomp, "<dictcomp>"), \
+    INIT_STR(anon_genexpr, "<genexpr>"), \
+    INIT_STR(anon_lambda, "<lambda>"), \
+    INIT_STR(anon_listcomp, "<listcomp>"), \
+    INIT_STR(anon_module, "<module>"), \
+    INIT_STR(anon_setcomp, "<setcomp>"), \
+    INIT_STR(anon_string, "<string>"), \
+    INIT_STR(anon_unknown, "<unknown>"), \
+    INIT_STR(close_br, "}"), \
+    INIT_STR(dbl_close_br, "}}"), \
+    INIT_STR(dbl_open_br, "{{"), \
+    INIT_STR(dbl_percent, "%%"), \
+    INIT_STR(defaults, ".defaults"), \
+    INIT_STR(dot, "."), \
+    INIT_STR(dot_locals, ".<locals>"), \
+    INIT_STR(empty, ""), \
+    INIT_STR(generic_base, ".generic_base"), \
+    INIT_STR(json_decoder, "json.decoder"), \
+    INIT_STR(kwdefaults, ".kwdefaults"), \
+    INIT_STR(list_err, "list index out of range"), \
+    INIT_STR(newline, "\n"), \
+    INIT_STR(open_br, "{"), \
+    INIT_STR(percent, "%"), \
+    INIT_STR(shim_name, "<shim>"), \
+    INIT_STR(type_params, ".type_params"), \
+    INIT_STR(utf_8, "utf-8"), \
+}
+
+#define _Py_str_identifiers_INIT { /* one INIT_ID(name) entry per identifier, in ASCII order of the name */ \
+    INIT_ID(CANCELLED), \
+    INIT_ID(FINISHED), \
+    INIT_ID(False), \
+    INIT_ID(JSONDecodeError), \
+    INIT_ID(PENDING), \
+    INIT_ID(Py_Repr), \
+    INIT_ID(TextIOWrapper), \
+    INIT_ID(True), \
+    INIT_ID(WarningMessage), \
+    INIT_ID(_), \
+    INIT_ID(_WindowsConsoleIO), \
+    INIT_ID(__IOBase_closed), \
+    INIT_ID(__abc_tpflags__), \
+    INIT_ID(__abs__), \
+    INIT_ID(__abstractmethods__), \
+    INIT_ID(__add__), \
+    INIT_ID(__aenter__), \
+    INIT_ID(__aexit__), \
+    INIT_ID(__aiter__), \
+    INIT_ID(__all__), \
+    INIT_ID(__and__), \
+    INIT_ID(__anext__), \
+    INIT_ID(__annotations__), \
+    INIT_ID(__args__), \
+    INIT_ID(__asyncio_running_event_loop__), \
+    INIT_ID(__await__), \
+    INIT_ID(__bases__), \
+    INIT_ID(__bool__), \
+    INIT_ID(__buffer__), \
+    INIT_ID(__build_class__), \
+    INIT_ID(__builtins__), \
+    INIT_ID(__bytes__), \
+    INIT_ID(__call__), \
+    INIT_ID(__cantrace__), \
+    INIT_ID(__class__), \
+    INIT_ID(__class_getitem__), \
+    INIT_ID(__classcell__), \
+    INIT_ID(__classdict__), \
+    INIT_ID(__classdictcell__), \
+    INIT_ID(__complex__), \
+    INIT_ID(__contains__), \
+    INIT_ID(__copy__), \
+    INIT_ID(__ctypes_from_outparam__), \
+    INIT_ID(__del__), \
+    INIT_ID(__delattr__), \
+    INIT_ID(__delete__), \
+    INIT_ID(__delitem__), \
+    INIT_ID(__dict__), \
+    INIT_ID(__dictoffset__), \
+    INIT_ID(__dir__), \
+    INIT_ID(__divmod__), \
+    INIT_ID(__doc__), \
+    INIT_ID(__enter__), \
+    INIT_ID(__eq__), \
+    INIT_ID(__exit__), \
+    INIT_ID(__file__), \
+    INIT_ID(__float__), \
+    INIT_ID(__floordiv__), \
+    INIT_ID(__format__), \
+    INIT_ID(__fspath__), \
+    INIT_ID(__ge__), \
+    INIT_ID(__get__), \
+    INIT_ID(__getattr__), \
+    INIT_ID(__getattribute__), \
+    INIT_ID(__getinitargs__), \
+    INIT_ID(__getitem__), \
+    INIT_ID(__getnewargs__), \
+    INIT_ID(__getnewargs_ex__), \
+    INIT_ID(__getstate__), \
+    INIT_ID(__gt__), \
+    INIT_ID(__hash__), \
+    INIT_ID(__iadd__), \
+    INIT_ID(__iand__), \
+    INIT_ID(__ifloordiv__), \
+    INIT_ID(__ilshift__), \
+    INIT_ID(__imatmul__), \
+    INIT_ID(__imod__), \
+    INIT_ID(__import__), \
+    INIT_ID(__imul__), \
+    INIT_ID(__index__), \
+    INIT_ID(__init__), \
+    INIT_ID(__init_subclass__), \
+    INIT_ID(__instancecheck__), \
+    INIT_ID(__int__), \
+    INIT_ID(__invert__), \
+    INIT_ID(__ior__), \
+    INIT_ID(__ipow__), \
+    INIT_ID(__irshift__), \
+    INIT_ID(__isabstractmethod__), \
+    INIT_ID(__isub__), \
+    INIT_ID(__iter__), \
+    INIT_ID(__itruediv__), \
+    INIT_ID(__ixor__), \
+    INIT_ID(__le__), \
+    INIT_ID(__len__), \
+    INIT_ID(__length_hint__), \
+    INIT_ID(__lltrace__), \
+    INIT_ID(__loader__), \
+    INIT_ID(__lshift__), \
+    INIT_ID(__lt__), \
+    INIT_ID(__main__), \
+    INIT_ID(__matmul__), \
+    INIT_ID(__missing__), \
+    INIT_ID(__mod__), \
+    INIT_ID(__module__), \
+    INIT_ID(__mro_entries__), \
+    INIT_ID(__mul__), \
+    INIT_ID(__name__), \
+    INIT_ID(__ne__), \
+    INIT_ID(__neg__), \
+    INIT_ID(__new__), \
+    INIT_ID(__newobj__), \
+    INIT_ID(__newobj_ex__), \
+    INIT_ID(__next__), \
+    INIT_ID(__notes__), \
+    INIT_ID(__or__), \
+    INIT_ID(__orig_class__), \
+    INIT_ID(__origin__), \
+    INIT_ID(__package__), \
+    INIT_ID(__parameters__), \
+    INIT_ID(__path__), \
+    INIT_ID(__pos__), \
+    INIT_ID(__pow__), \
+    INIT_ID(__prepare__), \
+    INIT_ID(__qualname__), \
+    INIT_ID(__radd__), \
+    INIT_ID(__rand__), \
+    INIT_ID(__rdivmod__), \
+    INIT_ID(__reduce__), \
+    INIT_ID(__reduce_ex__), \
+    INIT_ID(__release_buffer__), \
+    INIT_ID(__repr__), \
+    INIT_ID(__reversed__), \
+    INIT_ID(__rfloordiv__), \
+    INIT_ID(__rlshift__), \
+    INIT_ID(__rmatmul__), \
+    INIT_ID(__rmod__), \
+    INIT_ID(__rmul__), \
+    INIT_ID(__ror__), \
+    INIT_ID(__round__), \
+    INIT_ID(__rpow__), \
+    INIT_ID(__rrshift__), \
+    INIT_ID(__rshift__), \
+    INIT_ID(__rsub__), \
+    INIT_ID(__rtruediv__), \
+    INIT_ID(__rxor__), \
+    INIT_ID(__set__), \
+    INIT_ID(__set_name__), \
+    INIT_ID(__setattr__), \
+    INIT_ID(__setitem__), \
+    INIT_ID(__setstate__), \
+    INIT_ID(__sizeof__), \
+    INIT_ID(__slotnames__), \
+    INIT_ID(__slots__), \
+    INIT_ID(__spec__), \
+    INIT_ID(__str__), \
+    INIT_ID(__sub__), \
+    INIT_ID(__subclasscheck__), \
+    INIT_ID(__subclasshook__), \
+    INIT_ID(__truediv__), \
+    INIT_ID(__trunc__), \
+    INIT_ID(__type_params__), \
+    INIT_ID(__typing_is_unpacked_typevartuple__), \
+    INIT_ID(__typing_prepare_subst__), \
+    INIT_ID(__typing_subst__), \
+    INIT_ID(__typing_unpacked_tuple_args__), \
+    INIT_ID(__warningregistry__), \
+    INIT_ID(__weaklistoffset__), \
+    INIT_ID(__weakref__), \
+    INIT_ID(__xor__), \
+    INIT_ID(_abc_impl), \
+    INIT_ID(_abstract_), \
+    INIT_ID(_active), \
+    INIT_ID(_annotation), \
+    INIT_ID(_anonymous_), \
+    INIT_ID(_argtypes_), \
+    INIT_ID(_as_parameter_), \
+    INIT_ID(_asyncio_future_blocking), \
+    INIT_ID(_blksize), \
+    INIT_ID(_bootstrap), \
+    INIT_ID(_check_retval_), \
+    INIT_ID(_dealloc_warn), \
+    INIT_ID(_feature_version), \
+    INIT_ID(_fields_), \
+    INIT_ID(_finalizing), \
+    INIT_ID(_find_and_load), \
+    INIT_ID(_fix_up_module), \
+    INIT_ID(_flags_), \
+    INIT_ID(_get_sourcefile), \
+    INIT_ID(_handle_fromlist), \
+    INIT_ID(_initializing), \
+    INIT_ID(_io), \
+    INIT_ID(_is_text_encoding), \
+    INIT_ID(_length_), \
+    INIT_ID(_limbo), \
+    INIT_ID(_lock_unlock_module), \
+    INIT_ID(_loop), \
+    INIT_ID(_needs_com_addref_), \
+    INIT_ID(_pack_), \
+    INIT_ID(_restype_), \
+    INIT_ID(_showwarnmsg), \
+    INIT_ID(_shutdown), \
+    INIT_ID(_slotnames), \
+    INIT_ID(_strptime_datetime), \
+    INIT_ID(_swappedbytes_), \
+    INIT_ID(_type_), \
+    INIT_ID(_uninitialized_submodules), \
+    INIT_ID(_warn_unawaited_coroutine), \
+    INIT_ID(_xoptions), \
+    INIT_ID(a), \
+    INIT_ID(abs_tol), \
+    INIT_ID(access), \
+    INIT_ID(add), \
+    INIT_ID(add_done_callback), \
+    INIT_ID(after_in_child), \
+    INIT_ID(after_in_parent), \
+    INIT_ID(aggregate_class), \
+    INIT_ID(alias), \
+    INIT_ID(append), \
+    INIT_ID(arg), \
+    INIT_ID(argdefs), \
+    INIT_ID(args), \
+    INIT_ID(arguments), \
+    INIT_ID(argv), \
+    INIT_ID(as_integer_ratio), \
+    INIT_ID(ast), \
+    INIT_ID(attribute), \
+    INIT_ID(authorizer_callback), \
+    INIT_ID(autocommit), \
+    INIT_ID(b), \
+    INIT_ID(backtick), \
+    INIT_ID(base), \
+    INIT_ID(before), \
+    INIT_ID(big), \
+    INIT_ID(binary_form), \
+    INIT_ID(block), \
+    INIT_ID(bound), \
+    INIT_ID(buffer), \
+    INIT_ID(buffer_callback), \
+    INIT_ID(buffer_size), \
+    INIT_ID(buffering), \
+    INIT_ID(buffers), \
+    INIT_ID(bufsize), \
+    INIT_ID(builtins), \
+    INIT_ID(byteorder), \
+    INIT_ID(bytes), \
+    INIT_ID(bytes_per_sep), \
+    INIT_ID(c), \
+    INIT_ID(c_call), \
+    INIT_ID(c_exception), \
+    INIT_ID(c_return), \
+    INIT_ID(cached_statements), \
+    INIT_ID(cadata), \
+    INIT_ID(cafile), \
+    INIT_ID(call), \
+    INIT_ID(call_exception_handler), \
+    INIT_ID(call_soon), \
+    INIT_ID(cancel), \
+    INIT_ID(capath), \
+    INIT_ID(category), \
+    INIT_ID(cb_type), \
+    INIT_ID(certfile), \
+    INIT_ID(check_same_thread), \
+    INIT_ID(clear), \
+    INIT_ID(close), \
+    INIT_ID(closed), \
+    INIT_ID(closefd), \
+    INIT_ID(closure), \
+    INIT_ID(co_argcount), \
+    INIT_ID(co_cellvars), \
+    INIT_ID(co_code), \
+    INIT_ID(co_consts), \
+    INIT_ID(co_exceptiontable), \
+    INIT_ID(co_filename), \
+    INIT_ID(co_firstlineno), \
+    INIT_ID(co_flags), \
+    INIT_ID(co_freevars), \
+    INIT_ID(co_kwonlyargcount), \
+    INIT_ID(co_linetable), \
+    INIT_ID(co_name), \
+    INIT_ID(co_names), \
+    INIT_ID(co_nlocals), \
+    INIT_ID(co_posonlyargcount), \
+    INIT_ID(co_qualname), \
+    INIT_ID(co_stacksize), \
+    INIT_ID(co_varnames), \
+    INIT_ID(code), \
+    INIT_ID(command), \
+    INIT_ID(comment_factory), \
+    INIT_ID(compile_mode), \
+    INIT_ID(consts), \
+    INIT_ID(context), \
+    INIT_ID(contravariant), \
+    INIT_ID(cookie), \
+    INIT_ID(copy), \
+    INIT_ID(copyreg), \
+    INIT_ID(coro), \
+    INIT_ID(count), \
+    INIT_ID(covariant), \
+    INIT_ID(cwd), \
+    INIT_ID(d), \
+    INIT_ID(data), \
+    INIT_ID(database), \
+    INIT_ID(decode), \
+    INIT_ID(decoder), \
+    INIT_ID(default), \
+    INIT_ID(defaultaction), \
+    INIT_ID(delete), \
+    INIT_ID(depth), \
+    INIT_ID(detect_types), \
+    INIT_ID(deterministic), \
+    INIT_ID(device), \
+    INIT_ID(dict), \
+    INIT_ID(dictcomp), \
+    INIT_ID(difference_update), \
+    INIT_ID(digest), \
+    INIT_ID(digest_size), \
+    INIT_ID(digestmod), \
+    INIT_ID(dir_fd), \
+    INIT_ID(discard), \
+    INIT_ID(dispatch_table), \
+    INIT_ID(displayhook), \
+    INIT_ID(dklen), \
+    INIT_ID(doc), \
+    INIT_ID(dont_inherit), \
+    INIT_ID(dst), \
+    INIT_ID(dst_dir_fd), \
+    INIT_ID(duration), \
+    INIT_ID(e), \
+    INIT_ID(eager_start), \
+    INIT_ID(effective_ids), \
+    INIT_ID(element_factory), \
+    INIT_ID(encode), \
+    INIT_ID(encoding), \
+    INIT_ID(end), \
+    INIT_ID(end_lineno), \
+    INIT_ID(end_offset), \
+    INIT_ID(endpos), \
+    INIT_ID(entrypoint), \
+    INIT_ID(env), \
+    INIT_ID(errors), \
+    INIT_ID(event), \
+    INIT_ID(eventmask), \
+    INIT_ID(exc_type), \
+    INIT_ID(exc_value), \
+    INIT_ID(excepthook), \
+    INIT_ID(exception), \
+    INIT_ID(existing_file_name), \
+    INIT_ID(exp), \
+    INIT_ID(extend), \
+    INIT_ID(extra_tokens), \
+    INIT_ID(facility), \
+    INIT_ID(factory), \
+    INIT_ID(false), \
+    INIT_ID(family), \
+    INIT_ID(fanout), \
+    INIT_ID(fd), \
+    INIT_ID(fd2), \
+    INIT_ID(fdel), \
+    INIT_ID(fget), \
+    INIT_ID(file), \
+    INIT_ID(file_actions), \
+    INIT_ID(filename), \
+    INIT_ID(fileno), \
+    INIT_ID(filepath), \
+    INIT_ID(fillvalue), \
+    INIT_ID(filters), \
+    INIT_ID(final), \
+    INIT_ID(find_class), \
+    INIT_ID(fix_imports), \
+    INIT_ID(flags), \
+    INIT_ID(flush), \
+    INIT_ID(follow_symlinks), \
+    INIT_ID(format), \
+    INIT_ID(frequency), \
+    INIT_ID(from_param), \
+    INIT_ID(fromlist), \
+    INIT_ID(fromtimestamp), \
+    INIT_ID(fromutc), \
+    INIT_ID(fset), \
+    INIT_ID(func), \
+    INIT_ID(future), \
+    INIT_ID(generation), \
+    INIT_ID(genexpr), \
+    INIT_ID(get), \
+    INIT_ID(get_debug), \
+    INIT_ID(get_event_loop), \
+    INIT_ID(get_loop), \
+    INIT_ID(get_source), \
+    INIT_ID(getattr), \
+    INIT_ID(getstate), \
+    INIT_ID(gid), \
+    INIT_ID(globals), \
+    INIT_ID(groupindex), \
+    INIT_ID(groups), \
+    INIT_ID(handle), \
+    INIT_ID(hash_name), \
+    INIT_ID(header), \
+    INIT_ID(headers), \
+    INIT_ID(hi), \
+    INIT_ID(hook), \
+    INIT_ID(id), \
+    INIT_ID(ident), \
+    INIT_ID(ignore), \
+    INIT_ID(imag), \
+    INIT_ID(importlib), \
+    INIT_ID(in_fd), \
+    INIT_ID(incoming), \
+    INIT_ID(indexgroup), \
+    INIT_ID(inf), \
+    INIT_ID(infer_variance), \
+    INIT_ID(inheritable), \
+    INIT_ID(initial), \
+    INIT_ID(initial_bytes), \
+    INIT_ID(initial_value), \
+    INIT_ID(initval), \
+    INIT_ID(inner_size), \
+    INIT_ID(input), \
+    INIT_ID(insert_comments), \
+    INIT_ID(insert_pis), \
+    INIT_ID(instructions), \
+    INIT_ID(intern), \
+    INIT_ID(intersection), \
+    INIT_ID(is_running), \
+    INIT_ID(isatty), \
+    INIT_ID(isinstance), \
+    INIT_ID(isoformat), \
+    INIT_ID(isolation_level), \
+    INIT_ID(istext), \
+    INIT_ID(item), \
+    INIT_ID(items), \
+    INIT_ID(iter), \
+    INIT_ID(iterable), \
+    INIT_ID(iterations), \
+    INIT_ID(join), \
+    INIT_ID(jump), \
+    INIT_ID(keepends), \
+    INIT_ID(key), \
+    INIT_ID(keyfile), \
+    INIT_ID(keys), \
+    INIT_ID(kind), \
+    INIT_ID(kw), \
+    INIT_ID(kw1), \
+    INIT_ID(kw2), \
+    INIT_ID(lambda), \
+    INIT_ID(last), \
+    INIT_ID(last_exc), \
+    INIT_ID(last_node), \
+    INIT_ID(last_traceback), \
+    INIT_ID(last_type), \
+    INIT_ID(last_value), \
+    INIT_ID(latin1), \
+    INIT_ID(leaf_size), \
+    INIT_ID(len), \
+    INIT_ID(length), \
+    INIT_ID(level), \
+    INIT_ID(limit), \
+    INIT_ID(line), \
+    INIT_ID(line_buffering), \
+    INIT_ID(lineno), \
+    INIT_ID(listcomp), \
+    INIT_ID(little), \
+    INIT_ID(lo), \
+    INIT_ID(locale), \
+    INIT_ID(locals), \
+    INIT_ID(logoption), \
+    INIT_ID(loop), \
+    INIT_ID(mapping), \
+    INIT_ID(match), \
+    INIT_ID(max_length), \
+    INIT_ID(maxdigits), \
+    INIT_ID(maxevents), \
+    INIT_ID(maxmem), \
+    INIT_ID(maxsplit), \
+    INIT_ID(maxvalue), \
+    INIT_ID(memLevel), \
+    INIT_ID(memlimit), \
+    INIT_ID(message), \
+    INIT_ID(metaclass), \
+    INIT_ID(metadata), \
+    INIT_ID(method), \
+    INIT_ID(mod), \
+    INIT_ID(mode), \
+    INIT_ID(module), \
+    INIT_ID(module_globals), \
+    INIT_ID(modules), \
+    INIT_ID(mro), \
+    INIT_ID(msg), \
+    INIT_ID(mycmp), \
+    INIT_ID(n), \
+    INIT_ID(n_arg), \
+    INIT_ID(n_fields), \
+    INIT_ID(n_sequence_fields), \
+    INIT_ID(n_unnamed_fields), \
+    INIT_ID(name), \
+    INIT_ID(name_from), \
+    INIT_ID(namespace_separator), \
+    INIT_ID(namespaces), \
+    INIT_ID(narg), \
+    INIT_ID(ndigits), \
+    INIT_ID(new_file_name), \
+    INIT_ID(new_limit), \
+    INIT_ID(newline), \
+    INIT_ID(newlines), \
+    INIT_ID(next), \
+    INIT_ID(nlocals), \
+    INIT_ID(node_depth), \
+    INIT_ID(node_offset), \
+    INIT_ID(ns), \
+    INIT_ID(nstype), \
+    INIT_ID(nt), \
+    INIT_ID(null), \
+    INIT_ID(number), \
+    INIT_ID(obj), \
+    INIT_ID(object), \
+    INIT_ID(offset), \
+    INIT_ID(offset_dst), \
+    INIT_ID(offset_src), \
+    INIT_ID(on_type_read), \
+    INIT_ID(onceregistry), \
+    INIT_ID(only_keys), \
+    INIT_ID(oparg), \
+    INIT_ID(opcode), \
+    INIT_ID(open), \
+    INIT_ID(opener), \
+    INIT_ID(operation), \
+    INIT_ID(optimize), \
+    INIT_ID(options), \
+    INIT_ID(order), \
+    INIT_ID(origin), \
+    INIT_ID(out_fd), \
+    INIT_ID(outgoing), \
+    INIT_ID(overlapped), \
+    INIT_ID(owner), \
+    INIT_ID(p), \
+    INIT_ID(pages), \
+    INIT_ID(parent), \
+    INIT_ID(password), \
+    INIT_ID(path), \
+    INIT_ID(pattern), \
+    INIT_ID(peek), \
+    INIT_ID(persistent_id), \
+    INIT_ID(persistent_load), \
+    INIT_ID(person), \
+    INIT_ID(pi_factory), \
+    INIT_ID(pid), \
+    INIT_ID(policy), \
+    INIT_ID(pos), \
+    INIT_ID(pos1), \
+    INIT_ID(pos2), \
+    INIT_ID(posix), \
+    INIT_ID(print_file_and_line), \
+    INIT_ID(priority), \
+    INIT_ID(progress), \
+    INIT_ID(progress_handler), \
+    INIT_ID(progress_routine), \
+    INIT_ID(proto), \
+    INIT_ID(protocol), \
+    INIT_ID(ps1), \
+    INIT_ID(ps2), \
+    INIT_ID(query), \
+    INIT_ID(quotetabs), \
+    INIT_ID(r), \
+    INIT_ID(raw), \
+    INIT_ID(read), \
+    INIT_ID(read1), \
+    INIT_ID(readable), \
+    INIT_ID(readall), \
+    INIT_ID(readinto), \
+    INIT_ID(readinto1), \
+    INIT_ID(readline), \
+    INIT_ID(readonly), \
+    INIT_ID(real), \
+    INIT_ID(reducer_override), \
+    INIT_ID(registry), \
+    INIT_ID(rel_tol), \
+    INIT_ID(release), \
+    INIT_ID(reload), \
+    INIT_ID(repl), \
+    INIT_ID(replace), \
+    INIT_ID(reserved), \
+    INIT_ID(reset), \
+    INIT_ID(resetids), \
+    INIT_ID(return), \
+    INIT_ID(reverse), \
+    INIT_ID(reversed), \
+    INIT_ID(s), \
+    INIT_ID(salt), \
+    INIT_ID(sched_priority), \
+    INIT_ID(scheduler), \
+    INIT_ID(seek), \
+    INIT_ID(seekable), \
+    INIT_ID(selectors), \
+    INIT_ID(self), \
+    INIT_ID(send), \
+    INIT_ID(sep), \
+    INIT_ID(sequence), \
+    INIT_ID(server_hostname), \
+    INIT_ID(server_side), \
+    INIT_ID(session), \
+    INIT_ID(setcomp), \
+    INIT_ID(setpgroup), \
+    INIT_ID(setsid), \
+    INIT_ID(setsigdef), \
+    INIT_ID(setsigmask), \
+    INIT_ID(setstate), \
+    INIT_ID(shape), \
+    INIT_ID(show_cmd), \
+    INIT_ID(signed), \
+    INIT_ID(size), \
+    INIT_ID(sizehint), \
+    INIT_ID(skip_file_prefixes), \
+    INIT_ID(sleep), \
+    INIT_ID(sock), \
+    INIT_ID(sort), \
+    INIT_ID(sound), \
+    INIT_ID(source), \
+    INIT_ID(source_traceback), \
+    INIT_ID(src), \
+    INIT_ID(src_dir_fd), \
+    INIT_ID(stacklevel), \
+    INIT_ID(start), \
+    INIT_ID(statement), \
+    INIT_ID(status), \
+    INIT_ID(stderr), \
+    INIT_ID(stdin), \
+    INIT_ID(stdout), \
+    INIT_ID(step), \
+    INIT_ID(steps), \
+    INIT_ID(store_name), \
+    INIT_ID(strategy), \
+    INIT_ID(strftime), \
+    INIT_ID(strict), \
+    INIT_ID(strict_mode), \
+    INIT_ID(string), \
+    INIT_ID(sub_key), \
+    INIT_ID(symmetric_difference_update), \
+    INIT_ID(tabsize), \
+    INIT_ID(tag), \
+    INIT_ID(target), \
+    INIT_ID(target_is_directory), \
+    INIT_ID(task), \
+    INIT_ID(tb_frame), \
+    INIT_ID(tb_lasti), \
+    INIT_ID(tb_lineno), \
+    INIT_ID(tb_next), \
+    INIT_ID(tell), \
+    INIT_ID(template), \
+    INIT_ID(term), \
+    INIT_ID(text), \
+    INIT_ID(threading), \
+    INIT_ID(throw), \
+    INIT_ID(timeout), \
+    INIT_ID(times), \
+    INIT_ID(timetuple), \
+    INIT_ID(top), \
+    INIT_ID(trace_callback), \
+    INIT_ID(traceback), \
+    INIT_ID(trailers), \
+    INIT_ID(translate), \
+    INIT_ID(true), \
+    INIT_ID(truncate), \
+    INIT_ID(twice), \
+    INIT_ID(txt), \
+    INIT_ID(type), \
+    INIT_ID(type_params), \
+    INIT_ID(tz), \
+    INIT_ID(tzname), \
+    INIT_ID(uid), \
+    INIT_ID(unlink), \
+    INIT_ID(unraisablehook), \
+    INIT_ID(uri), \
+    INIT_ID(usedforsecurity), \
+    INIT_ID(value), \
+    INIT_ID(values), \
+    INIT_ID(version), \
+    INIT_ID(volume), \
+    INIT_ID(warnings), \
+    INIT_ID(warnoptions), \
+    INIT_ID(wbits), \
+    INIT_ID(week), \
+    INIT_ID(weekday), \
+    INIT_ID(which), \
+    INIT_ID(who), \
+    INIT_ID(withdata), \
+    INIT_ID(writable), \
+    INIT_ID(write), \
+    INIT_ID(write_through), \
+    INIT_ID(x), \
+    INIT_ID(year), \
+    INIT_ID(zdict), \
+}
+
+#define _Py_str_ascii_INIT { /* 128 entries: one single-character string per value 0x00..0x7f, in ascending order */ \
+    _PyASCIIObject_INIT("\x00"), \
+    _PyASCIIObject_INIT("\x01"), \
+    _PyASCIIObject_INIT("\x02"), \
+    _PyASCIIObject_INIT("\x03"), \
+    _PyASCIIObject_INIT("\x04"), \
+    _PyASCIIObject_INIT("\x05"), \
+    _PyASCIIObject_INIT("\x06"), \
+    _PyASCIIObject_INIT("\x07"), \
+    _PyASCIIObject_INIT("\x08"), \
+    _PyASCIIObject_INIT("\x09"), \
+    _PyASCIIObject_INIT("\x0a"), \
+    _PyASCIIObject_INIT("\x0b"), \
+    _PyASCIIObject_INIT("\x0c"), \
+    _PyASCIIObject_INIT("\x0d"), \
+    _PyASCIIObject_INIT("\x0e"), \
+    _PyASCIIObject_INIT("\x0f"), \
+    _PyASCIIObject_INIT("\x10"), \
+    _PyASCIIObject_INIT("\x11"), \
+    _PyASCIIObject_INIT("\x12"), \
+    _PyASCIIObject_INIT("\x13"), \
+    _PyASCIIObject_INIT("\x14"), \
+    _PyASCIIObject_INIT("\x15"), \
+    _PyASCIIObject_INIT("\x16"), \
+    _PyASCIIObject_INIT("\x17"), \
+    _PyASCIIObject_INIT("\x18"), \
+    _PyASCIIObject_INIT("\x19"), \
+    _PyASCIIObject_INIT("\x1a"), \
+    _PyASCIIObject_INIT("\x1b"), \
+    _PyASCIIObject_INIT("\x1c"), \
+    _PyASCIIObject_INIT("\x1d"), \
+    _PyASCIIObject_INIT("\x1e"), \
+    _PyASCIIObject_INIT("\x1f"), \
+    _PyASCIIObject_INIT("\x20"), \
+    _PyASCIIObject_INIT("\x21"), \
+    _PyASCIIObject_INIT("\x22"), \
+    _PyASCIIObject_INIT("\x23"), \
+    _PyASCIIObject_INIT("\x24"), \
+    _PyASCIIObject_INIT("\x25"), \
+    _PyASCIIObject_INIT("\x26"), \
+    _PyASCIIObject_INIT("\x27"), \
+    _PyASCIIObject_INIT("\x28"), \
+    _PyASCIIObject_INIT("\x29"), \
+    _PyASCIIObject_INIT("\x2a"), \
+    _PyASCIIObject_INIT("\x2b"), \
+    _PyASCIIObject_INIT("\x2c"), \
+    _PyASCIIObject_INIT("\x2d"), \
+    _PyASCIIObject_INIT("\x2e"), \
+    _PyASCIIObject_INIT("\x2f"), \
+    _PyASCIIObject_INIT("\x30"), \
+    _PyASCIIObject_INIT("\x31"), \
+    _PyASCIIObject_INIT("\x32"), \
+    _PyASCIIObject_INIT("\x33"), \
+    _PyASCIIObject_INIT("\x34"), \
+    _PyASCIIObject_INIT("\x35"), \
+    _PyASCIIObject_INIT("\x36"), \
+    _PyASCIIObject_INIT("\x37"), \
+    _PyASCIIObject_INIT("\x38"), \
+    _PyASCIIObject_INIT("\x39"), \
+    _PyASCIIObject_INIT("\x3a"), \
+    _PyASCIIObject_INIT("\x3b"), \
+    _PyASCIIObject_INIT("\x3c"), \
+    _PyASCIIObject_INIT("\x3d"), \
+    _PyASCIIObject_INIT("\x3e"), \
+    _PyASCIIObject_INIT("\x3f"), \
+    _PyASCIIObject_INIT("\x40"), \
+    _PyASCIIObject_INIT("\x41"), \
+    _PyASCIIObject_INIT("\x42"), \
+    _PyASCIIObject_INIT("\x43"), \
+    _PyASCIIObject_INIT("\x44"), \
+    _PyASCIIObject_INIT("\x45"), \
+    _PyASCIIObject_INIT("\x46"), \
+    _PyASCIIObject_INIT("\x47"), \
+    _PyASCIIObject_INIT("\x48"), \
+    _PyASCIIObject_INIT("\x49"), \
+    _PyASCIIObject_INIT("\x4a"), \
+    _PyASCIIObject_INIT("\x4b"), \
+    _PyASCIIObject_INIT("\x4c"), \
+    _PyASCIIObject_INIT("\x4d"), \
+    _PyASCIIObject_INIT("\x4e"), \
+    _PyASCIIObject_INIT("\x4f"), \
+    _PyASCIIObject_INIT("\x50"), \
+    _PyASCIIObject_INIT("\x51"), \
+    _PyASCIIObject_INIT("\x52"), \
+    _PyASCIIObject_INIT("\x53"), \
+    _PyASCIIObject_INIT("\x54"), \
+    _PyASCIIObject_INIT("\x55"), \
+    _PyASCIIObject_INIT("\x56"), \
+    _PyASCIIObject_INIT("\x57"), \
+    _PyASCIIObject_INIT("\x58"), \
+    _PyASCIIObject_INIT("\x59"), \
+    _PyASCIIObject_INIT("\x5a"), \
+    _PyASCIIObject_INIT("\x5b"), \
+    _PyASCIIObject_INIT("\x5c"), \
+    _PyASCIIObject_INIT("\x5d"), \
+    _PyASCIIObject_INIT("\x5e"), \
+    _PyASCIIObject_INIT("\x5f"), \
+    _PyASCIIObject_INIT("\x60"), \
+    _PyASCIIObject_INIT("\x61"), \
+    _PyASCIIObject_INIT("\x62"), \
+    _PyASCIIObject_INIT("\x63"), \
+    _PyASCIIObject_INIT("\x64"), \
+    _PyASCIIObject_INIT("\x65"), \
+    _PyASCIIObject_INIT("\x66"), \
+    _PyASCIIObject_INIT("\x67"), \
+    _PyASCIIObject_INIT("\x68"), \
+    _PyASCIIObject_INIT("\x69"), \
+    _PyASCIIObject_INIT("\x6a"), \
+    _PyASCIIObject_INIT("\x6b"), \
+    _PyASCIIObject_INIT("\x6c"), \
+    _PyASCIIObject_INIT("\x6d"), \
+    _PyASCIIObject_INIT("\x6e"), \
+    _PyASCIIObject_INIT("\x6f"), \
+    _PyASCIIObject_INIT("\x70"), \
+    _PyASCIIObject_INIT("\x71"), \
+    _PyASCIIObject_INIT("\x72"), \
+    _PyASCIIObject_INIT("\x73"), \
+    _PyASCIIObject_INIT("\x74"), \
+    _PyASCIIObject_INIT("\x75"), \
+    _PyASCIIObject_INIT("\x76"), \
+    _PyASCIIObject_INIT("\x77"), \
+    _PyASCIIObject_INIT("\x78"), \
+    _PyASCIIObject_INIT("\x79"), \
+    _PyASCIIObject_INIT("\x7a"), \
+    _PyASCIIObject_INIT("\x7b"), \
+    _PyASCIIObject_INIT("\x7c"), \
+    _PyASCIIObject_INIT("\x7d"), \
+    _PyASCIIObject_INIT("\x7e"), \
+    _PyASCIIObject_INIT("\x7f"), \
+}
+
+#define _Py_str_latin1_INIT { /* 128 entries for 0x80..0xff: (one-byte string, its two-byte UTF-8 form with lead byte 0xc2 or 0xc3) */ \
+    _PyUnicode_LATIN1_INIT("\x80", "\xc2\x80"), \
+    _PyUnicode_LATIN1_INIT("\x81", "\xc2\x81"), \
+    _PyUnicode_LATIN1_INIT("\x82", "\xc2\x82"), \
+    _PyUnicode_LATIN1_INIT("\x83", "\xc2\x83"), \
+    _PyUnicode_LATIN1_INIT("\x84", "\xc2\x84"), \
+    _PyUnicode_LATIN1_INIT("\x85", "\xc2\x85"), \
+    _PyUnicode_LATIN1_INIT("\x86", "\xc2\x86"), \
+    _PyUnicode_LATIN1_INIT("\x87", "\xc2\x87"), \
+    _PyUnicode_LATIN1_INIT("\x88", "\xc2\x88"), \
+    _PyUnicode_LATIN1_INIT("\x89", "\xc2\x89"), \
+    _PyUnicode_LATIN1_INIT("\x8a", "\xc2\x8a"), \
+    _PyUnicode_LATIN1_INIT("\x8b", "\xc2\x8b"), \
+    _PyUnicode_LATIN1_INIT("\x8c", "\xc2\x8c"), \
+    _PyUnicode_LATIN1_INIT("\x8d", "\xc2\x8d"), \
+    _PyUnicode_LATIN1_INIT("\x8e", "\xc2\x8e"), \
+    _PyUnicode_LATIN1_INIT("\x8f", "\xc2\x8f"), \
+    _PyUnicode_LATIN1_INIT("\x90", "\xc2\x90"), \
+    _PyUnicode_LATIN1_INIT("\x91", "\xc2\x91"), \
+    _PyUnicode_LATIN1_INIT("\x92", "\xc2\x92"), \
+    _PyUnicode_LATIN1_INIT("\x93", "\xc2\x93"), \
+    _PyUnicode_LATIN1_INIT("\x94", "\xc2\x94"), \
+    _PyUnicode_LATIN1_INIT("\x95", "\xc2\x95"), \
+    _PyUnicode_LATIN1_INIT("\x96", "\xc2\x96"), \
+    _PyUnicode_LATIN1_INIT("\x97", "\xc2\x97"), \
+    _PyUnicode_LATIN1_INIT("\x98", "\xc2\x98"), \
+    _PyUnicode_LATIN1_INIT("\x99", "\xc2\x99"), \
+    _PyUnicode_LATIN1_INIT("\x9a", "\xc2\x9a"), \
+    _PyUnicode_LATIN1_INIT("\x9b", "\xc2\x9b"), \
+    _PyUnicode_LATIN1_INIT("\x9c", "\xc2\x9c"), \
+    _PyUnicode_LATIN1_INIT("\x9d", "\xc2\x9d"), \
+    _PyUnicode_LATIN1_INIT("\x9e", "\xc2\x9e"), \
+    _PyUnicode_LATIN1_INIT("\x9f", "\xc2\x9f"), \
+    _PyUnicode_LATIN1_INIT("\xa0", "\xc2\xa0"), \
+    _PyUnicode_LATIN1_INIT("\xa1", "\xc2\xa1"), \
+    _PyUnicode_LATIN1_INIT("\xa2", "\xc2\xa2"), \
+    _PyUnicode_LATIN1_INIT("\xa3", "\xc2\xa3"), \
+    _PyUnicode_LATIN1_INIT("\xa4", "\xc2\xa4"), \
+    _PyUnicode_LATIN1_INIT("\xa5", "\xc2\xa5"), \
+    _PyUnicode_LATIN1_INIT("\xa6", "\xc2\xa6"), \
+    _PyUnicode_LATIN1_INIT("\xa7", "\xc2\xa7"), \
+    _PyUnicode_LATIN1_INIT("\xa8", "\xc2\xa8"), \
+    _PyUnicode_LATIN1_INIT("\xa9", "\xc2\xa9"), \
+    _PyUnicode_LATIN1_INIT("\xaa", "\xc2\xaa"), \
+    _PyUnicode_LATIN1_INIT("\xab", "\xc2\xab"), \
+    _PyUnicode_LATIN1_INIT("\xac", "\xc2\xac"), \
+    _PyUnicode_LATIN1_INIT("\xad", "\xc2\xad"), \
+    _PyUnicode_LATIN1_INIT("\xae", "\xc2\xae"), \
+    _PyUnicode_LATIN1_INIT("\xaf", "\xc2\xaf"), \
+    _PyUnicode_LATIN1_INIT("\xb0", "\xc2\xb0"), \
+    _PyUnicode_LATIN1_INIT("\xb1", "\xc2\xb1"), \
+    _PyUnicode_LATIN1_INIT("\xb2", "\xc2\xb2"), \
+    _PyUnicode_LATIN1_INIT("\xb3", "\xc2\xb3"), \
+    _PyUnicode_LATIN1_INIT("\xb4", "\xc2\xb4"), \
+    _PyUnicode_LATIN1_INIT("\xb5", "\xc2\xb5"), \
+    _PyUnicode_LATIN1_INIT("\xb6", "\xc2\xb6"), \
+    _PyUnicode_LATIN1_INIT("\xb7", "\xc2\xb7"), \
+    _PyUnicode_LATIN1_INIT("\xb8", "\xc2\xb8"), \
+    _PyUnicode_LATIN1_INIT("\xb9", "\xc2\xb9"), \
+    _PyUnicode_LATIN1_INIT("\xba", "\xc2\xba"), \
+    _PyUnicode_LATIN1_INIT("\xbb", "\xc2\xbb"), \
+    _PyUnicode_LATIN1_INIT("\xbc", "\xc2\xbc"), \
+    _PyUnicode_LATIN1_INIT("\xbd", "\xc2\xbd"), \
+    _PyUnicode_LATIN1_INIT("\xbe", "\xc2\xbe"), \
+    _PyUnicode_LATIN1_INIT("\xbf", "\xc2\xbf"), \
+    _PyUnicode_LATIN1_INIT("\xc0", "\xc3\x80"), \
+    _PyUnicode_LATIN1_INIT("\xc1", "\xc3\x81"), \
+    _PyUnicode_LATIN1_INIT("\xc2", "\xc3\x82"), \
+    _PyUnicode_LATIN1_INIT("\xc3", "\xc3\x83"), \
+    _PyUnicode_LATIN1_INIT("\xc4", "\xc3\x84"), \
+    _PyUnicode_LATIN1_INIT("\xc5", "\xc3\x85"), \
+    _PyUnicode_LATIN1_INIT("\xc6", "\xc3\x86"), \
+    _PyUnicode_LATIN1_INIT("\xc7", "\xc3\x87"), \
+    _PyUnicode_LATIN1_INIT("\xc8", "\xc3\x88"), \
+    _PyUnicode_LATIN1_INIT("\xc9", "\xc3\x89"), \
+    _PyUnicode_LATIN1_INIT("\xca", "\xc3\x8a"), \
+    _PyUnicode_LATIN1_INIT("\xcb", "\xc3\x8b"), \
+    _PyUnicode_LATIN1_INIT("\xcc", "\xc3\x8c"), \
+    _PyUnicode_LATIN1_INIT("\xcd", "\xc3\x8d"), \
+    _PyUnicode_LATIN1_INIT("\xce", "\xc3\x8e"), \
+    _PyUnicode_LATIN1_INIT("\xcf", "\xc3\x8f"), \
+    _PyUnicode_LATIN1_INIT("\xd0", "\xc3\x90"), \
+    _PyUnicode_LATIN1_INIT("\xd1", "\xc3\x91"), \
+    _PyUnicode_LATIN1_INIT("\xd2", "\xc3\x92"), \
+    _PyUnicode_LATIN1_INIT("\xd3", "\xc3\x93"), \
+    _PyUnicode_LATIN1_INIT("\xd4", "\xc3\x94"), \
+    _PyUnicode_LATIN1_INIT("\xd5", "\xc3\x95"), \
+    _PyUnicode_LATIN1_INIT("\xd6", "\xc3\x96"), \
+    _PyUnicode_LATIN1_INIT("\xd7", "\xc3\x97"), \
+    _PyUnicode_LATIN1_INIT("\xd8", "\xc3\x98"), \
+    _PyUnicode_LATIN1_INIT("\xd9", "\xc3\x99"), \
+    _PyUnicode_LATIN1_INIT("\xda", "\xc3\x9a"), \
+    _PyUnicode_LATIN1_INIT("\xdb", "\xc3\x9b"), \
+    _PyUnicode_LATIN1_INIT("\xdc", "\xc3\x9c"), \
+    _PyUnicode_LATIN1_INIT("\xdd", "\xc3\x9d"), \
+    _PyUnicode_LATIN1_INIT("\xde", "\xc3\x9e"), \
+    _PyUnicode_LATIN1_INIT("\xdf", "\xc3\x9f"), \
+    _PyUnicode_LATIN1_INIT("\xe0", "\xc3\xa0"), \
+    _PyUnicode_LATIN1_INIT("\xe1", "\xc3\xa1"), \
+    _PyUnicode_LATIN1_INIT("\xe2", "\xc3\xa2"), \
+    _PyUnicode_LATIN1_INIT("\xe3", "\xc3\xa3"), \
+    _PyUnicode_LATIN1_INIT("\xe4", "\xc3\xa4"), \
+    _PyUnicode_LATIN1_INIT("\xe5", "\xc3\xa5"), \
+    _PyUnicode_LATIN1_INIT("\xe6", "\xc3\xa6"), \
+    _PyUnicode_LATIN1_INIT("\xe7", "\xc3\xa7"), \
+    _PyUnicode_LATIN1_INIT("\xe8", "\xc3\xa8"), \
+    _PyUnicode_LATIN1_INIT("\xe9", "\xc3\xa9"), \
+    _PyUnicode_LATIN1_INIT("\xea", "\xc3\xaa"), \
+    _PyUnicode_LATIN1_INIT("\xeb", "\xc3\xab"), \
+    _PyUnicode_LATIN1_INIT("\xec", "\xc3\xac"), \
+    _PyUnicode_LATIN1_INIT("\xed", "\xc3\xad"), \
+    _PyUnicode_LATIN1_INIT("\xee", "\xc3\xae"), \
+    _PyUnicode_LATIN1_INIT("\xef", "\xc3\xaf"), \
+    _PyUnicode_LATIN1_INIT("\xf0", "\xc3\xb0"), \
+    _PyUnicode_LATIN1_INIT("\xf1", "\xc3\xb1"), \
+    _PyUnicode_LATIN1_INIT("\xf2", "\xc3\xb2"), \
+    _PyUnicode_LATIN1_INIT("\xf3", "\xc3\xb3"), \
+    _PyUnicode_LATIN1_INIT("\xf4", "\xc3\xb4"), \
+    _PyUnicode_LATIN1_INIT("\xf5", "\xc3\xb5"), \
+    _PyUnicode_LATIN1_INIT("\xf6", "\xc3\xb6"), \
+    _PyUnicode_LATIN1_INIT("\xf7", "\xc3\xb7"), \
+    _PyUnicode_LATIN1_INIT("\xf8", "\xc3\xb8"), \
+    _PyUnicode_LATIN1_INIT("\xf9", "\xc3\xb9"), \
+    _PyUnicode_LATIN1_INIT("\xfa", "\xc3\xba"), \
+    _PyUnicode_LATIN1_INIT("\xfb", "\xc3\xbb"), \
+    _PyUnicode_LATIN1_INIT("\xfc", "\xc3\xbc"), \
+    _PyUnicode_LATIN1_INIT("\xfd", "\xc3\xbd"), \
+    _PyUnicode_LATIN1_INIT("\xfe", "\xc3\xbe"), \
+    _PyUnicode_LATIN1_INIT("\xff", "\xc3\xbf"), \
+}
+/* End auto-generated code */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_RUNTIME_INIT_GENERATED_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_signal.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_signal.h
new file mode 100644
index 000000000000..ca3f69d09fc0
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_signal.h
@@ -0,0 +1,98 @@
+// Define Py_NSIG constant for signal handling.
+
+#ifndef Py_INTERNAL_SIGNAL_H
+#define Py_INTERNAL_SIGNAL_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_atomic.h"         // _Py_atomic_address
+
+#include <signal.h>                // NSIG
+
+// Py_NSIG sizes the handlers[] array declared below; the first macro found wins.
+#ifdef _SIG_MAXSIG
+   // gh-91145: On FreeBSD, <signal.h> defines NSIG as 32: it doesn't include
+   // realtime signals: [SIGRTMIN,SIGRTMAX]. Use _SIG_MAXSIG instead. For
+   // example on x86-64 FreeBSD 13, SIGRTMAX is 126 and _SIG_MAXSIG is 128.
+#  define Py_NSIG _SIG_MAXSIG
+#elif defined(NSIG)
+#  define Py_NSIG NSIG             // expected from <signal.h>, included above
+#elif defined(_NSIG)
+#  define Py_NSIG _NSIG            // BSD/SysV
+#elif defined(_SIGMAX)
+#  define Py_NSIG (_SIGMAX + 1)    // QNX
+#elif defined(SIGMAX)
+#  define Py_NSIG (SIGMAX + 1)     // djgpp
+#else
+#  define Py_NSIG 64               // Use a reasonable default value
+#endif
+
+#define INVALID_FD (-1)
+/* Signal-handling state: Py_NSIG handler slots, wakeup fd settings, and flags. */
+struct _signals_runtime_state {
+    volatile struct {
+        _Py_atomic_int tripped;
+        /* func is atomic to ensure that PyErr_SetInterrupt is async-signal-safe
+         * (even though it would probably be otherwise, anyway).
+         */
+        _Py_atomic_address func;
+    } handlers[Py_NSIG];  /* presumably indexed by signal number -- confirm */
+
+    volatile struct {
+#ifdef MS_WINDOWS
+        /* This would be "SOCKET fd" if <winsock2.h> were always included.
+           It isn't so we must cast to SOCKET where appropriate. */
+        volatile int fd;
+#elif defined(__VXWORKS__)
+        int fd;
+#else
+        sig_atomic_t fd;
+#endif
+
+        int warn_on_full_buffer;  /* _signals_WAKEUP_INIT sets this to 1 */
+#ifdef MS_WINDOWS
+        int use_send;             /* _signals_WAKEUP_INIT sets this to 0 */
+#endif
+    } wakeup;  /* wakeup.fd starts out as INVALID_FD (see _signals_WAKEUP_INIT) */
+
+    /* Speed up sigcheck() when none tripped */
+    _Py_atomic_int is_tripped;
+
+    /* These objects necessarily belong to the main interpreter. */
+    PyObject *default_handler;
+    PyObject *ignore_handler;
+
+#ifdef MS_WINDOWS
+    /* This would be "HANDLE sigint_event" if <windows.h> were always included.
+       It isn't so we must cast to HANDLE everywhere "sigint_event" is used. */
+    void *sigint_event;
+#endif
+
+    /* True if the main interpreter thread exited due to an unhandled
+     * KeyboardInterrupt exception, suggesting the user pressed ^C. */
+    int unhandled_keyboard_interrupt;
+};
+/* Wakeup defaults: fd is INVALID_FD and warn_on_full_buffer is enabled. */
+#ifdef MS_WINDOWS
+# define _signals_WAKEUP_INIT \
+    {.fd = INVALID_FD, .warn_on_full_buffer = 1, .use_send = 0}
+#else
+# define _signals_WAKEUP_INIT \
+    {.fd = INVALID_FD, .warn_on_full_buffer = 1}
+#endif
+/* Initializer that sets only the .wakeup member explicitly. */
+#define _signals_RUNTIME_INIT \
+    { \
+        .wakeup = _signals_WAKEUP_INIT, \
+    }
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // !Py_INTERNAL_SIGNAL_H
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_sliceobject.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_sliceobject.h
new file mode 100644
index 000000000000..98665c3859d5
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_sliceobject.h
@@ -0,0 +1,22 @@
+#ifndef Py_INTERNAL_SLICEOBJECT_H
+#define Py_INTERNAL_SLICEOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+/* runtime lifecycle */
+
+extern void _PySlice_Fini(PyInterpreterState *);
+
+extern PyObject *
+_PyBuildSlice_ConsumeRefs(PyObject *start, PyObject *stop);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_SLICEOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_strhex.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_strhex.h
new file mode 100644
index 000000000000..f427b4d695bd
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_strhex.h
@@ -0,0 +1,36 @@
+#ifndef Py_INTERNAL_STRHEX_H
+#define Py_INTERNAL_STRHEX_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+// Build a str() holding the hex representation of argbuf.
+PyAPI_FUNC(PyObject*) _Py_strhex(
+    const char* argbuf,
+    const Py_ssize_t arglen);
+
+// Returns a bytes() containing the ASCII hex representation of argbuf.
+PyAPI_FUNC(PyObject*) _Py_strhex_bytes(
+    const char* argbuf,
+    const Py_ssize_t arglen);
+
+// These variants include support for a separator between every N bytes:
+PyAPI_FUNC(PyObject*) _Py_strhex_with_sep(
+    const char* argbuf,
+    const Py_ssize_t arglen,
+    PyObject* sep,
+    const int bytes_per_group);
+PyAPI_FUNC(PyObject*) _Py_strhex_bytes_with_sep(
+    const char* argbuf,
+    const Py_ssize_t arglen,
+    PyObject* sep,
+    const int bytes_per_group);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_STRHEX_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_structseq.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_structseq.h
new file mode 100644
index 000000000000..6f5dfc12707c
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_structseq.h
@@ -0,0 +1,39 @@
+#ifndef Py_INTERNAL_STRUCTSEQ_H
+#define Py_INTERNAL_STRUCTSEQ_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+/* other API */
+
+PyAPI_FUNC(PyTypeObject *) _PyStructSequence_NewType(
+    PyStructSequence_Desc *desc,
+    unsigned long tp_flags);
+
+extern int _PyStructSequence_InitBuiltinWithFlags(
+    PyInterpreterState *interp,
+    PyTypeObject *type,
+    PyStructSequence_Desc *desc,
+    unsigned long tp_flags);
+
+/* Shorthand for _PyStructSequence_InitBuiltinWithFlags() with tp_flags of 0. */
+static inline int
+_PyStructSequence_InitBuiltin(PyInterpreterState *interp,
+                              PyTypeObject *type, PyStructSequence_Desc *desc)
+{
+    return _PyStructSequence_InitBuiltinWithFlags(interp, type, desc, 0UL);
+}
+
+extern void _PyStructSequence_FiniBuiltin(
+    PyInterpreterState *interp,
+    PyTypeObject *type);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_STRUCTSEQ_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_symtable.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_symtable.h
new file mode 100644
index 000000000000..b2fef177204e
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_symtable.h
@@ -0,0 +1,158 @@
+#ifndef Py_INTERNAL_SYMTABLE_H
+#define Py_INTERNAL_SYMTABLE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+struct _mod;   // Type defined in pycore_ast.h
+
+typedef enum _block_type {
+    FunctionBlock = 0, ClassBlock = 1, ModuleBlock = 2,
+    // Annotation scope, used when 'from __future__ import annotations' is
+    // active; such blocks cannot bind names and are never evaluated.
+    AnnotationBlock = 3,
+    // Scopes for generics and type aliases (see PEP 695). They behave mostly
+    // like functions, and all three behave identically to one another; they
+    // are distinct enumerators only so that error messages can be more
+    // precise.
+    TypeVarBoundBlock = 4, TypeAliasBlock = 5, TypeParamBlock = 6
+} _Py_block_ty;
+
+// Kind of comprehension that a symbol table entry represents, if any.
+typedef enum _comprehension_type {
+    NoComprehension = 0,
+    ListComprehension = 1, DictComprehension = 2, SetComprehension = 3,
+    GeneratorExpression = 4
+} _Py_comprehension_ty;
+
+struct _symtable_entry;
+/* Symbol table state: current and module-level entries plus the block map. */
+struct symtable {
+    PyObject *st_filename;          /* name of file being compiled,
+                                       decoded from the filesystem encoding */
+    struct _symtable_entry *st_cur; /* current symbol table entry */
+    struct _symtable_entry *st_top; /* symbol table entry for module */
+    PyObject *st_blocks;            /* dict: map AST node addresses
+                                     *       to symbol table entries */
+    PyObject *st_stack;             /* list: stack of namespace info */
+    PyObject *st_global;            /* borrowed ref to st_top->ste_symbols */
+    int st_nblocks;                 /* number of blocks used. kept for
+                                       consistency with the corresponding
+                                       compiler structure */
+    PyObject *st_private;           /* name of current class or NULL */
+    PyFutureFeatures *st_future;    /* module's future features that affect
+                                       the symbol table */
+    int recursion_depth;            /* current recursion depth */
+    int recursion_limit;            /* recursion limit */
+};
+/* Symbol table entry for one block; a Python object (see PySTEntry_Type). */
+typedef struct _symtable_entry {
+    PyObject_HEAD
+    PyObject *ste_id;        /* int: key in ste_table->st_blocks */
+    PyObject *ste_symbols;   /* dict: variable names to flags */
+    PyObject *ste_name;      /* string: name of current block */
+    PyObject *ste_varnames;  /* list of function parameters */
+    PyObject *ste_children;  /* list of child blocks */
+    PyObject *ste_directives;/* locations of global and nonlocal statements */
+    _Py_block_ty ste_type;   /* kind of block, one of the _Py_block_ty values */
+    int ste_nested;      /* true if block is nested */
+    unsigned ste_free : 1;        /* true if block has free variables */
+    unsigned ste_child_free : 1;  /* true if a child block has free vars,
+                                     including free refs to globals */
+    unsigned ste_generator : 1;   /* true if namespace is a generator */
+    unsigned ste_coroutine : 1;   /* true if namespace is a coroutine */
+    _Py_comprehension_ty ste_comprehension;  /* Kind of comprehension (if any) */
+    unsigned ste_varargs : 1;     /* true if block has varargs */
+    unsigned ste_varkeywords : 1; /* true if block has varkeywords */
+    unsigned ste_returns_value : 1;  /* true if namespace uses return with
+                                        an argument */
+    unsigned ste_needs_class_closure : 1; /* for class scopes, true if a
+                                             closure over __class__
+                                             should be created */
+    unsigned ste_needs_classdict : 1; /* for class scopes, true if a closure
+                                         over the class dict should be created */
+    unsigned ste_comp_inlined : 1; /* true if this comprehension is inlined */
+    unsigned ste_comp_iter_target : 1; /* true if visiting comprehension target */
+    unsigned ste_can_see_class_scope : 1; /* true if this block can see names bound in an
+                                             enclosing class scope */
+    int ste_comp_iter_expr; /* non-zero if visiting a comprehension range expression */
+    int ste_lineno;          /* first line of block */
+    int ste_col_offset;      /* offset of first line of block */
+    int ste_end_lineno;      /* end line of block */
+    int ste_end_col_offset;  /* end offset of first line of block */
+    int ste_opt_lineno;      /* lineno of last exec or import * */
+    int ste_opt_col_offset;  /* offset of last exec or import * */
+    struct symtable *ste_table; /* table whose st_blocks holds this entry (see ste_id) */
+} PySTEntryObject;
+
+extern PyTypeObject PySTEntry_Type;
+
+#define PySTEntry_Check(op) Py_IS_TYPE((op), &PySTEntry_Type)
+
+extern long _PyST_GetSymbol(PySTEntryObject *, PyObject *);
+extern int _PyST_GetScope(PySTEntryObject *, PyObject *);
+extern int _PyST_IsFunctionLike(PySTEntryObject *);
+
+extern struct symtable* _PySymtable_Build(
+    struct _mod *mod,
+    PyObject *filename,
+    PyFutureFeatures *future);
+PyAPI_FUNC(PySTEntryObject *) PySymtable_Lookup(struct symtable *, void *);
+
+extern void _PySymtable_Free(struct symtable *);
+
+extern PyObject* _Py_Mangle(PyObject *p, PyObject *name);
+
+/* Flags for def-use information */
+
+#define DEF_GLOBAL 1             /* global stmt */
+#define DEF_LOCAL 2              /* assignment in code block */
+#define DEF_PARAM (2<<1)         /* formal parameter */
+#define DEF_NONLOCAL (2<<2)      /* nonlocal stmt */
+#define USE (2<<3)               /* name is used */
+#define DEF_FREE (2<<4)          /* name used but not defined in nested block */
+#define DEF_FREE_CLASS (2<<5)    /* free variable from class's method */
+#define DEF_IMPORT (2<<6)        /* assignment occurred via import */
+#define DEF_ANNOT (2<<7)         /* this name is annotated */
+#define DEF_COMP_ITER (2<<8)     /* this name is a comprehension iteration variable */
+#define DEF_TYPE_PARAM (2<<9)    /* this name is a type parameter */
+#define DEF_COMP_CELL (2<<10)    /* this name is a cell in an inlined comprehension */
+
+#define DEF_BOUND (DEF_LOCAL | DEF_PARAM | DEF_IMPORT)
+
+/* GLOBAL_EXPLICIT and GLOBAL_IMPLICIT are used internally by the symbol
+   table.  GLOBAL is returned from PyST_GetScope() for either of them.
+   It is stored in ste_symbols at bits 13-16.
+*/
+#define SCOPE_OFFSET 12
+#define SCOPE_MASK (DEF_GLOBAL | DEF_LOCAL | DEF_PARAM | DEF_NONLOCAL)
+
+#define LOCAL 1
+#define GLOBAL_EXPLICIT 2
+#define GLOBAL_IMPLICIT 3
+#define FREE 4
+#define CELL 5
+
+#define GENERATOR 1
+#define GENERATOR_EXPRESSION 2
+
+// Used by symtablemodule.c
+extern struct symtable* _Py_SymtableStringObjectFlags(
+    const char *str,
+    PyObject *filename,
+    int start,
+    PyCompilerFlags *flags);
+
+int _PyFuture_FromAST(
+    struct _mod * mod,
+    PyObject *filename,
+    PyFutureFeatures* futures);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_SYMTABLE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_sysmodule.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_sysmodule.h
new file mode 100644
index 000000000000..b4b1febafa44
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_sysmodule.h
@@ -0,0 +1,29 @@
+#ifndef Py_INTERNAL_SYSMODULE_H
+#define Py_INTERNAL_SYSMODULE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+PyAPI_FUNC(int) _PySys_Audit(
+    PyThreadState *tstate,
+    const char *event,
+    const char *argFormat,
+    ...);
+
+/* We want minimal exposure of this function, so use extern rather than
+   PyAPI_FUNC() to not export the symbol. */
+extern void _PySys_ClearAuditHooks(PyThreadState *tstate);
+
+PyAPI_FUNC(int) _PySys_SetAttr(PyObject *, PyObject *);
+
+extern int _PySys_ClearAttrString(PyInterpreterState *interp,
+                                  const char *name, int verbose);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_SYSMODULE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_time.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_time.h
new file mode 100644
index 000000000000..949170c44937
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_time.h
@@ -0,0 +1,25 @@
+#ifndef Py_INTERNAL_TIME_H
+#define Py_INTERNAL_TIME_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+/* Ticks-per-second state; without HAVE_TIMES only a placeholder member exists. */
+struct _time_runtime_state {
+#ifdef HAVE_TIMES
+    int ticks_per_second_initialized;
+    long ticks_per_second;
+#else
+    int _not_used;  /* presumably only keeps the struct non-empty -- confirm */
+#endif
+};
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_TIME_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_token.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_token.h
new file mode 100644
index 000000000000..c02e637fee1e
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_token.h
@@ -0,0 +1,108 @@
+/* Auto-generated by Tools/build/generate_token.py */
+
+/* Token types */
+#ifndef Py_INTERNAL_TOKEN_H
+#define Py_INTERNAL_TOKEN_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#undef TILDE   /* Prevent clash of our definition with system macro. Ex AIX, ioctl.h */
+
+#define ENDMARKER       0
+#define NAME            1
+#define NUMBER          2
+#define STRING          3
+#define NEWLINE         4
+#define INDENT          5
+#define DEDENT          6
+#define LPAR            7
+#define RPAR            8
+#define LSQB            9
+#define RSQB            10
+#define COLON           11
+#define COMMA           12
+#define SEMI            13
+#define PLUS            14
+#define MINUS           15
+#define STAR            16
+#define SLASH           17
+#define VBAR            18
+#define AMPER           19
+#define LESS            20
+#define GREATER         21
+#define EQUAL           22
+#define DOT             23
+#define PERCENT         24
+#define LBRACE          25
+#define RBRACE          26
+#define EQEQUAL         27
+#define NOTEQUAL        28
+#define LESSEQUAL       29
+#define GREATEREQUAL    30
+#define TILDE           31
+#define CIRCUMFLEX      32
+#define LEFTSHIFT       33
+#define RIGHTSHIFT      34
+#define DOUBLESTAR      35
+#define PLUSEQUAL       36
+#define MINEQUAL        37
+#define STAREQUAL       38
+#define SLASHEQUAL      39
+#define PERCENTEQUAL    40
+#define AMPEREQUAL      41
+#define VBAREQUAL       42
+#define CIRCUMFLEXEQUAL 43
+#define LEFTSHIFTEQUAL  44
+#define RIGHTSHIFTEQUAL 45
+#define DOUBLESTAREQUAL 46
+#define DOUBLESLASH     47
+#define DOUBLESLASHEQUAL 48
+#define AT              49
+#define ATEQUAL         50
+#define RARROW          51
+#define ELLIPSIS        52
+#define COLONEQUAL      53
+#define EXCLAMATION     54
+#define OP              55
+#define AWAIT           56
+#define ASYNC           57
+#define TYPE_IGNORE     58
+#define TYPE_COMMENT    59
+#define SOFT_KEYWORD    60
+#define FSTRING_START   61
+#define FSTRING_MIDDLE  62
+#define FSTRING_END     63
+#define COMMENT         64
+#define NL              65
+#define ERRORTOKEN      66
+#define N_TOKENS        68   /* NOTE(review): no id 67 is defined in this header -- presumably reserved elsewhere; confirm */
+#define NT_OFFSET       256
+
+/* Special definitions for cooperation with parser */
+/* ISWHITESPACE and ISSTRINGLIT expand x several times: avoid side effects in x. */
+#define ISTERMINAL(x)           ((x) < NT_OFFSET)
+#define ISNONTERMINAL(x)        ((x) >= NT_OFFSET)
+#define ISEOF(x)                ((x) == ENDMARKER)
+#define ISWHITESPACE(x)         ((x) == ENDMARKER || \
+                                 (x) == NEWLINE   || \
+                                 (x) == INDENT    || \
+                                 (x) == DEDENT)
+#define ISSTRINGLIT(x)          ((x) == STRING           || \
+                                 (x) == FSTRING_MIDDLE)
+
+
+// Symbols exported for test_peg_generator
+PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
+PyAPI_FUNC(int) _PyToken_OneChar(int);
+PyAPI_FUNC(int) _PyToken_TwoChars(int, int);
+PyAPI_FUNC(int) _PyToken_ThreeChars(int, int, int);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // !Py_INTERNAL_TOKEN_H
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_traceback.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_traceback.h
new file mode 100644
index 000000000000..c393b2c136f2
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_traceback.h
@@ -0,0 +1,101 @@
+#ifndef Py_INTERNAL_TRACEBACK_H
+#define Py_INTERNAL_TRACEBACK_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+/* Write the Python traceback into the file 'fd'. For example:
+
+       Traceback (most recent call first):
+         File "xxx", line xxx in <xxx>
+         File "xxx", line xxx in <xxx>
+         ...
+         File "xxx", line xxx in <xxx>
+
+   This function is written for debug purpose only, to dump the traceback in
+   the worst case: after a segmentation fault, at fatal error, etc. That's why,
+   it is very limited. Strings are truncated to 100 characters and encoded to
+   ASCII with backslashreplace. It doesn't write the source code, only the
+   function name, filename and line number of each frame. Write only the first
+   100 frames: if the traceback is truncated, write the line " ...".
+
+   This function is signal safe. */
+
+PyAPI_FUNC(void) _Py_DumpTraceback(
+    int fd,
+    PyThreadState *tstate);
+
+/* Write the traceback of all threads into the file 'fd'. current_tstate can be
+   NULL.
+
+   Return NULL on success, or an error message on error.
+
+   This function is written for debug purpose only. It calls
+   _Py_DumpTraceback() for each thread, and so has the same limitations. It
+   only writes the traceback of the first 100 threads: it writes "..." if there are
+   more threads.
+
+   If current_tstate is NULL, the function tries to get the Python thread state
+   of the current thread. It is not an error if the function is unable to get
+   the current Python thread state.
+
+   If interp is NULL, the function tries to get the interpreter state from
+   the current Python thread state, or from
+   _PyGILState_GetInterpreterStateUnsafe() in last resort.
+
+   It is better to pass NULL to interp and current_tstate, the function tries
+   different options to retrieve this information.
+
+   This function is signal safe. */
+
+PyAPI_FUNC(const char*) _Py_DumpTracebackThreads(
+    int fd,
+    PyInterpreterState *interp,
+    PyThreadState *current_tstate);
+
+/* Write a Unicode object into the file descriptor fd. Encode the string to
+   ASCII using the backslashreplace error handler.
+
+   Do nothing if text is not a Unicode object. The function accepts Unicode
+   string which is not ready (PyUnicode_WCHAR_KIND).
+
+   This function is signal safe. */
+PyAPI_FUNC(void) _Py_DumpASCII(int fd, PyObject *text);
+
+/* Format an integer as decimal into the file descriptor fd.
+
+   This function is signal safe. */
+PyAPI_FUNC(void) _Py_DumpDecimal(
+    int fd,
+    size_t value);
+
+/* Format an integer as hexadecimal with width digits into fd file descriptor.
+   The function is signal safe. */
+PyAPI_FUNC(void) _Py_DumpHexadecimal(
+    int fd,
+    uintptr_t value,
+    Py_ssize_t width);
+
+PyAPI_FUNC(PyObject*) _PyTraceBack_FromFrame(
+    PyObject *tb_next,
+    PyFrameObject *frame);
+
+#define EXCEPTION_TB_HEADER "Traceback (most recent call last):\n"
+#define EXCEPTION_GROUP_TB_HEADER "Exception Group Traceback (most recent call last):\n"
+
+/* Write the traceback tb to file f. Prefix each line with
+   indent spaces followed by the margin (if it is not NULL). */
+PyAPI_FUNC(int) _PyTraceBack_Print_Indented(
+    PyObject *tb, int indent, const char* margin,
+    const char *header_margin, const char *header, PyObject *f);
+PyAPI_FUNC(int) _Py_WriteIndentedMargin(int, const char*, PyObject *);
+PyAPI_FUNC(int) _Py_WriteIndent(int, PyObject *);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_TRACEBACK_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_tracemalloc.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_tracemalloc.h
new file mode 100644
index 000000000000..d086adc61c31
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_tracemalloc.h
@@ -0,0 +1,123 @@
+#ifndef Py_INTERNAL_TRACEMALLOC_H
+#define Py_INTERNAL_TRACEMALLOC_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_hashtable.h"     // _Py_hashtable_t
+
+
+/* Trace memory blocks allocated by PyMem_RawMalloc() */
+#define TRACE_RAW_MALLOC
+
+/* tracemalloc configuration; _tracemalloc_runtime_state_INIT gives the defaults. */
+struct _PyTraceMalloc_Config {
+    /* Module initialized?
+       Variable protected by the GIL */
+    enum {
+        TRACEMALLOC_NOT_INITIALIZED,
+        TRACEMALLOC_INITIALIZED,
+        TRACEMALLOC_FINALIZED
+    } initialized;
+
+    /* Is tracemalloc tracing memory allocations?
+       Variable protected by the GIL */
+    int tracing;
+
+    /* limit of the number of frames in a traceback, 1 by default.
+       Variable protected by the GIL. */
+    int max_nframe;
+};
+
+
+/* Pack the frame_t structure to reduce the memory footprint on 64-bit
+   architectures: 12 bytes instead of 16. */
+#if defined(_MSC_VER)
+#pragma pack(push, 4)
+#endif
+
+struct
+#ifdef __GNUC__
+__attribute__((packed))
+#endif
+tracemalloc_frame {
+    /* filename cannot be NULL: "<unknown>" is used if the Python frame
+       filename is NULL */
+    PyObject *filename;
+    unsigned int lineno;
+};
+#ifdef _MSC_VER
+#pragma pack(pop)
+#endif
+/* One traceback: a hash, two frame counters, and the frame records. */
+struct tracemalloc_traceback {
+    Py_uhash_t hash;
+    /* Number of frames stored */
+    uint16_t nframe;
+    /* Total number of frames the traceback had */
+    uint16_t total_nframe;
+    struct tracemalloc_frame frames[1];  /* presumably over-allocated to hold nframe entries -- confirm */
+};
+
+/* tracemalloc state: configuration, allocators, memory counters, and tables. */
+struct _tracemalloc_runtime_state {
+    struct _PyTraceMalloc_Config config;
+
+    /* Protected by the GIL */
+    struct {
+        PyMemAllocatorEx mem;
+        PyMemAllocatorEx raw;
+        PyMemAllocatorEx obj;
+    } allocators;
+
+#if defined(TRACE_RAW_MALLOC)
+    PyThread_type_lock tables_lock;  /* presumably the lock behind TABLES_LOCK() -- confirm */
+#endif
+    /* Size in bytes of currently traced memory.
+       Protected by TABLES_LOCK(). */
+    size_t traced_memory;
+    /* Peak size in bytes of traced memory.
+       Protected by TABLES_LOCK(). */
+    size_t peak_traced_memory;
+    /* Hash table used as a set to intern filenames:
+       PyObject* => PyObject*.
+       Protected by the GIL */
+    _Py_hashtable_t *filenames;
+    /* Buffer to store a new traceback in traceback_new().
+       Protected by the GIL. */
+    struct tracemalloc_traceback *traceback;
+    /* Hash table used as a set to intern tracebacks:
+       traceback_t* => traceback_t*
+       Protected by the GIL */
+    _Py_hashtable_t *tracebacks;
+    /* pointer (void*) => trace (trace_t*).
+       Protected by TABLES_LOCK(). */
+    _Py_hashtable_t *traces;
+    /* domain (unsigned int) => traces (_Py_hashtable_t).
+       Protected by TABLES_LOCK(). */
+    _Py_hashtable_t *domains;
+
+    struct tracemalloc_traceback empty_traceback;
+
+    Py_tss_t reentrant_key;  /* _tracemalloc_runtime_state_INIT sets Py_tss_NEEDS_INIT */
+};
+/* Defaults: module not initialized, tracing off, one frame per traceback. */
+#define _tracemalloc_runtime_state_INIT \
+    { \
+        .config = { \
+            .initialized = TRACEMALLOC_NOT_INITIALIZED, \
+            .tracing = 0, \
+            .max_nframe = 1, \
+        }, \
+        .reentrant_key = Py_tss_NEEDS_INIT, \
+    }
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // !Py_INTERNAL_TRACEMALLOC_H
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_tuple.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_tuple.h
new file mode 100644
index 000000000000..335edad89792
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_tuple.h
@@ -0,0 +1,79 @@
+#ifndef Py_INTERNAL_TUPLE_H
+#define Py_INTERNAL_TUPLE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "tupleobject.h"   /* _PyTuple_CAST() */
+
+
+/* runtime lifecycle */
+
+extern PyStatus _PyTuple_InitGlobalObjects(PyInterpreterState *);
+extern void _PyTuple_Fini(PyInterpreterState *);
+
+
+/* other API */
+
+// PyTuple_MAXSAVESIZE - largest tuple to save on free list
+// PyTuple_MAXFREELIST - maximum number of tuples of each size to save
+// PyTuple_NFREELISTS  - number of tuple freelists; 0 means none are kept
+#if defined(PyTuple_MAXSAVESIZE) && PyTuple_MAXSAVESIZE <= 0
+   // A build indicated that tuple freelists should not be used.
+#  define PyTuple_NFREELISTS 0
+#  undef PyTuple_MAXSAVESIZE
+#  undef PyTuple_MAXFREELIST
+   // Without WITH_FREELISTS, tuple freelists are likewise turned off.
+#elif !defined(WITH_FREELISTS)
+#  define PyTuple_NFREELISTS 0
+#  undef PyTuple_MAXSAVESIZE
+#  undef PyTuple_MAXFREELIST
+
+#else
+   // We are using a freelist for tuples.
+#  ifndef PyTuple_MAXSAVESIZE
+#    define PyTuple_MAXSAVESIZE 20
+#  endif
+#  define PyTuple_NFREELISTS PyTuple_MAXSAVESIZE  // one freelist per saved size
+#  ifndef PyTuple_MAXFREELIST
+#    define PyTuple_MAXFREELIST 2000
+#  endif
+#endif
+
+struct _Py_tuple_state {
+#if PyTuple_NFREELISTS > 0
+    /* There is one freelist for each size from 1 to PyTuple_MAXSAVESIZE.
+       The empty tuple is handled separately.
+
+       Each tuple stored in the array is the head of the linked list
+       (and the next available tuple) for that size.  The actual tuple
+       object is used as the linked list node, with its first item
+       (ob_item[0]) pointing to the next node (i.e. the previous head).
+       Each linked list is initially NULL. */
+    PyTupleObject *free_list[PyTuple_NFREELISTS];
+    int numfree[PyTuple_NFREELISTS];
+#else
+    char _unused;  // Empty structs are not allowed.
+#endif
+};
+
+#define _PyTuple_ITEMS(op) _Py_RVALUE(_PyTuple_CAST(op)->ob_item)
+
+extern PyObject *_PyTuple_FromArray(PyObject *const *, Py_ssize_t);
+extern PyObject *_PyTuple_FromArraySteal(PyObject *const *, Py_ssize_t);
+
+/* Tuple iterator object. */
+typedef struct {
+    PyObject_HEAD
+    Py_ssize_t it_index;   /* presumably the next index into it_seq -- confirm */
+    PyTupleObject *it_seq; /* Set to NULL when iterator is exhausted */
+} _PyTupleIterObject;
+
+#ifdef __cplusplus
+}
+#endif
+#endif   /* !Py_INTERNAL_TUPLE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_typeobject.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_typeobject.h
new file mode 100644
index 000000000000..63f76fc55c9b
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_typeobject.h
@@ -0,0 +1,149 @@
+#ifndef Py_INTERNAL_TYPEOBJECT_H
+#define Py_INTERNAL_TYPEOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "pycore_moduleobject.h"
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+
+/* state */
+
+#define _Py_TYPE_BASE_VERSION_TAG (2<<16)
+#define _Py_MAX_GLOBAL_TYPE_VERSION_TAG (_Py_TYPE_BASE_VERSION_TAG - 1)
+
+struct _types_runtime_state {
+    /* Used to set PyTypeObject.tp_version_tag for core static types. */
+    // bpo-42745: next_version_tag remains shared by all interpreters
+    // because of static types.
+    unsigned int next_version_tag;
+};
+
+
+// Type attribute lookup cache: speed up attribute and method lookups,
+// see _PyType_Lookup().
+struct type_cache_entry {
+    unsigned int version;  // initialized from type->tp_version_tag
+    PyObject *name;        // reference to exactly a str or None
+    PyObject *value;       // borrowed reference or NULL
+};
+
+#define MCACHE_SIZE_EXP 12
+
+struct type_cache {
+    struct type_cache_entry hashtable[1 << MCACHE_SIZE_EXP];
+};
+
+/* For now we hard-code this to a value for which we are confident
+   all the static builtin types will fit (for all builds). */
+#define _Py_MAX_STATIC_BUILTIN_TYPES 200
+/* State kept for one static builtin type; types_state.builtins[] holds these. */
+typedef struct {
+    PyTypeObject *type;
+    int readying;  /* NOTE(review): presumably set while the type is being readied -- confirm */
+    int ready;     /* NOTE(review): presumably set once readying has finished -- confirm */
+    // XXX tp_dict can probably be statically allocated,
+    // instead of dynamically and stored on the interpreter.
+    PyObject *tp_dict;
+    PyObject *tp_subclasses;
+    /* We never clean up weakrefs for static builtin types since
+       they will effectively never get triggered.  However, there
+       are also some diagnostic uses for the list of weakrefs,
+       so we still keep it. */
+    PyObject *tp_weaklist;
+} static_builtin_state;
+/* Type state: version tag counter, lookup cache, and static builtin records. */
+struct types_state {
+    /* Used to set PyTypeObject.tp_version_tag.
+       It starts at _Py_MAX_GLOBAL_TYPE_VERSION_TAG + 1,
+       where all those lower numbers are used for core static types. */
+    unsigned int next_version_tag;
+
+    struct type_cache type_cache;     /* attribute lookup cache, see _PyType_Lookup() */
+    size_t num_builtins_initialized;  /* presumably counts used builtins[] slots -- confirm */
+    static_builtin_state builtins[_Py_MAX_STATIC_BUILTIN_TYPES];
+};
+
+
+/* runtime lifecycle */
+
+extern PyStatus _PyTypes_InitTypes(PyInterpreterState *);
+extern void _PyTypes_FiniTypes(PyInterpreterState *);
+extern void _PyTypes_Fini(PyInterpreterState *);
+
+
+/* other API */
+
+/* Length of array of slotdef pointers used to store slots with the
+   same __name__.  There should be at most MAX_EQUIV-1 slotdef entries with
+   the same __name__, for any __name__. Since that's a static property, it is
+   appropriate to declare fixed-size arrays for this. */
+#define MAX_EQUIV 10
+
+typedef struct wrapperbase pytype_slotdef;
+
+/* Return the address of the tp_weaklist slot stored in 'state'. */
+static inline PyObject **
+_PyStaticType_GET_WEAKREFS_LISTPTR(static_builtin_state *state)
+{
+    assert(state != NULL);
+    return &(state->tp_weaklist);
+}
+
+/* Like PyType_GetModuleState, but only asserts (instead of verifying) that
+ * type is a heap type with an associated module; returns that module's md_state. */
+static inline void *
+_PyType_GetModuleState(PyTypeObject *type)
+{
+    assert(PyType_Check(type));
+    assert(type->tp_flags & Py_TPFLAGS_HEAPTYPE);  // heap-type requirement is only asserted
+    PyHeapTypeObject *et = (PyHeapTypeObject *)type;
+    assert(et->ht_module);
+    PyModuleObject *mod = (PyModuleObject *)(et->ht_module);
+    assert(mod != NULL);  // same pointer as the et->ht_module asserted above
+    return mod->md_state;
+}
+
+
+extern int _PyStaticType_InitBuiltin(PyInterpreterState *, PyTypeObject *type);
+extern static_builtin_state * _PyStaticType_GetState(PyInterpreterState *, PyTypeObject *);
+extern void _PyStaticType_ClearWeakRefs(PyInterpreterState *, PyTypeObject *type);
+extern void _PyStaticType_Dealloc(PyInterpreterState *, PyTypeObject *);
+
+PyAPI_FUNC(PyObject *) _PyType_GetDict(PyTypeObject *);
+extern PyObject * _PyType_GetBases(PyTypeObject *type);
+extern PyObject * _PyType_GetMRO(PyTypeObject *type);
+extern PyObject* _PyType_GetSubclasses(PyTypeObject *);
+extern int _PyType_HasSubclasses(PyTypeObject *);
+
+// Zero means PyType_Ready() still has to be called; see also Py_TPFLAGS_READY.
+static inline int
+_PyType_IsReady(PyTypeObject *type)
+{
+    PyObject *dict = _PyType_GetDict(type);
+    return dict != NULL;
+}
+
+PyObject *
+_Py_type_getattro_impl(PyTypeObject *type, PyObject *name, int *suppress_missing_attribute);
+PyObject *
+_Py_type_getattro(PyTypeObject *type, PyObject *name);
+
+extern PyObject* _Py_BaseObject_RichCompare(PyObject* self, PyObject* other, int op);
+
+PyObject *_Py_slot_tp_getattro(PyObject *self, PyObject *name);
+PyObject *_Py_slot_tp_getattr_hook(PyObject *self, PyObject *name);
+
+PyAPI_DATA(PyTypeObject) _PyBufferWrapper_Type;
+
+PyObject *
+_PySuper_Lookup(PyTypeObject *su_type, PyObject *su_obj, PyObject *name, int *meth_found);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_TYPEOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_typevarobject.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_typevarobject.h
new file mode 100644
index 000000000000..c9fa97d68207
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_typevarobject.h
@@ -0,0 +1,24 @@
+#ifndef Py_INTERNAL_TYPEVAROBJECT_H
+#define Py_INTERNAL_TYPEVAROBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+extern PyObject *_Py_make_typevar(PyObject *, PyObject *, PyObject *);
+extern PyObject *_Py_make_paramspec(PyThreadState *, PyObject *);
+extern PyObject *_Py_make_typevartuple(PyThreadState *, PyObject *);
+extern PyObject *_Py_make_typealias(PyThreadState *, PyObject *);
+extern PyObject *_Py_subscript_generic(PyThreadState *, PyObject *);
+extern int _Py_initialize_generic(PyInterpreterState *);
+extern void _Py_clear_generic_types(PyInterpreterState *);
+
+extern PyTypeObject _PyTypeAlias_Type;
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_TYPEVAROBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_ucnhash.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_ucnhash.h
new file mode 100644
index 000000000000..187dd68e7347
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_ucnhash.h
@@ -0,0 +1,34 @@
+/* Unicode name database interface */
+#ifndef Py_INTERNAL_UCNHASH_H
+#define Py_INTERNAL_UCNHASH_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+/* revised ucnhash CAPI interface (exported through a "wrapper") */
+
+#define PyUnicodeData_CAPSULE_NAME "unicodedata._ucnhash_CAPI"
+
+typedef struct {
+
+    /* Get name for a given character code.
+       Returns non-zero on success, zero otherwise.
+       Does not set Python exceptions. */
+    int (*getname)(Py_UCS4 code, char* buffer, int buflen,
+                   int with_alias_and_seq);
+
+    /* Get character code for a given name. Same contract as getname():
+       non-zero on success, zero otherwise, no Python exception is set. */
+    int (*getcode)(const char* name, int namelen, Py_UCS4* code,
+                   int with_named_seq);
+
+} _PyUnicode_Name_CAPI;
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_UCNHASH_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_unicodeobject.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_unicodeobject.h
new file mode 100644
index 000000000000..1bb0f366e781
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_unicodeobject.h
@@ -0,0 +1,70 @@
+#ifndef Py_INTERNAL_UNICODEOBJECT_H
+#define Py_INTERNAL_UNICODEOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_fileutils.h"     // _Py_error_handler
+#include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
+
+void _PyUnicode_ExactDealloc(PyObject *op);
+Py_ssize_t _PyUnicode_InternedSize(void);
+
+/* runtime lifecycle */
+
+extern void _PyUnicode_InitState(PyInterpreterState *);
+extern PyStatus _PyUnicode_InitGlobalObjects(PyInterpreterState *);
+extern PyStatus _PyUnicode_InitTypes(PyInterpreterState *);
+extern void _PyUnicode_Fini(PyInterpreterState *);
+extern void _PyUnicode_FiniTypes(PyInterpreterState *);
+
+extern PyTypeObject _PyUnicodeASCIIIter_Type;
+
+/* other API */
+
+struct _Py_unicode_runtime_ids {
+    PyThread_type_lock lock;  // NOTE(review): presumably guards next_index -- confirm
+    // next_index value must be preserved when Py_Initialize()/Py_Finalize()
+    // is called multiple times: see _PyUnicode_FromId() implementation.
+    Py_ssize_t next_index;
+};
+
+struct _Py_unicode_runtime_state {
+    struct _Py_unicode_runtime_ids ids;  // lock + next_index (struct declared just above)
+};
+
+/* fs_codec.encoding is initialized to NULL.
+   Later, it is set to a non-NULL string by _PyUnicode_InitEncodings(). */
+struct _Py_unicode_fs_codec {
+    char *encoding;   // Filesystem encoding (encoded to UTF-8)
+    int utf8;         // Flag: is encoding == "utf-8"?
+    char *errors;     // Filesystem errors (encoded to UTF-8)
+    _Py_error_handler error_handler;  // NOTE(review): presumably derived from `errors` -- confirm
+};
+
+struct _Py_unicode_ids {
+    Py_ssize_t size;   // NOTE(review): presumably the number of slots in `array` -- confirm
+    PyObject **array;  // NOTE(review): presumably indexed by identifier index -- confirm
+};
+
+struct _Py_unicode_state {
+    struct _Py_unicode_fs_codec fs_codec;  // encoding stays NULL until _PyUnicode_InitEncodings()
+
+    _PyUnicode_Name_CAPI *ucnhash_capi;    // callback-table type from pycore_ucnhash.h
+
+    // Unicode identifiers (_Py_Identifier): see _PyUnicode_FromId()
+    struct _Py_unicode_ids ids;
+};
+
+extern void _PyUnicode_InternInPlace(PyInterpreterState *interp, PyObject **p);
+extern void _PyUnicode_ClearInterned(PyInterpreterState *interp);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_UNICODEOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_unicodeobject_generated.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_unicodeobject_generated.h
new file mode 100644
index 000000000000..9b470094b7af
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_unicodeobject_generated.h
@@ -0,0 +1,2093 @@
+#ifndef Py_INTERNAL_UNICODEOBJECT_GENERATED_H
+#define Py_INTERNAL_UNICODEOBJECT_GENERATED_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+/* The following is auto-generated by Tools/build/generate_global_objects.py. */
+static inline void
+_PyUnicode_InitStaticStrings(PyInterpreterState *interp) {
+    PyObject *string;
+    string = &_Py_ID(CANCELLED);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(FINISHED);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(False);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(JSONDecodeError);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(PENDING);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(Py_Repr);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(TextIOWrapper);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(True);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(WarningMessage);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_WindowsConsoleIO);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__IOBase_closed);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__abc_tpflags__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__abs__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__abstractmethods__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__add__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__aenter__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__aexit__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__aiter__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__all__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__and__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__anext__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__annotations__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__args__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__asyncio_running_event_loop__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__await__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__bases__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__bool__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__buffer__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__build_class__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__builtins__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__bytes__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__call__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__cantrace__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__class__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__class_getitem__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__classcell__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__classdict__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__classdictcell__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__complex__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__contains__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__copy__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__ctypes_from_outparam__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__del__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__delattr__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__delete__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__delitem__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__dict__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__dictoffset__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__dir__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__divmod__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__doc__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__enter__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__eq__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__exit__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__file__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__float__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__floordiv__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__format__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__fspath__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__ge__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__get__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__getattr__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__getattribute__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__getinitargs__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__getitem__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__getnewargs__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__getnewargs_ex__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__getstate__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__gt__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__hash__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__iadd__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__iand__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__ifloordiv__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__ilshift__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__imatmul__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__imod__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__import__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__imul__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__index__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__init__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__init_subclass__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__instancecheck__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__int__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__invert__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__ior__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__ipow__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__irshift__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__isabstractmethod__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__isub__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__iter__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__itruediv__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__ixor__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__le__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__len__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__length_hint__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__lltrace__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__loader__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__lshift__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__lt__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__main__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__matmul__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__missing__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__mod__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__module__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__mro_entries__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__mul__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__name__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__ne__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__neg__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__new__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__newobj__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__newobj_ex__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__next__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__notes__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__or__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__orig_class__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__origin__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__package__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__parameters__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__path__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__pos__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__pow__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__prepare__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__qualname__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__radd__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__rand__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__rdivmod__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__reduce__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__reduce_ex__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__release_buffer__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__repr__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__reversed__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__rfloordiv__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__rlshift__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__rmatmul__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__rmod__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__rmul__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__ror__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__round__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__rpow__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__rrshift__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__rshift__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__rsub__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__rtruediv__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__rxor__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__set__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__set_name__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__setattr__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__setitem__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__setstate__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__sizeof__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__slotnames__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__slots__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__spec__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__str__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__sub__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__subclasscheck__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__subclasshook__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__truediv__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__trunc__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__type_params__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__typing_is_unpacked_typevartuple__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__typing_prepare_subst__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__typing_subst__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__typing_unpacked_tuple_args__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__warningregistry__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__weaklistoffset__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__weakref__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(__xor__);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_abc_impl);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_abstract_);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_active);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_annotation);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_anonymous_);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_argtypes_);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_as_parameter_);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_asyncio_future_blocking);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_blksize);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_bootstrap);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_check_retval_);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_dealloc_warn);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_feature_version);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_fields_);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_finalizing);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_find_and_load);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_fix_up_module);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_flags_);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_get_sourcefile);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_handle_fromlist);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_initializing);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_io);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_is_text_encoding);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_length_);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_limbo);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_lock_unlock_module);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_loop);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_needs_com_addref_);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_pack_);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_restype_);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_showwarnmsg);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_shutdown);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_slotnames);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_strptime_datetime);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_swappedbytes_);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_type_);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_uninitialized_submodules);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_warn_unawaited_coroutine);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(_xoptions);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(a);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(abs_tol);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(access);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(add);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(add_done_callback);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(after_in_child);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(after_in_parent);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(aggregate_class);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(alias);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(append);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(arg);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(argdefs);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(args);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(arguments);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(argv);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(as_integer_ratio);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(ast);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(attribute);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(authorizer_callback);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(autocommit);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(b);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(backtick);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(base);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(before);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(big);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(binary_form);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(block);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(bound);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(buffer);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(buffer_callback);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(buffer_size);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(buffering);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(buffers);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(bufsize);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(builtins);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(byteorder);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(bytes);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(bytes_per_sep);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(c);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(c_call);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(c_exception);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(c_return);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(cached_statements);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(cadata);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(cafile);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(call);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(call_exception_handler);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(call_soon);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(cancel);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(capath);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(category);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(cb_type);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(certfile);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(check_same_thread);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(clear);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(close);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(closed);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(closefd);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(closure);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_argcount);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_cellvars);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_code);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_consts);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_exceptiontable);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_filename);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_firstlineno);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_flags);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_freevars);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_kwonlyargcount);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_linetable);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_name);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_names);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_nlocals);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_posonlyargcount);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_qualname);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_stacksize);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(co_varnames);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(code);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(command);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(comment_factory);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(compile_mode);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(consts);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(context);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(contravariant);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(cookie);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(copy);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(copyreg);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(coro);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(count);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(covariant);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(cwd);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(d);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(data);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(database);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(decode);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(decoder);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(default);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(defaultaction);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(delete);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(depth);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(detect_types);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(deterministic);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(device);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(dict);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(dictcomp);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(difference_update);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(digest);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(digest_size);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(digestmod);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(dir_fd);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(discard);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(dispatch_table);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(displayhook);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(dklen);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(doc);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(dont_inherit);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(dst);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(dst_dir_fd);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(duration);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(e);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(eager_start);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(effective_ids);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(element_factory);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(encode);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(encoding);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(end);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(end_lineno);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(end_offset);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(endpos);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(entrypoint);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(env);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(errors);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(event);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(eventmask);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(exc_type);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(exc_value);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(excepthook);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(exception);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(existing_file_name);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(exp);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(extend);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(extra_tokens);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(facility);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(factory);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(false);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(family);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(fanout);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(fd);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(fd2);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(fdel);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(fget);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(file);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(file_actions);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(filename);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(fileno);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(filepath);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(fillvalue);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(filters);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(final);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(find_class);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(fix_imports);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(flags);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(flush);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(follow_symlinks);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(format);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(frequency);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(from_param);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(fromlist);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(fromtimestamp);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(fromutc);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(fset);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(func);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(future);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(generation);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(genexpr);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(get);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(get_debug);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(get_event_loop);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(get_loop);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(get_source);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(getattr);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(getstate);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(gid);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(globals);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(groupindex);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(groups);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(handle);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(hash_name);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(header);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(headers);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(hi);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(hook);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(id);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(ident);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(ignore);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(imag);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(importlib);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(in_fd);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(incoming);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(indexgroup);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(inf);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(infer_variance);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(inheritable);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(initial);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(initial_bytes);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(initial_value);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(initval);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(inner_size);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(input);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(insert_comments);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(insert_pis);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(instructions);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(intern);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(intersection);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(is_running);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(isatty);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(isinstance);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(isoformat);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(isolation_level);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(istext);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(item);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(items);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(iter);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(iterable);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(iterations);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(join);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(jump);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(keepends);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(key);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(keyfile);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(keys);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(kind);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(kw);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(kw1);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(kw2);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(lambda);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(last);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(last_exc);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(last_node);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(last_traceback);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(last_type);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(last_value);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(latin1);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(leaf_size);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(len);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(length);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(level);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(limit);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(line);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(line_buffering);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(lineno);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(listcomp);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(little);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(lo);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(locale);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(locals);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(logoption);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(loop);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(mapping);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(match);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(max_length);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(maxdigits);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(maxevents);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(maxmem);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(maxsplit);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(maxvalue);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(memLevel);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(memlimit);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(message);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(metaclass);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(metadata);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(method);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(mod);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(mode);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(module);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(module_globals);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(modules);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(mro);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(msg);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(mycmp);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(n);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(n_arg);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(n_fields);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(n_sequence_fields);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(n_unnamed_fields);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(name);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(name_from);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(namespace_separator);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(namespaces);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(narg);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(ndigits);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(new_file_name);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(new_limit);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(newline);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(newlines);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(next);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(nlocals);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(node_depth);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(node_offset);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(ns);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(nstype);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(nt);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(null);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(number);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(obj);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(object);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(offset);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(offset_dst);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(offset_src);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(on_type_read);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(onceregistry);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(only_keys);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(oparg);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(opcode);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(open);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(opener);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(operation);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(optimize);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(options);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(order);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(origin);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(out_fd);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(outgoing);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(overlapped);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(owner);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(p);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(pages);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(parent);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(password);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(path);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(pattern);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(peek);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(persistent_id);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(persistent_load);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(person);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(pi_factory);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(pid);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(policy);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(pos);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(pos1);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(pos2);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(posix);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(print_file_and_line);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(priority);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(progress);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(progress_handler);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(progress_routine);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(proto);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(protocol);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(ps1);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(ps2);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(query);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(quotetabs);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(r);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(raw);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(read);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(read1);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(readable);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(readall);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(readinto);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(readinto1);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(readline);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(readonly);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(real);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(reducer_override);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(registry);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(rel_tol);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(release);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(reload);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(repl);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(replace);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(reserved);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(reset);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(resetids);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(return);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(reverse);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(reversed);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(s);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(salt);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(sched_priority);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(scheduler);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(seek);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(seekable);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(selectors);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(self);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(send);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(sep);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(sequence);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(server_hostname);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(server_side);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(session);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(setcomp);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(setpgroup);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(setsid);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(setsigdef);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(setsigmask);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(setstate);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(shape);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(show_cmd);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(signed);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(size);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(sizehint);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(skip_file_prefixes);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(sleep);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(sock);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(sort);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(sound);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(source);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(source_traceback);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(src);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(src_dir_fd);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(stacklevel);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(start);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(statement);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(status);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(stderr);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(stdin);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(stdout);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(step);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(steps);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(store_name);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(strategy);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(strftime);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(strict);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(strict_mode);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(string);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(sub_key);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(symmetric_difference_update);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(tabsize);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(tag);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(target);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(target_is_directory);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(task);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(tb_frame);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(tb_lasti);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(tb_lineno);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(tb_next);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(tell);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(template);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(term);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(text);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(threading);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(throw);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(timeout);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(times);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(timetuple);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(top);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(trace_callback);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(traceback);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(trailers);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(translate);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(true);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(truncate);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(twice);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(txt);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(type);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(type_params);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(tz);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(tzname);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(uid);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(unlink);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(unraisablehook);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(uri);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(usedforsecurity);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(value);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(values);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(version);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(volume);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(warnings);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(warnoptions);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(wbits);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(week);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(weekday);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(which);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(who);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(withdata);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(writable);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(write);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(write_through);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(x);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(year);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+    string = &_Py_ID(zdict);
+    assert(_PyUnicode_CheckConsistency(string, 1));
+    _PyUnicode_InternInPlace(interp, &string);
+}
+/* End auto-generated code */
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_UNICODEOBJECT_GENERATED_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_unionobject.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_unionobject.h
new file mode 100644
index 000000000000..87264635b6e1
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_unionobject.h
@@ -0,0 +1,23 @@
+#ifndef Py_INTERNAL_UNIONOBJECT_H
+#define Py_INTERNAL_UNIONOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+extern PyTypeObject _PyUnion_Type;
+#define _PyUnion_Check(op) Py_IS_TYPE((op), &_PyUnion_Type)
+extern PyObject *_Py_union_type_or(PyObject *, PyObject *);
+
+#define _PyGenericAlias_Check(op) PyObject_TypeCheck((op), &Py_GenericAliasType)
+extern PyObject *_Py_subs_parameters(PyObject *, PyObject *, PyObject *, PyObject *);
+extern PyObject *_Py_make_parameters(PyObject *);
+extern PyObject *_Py_union_args(PyObject *self);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_UNIONOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/internal/pycore_warnings.h b/nanvix-port/cpython-headers/python3.12/internal/pycore_warnings.h
new file mode 100644
index 000000000000..efb4f1cd7eac
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/internal/pycore_warnings.h
@@ -0,0 +1,29 @@
+#ifndef Py_INTERNAL_WARNINGS_H
+#define Py_INTERNAL_WARNINGS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+struct _warnings_runtime_state {  /* state used by the warnings machinery */
+    /* Both 'filters' and 'onceregistry' can be set in warnings.py;
+       get_warnings_attr() will reset these variables accordingly. */
+    PyObject *filters;  /* List */
+    PyObject *once_registry;  /* Dict */
+    PyObject *default_action; /* String */
+    long filters_version;  /* NOTE(review): presumably changes whenever 'filters' is modified -- confirm in _warnings.c */
+};
+
+extern int _PyWarnings_InitState(PyInterpreterState *interp);
+
+PyAPI_FUNC(PyObject*) _PyWarnings_Init(void);
+
+extern void _PyErr_WarnUnawaitedCoroutine(PyObject *coro);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_WARNINGS_H */
diff --git a/nanvix-port/cpython-headers/python3.12/interpreteridobject.h b/nanvix-port/cpython-headers/python3.12/interpreteridobject.h
new file mode 100644
index 000000000000..8432632f339e
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/interpreteridobject.h
@@ -0,0 +1,17 @@
+#ifndef Py_INTERPRETERIDOBJECT_H
+#define Py_INTERPRETERIDOBJECT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_INTERPRETERIDOBJECT_H
+#  include "cpython/interpreteridobject.h"
+#  undef Py_CPYTHON_INTERPRETERIDOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERPRETERIDOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/intrcheck.h b/nanvix-port/cpython-headers/python3.12/intrcheck.h
new file mode 100644
index 000000000000..b8cc65601683
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/intrcheck.h
@@ -0,0 +1,30 @@
+#ifndef Py_INTRCHECK_H
+#define Py_INTRCHECK_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_FUNC(int) PyOS_InterruptOccurred(void);
+#ifdef HAVE_FORK
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000
+PyAPI_FUNC(void) PyOS_BeforeFork(void);
+PyAPI_FUNC(void) PyOS_AfterFork_Parent(void);
+PyAPI_FUNC(void) PyOS_AfterFork_Child(void);
+#endif
+#endif
+/* Deprecated, please use PyOS_AfterFork_Child() instead */
+Py_DEPRECATED(3.7) PyAPI_FUNC(void) PyOS_AfterFork(void);
+
+#ifndef Py_LIMITED_API
+PyAPI_FUNC(int) _PyOS_IsMainThread(void);
+
+#ifdef MS_WINDOWS
+/* windows.h is not included by Python.h so use void* instead of HANDLE */
+PyAPI_FUNC(void*) _PyOS_SigintEvent(void);
+#endif
+#endif /* !Py_LIMITED_API */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTRCHECK_H */
diff --git a/nanvix-port/cpython-headers/python3.12/iterobject.h b/nanvix-port/cpython-headers/python3.12/iterobject.h
new file mode 100644
index 000000000000..fff30f7176fd
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/iterobject.h
@@ -0,0 +1,27 @@
+#ifndef Py_ITEROBJECT_H
+#define Py_ITEROBJECT_H
+/* Iterators (the basic kind, over a sequence) */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_DATA(PyTypeObject) PySeqIter_Type;
+PyAPI_DATA(PyTypeObject) PyCallIter_Type;
+#ifdef Py_BUILD_CORE
+extern PyTypeObject _PyAnextAwaitable_Type;
+#endif
+
+#define PySeqIter_Check(op) Py_IS_TYPE((op), &PySeqIter_Type)
+
+PyAPI_FUNC(PyObject *) PySeqIter_New(PyObject *);
+
+
+#define PyCallIter_Check(op) Py_IS_TYPE((op), &PyCallIter_Type)
+
+PyAPI_FUNC(PyObject *) PyCallIter_New(PyObject *, PyObject *);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_ITEROBJECT_H */
+
diff --git a/nanvix-port/cpython-headers/python3.12/listobject.h b/nanvix-port/cpython-headers/python3.12/listobject.h
new file mode 100644
index 000000000000..6b7041ba0b05
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/listobject.h
@@ -0,0 +1,52 @@
+/* List object interface
+
+   Another generally useful object type is a list of object pointers.
+   This is a mutable type: the list items can be changed, and items can be
+   added or removed. Out-of-range indices or non-list objects are ignored.
+
+   WARNING: PyList_SetItem does not increment the new item's reference count,
+   but does decrement the reference count of the item it replaces, if not nil.
+   It also *decrements* the new item's reference count if the item is *not*
+   inserted in the list.  Similarly, PyList_GetItem does not increment the
+   returned item's reference count.
+*/
+
+#ifndef Py_LISTOBJECT_H
+#define Py_LISTOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_DATA(PyTypeObject) PyList_Type;
+PyAPI_DATA(PyTypeObject) PyListIter_Type;
+PyAPI_DATA(PyTypeObject) PyListRevIter_Type;
+
+#define PyList_Check(op) \
+    PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_LIST_SUBCLASS)
+#define PyList_CheckExact(op) Py_IS_TYPE((op), &PyList_Type)
+
+PyAPI_FUNC(PyObject *) PyList_New(Py_ssize_t size);
+PyAPI_FUNC(Py_ssize_t) PyList_Size(PyObject *);
+
+PyAPI_FUNC(PyObject *) PyList_GetItem(PyObject *, Py_ssize_t);
+PyAPI_FUNC(int) PyList_SetItem(PyObject *, Py_ssize_t, PyObject *);
+PyAPI_FUNC(int) PyList_Insert(PyObject *, Py_ssize_t, PyObject *);
+PyAPI_FUNC(int) PyList_Append(PyObject *, PyObject *);
+
+PyAPI_FUNC(PyObject *) PyList_GetSlice(PyObject *, Py_ssize_t, Py_ssize_t);
+PyAPI_FUNC(int) PyList_SetSlice(PyObject *, Py_ssize_t, Py_ssize_t, PyObject *);
+
+PyAPI_FUNC(int) PyList_Sort(PyObject *);
+PyAPI_FUNC(int) PyList_Reverse(PyObject *);
+PyAPI_FUNC(PyObject *) PyList_AsTuple(PyObject *);
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_LISTOBJECT_H
+#  include "cpython/listobject.h"
+#  undef Py_CPYTHON_LISTOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_LISTOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/longobject.h b/nanvix-port/cpython-headers/python3.12/longobject.h
new file mode 100644
index 000000000000..c8b749735386
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/longobject.h
@@ -0,0 +1,108 @@
+#ifndef Py_LONGOBJECT_H
+#define Py_LONGOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Long (arbitrary precision) integer object interface */
+
+// PyLong_Type is declared by object.h
+
+#define PyLong_Check(op) \
+        PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_LONG_SUBCLASS)
+#define PyLong_CheckExact(op) Py_IS_TYPE((op), &PyLong_Type)
+
+PyAPI_FUNC(PyObject *) PyLong_FromLong(long);
+PyAPI_FUNC(PyObject *) PyLong_FromUnsignedLong(unsigned long);
+PyAPI_FUNC(PyObject *) PyLong_FromSize_t(size_t);
+PyAPI_FUNC(PyObject *) PyLong_FromSsize_t(Py_ssize_t);
+PyAPI_FUNC(PyObject *) PyLong_FromDouble(double);
+PyAPI_FUNC(long) PyLong_AsLong(PyObject *);
+PyAPI_FUNC(long) PyLong_AsLongAndOverflow(PyObject *, int *);
+PyAPI_FUNC(Py_ssize_t) PyLong_AsSsize_t(PyObject *);
+PyAPI_FUNC(size_t) PyLong_AsSize_t(PyObject *);
+PyAPI_FUNC(unsigned long) PyLong_AsUnsignedLong(PyObject *);
+PyAPI_FUNC(unsigned long) PyLong_AsUnsignedLongMask(PyObject *);
+PyAPI_FUNC(PyObject *) PyLong_GetInfo(void);
+
+/* It may be useful in the future. I've added it in the PyInt -> PyLong
+   cleanup to keep the extra information. [CH] */
+#define PyLong_AS_LONG(op) PyLong_AsLong(op)
+
+/* Issue #1983: pid_t can be longer than a C long on some systems */
+#if !defined(SIZEOF_PID_T) || SIZEOF_PID_T == SIZEOF_INT
+#define _Py_PARSE_PID "i"
+#define PyLong_FromPid PyLong_FromLong
+# ifndef Py_LIMITED_API
+#   define PyLong_AsPid _PyLong_AsInt
+# elif SIZEOF_INT == SIZEOF_LONG
+#   define PyLong_AsPid PyLong_AsLong
+# else
+static inline int
+PyLong_AsPid(PyObject *obj)
+{
+    int out_of_long = 0;  /* overflow indicator filled in by the call below */
+    long as_long = PyLong_AsLongAndOverflow(obj, &out_of_long);
+    if (!out_of_long && as_long >= INT_MIN && as_long <= INT_MAX) {
+        return (int)as_long;  /* value is representable as a C int */
+    }
+    PyErr_SetString(PyExc_OverflowError,
+                    "Python int too large to convert to C int");
+    return -1;
+}
+# endif
+#elif SIZEOF_PID_T == SIZEOF_LONG
+#define _Py_PARSE_PID "l"
+#define PyLong_FromPid PyLong_FromLong
+#define PyLong_AsPid PyLong_AsLong
+#elif defined(SIZEOF_LONG_LONG) && SIZEOF_PID_T == SIZEOF_LONG_LONG
+#define _Py_PARSE_PID "L"
+#define PyLong_FromPid PyLong_FromLongLong
+#define PyLong_AsPid PyLong_AsLongLong
+#else
+#error "sizeof(pid_t) is neither sizeof(int), sizeof(long) or sizeof(long long)"
+#endif /* SIZEOF_PID_T */
+
+#if SIZEOF_VOID_P == SIZEOF_INT
+#  define _Py_PARSE_INTPTR "i"
+#  define _Py_PARSE_UINTPTR "I"
+#elif SIZEOF_VOID_P == SIZEOF_LONG
+#  define _Py_PARSE_INTPTR "l"
+#  define _Py_PARSE_UINTPTR "k"
+#elif defined(SIZEOF_LONG_LONG) && SIZEOF_VOID_P == SIZEOF_LONG_LONG
+#  define _Py_PARSE_INTPTR "L"
+#  define _Py_PARSE_UINTPTR "K"
+#else
+#  error "void* different in size from int, long and long long"
+#endif /* SIZEOF_VOID_P */
+
+PyAPI_FUNC(double) PyLong_AsDouble(PyObject *);
+PyAPI_FUNC(PyObject *) PyLong_FromVoidPtr(void *);
+PyAPI_FUNC(void *) PyLong_AsVoidPtr(PyObject *);
+
+PyAPI_FUNC(PyObject *) PyLong_FromLongLong(long long);
+PyAPI_FUNC(PyObject *) PyLong_FromUnsignedLongLong(unsigned long long);
+PyAPI_FUNC(long long) PyLong_AsLongLong(PyObject *);
+PyAPI_FUNC(unsigned long long) PyLong_AsUnsignedLongLong(PyObject *);
+PyAPI_FUNC(unsigned long long) PyLong_AsUnsignedLongLongMask(PyObject *);
+PyAPI_FUNC(long long) PyLong_AsLongLongAndOverflow(PyObject *, int *);
+
+PyAPI_FUNC(PyObject *) PyLong_FromString(const char *, char **, int);
+
+/* These aren't really part of the int object, but they're handy. The
+   functions are in Python/mystrtoul.c.
+ */
+PyAPI_FUNC(unsigned long) PyOS_strtoul(const char *, char **, int);
+PyAPI_FUNC(long) PyOS_strtol(const char *, char **, int);
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_LONGOBJECT_H
+#  include "cpython/longobject.h"
+#  undef Py_CPYTHON_LONGOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_LONGOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/marshal.h b/nanvix-port/cpython-headers/python3.12/marshal.h
new file mode 100644
index 000000000000..f8b0de80cfc3
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/marshal.h
@@ -0,0 +1,31 @@
+
+/* Interface for marshal.c */
+
+#ifndef Py_MARSHAL_H
+#define Py_MARSHAL_H
+#ifndef Py_LIMITED_API
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_FUNC(PyObject *) PyMarshal_ReadObjectFromString(const char *,
+                                                      Py_ssize_t);
+PyAPI_FUNC(PyObject *) PyMarshal_WriteObjectToString(PyObject *, int);
+
+#define Py_MARSHAL_VERSION 4
+
+PyAPI_FUNC(long) PyMarshal_ReadLongFromFile(FILE *);
+PyAPI_FUNC(int) PyMarshal_ReadShortFromFile(FILE *);
+PyAPI_FUNC(PyObject *) PyMarshal_ReadObjectFromFile(FILE *);
+PyAPI_FUNC(PyObject *) PyMarshal_ReadLastObjectFromFile(FILE *);
+
+PyAPI_FUNC(void) PyMarshal_WriteLongToFile(long, FILE *, int);
+PyAPI_FUNC(void) PyMarshal_WriteObjectToFile(PyObject *, FILE *, int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* Py_LIMITED_API */
+#endif /* !Py_MARSHAL_H */
diff --git a/nanvix-port/cpython-headers/python3.12/memoryobject.h b/nanvix-port/cpython-headers/python3.12/memoryobject.h
new file mode 100644
index 000000000000..2c9146aa2b5b
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/memoryobject.h
@@ -0,0 +1,34 @@
+/* Memory view object. In Python this is available as "memoryview". */
+
+#ifndef Py_MEMORYOBJECT_H
+#define Py_MEMORYOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_DATA(PyTypeObject) PyMemoryView_Type;
+
+#define PyMemoryView_Check(op) Py_IS_TYPE((op), &PyMemoryView_Type)
+
+PyAPI_FUNC(PyObject *) PyMemoryView_FromObject(PyObject *base);
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+PyAPI_FUNC(PyObject *) PyMemoryView_FromMemory(char *mem, Py_ssize_t size,
+                                               int flags);
+#endif
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030b0000
+PyAPI_FUNC(PyObject *) PyMemoryView_FromBuffer(const Py_buffer *info);
+#endif
+PyAPI_FUNC(PyObject *) PyMemoryView_GetContiguous(PyObject *base,
+                                                  int buffertype,
+                                                  char order);
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_MEMORYOBJECT_H
+#  include "cpython/memoryobject.h"
+#  undef Py_CPYTHON_MEMORYOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_MEMORYOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/methodobject.h b/nanvix-port/cpython-headers/python3.12/methodobject.h
new file mode 100644
index 000000000000..72af5ad933df
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/methodobject.h
@@ -0,0 +1,132 @@
+
+/* Method object interface */
+
+#ifndef Py_METHODOBJECT_H
+#define Py_METHODOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* This is about the type 'builtin_function_or_method',
+   not Python methods in user-defined classes.  See classobject.h
+   for the latter. */
+
+PyAPI_DATA(PyTypeObject) PyCFunction_Type;
+
+#define PyCFunction_CheckExact(op) Py_IS_TYPE((op), &PyCFunction_Type)
+#define PyCFunction_Check(op) PyObject_TypeCheck((op), &PyCFunction_Type)
+
+typedef PyObject *(*PyCFunction)(PyObject *, PyObject *);
+typedef PyObject *(*_PyCFunctionFast) (PyObject *, PyObject *const *, Py_ssize_t);
+typedef PyObject *(*PyCFunctionWithKeywords)(PyObject *, PyObject *,
+                                             PyObject *);
+typedef PyObject *(*_PyCFunctionFastWithKeywords) (PyObject *,
+                                                   PyObject *const *, Py_ssize_t,
+                                                   PyObject *);
+typedef PyObject *(*PyCMethod)(PyObject *, PyTypeObject *, PyObject *const *,
+                               size_t, PyObject *);
+
+// Cast a function to the PyCFunction type to use it with PyMethodDef.
+//
+// This macro can be used to prevent compiler warnings if the first parameter
+// uses a different pointer type than PyObject* (ex: METH_VARARGS and METH_O
+// calling conventions).
+//
+// The macro can also be used for METH_FASTCALL and METH_VARARGS|METH_KEYWORDS
+// calling conventions to avoid compiler warnings because the function has more
+// than 2 parameters. The macro first casts the function to the
+// "void func(void)" type to prevent compiler warnings.
+//
+// If a function is declared with the METH_NOARGS calling convention, it must
+// have 2 parameters. Since the second parameter is unused, Py_UNUSED() can be
+// used to prevent a compiler warning. If the function has a single parameter,
+// it triggers an undefined behavior when Python calls it with 2 parameters
+// (bpo-33012).
+#define _PyCFunction_CAST(func) \
+    _Py_CAST(PyCFunction, _Py_CAST(void(*)(void), (func)))
+
+PyAPI_FUNC(PyCFunction) PyCFunction_GetFunction(PyObject *);
+PyAPI_FUNC(PyObject *) PyCFunction_GetSelf(PyObject *);
+PyAPI_FUNC(int) PyCFunction_GetFlags(PyObject *);
+
+Py_DEPRECATED(3.9) PyAPI_FUNC(PyObject *) PyCFunction_Call(PyObject *, PyObject *, PyObject *);
+
+struct PyMethodDef {
+    const char  *ml_name;   /* The name of the built-in function/method */
+    PyCFunction ml_meth;    /* The C function that implements it */
+    int         ml_flags;   /* Combination of METH_xxx flags, which mostly
+                               describe the args expected by the C func */
+    const char  *ml_doc;    /* The __doc__ attribute, or NULL */
+};
+
+/* PyCFunction_New is declared as a function for stable ABI (declaration is
+ * needed for e.g. GCC with -fvisibility=hidden), but redefined as a macro
+ * that calls PyCFunction_NewEx. */
+PyAPI_FUNC(PyObject *) PyCFunction_New(PyMethodDef *, PyObject *);
+#define PyCFunction_New(ML, SELF) PyCFunction_NewEx((ML), (SELF), NULL)
+
+/* PyCFunction_NewEx is similar: on 3.9+, this calls PyCMethod_New. */
+PyAPI_FUNC(PyObject *) PyCFunction_NewEx(PyMethodDef *, PyObject *,
+                                         PyObject *);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03090000
+#define PyCFunction_NewEx(ML, SELF, MOD) PyCMethod_New((ML), (SELF), (MOD), NULL)
+PyAPI_FUNC(PyObject *) PyCMethod_New(PyMethodDef *, PyObject *,
+                                     PyObject *, PyTypeObject *);
+#endif
+
+
+/* Flag passed to newmethodobject */
+/* #define METH_OLDARGS  0x0000   -- unsupported now */
+#define METH_VARARGS  0x0001
+#define METH_KEYWORDS 0x0002
+/* METH_NOARGS and METH_O must not be combined with the flags above. */
+#define METH_NOARGS   0x0004
+#define METH_O        0x0008
+
+/* METH_CLASS and METH_STATIC are a little different; these control
+   the construction of methods for a class.  These cannot be used for
+   functions in modules. */
+#define METH_CLASS    0x0010
+#define METH_STATIC   0x0020
+
+/* METH_COEXIST allows a method to be entered even though a slot has
+   already filled the entry.  When defined, the flag allows a separate
+   method, "__contains__" for example, to coexist with a defined
+   slot like sq_contains. */
+
+#define METH_COEXIST   0x0040
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030a0000
+#  define METH_FASTCALL  0x0080
+#endif
+
+/* This bit is preserved for Stackless Python */
+#ifdef STACKLESS
+#  define METH_STACKLESS 0x0100
+#else
+#  define METH_STACKLESS 0x0000
+#endif
+
+/* METH_METHOD means the function stores an
+ * additional reference to the class that defines it;
+ * both self and class are passed to it.
+ * It uses PyCMethodObject instead of PyCFunctionObject.
+ * May not be combined with METH_NOARGS, METH_O, METH_CLASS or METH_STATIC.
+ */
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03090000
+#define METH_METHOD 0x0200
+#endif
+
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_METHODOBJECT_H
+#  include "cpython/methodobject.h"
+#  undef Py_CPYTHON_METHODOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_METHODOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/modsupport.h b/nanvix-port/cpython-headers/python3.12/modsupport.h
new file mode 100644
index 000000000000..1592bd0db4ff
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/modsupport.h
@@ -0,0 +1,168 @@
+
+#ifndef Py_MODSUPPORT_H
+#define Py_MODSUPPORT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Module support interface */
+
+#include <stdarg.h>               // va_list
+
+/* If PY_SSIZE_T_CLEAN is defined, each of these functions treats the
+   #-specifier as meaning Py_ssize_t */
+#ifdef PY_SSIZE_T_CLEAN
+#define PyArg_Parse                     _PyArg_Parse_SizeT
+#define PyArg_ParseTuple                _PyArg_ParseTuple_SizeT
+#define PyArg_ParseTupleAndKeywords     _PyArg_ParseTupleAndKeywords_SizeT
+#define PyArg_VaParse                   _PyArg_VaParse_SizeT
+#define PyArg_VaParseTupleAndKeywords   _PyArg_VaParseTupleAndKeywords_SizeT
+#define Py_BuildValue                   _Py_BuildValue_SizeT
+#define Py_VaBuildValue                 _Py_VaBuildValue_SizeT
+#endif
+
+/* Due to a glitch in 3.2, the _SizeT versions weren't exported from the DLL. */
+#if !defined(PY_SSIZE_T_CLEAN) || !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+PyAPI_FUNC(int) PyArg_Parse(PyObject *, const char *, ...);
+PyAPI_FUNC(int) PyArg_ParseTuple(PyObject *, const char *, ...);
+PyAPI_FUNC(int) PyArg_ParseTupleAndKeywords(PyObject *, PyObject *,
+                                                  const char *, char **, ...);
+PyAPI_FUNC(int) PyArg_VaParse(PyObject *, const char *, va_list);
+PyAPI_FUNC(int) PyArg_VaParseTupleAndKeywords(PyObject *, PyObject *,
+                                                  const char *, char **, va_list);
+#endif
+PyAPI_FUNC(int) PyArg_ValidateKeywordArguments(PyObject *);
+PyAPI_FUNC(int) PyArg_UnpackTuple(PyObject *, const char *, Py_ssize_t, Py_ssize_t, ...);
+PyAPI_FUNC(PyObject *) Py_BuildValue(const char *, ...);
+PyAPI_FUNC(PyObject *) _Py_BuildValue_SizeT(const char *, ...);
+
+
+PyAPI_FUNC(PyObject *) Py_VaBuildValue(const char *, va_list);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030a0000
+// Add an attribute with name 'name' and value 'value' to the module 'mod'.
+// On success, return 0.
+// On error, raise an exception and return -1.
+PyAPI_FUNC(int) PyModule_AddObjectRef(PyObject *mod, const char *name, PyObject *value);
+#endif   /* Py_LIMITED_API */
+
+// Similar to PyModule_AddObjectRef(), but steals a reference to 'value'
+// (Py_DECREF(value)) on success (if it returns 0).
+PyAPI_FUNC(int) PyModule_AddObject(PyObject *mod, const char *, PyObject *value);
+
+PyAPI_FUNC(int) PyModule_AddIntConstant(PyObject *, const char *, long);
+PyAPI_FUNC(int) PyModule_AddStringConstant(PyObject *, const char *, const char *);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03090000
+/* New in 3.9 */
+PyAPI_FUNC(int) PyModule_AddType(PyObject *module, PyTypeObject *type);
+#endif /* Py_LIMITED_API */
+
+#define PyModule_AddIntMacro(m, c) PyModule_AddIntConstant((m), #c, (c))
+#define PyModule_AddStringMacro(m, c) PyModule_AddStringConstant((m), #c, (c))
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+/* New in 3.5 */
+PyAPI_FUNC(int) PyModule_SetDocString(PyObject *, const char *);
+PyAPI_FUNC(int) PyModule_AddFunctions(PyObject *, PyMethodDef *);
+PyAPI_FUNC(int) PyModule_ExecDef(PyObject *module, PyModuleDef *def);
+#endif
+
+#define Py_CLEANUP_SUPPORTED 0x20000
+
+#define PYTHON_API_VERSION 1013
+#define PYTHON_API_STRING "1013"
+/* The API version is maintained (independently from the Python version)
+   so we can detect mismatches between the interpreter and dynamically
+   loaded modules.  These are diagnosed by an error message but
+   the module is still loaded (because the mismatch can only be tested
+   after loading the module).  The error message is intended to
+   explain the core dump a few seconds later.
+
+   The symbol PYTHON_API_STRING defines the same value as a string
+   literal.  *** PLEASE MAKE SURE THE DEFINITIONS MATCH. ***
+
+   Please add a line or two to the top of this log for each API
+   version change:
+
+   22-Feb-2006  MvL     1013    PEP 353 - long indices for sequence lengths
+
+   19-Aug-2002  GvR     1012    Changes to string object struct for
+                                interning changes, saving 3 bytes.
+
+   17-Jul-2001  GvR     1011    Descr-branch, just to be on the safe side
+
+   25-Jan-2001  FLD     1010    Parameters added to PyCode_New() and
+                                PyFrame_New(); Python 2.1a2
+
+   14-Mar-2000  GvR     1009    Unicode API added
+
+   3-Jan-1999   GvR     1007    Decided to change back!  (Don't reuse 1008!)
+
+   3-Dec-1998   GvR     1008    Python 1.5.2b1
+
+   18-Jan-1997  GvR     1007    string interning and other speedups
+
+   11-Oct-1996  GvR     renamed Py_Ellipses to Py_Ellipsis :-(
+
+   30-Jul-1996  GvR     Slice and ellipses syntax added
+
+   23-Jul-1996  GvR     For 1.4 -- better safe than sorry this time :-)
+
+   7-Nov-1995   GvR     Keyword arguments (should've been done at 1.3 :-( )
+
+   10-Jan-1995  GvR     Renamed globals to new naming scheme
+
+   9-Jan-1995   GvR     Initial version (incompatible with older API)
+*/
+
+/* The PYTHON_ABI_VERSION is introduced in PEP 384. For the lifetime of
+   Python 3, it will stay at the value of 3; changes to the limited API
+   must be performed in a strictly backwards-compatible manner. */
+#define PYTHON_ABI_VERSION 3
+#define PYTHON_ABI_STRING "3"
+
+#ifdef Py_TRACE_REFS
+ /* When we are tracing reference counts, rename module creation functions so
+    modules compiled with incompatible settings will generate a
+    link-time error. */
+ #define PyModule_Create2 PyModule_Create2TraceRefs
+ #define PyModule_FromDefAndSpec2 PyModule_FromDefAndSpec2TraceRefs
+#endif
+
+PyAPI_FUNC(PyObject *) PyModule_Create2(PyModuleDef*, int apiver);
+
+#ifdef Py_LIMITED_API
+#define PyModule_Create(module) \
+        PyModule_Create2((module), PYTHON_ABI_VERSION)
+#else
+#define PyModule_Create(module) \
+        PyModule_Create2((module), PYTHON_API_VERSION)
+#endif
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+/* New in 3.5 */
+PyAPI_FUNC(PyObject *) PyModule_FromDefAndSpec2(PyModuleDef *def,
+                                                PyObject *spec,
+                                                int module_api_version);
+
+#ifdef Py_LIMITED_API
+#define PyModule_FromDefAndSpec(module, spec) \
+    PyModule_FromDefAndSpec2((module), (spec), PYTHON_ABI_VERSION)
+#else
+#define PyModule_FromDefAndSpec(module, spec) \
+    PyModule_FromDefAndSpec2((module), (spec), PYTHON_API_VERSION)
+#endif /* Py_LIMITED_API */
+
+#endif /* New in 3.5 */
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_MODSUPPORT_H
+#  include "cpython/modsupport.h"
+#  undef Py_CPYTHON_MODSUPPORT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_MODSUPPORT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/moduleobject.h b/nanvix-port/cpython-headers/python3.12/moduleobject.h
new file mode 100644
index 000000000000..354d133e45ee
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/moduleobject.h
@@ -0,0 +1,119 @@
+
+/* Module object interface */
+
+#ifndef Py_MODULEOBJECT_H
+#define Py_MODULEOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_DATA(PyTypeObject) PyModule_Type;
+
+#define PyModule_Check(op) PyObject_TypeCheck((op), &PyModule_Type)
+#define PyModule_CheckExact(op) Py_IS_TYPE((op), &PyModule_Type)
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+PyAPI_FUNC(PyObject *) PyModule_NewObject(
+    PyObject *name
+    );
+#endif
+PyAPI_FUNC(PyObject *) PyModule_New(
+    const char *name            /* UTF-8 encoded string */
+    );
+PyAPI_FUNC(PyObject *) PyModule_GetDict(PyObject *);
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+PyAPI_FUNC(PyObject *) PyModule_GetNameObject(PyObject *);
+#endif
+PyAPI_FUNC(const char *) PyModule_GetName(PyObject *);
+Py_DEPRECATED(3.2) PyAPI_FUNC(const char *) PyModule_GetFilename(PyObject *);
+PyAPI_FUNC(PyObject *) PyModule_GetFilenameObject(PyObject *);
+#ifndef Py_LIMITED_API
+PyAPI_FUNC(void) _PyModule_Clear(PyObject *);
+PyAPI_FUNC(void) _PyModule_ClearDict(PyObject *);
+PyAPI_FUNC(int) _PyModuleSpec_IsInitializing(PyObject *);
+#endif
+PyAPI_FUNC(PyModuleDef*) PyModule_GetDef(PyObject*);
+PyAPI_FUNC(void*) PyModule_GetState(PyObject*);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+/* New in 3.5 */
+PyAPI_FUNC(PyObject *) PyModuleDef_Init(PyModuleDef*);
+PyAPI_DATA(PyTypeObject) PyModuleDef_Type;
+#endif
+
+typedef struct PyModuleDef_Base {
+  PyObject_HEAD
+  /* The function used to re-initialize the module.
+     This is only set for legacy (single-phase init) extension modules
+     and only used for those that support multiple initializations
+     (m_size >= 0).
+     It is set by _PyImport_LoadDynamicModuleWithSpec()
+     and _imp.create_builtin(). */
+  PyObject* (*m_init)(void);
+  /* The module's index into its interpreter's modules_by_index cache.
+     This is set for all extension modules but only used for legacy ones.
+     (See PyInterpreterState.modules_by_index for more info.)
+     It is set by PyModuleDef_Init(). */
+  Py_ssize_t m_index;
+  /* A copy of the module's __dict__ after the first time it was loaded.
+     This is only set/used for legacy modules that do not support
+     multiple initializations.
+     It is set by _PyImport_FixupExtensionObject(). */
+  PyObject* m_copy;
+} PyModuleDef_Base;
+
+#define PyModuleDef_HEAD_INIT {  \
+    PyObject_HEAD_INIT(_Py_NULL) \
+    _Py_NULL, /* m_init */       \
+    0,        /* m_index */      \
+    _Py_NULL, /* m_copy */       \
+  }
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+/* New in 3.5 */
+struct PyModuleDef_Slot {  /* one (slot id, value) pair; PyModuleDef.m_slots points at these */
+    int slot;     /* slot id: one of the Py_mod_* constants defined below */
+    void *value;  /* payload for that slot; for Py_mod_multiple_interpreters, one of the Py_MOD_* constants below */
+};
+
+#define Py_mod_create 1
+#define Py_mod_exec 2
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030c0000
+#  define Py_mod_multiple_interpreters 3
+#endif
+
+#ifndef Py_LIMITED_API
+#define _Py_mod_LAST_SLOT 3
+#endif
+
+#endif /* New in 3.5 */
+
+/* for Py_mod_multiple_interpreters: */
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030c0000
+#  define Py_MOD_MULTIPLE_INTERPRETERS_NOT_SUPPORTED ((void *)0)
+#  define Py_MOD_MULTIPLE_INTERPRETERS_SUPPORTED ((void *)1)
+#  define Py_MOD_PER_INTERPRETER_GIL_SUPPORTED ((void *)2)
+#endif
+
+struct PyModuleDef {  /* static description of an extension module */
+  PyModuleDef_Base m_base;   /* common header; initialize with PyModuleDef_HEAD_INIT */
+  const char* m_name;        /* module name -- presumably what PyModule_GetName() reports; confirm */
+  const char* m_doc;         /* module docstring text -- NOTE(review): presumably may be NULL; confirm */
+  Py_ssize_t m_size;         /* m_size >= 0 marks a legacy module as supporting multiple initializations (see m_init) */
+  PyMethodDef *m_methods;    /* table of PyMethodDef entries for the module */
+  PyModuleDef_Slot *m_slots; /* table of PyModuleDef_Slot entries (Py_mod_create, Py_mod_exec, ...) */
+  traverseproc m_traverse;   /* NOTE(review): presumably the GC traversal hook for module state -- confirm */
+  inquiry m_clear;           /* NOTE(review): presumably the GC clear hook for module state -- confirm */
+  freefunc m_free;           /* NOTE(review): presumably called when the module is deallocated -- confirm */
+};
+
+
+// Internal C API
+#ifdef Py_BUILD_CORE
+extern int _PyModule_IsExtension(PyObject *obj);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_MODULEOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/object.h b/nanvix-port/cpython-headers/python3.12/object.h
new file mode 100644
index 000000000000..0d94cf825534
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/object.h
@@ -0,0 +1,993 @@
+#ifndef Py_OBJECT_H
+#define Py_OBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Object and type object interface */
+
+/*
+Objects are structures allocated on the heap.  Special rules apply to
+the use of objects to ensure they are properly garbage-collected.
+Objects are never allocated statically or on the stack; they must be
+accessed through special macros and functions only.  (Type objects are
+exceptions to the first rule; the standard types are represented by
+statically initialized type objects, although work on type/class unification
+for Python 2.2 made it possible to have heap-allocated type objects too).
+
+An object has a 'reference count' that is increased or decreased when a
+pointer to the object is copied or deleted; when the reference count
+reaches zero there are no references to the object left and it can be
+removed from the heap.
+
+An object has a 'type' that determines what it represents and what kind
+of data it contains.  An object's type is fixed when it is created.
+Types themselves are represented as objects; an object contains a
+pointer to the corresponding type object.  The type itself has a type
+pointer pointing to the object representing the type 'type', which
+contains a pointer to itself!
+
+Objects do not float around in memory; once allocated an object keeps
+the same size and address.  Objects that must hold variable-size data
+can contain pointers to variable-size parts of the object.  Not all
+objects of the same type have the same size; but the size cannot change
+after allocation.  (These restrictions are made so a reference to an
+object can be simply a pointer -- moving an object would require
+updating all the pointers, and changing an object's size would require
+moving it if there was another object right next to it.)
+
+Objects are always accessed through pointers of the type 'PyObject *'.
+The type 'PyObject' is a structure that only contains the reference count
+and the type pointer.  The actual memory allocated for an object
+contains other data that can only be accessed after casting the pointer
+to a pointer to a longer structure type.  This longer type must start
+with the reference count and type fields; the macro PyObject_HEAD should be
+used for this (to accommodate future changes).  The implementation
+of a particular object type can cast the object pointer to the proper
+type and back.
+
+A standard interface exists for objects that contain an array of items
+whose size is determined when the object is allocated.
+*/
+
+#include "pystats.h"
+
+/* Py_DEBUG implies Py_REF_DEBUG. */
+#if defined(Py_DEBUG) && !defined(Py_REF_DEBUG)
+#  define Py_REF_DEBUG
+#endif
+
+#if defined(Py_LIMITED_API) && defined(Py_TRACE_REFS)
+#  error Py_LIMITED_API is incompatible with Py_TRACE_REFS
+#endif
+
+#ifdef Py_TRACE_REFS
+/* Define pointers to support a doubly-linked list of all live heap objects. */
+#define _PyObject_HEAD_EXTRA            \
+    PyObject *_ob_next;           \
+    PyObject *_ob_prev;
+/* Static initializers for the two link pointers above (note the trailing comma). */
+#define _PyObject_EXTRA_INIT _Py_NULL, _Py_NULL,
+
+#else  /* !Py_TRACE_REFS: both macros expand to nothing */
+#  define _PyObject_HEAD_EXTRA
+#  define _PyObject_EXTRA_INIT
+#endif  /* Py_TRACE_REFS */
+
+/* PyObject_HEAD defines the initial segment of every PyObject. */
+#define PyObject_HEAD                   PyObject ob_base;
+
+/*
+Immortalization:
+
+The following indicates the immortalization strategy depending on the amount
+of available bits in the reference count field. All strategies are backwards
+compatible but the specific reference count value or immortalization check
+might change depending on the specializations for the underlying system.
+
+Proper deallocation of immortal instances requires distinguishing between
+statically allocated immortal instances vs those promoted by the runtime to be
+immortal. The latter should be the only instances that require
+cleanup during runtime finalization.
+*/
+
+#if SIZEOF_VOID_P > 4
+/*
+In 64+ bit systems, an object will be marked as immortal by setting all of the
+lower 32 bits of the reference count field, which is equal to: 0xFFFFFFFF
+
+Using the lower 32 bits makes the value backwards compatible by allowing
+C-Extensions without the updated checks in Py_INCREF and Py_DECREF to safely
+increase and decrease the object's reference count. The object would lose its
+immortality, but the execution would still be correct.
+
+Reference count increases will use saturating arithmetic, taking advantage of
+having all the lower 32 bits set, which keeps the reference count from going
+beyond the refcount limit. Immortality checks for reference count decreases will
+be done by checking the sign bit of the lower 32 bits.
+*/
+#define _Py_IMMORTAL_REFCNT UINT_MAX
+
+#else
+/*
+In 32 bit systems, an object will be marked as immortal by setting all of the
+lower 30 bits of the reference count field, which is equal to: 0x3FFFFFFF
+
+Using the lower 30 bits makes the value backwards compatible by allowing
+C-Extensions without the updated checks in Py_INCREF and Py_DECREF to safely
+increase and decrease the object's reference count. The object would lose its
+immortality, but the execution would still be correct.
+
+Reference count increases and decreases will first go through an immortality
+check by comparing the reference count field to the immortality reference count.
+*/
+#define _Py_IMMORTAL_REFCNT (UINT_MAX >> 2)
+#endif
+
+// Static initializer for a PyObject header.  Core builds (Py_BUILD_CORE) start
+// at _Py_IMMORTAL_REFCNT; everyone else keeps the C-API expectation of 1.
+#ifdef Py_BUILD_CORE
+#define PyObject_HEAD_INIT(type)    \
+    {                               \
+        _PyObject_EXTRA_INIT        /* trace-refs links, or nothing */ \
+        { _Py_IMMORTAL_REFCNT },    /* anonymous refcount union */ \
+        (type)                      /* ob_type */ \
+    },  /* the trailing comma is part of the expansion */
+#else  /* !Py_BUILD_CORE */
+#define PyObject_HEAD_INIT(type) \
+    {                            \
+        _PyObject_EXTRA_INIT     /* trace-refs links, or nothing */ \
+        { 1 },                   /* anonymous refcount union */ \
+        (type)                   /* ob_type */ \
+    },  /* the trailing comma is part of the expansion */
+#endif /* Py_BUILD_CORE */
+
+#define PyVarObject_HEAD_INIT(type, size) \
+    {                                     \
+        PyObject_HEAD_INIT(type)          /* ob_base; the macro supplies its own comma */ \
+        (size)                            /* ob_size */ \
+    },
+
+/* PyObject_VAR_HEAD defines the initial segment of all variable-size
+ * container objects.  These end with a declaration of an array with 1
+ * element, but enough space is malloc'ed so that the array actually
+ * has room for ob_size elements.  Note that ob_size is an element count,
+ * not necessarily a byte count.
+ */
+#define PyObject_VAR_HEAD      PyVarObject ob_base;
+#define Py_INVALID_SIZE (Py_ssize_t)-1
+
+/* Nothing is actually declared to be a PyObject, but every pointer to
+ * a Python object can be cast to a PyObject*.  This is inheritance built
+ * by hand.  Similarly every pointer to a variable-size Python object can,
+ * in addition, be cast to PyVarObject*.
+ */
+struct _object {
+    _PyObject_HEAD_EXTRA  // _ob_next/_ob_prev when Py_TRACE_REFS is defined, otherwise empty
+
+#if (defined(__GNUC__) || defined(__clang__)) \
+        && !(defined __STDC_VERSION__ && __STDC_VERSION__ >= 201112L)
+    // On C99 and older, anonymous union is a GCC and clang extension
+    __extension__
+#endif
+#ifdef _MSC_VER
+    // Ignore MSC warning C4201: "nonstandard extension used:
+    // nameless struct/union"
+    __pragma(warning(push))
+    __pragma(warning(disable: 4201))
+#endif
+    union {
+       Py_ssize_t ob_refcnt;  // full-width reference count
+#if SIZEOF_VOID_P > 4
+       PY_UINT32_T ob_refcnt_split[2];  // same storage as two 32-bit halves; Py_INCREF() indexes it with PY_BIG_ENDIAN
+#endif
+    };
+#ifdef _MSC_VER
+    __pragma(warning(pop))
+#endif  // _MSC_VER
+
+    PyTypeObject *ob_type;  // the object's type; read with Py_TYPE(), written with Py_SET_TYPE()
+};
+
+/* Cast argument to PyObject* type. */
+#define _PyObject_CAST(op) _Py_CAST(PyObject*, (op))
+
+typedef struct {
+    PyObject ob_base;   /* common header comes first, so a PyVarObject* can be cast to PyObject* */
+    Py_ssize_t ob_size; /* Number of items in variable part */
+} PyVarObject;
+
+/* Cast argument to PyVarObject* type. */
+#define _PyVarObject_CAST(op) _Py_CAST(PyVarObject*, (op))
+
+
+// Test if the 'x' object is the 'y' object, the same as "x is y" in Python.
+PyAPI_FUNC(int) Py_Is(PyObject *x, PyObject *y);
+#define Py_Is(x, y) ((x) == (y))
+
+
+static inline Py_ssize_t Py_REFCNT(PyObject *ob) {
+    return ob->ob_refcnt;  // raw field value; immortal objects are not special-cased here
+}
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000  // macro form adds the PyObject* cast
+#  define Py_REFCNT(ob) Py_REFCNT(_PyObject_CAST(ob))
+#endif
+
+
+// Return the object's type pointer.  bpo-39573: use Py_SET_TYPE() to set it.
+static inline PyTypeObject* Py_TYPE(PyObject *ob) {
+    return ob->ob_type;
+}
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000  // macro form adds the PyObject* cast
+#  define Py_TYPE(ob) Py_TYPE(_PyObject_CAST(ob))
+#endif
+
+PyAPI_DATA(PyTypeObject) PyLong_Type;
+PyAPI_DATA(PyTypeObject) PyBool_Type;
+
+// Return ob_size of a variable-size object.  bpo-39573: use Py_SET_SIZE() to set it.
+static inline Py_ssize_t Py_SIZE(PyObject *ob) {
+    assert(ob->ob_type != &PyLong_Type);  // not valid for objects whose type is exactly int ...
+    assert(ob->ob_type != &PyBool_Type);  // ... or exactly bool (checked only when asserts are enabled)
+    return  _PyVarObject_CAST(ob)->ob_size;
+}
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000  // macro form adds the PyObject* cast
+#  define Py_SIZE(ob) Py_SIZE(_PyObject_CAST(ob))
+#endif
+
+static inline Py_ALWAYS_INLINE int _Py_IsImmortal(PyObject *op)
+{
+#if SIZEOF_VOID_P > 4
+    return _Py_CAST(PY_INT32_T, op->ob_refcnt) < 0;  // refcount viewed as a signed 32-bit value: negative means its sign bit is set
+#else
+    return op->ob_refcnt == _Py_IMMORTAL_REFCNT;  // exact match against the immortal marker value
+#endif
+}
+#define _Py_IsImmortal(op) _Py_IsImmortal(_PyObject_CAST(op))
+
+static inline int Py_IS_TYPE(PyObject *ob, PyTypeObject *type) {
+    return Py_TYPE(ob) == type;  // pointer identity only; see PyObject_TypeCheck() for the subtype-aware test
+}
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000  // macro form adds the PyObject* cast
+#  define Py_IS_TYPE(ob, type) Py_IS_TYPE(_PyObject_CAST(ob), (type))
+#endif
+
+
+static inline void Py_SET_REFCNT(PyObject *ob, Py_ssize_t refcnt) {
+    // Code that predates immortal objects may still try to overwrite a
+    // reference count.  The runtime tracks immortalized objects, so a
+    // write aimed at one of them is dropped here instead of letting an
+    // extension change its refcnt by accident; only mortal objects are
+    // actually updated.
+    if (!_Py_IsImmortal(ob)) {
+        ob->ob_refcnt = refcnt;
+    }
+}
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
+#  define Py_SET_REFCNT(ob, refcnt) Py_SET_REFCNT(_PyObject_CAST(ob), (refcnt))
+#endif
+
+
+static inline void Py_SET_TYPE(PyObject *ob, PyTypeObject *type) {
+    ob->ob_type = type;  // plain pointer store; no reference count is adjusted here
+}
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000  // macro form adds the PyObject* cast
+#  define Py_SET_TYPE(ob, type) Py_SET_TYPE(_PyObject_CAST(ob), type)
+#endif
+
+static inline void Py_SET_SIZE(PyVarObject *ob, Py_ssize_t size) {
+    assert(ob->ob_base.ob_type != &PyLong_Type);  // not valid for objects whose type is exactly int ...
+    assert(ob->ob_base.ob_type != &PyBool_Type);  // ... or exactly bool (checked only when asserts are enabled)
+    ob->ob_size = size;
+}
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000  // macro form adds the PyVarObject* cast
+#  define Py_SET_SIZE(ob, size) Py_SET_SIZE(_PyVarObject_CAST(ob), (size))
+#endif
+
+
+/*
+Type objects contain a string containing the type name (to help somewhat
+in debugging), the allocation parameters (see PyObject_New() and
+PyObject_NewVar()),
+and methods for accessing objects of the type.  Methods are optional, a
+nil pointer meaning that particular kind of access is not available for
+this type.  The Py_DECREF() macro uses the tp_dealloc method without
+checking for a nil pointer; it should always be implemented except if
+the implementation can guarantee that the reference count will never
+reach zero (e.g., for statically allocated type objects).
+
+NB: the methods for certain type groups are now contained in separate
+method blocks.
+*/
+
+typedef PyObject * (*unaryfunc)(PyObject *);
+typedef PyObject * (*binaryfunc)(PyObject *, PyObject *);
+typedef PyObject * (*ternaryfunc)(PyObject *, PyObject *, PyObject *);
+typedef int (*inquiry)(PyObject *);
+typedef Py_ssize_t (*lenfunc)(PyObject *);
+typedef PyObject *(*ssizeargfunc)(PyObject *, Py_ssize_t);
+typedef PyObject *(*ssizessizeargfunc)(PyObject *, Py_ssize_t, Py_ssize_t);
+typedef int(*ssizeobjargproc)(PyObject *, Py_ssize_t, PyObject *);
+typedef int(*ssizessizeobjargproc)(PyObject *, Py_ssize_t, Py_ssize_t, PyObject *);
+typedef int(*objobjargproc)(PyObject *, PyObject *, PyObject *);
+
+typedef int (*objobjproc)(PyObject *, PyObject *);
+typedef int (*visitproc)(PyObject *, void *);
+typedef int (*traverseproc)(PyObject *, visitproc, void *);
+
+
+typedef void (*freefunc)(void *);
+typedef void (*destructor)(PyObject *);
+typedef PyObject *(*getattrfunc)(PyObject *, char *);
+typedef PyObject *(*getattrofunc)(PyObject *, PyObject *);
+typedef int (*setattrfunc)(PyObject *, char *, PyObject *);
+typedef int (*setattrofunc)(PyObject *, PyObject *, PyObject *);
+typedef PyObject *(*reprfunc)(PyObject *);
+typedef Py_hash_t (*hashfunc)(PyObject *);
+typedef PyObject *(*richcmpfunc) (PyObject *, PyObject *, int);
+typedef PyObject *(*getiterfunc) (PyObject *);
+typedef PyObject *(*iternextfunc) (PyObject *);
+typedef PyObject *(*descrgetfunc) (PyObject *, PyObject *, PyObject *);
+typedef int (*descrsetfunc) (PyObject *, PyObject *, PyObject *);
+typedef int (*initproc)(PyObject *, PyObject *, PyObject *);
+typedef PyObject *(*newfunc)(PyTypeObject *, PyObject *, PyObject *);
+typedef PyObject *(*allocfunc)(PyTypeObject *, Py_ssize_t);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030c0000 // 3.12
+typedef PyObject *(*vectorcallfunc)(PyObject *callable, PyObject *const *args,
+                                    size_t nargsf, PyObject *kwnames);
+#endif
+
+typedef struct{
+    int slot;    /* slot id, see below */
+    void *pfunc; /* function pointer */
+} PyType_Slot;
+
+typedef struct{
+    const char* name;
+    int basicsize;
+    int itemsize;
+    unsigned int flags;
+    PyType_Slot *slots; /* terminated by slot==0. */
+} PyType_Spec;
+
+PyAPI_FUNC(PyObject*) PyType_FromSpec(PyType_Spec*);
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+PyAPI_FUNC(PyObject*) PyType_FromSpecWithBases(PyType_Spec*, PyObject*);
+#endif
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03040000
+PyAPI_FUNC(void*) PyType_GetSlot(PyTypeObject*, int);
+#endif
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03090000
+PyAPI_FUNC(PyObject*) PyType_FromModuleAndSpec(PyObject *, PyType_Spec *, PyObject *);
+PyAPI_FUNC(PyObject *) PyType_GetModule(PyTypeObject *);
+PyAPI_FUNC(void *) PyType_GetModuleState(PyTypeObject *);
+#endif
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030B0000
+PyAPI_FUNC(PyObject *) PyType_GetName(PyTypeObject *);
+PyAPI_FUNC(PyObject *) PyType_GetQualName(PyTypeObject *);
+#endif
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030C0000
+PyAPI_FUNC(PyObject *) PyType_FromMetaclass(PyTypeObject*, PyObject*, PyType_Spec*, PyObject*);
+PyAPI_FUNC(void *) PyObject_GetTypeData(PyObject *obj, PyTypeObject *cls);
+PyAPI_FUNC(Py_ssize_t) PyType_GetTypeDataSize(PyTypeObject *cls);
+#endif
+
+/* Generic type check */
+PyAPI_FUNC(int) PyType_IsSubtype(PyTypeObject *, PyTypeObject *);
+
+static inline int PyObject_TypeCheck(PyObject *ob, PyTypeObject *type) {
+    return Py_IS_TYPE(ob, type) ? 1 : (PyType_IsSubtype(Py_TYPE(ob), type) != 0);  // exact match first, then the subtype test
+}
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
+#  define PyObject_TypeCheck(ob, type) PyObject_TypeCheck(_PyObject_CAST(ob), (type))
+#endif
+
+PyAPI_DATA(PyTypeObject) PyType_Type; /* built-in 'type' */
+PyAPI_DATA(PyTypeObject) PyBaseObject_Type; /* built-in 'object' */
+PyAPI_DATA(PyTypeObject) PySuper_Type; /* built-in 'super' */
+
+PyAPI_FUNC(unsigned long) PyType_GetFlags(PyTypeObject*);
+
+PyAPI_FUNC(int) PyType_Ready(PyTypeObject *);
+PyAPI_FUNC(PyObject *) PyType_GenericAlloc(PyTypeObject *, Py_ssize_t);
+PyAPI_FUNC(PyObject *) PyType_GenericNew(PyTypeObject *,
+                                               PyObject *, PyObject *);
+PyAPI_FUNC(unsigned int) PyType_ClearCache(void);
+PyAPI_FUNC(void) PyType_Modified(PyTypeObject *);
+
+/* Generic operations on objects */
+PyAPI_FUNC(PyObject *) PyObject_Repr(PyObject *);
+PyAPI_FUNC(PyObject *) PyObject_Str(PyObject *);
+PyAPI_FUNC(PyObject *) PyObject_ASCII(PyObject *);
+PyAPI_FUNC(PyObject *) PyObject_Bytes(PyObject *);
+PyAPI_FUNC(PyObject *) PyObject_RichCompare(PyObject *, PyObject *, int);
+PyAPI_FUNC(int) PyObject_RichCompareBool(PyObject *, PyObject *, int);
+PyAPI_FUNC(PyObject *) PyObject_GetAttrString(PyObject *, const char *);
+PyAPI_FUNC(int) PyObject_SetAttrString(PyObject *, const char *, PyObject *);
+PyAPI_FUNC(int) PyObject_HasAttrString(PyObject *, const char *);
+PyAPI_FUNC(PyObject *) PyObject_GetAttr(PyObject *, PyObject *);
+PyAPI_FUNC(int) PyObject_SetAttr(PyObject *, PyObject *, PyObject *);
+PyAPI_FUNC(int) PyObject_HasAttr(PyObject *, PyObject *);
+PyAPI_FUNC(PyObject *) PyObject_SelfIter(PyObject *);
+PyAPI_FUNC(PyObject *) PyObject_GenericGetAttr(PyObject *, PyObject *);
+PyAPI_FUNC(int) PyObject_GenericSetAttr(PyObject *, PyObject *, PyObject *);
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+PyAPI_FUNC(int) PyObject_GenericSetDict(PyObject *, PyObject *, void *);
+#endif
+PyAPI_FUNC(Py_hash_t) PyObject_Hash(PyObject *);
+PyAPI_FUNC(Py_hash_t) PyObject_HashNotImplemented(PyObject *);
+PyAPI_FUNC(int) PyObject_IsTrue(PyObject *);
+PyAPI_FUNC(int) PyObject_Not(PyObject *);
+PyAPI_FUNC(int) PyCallable_Check(PyObject *);
+PyAPI_FUNC(void) PyObject_ClearWeakRefs(PyObject *);
+
+/* PyObject_Dir(obj) acts like Python builtins.dir(obj), returning a
+   list of strings.  PyObject_Dir(NULL) is like builtins.dir(),
+   returning the names of the current locals.  In this case, if there are
+   no current locals, NULL is returned, and PyErr_Occurred() is false.
+*/
+PyAPI_FUNC(PyObject *) PyObject_Dir(PyObject *);
+
+/* Pickle support. */
+#ifndef Py_LIMITED_API
+PyAPI_FUNC(PyObject *) _PyObject_GetState(PyObject *);
+#endif
+
+
+/* Helpers for printing recursive container types */
+PyAPI_FUNC(int) Py_ReprEnter(PyObject *);
+PyAPI_FUNC(void) Py_ReprLeave(PyObject *);
+
+/* Flag bits for printing: */
+#define Py_PRINT_RAW    1       /* No string quotes etc. */
+
+/*
+Type flags (tp_flags)
+
+These flags are used to change expected features and behavior for a
+particular type.
+
+Arbitration of the flag bit positions will need to be coordinated among
+all extension writers who publicly release their extensions (this will
+be fewer than you might expect!).
+
+Most flags were removed as of Python 3.0 to make room for new flags.  (Some
+flags are not for backwards compatibility but to indicate the presence of an
+optional feature; these flags remain of course.)
+
+Type definitions should use Py_TPFLAGS_DEFAULT for their tp_flags value.
+
+Code can use PyType_HasFeature(type_ob, flag_value) to test whether the
+given type object has a specified feature.
+*/
+
+#ifndef Py_LIMITED_API
+
+/* Track types initialized using _PyStaticType_InitBuiltin(). */
+#define _Py_TPFLAGS_STATIC_BUILTIN (1 << 1)
+
+/* Placement of weakref pointers is managed by the VM, not by the type.
+ * The VM will automatically set tp_weaklistoffset.
+ */
+#define Py_TPFLAGS_MANAGED_WEAKREF (1 << 3)
+
+/* Placement of dict (and values) pointers is managed by the VM, not by the type.
+ * The VM will automatically set tp_dictoffset.
+ */
+#define Py_TPFLAGS_MANAGED_DICT (1 << 4)
+
+#define Py_TPFLAGS_PREHEADER (Py_TPFLAGS_MANAGED_WEAKREF | Py_TPFLAGS_MANAGED_DICT)
+
+/* Set if instances of the type object are treated as sequences for pattern matching */
+#define Py_TPFLAGS_SEQUENCE (1 << 5)
+/* Set if instances of the type object are treated as mappings for pattern matching */
+#define Py_TPFLAGS_MAPPING (1 << 6)
+#endif
+
+/* Disallow creating instances of the type: set tp_new to NULL and don't create
+ * the "__new__" key in the type dictionary. */
+#define Py_TPFLAGS_DISALLOW_INSTANTIATION (1UL << 7)
+
+/* Set if the type object is immutable: type attributes cannot be set nor deleted */
+#define Py_TPFLAGS_IMMUTABLETYPE (1UL << 8)
+
+/* Set if the type object is dynamically allocated */
+#define Py_TPFLAGS_HEAPTYPE (1UL << 9)
+
+/* Set if the type allows subclassing */
+#define Py_TPFLAGS_BASETYPE (1UL << 10)
+
+/* Set if the type implements the vectorcall protocol (PEP 590) */
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030C0000
+#define Py_TPFLAGS_HAVE_VECTORCALL (1UL << 11)
+#ifndef Py_LIMITED_API
+// Backwards compatibility alias for API that was provisional in Python 3.8
+#define _Py_TPFLAGS_HAVE_VECTORCALL Py_TPFLAGS_HAVE_VECTORCALL
+#endif
+#endif
+
+/* Set if the type is 'ready' -- fully initialized */
+#define Py_TPFLAGS_READY (1UL << 12)
+
+/* Set while the type is being 'readied', to prevent recursive ready calls */
+#define Py_TPFLAGS_READYING (1UL << 13)
+
+/* Objects support garbage collection (see objimpl.h) */
+#define Py_TPFLAGS_HAVE_GC (1UL << 14)
+
+/* These two bits are reserved for Stackless Python; the next bit after them is 17 */
+#ifdef STACKLESS
+#define Py_TPFLAGS_HAVE_STACKLESS_EXTENSION (3UL << 15)
+#else
+#define Py_TPFLAGS_HAVE_STACKLESS_EXTENSION 0
+#endif
+
+/* Objects behave like an unbound method */
+#define Py_TPFLAGS_METHOD_DESCRIPTOR (1UL << 17)
+
+/* Object has up-to-date type attribute cache */
+#define Py_TPFLAGS_VALID_VERSION_TAG  (1UL << 19)
+
+/* Type is abstract and cannot be instantiated */
+#define Py_TPFLAGS_IS_ABSTRACT (1UL << 20)
+
+// This undocumented flag gives certain built-ins their unique pattern-matching
+// behavior, which allows a single positional subpattern to match against the
+// subject itself (rather than a mapped attribute on it):
+#define _Py_TPFLAGS_MATCH_SELF (1UL << 22)
+
+/* Items (ob_size*tp_itemsize) are found at the end of an instance's memory */
+#define Py_TPFLAGS_ITEMS_AT_END (1UL << 23)
+
+/* These flags are used to determine if a type is a subclass. */
+#define Py_TPFLAGS_LONG_SUBCLASS        (1UL << 24)
+#define Py_TPFLAGS_LIST_SUBCLASS        (1UL << 25)
+#define Py_TPFLAGS_TUPLE_SUBCLASS       (1UL << 26)
+#define Py_TPFLAGS_BYTES_SUBCLASS       (1UL << 27)
+#define Py_TPFLAGS_UNICODE_SUBCLASS     (1UL << 28)
+#define Py_TPFLAGS_DICT_SUBCLASS        (1UL << 29)
+#define Py_TPFLAGS_BASE_EXC_SUBCLASS    (1UL << 30)
+#define Py_TPFLAGS_TYPE_SUBCLASS        (1UL << 31)
+
+#define Py_TPFLAGS_DEFAULT  ( \
+                 Py_TPFLAGS_HAVE_STACKLESS_EXTENSION | \
+                0)
+
+/* NOTE: Some of the following flags reuse lower bits (removed as part of the
+ * Python 3.0 transition). */
+
+/* The following flags are kept for compatibility; in previous
+ * versions they indicated presence of newer tp_* fields on the
+ * type struct.
+ * Starting with 3.8, binary compatibility of C extensions across
+ * feature releases of Python is not supported anymore (except when
+ * using the stable ABI, in which all classes are created dynamically,
+ * using the interpreter's memory layout.)
+ * Note that older extensions using the stable ABI set these flags,
+ * so the bits must not be repurposed.
+ */
+#define Py_TPFLAGS_HAVE_FINALIZE (1UL << 0)
+#define Py_TPFLAGS_HAVE_VERSION_TAG   (1UL << 18)
+
+
+/*
+The macros Py_INCREF(op) and Py_DECREF(op) are used to increment or decrement
+reference counts.  Py_DECREF calls the object's deallocator function when
+the refcount falls to 0; for
+objects that don't contain references to other objects or heap memory
+this can be the standard function free().  Both macros can be used
+wherever a void expression is allowed.  The argument must not be a
+NULL pointer.  If it may be NULL, use Py_XINCREF/Py_XDECREF instead.
+The macro _Py_NewReference(op) initializes the reference count to 1, and
+in special builds (Py_REF_DEBUG, Py_TRACE_REFS) performs additional
+bookkeeping appropriate to the special build.
+
+We assume that the reference count field can never overflow; this can
+be proven when the size of the field is the same as the pointer size, so
+we ignore the possibility.  Provided a C int is at least 32 bits (which
+is implicitly assumed in many parts of this code), that's enough for
+about 2**31 references to an object.
+
+XXX The following became out of date in Python 2.2, but I'm not sure
+XXX what the full truth is now.  Certainly, heap-allocated type objects
+XXX can and should be deallocated.
+Type objects should never be deallocated; the type pointer in an object
+is not considered to be a reference to the type object, to save
+complications in the deallocation function.  (This is actually a
+decision that's up to the implementer of each new type so if you want,
+you can count such references to the type object.)
+*/
+
+#if defined(Py_REF_DEBUG) && !defined(Py_LIMITED_API)
+PyAPI_FUNC(void) _Py_NegativeRefcount(const char *filename, int lineno,
+                                      PyObject *op);
+PyAPI_FUNC(void) _Py_INCREF_IncRefTotal(void);
+PyAPI_FUNC(void) _Py_DECREF_DecRefTotal(void);
+#endif  // Py_REF_DEBUG && !Py_LIMITED_API
+
+PyAPI_FUNC(void) _Py_Dealloc(PyObject *);
+
+/*
+These are provided as conveniences to Python runtime embedders, so that
+they can have object code that is not dependent on Python compilation flags.
+*/
+PyAPI_FUNC(void) Py_IncRef(PyObject *);
+PyAPI_FUNC(void) Py_DecRef(PyObject *);
+
+// Similar to Py_IncRef() and Py_DecRef() but the argument must be non-NULL.
+// Private functions used by Py_INCREF() and Py_DECREF().
+PyAPI_FUNC(void) _Py_IncRef(PyObject *);
+PyAPI_FUNC(void) _Py_DecRef(PyObject *);
+
+static inline Py_ALWAYS_INLINE void Py_INCREF(PyObject *op)
+{
+#if defined(Py_LIMITED_API) && (Py_LIMITED_API+0 >= 0x030c0000 || defined(Py_REF_DEBUG))
+    // Stable ABI implements Py_INCREF() as a function call on limited C API
+    // version 3.12 and newer, and on Python built in debug mode. _Py_IncRef()
+    // was added to Python 3.10.0a7, use Py_IncRef() on older Python versions.
+    // Py_IncRef() accepts NULL whereas _Py_IncRef() doesn't.
+#  if Py_LIMITED_API+0 >= 0x030a00A7
+    _Py_IncRef(op);
+#  else
+    Py_IncRef(op);
+#  endif
+#else
+    // Non-limited C API, and limited C API older than 3.12 without
+    // Py_REF_DEBUG, access PyObject.ob_refcnt directly.
+#if SIZEOF_VOID_P > 4
+    // Portable saturated add on the 32-bit half indexed by PY_BIG_ENDIAN
+    PY_UINT32_T cur_refcnt = op->ob_refcnt_split[PY_BIG_ENDIAN];
+    PY_UINT32_T new_refcnt = cur_refcnt + 1;
+    if (new_refcnt == 0) {  // wrapped around: the half was already all ones, leave it untouched
+        return;
+    }
+    op->ob_refcnt_split[PY_BIG_ENDIAN] = new_refcnt;
+#else
+    // Explicitly check immortality against the immortal value
+    if (_Py_IsImmortal(op)) {
+        return;
+    }
+    op->ob_refcnt++;
+#endif  // SIZEOF_VOID_P > 4
+    _Py_INCREF_STAT_INC();  // not reached when one of the early returns above fires
+#ifdef Py_REF_DEBUG
+    _Py_INCREF_IncRefTotal();
+#endif
+#endif  // Py_LIMITED_API && (>= 3.12 || Py_REF_DEBUG)
+}
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000  // macro form adds the PyObject* cast
+#  define Py_INCREF(op) Py_INCREF(_PyObject_CAST(op))
+#endif
+
+#if defined(Py_LIMITED_API) && (Py_LIMITED_API+0 >= 0x030c0000 || defined(Py_REF_DEBUG))
+// Stable ABI implements Py_DECREF() as a function call on limited C API
+// version 3.12 and newer, and on Python built in debug mode. _Py_DecRef() was
+// added to Python 3.10.0a7, use Py_DecRef() on older Python versions.
+// Py_DecRef() accepts NULL whereas _Py_DecRef() doesn't.
+static inline void Py_DECREF(PyObject *op) {
+#  if Py_LIMITED_API+0 >= 0x030a00A7
+    _Py_DecRef(op);
+#  else
+    Py_DecRef(op);
+#  endif
+}
+#define Py_DECREF(op) Py_DECREF(_PyObject_CAST(op))
+
+#elif defined(Py_REF_DEBUG)  // non-limited C API with Py_REF_DEBUG: the macro below passes the call site
+static inline void Py_DECREF(const char *filename, int lineno, PyObject *op)
+{
+    if (op->ob_refcnt <= 0) {  // already non-positive before decrementing: report it
+        _Py_NegativeRefcount(filename, lineno, op);
+    }
+    if (_Py_IsImmortal(op)) {  // immortal objects are never decremented
+        return;
+    }
+    _Py_DECREF_STAT_INC();
+    _Py_DECREF_DecRefTotal();
+    if (--op->ob_refcnt == 0) {  // last reference dropped
+        _Py_Dealloc(op);
+    }
+}
+#define Py_DECREF(op) Py_DECREF(__FILE__, __LINE__, _PyObject_CAST(op))
+
+#else  // no Py_REF_DEBUG: non-limited C API, or limited C API older than 3.12
+static inline Py_ALWAYS_INLINE void Py_DECREF(PyObject *op)
+{
+    // Non-limited C API and limited C API older than 3.12 access
+    // PyObject.ob_refcnt directly.
+    if (_Py_IsImmortal(op)) {
+        return;
+    }
+    _Py_DECREF_STAT_INC();
+    if (--op->ob_refcnt == 0) {
+        _Py_Dealloc(op);
+    }
+}
+#define Py_DECREF(op) Py_DECREF(_PyObject_CAST(op))
+#endif
+
+
+/* Safely decref `op` and set `op` to NULL, especially useful in tp_clear
+ * and tp_dealloc implementations.
+ *
+ * Note that "the obvious" code can be deadly:
+ *
+ *     Py_XDECREF(op);
+ *     op = NULL;
+ *
+ * Typically, `op` is something like self->containee, and `self` is done
+ * using its `containee` member.  In the code sequence above, suppose
+ * `containee` is non-NULL with a refcount of 1.  Its refcount falls to
+ * 0 on the first line, which can trigger an arbitrary amount of code,
+ * possibly including finalizers (like __del__ methods or weakref callbacks)
+ * coded in Python, which in turn can release the GIL and allow other threads
+ * to run, etc.  Such code may even invoke methods of `self` again, or cause
+ * cyclic gc to trigger, but-- oops! --self->containee still points to the
+ * object being torn down, and it may be in an insane state while being torn
+ * down.  This has in fact been a rich historic source of miserable (rare &
+ * hard-to-diagnose) segfaulting (and other) bugs.
+ *
+ * The safe way is:
+ *
+ *      Py_CLEAR(op);
+ *
+ * That arranges to set `op` to NULL _before_ decref'ing, so that any code
+ * triggered as a side-effect of `op` getting torn down no longer believes
+ * `op` points to a valid object.
+ *
+ * There are cases where it's safe to use the naive code, but they're brittle.
+ * For example, if `op` points to a Python integer, you know that destroying
+ * one of those can't cause problems -- but in part that relies on that
+ * Python integers aren't currently weakly referencable.  Best practice is
+ * to use Py_CLEAR() even if you can't think of a reason for why you need to.
+ *
+ * gh-98724: Use a temporary variable to only evaluate the macro argument once,
+ * to avoid the duplication of side effects if the argument has side effects.
+ *
+ * gh-99701: If the PyObject* type is used with casting arguments to PyObject*,
+ * the code can be miscompiled with strict aliasing because of type punning.
+ * With strict aliasing, a compiler considers that two pointers of different
+ * types cannot read or write the same memory which enables optimization
+ * opportunities.
+ *
+ * If available, use _Py_TYPEOF() to use the 'op' type for temporary variables,
+ * and so avoid type punning. Otherwise, use memcpy() which causes type erasure
+ * and so prevents the compiler from reusing an old cached 'op' value after
+ * Py_CLEAR().
+ */
+#ifdef _Py_TYPEOF  /* temporaries keep the type of 'op', so no pointer cast is needed */
+#define Py_CLEAR(op) \
+    do { \
+        _Py_TYPEOF(op)* _tmp_op_ptr = &(op); \
+        _Py_TYPEOF(op) _tmp_old_op = (*_tmp_op_ptr); \
+        if (_tmp_old_op != NULL) { \
+            *_tmp_op_ptr = _Py_NULL; /* clear the slot before the decref */ \
+            Py_DECREF(_tmp_old_op); \
+        } \
+    } while (0)
+#else  /* no _Py_TYPEOF: write NULL through memcpy() instead */
+#define Py_CLEAR(op) \
+    do { \
+        PyObject **_tmp_op_ptr = _Py_CAST(PyObject**, &(op)); \
+        PyObject *_tmp_old_op = (*_tmp_op_ptr); \
+        if (_tmp_old_op != NULL) { \
+            PyObject *_null_ptr = _Py_NULL; \
+            memcpy(_tmp_op_ptr, &_null_ptr, sizeof(PyObject*)); /* clear the slot before the decref */ \
+            Py_DECREF(_tmp_old_op); \
+        } \
+    } while (0)
+#endif  /* _Py_TYPEOF */
+
+
+/* Function to use in case the object pointer can be NULL: */
+static inline void Py_XINCREF(PyObject *op)
+{
+    if (op == _Py_NULL) { return; }  // NULL is tolerated: nothing to retain
+    Py_INCREF(op);
+    // Non-NULL pointers take the regular Py_INCREF() path above.
+}
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
+#  define Py_XINCREF(op) Py_XINCREF(_PyObject_CAST(op))
+#endif
+
+static inline void Py_XDECREF(PyObject *op)
+{
+    if (op == _Py_NULL) { return; }  // NULL is tolerated: nothing to release
+    Py_DECREF(op);
+    // Non-NULL pointers take the regular Py_DECREF() path above.
+}
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
+#  define Py_XDECREF(op) Py_XDECREF(_PyObject_CAST(op))
+#endif
+
+// Create a new strong reference to an object:
+// increment the reference count of the object and return the object.
+PyAPI_FUNC(PyObject*) Py_NewRef(PyObject *obj);
+
+// Similar to Py_NewRef(), but the object can be NULL.
+PyAPI_FUNC(PyObject*) Py_XNewRef(PyObject *obj);
+
+static inline PyObject* _Py_NewRef(PyObject *obj)
+{
+    Py_INCREF(obj);
+    return obj;
+}
+
+static inline PyObject* _Py_XNewRef(PyObject *obj)  // inline body of Py_XNewRef()
+{
+    Py_XINCREF(obj);  // NULL-tolerant: no incref when 'obj' is NULL
+    return obj;       // may therefore be NULL
+}
+
+// Py_NewRef() and Py_XNewRef() are exported as functions for the stable ABI.
+// The names are overridden by macros that expand to static inline functions
+// for best performance.
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
+#  define Py_NewRef(obj) _Py_NewRef(_PyObject_CAST(obj))
+#  define Py_XNewRef(obj) _Py_XNewRef(_PyObject_CAST(obj))
+#else
+#  define Py_NewRef(obj) _Py_NewRef(obj)
+#  define Py_XNewRef(obj) _Py_XNewRef(obj)
+#endif
+
+
+/*
+_Py_NoneStruct is an object of undefined type which can be used in contexts
+where NULL (nil) is not suitable (since NULL often means 'error').
+
+Don't forget to apply Py_INCREF() when returning this value!!!
+*/
+PyAPI_DATA(PyObject) _Py_NoneStruct; /* Don't use this directly */
+#define Py_None (&_Py_NoneStruct)
+
+// Test if an object is the None singleton, the same as "x is None" in Python.
+PyAPI_FUNC(int) Py_IsNone(PyObject *x);
+#define Py_IsNone(x) Py_Is((x), Py_None)
+
+/* Macro for returning Py_None from a function */
+#define Py_RETURN_NONE return Py_None
+
+/*
+Py_NotImplemented is a singleton used to signal that an operation is
+not implemented for a given type combination.
+*/
+PyAPI_DATA(PyObject) _Py_NotImplementedStruct; /* Don't use this directly */
+#define Py_NotImplemented (&_Py_NotImplementedStruct)
+
+/* Macro for returning Py_NotImplemented from a function */
+#define Py_RETURN_NOTIMPLEMENTED return Py_NotImplemented
+
+/* Rich comparison opcodes */
+#define Py_LT 0
+#define Py_LE 1
+#define Py_EQ 2
+#define Py_NE 3
+#define Py_GT 4
+#define Py_GE 5
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
+/* Result of calling PyIter_Send */
+typedef enum {
+    PYGEN_RETURN = 0,
+    PYGEN_ERROR = -1,
+    PYGEN_NEXT = 1,
+} PySendResult;
+#endif
+
+/*
+ * Rich-comparison helper: applies the C operator selected by 'op'
+ * (Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE) to val1 and val2, then leaves
+ * the enclosing function via Py_RETURN_TRUE or Py_RETURN_FALSE; any other
+ * 'op' hits Py_UNREACHABLE(). A macro so any C-comparable type can be used.
+ */
+#define Py_RETURN_RICHCOMPARE(val1, val2, op)                               \
+    do {                                                                    \
+        switch (op) {                                                       \
+        case Py_EQ: if ((val1) == (val2)) Py_RETURN_TRUE; Py_RETURN_FALSE;  \
+        case Py_NE: if ((val1) != (val2)) Py_RETURN_TRUE; Py_RETURN_FALSE;  \
+        case Py_LT: if ((val1) < (val2)) Py_RETURN_TRUE; Py_RETURN_FALSE;   \
+        case Py_GT: if ((val1) > (val2)) Py_RETURN_TRUE; Py_RETURN_FALSE;   \
+        case Py_LE: if ((val1) <= (val2)) Py_RETURN_TRUE; Py_RETURN_FALSE;  \
+        case Py_GE: if ((val1) >= (val2)) Py_RETURN_TRUE; Py_RETURN_FALSE;  \
+        default:                                                            \
+            Py_UNREACHABLE();                                               \
+        }                                                                   \
+    } while (0)
+
+
+/*
+More conventions
+================
+
+Argument Checking
+-----------------
+
+Functions that take objects as arguments normally don't check for nil
+arguments, but they do check the type of the argument, and return an
+error if the function doesn't apply to the type.
+
+Failure Modes
+-------------
+
+Functions may fail for a variety of reasons, including running out of
+memory.  This is communicated to the caller in two ways: an error string
+is set (see errors.h), and the function result differs: functions that
+normally return a pointer return NULL for failure, functions returning
+an integer return -1 (which could be a legal return value too!), and
+other functions return 0 for success and -1 for failure.
+Callers should always check for errors before using the result.  If
+an error was set, the caller must either explicitly clear it, or pass
+the error on to its caller.
+
+Reference Counts
+----------------
+
+It takes a while to get used to the proper usage of reference counts.
+
+Functions that create an object set the reference count to 1; such new
+objects must be stored somewhere or destroyed again with Py_DECREF().
+Some functions that 'store' objects, such as PyTuple_SetItem() and
+PyList_SetItem(),
+don't increment the reference count of the object, since the most
+frequent use is to store a fresh object.  Functions that 'retrieve'
+objects, such as PyTuple_GetItem() and PyDict_GetItemString(), also
+don't increment
+the reference count, since most frequently the object is only looked at
+quickly.  Thus, to retrieve an object and store it again, the caller
+must call Py_INCREF() explicitly.
+
+NOTE: functions that 'consume' a reference count, like
+PyList_SetItem(), consume the reference even if the object wasn't
+successfully stored, to simplify error handling.
+
+It seems attractive to make other functions that take an object as
+argument consume a reference count; however, this may quickly get
+confusing (even the current practice is already confusing).  Consider
+it carefully, it may save lots of calls to Py_INCREF() and Py_DECREF() at
+times.
+*/
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_OBJECT_H
+#  include "cpython/object.h"
+#  undef Py_CPYTHON_OBJECT_H
+#endif
+
+
+static inline int
+PyType_HasFeature(PyTypeObject *type, unsigned long feature)
+{
+    unsigned long flags;  // flag bits of 'type', fetched below
+#ifdef Py_LIMITED_API
+    // PyTypeObject is opaque in the limited C API, so call PyType_GetFlags()
+    flags = PyType_GetFlags(type);
+#else
+    flags = type->tp_flags;  // full API: read the struct field directly
+#endif
+    return ((flags & feature) != 0);  // 1 if any bit of 'feature' is set, else 0
+}
+
+#define PyType_FastSubclass(type, flag) PyType_HasFeature((type), (flag))
+
+static inline int PyType_Check(PyObject *op) {  // tests Py_TPFLAGS_TYPE_SUBCLASS on Py_TYPE(op)
+    return PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_TYPE_SUBCLASS);
+}
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
+#  define PyType_Check(op) PyType_Check(_PyObject_CAST(op))  // macro form casts its argument
+#endif
+
+#define _PyType_CAST(op) \
+    (assert(PyType_Check(op)), _Py_CAST(PyTypeObject*, (op)))
+
+static inline int PyType_CheckExact(PyObject *op) {  // exact-type test against PyType_Type
+    return Py_IS_TYPE(op, &PyType_Type);
+}
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
+#  define PyType_CheckExact(op) PyType_CheckExact(_PyObject_CAST(op))  // macro form casts its argument
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif   // !Py_OBJECT_H
diff --git a/nanvix-port/cpython-headers/python3.12/objimpl.h b/nanvix-port/cpython-headers/python3.12/objimpl.h
new file mode 100644
index 000000000000..ef871c5ea93e
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/objimpl.h
@@ -0,0 +1,234 @@
+/* The PyObject_ memory family:  high-level object memory interfaces.
+   See pymem.h for the low-level PyMem_ family.
+*/
+
+#ifndef Py_OBJIMPL_H
+#define Py_OBJIMPL_H
+
+#include "pymem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* BEWARE:
+
+   Each interface exports both functions and macros.  Extension modules should
+   use the functions, to ensure binary compatibility across Python versions.
+   Because the Python implementation is free to change internal details, and
+   the macros may (or may not) expose details for speed, if you do use the
+   macros you must recompile your extensions with each Python release.
+
+   Never mix calls to PyObject_ memory functions with calls to the platform
+   malloc/realloc/calloc/free, or with calls to PyMem_.
+*/
+
+/*
+Functions and macros for modules that implement new object types.
+
+ - PyObject_New(type, typeobj) allocates memory for a new object of the given
+   type, and initializes part of it.  'type' must be the C structure type used
+   to represent the object, and 'typeobj' the address of the corresponding
+   type object.  Reference count and type pointer are filled in; the rest of
+   the bytes of the object are *undefined*!  The resulting expression type is
+   'type *'.  The size of the object is determined by the tp_basicsize field
+   of the type object.
+
+ - PyObject_NewVar(type, typeobj, n) is similar but allocates a variable-size
+   object with room for n items.  In addition to the refcount and type pointer
+   fields, this also fills in the ob_size field.
+
+ - PyObject_Free(op) releases the memory allocated for an object.  It does not
+   run a destructor -- it only frees the memory.  PyObject_Del is identical.
+
+ - PyObject_Init(op, typeobj) and PyObject_InitVar(op, typeobj, n) don't
+   allocate memory.  Instead of a 'type' parameter, they take a pointer to a
+   new object (allocated by an arbitrary allocator), and initialize its object
+   header fields.
+
+Note that objects created with PyObject_{New, NewVar} are allocated using the
+specialized Python allocator (implemented in obmalloc.c), if WITH_PYMALLOC is
+enabled.  In addition, a special debugging allocator is used if Py_DEBUG
+macro is also defined.
+
+In case a specific form of memory management is needed (for example, if you
+must use the platform malloc heap(s), or shared memory, or C++ local storage or
+operator new), you must first allocate the object with your custom allocator,
+then pass its pointer to PyObject_{Init, InitVar} for filling in its Python-
+specific fields:  reference count, type pointer, possibly others.  You should
+be aware that Python has no control over these objects because they don't
+cooperate with the Python memory manager.  Such objects may not be eligible
+for automatic garbage collection and you have to make sure that they are
+released accordingly whenever their destructor gets called (cf. the specific
+form of memory management you're using).
+
+Unless you have specific memory management requirements, use
+PyObject_{New, NewVar, Del}.
+*/
+
+/*
+ * Raw object memory interface
+ * ===========================
+ */
+
+/* Functions to call the same malloc/realloc/free as used by Python's
+   object allocator.  If WITH_PYMALLOC is enabled, these may differ from
+   the platform malloc/realloc/free.  The Python object allocator is
+   designed for fast, cache-conscious allocation of many "small" objects,
+   and with low hidden memory overhead.
+
+   PyObject_Malloc(0) returns a unique non-NULL pointer if possible.
+
+   PyObject_Realloc(NULL, n) acts like PyObject_Malloc(n).
+   PyObject_Realloc(p != NULL, 0) does not return  NULL, or free the memory
+   at p.
+
+   Returned pointers must be checked for NULL explicitly; no action is
+   performed on failure other than to return NULL (no warning is printed, no
+   exception is set, etc).
+
+   For allocating objects, use PyObject_{New, NewVar} instead whenever
+   possible.  The PyObject_{Malloc, Realloc, Free} family is exposed
+   so that you can exploit Python's small-block allocator for non-object
+   uses.  If you must use these routines to allocate object memory, make sure
+   the object gets initialized via PyObject_{Init, InitVar} after obtaining
+   the raw memory.
+*/
+PyAPI_FUNC(void *) PyObject_Malloc(size_t size);
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+PyAPI_FUNC(void *) PyObject_Calloc(size_t nelem, size_t elsize);
+#endif
+PyAPI_FUNC(void *) PyObject_Realloc(void *ptr, size_t new_size);
+PyAPI_FUNC(void) PyObject_Free(void *ptr);
+
+
+// Deprecated aliases only kept for backward compatibility.
+// PyObject_Del and PyObject_DEL are defined with no parameter to be able to
+// use them as function pointers (ex: tp_free = PyObject_Del).
+#define PyObject_MALLOC         PyObject_Malloc
+#define PyObject_REALLOC        PyObject_Realloc
+#define PyObject_FREE           PyObject_Free
+#define PyObject_Del            PyObject_Free
+#define PyObject_DEL            PyObject_Free
+
+
+/*
+ * Generic object allocator interface
+ * ==================================
+ */
+
+/* Functions */
+PyAPI_FUNC(PyObject *) PyObject_Init(PyObject *, PyTypeObject *);
+PyAPI_FUNC(PyVarObject *) PyObject_InitVar(PyVarObject *,
+                                           PyTypeObject *, Py_ssize_t);
+
+#define PyObject_INIT(op, typeobj) \
+    PyObject_Init(_PyObject_CAST(op), (typeobj))
+#define PyObject_INIT_VAR(op, typeobj, size) \
+    PyObject_InitVar(_PyVarObject_CAST(op), (typeobj), (size))
+
+
+PyAPI_FUNC(PyObject *) _PyObject_New(PyTypeObject *);
+PyAPI_FUNC(PyVarObject *) _PyObject_NewVar(PyTypeObject *, Py_ssize_t);
+
+#define PyObject_New(type, typeobj) ((type *)_PyObject_New(typeobj))
+
+// Alias to PyObject_New(). In Python 3.8, PyObject_NEW() called directly
+// PyObject_MALLOC() with _PyObject_SIZE().
+#define PyObject_NEW(type, typeobj) PyObject_New(type, (typeobj))
+
+#define PyObject_NewVar(type, typeobj, n) \
+                ( (type *) _PyObject_NewVar((typeobj), (n)) )
+
+// Alias to PyObject_NewVar(). In Python 3.8, PyObject_NEW_VAR() called
+// directly PyObject_MALLOC() with _PyObject_VAR_SIZE().
+#define PyObject_NEW_VAR(type, typeobj, n) PyObject_NewVar(type, (typeobj), (n))
+
+
+/*
+ * Garbage Collection Support
+ * ==========================
+ */
+
+/* C equivalent of gc.collect(). */
+PyAPI_FUNC(Py_ssize_t) PyGC_Collect(void);
+/* C API for controlling the state of the garbage collector */
+PyAPI_FUNC(int) PyGC_Enable(void);
+PyAPI_FUNC(int) PyGC_Disable(void);
+PyAPI_FUNC(int) PyGC_IsEnabled(void);
+
+
+#if !defined(Py_LIMITED_API)
+/* Visit all live GC-capable objects, similar to gc.get_objects(None). The
+ * supplied callback is called on every such object with the void* arg set
+ * to the supplied arg. Returning 0 from the callback ends iteration, returning
+ * 1 allows iteration to continue. Returning any other value may result in
+ * undefined behaviour.
+ *
+ * If new objects are (de)allocated by the callback it is undefined if they
+ * will be visited.
+ *
+ * Garbage collection is disabled during operation. Explicitly running a
+ * collection in the callback may lead to undefined behaviour e.g. visiting the
+ * same objects multiple times or not at all.
+ */
+typedef int (*gcvisitobjects_t)(PyObject*, void*);
+PyAPI_FUNC(void) PyUnstable_GC_VisitObjects(gcvisitobjects_t callback, void* arg);
+#endif
+
+/* Test if a type has a GC head */
+#define PyType_IS_GC(t) PyType_HasFeature((t), Py_TPFLAGS_HAVE_GC)
+
+PyAPI_FUNC(PyVarObject *) _PyObject_GC_Resize(PyVarObject *, Py_ssize_t);
+#define PyObject_GC_Resize(type, op, n) \
+                ( (type *) _PyObject_GC_Resize(_PyVarObject_CAST(op), (n)) )
+
+
+
+PyAPI_FUNC(PyObject *) _PyObject_GC_New(PyTypeObject *);
+PyAPI_FUNC(PyVarObject *) _PyObject_GC_NewVar(PyTypeObject *, Py_ssize_t);
+
+/* Tell the GC to track this object.
+ *
+ * See also private _PyObject_GC_TRACK() macro. */
+PyAPI_FUNC(void) PyObject_GC_Track(void *);
+
+/* Tell the GC to stop tracking this object.
+ *
+ * See also private _PyObject_GC_UNTRACK() macro. */
+PyAPI_FUNC(void) PyObject_GC_UnTrack(void *);
+
+PyAPI_FUNC(void) PyObject_GC_Del(void *);
+
+#define PyObject_GC_New(type, typeobj) \
+    _Py_CAST(type*, _PyObject_GC_New(typeobj))
+#define PyObject_GC_NewVar(type, typeobj, n) \
+    _Py_CAST(type*, _PyObject_GC_NewVar((typeobj), (n)))
+
+PyAPI_FUNC(int) PyObject_GC_IsTracked(PyObject *);
+PyAPI_FUNC(int) PyObject_GC_IsFinalized(PyObject *);
+
+/* Utility macro to help write tp_traverse functions.
+ * To use this macro, the tp_traverse function must name its arguments
+ * "visit" and "arg".  This is intended to keep tp_traverse functions
+ * looking as much alike as possible.
+ */
+#define Py_VISIT(op)                                                    \
+    do {                                                                \
+        if (op) { /* nothing to visit when op is NULL */                \
+            int vret = visit(_PyObject_CAST(op), arg);                  \
+            if (vret)                                                   \
+                return vret; /* propagate nonzero visit() result */     \
+        }                                                               \
+    } while (0)
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_OBJIMPL_H
+#  include "cpython/objimpl.h"
+#  undef Py_CPYTHON_OBJIMPL_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_OBJIMPL_H */
diff --git a/nanvix-port/cpython-headers/python3.12/opcode.h b/nanvix-port/cpython-headers/python3.12/opcode.h
new file mode 100644
index 000000000000..9806511ba428
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/opcode.h
@@ -0,0 +1,271 @@
+// Auto-generated by Tools/build/generate_opcode_h.py from Lib/opcode.py
+
+#ifndef Py_OPCODE_H
+#define Py_OPCODE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Instruction opcodes for compiled code */
+#define CACHE                                    0
+#define POP_TOP                                  1
+#define PUSH_NULL                                2
+#define INTERPRETER_EXIT                         3
+#define END_FOR                                  4
+#define END_SEND                                 5
+#define NOP                                      9
+#define UNARY_NEGATIVE                          11
+#define UNARY_NOT                               12
+#define UNARY_INVERT                            15
+#define RESERVED                                17
+#define BINARY_SUBSCR                           25
+#define BINARY_SLICE                            26
+#define STORE_SLICE                             27
+#define GET_LEN                                 30
+#define MATCH_MAPPING                           31
+#define MATCH_SEQUENCE                          32
+#define MATCH_KEYS                              33
+#define PUSH_EXC_INFO                           35
+#define CHECK_EXC_MATCH                         36
+#define CHECK_EG_MATCH                          37
+#define WITH_EXCEPT_START                       49
+#define GET_AITER                               50
+#define GET_ANEXT                               51
+#define BEFORE_ASYNC_WITH                       52
+#define BEFORE_WITH                             53
+#define END_ASYNC_FOR                           54
+#define CLEANUP_THROW                           55
+#define STORE_SUBSCR                            60
+#define DELETE_SUBSCR                           61
+#define GET_ITER                                68
+#define GET_YIELD_FROM_ITER                     69
+#define LOAD_BUILD_CLASS                        71
+#define LOAD_ASSERTION_ERROR                    74
+#define RETURN_GENERATOR                        75
+#define RETURN_VALUE                            83
+#define SETUP_ANNOTATIONS                       85
+#define LOAD_LOCALS                             87
+#define POP_EXCEPT                              89
+#define HAVE_ARGUMENT                           90
+#define STORE_NAME                              90
+#define DELETE_NAME                             91
+#define UNPACK_SEQUENCE                         92
+#define FOR_ITER                                93
+#define UNPACK_EX                               94
+#define STORE_ATTR                              95
+#define DELETE_ATTR                             96
+#define STORE_GLOBAL                            97
+#define DELETE_GLOBAL                           98
+#define SWAP                                    99
+#define LOAD_CONST                             100
+#define LOAD_NAME                              101
+#define BUILD_TUPLE                            102
+#define BUILD_LIST                             103
+#define BUILD_SET                              104
+#define BUILD_MAP                              105
+#define LOAD_ATTR                              106
+#define COMPARE_OP                             107
+#define IMPORT_NAME                            108
+#define IMPORT_FROM                            109
+#define JUMP_FORWARD                           110
+#define POP_JUMP_IF_FALSE                      114
+#define POP_JUMP_IF_TRUE                       115
+#define LOAD_GLOBAL                            116
+#define IS_OP                                  117
+#define CONTAINS_OP                            118
+#define RERAISE                                119
+#define COPY                                   120
+#define RETURN_CONST                           121
+#define BINARY_OP                              122
+#define SEND                                   123
+#define LOAD_FAST                              124
+#define STORE_FAST                             125
+#define DELETE_FAST                            126
+#define LOAD_FAST_CHECK                        127
+#define POP_JUMP_IF_NOT_NONE                   128
+#define POP_JUMP_IF_NONE                       129
+#define RAISE_VARARGS                          130
+#define GET_AWAITABLE                          131
+#define MAKE_FUNCTION                          132
+#define BUILD_SLICE                            133
+#define JUMP_BACKWARD_NO_INTERRUPT             134
+#define MAKE_CELL                              135
+#define LOAD_CLOSURE                           136
+#define LOAD_DEREF                             137
+#define STORE_DEREF                            138
+#define DELETE_DEREF                           139
+#define JUMP_BACKWARD                          140
+#define LOAD_SUPER_ATTR                        141
+#define CALL_FUNCTION_EX                       142
+#define LOAD_FAST_AND_CLEAR                    143
+#define EXTENDED_ARG                           144
+#define LIST_APPEND                            145
+#define SET_ADD                                146
+#define MAP_ADD                                147
+#define COPY_FREE_VARS                         149
+#define YIELD_VALUE                            150
+#define RESUME                                 151
+#define MATCH_CLASS                            152
+#define FORMAT_VALUE                           155
+#define BUILD_CONST_KEY_MAP                    156
+#define BUILD_STRING                           157
+#define LIST_EXTEND                            162
+#define SET_UPDATE                             163
+#define DICT_MERGE                             164
+#define DICT_UPDATE                            165
+#define CALL                                   171
+#define KW_NAMES                               172
+#define CALL_INTRINSIC_1                       173
+#define CALL_INTRINSIC_2                       174
+#define LOAD_FROM_DICT_OR_GLOBALS              175
+#define LOAD_FROM_DICT_OR_DEREF                176
+#define MIN_INSTRUMENTED_OPCODE                237
+#define INSTRUMENTED_LOAD_SUPER_ATTR           237
+#define INSTRUMENTED_POP_JUMP_IF_NONE          238
+#define INSTRUMENTED_POP_JUMP_IF_NOT_NONE      239
+#define INSTRUMENTED_RESUME                    240
+#define INSTRUMENTED_CALL                      241
+#define INSTRUMENTED_RETURN_VALUE              242
+#define INSTRUMENTED_YIELD_VALUE               243
+#define INSTRUMENTED_CALL_FUNCTION_EX          244
+#define INSTRUMENTED_JUMP_FORWARD              245
+#define INSTRUMENTED_JUMP_BACKWARD             246
+#define INSTRUMENTED_RETURN_CONST              247
+#define INSTRUMENTED_FOR_ITER                  248
+#define INSTRUMENTED_POP_JUMP_IF_FALSE         249
+#define INSTRUMENTED_POP_JUMP_IF_TRUE          250
+#define INSTRUMENTED_END_FOR                   251
+#define INSTRUMENTED_END_SEND                  252
+#define INSTRUMENTED_INSTRUCTION               253
+#define INSTRUMENTED_LINE                      254
+#define MIN_PSEUDO_OPCODE                      256
+#define SETUP_FINALLY                          256
+#define SETUP_CLEANUP                          257
+#define SETUP_WITH                             258
+#define POP_BLOCK                              259
+#define JUMP                                   260
+#define JUMP_NO_INTERRUPT                      261
+#define LOAD_METHOD                            262
+#define LOAD_SUPER_METHOD                      263
+#define LOAD_ZERO_SUPER_METHOD                 264
+#define LOAD_ZERO_SUPER_ATTR                   265
+#define STORE_FAST_MAYBE_NULL                  266
+#define MAX_PSEUDO_OPCODE                      266
+#define BINARY_OP_ADD_FLOAT                      6
+#define BINARY_OP_ADD_INT                        7
+#define BINARY_OP_ADD_UNICODE                    8
+#define BINARY_OP_INPLACE_ADD_UNICODE           10
+#define BINARY_OP_MULTIPLY_FLOAT                13
+#define BINARY_OP_MULTIPLY_INT                  14
+#define BINARY_OP_SUBTRACT_FLOAT                16
+#define BINARY_OP_SUBTRACT_INT                  18
+#define BINARY_SUBSCR_DICT                      19
+#define BINARY_SUBSCR_GETITEM                   20
+#define BINARY_SUBSCR_LIST_INT                  21
+#define BINARY_SUBSCR_TUPLE_INT                 22
+#define CALL_PY_EXACT_ARGS                      23
+#define CALL_PY_WITH_DEFAULTS                   24
+#define CALL_BOUND_METHOD_EXACT_ARGS            28
+#define CALL_BUILTIN_CLASS                      29
+#define CALL_BUILTIN_FAST_WITH_KEYWORDS         34
+#define CALL_METHOD_DESCRIPTOR_FAST_WITH_KEYWORDS  38
+#define CALL_NO_KW_BUILTIN_FAST                 39
+#define CALL_NO_KW_BUILTIN_O                    40
+#define CALL_NO_KW_ISINSTANCE                   41
+#define CALL_NO_KW_LEN                          42
+#define CALL_NO_KW_LIST_APPEND                  43
+#define CALL_NO_KW_METHOD_DESCRIPTOR_FAST       44
+#define CALL_NO_KW_METHOD_DESCRIPTOR_NOARGS     45
+#define CALL_NO_KW_METHOD_DESCRIPTOR_O          46
+#define CALL_NO_KW_STR_1                        47
+#define CALL_NO_KW_TUPLE_1                      48
+#define CALL_NO_KW_TYPE_1                       56
+#define COMPARE_OP_FLOAT                        57
+#define COMPARE_OP_INT                          58
+#define COMPARE_OP_STR                          59
+#define FOR_ITER_LIST                           62
+#define FOR_ITER_TUPLE                          63
+#define FOR_ITER_RANGE                          64
+#define FOR_ITER_GEN                            65
+#define LOAD_SUPER_ATTR_ATTR                    66
+#define LOAD_SUPER_ATTR_METHOD                  67
+#define LOAD_ATTR_CLASS                         70
+#define LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN       72
+#define LOAD_ATTR_INSTANCE_VALUE                73
+#define LOAD_ATTR_MODULE                        76
+#define LOAD_ATTR_PROPERTY                      77
+#define LOAD_ATTR_SLOT                          78
+#define LOAD_ATTR_WITH_HINT                     79
+#define LOAD_ATTR_METHOD_LAZY_DICT              80
+#define LOAD_ATTR_METHOD_NO_DICT                81
+#define LOAD_ATTR_METHOD_WITH_VALUES            82
+#define LOAD_CONST__LOAD_FAST                   84
+#define LOAD_FAST__LOAD_CONST                   86
+#define LOAD_FAST__LOAD_FAST                    88
+#define LOAD_GLOBAL_BUILTIN                    111
+#define LOAD_GLOBAL_MODULE                     112
+#define STORE_ATTR_INSTANCE_VALUE              113
+#define STORE_ATTR_SLOT                        148
+#define STORE_ATTR_WITH_HINT                   153
+#define STORE_FAST__LOAD_FAST                  154
+#define STORE_FAST__STORE_FAST                 158
+#define STORE_SUBSCR_DICT                      159
+#define STORE_SUBSCR_LIST_INT                  160
+#define UNPACK_SEQUENCE_LIST                   161
+#define UNPACK_SEQUENCE_TUPLE                  166
+#define UNPACK_SEQUENCE_TWO_TUPLE              167
+#define SEND_GEN                               168
+
+#define HAS_ARG(op) ((((op) >= HAVE_ARGUMENT) && (!IS_PSEUDO_OPCODE(op)))\
+    || ((op) == JUMP) \
+    || ((op) == JUMP_NO_INTERRUPT) \
+    || ((op) == LOAD_METHOD) \
+    || ((op) == LOAD_SUPER_METHOD) \
+    || ((op) == LOAD_ZERO_SUPER_METHOD) \
+    || ((op) == LOAD_ZERO_SUPER_ATTR) \
+    || ((op) == STORE_FAST_MAYBE_NULL) \
+    )
+
+#define HAS_CONST(op) (false\
+    || ((op) == LOAD_CONST) \
+    || ((op) == RETURN_CONST) \
+    || ((op) == KW_NAMES) \
+    )
+
+#define NB_ADD                                   0
+#define NB_AND                                   1
+#define NB_FLOOR_DIVIDE                          2
+#define NB_LSHIFT                                3
+#define NB_MATRIX_MULTIPLY                       4
+#define NB_MULTIPLY                              5
+#define NB_REMAINDER                             6
+#define NB_OR                                    7
+#define NB_POWER                                 8
+#define NB_RSHIFT                                9
+#define NB_SUBTRACT                             10
+#define NB_TRUE_DIVIDE                          11
+#define NB_XOR                                  12
+#define NB_INPLACE_ADD                          13
+#define NB_INPLACE_AND                          14
+#define NB_INPLACE_FLOOR_DIVIDE                 15
+#define NB_INPLACE_LSHIFT                       16
+#define NB_INPLACE_MATRIX_MULTIPLY              17
+#define NB_INPLACE_MULTIPLY                     18
+#define NB_INPLACE_REMAINDER                    19
+#define NB_INPLACE_OR                           20
+#define NB_INPLACE_POWER                        21
+#define NB_INPLACE_RSHIFT                       22
+#define NB_INPLACE_SUBTRACT                     23
+#define NB_INPLACE_TRUE_DIVIDE                  24
+#define NB_INPLACE_XOR                          25
+
+/* Defined in Lib/opcode.py */
+#define ENABLE_SPECIALIZATION 1
+
+#define IS_PSEUDO_OPCODE(op) (((op) >= MIN_PSEUDO_OPCODE) && ((op) <= MAX_PSEUDO_OPCODE))
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_OPCODE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/osdefs.h b/nanvix-port/cpython-headers/python3.12/osdefs.h
new file mode 100644
index 000000000000..3243944a1483
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/osdefs.h
@@ -0,0 +1,51 @@
+#ifndef Py_OSDEFS_H
+#define Py_OSDEFS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Operating system dependencies */
+
+#ifdef MS_WINDOWS
+#define SEP L'\\'
+#define ALTSEP L'/'
+#define MAXPATHLEN 256
+#define DELIM L';'
+#endif
+
+#ifdef __VXWORKS__
+#define DELIM L';'
+#endif
+
+/* Filename separator */
+#ifndef SEP
+#define SEP L'/'
+#endif
+
+/* Max pathname length */
+#ifdef __hpux
+#include <sys/param.h>
+#include <limits.h>
+#ifndef PATH_MAX
+#define PATH_MAX MAXPATHLEN
+#endif
+#endif
+
+#ifndef MAXPATHLEN
+#if defined(PATH_MAX) && PATH_MAX > 1024
+#define MAXPATHLEN PATH_MAX
+#else
+#define MAXPATHLEN 1024
+#endif
+#endif
+
+/* Search path entry delimiter */
+#ifndef DELIM
+#define DELIM L':'
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_OSDEFS_H */
diff --git a/nanvix-port/cpython-headers/python3.12/osmodule.h b/nanvix-port/cpython-headers/python3.12/osmodule.h
new file mode 100644
index 000000000000..9095c2fdd3d6
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/osmodule.h
@@ -0,0 +1,17 @@
+
+/* os module interface */
+
+#ifndef Py_OSMODULE_H
+#define Py_OSMODULE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03060000
+PyAPI_FUNC(PyObject *) PyOS_FSPath(PyObject *path);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_OSMODULE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/patchlevel.h b/nanvix-port/cpython-headers/python3.12/patchlevel.h
new file mode 100644
index 000000000000..24e6579dbbad
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/patchlevel.h
@@ -0,0 +1,35 @@
+
+/* Python version identification scheme.
+
+   When the major or minor version changes, the VERSION variable in
+   configure.ac must also be changed.
+
+   There is also (independent) API version information in modsupport.h.
+*/
+
+/* Values for PY_RELEASE_LEVEL */
+#define PY_RELEASE_LEVEL_ALPHA  0xA
+#define PY_RELEASE_LEVEL_BETA   0xB
+#define PY_RELEASE_LEVEL_GAMMA  0xC     /* For release candidates */
+#define PY_RELEASE_LEVEL_FINAL  0xF     /* Serial should be 0 here */
+                                        /* Higher for patch releases */
+
+/* Version parsed out into numeric values */
+/*--start constants--*/
+#define PY_MAJOR_VERSION        3
+#define PY_MINOR_VERSION        12
+#define PY_MICRO_VERSION        3
+#define PY_RELEASE_LEVEL        PY_RELEASE_LEVEL_FINAL
+#define PY_RELEASE_SERIAL       0
+
+/* Version as a string */
+#define PY_VERSION              "3.12.3"
+/*--end constants--*/
+
+/* Version as a single 4-byte hex number, e.g. 0x010502B2 == 1.5.2b2.
+   Use this for numeric comparisons, e.g. #if PY_VERSION_HEX >= ... */
+#define PY_VERSION_HEX ((PY_MAJOR_VERSION << 24) | \
+                        (PY_MINOR_VERSION << 16) | \
+                        (PY_MICRO_VERSION <<  8) | \
+                        (PY_RELEASE_LEVEL <<  4) | \
+                        (PY_RELEASE_SERIAL << 0))
diff --git a/nanvix-port/cpython-headers/python3.12/py_curses.h b/nanvix-port/cpython-headers/python3.12/py_curses.h
new file mode 100644
index 000000000000..e46b08e9cc41
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/py_curses.h
@@ -0,0 +1,99 @@
+
+#ifndef Py_CURSES_H
+#define Py_CURSES_H
+
+#ifdef __APPLE__
+/*
+** On Mac OS X 10.2 [n]curses.h and stdlib.h use different guards
+** against multiple definition of wchar_t.
+*/
+#ifdef _BSD_WCHAR_T_DEFINED_
+#define _WCHAR_T
+#endif
+#endif /* __APPLE__ */
+
+/* On FreeBSD, [n]curses.h and stdlib.h/wchar.h use different guards
+   against multiple definition of wchar_t and wint_t. */
+#if defined(__FreeBSD__) && defined(_XOPEN_SOURCE_EXTENDED)
+# ifndef __wchar_t
+#   define __wchar_t
+# endif
+# ifndef __wint_t
+#   define __wint_t
+# endif
+#endif
+
+#if !defined(HAVE_CURSES_IS_PAD) && defined(WINDOW_HAS_FLAGS)
+/* The following definition is necessary for ncurses 5.7; without it,
+   some of [n]curses.h set NCURSES_OPAQUE to 1, and then Python
+   can't get at the WINDOW flags field. */
+#define NCURSES_OPAQUE 0
+#endif
+
+#ifdef HAVE_NCURSES_H
+#include <ncurses.h>
+#else
+#include <curses.h>
+#endif
+
+#ifdef HAVE_NCURSES_H
+/* configure was checking <curses.h>, but we will
+   use <ncurses.h>, which has some or all these features. */
+#if !defined(WINDOW_HAS_FLAGS) && !(NCURSES_OPAQUE+0)
+#define WINDOW_HAS_FLAGS 1
+#endif
+#if !defined(HAVE_CURSES_IS_PAD) && NCURSES_VERSION_PATCH+0 >= 20090906
+#define HAVE_CURSES_IS_PAD 1
+#endif
+#ifndef MVWDELCH_IS_EXPRESSION
+#define MVWDELCH_IS_EXPRESSION 1
+#endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define PyCurses_API_pointers 4
+
+/* Type declarations: PyCursesWindowObject is a Python object (PyObject_HEAD) that carries a curses WINDOW pointer. */
+
+typedef struct {
+    PyObject_HEAD
+    WINDOW *win;        /* the curses window this object wraps */
+    char *encoding;     /* NOTE(review): presumably the encoding name used for text on this window -- confirm in _cursesmodule.c */
+} PyCursesWindowObject;
+
+#define PyCursesWindow_Check(v) Py_IS_TYPE((v), &PyCursesWindow_Type)
+
+#define PyCurses_CAPSULE_NAME "_curses._C_API"
+
+
+#ifdef CURSES_MODULE
+/* This section is used when compiling _cursesmodule.c */
+
+#else
+/* This section is used in modules that use the _cursesmodule API */
+/* import_curses() must run first: it loads PyCurses_API from the capsule named by PyCurses_CAPSULE_NAME (its expansion already ends in ';'). */
+static void **PyCurses_API;
+/* Slot 0 is the window type object; slots 1-3 are int (*)(void) checks, and each macro below does `return NULL` in the caller when its check returns 0. */
+#define PyCursesWindow_Type (*_PyType_CAST(PyCurses_API[0]))
+#define PyCursesSetupTermCalled  {if (! ((int (*)(void))PyCurses_API[1]) () ) return NULL;}
+#define PyCursesInitialised      {if (! ((int (*)(void))PyCurses_API[2]) () ) return NULL;}
+#define PyCursesInitialisedColor {if (! ((int (*)(void))PyCurses_API[3]) () ) return NULL;}
+
+#define import_curses() \
+    PyCurses_API = (void **)PyCapsule_Import(PyCurses_CAPSULE_NAME, 1);
+
+#endif
+
+/* general error messages */
+static const char catchall_ERR[]  = "curses function returned ERR";
+static const char catchall_NULL[] = "curses function returned NULL";
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !defined(Py_CURSES_H) */
+
diff --git a/nanvix-port/cpython-headers/python3.12/pybuffer.h b/nanvix-port/cpython-headers/python3.12/pybuffer.h
new file mode 100644
index 000000000000..ca1c6058d905
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pybuffer.h
@@ -0,0 +1,145 @@
+/* Public Py_buffer API */
+
+#ifndef Py_BUFFER_H
+#define Py_BUFFER_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030b0000
+
+/* === New Buffer API ============================================
+ * Limited API and stable ABI since Python 3.11
+ *
+ * Py_buffer struct layout and size is now part of the stable abi3. The
+ * struct layout and size must not be changed in any way, as it would
+ * break the ABI.
+ *
+ */
+
+typedef struct {          /* buffer view filled in through PyObject_GetBuffer() and released with PyBuffer_Release() */
+    void *buf;
+    PyObject *obj;        /* owned reference */
+    Py_ssize_t len;
+    Py_ssize_t itemsize;  /* This is Py_ssize_t so it can be
+                             pointed to by strides in simple case.*/
+    int readonly;
+    int ndim;             /* PyBuffer_GetPointer() assumes its indices array has this many entries */
+    char *format;         /* NOTE(review): presumably a struct-style format description, cf. PyBuffer_SizeFromFormat() -- confirm */
+    Py_ssize_t *shape;
+    Py_ssize_t *strides;  /* NOTE(review): presumably byte strides, as produced by PyBuffer_FillContiguousStrides() -- confirm */
+    Py_ssize_t *suboffsets;
+    void *internal;
+} Py_buffer;
+
+typedef int (*getbufferproc)(PyObject *, Py_buffer *, int);
+typedef void (*releasebufferproc)(PyObject *, Py_buffer *);
+
+/* Return 1 if the getbuffer function is available, otherwise return 0. */
+PyAPI_FUNC(int) PyObject_CheckBuffer(PyObject *obj);
+
+/* This is a C-API version of the getbuffer function call.  It checks
+   to make sure object has the required function pointer and issues the
+   call.
+
+   Returns -1 and raises an error on failure and returns 0 on success. */
+PyAPI_FUNC(int) PyObject_GetBuffer(PyObject *obj, Py_buffer *view,
+                                   int flags);
+
+/* Get the memory area pointed to by the indices for the buffer given.
+   Note that view->ndim is the assumed size of indices. */
+PyAPI_FUNC(void *) PyBuffer_GetPointer(const Py_buffer *view, const Py_ssize_t *indices);
+
+/* Return the implied itemsize of the data-format area from a
+   struct-style description. */
+PyAPI_FUNC(Py_ssize_t) PyBuffer_SizeFromFormat(const char *format);
+
+/* Implementation in memoryobject.c */
+PyAPI_FUNC(int) PyBuffer_ToContiguous(void *buf, const Py_buffer *view,
+                                      Py_ssize_t len, char order);
+
+PyAPI_FUNC(int) PyBuffer_FromContiguous(const Py_buffer *view, const void *buf,
+                                        Py_ssize_t len, char order);
+
+/* Describes PyBuffer_FromContiguous() above.  Copy len bytes of data
+   from the contiguous chunk of memory pointed to by buf into the
+   buffer described by view.  Return 0 on success and return -1 and
+   raise a PyBuffer_Error on error (i.e. the object does not have a
+   buffer interface or it is not working).
+
+   If order is 'F', then if the object is multi-dimensional,
+   then the data will be copied into the array in
+   Fortran-style (first dimension varies the fastest).  If
+   order is 'C', then the data will be copied into the array
+   in C-style (last dimension varies the fastest).  If order
+   is 'A', then it does not matter and the copy will be made
+   in whatever way is more efficient. */
+PyAPI_FUNC(int) PyObject_CopyData(PyObject *dest, PyObject *src);
+
+/* Describes PyObject_CopyData() above: copy the data from the src buffer to the buffer of dest. */
+PyAPI_FUNC(int) PyBuffer_IsContiguous(const Py_buffer *view, char fort);
+
+/*Fill the strides array with byte-strides of a contiguous
+  (Fortran-style if fort is 'F' or C-style otherwise)
+  array of the given shape with the given number of bytes
+  per element. */
+PyAPI_FUNC(void) PyBuffer_FillContiguousStrides(int ndims,
+                                               Py_ssize_t *shape,
+                                               Py_ssize_t *strides,
+                                               int itemsize,
+                                               char fort);
+
+/* Fills in a buffer-info structure correctly for an exporter
+   that can only share a contiguous chunk of memory of
+   "unsigned bytes" of the given length.
+
+   Returns 0 on success and -1 (with raising an error) on error. */
+PyAPI_FUNC(int) PyBuffer_FillInfo(Py_buffer *view, PyObject *o, void *buf,
+                                  Py_ssize_t len, int readonly,
+                                  int flags);
+
+/* Releases a Py_buffer obtained from getbuffer ParseTuple's "s*". */
+PyAPI_FUNC(void) PyBuffer_Release(Py_buffer *view);
+
+/* Maximum number of dimensions */
+#define PyBUF_MAX_NDIM 64
+
+/* Flags for getting buffers. Keep these in sync with inspect.BufferFlags. */
+#define PyBUF_SIMPLE 0
+#define PyBUF_WRITABLE 0x0001
+
+#ifndef Py_LIMITED_API
+/*  we used to include an E, backwards compatible alias */
+#define PyBUF_WRITEABLE PyBUF_WRITABLE
+#endif
+
+#define PyBUF_FORMAT 0x0004
+#define PyBUF_ND 0x0008
+#define PyBUF_STRIDES (0x0010 | PyBUF_ND)
+#define PyBUF_C_CONTIGUOUS (0x0020 | PyBUF_STRIDES)
+#define PyBUF_F_CONTIGUOUS (0x0040 | PyBUF_STRIDES)
+#define PyBUF_ANY_CONTIGUOUS (0x0080 | PyBUF_STRIDES)
+#define PyBUF_INDIRECT (0x0100 | PyBUF_STRIDES)
+
+#define PyBUF_CONTIG (PyBUF_ND | PyBUF_WRITABLE)
+#define PyBUF_CONTIG_RO (PyBUF_ND)
+
+#define PyBUF_STRIDED (PyBUF_STRIDES | PyBUF_WRITABLE)
+#define PyBUF_STRIDED_RO (PyBUF_STRIDES)
+
+#define PyBUF_RECORDS (PyBUF_STRIDES | PyBUF_WRITABLE | PyBUF_FORMAT)
+#define PyBUF_RECORDS_RO (PyBUF_STRIDES | PyBUF_FORMAT)
+
+#define PyBUF_FULL (PyBUF_INDIRECT | PyBUF_WRITABLE | PyBUF_FORMAT)
+#define PyBUF_FULL_RO (PyBUF_INDIRECT | PyBUF_FORMAT)
+
+
+#define PyBUF_READ  0x100
+#define PyBUF_WRITE 0x200
+
+#endif /* !Py_LIMITED_API || Py_LIMITED_API >= 3.11 */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* Py_BUFFER_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pycapsule.h b/nanvix-port/cpython-headers/python3.12/pycapsule.h
new file mode 100644
index 000000000000..929a9a685259
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pycapsule.h
@@ -0,0 +1,59 @@
+
+/* Capsule objects let you wrap a C "void *" pointer in a Python
+   object.  They're a way of passing data through the Python interpreter
+   without creating your own custom type.
+
+   Capsules are used for communication between extension modules.
+   They provide a way for an extension module to export a C interface
+   to other extension modules, so that extension modules can use the
+   Python import mechanism to link to one another.
+
+   For more information, please see "c-api/capsule.html" in the
+   documentation.
+*/
+
+#ifndef Py_CAPSULE_H
+#define Py_CAPSULE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_DATA(PyTypeObject) PyCapsule_Type;
+
+typedef void (*PyCapsule_Destructor)(PyObject *);
+
+#define PyCapsule_CheckExact(op) Py_IS_TYPE((op), &PyCapsule_Type)
+
+
+PyAPI_FUNC(PyObject *) PyCapsule_New(
+    void *pointer,
+    const char *name,
+    PyCapsule_Destructor destructor);
+
+PyAPI_FUNC(void *) PyCapsule_GetPointer(PyObject *capsule, const char *name);
+
+PyAPI_FUNC(PyCapsule_Destructor) PyCapsule_GetDestructor(PyObject *capsule);
+
+PyAPI_FUNC(const char *) PyCapsule_GetName(PyObject *capsule);
+
+PyAPI_FUNC(void *) PyCapsule_GetContext(PyObject *capsule);
+
+PyAPI_FUNC(int) PyCapsule_IsValid(PyObject *capsule, const char *name);
+
+PyAPI_FUNC(int) PyCapsule_SetPointer(PyObject *capsule, void *pointer);
+
+PyAPI_FUNC(int) PyCapsule_SetDestructor(PyObject *capsule, PyCapsule_Destructor destructor);
+
+PyAPI_FUNC(int) PyCapsule_SetName(PyObject *capsule, const char *name);
+
+PyAPI_FUNC(int) PyCapsule_SetContext(PyObject *capsule, void *context);
+
+PyAPI_FUNC(void *) PyCapsule_Import(
+    const char *name,           /* UTF-8 encoded string */
+    int no_block);
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_CAPSULE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pyconfig.h b/nanvix-port/cpython-headers/python3.12/pyconfig.h
new file mode 100644
index 000000000000..d612fbc95f26
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pyconfig.h
@@ -0,0 +1,1968 @@
+/* pyconfig.h.  Generated from pyconfig.h.in by configure.  */
+/* pyconfig.h.in.  Generated from configure.ac by autoheader.  */
+
+
+#ifndef Py_PYCONFIG_H
+#define Py_PYCONFIG_H
+
+
+/* Define if building universal (internal helper macro) */
+/* #undef AC_APPLE_UNIVERSAL_BUILD */
+
+/* BUILD_GNU_TYPE + AIX_BUILDDATE are used to construct the PEP425 tag of the
+   build system. */
+/* #undef AIX_BUILDDATE */
+
+/* Define for AIX if your compiler is a genuine IBM xlC/xlC_r and you want
+   support for AIX C++ shared extension modules. */
+/* #undef AIX_GENUINE_CPLUSPLUS */
+
+/* The normal alignment of `long', in bytes. */
+#define ALIGNOF_LONG 4
+
+/* The normal alignment of `max_align_t', in bytes. */
+#define ALIGNOF_MAX_ALIGN_T 16
+
+/* The normal alignment of `size_t', in bytes. */
+#define ALIGNOF_SIZE_T 4
+
+/* Alternative SOABI used in debug build to load C extensions built in release
+   mode */
+/* #undef ALT_SOABI */
+
+/* The Android API level. */
+/* #undef ANDROID_API_LEVEL */
+
+/* Define if C doubles are 64-bit IEEE 754 binary format, stored in ARM
+   mixed-endian order (byte order 45670123) */
+/* #undef DOUBLE_IS_ARM_MIXED_ENDIAN_IEEE754 */
+
+/* Define if C doubles are 64-bit IEEE 754 binary format, stored with the most
+   significant byte first */
+/* #undef DOUBLE_IS_BIG_ENDIAN_IEEE754 */
+
+/* Define if C doubles are 64-bit IEEE 754 binary format, stored with the
+   least significant byte first */
+#define DOUBLE_IS_LITTLE_ENDIAN_IEEE754 1
+
+/* Define if --enable-ipv6 is specified */
+/* #undef ENABLE_IPV6 */
+
+/* Define to 1 if your system stores words within floats with the most
+   significant word first */
+/* #undef FLOAT_WORDS_BIGENDIAN */
+
+/* Define if getpgrp() must be called as getpgrp(0). */
+/* #undef GETPGRP_HAVE_ARG */
+
+/* Define if you have the 'accept' function. */
+#define HAVE_ACCEPT 1
+
+/* Define to 1 if you have the `accept4' function. */
+/* #undef HAVE_ACCEPT4 */
+
+/* Define to 1 if you have the `acosh' function. */
+#define HAVE_ACOSH 1
+
+/* struct addrinfo (netdb.h) */
+#define HAVE_ADDRINFO 1
+
+/* Define to 1 if you have the `alarm' function. */
+/* #undef HAVE_ALARM */
+
+/* Define if aligned memory access is required */
+#define HAVE_ALIGNED_REQUIRED 1
+
+/* Define to 1 if you have the <alloca.h> header file. */
+#define HAVE_ALLOCA_H 1
+
+/* Define this if your time.h defines altzone. */
+/* #undef HAVE_ALTZONE */
+
+/* Define to 1 if you have the `asinh' function. */
+#define HAVE_ASINH 1
+
+/* Define to 1 if you have the <asm/types.h> header file. */
+/* #undef HAVE_ASM_TYPES_H */
+
+/* Define to 1 if you have the `atanh' function. */
+#define HAVE_ATANH 1
+
+/* Define if you have the 'bind' function. */
+#define HAVE_BIND 1
+
+/* Define to 1 if you have the `bind_textdomain_codeset' function. */
+/* #undef HAVE_BIND_TEXTDOMAIN_CODESET */
+
+/* Define to 1 if you have the <bluetooth/bluetooth.h> header file. */
+/* #undef HAVE_BLUETOOTH_BLUETOOTH_H */
+
+/* Define to 1 if you have the <bluetooth.h> header file. */
+/* #undef HAVE_BLUETOOTH_H */
+
+/* Define if mbstowcs(NULL, "text", 0) does not return the number of wide
+   chars that would be converted. */
+/* #undef HAVE_BROKEN_MBSTOWCS */
+
+/* Define if nice() returns success/failure instead of the new priority. */
+/* #undef HAVE_BROKEN_NICE */
+
+/* Define if the system reports an invalid PIPE_BUF value. */
+/* #undef HAVE_BROKEN_PIPE_BUF */
+
+/* Define if poll() sets errno on invalid file descriptors. */
+/* #undef HAVE_BROKEN_POLL */
+
+/* Define if the Posix semaphores do not work on your system */
+/* #undef HAVE_BROKEN_POSIX_SEMAPHORES */
+
+/* Define if pthread_sigmask() does not work on your system. */
+/* #undef HAVE_BROKEN_PTHREAD_SIGMASK */
+
+/* define to 1 if your sem_getvalue is broken. */
+#define HAVE_BROKEN_SEM_GETVALUE 1
+
+/* Define if 'unsetenv' does not return an int. */
+/* #undef HAVE_BROKEN_UNSETENV */
+
+/* Has builtin __atomic_load_n() and __atomic_store_n() functions */
+#define HAVE_BUILTIN_ATOMIC 1
+
+/* Define to 1 if you have the <bzlib.h> header file. */
+/* #undef HAVE_BZLIB_H */
+
+/* Define to 1 if you have the `cfgetispeed' function. */
+/* #undef HAVE_CFGETISPEED */
+
+/* Define to 1 if you have the `cfgetospeed' function. */
+/* #undef HAVE_CFGETOSPEED */
+
+/* Define to 1 if you have the `cfsetispeed' function. */
+/* #undef HAVE_CFSETISPEED */
+
+/* Define to 1 if you have the `cfsetospeed' function. */
+/* #undef HAVE_CFSETOSPEED */
+
+/* Define to 1 if you have the 'chflags' function. */
+/* #undef HAVE_CHFLAGS */
+
+/* Define to 1 if you have the `chmod' function. */
+#define HAVE_CHMOD 1
+
+/* Define to 1 if you have the `chown' function. */
+#define HAVE_CHOWN 1
+
+/* Define if you have the 'chroot' function. */
+#define HAVE_CHROOT 1
+
+/* Define to 1 if you have the `clock' function. */
+#define HAVE_CLOCK 1
+
+/* Define to 1 if you have the `clock_getres' function. */
+#define HAVE_CLOCK_GETRES 1
+
+/* Define to 1 if you have the `clock_gettime' function. */
+#define HAVE_CLOCK_GETTIME 1
+
+/* Define to 1 if you have the `clock_nanosleep' function. */
+/* #undef HAVE_CLOCK_NANOSLEEP */
+
+/* Define to 1 if you have the `clock_settime' function. */
+/* #undef HAVE_CLOCK_SETTIME */
+
+/* Define to 1 if you have the `close_range' function. */
+/* #undef HAVE_CLOSE_RANGE */
+
+/* Define if the C compiler supports computed gotos. */
+#define HAVE_COMPUTED_GOTOS 1
+
+/* Define to 1 if you have the `confstr' function. */
+/* #undef HAVE_CONFSTR */
+
+/* Define to 1 if you have the <conio.h> header file. */
+/* #undef HAVE_CONIO_H */
+
+/* Define if you have the 'connect' function. */
+#define HAVE_CONNECT 1
+
+/* Define to 1 if you have the `copy_file_range' function. */
+/* #undef HAVE_COPY_FILE_RANGE */
+
+/* Define to 1 if you have the <crypt.h> header file. */
+/* #undef HAVE_CRYPT_H */
+
+/* Define if you have the crypt_r() function. */
+/* #undef HAVE_CRYPT_R */
+
+/* Define to 1 if you have the `ctermid' function. */
+/* #undef HAVE_CTERMID */
+
+/* Define if you have the 'ctermid_r' function. */
+/* #undef HAVE_CTERMID_R */
+
+/* Define if you have the 'filter' function. */
+/* #undef HAVE_CURSES_FILTER */
+
+/* Define to 1 if you have the <curses.h> header file. */
+/* #undef HAVE_CURSES_H */
+
+/* Define if you have the 'has_key' function. */
+/* #undef HAVE_CURSES_HAS_KEY */
+
+/* Define if you have the 'immedok' function. */
+/* #undef HAVE_CURSES_IMMEDOK */
+
+/* Define if you have the 'is_pad' function. */
+/* #undef HAVE_CURSES_IS_PAD */
+
+/* Define if you have the 'is_term_resized' function. */
+/* #undef HAVE_CURSES_IS_TERM_RESIZED */
+
+/* Define if you have the 'resizeterm' function. */
+/* #undef HAVE_CURSES_RESIZETERM */
+
+/* Define if you have the 'resize_term' function. */
+/* #undef HAVE_CURSES_RESIZE_TERM */
+
+/* Define if you have the 'syncok' function. */
+/* #undef HAVE_CURSES_SYNCOK */
+
+/* Define if you have the 'typeahead' function. */
+/* #undef HAVE_CURSES_TYPEAHEAD */
+
+/* Define if you have the 'use_env' function. */
+/* #undef HAVE_CURSES_USE_ENV */
+
+/* Define if you have the 'wchgat' function. */
+/* #undef HAVE_CURSES_WCHGAT */
+
+/* Define to 1 if you have the <db.h> header file. */
+/* #undef HAVE_DB_H */
+
+/* Define to 1 if you have the declaration of `RTLD_DEEPBIND', and to 0 if you
+   don't. */
+#define HAVE_DECL_RTLD_DEEPBIND 0
+
+/* Define to 1 if you have the declaration of `RTLD_GLOBAL', and to 0 if you
+   don't. */
+#define HAVE_DECL_RTLD_GLOBAL 1
+
+/* Define to 1 if you have the declaration of `RTLD_LAZY', and to 0 if you
+   don't. */
+#define HAVE_DECL_RTLD_LAZY 1
+
+/* Define to 1 if you have the declaration of `RTLD_LOCAL', and to 0 if you
+   don't. */
+#define HAVE_DECL_RTLD_LOCAL 1
+
+/* Define to 1 if you have the declaration of `RTLD_MEMBER', and to 0 if you
+   don't. */
+#define HAVE_DECL_RTLD_MEMBER 0
+
+/* Define to 1 if you have the declaration of `RTLD_NODELETE', and to 0 if you
+   don't. */
+#define HAVE_DECL_RTLD_NODELETE 0
+
+/* Define to 1 if you have the declaration of `RTLD_NOLOAD', and to 0 if you
+   don't. */
+#define HAVE_DECL_RTLD_NOLOAD 0
+
+/* Define to 1 if you have the declaration of `RTLD_NOW', and to 0 if you
+   don't. */
+#define HAVE_DECL_RTLD_NOW 1
+
+/* Define to 1 if you have the declaration of `tzname', and to 0 if you don't.
+   */
+#define HAVE_DECL_TZNAME 1
+
+/* Define to 1 if you have the device macros. */
+/* #undef HAVE_DEVICE_MACROS */
+
+/* Define to 1 if you have the /dev/ptc device file. */
+/* #undef HAVE_DEV_PTC */
+
+/* Define to 1 if you have the /dev/ptmx device file. */
+/* #undef HAVE_DEV_PTMX */
+
+/* Define to 1 if you have the <direct.h> header file. */
+/* #undef HAVE_DIRECT_H */
+
+/* Define to 1 if the dirent structure has a d_type field */
+/* #undef HAVE_DIRENT_D_TYPE */
+
+/* Define to 1 if you have the <dirent.h> header file, and it defines `DIR'.
+   */
+#define HAVE_DIRENT_H 1
+
+/* Define if you have the 'dirfd' function or macro. */
+#define HAVE_DIRFD 1
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Define to 1 if you have the `dlopen' function. */
+#define HAVE_DLOPEN 1
+
+/* Define to 1 if you have the `dup' function. */
+#define HAVE_DUP 1
+
+/* Define to 1 if you have the `dup2' function. */
+#define HAVE_DUP2 1
+
+/* Define to 1 if you have the `dup3' function. */
+/* #undef HAVE_DUP3 */
+
+/* Define if you have the '_dyld_shared_cache_contains_path' function. */
+/* #undef HAVE_DYLD_SHARED_CACHE_CONTAINS_PATH */
+
+/* Defined when any dynamic module loading is enabled. */
+#define HAVE_DYNAMIC_LOADING 1
+
+/* Define to 1 if you have the <editline/readline.h> header file. */
+/* #undef HAVE_EDITLINE_READLINE_H */
+
+/* Define to 1 if you have the <endian.h> header file. */
+/* #undef HAVE_ENDIAN_H */
+
+/* Define if you have the 'epoll_create' function. */
+/* #undef HAVE_EPOLL */
+
+/* Define if you have the 'epoll_create1' function. */
+/* #undef HAVE_EPOLL_CREATE1 */
+
+/* Define to 1 if you have the `erf' function. */
+#define HAVE_ERF 1
+
+/* Define to 1 if you have the `erfc' function. */
+#define HAVE_ERFC 1
+
+/* Define to 1 if you have the <errno.h> header file. */
+#define HAVE_ERRNO_H 1
+
+/* Define if you have the 'eventfd' function. */
+/* #undef HAVE_EVENTFD */
+
+/* Define to 1 if you have the `execv' function. */
+#define HAVE_EXECV 1
+
+/* Define to 1 if you have the `explicit_bzero' function. */
+#define HAVE_EXPLICIT_BZERO 1
+
+/* Define to 1 if you have the `explicit_memset' function. */
+/* #undef HAVE_EXPLICIT_MEMSET */
+
+/* Define to 1 if you have the `expm1' function. */
+#define HAVE_EXPM1 1
+
+/* Define to 1 if you have the `faccessat' function. */
+#define HAVE_FACCESSAT 1
+
+/* Define if you have the 'fchdir' function. */
+#define HAVE_FCHDIR 1
+
+/* Define to 1 if you have the `fchmod' function. */
+#define HAVE_FCHMOD 1
+
+/* Define to 1 if you have the `fchmodat' function. */
+#define HAVE_FCHMODAT 1
+
+/* Define to 1 if you have the `fchown' function. */
+#define HAVE_FCHOWN 1
+
+/* Define to 1 if you have the `fchownat' function. */
+#define HAVE_FCHOWNAT 1
+
+/* Define to 1 if you have the <fcntl.h> header file. */
+#define HAVE_FCNTL_H 1
+
+/* Define if you have the 'fdatasync' function. */
+#define HAVE_FDATASYNC 1
+
+/* Define to 1 if you have the `fdopendir' function. */
+/* #undef HAVE_FDOPENDIR */
+
+/* Define to 1 if you have the `fdwalk' function. */
+/* #undef HAVE_FDWALK */
+
+/* Define to 1 if you have the `fexecve' function. */
+/* #undef HAVE_FEXECVE */
+
+/* Define if you have the 'ffi_closure_alloc' function. */
+#define HAVE_FFI_CLOSURE_ALLOC 1
+
+/* Define if you have the 'ffi_prep_cif_var' function. */
+#define HAVE_FFI_PREP_CIF_VAR 1
+
+/* Define if you have the 'ffi_prep_closure_loc' function. */
+#define HAVE_FFI_PREP_CLOSURE_LOC 1
+
+/* Define to 1 if you have the `flock' function. */
+/* #undef HAVE_FLOCK */
+
+/* Define to 1 if you have the `fork' function. */
+#define HAVE_FORK 1
+
+/* Define to 1 if you have the `fork1' function. */
+/* #undef HAVE_FORK1 */
+
+/* Define to 1 if you have the `forkpty' function. */
+/* #undef HAVE_FORKPTY */
+
+/* Define to 1 if you have the `fpathconf' function. */
+/* #undef HAVE_FPATHCONF */
+
+/* Define to 1 if you have the `fseek64' function. */
+/* #undef HAVE_FSEEK64 */
+
+/* Define to 1 if you have the `fseeko' function. */
+#define HAVE_FSEEKO 1
+
+/* Define to 1 if you have the `fstatat' function. */
+/* #undef HAVE_FSTATAT */
+
+/* Define to 1 if you have the `fstatvfs' function. */
+/* #undef HAVE_FSTATVFS */
+
+/* Define if you have the 'fsync' function. */
+#define HAVE_FSYNC 1
+
+/* Define to 1 if you have the `ftell64' function. */
+/* #undef HAVE_FTELL64 */
+
+/* Define to 1 if you have the `ftello' function. */
+#define HAVE_FTELLO 1
+
+/* Define to 1 if you have the `ftime' function. */
+/* #undef HAVE_FTIME */
+
+/* Define to 1 if you have the `ftruncate' function. */
+#define HAVE_FTRUNCATE 1
+
+/* Define to 1 if you have the `futimens' function. */
+#define HAVE_FUTIMENS 1
+
+/* Define to 1 if you have the `futimes' function. */
+/* #undef HAVE_FUTIMES */
+
+/* Define to 1 if you have the `futimesat' function. */
+/* #undef HAVE_FUTIMESAT */
+
+/* Define to 1 if you have the `gai_strerror' function. */
+#define HAVE_GAI_STRERROR 1
+
+/* Define if we can use gcc inline assembler to get and set mc68881 fpcr */
+/* #undef HAVE_GCC_ASM_FOR_MC68881 */
+
+/* Define if we can use x64 gcc inline assembler */
+/* #undef HAVE_GCC_ASM_FOR_X64 */
+
+/* Define if we can use gcc inline assembler to get and set x87 control word
+   */
+#define HAVE_GCC_ASM_FOR_X87 1
+
+/* Define if your compiler provides __uint128_t */
+/* #undef HAVE_GCC_UINT128_T */
+
+/* Define to 1 if you have the <gdbm-ndbm.h> header file. */
+/* #undef HAVE_GDBM_DASH_NDBM_H */
+
+/* Define to 1 if you have the <gdbm.h> header file. */
+/* #undef HAVE_GDBM_H */
+
+/* Define to 1 if you have the <gdbm/ndbm.h> header file. */
+/* #undef HAVE_GDBM_NDBM_H */
+
+/* Define if you have the getaddrinfo function. */
+#define HAVE_GETADDRINFO 1
+
+/* Define this if you have flockfile(), getc_unlocked(), and funlockfile() */
+/* #undef HAVE_GETC_UNLOCKED */
+
+/* Define to 1 if you have the `getegid' function. */
+#define HAVE_GETEGID 1
+
+/* Define to 1 if you have the `getentropy' function. */
+#define HAVE_GETENTROPY 1
+
+/* Define to 1 if you have the `geteuid' function. */
+#define HAVE_GETEUID 1
+
+/* Define to 1 if you have the `getgid' function. */
+#define HAVE_GETGID 1
+
+/* Define to 1 if you have the `getgrgid' function. */
+/* #undef HAVE_GETGRGID */
+
+/* Define to 1 if you have the `getgrgid_r' function. */
+/* #undef HAVE_GETGRGID_R */
+
+/* Define to 1 if you have the `getgrnam_r' function. */
+/* #undef HAVE_GETGRNAM_R */
+
+/* Define to 1 if you have the `getgrouplist' function. */
+/* #undef HAVE_GETGROUPLIST */
+
+/* Define to 1 if you have the `getgroups' function. */
+/* #undef HAVE_GETGROUPS */
+
+/* Define if you have the 'gethostbyaddr' function. */
+#define HAVE_GETHOSTBYADDR 1
+
+/* Define to 1 if you have the `gethostbyname' function. */
+#define HAVE_GETHOSTBYNAME 1
+
+/* Define this if you have some version of gethostbyname_r() */
+/* #undef HAVE_GETHOSTBYNAME_R */
+
+/* Define this if you have the 3-arg version of gethostbyname_r(). */
+/* #undef HAVE_GETHOSTBYNAME_R_3_ARG */
+
+/* Define this if you have the 5-arg version of gethostbyname_r(). */
+/* #undef HAVE_GETHOSTBYNAME_R_5_ARG */
+
+/* Define this if you have the 6-arg version of gethostbyname_r(). */
+/* #undef HAVE_GETHOSTBYNAME_R_6_ARG */
+
+/* Define to 1 if you have the `gethostname' function. */
+#define HAVE_GETHOSTNAME 1
+
+/* Define to 1 if you have the `getitimer' function. */
+/* #undef HAVE_GETITIMER */
+
+/* Define to 1 if you have the `getloadavg' function. */
+/* #undef HAVE_GETLOADAVG */
+
+/* Define to 1 if you have the `getlogin' function. */
+/* #undef HAVE_GETLOGIN */
+
+/* Define to 1 if you have the `getnameinfo' function. */
+#define HAVE_GETNAMEINFO 1
+
+/* Define if you have the 'getpagesize' function. */
+#define HAVE_GETPAGESIZE 1
+
+/* Define if you have the 'getpeername' function. */
+#define HAVE_GETPEERNAME 1
+
+/* Define to 1 if you have the `getpgid' function. */
+/* #undef HAVE_GETPGID */
+
+/* Define to 1 if you have the `getpgrp' function. */
+/* #undef HAVE_GETPGRP */
+
+/* Define to 1 if you have the `getpid' function. */
+#define HAVE_GETPID 1
+
+/* Define to 1 if you have the `getppid' function. */
+/* #undef HAVE_GETPPID */
+
+/* Define to 1 if you have the `getpriority' function. */
+/* #undef HAVE_GETPRIORITY */
+
+/* Define if you have the 'getprotobyname' function. */
+#define HAVE_GETPROTOBYNAME 1
+
+/* Define to 1 if you have the `getpwent' function. */
+/* #undef HAVE_GETPWENT */
+
+/* Define to 1 if you have the `getpwnam_r' function. */
+/* #undef HAVE_GETPWNAM_R */
+
+/* Define to 1 if you have the `getpwuid' function. */
+#define HAVE_GETPWUID 1
+
+/* Define to 1 if you have the `getpwuid_r' function. */
+/* #undef HAVE_GETPWUID_R */
+
+/* Define to 1 if the getrandom() function is available */
+/* #undef HAVE_GETRANDOM */
+
+/* Define to 1 if the Linux getrandom() syscall is available */
+/* #undef HAVE_GETRANDOM_SYSCALL */
+
+/* Define to 1 if you have the `getresgid' function. */
+/* #undef HAVE_GETRESGID */
+
+/* Define to 1 if you have the `getresuid' function. */
+/* #undef HAVE_GETRESUID */
+
+/* Define to 1 if you have the `getrusage' function. */
+/* #undef HAVE_GETRUSAGE */
+
+/* Define if you have the 'getservbyname' function. */
+#define HAVE_GETSERVBYNAME 1
+
+/* Define if you have the 'getservbyport' function. */
+#define HAVE_GETSERVBYPORT 1
+
+/* Define to 1 if you have the `getsid' function. */
+/* #undef HAVE_GETSID */
+
+/* Define if you have the 'getsockname' function. */
+#define HAVE_GETSOCKNAME 1
+
+/* Define to 1 if you have the `getspent' function. */
+/* #undef HAVE_GETSPENT */
+
+/* Define to 1 if you have the `getspnam' function. */
+/* #undef HAVE_GETSPNAM */
+
+/* Define to 1 if you have the `getuid' function. */
+#define HAVE_GETUID 1
+
+/* Define to 1 if you have the `getwd' function. */
+/* #undef HAVE_GETWD */
+
+/* Define if glibc has incorrect _FORTIFY_SOURCE wrappers for memmove and
+   bcopy. */
+/* #undef HAVE_GLIBC_MEMMOVE_BUG */
+
+/* Define to 1 if you have the <grp.h> header file. */
+#define HAVE_GRP_H 1
+
+/* Define if you have the 'hstrerror' function. */
+/* #undef HAVE_HSTRERROR */
+
+/* Define this if you have le64toh() */
+#define HAVE_HTOLE64 1
+
+/* Define to 1 if you have the <ieeefp.h> header file. */
+#define HAVE_IEEEFP_H 1
+
+/* Define to 1 if you have the `if_nameindex' function. */
+/* #undef HAVE_IF_NAMEINDEX */
+
+/* Define if you have the 'inet_aton' function. */
+/* #undef HAVE_INET_ATON */
+
+/* Define if you have the 'inet_ntoa' function. */
+#define HAVE_INET_NTOA 1
+
+/* Define if you have the 'inet_pton' function. */
+#define HAVE_INET_PTON 1
+
+/* Define to 1 if you have the `initgroups' function. */
+/* #undef HAVE_INITGROUPS */
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the <io.h> header file. */
+/* #undef HAVE_IO_H */
+
+/* Define if gcc has the ipa-pure-const bug. */
+/* #undef HAVE_IPA_PURE_CONST_BUG */
+
+/* Define to 1 if you have the `kill' function. */
+#define HAVE_KILL 1
+
+/* Define to 1 if you have the `killpg' function. */
+/* #undef HAVE_KILLPG */
+
+/* Define if you have the 'kqueue' function. */
+/* #undef HAVE_KQUEUE */
+
+/* Define to 1 if you have the <langinfo.h> header file. */
+#define HAVE_LANGINFO_H 1
+
+/* Defined to enable large file support when an off_t is bigger than a long
+   and long long is at least as big as an off_t. You may need to add some
+   flags for configuration and compilation to enable this mode. (For Solaris
+   and Linux, the necessary defines are already defined.) */
+#define HAVE_LARGEFILE_SUPPORT 1
+
+/* Define to 1 if you have the 'lchflags' function. */
+/* #undef HAVE_LCHFLAGS */
+
+/* Define to 1 if you have the `lchmod' function. */
+#define HAVE_LCHMOD 1
+
+/* Define to 1 if you have the `lchown' function. */
+#define HAVE_LCHOWN 1
+
+/* Define to 1 if you want to build _blake2 module with libb2 */
+/* #undef HAVE_LIBB2 */
+
+/* Define to 1 if you have the `db' library (-ldb). */
+/* #undef HAVE_LIBDB */
+
+/* Define to 1 if you have the `dl' library (-ldl). */
+/* #undef HAVE_LIBDL */
+
+/* Define to 1 if you have the `dld' library (-ldld). */
+/* #undef HAVE_LIBDLD */
+
+/* Define to 1 if you have the `ieee' library (-lieee). */
+/* #undef HAVE_LIBIEEE */
+
+/* Define to 1 if you have the <libintl.h> header file. */
+/* #undef HAVE_LIBINTL_H */
+
+/* Define to 1 if you have the `resolv' library (-lresolv). */
+/* #undef HAVE_LIBRESOLV */
+
+/* Define to 1 if you have the `sendfile' library (-lsendfile). */
+/* #undef HAVE_LIBSENDFILE */
+
+/* Define to 1 if you have the `sqlite3' library (-lsqlite3). */
+#define HAVE_LIBSQLITE3 1
+
+/* Define to 1 if you have the <libutil.h> header file. */
+/* #undef HAVE_LIBUTIL_H */
+
+/* Define if you have the 'link' function. */
+#define HAVE_LINK 1
+
+/* Define to 1 if you have the `linkat' function. */
+#define HAVE_LINKAT 1
+
+/* Define to 1 if you have the <linux/auxvec.h> header file. */
+/* #undef HAVE_LINUX_AUXVEC_H */
+
+/* Define to 1 if you have the <linux/can/bcm.h> header file. */
+/* #undef HAVE_LINUX_CAN_BCM_H */
+
+/* Define to 1 if you have the <linux/can.h> header file. */
+/* #undef HAVE_LINUX_CAN_H */
+
+/* Define to 1 if you have the <linux/can/j1939.h> header file. */
+/* #undef HAVE_LINUX_CAN_J1939_H */
+
+/* Define if compiling using Linux 3.6 or later. */
+/* #undef HAVE_LINUX_CAN_RAW_FD_FRAMES */
+
+/* Define to 1 if you have the <linux/can/raw.h> header file. */
+/* #undef HAVE_LINUX_CAN_RAW_H */
+
+/* Define if compiling using Linux 4.1 or later. */
+/* #undef HAVE_LINUX_CAN_RAW_JOIN_FILTERS */
+
+/* Define to 1 if you have the <linux/fs.h> header file. */
+/* #undef HAVE_LINUX_FS_H */
+
+/* Define to 1 if you have the <linux/limits.h> header file. */
+/* #undef HAVE_LINUX_LIMITS_H */
+
+/* Define to 1 if you have the <linux/memfd.h> header file. */
+/* #undef HAVE_LINUX_MEMFD_H */
+
+/* Define to 1 if you have the <linux/netlink.h> header file. */
+/* #undef HAVE_LINUX_NETLINK_H */
+
+/* Define to 1 if you have the <linux/qrtr.h> header file. */
+/* #undef HAVE_LINUX_QRTR_H */
+
+/* Define to 1 if you have the <linux/random.h> header file. */
+/* #undef HAVE_LINUX_RANDOM_H */
+
+/* Define to 1 if you have the <linux/soundcard.h> header file. */
+/* #undef HAVE_LINUX_SOUNDCARD_H */
+
+/* Define to 1 if you have the <linux/tipc.h> header file. */
+/* #undef HAVE_LINUX_TIPC_H */
+
+/* Define to 1 if you have the <linux/vm_sockets.h> header file. */
+/* #undef HAVE_LINUX_VM_SOCKETS_H */
+
+/* Define to 1 if you have the <linux/wait.h> header file. */
+/* #undef HAVE_LINUX_WAIT_H */
+
+/* Define if you have the 'listen' function. */
+#define HAVE_LISTEN 1
+
+/* Define to 1 if you have the `lockf' function. */
+/* #undef HAVE_LOCKF */
+
+/* Define to 1 if you have the `log1p' function. */
+#define HAVE_LOG1P 1
+
+/* Define to 1 if you have the `log2' function. */
+#define HAVE_LOG2 1
+
+/* Define to 1 if you have the `login_tty' function. */
+/* #undef HAVE_LOGIN_TTY */
+
+/* Define to 1 if the system has the type `long double'. */
+#define HAVE_LONG_DOUBLE 1
+
+/* Define to 1 if you have the `lstat' function. */
+#define HAVE_LSTAT 1
+
+/* Define to 1 if you have the `lutimes' function. */
+/* #undef HAVE_LUTIMES */
+
+/* Define to 1 if you have the <lzma.h> header file. */
+/* #undef HAVE_LZMA_H */
+
+/* Define to 1 if you have the `madvise' function. */
+/* #undef HAVE_MADVISE */
+
+/* Define this if you have the makedev macro. */
+/* #undef HAVE_MAKEDEV */
+
+/* Define to 1 if you have the `mbrtowc' function. */
+#define HAVE_MBRTOWC 1
+
+/* Define if you have the 'memfd_create' function. */
+/* #undef HAVE_MEMFD_CREATE */
+
+/* Define to 1 if you have the `memrchr' function. */
+#define HAVE_MEMRCHR 1
+
+/* Define to 1 if you have the <minix/config.h> header file. */
+/* #undef HAVE_MINIX_CONFIG_H */
+
+/* Define to 1 if you have the `mkdirat' function. */
+#define HAVE_MKDIRAT 1
+
+/* Define to 1 if you have the `mkfifo' function. */
+/* #undef HAVE_MKFIFO */
+
+/* Define to 1 if you have the `mkfifoat' function. */
+/* #undef HAVE_MKFIFOAT */
+
+/* Define to 1 if you have the `mknod' function. */
+/* #undef HAVE_MKNOD */
+
+/* Define to 1 if you have the `mknodat' function. */
+/* #undef HAVE_MKNODAT */
+
+/* Define to 1 if you have the `mktime' function. */
+#define HAVE_MKTIME 1
+
+/* Define to 1 if you have the `mmap' function. */
+#define HAVE_MMAP 1
+
+/* Define to 1 if you have the `mremap' function. */
+/* #undef HAVE_MREMAP */
+
+/* Define to 1 if you have the `msync' function. */
+/* #undef HAVE_MSYNC */
+
+/* Define to 1 if you have the `nanosleep' function. */
+#define HAVE_NANOSLEEP 1
+
+/* Define to 1 if you have the `ncursesw' library. */
+/* #undef HAVE_NCURSESW */
+
+/* Define to 1 if you have the <ncurses.h> header file. */
+/* #undef HAVE_NCURSES_H */
+
+/* Define to 1 if you have the <ndbm.h> header file. */
+#define HAVE_NDBM_H 1
+
+/* Define to 1 if you have the <ndir.h> header file, and it defines `DIR'. */
+/* #undef HAVE_NDIR_H */
+
+/* Define to 1 if you have the <netcan/can.h> header file. */
+/* #undef HAVE_NETCAN_CAN_H */
+
+/* Define to 1 if you have the <netdb.h> header file. */
+#define HAVE_NETDB_H 1
+
+/* Define to 1 if you have the <netinet/in.h> header file. */
+#define HAVE_NETINET_IN_H 1
+
+/* Define to 1 if you have the <netpacket/packet.h> header file. */
+/* #undef HAVE_NETPACKET_PACKET_H */
+
+/* Define to 1 if you have the <net/ethernet.h> header file. */
+/* #undef HAVE_NET_ETHERNET_H */
+
+/* Define to 1 if you have the <net/if.h> header file. */
+/* #undef HAVE_NET_IF_H */
+
+/* Define to 1 if you have the `nice' function. */
+/* #undef HAVE_NICE */
+
+/* Define if the internal form of wchar_t in non-Unicode locales is not
+   Unicode. */
+/* #undef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */
+
+/* Define to 1 if you have the `openat' function. */
+/* #undef HAVE_OPENAT */
+
+/* Define to 1 if you have the `opendir' function. */
+#define HAVE_OPENDIR 1
+
+/* Define to 1 if you have the `openpty' function. */
+/* #undef HAVE_OPENPTY */
+
+/* Define to 1 if you have the <panel.h> header file. */
+/* #undef HAVE_PANEL_H */
+
+/* Define to 1 if you have the `pathconf' function. */
+/* #undef HAVE_PATHCONF */
+
+/* Define to 1 if you have the `pause' function. */
+/* #undef HAVE_PAUSE */
+
+/* Define to 1 if you have the `pipe' function. */
+#define HAVE_PIPE 1
+
+/* Define to 1 if you have the `pipe2' function. */
+/* #undef HAVE_PIPE2 */
+
+/* Define to 1 if you have the `plock' function. */
+/* #undef HAVE_PLOCK */
+
+/* Define to 1 if you have the `poll' function. */
+#define HAVE_POLL 1
+
+/* Define to 1 if you have the <poll.h> header file. */
+#define HAVE_POLL_H 1
+
+/* Define to 1 if you have the `posix_fadvise' function. */
+#define HAVE_POSIX_FADVISE 1
+
+/* Define to 1 if you have the `posix_fallocate' function. */
+#define HAVE_POSIX_FALLOCATE 1
+
+/* Define to 1 if you have the `posix_spawn' function. */
+/* #undef HAVE_POSIX_SPAWN */
+
+/* Define to 1 if you have the `posix_spawnp' function. */
+/* #undef HAVE_POSIX_SPAWNP */
+
+/* Define to 1 if you have the `pread' function. */
+#define HAVE_PREAD 1
+
+/* Define to 1 if you have the `preadv' function. */
+#define HAVE_PREADV 1
+
+/* Define to 1 if you have the `preadv2' function. */
+/* #undef HAVE_PREADV2 */
+
+/* Define if you have the 'prlimit' function. */
+/* #undef HAVE_PRLIMIT */
+
+/* Define to 1 if you have the <process.h> header file. */
+/* #undef HAVE_PROCESS_H */
+
+/* Define if your compiler supports function prototypes */
+#define HAVE_PROTOTYPES 1
+
+/* Define to 1 if you have the `pthread_condattr_setclock' function. */
+#define HAVE_PTHREAD_CONDATTR_SETCLOCK 1
+
+/* Defined for Solaris 2.6 bug in pthread header. */
+/* #undef HAVE_PTHREAD_DESTRUCTOR */
+
+/* Define to 1 if you have the `pthread_getcpuclockid' function. */
+/* #undef HAVE_PTHREAD_GETCPUCLOCKID */
+
+/* Define to 1 if you have the <pthread.h> header file. */
+#define HAVE_PTHREAD_H 1
+
+/* Define to 1 if you have the `pthread_init' function. */
+/* #undef HAVE_PTHREAD_INIT */
+
+/* Define to 1 if you have the `pthread_kill' function. */
+#define HAVE_PTHREAD_KILL 1
+
+/* Define to 1 if you have the `pthread_sigmask' function. */
+#define HAVE_PTHREAD_SIGMASK 1
+
+/* Define if platform requires stubbed pthreads support */
+/* #undef HAVE_PTHREAD_STUBS */
+
+/* Define to 1 if you have the <pty.h> header file. */
+/* #undef HAVE_PTY_H */
+
+/* Define to 1 if you have the `pwrite' function. */
+#define HAVE_PWRITE 1
+
+/* Define to 1 if you have the `pwritev' function. */
+#define HAVE_PWRITEV 1
+
+/* Define to 1 if you have the `pwritev2' function. */
+/* #undef HAVE_PWRITEV2 */
+
+/* Define to 1 if you have the <readline/readline.h> header file. */
+/* #undef HAVE_READLINE_READLINE_H */
+
+/* Define to 1 if you have the `readlink' function. */
+#define HAVE_READLINK 1
+
+/* Define to 1 if you have the `readlinkat' function. */
+#define HAVE_READLINKAT 1
+
+/* Define to 1 if you have the `readv' function. */
+#define HAVE_READV 1
+
+/* Define to 1 if you have the `realpath' function. */
+#define HAVE_REALPATH 1
+
+/* Define if you have the 'recvfrom' function. */
+#define HAVE_RECVFROM 1
+
+/* Define to 1 if you have the `renameat' function. */
+#define HAVE_RENAMEAT 1
+
+/* Define if readline supports append_history */
+/* #undef HAVE_RL_APPEND_HISTORY */
+
+/* Define if you can turn off readline's signal handling. */
+/* #undef HAVE_RL_CATCH_SIGNAL */
+
+/* Define if readline supports rl_compdisp_func_t */
+/* #undef HAVE_RL_COMPDISP_FUNC_T */
+
+/* Define if you have readline 2.2 */
+/* #undef HAVE_RL_COMPLETION_APPEND_CHARACTER */
+
+/* Define if you have readline 4.0 */
+/* #undef HAVE_RL_COMPLETION_DISPLAY_MATCHES_HOOK */
+
+/* Define if you have readline 4.2 */
+/* #undef HAVE_RL_COMPLETION_MATCHES */
+
+/* Define if you have rl_completion_suppress_append */
+/* #undef HAVE_RL_COMPLETION_SUPPRESS_APPEND */
+
+/* Define if you have readline 4.0 */
+/* #undef HAVE_RL_PRE_INPUT_HOOK */
+
+/* Define if you have readline 4.0 */
+/* #undef HAVE_RL_RESIZE_TERMINAL */
+
+/* Define to 1 if you have the <rpc/rpc.h> header file. */
+/* #undef HAVE_RPC_RPC_H */
+
+/* Define to 1 if you have the `rtpSpawn' function. */
+/* #undef HAVE_RTPSPAWN */
+
+/* Define to 1 if you have the `sched_get_priority_max' function. */
+/* #undef HAVE_SCHED_GET_PRIORITY_MAX */
+
+/* Define to 1 if you have the <sched.h> header file. */
+#define HAVE_SCHED_H 1
+
+/* Define to 1 if you have the `sched_rr_get_interval' function. */
+/* #undef HAVE_SCHED_RR_GET_INTERVAL */
+
+/* Define to 1 if you have the `sched_setaffinity' function. */
+/* #undef HAVE_SCHED_SETAFFINITY */
+
+/* Define to 1 if you have the `sched_setparam' function. */
+/* #undef HAVE_SCHED_SETPARAM */
+
+/* Define to 1 if you have the `sched_setscheduler' function. */
+/* #undef HAVE_SCHED_SETSCHEDULER */
+
+/* Define to 1 if you have the `sem_clockwait' function. */
+/* #undef HAVE_SEM_CLOCKWAIT */
+
+/* Define to 1 if you have the `sem_getvalue' function. */
+/* #undef HAVE_SEM_GETVALUE */
+
+/* Define to 1 if you have the `sem_open' function. */
+/* #undef HAVE_SEM_OPEN */
+
+/* Define to 1 if you have the `sem_timedwait' function. */
+/* #undef HAVE_SEM_TIMEDWAIT */
+
+/* Define to 1 if you have the `sem_unlink' function. */
+/* #undef HAVE_SEM_UNLINK */
+
+/* Define to 1 if you have the `sendfile' function. */
+/* #undef HAVE_SENDFILE */
+
+/* Define if you have the 'sendto' function. */
+#define HAVE_SENDTO 1
+
+/* Define to 1 if you have the `setegid' function. */
+#define HAVE_SETEGID 1
+
+/* Define to 1 if you have the `seteuid' function. */
+#define HAVE_SETEUID 1
+
+/* Define to 1 if you have the `setgid' function. */
+#define HAVE_SETGID 1
+
+/* Define if you have the 'setgroups' function. */
+#define HAVE_SETGROUPS 1
+
+/* Define to 1 if you have the `sethostname' function. */
+/* #undef HAVE_SETHOSTNAME */
+
+/* Define to 1 if you have the `setitimer' function. */
+/* #undef HAVE_SETITIMER */
+
+/* Define to 1 if you have the <setjmp.h> header file. */
+#define HAVE_SETJMP_H 1
+
+/* Define to 1 if you have the `setlocale' function. */
+#define HAVE_SETLOCALE 1
+
+/* Define to 1 if you have the `setns' function. */
+/* #undef HAVE_SETNS */
+
+/* Define to 1 if you have the `setpgid' function. */
+/* #undef HAVE_SETPGID */
+
+/* Define to 1 if you have the `setpgrp' function. */
+/* #undef HAVE_SETPGRP */
+
+/* Define to 1 if you have the `setpriority' function. */
+/* #undef HAVE_SETPRIORITY */
+
+/* Define to 1 if you have the `setregid' function. */
+/* #undef HAVE_SETREGID */
+
+/* Define to 1 if you have the `setresgid' function. */
+/* #undef HAVE_SETRESGID */
+
+/* Define to 1 if you have the `setresuid' function. */
+/* #undef HAVE_SETRESUID */
+
+/* Define to 1 if you have the `setreuid' function. */
+/* #undef HAVE_SETREUID */
+
+/* Define to 1 if you have the `setsid' function. */
+/* #undef HAVE_SETSID */
+
+/* Define if you have the 'setsockopt' function. */
+#define HAVE_SETSOCKOPT 1
+
+/* Define to 1 if you have the `setuid' function. */
+#define HAVE_SETUID 1
+
+/* Define to 1 if you have the `setvbuf' function. */
+#define HAVE_SETVBUF 1
+
+/* Define to 1 if you have the <shadow.h> header file. */
+/* #undef HAVE_SHADOW_H */
+
+/* Define to 1 if you have the `shm_open' function. */
+/* #undef HAVE_SHM_OPEN */
+
+/* Define to 1 if you have the `shm_unlink' function. */
+/* #undef HAVE_SHM_UNLINK */
+
+/* Define to 1 if you have the `shutdown' function. */
+#define HAVE_SHUTDOWN 1
+
+/* Define to 1 if you have the `sigaction' function. */
+/* #undef HAVE_SIGACTION */
+
+/* Define to 1 if you have the `sigaltstack' function. */
+/* #undef HAVE_SIGALTSTACK */
+
+/* Define to 1 if you have the `sigfillset' function. */
+/* #undef HAVE_SIGFILLSET */
+
+/* Define to 1 if `si_band' is a member of `siginfo_t'. */
+/* #undef HAVE_SIGINFO_T_SI_BAND */
+
+/* Define to 1 if you have the `siginterrupt' function. */
+/* #undef HAVE_SIGINTERRUPT */
+
+/* Define to 1 if you have the <signal.h> header file. */
+#define HAVE_SIGNAL_H 1
+
+/* Define to 1 if you have the `sigpending' function. */
+/* #undef HAVE_SIGPENDING */
+
+/* Define to 1 if you have the `sigrelse' function. */
+/* #undef HAVE_SIGRELSE */
+
+/* Define to 1 if you have the `sigtimedwait' function. */
+/* #undef HAVE_SIGTIMEDWAIT */
+
+/* Define to 1 if you have the `sigwait' function. */
+/* #undef HAVE_SIGWAIT */
+
+/* Define to 1 if you have the `sigwaitinfo' function. */
+/* #undef HAVE_SIGWAITINFO */
+
+/* Define to 1 if you have the `snprintf' function. */
+#define HAVE_SNPRINTF 1
+
+/* Define if you have struct sockaddr_alg (linux/if_alg.h) */
+/* #undef HAVE_SOCKADDR_ALG */
+
+/* Define if sockaddr has sa_len member */
+#define HAVE_SOCKADDR_SA_LEN 1
+
+/* Define if you have struct sockaddr_storage (sys/socket.h) */
+#define HAVE_SOCKADDR_STORAGE 1
+
+/* Define if you have the 'socket' function. */
+#define HAVE_SOCKET 1
+
+/* Define if you have the 'socketpair' function. */
+#define HAVE_SOCKETPAIR 1
+
+/* Define to 1 if you have the <spawn.h> header file. */
+#define HAVE_SPAWN_H 1
+
+/* Define to 1 if you have the `splice' function. */
+/* #undef HAVE_SPLICE */
+
+/* Define if your compiler provides ssize_t */
+#define HAVE_SSIZE_T 1
+
+/* Define to 1 if you have the `statvfs' function. */
+/* #undef HAVE_STATVFS */
+
+/* Define if you have struct stat.st_mtim.tv_nsec */
+#define HAVE_STAT_TV_NSEC 1
+
+/* Define if you have struct stat.st_mtimensec */
+/* #undef HAVE_STAT_TV_NSEC2 */
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdio.h> header file. */
+#define HAVE_STDIO_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define if you have <stdatomic.h> with atomic_int and atomic_uintptr_t */
+#define HAVE_STD_ATOMIC 1
+
+/* Define to 1 if you have the `strftime' function. */
+#define HAVE_STRFTIME 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strlcpy' function. */
+#define HAVE_STRLCPY 1
+
+/* Define to 1 if you have the <stropts.h> header file. */
+/* #undef HAVE_STROPTS_H */
+
+/* Define to 1 if you have the `strsignal' function. */
+#define HAVE_STRSIGNAL 1
+
+/* Define to 1 if `pw_gecos' is a member of `struct passwd'. */
+#define HAVE_STRUCT_PASSWD_PW_GECOS 1
+
+/* Define to 1 if `pw_passwd' is a member of `struct passwd'. */
+#define HAVE_STRUCT_PASSWD_PW_PASSWD 1
+
+/* Define to 1 if `st_birthtime' is a member of `struct stat'. */
+/* #undef HAVE_STRUCT_STAT_ST_BIRTHTIME */
+
+/* Define to 1 if `st_blksize' is a member of `struct stat'. */
+#define HAVE_STRUCT_STAT_ST_BLKSIZE 1
+
+/* Define to 1 if `st_blocks' is a member of `struct stat'. */
+#define HAVE_STRUCT_STAT_ST_BLOCKS 1
+
+/* Define to 1 if `st_flags' is a member of `struct stat'. */
+/* #undef HAVE_STRUCT_STAT_ST_FLAGS */
+
+/* Define to 1 if `st_gen' is a member of `struct stat'. */
+/* #undef HAVE_STRUCT_STAT_ST_GEN */
+
+/* Define to 1 if `st_rdev' is a member of `struct stat'. */
+#define HAVE_STRUCT_STAT_ST_RDEV 1
+
+/* Define to 1 if `tm_zone' is a member of `struct tm'. */
+/* #undef HAVE_STRUCT_TM_TM_ZONE */
+
+/* Define if you have the 'symlink' function. */
+#define HAVE_SYMLINK 1
+
+/* Define to 1 if you have the `symlinkat' function. */
+#define HAVE_SYMLINKAT 1
+
+/* Define to 1 if you have the `sync' function. */
+/* #undef HAVE_SYNC */
+
+/* Define to 1 if you have the `sysconf' function. */
+#define HAVE_SYSCONF 1
+
+/* Define to 1 if you have the <sysexits.h> header file. */
+/* #undef HAVE_SYSEXITS_H */
+
+/* Define to 1 if you have the <syslog.h> header file. */
+#define HAVE_SYSLOG_H 1
+
+/* Define to 1 if you have the `system' function. */
+#define HAVE_SYSTEM 1
+
+/* Define to 1 if you have the <sys/audioio.h> header file. */
+/* #undef HAVE_SYS_AUDIOIO_H */
+
+/* Define to 1 if you have the <sys/auxv.h> header file. */
+/* #undef HAVE_SYS_AUXV_H */
+
+/* Define to 1 if you have the <sys/bsdtty.h> header file. */
+/* #undef HAVE_SYS_BSDTTY_H */
+
+/* Define to 1 if you have the <sys/devpoll.h> header file. */
+/* #undef HAVE_SYS_DEVPOLL_H */
+
+/* Define to 1 if you have the <sys/dir.h> header file, and it defines `DIR'.
+   */
+/* #undef HAVE_SYS_DIR_H */
+
+/* Define to 1 if you have the <sys/endian.h> header file. */
+#define HAVE_SYS_ENDIAN_H 1
+
+/* Define to 1 if you have the <sys/epoll.h> header file. */
+/* #undef HAVE_SYS_EPOLL_H */
+
+/* Define to 1 if you have the <sys/eventfd.h> header file. */
+/* #undef HAVE_SYS_EVENTFD_H */
+
+/* Define to 1 if you have the <sys/event.h> header file. */
+/* #undef HAVE_SYS_EVENT_H */
+
+/* Define to 1 if you have the <sys/file.h> header file. */
+#define HAVE_SYS_FILE_H 1
+
+/* Define to 1 if you have the <sys/ioctl.h> header file. */
+#define HAVE_SYS_IOCTL_H 1
+
+/* Define to 1 if you have the <sys/kern_control.h> header file. */
+/* #undef HAVE_SYS_KERN_CONTROL_H */
+
+/* Define to 1 if you have the <sys/loadavg.h> header file. */
+/* #undef HAVE_SYS_LOADAVG_H */
+
+/* Define to 1 if you have the <sys/lock.h> header file. */
+#define HAVE_SYS_LOCK_H 1
+
+/* Define to 1 if you have the <sys/memfd.h> header file. */
+/* #undef HAVE_SYS_MEMFD_H */
+
+/* Define to 1 if you have the <sys/mkdev.h> header file. */
+/* #undef HAVE_SYS_MKDEV_H */
+
+/* Define to 1 if you have the <sys/mman.h> header file. */
+#define HAVE_SYS_MMAN_H 1
+
+/* Define to 1 if you have the <sys/modem.h> header file. */
+/* #undef HAVE_SYS_MODEM_H */
+
+/* Define to 1 if you have the <sys/ndir.h> header file, and it defines `DIR'.
+   */
+/* #undef HAVE_SYS_NDIR_H */
+
+/* Define to 1 if you have the <sys/param.h> header file. */
+#define HAVE_SYS_PARAM_H 1
+
+/* Define to 1 if you have the <sys/poll.h> header file. */
+/* #undef HAVE_SYS_POLL_H */
+
+/* Define to 1 if you have the <sys/random.h> header file. */
+/* #undef HAVE_SYS_RANDOM_H */
+
+/* Define to 1 if you have the <sys/resource.h> header file. */
+#define HAVE_SYS_RESOURCE_H 1
+
+/* Define to 1 if you have the <sys/select.h> header file. */
+#define HAVE_SYS_SELECT_H 1
+
+/* Define to 1 if you have the <sys/sendfile.h> header file. */
+/* #undef HAVE_SYS_SENDFILE_H */
+
+/* Define to 1 if you have the <sys/socket.h> header file. */
+#define HAVE_SYS_SOCKET_H 1
+
+/* Define to 1 if you have the <sys/soundcard.h> header file. */
+/* #undef HAVE_SYS_SOUNDCARD_H */
+
+/* Define to 1 if you have the <sys/statvfs.h> header file. */
+#define HAVE_SYS_STATVFS_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/syscall.h> header file. */
+/* #undef HAVE_SYS_SYSCALL_H */
+
+/* Define to 1 if you have the <sys/sysmacros.h> header file. */
+/* #undef HAVE_SYS_SYSMACROS_H */
+
+/* Define to 1 if you have the <sys/sys_domain.h> header file. */
+/* #undef HAVE_SYS_SYS_DOMAIN_H */
+
+/* Define to 1 if you have the <sys/termio.h> header file. */
+/* #undef HAVE_SYS_TERMIO_H */
+
+/* Define to 1 if you have the <sys/times.h> header file. */
+#define HAVE_SYS_TIMES_H 1
+
+/* Define to 1 if you have the <sys/time.h> header file. */
+#define HAVE_SYS_TIME_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <sys/uio.h> header file. */
+#define HAVE_SYS_UIO_H 1
+
+/* Define to 1 if you have the <sys/un.h> header file. */
+#define HAVE_SYS_UN_H 1
+
+/* Define to 1 if you have the <sys/utsname.h> header file. */
+#define HAVE_SYS_UTSNAME_H 1
+
+/* Define to 1 if you have the <sys/wait.h> header file. */
+#define HAVE_SYS_WAIT_H 1
+
+/* Define to 1 if you have the <sys/xattr.h> header file. */
+/* #undef HAVE_SYS_XATTR_H */
+
+/* Define to 1 if you have the `tcdrain' function. */
+/* #undef HAVE_TCDRAIN */
+
+/* Define to 1 if you have the `tcflow' function. */
+/* #undef HAVE_TCFLOW */
+
+/* Define to 1 if you have the `tcflush' function. */
+/* #undef HAVE_TCFLUSH */
+
+/* Define to 1 if you have the `tcgetpgrp' function. */
+/* #undef HAVE_TCGETPGRP */
+
+/* Define to 1 if you have the `tcsendbreak' function. */
+/* #undef HAVE_TCSENDBREAK */
+
+/* Define to 1 if you have the `tcsetpgrp' function. */
+/* #undef HAVE_TCSETPGRP */
+
+/* Define to 1 if you have the `tempnam' function. */
+#define HAVE_TEMPNAM 1
+
+/* Define to 1 if you have the <termios.h> header file. */
+#define HAVE_TERMIOS_H 1
+
+/* Define to 1 if you have the <term.h> header file. */
+/* #undef HAVE_TERM_H */
+
+/* Define to 1 if you have the `timegm' function. */
+/* #undef HAVE_TIMEGM */
+
+/* Define to 1 if you have the `times' function. */
+#define HAVE_TIMES 1
+
+/* Define to 1 if you have the `tmpfile' function. */
+#define HAVE_TMPFILE 1
+
+/* Define to 1 if you have the `tmpnam' function. */
+#define HAVE_TMPNAM 1
+
+/* Define to 1 if you have the `tmpnam_r' function. */
+/* #undef HAVE_TMPNAM_R */
+
+/* Define to 1 if your `struct tm' has `tm_zone'. Deprecated, use
+   `HAVE_STRUCT_TM_TM_ZONE' instead. */
+/* #undef HAVE_TM_ZONE */
+
+/* Define to 1 if you have the `truncate' function. */
+#define HAVE_TRUNCATE 1
+
+/* Define to 1 if you have the `ttyname' function. */
+/* #undef HAVE_TTYNAME */
+
+/* Define to 1 if you don't have `tm_zone' but do have the external array
+   `tzname'. */
+#define HAVE_TZNAME 1
+
+/* Define to 1 if you have the `umask' function. */
+#define HAVE_UMASK 1
+
+/* Define to 1 if you have the `uname' function. */
+#define HAVE_UNAME 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Define to 1 if you have the `unlinkat' function. */
+#define HAVE_UNLINKAT 1
+
+/* Define to 1 if you have the `unshare' function. */
+/* #undef HAVE_UNSHARE */
+
+/* Define if you have a usable wchar_t type defined in wchar.h; usable means
+   wchar_t must be an unsigned type with at least 16 bits (see
+   Include/unicodeobject.h). */
+/* #undef HAVE_USABLE_WCHAR_T */
+
+/* Define to 1 if you have the <util.h> header file. */
+/* #undef HAVE_UTIL_H */
+
+/* Define to 1 if you have the `utimensat' function. */
+#define HAVE_UTIMENSAT 1
+
+/* Define to 1 if you have the `utimes' function. */
+#define HAVE_UTIMES 1
+
+/* Define to 1 if you have the <utime.h> header file. */
+#define HAVE_UTIME_H 1
+
+/* Define to 1 if you have the <utmp.h> header file. */
+#define HAVE_UTMP_H 1
+
+/* Define to 1 if you have the `uuid_create' function. */
+/* #undef HAVE_UUID_CREATE */
+
+/* Define to 1 if you have the `uuid_enc_be' function. */
+/* #undef HAVE_UUID_ENC_BE */
+
+/* Define if uuid_generate_time_safe() exists. */
+/* #undef HAVE_UUID_GENERATE_TIME_SAFE */
+
+/* Define to 1 if you have the <uuid.h> header file. */
+/* #undef HAVE_UUID_H */
+
+/* Define to 1 if you have the <uuid/uuid.h> header file. */
+/* #undef HAVE_UUID_UUID_H */
+
+/* Define to 1 if you have the `vfork' function. */
+/* #undef HAVE_VFORK */
+
+/* Define to 1 if you have the `wait' function. */
+/* #undef HAVE_WAIT */
+
+/* Define to 1 if you have the `wait3' function. */
+/* #undef HAVE_WAIT3 */
+
+/* Define to 1 if you have the `wait4' function. */
+/* #undef HAVE_WAIT4 */
+
+/* Define to 1 if you have the `waitid' function. */
+/* #undef HAVE_WAITID */
+
+/* Define to 1 if you have the `waitpid' function. */
+#define HAVE_WAITPID 1
+
+/* Define if the compiler provides a wchar.h header file. */
+#define HAVE_WCHAR_H 1
+
+/* Define to 1 if you have the `wcscoll' function. */
+#define HAVE_WCSCOLL 1
+
+/* Define to 1 if you have the `wcsftime' function. */
+#define HAVE_WCSFTIME 1
+
+/* Define to 1 if you have the `wcsxfrm' function. */
+#define HAVE_WCSXFRM 1
+
+/* Define to 1 if you have the `wmemcmp' function. */
+#define HAVE_WMEMCMP 1
+
+/* Define if tzset() actually switches the local timezone in a meaningful way.
+   */
+/* #undef HAVE_WORKING_TZSET */
+
+/* Define to 1 if you have the `writev' function. */
+#define HAVE_WRITEV 1
+
+/* Define if the zlib library has inflateCopy */
+#define HAVE_ZLIB_COPY 1
+
+/* Define to 1 if you have the <zlib.h> header file. */
+/* #undef HAVE_ZLIB_H */
+
+/* Define to 1 if you have the `_getpty' function. */
+/* #undef HAVE__GETPTY */
+
+/* Define to 1 if `major', `minor', and `makedev' are declared in <mkdev.h>.
+   */
+/* #undef MAJOR_IN_MKDEV */
+
+/* Define to 1 if `major', `minor', and `makedev' are declared in
+   <sysmacros.h>. */
+/* #undef MAJOR_IN_SYSMACROS */
+
+/* Define if mvwdelch in curses.h is an expression. */
+/* #undef MVWDELCH_IS_EXPRESSION */
+
+/* Define to the address where bug reports for this package should be sent. */
+/* #undef PACKAGE_BUGREPORT */
+
+/* Define to the full name of this package. */
+/* #undef PACKAGE_NAME */
+
+/* Define to the full name and version of this package. */
+/* #undef PACKAGE_STRING */
+
+/* Define to the one symbol short name of this package. */
+/* #undef PACKAGE_TARNAME */
+
+/* Define to the home page for this package. */
+/* #undef PACKAGE_URL */
+
+/* Define to the version of this package. */
+/* #undef PACKAGE_VERSION */
+
+/* Define if POSIX semaphores aren't enabled on your system */
+/* #undef POSIX_SEMAPHORES_NOT_ENABLED */
+
+/* Define if pthread_key_t is compatible with int. */
+#define PTHREAD_KEY_T_IS_COMPATIBLE_WITH_INT 1
+
+/* Defined if PTHREAD_SCOPE_SYSTEM supported. */
+/* #undef PTHREAD_SYSTEM_SCHED_SUPPORTED */
+
+/* Define as the preferred size in bits of long digits */
+/* #undef PYLONG_BITS_IN_DIGIT */
+
+/* Comma-separated list of enabled built-in hash modules */
+#define PY_BUILTIN_HASHLIB_HASHES "md5,sha1,sha2,sha3,blake2"
+
+/* Define if you want to coerce the C locale to a UTF-8 based locale */
+#define PY_COERCE_C_LOCALE 1
+
+/* Define to 1 if you have the perf trampoline. */
+/* #undef PY_HAVE_PERF_TRAMPOLINE */
+
+/* Define to 1 to build the sqlite module with loadable extensions support. */
+/* #undef PY_SQLITE_ENABLE_LOAD_EXTENSION */
+
+/* Define if SQLite was compiled with the serialize API */
+#define PY_SQLITE_HAVE_SERIALIZE 1
+
+/* Default cipher suites list for ssl module. 1: Python's preferred selection,
+   2: leave OpenSSL defaults untouched, 0: custom string */
+#define PY_SSL_DEFAULT_CIPHERS 1
+
+/* Cipher suite string for PY_SSL_DEFAULT_CIPHERS=0 */
+/* #undef PY_SSL_DEFAULT_CIPHER_STRING */
+
+/* PEP 11 Support tier (1, 2, 3 or 0 for unsupported) */
+#define PY_SUPPORT_TIER 0
+
+/* Define if you want to build an interpreter with many run-time checks. */
+/* #undef Py_DEBUG */
+
+/* Defined if Python is built as a shared library. */
+/* #undef Py_ENABLE_SHARED */
+
+/* Define hash algorithm for str, bytes and memoryview. SipHash24: 1, FNV: 2,
+   SipHash13: 3, externally defined: 0 */
+/* #undef Py_HASH_ALGORITHM */
+
+/* Define if you want to enable internal statistics gathering. */
+/* #undef Py_STATS */
+
+/* The version of SunOS/Solaris as reported by `uname -r' without the dot. */
+/* #undef Py_SUNOS_VERSION */
+
+/* Define if you want to enable tracing references for debugging purpose */
+/* #undef Py_TRACE_REFS */
+
+/* Assume C89 semantics: RETSIGTYPE is always void */
+#define RETSIGTYPE void
+
+/* Define if setpgrp() must be called as setpgrp(0, 0). */
+/* #undef SETPGRP_HAVE_ARG */
+
+/* Define if i>>j for signed int i does not extend the sign bit when i < 0 */
+/* #undef SIGNED_RIGHT_SHIFT_ZERO_FILLS */
+
+/* The size of `double', as computed by sizeof. */
+#define SIZEOF_DOUBLE 8
+
+/* The size of `float', as computed by sizeof. */
+#define SIZEOF_FLOAT 4
+
+/* The size of `fpos_t', as computed by sizeof. */
+#define SIZEOF_FPOS_T 4
+
+/* The size of `int', as computed by sizeof. */
+#define SIZEOF_INT 4
+
+/* The size of `long', as computed by sizeof. */
+#define SIZEOF_LONG 4
+
+/* The size of `long double', as computed by sizeof. */
+#define SIZEOF_LONG_DOUBLE 12
+
+/* The size of `long long', as computed by sizeof. */
+#define SIZEOF_LONG_LONG 8
+
+/* The size of `off_t', as computed by sizeof. */
+#define SIZEOF_OFF_T 8
+
+/* The size of `pid_t', as computed by sizeof. */
+#define SIZEOF_PID_T 4
+
+/* The size of `pthread_key_t', as computed by sizeof. */
+#define SIZEOF_PTHREAD_KEY_T 4
+
+/* The size of `pthread_t', as computed by sizeof. */
+#define SIZEOF_PTHREAD_T 4
+
+/* The size of `short', as computed by sizeof. */
+#define SIZEOF_SHORT 2
+
+/* The size of `size_t', as computed by sizeof. */
+#define SIZEOF_SIZE_T 4
+
+/* The size of `time_t', as computed by sizeof. */
+#define SIZEOF_TIME_T 8
+
+/* The size of `uintptr_t', as computed by sizeof. */
+#define SIZEOF_UINTPTR_T 4
+
+/* The size of `void *', as computed by sizeof. */
+#define SIZEOF_VOID_P 4
+
+/* The size of `wchar_t', as computed by sizeof. */
+#define SIZEOF_WCHAR_T 4
+
+/* The size of `_Bool', as computed by sizeof. */
+#define SIZEOF__BOOL 1
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Define if you can safely include both <sys/select.h> and <sys/time.h>
+   (which you can't on SCO ODT 3.0). */
+#define SYS_SELECT_WITH_SYS_TIME 1
+
+/* Custom thread stack size depending on chosen sanitizer runtimes. */
+/* #undef THREAD_STACK_SIZE */
+
+/* Library needed by timemodule.c: librt may be needed for clock_gettime() */
+/* #undef TIMEMODULE_LIB */
+
+/* Define to 1 if your <sys/time.h> declares `struct tm'. */
+/* #undef TM_IN_SYS_TIME */
+
+/* Define if you want to use computed gotos in ceval.c. */
+#define USE_COMPUTED_GOTOS 1
+
+/* Enable extensions on AIX 3, Interix.  */
+#ifndef _ALL_SOURCE
+# define _ALL_SOURCE 1
+#endif
+/* Enable general extensions on macOS.  */
+#ifndef _DARWIN_C_SOURCE
+# define _DARWIN_C_SOURCE 1
+#endif
+/* Enable general extensions on Solaris.  */
+#ifndef __EXTENSIONS__
+# define __EXTENSIONS__ 1
+#endif
+/* Enable GNU extensions on systems that have them.  */
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE 1
+#endif
+/* Enable X/Open compliant socket functions that do not require linking
+   with -lxnet on HP-UX 11.11.  */
+#ifndef _HPUX_ALT_XOPEN_SOCKET_API
+# define _HPUX_ALT_XOPEN_SOCKET_API 1
+#endif
+/* Identify the host operating system as Minix.
+   This macro does not affect the system headers' behavior.
+   A future release of Autoconf may stop defining this macro.  */
+#ifndef _MINIX
+/* # undef _MINIX */
+#endif
+/* Enable general extensions on NetBSD.
+   Enable NetBSD compatibility extensions on Minix.  */
+#ifndef _NETBSD_SOURCE
+# define _NETBSD_SOURCE 1
+#endif
+/* Enable OpenBSD compatibility extensions on NetBSD.
+   Oddly enough, this does nothing on OpenBSD.  */
+#ifndef _OPENBSD_SOURCE
+# define _OPENBSD_SOURCE 1
+#endif
+/* Define to 1 if needed for POSIX-compatible behavior.  */
+#ifndef _POSIX_SOURCE
+/* # undef _POSIX_SOURCE */
+#endif
+/* Define to 2 if needed for POSIX-compatible behavior.  */
+#ifndef _POSIX_1_SOURCE
+/* # undef _POSIX_1_SOURCE */
+#endif
+/* Enable POSIX-compatible threading on Solaris.  */
+#ifndef _POSIX_PTHREAD_SEMANTICS
+# define _POSIX_PTHREAD_SEMANTICS 1
+#endif
+/* Enable extensions specified by ISO/IEC TS 18661-5:2014.  */
+#ifndef __STDC_WANT_IEC_60559_ATTRIBS_EXT__
+# define __STDC_WANT_IEC_60559_ATTRIBS_EXT__ 1
+#endif
+/* Enable extensions specified by ISO/IEC TS 18661-1:2014.  */
+#ifndef __STDC_WANT_IEC_60559_BFP_EXT__
+# define __STDC_WANT_IEC_60559_BFP_EXT__ 1
+#endif
+/* Enable extensions specified by ISO/IEC TS 18661-2:2015.  */
+#ifndef __STDC_WANT_IEC_60559_DFP_EXT__
+# define __STDC_WANT_IEC_60559_DFP_EXT__ 1
+#endif
+/* Enable extensions specified by ISO/IEC TS 18661-4:2015.  */
+#ifndef __STDC_WANT_IEC_60559_FUNCS_EXT__
+# define __STDC_WANT_IEC_60559_FUNCS_EXT__ 1
+#endif
+/* Enable extensions specified by ISO/IEC TS 18661-3:2015.  */
+#ifndef __STDC_WANT_IEC_60559_TYPES_EXT__
+# define __STDC_WANT_IEC_60559_TYPES_EXT__ 1
+#endif
+/* Enable extensions specified by ISO/IEC TR 24731-2:2010.  */
+#ifndef __STDC_WANT_LIB_EXT2__
+# define __STDC_WANT_LIB_EXT2__ 1
+#endif
+/* Enable extensions specified by ISO/IEC 24747:2009.  */
+#ifndef __STDC_WANT_MATH_SPEC_FUNCS__
+# define __STDC_WANT_MATH_SPEC_FUNCS__ 1
+#endif
+/* Enable extensions on HP NonStop.  */
+#ifndef _TANDEM_SOURCE
+# define _TANDEM_SOURCE 1
+#endif
+/* Enable X/Open extensions.  Define to 500 only if necessary
+   to make mbstate_t available.  */
+#ifndef _XOPEN_SOURCE
+# define _XOPEN_SOURCE 700
+#endif
+
+
+/* Define if WINDOW in curses.h offers a field _flags. */
+/* #undef WINDOW_HAS_FLAGS */
+
+/* Define if you want to build the _decimal module using a coroutine-local
+   rather than a thread-local context */
+#define WITH_DECIMAL_CONTEXTVAR 1
+
+/* Define if you want documentation strings in extension modules */
+#define WITH_DOC_STRINGS 1
+
+/* Define if you want to compile in DTrace support */
+/* #undef WITH_DTRACE */
+
+/* Define if you want to use the new-style (Openstep, Rhapsody, MacOS) dynamic
+   linker (dyld) instead of the old-style (NextStep) dynamic linker (rld).
+   Dyld is necessary to support frameworks. */
+/* #undef WITH_DYLD */
+
+/* Define to build the readline module against libedit. */
+/* #undef WITH_EDITLINE */
+
+/* Define if you want to compile in object freelists optimization */
+#define WITH_FREELISTS 1
+
+/* Define to 1 if libintl is needed for locale functions. */
+/* #undef WITH_LIBINTL */
+
+/* Define if you want to produce an OpenStep/Rhapsody framework (shared
+   library plus accessory files). */
+/* #undef WITH_NEXT_FRAMEWORK */
+
+/* Define if you want to compile in Python-specific mallocs */
+#define WITH_PYMALLOC 1
+
+/* Define if you want pymalloc to be disabled when running under valgrind */
+/* #undef WITH_VALGRIND */
+
+/* Define WORDS_BIGENDIAN to 1 if your processor stores words with the most
+   significant byte first (like Motorola and SPARC, unlike Intel). */
+#if defined AC_APPLE_UNIVERSAL_BUILD
+# if defined __BIG_ENDIAN__
+#  define WORDS_BIGENDIAN 1
+# endif
+#else
+# ifndef WORDS_BIGENDIAN
+/* #  undef WORDS_BIGENDIAN */
+# endif
+#endif
+
+/* Define if arithmetic is subject to x87-style double rounding issue */
+/* #undef X87_DOUBLE_ROUNDING */
+
+/* Define on OpenBSD to activate all library features */
+/* #undef _BSD_SOURCE */
+
+/* Define on Darwin to activate all library features */
+#define _DARWIN_C_SOURCE 1
+
+/* This must be set to 64 on some systems to enable large file support. */
+#define _FILE_OFFSET_BITS 64
+
+/* Define to include mbstate_t for mbrtowc */
+/* #undef _INCLUDE__STDC_A1_SOURCE */
+
+/* This must be defined on some systems to enable large file support. */
+#define _LARGEFILE_SOURCE 1
+
+/* This must be defined on AIX systems to enable large file support. */
+/* #undef _LARGE_FILES */
+
+/* Define on NetBSD to activate all library features */
+#define _NETBSD_SOURCE 1
+
+/* Define to activate features from IEEE Stds 1003.1-2008 */
+#define _POSIX_C_SOURCE 200809L
+
+/* Define if you have POSIX threads, and your system does not define that. */
+/* #undef _POSIX_THREADS */
+
+/* framework name */
+#define _PYTHONFRAMEWORK ""
+
+/* Define to force use of thread-safe errno, h_errno, and other functions */
+#define _REENTRANT 1
+
+/* Define to 1 if you want to emulate getpid() on WASI */
+/* #undef _WASI_EMULATED_GETPID */
+
+/* Define to 1 if you want to emulate process clocks on WASI */
+/* #undef _WASI_EMULATED_PROCESS_CLOCKS */
+
+/* Define to 1 if you want to emulate signals on WASI */
+/* #undef _WASI_EMULATED_SIGNAL */
+
+/* Define to the level of X/Open that your system supports */
+#define _XOPEN_SOURCE 700
+
+/* Define to activate Unix95-and-earlier features */
+#define _XOPEN_SOURCE_EXTENDED 1
+
+/* Define on FreeBSD to activate all library features */
+#define __BSD_VISIBLE 1
+
+/* Define to 'long' if <time.h> doesn't define. */
+/* #undef clock_t */
+
+/* Define to empty if `const' does not conform to ANSI C. */
+/* #undef const */
+
+/* Define to `int' if <sys/types.h> doesn't define. */
+/* #undef gid_t */
+
+/* Define to `int' if <sys/types.h> does not define. */
+/* #undef mode_t */
+
+/* Define to `long int' if <sys/types.h> does not define. */
+/* #undef off_t */
+
+/* Define as a signed integer type capable of holding a process identifier. */
+/* #undef pid_t */
+
+/* Define to empty if the keyword does not work. */
+/* #undef signed */
+
+/* Define to `unsigned int' if <sys/types.h> does not define. */
+/* #undef size_t */
+
+/* Define to `int' if <sys/socket.h> does not define. */
+/* #undef socklen_t */
+
+/* Define to `int' if <sys/types.h> doesn't define. */
+/* #undef uid_t */
+
+
+/* Define the macros needed if on a UnixWare 7.x system. */
+#if defined(__USLC__) && defined(__SCO_VERSION__)
+#define STRICT_SYSV_CURSES /* Don't use ncurses extensions */
+#endif
+
+#endif /*Py_PYCONFIG_H*/
+
diff --git a/nanvix-port/cpython-headers/python3.12/pydtrace.h b/nanvix-port/cpython-headers/python3.12/pydtrace.h
new file mode 100644
index 000000000000..e197d3669453
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pydtrace.h
@@ -0,0 +1,59 @@
+/* Static DTrace probes interface */
+
+#ifndef Py_DTRACE_H
+#define Py_DTRACE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef WITH_DTRACE
+
+#include "pydtrace_probes.h"
+
+/* pydtrace_probes.h, on systems with DTrace, is auto-generated to include
+   `PyDTrace_{PROBE}` and `PyDTrace_{PROBE}_ENABLED()` macros for every probe
+   defined in pydtrace.d.
+
+   Calling these functions must be guarded by a `PyDTrace_{PROBE}_ENABLED()`
+   check to minimize performance impact when probing is off. For example:
+
+       if (PyDTrace_FUNCTION_ENTRY_ENABLED())
+           PyDTrace_FUNCTION_ENTRY(f);
+*/
+
+#else
+
+/* Without DTrace, probes compile to empty stubs and *_ENABLED() returns 0. */
+
+static inline void PyDTrace_LINE(const char *arg0, const char *arg1, int arg2) {}
+static inline void PyDTrace_FUNCTION_ENTRY(const char *arg0, const char *arg1, int arg2)  {}
+static inline void PyDTrace_FUNCTION_RETURN(const char *arg0, const char *arg1, int arg2) {}
+static inline void PyDTrace_GC_START(int arg0) {}
+static inline void PyDTrace_GC_DONE(Py_ssize_t arg0) {}
+static inline void PyDTrace_INSTANCE_NEW_START(int arg0) {}
+static inline void PyDTrace_INSTANCE_NEW_DONE(int arg0) {}
+static inline void PyDTrace_INSTANCE_DELETE_START(int arg0) {}
+static inline void PyDTrace_INSTANCE_DELETE_DONE(int arg0) {}
+static inline void PyDTrace_IMPORT_FIND_LOAD_START(const char *arg0) {}
+static inline void PyDTrace_IMPORT_FIND_LOAD_DONE(const char *arg0, int arg1) {}
+static inline void PyDTrace_AUDIT(const char *arg0, void *arg1) {}
+
+static inline int PyDTrace_LINE_ENABLED(void) { return 0; }
+static inline int PyDTrace_FUNCTION_ENTRY_ENABLED(void) { return 0; }
+static inline int PyDTrace_FUNCTION_RETURN_ENABLED(void) { return 0; }
+static inline int PyDTrace_GC_START_ENABLED(void) { return 0; }
+static inline int PyDTrace_GC_DONE_ENABLED(void) { return 0; }
+static inline int PyDTrace_INSTANCE_NEW_START_ENABLED(void) { return 0; }
+static inline int PyDTrace_INSTANCE_NEW_DONE_ENABLED(void) { return 0; }
+static inline int PyDTrace_INSTANCE_DELETE_START_ENABLED(void) { return 0; }
+static inline int PyDTrace_INSTANCE_DELETE_DONE_ENABLED(void) { return 0; }
+static inline int PyDTrace_IMPORT_FIND_LOAD_START_ENABLED(void) { return 0; }
+static inline int PyDTrace_IMPORT_FIND_LOAD_DONE_ENABLED(void) { return 0; }
+static inline int PyDTrace_AUDIT_ENABLED(void) { return 0; }
+
+#endif /* !WITH_DTRACE */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_DTRACE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pyerrors.h b/nanvix-port/cpython-headers/python3.12/pyerrors.h
new file mode 100644
index 000000000000..d089fa717793
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pyerrors.h
@@ -0,0 +1,337 @@
+#ifndef Py_ERRORS_H
+#define Py_ERRORS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdarg.h>               // va_list
+
+/* Error handling definitions */
+
+PyAPI_FUNC(void) PyErr_SetNone(PyObject *);
+PyAPI_FUNC(void) PyErr_SetObject(PyObject *, PyObject *);
+PyAPI_FUNC(void) PyErr_SetString(
+    PyObject *exception,
+    const char *string   /* decoded from utf-8 */
+    );
+PyAPI_FUNC(PyObject *) PyErr_Occurred(void);
+PyAPI_FUNC(void) PyErr_Clear(void);
+PyAPI_FUNC(void) PyErr_Fetch(PyObject **, PyObject **, PyObject **);
+PyAPI_FUNC(void) PyErr_Restore(PyObject *, PyObject *, PyObject *);
+PyAPI_FUNC(PyObject *) PyErr_GetRaisedException(void);
+PyAPI_FUNC(void) PyErr_SetRaisedException(PyObject *);
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030b0000
+PyAPI_FUNC(PyObject*) PyErr_GetHandledException(void);
+PyAPI_FUNC(void) PyErr_SetHandledException(PyObject *);
+#endif
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+PyAPI_FUNC(void) PyErr_GetExcInfo(PyObject **, PyObject **, PyObject **);
+PyAPI_FUNC(void) PyErr_SetExcInfo(PyObject *, PyObject *, PyObject *);
+#endif
+
+/* Defined in Python/pylifecycle.c
+
+   The Py_FatalError() function is replaced with a macro which logs
+   automatically the name of the current function, unless the Py_LIMITED_API
+   macro is defined. */
+PyAPI_FUNC(void) _Py_NO_RETURN Py_FatalError(const char *message);
+
+/* Error testing and normalization */
+PyAPI_FUNC(int) PyErr_GivenExceptionMatches(PyObject *, PyObject *);
+PyAPI_FUNC(int) PyErr_ExceptionMatches(PyObject *);
+PyAPI_FUNC(void) PyErr_NormalizeException(PyObject**, PyObject**, PyObject**);
+
+/* Traceback manipulation (PEP 3134) */
+PyAPI_FUNC(int) PyException_SetTraceback(PyObject *, PyObject *);
+PyAPI_FUNC(PyObject *) PyException_GetTraceback(PyObject *);
+
+/* Cause manipulation (PEP 3134) */
+PyAPI_FUNC(PyObject *) PyException_GetCause(PyObject *);
+PyAPI_FUNC(void) PyException_SetCause(PyObject *, PyObject *);
+
+/* Context manipulation (PEP 3134) */
+PyAPI_FUNC(PyObject *) PyException_GetContext(PyObject *);
+PyAPI_FUNC(void) PyException_SetContext(PyObject *, PyObject *);
+
+
+PyAPI_FUNC(PyObject *) PyException_GetArgs(PyObject *);
+PyAPI_FUNC(void) PyException_SetArgs(PyObject *, PyObject *);
+
+/* Exception class and instance checks */
+
+#define PyExceptionClass_Check(x)                                       \
+    (PyType_Check((x)) &&                                               \
+     PyType_FastSubclass((PyTypeObject*)(x), Py_TPFLAGS_BASE_EXC_SUBCLASS))
+
+#define PyExceptionInstance_Check(x)                    \
+    PyType_FastSubclass(Py_TYPE(x), Py_TPFLAGS_BASE_EXC_SUBCLASS)
+
+PyAPI_FUNC(const char *) PyExceptionClass_Name(PyObject *);
+
+#define PyExceptionInstance_Class(x) _PyObject_CAST(Py_TYPE(x))
+
+#define _PyBaseExceptionGroup_Check(x)                   \
+    PyObject_TypeCheck((x), (PyTypeObject *)PyExc_BaseExceptionGroup)
+
+/* Predefined exceptions */
+
+PyAPI_DATA(PyObject *) PyExc_BaseException;
+PyAPI_DATA(PyObject *) PyExc_Exception;
+PyAPI_DATA(PyObject *) PyExc_BaseExceptionGroup;
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+PyAPI_DATA(PyObject *) PyExc_StopAsyncIteration;
+#endif
+PyAPI_DATA(PyObject *) PyExc_StopIteration;
+PyAPI_DATA(PyObject *) PyExc_GeneratorExit;
+PyAPI_DATA(PyObject *) PyExc_ArithmeticError;
+PyAPI_DATA(PyObject *) PyExc_LookupError;
+
+PyAPI_DATA(PyObject *) PyExc_AssertionError;
+PyAPI_DATA(PyObject *) PyExc_AttributeError;
+PyAPI_DATA(PyObject *) PyExc_BufferError;
+PyAPI_DATA(PyObject *) PyExc_EOFError;
+PyAPI_DATA(PyObject *) PyExc_FloatingPointError;
+PyAPI_DATA(PyObject *) PyExc_OSError;
+PyAPI_DATA(PyObject *) PyExc_ImportError;
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03060000
+PyAPI_DATA(PyObject *) PyExc_ModuleNotFoundError;
+#endif
+PyAPI_DATA(PyObject *) PyExc_IndexError;
+PyAPI_DATA(PyObject *) PyExc_KeyError;
+PyAPI_DATA(PyObject *) PyExc_KeyboardInterrupt;
+PyAPI_DATA(PyObject *) PyExc_MemoryError;
+PyAPI_DATA(PyObject *) PyExc_NameError;
+PyAPI_DATA(PyObject *) PyExc_OverflowError;
+PyAPI_DATA(PyObject *) PyExc_RuntimeError;
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+PyAPI_DATA(PyObject *) PyExc_RecursionError;
+#endif
+PyAPI_DATA(PyObject *) PyExc_NotImplementedError;
+PyAPI_DATA(PyObject *) PyExc_SyntaxError;
+PyAPI_DATA(PyObject *) PyExc_IndentationError;
+PyAPI_DATA(PyObject *) PyExc_TabError;
+PyAPI_DATA(PyObject *) PyExc_ReferenceError;
+PyAPI_DATA(PyObject *) PyExc_SystemError;
+PyAPI_DATA(PyObject *) PyExc_SystemExit;
+PyAPI_DATA(PyObject *) PyExc_TypeError;
+PyAPI_DATA(PyObject *) PyExc_UnboundLocalError;
+PyAPI_DATA(PyObject *) PyExc_UnicodeError;
+PyAPI_DATA(PyObject *) PyExc_UnicodeEncodeError;
+PyAPI_DATA(PyObject *) PyExc_UnicodeDecodeError;
+PyAPI_DATA(PyObject *) PyExc_UnicodeTranslateError;
+PyAPI_DATA(PyObject *) PyExc_ValueError;
+PyAPI_DATA(PyObject *) PyExc_ZeroDivisionError;
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+PyAPI_DATA(PyObject *) PyExc_BlockingIOError;
+PyAPI_DATA(PyObject *) PyExc_BrokenPipeError;
+PyAPI_DATA(PyObject *) PyExc_ChildProcessError;
+PyAPI_DATA(PyObject *) PyExc_ConnectionError;
+PyAPI_DATA(PyObject *) PyExc_ConnectionAbortedError;
+PyAPI_DATA(PyObject *) PyExc_ConnectionRefusedError;
+PyAPI_DATA(PyObject *) PyExc_ConnectionResetError;
+PyAPI_DATA(PyObject *) PyExc_FileExistsError;
+PyAPI_DATA(PyObject *) PyExc_FileNotFoundError;
+PyAPI_DATA(PyObject *) PyExc_InterruptedError;
+PyAPI_DATA(PyObject *) PyExc_IsADirectoryError;
+PyAPI_DATA(PyObject *) PyExc_NotADirectoryError;
+PyAPI_DATA(PyObject *) PyExc_PermissionError;
+PyAPI_DATA(PyObject *) PyExc_ProcessLookupError;
+PyAPI_DATA(PyObject *) PyExc_TimeoutError;
+#endif
+
+
+/* Compatibility aliases */
+PyAPI_DATA(PyObject *) PyExc_EnvironmentError;
+PyAPI_DATA(PyObject *) PyExc_IOError;
+#ifdef MS_WINDOWS
+PyAPI_DATA(PyObject *) PyExc_WindowsError;
+#endif
+
+/* Predefined warning categories */
+PyAPI_DATA(PyObject *) PyExc_Warning;
+PyAPI_DATA(PyObject *) PyExc_UserWarning;
+PyAPI_DATA(PyObject *) PyExc_DeprecationWarning;
+PyAPI_DATA(PyObject *) PyExc_PendingDeprecationWarning;
+PyAPI_DATA(PyObject *) PyExc_SyntaxWarning;
+PyAPI_DATA(PyObject *) PyExc_RuntimeWarning;
+PyAPI_DATA(PyObject *) PyExc_FutureWarning;
+PyAPI_DATA(PyObject *) PyExc_ImportWarning;
+PyAPI_DATA(PyObject *) PyExc_UnicodeWarning;
+PyAPI_DATA(PyObject *) PyExc_BytesWarning;
+PyAPI_DATA(PyObject *) PyExc_EncodingWarning;
+PyAPI_DATA(PyObject *) PyExc_ResourceWarning;
+
+
+/* Convenience functions */
+
+PyAPI_FUNC(int) PyErr_BadArgument(void);
+PyAPI_FUNC(PyObject *) PyErr_NoMemory(void);
+PyAPI_FUNC(PyObject *) PyErr_SetFromErrno(PyObject *);
+PyAPI_FUNC(PyObject *) PyErr_SetFromErrnoWithFilenameObject(
+    PyObject *, PyObject *);
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03040000
+PyAPI_FUNC(PyObject *) PyErr_SetFromErrnoWithFilenameObjects(
+    PyObject *, PyObject *, PyObject *);
+#endif
+PyAPI_FUNC(PyObject *) PyErr_SetFromErrnoWithFilename(
+    PyObject *exc,
+    const char *filename   /* decoded from the filesystem encoding */
+    );
+
+PyAPI_FUNC(PyObject *) PyErr_Format(
+    PyObject *exception,
+    const char *format,   /* ASCII-encoded string  */
+    ...
+    );
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+PyAPI_FUNC(PyObject *) PyErr_FormatV(
+    PyObject *exception,
+    const char *format,
+    va_list vargs);
+#endif
+
+#ifdef MS_WINDOWS
+PyAPI_FUNC(PyObject *) PyErr_SetFromWindowsErrWithFilename(
+    int ierr,
+    const char *filename        /* decoded from the filesystem encoding */
+    );
+PyAPI_FUNC(PyObject *) PyErr_SetFromWindowsErr(int);
+PyAPI_FUNC(PyObject *) PyErr_SetExcFromWindowsErrWithFilenameObject(
+    PyObject *,int, PyObject *);
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03040000
+PyAPI_FUNC(PyObject *) PyErr_SetExcFromWindowsErrWithFilenameObjects(
+    PyObject *,int, PyObject *, PyObject *);
+#endif
+PyAPI_FUNC(PyObject *) PyErr_SetExcFromWindowsErrWithFilename(
+    PyObject *exc,
+    int ierr,
+    const char *filename        /* decoded from the filesystem encoding */
+    );
+PyAPI_FUNC(PyObject *) PyErr_SetExcFromWindowsErr(PyObject *, int);
+#endif /* MS_WINDOWS */
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03060000
+PyAPI_FUNC(PyObject *) PyErr_SetImportErrorSubclass(PyObject *, PyObject *,
+    PyObject *, PyObject *);
+#endif
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+PyAPI_FUNC(PyObject *) PyErr_SetImportError(PyObject *, PyObject *,
+    PyObject *);
+#endif
+
+/* Export the old function so that the existing API remains available: */
+PyAPI_FUNC(void) PyErr_BadInternalCall(void);
+PyAPI_FUNC(void) _PyErr_BadInternalCall(const char *filename, int lineno);
+/* Mask the old API with a call to the new API for code compiled under
+   Python 2.0: */
+#define PyErr_BadInternalCall() _PyErr_BadInternalCall(__FILE__, __LINE__)
+
+/* Function to create a new exception */
+PyAPI_FUNC(PyObject *) PyErr_NewException(
+    const char *name, PyObject *base, PyObject *dict);
+PyAPI_FUNC(PyObject *) PyErr_NewExceptionWithDoc(
+    const char *name, const char *doc, PyObject *base, PyObject *dict);
+PyAPI_FUNC(void) PyErr_WriteUnraisable(PyObject *);
+
+
+/* In signalmodule.c */
+PyAPI_FUNC(int) PyErr_CheckSignals(void);
+PyAPI_FUNC(void) PyErr_SetInterrupt(void);
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
+PyAPI_FUNC(int) PyErr_SetInterruptEx(int signum);
+#endif
+
+/* Support for adding program text to SyntaxErrors */
+PyAPI_FUNC(void) PyErr_SyntaxLocation(
+    const char *filename,       /* decoded from the filesystem encoding */
+    int lineno);
+PyAPI_FUNC(void) PyErr_SyntaxLocationEx(
+    const char *filename,       /* decoded from the filesystem encoding */
+    int lineno,
+    int col_offset);
+PyAPI_FUNC(PyObject *) PyErr_ProgramText(
+    const char *filename,       /* decoded from the filesystem encoding */
+    int lineno);
+
+/* The following functions are used to create and modify unicode
+   exceptions from C */
+
+/* create a UnicodeDecodeError object */
+PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_Create(
+    const char *encoding,       /* UTF-8 encoded string */
+    const char *object,
+    Py_ssize_t length,
+    Py_ssize_t start,
+    Py_ssize_t end,
+    const char *reason          /* UTF-8 encoded string */
+    );
+
+/* get the encoding attribute */
+PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_GetEncoding(PyObject *);
+PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_GetEncoding(PyObject *);
+
+/* get the object attribute */
+PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_GetObject(PyObject *);
+PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_GetObject(PyObject *);
+PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_GetObject(PyObject *);
+
+/* get the value of the start attribute (the Py_ssize_t * may not be NULL)
+   return 0 on success, -1 on failure */
+PyAPI_FUNC(int) PyUnicodeEncodeError_GetStart(PyObject *, Py_ssize_t *);
+PyAPI_FUNC(int) PyUnicodeDecodeError_GetStart(PyObject *, Py_ssize_t *);
+PyAPI_FUNC(int) PyUnicodeTranslateError_GetStart(PyObject *, Py_ssize_t *);
+
+/* assign a new value to the start attribute
+   return 0 on success, -1 on failure */
+PyAPI_FUNC(int) PyUnicodeEncodeError_SetStart(PyObject *, Py_ssize_t);
+PyAPI_FUNC(int) PyUnicodeDecodeError_SetStart(PyObject *, Py_ssize_t);
+PyAPI_FUNC(int) PyUnicodeTranslateError_SetStart(PyObject *, Py_ssize_t);
+
+/* get the value of the end attribute (the Py_ssize_t * may not be NULL)
+   return 0 on success, -1 on failure */
+PyAPI_FUNC(int) PyUnicodeEncodeError_GetEnd(PyObject *, Py_ssize_t *);
+PyAPI_FUNC(int) PyUnicodeDecodeError_GetEnd(PyObject *, Py_ssize_t *);
+PyAPI_FUNC(int) PyUnicodeTranslateError_GetEnd(PyObject *, Py_ssize_t *);
+
+/* assign a new value to the end attribute
+   return 0 on success, -1 on failure */
+PyAPI_FUNC(int) PyUnicodeEncodeError_SetEnd(PyObject *, Py_ssize_t);
+PyAPI_FUNC(int) PyUnicodeDecodeError_SetEnd(PyObject *, Py_ssize_t);
+PyAPI_FUNC(int) PyUnicodeTranslateError_SetEnd(PyObject *, Py_ssize_t);
+
+/* get the value of the reason attribute */
+PyAPI_FUNC(PyObject *) PyUnicodeEncodeError_GetReason(PyObject *);
+PyAPI_FUNC(PyObject *) PyUnicodeDecodeError_GetReason(PyObject *);
+PyAPI_FUNC(PyObject *) PyUnicodeTranslateError_GetReason(PyObject *);
+
+/* assign a new value to the reason attribute
+   return 0 on success, -1 on failure */
+PyAPI_FUNC(int) PyUnicodeEncodeError_SetReason(
+    PyObject *exc,
+    const char *reason          /* UTF-8 encoded string */
+    );
+PyAPI_FUNC(int) PyUnicodeDecodeError_SetReason(
+    PyObject *exc,
+    const char *reason          /* UTF-8 encoded string */
+    );
+PyAPI_FUNC(int) PyUnicodeTranslateError_SetReason(
+    PyObject *exc,
+    const char *reason          /* UTF-8 encoded string */
+    );
+
+PyAPI_FUNC(int) PyOS_snprintf(char *str, size_t size, const char  *format, ...)
+                        Py_GCC_ATTRIBUTE((format(printf, 3, 4)));
+PyAPI_FUNC(int) PyOS_vsnprintf(char *str, size_t size, const char  *format, va_list va)
+                        Py_GCC_ATTRIBUTE((format(printf, 3, 0)));
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_ERRORS_H
+#  include "cpython/pyerrors.h"
+#  undef Py_CPYTHON_ERRORS_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_ERRORS_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pyexpat.h b/nanvix-port/cpython-headers/python3.12/pyexpat.h
new file mode 100644
index 000000000000..9824d099c3df
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pyexpat.h
@@ -0,0 +1,62 @@
+/* Stuff to export relevant 'expat' entry points from pyexpat to other
+ * parser modules, such as cElementTree. */
+
+/* note: you must include expat.h before including this header! */
+
+#ifndef Py_PYEXPAT_H
+#define Py_PYEXPAT_H
+
+#define PyExpat_CAPI_MAGIC  "pyexpat.expat_CAPI 1.1"
+#define PyExpat_CAPSULE_NAME "pyexpat.expat_CAPI"
+
+/* Table of pointers to expat entry points; new members go at the end only. */
+struct PyExpat_CAPI
+{
+    char* magic; /* set to PyExpat_CAPI_MAGIC */
+    int size; /* set to sizeof(struct PyExpat_CAPI) */
+    int MAJOR_VERSION;
+    int MINOR_VERSION;
+    int MICRO_VERSION;
+    /* pointers to selected expat functions.  add new functions at
+       the end, if needed */
+    const XML_LChar * (*ErrorString)(enum XML_Error code);
+    enum XML_Error (*GetErrorCode)(XML_Parser parser);
+    XML_Size (*GetErrorColumnNumber)(XML_Parser parser);
+    XML_Size (*GetErrorLineNumber)(XML_Parser parser);
+    enum XML_Status (*Parse)(
+        XML_Parser parser, const char *s, int len, int isFinal);
+    XML_Parser (*ParserCreate_MM)(
+        const XML_Char *encoding, const XML_Memory_Handling_Suite *memsuite,
+        const XML_Char *namespaceSeparator);
+    void (*ParserFree)(XML_Parser parser);
+    void (*SetCharacterDataHandler)(
+        XML_Parser parser, XML_CharacterDataHandler handler);
+    void (*SetCommentHandler)(
+        XML_Parser parser, XML_CommentHandler handler);
+    void (*SetDefaultHandlerExpand)(
+        XML_Parser parser, XML_DefaultHandler handler);
+    void (*SetElementHandler)(
+        XML_Parser parser, XML_StartElementHandler start,
+        XML_EndElementHandler end);
+    void (*SetNamespaceDeclHandler)(
+        XML_Parser parser, XML_StartNamespaceDeclHandler start,
+        XML_EndNamespaceDeclHandler end);
+    void (*SetProcessingInstructionHandler)(
+        XML_Parser parser, XML_ProcessingInstructionHandler handler);
+    void (*SetUnknownEncodingHandler)(
+        XML_Parser parser, XML_UnknownEncodingHandler handler,
+        void *encodingHandlerData);
+    void (*SetUserData)(XML_Parser parser, void *userData);
+    void (*SetStartDoctypeDeclHandler)(XML_Parser parser,
+                                       XML_StartDoctypeDeclHandler start);
+    enum XML_Status (*SetEncoding)(XML_Parser parser, const XML_Char *encoding);
+    int (*DefaultUnknownEncodingHandler)(
+        void *encodingHandlerData, const XML_Char *name, XML_Encoding *info);
+    /* might be NULL for expat < 2.1.0 */
+    int (*SetHashSalt)(XML_Parser parser, unsigned long hash_salt);
+    /* might be NULL for expat < 2.6.0 */
+    XML_Bool (*SetReparseDeferralEnabled)(XML_Parser parser, XML_Bool enabled);
+    /* always add new stuff to the end! */
+};
+
+#endif /* !Py_PYEXPAT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pyframe.h b/nanvix-port/cpython-headers/python3.12/pyframe.h
new file mode 100644
index 000000000000..13d52312ea96
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pyframe.h
@@ -0,0 +1,26 @@
+/* Limited C API of PyFrame API
+ *
+ * Include "frameobject.h" to get the PyFrameObject structure.
+ */
+
+#ifndef Py_PYFRAME_H
+#define Py_PYFRAME_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Return the line of code the frame is currently executing. */
+PyAPI_FUNC(int) PyFrame_GetLineNumber(PyFrameObject *);
+
+PyAPI_FUNC(PyCodeObject *) PyFrame_GetCode(PyFrameObject *frame);
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_PYFRAME_H
+#  include "cpython/pyframe.h"
+#  undef Py_CPYTHON_PYFRAME_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_PYFRAME_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pyhash.h b/nanvix-port/cpython-headers/python3.12/pyhash.h
new file mode 100644
index 000000000000..182d223fab1c
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pyhash.h
@@ -0,0 +1,144 @@
+#ifndef Py_HASH_H
+
+#define Py_HASH_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Helpers for hash functions */
+#ifndef Py_LIMITED_API
+PyAPI_FUNC(Py_hash_t) _Py_HashDouble(PyObject *, double);
+PyAPI_FUNC(Py_hash_t) _Py_HashPointer(const void*);
+// Similar to _Py_HashPointer(), but don't replace -1 with -2
+PyAPI_FUNC(Py_hash_t) _Py_HashPointerRaw(const void*);
+PyAPI_FUNC(Py_hash_t) _Py_HashBytes(const void*, Py_ssize_t);
+#endif
+
+/* Prime multiplier used in string and various other hashes. */
+#define _PyHASH_MULTIPLIER 1000003UL  /* 0xf4243 */
+
+/* Parameters used for the numeric hash implementation.  See notes for
+   _Py_HashDouble in Python/pyhash.c.  Numeric hashes are based on
+   reduction modulo the prime 2**_PyHASH_BITS - 1. */
+
+#if SIZEOF_VOID_P >= 8
+#  define _PyHASH_BITS 61
+#else
+#  define _PyHASH_BITS 31
+#endif
+
+#define _PyHASH_MODULUS (((size_t)1 << _PyHASH_BITS) - 1)
+#define _PyHASH_INF 314159
+#define _PyHASH_IMAG _PyHASH_MULTIPLIER
+
+
+/* hash secret
+ *
+ * memory layout on 64 bit systems
+ *   cccccccc cccccccc cccccccc  uc -- unsigned char[24]
+ *   pppppppp ssssssss ........  fnv -- two Py_hash_t
+ *   k0k0k0k0 k1k1k1k1 ........  siphash -- two uint64_t
+ *   ........ ........ ssssssss  djbx33a -- 16 bytes padding + one Py_hash_t
+ *   ........ ........ eeeeeeee  pyexpat XML hash salt
+ *
+ * memory layout on 32 bit systems
+ *   cccccccc cccccccc cccccccc  uc
+ *   ppppssss ........ ........  fnv -- two Py_hash_t
+ *   k0k0k0k0 k1k1k1k1 ........  siphash -- two uint64_t (*)
+ *   ........ ........ ssss....  djbx33a -- 16 bytes padding + one Py_hash_t
+ *   ........ ........ eeee....  pyexpat XML hash salt
+ *
+ * (*) The siphash member may not be available on 32 bit platforms without
+ *     an unsigned int64 data type.
+ */
+#ifndef Py_LIMITED_API
+typedef union {
+    /* ensure 24 bytes */
+    unsigned char uc[24];
+    /* two Py_hash_t for FNV */
+    struct {
+        Py_hash_t prefix;
+        Py_hash_t suffix;
+    } fnv;
+    /* two uint64 for SipHash24 */
+    struct {
+        uint64_t k0;
+        uint64_t k1;
+    } siphash;
+    /* a different (!) Py_hash_t for small string optimization */
+    struct {
+        unsigned char padding[16];
+        Py_hash_t suffix;
+    } djbx33a;
+    struct {
+        unsigned char padding[16];
+        Py_hash_t hashsalt;
+    } expat;
+} _Py_HashSecret_t;
+PyAPI_DATA(_Py_HashSecret_t) _Py_HashSecret;
+
+#ifdef Py_DEBUG
+PyAPI_DATA(int) _Py_HashSecret_Initialized;
+#endif
+
+
+/* hash function definition */
+typedef struct {
+    Py_hash_t (*const hash)(const void *, Py_ssize_t);
+    const char *name;
+    const int hash_bits;
+    const int seed_bits;
+} PyHash_FuncDef;
+
+PyAPI_FUNC(PyHash_FuncDef*) PyHash_GetFuncDef(void);
+#endif
+
+
+/* cutoff for small string DJBX33A optimization in range [1, cutoff).
+ *
+ * About 50% of the strings in a typical Python application are smaller than
+ * 6 to 7 chars. However DJBX33A is vulnerable to hash collision attacks.
+ * NEVER use DJBX33A for long strings!
+ *
+ * A Py_HASH_CUTOFF of 0 disables small string optimization. 32 bit platforms
+ * should use a smaller cutoff because it is easier to create colliding
+ * strings. A cutoff of 7 on 64bit platforms and 5 on 32bit platforms should
+ * provide a decent safety margin.
+ */
+#ifndef Py_HASH_CUTOFF
+#  define Py_HASH_CUTOFF 0
+#elif (Py_HASH_CUTOFF > 7 || Py_HASH_CUTOFF < 0)
+#  error Py_HASH_CUTOFF must be in range 0...7.
+#endif /* Py_HASH_CUTOFF */
+
+
+/* hash algorithm selection
+ *
+ * The values for Py_HASH_* are hard-coded in the
+ * configure script.
+ *
+ * - FNV and SIPHASH* are available on all platforms and architectures.
+ * - With EXTERNAL embedders can provide an alternative implementation with::
+ *
+ *     PyHash_FuncDef PyHash_Func = {...};
+ *
+ * XXX: Figure out __declspec() for extern PyHash_FuncDef.
+ */
+#define Py_HASH_EXTERNAL 0
+#define Py_HASH_SIPHASH24 1
+#define Py_HASH_FNV 2
+#define Py_HASH_SIPHASH13 3
+
+#ifndef Py_HASH_ALGORITHM
+#  ifndef HAVE_ALIGNED_REQUIRED
+#    define Py_HASH_ALGORITHM Py_HASH_SIPHASH13
+#  else
+#    define Py_HASH_ALGORITHM Py_HASH_FNV
+#  endif /* !HAVE_ALIGNED_REQUIRED */
+#endif /* Py_HASH_ALGORITHM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !Py_HASH_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pylifecycle.h b/nanvix-port/cpython-headers/python3.12/pylifecycle.h
new file mode 100644
index 000000000000..e4c3b09c963f
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pylifecycle.h
@@ -0,0 +1,78 @@
+
+/* Interfaces to configure, query, create & destroy the Python runtime */
+
+#ifndef Py_PYLIFECYCLE_H
+#define Py_PYLIFECYCLE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Initialization and finalization */
+PyAPI_FUNC(void) Py_Initialize(void);
+PyAPI_FUNC(void) Py_InitializeEx(int);
+PyAPI_FUNC(void) Py_Finalize(void);
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03060000
+PyAPI_FUNC(int) Py_FinalizeEx(void);
+#endif
+PyAPI_FUNC(int) Py_IsInitialized(void);
+
+/* Subinterpreter support */
+PyAPI_FUNC(PyThreadState *) Py_NewInterpreter(void);
+PyAPI_FUNC(void) Py_EndInterpreter(PyThreadState *);
+
+
+/* Py_PyAtExit is for the atexit module, Py_AtExit is for low-level
+ * exit functions.
+ */
+PyAPI_FUNC(int) Py_AtExit(void (*func)(void));
+
+PyAPI_FUNC(void) _Py_NO_RETURN Py_Exit(int);
+
+/* Bootstrap __main__ (defined in Modules/main.c) */
+PyAPI_FUNC(int) Py_Main(int argc, wchar_t **argv);
+PyAPI_FUNC(int) Py_BytesMain(int argc, char **argv);
+
+/* In pathconfig.c */
+Py_DEPRECATED(3.11) PyAPI_FUNC(void) Py_SetProgramName(const wchar_t *);
+PyAPI_FUNC(wchar_t *) Py_GetProgramName(void);
+
+Py_DEPRECATED(3.11) PyAPI_FUNC(void) Py_SetPythonHome(const wchar_t *);
+PyAPI_FUNC(wchar_t *) Py_GetPythonHome(void);
+
+PyAPI_FUNC(wchar_t *) Py_GetProgramFullPath(void);
+
+PyAPI_FUNC(wchar_t *) Py_GetPrefix(void);
+PyAPI_FUNC(wchar_t *) Py_GetExecPrefix(void);
+PyAPI_FUNC(wchar_t *) Py_GetPath(void);
+Py_DEPRECATED(3.11) PyAPI_FUNC(void) Py_SetPath(const wchar_t *);
+#ifdef MS_WINDOWS
+int _Py_CheckPython3(void);
+#endif
+
+/* In their own files */
+PyAPI_FUNC(const char *) Py_GetVersion(void);
+PyAPI_FUNC(const char *) Py_GetPlatform(void);
+PyAPI_FUNC(const char *) Py_GetCopyright(void);
+PyAPI_FUNC(const char *) Py_GetCompiler(void);
+PyAPI_FUNC(const char *) Py_GetBuildInfo(void);
+
+/* Signals */
+typedef void (*PyOS_sighandler_t)(int);
+PyAPI_FUNC(PyOS_sighandler_t) PyOS_getsig(int);
+PyAPI_FUNC(PyOS_sighandler_t) PyOS_setsig(int, PyOS_sighandler_t);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030B0000
+PyAPI_DATA(const unsigned long) Py_Version;
+#endif
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_PYLIFECYCLE_H
+#  include "cpython/pylifecycle.h"
+#  undef Py_CPYTHON_PYLIFECYCLE_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_PYLIFECYCLE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pymacconfig.h b/nanvix-port/cpython-headers/python3.12/pymacconfig.h
new file mode 100644
index 000000000000..61f08e368efb
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pymacconfig.h
@@ -0,0 +1,99 @@
+#ifndef PYMACCONFIG_H
+#define PYMACCONFIG_H
+     /*
+      * This file moves some of the autoconf magic to compile-time
+      * when building on MacOSX. This is needed for building 4-way
+      * universal binaries and for 64-bit universal binaries because
+      * the values redefined below aren't configure-time constant but
+      * only compile-time constant in these scenarios.
+      */
+
+#if defined(__APPLE__)
+
+# undef ALIGNOF_MAX_ALIGN_T
+# undef SIZEOF_LONG
+# undef SIZEOF_LONG_DOUBLE
+# undef SIZEOF_PTHREAD_T
+# undef SIZEOF_SIZE_T
+# undef SIZEOF_TIME_T
+# undef SIZEOF_VOID_P
+# undef SIZEOF__BOOL
+# undef SIZEOF_UINTPTR_T
+# undef SIZEOF_PTHREAD_T
+# undef WORDS_BIGENDIAN
+# undef DOUBLE_IS_ARM_MIXED_ENDIAN_IEEE754
+# undef DOUBLE_IS_BIG_ENDIAN_IEEE754
+# undef DOUBLE_IS_LITTLE_ENDIAN_IEEE754
+# undef HAVE_GCC_ASM_FOR_X87
+# undef HAVE_GCC_ASM_FOR_X64
+
+#    undef VA_LIST_IS_ARRAY
+#    if defined(__LP64__) && defined(__x86_64__)
+#        define VA_LIST_IS_ARRAY 1
+#    endif
+
+#    undef HAVE_LARGEFILE_SUPPORT
+#    ifndef __LP64__
+#         define HAVE_LARGEFILE_SUPPORT 1
+#    endif
+
+#    undef SIZEOF_LONG
+#    ifdef __LP64__
+#        define SIZEOF__BOOL            1
+#        define SIZEOF__BOOL            1
+#        define SIZEOF_LONG             8
+#        define SIZEOF_PTHREAD_T        8
+#        define SIZEOF_SIZE_T           8
+#        define SIZEOF_TIME_T           8
+#        define SIZEOF_VOID_P           8
+#        define SIZEOF_UINTPTR_T        8
+#        define SIZEOF_PTHREAD_T        8
+#    else
+#        ifdef __ppc__
+#           define SIZEOF__BOOL         4
+#        else
+#           define SIZEOF__BOOL         1
+#        endif
+#        define SIZEOF_LONG             4
+#        define SIZEOF_PTHREAD_T        4
+#        define SIZEOF_SIZE_T           4
+#        define SIZEOF_TIME_T           4
+#        define SIZEOF_VOID_P           4
+#        define SIZEOF_UINTPTR_T        4
+#        define SIZEOF_PTHREAD_T        4
+#    endif
+
+#    if defined(__LP64__)
+     /* MacOSX 10.4 (the first release to support 64-bit code
+      * at all) only supports 64-bit in the UNIX layer.
+      * Therefore suppress the toolbox-glue in 64-bit mode.
+      */
+
+    /* In 64-bit mode setpgrp always has no arguments, in 32-bit
+     * mode that depends on the compilation environment
+     */
+#       undef SETPGRP_HAVE_ARG
+
+#    endif
+
+#ifdef __BIG_ENDIAN__
+#define WORDS_BIGENDIAN 1
+#define DOUBLE_IS_BIG_ENDIAN_IEEE754
+#else
+#define DOUBLE_IS_LITTLE_ENDIAN_IEEE754
+#endif /* __BIG_ENDIAN__ */
+
+#if defined(__i386__) || defined(__x86_64__)
+# define HAVE_GCC_ASM_FOR_X87
+# define ALIGNOF_MAX_ALIGN_T 16
+# define HAVE_GCC_ASM_FOR_X64 1
+# define SIZEOF_LONG_DOUBLE 16
+#else
+# define ALIGNOF_MAX_ALIGN_T 8
+# define SIZEOF_LONG_DOUBLE 8
+#endif
+
+
+#endif /* defined(__APPLE__) */
+
+#endif /* PYMACCONFIG_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pymacro.h b/nanvix-port/cpython-headers/python3.12/pymacro.h
new file mode 100644
index 000000000000..342d2a7b844a
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pymacro.h
@@ -0,0 +1,163 @@
+#ifndef Py_PYMACRO_H
+#define Py_PYMACRO_H
+
+// gh-91782: On FreeBSD 12, if the _POSIX_C_SOURCE and _XOPEN_SOURCE macros are
+// defined, <sys/cdefs.h> disables C11 support and <assert.h> does not define
+// the static_assert() macro.
+// https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=255290
+//
+// macOS <= 10.10 doesn't define static_assert in assert.h at all despite
+// having C11 compiler support.
+//
+// static_assert is defined in glibc from version 2.16. Compiler support for
+// the C11 _Static_assert keyword is in gcc >= 4.6.
+//
+// MSVC makes static_assert a keyword in C11-17, contrary to the standards.
+//
+// In C++11 and C2x, static_assert is a keyword, redefining is undefined
+// behaviour. So only define if building as C (if __STDC_VERSION__ is defined),
+// not C++, and only for C11-17.
+#if !defined(static_assert) && (defined(__GNUC__) || defined(__clang__)) \
+     && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L \
+     && __STDC_VERSION__ <= 201710L
+#  define static_assert _Static_assert
+#endif
+
+/* Minimum value between x and y */
+#define Py_MIN(x, y) (((x) > (y)) ? (y) : (x))
+
+/* Maximum value between x and y */
+#define Py_MAX(x, y) (((x) > (y)) ? (x) : (y))
+
+/* Absolute value of the number x */
+#define Py_ABS(x) ((x) < 0 ? -(x) : (x))
+
+#define _Py_XSTRINGIFY(x) #x
+
+/* Convert the argument to a string. For example, Py_STRINGIFY(123) is replaced
+   with "123" by the preprocessor. Defines are also replaced by their value.
+   For example Py_STRINGIFY(__LINE__) is replaced by the line number, not
+   by "__LINE__". */
+#define Py_STRINGIFY(x) _Py_XSTRINGIFY(x)
+
+/* Get the size of a structure member in bytes */
+#define Py_MEMBER_SIZE(type, member) sizeof(((type *)0)->member)
+
+/* Argument must be a char or an int in [-128, 127] or [0, 255]. */
+#define Py_CHARMASK(c) ((unsigned char)((c) & 0xff))
+
+/* Assert a build-time dependency, as an expression.
+
+   Your compile will fail if the condition isn't true, or can't be evaluated
+   by the compiler. This can be used in an expression: its value is 0.
+
+   Example:
+
+   #define foo_to_char(foo)  \
+       ((char *)(foo)        \
+        + Py_BUILD_ASSERT_EXPR(offsetof(struct foo, string) == 0))
+
+   Written by Rusty Russell, public domain, http://ccodearchive.net/ */
+#define Py_BUILD_ASSERT_EXPR(cond) \
+    (sizeof(char [1 - 2*!(cond)]) - 1)
+
+#define Py_BUILD_ASSERT(cond)  do {         \
+        (void)Py_BUILD_ASSERT_EXPR(cond);   \
+    } while(0)
+
+/* Get the number of elements in a visible array
+
+   This does not work on pointers, or arrays declared as [], or function
+   parameters. With correct compiler support, such usage will cause a build
+   error (see Py_BUILD_ASSERT_EXPR).
+
+   Written by Rusty Russell, public domain, http://ccodearchive.net/
+
+   Requires at least GCC 3.1 */
+#if (defined(__GNUC__) && !defined(__STRICT_ANSI__) && \
+    (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) || (__GNUC__ >= 4)))
+/* Two gcc extensions.
+   &a[0] degrades to a pointer: a different type from an array */
+#define Py_ARRAY_LENGTH(array) \
+    (sizeof(array) / sizeof((array)[0]) \
+     + Py_BUILD_ASSERT_EXPR(!__builtin_types_compatible_p(typeof(array), \
+                                                          typeof(&(array)[0]))))
+#else
+#define Py_ARRAY_LENGTH(array) \
+    (sizeof(array) / sizeof((array)[0]))
+#endif
+
+
+/* Define macros for inline documentation. */
+#define PyDoc_VAR(name) static const char name[]
+#define PyDoc_STRVAR(name,str) PyDoc_VAR(name) = PyDoc_STR(str)
+#ifdef WITH_DOC_STRINGS
+#define PyDoc_STR(str) str
+#else
+#define PyDoc_STR(str) ""
+#endif
+
+/* Below "a" is a power of 2. */
+/* Round down size "n" to be a multiple of "a". */
+#define _Py_SIZE_ROUND_DOWN(n, a) ((size_t)(n) & ~(size_t)((a) - 1))
+/* Round up size "n" to be a multiple of "a". */
+#define _Py_SIZE_ROUND_UP(n, a) (((size_t)(n) + \
+        (size_t)((a) - 1)) & ~(size_t)((a) - 1))
+/* Round pointer "p" down to the closest "a"-aligned address <= "p". */
+#define _Py_ALIGN_DOWN(p, a) ((void *)((uintptr_t)(p) & ~(uintptr_t)((a) - 1)))
+/* Round pointer "p" up to the closest "a"-aligned address >= "p". */
+#define _Py_ALIGN_UP(p, a) ((void *)(((uintptr_t)(p) + \
+        (uintptr_t)((a) - 1)) & ~(uintptr_t)((a) - 1)))
+/* Check if pointer "p" is aligned to "a"-bytes boundary. */
+#define _Py_IS_ALIGNED(p, a) (!((uintptr_t)(p) & (uintptr_t)((a) - 1)))
+
+/* Use this for unused arguments in a function definition to silence compiler
+ * warnings. Example:
+ *
+ * int func(int a, int Py_UNUSED(b)) { return a; }
+ */
+#if defined(__GNUC__) || defined(__clang__)
+#  define Py_UNUSED(name) _unused_ ## name __attribute__((unused))
+#else
+#  define Py_UNUSED(name) _unused_ ## name
+#endif
+
+#if defined(RANDALL_WAS_HERE)
+#  define Py_UNREACHABLE() \
+    Py_FatalError( \
+        "If you're seeing this, the code is in what I thought was\n" \
+        "an unreachable state.\n\n" \
+        "I could give you advice for what to do, but honestly, why\n" \
+        "should you trust me?  I clearly screwed this up.  I'm writing\n" \
+        "a message that should never appear, yet I know it will\n" \
+        "probably appear someday.\n\n" \
+        "On a deep level, I know I'm not up to this task.\n" \
+        "I'm so sorry.\n" \
+        "https://xkcd.com/2200")
+#elif defined(Py_DEBUG)
+#  define Py_UNREACHABLE() \
+    Py_FatalError( \
+        "We've reached an unreachable state. Anything is possible.\n" \
+        "The limits were in our heads all along. Follow your dreams.\n" \
+        "https://xkcd.com/2200")
+#elif defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
+#  define Py_UNREACHABLE() __builtin_unreachable()
+#elif defined(__clang__) || defined(__INTEL_COMPILER)
+#  define Py_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#  define Py_UNREACHABLE() __assume(0)
+#else
+#  define Py_UNREACHABLE() \
+    Py_FatalError("Unreachable C code path reached")
+#endif
+
+// Prevent using an expression as a l-value.
+// For example, "int x; _Py_RVALUE(x) = 1;" fails with a compiler error.
+#define _Py_RVALUE(EXPR) ((void)0, (EXPR))
+
+// Return non-zero if the type is signed, return zero if it's unsigned.
+// Use "<= 0" rather than "< 0" to prevent the compiler warning:
+// "comparison of unsigned expression in '< 0' is always false".
+#define _Py_IS_TYPE_SIGNED(type) ((type)(-1) <= 0)
+
+#endif /* Py_PYMACRO_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pymath.h b/nanvix-port/cpython-headers/python3.12/pymath.h
new file mode 100644
index 000000000000..4c1e3d998489
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pymath.h
@@ -0,0 +1,62 @@
+// Symbols and macros to supply platform-independent interfaces to mathematical
+// functions and constants.
+
+#ifndef Py_PYMATH_H
+#define Py_PYMATH_H
+
+/* High precision definition of pi and e (Euler)
+ * The values are taken from libc6's math.h.
+ */
+#ifndef Py_MATH_PIl
+#define Py_MATH_PIl 3.1415926535897932384626433832795029L
+#endif
+#ifndef Py_MATH_PI
+#define Py_MATH_PI 3.14159265358979323846
+#endif
+
+#ifndef Py_MATH_El
+#define Py_MATH_El 2.7182818284590452353602874713526625L
+#endif
+
+#ifndef Py_MATH_E
+#define Py_MATH_E 2.7182818284590452354
+#endif
+
+/* Tau (2pi) to 40 digits, taken from tauday.com/tau-digits. */
+#ifndef Py_MATH_TAU
+#define Py_MATH_TAU 6.2831853071795864769252867665590057683943L
+#endif
+
+// Py_IS_NAN(X)
+// Return 1 if float or double arg is a NaN, else 0.
+#define Py_IS_NAN(X) isnan(X)
+
+// Py_IS_INFINITY(X)
+// Return 1 if float or double arg is an infinity, else 0.
+#define Py_IS_INFINITY(X) isinf(X)
+
+// Py_IS_FINITE(X)
+// Return 1 if float or double arg is neither infinite nor NAN, else 0.
+#define Py_IS_FINITE(X) isfinite(X)
+
+// Py_INFINITY: Value that evaluates to a positive double infinity.
+#ifndef Py_INFINITY
+#  define Py_INFINITY ((double)INFINITY)
+#endif
+
+/* Py_HUGE_VAL should always be the same as Py_INFINITY.  But historically
+ * this was not reliable and Python did not require IEEE floats and C99
+ * conformity.  Prefer Py_INFINITY for new code.
+ */
+#ifndef Py_HUGE_VAL
+#  define Py_HUGE_VAL HUGE_VAL
+#endif
+
+/* Py_NAN: Value that evaluates to a quiet Not-a-Number (NaN).  The sign is
+ * undefined and normally not relevant, but e.g. fixed for float("nan").
+ */
+#if !defined(Py_NAN)
+#    define Py_NAN ((double)NAN)
+#endif
+
+#endif /* Py_PYMATH_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pymem.h b/nanvix-port/cpython-headers/python3.12/pymem.h
new file mode 100644
index 000000000000..e882645757bf
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pymem.h
@@ -0,0 +1,104 @@
+/* The PyMem_ family:  low-level memory allocation interfaces.
+   See objimpl.h for the PyObject_ memory family.
+*/
+
+#ifndef Py_PYMEM_H
+#define Py_PYMEM_H
+
+#include "pyport.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* BEWARE:
+
+   Each interface exports both functions and macros.  Extension modules should
+   use the functions, to ensure binary compatibility across Python versions.
+   Because the Python implementation is free to change internal details, and
+   the macros may (or may not) expose details for speed, if you do use the
+   macros you must recompile your extensions with each Python release.
+
+   Never mix calls to PyMem_ with calls to the platform malloc/realloc/
+   calloc/free.  For example, on Windows different DLLs may end up using
+   different heaps, and if you use PyMem_Malloc you'll get the memory from the
+   heap used by the Python DLL; it could be a disaster if you free()'ed that
+   directly in your own extension.  Using PyMem_Free instead ensures Python
+   can return the memory to the proper heap.  As another example, in
+   a debug build (Py_DEBUG macro), Python wraps all calls to all PyMem_ and
+   PyObject_ memory functions in special debugging wrappers that add additional
+   debugging info to dynamic memory blocks.  The system routines have no idea
+   what to do with that stuff, and the Python wrappers have no idea what to do
+   with raw blocks obtained directly by the system routines then.
+
+   The GIL must be held when using these APIs.
+*/
+
+/*
+ * Raw memory interface
+ * ====================
+ */
+
+/* Functions
+
+   Functions supplying platform-independent semantics for malloc/realloc/
+   free.  These functions make sure that allocating 0 bytes returns a distinct
+   non-NULL pointer (whenever possible -- if we're flat out of memory, NULL
+   may be returned), even if the platform malloc and realloc don't.
+   Returned pointers must be checked for NULL explicitly.  No action is
+   performed on failure (no exception is set, no warning is printed, etc).
+*/
+
+PyAPI_FUNC(void *) PyMem_Malloc(size_t size);
+PyAPI_FUNC(void *) PyMem_Calloc(size_t nelem, size_t elsize);
+PyAPI_FUNC(void *) PyMem_Realloc(void *ptr, size_t new_size);
+PyAPI_FUNC(void) PyMem_Free(void *ptr);
+
+/*
+ * Type-oriented memory interface
+ * ==============================
+ *
+ * Allocate memory for n objects of the given type.  Returns a new pointer
+ * or NULL if the request was too large or memory allocation failed.  Use
+ * these macros rather than doing the multiplication yourself so that proper
+ * overflow checking is always done.
+ */
+
+#define PyMem_New(type, n) \
+  ( ((size_t)(n) > PY_SSIZE_T_MAX / sizeof(type)) ? NULL :      \
+        ( (type *) PyMem_Malloc((n) * sizeof(type)) ) )
+
+/*
+ * The value of (p) is always clobbered by this macro regardless of success.
+ * The caller MUST check if (p) is NULL afterwards and deal with the memory
+ * error if so.  This means the original value of (p) MUST be saved for the
+ * caller's memory error handler to not lose track of it.
+ */
+#define PyMem_Resize(p, type, n) \
+  ( (p) = ((size_t)(n) > PY_SSIZE_T_MAX / sizeof(type)) ? NULL :        \
+        (type *) PyMem_Realloc((p), (n) * sizeof(type)) )
+
+
+// Deprecated aliases only kept for backward compatibility.
+// PyMem_Del and PyMem_DEL are defined with no parameter to be able to use
+// them as function pointers (ex: dealloc = PyMem_Del).
+#define PyMem_MALLOC(n)           PyMem_Malloc((n))
+#define PyMem_NEW(type, n)        PyMem_New(type, (n))
+#define PyMem_REALLOC(p, n)       PyMem_Realloc((p), (n))
+#define PyMem_RESIZE(p, type, n)  PyMem_Resize((p), type, (n))
+#define PyMem_FREE(p)             PyMem_Free((p))
+#define PyMem_Del                 PyMem_Free
+#define PyMem_DEL                 PyMem_Free
+
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_PYMEM_H
+#  include "cpython/pymem.h"
+#  undef Py_CPYTHON_PYMEM_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !Py_PYMEM_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pyport.h b/nanvix-port/cpython-headers/python3.12/pyport.h
new file mode 100644
index 000000000000..9792a843bf50
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pyport.h
@@ -0,0 +1,787 @@
+#ifndef Py_PYPORT_H
+#define Py_PYPORT_H
+
+#include "pyconfig.h" /* include for defines */
+
+#include <inttypes.h>
+
+#include <limits.h>
+#ifndef UCHAR_MAX
+#  error "limits.h must define UCHAR_MAX"
+#endif
+#if UCHAR_MAX != 255
+#  error "Python's source code assumes C's unsigned char is an 8-bit type"
+#endif
+
+
+// Macro to use C++ static_cast<> in the Python C API.
+#ifdef __cplusplus
+#  define _Py_STATIC_CAST(type, expr) static_cast<type>(expr)
+#else
+#  define _Py_STATIC_CAST(type, expr) ((type)(expr))
+#endif
+// Macro to use the more powerful/dangerous C-style cast even in C++.
+#define _Py_CAST(type, expr) ((type)(expr))
+
+// Static inline functions should use _Py_NULL rather than using directly NULL
+// to prevent C++ compiler warnings. On C++11 and newer, _Py_NULL is defined as
+// nullptr.
+#if defined(__cplusplus) && __cplusplus >= 201103
+#  define _Py_NULL nullptr
+#else
+#  define _Py_NULL NULL
+#endif
+
+
+/* Defines to build Python and its standard library:
+ *
+ * - Py_BUILD_CORE: Build Python core. Give access to Python internals, but
+ *   should not be used by third-party modules.
+ * - Py_BUILD_CORE_BUILTIN: Build a Python stdlib module as a built-in module.
+ * - Py_BUILD_CORE_MODULE: Build a Python stdlib module as a dynamic library.
+ *
+ * Py_BUILD_CORE_BUILTIN and Py_BUILD_CORE_MODULE imply Py_BUILD_CORE.
+ *
+ * On Windows, Py_BUILD_CORE_MODULE exports "PyInit_xxx" symbol, whereas
+ * Py_BUILD_CORE_BUILTIN does not.
+ */
+#if defined(Py_BUILD_CORE_BUILTIN) && !defined(Py_BUILD_CORE)
+#  define Py_BUILD_CORE
+#endif
+#if defined(Py_BUILD_CORE_MODULE) && !defined(Py_BUILD_CORE)
+#  define Py_BUILD_CORE
+#endif
+
+
+/**************************************************************************
+Symbols and macros to supply platform-independent interfaces to basic
+C language & library operations whose spellings vary across platforms.
+
+Please try to make documentation here as clear as possible:  by definition,
+the stuff here is trying to illuminate C's darkest corners.
+
+Config #defines referenced here:
+
+SIGNED_RIGHT_SHIFT_ZERO_FILLS
+Meaning:  To be defined iff i>>j does not extend the sign bit when i is a
+          signed integral type and i < 0.
+Used in:  Py_ARITHMETIC_RIGHT_SHIFT
+
+Py_DEBUG
+Meaning:  Extra checks compiled in for debug mode.
+Used in:  Py_SAFE_DOWNCAST
+
+**************************************************************************/
+
+/* typedefs for some C9X-defined synonyms for integral types.
+ *
+ * The names in Python are exactly the same as the C9X names, except with a
+ * Py_ prefix.  Until C9X is universally implemented, this is the only way
+ * to ensure that Python gets reliable names that don't conflict with names
+ * in non-Python code that are playing their own tricks to define the C9X
+ * names.
+ *
+ * NOTE: don't go nuts here!  Python has no use for *most* of the C9X
+ * integral synonyms.  Only define the ones we actually need.
+ */
+
+/* long long is required. Ensure HAVE_LONG_LONG is defined for compatibility. */
+#ifndef HAVE_LONG_LONG
+#define HAVE_LONG_LONG 1
+#endif
+#ifndef PY_LONG_LONG
+#define PY_LONG_LONG long long
+/* If LLONG_MAX is defined in limits.h, use that. */
+#define PY_LLONG_MIN LLONG_MIN
+#define PY_LLONG_MAX LLONG_MAX
+#define PY_ULLONG_MAX ULLONG_MAX
+#endif
+
+#define PY_UINT32_T uint32_t
+#define PY_UINT64_T uint64_t
+
+/* Signed variants of the above */
+#define PY_INT32_T int32_t
+#define PY_INT64_T int64_t
+
+/* PYLONG_BITS_IN_DIGIT describes the number of bits per "digit" (limb) in the
+ * PyLongObject implementation (longintrepr.h). It's currently either 30 or 15,
+ * defaulting to 30. The 15-bit digit option may be removed in the future.
+ */
+#ifndef PYLONG_BITS_IN_DIGIT
+#define PYLONG_BITS_IN_DIGIT 30
+#endif
+
+/* uintptr_t is the C9X name for an unsigned integral type such that a
+ * legitimate void* can be cast to uintptr_t and then back to void* again
+ * without loss of information.  Similarly for intptr_t, wrt a signed
+ * integral type.
+ */
+typedef uintptr_t       Py_uintptr_t;
+typedef intptr_t        Py_intptr_t;
+
+/* Py_ssize_t is a signed integral type such that sizeof(Py_ssize_t) ==
+ * sizeof(size_t).  C99 doesn't define such a thing directly (size_t is an
+ * unsigned integral type).  See PEP 353 for details.
+ * PY_SSIZE_T_MAX is the largest positive value of type Py_ssize_t.
+ */
+#ifdef HAVE_PY_SSIZE_T
+
+#elif HAVE_SSIZE_T
+typedef ssize_t         Py_ssize_t;
+/* Nanvix's Newlib defines SSIZE_MAX as (__SIZE_MAX__ >> 1), which keeps the
+   unsigned type of __SIZE_MAX__ and breaks signed comparisons (e.g.,
+   `x < -PY_SSIZE_T_MAX` is always true for positive x). Define PY_SSIZE_T_MAX
+   as a signed literal instead; 0x7FFFFFFFL assumes a 32-bit ssize_t. */
+#   ifdef __nanvix__
+#       define PY_SSIZE_T_MAX 0x7FFFFFFFL
+#   else
+#       define PY_SSIZE_T_MAX SSIZE_MAX
+#   endif
+#elif SIZEOF_VOID_P == SIZEOF_SIZE_T
+typedef Py_intptr_t     Py_ssize_t;
+#   define PY_SSIZE_T_MAX INTPTR_MAX
+#else
+#   error "Python needs a typedef for Py_ssize_t in pyport.h."
+#endif
+
+/* Smallest negative value of type Py_ssize_t. */
+#define PY_SSIZE_T_MIN (-PY_SSIZE_T_MAX-1)
+
+/* Py_hash_t is the same size as a pointer. */
+#define SIZEOF_PY_HASH_T SIZEOF_SIZE_T
+typedef Py_ssize_t Py_hash_t;
+/* Py_uhash_t is the unsigned equivalent needed to calculate numeric hash. */
+#define SIZEOF_PY_UHASH_T SIZEOF_SIZE_T
+typedef size_t Py_uhash_t;
+
+/* Now PY_SSIZE_T_CLEAN is mandatory. This is just for backward compatibility. */
+typedef Py_ssize_t Py_ssize_clean_t;
+
+/* Largest possible value of size_t. */
+#define PY_SIZE_MAX SIZE_MAX
+
+/* Macro kept for backward compatibility: use directly "z" in new code.
+ *
+ * PY_FORMAT_SIZE_T is a modifier for use in a printf format to convert an
+ * argument with the width of a size_t or Py_ssize_t: "z" (C99).
+ */
+#ifndef PY_FORMAT_SIZE_T
+#   define PY_FORMAT_SIZE_T "z"
+#endif
+
+/* Py_LOCAL can be used instead of static to get the fastest possible calling
+ * convention for functions that are local to a given module.
+ *
+ * Py_LOCAL_INLINE does the same thing, and also explicitly requests inlining,
+ * for platforms that support that.
+ *
+ * NOTE: You can only use this for functions that are entirely local to a
+ * module; functions that are exported via method tables, callbacks, etc,
+ * should keep using static.
+ */
+
+#if defined(_MSC_VER)
+   /* ignore warnings if the compiler decides not to inline a function */
+#  pragma warning(disable: 4710)
+   /* fastest possible local call under MSVC */
+#  define Py_LOCAL(type) static type __fastcall
+#  define Py_LOCAL_INLINE(type) static __inline type __fastcall
+#else
+#  define Py_LOCAL(type) static type
+#  define Py_LOCAL_INLINE(type) static inline type
+#endif
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 < 0x030b0000
+#  define Py_MEMCPY memcpy
+#endif
+
+#ifdef HAVE_IEEEFP_H
+#include <ieeefp.h>  /* needed for 'finite' declaration on some platforms */
+#endif
+
+#include <math.h> /* Moved here from the math section, before extern "C" */
+
+/********************************************
+ * WRAPPER FOR <time.h> and/or <sys/time.h> *
+ ********************************************/
+
+#ifdef HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif
+#include <time.h>
+
+/******************************
+ * WRAPPER FOR <sys/select.h> *
+ ******************************/
+
+/* NB caller must include <sys/types.h> */
+
+#ifdef HAVE_SYS_SELECT_H
+#include <sys/select.h>
+#endif /* HAVE_SYS_SELECT_H */
+
+/*******************************
+ * stat() and fstat() fiddling *
+ *******************************/
+
+#ifdef HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#elif defined(HAVE_STAT_H)
+#include <stat.h>
+#endif
+
+#ifndef S_IFMT
+/* VisualAge C/C++ Failed to Define MountType Field in sys/stat.h */
+#define S_IFMT 0170000
+#endif
+
+#ifndef S_IFLNK
+/* Windows doesn't define S_IFLNK but posixmodule.c maps
+ * IO_REPARSE_TAG_SYMLINK to S_IFLNK */
+#  define S_IFLNK 0120000
+#endif
+
+#ifndef S_ISREG
+#define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
+#endif
+
+#ifndef S_ISDIR
+#define S_ISDIR(x) (((x) & S_IFMT) == S_IFDIR)
+#endif
+
+#ifndef S_ISCHR
+#define S_ISCHR(x) (((x) & S_IFMT) == S_IFCHR)
+#endif
+
+#ifndef S_ISLNK
+#define S_ISLNK(x) (((x) & S_IFMT) == S_IFLNK)
+#endif
+
+#ifdef __cplusplus
+/* Move this down here since some C++ #include's don't like to be included
+   inside an extern "C" */
+extern "C" {
+#endif
+
+
+/* Py_ARITHMETIC_RIGHT_SHIFT
+ * C doesn't define whether a right-shift of a signed integer sign-extends
+ * or zero-fills.  Here is a macro to force sign extension:
+ * Py_ARITHMETIC_RIGHT_SHIFT(TYPE, I, J)
+ *    Return I >> J, forcing sign extension.  Arithmetically, return the
+ *    floor of I/2**J.
+ * Requirements:
+ *    I should have signed integer type.  In the terminology of C99, this can
+ *    be either one of the five standard signed integer types (signed char,
+ *    short, int, long, long long) or an extended signed integer type.
+ *    J is an integer >= 0 and strictly less than the number of bits in the
+ *    type of I (because C doesn't define what happens for J outside that
+ *    range either).
+ *    TYPE used to specify the type of I, but is now ignored.  It's been left
+ *    in for backwards compatibility with versions <= 2.6 or 3.0.
+ * Caution:
+ *    I may be evaluated more than once.
+ */
+#ifdef SIGNED_RIGHT_SHIFT_ZERO_FILLS
+#define Py_ARITHMETIC_RIGHT_SHIFT(TYPE, I, J) \
+    ((I) < 0 ? -1-((-1-(I)) >> (J)) : (I) >> (J))
+#else
+#define Py_ARITHMETIC_RIGHT_SHIFT(TYPE, I, J) ((I) >> (J))
+#endif
+
+/* Py_FORCE_EXPANSION(X)
+ * "Simply" returns its argument.  However, macro expansions within the
+ * argument are evaluated.  This unfortunate trickery is needed to get
+ * token-pasting to work as desired in some cases.
+ */
+#define Py_FORCE_EXPANSION(X) X
+
+/* Py_SAFE_DOWNCAST(VALUE, WIDE, NARROW)
+ * Cast VALUE to type NARROW from type WIDE.  In Py_DEBUG mode, this
+ * assert-fails if any information is lost.
+ * Caution:
+ *    VALUE may be evaluated more than once.
+ */
+#ifdef Py_DEBUG
+#  define Py_SAFE_DOWNCAST(VALUE, WIDE, NARROW) \
+       (assert(_Py_STATIC_CAST(WIDE, _Py_STATIC_CAST(NARROW, (VALUE))) == (VALUE)), \
+        _Py_STATIC_CAST(NARROW, (VALUE)))
+#else
+#  define Py_SAFE_DOWNCAST(VALUE, WIDE, NARROW) _Py_STATIC_CAST(NARROW, (VALUE))
+#endif
+
+
+/* Py_DEPRECATED(version)
+ * Declare a variable, type, or function deprecated.
+ * The macro must be placed before the declaration.
+ * Usage:
+ *    Py_DEPRECATED(3.3) extern int old_var;
+ *    Py_DEPRECATED(3.4) typedef int T1;
+ *    Py_DEPRECATED(3.8) PyAPI_FUNC(int) Py_OldFunction(void);
+ */
+#if defined(__GNUC__) \
+    && ((__GNUC__ >= 4) || (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1))
+#define Py_DEPRECATED(VERSION_UNUSED) __attribute__((__deprecated__))
+#elif defined(_MSC_VER)
+#define Py_DEPRECATED(VERSION) __declspec(deprecated( \
+                                          "deprecated in " #VERSION))
+#else
+#define Py_DEPRECATED(VERSION_UNUSED)
+#endif
+
+// _Py_DEPRECATED_EXTERNALLY(version)
+// Deprecated outside CPython core.
+#ifdef Py_BUILD_CORE
+#define _Py_DEPRECATED_EXTERNALLY(VERSION_UNUSED)
+#else
+#define _Py_DEPRECATED_EXTERNALLY(version) Py_DEPRECATED(version)
+#endif
+
+
+#if defined(__clang__)
+#define _Py_COMP_DIAG_PUSH _Pragma("clang diagnostic push")
+#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS \
+    _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"")
+#define _Py_COMP_DIAG_POP _Pragma("clang diagnostic pop")
+#elif defined(__GNUC__) \
+    && ((__GNUC__ >= 5) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 6))
+#define _Py_COMP_DIAG_PUSH _Pragma("GCC diagnostic push")
+#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS \
+    _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+#define _Py_COMP_DIAG_POP _Pragma("GCC diagnostic pop")
+#elif defined(_MSC_VER)
+#define _Py_COMP_DIAG_PUSH __pragma(warning(push))
+#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS __pragma(warning(disable: 4996))
+#define _Py_COMP_DIAG_POP __pragma(warning(pop))
+#else
+#define _Py_COMP_DIAG_PUSH
+#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS
+#define _Py_COMP_DIAG_POP
+#endif
+
+/* _Py_HOT_FUNCTION
+ * The hot attribute on a function is used to inform the compiler that the
+ * function is a hot spot of the compiled program. The function is optimized
+ * more aggressively and on many targets it is placed into a special subsection
+ * of the text section so all hot functions appear close together, improving
+ * locality.
+ *
+ * Usage:
+ *    int _Py_HOT_FUNCTION x(void) { return 3; }
+ *
+ * Issue #28618: This attribute must not be abused, otherwise it can have a
+ * negative effect on performance. Only the functions where Python spends most
+ * of its time must use it. Use a profiler when running performance benchmark
+ * suite to find these functions.
+ */
+#if defined(__GNUC__) \
+    && ((__GNUC__ >= 5) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 3))
+#define _Py_HOT_FUNCTION __attribute__((hot))
+#else
+#define _Py_HOT_FUNCTION
+#endif
+
+// Ask the compiler to always inline a static inline function. The compiler can
+// ignore it and decide not to inline the function.
+//
+// It can be used to inline performance critical static inline functions when
+// building Python in debug mode with function inlining disabled. For example,
+// MSC disables function inlining when building in debug mode.
+//
+// Marking blindly a static inline function with Py_ALWAYS_INLINE can result in
+// worse performances (due to increased code size for example). The compiler is
+// usually smarter than the developer for the cost/benefit analysis.
+//
+// If Python is built in debug mode (if the Py_DEBUG macro is defined), the
+// Py_ALWAYS_INLINE macro does nothing.
+//
+// It must be specified before the function return type. Usage:
+//
+//     static inline Py_ALWAYS_INLINE int random(void) { return 4; }
+#if defined(Py_DEBUG)
+   // If Python is built in debug mode, usually compiler optimizations are
+   // disabled. In this case, Py_ALWAYS_INLINE can increase a lot the stack
+   // memory usage. For example, forcing inlining using gcc -O0 increases the
+   // stack usage from 6 KB to 15 KB per Python function call.
+#  define Py_ALWAYS_INLINE
+#elif defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+#  define Py_ALWAYS_INLINE __attribute__((always_inline))
+#elif defined(_MSC_VER)
+#  define Py_ALWAYS_INLINE __forceinline
+#else
+#  define Py_ALWAYS_INLINE
+#endif
+
+// Py_NO_INLINE
+// Disable inlining on a function. For example, it reduces the C stack
+// consumption: useful on LTO+PGO builds which heavily inline code (see
+// bpo-33720).
+//
+// Usage:
+//
+//    Py_NO_INLINE static int random(void) { return 4; }
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+#  define Py_NO_INLINE __attribute__ ((noinline))
+#elif defined(_MSC_VER)
+#  define Py_NO_INLINE __declspec(noinline)
+#else
+#  define Py_NO_INLINE
+#endif
+
+/**************************************************************************
+Prototypes that are missing from the standard include files on some systems
+(and possibly only some versions of such systems.)
+
+Please be conservative with adding new ones, document them and enclose them
+in platform-specific #ifdefs.
+**************************************************************************/
+
+#ifdef HAVE__GETPTY
+#include <sys/types.h>          /* we need to import mode_t */
+extern char * _getpty(int *, int, mode_t, int);
+#endif
+
+/* On QNX 6, struct termio must be declared by including sys/termio.h
+   if TCGETA, TCSETA, TCSETAW, or TCSETAF are used.  sys/termio.h must
+   be included before termios.h or it will generate an error. */
+#if defined(HAVE_SYS_TERMIO_H) && !defined(__hpux)
+#include <sys/termio.h>
+#endif
+
+
+/* On 4.4BSD descendants, the ctype functions serve the whole range of
+ * the wchar_t character set rather than single-byte code points only.
+ * This characteristic can break some string object operations,
+ * including str.upper() and str.split(), on UTF-8 locales.  This
+ * workaround was provided by Tim Robbins of the FreeBSD project.
+ */
+
+#if defined(__APPLE__)
+#  define _PY_PORT_CTYPE_UTF8_ISSUE
+#endif
+
+#ifdef _PY_PORT_CTYPE_UTF8_ISSUE
+#ifndef __cplusplus
+   /* The workaround below is unsafe in C++ because
+    * the <locale> defines these symbols as real functions,
+    * with a slightly different signature.
+    * See issue #10910
+    */
+#include <ctype.h>
+#include <wctype.h>
+#undef isalnum
+#define isalnum(c) iswalnum(btowc(c))
+#undef isalpha
+#define isalpha(c) iswalpha(btowc(c))
+#undef islower
+#define islower(c) iswlower(btowc(c))
+#undef isspace
+#define isspace(c) iswspace(btowc(c))
+#undef isupper
+#define isupper(c) iswupper(btowc(c))
+#undef tolower
+#define tolower(c) towlower(btowc(c))
+#undef toupper
+#define toupper(c) towupper(btowc(c))
+#endif
+#endif
+
+
+/* Declarations for symbol visibility.
+
+  PyAPI_FUNC(type): Declares a public Python API function and return type
+  PyAPI_DATA(type): Declares public Python data and its type
+  PyMODINIT_FUNC:   A Python module init function.  If these functions are
+                    inside the Python core, they are private to the core.
+                    If in an extension module, it may be declared with
+                    external linkage depending on the platform.
+
+  As a number of platforms support/require "__declspec(dllimport/dllexport)",
+  we support a HAVE_DECLSPEC_DLL macro to save duplication.
+*/
+
+/*
+  All windows ports, except cygwin, are handled in PC/pyconfig.h.
+
+  Cygwin is the only other autoconf platform requiring special
+  linkage handling and it uses __declspec().
+*/
+#if defined(__CYGWIN__)
+#       define HAVE_DECLSPEC_DLL
+#endif
+
+#include "exports.h"
+
+/* only get special linkage if built as shared or platform is Cygwin */
+#if defined(Py_ENABLE_SHARED) || defined(__CYGWIN__)
+#       if defined(HAVE_DECLSPEC_DLL)
+#               if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE)
+#                       define PyAPI_FUNC(RTYPE) Py_EXPORTED_SYMBOL RTYPE
+#                       define PyAPI_DATA(RTYPE) extern Py_EXPORTED_SYMBOL RTYPE
+        /* module init functions inside the core need no external linkage */
+        /* except for Cygwin to handle embedding */
+#                       if defined(__CYGWIN__)
+#                               define PyMODINIT_FUNC Py_EXPORTED_SYMBOL PyObject*
+#                       else /* __CYGWIN__ */
+#                               define PyMODINIT_FUNC PyObject*
+#                       endif /* __CYGWIN__ */
+#               else /* Py_BUILD_CORE */
+        /* Building an extension module, or an embedded situation */
+        /* public Python functions and data are imported */
+        /* Under Cygwin, auto-import functions to prevent compilation */
+        /* failures similar to those described at the bottom of 4.1: */
+        /* http://docs.python.org/extending/windows.html#a-cookbook-approach */
+#                       if !defined(__CYGWIN__)
+#                               define PyAPI_FUNC(RTYPE) Py_IMPORTED_SYMBOL RTYPE
+#                       endif /* !__CYGWIN__ */
+#                       define PyAPI_DATA(RTYPE) extern Py_IMPORTED_SYMBOL RTYPE
+        /* module init functions outside the core must be exported */
+#                       if defined(__cplusplus)
+#                               define PyMODINIT_FUNC extern "C" Py_EXPORTED_SYMBOL PyObject*
+#                       else /* __cplusplus */
+#                               define PyMODINIT_FUNC Py_EXPORTED_SYMBOL PyObject*
+#                       endif /* __cplusplus */
+#               endif /* Py_BUILD_CORE */
+#       endif /* HAVE_DECLSPEC_DLL */
+#endif /* Py_ENABLE_SHARED */
+
+/* If no external linkage macros defined by now, create defaults */
+#ifndef PyAPI_FUNC
+#       define PyAPI_FUNC(RTYPE) Py_EXPORTED_SYMBOL RTYPE
+#endif
+#ifndef PyAPI_DATA
+#       define PyAPI_DATA(RTYPE) extern Py_EXPORTED_SYMBOL RTYPE
+#endif
+#ifndef PyMODINIT_FUNC
+#       if defined(__cplusplus)
+#               define PyMODINIT_FUNC extern "C" Py_EXPORTED_SYMBOL PyObject*
+#       else /* __cplusplus */
+#               define PyMODINIT_FUNC Py_EXPORTED_SYMBOL PyObject*
+#       endif /* __cplusplus */
+#endif
+
+/* limits.h constants that may be missing */
+
+#ifndef INT_MAX
+#define INT_MAX 2147483647
+#endif
+
+#ifndef LONG_MAX
+#if SIZEOF_LONG == 4
+#define LONG_MAX 0X7FFFFFFFL
+#elif SIZEOF_LONG == 8
+#define LONG_MAX 0X7FFFFFFFFFFFFFFFL
+#else
+#error "could not set LONG_MAX in pyport.h"
+#endif
+#endif
+
+#ifndef LONG_MIN
+#define LONG_MIN (-LONG_MAX-1)
+#endif
+
+#ifndef LONG_BIT
+#define LONG_BIT (8 * SIZEOF_LONG)
+#endif
+
+#if LONG_BIT != 8 * SIZEOF_LONG
+/* 04-Oct-2000 LONG_BIT is apparently (mis)defined as 64 on some recent
+ * 32-bit platforms using gcc.  We try to catch that here at compile-time
+ * rather than waiting for integer multiplication to trigger bogus
+ * overflows.
+ */
+#error "LONG_BIT definition appears wrong for platform (bad gcc/glibc config?)."
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+/*
+ * Hide GCC attributes from compilers that don't support them.
+ */
+#if (!defined(__GNUC__) || __GNUC__ < 2 || \
+     (__GNUC__ == 2 && __GNUC_MINOR__ < 7) )
+#define Py_GCC_ATTRIBUTE(x)
+#else
+#define Py_GCC_ATTRIBUTE(x) __attribute__(x)
+#endif
+
+/*
+ * Specify alignment on compilers that support it.
+ */
+#if defined(__GNUC__) && __GNUC__ >= 3
+#define Py_ALIGNED(x) __attribute__((aligned(x)))
+#else
+#define Py_ALIGNED(x)
+#endif
+
+/* Eliminate end-of-loop code not reached warnings from SunPro C
+ * when using do{...}while(0) macros
+ */
+#ifdef __SUNPRO_C
+#pragma error_messages (off,E_END_OF_LOOP_CODE_NOT_REACHED)
+#endif
+
+#ifndef Py_LL
+#define Py_LL(x) x##LL
+#endif
+
+#ifndef Py_ULL
+#define Py_ULL(x) Py_LL(x##U)
+#endif
+
+#define Py_VA_COPY va_copy
+
+/*
+ * Convenient macros to deal with endianness of the platform. WORDS_BIGENDIAN is
+ * detected by configure and defined in pyconfig.h. The code in pyconfig.h
+ * also takes care of Apple's universal builds.
+ */
+
+#ifdef WORDS_BIGENDIAN
+#  define PY_BIG_ENDIAN 1
+#  define PY_LITTLE_ENDIAN 0
+#else
+#  define PY_BIG_ENDIAN 0
+#  define PY_LITTLE_ENDIAN 1
+#endif
+
+#ifdef __ANDROID__
+   /* The Android langinfo.h header is not used. */
+#  undef HAVE_LANGINFO_H
+#  undef CODESET
+#endif
+
+/* Maximum value of the Windows DWORD type */
+#define PY_DWORD_MAX 4294967295U
+
+/* This macro used to tell whether Python was built with multithreading
+ * enabled.  Now multithreading is always enabled, but keep the macro
+ * for compatibility.
+ */
+#ifndef WITH_THREAD
+#  define WITH_THREAD
+#endif
+
+#ifdef WITH_THREAD
+#  ifdef Py_BUILD_CORE
+#    ifdef HAVE_THREAD_LOCAL
+#      error "HAVE_THREAD_LOCAL is already defined"
+#    endif
+#    define HAVE_THREAD_LOCAL 1
+#    ifdef thread_local
+#      define _Py_thread_local thread_local
+#    elif __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
+#      define _Py_thread_local _Thread_local
+#    elif defined(_MSC_VER)  /* AKA NT_THREADS */
+#      define _Py_thread_local __declspec(thread)
+#    elif defined(__GNUC__)  /* includes clang */
+#      define _Py_thread_local __thread
+#    else
+       // fall back to the PyThread_tss_*() API, or ignore.
+#      undef HAVE_THREAD_LOCAL
+#    endif
+#  endif
+#endif
+
+/* Check that ALT_SOABI is consistent with Py_TRACE_REFS:
+   ./configure --with-trace-refs must be used to define Py_TRACE_REFS */
+#if defined(ALT_SOABI) && defined(Py_TRACE_REFS)
+#  error "Py_TRACE_REFS ABI is not compatible with release and debug ABI"
+#endif
+
+#if defined(__ANDROID__) || defined(__VXWORKS__)
+   // Use UTF-8 as the locale encoding, ignore the LC_CTYPE locale.
+   // See _Py_GetLocaleEncoding(), PyUnicode_DecodeLocale()
+   // and PyUnicode_EncodeLocale().
+#  define _Py_FORCE_UTF8_LOCALE
+#endif
+
+#if defined(_Py_FORCE_UTF8_LOCALE) || defined(__APPLE__)
+   // Use UTF-8 as the filesystem encoding.
+   // See PyUnicode_DecodeFSDefaultAndSize(), PyUnicode_EncodeFSDefault(),
+   // Py_DecodeLocale() and Py_EncodeLocale().
+#  define _Py_FORCE_UTF8_FS_ENCODING
+#endif
+
+/* Mark a function which cannot return. Example:
+   PyAPI_FUNC(void) _Py_NO_RETURN PyThread_exit_thread(void);
+
+   XLC support is intentionally omitted due to bpo-40244 */
+#ifndef _Py_NO_RETURN
+#if defined(__clang__) || \
+    (defined(__GNUC__) && \
+     ((__GNUC__ >= 3) || \
+      (__GNUC__ == 2) && (__GNUC_MINOR__ >= 5)))
+#  define _Py_NO_RETURN __attribute__((__noreturn__))
+#elif defined(_MSC_VER)
+#  define _Py_NO_RETURN __declspec(noreturn)
+#else
+#  define _Py_NO_RETURN
+#endif
+#endif
+
+
+// Preprocessor check for a builtin preprocessor function. Always return 0
+// if __has_builtin() macro is not defined.
+//
+// __has_builtin() is available on clang and GCC 10.
+#ifdef __has_builtin
+#  define _Py__has_builtin(x) __has_builtin(x)
+#else
+#  define _Py__has_builtin(x) 0
+#endif
+
+// _Py_TYPEOF(expr) gets the type of an expression.
+//
+// Example: _Py_TYPEOF(x) x_copy = (x);
+//
+// The macro is only defined if GCC or clang compiler is used.
+#if defined(__GNUC__) || defined(__clang__)
+#  define _Py_TYPEOF(expr) __typeof__(expr)
+#endif
+
+
+/* A convenient way for code to know if sanitizers are enabled. */
+#if defined(__has_feature)
+#  if __has_feature(memory_sanitizer)
+#    if !defined(_Py_MEMORY_SANITIZER)
+#      define _Py_MEMORY_SANITIZER
+#    endif
+#  endif
+#  if __has_feature(address_sanitizer)
+#    if !defined(_Py_ADDRESS_SANITIZER)
+#      define _Py_ADDRESS_SANITIZER
+#    endif
+#  endif
+#  if __has_feature(thread_sanitizer)
+#    if !defined(_Py_THREAD_SANITIZER)
+#      define _Py_THREAD_SANITIZER
+#    endif
+#  endif
+#elif defined(__GNUC__)
+#  if defined(__SANITIZE_ADDRESS__)
+#    define _Py_ADDRESS_SANITIZER
+#  endif
+#endif
+
+
+/* AIX has __bool__ redefined in its system header file. */
+#if defined(_AIX) && defined(__bool__)
+#undef __bool__
+#endif
+
+// Make sure we have maximum alignment, even if the current compiler
+// does not support max_align_t. Note that:
+// - Autoconf reports alignment of unknown types to 0.
+// - 'long double' has maximum alignment on *most* platforms,
+//   looks like the best we can do for pre-C11 compilers.
+// - The value is tested, see test_alignof_max_align_t
+#if !defined(ALIGNOF_MAX_ALIGN_T) || ALIGNOF_MAX_ALIGN_T == 0
+#   undef ALIGNOF_MAX_ALIGN_T
+#   define ALIGNOF_MAX_ALIGN_T _Alignof(long double)
+#endif
+
+#endif /* Py_PYPORT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pystate.h b/nanvix-port/cpython-headers/python3.12/pystate.h
new file mode 100644
index 000000000000..e6b4de979c87
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pystate.h
@@ -0,0 +1,132 @@
+/* Thread and interpreter state structures and their interfaces */
+
+
+#ifndef Py_PYSTATE_H
+#define Py_PYSTATE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* This limitation is for performance and simplicity. If needed it can be
+removed (with effort). */
+#define MAX_CO_EXTRA_USERS 255
+
+PyAPI_FUNC(PyInterpreterState *) PyInterpreterState_New(void);
+PyAPI_FUNC(void) PyInterpreterState_Clear(PyInterpreterState *);
+PyAPI_FUNC(void) PyInterpreterState_Delete(PyInterpreterState *);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03090000
+/* New in 3.9 */
+/* Get the current interpreter state.
+
+   Issue a fatal error if there is no current Python thread state or no
+   current interpreter. It cannot return NULL.
+
+   The caller must hold the GIL. */
+PyAPI_FUNC(PyInterpreterState *) PyInterpreterState_Get(void);
+#endif
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03080000
+/* New in 3.8 */
+PyAPI_FUNC(PyObject *) PyInterpreterState_GetDict(PyInterpreterState *);
+#endif
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000
+/* New in 3.7 */
+PyAPI_FUNC(int64_t) PyInterpreterState_GetID(PyInterpreterState *);
+#endif
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+
+/* State unique per thread */
+
+/* New in 3.3 */
+PyAPI_FUNC(int) PyState_AddModule(PyObject*, PyModuleDef*);
+PyAPI_FUNC(int) PyState_RemoveModule(PyModuleDef*);
+#endif
+PyAPI_FUNC(PyObject*) PyState_FindModule(PyModuleDef*);
+
+PyAPI_FUNC(PyThreadState *) PyThreadState_New(PyInterpreterState *);
+PyAPI_FUNC(void) PyThreadState_Clear(PyThreadState *);
+PyAPI_FUNC(void) PyThreadState_Delete(PyThreadState *);
+
+/* Get the current thread state.
+
+   When the current thread state is NULL, this issues a fatal error (so that
+   the caller needn't check for NULL).
+
+   The caller must hold the GIL.
+
+   See also _PyThreadState_UncheckedGet() and _PyThreadState_GET(). */
+PyAPI_FUNC(PyThreadState *) PyThreadState_Get(void);
+
+// Alias to PyThreadState_Get()
+#define PyThreadState_GET() PyThreadState_Get()
+
+PyAPI_FUNC(PyThreadState *) PyThreadState_Swap(PyThreadState *);
+PyAPI_FUNC(PyObject *) PyThreadState_GetDict(void);
+PyAPI_FUNC(int) PyThreadState_SetAsyncExc(unsigned long, PyObject *);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03090000
+/* New in 3.9 */
+PyAPI_FUNC(PyInterpreterState*) PyThreadState_GetInterpreter(PyThreadState *tstate);
+PyAPI_FUNC(PyFrameObject*) PyThreadState_GetFrame(PyThreadState *tstate);
+PyAPI_FUNC(uint64_t) PyThreadState_GetID(PyThreadState *tstate);
+#endif
+
+typedef
+    enum {PyGILState_LOCKED, PyGILState_UNLOCKED}
+        PyGILState_STATE;
+
+
+/* Ensure that the current thread is ready to call the Python
+   C API, regardless of the current state of Python, or of its
+   thread lock.  This may be called as many times as desired
+   by a thread so long as each call is matched with a call to
+   PyGILState_Release().  In general, other thread-state APIs may
+   be used between _Ensure() and _Release() calls, so long as the
+   thread-state is restored to its previous state before the Release().
+   For example, normal use of the Py_BEGIN_ALLOW_THREADS/
+   Py_END_ALLOW_THREADS macros are acceptable.
+
+   The return value is an opaque "handle" to the thread state when
+   PyGILState_Ensure() was called, and must be passed to
+   PyGILState_Release() to ensure Python is left in the same state. Even
+   though recursive calls are allowed, these handles can *not* be shared -
+   each unique call to PyGILState_Ensure must save the handle for its
+   call to PyGILState_Release.
+
+   When the function returns, the current thread will hold the GIL.
+
+   Failure is a fatal error.
+*/
+PyAPI_FUNC(PyGILState_STATE) PyGILState_Ensure(void);
+
+/* Release any resources previously acquired.  After this call, Python's
+   state will be the same as it was prior to the corresponding
+   PyGILState_Ensure() call (but generally this state will be unknown to
+   the caller, hence the use of the GILState API.)
+
+   Every call to PyGILState_Ensure must be matched by a call to
+   PyGILState_Release on the same thread.
+*/
+PyAPI_FUNC(void) PyGILState_Release(PyGILState_STATE);
+
+/* Helper/diagnostic function - get the current thread state for
+   this thread.  May return NULL if no GILState API has been used
+   on the current thread.  Note that the main thread always has such a
+   thread-state, even if no auto-thread-state call has been made
+   on the main thread.
+*/
+PyAPI_FUNC(PyThreadState *) PyGILState_GetThisThreadState(void);
+
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_PYSTATE_H
+#  include "cpython/pystate.h"
+#  undef Py_CPYTHON_PYSTATE_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_PYSTATE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pystats.h b/nanvix-port/cpython-headers/python3.12/pystats.h
new file mode 100644
index 000000000000..4b961bad2a43
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pystats.h
@@ -0,0 +1,110 @@
+
+
+#ifndef Py_PYSTATS_H
+#define Py_PYSTATS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef Py_STATS
+
+#define SPECIALIZATION_FAILURE_KINDS 36
+
+/* Stats for determining who is calling PyEval_EvalFrame */
+#define EVAL_CALL_TOTAL 0
+#define EVAL_CALL_VECTOR 1
+#define EVAL_CALL_GENERATOR 2
+#define EVAL_CALL_LEGACY 3
+#define EVAL_CALL_FUNCTION_VECTORCALL 4
+#define EVAL_CALL_BUILD_CLASS 5
+#define EVAL_CALL_SLOT 6
+#define EVAL_CALL_FUNCTION_EX 7
+#define EVAL_CALL_API 8
+#define EVAL_CALL_METHOD 9
+
+#define EVAL_CALL_KINDS 10
+
+typedef struct _specialization_stats {
+    uint64_t success;
+    uint64_t failure;
+    uint64_t hit;
+    uint64_t deferred;
+    uint64_t miss;
+    uint64_t deopt;
+    uint64_t failure_kinds[SPECIALIZATION_FAILURE_KINDS];
+} SpecializationStats;
+
+typedef struct _opcode_stats {
+    SpecializationStats specialization;
+    uint64_t execution_count;
+    uint64_t pair_count[256];
+} OpcodeStats;
+
+typedef struct _call_stats {
+    uint64_t inlined_py_calls;
+    uint64_t pyeval_calls;
+    uint64_t frames_pushed;
+    uint64_t frame_objects_created;
+    uint64_t eval_calls[EVAL_CALL_KINDS];
+} CallStats;
+
+typedef struct _object_stats {
+    uint64_t increfs;
+    uint64_t decrefs;
+    uint64_t interpreter_increfs;
+    uint64_t interpreter_decrefs;
+    uint64_t allocations;
+    uint64_t allocations512;
+    uint64_t allocations4k;
+    uint64_t allocations_big;
+    uint64_t frees;
+    uint64_t to_freelist;
+    uint64_t from_freelist;
+    uint64_t new_values;
+    uint64_t dict_materialized_on_request;
+    uint64_t dict_materialized_new_key;
+    uint64_t dict_materialized_too_big;
+    uint64_t dict_materialized_str_subclass;
+    uint64_t type_cache_hits;
+    uint64_t type_cache_misses;
+    uint64_t type_cache_dunder_hits;
+    uint64_t type_cache_dunder_misses;
+    uint64_t type_cache_collisions;
+} ObjectStats;
+
+typedef struct _stats {
+    OpcodeStats opcode_stats[256];
+    CallStats call_stats;
+    ObjectStats object_stats;
+} PyStats;
+
+
+PyAPI_DATA(PyStats) _py_stats_struct;
+PyAPI_DATA(PyStats *) _py_stats;
+
+extern void _Py_StatsClear(void);
+extern void _Py_PrintSpecializationStats(int to_file);
+
+#ifdef _PY_INTERPRETER
+
+#define _Py_INCREF_STAT_INC() do { if (_py_stats) _py_stats->object_stats.interpreter_increfs++; } while (0)
+#define _Py_DECREF_STAT_INC() do { if (_py_stats) _py_stats->object_stats.interpreter_decrefs++; } while (0)
+
+#else
+
+#define _Py_INCREF_STAT_INC() do { if (_py_stats) _py_stats->object_stats.increfs++; } while (0)
+#define _Py_DECREF_STAT_INC() do { if (_py_stats) _py_stats->object_stats.decrefs++; } while (0)
+
+#endif
+
+#else
+
+#define _Py_INCREF_STAT_INC() ((void)0)
+#define _Py_DECREF_STAT_INC() ((void)0)
+
+#endif  // !Py_STATS
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_PYSTATS_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pystrcmp.h b/nanvix-port/cpython-headers/python3.12/pystrcmp.h
new file mode 100644
index 000000000000..edb12397e3cb
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pystrcmp.h
@@ -0,0 +1,23 @@
+#ifndef Py_STRCMP_H
+#define Py_STRCMP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_FUNC(int) PyOS_mystrnicmp(const char *, const char *, Py_ssize_t);
+PyAPI_FUNC(int) PyOS_mystricmp(const char *, const char *);
+
+#ifdef MS_WINDOWS
+#define PyOS_strnicmp strnicmp
+#define PyOS_stricmp stricmp
+#else
+#define PyOS_strnicmp PyOS_mystrnicmp
+#define PyOS_stricmp PyOS_mystricmp
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !Py_STRCMP_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pystrtod.h b/nanvix-port/cpython-headers/python3.12/pystrtod.h
new file mode 100644
index 000000000000..fa056d17b639
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pystrtod.h
@@ -0,0 +1,46 @@
+#ifndef Py_STRTOD_H
+#define Py_STRTOD_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+PyAPI_FUNC(double) PyOS_string_to_double(const char *str,
+                                         char **endptr,
+                                         PyObject *overflow_exception);
+
+/* The caller is responsible for calling PyMem_Free to free the buffer
+   that is returned. */
+PyAPI_FUNC(char *) PyOS_double_to_string(double val,
+                                         char format_code,
+                                         int precision,
+                                         int flags,
+                                         int *type);
+
+#ifndef Py_LIMITED_API
+PyAPI_FUNC(PyObject *) _Py_string_to_number_with_underscores(
+    const char *str, Py_ssize_t len, const char *what, PyObject *obj, void *arg,
+    PyObject *(*innerfunc)(const char *, Py_ssize_t, void *));
+
+PyAPI_FUNC(double) _Py_parse_inf_or_nan(const char *p, char **endptr);
+#endif
+
+
+/* PyOS_double_to_string's "flags" parameter can be set to 0 or more of: */
+#define Py_DTSF_SIGN      0x01 /* always add the sign */
+#define Py_DTSF_ADD_DOT_0 0x02 /* if the result is an integer add ".0" */
+#define Py_DTSF_ALT       0x04 /* "alternate" formatting. it's format_code
+                                  specific */
+#define Py_DTSF_NO_NEG_0  0x08 /* negative zero result is coerced to 0 */
+
+/* PyOS_double_to_string's "type", if non-NULL, will be set to one of: */
+#define Py_DTST_FINITE 0
+#define Py_DTST_INFINITE 1
+#define Py_DTST_NAN 2
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !Py_STRTOD_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pythonrun.h b/nanvix-port/cpython-headers/python3.12/pythonrun.h
new file mode 100644
index 000000000000..154c7450cb93
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pythonrun.h
@@ -0,0 +1,49 @@
+
+/* Interfaces to parse and execute pieces of python code */
+
+#ifndef Py_PYTHONRUN_H
+#define Py_PYTHONRUN_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_FUNC(PyObject *) Py_CompileString(const char *, const char *, int);
+
+PyAPI_FUNC(void) PyErr_Print(void);
+PyAPI_FUNC(void) PyErr_PrintEx(int);
+PyAPI_FUNC(void) PyErr_Display(PyObject *, PyObject *, PyObject *);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030C0000
+PyAPI_FUNC(void) PyErr_DisplayException(PyObject *);
+#endif
+
+
+/* Stuff with no proper home (yet) */
+PyAPI_DATA(int) (*PyOS_InputHook)(void);
+
+/* Stack size, in "pointers" (so we get extra safety margins
+   on 64-bit platforms).  On a 32-bit platform, this translates
+   to an 8k margin. */
+#define PYOS_STACK_MARGIN 2048
+
+#if defined(WIN32) && !defined(MS_WIN64) && !defined(_M_ARM) && defined(_MSC_VER) && _MSC_VER >= 1300
+/* Enable stack checking under Microsoft C */
+// When changing the platforms, ensure PyOS_CheckStack() docs are still correct
+#define USE_STACKCHECK
+#endif
+
+#ifdef USE_STACKCHECK
+/* Check that we aren't overflowing our stack */
+PyAPI_FUNC(int) PyOS_CheckStack(void);
+#endif
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_PYTHONRUN_H
+#  include "cpython/pythonrun.h"
+#  undef Py_CPYTHON_PYTHONRUN_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_PYTHONRUN_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pythread.h b/nanvix-port/cpython-headers/python3.12/pythread.h
new file mode 100644
index 000000000000..63714437c496
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pythread.h
@@ -0,0 +1,135 @@
+#ifndef Py_PYTHREAD_H
+#define Py_PYTHREAD_H
+
+typedef void *PyThread_type_lock;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Return status codes for Python lock acquisition.  Chosen for maximum
+ * backwards compatibility, i.e. failure -> 0, success -> 1.  */
+typedef enum PyLockStatus {
+    PY_LOCK_FAILURE = 0,
+    PY_LOCK_ACQUIRED = 1,
+    PY_LOCK_INTR
+} PyLockStatus;
+
+PyAPI_FUNC(void) PyThread_init_thread(void);
+PyAPI_FUNC(unsigned long) PyThread_start_new_thread(void (*)(void *), void *);
+PyAPI_FUNC(void) _Py_NO_RETURN PyThread_exit_thread(void);
+PyAPI_FUNC(unsigned long) PyThread_get_thread_ident(void);
+
+#if (defined(__APPLE__) || defined(__linux__) || defined(_WIN32) \
+     || defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) \
+     || defined(__DragonFly__) || defined(_AIX))
+#define PY_HAVE_THREAD_NATIVE_ID
+PyAPI_FUNC(unsigned long) PyThread_get_thread_native_id(void);
+#endif
+
+PyAPI_FUNC(PyThread_type_lock) PyThread_allocate_lock(void);
+PyAPI_FUNC(void) PyThread_free_lock(PyThread_type_lock);
+PyAPI_FUNC(int) PyThread_acquire_lock(PyThread_type_lock, int);
+#define WAIT_LOCK       1
+#define NOWAIT_LOCK     0
+
+/* PY_TIMEOUT_T is the integral type used to specify timeouts when waiting
+   on a lock (see PyThread_acquire_lock_timed() below).
+   PY_TIMEOUT_MAX is the highest usable value (in microseconds) of that
+   type, and depends on the system threading API.
+
+   NOTE: this isn't the same value as `_thread.TIMEOUT_MAX`.  The _thread
+   module exposes a higher-level API, with timeouts expressed in seconds
+   and floating-point numbers allowed.
+*/
+#define PY_TIMEOUT_T long long
+
+#if defined(_POSIX_THREADS)
+   /* PyThread_acquire_lock_timed() uses _PyTime_FromNanoseconds(us * 1000),
+      convert microseconds to nanoseconds. */
+#  define PY_TIMEOUT_MAX (LLONG_MAX / 1000)
+#elif defined (NT_THREADS)
+   // WaitForSingleObject() accepts timeout in milliseconds in the range
+   // [0; 0xFFFFFFFE] (DWORD type). INFINITE value (0xFFFFFFFF) means no
+   // timeout. 0xFFFFFFFE milliseconds is around 49.7 days.
+#  if 0xFFFFFFFELL * 1000 < LLONG_MAX
+#    define PY_TIMEOUT_MAX (0xFFFFFFFELL * 1000)
+#  else
+#    define PY_TIMEOUT_MAX LLONG_MAX
+#  endif
+#else
+#  define PY_TIMEOUT_MAX LLONG_MAX
+#endif
+
+
+/* If microseconds == 0, the call is non-blocking: it returns immediately
+   even when the lock can't be acquired.
+   If microseconds > 0, the call waits up to the specified duration.
+   If microseconds < 0, the call waits until success (or abnormal failure)
+
+   microseconds must be less than PY_TIMEOUT_MAX. Behaviour otherwise is
+   undefined.
+
+   If intr_flag is true and the acquire is interrupted by a signal, then the
+   call will return PY_LOCK_INTR.  The caller may reattempt to acquire the
+   lock.
+*/
+PyAPI_FUNC(PyLockStatus) PyThread_acquire_lock_timed(PyThread_type_lock,
+                                                     PY_TIMEOUT_T microseconds,
+                                                     int intr_flag);
+
+PyAPI_FUNC(void) PyThread_release_lock(PyThread_type_lock);
+
+PyAPI_FUNC(size_t) PyThread_get_stacksize(void);
+PyAPI_FUNC(int) PyThread_set_stacksize(size_t);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+PyAPI_FUNC(PyObject*) PyThread_GetInfo(void);
+#endif
+
+
+/* Thread Local Storage (TLS) API
+   TLS API is DEPRECATED.  Use Thread Specific Storage (TSS) API.
+
+   The existing TLS API uses int to represent TLS keys across all
+   platforms, which is not POSIX-compliant.  Therefore, the new TSS API uses
+   an opaque data type to represent TSS keys to be compatible (see PEP 539).
+*/
+Py_DEPRECATED(3.7) PyAPI_FUNC(int) PyThread_create_key(void);
+Py_DEPRECATED(3.7) PyAPI_FUNC(void) PyThread_delete_key(int key);
+Py_DEPRECATED(3.7) PyAPI_FUNC(int) PyThread_set_key_value(int key,
+                                                          void *value);
+Py_DEPRECATED(3.7) PyAPI_FUNC(void *) PyThread_get_key_value(int key);
+Py_DEPRECATED(3.7) PyAPI_FUNC(void) PyThread_delete_key_value(int key);
+
+/* Cleanup after a fork */
+Py_DEPRECATED(3.7) PyAPI_FUNC(void) PyThread_ReInitTLS(void);
+
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03070000
+/* New in 3.7 */
+/* Thread Specific Storage (TSS) API */
+
+typedef struct _Py_tss_t Py_tss_t;  /* opaque */
+
+PyAPI_FUNC(Py_tss_t *) PyThread_tss_alloc(void);
+PyAPI_FUNC(void) PyThread_tss_free(Py_tss_t *key);
+
+/* The parameter key must not be NULL. */
+PyAPI_FUNC(int) PyThread_tss_is_created(Py_tss_t *key);
+PyAPI_FUNC(int) PyThread_tss_create(Py_tss_t *key);
+PyAPI_FUNC(void) PyThread_tss_delete(Py_tss_t *key);
+PyAPI_FUNC(int) PyThread_tss_set(Py_tss_t *key, void *value);
+PyAPI_FUNC(void *) PyThread_tss_get(Py_tss_t *key);
+#endif  /* New in 3.7 */
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_PYTHREAD_H
+#  include "cpython/pythread.h"
+#  undef Py_CPYTHON_PYTHREAD_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_PYTHREAD_H */
diff --git a/nanvix-port/cpython-headers/python3.12/pytypedefs.h b/nanvix-port/cpython-headers/python3.12/pytypedefs.h
new file mode 100644
index 000000000000..e78ed56a3b67
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/pytypedefs.h
@@ -0,0 +1,30 @@
+// Forward declarations of types of the Python C API.
+// Declare them at the same place since redefining typedef is a C11 feature.
+// Only use a forward declaration if there is an interdependency between two
+// header files.
+
+#ifndef Py_PYTYPEDEFS_H
+#define Py_PYTYPEDEFS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct PyModuleDef PyModuleDef;
+typedef struct PyModuleDef_Slot PyModuleDef_Slot;
+typedef struct PyMethodDef PyMethodDef;
+typedef struct PyGetSetDef PyGetSetDef;
+typedef struct PyMemberDef PyMemberDef;
+
+typedef struct _object PyObject;
+typedef struct _longobject PyLongObject;
+typedef struct _typeobject PyTypeObject;
+typedef struct PyCodeObject PyCodeObject;
+typedef struct _frame PyFrameObject;
+
+typedef struct _ts PyThreadState;
+typedef struct _is PyInterpreterState;
+
+#ifdef __cplusplus
+}
+#endif
+#endif   // !Py_PYTYPEDEFS_H
diff --git a/nanvix-port/cpython-headers/python3.12/rangeobject.h b/nanvix-port/cpython-headers/python3.12/rangeobject.h
new file mode 100644
index 000000000000..d46ce7cd41b7
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/rangeobject.h
@@ -0,0 +1,27 @@
+
+/* Range object interface */
+
+#ifndef Py_RANGEOBJECT_H
+#define Py_RANGEOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+A range object represents an integer range.  This is an immutable object;
+a range cannot change its value after creation.
+
+Range objects behave like the corresponding tuple objects except that
+they are represented by start, stop, and step data members.
+*/
+
+PyAPI_DATA(PyTypeObject) PyRange_Type;
+PyAPI_DATA(PyTypeObject) PyRangeIter_Type;
+PyAPI_DATA(PyTypeObject) PyLongRangeIter_Type;
+
+#define PyRange_Check(op) Py_IS_TYPE((op), &PyRange_Type)
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_RANGEOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/setobject.h b/nanvix-port/cpython-headers/python3.12/setobject.h
new file mode 100644
index 000000000000..62c9e6b13f89
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/setobject.h
@@ -0,0 +1,49 @@
+/* Set object interface */
+
+#ifndef Py_SETOBJECT_H
+#define Py_SETOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_DATA(PyTypeObject) PySet_Type;
+PyAPI_DATA(PyTypeObject) PyFrozenSet_Type;
+PyAPI_DATA(PyTypeObject) PySetIter_Type;
+
+PyAPI_FUNC(PyObject *) PySet_New(PyObject *);
+PyAPI_FUNC(PyObject *) PyFrozenSet_New(PyObject *);
+
+PyAPI_FUNC(int) PySet_Add(PyObject *set, PyObject *key);
+PyAPI_FUNC(int) PySet_Clear(PyObject *set);
+PyAPI_FUNC(int) PySet_Contains(PyObject *anyset, PyObject *key);
+PyAPI_FUNC(int) PySet_Discard(PyObject *set, PyObject *key);
+PyAPI_FUNC(PyObject *) PySet_Pop(PyObject *set);
+PyAPI_FUNC(Py_ssize_t) PySet_Size(PyObject *anyset);
+
+#define PyFrozenSet_CheckExact(ob) Py_IS_TYPE((ob), &PyFrozenSet_Type)
+#define PyFrozenSet_Check(ob) \
+    (Py_IS_TYPE((ob), &PyFrozenSet_Type) || \
+      PyType_IsSubtype(Py_TYPE(ob), &PyFrozenSet_Type))
+
+#define PyAnySet_CheckExact(ob) \
+    (Py_IS_TYPE((ob), &PySet_Type) || Py_IS_TYPE((ob), &PyFrozenSet_Type))
+#define PyAnySet_Check(ob) \
+    (Py_IS_TYPE((ob), &PySet_Type) || Py_IS_TYPE((ob), &PyFrozenSet_Type) || \
+      PyType_IsSubtype(Py_TYPE(ob), &PySet_Type) || \
+      PyType_IsSubtype(Py_TYPE(ob), &PyFrozenSet_Type))
+
+#define PySet_CheckExact(op) Py_IS_TYPE((op), &PySet_Type)
+#define PySet_Check(ob) \
+    (Py_IS_TYPE((ob), &PySet_Type) || \
+    PyType_IsSubtype(Py_TYPE(ob), &PySet_Type))
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_SETOBJECT_H
+#  include "cpython/setobject.h"
+#  undef Py_CPYTHON_SETOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_SETOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/sliceobject.h b/nanvix-port/cpython-headers/python3.12/sliceobject.h
new file mode 100644
index 000000000000..c13863f27c2e
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/sliceobject.h
@@ -0,0 +1,65 @@
+#ifndef Py_SLICEOBJECT_H
+#define Py_SLICEOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* The unique ellipsis object "..." */
+
+PyAPI_DATA(PyObject) _Py_EllipsisObject; /* Don't use this directly */
+
+#define Py_Ellipsis (&_Py_EllipsisObject)
+
+/* Slice object interface */
+
+/*
+
+A slice object containing start, stop, and step data members (the
+names are from range).  After much talk with Guido, it was decided to
+let these be any arbitrary python type.  Py_None stands for omitted values.
+*/
+#ifndef Py_LIMITED_API
+typedef struct {
+    PyObject_HEAD
+    PyObject *start, *stop, *step;      /* not NULL */
+} PySliceObject;
+#endif
+
+PyAPI_DATA(PyTypeObject) PySlice_Type;
+PyAPI_DATA(PyTypeObject) PyEllipsis_Type;
+
+#define PySlice_Check(op) Py_IS_TYPE((op), &PySlice_Type)
+
+PyAPI_FUNC(PyObject *) PySlice_New(PyObject* start, PyObject* stop,
+                                  PyObject* step);
+#ifndef Py_LIMITED_API
+PyAPI_FUNC(PyObject *) _PySlice_FromIndices(Py_ssize_t start, Py_ssize_t stop);
+PyAPI_FUNC(int) _PySlice_GetLongIndices(PySliceObject *self, PyObject *length,
+                                 PyObject **start_ptr, PyObject **stop_ptr,
+                                 PyObject **step_ptr);
+#endif
+PyAPI_FUNC(int) PySlice_GetIndices(PyObject *r, Py_ssize_t length,
+                                  Py_ssize_t *start, Py_ssize_t *stop, Py_ssize_t *step);
+Py_DEPRECATED(3.7)
+PyAPI_FUNC(int) PySlice_GetIndicesEx(PyObject *r, Py_ssize_t length,
+                                     Py_ssize_t *start, Py_ssize_t *stop,
+                                     Py_ssize_t *step,
+                                     Py_ssize_t *slicelength);
+
+#if !defined(Py_LIMITED_API) || (Py_LIMITED_API+0 >= 0x03050400 && Py_LIMITED_API+0 < 0x03060000) || Py_LIMITED_API+0 >= 0x03060100
+#define PySlice_GetIndicesEx(slice, length, start, stop, step, slicelen) (  \
+    PySlice_Unpack((slice), (start), (stop), (step)) < 0 ?                  \
+    ((*(slicelen) = 0), -1) :                                               \
+    ((*(slicelen) = PySlice_AdjustIndices((length), (start), (stop), *(step))), \
+     0))
+PyAPI_FUNC(int) PySlice_Unpack(PyObject *slice,
+                               Py_ssize_t *start, Py_ssize_t *stop, Py_ssize_t *step);
+PyAPI_FUNC(Py_ssize_t) PySlice_AdjustIndices(Py_ssize_t length,
+                                             Py_ssize_t *start, Py_ssize_t *stop,
+                                             Py_ssize_t step);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_SLICEOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/structmember.h b/nanvix-port/cpython-headers/python3.12/structmember.h
new file mode 100644
index 000000000000..f6e8fd829892
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/structmember.h
@@ -0,0 +1,56 @@
+#ifndef Py_STRUCTMEMBER_H
+#define Py_STRUCTMEMBER_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Interface to map C struct members to Python object attributes
+ *
+ * This header is deprecated: new code should not use stuff from here.
+ * New definitions are in descrobject.h.
+ *
+ * However, there's nothing wrong with old code continuing to use it,
+ * and there's not much maintenance overhead in maintaining a few aliases.
+ * So, don't be too eager to convert old code.
+ *
+ * It uses names not prefixed with Py_.
+ * It is also *not* included from Python.h and must be included individually.
+ */
+
+#include <stddef.h> /* For offsetof (not always provided by Python.h) */
+
+/* Types */
+#define T_SHORT     Py_T_SHORT
+#define T_INT       Py_T_INT
+#define T_LONG      Py_T_LONG
+#define T_FLOAT     Py_T_FLOAT
+#define T_DOUBLE    Py_T_DOUBLE
+#define T_STRING    Py_T_STRING
+#define T_OBJECT    _Py_T_OBJECT
+#define T_CHAR      Py_T_CHAR
+#define T_BYTE      Py_T_BYTE
+#define T_UBYTE     Py_T_UBYTE
+#define T_USHORT    Py_T_USHORT
+#define T_UINT      Py_T_UINT
+#define T_ULONG     Py_T_ULONG
+#define T_STRING_INPLACE    Py_T_STRING_INPLACE
+#define T_BOOL      Py_T_BOOL
+#define T_OBJECT_EX Py_T_OBJECT_EX
+#define T_LONGLONG  Py_T_LONGLONG
+#define T_ULONGLONG Py_T_ULONGLONG
+#define T_PYSSIZET  Py_T_PYSSIZET
+#define T_NONE      _Py_T_NONE
+
+/* Flags */
+#define READONLY            Py_READONLY
+#define PY_AUDIT_READ        Py_AUDIT_READ
+#define READ_RESTRICTED     Py_AUDIT_READ
+#define PY_WRITE_RESTRICTED _Py_WRITE_RESTRICTED
+#define RESTRICTED          (READ_RESTRICTED | PY_WRITE_RESTRICTED)
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_STRUCTMEMBER_H */
diff --git a/nanvix-port/cpython-headers/python3.12/structseq.h b/nanvix-port/cpython-headers/python3.12/structseq.h
new file mode 100644
index 000000000000..968711556119
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/structseq.h
@@ -0,0 +1,49 @@
+
+/* Named tuple object interface */
+
+#ifndef Py_STRUCTSEQ_H
+#define Py_STRUCTSEQ_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct PyStructSequence_Field {
+    const char *name;
+    const char *doc;
+} PyStructSequence_Field;
+
+typedef struct PyStructSequence_Desc {
+    const char *name;
+    const char *doc;
+    PyStructSequence_Field *fields;
+    int n_in_sequence;
+} PyStructSequence_Desc;
+
+PyAPI_DATA(const char * const) PyStructSequence_UnnamedField;
+
+#ifndef Py_LIMITED_API
+PyAPI_FUNC(void) PyStructSequence_InitType(PyTypeObject *type,
+                                           PyStructSequence_Desc *desc);
+PyAPI_FUNC(int) PyStructSequence_InitType2(PyTypeObject *type,
+                                           PyStructSequence_Desc *desc);
+#endif
+PyAPI_FUNC(PyTypeObject*) PyStructSequence_NewType(PyStructSequence_Desc *desc);
+
+PyAPI_FUNC(PyObject *) PyStructSequence_New(PyTypeObject* type);
+
+#ifndef Py_LIMITED_API
+typedef PyTupleObject PyStructSequence;
+
+/* Macro, *only* to be used to fill in brand new objects */
+#define PyStructSequence_SET_ITEM(op, i, v) PyTuple_SET_ITEM((op), (i), (v))
+
+#define PyStructSequence_GET_ITEM(op, i) PyTuple_GET_ITEM((op), (i))
+#endif
+
+PyAPI_FUNC(void) PyStructSequence_SetItem(PyObject*, Py_ssize_t, PyObject*);
+PyAPI_FUNC(PyObject*) PyStructSequence_GetItem(PyObject*, Py_ssize_t);
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_STRUCTSEQ_H */
diff --git a/nanvix-port/cpython-headers/python3.12/sysmodule.h b/nanvix-port/cpython-headers/python3.12/sysmodule.h
new file mode 100644
index 000000000000..96f883870b34
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/sysmodule.h
@@ -0,0 +1,54 @@
+
+/* System module interface */
+
+#ifndef Py_SYSMODULE_H
+#define Py_SYSMODULE_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_FUNC(PyObject *) PySys_GetObject(const char *);
+PyAPI_FUNC(int) PySys_SetObject(const char *, PyObject *);
+
+Py_DEPRECATED(3.11) PyAPI_FUNC(void) PySys_SetArgv(int, wchar_t **);
+Py_DEPRECATED(3.11) PyAPI_FUNC(void) PySys_SetArgvEx(int, wchar_t **, int);
+Py_DEPRECATED(3.11) PyAPI_FUNC(void) PySys_SetPath(const wchar_t *);
+
+PyAPI_FUNC(void) PySys_WriteStdout(const char *format, ...)
+                 Py_GCC_ATTRIBUTE((format(printf, 1, 2)));
+PyAPI_FUNC(void) PySys_WriteStderr(const char *format, ...)
+                 Py_GCC_ATTRIBUTE((format(printf, 1, 2)));
+PyAPI_FUNC(void) PySys_FormatStdout(const char *format, ...);
+PyAPI_FUNC(void) PySys_FormatStderr(const char *format, ...);
+
+PyAPI_FUNC(void) PySys_ResetWarnOptions(void);
+Py_DEPRECATED(3.11) PyAPI_FUNC(void) PySys_AddWarnOption(const wchar_t *);
+Py_DEPRECATED(3.11) PyAPI_FUNC(void) PySys_AddWarnOptionUnicode(PyObject *);
+Py_DEPRECATED(3.11) PyAPI_FUNC(int) PySys_HasWarnOptions(void);
+
+Py_DEPRECATED(3.11) PyAPI_FUNC(void) PySys_AddXOption(const wchar_t *);
+PyAPI_FUNC(PyObject *) PySys_GetXOptions(void);
+
+#if !defined(Py_LIMITED_API)
+typedef struct {
+    FILE* perf_map;
+    PyThread_type_lock map_lock;
+} PerfMapState;
+
+PyAPI_FUNC(int) PyUnstable_PerfMapState_Init(void);
+
+PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry(const void *code_addr, unsigned int code_size, const char *entry_name);
+
+PyAPI_FUNC(void) PyUnstable_PerfMapState_Fini(void);
+#endif
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_SYSMODULE_H
+#  include "cpython/sysmodule.h"
+#  undef Py_CPYTHON_SYSMODULE_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_SYSMODULE_H */
diff --git a/nanvix-port/cpython-headers/python3.12/traceback.h b/nanvix-port/cpython-headers/python3.12/traceback.h
new file mode 100644
index 000000000000..2b40cc9fc326
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/traceback.h
@@ -0,0 +1,26 @@
+#ifndef Py_TRACEBACK_H
+#define Py_TRACEBACK_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Traceback interface */
+
+PyAPI_FUNC(int) PyTraceBack_Here(PyFrameObject *);
+PyAPI_FUNC(int) PyTraceBack_Print(PyObject *, PyObject *);
+
+/* Reveal traceback type so we can typecheck traceback objects */
+PyAPI_DATA(PyTypeObject) PyTraceBack_Type;
+#define PyTraceBack_Check(v) Py_IS_TYPE((v), &PyTraceBack_Type)
+
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_TRACEBACK_H
+#  include "cpython/traceback.h"
+#  undef Py_CPYTHON_TRACEBACK_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_TRACEBACK_H */
diff --git a/nanvix-port/cpython-headers/python3.12/tracemalloc.h b/nanvix-port/cpython-headers/python3.12/tracemalloc.h
new file mode 100644
index 000000000000..580027a8e365
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/tracemalloc.h
@@ -0,0 +1,78 @@
+#ifndef Py_TRACEMALLOC_H
+#define Py_TRACEMALLOC_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_LIMITED_API
+/* Track an allocated memory block in the tracemalloc module.
+   Return 0 on success, return -1 on error (failed to allocate memory to store
+   the trace).
+
+   Return -2 if tracemalloc is disabled.
+
+   If memory block is already tracked, update the existing trace. */
+PyAPI_FUNC(int) PyTraceMalloc_Track(
+    unsigned int domain,
+    uintptr_t ptr,
+    size_t size);
+
+/* Untrack an allocated memory block in the tracemalloc module.
+   Do nothing if the block was not tracked.
+
+   Return -2 if tracemalloc is disabled, otherwise return 0. */
+PyAPI_FUNC(int) PyTraceMalloc_Untrack(
+    unsigned int domain,
+    uintptr_t ptr);
+
+/* Get the traceback where a memory block was allocated.
+
+   Return a tuple of (filename: str, lineno: int) tuples.
+
+   Return None if the tracemalloc module is disabled or if the memory block
+   is not tracked by tracemalloc.
+
+   Raise an exception and return NULL on error. */
+PyAPI_FUNC(PyObject*) _PyTraceMalloc_GetTraceback(
+    unsigned int domain,
+    uintptr_t ptr);
+
+/* Return non-zero if tracemalloc is tracing */
+PyAPI_FUNC(int) _PyTraceMalloc_IsTracing(void);
+
+/* Clear the tracemalloc traces */
+PyAPI_FUNC(void) _PyTraceMalloc_ClearTraces(void);
+
+/* Get the tracemalloc traces */
+PyAPI_FUNC(PyObject *) _PyTraceMalloc_GetTraces(void);
+
+/* Get the tracemalloc traceback for an object */
+PyAPI_FUNC(PyObject *) _PyTraceMalloc_GetObjectTraceback(PyObject *obj);
+
+/* Initialize tracemalloc */
+PyAPI_FUNC(int) _PyTraceMalloc_Init(void);
+
+/* Start tracemalloc */
+PyAPI_FUNC(int) _PyTraceMalloc_Start(int max_nframe);
+
+/* Stop tracemalloc */
+PyAPI_FUNC(void) _PyTraceMalloc_Stop(void);
+
+/* Get the tracemalloc traceback limit */
+PyAPI_FUNC(int) _PyTraceMalloc_GetTracebackLimit(void);
+
+/* Get the memory usage of tracemalloc in bytes */
+PyAPI_FUNC(size_t) _PyTraceMalloc_GetMemory(void);
+
+/* Get the current size and peak size of traced memory blocks as a 2-tuple */
+PyAPI_FUNC(PyObject *) _PyTraceMalloc_GetTracedMemory(void);
+
+/* Set the peak size of traced memory blocks to the current size */
+PyAPI_FUNC(void) _PyTraceMalloc_ResetPeak(void);
+
+#endif /* !Py_LIMITED_API */
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_TRACEMALLOC_H */
diff --git a/nanvix-port/cpython-headers/python3.12/tupleobject.h b/nanvix-port/cpython-headers/python3.12/tupleobject.h
new file mode 100644
index 000000000000..1f9ab54be65f
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/tupleobject.h
@@ -0,0 +1,46 @@
+/* Tuple object interface */
+
+#ifndef Py_TUPLEOBJECT_H
+#define Py_TUPLEOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+Another generally useful object type is a tuple of object pointers.
+For Python, this is an immutable type.  C code can change the tuple items
+(but not their number), and even use tuples as general-purpose arrays of
+object references, but in general only brand new tuples should be mutated,
+not ones that might already have been exposed to Python code.
+
+*** WARNING *** PyTuple_SetItem does not increment the new item's reference
+count, but does decrement the reference count of the item it replaces,
+if not NULL.  It does *decrement* the new item's reference count if it is
+*not* inserted in the tuple.  Similarly, PyTuple_GetItem does not increment
+the returned item's reference count.
+*/
+
+PyAPI_DATA(PyTypeObject) PyTuple_Type;
+PyAPI_DATA(PyTypeObject) PyTupleIter_Type;
+
+#define PyTuple_Check(op) \
+                 PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_TUPLE_SUBCLASS)
+#define PyTuple_CheckExact(op) Py_IS_TYPE((op), &PyTuple_Type)
+
+PyAPI_FUNC(PyObject *) PyTuple_New(Py_ssize_t size);
+PyAPI_FUNC(Py_ssize_t) PyTuple_Size(PyObject *);
+PyAPI_FUNC(PyObject *) PyTuple_GetItem(PyObject *, Py_ssize_t);
+PyAPI_FUNC(int) PyTuple_SetItem(PyObject *, Py_ssize_t, PyObject *);
+PyAPI_FUNC(PyObject *) PyTuple_GetSlice(PyObject *, Py_ssize_t, Py_ssize_t);
+PyAPI_FUNC(PyObject *) PyTuple_Pack(Py_ssize_t, ...);
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_TUPLEOBJECT_H
+#  include "cpython/tupleobject.h"
+#  undef Py_CPYTHON_TUPLEOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_TUPLEOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/typeslots.h b/nanvix-port/cpython-headers/python3.12/typeslots.h
new file mode 100644
index 000000000000..506b05580de1
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/typeslots.h
@@ -0,0 +1,88 @@
+/* Do not renumber the file; these numbers are part of the stable ABI. */
+#define Py_bf_getbuffer 1
+#define Py_bf_releasebuffer 2
+#define Py_mp_ass_subscript 3
+#define Py_mp_length 4
+#define Py_mp_subscript 5
+#define Py_nb_absolute 6
+#define Py_nb_add 7
+#define Py_nb_and 8
+#define Py_nb_bool 9
+#define Py_nb_divmod 10
+#define Py_nb_float 11
+#define Py_nb_floor_divide 12
+#define Py_nb_index 13
+#define Py_nb_inplace_add 14
+#define Py_nb_inplace_and 15
+#define Py_nb_inplace_floor_divide 16
+#define Py_nb_inplace_lshift 17
+#define Py_nb_inplace_multiply 18
+#define Py_nb_inplace_or 19
+#define Py_nb_inplace_power 20
+#define Py_nb_inplace_remainder 21
+#define Py_nb_inplace_rshift 22
+#define Py_nb_inplace_subtract 23
+#define Py_nb_inplace_true_divide 24
+#define Py_nb_inplace_xor 25
+#define Py_nb_int 26
+#define Py_nb_invert 27
+#define Py_nb_lshift 28
+#define Py_nb_multiply 29
+#define Py_nb_negative 30
+#define Py_nb_or 31
+#define Py_nb_positive 32
+#define Py_nb_power 33
+#define Py_nb_remainder 34
+#define Py_nb_rshift 35
+#define Py_nb_subtract 36
+#define Py_nb_true_divide 37
+#define Py_nb_xor 38
+#define Py_sq_ass_item 39
+#define Py_sq_concat 40
+#define Py_sq_contains 41
+#define Py_sq_inplace_concat 42
+#define Py_sq_inplace_repeat 43
+#define Py_sq_item 44
+#define Py_sq_length 45
+#define Py_sq_repeat 46
+#define Py_tp_alloc 47
+#define Py_tp_base 48
+#define Py_tp_bases 49
+#define Py_tp_call 50
+#define Py_tp_clear 51
+#define Py_tp_dealloc 52
+#define Py_tp_del 53
+#define Py_tp_descr_get 54
+#define Py_tp_descr_set 55
+#define Py_tp_doc 56
+#define Py_tp_getattr 57
+#define Py_tp_getattro 58
+#define Py_tp_hash 59
+#define Py_tp_init 60
+#define Py_tp_is_gc 61
+#define Py_tp_iter 62
+#define Py_tp_iternext 63
+#define Py_tp_methods 64
+#define Py_tp_new 65
+#define Py_tp_repr 66
+#define Py_tp_richcompare 67
+#define Py_tp_setattr 68
+#define Py_tp_setattro 69
+#define Py_tp_str 70
+#define Py_tp_traverse 71
+#define Py_tp_members 72
+#define Py_tp_getset 73
+#define Py_tp_free 74
+#define Py_nb_matrix_multiply 75
+#define Py_nb_inplace_matrix_multiply 76
+#define Py_am_await 77
+#define Py_am_aiter 78
+#define Py_am_anext 79
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03050000
+/* New in 3.5 */
+#define Py_tp_finalize 80
+#endif
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
+/* New in 3.10 */
+#define Py_am_send 81
+#endif
diff --git a/nanvix-port/cpython-headers/python3.12/unicodeobject.h b/nanvix-port/cpython-headers/python3.12/unicodeobject.h
new file mode 100644
index 000000000000..5839c747a292
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/unicodeobject.h
@@ -0,0 +1,1020 @@
+#ifndef Py_UNICODEOBJECT_H
+#define Py_UNICODEOBJECT_H
+
+#include <stdarg.h>               // va_list
+
+/*
+
+Unicode implementation based on original code by Fredrik Lundh,
+modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
+Unicode Integration Proposal. (See
+http://www.egenix.com/files/python/unicode-proposal.txt).
+
+Copyright (c) Corporation for National Research Initiatives.
+
+
+ Original header:
+ --------------------------------------------------------------------
+
+ * Yet another Unicode string type for Python.  This type supports the
+ * 16-bit Basic Multilingual Plane (BMP) only.
+ *
+ * Written by Fredrik Lundh, January 1999.
+ *
+ * Copyright (c) 1999 by Secret Labs AB.
+ * Copyright (c) 1999 by Fredrik Lundh.
+ *
+ * fredrik@pythonware.com
+ * http://www.pythonware.com
+ *
+ * --------------------------------------------------------------------
+ * This Unicode String Type is
+ *
+ * Copyright (c) 1999 by Secret Labs AB
+ * Copyright (c) 1999 by Fredrik Lundh
+ *
+ * By obtaining, using, and/or copying this software and/or its
+ * associated documentation, you agree that you have read, understood,
+ * and will comply with the following terms and conditions:
+ *
+ * Permission to use, copy, modify, and distribute this software and its
+ * associated documentation for any purpose and without fee is hereby
+ * granted, provided that the above copyright notice appears in all
+ * copies, and that both that copyright notice and this permission notice
+ * appear in supporting documentation, and that the name of Secret Labs
+ * AB or the author not be used in advertising or publicity pertaining to
+ * distribution of the software without specific, written prior
+ * permission.
+ *
+ * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
+ * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ * -------------------------------------------------------------------- */
+
+#include <ctype.h>
+
+/* === Internal API ======================================================= */
+
+/* --- Internal Unicode Format -------------------------------------------- */
+
+/* Python 3.x requires unicode */
+#define Py_USING_UNICODE
+
+#ifndef SIZEOF_WCHAR_T
+#error Must define SIZEOF_WCHAR_T
+#endif
+
+#define Py_UNICODE_SIZE SIZEOF_WCHAR_T
+
+/* If wchar_t can be used for UCS-4 storage, set Py_UNICODE_WIDE.
+   Otherwise, Unicode strings are stored as UCS-2 (with limited support
+   for UTF-16) */
+
+#if Py_UNICODE_SIZE >= 4
+#define Py_UNICODE_WIDE
+#endif
+
+/* Set these flags if the platform has "wchar.h" and the
+   wchar_t type is a 16-bit unsigned type */
+/* #define HAVE_WCHAR_H */
+/* #define HAVE_USABLE_WCHAR_T */
+
+/* If the compiler provides a wchar_t type we try to support it
+   through the interface functions PyUnicode_FromWideChar(),
+   PyUnicode_AsWideChar() and PyUnicode_AsWideCharString(). */
+
+#ifdef HAVE_USABLE_WCHAR_T
+# ifndef HAVE_WCHAR_H
+#  define HAVE_WCHAR_H
+# endif
+#endif
+
+#ifdef HAVE_WCHAR_H
+#  include <wchar.h>
+#endif
+
+/* Py_UCS4 and Py_UCS2 are typedefs for the respective
+   unicode representations. */
+typedef uint32_t Py_UCS4;
+typedef uint16_t Py_UCS2;
+typedef uint8_t Py_UCS1;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+PyAPI_DATA(PyTypeObject) PyUnicode_Type;
+PyAPI_DATA(PyTypeObject) PyUnicodeIter_Type;
+
+#define PyUnicode_Check(op) \
+    PyType_FastSubclass(Py_TYPE(op), Py_TPFLAGS_UNICODE_SUBCLASS)
+#define PyUnicode_CheckExact(op) Py_IS_TYPE((op), &PyUnicode_Type)
+
+/* --- Constants ---------------------------------------------------------- */
+
+/* This Unicode character will be used as replacement character during
+   decoding if the errors argument is set to "replace". Note: the
+   Unicode character U+FFFD is the official REPLACEMENT CHARACTER in
+   Unicode 3.0. */
+
+#define Py_UNICODE_REPLACEMENT_CHARACTER ((Py_UCS4) 0xFFFD)
+
+/* === Public API ========================================================= */
+
+/* Create a Unicode object from a buffer u of UTF-8 encoded bytes */
+PyAPI_FUNC(PyObject*) PyUnicode_FromStringAndSize(
+    const char *u,             /* UTF-8 encoded string */
+    Py_ssize_t size            /* size of buffer */
+    );
+
+/* Create a Unicode object from u, which points to null-terminated
+   UTF-8 encoded bytes.  The size is determined with strlen(). */
+PyAPI_FUNC(PyObject*) PyUnicode_FromString(
+    const char *u              /* UTF-8 encoded string */
+    );
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+PyAPI_FUNC(PyObject*) PyUnicode_Substring(
+    PyObject *str,
+    Py_ssize_t start,
+    Py_ssize_t end);
+#endif
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+/* Copy the string into a UCS4 buffer including the null character if copy_null
+   is set. Return NULL and raise an exception on error. Raise a SystemError if
+   the buffer is smaller than the string. Return buffer on success.
+
+   buflen is the length of the buffer in (Py_UCS4) characters. */
+PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4(
+    PyObject *unicode,
+    Py_UCS4* buffer,
+    Py_ssize_t buflen,
+    int copy_null);
+
+/* Copy the string into a UCS4 buffer. A new buffer is allocated using
+   PyMem_Malloc; if this fails, NULL is returned with a memory error
+   exception set. */
+PyAPI_FUNC(Py_UCS4*) PyUnicode_AsUCS4Copy(PyObject *unicode);
+#endif
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+/* Get the length of the Unicode object. */
+
+PyAPI_FUNC(Py_ssize_t) PyUnicode_GetLength(
+    PyObject *unicode
+);
+#endif
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+/* Read a character from the string. */
+
+PyAPI_FUNC(Py_UCS4) PyUnicode_ReadChar(
+    PyObject *unicode,
+    Py_ssize_t index
+    );
+
+/* Write a character to the string. The string must have been created through
+   PyUnicode_New, must not be shared, and must not have been hashed yet.
+
+   Return 0 on success, -1 on error. */
+
+PyAPI_FUNC(int) PyUnicode_WriteChar(
+    PyObject *unicode,
+    Py_ssize_t index,
+    Py_UCS4 character
+    );
+#endif
+
+/* Resize a Unicode object. The length is the number of codepoints.
+
+   *unicode is modified to point to the new (resized) object and 0
+   returned on success.
+
+   Try to resize the string in place (which is usually faster than allocating
+   a new string and copy characters), or create a new string.
+
+   Error handling is implemented as follows: an exception is set, -1
+   is returned and *unicode left untouched.
+
+   WARNING: The function doesn't check string content, the result may not be a
+            string in canonical representation. */
+
+PyAPI_FUNC(int) PyUnicode_Resize(
+    PyObject **unicode,         /* Pointer to the Unicode object */
+    Py_ssize_t length           /* New length */
+    );
+
+/* Decode obj to a Unicode object.
+
+   bytes, bytearray and other bytes-like objects are decoded according to the
+   given encoding and error handler. The encoding and error handler can be
+   NULL to have the interface use UTF-8 and "strict".
+
+   All other objects (including Unicode objects) raise an exception.
+
+   The API returns NULL in case of an error. The caller is responsible
+   for decref'ing the returned objects.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_FromEncodedObject(
+    PyObject *obj,              /* Object */
+    const char *encoding,       /* encoding */
+    const char *errors          /* error handling */
+    );
+
+/* Copy an instance of a Unicode subtype to a new true Unicode object if
+   necessary. If obj is already a true Unicode object (not a subtype), return
+   the reference with *incremented* refcount.
+
+   The API returns NULL in case of an error. The caller is responsible
+   for decref'ing the returned objects.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_FromObject(
+    PyObject *obj      /* Object */
+    );
+
+PyAPI_FUNC(PyObject *) PyUnicode_FromFormatV(
+    const char *format,   /* ASCII-encoded string  */
+    va_list vargs
+    );
+PyAPI_FUNC(PyObject *) PyUnicode_FromFormat(
+    const char *format,   /* ASCII-encoded string  */
+    ...
+    );
+
+PyAPI_FUNC(void) PyUnicode_InternInPlace(PyObject **);
+PyAPI_FUNC(PyObject *) PyUnicode_InternFromString(
+    const char *u              /* UTF-8 encoded string */
+    );
+
+/* --- wchar_t support for platforms which support it --------------------- */
+
+#ifdef HAVE_WCHAR_H
+
+/* Create a Unicode Object from the wchar_t buffer w of the given
+   size.
+
+   The buffer is copied into the new object. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_FromWideChar(
+    const wchar_t *w,           /* wchar_t buffer */
+    Py_ssize_t size             /* size of buffer */
+    );
+
+/* Copies the Unicode Object contents into the wchar_t buffer w.  At
+   most size wchar_t characters are copied.
+
+   Note that the resulting wchar_t string may or may not be
+   0-terminated.  It is the responsibility of the caller to make sure
+   that the wchar_t string is 0-terminated in case this is required by
+   the application.
+
+   Returns the number of wchar_t characters copied (excluding a
+   possibly trailing 0-termination character) or -1 in case of an
+   error. */
+
+PyAPI_FUNC(Py_ssize_t) PyUnicode_AsWideChar(
+    PyObject *unicode,          /* Unicode object */
+    wchar_t *w,                 /* wchar_t buffer */
+    Py_ssize_t size             /* size of buffer */
+    );
+
+/* Convert the Unicode object to a wide character string. The output string
+   always ends with a nul character. If size is not NULL, write the number of
+   wide characters (excluding the null character) into *size.
+
+   Returns a buffer allocated by PyMem_Malloc() (use PyMem_Free() to free it)
+   on success. On error, returns NULL, *size is undefined and raises a
+   MemoryError. */
+
+PyAPI_FUNC(wchar_t*) PyUnicode_AsWideCharString(
+    PyObject *unicode,          /* Unicode object */
+    Py_ssize_t *size            /* number of characters of the result */
+    );
+
+#endif
+
+/* --- Unicode ordinals --------------------------------------------------- */
+
+/* Create a Unicode Object from the given Unicode code point ordinal.
+
+   The ordinal must be in range(0x110000). A ValueError is
+   raised in case it is not.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_FromOrdinal(int ordinal);
+
+/* === Builtin Codecs =====================================================
+
+   Many of these APIs take two arguments encoding and errors. These
+   parameters encoding and errors have the same semantics as the ones
+   of the builtin str() API.
+
+   Setting encoding to NULL causes the default encoding (UTF-8) to be used.
+
+   Error handling is set by errors which may also be set to NULL
+   meaning to use the default handling defined for the codec. Default
+   error handling for all builtin codecs is "strict" (ValueErrors are
+   raised).
+
+   The codecs all use a similar interface. Only deviation from the
+   generic ones are documented.
+
+*/
+
+/* --- Manage the default encoding ---------------------------------------- */
+
+/* Returns "utf-8".  */
+PyAPI_FUNC(const char*) PyUnicode_GetDefaultEncoding(void);
+
+/* --- Generic Codecs ----------------------------------------------------- */
+
+/* Create a Unicode object by decoding the encoded string s of the
+   given size. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_Decode(
+    const char *s,              /* encoded string */
+    Py_ssize_t size,            /* size of buffer */
+    const char *encoding,       /* encoding */
+    const char *errors          /* error handling */
+    );
+
+/* Decode a Unicode object unicode and return the result as Python
+   object.
+
+   This API is DEPRECATED. The only supported standard encoding is rot13.
+   Use PyCodec_Decode() to decode with rot13 and non-standard codecs
+   that decode from str. */
+
+Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedObject(
+    PyObject *unicode,          /* Unicode object */
+    const char *encoding,       /* encoding */
+    const char *errors          /* error handling */
+    );
+
+/* Decode a Unicode object unicode and return the result as Unicode
+   object.
+
+   This API is DEPRECATED. The only supported standard encoding is rot13.
+   Use PyCodec_Decode() to decode with rot13 and non-standard codecs
+   that decode from str to str. */
+
+Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsDecodedUnicode(
+    PyObject *unicode,          /* Unicode object */
+    const char *encoding,       /* encoding */
+    const char *errors          /* error handling */
+    );
+
+/* Encodes a Unicode object and returns the result as Python
+   object.
+
+   This API is DEPRECATED.  It is superseded by PyUnicode_AsEncodedString()
+   since all standard encodings (except rot13) encode str to bytes.
+   Use PyCodec_Encode() for encoding with rot13 and non-standard codecs
+   that encode from str to non-bytes. */
+
+Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedObject(
+    PyObject *unicode,          /* Unicode object */
+    const char *encoding,       /* encoding */
+    const char *errors          /* error handling */
+    );
+
+/* Encodes a Unicode object and returns the result as Python string
+   object. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedString(
+    PyObject *unicode,          /* Unicode object */
+    const char *encoding,       /* encoding */
+    const char *errors          /* error handling */
+    );
+
+/* Encodes a Unicode object and returns the result as Unicode
+   object.
+
+   This API is DEPRECATED.  The only supported standard encoding is rot13.
+   Use PyCodec_Encode() to encode with rot13 and non-standard codecs
+   that encode from str to str. */
+
+Py_DEPRECATED(3.6) PyAPI_FUNC(PyObject*) PyUnicode_AsEncodedUnicode(
+    PyObject *unicode,          /* Unicode object */
+    const char *encoding,       /* encoding */
+    const char *errors          /* error handling */
+    );
+
+/* Build an encoding map. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_BuildEncodingMap(
+    PyObject* string            /* 256 character map */
+   );
+
+/* --- UTF-7 Codecs ------------------------------------------------------- */
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7(
+    const char *string,         /* UTF-7 encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors          /* error handling */
+    );
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF7Stateful(
+    const char *string,         /* UTF-7 encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    Py_ssize_t *consumed        /* bytes consumed */
+    );
+
+/* --- UTF-8 Codecs ------------------------------------------------------- */
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8(
+    const char *string,         /* UTF-8 encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors          /* error handling */
+    );
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF8Stateful(
+    const char *string,         /* UTF-8 encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    Py_ssize_t *consumed        /* bytes consumed */
+    );
+
+PyAPI_FUNC(PyObject*) PyUnicode_AsUTF8String(
+    PyObject *unicode           /* Unicode object */
+    );
+
+/* Returns a pointer to the default encoding (UTF-8) of the
+   Unicode object unicode and the size of the encoded representation
+   in bytes stored in *size.
+
+   In case of an error, *size is not set.
+
+   This function caches the UTF-8 encoded string in the unicodeobject
+   and subsequent calls will return the same string.  The memory is released
+   when the unicodeobject is deallocated.
+*/
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030A0000
+PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
+    PyObject *unicode,
+    Py_ssize_t *size);
+#endif
+
+/* --- UTF-32 Codecs ------------------------------------------------------ */
+
+/* Decodes length bytes from a UTF-32 encoded buffer string and returns
+   the corresponding Unicode object.
+
+   errors (if non-NULL) defines the error handling. It defaults
+   to "strict".
+
+   If byteorder is non-NULL, the decoder starts decoding using the
+   given byte order:
+
+    *byteorder == -1: little endian
+    *byteorder == 0:  native order
+    *byteorder == 1:  big endian
+
+   In native mode, the first four bytes of the stream are checked for a
+   BOM mark. If found, the BOM mark is analysed, the byte order
+   adjusted and the BOM skipped.  In the other modes, no BOM mark
+   interpretation is done. After completion, *byteorder is set to the
+   current byte order at the end of input data.
+
+   If byteorder is NULL, the codec starts in native order mode.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32(
+    const char *string,         /* UTF-32 encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    int *byteorder              /* pointer to byteorder to use
+                                   0=native;-1=LE,1=BE; updated on
+                                   exit */
+    );
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF32Stateful(
+    const char *string,         /* UTF-32 encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    int *byteorder,             /* pointer to byteorder to use
+                                   0=native;-1=LE,1=BE; updated on
+                                   exit */
+    Py_ssize_t *consumed        /* bytes consumed */
+    );
+
+/* Returns a Python bytes object using the UTF-32 encoding in native byte
+   order. The result always starts with a BOM mark.  */
+
+PyAPI_FUNC(PyObject*) PyUnicode_AsUTF32String(
+    PyObject *unicode           /* Unicode object */
+    );
+
+/* Returns a Python bytes object holding the UTF-32 encoded value of
+   the Unicode data.
+
+   The output is written according to the value of byteorder, as
+   follows:
+
+   byteorder == -1: little endian
+   byteorder == 0:  native byte order (writes a BOM mark)
+   byteorder == 1:  big endian
+
+   If byteorder is 0, the output string will always start with the
+   Unicode BOM mark (U+FEFF). In the other two modes, no BOM mark is
+   prepended.
+
+*/
+
+/* --- UTF-16 Codecs ------------------------------------------------------ */
+
+/* Decodes length bytes from a UTF-16 encoded buffer string and returns
+   the corresponding Unicode object.
+
+   errors (if non-NULL) defines the error handling. It defaults
+   to "strict".
+
+   If byteorder is non-NULL, the decoder starts decoding using the
+   given byte order:
+
+    *byteorder == -1: little endian
+    *byteorder == 0:  native order
+    *byteorder == 1:  big endian
+
+   In native mode, the first two bytes of the stream are checked for a
+   BOM mark. If found, the BOM mark is analysed, the byte order
+   adjusted and the BOM skipped.  In the other modes, no BOM mark
+   interpretation is done. After completion, *byteorder is set to the
+   current byte order at the end of input data.
+
+   If byteorder is NULL, the codec starts in native order mode.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16(
+    const char *string,         /* UTF-16 encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    int *byteorder              /* pointer to byteorder to use
+                                   0=native;-1=LE,1=BE; updated on
+                                   exit */
+    );
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeUTF16Stateful(
+    const char *string,         /* UTF-16 encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    int *byteorder,             /* pointer to byteorder to use
+                                   0=native;-1=LE,1=BE; updated on
+                                   exit */
+    Py_ssize_t *consumed        /* bytes consumed */
+    );
+
+/* Returns a Python bytes object using the UTF-16 encoding in native byte
+   order. The result always starts with a BOM mark.  */
+
+PyAPI_FUNC(PyObject*) PyUnicode_AsUTF16String(
+    PyObject *unicode           /* Unicode object */
+    );
+
+/* --- Unicode-Escape Codecs ---------------------------------------------- */
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeUnicodeEscape(
+    const char *string,         /* Unicode-Escape encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors          /* error handling */
+    );
+
+PyAPI_FUNC(PyObject*) PyUnicode_AsUnicodeEscapeString(
+    PyObject *unicode           /* Unicode object */
+    );
+
+/* --- Raw-Unicode-Escape Codecs ------------------------------------------ */
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeRawUnicodeEscape(
+    const char *string,         /* Raw-Unicode-Escape encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors          /* error handling */
+    );
+
+PyAPI_FUNC(PyObject*) PyUnicode_AsRawUnicodeEscapeString(
+    PyObject *unicode           /* Unicode object */
+    );
+
+/* --- Latin-1 Codecs -----------------------------------------------------
+
+   Note: Latin-1 corresponds to the first 256 Unicode ordinals. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeLatin1(
+    const char *string,         /* Latin-1 encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors          /* error handling */
+    );
+
+PyAPI_FUNC(PyObject*) PyUnicode_AsLatin1String(
+    PyObject *unicode           /* Unicode object */
+    );
+
+/* --- ASCII Codecs -------------------------------------------------------
+
+   Only 7-bit ASCII data is expected. All other codes generate errors.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeASCII(
+    const char *string,         /* ASCII encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors          /* error handling */
+    );
+
+PyAPI_FUNC(PyObject*) PyUnicode_AsASCIIString(
+    PyObject *unicode           /* Unicode object */
+    );
+
+/* --- Character Map Codecs -----------------------------------------------
+
+   This codec uses mappings to encode and decode characters.
+
+   Decoding mappings must map byte ordinals (integers in the range from 0 to
+   255) to Unicode strings, integers (which are then interpreted as Unicode
+   ordinals) or None.  Unmapped data bytes (ones which cause a LookupError)
+   as well as those mapped to None, 0xFFFE or '\ufffe' are treated as
+   "undefined mapping" and cause an error.
+
+   Encoding mappings must map Unicode ordinal integers to bytes objects,
+   integers in the range from 0 to 255 or None.  Unmapped character
+   ordinals (ones which cause a LookupError) as well as those mapped to
+   None are treated as "undefined mapping" and cause an error.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeCharmap(
+    const char *string,         /* Encoded string */
+    Py_ssize_t length,          /* size of string */
+    PyObject *mapping,          /* decoding mapping */
+    const char *errors          /* error handling */
+    );
+
+PyAPI_FUNC(PyObject*) PyUnicode_AsCharmapString(
+    PyObject *unicode,          /* Unicode object */
+    PyObject *mapping           /* encoding mapping */
+    );
+
+/* --- MBCS codecs for Windows -------------------------------------------- */
+
+#ifdef MS_WINDOWS
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCS(
+    const char *string,         /* MBCS encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors          /* error handling */
+    );
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeMBCSStateful(
+    const char *string,         /* MBCS encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    Py_ssize_t *consumed        /* bytes consumed */
+    );
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeCodePageStateful(
+    int code_page,              /* code page number */
+    const char *string,         /* encoded string */
+    Py_ssize_t length,          /* size of string */
+    const char *errors,         /* error handling */
+    Py_ssize_t *consumed        /* bytes consumed */
+    );
+#endif
+
+PyAPI_FUNC(PyObject*) PyUnicode_AsMBCSString(
+    PyObject *unicode           /* Unicode object */
+    );
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+PyAPI_FUNC(PyObject*) PyUnicode_EncodeCodePage(
+    int code_page,              /* code page number */
+    PyObject *unicode,          /* Unicode object */
+    const char *errors          /* error handling */
+    );
+#endif
+
+#endif /* MS_WINDOWS */
+
+/* --- Locale encoding --------------------------------------------------- */
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+/* Decode a string from the current locale encoding. The *errors* argument
+   selects the error handler: the decoder is strict unless 'surrogateescape'
+   (PEP 383) is requested, in which case undecodable bytes are escaped. If a
+   byte sequence can be decoded as a surrogate character and the
+   'surrogateescape' error handler is in use, the byte sequence is escaped
+   instead of being decoded. *str* must end with a null character but cannot
+   contain embedded null characters. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocaleAndSize(
+    const char *str,
+    Py_ssize_t len,
+    const char *errors);
+
+/* Similar to PyUnicode_DecodeLocaleAndSize(), but compute the string
+   length using strlen(). */
+
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeLocale(
+    const char *str,
+    const char *errors);
+
+/* Encode a Unicode object to the current locale encoding. The *errors*
+   argument selects the error handler: the encoder is strict unless the
+   "surrogateescape" error handler is requested. Return a bytes object. The
+   string cannot contain embedded null characters. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_EncodeLocale(
+    PyObject *unicode,
+    const char *errors
+    );
+#endif
+
+/* --- File system encoding ---------------------------------------------- */
+
+/* ParseTuple converter: encode str objects to bytes using
+   PyUnicode_EncodeFSDefault(); bytes objects are output as-is. */
+
+PyAPI_FUNC(int) PyUnicode_FSConverter(PyObject*, void*);
+
+/* ParseTuple converter: decode bytes objects to unicode using
+   PyUnicode_DecodeFSDefaultAndSize(); str objects are output as-is. */
+
+PyAPI_FUNC(int) PyUnicode_FSDecoder(PyObject*, void*);
+
+/* Decode a null-terminated string from the Python filesystem encoding
+   and error handler.
+
+   If the string length is known, use PyUnicode_DecodeFSDefaultAndSize(). */
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefault(
+    const char *s               /* encoded string */
+    );
+
+/* Decode a string from the Python filesystem encoding and error handler. */
+PyAPI_FUNC(PyObject*) PyUnicode_DecodeFSDefaultAndSize(
+    const char *s,               /* encoded string */
+    Py_ssize_t size              /* size */
+    );
+
+/* Encode a Unicode object to the Python filesystem encoding and error handler.
+   Return bytes. */
+PyAPI_FUNC(PyObject*) PyUnicode_EncodeFSDefault(
+    PyObject *unicode
+    );
+
+/* --- Methods & Slots ----------------------------------------------------
+
+   These are capable of handling Unicode objects and strings on input
+   (we refer to them as strings in the descriptions) and return
+   Unicode objects or integers as appropriate. */
+
+/* Concat two strings giving a new Unicode string. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_Concat(
+    PyObject *left,             /* Left string */
+    PyObject *right             /* Right string */
+    );
+
+/* Concat two strings and put the result in *pleft
+   (sets *pleft to NULL on error) */
+
+PyAPI_FUNC(void) PyUnicode_Append(
+    PyObject **pleft,           /* Pointer to left string */
+    PyObject *right             /* Right string */
+    );
+
+/* Concat two strings, put the result in *pleft and drop the right object
+   (sets *pleft to NULL on error) */
+
+PyAPI_FUNC(void) PyUnicode_AppendAndDel(
+    PyObject **pleft,           /* Pointer to left string */
+    PyObject *right             /* Right string */
+    );
+
+/* Split a string giving a list of Unicode strings.
+
+   If sep is NULL, splitting will be done at all whitespace
+   substrings. Otherwise, splits occur at the given separator.
+
+   At most maxsplit splits will be done. If negative, no limit is set.
+
+   Separators are not included in the resulting list.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_Split(
+    PyObject *s,                /* String to split */
+    PyObject *sep,              /* String separator */
+    Py_ssize_t maxsplit         /* Maxsplit count */
+    );
+
+/* Split a string at line breaks, giving a list of Unicode strings.
+
+   CRLF is considered to be one line break. Line breaks are not
+   included in the resulting list unless keepends is true. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_Splitlines(
+    PyObject *s,                /* String to split */
+    int keepends                /* If true, line end markers are included */
+    );
+
+/* Partition a string using a given separator. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_Partition(
+    PyObject *s,                /* String to partition */
+    PyObject *sep               /* String separator */
+    );
+
+/* Partition a string using a given separator, searching from the end of the
+   string. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_RPartition(
+    PyObject *s,                /* String to partition */
+    PyObject *sep               /* String separator */
+    );
+
+/* Split a string giving a list of Unicode strings.
+
+   If sep is NULL, splitting will be done at all whitespace
+   substrings. Otherwise, splits occur at the given separator.
+
+   At most maxsplit splits will be done; if maxsplit is negative, no limit
+   is set. Unlike PyUnicode_Split, PyUnicode_RSplit splits from the end of
+   the string.
+
+   Separators are not included in the resulting list.
+
+*/
+
+PyAPI_FUNC(PyObject*) PyUnicode_RSplit(
+    PyObject *s,                /* String to split */
+    PyObject *sep,              /* String separator */
+    Py_ssize_t maxsplit         /* Maxsplit count */
+    );
+
+/* Translate a string by applying a character mapping table to it and
+   return the resulting Unicode object.
+
+   The mapping table must map Unicode ordinal integers to Unicode strings,
+   Unicode ordinal integers or None (causing deletion of the character).
+
+   Mapping tables may be dictionaries or sequences. Unmapped character
+   ordinals (ones which cause a LookupError) are left untouched and
+   are copied as-is.
+
+*/
+
+PyAPI_FUNC(PyObject *) PyUnicode_Translate(
+    PyObject *str,              /* String */
+    PyObject *table,            /* Translate table */
+    const char *errors          /* error handling */
+    );
+
+/* Join a sequence of strings using the given separator and return
+   the resulting Unicode string. */
+
+PyAPI_FUNC(PyObject*) PyUnicode_Join(
+    PyObject *separator,        /* Separator string */
+    PyObject *seq               /* Sequence object */
+    );
+
+/* Return 1 if substr matches str[start:end] at the given tail end, 0
+   otherwise. */
+
+PyAPI_FUNC(Py_ssize_t) PyUnicode_Tailmatch(
+    PyObject *str,              /* String */
+    PyObject *substr,           /* Prefix or Suffix string */
+    Py_ssize_t start,           /* Start index */
+    Py_ssize_t end,             /* Stop index */
+    int direction               /* Tail end: -1 prefix, +1 suffix */
+    );
+
+/* Return the first position of substr in str[start:end] using the
+   given search direction or -1 if not found. -2 is returned in case
+   an error occurred and an exception is set. */
+
+PyAPI_FUNC(Py_ssize_t) PyUnicode_Find(
+    PyObject *str,              /* String */
+    PyObject *substr,           /* Substring to find */
+    Py_ssize_t start,           /* Start index */
+    Py_ssize_t end,             /* Stop index */
+    int direction               /* Find direction: +1 forward, -1 backward */
+    );
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03030000
+/* Like PyUnicode_Find, but search for single character only. */
+PyAPI_FUNC(Py_ssize_t) PyUnicode_FindChar(
+    PyObject *str,
+    Py_UCS4 ch,
+    Py_ssize_t start,
+    Py_ssize_t end,
+    int direction
+    );
+#endif
+
+/* Count the number of occurrences of substr in str[start:end]. */
+
+PyAPI_FUNC(Py_ssize_t) PyUnicode_Count(
+    PyObject *str,              /* String */
+    PyObject *substr,           /* Substring to count */
+    Py_ssize_t start,           /* Start index */
+    Py_ssize_t end              /* Stop index */
+    );
+
+/* Replace at most maxcount occurrences of substr in str with replstr
+   and return the resulting Unicode object. */
+
+PyAPI_FUNC(PyObject *) PyUnicode_Replace(
+    PyObject *str,              /* String */
+    PyObject *substr,           /* Substring to find */
+    PyObject *replstr,          /* Substring to replace */
+    Py_ssize_t maxcount         /* Max. number of replacements to apply;
+                                   -1 = all */
+    );
+
+/* Compare two strings and return -1, 0, 1 for less than, equal,
+   greater than resp.
+   Raise an exception and return -1 on error. */
+
+PyAPI_FUNC(int) PyUnicode_Compare(
+    PyObject *left,             /* Left string */
+    PyObject *right             /* Right string */
+    );
+
+/* Compare a Unicode object with C string and return -1, 0, 1 for less than,
+   equal, and greater than, respectively.  It is best to pass only
+   ASCII-encoded strings, but the function interprets the input string as
+   ISO-8859-1 if it contains non-ASCII characters.
+   This function does not raise exceptions. */
+
+PyAPI_FUNC(int) PyUnicode_CompareWithASCIIString(
+    PyObject *left,
+    const char *right           /* ASCII-encoded string */
+    );
+
+/* Rich compare two strings and return one of the following:
+
+   - NULL in case an exception was raised
+   - Py_True or Py_False for successful comparisons
+   - Py_NotImplemented in case the type combination is unknown
+
+   Possible values for op:
+
+     Py_GT, Py_GE, Py_EQ, Py_NE, Py_LT, Py_LE
+
+*/
+
+PyAPI_FUNC(PyObject *) PyUnicode_RichCompare(
+    PyObject *left,             /* Left string */
+    PyObject *right,            /* Right string */
+    int op                      /* Operation: Py_EQ, Py_NE, Py_GT, etc. */
+    );
+
+/* Apply an argument tuple or dictionary to a format string and return
+   the resulting Unicode string. */
+
+PyAPI_FUNC(PyObject *) PyUnicode_Format(
+    PyObject *format,           /* Format string */
+    PyObject *args              /* Argument tuple or dictionary */
+    );
+
+/* Checks whether element is contained in container and returns 1/0
+   accordingly.
+
+   element has to coerce to a one element Unicode string. -1 is
+   returned in case of an error. */
+
+PyAPI_FUNC(int) PyUnicode_Contains(
+    PyObject *container,        /* Container string */
+    PyObject *element           /* Element string */
+    );
+
+/* Checks whether argument is a valid identifier. */
+
+PyAPI_FUNC(int) PyUnicode_IsIdentifier(PyObject *s);
+
+/* === Characters Type APIs =============================================== */
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_UNICODEOBJECT_H
+#  include "cpython/unicodeobject.h"
+#  undef Py_CPYTHON_UNICODEOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_UNICODEOBJECT_H */
diff --git a/nanvix-port/cpython-headers/python3.12/warnings.h b/nanvix-port/cpython-headers/python3.12/warnings.h
new file mode 100644
index 000000000000..18ac1543a3ca
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/warnings.h
@@ -0,0 +1,45 @@
+#ifndef Py_WARNINGS_H
+#define Py_WARNINGS_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+PyAPI_FUNC(int) PyErr_WarnEx(
+    PyObject *category,
+    const char *message,        /* UTF-8 encoded string */
+    Py_ssize_t stack_level);
+
+PyAPI_FUNC(int) PyErr_WarnFormat(
+    PyObject *category,
+    Py_ssize_t stack_level,
+    const char *format,         /* ASCII-encoded string  */
+    ...);
+
+#if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x03060000
+/* Emit a ResourceWarning warning */
+PyAPI_FUNC(int) PyErr_ResourceWarning(
+    PyObject *source,
+    Py_ssize_t stack_level,
+    const char *format,         /* ASCII-encoded string  */
+    ...);
+#endif
+
+PyAPI_FUNC(int) PyErr_WarnExplicit(
+    PyObject *category,
+    const char *message,        /* UTF-8 encoded string */
+    const char *filename,       /* decoded from the filesystem encoding */
+    int lineno,
+    const char *module,         /* UTF-8 encoded string */
+    PyObject *registry);
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_WARNINGS_H
+#  include "cpython/warnings.h"
+#  undef Py_CPYTHON_WARNINGS_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_WARNINGS_H */
+
diff --git a/nanvix-port/cpython-headers/python3.12/weakrefobject.h b/nanvix-port/cpython-headers/python3.12/weakrefobject.h
new file mode 100644
index 000000000000..8e1fa1b9286a
--- /dev/null
+++ b/nanvix-port/cpython-headers/python3.12/weakrefobject.h
@@ -0,0 +1,42 @@
+/* Weak reference objects for Python. */
+
+#ifndef Py_WEAKREFOBJECT_H
+#define Py_WEAKREFOBJECT_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct _PyWeakReference PyWeakReference;
+
+PyAPI_DATA(PyTypeObject) _PyWeakref_RefType;
+PyAPI_DATA(PyTypeObject) _PyWeakref_ProxyType;
+PyAPI_DATA(PyTypeObject) _PyWeakref_CallableProxyType;
+
+#define PyWeakref_CheckRef(op) PyObject_TypeCheck((op), &_PyWeakref_RefType)
+#define PyWeakref_CheckRefExact(op) \
+        Py_IS_TYPE((op), &_PyWeakref_RefType)
+#define PyWeakref_CheckProxy(op) \
+        (Py_IS_TYPE((op), &_PyWeakref_ProxyType) \
+         || Py_IS_TYPE((op), &_PyWeakref_CallableProxyType))
+
+#define PyWeakref_Check(op) \
+        (PyWeakref_CheckRef(op) || PyWeakref_CheckProxy(op))
+
+
+PyAPI_FUNC(PyObject *) PyWeakref_NewRef(PyObject *ob,
+                                        PyObject *callback);
+PyAPI_FUNC(PyObject *) PyWeakref_NewProxy(PyObject *ob,
+                                          PyObject *callback);
+PyAPI_FUNC(PyObject *) PyWeakref_GetObject(PyObject *ref);
+
+
+#ifndef Py_LIMITED_API
+#  define Py_CPYTHON_WEAKREFOBJECT_H
+#  include "cpython/weakrefobject.h"
+#  undef Py_CPYTHON_WEAKREFOBJECT_H
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_WEAKREFOBJECT_H */
diff --git a/nanvix-port/generated-headers/__multiarray_api.c b/nanvix-port/generated-headers/__multiarray_api.c
new file mode 100644
index 000000000000..4fa051c1d2e0
--- /dev/null
+++ b/nanvix-port/generated-headers/__multiarray_api.c
@@ -0,0 +1,314 @@
+
+/* These pointers will be stored in the C-object for use in other
+    extension modules
+*/
+
+void *PyArray_API[] = {
+        (void *) PyArray_GetNDArrayCVersion,
+        (void *) &PyBigArray_Type,
+        (void *) &PyArray_Type,
+        (void *) &PyArrayDescr_Type,
+        (void *) &PyArrayFlags_Type,
+        (void *) &PyArrayIter_Type,
+        (void *) &PyArrayMultiIter_Type,
+        (int *) &NPY_NUMUSERTYPES,
+        (void *) &PyBoolArrType_Type,
+        (void *) &_PyArrayScalar_BoolValues,
+        (void *) &PyGenericArrType_Type,
+        (void *) &PyNumberArrType_Type,
+        (void *) &PyIntegerArrType_Type,
+        (void *) &PySignedIntegerArrType_Type,
+        (void *) &PyUnsignedIntegerArrType_Type,
+        (void *) &PyInexactArrType_Type,
+        (void *) &PyFloatingArrType_Type,
+        (void *) &PyComplexFloatingArrType_Type,
+        (void *) &PyFlexibleArrType_Type,
+        (void *) &PyCharacterArrType_Type,
+        (void *) &PyByteArrType_Type,
+        (void *) &PyShortArrType_Type,
+        (void *) &PyIntArrType_Type,
+        (void *) &PyLongArrType_Type,
+        (void *) &PyLongLongArrType_Type,
+        (void *) &PyUByteArrType_Type,
+        (void *) &PyUShortArrType_Type,
+        (void *) &PyUIntArrType_Type,
+        (void *) &PyULongArrType_Type,
+        (void *) &PyULongLongArrType_Type,
+        (void *) &PyFloatArrType_Type,
+        (void *) &PyDoubleArrType_Type,
+        (void *) &PyLongDoubleArrType_Type,
+        (void *) &PyCFloatArrType_Type,
+        (void *) &PyCDoubleArrType_Type,
+        (void *) &PyCLongDoubleArrType_Type,
+        (void *) &PyObjectArrType_Type,
+        (void *) &PyStringArrType_Type,
+        (void *) &PyUnicodeArrType_Type,
+        (void *) &PyVoidArrType_Type,
+        (void *) PyArray_SetNumericOps,
+        (void *) PyArray_GetNumericOps,
+        (void *) PyArray_INCREF,
+        (void *) PyArray_XDECREF,
+        (void *) PyArray_SetStringFunction,
+        (void *) PyArray_DescrFromType,
+        (void *) PyArray_TypeObjectFromType,
+        (void *) PyArray_Zero,
+        (void *) PyArray_One,
+        (void *) PyArray_CastToType,
+        (void *) PyArray_CastTo,
+        (void *) PyArray_CastAnyTo,
+        (void *) PyArray_CanCastSafely,
+        (void *) PyArray_CanCastTo,
+        (void *) PyArray_ObjectType,
+        (void *) PyArray_DescrFromObject,
+        (void *) PyArray_ConvertToCommonType,
+        (void *) PyArray_DescrFromScalar,
+        (void *) PyArray_DescrFromTypeObject,
+        (void *) PyArray_Size,
+        (void *) PyArray_Scalar,
+        (void *) PyArray_FromScalar,
+        (void *) PyArray_ScalarAsCtype,
+        (void *) PyArray_CastScalarToCtype,
+        (void *) PyArray_CastScalarDirect,
+        (void *) PyArray_ScalarFromObject,
+        (void *) PyArray_GetCastFunc,
+        (void *) PyArray_FromDims,
+        (void *) PyArray_FromDimsAndDataAndDescr,
+        (void *) PyArray_FromAny,
+        (void *) PyArray_EnsureArray,
+        (void *) PyArray_EnsureAnyArray,
+        (void *) PyArray_FromFile,
+        (void *) PyArray_FromString,
+        (void *) PyArray_FromBuffer,
+        (void *) PyArray_FromIter,
+        (void *) PyArray_Return,
+        (void *) PyArray_GetField,
+        (void *) PyArray_SetField,
+        (void *) PyArray_Byteswap,
+        (void *) PyArray_Resize,
+        (void *) PyArray_MoveInto,
+        (void *) PyArray_CopyInto,
+        (void *) PyArray_CopyAnyInto,
+        (void *) PyArray_CopyObject,
+        (void *) PyArray_NewCopy,
+        (void *) PyArray_ToList,
+        (void *) PyArray_ToString,
+        (void *) PyArray_ToFile,
+        (void *) PyArray_Dump,
+        (void *) PyArray_Dumps,
+        (void *) PyArray_ValidType,
+        (void *) PyArray_UpdateFlags,
+        (void *) PyArray_New,
+        (void *) PyArray_NewFromDescr,
+        (void *) PyArray_DescrNew,
+        (void *) PyArray_DescrNewFromType,
+        (void *) PyArray_GetPriority,
+        (void *) PyArray_IterNew,
+        (void *) PyArray_MultiIterNew,
+        (void *) PyArray_PyIntAsInt,
+        (void *) PyArray_PyIntAsIntp,
+        (void *) PyArray_Broadcast,
+        (void *) PyArray_FillObjectArray,
+        (void *) PyArray_FillWithScalar,
+        (void *) PyArray_CheckStrides,
+        (void *) PyArray_DescrNewByteorder,
+        (void *) PyArray_IterAllButAxis,
+        (void *) PyArray_CheckFromAny,
+        (void *) PyArray_FromArray,
+        (void *) PyArray_FromInterface,
+        (void *) PyArray_FromStructInterface,
+        (void *) PyArray_FromArrayAttr,
+        (void *) PyArray_ScalarKind,
+        (void *) PyArray_CanCoerceScalar,
+        (void *) PyArray_NewFlagsObject,
+        (void *) PyArray_CanCastScalar,
+        (void *) PyArray_CompareUCS4,
+        (void *) PyArray_RemoveSmallest,
+        (void *) PyArray_ElementStrides,
+        (void *) PyArray_Item_INCREF,
+        (void *) PyArray_Item_XDECREF,
+        (void *) PyArray_FieldNames,
+        (void *) PyArray_Transpose,
+        (void *) PyArray_TakeFrom,
+        (void *) PyArray_PutTo,
+        (void *) PyArray_PutMask,
+        (void *) PyArray_Repeat,
+        (void *) PyArray_Choose,
+        (void *) PyArray_Sort,
+        (void *) PyArray_ArgSort,
+        (void *) PyArray_SearchSorted,
+        (void *) PyArray_ArgMax,
+        (void *) PyArray_ArgMin,
+        (void *) PyArray_Reshape,
+        (void *) PyArray_Newshape,
+        (void *) PyArray_Squeeze,
+        (void *) PyArray_View,
+        (void *) PyArray_SwapAxes,
+        (void *) PyArray_Max,
+        (void *) PyArray_Min,
+        (void *) PyArray_Ptp,
+        (void *) PyArray_Mean,
+        (void *) PyArray_Trace,
+        (void *) PyArray_Diagonal,
+        (void *) PyArray_Clip,
+        (void *) PyArray_Conjugate,
+        (void *) PyArray_Nonzero,
+        (void *) PyArray_Std,
+        (void *) PyArray_Sum,
+        (void *) PyArray_CumSum,
+        (void *) PyArray_Prod,
+        (void *) PyArray_CumProd,
+        (void *) PyArray_All,
+        (void *) PyArray_Any,
+        (void *) PyArray_Compress,
+        (void *) PyArray_Flatten,
+        (void *) PyArray_Ravel,
+        (void *) PyArray_MultiplyList,
+        (void *) PyArray_MultiplyIntList,
+        (void *) PyArray_GetPtr,
+        (void *) PyArray_CompareLists,
+        (void *) PyArray_AsCArray,
+        (void *) PyArray_As1D,
+        (void *) PyArray_As2D,
+        (void *) PyArray_Free,
+        (void *) PyArray_Converter,
+        (void *) PyArray_IntpFromSequence,
+        (void *) PyArray_Concatenate,
+        (void *) PyArray_InnerProduct,
+        (void *) PyArray_MatrixProduct,
+        (void *) PyArray_CopyAndTranspose,
+        (void *) PyArray_Correlate,
+        (void *) PyArray_TypestrConvert,
+        (void *) PyArray_DescrConverter,
+        (void *) PyArray_DescrConverter2,
+        (void *) PyArray_IntpConverter,
+        (void *) PyArray_BufferConverter,
+        (void *) PyArray_AxisConverter,
+        (void *) PyArray_BoolConverter,
+        (void *) PyArray_ByteorderConverter,
+        (void *) PyArray_OrderConverter,
+        (void *) PyArray_EquivTypes,
+        (void *) PyArray_Zeros,
+        (void *) PyArray_Empty,
+        (void *) PyArray_Where,
+        (void *) PyArray_Arange,
+        (void *) PyArray_ArangeObj,
+        (void *) PyArray_SortkindConverter,
+        (void *) PyArray_LexSort,
+        (void *) PyArray_Round,
+        (void *) PyArray_EquivTypenums,
+        (void *) PyArray_RegisterDataType,
+        (void *) PyArray_RegisterCastFunc,
+        (void *) PyArray_RegisterCanCast,
+        (void *) PyArray_InitArrFuncs,
+        (void *) PyArray_IntTupleFromIntp,
+        (void *) PyArray_TypeNumFromName,
+        (void *) PyArray_ClipmodeConverter,
+        (void *) PyArray_OutputConverter,
+        (void *) PyArray_BroadcastToShape,
+        (void *) _PyArray_SigintHandler,
+        (void *) _PyArray_GetSigintBuf,
+        (void *) PyArray_DescrAlignConverter,
+        (void *) PyArray_DescrAlignConverter2,
+        (void *) PyArray_SearchsideConverter,
+        (void *) PyArray_CheckAxis,
+        (void *) PyArray_OverflowMultiplyList,
+        (void *) PyArray_CompareString,
+        (void *) PyArray_MultiIterFromObjects,
+        (void *) PyArray_GetEndianness,
+        (void *) PyArray_GetNDArrayCFeatureVersion,
+        (void *) PyArray_Correlate2,
+        (void *) PyArray_NeighborhoodIterNew,
+        (void *) &PyTimeIntegerArrType_Type,
+        (void *) &PyDatetimeArrType_Type,
+        (void *) &PyTimedeltaArrType_Type,
+        (void *) &PyHalfArrType_Type,
+        (void *) &NpyIter_Type,
+        (void *) PyArray_SetDatetimeParseFunction,
+        (void *) PyArray_DatetimeToDatetimeStruct,
+        (void *) PyArray_TimedeltaToTimedeltaStruct,
+        (void *) PyArray_DatetimeStructToDatetime,
+        (void *) PyArray_TimedeltaStructToTimedelta,
+        (void *) NpyIter_New,
+        (void *) NpyIter_MultiNew,
+        (void *) NpyIter_AdvancedNew,
+        (void *) NpyIter_Copy,
+        (void *) NpyIter_Deallocate,
+        (void *) NpyIter_HasDelayedBufAlloc,
+        (void *) NpyIter_HasExternalLoop,
+        (void *) NpyIter_EnableExternalLoop,
+        (void *) NpyIter_GetInnerStrideArray,
+        (void *) NpyIter_GetInnerLoopSizePtr,
+        (void *) NpyIter_Reset,
+        (void *) NpyIter_ResetBasePointers,
+        (void *) NpyIter_ResetToIterIndexRange,
+        (void *) NpyIter_GetNDim,
+        (void *) NpyIter_GetNOp,
+        (void *) NpyIter_GetIterNext,
+        (void *) NpyIter_GetIterSize,
+        (void *) NpyIter_GetIterIndexRange,
+        (void *) NpyIter_GetIterIndex,
+        (void *) NpyIter_GotoIterIndex,
+        (void *) NpyIter_HasMultiIndex,
+        (void *) NpyIter_GetShape,
+        (void *) NpyIter_GetGetMultiIndex,
+        (void *) NpyIter_GotoMultiIndex,
+        (void *) NpyIter_RemoveMultiIndex,
+        (void *) NpyIter_HasIndex,
+        (void *) NpyIter_IsBuffered,
+        (void *) NpyIter_IsGrowInner,
+        (void *) NpyIter_GetBufferSize,
+        (void *) NpyIter_GetIndexPtr,
+        (void *) NpyIter_GotoIndex,
+        (void *) NpyIter_GetDataPtrArray,
+        (void *) NpyIter_GetDescrArray,
+        (void *) NpyIter_GetOperandArray,
+        (void *) NpyIter_GetIterView,
+        (void *) NpyIter_GetReadFlags,
+        (void *) NpyIter_GetWriteFlags,
+        (void *) NpyIter_DebugPrint,
+        (void *) NpyIter_IterationNeedsAPI,
+        (void *) NpyIter_GetInnerFixedStrideArray,
+        (void *) NpyIter_RemoveAxis,
+        (void *) NpyIter_GetAxisStrideArray,
+        (void *) NpyIter_RequiresBuffering,
+        (void *) NpyIter_GetInitialDataPtrArray,
+        (void *) NpyIter_CreateCompatibleStrides,
+        (void *) PyArray_CastingConverter,
+        (void *) PyArray_CountNonzero,
+        (void *) PyArray_PromoteTypes,
+        (void *) PyArray_MinScalarType,
+        (void *) PyArray_ResultType,
+        (void *) PyArray_CanCastArrayTo,
+        (void *) PyArray_CanCastTypeTo,
+        (void *) PyArray_EinsteinSum,
+        (void *) PyArray_NewLikeArray,
+        (void *) PyArray_GetArrayParamsFromObject,
+        (void *) PyArray_ConvertClipmodeSequence,
+        (void *) PyArray_MatrixProduct2,
+        (void *) NpyIter_IsFirstVisit,
+        (void *) PyArray_SetBaseObject,
+        (void *) PyArray_CreateSortedStridePerm,
+        (void *) PyArray_RemoveAxesInPlace,
+        (void *) PyArray_DebugPrint,
+        (void *) PyArray_FailUnlessWriteable,
+        (void *) PyArray_SetUpdateIfCopyBase,
+        (void *) PyDataMem_NEW,
+        (void *) PyDataMem_FREE,
+        (void *) PyDataMem_RENEW,
+        (void *) PyDataMem_SetEventHook,
+        (NPY_CASTING *) &NPY_DEFAULT_ASSIGN_CASTING,
+        (void *) PyArray_MapIterSwapAxes,
+        (void *) PyArray_MapIterArray,
+        (void *) PyArray_MapIterNext,
+        (void *) PyArray_Partition,
+        (void *) PyArray_ArgPartition,
+        (void *) PyArray_SelectkindConverter,
+        (void *) PyDataMem_NEW_ZEROED,
+        (void *) PyArray_CheckAnyScalarExact,
+        (void *) PyArray_MapIterArrayCopyIfOverlap,
+        (void *) PyArray_ResolveWritebackIfCopy,
+        (void *) PyArray_SetWritebackIfCopyBase,
+        (void *) PyDataMem_SetHandler,
+        (void *) PyDataMem_GetHandler,
+        (PyObject* *) &PyDataMem_DefaultHandler
+};
diff --git a/nanvix-port/generated-headers/__multiarray_api.h b/nanvix-port/generated-headers/__multiarray_api.h
new file mode 100644
index 000000000000..4c626832ad2a
--- /dev/null
+++ b/nanvix-port/generated-headers/__multiarray_api.h
@@ -0,0 +1,1566 @@
+
+#if defined(_MULTIARRAYMODULE) || defined(WITH_CPYCHECKER_STEALS_REFERENCE_TO_ARG_ATTRIBUTE)
+
+typedef struct {  /* element type of the _PyArrayScalar_BoolValues array declared in this branch */
+        PyObject_HEAD  /* CPython object header macro */
+        npy_bool obval;  /* npy_bool payload stored after the header */
+} PyBoolScalarObject;
+
+extern NPY_NO_EXPORT PyTypeObject PyArrayMapIter_Type;
+extern NPY_NO_EXPORT PyTypeObject PyArrayNeighborhoodIter_Type;
+extern NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[2];  /* two-element array; this same declaration is repeated further down in this branch */
+
+NPY_NO_EXPORT  unsigned int PyArray_GetNDArrayCVersion \
+       (void);
+extern NPY_NO_EXPORT PyTypeObject PyBigArray_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyArray_Type;
+
+extern NPY_NO_EXPORT PyArray_DTypeMeta PyArrayDescr_TypeFull;  /* the object itself, declared with type PyArray_DTypeMeta */
+#define PyArrayDescr_Type (*(PyTypeObject *)(&PyArrayDescr_TypeFull))  /* same object accessed as a PyTypeObject lvalue via pointer cast */
+
+extern NPY_NO_EXPORT PyTypeObject PyArrayFlags_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyArrayIter_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyArrayMultiIter_Type;
+
+extern NPY_NO_EXPORT int NPY_NUMUSERTYPES;
+
+extern NPY_NO_EXPORT PyTypeObject PyBoolArrType_Type;
+
+extern NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[2];
+
+extern NPY_NO_EXPORT PyTypeObject PyGenericArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyNumberArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyIntegerArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PySignedIntegerArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyUnsignedIntegerArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyInexactArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyFloatingArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyComplexFloatingArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyFlexibleArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyCharacterArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyByteArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyShortArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyIntArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyLongArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyLongLongArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyUByteArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyUShortArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyUIntArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyULongArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyULongLongArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyFloatArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyDoubleArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyLongDoubleArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyCFloatArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyCDoubleArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyCLongDoubleArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyObjectArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyStringArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyUnicodeArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyVoidArrType_Type;
+
+NPY_NO_EXPORT  int PyArray_SetNumericOps \
+       (PyObject *);
+NPY_NO_EXPORT  PyObject * PyArray_GetNumericOps \
+       (void);
+NPY_NO_EXPORT  int PyArray_INCREF \
+       (PyArrayObject *);
+NPY_NO_EXPORT  int PyArray_XDECREF \
+       (PyArrayObject *);
+NPY_NO_EXPORT  void PyArray_SetStringFunction \
+       (PyObject *, int);
+NPY_NO_EXPORT  PyArray_Descr * PyArray_DescrFromType \
+       (int);
+NPY_NO_EXPORT  PyObject * PyArray_TypeObjectFromType \
+       (int);
+NPY_NO_EXPORT  char * PyArray_Zero \
+       (PyArrayObject *);
+NPY_NO_EXPORT  char * PyArray_One \
+       (PyArrayObject *);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(2) PyObject * PyArray_CastToType \
+       (PyArrayObject *, PyArray_Descr *, int);
+NPY_NO_EXPORT  int PyArray_CastTo \
+       (PyArrayObject *, PyArrayObject *);
+NPY_NO_EXPORT  int PyArray_CastAnyTo \
+       (PyArrayObject *, PyArrayObject *);
+NPY_NO_EXPORT  int PyArray_CanCastSafely \
+       (int, int);
+NPY_NO_EXPORT  npy_bool PyArray_CanCastTo \
+       (PyArray_Descr *, PyArray_Descr *);
+NPY_NO_EXPORT  int PyArray_ObjectType \
+       (PyObject *, int);
+NPY_NO_EXPORT  PyArray_Descr * PyArray_DescrFromObject \
+       (PyObject *, PyArray_Descr *);
+NPY_NO_EXPORT  PyArrayObject ** PyArray_ConvertToCommonType \
+       (PyObject *, int *);
+NPY_NO_EXPORT  PyArray_Descr * PyArray_DescrFromScalar \
+       (PyObject *);
+NPY_NO_EXPORT  PyArray_Descr * PyArray_DescrFromTypeObject \
+       (PyObject *);
+NPY_NO_EXPORT  npy_intp PyArray_Size \
+       (PyObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Scalar \
+       (void *, PyArray_Descr *, PyObject *);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(2) PyObject * PyArray_FromScalar \
+       (PyObject *, PyArray_Descr *);
+NPY_NO_EXPORT  void PyArray_ScalarAsCtype \
+       (PyObject *, void *);
+NPY_NO_EXPORT  int PyArray_CastScalarToCtype \
+       (PyObject *, void *, PyArray_Descr *);
+NPY_NO_EXPORT  int PyArray_CastScalarDirect \
+       (PyObject *, PyArray_Descr *, void *, int);
+NPY_NO_EXPORT  PyObject * PyArray_ScalarFromObject \
+       (PyObject *);
+NPY_NO_EXPORT  PyArray_VectorUnaryFunc * PyArray_GetCastFunc \
+       (PyArray_Descr *, int);
+NPY_NO_EXPORT  PyObject * PyArray_FromDims \
+       (int NPY_UNUSED(nd), int *NPY_UNUSED(d), int NPY_UNUSED(type));
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(3) PyObject * PyArray_FromDimsAndDataAndDescr \
+       (int NPY_UNUSED(nd), int *NPY_UNUSED(d), PyArray_Descr *, char *NPY_UNUSED(data));
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(2) PyObject * PyArray_FromAny \
+       (PyObject *, PyArray_Descr *, int, int, int, PyObject *);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(1) PyObject * PyArray_EnsureArray \
+       (PyObject *);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(1) PyObject * PyArray_EnsureAnyArray \
+       (PyObject *);
+NPY_NO_EXPORT  PyObject * PyArray_FromFile \
+       (FILE *, PyArray_Descr *, npy_intp, char *);
+NPY_NO_EXPORT  PyObject * PyArray_FromString \
+       (char *, npy_intp, PyArray_Descr *, npy_intp, char *);
+NPY_NO_EXPORT  PyObject * PyArray_FromBuffer \
+       (PyObject *, PyArray_Descr *, npy_intp, npy_intp);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(2) PyObject * PyArray_FromIter \
+       (PyObject *, PyArray_Descr *, npy_intp);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(1) PyObject * PyArray_Return \
+       (PyArrayObject *);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(2) PyObject * PyArray_GetField \
+       (PyArrayObject *, PyArray_Descr *, int);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(2) int PyArray_SetField \
+       (PyArrayObject *, PyArray_Descr *, int, PyObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Byteswap \
+       (PyArrayObject *, npy_bool);
+NPY_NO_EXPORT  PyObject * PyArray_Resize \
+       (PyArrayObject *, PyArray_Dims *, int, NPY_ORDER NPY_UNUSED(order));
+NPY_NO_EXPORT  int PyArray_MoveInto \
+       (PyArrayObject *, PyArrayObject *);
+NPY_NO_EXPORT  int PyArray_CopyInto \
+       (PyArrayObject *, PyArrayObject *);
+NPY_NO_EXPORT  int PyArray_CopyAnyInto \
+       (PyArrayObject *, PyArrayObject *);
+NPY_NO_EXPORT  int PyArray_CopyObject \
+       (PyArrayObject *, PyObject *);
+NPY_NO_EXPORT  PyObject * PyArray_NewCopy \
+       (PyArrayObject *, NPY_ORDER);
+NPY_NO_EXPORT  PyObject * PyArray_ToList \
+       (PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_ToString \
+       (PyArrayObject *, NPY_ORDER);
+NPY_NO_EXPORT  int PyArray_ToFile \
+       (PyArrayObject *, FILE *, char *, char *);
+NPY_NO_EXPORT  int PyArray_Dump \
+       (PyObject *, PyObject *, int);
+NPY_NO_EXPORT  PyObject * PyArray_Dumps \
+       (PyObject *, int);
+NPY_NO_EXPORT  int PyArray_ValidType \
+       (int);
+NPY_NO_EXPORT  void PyArray_UpdateFlags \
+       (PyArrayObject *, int);
+NPY_NO_EXPORT  PyObject * PyArray_New \
+       (PyTypeObject *, int, npy_intp const *, int, npy_intp const *, void *, int, int, PyObject *);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(2) PyObject * PyArray_NewFromDescr \
+       (PyTypeObject *, PyArray_Descr *, int, npy_intp const *, npy_intp const *, void *, int, PyObject *);
+NPY_NO_EXPORT  PyArray_Descr * PyArray_DescrNew \
+       (PyArray_Descr *);
+NPY_NO_EXPORT  PyArray_Descr * PyArray_DescrNewFromType \
+       (int);
+NPY_NO_EXPORT  double PyArray_GetPriority \
+       (PyObject *, double);
+NPY_NO_EXPORT  PyObject * PyArray_IterNew \
+       (PyObject *);
+NPY_NO_EXPORT  PyObject* PyArray_MultiIterNew \
+       (int, ...);
+NPY_NO_EXPORT  int PyArray_PyIntAsInt \
+       (PyObject *);
+NPY_NO_EXPORT  npy_intp PyArray_PyIntAsIntp \
+       (PyObject *);
+NPY_NO_EXPORT  int PyArray_Broadcast \
+       (PyArrayMultiIterObject *);
+NPY_NO_EXPORT  void PyArray_FillObjectArray \
+       (PyArrayObject *, PyObject *);
+NPY_NO_EXPORT  int PyArray_FillWithScalar \
+       (PyArrayObject *, PyObject *);
+NPY_NO_EXPORT  npy_bool PyArray_CheckStrides \
+       (int, int, npy_intp, npy_intp, npy_intp const *, npy_intp const *);
+NPY_NO_EXPORT  PyArray_Descr * PyArray_DescrNewByteorder \
+       (PyArray_Descr *, char);
+NPY_NO_EXPORT  PyObject * PyArray_IterAllButAxis \
+       (PyObject *, int *);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(2) PyObject * PyArray_CheckFromAny \
+       (PyObject *, PyArray_Descr *, int, int, int, PyObject *);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(2) PyObject * PyArray_FromArray \
+       (PyArrayObject *, PyArray_Descr *, int);
+NPY_NO_EXPORT  PyObject * PyArray_FromInterface \
+       (PyObject *);
+NPY_NO_EXPORT  PyObject * PyArray_FromStructInterface \
+       (PyObject *);
+NPY_NO_EXPORT  PyObject * PyArray_FromArrayAttr \
+       (PyObject *, PyArray_Descr *, PyObject *);
+NPY_NO_EXPORT  NPY_SCALARKIND PyArray_ScalarKind \
+       (int, PyArrayObject **);
+NPY_NO_EXPORT  int PyArray_CanCoerceScalar \
+       (int, int, NPY_SCALARKIND);
+NPY_NO_EXPORT  PyObject * PyArray_NewFlagsObject \
+       (PyObject *);
+NPY_NO_EXPORT  npy_bool PyArray_CanCastScalar \
+       (PyTypeObject *, PyTypeObject *);
+NPY_NO_EXPORT  int PyArray_CompareUCS4 \
+       (npy_ucs4 const *, npy_ucs4 const *, size_t);
+NPY_NO_EXPORT  int PyArray_RemoveSmallest \
+       (PyArrayMultiIterObject *);
+NPY_NO_EXPORT  int PyArray_ElementStrides \
+       (PyObject *);
+NPY_NO_EXPORT  void PyArray_Item_INCREF \
+       (char *, PyArray_Descr *);
+NPY_NO_EXPORT  void PyArray_Item_XDECREF \
+       (char *, PyArray_Descr *);
+NPY_NO_EXPORT  PyObject * PyArray_FieldNames \
+       (PyObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Transpose \
+       (PyArrayObject *, PyArray_Dims *);
+NPY_NO_EXPORT  PyObject * PyArray_TakeFrom \
+       (PyArrayObject *, PyObject *, int, PyArrayObject *, NPY_CLIPMODE);
+NPY_NO_EXPORT  PyObject * PyArray_PutTo \
+       (PyArrayObject *, PyObject*, PyObject *, NPY_CLIPMODE);
+NPY_NO_EXPORT  PyObject * PyArray_PutMask \
+       (PyArrayObject *, PyObject*, PyObject*);
+NPY_NO_EXPORT  PyObject * PyArray_Repeat \
+       (PyArrayObject *, PyObject *, int);
+NPY_NO_EXPORT  PyObject * PyArray_Choose \
+       (PyArrayObject *, PyObject *, PyArrayObject *, NPY_CLIPMODE);
+NPY_NO_EXPORT  int PyArray_Sort \
+       (PyArrayObject *, int, NPY_SORTKIND);
+NPY_NO_EXPORT  PyObject * PyArray_ArgSort \
+       (PyArrayObject *, int, NPY_SORTKIND);
+NPY_NO_EXPORT  PyObject * PyArray_SearchSorted \
+       (PyArrayObject *, PyObject *, NPY_SEARCHSIDE, PyObject *);
+NPY_NO_EXPORT  PyObject * PyArray_ArgMax \
+       (PyArrayObject *, int, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_ArgMin \
+       (PyArrayObject *, int, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Reshape \
+       (PyArrayObject *, PyObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Newshape \
+       (PyArrayObject *, PyArray_Dims *, NPY_ORDER);
+NPY_NO_EXPORT  PyObject * PyArray_Squeeze \
+       (PyArrayObject *);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(2) PyObject * PyArray_View \
+       (PyArrayObject *, PyArray_Descr *, PyTypeObject *);
+NPY_NO_EXPORT  PyObject * PyArray_SwapAxes \
+       (PyArrayObject *, int, int);
+NPY_NO_EXPORT  PyObject * PyArray_Max \
+       (PyArrayObject *, int, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Min \
+       (PyArrayObject *, int, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Ptp \
+       (PyArrayObject *, int, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Mean \
+       (PyArrayObject *, int, int, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Trace \
+       (PyArrayObject *, int, int, int, int, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Diagonal \
+       (PyArrayObject *, int, int, int);
+NPY_NO_EXPORT  PyObject * PyArray_Clip \
+       (PyArrayObject *, PyObject *, PyObject *, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Conjugate \
+       (PyArrayObject *, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Nonzero \
+       (PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Std \
+       (PyArrayObject *, int, int, PyArrayObject *, int);
+NPY_NO_EXPORT  PyObject * PyArray_Sum \
+       (PyArrayObject *, int, int, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_CumSum \
+       (PyArrayObject *, int, int, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Prod \
+       (PyArrayObject *, int, int, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_CumProd \
+       (PyArrayObject *, int, int, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_All \
+       (PyArrayObject *, int, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Any \
+       (PyArrayObject *, int, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Compress \
+       (PyArrayObject *, PyObject *, int, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Flatten \
+       (PyArrayObject *, NPY_ORDER);
+NPY_NO_EXPORT  PyObject * PyArray_Ravel \
+       (PyArrayObject *, NPY_ORDER);
+NPY_NO_EXPORT  npy_intp PyArray_MultiplyList \
+       (npy_intp const *, int);
+NPY_NO_EXPORT  int PyArray_MultiplyIntList \
+       (int const *, int);
+NPY_NO_EXPORT  void * PyArray_GetPtr \
+       (PyArrayObject *, npy_intp const*);
+NPY_NO_EXPORT  int PyArray_CompareLists \
+       (npy_intp const *, npy_intp const *, int);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(5) int PyArray_AsCArray \
+       (PyObject **, void *, npy_intp *, int, PyArray_Descr*);
+NPY_NO_EXPORT  int PyArray_As1D \
+       (PyObject **NPY_UNUSED(op), char **NPY_UNUSED(ptr), int *NPY_UNUSED(d1), int NPY_UNUSED(typecode));
+NPY_NO_EXPORT  int PyArray_As2D \
+       (PyObject **NPY_UNUSED(op), char ***NPY_UNUSED(ptr), int *NPY_UNUSED(d1), int *NPY_UNUSED(d2), int NPY_UNUSED(typecode));
+NPY_NO_EXPORT  int PyArray_Free \
+       (PyObject *, void *);
+NPY_NO_EXPORT  int PyArray_Converter \
+       (PyObject *, PyObject **);
+NPY_NO_EXPORT  int PyArray_IntpFromSequence \
+       (PyObject *, npy_intp *, int);
+NPY_NO_EXPORT  PyObject * PyArray_Concatenate \
+       (PyObject *, int);
+NPY_NO_EXPORT  PyObject * PyArray_InnerProduct \
+       (PyObject *, PyObject *);
+NPY_NO_EXPORT  PyObject * PyArray_MatrixProduct \
+       (PyObject *, PyObject *);
+NPY_NO_EXPORT  PyObject * PyArray_CopyAndTranspose \
+       (PyObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Correlate \
+       (PyObject *, PyObject *, int);
+NPY_NO_EXPORT  int PyArray_TypestrConvert \
+       (int, int);
+NPY_NO_EXPORT  int PyArray_DescrConverter \
+       (PyObject *, PyArray_Descr **);
+NPY_NO_EXPORT  int PyArray_DescrConverter2 \
+       (PyObject *, PyArray_Descr **);
+NPY_NO_EXPORT  int PyArray_IntpConverter \
+       (PyObject *, PyArray_Dims *);
+NPY_NO_EXPORT  int PyArray_BufferConverter \
+       (PyObject *, PyArray_Chunk *);
+NPY_NO_EXPORT  int PyArray_AxisConverter \
+       (PyObject *, int *);
+NPY_NO_EXPORT  int PyArray_BoolConverter \
+       (PyObject *, npy_bool *);
+NPY_NO_EXPORT  int PyArray_ByteorderConverter \
+       (PyObject *, char *);
+NPY_NO_EXPORT  int PyArray_OrderConverter \
+       (PyObject *, NPY_ORDER *);
+NPY_NO_EXPORT  unsigned char PyArray_EquivTypes \
+       (PyArray_Descr *, PyArray_Descr *);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(3) PyObject * PyArray_Zeros \
+       (int, npy_intp const *, PyArray_Descr *, int);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(3) PyObject * PyArray_Empty \
+       (int, npy_intp const *, PyArray_Descr *, int);
+NPY_NO_EXPORT  PyObject * PyArray_Where \
+       (PyObject *, PyObject *, PyObject *);
+NPY_NO_EXPORT  PyObject * PyArray_Arange \
+       (double, double, double, int);
+NPY_NO_EXPORT  PyObject * PyArray_ArangeObj \
+       (PyObject *, PyObject *, PyObject *, PyArray_Descr *);
+NPY_NO_EXPORT  int PyArray_SortkindConverter \
+       (PyObject *, NPY_SORTKIND *);
+NPY_NO_EXPORT  PyObject * PyArray_LexSort \
+       (PyObject *, int);
+NPY_NO_EXPORT  PyObject * PyArray_Round \
+       (PyArrayObject *, int, PyArrayObject *);
+NPY_NO_EXPORT  unsigned char PyArray_EquivTypenums \
+       (int, int);
+NPY_NO_EXPORT  int PyArray_RegisterDataType \
+       (PyArray_Descr *);
+NPY_NO_EXPORT  int PyArray_RegisterCastFunc \
+       (PyArray_Descr *, int, PyArray_VectorUnaryFunc *);
+NPY_NO_EXPORT  int PyArray_RegisterCanCast \
+       (PyArray_Descr *, int, NPY_SCALARKIND);
+NPY_NO_EXPORT  void PyArray_InitArrFuncs \
+       (PyArray_ArrFuncs *);
+NPY_NO_EXPORT  PyObject * PyArray_IntTupleFromIntp \
+       (int, npy_intp const *);
+NPY_NO_EXPORT  int PyArray_TypeNumFromName \
+       (char const *);
+NPY_NO_EXPORT  int PyArray_ClipmodeConverter \
+       (PyObject *, NPY_CLIPMODE *);
+NPY_NO_EXPORT  int PyArray_OutputConverter \
+       (PyObject *, PyArrayObject **);
+NPY_NO_EXPORT  PyObject * PyArray_BroadcastToShape \
+       (PyObject *, npy_intp *, int);
+NPY_NO_EXPORT  void _PyArray_SigintHandler \
+       (int);
+NPY_NO_EXPORT  void* _PyArray_GetSigintBuf \
+       (void);
+NPY_NO_EXPORT  int PyArray_DescrAlignConverter \
+       (PyObject *, PyArray_Descr **);
+NPY_NO_EXPORT  int PyArray_DescrAlignConverter2 \
+       (PyObject *, PyArray_Descr **);
+NPY_NO_EXPORT  int PyArray_SearchsideConverter \
+       (PyObject *, void *);
+NPY_NO_EXPORT  PyObject * PyArray_CheckAxis \
+       (PyArrayObject *, int *, int);
+NPY_NO_EXPORT  npy_intp PyArray_OverflowMultiplyList \
+       (npy_intp const *, int);
+NPY_NO_EXPORT  int PyArray_CompareString \
+       (const char *, const char *, size_t);
+NPY_NO_EXPORT  PyObject* PyArray_MultiIterFromObjects \
+       (PyObject **, int, int, ...);
+NPY_NO_EXPORT  int PyArray_GetEndianness \
+       (void);
+NPY_NO_EXPORT  unsigned int PyArray_GetNDArrayCFeatureVersion \
+       (void);
+NPY_NO_EXPORT  PyObject * PyArray_Correlate2 \
+       (PyObject *, PyObject *, int);
+NPY_NO_EXPORT  PyObject* PyArray_NeighborhoodIterNew \
+       (PyArrayIterObject *, const npy_intp *, int, PyArrayObject*);
+extern NPY_NO_EXPORT PyTypeObject PyTimeIntegerArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyDatetimeArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyTimedeltaArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject PyHalfArrType_Type;
+
+extern NPY_NO_EXPORT PyTypeObject NpyIter_Type;
+
+NPY_NO_EXPORT  void PyArray_SetDatetimeParseFunction \
+       (PyObject *NPY_UNUSED(op));
+NPY_NO_EXPORT  void PyArray_DatetimeToDatetimeStruct \
+       (npy_datetime NPY_UNUSED(val), NPY_DATETIMEUNIT NPY_UNUSED(fr), npy_datetimestruct *);
+NPY_NO_EXPORT  void PyArray_TimedeltaToTimedeltaStruct \
+       (npy_timedelta NPY_UNUSED(val), NPY_DATETIMEUNIT NPY_UNUSED(fr), npy_timedeltastruct *);
+NPY_NO_EXPORT  npy_datetime PyArray_DatetimeStructToDatetime \
+       (NPY_DATETIMEUNIT NPY_UNUSED(fr), npy_datetimestruct *NPY_UNUSED(d));
+NPY_NO_EXPORT  npy_datetime PyArray_TimedeltaStructToTimedelta \
+       (NPY_DATETIMEUNIT NPY_UNUSED(fr), npy_timedeltastruct *NPY_UNUSED(d));
+NPY_NO_EXPORT  NpyIter * NpyIter_New \
+       (PyArrayObject *, npy_uint32, NPY_ORDER, NPY_CASTING, PyArray_Descr*);
+NPY_NO_EXPORT  NpyIter * NpyIter_MultiNew \
+       (int, PyArrayObject **, npy_uint32, NPY_ORDER, NPY_CASTING, npy_uint32 *, PyArray_Descr **);
+NPY_NO_EXPORT  NpyIter * NpyIter_AdvancedNew \
+       (int, PyArrayObject **, npy_uint32, NPY_ORDER, NPY_CASTING, npy_uint32 *, PyArray_Descr **, int, int **, npy_intp *, npy_intp);
+NPY_NO_EXPORT  NpyIter * NpyIter_Copy \
+       (NpyIter *);
+NPY_NO_EXPORT  int NpyIter_Deallocate \
+       (NpyIter *);
+NPY_NO_EXPORT  npy_bool NpyIter_HasDelayedBufAlloc \
+       (NpyIter *);
+NPY_NO_EXPORT  npy_bool NpyIter_HasExternalLoop \
+       (NpyIter *);
+NPY_NO_EXPORT  int NpyIter_EnableExternalLoop \
+       (NpyIter *);
+NPY_NO_EXPORT  npy_intp * NpyIter_GetInnerStrideArray \
+       (NpyIter *);
+NPY_NO_EXPORT  npy_intp * NpyIter_GetInnerLoopSizePtr \
+       (NpyIter *);
+NPY_NO_EXPORT  int NpyIter_Reset \
+       (NpyIter *, char **);
+NPY_NO_EXPORT  int NpyIter_ResetBasePointers \
+       (NpyIter *, char **, char **);
+NPY_NO_EXPORT  int NpyIter_ResetToIterIndexRange \
+       (NpyIter *, npy_intp, npy_intp, char **);
+NPY_NO_EXPORT  int NpyIter_GetNDim \
+       (NpyIter *);
+NPY_NO_EXPORT  int NpyIter_GetNOp \
+       (NpyIter *);
+NPY_NO_EXPORT  NpyIter_IterNextFunc * NpyIter_GetIterNext \
+       (NpyIter *, char **);
+NPY_NO_EXPORT  npy_intp NpyIter_GetIterSize \
+       (NpyIter *);
+NPY_NO_EXPORT  void NpyIter_GetIterIndexRange \
+       (NpyIter *, npy_intp *, npy_intp *);
+NPY_NO_EXPORT  npy_intp NpyIter_GetIterIndex \
+       (NpyIter *);
+NPY_NO_EXPORT  int NpyIter_GotoIterIndex \
+       (NpyIter *, npy_intp);
+NPY_NO_EXPORT  npy_bool NpyIter_HasMultiIndex \
+       (NpyIter *);
+NPY_NO_EXPORT  int NpyIter_GetShape \
+       (NpyIter *, npy_intp *);
+NPY_NO_EXPORT  NpyIter_GetMultiIndexFunc * NpyIter_GetGetMultiIndex \
+       (NpyIter *, char **);
+NPY_NO_EXPORT  int NpyIter_GotoMultiIndex \
+       (NpyIter *, npy_intp const *);
+NPY_NO_EXPORT  int NpyIter_RemoveMultiIndex \
+       (NpyIter *);
+NPY_NO_EXPORT  npy_bool NpyIter_HasIndex \
+       (NpyIter *);
+NPY_NO_EXPORT  npy_bool NpyIter_IsBuffered \
+       (NpyIter *);
+NPY_NO_EXPORT  npy_bool NpyIter_IsGrowInner \
+       (NpyIter *);
+NPY_NO_EXPORT  npy_intp NpyIter_GetBufferSize \
+       (NpyIter *);
+NPY_NO_EXPORT  npy_intp * NpyIter_GetIndexPtr \
+       (NpyIter *);
+NPY_NO_EXPORT  int NpyIter_GotoIndex \
+       (NpyIter *, npy_intp);
+NPY_NO_EXPORT  char ** NpyIter_GetDataPtrArray \
+       (NpyIter *);
+NPY_NO_EXPORT  PyArray_Descr ** NpyIter_GetDescrArray \
+       (NpyIter *);
+NPY_NO_EXPORT  PyArrayObject ** NpyIter_GetOperandArray \
+       (NpyIter *);
+NPY_NO_EXPORT  PyArrayObject * NpyIter_GetIterView \
+       (NpyIter *, npy_intp);
+NPY_NO_EXPORT  void NpyIter_GetReadFlags \
+       (NpyIter *, char *);
+NPY_NO_EXPORT  void NpyIter_GetWriteFlags \
+       (NpyIter *, char *);
+NPY_NO_EXPORT  void NpyIter_DebugPrint \
+       (NpyIter *);
+NPY_NO_EXPORT  npy_bool NpyIter_IterationNeedsAPI \
+       (NpyIter *);
+NPY_NO_EXPORT  void NpyIter_GetInnerFixedStrideArray \
+       (NpyIter *, npy_intp *);
+NPY_NO_EXPORT  int NpyIter_RemoveAxis \
+       (NpyIter *, int);
+NPY_NO_EXPORT  npy_intp * NpyIter_GetAxisStrideArray \
+       (NpyIter *, int);
+NPY_NO_EXPORT  npy_bool NpyIter_RequiresBuffering \
+       (NpyIter *);
+NPY_NO_EXPORT  char ** NpyIter_GetInitialDataPtrArray \
+       (NpyIter *);
+NPY_NO_EXPORT  int NpyIter_CreateCompatibleStrides \
+       (NpyIter *, npy_intp, npy_intp *);
+NPY_NO_EXPORT  int PyArray_CastingConverter \
+       (PyObject *, NPY_CASTING *);
+NPY_NO_EXPORT  npy_intp PyArray_CountNonzero \
+       (PyArrayObject *);
+NPY_NO_EXPORT  PyArray_Descr * PyArray_PromoteTypes \
+       (PyArray_Descr *, PyArray_Descr *);
+NPY_NO_EXPORT  PyArray_Descr * PyArray_MinScalarType \
+       (PyArrayObject *);
+NPY_NO_EXPORT  PyArray_Descr * PyArray_ResultType \
+       (npy_intp, PyArrayObject *arrs[], npy_intp, PyArray_Descr *descrs[]);
+NPY_NO_EXPORT  npy_bool PyArray_CanCastArrayTo \
+       (PyArrayObject *, PyArray_Descr *, NPY_CASTING);
+NPY_NO_EXPORT  npy_bool PyArray_CanCastTypeTo \
+       (PyArray_Descr *, PyArray_Descr *, NPY_CASTING);
+NPY_NO_EXPORT  PyArrayObject * PyArray_EinsteinSum \
+       (char *, npy_intp, PyArrayObject **, PyArray_Descr *, NPY_ORDER, NPY_CASTING, PyArrayObject *);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(3) PyObject * PyArray_NewLikeArray \
+       (PyArrayObject *, NPY_ORDER, PyArray_Descr *, int);
+NPY_NO_EXPORT  int PyArray_GetArrayParamsFromObject \
+       (PyObject *NPY_UNUSED(op), PyArray_Descr *NPY_UNUSED(requested_dtype), npy_bool NPY_UNUSED(writeable), PyArray_Descr **NPY_UNUSED(out_dtype), int *NPY_UNUSED(out_ndim), npy_intp *NPY_UNUSED(out_dims), PyArrayObject **NPY_UNUSED(out_arr), PyObject *NPY_UNUSED(context));
+NPY_NO_EXPORT  int PyArray_ConvertClipmodeSequence \
+       (PyObject *, NPY_CLIPMODE *, int);
+NPY_NO_EXPORT  PyObject * PyArray_MatrixProduct2 \
+       (PyObject *, PyObject *, PyArrayObject*);
+NPY_NO_EXPORT  npy_bool NpyIter_IsFirstVisit \
+       (NpyIter *, int);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(2) int PyArray_SetBaseObject \
+       (PyArrayObject *, PyObject *);
+NPY_NO_EXPORT  void PyArray_CreateSortedStridePerm \
+       (int, npy_intp const *, npy_stride_sort_item *);
+NPY_NO_EXPORT  void PyArray_RemoveAxesInPlace \
+       (PyArrayObject *, const npy_bool *);
+NPY_NO_EXPORT  void PyArray_DebugPrint \
+       (PyArrayObject *);
+NPY_NO_EXPORT  int PyArray_FailUnlessWriteable \
+       (PyArrayObject *, const char *);
+NPY_NO_EXPORT NPY_STEALS_REF_TO_ARG(2) int PyArray_SetUpdateIfCopyBase \
+       (PyArrayObject *, PyArrayObject *);
+NPY_NO_EXPORT  void * PyDataMem_NEW \
+       (size_t);
+NPY_NO_EXPORT  void PyDataMem_FREE \
+       (void *);
+NPY_NO_EXPORT  void * PyDataMem_RENEW \
+       (void *, size_t);
+NPY_NO_EXPORT  PyDataMem_EventHookFunc * PyDataMem_SetEventHook \
+       (PyDataMem_EventHookFunc *, void *, void **);
+extern NPY_NO_EXPORT NPY_CASTING NPY_DEFAULT_ASSIGN_CASTING;
+
+NPY_NO_EXPORT  void PyArray_MapIterSwapAxes \
+       (PyArrayMapIterObject *, PyArrayObject **, int);
+NPY_NO_EXPORT  PyObject * PyArray_MapIterArray \
+       (PyArrayObject *, PyObject *);
+NPY_NO_EXPORT  void PyArray_MapIterNext \
+       (PyArrayMapIterObject *);
+NPY_NO_EXPORT  int PyArray_Partition \
+       (PyArrayObject *, PyArrayObject *, int, NPY_SELECTKIND);
+NPY_NO_EXPORT  PyObject * PyArray_ArgPartition \
+       (PyArrayObject *, PyArrayObject *, int, NPY_SELECTKIND);
+NPY_NO_EXPORT  int PyArray_SelectkindConverter \
+       (PyObject *, NPY_SELECTKIND *);
+NPY_NO_EXPORT  void * PyDataMem_NEW_ZEROED \
+       (size_t, size_t);
+NPY_NO_EXPORT  int PyArray_CheckAnyScalarExact \
+       (PyObject *);
+NPY_NO_EXPORT  PyObject * PyArray_MapIterArrayCopyIfOverlap \
+       (PyArrayObject *, PyObject *, int, PyArrayObject *);
+NPY_NO_EXPORT  int PyArray_ResolveWritebackIfCopy \
+       (PyArrayObject *);
+NPY_NO_EXPORT  int PyArray_SetWritebackIfCopyBase \
+       (PyArrayObject *, PyArrayObject *);
+NPY_NO_EXPORT  PyObject * PyDataMem_SetHandler \
+       (PyObject *);
+NPY_NO_EXPORT  PyObject * PyDataMem_GetHandler \
+       (void);
+extern NPY_NO_EXPORT PyObject* PyDataMem_DefaultHandler;
+
+
+#else
+
+#if defined(PY_ARRAY_UNIQUE_SYMBOL)
+#define PyArray_API PY_ARRAY_UNIQUE_SYMBOL  /* every later use of PyArray_API expands to the caller-chosen symbol */
+#endif  /* PY_ARRAY_UNIQUE_SYMBOL */
+
+#if defined(NO_IMPORT) || defined(NO_IMPORT_ARRAY)
+extern void **PyArray_API;  /* declaration only: no storage is defined in this translation unit */
+#else
+#if defined(PY_ARRAY_UNIQUE_SYMBOL)
+void **PyArray_API;  /* external-linkage definition under the unique symbol name */
+#else
+static void **PyArray_API=NULL;  /* file-local table pointer, initialised to NULL */
+#endif  /* PY_ARRAY_UNIQUE_SYMBOL */
+#endif  /* NO_IMPORT || NO_IMPORT_ARRAY */
+
+#define PyArray_GetNDArrayCVersion \
+        (*(unsigned int (*)(void)) \
+    PyArray_API[0])  /* table slot 0, cast to a function pointer of type unsigned int (*)(void) and dereferenced */
+#define PyBigArray_Type (*(PyTypeObject *)PyArray_API[1])  /* slots 1-6: each entry is cast to PyTypeObject * and dereferenced */
+#define PyArray_Type (*(PyTypeObject *)PyArray_API[2])
+#define PyArrayDescr_Type (*(PyTypeObject *)PyArray_API[3])
+#define PyArrayFlags_Type (*(PyTypeObject *)PyArray_API[4])
+#define PyArrayIter_Type (*(PyTypeObject *)PyArray_API[5])
+#define PyArrayMultiIter_Type (*(PyTypeObject *)PyArray_API[6])
+#define NPY_NUMUSERTYPES (*(int *)PyArray_API[7])  /* exception to the pattern: slot 7 is read through an int pointer */
+#define PyBoolArrType_Type (*(PyTypeObject *)PyArray_API[8])
+#define _PyArrayScalar_BoolValues ((PyBoolScalarObject *)PyArray_API[9])  /* cast only, not dereferenced: expands to a PyBoolScalarObject pointer */
+#define PyGenericArrType_Type (*(PyTypeObject *)PyArray_API[10])
+#define PyNumberArrType_Type (*(PyTypeObject *)PyArray_API[11])
+#define PyIntegerArrType_Type (*(PyTypeObject *)PyArray_API[12])
+#define PySignedIntegerArrType_Type (*(PyTypeObject *)PyArray_API[13])
+#define PyUnsignedIntegerArrType_Type (*(PyTypeObject *)PyArray_API[14])
+#define PyInexactArrType_Type (*(PyTypeObject *)PyArray_API[15])
+#define PyFloatingArrType_Type (*(PyTypeObject *)PyArray_API[16])
+#define PyComplexFloatingArrType_Type (*(PyTypeObject *)PyArray_API[17])
+#define PyFlexibleArrType_Type (*(PyTypeObject *)PyArray_API[18])
+#define PyCharacterArrType_Type (*(PyTypeObject *)PyArray_API[19])
+#define PyByteArrType_Type (*(PyTypeObject *)PyArray_API[20])
+#define PyShortArrType_Type (*(PyTypeObject *)PyArray_API[21])
+#define PyIntArrType_Type (*(PyTypeObject *)PyArray_API[22])
+#define PyLongArrType_Type (*(PyTypeObject *)PyArray_API[23])
+#define PyLongLongArrType_Type (*(PyTypeObject *)PyArray_API[24])
+#define PyUByteArrType_Type (*(PyTypeObject *)PyArray_API[25])
+#define PyUShortArrType_Type (*(PyTypeObject *)PyArray_API[26])
+#define PyUIntArrType_Type (*(PyTypeObject *)PyArray_API[27])
+#define PyULongArrType_Type (*(PyTypeObject *)PyArray_API[28])
+#define PyULongLongArrType_Type (*(PyTypeObject *)PyArray_API[29])
+#define PyFloatArrType_Type (*(PyTypeObject *)PyArray_API[30])
+#define PyDoubleArrType_Type (*(PyTypeObject *)PyArray_API[31])
+#define PyLongDoubleArrType_Type (*(PyTypeObject *)PyArray_API[32])
+#define PyCFloatArrType_Type (*(PyTypeObject *)PyArray_API[33])
+#define PyCDoubleArrType_Type (*(PyTypeObject *)PyArray_API[34])
+#define PyCLongDoubleArrType_Type (*(PyTypeObject *)PyArray_API[35])
+#define PyObjectArrType_Type (*(PyTypeObject *)PyArray_API[36])
+#define PyStringArrType_Type (*(PyTypeObject *)PyArray_API[37])
+#define PyUnicodeArrType_Type (*(PyTypeObject *)PyArray_API[38])
+#define PyVoidArrType_Type (*(PyTypeObject *)PyArray_API[39])
+#define PyArray_SetNumericOps \
+        (*(int (*)(PyObject *)) \
+    PyArray_API[40])
+#define PyArray_GetNumericOps \
+        (*(PyObject * (*)(void)) \
+    PyArray_API[41])
+#define PyArray_INCREF \
+        (*(int (*)(PyArrayObject *)) \
+    PyArray_API[42])
+#define PyArray_XDECREF \
+        (*(int (*)(PyArrayObject *)) \
+    PyArray_API[43])
+#define PyArray_SetStringFunction \
+        (*(void (*)(PyObject *, int)) \
+    PyArray_API[44])
+#define PyArray_DescrFromType \
+        (*(PyArray_Descr * (*)(int)) \
+    PyArray_API[45])
+#define PyArray_TypeObjectFromType \
+        (*(PyObject * (*)(int)) \
+    PyArray_API[46])
+#define PyArray_Zero \
+        (*(char * (*)(PyArrayObject *)) \
+    PyArray_API[47])
+#define PyArray_One \
+        (*(char * (*)(PyArrayObject *)) \
+    PyArray_API[48])
+#define PyArray_CastToType \
+        (*(PyObject * (*)(PyArrayObject *, PyArray_Descr *, int)) \
+    PyArray_API[49])
+#define PyArray_CastTo \
+        (*(int (*)(PyArrayObject *, PyArrayObject *)) \
+    PyArray_API[50])
+#define PyArray_CastAnyTo \
+        (*(int (*)(PyArrayObject *, PyArrayObject *)) \
+    PyArray_API[51])
+#define PyArray_CanCastSafely \
+        (*(int (*)(int, int)) \
+    PyArray_API[52])
+#define PyArray_CanCastTo \
+        (*(npy_bool (*)(PyArray_Descr *, PyArray_Descr *)) \
+    PyArray_API[53])
+#define PyArray_ObjectType \
+        (*(int (*)(PyObject *, int)) \
+    PyArray_API[54])
+#define PyArray_DescrFromObject \
+        (*(PyArray_Descr * (*)(PyObject *, PyArray_Descr *)) \
+    PyArray_API[55])
+#define PyArray_ConvertToCommonType \
+        (*(PyArrayObject ** (*)(PyObject *, int *)) \
+    PyArray_API[56])
+#define PyArray_DescrFromScalar \
+        (*(PyArray_Descr * (*)(PyObject *)) \
+    PyArray_API[57])
+#define PyArray_DescrFromTypeObject \
+        (*(PyArray_Descr * (*)(PyObject *)) \
+    PyArray_API[58])
+#define PyArray_Size \
+        (*(npy_intp (*)(PyObject *)) \
+    PyArray_API[59])
+#define PyArray_Scalar \
+        (*(PyObject * (*)(void *, PyArray_Descr *, PyObject *)) \
+    PyArray_API[60])
+#define PyArray_FromScalar \
+        (*(PyObject * (*)(PyObject *, PyArray_Descr *)) \
+    PyArray_API[61])
+#define PyArray_ScalarAsCtype \
+        (*(void (*)(PyObject *, void *)) \
+    PyArray_API[62])
+#define PyArray_CastScalarToCtype \
+        (*(int (*)(PyObject *, void *, PyArray_Descr *)) \
+    PyArray_API[63])
+#define PyArray_CastScalarDirect \
+        (*(int (*)(PyObject *, PyArray_Descr *, void *, int)) \
+    PyArray_API[64])
+#define PyArray_ScalarFromObject \
+        (*(PyObject * (*)(PyObject *)) \
+    PyArray_API[65])
+#define PyArray_GetCastFunc \
+        (*(PyArray_VectorUnaryFunc * (*)(PyArray_Descr *, int)) \
+    PyArray_API[66])
+#define PyArray_FromDims \
+        (*(PyObject * (*)(int NPY_UNUSED(nd), int *NPY_UNUSED(d), int NPY_UNUSED(type))) \
+    PyArray_API[67])
+#define PyArray_FromDimsAndDataAndDescr \
+        (*(PyObject * (*)(int NPY_UNUSED(nd), int *NPY_UNUSED(d), PyArray_Descr *, char *NPY_UNUSED(data))) \
+    PyArray_API[68])
+#define PyArray_FromAny \
+        (*(PyObject * (*)(PyObject *, PyArray_Descr *, int, int, int, PyObject *)) \
+    PyArray_API[69])
+#define PyArray_EnsureArray \
+        (*(PyObject * (*)(PyObject *)) \
+    PyArray_API[70])
+#define PyArray_EnsureAnyArray \
+        (*(PyObject * (*)(PyObject *)) \
+    PyArray_API[71])
+#define PyArray_FromFile \
+        (*(PyObject * (*)(FILE *, PyArray_Descr *, npy_intp, char *)) \
+    PyArray_API[72])
+#define PyArray_FromString \
+        (*(PyObject * (*)(char *, npy_intp, PyArray_Descr *, npy_intp, char *)) \
+    PyArray_API[73])
+#define PyArray_FromBuffer \
+        (*(PyObject * (*)(PyObject *, PyArray_Descr *, npy_intp, npy_intp)) \
+    PyArray_API[74])
+#define PyArray_FromIter \
+        (*(PyObject * (*)(PyObject *, PyArray_Descr *, npy_intp)) \
+    PyArray_API[75])
+#define PyArray_Return \
+        (*(PyObject * (*)(PyArrayObject *)) \
+    PyArray_API[76])
+#define PyArray_GetField \
+        (*(PyObject * (*)(PyArrayObject *, PyArray_Descr *, int)) \
+    PyArray_API[77])
+#define PyArray_SetField \
+        (*(int (*)(PyArrayObject *, PyArray_Descr *, int, PyObject *)) \
+    PyArray_API[78])
+#define PyArray_Byteswap \
+        (*(PyObject * (*)(PyArrayObject *, npy_bool)) \
+    PyArray_API[79])
+#define PyArray_Resize \
+        (*(PyObject * (*)(PyArrayObject *, PyArray_Dims *, int, NPY_ORDER NPY_UNUSED(order))) \
+    PyArray_API[80])
+#define PyArray_MoveInto \
+        (*(int (*)(PyArrayObject *, PyArrayObject *)) \
+    PyArray_API[81])
+#define PyArray_CopyInto \
+        (*(int (*)(PyArrayObject *, PyArrayObject *)) \
+    PyArray_API[82])
+#define PyArray_CopyAnyInto \
+        (*(int (*)(PyArrayObject *, PyArrayObject *)) \
+    PyArray_API[83])
+#define PyArray_CopyObject \
+        (*(int (*)(PyArrayObject *, PyObject *)) \
+    PyArray_API[84])
+#define PyArray_NewCopy \
+        (*(PyObject * (*)(PyArrayObject *, NPY_ORDER)) \
+    PyArray_API[85])
+#define PyArray_ToList \
+        (*(PyObject * (*)(PyArrayObject *)) \
+    PyArray_API[86])
+#define PyArray_ToString \
+        (*(PyObject * (*)(PyArrayObject *, NPY_ORDER)) \
+    PyArray_API[87])
+#define PyArray_ToFile \
+        (*(int (*)(PyArrayObject *, FILE *, char *, char *)) \
+    PyArray_API[88])
+#define PyArray_Dump \
+        (*(int (*)(PyObject *, PyObject *, int)) \
+    PyArray_API[89])
+#define PyArray_Dumps \
+        (*(PyObject * (*)(PyObject *, int)) \
+    PyArray_API[90])
+#define PyArray_ValidType \
+        (*(int (*)(int)) \
+    PyArray_API[91])
+#define PyArray_UpdateFlags \
+        (*(void (*)(PyArrayObject *, int)) \
+    PyArray_API[92])
+#define PyArray_New \
+        (*(PyObject * (*)(PyTypeObject *, int, npy_intp const *, int, npy_intp const *, void *, int, int, PyObject *)) \
+    PyArray_API[93])
+#define PyArray_NewFromDescr \
+        (*(PyObject * (*)(PyTypeObject *, PyArray_Descr *, int, npy_intp const *, npy_intp const *, void *, int, PyObject *)) \
+    PyArray_API[94])
+#define PyArray_DescrNew \
+        (*(PyArray_Descr * (*)(PyArray_Descr *)) \
+    PyArray_API[95])
+#define PyArray_DescrNewFromType \
+        (*(PyArray_Descr * (*)(int)) \
+    PyArray_API[96])
+#define PyArray_GetPriority \
+        (*(double (*)(PyObject *, double)) \
+    PyArray_API[97])
+#define PyArray_IterNew \
+        (*(PyObject * (*)(PyObject *)) \
+    PyArray_API[98])
+#define PyArray_MultiIterNew \
+        (*(PyObject* (*)(int, ...)) \
+    PyArray_API[99])
+#define PyArray_PyIntAsInt \
+        (*(int (*)(PyObject *)) \
+    PyArray_API[100])
+#define PyArray_PyIntAsIntp \
+        (*(npy_intp (*)(PyObject *)) \
+    PyArray_API[101])
+#define PyArray_Broadcast \
+        (*(int (*)(PyArrayMultiIterObject *)) \
+    PyArray_API[102])
+#define PyArray_FillObjectArray \
+        (*(void (*)(PyArrayObject *, PyObject *)) \
+    PyArray_API[103])
+#define PyArray_FillWithScalar \
+        (*(int (*)(PyArrayObject *, PyObject *)) \
+    PyArray_API[104])
+#define PyArray_CheckStrides \
+        (*(npy_bool (*)(int, int, npy_intp, npy_intp, npy_intp const *, npy_intp const *)) \
+    PyArray_API[105])
+#define PyArray_DescrNewByteorder \
+        (*(PyArray_Descr * (*)(PyArray_Descr *, char)) \
+    PyArray_API[106])
+#define PyArray_IterAllButAxis \
+        (*(PyObject * (*)(PyObject *, int *)) \
+    PyArray_API[107])
+#define PyArray_CheckFromAny \
+        (*(PyObject * (*)(PyObject *, PyArray_Descr *, int, int, int, PyObject *)) \
+    PyArray_API[108])
+#define PyArray_FromArray \
+        (*(PyObject * (*)(PyArrayObject *, PyArray_Descr *, int)) \
+    PyArray_API[109])
+#define PyArray_FromInterface \
+        (*(PyObject * (*)(PyObject *)) \
+    PyArray_API[110])
+#define PyArray_FromStructInterface \
+        (*(PyObject * (*)(PyObject *)) \
+    PyArray_API[111])
+#define PyArray_FromArrayAttr \
+        (*(PyObject * (*)(PyObject *, PyArray_Descr *, PyObject *)) \
+    PyArray_API[112])
+#define PyArray_ScalarKind \
+        (*(NPY_SCALARKIND (*)(int, PyArrayObject **)) \
+    PyArray_API[113])
+#define PyArray_CanCoerceScalar \
+        (*(int (*)(int, int, NPY_SCALARKIND)) \
+    PyArray_API[114])
+#define PyArray_NewFlagsObject \
+        (*(PyObject * (*)(PyObject *)) \
+    PyArray_API[115])
+#define PyArray_CanCastScalar \
+        (*(npy_bool (*)(PyTypeObject *, PyTypeObject *)) \
+    PyArray_API[116])
+#define PyArray_CompareUCS4 \
+        (*(int (*)(npy_ucs4 const *, npy_ucs4 const *, size_t)) \
+    PyArray_API[117])
+#define PyArray_RemoveSmallest \
+        (*(int (*)(PyArrayMultiIterObject *)) \
+    PyArray_API[118])
+#define PyArray_ElementStrides \
+        (*(int (*)(PyObject *)) \
+    PyArray_API[119])
+#define PyArray_Item_INCREF \
+        (*(void (*)(char *, PyArray_Descr *)) \
+    PyArray_API[120])
+#define PyArray_Item_XDECREF \
+        (*(void (*)(char *, PyArray_Descr *)) \
+    PyArray_API[121])
+#define PyArray_FieldNames \
+        (*(PyObject * (*)(PyObject *)) \
+    PyArray_API[122])
+#define PyArray_Transpose \
+        (*(PyObject * (*)(PyArrayObject *, PyArray_Dims *)) \
+    PyArray_API[123])
+#define PyArray_TakeFrom \
+        (*(PyObject * (*)(PyArrayObject *, PyObject *, int, PyArrayObject *, NPY_CLIPMODE)) \
+    PyArray_API[124])
+#define PyArray_PutTo \
+        (*(PyObject * (*)(PyArrayObject *, PyObject*, PyObject *, NPY_CLIPMODE)) \
+    PyArray_API[125])
+#define PyArray_PutMask \
+        (*(PyObject * (*)(PyArrayObject *, PyObject*, PyObject*)) \
+    PyArray_API[126])
+#define PyArray_Repeat \
+        (*(PyObject * (*)(PyArrayObject *, PyObject *, int)) \
+    PyArray_API[127])
+#define PyArray_Choose \
+        (*(PyObject * (*)(PyArrayObject *, PyObject *, PyArrayObject *, NPY_CLIPMODE)) \
+    PyArray_API[128])
+#define PyArray_Sort \
+        (*(int (*)(PyArrayObject *, int, NPY_SORTKIND)) \
+    PyArray_API[129])
+#define PyArray_ArgSort \
+        (*(PyObject * (*)(PyArrayObject *, int, NPY_SORTKIND)) \
+    PyArray_API[130])
+#define PyArray_SearchSorted \
+        (*(PyObject * (*)(PyArrayObject *, PyObject *, NPY_SEARCHSIDE, PyObject *)) \
+    PyArray_API[131])
+#define PyArray_ArgMax \
+        (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \
+    PyArray_API[132])
+#define PyArray_ArgMin \
+        (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \
+    PyArray_API[133])
+#define PyArray_Reshape \
+        (*(PyObject * (*)(PyArrayObject *, PyObject *)) \
+    PyArray_API[134])
+#define PyArray_Newshape \
+        (*(PyObject * (*)(PyArrayObject *, PyArray_Dims *, NPY_ORDER)) \
+    PyArray_API[135])
+#define PyArray_Squeeze \
+        (*(PyObject * (*)(PyArrayObject *)) \
+    PyArray_API[136])
+#define PyArray_View \
+        (*(PyObject * (*)(PyArrayObject *, PyArray_Descr *, PyTypeObject *)) \
+    PyArray_API[137])
+#define PyArray_SwapAxes \
+        (*(PyObject * (*)(PyArrayObject *, int, int)) \
+    PyArray_API[138])
+#define PyArray_Max \
+        (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \
+    PyArray_API[139])
+#define PyArray_Min \
+        (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \
+    PyArray_API[140])
+#define PyArray_Ptp \
+        (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \
+    PyArray_API[141])
+#define PyArray_Mean \
+        (*(PyObject * (*)(PyArrayObject *, int, int, PyArrayObject *)) \
+    PyArray_API[142])
+#define PyArray_Trace \
+        (*(PyObject * (*)(PyArrayObject *, int, int, int, int, PyArrayObject *)) \
+    PyArray_API[143])
+#define PyArray_Diagonal \
+        (*(PyObject * (*)(PyArrayObject *, int, int, int)) \
+    PyArray_API[144])
+#define PyArray_Clip \
+        (*(PyObject * (*)(PyArrayObject *, PyObject *, PyObject *, PyArrayObject *)) \
+    PyArray_API[145])
+#define PyArray_Conjugate \
+        (*(PyObject * (*)(PyArrayObject *, PyArrayObject *)) \
+    PyArray_API[146])
+#define PyArray_Nonzero \
+        (*(PyObject * (*)(PyArrayObject *)) \
+    PyArray_API[147])
+#define PyArray_Std \
+        (*(PyObject * (*)(PyArrayObject *, int, int, PyArrayObject *, int)) \
+    PyArray_API[148])
+#define PyArray_Sum \
+        (*(PyObject * (*)(PyArrayObject *, int, int, PyArrayObject *)) \
+    PyArray_API[149])
+#define PyArray_CumSum \
+        (*(PyObject * (*)(PyArrayObject *, int, int, PyArrayObject *)) \
+    PyArray_API[150])
+#define PyArray_Prod \
+        (*(PyObject * (*)(PyArrayObject *, int, int, PyArrayObject *)) \
+    PyArray_API[151])
+#define PyArray_CumProd \
+        (*(PyObject * (*)(PyArrayObject *, int, int, PyArrayObject *)) \
+    PyArray_API[152])
+#define PyArray_All \
+        (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \
+    PyArray_API[153])
+#define PyArray_Any \
+        (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \
+    PyArray_API[154])
+#define PyArray_Compress \
+        (*(PyObject * (*)(PyArrayObject *, PyObject *, int, PyArrayObject *)) \
+    PyArray_API[155])
+#define PyArray_Flatten \
+        (*(PyObject * (*)(PyArrayObject *, NPY_ORDER)) \
+    PyArray_API[156])
+#define PyArray_Ravel \
+        (*(PyObject * (*)(PyArrayObject *, NPY_ORDER)) \
+    PyArray_API[157])
+#define PyArray_MultiplyList \
+        (*(npy_intp (*)(npy_intp const *, int)) \
+    PyArray_API[158])
+#define PyArray_MultiplyIntList \
+        (*(int (*)(int const *, int)) \
+    PyArray_API[159])
+#define PyArray_GetPtr \
+        (*(void * (*)(PyArrayObject *, npy_intp const*)) \
+    PyArray_API[160])
+#define PyArray_CompareLists \
+        (*(int (*)(npy_intp const *, npy_intp const *, int)) \
+    PyArray_API[161])
+#define PyArray_AsCArray \
+        (*(int (*)(PyObject **, void *, npy_intp *, int, PyArray_Descr*)) \
+    PyArray_API[162])
+#define PyArray_As1D \
+        (*(int (*)(PyObject **NPY_UNUSED(op), char **NPY_UNUSED(ptr), int *NPY_UNUSED(d1), int NPY_UNUSED(typecode))) \
+    PyArray_API[163])
+#define PyArray_As2D \
+        (*(int (*)(PyObject **NPY_UNUSED(op), char ***NPY_UNUSED(ptr), int *NPY_UNUSED(d1), int *NPY_UNUSED(d2), int NPY_UNUSED(typecode))) \
+    PyArray_API[164])
+#define PyArray_Free \
+        (*(int (*)(PyObject *, void *)) \
+    PyArray_API[165])
+#define PyArray_Converter \
+        (*(int (*)(PyObject *, PyObject **)) \
+    PyArray_API[166])
+#define PyArray_IntpFromSequence \
+        (*(int (*)(PyObject *, npy_intp *, int)) \
+    PyArray_API[167])
+#define PyArray_Concatenate \
+        (*(PyObject * (*)(PyObject *, int)) \
+    PyArray_API[168])
+#define PyArray_InnerProduct \
+        (*(PyObject * (*)(PyObject *, PyObject *)) \
+    PyArray_API[169])
+#define PyArray_MatrixProduct \
+        (*(PyObject * (*)(PyObject *, PyObject *)) \
+    PyArray_API[170])
+#define PyArray_CopyAndTranspose \
+        (*(PyObject * (*)(PyObject *)) \
+    PyArray_API[171])
+#define PyArray_Correlate \
+        (*(PyObject * (*)(PyObject *, PyObject *, int)) \
+    PyArray_API[172])
+#define PyArray_TypestrConvert \
+        (*(int (*)(int, int)) \
+    PyArray_API[173])
+#define PyArray_DescrConverter \
+        (*(int (*)(PyObject *, PyArray_Descr **)) \
+    PyArray_API[174])
+#define PyArray_DescrConverter2 \
+        (*(int (*)(PyObject *, PyArray_Descr **)) \
+    PyArray_API[175])
+#define PyArray_IntpConverter \
+        (*(int (*)(PyObject *, PyArray_Dims *)) \
+    PyArray_API[176])
+#define PyArray_BufferConverter \
+        (*(int (*)(PyObject *, PyArray_Chunk *)) \
+    PyArray_API[177])
+#define PyArray_AxisConverter \
+        (*(int (*)(PyObject *, int *)) \
+    PyArray_API[178])
+#define PyArray_BoolConverter \
+        (*(int (*)(PyObject *, npy_bool *)) \
+    PyArray_API[179])
+#define PyArray_ByteorderConverter \
+        (*(int (*)(PyObject *, char *)) \
+    PyArray_API[180])
+#define PyArray_OrderConverter \
+        (*(int (*)(PyObject *, NPY_ORDER *)) \
+    PyArray_API[181])
+#define PyArray_EquivTypes \
+        (*(unsigned char (*)(PyArray_Descr *, PyArray_Descr *)) \
+    PyArray_API[182])
+#define PyArray_Zeros \
+        (*(PyObject * (*)(int, npy_intp const *, PyArray_Descr *, int)) \
+    PyArray_API[183])
+#define PyArray_Empty \
+        (*(PyObject * (*)(int, npy_intp const *, PyArray_Descr *, int)) \
+    PyArray_API[184])
+#define PyArray_Where \
+        (*(PyObject * (*)(PyObject *, PyObject *, PyObject *)) \
+    PyArray_API[185])
+#define PyArray_Arange \
+        (*(PyObject * (*)(double, double, double, int)) \
+    PyArray_API[186])
+#define PyArray_ArangeObj \
+        (*(PyObject * (*)(PyObject *, PyObject *, PyObject *, PyArray_Descr *)) \
+    PyArray_API[187])
+#define PyArray_SortkindConverter \
+        (*(int (*)(PyObject *, NPY_SORTKIND *)) \
+    PyArray_API[188])
+#define PyArray_LexSort \
+        (*(PyObject * (*)(PyObject *, int)) \
+    PyArray_API[189])
+#define PyArray_Round \
+        (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \
+    PyArray_API[190])
+#define PyArray_EquivTypenums \
+        (*(unsigned char (*)(int, int)) \
+    PyArray_API[191])
+#define PyArray_RegisterDataType \
+        (*(int (*)(PyArray_Descr *)) \
+    PyArray_API[192])
+#define PyArray_RegisterCastFunc \
+        (*(int (*)(PyArray_Descr *, int, PyArray_VectorUnaryFunc *)) \
+    PyArray_API[193])
+#define PyArray_RegisterCanCast \
+        (*(int (*)(PyArray_Descr *, int, NPY_SCALARKIND)) \
+    PyArray_API[194])
+#define PyArray_InitArrFuncs \
+        (*(void (*)(PyArray_ArrFuncs *)) \
+    PyArray_API[195])
+#define PyArray_IntTupleFromIntp \
+        (*(PyObject * (*)(int, npy_intp const *)) \
+    PyArray_API[196])
+#define PyArray_TypeNumFromName \
+        (*(int (*)(char const *)) \
+    PyArray_API[197])
+#define PyArray_ClipmodeConverter \
+        (*(int (*)(PyObject *, NPY_CLIPMODE *)) \
+    PyArray_API[198])
+#define PyArray_OutputConverter \
+        (*(int (*)(PyObject *, PyArrayObject **)) \
+    PyArray_API[199])
+#define PyArray_BroadcastToShape \
+        (*(PyObject * (*)(PyObject *, npy_intp *, int)) \
+    PyArray_API[200])
+#define _PyArray_SigintHandler \
+        (*(void (*)(int)) \
+    PyArray_API[201])
+#define _PyArray_GetSigintBuf \
+        (*(void* (*)(void)) \
+    PyArray_API[202])
+#define PyArray_DescrAlignConverter \
+        (*(int (*)(PyObject *, PyArray_Descr **)) \
+    PyArray_API[203])
+#define PyArray_DescrAlignConverter2 \
+        (*(int (*)(PyObject *, PyArray_Descr **)) \
+    PyArray_API[204])
+#define PyArray_SearchsideConverter \
+        (*(int (*)(PyObject *, void *)) \
+    PyArray_API[205])
+#define PyArray_CheckAxis \
+        (*(PyObject * (*)(PyArrayObject *, int *, int)) \
+    PyArray_API[206])
+#define PyArray_OverflowMultiplyList \
+        (*(npy_intp (*)(npy_intp const *, int)) \
+    PyArray_API[207])
+#define PyArray_CompareString \
+        (*(int (*)(const char *, const char *, size_t)) \
+    PyArray_API[208])
+#define PyArray_MultiIterFromObjects \
+        (*(PyObject* (*)(PyObject **, int, int, ...)) \
+    PyArray_API[209])
+#define PyArray_GetEndianness \
+        (*(int (*)(void)) \
+    PyArray_API[210])
+#define PyArray_GetNDArrayCFeatureVersion \
+        (*(unsigned int (*)(void)) \
+    PyArray_API[211])
+#define PyArray_Correlate2 \
+        (*(PyObject * (*)(PyObject *, PyObject *, int)) \
+    PyArray_API[212])
+#define PyArray_NeighborhoodIterNew \
+        (*(PyObject* (*)(PyArrayIterObject *, const npy_intp *, int, PyArrayObject*)) \
+    PyArray_API[213])
+#define PyTimeIntegerArrType_Type (*(PyTypeObject *)PyArray_API[214])
+#define PyDatetimeArrType_Type (*(PyTypeObject *)PyArray_API[215])
+#define PyTimedeltaArrType_Type (*(PyTypeObject *)PyArray_API[216])
+#define PyHalfArrType_Type (*(PyTypeObject *)PyArray_API[217])
+#define NpyIter_Type (*(PyTypeObject *)PyArray_API[218])
+#define PyArray_SetDatetimeParseFunction \
+        (*(void (*)(PyObject *NPY_UNUSED(op))) \
+    PyArray_API[219])
+#define PyArray_DatetimeToDatetimeStruct \
+        (*(void (*)(npy_datetime NPY_UNUSED(val), NPY_DATETIMEUNIT NPY_UNUSED(fr), npy_datetimestruct *)) \
+    PyArray_API[220])
+#define PyArray_TimedeltaToTimedeltaStruct \
+        (*(void (*)(npy_timedelta NPY_UNUSED(val), NPY_DATETIMEUNIT NPY_UNUSED(fr), npy_timedeltastruct *)) \
+    PyArray_API[221])
+#define PyArray_DatetimeStructToDatetime \
+        (*(npy_datetime (*)(NPY_DATETIMEUNIT NPY_UNUSED(fr), npy_datetimestruct *NPY_UNUSED(d))) \
+    PyArray_API[222])
+#define PyArray_TimedeltaStructToTimedelta \
+        (*(npy_datetime (*)(NPY_DATETIMEUNIT NPY_UNUSED(fr), npy_timedeltastruct *NPY_UNUSED(d))) \
+    PyArray_API[223])
+#define NpyIter_New \
+        (*(NpyIter * (*)(PyArrayObject *, npy_uint32, NPY_ORDER, NPY_CASTING, PyArray_Descr*)) \
+    PyArray_API[224])
+#define NpyIter_MultiNew \
+        (*(NpyIter * (*)(int, PyArrayObject **, npy_uint32, NPY_ORDER, NPY_CASTING, npy_uint32 *, PyArray_Descr **)) \
+    PyArray_API[225])
+#define NpyIter_AdvancedNew \
+        (*(NpyIter * (*)(int, PyArrayObject **, npy_uint32, NPY_ORDER, NPY_CASTING, npy_uint32 *, PyArray_Descr **, int, int **, npy_intp *, npy_intp)) \
+    PyArray_API[226])
+#define NpyIter_Copy \
+        (*(NpyIter * (*)(NpyIter *)) \
+    PyArray_API[227])
+#define NpyIter_Deallocate \
+        (*(int (*)(NpyIter *)) \
+    PyArray_API[228])
+#define NpyIter_HasDelayedBufAlloc \
+        (*(npy_bool (*)(NpyIter *)) \
+    PyArray_API[229])
+#define NpyIter_HasExternalLoop \
+        (*(npy_bool (*)(NpyIter *)) \
+    PyArray_API[230])
+#define NpyIter_EnableExternalLoop \
+        (*(int (*)(NpyIter *)) \
+    PyArray_API[231])
+#define NpyIter_GetInnerStrideArray \
+        (*(npy_intp * (*)(NpyIter *)) \
+    PyArray_API[232])
+#define NpyIter_GetInnerLoopSizePtr \
+        (*(npy_intp * (*)(NpyIter *)) \
+    PyArray_API[233])
+#define NpyIter_Reset \
+        (*(int (*)(NpyIter *, char **)) \
+    PyArray_API[234])
+#define NpyIter_ResetBasePointers \
+        (*(int (*)(NpyIter *, char **, char **)) \
+    PyArray_API[235])
+#define NpyIter_ResetToIterIndexRange \
+        (*(int (*)(NpyIter *, npy_intp, npy_intp, char **)) \
+    PyArray_API[236])
+#define NpyIter_GetNDim \
+        (*(int (*)(NpyIter *)) \
+    PyArray_API[237])
+#define NpyIter_GetNOp \
+        (*(int (*)(NpyIter *)) \
+    PyArray_API[238])
+#define NpyIter_GetIterNext \
+        (*(NpyIter_IterNextFunc * (*)(NpyIter *, char **)) \
+    PyArray_API[239])
+#define NpyIter_GetIterSize \
+        (*(npy_intp (*)(NpyIter *)) \
+    PyArray_API[240])
+#define NpyIter_GetIterIndexRange \
+        (*(void (*)(NpyIter *, npy_intp *, npy_intp *)) \
+    PyArray_API[241])
+#define NpyIter_GetIterIndex \
+        (*(npy_intp (*)(NpyIter *)) \
+    PyArray_API[242])
+#define NpyIter_GotoIterIndex \
+        (*(int (*)(NpyIter *, npy_intp)) \
+    PyArray_API[243])
+#define NpyIter_HasMultiIndex \
+        (*(npy_bool (*)(NpyIter *)) \
+    PyArray_API[244])
+#define NpyIter_GetShape \
+        (*(int (*)(NpyIter *, npy_intp *)) \
+    PyArray_API[245])
+#define NpyIter_GetGetMultiIndex \
+        (*(NpyIter_GetMultiIndexFunc * (*)(NpyIter *, char **)) \
+    PyArray_API[246])
+#define NpyIter_GotoMultiIndex \
+        (*(int (*)(NpyIter *, npy_intp const *)) \
+    PyArray_API[247])
+#define NpyIter_RemoveMultiIndex \
+        (*(int (*)(NpyIter *)) \
+    PyArray_API[248])
+#define NpyIter_HasIndex \
+        (*(npy_bool (*)(NpyIter *)) \
+    PyArray_API[249])
+#define NpyIter_IsBuffered \
+        (*(npy_bool (*)(NpyIter *)) \
+    PyArray_API[250])
+#define NpyIter_IsGrowInner \
+        (*(npy_bool (*)(NpyIter *)) \
+    PyArray_API[251])
+#define NpyIter_GetBufferSize \
+        (*(npy_intp (*)(NpyIter *)) \
+    PyArray_API[252])
+#define NpyIter_GetIndexPtr \
+        (*(npy_intp * (*)(NpyIter *)) \
+    PyArray_API[253])
+#define NpyIter_GotoIndex \
+        (*(int (*)(NpyIter *, npy_intp)) \
+    PyArray_API[254])
+#define NpyIter_GetDataPtrArray \
+        (*(char ** (*)(NpyIter *)) \
+    PyArray_API[255])
+#define NpyIter_GetDescrArray \
+        (*(PyArray_Descr ** (*)(NpyIter *)) \
+    PyArray_API[256])
+#define NpyIter_GetOperandArray \
+        (*(PyArrayObject ** (*)(NpyIter *)) \
+    PyArray_API[257])
+#define NpyIter_GetIterView \
+        (*(PyArrayObject * (*)(NpyIter *, npy_intp)) \
+    PyArray_API[258])
+#define NpyIter_GetReadFlags \
+        (*(void (*)(NpyIter *, char *)) \
+    PyArray_API[259])
+#define NpyIter_GetWriteFlags \
+        (*(void (*)(NpyIter *, char *)) \
+    PyArray_API[260])
+#define NpyIter_DebugPrint \
+        (*(void (*)(NpyIter *)) \
+    PyArray_API[261])
+#define NpyIter_IterationNeedsAPI \
+        (*(npy_bool (*)(NpyIter *)) \
+    PyArray_API[262])
+#define NpyIter_GetInnerFixedStrideArray \
+        (*(void (*)(NpyIter *, npy_intp *)) \
+    PyArray_API[263])
+#define NpyIter_RemoveAxis \
+        (*(int (*)(NpyIter *, int)) \
+    PyArray_API[264])
+#define NpyIter_GetAxisStrideArray \
+        (*(npy_intp * (*)(NpyIter *, int)) \
+    PyArray_API[265])
+#define NpyIter_RequiresBuffering \
+        (*(npy_bool (*)(NpyIter *)) \
+    PyArray_API[266])
+#define NpyIter_GetInitialDataPtrArray \
+        (*(char ** (*)(NpyIter *)) \
+    PyArray_API[267])
+#define NpyIter_CreateCompatibleStrides \
+        (*(int (*)(NpyIter *, npy_intp, npy_intp *)) \
+    PyArray_API[268])
+#define PyArray_CastingConverter \
+        (*(int (*)(PyObject *, NPY_CASTING *)) \
+    PyArray_API[269])
+#define PyArray_CountNonzero \
+        (*(npy_intp (*)(PyArrayObject *)) \
+    PyArray_API[270])
+#define PyArray_PromoteTypes \
+        (*(PyArray_Descr * (*)(PyArray_Descr *, PyArray_Descr *)) \
+    PyArray_API[271])
+#define PyArray_MinScalarType \
+        (*(PyArray_Descr * (*)(PyArrayObject *)) \
+    PyArray_API[272])
+#define PyArray_ResultType \
+        (*(PyArray_Descr * (*)(npy_intp, PyArrayObject *arrs[], npy_intp, PyArray_Descr *descrs[])) \
+    PyArray_API[273])
+#define PyArray_CanCastArrayTo \
+        (*(npy_bool (*)(PyArrayObject *, PyArray_Descr *, NPY_CASTING)) \
+    PyArray_API[274])
+#define PyArray_CanCastTypeTo \
+        (*(npy_bool (*)(PyArray_Descr *, PyArray_Descr *, NPY_CASTING)) \
+    PyArray_API[275])
+#define PyArray_EinsteinSum \
+        (*(PyArrayObject * (*)(char *, npy_intp, PyArrayObject **, PyArray_Descr *, NPY_ORDER, NPY_CASTING, PyArrayObject *)) \
+    PyArray_API[276])
+#define PyArray_NewLikeArray \
+        (*(PyObject * (*)(PyArrayObject *, NPY_ORDER, PyArray_Descr *, int)) \
+    PyArray_API[277])
+#define PyArray_GetArrayParamsFromObject \
+        (*(int (*)(PyObject *NPY_UNUSED(op), PyArray_Descr *NPY_UNUSED(requested_dtype), npy_bool NPY_UNUSED(writeable), PyArray_Descr **NPY_UNUSED(out_dtype), int *NPY_UNUSED(out_ndim), npy_intp *NPY_UNUSED(out_dims), PyArrayObject **NPY_UNUSED(out_arr), PyObject *NPY_UNUSED(context))) \
+    PyArray_API[278])
+#define PyArray_ConvertClipmodeSequence \
+        (*(int (*)(PyObject *, NPY_CLIPMODE *, int)) \
+    PyArray_API[279])
+#define PyArray_MatrixProduct2 \
+        (*(PyObject * (*)(PyObject *, PyObject *, PyArrayObject*)) \
+    PyArray_API[280])
+#define NpyIter_IsFirstVisit \
+        (*(npy_bool (*)(NpyIter *, int)) \
+    PyArray_API[281])
+#define PyArray_SetBaseObject \
+        (*(int (*)(PyArrayObject *, PyObject *)) \
+    PyArray_API[282])
+#define PyArray_CreateSortedStridePerm \
+        (*(void (*)(int, npy_intp const *, npy_stride_sort_item *)) \
+    PyArray_API[283])
+#define PyArray_RemoveAxesInPlace \
+        (*(void (*)(PyArrayObject *, const npy_bool *)) \
+    PyArray_API[284])
+#define PyArray_DebugPrint \
+        (*(void (*)(PyArrayObject *)) \
+    PyArray_API[285])
+#define PyArray_FailUnlessWriteable \
+        (*(int (*)(PyArrayObject *, const char *)) \
+    PyArray_API[286])
+#define PyArray_SetUpdateIfCopyBase \
+        (*(int (*)(PyArrayObject *, PyArrayObject *)) \
+    PyArray_API[287])
+#define PyDataMem_NEW \
+        (*(void * (*)(size_t)) \
+    PyArray_API[288])
+#define PyDataMem_FREE \
+        (*(void (*)(void *)) \
+    PyArray_API[289])
+#define PyDataMem_RENEW \
+        (*(void * (*)(void *, size_t)) \
+    PyArray_API[290])
+#define PyDataMem_SetEventHook \
+        (*(PyDataMem_EventHookFunc * (*)(PyDataMem_EventHookFunc *, void *, void **)) \
+    PyArray_API[291])
+#define NPY_DEFAULT_ASSIGN_CASTING (*(NPY_CASTING *)PyArray_API[292])
+#define PyArray_MapIterSwapAxes \
+        (*(void (*)(PyArrayMapIterObject *, PyArrayObject **, int)) \
+    PyArray_API[293])
+#define PyArray_MapIterArray \
+        (*(PyObject * (*)(PyArrayObject *, PyObject *)) \
+    PyArray_API[294])
+#define PyArray_MapIterNext \
+        (*(void (*)(PyArrayMapIterObject *)) \
+    PyArray_API[295])
+#define PyArray_Partition \
+        (*(int (*)(PyArrayObject *, PyArrayObject *, int, NPY_SELECTKIND)) \
+    PyArray_API[296])
+#define PyArray_ArgPartition \
+        (*(PyObject * (*)(PyArrayObject *, PyArrayObject *, int, NPY_SELECTKIND)) \
+    PyArray_API[297])
+#define PyArray_SelectkindConverter \
+        (*(int (*)(PyObject *, NPY_SELECTKIND *)) \
+    PyArray_API[298])
+#define PyDataMem_NEW_ZEROED \
+        (*(void * (*)(size_t, size_t)) \
+    PyArray_API[299])
+#define PyArray_CheckAnyScalarExact \
+        (*(int (*)(PyObject *)) \
+    PyArray_API[300])
+#define PyArray_MapIterArrayCopyIfOverlap \
+        (*(PyObject * (*)(PyArrayObject *, PyObject *, int, PyArrayObject *)) \
+    PyArray_API[301])
+#define PyArray_ResolveWritebackIfCopy \
+        (*(int (*)(PyArrayObject *)) \
+    PyArray_API[302])
+#define PyArray_SetWritebackIfCopyBase \
+        (*(int (*)(PyArrayObject *, PyArrayObject *)) \
+    PyArray_API[303])
+
+#if NPY_FEATURE_VERSION >= NPY_1_22_API_VERSION
+#define PyDataMem_SetHandler \
+        (*(PyObject * (*)(PyObject *)) \
+    PyArray_API[304])
+#endif /* NPY_FEATURE_VERSION >= NPY_1_22_API_VERSION */
+/* Like PyDataMem_SetHandler above, PyDataMem_GetHandler is only defined when NPY_FEATURE_VERSION >= NPY_1_22_API_VERSION. */
+#if NPY_FEATURE_VERSION >= NPY_1_22_API_VERSION
+#define PyDataMem_GetHandler \
+        (*(PyObject * (*)(void)) \
+    PyArray_API[305])
+#endif /* NPY_FEATURE_VERSION >= NPY_1_22_API_VERSION */
+#define PyDataMem_DefaultHandler (*(PyObject* *)PyArray_API[306]) /* dereferences to a PyObject* variable; not version-gated, unlike slots 304/305 */
+
+#if !defined(NO_IMPORT_ARRAY) && !defined(NO_IMPORT)
+static int /* returns 0 on success, -1 on any failure */
+_import_array(void) /* import numpy's C extension, load its C-API table into PyArray_API, then validate version and endianness */
+{
+  int st;
+  PyObject *numpy = PyImport_ImportModule("numpy.core._multiarray_umath");
+  PyObject *c_api = NULL;
+
+  if (numpy == NULL) {
+      return -1; /* no error is set here; the one raised by the failed import is left pending */
+  }
+  c_api = PyObject_GetAttrString(numpy, "_ARRAY_API");
+  Py_DECREF(numpy); /* the module object was only needed for the attribute lookup */
+  if (c_api == NULL) {
+      return -1;
+  }
+
+  if (!PyCapsule_CheckExact(c_api)) { /* must be exactly a PyCapsule before its pointer is extracted */
+      PyErr_SetString(PyExc_RuntimeError, "_ARRAY_API is not PyCapsule object");
+      Py_DECREF(c_api);
+      return -1;
+  }
+  PyArray_API = (void **)PyCapsule_GetPointer(c_api, NULL); /* NULL name: the capsule is looked up as unnamed */
+  Py_DECREF(c_api); /* NOTE(review): PyArray_API is used after this release; presumably it points at static storage inside numpy - confirm */
+  if (PyArray_API == NULL) {
+      PyErr_SetString(PyExc_RuntimeError, "_ARRAY_API is NULL pointer");
+      return -1;
+  }
+
+  /* Perform runtime check of C API version: the ABI version must match exactly, the feature version must not exceed numpy's */
+  if (NPY_VERSION != PyArray_GetNDArrayCVersion()) {
+      PyErr_Format(PyExc_RuntimeError, "module compiled against "\
+             "ABI version 0x%x but this version of numpy is 0x%x", \
+             (int) NPY_VERSION, (int) PyArray_GetNDArrayCVersion());
+      return -1;
+  }
+  if (NPY_FEATURE_VERSION > PyArray_GetNDArrayCFeatureVersion()) {
+      PyErr_Format(PyExc_RuntimeError, "module compiled against "\
+             "API version 0x%x but this version of numpy is 0x%x . "\
+             "Check the section C-API incompatibility at the "\
+             "Troubleshooting ImportError section at "\
+             "https://numpy.org/devdocs/user/troubleshooting-importerror.html"\
+             "#c-api-incompatibility "\
+              "for indications on how to solve this problem .", \
+             (int) NPY_FEATURE_VERSION, (int) PyArray_GetNDArrayCFeatureVersion());
+      return -1;
+  }
+
+  /*
+   * Perform runtime check of endianness and check it matches the one set by
+   * the headers (npy_endian.h) as a safeguard
+   */
+  st = PyArray_GetEndianness(); /* called through the freshly loaded table (slot 210) */
+  if (st == NPY_CPU_UNKNOWN_ENDIAN) {
+      PyErr_SetString(PyExc_RuntimeError,
+                      "FATAL: module compiled as unknown endian");
+      return -1;
+  }
+#if NPY_BYTE_ORDER == NPY_BIG_ENDIAN
+  if (st != NPY_CPU_BIG) {
+      PyErr_SetString(PyExc_RuntimeError,
+                      "FATAL: module compiled as big endian, but "
+                      "detected different endianness at runtime");
+      return -1;
+  }
+#elif NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN
+  if (st != NPY_CPU_LITTLE) {
+      PyErr_SetString(PyExc_RuntimeError,
+                      "FATAL: module compiled as little endian, but "
+                      "detected different endianness at runtime");
+      return -1;
+  }
+#endif /* NPY_BYTE_ORDER; if neither branch matched, no compile-time/runtime comparison is emitted */
+
+  return 0;
+}
+
+#define import_array() {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); return NULL; } }
+/* All three macros: if _import_array() fails, print the pending error (PyErr_Print), set ImportError, and return from the *calling* function (NULL, ret, ret respectively). */
+#define import_array1(ret) {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); return ret; } }
+/* They expand to a bare { } block, not do { } while (0), so `if (c) import_array(); else ...` does not compile; import_array2 also lets the caller supply the ImportError text. */
+#define import_array2(msg, ret) {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, msg); return ret; } }
+
+#endif
+
+#endif
diff --git a/nanvix-port/generated-headers/__ufunc_api.c b/nanvix-port/generated-headers/__ufunc_api.c
new file mode 100644
index 000000000000..d1b4a87bb6a0
--- /dev/null
+++ b/nanvix-port/generated-headers/__ufunc_api.c
@@ -0,0 +1,50 @@
+
+/* These pointers will be stored in the C-object for use in other
+    extension modules
+*/
+
+void *PyUFunc_API[] = { /* order is the ABI: entry i is what the PyUFunc_API[i] accessor macros in __ufunc_api.h dereference */
+        (void *) &PyUFunc_Type, /* 0 (the only entry that is an object address rather than a function) */
+        (void *) PyUFunc_FromFuncAndData, /* 1 */
+        (void *) PyUFunc_RegisterLoopForType, /* 2 */
+        (void *) PyUFunc_GenericFunction, /* 3 */
+        (void *) PyUFunc_f_f_As_d_d, /* 4 */
+        (void *) PyUFunc_d_d, /* 5 */
+        (void *) PyUFunc_f_f, /* 6 */
+        (void *) PyUFunc_g_g, /* 7 */
+        (void *) PyUFunc_F_F_As_D_D, /* 8 */
+        (void *) PyUFunc_F_F, /* 9 */
+        (void *) PyUFunc_D_D, /* 10 */
+        (void *) PyUFunc_G_G, /* 11 */
+        (void *) PyUFunc_O_O, /* 12 */
+        (void *) PyUFunc_ff_f_As_dd_d, /* 13 */
+        (void *) PyUFunc_ff_f, /* 14 */
+        (void *) PyUFunc_dd_d, /* 15 */
+        (void *) PyUFunc_gg_g, /* 16 */
+        (void *) PyUFunc_FF_F_As_DD_D, /* 17 */
+        (void *) PyUFunc_DD_D, /* 18 */
+        (void *) PyUFunc_FF_F, /* 19 */
+        (void *) PyUFunc_GG_G, /* 20 */
+        (void *) PyUFunc_OO_O, /* 21 */
+        (void *) PyUFunc_O_O_method, /* 22 */
+        (void *) PyUFunc_OO_O_method, /* 23 */
+        (void *) PyUFunc_On_Om, /* 24 */
+        (void *) PyUFunc_GetPyValues, /* 25 */
+        (void *) PyUFunc_checkfperr, /* 26 */
+        (void *) PyUFunc_clearfperr, /* 27 */
+        (void *) PyUFunc_getfperr, /* 28 */
+        (void *) PyUFunc_handlefperr, /* 29 */
+        (void *) PyUFunc_ReplaceLoopBySignature, /* 30 */
+        (void *) PyUFunc_FromFuncAndDataAndSignature, /* 31 */
+        (void *) PyUFunc_SetUsesArraysAsData, /* 32 */
+        (void *) PyUFunc_e_e, /* 33 */
+        (void *) PyUFunc_e_e_As_f_f, /* 34 */
+        (void *) PyUFunc_e_e_As_d_d, /* 35 */
+        (void *) PyUFunc_ee_e, /* 36 */
+        (void *) PyUFunc_ee_e_As_ff_f, /* 37 */
+        (void *) PyUFunc_ee_e_As_dd_d, /* 38 */
+        (void *) PyUFunc_DefaultTypeResolver, /* 39 */
+        (void *) PyUFunc_ValidateCasting, /* 40 */
+        (void *) PyUFunc_RegisterLoopForDescr, /* 41 */
+        (void *) PyUFunc_FromFuncAndDataAndSignatureAndIdentity /* 42 */
+};
diff --git a/nanvix-port/generated-headers/__ufunc_api.h b/nanvix-port/generated-headers/__ufunc_api.h
new file mode 100644
index 000000000000..e2efe29e8635
--- /dev/null
+++ b/nanvix-port/generated-headers/__ufunc_api.h
@@ -0,0 +1,314 @@
+
+#ifdef _UMATHMODULE
+
+/* Type object whose address occupies slot 0 of PyUFunc_API; declared once (the repeated extern was redundant). */
+extern NPY_NO_EXPORT PyTypeObject PyUFunc_Type;
+
+/* Prototypes, visible only when _UMATHMODULE is defined, for the functions whose addresses are listed in the PyUFunc_API[] table. */
+NPY_NO_EXPORT  PyObject * PyUFunc_FromFuncAndData \
+       (PyUFuncGenericFunction *, void **, char *, int, int, int, int, const char *, const char *, int);
+NPY_NO_EXPORT  int PyUFunc_RegisterLoopForType \
+       (PyUFuncObject *, int, PyUFuncGenericFunction, const int *, void *);
+NPY_NO_EXPORT  int PyUFunc_GenericFunction \
+       (PyUFuncObject *NPY_UNUSED(ufunc), PyObject *NPY_UNUSED(args), PyObject *NPY_UNUSED(kwds), PyArrayObject **NPY_UNUSED(op)); /* every parameter is tagged NPY_UNUSED */
+NPY_NO_EXPORT  void PyUFunc_f_f_As_d_d \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_d_d \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_f_f \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_g_g \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_F_F_As_D_D \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_F_F \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_D_D \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_G_G \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_O_O \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_ff_f_As_dd_d \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_ff_f \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_dd_d \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_gg_g \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_FF_F_As_DD_D \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_DD_D \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_FF_F \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_GG_G \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_OO_O \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_O_O_method \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_OO_O_method \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_On_Om \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  int PyUFunc_GetPyValues \
+       (char *, int *, int *, PyObject **);
+NPY_NO_EXPORT  int PyUFunc_checkfperr \
+       (int, PyObject *, int *);
+NPY_NO_EXPORT  void PyUFunc_clearfperr \
+       (void);
+NPY_NO_EXPORT  int PyUFunc_getfperr \
+       (void);
+NPY_NO_EXPORT  int PyUFunc_handlefperr \
+       (int, PyObject *, int, int *);
+NPY_NO_EXPORT  int PyUFunc_ReplaceLoopBySignature \
+       (PyUFuncObject *, PyUFuncGenericFunction, const int *, PyUFuncGenericFunction *);
+NPY_NO_EXPORT  PyObject * PyUFunc_FromFuncAndDataAndSignature \
+       (PyUFuncGenericFunction *, void **, char *, int, int, int, int, const char *, const char *, int, const char *);
+NPY_NO_EXPORT  int PyUFunc_SetUsesArraysAsData \
+       (void **NPY_UNUSED(data), size_t NPY_UNUSED(i)); /* every parameter is tagged NPY_UNUSED */
+NPY_NO_EXPORT  void PyUFunc_e_e \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_e_e_As_f_f \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_e_e_As_d_d \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_ee_e \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_ee_e_As_ff_f \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  void PyUFunc_ee_e_As_dd_d \
+       (char **, npy_intp const *, npy_intp const *, void *);
+NPY_NO_EXPORT  int PyUFunc_DefaultTypeResolver \
+       (PyUFuncObject *, NPY_CASTING, PyArrayObject **, PyObject *, PyArray_Descr **);
+NPY_NO_EXPORT  int PyUFunc_ValidateCasting \
+       (PyUFuncObject *, NPY_CASTING, PyArrayObject **, PyArray_Descr **);
+NPY_NO_EXPORT  int PyUFunc_RegisterLoopForDescr \
+       (PyUFuncObject *, PyArray_Descr *, PyUFuncGenericFunction, PyArray_Descr **, void *);
+NPY_NO_EXPORT  PyObject * PyUFunc_FromFuncAndDataAndSignatureAndIdentity \
+       (PyUFuncGenericFunction *, void **, char *, int, int, int, int, const char *, const char *, const int, const char *, PyObject *);
+
+#else
+/* Chooses how the PyUFunc_API table pointer is declared in code compiled without _UMATHMODULE. */
+#if defined(PY_UFUNC_UNIQUE_SYMBOL)
+#define PyUFunc_API PY_UFUNC_UNIQUE_SYMBOL
+#endif /* from here on PyUFunc_API may expand to the caller-chosen symbol */
+
+#if defined(NO_IMPORT) || defined(NO_IMPORT_UFUNC)
+extern void **PyUFunc_API; /* declaration only: the definition has to come from another translation unit */
+#else
+#if defined(PY_UFUNC_UNIQUE_SYMBOL)
+void **PyUFunc_API; /* non-static definition, so the extern declaration above can resolve to it */
+#else
+static void **PyUFunc_API=NULL; /* file-local copy, NULL until _import_umath() assigns it */
+#endif
+#endif
+/* Accessor macros: each public name expands to slot N of PyUFunc_API cast to that symbol's type; N must match the entry order in the PyUFunc_API[] table. */
+#define PyUFunc_Type (*(PyTypeObject *)PyUFunc_API[0])
+#define PyUFunc_FromFuncAndData \
+        (*(PyObject * (*)(PyUFuncGenericFunction *, void **, char *, int, int, int, int, const char *, const char *, int)) \
+    PyUFunc_API[1])
+#define PyUFunc_RegisterLoopForType \
+        (*(int (*)(PyUFuncObject *, int, PyUFuncGenericFunction, const int *, void *)) \
+    PyUFunc_API[2])
+#define PyUFunc_GenericFunction \
+        (*(int (*)(PyUFuncObject *NPY_UNUSED(ufunc), PyObject *NPY_UNUSED(args), PyObject *NPY_UNUSED(kwds), PyArrayObject **NPY_UNUSED(op))) \
+    PyUFunc_API[3])
+#define PyUFunc_f_f_As_d_d \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[4])
+#define PyUFunc_d_d \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[5])
+#define PyUFunc_f_f \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[6])
+#define PyUFunc_g_g \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[7])
+#define PyUFunc_F_F_As_D_D \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[8])
+#define PyUFunc_F_F \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[9])
+#define PyUFunc_D_D \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[10])
+#define PyUFunc_G_G \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[11])
+#define PyUFunc_O_O \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[12])
+#define PyUFunc_ff_f_As_dd_d \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[13])
+#define PyUFunc_ff_f \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[14])
+#define PyUFunc_dd_d \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[15])
+#define PyUFunc_gg_g \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[16])
+#define PyUFunc_FF_F_As_DD_D \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[17])
+#define PyUFunc_DD_D \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[18])
+#define PyUFunc_FF_F \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[19])
+#define PyUFunc_GG_G \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[20])
+#define PyUFunc_OO_O \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[21])
+#define PyUFunc_O_O_method \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[22])
+#define PyUFunc_OO_O_method \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[23])
+#define PyUFunc_On_Om \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[24])
+#define PyUFunc_GetPyValues \
+        (*(int (*)(char *, int *, int *, PyObject **)) \
+    PyUFunc_API[25])
+#define PyUFunc_checkfperr \
+        (*(int (*)(int, PyObject *, int *)) \
+    PyUFunc_API[26])
+#define PyUFunc_clearfperr \
+        (*(void (*)(void)) \
+    PyUFunc_API[27])
+#define PyUFunc_getfperr \
+        (*(int (*)(void)) \
+    PyUFunc_API[28])
+#define PyUFunc_handlefperr \
+        (*(int (*)(int, PyObject *, int, int *)) \
+    PyUFunc_API[29])
+#define PyUFunc_ReplaceLoopBySignature \
+        (*(int (*)(PyUFuncObject *, PyUFuncGenericFunction, const int *, PyUFuncGenericFunction *)) \
+    PyUFunc_API[30])
+#define PyUFunc_FromFuncAndDataAndSignature \
+        (*(PyObject * (*)(PyUFuncGenericFunction *, void **, char *, int, int, int, int, const char *, const char *, int, const char *)) \
+    PyUFunc_API[31])
+#define PyUFunc_SetUsesArraysAsData \
+        (*(int (*)(void **NPY_UNUSED(data), size_t NPY_UNUSED(i))) \
+    PyUFunc_API[32])
+#define PyUFunc_e_e \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[33])
+#define PyUFunc_e_e_As_f_f \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[34])
+#define PyUFunc_e_e_As_d_d \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[35])
+#define PyUFunc_ee_e \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[36])
+#define PyUFunc_ee_e_As_ff_f \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[37])
+#define PyUFunc_ee_e_As_dd_d \
+        (*(void (*)(char **, npy_intp const *, npy_intp const *, void *)) \
+    PyUFunc_API[38])
+#define PyUFunc_DefaultTypeResolver \
+        (*(int (*)(PyUFuncObject *, NPY_CASTING, PyArrayObject **, PyObject *, PyArray_Descr **)) \
+    PyUFunc_API[39])
+#define PyUFunc_ValidateCasting \
+        (*(int (*)(PyUFuncObject *, NPY_CASTING, PyArrayObject **, PyArray_Descr **)) \
+    PyUFunc_API[40])
+#define PyUFunc_RegisterLoopForDescr \
+        (*(int (*)(PyUFuncObject *, PyArray_Descr *, PyUFuncGenericFunction, PyArray_Descr **, void *)) \
+    PyUFunc_API[41])
+/* The slot 42 accessor is defined only when NPY_FEATURE_VERSION is at least NPY_1_16_API_VERSION. */
+#if NPY_FEATURE_VERSION >= NPY_1_16_API_VERSION
+#define PyUFunc_FromFuncAndDataAndSignatureAndIdentity \
+        (*(PyObject * (*)(PyUFuncGenericFunction *, void **, char *, int, int, int, int, const char *, const char *, const int, const char *, PyObject *)) \
+    PyUFunc_API[42])
+#endif /* NPY_FEATURE_VERSION >= NPY_1_16_API_VERSION */
+/* Imports numpy.core._multiarray_umath and stores the pointer held by its _UFUNC_API capsule in PyUFunc_API. Returns 0 on success, -1 with a Python exception set on failure. */
+static inline int
+_import_umath(void)
+{
+  PyObject *numpy = PyImport_ImportModule("numpy.core._multiarray_umath");
+  PyObject *c_api = NULL;
+  /* Every failure branch below sets its own exception before returning -1. */
+  if (numpy == NULL) {
+      PyErr_SetString(PyExc_ImportError,
+                      "numpy.core._multiarray_umath failed to import");
+      return -1;
+  }
+  c_api = PyObject_GetAttrString(numpy, "_UFUNC_API");
+  Py_DECREF(numpy);  /* the module reference is not used again after the attribute lookup */
+  if (c_api == NULL) {
+      PyErr_SetString(PyExc_AttributeError, "_UFUNC_API not found");
+      return -1;
+  }
+  /* Only an exact PyCapsule is accepted as the carrier of the table pointer. */
+  if (!PyCapsule_CheckExact(c_api)) {
+      PyErr_SetString(PyExc_RuntimeError, "_UFUNC_API is not PyCapsule object");
+      Py_DECREF(c_api);
+      return -1;
+  }
+  PyUFunc_API = (void **)PyCapsule_GetPointer(c_api, NULL);  /* NULL name argument: the capsule is expected to be unnamed */
+  Py_DECREF(c_api);
+  if (PyUFunc_API == NULL) {
+      PyErr_SetString(PyExc_RuntimeError, "_UFUNC_API is NULL pointer");
+      return -1;
+  }
+  return 0;
+}
+/* Expands UFUNC_NOFPE, then runs _import_umath(); on failure prints the pending error, sets ImportError and returns NULL from the calling function. */
+#define import_umath() \
+    do {\
+        UFUNC_NOFPE\
+        if (_import_umath() < 0) {\
+            PyErr_Print();\
+            PyErr_SetString(PyExc_ImportError,\
+                    "numpy.core.umath failed to import");\
+            return NULL;\
+        }\
+    } while(0)
+/* Same steps as import_umath(), but the calling function returns `ret` on failure instead of NULL. */
+#define import_umath1(ret) \
+    do {\
+        UFUNC_NOFPE\
+        if (_import_umath() < 0) {\
+            PyErr_Print();\
+            PyErr_SetString(PyExc_ImportError,\
+                    "numpy.core.umath failed to import");\
+            return ret;\
+        }\
+    } while(0)
+/* Same steps as import_umath1(), but the ImportError text is `msg`. Argument order is (ret, msg), the reverse of import_array2(msg, ret). */
+#define import_umath2(ret, msg) \
+    do {\
+        UFUNC_NOFPE\
+        if (_import_umath() < 0) {\
+            PyErr_Print();\
+            PyErr_SetString(PyExc_ImportError, msg);\
+            return ret;\
+        }\
+    } while(0)
+/* Same steps as import_umath() but with no return statement: after a failed import, ImportError is set and execution continues in the caller. */
+#define import_ufunc() \
+    do {\
+        UFUNC_NOFPE\
+        if (_import_umath() < 0) {\
+            PyErr_Print();\
+            PyErr_SetString(PyExc_ImportError,\
+                    "numpy.core.umath failed to import");\
+        }\
+    } while(0)
+
+#endif
diff --git a/nanvix-port/generated-headers/__umath_generated.c b/nanvix-port/generated-headers/__umath_generated.c
new file mode 100644
index 000000000000..bebeef603497
--- /dev/null
+++ b/nanvix-port/generated-headers/__umath_generated.c
@@ -0,0 +1,6409 @@
+
+
+/** Warning this file is autogenerated!!!
+
+    Please make changes to the code generator program (generate_umath.py)
+**/
+#include "ufunc_object.h"
+#include "ufunc_type_resolution.h"
+#include "loops.h"
+#include "matmul.h"
+#include "clip.h"
+#include "dtypemeta.h"
+#include "_umath_doc_generated.h"
+
+static PyUFuncGenericFunction _arg_functions[] = {CFLOAT__arg, CDOUBLE__arg, CLONGDOUBLE__arg};
+static void * _arg_data[] = {(void *)NULL, (void *)NULL, (void *)NULL};
+static char _arg_signatures[] = {NPY_CFLOAT, NPY_FLOAT, NPY_CDOUBLE, NPY_DOUBLE, NPY_CLONGDOUBLE, NPY_LONGDOUBLE};
+static PyUFuncGenericFunction _ones_like_functions[] = {BOOL__ones_like, BYTE__ones_like, UBYTE__ones_like, SHORT__ones_like, USHORT__ones_like, INT__ones_like, UINT__ones_like, LONG__ones_like, ULONG__ones_like, LONGLONG__ones_like, ULONGLONG__ones_like, HALF__ones_like, FLOAT__ones_like, DOUBLE__ones_like, LONGDOUBLE__ones_like, CFLOAT__ones_like, CDOUBLE__ones_like, CLONGDOUBLE__ones_like, TIMEDELTA__ones_like, DATETIME__ones_like, NULL};
+static void * _ones_like_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char _ones_like_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_DATETIME, NPY_DATETIME, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction absolute_functions[] = {BOOL_absolute, BYTE_absolute, UBYTE_absolute, SHORT_absolute, USHORT_absolute, INT_absolute, UINT_absolute, LONG_absolute, ULONG_absolute, LONGLONG_absolute, ULONGLONG_absolute, HALF_absolute, FLOAT_absolute, DOUBLE_absolute, LONGDOUBLE_absolute, TIMEDELTA_absolute, CFLOAT_absolute, CDOUBLE_absolute, CLONGDOUBLE_absolute, NULL};
+static void * absolute_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char absolute_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_CFLOAT, NPY_FLOAT, NPY_CDOUBLE, NPY_DOUBLE, NPY_CLONGDOUBLE, NPY_LONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction add_functions[] = {BOOL_logical_or, BYTE_add, UBYTE_add, SHORT_add, USHORT_add, INT_add, UINT_add, LONG_add, ULONG_add, LONGLONG_add, ULONGLONG_add, HALF_add, FLOAT_add, DOUBLE_add, LONGDOUBLE_add, CFLOAT_add, CDOUBLE_add, CLONGDOUBLE_add, DATETIME_Mm_M_add, TIMEDELTA_mm_m_add, DATETIME_mM_M_add, NULL};
+static void * add_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char add_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_DATETIME, NPY_TIMEDELTA, NPY_DATETIME, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_DATETIME, NPY_DATETIME, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction arccos_functions[] = {HALF_arccos, FLOAT_arccos, DOUBLE_arccos, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+static void * arccos_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"arccos"};
+static char arccos_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction arccosh_functions[] = {HALF_arccosh, FLOAT_arccosh, DOUBLE_arccosh, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+static void * arccosh_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"arccosh"};
+static char arccosh_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction arcsin_functions[] = {HALF_arcsin, FLOAT_arcsin, DOUBLE_arcsin, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+static void * arcsin_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"arcsin"};
+static char arcsin_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction arcsinh_functions[] = {HALF_arcsinh, FLOAT_arcsinh, DOUBLE_arcsinh, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+static void * arcsinh_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"arcsinh"};
+static char arcsinh_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction arctan_functions[] = {HALF_arctan, FLOAT_arctan, DOUBLE_arctan, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+static void * arctan_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"arctan"};
+static char arctan_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction arctan2_functions[] = {NULL, FLOAT_arctan2, DOUBLE_arctan2, NULL, NULL};
+static void * arctan2_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"arctan2"};
+static char arctan2_signatures[] = {NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction arctanh_functions[] = {HALF_arctanh, FLOAT_arctanh, DOUBLE_arctanh, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+static void * arctanh_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"arctanh"};
+static char arctanh_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction bitwise_and_functions[] = {BOOL_logical_and, BYTE_bitwise_and, UBYTE_bitwise_and, SHORT_bitwise_and, USHORT_bitwise_and, INT_bitwise_and, UINT_bitwise_and, LONG_bitwise_and, ULONG_bitwise_and, LONGLONG_bitwise_and, ULONGLONG_bitwise_and, NULL};
+static void * bitwise_and_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char bitwise_and_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction bitwise_or_functions[] = {BOOL_logical_or, BYTE_bitwise_or, UBYTE_bitwise_or, SHORT_bitwise_or, USHORT_bitwise_or, INT_bitwise_or, UINT_bitwise_or, LONG_bitwise_or, ULONG_bitwise_or, LONGLONG_bitwise_or, ULONGLONG_bitwise_or, NULL};
+static void * bitwise_or_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char bitwise_or_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction bitwise_xor_functions[] = {BOOL_not_equal, BYTE_bitwise_xor, UBYTE_bitwise_xor, SHORT_bitwise_xor, USHORT_bitwise_xor, INT_bitwise_xor, UINT_bitwise_xor, LONG_bitwise_xor, ULONG_bitwise_xor, LONGLONG_bitwise_xor, ULONGLONG_bitwise_xor, NULL};
+static void * bitwise_xor_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char bitwise_xor_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction cbrt_functions[] = {HALF_cbrt, FLOAT_cbrt, DOUBLE_cbrt, NULL, NULL, NULL, NULL, NULL};
+static void * cbrt_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"cbrt"};
+static char cbrt_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction ceil_functions[] = {NULL, FLOAT_ceil, DOUBLE_ceil, NULL, NULL, NULL, NULL};
+static void * ceil_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char ceil_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction clip_functions[] = {BOOL_clip, BYTE_clip, UBYTE_clip, SHORT_clip, USHORT_clip, INT_clip, UINT_clip, LONG_clip, ULONG_clip, LONGLONG_clip, ULONGLONG_clip, HALF_clip, FLOAT_clip, DOUBLE_clip, LONGDOUBLE_clip, CFLOAT_clip, CDOUBLE_clip, CLONGDOUBLE_clip, TIMEDELTA_clip, DATETIME_clip, NULL};
+static void * clip_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char clip_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_DATETIME, NPY_DATETIME, NPY_DATETIME, NPY_DATETIME, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction conjugate_functions[] = {BYTE_conjugate, UBYTE_conjugate, SHORT_conjugate, USHORT_conjugate, INT_conjugate, UINT_conjugate, LONG_conjugate, ULONG_conjugate, LONGLONG_conjugate, ULONGLONG_conjugate, HALF_conjugate, FLOAT_conjugate, DOUBLE_conjugate, LONGDOUBLE_conjugate, CFLOAT_conjugate, CDOUBLE_conjugate, CLONGDOUBLE_conjugate, NULL};
+static void * conjugate_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"conjugate"};
+static char conjugate_signatures[] = {NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction copysign_functions[] = {HALF_copysign, FLOAT_copysign, DOUBLE_copysign, LONGDOUBLE_copysign};
+static void * copysign_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char copysign_signatures[] = {NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE};
+static PyUFuncGenericFunction cos_functions[] = {HALF_cos, FLOAT_cos, DOUBLE_cos, NULL, NULL, NULL, NULL, NULL};
+static void * cos_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"cos"};
+static char cos_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction cosh_functions[] = {HALF_cosh, FLOAT_cosh, DOUBLE_cosh, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+static void * cosh_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"cosh"};
+static char cosh_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction deg2rad_functions[] = {NULL, NULL, NULL, NULL, NULL};
+static void * deg2rad_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"deg2rad"};
+static char deg2rad_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction degrees_functions[] = {NULL, NULL, NULL, NULL, NULL};
+static void * degrees_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"degrees"};
+static char degrees_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction divide_functions[] = {HALF_divide, FLOAT_divide, DOUBLE_divide, LONGDOUBLE_divide, CFLOAT_divide, CDOUBLE_divide, CLONGDOUBLE_divide, TIMEDELTA_mq_m_divide, TIMEDELTA_md_m_divide, TIMEDELTA_mm_d_divide, NULL};
+static void * divide_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char divide_signatures[] = {NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_TIMEDELTA, NPY_LONGLONG, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_DOUBLE, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_DOUBLE, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction divmod_functions[] = {BYTE_divmod, UBYTE_divmod, SHORT_divmod, USHORT_divmod, INT_divmod, UINT_divmod, LONG_divmod, ULONG_divmod, LONGLONG_divmod, ULONGLONG_divmod, HALF_divmod, FLOAT_divmod, DOUBLE_divmod, LONGDOUBLE_divmod, TIMEDELTA_mm_qm_divmod};
+static void * divmod_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char divmod_signatures[] = {NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_LONGLONG, NPY_TIMEDELTA};
+static PyUFuncGenericFunction equal_functions[] = {BOOL_equal, BYTE_equal, UBYTE_equal, SHORT_equal, USHORT_equal, INT_equal, UINT_equal, LONG_equal, ULONG_equal, LONGLONG_equal, ULONGLONG_equal, LONGLONG_qQ_bool_equal, LONGLONG_Qq_bool_equal, HALF_equal, FLOAT_equal, DOUBLE_equal, LONGDOUBLE_equal, CFLOAT_equal, CDOUBLE_equal, CLONGDOUBLE_equal, DATETIME_equal, TIMEDELTA_equal, OBJECT_equal, OBJECT_OO_O_equal};
+static void * equal_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char equal_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BOOL, NPY_UBYTE, NPY_UBYTE, NPY_BOOL, NPY_SHORT, NPY_SHORT, NPY_BOOL, NPY_USHORT, NPY_USHORT, NPY_BOOL, NPY_INT, NPY_INT, NPY_BOOL, NPY_UINT, NPY_UINT, NPY_BOOL, NPY_LONG, NPY_LONG, NPY_BOOL, NPY_ULONG, NPY_ULONG, NPY_BOOL, NPY_LONGLONG, NPY_LONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_ULONGLONG, NPY_BOOL, NPY_LONGLONG, NPY_ULONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_LONGLONG, NPY_BOOL, NPY_HALF, NPY_HALF, NPY_BOOL, NPY_FLOAT, NPY_FLOAT, NPY_BOOL, NPY_DOUBLE, NPY_DOUBLE, NPY_BOOL, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_BOOL, NPY_CFLOAT, NPY_CFLOAT, NPY_BOOL, NPY_CDOUBLE, NPY_CDOUBLE, NPY_BOOL, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_BOOL, NPY_DATETIME, NPY_DATETIME, NPY_BOOL, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_BOOL, NPY_OBJECT, NPY_OBJECT, NPY_BOOL, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction exp_functions[] = {HALF_exp, FLOAT_exp, DOUBLE_exp, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+static void * exp_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"exp"};
+static char exp_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction exp2_functions[] = {HALF_exp2, FLOAT_exp2, DOUBLE_exp2, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+static void * exp2_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"exp2"};
+static char exp2_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction expm1_functions[] = {HALF_expm1, FLOAT_expm1, DOUBLE_expm1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+static void * expm1_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"expm1"};
+static char expm1_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction fabs_functions[] = {NULL, NULL, NULL, NULL, NULL};
+static void * fabs_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"fabs"};
+static char fabs_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction float_power_functions[] = {NULL, NULL, NULL, NULL};
+static void * float_power_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char float_power_signatures[] = {NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE};
+static PyUFuncGenericFunction floor_functions[] = {NULL, FLOAT_floor, DOUBLE_floor, NULL, NULL, NULL, NULL};
+static void * floor_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char floor_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction floor_divide_functions[] = {BYTE_divide, UBYTE_divide, SHORT_divide, USHORT_divide, INT_divide, UINT_divide, LONG_divide, ULONG_divide, LONGLONG_divide, ULONGLONG_divide, HALF_floor_divide, FLOAT_floor_divide, DOUBLE_floor_divide, LONGDOUBLE_floor_divide, TIMEDELTA_mq_m_floor_divide, TIMEDELTA_md_m_floor_divide, TIMEDELTA_mm_q_floor_divide, NULL};
+static void * floor_divide_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char floor_divide_signatures[] = {NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_TIMEDELTA, NPY_LONGLONG, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_DOUBLE, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_LONGLONG, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction fmax_functions[] = {BOOL_logical_or, BYTE_fmax, UBYTE_fmax, SHORT_fmax, USHORT_fmax, INT_fmax, UINT_fmax, LONG_fmax, ULONG_fmax, LONGLONG_fmax, ULONGLONG_fmax, HALF_fmax, FLOAT_fmax, DOUBLE_fmax, LONGDOUBLE_fmax, CFLOAT_fmax, CDOUBLE_fmax, CLONGDOUBLE_fmax, TIMEDELTA_fmax, DATETIME_fmax, NULL};
+static void * fmax_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char fmax_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_DATETIME, NPY_DATETIME, NPY_DATETIME, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction fmin_functions[] = {BOOL_logical_and, BYTE_fmin, UBYTE_fmin, SHORT_fmin, USHORT_fmin, INT_fmin, UINT_fmin, LONG_fmin, ULONG_fmin, LONGLONG_fmin, ULONGLONG_fmin, HALF_fmin, FLOAT_fmin, DOUBLE_fmin, LONGDOUBLE_fmin, CFLOAT_fmin, CDOUBLE_fmin, CLONGDOUBLE_fmin, TIMEDELTA_fmin, DATETIME_fmin, NULL};
+static void * fmin_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char fmin_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_DATETIME, NPY_DATETIME, NPY_DATETIME, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction fmod_functions[] = {BYTE_fmod, UBYTE_fmod, SHORT_fmod, USHORT_fmod, INT_fmod, UINT_fmod, LONG_fmod, ULONG_fmod, LONGLONG_fmod, ULONGLONG_fmod, NULL, NULL, NULL, NULL, NULL};
+static void * fmod_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"fmod"};
+static char fmod_signatures[] = {NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction frexp_functions[] = {HALF_frexp, FLOAT_frexp, DOUBLE_frexp, LONGDOUBLE_frexp};
+static void * frexp_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char frexp_signatures[] = {NPY_HALF, NPY_HALF, NPY_INT, NPY_FLOAT, NPY_FLOAT, NPY_INT, NPY_DOUBLE, NPY_DOUBLE, NPY_INT, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_INT};
+static PyUFuncGenericFunction gcd_functions[] = {BYTE_gcd, UBYTE_gcd, SHORT_gcd, USHORT_gcd, INT_gcd, UINT_gcd, LONG_gcd, ULONG_gcd, LONGLONG_gcd, ULONGLONG_gcd, NULL};
+static void * gcd_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char gcd_signatures[] = {NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction greater_functions[] = {BOOL_greater, BYTE_greater, UBYTE_greater, SHORT_greater, USHORT_greater, INT_greater, UINT_greater, LONG_greater, ULONG_greater, LONGLONG_greater, ULONGLONG_greater, LONGLONG_qQ_bool_greater, LONGLONG_Qq_bool_greater, HALF_greater, FLOAT_greater, DOUBLE_greater, LONGDOUBLE_greater, CFLOAT_greater, CDOUBLE_greater, CLONGDOUBLE_greater, DATETIME_greater, TIMEDELTA_greater, OBJECT_greater, OBJECT_OO_O_greater};
+static void * greater_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char greater_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BOOL, NPY_UBYTE, NPY_UBYTE, NPY_BOOL, NPY_SHORT, NPY_SHORT, NPY_BOOL, NPY_USHORT, NPY_USHORT, NPY_BOOL, NPY_INT, NPY_INT, NPY_BOOL, NPY_UINT, NPY_UINT, NPY_BOOL, NPY_LONG, NPY_LONG, NPY_BOOL, NPY_ULONG, NPY_ULONG, NPY_BOOL, NPY_LONGLONG, NPY_LONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_ULONGLONG, NPY_BOOL, NPY_LONGLONG, NPY_ULONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_LONGLONG, NPY_BOOL, NPY_HALF, NPY_HALF, NPY_BOOL, NPY_FLOAT, NPY_FLOAT, NPY_BOOL, NPY_DOUBLE, NPY_DOUBLE, NPY_BOOL, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_BOOL, NPY_CFLOAT, NPY_CFLOAT, NPY_BOOL, NPY_CDOUBLE, NPY_CDOUBLE, NPY_BOOL, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_BOOL, NPY_DATETIME, NPY_DATETIME, NPY_BOOL, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_BOOL, NPY_OBJECT, NPY_OBJECT, NPY_BOOL, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction greater_equal_functions[] = {BOOL_greater_equal, BYTE_greater_equal, UBYTE_greater_equal, SHORT_greater_equal, USHORT_greater_equal, INT_greater_equal, UINT_greater_equal, LONG_greater_equal, ULONG_greater_equal, LONGLONG_greater_equal, ULONGLONG_greater_equal, LONGLONG_qQ_bool_greater_equal, LONGLONG_Qq_bool_greater_equal, HALF_greater_equal, FLOAT_greater_equal, DOUBLE_greater_equal, LONGDOUBLE_greater_equal, CFLOAT_greater_equal, CDOUBLE_greater_equal, CLONGDOUBLE_greater_equal, DATETIME_greater_equal, TIMEDELTA_greater_equal, OBJECT_greater_equal, OBJECT_OO_O_greater_equal};
+static void * greater_equal_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char greater_equal_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BOOL, NPY_UBYTE, NPY_UBYTE, NPY_BOOL, NPY_SHORT, NPY_SHORT, NPY_BOOL, NPY_USHORT, NPY_USHORT, NPY_BOOL, NPY_INT, NPY_INT, NPY_BOOL, NPY_UINT, NPY_UINT, NPY_BOOL, NPY_LONG, NPY_LONG, NPY_BOOL, NPY_ULONG, NPY_ULONG, NPY_BOOL, NPY_LONGLONG, NPY_LONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_ULONGLONG, NPY_BOOL, NPY_LONGLONG, NPY_ULONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_LONGLONG, NPY_BOOL, NPY_HALF, NPY_HALF, NPY_BOOL, NPY_FLOAT, NPY_FLOAT, NPY_BOOL, NPY_DOUBLE, NPY_DOUBLE, NPY_BOOL, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_BOOL, NPY_CFLOAT, NPY_CFLOAT, NPY_BOOL, NPY_CDOUBLE, NPY_CDOUBLE, NPY_BOOL, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_BOOL, NPY_DATETIME, NPY_DATETIME, NPY_BOOL, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_BOOL, NPY_OBJECT, NPY_OBJECT, NPY_BOOL, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction heaviside_functions[] = {NULL, NULL, NULL, NULL};
+static void * heaviside_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char heaviside_signatures[] = {NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE};
+static PyUFuncGenericFunction hypot_functions[] = {NULL, NULL, NULL, NULL, NULL};
+static void * hypot_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"hypot"};
+static char hypot_signatures[] = {NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction invert_functions[] = {BOOL_logical_not, BYTE_invert, UBYTE_invert, SHORT_invert, USHORT_invert, INT_invert, UINT_invert, LONG_invert, ULONG_invert, LONGLONG_invert, ULONGLONG_invert, NULL};
+static void * invert_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char invert_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction isfinite_functions[] = {BOOL_isfinite, BYTE_isfinite, UBYTE_isfinite, SHORT_isfinite, USHORT_isfinite, INT_isfinite, UINT_isfinite, LONG_isfinite, ULONG_isfinite, LONGLONG_isfinite, ULONGLONG_isfinite, HALF_isfinite, FLOAT_isfinite, DOUBLE_isfinite, LONGDOUBLE_isfinite, CFLOAT_isfinite, CDOUBLE_isfinite, CLONGDOUBLE_isfinite, TIMEDELTA_isfinite, DATETIME_isfinite};
+static void * isfinite_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char isfinite_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BOOL, NPY_UBYTE, NPY_BOOL, NPY_SHORT, NPY_BOOL, NPY_USHORT, NPY_BOOL, NPY_INT, NPY_BOOL, NPY_UINT, NPY_BOOL, NPY_LONG, NPY_BOOL, NPY_ULONG, NPY_BOOL, NPY_LONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_BOOL, NPY_HALF, NPY_BOOL, NPY_FLOAT, NPY_BOOL, NPY_DOUBLE, NPY_BOOL, NPY_LONGDOUBLE, NPY_BOOL, NPY_CFLOAT, NPY_BOOL, NPY_CDOUBLE, NPY_BOOL, NPY_CLONGDOUBLE, NPY_BOOL, NPY_TIMEDELTA, NPY_BOOL, NPY_DATETIME, NPY_BOOL};
+static PyUFuncGenericFunction isinf_functions[] = {BOOL_isinf, BYTE_isinf, UBYTE_isinf, SHORT_isinf, USHORT_isinf, INT_isinf, UINT_isinf, LONG_isinf, ULONG_isinf, LONGLONG_isinf, ULONGLONG_isinf, HALF_isinf, FLOAT_isinf, DOUBLE_isinf, LONGDOUBLE_isinf, CFLOAT_isinf, CDOUBLE_isinf, CLONGDOUBLE_isinf, TIMEDELTA_isinf, DATETIME_isinf};
+static void * isinf_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char isinf_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BOOL, NPY_UBYTE, NPY_BOOL, NPY_SHORT, NPY_BOOL, NPY_USHORT, NPY_BOOL, NPY_INT, NPY_BOOL, NPY_UINT, NPY_BOOL, NPY_LONG, NPY_BOOL, NPY_ULONG, NPY_BOOL, NPY_LONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_BOOL, NPY_HALF, NPY_BOOL, NPY_FLOAT, NPY_BOOL, NPY_DOUBLE, NPY_BOOL, NPY_LONGDOUBLE, NPY_BOOL, NPY_CFLOAT, NPY_BOOL, NPY_CDOUBLE, NPY_BOOL, NPY_CLONGDOUBLE, NPY_BOOL, NPY_TIMEDELTA, NPY_BOOL, NPY_DATETIME, NPY_BOOL};
+static PyUFuncGenericFunction isnan_functions[] = {BOOL_isnan, BYTE_isnan, UBYTE_isnan, SHORT_isnan, USHORT_isnan, INT_isnan, UINT_isnan, LONG_isnan, ULONG_isnan, LONGLONG_isnan, ULONGLONG_isnan, HALF_isnan, FLOAT_isnan, DOUBLE_isnan, LONGDOUBLE_isnan, CFLOAT_isnan, CDOUBLE_isnan, CLONGDOUBLE_isnan, TIMEDELTA_isnan, DATETIME_isnan};
+static void * isnan_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char isnan_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BOOL, NPY_UBYTE, NPY_BOOL, NPY_SHORT, NPY_BOOL, NPY_USHORT, NPY_BOOL, NPY_INT, NPY_BOOL, NPY_UINT, NPY_BOOL, NPY_LONG, NPY_BOOL, NPY_ULONG, NPY_BOOL, NPY_LONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_BOOL, NPY_HALF, NPY_BOOL, NPY_FLOAT, NPY_BOOL, NPY_DOUBLE, NPY_BOOL, NPY_LONGDOUBLE, NPY_BOOL, NPY_CFLOAT, NPY_BOOL, NPY_CDOUBLE, NPY_BOOL, NPY_CLONGDOUBLE, NPY_BOOL, NPY_TIMEDELTA, NPY_BOOL, NPY_DATETIME, NPY_BOOL};
+static PyUFuncGenericFunction isnat_functions[] = {DATETIME_isnat, TIMEDELTA_isnat};
+static void * isnat_data[] = {(void *)NULL, (void *)NULL};
+static char isnat_signatures[] = {NPY_DATETIME, NPY_BOOL, NPY_TIMEDELTA, NPY_BOOL};
+static PyUFuncGenericFunction lcm_functions[] = {BYTE_lcm, UBYTE_lcm, SHORT_lcm, USHORT_lcm, INT_lcm, UINT_lcm, LONG_lcm, ULONG_lcm, LONGLONG_lcm, ULONGLONG_lcm, NULL};
+static void * lcm_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char lcm_signatures[] = {NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction ldexp_functions[] = {HALF_ldexp, FLOAT_ldexp, HALF_ldexp_long, FLOAT_ldexp_long, DOUBLE_ldexp, DOUBLE_ldexp_long, LONGDOUBLE_ldexp, LONGDOUBLE_ldexp_long};
+static void * ldexp_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char ldexp_signatures[] = {NPY_HALF, NPY_INT, NPY_HALF, NPY_FLOAT, NPY_INT, NPY_FLOAT, NPY_HALF, NPY_LONG, NPY_HALF, NPY_FLOAT, NPY_LONG, NPY_FLOAT, NPY_DOUBLE, NPY_INT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONG, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_INT, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONG, NPY_LONGDOUBLE};
+static PyUFuncGenericFunction left_shift_functions[] = {BYTE_left_shift, UBYTE_left_shift, SHORT_left_shift, USHORT_left_shift, INT_left_shift, UINT_left_shift, LONG_left_shift, ULONG_left_shift, LONGLONG_left_shift, ULONGLONG_left_shift, NULL};
+static void * left_shift_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char left_shift_signatures[] = {NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction less_functions[] = {BOOL_less, BYTE_less, UBYTE_less, SHORT_less, USHORT_less, INT_less, UINT_less, LONG_less, ULONG_less, LONGLONG_less, ULONGLONG_less, LONGLONG_qQ_bool_less, LONGLONG_Qq_bool_less, HALF_less, FLOAT_less, DOUBLE_less, LONGDOUBLE_less, CFLOAT_less, CDOUBLE_less, CLONGDOUBLE_less, DATETIME_less, TIMEDELTA_less, OBJECT_less, OBJECT_OO_O_less};
+static void * less_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char less_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BOOL, NPY_UBYTE, NPY_UBYTE, NPY_BOOL, NPY_SHORT, NPY_SHORT, NPY_BOOL, NPY_USHORT, NPY_USHORT, NPY_BOOL, NPY_INT, NPY_INT, NPY_BOOL, NPY_UINT, NPY_UINT, NPY_BOOL, NPY_LONG, NPY_LONG, NPY_BOOL, NPY_ULONG, NPY_ULONG, NPY_BOOL, NPY_LONGLONG, NPY_LONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_ULONGLONG, NPY_BOOL, NPY_LONGLONG, NPY_ULONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_LONGLONG, NPY_BOOL, NPY_HALF, NPY_HALF, NPY_BOOL, NPY_FLOAT, NPY_FLOAT, NPY_BOOL, NPY_DOUBLE, NPY_DOUBLE, NPY_BOOL, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_BOOL, NPY_CFLOAT, NPY_CFLOAT, NPY_BOOL, NPY_CDOUBLE, NPY_CDOUBLE, NPY_BOOL, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_BOOL, NPY_DATETIME, NPY_DATETIME, NPY_BOOL, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_BOOL, NPY_OBJECT, NPY_OBJECT, NPY_BOOL, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction less_equal_functions[] = {BOOL_less_equal, BYTE_less_equal, UBYTE_less_equal, SHORT_less_equal, USHORT_less_equal, INT_less_equal, UINT_less_equal, LONG_less_equal, ULONG_less_equal, LONGLONG_less_equal, ULONGLONG_less_equal, LONGLONG_qQ_bool_less_equal, LONGLONG_Qq_bool_less_equal, HALF_less_equal, FLOAT_less_equal, DOUBLE_less_equal, LONGDOUBLE_less_equal, CFLOAT_less_equal, CDOUBLE_less_equal, CLONGDOUBLE_less_equal, DATETIME_less_equal, TIMEDELTA_less_equal, OBJECT_less_equal, OBJECT_OO_O_less_equal};
+static void * less_equal_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char less_equal_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BOOL, NPY_UBYTE, NPY_UBYTE, NPY_BOOL, NPY_SHORT, NPY_SHORT, NPY_BOOL, NPY_USHORT, NPY_USHORT, NPY_BOOL, NPY_INT, NPY_INT, NPY_BOOL, NPY_UINT, NPY_UINT, NPY_BOOL, NPY_LONG, NPY_LONG, NPY_BOOL, NPY_ULONG, NPY_ULONG, NPY_BOOL, NPY_LONGLONG, NPY_LONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_ULONGLONG, NPY_BOOL, NPY_LONGLONG, NPY_ULONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_LONGLONG, NPY_BOOL, NPY_HALF, NPY_HALF, NPY_BOOL, NPY_FLOAT, NPY_FLOAT, NPY_BOOL, NPY_DOUBLE, NPY_DOUBLE, NPY_BOOL, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_BOOL, NPY_CFLOAT, NPY_CFLOAT, NPY_BOOL, NPY_CDOUBLE, NPY_CDOUBLE, NPY_BOOL, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_BOOL, NPY_DATETIME, NPY_DATETIME, NPY_BOOL, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_BOOL, NPY_OBJECT, NPY_OBJECT, NPY_BOOL, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction log_functions[] = {HALF_log, FLOAT_log, DOUBLE_log, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+static void * log_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"log"};
+static char log_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction log10_functions[] = {HALF_log10, FLOAT_log10, DOUBLE_log10, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+static void * log10_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"log10"};
+static char log10_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction log1p_functions[] = {HALF_log1p, FLOAT_log1p, DOUBLE_log1p, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+static void * log1p_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"log1p"};
+static char log1p_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction log2_functions[] = {HALF_log2, FLOAT_log2, DOUBLE_log2, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+static void * log2_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"log2"};
+static char log2_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction logaddexp_functions[] = {NULL, NULL, NULL, NULL};
+static void * logaddexp_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char logaddexp_signatures[] = {NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE};
+static PyUFuncGenericFunction logaddexp2_functions[] = {NULL, NULL, NULL, NULL};
+static void * logaddexp2_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char logaddexp2_signatures[] = {NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE};
+static PyUFuncGenericFunction logical_and_functions[] = {BOOL_logical_and, BYTE_logical_and, UBYTE_logical_and, SHORT_logical_and, USHORT_logical_and, INT_logical_and, UINT_logical_and, LONG_logical_and, ULONG_logical_and, LONGLONG_logical_and, ULONGLONG_logical_and, HALF_logical_and, FLOAT_logical_and, DOUBLE_logical_and, LONGDOUBLE_logical_and, CFLOAT_logical_and, CDOUBLE_logical_and, CLONGDOUBLE_logical_and, NULL};
+static void * logical_and_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char logical_and_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BOOL, NPY_UBYTE, NPY_UBYTE, NPY_BOOL, NPY_SHORT, NPY_SHORT, NPY_BOOL, NPY_USHORT, NPY_USHORT, NPY_BOOL, NPY_INT, NPY_INT, NPY_BOOL, NPY_UINT, NPY_UINT, NPY_BOOL, NPY_LONG, NPY_LONG, NPY_BOOL, NPY_ULONG, NPY_ULONG, NPY_BOOL, NPY_LONGLONG, NPY_LONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_ULONGLONG, NPY_BOOL, NPY_HALF, NPY_HALF, NPY_BOOL, NPY_FLOAT, NPY_FLOAT, NPY_BOOL, NPY_DOUBLE, NPY_DOUBLE, NPY_BOOL, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_BOOL, NPY_CFLOAT, NPY_CFLOAT, NPY_BOOL, NPY_CDOUBLE, NPY_CDOUBLE, NPY_BOOL, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_BOOL, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction logical_not_functions[] = {BOOL_logical_not, BYTE_logical_not, UBYTE_logical_not, SHORT_logical_not, USHORT_logical_not, INT_logical_not, UINT_logical_not, LONG_logical_not, ULONG_logical_not, LONGLONG_logical_not, ULONGLONG_logical_not, HALF_logical_not, FLOAT_logical_not, DOUBLE_logical_not, LONGDOUBLE_logical_not, CFLOAT_logical_not, CDOUBLE_logical_not, CLONGDOUBLE_logical_not, NULL};
+static void * logical_not_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char logical_not_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BOOL, NPY_UBYTE, NPY_BOOL, NPY_SHORT, NPY_BOOL, NPY_USHORT, NPY_BOOL, NPY_INT, NPY_BOOL, NPY_UINT, NPY_BOOL, NPY_LONG, NPY_BOOL, NPY_ULONG, NPY_BOOL, NPY_LONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_BOOL, NPY_HALF, NPY_BOOL, NPY_FLOAT, NPY_BOOL, NPY_DOUBLE, NPY_BOOL, NPY_LONGDOUBLE, NPY_BOOL, NPY_CFLOAT, NPY_BOOL, NPY_CDOUBLE, NPY_BOOL, NPY_CLONGDOUBLE, NPY_BOOL, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction logical_or_functions[] = {BOOL_logical_or, BYTE_logical_or, UBYTE_logical_or, SHORT_logical_or, USHORT_logical_or, INT_logical_or, UINT_logical_or, LONG_logical_or, ULONG_logical_or, LONGLONG_logical_or, ULONGLONG_logical_or, HALF_logical_or, FLOAT_logical_or, DOUBLE_logical_or, LONGDOUBLE_logical_or, CFLOAT_logical_or, CDOUBLE_logical_or, CLONGDOUBLE_logical_or, NULL};
+static void * logical_or_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char logical_or_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BOOL, NPY_UBYTE, NPY_UBYTE, NPY_BOOL, NPY_SHORT, NPY_SHORT, NPY_BOOL, NPY_USHORT, NPY_USHORT, NPY_BOOL, NPY_INT, NPY_INT, NPY_BOOL, NPY_UINT, NPY_UINT, NPY_BOOL, NPY_LONG, NPY_LONG, NPY_BOOL, NPY_ULONG, NPY_ULONG, NPY_BOOL, NPY_LONGLONG, NPY_LONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_ULONGLONG, NPY_BOOL, NPY_HALF, NPY_HALF, NPY_BOOL, NPY_FLOAT, NPY_FLOAT, NPY_BOOL, NPY_DOUBLE, NPY_DOUBLE, NPY_BOOL, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_BOOL, NPY_CFLOAT, NPY_CFLOAT, NPY_BOOL, NPY_CDOUBLE, NPY_CDOUBLE, NPY_BOOL, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_BOOL, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction logical_xor_functions[] = {BOOL_not_equal, BYTE_logical_xor, UBYTE_logical_xor, SHORT_logical_xor, USHORT_logical_xor, INT_logical_xor, UINT_logical_xor, LONG_logical_xor, ULONG_logical_xor, LONGLONG_logical_xor, ULONGLONG_logical_xor, HALF_logical_xor, FLOAT_logical_xor, DOUBLE_logical_xor, LONGDOUBLE_logical_xor, CFLOAT_logical_xor, CDOUBLE_logical_xor, CLONGDOUBLE_logical_xor, NULL};
+static void * logical_xor_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"logical_xor"};
+static char logical_xor_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BOOL, NPY_UBYTE, NPY_UBYTE, NPY_BOOL, NPY_SHORT, NPY_SHORT, NPY_BOOL, NPY_USHORT, NPY_USHORT, NPY_BOOL, NPY_INT, NPY_INT, NPY_BOOL, NPY_UINT, NPY_UINT, NPY_BOOL, NPY_LONG, NPY_LONG, NPY_BOOL, NPY_ULONG, NPY_ULONG, NPY_BOOL, NPY_LONGLONG, NPY_LONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_ULONGLONG, NPY_BOOL, NPY_HALF, NPY_HALF, NPY_BOOL, NPY_FLOAT, NPY_FLOAT, NPY_BOOL, NPY_DOUBLE, NPY_DOUBLE, NPY_BOOL, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_BOOL, NPY_CFLOAT, NPY_CFLOAT, NPY_BOOL, NPY_CDOUBLE, NPY_CDOUBLE, NPY_BOOL, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_BOOL, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction matmul_functions[] = {BOOL_matmul, BYTE_matmul, UBYTE_matmul, SHORT_matmul, USHORT_matmul, INT_matmul, UINT_matmul, LONG_matmul, ULONG_matmul, LONGLONG_matmul, ULONGLONG_matmul, HALF_matmul, FLOAT_matmul, DOUBLE_matmul, LONGDOUBLE_matmul, CFLOAT_matmul, CDOUBLE_matmul, CLONGDOUBLE_matmul, OBJECT_matmul};
+static void * matmul_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char matmul_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction maximum_functions[] = {BOOL_logical_or, BYTE_maximum, UBYTE_maximum, SHORT_maximum, USHORT_maximum, INT_maximum, UINT_maximum, LONG_maximum, ULONG_maximum, LONGLONG_maximum, ULONGLONG_maximum, HALF_maximum, FLOAT_maximum, DOUBLE_maximum, LONGDOUBLE_maximum, CFLOAT_maximum, CDOUBLE_maximum, CLONGDOUBLE_maximum, TIMEDELTA_maximum, DATETIME_maximum, NULL};
+static void * maximum_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char maximum_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_DATETIME, NPY_DATETIME, NPY_DATETIME, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction minimum_functions[] = {BOOL_logical_and, BYTE_minimum, UBYTE_minimum, SHORT_minimum, USHORT_minimum, INT_minimum, UINT_minimum, LONG_minimum, ULONG_minimum, LONGLONG_minimum, ULONGLONG_minimum, HALF_minimum, FLOAT_minimum, DOUBLE_minimum, LONGDOUBLE_minimum, CFLOAT_minimum, CDOUBLE_minimum, CLONGDOUBLE_minimum, TIMEDELTA_minimum, DATETIME_minimum, NULL};
+static void * minimum_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char minimum_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_DATETIME, NPY_DATETIME, NPY_DATETIME, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction modf_functions[] = {HALF_modf, FLOAT_modf, DOUBLE_modf, LONGDOUBLE_modf};
+static void * modf_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char modf_signatures[] = {NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE};
+static PyUFuncGenericFunction multiply_functions[] = {BOOL_logical_and, BYTE_multiply, UBYTE_multiply, SHORT_multiply, USHORT_multiply, INT_multiply, UINT_multiply, LONG_multiply, ULONG_multiply, LONGLONG_multiply, ULONGLONG_multiply, HALF_multiply, FLOAT_multiply, DOUBLE_multiply, LONGDOUBLE_multiply, CFLOAT_multiply, CDOUBLE_multiply, CLONGDOUBLE_multiply, TIMEDELTA_mq_m_multiply, TIMEDELTA_qm_m_multiply, TIMEDELTA_md_m_multiply, TIMEDELTA_dm_m_multiply, NULL};
+static void * multiply_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char multiply_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_TIMEDELTA, NPY_LONGLONG, NPY_TIMEDELTA, NPY_LONGLONG, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_DOUBLE, NPY_TIMEDELTA, NPY_DOUBLE, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction negative_functions[] = {BYTE_negative, UBYTE_negative, SHORT_negative, USHORT_negative, INT_negative, UINT_negative, LONG_negative, ULONG_negative, LONGLONG_negative, ULONGLONG_negative, HALF_negative, FLOAT_negative, DOUBLE_negative, LONGDOUBLE_negative, TIMEDELTA_negative, NULL, NULL, NULL, NULL};
+static void * negative_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char negative_signatures[] = {NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction nextafter_functions[] = {HALF_nextafter, FLOAT_nextafter, DOUBLE_nextafter, LONGDOUBLE_nextafter};
+static void * nextafter_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char nextafter_signatures[] = {NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE};
+static PyUFuncGenericFunction not_equal_functions[] = {BOOL_not_equal, BYTE_not_equal, UBYTE_not_equal, SHORT_not_equal, USHORT_not_equal, INT_not_equal, UINT_not_equal, LONG_not_equal, ULONG_not_equal, LONGLONG_not_equal, ULONGLONG_not_equal, LONGLONG_qQ_bool_not_equal, LONGLONG_Qq_bool_not_equal, HALF_not_equal, FLOAT_not_equal, DOUBLE_not_equal, LONGDOUBLE_not_equal, CFLOAT_not_equal, CDOUBLE_not_equal, CLONGDOUBLE_not_equal, DATETIME_not_equal, TIMEDELTA_not_equal, OBJECT_not_equal, OBJECT_OO_O_not_equal};
+static void * not_equal_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char not_equal_signatures[] = {NPY_BOOL, NPY_BOOL, NPY_BOOL, NPY_BYTE, NPY_BYTE, NPY_BOOL, NPY_UBYTE, NPY_UBYTE, NPY_BOOL, NPY_SHORT, NPY_SHORT, NPY_BOOL, NPY_USHORT, NPY_USHORT, NPY_BOOL, NPY_INT, NPY_INT, NPY_BOOL, NPY_UINT, NPY_UINT, NPY_BOOL, NPY_LONG, NPY_LONG, NPY_BOOL, NPY_ULONG, NPY_ULONG, NPY_BOOL, NPY_LONGLONG, NPY_LONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_ULONGLONG, NPY_BOOL, NPY_LONGLONG, NPY_ULONGLONG, NPY_BOOL, NPY_ULONGLONG, NPY_LONGLONG, NPY_BOOL, NPY_HALF, NPY_HALF, NPY_BOOL, NPY_FLOAT, NPY_FLOAT, NPY_BOOL, NPY_DOUBLE, NPY_DOUBLE, NPY_BOOL, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_BOOL, NPY_CFLOAT, NPY_CFLOAT, NPY_BOOL, NPY_CDOUBLE, NPY_CDOUBLE, NPY_BOOL, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_BOOL, NPY_DATETIME, NPY_DATETIME, NPY_BOOL, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_BOOL, NPY_OBJECT, NPY_OBJECT, NPY_BOOL, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction positive_functions[] = {BYTE_positive, UBYTE_positive, SHORT_positive, USHORT_positive, INT_positive, UINT_positive, LONG_positive, ULONG_positive, LONGLONG_positive, ULONGLONG_positive, HALF_positive, FLOAT_positive, DOUBLE_positive, LONGDOUBLE_positive, TIMEDELTA_positive, NULL, NULL, NULL, NULL};
+static void * positive_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL};
+static char positive_signatures[] = {NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT};
+static PyUFuncGenericFunction power_functions[] = {BYTE_power, UBYTE_power, SHORT_power, USHORT_power, INT_power, UINT_power, LONG_power, ULONG_power, LONGLONG_power, ULONGLONG_power, NULL, FLOAT_power, DOUBLE_power, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}; /* power: 21 loop slots; NULL slots 10 and 13-20 are assigned in InitOperators */
+static void * power_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL}; /* indexed like power_functions; slots 10 and 13-20 receive npy_pow*, nc_pow* and npy_ObjectPower in InitOperators */
+static char power_signatures[] = {NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT}; /* 21 triples of type codes, one per loop slot; the HALF/FLOAT/DOUBLE triples of slots 10-12 are repeated for slots 13-15 */
+static PyUFuncGenericFunction rad2deg_functions[] = {NULL, NULL, NULL, NULL, NULL}; /* rad2deg: 5 loop slots, all NULL here; every slot is assigned in InitOperators */
+static void * rad2deg_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"rad2deg"}; /* slots 0-3 are overwritten in InitOperators with npy_rad2deg*; slot 4 keeps the string and pairs with PyUFunc_O_O_method (presumably the method name to call - confirm) */
+static char rad2deg_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_OBJECT, NPY_OBJECT}; /* 5 pairs of type codes, one pair per loop slot */
+static PyUFuncGenericFunction radians_functions[] = {NULL, NULL, NULL, NULL, NULL}; /* radians: 5 loop slots, all NULL here; every slot is assigned in InitOperators */
+static void * radians_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"radians"}; /* slots 0-3 are overwritten in InitOperators with npy_radians*; slot 4 keeps the string and pairs with PyUFunc_O_O_method (presumably the method name to call - confirm) */
+static char radians_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_OBJECT, NPY_OBJECT}; /* 5 pairs of type codes, one pair per loop slot */
+static PyUFuncGenericFunction reciprocal_functions[] = {BYTE_reciprocal, UBYTE_reciprocal, SHORT_reciprocal, USHORT_reciprocal, INT_reciprocal, UINT_reciprocal, LONG_reciprocal, ULONG_reciprocal, LONGLONG_reciprocal, ULONGLONG_reciprocal, HALF_reciprocal, FLOAT_reciprocal, DOUBLE_reciprocal, LONGDOUBLE_reciprocal, CFLOAT_reciprocal, CDOUBLE_reciprocal, CLONGDOUBLE_reciprocal, NULL}; /* reciprocal: 18 loop slots; NULL slot 17 is set to PyUFunc_O_O in InitOperators */
+static void * reciprocal_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL}; /* indexed like reciprocal_functions; slot 17 receives Py_reciprocal in InitOperators */
+static char reciprocal_signatures[] = {NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT}; /* 18 pairs of type codes, one pair per loop slot */
+static PyUFuncGenericFunction remainder_functions[] = {BYTE_remainder, UBYTE_remainder, SHORT_remainder, USHORT_remainder, INT_remainder, UINT_remainder, LONG_remainder, ULONG_remainder, LONGLONG_remainder, ULONGLONG_remainder, HALF_remainder, FLOAT_remainder, DOUBLE_remainder, LONGDOUBLE_remainder, TIMEDELTA_mm_m_remainder, NULL}; /* remainder: 16 loop slots; NULL slot 15 is set to PyUFunc_OO_O in InitOperators */
+static void * remainder_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL}; /* indexed like remainder_functions; slot 15 receives PyNumber_Remainder in InitOperators */
+static char remainder_signatures[] = {NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT}; /* 16 triples of type codes, one triple per loop slot */
+static PyUFuncGenericFunction right_shift_functions[] = {BYTE_right_shift, UBYTE_right_shift, SHORT_right_shift, USHORT_right_shift, INT_right_shift, UINT_right_shift, LONG_right_shift, ULONG_right_shift, LONGLONG_right_shift, ULONGLONG_right_shift, NULL}; /* right_shift: 11 loop slots; NULL slot 10 is set to PyUFunc_OO_O in InitOperators */
+static void * right_shift_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL}; /* indexed like right_shift_functions; slot 10 receives PyNumber_Rshift in InitOperators */
+static char right_shift_signatures[] = {NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT}; /* 11 triples of type codes (integer types plus NPY_OBJECT), one triple per loop slot */
+static PyUFuncGenericFunction rint_functions[] = {NULL, FLOAT_rint, DOUBLE_rint, NULL, NULL, NULL, NULL, NULL, NULL, NULL}; /* rint: 10 loop slots; NULL slots 0 and 3-9 are assigned in InitOperators */
+static void * rint_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"rint"}; /* slots 0 and 3-8 receive npy_rint* / nc_rint* in InitOperators; slot 9 keeps the string and pairs with PyUFunc_O_O_method (presumably the method name to call - confirm) */
+static char rint_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT}; /* 10 pairs of type codes, one pair per loop slot; the FLOAT/DOUBLE pairs of slots 1-2 are repeated for slots 3-4 */
+static PyUFuncGenericFunction sign_functions[] = {BYTE_sign, UBYTE_sign, SHORT_sign, USHORT_sign, INT_sign, UINT_sign, LONG_sign, ULONG_sign, LONGLONG_sign, ULONGLONG_sign, HALF_sign, FLOAT_sign, DOUBLE_sign, LONGDOUBLE_sign, CFLOAT_sign, CDOUBLE_sign, CLONGDOUBLE_sign, TIMEDELTA_sign, OBJECT_sign}; /* sign: 19 loop slots, all populated statically (no NULL placeholders) */
+static void * sign_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL}; /* indexed like sign_functions; every entry is NULL */
+static char sign_signatures[] = {NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_OBJECT, NPY_OBJECT}; /* 19 pairs of type codes, one pair per loop slot */
+static PyUFuncGenericFunction signbit_functions[] = {HALF_signbit, FLOAT_signbit, DOUBLE_signbit, LONGDOUBLE_signbit}; /* signbit: 4 loop slots, all populated statically */
+static void * signbit_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL}; /* indexed like signbit_functions; every entry is NULL */
+static char signbit_signatures[] = {NPY_HALF, NPY_BOOL, NPY_FLOAT, NPY_BOOL, NPY_DOUBLE, NPY_BOOL, NPY_LONGDOUBLE, NPY_BOOL}; /* 4 pairs of type codes; the second code of every pair is NPY_BOOL */
+static PyUFuncGenericFunction sin_functions[] = {HALF_sin, FLOAT_sin, DOUBLE_sin, NULL, NULL, NULL, NULL, NULL}; /* sin: 8 loop slots; NULL slots 3-7 are assigned in InitOperators */
+static void * sin_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"sin"}; /* slots 3-6 receive npy_sinl / nc_sin* in InitOperators; slot 7 keeps the string and pairs with PyUFunc_O_O_method (presumably the method name to call - confirm) */
+static char sin_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT}; /* 8 pairs of type codes, one pair per loop slot */
+static PyUFuncGenericFunction sinh_functions[] = {HALF_sinh, FLOAT_sinh, DOUBLE_sinh, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}; /* sinh: 11 loop slots; NULL slots 3-10 are assigned in InitOperators */
+static void * sinh_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"sinh"}; /* slots 3-9 receive npy_sinh* / nc_sinh* in InitOperators; slot 10 keeps the string and pairs with PyUFunc_O_O_method (presumably the method name to call - confirm) */
+static char sinh_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT}; /* 11 pairs of type codes, one pair per loop slot; the HALF/FLOAT/DOUBLE pairs of slots 0-2 are repeated for slots 3-5 */
+static PyUFuncGenericFunction spacing_functions[] = {HALF_spacing, FLOAT_spacing, DOUBLE_spacing, LONGDOUBLE_spacing}; /* spacing: 4 loop slots, all populated statically */
+static void * spacing_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL}; /* indexed like spacing_functions; every entry is NULL */
+static char spacing_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE}; /* 4 pairs of type codes, one pair per loop slot */
+static PyUFuncGenericFunction sqrt_functions[] = {NULL, FLOAT_sqrt, DOUBLE_sqrt, NULL, NULL, NULL, NULL, NULL, NULL, NULL}; /* sqrt: 10 loop slots; NULL slots 0 and 3-9; InitOperators visibly assigns slots 0, 3, 4 and 5 (the remaining slots are presumably assigned right after - TODO confirm) */
+static void * sqrt_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"sqrt"}; /* slots 0, 3 and 4 receive npy_sqrtf / npy_sqrt in InitOperators; slot 9 holds the string "sqrt" (presumably a method name for an object loop - confirm) */
+static char sqrt_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT}; /* 10 pairs of type codes, one pair per loop slot; the FLOAT/DOUBLE pairs of slots 1-2 are repeated for slots 3-4 */
+static PyUFuncGenericFunction square_functions[] = {BYTE_square, UBYTE_square, SHORT_square, USHORT_square, INT_square, UINT_square, LONG_square, ULONG_square, LONGLONG_square, ULONGLONG_square, HALF_square, FLOAT_square, DOUBLE_square, LONGDOUBLE_square, CFLOAT_square, CDOUBLE_square, CLONGDOUBLE_square, NULL}; /* square: 18 loop slots; NULL slot 17 lines up with the NPY_OBJECT signature and is presumably assigned in InitOperators (not visible here - TODO confirm) */
+static void * square_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL}; /* indexed like square_functions; every entry starts as NULL */
+static char square_signatures[] = {NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT}; /* 18 pairs of type codes, one pair per loop slot */
+static PyUFuncGenericFunction subtract_functions[] = {BYTE_subtract, UBYTE_subtract, SHORT_subtract, USHORT_subtract, INT_subtract, UINT_subtract, LONG_subtract, ULONG_subtract, LONGLONG_subtract, ULONGLONG_subtract, HALF_subtract, FLOAT_subtract, DOUBLE_subtract, LONGDOUBLE_subtract, CFLOAT_subtract, CDOUBLE_subtract, CLONGDOUBLE_subtract, DATETIME_Mm_M_subtract, TIMEDELTA_mm_m_subtract, DATETIME_MM_m_subtract, NULL}; /* subtract: 21 loop slots; NULL slot 20 lines up with the NPY_OBJECT signature and is presumably assigned in InitOperators (not visible here - TODO confirm) */
+static void * subtract_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL}; /* indexed like subtract_functions; every entry starts as NULL */
+static char subtract_signatures[] = {NPY_BYTE, NPY_BYTE, NPY_BYTE, NPY_UBYTE, NPY_UBYTE, NPY_UBYTE, NPY_SHORT, NPY_SHORT, NPY_SHORT, NPY_USHORT, NPY_USHORT, NPY_USHORT, NPY_INT, NPY_INT, NPY_INT, NPY_UINT, NPY_UINT, NPY_UINT, NPY_LONG, NPY_LONG, NPY_LONG, NPY_ULONG, NPY_ULONG, NPY_ULONG, NPY_LONGLONG, NPY_LONGLONG, NPY_LONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_ULONGLONG, NPY_HALF, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_DATETIME, NPY_TIMEDELTA, NPY_DATETIME, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_TIMEDELTA, NPY_DATETIME, NPY_DATETIME, NPY_TIMEDELTA, NPY_OBJECT, NPY_OBJECT, NPY_OBJECT}; /* 21 triples of type codes, one triple per loop slot; slots 17-19 are the mixed DATETIME/TIMEDELTA triples matching the Mm_M, mm_m and MM_m loops */
+static PyUFuncGenericFunction tan_functions[] = {HALF_tan, FLOAT_tan, DOUBLE_tan, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}; /* tan: 11 loop slots; NULL slots 3-10 are presumably assigned in InitOperators (not visible here - TODO confirm) */
+static void * tan_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"tan"}; /* indexed like tan_functions; slot 10 holds the string "tan" (presumably a method name for an object loop - confirm) */
+static char tan_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT}; /* 11 pairs of type codes, one pair per loop slot; the HALF/FLOAT/DOUBLE pairs of slots 0-2 are repeated for slots 3-5 */
+static PyUFuncGenericFunction tanh_functions[] = {HALF_tanh, FLOAT_tanh, DOUBLE_tanh, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}; /* tanh: 11 loop slots; NULL slots 3-10 are presumably assigned in InitOperators (not visible here - TODO confirm) */
+static void * tanh_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)"tanh"}; /* indexed like tanh_functions; slot 10 holds the string "tanh" (presumably a method name for an object loop - confirm) */
+static char tanh_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_CFLOAT, NPY_CFLOAT, NPY_CDOUBLE, NPY_CDOUBLE, NPY_CLONGDOUBLE, NPY_CLONGDOUBLE, NPY_OBJECT, NPY_OBJECT}; /* 11 pairs of type codes, one pair per loop slot; the HALF/FLOAT/DOUBLE pairs of slots 0-2 are repeated for slots 3-5 */
+static PyUFuncGenericFunction trunc_functions[] = {NULL, FLOAT_trunc, DOUBLE_trunc, NULL, NULL, NULL, NULL}; /* trunc: 7 loop slots; NULL slots 0 and 3-6 are presumably assigned in InitOperators (not visible here - TODO confirm) */
+static void * trunc_data[] = {(void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL, (void *)NULL}; /* indexed like trunc_functions; every entry starts as NULL */
+static char trunc_signatures[] = {NPY_HALF, NPY_HALF, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_FLOAT, NPY_FLOAT, NPY_DOUBLE, NPY_DOUBLE, NPY_LONGDOUBLE, NPY_LONGDOUBLE, NPY_OBJECT, NPY_OBJECT}; /* 7 pairs of type codes, one pair per loop slot; the FLOAT/DOUBLE pairs of slots 1-2 are repeated for slots 3-4 */
+/* Returns a borrowed reference to the second value in the matching info tuple. Prototype only: the definition is not in this part of the file. */
+PyObject *
+get_info_no_cast(PyUFuncObject *ufunc, PyArray_DTypeMeta *op_dtype,
+                 int ndtypes);
+
+static int
+InitOperators(PyObject *dictionary) {
+    PyObject *f, *identity;
+
+    _ones_like_functions[20] = PyUFunc_O_O;
+    _ones_like_data[20] = (void *) Py_get_one;
+    absolute_functions[19] = PyUFunc_O_O;
+    absolute_data[19] = (void *) PyNumber_Absolute;
+    add_functions[21] = PyUFunc_OO_O;
+    add_data[21] = (void *) PyNumber_Add;
+    arccos_functions[3] = PyUFunc_e_e_As_f_f;
+    arccos_data[3] = (void *) npy_acosf;
+    arccos_functions[4] = PyUFunc_f_f;
+    arccos_data[4] = (void *) npy_acosf;
+    arccos_functions[5] = PyUFunc_d_d;
+    arccos_data[5] = (void *) npy_acos;
+    arccos_functions[6] = PyUFunc_g_g;
+    arccos_data[6] = (void *) npy_acosl;
+    arccos_functions[7] = PyUFunc_F_F;
+    arccos_data[7] = (void *) nc_acosf;
+    arccos_functions[8] = PyUFunc_D_D;
+    arccos_data[8] = (void *) nc_acos;
+    arccos_functions[9] = PyUFunc_G_G;
+    arccos_data[9] = (void *) nc_acosl;
+    arccos_functions[10] = PyUFunc_O_O_method;
+    arccosh_functions[3] = PyUFunc_e_e_As_f_f;
+    arccosh_data[3] = (void *) npy_acoshf;
+    arccosh_functions[4] = PyUFunc_f_f;
+    arccosh_data[4] = (void *) npy_acoshf;
+    arccosh_functions[5] = PyUFunc_d_d;
+    arccosh_data[5] = (void *) npy_acosh;
+    arccosh_functions[6] = PyUFunc_g_g;
+    arccosh_data[6] = (void *) npy_acoshl;
+    arccosh_functions[7] = PyUFunc_F_F;
+    arccosh_data[7] = (void *) nc_acoshf;
+    arccosh_functions[8] = PyUFunc_D_D;
+    arccosh_data[8] = (void *) nc_acosh;
+    arccosh_functions[9] = PyUFunc_G_G;
+    arccosh_data[9] = (void *) nc_acoshl;
+    arccosh_functions[10] = PyUFunc_O_O_method;
+    arcsin_functions[3] = PyUFunc_e_e_As_f_f;
+    arcsin_data[3] = (void *) npy_asinf;
+    arcsin_functions[4] = PyUFunc_f_f;
+    arcsin_data[4] = (void *) npy_asinf;
+    arcsin_functions[5] = PyUFunc_d_d;
+    arcsin_data[5] = (void *) npy_asin;
+    arcsin_functions[6] = PyUFunc_g_g;
+    arcsin_data[6] = (void *) npy_asinl;
+    arcsin_functions[7] = PyUFunc_F_F;
+    arcsin_data[7] = (void *) nc_asinf;
+    arcsin_functions[8] = PyUFunc_D_D;
+    arcsin_data[8] = (void *) nc_asin;
+    arcsin_functions[9] = PyUFunc_G_G;
+    arcsin_data[9] = (void *) nc_asinl;
+    arcsin_functions[10] = PyUFunc_O_O_method;
+    arcsinh_functions[3] = PyUFunc_e_e_As_f_f;
+    arcsinh_data[3] = (void *) npy_asinhf;
+    arcsinh_functions[4] = PyUFunc_f_f;
+    arcsinh_data[4] = (void *) npy_asinhf;
+    arcsinh_functions[5] = PyUFunc_d_d;
+    arcsinh_data[5] = (void *) npy_asinh;
+    arcsinh_functions[6] = PyUFunc_g_g;
+    arcsinh_data[6] = (void *) npy_asinhl;
+    arcsinh_functions[7] = PyUFunc_F_F;
+    arcsinh_data[7] = (void *) nc_asinhf;
+    arcsinh_functions[8] = PyUFunc_D_D;
+    arcsinh_data[8] = (void *) nc_asinh;
+    arcsinh_functions[9] = PyUFunc_G_G;
+    arcsinh_data[9] = (void *) nc_asinhl;
+    arcsinh_functions[10] = PyUFunc_O_O_method;
+    arctan_functions[3] = PyUFunc_e_e_As_f_f;
+    arctan_data[3] = (void *) npy_atanf;
+    arctan_functions[4] = PyUFunc_f_f;
+    arctan_data[4] = (void *) npy_atanf;
+    arctan_functions[5] = PyUFunc_d_d;
+    arctan_data[5] = (void *) npy_atan;
+    arctan_functions[6] = PyUFunc_g_g;
+    arctan_data[6] = (void *) npy_atanl;
+    arctan_functions[7] = PyUFunc_F_F;
+    arctan_data[7] = (void *) nc_atanf;
+    arctan_functions[8] = PyUFunc_D_D;
+    arctan_data[8] = (void *) nc_atan;
+    arctan_functions[9] = PyUFunc_G_G;
+    arctan_data[9] = (void *) nc_atanl;
+    arctan_functions[10] = PyUFunc_O_O_method;
+    arctan2_functions[0] = PyUFunc_ee_e_As_ff_f;
+    arctan2_data[0] = (void *) npy_atan2f;
+    arctan2_functions[3] = PyUFunc_gg_g;
+    arctan2_data[3] = (void *) npy_atan2l;
+    arctan2_functions[4] = PyUFunc_OO_O_method;
+    arctanh_functions[3] = PyUFunc_e_e_As_f_f;
+    arctanh_data[3] = (void *) npy_atanhf;
+    arctanh_functions[4] = PyUFunc_f_f;
+    arctanh_data[4] = (void *) npy_atanhf;
+    arctanh_functions[5] = PyUFunc_d_d;
+    arctanh_data[5] = (void *) npy_atanh;
+    arctanh_functions[6] = PyUFunc_g_g;
+    arctanh_data[6] = (void *) npy_atanhl;
+    arctanh_functions[7] = PyUFunc_F_F;
+    arctanh_data[7] = (void *) nc_atanhf;
+    arctanh_functions[8] = PyUFunc_D_D;
+    arctanh_data[8] = (void *) nc_atanh;
+    arctanh_functions[9] = PyUFunc_G_G;
+    arctanh_data[9] = (void *) nc_atanhl;
+    arctanh_functions[10] = PyUFunc_O_O_method;
+    bitwise_and_functions[11] = PyUFunc_OO_O;
+    bitwise_and_data[11] = (void *) PyNumber_And;
+    bitwise_or_functions[11] = PyUFunc_OO_O;
+    bitwise_or_data[11] = (void *) PyNumber_Or;
+    bitwise_xor_functions[11] = PyUFunc_OO_O;
+    bitwise_xor_data[11] = (void *) PyNumber_Xor;
+    cbrt_functions[3] = PyUFunc_e_e_As_f_f;
+    cbrt_data[3] = (void *) npy_cbrtf;
+    cbrt_functions[4] = PyUFunc_f_f;
+    cbrt_data[4] = (void *) npy_cbrtf;
+    cbrt_functions[5] = PyUFunc_d_d;
+    cbrt_data[5] = (void *) npy_cbrt;
+    cbrt_functions[6] = PyUFunc_g_g;
+    cbrt_data[6] = (void *) npy_cbrtl;
+    cbrt_functions[7] = PyUFunc_O_O_method;
+    ceil_functions[0] = PyUFunc_e_e_As_f_f;
+    ceil_data[0] = (void *) npy_ceilf;
+    ceil_functions[3] = PyUFunc_f_f;
+    ceil_data[3] = (void *) npy_ceilf;
+    ceil_functions[4] = PyUFunc_d_d;
+    ceil_data[4] = (void *) npy_ceil;
+    ceil_functions[5] = PyUFunc_g_g;
+    ceil_data[5] = (void *) npy_ceill;
+    ceil_functions[6] = PyUFunc_O_O;
+    ceil_data[6] = (void *) npy_ObjectCeil;
+    clip_functions[20] = PyUFunc_OOO_O;
+    clip_data[20] = (void *) npy_ObjectClip;
+    conjugate_functions[17] = PyUFunc_O_O_method;
+    cos_functions[3] = PyUFunc_g_g;
+    cos_data[3] = (void *) npy_cosl;
+    cos_functions[4] = PyUFunc_F_F;
+    cos_data[4] = (void *) nc_cosf;
+    cos_functions[5] = PyUFunc_D_D;
+    cos_data[5] = (void *) nc_cos;
+    cos_functions[6] = PyUFunc_G_G;
+    cos_data[6] = (void *) nc_cosl;
+    cos_functions[7] = PyUFunc_O_O_method;
+    cosh_functions[3] = PyUFunc_e_e_As_f_f;
+    cosh_data[3] = (void *) npy_coshf;
+    cosh_functions[4] = PyUFunc_f_f;
+    cosh_data[4] = (void *) npy_coshf;
+    cosh_functions[5] = PyUFunc_d_d;
+    cosh_data[5] = (void *) npy_cosh;
+    cosh_functions[6] = PyUFunc_g_g;
+    cosh_data[6] = (void *) npy_coshl;
+    cosh_functions[7] = PyUFunc_F_F;
+    cosh_data[7] = (void *) nc_coshf;
+    cosh_functions[8] = PyUFunc_D_D;
+    cosh_data[8] = (void *) nc_cosh;
+    cosh_functions[9] = PyUFunc_G_G;
+    cosh_data[9] = (void *) nc_coshl;
+    cosh_functions[10] = PyUFunc_O_O_method;
+    deg2rad_functions[0] = PyUFunc_e_e_As_f_f;
+    deg2rad_data[0] = (void *) npy_deg2radf;
+    deg2rad_functions[1] = PyUFunc_f_f;
+    deg2rad_data[1] = (void *) npy_deg2radf;
+    deg2rad_functions[2] = PyUFunc_d_d;
+    deg2rad_data[2] = (void *) npy_deg2rad;
+    deg2rad_functions[3] = PyUFunc_g_g;
+    deg2rad_data[3] = (void *) npy_deg2radl;
+    deg2rad_functions[4] = PyUFunc_O_O_method;
+    degrees_functions[0] = PyUFunc_e_e_As_f_f;
+    degrees_data[0] = (void *) npy_degreesf;
+    degrees_functions[1] = PyUFunc_f_f;
+    degrees_data[1] = (void *) npy_degreesf;
+    degrees_functions[2] = PyUFunc_d_d;
+    degrees_data[2] = (void *) npy_degrees;
+    degrees_functions[3] = PyUFunc_g_g;
+    degrees_data[3] = (void *) npy_degreesl;
+    degrees_functions[4] = PyUFunc_O_O_method;
+    divide_functions[10] = PyUFunc_OO_O;
+    divide_data[10] = (void *) PyNumber_TrueDivide;
+    exp_functions[3] = PyUFunc_f_f;
+    exp_data[3] = (void *) npy_expf;
+    exp_functions[4] = PyUFunc_d_d;
+    exp_data[4] = (void *) npy_exp;
+    exp_functions[5] = PyUFunc_g_g;
+    exp_data[5] = (void *) npy_expl;
+    exp_functions[6] = PyUFunc_F_F;
+    exp_data[6] = (void *) nc_expf;
+    exp_functions[7] = PyUFunc_D_D;
+    exp_data[7] = (void *) nc_exp;
+    exp_functions[8] = PyUFunc_G_G;
+    exp_data[8] = (void *) nc_expl;
+    exp_functions[9] = PyUFunc_O_O_method;
+    exp2_functions[3] = PyUFunc_e_e_As_f_f;
+    exp2_data[3] = (void *) npy_exp2f;
+    exp2_functions[4] = PyUFunc_f_f;
+    exp2_data[4] = (void *) npy_exp2f;
+    exp2_functions[5] = PyUFunc_d_d;
+    exp2_data[5] = (void *) npy_exp2;
+    exp2_functions[6] = PyUFunc_g_g;
+    exp2_data[6] = (void *) npy_exp2l;
+    exp2_functions[7] = PyUFunc_F_F;
+    exp2_data[7] = (void *) nc_exp2f;
+    exp2_functions[8] = PyUFunc_D_D;
+    exp2_data[8] = (void *) nc_exp2;
+    exp2_functions[9] = PyUFunc_G_G;
+    exp2_data[9] = (void *) nc_exp2l;
+    exp2_functions[10] = PyUFunc_O_O_method;
+    expm1_functions[3] = PyUFunc_e_e_As_f_f;
+    expm1_data[3] = (void *) npy_expm1f;
+    expm1_functions[4] = PyUFunc_f_f;
+    expm1_data[4] = (void *) npy_expm1f;
+    expm1_functions[5] = PyUFunc_d_d;
+    expm1_data[5] = (void *) npy_expm1;
+    expm1_functions[6] = PyUFunc_g_g;
+    expm1_data[6] = (void *) npy_expm1l;
+    expm1_functions[7] = PyUFunc_F_F;
+    expm1_data[7] = (void *) nc_expm1f;
+    expm1_functions[8] = PyUFunc_D_D;
+    expm1_data[8] = (void *) nc_expm1;
+    expm1_functions[9] = PyUFunc_G_G;
+    expm1_data[9] = (void *) nc_expm1l;
+    expm1_functions[10] = PyUFunc_O_O_method;
+    fabs_functions[0] = PyUFunc_e_e_As_f_f;
+    fabs_data[0] = (void *) npy_fabsf;
+    fabs_functions[1] = PyUFunc_f_f;
+    fabs_data[1] = (void *) npy_fabsf;
+    fabs_functions[2] = PyUFunc_d_d;
+    fabs_data[2] = (void *) npy_fabs;
+    fabs_functions[3] = PyUFunc_g_g;
+    fabs_data[3] = (void *) npy_fabsl;
+    fabs_functions[4] = PyUFunc_O_O_method;
+    float_power_functions[0] = PyUFunc_dd_d;
+    float_power_data[0] = (void *) npy_pow;
+    float_power_functions[1] = PyUFunc_gg_g;
+    float_power_data[1] = (void *) npy_powl;
+    float_power_functions[2] = PyUFunc_DD_D;
+    float_power_data[2] = (void *) nc_pow;
+    float_power_functions[3] = PyUFunc_GG_G;
+    float_power_data[3] = (void *) nc_powl;
+    floor_functions[0] = PyUFunc_e_e_As_f_f;
+    floor_data[0] = (void *) npy_floorf;
+    floor_functions[3] = PyUFunc_f_f;
+    floor_data[3] = (void *) npy_floorf;
+    floor_functions[4] = PyUFunc_d_d;
+    floor_data[4] = (void *) npy_floor;
+    floor_functions[5] = PyUFunc_g_g;
+    floor_data[5] = (void *) npy_floorl;
+    floor_functions[6] = PyUFunc_O_O;
+    floor_data[6] = (void *) npy_ObjectFloor;
+    floor_divide_functions[17] = PyUFunc_OO_O;
+    floor_divide_data[17] = (void *) PyNumber_FloorDivide;
+    fmax_functions[20] = PyUFunc_OO_O;
+    fmax_data[20] = (void *) npy_ObjectMax;
+    fmin_functions[20] = PyUFunc_OO_O;
+    fmin_data[20] = (void *) npy_ObjectMin;
+    fmod_functions[10] = PyUFunc_ee_e_As_ff_f;
+    fmod_data[10] = (void *) npy_fmodf;
+    fmod_functions[11] = PyUFunc_ff_f;
+    fmod_data[11] = (void *) npy_fmodf;
+    fmod_functions[12] = PyUFunc_dd_d;
+    fmod_data[12] = (void *) npy_fmod;
+    fmod_functions[13] = PyUFunc_gg_g;
+    fmod_data[13] = (void *) npy_fmodl;
+    fmod_functions[14] = PyUFunc_OO_O_method;
+    gcd_functions[10] = PyUFunc_OO_O;
+    gcd_data[10] = (void *) npy_ObjectGCD;
+    heaviside_functions[0] = PyUFunc_ee_e_As_ff_f;
+    heaviside_data[0] = (void *) npy_heavisidef;
+    heaviside_functions[1] = PyUFunc_ff_f;
+    heaviside_data[1] = (void *) npy_heavisidef;
+    heaviside_functions[2] = PyUFunc_dd_d;
+    heaviside_data[2] = (void *) npy_heaviside;
+    heaviside_functions[3] = PyUFunc_gg_g;
+    heaviside_data[3] = (void *) npy_heavisidel;
+    hypot_functions[0] = PyUFunc_ee_e_As_ff_f;
+    hypot_data[0] = (void *) npy_hypotf;
+    hypot_functions[1] = PyUFunc_ff_f;
+    hypot_data[1] = (void *) npy_hypotf;
+    hypot_functions[2] = PyUFunc_dd_d;
+    hypot_data[2] = (void *) npy_hypot;
+    hypot_functions[3] = PyUFunc_gg_g;
+    hypot_data[3] = (void *) npy_hypotl;
+    hypot_functions[4] = PyUFunc_OO_O_method;
+    invert_functions[11] = PyUFunc_O_O;
+    invert_data[11] = (void *) PyNumber_Invert;
+    lcm_functions[10] = PyUFunc_OO_O;
+    lcm_data[10] = (void *) npy_ObjectLCM;
+    left_shift_functions[10] = PyUFunc_OO_O;
+    left_shift_data[10] = (void *) PyNumber_Lshift;
+    log_functions[3] = PyUFunc_f_f;
+    log_data[3] = (void *) npy_logf;
+    log_functions[4] = PyUFunc_d_d;
+    log_data[4] = (void *) npy_log;
+    log_functions[5] = PyUFunc_g_g;
+    log_data[5] = (void *) npy_logl;
+    log_functions[6] = PyUFunc_F_F;
+    log_data[6] = (void *) nc_logf;
+    log_functions[7] = PyUFunc_D_D;
+    log_data[7] = (void *) nc_log;
+    log_functions[8] = PyUFunc_G_G;
+    log_data[8] = (void *) nc_logl;
+    log_functions[9] = PyUFunc_O_O_method;
+    log10_functions[3] = PyUFunc_e_e_As_f_f;
+    log10_data[3] = (void *) npy_log10f;
+    log10_functions[4] = PyUFunc_f_f;
+    log10_data[4] = (void *) npy_log10f;
+    log10_functions[5] = PyUFunc_d_d;
+    log10_data[5] = (void *) npy_log10;
+    log10_functions[6] = PyUFunc_g_g;
+    log10_data[6] = (void *) npy_log10l;
+    log10_functions[7] = PyUFunc_F_F;
+    log10_data[7] = (void *) nc_log10f;
+    log10_functions[8] = PyUFunc_D_D;
+    log10_data[8] = (void *) nc_log10;
+    log10_functions[9] = PyUFunc_G_G;
+    log10_data[9] = (void *) nc_log10l;
+    log10_functions[10] = PyUFunc_O_O_method;
+    log1p_functions[3] = PyUFunc_e_e_As_f_f;
+    log1p_data[3] = (void *) npy_log1pf;
+    log1p_functions[4] = PyUFunc_f_f;
+    log1p_data[4] = (void *) npy_log1pf;
+    log1p_functions[5] = PyUFunc_d_d;
+    log1p_data[5] = (void *) npy_log1p;
+    log1p_functions[6] = PyUFunc_g_g;
+    log1p_data[6] = (void *) npy_log1pl;
+    log1p_functions[7] = PyUFunc_F_F;
+    log1p_data[7] = (void *) nc_log1pf;
+    log1p_functions[8] = PyUFunc_D_D;
+    log1p_data[8] = (void *) nc_log1p;
+    log1p_functions[9] = PyUFunc_G_G;
+    log1p_data[9] = (void *) nc_log1pl;
+    log1p_functions[10] = PyUFunc_O_O_method;
+    log2_functions[3] = PyUFunc_e_e_As_f_f;
+    log2_data[3] = (void *) npy_log2f;
+    log2_functions[4] = PyUFunc_f_f;
+    log2_data[4] = (void *) npy_log2f;
+    log2_functions[5] = PyUFunc_d_d;
+    log2_data[5] = (void *) npy_log2;
+    log2_functions[6] = PyUFunc_g_g;
+    log2_data[6] = (void *) npy_log2l;
+    log2_functions[7] = PyUFunc_F_F;
+    log2_data[7] = (void *) nc_log2f;
+    log2_functions[8] = PyUFunc_D_D;
+    log2_data[8] = (void *) nc_log2;
+    log2_functions[9] = PyUFunc_G_G;
+    log2_data[9] = (void *) nc_log2l;
+    log2_functions[10] = PyUFunc_O_O_method;
+    logaddexp_functions[0] = PyUFunc_ee_e_As_ff_f;
+    logaddexp_data[0] = (void *) npy_logaddexpf;
+    logaddexp_functions[1] = PyUFunc_ff_f;
+    logaddexp_data[1] = (void *) npy_logaddexpf;
+    logaddexp_functions[2] = PyUFunc_dd_d;
+    logaddexp_data[2] = (void *) npy_logaddexp;
+    logaddexp_functions[3] = PyUFunc_gg_g;
+    logaddexp_data[3] = (void *) npy_logaddexpl;
+    logaddexp2_functions[0] = PyUFunc_ee_e_As_ff_f;
+    logaddexp2_data[0] = (void *) npy_logaddexp2f;
+    logaddexp2_functions[1] = PyUFunc_ff_f;
+    logaddexp2_data[1] = (void *) npy_logaddexp2f;
+    logaddexp2_functions[2] = PyUFunc_dd_d;
+    logaddexp2_data[2] = (void *) npy_logaddexp2;
+    logaddexp2_functions[3] = PyUFunc_gg_g;
+    logaddexp2_data[3] = (void *) npy_logaddexp2l;
+    logical_and_functions[18] = PyUFunc_OO_O;
+    logical_and_data[18] = (void *) npy_ObjectLogicalAnd;
+    logical_not_functions[18] = PyUFunc_O_O;
+    logical_not_data[18] = (void *) npy_ObjectLogicalNot;
+    logical_or_functions[18] = PyUFunc_OO_O;
+    logical_or_data[18] = (void *) npy_ObjectLogicalOr;
+    logical_xor_functions[18] = PyUFunc_OO_O_method;
+    maximum_functions[20] = PyUFunc_OO_O;
+    maximum_data[20] = (void *) npy_ObjectMax;
+    minimum_functions[20] = PyUFunc_OO_O;
+    minimum_data[20] = (void *) npy_ObjectMin;
+    multiply_functions[22] = PyUFunc_OO_O;
+    multiply_data[22] = (void *) PyNumber_Multiply;
+    negative_functions[15] = PyUFunc_F_F;
+    negative_data[15] = (void *) nc_negf;
+    negative_functions[16] = PyUFunc_D_D;
+    negative_data[16] = (void *) nc_neg;
+    negative_functions[17] = PyUFunc_G_G;
+    negative_data[17] = (void *) nc_negl;
+    negative_functions[18] = PyUFunc_O_O;
+    negative_data[18] = (void *) PyNumber_Negative;
+    positive_functions[15] = PyUFunc_F_F;
+    positive_data[15] = (void *) nc_posf;
+    positive_functions[16] = PyUFunc_D_D;
+    positive_data[16] = (void *) nc_pos;
+    positive_functions[17] = PyUFunc_G_G;
+    positive_data[17] = (void *) nc_posl;
+    positive_functions[18] = PyUFunc_O_O;
+    positive_data[18] = (void *) PyNumber_Positive;
+    power_functions[10] = PyUFunc_ee_e_As_ff_f;
+    power_data[10] = (void *) npy_powf;
+    power_functions[13] = PyUFunc_ee_e_As_ff_f;
+    power_data[13] = (void *) npy_powf;
+    power_functions[14] = PyUFunc_ff_f;
+    power_data[14] = (void *) npy_powf;
+    power_functions[15] = PyUFunc_dd_d;
+    power_data[15] = (void *) npy_pow;
+    power_functions[16] = PyUFunc_gg_g;
+    power_data[16] = (void *) npy_powl;
+    power_functions[17] = PyUFunc_FF_F;
+    power_data[17] = (void *) nc_powf;
+    power_functions[18] = PyUFunc_DD_D;
+    power_data[18] = (void *) nc_pow;
+    power_functions[19] = PyUFunc_GG_G;
+    power_data[19] = (void *) nc_powl;
+    power_functions[20] = PyUFunc_OO_O;
+    power_data[20] = (void *) npy_ObjectPower;
+    rad2deg_functions[0] = PyUFunc_e_e_As_f_f;
+    rad2deg_data[0] = (void *) npy_rad2degf;
+    rad2deg_functions[1] = PyUFunc_f_f;
+    rad2deg_data[1] = (void *) npy_rad2degf;
+    rad2deg_functions[2] = PyUFunc_d_d;
+    rad2deg_data[2] = (void *) npy_rad2deg;
+    rad2deg_functions[3] = PyUFunc_g_g;
+    rad2deg_data[3] = (void *) npy_rad2degl;
+    rad2deg_functions[4] = PyUFunc_O_O_method;
+    radians_functions[0] = PyUFunc_e_e_As_f_f;
+    radians_data[0] = (void *) npy_radiansf;
+    radians_functions[1] = PyUFunc_f_f;
+    radians_data[1] = (void *) npy_radiansf;
+    radians_functions[2] = PyUFunc_d_d;
+    radians_data[2] = (void *) npy_radians;
+    radians_functions[3] = PyUFunc_g_g;
+    radians_data[3] = (void *) npy_radiansl;
+    radians_functions[4] = PyUFunc_O_O_method;
+    reciprocal_functions[17] = PyUFunc_O_O;
+    reciprocal_data[17] = (void *) Py_reciprocal;
+    remainder_functions[15] = PyUFunc_OO_O;
+    remainder_data[15] = (void *) PyNumber_Remainder;
+    right_shift_functions[10] = PyUFunc_OO_O;
+    right_shift_data[10] = (void *) PyNumber_Rshift;
+    rint_functions[0] = PyUFunc_e_e_As_f_f;
+    rint_data[0] = (void *) npy_rintf;
+    rint_functions[3] = PyUFunc_f_f;
+    rint_data[3] = (void *) npy_rintf;
+    rint_functions[4] = PyUFunc_d_d;
+    rint_data[4] = (void *) npy_rint;
+    rint_functions[5] = PyUFunc_g_g;
+    rint_data[5] = (void *) npy_rintl;
+    rint_functions[6] = PyUFunc_F_F;
+    rint_data[6] = (void *) nc_rintf;
+    rint_functions[7] = PyUFunc_D_D;
+    rint_data[7] = (void *) nc_rint;
+    rint_functions[8] = PyUFunc_G_G;
+    rint_data[8] = (void *) nc_rintl;
+    rint_functions[9] = PyUFunc_O_O_method;
+    sin_functions[3] = PyUFunc_g_g;
+    sin_data[3] = (void *) npy_sinl;
+    sin_functions[4] = PyUFunc_F_F;
+    sin_data[4] = (void *) nc_sinf;
+    sin_functions[5] = PyUFunc_D_D;
+    sin_data[5] = (void *) nc_sin;
+    sin_functions[6] = PyUFunc_G_G;
+    sin_data[6] = (void *) nc_sinl;
+    sin_functions[7] = PyUFunc_O_O_method;
+    sinh_functions[3] = PyUFunc_e_e_As_f_f;
+    sinh_data[3] = (void *) npy_sinhf;
+    sinh_functions[4] = PyUFunc_f_f;
+    sinh_data[4] = (void *) npy_sinhf;
+    sinh_functions[5] = PyUFunc_d_d;
+    sinh_data[5] = (void *) npy_sinh;
+    sinh_functions[6] = PyUFunc_g_g;
+    sinh_data[6] = (void *) npy_sinhl;
+    sinh_functions[7] = PyUFunc_F_F;
+    sinh_data[7] = (void *) nc_sinhf;
+    sinh_functions[8] = PyUFunc_D_D;
+    sinh_data[8] = (void *) nc_sinh;
+    sinh_functions[9] = PyUFunc_G_G;
+    sinh_data[9] = (void *) nc_sinhl;
+    sinh_functions[10] = PyUFunc_O_O_method;
+    sqrt_functions[0] = PyUFunc_e_e_As_f_f;
+    sqrt_data[0] = (void *) npy_sqrtf;
+    sqrt_functions[3] = PyUFunc_f_f;
+    sqrt_data[3] = (void *) npy_sqrtf;
+    sqrt_functions[4] = PyUFunc_d_d;
+    sqrt_data[4] = (void *) npy_sqrt;
+    sqrt_functions[5] = PyUFunc_g_g;
+    sqrt_data[5] = (void *) npy_sqrtl;
+    sqrt_functions[6] = PyUFunc_F_F;
+    sqrt_data[6] = (void *) nc_sqrtf;
+    sqrt_functions[7] = PyUFunc_D_D;
+    sqrt_data[7] = (void *) nc_sqrt;
+    sqrt_functions[8] = PyUFunc_G_G;
+    sqrt_data[8] = (void *) nc_sqrtl;
+    sqrt_functions[9] = PyUFunc_O_O_method;
+    square_functions[17] = PyUFunc_O_O;
+    square_data[17] = (void *) Py_square;
+    subtract_functions[20] = PyUFunc_OO_O;
+    subtract_data[20] = (void *) PyNumber_Subtract;
+    tan_functions[3] = PyUFunc_e_e_As_f_f;
+    tan_data[3] = (void *) npy_tanf;
+    tan_functions[4] = PyUFunc_f_f;
+    tan_data[4] = (void *) npy_tanf;
+    tan_functions[5] = PyUFunc_d_d;
+    tan_data[5] = (void *) npy_tan;
+    tan_functions[6] = PyUFunc_g_g;
+    tan_data[6] = (void *) npy_tanl;
+    tan_functions[7] = PyUFunc_F_F;
+    tan_data[7] = (void *) nc_tanf;
+    tan_functions[8] = PyUFunc_D_D;
+    tan_data[8] = (void *) nc_tan;
+    tan_functions[9] = PyUFunc_G_G;
+    tan_data[9] = (void *) nc_tanl;
+    tan_functions[10] = PyUFunc_O_O_method;
+    tanh_functions[3] = PyUFunc_e_e_As_f_f;
+    tanh_data[3] = (void *) npy_tanhf;
+    tanh_functions[4] = PyUFunc_f_f;
+    tanh_data[4] = (void *) npy_tanhf;
+    tanh_functions[5] = PyUFunc_d_d;
+    tanh_data[5] = (void *) npy_tanh;
+    tanh_functions[6] = PyUFunc_g_g;
+    tanh_data[6] = (void *) npy_tanhl;
+    tanh_functions[7] = PyUFunc_F_F;
+    tanh_data[7] = (void *) nc_tanhf;
+    tanh_functions[8] = PyUFunc_D_D;
+    tanh_data[8] = (void *) nc_tanh;
+    tanh_functions[9] = PyUFunc_G_G;
+    tanh_data[9] = (void *) nc_tanhl;
+    tanh_functions[10] = PyUFunc_O_O_method;
+    trunc_functions[0] = PyUFunc_e_e_As_f_f;
+    trunc_data[0] = (void *) npy_truncf;
+    trunc_functions[3] = PyUFunc_f_f;
+    trunc_data[3] = (void *) npy_truncf;
+    trunc_functions[4] = PyUFunc_d_d;
+    trunc_data[4] = (void *) npy_trunc;
+    trunc_functions[5] = PyUFunc_g_g;
+    trunc_data[5] = (void *) npy_truncl;
+    trunc_functions[6] = PyUFunc_O_O;
+    trunc_data[6] = (void *) npy_ObjectTrunc;
+    
+    #ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_logical.dispatch.h"
+    #endif
+    
+    NPY_CPU_DISPATCH_CALL_XB(absolute_functions[0] = BOOL_absolute);
+    
+    NPY_CPU_DISPATCH_CALL_XB(add_functions[0] = BOOL_logical_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_and_functions[0] = BOOL_logical_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_or_functions[0] = BOOL_logical_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmax_functions[0] = BOOL_logical_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmin_functions[0] = BOOL_logical_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(invert_functions[0] = BOOL_logical_not);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_and_functions[0] = BOOL_logical_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_not_functions[0] = BOOL_logical_not);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_or_functions[0] = BOOL_logical_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(maximum_functions[0] = BOOL_logical_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(minimum_functions[0] = BOOL_logical_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(multiply_functions[0] = BOOL_logical_and);
+    
+    
+    #ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_autovec.dispatch.h"
+    #endif
+    
+    NPY_CPU_DISPATCH_CALL_XB(absolute_functions[1] = BYTE_absolute);
+    
+    NPY_CPU_DISPATCH_CALL_XB(absolute_functions[2] = UBYTE_absolute);
+    
+    NPY_CPU_DISPATCH_CALL_XB(absolute_functions[3] = SHORT_absolute);
+    
+    NPY_CPU_DISPATCH_CALL_XB(absolute_functions[4] = USHORT_absolute);
+    
+    NPY_CPU_DISPATCH_CALL_XB(absolute_functions[5] = INT_absolute);
+    
+    NPY_CPU_DISPATCH_CALL_XB(absolute_functions[6] = UINT_absolute);
+    
+    NPY_CPU_DISPATCH_CALL_XB(absolute_functions[7] = LONG_absolute);
+    
+    NPY_CPU_DISPATCH_CALL_XB(absolute_functions[8] = ULONG_absolute);
+    
+    NPY_CPU_DISPATCH_CALL_XB(absolute_functions[9] = LONGLONG_absolute);
+    
+    NPY_CPU_DISPATCH_CALL_XB(absolute_functions[10] = ULONGLONG_absolute);
+    
+    NPY_CPU_DISPATCH_CALL_XB(absolute_functions[11] = HALF_absolute);
+    
+    NPY_CPU_DISPATCH_CALL_XB(add_functions[1] = BYTE_add);
+    
+    NPY_CPU_DISPATCH_CALL_XB(add_functions[2] = UBYTE_add);
+    
+    NPY_CPU_DISPATCH_CALL_XB(add_functions[3] = SHORT_add);
+    
+    NPY_CPU_DISPATCH_CALL_XB(add_functions[4] = USHORT_add);
+    
+    NPY_CPU_DISPATCH_CALL_XB(add_functions[5] = INT_add);
+    
+    NPY_CPU_DISPATCH_CALL_XB(add_functions[6] = UINT_add);
+    
+    NPY_CPU_DISPATCH_CALL_XB(add_functions[7] = LONG_add);
+    
+    NPY_CPU_DISPATCH_CALL_XB(add_functions[8] = ULONG_add);
+    
+    NPY_CPU_DISPATCH_CALL_XB(add_functions[9] = LONGLONG_add);
+    
+    NPY_CPU_DISPATCH_CALL_XB(add_functions[10] = ULONGLONG_add);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_and_functions[1] = BYTE_bitwise_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_and_functions[2] = UBYTE_bitwise_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_and_functions[3] = SHORT_bitwise_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_and_functions[4] = USHORT_bitwise_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_and_functions[5] = INT_bitwise_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_and_functions[6] = UINT_bitwise_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_and_functions[7] = LONG_bitwise_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_and_functions[8] = ULONG_bitwise_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_and_functions[9] = LONGLONG_bitwise_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_and_functions[10] = ULONGLONG_bitwise_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_or_functions[1] = BYTE_bitwise_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_or_functions[2] = UBYTE_bitwise_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_or_functions[3] = SHORT_bitwise_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_or_functions[4] = USHORT_bitwise_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_or_functions[5] = INT_bitwise_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_or_functions[6] = UINT_bitwise_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_or_functions[7] = LONG_bitwise_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_or_functions[8] = ULONG_bitwise_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_or_functions[9] = LONGLONG_bitwise_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_or_functions[10] = ULONGLONG_bitwise_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_xor_functions[1] = BYTE_bitwise_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_xor_functions[2] = UBYTE_bitwise_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_xor_functions[3] = SHORT_bitwise_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_xor_functions[4] = USHORT_bitwise_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_xor_functions[5] = INT_bitwise_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_xor_functions[6] = UINT_bitwise_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_xor_functions[7] = LONG_bitwise_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_xor_functions[8] = ULONG_bitwise_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_xor_functions[9] = LONGLONG_bitwise_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_xor_functions[10] = ULONGLONG_bitwise_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(conjugate_functions[0] = BYTE_conjugate);
+    
+    NPY_CPU_DISPATCH_CALL_XB(conjugate_functions[1] = UBYTE_conjugate);
+    
+    NPY_CPU_DISPATCH_CALL_XB(conjugate_functions[2] = SHORT_conjugate);
+    
+    NPY_CPU_DISPATCH_CALL_XB(conjugate_functions[3] = USHORT_conjugate);
+    
+    NPY_CPU_DISPATCH_CALL_XB(conjugate_functions[4] = INT_conjugate);
+    
+    NPY_CPU_DISPATCH_CALL_XB(conjugate_functions[5] = UINT_conjugate);
+    
+    NPY_CPU_DISPATCH_CALL_XB(conjugate_functions[6] = LONG_conjugate);
+    
+    NPY_CPU_DISPATCH_CALL_XB(conjugate_functions[7] = ULONG_conjugate);
+    
+    NPY_CPU_DISPATCH_CALL_XB(conjugate_functions[8] = LONGLONG_conjugate);
+    
+    NPY_CPU_DISPATCH_CALL_XB(conjugate_functions[9] = ULONGLONG_conjugate);
+    
+    NPY_CPU_DISPATCH_CALL_XB(invert_functions[1] = BYTE_invert);
+    
+    NPY_CPU_DISPATCH_CALL_XB(invert_functions[2] = UBYTE_invert);
+    
+    NPY_CPU_DISPATCH_CALL_XB(invert_functions[3] = SHORT_invert);
+    
+    NPY_CPU_DISPATCH_CALL_XB(invert_functions[4] = USHORT_invert);
+    
+    NPY_CPU_DISPATCH_CALL_XB(invert_functions[5] = INT_invert);
+    
+    NPY_CPU_DISPATCH_CALL_XB(invert_functions[6] = UINT_invert);
+    
+    NPY_CPU_DISPATCH_CALL_XB(invert_functions[7] = LONG_invert);
+    
+    NPY_CPU_DISPATCH_CALL_XB(invert_functions[8] = ULONG_invert);
+    
+    NPY_CPU_DISPATCH_CALL_XB(invert_functions[9] = LONGLONG_invert);
+    
+    NPY_CPU_DISPATCH_CALL_XB(invert_functions[10] = ULONGLONG_invert);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isfinite_functions[0] = BOOL_isfinite);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isfinite_functions[1] = BYTE_isfinite);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isfinite_functions[2] = UBYTE_isfinite);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isfinite_functions[3] = SHORT_isfinite);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isfinite_functions[4] = USHORT_isfinite);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isfinite_functions[5] = INT_isfinite);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isfinite_functions[6] = UINT_isfinite);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isfinite_functions[7] = LONG_isfinite);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isfinite_functions[8] = ULONG_isfinite);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isfinite_functions[9] = LONGLONG_isfinite);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isfinite_functions[10] = ULONGLONG_isfinite);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isinf_functions[0] = BOOL_isinf);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isinf_functions[1] = BYTE_isinf);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isinf_functions[2] = UBYTE_isinf);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isinf_functions[3] = SHORT_isinf);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isinf_functions[4] = USHORT_isinf);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isinf_functions[5] = INT_isinf);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isinf_functions[6] = UINT_isinf);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isinf_functions[7] = LONG_isinf);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isinf_functions[8] = ULONG_isinf);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isinf_functions[9] = LONGLONG_isinf);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isinf_functions[10] = ULONGLONG_isinf);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isinf_functions[18] = TIMEDELTA_isinf);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isinf_functions[19] = DATETIME_isinf);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isnan_functions[0] = BOOL_isnan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isnan_functions[1] = BYTE_isnan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isnan_functions[2] = UBYTE_isnan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isnan_functions[3] = SHORT_isnan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isnan_functions[4] = USHORT_isnan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isnan_functions[5] = INT_isnan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isnan_functions[6] = UINT_isnan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isnan_functions[7] = LONG_isnan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isnan_functions[8] = ULONG_isnan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isnan_functions[9] = LONGLONG_isnan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isnan_functions[10] = ULONGLONG_isnan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(left_shift_functions[0] = BYTE_left_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(left_shift_functions[1] = UBYTE_left_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(left_shift_functions[2] = SHORT_left_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(left_shift_functions[3] = USHORT_left_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(left_shift_functions[4] = INT_left_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(left_shift_functions[5] = UINT_left_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(left_shift_functions[6] = LONG_left_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(left_shift_functions[7] = ULONG_left_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(left_shift_functions[8] = LONGLONG_left_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(left_shift_functions[9] = ULONGLONG_left_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_and_functions[1] = BYTE_logical_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_and_functions[2] = UBYTE_logical_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_and_functions[3] = SHORT_logical_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_and_functions[4] = USHORT_logical_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_and_functions[5] = INT_logical_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_and_functions[6] = UINT_logical_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_and_functions[7] = LONG_logical_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_and_functions[8] = ULONG_logical_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_and_functions[9] = LONGLONG_logical_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_and_functions[10] = ULONGLONG_logical_and);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_not_functions[1] = BYTE_logical_not);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_not_functions[2] = UBYTE_logical_not);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_not_functions[3] = SHORT_logical_not);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_not_functions[4] = USHORT_logical_not);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_not_functions[5] = INT_logical_not);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_not_functions[6] = UINT_logical_not);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_not_functions[7] = LONG_logical_not);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_not_functions[8] = ULONG_logical_not);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_not_functions[9] = LONGLONG_logical_not);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_not_functions[10] = ULONGLONG_logical_not);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_or_functions[1] = BYTE_logical_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_or_functions[2] = UBYTE_logical_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_or_functions[3] = SHORT_logical_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_or_functions[4] = USHORT_logical_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_or_functions[5] = INT_logical_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_or_functions[6] = UINT_logical_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_or_functions[7] = LONG_logical_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_or_functions[8] = ULONG_logical_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_or_functions[9] = LONGLONG_logical_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_or_functions[10] = ULONGLONG_logical_or);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_xor_functions[1] = BYTE_logical_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_xor_functions[2] = UBYTE_logical_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_xor_functions[3] = SHORT_logical_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_xor_functions[4] = USHORT_logical_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_xor_functions[5] = INT_logical_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_xor_functions[6] = UINT_logical_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_xor_functions[7] = LONG_logical_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_xor_functions[8] = ULONG_logical_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_xor_functions[9] = LONGLONG_logical_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_xor_functions[10] = ULONGLONG_logical_xor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(multiply_functions[1] = BYTE_multiply);
+    
+    NPY_CPU_DISPATCH_CALL_XB(multiply_functions[2] = UBYTE_multiply);
+    
+    NPY_CPU_DISPATCH_CALL_XB(multiply_functions[3] = SHORT_multiply);
+    
+    NPY_CPU_DISPATCH_CALL_XB(multiply_functions[4] = USHORT_multiply);
+    
+    NPY_CPU_DISPATCH_CALL_XB(multiply_functions[5] = INT_multiply);
+    
+    NPY_CPU_DISPATCH_CALL_XB(multiply_functions[6] = UINT_multiply);
+    
+    NPY_CPU_DISPATCH_CALL_XB(multiply_functions[7] = LONG_multiply);
+    
+    NPY_CPU_DISPATCH_CALL_XB(multiply_functions[8] = ULONG_multiply);
+    
+    NPY_CPU_DISPATCH_CALL_XB(multiply_functions[9] = LONGLONG_multiply);
+    
+    NPY_CPU_DISPATCH_CALL_XB(multiply_functions[10] = ULONGLONG_multiply);
+    
+    NPY_CPU_DISPATCH_CALL_XB(reciprocal_functions[0] = BYTE_reciprocal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(reciprocal_functions[1] = UBYTE_reciprocal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(reciprocal_functions[2] = SHORT_reciprocal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(reciprocal_functions[3] = USHORT_reciprocal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(reciprocal_functions[4] = INT_reciprocal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(reciprocal_functions[5] = UINT_reciprocal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(reciprocal_functions[6] = LONG_reciprocal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(reciprocal_functions[7] = ULONG_reciprocal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(reciprocal_functions[8] = LONGLONG_reciprocal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(reciprocal_functions[9] = ULONGLONG_reciprocal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(right_shift_functions[0] = BYTE_right_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(right_shift_functions[1] = UBYTE_right_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(right_shift_functions[2] = SHORT_right_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(right_shift_functions[3] = USHORT_right_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(right_shift_functions[4] = INT_right_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(right_shift_functions[5] = UINT_right_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(right_shift_functions[6] = LONG_right_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(right_shift_functions[7] = ULONG_right_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(right_shift_functions[8] = LONGLONG_right_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(right_shift_functions[9] = ULONGLONG_right_shift);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sign_functions[0] = BYTE_sign);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sign_functions[1] = UBYTE_sign);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sign_functions[2] = SHORT_sign);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sign_functions[3] = USHORT_sign);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sign_functions[4] = INT_sign);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sign_functions[5] = UINT_sign);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sign_functions[6] = LONG_sign);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sign_functions[7] = ULONG_sign);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sign_functions[8] = LONGLONG_sign);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sign_functions[9] = ULONGLONG_sign);
+    
+    NPY_CPU_DISPATCH_CALL_XB(square_functions[0] = BYTE_square);
+    
+    NPY_CPU_DISPATCH_CALL_XB(square_functions[1] = UBYTE_square);
+    
+    NPY_CPU_DISPATCH_CALL_XB(square_functions[2] = SHORT_square);
+    
+    NPY_CPU_DISPATCH_CALL_XB(square_functions[3] = USHORT_square);
+    
+    NPY_CPU_DISPATCH_CALL_XB(square_functions[4] = INT_square);
+    
+    NPY_CPU_DISPATCH_CALL_XB(square_functions[5] = UINT_square);
+    
+    NPY_CPU_DISPATCH_CALL_XB(square_functions[6] = LONG_square);
+    
+    NPY_CPU_DISPATCH_CALL_XB(square_functions[7] = ULONG_square);
+    
+    NPY_CPU_DISPATCH_CALL_XB(square_functions[8] = LONGLONG_square);
+    
+    NPY_CPU_DISPATCH_CALL_XB(square_functions[9] = ULONGLONG_square);
+    
+    NPY_CPU_DISPATCH_CALL_XB(subtract_functions[0] = BYTE_subtract);
+    
+    NPY_CPU_DISPATCH_CALL_XB(subtract_functions[1] = UBYTE_subtract);
+    
+    NPY_CPU_DISPATCH_CALL_XB(subtract_functions[2] = SHORT_subtract);
+    
+    NPY_CPU_DISPATCH_CALL_XB(subtract_functions[3] = USHORT_subtract);
+    
+    NPY_CPU_DISPATCH_CALL_XB(subtract_functions[4] = INT_subtract);
+    
+    NPY_CPU_DISPATCH_CALL_XB(subtract_functions[5] = UINT_subtract);
+    
+    NPY_CPU_DISPATCH_CALL_XB(subtract_functions[6] = LONG_subtract);
+    
+    NPY_CPU_DISPATCH_CALL_XB(subtract_functions[7] = ULONG_subtract);
+    
+    NPY_CPU_DISPATCH_CALL_XB(subtract_functions[8] = LONGLONG_subtract);
+    
+    NPY_CPU_DISPATCH_CALL_XB(subtract_functions[9] = ULONGLONG_subtract);
+    
+    
+    #ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary_fp.dispatch.h"
+    #endif
+    
+    NPY_CPU_DISPATCH_CALL_XB(absolute_functions[12] = FLOAT_absolute);
+    
+    NPY_CPU_DISPATCH_CALL_XB(absolute_functions[13] = DOUBLE_absolute);
+    
+    NPY_CPU_DISPATCH_CALL_XB(ceil_functions[1] = FLOAT_ceil);
+    
+    NPY_CPU_DISPATCH_CALL_XB(ceil_functions[2] = DOUBLE_ceil);
+    
+    NPY_CPU_DISPATCH_CALL_XB(floor_functions[1] = FLOAT_floor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(floor_functions[2] = DOUBLE_floor);
+    
+    NPY_CPU_DISPATCH_CALL_XB(reciprocal_functions[11] = FLOAT_reciprocal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(reciprocal_functions[12] = DOUBLE_reciprocal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(rint_functions[1] = FLOAT_rint);
+    
+    NPY_CPU_DISPATCH_CALL_XB(rint_functions[2] = DOUBLE_rint);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sqrt_functions[1] = FLOAT_sqrt);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sqrt_functions[2] = DOUBLE_sqrt);
+    
+    NPY_CPU_DISPATCH_CALL_XB(square_functions[11] = FLOAT_square);
+    
+    NPY_CPU_DISPATCH_CALL_XB(square_functions[12] = DOUBLE_square);
+    
+    NPY_CPU_DISPATCH_CALL_XB(trunc_functions[1] = FLOAT_trunc);
+    
+    NPY_CPU_DISPATCH_CALL_XB(trunc_functions[2] = DOUBLE_trunc);
+    
+    
+    #ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary_complex.dispatch.h"
+    #endif
+    
+    NPY_CPU_DISPATCH_CALL_XB(absolute_functions[16] = CFLOAT_absolute);
+    
+    NPY_CPU_DISPATCH_CALL_XB(absolute_functions[17] = CDOUBLE_absolute);
+    
+    
+    #ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_arithm_fp.dispatch.h"
+    #endif
+    
+    NPY_CPU_DISPATCH_CALL_XB(add_functions[12] = FLOAT_add);
+    
+    NPY_CPU_DISPATCH_CALL_XB(add_functions[13] = DOUBLE_add);
+    
+    NPY_CPU_DISPATCH_CALL_XB(add_functions[15] = CFLOAT_add);
+    
+    NPY_CPU_DISPATCH_CALL_XB(add_functions[16] = CDOUBLE_add);
+    
+    NPY_CPU_DISPATCH_CALL_XB(conjugate_functions[14] = CFLOAT_conjugate);
+    
+    NPY_CPU_DISPATCH_CALL_XB(conjugate_functions[15] = CDOUBLE_conjugate);
+    
+    NPY_CPU_DISPATCH_CALL_XB(divide_functions[1] = FLOAT_divide);
+    
+    NPY_CPU_DISPATCH_CALL_XB(divide_functions[2] = DOUBLE_divide);
+    
+    NPY_CPU_DISPATCH_CALL_XB(multiply_functions[12] = FLOAT_multiply);
+    
+    NPY_CPU_DISPATCH_CALL_XB(multiply_functions[13] = DOUBLE_multiply);
+    
+    NPY_CPU_DISPATCH_CALL_XB(multiply_functions[15] = CFLOAT_multiply);
+    
+    NPY_CPU_DISPATCH_CALL_XB(multiply_functions[16] = CDOUBLE_multiply);
+    
+    NPY_CPU_DISPATCH_CALL_XB(square_functions[14] = CFLOAT_square);
+    
+    NPY_CPU_DISPATCH_CALL_XB(square_functions[15] = CDOUBLE_square);
+    
+    NPY_CPU_DISPATCH_CALL_XB(subtract_functions[11] = FLOAT_subtract);
+    
+    NPY_CPU_DISPATCH_CALL_XB(subtract_functions[12] = DOUBLE_subtract);
+    
+    NPY_CPU_DISPATCH_CALL_XB(subtract_functions[14] = CFLOAT_subtract);
+    
+    NPY_CPU_DISPATCH_CALL_XB(subtract_functions[15] = CDOUBLE_subtract);
+    
+    
+    #ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_umath_fp.dispatch.h"
+    #endif
+    
+    NPY_CPU_DISPATCH_CALL_XB(arccos_functions[0] = HALF_arccos);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arccos_functions[1] = FLOAT_arccos);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arccos_functions[2] = DOUBLE_arccos);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arccosh_functions[0] = HALF_arccosh);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arccosh_functions[1] = FLOAT_arccosh);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arccosh_functions[2] = DOUBLE_arccosh);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arcsin_functions[0] = HALF_arcsin);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arcsin_functions[1] = FLOAT_arcsin);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arcsin_functions[2] = DOUBLE_arcsin);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arcsinh_functions[0] = HALF_arcsinh);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arcsinh_functions[1] = FLOAT_arcsinh);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arcsinh_functions[2] = DOUBLE_arcsinh);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arctan_functions[0] = HALF_arctan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arctan_functions[1] = FLOAT_arctan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arctan_functions[2] = DOUBLE_arctan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arctan2_functions[1] = FLOAT_arctan2);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arctan2_functions[2] = DOUBLE_arctan2);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arctanh_functions[0] = HALF_arctanh);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arctanh_functions[1] = FLOAT_arctanh);
+    
+    NPY_CPU_DISPATCH_CALL_XB(arctanh_functions[2] = DOUBLE_arctanh);
+    
+    NPY_CPU_DISPATCH_CALL_XB(cbrt_functions[0] = HALF_cbrt);
+    
+    NPY_CPU_DISPATCH_CALL_XB(cbrt_functions[1] = FLOAT_cbrt);
+    
+    NPY_CPU_DISPATCH_CALL_XB(cbrt_functions[2] = DOUBLE_cbrt);
+    
+    NPY_CPU_DISPATCH_CALL_XB(cos_functions[0] = HALF_cos);
+    
+    NPY_CPU_DISPATCH_CALL_XB(cosh_functions[0] = HALF_cosh);
+    
+    NPY_CPU_DISPATCH_CALL_XB(cosh_functions[1] = FLOAT_cosh);
+    
+    NPY_CPU_DISPATCH_CALL_XB(cosh_functions[2] = DOUBLE_cosh);
+    
+    NPY_CPU_DISPATCH_CALL_XB(exp_functions[0] = HALF_exp);
+    
+    NPY_CPU_DISPATCH_CALL_XB(exp2_functions[0] = HALF_exp2);
+    
+    NPY_CPU_DISPATCH_CALL_XB(exp2_functions[1] = FLOAT_exp2);
+    
+    NPY_CPU_DISPATCH_CALL_XB(exp2_functions[2] = DOUBLE_exp2);
+    
+    NPY_CPU_DISPATCH_CALL_XB(expm1_functions[0] = HALF_expm1);
+    
+    NPY_CPU_DISPATCH_CALL_XB(expm1_functions[1] = FLOAT_expm1);
+    
+    NPY_CPU_DISPATCH_CALL_XB(expm1_functions[2] = DOUBLE_expm1);
+    
+    NPY_CPU_DISPATCH_CALL_XB(log_functions[0] = HALF_log);
+    
+    NPY_CPU_DISPATCH_CALL_XB(log10_functions[0] = HALF_log10);
+    
+    NPY_CPU_DISPATCH_CALL_XB(log10_functions[1] = FLOAT_log10);
+    
+    NPY_CPU_DISPATCH_CALL_XB(log10_functions[2] = DOUBLE_log10);
+    
+    NPY_CPU_DISPATCH_CALL_XB(log1p_functions[0] = HALF_log1p);
+    
+    NPY_CPU_DISPATCH_CALL_XB(log1p_functions[1] = FLOAT_log1p);
+    
+    NPY_CPU_DISPATCH_CALL_XB(log1p_functions[2] = DOUBLE_log1p);
+    
+    NPY_CPU_DISPATCH_CALL_XB(log2_functions[0] = HALF_log2);
+    
+    NPY_CPU_DISPATCH_CALL_XB(log2_functions[1] = FLOAT_log2);
+    
+    NPY_CPU_DISPATCH_CALL_XB(log2_functions[2] = DOUBLE_log2);
+    
+    NPY_CPU_DISPATCH_CALL_XB(power_functions[11] = FLOAT_power);
+    
+    NPY_CPU_DISPATCH_CALL_XB(power_functions[12] = DOUBLE_power);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sin_functions[0] = HALF_sin);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sinh_functions[0] = HALF_sinh);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sinh_functions[1] = FLOAT_sinh);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sinh_functions[2] = DOUBLE_sinh);
+    
+    NPY_CPU_DISPATCH_CALL_XB(tan_functions[0] = HALF_tan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(tan_functions[1] = FLOAT_tan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(tan_functions[2] = DOUBLE_tan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(tanh_functions[0] = HALF_tanh);
+    
+    
+    #ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_comparison.dispatch.h"
+    #endif
+    
+    NPY_CPU_DISPATCH_CALL_XB(bitwise_xor_functions[0] = BOOL_not_equal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(equal_functions[14] = FLOAT_equal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(equal_functions[15] = DOUBLE_equal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(greater_functions[14] = FLOAT_greater);
+    
+    NPY_CPU_DISPATCH_CALL_XB(greater_functions[15] = DOUBLE_greater);
+    
+    NPY_CPU_DISPATCH_CALL_XB(greater_equal_functions[14] = FLOAT_greater_equal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(greater_equal_functions[15] = DOUBLE_greater_equal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(less_functions[14] = FLOAT_less);
+    
+    NPY_CPU_DISPATCH_CALL_XB(less_functions[15] = DOUBLE_less);
+    
+    NPY_CPU_DISPATCH_CALL_XB(less_equal_functions[14] = FLOAT_less_equal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(less_equal_functions[15] = DOUBLE_less_equal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(logical_xor_functions[0] = BOOL_not_equal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(not_equal_functions[14] = FLOAT_not_equal);
+    
+    NPY_CPU_DISPATCH_CALL_XB(not_equal_functions[15] = DOUBLE_not_equal);
+    
+    
+    #ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_trigonometric.dispatch.h"
+    #endif
+    
+    NPY_CPU_DISPATCH_CALL_XB(cos_functions[1] = FLOAT_cos);
+    
+    NPY_CPU_DISPATCH_CALL_XB(cos_functions[2] = DOUBLE_cos);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sin_functions[1] = FLOAT_sin);
+    
+    NPY_CPU_DISPATCH_CALL_XB(sin_functions[2] = DOUBLE_sin);
+    
+    
+    #ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_modulo.dispatch.h"
+    #endif
+    
+    NPY_CPU_DISPATCH_CALL_XB(divmod_functions[0] = BYTE_divmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(divmod_functions[1] = UBYTE_divmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(divmod_functions[2] = SHORT_divmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(divmod_functions[3] = USHORT_divmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(divmod_functions[4] = INT_divmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(divmod_functions[5] = UINT_divmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(divmod_functions[6] = LONG_divmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(divmod_functions[7] = ULONG_divmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(divmod_functions[8] = LONGLONG_divmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(divmod_functions[9] = ULONGLONG_divmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmod_functions[0] = BYTE_fmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmod_functions[1] = UBYTE_fmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmod_functions[2] = SHORT_fmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmod_functions[3] = USHORT_fmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmod_functions[4] = INT_fmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmod_functions[5] = UINT_fmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmod_functions[6] = LONG_fmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmod_functions[7] = ULONG_fmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmod_functions[8] = LONGLONG_fmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmod_functions[9] = ULONGLONG_fmod);
+    
+    NPY_CPU_DISPATCH_CALL_XB(remainder_functions[0] = BYTE_remainder);
+    
+    NPY_CPU_DISPATCH_CALL_XB(remainder_functions[1] = UBYTE_remainder);
+    
+    NPY_CPU_DISPATCH_CALL_XB(remainder_functions[2] = SHORT_remainder);
+    
+    NPY_CPU_DISPATCH_CALL_XB(remainder_functions[3] = USHORT_remainder);
+    
+    NPY_CPU_DISPATCH_CALL_XB(remainder_functions[4] = INT_remainder);
+    
+    NPY_CPU_DISPATCH_CALL_XB(remainder_functions[5] = UINT_remainder);
+    
+    NPY_CPU_DISPATCH_CALL_XB(remainder_functions[6] = LONG_remainder);
+    
+    NPY_CPU_DISPATCH_CALL_XB(remainder_functions[7] = ULONG_remainder);
+    
+    NPY_CPU_DISPATCH_CALL_XB(remainder_functions[8] = LONGLONG_remainder);
+    
+    NPY_CPU_DISPATCH_CALL_XB(remainder_functions[9] = ULONGLONG_remainder);
+    
+    
+    #ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_exponent_log.dispatch.h"
+    #endif
+    
+    NPY_CPU_DISPATCH_CALL_XB(exp_functions[1] = FLOAT_exp);
+    
+    NPY_CPU_DISPATCH_CALL_XB(exp_functions[2] = DOUBLE_exp);
+    
+    NPY_CPU_DISPATCH_CALL_XB(frexp_functions[1] = FLOAT_frexp);
+    
+    NPY_CPU_DISPATCH_CALL_XB(frexp_functions[2] = DOUBLE_frexp);
+    
+    NPY_CPU_DISPATCH_CALL_XB(ldexp_functions[1] = FLOAT_ldexp);
+    
+    NPY_CPU_DISPATCH_CALL_XB(ldexp_functions[4] = DOUBLE_ldexp);
+    
+    NPY_CPU_DISPATCH_CALL_XB(log_functions[1] = FLOAT_log);
+    
+    NPY_CPU_DISPATCH_CALL_XB(log_functions[2] = DOUBLE_log);
+    
+    
+    #ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_arithmetic.dispatch.h"
+    #endif
+    
+    NPY_CPU_DISPATCH_CALL_XB(floor_divide_functions[0] = BYTE_divide);
+    
+    NPY_CPU_DISPATCH_CALL_XB(floor_divide_functions[1] = UBYTE_divide);
+    
+    NPY_CPU_DISPATCH_CALL_XB(floor_divide_functions[2] = SHORT_divide);
+    
+    NPY_CPU_DISPATCH_CALL_XB(floor_divide_functions[3] = USHORT_divide);
+    
+    NPY_CPU_DISPATCH_CALL_XB(floor_divide_functions[4] = INT_divide);
+    
+    NPY_CPU_DISPATCH_CALL_XB(floor_divide_functions[5] = UINT_divide);
+    
+    NPY_CPU_DISPATCH_CALL_XB(floor_divide_functions[6] = LONG_divide);
+    
+    NPY_CPU_DISPATCH_CALL_XB(floor_divide_functions[7] = ULONG_divide);
+    
+    NPY_CPU_DISPATCH_CALL_XB(floor_divide_functions[8] = LONGLONG_divide);
+    
+    NPY_CPU_DISPATCH_CALL_XB(floor_divide_functions[9] = ULONGLONG_divide);
+    
+    
+    #ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_minmax.dispatch.h"
+    #endif
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmax_functions[12] = FLOAT_fmax);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmax_functions[13] = DOUBLE_fmax);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmax_functions[14] = LONGDOUBLE_fmax);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmin_functions[12] = FLOAT_fmin);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmin_functions[13] = DOUBLE_fmin);
+    
+    NPY_CPU_DISPATCH_CALL_XB(fmin_functions[14] = LONGDOUBLE_fmin);
+    
+    NPY_CPU_DISPATCH_CALL_XB(maximum_functions[1] = BYTE_maximum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(maximum_functions[2] = UBYTE_maximum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(maximum_functions[3] = SHORT_maximum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(maximum_functions[4] = USHORT_maximum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(maximum_functions[5] = INT_maximum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(maximum_functions[6] = UINT_maximum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(maximum_functions[7] = LONG_maximum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(maximum_functions[8] = ULONG_maximum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(maximum_functions[9] = LONGLONG_maximum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(maximum_functions[10] = ULONGLONG_maximum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(maximum_functions[12] = FLOAT_maximum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(maximum_functions[13] = DOUBLE_maximum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(maximum_functions[14] = LONGDOUBLE_maximum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(minimum_functions[1] = BYTE_minimum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(minimum_functions[2] = UBYTE_minimum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(minimum_functions[3] = SHORT_minimum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(minimum_functions[4] = USHORT_minimum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(minimum_functions[5] = INT_minimum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(minimum_functions[6] = UINT_minimum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(minimum_functions[7] = LONG_minimum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(minimum_functions[8] = ULONG_minimum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(minimum_functions[9] = LONGLONG_minimum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(minimum_functions[10] = ULONGLONG_minimum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(minimum_functions[12] = FLOAT_minimum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(minimum_functions[13] = DOUBLE_minimum);
+    
+    NPY_CPU_DISPATCH_CALL_XB(minimum_functions[14] = LONGDOUBLE_minimum);
+    
+    
+    #ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary_fp_le.dispatch.h"
+    #endif
+    
+    NPY_CPU_DISPATCH_CALL_XB(isfinite_functions[12] = FLOAT_isfinite);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isfinite_functions[13] = DOUBLE_isfinite);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isinf_functions[12] = FLOAT_isinf);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isinf_functions[13] = DOUBLE_isinf);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isnan_functions[12] = FLOAT_isnan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(isnan_functions[13] = DOUBLE_isnan);
+    
+    NPY_CPU_DISPATCH_CALL_XB(signbit_functions[1] = FLOAT_signbit);
+    
+    NPY_CPU_DISPATCH_CALL_XB(signbit_functions[2] = DOUBLE_signbit);
+    
+    
+    #ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary.dispatch.h"
+    #endif
+    
+    NPY_CPU_DISPATCH_CALL_XB(negative_functions[0] = BYTE_negative);
+    
+    NPY_CPU_DISPATCH_CALL_XB(negative_functions[1] = UBYTE_negative);
+    
+    NPY_CPU_DISPATCH_CALL_XB(negative_functions[2] = SHORT_negative);
+    
+    NPY_CPU_DISPATCH_CALL_XB(negative_functions[3] = USHORT_negative);
+    
+    NPY_CPU_DISPATCH_CALL_XB(negative_functions[4] = INT_negative);
+    
+    NPY_CPU_DISPATCH_CALL_XB(negative_functions[5] = UINT_negative);
+    
+    NPY_CPU_DISPATCH_CALL_XB(negative_functions[6] = LONG_negative);
+    
+    NPY_CPU_DISPATCH_CALL_XB(negative_functions[7] = ULONG_negative);
+    
+    NPY_CPU_DISPATCH_CALL_XB(negative_functions[8] = LONGLONG_negative);
+    
+    NPY_CPU_DISPATCH_CALL_XB(negative_functions[9] = ULONGLONG_negative);
+    
+    NPY_CPU_DISPATCH_CALL_XB(negative_functions[11] = FLOAT_negative);
+    
+    NPY_CPU_DISPATCH_CALL_XB(negative_functions[12] = DOUBLE_negative);
+    
+    NPY_CPU_DISPATCH_CALL_XB(negative_functions[13] = LONGDOUBLE_negative);
+    
+    
+    #ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_hyperbolic.dispatch.h"
+    #endif
+    
+    NPY_CPU_DISPATCH_CALL_XB(tanh_functions[1] = FLOAT_tanh);
+    
+    NPY_CPU_DISPATCH_CALL_XB(tanh_functions[2] = DOUBLE_tanh);
+
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        _arg_functions, _arg_data, _arg_signatures, 3,
+        1, 1, PyUFunc_None, "_arg",
+        DOC_NUMPY_CORE_UMATH__ARG, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "_arg", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        _ones_like_functions, _ones_like_data, _ones_like_signatures, 21,
+        1, 1, PyUFunc_None, "_ones_like",
+        DOC_NUMPY_CORE_UMATH__ONES_LIKE, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_OnesLikeTypeResolver;
+    PyDict_SetItemString(dictionary, "_ones_like", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        absolute_functions, absolute_data, absolute_signatures, 20,
+        1, 1, PyUFunc_None, "absolute",
+        DOC_NUMPY_CORE_UMATH_ABSOLUTE, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_AbsoluteTypeResolver;
+    PyDict_SetItemString(dictionary, "absolute", f);
+    Py_DECREF(f);
+    /* Create the "add" ufunc; its identity value is the Python integer 0. */
+    identity = PyLong_FromLong(0);
+    if (identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        add_functions, add_data, add_signatures, 22,
+        2, 1, PyUFunc_IdentityValue, "add",
+        DOC_NUMPY_CORE_UMATH_ADD, 0, NULL, identity
+    );
+    /* Release the local identity reference whether or not creation succeeded. */
+    Py_DECREF(identity);
+    if (f == NULL) {
+        return -1;
+    }
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_AdditionTypeResolver;
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_BYTE. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_BYTE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_BYTE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_BYTE");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = BYTE_add_indexed;
+    }
+    
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_UBYTE. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_UBYTE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_UBYTE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_UBYTE");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = UBYTE_add_indexed;
+    }
+    
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_SHORT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_SHORT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_SHORT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_SHORT");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = SHORT_add_indexed;
+    }
+    
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_USHORT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_USHORT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_USHORT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_USHORT");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = USHORT_add_indexed;
+    }
+    
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_INT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_INT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_INT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_INT");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = INT_add_indexed;
+    }
+    
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_UINT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_UINT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_UINT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_UINT");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = UINT_add_indexed;
+    }
+    
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_LONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_LONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_LONG");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = LONG_add_indexed;
+    }
+    
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_ULONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_ULONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_ULONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_ULONG");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = ULONG_add_indexed;
+    }
+    
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_LONGLONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGLONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_LONGLONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_LONGLONG");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = LONGLONG_add_indexed;
+    }
+    
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_ULONGLONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_ULONGLONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_ULONGLONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_ULONGLONG");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = ULONGLONG_add_indexed;
+    }
+    
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_HALF. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_HALF);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_HALF");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_HALF");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = HALF_add_indexed;
+    }
+    
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_FLOAT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_FLOAT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_FLOAT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_FLOAT");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = FLOAT_add_indexed;
+    }
+    
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_DOUBLE. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_DOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_DOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_DOUBLE");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = DOUBLE_add_indexed;
+    }
+    
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_LONGDOUBLE. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGDOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_LONGDOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_LONGDOUBLE");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = LONGDOUBLE_add_indexed;
+    }
+    
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_CFLOAT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_CFLOAT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_CFLOAT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_CFLOAT");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = CFLOAT_add_indexed;
+    }
+    
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_CDOUBLE. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_CDOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_CDOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_CDOUBLE");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = CDOUBLE_add_indexed;
+    }
+    
+    
+    {
+        /* Install the contiguous indexed add loop for NPY_CLONGDOUBLE. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_CLONGDOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "add with NPY_CLONGDOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "add with NPY_CLONGDOUBLE");
+            return -1;  /* do not write through a non-ArrayMethod object */
+        }
+        /* info is borrowed, no need to decref*/
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop = CLONGDOUBLE_add_indexed;
+    }
+    
+    /* Store "add" in dictionary; a failed insert aborts with -1 after releasing f. */
+    if (PyDict_SetItemString(dictionary, "add", f) < 0) {
+        Py_DECREF(f);
+        return -1;
+    }
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        arccos_functions, arccos_data, arccos_signatures, 11,
+        1, 1, PyUFunc_None, "arccos",
+        DOC_NUMPY_CORE_UMATH_ARCCOS, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "arccos", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        arccosh_functions, arccosh_data, arccosh_signatures, 11,
+        1, 1, PyUFunc_None, "arccosh",
+        DOC_NUMPY_CORE_UMATH_ARCCOSH, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "arccosh", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        arcsin_functions, arcsin_data, arcsin_signatures, 11,
+        1, 1, PyUFunc_None, "arcsin",
+        DOC_NUMPY_CORE_UMATH_ARCSIN, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "arcsin", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        arcsinh_functions, arcsinh_data, arcsinh_signatures, 11,
+        1, 1, PyUFunc_None, "arcsinh",
+        DOC_NUMPY_CORE_UMATH_ARCSINH, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "arcsinh", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        arctan_functions, arctan_data, arctan_signatures, 11,
+        1, 1, PyUFunc_None, "arctan",
+        DOC_NUMPY_CORE_UMATH_ARCTAN, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "arctan", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        arctan2_functions, arctan2_data, arctan2_signatures, 5,
+        2, 1, PyUFunc_None, "arctan2",
+        DOC_NUMPY_CORE_UMATH_ARCTAN2, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "arctan2", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        arctanh_functions, arctanh_data, arctanh_signatures, 11,
+        1, 1, PyUFunc_None, "arctanh",
+        DOC_NUMPY_CORE_UMATH_ARCTANH, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "arctanh", f);
+    Py_DECREF(f);
+    identity = PyLong_FromLong(-1);
+    if (1 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        bitwise_and_functions, bitwise_and_data, bitwise_and_signatures, 12,
+        2, 1, PyUFunc_IdentityValue, "bitwise_and",
+        DOC_NUMPY_CORE_UMATH_BITWISE_AND, 0, NULL, identity
+    );
+    if (1) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "bitwise_and", f);
+    Py_DECREF(f);
+    identity = PyLong_FromLong(0);
+    if (1 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        bitwise_or_functions, bitwise_or_data, bitwise_or_signatures, 12,
+        2, 1, PyUFunc_IdentityValue, "bitwise_or",
+        DOC_NUMPY_CORE_UMATH_BITWISE_OR, 0, NULL, identity
+    );
+    if (1) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "bitwise_or", f);
+    Py_DECREF(f);
+    identity = PyLong_FromLong(0);
+    if (1 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        bitwise_xor_functions, bitwise_xor_data, bitwise_xor_signatures, 12,
+        2, 1, PyUFunc_IdentityValue, "bitwise_xor",
+        DOC_NUMPY_CORE_UMATH_BITWISE_XOR, 0, NULL, identity
+    );
+    if (1) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "bitwise_xor", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        cbrt_functions, cbrt_data, cbrt_signatures, 8,
+        1, 1, PyUFunc_None, "cbrt",
+        DOC_NUMPY_CORE_UMATH_CBRT, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "cbrt", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        ceil_functions, ceil_data, ceil_signatures, 7,
+        1, 1, PyUFunc_None, "ceil",
+        DOC_NUMPY_CORE_UMATH_CEIL, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "ceil", f);
+    Py_DECREF(f);
+    identity = (Py_INCREF(Py_None), Py_None);
+    if (1 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        clip_functions, clip_data, clip_signatures, 21,
+        3, 1, PyUFunc_IdentityValue, "clip",
+        DOC_NUMPY_CORE_UMATH_CLIP, 0, NULL, identity
+    );
+    if (1) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_SimpleUniformOperationTypeResolver;
+    PyDict_SetItemString(dictionary, "clip", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        conjugate_functions, conjugate_data, conjugate_signatures, 18,
+        1, 1, PyUFunc_None, "conjugate",
+        DOC_NUMPY_CORE_UMATH_CONJUGATE, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "conjugate", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        copysign_functions, copysign_data, copysign_signatures, 4,
+        2, 1, PyUFunc_None, "copysign",
+        DOC_NUMPY_CORE_UMATH_COPYSIGN, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "copysign", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        cos_functions, cos_data, cos_signatures, 8,
+        1, 1, PyUFunc_None, "cos",
+        DOC_NUMPY_CORE_UMATH_COS, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "cos", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        cosh_functions, cosh_data, cosh_signatures, 11,
+        1, 1, PyUFunc_None, "cosh",
+        DOC_NUMPY_CORE_UMATH_COSH, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "cosh", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        deg2rad_functions, deg2rad_data, deg2rad_signatures, 5,
+        1, 1, PyUFunc_None, "deg2rad",
+        DOC_NUMPY_CORE_UMATH_DEG2RAD, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "deg2rad", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        degrees_functions, degrees_data, degrees_signatures, 5,
+        1, 1, PyUFunc_None, "degrees",
+        DOC_NUMPY_CORE_UMATH_DEGREES, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "degrees", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        divide_functions, divide_data, divide_signatures, 11,
+        2, 1, PyUFunc_None, "divide",
+        DOC_NUMPY_CORE_UMATH_DIVIDE, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_TrueDivisionTypeResolver;
+    
+    {   /* set contiguous_indexed_loop for "divide" with NPY_HALF */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_HALF);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "divide with NPY_HALF");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "divide with NPY_HALF");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         HALF_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "divide" with NPY_FLOAT */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_FLOAT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "divide with NPY_FLOAT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "divide with NPY_FLOAT");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         FLOAT_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "divide" with NPY_DOUBLE */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_DOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "divide with NPY_DOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "divide with NPY_DOUBLE");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         DOUBLE_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "divide" with NPY_LONGDOUBLE */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGDOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "divide with NPY_LONGDOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "divide with NPY_LONGDOUBLE");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         LONGDOUBLE_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    PyDict_SetItemString(dictionary, "divide", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        divmod_functions, divmod_data, divmod_signatures, 15,
+        2, 2, PyUFunc_None, "divmod",
+        DOC_NUMPY_CORE_UMATH_DIVMOD, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_DivmodTypeResolver;
+    PyDict_SetItemString(dictionary, "divmod", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        equal_functions, equal_data, equal_signatures, 24,
+        2, 1, PyUFunc_None, "equal",
+        DOC_NUMPY_CORE_UMATH_EQUAL, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_SimpleBinaryComparisonTypeResolver;
+    PyDict_SetItemString(dictionary, "equal", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        exp_functions, exp_data, exp_signatures, 10,
+        1, 1, PyUFunc_None, "exp",
+        DOC_NUMPY_CORE_UMATH_EXP, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "exp", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        exp2_functions, exp2_data, exp2_signatures, 11,
+        1, 1, PyUFunc_None, "exp2",
+        DOC_NUMPY_CORE_UMATH_EXP2, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "exp2", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        expm1_functions, expm1_data, expm1_signatures, 11,
+        1, 1, PyUFunc_None, "expm1",
+        DOC_NUMPY_CORE_UMATH_EXPM1, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "expm1", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        fabs_functions, fabs_data, fabs_signatures, 5,
+        1, 1, PyUFunc_None, "fabs",
+        DOC_NUMPY_CORE_UMATH_FABS, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "fabs", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        float_power_functions, float_power_data, float_power_signatures, 4,
+        2, 1, PyUFunc_None, "float_power",
+        DOC_NUMPY_CORE_UMATH_FLOAT_POWER, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "float_power", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        floor_functions, floor_data, floor_signatures, 7,
+        1, 1, PyUFunc_None, "floor",
+        DOC_NUMPY_CORE_UMATH_FLOOR, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "floor", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        floor_divide_functions, floor_divide_data, floor_divide_signatures, 18,
+        2, 1, PyUFunc_None, "floor_divide",
+        DOC_NUMPY_CORE_UMATH_FLOOR_DIVIDE, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_DivisionTypeResolver;
+    
+    {   /* set contiguous_indexed_loop for "floor_divide" with NPY_HALF */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_HALF);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "floor_divide with NPY_HALF");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "floor_divide with NPY_HALF");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         HALF_floor_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "floor_divide" with NPY_FLOAT */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_FLOAT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "floor_divide with NPY_FLOAT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "floor_divide with NPY_FLOAT");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         FLOAT_floor_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "floor_divide" with NPY_DOUBLE */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_DOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "floor_divide with NPY_DOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "floor_divide with NPY_DOUBLE");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         DOUBLE_floor_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "floor_divide" with NPY_LONGDOUBLE */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGDOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "floor_divide with NPY_LONGDOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "floor_divide with NPY_LONGDOUBLE");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         LONGDOUBLE_floor_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "floor_divide" with NPY_BYTE */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_BYTE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "floor_divide with NPY_BYTE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "floor_divide with NPY_BYTE");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         BYTE_floor_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "floor_divide" with NPY_UBYTE */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_UBYTE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "floor_divide with NPY_UBYTE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "floor_divide with NPY_UBYTE");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         UBYTE_floor_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "floor_divide" with NPY_SHORT */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_SHORT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "floor_divide with NPY_SHORT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "floor_divide with NPY_SHORT");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         SHORT_floor_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "floor_divide" with NPY_USHORT */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_USHORT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "floor_divide with NPY_USHORT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "floor_divide with NPY_USHORT");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         USHORT_floor_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "floor_divide" with NPY_INT */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_INT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "floor_divide with NPY_INT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "floor_divide with NPY_INT");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         INT_floor_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "floor_divide" with NPY_UINT */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_UINT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "floor_divide with NPY_UINT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "floor_divide with NPY_UINT");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         UINT_floor_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "floor_divide" with NPY_LONG */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "floor_divide with NPY_LONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "floor_divide with NPY_LONG");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         LONG_floor_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "floor_divide" with NPY_ULONG */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_ULONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "floor_divide with NPY_ULONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "floor_divide with NPY_ULONG");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         ULONG_floor_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "floor_divide" with NPY_LONGLONG */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGLONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "floor_divide with NPY_LONGLONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "floor_divide with NPY_LONGLONG");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         LONGLONG_floor_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "floor_divide" with NPY_ULONGLONG */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_ULONGLONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "floor_divide with NPY_ULONGLONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "floor_divide with NPY_ULONGLONG");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         ULONGLONG_floor_divide_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    PyDict_SetItemString(dictionary, "floor_divide", f);
+    Py_DECREF(f);
+    identity = (Py_INCREF(Py_None), Py_None);
+    if (1 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        fmax_functions, fmax_data, fmax_signatures, 21,
+        2, 1, PyUFunc_IdentityValue, "fmax",
+        DOC_NUMPY_CORE_UMATH_FMAX, 0, NULL, identity
+    );
+    if (1) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_SimpleUniformOperationTypeResolver;
+    
+    {   /* set contiguous_indexed_loop for "fmax" with NPY_HALF */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_HALF);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmax with NPY_HALF");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmax with NPY_HALF");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         HALF_fmax_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "fmax" with NPY_FLOAT */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_FLOAT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmax with NPY_FLOAT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmax with NPY_FLOAT");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         FLOAT_fmax_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "fmax" with NPY_DOUBLE */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_DOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmax with NPY_DOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmax with NPY_DOUBLE");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         DOUBLE_fmax_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "fmax" with NPY_LONGDOUBLE */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGDOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmax with NPY_LONGDOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmax with NPY_LONGDOUBLE");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         LONGDOUBLE_fmax_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "fmax" with NPY_BYTE */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_BYTE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmax with NPY_BYTE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmax with NPY_BYTE");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         BYTE_fmax_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "fmax" with NPY_UBYTE */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_UBYTE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmax with NPY_UBYTE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmax with NPY_UBYTE");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         UBYTE_fmax_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "fmax" with NPY_SHORT */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_SHORT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmax with NPY_SHORT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmax with NPY_SHORT");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         SHORT_fmax_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "fmax" with NPY_USHORT */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_USHORT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmax with NPY_USHORT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmax with NPY_USHORT");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         USHORT_fmax_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* set contiguous_indexed_loop for "fmax" with NPY_INT */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_INT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmax with NPY_INT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmax with NPY_INT");
+            return -1;  /* never cast and write through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         INT_fmax_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmax on NPY_UINT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_UINT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmax with NPY_UINT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmax with NPY_UINT");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         UINT_fmax_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmax on NPY_LONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmax with NPY_LONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmax with NPY_LONG");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         LONG_fmax_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmax on NPY_ULONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_ULONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmax with NPY_ULONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmax with NPY_ULONG");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         ULONG_fmax_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmax on NPY_LONGLONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGLONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmax with NPY_LONGLONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmax with NPY_LONGLONG");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         LONGLONG_fmax_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmax on NPY_ULONGLONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_ULONGLONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmax with NPY_ULONGLONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmax with NPY_ULONGLONG");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         ULONGLONG_fmax_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    PyDict_SetItemString(dictionary, "fmax", f);
+    Py_DECREF(f);
+    identity = (Py_INCREF(Py_None), Py_None);
+    if (1 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        fmin_functions, fmin_data, fmin_signatures, 21,
+        2, 1, PyUFunc_IdentityValue, "fmin",
+        DOC_NUMPY_CORE_UMATH_FMIN, 0, NULL, identity
+    );
+    if (1) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_SimpleUniformOperationTypeResolver;
+    
+    {   /* Install the contiguous indexed loop for fmin on NPY_HALF. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_HALF);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmin with NPY_HALF");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmin with NPY_HALF");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         HALF_fmin_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmin on NPY_FLOAT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_FLOAT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmin with NPY_FLOAT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmin with NPY_FLOAT");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         FLOAT_fmin_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmin on NPY_DOUBLE. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_DOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmin with NPY_DOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmin with NPY_DOUBLE");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         DOUBLE_fmin_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmin on NPY_LONGDOUBLE. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGDOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmin with NPY_LONGDOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmin with NPY_LONGDOUBLE");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         LONGDOUBLE_fmin_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmin on NPY_BYTE. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_BYTE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmin with NPY_BYTE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmin with NPY_BYTE");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         BYTE_fmin_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmin on NPY_UBYTE. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_UBYTE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmin with NPY_UBYTE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmin with NPY_UBYTE");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         UBYTE_fmin_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmin on NPY_SHORT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_SHORT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmin with NPY_SHORT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmin with NPY_SHORT");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         SHORT_fmin_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmin on NPY_USHORT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_USHORT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmin with NPY_USHORT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmin with NPY_USHORT");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         USHORT_fmin_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmin on NPY_INT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_INT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmin with NPY_INT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmin with NPY_INT");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         INT_fmin_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmin on NPY_UINT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_UINT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmin with NPY_UINT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmin with NPY_UINT");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         UINT_fmin_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmin on NPY_LONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmin with NPY_LONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmin with NPY_LONG");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         LONG_fmin_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmin on NPY_ULONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_ULONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmin with NPY_ULONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmin with NPY_ULONG");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         ULONG_fmin_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmin on NPY_LONGLONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGLONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmin with NPY_LONGLONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmin with NPY_LONGLONG");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         LONGLONG_fmin_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {   /* Install the contiguous indexed loop for fmin on NPY_ULONGLONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_ULONGLONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {  /* lookup failed; presumably an error is set */
+            return -1;
+        }
+        if (info == Py_None) {  /* None is treated as an error */
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "fmin with NPY_ULONGLONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "fmin with NPY_ULONGLONG");
+            return -1;  /* never store the loop through a mistyped info */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         ULONGLONG_fmin_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    PyDict_SetItemString(dictionary, "fmin", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        fmod_functions, fmod_data, fmod_signatures, 15,
+        2, 1, PyUFunc_None, "fmod",
+        DOC_NUMPY_CORE_UMATH_FMOD, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "fmod", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        frexp_functions, frexp_data, frexp_signatures, 4,
+        1, 2, PyUFunc_None, "frexp",
+        DOC_NUMPY_CORE_UMATH_FREXP, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "frexp", f);
+    Py_DECREF(f);
+    identity = PyLong_FromLong(0);
+    if (1 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        gcd_functions, gcd_data, gcd_signatures, 11,
+        2, 1, PyUFunc_IdentityValue, "gcd",
+        DOC_NUMPY_CORE_UMATH_GCD, 0, NULL, identity
+    );
+    if (1) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_SimpleUniformOperationTypeResolver;
+    PyDict_SetItemString(dictionary, "gcd", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        greater_functions, greater_data, greater_signatures, 24,
+        2, 1, PyUFunc_None, "greater",
+        DOC_NUMPY_CORE_UMATH_GREATER, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_SimpleBinaryComparisonTypeResolver;
+    PyDict_SetItemString(dictionary, "greater", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        greater_equal_functions, greater_equal_data, greater_equal_signatures, 24,
+        2, 1, PyUFunc_None, "greater_equal",
+        DOC_NUMPY_CORE_UMATH_GREATER_EQUAL, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_SimpleBinaryComparisonTypeResolver;
+    PyDict_SetItemString(dictionary, "greater_equal", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        heaviside_functions, heaviside_data, heaviside_signatures, 4,
+        2, 1, PyUFunc_None, "heaviside",
+        DOC_NUMPY_CORE_UMATH_HEAVISIDE, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "heaviside", f);
+    Py_DECREF(f);
+    identity = PyLong_FromLong(0);
+    if (1 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        hypot_functions, hypot_data, hypot_signatures, 5,
+        2, 1, PyUFunc_IdentityValue, "hypot",
+        DOC_NUMPY_CORE_UMATH_HYPOT, 0, NULL, identity
+    );
+    if (1) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "hypot", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        invert_functions, invert_data, invert_signatures, 12,
+        1, 1, PyUFunc_None, "invert",
+        DOC_NUMPY_CORE_UMATH_INVERT, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "invert", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        isfinite_functions, isfinite_data, isfinite_signatures, 20,
+        1, 1, PyUFunc_None, "isfinite",
+        DOC_NUMPY_CORE_UMATH_ISFINITE, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_IsFiniteTypeResolver;
+    PyDict_SetItemString(dictionary, "isfinite", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        isinf_functions, isinf_data, isinf_signatures, 20,
+        1, 1, PyUFunc_None, "isinf",
+        DOC_NUMPY_CORE_UMATH_ISINF, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_IsFiniteTypeResolver;
+    PyDict_SetItemString(dictionary, "isinf", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        isnan_functions, isnan_data, isnan_signatures, 20,
+        1, 1, PyUFunc_None, "isnan",
+        DOC_NUMPY_CORE_UMATH_ISNAN, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_IsFiniteTypeResolver;
+    PyDict_SetItemString(dictionary, "isnan", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        isnat_functions, isnat_data, isnat_signatures, 2,
+        1, 1, PyUFunc_None, "isnat",
+        DOC_NUMPY_CORE_UMATH_ISNAT, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_IsNaTTypeResolver;
+    PyDict_SetItemString(dictionary, "isnat", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        lcm_functions, lcm_data, lcm_signatures, 11,
+        2, 1, PyUFunc_None, "lcm",
+        DOC_NUMPY_CORE_UMATH_LCM, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_SimpleUniformOperationTypeResolver;
+    PyDict_SetItemString(dictionary, "lcm", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        ldexp_functions, ldexp_data, ldexp_signatures, 8,
+        2, 1, PyUFunc_None, "ldexp",
+        DOC_NUMPY_CORE_UMATH_LDEXP, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "ldexp", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        left_shift_functions, left_shift_data, left_shift_signatures, 11,
+        2, 1, PyUFunc_None, "left_shift",
+        DOC_NUMPY_CORE_UMATH_LEFT_SHIFT, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "left_shift", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        less_functions, less_data, less_signatures, 24,
+        2, 1, PyUFunc_None, "less",
+        DOC_NUMPY_CORE_UMATH_LESS, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_SimpleBinaryComparisonTypeResolver;
+    PyDict_SetItemString(dictionary, "less", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        less_equal_functions, less_equal_data, less_equal_signatures, 24,
+        2, 1, PyUFunc_None, "less_equal",
+        DOC_NUMPY_CORE_UMATH_LESS_EQUAL, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_SimpleBinaryComparisonTypeResolver;
+    PyDict_SetItemString(dictionary, "less_equal", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        log_functions, log_data, log_signatures, 10,
+        1, 1, PyUFunc_None, "log",
+        DOC_NUMPY_CORE_UMATH_LOG, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "log", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        log10_functions, log10_data, log10_signatures, 11,
+        1, 1, PyUFunc_None, "log10",
+        DOC_NUMPY_CORE_UMATH_LOG10, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "log10", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        log1p_functions, log1p_data, log1p_signatures, 11,
+        1, 1, PyUFunc_None, "log1p",
+        DOC_NUMPY_CORE_UMATH_LOG1P, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "log1p", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        log2_functions, log2_data, log2_signatures, 11,
+        1, 1, PyUFunc_None, "log2",
+        DOC_NUMPY_CORE_UMATH_LOG2, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "log2", f);
+    Py_DECREF(f);
+    identity = PyFloat_FromDouble(-NPY_INFINITY);
+    if (1 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        logaddexp_functions, logaddexp_data, logaddexp_signatures, 4,
+        2, 1, PyUFunc_IdentityValue, "logaddexp",
+        DOC_NUMPY_CORE_UMATH_LOGADDEXP, 0, NULL, identity
+    );
+    if (1) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "logaddexp", f);
+    Py_DECREF(f);
+    identity = PyFloat_FromDouble(-NPY_INFINITY);
+    if (1 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        logaddexp2_functions, logaddexp2_data, logaddexp2_signatures, 4,
+        2, 1, PyUFunc_IdentityValue, "logaddexp2",
+        DOC_NUMPY_CORE_UMATH_LOGADDEXP2, 0, NULL, identity
+    );
+    if (1) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "logaddexp2", f);
+    Py_DECREF(f);
+    identity = (Py_INCREF(Py_True), Py_True);
+    if (1 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        logical_and_functions, logical_and_data, logical_and_signatures, 19,
+        2, 1, PyUFunc_IdentityValue, "logical_and",
+        DOC_NUMPY_CORE_UMATH_LOGICAL_AND, 0, NULL, identity
+    );
+    if (1) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_SimpleBinaryComparisonTypeResolver;
+    PyDict_SetItemString(dictionary, "logical_and", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        logical_not_functions, logical_not_data, logical_not_signatures, 19,
+        1, 1, PyUFunc_None, "logical_not",
+        DOC_NUMPY_CORE_UMATH_LOGICAL_NOT, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "logical_not", f);
+    Py_DECREF(f);
+    /*
+     * logical_or: created with identity value False, then given the simple
+     * binary comparison type resolver and stored under its name.
+     */
+    Py_INCREF(Py_False);
+    identity = Py_False;
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        logical_or_functions, logical_or_data, logical_or_signatures, 19,
+        2, 1, PyUFunc_IdentityValue, "logical_or",
+        DOC_NUMPY_CORE_UMATH_LOGICAL_OR, 0, NULL, identity
+    );
+    /* Our reference to the identity is dropped on success and failure. */
+    Py_DECREF(identity);
+    if (f == NULL) {
+        return -1;
+    }
+    ((PyUFuncObject *)f)->type_resolver =
+            &PyUFunc_SimpleBinaryComparisonTypeResolver;
+    /* Do not carry on with a pending exception if the insertion fails. */
+    if (PyDict_SetItemString(dictionary, "logical_or", f) < 0) {
+        Py_DECREF(f);
+        return -1;
+    }
+    Py_DECREF(f);
+    /*
+     * logical_xor: created with identity value False, then given the simple
+     * binary comparison type resolver and stored under its name.
+     */
+    Py_INCREF(Py_False);
+    identity = Py_False;
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        logical_xor_functions, logical_xor_data, logical_xor_signatures, 19,
+        2, 1, PyUFunc_IdentityValue, "logical_xor",
+        DOC_NUMPY_CORE_UMATH_LOGICAL_XOR, 0, NULL, identity
+    );
+    /* Our reference to the identity is dropped on success and failure. */
+    Py_DECREF(identity);
+    if (f == NULL) {
+        return -1;
+    }
+    ((PyUFuncObject *)f)->type_resolver =
+            &PyUFunc_SimpleBinaryComparisonTypeResolver;
+    /* Do not carry on with a pending exception if the insertion fails. */
+    if (PyDict_SetItemString(dictionary, "logical_xor", f) < 0) {
+        Py_DECREF(f);
+        return -1;
+    }
+    Py_DECREF(f);
+    /*
+     * matmul: created with the signature string "(n?,k),(k,m?)->(n?,m?)",
+     * PyUFunc_None and a NULL identity object, so there is no identity
+     * reference to release afterwards.
+     */
+    identity = NULL;
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        matmul_functions, matmul_data, matmul_signatures, 19,
+        2, 1, PyUFunc_None, "matmul",
+        DOC_NUMPY_CORE_UMATH_MATMUL, 0, "(n?,k),(k,m?)->(n?,m?)", identity
+    );
+    if (f == NULL) {
+        return -1;
+    }
+    ((PyUFuncObject *)f)->type_resolver =
+            &PyUFunc_SimpleUniformOperationTypeResolver;
+    /* Do not carry on with a pending exception if the insertion fails. */
+    if (PyDict_SetItemString(dictionary, "matmul", f) < 0) {
+        Py_DECREF(f);
+        return -1;
+    }
+    Py_DECREF(f);
+    /*
+     * maximum: create the ufunc with None as its identity value and select
+     * the simple uniform-operation type resolver.
+     */
+    Py_INCREF(Py_None);
+    identity = Py_None;
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+            maximum_functions, maximum_data, maximum_signatures, 21,
+            2, 1, PyUFunc_IdentityValue, "maximum",
+            DOC_NUMPY_CORE_UMATH_MAXIMUM, 0, NULL, identity);
+    /* Our reference to the identity is dropped on success and failure. */
+    Py_DECREF(identity);
+    if (f == NULL) {
+        return -1;
+    }
+    ((PyUFuncObject *)f)->type_resolver =
+            &PyUFunc_SimpleUniformOperationTypeResolver;
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the maximum loop that
+         * get_info_no_cast returns for NPY_HALF.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_HALF);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "maximum with NPY_HALF");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "maximum with NPY_HALF");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                HALF_maximum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the maximum loop that
+         * get_info_no_cast returns for NPY_FLOAT.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_FLOAT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "maximum with NPY_FLOAT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "maximum with NPY_FLOAT");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                FLOAT_maximum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the maximum loop that
+         * get_info_no_cast returns for NPY_DOUBLE.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_DOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "maximum with NPY_DOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "maximum with NPY_DOUBLE");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                DOUBLE_maximum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the maximum loop that
+         * get_info_no_cast returns for NPY_LONGDOUBLE.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGDOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "maximum with NPY_LONGDOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "maximum with NPY_LONGDOUBLE");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                LONGDOUBLE_maximum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the maximum loop that
+         * get_info_no_cast returns for NPY_BYTE.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_BYTE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "maximum with NPY_BYTE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "maximum with NPY_BYTE");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                BYTE_maximum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the maximum loop that
+         * get_info_no_cast returns for NPY_UBYTE.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_UBYTE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "maximum with NPY_UBYTE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "maximum with NPY_UBYTE");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                UBYTE_maximum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the maximum loop that
+         * get_info_no_cast returns for NPY_SHORT.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_SHORT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "maximum with NPY_SHORT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "maximum with NPY_SHORT");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                SHORT_maximum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the maximum loop that
+         * get_info_no_cast returns for NPY_USHORT.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_USHORT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "maximum with NPY_USHORT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "maximum with NPY_USHORT");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                USHORT_maximum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the maximum loop that
+         * get_info_no_cast returns for NPY_INT.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_INT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "maximum with NPY_INT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "maximum with NPY_INT");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                INT_maximum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the maximum loop that
+         * get_info_no_cast returns for NPY_UINT.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_UINT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "maximum with NPY_UINT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "maximum with NPY_UINT");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                UINT_maximum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the maximum loop that
+         * get_info_no_cast returns for NPY_LONG.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "maximum with NPY_LONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "maximum with NPY_LONG");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                LONG_maximum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the maximum loop that
+         * get_info_no_cast returns for NPY_ULONG.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_ULONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "maximum with NPY_ULONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "maximum with NPY_ULONG");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                ULONG_maximum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the maximum loop that
+         * get_info_no_cast returns for NPY_LONGLONG.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGLONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "maximum with NPY_LONGLONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "maximum with NPY_LONGLONG");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                LONGLONG_maximum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the maximum loop that
+         * get_info_no_cast returns for NPY_ULONGLONG.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_ULONGLONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "maximum with NPY_ULONGLONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "maximum with NPY_ULONGLONG");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                ULONGLONG_maximum_indexed;
+    }
+    
+    /*
+     * Store the finished maximum ufunc under its name and release our own
+     * reference; a failed insertion aborts instead of being ignored.
+     */
+    if (PyDict_SetItemString(dictionary, "maximum", f) < 0) {
+        Py_DECREF(f);
+        return -1;
+    }
+    Py_DECREF(f);
+    /*
+     * minimum: create the ufunc with None as its identity value and select
+     * the simple uniform-operation type resolver.
+     */
+    Py_INCREF(Py_None);
+    identity = Py_None;
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+            minimum_functions, minimum_data, minimum_signatures, 21,
+            2, 1, PyUFunc_IdentityValue, "minimum",
+            DOC_NUMPY_CORE_UMATH_MINIMUM, 0, NULL, identity);
+    /* Our reference to the identity is dropped on success and failure. */
+    Py_DECREF(identity);
+    if (f == NULL) {
+        return -1;
+    }
+    ((PyUFuncObject *)f)->type_resolver =
+            &PyUFunc_SimpleUniformOperationTypeResolver;
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the minimum loop that
+         * get_info_no_cast returns for NPY_HALF.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_HALF);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "minimum with NPY_HALF");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "minimum with NPY_HALF");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                HALF_minimum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the minimum loop that
+         * get_info_no_cast returns for NPY_FLOAT.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_FLOAT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "minimum with NPY_FLOAT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "minimum with NPY_FLOAT");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                FLOAT_minimum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the minimum loop that
+         * get_info_no_cast returns for NPY_DOUBLE.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_DOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "minimum with NPY_DOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "minimum with NPY_DOUBLE");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                DOUBLE_minimum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the minimum loop that
+         * get_info_no_cast returns for NPY_LONGDOUBLE.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGDOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "minimum with NPY_LONGDOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "minimum with NPY_LONGDOUBLE");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                LONGDOUBLE_minimum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the minimum loop that
+         * get_info_no_cast returns for NPY_BYTE.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_BYTE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "minimum with NPY_BYTE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "minimum with NPY_BYTE");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                BYTE_minimum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the minimum loop that
+         * get_info_no_cast returns for NPY_UBYTE.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_UBYTE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "minimum with NPY_UBYTE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "minimum with NPY_UBYTE");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                UBYTE_minimum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the minimum loop that
+         * get_info_no_cast returns for NPY_SHORT.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_SHORT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "minimum with NPY_SHORT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "minimum with NPY_SHORT");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                SHORT_minimum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the minimum loop that
+         * get_info_no_cast returns for NPY_USHORT.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_USHORT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "minimum with NPY_USHORT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "minimum with NPY_USHORT");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                USHORT_minimum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the minimum loop that
+         * get_info_no_cast returns for NPY_INT.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_INT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "minimum with NPY_INT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "minimum with NPY_INT");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                INT_minimum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the minimum loop that
+         * get_info_no_cast returns for NPY_UINT.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_UINT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "minimum with NPY_UINT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "minimum with NPY_UINT");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                UINT_minimum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the minimum loop that
+         * get_info_no_cast returns for NPY_LONG.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "minimum with NPY_LONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "minimum with NPY_LONG");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                LONG_minimum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the minimum loop that
+         * get_info_no_cast returns for NPY_ULONG.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_ULONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "minimum with NPY_ULONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "minimum with NPY_ULONG");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                ULONG_minimum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the minimum loop that
+         * get_info_no_cast returns for NPY_LONGLONG.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGLONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "minimum with NPY_LONGLONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "minimum with NPY_LONGLONG");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                LONGLONG_minimum_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the minimum loop that
+         * get_info_no_cast returns for NPY_ULONGLONG.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_ULONGLONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "minimum with NPY_ULONGLONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "minimum with NPY_ULONGLONG");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                ULONGLONG_minimum_indexed;
+    }
+    
+    /*
+     * Store the finished minimum ufunc under its name and release our own
+     * reference; a failed insertion aborts instead of being ignored.
+     */
+    if (PyDict_SetItemString(dictionary, "minimum", f) < 0) {
+        Py_DECREF(f);
+        return -1;
+    }
+    Py_DECREF(f);
+    /*
+     * modf: created with PyUFunc_None and a NULL identity object, so there
+     * is no identity reference to release afterwards.
+     */
+    identity = NULL;
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        modf_functions, modf_data, modf_signatures, 4,
+        1, 2, PyUFunc_None, "modf",
+        DOC_NUMPY_CORE_UMATH_MODF, 0, NULL, identity
+    );
+    if (f == NULL) {
+        return -1;
+    }
+    /* Do not carry on with a pending exception if the insertion fails. */
+    if (PyDict_SetItemString(dictionary, "modf", f) < 0) {
+        Py_DECREF(f);
+        return -1;
+    }
+    Py_DECREF(f);
+    /*
+     * multiply: create the ufunc with the Python integer 1 as its identity
+     * value and select the multiplication type resolver.
+     */
+    identity = PyLong_FromLong(1);
+    if (identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+            multiply_functions, multiply_data, multiply_signatures, 23,
+            2, 1, PyUFunc_IdentityValue, "multiply",
+            DOC_NUMPY_CORE_UMATH_MULTIPLY, 0, NULL, identity);
+    /* Our reference to the identity is dropped on success and failure. */
+    Py_DECREF(identity);
+    if (f == NULL) {
+        return -1;
+    }
+    ((PyUFuncObject *)f)->type_resolver =
+            &PyUFunc_MultiplicationTypeResolver;
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the multiply loop that
+         * get_info_no_cast returns for NPY_BYTE.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_BYTE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_BYTE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_BYTE");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                BYTE_multiply_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the multiply loop that
+         * get_info_no_cast returns for NPY_UBYTE.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_UBYTE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_UBYTE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_UBYTE");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                UBYTE_multiply_indexed;
+    }
+    
+    
+    {
+        /*
+         * Install the contiguous indexed loop on the multiply loop that
+         * get_info_no_cast returns for NPY_SHORT.
+         * NOTE(review): dtype is never released here; confirm ownership.
+         */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_SHORT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_SHORT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_SHORT");
+            /* Stop here: the cast below is only valid for an ArrayMethod. */
+            return -1;
+        }
+        /* info is borrowed, no need to decref */
+        ((PyArrayMethodObject *)info)->contiguous_indexed_loop =
+                SHORT_multiply_indexed;
+    }
+    
+    
+    {   /* Register multiply's contiguous indexed loop for NPY_USHORT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_USHORT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_USHORT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_USHORT");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         USHORT_multiply_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register multiply's contiguous indexed loop for NPY_INT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_INT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_INT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_INT");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         INT_multiply_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register multiply's contiguous indexed loop for NPY_UINT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_UINT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_UINT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_UINT");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         UINT_multiply_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register multiply's contiguous indexed loop for NPY_LONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_LONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_LONG");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         LONG_multiply_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register multiply's contiguous indexed loop for NPY_ULONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_ULONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_ULONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_ULONG");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         ULONG_multiply_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register multiply's contiguous indexed loop for NPY_LONGLONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGLONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_LONGLONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_LONGLONG");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         LONGLONG_multiply_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register multiply's contiguous indexed loop for NPY_ULONGLONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_ULONGLONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_ULONGLONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_ULONGLONG");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         ULONGLONG_multiply_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register multiply's contiguous indexed loop for NPY_HALF. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_HALF);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_HALF");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_HALF");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         HALF_multiply_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register multiply's contiguous indexed loop for NPY_FLOAT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_FLOAT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_FLOAT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_FLOAT");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         FLOAT_multiply_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register multiply's contiguous indexed loop for NPY_DOUBLE. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_DOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_DOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_DOUBLE");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         DOUBLE_multiply_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register multiply's contiguous indexed loop for NPY_LONGDOUBLE. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGDOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_LONGDOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_LONGDOUBLE");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         LONGDOUBLE_multiply_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register multiply's contiguous indexed loop for NPY_CFLOAT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_CFLOAT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_CFLOAT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_CFLOAT");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         CFLOAT_multiply_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register multiply's contiguous indexed loop for NPY_CDOUBLE. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_CDOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_CDOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_CDOUBLE");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         CDOUBLE_multiply_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register multiply's contiguous indexed loop for NPY_CLONGDOUBLE. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_CLONGDOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "multiply with NPY_CLONGDOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "multiply with NPY_CLONGDOUBLE");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         CLONGDOUBLE_multiply_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    PyDict_SetItemString(dictionary, "multiply", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        negative_functions, negative_data, negative_signatures, 19,
+        1, 1, PyUFunc_None, "negative",
+        DOC_NUMPY_CORE_UMATH_NEGATIVE, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_NegativeTypeResolver;
+    PyDict_SetItemString(dictionary, "negative", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        nextafter_functions, nextafter_data, nextafter_signatures, 4,
+        2, 1, PyUFunc_None, "nextafter",
+        DOC_NUMPY_CORE_UMATH_NEXTAFTER, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "nextafter", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        not_equal_functions, not_equal_data, not_equal_signatures, 24,
+        2, 1, PyUFunc_None, "not_equal",
+        DOC_NUMPY_CORE_UMATH_NOT_EQUAL, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_SimpleBinaryComparisonTypeResolver;
+    PyDict_SetItemString(dictionary, "not_equal", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        positive_functions, positive_data, positive_signatures, 19,
+        1, 1, PyUFunc_None, "positive",
+        DOC_NUMPY_CORE_UMATH_POSITIVE, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_SimpleUniformOperationTypeResolver;
+    PyDict_SetItemString(dictionary, "positive", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        power_functions, power_data, power_signatures, 21,
+        2, 1, PyUFunc_None, "power",
+        DOC_NUMPY_CORE_UMATH_POWER, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "power", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        rad2deg_functions, rad2deg_data, rad2deg_signatures, 5,
+        1, 1, PyUFunc_None, "rad2deg",
+        DOC_NUMPY_CORE_UMATH_RAD2DEG, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "rad2deg", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        radians_functions, radians_data, radians_signatures, 5,
+        1, 1, PyUFunc_None, "radians",
+        DOC_NUMPY_CORE_UMATH_RADIANS, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "radians", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        reciprocal_functions, reciprocal_data, reciprocal_signatures, 18,
+        1, 1, PyUFunc_None, "reciprocal",
+        DOC_NUMPY_CORE_UMATH_RECIPROCAL, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "reciprocal", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        remainder_functions, remainder_data, remainder_signatures, 16,
+        2, 1, PyUFunc_None, "remainder",
+        DOC_NUMPY_CORE_UMATH_REMAINDER, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_RemainderTypeResolver;
+    PyDict_SetItemString(dictionary, "remainder", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        right_shift_functions, right_shift_data, right_shift_signatures, 11,
+        2, 1, PyUFunc_None, "right_shift",
+        DOC_NUMPY_CORE_UMATH_RIGHT_SHIFT, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "right_shift", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        rint_functions, rint_data, rint_signatures, 10,
+        1, 1, PyUFunc_None, "rint",
+        DOC_NUMPY_CORE_UMATH_RINT, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "rint", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        sign_functions, sign_data, sign_signatures, 19,
+        1, 1, PyUFunc_None, "sign",
+        DOC_NUMPY_CORE_UMATH_SIGN, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_SimpleUniformOperationTypeResolver;
+    PyDict_SetItemString(dictionary, "sign", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        signbit_functions, signbit_data, signbit_signatures, 4,
+        1, 1, PyUFunc_None, "signbit",
+        DOC_NUMPY_CORE_UMATH_SIGNBIT, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "signbit", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        sin_functions, sin_data, sin_signatures, 8,
+        1, 1, PyUFunc_None, "sin",
+        DOC_NUMPY_CORE_UMATH_SIN, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "sin", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        sinh_functions, sinh_data, sinh_signatures, 11,
+        1, 1, PyUFunc_None, "sinh",
+        DOC_NUMPY_CORE_UMATH_SINH, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "sinh", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        spacing_functions, spacing_data, spacing_signatures, 4,
+        1, 1, PyUFunc_None, "spacing",
+        DOC_NUMPY_CORE_UMATH_SPACING, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "spacing", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        sqrt_functions, sqrt_data, sqrt_signatures, 10,
+        1, 1, PyUFunc_None, "sqrt",
+        DOC_NUMPY_CORE_UMATH_SQRT, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "sqrt", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        square_functions, square_data, square_signatures, 18,
+        1, 1, PyUFunc_None, "square",
+        DOC_NUMPY_CORE_UMATH_SQUARE, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "square", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        subtract_functions, subtract_data, subtract_signatures, 21,
+        2, 1, PyUFunc_None, "subtract",
+        DOC_NUMPY_CORE_UMATH_SUBTRACT, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    ((PyUFuncObject *)f)->type_resolver = &PyUFunc_SubtractionTypeResolver;
+    
+    {   /* Register subtract's contiguous indexed loop for NPY_BYTE. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_BYTE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_BYTE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_BYTE");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         BYTE_subtract_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register subtract's contiguous indexed loop for NPY_UBYTE. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_UBYTE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_UBYTE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_UBYTE");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         UBYTE_subtract_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register subtract's contiguous indexed loop for NPY_SHORT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_SHORT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_SHORT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_SHORT");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         SHORT_subtract_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register subtract's contiguous indexed loop for NPY_USHORT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_USHORT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_USHORT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_USHORT");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         USHORT_subtract_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register subtract's contiguous indexed loop for NPY_INT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_INT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_INT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_INT");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         INT_subtract_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register subtract's contiguous indexed loop for NPY_UINT. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_UINT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_UINT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_UINT");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         UINT_subtract_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register subtract's contiguous indexed loop for NPY_LONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_LONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_LONG");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         LONG_subtract_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register subtract's contiguous indexed loop for NPY_ULONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_ULONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_ULONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_ULONG");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         ULONG_subtract_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register subtract's contiguous indexed loop for NPY_LONGLONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGLONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_LONGLONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_LONGLONG");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         LONGLONG_subtract_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register subtract's contiguous indexed loop for NPY_ULONGLONG. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_ULONGLONG);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_ULONGLONG");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_ULONGLONG");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         ULONGLONG_subtract_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {   /* Register subtract's contiguous indexed loop for NPY_HALF. */
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_HALF);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f, dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_HALF");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_HALF");
+            return -1;  /* never write through a non-ArrayMethod object */
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         HALF_subtract_indexed;
+        /* info is borrowed, no need to decref */
+    }
+    
+    
+    {
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_FLOAT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f,
+                                           dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_FLOAT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_FLOAT");
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         FLOAT_subtract_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_DOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f,
+                                           dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_DOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_DOUBLE");
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         DOUBLE_subtract_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_LONGDOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f,
+                                           dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_LONGDOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_LONGDOUBLE");
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         LONGDOUBLE_subtract_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_CFLOAT);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f,
+                                           dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_CFLOAT");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_CFLOAT");
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         CFLOAT_subtract_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_CDOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f,
+                                           dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_CDOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_CDOUBLE");
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         CDOUBLE_subtract_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    
+    {
+        PyArray_DTypeMeta *dtype = PyArray_DTypeFromTypeNum(NPY_CLONGDOUBLE);
+        PyObject *info = get_info_no_cast((PyUFuncObject *)f,
+                                           dtype, 3);
+        if (info == NULL) {
+            return -1;
+        }
+        if (info == Py_None) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "cannot add indexed loop to ufunc "
+                "subtract with NPY_CLONGDOUBLE");
+            return -1;
+        }
+        if (!PyObject_TypeCheck(info, &PyArrayMethod_Type)) {
+            PyErr_SetString(PyExc_RuntimeError,
+                "Not a PyArrayMethodObject in ufunc "
+                "subtract with NPY_CLONGDOUBLE");
+        }
+        ((PyArrayMethodObject*)info)->contiguous_indexed_loop =
+                                                         CLONGDOUBLE_subtract_indexed;
+        /* info is borrowed, no need to decref*/
+    }
+    
+    PyDict_SetItemString(dictionary, "subtract", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        tan_functions, tan_data, tan_signatures, 11,
+        1, 1, PyUFunc_None, "tan",
+        DOC_NUMPY_CORE_UMATH_TAN, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "tan", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        tanh_functions, tanh_data, tanh_signatures, 11,
+        1, 1, PyUFunc_None, "tanh",
+        DOC_NUMPY_CORE_UMATH_TANH, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "tanh", f);
+    Py_DECREF(f);
+    identity = NULL;
+    if (0 && identity == NULL) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignatureAndIdentity(
+        trunc_functions, trunc_data, trunc_signatures, 7,
+        1, 1, PyUFunc_None, "trunc",
+        DOC_NUMPY_CORE_UMATH_TRUNC, 0, NULL, identity
+    );
+    if (0) {
+        Py_DECREF(identity);
+    }
+    if (f == NULL) {
+        return -1;
+    }
+    
+    PyDict_SetItemString(dictionary, "trunc", f);
+    Py_DECREF(f);
+
+    return 0;
+}
diff --git a/nanvix-port/generated-headers/_numpyconfig.h b/nanvix-port/generated-headers/_numpyconfig.h
new file mode 100644
index 000000000000..bece00554ba4
--- /dev/null
+++ b/nanvix-port/generated-headers/_numpyconfig.h
@@ -0,0 +1,37 @@
+/* _numpyconfig.h -- Generated for Nanvix (i686, 32-bit): hard-coded type sizes, feature switches and NumPy ABI/API version numbers. */
+
+/* #undef NPY_HAVE_ENDIAN_H */  /* left undefined in this configuration */
+
+#define NPY_SIZEOF_SHORT 2
+#define NPY_SIZEOF_INT 4
+#define NPY_SIZEOF_LONG 4
+#define NPY_SIZEOF_FLOAT 4
+#define NPY_SIZEOF_COMPLEX_FLOAT 8  /* 2 * NPY_SIZEOF_FLOAT */
+#define NPY_SIZEOF_DOUBLE 8
+#define NPY_SIZEOF_COMPLEX_DOUBLE 16  /* 2 * NPY_SIZEOF_DOUBLE */
+#define NPY_SIZEOF_LONGDOUBLE 12  /* NOTE(review): presumably x87 extended precision padded per the i386 ABI -- confirm against the Nanvix toolchain */
+#define NPY_SIZEOF_COMPLEX_LONGDOUBLE 24  /* 2 * NPY_SIZEOF_LONGDOUBLE */
+#define NPY_SIZEOF_PY_INTPTR_T 4
+#define NPY_SIZEOF_OFF_T 8  /* NOTE(review): wider than NPY_SIZEOF_LONG -- confirm the Nanvix libc really provides a 64-bit off_t */
+#define NPY_SIZEOF_PY_LONG_LONG 8
+#define NPY_SIZEOF_LONGLONG 8
+
+#define NPY_USE_C99_COMPLEX 1
+#define NPY_HAVE_COMPLEX_DOUBLE 1
+#define NPY_HAVE_COMPLEX_FLOAT 1
+#define NPY_HAVE_COMPLEX_LONG_DOUBLE 1
+#define NPY_USE_C99_FORMATS 1
+
+/* No signal handling (microkernel) */
+#define NPY_NO_SIGNAL 1
+
+/* No SMP (single-threaded) */
+#define NPY_NO_SMP 1
+
+#define NPY_VISIBILITY_HIDDEN __attribute__((visibility("hidden")))  /* GNU-style attribute syntax; assumes a GCC/Clang-compatible compiler -- TODO confirm */
+#define NPY_ABI_VERSION 0x01000009  /* NOTE(review): presumably must agree with the NumPy sources being built -- verify when updating NumPy */
+#define NPY_API_VERSION 0x00000011  /* NOTE(review): same caveat as NPY_ABI_VERSION -- verify when updating NumPy */
+
+#ifndef __STDC_FORMAT_MACROS  /* define only when not already set, so an earlier definition is not redefined */
+#define __STDC_FORMAT_MACROS 1
+#endif
diff --git a/nanvix-port/generated-headers/_umath_doc_generated.h b/nanvix-port/generated-headers/_umath_doc_generated.h
new file mode 100644
index 000000000000..98e184eca5c6
--- /dev/null
+++ b/nanvix-port/generated-headers/_umath_doc_generated.h
@@ -0,0 +1,92 @@
+#ifndef NUMPY_CORE_INCLUDE__UMATH_DOC_GENERATED_H_
+#define NUMPY_CORE_INCLUDE__UMATH_DOC_GENERATED_H_
+#define DOC_NUMPY_CORE_UMATH_ABSOLUTE "Calculate the absolute value element-wise.\n""\n""``np.abs`` is a shorthand for this function.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input array.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""absolute : ndarray\n""    An ndarray containing the absolute value of\n""    each element in `x`.  
For complex input, ``a + ib``, the\n""    absolute value is :math:`\\sqrt{ a^2 + b^2 }`.\n""    This is a scalar if `x` is a scalar.\n""\n""Examples\n""--------\n"">>> x = np.array([-1.2, 1.2])\n"">>> np.absolute(x)\n""array([ 1.2,  1.2])\n"">>> np.absolute(1.2 + 1j)\n""1.5620499351813308\n""\n""Plot the function over ``[-10, 10]``:\n""\n"">>> import matplotlib.pyplot as plt\n""\n"">>> x = np.linspace(start=-10, stop=10, num=101)\n"">>> plt.plot(x, np.absolute(x))\n"">>> plt.show()\n""\n""Plot the function over the complex plane:\n""\n"">>> xx = x + 1j * x[:, np.newaxis]\n"">>> plt.imshow(np.abs(xx), extent=[-10, 10, -10, 10], cmap=\'gray\')\n"">>> plt.show()\n""\n""The `abs` function can be used as a shorthand for ``np.absolute`` on\n""ndarrays.\n""\n"">>> x = np.array([-1.2, 1.2])\n"">>> abs(x)\n""array([1.2, 1.2])"
+#define DOC_NUMPY_CORE_UMATH_ADD "Add arguments element-wise.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    The arrays to be added.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""add : ndarray or scalar\n""    The sum of `x1` and `x2`, element-wise.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""Notes\n""-----\n""Equivalent to `x1` + `x2` in terms of array broadcasting.\n""\n""Examples\n""--------\n"">>> np.add(1.0, 4.0)\n""5.0\n"">>> x1 = np.arange(9.0).reshape((3, 3))\n"">>> x2 = np.arange(3.0)\n"">>> np.add(x1, x2)\n""array([[  0.,   2.,   4.],\n""       [  3.,   5.,   7.],\n""       [  6.,   8.,  10.]])\n""\n""The ``+`` operator can be used as a shorthand for ``np.add`` on ndarrays.\n""\n"">>> x1 = np.arange(9.0).reshape((3, 3))\n"">>> x2 = np.arange(3.0)\n"">>> x1 + x2\n""array([[ 0.,  2.,  4.],\n""       [ 3.,  5.,  7.],\n""       [ 6.,  8., 10.]])"
+#define DOC_NUMPY_CORE_UMATH_ARCCOS "Trigonometric inverse cosine, element-wise.\n""\n""The inverse of `cos` so that, if ``y = cos(x)``, then ``x = arccos(y)``.\n""\n""Parameters\n""----------\n""x : array_like\n""    `x`-coordinate on the unit circle.\n""    For real arguments, the domain is [-1, 1].\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""angle : ndarray\n""    The angle of the ray intersecting the unit circle at the given\n""    `x`-coordinate in radians [0, pi].\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""cos, arctan, arcsin, emath.arccos\n""\n""Notes\n""-----\n""`arccos` is a multivalued function: for each `x` there are infinitely\n""many numbers `z` such that ``cos(z) = x``. 
The convention is to return\n""the angle `z` whose real part lies in `[0, pi]`.\n""\n""For real-valued input data types, `arccos` always returns real output.\n""For each value that cannot be expressed as a real number or infinity,\n""it yields ``nan`` and sets the `invalid` floating point error flag.\n""\n""For complex-valued input, `arccos` is a complex analytic function that\n""has branch cuts ``[-inf, -1]`` and `[1, inf]` and is continuous from\n""above on the former and from below on the latter.\n""\n""The inverse `cos` is also known as `acos` or cos^-1.\n""\n""References\n""----------\n""M. Abramowitz and I.A. Stegun, \"Handbook of Mathematical Functions\",\n""10th printing, 1964, pp. 79.\n""https://personal.math.ubc.ca/~cbm/aands/page_79.htm\n""\n""Examples\n""--------\n""We expect the arccos of 1 to be 0, and of -1 to be pi:\n""\n"">>> np.arccos([1, -1])\n""array([ 0.        ,  3.14159265])\n""\n""Plot arccos:\n""\n"">>> import matplotlib.pyplot as plt\n"">>> x = np.linspace(-1, 1, num=100)\n"">>> plt.plot(x, np.arccos(x))\n"">>> plt.axis(\'tight\')\n"">>> plt.show()"
+#define DOC_NUMPY_CORE_UMATH_ARCCOSH "Inverse hyperbolic cosine, element-wise.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input array.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""arccosh : ndarray\n""    Array of the same shape as `x`.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""\n""cosh, arcsinh, sinh, arctanh, tanh\n""\n""Notes\n""-----\n""`arccosh` is a multivalued function: for each `x` there are infinitely\n""many numbers `z` such that `cosh(z) = x`. The convention is to return the\n""`z` whose imaginary part lies in ``[-pi, pi]`` and the real part in\n""``[0, inf]``.\n""\n""For real-valued input data types, `arccosh` always returns real output.\n""For each value that cannot be expressed as a real number or infinity, it\n""yields ``nan`` and sets the `invalid` floating point error flag.\n""\n""For complex-valued input, `arccosh` is a complex analytical function that\n""has a branch cut `[-inf, 1]` and is continuous from above on it.\n""\n""References\n""----------\n"".. [1] M. Abramowitz and I.A. 
Stegun, \"Handbook of Mathematical Functions\",\n""       10th printing, 1964, pp. 86.\n""       https://personal.math.ubc.ca/~cbm/aands/page_86.htm\n"".. [2] Wikipedia, \"Inverse hyperbolic function\",\n""       https://en.wikipedia.org/wiki/Arccosh\n""\n""Examples\n""--------\n"">>> np.arccosh([np.e, 10.0])\n""array([ 1.65745445,  2.99322285])\n"">>> np.arccosh(1)\n""0.0"
+#define DOC_NUMPY_CORE_UMATH_ARCSIN "Inverse sine, element-wise.\n""\n""Parameters\n""----------\n""x : array_like\n""    `y`-coordinate on the unit circle.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""angle : ndarray\n""    The inverse sine of each element in `x`, in radians and in the\n""    closed interval ``[-pi/2, pi/2]``.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""sin, cos, arccos, tan, arctan, arctan2, emath.arcsin\n""\n""Notes\n""-----\n""`arcsin` is a multivalued function: for each `x` there are infinitely\n""many numbers `z` such that :math:`sin(z) = x`.  
The convention is to\n""return the angle `z` whose real part lies in [-pi/2, pi/2].\n""\n""For real-valued input data types, *arcsin* always returns real output.\n""For each value that cannot be expressed as a real number or infinity,\n""it yields ``nan`` and sets the `invalid` floating point error flag.\n""\n""For complex-valued input, `arcsin` is a complex analytic function that\n""has, by convention, the branch cuts [-inf, -1] and [1, inf]  and is\n""continuous from above on the former and from below on the latter.\n""\n""The inverse sine is also known as `asin` or sin^{-1}.\n""\n""References\n""----------\n""Abramowitz, M. and Stegun, I. A., *Handbook of Mathematical Functions*,\n""10th printing, New York: Dover, 1964, pp. 79ff.\n""https://personal.math.ubc.ca/~cbm/aands/page_79.htm\n""\n""Examples\n""--------\n"">>> np.arcsin(1)     # pi/2\n""1.5707963267948966\n"">>> np.arcsin(-1)    # -pi/2\n""-1.5707963267948966\n"">>> np.arcsin(0)\n""0.0"
+#define DOC_NUMPY_CORE_UMATH_ARCSINH "Inverse hyperbolic sine element-wise.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input array.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Array of the same shape as `x`.\n""    This is a scalar if `x` is a scalar.\n""\n""Notes\n""-----\n""`arcsinh` is a multivalued function: for each `x` there are infinitely\n""many numbers `z` such that `sinh(z) = x`. The convention is to return the\n""`z` whose imaginary part lies in `[-pi/2, pi/2]`.\n""\n""For real-valued input data types, `arcsinh` always returns real output.\n""For each value that cannot be expressed as a real number or infinity, it\n""returns ``nan`` and sets the `invalid` floating point error flag.\n""\n""For complex-valued input, `arccos` is a complex analytical function that\n""has branch cuts `[1j, infj]` and `[-1j, -infj]` and is continuous from\n""the right on the former and from the left on the latter.\n""\n""The inverse hyperbolic sine is also known as `asinh` or ``sinh^-1``.\n""\n""References\n""----------\n"".. [1] M. Abramowitz and I.A. 
Stegun, \"Handbook of Mathematical Functions\",\n""       10th printing, 1964, pp. 86.\n""       https://personal.math.ubc.ca/~cbm/aands/page_86.htm\n"".. [2] Wikipedia, \"Inverse hyperbolic function\",\n""       https://en.wikipedia.org/wiki/Arcsinh\n""\n""Examples\n""--------\n"">>> np.arcsinh(np.array([np.e, 10.0]))\n""array([ 1.72538256,  2.99822295])"
+#define DOC_NUMPY_CORE_UMATH_ARCTAN "Trigonometric inverse tangent, element-wise.\n""\n""The inverse of tan, so that if ``y = tan(x)`` then ``x = arctan(y)``.\n""\n""Parameters\n""----------\n""x : array_like\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Out has the same shape as `x`.  Its real part is in\n""    ``[-pi/2, pi/2]`` (``arctan(+/-inf)`` returns ``+/-pi/2``).\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""arctan2 : The \"four quadrant\" arctan of the angle formed by (`x`, `y`)\n""    and the positive `x`-axis.\n""angle : Argument of complex values.\n""\n""Notes\n""-----\n""`arctan` is a multi-valued function: for each `x` there are infinitely\n""many numbers `z` such that tan(`z`) = `x`.  
The convention is to return\n""the angle `z` whose real part lies in [-pi/2, pi/2].\n""\n""For real-valued input data types, `arctan` always returns real output.\n""For each value that cannot be expressed as a real number or infinity,\n""it yields ``nan`` and sets the `invalid` floating point error flag.\n""\n""For complex-valued input, `arctan` is a complex analytic function that\n""has [``1j, infj``] and [``-1j, -infj``] as branch cuts, and is continuous\n""from the left on the former and from the right on the latter.\n""\n""The inverse tangent is also known as `atan` or tan^{-1}.\n""\n""References\n""----------\n""Abramowitz, M. and Stegun, I. A., *Handbook of Mathematical Functions*,\n""10th printing, New York: Dover, 1964, pp. 79.\n""https://personal.math.ubc.ca/~cbm/aands/page_79.htm\n""\n""Examples\n""--------\n""We expect the arctan of 0 to be 0, and of 1 to be pi/4:\n""\n"">>> np.arctan([0, 1])\n""array([ 0.        ,  0.78539816])\n""\n"">>> np.pi/4\n""0.78539816339744828\n""\n""Plot arctan:\n""\n"">>> import matplotlib.pyplot as plt\n"">>> x = np.linspace(-10, 10)\n"">>> plt.plot(x, np.arctan(x))\n"">>> plt.axis(\'tight\')\n"">>> plt.show()"
+#define DOC_NUMPY_CORE_UMATH_ARCTAN2 "Element-wise arc tangent of ``x1/x2`` choosing the quadrant correctly.\n""\n""The quadrant (i.e., branch) is chosen so that ``arctan2(x1, x2)`` is\n""the signed angle in radians between the ray ending at the origin and\n""passing through the point (1,0), and the ray ending at the origin and\n""passing through the point (`x2`, `x1`).  (Note the role reversal: the\n""\"`y`-coordinate\" is the first function parameter, the \"`x`-coordinate\"\n""is the second.)  By IEEE convention, this function is defined for\n""`x2` = +/-0 and for either or both of `x1` and `x2` = +/-inf (see\n""Notes for specific values).\n""\n""This function is not defined for complex-valued arguments; for the\n""so-called argument of complex values, use `angle`.\n""\n""Parameters\n""----------\n""x1 : array_like, real-valued\n""    `y`-coordinates.\n""x2 : array_like, real-valued\n""    `x`-coordinates.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. 
At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""angle : ndarray\n""    Array of angles in radians, in the range ``[-pi, pi]``.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""arctan, tan, angle\n""\n""Notes\n""-----\n""*arctan2* is identical to the `atan2` function of the underlying\n""C library.  The following special values are defined in the C\n""standard: [1]_\n""\n""====== ====== ================\n""`x1`   `x2`   `arctan2(x1,x2)`\n""====== ====== ================\n""+/- 0  +0     +/- 0\n""+/- 0  -0     +/- pi\n"" > 0   +/-inf +0 / +pi\n"" < 0   +/-inf -0 / -pi\n""+/-inf +inf   +/- (pi/4)\n""+/-inf -inf   +/- (3*pi/4)\n""====== ====== ================\n""\n""Note that +0 and -0 are distinct floating point numbers, as are +inf\n""and -inf.\n""\n""References\n""----------\n"".. [1] ISO/IEC standard 9899:1999, \"Programming language C.\"\n""\n""Examples\n""--------\n""Consider four points in different quadrants:\n""\n"">>> x = np.array([-1, +1, +1, -1])\n"">>> y = np.array([-1, -1, +1, +1])\n"">>> np.arctan2(y, x) * 180 / np.pi\n""array([-135.,  -45.,   45.,  135.])\n""\n""Note the order of the parameters. `arctan2` is defined also when `x2` = 0\n""and at several other special points, obtaining values in\n""the range ``[-pi, pi]``:\n""\n"">>> np.arctan2([1., -1.], [0., 0.])\n""array([ 1.57079633, -1.57079633])\n"">>> np.arctan2([0., 0., np.inf], [+0., -0., np.inf])\n""array([0.        , 3.14159265, 0.78539816])"
+#define DOC_NUMPY_CORE_UMATH__ARG "DO NOT USE, ONLY FOR TESTING"
+#define DOC_NUMPY_CORE_UMATH_ARCTANH "Inverse hyperbolic tangent element-wise.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input array.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Array of the same shape as `x`.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""emath.arctanh\n""\n""Notes\n""-----\n""`arctanh` is a multivalued function: for each `x` there are infinitely\n""many numbers `z` such that ``tanh(z) = x``. The convention is to return\n""the `z` whose imaginary part lies in `[-pi/2, pi/2]`.\n""\n""For real-valued input data types, `arctanh` always returns real output.\n""For each value that cannot be expressed as a real number or infinity,\n""it yields ``nan`` and sets the `invalid` floating point error flag.\n""\n""For complex-valued input, `arctanh` is a complex analytical function\n""that has branch cuts `[-1, -inf]` and `[1, inf]` and is continuous from\n""above on the former and from below on the latter.\n""\n""The inverse hyperbolic tangent is also known as `atanh` or ``tanh^-1``.\n""\n""References\n""----------\n"".. [1] M. 
Abramowitz and I.A. Stegun, \"Handbook of Mathematical Functions\",\n""       10th printing, 1964, pp. 86.\n""       https://personal.math.ubc.ca/~cbm/aands/page_86.htm\n"".. [2] Wikipedia, \"Inverse hyperbolic function\",\n""       https://en.wikipedia.org/wiki/Arctanh\n""\n""Examples\n""--------\n"">>> np.arctanh([0, -0.5])\n""array([ 0.        , -0.54930614])"
+#define DOC_NUMPY_CORE_UMATH_BITWISE_AND "Compute the bit-wise AND of two arrays element-wise.\n""\n""Computes the bit-wise AND of the underlying binary representation of\n""the integers in the input arrays. This ufunc implements the C/Python\n""operator ``&``.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Only integer and boolean types are handled.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Result.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""logical_and\n""bitwise_or\n""bitwise_xor\n""binary_repr :\n""    Return the binary representation of the input number as a string.\n""\n""Examples\n""--------\n""The number 13 is represented by ``00001101``.  Likewise, 17 is\n""represented by ``00010001``.  
The bit-wise AND of 13 and 17 is\n""therefore ``000000001``, or 1:\n""\n"">>> np.bitwise_and(13, 17)\n""1\n""\n"">>> np.bitwise_and(14, 13)\n""12\n"">>> np.binary_repr(12)\n""\'1100\'\n"">>> np.bitwise_and([14,3], 13)\n""array([12,  1])\n""\n"">>> np.bitwise_and([11,7], [4,25])\n""array([0, 1])\n"">>> np.bitwise_and(np.array([2,5,255]), np.array([3,14,16]))\n""array([ 2,  4, 16])\n"">>> np.bitwise_and([True, True], [False, True])\n""array([False,  True])\n""\n""The ``&`` operator can be used as a shorthand for ``np.bitwise_and`` on\n""ndarrays.\n""\n"">>> x1 = np.array([2, 5, 255])\n"">>> x2 = np.array([3, 14, 16])\n"">>> x1 & x2\n""array([ 2,  4, 16])"
+#define DOC_NUMPY_CORE_UMATH_BITWISE_OR "Compute the bit-wise OR of two arrays element-wise.\n""\n""Computes the bit-wise OR of the underlying binary representation of\n""the integers in the input arrays. This ufunc implements the C/Python\n""operator ``|``.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Only integer and boolean types are handled.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Result.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""logical_or\n""bitwise_and\n""bitwise_xor\n""binary_repr :\n""    Return the binary representation of the input number as a string.\n""\n""Examples\n""--------\n""The number 13 has the binary representation ``00001101``. Likewise,\n""16 is represented by ``00010000``.  
The bit-wise OR of 13 and 16 is\n""then ``00011101``, or 29:\n""\n"">>> np.bitwise_or(13, 16)\n""29\n"">>> np.binary_repr(29)\n""\'11101\'\n""\n"">>> np.bitwise_or(32, 2)\n""34\n"">>> np.bitwise_or([33, 4], 1)\n""array([33,  5])\n"">>> np.bitwise_or([33, 4], [1, 2])\n""array([33,  6])\n""\n"">>> np.bitwise_or(np.array([2, 5, 255]), np.array([4, 4, 4]))\n""array([  6,   5, 255])\n"">>> np.array([2, 5, 255]) | np.array([4, 4, 4])\n""array([  6,   5, 255])\n"">>> np.bitwise_or(np.array([2, 5, 255, 2147483647], dtype=np.int32),\n""...               np.array([4, 4, 4, 2147483647], dtype=np.int32))\n""array([         6,          5,        255, 2147483647])\n"">>> np.bitwise_or([True, True], [False, True])\n""array([ True,  True])\n""\n""The ``|`` operator can be used as a shorthand for ``np.bitwise_or`` on\n""ndarrays.\n""\n"">>> x1 = np.array([2, 5, 255])\n"">>> x2 = np.array([4, 4, 4])\n"">>> x1 | x2\n""array([  6,   5, 255])"
+#define DOC_NUMPY_CORE_UMATH_BITWISE_XOR "Compute the bit-wise XOR of two arrays element-wise.\n""\n""Computes the bit-wise XOR of the underlying binary representation of\n""the integers in the input arrays. This ufunc implements the C/Python\n""operator ``^``.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Only integer and boolean types are handled.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Result.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""logical_xor\n""bitwise_and\n""bitwise_or\n""binary_repr :\n""    Return the binary representation of the input number as a string.\n""\n""Examples\n""--------\n""The number 13 is represented by ``00001101``. Likewise, 17 is\n""represented by ``00010001``.  
The bit-wise XOR of 13 and 17 is\n""therefore ``00011100``, or 28:\n""\n"">>> np.bitwise_xor(13, 17)\n""28\n"">>> np.binary_repr(28)\n""\'11100\'\n""\n"">>> np.bitwise_xor(31, 5)\n""26\n"">>> np.bitwise_xor([31,3], 5)\n""array([26,  6])\n""\n"">>> np.bitwise_xor([31,3], [5,6])\n""array([26,  5])\n"">>> np.bitwise_xor([True, True], [False, True])\n""array([ True, False])\n""\n""The ``^`` operator can be used as a shorthand for ``np.bitwise_xor`` on\n""ndarrays.\n""\n"">>> x1 = np.array([True, True])\n"">>> x2 = np.array([False, True])\n"">>> x1 ^ x2\n""array([ True, False])"
+#define DOC_NUMPY_CORE_UMATH_CEIL "Return the ceiling of the input, element-wise.\n""\n""The ceil of the scalar `x` is the smallest integer `i`, such that\n""``i >= x``.  It is often denoted as :math:`\\lceil x \\rceil`.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input data.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray or scalar\n""    The ceiling of each element in `x`, with `float` dtype.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""floor, trunc, rint, fix\n""\n""Examples\n""--------\n"">>> a = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0])\n"">>> np.ceil(a)\n""array([-1., -1., -0.,  1.,  2.,  2.,  2.])"
+#define DOC_NUMPY_CORE_UMATH_TRUNC "Return the truncated value of the input, element-wise.\n""\n""The truncated value of the scalar `x` is the nearest integer `i` which\n""is closer to zero than `x` is. In short, the fractional part of the\n""signed number `x` is discarded.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input data.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray or scalar\n""    The truncated value of each element in `x`.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""ceil, floor, rint, fix\n""\n""Notes\n""-----\n"".. versionadded:: 1.3.0\n""\n""Examples\n""--------\n"">>> a = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0])\n"">>> np.trunc(a)\n""array([-1., -1., -0.,  0.,  1.,  1.,  2.])"
+#define DOC_NUMPY_CORE_UMATH_CONJUGATE "Return the complex conjugate, element-wise.\n""\n""The complex conjugate of a complex number is obtained by changing the\n""sign of its imaginary part.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input value.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    The complex conjugate of `x`, with same dtype as `x`.\n""    This is a scalar if `x` is a scalar.\n""\n""Notes\n""-----\n""`conj` is an alias for `conjugate`:\n""\n"">>> np.conj is np.conjugate\n""True\n""\n""Examples\n""--------\n"">>> np.conjugate(1+2j)\n""(1-2j)\n""\n"">>> x = np.eye(2) + 1j * np.eye(2)\n"">>> np.conjugate(x)\n""array([[ 1.-1.j,  0.-0.j],\n""       [ 0.-0.j,  1.-1.j]])"
+#define DOC_NUMPY_CORE_UMATH_COS "Cosine element-wise.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input array in radians.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    The corresponding cosine values.\n""    This is a scalar if `x` is a scalar.\n""\n""Notes\n""-----\n""If `out` is provided, the function writes the result into it,\n""and returns a reference to `out`.  (See Examples)\n""\n""References\n""----------\n""M. Abramowitz and I. A. 
Stegun, Handbook of Mathematical Functions.\n""New York, NY: Dover, 1972.\n""\n""Examples\n""--------\n"">>> np.cos(np.array([0, np.pi/2, np.pi]))\n""array([  1.00000000e+00,   6.12303177e-17,  -1.00000000e+00])\n"">>>\n"">>> # Example of providing the optional output parameter\n"">>> out1 = np.array([0], dtype=\'d\')\n"">>> out2 = np.cos([0.1], out1)\n"">>> out2 is out1\n""True\n"">>>\n"">>> # Example of ValueError due to provision of shape mis-matched `out`\n"">>> np.cos(np.zeros((3,3)),np.zeros((2,2)))\n""Traceback (most recent call last):\n""  File \"<stdin>\", line 1, in <module>\n""ValueError: operands could not be broadcast together with shapes (3,3) (2,2)"
+#define DOC_NUMPY_CORE_UMATH_COSH "Hyperbolic cosine, element-wise.\n""\n""Equivalent to ``1/2 * (np.exp(x) + np.exp(-x))`` and ``np.cos(1j*x)``.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input array.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Output array of same shape as `x`.\n""    This is a scalar if `x` is a scalar.\n""\n""Examples\n""--------\n"">>> np.cosh(0)\n""1.0\n""\n""The hyperbolic cosine describes the shape of a hanging cable:\n""\n"">>> import matplotlib.pyplot as plt\n"">>> x = np.linspace(-4, 4, 1000)\n"">>> plt.plot(x, np.cosh(x))\n"">>> plt.show()"
+#define DOC_NUMPY_CORE_UMATH_DEGREES "Convert angles from radians to degrees.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input array in radians.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray of floats\n""    The corresponding degree values; if `out` was supplied this is a\n""    reference to it.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""rad2deg : equivalent function\n""\n""Examples\n""--------\n""Convert a radian array to degrees\n""\n"">>> rad = np.arange(12.)*np.pi/6\n"">>> np.degrees(rad)\n""array([   0.,   30.,   60.,   90.,  120.,  150.,  180.,  210.,  240.,\n""        270.,  300.,  330.])\n""\n"">>> out = np.zeros((rad.shape))\n"">>> r = np.degrees(rad, out)\n"">>> np.all(r == out)\n""True"
+#define DOC_NUMPY_CORE_UMATH_RAD2DEG "Convert angles from radians to degrees.\n""\n""Parameters\n""----------\n""x : array_like\n""    Angle in radians.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    The corresponding angle in degrees.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""deg2rad : Convert angles from degrees to radians.\n""unwrap : Remove large jumps in angle by wrapping.\n""\n""Notes\n""-----\n"".. versionadded:: 1.3.0\n""\n""rad2deg(x) is ``180 * x / pi``.\n""\n""Examples\n""--------\n"">>> np.rad2deg(np.pi/2)\n""90.0"
+#define DOC_NUMPY_CORE_UMATH_HEAVISIDE "Compute the Heaviside step function.\n""\n""The Heaviside step function is defined as::\n""\n""                          0   if x1 < 0\n""    heaviside(x1, x2) =  x2   if x1 == 0\n""                          1   if x1 > 0\n""\n""where `x2` is often taken to be 0.5, but 0 and 1 are also sometimes used.\n""\n""Parameters\n""----------\n""x1 : array_like\n""    Input values.\n""x2 : array_like\n""    The value of the function when x1 is 0.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    The output array, element-wise Heaviside step function of `x1`.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""Notes\n""-----\n"".. versionadded:: 1.13.0\n""\n""References\n""----------\n"".. [1] Wikipedia, \"Heaviside step function\",\n""       https://en.wikipedia.org/wiki/Heaviside_step_function\n""\n""Examples\n""--------\n"">>> np.heaviside([-1.5, 0, 2.0], 0.5)\n""array([ 0. ,  0.5,  1. 
])\n"">>> np.heaviside([-1.5, 0, 2.0], 1)\n""array([ 0.,  1.,  1.])"
+#define DOC_NUMPY_CORE_UMATH_DIVIDE "Divide arguments element-wise.\n""\n""Parameters\n""----------\n""x1 : array_like\n""    Dividend array.\n""x2 : array_like\n""    Divisor array.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray or scalar\n""    The quotient ``x1/x2``, element-wise.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""seterr : Set whether to raise or warn on overflow, underflow and\n""         division by zero.\n""\n""Notes\n""-----\n""Equivalent to ``x1`` / ``x2`` in terms of array-broadcasting.\n""\n""The ``true_divide(x1, x2)`` function is an alias for\n""``divide(x1, x2)``.\n""\n""Examples\n""--------\n"">>> np.divide(2.0, 4.0)\n""0.5\n"">>> x1 = np.arange(9.0).reshape((3, 3))\n"">>> x2 = np.arange(3.0)\n"">>> np.divide(x1, x2)\n""array([[nan, 1. , 1. ],\n""       [inf, 4. , 2.5],\n""       [inf, 7. , 4. 
]])\n""\n""The ``/`` operator can be used as a shorthand for ``np.divide`` on\n""ndarrays.\n""\n"">>> x1 = np.arange(9.0).reshape((3, 3))\n"">>> x2 = 2 * np.ones(3)\n"">>> x1 / x2\n""array([[0. , 0.5, 1. ],\n""       [1.5, 2. , 2.5],\n""       [3. , 3.5, 4. ]])"
+#define DOC_NUMPY_CORE_UMATH_EQUAL "Return (x1 == x2) element-wise.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Input arrays.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Output array, element-wise comparison of `x1` and `x2`.\n""    Typically of type bool, unless ``dtype=object`` is passed.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""not_equal, greater_equal, less_equal, greater, less\n""\n""Examples\n""--------\n"">>> np.equal([0, 1, 3], np.arange(3))\n""array([ True,  True, False])\n""\n""What is compared are values, not types. So an int (1) and an array of\n""length one can evaluate as True:\n""\n"">>> np.equal(1, np.ones(1))\n""array([ True])\n""\n""The ``==`` operator can be used as a shorthand for ``np.equal`` on\n""ndarrays.\n""\n"">>> a = np.array([2, 4, 6])\n"">>> b = np.array([2, 4, 2])\n"">>> a == b\n""array([ True,  True, False])"
+#define DOC_NUMPY_CORE_UMATH_EXP "Calculate the exponential of all elements in the input array.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input values.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Output array, element-wise exponential of `x`.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""expm1 : Calculate ``exp(x) - 1`` for all elements in the array.\n""exp2  : Calculate ``2**x`` for all elements in the array.\n""\n""Notes\n""-----\n""The irrational number ``e`` is also known as Euler\'s number.  It is\n""approximately 2.718281, and is the base of the natural logarithm,\n""``ln`` (this means that, if :math:`x = \\ln y = \\log_e y`,\n""then :math:`e^x = y`). For real input, ``exp(x)`` is always positive.\n""\n""For complex arguments, ``x = a + ib``, we can write\n"":math:`e^x = e^a e^{ib}`.  The first term, :math:`e^a`, is already\n""known (it is the real argument, described above).  
The second term,\n"":math:`e^{ib}`, is :math:`\\cos b + i \\sin b`, a function with\n""magnitude 1 and a periodic phase.\n""\n""References\n""----------\n"".. [1] Wikipedia, \"Exponential function\",\n""       https://en.wikipedia.org/wiki/Exponential_function\n"".. [2] M. Abramowitz and I. A. Stegun, \"Handbook of Mathematical Functions\n""       with Formulas, Graphs, and Mathematical Tables,\" Dover, 1964, p. 69,\n""       https://personal.math.ubc.ca/~cbm/aands/page_69.htm\n""\n""Examples\n""--------\n""Plot the magnitude and phase of ``exp(x)`` in the complex plane:\n""\n"">>> import matplotlib.pyplot as plt\n""\n"">>> x = np.linspace(-2*np.pi, 2*np.pi, 100)\n"">>> xx = x + 1j * x[:, np.newaxis] # a + ib over complex plane\n"">>> out = np.exp(xx)\n""\n"">>> plt.subplot(121)\n"">>> plt.imshow(np.abs(out),\n""...            extent=[-2*np.pi, 2*np.pi, -2*np.pi, 2*np.pi], cmap=\'gray\')\n"">>> plt.title(\'Magnitude of exp(x)\')\n""\n"">>> plt.subplot(122)\n"">>> plt.imshow(np.angle(out),\n""...            extent=[-2*np.pi, 2*np.pi, -2*np.pi, 2*np.pi], cmap=\'hsv\')\n"">>> plt.title(\'Phase (angle) of exp(x)\')\n"">>> plt.show()"
+#define DOC_NUMPY_CORE_UMATH_EXP2 "Calculate `2**p` for all `p` in the input array.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input values.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Element-wise 2 to the power `x`.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""power\n""\n""Notes\n""-----\n"".. versionadded:: 1.3.0\n""\n""\n""\n""Examples\n""--------\n"">>> np.exp2([2, 3])\n""array([ 4.,  8.])"
+#define DOC_NUMPY_CORE_UMATH_EXPM1 "Calculate ``exp(x) - 1`` for all elements in the array.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input values.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Element-wise exponential minus one: ``out = exp(x) - 1``.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""log1p : ``log(1 + x)``, the inverse of expm1.\n""\n""\n""Notes\n""-----\n""This function provides greater precision than ``exp(x) - 1``\n""for small values of ``x``.\n""\n""Examples\n""--------\n""The true value of ``exp(1e-10) - 1`` is ``1.00000000005e-10`` to\n""about 32 significant digits. This example shows the superiority of\n""expm1 in this case.\n""\n"">>> np.expm1(1e-10)\n""1.00000000005e-10\n"">>> np.exp(1e-10) - 1\n""1.000000082740371e-10"
+#define DOC_NUMPY_CORE_UMATH_FABS "Compute the absolute values element-wise.\n""\n""This function returns the absolute values (positive magnitude) of the\n""data in `x`. Complex values are not handled, use `absolute` to find the\n""absolute values of complex data.\n""\n""Parameters\n""----------\n""x : array_like\n""    The array of numbers for which the absolute values are required. If\n""    `x` is a scalar, the result `y` will also be a scalar.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray or scalar\n""    The absolute values of `x`, the returned values are always floats.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""absolute : Absolute values including `complex` types.\n""\n""Examples\n""--------\n"">>> np.fabs(-1)\n""1.0\n"">>> np.fabs([-1.2, 1.2])\n""array([ 1.2,  1.2])"
+#define DOC_NUMPY_CORE_UMATH_FLOOR "Return the floor of the input, element-wise.\n""\n""The floor of the scalar `x` is the largest integer `i`, such that\n""``i <= x``.  It is often denoted as :math:`\\lfloor x \\rfloor`.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input data.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray or scalar\n""    The floor of each element in `x`.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""ceil, trunc, rint, fix\n""\n""Notes\n""-----\n""Some spreadsheet programs calculate the \"floor-towards-zero\", where\n""``floor(-2.5) == -2``.  NumPy instead uses the definition of\n""`floor` where ``floor(-2.5) == -3``. The \"floor-towards-zero\"\n""function is called ``fix`` in NumPy.\n""\n""Examples\n""--------\n"">>> a = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0])\n"">>> np.floor(a)\n""array([-2., -2., -1.,  0.,  1.,  1.,  2.])"
+#define DOC_NUMPY_CORE_UMATH_FLOOR_DIVIDE "Return the largest integer smaller or equal to the division of the inputs.\n""It is equivalent to the Python ``//`` operator and pairs with the\n""Python ``%`` (`remainder`) function, so that ``a = a % b + b * (a // b)``\n""up to roundoff.\n""\n""Parameters\n""----------\n""x1 : array_like\n""    Numerator.\n""x2 : array_like\n""    Denominator.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    y = floor(`x1`/`x2`)\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""remainder : Remainder complementary to floor_divide.\n""divmod : Simultaneous floor division and remainder.\n""divide : Standard division.\n""floor : Round a number to the nearest integer toward minus infinity.\n""ceil : Round a number to the nearest integer toward infinity.\n""\n""Examples\n""--------\n"">>> np.floor_divide(7,3)\n""2\n"">>> np.floor_divide([1., 2., 3., 4.], 2.5)\n""array([ 0.,  0.,  1.,  1.])\n""\n""The ``//`` 
operator can be used as a shorthand for ``np.floor_divide``\n""on ndarrays.\n""\n"">>> x1 = np.array([1., 2., 3., 4.])\n"">>> x1 // 2.5\n""array([0., 0., 1., 1.])"
+#define DOC_NUMPY_CORE_UMATH_FMOD "Returns the element-wise remainder of division.\n""\n""This is the NumPy implementation of the C library function fmod, the\n""remainder has the same sign as the dividend `x1`. It is equivalent to\n""the Matlab(TM) ``rem`` function and should not be confused with the\n""Python modulus operator ``x1 % x2``.\n""\n""Parameters\n""----------\n""x1 : array_like\n""    Dividend.\n""x2 : array_like\n""    Divisor.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : array_like\n""    The remainder of the division of `x1` by `x2`.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""remainder : Equivalent to the Python ``%`` operator.\n""divide\n""\n""Notes\n""-----\n""The result of the modulo operation for negative dividend and divisors\n""is bound by conventions. For `fmod`, the sign of result is the sign of\n""the dividend, while for `remainder` the sign of the result is the sign\n""of the divisor. 
The `fmod` function is equivalent to the Matlab(TM)\n""``rem`` function.\n""\n""Examples\n""--------\n"">>> np.fmod([-3, -2, -1, 1, 2, 3], 2)\n""array([-1,  0, -1,  1,  0,  1])\n"">>> np.remainder([-3, -2, -1, 1, 2, 3], 2)\n""array([1, 0, 1, 1, 0, 1])\n""\n"">>> np.fmod([5, 3], [2, 2.])\n""array([ 1.,  1.])\n"">>> a = np.arange(-3, 3).reshape(3, 2)\n"">>> a\n""array([[-3, -2],\n""       [-1,  0],\n""       [ 1,  2]])\n"">>> np.fmod(a, [2,2])\n""array([[-1,  0],\n""       [-1,  0],\n""       [ 1,  0]])"
+#define DOC_NUMPY_CORE_UMATH_GREATER "Return the truth value of (x1 > x2) element-wise.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Input arrays.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Output array, element-wise comparison of `x1` and `x2`.\n""    Typically of type bool, unless ``dtype=object`` is passed.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""\n""See Also\n""--------\n""greater_equal, less, less_equal, equal, not_equal\n""\n""Examples\n""--------\n"">>> np.greater([4,2],[2,2])\n""array([ True, False])\n""\n""The ``>`` operator can be used as a shorthand for ``np.greater`` on\n""ndarrays.\n""\n"">>> a = np.array([4, 2])\n"">>> b = np.array([2, 2])\n"">>> a > b\n""array([ True, False])"
+#define DOC_NUMPY_CORE_UMATH_GREATER_EQUAL "Return the truth value of (x1 >= x2) element-wise.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Input arrays.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : bool or ndarray of bool\n""    Output array, element-wise comparison of `x1` and `x2`.\n""    Typically of type bool, unless ``dtype=object`` is passed.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""greater, less, less_equal, equal, not_equal\n""\n""Examples\n""--------\n"">>> np.greater_equal([4, 2, 1], [2, 2, 2])\n""array([ True,  True, False])\n""\n""The ``>=`` operator can be used as a shorthand for ``np.greater_equal``\n""on ndarrays.\n""\n"">>> a = np.array([4, 2, 1])\n"">>> b = np.array([2, 2, 2])\n"">>> a >= b\n""array([ True,  True, False])"
+#define DOC_NUMPY_CORE_UMATH_HYPOT "Given the \"legs\" of a right triangle, return its hypotenuse.\n""\n""Equivalent to ``sqrt(x1**2 + x2**2)``, element-wise.  If `x1` or\n""`x2` is scalar_like (i.e., unambiguously cast-able to a scalar type),\n""it is broadcast for use with each element of the other argument.\n""(See Examples)\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Leg of the triangle(s).\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""z : ndarray\n""    The hypotenuse of the triangle(s).\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""Examples\n""--------\n"">>> np.hypot(3*np.ones((3, 3)), 4*np.ones((3, 3)))\n""array([[ 5.,  5.,  5.],\n""       [ 5.,  5.,  5.],\n""       [ 5.,  5.,  5.]])\n""\n""Example showing broadcast of scalar_like argument:\n""\n"">>> np.hypot(3*np.ones((3, 3)), [4])\n""array([[ 5.,  5.,  5.],\n""       [ 5.,  5.,  5.],\n""       [ 5.,  5.,  5.]])"
+#define DOC_NUMPY_CORE_UMATH_INVERT "Compute bit-wise inversion, or bit-wise NOT, element-wise.\n""\n""Computes the bit-wise NOT of the underlying binary representation of\n""the integers in the input arrays. This ufunc implements the C/Python\n""operator ``~``.\n""\n""For signed integer inputs, the two\'s complement is returned.  In a\n""two\'s-complement system negative numbers are represented by the two\'s\n""complement of the absolute value. This is the most common method of\n""representing signed integers on computers [1]_. A N-bit\n""two\'s-complement system can represent every integer in the range\n"":math:`-2^{N-1}` to :math:`+2^{N-1}-1`.\n""\n""Parameters\n""----------\n""x : array_like\n""    Only integer and boolean types are handled.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. 
At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Result.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""bitwise_and, bitwise_or, bitwise_xor\n""logical_not\n""binary_repr :\n""    Return the binary representation of the input number as a string.\n""\n""Notes\n""-----\n""`bitwise_not` is an alias for `invert`:\n""\n"">>> np.bitwise_not is np.invert\n""True\n""\n""References\n""----------\n"".. [1] Wikipedia, \"Two\'s complement\",\n""    https://en.wikipedia.org/wiki/Two\'s_complement\n""\n""Examples\n""--------\n""We\'ve seen that 13 is represented by ``00001101``.\n""The invert or bit-wise NOT of 13 is then:\n""\n"">>> x = np.invert(np.array(13, dtype=np.uint8))\n"">>> x\n""242\n"">>> np.binary_repr(x, width=8)\n""\'11110010\'\n""\n""The result depends on the bit-width:\n""\n"">>> x = np.invert(np.array(13, dtype=np.uint16))\n"">>> x\n""65522\n"">>> np.binary_repr(x, width=16)\n""\'1111111111110010\'\n""\n""When using signed integer types the result is the two\'s complement of\n""the result for the unsigned type:\n""\n"">>> np.invert(np.array([13], dtype=np.int8))\n""array([-14], dtype=int8)\n"">>> np.binary_repr(-14, width=8)\n""\'11110010\'\n""\n""Booleans are accepted as well:\n""\n"">>> np.invert(np.array([True, False]))\n""array([False,  True])\n""\n""The ``~`` operator can be used as a shorthand for ``np.invert`` on\n""ndarrays.\n""\n"">>> x1 = np.array([True, False])\n"">>> ~x1\n""array([False,  True])"
+#define DOC_NUMPY_CORE_UMATH_ISFINITE "Test element-wise for finiteness (not infinity and not Not a Number).\n""\n""The result is returned as a boolean array.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input values.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray, bool\n""    True where ``x`` is not positive infinity, negative infinity,\n""    or NaN; false otherwise.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""isinf, isneginf, isposinf, isnan\n""\n""Notes\n""-----\n""Not a Number, positive infinity and negative infinity are considered\n""to be non-finite.\n""\n""NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic\n""(IEEE 754). This means that Not a Number is not equivalent to infinity.\n""Also that positive infinity is not equivalent to negative infinity. But\n""infinity is equivalent to positive infinity.  
Errors result if the\n""second argument is also supplied when `x` is a scalar input, or if\n""first and second arguments have different shapes.\n""\n""Examples\n""--------\n"">>> np.isfinite(1)\n""True\n"">>> np.isfinite(0)\n""True\n"">>> np.isfinite(np.nan)\n""False\n"">>> np.isfinite(np.inf)\n""False\n"">>> np.isfinite(np.NINF)\n""False\n"">>> np.isfinite([np.log(-1.),1.,np.log(0)])\n""array([False,  True, False])\n""\n"">>> x = np.array([-np.inf, 0., np.inf])\n"">>> y = np.array([2, 2, 2])\n"">>> np.isfinite(x, y)\n""array([0, 1, 0])\n"">>> y\n""array([0, 1, 0])"
+#define DOC_NUMPY_CORE_UMATH_ISINF "Test element-wise for positive or negative infinity.\n""\n""Returns a boolean array of the same shape as `x`, True where ``x ==\n""+/-inf``, otherwise False.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input values\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : bool (scalar) or boolean ndarray\n""    True where ``x`` is positive or negative infinity, false otherwise.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""isneginf, isposinf, isnan, isfinite\n""\n""Notes\n""-----\n""NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic\n""(IEEE 754).\n""\n""Errors result if the second argument is supplied when the first\n""argument is a scalar, or if the first and second arguments have\n""different shapes.\n""\n""Examples\n""--------\n"">>> np.isinf(np.inf)\n""True\n"">>> np.isinf(np.nan)\n""False\n"">>> np.isinf(np.NINF)\n""True\n"">>> np.isinf([np.inf, -np.inf, 1.0, np.nan])\n""array([ True,  True, False, False])\n""\n"">>> x = np.array([-np.inf, 0., np.inf])\n"">>> y = np.array([2, 2, 2])\n"">>> np.isinf(x, y)\n""array([1, 
0, 1])\n"">>> y\n""array([1, 0, 1])"
+#define DOC_NUMPY_CORE_UMATH_ISNAN "Test element-wise for NaN and return result as a boolean array.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input array.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray or bool\n""    True where ``x`` is NaN, false otherwise.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""isinf, isneginf, isposinf, isfinite, isnat\n""\n""Notes\n""-----\n""NumPy uses the IEEE Standard for Binary Floating-Point for Arithmetic\n""(IEEE 754). This means that Not a Number is not equivalent to infinity.\n""\n""Examples\n""--------\n"">>> np.isnan(np.nan)\n""True\n"">>> np.isnan(np.inf)\n""False\n"">>> np.isnan([np.log(-1.),1.,np.log(0)])\n""array([ True, False, False])"
+#define DOC_NUMPY_CORE_UMATH_ISNAT "Test element-wise for NaT (not a time) and return result as a boolean array.\n""\n"".. versionadded:: 1.13.0\n""\n""Parameters\n""----------\n""x : array_like\n""    Input array with datetime or timedelta data type.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray or bool\n""    True where ``x`` is NaT, false otherwise.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""isnan, isinf, isneginf, isposinf, isfinite\n""\n""Examples\n""--------\n"">>> np.isnat(np.datetime64(\"NaT\"))\n""True\n"">>> np.isnat(np.datetime64(\"2016-01-01\"))\n""False\n"">>> np.isnat(np.array([\"NaT\", \"2016-01-01\"], dtype=\"datetime64[ns]\"))\n""array([ True, False])"
+#define DOC_NUMPY_CORE_UMATH_LEFT_SHIFT "Shift the bits of an integer to the left.\n""\n""Bits are shifted to the left by appending `x2` 0s at the right of `x1`.\n""Since the internal representation of numbers is in binary format, this\n""operation is equivalent to multiplying `x1` by ``2**x2``.\n""\n""Parameters\n""----------\n""x1 : array_like of integer type\n""    Input values.\n""x2 : array_like of integer type\n""    Number of zeros to append to `x1`. Has to be non-negative.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. 
At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : array of integer type\n""    Return `x1` with bits shifted `x2` times to the left.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""right_shift : Shift the bits of an integer to the right.\n""binary_repr : Return the binary representation of the input number\n""    as a string.\n""\n""Examples\n""--------\n"">>> np.binary_repr(5)\n""\'101\'\n"">>> np.left_shift(5, 2)\n""20\n"">>> np.binary_repr(20)\n""\'10100\'\n""\n"">>> np.left_shift(5, [1,2,3])\n""array([10, 20, 40])\n""\n""Note that the dtype of the second argument may change the dtype of the\n""result and can lead to unexpected results in some cases (see\n"":ref:`Casting Rules <ufuncs.casting>`):\n""\n"">>> a = np.left_shift(np.uint8(255), 1) # Expect 254\n"">>> print(a, type(a)) # Unexpected result due to upcasting\n""510 <class \'numpy.int64\'>\n"">>> b = np.left_shift(np.uint8(255), np.uint8(1))\n"">>> print(b, type(b))\n""254 <class \'numpy.uint8\'>\n""\n""The ``<<`` operator can be used as a shorthand for ``np.left_shift`` on\n""ndarrays.\n""\n"">>> x1 = 5\n"">>> x2 = np.array([1, 2, 3])\n"">>> x1 << x2\n""array([10, 20, 40])"
+#define DOC_NUMPY_CORE_UMATH_LESS "Return the truth value of (x1 < x2) element-wise.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Input arrays.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Output array, element-wise comparison of `x1` and `x2`.\n""    Typically of type bool, unless ``dtype=object`` is passed.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""greater, less_equal, greater_equal, equal, not_equal\n""\n""Examples\n""--------\n"">>> np.less([1, 2], [2, 2])\n""array([ True, False])\n""\n""The ``<`` operator can be used as a shorthand for ``np.less`` on ndarrays.\n""\n"">>> a = np.array([1, 2])\n"">>> b = np.array([2, 2])\n"">>> a < b\n""array([ True, False])"
+#define DOC_NUMPY_CORE_UMATH_LESS_EQUAL "Return the truth value of (x1 <= x2) element-wise.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Input arrays.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Output array, element-wise comparison of `x1` and `x2`.\n""    Typically of type bool, unless ``dtype=object`` is passed.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""greater, less, greater_equal, equal, not_equal\n""\n""Examples\n""--------\n"">>> np.less_equal([4, 2, 1], [2, 2, 2])\n""array([False,  True,  True])\n""\n""The ``<=`` operator can be used as a shorthand for ``np.less_equal`` on\n""ndarrays.\n""\n"">>> a = np.array([4, 2, 1])\n"">>> b = np.array([2, 2, 2])\n"">>> a <= b\n""array([False,  True,  True])"
+#define DOC_NUMPY_CORE_UMATH_LOG "Natural logarithm, element-wise.\n""\n""The natural logarithm `log` is the inverse of the exponential function,\n""so that `log(exp(x)) = x`. The natural logarithm is logarithm in base\n""`e`.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input value.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    The natural logarithm of `x`, element-wise.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""log10, log2, log1p, emath.log\n""\n""Notes\n""-----\n""Logarithm is a multivalued function: for each `x` there is an infinite\n""number of `z` such that `exp(z) = x`. The convention is to return the\n""`z` whose imaginary part lies in `(-pi, pi]`.\n""\n""For real-valued input data types, `log` always returns real output. For\n""each value that cannot be expressed as a real number or infinity, it\n""yields ``nan`` and sets the `invalid` floating point error flag.\n""\n""For complex-valued input, `log` is a complex analytical function that\n""has a branch cut `[-inf, 0]` and is continuous from above on it. 
`log`\n""handles the floating-point negative zero as an infinitesimal negative\n""number, conforming to the C99 standard.\n""\n""In the cases where the input has a negative real part and a very small\n""negative complex part (approaching 0), the result is so close to `-pi`\n""that it evaluates to exactly `-pi`.\n""\n""References\n""----------\n"".. [1] M. Abramowitz and I.A. Stegun, \"Handbook of Mathematical Functions\",\n""       10th printing, 1964, pp. 67.\n""       https://personal.math.ubc.ca/~cbm/aands/page_67.htm\n"".. [2] Wikipedia, \"Logarithm\". https://en.wikipedia.org/wiki/Logarithm\n""\n""Examples\n""--------\n"">>> np.log([1, np.e, np.e**2, 0])\n""array([  0.,   1.,   2., -inf])"
+#define DOC_NUMPY_CORE_UMATH_LOG10 "Return the base 10 logarithm of the input array, element-wise.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input values.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    The logarithm to the base 10 of `x`, element-wise. NaNs are\n""    returned where x is negative.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""emath.log10\n""\n""Notes\n""-----\n""Logarithm is a multivalued function: for each `x` there is an infinite\n""number of `z` such that `10**z = x`. 
The convention is to return the\n""`z` whose imaginary part lies in `(-pi, pi]`.\n""\n""For real-valued input data types, `log10` always returns real output.\n""For each value that cannot be expressed as a real number or infinity,\n""it yields ``nan`` and sets the `invalid` floating point error flag.\n""\n""For complex-valued input, `log10` is a complex analytical function that\n""has a branch cut `[-inf, 0]` and is continuous from above on it.\n""`log10` handles the floating-point negative zero as an infinitesimal\n""negative number, conforming to the C99 standard.\n""\n""In the cases where the input has a negative real part and a very small\n""negative complex part (approaching 0), the result is so close to `-pi`\n""that it evaluates to exactly `-pi`.\n""\n""References\n""----------\n"".. [1] M. Abramowitz and I.A. Stegun, \"Handbook of Mathematical Functions\",\n""       10th printing, 1964, pp. 67.\n""       https://personal.math.ubc.ca/~cbm/aands/page_67.htm\n"".. [2] Wikipedia, \"Logarithm\". https://en.wikipedia.org/wiki/Logarithm\n""\n""Examples\n""--------\n"">>> np.log10([1e-15, -3.])\n""array([-15.,  nan])"
+#define DOC_NUMPY_CORE_UMATH_LOG2 "Base-2 logarithm of `x`.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input values.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    Base-2 logarithm of `x`.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""log, log10, log1p, emath.log2\n""\n""Notes\n""-----\n"".. versionadded:: 1.3.0\n""\n""Logarithm is a multivalued function: for each `x` there is an infinite\n""number of `z` such that `2**z = x`. The convention is to return the `z`\n""whose imaginary part lies in `(-pi, pi]`.\n""\n""For real-valued input data types, `log2` always returns real output.\n""For each value that cannot be expressed as a real number or infinity,\n""it yields ``nan`` and sets the `invalid` floating point error flag.\n""\n""For complex-valued input, `log2` is a complex analytical function that\n""has a branch cut `[-inf, 0]` and is continuous from above on it. 
`log2`\n""handles the floating-point negative zero as an infinitesimal negative\n""number, conforming to the C99 standard.\n""\n""In the cases where the input has a negative real part and a very small\n""negative complex part (approaching 0), the result is so close to `-pi`\n""that it evaluates to exactly `-pi`.\n""\n""Examples\n""--------\n"">>> x = np.array([0, 1, 2, 2**4])\n"">>> np.log2(x)\n""array([-inf,   0.,   1.,   4.])\n""\n"">>> xi = np.array([0+1.j, 1, 2+0.j, 4.j])\n"">>> np.log2(xi)\n""array([ 0.+2.26618007j,  0.+0.j        ,  1.+0.j        ,  2.+2.26618007j])"
+#define DOC_NUMPY_CORE_UMATH_LOGADDEXP "Logarithm of the sum of exponentiations of the inputs.\n""\n""Calculates ``log(exp(x1) + exp(x2))``. This function is useful in\n""statistics where the calculated probabilities of events may be so small\n""as to exceed the range of normal floating point numbers.  In such cases\n""the logarithm of the calculated probability is stored. This function\n""allows adding probabilities stored in such a fashion.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Input values.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""result : ndarray\n""    Logarithm of ``exp(x1) + exp(x2)``.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""logaddexp2: Logarithm of the sum of exponentiations of inputs in base 2.\n""\n""Notes\n""-----\n"".. 
versionadded:: 1.3.0\n""\n""Examples\n""--------\n"">>> prob1 = np.log(1e-50)\n"">>> prob2 = np.log(2.5e-50)\n"">>> prob12 = np.logaddexp(prob1, prob2)\n"">>> prob12\n""-113.87649168120691\n"">>> np.exp(prob12)\n""3.5000000000000057e-50"
+#define DOC_NUMPY_CORE_UMATH_LOGADDEXP2 "Logarithm of the sum of exponentiations of the inputs in base-2.\n""\n""Calculates ``log2(2**x1 + 2**x2)``. This function is useful in machine\n""learning when the calculated probabilities of events may be so small as\n""to exceed the range of normal floating point numbers.  In such cases\n""the base-2 logarithm of the calculated probability can be used instead.\n""This function allows adding probabilities stored in such a fashion.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Input values.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""result : ndarray\n""    Base-2 logarithm of ``2**x1 + 2**x2``.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""logaddexp: Logarithm of the sum of exponentiations of the inputs.\n""\n""Notes\n""-----\n"".. 
versionadded:: 1.3.0\n""\n""Examples\n""--------\n"">>> prob1 = np.log2(1e-50)\n"">>> prob2 = np.log2(2.5e-50)\n"">>> prob12 = np.logaddexp2(prob1, prob2)\n"">>> prob1, prob2, prob12\n""(-166.09640474436813, -164.77447664948076, -164.28904982231052)\n"">>> 2**prob12\n""3.4999999999999914e-50"
+#define DOC_NUMPY_CORE_UMATH_LOG1P "Return the natural logarithm of one plus the input array, element-wise.\n""\n""Calculates ``log(1 + x)``.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input values.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    Natural logarithm of `1 + x`, element-wise.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""expm1 : ``exp(x) - 1``, the inverse of `log1p`.\n""\n""Notes\n""-----\n""For real-valued input, `log1p` is accurate also for `x` so small\n""that `1 + x == 1` in floating-point accuracy.\n""\n""Logarithm is a multivalued function: for each `x` there is an infinite\n""number of `z` such that `exp(z) = 1 + x`. 
The convention is to return\n""the `z` whose imaginary part lies in `[-pi, pi]`.\n""\n""For real-valued input data types, `log1p` always returns real output.\n""For each value that cannot be expressed as a real number or infinity,\n""it yields ``nan`` and sets the `invalid` floating point error flag.\n""\n""For complex-valued input, `log1p` is a complex analytical function that\n""has a branch cut `[-inf, -1]` and is continuous from above on it.\n""`log1p` handles the floating-point negative zero as an infinitesimal\n""negative number, conforming to the C99 standard.\n""\n""References\n""----------\n"".. [1] M. Abramowitz and I.A. Stegun, \"Handbook of Mathematical Functions\",\n""       10th printing, 1964, p. 67.\n""       https://personal.math.ubc.ca/~cbm/aands/page_67.htm\n"".. [2] Wikipedia, \"Logarithm\". https://en.wikipedia.org/wiki/Logarithm\n""\n""Examples\n""--------\n"">>> np.log1p(1e-99)\n""1e-99\n"">>> np.log(1 + 1e-99)\n""0.0"
+#define DOC_NUMPY_CORE_UMATH_LOGICAL_AND "Compute the truth value of x1 AND x2 element-wise.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Input arrays.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray or bool\n""    Boolean result of the logical AND operation applied to the elements\n""    of `x1` and `x2`; the shape is determined by broadcasting.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""logical_or, logical_not, logical_xor\n""bitwise_and\n""\n""Examples\n""--------\n"">>> np.logical_and(True, False)\n""False\n"">>> np.logical_and([True, False], [False, False])\n""array([False, False])\n""\n"">>> x = np.arange(5)\n"">>> np.logical_and(x>1, x<4)\n""array([False, False,  True,  True, False])\n""\n""\n""The ``&`` operator can be used as a shorthand for ``np.logical_and`` on\n""boolean ndarrays.\n""\n"">>> a = np.array([True, False])\n"">>> b = np.array([False, False])\n"">>> a & b\n""array([False, False])"
+#define DOC_NUMPY_CORE_UMATH_LOGICAL_NOT "Compute the truth value of NOT x element-wise.\n""\n""Parameters\n""----------\n""x : array_like\n""    Logical NOT is applied to the elements of `x`.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : bool or ndarray of bool\n""    Boolean result with the same shape as `x` of the NOT operation\n""    on elements of `x`.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""logical_and, logical_or, logical_xor\n""\n""Examples\n""--------\n"">>> np.logical_not(3)\n""False\n"">>> np.logical_not([True, False, 0, 1])\n""array([False,  True,  True, False])\n""\n"">>> x = np.arange(5)\n"">>> np.logical_not(x<3)\n""array([False, False, False,  True,  True])"
+#define DOC_NUMPY_CORE_UMATH_LOGICAL_OR "Compute the truth value of x1 OR x2 element-wise.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Logical OR is applied to the elements of `x1` and `x2`.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray or bool\n""    Boolean result of the logical OR operation applied to the elements\n""    of `x1` and `x2`; the shape is determined by broadcasting.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""logical_and, logical_not, logical_xor\n""bitwise_or\n""\n""Examples\n""--------\n"">>> np.logical_or(True, False)\n""True\n"">>> np.logical_or([True, False], [False, False])\n""array([ True, False])\n""\n"">>> x = np.arange(5)\n"">>> np.logical_or(x < 1, x > 3)\n""array([ True, False, False, False,  True])\n""\n""The ``|`` operator can be used as a shorthand for ``np.logical_or`` on\n""boolean ndarrays.\n""\n"">>> a = np.array([True, False])\n"">>> b = np.array([False, False])\n"">>> a | 
b\n""array([ True, False])"
+#define DOC_NUMPY_CORE_UMATH_LOGICAL_XOR "Compute the truth value of x1 XOR x2, element-wise.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Logical XOR is applied to the elements of `x1` and `x2`.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : bool or ndarray of bool\n""    Boolean result of the logical XOR operation applied to the elements\n""    of `x1` and `x2`; the shape is determined by broadcasting.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""logical_and, logical_or, logical_not, bitwise_xor\n""\n""Examples\n""--------\n"">>> np.logical_xor(True, False)\n""True\n"">>> np.logical_xor([True, True, False, False], [True, False, True, False])\n""array([False,  True,  True, False])\n""\n"">>> x = np.arange(5)\n"">>> np.logical_xor(x < 1, x > 3)\n""array([ True, False, False, False,  True])\n""\n""Simple example showing support of broadcasting\n""\n"">>> np.logical_xor(0, np.eye(2))\n""array([[ True, False],\n""       
[False,  True]])"
+#define DOC_NUMPY_CORE_UMATH_MAXIMUM "Element-wise maximum of array elements.\n""\n""Compare two arrays and return a new array containing the element-wise\n""maxima. If one of the elements being compared is a NaN, then that\n""element is returned. If both elements are NaNs then the first is\n""returned. The latter distinction is important for complex NaNs, which\n""are defined as at least one of the real or imaginary parts being a NaN.\n""The net effect is that NaNs are propagated.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    The arrays holding the elements to be compared.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. 
At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray or scalar\n""    The maximum of `x1` and `x2`, element-wise.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""minimum :\n""    Element-wise minimum of two arrays, propagates NaNs.\n""fmax :\n""    Element-wise maximum of two arrays, ignores NaNs.\n""amax :\n""    The maximum value of an array along a given axis, propagates NaNs.\n""nanmax :\n""    The maximum value of an array along a given axis, ignores NaNs.\n""\n""fmin, amin, nanmin\n""\n""Notes\n""-----\n""The maximum is equivalent to ``np.where(x1 >= x2, x1, x2)`` when\n""neither x1 nor x2 are NaNs, but it is faster and does proper\n""broadcasting.\n""\n""Examples\n""--------\n"">>> np.maximum([2, 3, 4], [1, 5, 2])\n""array([2, 5, 4])\n""\n"">>> np.maximum(np.eye(2), [0.5, 2]) # broadcasting\n""array([[ 1. ,  2. ],\n""       [ 0.5,  2. ]])\n""\n"">>> np.maximum([np.nan, 0, np.nan], [0, np.nan, np.nan])\n""array([nan, nan, nan])\n"">>> np.maximum(np.inf, 1)\n""inf"
+#define DOC_NUMPY_CORE_UMATH_MINIMUM "Element-wise minimum of array elements.\n""\n""Compare two arrays and return a new array containing the element-wise\n""minima. If one of the elements being compared is a NaN, then that\n""element is returned. If both elements are NaNs then the first is\n""returned. The latter distinction is important for complex NaNs, which\n""are defined as at least one of the real or imaginary parts being a NaN.\n""The net effect is that NaNs are propagated.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    The arrays holding the elements to be compared.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. 
At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray or scalar\n""    The minimum of `x1` and `x2`, element-wise.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""maximum :\n""    Element-wise maximum of two arrays, propagates NaNs.\n""fmin :\n""    Element-wise minimum of two arrays, ignores NaNs.\n""amin :\n""    The minimum value of an array along a given axis, propagates NaNs.\n""nanmin :\n""    The minimum value of an array along a given axis, ignores NaNs.\n""\n""fmax, amax, nanmax\n""\n""Notes\n""-----\n""The minimum is equivalent to ``np.where(x1 <= x2, x1, x2)`` when\n""neither x1 nor x2 are NaNs, but it is faster and does proper\n""broadcasting.\n""\n""Examples\n""--------\n"">>> np.minimum([2, 3, 4], [1, 5, 2])\n""array([1, 3, 2])\n""\n"">>> np.minimum(np.eye(2), [0.5, 2]) # broadcasting\n""array([[ 0.5,  0. ],\n""       [ 0. ,  1. ]])\n""\n"">>> np.minimum([np.nan, 0, np.nan],[0, np.nan, np.nan])\n""array([nan, nan, nan])\n"">>> np.minimum(-np.inf, 1)\n""-inf"
+#define DOC_NUMPY_CORE_UMATH_FMAX "Element-wise maximum of array elements.\n""\n""Compare two arrays and return a new array containing the element-wise\n""maxima. If one of the elements being compared is a NaN, then the\n""non-nan element is returned. If both elements are NaNs then the first\n""is returned.  The latter distinction is important for complex NaNs,\n""which are defined as at least one of the real or imaginary parts being\n""a NaN. The net effect is that NaNs are ignored when possible.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    The arrays holding the elements to be compared.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. 
At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray or scalar\n""    The maximum of `x1` and `x2`, element-wise.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""fmin :\n""    Element-wise minimum of two arrays, ignores NaNs.\n""maximum :\n""    Element-wise maximum of two arrays, propagates NaNs.\n""amax :\n""    The maximum value of an array along a given axis, propagates NaNs.\n""nanmax :\n""    The maximum value of an array along a given axis, ignores NaNs.\n""\n""minimum, amin, nanmin\n""\n""Notes\n""-----\n"".. versionadded:: 1.3.0\n""\n""The fmax is equivalent to ``np.where(x1 >= x2, x1, x2)`` when neither\n""x1 nor x2 are NaNs, but it is faster and does proper broadcasting.\n""\n""Examples\n""--------\n"">>> np.fmax([2, 3, 4], [1, 5, 2])\n""array([2, 5, 4])\n""\n"">>> np.fmax(np.eye(2), [0.5, 2])\n""array([[ 1. ,  2. ],\n""       [ 0.5,  2. ]])\n""\n"">>> np.fmax([np.nan, 0, np.nan],[0, np.nan, np.nan])\n""array([ 0.,  0., nan])"
+#define DOC_NUMPY_CORE_UMATH_FMIN "Element-wise minimum of array elements.\n""\n""Compare two arrays and return a new array containing the element-wise\n""minima. If one of the elements being compared is a NaN, then the\n""non-nan element is returned. If both elements are NaNs then the first\n""is returned.  The latter distinction is important for complex NaNs,\n""which are defined as at least one of the real or imaginary parts being\n""a NaN. The net effect is that NaNs are ignored when possible.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    The arrays holding the elements to be compared.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. 
At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray or scalar\n""    The minimum of `x1` and `x2`, element-wise.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""fmax :\n""    Element-wise maximum of two arrays, ignores NaNs.\n""minimum :\n""    Element-wise minimum of two arrays, propagates NaNs.\n""amin :\n""    The minimum value of an array along a given axis, propagates NaNs.\n""nanmin :\n""    The minimum value of an array along a given axis, ignores NaNs.\n""\n""maximum, amax, nanmax\n""\n""Notes\n""-----\n"".. versionadded:: 1.3.0\n""\n""The fmin is equivalent to ``np.where(x1 <= x2, x1, x2)`` when neither\n""x1 nor x2 are NaNs, but it is faster and does proper broadcasting.\n""\n""Examples\n""--------\n"">>> np.fmin([2, 3, 4], [1, 5, 2])\n""array([1, 3, 2])\n""\n"">>> np.fmin(np.eye(2), [0.5, 2])\n""array([[ 0.5,  0. ],\n""       [ 0. ,  1. ]])\n""\n"">>> np.fmin([np.nan, 0, np.nan],[0, np.nan, np.nan])\n""array([ 0.,  0., nan])"
+#define DOC_NUMPY_CORE_UMATH_CLIP "Clip (limit) the values in an array.\n""\n""Given an interval, values outside the interval are clipped to\n""the interval edges.  For example, if an interval of ``[0, 1]``\n""is specified, values smaller than 0 become 0, and values larger\n""than 1 become 1.\n""\n""Equivalent to but faster than ``np.minimum(np.maximum(a, a_min), a_max)``.\n""\n""Parameters\n""----------\n""a : array_like\n""    Array containing elements to clip.\n""a_min : array_like\n""    Minimum value.\n""a_max : array_like\n""    Maximum value.\n""out : ndarray, optional\n""    The results will be placed in this array. It may be the input\n""    array for in-place clipping.  `out` must be of the right shape\n""    to hold the output.  Its type is preserved.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. 
At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""See Also\n""--------\n""numpy.clip :\n""    Wrapper that makes the ``a_min`` and ``a_max`` arguments optional,\n""    dispatching to one of `~numpy.core.umath.clip`,\n""    `~numpy.core.umath.minimum`, and `~numpy.core.umath.maximum`.\n""\n""Returns\n""-------\n""clipped_array : ndarray\n""    An array with the elements of `a`, but where values\n""    < `a_min` are replaced with `a_min`, and those > `a_max`\n""    with `a_max`."
+#define DOC_NUMPY_CORE_UMATH_MATMUL "Matrix product of two arrays.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Input arrays, scalars not allowed.\n""out : ndarray, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that matches the signature `(n,k),(k,m)->(n,m)`. If not\n""    provided or None, a freshly-allocated array is returned.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""    .. versionadded:: 1.16\n""       Now handles ufunc kwargs\n""\n""Returns\n""-------\n""y : ndarray\n""    The matrix product of the inputs.\n""    This is a scalar only when both x1, x2 are 1-d vectors.\n""\n""Raises\n""------\n""ValueError\n""    If the last dimension of `x1` is not the same size as\n""    the second-to-last dimension of `x2`.\n""\n""    If a scalar value is passed in.\n""\n""See Also\n""--------\n""vdot : Complex-conjugating dot product.\n""tensordot : Sum products over arbitrary axes.\n""einsum : Einstein summation convention.\n""dot : alternative matrix product with different broadcasting rules.\n""\n""Notes\n""-----\n""\n""The behavior depends on the arguments in the following way.\n""\n""- If both arguments are 2-D they are multiplied like conventional\n""  matrices.\n""- If either argument is N-D, N > 2, it is treated as a stack of\n""  matrices residing in the last two indexes and broadcast accordingly.\n""- If the first argument is 1-D, it is promoted to a matrix by\n""  prepending a 1 to its dimensions. After matrix multiplication\n""  the prepended 1 is removed.\n""- If the second argument is 1-D, it is promoted to a matrix by\n""  appending a 1 to its dimensions. 
After matrix multiplication\n""  the appended 1 is removed.\n""\n""``matmul`` differs from ``dot`` in two important ways:\n""\n""- Multiplication by scalars is not allowed, use ``*`` instead.\n""- Stacks of matrices are broadcast together as if the matrices\n""  were elements, respecting the signature ``(n,k),(k,m)->(n,m)``:\n""\n""  >>> a = np.ones([9, 5, 7, 4])\n""  >>> c = np.ones([9, 5, 4, 3])\n""  >>> np.dot(a, c).shape\n""  (9, 5, 7, 9, 5, 3)\n""  >>> np.matmul(a, c).shape\n""  (9, 5, 7, 3)\n""  >>> # n is 7, k is 4, m is 3\n""\n""The matmul function implements the semantics of the ``@`` operator\n""introduced in Python 3.5 following :pep:`465`.\n""\n""It uses an optimized BLAS library when possible (see `numpy.linalg`).\n""\n""Examples\n""--------\n""For 2-D arrays it is the matrix product:\n""\n"">>> a = np.array([[1, 0],\n""...               [0, 1]])\n"">>> b = np.array([[4, 1],\n""...               [2, 2]])\n"">>> np.matmul(a, b)\n""array([[4, 1],\n""       [2, 2]])\n""\n""For 2-D mixed with 1-D, the result is the usual.\n""\n"">>> a = np.array([[1, 0],\n""...               
[0, 1]])\n"">>> b = np.array([1, 2])\n"">>> np.matmul(a, b)\n""array([1, 2])\n"">>> np.matmul(b, a)\n""array([1, 2])\n""\n""\n""Broadcasting is conventional for stacks of arrays\n""\n"">>> a = np.arange(2 * 2 * 4).reshape((2, 2, 4))\n"">>> b = np.arange(2 * 2 * 4).reshape((2, 4, 2))\n"">>> np.matmul(a,b).shape\n""(2, 2, 2)\n"">>> np.matmul(a, b)[0, 1, 1]\n""98\n"">>> sum(a[0, 1, :] * b[0 , :, 1])\n""98\n""\n""Vector, vector returns the scalar inner product, but neither argument\n""is complex-conjugated:\n""\n"">>> np.matmul([2j, 3j], [2j, 3j])\n""(-13+0j)\n""\n""Scalar multiplication raises an error.\n""\n"">>> np.matmul([1,2], 3)\n""Traceback (most recent call last):\n""...\n""ValueError: matmul: Input operand 1 does not have enough dimensions ...\n""\n""The ``@`` operator can be used as a shorthand for ``np.matmul`` on\n""ndarrays.\n""\n"">>> x1 = np.array([2j, 3j])\n"">>> x2 = np.array([2j, 3j])\n"">>> x1 @ x2\n""(-13+0j)\n""\n"".. versionadded:: 1.10.0"
+#define DOC_NUMPY_CORE_UMATH_MODF "Return the fractional and integral parts of an array, element-wise.\n""\n""The fractional and integral parts are negative if the given number is\n""negative.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input array.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y1 : ndarray\n""    Fractional part of `x`.\n""    This is a scalar if `x` is a scalar.\n""y2 : ndarray\n""    Integral part of `x`.\n""    This is a scalar if `x` is a scalar.\n""\n""Notes\n""-----\n""For integer input the return values are floats.\n""\n""See Also\n""--------\n""divmod : ``divmod(x, 1)`` is equivalent to ``modf`` with the return values\n""         switched, except it always has a positive remainder.\n""\n""Examples\n""--------\n"">>> np.modf([0, 3.5])\n""(array([ 0. ,  0.5]), array([ 0.,  3.]))\n"">>> np.modf(-0.5)\n""(-0.5, -0.0)"
+#define DOC_NUMPY_CORE_UMATH_MULTIPLY "Multiply arguments element-wise.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Input arrays to be multiplied.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    The product of `x1` and `x2`, element-wise.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""Notes\n""-----\n""Equivalent to `x1` * `x2` in terms of array broadcasting.\n""\n""Examples\n""--------\n"">>> np.multiply(2.0, 4.0)\n""8.0\n""\n"">>> x1 = np.arange(9.0).reshape((3, 3))\n"">>> x2 = np.arange(3.0)\n"">>> np.multiply(x1, x2)\n""array([[  0.,   1.,   4.],\n""       [  0.,   4.,  10.],\n""       [  0.,   7.,  16.]])\n""\n""The ``*`` operator can be used as a shorthand for ``np.multiply`` on\n""ndarrays.\n""\n"">>> x1 = np.arange(9.0).reshape((3, 3))\n"">>> x2 = np.arange(3.0)\n"">>> x1 * x2\n""array([[  0.,   1.,   4.],\n""       [  0.,   4.,  10.],\n""       [  0.,   7.,  16.]])"
+#define DOC_NUMPY_CORE_UMATH_NEGATIVE "Numerical negative, element-wise.\n""\n""Parameters\n""----------\n""x : array_like or scalar\n""    Input array.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray or scalar\n""    Returned array or scalar: `y = -x`.\n""    This is a scalar if `x` is a scalar.\n""\n""Examples\n""--------\n"">>> np.negative([1.,-1.])\n""array([-1.,  1.])\n""\n""The unary ``-`` operator can be used as a shorthand for ``np.negative`` on\n""ndarrays.\n""\n"">>> x1 = np.array(([1., -1.]))\n"">>> -x1\n""array([-1.,  1.])"
+#define DOC_NUMPY_CORE_UMATH_POSITIVE "Numerical positive, element-wise.\n""\n"".. versionadded:: 1.13.0\n""\n""Parameters\n""----------\n""x : array_like or scalar\n""    Input array.\n""\n""Returns\n""-------\n""y : ndarray or scalar\n""    Returned array or scalar: `y = +x`.\n""    This is a scalar if `x` is a scalar.\n""\n""Notes\n""-----\n""Equivalent to `x.copy()`, but only defined for types that support\n""arithmetic.\n""\n""Examples\n""--------\n""\n"">>> x1 = np.array(([1., -1.]))\n"">>> np.positive(x1)\n""array([ 1., -1.])\n""\n""The unary ``+`` operator can be used as a shorthand for ``np.positive`` on\n""ndarrays.\n""\n"">>> x1 = np.array(([1., -1.]))\n"">>> +x1\n""array([ 1., -1.])"
+#define DOC_NUMPY_CORE_UMATH_NOT_EQUAL "Return (x1 != x2) element-wise.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    Input arrays.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Output array, element-wise comparison of `x1` and `x2`.\n""    Typically of type bool, unless ``dtype=object`` is passed.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""equal, greater, greater_equal, less, less_equal\n""\n""Examples\n""--------\n"">>> np.not_equal([1.,2.], [1., 3.])\n""array([False,  True])\n"">>> np.not_equal([1, 2], [[1, 3],[1, 4]])\n""array([[False,  True],\n""       [False,  True]])\n""\n""The ``!=`` operator can be used as a shorthand for ``np.not_equal`` on\n""ndarrays.\n""\n"">>> a = np.array([1., 2.])\n"">>> b = np.array([1., 3.])\n"">>> a != b\n""array([False,  True])"
+#define DOC_NUMPY_CORE_UMATH__ONES_LIKE "This function used to be the numpy.ones_like, but now a specific\n""function for that has been written for consistency with the other\n""*_like functions. It is only used internally in a limited fashion now.\n""\n""See Also\n""--------\n""ones_like" /* Docstring text for the internal _ones_like helper (name taken from the macro); the text itself only points readers at ones_like. */
+#define DOC_NUMPY_CORE_UMATH_POWER "First array elements raised to powers from second array, element-wise.\n""\n""Raise each base in `x1` to the positionally-corresponding power in\n""`x2`.  `x1` and `x2` must be broadcastable to the same shape.\n""\n""An integer type raised to a negative integer power will raise a\n""``ValueError``.\n""\n""Negative values raised to a non-integral value will return ``nan``.\n""To get complex results, cast the input to complex, or specify the\n""``dtype`` to be ``complex`` (see the example below).\n""\n""Parameters\n""----------\n""x1 : array_like\n""    The bases.\n""x2 : array_like\n""    The exponents.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. 
At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    The bases in `x1` raised to the exponents in `x2`.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""float_power : power function that promotes integers to float\n""\n""Examples\n""--------\n""Cube each element in an array.\n""\n"">>> x1 = np.arange(6)\n"">>> x1\n""array([0, 1, 2, 3, 4, 5])\n"">>> np.power(x1, 3)\n""array([  0,   1,   8,  27,  64, 125])\n""\n""Raise the bases to different exponents.\n""\n"">>> x2 = [1.0, 2.0, 3.0, 3.0, 2.0, 1.0]\n"">>> np.power(x1, x2)\n""array([  0.,   1.,   8.,  27.,  16.,   5.])\n""\n""The effect of broadcasting.\n""\n"">>> x2 = np.array([[1, 2, 3, 3, 2, 1], [1, 2, 3, 3, 2, 1]])\n"">>> x2\n""array([[1, 2, 3, 3, 2, 1],\n""       [1, 2, 3, 3, 2, 1]])\n"">>> np.power(x1, x2)\n""array([[ 0,  1,  8, 27, 16,  5],\n""       [ 0,  1,  8, 27, 16,  5]])\n""\n""The ``**`` operator can be used as a shorthand for ``np.power`` on\n""ndarrays.\n""\n"">>> x2 = np.array([1, 2, 3, 3, 2, 1])\n"">>> x1 = np.arange(6)\n"">>> x1 ** x2\n""array([ 0,  1,  8, 27, 16,  5])\n""\n""Negative values raised to a non-integral value will result in ``nan``\n""(and a warning will be generated).\n""\n"">>> x3 = np.array([-1.0, -4.0])\n"">>> with np.errstate(invalid=\'ignore\'):\n""...     p = np.power(x3, 1.5)\n""...\n"">>> p\n""array([nan, nan])\n""\n""To get complex results, give the argument ``dtype=complex``.\n""\n"">>> np.power(x3, 1.5, dtype=complex)\n""array([-1.83697020e-16-1.j, -1.46957616e-15-8.j])" /* Docstring text for np.power (x1 ** x2, element-wise). The first example echoes x1 as an ndarray repr because x1 is built with np.arange. */
+#define DOC_NUMPY_CORE_UMATH_FLOAT_POWER "First array elements raised to powers from second array, element-wise.\n""\n""Raise each base in `x1` to the positionally-corresponding power in `x2`.\n""`x1` and `x2` must be broadcastable to the same shape. This differs from\n""the power function in that integers, float16, and float32  are promoted to\n""floats with a minimum precision of float64 so that the result is always\n""inexact.  The intent is that the function will return a usable result for\n""negative powers and seldom overflow for positive powers.\n""\n""Negative values raised to a non-integral value will return ``nan``.\n""To get complex results, cast the input to complex, or specify the\n""``dtype`` to be ``complex`` (see the example below).\n""\n"".. versionadded:: 1.12.0\n""\n""Parameters\n""----------\n""x1 : array_like\n""    The bases.\n""x2 : array_like\n""    The exponents.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. 
At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    The bases in `x1` raised to the exponents in `x2`.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""power : power function that preserves type\n""\n""Examples\n""--------\n""Cube each element in a list.\n""\n"">>> x1 = list(range(6))\n"">>> x1\n""[0, 1, 2, 3, 4, 5]\n"">>> np.float_power(x1, 3)\n""array([   0.,    1.,    8.,   27.,   64.,  125.])\n""\n""Raise the bases to different exponents.\n""\n"">>> x2 = [1.0, 2.0, 3.0, 3.0, 2.0, 1.0]\n"">>> np.float_power(x1, x2)\n""array([  0.,   1.,   8.,  27.,  16.,   5.])\n""\n""The effect of broadcasting.\n""\n"">>> x2 = np.array([[1, 2, 3, 3, 2, 1], [1, 2, 3, 3, 2, 1]])\n"">>> x2\n""array([[1, 2, 3, 3, 2, 1],\n""       [1, 2, 3, 3, 2, 1]])\n"">>> np.float_power(x1, x2)\n""array([[  0.,   1.,   8.,  27.,  16.,   5.],\n""       [  0.,   1.,   8.,  27.,  16.,   5.]])\n""\n""Negative values raised to a non-integral value will result in ``nan``\n""(and a warning will be generated).\n""\n"">>> x3 = np.array([-1, -4])\n"">>> with np.errstate(invalid=\'ignore\'):\n""...     p = np.float_power(x3, 1.5)\n""...\n"">>> p\n""array([nan, nan])\n""\n""To get complex results, give the argument ``dtype=complex``.\n""\n"">>> np.float_power(x3, 1.5, dtype=complex)\n""array([-1.83697020e-16-1.j, -1.46957616e-15-8.j])" /* Docstring text for np.float_power. The first example builds x1 with list(range(6)) so the echoed list output matches what Python 3 prints. */
+#define DOC_NUMPY_CORE_UMATH_RADIANS "Convert angles from degrees to radians.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input array in degrees.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    The corresponding radian values.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""deg2rad : equivalent function\n""\n""Examples\n""--------\n""Convert a degree array to radians\n""\n"">>> deg = np.arange(12.) * 30.\n"">>> np.radians(deg)\n""array([ 0.        ,  0.52359878,  1.04719755,  1.57079633,  2.0943951 ,\n""        2.61799388,  3.14159265,  3.66519143,  4.1887902 ,  4.71238898,\n""        5.23598776,  5.75958653])\n""\n"">>> out = np.zeros((deg.shape))\n"">>> ret = np.radians(deg, out)\n"">>> ret is out\n""True" /* Docstring text for np.radians (degrees to radians); its See Also section names deg2rad as the equivalent function. */
+#define DOC_NUMPY_CORE_UMATH_DEG2RAD "Convert angles from degrees to radians.\n""\n""Parameters\n""----------\n""x : array_like\n""    Angles in degrees.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    The corresponding angle in radians.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""rad2deg : Convert angles from radians to degrees.\n""unwrap : Remove large jumps in angle by wrapping.\n""\n""Notes\n""-----\n"".. versionadded:: 1.3.0\n""\n""``deg2rad(x)`` is ``x * pi / 180``.\n""\n""Examples\n""--------\n"">>> np.deg2rad(180)\n""3.141592653589793" /* Docstring text for np.deg2rad (x * pi / 180). The example output uses the shortest round-trip float repr. */
+#define DOC_NUMPY_CORE_UMATH_RECIPROCAL "Return the reciprocal of the argument, element-wise.\n""\n""Calculates ``1/x``.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input array.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    Return array.\n""    This is a scalar if `x` is a scalar.\n""\n""Notes\n""-----\n"".. note::\n""    This function is not designed to work with integers.\n""\n""For integer arguments with absolute value larger than 1 the result is\n""always zero because of the way Python handles integer division.  For\n""integer zero the result is an overflow.\n""\n""Examples\n""--------\n"">>> np.reciprocal(2.)\n""0.5\n"">>> np.reciprocal([1, 2., 3.33])\n""array([ 1.       ,  0.5      ,  0.3003003])" /* Docstring text for np.reciprocal (1/x, element-wise); its Notes section warns that the function is not designed for integer input. */
+#define DOC_NUMPY_CORE_UMATH_REMAINDER "Returns the element-wise remainder of division.\n""\n""Computes the remainder complementary to the `floor_divide` function.  It is\n""equivalent to the Python modulus operator ``x1 % x2`` and has the same sign\n""as the divisor `x2`. The MATLAB function equivalent to ``np.remainder``\n""is ``mod``.\n""\n"".. warning::\n""\n""    This should not be confused with:\n""\n""    * Python 3.7\'s `math.remainder` and C\'s ``remainder``, which\n""      compute the IEEE remainder, which is the complement to\n""      ``round(x1 / x2)``.\n""    * The MATLAB ``rem`` function or the C ``%`` operator, which is the\n""      complement to ``int(x1 / x2)``.\n""\n""Parameters\n""----------\n""x1 : array_like\n""    Dividend array.\n""x2 : array_like\n""    Divisor array.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. 
At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    The element-wise remainder of the quotient ``floor_divide(x1, x2)``.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""floor_divide : Equivalent of Python ``//`` operator.\n""divmod : Simultaneous floor division and remainder.\n""fmod : Equivalent of the MATLAB ``rem`` function.\n""divide, floor\n""\n""Notes\n""-----\n""Returns 0 when `x2` is 0 and both `x1` and `x2` are (arrays of)\n""integers.\n""``mod`` is an alias of ``remainder``.\n""\n""Examples\n""--------\n"">>> np.remainder([4, 7], [2, 3])\n""array([0, 1])\n"">>> np.remainder(np.arange(7), 5)\n""array([0, 1, 2, 3, 4, 0, 1])\n""\n""The ``%`` operator can be used as a shorthand for ``np.remainder`` on\n""ndarrays.\n""\n"">>> x1 = np.arange(7)\n"">>> x1 % 5\n""array([0, 1, 2, 3, 4, 0, 1])" /* Docstring text for np.remainder (floor-division remainder; per its Notes, mod is an alias). The space before ``x1 % x2`` is required so reST parses it as an inline literal. */
+#define DOC_NUMPY_CORE_UMATH_DIVMOD "Return element-wise quotient and remainder simultaneously.\n""\n"".. versionadded:: 1.13.0\n""\n""``np.divmod(x, y)`` is equivalent to ``(x // y, x % y)``, but faster\n""because it avoids redundant work. It is used to implement the Python\n""built-in function ``divmod`` on NumPy arrays.\n""\n""Parameters\n""----------\n""x1 : array_like\n""    Dividend array.\n""x2 : array_like\n""    Divisor array.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. 
At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out1 : ndarray\n""    Element-wise quotient resulting from floor division.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""out2 : ndarray\n""    Element-wise remainder from floor division.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""floor_divide : Equivalent to Python\'s ``//`` operator.\n""remainder : Equivalent to Python\'s ``%`` operator.\n""modf : Equivalent to ``divmod(x, 1)`` for positive ``x`` with the return\n""       values switched.\n""\n""Examples\n""--------\n"">>> np.divmod(np.arange(5), 3)\n""(array([0, 0, 0, 1, 1]), array([0, 1, 2, 0, 1]))\n""\n""The `divmod` function can be used as a shorthand for ``np.divmod`` on\n""ndarrays.\n""\n"">>> x = np.arange(5)\n"">>> divmod(x, 3)\n""(array([0, 0, 0, 1, 1]), array([0, 1, 2, 0, 1]))" /* Docstring text for np.divmod; the Returns section documents two outputs (quotient, then remainder). */
+#define DOC_NUMPY_CORE_UMATH_RIGHT_SHIFT "Shift the bits of an integer to the right.\n""\n""Bits are shifted to the right `x2`.  Because the internal\n""representation of numbers is in binary format, this operation is\n""equivalent to dividing `x1` by ``2**x2``.\n""\n""Parameters\n""----------\n""x1 : array_like, int\n""    Input values.\n""x2 : array_like, int\n""    Number of bits to remove at the right of `x1`.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray, int\n""    Return `x1` with bits shifted `x2` times to the right.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""left_shift : Shift the bits of an integer to the left.\n""binary_repr : Return the binary representation of the input number\n""    as a string.\n""\n""Examples\n""--------\n"">>> np.binary_repr(10)\n""\'1010\'\n"">>> np.right_shift(10, 1)\n""5\n"">>> np.binary_repr(5)\n""\'101\'\n""\n"">>> np.right_shift(10, [1,2,3])\n""array([5, 2, 1])\n""\n""The ``>>`` 
operator can be used as a shorthand for ``np.right_shift`` on\n""ndarrays.\n""\n"">>> x1 = 10\n"">>> x2 = np.array([1,2,3])\n"">>> x1 >> x2\n""array([5, 2, 1])" /* Docstring text for np.right_shift (x1 >> x2 on integers). */
+#define DOC_NUMPY_CORE_UMATH_RINT "Round elements of the array to the nearest integer.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input array.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Output array is same shape and type as `x`.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""fix, ceil, floor, trunc\n""\n""Notes\n""-----\n""For values exactly halfway between rounded decimal values, NumPy\n""rounds to the nearest even value. Thus 1.5 and 2.5 round to 2.0,\n""-0.5 and 0.5 round to 0.0, etc.\n""\n""Examples\n""--------\n"">>> a = np.array([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0])\n"">>> np.rint(a)\n""array([-2., -2., -0.,  0.,  2.,  2.,  2.])" /* Docstring text for np.rint; its Notes section states that halfway values round to the nearest even value. */
+#define DOC_NUMPY_CORE_UMATH_SIGN "Returns an element-wise indication of the sign of a number.\n""\n""The `sign` function returns ``-1 if x < 0, 0 if x==0, 1 if x > 0``.  nan\n""is returned for nan inputs.\n""\n""For complex inputs, the `sign` function returns\n""``sign(x.real) + 0j if x.real != 0 else sign(x.imag) + 0j``.\n""\n""complex(nan, 0) is returned for complex nan inputs.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input values.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    The sign of `x`.\n""    This is a scalar if `x` is a scalar.\n""\n""Notes\n""-----\n""There is more than one definition of sign in common use for complex\n""numbers.  The definition used here is equivalent to :math:`x/\\sqrt{x*x}`\n""which is different from a common alternative, :math:`x/|x|`.\n""\n""Examples\n""--------\n"">>> np.sign([-5., 4.5])\n""array([-1.,  1.])\n"">>> np.sign(0)\n""0\n"">>> np.sign(5-2j)\n""(1+0j)" /* Docstring text for np.sign. The doubled backslash in the :math: role is a C escape that yields a single backslash in the resulting string. */
+#define DOC_NUMPY_CORE_UMATH_SIGNBIT "Returns element-wise True where signbit is set (less than zero).\n""\n""Parameters\n""----------\n""x : array_like\n""    The input value(s).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""result : ndarray of bool\n""    Output array, or reference to `out` if that was supplied.\n""    This is a scalar if `x` is a scalar.\n""\n""Examples\n""--------\n"">>> np.signbit(-1.2)\n""True\n"">>> np.signbit(np.array([1, -2.3, 2.1]))\n""array([False,  True, False])" /* Docstring text for np.signbit (boolean result, True where the sign bit is set). */
+#define DOC_NUMPY_CORE_UMATH_COPYSIGN "Change the sign of x1 to that of x2, element-wise.\n""\n""If `x2` is a scalar, its sign will be copied to all elements of `x1`.\n""\n""Parameters\n""----------\n""x1 : array_like\n""    Values to change the sign of.\n""x2 : array_like\n""    The sign of `x2` is copied to `x1`.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    The values of `x1` with the sign of `x2`.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""Examples\n""--------\n"">>> np.copysign(1.3, -1)\n""-1.3\n"">>> 1/np.copysign(0, 1)\n""inf\n"">>> 1/np.copysign(0, -1)\n""-inf\n""\n"">>> np.copysign([-1, 0, 1], -1.1)\n""array([-1., -0., -1.])\n"">>> np.copysign([-1, 0, 1], np.arange(3)-1)\n""array([-1.,  0.,  1.])" /* Docstring text for np.copysign (magnitude taken from x1, sign taken from x2). */
+#define DOC_NUMPY_CORE_UMATH_NEXTAFTER "Return the next floating-point value after x1 towards x2, element-wise.\n""\n""Parameters\n""----------\n""x1 : array_like\n""    Values to find the next representable value of.\n""x2 : array_like\n""    The direction where to look for the next representable value of `x1`.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    The next representable values of `x1` in the direction of `x2`.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""Examples\n""--------\n"">>> eps = np.finfo(np.float64).eps\n"">>> np.nextafter(1, 2) == eps + 1\n""True\n"">>> np.nextafter([1, 2], [2, 1]) == [eps + 1, 2 - eps]\n""array([ True,  True])" /* Docstring text for np.nextafter (next representable float after x1 in the direction of x2). */
+#define DOC_NUMPY_CORE_UMATH_SPACING "Return the distance between x and the nearest adjacent number.\n""\n""Parameters\n""----------\n""x : array_like\n""    Values to find the spacing of.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    The spacing of values of `x`.\n""    This is a scalar if `x` is a scalar.\n""\n""Notes\n""-----\n""It can be considered as a generalization of EPS:\n""``spacing(np.float64(1)) == np.finfo(np.float64).eps``, and there\n""should not be any representable number between ``x + spacing(x)`` and\n""x for any finite x.\n""\n""Spacing of +- inf and NaN is NaN.\n""\n""Examples\n""--------\n"">>> np.spacing(1) == np.finfo(np.float64).eps\n""True" /* Docstring text for np.spacing (distance from x to the nearest adjacent representable number). */
+#define DOC_NUMPY_CORE_UMATH_SIN "Trigonometric sine, element-wise.\n""\n""Parameters\n""----------\n""x : array_like\n""    Angle, in radians (:math:`2 \\pi` rad equals 360 degrees).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : array_like\n""    The sine of each element of x.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""arcsin, sinh, cos\n""\n""Notes\n""-----\n""The sine is one of the fundamental functions of trigonometry (the\n""mathematical study of triangles).  Consider a circle of radius 1\n""centered on the origin.  A ray comes in from the :math:`+x` axis, makes\n""an angle at the origin (measured counter-clockwise from that axis), and\n""departs from the origin.  The :math:`y` coordinate of the outgoing\n""ray\'s intersection with the unit circle is the sine of that angle.  
It\n""ranges from -1 for :math:`x=3\\pi / 2` to +1 for :math:`\\pi / 2.`  The\n""function has zeroes where the angle is a multiple of :math:`\\pi`.\n""Sines of angles between :math:`\\pi` and :math:`2\\pi` are negative.\n""The numerous properties of the sine and related functions are included\n""in any standard trigonometry text.\n""\n""Examples\n""--------\n""Print sine of one angle:\n""\n"">>> np.sin(np.pi/2.)\n""1.0\n""\n""Print sines of an array of angles given in degrees:\n""\n"">>> np.sin(np.array((0., 30., 45., 60., 90.)) * np.pi / 180. )\n""array([ 0.        ,  0.5       ,  0.70710678,  0.8660254 ,  1.        ])\n""\n""Plot the sine function:\n""\n"">>> import matplotlib.pylab as plt\n"">>> x = np.linspace(-np.pi, np.pi, 201)\n"">>> plt.plot(x, np.sin(x))\n"">>> plt.xlabel(\'Angle [rad]\')\n"">>> plt.ylabel(\'sin(x)\')\n"">>> plt.axis(\'tight\')\n"">>> plt.show()" /* Docstring text for np.sin (input in radians); the final example imports matplotlib to plot the curve. */
+#define DOC_NUMPY_CORE_UMATH_SINH "Hyperbolic sine, element-wise.\n""\n""Equivalent to ``1/2 * (np.exp(x) - np.exp(-x))`` or\n""``-1j * np.sin(1j*x)``.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input array.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    The corresponding hyperbolic sine values.\n""    This is a scalar if `x` is a scalar.\n""\n""Notes\n""-----\n""If `out` is provided, the function writes the result into it,\n""and returns a reference to `out`.  (See Examples)\n""\n""References\n""----------\n""M. Abramowitz and I. A. Stegun, Handbook of Mathematical Functions.\n""New York, NY: Dover, 1972, pg. 
83.\n""\n""Examples\n""--------\n"">>> np.sinh(0)\n""0.0\n"">>> np.sinh(np.pi*1j/2)\n""1j\n"">>> np.sinh(np.pi*1j) # (exact value is 0)\n""1.2246063538223773e-016j\n"">>> # Discrepancy due to vagaries of floating point arithmetic.\n""\n"">>> # Example of providing the optional output parameter\n"">>> out1 = np.array([0], dtype=\'d\')\n"">>> out2 = np.sinh([0.1], out1)\n"">>> out2 is out1\n""True\n""\n"">>> # Example of ValueError due to provision of shape mis-matched `out`\n"">>> np.sinh(np.zeros((3,3)),np.zeros((2,2)))\n""Traceback (most recent call last):\n""  File \"<stdin>\", line 1, in <module>\n""ValueError: operands could not be broadcast together with shapes (3,3) (2,2)" /* Docstring text for np.sinh. NOTE(review): the three-digit exponent in the 1.2246063538223773e-016j example output looks platform-specific; confirm against a current interpreter. */
+#define DOC_NUMPY_CORE_UMATH_SQRT "Return the non-negative square-root of an array, element-wise.\n""\n""Parameters\n""----------\n""x : array_like\n""    The values whose square-roots are required.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    An array of the same shape as `x`, containing the positive\n""    square-root of each element in `x`.  If any element in `x` is\n""    complex, a complex array is returned (and the square-roots of\n""    negative reals are calculated).  
If all of the elements in `x`\n""    are real, so is `y`, with negative elements returning ``nan``.\n""    If `out` was provided, `y` is a reference to it.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""emath.sqrt\n""    A version which returns complex numbers when given negative reals.\n""    Note that 0.0 and -0.0 are handled differently for complex inputs.\n""\n""Notes\n""-----\n""*sqrt* has--consistent with common convention--as its branch cut the\n""real \"interval\" [`-inf`, 0), and is continuous from above on it.\n""A branch cut is a curve in the complex plane across which a given\n""complex function fails to be continuous.\n""\n""Examples\n""--------\n"">>> np.sqrt([1,4,9])\n""array([ 1.,  2.,  3.])\n""\n"">>> np.sqrt([4, -1, -3+4J])\n""array([ 2.+0.j,  0.+1.j,  1.+2.j])\n""\n"">>> np.sqrt([4, -1, np.inf])\n""array([ 2., nan, inf])"
+#define DOC_NUMPY_CORE_UMATH_CBRT "Return the cube-root of an array, element-wise.\n""\n"".. versionadded:: 1.10.0\n""\n""Parameters\n""----------\n""x : array_like\n""    The values whose cube-roots are required.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    An array of the same shape as `x`, containing the\n""    cube-root of each element in `x`.\n""    If `out` was provided, `y` is a reference to it.\n""    This is a scalar if `x` is a scalar.\n""\n""\n""Examples\n""--------\n"">>> np.cbrt([1,8,27])\n""array([ 1.,  2.,  3.])"
+#define DOC_NUMPY_CORE_UMATH_SQUARE "Return the element-wise square of the input.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input data.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""out : ndarray or scalar\n""    Element-wise `x*x`, of the same shape and dtype as `x`.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""numpy.linalg.matrix_power\n""sqrt\n""power\n""\n""Examples\n""--------\n"">>> np.square([-1j, 1])\n""array([-1.-0.j,  1.+0.j])"
+#define DOC_NUMPY_CORE_UMATH_SUBTRACT "Subtract arguments, element-wise.\n""\n""Parameters\n""----------\n""x1, x2 : array_like\n""    The arrays to be subtracted from each other.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    The difference of `x1` and `x2`, element-wise.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""Notes\n""-----\n""Equivalent to ``x1 - x2`` in terms of array broadcasting.\n""\n""Examples\n""--------\n"">>> np.subtract(1.0, 4.0)\n""-3.0\n""\n"">>> x1 = np.arange(9.0).reshape((3, 3))\n"">>> x2 = np.arange(3.0)\n"">>> np.subtract(x1, x2)\n""array([[ 0.,  0.,  0.],\n""       [ 3.,  3.,  3.],\n""       [ 6.,  6.,  6.]])\n""\n""The ``-`` operator can be used as a shorthand for ``np.subtract`` on\n""ndarrays.\n""\n"">>> x1 = np.arange(9.0).reshape((3, 3))\n"">>> x2 = np.arange(3.0)\n"">>> x1 - x2\n""array([[0., 0., 0.],\n""       [3., 3., 3.],\n""       [6., 6., 6.]])"
+#define DOC_NUMPY_CORE_UMATH_TAN "Compute tangent element-wise.\n""\n""Equivalent to ``np.sin(x)/np.cos(x)`` element-wise.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input array.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    The corresponding tangent values.\n""    This is a scalar if `x` is a scalar.\n""\n""Notes\n""-----\n""If `out` is provided, the function writes the result into it,\n""and returns a reference to `out`.  (See Examples)\n""\n""References\n""----------\n""M. Abramowitz and I. A. 
Stegun, Handbook of Mathematical Functions.\n""New York, NY: Dover, 1972.\n""\n""Examples\n""--------\n"">>> from math import pi\n"">>> np.tan(np.array([-pi,pi/2,pi]))\n""array([  1.22460635e-16,   1.63317787e+16,  -1.22460635e-16])\n"">>>\n"">>> # Example of providing the optional output parameter illustrating\n"">>> # that what is returned is a reference to said parameter\n"">>> out1 = np.array([0], dtype=\'d\')\n"">>> out2 = np.tan([0.1], out1)\n"">>> out2 is out1\n""True\n"">>>\n"">>> # Example of ValueError due to provision of shape mis-matched `out`\n"">>> np.tan(np.zeros((3,3)),np.zeros((2,2)))\n""Traceback (most recent call last):\n""  File \"<stdin>\", line 1, in <module>\n""ValueError: operands could not be broadcast together with shapes (3,3) (2,2)"
+#define DOC_NUMPY_CORE_UMATH_TANH "Compute hyperbolic tangent element-wise.\n""\n""Equivalent to ``np.sinh(x)/np.cosh(x)`` or ``-1j * np.tan(1j*x)``.\n""\n""Parameters\n""----------\n""x : array_like\n""    Input array.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray\n""    The corresponding hyperbolic tangent values.\n""    This is a scalar if `x` is a scalar.\n""\n""Notes\n""-----\n""If `out` is provided, the function writes the result into it,\n""and returns a reference to `out`.  (See Examples)\n""\n""References\n""----------\n"".. [1] M. Abramowitz and I. A. Stegun, Handbook of Mathematical Functions.\n""       New York, NY: Dover, 1972, pg. 83.\n""       https://personal.math.ubc.ca/~cbm/aands/page_83.htm\n""\n"".. [2] Wikipedia, \"Hyperbolic function\",\n""       https://en.wikipedia.org/wiki/Hyperbolic_function\n""\n""Examples\n""--------\n"">>> np.tanh((0, np.pi*1j, np.pi*1j/2))\n""array([ 0. +0.00000000e+00j,  0. -1.22460635e-16j,  0. 
+1.63317787e+16j])\n""\n"">>> # Example of providing the optional output parameter illustrating\n"">>> # that what is returned is a reference to said parameter\n"">>> out1 = np.array([0], dtype=\'d\')\n"">>> out2 = np.tanh([0.1], out1)\n"">>> out2 is out1\n""True\n""\n"">>> # Example of ValueError due to provision of shape mis-matched `out`\n"">>> np.tanh(np.zeros((3,3)),np.zeros((2,2)))\n""Traceback (most recent call last):\n""  File \"<stdin>\", line 1, in <module>\n""ValueError: operands could not be broadcast together with shapes (3,3) (2,2)"
+#define DOC_NUMPY_CORE_UMATH_FREXP "Decompose the elements of x into mantissa and twos exponent.\n""\n""Returns (`mantissa`, `exponent`), where ``x = mantissa * 2**exponent``.\n""The mantissa lies in the open interval(-1, 1), while the twos\n""exponent is a signed integer.\n""\n""Parameters\n""----------\n""x : array_like\n""    Array of numbers to be decomposed.\n""out1 : ndarray, optional\n""    Output array for the mantissa. Must have the same shape as `x`.\n""out2 : ndarray, optional\n""    Output array for the exponent. Must have the same shape as `x`.\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""mantissa : ndarray\n""    Floating values between -1 and 1.\n""    This is a scalar if `x` is a scalar.\n""exponent : ndarray\n""    Integer exponents of 2.\n""    This is a scalar if `x` is a scalar.\n""\n""See Also\n""--------\n""ldexp : Compute ``y = x1 * 2**x2``, the inverse of `frexp`.\n""\n""Notes\n""-----\n""Complex dtypes are not supported, they will raise a TypeError.\n""\n""Examples\n""--------\n"">>> x = np.arange(9)\n"">>> y1, y2 = np.frexp(x)\n"">>> y1\n""array([ 0.   
,  0.5  ,  0.5  ,  0.75 ,  0.5  ,  0.625,  0.75 ,  0.875,\n""        0.5  ])\n"">>> y2\n""array([0, 1, 2, 2, 3, 3, 3, 3, 4])\n"">>> y1 * 2**y2\n""array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.])"
+#define DOC_NUMPY_CORE_UMATH_LDEXP "Returns x1 * 2**x2, element-wise.\n""\n""The mantissas `x1` and twos exponents `x2` are used to construct\n""floating point numbers ``x1 * 2**x2``.\n""\n""Parameters\n""----------\n""x1 : array_like\n""    Array of multipliers.\n""x2 : array_like, int\n""    Array of twos exponents.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""out : ndarray, None, or tuple of ndarray and None, optional\n""    A location into which the result is stored. If provided, it must have\n""    a shape that the inputs broadcast to. If not provided or None,\n""    a freshly-allocated array is returned. A tuple (possible only as a\n""    keyword argument) must have length equal to the number of outputs.\n""where : array_like, optional\n""    This condition is broadcast over the input. At locations where the\n""    condition is True, the `out` array will be set to the ufunc result.\n""    Elsewhere, the `out` array will retain its original value.\n""    Note that if an uninitialized `out` array is created via the default\n""    ``out=None``, locations within it where the condition is False will\n""    remain uninitialized.\n""**kwargs\n""    For other keyword-only arguments, see the\n""    :ref:`ufunc docs <ufuncs.kwargs>`.\n""\n""Returns\n""-------\n""y : ndarray or scalar\n""    The result of ``x1 * 2**x2``.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""frexp : Return (y1, y2) from ``x = y1 * 2**y2``, inverse to `ldexp`.\n""\n""Notes\n""-----\n""Complex dtypes are not supported, they will raise a TypeError.\n""\n""`ldexp` is useful as the inverse of `frexp`, if used by itself it is\n""more clear to simply use the expression ``x1 * 2**x2``.\n""\n""Examples\n""--------\n"">>> np.ldexp(5, np.arange(4))\n""array([ 5., 10., 20., 40.], dtype=float16)\n""\n"">>> x = np.arange(6)\n"">>> np.ldexp(*np.frexp(x))\n""array([ 0.,  1.,  2.,  
3.,  4.,  5.])"
+#define DOC_NUMPY_CORE_UMATH_GCD "Returns the greatest common divisor of ``|x1|`` and ``|x2|``.\n""\n""Parameters\n""----------\n""x1, x2 : array_like, int\n""    Arrays of values.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""\n""Returns\n""-------\n""y : ndarray or scalar\n""    The greatest common divisor of the absolute value of the inputs.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""lcm : The lowest common multiple\n""\n""Examples\n""--------\n"">>> np.gcd(12, 20)\n""4\n"">>> np.gcd.reduce([15, 25, 35])\n""5\n"">>> np.gcd(np.arange(6), 20)\n""array([20,  1,  2,  1,  4,  5])"
+#define DOC_NUMPY_CORE_UMATH_LCM "Returns the lowest common multiple of ``|x1|`` and ``|x2|``.\n""\n""Parameters\n""----------\n""x1, x2 : array_like, int\n""    Arrays of values.\n""    If ``x1.shape != x2.shape``, they must be broadcastable to a common\n""    shape (which becomes the shape of the output).\n""\n""Returns\n""-------\n""y : ndarray or scalar\n""    The lowest common multiple of the absolute value of the inputs.\n""    This is a scalar if both `x1` and `x2` are scalars.\n""\n""See Also\n""--------\n""gcd : The greatest common divisor\n""\n""Examples\n""--------\n"">>> np.lcm(12, 20)\n""60\n"">>> np.lcm.reduce([3, 12, 20])\n""60\n"">>> np.lcm.reduce([40, 12, 20])\n""120\n"">>> np.lcm(np.arange(6), 20)\n""array([ 0, 20, 20, 60, 20, 20])"
+#endif //NUMPY_CORE_INCLUDE__UMATH_DOC_GENERATED_H
diff --git a/nanvix-port/generated-headers/arraytypes.h b/nanvix-port/generated-headers/arraytypes.h
new file mode 100644
index 000000000000..322ac3dc7b32
--- /dev/null
+++ b/nanvix-port/generated-headers/arraytypes.h
@@ -0,0 +1,748 @@
+#line 1 "numpy/core/src/multiarray/arraytypes.h.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+#ifndef NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_
+#define NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_
+
+#include "common.h"
+
+NPY_NO_EXPORT int
+set_typeinfo(PyObject *dict);
+
+/* needed for blasfuncs */
+NPY_NO_EXPORT void
+FLOAT_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+CFLOAT_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+DOUBLE_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+CDOUBLE_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+
+/* for _pyarray_correlate */
+NPY_NO_EXPORT int
+small_correlate(const char * d_, npy_intp dstride,
+                npy_intp nd, enum NPY_TYPES dtype,
+                const char * k_, npy_intp kstride,
+                npy_intp nk, enum NPY_TYPES ktype,
+                char * out_, npy_intp ostride);
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+BYTE_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+UBYTE_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+SHORT_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+USHORT_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+INT_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+UINT_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+LONG_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+ULONG_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+LONGLONG_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+ULONGLONG_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+HALF_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+FLOAT_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+DOUBLE_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+LONGDOUBLE_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+CFLOAT_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+CDOUBLE_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+CLONGDOUBLE_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "argfunc.dispatch.h"
+#endif
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int BYTE_argmax,
+    (npy_byte *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int BYTE_argmin,
+    (npy_byte *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int UBYTE_argmax,
+    (npy_ubyte *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int UBYTE_argmin,
+    (npy_ubyte *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int SHORT_argmax,
+    (npy_short *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int SHORT_argmin,
+    (npy_short *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int USHORT_argmax,
+    (npy_ushort *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int USHORT_argmin,
+    (npy_ushort *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int INT_argmax,
+    (npy_int *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int INT_argmin,
+    (npy_int *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int UINT_argmax,
+    (npy_uint *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int UINT_argmin,
+    (npy_uint *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int LONG_argmax,
+    (npy_long *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int LONG_argmin,
+    (npy_long *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int ULONG_argmax,
+    (npy_ulong *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int ULONG_argmin,
+    (npy_ulong *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int LONGLONG_argmax,
+    (npy_longlong *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int LONGLONG_argmin,
+    (npy_longlong *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int ULONGLONG_argmax,
+    (npy_ulonglong *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int ULONGLONG_argmin,
+    (npy_ulonglong *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int FLOAT_argmax,
+    (npy_float *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int FLOAT_argmin,
+    (npy_float *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int DOUBLE_argmax,
+    (npy_double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int DOUBLE_argmin,
+    (npy_double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int LONGDOUBLE_argmax,
+    (npy_longdouble *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int LONGDOUBLE_argmin,
+    (npy_longdouble *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int BOOL_argmax,
+    (npy_bool *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+/*
+ * Define DType and scalar type names and aliases as used in Python.
+ */
+
+#line 91
+#define NPY_BOOL_name "bool_"
+#define NPY_BOOL_Name "Bool"
+
+
+#line 91
+#define NPY_HALF_name "float16"
+#define NPY_HALF_Name "Float16"
+
+
+#line 91
+#define NPY_FLOAT_name "float32"
+#define NPY_FLOAT_Name "Float32"
+
+
+#line 91
+#define NPY_DOUBLE_name "float64"
+#define NPY_DOUBLE_Name "Float64"
+
+
+#line 91
+#define NPY_LONGDOUBLE_name "longdouble"
+#define NPY_LONGDOUBLE_Name "LongDouble"
+
+
+#line 91
+#define NPY_CFLOAT_name "complex64"
+#define NPY_CFLOAT_Name "Complex64"
+
+
+#line 91
+#define NPY_CDOUBLE_name "complex128"
+#define NPY_CDOUBLE_Name "Complex128"
+
+
+#line 91
+#define NPY_CLONGDOUBLE_name "clongdouble"
+#define NPY_CLONGDOUBLE_Name "CLongDouble"
+
+
+#line 91
+#define NPY_STRING_name "bytes_"
+#define NPY_STRING_Name "Bytes"
+
+
+#line 91
+#define NPY_UNICODE_name "str_"
+#define NPY_UNICODE_Name "Str"
+
+
+#line 91
+#define NPY_VOID_name "void"
+#define NPY_VOID_Name "Void"
+
+
+#line 91
+#define NPY_OBJECT_name "object_"
+#define NPY_OBJECT_Name "Object"
+
+
+#line 91
+#define NPY_DATETIME_name "datetime64"
+#define NPY_DATETIME_Name "DateTime64"
+
+
+#line 91
+#define NPY_TIMEDELTA_name "timedelta64"
+#define NPY_TIMEDELTA_Name "TimeDelta64"
+
+
+
+
+/*
+ * Give integers different names when they are the same size (gh-9799).
+ * `intX` always refers to the first int of that size in the sequence
+ * `['LONG', 'LONGLONG', 'INT', 'SHORT', 'BYTE']`.
+ * Unfortunately, since the bitsize names are not strictly fixed, we add
+ * the C name for all integer types (as aliases).
+ *
+ * Right now, we do not define the C aliases for floats (which are always
+ * the same).
+ */
+
+#if NPY_SIZEOF_BYTE == NPY_SIZEOF_SHORT
+    #define BYTE_not_size_named
+#endif
+#if NPY_SIZEOF_SHORT == NPY_SIZEOF_INT
+    #define SHORT_not_size_named
+#endif
+#if NPY_SIZEOF_INT == NPY_SIZEOF_LONG
+    #define INT_not_size_named
+#endif
+#if NPY_SIZEOF_LONGLONG == NPY_SIZEOF_LONG
+    #define LONGLONG_not_size_named
+#endif
+
+
+#line 133
+
+#ifdef BYTE_not_size_named
+    #define NPY_BYTE_name "byte"
+    #define NPY_BYTE_Name "Byte"
+#else
+    /* The C-name is considered just an alias for these: */
+    #define NPY_BYTE_alias "byte"
+    #define NPY_BYTE_Alias "Byte"
+
+    /* The bitsof macro includes math, so cannot be stringified */
+    #if NPY_BITSOF_BYTE == 8
+        #define NPY_BYTE_name "int8"
+        #define NPY_BYTE_Name "Int8"
+    #elif NPY_BITSOF_BYTE == 16
+        #define NPY_BYTE_name "int16"
+        #define NPY_BYTE_Name "Int16"
+    #elif NPY_BITSOF_BYTE == 32
+        #define NPY_BYTE_name "int32"
+        #define NPY_BYTE_Name "Int32"
+    #elif NPY_BITSOF_BYTE == 64
+        #define NPY_BYTE_name "int64"
+        #define NPY_BYTE_Name "Int64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+
+#ifdef SHORT_not_size_named
+    #define NPY_SHORT_name "short"
+    #define NPY_SHORT_Name "Short"
+#else
+    /* The C-name is considered just an alias for these: */
+    #define NPY_SHORT_alias "short"
+    #define NPY_SHORT_Alias "Short"
+
+    /* The bitsof macro includes math, so cannot be stringified */
+    #if NPY_BITSOF_SHORT == 8
+        #define NPY_SHORT_name "int8"
+        #define NPY_SHORT_Name "Int8"
+    #elif NPY_BITSOF_SHORT == 16
+        #define NPY_SHORT_name "int16"
+        #define NPY_SHORT_Name "Int16"
+    #elif NPY_BITSOF_SHORT == 32
+        #define NPY_SHORT_name "int32"
+        #define NPY_SHORT_Name "Int32"
+    #elif NPY_BITSOF_SHORT == 64
+        #define NPY_SHORT_name "int64"
+        #define NPY_SHORT_Name "Int64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* NPY_INT names: the C name ("intc") if INT_not_size_named, else a sized name picked by NPY_BITSOF_INT. */
+#ifdef INT_not_size_named
+    #define NPY_INT_name "intc"
+    #define NPY_INT_Name "Int"
+#else
+    /* The C name is kept only as an alias of the sized name chosen below: */
+    #define NPY_INT_alias "intc"
+    #define NPY_INT_Alias "Int"
+
+    /* NPY_BITSOF_INT involves arithmetic, so it cannot be stringified; test each width instead */
+    #if NPY_BITSOF_INT == 8
+        #define NPY_INT_name "int8"
+        #define NPY_INT_Name "Int8"
+    #elif NPY_BITSOF_INT == 16
+        #define NPY_INT_name "int16"
+        #define NPY_INT_Name "Int16"
+    #elif NPY_BITSOF_INT == 32
+        #define NPY_INT_name "int32"
+        #define NPY_INT_Name "Int32"
+    #elif NPY_BITSOF_INT == 64
+        #define NPY_INT_name "int64"
+        #define NPY_INT_Name "Int64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* NPY_LONG names: the C name ("int_") if LONG_not_size_named, else a sized name picked by NPY_BITSOF_LONG. */
+#ifdef LONG_not_size_named
+    #define NPY_LONG_name "int_"
+    #define NPY_LONG_Name "Long"
+#else
+    /* The C name is kept only as an alias of the sized name chosen below: */
+    #define NPY_LONG_alias "int_"
+    #define NPY_LONG_Alias "Long"
+
+    /* NPY_BITSOF_LONG involves arithmetic, so it cannot be stringified; test each width instead */
+    #if NPY_BITSOF_LONG == 8
+        #define NPY_LONG_name "int8"
+        #define NPY_LONG_Name "Int8"
+    #elif NPY_BITSOF_LONG == 16
+        #define NPY_LONG_name "int16"
+        #define NPY_LONG_Name "Int16"
+    #elif NPY_BITSOF_LONG == 32
+        #define NPY_LONG_name "int32"
+        #define NPY_LONG_Name "Int32"
+    #elif NPY_BITSOF_LONG == 64
+        #define NPY_LONG_name "int64"
+        #define NPY_LONG_Name "Int64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* NPY_LONGLONG names: the C name if LONGLONG_not_size_named, else a sized name picked by NPY_BITSOF_LONGLONG. */
+#ifdef LONGLONG_not_size_named
+    #define NPY_LONGLONG_name "longlong"
+    #define NPY_LONGLONG_Name "LongLong"
+#else
+    /* The C name is kept only as an alias of the sized name chosen below: */
+    #define NPY_LONGLONG_alias "longlong"
+    #define NPY_LONGLONG_Alias "LongLong"
+
+    /* NPY_BITSOF_LONGLONG involves arithmetic, so it cannot be stringified; test each width instead */
+    #if NPY_BITSOF_LONGLONG == 8
+        #define NPY_LONGLONG_name "int8"
+        #define NPY_LONGLONG_Name "Int8"
+    #elif NPY_BITSOF_LONGLONG == 16
+        #define NPY_LONGLONG_name "int16"
+        #define NPY_LONGLONG_Name "Int16"
+    #elif NPY_BITSOF_LONGLONG == 32
+        #define NPY_LONGLONG_name "int32"
+        #define NPY_LONGLONG_Name "Int32"
+    #elif NPY_BITSOF_LONGLONG == 64
+        #define NPY_LONGLONG_name "int64"
+        #define NPY_LONGLONG_Name "Int64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* NPY_UBYTE names: keyed on the signed type's BYTE_not_size_named flag and NPY_BITSOF_BYTE width. */
+#ifdef BYTE_not_size_named
+    #define NPY_UBYTE_name "ubyte"
+    #define NPY_UBYTE_Name "UByte"
+#else
+    /* The C name is kept only as an alias of the sized name chosen below: */
+    #define NPY_UBYTE_alias "ubyte"
+    #define NPY_UBYTE_Alias "UByte"
+
+    /* NPY_BITSOF_BYTE involves arithmetic, so it cannot be stringified; test each width instead */
+    #if NPY_BITSOF_BYTE == 8
+        #define NPY_UBYTE_name "uint8"
+        #define NPY_UBYTE_Name "UInt8"
+    #elif NPY_BITSOF_BYTE == 16
+        #define NPY_UBYTE_name "uint16"
+        #define NPY_UBYTE_Name "UInt16"
+    #elif NPY_BITSOF_BYTE == 32
+        #define NPY_UBYTE_name "uint32"
+        #define NPY_UBYTE_Name "UInt32"
+    #elif NPY_BITSOF_BYTE == 64
+        #define NPY_UBYTE_name "uint64"
+        #define NPY_UBYTE_Name "UInt64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* NPY_USHORT names: keyed on the signed type's SHORT_not_size_named flag and NPY_BITSOF_SHORT width. */
+#ifdef SHORT_not_size_named
+    #define NPY_USHORT_name "ushort"
+    #define NPY_USHORT_Name "UShort"
+#else
+    /* The C name is kept only as an alias of the sized name chosen below: */
+    #define NPY_USHORT_alias "ushort"
+    #define NPY_USHORT_Alias "UShort"
+
+    /* NPY_BITSOF_SHORT involves arithmetic, so it cannot be stringified; test each width instead */
+    #if NPY_BITSOF_SHORT == 8
+        #define NPY_USHORT_name "uint8"
+        #define NPY_USHORT_Name "UInt8"
+    #elif NPY_BITSOF_SHORT == 16
+        #define NPY_USHORT_name "uint16"
+        #define NPY_USHORT_Name "UInt16"
+    #elif NPY_BITSOF_SHORT == 32
+        #define NPY_USHORT_name "uint32"
+        #define NPY_USHORT_Name "UInt32"
+    #elif NPY_BITSOF_SHORT == 64
+        #define NPY_USHORT_name "uint64"
+        #define NPY_USHORT_Name "UInt64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* NPY_UINT names: keyed on the signed type's INT_not_size_named flag and NPY_BITSOF_INT width. */
+#ifdef INT_not_size_named
+    #define NPY_UINT_name "uintc"
+    #define NPY_UINT_Name "UInt"
+#else
+    /* The C name is kept only as an alias of the sized name chosen below: */
+    #define NPY_UINT_alias "uintc"
+    #define NPY_UINT_Alias "UInt"
+
+    /* NPY_BITSOF_INT involves arithmetic, so it cannot be stringified; test each width instead */
+    #if NPY_BITSOF_INT == 8
+        #define NPY_UINT_name "uint8"
+        #define NPY_UINT_Name "UInt8"
+    #elif NPY_BITSOF_INT == 16
+        #define NPY_UINT_name "uint16"
+        #define NPY_UINT_Name "UInt16"
+    #elif NPY_BITSOF_INT == 32
+        #define NPY_UINT_name "uint32"
+        #define NPY_UINT_Name "UInt32"
+    #elif NPY_BITSOF_INT == 64
+        #define NPY_UINT_name "uint64"
+        #define NPY_UINT_Name "UInt64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* NPY_ULONG names: keyed on the signed type's LONG_not_size_named flag and NPY_BITSOF_LONG width. */
+#ifdef LONG_not_size_named
+    #define NPY_ULONG_name "uint"
+    #define NPY_ULONG_Name "ULong"
+#else
+    /* The C name is kept only as an alias of the sized name chosen below: */
+    #define NPY_ULONG_alias "uint"
+    #define NPY_ULONG_Alias "ULong"
+
+    /* NPY_BITSOF_LONG involves arithmetic, so it cannot be stringified; test each width instead */
+    #if NPY_BITSOF_LONG == 8
+        #define NPY_ULONG_name "uint8"
+        #define NPY_ULONG_Name "UInt8"
+    #elif NPY_BITSOF_LONG == 16
+        #define NPY_ULONG_name "uint16"
+        #define NPY_ULONG_Name "UInt16"
+    #elif NPY_BITSOF_LONG == 32
+        #define NPY_ULONG_name "uint32"
+        #define NPY_ULONG_Name "UInt32"
+    #elif NPY_BITSOF_LONG == 64
+        #define NPY_ULONG_name "uint64"
+        #define NPY_ULONG_Name "UInt64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* NPY_ULONGLONG names: keyed on the signed type's LONGLONG_not_size_named flag and NPY_BITSOF_LONGLONG width. */
+#ifdef LONGLONG_not_size_named
+    #define NPY_ULONGLONG_name "ulonglong"
+    #define NPY_ULONGLONG_Name "ULongLong"
+#else
+    /* The C name is kept only as an alias of the sized name chosen below: */
+    #define NPY_ULONGLONG_alias "ulonglong"
+    #define NPY_ULONGLONG_Alias "ULongLong"
+
+    /* NPY_BITSOF_LONGLONG involves arithmetic, so it cannot be stringified; test each width instead */
+    #if NPY_BITSOF_LONGLONG == 8
+        #define NPY_ULONGLONG_name "uint8"
+        #define NPY_ULONGLONG_Name "UInt8"
+    #elif NPY_BITSOF_LONGLONG == 16
+        #define NPY_ULONGLONG_name "uint16"
+        #define NPY_ULONGLONG_Name "UInt16"
+    #elif NPY_BITSOF_LONGLONG == 32
+        #define NPY_ULONGLONG_name "uint32"
+        #define NPY_ULONGLONG_Name "UInt32"
+    #elif NPY_BITSOF_LONGLONG == 64
+        #define NPY_ULONGLONG_name "uint64"
+        #define NPY_ULONGLONG_Name "UInt64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+/* Drop the helper flags. NOTE(review): LONG_not_size_named is tested above but not undefined here -- confirm it is never defined. */
+#undef BYTE_not_size_named
+#undef SHORT_not_size_named
+#undef INT_not_size_named
+#undef LONGLONG_not_size_named
+
+#endif  /* NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_ */
+
diff --git a/nanvix-port/generated-headers/config.h b/nanvix-port/generated-headers/config.h
new file mode 100644
index 000000000000..375d991a9f55
--- /dev/null
+++ b/nanvix-port/generated-headers/config.h
@@ -0,0 +1,126 @@
+/* config.h -- Generated for Nanvix (i686, 32-bit, no SIMD, no threading) */
+/* Refuse direct inclusion: the guard below is presumably defined by npy_config.h before it includes this file. */
+#ifndef NUMPY_CORE_SRC_COMMON_NPY_CONFIG_H_
+#error config.h should never be included directly, include npy_config.h instead
+#endif
+
+/* Type sizes in bytes (i686-nanvix-gcc) */
+#define SIZEOF_PY_INTPTR_T 4
+#define SIZEOF_OFF_T       8
+#define SIZEOF_PY_LONG_LONG 8
+
+/* Library functions and compiler features; a commented-out #undef marks one as unavailable */
+/* #undef HAVE_BACKTRACE */
+/* #undef HAVE_MADVISE */
+#define HAVE_FTELLO 1
+#define HAVE_FSEEKO 1
+/* #undef HAVE_FALLOCATE */
+#define HAVE_STRTOLD_L 1
+#define HAVE__THREAD 1 /* NOTE(review): file header says "no threading" -- confirm __thread is usable */
+/* #undef HAVE___DECLSPEC_THREAD_ */
+
+/* Optional headers; a commented-out #undef marks a header as absent */
+/* #undef HAVE_FEATURES_H */
+/* #undef HAVE_XLOCALE_H */
+#define HAVE_DLFCN_H 1
+/* #undef HAVE_EXECINFO_H */
+/* #undef HAVE_LIBUNWIND_H */
+#define HAVE_SYS_MMAN_H 1
+/* No SIMD headers on i686-nanvix */
+/* #undef HAVE_XMMINTRIN_H */
+/* #undef HAVE_EMMINTRIN_H */
+/* #undef HAVE_IMMINTRIN_H */
+
+/* GCC builtins: every __builtin_* listed here is marked as available */
+#define HAVE___BUILTIN_ISNAN 1
+#define HAVE___BUILTIN_ISINF 1
+#define HAVE___BUILTIN_ISFINITE 1
+#define HAVE___BUILTIN_BSWAP32 1
+#define HAVE___BUILTIN_BSWAP64 1
+#define HAVE___BUILTIN_EXPECT 1
+#define HAVE___BUILTIN_MUL_OVERFLOW 1
+#define HAVE___BUILTIN_PREFETCH 1
+
+/* GCC function attributes: optimize (unroll-loops, O3, O2) and nonnull are marked as available */
+#define HAVE_ATTRIBUTE_OPTIMIZE_UNROLL_LOOPS 1
+#define HAVE_ATTRIBUTE_OPTIMIZE_OPT_3 1
+#define HAVE_ATTRIBUTE_OPTIMIZE_OPT_2 1
+#define HAVE_ATTRIBUTE_NONNULL 1
+/* NOTE(review): every complex function below, including the long double (l) variants, is marked present -- verify against the Nanvix libm. */
+/* C99 complex support: <complex.h> plus the double, float (f) and long double (l) functions */
+#define HAVE_COMPLEX_H 1
+#define HAVE_CABS 1
+#define HAVE_CACOS 1
+#define HAVE_CACOSH 1
+#define HAVE_CARG 1
+#define HAVE_CASIN 1
+#define HAVE_CASINH 1
+#define HAVE_CATAN 1
+#define HAVE_CATANH 1
+#define HAVE_CEXP 1
+#define HAVE_CLOG 1
+#define HAVE_CPOW 1
+#define HAVE_CSQRT 1
+#define HAVE_CABSF 1
+#define HAVE_CACOSF 1
+#define HAVE_CACOSHF 1
+#define HAVE_CARGF 1
+#define HAVE_CASINF 1
+#define HAVE_CASINHF 1
+#define HAVE_CATANF 1
+#define HAVE_CATANHF 1
+#define HAVE_CEXPF 1
+#define HAVE_CLOGF 1
+#define HAVE_CPOWF 1
+#define HAVE_CSQRTF 1
+#define HAVE_CABSL 1
+#define HAVE_CACOSL 1
+#define HAVE_CACOSHL 1
+#define HAVE_CARGL 1
+#define HAVE_CASINL 1
+#define HAVE_CASINHL 1
+#define HAVE_CATANL 1
+#define HAVE_CATANHL 1
+#define HAVE_CEXPL 1
+#define HAVE_CLOGL 1
+#define HAVE_CPOWL 1
+#define HAVE_CSQRTL 1
+#define HAVE_CSINF 1
+#define HAVE_CSINHF 1
+#define HAVE_CCOSF 1
+#define HAVE_CCOSHF 1
+#define HAVE_CTANF 1
+#define HAVE_CTANHF 1
+#define HAVE_CSIN 1
+#define HAVE_CSINH 1
+#define HAVE_CCOS 1
+#define HAVE_CCOSH 1
+#define HAVE_CTAN 1
+#define HAVE_CTANH 1
+#define HAVE_CSINL 1
+#define HAVE_CSINHL 1
+#define HAVE_CCOSL 1
+#define HAVE_CCOSHL 1
+#define HAVE_CTANL 1
+#define HAVE_CTANHL 1
+
+/* No SVML: NPY_CAN_LINK_SVML is left undefined */
+/* #undef NPY_CAN_LINK_SVML */
+
+/* Relaxed-strides debugging is off: the macro is defined to 0 rather than left undefined */
+#define NPY_RELAXED_STRIDES_DEBUG 0
+
+/* Long double: Intel extended 80-bit in 12 bytes, little-endian (the only representation flag defined) */
+#define HAVE_LDOUBLE_INTEL_EXTENDED_12_BYTES_LE 1
+/* #undef HAVE_LDOUBLE_INTEL_EXTENDED_16_BYTES_LE */
+/* #undef HAVE_LDOUBLE_MOTOROLA_EXTENDED_12_BYTES_BE */
+/* #undef HAVE_LDOUBLE_IEEE_DOUBLE_LE */
+/* #undef HAVE_LDOUBLE_IEEE_DOUBLE_BE */
+/* #undef HAVE_LDOUBLE_IEEE_QUAD_LE */
+/* #undef HAVE_LDOUBLE_IEEE_QUAD_BE */
+/* #undef HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_LE */
+/* #undef HAVE_LDOUBLE_IBM_DOUBLE_DOUBLE_BE */
+/* For C (not C++) translation units `inline` is left as-is: the override below stays commented out. */
+#ifndef __cplusplus
+/* #undef inline */
+#endif
diff --git a/nanvix-port/generated-headers/loops.h b/nanvix-port/generated-headers/loops.h
new file mode 100644
index 000000000000..6df7c4c634e4
--- /dev/null
+++ b/nanvix-port/generated-headers/loops.h
@@ -0,0 +1,4985 @@
+#line 1 "numpy/core/src/umath/loops.h.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/* -*- c -*- */
+/*
+ * vim:syntax=c
+ */
+
+#ifndef _NPY_UMATH_LOOPS_H_
+#define _NPY_UMATH_LOOPS_H_
+/* Default NPY_NO_EXPORT to NPY_VISIBILITY_HIDDEN when nothing has defined it yet. */
+#ifndef NPY_NO_EXPORT
+    #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
+#endif
+
+/*
+ *****************************************************************************
+ **                             BOOLEAN LOOPS                               **
+ *****************************************************************************
+ */
+
+/*
+ * The following functions are defined by the umath generator
+ * to enable runtime dispatching without the need
+ * to redefine them within dispatch-able sources.
+ */
+// #define BOOL_invert BOOL_logical_not
+// #define BOOL_add BOOL_logical_or
+// #define BOOL_bitwise_and BOOL_logical_and
+// #define BOOL_bitwise_or BOOL_logical_or
+// #define BOOL_logical_xor BOOL_not_equal
+// #define BOOL_bitwise_xor BOOL_logical_xor
+// #define BOOL_multiply BOOL_logical_and
+// #define BOOL_maximum BOOL_logical_or
+// #define BOOL_minimum BOOL_logical_and
+// #define BOOL_fmax BOOL_maximum
+// #define BOOL_fmin BOOL_minimum
+/* Forward declarations of the struct types named by the *_indexed prototypes in this header. */
+typedef struct PyArrayMethod_Context_tag PyArrayMethod_Context;
+typedef struct NpyAuxData_tag NpyAuxData;
+/* BOOL comparison loops via NPY_CPU_DISPATCH_DECLARE; the dispatch header is skipped when NPY_DISABLE_OPTIMIZATION is defined. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_comparison.dispatch.h"
+#endif
+
+#line 46
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 46
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 46
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 46
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 46
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 46
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* BOOL logical_and / logical_or / logical_not / absolute loops via NPY_CPU_DISPATCH_DECLARE. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_logical.dispatch.h"
+#endif
+
+#line 57
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_logical_and,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 57
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_logical_or,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 57
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_logical_not,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 57
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_absolute,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+/* Plain (not dispatch-declared) BOOL prototypes: _ones_like plus isnan / isinf / isfinite. */
+NPY_NO_EXPORT void
+BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+#line 67
+NPY_NO_EXPORT void
+BOOL_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 67
+NPY_NO_EXPORT void
+BOOL_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 67
+NPY_NO_EXPORT void
+BOOL_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+/* NOTE(review): BOOL_isnan/isinf/isfinite also have plain prototypes earlier in this header -- confirm the repeat is intended. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_autovec.dispatch.h"
+#endif
+#line 78
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 78
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 78
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+/*
+ *****************************************************************************
+ **                           INTEGER LOOPS                                 **
+ *****************************************************************************
+ */
+/* Integer divide loops (dispatch header skipped when NPY_DISABLE_OPTIMIZATION is defined); UBYTE loop and its _indexed variant first. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_arithmetic.dispatch.h"
+#endif
+
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+UBYTE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+/* USHORT divide: dispatch-declared loop plus the plain USHORT_divide_indexed prototype (returns int). */
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+USHORT_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+/* UINT divide: dispatch-declared loop plus the plain UINT_divide_indexed prototype (returns int). */
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+UINT_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+/* ULONG divide: dispatch-declared loop plus the plain ULONG_divide_indexed prototype (returns int). */
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+ULONG_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+/* ULONGLONG divide: dispatch-declared loop plus the plain ULONGLONG_divide_indexed prototype (returns int). */
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+ULONGLONG_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+/* BYTE divide: dispatch-declared loop plus the plain BYTE_divide_indexed prototype (returns int). */
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+BYTE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+/* SHORT divide: dispatch-declared loop plus the plain SHORT_divide_indexed prototype (returns int). */
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+SHORT_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+/* INT divide: dispatch-declared loop plus the plain INT_divide_indexed prototype (returns int). */
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+INT_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+/* LONG divide: dispatch-declared loop plus the plain LONG_divide_indexed prototype (returns int). */
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+LONG_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+/* LONGLONG divide: dispatch-declared loop plus the plain LONGLONG_divide_indexed prototype (returns int). */
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+LONGLONG_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+
+/* Integer divmod / fmod / remainder loops (dispatch header skipped when NPY_DISABLE_OPTIMIZATION is defined); UBYTE first. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_modulo.dispatch.h"
+#endif
+
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* USHORT divmod / fmod / remainder loops, declared via NPY_CPU_DISPATCH_DECLARE. */
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* UINT divmod / fmod / remainder loops, declared via NPY_CPU_DISPATCH_DECLARE. */
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* ULONG divmod / fmod / remainder loops, declared via NPY_CPU_DISPATCH_DECLARE. */
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* ULONGLONG divmod / fmod / remainder loops, declared via NPY_CPU_DISPATCH_DECLARE. */
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* BYTE divmod / fmod / remainder loops, declared via NPY_CPU_DISPATCH_DECLARE. */
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* SHORT divmod / fmod / remainder loops, declared via NPY_CPU_DISPATCH_DECLARE. */
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* INT divmod / fmod / remainder loops, declared via NPY_CPU_DISPATCH_DECLARE. */
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* LONG divmod / fmod / remainder loops, declared via NPY_CPU_DISPATCH_DECLARE. */
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* LONGLONG divmod / fmod / remainder loops, declared via NPY_CPU_DISPATCH_DECLARE. */
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+/* Integer comparison loops (dispatch header skipped when NPY_DISABLE_OPTIMIZATION is defined); UBYTE first. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_comparison.dispatch.h"
+#endif
+
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* USHORT comparison loops (equal, not_equal, greater, greater_equal, less, less_equal). */
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* UINT comparison loops (equal, not_equal, greater, greater_equal, less, less_equal). */
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* ULONG comparison loops (equal, not_equal, greater, greater_equal, less, less_equal). */
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* ULONGLONG comparison loops (equal, not_equal, greater, greater_equal, less, less_equal). */
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* BYTE comparison loops (equal, not_equal, greater, greater_equal, less, less_equal). */
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* SHORT comparison loops (equal, not_equal, greater, greater_equal, less, less_equal). */
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* INT comparison loops (equal, not_equal, greater, greater_equal, less, less_equal). */
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/* LONG comparisons declared through NPY_CPU_DISPATCH_DECLARE: */
+/* equal, not_equal, greater, greater_equal, less, less_equal. */
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/* LONGLONG comparisons declared through NPY_CPU_DISPATCH_DECLARE: */
+/* equal, not_equal, greater, greater_equal, less, less_equal. */
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/* loops_autovec.dispatch.h is included unless NPY_DISABLE_OPTIMIZATION is defined. */
+/* NOTE(review): presumably it configures NPY_CPU_DISPATCH_DECLARE for what follows - confirm. */
+/* UBYTE_* functions (invert ... sign) follow the include, declared through NPY_CPU_DISPATCH_DECLARE; */
+/* each takes (args, dimensions, steps, func), with func wrapped in NPY_UNUSED. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_autovec.dispatch.h"
+#endif
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/* USHORT_* functions (invert ... sign) declared through NPY_CPU_DISPATCH_DECLARE; */
+/* each takes (args, dimensions, steps, func), with func wrapped in NPY_UNUSED. */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/* UINT_* functions (invert ... sign) declared through NPY_CPU_DISPATCH_DECLARE; */
+/* each takes (args, dimensions, steps, func), with func wrapped in NPY_UNUSED. */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/* ULONG_* functions (invert ... sign) declared through NPY_CPU_DISPATCH_DECLARE; */
+/* each takes (args, dimensions, steps, func), with func wrapped in NPY_UNUSED. */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/* ULONGLONG_* functions (invert ... sign) declared through NPY_CPU_DISPATCH_DECLARE; */
+/* each takes (args, dimensions, steps, func), with func wrapped in NPY_UNUSED. */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/* BYTE_* functions (invert ... sign) declared through NPY_CPU_DISPATCH_DECLARE; */
+/* each takes (args, dimensions, steps, func), with func wrapped in NPY_UNUSED. */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/* SHORT_* functions (invert ... sign) declared through NPY_CPU_DISPATCH_DECLARE; */
+/* each takes (args, dimensions, steps, func), with func wrapped in NPY_UNUSED. */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/* INT_* functions (invert ... sign) declared through NPY_CPU_DISPATCH_DECLARE; */
+/* each takes (args, dimensions, steps, func), with func wrapped in NPY_UNUSED. */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* LONG_invert through LONG_sign: 21 void declarations, each wrapped in NPY_CPU_DISPATCH_DECLARE and sharing the (args, dimensions, steps, unused func) parameters; the macro expansion is defined elsewhere. */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* LONGLONG_invert through LONGLONG_sign: 21 void declarations, each wrapped in NPY_CPU_DISPATCH_DECLARE and sharing the (args, dimensions, steps, unused func) parameters; the macro expansion is defined elsewhere. */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+/* BYTE declarations. The macros alias floor_divide to divide and fmax/fmin to maximum/minimum (plain and _indexed names); BYTE_divide and BYTE_divide_indexed are not declared in this section. */
+#line 160
+#line 165
+#define BYTE_floor_divide BYTE_divide
+#define BYTE_floor_divide_indexed BYTE_divide_indexed
+#define BYTE_fmax BYTE_maximum
+#define BYTE_fmin BYTE_minimum
+#define BYTE_fmax_indexed BYTE_maximum_indexed
+#define BYTE_fmin_indexed BYTE_minimum_indexed
+
+NPY_NO_EXPORT void
+BYTE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+BYTE_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* BYTE _indexed prototypes (add through right_shift): each returns int and takes an unused PyArrayMethod_Context, char *const *args, dimensions, steps and an unused NpyAuxData. */
+#line 184
+NPY_NO_EXPORT int
+BYTE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+BYTE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+BYTE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+BYTE_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+BYTE_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+BYTE_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+BYTE_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+BYTE_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* BYTE maximum and minimum: a void prototype plus an int-returning _indexed prototype for each. */
+#line 193
+NPY_NO_EXPORT void
+BYTE_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+BYTE_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+BYTE_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+BYTE_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* BYTE power, gcd, lcm: void prototypes with the same (args, dimensions, steps, unused func) parameters as BYTE_positive. */
+NPY_NO_EXPORT void
+BYTE_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+BYTE_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+BYTE_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+/* UBYTE declarations. The macros alias floor_divide to divide and fmax/fmin to maximum/minimum (plain and _indexed names); UBYTE_divide and UBYTE_divide_indexed are not declared in this section. */
+#line 165
+#define UBYTE_floor_divide UBYTE_divide
+#define UBYTE_floor_divide_indexed UBYTE_divide_indexed
+#define UBYTE_fmax UBYTE_maximum
+#define UBYTE_fmin UBYTE_minimum
+#define UBYTE_fmax_indexed UBYTE_maximum_indexed
+#define UBYTE_fmin_indexed UBYTE_minimum_indexed
+
+NPY_NO_EXPORT void
+UBYTE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+UBYTE_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* UBYTE _indexed prototypes (add through right_shift): each returns int and takes an unused PyArrayMethod_Context, char *const *args, dimensions, steps and an unused NpyAuxData. */
+#line 184
+NPY_NO_EXPORT int
+UBYTE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UBYTE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UBYTE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UBYTE_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UBYTE_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UBYTE_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UBYTE_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UBYTE_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* UBYTE maximum and minimum: a void prototype plus an int-returning _indexed prototype for each. */
+#line 193
+NPY_NO_EXPORT void
+UBYTE_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+UBYTE_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+UBYTE_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+UBYTE_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* UBYTE power, gcd, lcm: void prototypes with the same (args, dimensions, steps, unused func) parameters as UBYTE_positive. */
+NPY_NO_EXPORT void
+UBYTE_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+UBYTE_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+UBYTE_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+
+/* SHORT declarations. The macros alias floor_divide to divide and fmax/fmin to maximum/minimum (plain and _indexed names); SHORT_divide and SHORT_divide_indexed are not declared in this section. */
+#line 160
+#line 165
+#define SHORT_floor_divide SHORT_divide
+#define SHORT_floor_divide_indexed SHORT_divide_indexed
+#define SHORT_fmax SHORT_maximum
+#define SHORT_fmin SHORT_minimum
+#define SHORT_fmax_indexed SHORT_maximum_indexed
+#define SHORT_fmin_indexed SHORT_minimum_indexed
+
+NPY_NO_EXPORT void
+SHORT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+SHORT_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* SHORT _indexed prototypes (add through right_shift): each returns int and takes an unused PyArrayMethod_Context, char *const *args, dimensions, steps and an unused NpyAuxData. */
+#line 184
+NPY_NO_EXPORT int
+SHORT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+SHORT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+SHORT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+SHORT_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+SHORT_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+SHORT_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+SHORT_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+SHORT_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* SHORT maximum and minimum: a void prototype plus an int-returning _indexed prototype for each. */
+#line 193
+NPY_NO_EXPORT void
+SHORT_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+SHORT_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+SHORT_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+SHORT_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* SHORT power, gcd, lcm: void prototypes with the same (args, dimensions, steps, unused func) parameters as SHORT_positive. */
+NPY_NO_EXPORT void
+SHORT_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+SHORT_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+SHORT_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+/* USHORT declarations. The macros alias floor_divide to divide and fmax/fmin to maximum/minimum (plain and _indexed names); USHORT_divide and USHORT_divide_indexed are not declared in this section. */
+#line 165
+#define USHORT_floor_divide USHORT_divide
+#define USHORT_floor_divide_indexed USHORT_divide_indexed
+#define USHORT_fmax USHORT_maximum
+#define USHORT_fmin USHORT_minimum
+#define USHORT_fmax_indexed USHORT_maximum_indexed
+#define USHORT_fmin_indexed USHORT_minimum_indexed
+
+NPY_NO_EXPORT void
+USHORT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+USHORT_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* USHORT _indexed prototypes (add through right_shift): each returns int and takes an unused PyArrayMethod_Context, char *const *args, dimensions, steps and an unused NpyAuxData. */
+#line 184
+NPY_NO_EXPORT int
+USHORT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+USHORT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+USHORT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+USHORT_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+USHORT_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+USHORT_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+USHORT_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+USHORT_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* USHORT maximum and minimum: a void prototype plus an int-returning _indexed prototype for each. */
+#line 193
+NPY_NO_EXPORT void
+USHORT_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+USHORT_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+USHORT_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+USHORT_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* USHORT power, gcd, lcm: void prototypes with the same (args, dimensions, steps, unused func) parameters as USHORT_positive. */
+NPY_NO_EXPORT void
+USHORT_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+USHORT_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+USHORT_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+
+/* INT declarations. The macros alias floor_divide to divide and fmax/fmin to maximum/minimum (plain and _indexed names); INT_divide and INT_divide_indexed are not declared in this section. */
+#line 160
+#line 165
+#define INT_floor_divide INT_divide
+#define INT_floor_divide_indexed INT_divide_indexed
+#define INT_fmax INT_maximum
+#define INT_fmin INT_minimum
+#define INT_fmax_indexed INT_maximum_indexed
+#define INT_fmin_indexed INT_minimum_indexed
+
+NPY_NO_EXPORT void
+INT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+INT_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* INT _indexed prototypes (add through right_shift): each returns int and takes an unused PyArrayMethod_Context, char *const *args, dimensions, steps and an unused NpyAuxData. */
+#line 184
+NPY_NO_EXPORT int
+INT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+INT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+INT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+INT_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+INT_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+INT_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+INT_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+INT_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* INT maximum and minimum: a void prototype plus an int-returning _indexed prototype for each. */
+#line 193
+NPY_NO_EXPORT void
+INT_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+INT_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+INT_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+INT_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* INT power, gcd, lcm: void prototypes with the same (args, dimensions, steps, unused func) parameters as INT_positive. */
+NPY_NO_EXPORT void
+INT_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+INT_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+INT_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+/* UINT declarations. The macros alias floor_divide to divide and fmax/fmin to maximum/minimum (plain and _indexed names); UINT_divide and UINT_divide_indexed are not declared in this section. */
+#line 165
+#define UINT_floor_divide UINT_divide
+#define UINT_floor_divide_indexed UINT_divide_indexed
+#define UINT_fmax UINT_maximum
+#define UINT_fmin UINT_minimum
+#define UINT_fmax_indexed UINT_maximum_indexed
+#define UINT_fmin_indexed UINT_minimum_indexed
+
+NPY_NO_EXPORT void
+UINT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+UINT_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* UINT _indexed prototypes (add through right_shift): each returns int and takes an unused PyArrayMethod_Context, char *const *args, dimensions, steps and an unused NpyAuxData. */
+#line 184
+NPY_NO_EXPORT int
+UINT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UINT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UINT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UINT_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UINT_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UINT_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UINT_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UINT_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* UINT maximum and minimum: a void prototype plus an int-returning _indexed prototype for each. */
+#line 193
+NPY_NO_EXPORT void
+UINT_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+UINT_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+UINT_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+UINT_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* UINT power, gcd, lcm: void prototypes with the same (args, dimensions, steps, unused func) parameters as UINT_positive. */
+NPY_NO_EXPORT void
+UINT_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+UINT_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+UINT_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+
+/* LONG declarations. The macros alias floor_divide to divide and fmax/fmin to maximum/minimum (plain and _indexed names); LONG_divide and LONG_divide_indexed are not declared in this section. */
+#line 160
+#line 165
+#define LONG_floor_divide LONG_divide
+#define LONG_floor_divide_indexed LONG_divide_indexed
+#define LONG_fmax LONG_maximum
+#define LONG_fmin LONG_minimum
+#define LONG_fmax_indexed LONG_maximum_indexed
+#define LONG_fmin_indexed LONG_minimum_indexed
+
+NPY_NO_EXPORT void
+LONG__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+LONG_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* LONG _indexed prototypes (add through right_shift): each returns int and takes an unused PyArrayMethod_Context, char *const *args, dimensions, steps and an unused NpyAuxData. */
+#line 184
+NPY_NO_EXPORT int
+LONG_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONG_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONG_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONG_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONG_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONG_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONG_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONG_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* LONG maximum and minimum: a void prototype plus an int-returning _indexed prototype for each. */
+#line 193
+NPY_NO_EXPORT void
+LONG_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONG_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+LONG_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONG_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* LONG power, gcd, lcm: void prototypes with the same (args, dimensions, steps, unused func) parameters as LONG_positive. */
+NPY_NO_EXPORT void
+LONG_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONG_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONG_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+/* ULONG declarations. The macros alias floor_divide to divide and fmax/fmin to maximum/minimum (plain and _indexed names); ULONG_divide and ULONG_divide_indexed are not declared in this section. */
+#line 165
+#define ULONG_floor_divide ULONG_divide
+#define ULONG_floor_divide_indexed ULONG_divide_indexed
+#define ULONG_fmax ULONG_maximum
+#define ULONG_fmin ULONG_minimum
+#define ULONG_fmax_indexed ULONG_maximum_indexed
+#define ULONG_fmin_indexed ULONG_minimum_indexed
+
+NPY_NO_EXPORT void
+ULONG__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+ULONG_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* ULONG _indexed prototypes (add through right_shift): each returns int and takes an unused PyArrayMethod_Context, char *const *args, dimensions, steps and an unused NpyAuxData. */
+#line 184
+NPY_NO_EXPORT int
+ULONG_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONG_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONG_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONG_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONG_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONG_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONG_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONG_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* ULONG maximum and minimum: a void prototype plus an int-returning _indexed prototype for each. */
+#line 193
+NPY_NO_EXPORT void
+ULONG_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+ULONG_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+ULONG_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+ULONG_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* ULONG power, gcd, lcm: void prototypes with the same (args, dimensions, steps, unused func) parameters as ULONG_positive. */
+NPY_NO_EXPORT void
+ULONG_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+ULONG_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+ULONG_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+
+
+#line 160
+#line 165
+#define LONGLONG_floor_divide LONGLONG_divide
+#define LONGLONG_floor_divide_indexed LONGLONG_divide_indexed
+#define LONGLONG_fmax LONGLONG_maximum
+#define LONGLONG_fmin LONGLONG_minimum
+#define LONGLONG_fmax_indexed LONGLONG_maximum_indexed
+#define LONGLONG_fmin_indexed LONGLONG_minimum_indexed
+/* The macros above make floor_divide an alias of divide, and fmax/fmin aliases of maximum/minimum (plain and *_indexed forms). NOTE(review): LONGLONG_divide / LONGLONG_divide_indexed are not declared in this group; presumably declared elsewhere in this header - confirm. */
+NPY_NO_EXPORT void
+LONGLONG__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+LONGLONG_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* *_indexed loops: return int and take a PyArrayMethod_Context and NpyAuxData, unlike the void-returning loops that take a trailing void pointer. */
+#line 184
+NPY_NO_EXPORT int
+LONGLONG_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONGLONG_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONGLONG_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONGLONG_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONGLONG_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONGLONG_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONGLONG_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONGLONG_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* maximum/minimum each get a plain loop and an *_indexed loop; these are the targets of the fmax/fmin alias macros in this group. */
+#line 193
+NPY_NO_EXPORT void
+LONGLONG_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGLONG_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+LONGLONG_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGLONG_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* power, gcd and lcm: plain loops only; no *_indexed prototype is declared for them in this group. */
+NPY_NO_EXPORT void
+LONGLONG_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGLONG_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGLONG_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+
+#line 165
+#define ULONGLONG_floor_divide ULONGLONG_divide
+#define ULONGLONG_floor_divide_indexed ULONGLONG_divide_indexed
+#define ULONGLONG_fmax ULONGLONG_maximum
+#define ULONGLONG_fmin ULONGLONG_minimum
+#define ULONGLONG_fmax_indexed ULONGLONG_maximum_indexed
+#define ULONGLONG_fmin_indexed ULONGLONG_minimum_indexed
+/* The macros above make floor_divide an alias of divide, and fmax/fmin aliases of maximum/minimum (plain and *_indexed forms). NOTE(review): ULONGLONG_divide / ULONGLONG_divide_indexed are not declared in this group; presumably declared elsewhere in this header - confirm. */
+NPY_NO_EXPORT void
+ULONGLONG__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+ULONGLONG_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* *_indexed loops: return int and take a PyArrayMethod_Context and NpyAuxData, unlike the void-returning loops that take a trailing void pointer. */
+#line 184
+NPY_NO_EXPORT int
+ULONGLONG_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONGLONG_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONGLONG_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONGLONG_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONGLONG_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONGLONG_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONGLONG_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONGLONG_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* maximum/minimum each get a plain loop and an *_indexed loop; these are the targets of the fmax/fmin alias macros in this group. */
+#line 193
+NPY_NO_EXPORT void
+ULONGLONG_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+ULONGLONG_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+ULONGLONG_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+ULONGLONG_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* power, gcd and lcm: plain loops only; no *_indexed prototype is declared for them in this group. */
+NPY_NO_EXPORT void
+ULONGLONG_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+ULONGLONG_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+ULONGLONG_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+
+
+
+#line 218
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Each comparison in this group is declared twice, as a _Qq_ and a _qQ_ loop. NOTE(review): presumably Q/q are the NumPy type characters for unsigned/signed long long, giving both operand orders of a mixed-signedness comparison - confirm against the definitions. */
+
+#line 218
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 218
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 218
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 218
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 218
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary.dispatch.h"
+#endif
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/* Every *_negative prototype in this group (five unsigned, then five signed integer types) is wrapped in NPY_CPU_DISPATCH_DECLARE, after loops_unary.dispatch.h is included unless NPY_DISABLE_OPTIMIZATION is defined. NOTE(review): presumably the macro emits one declaration per enabled CPU target - confirm against the dispatcher headers. */
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+
+
+/*
+ *****************************************************************************
+ **                             FLOAT LOOPS                                 **
+ *****************************************************************************
+ */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary_fp.dispatch.h"
+#endif
+#line 253
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_rint,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/* This group declares rint, floor, trunc, ceil, sqrt, absolute, square and reciprocal for FLOAT and then DOUBLE, all through NPY_CPU_DISPATCH_DECLARE following the conditional include of loops_unary_fp.dispatch.h. */
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_floor,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_trunc,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_ceil,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_sqrt,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_absolute,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_square,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_reciprocal,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 253
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_rint,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_floor,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_trunc,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_ceil,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_sqrt,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_absolute,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_square,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_reciprocal,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary_fp_le.dispatch.h"
+#endif
+#line 267
+#line 270
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_isnan,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/* This group declares isnan, isinf, isfinite and signbit for FLOAT and then DOUBLE, all through NPY_CPU_DISPATCH_DECLARE following the conditional include of loops_unary_fp_le.dispatch.h. */
+#line 270
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_isinf,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 270
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_isfinite,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 270
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_signbit,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 267
+#line 270
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_isnan,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 270
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_isinf,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 270
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_isfinite,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 270
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_signbit,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary.dispatch.h"
+#endif
+#line 281
+#line 284
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/* negative for FLOAT, DOUBLE and LONGDOUBLE. loops_unary.dispatch.h is included here again although the integer *_negative group earlier in this header already includes it. NOTE(review): presumably the repeat is needed to re-select that dispatch configuration after other *.dispatch.h includes - confirm. */
+
+#line 281
+#line 284
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 281
+#line 284
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGDOUBLE_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_arithm_fp.dispatch.h"
+#endif
+#line 295
+#line 299
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/* For add, subtract, multiply and divide (FLOAT, then DOUBLE) the plain loop is declared through NPY_CPU_DISPATCH_DECLARE, while its *_indexed companion is an ordinary NPY_NO_EXPORT prototype returning int. */
+NPY_NO_EXPORT int
+FLOAT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 299
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+FLOAT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 299
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+FLOAT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 299
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+FLOAT_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+#line 295
+#line 299
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+DOUBLE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 299
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+DOUBLE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 299
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+DOUBLE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 299
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+DOUBLE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_hyperbolic.dispatch.h"
+#endif
+#line 314
+#line 317
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_tanh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/* tanh for FLOAT and DOUBLE under loops_hyperbolic.dispatch.h. NOTE(review): the same two names are declared again under loops_umath_fp.dispatch.h further down in this header - confirm the double declaration is intended. */
+
+#line 314
+#line 317
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_tanh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+
+// SVML
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_umath_fp.dispatch.h"
+#endif
+/* Sixteen unary math loops (tanh through arctanh) are declared for FLOAT and then DOUBLE via NPY_CPU_DISPATCH_DECLARE. NOTE(review): FLOAT_tanh and DOUBLE_tanh are also declared earlier in this header under loops_hyperbolic.dispatch.h - confirm which dispatch source is meant to provide them. */
+#line 330
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_tanh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_exp2,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_log2,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_log10,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_expm1,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_log1p,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_cbrt,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_tan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_arcsin,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_arccos,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_arctan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_sinh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_cosh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_arcsinh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_arccosh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_arctanh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+
+#line 330
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_tanh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_exp2,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_log2,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_log10,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_expm1,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_log1p,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_cbrt,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_tan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_arcsin,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_arccos,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_arctan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_sinh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_cosh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_arcsinh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_arccosh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_arctanh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+
+
+#line 343
+/* HALF_* unary math loops (sin through arctanh), declared via NPY_CPU_DISPATCH_DECLARE. No dispatch header is included for this group itself; the most recent include above it is loops_umath_fp.dispatch.h. NOTE(review): presumably that is the intended dispatch configuration - confirm. */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_sin,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_cos,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_tan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_exp,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_exp2,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_log,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_log2,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_log10,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_expm1,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_log1p,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_cbrt,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_arcsin,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_arccos,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_arctan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_sinh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_cosh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_tanh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_arcsinh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_arccosh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_arctanh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+
+#line 352
+#line 355
+/* power and arctan2 for FLOAT and then DOUBLE, declared via NPY_CPU_DISPATCH_DECLARE with the same parameter list as the other loops in this header. No dispatch header is included for this group itself. */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_power,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 355
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_arctan2,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+
+#line 352
+#line 355
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_power,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 355
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_arctan2,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_trigonometric.dispatch.h"
+#endif
+/* sin and cos for FLOAT and then DOUBLE, declared via NPY_CPU_DISPATCH_DECLARE following the conditional include of loops_trigonometric.dispatch.h. */
+#line 369
+#line 372
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_sin, (
+    char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 372
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_cos, (
+    char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+
+#line 369
+#line 372
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_sin, (
+    char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 372
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_cos, (
+    char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_exponent_log.dispatch.h"
+#endif
+#line 384
+#line 387
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_exp, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+/* This group declares exp, log, frexp and ldexp for FLOAT and then DOUBLE, all through NPY_CPU_DISPATCH_DECLARE following the conditional include of loops_exponent_log.dispatch.h. */
+#line 387
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_log, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 387
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_frexp, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 387
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_ldexp, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+
+#line 384
+#line 387
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_exp, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 387
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_log, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 387
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_frexp, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 387
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_ldexp, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION /* the include below is skipped when this macro is defined */
+    #include "loops_comparison.dispatch.h"
+#endif /* NPY_DISABLE_OPTIMIZATION */
+#line 399
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_equal, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+/* Six FLOAT comparison loops in total: equal, not_equal, less, less_equal, greater, greater_equal. */
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_not_equal, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_less, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_less_equal, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_greater, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_greater_equal, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+/* DOUBLE variants of the same six comparison loops. */
+#line 399
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_equal, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_not_equal, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_less, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_less_equal, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_greater, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_greater_equal, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+
+
+#line 416
+/* HALF arithmetic loops: each operation declares a plain loop plus an *_indexed variant that returns int. */
+#line 422
+NPY_NO_EXPORT void
+HALF_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+HALF_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+HALF_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+HALF_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+HALF_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+HALF_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+HALF_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+HALF_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/**end repeat1**/
+/* HALF logical loops: and, or, xor, not. */
+#line 435
+NPY_NO_EXPORT void
+HALF_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 435
+NPY_NO_EXPORT void
+HALF_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+HALF_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 450
+/* Constant guards below: "!0" is 1, so every HALF prototype in this group is compiled in. */
+#if !0 || !1
+NPY_NO_EXPORT void
+HALF_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !1
+NPY_NO_EXPORT void
+HALF_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !1
+NPY_NO_EXPORT void
+HALF_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !1
+NPY_NO_EXPORT void
+HALF_signbit(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !0
+NPY_NO_EXPORT void
+HALF_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !0
+NPY_NO_EXPORT void
+HALF_nextafter(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !0
+NPY_NO_EXPORT void
+HALF_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+
+#line 461
+NPY_NO_EXPORT void
+HALF_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Indexed companion of the loop above; it returns int and takes a context plus NpyAuxData. */
+NPY_NO_EXPORT int
+HALF_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 461
+NPY_NO_EXPORT void
+HALF_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+HALF_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* fmax/fmin use the same plain + indexed pairing as maximum/minimum. */
+#line 472
+NPY_NO_EXPORT void
+HALF_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+HALF_fmax_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 472
+NPY_NO_EXPORT void
+HALF_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+HALF_fmin_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+HALF_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Only floor_divide has an *_indexed companion in this group of HALF loops. */
+NPY_NO_EXPORT int
+HALF_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+HALF_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+HALF__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+HALF_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* HALF_absolute also appears in a NPY_CPU_DISPATCH_DECLARE(...) in the loops_autovec section of this header. */
+NPY_NO_EXPORT void
+HALF_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Constant guard is 1 for HALF, so the plain negative loop is declared. */
+#if 1
+NPY_NO_EXPORT void
+HALF_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+
+NPY_NO_EXPORT void
+HALF_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 416
+/* FLOAT arithmetic loops: each operation declares a plain loop plus an *_indexed variant that returns int. */
+#line 422
+NPY_NO_EXPORT void
+FLOAT_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+FLOAT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+FLOAT_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+FLOAT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+FLOAT_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+FLOAT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+FLOAT_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+FLOAT_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/**end repeat1**/
+/* FLOAT logical loops: and, or, xor, not. */
+#line 435
+NPY_NO_EXPORT void
+FLOAT_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 435
+NPY_NO_EXPORT void
+FLOAT_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+FLOAT_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 450
+/* Constant guards: "!1 || !1" is 0, so the isnan/isinf/isfinite/signbit prototypes below are compiled out. */
+#if !1 || !1
+NPY_NO_EXPORT void
+FLOAT_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+/* NOTE(review): the skipped FLOAT loops are presumably declared by a dispatch section outside this chunk -- confirm. */
+#line 450
+
+#if !1 || !1
+NPY_NO_EXPORT void
+FLOAT_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !1
+NPY_NO_EXPORT void
+FLOAT_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !1
+NPY_NO_EXPORT void
+FLOAT_signbit(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+/* "!1 || !0" is 1, so copysign/nextafter/spacing are declared. */
+#if !1 || !0
+NPY_NO_EXPORT void
+FLOAT_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !0
+NPY_NO_EXPORT void
+FLOAT_nextafter(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !0
+NPY_NO_EXPORT void
+FLOAT_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+
+#line 461
+NPY_NO_EXPORT void
+FLOAT_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Indexed companion of the loop above; it returns int and takes a context plus NpyAuxData. */
+NPY_NO_EXPORT int
+FLOAT_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 461
+NPY_NO_EXPORT void
+FLOAT_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+FLOAT_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* fmax/fmin use the same plain + indexed pairing as maximum/minimum. */
+#line 472
+NPY_NO_EXPORT void
+FLOAT_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+FLOAT_fmax_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 472
+NPY_NO_EXPORT void
+FLOAT_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+FLOAT_fmin_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+FLOAT_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Only floor_divide has an *_indexed companion in this group of FLOAT loops. */
+NPY_NO_EXPORT int
+FLOAT_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+FLOAT_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+FLOAT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+FLOAT_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Constant guard is 0: this plain FLOAT_negative prototype is compiled out. NOTE(review): presumably declared elsewhere -- confirm. */
+#if 0
+NPY_NO_EXPORT void
+FLOAT_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+
+NPY_NO_EXPORT void
+FLOAT_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* FLOAT_frexp and FLOAT_ldexp also appear in NPY_CPU_DISPATCH_DECLARE(...) in the loops_exponent_log section of this header. */
+NPY_NO_EXPORT void
+FLOAT_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 416
+/* DOUBLE arithmetic loops: each operation declares a plain loop plus an *_indexed variant that returns int. */
+#line 422
+NPY_NO_EXPORT void
+DOUBLE_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+DOUBLE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+DOUBLE_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+DOUBLE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+DOUBLE_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+DOUBLE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+DOUBLE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+DOUBLE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/**end repeat1**/
+/* DOUBLE logical loops: and, or, xor, not. */
+#line 435
+NPY_NO_EXPORT void
+DOUBLE_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 435
+NPY_NO_EXPORT void
+DOUBLE_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+DOUBLE_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 450
+/* Constant guards: "!1 || !1" is 0, so the isnan/isinf/isfinite/signbit prototypes below are compiled out. */
+#if !1 || !1
+NPY_NO_EXPORT void
+DOUBLE_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+/* NOTE(review): the skipped DOUBLE loops are presumably declared by a dispatch section outside this chunk -- confirm. */
+#line 450
+
+#if !1 || !1
+NPY_NO_EXPORT void
+DOUBLE_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !1
+NPY_NO_EXPORT void
+DOUBLE_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !1
+NPY_NO_EXPORT void
+DOUBLE_signbit(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+/* "!1 || !0" is 1, so copysign/nextafter/spacing are declared. */
+#if !1 || !0
+NPY_NO_EXPORT void
+DOUBLE_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !0
+NPY_NO_EXPORT void
+DOUBLE_nextafter(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !0
+NPY_NO_EXPORT void
+DOUBLE_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+
+#line 461
+NPY_NO_EXPORT void
+DOUBLE_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Indexed companion of the loop above; it returns int and takes a context plus NpyAuxData. */
+NPY_NO_EXPORT int
+DOUBLE_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 461
+NPY_NO_EXPORT void
+DOUBLE_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+DOUBLE_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* fmax/fmin use the same plain + indexed pairing as maximum/minimum. */
+#line 472
+NPY_NO_EXPORT void
+DOUBLE_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+DOUBLE_fmax_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 472
+NPY_NO_EXPORT void
+DOUBLE_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+DOUBLE_fmin_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+DOUBLE_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Only floor_divide has an *_indexed companion in this group of DOUBLE loops. */
+NPY_NO_EXPORT int
+DOUBLE_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+DOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+DOUBLE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+DOUBLE_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Constant guard is 0: this plain DOUBLE_negative prototype is compiled out. NOTE(review): presumably declared elsewhere -- confirm. */
+#if 0
+NPY_NO_EXPORT void
+DOUBLE_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+
+NPY_NO_EXPORT void
+DOUBLE_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* DOUBLE_frexp and DOUBLE_ldexp also appear in NPY_CPU_DISPATCH_DECLARE(...) in the loops_exponent_log section of this header. */
+NPY_NO_EXPORT void
+DOUBLE_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 416
+/* LONGDOUBLE arithmetic loops: each operation declares a plain loop plus an *_indexed variant that returns int. */
+#line 422
+NPY_NO_EXPORT void
+LONGDOUBLE_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGDOUBLE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+LONGDOUBLE_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGDOUBLE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+LONGDOUBLE_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGDOUBLE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+LONGDOUBLE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGDOUBLE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/**end repeat1**/
+/* LONGDOUBLE logical loops: and, or, xor, not. */
+#line 435
+NPY_NO_EXPORT void
+LONGDOUBLE_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 435
+NPY_NO_EXPORT void
+LONGDOUBLE_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+LONGDOUBLE_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 450
+/* Constant guards below: "!0" is 1, so every LONGDOUBLE prototype in this group is compiled in. */
+#if !0 || !1
+NPY_NO_EXPORT void
+LONGDOUBLE_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !1
+NPY_NO_EXPORT void
+LONGDOUBLE_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !1
+NPY_NO_EXPORT void
+LONGDOUBLE_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !1
+NPY_NO_EXPORT void
+LONGDOUBLE_signbit(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !0
+NPY_NO_EXPORT void
+LONGDOUBLE_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !0
+NPY_NO_EXPORT void
+LONGDOUBLE_nextafter(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !0
+NPY_NO_EXPORT void
+LONGDOUBLE_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+
+#line 461
+NPY_NO_EXPORT void
+LONGDOUBLE_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Indexed companion of the loop above; it returns int and takes a context plus NpyAuxData. */
+NPY_NO_EXPORT int
+LONGDOUBLE_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 461
+NPY_NO_EXPORT void
+LONGDOUBLE_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGDOUBLE_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* fmax/fmin use the same plain + indexed pairing as maximum/minimum. */
+#line 472
+NPY_NO_EXPORT void
+LONGDOUBLE_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGDOUBLE_fmax_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 472
+NPY_NO_EXPORT void
+LONGDOUBLE_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGDOUBLE_fmin_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+LONGDOUBLE_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Only floor_divide has an *_indexed companion in this group of LONGDOUBLE loops. */
+NPY_NO_EXPORT int
+LONGDOUBLE_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+LONGDOUBLE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Constant guard is 0: this plain LONGDOUBLE_negative prototype is compiled out. NOTE(review): presumably declared elsewhere -- confirm. */
+#if 0
+NPY_NO_EXPORT void
+LONGDOUBLE_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+
+NPY_NO_EXPORT void
+LONGDOUBLE_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 534
+#line 537
+NPY_NO_EXPORT void
+HALF_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* HALF comparison loops are plain prototypes here; the FLOAT/DOUBLE ones in this header go through NPY_CPU_DISPATCH_DECLARE. */
+#line 537
+NPY_NO_EXPORT void
+HALF_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+HALF_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+HALF_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+HALF_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+HALF_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+/* LONGDOUBLE comparison loops: the same six operators as HALF above. */
+#line 534
+#line 537
+NPY_NO_EXPORT void
+LONGDOUBLE_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+LONGDOUBLE_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+LONGDOUBLE_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+LONGDOUBLE_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+LONGDOUBLE_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+LONGDOUBLE_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION /* the include below is skipped when this macro is defined */
+    #include "loops_autovec.dispatch.h"
+#endif /* NPY_DISABLE_OPTIMIZATION */
+#line 548
+#line 551
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+/*
+ *****************************************************************************
+ **                           COMPLEX LOOPS                                 **
+ *****************************************************************************
+ */
+#ifndef NPY_DISABLE_OPTIMIZATION /* the include below is skipped when this macro is defined */
+    #include "loops_arithm_fp.dispatch.h"
+#endif /* NPY_DISABLE_OPTIMIZATION */
+#line 566
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CFLOAT_add,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/* Five CFLOAT loops in total: add, subtract, multiply, conjugate, square; the unused last slot is named data here. */
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CFLOAT_subtract,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CFLOAT_multiply,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CFLOAT_conjugate,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CFLOAT_square,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+/* CDOUBLE counterparts of the five CFLOAT loops above. */
+#line 566
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CDOUBLE_add,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CDOUBLE_subtract,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CDOUBLE_multiply,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CDOUBLE_conjugate,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CDOUBLE_square,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary_complex.dispatch.h"
+#endif
+#line 580
+#line 583
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CFLOAT_absolute,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 580
+#line 583
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CDOUBLE_absolute,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+/* Pair comparisons of (xr,xi) vs (yr,yi), lexicographic for the ordering ones; plain expressions. */
+#define CGE(xr,xi,yr,yi) ((xr) > (yr) || ((xr) == (yr) && (xi) >= (yi)))
+#define CLE(xr,xi,yr,yi) ((xr) < (yr) || ((xr) == (yr) && (xi) <= (yi)))
+#define CGT(xr,xi,yr,yi) ((xr) > (yr) || ((xr) == (yr) && (xi) > (yi)))
+#define CLT(xr,xi,yr,yi) ((xr) < (yr) || ((xr) == (yr) && (xi) < (yi)))
+#define CEQ(xr,xi,yr,yi) ((xr) == (yr) && (xi) == (yi))
+#define CNE(xr,xi,yr,yi) ((xr) != (yr) || (xi) != (yi))
+
+#line 601
+
+#line 606
+NPY_NO_EXPORT void
+CFLOAT_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CFLOAT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+#line 606
+NPY_NO_EXPORT void
+CFLOAT_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CFLOAT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+#line 606
+NPY_NO_EXPORT void
+CFLOAT_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CFLOAT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CFLOAT_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CFLOAT_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CFLOAT_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CFLOAT_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CFLOAT_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CFLOAT_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CFLOAT_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 629
+NPY_NO_EXPORT void
+CFLOAT_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 629
+NPY_NO_EXPORT void
+CFLOAT_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CFLOAT_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CFLOAT_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#line 643
+NPY_NO_EXPORT void
+CFLOAT_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 643
+NPY_NO_EXPORT void
+CFLOAT_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 643
+NPY_NO_EXPORT void
+CFLOAT_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CFLOAT_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CFLOAT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CFLOAT_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CFLOAT_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CFLOAT_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CFLOAT__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CFLOAT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 672
+NPY_NO_EXPORT void
+CFLOAT_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 672
+NPY_NO_EXPORT void
+CFLOAT_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 680
+NPY_NO_EXPORT void
+CFLOAT_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 680
+NPY_NO_EXPORT void
+CFLOAT_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+#line 601
+
+#line 606
+NPY_NO_EXPORT void
+CDOUBLE_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CDOUBLE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+#line 606
+NPY_NO_EXPORT void
+CDOUBLE_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CDOUBLE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+#line 606
+NPY_NO_EXPORT void
+CDOUBLE_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CDOUBLE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CDOUBLE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CDOUBLE_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CDOUBLE_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CDOUBLE_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CDOUBLE_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CDOUBLE_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CDOUBLE_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 629
+NPY_NO_EXPORT void
+CDOUBLE_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 629
+NPY_NO_EXPORT void
+CDOUBLE_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CDOUBLE_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CDOUBLE_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#line 643
+NPY_NO_EXPORT void
+CDOUBLE_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 643
+NPY_NO_EXPORT void
+CDOUBLE_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 643
+NPY_NO_EXPORT void
+CDOUBLE_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CDOUBLE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CDOUBLE_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CDOUBLE_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CDOUBLE_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CDOUBLE__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CDOUBLE_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 672
+NPY_NO_EXPORT void
+CDOUBLE_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 672
+NPY_NO_EXPORT void
+CDOUBLE_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 680
+NPY_NO_EXPORT void
+CDOUBLE_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 680
+NPY_NO_EXPORT void
+CDOUBLE_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+#line 601
+
+#line 606
+NPY_NO_EXPORT void
+CLONGDOUBLE_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CLONGDOUBLE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+#line 606
+NPY_NO_EXPORT void
+CLONGDOUBLE_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CLONGDOUBLE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+#line 606
+NPY_NO_EXPORT void
+CLONGDOUBLE_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CLONGDOUBLE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CLONGDOUBLE_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CLONGDOUBLE_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CLONGDOUBLE_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CLONGDOUBLE_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CLONGDOUBLE_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CLONGDOUBLE_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 629
+NPY_NO_EXPORT void
+CLONGDOUBLE_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 629
+NPY_NO_EXPORT void
+CLONGDOUBLE_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#line 643
+NPY_NO_EXPORT void
+CLONGDOUBLE_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 643
+NPY_NO_EXPORT void
+CLONGDOUBLE_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 643
+NPY_NO_EXPORT void
+CLONGDOUBLE_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CLONGDOUBLE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CLONGDOUBLE__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 672
+NPY_NO_EXPORT void
+CLONGDOUBLE_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 672
+NPY_NO_EXPORT void
+CLONGDOUBLE_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 680
+NPY_NO_EXPORT void
+CLONGDOUBLE_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 680
+NPY_NO_EXPORT void
+CLONGDOUBLE_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+
+#undef CGE
+#undef CLE
+#undef CGT
+#undef CLT
+#undef CEQ
+#undef CNE
+
+/*
+ *****************************************************************************
+ **                            DATETIME LOOPS                               **
+ *****************************************************************************
+ */
+
+NPY_NO_EXPORT void
+TIMEDELTA_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 714
+
+NPY_NO_EXPORT void
+DATETIME_isnat(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DATETIME_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#define DATETIME_isnan DATETIME_isnat
+
+NPY_NO_EXPORT void
+DATETIME__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+#line 730
+NPY_NO_EXPORT void
+DATETIME_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+DATETIME_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+DATETIME_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+DATETIME_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+DATETIME_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+DATETIME_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 737
+NPY_NO_EXPORT void
+DATETIME_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 737
+NPY_NO_EXPORT void
+DATETIME_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 737
+NPY_NO_EXPORT void
+DATETIME_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 737
+NPY_NO_EXPORT void
+DATETIME_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+#line 714
+
+NPY_NO_EXPORT void
+TIMEDELTA_isnat(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#define TIMEDELTA_isnan TIMEDELTA_isnat
+
+NPY_NO_EXPORT void
+TIMEDELTA__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+#line 730
+NPY_NO_EXPORT void
+TIMEDELTA_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+TIMEDELTA_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+TIMEDELTA_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+TIMEDELTA_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+TIMEDELTA_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+TIMEDELTA_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 737
+NPY_NO_EXPORT void
+TIMEDELTA_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 737
+NPY_NO_EXPORT void
+TIMEDELTA_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 737
+NPY_NO_EXPORT void
+TIMEDELTA_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 737
+NPY_NO_EXPORT void
+TIMEDELTA_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+
+NPY_NO_EXPORT void
+DATETIME_Mm_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+DATETIME_mM_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_m_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DATETIME_Mm_M_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DATETIME_MM_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mq_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_qm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_md_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_md_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_d_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+/* Special case equivalents to above functions */
+#define TIMEDELTA_mq_m_floor_divide TIMEDELTA_mq_m_divide
+#define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide
+/* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_autovec.dispatch.h"
+#endif
+#line 803
+#line 806
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void TIMEDELTA_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 803
+#line 806
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DATETIME_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+
+/*
+ *****************************************************************************
+ **                            OBJECT LOOPS                                 **
+ *****************************************************************************
+ */
+
+#line 821
+#line 824
+NPY_NO_EXPORT void
+OBJECT_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 824
+NPY_NO_EXPORT void
+OBJECT_OO_O_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 821
+#line 824
+NPY_NO_EXPORT void
+OBJECT_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 824
+NPY_NO_EXPORT void
+OBJECT_OO_O_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 821
+#line 824
+NPY_NO_EXPORT void
+OBJECT_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 824
+NPY_NO_EXPORT void
+OBJECT_OO_O_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 821
+#line 824
+NPY_NO_EXPORT void
+OBJECT_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 824
+NPY_NO_EXPORT void
+OBJECT_OO_O_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 821
+#line 824
+NPY_NO_EXPORT void
+OBJECT_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 824
+NPY_NO_EXPORT void
+OBJECT_OO_O_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 821
+#line 824
+NPY_NO_EXPORT void
+OBJECT_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 824
+NPY_NO_EXPORT void
+OBJECT_OO_O_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+OBJECT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func);
+
+/*
+ *****************************************************************************
+ **                            MIN/MAX LOOPS                                **
+ *****************************************************************************
+ */
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_minmax.dispatch.h"
+#endif
+
+//---------- Integers ----------
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+
+//---------- Float ----------
+
+ #line 864
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_fmax,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_fmin,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 864
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_fmax,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_fmin,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 864
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGDOUBLE_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGDOUBLE_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGDOUBLE_fmax,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGDOUBLE_fmin,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+
+/*
+ *****************************************************************************
+ **                              END LOOPS                                  **
+ *****************************************************************************
+ */
+
+#endif
+
diff --git a/nanvix-port/generated-headers/loops_utils.h b/nanvix-port/generated-headers/loops_utils.h
new file mode 100644
index 000000000000..acda6a76704a
--- /dev/null
+++ b/nanvix-port/generated-headers/loops_utils.h
@@ -0,0 +1,597 @@
+#line 1 "numpy/core/src/umath/loops_utils.h.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+#ifndef _NPY_UMATH_LOOPS_UTILS_H_
+#define _NPY_UMATH_LOOPS_UTILS_H_
+
+#include "numpy/npy_common.h" // NPY_FINLINE
+#include "numpy/halffloat.h" // npy_half_to_float
+
+/**
+ * Old versions of MSVC cause ambiguous link errors when we deal with large SIMD kernels,
+ * which breaks the build; this is probably related to the following bug:
+ * https://developercommunity.visualstudio.com/content/problem/415095/internal-compiler-error-with-perfectly-forwarded-r.html
+ */
+#if defined(_MSC_VER) && _MSC_VER < 1916
+    #define SIMD_MSVC_NOINLINE __declspec(noinline)
+#else
+    #define SIMD_MSVC_NOINLINE
+#endif
+/*
+ * nomemoverlap - returns true if the ranges [ip, ip + ip_size] and [op, op + op_size] are either exactly
+ * identical or completely disjoint (ranges that merely touch at an end point count as overlapping), else false.
+ * ip_size/op_size = signed extent of each array, in units of char; a negative value indicates negative steps.
+ */
+NPY_FINLINE npy_bool
+nomemoverlap(char *ip, npy_intp ip_size, char *op, npy_intp op_size)
+{
+    char *ip_start, *ip_end, *op_start, *op_end;
+    if (ip_size < 0) {  /* negative extent: ip is the high end of the input range */
+        ip_start = ip + ip_size;
+        ip_end = ip;
+    }
+    else {
+        ip_start = ip;
+        ip_end = ip + ip_size;
+    }
+    if (op_size < 0) {  /* negative extent: op is the high end of the output range */
+        op_start = op + op_size;
+        op_end = op;
+    }
+    else {
+        op_start = op;
+        op_end = op + op_size;
+    }
+    return (ip_start == op_start && op_end == ip_end) ||  /* same range on both sides */
+           (ip_start > op_end) || (op_start > ip_end);    /* one range lies strictly past the other */
+}
+
+// returns true if two strided arrays have an overlapping region in memory; this is the logical negation
+// of `nomemoverlap()`, called with extents computed as step * len from the given step sizes and array length
+NPY_FINLINE npy_bool
+is_mem_overlap(const void *src, npy_intp src_step, const void *dst, npy_intp dst_step, npy_intp len)
+{
+    return !(nomemoverlap((char*)src, src_step*len, (char*)dst, dst_step*len));
+}
+
+/*
+ * cutoff blocksize for pairwise summation
+ * decreasing it decreases errors slightly as more pairs are summed, but
+ * also lowers performance; as the inner loop is unrolled eight times, it is
+ * effectively 16
+ */
+#define PW_BLOCKSIZE    128
+
+#line 71
+
+/*
+ * Pairwise summation of n npy_float values read at a + i * stride (a is char *); rounding error O(lg n) instead of O(n).
+ * Up to PW_BLOCKSIZE values are summed directly; longer inputs are split in two and summed recursively (depth O(lg n)).
+ * When updating, also update the similar complex float summation (the C*_pairwise_sum functions).
+ */
+static inline npy_float
+FLOAT_pairwise_sum(char *a, npy_intp n, npy_intp stride)
+{
+    if (n < 8) {  /* fewer values than accumulators: plain sequential sum */
+        npy_intp i;
+        /*
+         * Start with -0 to preserve -0 values.  The reason is that summing
+         * only -0 should return -0, but `0 + -0 == 0` while `-0 + -0 == -0`.
+         */
+        npy_float res = -0.0;
+
+        for (i = 0; i < n; i++) {
+            res += (*((npy_float*)(a + i * stride)));
+        }
+        return res;
+    }
+    else if (n <= PW_BLOCKSIZE) {
+        npy_intp i;
+        npy_float r[8], res;
+
+        /*
+         * sum a block with 8 accumulators (n >= 8 here, so the first 8 loads are in range)
+         * 8 times unroll reduces blocksize to 16 and allows vectorization with
+         * avx without changing summation ordering
+         */
+        r[0] = (*((npy_float *)(a + 0 * stride)));
+        r[1] = (*((npy_float *)(a + 1 * stride)));
+        r[2] = (*((npy_float *)(a + 2 * stride)));
+        r[3] = (*((npy_float *)(a + 3 * stride)));
+        r[4] = (*((npy_float *)(a + 4 * stride)));
+        r[5] = (*((npy_float *)(a + 5 * stride)));
+        r[6] = (*((npy_float *)(a + 6 * stride)));
+        r[7] = (*((npy_float *)(a + 7 * stride)));
+
+        for (i = 8; i < n - (n % 8); i += 8) {
+            /* small blocksizes seem to mess with hardware prefetch, hence the explicit NPY_PREFETCH */
+            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(npy_float))*stride, 0, 3);
+            r[0] += (*((npy_float *)(a + (i + 0) * stride)));
+            r[1] += (*((npy_float *)(a + (i + 1) * stride)));
+            r[2] += (*((npy_float *)(a + (i + 2) * stride)));
+            r[3] += (*((npy_float *)(a + (i + 3) * stride)));
+            r[4] += (*((npy_float *)(a + (i + 4) * stride)));
+            r[5] += (*((npy_float *)(a + (i + 5) * stride)));
+            r[6] += (*((npy_float *)(a + (i + 6) * stride)));
+            r[7] += (*((npy_float *)(a + (i + 7) * stride)));
+        }
+
+        /* accumulate now to avoid stack spills for single peel loop */
+        res = ((r[0] + r[1]) + (r[2] + r[3])) +
+              ((r[4] + r[5]) + (r[6] + r[7]));
+
+        /* add the remaining n % 8 trailing values one at a time */
+        for (; i < n; i++) {
+            res += (*((npy_float *)(a + i * stride)));
+        }
+        return res;
+    }
+    else {
+        /* split roughly in half, rounding the first part down to a multiple of 8 (the unroll factor) */
+        npy_intp n2 = n / 2;
+
+        n2 -= n2 % 8;
+        return FLOAT_pairwise_sum(a, n2, stride) +
+               FLOAT_pairwise_sum(a + n2 * stride, n - n2, stride);
+    }
+}
+
+
+#line 71
+
+/*
+ * Pairwise summation of n npy_double values read at a + i * stride (a is char *); rounding error O(lg n) instead of O(n).
+ * Up to PW_BLOCKSIZE values are summed directly; longer inputs are split in two and summed recursively (depth O(lg n)).
+ * When updating, also update the similar complex float summation (the C*_pairwise_sum functions).
+ */
+static inline npy_double
+DOUBLE_pairwise_sum(char *a, npy_intp n, npy_intp stride)
+{
+    if (n < 8) {  /* fewer values than accumulators: plain sequential sum */
+        npy_intp i;
+        /*
+         * Start with -0 to preserve -0 values.  The reason is that summing
+         * only -0 should return -0, but `0 + -0 == 0` while `-0 + -0 == -0`.
+         */
+        npy_double res = -0.0;
+
+        for (i = 0; i < n; i++) {
+            res += (*((npy_double*)(a + i * stride)));
+        }
+        return res;
+    }
+    else if (n <= PW_BLOCKSIZE) {
+        npy_intp i;
+        npy_double r[8], res;
+
+        /*
+         * sum a block with 8 accumulators (n >= 8 here, so the first 8 loads are in range)
+         * 8 times unroll reduces blocksize to 16 and allows vectorization with
+         * avx without changing summation ordering
+         */
+        r[0] = (*((npy_double *)(a + 0 * stride)));
+        r[1] = (*((npy_double *)(a + 1 * stride)));
+        r[2] = (*((npy_double *)(a + 2 * stride)));
+        r[3] = (*((npy_double *)(a + 3 * stride)));
+        r[4] = (*((npy_double *)(a + 4 * stride)));
+        r[5] = (*((npy_double *)(a + 5 * stride)));
+        r[6] = (*((npy_double *)(a + 6 * stride)));
+        r[7] = (*((npy_double *)(a + 7 * stride)));
+
+        for (i = 8; i < n - (n % 8); i += 8) {
+            /* small blocksizes seem to mess with hardware prefetch, hence the explicit NPY_PREFETCH */
+            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(npy_double))*stride, 0, 3);
+            r[0] += (*((npy_double *)(a + (i + 0) * stride)));
+            r[1] += (*((npy_double *)(a + (i + 1) * stride)));
+            r[2] += (*((npy_double *)(a + (i + 2) * stride)));
+            r[3] += (*((npy_double *)(a + (i + 3) * stride)));
+            r[4] += (*((npy_double *)(a + (i + 4) * stride)));
+            r[5] += (*((npy_double *)(a + (i + 5) * stride)));
+            r[6] += (*((npy_double *)(a + (i + 6) * stride)));
+            r[7] += (*((npy_double *)(a + (i + 7) * stride)));
+        }
+
+        /* accumulate now to avoid stack spills for single peel loop */
+        res = ((r[0] + r[1]) + (r[2] + r[3])) +
+              ((r[4] + r[5]) + (r[6] + r[7]));
+
+        /* add the remaining n % 8 trailing values one at a time */
+        for (; i < n; i++) {
+            res += (*((npy_double *)(a + i * stride)));
+        }
+        return res;
+    }
+    else {
+        /* split roughly in half, rounding the first part down to a multiple of 8 (the unroll factor) */
+        npy_intp n2 = n / 2;
+
+        n2 -= n2 % 8;
+        return DOUBLE_pairwise_sum(a, n2, stride) +
+               DOUBLE_pairwise_sum(a + n2 * stride, n - n2, stride);
+    }
+}
+
+
+#line 71
+
+/*
+ * Pairwise summation of n npy_longdouble values read at a + i * stride (a is char *); rounding error O(lg n) instead of O(n).
+ * Up to PW_BLOCKSIZE values are summed directly; longer inputs are split in two and summed recursively (depth O(lg n)).
+ * When updating, also update the similar complex float summation (the C*_pairwise_sum functions).
+ */
+static inline npy_longdouble
+LONGDOUBLE_pairwise_sum(char *a, npy_intp n, npy_intp stride)
+{
+    if (n < 8) {  /* fewer values than accumulators: plain sequential sum */
+        npy_intp i;
+        /*
+         * Start with -0 to preserve -0 values.  The reason is that summing
+         * only -0 should return -0, but `0 + -0 == 0` while `-0 + -0 == -0`.
+         */
+        npy_longdouble res = -0.0;
+
+        for (i = 0; i < n; i++) {
+            res += (*((npy_longdouble*)(a + i * stride)));
+        }
+        return res;
+    }
+    else if (n <= PW_BLOCKSIZE) {
+        npy_intp i;
+        npy_longdouble r[8], res;
+
+        /*
+         * sum a block with 8 accumulators (n >= 8 here, so the first 8 loads are in range)
+         * 8 times unroll reduces blocksize to 16 and allows vectorization with
+         * avx without changing summation ordering
+         */
+        r[0] = (*((npy_longdouble *)(a + 0 * stride)));
+        r[1] = (*((npy_longdouble *)(a + 1 * stride)));
+        r[2] = (*((npy_longdouble *)(a + 2 * stride)));
+        r[3] = (*((npy_longdouble *)(a + 3 * stride)));
+        r[4] = (*((npy_longdouble *)(a + 4 * stride)));
+        r[5] = (*((npy_longdouble *)(a + 5 * stride)));
+        r[6] = (*((npy_longdouble *)(a + 6 * stride)));
+        r[7] = (*((npy_longdouble *)(a + 7 * stride)));
+
+        for (i = 8; i < n - (n % 8); i += 8) {
+            /* small blocksizes seem to mess with hardware prefetch, hence the explicit NPY_PREFETCH */
+            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(npy_longdouble))*stride, 0, 3);
+            r[0] += (*((npy_longdouble *)(a + (i + 0) * stride)));
+            r[1] += (*((npy_longdouble *)(a + (i + 1) * stride)));
+            r[2] += (*((npy_longdouble *)(a + (i + 2) * stride)));
+            r[3] += (*((npy_longdouble *)(a + (i + 3) * stride)));
+            r[4] += (*((npy_longdouble *)(a + (i + 4) * stride)));
+            r[5] += (*((npy_longdouble *)(a + (i + 5) * stride)));
+            r[6] += (*((npy_longdouble *)(a + (i + 6) * stride)));
+            r[7] += (*((npy_longdouble *)(a + (i + 7) * stride)));
+        }
+
+        /* accumulate now to avoid stack spills for single peel loop */
+        res = ((r[0] + r[1]) + (r[2] + r[3])) +
+              ((r[4] + r[5]) + (r[6] + r[7]));
+
+        /* add the remaining n % 8 trailing values one at a time */
+        for (; i < n; i++) {
+            res += (*((npy_longdouble *)(a + i * stride)));
+        }
+        return res;
+    }
+    else {
+        /* split roughly in half, rounding the first part down to a multiple of 8 (the unroll factor) */
+        npy_intp n2 = n / 2;
+
+        n2 -= n2 % 8;
+        return LONGDOUBLE_pairwise_sum(a, n2, stride) +
+               LONGDOUBLE_pairwise_sum(a + n2 * stride, n - n2, stride);
+    }
+}
+
+
+#line 71
+
+/*
+ * Pairwise summation of n npy_half values read at a + i * stride (a is char *); rounding error O(lg n) instead of O(n).
+ * Each value is converted with npy_half_to_float and accumulated as npy_float; the recursion depth is O(lg n) as well.
+ * When updating, also update the similar complex float summation (the C*_pairwise_sum functions).
+ */
+static inline npy_float
+HALF_pairwise_sum(char *a, npy_intp n, npy_intp stride)
+{
+    if (n < 8) {  /* fewer values than accumulators: plain sequential sum */
+        npy_intp i;
+        /*
+         * Start with -0 to preserve -0 values.  The reason is that summing
+         * only -0 should return -0, but `0 + -0 == 0` while `-0 + -0 == -0`.
+         */
+        npy_float res = -0.0;
+
+        for (i = 0; i < n; i++) {
+            res += npy_half_to_float(*((npy_half*)(a + i * stride)));
+        }
+        return res;
+    }
+    else if (n <= PW_BLOCKSIZE) {
+        npy_intp i;
+        npy_float r[8], res;
+
+        /*
+         * sum a block with 8 accumulators (n >= 8 here, so the first 8 loads are in range)
+         * 8 times unroll reduces blocksize to 16 and allows vectorization with
+         * avx without changing summation ordering
+         */
+        r[0] = npy_half_to_float(*((npy_half *)(a + 0 * stride)));
+        r[1] = npy_half_to_float(*((npy_half *)(a + 1 * stride)));
+        r[2] = npy_half_to_float(*((npy_half *)(a + 2 * stride)));
+        r[3] = npy_half_to_float(*((npy_half *)(a + 3 * stride)));
+        r[4] = npy_half_to_float(*((npy_half *)(a + 4 * stride)));
+        r[5] = npy_half_to_float(*((npy_half *)(a + 5 * stride)));
+        r[6] = npy_half_to_float(*((npy_half *)(a + 6 * stride)));
+        r[7] = npy_half_to_float(*((npy_half *)(a + 7 * stride)));
+
+        for (i = 8; i < n - (n % 8); i += 8) {
+            /* small blocksizes seem to mess with hardware prefetch, hence the explicit NPY_PREFETCH */
+            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(npy_half))*stride, 0, 3);
+            r[0] += npy_half_to_float(*((npy_half *)(a + (i + 0) * stride)));
+            r[1] += npy_half_to_float(*((npy_half *)(a + (i + 1) * stride)));
+            r[2] += npy_half_to_float(*((npy_half *)(a + (i + 2) * stride)));
+            r[3] += npy_half_to_float(*((npy_half *)(a + (i + 3) * stride)));
+            r[4] += npy_half_to_float(*((npy_half *)(a + (i + 4) * stride)));
+            r[5] += npy_half_to_float(*((npy_half *)(a + (i + 5) * stride)));
+            r[6] += npy_half_to_float(*((npy_half *)(a + (i + 6) * stride)));
+            r[7] += npy_half_to_float(*((npy_half *)(a + (i + 7) * stride)));
+        }
+
+        /* accumulate now to avoid stack spills for single peel loop */
+        res = ((r[0] + r[1]) + (r[2] + r[3])) +
+              ((r[4] + r[5]) + (r[6] + r[7]));
+
+        /* add the remaining n % 8 trailing values one at a time */
+        for (; i < n; i++) {
+            res += npy_half_to_float(*((npy_half *)(a + i * stride)));
+        }
+        return res;
+    }
+    else {
+        /* split roughly in half, rounding the first part down to a multiple of 8 (the unroll factor) */
+        npy_intp n2 = n / 2;
+
+        n2 -= n2 % 8;
+        return HALF_pairwise_sum(a, n2, stride) +
+               HALF_pairwise_sum(a + n2 * stride, n - n2, stride);
+    }
+}
+
+
+
+#line 154
+/* similar to pairwise sum of real floats: for even i < n, adds the npy_float at a + i*stride into *rr and the one sizeof(npy_float) after it into *ri */
+static inline void
+CFLOAT_pairwise_sum(npy_float *rr, npy_float * ri, char * a, npy_intp n,
+                    npy_intp stride)
+{
+    assert(n % 2 == 0);  /* values are consumed two at a time (i advances by 2) */
+    if (n < 8) {
+        npy_intp i;
+
+        *rr = -0.0;  /* start from -0 so that summing only -0 values yields -0 */
+        *ri = -0.0;
+        for (i = 0; i < n; i += 2) {
+            *rr += *((npy_float *)(a + i * stride + 0));
+            *ri += *((npy_float *)(a + i * stride + sizeof(npy_float)));
+        }
+        return;
+    }
+    else if (n <= PW_BLOCKSIZE) {
+        npy_intp i;
+        npy_float r[8];
+
+        /*
+         * sum a block with 8 accumulators: even slots feed *rr, odd slots feed *ri
+         * 8 times unroll reduces blocksize to 16 and allows vectorization with
+         * avx without changing summation ordering
+         */
+        r[0] = *((npy_float *)(a + 0 * stride));
+        r[1] = *((npy_float *)(a + 0 * stride + sizeof(npy_float)));
+        r[2] = *((npy_float *)(a + 2 * stride));
+        r[3] = *((npy_float *)(a + 2 * stride + sizeof(npy_float)));
+        r[4] = *((npy_float *)(a + 4 * stride));
+        r[5] = *((npy_float *)(a + 4 * stride + sizeof(npy_float)));
+        r[6] = *((npy_float *)(a + 6 * stride));
+        r[7] = *((npy_float *)(a + 6 * stride + sizeof(npy_float)));
+
+        for (i = 8; i < n - (n % 8); i += 8) {
+            /* small blocksizes seem to mess with hardware prefetch, hence the explicit NPY_PREFETCH */
+            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(npy_float))*stride, 0, 3);
+            r[0] += *((npy_float *)(a + (i + 0) * stride));
+            r[1] += *((npy_float *)(a + (i + 0) * stride + sizeof(npy_float)));
+            r[2] += *((npy_float *)(a + (i + 2) * stride));
+            r[3] += *((npy_float *)(a + (i + 2) * stride + sizeof(npy_float)));
+            r[4] += *((npy_float *)(a + (i + 4) * stride));
+            r[5] += *((npy_float *)(a + (i + 4) * stride + sizeof(npy_float)));
+            r[6] += *((npy_float *)(a + (i + 6) * stride));
+            r[7] += *((npy_float *)(a + (i + 6) * stride + sizeof(npy_float)));
+        }
+
+        /* accumulate now to avoid stack spills for single peel loop */
+        *rr = ((r[0] + r[2]) + (r[4] + r[6]));
+        *ri = ((r[1] + r[3]) + (r[5] + r[7]));
+
+        /* add the remaining n % 8 trailing values, one pair at a time */
+        for (; i < n; i+=2) {
+            *rr += *((npy_float *)(a + i * stride + 0));
+            *ri += *((npy_float *)(a + i * stride + sizeof(npy_float)));
+        }
+        return;
+    }
+    else {
+        /* split roughly in half, rounding the first part down to a multiple of 8 (the unroll factor) */
+        npy_float rr1, ri1, rr2, ri2;
+        npy_intp n2 = n / 2;
+
+        n2 -= n2 % 8;
+        CFLOAT_pairwise_sum(&rr1, &ri1, a, n2, stride);
+        CFLOAT_pairwise_sum(&rr2, &ri2, a + n2 * stride, n - n2, stride);
+        *rr = rr1 + rr2;
+        *ri = ri1 + ri2;
+        return;
+    }
+}
+
+#line 154
+/* similar to pairwise sum of real floats: for even i < n, adds the npy_double at a + i*stride into *rr and the one sizeof(npy_double) after it into *ri */
+static inline void
+CDOUBLE_pairwise_sum(npy_double *rr, npy_double * ri, char * a, npy_intp n,
+                    npy_intp stride)
+{
+    assert(n % 2 == 0);  /* values are consumed two at a time (i advances by 2) */
+    if (n < 8) {
+        npy_intp i;
+
+        *rr = -0.0;  /* start from -0 so that summing only -0 values yields -0 */
+        *ri = -0.0;
+        for (i = 0; i < n; i += 2) {
+            *rr += *((npy_double *)(a + i * stride + 0));
+            *ri += *((npy_double *)(a + i * stride + sizeof(npy_double)));
+        }
+        return;
+    }
+    else if (n <= PW_BLOCKSIZE) {
+        npy_intp i;
+        npy_double r[8];
+
+        /*
+         * sum a block with 8 accumulators: even slots feed *rr, odd slots feed *ri
+         * 8 times unroll reduces blocksize to 16 and allows vectorization with
+         * avx without changing summation ordering
+         */
+        r[0] = *((npy_double *)(a + 0 * stride));
+        r[1] = *((npy_double *)(a + 0 * stride + sizeof(npy_double)));
+        r[2] = *((npy_double *)(a + 2 * stride));
+        r[3] = *((npy_double *)(a + 2 * stride + sizeof(npy_double)));
+        r[4] = *((npy_double *)(a + 4 * stride));
+        r[5] = *((npy_double *)(a + 4 * stride + sizeof(npy_double)));
+        r[6] = *((npy_double *)(a + 6 * stride));
+        r[7] = *((npy_double *)(a + 6 * stride + sizeof(npy_double)));
+
+        for (i = 8; i < n - (n % 8); i += 8) {
+            /* small blocksizes seem to mess with hardware prefetch, hence the explicit NPY_PREFETCH */
+            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(npy_double))*stride, 0, 3);
+            r[0] += *((npy_double *)(a + (i + 0) * stride));
+            r[1] += *((npy_double *)(a + (i + 0) * stride + sizeof(npy_double)));
+            r[2] += *((npy_double *)(a + (i + 2) * stride));
+            r[3] += *((npy_double *)(a + (i + 2) * stride + sizeof(npy_double)));
+            r[4] += *((npy_double *)(a + (i + 4) * stride));
+            r[5] += *((npy_double *)(a + (i + 4) * stride + sizeof(npy_double)));
+            r[6] += *((npy_double *)(a + (i + 6) * stride));
+            r[7] += *((npy_double *)(a + (i + 6) * stride + sizeof(npy_double)));
+        }
+
+        /* accumulate now to avoid stack spills for single peel loop */
+        *rr = ((r[0] + r[2]) + (r[4] + r[6]));
+        *ri = ((r[1] + r[3]) + (r[5] + r[7]));
+
+        /* add the remaining n % 8 trailing values, one pair at a time */
+        for (; i < n; i+=2) {
+            *rr += *((npy_double *)(a + i * stride + 0));
+            *ri += *((npy_double *)(a + i * stride + sizeof(npy_double)));
+        }
+        return;
+    }
+    else {
+        /* split roughly in half, rounding the first part down to a multiple of 8 (the unroll factor) */
+        npy_double rr1, ri1, rr2, ri2;
+        npy_intp n2 = n / 2;
+
+        n2 -= n2 % 8;
+        CDOUBLE_pairwise_sum(&rr1, &ri1, a, n2, stride);
+        CDOUBLE_pairwise_sum(&rr2, &ri2, a + n2 * stride, n - n2, stride);
+        *rr = rr1 + rr2;
+        *ri = ri1 + ri2;
+        return;
+    }
+}
+
+#line 154
+/* similar to pairwise sum of real floats: for even i < n, adds the npy_longdouble at a + i*stride into *rr and the one sizeof(npy_longdouble) after it into *ri */
+static inline void
+CLONGDOUBLE_pairwise_sum(npy_longdouble *rr, npy_longdouble * ri, char * a, npy_intp n,
+                    npy_intp stride)
+{
+    assert(n % 2 == 0);  /* values are consumed two at a time (i advances by 2) */
+    if (n < 8) {
+        npy_intp i;
+
+        *rr = -0.0;  /* start from -0 so that summing only -0 values yields -0 */
+        *ri = -0.0;
+        for (i = 0; i < n; i += 2) {
+            *rr += *((npy_longdouble *)(a + i * stride + 0));
+            *ri += *((npy_longdouble *)(a + i * stride + sizeof(npy_longdouble)));
+        }
+        return;
+    }
+    else if (n <= PW_BLOCKSIZE) {
+        npy_intp i;
+        npy_longdouble r[8];
+
+        /*
+         * sum a block with 8 accumulators: even slots feed *rr, odd slots feed *ri
+         * 8 times unroll reduces blocksize to 16 and allows vectorization with
+         * avx without changing summation ordering
+         */
+        r[0] = *((npy_longdouble *)(a + 0 * stride));
+        r[1] = *((npy_longdouble *)(a + 0 * stride + sizeof(npy_longdouble)));
+        r[2] = *((npy_longdouble *)(a + 2 * stride));
+        r[3] = *((npy_longdouble *)(a + 2 * stride + sizeof(npy_longdouble)));
+        r[4] = *((npy_longdouble *)(a + 4 * stride));
+        r[5] = *((npy_longdouble *)(a + 4 * stride + sizeof(npy_longdouble)));
+        r[6] = *((npy_longdouble *)(a + 6 * stride));
+        r[7] = *((npy_longdouble *)(a + 6 * stride + sizeof(npy_longdouble)));
+
+        for (i = 8; i < n - (n % 8); i += 8) {
+            /* small blocksizes seem to mess with hardware prefetch, hence the explicit NPY_PREFETCH */
+            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(npy_longdouble))*stride, 0, 3);
+            r[0] += *((npy_longdouble *)(a + (i + 0) * stride));
+            r[1] += *((npy_longdouble *)(a + (i + 0) * stride + sizeof(npy_longdouble)));
+            r[2] += *((npy_longdouble *)(a + (i + 2) * stride));
+            r[3] += *((npy_longdouble *)(a + (i + 2) * stride + sizeof(npy_longdouble)));
+            r[4] += *((npy_longdouble *)(a + (i + 4) * stride));
+            r[5] += *((npy_longdouble *)(a + (i + 4) * stride + sizeof(npy_longdouble)));
+            r[6] += *((npy_longdouble *)(a + (i + 6) * stride));
+            r[7] += *((npy_longdouble *)(a + (i + 6) * stride + sizeof(npy_longdouble)));
+        }
+
+        /* accumulate now to avoid stack spills for single peel loop */
+        *rr = ((r[0] + r[2]) + (r[4] + r[6]));
+        *ri = ((r[1] + r[3]) + (r[5] + r[7]));
+
+        /* add the remaining n % 8 trailing values, one pair at a time */
+        for (; i < n; i+=2) {
+            *rr += *((npy_longdouble *)(a + i * stride + 0));
+            *ri += *((npy_longdouble *)(a + i * stride + sizeof(npy_longdouble)));
+        }
+        return;
+    }
+    else {
+        /* split roughly in half, rounding the first part down to a multiple of 8 (the unroll factor) */
+        npy_longdouble rr1, ri1, rr2, ri2;
+        npy_intp n2 = n / 2;
+
+        n2 -= n2 % 8;
+        CLONGDOUBLE_pairwise_sum(&rr1, &ri1, a, n2, stride);
+        CLONGDOUBLE_pairwise_sum(&rr2, &ri2, a + n2 * stride, n - n2, stride);
+        *rr = rr1 + rr2;
+        *ri = ri1 + ri2;
+        return;
+    }
+}
+
+
+#endif // _NPY_UMATH_LOOPS_UTILS_H_
+
diff --git a/nanvix-port/generated-headers/matmul.h b/nanvix-port/generated-headers/matmul.h
new file mode 100644
index 000000000000..9caec1db40e4
--- /dev/null
+++ b/nanvix-port/generated-headers/matmul.h
@@ -0,0 +1,89 @@
+#line 1 "numpy/core/src/umath/matmul.h.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+#line 8
+NPY_NO_EXPORT void
+FLOAT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+DOUBLE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+LONGDOUBLE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+HALF_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+CFLOAT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+CDOUBLE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+CLONGDOUBLE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+UBYTE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+USHORT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+UINT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+ULONG_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+ULONGLONG_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+BYTE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+SHORT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+INT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+LONG_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+LONGLONG_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+BOOL_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+OBJECT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+
diff --git a/nanvix-port/generated-headers/npy_math_internal.h b/nanvix-port/generated-headers/npy_math_internal.h
new file mode 100644
index 000000000000..be4d99df476f
--- /dev/null
+++ b/nanvix-port/generated-headers/npy_math_internal.h
@@ -0,0 +1,2005 @@
+#line 1 "numpy/core/src/npymath/npy_math_internal.h.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*
+ * vim:syntax=c
+ * A small module to implement missing C99 math capabilities required by numpy
+ *
+ * Please keep this independent of python ! Only basic types (npy_longdouble)
+ * can be used, otherwise, pure C, without any use of Python facilities
+ *
+ * How to add a function to this section
+ * -------------------------------------
+ *
+ * Say you want to add `foo`, these are the steps and the reasons for them.
+ *
+ * 1) Add foo to the appropriate list in the configuration system. The
+ *    lists can be found in numpy/core/setup.py lines 63-105. Read the
+ *    comments that come with them, they are very helpful.
+ *
+ * 2) The configuration system will define a macro HAVE_FOO if your function
+ *    can be linked from the math library. The result can depend on the
+ *    optimization flags as well as the compiler, so can't be known ahead of
+ *    time. If the function can't be linked, then either it is absent, defined
+ *    as a macro, or is an intrinsic (hardware) function.
+ *
+ *    i) Undefine any possible macros:
+ *
+ *    #ifdef foo
+ *    #undef foo
+ *    #endif
+ *
+ *    ii) Avoid as much as possible to declare any function here. Declaring
+ *    functions is not portable: some platforms define some function inline
+ *    with a non standard identifier, for example, or may put another
+ *    identifier which changes the calling convention of the function. If you
+ *    really have to, ALWAYS declare it for the one platform you are dealing
+ *    with:
+ *
+ *    Not ok:
+ *        double exp(double a);
+ *
+ *    Ok:
+ *        #ifdef SYMBOL_DEFINED_WEIRD_PLATFORM
+ *        double exp(double);
+ *        #endif
+ *
+ * Some of the code is taken from msun library in FreeBSD, with the following
+ * notice:
+ *
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+#include "npy_math_private.h"
+#ifdef _MSC_VER
+#  include <intrin.h>   // for __popcnt
+#endif
+
+/* Magic binary numbers used by bit_count
+ * For type T, the magic numbers are computed as follows:
+ * Magic[0]: 01 01 01 01 01 01... = (T)~(T)0/3
+ * Magic[1]: 0011 0011 0011...    = (T)~(T)0/15  * 3
+ * Magic[2]: 00001111 00001111... = (T)~(T)0/255 * 15
+ * Magic[3]: 00000001 00000001... = (T)~(T)0/255
+ *
+ * Counting bits set, in parallel
+ * Based on: http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ *
+ * Generic Algorithm for type T:
+ * a = a - ((a >> 1) & (T)~(T)0/3);
+ * a = (a & (T)~(T)0/15*3) + ((a >> 2) & (T)~(T)0/15*3);
+ * a = (a + (a >> 4)) & (T)~(T)0/255*15;
+ * c = (T)(a * ((T)~(T)0/255)) >> (sizeof(T) - 1) * CHAR_BIT;
+*/
+
+static const npy_uint8  MAGIC8[]  = {0x55u,                 0x33u,                 0x0Fu,                 0x01u};
+static const npy_uint16 MAGIC16[] = {0x5555u,               0x3333u,               0x0F0Fu,               0x0101u};
+static const npy_uint32 MAGIC32[] = {0x55555555ul,          0x33333333ul,          0x0F0F0F0Ful,          0x01010101ul};
+static const npy_uint64 MAGIC64[] = {0x5555555555555555ull, 0x3333333333333333ull, 0x0F0F0F0F0F0F0F0Full, 0x0101010101010101ull};
+
+
+/*
+ *****************************************************************************
+ **                    BLOCKLIST-ABLE BASIC MATH FUNCTIONS                  **
+ *****************************************************************************
+ */
+
+/* The following functions can be blocked, even for doubles */
+
+/* Original code by Konrad Hinsen.  */
+/* Taken from FreeBSD mlib, adapted for numpy
+ *
+ * XXX: we could be a bit faster by reusing high/low words for inf/nan
+ * classification instead of calling npy_isinf/npy_isnan: we should have some
+ * macros for this, though, instead of doing it manually
+ */
+NPY_INPLACE double npy_log2(double x)
+{
+#ifdef NPY_BLOCK_LOG2
+    int exponent;
+    double mantissa;
+
+    /* NaN, infinities, zero and negatives: npy_log supplies the result */
+    if (!npy_isfinite(x) || x <= 0.) {
+        return npy_log(x);
+    }
+
+    /*
+     * Fallback modelled on python3.4's math.log2; it provides
+     * int(log(2**i)) == i for i 1-64 in the default rounding mode.
+     * (x is finite and positive from here on.)
+     *
+     * The identity wanted is log2(m * 2**e) == log(m) / log(2) + e.  When x
+     * is just above 1.0, e is 1 and log(m) is negative, so adding the two
+     * terms cancels badly.  Moving one factor of two from the exponent
+     * into the mantissa avoids that.
+     */
+    mantissa = frexp(x, &exponent);
+    if (x < 1.0) {
+        return log(mantissa) / log(2.0) + exponent;
+    }
+    return log(2.0 * mantissa) / log(2.0) + (exponent - 1);
+#else
+    /* log2 is not block-listed: use the C library routine directly */
+    return log2(x);
+#endif
+}
+
+/* Taken from FreeBSD mlib, adapted for numpy
+ *
+ * XXX: we could be a bit faster by reusing high/low words for inf/nan
+ * classification instead of calling npy_isinf/npy_isnan: we should have some
+ * macros for this, though, instead of doing it manually
+ */
+/* XXX: misnamed: the value is the low-order part of pi (msun's pi_lo), not DBL_EPSILON; it belongs in npy_math.h */
+#define NPY_DBL_EPSILON 1.2246467991473531772E-16
+NPY_INPLACE double npy_atan2(double y, double x)
+{
+#ifndef NPY_BLOCK_ATAN2
+    return atan2(y, x);
+#else
+    npy_int32 k, m, iy, ix, hx, hy;
+    npy_uint32 lx,ly;
+    double z;
+    /* fallback adapted from FreeBSD msun; ix/iy are the high words minus sign */
+    EXTRACT_WORDS(hx, lx, x);
+    ix = hx & 0x7fffffff;
+    EXTRACT_WORDS(hy, ly, y);
+    iy = hy & 0x7fffffff;
+
+    /* NaN operand -> NaN; test each one, since x * y is also NaN for 0 * inf */
+    if (npy_isnan(x) || npy_isnan(y)) {
+        return x + y;
+    }
+    /* atan2(y, 1) is just atan(y) */
+    if (x == 1.0) {
+        return npy_atan(y);
+    }
+    /* m: bit 1 is the sign of x, bit 0 is the sign of y */
+    m = 2 * (npy_signbit((x)) != 0) + (npy_signbit((y)) != 0);
+    if (y == 0.0) {
+        switch(m) {
+        case 0:
+        case 1: return  y;  /* atan(+-0,+anything)=+-0 */
+        case 2: return  NPY_PI;/* atan(+0,-anything) = pi */
+        case 3: return -NPY_PI;/* atan(-0,-anything) =-pi */
+        }
+    }
+    /* x is zero and y is not: +-pi/2 according to the sign of y */
+    if (x == 0.0) {
+        return y > 0 ? NPY_PI_2 : -NPY_PI_2;
+    }
+    /* infinite x, paired with an infinite or a finite y */
+    if (npy_isinf(x)) {
+        if (npy_isinf(y)) {
+            switch(m) {
+                case 0: return  NPY_PI_4;/* atan(+INF,+INF) */
+                case 1: return -NPY_PI_4;/* atan(-INF,+INF) */
+                case 2: return  3.0*NPY_PI_4;/*atan(+INF,-INF)*/
+                case 3: return -3.0*NPY_PI_4;/*atan(-INF,-INF)*/
+            }
+        } else {
+            switch(m) {
+                case 0: return  NPY_PZERO;  /* atan(+...,+INF) */
+                case 1: return  NPY_NZERO;  /* atan(-...,+INF) */
+                case 2: return  NPY_PI;  /* atan(+...,-INF) */
+                case 3: return -NPY_PI;  /* atan(-...,-INF) */
+            }
+        }
+    }
+    /* infinite y with a finite x */
+    if (npy_isinf(y)) {
+        return y > 0 ? NPY_PI_2 : -NPY_PI_2;
+    }
+
+    /* compute y/x; k is the difference of the exponent fields of y and x */
+    k = (iy - ix) >> 20;
+    if (k > 60) {            /* |y/x| >  2**60 */
+        z = NPY_PI_2 + 0.5 * NPY_DBL_EPSILON;
+        m &= 1;
+    } else if (hx < 0 && k < -60) {
+        z = 0.0;    /* 0 > |y|/x > -2**-60 */
+    } else {
+        z = npy_atan(npy_fabs(y/x));        /* safe to do y/x */
+    }
+    /* move the first-quadrant angle z into the quadrant selected by m */
+    switch (m) {
+        case 0: return  z  ;    /* atan(+,+) */
+        case 1: return -z  ;    /* atan(-,+) */
+        case 2: return  NPY_PI - (z - NPY_DBL_EPSILON);/* atan(+,-) */
+        default: /* case 3 */
+            return  (z - NPY_DBL_EPSILON) - NPY_PI;/* atan(-,-) */
+    }
+#endif
+}
+
+
+
+
+
+NPY_INPLACE double npy_hypot(double x, double y)
+{
+#ifdef NPY_BLOCK_HYPOT
+    double hi, lo, ratio;
+
+    /* an infinite operand decides the result, even next to a NaN */
+    if (npy_isinf(x) || npy_isinf(y)) {
+        return NPY_INFINITY;
+    }
+    if (npy_isnan(x) || npy_isnan(y)) {
+        return NPY_NAN;
+    }
+    /* order the magnitudes so that hi >= lo */
+    hi = npy_fabs(x);
+    lo = npy_fabs(y);
+    if (hi < lo) {
+        const double swap = hi;
+        hi = lo;
+        lo = swap;
+    }
+    /* both magnitudes are zero: there is nothing to scale by */
+    if (hi == 0.) {
+        return 0.;
+    }
+    /* pull the larger magnitude out of the root before squaring */
+    ratio = lo/hi;
+    return hi*npy_sqrt(1.+ratio*ratio);
+#else
+    return hypot(x, y);
+#endif
+}
+
+/*
+ *
+ * sin, cos, tan
+ * sinh, cosh, tanh,
+ * fabs, floor, ceil, rint, trunc
+ * sqrt, log10, log, exp, expm1
+ * asin, acos, atan,
+ * asinh, acosh, atanh
+ *
+ * hypot, atan2, pow, fmod, modf
+ * ldexp, frexp, cbrt
+ *
+ * We assume the above are always available in their double versions.
+ *
+ * NOTE: some facilities may be available only as macros instead of functions.
+ * For simplicity, we define our own functions and undef the macros. We could
+ * instead test for the macros, but that has not been done yet.
+ */
+
+
+/*
+ * Decorate all the math functions which are available on the current platform
+ */
+
+#line 285
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, l)
+#endif
+/*
+ * On arm64 macOS, there's a bug with sin, cos, and tan where they don't
+ * raise "invalid" when given INFINITY as input.
+ */
+#if defined(__APPLE__) && defined(__arm64__)
+#define WORKAROUND_APPLE_TRIG_BUG 1
+#else
+#define WORKAROUND_APPLE_TRIG_BUG 0
+#endif
+
+#line 305
+NPY_INPLACE npy_longdouble npy_sinl(npy_longdouble x)
+{
+#if WORKAROUND_APPLE_TRIG_BUG != 0
+    if (!npy_isfinite(x)) {
+        return x - x;  /* inf or NaN in: answer NaN without calling libm */
+    }
+#endif
+    return NPY__FP_SFX(sin)(x);  /* sin or sinl, as NPY__FP_SFX resolves */
+}
+
+
+#line 305
+NPY_INPLACE npy_longdouble npy_cosl(npy_longdouble x)
+{
+#if WORKAROUND_APPLE_TRIG_BUG != 0
+    if (!npy_isfinite(x)) {
+        return x - x;  /* inf or NaN in: answer NaN without calling libm */
+    }
+#endif
+    return NPY__FP_SFX(cos)(x);  /* cos or cosl, as NPY__FP_SFX resolves */
+}
+
+
+#line 305
+NPY_INPLACE npy_longdouble npy_tanl(npy_longdouble x)
+{
+#if WORKAROUND_APPLE_TRIG_BUG != 0
+    if (!npy_isfinite(x)) {
+        return x - x;  /* inf or NaN in: answer NaN without calling libm */
+    }
+#endif
+    return NPY__FP_SFX(tan)(x);  /* tan or tanl, as NPY__FP_SFX resolves */
+}
+
+
+
+#undef WORKAROUND_APPLE_TRIG_BUG
+
+
+#line 285
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_DOUBLE == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, )
+#endif
+/*
+ * On arm64 macOS, there's a bug with sin, cos, and tan where they don't
+ * raise "invalid" when given INFINITY as input.
+ */
+#if defined(__APPLE__) && defined(__arm64__)
+#define WORKAROUND_APPLE_TRIG_BUG 1
+#else
+#define WORKAROUND_APPLE_TRIG_BUG 0
+#endif
+
+#line 305
+NPY_INPLACE npy_double npy_sin(npy_double x)
+{
+#if WORKAROUND_APPLE_TRIG_BUG != 0
+    if (!npy_isfinite(x)) {
+        return x - x;  /* inf or NaN in: answer NaN without calling libm */
+    }
+#endif
+    return NPY__FP_SFX(sin)(x);  /* plain sin: no suffix is added for double */
+}
+
+
+#line 305
+NPY_INPLACE npy_double npy_cos(npy_double x)
+{
+#if WORKAROUND_APPLE_TRIG_BUG != 0
+    if (!npy_isfinite(x)) {
+        return x - x;  /* inf or NaN in: answer NaN without calling libm */
+    }
+#endif
+    return NPY__FP_SFX(cos)(x);  /* plain cos: no suffix is added for double */
+}
+
+
+#line 305
+NPY_INPLACE npy_double npy_tan(npy_double x)
+{
+#if WORKAROUND_APPLE_TRIG_BUG != 0
+    if (!npy_isfinite(x)) {
+        return x - x;  /* inf or NaN in: answer NaN without calling libm */
+    }
+#endif
+    return NPY__FP_SFX(tan)(x);  /* plain tan: no suffix is added for double */
+}
+
+
+
+#undef WORKAROUND_APPLE_TRIG_BUG
+
+
+#line 285
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_FLOAT == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, f)
+#endif
+/*
+ * On arm64 macOS, there's a bug with sin, cos, and tan where they don't
+ * raise "invalid" when given INFINITY as input.
+ */
+#if defined(__APPLE__) && defined(__arm64__)
+#define WORKAROUND_APPLE_TRIG_BUG 1
+#else
+#define WORKAROUND_APPLE_TRIG_BUG 0
+#endif
+
+#line 305
+NPY_INPLACE npy_float npy_sinf(npy_float x)
+{
+#if WORKAROUND_APPLE_TRIG_BUG != 0
+    if (!npy_isfinite(x)) {
+        return x - x;  /* inf or NaN in: answer NaN without calling libm */
+    }
+#endif
+    return NPY__FP_SFX(sin)(x);  /* sin or sinf, as NPY__FP_SFX resolves */
+}
+
+
+#line 305
+NPY_INPLACE npy_float npy_cosf(npy_float x)
+{
+#if WORKAROUND_APPLE_TRIG_BUG != 0
+    if (!npy_isfinite(x)) {
+        return x - x;  /* inf or NaN in: answer NaN without calling libm */
+    }
+#endif
+    return NPY__FP_SFX(cos)(x);  /* cos or cosf, as NPY__FP_SFX resolves */
+}
+
+
+#line 305
+NPY_INPLACE npy_float npy_tanf(npy_float x)
+{
+#if WORKAROUND_APPLE_TRIG_BUG != 0
+    if (!npy_isfinite(x)) {
+        return x - x;  /* inf or NaN in: answer NaN without calling libm */
+    }
+#endif
+    return NPY__FP_SFX(tan)(x);  /* tan or tanf, as NPY__FP_SFX resolves */
+}
+
+
+
+#undef WORKAROUND_APPLE_TRIG_BUG
+
+
+
+/* Blocklist-able C99 functions */
+
+#line 329
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_FLOAT == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, f)
+#endif
+
+#line 340
+
+#ifdef expf
+#undef expf
+#endif
+/*
+ * npy_expf: exponential at npy_float precision.
+ */
+NPY_INPLACE npy_float npy_expf(npy_float x)
+{
+#ifdef NPY_BLOCK_EXPF
+    /* expf is block-listed: evaluate npy_exp in double and convert back */
+    return (npy_float) npy_exp((double)x);
+#else
+    /* expf is not blocked: NPY__FP_SFX resolves to exp or expf */
+    return NPY__FP_SFX(exp)(x);
+#endif
+}
+
+
+#line 340
+
+#ifdef log2f
+#undef log2f
+#endif
+/*
+ * npy_log2f: base-2 logarithm at npy_float precision.
+ */
+NPY_INPLACE npy_float npy_log2f(npy_float x)
+{
+#ifdef NPY_BLOCK_LOG2F
+    /* log2f is block-listed: evaluate npy_log2 in double and convert back */
+    return (npy_float) npy_log2((double)x);
+#else
+    /* log2f is not blocked: NPY__FP_SFX resolves to log2 or log2f */
+    return NPY__FP_SFX(log2)(x);
+#endif
+}
+
+
+#line 340
+
+#ifdef sqrtf
+#undef sqrtf
+#endif
+/*
+ * npy_sqrtf: square root at npy_float precision.
+ */
+NPY_INPLACE npy_float npy_sqrtf(npy_float x)
+{
+#ifdef NPY_BLOCK_SQRTF
+    /* sqrtf is block-listed: evaluate npy_sqrt in double and convert back */
+    return (npy_float) npy_sqrt((double)x);
+#else
+    /* sqrtf is not blocked: NPY__FP_SFX resolves to sqrt or sqrtf */
+    return NPY__FP_SFX(sqrt)(x);
+#endif
+}
+
+
+
+
+#line 365
+#ifdef atan2f
+#undef atan2f
+#endif
+/*
+ * npy_atan2f: forwards (x, y) positionally, so `x` is atan2's first argument.
+ */
+NPY_INPLACE npy_float npy_atan2f(npy_float x, npy_float y)
+{
+#ifdef NPY_BLOCK_ATAN2F
+    /* atan2f is block-listed: evaluate npy_atan2 in double and convert back */
+    return (npy_float) npy_atan2((double)x, (double) y);
+#else
+    /* atan2f is not blocked: NPY__FP_SFX resolves to atan2 or atan2f */
+    return NPY__FP_SFX(atan2)(x, y);
+#endif
+}
+
+#line 365
+#ifdef hypotf
+#undef hypotf
+#endif
+/*
+ * npy_hypotf: hypot of (x, y) at npy_float precision.
+ */
+NPY_INPLACE npy_float npy_hypotf(npy_float x, npy_float y)
+{
+#ifdef NPY_BLOCK_HYPOTF
+    /* hypotf is block-listed: evaluate npy_hypot in double and convert back */
+    return (npy_float) npy_hypot((double)x, (double) y);
+#else
+    /* hypotf is not blocked: NPY__FP_SFX resolves to hypot or hypotf */
+    return NPY__FP_SFX(hypot)(x, y);
+#endif
+}
+
+#line 365
+#ifdef powf
+#undef powf
+#endif
+/*
+ * npy_powf: pow(x, y) at npy_float precision.
+ */
+NPY_INPLACE npy_float npy_powf(npy_float x, npy_float y)
+{
+#ifdef NPY_BLOCK_POWF
+    /* powf is block-listed: evaluate npy_pow in double and convert back */
+    return (npy_float) npy_pow((double)x, (double) y);
+#else
+    /* powf is not blocked: NPY__FP_SFX resolves to pow or powf */
+    return NPY__FP_SFX(pow)(x, y);
+#endif
+}
+
+
+#ifdef modff
+#undef modff
+#endif
+/*
+ * npy_modff: modf at npy_float precision; the integral part goes to *iptr.
+ */
+NPY_INPLACE npy_float npy_modff(npy_float x, npy_float *iptr)
+{
+#ifdef NPY_BLOCK_MODFF
+    /* modff is block-listed: run npy_modf in double and convert both outputs */
+    double whole;
+    const double frac = npy_modf((double)x, &whole);
+    *iptr = (npy_float) whole;
+    return (npy_float) frac;
+#else
+    /* modff is not blocked: NPY__FP_SFX resolves to modf or modff */
+    return NPY__FP_SFX(modf)(x, iptr);
+#endif
+}
+
+
+
+#line 329
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, l)
+#endif
+
+#line 340
+
+#ifdef expl
+#undef expl
+#endif
+/*
+ * npy_expl: exponential for npy_longdouble.
+ */
+NPY_INPLACE npy_longdouble npy_expl(npy_longdouble x)
+{
+#ifdef NPY_BLOCK_EXPL
+    /* expl is block-listed: evaluate npy_exp in double and convert back */
+    return (npy_longdouble) npy_exp((double)x);
+#else
+    /* expl is not blocked: NPY__FP_SFX resolves to exp or expl */
+    return NPY__FP_SFX(exp)(x);
+#endif
+}
+
+
+#line 340
+
+#ifdef log2l
+#undef log2l
+#endif
+/*
+ * npy_log2l: base-2 logarithm for npy_longdouble.
+ */
+NPY_INPLACE npy_longdouble npy_log2l(npy_longdouble x)
+{
+#ifdef NPY_BLOCK_LOG2L
+    /* log2l is block-listed: evaluate npy_log2 in double and convert back */
+    return (npy_longdouble) npy_log2((double)x);
+#else
+    /* log2l is not blocked: NPY__FP_SFX resolves to log2 or log2l */
+    return NPY__FP_SFX(log2)(x);
+#endif
+}
+
+
+#line 340
+
+#ifdef sqrtl
+#undef sqrtl
+#endif
+/*
+ * npy_sqrtl: square root for npy_longdouble.
+ */
+NPY_INPLACE npy_longdouble npy_sqrtl(npy_longdouble x)
+{
+#ifdef NPY_BLOCK_SQRTL
+    /* sqrtl is block-listed: evaluate npy_sqrt in double and convert back */
+    return (npy_longdouble) npy_sqrt((double)x);
+#else
+    /* sqrtl is not blocked: NPY__FP_SFX resolves to sqrt or sqrtl */
+    return NPY__FP_SFX(sqrt)(x);
+#endif
+}
+
+
+
+
+#line 365
+#ifdef atan2l
+#undef atan2l
+#endif
+/*
+ * npy_atan2l: forwards (x, y) positionally, so `x` is atan2's first argument.
+ */
+NPY_INPLACE npy_longdouble npy_atan2l(npy_longdouble x, npy_longdouble y)
+{
+#ifdef NPY_BLOCK_ATAN2L
+    /* atan2l is block-listed: evaluate npy_atan2 in double and convert back */
+    return (npy_longdouble) npy_atan2((double)x, (double) y);
+#else
+    /* atan2l is not blocked: NPY__FP_SFX resolves to atan2 or atan2l */
+    return NPY__FP_SFX(atan2)(x, y);
+#endif
+}
+
+#line 365
+#ifdef hypotl
+#undef hypotl
+#endif
+/*
+ * npy_hypotl: hypot of (x, y) for npy_longdouble.
+ */
+NPY_INPLACE npy_longdouble npy_hypotl(npy_longdouble x, npy_longdouble y)
+{
+#ifdef NPY_BLOCK_HYPOTL
+    /* hypotl is block-listed: evaluate npy_hypot in double and convert back */
+    return (npy_longdouble) npy_hypot((double)x, (double) y);
+#else
+    /* hypotl is not blocked: NPY__FP_SFX resolves to hypot or hypotl */
+    return NPY__FP_SFX(hypot)(x, y);
+#endif
+}
+
+#line 365
+#ifdef powl
+#undef powl
+#endif
+/*
+ * npy_powl: pow(x, y) for npy_longdouble.
+ */
+NPY_INPLACE npy_longdouble npy_powl(npy_longdouble x, npy_longdouble y)
+{
+#ifdef NPY_BLOCK_POWL
+    /* powl is block-listed: evaluate npy_pow in double and convert back */
+    return (npy_longdouble) npy_pow((double)x, (double) y);
+#else
+    /* powl is not blocked: NPY__FP_SFX resolves to pow or powl */
+    return NPY__FP_SFX(pow)(x, y);
+#endif
+}
+
+
+#ifdef modfl
+#undef modfl
+#endif
+/*
+ * npy_modfl: modf for npy_longdouble; the integral part goes to *iptr.
+ */
+NPY_INPLACE npy_longdouble npy_modfl(npy_longdouble x, npy_longdouble *iptr)
+{
+#ifdef NPY_BLOCK_MODFL
+    /* modfl is block-listed: run npy_modf in double and convert both outputs */
+    double whole;
+    const double frac = npy_modf((double)x, &whole);
+    *iptr = (npy_longdouble) whole;
+    return (npy_longdouble) frac;
+#else
+    /* modfl is not blocked: NPY__FP_SFX resolves to modf or modfl */
+    return NPY__FP_SFX(modf)(x, iptr);
+#endif
+}
+
+
+
+
+
+#undef NPY__FP_SFX
+
+
+/*
+ * Non standard functions
+ */
+
+#line 420
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_FLOAT == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, f)
+#endif
+npy_float npy_heavisidef(npy_float x, npy_float h0)
+{
+    /* NaN in, NaN out */
+    if (npy_isnan(x)) {
+        return (npy_float) NPY_NAN;
+    }
+    /* exactly at zero the caller-supplied h0 is the answer */
+    if (x == 0) {
+        return h0;
+    }
+    if (x < 0) {
+        return (npy_float) 0.0;
+    }
+    return (npy_float) 1.0;
+}
+
+#define LOGE2    NPY__FP_SFX(NPY_LOGE2)
+#define LOG2E    NPY__FP_SFX(NPY_LOG2E)
+#define RAD2DEG  (NPY__FP_SFX(180.0)/NPY__FP_SFX(NPY_PI))
+#define DEG2RAD  (NPY__FP_SFX(NPY_PI)/NPY__FP_SFX(180.0))
+
+NPY_INPLACE npy_float npy_rad2degf(npy_float x)
+{
+    return RAD2DEG*x;  /* RAD2DEG is 180/pi at this precision */
+}
+
+NPY_INPLACE npy_float npy_deg2radf(npy_float x)
+{
+    return DEG2RAD*x;  /* DEG2RAD is pi/180 at this precision */
+}
+
+NPY_INPLACE npy_float npy_log2_1pf(npy_float x)
+{
+    return npy_log1pf(x)*LOG2E;  /* log2(1 + x) = log2(e) * log1p(x) */
+}
+
+NPY_INPLACE npy_float npy_exp2_m1f(npy_float x)
+{
+    return npy_expm1f(x*LOGE2);  /* 2**x - 1 = expm1(x * ln 2) */
+}
+
+NPY_INPLACE npy_float npy_logaddexpf(npy_float x, npy_float y)
+{
+    npy_float gap;
+
+    /* equal inputs (same-signed infinities included): skip x - y entirely */
+    if (x == y) {
+        return x + LOGE2;
+    }
+
+    gap = x - y;
+    /* anchor on the larger operand so that exp only sees values <= 0 */
+    if (gap > 0) {
+        return x + npy_log1pf(npy_expf(-gap));
+    }
+    if (gap <= 0) {
+        return y + npy_log1pf(npy_expf(gap));
+    }
+    /* neither comparison held, so gap is NaN: hand it back */
+    return gap;
+}
+
+NPY_INPLACE npy_float npy_logaddexp2f(npy_float x, npy_float y)
+{
+    npy_float gap;
+
+    /* equal inputs (same-signed infinities included): skip x - y entirely */
+    if (x == y) {
+        return x + 1;
+    }
+
+    gap = x - y;
+    /* anchor on the larger operand so that exp2 only sees values <= 0 */
+    if (gap > 0) {
+        return x + npy_log2_1pf(npy_exp2f(-gap));
+    }
+    if (gap <= 0) {
+        return y + npy_log2_1pf(npy_exp2f(gap));
+    }
+    /* neither comparison held, so gap is NaN: hand it back */
+    return gap;
+}
+
+/*
+ * Wrapper function for remainder edge cases
+ * Internally calls npy_divmod*
+ */
+NPY_INPLACE npy_float
+npy_remainderf(npy_float a, npy_float b)
+{
+    npy_float rem;
+
+    /*
+     * Zero divisor (b is not NaN here): plain fmod already produces the
+     * NaN result, whereas divmod could raise an extra FPE for the
+     * division by zero that creates an inf.
+     */
+    if (NPY_UNLIKELY(!b)) {
+        return npy_fmodf(a, b);
+    }
+    /* regular path: only the modulus half of divmod is wanted */
+    npy_divmodf(a, b, &rem);
+    return rem;
+}
+
+NPY_INPLACE npy_float
+npy_floor_dividef(npy_float a, npy_float b) {
+    npy_float unused_mod;
+
+    /*
+     * Zero divisor (b is not NaN here): ordinary division already yields
+     * the Inf or NaN result, whereas divmod could raise an extra FPE when
+     * its modulo evaluates to NaN.
+     */
+    if (NPY_UNLIKELY(!b)) {
+        return a / b;
+    }
+
+    /* regular path: keep the quotient and drop the modulus */
+    return npy_divmodf(a, b, &unused_mod);
+}
+
+/*
+ * Python version of divmod.
+ *
+ * The implementation is mostly copied from cpython 3.5.
+ */
+NPY_INPLACE npy_float
+npy_divmodf(npy_float a, npy_float b, npy_float *modulus)
+{
+    npy_float div, mod, floordiv;
+    /* returns the floored quotient of a / b; the remainder goes to *modulus */
+    mod = npy_fmodf(a, b);
+    if (NPY_UNLIKELY(!b)) {
+        /* b == 0 (not NaN): store fmod's result (NaN under IEEE), return a / b */
+        *modulus = mod;
+        return a / b;
+    }
+
+    /* a - mod should be very nearly an integer multiple of b */
+    div = (a - mod) / b;
+
+    /* adjust fmod result to conform to Python convention of remainder */
+    if (mod) {
+        if (isless(b, (npy_float)0) != isless(mod, (npy_float)0)) {
+            mod += b;  /* signs differ: shift the remainder onto b's side ... */
+            div -= 1.0f;  /* ... and take one step off the quotient */
+        }
+    }
+    else {
+        /* zero remainder: give it the sign of b */
+        mod = npy_copysignf(0, b);
+    }
+
+    /* snap quotient to nearest integral value */
+    if (div) {
+        floordiv = npy_floorf(div);
+        if (isgreater(div - floordiv, 0.5f))
+            floordiv += 1.0f;  /* div sat more than half a step above its floor */
+    }
+    else {
+        /* zero quotient: give it the sign of a/b */
+        floordiv = npy_copysignf(0, a/b);
+    }
+
+    *modulus = mod;
+    return floordiv;
+}
+
+#undef LOGE2
+#undef LOG2E
+#undef RAD2DEG
+#undef DEG2RAD
+#undef NPY__FP_SFX
+
+#line 420
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_DOUBLE == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, )
+#endif
+npy_double npy_heaviside(npy_double x, npy_double h0)
+{
+    /* NaN in, NaN out */
+    if (npy_isnan(x)) {
+        return (npy_double) NPY_NAN;
+    }
+    /* exactly at zero the caller-supplied h0 is the answer */
+    if (x == 0) {
+        return h0;
+    }
+    if (x < 0) {
+        return (npy_double) 0.0;
+    }
+    return (npy_double) 1.0;
+}
+
+#define LOGE2    NPY__FP_SFX(NPY_LOGE2)
+#define LOG2E    NPY__FP_SFX(NPY_LOG2E)
+#define RAD2DEG  (NPY__FP_SFX(180.0)/NPY__FP_SFX(NPY_PI))
+#define DEG2RAD  (NPY__FP_SFX(NPY_PI)/NPY__FP_SFX(180.0))
+
+NPY_INPLACE npy_double npy_rad2deg(npy_double x)
+{
+    return RAD2DEG*x;  /* RAD2DEG is 180/pi at this precision */
+}
+
+NPY_INPLACE npy_double npy_deg2rad(npy_double x)
+{
+    return DEG2RAD*x;  /* DEG2RAD is pi/180 at this precision */
+}
+
+NPY_INPLACE npy_double npy_log2_1p(npy_double x)
+{
+    return npy_log1p(x)*LOG2E;  /* log2(1 + x) = log2(e) * log1p(x) */
+}
+
+NPY_INPLACE npy_double npy_exp2_m1(npy_double x)
+{
+    return npy_expm1(x*LOGE2);  /* 2**x - 1 = expm1(x * ln 2) */
+}
+
+NPY_INPLACE npy_double npy_logaddexp(npy_double x, npy_double y)
+{
+    npy_double gap;
+
+    /* equal inputs (same-signed infinities included): skip x - y entirely */
+    if (x == y) {
+        return x + LOGE2;
+    }
+
+    gap = x - y;
+    /* anchor on the larger operand so that exp only sees values <= 0 */
+    if (gap > 0) {
+        return x + npy_log1p(npy_exp(-gap));
+    }
+    if (gap <= 0) {
+        return y + npy_log1p(npy_exp(gap));
+    }
+    /* neither comparison held, so gap is NaN: hand it back */
+    return gap;
+}
+
+NPY_INPLACE npy_double npy_logaddexp2(npy_double x, npy_double y)
+{
+    npy_double gap;
+
+    /* equal inputs (same-signed infinities included): skip x - y entirely */
+    if (x == y) {
+        return x + 1;
+    }
+
+    gap = x - y;
+    /* anchor on the larger operand so that exp2 only sees values <= 0 */
+    if (gap > 0) {
+        return x + npy_log2_1p(npy_exp2(-gap));
+    }
+    if (gap <= 0) {
+        return y + npy_log2_1p(npy_exp2(gap));
+    }
+    /* neither comparison held, so gap is NaN: hand it back */
+    return gap;
+}
+
+/*
+ * Wrapper function for remainder edge cases
+ * Internally calls npy_divmod*
+ */
+NPY_INPLACE npy_double
+npy_remainder(npy_double a, npy_double b)
+{
+    npy_double rem;
+
+    /*
+     * Zero divisor (b is not NaN here): plain fmod already produces the
+     * NaN result, whereas divmod could raise an extra FPE for the
+     * division by zero that creates an inf.
+     */
+    if (NPY_UNLIKELY(!b)) {
+        return npy_fmod(a, b);
+    }
+    /* regular path: only the modulus half of divmod is wanted */
+    npy_divmod(a, b, &rem);
+    return rem;
+}
+
+NPY_INPLACE npy_double
+npy_floor_divide(npy_double a, npy_double b) {
+    npy_double unused_mod;
+
+    /*
+     * Zero divisor (b is not NaN here): ordinary division already yields
+     * the Inf or NaN result, whereas divmod could raise an extra FPE when
+     * its modulo evaluates to NaN.
+     */
+    if (NPY_UNLIKELY(!b)) {
+        return a / b;
+    }
+
+    /* regular path: keep the quotient and drop the modulus */
+    return npy_divmod(a, b, &unused_mod);
+}
+
+/*
+ * Python version of divmod.
+ *
+ * The implementation is mostly copied from cpython 3.5.
+ */
+NPY_INPLACE npy_double
+npy_divmod(npy_double a, npy_double b, npy_double *modulus)
+{
+    npy_double div, mod, floordiv;
+    /* returns the floored quotient of a / b; the remainder goes to *modulus */
+    mod = npy_fmod(a, b);
+    if (NPY_UNLIKELY(!b)) {
+        /* b == 0 (not NaN): store fmod's result (NaN under IEEE), return a / b */
+        *modulus = mod;
+        return a / b;
+    }
+
+    /* a - mod should be very nearly an integer multiple of b */
+    div = (a - mod) / b;
+
+    /* adjust fmod result to conform to Python convention of remainder */
+    if (mod) {
+        if (isless(b, (npy_double)0) != isless(mod, (npy_double)0)) {
+            mod += b;  /* signs differ: shift the remainder onto b's side ... */
+            div -= 1.0;  /* ... and take one step off the quotient */
+        }
+    }
+    else {
+        /* zero remainder: give it the sign of b */
+        mod = npy_copysign(0, b);
+    }
+
+    /* snap quotient to nearest integral value */
+    if (div) {
+        floordiv = npy_floor(div);
+        if (isgreater(div - floordiv, 0.5))
+            floordiv += 1.0;  /* div sat more than half a step above its floor */
+    }
+    else {
+        /* zero quotient: give it the sign of a/b */
+        floordiv = npy_copysign(0, a/b);
+    }
+
+    *modulus = mod;
+    return floordiv;
+}
+
+#undef LOGE2
+#undef LOG2E
+#undef RAD2DEG
+#undef DEG2RAD
+#undef NPY__FP_SFX
+
+#line 420
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, l)
+#endif
+npy_longdouble npy_heavisidel(npy_longdouble x, npy_longdouble h0)
+{
+    /* NaN in, NaN out */
+    if (npy_isnan(x)) {
+        return (npy_longdouble) NPY_NAN;
+    }
+    /* exactly at zero the caller-supplied h0 is the answer */
+    if (x == 0) {
+        return h0;
+    }
+    if (x < 0) {
+        return (npy_longdouble) 0.0;
+    }
+    return (npy_longdouble) 1.0;
+}
+
+#define LOGE2    NPY__FP_SFX(NPY_LOGE2)
+#define LOG2E    NPY__FP_SFX(NPY_LOG2E)
+#define RAD2DEG  (NPY__FP_SFX(180.0)/NPY__FP_SFX(NPY_PI))
+#define DEG2RAD  (NPY__FP_SFX(NPY_PI)/NPY__FP_SFX(180.0))
+
+NPY_INPLACE npy_longdouble npy_rad2degl(npy_longdouble x)
+{
+    return RAD2DEG*x;  /* RAD2DEG is 180/pi at this precision */
+}
+
+NPY_INPLACE npy_longdouble npy_deg2radl(npy_longdouble x)
+{
+    return DEG2RAD*x;  /* DEG2RAD is pi/180 at this precision */
+}
+
+NPY_INPLACE npy_longdouble npy_log2_1pl(npy_longdouble x)
+{
+    return npy_log1pl(x)*LOG2E;  /* log2(1 + x) = log2(e) * log1p(x) */
+}
+
+NPY_INPLACE npy_longdouble npy_exp2_m1l(npy_longdouble x)
+{
+    return npy_expm1l(x*LOGE2);  /* 2**x - 1 = expm1(x * ln 2) */
+}
+
+NPY_INPLACE npy_longdouble npy_logaddexpl(npy_longdouble x, npy_longdouble y)
+{
+    npy_longdouble gap;
+
+    /* equal inputs (same-signed infinities included): skip x - y entirely */
+    if (x == y) {
+        return x + LOGE2;
+    }
+
+    gap = x - y;
+    /* anchor on the larger operand so that exp only sees values <= 0 */
+    if (gap > 0) {
+        return x + npy_log1pl(npy_expl(-gap));
+    }
+    if (gap <= 0) {
+        return y + npy_log1pl(npy_expl(gap));
+    }
+    /* neither comparison held, so gap is NaN: hand it back */
+    return gap;
+}
+
+NPY_INPLACE npy_longdouble npy_logaddexp2l(npy_longdouble x, npy_longdouble y)
+{
+    npy_longdouble gap;
+
+    /* equal inputs (same-signed infinities included): skip x - y entirely */
+    if (x == y) {
+        return x + 1;
+    }
+
+    gap = x - y;
+    /* anchor on the larger operand so that exp2 only sees values <= 0 */
+    if (gap > 0) {
+        return x + npy_log2_1pl(npy_exp2l(-gap));
+    }
+    if (gap <= 0) {
+        return y + npy_log2_1pl(npy_exp2l(gap));
+    }
+    /* neither comparison held, so gap is NaN: hand it back */
+    return gap;
+}
+
+/*
+ * Wrapper function for remainder edge cases
+ * Internally calls npy_divmod*
+ */
+NPY_INPLACE npy_longdouble
+npy_remainderl(npy_longdouble a, npy_longdouble b)
+{
+    npy_longdouble rem;
+
+    /*
+     * Zero divisor (b is not NaN here): plain fmod already produces the
+     * NaN result, whereas divmod could raise an extra FPE for the
+     * division by zero that creates an inf.
+     */
+    if (NPY_UNLIKELY(!b)) {
+        return npy_fmodl(a, b);
+    }
+    /* regular path: only the modulus half of divmod is wanted */
+    npy_divmodl(a, b, &rem);
+    return rem;
+}
+
+NPY_INPLACE npy_longdouble
+npy_floor_dividel(npy_longdouble a, npy_longdouble b) {
+    npy_longdouble unused_mod;
+
+    /*
+     * Zero divisor (b is not NaN here): ordinary division already yields
+     * the Inf or NaN result, whereas divmod could raise an extra FPE when
+     * its modulo evaluates to NaN.
+     */
+    if (NPY_UNLIKELY(!b)) {
+        return a / b;
+    }
+
+    /* regular path: keep the quotient and drop the modulus */
+    return npy_divmodl(a, b, &unused_mod);
+}
+
+/*
+ * Python version of divmod.
+ *
+ * The implementation is mostly copied from cpython 3.5.
+ */
+NPY_INPLACE npy_longdouble
+npy_divmodl(npy_longdouble a, npy_longdouble b, npy_longdouble *modulus)
+{
+    npy_longdouble div, mod, floordiv;
+    /* returns the floored quotient of a / b; the remainder goes to *modulus */
+    mod = npy_fmodl(a, b);
+    if (NPY_UNLIKELY(!b)) {
+        /* b == 0 (not NaN): store fmod's result (NaN under IEEE), return a / b */
+        *modulus = mod;
+        return a / b;
+    }
+
+    /* a - mod should be very nearly an integer multiple of b */
+    div = (a - mod) / b;
+
+    /* adjust fmod result to conform to Python convention of remainder */
+    if (mod) {
+        if (isless(b, (npy_longdouble)0) != isless(mod, (npy_longdouble)0)) {
+            mod += b;  /* signs differ: shift the remainder onto b's side ... */
+            div -= 1.0l;  /* ... and take one step off the quotient */
+        }
+    }
+    else {
+        /* zero remainder: give it the sign of b */
+        mod = npy_copysignl(0, b);
+    }
+
+    /* snap quotient to nearest integral value */
+    if (div) {
+        floordiv = npy_floorl(div);
+        if (isgreater(div - floordiv, 0.5l))
+            floordiv += 1.0l;  /* div sat more than half a step above its floor */
+    }
+    else {
+        /* zero quotient: give it the sign of a/b */
+        floordiv = npy_copysignl(0, a/b);
+    }
+
+    *modulus = mod;
+    return floordiv;
+}
+
+#undef LOGE2
+#undef LOG2E
+#undef RAD2DEG
+#undef DEG2RAD
+#undef NPY__FP_SFX
+
+
+#line 607
+NPY_INPLACE npy_uint
+npy_gcdu(npy_uint a, npy_uint b)
+{
+    /* Euclid: swap (a, b) for (b mod a, a) until a reaches zero */
+    while (a) {
+        const npy_uint prev = a;
+        a = b % prev;
+        b = prev;
+    }
+    return b;
+}
+
+NPY_INPLACE npy_uint
+npy_lcmu(npy_uint a, npy_uint b)
+{
+    const npy_uint divisor = npy_gcdu(a, b);  /* zero only when a == b == 0 */
+    return divisor != 0 ? (a / divisor) * b : 0;
+}
+
+#line 607
+NPY_INPLACE npy_ulong
+npy_gcdul(npy_ulong a, npy_ulong b)
+{
+    /* Euclid: swap (a, b) for (b mod a, a) until a reaches zero */
+    while (a) {
+        const npy_ulong prev = a;
+        a = b % prev;
+        b = prev;
+    }
+    return b;
+}
+
+NPY_INPLACE npy_ulong
+npy_lcmul(npy_ulong a, npy_ulong b)
+{
+    const npy_ulong divisor = npy_gcdul(a, b);  /* zero only when a == b == 0 */
+    return divisor != 0 ? (a / divisor) * b : 0;
+}
+
+#line 607
+NPY_INPLACE npy_ulonglong
+npy_gcdull(npy_ulonglong a, npy_ulonglong b)
+{
+    /* Euclid: swap (a, b) for (b mod a, a) until a reaches zero */
+    while (a) {
+        const npy_ulonglong prev = a;
+        a = b % prev;
+        b = prev;
+    }
+    return b;
+}
+
+NPY_INPLACE npy_ulonglong
+npy_lcmull(npy_ulonglong a, npy_ulonglong b)
+{
+    const npy_ulonglong divisor = npy_gcdull(a, b);  /* zero only when a == b == 0 */
+    return divisor != 0 ? (a / divisor) * b : 0;
+}
+
+
+#line 633
+/* |a| and |b| are formed in unsigned arithmetic: -a is signed overflow (UB) for the minimum value */
+NPY_INPLACE npy_int npy_gcd(npy_int a, npy_int b)
+{
+    return npy_gcdu(a < 0 ? 0 - (npy_uint)a : (npy_uint)a, b < 0 ? 0 - (npy_uint)b : (npy_uint)b);
+}
+
+#line 633
+/*
+ * Greatest common divisor of two npy_long values, computed on their
+ * magnitudes by npy_gcdul.
+ *
+ * The magnitudes are formed in unsigned arithmetic: negating the most
+ * negative value directly (-a) is a signed overflow, which is undefined
+ * behavior in C, whereas 0 - (npy_ulong)a is well defined.
+ * The unsigned result is converted back to npy_long on return.
+ */
+NPY_INPLACE npy_long
+npy_gcdl(npy_long a, npy_long b)
+{
+    const npy_ulong mag_a = a < 0 ? (npy_ulong)0 - (npy_ulong)a : (npy_ulong)a;
+    const npy_ulong mag_b = b < 0 ? (npy_ulong)0 - (npy_ulong)b : (npy_ulong)b;
+
+    return npy_gcdul(mag_a, mag_b);
+}
+
+#line 633
+/*
+ * Greatest common divisor of two npy_longlong values, computed on their
+ * magnitudes by npy_gcdull.
+ *
+ * The magnitudes are formed in unsigned arithmetic: negating the most
+ * negative value directly (-a) is a signed overflow, which is undefined
+ * behavior in C, whereas 0 - (npy_ulonglong)a is well defined.
+ * The unsigned result is converted back to npy_longlong on return.
+ */
+NPY_INPLACE npy_longlong
+npy_gcdll(npy_longlong a, npy_longlong b)
+{
+    const npy_ulonglong mag_a =
+        a < 0 ? (npy_ulonglong)0 - (npy_ulonglong)a : (npy_ulonglong)a;
+    const npy_ulonglong mag_b =
+        b < 0 ? (npy_ulonglong)0 - (npy_ulonglong)b : (npy_ulonglong)b;
+
+    return npy_gcdull(mag_a, mag_b);
+}
+
+#line 633
+/*
+ * Least common multiple of two npy_int values, computed on their magnitudes
+ * by npy_lcmu.
+ *
+ * The magnitudes are formed in unsigned arithmetic: negating the most
+ * negative value directly (-a) is a signed overflow, which is undefined
+ * behavior in C, whereas 0 - (npy_uint)a is well defined.
+ * The unsigned result is converted back to npy_int on return.
+ */
+NPY_INPLACE npy_int
+npy_lcm(npy_int a, npy_int b)
+{
+    const npy_uint mag_a = a < 0 ? (npy_uint)0 - (npy_uint)a : (npy_uint)a;
+    const npy_uint mag_b = b < 0 ? (npy_uint)0 - (npy_uint)b : (npy_uint)b;
+
+    return npy_lcmu(mag_a, mag_b);
+}
+
+#line 633
+/*
+ * Least common multiple of two npy_long values, computed on their magnitudes
+ * by npy_lcmul.
+ *
+ * The magnitudes are formed in unsigned arithmetic: negating the most
+ * negative value directly (-a) is a signed overflow, which is undefined
+ * behavior in C, whereas 0 - (npy_ulong)a is well defined.
+ * The unsigned result is converted back to npy_long on return.
+ */
+NPY_INPLACE npy_long
+npy_lcml(npy_long a, npy_long b)
+{
+    const npy_ulong mag_a = a < 0 ? (npy_ulong)0 - (npy_ulong)a : (npy_ulong)a;
+    const npy_ulong mag_b = b < 0 ? (npy_ulong)0 - (npy_ulong)b : (npy_ulong)b;
+
+    return npy_lcmul(mag_a, mag_b);
+}
+
+#line 633
+/*
+ * Least common multiple of two npy_longlong values, computed on their
+ * magnitudes by npy_lcmull.
+ *
+ * The magnitudes are formed in unsigned arithmetic: negating the most
+ * negative value directly (-a) is a signed overflow, which is undefined
+ * behavior in C, whereas 0 - (npy_ulonglong)a is well defined.
+ * The unsigned result is converted back to npy_longlong on return.
+ */
+NPY_INPLACE npy_longlong
+npy_lcmll(npy_longlong a, npy_longlong b)
+{
+    const npy_ulonglong mag_a =
+        a < 0 ? (npy_ulonglong)0 - (npy_ulonglong)a : (npy_ulonglong)a;
+    const npy_ulonglong mag_b =
+        b < 0 ? (npy_ulonglong)0 - (npy_ulonglong)b : (npy_ulonglong)b;
+
+    return npy_lcmull(mag_a, mag_b);
+}
+
+
+/* Unlike LCM and GCD, we need byte and short variants for the shift operators,
+ * since the result is dependent on the width of the type
+ */
+#line 648
+#line 653
+/*
+ * Left shift for npy_ubyte. A count of sizeof(a) * CHAR_BIT or more yields 0
+ * instead of being handed to the << operator.
+ */
+NPY_INPLACE npy_ubyte
+npy_lshiftuhh(npy_ubyte a, npy_ubyte b)
+{
+    const int in_range = (size_t)b < sizeof(a) * CHAR_BIT;
+
+    return NPY_LIKELY(in_range) ? a << b : 0;
+}
+/*
+ * Right shift for npy_ubyte. A count of sizeof(a) * CHAR_BIT or more yields
+ * 0; no sign-fill branch is compiled for this type.
+ */
+NPY_INPLACE npy_ubyte
+npy_rshiftuhh(npy_ubyte a, npy_ubyte b)
+{
+    if (!NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) {
+        /* every bit is shifted out */
+        return 0;
+    }
+    return a >> b;
+}
+
+#line 653
+/*
+ * Left shift for npy_byte. A count outside [0, sizeof(a) * CHAR_BIT) yields
+ * 0; a negative count ends up there too because the cast to size_t turns it
+ * into a very large value.
+ *
+ * The shift itself is performed on the unsigned counterpart of a: in C,
+ * applying << to a negative signed operand (or producing a value that does
+ * not fit) is undefined behavior. The shifted bits are then converted back
+ * to npy_byte.
+ */
+NPY_INPLACE npy_byte
+npy_lshifthh(npy_byte a, npy_byte b)
+{
+    if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) {
+        return (npy_byte)((npy_ubyte)a << b);
+    }
+    else {
+        return 0;
+    }
+}
+/*
+ * Right shift for npy_byte. For a count outside [0, sizeof(a) * CHAR_BIT)
+ * the result is -1 for negative a and 0 otherwise. A negative count takes
+ * that path as well, since the cast to size_t makes it very large.
+ */
+NPY_INPLACE npy_byte
+npy_rshifthh(npy_byte a, npy_byte b)
+{
+    const size_t width = sizeof(a) * CHAR_BIT;
+
+    if (NPY_LIKELY((size_t)b < width)) {
+        return a >> b;
+    }
+    /* preserve the sign bit */
+    return a < 0 ? (npy_byte)-1 : 0;
+}
+
+
+#line 648
+#line 653
+/*
+ * Left shift for npy_ushort. A count of sizeof(a) * CHAR_BIT or more yields 0
+ * instead of being handed to the << operator.
+ */
+NPY_INPLACE npy_ushort
+npy_lshiftuh(npy_ushort a, npy_ushort b)
+{
+    const int in_range = (size_t)b < sizeof(a) * CHAR_BIT;
+
+    return NPY_LIKELY(in_range) ? a << b : 0;
+}
+/*
+ * Right shift for npy_ushort. A count of sizeof(a) * CHAR_BIT or more yields
+ * 0; no sign-fill branch is compiled for this type.
+ */
+NPY_INPLACE npy_ushort
+npy_rshiftuh(npy_ushort a, npy_ushort b)
+{
+    if (!NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) {
+        /* every bit is shifted out */
+        return 0;
+    }
+    return a >> b;
+}
+
+#line 653
+/*
+ * Left shift for npy_short. A count outside [0, sizeof(a) * CHAR_BIT) yields
+ * 0; a negative count ends up there too because the cast to size_t turns it
+ * into a very large value.
+ *
+ * The shift itself is performed on the unsigned counterpart of a: in C,
+ * applying << to a negative signed operand (or producing a value that does
+ * not fit) is undefined behavior. The shifted bits are then converted back
+ * to npy_short.
+ */
+NPY_INPLACE npy_short
+npy_lshifth(npy_short a, npy_short b)
+{
+    if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) {
+        return (npy_short)((npy_ushort)a << b);
+    }
+    else {
+        return 0;
+    }
+}
+/*
+ * Right shift for npy_short. For a count outside [0, sizeof(a) * CHAR_BIT)
+ * the result is -1 for negative a and 0 otherwise. A negative count takes
+ * that path as well, since the cast to size_t makes it very large.
+ */
+NPY_INPLACE npy_short
+npy_rshifth(npy_short a, npy_short b)
+{
+    const size_t width = sizeof(a) * CHAR_BIT;
+
+    if (NPY_LIKELY((size_t)b < width)) {
+        return a >> b;
+    }
+    /* preserve the sign bit */
+    return a < 0 ? (npy_short)-1 : 0;
+}
+
+
+#line 648
+#line 653
+/*
+ * Left shift for npy_uint. A count of sizeof(a) * CHAR_BIT or more yields 0
+ * instead of being handed to the << operator.
+ */
+NPY_INPLACE npy_uint
+npy_lshiftu(npy_uint a, npy_uint b)
+{
+    const int in_range = (size_t)b < sizeof(a) * CHAR_BIT;
+
+    return NPY_LIKELY(in_range) ? a << b : 0;
+}
+/*
+ * Right shift for npy_uint. A count of sizeof(a) * CHAR_BIT or more yields
+ * 0; no sign-fill branch is compiled for this type.
+ */
+NPY_INPLACE npy_uint
+npy_rshiftu(npy_uint a, npy_uint b)
+{
+    if (!NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) {
+        /* every bit is shifted out */
+        return 0;
+    }
+    return a >> b;
+}
+
+#line 653
+/*
+ * Left shift for npy_int. A count outside [0, sizeof(a) * CHAR_BIT) yields
+ * 0; a negative count ends up there too because the cast to size_t turns it
+ * into a very large value.
+ *
+ * The shift itself is performed on the unsigned counterpart of a: in C,
+ * applying << to a negative signed operand (or producing a value that does
+ * not fit) is undefined behavior. The shifted bits are then converted back
+ * to npy_int.
+ */
+NPY_INPLACE npy_int
+npy_lshift(npy_int a, npy_int b)
+{
+    if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) {
+        return (npy_int)((npy_uint)a << b);
+    }
+    else {
+        return 0;
+    }
+}
+/*
+ * Right shift for npy_int. For a count outside [0, sizeof(a) * CHAR_BIT)
+ * the result is -1 for negative a and 0 otherwise. A negative count takes
+ * that path as well, since the cast to size_t makes it very large.
+ */
+NPY_INPLACE npy_int
+npy_rshift(npy_int a, npy_int b)
+{
+    const size_t width = sizeof(a) * CHAR_BIT;
+
+    if (NPY_LIKELY((size_t)b < width)) {
+        return a >> b;
+    }
+    /* preserve the sign bit */
+    return a < 0 ? (npy_int)-1 : 0;
+}
+
+
+#line 648
+#line 653
+/*
+ * Left shift for npy_ulong. A count of sizeof(a) * CHAR_BIT or more yields 0
+ * instead of being handed to the << operator.
+ */
+NPY_INPLACE npy_ulong
+npy_lshiftul(npy_ulong a, npy_ulong b)
+{
+    const int in_range = (size_t)b < sizeof(a) * CHAR_BIT;
+
+    return NPY_LIKELY(in_range) ? a << b : 0;
+}
+/*
+ * Right shift for npy_ulong. A count of sizeof(a) * CHAR_BIT or more yields
+ * 0; no sign-fill branch is compiled for this type.
+ */
+NPY_INPLACE npy_ulong
+npy_rshiftul(npy_ulong a, npy_ulong b)
+{
+    if (!NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) {
+        /* every bit is shifted out */
+        return 0;
+    }
+    return a >> b;
+}
+
+#line 653
+/*
+ * Left shift for npy_long. A count outside [0, sizeof(a) * CHAR_BIT) yields
+ * 0; a negative count ends up there too because the cast to size_t turns it
+ * into a very large value.
+ *
+ * The shift itself is performed on the unsigned counterpart of a: in C,
+ * applying << to a negative signed operand (or producing a value that does
+ * not fit) is undefined behavior. The shifted bits are then converted back
+ * to npy_long.
+ */
+NPY_INPLACE npy_long
+npy_lshiftl(npy_long a, npy_long b)
+{
+    if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) {
+        return (npy_long)((npy_ulong)a << b);
+    }
+    else {
+        return 0;
+    }
+}
+/*
+ * Right shift for npy_long. For a count outside [0, sizeof(a) * CHAR_BIT)
+ * the result is -1 for negative a and 0 otherwise. A negative count takes
+ * that path as well, since the cast to size_t makes it very large.
+ */
+NPY_INPLACE npy_long
+npy_rshiftl(npy_long a, npy_long b)
+{
+    const size_t width = sizeof(a) * CHAR_BIT;
+
+    if (NPY_LIKELY((size_t)b < width)) {
+        return a >> b;
+    }
+    /* preserve the sign bit */
+    return a < 0 ? (npy_long)-1 : 0;
+}
+
+
+#line 648
+#line 653
+/*
+ * Left shift for npy_ulonglong. A count of sizeof(a) * CHAR_BIT or more
+ * yields 0 instead of being handed to the << operator.
+ */
+NPY_INPLACE npy_ulonglong
+npy_lshiftull(npy_ulonglong a, npy_ulonglong b)
+{
+    const int in_range = (size_t)b < sizeof(a) * CHAR_BIT;
+
+    return NPY_LIKELY(in_range) ? a << b : 0;
+}
+/*
+ * Right shift for npy_ulonglong. A count of sizeof(a) * CHAR_BIT or more
+ * yields 0; no sign-fill branch is compiled for this type.
+ */
+NPY_INPLACE npy_ulonglong
+npy_rshiftull(npy_ulonglong a, npy_ulonglong b)
+{
+    if (!NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) {
+        /* every bit is shifted out */
+        return 0;
+    }
+    return a >> b;
+}
+
+#line 653
+/*
+ * Left shift for npy_longlong. A count outside [0, sizeof(a) * CHAR_BIT)
+ * yields 0; a negative count ends up there too because the cast to size_t
+ * turns it into a very large value.
+ *
+ * The shift itself is performed on the unsigned counterpart of a: in C,
+ * applying << to a negative signed operand (or producing a value that does
+ * not fit) is undefined behavior. The shifted bits are then converted back
+ * to npy_longlong.
+ */
+NPY_INPLACE npy_longlong
+npy_lshiftll(npy_longlong a, npy_longlong b)
+{
+    if (NPY_LIKELY((size_t)b < sizeof(a) * CHAR_BIT)) {
+        return (npy_longlong)((npy_ulonglong)a << b);
+    }
+    else {
+        return 0;
+    }
+}
+/*
+ * Right shift for npy_longlong. For a count outside
+ * [0, sizeof(a) * CHAR_BIT) the result is -1 for negative a and 0 otherwise.
+ * A negative count takes that path as well, since the cast to size_t makes
+ * it very large.
+ */
+NPY_INPLACE npy_longlong
+npy_rshiftll(npy_longlong a, npy_longlong b)
+{
+    const size_t width = sizeof(a) * CHAR_BIT;
+
+    if (NPY_LIKELY((size_t)b < width)) {
+        return a >> b;
+    }
+    /* preserve the sign bit */
+    return a < 0 ? (npy_longlong)-1 : 0;
+}
+
+
+
+
+#define __popcnt32 __popcnt
+#line 689
+/*
+ * (Re)define TO_BITS_LEN(X) so that it pastes the value of NPY_BITSOF_BYTE
+ * (8, 16, 32 or 64) onto X, e.g. TO_BITS_LEN(MAGIC) -> MAGIC8.
+ * If NPY_BITSOF_BYTE is none of those values, TO_BITS_LEN stays undefined.
+ */
+#undef TO_BITS_LEN
+#if 0
+#line 694
+#elif NPY_BITSOF_BYTE == 8
+    #define TO_BITS_LEN(X) X##8
+
+#line 694
+#elif NPY_BITSOF_BYTE == 16
+    #define TO_BITS_LEN(X) X##16
+
+#line 694
+#elif NPY_BITSOF_BYTE == 32
+    #define TO_BITS_LEN(X) X##32
+
+#line 694
+#elif NPY_BITSOF_BYTE == 64
+    #define TO_BITS_LEN(X) X##64
+
+#endif
+
+
+/*
+ * Branch-free population count for npy_ubyte, driven by four constants read
+ * from the MAGIC table that TO_BITS_LEN selects (the table is defined
+ * outside this section).
+ * NOTE(review): this looks like the usual SWAR bit count (pair sums, nibble
+ * sums, byte sums, multiply and shift) -- confirm against the table values.
+ */
+NPY_INPLACE uint8_t
+npy_popcount_parallelhh(npy_ubyte a)
+{
+    const npy_ubyte k0 = (npy_ubyte) TO_BITS_LEN(MAGIC)[0];
+    const npy_ubyte k1 = (npy_ubyte) TO_BITS_LEN(MAGIC)[1];
+    const npy_ubyte k2 = (npy_ubyte) TO_BITS_LEN(MAGIC)[2];
+    const npy_ubyte k3 = (npy_ubyte) TO_BITS_LEN(MAGIC)[3];
+
+    a -= (a >> 1) & k0;
+    a = (a & k1) + ((a >> 2) & k1);
+    a = (a + (a >> 4)) & k2;
+    /* keep only the most significant byte of the product */
+    return (npy_ubyte) (a * k3) >> ((NPY_SIZEOF_BYTE - 1) * CHAR_BIT);
+}
+
+/*
+ * Population count for npy_ubyte.
+ *
+ * The implementation is picked by the preprocessor:
+ *   - GCC/Clang with NPY_BITSOF_BYTE >= 32: the compiler builtin;
+ *   - MSVC with NPY_BITSOF_BYTE >= 16 on non-ARM targets: a __popcnt
+ *     intrinsic named via TO_BITS_LEN, or two 32-bit counts for a 64-bit
+ *     value on 32-bit Windows;
+ *   - otherwise: npy_popcount_parallelhh.
+ */
+NPY_INPLACE uint8_t
+npy_popcountuhh(npy_ubyte a)
+{
+/* use built-in popcount if present, else use our implementation */
+#if (defined(__clang__) || defined(__GNUC__)) && NPY_BITSOF_BYTE >= 32
+    return __builtin_popcounthh(a);
+#elif defined(_MSC_VER) && NPY_BITSOF_BYTE >= 16 && !defined(_M_ARM64) && !defined(_M_ARM)
+    /* no builtin __popcnt64 for 32 bits */
+    #if defined(_WIN64) || (defined(_WIN32) && NPY_BITSOF_BYTE != 64)
+        return TO_BITS_LEN(__popcnt)(a);
+    /* split 64 bit number into two 32 bit ints and return sum of counts */
+    #elif (defined(_WIN32) && NPY_BITSOF_BYTE == 64)
+        npy_uint32 left  = (npy_uint32) (a>>32);
+        npy_uint32 right = (npy_uint32) a;
+        return __popcnt32(left) + __popcnt32(right);
+    #endif
+#else
+    /* portable fallback */
+    return npy_popcount_parallelhh(a);
+#endif
+}
+
+#line 689
+/*
+ * (Re)define TO_BITS_LEN(X) so that it pastes the value of NPY_BITSOF_SHORT
+ * (8, 16, 32 or 64) onto X, e.g. TO_BITS_LEN(MAGIC) -> MAGIC16.
+ * If NPY_BITSOF_SHORT is none of those values, TO_BITS_LEN stays undefined.
+ */
+#undef TO_BITS_LEN
+#if 0
+#line 694
+#elif NPY_BITSOF_SHORT == 8
+    #define TO_BITS_LEN(X) X##8
+
+#line 694
+#elif NPY_BITSOF_SHORT == 16
+    #define TO_BITS_LEN(X) X##16
+
+#line 694
+#elif NPY_BITSOF_SHORT == 32
+    #define TO_BITS_LEN(X) X##32
+
+#line 694
+#elif NPY_BITSOF_SHORT == 64
+    #define TO_BITS_LEN(X) X##64
+
+#endif
+
+
+/*
+ * Branch-free population count for npy_ushort, driven by four constants read
+ * from the MAGIC table that TO_BITS_LEN selects (the table is defined
+ * outside this section).
+ * NOTE(review): this looks like the usual SWAR bit count (pair sums, nibble
+ * sums, byte sums, multiply and shift) -- confirm against the table values.
+ */
+NPY_INPLACE uint8_t
+npy_popcount_parallelh(npy_ushort a)
+{
+    const npy_ushort k0 = (npy_ushort) TO_BITS_LEN(MAGIC)[0];
+    const npy_ushort k1 = (npy_ushort) TO_BITS_LEN(MAGIC)[1];
+    const npy_ushort k2 = (npy_ushort) TO_BITS_LEN(MAGIC)[2];
+    const npy_ushort k3 = (npy_ushort) TO_BITS_LEN(MAGIC)[3];
+
+    a -= (a >> 1) & k0;
+    a = (a & k1) + ((a >> 2) & k1);
+    a = (a + (a >> 4)) & k2;
+    /* keep only the most significant byte of the product */
+    return (npy_ushort) (a * k3) >> ((NPY_SIZEOF_SHORT - 1) * CHAR_BIT);
+}
+
+/*
+ * Population count for npy_ushort.
+ *
+ * The implementation is picked by the preprocessor:
+ *   - GCC/Clang with NPY_BITSOF_SHORT >= 32: the compiler builtin;
+ *   - MSVC with NPY_BITSOF_SHORT >= 16 on non-ARM targets: a __popcnt
+ *     intrinsic named via TO_BITS_LEN, or two 32-bit counts for a 64-bit
+ *     value on 32-bit Windows;
+ *   - otherwise: npy_popcount_parallelh.
+ */
+NPY_INPLACE uint8_t
+npy_popcountuh(npy_ushort a)
+{
+/* use built-in popcount if present, else use our implementation */
+#if (defined(__clang__) || defined(__GNUC__)) && NPY_BITSOF_SHORT >= 32
+    return __builtin_popcounth(a);
+#elif defined(_MSC_VER) && NPY_BITSOF_SHORT >= 16 && !defined(_M_ARM64) && !defined(_M_ARM)
+    /* no builtin __popcnt64 for 32 bits */
+    #if defined(_WIN64) || (defined(_WIN32) && NPY_BITSOF_SHORT != 64)
+        return TO_BITS_LEN(__popcnt)(a);
+    /* split 64 bit number into two 32 bit ints and return sum of counts */
+    #elif (defined(_WIN32) && NPY_BITSOF_SHORT == 64)
+        npy_uint32 left  = (npy_uint32) (a>>32);
+        npy_uint32 right = (npy_uint32) a;
+        return __popcnt32(left) + __popcnt32(right);
+    #endif
+#else
+    /* portable fallback */
+    return npy_popcount_parallelh(a);
+#endif
+}
+
+#line 689
+/*
+ * (Re)define TO_BITS_LEN(X) so that it pastes the value of NPY_BITSOF_INT
+ * (8, 16, 32 or 64) onto X, e.g. TO_BITS_LEN(MAGIC) -> MAGIC32.
+ * If NPY_BITSOF_INT is none of those values, TO_BITS_LEN stays undefined.
+ */
+#undef TO_BITS_LEN
+#if 0
+#line 694
+#elif NPY_BITSOF_INT == 8
+    #define TO_BITS_LEN(X) X##8
+
+#line 694
+#elif NPY_BITSOF_INT == 16
+    #define TO_BITS_LEN(X) X##16
+
+#line 694
+#elif NPY_BITSOF_INT == 32
+    #define TO_BITS_LEN(X) X##32
+
+#line 694
+#elif NPY_BITSOF_INT == 64
+    #define TO_BITS_LEN(X) X##64
+
+#endif
+
+
+/*
+ * Branch-free population count for npy_uint, driven by four constants read
+ * from the MAGIC table that TO_BITS_LEN selects (the table is defined
+ * outside this section).
+ * NOTE(review): this looks like the usual SWAR bit count (pair sums, nibble
+ * sums, byte sums, multiply and shift) -- confirm against the table values.
+ */
+NPY_INPLACE uint8_t
+npy_popcount_parallel(npy_uint a)
+{
+    const npy_uint k0 = (npy_uint) TO_BITS_LEN(MAGIC)[0];
+    const npy_uint k1 = (npy_uint) TO_BITS_LEN(MAGIC)[1];
+    const npy_uint k2 = (npy_uint) TO_BITS_LEN(MAGIC)[2];
+    const npy_uint k3 = (npy_uint) TO_BITS_LEN(MAGIC)[3];
+
+    a -= (a >> 1) & k0;
+    a = (a & k1) + ((a >> 2) & k1);
+    a = (a + (a >> 4)) & k2;
+    /* keep only the most significant byte of the product */
+    return (npy_uint) (a * k3) >> ((NPY_SIZEOF_INT - 1) * CHAR_BIT);
+}
+
+/*
+ * Population count for npy_uint.
+ *
+ * The implementation is picked by the preprocessor:
+ *   - GCC/Clang with NPY_BITSOF_INT >= 32: __builtin_popcount;
+ *   - MSVC with NPY_BITSOF_INT >= 16 on non-ARM targets: a __popcnt
+ *     intrinsic named via TO_BITS_LEN, or two 32-bit counts for a 64-bit
+ *     value on 32-bit Windows;
+ *   - otherwise: npy_popcount_parallel.
+ */
+NPY_INPLACE uint8_t
+npy_popcountu(npy_uint a)
+{
+/* use built-in popcount if present, else use our implementation */
+#if (defined(__clang__) || defined(__GNUC__)) && NPY_BITSOF_INT >= 32
+    return __builtin_popcount(a);
+#elif defined(_MSC_VER) && NPY_BITSOF_INT >= 16 && !defined(_M_ARM64) && !defined(_M_ARM)
+    /* no builtin __popcnt64 for 32 bits */
+    #if defined(_WIN64) || (defined(_WIN32) && NPY_BITSOF_INT != 64)
+        return TO_BITS_LEN(__popcnt)(a);
+    /* split 64 bit number into two 32 bit ints and return sum of counts */
+    #elif (defined(_WIN32) && NPY_BITSOF_INT == 64)
+        npy_uint32 left  = (npy_uint32) (a>>32);
+        npy_uint32 right = (npy_uint32) a;
+        return __popcnt32(left) + __popcnt32(right);
+    #endif
+#else
+    /* portable fallback */
+    return npy_popcount_parallel(a);
+#endif
+}
+
+#line 689
+/*
+ * (Re)define TO_BITS_LEN(X) so that it pastes the value of NPY_BITSOF_LONG
+ * (8, 16, 32 or 64) onto X, e.g. TO_BITS_LEN(MAGIC) -> MAGIC64.
+ * If NPY_BITSOF_LONG is none of those values, TO_BITS_LEN stays undefined.
+ */
+#undef TO_BITS_LEN
+#if 0
+#line 694
+#elif NPY_BITSOF_LONG == 8
+    #define TO_BITS_LEN(X) X##8
+
+#line 694
+#elif NPY_BITSOF_LONG == 16
+    #define TO_BITS_LEN(X) X##16
+
+#line 694
+#elif NPY_BITSOF_LONG == 32
+    #define TO_BITS_LEN(X) X##32
+
+#line 694
+#elif NPY_BITSOF_LONG == 64
+    #define TO_BITS_LEN(X) X##64
+
+#endif
+
+
+/*
+ * Branch-free population count for npy_ulong, driven by four constants read
+ * from the MAGIC table that TO_BITS_LEN selects (the table is defined
+ * outside this section).
+ * NOTE(review): this looks like the usual SWAR bit count (pair sums, nibble
+ * sums, byte sums, multiply and shift) -- confirm against the table values.
+ */
+NPY_INPLACE uint8_t
+npy_popcount_parallell(npy_ulong a)
+{
+    const npy_ulong k0 = (npy_ulong) TO_BITS_LEN(MAGIC)[0];
+    const npy_ulong k1 = (npy_ulong) TO_BITS_LEN(MAGIC)[1];
+    const npy_ulong k2 = (npy_ulong) TO_BITS_LEN(MAGIC)[2];
+    const npy_ulong k3 = (npy_ulong) TO_BITS_LEN(MAGIC)[3];
+
+    a -= (a >> 1) & k0;
+    a = (a & k1) + ((a >> 2) & k1);
+    a = (a + (a >> 4)) & k2;
+    /* keep only the most significant byte of the product */
+    return (npy_ulong) (a * k3) >> ((NPY_SIZEOF_LONG - 1) * CHAR_BIT);
+}
+
+/*
+ * Population count for npy_ulong.
+ *
+ * The implementation is picked by the preprocessor:
+ *   - GCC/Clang with NPY_BITSOF_LONG >= 32: __builtin_popcountl;
+ *   - MSVC with NPY_BITSOF_LONG >= 16 on non-ARM targets: a __popcnt
+ *     intrinsic named via TO_BITS_LEN, or two 32-bit counts for a 64-bit
+ *     value on 32-bit Windows;
+ *   - otherwise: npy_popcount_parallell.
+ */
+NPY_INPLACE uint8_t
+npy_popcountul(npy_ulong a)
+{
+/* use built-in popcount if present, else use our implementation */
+#if (defined(__clang__) || defined(__GNUC__)) && NPY_BITSOF_LONG >= 32
+    return __builtin_popcountl(a);
+#elif defined(_MSC_VER) && NPY_BITSOF_LONG >= 16 && !defined(_M_ARM64) && !defined(_M_ARM)
+    /* no builtin __popcnt64 for 32 bits */
+    #if defined(_WIN64) || (defined(_WIN32) && NPY_BITSOF_LONG != 64)
+        return TO_BITS_LEN(__popcnt)(a);
+    /* split 64 bit number into two 32 bit ints and return sum of counts */
+    #elif (defined(_WIN32) && NPY_BITSOF_LONG == 64)
+        npy_uint32 left  = (npy_uint32) (a>>32);
+        npy_uint32 right = (npy_uint32) a;
+        return __popcnt32(left) + __popcnt32(right);
+    #endif
+#else
+    /* portable fallback */
+    return npy_popcount_parallell(a);
+#endif
+}
+
+#line 689
+/*
+ * (Re)define TO_BITS_LEN(X) so that it pastes the value of
+ * NPY_BITSOF_LONGLONG (8, 16, 32 or 64) onto X, e.g.
+ * TO_BITS_LEN(MAGIC) -> MAGIC64.
+ * If NPY_BITSOF_LONGLONG is none of those values, TO_BITS_LEN stays
+ * undefined.
+ */
+#undef TO_BITS_LEN
+#if 0
+#line 694
+#elif NPY_BITSOF_LONGLONG == 8
+    #define TO_BITS_LEN(X) X##8
+
+#line 694
+#elif NPY_BITSOF_LONGLONG == 16
+    #define TO_BITS_LEN(X) X##16
+
+#line 694
+#elif NPY_BITSOF_LONGLONG == 32
+    #define TO_BITS_LEN(X) X##32
+
+#line 694
+#elif NPY_BITSOF_LONGLONG == 64
+    #define TO_BITS_LEN(X) X##64
+
+#endif
+
+
+/*
+ * Branch-free population count for npy_ulonglong, driven by four constants
+ * read from the MAGIC table that TO_BITS_LEN selects (the table is defined
+ * outside this section).
+ * NOTE(review): this looks like the usual SWAR bit count (pair sums, nibble
+ * sums, byte sums, multiply and shift) -- confirm against the table values.
+ */
+NPY_INPLACE uint8_t
+npy_popcount_parallelll(npy_ulonglong a)
+{
+    const npy_ulonglong k0 = (npy_ulonglong) TO_BITS_LEN(MAGIC)[0];
+    const npy_ulonglong k1 = (npy_ulonglong) TO_BITS_LEN(MAGIC)[1];
+    const npy_ulonglong k2 = (npy_ulonglong) TO_BITS_LEN(MAGIC)[2];
+    const npy_ulonglong k3 = (npy_ulonglong) TO_BITS_LEN(MAGIC)[3];
+
+    a -= (a >> 1) & k0;
+    a = (a & k1) + ((a >> 2) & k1);
+    a = (a + (a >> 4)) & k2;
+    /* keep only the most significant byte of the product */
+    return (npy_ulonglong) (a * k3) >> ((NPY_SIZEOF_LONGLONG - 1) * CHAR_BIT);
+}
+
+/*
+ * Population count for npy_ulonglong.
+ *
+ * The implementation is picked by the preprocessor:
+ *   - GCC/Clang with NPY_BITSOF_LONGLONG >= 32: __builtin_popcountll;
+ *   - MSVC with NPY_BITSOF_LONGLONG >= 16 on non-ARM targets: a __popcnt
+ *     intrinsic named via TO_BITS_LEN, or two 32-bit counts for a 64-bit
+ *     value on 32-bit Windows;
+ *   - otherwise: npy_popcount_parallelll.
+ */
+NPY_INPLACE uint8_t
+npy_popcountull(npy_ulonglong a)
+{
+/* use built-in popcount if present, else use our implementation */
+#if (defined(__clang__) || defined(__GNUC__)) && NPY_BITSOF_LONGLONG >= 32
+    return __builtin_popcountll(a);
+#elif defined(_MSC_VER) && NPY_BITSOF_LONGLONG >= 16 && !defined(_M_ARM64) && !defined(_M_ARM)
+    /* no builtin __popcnt64 for 32 bits */
+    #if defined(_WIN64) || (defined(_WIN32) && NPY_BITSOF_LONGLONG != 64)
+        return TO_BITS_LEN(__popcnt)(a);
+    /* split 64 bit number into two 32 bit ints and return sum of counts */
+    #elif (defined(_WIN32) && NPY_BITSOF_LONGLONG == 64)
+        npy_uint32 left  = (npy_uint32) (a>>32);
+        npy_uint32 right = (npy_uint32) a;
+        return __popcnt32(left) + __popcnt32(right);
+    #endif
+#else
+    /* portable fallback */
+    return npy_popcount_parallelll(a);
+#endif
+}
+
+
+#line 736
+/* Population count of abs(a) for npy_byte, delegated to npy_popcountuhh. */
+NPY_INPLACE uint8_t
+npy_popcounthh(npy_byte a)
+{
+    if (a < 0) {
+        /* count the bits of the magnitude, not of the signed encoding */
+        return npy_popcountuhh(-a);
+    }
+    return npy_popcountuhh(a);
+}
+
+#line 736
+/* Population count of abs(a) for npy_short, delegated to npy_popcountuh. */
+NPY_INPLACE uint8_t
+npy_popcounth(npy_short a)
+{
+    if (a < 0) {
+        /* count the bits of the magnitude, not of the signed encoding */
+        return npy_popcountuh(-a);
+    }
+    return npy_popcountuh(a);
+}
+
+#line 736
+/*
+ * Population count of abs(a) for npy_int, delegated to npy_popcountu.
+ *
+ * The magnitude is formed in unsigned arithmetic: negating the most negative
+ * value directly (-a) is a signed overflow, which is undefined behavior in
+ * C, whereas 0 - (npy_uint)a is well defined.
+ */
+NPY_INPLACE uint8_t
+npy_popcount(npy_int a)
+{
+    const npy_uint magnitude = a < 0 ? (npy_uint)0 - (npy_uint)a : (npy_uint)a;
+
+    return npy_popcountu(magnitude);
+}
+
+#line 736
+/*
+ * Population count of abs(a) for npy_long, delegated to npy_popcountul.
+ *
+ * The magnitude is formed in unsigned arithmetic: negating the most negative
+ * value directly (-a) is a signed overflow, which is undefined behavior in
+ * C, whereas 0 - (npy_ulong)a is well defined.
+ */
+NPY_INPLACE uint8_t
+npy_popcountl(npy_long a)
+{
+    const npy_ulong magnitude = a < 0 ? (npy_ulong)0 - (npy_ulong)a : (npy_ulong)a;
+
+    return npy_popcountul(magnitude);
+}
+
+#line 736
+/*
+ * Population count of abs(a) for npy_longlong, delegated to npy_popcountull.
+ *
+ * The magnitude is formed in unsigned arithmetic: negating the most negative
+ * value directly (-a) is a signed overflow, which is undefined behavior in
+ * C, whereas 0 - (npy_ulonglong)a is well defined.
+ */
+NPY_INPLACE uint8_t
+npy_popcountll(npy_longlong a)
+{
+    const npy_ulonglong magnitude =
+        a < 0 ? (npy_ulonglong)0 - (npy_ulonglong)a : (npy_ulonglong)a;
+
+    return npy_popcountull(magnitude);
+}
+
+
+
diff --git a/nanvix-port/generated-headers/npy_sort.h b/nanvix-port/generated-headers/npy_sort.h
new file mode 100644
index 000000000000..a2231b4b3d01
--- /dev/null
+++ b/nanvix-port/generated-headers/npy_sort.h
@@ -0,0 +1,463 @@
+#line 1 "numpy/core/src/common/npy_sort.h.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+#ifndef __NPY_SORT_H__
+#define __NPY_SORT_H__
+
+/* Python include is for future object sorts */
+#include <Python.h>
+#include <numpy/npy_common.h>
+#include <numpy/ndarraytypes.h>
+
+#define NPY_ENOMEM 1
+#define NPY_ECOMP 2
+
+/*
+ * Returns the zero-based position of the highest set bit of unum, i.e. the
+ * number of one-bit right shifts needed before the value becomes 0, minus
+ * one. Both 0 and 1 give 0.
+ */
+static inline int npy_get_msb(npy_uintp unum)
+{
+    int msb = 0;
+
+    for (unum >>= 1; unum != 0; unum >>= 1) {
+        ++msb;
+    }
+    return msb;
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+/*
+ *****************************************************************************
+ **                            NUMERIC SORTS                                **
+ *****************************************************************************
+ */
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_bool(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_bool(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_bool(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_bool(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_bool(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_bool(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_bool(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_bool(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_byte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_byte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_byte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_byte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_byte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_byte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_byte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_byte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_ubyte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_ubyte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_ubyte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_ubyte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_ubyte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_ubyte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_ubyte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_ubyte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_short(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_short(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_short(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_short(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_short(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_short(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_short(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_short(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_ushort(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_ushort(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_ushort(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_ushort(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_ushort(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_ushort(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_ushort(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_ushort(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_int(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_int(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_int(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_int(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_int(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_int(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_int(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_int(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_uint(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_uint(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_uint(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_uint(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_uint(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_uint(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_uint(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_uint(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_long(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_long(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_long(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_long(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_long(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_long(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_long(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_long(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_ulong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_ulong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_ulong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_ulong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_ulong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_ulong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_ulong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_ulong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_longlong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_longlong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_longlong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_longlong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_longlong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_longlong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_longlong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_longlong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_ulonglong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_ulonglong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_ulonglong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_ulonglong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_ulonglong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_ulonglong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_ulonglong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_ulonglong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_half(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_half(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_half(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_half(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_half(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_half(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_half(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_half(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_float(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_float(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_float(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_float(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_float(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_float(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_float(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_float(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_double(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_double(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_double(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_double(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_double(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_double(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_double(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_double(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_longdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_longdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_longdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_longdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_longdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_longdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_longdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_longdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_cfloat(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_cfloat(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_cfloat(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_cfloat(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_cfloat(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_cfloat(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_cfloat(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_cfloat(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_cdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_cdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_cdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_cdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_cdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_cdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_cdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_cdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_clongdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_clongdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_clongdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_clongdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_clongdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_clongdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_clongdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_clongdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_datetime(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_datetime(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_datetime(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_datetime(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_datetime(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_datetime(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_datetime(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_datetime(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+
+NPY_NO_EXPORT int quicksort_timedelta(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_timedelta(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_timedelta(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_timedelta(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_timedelta(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_timedelta(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_timedelta(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_timedelta(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {
+#endif
+NPY_NO_EXPORT int radixsort_bool(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_bool(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {
+#endif
+NPY_NO_EXPORT int radixsort_byte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_byte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {
+#endif
+NPY_NO_EXPORT int radixsort_ubyte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_ubyte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {
+#endif
+NPY_NO_EXPORT int radixsort_short(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_short(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {
+#endif
+NPY_NO_EXPORT int radixsort_ushort(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_ushort(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {
+#endif
+NPY_NO_EXPORT int radixsort_int(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_int(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {
+#endif
+NPY_NO_EXPORT int radixsort_uint(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_uint(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {
+#endif
+NPY_NO_EXPORT int radixsort_long(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_long(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {
+#endif
+NPY_NO_EXPORT int radixsort_ulong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_ulong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {
+#endif
+NPY_NO_EXPORT int radixsort_longlong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_longlong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {
+#endif
+NPY_NO_EXPORT int radixsort_ulonglong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_ulonglong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}
+#endif
+
+
+
+
+
+/*
+ *****************************************************************************
+ **                             STRING SORTS                                **
+ *****************************************************************************
+ */
+
+
+#line 81
+
+NPY_NO_EXPORT int quicksort_string(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int heapsort_string(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int mergesort_string(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int timsort_string(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int aquicksort_string(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int aheapsort_string(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int amergesort_string(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int atimsort_string(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+
+
+#line 81
+
+NPY_NO_EXPORT int quicksort_unicode(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int heapsort_unicode(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int mergesort_unicode(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int timsort_unicode(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int aquicksort_unicode(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int aheapsort_unicode(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int amergesort_unicode(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int atimsort_unicode(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+
+
+
+
+/*
+ *****************************************************************************
+ **                             GENERIC SORT                                **
+ *****************************************************************************
+ */
+
+
+NPY_NO_EXPORT int npy_quicksort(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_heapsort(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_mergesort(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_timsort(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_aquicksort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_aheapsort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_amergesort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_atimsort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/nanvix-port/generated-headers/templ_common.h b/nanvix-port/generated-headers/templ_common.h
new file mode 100644
index 000000000000..c62cae0868e0
--- /dev/null
+++ b/nanvix-port/generated-headers/templ_common.h
@@ -0,0 +1,253 @@
+#line 1 "numpy/core/src/common/templ_common.h.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+#ifndef __NPY_TYPED_COMMON_INC
+#define __NPY_TYPED_COMMON_INC
+
+/* utility functions that profit from templates */
+
+#include "numpy/npy_common.h"
+#include <assert.h>
+
+#line 20
+/*
+ * Multiply two npy_int values with overflow detection.
+ *
+ * Writes the (wrapped) product of a and b into *r.
+ * Returns 1 if the exact product is not representable in npy_int, else 0.
+ *
+ * Negative operands are supported.  The fallback path is written so that it
+ * reports overflow for exactly the inputs whose mathematical product lies
+ * outside [NPY_MIN_INT, NPY_MAX_INT], including the NPY_MIN_INT edge cases.
+ */
+static inline int
+npy_mul_with_overflow_int(npy_int * r, npy_int a, npy_int b)
+{
+#ifdef HAVE___BUILTIN_MUL_OVERFLOW
+    return __builtin_mul_overflow(a, b, r);
+#else
+    const npy_int half_sz = ((npy_int)1 << ((sizeof(a) * 8 - 1 ) / 2));
+
+    /*
+     * Multiply in the unsigned domain: signed overflow is undefined
+     * behaviour, whereas unsigned arithmetic wraps.  The conversion back is
+     * implementation-defined (it wraps on two's complement implementations).
+     */
+    *r = (npy_int)((npy_uint)a * (npy_uint)b);
+    /*
+     * avoid expensive division on common no overflow case: two small
+     * non-negative operands can never overflow
+     */
+    if (!(NPY_UNLIKELY((a | b) >= half_sz) || (a | b) < 0)) {
+        return 0;
+    }
+    /*
+     * Exact range check per sign combination.  Every divisor below is
+     * strictly positive, or strictly negative with a dividend of
+     * NPY_MAX_INT, so no division can trap or overflow.
+     */
+    if (a > 0) {
+        if (b > 0) {
+            return a > NPY_MAX_INT / b;
+        }
+        return b < NPY_MIN_INT / a;
+    }
+    if (b > 0) {
+        return a < NPY_MIN_INT / b;
+    }
+    /* both operands are <= 0 here, the product is non-negative */
+    return a != 0 && b < NPY_MAX_INT / a;
+#endif
+}
+
+#line 20
+/*
+ * Unsigned multiply with overflow detection for npy_uint.
+ *
+ * Stores a * b (with the usual unsigned wrap-around) in *r.
+ * Returns 1 when the exact product does not fit in npy_uint, otherwise 0.
+ */
+static inline int
+npy_mul_with_overflow_uint(npy_uint * r, npy_uint a, npy_uint b)
+{
+#ifdef HAVE___BUILTIN_MUL_OVERFLOW
+    return __builtin_mul_overflow(a, b, r);
+#else
+    const npy_uint half_sz = ((npy_uint)1 << ((sizeof(a) * 8 - 1 ) / 2));
+    const npy_uint any_bits = a | b;
+
+    *r = a * b;
+    /*
+     * Only pay for the division when at least one operand is large enough
+     * that the product could exceed the type's range.
+     */
+    if (NPY_UNLIKELY(any_bits >= half_sz)) {
+        if (a != 0 && b > NPY_MAX_UINT / a) {
+            return 1;
+        }
+    }
+    return 0;
+#endif
+}
+
+#line 20
+/*
+ * Multiply two npy_long values with overflow detection.
+ *
+ * Writes the (wrapped) product of a and b into *r.
+ * Returns 1 if the exact product is not representable in npy_long, else 0.
+ *
+ * Negative operands are supported.  The fallback path is written so that it
+ * reports overflow for exactly the inputs whose mathematical product lies
+ * outside [NPY_MIN_LONG, NPY_MAX_LONG], including the NPY_MIN_LONG edge
+ * cases.
+ */
+static inline int
+npy_mul_with_overflow_long(npy_long * r, npy_long a, npy_long b)
+{
+#ifdef HAVE___BUILTIN_MUL_OVERFLOW
+    return __builtin_mul_overflow(a, b, r);
+#else
+    const npy_long half_sz = ((npy_long)1 << ((sizeof(a) * 8 - 1 ) / 2));
+
+    /*
+     * Multiply in the unsigned domain: signed overflow is undefined
+     * behaviour, whereas unsigned arithmetic wraps.  The conversion back is
+     * implementation-defined (it wraps on two's complement implementations).
+     */
+    *r = (npy_long)((npy_ulong)a * (npy_ulong)b);
+    /*
+     * avoid expensive division on common no overflow case: two small
+     * non-negative operands can never overflow
+     */
+    if (!(NPY_UNLIKELY((a | b) >= half_sz) || (a | b) < 0)) {
+        return 0;
+    }
+    /*
+     * Exact range check per sign combination.  Every divisor below is
+     * strictly positive, or strictly negative with a dividend of
+     * NPY_MAX_LONG, so no division can trap or overflow.
+     */
+    if (a > 0) {
+        if (b > 0) {
+            return a > NPY_MAX_LONG / b;
+        }
+        return b < NPY_MIN_LONG / a;
+    }
+    if (b > 0) {
+        return a < NPY_MIN_LONG / b;
+    }
+    /* both operands are <= 0 here, the product is non-negative */
+    return a != 0 && b < NPY_MAX_LONG / a;
+#endif
+}
+
+#line 20
+/*
+ * Unsigned multiply with overflow detection for npy_ulong.
+ *
+ * Stores a * b (with the usual unsigned wrap-around) in *r.
+ * Returns 1 when the exact product does not fit in npy_ulong, otherwise 0.
+ */
+static inline int
+npy_mul_with_overflow_ulong(npy_ulong * r, npy_ulong a, npy_ulong b)
+{
+#ifdef HAVE___BUILTIN_MUL_OVERFLOW
+    return __builtin_mul_overflow(a, b, r);
+#else
+    const npy_ulong half_sz = ((npy_ulong)1 << ((sizeof(a) * 8 - 1 ) / 2));
+    const npy_ulong any_bits = a | b;
+
+    *r = a * b;
+    /*
+     * Only pay for the division when at least one operand is large enough
+     * that the product could exceed the type's range.
+     */
+    if (NPY_UNLIKELY(any_bits >= half_sz)) {
+        if (a != 0 && b > NPY_MAX_ULONG / a) {
+            return 1;
+        }
+    }
+    return 0;
+#endif
+}
+
+#line 20
+/*
+ * Multiply two npy_longlong values with overflow detection.
+ *
+ * Writes the (wrapped) product of a and b into *r.
+ * Returns 1 if the exact product is not representable in npy_longlong,
+ * else 0.
+ *
+ * Negative operands are supported.  The fallback path is written so that it
+ * reports overflow for exactly the inputs whose mathematical product lies
+ * outside [NPY_MIN_LONGLONG, NPY_MAX_LONGLONG], including the
+ * NPY_MIN_LONGLONG edge cases.
+ */
+static inline int
+npy_mul_with_overflow_longlong(npy_longlong * r, npy_longlong a, npy_longlong b)
+{
+#ifdef HAVE___BUILTIN_MUL_OVERFLOW
+    return __builtin_mul_overflow(a, b, r);
+#else
+    const npy_longlong half_sz = ((npy_longlong)1 << ((sizeof(a) * 8 - 1 ) / 2));
+
+    /*
+     * Multiply in the unsigned domain: signed overflow is undefined
+     * behaviour, whereas unsigned arithmetic wraps.  The conversion back is
+     * implementation-defined (it wraps on two's complement implementations).
+     */
+    *r = (npy_longlong)((npy_ulonglong)a * (npy_ulonglong)b);
+    /*
+     * avoid expensive division on common no overflow case: two small
+     * non-negative operands can never overflow
+     */
+    if (!(NPY_UNLIKELY((a | b) >= half_sz) || (a | b) < 0)) {
+        return 0;
+    }
+    /*
+     * Exact range check per sign combination.  Every divisor below is
+     * strictly positive, or strictly negative with a dividend of
+     * NPY_MAX_LONGLONG, so no division can trap or overflow.
+     */
+    if (a > 0) {
+        if (b > 0) {
+            return a > NPY_MAX_LONGLONG / b;
+        }
+        return b < NPY_MIN_LONGLONG / a;
+    }
+    if (b > 0) {
+        return a < NPY_MIN_LONGLONG / b;
+    }
+    /* both operands are <= 0 here, the product is non-negative */
+    return a != 0 && b < NPY_MAX_LONGLONG / a;
+#endif
+}
+
+#line 20
+/*
+ * Unsigned multiply with overflow detection for npy_ulonglong.
+ *
+ * Stores a * b (with the usual unsigned wrap-around) in *r.
+ * Returns 1 when the exact product does not fit in npy_ulonglong,
+ * otherwise 0.
+ */
+static inline int
+npy_mul_with_overflow_ulonglong(npy_ulonglong * r, npy_ulonglong a, npy_ulonglong b)
+{
+#ifdef HAVE___BUILTIN_MUL_OVERFLOW
+    return __builtin_mul_overflow(a, b, r);
+#else
+    const npy_ulonglong half_sz = ((npy_ulonglong)1 << ((sizeof(a) * 8 - 1 ) / 2));
+    const npy_ulonglong any_bits = a | b;
+
+    *r = a * b;
+    /*
+     * Only pay for the division when at least one operand is large enough
+     * that the product could exceed the type's range.
+     */
+    if (NPY_UNLIKELY(any_bits >= half_sz)) {
+        if (a != 0 && b > NPY_MAX_ULONGLONG / a) {
+            return 1;
+        }
+    }
+    return 0;
+#endif
+}
+
+
+/*
+ * Multiply two non-negative sizes with overflow detection.
+ *
+ * Writes the (wrapped) product of a and b into *r.
+ * Returns 1 if the exact product does not fit in npy_intp, else 0.
+ *
+ * The fallback path asserts that both operands are non-negative.
+ */
+static inline int
+npy_mul_sizes_with_overflow (npy_intp * r, npy_intp a, npy_intp b)
+{
+#ifdef HAVE___BUILTIN_MUL_OVERFLOW
+    return __builtin_mul_overflow(a, b, r);
+#else
+    const npy_intp half_sz = ((npy_intp)1 << ((sizeof(a) * 8 - 1 ) / 2));
+
+    /* this function only supports non-negative numbers */
+    assert(a >= 0 && b >= 0);
+
+    /*
+     * Multiply in the unsigned domain: a signed product that overflows is
+     * undefined behaviour, which is exactly the case this function must
+     * survive in order to report it.
+     */
+    *r = (npy_intp)((npy_uintp)a * (npy_uintp)b);
+    /*
+     * avoid expensive division on common no overflow case
+     */
+    if (NPY_UNLIKELY((a | b) >= half_sz)
+        && a != 0 && b > NPY_MAX_INTP / a) {
+        return 1;
+    }
+    return 0;
+#endif
+}
+
+#endif
+
diff --git a/numpy/core/src/_generated/_multiarray_tests.c b/numpy/core/src/_generated/_multiarray_tests.c
new file mode 100644
index 000000000000..44569b6181b5
--- /dev/null
+++ b/numpy/core/src/_generated/_multiarray_tests.c
@@ -0,0 +1,3383 @@
+#line 1 "numpy/core/src/multiarray/_multiarray_tests.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/* -*-c-*- */
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _NPY_NO_DEPRECATIONS /* for NPY_CHAR */
+#include "numpy/arrayobject.h"
+#include "numpy/arrayscalars.h"
+#include "numpy/npy_math.h"
+#include "numpy/halffloat.h"
+#include "common.h"
+#include "npy_argparse.h"
+#include "mem_overlap.h"
+#include "npy_extint128.h"
+#include "array_method.h"
+#include "npy_hashtable.h"
+#include "dtypemeta.h"
+
+#if defined(MS_WIN32) || defined(__CYGWIN__)
+#define EXPORT(x) __declspec(dllexport) x
+#else
+#define EXPORT(x) x
+#endif
+
+#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0]))
+
+
+/*
+ * Example/test entry point for npy_parse_arguments using the fastcall-style
+ * (args, len_args, kwnames) calling convention.
+ *
+ * Parses one converted integer plus three raw objects and returns None;
+ * the parsed values are not used.  Returns NULL if parsing fails.
+ */
+static PyObject *
+argparse_example_function(PyObject *NPY_UNUSED(mod),
+        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+    NPY_PREPARE_ARGPARSER;
+    int arg1;
+    PyObject *arg2, *arg3, *arg4;
+    /*
+     * NOTE(review): the "|" and "$" name prefixes presumably mark optional
+     * and keyword-only arguments, and the last two specs both use the name
+     * "arg3" - confirm against npy_argparse.h.
+     */
+    if (npy_parse_arguments("func", args, len_args, kwnames,
+            "", &PyArray_PythonPyIntFromInt, &arg1,
+            "arg2", NULL, &arg2,
+            "|arg3", NULL, &arg3,
+            "$arg3", NULL, &arg4,
+            NULL, NULL, NULL) < 0) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+/*
+ * Expose PyArray_IsPythonScalar to Python: returns True or False for the
+ * single argument.  Kept ahead of the private py3 compat header include so
+ * the macro is tested without it.
+ */
+static PyObject *
+IsPythonScalar(PyObject * dummy, PyObject *args)
+{
+    PyObject *candidate = NULL;
+
+    if (!PyArg_ParseTuple(args, "O", &candidate)) {
+        return NULL;
+    }
+    if (!PyArray_IsPythonScalar(candidate)) {
+        Py_RETURN_FALSE;
+    }
+    Py_RETURN_TRUE;
+}
+
+#include "npy_pycompat.h"
+
+
+/**
+ * Function to test calling via ctypes.
+ *
+ * Identity function: returns the pointer it was given, unchanged.
+ */
+EXPORT(void*) forward_pointer(void *x)
+{
+    return x;
+}
+
+/*
+ * TODO:
+ *  - Handle mode
+ */
+
+#line 81
+/*
+ * For each remaining point of itx (from itx->index up to itx->size), copy
+ * the neighborhood seen through niterx into a new NPY_DOUBLE array and
+ * append that array to the list *out.
+ *
+ * bounds holds a (lower, upper) pair per dimension; each output dimension
+ * is upper - lower + 1.
+ *
+ * Returns 0 on success and -1 if an array could not be allocated or could
+ * not be appended to the list.
+ */
+static int copy_double(PyArrayIterObject *itx, PyArrayNeighborhoodIterObject *niterx,
+        npy_intp const *bounds,
+        PyObject **out)
+{
+    npy_intp i, j;
+    npy_double *ptr;
+    npy_intp odims[NPY_MAXDIMS];
+    PyArrayObject *aout;
+    int st;
+
+    for (i = itx->index; i < itx->size; ++i) {
+        PyArrayNeighborhoodIter_Reset(niterx);
+
+        for (j = 0; j < PyArray_NDIM(itx->ao); ++j) {
+            odims[j] = bounds[2 * j + 1] - bounds[2 * j] + 1;
+        }
+        aout = (PyArrayObject*)PyArray_SimpleNew(
+                                PyArray_NDIM(itx->ao), odims, NPY_DOUBLE);
+        if (aout == NULL) {
+            return -1;
+        }
+
+        ptr = (npy_double*)PyArray_DATA(aout);
+
+        for (j = 0; j < niterx->size; ++j) {
+            *ptr = *((npy_double*)niterx->dataptr);
+            PyArrayNeighborhoodIter_Next(niterx);
+            ptr += 1;
+        }
+
+        st = PyList_Append(*out, (PyObject*)aout);
+        /* on success the list owns its own reference; drop ours either way */
+        Py_DECREF(aout);
+        if (st < 0) {
+            return -1;
+        }
+        PyArray_ITER_NEXT(itx);
+    }
+
+    return 0;
+}
+
+#line 81
+/*
+ * For each remaining point of itx (from itx->index up to itx->size), copy
+ * the neighborhood seen through niterx into a new NPY_INT array and append
+ * that array to the list *out.
+ *
+ * bounds holds a (lower, upper) pair per dimension; each output dimension
+ * is upper - lower + 1.
+ *
+ * Returns 0 on success and -1 if an array could not be allocated or could
+ * not be appended to the list.
+ */
+static int copy_int(PyArrayIterObject *itx, PyArrayNeighborhoodIterObject *niterx,
+        npy_intp const *bounds,
+        PyObject **out)
+{
+    npy_intp i, j;
+    npy_int *ptr;
+    npy_intp odims[NPY_MAXDIMS];
+    PyArrayObject *aout;
+    int st;
+
+    for (i = itx->index; i < itx->size; ++i) {
+        PyArrayNeighborhoodIter_Reset(niterx);
+
+        for (j = 0; j < PyArray_NDIM(itx->ao); ++j) {
+            odims[j] = bounds[2 * j + 1] - bounds[2 * j] + 1;
+        }
+        aout = (PyArrayObject*)PyArray_SimpleNew(
+                                PyArray_NDIM(itx->ao), odims, NPY_INT);
+        if (aout == NULL) {
+            return -1;
+        }
+
+        ptr = (npy_int*)PyArray_DATA(aout);
+
+        for (j = 0; j < niterx->size; ++j) {
+            *ptr = *((npy_int*)niterx->dataptr);
+            PyArrayNeighborhoodIter_Next(niterx);
+            ptr += 1;
+        }
+
+        st = PyList_Append(*out, (PyObject*)aout);
+        /* on success the list owns its own reference; drop ours either way */
+        Py_DECREF(aout);
+        if (st < 0) {
+            return -1;
+        }
+        PyArray_ITER_NEXT(itx);
+    }
+
+    return 0;
+}
+
+
+/*
+ * Object-dtype counterpart of copy_double/copy_int: for each remaining
+ * point of itx, copy the neighborhood seen through niterx into a new
+ * NPY_OBJECT array using the source dtype's copyswap function, and append
+ * that array to the list *out.
+ *
+ * Returns 0 on success and -1 if an array could not be allocated or could
+ * not be appended to the list.
+ */
+static int copy_object(PyArrayIterObject *itx, PyArrayNeighborhoodIterObject *niterx,
+        npy_intp const *bounds,
+        PyObject **out)
+{
+    npy_intp i, j;
+    npy_intp odims[NPY_MAXDIMS];
+    PyArrayObject *aout;
+    PyArray_CopySwapFunc *copyswap = PyArray_DESCR(itx->ao)->f->copyswap;
+    npy_int itemsize = PyArray_ITEMSIZE(itx->ao);
+    int st;
+
+    for (i = itx->index; i < itx->size; ++i) {
+        PyArrayNeighborhoodIter_Reset(niterx);
+
+        for (j = 0; j < PyArray_NDIM(itx->ao); ++j) {
+            odims[j] = bounds[2 * j + 1] - bounds[2 * j] + 1;
+        }
+        aout = (PyArrayObject*)PyArray_SimpleNew(PyArray_NDIM(itx->ao), odims, NPY_OBJECT);
+        if (aout == NULL) {
+            return -1;
+        }
+
+        /* element j of the output lives at byte offset j * itemsize */
+        for (j = 0; j < niterx->size; ++j) {
+            copyswap(PyArray_BYTES(aout) + j * itemsize, niterx->dataptr, 0, NULL);
+            PyArrayNeighborhoodIter_Next(niterx);
+        }
+
+        st = PyList_Append(*out, (PyObject*)aout);
+        /* on success the list owns its own reference; drop ours either way */
+        Py_DECREF(aout);
+        if (st < 0) {
+            return -1;
+        }
+        PyArray_ITER_NEXT(itx);
+    }
+
+    return 0;
+}
+
+/*
+ * Python-callable test helper for the neighborhood iterator.
+ *
+ * Arguments: (x, bounds, fill, mode[, start_index])
+ *   x           - array-like input, converted to the common type of x/fill
+ *   bounds      - sequence of 2 * ndim integers, a (lower, upper) pair per
+ *                 dimension
+ *   fill        - fill value, used only for constant-padding mode
+ *   mode        - neighborhood iterator mode constant
+ *   start_index - optional flat index to start iterating from (default 0)
+ *
+ * Returns a list with one array per visited point, each holding that
+ * point's neighborhood.  Only object, int and double types are supported.
+ * Returns NULL with an exception set on error.
+ */
+static PyObject*
+test_neighborhood_iterator(PyObject* NPY_UNUSED(self), PyObject* args)
+{
+    PyObject *x, *fill, *out, *b;
+    PyArrayObject *ax, *afill;
+    PyArrayIterObject *itx;
+    int i, typenum, mode, st;
+    Py_ssize_t idxstart = 0;
+    npy_intp bounds[NPY_MAXDIMS*2];
+    PyArrayNeighborhoodIterObject *niterx;
+
+    if (!PyArg_ParseTuple(args, "OOOi|n", &x, &b, &fill, &mode, &idxstart)) {
+        return NULL;
+    }
+
+    if (!PySequence_Check(b)) {
+        /* returning NULL without an exception would raise SystemError */
+        PyErr_SetString(PyExc_TypeError, "bounds must be a sequence");
+        return NULL;
+    }
+
+    typenum = PyArray_ObjectType(x, NPY_NOTYPE);
+    if (typenum == NPY_NOTYPE) {
+        return NULL;
+    }
+    typenum = PyArray_ObjectType(fill, typenum);
+    if (typenum == NPY_NOTYPE) {
+        return NULL;
+    }
+
+    ax = (PyArrayObject*)PyArray_FromObject(x, typenum, 1, 10);
+    if (ax == NULL) {
+        return NULL;
+    }
+    if (PySequence_Size(b) != 2 * PyArray_NDIM(ax)) {
+        PyErr_SetString(PyExc_ValueError,
+                "bounds sequence size not compatible with x input");
+        goto clean_ax;
+    }
+
+    out = PyList_New(0);
+    if (out == NULL) {
+        goto clean_ax;
+    }
+
+    /*
+     * Iterate over the converted array, not the raw input: the copy_*
+     * helpers below read elements as `typenum`, which x itself need not be.
+     */
+    itx = (PyArrayIterObject*)PyArray_IterNew((PyObject *)ax);
+    if (itx == NULL) {
+        goto clean_out;
+    }
+
+    /* Compute boundaries for the neighborhood iterator */
+    for (i = 0; i < 2 * PyArray_NDIM(ax); ++i) {
+        PyObject* bound;
+
+        bound = PySequence_GetItem(b, i);
+        if (bound == NULL) {
+            goto clean_itx;
+        }
+        /* PyLong_AsSsize checks for PyLong */
+        bounds[i] = PyLong_AsSsize_t(bound);
+        if (error_converting(bounds[i])) {
+            PyErr_Clear();
+            PyErr_SetString(PyExc_ValueError,
+                    "bound is invalid");
+            Py_DECREF(bound);
+            goto clean_itx;
+        }
+        Py_DECREF(bound);
+    }
+
+    /* Create the neighborhood iterator */
+    afill = NULL;
+    if (mode == NPY_NEIGHBORHOOD_ITER_CONSTANT_PADDING) {
+        afill = (PyArrayObject *)PyArray_FromObject(fill, typenum, 0, 0);
+        if (afill == NULL) {
+            goto clean_itx;
+        }
+    }
+
+    if (idxstart >= itx->size) {
+        PyErr_SetString(PyExc_ValueError,
+                "start index not compatible with x input");
+        /* afill may already be allocated at this point */
+        goto clean_afill;
+    }
+
+    niterx = (PyArrayNeighborhoodIterObject*)PyArray_NeighborhoodIterNew(
+                    (PyArrayIterObject*)itx, bounds, mode, afill);
+    if (niterx == NULL) {
+        goto clean_afill;
+    }
+
+    PyArray_ITER_GOTO1D((PyArrayIterObject*)itx, idxstart);
+
+    switch (typenum) {
+        case NPY_OBJECT:
+            st = copy_object(itx, niterx, bounds, &out);
+            break;
+        case NPY_INT:
+            st = copy_int(itx, niterx, bounds, &out);
+            break;
+        case NPY_DOUBLE:
+            st = copy_double(itx, niterx, bounds, &out);
+            break;
+        default:
+            PyErr_SetString(PyExc_ValueError,
+                    "Type not supported");
+            goto clean_niterx;
+    }
+
+    if (st) {
+        goto clean_niterx;
+    }
+
+    Py_DECREF(niterx);
+    Py_XDECREF(afill);
+    Py_DECREF(itx);
+
+    Py_DECREF(ax);
+
+    return out;
+
+    /* error exits: each label releases one resource and falls through */
+clean_niterx:
+    Py_DECREF(niterx);
+clean_afill:
+    Py_XDECREF(afill);
+clean_itx:
+    Py_DECREF(itx);
+clean_out:
+    Py_DECREF(out);
+clean_ax:
+    Py_DECREF(ax);
+    return NULL;
+}
+
+/*
+ * Nested-neighborhood copy: itx is itself a neighborhood iterator and
+ * niterx is a neighborhood iterator stacked on top of it.  For each point
+ * of itx, the neighborhood seen through niterx is copied into a new
+ * NPY_DOUBLE array that is appended to the list *out.
+ *
+ * Returns 0 on success and -1 if an array could not be allocated or could
+ * not be appended to the list.
+ */
+static int
+copy_double_double(PyArrayNeighborhoodIterObject *itx,
+        PyArrayNeighborhoodIterObject *niterx,
+        npy_intp const *bounds,
+        PyObject **out)
+{
+    npy_intp i, j;
+    double *ptr;
+    npy_intp odims[NPY_MAXDIMS];
+    PyArrayObject *aout;
+    int st;
+
+    PyArrayNeighborhoodIter_Reset(itx);
+    for (i = 0; i < itx->size; ++i) {
+        for (j = 0; j < PyArray_NDIM(itx->ao); ++j) {
+            odims[j] = bounds[2 * j + 1] - bounds[2 * j] + 1;
+        }
+        aout = (PyArrayObject*)PyArray_SimpleNew(
+                            PyArray_NDIM(itx->ao), odims, NPY_DOUBLE);
+        if (aout == NULL) {
+            return -1;
+        }
+
+        ptr = (double*)PyArray_DATA(aout);
+
+        PyArrayNeighborhoodIter_Reset(niterx);
+        for (j = 0; j < niterx->size; ++j) {
+            *ptr = *((double*)niterx->dataptr);
+            ptr += 1;
+            PyArrayNeighborhoodIter_Next(niterx);
+        }
+        st = PyList_Append(*out, (PyObject*)aout);
+        /* on success the list owns its own reference; drop ours either way */
+        Py_DECREF(aout);
+        if (st < 0) {
+            return -1;
+        }
+        PyArrayNeighborhoodIter_Next(itx);
+    }
+    return 0;
+}
+
+/*
+ * Python-callable test helper for stacked neighborhood iterators.
+ *
+ * Arguments: (x, bounds1, mode1, bounds2, mode2)
+ *   x        - array-like input
+ *   bounds1  - 2 * ndim integers for the first neighborhood iterator
+ *   mode1    - mode constant of the first neighborhood iterator
+ *   bounds2  - 2 * ndim integers for the second iterator, which is built
+ *              on top of the first one
+ *   mode2    - mode constant of the second neighborhood iterator
+ *
+ * Returns a list of arrays produced by copy_double_double.  Only double
+ * input is supported.  Returns NULL with an exception set on error.
+ */
+static PyObject*
+test_neighborhood_iterator_oob(PyObject* NPY_UNUSED(self), PyObject* args)
+{
+    PyObject *x, *out, *b1, *b2;
+    PyArrayObject *ax;
+    PyArrayIterObject *itx;
+    int i, typenum, mode1, mode2, st;
+    npy_intp bounds[NPY_MAXDIMS*2];
+    PyArrayNeighborhoodIterObject *niterx1, *niterx2;
+
+    if (!PyArg_ParseTuple(args, "OOiOi", &x, &b1, &mode1, &b2, &mode2)) {
+        return NULL;
+    }
+
+    if (!PySequence_Check(b1) || !PySequence_Check(b2)) {
+        /* returning NULL without an exception would raise SystemError */
+        PyErr_SetString(PyExc_TypeError, "bounds must be sequences");
+        return NULL;
+    }
+
+    typenum = PyArray_ObjectType(x, NPY_NOTYPE);
+    if (typenum == NPY_NOTYPE) {
+        return NULL;
+    }
+
+    ax = (PyArrayObject*)PyArray_FromObject(x, typenum, 1, 10);
+    if (ax == NULL) {
+        return NULL;
+    }
+    if (PySequence_Size(b1) != 2 * PyArray_NDIM(ax)) {
+        PyErr_SetString(PyExc_ValueError,
+                "bounds sequence 1 size not compatible with x input");
+        goto clean_ax;
+    }
+    if (PySequence_Size(b2) != 2 * PyArray_NDIM(ax)) {
+        PyErr_SetString(PyExc_ValueError,
+                "bounds sequence 2 size not compatible with x input");
+        goto clean_ax;
+    }
+
+    out = PyList_New(0);
+    if (out == NULL) {
+        goto clean_ax;
+    }
+
+    /* iterate over the converted array rather than the raw input object */
+    itx = (PyArrayIterObject*)PyArray_IterNew((PyObject *)ax);
+    if (itx == NULL) {
+        goto clean_out;
+    }
+
+    /* Compute boundaries for the neighborhood iterator */
+    for (i = 0; i < 2 * PyArray_NDIM(ax); ++i) {
+        PyObject* bound;
+
+        bound = PySequence_GetItem(b1, i);
+        if (bound == NULL) {
+            goto clean_itx;
+        }
+        /* PyLong_AsSsize checks for PyLong */
+        bounds[i] = PyLong_AsSsize_t(bound);
+        if (error_converting(bounds[i])) {
+            PyErr_Clear();
+            PyErr_SetString(PyExc_ValueError,
+                    "bound is invalid");
+            Py_DECREF(bound);
+            goto clean_itx;
+        }
+        Py_DECREF(bound);
+    }
+
+    /* Create the neighborhood iterator */
+    niterx1 = (PyArrayNeighborhoodIterObject*)PyArray_NeighborhoodIterNew(
+                    (PyArrayIterObject*)itx, bounds,
+                    mode1, NULL);
+    if (niterx1 == NULL) {
+        /* itx is live here and must be released as well */
+        goto clean_itx;
+    }
+
+    /* bounds is reused for the second (stacked) iterator */
+    for (i = 0; i < 2 * PyArray_NDIM(ax); ++i) {
+        PyObject* bound;
+
+        bound = PySequence_GetItem(b2, i);
+        if (bound == NULL) {
+            goto clean_niterx1;
+        }
+        /* PyLong_AsSsize checks for PyLong */
+        bounds[i] = PyLong_AsSsize_t(bound);
+        if (error_converting(bounds[i])) {
+            PyErr_Clear();
+            PyErr_SetString(PyExc_ValueError,
+                    "bound is invalid");
+            Py_DECREF(bound);
+            goto clean_niterx1;
+        }
+        Py_DECREF(bound);
+    }
+
+    niterx2 = (PyArrayNeighborhoodIterObject*)PyArray_NeighborhoodIterNew(
+                    (PyArrayIterObject*)niterx1, bounds,
+                    mode2, NULL);
+    if (niterx2 == NULL) {
+        goto clean_niterx1;
+    }
+
+    switch (typenum) {
+        case NPY_DOUBLE:
+            st = copy_double_double(niterx1, niterx2, bounds, &out);
+            break;
+        default:
+            PyErr_SetString(PyExc_ValueError,
+                    "Type not supported");
+            goto clean_niterx2;
+    }
+
+    if (st) {
+        goto clean_niterx2;
+    }
+
+    Py_DECREF(niterx2);
+    Py_DECREF(niterx1);
+    Py_DECREF(itx);
+    Py_DECREF(ax);
+    return out;
+
+    /* error exits: each label releases one resource and falls through */
+clean_niterx2:
+    Py_DECREF(niterx2);
+clean_niterx1:
+    Py_DECREF(niterx1);
+clean_itx:
+    Py_DECREF(itx);
+clean_out:
+    Py_DECREF(out);
+clean_ax:
+    Py_DECREF(ax);
+    return NULL;
+}
+
+/* PyDataMem_SetHook tests */
+/* [0] counts allocations, [1] counts frees observed by test_hook */
+static int malloc_free_counts[2];
+/* hook and user data that were installed before the test hook */
+static PyDataMem_EventHookFunc *old_hook = NULL;
+static void *old_data;
+
+/*
+ * Event hook used by the tests: user_data points at a two-element int
+ * array whose first slot is bumped when there is no old pointer and whose
+ * second slot is bumped when the reported size is zero.
+ */
+static void test_hook(void *old, void *new, size_t size, void *user_data)
+{
+    int *tally = (int *) user_data;
+
+    if (!old) {
+        ++tally[0]; /* malloc counter */
+    }
+    if (!size) {
+        ++tally[1]; /* free counter */
+    }
+}
+
+/*
+ * Zero both counters and install test_hook, remembering the previously
+ * installed hook and its user data so they can be restored later.
+ */
+static PyObject*
+test_pydatamem_seteventhook_start(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
+{
+    malloc_free_counts[1] = 0;
+    malloc_free_counts[0] = 0;
+
+    old_hook = PyDataMem_SetEventHook(
+            test_hook, (void *) malloc_free_counts, &old_data);
+
+    Py_RETURN_NONE;
+}
+
+/*
+ * Restore the hook saved by test_pydatamem_seteventhook_start and verify
+ * that the hook being replaced was test_hook with our counters, and that
+ * it saw at least one allocation and one free.  Raises ValueError
+ * otherwise.
+ */
+static PyObject*
+test_pydatamem_seteventhook_end(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
+{
+    void *seen_data;
+    PyDataMem_EventHookFunc *seen_hook;
+    const char *problem = NULL;
+
+    seen_hook = PyDataMem_SetEventHook(old_hook, old_data, &seen_data);
+
+    /* first failing check wins, in the same order as before */
+    if (seen_hook != test_hook || seen_data != (void *) malloc_free_counts) {
+        problem = "hook/data was not the expected test hook";
+    }
+    else if (malloc_free_counts[0] == 0) {
+        problem = "malloc count is zero after test";
+    }
+    else if (malloc_free_counts[1] == 0) {
+        problem = "free count is zero after test";
+    }
+
+    if (problem != NULL) {
+        PyErr_SetString(PyExc_ValueError, problem);
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+
+/* Signature of an in-place binary operation driven by a map iterator. */
+typedef void (*inplace_map_binop)(PyArrayMapIterObject *, PyArrayIterObject *);
+
+/*
+ * Add the float64 value at the current position of `it` onto the element
+ * at the current position of `mit`, then advance both; repeated
+ * mit->size times.
+ */
+static void npy_float64_inplace_add(PyArrayMapIterObject *mit, PyArrayIterObject *it)
+{
+    /* mit->size is an npy_intp; an int counter would truncate large sizes */
+    npy_intp index = mit->size;
+    while (index--) {
+        ((npy_float64*)mit->dataptr)[0] = ((npy_float64*)mit->dataptr)[0] + ((npy_float64*)it->dataptr)[0];
+
+        PyArray_MapIterNext(mit);
+        PyArray_ITER_NEXT(it);
+    }
+}
+
+/*
+ * Parallel lookup tables used by inplace_increment: addition_funcs[i] is
+ * the in-place add routine for arrays whose type number is
+ * type_numbers[i].  The tables are terminated by a NULL function pointer
+ * and a negative type number respectively.
+ */
+inplace_map_binop addition_funcs[] = {
+npy_float64_inplace_add,
+NULL};
+
+int type_numbers[] = {
+NPY_FLOAT64,
+-1000};
+
+
+
+/*
+ * Apply add_inplace to the elements selected by the map iterator `mit`,
+ * using `op` (force-cast to the dtype of the indexed array and broadcast
+ * to the iterator's shape) as the values to add.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+map_increment(PyArrayMapIterObject *mit, PyObject *op, inplace_map_binop add_inplace)
+{
+    PyArrayObject *values = NULL;
+    PyArrayIterObject *value_iter;
+    PyArray_Descr *target_descr;
+
+    if (mit->ait == NULL) {
+        return -1;
+    }
+
+    /* extra reference for the descriptor handed to PyArray_FromAny */
+    target_descr = PyArray_DESCR(mit->ait->ao);
+    Py_INCREF(target_descr);
+    values = (PyArrayObject *)PyArray_FromAny(
+            op, target_descr, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (values == NULL) {
+        return -1;
+    }
+
+    if (mit->subspace != NULL && mit->consec) {
+        PyArray_MapIterSwapAxes(mit, &values, 0);
+        if (values == NULL) {
+            return -1;
+        }
+    }
+
+    value_iter = (PyArrayIterObject *)PyArray_BroadcastToShape(
+            (PyObject *)values, mit->dimensions, mit->nd);
+    if (value_iter == NULL) {
+        Py_DECREF(values);
+        return -1;
+    }
+
+    add_inplace(mit, value_iter);
+
+    Py_DECREF(values);
+    Py_DECREF(value_iter);
+    return 0;
+}
+
+
+/*
+ * Python-callable: inplace_increment(a, index, inc).
+ *
+ * Adds `inc` onto a[index] in place through a map iterator.  `a` must be
+ * a writeable ndarray with at least one dimension whose type has an entry
+ * in the type_numbers/addition_funcs tables.  Returns None, or NULL with
+ * an exception set.
+ */
+static PyObject *
+inplace_increment(PyObject *dummy, PyObject *args)
+{
+    PyObject *arg_a = NULL, *index = NULL, *inc = NULL;
+    PyArrayObject *target;
+    PyArrayMapIterObject *mapiter;
+    inplace_map_binop op_func = NULL;
+    int wanted_type;
+    int slot;
+
+    if (!PyArg_ParseTuple(args, "OOO", &arg_a, &index, &inc)) {
+        return NULL;
+    }
+    if (!PyArray_Check(arg_a)) {
+        PyErr_SetString(PyExc_ValueError, "needs an ndarray as first argument");
+        return NULL;
+    }
+    target = (PyArrayObject *) arg_a;
+
+    if (PyArray_FailUnlessWriteable(target, "input/output array") < 0) {
+        return NULL;
+    }
+    if (PyArray_NDIM(target) == 0) {
+        PyErr_SetString(PyExc_IndexError, "0-d arrays can't be indexed.");
+        return NULL;
+    }
+
+    /* look up the add routine registered for this array type */
+    wanted_type = PyArray_TYPE(target);
+    for (slot = 0;
+         type_numbers[slot] >= 0 && addition_funcs[slot] != NULL;
+         slot++) {
+        if (type_numbers[slot] == wanted_type) {
+            op_func = addition_funcs[slot];
+            break;
+        }
+    }
+    if (op_func == NULL) {
+        PyErr_SetString(PyExc_TypeError, "unsupported type for a");
+        return NULL;
+    }
+
+    mapiter = (PyArrayMapIterObject *) PyArray_MapIterArray(target, index);
+    if (mapiter == NULL) {
+        return NULL;
+    }
+    if (map_increment(mapiter, inc, op_func) != 0) {
+        Py_DECREF(mapiter);
+        return NULL;
+    }
+
+    Py_DECREF(mapiter);
+    Py_RETURN_NONE;
+}
+
+/*
+ * Test helper for PyArray_FromString on 0-terminated input: the C-API
+ * accepts -1 as the length, which is what gets passed here together with
+ * a " " separator.
+ */
+static PyObject *
+fromstring_null_term_c_api(PyObject *dummy, PyObject *byte_obj)
+{
+    char *text = PyBytes_AsString(byte_obj);
+
+    if (text != NULL) {
+        return PyArray_FromString(text, -1, NULL, -1, " ");
+    }
+    return NULL;
+}
+
+
+/*
+ * Create a custom field dtype from an existing void one (and test some errors).
+ * The dtypes created by this function may be not be usable (or even crash
+ * while using).
+ *
+ * Arguments: (dtype, scalar_type, error_path)
+ *   error_path == 0  register the copied dtype normally
+ *   error_path == 1  clear `fields` first, so registration should fail
+ *   error_path == 2  replace the dtype's Python type first, so
+ *                    registration should fail
+ *   anything else    ValueError
+ *
+ * Returns the newly registered dtype, or NULL with an exception set.
+ */
+static PyObject *
+create_custom_field_dtype(PyObject *NPY_UNUSED(mod), PyObject *args)
+{
+    PyArray_Descr *dtype;
+    PyTypeObject *scalar_type;
+    PyTypeObject *original_type = NULL;
+    int error_path;
+
+    if (!PyArg_ParseTuple(args, "O!O!i",
+            &PyArrayDescr_Type, &dtype,
+            &PyType_Type, &scalar_type,
+            &error_path)) {
+        return NULL;
+    }
+    /* check that the result should be more or less valid */
+    if (dtype->type_num != NPY_VOID || dtype->fields == NULL ||
+            !PyDict_CheckExact(dtype->fields) ||
+            PyTuple_Size(dtype->names) != 1 ||
+            !PyDataType_REFCHK(dtype) ||
+            dtype->elsize != sizeof(PyObject *)) {
+        PyErr_SetString(PyExc_ValueError,
+                "Bad dtype passed to test function, must be an object "
+                "containing void with a single field.");
+        return NULL;
+    }
+
+    /* Copy and then appropriate this dtype */
+    original_type = Py_TYPE(dtype);
+    dtype = PyArray_DescrNew(dtype);
+    if (dtype == NULL) {
+        return NULL;
+    }
+
+    Py_INCREF(scalar_type);
+    Py_SETREF(dtype->typeobj, scalar_type);
+    if (error_path == 1) {
+        /* Test that we reject this, if fields was not already set */
+        Py_SETREF(dtype->fields, NULL);
+    }
+    else if (error_path == 2) {
+        /*
+         * Test that we reject this if the type is not set to something that
+         * we are pretty sure can be safely replaced.
+         */
+        Py_SET_TYPE(dtype, scalar_type);
+    }
+    else if (error_path != 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "invalid error argument to test function.");
+        /* do not go on to register a dtype with an exception pending */
+        Py_DECREF(dtype);
+        return NULL;
+    }
+    if (PyArray_RegisterDataType(dtype) < 0) {
+        /* Fix original type in the error_path == 2 case and delete it */
+        Py_SET_TYPE(dtype, original_type);
+        Py_DECREF(dtype);
+        return NULL;
+    }
+    Py_INCREF(dtype);  /* hold on to the original (leaks a reference) */
+    return (PyObject *)dtype;
+}
+
+
+/*
+ * Toggle the private _buffer_info slot of an array or void scalar between
+ * NULL and a deliberately invalid value (the object's own address).
+ *
+ * A NULL slot is corrupted, a slot that already holds the object is reset
+ * to NULL, and any other value raises TypeError because a buffer has
+ * already been exported.  Returns None on success.
+ */
+PyObject *
+corrupt_or_fix_bufferinfo(PyObject *dummy, PyObject *obj)
+{
+    void **slot;
+
+    if (PyArray_Check(obj)) {
+        slot = &((PyArrayObject_fields *)obj)->_buffer_info;
+    }
+    else if (PyArray_IsScalar(obj, Void)) {
+        slot = &((PyVoidScalarObject *)obj)->_buffer_info;
+    }
+    else {
+        PyErr_SetString(PyExc_TypeError,
+                "argument must be an array or void scalar");
+        return NULL;
+    }
+
+    if (*slot == obj) {
+        /* Reset to a NULL (good value) */
+        *slot = NULL;
+        Py_RETURN_NONE;
+    }
+    if (*slot != NULL) {
+        PyErr_SetString(PyExc_TypeError,
+                "buffer was already exported, this test doesn't support that");
+        return NULL;
+    }
+
+    /* set to an invalid value (as a subclass might accidentally) */
+    *slot = obj;
+    assert(((uintptr_t)obj & 7) == 0);
+    Py_RETURN_NONE;
+}
+
+
+/*
+ * check no elision for avoided increfs
+ *
+ * Copies the input array, adds the copy to itself and returns the tuple
+ * (copy, copy + copy).  The copy has refcount 1 but must not be elided,
+ * so the first tuple element should still equal the input.
+ * Returns NULL with an exception set on error.
+ */
+static PyObject *
+incref_elide(PyObject *dummy, PyObject *args)
+{
+    PyObject *arg = NULL, *res, *tup;
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+    if (!PyArray_Check(arg)) {
+        /* the cast below is only valid for ndarrays */
+        PyErr_SetString(PyExc_TypeError, "test needs ndarray input");
+        return NULL;
+    }
+
+    /* refcount 1 array but should not be elided */
+    arg = PyArray_NewCopy((PyArrayObject*)arg, NPY_KEEPORDER);
+    if (arg == NULL) {
+        return NULL;
+    }
+    res = PyNumber_Add(arg, arg);
+    if (res == NULL) {
+        Py_DECREF(arg);
+        return NULL;
+    }
+
+    /* return original copy, should be equal to input */
+    tup = PyTuple_Pack(2, arg, res);
+    Py_DECREF(arg);
+    Py_DECREF(res);
+    return tup;
+}
+
+/*
+ * check no elision for get from list without incref
+ *
+ * Takes a list, fetches item 4 as a borrowed reference and returns
+ * item + item.  Returns NULL with an exception set if the item cannot be
+ * fetched or the addition fails.
+ */
+static PyObject *
+incref_elide_l(PyObject *dummy, PyObject *args)
+{
+    PyObject *arg = NULL, *r, *res;
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+    /* get item without increasing refcount, item may still be on the python
+     * stack but above the inaccessible top */
+    r = PyList_GetItem(arg, 4);
+    if (r == NULL) {
+        /* not a list, or fewer than five items: exception already set */
+        return NULL;
+    }
+    res = PyNumber_Add(r, r);
+
+    return res;
+}
+
+/*
+ * Requests the descriptor for NPY_CHAR and hands it back to Python; used
+ * to test that NPY_CHAR usage emits a deprecation warning.
+ */
+static PyObject*
+npy_char_deprecation(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
+{
+    return (PyObject *)PyArray_DescrFromType(NPY_CHAR);
+}
+
+/*
+ * Calls PyArray_As1D on a throwaway integer object; used to test that
+ * PyArray_As1D usage emits a not implemented error.
+ */
+static PyObject*
+npy_pyarrayas1d_deprecation(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
+{
+    PyObject *op = Py_BuildValue("i", 42);
+    PyObject *result = op;
+    int dim = 4;
+    double arg[2] = {1, 2};
+
+    if (PyArray_As1D(&result, (char **)&arg, &dim, NPY_DOUBLE) < 0) {
+        /* failure: report it by returning NULL below */
+        result = NULL;
+    }
+    /* op != result */
+    Py_DECREF(op);
+    return result;
+}
+
+/*
+ * Calls PyArray_As2D on a throwaway integer object; used to test that
+ * PyArray_As2D usage emits a not implemented error.
+ */
+static PyObject*
+npy_pyarrayas2d_deprecation(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
+{
+    PyObject *op = Py_BuildValue("i", 42);
+    PyObject *result = op;
+    int dim1 = 4;
+    int dim2 = 6;
+    double arg[2][2] = {{1, 2}, {3, 4}};
+
+    if (PyArray_As2D(&result, (char ***)&arg, &dim1, &dim2, NPY_DOUBLE) < 0) {
+        /* failure: report it by returning NULL below */
+        result = NULL;
+    }
+    /* op != result */
+    Py_DECREF(op);
+    return result;
+}
+
+/*
+ * Returns PyArray_FromArray(input) requested with the CARRAY and
+ * WRITEBACKIFCOPY flags; the input must be an ndarray.
+ */
+static PyObject*
+npy_create_writebackifcopy(PyObject* NPY_UNUSED(self), PyObject* args)
+{
+    const int flags = NPY_ARRAY_CARRAY | NPY_ARRAY_WRITEBACKIFCOPY;
+
+    if (!PyArray_Check(args)) {
+        PyErr_SetString(PyExc_TypeError, "test needs ndarray input");
+        return NULL;
+    }
+    /* NULL (with the exception set) is passed straight through */
+    return PyArray_FromArray((PyArrayObject*)args, NULL, flags);
+}
+
+/*
+ * Creates an array with the WRITEBACKIFCOPY flag requested and drops it
+ * without resolving; used to test that this emits a runtime warning.
+ */
+static PyObject*
+npy_abuse_writebackifcopy(PyObject* NPY_UNUSED(self), PyObject* args)
+{
+    const int flags = NPY_ARRAY_CARRAY | NPY_ARRAY_WRITEBACKIFCOPY;
+    PyObject *unresolved;
+
+    if (!PyArray_Check(args)) {
+        PyErr_SetString(PyExc_TypeError, "test needs ndarray input");
+        return NULL;
+    }
+    unresolved = PyArray_FromArray((PyArrayObject*)args, NULL, flags);
+    if (unresolved != NULL) {
+        Py_DECREF(unresolved); /* calls array_dealloc even on PyPy */
+        Py_RETURN_NONE;
+    }
+    return NULL;
+}
+
+/*
+ * Resolve a pending WRITEBACKIFCOPY on the ndarray argument.
+ * Returns None on success; raises TypeError when the argument is not an
+ * ndarray and propagates any error reported by the resolution itself.
+ */
+static PyObject*
+npy_resolve(PyObject* NPY_UNUSED(self), PyObject* args)
+{
+    if (!PyArray_Check(args)) {
+        PyErr_SetString(PyExc_TypeError, "test needs ndarray input");
+        return NULL;
+    }
+    /*
+     * A negative return signals failure with an exception set; returning
+     * None in that case would leave the exception pending.
+     */
+    if (PyArray_ResolveWritebackIfCopy((PyArrayObject*)args) < 0) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+/*
+ * Discard a pending WRITEBACKIFCOPY on the ndarray argument.
+ * Returns None; raises TypeError when the argument is not an ndarray.
+ */
+static PyObject*
+npy_discard(PyObject* NPY_UNUSED(self), PyObject* args)
+{
+    if (PyArray_Check(args)) {
+        PyArray_DiscardWritebackIfCopy((PyArrayObject*)args);
+        Py_RETURN_NONE;
+    }
+    PyErr_SetString(PyExc_TypeError, "test needs ndarray input");
+    return NULL;
+}
+
+/*
+ * Helper for the flag-parsing loop of get_buffer_info: builds a Python
+ * string from the name FLAG and compares it with the current item `tmp`.
+ * On a match, PyBUF_<FLAG> is or'ed into `flags`, `tmp` is released and the
+ * enclosing loop continues with its next item.  If the comparison itself
+ * fails, `tmp` is released and the enclosing function returns NULL.
+ * Otherwise control falls through so that the next flag name can be tried.
+ * Relies on `buf_flag`, `flag_matches`, `tmp` and `flags` being declared in
+ * the scope where the macro is expanded.
+ * NOTE(review): the result of PyUnicode_FromString is used without a NULL
+ * check.
+ */
+#define GET_PYBUF_FLAG(FLAG)                                        \
+    buf_flag = PyUnicode_FromString(#FLAG);                         \
+    flag_matches = PyObject_RichCompareBool(buf_flag, tmp, Py_EQ);  \
+    Py_DECREF(buf_flag);                                            \
+    if (flag_matches == 1) {                                        \
+        Py_DECREF(tmp);                                             \
+        flags |= PyBUF_##FLAG;                                      \
+        continue;                                                   \
+    }                                                               \
+    else if (flag_matches == -1) {                                  \
+        Py_DECREF(tmp);                                             \
+        return NULL;                                                \
+    }
+
+
+/*
+ * Get information for a buffer through PyObject_GetBuffer with the
+ * corresponding flags or'ed. Note that the python caller has to
+ * make sure that or'ing those flags actually makes sense.
+ * More information should probably be returned for future tests.
+ *
+ * Arguments (as a tuple): the object exporting the buffer and a sequence of
+ * flag names such as "STRIDES" or "ND" (without the PyBUF_ prefix).
+ * Returns a (shape, strides) tuple; each entry is None when the exporter
+ * left the corresponding field NULL, otherwise a tuple of ints.
+ * Raises ValueError for an unknown flag name and propagates any error from
+ * obtaining the buffer or building the result.
+ */
+static PyObject *
+get_buffer_info(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *buffer_obj, *pyflags;
+    PyObject *tmp, *buf_flag;
+    PyObject *item;
+    Py_buffer buffer;
+    PyObject *shape = NULL, *strides = NULL;
+    Py_ssize_t i, n;
+    int flag_matches;
+    int flags = 0;
+
+    if (!PyArg_ParseTuple(args, "OO", &buffer_obj, &pyflags)) {
+        return NULL;
+    }
+
+    n = PySequence_Length(pyflags);
+    if (n < 0) {
+        return NULL;
+    }
+
+    for (i=0; i < n; i++) {
+        tmp = PySequence_GetItem(pyflags, i);
+        if (tmp == NULL) {
+            return NULL;
+        }
+
+        /* Each macro either `continue`s on a match or falls through */
+        GET_PYBUF_FLAG(SIMPLE);
+        GET_PYBUF_FLAG(WRITABLE);
+        GET_PYBUF_FLAG(STRIDES);
+        GET_PYBUF_FLAG(ND);
+        GET_PYBUF_FLAG(C_CONTIGUOUS);
+        GET_PYBUF_FLAG(F_CONTIGUOUS);
+        GET_PYBUF_FLAG(ANY_CONTIGUOUS);
+        GET_PYBUF_FLAG(INDIRECT);
+        GET_PYBUF_FLAG(FORMAT);
+        GET_PYBUF_FLAG(STRIDED);
+        GET_PYBUF_FLAG(STRIDED_RO);
+        GET_PYBUF_FLAG(RECORDS);
+        GET_PYBUF_FLAG(RECORDS_RO);
+        GET_PYBUF_FLAG(FULL);
+        GET_PYBUF_FLAG(FULL_RO);
+        GET_PYBUF_FLAG(CONTIG);
+        GET_PYBUF_FLAG(CONTIG_RO);
+
+        Py_DECREF(tmp);
+
+        /* One of the flags must match */
+        PyErr_SetString(PyExc_ValueError, "invalid flag used.");
+        return NULL;
+    }
+
+    if (PyObject_GetBuffer(buffer_obj, &buffer, flags) < 0) {
+        return NULL;
+    }
+
+    /* From here on the buffer is held and must be released on every path */
+    if (buffer.shape == NULL) {
+        Py_INCREF(Py_None);
+        shape = Py_None;
+    }
+    else {
+        shape = PyTuple_New(buffer.ndim);
+        if (shape == NULL) {
+            goto fail;
+        }
+        for (i=0; i < buffer.ndim; i++) {
+            item = PyLong_FromSsize_t(buffer.shape[i]);
+            if (item == NULL) {
+                goto fail;
+            }
+            PyTuple_SET_ITEM(shape, i, item);
+        }
+    }
+
+    if (buffer.strides == NULL) {
+        Py_INCREF(Py_None);
+        strides = Py_None;
+    }
+    else {
+        strides = PyTuple_New(buffer.ndim);
+        if (strides == NULL) {
+            goto fail;
+        }
+        for (i=0; i < buffer.ndim; i++) {
+            item = PyLong_FromSsize_t(buffer.strides[i]);
+            if (item == NULL) {
+                goto fail;
+            }
+            PyTuple_SET_ITEM(strides, i, item);
+        }
+    }
+
+    PyBuffer_Release(&buffer);
+    /* "N" hands our references to shape and strides over to the result */
+    return Py_BuildValue("(NN)", shape, strides);
+
+  fail:
+    Py_XDECREF(shape);
+    Py_XDECREF(strides);
+    PyBuffer_Release(&buffer);
+    return NULL;
+}
+
+#undef GET_PYBUF_FLAG
+
+/*
+ * Create a new array object around existing C-allocated (dummy) data.
+ * The array does not own that data (and must not free it), yet it has no
+ * base object either because plain C memory is wrapped.  Used to test the
+ * behaviour of setting arr.flags.writeable.
+ *
+ * The truthiness of `arg` selects whether NPY_ARRAY_WRITEABLE is requested.
+ */
+static PyObject*
+get_c_wrapping_array(PyObject* NPY_UNUSED(self), PyObject* arg)
+{
+    npy_intp zero = 0;
+    PyArray_Descr *descr;
+    int flags;
+    int writeable = PyObject_IsTrue(arg);
+
+    if (error_converting(writeable)) {
+        return NULL;
+    }
+
+    if (writeable) {
+        flags = NPY_ARRAY_WRITEABLE;
+    }
+    else {
+        flags = 0;
+    }
+
+    /* Create an empty array (which points to a random place) */
+    descr =  PyArray_DescrNewFromType(NPY_INTP);
+    return PyArray_NewFromDescr(&PyArray_Type, descr,
+                                1, &zero, NULL, &zero, flags, NULL);
+}
+
+
+/*
+ * Collect one dict per registered cast for every non-abstract class returned
+ * by PyArrayDescr_Type.__subclasses__().  Each dict has the keys "from",
+ * "to", "legacy", "casting", "requires_pyapi", "supports_unaligned",
+ * "no_floatingpoint_errors" and "name".
+ * Returns a new list, or NULL with an exception set.
+ */
+static PyObject *
+get_all_cast_information(PyObject *NPY_UNUSED(mod), PyObject *NPY_UNUSED(args))
+{
+    PyObject *result = PyList_New(0);
+    if (result == NULL) {
+        return NULL;
+    }
+    PyObject *classes = PyObject_CallMethod(
+            (PyObject *)&PyArrayDescr_Type, "__subclasses__", "");
+    if (classes == NULL) {
+        goto fail;
+    }
+    /* Swap in a fast sequence so that items can be read via GET_ITEM */
+    Py_SETREF(classes, PySequence_Fast(classes, NULL));
+    if (classes == NULL) {
+        goto fail;
+    }
+
+    Py_ssize_t nclass = PySequence_Length(classes);
+    for (Py_ssize_t idx = 0; idx < nclass; idx++) {
+        PyArray_DTypeMeta *from_dtype =
+                (PyArray_DTypeMeta *)PySequence_Fast_GET_ITEM(classes, idx);
+        if (NPY_DT_is_abstract(from_dtype)) {
+            /*
+             * TODO: In principle probably needs to recursively check this,
+             *       also we may allow casts to abstract dtypes at some point.
+             */
+            continue;
+        }
+
+        PyObject *to_dtype = NULL;
+        PyObject *impl = NULL;
+        Py_ssize_t pos = 0;
+
+        while (PyDict_Next(NPY_DT_SLOTS(from_dtype)->castingimpls,
+                           &pos, &to_dtype, &impl)) {
+            if (impl == Py_None) {
+                /* None entries carry no cast implementation to report */
+                continue;
+            }
+            PyArrayMethodObject *cast = (PyArrayMethodObject *)impl;
+
+            /* Legacy casts are recognised by the "legacy_" name prefix */
+            int is_legacy = (cast->name != NULL &&
+                             strncmp(cast->name, "legacy_", 7) == 0);
+            int casting = cast->casting;
+            int needs_pyapi = cast->flags & NPY_METH_REQUIRES_PYAPI;
+            int unaligned_ok = cast->flags & NPY_METH_SUPPORTS_UNALIGNED;
+            int no_fp_errors = cast->flags & NPY_METH_NO_FLOATINGPOINT_ERRORS;
+
+            /* Pass some information about this cast out! */
+            PyObject *cast_info = Py_BuildValue("{sOsOsisisisisiss}",
+                    "from", from_dtype,
+                    "to", to_dtype,
+                    "legacy", is_legacy,
+                    "casting", casting,
+                    "requires_pyapi", needs_pyapi,
+                    "supports_unaligned", unaligned_ok,
+                    "no_floatingpoint_errors", no_fp_errors,
+                    "name", cast->name);
+            if (cast_info == NULL) {
+                goto fail;
+            }
+            /* The list takes its own reference; ours is dropped either way */
+            int res = PyList_Append(result, cast_info);
+            Py_DECREF(cast_info);
+            if (res < 0) {
+                goto fail;
+            }
+        }
+    }
+    Py_DECREF(classes);
+    return result;
+
+  fail:
+    Py_XDECREF(classes);
+    Py_XDECREF(result);
+    return NULL;
+}
+
+
+/*
+ * Helper to test the identity cache, takes a list of values and adds
+ * all to the cache except the last key/value pair.  The last value is
+ * ignored, instead the last key is looked up.
+ * None is returned, if the key is not found.
+ * The truthiness of `replace` is passed on to PyArrayIdentityHash_SetItem
+ * for every insertion.
+ *
+ * Arguments: `key_len` (int, 1 <= key_len < NPY_MAXARGS), `sequence` (a
+ * sequence of (key_tuple, value) 2-tuples where every key_tuple has exactly
+ * key_len entries) and the optional `replace`.
+ * Raises ValueError for a bad key_len or an empty sequence and TypeError
+ * for malformed entries.
+ */
+static PyObject *
+identityhash_tester(PyObject *NPY_UNUSED(mod),
+        PyObject *const *args, Py_ssize_t len_args, PyObject *kwnames)
+{
+    NPY_PREPARE_ARGPARSER;
+
+    int key_len;
+    int replace;
+    PyObject *replace_obj = Py_False;
+    PyObject *sequence;
+    PyObject *result = NULL;
+
+    if (npy_parse_arguments("identityhash_tester", args, len_args, kwnames,
+            "key_len", &PyArray_PythonPyIntFromInt, &key_len,
+            "sequence", NULL, &sequence,
+            "|replace", NULL, &replace_obj,
+            NULL, NULL, NULL) < 0) {
+        return NULL;
+    }
+    replace = PyObject_IsTrue(replace_obj);
+    if (error_converting(replace)) {
+        return NULL;
+    }
+
+    if (key_len < 1 || key_len >= NPY_MAXARGS) {
+        PyErr_SetString(PyExc_ValueError, "must have 1 to max-args keys.");
+        return NULL;
+    }
+    PyArrayIdentityHash *tb = PyArrayIdentityHash_New(key_len);
+    if (tb == NULL) {
+        return NULL;
+    }
+
+    /*
+     * Replace the sequence with a guaranteed fast-sequence.  From here on
+     * `sequence` is either an owned reference or NULL (on failure).
+     */
+    sequence = PySequence_Fast(sequence, "converting sequence.");
+    if (sequence == NULL) {
+        goto finish;
+    }
+
+    Py_ssize_t length = PySequence_Fast_GET_SIZE(sequence);
+    if (length == 0) {
+        /* Without any entry there is no last key that could be looked up */
+        PyErr_SetString(PyExc_ValueError,
+                "sequence must contain at least one key-value pair.");
+        goto finish;
+    }
+    for (Py_ssize_t i = 0; i < length; i++) {
+        PyObject *key_val = PySequence_Fast_GET_ITEM(sequence, i);
+        if (!PyTuple_CheckExact(key_val) || PyTuple_GET_SIZE(key_val) != 2) {
+            PyErr_SetString(PyExc_TypeError, "bad key-value pair.");
+            goto finish;
+        }
+        PyObject *key = PyTuple_GET_ITEM(key_val, 0);
+        PyObject *value = PyTuple_GET_ITEM(key_val, 1);
+        if (!PyTuple_CheckExact(key) || PyTuple_GET_SIZE(key) != key_len) {
+            PyErr_SetString(PyExc_TypeError, "bad key tuple.");
+            goto finish;
+        }
+
+        /* key_len < NPY_MAXARGS was verified above, so this cannot overrun */
+        PyObject *keys[NPY_MAXARGS];
+        for (int j = 0; j < key_len; j++) {
+            keys[j] = PyTuple_GET_ITEM(key, j);
+        }
+        if (i != length - 1) {
+            if (PyArrayIdentityHash_SetItem(tb, keys, value, replace) < 0) {
+                goto finish;
+            }
+        }
+        else {
+            /* Last entry: look the key up instead of inserting it */
+            result = PyArrayIdentityHash_GetItem(tb, keys);
+            if (result == NULL) {
+                result = Py_None;
+            }
+            Py_INCREF(result);
+        }
+    }
+
+  finish:
+    /* sequence is NULL when PySequence_Fast failed, hence the X variant */
+    Py_XDECREF(sequence);
+    PyArrayIdentityHash_Dealloc(tb);
+    return result;
+}
+
+
+/*
+ * Test C-api level item getting and setting through the sequence protocol.
+ *
+ * Arguments: mode (int), arr (object), index (ssize_t) and an optional
+ * object `op`.  Mode 0 returns PySequence_GetItem(arr, index); mode 1 calls
+ * PySequence_SetItem(arr, index, op) and returns None.  Any other mode
+ * raises ValueError.
+ */
+static PyObject *
+array_indexing(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    int mode;
+    Py_ssize_t i;
+    PyObject *arr, *op = NULL;
+
+    if (!PyArg_ParseTuple(args, "iOn|O", &mode, &arr, &i, &op)) {
+        return NULL;
+    }
+
+    switch (mode) {
+        case 0:
+            return PySequence_GetItem(arr, i);
+        case 1:
+            if (PySequence_SetItem(arr, i, op) < 0) {
+                return NULL;
+            }
+            Py_RETURN_NONE;
+        default:
+            PyErr_SetString(PyExc_ValueError,
+                            "invalid mode. 0: item 1: assign");
+            return NULL;
+    }
+}
+
+/*
+ * Test C-api PyArray_AsCArray item getter
+ *
+ * Arguments: an ndarray with 1 to 3 dimensions followed by up to three
+ * indices (i, j, k; j and k default to 0).  The array is converted with
+ * PyArray_AsCArray using its own descriptor and the element at the given
+ * indices is read through the resulting C pointers as a double.
+ * NOTE(review): the element is always read as `double`, so the input is
+ * presumably expected to hold doubles -- confirm against the callers.
+ *
+ * Raises ValueError when ndim is not in [1, 3], RuntimeError when the
+ * conversion fails and IndexError when a used index is out of bounds.
+ */
+static PyObject *
+test_as_c_array(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyArrayObject *array_obj;
+    npy_intp dims[3];   /* max 3-dim */
+    /* Parsed with the "n" format unit, which stores Py_ssize_t values */
+    Py_ssize_t i=0, j=0, k=0;
+    npy_intp num_dims = 0;
+    PyArray_Descr *descr = NULL;
+    double *array1 = NULL;
+    double **array2 = NULL;
+    double ***array3 = NULL;
+    double temp = 9999;
+    int in_bounds = 0;
+
+    if (!PyArg_ParseTuple(args, "O!n|nn",
+                &PyArray_Type, &array_obj,
+                &i, &j, &k)) {
+        return NULL;
+    }
+
+    num_dims = PyArray_NDIM(array_obj);
+    descr = PyArray_DESCR(array_obj);
+    Py_INCREF(descr);  /* PyArray_AsCArray steals a reference to this */
+
+    switch (num_dims) {
+        case 1:
+            if (PyArray_AsCArray(
+                    (PyObject **) &array_obj,
+                    (void *) &array1,
+                    dims,
+                    1,
+                    descr) < 0) {
+                PyErr_SetString(PyExc_RuntimeError, "error converting 1D array");
+                return NULL;
+            }
+            /* dims was filled in by the conversion; never read outside it */
+            in_bounds = (i >= 0 && i < dims[0]);
+            if (in_bounds) {
+                temp = array1[i];
+            }
+            PyArray_Free((PyObject *) array_obj, (void *) array1);
+            break;
+        case 2:
+            if (PyArray_AsCArray(
+                    (PyObject **) &array_obj,
+                    (void **) &array2,
+                    dims,
+                    2,
+                    descr) < 0) {
+                PyErr_SetString(PyExc_RuntimeError, "error converting 2D array");
+                return NULL;
+            }
+            in_bounds = (i >= 0 && i < dims[0] &&
+                         j >= 0 && j < dims[1]);
+            if (in_bounds) {
+                temp = array2[i][j];
+            }
+            PyArray_Free((PyObject *) array_obj, (void *) array2);
+            break;
+        case 3:
+            if (PyArray_AsCArray(
+                    (PyObject **) &array_obj,
+                    (void ***) &array3,
+                    dims,
+                    3,
+                    descr) < 0) {
+                PyErr_SetString(PyExc_RuntimeError, "error converting 3D array");
+                return NULL;
+            }
+            in_bounds = (i >= 0 && i < dims[0] &&
+                         j >= 0 && j < dims[1] &&
+                         k >= 0 && k < dims[2]);
+            if (in_bounds) {
+                temp = array3[i][j][k];
+            }
+            PyArray_Free((PyObject *) array_obj, (void *) array3);
+            break;
+        default:
+            /* No conversion happened, so the extra reference is still ours */
+            Py_DECREF(descr);
+            PyErr_SetString(PyExc_ValueError, "array.ndim not in [1, 3]");
+            return NULL;
+    }
+
+    if (!in_bounds) {
+        PyErr_SetString(PyExc_IndexError, "index out of bounds");
+        return NULL;
+    }
+    return Py_BuildValue("f", temp);
+}
+
+/*
+ * Test nditer of too large arrays using remove axis, etc.
+ *
+ * Arguments: a tuple of exact ndarrays, an axis to remove (negative: remove
+ * none) and a mode selecting which iterator call is exercised afterwards:
+ *   0/1: NpyIter_GetIterNext without / with an error message pointer
+ *   2:   NpyIter_RemoveMultiIndex
+ *   3:   NpyIter_GotoMultiIndex (all-zero index)
+ *   4/5: NpyIter_ResetToIterIndexRange(0, 1) without / with message pointer
+ *   other: nothing further
+ * Returns None on success and NULL when any step fails.
+ */
+static PyObject *
+test_nditer_too_large(PyObject *NPY_UNUSED(self), PyObject *args) {
+    NpyIter *iter;
+    PyObject *array_tuple, *arr;
+    PyObject *ret = NULL;
+    PyArrayObject *arrays[NPY_MAXARGS];
+    npy_uint32 op_flags[NPY_MAXARGS];
+    Py_ssize_t nop;
+    int i, axis, mode;
+
+    npy_intp index[NPY_MAXARGS] = {0};
+    char *msg;
+
+    if (!PyArg_ParseTuple(args, "Oii", &array_tuple, &axis, &mode)) {
+        return NULL;
+    }
+
+    if (!PyTuple_CheckExact(array_tuple)) {
+        PyErr_SetString(PyExc_ValueError, "tuple required as first argument");
+        return NULL;
+    }
+    nop = PyTuple_Size(array_tuple);
+    if (nop > NPY_MAXARGS) {
+        PyErr_SetString(PyExc_ValueError, "tuple must be smaller then maxargs");
+        return NULL;
+    }
+
+    /* Every operand must be a plain ndarray and is iterated read-only */
+    for (i=0; i < nop; i++) {
+        arr = PyTuple_GET_ITEM(array_tuple, i);
+        if (!PyArray_CheckExact(arr)) {
+            PyErr_SetString(PyExc_ValueError, "require base class ndarray");
+            return NULL;
+        }
+        arrays[i] = (PyArrayObject *)arr;
+        op_flags[i] = NPY_ITER_READONLY;
+    }
+
+    iter = NpyIter_MultiNew(nop, arrays, NPY_ITER_MULTI_INDEX | NPY_ITER_RANGED,
+                            NPY_KEEPORDER, NPY_NO_CASTING, op_flags, NULL);
+
+    if (iter == NULL) {
+        return NULL;
+    }
+
+    /* Remove an axis (negative, do not remove any) */
+    if (axis >= 0 && !NpyIter_RemoveAxis(iter, axis)) {
+        goto finish;
+    }
+
+    switch (mode) {
+        /* Test IterNext getting */
+        case 0:
+            if (NpyIter_GetIterNext(iter, NULL) == NULL) {
+                goto finish;
+            }
+            break;
+        case 1:
+            if (NpyIter_GetIterNext(iter, &msg) == NULL) {
+                PyErr_SetString(PyExc_ValueError, msg);
+                goto finish;
+            }
+            break;
+        /* Test Multi Index removal */
+        case 2:
+            if (!NpyIter_RemoveMultiIndex(iter)) {
+                goto finish;
+            }
+            break;
+        /* Test GotoMultiIndex (just 0 hardcoded) */
+        case 3:
+            if (!NpyIter_GotoMultiIndex(iter, index)) {
+                goto finish;
+            }
+            break;
+        /* Test setting iterrange (hardcoded range of 0, 1) */
+        case 4:
+            if (!NpyIter_ResetToIterIndexRange(iter, 0, 1, NULL)) {
+                goto finish;
+            }
+            break;
+        case 5:
+            if (!NpyIter_ResetToIterIndexRange(iter, 0, 1, &msg)) {
+                PyErr_SetString(PyExc_ValueError, msg);
+                goto finish;
+            }
+            break;
+        /* Do nothing */
+        default:
+            break;
+    }
+
+    /* Every requested step succeeded */
+    Py_INCREF(Py_None);
+    ret = Py_None;
+
+  finish:
+    /* Shared exit: the iterator is released on success and failure alike */
+    NpyIter_Deallocate(iter);
+    return ret;
+}
+
+/*
+ * Python wrapper around solve_diophantine (optionally preceded by
+ * diophantine_simplify).
+ *
+ * Arguments: tuples A and U of equal length (coefficients and upper
+ * bounds), the integer b, and the optional max_work, simplify and
+ * require_ub_nontrivial.
+ * Returns a tuple with one integer per remaining term when the solver
+ * reports MEM_OVERLAP_YES and None for MEM_OVERLAP_NO.  The other solver
+ * results raise ValueError (invalid arguments), OverflowError or
+ * RuntimeError.
+ */
+static PyObject *
+array_solve_diophantine(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
+{
+    PyObject *A = NULL;
+    PyObject *U = NULL;
+    Py_ssize_t b_input = 0;
+    Py_ssize_t max_work = -1;
+    int simplify = 0;
+    int require_ub_nontrivial = 0;
+    static char *kwlist[] = {"A", "U", "b", "max_work", "simplify",
+                             "require_ub_nontrivial", NULL};
+
+    diophantine_term_t terms[2*NPY_MAXDIMS+2];
+    npy_int64 x[2*NPY_MAXDIMS+2];
+    npy_int64 b;
+    unsigned int nterms, j;
+    mem_overlap_t result = MEM_OVERLAP_YES;
+    PyObject *retval = NULL;
+    NPY_BEGIN_THREADS_DEF;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!O!n|nii", kwlist,
+                                     &PyTuple_Type, &A,
+                                     &PyTuple_Type, &U,
+                                     &b_input, &max_work, &simplify,
+                                     &require_ub_nontrivial)) {
+        return NULL;
+    }
+
+    /* The fixed-size work arrays bound the number of accepted terms */
+    if (PyTuple_GET_SIZE(A) > (Py_ssize_t)ARRAY_SIZE(terms)) {
+        PyErr_SetString(PyExc_ValueError, "too many terms in equation");
+        goto fail;
+    }
+
+    nterms = PyTuple_GET_SIZE(A);
+
+    if (PyTuple_GET_SIZE(U) != nterms) {
+        PyErr_SetString(PyExc_ValueError, "A, U must be tuples of equal length");
+        goto fail;
+    }
+
+    for (j = 0; j < nterms; ++j) {
+        terms[j].a = (npy_int64)PyLong_AsSsize_t(PyTuple_GET_ITEM(A, j));
+        if (error_converting(terms[j].a)) {
+            goto fail;
+        }
+        terms[j].ub = (npy_int64)PyLong_AsSsize_t(PyTuple_GET_ITEM(U, j));
+        if (error_converting(terms[j].ub)) {
+            goto fail;
+        }
+    }
+
+    b = b_input;
+
+    NPY_BEGIN_THREADS;
+    if (simplify && !require_ub_nontrivial) {
+        if (diophantine_simplify(&nterms, terms, b)) {
+            result = MEM_OVERLAP_OVERFLOW;
+        }
+    }
+    /* Only solve when the (optional) simplification did not overflow */
+    if (result == MEM_OVERLAP_YES) {
+        result = solve_diophantine(nterms, terms, b, max_work, require_ub_nontrivial, x);
+    }
+    NPY_END_THREADS;
+
+    switch (result) {
+        case MEM_OVERLAP_YES:
+            retval = PyTuple_New(nterms);
+            if (retval == NULL) {
+                goto fail;
+            }
+            for (j = 0; j < nterms; ++j) {
+                PyObject *obj = PyLong_FromSsize_t(x[j]);
+                if (obj == NULL) {
+                    goto fail;
+                }
+                PyTuple_SET_ITEM(retval, j, obj);
+            }
+            break;
+        case MEM_OVERLAP_NO:
+            retval = Py_None;
+            Py_INCREF(retval);
+            break;
+        case MEM_OVERLAP_ERROR:
+            PyErr_SetString(PyExc_ValueError, "Invalid arguments");
+            break;
+        case MEM_OVERLAP_OVERFLOW:
+            PyErr_SetString(PyExc_OverflowError, "Integer overflow");
+            break;
+        case MEM_OVERLAP_TOO_HARD:
+            PyErr_SetString(PyExc_RuntimeError, "Too much work done");
+            break;
+        default:
+            PyErr_SetString(PyExc_RuntimeError, "Unknown error");
+            break;
+    }
+
+    /* retval is still NULL for every branch that set an exception */
+    return retval;
+
+fail:
+    Py_XDECREF(retval);
+    return NULL;
+}
+
+
+/*
+ * Python wrapper around solve_may_have_internal_overlap.
+ *
+ * Arguments: `self` (converted with PyArray_Converter) and the optional
+ * `max_work` (default NPY_MAY_SHARE_EXACT, values below -2 are rejected).
+ * Returns True/False for MEM_OVERLAP_YES/MEM_OVERLAP_NO; raises
+ * OverflowError, ValueError (max_work exceeded or invalid) or RuntimeError
+ * for the remaining results.
+ */
+static PyObject *
+array_internal_overlap(PyObject *NPY_UNUSED(ignored), PyObject *args, PyObject *kwds)
+{
+    PyArrayObject * self = NULL;
+    static char *kwlist[] = {"self", "max_work", NULL};
+
+    mem_overlap_t result;
+    Py_ssize_t max_work = NPY_MAY_SHARE_EXACT;
+    NPY_BEGIN_THREADS_DEF;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O&|n", kwlist,
+                                     PyArray_Converter, &self,
+                                     &max_work)) {
+        return NULL;
+    }
+
+    if (max_work < -2) {
+        PyErr_SetString(PyExc_ValueError, "Invalid value for max_work");
+        goto fail;
+    }
+
+    NPY_BEGIN_THREADS;
+    result = solve_may_have_internal_overlap(self, max_work);
+    NPY_END_THREADS;
+
+    /* The array is not needed any more once the solver has returned */
+    Py_XDECREF(self);
+
+    switch (result) {
+        case MEM_OVERLAP_NO:
+            Py_RETURN_FALSE;
+        case MEM_OVERLAP_YES:
+            Py_RETURN_TRUE;
+        case MEM_OVERLAP_OVERFLOW:
+            PyErr_SetString(PyExc_OverflowError,
+                            "Integer overflow in computing overlap");
+            return NULL;
+        case MEM_OVERLAP_TOO_HARD:
+            PyErr_SetString(PyExc_ValueError,
+                            "Exceeded max_work");
+            return NULL;
+        default:
+            /* Doesn't happen usually */
+            PyErr_SetString(PyExc_RuntimeError,
+                            "Error in computing overlap");
+            return NULL;
+    }
+
+fail:
+    Py_XDECREF(self);
+    return NULL;
+}
+
+
+/*
+ * Convert an npy_extint128_t into a Python int.  The magnitude is assembled
+ * as (value.hi << 64) | value.lo and negated when value.sign is negative.
+ * Returns a new reference, or NULL with an exception set.
+ */
+static PyObject *
+pylong_from_int128(npy_extint128_t value)
+{
+    PyObject *shift = NULL, *hi = NULL, *hi_shifted = NULL;
+    PyObject *lo = NULL, *magnitude = NULL, *result = NULL;
+
+    shift = PyLong_FromLong(64);
+    if (shift == NULL) {
+        goto done;
+    }
+
+    hi = PyLong_FromUnsignedLongLong(value.hi);
+    if (hi == NULL) {
+        goto done;
+    }
+
+    hi_shifted = PyNumber_Lshift(hi, shift);
+    if (hi_shifted == NULL) {
+        goto done;
+    }
+
+    lo = PyLong_FromUnsignedLongLong(value.lo);
+    if (lo == NULL) {
+        goto done;
+    }
+
+    magnitude = PyNumber_Or(hi_shifted, lo);
+    if (magnitude == NULL) {
+        goto done;
+    }
+
+    if (value.sign < 0) {
+        /* result stays NULL if the negation fails */
+        result = PyNumber_Negative(magnitude);
+    }
+    else {
+        /* Hand the magnitude itself to the caller */
+        result = magnitude;
+        magnitude = NULL;
+    }
+
+done:
+    /* Shared exit: release every temporary that is still owned here */
+    Py_XDECREF(shift);
+    Py_XDECREF(hi);
+    Py_XDECREF(hi_shifted);
+    Py_XDECREF(lo);
+    Py_XDECREF(magnitude);
+    return result;
+}
+
+
+/*
+ * Convert a Python object into an npy_extint128_t (sign, hi, lo).
+ *
+ * The object is first passed through int().  A magnitude larger than
+ * (mask_64 << 64) | mask_64 raises OverflowError.  When the input is a bool
+ * whose value is zero (i.e. False), the sign is set to -1 so that tests can
+ * request a "negative zero".
+ * Returns 0 on success and -1 with an exception set on failure.
+ */
+static int
+int128_from_pylong(PyObject *obj, npy_extint128_t *result)
+{
+    PyObject *long_obj = NULL, *zero = NULL, *shift = NULL;
+    PyObject *mask_64 = NULL, *max_128 = NULL;
+    PyObject *hi_bits = NULL, *lo_bits = NULL, *tmp = NULL;
+    int status = -1;
+    int cmp;
+    /* False means negative zero */
+    const int negative_zero = PyBool_Check(obj) ? 1 : 0;
+
+    long_obj = PyObject_CallFunction((PyObject*)&PyLong_Type, "O", obj);
+    if (long_obj == NULL) {
+        goto cleanup;
+    }
+
+    zero = PyLong_FromLong(0);
+    if (zero == NULL) {
+        goto cleanup;
+    }
+
+    shift = PyLong_FromLong(64);
+    if (shift == NULL) {
+        goto cleanup;
+    }
+
+    mask_64 = PyLong_FromUnsignedLongLong(0xffffffffffffffffULL);
+    if (mask_64 == NULL) {
+        goto cleanup;
+    }
+
+    /* Largest representable magnitude: (mask_64 << 64) | mask_64 */
+    tmp = PyNumber_Lshift(mask_64, shift);
+    if (tmp == NULL) {
+        goto cleanup;
+    }
+    max_128 = PyNumber_Or(tmp, mask_64);
+    if (max_128 == NULL) {
+        goto cleanup;
+    }
+    Py_CLEAR(tmp);
+
+    /* Split off the sign and continue with the absolute value */
+    cmp = PyObject_RichCompareBool(long_obj, zero, Py_LT);
+    if (cmp == -1) {
+        goto cleanup;
+    }
+    if (cmp == 1) {
+        tmp = PyNumber_Negative(long_obj);
+        if (tmp == NULL) {
+            goto cleanup;
+        }
+        Py_SETREF(long_obj, tmp);
+        tmp = NULL;
+        result->sign = -1;
+    }
+    else {
+        result->sign = 1;
+    }
+
+    cmp = PyObject_RichCompareBool(long_obj, max_128, Py_GT);
+    if (cmp == -1) {
+        goto cleanup;
+    }
+    if (cmp == 1) {
+        PyErr_SetString(PyExc_OverflowError, "");
+        goto cleanup;
+    }
+
+    hi_bits = PyNumber_Rshift(long_obj, shift);
+    if (hi_bits == NULL) {
+        goto cleanup;
+    }
+
+    lo_bits = PyNumber_And(long_obj, mask_64);
+    if (lo_bits == NULL) {
+        goto cleanup;
+    }
+
+    /* (unsigned long long)-1 is only an error if an exception is pending */
+    result->hi = PyLong_AsUnsignedLongLong(hi_bits);
+    if (result->hi == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred()) {
+        goto cleanup;
+    }
+
+    result->lo = PyLong_AsUnsignedLongLong(lo_bits);
+    if (result->lo == (unsigned PY_LONG_LONG)-1 && PyErr_Occurred()) {
+        goto cleanup;
+    }
+
+    if (negative_zero && result->hi == 0 && result->lo == 0) {
+        result->sign = -1;
+    }
+
+    status = 0;
+
+cleanup:
+    /* Shared by the success and the failure path */
+    Py_XDECREF(long_obj);
+    Py_XDECREF(shift);
+    Py_XDECREF(zero);
+    Py_XDECREF(mask_64);
+    Py_XDECREF(max_128);
+    Py_XDECREF(hi_bits);
+    Py_XDECREF(lo_bits);
+    Py_XDECREF(tmp);
+    return status;
+}
+
+
+/*
+ * Apply one of the overflow-checked 64-bit helpers to two integers.
+ * Arguments: a, b (long long) and op (1: safe_add, 2: safe_sub,
+ * 3: safe_mul).  Raises ValueError for any other op and OverflowError when
+ * the helper flags an overflow.
+ */
+static PyObject *
+extint_safe_binop(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PY_LONG_LONG a, b, c;
+    int op;
+    char overflow = 0;
+
+    if (!PyArg_ParseTuple(args, "LLi", &a, &b, &op)) {
+        return NULL;
+    }
+
+    switch (op) {
+        case 1:
+            c = safe_add(a, b, &overflow);
+            break;
+        case 2:
+            c = safe_sub(a, b, &overflow);
+            break;
+        case 3:
+            c = safe_mul(a, b, &overflow);
+            break;
+        default:
+            PyErr_SetString(PyExc_ValueError, "invalid op");
+            return NULL;
+    }
+
+    if (overflow) {
+        PyErr_SetString(PyExc_OverflowError, "");
+        return NULL;
+    }
+    return PyLong_FromLongLong(c);
+}
+
+
+/*
+ * Pass a 64-bit integer through to_128 and return the result as a Python
+ * int.
+ */
+static PyObject *
+extint_to_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PY_LONG_LONG a;
+    npy_extint128_t wide;
+
+    if (!PyArg_ParseTuple(args, "L", &a)) {
+        return NULL;
+    }
+    wide = to_128(a);
+    return pylong_from_int128(wide);
+}
+
+
+/*
+ * Convert a Python int to the 128-bit representation and pass it through
+ * to_64.  Raises OverflowError when to_64 flags an overflow.
+ */
+static PyObject *
+extint_to_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj;
+    npy_extint128_t a;
+    PY_LONG_LONG narrowed;
+    char overflow = 0;
+
+    if (!PyArg_ParseTuple(args, "O", &a_obj)) {
+        return NULL;
+    }
+    if (int128_from_pylong(a_obj, &a) != 0) {
+        return NULL;
+    }
+
+    narrowed = to_64(a, &overflow);
+    if (!overflow) {
+        return PyLong_FromLongLong(narrowed);
+    }
+    PyErr_SetString(PyExc_OverflowError, "");
+    return NULL;
+}
+
+
+/*
+ * Multiply two 64-bit integers with mul_64_64 and return the 128-bit result
+ * as a Python int.
+ */
+static PyObject *
+extint_mul_64_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PY_LONG_LONG a, b;
+
+    if (!PyArg_ParseTuple(args, "LL", &a, &b)) {
+        return NULL;
+    }
+    return pylong_from_int128(mul_64_64(a, b));
+}
+
+
+/*
+ * Add two Python ints with add_128 and return the sum as a Python int.
+ * Raises OverflowError when add_128 flags an overflow.
+ */
+static PyObject *
+extint_add_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj, *b_obj;
+    npy_extint128_t a, b, sum;
+    char overflow = 0;
+
+    if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
+        return NULL;
+    }
+    /* Convert the operands one after the other, stopping at the first error */
+    if (int128_from_pylong(a_obj, &a)) {
+        return NULL;
+    }
+    if (int128_from_pylong(b_obj, &b)) {
+        return NULL;
+    }
+
+    sum = add_128(a, b, &overflow);
+    if (!overflow) {
+        return pylong_from_int128(sum);
+    }
+    PyErr_SetString(PyExc_OverflowError, "");
+    return NULL;
+}
+
+
+/*
+ * Subtract two Python ints with sub_128 and return the difference as a
+ * Python int.  Raises OverflowError when sub_128 flags an overflow.
+ */
+static PyObject *
+extint_sub_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj, *b_obj;
+    npy_extint128_t a, b, diff;
+    char overflow = 0;
+
+    if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
+        return NULL;
+    }
+    /* Convert the operands one after the other, stopping at the first error */
+    if (int128_from_pylong(a_obj, &a)) {
+        return NULL;
+    }
+    if (int128_from_pylong(b_obj, &b)) {
+        return NULL;
+    }
+
+    diff = sub_128(a, b, &overflow);
+    if (!overflow) {
+        return pylong_from_int128(diff);
+    }
+    PyErr_SetString(PyExc_OverflowError, "");
+    return NULL;
+}
+
+
+/*
+ * Apply neg_128 to a Python int and return the result as a Python int.
+ */
+static PyObject *
+extint_neg_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj;
+    npy_extint128_t a;
+
+    if (!PyArg_ParseTuple(args, "O", &a_obj)) {
+        return NULL;
+    }
+    if (int128_from_pylong(a_obj, &a) != 0) {
+        return NULL;
+    }
+    return pylong_from_int128(neg_128(a));
+}
+
+
+/*
+ * Apply shl_128 to a Python int and return the result as a Python int.
+ */
+static PyObject *
+extint_shl_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj;
+    npy_extint128_t a;
+
+    if (!PyArg_ParseTuple(args, "O", &a_obj)) {
+        return NULL;
+    }
+    if (int128_from_pylong(a_obj, &a) != 0) {
+        return NULL;
+    }
+    return pylong_from_int128(shl_128(a));
+}
+
+
+/*
+ * Apply shr_128 to a Python int and return the result as a Python int.
+ */
+static PyObject *
+extint_shr_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj;
+    npy_extint128_t a;
+
+    if (!PyArg_ParseTuple(args, "O", &a_obj)) {
+        return NULL;
+    }
+    if (int128_from_pylong(a_obj, &a) != 0) {
+        return NULL;
+    }
+    return pylong_from_int128(shr_128(a));
+}
+
+
+/*
+ * Compare two Python ints with gt_128 and return the outcome as a bool.
+ */
+static PyObject *
+extint_gt_128(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj, *b_obj;
+    npy_extint128_t a, b;
+
+    if (!PyArg_ParseTuple(args, "OO", &a_obj, &b_obj)) {
+        return NULL;
+    }
+    /* Convert the operands one after the other, stopping at the first error */
+    if (int128_from_pylong(a_obj, &a)) {
+        return NULL;
+    }
+    if (int128_from_pylong(b_obj, &b)) {
+        return NULL;
+    }
+
+    if (!gt_128(a, b)) {
+        Py_RETURN_FALSE;
+    }
+    Py_RETURN_TRUE;
+}
+
+
+/*
+ * Divide a Python int (as 128-bit value) by a positive 64-bit integer with
+ * divmod_128_64 and return the pair (quotient, remainder).
+ * Raises ValueError when the divisor is not positive.
+ */
+static PyObject *
+extint_divmod_128_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj, *ret = NULL, *tmp = NULL;
+    npy_extint128_t a, c;
+    PY_LONG_LONG b;
+    npy_int64 mod;
+    if (!PyArg_ParseTuple(args, "OL", &a_obj, &b)) {
+        goto fail;
+    }
+    if (b <= 0) {
+        PyErr_SetString(PyExc_ValueError, "");
+        goto fail;
+    }
+    if (int128_from_pylong(a_obj, &a)) {
+        goto fail;
+    }
+
+    c = divmod_128_64(a, b, &mod);
+
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        /* PyTuple_SET_ITEM must never be used on a NULL tuple */
+        goto fail;
+    }
+
+    tmp = pylong_from_int128(c);
+    if (tmp == NULL) {
+        goto fail;
+    }
+    /* The tuple takes over the reference held in tmp */
+    PyTuple_SET_ITEM(ret, 0, tmp);
+
+    tmp = PyLong_FromLongLong(mod);
+    if (tmp == NULL) {
+        goto fail;
+    }
+    PyTuple_SET_ITEM(ret, 1, tmp);
+    return ret;
+
+fail:
+    /* tmp is NULL whenever this is reached after an item was stored */
+    Py_XDECREF(ret);
+    Py_XDECREF(tmp);
+    return NULL;
+}
+
+
+/*
+ * Divide a Python int (as 128-bit value) by a positive 64-bit integer with
+ * floordiv_128_64 and return the quotient as a Python int.
+ * Raises ValueError when the divisor is not positive.
+ */
+static PyObject *
+extint_floordiv_128_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj;
+    npy_extint128_t a;
+    PY_LONG_LONG b;
+
+    if (!PyArg_ParseTuple(args, "OL", &a_obj, &b)) {
+        return NULL;
+    }
+    if (!(b > 0)) {
+        PyErr_SetString(PyExc_ValueError, "");
+        return NULL;
+    }
+    if (int128_from_pylong(a_obj, &a) != 0) {
+        return NULL;
+    }
+    return pylong_from_int128(floordiv_128_64(a, b));
+}
+
+
+/*
+ * Divide a Python int (as 128-bit value) by a positive 64-bit integer with
+ * ceildiv_128_64 and return the quotient as a Python int.
+ * Raises ValueError when the divisor is not positive.
+ */
+static PyObject *
+extint_ceildiv_128_64(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *a_obj;
+    npy_extint128_t a;
+    PY_LONG_LONG b;
+
+    if (!PyArg_ParseTuple(args, "OL", &a_obj, &b)) {
+        return NULL;
+    }
+    if (!(b > 0)) {
+        PyErr_SetString(PyExc_ValueError, "");
+        return NULL;
+    }
+    if (int128_from_pylong(a_obj, &a) != 0) {
+        return NULL;
+    }
+    return pylong_from_int128(ceildiv_128_64(a, b));
+}
+
+/*
+ * Layout probe: an npy_uint8 followed by an npy_complex64.  Its alignment
+ * and size are reported by get_struct_alignments.
+ */
+struct TestStruct1 {
+    npy_uint8 a;
+    npy_complex64 b;
+};
+
+/*
+ * Layout probe: an npy_uint32 followed by an npy_complex64.  Its alignment
+ * and size are reported by get_struct_alignments.
+ */
+struct TestStruct2 {
+    npy_uint32 a;
+    npy_complex64 b;
+};
+
+/*
+ * Layout probe: an npy_uint8 followed by a nested struct TestStruct1.  Its
+ * alignment and size are reported by get_struct_alignments.
+ */
+struct TestStruct3 {
+    npy_uint8 a;
+    struct TestStruct1 b;
+};
+
+/*
+ * Report the C layout of the three test structs.
+ * Returns a 3-tuple whose entries are (alignment, size) pairs for
+ * TestStruct1, TestStruct2 and TestStruct3, in that order, or NULL with an
+ * exception set when any object could not be created.
+ */
+static PyObject *
+get_struct_alignments(PyObject *NPY_UNUSED(self), PyObject *args) {
+    PyObject *ret = PyTuple_New(3);
+    PyObject *alignment, *size, *val;
+
+    if (ret == NULL) {
+        return NULL;
+    }
+
+#line 2029
+    alignment = PyLong_FromLong(NPY_ALIGNOF(struct TestStruct1));
+    size = PyLong_FromLong(sizeof(struct TestStruct1));
+    /* Only pack when both integers exist; PyTuple_Pack needs non-NULL items */
+    val = NULL;
+    if (alignment != NULL && size != NULL) {
+        val = PyTuple_Pack(2, alignment, size);
+    }
+    Py_XDECREF(alignment);
+    Py_XDECREF(size);
+    if (val == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyTuple_SET_ITEM(ret, 1-1, val);
+
+#line 2029
+    alignment = PyLong_FromLong(NPY_ALIGNOF(struct TestStruct2));
+    size = PyLong_FromLong(sizeof(struct TestStruct2));
+    val = NULL;
+    if (alignment != NULL && size != NULL) {
+        val = PyTuple_Pack(2, alignment, size);
+    }
+    Py_XDECREF(alignment);
+    Py_XDECREF(size);
+    if (val == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyTuple_SET_ITEM(ret, 2-1, val);
+
+#line 2029
+    alignment = PyLong_FromLong(NPY_ALIGNOF(struct TestStruct3));
+    size = PyLong_FromLong(sizeof(struct TestStruct3));
+    val = NULL;
+    if (alignment != NULL && size != NULL) {
+        val = PyTuple_Pack(2, alignment, size);
+    }
+    Py_XDECREF(alignment);
+    Py_XDECREF(size);
+    if (val == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyTuple_SET_ITEM(ret, 3-1, val);
+
+    return ret;
+}
+
+
+/* Docstring attached to get_fpu_mode in the module's method table. */
+static char get_fpu_mode_doc[] =
+    "get_fpu_mode()\n\n"
+    "Get the current FPU control word, in a platform-dependent format.\n"
+    "Returns None if not implemented on current platform.";
+
+/*
+ * Python entry point described by get_fpu_mode_doc; accepts no arguments.
+ *
+ * Depending on the compiler/architecture branch chosen by the preprocessor,
+ * returns the value of _controlfp(0, 0), the 16-bit word stored by the x87
+ * `fstcw` instruction, or None when neither branch applies.
+ */
+static PyObject *
+get_fpu_mode(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    if (!PyArg_ParseTuple(args, "")) {
+        return NULL;
+    }
+
+#if defined(_MSC_VER) && !defined(__clang__)
+    {
+        /* MSVC (not clang-cl): read the control word through the CRT */
+        unsigned int control_word = _controlfp(0, 0);
+        return PyLong_FromLongLong(control_word);
+    }
+#elif (defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))) || (defined(_MSC_VER) && defined(__clang__))
+    {
+        /* GNU-style inline asm: store the x87 control word into memory */
+        unsigned short control_word = 0;
+        __asm__("fstcw %w0" : "=m" (control_word));
+        return PyLong_FromLongLong(control_word);
+    }
+#else
+    Py_RETURN_NONE;
+#endif
+}
+
+/*
+ * npymath wrappers
+ */
+
+#line 2081
+
+#line 2089
+
+/*
+ * Test hook exposing npy_cabsf to Python.
+ *
+ * The argument is converted to an NPY_CFLOAT array; the value at the start
+ * of its data buffer is passed to npy_cabsf and the result is returned in a
+ * new 0-d NPY_FLOAT array. Returns NULL if parsing, conversion or
+ * allocation fails.
+ */
+static PyObject *
+call_npy_cabsf(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_cfloat in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_CFLOAT, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_FLOAT);
+    if (out_arr != NULL) {
+        in_val = *(npy_cfloat *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_float *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_cabsf(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+#line 2089
+
+/*
+ * Test hook exposing npy_cabs to Python.
+ *
+ * The argument is converted to an NPY_CDOUBLE array; the value at the start
+ * of its data buffer is passed to npy_cabs and the result is returned in a
+ * new 0-d NPY_DOUBLE array. Returns NULL if parsing, conversion or
+ * allocation fails.
+ */
+static PyObject *
+call_npy_cabs(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_cdouble in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_CDOUBLE, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_DOUBLE);
+    if (out_arr != NULL) {
+        in_val = *(npy_cdouble *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_double *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_cabs(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+#line 2089
+
+/*
+ * Test hook exposing npy_cabsl to Python.
+ *
+ * The argument is converted to an NPY_CLONGDOUBLE array; the value at the
+ * start of its data buffer is passed to npy_cabsl and the result is returned
+ * in a new 0-d NPY_LONGDOUBLE array. Returns NULL if parsing, conversion or
+ * allocation fails.
+ */
+static PyObject *
+call_npy_cabsl(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_clongdouble in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_CLONGDOUBLE, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_LONGDOUBLE);
+    if (out_arr != NULL) {
+        in_val = *(npy_clongdouble *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_longdouble *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_cabsl(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+
+
+#line 2081
+
+#line 2089
+
+/*
+ * Test hook exposing npy_cargf to Python.
+ *
+ * The argument is converted to an NPY_CFLOAT array; the value at the start
+ * of its data buffer is passed to npy_cargf and the result is returned in a
+ * new 0-d NPY_FLOAT array. Returns NULL if parsing, conversion or
+ * allocation fails.
+ */
+static PyObject *
+call_npy_cargf(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_cfloat in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_CFLOAT, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_FLOAT);
+    if (out_arr != NULL) {
+        in_val = *(npy_cfloat *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_float *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_cargf(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+#line 2089
+
+/*
+ * Test hook exposing npy_carg to Python.
+ *
+ * The argument is converted to an NPY_CDOUBLE array; the value at the start
+ * of its data buffer is passed to npy_carg and the result is returned in a
+ * new 0-d NPY_DOUBLE array. Returns NULL if parsing, conversion or
+ * allocation fails.
+ */
+static PyObject *
+call_npy_carg(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_cdouble in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_CDOUBLE, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_DOUBLE);
+    if (out_arr != NULL) {
+        in_val = *(npy_cdouble *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_double *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_carg(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+#line 2089
+
+/*
+ * Test hook exposing npy_cargl to Python.
+ *
+ * The argument is converted to an NPY_CLONGDOUBLE array; the value at the
+ * start of its data buffer is passed to npy_cargl and the result is returned
+ * in a new 0-d NPY_LONGDOUBLE array. Returns NULL if parsing, conversion or
+ * allocation fails.
+ */
+static PyObject *
+call_npy_cargl(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_clongdouble in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_CLONGDOUBLE, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_LONGDOUBLE);
+    if (out_arr != NULL) {
+        in_val = *(npy_clongdouble *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_longdouble *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_cargl(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+
+
+
+#line 2124
+
+#line 2130
+
+/*
+ * Test hook exposing npy_log10f to Python.
+ *
+ * The argument is converted to an NPY_FLOAT array; the value at the start
+ * of its data buffer is passed to npy_log10f and the result is returned in a
+ * new 0-d NPY_FLOAT array. Returns NULL if parsing, conversion or
+ * allocation fails.
+ */
+static PyObject *
+call_npy_log10f(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_float in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_FLOAT, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_FLOAT);
+    if (out_arr != NULL) {
+        in_val = *(npy_float *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_float *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_log10f(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+#line 2130
+
+/*
+ * Test hook exposing npy_log10 to Python.
+ *
+ * The argument is converted to an NPY_DOUBLE array; the value at the start
+ * of its data buffer is passed to npy_log10 and the result is returned in a
+ * new 0-d NPY_DOUBLE array. Returns NULL if parsing, conversion or
+ * allocation fails.
+ */
+static PyObject *
+call_npy_log10(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_double in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_DOUBLE, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_DOUBLE);
+    if (out_arr != NULL) {
+        in_val = *(npy_double *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_double *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_log10(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+#line 2130
+
+/*
+ * Test hook exposing npy_log10l to Python.
+ *
+ * The argument is converted to an NPY_LONGDOUBLE array; the value at the
+ * start of its data buffer is passed to npy_log10l and the result is
+ * returned in a new 0-d NPY_LONGDOUBLE array. Returns NULL if parsing,
+ * conversion or allocation fails.
+ */
+static PyObject *
+call_npy_log10l(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_longdouble in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_LONGDOUBLE, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_LONGDOUBLE);
+    if (out_arr != NULL) {
+        in_val = *(npy_longdouble *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_longdouble *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_log10l(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+
+
+#line 2124
+
+#line 2130
+
+/*
+ * Test hook exposing npy_coshf to Python.
+ *
+ * The argument is converted to an NPY_FLOAT array; the value at the start
+ * of its data buffer is passed to npy_coshf and the result is returned in a
+ * new 0-d NPY_FLOAT array. Returns NULL if parsing, conversion or
+ * allocation fails.
+ */
+static PyObject *
+call_npy_coshf(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_float in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_FLOAT, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_FLOAT);
+    if (out_arr != NULL) {
+        in_val = *(npy_float *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_float *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_coshf(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+#line 2130
+
+/*
+ * Test hook exposing npy_cosh to Python.
+ *
+ * The argument is converted to an NPY_DOUBLE array; the value at the start
+ * of its data buffer is passed to npy_cosh and the result is returned in a
+ * new 0-d NPY_DOUBLE array. Returns NULL if parsing, conversion or
+ * allocation fails.
+ */
+static PyObject *
+call_npy_cosh(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_double in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_DOUBLE, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_DOUBLE);
+    if (out_arr != NULL) {
+        in_val = *(npy_double *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_double *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_cosh(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+#line 2130
+
+/*
+ * Test hook exposing npy_coshl to Python.
+ *
+ * The argument is converted to an NPY_LONGDOUBLE array; the value at the
+ * start of its data buffer is passed to npy_coshl and the result is
+ * returned in a new 0-d NPY_LONGDOUBLE array. Returns NULL if parsing,
+ * conversion or allocation fails.
+ */
+static PyObject *
+call_npy_coshl(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_longdouble in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_LONGDOUBLE, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_LONGDOUBLE);
+    if (out_arr != NULL) {
+        in_val = *(npy_longdouble *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_longdouble *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_coshl(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+
+
+#line 2124
+
+#line 2130
+
+/*
+ * Test hook exposing npy_sinhf to Python.
+ *
+ * The argument is converted to an NPY_FLOAT array; the value at the start
+ * of its data buffer is passed to npy_sinhf and the result is returned in a
+ * new 0-d NPY_FLOAT array. Returns NULL if parsing, conversion or
+ * allocation fails.
+ */
+static PyObject *
+call_npy_sinhf(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_float in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_FLOAT, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_FLOAT);
+    if (out_arr != NULL) {
+        in_val = *(npy_float *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_float *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_sinhf(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+#line 2130
+
+/*
+ * Test hook exposing npy_sinh to Python.
+ *
+ * The argument is converted to an NPY_DOUBLE array; the value at the start
+ * of its data buffer is passed to npy_sinh and the result is returned in a
+ * new 0-d NPY_DOUBLE array. Returns NULL if parsing, conversion or
+ * allocation fails.
+ */
+static PyObject *
+call_npy_sinh(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_double in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_DOUBLE, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_DOUBLE);
+    if (out_arr != NULL) {
+        in_val = *(npy_double *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_double *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_sinh(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+#line 2130
+
+/*
+ * Test hook exposing npy_sinhl to Python.
+ *
+ * The argument is converted to an NPY_LONGDOUBLE array; the value at the
+ * start of its data buffer is passed to npy_sinhl and the result is
+ * returned in a new 0-d NPY_LONGDOUBLE array. Returns NULL if parsing,
+ * conversion or allocation fails.
+ */
+static PyObject *
+call_npy_sinhl(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_longdouble in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_LONGDOUBLE, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_LONGDOUBLE);
+    if (out_arr != NULL) {
+        in_val = *(npy_longdouble *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_longdouble *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_sinhl(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+
+
+#line 2124
+
+#line 2130
+
+/*
+ * Test hook exposing npy_tanf to Python.
+ *
+ * The argument is converted to an NPY_FLOAT array; the value at the start
+ * of its data buffer is passed to npy_tanf and the result is returned in a
+ * new 0-d NPY_FLOAT array. Returns NULL if parsing, conversion or
+ * allocation fails.
+ */
+static PyObject *
+call_npy_tanf(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_float in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_FLOAT, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_FLOAT);
+    if (out_arr != NULL) {
+        in_val = *(npy_float *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_float *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_tanf(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+#line 2130
+
+/*
+ * Test hook exposing npy_tan to Python.
+ *
+ * The argument is converted to an NPY_DOUBLE array; the value at the start
+ * of its data buffer is passed to npy_tan and the result is returned in a
+ * new 0-d NPY_DOUBLE array. Returns NULL if parsing, conversion or
+ * allocation fails.
+ */
+static PyObject *
+call_npy_tan(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_double in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_DOUBLE, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_DOUBLE);
+    if (out_arr != NULL) {
+        in_val = *(npy_double *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_double *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_tan(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+#line 2130
+
+/*
+ * Test hook exposing npy_tanl to Python.
+ *
+ * The argument is converted to an NPY_LONGDOUBLE array; the value at the
+ * start of its data buffer is passed to npy_tanl and the result is
+ * returned in a new 0-d NPY_LONGDOUBLE array. Returns NULL if parsing,
+ * conversion or allocation fails.
+ */
+static PyObject *
+call_npy_tanl(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_longdouble in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_LONGDOUBLE, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_LONGDOUBLE);
+    if (out_arr != NULL) {
+        in_val = *(npy_longdouble *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_longdouble *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_tanl(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+
+
+#line 2124
+
+#line 2130
+
+/*
+ * Test hook exposing npy_tanhf to Python.
+ *
+ * The argument is converted to an NPY_FLOAT array; the value at the start
+ * of its data buffer is passed to npy_tanhf and the result is returned in a
+ * new 0-d NPY_FLOAT array. Returns NULL if parsing, conversion or
+ * allocation fails.
+ */
+static PyObject *
+call_npy_tanhf(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_float in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_FLOAT, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_FLOAT);
+    if (out_arr != NULL) {
+        in_val = *(npy_float *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_float *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_tanhf(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+#line 2130
+
+/*
+ * Test hook exposing npy_tanh to Python.
+ *
+ * The argument is converted to an NPY_DOUBLE array; the value at the start
+ * of its data buffer is passed to npy_tanh and the result is returned in a
+ * new 0-d NPY_DOUBLE array. Returns NULL if parsing, conversion or
+ * allocation fails.
+ */
+static PyObject *
+call_npy_tanh(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_double in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_DOUBLE, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_DOUBLE);
+    if (out_arr != NULL) {
+        in_val = *(npy_double *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_double *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_tanh(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+#line 2130
+
+/*
+ * Test hook exposing npy_tanhl to Python.
+ *
+ * The argument is converted to an NPY_LONGDOUBLE array; the value at the
+ * start of its data buffer is passed to npy_tanhl and the result is
+ * returned in a new 0-d NPY_LONGDOUBLE array. Returns NULL if parsing,
+ * conversion or allocation fails.
+ */
+static PyObject *
+call_npy_tanhl(PyObject *NPY_UNUSED(self), PyObject *args)
+{
+    PyObject *arg = NULL;
+    PyObject *in_arr;
+    PyObject *out_arr;
+    npy_longdouble in_val;
+
+    if (!PyArg_ParseTuple(args, "O", &arg)) {
+        return NULL;
+    }
+
+    in_arr = PyArray_FROMANY(arg, NPY_LONGDOUBLE, 0, 0, NPY_ARRAY_CARRAY_RO);
+    if (in_arr == NULL) {
+        return NULL;
+    }
+
+    out_arr = PyArray_SimpleNew(0, NULL, NPY_LONGDOUBLE);
+    if (out_arr != NULL) {
+        in_val = *(npy_longdouble *)PyArray_DATA((PyArrayObject *)in_arr);
+        *(npy_longdouble *)PyArray_DATA((PyArrayObject *)out_arr) =
+            npy_tanhl(in_val);
+    }
+
+    /* the input array is released on both the success and failure path */
+    Py_DECREF(in_arr);
+    return out_arr;
+}
+
+
+
+
+
+/*
+ * Development/testing aid: format a float with the system printf "%g"
+ * conversion so results can be compared against the platform C library.
+ *
+ * `obj` may be a NumPy Half, Float, Double or LongDouble scalar; any other
+ * object is converted with PyFloat_AsDouble. `precision` is forwarded as the
+ * `*` precision argument. Returns a new str, or NULL if the generic
+ * conversion fails.
+ */
+PyObject *
+PrintFloat_Printf_g(PyObject *obj, int precision)
+{
+    char buf[1024];
+
+    if (PyArray_IsScalar(obj, Half)) {
+        /* half has no printf conversion of its own: widen to double first */
+        double widened = npy_half_to_double(PyArrayScalar_VAL(obj, Half));
+        PyOS_snprintf(buf, sizeof(buf), "%.*g", precision, widened);
+    }
+    else if (PyArray_IsScalar(obj, Float)) {
+        npy_float single = PyArrayScalar_VAL(obj, Float);
+        PyOS_snprintf(buf, sizeof(buf), "%.*g", precision, single);
+    }
+    else if (PyArray_IsScalar(obj, Double)) {
+        /* would be better to use lg, but not available in C90 */
+        npy_double dbl = PyArrayScalar_VAL(obj, Double);
+        PyOS_snprintf(buf, sizeof(buf), "%.*g", precision, dbl);
+    }
+    else if (PyArray_IsScalar(obj, LongDouble)) {
+        npy_longdouble ext = PyArrayScalar_VAL(obj, LongDouble);
+        PyOS_snprintf(buf, sizeof(buf), "%.*" NPY_LONGDOUBLE_FMT, precision,
+                      ext);
+    }
+    else {
+        /* anything else goes through the generic Python float protocol */
+        double generic = PyFloat_AsDouble(obj);
+        if (error_converting(generic)) {
+            return NULL;
+        }
+        PyOS_snprintf(buf, sizeof(buf), "%.*g", precision, generic);
+    }
+
+    return PyUnicode_FromString(buf);
+}
+
+
+/*
+ * Python wrapper around PrintFloat_Printf_g.
+ *
+ * Expects (obj, precision); a negative precision raises TypeError. `kwds` is
+ * accepted in the signature but never read.
+ */
+static PyObject *
+printf_float_g(PyObject *NPY_UNUSED(dummy), PyObject *args, PyObject *kwds)
+{
+    PyObject *value;
+    int digits;
+
+    if (!PyArg_ParseTuple(args,"Oi:format_float_OSprintf_g", &value,
+                                                             &digits)) {
+        return NULL;
+    }
+
+    if (digits >= 0) {
+        return PrintFloat_Printf_g(value, digits);
+    }
+
+    PyErr_SetString(PyExc_TypeError, "precision must be non-negative");
+    return NULL;
+}
+
+/*
+ * Round-trip the numeric ops: fetch them with PyArray_GetNumericOps, hand
+ * the same object back to PyArray_SetNumericOps and return that call's
+ * integer result as a Python int. Returns NULL if the ops cannot be fetched.
+ */
+static PyObject *
+getset_numericops(PyObject* NPY_UNUSED(self), PyObject* NPY_UNUSED(args))
+{
+    PyObject *numeric_ops;
+    PyObject *status_obj;
+    int status;
+
+    numeric_ops = PyArray_GetNumericOps();
+    if (numeric_ops == NULL) {
+        return NULL;
+    }
+
+    status = PyArray_SetNumericOps(numeric_ops);
+    status_obj = PyLong_FromLong(status);
+    Py_DECREF(numeric_ops);
+    return status_obj;
+}
+
+
+/*
+ * Test hook for PyArray_ByteorderConverter.
+ *
+ * Runs the converter on the single argument and returns the name of the
+ * resulting NPY_* byte-order constant as a str; a value matching none of the
+ * listed constants is returned as an int instead.
+ */
+static PyObject *
+run_byteorder_converter(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    char order_char;
+    const char *name = NULL;
+
+    if (!PyArg_ParseTuple(args, "O&", PyArray_ByteorderConverter, &order_char)) {
+        return NULL;
+    }
+
+    switch (order_char) {
+        case NPY_BIG:
+            name = "NPY_BIG";
+            break;
+        case NPY_LITTLE:
+            name = "NPY_LITTLE";
+            break;
+        case NPY_NATIVE:
+            name = "NPY_NATIVE";
+            break;
+        case NPY_SWAP:
+            name = "NPY_SWAP";
+            break;
+        case NPY_IGNORE:
+            name = "NPY_IGNORE";
+            break;
+        default:
+            break;
+    }
+
+    if (name != NULL) {
+        return PyUnicode_FromString(name);
+    }
+    return PyLong_FromLong(order_char);
+}
+
+/*
+ * Test hook for PyArray_SortkindConverter.
+ *
+ * Runs the converter on the single argument and returns the name of the
+ * resulting NPY_SORTKIND constant as a str; a value matching none of the
+ * listed constants is returned as an int instead.
+ */
+static PyObject *
+run_sortkind_converter(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    NPY_SORTKIND sort_kind;
+    const char *name = NULL;
+
+    if (!PyArg_ParseTuple(args, "O&", PyArray_SortkindConverter, &sort_kind)) {
+        return NULL;
+    }
+
+    switch (sort_kind) {
+        case NPY_QUICKSORT:
+            name = "NPY_QUICKSORT";
+            break;
+        case NPY_HEAPSORT:
+            name = "NPY_HEAPSORT";
+            break;
+        case NPY_STABLESORT:
+            name = "NPY_STABLESORT";
+            break;
+        default:
+            break;
+    }
+
+    if (name != NULL) {
+        return PyUnicode_FromString(name);
+    }
+    return PyLong_FromLong(sort_kind);
+}
+
+/*
+ * Test hook for PyArray_SelectkindConverter.
+ *
+ * Runs the converter on the single argument and returns "NPY_INTROSELECT"
+ * when that is the result; any other value is returned as an int.
+ */
+static PyObject *
+run_selectkind_converter(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    NPY_SELECTKIND select_kind;
+
+    if (!PyArg_ParseTuple(args, "O&", PyArray_SelectkindConverter, &select_kind)) {
+        return NULL;
+    }
+
+    if (select_kind == NPY_INTROSELECT) {
+        return PyUnicode_FromString("NPY_INTROSELECT");
+    }
+    return PyLong_FromLong(select_kind);
+}
+
+/*
+ * Test hook for PyArray_SearchsideConverter.
+ *
+ * Runs the converter on the single argument and returns the name of the
+ * resulting NPY_SEARCHSIDE constant as a str; a value matching none of the
+ * listed constants is returned as an int instead.
+ */
+static PyObject *
+run_searchside_converter(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    NPY_SEARCHSIDE search_side;
+    const char *name = NULL;
+
+    if (!PyArg_ParseTuple(args, "O&", PyArray_SearchsideConverter, &search_side)) {
+        return NULL;
+    }
+
+    switch (search_side) {
+        case NPY_SEARCHLEFT:
+            name = "NPY_SEARCHLEFT";
+            break;
+        case NPY_SEARCHRIGHT:
+            name = "NPY_SEARCHRIGHT";
+            break;
+        default:
+            break;
+    }
+
+    if (name != NULL) {
+        return PyUnicode_FromString(name);
+    }
+    return PyLong_FromLong(search_side);
+}
+
+/*
+ * Test hook for PyArray_OrderConverter.
+ *
+ * Runs the converter on the single argument and returns the name of the
+ * resulting NPY_ORDER constant as a str; a value matching none of the listed
+ * constants is returned as an int instead.
+ */
+static PyObject *
+run_order_converter(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    NPY_ORDER mem_order;
+    const char *name = NULL;
+
+    if (!PyArg_ParseTuple(args, "O&", PyArray_OrderConverter, &mem_order)) {
+        return NULL;
+    }
+
+    switch (mem_order) {
+        case NPY_ANYORDER:
+            name = "NPY_ANYORDER";
+            break;
+        case NPY_CORDER:
+            name = "NPY_CORDER";
+            break;
+        case NPY_FORTRANORDER:
+            name = "NPY_FORTRANORDER";
+            break;
+        case NPY_KEEPORDER:
+            name = "NPY_KEEPORDER";
+            break;
+        default:
+            break;
+    }
+
+    if (name != NULL) {
+        return PyUnicode_FromString(name);
+    }
+    return PyLong_FromLong(mem_order);
+}
+
+/*
+ * Test hook for PyArray_ClipmodeConverter.
+ *
+ * Runs the converter on the single argument and returns the name of the
+ * resulting NPY_CLIPMODE constant as a str; a value matching none of the
+ * listed constants is returned as an int instead.
+ */
+static PyObject *
+run_clipmode_converter(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    NPY_CLIPMODE clip_mode;
+    const char *name = NULL;
+
+    if (!PyArg_ParseTuple(args, "O&", PyArray_ClipmodeConverter, &clip_mode)) {
+        return NULL;
+    }
+
+    switch (clip_mode) {
+        case NPY_CLIP:
+            name = "NPY_CLIP";
+            break;
+        case NPY_WRAP:
+            name = "NPY_WRAP";
+            break;
+        case NPY_RAISE:
+            name = "NPY_RAISE";
+            break;
+        default:
+            break;
+    }
+
+    if (name != NULL) {
+        return PyUnicode_FromString(name);
+    }
+    return PyLong_FromLong(clip_mode);
+}
+
+/*
+ * Test hook for PyArray_CastingConverter.
+ *
+ * Runs the converter on the single argument and returns the name of the
+ * resulting NPY_CASTING constant as a str; a value matching none of the
+ * listed constants is returned as an int instead.
+ */
+static PyObject *
+run_casting_converter(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    NPY_CASTING cast_rule;
+    const char *name = NULL;
+
+    if (!PyArg_ParseTuple(args, "O&", PyArray_CastingConverter, &cast_rule)) {
+        return NULL;
+    }
+
+    switch (cast_rule) {
+        case NPY_NO_CASTING:
+            name = "NPY_NO_CASTING";
+            break;
+        case NPY_EQUIV_CASTING:
+            name = "NPY_EQUIV_CASTING";
+            break;
+        case NPY_SAFE_CASTING:
+            name = "NPY_SAFE_CASTING";
+            break;
+        case NPY_SAME_KIND_CASTING:
+            name = "NPY_SAME_KIND_CASTING";
+            break;
+        case NPY_UNSAFE_CASTING:
+            name = "NPY_UNSAFE_CASTING";
+            break;
+        default:
+            break;
+    }
+
+    if (name == NULL) {
+        return PyLong_FromLong(cast_rule);
+    }
+    return PyUnicode_FromString(name);
+}
+
+/*
+ * Test hook that drives PyArray_IntpConverter through PyArg_ParseTuple.
+ *
+ * `dims.len` starts out as -1; if the converter leaves it at that value the
+ * function returns None. Otherwise the converted dimensions are returned as
+ * a tuple and the converter-allocated buffer is freed.
+ */
+static PyObject *
+run_intp_converter(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    PyArray_Dims dims = {NULL, -1};
+    PyObject *shape_tuple;
+
+    if (!PyArg_ParseTuple(args, "O&", PyArray_IntpConverter, &dims)) {
+        return NULL;
+    }
+    if (dims.len == -1) {
+        Py_RETURN_NONE;
+    }
+
+    shape_tuple = PyArray_IntTupleFromIntp(dims.len, dims.ptr);
+    PyDimMem_FREE(dims.ptr);
+    return shape_tuple;
+}
+
+/*
+ * used to test NPY_ARRAY_ENSURENOCOPY raises ValueError
+ *
+ * `args` is passed straight to PyArray_CheckFromAny with the
+ * NPY_ARRAY_ENSURENOCOPY flag. Returns None when the conversion succeeds and
+ * NULL (with the conversion's exception set) when it fails.
+ */
+static PyObject*
+npy_ensurenocopy(PyObject* NPY_UNUSED(self), PyObject* args)
+{
+    int flags = NPY_ARRAY_ENSURENOCOPY;
+    PyObject *arr = PyArray_CheckFromAny(args, NULL, 0, 0, flags, NULL);
+
+    if (arr == NULL) {
+        return NULL;
+    }
+    /*
+     * PyArray_CheckFromAny returns a new reference; only success/failure
+     * matters here, so release it instead of leaking it.
+     */
+    Py_DECREF(arr);
+    Py_RETURN_NONE;
+}
+
+/*
+ * Test hook that calls PyArray_IntpConverter directly on `obj`.
+ *
+ * Returns the converted dimensions as a tuple (freeing the converter's
+ * buffer afterwards), or NULL if the converter reports failure.
+ */
+static PyObject *
+run_scalar_intp_converter(PyObject *NPY_UNUSED(self), PyObject *obj)
+{
+    PyArray_Dims dims;
+    PyObject *shape_tuple;
+
+    if (!PyArray_IntpConverter(obj, &dims)) {
+        return NULL;
+    }
+
+    shape_tuple = PyArray_IntTupleFromIntp(dims.len, dims.ptr);
+    PyDimMem_FREE(dims.ptr);
+    return shape_tuple;
+}
+
+/*
+ * Test hook that calls PyArray_IntpFromSequence on `obj` with room for a
+ * single value.
+ *
+ * Returns a 1-tuple built from that value, or NULL when the call returns -1.
+ */
+static PyObject *
+run_scalar_intp_from_sequence(PyObject *NPY_UNUSED(self), PyObject *obj)
+{
+    npy_intp single[1];
+
+    if (PyArray_IntpFromSequence(obj, single, 1) == -1) {
+        return NULL;
+    }
+    return PyArray_IntTupleFromIntp(1, single);
+}
+
+/*
+ * Method table of the _multiarray_tests module.
+ *
+ * Each entry is {Python-visible name, C implementation, calling-convention
+ * flags, docstring or NULL}; the all-NULL entry at the end terminates the
+ * table.
+ */
+static PyMethodDef Multiarray_TestsMethods[] = {
+    {"argparse_example_function",
+         (PyCFunction)argparse_example_function,
+         METH_KEYWORDS | METH_FASTCALL, NULL},
+    {"IsPythonScalar",
+        IsPythonScalar,
+        METH_VARARGS, NULL},
+    {"test_neighborhood_iterator",
+        test_neighborhood_iterator,
+        METH_VARARGS, NULL},
+    {"test_neighborhood_iterator_oob",
+        test_neighborhood_iterator_oob,
+        METH_VARARGS, NULL},
+    {"test_pydatamem_seteventhook_start",
+        test_pydatamem_seteventhook_start,
+        METH_NOARGS, NULL},
+    {"test_pydatamem_seteventhook_end",
+        test_pydatamem_seteventhook_end,
+        METH_NOARGS, NULL},
+    {"test_inplace_increment",
+        inplace_increment,
+        METH_VARARGS, NULL},
+    {"fromstring_null_term_c_api",
+        fromstring_null_term_c_api,
+        METH_O, NULL},
+    {"create_custom_field_dtype",
+        create_custom_field_dtype,
+        METH_VARARGS, NULL},
+    {"corrupt_or_fix_bufferinfo",
+        corrupt_or_fix_bufferinfo,
+        METH_O, NULL},
+    {"incref_elide",
+        incref_elide,
+        METH_VARARGS, NULL},
+    {"incref_elide_l",
+        incref_elide_l,
+        METH_VARARGS, NULL},
+    {"npy_char_deprecation",
+        npy_char_deprecation,
+        METH_NOARGS, NULL},
+    {"npy_pyarrayas1d_deprecation",
+        npy_pyarrayas1d_deprecation,
+        METH_NOARGS, NULL},
+    {"npy_pyarrayas2d_deprecation",
+        npy_pyarrayas2d_deprecation,
+        METH_NOARGS, NULL},
+    {"npy_create_writebackifcopy",
+        npy_create_writebackifcopy,
+        METH_O, NULL},
+    {"npy_abuse_writebackifcopy",
+        npy_abuse_writebackifcopy,
+        METH_O, NULL},
+    {"npy_resolve",
+        npy_resolve,
+        METH_O, NULL},
+    {"npy_discard",
+        npy_discard,
+        METH_O, NULL},
+    {"npy_ensurenocopy",
+        npy_ensurenocopy,
+        METH_O, NULL},
+    {"get_buffer_info",
+        get_buffer_info,
+        METH_VARARGS, NULL},
+    {"get_c_wrapping_array",
+        get_c_wrapping_array,
+        METH_O, NULL},
+    {"get_all_cast_information",
+        get_all_cast_information,
+        METH_NOARGS,
+        /* adjacent literals are concatenated: keep the trailing spaces */
+        "Return a list with info on all available casts. Some of the info "
+        "may differ for an actual cast if it uses value-based casting "
+        "(flexible types)."},
+    {"identityhash_tester",
+        (PyCFunction)identityhash_tester,
+        METH_KEYWORDS | METH_FASTCALL, NULL},
+    {"array_indexing",
+        array_indexing,
+        METH_VARARGS, NULL},
+    {"test_as_c_array",
+        test_as_c_array,
+        METH_VARARGS, NULL},
+    {"test_nditer_too_large",
+        test_nditer_too_large,
+        METH_VARARGS, NULL},
+    {"solve_diophantine",
+        (PyCFunction)array_solve_diophantine,
+        METH_VARARGS | METH_KEYWORDS, NULL},
+    {"internal_overlap",
+        (PyCFunction)array_internal_overlap,
+        METH_VARARGS | METH_KEYWORDS, NULL},
+    {"extint_safe_binop",
+        extint_safe_binop,
+        METH_VARARGS, NULL},
+    {"extint_to_128",
+        extint_to_128,
+        METH_VARARGS, NULL},
+    {"extint_to_64",
+        extint_to_64,
+        METH_VARARGS, NULL},
+    {"extint_mul_64_64",
+        extint_mul_64_64,
+        METH_VARARGS, NULL},
+    {"extint_add_128",
+        extint_add_128,
+        METH_VARARGS, NULL},
+    {"extint_sub_128",
+        extint_sub_128,
+        METH_VARARGS, NULL},
+    {"extint_neg_128",
+        extint_neg_128,
+        METH_VARARGS, NULL},
+    {"extint_shl_128",
+        extint_shl_128,
+        METH_VARARGS, NULL},
+    {"extint_shr_128",
+        extint_shr_128,
+        METH_VARARGS, NULL},
+    {"extint_gt_128",
+        extint_gt_128,
+        METH_VARARGS, NULL},
+    {"extint_divmod_128_64",
+        extint_divmod_128_64,
+        METH_VARARGS, NULL},
+    {"extint_floordiv_128_64",
+        extint_floordiv_128_64,
+        METH_VARARGS, NULL},
+    {"extint_ceildiv_128_64",
+        extint_ceildiv_128_64,
+        METH_VARARGS, NULL},
+    {"get_fpu_mode",
+        get_fpu_mode,
+        METH_VARARGS, get_fpu_mode_doc},
+    {"getset_numericops",
+        getset_numericops,
+        METH_NOARGS, NULL},
+#line 2533
+
+#line 2537
+    {"npy_cabsf",
+        call_npy_cabsf,
+        METH_VARARGS, NULL},
+
+#line 2537
+    {"npy_cabs",
+        call_npy_cabs,
+        METH_VARARGS, NULL},
+
+#line 2537
+    {"npy_cabsl",
+        call_npy_cabsl,
+        METH_VARARGS, NULL},
+
+
+
+#line 2533
+
+#line 2537
+    {"npy_cargf",
+        call_npy_cargf,
+        METH_VARARGS, NULL},
+
+#line 2537
+    {"npy_carg",
+        call_npy_carg,
+        METH_VARARGS, NULL},
+
+#line 2537
+    {"npy_cargl",
+        call_npy_cargl,
+        METH_VARARGS, NULL},
+
+
+
+
+#line 2547
+
+#line 2551
+    {"npy_log10f",
+        call_npy_log10f,
+        METH_VARARGS, NULL},
+
+#line 2551
+    {"npy_log10",
+        call_npy_log10,
+        METH_VARARGS, NULL},
+
+#line 2551
+    {"npy_log10l",
+        call_npy_log10l,
+        METH_VARARGS, NULL},
+
+
+
+#line 2547
+
+#line 2551
+    {"npy_coshf",
+        call_npy_coshf,
+        METH_VARARGS, NULL},
+
+#line 2551
+    {"npy_cosh",
+        call_npy_cosh,
+        METH_VARARGS, NULL},
+
+#line 2551
+    {"npy_coshl",
+        call_npy_coshl,
+        METH_VARARGS, NULL},
+
+
+
+#line 2547
+
+#line 2551
+    {"npy_sinhf",
+        call_npy_sinhf,
+        METH_VARARGS, NULL},
+
+#line 2551
+    {"npy_sinh",
+        call_npy_sinh,
+        METH_VARARGS, NULL},
+
+#line 2551
+    {"npy_sinhl",
+        call_npy_sinhl,
+        METH_VARARGS, NULL},
+
+
+
+#line 2547
+
+#line 2551
+    {"npy_tanf",
+        call_npy_tanf,
+        METH_VARARGS, NULL},
+
+#line 2551
+    {"npy_tan",
+        call_npy_tan,
+        METH_VARARGS, NULL},
+
+#line 2551
+    {"npy_tanl",
+        call_npy_tanl,
+        METH_VARARGS, NULL},
+
+
+
+#line 2547
+
+#line 2551
+    {"npy_tanhf",
+        call_npy_tanhf,
+        METH_VARARGS, NULL},
+
+#line 2551
+    {"npy_tanh",
+        call_npy_tanh,
+        METH_VARARGS, NULL},
+
+#line 2551
+    {"npy_tanhl",
+        call_npy_tanhl,
+        METH_VARARGS, NULL},
+
+
+
+    {"format_float_OSprintf_g",
+        (PyCFunction)printf_float_g,
+        METH_VARARGS , NULL},
+    {"get_struct_alignments",
+        get_struct_alignments,
+        METH_VARARGS, NULL},
+    {"run_byteorder_converter",
+        run_byteorder_converter,
+        METH_VARARGS, NULL},
+    {"run_sortkind_converter",
+        run_sortkind_converter,
+        METH_VARARGS, NULL},
+    {"run_selectkind_converter",
+        run_selectkind_converter,
+        METH_VARARGS, NULL},
+    {"run_searchside_converter",
+        run_searchside_converter,
+        METH_VARARGS, NULL},
+    {"run_order_converter",
+        run_order_converter,
+        METH_VARARGS, NULL},
+    {"run_clipmode_converter",
+        run_clipmode_converter,
+        METH_VARARGS, NULL},
+    {"run_casting_converter",
+        run_casting_converter,
+        METH_VARARGS, NULL},
+    {"run_scalar_intp_converter",
+        run_scalar_intp_converter,
+        METH_O, NULL},
+    {"run_scalar_intp_from_sequence",
+        run_scalar_intp_from_sequence,
+        METH_O, NULL},
+    {"run_intp_converter",
+        run_intp_converter,
+        METH_VARARGS, NULL},
+    {NULL, NULL, 0, NULL}        /* Sentinel */
+};
+
+
+/*
+ * Module definition handed to PyModule_Create in PyInit__multiarray_tests.
+ * Fields are given positionally, in PyModuleDef declaration order.
+ */
+static struct PyModuleDef moduledef = {
+        PyModuleDef_HEAD_INIT,
+        "_multiarray_tests",        /* m_name */
+        NULL,                       /* m_doc: no module docstring */
+        -1,                         /* m_size: no per-module state */
+        Multiarray_TestsMethods,    /* m_methods */
+        NULL,                       /* m_slots */
+        NULL,                       /* m_traverse */
+        NULL,                       /* m_clear */
+        NULL                        /* m_free */
+};
+
+/*
+ * Module initialisation entry point for _multiarray_tests.
+ *
+ * Creates the module from `moduledef` and imports the NumPy C API.
+ * Returns the new module, or NULL with an exception set on failure.
+ */
+PyMODINIT_FUNC PyInit__multiarray_tests(void)
+{
+    PyObject *m;
+
+    m = PyModule_Create(&moduledef);
+    if (m == NULL) {
+        return m;
+    }
+    /*
+     * NOTE(review): the import_array() macro is expected to return from this
+     * function by itself when the import fails -- confirm against the NumPy
+     * headers in use; `m` is not released on that path.
+     */
+    import_array();
+    if (PyErr_Occurred()) {
+        PyErr_SetString(PyExc_RuntimeError,
+                        "cannot load _multiarray_tests module.");
+        /*
+         * An init function must not hand back a module while an exception
+         * is pending: drop the module and report the failure with NULL.
+         */
+        Py_DECREF(m);
+        return NULL;
+    }
+    return m;
+}
+
+/*
+ * Always returns 1. Declared with NPY_NO_EXPORT.
+ * NOTE(review): presumably exists so that symbol-visibility checks can
+ * verify NPY_NO_EXPORT functions are not exported -- confirm with the tests.
+ */
+NPY_NO_EXPORT int
+test_not_exported(void)
+{
+    return 1;
+}
+
diff --git a/numpy/core/src/_generated/_simd.dispatch.c b/numpy/core/src/_generated/_simd.dispatch.c
new file mode 100644
index 000000000000..2c303e82b9fb
--- /dev/null
+++ b/numpy/core/src/_generated/_simd.dispatch.c
@@ -0,0 +1,22095 @@
+#line 1 "numpy/core/src/_simd/_simd.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*@targets #simd_test*/
+#include "_simd.h"
+#include "_simd_inc.h"
+
+#if NPY_SIMD
+#include "_simd_data.inc"
+#include "_simd_convert.inc"
+#include "_simd_vector.inc"
+#include "_simd_arg.inc"
+#include "_simd_easyintrin.inc"
+
+//#########################################################################
+//## Defining NPYV intrinsics as module functions
+//#########################################################################
+#line 36
+#if 1
+/***************************
+ * Memory
+ ***************************/
+// Contiguous u8 loads. Each SIMD_IMPL_INTRIN_1(name, ret, arg) line relies on
+// a macro that is not defined in this file (presumably provided by
+// _simd_easyintrin.inc -- confirm) to generate the wrapper for that intrinsic.
+#line 43
+SIMD_IMPL_INTRIN_1(load_u8, vu8, qu8)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loada_u8, vu8, qu8)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loads_u8, vu8, qu8)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loadl_u8, vu8, qu8)
+
+SIMD_IMPL_INTRIN_1(load_u8x2, vu8x2, qu8)
+
+#line 51
+// Python binding for npyv_store_u8(): takes (sequence, vector), runs the
+// intrinsic on the sequence's buffer and returns None (NULL on failure).
+static PyObject *
+simd__intrin_store_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu8};
+    simd_arg src = {.dtype = simd_data_vu8};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&:store_u8",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store_u8(dst.data.qu8, src.data.vu8);
+    // write-back into dst.obj
+    if (simd_sequence_fill_iterable(dst.obj, dst.data.qu8, simd_data_qu8)) {
+        goto fail;
+    }
+    simd_arg_free(&dst);
+    Py_RETURN_NONE;
+fail:
+    simd_arg_free(&dst);
+    return NULL;
+}
+
+#line 51
+// Python binding for npyv_storea_u8(): takes (sequence, vector), runs the
+// intrinsic on the sequence's buffer and returns None (NULL on failure).
+static PyObject *
+simd__intrin_storea_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu8};
+    simd_arg src = {.dtype = simd_data_vu8};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&:storea_u8",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storea_u8(dst.data.qu8, src.data.vu8);
+    // write-back into dst.obj
+    if (simd_sequence_fill_iterable(dst.obj, dst.data.qu8, simd_data_qu8)) {
+        goto fail;
+    }
+    simd_arg_free(&dst);
+    Py_RETURN_NONE;
+fail:
+    simd_arg_free(&dst);
+    return NULL;
+}
+
+#line 51
+// Python binding for npyv_stores_u8(): takes (sequence, vector), runs the
+// intrinsic on the sequence's buffer and returns None (NULL on failure).
+static PyObject *
+simd__intrin_stores_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu8};
+    simd_arg src = {.dtype = simd_data_vu8};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&:stores_u8",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_stores_u8(dst.data.qu8, src.data.vu8);
+    // write-back into dst.obj
+    if (simd_sequence_fill_iterable(dst.obj, dst.data.qu8, simd_data_qu8)) {
+        goto fail;
+    }
+    simd_arg_free(&dst);
+    Py_RETURN_NONE;
+fail:
+    simd_arg_free(&dst);
+    return NULL;
+}
+
+#line 51
+// Python binding for npyv_storel_u8(): takes (sequence, vector), runs the
+// intrinsic on the sequence's buffer and returns None (NULL on failure).
+static PyObject *
+simd__intrin_storel_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu8};
+    simd_arg src = {.dtype = simd_data_vu8};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&:storel_u8",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storel_u8(dst.data.qu8, src.data.vu8);
+    // write-back into dst.obj
+    if (simd_sequence_fill_iterable(dst.obj, dst.data.qu8, simd_data_qu8)) {
+        goto fail;
+    }
+    simd_arg_free(&dst);
+    Py_RETURN_NONE;
+fail:
+    simd_arg_free(&dst);
+    return NULL;
+}
+
+#line 51
+// Python binding for npyv_storeh_u8(): takes (sequence, vector), runs the
+// intrinsic on the sequence's buffer and returns None (NULL on failure).
+static PyObject *
+simd__intrin_storeh_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu8};
+    simd_arg src = {.dtype = simd_data_vu8};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&:storeh_u8",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storeh_u8(dst.data.qu8, src.data.vu8);
+    // write-back into dst.obj
+    if (simd_sequence_fill_iterable(dst.obj, dst.data.qu8, simd_data_qu8)) {
+        goto fail;
+    }
+    simd_arg_free(&dst);
+    Py_RETURN_NONE;
+fail:
+    simd_arg_free(&dst);
+    return NULL;
+}
+
+#line 51
+// Python binding for npyv_store_u8x2(): takes (sequence, vector pair), runs
+// the intrinsic on the sequence's buffer and returns None (NULL on failure).
+static PyObject *
+simd__intrin_store_u8x2(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu8};
+    simd_arg src = {.dtype = simd_data_vu8x2};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&:store_u8x2",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store_u8x2(dst.data.qu8, src.data.vu8x2);
+    // write-back into dst.obj
+    if (simd_sequence_fill_iterable(dst.obj, dst.data.qu8, simd_data_qu8)) {
+        goto fail;
+    }
+    simd_arg_free(&dst);
+    Py_RETURN_NONE;
+fail:
+    simd_arg_free(&dst);
+    return NULL;
+}
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 0
+// Partial Load
+// Wrappers for the partial-load intrinsics, generated through the
+// SIMD_IMPL_INTRIN_<N> macros (defined outside this file's visible portion).
+SIMD_IMPL_INTRIN_3(load_till_u8, vu8, qu8, u32, u8)
+SIMD_IMPL_INTRIN_2(load_tillz_u8, vu8, qu8, u32)
+// For this 8-bit instantiation the condition is false and both branches
+// below are textually identical, so the same two wrappers result either way.
+#if 8 == 32
+    SIMD_IMPL_INTRIN_4(load2_till_u8, vu8, qu8, u32, u8, u8)
+    SIMD_IMPL_INTRIN_2(load2_tillz_u8, vu8, qu8, u32)
+#else
+    SIMD_IMPL_INTRIN_4(load2_till_u8, vu8, qu8, u32, u8, u8)
+    SIMD_IMPL_INTRIN_2(load2_tillz_u8, vu8, qu8, u32)
+#endif
+
+// Partial Store
+#line 95
+#if !0 || 0 == 8
+// Python binding for npyv_store_till_u8(): takes (sequence, nlane, vector),
+// runs the intrinsic on the sequence's buffer and returns None (NULL on
+// failure).
+static PyObject *
+simd__intrin_store_till_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu8};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vu8};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&O&:store_till_u8",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store_till_u8(dst.data.qu8, count.data.u32, src.data.vu8);
+    // write-back into dst.obj
+    if (simd_sequence_fill_iterable(dst.obj, dst.data.qu8, simd_data_qu8)) {
+        goto fail;
+    }
+    simd_arg_free(&dst);
+    Py_RETURN_NONE;
+fail:
+    simd_arg_free(&dst);
+    return NULL;
+}
+#endif // chksize
+
+
+#line 95
+#if !32 || 32 == 8
+// Python binding for npyv_store2_till_u8(): takes (sequence, nlane, vector),
+// runs the intrinsic on the sequence's buffer and returns None (NULL on
+// failure). The guard above is false for 8-bit lanes, so this variant is
+// compiled out here.
+static PyObject *
+simd__intrin_store2_till_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu8};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vu8};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&O&:store2_till_u8",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store2_till_u8(dst.data.qu8, count.data.u32, src.data.vu8);
+    // write-back into dst.obj
+    if (simd_sequence_fill_iterable(dst.obj, dst.data.qu8, simd_data_qu8)) {
+        goto fail;
+    }
+    simd_arg_free(&dst);
+    Py_RETURN_NONE;
+fail:
+    simd_arg_free(&dst);
+    return NULL;
+}
+#endif // chksize
+
+
+#line 95
+#if !64 || 64 == 8
+// Python binding for npyv_store2_till_u8(): takes (sequence, nlane, vector),
+// runs the intrinsic on the sequence's buffer and returns None (NULL on
+// failure). The guard above is false for 8-bit lanes, so this variant is
+// compiled out here.
+static PyObject *
+simd__intrin_store2_till_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu8};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vu8};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&O&:store2_till_u8",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store2_till_u8(dst.data.qu8, count.data.u32, src.data.vu8);
+    // write-back into dst.obj
+    if (simd_sequence_fill_iterable(dst.obj, dst.data.qu8, simd_data_qu8)) {
+        goto fail;
+    }
+    simd_arg_free(&dst);
+    Py_RETURN_NONE;
+fail:
+    simd_arg_free(&dst);
+    return NULL;
+}
+#endif // chksize
+
+
+// Non-contiguous Load
+#line 136
+#if !0 || 0 == 8
+/*
+ * Python binding for npyv_loadn_u8(): takes (sequence, stride) and returns
+ * the loaded vector as a Python object. Raises ValueError when the sequence
+ * is shorter than |stride| * npyv_nlanes_u8; returns NULL on any failure.
+ */
+static PyObject *
+simd__intrin_loadn_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn_u8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u8 *seq_ptr = seq_arg.data.qu8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u8;
+    if (stride < 0) {
+        // negative stride: start from the tail of the buffer
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; %d would be undefined
+        // behaviour wherever Py_ssize_t is wider than int
+        PyErr_Format(PyExc_ValueError,
+            "loadn_u8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u8 rvec = npyv_loadn_u8(seq_ptr, stride);
+    simd_arg ret = {
+        .dtype = simd_data_vu8, .data = {.vu8=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 8
+/*
+ * Python binding for npyv_loadn2_u8(): takes (sequence, stride) and returns
+ * the loaded vector as a Python object. Raises ValueError when the sequence
+ * is shorter than |stride| * npyv_nlanes_u8; returns NULL on any failure.
+ * The guard above is false for 8-bit lanes, so this variant is compiled out.
+ */
+static PyObject *
+simd__intrin_loadn2_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_u8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u8 *seq_ptr = seq_arg.data.qu8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u8;
+    if (stride < 0) {
+        // negative stride: start from the tail of the buffer
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; %d would be undefined
+        // behaviour wherever Py_ssize_t is wider than int
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_u8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u8 rvec = npyv_loadn2_u8(seq_ptr, stride);
+    simd_arg ret = {
+        .dtype = simd_data_vu8, .data = {.vu8=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 8
+/*
+ * Python binding for npyv_loadn2_u8(): takes (sequence, stride) and returns
+ * the loaded vector as a Python object. Raises ValueError when the sequence
+ * is shorter than |stride| * npyv_nlanes_u8; returns NULL on any failure.
+ * The guard above is false for 8-bit lanes, so this variant is compiled out.
+ */
+static PyObject *
+simd__intrin_loadn2_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_u8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u8 *seq_ptr = seq_arg.data.qu8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u8;
+    if (stride < 0) {
+        // negative stride: start from the tail of the buffer
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; %d would be undefined
+        // behaviour wherever Py_ssize_t is wider than int
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_u8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u8 rvec = npyv_loadn2_u8(seq_ptr, stride);
+    simd_arg ret = {
+        .dtype = simd_data_vu8, .data = {.vu8=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 8
+/*
+ * Python binding for npyv_loadn_till_u8(): takes (sequence, stride, nlane,
+ * fill) and returns the loaded vector as a Python object. Raises ValueError
+ * when the sequence is shorter than |stride| * npyv_nlanes_u8; returns NULL
+ * on any failure.
+ */
+static PyObject *
+simd__intrin_loadn_till_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg fill_arg = {.dtype = simd_data_u8};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:loadn_till_u8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &fill_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u8 *seq_ptr = seq_arg.data.qu8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u8;
+    if (stride < 0) {
+        // negative stride: start from the tail of the buffer
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; %d would be undefined
+        // behaviour wherever Py_ssize_t is wider than int
+        PyErr_Format(PyExc_ValueError,
+            "loadn_till_u8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u8 rvec = npyv_loadn_till_u8(
+        seq_ptr, stride, nlane_arg.data.u32, fill_arg.data.u8
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vu8, .data = {.vu8=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 8
+/*
+ * Python binding for npyv_loadn2_till_u8(): takes (sequence, stride, nlane,
+ * fill, fill2) and returns the loaded vector as a Python object. Raises
+ * ValueError when the sequence is shorter than |stride| * npyv_nlanes_u8;
+ * returns NULL on any failure.
+ * The guard above is false for 8-bit lanes, so this variant is compiled out.
+ */
+static PyObject *
+simd__intrin_loadn2_till_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg fill_arg = {.dtype = simd_data_u8};
+    simd_arg fill2_arg = {.dtype = simd_data_u8};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_u8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &fill_arg,
+        simd_arg_converter, &fill2_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u8 *seq_ptr = seq_arg.data.qu8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u8;
+    if (stride < 0) {
+        // negative stride: start from the tail of the buffer
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; %d would be undefined
+        // behaviour wherever Py_ssize_t is wider than int
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_till_u8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u8 rvec = npyv_loadn2_till_u8(
+        seq_ptr, stride, nlane_arg.data.u32,
+        fill_arg.data.u8, fill2_arg.data.u8
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vu8, .data = {.vu8=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 8
+/*
+ * Python binding for npyv_loadn2_till_u8(): takes (sequence, stride, nlane,
+ * fill, fill2) and returns the loaded vector as a Python object. Raises
+ * ValueError when the sequence is shorter than |stride| * npyv_nlanes_u8;
+ * returns NULL on any failure.
+ * The guard above is false for 8-bit lanes, so this variant is compiled out.
+ */
+static PyObject *
+simd__intrin_loadn2_till_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg fill_arg = {.dtype = simd_data_u8};
+    simd_arg fill2_arg = {.dtype = simd_data_u8};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_u8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &fill_arg,
+        simd_arg_converter, &fill2_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u8 *seq_ptr = seq_arg.data.qu8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u8;
+    if (stride < 0) {
+        // negative stride: start from the tail of the buffer
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; %d would be undefined
+        // behaviour wherever Py_ssize_t is wider than int
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_till_u8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u8 rvec = npyv_loadn2_till_u8(
+        seq_ptr, stride, nlane_arg.data.u32,
+        fill_arg.data.u8, fill2_arg.data.u8
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vu8, .data = {.vu8=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 8
+/*
+ * Python binding for npyv_loadn_tillz_u8(): takes (sequence, stride, nlane)
+ * and returns the loaded vector as a Python object. Raises ValueError when
+ * the sequence is shorter than |stride| * npyv_nlanes_u8; returns NULL on
+ * any failure.
+ */
+static PyObject *
+simd__intrin_loadn_tillz_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn_tillz_u8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u8 *seq_ptr = seq_arg.data.qu8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u8;
+    if (stride < 0) {
+        // negative stride: start from the tail of the buffer
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; %d would be undefined
+        // behaviour wherever Py_ssize_t is wider than int
+        PyErr_Format(PyExc_ValueError,
+            "loadn_tillz_u8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u8 rvec = npyv_loadn_tillz_u8(seq_ptr, stride, nlane_arg.data.u32);
+    simd_arg ret = {
+        .dtype = simd_data_vu8, .data = {.vu8=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 8
+/*
+ * Python binding for npyv_loadn2_tillz_u8(): takes (sequence, stride, nlane)
+ * and returns the loaded vector as a Python object. Raises ValueError when
+ * the sequence is shorter than |stride| * npyv_nlanes_u8; returns NULL on
+ * any failure.
+ * The guard above is false for 8-bit lanes, so this variant is compiled out.
+ */
+static PyObject *
+simd__intrin_loadn2_tillz_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_u8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u8 *seq_ptr = seq_arg.data.qu8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u8;
+    if (stride < 0) {
+        // negative stride: start from the tail of the buffer
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; %d would be undefined
+        // behaviour wherever Py_ssize_t is wider than int
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_u8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u8 rvec = npyv_loadn2_tillz_u8(seq_ptr, stride, nlane_arg.data.u32);
+    simd_arg ret = {
+        .dtype = simd_data_vu8, .data = {.vu8=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 8
+/*
+ * Python binding for npyv_loadn2_tillz_u8(): takes (sequence, stride, nlane)
+ * and returns the loaded vector as a Python object. Raises ValueError when
+ * the sequence is shorter than |stride| * npyv_nlanes_u8; returns NULL on
+ * any failure.
+ * The guard above is false for 8-bit lanes, so this variant is compiled out.
+ */
+static PyObject *
+simd__intrin_loadn2_tillz_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_u8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u8 *seq_ptr = seq_arg.data.qu8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u8;
+    if (stride < 0) {
+        // negative stride: start from the tail of the buffer
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; %d would be undefined
+        // behaviour wherever Py_ssize_t is wider than int
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_u8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u8 rvec = npyv_loadn2_tillz_u8(seq_ptr, stride, nlane_arg.data.u32);
+    simd_arg ret = {
+        .dtype = simd_data_vu8, .data = {.vu8=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+
+// Non-contiguous Store
+#line 216
+#if !0 || 0 == 8
+/*
+ * Python binding for npyv_storen_u8(): takes (sequence, stride, vector),
+ * runs the intrinsic on the sequence's buffer, then writes the buffer back
+ * into the sequence object. Returns None. Raises ValueError when the
+ * sequence is shorter than |stride| * npyv_nlanes_u8; returns NULL on any
+ * failure.
+ */
+static PyObject *
+simd__intrin_storen_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vu8};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen_u8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u8 *seq_ptr = seq_arg.data.qu8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u8;
+    if (stride < 0) {
+        // negative stride: start from the tail of the buffer
+        seq_ptr += cur_seq_len - 1*1;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; %d would be undefined
+        // behaviour wherever Py_ssize_t is wider than int
+        PyErr_Format(PyExc_ValueError,
+            "storen_u8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen_u8(seq_ptr, stride, vec_arg.data.vu8);
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu8, simd_data_qu8)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 8
+/*
+ * Python binding for npyv_storen2_u8(): takes (sequence, stride, vector),
+ * runs the intrinsic on the sequence's buffer, then writes the buffer back
+ * into the sequence object. Returns None. Raises ValueError when the
+ * sequence is shorter than |stride| * npyv_nlanes_u8; returns NULL on any
+ * failure.
+ * The guard above is false for 8-bit lanes, so this variant is compiled out.
+ */
+static PyObject *
+simd__intrin_storen2_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vu8};
+    // the name after ':' is what argument-parsing errors report
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_u8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u8 *seq_ptr = seq_arg.data.qu8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u8;
+    if (stride < 0) {
+        // negative stride: start from the tail of the buffer
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; %d would be undefined
+        // behaviour wherever Py_ssize_t is wider than int
+        PyErr_Format(PyExc_ValueError,
+            "storen2_u8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_u8(seq_ptr, stride, vec_arg.data.vu8);
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu8, simd_data_qu8)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 8
+/*
+ * Python binding for npyv_storen2_u8(): takes (sequence, stride, vector),
+ * runs the intrinsic on the sequence's buffer, then writes the buffer back
+ * into the sequence object. Returns None. Raises ValueError when the
+ * sequence is shorter than |stride| * npyv_nlanes_u8; returns NULL on any
+ * failure.
+ * The guard above is false for 8-bit lanes, so this variant is compiled out.
+ */
+static PyObject *
+simd__intrin_storen2_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vu8};
+    // the name after ':' is what argument-parsing errors report
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_u8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u8 *seq_ptr = seq_arg.data.qu8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u8;
+    if (stride < 0) {
+        // negative stride: start from the tail of the buffer
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; %d would be undefined
+        // behaviour wherever Py_ssize_t is wider than int
+        PyErr_Format(PyExc_ValueError,
+            "storen2_u8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_u8(seq_ptr, stride, vec_arg.data.vu8);
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu8, simd_data_qu8)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !0 || 0 == 8
+/*
+ * Python binding for npyv_storen_till_u8(): takes (sequence, stride, nlane,
+ * vector), runs the intrinsic on the sequence's buffer, then writes the
+ * buffer back into the sequence object. Returns None. Raises ValueError when
+ * the sequence is shorter than |stride| * npyv_nlanes_u8; returns NULL on
+ * any failure.
+ */
+static PyObject *
+simd__intrin_storen_till_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vu8};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    // the name after ':' is what argument-parsing errors report
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen_till_u8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u8 *seq_ptr = seq_arg.data.qu8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u8;
+    if (stride < 0) {
+        // negative stride: start from the tail of the buffer
+        seq_ptr += cur_seq_len - 1*1;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; %d would be undefined
+        // behaviour wherever Py_ssize_t is wider than int
+        PyErr_Format(PyExc_ValueError,
+            "storen_till_u8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen_till_u8(
+        seq_ptr, stride, nlane_arg.data.u32, vec_arg.data.vu8
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu8, simd_data_qu8)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 8 // evaluates to false: this variant is not compiled
+// Python binding for npyv_storen2_till_u8(): storen2_till_u8(seq, stride, nlane, vec) -> None (NULL with an exception set on failure).
+static PyObject *
+simd__intrin_storen2_till_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vu8};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_u8", // text after ':' names this function in argument errors
+        simd_arg_converter, &seq_arg, simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u8 *seq_ptr = seq_arg.data.qu8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u8; // NOTE(review): product is not checked for overflow
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*2; // negative stride: start two elements before the end of the buffer
+        min_seq_len = -min_seq_len;
+    }
+    // bounds guard: reject buffers shorter than |stride| * npyv_nlanes_u8 elements
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError, // %zd: the arguments are Py_ssize_t, not int
+            "storen2_till_u8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_till_u8(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vu8
+    );
+    // write-back: pass the whole C buffer (not the offset seq_ptr) back to the Python object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu8, simd_data_qu8)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 8 // evaluates to false: this variant is not compiled
+// Python binding for npyv_storen2_till_u8(): storen2_till_u8(seq, stride, nlane, vec) -> None (NULL with an exception set on failure).
+static PyObject *
+simd__intrin_storen2_till_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vu8};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_u8", // text after ':' names this function in argument errors
+        simd_arg_converter, &seq_arg, simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u8 *seq_ptr = seq_arg.data.qu8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u8; // NOTE(review): product is not checked for overflow
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*2; // negative stride: start two elements before the end of the buffer
+        min_seq_len = -min_seq_len;
+    }
+    // bounds guard: reject buffers shorter than |stride| * npyv_nlanes_u8 elements
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError, // %zd: the arguments are Py_ssize_t, not int
+            "storen2_till_u8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_till_u8(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vu8
+    );
+    // write-back: pass the whole C buffer (not the offset seq_ptr) back to the Python object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu8, simd_data_qu8)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#endif // 0
+
+/****************************
+ * Lookup tables -- neither `8 == 32` nor `8 == 64` holds, so no lookup wrapper is emitted here
+ ****************************/
+#if 8 == 32
+SIMD_IMPL_INTRIN_2(lut32_u8, vu8, qu8, vu8)
+#endif
+#if 8 == 64
+SIMD_IMPL_INTRIN_2(lut16_u8, vu8, qu8, vu8)
+#endif
+/***************************
+ * Misc -- NOTE(review): SIMD_IMPL_INTRIN_N is defined outside this chunk; presumably (name, return type, N argument types)
+ ***************************/
+SIMD_IMPL_INTRIN_0(zero_u8, vu8)
+SIMD_IMPL_INTRIN_1(extract0_u8, u8, vu8)
+SIMD_IMPL_INTRIN_1(setall_u8, vu8, u8)
+SIMD_IMPL_INTRIN_3(select_u8, vu8, vb8, vu8, vu8)
+
+#line 296
+#if 1 // reinterpret_X_u8 wrappers follow: one guarded stanza per destination type X, each taking a vu8
+SIMD_IMPL_INTRIN_1(reinterpret_u8_u8, vu8, vu8)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s8_u8, vs8, vu8)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u16_u8, vu16, vu8)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s16_u8, vs16, vu8)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u32_u8, vu32, vu8)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s32_u8, vs32, vu8)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u64_u8, vu64, vu8)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s64_u8, vs64, vu8)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F32 // emitted only when NPY_SIMD_F32 is non-zero
+SIMD_IMPL_INTRIN_1(reinterpret_f32_u8, vf32, vu8)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F64 // emitted only when NPY_SIMD_F64 is non-zero
+SIMD_IMPL_INTRIN_1(reinterpret_f64_u8, vf64, vu8)
+#endif // simd_sup2
+
+
+/**
+ * Hand-written wrappers for npyv_setf_u8 and npyv_set_u8: both intrinsics take
+ * their values as individual arguments, fed here from a C buffer built out of `args`.
+*/
+#line 308
+static PyObject *
+simd__intrin_setf_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    npyv_lanetype_u8 *lanes = simd_sequence_from_iterable(args, simd_data_qu8, npyv_nlanes_u8);
+    if (!lanes) {
+        return NULL;
+    }
+    simd_data packed = {.vu8 = npyv_setf_u8(
+        lanes[0],  lanes[1],  lanes[2],  lanes[3],  lanes[4],  lanes[5],  lanes[6],  lanes[7],
+        lanes[8],  lanes[9],  lanes[10], lanes[11], lanes[12], lanes[13], lanes[14], lanes[15],
+        lanes[16], lanes[17], lanes[18], lanes[19], lanes[20], lanes[21], lanes[22], lanes[23],
+        lanes[24], lanes[25], lanes[26], lanes[27], lanes[28], lanes[29], lanes[30], lanes[31],
+        lanes[32], lanes[33], lanes[34], lanes[35], lanes[36], lanes[37], lanes[38], lanes[39],
+        lanes[40], lanes[41], lanes[42], lanes[43], lanes[44], lanes[45], lanes[46], lanes[47],
+        lanes[48], lanes[49], lanes[50], lanes[51], lanes[52], lanes[53], lanes[54], lanes[55],
+        lanes[56], lanes[57], lanes[58], lanes[59], lanes[60], lanes[61], lanes[62], lanes[63],
+        lanes[64] // for setf -- NOTE(review): 65 slots are named but only npyv_nlanes_u8 were requested above; presumably the intrinsic drops the surplus, confirm
+    )};
+    simd_sequence_free(lanes);
+    return (PyObject*)PySIMDVector_FromData(packed, simd_data_vu8);
+}
+
+#line 308
+// Python binding for npyv_set_u8(): converts `args` to a C buffer, passes its
+// elements to the intrinsic one by one and wraps the resulting vector.
+static PyObject *
+simd__intrin_set_u8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    npyv_lanetype_u8 *lanes = simd_sequence_from_iterable(args, simd_data_qu8, npyv_nlanes_u8);
+    if (!lanes) { return NULL; }
+    simd_data packed = {.vu8 = npyv_set_u8(
+        lanes[0],  lanes[1],  lanes[2],  lanes[3],  lanes[4],  lanes[5],  lanes[6],  lanes[7],
+        lanes[8],  lanes[9],  lanes[10], lanes[11], lanes[12], lanes[13], lanes[14], lanes[15],
+        lanes[16], lanes[17], lanes[18], lanes[19], lanes[20], lanes[21], lanes[22], lanes[23],
+        lanes[24], lanes[25], lanes[26], lanes[27], lanes[28], lanes[29], lanes[30], lanes[31],
+        lanes[32], lanes[33], lanes[34], lanes[35], lanes[36], lanes[37], lanes[38], lanes[39],
+        lanes[40], lanes[41], lanes[42], lanes[43], lanes[44], lanes[45], lanes[46], lanes[47],
+        lanes[48], lanes[49], lanes[50], lanes[51], lanes[52], lanes[53], lanes[54], lanes[55],
+        lanes[56], lanes[57], lanes[58], lanes[59], lanes[60], lanes[61], lanes[62], lanes[63],
+        lanes[64] // for setf -- NOTE(review): slot shared with the setf variant; only npyv_nlanes_u8 elements were requested above, confirm the surplus is dropped
+    )};
+    simd_sequence_free(lanes);
+    return (PyObject*)PySIMDVector_FromData(packed, simd_data_vu8);
+}
+
+
+/***************************
+ * Reorder -- wrappers declared through SIMD_IMPL_INTRIN_N (defined outside this chunk)
+ ***************************/
+#line 337
+SIMD_IMPL_INTRIN_2(combinel_u8, vu8, vu8, vu8)
+
+#line 337
+SIMD_IMPL_INTRIN_2(combineh_u8, vu8, vu8, vu8)
+
+
+#line 343
+SIMD_IMPL_INTRIN_2(combine_u8, vu8x2, vu8, vu8)
+
+#line 343
+SIMD_IMPL_INTRIN_2(zip_u8, vu8x2, vu8, vu8)
+
+#line 343
+SIMD_IMPL_INTRIN_2(unzip_u8, vu8x2, vu8, vu8)
+
+
+#if 1 // rev64 wrapper is emitted for u8
+SIMD_IMPL_INTRIN_1(rev64_u8, vu8, vu8)
+#endif
+
+// special implementation to convert runtime constants to immediate values: each runtime index is dispatched to a call with literal arguments
+#if 8 == 32 // false here, so this branch is compiled out
+// one call per element index, then gather the results into one vector
+// instead of unrolling all 4*4*4*4 = 256 index combinations.
+NPY_FINLINE npyv_u8
+npyv_permi128_u8_(npyv_u8 a, unsigned e0, unsigned e1, unsigned e2, unsigned e3)
+{
+   #line 360
+    npyv_u8 ve0;
+    npyv_lanetype_u8 de0[npyv_nlanes_u8];
+    if (0) {} // lets every generated case below start with 'else if'
+   #line 366
+    else if (e0 == 1) {
+        ve0 = npyv_permi128_u8(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e0 == 2) {
+        ve0 = npyv_permi128_u8(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e0 == 3) {
+        ve0 = npyv_permi128_u8(a, 3, 3, 3, 3);
+    }
+    
+    else { // 0 and any out-of-range index
+        ve0 = npyv_permi128_u8(a, 0, 0, 0, 0);
+    }
+    npyv_store_u8(de0, ve0); // keep the e0 result as scalars for the merge below
+    
+#line 360
+    npyv_u8 ve1;
+    npyv_lanetype_u8 de1[npyv_nlanes_u8];
+    if (0) {}
+   #line 366
+    else if (e1 == 1) {
+        ve1 = npyv_permi128_u8(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e1 == 2) {
+        ve1 = npyv_permi128_u8(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e1 == 3) {
+        ve1 = npyv_permi128_u8(a, 3, 3, 3, 3);
+    }
+    
+    else { // 0 and any out-of-range index
+        ve1 = npyv_permi128_u8(a, 0, 0, 0, 0);
+    }
+    npyv_store_u8(de1, ve1);
+    
+#line 360
+    npyv_u8 ve2;
+    npyv_lanetype_u8 de2[npyv_nlanes_u8];
+    if (0) {}
+   #line 366
+    else if (e2 == 1) {
+        ve2 = npyv_permi128_u8(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e2 == 2) {
+        ve2 = npyv_permi128_u8(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e2 == 3) {
+        ve2 = npyv_permi128_u8(a, 3, 3, 3, 3);
+    }
+    
+    else { // 0 and any out-of-range index
+        ve2 = npyv_permi128_u8(a, 0, 0, 0, 0);
+    }
+    npyv_store_u8(de2, ve2);
+    
+#line 360
+    npyv_u8 ve3;
+    npyv_lanetype_u8 de3[npyv_nlanes_u8];
+    if (0) {}
+   #line 366
+    else if (e3 == 1) {
+        ve3 = npyv_permi128_u8(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e3 == 2) {
+        ve3 = npyv_permi128_u8(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e3 == 3) {
+        ve3 = npyv_permi128_u8(a, 3, 3, 3, 3);
+    }
+    
+    else { // 0 and any out-of-range index
+        ve3 = npyv_permi128_u8(a, 0, 0, 0, 0);
+    }
+    npyv_store_u8(de3, ve3);
+    
+    if (e0 == e1 && e0 == e2 && e0 == e3) { // all indices equal: no merge needed
+        return ve0;
+    }
+    for (int i = 0; i < npyv_nlanes_u8; i += 4) { // per group of four: slot 0 stays from e0, slots 1..3 come from e1..e3
+        de0[i+1] = de1[i+1];
+        de0[i+2] = de2[i+2];
+        de0[i+3] = de3[i+3];
+    }
+    return npyv_load_u8(de0);
+}
+SIMD_IMPL_INTRIN_5(permi128_u8_, vu8, vu8, u8, u8, u8, u8)
+#elif 8 == 64
+NPY_FINLINE npyv_u8
+npyv_permi128_u8_(npyv_u8 a, unsigned e0, unsigned e1)
+{
+    // the indices must reach the intrinsic as literals, so each supported pair has its own call
+    if (e0 == 0 && e1 == 1) {
+        return npyv_permi128_u8(a, 0, 1);
+    }
+    if (e0 == 1) {
+        if (e1 == 0) { return npyv_permi128_u8(a, 1, 0); }
+        if (e1 == 1) { return npyv_permi128_u8(a, 1, 1); }
+    }
+    // every other combination, including out-of-range indices, maps to (0, 0)
+    return npyv_permi128_u8(a, 0, 0);
+}
+SIMD_IMPL_INTRIN_3(permi128_u8_, vu8, vu8, u8, u8)
+#endif
+
+/***************************
+ * Operators -- shift, bitwise, comparison and any/all wrappers for u8
+ ***************************/
+#if 0 > 0 // false: the shift wrappers below are not emitted for u8
+SIMD_IMPL_INTRIN_2(shl_u8, vu8, vu8, u8)
+SIMD_IMPL_INTRIN_2(shr_u8, vu8, vu8, u8)
+// immediate-constant variants
+SIMD_IMPL_INTRIN_2IMM(shli_u8, vu8, vu8, 0)
+SIMD_IMPL_INTRIN_2IMM(shri_u8, vu8, vu8, 0)
+#endif // shl_imm
+
+#line 418
+SIMD_IMPL_INTRIN_2(and_u8, vu8, vu8, vu8)
+
+#line 418
+SIMD_IMPL_INTRIN_2(or_u8, vu8, vu8, vu8)
+
+#line 418
+SIMD_IMPL_INTRIN_2(xor_u8, vu8, vu8, vu8)
+
+
+SIMD_IMPL_INTRIN_1(not_u8, vu8, vu8)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpeq_u8, vb8, vu8, vu8)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpneq_u8, vb8, vu8, vu8)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpgt_u8, vb8, vu8, vu8)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpge_u8, vb8, vu8, vu8)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmplt_u8, vb8, vu8, vu8)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmple_u8, vb8, vu8, vu8)
+
+
+#if 1 // emitted for u8; note the last three operate on the vb8 type
+SIMD_IMPL_INTRIN_2(andc_u8, vu8, vu8, vu8)
+SIMD_IMPL_INTRIN_2(andc_b8, vb8, vb8, vb8)
+SIMD_IMPL_INTRIN_2(orc_b8, vb8, vb8, vb8)
+SIMD_IMPL_INTRIN_2(xnor_b8, vb8, vb8, vb8)
+#endif
+
+// tests across all vector lanes (declared with a u8 return type)
+#line 440
+SIMD_IMPL_INTRIN_1(any_u8, u8, vu8)
+
+#line 440
+SIMD_IMPL_INTRIN_1(all_u8, u8, vu8)
+
+/***************************
+ * Conversion -- vb8/vu8 conversions in both directions, plus the u8 to u16 expansion
+ ***************************/
+SIMD_IMPL_INTRIN_1(cvt_u8_b8, vu8,  vb8)
+SIMD_IMPL_INTRIN_1(cvt_b8_u8, vb8, vu8)
+#if 1 // expand wrapper is emitted for u8; it is declared to return a vu16x2
+SIMD_IMPL_INTRIN_1(expand_u16_u8, vu16x2, vu8)
+#endif // expand_sup
+/***************************
+ * Arithmetic -- `#if 1` stanzas are emitted for u8, `#if 0` stanzas are compiled out
+ ***************************/
+#line 456
+SIMD_IMPL_INTRIN_2(add_u8, vu8, vu8, vu8)
+
+#line 456
+SIMD_IMPL_INTRIN_2(sub_u8, vu8, vu8, vu8)
+
+
+#if 1
+#line 463
+SIMD_IMPL_INTRIN_2(adds_u8, vu8, vu8, vu8)
+
+#line 463
+SIMD_IMPL_INTRIN_2(subs_u8, vu8, vu8, vu8)
+
+#endif // sat_sup
+
+#if 1
+SIMD_IMPL_INTRIN_2(mul_u8, vu8, vu8, vu8)
+#endif // mul_sup
+
+#if 0 // compiled out: no div wrapper for u8
+SIMD_IMPL_INTRIN_2(div_u8, vu8, vu8, vu8)
+#endif // div_sup
+
+#if 1 // divisor_u8 is declared to return a vu8x3, which divc_u8 takes as its second argument
+SIMD_IMPL_INTRIN_1(divisor_u8, vu8x3, u8)
+SIMD_IMPL_INTRIN_2(divc_u8, vu8, vu8, vu8x3)
+#endif // intdiv_sup
+
+#if 0 // compiled out: no fused multiply wrappers for u8
+#line 484
+SIMD_IMPL_INTRIN_3(muladd_u8, vu8, vu8, vu8, vu8)
+
+#line 484
+SIMD_IMPL_INTRIN_3(mulsub_u8, vu8, vu8, vu8, vu8)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmuladd_u8, vu8, vu8, vu8, vu8)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmulsub_u8, vu8, vu8, vu8, vu8)
+
+#line 484
+SIMD_IMPL_INTRIN_3(muladdsub_u8, vu8, vu8, vu8, vu8)
+
+#endif // fused_sup
+
+#if 0 // compiled out: no sum wrapper for u8
+SIMD_IMPL_INTRIN_1(sum_u8, u8, vu8)
+#endif // sum_sup
+
+#if 1 // sumup_u8 is declared with a u16 return type
+SIMD_IMPL_INTRIN_1(sumup_u8, u16, vu8)
+#endif // sumup_sup
+
+/***************************
+ * Math -- only max/min and their reductions are emitted for u8; both `#if 0` stanzas are compiled out
+ ***************************/
+#if 0 // compiled out for u8
+#line 503
+SIMD_IMPL_INTRIN_1(sqrt_u8, vu8, vu8)
+
+#line 503
+SIMD_IMPL_INTRIN_1(recip_u8, vu8, vu8)
+
+#line 503
+SIMD_IMPL_INTRIN_1(abs_u8, vu8, vu8)
+
+#line 503
+SIMD_IMPL_INTRIN_1(square_u8, vu8, vu8)
+
+#line 503
+SIMD_IMPL_INTRIN_1(rint_u8, vu8, vu8)
+
+#line 503
+SIMD_IMPL_INTRIN_1(ceil_u8, vu8, vu8)
+
+#line 503
+SIMD_IMPL_INTRIN_1(trunc_u8, vu8, vu8)
+
+#line 503
+SIMD_IMPL_INTRIN_1(floor_u8, vu8, vu8)
+
+#endif
+
+#line 510
+SIMD_IMPL_INTRIN_2(max_u8, vu8, vu8, vu8)
+SIMD_IMPL_INTRIN_1(reduce_max_u8, u8, vu8)
+
+#line 510
+SIMD_IMPL_INTRIN_2(min_u8, vu8, vu8, vu8)
+SIMD_IMPL_INTRIN_1(reduce_min_u8, u8, vu8)
+
+
+#if 0 // compiled out for u8
+#line 518
+SIMD_IMPL_INTRIN_2(maxp_u8, vu8, vu8, vu8)
+SIMD_IMPL_INTRIN_1(reduce_maxp_u8, u8, vu8)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minp_u8, vu8, vu8, vu8)
+SIMD_IMPL_INTRIN_1(reduce_minp_u8, u8, vu8)
+
+#line 518
+SIMD_IMPL_INTRIN_2(maxn_u8, vu8, vu8, vu8)
+SIMD_IMPL_INTRIN_1(reduce_maxn_u8, u8, vu8)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minn_u8, vu8, vu8, vu8)
+SIMD_IMPL_INTRIN_1(reduce_minn_u8, u8, vu8)
+
+/**end repeat1**/ // NOTE(review): looks like a leftover loop marker from the source template
+#endif
+
+/***************************
+ * Mask operations -- wrappers whose first argument type is the vb8 mask
+ ***************************/
+#line 530
+ SIMD_IMPL_INTRIN_4(ifadd_u8, vu8, vb8, vu8, vu8, vu8)
+
+#line 530
+ SIMD_IMPL_INTRIN_4(ifsub_u8, vu8, vb8, vu8, vu8, vu8)
+
+
+#if 0 // compiled out: no masked division wrappers for u8
+SIMD_IMPL_INTRIN_4(ifdiv_u8, vu8, vb8, vu8, vu8, vu8)
+SIMD_IMPL_INTRIN_3(ifdivz_u8, vu8, vb8, vu8, vu8)
+#endif
+
+#endif // simd_sup
+
+#line 36
+#if 1 // opens the s8 section; the matching #endif lies beyond this stanza
+/***************************
+ * Memory -- load wrappers: each takes a qs8 sequence and returns a vs8 (vs8x2 for load_s8x2)
+ ***************************/
+#line 43
+SIMD_IMPL_INTRIN_1(load_s8, vs8, qs8)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loada_s8, vs8, qs8)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loads_s8, vs8, qs8)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loadl_s8, vs8, qs8)
+
+SIMD_IMPL_INTRIN_1(load_s8x2, vs8x2, qs8)
+
+#line 51
+// Python binding for npyv_store_s8(): store_s8(seq, vec) -> None (NULL on failure).
+// Hand-written because the intrinsic writes into a C buffer that must be copied back to seq.
+static PyObject *
+simd__intrin_store_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs8};
+    simd_arg src = {.dtype = simd_data_vs8};
+    if (!PyArg_ParseTuple(args, "O&O&:store_s8",
+                          simd_arg_converter, &dst,
+                          simd_arg_converter, &src)) {
+        return NULL;
+    }
+    // the intrinsic writes through the converted buffer, not the Python object
+    npyv_store_s8(dst.data.qs8, src.data.vs8);
+    // write-back, then release the converted sequence whatever the outcome
+    const int failed = simd_sequence_fill_iterable(dst.obj, dst.data.qs8, simd_data_qs8) != 0;
+    simd_arg_free(&dst);
+    if (failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Python binding for npyv_storea_s8(): storea_s8(seq, vec) -> None (NULL on failure).
+// Hand-written because the intrinsic writes into a C buffer that must be copied back to seq.
+static PyObject *
+simd__intrin_storea_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs8};
+    simd_arg src = {.dtype = simd_data_vs8};
+    if (!PyArg_ParseTuple(args, "O&O&:storea_s8",
+                          simd_arg_converter, &dst,
+                          simd_arg_converter, &src)) {
+        return NULL;
+    }
+    // the intrinsic writes through the converted buffer, not the Python object
+    npyv_storea_s8(dst.data.qs8, src.data.vs8);
+    // write-back, then release the converted sequence whatever the outcome
+    const int failed = simd_sequence_fill_iterable(dst.obj, dst.data.qs8, simd_data_qs8) != 0;
+    simd_arg_free(&dst);
+    if (failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Python binding for npyv_stores_s8(): stores_s8(seq, vec) -> None (NULL on failure).
+// Hand-written because the intrinsic writes into a C buffer that must be copied back to seq.
+static PyObject *
+simd__intrin_stores_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs8};
+    simd_arg src = {.dtype = simd_data_vs8};
+    if (!PyArg_ParseTuple(args, "O&O&:stores_s8",
+                          simd_arg_converter, &dst,
+                          simd_arg_converter, &src)) {
+        return NULL;
+    }
+    // the intrinsic writes through the converted buffer, not the Python object
+    npyv_stores_s8(dst.data.qs8, src.data.vs8);
+    // write-back, then release the converted sequence whatever the outcome
+    const int failed = simd_sequence_fill_iterable(dst.obj, dst.data.qs8, simd_data_qs8) != 0;
+    simd_arg_free(&dst);
+    if (failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Python binding for npyv_storel_s8(): storel_s8(seq, vec) -> None (NULL on failure).
+// Hand-written because the intrinsic writes into a C buffer that must be copied back to seq.
+static PyObject *
+simd__intrin_storel_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs8};
+    simd_arg src = {.dtype = simd_data_vs8};
+    if (!PyArg_ParseTuple(args, "O&O&:storel_s8",
+                          simd_arg_converter, &dst,
+                          simd_arg_converter, &src)) {
+        return NULL;
+    }
+    // the intrinsic writes through the converted buffer, not the Python object
+    npyv_storel_s8(dst.data.qs8, src.data.vs8);
+    // write-back, then release the converted sequence whatever the outcome
+    const int failed = simd_sequence_fill_iterable(dst.obj, dst.data.qs8, simd_data_qs8) != 0;
+    simd_arg_free(&dst);
+    if (failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Python binding for npyv_storeh_s8(): storeh_s8(seq, vec) -> None (NULL on failure).
+// Hand-written because the intrinsic writes into a C buffer that must be copied back to seq.
+static PyObject *
+simd__intrin_storeh_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs8};
+    simd_arg src = {.dtype = simd_data_vs8};
+    if (!PyArg_ParseTuple(args, "O&O&:storeh_s8",
+                          simd_arg_converter, &dst,
+                          simd_arg_converter, &src)) {
+        return NULL;
+    }
+    // the intrinsic writes through the converted buffer, not the Python object
+    npyv_storeh_s8(dst.data.qs8, src.data.vs8);
+    // write-back, then release the converted sequence whatever the outcome
+    const int failed = simd_sequence_fill_iterable(dst.obj, dst.data.qs8, simd_data_qs8) != 0;
+    simd_arg_free(&dst);
+    if (failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Python binding for npyv_store_s8x2(): store_s8x2(seq, vec_pair) -> None (NULL on failure).
+// Hand-written because the intrinsic writes into a C buffer that must be copied back to seq.
+static PyObject *
+simd__intrin_store_s8x2(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs8};
+    simd_arg src = {.dtype = simd_data_vs8x2};
+    if (!PyArg_ParseTuple(args, "O&O&:store_s8x2",
+                          simd_arg_converter, &dst,
+                          simd_arg_converter, &src)) {
+        return NULL;
+    }
+    // the intrinsic writes through the converted buffer, not the Python object
+    npyv_store_s8x2(dst.data.qs8, src.data.vs8x2);
+    // write-back, then release the converted sequence whatever the outcome
+    const int failed = simd_sequence_fill_iterable(dst.obj, dst.data.qs8, simd_data_qs8) != 0;
+    simd_arg_free(&dst);
+    if (failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access -- the `#if 0` below compiles this section out for s8
+ ****************************************/
+#if 0
+// Partial Load
+SIMD_IMPL_INTRIN_3(load_till_s8, vs8, qs8, u32, s8)
+SIMD_IMPL_INTRIN_2(load_tillz_s8, vs8, qs8, u32)
+#if 8 == 32 // NOTE(review): both branches of this conditional are textually identical
+    SIMD_IMPL_INTRIN_4(load2_till_s8, vs8, qs8, u32, s8, s8)
+    SIMD_IMPL_INTRIN_2(load2_tillz_s8, vs8, qs8, u32)
+#else
+    SIMD_IMPL_INTRIN_4(load2_till_s8, vs8, qs8, u32, s8, s8)
+    SIMD_IMPL_INTRIN_2(load2_tillz_s8, vs8, qs8, u32)
+#endif
+
+// Partial Store
+#line 95
+#if !0 || 0 == 8
+// Python binding for npyv_store_till_s8(): store_till_s8(seq, nlane, vec) -> None.
+// The parsed values are forwarded to the intrinsic as (buffer, nlane, vector);
+// the buffer is then copied back into the Python sequence.
+// Returns NULL with an exception set when parsing or the write-back fails.
+static PyObject *
+simd__intrin_store_till_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs8};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vs8};
+    if (!PyArg_ParseTuple(args, "O&O&O&:store_till_s8",
+                          simd_arg_converter, &dst,
+                          simd_arg_converter, &count,
+                          simd_arg_converter, &src)) {
+        return NULL;
+    }
+    npyv_store_till_s8(dst.data.qs8, count.data.u32, src.data.vs8);
+    // write-back, then release the converted sequence whatever the outcome
+    const int failed = simd_sequence_fill_iterable(dst.obj, dst.data.qs8, simd_data_qs8) != 0;
+    simd_arg_free(&dst);
+    if (failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+#line 95
+#if !32 || 32 == 8
+// Python binding for npyv_store2_till_s8(): store2_till_s8(seq, nlane, vec) -> None.
+// The parsed values are forwarded to the intrinsic as (buffer, nlane, vector);
+// the buffer is then copied back into the Python sequence.
+// The guard above evaluates to false, so this variant is not compiled.
+static PyObject *
+simd__intrin_store2_till_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs8};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vs8};
+    if (!PyArg_ParseTuple(args, "O&O&O&:store2_till_s8",
+                          simd_arg_converter, &dst,
+                          simd_arg_converter, &count,
+                          simd_arg_converter, &src)) {
+        return NULL;
+    }
+    npyv_store2_till_s8(dst.data.qs8, count.data.u32, src.data.vs8);
+    // write-back, then release the converted sequence whatever the outcome
+    const int failed = simd_sequence_fill_iterable(dst.obj, dst.data.qs8, simd_data_qs8) != 0;
+    simd_arg_free(&dst);
+    if (failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+#line 95
+#if !64 || 64 == 8
+// Python binding for npyv_store2_till_s8(): store2_till_s8(seq, nlane, vec) -> None.
+// The parsed values are forwarded to the intrinsic as (buffer, nlane, vector);
+// the buffer is then copied back into the Python sequence.
+// The guard above evaluates to false, so this variant is not compiled.
+static PyObject *
+simd__intrin_store2_till_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs8};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vs8};
+    if (!PyArg_ParseTuple(args, "O&O&O&:store2_till_s8",
+                          simd_arg_converter, &dst,
+                          simd_arg_converter, &count,
+                          simd_arg_converter, &src)) {
+        return NULL;
+    }
+    npyv_store2_till_s8(dst.data.qs8, count.data.u32, src.data.vs8);
+    // write-back, then release the converted sequence whatever the outcome
+    const int failed = simd_sequence_fill_iterable(dst.obj, dst.data.qs8, simd_data_qs8) != 0;
+    simd_arg_free(&dst);
+    if (failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+// Non-contiguous Load
+#line 136
+#if !0 || 0 == 8 // always true (!0)
+// Python binding for npyv_loadn_s8(): loadn_s8(seq, stride) -> vector object (NULL with an exception set on failure).
+static PyObject *
+simd__intrin_loadn_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_s8};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_s8};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn_s8",
+        simd_arg_converter, &seq_arg, simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s8 *seq_ptr = seq_arg.data.qs8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s8; // NOTE(review): product is not checked for overflow
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1 * 1; // negative stride: start at the last element of the buffer
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // bounds guard: need at least |stride| * npyv_nlanes_s8 elements
+        PyErr_Format(PyExc_ValueError, // %zd: the arguments are Py_ssize_t, not int
+            "loadn_s8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s8 rvec = npyv_loadn_s8(
+        seq_ptr, stride
+    #if 0
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.s8
+    #endif
+    #if 0
+        , fill2_arg.data.s8
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs8, .data = {.vs8=rvec}
+    };
+    simd_arg_free(&seq_arg); // the vector is already held by value in ret
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 8 // evaluates to false: this variant is not compiled
+// Python binding for npyv_loadn2_s8(): loadn2_s8(seq, stride) -> vector object (NULL with an exception set on failure).
+static PyObject *
+simd__intrin_loadn2_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_s8};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_s8};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_s8",
+        simd_arg_converter, &seq_arg, simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s8 *seq_ptr = seq_arg.data.qs8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s8; // NOTE(review): product is not checked for overflow
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1 * 2; // negative stride: start two elements before the end of the buffer
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // bounds guard: need at least |stride| * npyv_nlanes_s8 elements
+        PyErr_Format(PyExc_ValueError, // %zd: the arguments are Py_ssize_t, not int
+            "loadn2_s8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s8 rvec = npyv_loadn2_s8(
+        seq_ptr, stride
+    #if 0
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.s8
+    #endif
+    #if 0
+        , fill2_arg.data.s8
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs8, .data = {.vs8=rvec}
+    };
+    simd_arg_free(&seq_arg); // the vector is already held by value in ret
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 8 // evaluates to false: this variant is not compiled
+// Python binding for npyv_loadn2_s8(): loadn2_s8(seq, stride) -> vector object (NULL with an exception set on failure).
+static PyObject *
+simd__intrin_loadn2_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_s8};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_s8};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_s8",
+        simd_arg_converter, &seq_arg, simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s8 *seq_ptr = seq_arg.data.qs8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s8; // NOTE(review): product is not checked for overflow
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1 * 2; // negative stride: start two elements before the end of the buffer
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // bounds guard: need at least |stride| * npyv_nlanes_s8 elements
+        PyErr_Format(PyExc_ValueError, // %zd: the arguments are Py_ssize_t, not int
+            "loadn2_s8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s8 rvec = npyv_loadn2_s8(
+        seq_ptr, stride
+    #if 0
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.s8
+    #endif
+    #if 0
+        , fill2_arg.data.s8
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs8, .data = {.vs8=rvec}
+    };
+    simd_arg_free(&seq_arg); // the vector is already held by value in ret
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 8 // always true (!0)
+// Python binding for npyv_loadn_till_s8(): loadn_till_s8(seq, stride, nlane, fill) -> vector object (NULL with an exception set on failure).
+static PyObject *
+simd__intrin_loadn_till_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 1
+    simd_arg fill_arg = {.dtype = simd_data_s8};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_s8};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:loadn_till_s8",
+        simd_arg_converter, &seq_arg, simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s8 *seq_ptr = seq_arg.data.qs8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s8; // NOTE(review): product is not checked for overflow
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1 * 1; // negative stride: start at the last element of the buffer
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // bounds guard: need at least |stride| * npyv_nlanes_s8 elements
+        PyErr_Format(PyExc_ValueError, // %zd: the arguments are Py_ssize_t, not int
+            "loadn_till_s8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s8 rvec = npyv_loadn_till_s8(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 1
+        , fill_arg.data.s8
+    #endif
+    #if 0
+        , fill2_arg.data.s8
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs8, .data = {.vs8=rvec}
+    };
+    simd_arg_free(&seq_arg); // the vector is already held by value in ret
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 8 // evaluates to false: this variant is not compiled
+// Python binding for npyv_loadn2_till_s8(): loadn2_till_s8(seq, stride, nlane, fill, fill2) -> vector object (NULL with an exception set on failure).
+static PyObject *
+simd__intrin_loadn2_till_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 1
+    simd_arg fill_arg = {.dtype = simd_data_s8};
+#endif
+#if 1
+    simd_arg fill2_arg = {.dtype = simd_data_s8};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_s8",
+        simd_arg_converter, &seq_arg, simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s8 *seq_ptr = seq_arg.data.qs8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s8; // NOTE(review): product is not checked for overflow
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1 * 2; // negative stride: start two elements before the end of the buffer
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // bounds guard: need at least |stride| * npyv_nlanes_s8 elements
+        PyErr_Format(PyExc_ValueError, // %zd: the arguments are Py_ssize_t, not int
+            "loadn2_till_s8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s8 rvec = npyv_loadn2_till_s8(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 1
+        , fill_arg.data.s8
+    #endif
+    #if 1
+        , fill2_arg.data.s8
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs8, .data = {.vs8=rvec}
+    };
+    simd_arg_free(&seq_arg); // the vector is already held by value in ret
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 8 // evaluates to false: this variant is not compiled
+// Python binding for npyv_loadn2_till_s8(): loadn2_till_s8(seq, stride, nlane, fill, fill2) -> vector object (NULL with an exception set on failure).
+static PyObject *
+simd__intrin_loadn2_till_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs8};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 1
+    simd_arg fill_arg = {.dtype = simd_data_s8};
+#endif
+#if 1
+    simd_arg fill2_arg = {.dtype = simd_data_s8};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_s8",
+        simd_arg_converter, &seq_arg, simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s8 *seq_ptr = seq_arg.data.qs8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s8; // NOTE(review): product is not checked for overflow
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1 * 2; // negative stride: start two elements before the end of the buffer
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // bounds guard: need at least |stride| * npyv_nlanes_s8 elements
+        PyErr_Format(PyExc_ValueError, // %zd: the arguments are Py_ssize_t, not int
+            "loadn2_till_s8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s8 rvec = npyv_loadn2_till_s8(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 1
+        , fill_arg.data.s8
+    #endif
+    #if 1
+        , fill2_arg.data.s8
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs8, .data = {.vs8=rvec}
+    };
+    simd_arg_free(&seq_arg); // the vector is already held by value in ret
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 8 // chksize guard: !0 is 1, so this variant passes the guard
+static PyObject *
+simd__intrin_loadn_tillz_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs8}; // source sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // element stride, may be negative
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_s8};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_s8};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn_tillz_s8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s8 *seq_ptr = seq_arg.data.qs8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s8;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1 * 1; // negative stride: begin at the last element
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,
+            "loadn_tillz_s8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // every argument is Py_ssize_t to match %zd
+        );
+        goto err;
+    }
+    npyv_s8 rvec = npyv_loadn_tillz_s8(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.s8
+    #endif
+    #if 0
+        , fill2_arg.data.s8
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs8, .data = {.vs8=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 8 // chksize guard: !32 and 32 == 8 are both 0, so this variant is compiled out for s8
+static PyObject *
+simd__intrin_loadn2_tillz_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs8}; // source sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // element stride, may be negative
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_s8};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_s8};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_s8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s8 *seq_ptr = seq_arg.data.qs8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s8;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1 * 2; // negative stride: begin at element cur_seq_len - 2
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_s8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // every argument is Py_ssize_t to match %zd
+        );
+        goto err;
+    }
+    npyv_s8 rvec = npyv_loadn2_tillz_s8(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.s8
+    #endif
+    #if 0
+        , fill2_arg.data.s8
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs8, .data = {.vs8=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 8 // chksize guard: !64 and 64 == 8 are both 0, so this variant is compiled out for s8
+static PyObject *
+simd__intrin_loadn2_tillz_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs8}; // source sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // element stride, may be negative
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_s8};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_s8};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_s8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s8 *seq_ptr = seq_arg.data.qs8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s8;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1 * 2; // negative stride: begin at element cur_seq_len - 2
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_s8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // every argument is Py_ssize_t to match %zd
+        );
+        goto err;
+    }
+    npyv_s8 rvec = npyv_loadn2_tillz_s8(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.s8
+    #endif
+    #if 0
+        , fill2_arg.data.s8
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs8, .data = {.vs8=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+
+// Non-contiguous Store
+#line 216
+#if !0 || 0 == 8 // chksize guard: !0 is 1, so this variant passes the guard
+static PyObject *
+simd__intrin_storen_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs8}; // destination sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // element stride, may be negative
+    simd_arg vec_arg = {.dtype = simd_data_vs8}; // vector to store
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen_s8",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s8 *seq_ptr = seq_arg.data.qs8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s8;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*1; // negative stride: begin at the last element
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: refuse to store when the sequence is too short for this stride
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen_s8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // every argument is Py_ssize_t to match %zd
+        );
+        goto err;
+    }
+    npyv_storen_s8(
+        seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vs8
+    );
+    // write-back: pass the lane buffer and the caller's object to simd_sequence_fill_iterable
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs8, simd_data_qs8)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 8 // chksize guard: !32 and 32 == 8 are both 0, so this variant is compiled out for s8
+static PyObject *
+simd__intrin_storen2_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs8}; // destination sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // element stride, may be negative
+    simd_arg vec_arg = {.dtype = simd_data_vs8}; // vector to store
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_s8", // text after ':' names this function in argument errors
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s8 *seq_ptr = seq_arg.data.qs8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s8;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*2; // negative stride: begin at element cur_seq_len - 2
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: refuse to store when the sequence is too short for this stride
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_s8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // every argument is Py_ssize_t to match %zd
+        );
+        goto err;
+    }
+    npyv_storen2_s8(
+        seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vs8
+    );
+    // write-back: pass the lane buffer and the caller's object to simd_sequence_fill_iterable
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs8, simd_data_qs8)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 8 // chksize guard: !64 and 64 == 8 are both 0, so this variant is compiled out for s8
+static PyObject *
+simd__intrin_storen2_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs8}; // destination sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // element stride, may be negative
+    simd_arg vec_arg = {.dtype = simd_data_vs8}; // vector to store
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_s8", // text after ':' names this function in argument errors
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s8 *seq_ptr = seq_arg.data.qs8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s8;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*2; // negative stride: begin at element cur_seq_len - 2
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: refuse to store when the sequence is too short for this stride
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_s8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // every argument is Py_ssize_t to match %zd
+        );
+        goto err;
+    }
+    npyv_storen2_s8(
+        seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vs8
+    );
+    // write-back: pass the lane buffer and the caller's object to simd_sequence_fill_iterable
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs8, simd_data_qs8)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !0 || 0 == 8 // chksize guard: !0 is 1, so this variant passes the guard
+static PyObject *
+simd__intrin_storen_till_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs8}; // destination sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // element stride, may be negative
+    simd_arg vec_arg = {.dtype = simd_data_vs8}; // vector to store
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen_till_s8", // text after ':' names this function in argument errors
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s8 *seq_ptr = seq_arg.data.qs8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s8;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*1; // negative stride: begin at the last element
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: refuse to store when the sequence is too short for this stride
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen_till_s8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // every argument is Py_ssize_t to match %zd
+        );
+        goto err;
+    }
+    npyv_storen_till_s8(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vs8
+    );
+    // write-back: pass the lane buffer and the caller's object to simd_sequence_fill_iterable
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs8, simd_data_qs8)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 8 // chksize guard: !32 and 32 == 8 are both 0, so this variant is compiled out for s8
+static PyObject *
+simd__intrin_storen2_till_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs8}; // destination sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // element stride, may be negative
+    simd_arg vec_arg = {.dtype = simd_data_vs8}; // vector to store
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_s8", // text after ':' names this function in argument errors
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s8 *seq_ptr = seq_arg.data.qs8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s8;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*2; // negative stride: begin at element cur_seq_len - 2
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: refuse to store when the sequence is too short for this stride
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_s8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // every argument is Py_ssize_t to match %zd
+        );
+        goto err;
+    }
+    npyv_storen2_till_s8(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vs8
+    );
+    // write-back: pass the lane buffer and the caller's object to simd_sequence_fill_iterable
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs8, simd_data_qs8)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 8 // chksize guard: !64 and 64 == 8 are both 0, so this variant is compiled out for s8
+static PyObject *
+simd__intrin_storen2_till_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs8}; // destination sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // element stride, may be negative
+    simd_arg vec_arg = {.dtype = simd_data_vs8}; // vector to store
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_s8", // text after ':' names this function in argument errors
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s8 *seq_ptr = seq_arg.data.qs8;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s8;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*2; // negative stride: begin at element cur_seq_len - 2
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: refuse to store when the sequence is too short for this stride
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_s8(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // every argument is Py_ssize_t to match %zd
+        );
+        goto err;
+    }
+    npyv_storen2_till_s8(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vs8
+    );
+    // write-back: pass the lane buffer and the caller's object to simd_sequence_fill_iterable
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs8, simd_data_qs8)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#endif // 0
+
+/****************************
+ * Lookup tables
+ ****************************/
+#if 8 == 32
+SIMD_IMPL_INTRIN_2(lut32_s8, vs8, qs8, vu8)
+#endif
+#if 8 == 64
+SIMD_IMPL_INTRIN_2(lut16_s8, vs8, qs8, vu8)
+#endif
+/***************************
+ * Misc
+ ***************************/
+SIMD_IMPL_INTRIN_0(zero_s8, vs8)
+SIMD_IMPL_INTRIN_1(extract0_s8, s8, vs8)
+SIMD_IMPL_INTRIN_1(setall_s8, vs8, s8)
+SIMD_IMPL_INTRIN_3(select_s8, vs8, vb8, vs8, vs8)
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u8_s8, vu8, vs8)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s8_s8, vs8, vs8)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u16_s8, vu16, vs8)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s16_s8, vs16, vs8)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u32_s8, vu32, vs8)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s32_s8, vs32, vs8)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u64_s8, vu64, vs8)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s64_s8, vs64, vs8)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F32
+SIMD_IMPL_INTRIN_1(reinterpret_f32_s8, vf32, vs8)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F64
+SIMD_IMPL_INTRIN_1(reinterpret_f64_s8, vf64, vs8)
+#endif // simd_sup2
+
+
+/**
+ * special definition due to the nature of intrinsics
+ * npyv_setf_s8 and npyv_set_s8.
+*/
+#line 308
+static PyObject * // binding for npyv_setf_s8; returns NULL when the argument conversion below fails
+simd__intrin_setf_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    npyv_lanetype_s8 *data = simd_sequence_from_iterable(args, simd_data_qs8, npyv_nlanes_s8); // lane values are taken from the argument tuple itself
+    if (data == NULL) {
+        return NULL; // conversion failed; nothing to release
+    }
+    simd_data r = {.vs8 = npyv_setf_s8(
+        data[0],  data[1],  data[2],  data[3],  data[4],  data[5],  data[6],  data[7],
+        data[8],  data[9],  data[10], data[11], data[12], data[13], data[14], data[15],
+        data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],
+        data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],
+        data[32], data[33], data[34], data[35], data[36], data[37], data[38], data[39],
+        data[40], data[41], data[42], data[43], data[44], data[45], data[46], data[47],
+        data[48], data[49], data[50], data[51], data[52], data[53], data[54], data[55],
+        data[56], data[57], data[58], data[59], data[60], data[61], data[62], data[63],
+        data[64] // for setf. NOTE(review): 65 elements are named although npyv_nlanes_s8 was passed above; presumably npyv_setf_s8 is a macro that drops surplus arguments -- confirm
+    )};
+    simd_sequence_free(data); // r already holds the vector, so the lane array can go
+    return (PyObject*)PySIMDVector_FromData(r, simd_data_vs8);
+}
+
+#line 308
+static PyObject * // binding for npyv_set_s8; returns NULL when the argument conversion below fails
+simd__intrin_set_s8(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    npyv_lanetype_s8 *data = simd_sequence_from_iterable(args, simd_data_qs8, npyv_nlanes_s8); // lane values are taken from the argument tuple itself
+    if (data == NULL) {
+        return NULL; // conversion failed; nothing to release
+    }
+    simd_data r = {.vs8 = npyv_set_s8(
+        data[0],  data[1],  data[2],  data[3],  data[4],  data[5],  data[6],  data[7],
+        data[8],  data[9],  data[10], data[11], data[12], data[13], data[14], data[15],
+        data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],
+        data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],
+        data[32], data[33], data[34], data[35], data[36], data[37], data[38], data[39],
+        data[40], data[41], data[42], data[43], data[44], data[45], data[46], data[47],
+        data[48], data[49], data[50], data[51], data[52], data[53], data[54], data[55],
+        data[56], data[57], data[58], data[59], data[60], data[61], data[62], data[63],
+        data[64] // extra element shared with the setf variant of this template. NOTE(review): presumably npyv_set_s8 is a macro that drops surplus arguments -- confirm
+    )};
+    simd_sequence_free(data); // r already holds the vector, so the lane array can go
+    return (PyObject*)PySIMDVector_FromData(r, simd_data_vs8);
+}
+
+
+/***************************
+ * Reorder
+ ***************************/
+#line 337
+SIMD_IMPL_INTRIN_2(combinel_s8, vs8, vs8, vs8)
+
+#line 337
+SIMD_IMPL_INTRIN_2(combineh_s8, vs8, vs8, vs8)
+
+
+#line 343
+SIMD_IMPL_INTRIN_2(combine_s8, vs8x2, vs8, vs8)
+
+#line 343
+SIMD_IMPL_INTRIN_2(zip_s8, vs8x2, vs8, vs8)
+
+#line 343
+SIMD_IMPL_INTRIN_2(unzip_s8, vs8x2, vs8, vs8)
+
+
+#if 1
+SIMD_IMPL_INTRIN_1(rev64_s8, vs8, vs8)
+#endif
+
+// special implementation to convert runtime constants to immediate values
+#if 8 == 32
+// one call for element index then gather them within one vector
+// instead of unroll the 255 possible cases.
+NPY_FINLINE npyv_s8 // runtime-index wrapper: only ever calls npyv_permi128_s8 with literal indices
+npyv_permi128_s8_(npyv_s8 a, unsigned e0, unsigned e1, unsigned e2, unsigned e3)
+{
+   #line 360
+    npyv_s8 ve0;
+    npyv_lanetype_s8 de0[npyv_nlanes_s8];
+    if (0) {} // empty head so every generated case below can be an else-if
+   #line 366
+    else if (e0 == 1) {
+        ve0 = npyv_permi128_s8(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e0 == 2) {
+        ve0 = npyv_permi128_s8(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e0 == 3) {
+        ve0 = npyv_permi128_s8(a, 3, 3, 3, 3);
+    }
+    
+    else { // 0 and any value above 3 select index 0
+        ve0 = npyv_permi128_s8(a, 0, 0, 0, 0);
+    }
+    npyv_store_s8(de0, ve0); // keep the broadcast for e0 in a lane buffer
+    
+#line 360
+    npyv_s8 ve1;
+    npyv_lanetype_s8 de1[npyv_nlanes_s8];
+    if (0) {}
+   #line 366
+    else if (e1 == 1) {
+        ve1 = npyv_permi128_s8(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e1 == 2) {
+        ve1 = npyv_permi128_s8(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e1 == 3) {
+        ve1 = npyv_permi128_s8(a, 3, 3, 3, 3);
+    }
+    
+    else { // 0 and any value above 3 select index 0
+        ve1 = npyv_permi128_s8(a, 0, 0, 0, 0);
+    }
+    npyv_store_s8(de1, ve1); // keep the broadcast for e1 in a lane buffer
+    
+#line 360
+    npyv_s8 ve2;
+    npyv_lanetype_s8 de2[npyv_nlanes_s8];
+    if (0) {}
+   #line 366
+    else if (e2 == 1) {
+        ve2 = npyv_permi128_s8(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e2 == 2) {
+        ve2 = npyv_permi128_s8(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e2 == 3) {
+        ve2 = npyv_permi128_s8(a, 3, 3, 3, 3);
+    }
+    
+    else { // 0 and any value above 3 select index 0
+        ve2 = npyv_permi128_s8(a, 0, 0, 0, 0);
+    }
+    npyv_store_s8(de2, ve2); // keep the broadcast for e2 in a lane buffer
+    
+#line 360
+    npyv_s8 ve3;
+    npyv_lanetype_s8 de3[npyv_nlanes_s8];
+    if (0) {}
+   #line 366
+    else if (e3 == 1) {
+        ve3 = npyv_permi128_s8(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e3 == 2) {
+        ve3 = npyv_permi128_s8(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e3 == 3) {
+        ve3 = npyv_permi128_s8(a, 3, 3, 3, 3);
+    }
+    
+    else { // 0 and any value above 3 select index 0
+        ve3 = npyv_permi128_s8(a, 0, 0, 0, 0);
+    }
+    npyv_store_s8(de3, ve3); // keep the broadcast for e3 in a lane buffer
+    
+    if (e0 == e1 && e0 == e2 && e0 == e3) { // all four indices equal: the first broadcast is returned as-is
+        return ve0;
+    }
+    for (int i = 0; i < npyv_nlanes_s8; i += 4) { // per group of four lanes: lane 0 stays from de0, lanes 1..3 come from de1..de3
+        de0[i+1] = de1[i+1];
+        de0[i+2] = de2[i+2];
+        de0[i+3] = de3[i+3];
+    }
+    return npyv_load_s8(de0); // reload the merged lane buffer as a vector
+}
+SIMD_IMPL_INTRIN_5(permi128_s8_, vs8, vs8, u8, u8, u8, u8)
+#elif 8 == 64
+NPY_FINLINE npyv_s8 // runtime-index wrapper: only ever calls npyv_permi128_s8 with literal indices
+npyv_permi128_s8_(npyv_s8 a, unsigned e0, unsigned e1)
+{
+    if (e0 == 1 && e1 == 0) {
+        return npyv_permi128_s8(a, 1, 0);
+    }
+    else if (e0 == 0 && e1 == 1) {
+        return npyv_permi128_s8(a, 0, 1);
+    }
+    else if (e0 == 1 && e1 == 1) {
+        return npyv_permi128_s8(a, 1, 1);
+    }
+    return npyv_permi128_s8(a, 0, 0); // (0, 0) and every combination not listed above end up here
+}
+SIMD_IMPL_INTRIN_3(permi128_s8_, vs8, vs8, u8, u8)
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if 0 > 0
+SIMD_IMPL_INTRIN_2(shl_s8, vs8, vs8, u8)
+SIMD_IMPL_INTRIN_2(shr_s8, vs8, vs8, u8)
+// immediate constant
+SIMD_IMPL_INTRIN_2IMM(shli_s8, vs8, vs8, 0)
+SIMD_IMPL_INTRIN_2IMM(shri_s8, vs8, vs8, 0)
+#endif // shl_imm
+
+#line 418
+SIMD_IMPL_INTRIN_2(and_s8, vs8, vs8, vs8)
+
+#line 418
+SIMD_IMPL_INTRIN_2(or_s8, vs8, vs8, vs8)
+
+#line 418
+SIMD_IMPL_INTRIN_2(xor_s8, vs8, vs8, vs8)
+
+
+SIMD_IMPL_INTRIN_1(not_s8, vs8, vs8)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpeq_s8, vb8, vs8, vs8)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpneq_s8, vb8, vs8, vs8)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpgt_s8, vb8, vs8, vs8)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpge_s8, vb8, vs8, vs8)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmplt_s8, vb8, vs8, vs8)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmple_s8, vb8, vs8, vs8)
+
+
+#if 0
+SIMD_IMPL_INTRIN_2(andc_s8, vs8, vs8, vs8)
+SIMD_IMPL_INTRIN_2(andc_b8, vb8, vb8, vb8)
+SIMD_IMPL_INTRIN_2(orc_b8, vb8, vb8, vb8)
+SIMD_IMPL_INTRIN_2(xnor_b8, vb8, vb8, vb8)
+#endif
+
+// test cross all vector lanes
+#line 440
+SIMD_IMPL_INTRIN_1(any_s8, u8, vs8)
+
+#line 440
+SIMD_IMPL_INTRIN_1(all_s8, u8, vs8)
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_IMPL_INTRIN_1(cvt_s8_b8, vs8,  vb8)
+SIMD_IMPL_INTRIN_1(cvt_b8_s8, vb8, vs8)
+#if 0
+SIMD_IMPL_INTRIN_1(expand_s8_s8, vs8x2, vs8)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+#line 456
+SIMD_IMPL_INTRIN_2(add_s8, vs8, vs8, vs8)
+
+#line 456
+SIMD_IMPL_INTRIN_2(sub_s8, vs8, vs8, vs8)
+
+
+#if 1
+#line 463
+SIMD_IMPL_INTRIN_2(adds_s8, vs8, vs8, vs8)
+
+#line 463
+SIMD_IMPL_INTRIN_2(subs_s8, vs8, vs8, vs8)
+
+#endif // sat_sup
+
+#if 1
+SIMD_IMPL_INTRIN_2(mul_s8, vs8, vs8, vs8)
+#endif // mul_sup
+
+#if 0
+SIMD_IMPL_INTRIN_2(div_s8, vs8, vs8, vs8)
+#endif // div_sup
+
+#if 1
+SIMD_IMPL_INTRIN_1(divisor_s8, vs8x3, s8)
+SIMD_IMPL_INTRIN_2(divc_s8, vs8, vs8, vs8x3)
+#endif // intdiv_sup
+
+#if 0
+#line 484
+SIMD_IMPL_INTRIN_3(muladd_s8, vs8, vs8, vs8, vs8)
+
+#line 484
+SIMD_IMPL_INTRIN_3(mulsub_s8, vs8, vs8, vs8, vs8)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmuladd_s8, vs8, vs8, vs8, vs8)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmulsub_s8, vs8, vs8, vs8, vs8)
+
+#line 484
+SIMD_IMPL_INTRIN_3(muladdsub_s8, vs8, vs8, vs8, vs8)
+
+#endif // fused_sup
+
+#if 0
+SIMD_IMPL_INTRIN_1(sum_s8, s8, vs8)
+#endif // sum_sup
+
+#if 0
+SIMD_IMPL_INTRIN_1(sumup_s8, s8, vs8)
+#endif // sumup_sup
+
+/***************************
+ * Math
+ ***************************/
+#if 0
+#line 503
+SIMD_IMPL_INTRIN_1(sqrt_s8, vs8, vs8)
+
+#line 503
+SIMD_IMPL_INTRIN_1(recip_s8, vs8, vs8)
+
+#line 503
+SIMD_IMPL_INTRIN_1(abs_s8, vs8, vs8)
+
+#line 503
+SIMD_IMPL_INTRIN_1(square_s8, vs8, vs8)
+
+#line 503
+SIMD_IMPL_INTRIN_1(rint_s8, vs8, vs8)
+
+#line 503
+SIMD_IMPL_INTRIN_1(ceil_s8, vs8, vs8)
+
+#line 503
+SIMD_IMPL_INTRIN_1(trunc_s8, vs8, vs8)
+
+#line 503
+SIMD_IMPL_INTRIN_1(floor_s8, vs8, vs8)
+
+#endif
+
+#line 510
+SIMD_IMPL_INTRIN_2(max_s8, vs8, vs8, vs8)
+SIMD_IMPL_INTRIN_1(reduce_max_s8, s8, vs8)
+
+#line 510
+SIMD_IMPL_INTRIN_2(min_s8, vs8, vs8, vs8)
+SIMD_IMPL_INTRIN_1(reduce_min_s8, s8, vs8)
+
+
+#if 0
+#line 518
+SIMD_IMPL_INTRIN_2(maxp_s8, vs8, vs8, vs8)
+SIMD_IMPL_INTRIN_1(reduce_maxp_s8, s8, vs8)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minp_s8, vs8, vs8, vs8)
+SIMD_IMPL_INTRIN_1(reduce_minp_s8, s8, vs8)
+
+#line 518
+SIMD_IMPL_INTRIN_2(maxn_s8, vs8, vs8, vs8)
+SIMD_IMPL_INTRIN_1(reduce_maxn_s8, s8, vs8)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minn_s8, vs8, vs8, vs8)
+SIMD_IMPL_INTRIN_1(reduce_minn_s8, s8, vs8)
+
+/**end repeat1**/
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+#line 530
+ SIMD_IMPL_INTRIN_4(ifadd_s8, vs8, vb8, vs8, vs8, vs8)
+
+#line 530
+ SIMD_IMPL_INTRIN_4(ifsub_s8, vs8, vb8, vs8, vs8, vs8)
+
+
+#if 0
+SIMD_IMPL_INTRIN_4(ifdiv_s8, vs8, vb8, vs8, vs8, vs8)
+SIMD_IMPL_INTRIN_3(ifdivz_s8, vs8, vb8, vs8, vs8)
+#endif
+
+#endif // simd_sup
+
+#line 36
+#if 1
+/***************************
+ * Memory
+ ***************************/
+#line 43
+SIMD_IMPL_INTRIN_1(load_u16, vu16, qu16)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loada_u16, vu16, qu16)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loads_u16, vu16, qu16)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loadl_u16, vu16, qu16)
+
+SIMD_IMPL_INTRIN_1(load_u16x2, vu16x2, qu16)
+
+#line 51
+// Binding for npyv_store_u16: stores a vector into a sequence, then writes the sequence back.
+static PyObject *
+simd__intrin_store_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu16};
+    simd_arg src = {.dtype = simd_data_vu16};
+    const int parsed = PyArg_ParseTuple(args, "O&O&:store_u16",
+        simd_arg_converter, &dst, simd_arg_converter, &src);
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store_u16(dst.data.qu16, src.data.vu16);
+    // Write the lane buffer back through the object behind dst;
+    // a nonzero result is treated as failure.
+    const int write_back_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qu16, simd_data_qu16) != 0;
+    simd_arg_free(&dst); // released on both the success and the failure path
+    if (write_back_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Binding for npyv_storea_u16: stores a vector into a sequence, then writes the sequence back.
+static PyObject *
+simd__intrin_storea_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu16};
+    simd_arg src = {.dtype = simd_data_vu16};
+    const int parsed = PyArg_ParseTuple(args, "O&O&:storea_u16",
+        simd_arg_converter, &dst, simd_arg_converter, &src);
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storea_u16(dst.data.qu16, src.data.vu16);
+    // Write the lane buffer back through the object behind dst;
+    // a nonzero result is treated as failure.
+    const int write_back_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qu16, simd_data_qu16) != 0;
+    simd_arg_free(&dst); // released on both the success and the failure path
+    if (write_back_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Binding for npyv_stores_u16: stores a vector into a sequence, then writes the sequence back.
+static PyObject *
+simd__intrin_stores_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu16};
+    simd_arg src = {.dtype = simd_data_vu16};
+    const int parsed = PyArg_ParseTuple(args, "O&O&:stores_u16",
+        simd_arg_converter, &dst, simd_arg_converter, &src);
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_stores_u16(dst.data.qu16, src.data.vu16);
+    // Write the lane buffer back through the object behind dst;
+    // a nonzero result is treated as failure.
+    const int write_back_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qu16, simd_data_qu16) != 0;
+    simd_arg_free(&dst); // released on both the success and the failure path
+    if (write_back_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Binding for npyv_storel_u16: stores a vector into a sequence, then writes the sequence back.
+static PyObject *
+simd__intrin_storel_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu16};
+    simd_arg src = {.dtype = simd_data_vu16};
+    const int parsed = PyArg_ParseTuple(args, "O&O&:storel_u16",
+        simd_arg_converter, &dst, simd_arg_converter, &src);
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storel_u16(dst.data.qu16, src.data.vu16);
+    // Write the lane buffer back through the object behind dst;
+    // a nonzero result is treated as failure.
+    const int write_back_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qu16, simd_data_qu16) != 0;
+    simd_arg_free(&dst); // released on both the success and the failure path
+    if (write_back_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Binding for npyv_storeh_u16: stores a vector into a sequence, then writes the sequence back.
+static PyObject *
+simd__intrin_storeh_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu16};
+    simd_arg src = {.dtype = simd_data_vu16};
+    const int parsed = PyArg_ParseTuple(args, "O&O&:storeh_u16",
+        simd_arg_converter, &dst, simd_arg_converter, &src);
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storeh_u16(dst.data.qu16, src.data.vu16);
+    // Write the lane buffer back through the object behind dst;
+    // a nonzero result is treated as failure.
+    const int write_back_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qu16, simd_data_qu16) != 0;
+    simd_arg_free(&dst); // released on both the success and the failure path
+    if (write_back_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Special-cased wrapper for npyv_store_u16x2: run the store, then write the buffer back to the caller.
+static PyObject *
+simd__intrin_store_u16x2(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu16};   // destination lane sequence
+    simd_arg src = {.dtype = simd_data_vu16x2}; // x2 vector value handed to the store
+    if (PyArg_ParseTuple(
+        args, "O&O&:store_u16x2",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    ) == 0) {
+        return NULL;
+    }
+    npyv_store_u16x2(dst.data.qu16, src.data.vu16x2);
+    // write-back into the original Python object; dst is released on both outcomes
+    int write_failed = simd_sequence_fill_iterable(dst.obj, dst.data.qu16, simd_data_qu16) != 0;
+    simd_arg_free(&dst);
+    if (write_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 0
+// Partial Load (inside the enclosing `#if 0`, so none of these wrappers is compiled for u16)
+SIMD_IMPL_INTRIN_3(load_till_u16, vu16, qu16, u32, u16)
+SIMD_IMPL_INTRIN_2(load_tillz_u16, vu16, qu16, u32)
+#if 16 == 32 // the literal 16 never equals 32, so the #else branch is the one selected
+    SIMD_IMPL_INTRIN_4(load2_till_u16, vu16, qu16, u32, u16, u16)
+    SIMD_IMPL_INTRIN_2(load2_tillz_u16, vu16, qu16, u32)
+#else // NOTE(review): both branches contain the same two invocations
+    SIMD_IMPL_INTRIN_4(load2_till_u16, vu16, qu16, u32, u16, u16)
+    SIMD_IMPL_INTRIN_2(load2_tillz_u16, vu16, qu16, u32)
+#endif
+
+// Partial Store
+#line 95
+#if !0 || 0 == 16
+static PyObject * // wrapper for npyv_store_till_u16(seq, nlane, vec); returns None after the write-back
+simd__intrin_store_till_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu16};  // destination lane sequence
+    simd_arg count = {.dtype = simd_data_u32}; // lane count forwarded to the intrinsic
+    simd_arg src = {.dtype = simd_data_vu16};  // vector handed to the store
+    if (PyArg_ParseTuple(
+        args, "O&O&O&:store_till_u16",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    ) == 0) {
+        return NULL;
+    }
+    npyv_store_till_u16(
+        dst.data.qu16, count.data.u32, src.data.vu16
+    );
+    // write-back into the original Python object; dst is released on both outcomes
+    int write_failed = simd_sequence_fill_iterable(dst.obj, dst.data.qu16, simd_data_qu16) != 0;
+    simd_arg_free(&dst);
+    if (write_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+#line 95
+#if !32 || 32 == 16
+static PyObject * // wrapper for npyv_store2_till_u16(seq, nlane, vec); returns None after the write-back
+simd__intrin_store2_till_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu16};  // destination lane sequence
+    simd_arg count = {.dtype = simd_data_u32}; // lane count forwarded to the intrinsic
+    simd_arg src = {.dtype = simd_data_vu16};  // vector handed to the store
+    if (PyArg_ParseTuple(
+        args, "O&O&O&:store2_till_u16",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    ) == 0) {
+        return NULL;
+    }
+    npyv_store2_till_u16(
+        dst.data.qu16, count.data.u32, src.data.vu16
+    );
+    // write-back into the original Python object; dst is released on both outcomes
+    int write_failed = simd_sequence_fill_iterable(dst.obj, dst.data.qu16, simd_data_qu16) != 0;
+    simd_arg_free(&dst);
+    if (write_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+#line 95
+#if !64 || 64 == 16
+static PyObject * // wrapper for npyv_store2_till_u16(seq, nlane, vec); returns None after the write-back
+simd__intrin_store2_till_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu16};  // destination lane sequence
+    simd_arg count = {.dtype = simd_data_u32}; // lane count forwarded to the intrinsic
+    simd_arg src = {.dtype = simd_data_vu16};  // vector handed to the store
+    if (PyArg_ParseTuple(
+        args, "O&O&O&:store2_till_u16",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    ) == 0) {
+        return NULL;
+    }
+    npyv_store2_till_u16(
+        dst.data.qu16, count.data.u32, src.data.vu16
+    );
+    // write-back into the original Python object; dst is released on both outcomes
+    int write_failed = simd_sequence_fill_iterable(dst.obj, dst.data.qu16, simd_data_qu16) != 0;
+    simd_arg_free(&dst);
+    if (write_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+// Non-contiguous Load
+#line 136
+#if !0 || 0 == 16
+static PyObject * // wrapper for npyv_loadn_u16(seq, stride): length-checks the sequence, returns the loaded vector
+simd__intrin_loadn_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu16}; // source lane sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // stride forwarded to the intrinsic; may be negative
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_u16};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_u16};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn_u16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u16 *seq_ptr = seq_arg.data.qu16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u16;
+    if (stride < 0) { // negative stride: start near the tail and compare against a positive length
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // reject sequences shorter than stride * nlanes
+        PyErr_Format(PyExc_ValueError,
+            "loadn_u16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_u16 rvec = npyv_loadn_u16(
+        seq_ptr, stride
+    #if 0
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.u16
+    #endif
+    #if 0
+        , fill2_arg.data.u16
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vu16, .data = {.vu16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret); // success: the loaded vector converted by simd_arg_to_obj
+err:
+    simd_arg_free(&seq_arg); // error path: release the sequence, exception already set
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 16
+static PyObject * // wrapper for npyv_loadn2_u16(seq, stride): length-checks the sequence, returns the loaded vector
+simd__intrin_loadn2_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu16}; // source lane sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // stride forwarded to the intrinsic; may be negative
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_u16};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_u16};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_u16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u16 *seq_ptr = seq_arg.data.qu16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u16;
+    if (stride < 0) { // negative stride: start two elements before the end, compare against a positive length
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // reject sequences shorter than stride * nlanes
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_u16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_u16 rvec = npyv_loadn2_u16(
+        seq_ptr, stride
+    #if 0
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.u16
+    #endif
+    #if 0
+        , fill2_arg.data.u16
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vu16, .data = {.vu16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret); // success: the loaded vector converted by simd_arg_to_obj
+err:
+    simd_arg_free(&seq_arg); // error path: release the sequence, exception already set
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 16
+static PyObject * // wrapper for npyv_loadn2_u16(seq, stride): length-checks the sequence, returns the loaded vector
+simd__intrin_loadn2_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu16}; // source lane sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // stride forwarded to the intrinsic; may be negative
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_u16};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_u16};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_u16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u16 *seq_ptr = seq_arg.data.qu16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u16;
+    if (stride < 0) { // negative stride: start two elements before the end, compare against a positive length
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // reject sequences shorter than stride * nlanes
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_u16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_u16 rvec = npyv_loadn2_u16(
+        seq_ptr, stride
+    #if 0
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.u16
+    #endif
+    #if 0
+        , fill2_arg.data.u16
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vu16, .data = {.vu16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret); // success: the loaded vector converted by simd_arg_to_obj
+err:
+    simd_arg_free(&seq_arg); // error path: release the sequence, exception already set
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 16
+static PyObject * // wrapper for npyv_loadn_till_u16(seq, stride, nlane, fill): length-checks, returns the loaded vector
+simd__intrin_loadn_till_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu16}; // source lane sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // stride forwarded to the intrinsic; may be negative
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32}; // lane count forwarded to the intrinsic
+#endif // till
+#if 1
+    simd_arg fill_arg = {.dtype = simd_data_u16}; // fill value forwarded to the intrinsic
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_u16};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:loadn_till_u16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u16 *seq_ptr = seq_arg.data.qu16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u16;
+    if (stride < 0) { // negative stride: start near the tail and compare against a positive length
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // reject sequences shorter than stride * nlanes
+        PyErr_Format(PyExc_ValueError,
+            "loadn_till_u16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_u16 rvec = npyv_loadn_till_u16(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 1
+        , fill_arg.data.u16
+    #endif
+    #if 0
+        , fill2_arg.data.u16
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vu16, .data = {.vu16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret); // success: the loaded vector converted by simd_arg_to_obj
+err:
+    simd_arg_free(&seq_arg); // error path: release the sequence, exception already set
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 16
+static PyObject * // wrapper for npyv_loadn2_till_u16(seq, stride, nlane, fill, fill2): length-checks, returns the vector
+simd__intrin_loadn2_till_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu16}; // source lane sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // stride forwarded to the intrinsic; may be negative
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32}; // lane count forwarded to the intrinsic
+#endif // till
+#if 1
+    simd_arg fill_arg = {.dtype = simd_data_u16}; // first fill value forwarded to the intrinsic
+#endif
+#if 1
+    simd_arg fill2_arg = {.dtype = simd_data_u16}; // second fill value forwarded to the intrinsic
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_u16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u16 *seq_ptr = seq_arg.data.qu16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u16;
+    if (stride < 0) { // negative stride: start two elements before the end, compare against a positive length
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // reject sequences shorter than stride * nlanes
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_till_u16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_u16 rvec = npyv_loadn2_till_u16(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 1
+        , fill_arg.data.u16
+    #endif
+    #if 1
+        , fill2_arg.data.u16
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vu16, .data = {.vu16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret); // success: the loaded vector converted by simd_arg_to_obj
+err:
+    simd_arg_free(&seq_arg); // error path: release the sequence, exception already set
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 16
+static PyObject * // wrapper for npyv_loadn2_till_u16(seq, stride, nlane, fill, fill2): length-checks, returns the vector
+simd__intrin_loadn2_till_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu16}; // source lane sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // stride forwarded to the intrinsic; may be negative
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32}; // lane count forwarded to the intrinsic
+#endif // till
+#if 1
+    simd_arg fill_arg = {.dtype = simd_data_u16}; // first fill value forwarded to the intrinsic
+#endif
+#if 1
+    simd_arg fill2_arg = {.dtype = simd_data_u16}; // second fill value forwarded to the intrinsic
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_u16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u16 *seq_ptr = seq_arg.data.qu16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u16;
+    if (stride < 0) { // negative stride: start two elements before the end, compare against a positive length
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // reject sequences shorter than stride * nlanes
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_till_u16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_u16 rvec = npyv_loadn2_till_u16(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 1
+        , fill_arg.data.u16
+    #endif
+    #if 1
+        , fill2_arg.data.u16
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vu16, .data = {.vu16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret); // success: the loaded vector converted by simd_arg_to_obj
+err:
+    simd_arg_free(&seq_arg); // error path: release the sequence, exception already set
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 16
+static PyObject * // wrapper for npyv_loadn_tillz_u16(seq, stride, nlane): length-checks, returns the loaded vector
+simd__intrin_loadn_tillz_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu16}; // source lane sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // stride forwarded to the intrinsic; may be negative
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32}; // lane count forwarded to the intrinsic
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_u16};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_u16};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn_tillz_u16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u16 *seq_ptr = seq_arg.data.qu16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u16;
+    if (stride < 0) { // negative stride: start near the tail and compare against a positive length
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // reject sequences shorter than stride * nlanes
+        PyErr_Format(PyExc_ValueError,
+            "loadn_tillz_u16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_u16 rvec = npyv_loadn_tillz_u16(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.u16
+    #endif
+    #if 0
+        , fill2_arg.data.u16
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vu16, .data = {.vu16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret); // success: the loaded vector converted by simd_arg_to_obj
+err:
+    simd_arg_free(&seq_arg); // error path: release the sequence, exception already set
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 16
+static PyObject * // wrapper for npyv_loadn2_tillz_u16(seq, stride, nlane): length-checks, returns the loaded vector
+simd__intrin_loadn2_tillz_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu16}; // source lane sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // stride forwarded to the intrinsic; may be negative
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32}; // lane count forwarded to the intrinsic
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_u16};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_u16};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_u16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u16 *seq_ptr = seq_arg.data.qu16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u16;
+    if (stride < 0) { // negative stride: start two elements before the end, compare against a positive length
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // reject sequences shorter than stride * nlanes
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_u16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_u16 rvec = npyv_loadn2_tillz_u16(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.u16
+    #endif
+    #if 0
+        , fill2_arg.data.u16
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vu16, .data = {.vu16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret); // success: the loaded vector converted by simd_arg_to_obj
+err:
+    simd_arg_free(&seq_arg); // error path: release the sequence, exception already set
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 16
+static PyObject * // wrapper for npyv_loadn2_tillz_u16(seq, stride, nlane): length-checks, returns the loaded vector
+simd__intrin_loadn2_tillz_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu16}; // source lane sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // stride forwarded to the intrinsic; may be negative
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32}; // lane count forwarded to the intrinsic
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_u16};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_u16};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_u16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u16 *seq_ptr = seq_arg.data.qu16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u16;
+    if (stride < 0) { // negative stride: start two elements before the end, compare against a positive length
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) { // reject sequences shorter than stride * nlanes
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_u16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_u16 rvec = npyv_loadn2_tillz_u16(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.u16
+    #endif
+    #if 0
+        , fill2_arg.data.u16
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vu16, .data = {.vu16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret); // success: the loaded vector converted by simd_arg_to_obj
+err:
+    simd_arg_free(&seq_arg); // error path: release the sequence, exception already set
+    return NULL;
+}
+#endif // chksize
+
+
+// Non-contiguous Store
+#line 216
+#if !0 || 0 == 16
+static PyObject * // wrapper for npyv_storen_u16(seq, stride, vec): length-checks, stores, writes back, returns None
+simd__intrin_storen_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu16}; // destination lane sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // stride forwarded to the intrinsic; may be negative
+    simd_arg vec_arg = {.dtype = simd_data_vu16}; // vector handed to the store
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen_u16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u16 *seq_ptr = seq_arg.data.qu16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u16;
+    if (stride < 0) { // negative stride: start near the tail and compare against a positive length
+        seq_ptr += cur_seq_len - 1*1;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: reject sequences shorter than stride * nlanes before storing
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen_u16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_storen_u16(
+        seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vu16
+    );
+    // write-back from the buffer base (not the shifted seq_ptr) into the Python object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu16, simd_data_qu16)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg); // error path: release the sequence, exception already set
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 16
+static PyObject * // wrapper for npyv_storen2_u16(seq, stride, vec): length-checks, stores, writes back, returns None
+simd__intrin_storen2_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu16}; // destination lane sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // stride forwarded to the intrinsic; may be negative
+    simd_arg vec_arg = {.dtype = simd_data_vu16}; // vector handed to the store
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_u16", // name after ':' is used in argument errors; it was "storen_u16"
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u16 *seq_ptr = seq_arg.data.qu16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u16;
+    if (stride < 0) { // negative stride: start two elements before the end, compare against a positive length
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: reject sequences shorter than stride * nlanes before storing
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_u16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_storen2_u16(
+        seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vu16
+    );
+    // write-back from the buffer base (not the shifted seq_ptr) into the Python object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu16, simd_data_qu16)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg); // error path: release the sequence, exception already set
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 16
+static PyObject * // wrapper for npyv_storen2_u16(seq, stride, vec): length-checks, stores, writes back, returns None
+simd__intrin_storen2_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu16}; // destination lane sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // stride forwarded to the intrinsic; may be negative
+    simd_arg vec_arg = {.dtype = simd_data_vu16}; // vector handed to the store
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_u16", // name after ':' is used in argument errors; it was "storen_u16"
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u16 *seq_ptr = seq_arg.data.qu16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u16;
+    if (stride < 0) { // negative stride: start two elements before the end, compare against a positive length
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: reject sequences shorter than stride * nlanes before storing
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_u16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_storen2_u16(
+        seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vu16
+    );
+    // write-back from the buffer base (not the shifted seq_ptr) into the Python object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu16, simd_data_qu16)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg); // error path: release the sequence, exception already set
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !0 || 0 == 16
+static PyObject * // wrapper for npyv_storen_till_u16(seq, stride, nlane, vec): checks, stores, writes back, returns None
+simd__intrin_storen_till_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu16}; // destination lane sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // stride forwarded to the intrinsic; may be negative
+    simd_arg vec_arg = {.dtype = simd_data_vu16}; // vector handed to the store
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32}; // lane count forwarded to the intrinsic
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen_till_u16", // name after ':' is used in argument errors; it was "storen_u16"
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u16 *seq_ptr = seq_arg.data.qu16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u16;
+    if (stride < 0) { // negative stride: start near the tail and compare against a positive length
+        seq_ptr += cur_seq_len - 1*1;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: reject sequences shorter than stride * nlanes before storing
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen_till_u16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_storen_till_u16(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vu16
+    );
+    // write-back from the buffer base (not the shifted seq_ptr) into the Python object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu16, simd_data_qu16)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg); // error path: release the sequence, exception already set
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 16
+static PyObject * // wrapper for npyv_storen2_till_u16(seq, stride, nlane, vec): checks, stores, writes back, returns None
+simd__intrin_storen2_till_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu16}; // destination lane sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // stride forwarded to the intrinsic; may be negative
+    simd_arg vec_arg = {.dtype = simd_data_vu16}; // vector handed to the store
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32}; // lane count forwarded to the intrinsic
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_u16", // name after ':' is used in argument errors; it was "storen_u16"
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u16 *seq_ptr = seq_arg.data.qu16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u16;
+    if (stride < 0) { // negative stride: start two elements before the end, compare against a positive length
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: reject sequences shorter than stride * nlanes before storing
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_u16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_storen2_till_u16(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vu16
+    );
+    // write-back from the buffer base (not the shifted seq_ptr) into the Python object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu16, simd_data_qu16)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg); // error path: release the sequence, exception already set
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 16
+static PyObject * // wrapper for npyv_storen2_till_u16(seq, stride, nlane, vec): checks, stores, writes back, returns None
+simd__intrin_storen2_till_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu16}; // destination lane sequence
+    simd_arg stride_arg = {.dtype = simd_data_s64}; // stride forwarded to the intrinsic; may be negative
+    simd_arg vec_arg = {.dtype = simd_data_vu16}; // vector handed to the store
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32}; // lane count forwarded to the intrinsic
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_u16", // name after ':' is used in argument errors; it was "storen_u16"
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u16 *seq_ptr = seq_arg.data.qu16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u16;
+    if (stride < 0) { // negative stride: start two elements before the end, compare against a positive length
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: reject sequences shorter than stride * nlanes before storing
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_u16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd consumes Py_ssize_t; %d would read an int
+        );
+        goto err;
+    }
+    npyv_storen2_till_u16(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vu16
+    );
+    // write-back from the buffer base (not the shifted seq_ptr) into the Python object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu16, simd_data_qu16)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg); // error path: release the sequence, exception already set
+    return NULL;
+}
+#endif // chksize
+
+#endif // 0
+
+/****************************
+ * Lookup tables
+ ****************************/
+#if 16 == 32 // never true: the lut32 wrapper is not emitted for u16
+SIMD_IMPL_INTRIN_2(lut32_u16, vu16, qu16, vu16)
+#endif
+#if 16 == 64 // never true: the lut16 wrapper is not emitted for u16
+SIMD_IMPL_INTRIN_2(lut16_u16, vu16, qu16, vu16)
+#endif
+/***************************
+ * Misc
+ ***************************/
+SIMD_IMPL_INTRIN_0(zero_u16, vu16)
+SIMD_IMPL_INTRIN_1(extract0_u16, u16, vu16)
+SIMD_IMPL_INTRIN_1(setall_u16, vu16, u16)
+SIMD_IMPL_INTRIN_3(select_u16, vu16, vb16, vu16, vu16)
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u8_u16, vu8, vu16)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s8_u16, vs8, vu16)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u16_u16, vu16, vu16)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s16_u16, vs16, vu16)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u32_u16, vu32, vu16)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s32_u16, vs32, vu16)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u64_u16, vu64, vu16)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s64_u16, vs64, vu16)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F32
+SIMD_IMPL_INTRIN_1(reinterpret_f32_u16, vf32, vu16)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F64
+SIMD_IMPL_INTRIN_1(reinterpret_f64_u16, vf64, vu16)
+#endif // simd_sup2
+
+
+/**
+ * special definition due to the nature of intrinsics
+ * npyv_setf_u16 and npyv_set_u16.
+*/
+#line 308
+// Python binding for npyv_setf_u16: builds a vector from the positional
+// arguments. Returns a new vector object, or NULL when the arguments
+// cannot be converted into a lane buffer.
+// NOTE(review): 65 entries are spelled out although the buffer is requested
+// with npyv_nlanes_u16 entries; presumably npyv_setf_u16 is a macro that
+// drops the surplus arguments unevaluated -- confirm.
+static PyObject *
+simd__intrin_setf_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    npyv_lanetype_u16 *lanes = simd_sequence_from_iterable(
+        args, simd_data_qu16, npyv_nlanes_u16
+    );
+    if (!lanes) {
+        return NULL;
+    }
+    simd_data vec = {.vu16 = npyv_setf_u16(
+        lanes[0],  lanes[1],  lanes[2],  lanes[3],  lanes[4],  lanes[5],  lanes[6],  lanes[7],
+        lanes[8],  lanes[9],  lanes[10], lanes[11], lanes[12], lanes[13], lanes[14], lanes[15],
+        lanes[16], lanes[17], lanes[18], lanes[19], lanes[20], lanes[21], lanes[22], lanes[23],
+        lanes[24], lanes[25], lanes[26], lanes[27], lanes[28], lanes[29], lanes[30], lanes[31],
+        lanes[32], lanes[33], lanes[34], lanes[35], lanes[36], lanes[37], lanes[38], lanes[39],
+        lanes[40], lanes[41], lanes[42], lanes[43], lanes[44], lanes[45], lanes[46], lanes[47],
+        lanes[48], lanes[49], lanes[50], lanes[51], lanes[52], lanes[53], lanes[54], lanes[55],
+        lanes[56], lanes[57], lanes[58], lanes[59], lanes[60], lanes[61], lanes[62], lanes[63],
+        lanes[64] // extra entry, needed by the setf form
+    )};
+    // the lane buffer is no longer needed once the vector has been built
+    simd_sequence_free(lanes);
+    return (PyObject*)PySIMDVector_FromData(vec, simd_data_vu16);
+}
+
+#line 308
+// Python binding for npyv_set_u16: builds a vector from the positional
+// arguments. Returns a new vector object, or NULL when the arguments
+// cannot be converted into a lane buffer.
+// NOTE(review): 65 entries are spelled out although the buffer is requested
+// with npyv_nlanes_u16 entries; presumably npyv_set_u16 is a macro that
+// drops the surplus arguments unevaluated -- confirm.
+static PyObject *
+simd__intrin_set_u16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    npyv_lanetype_u16 *lanes = simd_sequence_from_iterable(
+        args, simd_data_qu16, npyv_nlanes_u16
+    );
+    if (!lanes) {
+        return NULL;
+    }
+    simd_data vec = {.vu16 = npyv_set_u16(
+        lanes[0],  lanes[1],  lanes[2],  lanes[3],  lanes[4],  lanes[5],  lanes[6],  lanes[7],
+        lanes[8],  lanes[9],  lanes[10], lanes[11], lanes[12], lanes[13], lanes[14], lanes[15],
+        lanes[16], lanes[17], lanes[18], lanes[19], lanes[20], lanes[21], lanes[22], lanes[23],
+        lanes[24], lanes[25], lanes[26], lanes[27], lanes[28], lanes[29], lanes[30], lanes[31],
+        lanes[32], lanes[33], lanes[34], lanes[35], lanes[36], lanes[37], lanes[38], lanes[39],
+        lanes[40], lanes[41], lanes[42], lanes[43], lanes[44], lanes[45], lanes[46], lanes[47],
+        lanes[48], lanes[49], lanes[50], lanes[51], lanes[52], lanes[53], lanes[54], lanes[55],
+        lanes[56], lanes[57], lanes[58], lanes[59], lanes[60], lanes[61], lanes[62], lanes[63],
+        lanes[64] // extra entry shared with the setf form of this template
+    )};
+    // the lane buffer is no longer needed once the vector has been built
+    simd_sequence_free(lanes);
+    return (PyObject*)PySIMDVector_FromData(vec, simd_data_vu16);
+}
+
+
+/***************************
+ * Reorder
+ ***************************/
+#line 337
+SIMD_IMPL_INTRIN_2(combinel_u16, vu16, vu16, vu16)
+
+#line 337
+SIMD_IMPL_INTRIN_2(combineh_u16, vu16, vu16, vu16)
+
+
+#line 343
+SIMD_IMPL_INTRIN_2(combine_u16, vu16x2, vu16, vu16)
+
+#line 343
+SIMD_IMPL_INTRIN_2(zip_u16, vu16x2, vu16, vu16)
+
+#line 343
+SIMD_IMPL_INTRIN_2(unzip_u16, vu16x2, vu16, vu16)
+
+
+#if 1
+SIMD_IMPL_INTRIN_1(rev64_u16, vu16, vu16)
+#endif
+
+// special implementation to convert runtime constants to immediate values
+#if 16 == 32
+// Runtime-index wrapper around npyv_permi128_u16, which is only ever called
+// here with literal indices. Each requested index is broadcast with its own
+// literal call, spilled to a scratch array, and the four results are merged
+// position by position. This avoids spelling out every one of the 4^4
+// index combinations. Indices outside 1..3 are treated as 0.
+NPY_FINLINE npyv_u16
+npyv_permi128_u16_(npyv_u16 a, unsigned e0, unsigned e1, unsigned e2, unsigned e3)
+{
+    npyv_u16 ve0, ve1, ve2, ve3;
+    npyv_lanetype_u16 de0[npyv_nlanes_u16];
+    npyv_lanetype_u16 de1[npyv_nlanes_u16];
+    npyv_lanetype_u16 de2[npyv_nlanes_u16];
+    npyv_lanetype_u16 de3[npyv_nlanes_u16];
+
+    switch (e0) {
+        case 1:  ve0 = npyv_permi128_u16(a, 1, 1, 1, 1); break;
+        case 2:  ve0 = npyv_permi128_u16(a, 2, 2, 2, 2); break;
+        case 3:  ve0 = npyv_permi128_u16(a, 3, 3, 3, 3); break;
+        default: ve0 = npyv_permi128_u16(a, 0, 0, 0, 0); break;
+    }
+    npyv_store_u16(de0, ve0);
+
+    switch (e1) {
+        case 1:  ve1 = npyv_permi128_u16(a, 1, 1, 1, 1); break;
+        case 2:  ve1 = npyv_permi128_u16(a, 2, 2, 2, 2); break;
+        case 3:  ve1 = npyv_permi128_u16(a, 3, 3, 3, 3); break;
+        default: ve1 = npyv_permi128_u16(a, 0, 0, 0, 0); break;
+    }
+    npyv_store_u16(de1, ve1);
+
+    switch (e2) {
+        case 1:  ve2 = npyv_permi128_u16(a, 1, 1, 1, 1); break;
+        case 2:  ve2 = npyv_permi128_u16(a, 2, 2, 2, 2); break;
+        case 3:  ve2 = npyv_permi128_u16(a, 3, 3, 3, 3); break;
+        default: ve2 = npyv_permi128_u16(a, 0, 0, 0, 0); break;
+    }
+    npyv_store_u16(de2, ve2);
+
+    switch (e3) {
+        case 1:  ve3 = npyv_permi128_u16(a, 1, 1, 1, 1); break;
+        case 2:  ve3 = npyv_permi128_u16(a, 2, 2, 2, 2); break;
+        case 3:  ve3 = npyv_permi128_u16(a, 3, 3, 3, 3); break;
+        default: ve3 = npyv_permi128_u16(a, 0, 0, 0, 0); break;
+    }
+    npyv_store_u16(de3, ve3);
+
+    // all four indices agree: the first broadcast already is the answer
+    if (e0 == e1 && e0 == e2 && e0 == e3) {
+        return ve0;
+    }
+    // position 0 of every group of four already comes from e0; take
+    // positions 1..3 from the e1/e2/e3 broadcasts
+    for (int base = 0; base < npyv_nlanes_u16; base += 4) {
+        de0[base + 1] = de1[base + 1];
+        de0[base + 2] = de2[base + 2];
+        de0[base + 3] = de3[base + 3];
+    }
+    return npyv_load_u16(de0);
+}
+SIMD_IMPL_INTRIN_5(permi128_u16_, vu16, vu16, u8, u8, u8, u8)
+#elif 16 == 64
+// Runtime-index wrapper around the two-index npyv_permi128_u16: maps the
+// runtime pair (e0, e1) onto the matching call with literal indices. Any
+// pair other than (1,0), (0,1) or (1,1) falls back to (0,0).
+NPY_FINLINE npyv_u16
+npyv_permi128_u16_(npyv_u16 a, unsigned e0, unsigned e1)
+{
+    if (e0 == 1) {
+        if (e1 == 0) {
+            return npyv_permi128_u16(a, 1, 0);
+        }
+        if (e1 == 1) {
+            return npyv_permi128_u16(a, 1, 1);
+        }
+    }
+    else if (e0 == 0 && e1 == 1) {
+        return npyv_permi128_u16(a, 0, 1);
+    }
+    return npyv_permi128_u16(a, 0, 0);
+}
+SIMD_IMPL_INTRIN_3(permi128_u16_, vu16, vu16, u8, u8)
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if 15 > 0
+SIMD_IMPL_INTRIN_2(shl_u16, vu16, vu16, u8)
+SIMD_IMPL_INTRIN_2(shr_u16, vu16, vu16, u8)
+// immediate constant
+SIMD_IMPL_INTRIN_2IMM(shli_u16, vu16, vu16, 15)
+SIMD_IMPL_INTRIN_2IMM(shri_u16, vu16, vu16, 16)
+#endif // shl_imm
+
+#line 418
+SIMD_IMPL_INTRIN_2(and_u16, vu16, vu16, vu16)
+
+#line 418
+SIMD_IMPL_INTRIN_2(or_u16, vu16, vu16, vu16)
+
+#line 418
+SIMD_IMPL_INTRIN_2(xor_u16, vu16, vu16, vu16)
+
+
+SIMD_IMPL_INTRIN_1(not_u16, vu16, vu16)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpeq_u16, vb16, vu16, vu16)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpneq_u16, vb16, vu16, vu16)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpgt_u16, vb16, vu16, vu16)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpge_u16, vb16, vu16, vu16)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmplt_u16, vb16, vu16, vu16)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmple_u16, vb16, vu16, vu16)
+
+
+#if 0
+SIMD_IMPL_INTRIN_2(andc_u16, vu16, vu16, vu16)
+SIMD_IMPL_INTRIN_2(andc_b16, vb16, vb16, vb16)
+SIMD_IMPL_INTRIN_2(orc_b16, vb16, vb16, vb16)
+SIMD_IMPL_INTRIN_2(xnor_b16, vb16, vb16, vb16)
+#endif
+
+// test across all vector lanes
+#line 440
+SIMD_IMPL_INTRIN_1(any_u16, u8, vu16)
+
+#line 440
+SIMD_IMPL_INTRIN_1(all_u16, u8, vu16)
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_IMPL_INTRIN_1(cvt_u16_b16, vu16,  vb16)
+SIMD_IMPL_INTRIN_1(cvt_b16_u16, vb16, vu16)
+#if 1
+SIMD_IMPL_INTRIN_1(expand_u32_u16, vu32x2, vu16)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+#line 456
+SIMD_IMPL_INTRIN_2(add_u16, vu16, vu16, vu16)
+
+#line 456
+SIMD_IMPL_INTRIN_2(sub_u16, vu16, vu16, vu16)
+
+
+#if 1
+#line 463
+SIMD_IMPL_INTRIN_2(adds_u16, vu16, vu16, vu16)
+
+#line 463
+SIMD_IMPL_INTRIN_2(subs_u16, vu16, vu16, vu16)
+
+#endif // sat_sup
+
+#if 1
+SIMD_IMPL_INTRIN_2(mul_u16, vu16, vu16, vu16)
+#endif // mul_sup
+
+#if 0
+SIMD_IMPL_INTRIN_2(div_u16, vu16, vu16, vu16)
+#endif // div_sup
+
+#if 1
+SIMD_IMPL_INTRIN_1(divisor_u16, vu16x3, u16)
+SIMD_IMPL_INTRIN_2(divc_u16, vu16, vu16, vu16x3)
+#endif // intdiv_sup
+
+#if 0
+#line 484
+SIMD_IMPL_INTRIN_3(muladd_u16, vu16, vu16, vu16, vu16)
+
+#line 484
+SIMD_IMPL_INTRIN_3(mulsub_u16, vu16, vu16, vu16, vu16)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmuladd_u16, vu16, vu16, vu16, vu16)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmulsub_u16, vu16, vu16, vu16, vu16)
+
+#line 484
+SIMD_IMPL_INTRIN_3(muladdsub_u16, vu16, vu16, vu16, vu16)
+
+#endif // fused_sup
+
+#if 0
+SIMD_IMPL_INTRIN_1(sum_u16, u16, vu16)
+#endif // sum_sup
+
+#if 1
+SIMD_IMPL_INTRIN_1(sumup_u16, u32, vu16)
+#endif // sumup_sup
+
+/***************************
+ * Math
+ ***************************/
+#if 0
+#line 503
+SIMD_IMPL_INTRIN_1(sqrt_u16, vu16, vu16)
+
+#line 503
+SIMD_IMPL_INTRIN_1(recip_u16, vu16, vu16)
+
+#line 503
+SIMD_IMPL_INTRIN_1(abs_u16, vu16, vu16)
+
+#line 503
+SIMD_IMPL_INTRIN_1(square_u16, vu16, vu16)
+
+#line 503
+SIMD_IMPL_INTRIN_1(rint_u16, vu16, vu16)
+
+#line 503
+SIMD_IMPL_INTRIN_1(ceil_u16, vu16, vu16)
+
+#line 503
+SIMD_IMPL_INTRIN_1(trunc_u16, vu16, vu16)
+
+#line 503
+SIMD_IMPL_INTRIN_1(floor_u16, vu16, vu16)
+
+#endif
+
+#line 510
+SIMD_IMPL_INTRIN_2(max_u16, vu16, vu16, vu16)
+SIMD_IMPL_INTRIN_1(reduce_max_u16, u16, vu16)
+
+#line 510
+SIMD_IMPL_INTRIN_2(min_u16, vu16, vu16, vu16)
+SIMD_IMPL_INTRIN_1(reduce_min_u16, u16, vu16)
+
+
+#if 0
+#line 518
+SIMD_IMPL_INTRIN_2(maxp_u16, vu16, vu16, vu16)
+SIMD_IMPL_INTRIN_1(reduce_maxp_u16, u16, vu16)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minp_u16, vu16, vu16, vu16)
+SIMD_IMPL_INTRIN_1(reduce_minp_u16, u16, vu16)
+
+#line 518
+SIMD_IMPL_INTRIN_2(maxn_u16, vu16, vu16, vu16)
+SIMD_IMPL_INTRIN_1(reduce_maxn_u16, u16, vu16)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minn_u16, vu16, vu16, vu16)
+SIMD_IMPL_INTRIN_1(reduce_minn_u16, u16, vu16)
+
+/**end repeat1**/
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+#line 530
+ SIMD_IMPL_INTRIN_4(ifadd_u16, vu16, vb16, vu16, vu16, vu16)
+
+#line 530
+ SIMD_IMPL_INTRIN_4(ifsub_u16, vu16, vb16, vu16, vu16, vu16)
+
+
+#if 0
+SIMD_IMPL_INTRIN_4(ifdiv_u16, vu16, vb16, vu16, vu16, vu16)
+SIMD_IMPL_INTRIN_3(ifdivz_u16, vu16, vb16, vu16, vu16)
+#endif
+
+#endif // simd_sup
+
+#line 36
+#if 1
+/***************************
+ * Memory
+ ***************************/
+#line 43
+SIMD_IMPL_INTRIN_1(load_s16, vs16, qs16)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loada_s16, vs16, qs16)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loads_s16, vs16, qs16)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loadl_s16, vs16, qs16)
+
+SIMD_IMPL_INTRIN_1(load_s16x2, vs16x2, qs16)
+
+#line 51
+// Python binding for npyv_store_s16. Hand-written because a store has to
+// copy the modified buffer back into the caller's Python sequence.
+// Expects (sequence, vector); returns None, or NULL on failure.
+static PyObject *
+simd__intrin_store_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs16};
+    simd_arg src = {.dtype = simd_data_vs16};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:store_s16",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_store_s16(dst.data.qs16, src.data.vs16);
+    // propagate the stored lanes to the Python-side sequence
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs16, simd_data_qs16)) {
+        simd_arg_free(&dst);
+        Py_RETURN_NONE;
+    }
+    simd_arg_free(&dst);
+    return NULL;
+}
+
+#line 51
+// Python binding for npyv_storea_s16. Hand-written because a store has to
+// copy the modified buffer back into the caller's Python sequence.
+// Expects (sequence, vector); returns None, or NULL on failure.
+static PyObject *
+simd__intrin_storea_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs16};
+    simd_arg src = {.dtype = simd_data_vs16};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:storea_s16",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_storea_s16(dst.data.qs16, src.data.vs16);
+    // propagate the stored lanes to the Python-side sequence
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs16, simd_data_qs16)) {
+        simd_arg_free(&dst);
+        Py_RETURN_NONE;
+    }
+    simd_arg_free(&dst);
+    return NULL;
+}
+
+#line 51
+// Python binding for npyv_stores_s16. Hand-written because a store has to
+// copy the modified buffer back into the caller's Python sequence.
+// Expects (sequence, vector); returns None, or NULL on failure.
+static PyObject *
+simd__intrin_stores_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs16};
+    simd_arg src = {.dtype = simd_data_vs16};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:stores_s16",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_stores_s16(dst.data.qs16, src.data.vs16);
+    // propagate the stored lanes to the Python-side sequence
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs16, simd_data_qs16)) {
+        simd_arg_free(&dst);
+        Py_RETURN_NONE;
+    }
+    simd_arg_free(&dst);
+    return NULL;
+}
+
+#line 51
+// Python binding for npyv_storel_s16. Hand-written because a store has to
+// copy the modified buffer back into the caller's Python sequence.
+// Expects (sequence, vector); returns None, or NULL on failure.
+static PyObject *
+simd__intrin_storel_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs16};
+    simd_arg src = {.dtype = simd_data_vs16};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:storel_s16",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_storel_s16(dst.data.qs16, src.data.vs16);
+    // propagate the stored lanes to the Python-side sequence
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs16, simd_data_qs16)) {
+        simd_arg_free(&dst);
+        Py_RETURN_NONE;
+    }
+    simd_arg_free(&dst);
+    return NULL;
+}
+
+#line 51
+// Python binding for npyv_storeh_s16. Hand-written because a store has to
+// copy the modified buffer back into the caller's Python sequence.
+// Expects (sequence, vector); returns None, or NULL on failure.
+static PyObject *
+simd__intrin_storeh_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs16};
+    simd_arg src = {.dtype = simd_data_vs16};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:storeh_s16",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_storeh_s16(dst.data.qs16, src.data.vs16);
+    // propagate the stored lanes to the Python-side sequence
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs16, simd_data_qs16)) {
+        simd_arg_free(&dst);
+        Py_RETURN_NONE;
+    }
+    simd_arg_free(&dst);
+    return NULL;
+}
+
+#line 51
+// Python binding for npyv_store_s16x2 (the vector argument is a vs16x2
+// pair). Hand-written because a store has to copy the modified buffer back
+// into the caller's Python sequence.
+// Expects (sequence, vector pair); returns None, or NULL on failure.
+static PyObject *
+simd__intrin_store_s16x2(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs16};
+    simd_arg src = {.dtype = simd_data_vs16x2};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:store_s16x2",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_store_s16x2(dst.data.qs16, src.data.vs16x2);
+    // propagate the stored lanes to the Python-side sequence
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs16, simd_data_qs16)) {
+        simd_arg_free(&dst);
+        Py_RETURN_NONE;
+    }
+    simd_arg_free(&dst);
+    return NULL;
+}
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 0
+// Partial Load
+SIMD_IMPL_INTRIN_3(load_till_s16, vs16, qs16, u32, s16)
+SIMD_IMPL_INTRIN_2(load_tillz_s16, vs16, qs16, u32)
+#if 16 == 32
+    SIMD_IMPL_INTRIN_4(load2_till_s16, vs16, qs16, u32, s16, s16)
+    SIMD_IMPL_INTRIN_2(load2_tillz_s16, vs16, qs16, u32)
+#else
+    SIMD_IMPL_INTRIN_4(load2_till_s16, vs16, qs16, u32, s16, s16)
+    SIMD_IMPL_INTRIN_2(load2_tillz_s16, vs16, qs16, u32)
+#endif
+
+// Partial Store
+#line 95
+#if !0 || 0 == 16
+// Python binding for npyv_store_till_s16.
+// Expects (sequence, nlane, vector); after the store the buffer is copied
+// back into the Python sequence. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_store_till_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs16};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vs16};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:store_till_s16",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_store_till_s16(dst.data.qs16, count.data.u32, src.data.vs16);
+    // propagate the stored lanes to the Python-side sequence
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs16, simd_data_qs16)) {
+        simd_arg_free(&dst);
+        Py_RETURN_NONE;
+    }
+    simd_arg_free(&dst);
+    return NULL;
+}
+#endif // chksize
+
+
+#line 95
+#if !32 || 32 == 16
+// Python binding for npyv_store2_till_s16.
+// Expects (sequence, nlane, vector); after the store the buffer is copied
+// back into the Python sequence. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_store2_till_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs16};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vs16};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:store2_till_s16",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_store2_till_s16(dst.data.qs16, count.data.u32, src.data.vs16);
+    // propagate the stored lanes to the Python-side sequence
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs16, simd_data_qs16)) {
+        simd_arg_free(&dst);
+        Py_RETURN_NONE;
+    }
+    simd_arg_free(&dst);
+    return NULL;
+}
+#endif // chksize
+
+
+#line 95
+#if !64 || 64 == 16
+// Python binding for npyv_store2_till_s16 (copy emitted under the 64-bit
+// size guard of the template).
+// Expects (sequence, nlane, vector); after the store the buffer is copied
+// back into the Python sequence. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_store2_till_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs16};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vs16};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:store2_till_s16",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_store2_till_s16(dst.data.qs16, count.data.u32, src.data.vs16);
+    // propagate the stored lanes to the Python-side sequence
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs16, simd_data_qs16)) {
+        simd_arg_free(&dst);
+        Py_RETURN_NONE;
+    }
+    simd_arg_free(&dst);
+    return NULL;
+}
+#endif // chksize
+
+
+// Non-contiguous Load
+#line 136
+#if !0 || 0 == 16
+// Python binding for npyv_loadn_s16 (strided load).
+// Expects (sequence, stride). Returns a new vector object, or NULL with an
+// exception set; ValueError is raised when the sequence is shorter than
+// |stride| * npyv_nlanes_s16.
+static PyObject *
+simd__intrin_loadn_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs16};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn_s16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s16 *seq_ptr = seq_arg.data.qs16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s16;
+    if (stride < 0) {
+        // negative stride: start at the last element and compare against
+        // the magnitude of the required length
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t-sized arguments; %d would fetch them
+        // as int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn_s16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s16 rvec = npyv_loadn_s16(seq_ptr, stride);
+    simd_arg ret = {
+        .dtype = simd_data_vs16, .data = {.vs16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 16
+// Python binding for npyv_loadn2_s16 (strided load).
+// Expects (sequence, stride). Returns a new vector object, or NULL with an
+// exception set; ValueError is raised when the sequence is shorter than
+// |stride| * npyv_nlanes_s16.
+static PyObject *
+simd__intrin_loadn2_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs16};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_s16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s16 *seq_ptr = seq_arg.data.qs16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s16;
+    if (stride < 0) {
+        // negative stride: start at the second-to-last element and compare
+        // against the magnitude of the required length
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t-sized arguments; %d would fetch them
+        // as int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_s16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s16 rvec = npyv_loadn2_s16(seq_ptr, stride);
+    simd_arg ret = {
+        .dtype = simd_data_vs16, .data = {.vs16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 16
+// Python binding for npyv_loadn2_s16 (strided load; copy emitted under the
+// 64-bit size guard of the template).
+// Expects (sequence, stride). Returns a new vector object, or NULL with an
+// exception set; ValueError is raised when the sequence is shorter than
+// |stride| * npyv_nlanes_s16.
+static PyObject *
+simd__intrin_loadn2_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs16};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_s16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s16 *seq_ptr = seq_arg.data.qs16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s16;
+    if (stride < 0) {
+        // negative stride: start at the second-to-last element and compare
+        // against the magnitude of the required length
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t-sized arguments; %d would fetch them
+        // as int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_s16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s16 rvec = npyv_loadn2_s16(seq_ptr, stride);
+    simd_arg ret = {
+        .dtype = simd_data_vs16, .data = {.vs16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 16
+// Python binding for npyv_loadn_till_s16 (strided load with lane count and
+// fill value).
+// Expects (sequence, stride, nlane, fill). Returns a new vector object, or
+// NULL with an exception set; ValueError is raised when the sequence is
+// shorter than |stride| * npyv_nlanes_s16.
+static PyObject *
+simd__intrin_loadn_till_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs16};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg fill_arg = {.dtype = simd_data_s16};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:loadn_till_s16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &fill_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s16 *seq_ptr = seq_arg.data.qs16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s16;
+    if (stride < 0) {
+        // negative stride: start at the last element and compare against
+        // the magnitude of the required length
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t-sized arguments; %d would fetch them
+        // as int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn_till_s16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s16 rvec = npyv_loadn_till_s16(
+        seq_ptr, stride, nlane_arg.data.u32, fill_arg.data.s16
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs16, .data = {.vs16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 16
+// Python binding for npyv_loadn2_till_s16 (strided load with lane count
+// and two fill values).
+// Expects (sequence, stride, nlane, fill, fill2). Returns a new vector
+// object, or NULL with an exception set; ValueError is raised when the
+// sequence is shorter than |stride| * npyv_nlanes_s16.
+static PyObject *
+simd__intrin_loadn2_till_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs16};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg fill_arg = {.dtype = simd_data_s16};
+    simd_arg fill2_arg = {.dtype = simd_data_s16};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_s16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &fill_arg,
+        simd_arg_converter, &fill2_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s16 *seq_ptr = seq_arg.data.qs16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s16;
+    if (stride < 0) {
+        // negative stride: start at the second-to-last element and compare
+        // against the magnitude of the required length
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t-sized arguments; %d would fetch them
+        // as int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_till_s16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s16 rvec = npyv_loadn2_till_s16(
+        seq_ptr, stride, nlane_arg.data.u32,
+        fill_arg.data.s16, fill2_arg.data.s16
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs16, .data = {.vs16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 16
+// Python binding for npyv_loadn2_till_s16 (strided load with lane count
+// and two fill values; copy emitted under the 64-bit size guard of the
+// template).
+// Expects (sequence, stride, nlane, fill, fill2). Returns a new vector
+// object, or NULL with an exception set; ValueError is raised when the
+// sequence is shorter than |stride| * npyv_nlanes_s16.
+static PyObject *
+simd__intrin_loadn2_till_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs16};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg fill_arg = {.dtype = simd_data_s16};
+    simd_arg fill2_arg = {.dtype = simd_data_s16};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_s16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &fill_arg,
+        simd_arg_converter, &fill2_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s16 *seq_ptr = seq_arg.data.qs16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s16;
+    if (stride < 0) {
+        // negative stride: start at the second-to-last element and compare
+        // against the magnitude of the required length
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t-sized arguments; %d would fetch them
+        // as int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_till_s16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s16 rvec = npyv_loadn2_till_s16(
+        seq_ptr, stride, nlane_arg.data.u32,
+        fill_arg.data.s16, fill2_arg.data.s16
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs16, .data = {.vs16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 16
+// Python binding for npyv_loadn_tillz_s16 (strided load with lane count).
+// Expects (sequence, stride, nlane). Returns a new vector object, or NULL
+// with an exception set; ValueError is raised when the sequence is shorter
+// than |stride| * npyv_nlanes_s16.
+static PyObject *
+simd__intrin_loadn_tillz_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs16};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn_tillz_s16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s16 *seq_ptr = seq_arg.data.qs16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s16;
+    if (stride < 0) {
+        // negative stride: start at the last element and compare against
+        // the magnitude of the required length
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t-sized arguments; %d would fetch them
+        // as int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn_tillz_s16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s16 rvec = npyv_loadn_tillz_s16(
+        seq_ptr, stride, nlane_arg.data.u32
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs16, .data = {.vs16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 16
+/**
+ * Python-callable wrapper around npyv_loadn2_tillz_s16().
+ *
+ * Positional arguments (each converted through simd_arg_converter):
+ *   1. a sequence of s16 lanes used as the source buffer,
+ *   2. the stride (signed 64-bit integer; may be negative),
+ *   3. the lane count forwarded to the intrinsic (unsigned 32-bit integer).
+ *
+ * Returns a new s16 vector object. Returns NULL with an exception set when
+ * argument parsing fails, or with ValueError when the sequence is shorter
+ * than |stride| * npyv_nlanes_s16.
+ */
+static PyObject *
+simd__intrin_loadn2_tillz_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs16};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_s16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s16 *seq_ptr = seq_arg.data.qs16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s16;
+    if (stride < 0) {
+        // negative stride: start from the last 2-element group
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; %d would read them as int,
+        // which is a type mismatch wherever Py_ssize_t is wider than int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_s16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s16 rvec = npyv_loadn2_tillz_s16(
+        seq_ptr, stride
+        , nlane_arg.data.u32
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs16, .data = {.vs16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 16
+/**
+ * Python-callable wrapper around npyv_loadn2_tillz_s16() (variant emitted
+ * under the 64-bit size guard of the enclosing #if).
+ *
+ * Positional arguments (each converted through simd_arg_converter):
+ *   1. a sequence of s16 lanes used as the source buffer,
+ *   2. the stride (signed 64-bit integer; may be negative),
+ *   3. the lane count forwarded to the intrinsic (unsigned 32-bit integer).
+ *
+ * Returns a new s16 vector object. Returns NULL with an exception set when
+ * argument parsing fails, or with ValueError when the sequence is shorter
+ * than |stride| * npyv_nlanes_s16.
+ */
+static PyObject *
+simd__intrin_loadn2_tillz_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs16};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_s16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s16 *seq_ptr = seq_arg.data.qs16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s16;
+    if (stride < 0) {
+        // negative stride: start from the last 2-element group
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; %d would read them as int,
+        // which is a type mismatch wherever Py_ssize_t is wider than int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_s16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s16 rvec = npyv_loadn2_tillz_s16(
+        seq_ptr, stride
+        , nlane_arg.data.u32
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs16, .data = {.vs16=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+
+// Non-contiguous Store
+#line 216
+#if !0 || 0 == 16
+/**
+ * Python-callable wrapper around npyv_storen_s16().
+ *
+ * Positional arguments (each converted through simd_arg_converter):
+ *   1. a sequence of s16 lanes used as the destination buffer,
+ *   2. the stride (signed 64-bit integer; may be negative),
+ *   3. the s16 vector to store.
+ *
+ * After the store, the buffer is written back into the Python sequence
+ * object. Returns None on success. Returns NULL with an exception set when
+ * argument parsing or the write-back fails, or with ValueError when the
+ * sequence is shorter than |stride| * npyv_nlanes_s16.
+ */
+static PyObject *
+simd__intrin_storen_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs16};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vs16};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen_s16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s16 *seq_ptr = seq_arg.data.qs16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s16;
+    if (stride < 0) {
+        // negative stride: start from the last element and walk backwards
+        seq_ptr += cur_seq_len - 1*1;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments (%d expects int); the first
+        // literal now ends with a space so the words are not glued together.
+        PyErr_Format(PyExc_ValueError,
+            "storen_s16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen_s16(
+        seq_ptr, stride
+        ,vec_arg.data.vs16
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs16, simd_data_qs16)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 16
+/**
+ * Python-callable wrapper around npyv_storen2_s16().
+ *
+ * Positional arguments (each converted through simd_arg_converter):
+ *   1. a sequence of s16 lanes used as the destination buffer,
+ *   2. the stride (signed 64-bit integer; may be negative),
+ *   3. the s16 vector to store.
+ *
+ * After the store, the buffer is written back into the Python sequence
+ * object. Returns None on success. Returns NULL with an exception set when
+ * argument parsing or the write-back fails, or with ValueError when the
+ * sequence is shorter than |stride| * npyv_nlanes_s16.
+ */
+static PyObject *
+simd__intrin_storen2_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs16};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vs16};
+    // the name after ':' is what Python reports in argument errors, so it
+    // must be this intrinsic's own name
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_s16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s16 *seq_ptr = seq_arg.data.qs16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s16;
+    if (stride < 0) {
+        // negative stride: start from the last 2-element group
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments (%d expects int); the first
+        // literal now ends with a space so the words are not glued together.
+        PyErr_Format(PyExc_ValueError,
+            "storen2_s16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_s16(
+        seq_ptr, stride
+        ,vec_arg.data.vs16
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs16, simd_data_qs16)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 16
+/**
+ * Python-callable wrapper around npyv_storen2_s16() (variant emitted under
+ * the 64-bit size guard of the enclosing #if).
+ *
+ * Positional arguments (each converted through simd_arg_converter):
+ *   1. a sequence of s16 lanes used as the destination buffer,
+ *   2. the stride (signed 64-bit integer; may be negative),
+ *   3. the s16 vector to store.
+ *
+ * After the store, the buffer is written back into the Python sequence
+ * object. Returns None on success. Returns NULL with an exception set when
+ * argument parsing or the write-back fails, or with ValueError when the
+ * sequence is shorter than |stride| * npyv_nlanes_s16.
+ */
+static PyObject *
+simd__intrin_storen2_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs16};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vs16};
+    // the name after ':' is what Python reports in argument errors, so it
+    // must be this intrinsic's own name
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_s16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s16 *seq_ptr = seq_arg.data.qs16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s16;
+    if (stride < 0) {
+        // negative stride: start from the last 2-element group
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments (%d expects int); the first
+        // literal now ends with a space so the words are not glued together.
+        PyErr_Format(PyExc_ValueError,
+            "storen2_s16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_s16(
+        seq_ptr, stride
+        ,vec_arg.data.vs16
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs16, simd_data_qs16)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !0 || 0 == 16
+/**
+ * Python-callable wrapper around npyv_storen_till_s16().
+ *
+ * Positional arguments (each converted through simd_arg_converter):
+ *   1. a sequence of s16 lanes used as the destination buffer,
+ *   2. the stride (signed 64-bit integer; may be negative),
+ *   3. the lane count forwarded to the intrinsic (unsigned 32-bit integer),
+ *   4. the s16 vector to store.
+ *
+ * After the store, the buffer is written back into the Python sequence
+ * object. Returns None on success. Returns NULL with an exception set when
+ * argument parsing or the write-back fails, or with ValueError when the
+ * sequence is shorter than |stride| * npyv_nlanes_s16.
+ */
+static PyObject *
+simd__intrin_storen_till_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs16};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vs16};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    // the name after ':' is what Python reports in argument errors, so it
+    // must be this intrinsic's own name
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen_till_s16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s16 *seq_ptr = seq_arg.data.qs16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s16;
+    if (stride < 0) {
+        // negative stride: start from the last element and walk backwards
+        seq_ptr += cur_seq_len - 1*1;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments (%d expects int); the first
+        // literal now ends with a space so the words are not glued together.
+        PyErr_Format(PyExc_ValueError,
+            "storen_till_s16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen_till_s16(
+        seq_ptr, stride
+        ,nlane_arg.data.u32
+        ,vec_arg.data.vs16
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs16, simd_data_qs16)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 16
+/**
+ * Python-callable wrapper around npyv_storen2_till_s16().
+ *
+ * Positional arguments (each converted through simd_arg_converter):
+ *   1. a sequence of s16 lanes used as the destination buffer,
+ *   2. the stride (signed 64-bit integer; may be negative),
+ *   3. the lane count forwarded to the intrinsic (unsigned 32-bit integer),
+ *   4. the s16 vector to store.
+ *
+ * After the store, the buffer is written back into the Python sequence
+ * object. Returns None on success. Returns NULL with an exception set when
+ * argument parsing or the write-back fails, or with ValueError when the
+ * sequence is shorter than |stride| * npyv_nlanes_s16.
+ */
+static PyObject *
+simd__intrin_storen2_till_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs16};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vs16};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    // the name after ':' is what Python reports in argument errors, so it
+    // must be this intrinsic's own name
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_s16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s16 *seq_ptr = seq_arg.data.qs16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s16;
+    if (stride < 0) {
+        // negative stride: start from the last 2-element group
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments (%d expects int); the first
+        // literal now ends with a space so the words are not glued together.
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_s16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_till_s16(
+        seq_ptr, stride
+        ,nlane_arg.data.u32
+        ,vec_arg.data.vs16
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs16, simd_data_qs16)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 16
+/**
+ * Python-callable wrapper around npyv_storen2_till_s16() (variant emitted
+ * under the 64-bit size guard of the enclosing #if).
+ *
+ * Positional arguments (each converted through simd_arg_converter):
+ *   1. a sequence of s16 lanes used as the destination buffer,
+ *   2. the stride (signed 64-bit integer; may be negative),
+ *   3. the lane count forwarded to the intrinsic (unsigned 32-bit integer),
+ *   4. the s16 vector to store.
+ *
+ * After the store, the buffer is written back into the Python sequence
+ * object. Returns None on success. Returns NULL with an exception set when
+ * argument parsing or the write-back fails, or with ValueError when the
+ * sequence is shorter than |stride| * npyv_nlanes_s16.
+ */
+static PyObject *
+simd__intrin_storen2_till_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs16};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vs16};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    // the name after ':' is what Python reports in argument errors, so it
+    // must be this intrinsic's own name
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_s16",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s16 *seq_ptr = seq_arg.data.qs16;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s16;
+    if (stride < 0) {
+        // negative stride: start from the last 2-element group
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments (%d expects int); the first
+        // literal now ends with a space so the words are not glued together.
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_s16(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_till_s16(
+        seq_ptr, stride
+        ,nlane_arg.data.u32
+        ,vec_arg.data.vs16
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs16, simd_data_qs16)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#endif // 0
+
+/****************************
+ * Lookup tables
+ ****************************/
+// Both guards compare the literal 16 (presumably the s16 lane width filled in
+// by the template) with 32 and 64, so neither lookup-table wrapper is
+// compiled for s16.
+#if 16 == 32
+SIMD_IMPL_INTRIN_2(lut32_s16, vs16, qs16, vu16)
+#endif
+#if 16 == 64
+SIMD_IMPL_INTRIN_2(lut16_s16, vs16, qs16, vu16)
+#endif
+/***************************
+ * Misc
+ ***************************/
+// NOTE(review): SIMD_IMPL_INTRIN_<N> is defined outside this chunk. It
+// presumably emits a Python wrapper for the named intrinsic taking N
+// arguments, with the first type tag being the return type - confirm against
+// the macro definition.
+SIMD_IMPL_INTRIN_0(zero_s16, vs16)
+SIMD_IMPL_INTRIN_1(extract0_s16, s16, vs16)
+SIMD_IMPL_INTRIN_1(setall_s16, vs16, s16)
+SIMD_IMPL_INTRIN_3(select_s16, vs16, vb16, vs16, vs16)
+
+// reinterpret_<dst>_s16 wrappers, one per destination vector type. The
+// integer destinations are always compiled (#if 1); the f32/f64 ones depend
+// on NPY_SIMD_F32 / NPY_SIMD_F64.
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u8_s16, vu8, vs16)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s8_s16, vs8, vs16)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u16_s16, vu16, vs16)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s16_s16, vs16, vs16)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u32_s16, vu32, vs16)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s32_s16, vs32, vs16)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u64_s16, vu64, vs16)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s64_s16, vs64, vs16)
+#endif // simd_sup2
+
+#line 296
+// only when single-precision SIMD is available
+#if NPY_SIMD_F32
+SIMD_IMPL_INTRIN_1(reinterpret_f32_s16, vf32, vs16)
+#endif // simd_sup2
+
+#line 296
+// only when double-precision SIMD is available
+#if NPY_SIMD_F64
+SIMD_IMPL_INTRIN_1(reinterpret_f64_s16, vf64, vs16)
+#endif // simd_sup2
+
+
+/**
+ * special definition due to the nature of intrinsics
+ * npyv_setf_s16 and npy_set_s16.
+*/
+#line 308
+// Python wrapper for npyv_setf_s16: converts the positional arguments into a
+// temporary s16 lane buffer (at least npyv_nlanes_s16 entries) and hands the
+// entries to the intrinsic as individual arguments.
+// NOTE(review): the call lists 65 entries; presumably npyv_setf_s16 is a
+// macro that only expands the ones it needs - confirm.
+static PyObject *
+simd__intrin_setf_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    npyv_lanetype_s16 *lanes = simd_sequence_from_iterable(
+        args, simd_data_qs16, npyv_nlanes_s16
+    );
+    if (!lanes) {
+        return NULL;
+    }
+    simd_data packed = {.vs16 = npyv_setf_s16(
+        lanes[0],  lanes[1],  lanes[2],  lanes[3],
+        lanes[4],  lanes[5],  lanes[6],  lanes[7],
+        lanes[8],  lanes[9],  lanes[10], lanes[11],
+        lanes[12], lanes[13], lanes[14], lanes[15],
+        lanes[16], lanes[17], lanes[18], lanes[19],
+        lanes[20], lanes[21], lanes[22], lanes[23],
+        lanes[24], lanes[25], lanes[26], lanes[27],
+        lanes[28], lanes[29], lanes[30], lanes[31],
+        lanes[32], lanes[33], lanes[34], lanes[35],
+        lanes[36], lanes[37], lanes[38], lanes[39],
+        lanes[40], lanes[41], lanes[42], lanes[43],
+        lanes[44], lanes[45], lanes[46], lanes[47],
+        lanes[48], lanes[49], lanes[50], lanes[51],
+        lanes[52], lanes[53], lanes[54], lanes[55],
+        lanes[56], lanes[57], lanes[58], lanes[59],
+        lanes[60], lanes[61], lanes[62], lanes[63],
+        lanes[64] // for setf
+    )};
+    // the temporary buffer is no longer needed once the vector is built
+    simd_sequence_free(lanes);
+    PyObject *vec_obj = (PyObject*)PySIMDVector_FromData(packed, simd_data_vs16);
+    return vec_obj;
+}
+
+#line 308
+// Python wrapper for npyv_set_s16: converts the positional arguments into a
+// temporary s16 lane buffer (at least npyv_nlanes_s16 entries) and hands the
+// entries to the intrinsic as individual arguments.
+// NOTE(review): the call lists 65 entries; presumably npyv_set_s16 is a
+// macro that only expands the ones it needs - confirm.
+static PyObject *
+simd__intrin_set_s16(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    npyv_lanetype_s16 *lanes = simd_sequence_from_iterable(
+        args, simd_data_qs16, npyv_nlanes_s16
+    );
+    if (!lanes) {
+        return NULL;
+    }
+    simd_data packed = {.vs16 = npyv_set_s16(
+        lanes[0],  lanes[1],  lanes[2],  lanes[3],
+        lanes[4],  lanes[5],  lanes[6],  lanes[7],
+        lanes[8],  lanes[9],  lanes[10], lanes[11],
+        lanes[12], lanes[13], lanes[14], lanes[15],
+        lanes[16], lanes[17], lanes[18], lanes[19],
+        lanes[20], lanes[21], lanes[22], lanes[23],
+        lanes[24], lanes[25], lanes[26], lanes[27],
+        lanes[28], lanes[29], lanes[30], lanes[31],
+        lanes[32], lanes[33], lanes[34], lanes[35],
+        lanes[36], lanes[37], lanes[38], lanes[39],
+        lanes[40], lanes[41], lanes[42], lanes[43],
+        lanes[44], lanes[45], lanes[46], lanes[47],
+        lanes[48], lanes[49], lanes[50], lanes[51],
+        lanes[52], lanes[53], lanes[54], lanes[55],
+        lanes[56], lanes[57], lanes[58], lanes[59],
+        lanes[60], lanes[61], lanes[62], lanes[63],
+        lanes[64] // for setf
+    )};
+    // the temporary buffer is no longer needed once the vector is built
+    simd_sequence_free(lanes);
+    PyObject *vec_obj = (PyObject*)PySIMDVector_FromData(packed, simd_data_vs16);
+    return vec_obj;
+}
+
+
+/***************************
+ * Reorder
+ ***************************/
+// combinel/combineh take two s16 vectors and return a single s16 vector.
+#line 337
+SIMD_IMPL_INTRIN_2(combinel_s16, vs16, vs16, vs16)
+
+#line 337
+SIMD_IMPL_INTRIN_2(combineh_s16, vs16, vs16, vs16)
+
+
+// combine/zip/unzip take two s16 vectors and return a pair (vs16x2).
+#line 343
+SIMD_IMPL_INTRIN_2(combine_s16, vs16x2, vs16, vs16)
+
+#line 343
+SIMD_IMPL_INTRIN_2(zip_s16, vs16x2, vs16, vs16)
+
+#line 343
+SIMD_IMPL_INTRIN_2(unzip_s16, vs16x2, vs16, vs16)
+
+
+// rev64 is enabled for s16 (guard is a literal 1).
+#if 1
+SIMD_IMPL_INTRIN_1(rev64_s16, vs16, vs16)
+#endif
+
+// special implementation to convert runtime constants to immediate values
+#if 16 == 32
+// one call for element index then gather them within one vector
+// instead of unroll the 255 possible cases.
+/**
+ * Runtime-selector front end for npyv_permi128_s16, which is invoked here
+ * only with literal indices.
+ *
+ * For each selector e0..e3 a "broadcast" permutation (all four slots set to
+ * that selector) is computed and spilled to a lane buffer; selectors other
+ * than 1, 2 or 3 are treated as 0. If all selectors are equal the broadcast
+ * result is returned directly; otherwise slots 1..3 of every 4-lane group
+ * are taken from the buffers of e1..e3 and the merged buffer is reloaded.
+ */
+NPY_FINLINE npyv_s16
+npyv_permi128_s16_(npyv_s16 a, unsigned e0, unsigned e1, unsigned e2, unsigned e3)
+{
+    npyv_s16 ve0, ve1, ve2, ve3;
+    npyv_lanetype_s16 de0[npyv_nlanes_s16];
+    npyv_lanetype_s16 de1[npyv_nlanes_s16];
+    npyv_lanetype_s16 de2[npyv_nlanes_s16];
+    npyv_lanetype_s16 de3[npyv_nlanes_s16];
+
+    // selector for slot 0
+    switch (e0) {
+    case 1:
+        ve0 = npyv_permi128_s16(a, 1, 1, 1, 1);
+        break;
+    case 2:
+        ve0 = npyv_permi128_s16(a, 2, 2, 2, 2);
+        break;
+    case 3:
+        ve0 = npyv_permi128_s16(a, 3, 3, 3, 3);
+        break;
+    default:
+        ve0 = npyv_permi128_s16(a, 0, 0, 0, 0);
+        break;
+    }
+    npyv_store_s16(de0, ve0);
+
+    // selector for slot 1
+    switch (e1) {
+    case 1:
+        ve1 = npyv_permi128_s16(a, 1, 1, 1, 1);
+        break;
+    case 2:
+        ve1 = npyv_permi128_s16(a, 2, 2, 2, 2);
+        break;
+    case 3:
+        ve1 = npyv_permi128_s16(a, 3, 3, 3, 3);
+        break;
+    default:
+        ve1 = npyv_permi128_s16(a, 0, 0, 0, 0);
+        break;
+    }
+    npyv_store_s16(de1, ve1);
+
+    // selector for slot 2
+    switch (e2) {
+    case 1:
+        ve2 = npyv_permi128_s16(a, 1, 1, 1, 1);
+        break;
+    case 2:
+        ve2 = npyv_permi128_s16(a, 2, 2, 2, 2);
+        break;
+    case 3:
+        ve2 = npyv_permi128_s16(a, 3, 3, 3, 3);
+        break;
+    default:
+        ve2 = npyv_permi128_s16(a, 0, 0, 0, 0);
+        break;
+    }
+    npyv_store_s16(de2, ve2);
+
+    // selector for slot 3
+    switch (e3) {
+    case 1:
+        ve3 = npyv_permi128_s16(a, 1, 1, 1, 1);
+        break;
+    case 2:
+        ve3 = npyv_permi128_s16(a, 2, 2, 2, 2);
+        break;
+    case 3:
+        ve3 = npyv_permi128_s16(a, 3, 3, 3, 3);
+        break;
+    default:
+        ve3 = npyv_permi128_s16(a, 0, 0, 0, 0);
+        break;
+    }
+    npyv_store_s16(de3, ve3);
+
+    // identical selectors: the broadcast result is already the answer
+    if (e0 == e1 && e1 == e2 && e2 == e3) {
+        return ve0;
+    }
+    // slot 0 of each group stays as computed for e0; patch in slots 1..3
+    for (int base = 0; base < npyv_nlanes_s16; base += 4) {
+        de0[base+1] = de1[base+1];
+        de0[base+2] = de2[base+2];
+        de0[base+3] = de3[base+3];
+    }
+    return npyv_load_s16(de0);
+}
+SIMD_IMPL_INTRIN_5(permi128_s16_, vs16, vs16, u8, u8, u8, u8)
+#elif 16 == 64
+/**
+ * Two-selector variant: maps the runtime pair (e0, e1) onto a call of
+ * npyv_permi128_s16 with literal indices. Any pair other than (1,0), (0,1)
+ * or (1,1) falls back to (0,0).
+ */
+NPY_FINLINE npyv_s16
+npyv_permi128_s16_(npyv_s16 a, unsigned e0, unsigned e1)
+{
+    if (e0 == 1) {
+        if (e1 == 0) {
+            return npyv_permi128_s16(a, 1, 0);
+        }
+        if (e1 == 1) {
+            return npyv_permi128_s16(a, 1, 1);
+        }
+    }
+    else if (e0 == 0 && e1 == 1) {
+        return npyv_permi128_s16(a, 0, 1);
+    }
+    // default, including out-of-range selectors
+    return npyv_permi128_s16(a, 0, 0);
+}
+SIMD_IMPL_INTRIN_3(permi128_s16_, vs16, vs16, u8, u8)
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+// Shift wrappers are compiled because the guard's literal (15) is positive.
+#if 15 > 0
+// shift amount passed as a runtime u8
+SIMD_IMPL_INTRIN_2(shl_s16, vs16, vs16, u8)
+SIMD_IMPL_INTRIN_2(shr_s16, vs16, vs16, u8)
+// immediate constant
+// NOTE(review): the trailing literal (15 / 16) is presumably the bound on the
+// immediate shift amount - confirm against SIMD_IMPL_INTRIN_2IMM.
+SIMD_IMPL_INTRIN_2IMM(shli_s16, vs16, vs16, 15)
+SIMD_IMPL_INTRIN_2IMM(shri_s16, vs16, vs16, 16)
+#endif // shl_imm
+
+// bitwise logic on two s16 vectors
+#line 418
+SIMD_IMPL_INTRIN_2(and_s16, vs16, vs16, vs16)
+
+#line 418
+SIMD_IMPL_INTRIN_2(or_s16, vs16, vs16, vs16)
+
+#line 418
+SIMD_IMPL_INTRIN_2(xor_s16, vs16, vs16, vs16)
+
+
+SIMD_IMPL_INTRIN_1(not_s16, vs16, vs16)
+
+// comparisons: two s16 vectors in, a 16-bit boolean vector (vb16) out
+#line 426
+SIMD_IMPL_INTRIN_2(cmpeq_s16, vb16, vs16, vs16)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpneq_s16, vb16, vs16, vs16)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpgt_s16, vb16, vs16, vs16)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpge_s16, vb16, vs16, vs16)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmplt_s16, vb16, vs16, vs16)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmple_s16, vb16, vs16, vs16)
+
+
+// not compiled for s16 (guard is a literal 0)
+#if 0
+SIMD_IMPL_INTRIN_2(andc_s16, vs16, vs16, vs16)
+SIMD_IMPL_INTRIN_2(andc_b16, vb16, vb16, vb16)
+SIMD_IMPL_INTRIN_2(orc_b16, vb16, vb16, vb16)
+SIMD_IMPL_INTRIN_2(xnor_b16, vb16, vb16, vb16)
+#endif
+
+// tests across all vector lanes; the result is a scalar u8
+#line 440
+SIMD_IMPL_INTRIN_1(any_s16, u8, vs16)
+
+#line 440
+SIMD_IMPL_INTRIN_1(all_s16, u8, vs16)
+
+/***************************
+ * Conversion
+ ***************************/
+// conversions between the 16-bit boolean vector (vb16) and the s16 vector
+SIMD_IMPL_INTRIN_1(cvt_s16_b16, vs16,  vb16)
+SIMD_IMPL_INTRIN_1(cvt_b16_s16, vb16, vs16)
+// expand is not compiled for s16 (guard is a literal 0)
+#if 0
+SIMD_IMPL_INTRIN_1(expand_s16_s16, vs16x2, vs16)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+// Arithmetic wrappers for s16. Groups guarded by a literal 1 are compiled;
+// groups guarded by a literal 0 (div, fused multiply, sum, sumup) are not.
+#line 456
+SIMD_IMPL_INTRIN_2(add_s16, vs16, vs16, vs16)
+
+#line 456
+SIMD_IMPL_INTRIN_2(sub_s16, vs16, vs16, vs16)
+
+
+// adds/subs: enabled (the closing comment names the guard "sat_sup")
+#if 1
+#line 463
+SIMD_IMPL_INTRIN_2(adds_s16, vs16, vs16, vs16)
+
+#line 463
+SIMD_IMPL_INTRIN_2(subs_s16, vs16, vs16, vs16)
+
+#endif // sat_sup
+
+#if 1
+SIMD_IMPL_INTRIN_2(mul_s16, vs16, vs16, vs16)
+#endif // mul_sup
+
+// disabled for s16
+#if 0
+SIMD_IMPL_INTRIN_2(div_s16, vs16, vs16, vs16)
+#endif // div_sup
+
+// divisor_s16 produces a vs16x3 from a scalar; divc_s16 takes such a vs16x3
+// as its second argument
+#if 1
+SIMD_IMPL_INTRIN_1(divisor_s16, vs16x3, s16)
+SIMD_IMPL_INTRIN_2(divc_s16, vs16, vs16, vs16x3)
+#endif // intdiv_sup
+
+// disabled for s16
+#if 0
+#line 484
+SIMD_IMPL_INTRIN_3(muladd_s16, vs16, vs16, vs16, vs16)
+
+#line 484
+SIMD_IMPL_INTRIN_3(mulsub_s16, vs16, vs16, vs16, vs16)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmuladd_s16, vs16, vs16, vs16, vs16)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmulsub_s16, vs16, vs16, vs16, vs16)
+
+#line 484
+SIMD_IMPL_INTRIN_3(muladdsub_s16, vs16, vs16, vs16, vs16)
+
+#endif // fused_sup
+
+// disabled for s16
+#if 0
+SIMD_IMPL_INTRIN_1(sum_s16, s16, vs16)
+#endif // sum_sup
+
+// disabled for s16
+#if 0
+SIMD_IMPL_INTRIN_1(sumup_s16, s16, vs16)
+#endif // sumup_sup
+
+/***************************
+ * Math
+ ***************************/
+// The unary math wrappers below are not compiled for s16 (literal 0 guard).
+#if 0
+#line 503
+SIMD_IMPL_INTRIN_1(sqrt_s16, vs16, vs16)
+
+#line 503
+SIMD_IMPL_INTRIN_1(recip_s16, vs16, vs16)
+
+#line 503
+SIMD_IMPL_INTRIN_1(abs_s16, vs16, vs16)
+
+#line 503
+SIMD_IMPL_INTRIN_1(square_s16, vs16, vs16)
+
+#line 503
+SIMD_IMPL_INTRIN_1(rint_s16, vs16, vs16)
+
+#line 503
+SIMD_IMPL_INTRIN_1(ceil_s16, vs16, vs16)
+
+#line 503
+SIMD_IMPL_INTRIN_1(trunc_s16, vs16, vs16)
+
+#line 503
+SIMD_IMPL_INTRIN_1(floor_s16, vs16, vs16)
+
+#endif
+
+// max/min: a two-vector form returning a vector, and a reduce_ form
+// returning a scalar s16
+#line 510
+SIMD_IMPL_INTRIN_2(max_s16, vs16, vs16, vs16)
+SIMD_IMPL_INTRIN_1(reduce_max_s16, s16, vs16)
+
+#line 510
+SIMD_IMPL_INTRIN_2(min_s16, vs16, vs16, vs16)
+SIMD_IMPL_INTRIN_1(reduce_min_s16, s16, vs16)
+
+
+// the maxp/minp/maxn/minn variants are not compiled for s16 (literal 0 guard)
+#if 0
+#line 518
+SIMD_IMPL_INTRIN_2(maxp_s16, vs16, vs16, vs16)
+SIMD_IMPL_INTRIN_1(reduce_maxp_s16, s16, vs16)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minp_s16, vs16, vs16, vs16)
+SIMD_IMPL_INTRIN_1(reduce_minp_s16, s16, vs16)
+
+#line 518
+SIMD_IMPL_INTRIN_2(maxn_s16, vs16, vs16, vs16)
+SIMD_IMPL_INTRIN_1(reduce_maxn_s16, s16, vs16)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minn_s16, vs16, vs16, vs16)
+SIMD_IMPL_INTRIN_1(reduce_minn_s16, s16, vs16)
+
+/**end repeat1**/
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+// ifadd/ifsub take a vb16 mask followed by three s16 vectors.
+#line 530
+ SIMD_IMPL_INTRIN_4(ifadd_s16, vs16, vb16, vs16, vs16, vs16)
+
+#line 530
+ SIMD_IMPL_INTRIN_4(ifsub_s16, vs16, vb16, vs16, vs16, vs16)
+
+
+// masked division is not compiled for s16 (guard is a literal 0)
+#if 0
+SIMD_IMPL_INTRIN_4(ifdiv_s16, vs16, vb16, vs16, vs16, vs16)
+SIMD_IMPL_INTRIN_3(ifdivz_s16, vs16, vb16, vs16, vs16)
+#endif
+
+#endif // simd_sup
+
+#line 36
+#if 1
+/***************************
+ * Memory
+ ***************************/
+#line 43
+SIMD_IMPL_INTRIN_1(load_u32, vu32, qu32)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loada_u32, vu32, qu32)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loads_u32, vu32, qu32)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loadl_u32, vu32, qu32)
+
+SIMD_IMPL_INTRIN_1(load_u32x2, vu32x2, qu32)
+
+#line 51
+// Hand-written wrapper for npyv_store_u32: the destination is a Python
+// sequence, so the lanes are stored into its converted buffer and then
+// written back to the original object. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_store_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu32};
+    simd_arg src = {.dtype = simd_data_vu32};
+    PyObject *result = NULL;
+
+    if (!PyArg_ParseTuple(
+        args, "O&O&:store_u32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_store_u32(dst.data.qu32, src.data.vu32);
+    // write-back; a zero return means success
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qu32, simd_data_qu32)) {
+        Py_INCREF(Py_None);
+        result = Py_None;
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+
+#line 51
+// Hand-written wrapper for npyv_storea_u32: the destination is a Python
+// sequence, so the lanes are stored into its converted buffer and then
+// written back to the original object. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_storea_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu32};
+    simd_arg src = {.dtype = simd_data_vu32};
+    PyObject *result = NULL;
+
+    if (!PyArg_ParseTuple(
+        args, "O&O&:storea_u32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_storea_u32(dst.data.qu32, src.data.vu32);
+    // write-back; a zero return means success
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qu32, simd_data_qu32)) {
+        Py_INCREF(Py_None);
+        result = Py_None;
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+
+#line 51
+// Hand-written wrapper for npyv_stores_u32: the destination is a Python
+// sequence, so the lanes are stored into its converted buffer and then
+// written back to the original object. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_stores_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu32};
+    simd_arg src = {.dtype = simd_data_vu32};
+    PyObject *result = NULL;
+
+    if (!PyArg_ParseTuple(
+        args, "O&O&:stores_u32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_stores_u32(dst.data.qu32, src.data.vu32);
+    // write-back; a zero return means success
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qu32, simd_data_qu32)) {
+        Py_INCREF(Py_None);
+        result = Py_None;
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+
+#line 51
+// Hand-written wrapper for npyv_storel_u32: the destination is a Python
+// sequence, so the lanes are stored into its converted buffer and then
+// written back to the original object. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_storel_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu32};
+    simd_arg src = {.dtype = simd_data_vu32};
+    PyObject *result = NULL;
+
+    if (!PyArg_ParseTuple(
+        args, "O&O&:storel_u32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_storel_u32(dst.data.qu32, src.data.vu32);
+    // write-back; a zero return means success
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qu32, simd_data_qu32)) {
+        Py_INCREF(Py_None);
+        result = Py_None;
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+
+#line 51
+// Hand-written wrapper for npyv_storeh_u32: the destination is a Python
+// sequence, so the lanes are stored into its converted buffer and then
+// written back to the original object. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_storeh_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu32};
+    simd_arg src = {.dtype = simd_data_vu32};
+    PyObject *result = NULL;
+
+    if (!PyArg_ParseTuple(
+        args, "O&O&:storeh_u32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_storeh_u32(dst.data.qu32, src.data.vu32);
+    // write-back; a zero return means success
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qu32, simd_data_qu32)) {
+        Py_INCREF(Py_None);
+        result = Py_None;
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+
+#line 51
+// Hand-written wrapper for npyv_store_u32x2: takes a pair of u32 vectors
+// (vu32x2), stores them into the converted buffer of the destination
+// sequence and writes that buffer back to the original object.
+// Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_store_u32x2(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu32};
+    simd_arg src = {.dtype = simd_data_vu32x2};
+    PyObject *result = NULL;
+
+    if (!PyArg_ParseTuple(
+        args, "O&O&:store_u32x2",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_store_u32x2(dst.data.qu32, src.data.vu32x2);
+    // write-back; a zero return means success
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qu32, simd_data_qu32)) {
+        Py_INCREF(Py_None);
+        result = Py_None;
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 1
+// Partial Load
+// load_till takes (sequence, u32, u32); load_tillz takes (sequence, u32).
+// NOTE(review): by name the first u32 is presumably a lane count and the
+// extra one a fill value - confirm against the npyv intrinsics.
+SIMD_IMPL_INTRIN_3(load_till_u32, vu32, qu32, u32, u32)
+SIMD_IMPL_INTRIN_2(load_tillz_u32, vu32, qu32, u32)
+// Both branches below expand to the same two wrappers for u32; the guard
+// (32 == 32) is constant true, so only the first branch is compiled.
+#if 32 == 32
+    SIMD_IMPL_INTRIN_4(load2_till_u32, vu32, qu32, u32, u32, u32)
+    SIMD_IMPL_INTRIN_2(load2_tillz_u32, vu32, qu32, u32)
+#else
+    SIMD_IMPL_INTRIN_4(load2_till_u32, vu32, qu32, u32, u32, u32)
+    SIMD_IMPL_INTRIN_2(load2_tillz_u32, vu32, qu32, u32)
+#endif
+
+// Partial Store
+#line 95
+#if !0 || 0 == 32
+// Wrapper for npyv_store_till_u32. Arguments: destination u32 sequence,
+// lane count (u32), source u32 vector. The converted buffer is written back
+// to the original sequence object after the store.
+// Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_store_till_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu32};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vu32};
+    PyObject *result = NULL;
+
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:store_till_u32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_store_till_u32(dst.data.qu32, count.data.u32, src.data.vu32);
+    // write-back; a zero return means success
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qu32, simd_data_qu32)) {
+        Py_INCREF(Py_None);
+        result = Py_None;
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+#endif // chksize
+
+
+#line 95
+#if !32 || 32 == 32
+// Binding for npyv_store2_till_u32(): parses (sequence, nlane, vector), stores through the
+// sequence's lane buffer, then writes that buffer back into the Python sequence object.
+static PyObject *
+simd__intrin_store2_till_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu32};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vu32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:store2_till_u32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_store2_till_u32(dst.data.qu32, count.data.u32, src.data.vu32);
+    PyObject *result = NULL;  // stays NULL when the write-back below reports failure
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qu32, simd_data_qu32)) {
+        Py_INCREF(Py_None);   // success: return a new reference to None
+        result = Py_None;
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+#endif // chksize
+
+
+#line 95
+#if !64 || 64 == 32
+// Binding for npyv_store2_till_u32(): parses (sequence, nlane, vector), stores through the
+// sequence's lane buffer, then writes that buffer back into the Python sequence object.
+static PyObject *
+simd__intrin_store2_till_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu32};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vu32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:store2_till_u32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    )) {
+        return NULL;
+    }
+    npyv_store2_till_u32(dst.data.qu32, count.data.u32, src.data.vu32);
+    PyObject *result = NULL;  // stays NULL when the write-back below reports failure
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qu32, simd_data_qu32)) {
+        Py_INCREF(Py_None);   // success: return a new reference to None
+        result = Py_None;
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+#endif // chksize
+
+
+// Non-contiguous Load
+#line 136
+#if !0 || 0 == 32
+// Python binding for npyv_loadn_u32(): parses (sequence, stride), checks the sequence
+// length against the stride, and returns the loaded vu32 vector as a Python object.
+static PyObject *
+simd__intrin_loadn_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_u32};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn_u32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u32 *seq_ptr = seq_arg.data.qu32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u32;
+    if (stride < 0) {  // negative stride: begin at the tail end of the buffer
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {  // sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,  // %zd: the arguments are Py_ssize_t-sized, not int
+            "loadn_u32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u32 rvec = npyv_loadn_u32(
+        seq_ptr, stride
+    #if 0
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.u32
+    #endif
+    #if 0
+        , fill2_arg.data.u32
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vu32, .data = {.vu32=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 32
+// Python binding for npyv_loadn2_u32(): parses (sequence, stride), checks the sequence
+// length against the stride, and returns the loaded vu32 vector as a Python object.
+static PyObject *
+simd__intrin_loadn2_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_u32};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_u32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u32 *seq_ptr = seq_arg.data.qu32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u32;
+    if (stride < 0) {  // negative stride: begin at the last pair of the buffer
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {  // sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,  // %zd: the arguments are Py_ssize_t-sized, not int
+            "loadn2_u32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u32 rvec = npyv_loadn2_u32(
+        seq_ptr, stride
+    #if 0
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.u32
+    #endif
+    #if 0
+        , fill2_arg.data.u32
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vu32, .data = {.vu32=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 32
+// Python binding for npyv_loadn2_u32(): parses (sequence, stride), checks the sequence
+// length against the stride, and returns the loaded vu32 vector as a Python object.
+static PyObject *
+simd__intrin_loadn2_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_u32};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_u32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u32 *seq_ptr = seq_arg.data.qu32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u32;
+    if (stride < 0) {  // negative stride: begin at the last pair of the buffer
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {  // sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,  // %zd: the arguments are Py_ssize_t-sized, not int
+            "loadn2_u32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u32 rvec = npyv_loadn2_u32(
+        seq_ptr, stride
+    #if 0
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.u32
+    #endif
+    #if 0
+        , fill2_arg.data.u32
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vu32, .data = {.vu32=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 32
+// Python binding for npyv_loadn_till_u32(): parses (sequence, stride, nlane, fill), checks the
+// sequence length against the stride, and returns the loaded vu32 vector as a Python object.
+static PyObject *
+simd__intrin_loadn_till_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 1
+    simd_arg fill_arg = {.dtype = simd_data_u32};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:loadn_till_u32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u32 *seq_ptr = seq_arg.data.qu32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u32;
+    if (stride < 0) {  // negative stride: begin at the tail end of the buffer
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {  // sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,  // %zd: the arguments are Py_ssize_t-sized, not int
+            "loadn_till_u32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u32 rvec = npyv_loadn_till_u32(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 1
+        , fill_arg.data.u32
+    #endif
+    #if 0
+        , fill2_arg.data.u32
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vu32, .data = {.vu32=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 32
+// Python binding for npyv_loadn2_till_u32(): parses (sequence, stride, nlane, fill, fill2), checks
+// the sequence length against the stride, and returns the loaded vu32 vector as a Python object.
+static PyObject *
+simd__intrin_loadn2_till_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 1
+    simd_arg fill_arg = {.dtype = simd_data_u32};
+#endif
+#if 1
+    simd_arg fill2_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_u32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u32 *seq_ptr = seq_arg.data.qu32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u32;
+    if (stride < 0) {  // negative stride: begin at the last pair of the buffer
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {  // sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,  // %zd: the arguments are Py_ssize_t-sized, not int
+            "loadn2_till_u32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u32 rvec = npyv_loadn2_till_u32(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 1
+        , fill_arg.data.u32
+    #endif
+    #if 1
+        , fill2_arg.data.u32
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vu32, .data = {.vu32=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 32
+// Python binding for npyv_loadn2_till_u32(): parses (sequence, stride, nlane, fill, fill2), checks
+// the sequence length against the stride, and returns the loaded vu32 vector as a Python object.
+static PyObject *
+simd__intrin_loadn2_till_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 1
+    simd_arg fill_arg = {.dtype = simd_data_u32};
+#endif
+#if 1
+    simd_arg fill2_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_u32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u32 *seq_ptr = seq_arg.data.qu32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u32;
+    if (stride < 0) {  // negative stride: begin at the last pair of the buffer
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {  // sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,  // %zd: the arguments are Py_ssize_t-sized, not int
+            "loadn2_till_u32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u32 rvec = npyv_loadn2_till_u32(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 1
+        , fill_arg.data.u32
+    #endif
+    #if 1
+        , fill2_arg.data.u32
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vu32, .data = {.vu32=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 32
+// Python binding for npyv_loadn_tillz_u32(): parses (sequence, stride, nlane), checks the
+// sequence length against the stride, and returns the loaded vu32 vector as a Python object.
+static PyObject *
+simd__intrin_loadn_tillz_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_u32};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn_tillz_u32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u32 *seq_ptr = seq_arg.data.qu32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u32;
+    if (stride < 0) {  // negative stride: begin at the tail end of the buffer
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {  // sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,  // %zd: the arguments are Py_ssize_t-sized, not int
+            "loadn_tillz_u32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u32 rvec = npyv_loadn_tillz_u32(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.u32
+    #endif
+    #if 0
+        , fill2_arg.data.u32
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vu32, .data = {.vu32=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 32
+// Python binding for npyv_loadn2_tillz_u32(): parses (sequence, stride, nlane), checks the
+// sequence length against the stride, and returns the loaded vu32 vector as a Python object.
+static PyObject *
+simd__intrin_loadn2_tillz_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_u32};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_u32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u32 *seq_ptr = seq_arg.data.qu32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u32;
+    if (stride < 0) {  // negative stride: begin at the last pair of the buffer
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {  // sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,  // %zd: the arguments are Py_ssize_t-sized, not int
+            "loadn2_tillz_u32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u32 rvec = npyv_loadn2_tillz_u32(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.u32
+    #endif
+    #if 0
+        , fill2_arg.data.u32
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vu32, .data = {.vu32=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 32
+// Python binding for npyv_loadn2_tillz_u32(): parses (sequence, stride, nlane), checks the
+// sequence length against the stride, and returns the loaded vu32 vector as a Python object.
+static PyObject *
+simd__intrin_loadn2_tillz_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_u32};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_u32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u32 *seq_ptr = seq_arg.data.qu32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u32;
+    if (stride < 0) {  // negative stride: begin at the last pair of the buffer
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {  // sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,  // %zd: the arguments are Py_ssize_t-sized, not int
+            "loadn2_tillz_u32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u32 rvec = npyv_loadn2_tillz_u32(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.u32
+    #endif
+    #if 0
+        , fill2_arg.data.u32
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vu32, .data = {.vu32=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+
+// Non-contiguous Store
+#line 216
+#if !0 || 0 == 32
+// Python binding for npyv_storen_u32(): parses (sequence, stride, vector), length-checks, stores, writes back.
+static PyObject *
+simd__intrin_storen_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vu32};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen_u32",
+        simd_arg_converter, &seq_arg, simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u32 *seq_ptr = seq_arg.data.qu32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u32;
+    if (stride < 0) {  // negative stride: begin at the tail end of the buffer
+        seq_ptr += cur_seq_len - 1*1;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: reject sequences shorter than |stride| * nlanes (%zd matches the Py_ssize_t arguments)
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen_u32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen_u32(
+        seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vu32
+    );
+    // write-back: copy the modified lane buffer into the Python sequence; non-zero means failure
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu32, simd_data_qu32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 32
+// Python binding for npyv_storen2_u32(): parses (sequence, stride, vector), length-checks, stores, writes back.
+static PyObject *
+simd__intrin_storen2_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vu32};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_u32",
+        simd_arg_converter, &seq_arg, simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u32 *seq_ptr = seq_arg.data.qu32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u32;
+    if (stride < 0) {  // negative stride: begin at the last pair of the buffer
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: reject sequences shorter than |stride| * nlanes (%zd matches the Py_ssize_t arguments)
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_u32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_u32(
+        seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vu32
+    );
+    // write-back: copy the modified lane buffer into the Python sequence; non-zero means failure
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu32, simd_data_qu32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 32
+// Python binding for npyv_storen2_u32(): parses (sequence, stride, vector), length-checks, stores, writes back.
+static PyObject *
+simd__intrin_storen2_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vu32};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_u32",
+        simd_arg_converter, &seq_arg, simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u32 *seq_ptr = seq_arg.data.qu32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u32;
+    if (stride < 0) {  // negative stride: begin at the last pair of the buffer
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: reject sequences shorter than |stride| * nlanes (%zd matches the Py_ssize_t arguments)
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_u32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_u32(
+        seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vu32
+    );
+    // write-back: copy the modified lane buffer into the Python sequence; non-zero means failure
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu32, simd_data_qu32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !0 || 0 == 32
+// Python binding for npyv_storen_till_u32(): parses (sequence, stride, nlane, vector), length-checks, stores, writes back.
+static PyObject *
+simd__intrin_storen_till_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vu32};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen_till_u32",
+        simd_arg_converter, &seq_arg, simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u32 *seq_ptr = seq_arg.data.qu32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u32;
+    if (stride < 0) {  // negative stride: begin at the tail end of the buffer
+        seq_ptr += cur_seq_len - 1*1;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: reject sequences shorter than |stride| * nlanes (%zd matches the Py_ssize_t arguments)
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen_till_u32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen_till_u32(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vu32
+    );
+    // write-back: copy the modified lane buffer into the Python sequence; non-zero means failure
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu32, simd_data_qu32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 32
+// Python binding for npyv_storen2_till_u32(): parses (sequence, stride, nlane, vector), length-checks, stores, writes back.
+static PyObject *
+simd__intrin_storen2_till_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vu32};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_u32",
+        simd_arg_converter, &seq_arg, simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u32 *seq_ptr = seq_arg.data.qu32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u32;
+    if (stride < 0) {  // negative stride: begin at the last pair of the buffer
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: reject sequences shorter than |stride| * nlanes (%zd matches the Py_ssize_t arguments)
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_u32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_till_u32(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vu32
+    );
+    // write-back: copy the modified lane buffer into the Python sequence; non-zero means failure
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu32, simd_data_qu32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 32
+// Python binding for npyv_storen2_till_u32(): parses (sequence, stride, nlane, vector), length-checks, stores, writes back.
+static PyObject *
+simd__intrin_storen2_till_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vu32};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_u32",
+        simd_arg_converter, &seq_arg, simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u32 *seq_ptr = seq_arg.data.qu32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u32;
+    if (stride < 0) {  // negative stride: begin at the last pair of the buffer
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: reject sequences shorter than |stride| * nlanes (%zd matches the Py_ssize_t arguments)
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_u32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_till_u32(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vu32
+    );
+    // write-back: copy the modified lane buffer into the Python sequence; non-zero means failure
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu32, simd_data_qu32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#endif // 1
+
+/****************************
+ * Lookup tables
+ ****************************/
+#if 32 == 32 // always true in this u32 instantiation: lut32 is emitted
+SIMD_IMPL_INTRIN_2(lut32_u32, vu32, qu32, vu32)
+#endif // 32 == 32
+#if 32 == 64 // always false in this u32 instantiation: lut16 is not emitted
+SIMD_IMPL_INTRIN_2(lut16_u32, vu32, qu32, vu32)
+#endif // 32 == 64
+/***************************
+ * Misc: zero / extract0 / setall / select wrappers for u32
+ ***************************/
+SIMD_IMPL_INTRIN_0(zero_u32, vu32)
+SIMD_IMPL_INTRIN_1(extract0_u32, u32, vu32)
+SIMD_IMPL_INTRIN_1(setall_u32, vu32, u32)
+SIMD_IMPL_INTRIN_3(select_u32, vu32, vb32, vu32, vu32)
+
+#line 296
+#if 1 // reinterpret_<to>_u32 wrappers: one instantiation per target lane type, each taking a vu32
+SIMD_IMPL_INTRIN_1(reinterpret_u8_u32, vu8, vu32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s8_u32, vs8, vu32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u16_u32, vu16, vu32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s16_u32, vs16, vu32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u32_u32, vu32, vu32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s32_u32, vs32, vu32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u64_u32, vu64, vu32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s64_u32, vs64, vu32)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F32 // the f32 target is only emitted when NPY_SIMD_F32 is non-zero
+SIMD_IMPL_INTRIN_1(reinterpret_f32_u32, vf32, vu32)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F64 // the f64 target is only emitted when NPY_SIMD_F64 is non-zero
+SIMD_IMPL_INTRIN_1(reinterpret_f64_u32, vf64, vu32)
+#endif // simd_sup2
+
+
+/**
+ * special definition due to the nature of intrinsics
+ * npyv_setf_u32 and npyv_set_u32.
+*/
+#line 308
+// Binding for npyv_setf_u32(): turns the argument tuple into u32 lanes, feeds them to the intrinsic, wraps the vu32.
+// NOTE(review): lanes[0..64] are all named; presumably the intrinsic is a macro that uses only what it needs - confirm.
+static PyObject *
+simd__intrin_setf_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    npyv_lanetype_u32 *lanes = simd_sequence_from_iterable(args, simd_data_qu32, npyv_nlanes_u32);
+    if (!lanes) { return NULL; }
+    simd_data packed = {.vu32 = npyv_setf_u32(
+        lanes[0],  lanes[1],  lanes[2],  lanes[3],  lanes[4],  lanes[5],  lanes[6],  lanes[7],
+        lanes[8],  lanes[9],  lanes[10], lanes[11], lanes[12], lanes[13], lanes[14], lanes[15],
+        lanes[16], lanes[17], lanes[18], lanes[19], lanes[20], lanes[21], lanes[22], lanes[23],
+        lanes[24], lanes[25], lanes[26], lanes[27], lanes[28], lanes[29], lanes[30], lanes[31],
+        lanes[32], lanes[33], lanes[34], lanes[35], lanes[36], lanes[37], lanes[38], lanes[39],
+        lanes[40], lanes[41], lanes[42], lanes[43], lanes[44], lanes[45], lanes[46], lanes[47],
+        lanes[48], lanes[49], lanes[50], lanes[51], lanes[52], lanes[53], lanes[54], lanes[55],
+        lanes[56], lanes[57], lanes[58], lanes[59], lanes[60], lanes[61], lanes[62], lanes[63],
+        lanes[64] // for setf
+    )};
+    simd_sequence_free(lanes);
+    return (PyObject*)PySIMDVector_FromData(packed, simd_data_vu32);
+}
+
+#line 308
+// Binding for npyv_set_u32(): turns the argument tuple into u32 lanes, feeds them to the intrinsic, wraps the vu32.
+// NOTE(review): lanes[0..64] are all named; presumably the intrinsic is a macro that uses only what it needs - confirm.
+static PyObject *
+simd__intrin_set_u32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    npyv_lanetype_u32 *lanes = simd_sequence_from_iterable(args, simd_data_qu32, npyv_nlanes_u32);
+    if (!lanes) { return NULL; }
+    simd_data packed = {.vu32 = npyv_set_u32(
+        lanes[0],  lanes[1],  lanes[2],  lanes[3],  lanes[4],  lanes[5],  lanes[6],  lanes[7],
+        lanes[8],  lanes[9],  lanes[10], lanes[11], lanes[12], lanes[13], lanes[14], lanes[15],
+        lanes[16], lanes[17], lanes[18], lanes[19], lanes[20], lanes[21], lanes[22], lanes[23],
+        lanes[24], lanes[25], lanes[26], lanes[27], lanes[28], lanes[29], lanes[30], lanes[31],
+        lanes[32], lanes[33], lanes[34], lanes[35], lanes[36], lanes[37], lanes[38], lanes[39],
+        lanes[40], lanes[41], lanes[42], lanes[43], lanes[44], lanes[45], lanes[46], lanes[47],
+        lanes[48], lanes[49], lanes[50], lanes[51], lanes[52], lanes[53], lanes[54], lanes[55],
+        lanes[56], lanes[57], lanes[58], lanes[59], lanes[60], lanes[61], lanes[62], lanes[63],
+        lanes[64] // for setf
+    )};
+    simd_sequence_free(lanes);
+    return (PyObject*)PySIMDVector_FromData(packed, simd_data_vu32);
+}
+
+
+/***************************
+ * Reorder
+ ***************************/
+// combinel/combineh are declared with a single-vector (vu32) first type tag,
+// while combine/zip/unzip use the two-vector tag vu32x2.
+// NOTE(review): SIMD_IMPL_INTRIN_2 is defined outside this chunk; presumably
+// its arguments are (intrinsic name, return type tag, two argument type tags).
+#line 337
+SIMD_IMPL_INTRIN_2(combinel_u32, vu32, vu32, vu32)
+
+#line 337
+SIMD_IMPL_INTRIN_2(combineh_u32, vu32, vu32, vu32)
+
+
+#line 343
+SIMD_IMPL_INTRIN_2(combine_u32, vu32x2, vu32, vu32)
+
+#line 343
+SIMD_IMPL_INTRIN_2(zip_u32, vu32x2, vu32, vu32)
+
+#line 343
+SIMD_IMPL_INTRIN_2(unzip_u32, vu32x2, vu32, vu32)
+
+
+// rev64 is compiled unconditionally for this suffix (#if 1).
+#if 1
+SIMD_IMPL_INTRIN_1(rev64_u32, vu32, vu32)
+#endif
+
+// special implementation to convert runtime constants to immediate values
+#if 32 == 32
+// one call for element index then gather them within one vector
+// instead of unroll the 255 possible cases.
+// Runtime-index front end for npyv_permi128_u32, which is only ever called
+// here with literal lane indices. For each of the four requested indices the
+// source vector is permuted with that index repeated in all four positions;
+// the wanted lanes are then gathered from the four results through memory.
+// Any index other than 1, 2 or 3 is handled as 0.
+NPY_FINLINE npyv_u32
+npyv_permi128_u32_(npyv_u32 a, unsigned e0, unsigned e1, unsigned e2, unsigned e3)
+{
+    const unsigned wanted[4] = {e0, e1, e2, e3};
+    npyv_u32 bcast[4];
+    npyv_lanetype_u32 lanes[4][npyv_nlanes_u32];
+
+    for (int k = 0; k < 4; ++k) {
+        // dispatch the runtime index to a call with constant operands
+        switch (wanted[k]) {
+        case 1:
+            bcast[k] = npyv_permi128_u32(a, 1, 1, 1, 1);
+            break;
+        case 2:
+            bcast[k] = npyv_permi128_u32(a, 2, 2, 2, 2);
+            break;
+        case 3:
+            bcast[k] = npyv_permi128_u32(a, 3, 3, 3, 3);
+            break;
+        default:
+            bcast[k] = npyv_permi128_u32(a, 0, 0, 0, 0);
+            break;
+        }
+        npyv_store_u32(lanes[k], bcast[k]);
+    }
+
+    // all four indices equal: the first permuted vector is already the answer
+    if (e0 == e1 && e0 == e2 && e0 == e3) {
+        return bcast[0];
+    }
+    // Within every group of four lanes, position 0 stays from the e0 result
+    // and positions 1..3 are taken from the e1, e2 and e3 results.
+    for (int base = 0; base < npyv_nlanes_u32; base += 4) {
+        lanes[0][base+1] = lanes[1][base+1];
+        lanes[0][base+2] = lanes[2][base+2];
+        lanes[0][base+3] = lanes[3][base+3];
+    }
+    return npyv_load_u32(lanes[0]);
+}
+SIMD_IMPL_INTRIN_5(permi128_u32_, vu32, vu32, u8, u8, u8, u8)
+#elif 32 == 64
+// Two-index variant of the runtime-index front end. It sits in the
+// `#elif 32 == 64` branch, so it is not compiled for this suffix.
+// Only the pairs (1,0), (0,1) and (1,1) are mapped to themselves; every other
+// combination, including out-of-range indices, yields the (0,0) permutation.
+NPY_FINLINE npyv_u32
+npyv_permi128_u32_(npyv_u32 a, unsigned e0, unsigned e1)
+{
+    // pack both indices into one selector; anything out of range selects 0
+    const unsigned selector = (e0 < 2 && e1 < 2) ? (e0 * 2 + e1) : 0;
+    switch (selector) {
+    case 2:
+        return npyv_permi128_u32(a, 1, 0);
+    case 1:
+        return npyv_permi128_u32(a, 0, 1);
+    case 3:
+        return npyv_permi128_u32(a, 1, 1);
+    default:
+        return npyv_permi128_u32(a, 0, 0);
+    }
+}
+SIMD_IMPL_INTRIN_3(permi128_u32_, vu32, vu32, u8, u8)
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+// Shift wrappers are emitted only when the constant in the condition below
+// (31 for this suffix) is positive. The non-immediate forms take a u8 second
+// argument tag; the *i forms pass a numeric limit (31 / 32) to the
+// SIMD_IMPL_INTRIN_2IMM macro instead.
+// NOTE(review): the macros are defined outside this chunk; the exact meaning
+// of the numeric limit should be confirmed there.
+#if 31 > 0
+SIMD_IMPL_INTRIN_2(shl_u32, vu32, vu32, u8)
+SIMD_IMPL_INTRIN_2(shr_u32, vu32, vu32, u8)
+// immediate constant
+SIMD_IMPL_INTRIN_2IMM(shli_u32, vu32, vu32, 31)
+SIMD_IMPL_INTRIN_2IMM(shri_u32, vu32, vu32, 32)
+#endif // shl_imm
+
+// bitwise logic on vu32
+#line 418
+SIMD_IMPL_INTRIN_2(and_u32, vu32, vu32, vu32)
+
+#line 418
+SIMD_IMPL_INTRIN_2(or_u32, vu32, vu32, vu32)
+
+#line 418
+SIMD_IMPL_INTRIN_2(xor_u32, vu32, vu32, vu32)
+
+
+SIMD_IMPL_INTRIN_1(not_u32, vu32, vu32)
+
+// comparisons: the first type tag is the boolean vector vb32
+#line 426
+SIMD_IMPL_INTRIN_2(cmpeq_u32, vb32, vu32, vu32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpneq_u32, vb32, vu32, vu32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpgt_u32, vb32, vu32, vu32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpge_u32, vb32, vu32, vu32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmplt_u32, vb32, vu32, vu32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmple_u32, vb32, vu32, vu32)
+
+
+// andc/orc/xnor wrappers are disabled for this suffix (#if 0)
+#if 0
+SIMD_IMPL_INTRIN_2(andc_u32, vu32, vu32, vu32)
+SIMD_IMPL_INTRIN_2(andc_b32, vb32, vb32, vb32)
+SIMD_IMPL_INTRIN_2(orc_b32, vb32, vb32, vb32)
+SIMD_IMPL_INTRIN_2(xnor_b32, vb32, vb32, vb32)
+#endif
+
+// test across all vector lanes
+#line 440
+SIMD_IMPL_INTRIN_1(any_u32, u8, vu32)
+
+#line 440
+SIMD_IMPL_INTRIN_1(all_u32, u8, vu32)
+
+/***************************
+ * Conversion
+ ***************************/
+// Conversions between the u32 vector tag and the matching boolean tag (vb32),
+// in both directions. The expand wrapper is disabled for this suffix (#if 0).
+SIMD_IMPL_INTRIN_1(cvt_u32_b32, vu32,  vb32)
+SIMD_IMPL_INTRIN_1(cvt_b32_u32, vb32, vu32)
+#if 0
+SIMD_IMPL_INTRIN_1(expand_u32_u32, vu32x2, vu32)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+// For this suffix the enabled (#if 1 or unconditional) wrappers are: add, sub,
+// mul, divisor/divc and sum. Saturated add/sub, div, the fused multiply
+// family and sumup are compiled out (#if 0).
+#line 456
+SIMD_IMPL_INTRIN_2(add_u32, vu32, vu32, vu32)
+
+#line 456
+SIMD_IMPL_INTRIN_2(sub_u32, vu32, vu32, vu32)
+
+
+#if 0
+#line 463
+SIMD_IMPL_INTRIN_2(adds_u32, vu32, vu32, vu32)
+
+#line 463
+SIMD_IMPL_INTRIN_2(subs_u32, vu32, vu32, vu32)
+
+#endif // sat_sup
+
+#if 1
+SIMD_IMPL_INTRIN_2(mul_u32, vu32, vu32, vu32)
+#endif // mul_sup
+
+#if 0
+SIMD_IMPL_INTRIN_2(div_u32, vu32, vu32, vu32)
+#endif // div_sup
+
+// divisor_u32 is declared with the three-vector tag vu32x3 and a scalar u32;
+// divc_u32 takes a vu32 together with a vu32x3.
+#if 1
+SIMD_IMPL_INTRIN_1(divisor_u32, vu32x3, u32)
+SIMD_IMPL_INTRIN_2(divc_u32, vu32, vu32, vu32x3)
+#endif // intdiv_sup
+
+#if 0
+#line 484
+SIMD_IMPL_INTRIN_3(muladd_u32, vu32, vu32, vu32, vu32)
+
+#line 484
+SIMD_IMPL_INTRIN_3(mulsub_u32, vu32, vu32, vu32, vu32)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmuladd_u32, vu32, vu32, vu32, vu32)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmulsub_u32, vu32, vu32, vu32, vu32)
+
+#line 484
+SIMD_IMPL_INTRIN_3(muladdsub_u32, vu32, vu32, vu32, vu32)
+
+#endif // fused_sup
+
+#if 1
+SIMD_IMPL_INTRIN_1(sum_u32, u32, vu32)
+#endif // sum_sup
+
+#if 0
+SIMD_IMPL_INTRIN_1(sumup_u32, u32, vu32)
+#endif // sumup_sup
+
+/***************************
+ * Math
+ ***************************/
+// The unary math wrappers (sqrt, recip, abs, square, rint, ceil, trunc,
+// floor) are compiled out for this suffix (#if 0).
+#if 0
+#line 503
+SIMD_IMPL_INTRIN_1(sqrt_u32, vu32, vu32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(recip_u32, vu32, vu32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(abs_u32, vu32, vu32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(square_u32, vu32, vu32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(rint_u32, vu32, vu32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(ceil_u32, vu32, vu32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(trunc_u32, vu32, vu32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(floor_u32, vu32, vu32)
+
+#endif
+
+// max/min are always compiled; each comes with a reduce_* wrapper whose first
+// type tag is the scalar u32.
+#line 510
+SIMD_IMPL_INTRIN_2(max_u32, vu32, vu32, vu32)
+SIMD_IMPL_INTRIN_1(reduce_max_u32, u32, vu32)
+
+#line 510
+SIMD_IMPL_INTRIN_2(min_u32, vu32, vu32, vu32)
+SIMD_IMPL_INTRIN_1(reduce_min_u32, u32, vu32)
+
+
+// The maxp/minp/maxn/minn variants are compiled out for this suffix (#if 0).
+#if 0
+#line 518
+SIMD_IMPL_INTRIN_2(maxp_u32, vu32, vu32, vu32)
+SIMD_IMPL_INTRIN_1(reduce_maxp_u32, u32, vu32)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minp_u32, vu32, vu32, vu32)
+SIMD_IMPL_INTRIN_1(reduce_minp_u32, u32, vu32)
+
+#line 518
+SIMD_IMPL_INTRIN_2(maxn_u32, vu32, vu32, vu32)
+SIMD_IMPL_INTRIN_1(reduce_maxn_u32, u32, vu32)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minn_u32, vu32, vu32, vu32)
+SIMD_IMPL_INTRIN_1(reduce_minn_u32, u32, vu32)
+
+/**end repeat1**/
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+// ifadd/ifsub are declared with a boolean vector tag (vb32) followed by three
+// vu32 tags. The ifdiv/ifdivz wrappers are compiled out for this suffix.
+#line 530
+ SIMD_IMPL_INTRIN_4(ifadd_u32, vu32, vb32, vu32, vu32, vu32)
+
+#line 530
+ SIMD_IMPL_INTRIN_4(ifsub_u32, vu32, vb32, vu32, vu32, vu32)
+
+
+#if 0
+SIMD_IMPL_INTRIN_4(ifdiv_u32, vu32, vb32, vu32, vu32, vu32)
+SIMD_IMPL_INTRIN_3(ifdivz_u32, vu32, vb32, vu32, vu32)
+#endif
+
+#endif // simd_sup
+
+#line 36
+#if 1
+/***************************
+ * Memory
+ ***************************/
+// Contiguous load wrappers for the s32 suffix: each is declared with the
+// vector tag vs32 (vs32x2 for load_s32x2) and the sequence tag qs32.
+// NOTE(review): SIMD_IMPL_INTRIN_1 is defined outside this chunk; presumably
+// its arguments are (intrinsic name, return type tag, argument type tag).
+#line 43
+SIMD_IMPL_INTRIN_1(load_s32, vs32, qs32)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loada_s32, vs32, qs32)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loads_s32, vs32, qs32)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loadl_s32, vs32, qs32)
+
+SIMD_IMPL_INTRIN_1(load_s32x2, vs32x2, qs32)
+
+#line 51
+// Hand-written wrapper for npyv_store_s32: parses (sequence, vector), stores
+// the vector into the C buffer of the sequence and then writes that buffer
+// back into the Python object. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_store_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs32};
+    simd_arg src = {.dtype = simd_data_vs32};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&:store_s32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store_s32(dst.data.qs32, src.data.vs32);
+    // write-back
+    const int fill_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qs32, simd_data_qs32) != 0;
+    // the sequence argument is released on both the success and error paths
+    simd_arg_free(&dst);
+    if (fill_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Hand-written wrapper for npyv_storea_s32: parses (sequence, vector), stores
+// the vector into the C buffer of the sequence and then writes that buffer
+// back into the Python object. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_storea_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs32};
+    simd_arg src = {.dtype = simd_data_vs32};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&:storea_s32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storea_s32(dst.data.qs32, src.data.vs32);
+    // write-back
+    const int fill_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qs32, simd_data_qs32) != 0;
+    // the sequence argument is released on both the success and error paths
+    simd_arg_free(&dst);
+    if (fill_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Hand-written wrapper for npyv_stores_s32: parses (sequence, vector), stores
+// the vector into the C buffer of the sequence and then writes that buffer
+// back into the Python object. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_stores_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs32};
+    simd_arg src = {.dtype = simd_data_vs32};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&:stores_s32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_stores_s32(dst.data.qs32, src.data.vs32);
+    // write-back
+    const int fill_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qs32, simd_data_qs32) != 0;
+    // the sequence argument is released on both the success and error paths
+    simd_arg_free(&dst);
+    if (fill_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Hand-written wrapper for npyv_storel_s32: parses (sequence, vector), stores
+// via the intrinsic into the C buffer of the sequence and then writes that
+// buffer back into the Python object. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_storel_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs32};
+    simd_arg src = {.dtype = simd_data_vs32};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&:storel_s32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storel_s32(dst.data.qs32, src.data.vs32);
+    // write-back
+    const int fill_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qs32, simd_data_qs32) != 0;
+    // the sequence argument is released on both the success and error paths
+    simd_arg_free(&dst);
+    if (fill_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Hand-written wrapper for npyv_storeh_s32: parses (sequence, vector), stores
+// via the intrinsic into the C buffer of the sequence and then writes that
+// buffer back into the Python object. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_storeh_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs32};
+    simd_arg src = {.dtype = simd_data_vs32};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&:storeh_s32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storeh_s32(dst.data.qs32, src.data.vs32);
+    // write-back
+    const int fill_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qs32, simd_data_qs32) != 0;
+    // the sequence argument is released on both the success and error paths
+    simd_arg_free(&dst);
+    if (fill_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Hand-written wrapper for npyv_store_s32x2: parses (sequence, vs32x2 value),
+// stores it into the C buffer of the sequence and then writes that buffer
+// back into the Python object. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_store_s32x2(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs32};
+    simd_arg src = {.dtype = simd_data_vs32x2};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&:store_s32x2",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store_s32x2(dst.data.qs32, src.data.vs32x2);
+    // write-back
+    const int fill_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qs32, simd_data_qs32) != 0;
+    // the sequence argument is released on both the success and error paths
+    simd_arg_free(&dst);
+    if (fill_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 1
+// Partial Load
+// The *_till wrappers carry extra scalar s32 tag(s) after the u32 tag, the
+// *_tillz wrappers do not.
+// NOTE(review): presumably u32 is the lane count and s32 the fill value(s);
+// confirm against the npyv_load_till_* intrinsics.
+SIMD_IMPL_INTRIN_3(load_till_s32, vs32, qs32, u32, s32)
+SIMD_IMPL_INTRIN_2(load_tillz_s32, vs32, qs32, u32)
+// Both branches of the conditional below are textually identical.
+#if 32 == 32
+    SIMD_IMPL_INTRIN_4(load2_till_s32, vs32, qs32, u32, s32, s32)
+    SIMD_IMPL_INTRIN_2(load2_tillz_s32, vs32, qs32, u32)
+#else
+    SIMD_IMPL_INTRIN_4(load2_till_s32, vs32, qs32, u32, s32, s32)
+    SIMD_IMPL_INTRIN_2(load2_tillz_s32, vs32, qs32, u32)
+#endif
+
+// Partial Store
+#line 95
+#if !0 || 0 == 32
+// Wrapper for npyv_store_till_s32: parses (sequence, nlane, vector), performs
+// the partial store into the C buffer of the sequence and writes that buffer
+// back into the Python object. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_store_till_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs32};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vs32};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&O&:store_till_s32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store_till_s32(dst.data.qs32, count.data.u32, src.data.vs32);
+    // write-back
+    const int fill_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qs32, simd_data_qs32) != 0;
+    // the sequence argument is released on both the success and error paths
+    simd_arg_free(&dst);
+    if (fill_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+#line 95
+#if !32 || 32 == 32
+// Wrapper for npyv_store2_till_s32: parses (sequence, nlane, vector), performs
+// the partial store into the C buffer of the sequence and writes that buffer
+// back into the Python object. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_store2_till_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs32};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vs32};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&O&:store2_till_s32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store2_till_s32(dst.data.qs32, count.data.u32, src.data.vs32);
+    // write-back
+    const int fill_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qs32, simd_data_qs32) != 0;
+    // the sequence argument is released on both the success and error paths
+    simd_arg_free(&dst);
+    if (fill_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+#line 95
+#if !64 || 64 == 32
+// Second copy of the npyv_store2_till_s32 wrapper. It is enclosed in
+// `#if !64 || 64 == 32`, which evaluates to false, so it is not compiled.
+// It parses (sequence, nlane, vector), performs the partial store and writes
+// the buffer back into the Python object.
+static PyObject *
+simd__intrin_store2_till_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs32};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vs32};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&O&:store2_till_s32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store2_till_s32(dst.data.qs32, count.data.u32, src.data.vs32);
+    // write-back
+    const int fill_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qs32, simd_data_qs32) != 0;
+    // the sequence argument is released on both the success and error paths
+    simd_arg_free(&dst);
+    if (fill_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+// Non-contiguous Load
+#line 136
+#if !0 || 0 == 32
+/**
+ * Wrapper for npyv_loadn_s32 (strided load).
+ *
+ * Python arguments: (sequence of s32, stride).
+ * Returns the object built by simd_arg_to_obj() for the loaded vs32 vector,
+ * or NULL with an exception set when parsing fails or when the sequence is
+ * shorter than |stride| * npyv_nlanes_s32 (ValueError).
+ */
+static PyObject *
+simd__intrin_loadn_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn_s32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s32 *seq_ptr = seq_arg.data.qs32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s32;
+    if (stride < 0) {
+        // negative stride: start from the last element and walk backwards
+        seq_ptr += cur_seq_len - 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; a plain %d mismatches them
+        // on platforms where Py_ssize_t is wider than int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn_s32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        simd_arg_free(&seq_arg);
+        return NULL;
+    }
+    npyv_s32 rvec = npyv_loadn_s32(seq_ptr, stride);
+    simd_arg ret = {
+        .dtype = simd_data_vs32, .data = {.vs32=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 32
+/**
+ * Wrapper for npyv_loadn2_s32 (strided load).
+ *
+ * Python arguments: (sequence of s32, stride).
+ * Returns the object built by simd_arg_to_obj() for the loaded vs32 vector,
+ * or NULL with an exception set when parsing fails or when the sequence is
+ * shorter than |stride| * npyv_nlanes_s32 (ValueError).
+ */
+static PyObject *
+simd__intrin_loadn2_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_s32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s32 *seq_ptr = seq_arg.data.qs32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s32;
+    if (stride < 0) {
+        // negative stride: start two elements before the end of the sequence
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; a plain %d mismatches them
+        // on platforms where Py_ssize_t is wider than int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_s32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        simd_arg_free(&seq_arg);
+        return NULL;
+    }
+    npyv_s32 rvec = npyv_loadn2_s32(seq_ptr, stride);
+    simd_arg ret = {
+        .dtype = simd_data_vs32, .data = {.vs32=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 32
+/**
+ * Second copy of the npyv_loadn2_s32 wrapper. It is enclosed in
+ * `#if !64 || 64 == 32`, which evaluates to false, so it is not compiled.
+ *
+ * Python arguments: (sequence of s32, stride).
+ * Returns the object built by simd_arg_to_obj() for the loaded vs32 vector,
+ * or NULL with an exception set when parsing fails or when the sequence is
+ * shorter than |stride| * npyv_nlanes_s32 (ValueError).
+ */
+static PyObject *
+simd__intrin_loadn2_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_s32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s32 *seq_ptr = seq_arg.data.qs32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s32;
+    if (stride < 0) {
+        // negative stride: start two elements before the end of the sequence
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; a plain %d mismatches them
+        // on platforms where Py_ssize_t is wider than int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_s32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        simd_arg_free(&seq_arg);
+        return NULL;
+    }
+    npyv_s32 rvec = npyv_loadn2_s32(seq_ptr, stride);
+    simd_arg ret = {
+        .dtype = simd_data_vs32, .data = {.vs32=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 32
+/**
+ * Wrapper for npyv_loadn_till_s32 (strided partial load with a fill value).
+ *
+ * Python arguments: (sequence of s32, stride, nlane, fill).
+ * Returns the object built by simd_arg_to_obj() for the loaded vs32 vector,
+ * or NULL with an exception set when parsing fails or when the sequence is
+ * shorter than |stride| * npyv_nlanes_s32 (ValueError).
+ */
+static PyObject *
+simd__intrin_loadn_till_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg fill_arg = {.dtype = simd_data_s32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:loadn_till_s32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &fill_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s32 *seq_ptr = seq_arg.data.qs32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s32;
+    if (stride < 0) {
+        // negative stride: start from the last element and walk backwards
+        seq_ptr += cur_seq_len - 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; a plain %d mismatches them
+        // on platforms where Py_ssize_t is wider than int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn_till_s32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        simd_arg_free(&seq_arg);
+        return NULL;
+    }
+    npyv_s32 rvec = npyv_loadn_till_s32(
+        seq_ptr, stride, nlane_arg.data.u32, fill_arg.data.s32
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs32, .data = {.vs32=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 32
+/**
+ * Wrapper for npyv_loadn2_till_s32 (strided partial load with two fill
+ * values).
+ *
+ * Python arguments: (sequence of s32, stride, nlane, fill, fill2).
+ * Returns the object built by simd_arg_to_obj() for the loaded vs32 vector,
+ * or NULL with an exception set when parsing fails or when the sequence is
+ * shorter than |stride| * npyv_nlanes_s32 (ValueError).
+ */
+static PyObject *
+simd__intrin_loadn2_till_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg fill_arg = {.dtype = simd_data_s32};
+    simd_arg fill2_arg = {.dtype = simd_data_s32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_s32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &fill_arg,
+        simd_arg_converter, &fill2_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s32 *seq_ptr = seq_arg.data.qs32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s32;
+    if (stride < 0) {
+        // negative stride: start two elements before the end of the sequence
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; a plain %d mismatches them
+        // on platforms where Py_ssize_t is wider than int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_till_s32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        simd_arg_free(&seq_arg);
+        return NULL;
+    }
+    npyv_s32 rvec = npyv_loadn2_till_s32(
+        seq_ptr, stride, nlane_arg.data.u32,
+        fill_arg.data.s32, fill2_arg.data.s32
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs32, .data = {.vs32=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 32
+/**
+ * Second copy of the npyv_loadn2_till_s32 wrapper. It is enclosed in
+ * `#if !64 || 64 == 32`, which evaluates to false, so it is not compiled.
+ *
+ * Python arguments: (sequence of s32, stride, nlane, fill, fill2).
+ * Returns the object built by simd_arg_to_obj() for the loaded vs32 vector,
+ * or NULL with an exception set when parsing fails or when the sequence is
+ * shorter than |stride| * npyv_nlanes_s32 (ValueError).
+ */
+static PyObject *
+simd__intrin_loadn2_till_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg fill_arg = {.dtype = simd_data_s32};
+    simd_arg fill2_arg = {.dtype = simd_data_s32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_s32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &fill_arg,
+        simd_arg_converter, &fill2_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s32 *seq_ptr = seq_arg.data.qs32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s32;
+    if (stride < 0) {
+        // negative stride: start two elements before the end of the sequence
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; a plain %d mismatches them
+        // on platforms where Py_ssize_t is wider than int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_till_s32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        simd_arg_free(&seq_arg);
+        return NULL;
+    }
+    npyv_s32 rvec = npyv_loadn2_till_s32(
+        seq_ptr, stride, nlane_arg.data.u32,
+        fill_arg.data.s32, fill2_arg.data.s32
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs32, .data = {.vs32=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 32
+/**
+ * Wrapper for npyv_loadn_tillz_s32 (strided partial load, no fill argument).
+ *
+ * Python arguments: (sequence of s32, stride, nlane).
+ * Returns the object built by simd_arg_to_obj() for the loaded vs32 vector,
+ * or NULL with an exception set when parsing fails or when the sequence is
+ * shorter than |stride| * npyv_nlanes_s32 (ValueError).
+ */
+static PyObject *
+simd__intrin_loadn_tillz_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn_tillz_s32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s32 *seq_ptr = seq_arg.data.qs32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s32;
+    if (stride < 0) {
+        // negative stride: start from the last element and walk backwards
+        seq_ptr += cur_seq_len - 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; a plain %d mismatches them
+        // on platforms where Py_ssize_t is wider than int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn_tillz_s32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        simd_arg_free(&seq_arg);
+        return NULL;
+    }
+    npyv_s32 rvec = npyv_loadn_tillz_s32(
+        seq_ptr, stride, nlane_arg.data.u32
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs32, .data = {.vs32=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 32
+/**
+ * Wrapper for npyv_loadn2_tillz_s32 (strided partial load, no fill argument).
+ *
+ * Python arguments: (sequence of s32, stride, nlane).
+ * Returns the object built by simd_arg_to_obj() for the loaded vs32 vector,
+ * or NULL with an exception set when parsing fails or when the sequence is
+ * shorter than |stride| * npyv_nlanes_s32 (ValueError).
+ */
+static PyObject *
+simd__intrin_loadn2_tillz_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_s32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s32 *seq_ptr = seq_arg.data.qs32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s32;
+    if (stride < 0) {
+        // negative stride: start two elements before the end of the sequence
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; a plain %d mismatches them
+        // on platforms where Py_ssize_t is wider than int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_s32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        simd_arg_free(&seq_arg);
+        return NULL;
+    }
+    npyv_s32 rvec = npyv_loadn2_tillz_s32(
+        seq_ptr, stride, nlane_arg.data.u32
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs32, .data = {.vs32=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 32
+/**
+ * Second copy of the npyv_loadn2_tillz_s32 wrapper. It is enclosed in
+ * `#if !64 || 64 == 32`, which evaluates to false, so it is not compiled.
+ *
+ * Python arguments: (sequence of s32, stride, nlane).
+ * Returns the object built by simd_arg_to_obj() for the loaded vs32 vector,
+ * or NULL with an exception set when parsing fails or when the sequence is
+ * shorter than |stride| * npyv_nlanes_s32 (ValueError).
+ */
+static PyObject *
+simd__intrin_loadn2_tillz_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_s32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s32 *seq_ptr = seq_arg.data.qs32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s32;
+    if (stride < 0) {
+        // negative stride: start two elements before the end of the sequence
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches the Py_ssize_t arguments; a plain %d mismatches them
+        // on platforms where Py_ssize_t is wider than int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_s32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        simd_arg_free(&seq_arg);
+        return NULL;
+    }
+    npyv_s32 rvec = npyv_loadn2_tillz_s32(
+        seq_ptr, stride, nlane_arg.data.u32
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs32, .data = {.vs32=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+}
+#endif // chksize
+
+
+// Non-contiguous Store
+#line 216
+#if !0 || 0 == 32
+/*
+ * Python-facing wrapper around npyv_storen_s32().
+ *
+ * Parses three arguments (sequence, stride, vector), rejects sequences shorter
+ * than |stride| * npyv_nlanes_s32 with ValueError, runs the strided store into
+ * the sequence buffer and writes that buffer back into the Python object.
+ * Returns None on success, NULL with an exception set on failure.
+ */
+static PyObject *
+simd__intrin_storen_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vs32};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen_s32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s32 *seq_ptr = seq_arg.data.qs32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s32;
+    if (stride < 0) {
+        // negative stride: start at the last element and compare against the
+        // positive length requirement
+        seq_ptr += cur_seq_len - 1*1;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches Py_ssize_t; the trailing space after "the" keeps the two
+        // concatenated literals from running together
+        PyErr_Format(PyExc_ValueError,
+            "storen_s32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen_s32(
+        seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vs32
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs32, simd_data_qs32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 32
+/*
+ * Python-facing wrapper around npyv_storen2_s32().
+ *
+ * Parses three arguments (sequence, stride, vector), rejects sequences shorter
+ * than |stride| * npyv_nlanes_s32 with ValueError, runs the strided store into
+ * the sequence buffer and writes that buffer back into the Python object.
+ * Returns None on success, NULL with an exception set on failure.
+ */
+static PyObject *
+simd__intrin_storen2_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vs32};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    // the ":name" suffix is what argument-parsing errors report, so it
+    // carries this wrapper's own intrinsic name
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_s32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s32 *seq_ptr = seq_arg.data.qs32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s32;
+    if (stride < 0) {
+        // negative stride: start at element cur_seq_len - 2 and compare
+        // against the positive length requirement
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches Py_ssize_t; the trailing space after "the" keeps the two
+        // concatenated literals from running together
+        PyErr_Format(PyExc_ValueError,
+            "storen2_s32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_s32(
+        seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vs32
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs32, simd_data_qs32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 32
+/*
+ * Python-facing wrapper around npyv_storen2_s32().
+ *
+ * Parses three arguments (sequence, stride, vector), rejects sequences shorter
+ * than |stride| * npyv_nlanes_s32 with ValueError, runs the strided store into
+ * the sequence buffer and writes that buffer back into the Python object.
+ * Returns None on success, NULL with an exception set on failure.
+ */
+static PyObject *
+simd__intrin_storen2_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vs32};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    // the ":name" suffix is what argument-parsing errors report, so it
+    // carries this wrapper's own intrinsic name
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_s32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s32 *seq_ptr = seq_arg.data.qs32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s32;
+    if (stride < 0) {
+        // negative stride: start at element cur_seq_len - 2 and compare
+        // against the positive length requirement
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches Py_ssize_t; the trailing space after "the" keeps the two
+        // concatenated literals from running together
+        PyErr_Format(PyExc_ValueError,
+            "storen2_s32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_s32(
+        seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vs32
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs32, simd_data_qs32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !0 || 0 == 32
+/*
+ * Python-facing wrapper around npyv_storen_till_s32().
+ *
+ * Parses four arguments (sequence, stride, nlane, vector), rejects sequences
+ * shorter than |stride| * npyv_nlanes_s32 with ValueError, runs the strided
+ * partial store into the sequence buffer and writes that buffer back into the
+ * Python object. Returns None on success, NULL with an exception set on
+ * failure.
+ */
+static PyObject *
+simd__intrin_storen_till_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vs32};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    // the ":name" suffix is what argument-parsing errors report, so it
+    // carries this wrapper's own intrinsic name
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen_till_s32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s32 *seq_ptr = seq_arg.data.qs32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s32;
+    if (stride < 0) {
+        // negative stride: start at the last element and compare against the
+        // positive length requirement
+        seq_ptr += cur_seq_len - 1*1;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches Py_ssize_t; the trailing space after "the" keeps the two
+        // concatenated literals from running together
+        PyErr_Format(PyExc_ValueError,
+            "storen_till_s32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen_till_s32(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vs32
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs32, simd_data_qs32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 32
+/*
+ * Python-facing wrapper around npyv_storen2_till_s32().
+ *
+ * Parses four arguments (sequence, stride, nlane, vector), rejects sequences
+ * shorter than |stride| * npyv_nlanes_s32 with ValueError, runs the strided
+ * partial store into the sequence buffer and writes that buffer back into the
+ * Python object. Returns None on success, NULL with an exception set on
+ * failure.
+ */
+static PyObject *
+simd__intrin_storen2_till_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vs32};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    // the ":name" suffix is what argument-parsing errors report, so it
+    // carries this wrapper's own intrinsic name
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_s32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s32 *seq_ptr = seq_arg.data.qs32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s32;
+    if (stride < 0) {
+        // negative stride: start at element cur_seq_len - 2 and compare
+        // against the positive length requirement
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches Py_ssize_t; the trailing space after "the" keeps the two
+        // concatenated literals from running together
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_s32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_till_s32(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vs32
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs32, simd_data_qs32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 32
+/*
+ * Python-facing wrapper around npyv_storen2_till_s32().
+ *
+ * Parses four arguments (sequence, stride, nlane, vector), rejects sequences
+ * shorter than |stride| * npyv_nlanes_s32 with ValueError, runs the strided
+ * partial store into the sequence buffer and writes that buffer back into the
+ * Python object. Returns None on success, NULL with an exception set on
+ * failure.
+ */
+static PyObject *
+simd__intrin_storen2_till_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vs32};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    // the ":name" suffix is what argument-parsing errors report, so it
+    // carries this wrapper's own intrinsic name
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_s32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s32 *seq_ptr = seq_arg.data.qs32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s32;
+    if (stride < 0) {
+        // negative stride: start at element cur_seq_len - 2 and compare
+        // against the positive length requirement
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches Py_ssize_t; the trailing space after "the" keeps the two
+        // concatenated literals from running together
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_s32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_till_s32(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vs32
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs32, simd_data_qs32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#endif // 1
+
+/****************************
+ * Lookup tables
+ *
+ * Both guards below compare integer literals: `32 == 32` holds and
+ * `32 == 64` does not, so only the lut32_s32 line is compiled here.
+ ****************************/
+#if 32 == 32
+SIMD_IMPL_INTRIN_2(lut32_s32, vs32, qs32, vu32)
+#endif
+#if 32 == 64
+SIMD_IMPL_INTRIN_2(lut16_s32, vs32, qs32, vu32)
+#endif
+/***************************
+ * Misc
+ *
+ * NOTE(review): the SIMD_IMPL_INTRIN_<N> macros are defined outside this
+ * chunk; presumably the first argument names the intrinsic, the second is
+ * the return type tag and the rest are argument type tags. Confirm against
+ * the macro definitions.
+ *
+ * The reinterpret_<dst>_s32 lines that follow each sit in their own guard;
+ * the f32 and f64 ones are conditional on NPY_SIMD_F32 / NPY_SIMD_F64, the
+ * others are guarded by a literal 1.
+ ***************************/
+SIMD_IMPL_INTRIN_0(zero_s32, vs32)
+SIMD_IMPL_INTRIN_1(extract0_s32, s32, vs32)
+SIMD_IMPL_INTRIN_1(setall_s32, vs32, s32)
+SIMD_IMPL_INTRIN_3(select_s32, vs32, vb32, vs32, vs32)
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u8_s32, vu8, vs32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s8_s32, vs8, vs32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u16_s32, vu16, vs32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s16_s32, vs16, vs32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u32_s32, vu32, vs32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s32_s32, vs32, vs32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u64_s32, vu64, vs32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s64_s32, vs64, vs32)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F32
+SIMD_IMPL_INTRIN_1(reinterpret_f32_s32, vf32, vs32)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F64
+SIMD_IMPL_INTRIN_1(reinterpret_f64_s32, vf64, vs32)
+#endif // simd_sup2
+
+
+/**
+ * special definition due to the nature of intrinsics
+ * npyv_setf_s32 and npyv_set_s32.
+*/
+#line 308
+// Builds a vs32 vector object by feeding the positional arguments, taken as
+// one sequence, to npyv_setf_s32().
+// NOTE(review): 65 indexed elements are spelled out below; whether all of them
+// are actually evaluated depends on how npyv_setf_s32 is defined - confirm.
+static PyObject *
+simd__intrin_setf_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    npyv_lanetype_s32 *lanes = simd_sequence_from_iterable(
+        args, simd_data_qs32, npyv_nlanes_s32
+    );
+    if (!lanes) {
+        return NULL;
+    }
+    simd_data vec = {.vs32 = npyv_setf_s32(
+        lanes[0], lanes[1], lanes[2], lanes[3], lanes[4], lanes[5], lanes[6], lanes[7],
+        lanes[8], lanes[9], lanes[10], lanes[11], lanes[12], lanes[13], lanes[14], lanes[15],
+        lanes[16], lanes[17], lanes[18], lanes[19], lanes[20], lanes[21], lanes[22], lanes[23],
+        lanes[24], lanes[25], lanes[26], lanes[27], lanes[28], lanes[29], lanes[30], lanes[31],
+        lanes[32], lanes[33], lanes[34], lanes[35], lanes[36], lanes[37], lanes[38], lanes[39],
+        lanes[40], lanes[41], lanes[42], lanes[43], lanes[44], lanes[45], lanes[46], lanes[47],
+        lanes[48], lanes[49], lanes[50], lanes[51], lanes[52], lanes[53], lanes[54], lanes[55],
+        lanes[56], lanes[57], lanes[58], lanes[59], lanes[60], lanes[61], lanes[62], lanes[63],
+        lanes[64] // one extra element for setf
+    )};
+    // the temporary sequence is released before the result object is built
+    simd_sequence_free(lanes);
+    return (PyObject*)PySIMDVector_FromData(vec, simd_data_vs32);
+}
+
+#line 308
+// Builds a vs32 vector object by feeding the positional arguments, taken as
+// one sequence, to npyv_set_s32().
+// NOTE(review): 65 indexed elements are spelled out below; whether all of them
+// are actually evaluated depends on how npyv_set_s32 is defined - confirm.
+static PyObject *
+simd__intrin_set_s32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    npyv_lanetype_s32 *lanes = simd_sequence_from_iterable(
+        args, simd_data_qs32, npyv_nlanes_s32
+    );
+    if (!lanes) {
+        return NULL;
+    }
+    simd_data vec = {.vs32 = npyv_set_s32(
+        lanes[0], lanes[1], lanes[2], lanes[3], lanes[4], lanes[5], lanes[6], lanes[7],
+        lanes[8], lanes[9], lanes[10], lanes[11], lanes[12], lanes[13], lanes[14], lanes[15],
+        lanes[16], lanes[17], lanes[18], lanes[19], lanes[20], lanes[21], lanes[22], lanes[23],
+        lanes[24], lanes[25], lanes[26], lanes[27], lanes[28], lanes[29], lanes[30], lanes[31],
+        lanes[32], lanes[33], lanes[34], lanes[35], lanes[36], lanes[37], lanes[38], lanes[39],
+        lanes[40], lanes[41], lanes[42], lanes[43], lanes[44], lanes[45], lanes[46], lanes[47],
+        lanes[48], lanes[49], lanes[50], lanes[51], lanes[52], lanes[53], lanes[54], lanes[55],
+        lanes[56], lanes[57], lanes[58], lanes[59], lanes[60], lanes[61], lanes[62], lanes[63],
+        lanes[64] // same argument list as the setf wrapper
+    )};
+    // the temporary sequence is released before the result object is built
+    simd_sequence_free(lanes);
+    return (PyObject*)PySIMDVector_FromData(vec, simd_data_vs32);
+}
+
+
+/***************************
+ * Reorder
+ *
+ * combinel/combineh are declared with a single-vector result (vs32);
+ * combine/zip/unzip are declared with a vs32x2 result. rev64_s32 is
+ * guarded by a literal 1, so it is compiled here.
+ ***************************/
+#line 337
+SIMD_IMPL_INTRIN_2(combinel_s32, vs32, vs32, vs32)
+
+#line 337
+SIMD_IMPL_INTRIN_2(combineh_s32, vs32, vs32, vs32)
+
+
+#line 343
+SIMD_IMPL_INTRIN_2(combine_s32, vs32x2, vs32, vs32)
+
+#line 343
+SIMD_IMPL_INTRIN_2(zip_s32, vs32x2, vs32, vs32)
+
+#line 343
+SIMD_IMPL_INTRIN_2(unzip_s32, vs32x2, vs32, vs32)
+
+
+#if 1
+SIMD_IMPL_INTRIN_1(rev64_s32, vs32, vs32)
+#endif
+
+// special implementation to convert runtime constants to immediate values
+#if 32 == 32
+// one call per element index, then gather the results into one vector
+// instead of unrolling all 256 possible index combinations.
+/*
+ * Runtime-index front end for npyv_permi128_s32(), which is only ever called
+ * here with literal lane indices.
+ *
+ * For each selector e0..e3 one broadcast permutation (all four indices equal)
+ * is computed and spilled to a lane buffer; selectors other than 1, 2 or 3
+ * use the index-0 broadcast. When all selectors are equal the first broadcast
+ * is returned directly. Otherwise positions 1, 2 and 3 of every group of four
+ * lanes are taken from the e1, e2 and e3 buffers and the merged buffer is
+ * reloaded.
+ */
+NPY_FINLINE npyv_s32
+npyv_permi128_s32_(npyv_s32 a, unsigned e0, unsigned e1, unsigned e2, unsigned e3)
+{
+    npyv_s32 ve0;
+    npyv_lanetype_s32 de0[npyv_nlanes_s32];
+    switch (e0) {
+        case 1:  ve0 = npyv_permi128_s32(a, 1, 1, 1, 1); break;
+        case 2:  ve0 = npyv_permi128_s32(a, 2, 2, 2, 2); break;
+        case 3:  ve0 = npyv_permi128_s32(a, 3, 3, 3, 3); break;
+        default: ve0 = npyv_permi128_s32(a, 0, 0, 0, 0); break;
+    }
+    npyv_store_s32(de0, ve0);
+
+    npyv_s32 ve1;
+    npyv_lanetype_s32 de1[npyv_nlanes_s32];
+    switch (e1) {
+        case 1:  ve1 = npyv_permi128_s32(a, 1, 1, 1, 1); break;
+        case 2:  ve1 = npyv_permi128_s32(a, 2, 2, 2, 2); break;
+        case 3:  ve1 = npyv_permi128_s32(a, 3, 3, 3, 3); break;
+        default: ve1 = npyv_permi128_s32(a, 0, 0, 0, 0); break;
+    }
+    npyv_store_s32(de1, ve1);
+
+    npyv_s32 ve2;
+    npyv_lanetype_s32 de2[npyv_nlanes_s32];
+    switch (e2) {
+        case 1:  ve2 = npyv_permi128_s32(a, 1, 1, 1, 1); break;
+        case 2:  ve2 = npyv_permi128_s32(a, 2, 2, 2, 2); break;
+        case 3:  ve2 = npyv_permi128_s32(a, 3, 3, 3, 3); break;
+        default: ve2 = npyv_permi128_s32(a, 0, 0, 0, 0); break;
+    }
+    npyv_store_s32(de2, ve2);
+
+    npyv_s32 ve3;
+    npyv_lanetype_s32 de3[npyv_nlanes_s32];
+    switch (e3) {
+        case 1:  ve3 = npyv_permi128_s32(a, 1, 1, 1, 1); break;
+        case 2:  ve3 = npyv_permi128_s32(a, 2, 2, 2, 2); break;
+        case 3:  ve3 = npyv_permi128_s32(a, 3, 3, 3, 3); break;
+        default: ve3 = npyv_permi128_s32(a, 0, 0, 0, 0); break;
+    }
+    npyv_store_s32(de3, ve3);
+
+    // identical selectors: the first broadcast already is the answer
+    const int all_same = (e0 == e1) && (e0 == e2) && (e0 == e3);
+    if (all_same) {
+        return ve0;
+    }
+    // merge: position 0 of each 4-lane group stays from de0
+    for (int lane = 0; lane < npyv_nlanes_s32; lane += 4) {
+        de0[lane + 1] = de1[lane + 1];
+        de0[lane + 2] = de2[lane + 2];
+        de0[lane + 3] = de3[lane + 3];
+    }
+    return npyv_load_s32(de0);
+}
+SIMD_IMPL_INTRIN_5(permi128_s32_, vs32, vs32, u8, u8, u8, u8)
+#elif 32 == 64
+/*
+ * Two-index variant: maps the runtime pair (e0, e1) onto one of the four
+ * literal-index calls of npyv_permi128_s32(). Any pair that is not exactly
+ * (1,0), (0,1) or (1,1) selects the (0,0) call.
+ */
+NPY_FINLINE npyv_s32
+npyv_permi128_s32_(npyv_s32 a, unsigned e0, unsigned e1)
+{
+    // out-of-range selectors share the fallback with (0,0)
+    if (e0 > 1 || e1 > 1) {
+        return npyv_permi128_s32(a, 0, 0);
+    }
+    switch ((e0 << 1) | e1) {
+        case 2:  return npyv_permi128_s32(a, 1, 0);
+        case 1:  return npyv_permi128_s32(a, 0, 1);
+        case 3:  return npyv_permi128_s32(a, 1, 1);
+        default: return npyv_permi128_s32(a, 0, 0);
+    }
+}
+SIMD_IMPL_INTRIN_3(permi128_s32_, vs32, vs32, u8, u8)
+#endif
+
+/***************************
+ * Operators
+ *
+ * `31 > 0` is a literal comparison that holds, so the four shift lines
+ * are compiled. The andc/orc/xnor group further down sits under `#if 0`
+ * and is compiled out.
+ ***************************/
+#if 31 > 0
+SIMD_IMPL_INTRIN_2(shl_s32, vs32, vs32, u8)
+SIMD_IMPL_INTRIN_2(shr_s32, vs32, vs32, u8)
+// immediate constant
+// NOTE(review): the trailing 31 / 32 is presumably the upper bound of the
+// immediate handled by SIMD_IMPL_INTRIN_2IMM - confirm against the macro.
+SIMD_IMPL_INTRIN_2IMM(shli_s32, vs32, vs32, 31)
+SIMD_IMPL_INTRIN_2IMM(shri_s32, vs32, vs32, 32)
+#endif // shl_imm
+
+#line 418
+SIMD_IMPL_INTRIN_2(and_s32, vs32, vs32, vs32)
+
+#line 418
+SIMD_IMPL_INTRIN_2(or_s32, vs32, vs32, vs32)
+
+#line 418
+SIMD_IMPL_INTRIN_2(xor_s32, vs32, vs32, vs32)
+
+
+SIMD_IMPL_INTRIN_1(not_s32, vs32, vs32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpeq_s32, vb32, vs32, vs32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpneq_s32, vb32, vs32, vs32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpgt_s32, vb32, vs32, vs32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpge_s32, vb32, vs32, vs32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmplt_s32, vb32, vs32, vs32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmple_s32, vb32, vs32, vs32)
+
+
+#if 0
+SIMD_IMPL_INTRIN_2(andc_s32, vs32, vs32, vs32)
+SIMD_IMPL_INTRIN_2(andc_b32, vb32, vb32, vb32)
+SIMD_IMPL_INTRIN_2(orc_b32, vb32, vb32, vb32)
+SIMD_IMPL_INTRIN_2(xnor_b32, vb32, vb32, vb32)
+#endif
+
+// test across all vector lanes
+#line 440
+SIMD_IMPL_INTRIN_1(any_s32, u8, vs32)
+
+#line 440
+SIMD_IMPL_INTRIN_1(all_s32, u8, vs32)
+
+/***************************
+ * Conversion
+ *
+ * The two cvt lines are unconditional; expand_s32_s32 sits under `#if 0`
+ * and is compiled out.
+ ***************************/
+SIMD_IMPL_INTRIN_1(cvt_s32_b32, vs32,  vb32)
+SIMD_IMPL_INTRIN_1(cvt_b32_s32, vb32, vs32)
+#if 0
+SIMD_IMPL_INTRIN_1(expand_s32_s32, vs32x2, vs32)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ *
+ * With the literal guards below, add, sub, mul, divisor and divc are
+ * compiled for s32. The saturated add/sub pair, div, the fused
+ * multiply-add family, sum and sumup all sit under `#if 0` and are
+ * compiled out.
+ ***************************/
+#line 456
+SIMD_IMPL_INTRIN_2(add_s32, vs32, vs32, vs32)
+
+#line 456
+SIMD_IMPL_INTRIN_2(sub_s32, vs32, vs32, vs32)
+
+
+#if 0
+#line 463
+SIMD_IMPL_INTRIN_2(adds_s32, vs32, vs32, vs32)
+
+#line 463
+SIMD_IMPL_INTRIN_2(subs_s32, vs32, vs32, vs32)
+
+#endif // sat_sup
+
+#if 1
+SIMD_IMPL_INTRIN_2(mul_s32, vs32, vs32, vs32)
+#endif // mul_sup
+
+#if 0
+SIMD_IMPL_INTRIN_2(div_s32, vs32, vs32, vs32)
+#endif // div_sup
+
+#if 1
+SIMD_IMPL_INTRIN_1(divisor_s32, vs32x3, s32)
+SIMD_IMPL_INTRIN_2(divc_s32, vs32, vs32, vs32x3)
+#endif // intdiv_sup
+
+#if 0
+#line 484
+SIMD_IMPL_INTRIN_3(muladd_s32, vs32, vs32, vs32, vs32)
+
+#line 484
+SIMD_IMPL_INTRIN_3(mulsub_s32, vs32, vs32, vs32, vs32)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmuladd_s32, vs32, vs32, vs32, vs32)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmulsub_s32, vs32, vs32, vs32, vs32)
+
+#line 484
+SIMD_IMPL_INTRIN_3(muladdsub_s32, vs32, vs32, vs32, vs32)
+
+#endif // fused_sup
+
+#if 0
+SIMD_IMPL_INTRIN_1(sum_s32, s32, vs32)
+#endif // sum_sup
+
+#if 0
+SIMD_IMPL_INTRIN_1(sumup_s32, s32, vs32)
+#endif // sumup_sup
+
+/***************************
+ * Math
+ *
+ * The sqrt/recip/abs/square/rint/ceil/trunc/floor group and the
+ * maxp/minp/maxn/minn group both sit under `#if 0` and are compiled out;
+ * only max/min and their reduce_ counterparts are compiled for s32.
+ ***************************/
+#if 0
+#line 503
+SIMD_IMPL_INTRIN_1(sqrt_s32, vs32, vs32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(recip_s32, vs32, vs32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(abs_s32, vs32, vs32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(square_s32, vs32, vs32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(rint_s32, vs32, vs32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(ceil_s32, vs32, vs32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(trunc_s32, vs32, vs32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(floor_s32, vs32, vs32)
+
+#endif
+
+#line 510
+SIMD_IMPL_INTRIN_2(max_s32, vs32, vs32, vs32)
+SIMD_IMPL_INTRIN_1(reduce_max_s32, s32, vs32)
+
+#line 510
+SIMD_IMPL_INTRIN_2(min_s32, vs32, vs32, vs32)
+SIMD_IMPL_INTRIN_1(reduce_min_s32, s32, vs32)
+
+
+#if 0
+#line 518
+SIMD_IMPL_INTRIN_2(maxp_s32, vs32, vs32, vs32)
+SIMD_IMPL_INTRIN_1(reduce_maxp_s32, s32, vs32)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minp_s32, vs32, vs32, vs32)
+SIMD_IMPL_INTRIN_1(reduce_minp_s32, s32, vs32)
+
+#line 518
+SIMD_IMPL_INTRIN_2(maxn_s32, vs32, vs32, vs32)
+SIMD_IMPL_INTRIN_1(reduce_maxn_s32, s32, vs32)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minn_s32, vs32, vs32, vs32)
+SIMD_IMPL_INTRIN_1(reduce_minn_s32, s32, vs32)
+
+/**end repeat1**/
+#endif
+
+/***************************
+ * Mask operations
+ *
+ * ifadd/ifsub are compiled; ifdiv/ifdivz sit under `#if 0` and are
+ * compiled out.
+ ***************************/
+#line 530
+ SIMD_IMPL_INTRIN_4(ifadd_s32, vs32, vb32, vs32, vs32, vs32)
+
+#line 530
+ SIMD_IMPL_INTRIN_4(ifsub_s32, vs32, vb32, vs32, vs32, vs32)
+
+
+#if 0
+SIMD_IMPL_INTRIN_4(ifdiv_s32, vs32, vb32, vs32, vs32, vs32)
+SIMD_IMPL_INTRIN_3(ifdivz_s32, vs32, vb32, vs32, vs32)
+#endif
+
+#endif // simd_sup
+
+#line 36
+#if 1
+/***************************
+ * Memory
+ *
+ * Contiguous loads for the u64 lane type: each line takes a qu64
+ * sequence argument; load/loada/loads/loadl are declared with a vu64
+ * result and load_u64x2 with a vu64x2 result.
+ ***************************/
+#line 43
+SIMD_IMPL_INTRIN_1(load_u64, vu64, qu64)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loada_u64, vu64, qu64)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loads_u64, vu64, qu64)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loadl_u64, vu64, qu64)
+
+SIMD_IMPL_INTRIN_1(load_u64x2, vu64x2, qu64)
+
+#line 51
+// dedicated wrapper for npyv_store_u64: after the store, the destination
+// buffer has to be written back into the Python sequence object
+static PyObject *
+simd__intrin_store_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu64};
+    simd_arg src = {.dtype = simd_data_vu64};
+    const int parsed = PyArg_ParseTuple(args, "O&O&:store_u64",
+                                        simd_arg_converter, &dst,
+                                        simd_arg_converter, &src);
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store_u64(dst.data.qu64, src.data.vu64);
+    // write-back, then release dst on both the success and the failure path
+    const int write_back_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qu64, simd_data_qu64) != 0;
+    simd_arg_free(&dst);
+    if (write_back_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// dedicated wrapper for npyv_storea_u64: after the store, the destination
+// buffer has to be written back into the Python sequence object
+static PyObject *
+simd__intrin_storea_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu64};
+    simd_arg src = {.dtype = simd_data_vu64};
+    const int parsed = PyArg_ParseTuple(args, "O&O&:storea_u64",
+                                        simd_arg_converter, &dst,
+                                        simd_arg_converter, &src);
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storea_u64(dst.data.qu64, src.data.vu64);
+    // write-back, then release dst on both the success and the failure path
+    const int write_back_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qu64, simd_data_qu64) != 0;
+    simd_arg_free(&dst);
+    if (write_back_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// dedicated wrapper for npyv_stores_u64: after the store, the destination
+// buffer has to be written back into the Python sequence object
+static PyObject *
+simd__intrin_stores_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu64};
+    simd_arg src = {.dtype = simd_data_vu64};
+    const int parsed = PyArg_ParseTuple(args, "O&O&:stores_u64",
+                                        simd_arg_converter, &dst,
+                                        simd_arg_converter, &src);
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_stores_u64(dst.data.qu64, src.data.vu64);
+    // write-back, then release dst on both the success and the failure path
+    const int write_back_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qu64, simd_data_qu64) != 0;
+    simd_arg_free(&dst);
+    if (write_back_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// dedicated wrapper for npyv_storel_u64: after the store, the destination
+// buffer has to be written back into the Python sequence object
+static PyObject *
+simd__intrin_storel_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu64};
+    simd_arg src = {.dtype = simd_data_vu64};
+    const int parsed = PyArg_ParseTuple(args, "O&O&:storel_u64",
+                                        simd_arg_converter, &dst,
+                                        simd_arg_converter, &src);
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storel_u64(dst.data.qu64, src.data.vu64);
+    // write-back, then release dst on both the success and the failure path
+    const int write_back_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qu64, simd_data_qu64) != 0;
+    simd_arg_free(&dst);
+    if (write_back_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// dedicated wrapper for npyv_storeh_u64: after the store, the destination
+// buffer has to be written back into the Python sequence object
+static PyObject *
+simd__intrin_storeh_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu64};
+    simd_arg src = {.dtype = simd_data_vu64};
+    const int parsed = PyArg_ParseTuple(args, "O&O&:storeh_u64",
+                                        simd_arg_converter, &dst,
+                                        simd_arg_converter, &src);
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storeh_u64(dst.data.qu64, src.data.vu64);
+    // write-back, then release dst on both the success and the failure path
+    const int write_back_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qu64, simd_data_qu64) != 0;
+    simd_arg_free(&dst);
+    if (write_back_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// dedicated wrapper for npyv_store_u64x2: after the store, the destination
+// buffer has to be written back into the Python sequence object
+static PyObject *
+simd__intrin_store_u64x2(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu64};
+    simd_arg src = {.dtype = simd_data_vu64x2};
+    const int parsed = PyArg_ParseTuple(args, "O&O&:store_u64x2",
+                                        simd_arg_converter, &dst,
+                                        simd_arg_converter, &src);
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store_u64x2(dst.data.qu64, src.data.vu64x2);
+    // write-back, then release dst on both the success and the failure path
+    const int write_back_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qu64, simd_data_qu64) != 0;
+    simd_arg_free(&dst);
+    if (write_back_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 1
+// Partial Load
+SIMD_IMPL_INTRIN_3(load_till_u64, vu64, qu64, u32, u64)
+SIMD_IMPL_INTRIN_2(load_tillz_u64, vu64, qu64, u32)
+#if 64 == 32  // literal comparison, false for this u64 section
+    SIMD_IMPL_INTRIN_4(load2_till_u64, vu64, qu64, u32, u64, u64)
+    SIMD_IMPL_INTRIN_2(load2_tillz_u64, vu64, qu64, u32)
+#else  // this branch is the one compiled; its two lines are textually identical to the ones above
+    SIMD_IMPL_INTRIN_4(load2_till_u64, vu64, qu64, u32, u64, u64)
+    SIMD_IMPL_INTRIN_2(load2_tillz_u64, vu64, qu64, u32)
+#endif
+
+// Partial Store
+#line 95
+#if !0 || 0 == 64
+// Wrapper for npyv_store_till_u64: parses (sequence, nlane, vector), performs
+// the partial store and writes the buffer back into the Python sequence.
+static PyObject *
+simd__intrin_store_till_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu64};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vu64};
+    const int parsed = PyArg_ParseTuple(args, "O&O&O&:store_till_u64",
+                                        simd_arg_converter, &dst,
+                                        simd_arg_converter, &count,
+                                        simd_arg_converter, &src);
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store_till_u64(dst.data.qu64, count.data.u32, src.data.vu64);
+    // write-back, then release dst on both the success and the failure path
+    const int write_back_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qu64, simd_data_qu64) != 0;
+    simd_arg_free(&dst);
+    if (write_back_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+#line 95
+#if !32 || 32 == 64
+// Wrapper for npyv_store2_till_u64: parses (sequence, nlane, vector), performs
+// the partial store and writes the buffer back into the Python sequence.
+static PyObject *
+simd__intrin_store2_till_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu64};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vu64};
+    const int parsed = PyArg_ParseTuple(args, "O&O&O&:store2_till_u64",
+                                        simd_arg_converter, &dst,
+                                        simd_arg_converter, &count,
+                                        simd_arg_converter, &src);
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store2_till_u64(dst.data.qu64, count.data.u32, src.data.vu64);
+    // write-back, then release dst on both the success and the failure path
+    const int write_back_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qu64, simd_data_qu64) != 0;
+    simd_arg_free(&dst);
+    if (write_back_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+#line 95
+#if !64 || 64 == 64
+// Wrapper for npyv_store2_till_u64: parses (sequence, nlane, vector), performs
+// the partial store and writes the buffer back into the Python sequence.
+static PyObject *
+simd__intrin_store2_till_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qu64};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vu64};
+    const int parsed = PyArg_ParseTuple(args, "O&O&O&:store2_till_u64",
+                                        simd_arg_converter, &dst,
+                                        simd_arg_converter, &count,
+                                        simd_arg_converter, &src);
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store2_till_u64(dst.data.qu64, count.data.u32, src.data.vu64);
+    // write-back, then release dst on both the success and the failure path
+    const int write_back_failed =
+        simd_sequence_fill_iterable(dst.obj, dst.data.qu64, simd_data_qu64) != 0;
+    simd_arg_free(&dst);
+    if (write_back_failed) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+// Non-contiguous Load
+#line 136
+#if !0 || 0 == 64
+/*
+ * Python-facing wrapper around npyv_loadn_u64().
+ *
+ * Parses two arguments (sequence, stride), rejects sequences shorter than
+ * |stride| * npyv_nlanes_u64 with ValueError, runs the intrinsic and returns
+ * the object simd_arg_to_obj() builds from the loaded vector.
+ * Returns NULL with an exception set on any failure.
+ */
+static PyObject *
+simd__intrin_loadn_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_u64};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_u64};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn_u64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u64 *seq_ptr = seq_arg.data.qu64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u64;
+    if (stride < 0) {
+        // negative stride: start at the last element and compare against the
+        // positive length requirement
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd matches Py_ssize_t; stride is cast so all three varargs have
+        // the type the format string announces
+        PyErr_Format(PyExc_ValueError,
+            "loadn_u64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u64 rvec = npyv_loadn_u64(
+        seq_ptr, stride
+    #if 0
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.u64
+    #endif
+    #if 0
+        , fill2_arg.data.u64
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vu64, .data = {.vu64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 64
+/*
+ * Python wrapper around npyv_loadn2_u64().
+ *
+ * NOTE: the guard above is constant-false in this instantiation, so this
+ * copy is never compiled.
+ *
+ * Python args: (sequence, stride).
+ * Raises ValueError when the sequence holds fewer than
+ * |stride| * npyv_nlanes_u64 elements; otherwise returns the loaded vector
+ * converted through simd_arg_to_obj().
+ */
+static PyObject *
+simd__intrin_loadn2_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_u64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u64 *seq_ptr = seq_arg.data.qu64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u64;
+    if (stride < 0) {
+        // negative stride: base the load on the last two elements
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd, not %d: every formatted value is pointer-sized (Py_ssize_t)
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_u64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u64 rvec = npyv_loadn2_u64(seq_ptr, stride);
+    simd_arg ret = {
+        .dtype = simd_data_vu64, .data = {.vu64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 64
+/*
+ * Python wrapper around npyv_loadn2_u64().
+ *
+ * Python args: (sequence, stride).
+ * Raises ValueError when the sequence holds fewer than
+ * |stride| * npyv_nlanes_u64 elements; otherwise returns the loaded vector
+ * converted through simd_arg_to_obj().
+ */
+static PyObject *
+simd__intrin_loadn2_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_u64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u64 *seq_ptr = seq_arg.data.qu64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u64;
+    if (stride < 0) {
+        // negative stride: base the load on the last two elements
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd, not %d: every formatted value is pointer-sized (Py_ssize_t)
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_u64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u64 rvec = npyv_loadn2_u64(seq_ptr, stride);
+    simd_arg ret = {
+        .dtype = simd_data_vu64, .data = {.vu64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 64
+/*
+ * Python wrapper around npyv_loadn_till_u64().
+ *
+ * Python args: (sequence, stride, nlane, fill).
+ * Raises ValueError when the sequence holds fewer than
+ * |stride| * npyv_nlanes_u64 elements; otherwise returns the loaded vector
+ * converted through simd_arg_to_obj().
+ */
+static PyObject *
+simd__intrin_loadn_till_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg fill_arg = {.dtype = simd_data_u64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:loadn_till_u64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &fill_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u64 *seq_ptr = seq_arg.data.qu64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u64;
+    if (stride < 0) {
+        // negative stride: base the load on the last element
+        seq_ptr += cur_seq_len - 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd, not %d: every formatted value is pointer-sized (Py_ssize_t)
+        PyErr_Format(PyExc_ValueError,
+            "loadn_till_u64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u64 rvec = npyv_loadn_till_u64(
+        seq_ptr, stride, nlane_arg.data.u32, fill_arg.data.u64
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vu64, .data = {.vu64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 64
+/*
+ * Python wrapper around npyv_loadn2_till_u64().
+ *
+ * NOTE: the guard above is constant-false in this instantiation, so this
+ * copy is never compiled.
+ *
+ * Python args: (sequence, stride, nlane, fill, fill2).
+ * Raises ValueError when the sequence holds fewer than
+ * |stride| * npyv_nlanes_u64 elements; otherwise returns the loaded vector
+ * converted through simd_arg_to_obj().
+ */
+static PyObject *
+simd__intrin_loadn2_till_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg fill_arg = {.dtype = simd_data_u64};
+    simd_arg fill2_arg = {.dtype = simd_data_u64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_u64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &fill_arg,
+        simd_arg_converter, &fill2_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u64 *seq_ptr = seq_arg.data.qu64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u64;
+    if (stride < 0) {
+        // negative stride: base the load on the last two elements
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd, not %d: every formatted value is pointer-sized (Py_ssize_t)
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_till_u64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u64 rvec = npyv_loadn2_till_u64(
+        seq_ptr, stride, nlane_arg.data.u32, fill_arg.data.u64, fill2_arg.data.u64
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vu64, .data = {.vu64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 64
+/*
+ * Python wrapper around npyv_loadn2_till_u64().
+ *
+ * Python args: (sequence, stride, nlane, fill, fill2).
+ * Raises ValueError when the sequence holds fewer than
+ * |stride| * npyv_nlanes_u64 elements; otherwise returns the loaded vector
+ * converted through simd_arg_to_obj().
+ */
+static PyObject *
+simd__intrin_loadn2_till_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg fill_arg = {.dtype = simd_data_u64};
+    simd_arg fill2_arg = {.dtype = simd_data_u64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_u64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &fill_arg,
+        simd_arg_converter, &fill2_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u64 *seq_ptr = seq_arg.data.qu64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u64;
+    if (stride < 0) {
+        // negative stride: base the load on the last two elements
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd, not %d: every formatted value is pointer-sized (Py_ssize_t)
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_till_u64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u64 rvec = npyv_loadn2_till_u64(
+        seq_ptr, stride, nlane_arg.data.u32, fill_arg.data.u64, fill2_arg.data.u64
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vu64, .data = {.vu64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 64
+/*
+ * Python wrapper around npyv_loadn_tillz_u64().
+ *
+ * Python args: (sequence, stride, nlane).
+ * Raises ValueError when the sequence holds fewer than
+ * |stride| * npyv_nlanes_u64 elements; otherwise returns the loaded vector
+ * converted through simd_arg_to_obj().
+ */
+static PyObject *
+simd__intrin_loadn_tillz_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn_tillz_u64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u64 *seq_ptr = seq_arg.data.qu64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u64;
+    if (stride < 0) {
+        // negative stride: base the load on the last element
+        seq_ptr += cur_seq_len - 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd, not %d: every formatted value is pointer-sized (Py_ssize_t)
+        PyErr_Format(PyExc_ValueError,
+            "loadn_tillz_u64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u64 rvec = npyv_loadn_tillz_u64(seq_ptr, stride, nlane_arg.data.u32);
+    simd_arg ret = {
+        .dtype = simd_data_vu64, .data = {.vu64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 64
+/*
+ * Python wrapper around npyv_loadn2_tillz_u64().
+ *
+ * NOTE: the guard above is constant-false in this instantiation, so this
+ * copy is never compiled.
+ *
+ * Python args: (sequence, stride, nlane).
+ * Raises ValueError when the sequence holds fewer than
+ * |stride| * npyv_nlanes_u64 elements; otherwise returns the loaded vector
+ * converted through simd_arg_to_obj().
+ */
+static PyObject *
+simd__intrin_loadn2_tillz_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_u64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u64 *seq_ptr = seq_arg.data.qu64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u64;
+    if (stride < 0) {
+        // negative stride: base the load on the last two elements
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd, not %d: every formatted value is pointer-sized (Py_ssize_t)
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_u64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u64 rvec = npyv_loadn2_tillz_u64(seq_ptr, stride, nlane_arg.data.u32);
+    simd_arg ret = {
+        .dtype = simd_data_vu64, .data = {.vu64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 64
+/*
+ * Python wrapper around npyv_loadn2_tillz_u64().
+ *
+ * Python args: (sequence, stride, nlane).
+ * Raises ValueError when the sequence holds fewer than
+ * |stride| * npyv_nlanes_u64 elements; otherwise returns the loaded vector
+ * converted through simd_arg_to_obj().
+ */
+static PyObject *
+simd__intrin_loadn2_tillz_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_u64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u64 *seq_ptr = seq_arg.data.qu64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u64;
+    if (stride < 0) {
+        // negative stride: base the load on the last two elements
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // %zd, not %d: every formatted value is pointer-sized (Py_ssize_t)
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_u64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_u64 rvec = npyv_loadn2_tillz_u64(seq_ptr, stride, nlane_arg.data.u32);
+    simd_arg ret = {
+        .dtype = simd_data_vu64, .data = {.vu64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+
+// Non-contiguous Store
+#line 216
+#if !0 || 0 == 64
+/*
+ * Python wrapper around npyv_storen_u64().
+ *
+ * Python args: (sequence, stride, vector).
+ * Raises ValueError when the sequence holds fewer than
+ * |stride| * npyv_nlanes_u64 elements. On success the native buffer is
+ * written back through simd_sequence_fill_iterable() and None is returned.
+ */
+static PyObject *
+simd__intrin_storen_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vu64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen_u64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u64 *seq_ptr = seq_arg.data.qu64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u64;
+    if (stride < 0) {
+        // negative stride: base the store on the last element
+        seq_ptr += cur_seq_len - 1;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd, not %d: every formatted value is pointer-sized (Py_ssize_t)
+        PyErr_Format(PyExc_ValueError,
+            "storen_u64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen_u64(seq_ptr, stride, vec_arg.data.vu64);
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu64, simd_data_qu64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 64
+/*
+ * Python wrapper around npyv_storen2_u64().
+ *
+ * NOTE: the guard above is constant-false in this instantiation, so this
+ * copy is never compiled.
+ *
+ * Python args: (sequence, stride, vector).
+ * Raises ValueError when the sequence holds fewer than
+ * |stride| * npyv_nlanes_u64 elements. On success the native buffer is
+ * written back through simd_sequence_fill_iterable() and None is returned.
+ */
+static PyObject *
+simd__intrin_storen2_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vu64};
+    // the ":name" suffix is what argument-parsing errors report
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_u64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u64 *seq_ptr = seq_arg.data.qu64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u64;
+    if (stride < 0) {
+        // negative stride: base the store on the last two elements
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd, not %d: every formatted value is pointer-sized (Py_ssize_t)
+        PyErr_Format(PyExc_ValueError,
+            "storen2_u64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_u64(seq_ptr, stride, vec_arg.data.vu64);
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu64, simd_data_qu64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 64
+/*
+ * Python wrapper around npyv_storen2_u64().
+ *
+ * Python args: (sequence, stride, vector).
+ * Raises ValueError when the sequence holds fewer than
+ * |stride| * npyv_nlanes_u64 elements. On success the native buffer is
+ * written back through simd_sequence_fill_iterable() and None is returned.
+ */
+static PyObject *
+simd__intrin_storen2_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vu64};
+    // the ":name" suffix is what argument-parsing errors report
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_u64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u64 *seq_ptr = seq_arg.data.qu64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u64;
+    if (stride < 0) {
+        // negative stride: base the store on the last two elements
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd, not %d: every formatted value is pointer-sized (Py_ssize_t)
+        PyErr_Format(PyExc_ValueError,
+            "storen2_u64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_u64(seq_ptr, stride, vec_arg.data.vu64);
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu64, simd_data_qu64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !0 || 0 == 64
+/*
+ * Python wrapper around npyv_storen_till_u64().
+ *
+ * Python args: (sequence, stride, nlane, vector).
+ * Raises ValueError when the sequence holds fewer than
+ * |stride| * npyv_nlanes_u64 elements. On success the native buffer is
+ * written back through simd_sequence_fill_iterable() and None is returned.
+ */
+static PyObject *
+simd__intrin_storen_till_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg vec_arg = {.dtype = simd_data_vu64};
+    // the ":name" suffix is what argument-parsing errors report
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen_till_u64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u64 *seq_ptr = seq_arg.data.qu64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u64;
+    if (stride < 0) {
+        // negative stride: base the store on the last element
+        seq_ptr += cur_seq_len - 1;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd, not %d: every formatted value is pointer-sized (Py_ssize_t)
+        PyErr_Format(PyExc_ValueError,
+            "storen_till_u64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen_till_u64(
+        seq_ptr, stride, nlane_arg.data.u32, vec_arg.data.vu64
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu64, simd_data_qu64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 64
+/*
+ * Python wrapper around npyv_storen2_till_u64().
+ *
+ * NOTE: the guard above is constant-false in this instantiation, so this
+ * copy is never compiled.
+ *
+ * Python args: (sequence, stride, nlane, vector).
+ * Raises ValueError when the sequence holds fewer than
+ * |stride| * npyv_nlanes_u64 elements. On success the native buffer is
+ * written back through simd_sequence_fill_iterable() and None is returned.
+ */
+static PyObject *
+simd__intrin_storen2_till_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg vec_arg = {.dtype = simd_data_vu64};
+    // the ":name" suffix is what argument-parsing errors report
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_u64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u64 *seq_ptr = seq_arg.data.qu64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u64;
+    if (stride < 0) {
+        // negative stride: base the store on the last two elements
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd, not %d: every formatted value is pointer-sized (Py_ssize_t)
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_u64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_till_u64(
+        seq_ptr, stride, nlane_arg.data.u32, vec_arg.data.vu64
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu64, simd_data_qu64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 64
+/*
+ * Python wrapper around npyv_storen2_till_u64().
+ *
+ * Python args: (sequence, stride, nlane, vector).
+ * Raises ValueError when the sequence holds fewer than
+ * |stride| * npyv_nlanes_u64 elements. On success the native buffer is
+ * written back through simd_sequence_fill_iterable() and None is returned.
+ */
+static PyObject *
+simd__intrin_storen2_till_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qu64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg vec_arg = {.dtype = simd_data_vu64};
+    // the ":name" suffix is what argument-parsing errors report
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_u64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_u64 *seq_ptr = seq_arg.data.qu64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_u64;
+    if (stride < 0) {
+        // negative stride: base the store on the last two elements
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd, not %d: every formatted value is pointer-sized (Py_ssize_t)
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_u64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_till_u64(
+        seq_ptr, stride, nlane_arg.data.u32, vec_arg.data.vu64
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qu64, simd_data_qu64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#endif // 1
+
+/****************************
+ * Lookup tables
+ ****************************/
+#if 64 == 32 // constant-false in this u64 instantiation: lut32_u64 is not compiled
+SIMD_IMPL_INTRIN_2(lut32_u64, vu64, qu64, vu64)
+#endif
+#if 64 == 64 // constant-true in this u64 instantiation: lut16_u64 is compiled
+SIMD_IMPL_INTRIN_2(lut16_u64, vu64, qu64, vu64)
+#endif
+/***************************
+ * Misc
+ *
+ * SIMD_IMPL_INTRIN_<N> is defined outside this chunk; its arguments are
+ * presumably (intrinsic name, return dtype, N argument dtypes) - confirm
+ * against the macro definition.
+ ***************************/
+SIMD_IMPL_INTRIN_0(zero_u64, vu64)
+SIMD_IMPL_INTRIN_1(extract0_u64, u64, vu64)
+SIMD_IMPL_INTRIN_1(setall_u64, vu64, u64)
+SIMD_IMPL_INTRIN_3(select_u64, vu64, vb64, vu64, vu64)
+
+#line 296
+#if 1 // reinterpret_<dst>_u64 wrappers: the eight integer targets below are unconditionally enabled
+SIMD_IMPL_INTRIN_1(reinterpret_u8_u64, vu8, vu64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s8_u64, vs8, vu64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u16_u64, vu16, vu64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s16_u64, vs16, vu64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u32_u64, vu32, vu64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s32_u64, vs32, vu64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u64_u64, vu64, vu64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s64_u64, vs64, vu64)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F32 // f32 target is compiled only when NPY_SIMD_F32 is non-zero
+SIMD_IMPL_INTRIN_1(reinterpret_f32_u64, vf32, vu64)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F64 // f64 target is compiled only when NPY_SIMD_F64 is non-zero
+SIMD_IMPL_INTRIN_1(reinterpret_f64_u64, vf64, vu64)
+#endif // simd_sup2
+
+
+/**
+ * special definition due to the nature of intrinsics
+ * npyv_setf_u64 and npyv_set_u64.
+*/
+#line 308
+/*
+ * Python wrapper around npyv_setf_u64(): the positional arguments are
+ * converted to a native lane buffer via simd_sequence_from_iterable() and
+ * handed to the intrinsic; the resulting vector is returned as a Python
+ * object, or NULL if the conversion fails.
+ */
+static PyObject *
+simd__intrin_setf_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    npyv_lanetype_u64 *lanes = simd_sequence_from_iterable(
+        args, simd_data_qu64, npyv_nlanes_u64
+    );
+    if (!lanes) {
+        return NULL;
+    }
+    simd_data result = {.vu64 = npyv_setf_u64(
+        lanes[0],  lanes[1],  lanes[2],  lanes[3],
+        lanes[4],  lanes[5],  lanes[6],  lanes[7],
+        lanes[8],  lanes[9],  lanes[10], lanes[11],
+        lanes[12], lanes[13], lanes[14], lanes[15],
+        lanes[16], lanes[17], lanes[18], lanes[19],
+        lanes[20], lanes[21], lanes[22], lanes[23],
+        lanes[24], lanes[25], lanes[26], lanes[27],
+        lanes[28], lanes[29], lanes[30], lanes[31],
+        lanes[32], lanes[33], lanes[34], lanes[35],
+        lanes[36], lanes[37], lanes[38], lanes[39],
+        lanes[40], lanes[41], lanes[42], lanes[43],
+        lanes[44], lanes[45], lanes[46], lanes[47],
+        lanes[48], lanes[49], lanes[50], lanes[51],
+        lanes[52], lanes[53], lanes[54], lanes[55],
+        lanes[56], lanes[57], lanes[58], lanes[59],
+        lanes[60], lanes[61], lanes[62], lanes[63],
+        lanes[64] // for setf
+    )};
+    simd_sequence_free(lanes);
+    return (PyObject*)PySIMDVector_FromData(result, simd_data_vu64);
+}
+
+#line 308
+/*
+ * Python wrapper around npyv_set_u64(): the positional arguments are
+ * converted to a native lane buffer via simd_sequence_from_iterable() and
+ * handed to the intrinsic; the resulting vector is returned as a Python
+ * object, or NULL if the conversion fails.
+ */
+static PyObject *
+simd__intrin_set_u64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    npyv_lanetype_u64 *lanes = simd_sequence_from_iterable(
+        args, simd_data_qu64, npyv_nlanes_u64
+    );
+    if (!lanes) {
+        return NULL;
+    }
+    simd_data result = {.vu64 = npyv_set_u64(
+        lanes[0],  lanes[1],  lanes[2],  lanes[3],
+        lanes[4],  lanes[5],  lanes[6],  lanes[7],
+        lanes[8],  lanes[9],  lanes[10], lanes[11],
+        lanes[12], lanes[13], lanes[14], lanes[15],
+        lanes[16], lanes[17], lanes[18], lanes[19],
+        lanes[20], lanes[21], lanes[22], lanes[23],
+        lanes[24], lanes[25], lanes[26], lanes[27],
+        lanes[28], lanes[29], lanes[30], lanes[31],
+        lanes[32], lanes[33], lanes[34], lanes[35],
+        lanes[36], lanes[37], lanes[38], lanes[39],
+        lanes[40], lanes[41], lanes[42], lanes[43],
+        lanes[44], lanes[45], lanes[46], lanes[47],
+        lanes[48], lanes[49], lanes[50], lanes[51],
+        lanes[52], lanes[53], lanes[54], lanes[55],
+        lanes[56], lanes[57], lanes[58], lanes[59],
+        lanes[60], lanes[61], lanes[62], lanes[63],
+        lanes[64] // for setf
+    )};
+    simd_sequence_free(lanes);
+    return (PyObject*)PySIMDVector_FromData(result, simd_data_vu64);
+}
+
+
+/***************************
+ * Reorder
+ ***************************/
+#line 337
+SIMD_IMPL_INTRIN_2(combinel_u64, vu64, vu64, vu64) // two vu64 operands; first dtype is presumably the return type (single vector)
+
+#line 337
+SIMD_IMPL_INTRIN_2(combineh_u64, vu64, vu64, vu64)
+
+
+#line 343
+SIMD_IMPL_INTRIN_2(combine_u64, vu64x2, vu64, vu64) // vu64x2 here, unlike combinel/combineh above - presumably a pair of vectors; confirm
+
+#line 343
+SIMD_IMPL_INTRIN_2(zip_u64, vu64x2, vu64, vu64)
+
+#line 343
+SIMD_IMPL_INTRIN_2(unzip_u64, vu64x2, vu64, vu64)
+
+
+#if 0 // constant-false: the rev64_u64 wrapper is not compiled in this instantiation
+SIMD_IMPL_INTRIN_1(rev64_u64, vu64, vu64)
+#endif
+
+// special implementation to convert runtime constants to immediate values
+#if 64 == 32
+// Each runtime index gets its own broadcast permute with immediate operands;
+// the four results are then merged lane by lane through memory, which avoids
+// unrolling every possible index combination.
+NPY_FINLINE npyv_u64
+npyv_permi128_u64_(npyv_u64 a, unsigned e0, unsigned e1, unsigned e2, unsigned e3)
+{
+    npyv_u64 ve0, ve1, ve2, ve3;
+    npyv_lanetype_u64 de0[npyv_nlanes_u64];
+    npyv_lanetype_u64 de1[npyv_nlanes_u64];
+    npyv_lanetype_u64 de2[npyv_nlanes_u64];
+    npyv_lanetype_u64 de3[npyv_nlanes_u64];
+
+    // any index outside 1..3 falls back to lane 0, as in the if-chain form
+    switch (e0) {
+        case 1:  ve0 = npyv_permi128_u64(a, 1, 1, 1, 1); break;
+        case 2:  ve0 = npyv_permi128_u64(a, 2, 2, 2, 2); break;
+        case 3:  ve0 = npyv_permi128_u64(a, 3, 3, 3, 3); break;
+        default: ve0 = npyv_permi128_u64(a, 0, 0, 0, 0); break;
+    }
+    npyv_store_u64(de0, ve0);
+
+    switch (e1) {
+        case 1:  ve1 = npyv_permi128_u64(a, 1, 1, 1, 1); break;
+        case 2:  ve1 = npyv_permi128_u64(a, 2, 2, 2, 2); break;
+        case 3:  ve1 = npyv_permi128_u64(a, 3, 3, 3, 3); break;
+        default: ve1 = npyv_permi128_u64(a, 0, 0, 0, 0); break;
+    }
+    npyv_store_u64(de1, ve1);
+
+    switch (e2) {
+        case 1:  ve2 = npyv_permi128_u64(a, 1, 1, 1, 1); break;
+        case 2:  ve2 = npyv_permi128_u64(a, 2, 2, 2, 2); break;
+        case 3:  ve2 = npyv_permi128_u64(a, 3, 3, 3, 3); break;
+        default: ve2 = npyv_permi128_u64(a, 0, 0, 0, 0); break;
+    }
+    npyv_store_u64(de2, ve2);
+
+    switch (e3) {
+        case 1:  ve3 = npyv_permi128_u64(a, 1, 1, 1, 1); break;
+        case 2:  ve3 = npyv_permi128_u64(a, 2, 2, 2, 2); break;
+        case 3:  ve3 = npyv_permi128_u64(a, 3, 3, 3, 3); break;
+        default: ve3 = npyv_permi128_u64(a, 0, 0, 0, 0); break;
+    }
+    npyv_store_u64(de3, ve3);
+
+    // all four indices equal: the first broadcast already is the answer
+    if (e0 == e1 && e0 == e2 && e0 == e3) {
+        return ve0;
+    }
+    // within each group of four lanes, lane k comes from the k-th broadcast
+    for (int base = 0; base < npyv_nlanes_u64; base += 4) {
+        de0[base + 1] = de1[base + 1];
+        de0[base + 2] = de2[base + 2];
+        de0[base + 3] = de3[base + 3];
+    }
+    return npyv_load_u64(de0);
+}
+SIMD_IMPL_INTRIN_5(permi128_u64_, vu64, vu64, u8, u8, u8, u8)
+#elif 64 == 64
+NPY_FINLINE npyv_u64
+npyv_permi128_u64_(npyv_u64 a, unsigned e0, unsigned e1)
+{
+    // Dispatch the runtime index pair to the call with matching literal
+    // indices; any pair not listed ends up at the (0, 0) call.
+    if (e0 == 1) {
+        if (e1 == 0) {
+            return npyv_permi128_u64(a, 1, 0);
+        }
+        if (e1 == 1) {
+            return npyv_permi128_u64(a, 1, 1);
+        }
+    }
+    else if (e0 == 0 && e1 == 1) {
+        return npyv_permi128_u64(a, 0, 1);
+    }
+    return npyv_permi128_u64(a, 0, 0);
+}
+SIMD_IMPL_INTRIN_3(permi128_u64_, vu64, vu64, u8, u8)
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+// Shift wrappers. The guard compares two literals and is always true here.
+// NOTE(review): the trailing 63 / 64 of the *IMM variants presumably bound
+// the accepted immediate - confirm in the SIMD_IMPL_INTRIN_2IMM definition.
+#if 63 > 0
+SIMD_IMPL_INTRIN_2(shl_u64, vu64, vu64, u8)
+SIMD_IMPL_INTRIN_2(shr_u64, vu64, vu64, u8)
+// immediate constant
+SIMD_IMPL_INTRIN_2IMM(shli_u64, vu64, vu64, 63)
+SIMD_IMPL_INTRIN_2IMM(shri_u64, vu64, vu64, 64)
+#endif // shl_imm
+
+// Bitwise wrappers: vu64 operands, vu64 result.
+#line 418
+SIMD_IMPL_INTRIN_2(and_u64, vu64, vu64, vu64)
+
+#line 418
+SIMD_IMPL_INTRIN_2(or_u64, vu64, vu64, vu64)
+
+#line 418
+SIMD_IMPL_INTRIN_2(xor_u64, vu64, vu64, vu64)
+
+
+SIMD_IMPL_INTRIN_1(not_u64, vu64, vu64)
+
+// Comparison wrappers: vu64 operands, vb64 (boolean vector) result.
+#line 426
+SIMD_IMPL_INTRIN_2(cmpeq_u64, vb64, vu64, vu64)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpneq_u64, vb64, vu64, vu64)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpgt_u64, vb64, vu64, vu64)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpge_u64, vb64, vu64, vu64)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmplt_u64, vb64, vu64, vu64)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmple_u64, vb64, vu64, vu64)
+
+
+// Guard is the constant 0: andc/orc/xnor wrappers are compiled out for u64.
+#if 0
+SIMD_IMPL_INTRIN_2(andc_u64, vu64, vu64, vu64)
+SIMD_IMPL_INTRIN_2(andc_b64, vb64, vb64, vb64)
+SIMD_IMPL_INTRIN_2(orc_b64, vb64, vb64, vb64)
+SIMD_IMPL_INTRIN_2(xnor_b64, vb64, vb64, vb64)
+#endif
+
+// test cross all vector lanes
+#line 440
+SIMD_IMPL_INTRIN_1(any_u64, u8, vu64)
+
+#line 440
+SIMD_IMPL_INTRIN_1(all_u64, u8, vu64)
+
+/***************************
+ * Conversion
+ ***************************/
+// Wrappers converting between the boolean vector type (vb64) and vu64,
+// one for each direction.
+SIMD_IMPL_INTRIN_1(cvt_u64_b64, vu64,  vb64)
+SIMD_IMPL_INTRIN_1(cvt_b64_u64, vb64, vu64)
+// Guard is the constant 0: the expand wrapper is compiled out for u64.
+#if 0
+SIMD_IMPL_INTRIN_1(expand_u64_u64, vu64x2, vu64)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+// Every "#if 0" / "#if 1" in this section is a literal constant, so for u64
+// only add, sub, divisor/divc and sum are compiled in; the saturated,
+// mul, div, fused and sumup wrappers are compiled out.
+#line 456
+SIMD_IMPL_INTRIN_2(add_u64, vu64, vu64, vu64)
+
+#line 456
+SIMD_IMPL_INTRIN_2(sub_u64, vu64, vu64, vu64)
+
+
+#if 0
+#line 463
+SIMD_IMPL_INTRIN_2(adds_u64, vu64, vu64, vu64)
+
+#line 463
+SIMD_IMPL_INTRIN_2(subs_u64, vu64, vu64, vu64)
+
+#endif // sat_sup
+
+#if 0
+SIMD_IMPL_INTRIN_2(mul_u64, vu64, vu64, vu64)
+#endif // mul_sup
+
+#if 0
+SIMD_IMPL_INTRIN_2(div_u64, vu64, vu64, vu64)
+#endif // div_sup
+
+// divisor_u64 is declared to take a scalar u64 and return vu64x3; divc_u64
+// takes a vu64 plus a vu64x3 as its second operand.
+#if 1
+SIMD_IMPL_INTRIN_1(divisor_u64, vu64x3, u64)
+SIMD_IMPL_INTRIN_2(divc_u64, vu64, vu64, vu64x3)
+#endif // intdiv_sup
+
+#if 0
+#line 484
+SIMD_IMPL_INTRIN_3(muladd_u64, vu64, vu64, vu64, vu64)
+
+#line 484
+SIMD_IMPL_INTRIN_3(mulsub_u64, vu64, vu64, vu64, vu64)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmuladd_u64, vu64, vu64, vu64, vu64)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmulsub_u64, vu64, vu64, vu64, vu64)
+
+#line 484
+SIMD_IMPL_INTRIN_3(muladdsub_u64, vu64, vu64, vu64, vu64)
+
+#endif // fused_sup
+
+// Reduction wrapper: vu64 in, scalar u64 out.
+#if 1
+SIMD_IMPL_INTRIN_1(sum_u64, u64, vu64)
+#endif // sum_sup
+
+#if 0
+SIMD_IMPL_INTRIN_1(sumup_u64, u64, vu64)
+#endif // sumup_sup
+
+/***************************
+ * Math
+ ***************************/
+// Guard is the constant 0: none of the wrappers in this group (sqrt ...
+// floor) is compiled for u64.
+#if 0
+#line 503
+SIMD_IMPL_INTRIN_1(sqrt_u64, vu64, vu64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(recip_u64, vu64, vu64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(abs_u64, vu64, vu64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(square_u64, vu64, vu64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(rint_u64, vu64, vu64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(ceil_u64, vu64, vu64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(trunc_u64, vu64, vu64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(floor_u64, vu64, vu64)
+
+#endif
+
+// max/min come in pairs: a two-vector form returning vu64 and a
+// reduce_* form returning a scalar u64.
+#line 510
+SIMD_IMPL_INTRIN_2(max_u64, vu64, vu64, vu64)
+SIMD_IMPL_INTRIN_1(reduce_max_u64, u64, vu64)
+
+#line 510
+SIMD_IMPL_INTRIN_2(min_u64, vu64, vu64, vu64)
+SIMD_IMPL_INTRIN_1(reduce_min_u64, u64, vu64)
+
+
+// Guard is the constant 0: the maxp/minp/maxn/minn variants are compiled
+// out for u64.
+#if 0
+#line 518
+SIMD_IMPL_INTRIN_2(maxp_u64, vu64, vu64, vu64)
+SIMD_IMPL_INTRIN_1(reduce_maxp_u64, u64, vu64)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minp_u64, vu64, vu64, vu64)
+SIMD_IMPL_INTRIN_1(reduce_minp_u64, u64, vu64)
+
+#line 518
+SIMD_IMPL_INTRIN_2(maxn_u64, vu64, vu64, vu64)
+SIMD_IMPL_INTRIN_1(reduce_maxn_u64, u64, vu64)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minn_u64, vu64, vu64, vu64)
+SIMD_IMPL_INTRIN_1(reduce_minn_u64, u64, vu64)
+
+/**end repeat1**/
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+// Four-argument wrappers whose first operand is a boolean vector (vb64)
+// followed by three vu64 operands.
+#line 530
+ SIMD_IMPL_INTRIN_4(ifadd_u64, vu64, vb64, vu64, vu64, vu64)
+
+#line 530
+ SIMD_IMPL_INTRIN_4(ifsub_u64, vu64, vb64, vu64, vu64, vu64)
+
+
+// Guard is the constant 0: ifdiv/ifdivz are compiled out for u64.
+#if 0
+SIMD_IMPL_INTRIN_4(ifdiv_u64, vu64, vb64, vu64, vu64, vu64)
+SIMD_IMPL_INTRIN_3(ifdivz_u64, vu64, vb64, vu64, vu64)
+#endif
+
+#endif // simd_sup
+
+#line 36
+#if 1
+/***************************
+ * Memory
+ ***************************/
+// Contiguous load wrappers for the s64 suffix: each takes a sequence
+// argument (qs64) and is declared to return a vector (vs64).
+#line 43
+SIMD_IMPL_INTRIN_1(load_s64, vs64, qs64)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loada_s64, vs64, qs64)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loads_s64, vs64, qs64)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loadl_s64, vs64, qs64)
+
+// Same input type, but the declared result is the two-vector type vs64x2.
+SIMD_IMPL_INTRIN_1(load_s64x2, vs64x2, qs64)
+
+#line 51
+// Hand-written wrapper for npyv_store_s64(): the intrinsic writes into the
+// converted sequence buffer, which is then copied back into the Python
+// sequence object. Returns None, or NULL when parsing or write-back fails.
+static PyObject *
+simd__intrin_store_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs64};
+    simd_arg src = {.dtype = simd_data_vs64};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&:store_s64",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store_s64(dst.data.qs64, src.data.vs64);
+
+    // write-back: a non-zero result means failure and leaves result NULL
+    PyObject *result = NULL;
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs64, simd_data_qs64)) {
+        result = Py_None;
+        Py_INCREF(result);
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+
+#line 51
+// Hand-written wrapper for npyv_storea_s64(): the intrinsic writes into the
+// converted sequence buffer, which is then copied back into the Python
+// sequence object. Returns None, or NULL when parsing or write-back fails.
+static PyObject *
+simd__intrin_storea_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs64};
+    simd_arg src = {.dtype = simd_data_vs64};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&:storea_s64",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storea_s64(dst.data.qs64, src.data.vs64);
+
+    // write-back: a non-zero result means failure and leaves result NULL
+    PyObject *result = NULL;
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs64, simd_data_qs64)) {
+        result = Py_None;
+        Py_INCREF(result);
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+
+#line 51
+// Hand-written wrapper for npyv_stores_s64(): the intrinsic writes into the
+// converted sequence buffer, which is then copied back into the Python
+// sequence object. Returns None, or NULL when parsing or write-back fails.
+static PyObject *
+simd__intrin_stores_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs64};
+    simd_arg src = {.dtype = simd_data_vs64};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&:stores_s64",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_stores_s64(dst.data.qs64, src.data.vs64);
+
+    // write-back: a non-zero result means failure and leaves result NULL
+    PyObject *result = NULL;
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs64, simd_data_qs64)) {
+        result = Py_None;
+        Py_INCREF(result);
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+
+#line 51
+// Hand-written wrapper for npyv_storel_s64(): the intrinsic writes into the
+// converted sequence buffer, which is then copied back into the Python
+// sequence object. Returns None, or NULL when parsing or write-back fails.
+static PyObject *
+simd__intrin_storel_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs64};
+    simd_arg src = {.dtype = simd_data_vs64};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&:storel_s64",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storel_s64(dst.data.qs64, src.data.vs64);
+
+    // write-back: a non-zero result means failure and leaves result NULL
+    PyObject *result = NULL;
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs64, simd_data_qs64)) {
+        result = Py_None;
+        Py_INCREF(result);
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+
+#line 51
+// Hand-written wrapper for npyv_storeh_s64(): the intrinsic writes into the
+// converted sequence buffer, which is then copied back into the Python
+// sequence object. Returns None, or NULL when parsing or write-back fails.
+static PyObject *
+simd__intrin_storeh_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs64};
+    simd_arg src = {.dtype = simd_data_vs64};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&:storeh_s64",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storeh_s64(dst.data.qs64, src.data.vs64);
+
+    // write-back: a non-zero result means failure and leaves result NULL
+    PyObject *result = NULL;
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs64, simd_data_qs64)) {
+        result = Py_None;
+        Py_INCREF(result);
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+
+#line 51
+// Hand-written wrapper for npyv_store_s64x2(): takes a sequence and a
+// two-vector argument (vs64x2); the intrinsic writes into the converted
+// sequence buffer, which is then copied back into the Python sequence
+// object. Returns None, or NULL when parsing or write-back fails.
+static PyObject *
+simd__intrin_store_s64x2(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs64};
+    simd_arg src = {.dtype = simd_data_vs64x2};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&:store_s64x2",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store_s64x2(dst.data.qs64, src.data.vs64x2);
+
+    // write-back: a non-zero result means failure and leaves result NULL
+    PyObject *result = NULL;
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs64, simd_data_qs64)) {
+        result = Py_None;
+        Py_INCREF(result);
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 1
+// Partial Load
+// load_till takes (sequence, u32 lane count, s64 fill value); load_tillz
+// takes only (sequence, u32 lane count).
+SIMD_IMPL_INTRIN_3(load_till_s64, vs64, qs64, u32, s64)
+SIMD_IMPL_INTRIN_2(load_tillz_s64, vs64, qs64, u32)
+// The guard compares two literals and is false here, so the #else branch is
+// the one compiled; both branches currently contain identical lines.
+#if 64 == 32
+    SIMD_IMPL_INTRIN_4(load2_till_s64, vs64, qs64, u32, s64, s64)
+    SIMD_IMPL_INTRIN_2(load2_tillz_s64, vs64, qs64, u32)
+#else
+    SIMD_IMPL_INTRIN_4(load2_till_s64, vs64, qs64, u32, s64, s64)
+    SIMD_IMPL_INTRIN_2(load2_tillz_s64, vs64, qs64, u32)
+#endif
+
+// Partial Store
+#line 95
+#if !0 || 0 == 64
+// Wrapper for npyv_store_till_s64(). Python arguments, in order:
+// (sequence, u32 lane count, vector). After the partial store the buffer is
+// copied back into the sequence object. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_store_till_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs64};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vs64};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&O&:store_till_s64",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store_till_s64(dst.data.qs64, count.data.u32, src.data.vs64);
+
+    // write-back: a non-zero result means failure and leaves result NULL
+    PyObject *result = NULL;
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs64, simd_data_qs64)) {
+        result = Py_None;
+        Py_INCREF(result);
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+#endif // chksize
+
+
+#line 95
+#if !32 || 32 == 64
+// Wrapper for npyv_store2_till_s64(). Python arguments, in order:
+// (sequence, u32 lane count, vector). After the partial store the buffer is
+// copied back into the sequence object. Returns None, or NULL on failure.
+// This copy sits behind a guard made of literals that evaluates to false.
+static PyObject *
+simd__intrin_store2_till_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs64};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vs64};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&O&:store2_till_s64",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store2_till_s64(dst.data.qs64, count.data.u32, src.data.vs64);
+
+    // write-back: a non-zero result means failure and leaves result NULL
+    PyObject *result = NULL;
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs64, simd_data_qs64)) {
+        result = Py_None;
+        Py_INCREF(result);
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+#endif // chksize
+
+
+#line 95
+#if !64 || 64 == 64
+// Wrapper for npyv_store2_till_s64(). Python arguments, in order:
+// (sequence, u32 lane count, vector). After the partial store the buffer is
+// copied back into the sequence object. Returns None, or NULL on failure.
+static PyObject *
+simd__intrin_store2_till_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qs64};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vs64};
+    int parsed = PyArg_ParseTuple(
+        args, "O&O&O&:store2_till_s64",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store2_till_s64(dst.data.qs64, count.data.u32, src.data.vs64);
+
+    // write-back: a non-zero result means failure and leaves result NULL
+    PyObject *result = NULL;
+    if (!simd_sequence_fill_iterable(dst.obj, dst.data.qs64, simd_data_qs64)) {
+        result = Py_None;
+        Py_INCREF(result);
+    }
+    simd_arg_free(&dst);
+    return result;
+}
+#endif // chksize
+
+
+// Non-contiguous Load
+#line 136
+#if !0 || 0 == 64
+/*
+ * Wrapper for npyv_loadn_s64() (non-contiguous load).
+ *
+ * Python arguments, in order: (sequence, s64 stride).
+ * Returns the object built by simd_arg_to_obj() from the loaded vector, or
+ * NULL with ValueError set when the sequence is shorter than
+ * |stride| * npyv_nlanes_s64 elements.
+ */
+static PyObject *
+simd__intrin_loadn_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn_s64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s64 *seq_ptr = seq_arg.data.qs64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s64;
+    if (stride < 0) {
+        // negative stride: start at the last element of the sequence
+        seq_ptr += cur_seq_len - 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // All three values are pointer-sized integers, so they must be
+        // formatted with "%zd"; "%d" would consume them as int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn_s64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s64 rvec = npyv_loadn_s64(seq_ptr, stride);
+    simd_arg ret = {
+        .dtype = simd_data_vs64, .data = {.vs64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 64
+/*
+ * Wrapper for npyv_loadn2_s64() (non-contiguous load).
+ * This copy sits behind a guard made of literals that evaluates to false.
+ *
+ * Python arguments, in order: (sequence, s64 stride).
+ * Returns the object built by simd_arg_to_obj() from the loaded vector, or
+ * NULL with ValueError set when the sequence is shorter than
+ * |stride| * npyv_nlanes_s64 elements.
+ */
+static PyObject *
+simd__intrin_loadn2_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_s64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s64 *seq_ptr = seq_arg.data.qs64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s64;
+    if (stride < 0) {
+        // negative stride: start two elements before the end of the
+        // sequence (presumably the last pair - confirm with npyv_loadn2)
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // All three values are pointer-sized integers, so they must be
+        // formatted with "%zd"; "%d" would consume them as int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_s64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s64 rvec = npyv_loadn2_s64(seq_ptr, stride);
+    simd_arg ret = {
+        .dtype = simd_data_vs64, .data = {.vs64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 64
+/*
+ * Wrapper for npyv_loadn2_s64() (non-contiguous load).
+ *
+ * Python arguments, in order: (sequence, s64 stride).
+ * Returns the object built by simd_arg_to_obj() from the loaded vector, or
+ * NULL with ValueError set when the sequence is shorter than
+ * |stride| * npyv_nlanes_s64 elements.
+ */
+static PyObject *
+simd__intrin_loadn2_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_s64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s64 *seq_ptr = seq_arg.data.qs64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s64;
+    if (stride < 0) {
+        // negative stride: start two elements before the end of the
+        // sequence (presumably the last pair - confirm with npyv_loadn2)
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // All three values are pointer-sized integers, so they must be
+        // formatted with "%zd"; "%d" would consume them as int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_s64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s64 rvec = npyv_loadn2_s64(seq_ptr, stride);
+    simd_arg ret = {
+        .dtype = simd_data_vs64, .data = {.vs64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 64
+/*
+ * Wrapper for npyv_loadn_till_s64() (non-contiguous partial load).
+ *
+ * Python arguments, in order: (sequence, s64 stride, u32 lane count,
+ * s64 fill value).
+ * Returns the object built by simd_arg_to_obj() from the loaded vector, or
+ * NULL with ValueError set when the sequence is shorter than
+ * |stride| * npyv_nlanes_s64 elements.
+ */
+static PyObject *
+simd__intrin_loadn_till_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg fill_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:loadn_till_s64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &fill_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s64 *seq_ptr = seq_arg.data.qs64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s64;
+    if (stride < 0) {
+        // negative stride: start at the last element of the sequence
+        seq_ptr += cur_seq_len - 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // All three values are pointer-sized integers, so they must be
+        // formatted with "%zd"; "%d" would consume them as int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn_till_s64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s64 rvec = npyv_loadn_till_s64(
+        seq_ptr, stride, nlane_arg.data.u32, fill_arg.data.s64
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs64, .data = {.vs64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 64
+/*
+ * Wrapper for npyv_loadn2_till_s64() (non-contiguous partial load).
+ * This copy sits behind a guard made of literals that evaluates to false.
+ *
+ * Python arguments, in order: (sequence, s64 stride, u32 lane count,
+ * s64 first fill value, s64 second fill value).
+ * Returns the object built by simd_arg_to_obj() from the loaded vector, or
+ * NULL with ValueError set when the sequence is shorter than
+ * |stride| * npyv_nlanes_s64 elements.
+ */
+static PyObject *
+simd__intrin_loadn2_till_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg fill_arg = {.dtype = simd_data_s64};
+    simd_arg fill2_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_s64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &fill_arg,
+        simd_arg_converter, &fill2_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s64 *seq_ptr = seq_arg.data.qs64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s64;
+    if (stride < 0) {
+        // negative stride: start two elements before the end of the
+        // sequence (presumably the last pair - confirm with npyv_loadn2)
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // All three values are pointer-sized integers, so they must be
+        // formatted with "%zd"; "%d" would consume them as int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_till_s64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s64 rvec = npyv_loadn2_till_s64(
+        seq_ptr, stride, nlane_arg.data.u32,
+        fill_arg.data.s64, fill2_arg.data.s64
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs64, .data = {.vs64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 64
+/*
+ * Wrapper for npyv_loadn2_till_s64() (non-contiguous partial load).
+ *
+ * Python arguments, in order: (sequence, s64 stride, u32 lane count,
+ * s64 first fill value, s64 second fill value).
+ * Returns the object built by simd_arg_to_obj() from the loaded vector, or
+ * NULL with ValueError set when the sequence is shorter than
+ * |stride| * npyv_nlanes_s64 elements.
+ */
+static PyObject *
+simd__intrin_loadn2_till_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg fill_arg = {.dtype = simd_data_s64};
+    simd_arg fill2_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_s64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &fill_arg,
+        simd_arg_converter, &fill2_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s64 *seq_ptr = seq_arg.data.qs64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s64;
+    if (stride < 0) {
+        // negative stride: start two elements before the end of the
+        // sequence (presumably the last pair - confirm with npyv_loadn2)
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // All three values are pointer-sized integers, so they must be
+        // formatted with "%zd"; "%d" would consume them as int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_till_s64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s64 rvec = npyv_loadn2_till_s64(
+        seq_ptr, stride, nlane_arg.data.u32,
+        fill_arg.data.s64, fill2_arg.data.s64
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs64, .data = {.vs64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 64
+/*
+ * Wrapper for npyv_loadn_tillz_s64() (non-contiguous partial load).
+ *
+ * Python arguments, in order: (sequence, s64 stride, u32 lane count).
+ * Returns the object built by simd_arg_to_obj() from the loaded vector, or
+ * NULL with ValueError set when the sequence is shorter than
+ * |stride| * npyv_nlanes_s64 elements.
+ */
+static PyObject *
+simd__intrin_loadn_tillz_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn_tillz_s64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s64 *seq_ptr = seq_arg.data.qs64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s64;
+    if (stride < 0) {
+        // negative stride: start at the last element of the sequence
+        seq_ptr += cur_seq_len - 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // All three values are pointer-sized integers, so they must be
+        // formatted with "%zd"; "%d" would consume them as int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn_tillz_s64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s64 rvec = npyv_loadn_tillz_s64(
+        seq_ptr, stride, nlane_arg.data.u32
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs64, .data = {.vs64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 64
+/*
+ * Wrapper for npyv_loadn2_tillz_s64() (non-contiguous partial load).
+ * This copy sits behind a guard made of literals that evaluates to false.
+ *
+ * Python arguments, in order: (sequence, s64 stride, u32 lane count).
+ * Returns the object built by simd_arg_to_obj() from the loaded vector, or
+ * NULL with ValueError set when the sequence is shorter than
+ * |stride| * npyv_nlanes_s64 elements.
+ */
+static PyObject *
+simd__intrin_loadn2_tillz_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_s64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s64 *seq_ptr = seq_arg.data.qs64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s64;
+    if (stride < 0) {
+        // negative stride: start two elements before the end of the
+        // sequence (presumably the last pair - confirm with npyv_loadn2)
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // All three values are pointer-sized integers, so they must be
+        // formatted with "%zd"; "%d" would consume them as int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_s64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s64 rvec = npyv_loadn2_tillz_s64(
+        seq_ptr, stride, nlane_arg.data.u32
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs64, .data = {.vs64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 64
+/*
+ * Wrapper for npyv_loadn2_tillz_s64() (non-contiguous partial load).
+ *
+ * Python arguments, in order: (sequence, s64 stride, u32 lane count).
+ * Returns the object built by simd_arg_to_obj() from the loaded vector, or
+ * NULL with ValueError set when the sequence is shorter than
+ * |stride| * npyv_nlanes_s64 elements.
+ */
+static PyObject *
+simd__intrin_loadn2_tillz_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_s64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s64 *seq_ptr = seq_arg.data.qs64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s64;
+    if (stride < 0) {
+        // negative stride: start two elements before the end of the
+        // sequence (presumably the last pair - confirm with npyv_loadn2)
+        seq_ptr += cur_seq_len - 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // All three values are pointer-sized integers, so they must be
+        // formatted with "%zd"; "%d" would consume them as int.
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_s64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_s64 rvec = npyv_loadn2_tillz_s64(
+        seq_ptr, stride, nlane_arg.data.u32
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vs64, .data = {.vs64=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+
+// Non-contiguous Store
+#line 216
+#if !0 || 0 == 64
+/*
+ * Wrapper for npyv_storen_s64() (non-contiguous store).
+ *
+ * Python arguments, in order: (sequence, s64 stride, vector).
+ * After the store the buffer is copied back into the sequence object.
+ * Returns None on success. Returns NULL when argument parsing or the
+ * write-back fails, or with ValueError set when the sequence is shorter
+ * than |stride| * npyv_nlanes_s64 elements.
+ */
+static PyObject *
+simd__intrin_storen_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vs64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen_s64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s64 *seq_ptr = seq_arg.data.qs64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s64;
+    if (stride < 0) {
+        // negative stride: start at the last element of the sequence
+        seq_ptr += cur_seq_len - 1;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // All three values are pointer-sized integers, so they must be
+        // formatted with "%zd"; "%d" would consume them as int. The first
+        // literal ends in a space so the pieces do not run together.
+        PyErr_Format(PyExc_ValueError,
+            "storen_s64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen_s64(seq_ptr, stride, vec_arg.data.vs64);
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs64, simd_data_qs64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 64
+/*
+ * Python wrapper for npyv_storen2_s64(): strided store of lane pairs.
+ * NOTE: this copy sits under `#if !32 || 32 == 64`, which evaluates to
+ * false, so it is compiled out; it is kept in sync with the enabled copy.
+ *
+ * Arguments unpacked from `args`, in order:
+ *   seq    - destination sequence of s64 lanes (simd_data_qs64)
+ *   stride - signed step, counted in elements (simd_data_s64)
+ *   vec    - s64 vector to store (simd_data_vs64)
+ *
+ * Returns None after the lanes were written back into the `seq` object.
+ * Returns NULL with an exception set if parsing fails, if `seq` is shorter
+ * than |stride| * npyv_nlanes_s64 elements (ValueError), or if the
+ * write-back fails.
+ */
+static PyObject *
+simd__intrin_storen2_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vs64};
+    // the name after ':' is what Python shows in argument-parsing errors
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_s64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s64 *seq_ptr = seq_arg.data.qs64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s64;
+    if (stride < 0) {
+        // negative stride: start from the last pair of elements
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd is the conversion that matches Py_ssize_t-sized arguments
+        PyErr_Format(PyExc_ValueError,
+            "storen2_s64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_s64(seq_ptr, stride, vec_arg.data.vs64);
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs64, simd_data_qs64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 64
+/*
+ * Python wrapper for npyv_storen2_s64(): strided store of lane pairs.
+ * This is the copy that is actually compiled (`#if !64 || 64 == 64`).
+ *
+ * Arguments unpacked from `args`, in order:
+ *   seq    - destination sequence of s64 lanes (simd_data_qs64)
+ *   stride - signed step, counted in elements (simd_data_s64)
+ *   vec    - s64 vector to store (simd_data_vs64)
+ *
+ * Returns None after the lanes were written back into the `seq` object.
+ * Returns NULL with an exception set if parsing fails, if `seq` is shorter
+ * than |stride| * npyv_nlanes_s64 elements (ValueError), or if the
+ * write-back fails.
+ */
+static PyObject *
+simd__intrin_storen2_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vs64};
+    // the name after ':' is what Python shows in argument-parsing errors
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_s64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s64 *seq_ptr = seq_arg.data.qs64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s64;
+    if (stride < 0) {
+        // negative stride: start from the last pair of elements
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd is the conversion that matches Py_ssize_t-sized arguments
+        PyErr_Format(PyExc_ValueError,
+            "storen2_s64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_s64(seq_ptr, stride, vec_arg.data.vs64);
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs64, simd_data_qs64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !0 || 0 == 64
+/*
+ * Python wrapper for npyv_storen_till_s64(): strided, partial store.
+ *
+ * Arguments unpacked from `args`, in order:
+ *   seq    - destination sequence of s64 lanes (simd_data_qs64)
+ *   stride - signed step, counted in elements (simd_data_s64)
+ *   nlane  - lane count forwarded to the intrinsic (simd_data_u32)
+ *   vec    - s64 vector to store (simd_data_vs64)
+ *
+ * Returns None after the lanes were written back into the `seq` object.
+ * Returns NULL with an exception set if parsing fails, if `seq` is shorter
+ * than |stride| * npyv_nlanes_s64 elements (ValueError), or if the
+ * write-back fails.
+ */
+static PyObject *
+simd__intrin_storen_till_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg vec_arg = {.dtype = simd_data_vs64};
+    // the name after ':' is what Python shows in argument-parsing errors
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen_till_s64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s64 *seq_ptr = seq_arg.data.qs64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s64;
+    if (stride < 0) {
+        // negative stride: start from the last element of the sequence
+        seq_ptr += cur_seq_len - 1*1;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd is the conversion that matches Py_ssize_t-sized arguments
+        PyErr_Format(PyExc_ValueError,
+            "storen_till_s64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen_till_s64(
+        seq_ptr, stride, nlane_arg.data.u32, vec_arg.data.vs64
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs64, simd_data_qs64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 64
+/*
+ * Python wrapper for npyv_storen2_till_s64(): strided, partial store of
+ * lane pairs.
+ * NOTE: this copy sits under `#if !32 || 32 == 64`, which evaluates to
+ * false, so it is compiled out; it is kept in sync with the enabled copy.
+ *
+ * Arguments unpacked from `args`, in order:
+ *   seq    - destination sequence of s64 lanes (simd_data_qs64)
+ *   stride - signed step, counted in elements (simd_data_s64)
+ *   nlane  - lane count forwarded to the intrinsic (simd_data_u32)
+ *   vec    - s64 vector to store (simd_data_vs64)
+ *
+ * Returns None after the lanes were written back into the `seq` object.
+ * Returns NULL with an exception set if parsing fails, if `seq` is shorter
+ * than |stride| * npyv_nlanes_s64 elements (ValueError), or if the
+ * write-back fails.
+ */
+static PyObject *
+simd__intrin_storen2_till_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg vec_arg = {.dtype = simd_data_vs64};
+    // the name after ':' is what Python shows in argument-parsing errors
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_s64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s64 *seq_ptr = seq_arg.data.qs64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s64;
+    if (stride < 0) {
+        // negative stride: start from the last pair of elements
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd is the conversion that matches Py_ssize_t-sized arguments
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_s64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_till_s64(
+        seq_ptr, stride, nlane_arg.data.u32, vec_arg.data.vs64
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs64, simd_data_qs64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 64
+/*
+ * Python wrapper for npyv_storen2_till_s64(): strided, partial store of
+ * lane pairs. This is the copy that is actually compiled
+ * (`#if !64 || 64 == 64`).
+ *
+ * Arguments unpacked from `args`, in order:
+ *   seq    - destination sequence of s64 lanes (simd_data_qs64)
+ *   stride - signed step, counted in elements (simd_data_s64)
+ *   nlane  - lane count forwarded to the intrinsic (simd_data_u32)
+ *   vec    - s64 vector to store (simd_data_vs64)
+ *
+ * Returns None after the lanes were written back into the `seq` object.
+ * Returns NULL with an exception set if parsing fails, if `seq` is shorter
+ * than |stride| * npyv_nlanes_s64 elements (ValueError), or if the
+ * write-back fails.
+ */
+static PyObject *
+simd__intrin_storen2_till_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qs64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+    simd_arg vec_arg = {.dtype = simd_data_vs64};
+    // the name after ':' is what Python shows in argument-parsing errors
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_s64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg,
+        simd_arg_converter, &nlane_arg,
+        simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_s64 *seq_ptr = seq_arg.data.qs64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_s64;
+    if (stride < 0) {
+        // negative stride: start from the last pair of elements
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd is the conversion that matches Py_ssize_t-sized arguments
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_s64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_till_s64(
+        seq_ptr, stride, nlane_arg.data.u32, vec_arg.data.vs64
+    );
+    // write-back
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qs64, simd_data_qs64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#endif // 1
+
+/****************************
+ * Lookup tables
+ ****************************/
+// Only one of the two lookup-table wrappers is generated per lane width:
+// `64 == 32` is false here, so lut32_s64 is compiled out ...
+#if 64 == 32
+SIMD_IMPL_INTRIN_2(lut32_s64, vs64, qs64, vu64)
+#endif
+// ... while `64 == 64` is true, so lut16_s64 is the one that exists.
+#if 64 == 64
+SIMD_IMPL_INTRIN_2(lut16_s64, vs64, qs64, vu64)
+#endif
+/***************************
+ * Misc
+ ***************************/
+// NOTE(review): SIMD_IMPL_INTRIN_<N> presumably expands to the Python
+// wrapper for an N-argument intrinsic, with the return type listed right
+// after the name and the argument types after it - confirm against the
+// macro definition.
+SIMD_IMPL_INTRIN_0(zero_s64, vs64)
+SIMD_IMPL_INTRIN_1(extract0_s64, s64, vs64)
+SIMD_IMPL_INTRIN_1(setall_s64, vs64, s64)
+SIMD_IMPL_INTRIN_3(select_s64, vs64, vb64, vs64, vs64)
+
+// reinterpret_<dst>_s64 wrappers, one per destination lane type. The
+// integer destinations are unconditional (`#if 1`); the f32/f64 ones are
+// only generated when NPY_SIMD_F32 / NPY_SIMD_F64 is non-zero.
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u8_s64, vu8, vs64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s8_s64, vs8, vs64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u16_s64, vu16, vs64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s16_s64, vs16, vs64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u32_s64, vu32, vs64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s32_s64, vs32, vs64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u64_s64, vu64, vs64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s64_s64, vs64, vs64)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F32
+SIMD_IMPL_INTRIN_1(reinterpret_f32_s64, vf32, vs64)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F64
+SIMD_IMPL_INTRIN_1(reinterpret_f64_s64, vf64, vs64)
+#endif // simd_sup2
+
+
+/**
+ * Special definitions due to the nature of the intrinsics
+ * npyv_setf_s64 and npyv_set_s64.
+*/
+#line 308
+// Python wrapper for npyv_setf_s64(): builds an s64 vector from the
+// positional arguments. Returns a new vector object, or NULL (exception
+// set by the converter) when `args` cannot be turned into a lane sequence.
+static PyObject *
+simd__intrin_setf_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    // the whole argument tuple is converted into one lane buffer
+    // (presumably npyv_nlanes_s64 is the minimum accepted length - TODO confirm)
+    npyv_lanetype_s64 *data = simd_sequence_from_iterable(args, simd_data_qs64, npyv_nlanes_s64);
+    if (data == NULL) {
+        return NULL;
+    }
+    // NOTE(review): 65 operands are always spelled out regardless of the
+    // vector width. This is only safe if npyv_setf_s64 is a macro that drops
+    // the operands it does not need, so that data[] is never read past its
+    // real length - confirm against the macro definition.
+    simd_data r = {.vs64 = npyv_setf_s64(
+        data[0],  data[1],  data[2],  data[3],  data[4],  data[5],  data[6],  data[7],
+        data[8],  data[9],  data[10], data[11], data[12], data[13], data[14], data[15],
+        data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],
+        data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],
+        data[32], data[33], data[34], data[35], data[36], data[37], data[38], data[39],
+        data[40], data[41], data[42], data[43], data[44], data[45], data[46], data[47],
+        data[48], data[49], data[50], data[51], data[52], data[53], data[54], data[55],
+        data[56], data[57], data[58], data[59], data[60], data[61], data[62], data[63],
+        data[64] // for setf
+    )};
+    // the lane buffer is no longer needed once the vector has been built
+    simd_sequence_free(data);
+    return (PyObject*)PySIMDVector_FromData(r, simd_data_vs64);
+}
+
+#line 308
+// Python wrapper for npyv_set_s64(): builds an s64 vector from the
+// positional arguments. Returns a new vector object, or NULL (exception
+// set by the converter) when `args` cannot be turned into a lane sequence.
+static PyObject *
+simd__intrin_set_s64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    // the whole argument tuple is converted into one lane buffer
+    // (presumably npyv_nlanes_s64 is the minimum accepted length - TODO confirm)
+    npyv_lanetype_s64 *data = simd_sequence_from_iterable(args, simd_data_qs64, npyv_nlanes_s64);
+    if (data == NULL) {
+        return NULL;
+    }
+    // NOTE(review): 65 operands are always spelled out regardless of the
+    // vector width. This is only safe if npyv_set_s64 is a macro that drops
+    // the operands it does not need, so that data[] is never read past its
+    // real length - confirm against the macro definition.
+    simd_data r = {.vs64 = npyv_set_s64(
+        data[0],  data[1],  data[2],  data[3],  data[4],  data[5],  data[6],  data[7],
+        data[8],  data[9],  data[10], data[11], data[12], data[13], data[14], data[15],
+        data[16], data[17], data[18], data[19], data[20], data[21], data[22], data[23],
+        data[24], data[25], data[26], data[27], data[28], data[29], data[30], data[31],
+        data[32], data[33], data[34], data[35], data[36], data[37], data[38], data[39],
+        data[40], data[41], data[42], data[43], data[44], data[45], data[46], data[47],
+        data[48], data[49], data[50], data[51], data[52], data[53], data[54], data[55],
+        data[56], data[57], data[58], data[59], data[60], data[61], data[62], data[63],
+        data[64] // trailing operand shared with the setf variant of this wrapper
+    )};
+    // the lane buffer is no longer needed once the vector has been built
+    simd_sequence_free(data);
+    return (PyObject*)PySIMDVector_FromData(r, simd_data_vs64);
+}
+
+
+/***************************
+ * Reorder
+ ***************************/
+// Two-operand reorder wrappers that return a single vector (vs64).
+#line 337
+SIMD_IMPL_INTRIN_2(combinel_s64, vs64, vs64, vs64)
+
+#line 337
+SIMD_IMPL_INTRIN_2(combineh_s64, vs64, vs64, vs64)
+
+
+// Two-operand reorder wrappers that return a pair of vectors (vs64x2).
+#line 343
+SIMD_IMPL_INTRIN_2(combine_s64, vs64x2, vs64, vs64)
+
+#line 343
+SIMD_IMPL_INTRIN_2(zip_s64, vs64x2, vs64, vs64)
+
+#line 343
+SIMD_IMPL_INTRIN_2(unzip_s64, vs64x2, vs64, vs64)
+
+
+// rev64 is compiled out for the s64 instantiation (`#if 0`).
+#if 0
+SIMD_IMPL_INTRIN_1(rev64_s64, vs64, vs64)
+#endif
+
+// special implementation to convert runtime constants to immediate values
+#if 64 == 32
+// Four-selector variant; it lives under `#if 64 == 32`, which is false for
+// this instantiation, so it is compiled out here.
+//
+// Strategy: issue one broadcast permute per selector (eN picks the source
+// element, with any value other than 1, 2 or 3 treated as 0), spill each
+// result to a scratch array, then gather the wanted lanes into one vector,
+// instead of unrolling every possible selector combination.
+NPY_FINLINE npyv_s64
+npyv_permi128_s64_(npyv_s64 a, unsigned e0, unsigned e1, unsigned e2, unsigned e3)
+{
+   #line 360
+    npyv_s64 ve0;
+    npyv_lanetype_s64 de0[npyv_nlanes_s64];
+    if (0) {}
+   #line 366
+    else if (e0 == 1) {
+        ve0 = npyv_permi128_s64(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e0 == 2) {
+        ve0 = npyv_permi128_s64(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e0 == 3) {
+        ve0 = npyv_permi128_s64(a, 3, 3, 3, 3);
+    }
+    
+    else {
+        ve0 = npyv_permi128_s64(a, 0, 0, 0, 0);
+    }
+    npyv_store_s64(de0, ve0);
+    
+#line 360
+    npyv_s64 ve1;
+    npyv_lanetype_s64 de1[npyv_nlanes_s64];
+    if (0) {}
+   #line 366
+    else if (e1 == 1) {
+        ve1 = npyv_permi128_s64(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e1 == 2) {
+        ve1 = npyv_permi128_s64(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e1 == 3) {
+        ve1 = npyv_permi128_s64(a, 3, 3, 3, 3);
+    }
+    
+    else {
+        ve1 = npyv_permi128_s64(a, 0, 0, 0, 0);
+    }
+    npyv_store_s64(de1, ve1);
+    
+#line 360
+    npyv_s64 ve2;
+    npyv_lanetype_s64 de2[npyv_nlanes_s64];
+    if (0) {}
+   #line 366
+    else if (e2 == 1) {
+        ve2 = npyv_permi128_s64(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e2 == 2) {
+        ve2 = npyv_permi128_s64(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e2 == 3) {
+        ve2 = npyv_permi128_s64(a, 3, 3, 3, 3);
+    }
+    
+    else {
+        ve2 = npyv_permi128_s64(a, 0, 0, 0, 0);
+    }
+    npyv_store_s64(de2, ve2);
+    
+#line 360
+    npyv_s64 ve3;
+    npyv_lanetype_s64 de3[npyv_nlanes_s64];
+    if (0) {}
+   #line 366
+    else if (e3 == 1) {
+        ve3 = npyv_permi128_s64(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e3 == 2) {
+        ve3 = npyv_permi128_s64(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e3 == 3) {
+        ve3 = npyv_permi128_s64(a, 3, 3, 3, 3);
+    }
+    
+    else {
+        ve3 = npyv_permi128_s64(a, 0, 0, 0, 0);
+    }
+    npyv_store_s64(de3, ve3);
+    
+    // all selectors equal: the first broadcast already is the answer
+    if (e0 == e1 && e0 == e2 && e0 == e3) {
+        return ve0;
+    }
+    // within every group of four lanes keep lane 0 from de0 and take
+    // lanes 1, 2 and 3 from the broadcasts made for e1, e2 and e3
+    for (int i = 0; i < npyv_nlanes_s64; i += 4) {
+        de0[i+1] = de1[i+1];
+        de0[i+2] = de2[i+2];
+        de0[i+3] = de3[i+3];
+    }
+    return npyv_load_s64(de0);
+}
+SIMD_IMPL_INTRIN_5(permi128_s64_, vs64, vs64, u8, u8, u8, u8)
+#elif 64 == 64
+// Two-selector variant: maps the runtime selectors (e0, e1) onto the
+// matching call with literal operands, since the underlying intrinsic is
+// invoked with immediate values only.
+NPY_FINLINE npyv_s64
+npyv_permi128_s64_(npyv_s64 a, unsigned e0, unsigned e1)
+{
+    if (e0 == 1) {
+        if (e1 == 0) {
+            return npyv_permi128_s64(a, 1, 0);
+        }
+        if (e1 == 1) {
+            return npyv_permi128_s64(a, 1, 1);
+        }
+    }
+    else if (e0 == 0 && e1 == 1) {
+        return npyv_permi128_s64(a, 0, 1);
+    }
+    // (0, 0) and every selector pair outside {0, 1} x {0, 1}
+    return npyv_permi128_s64(a, 0, 0);
+}
+SIMD_IMPL_INTRIN_3(permi128_s64_, vs64, vs64, u8, u8)
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+// Shift wrappers; the guard `63 > 0` is true for this instantiation.
+#if 63 > 0
+// shift count supplied at run time as a u8
+SIMD_IMPL_INTRIN_2(shl_s64, vs64, vs64, u8)
+SIMD_IMPL_INTRIN_2(shr_s64, vs64, vs64, u8)
+// immediate constant
+// NOTE(review): the last macro argument is presumably the largest accepted
+// immediate; shli uses 63 while shri uses 64 - confirm this is intended.
+SIMD_IMPL_INTRIN_2IMM(shli_s64, vs64, vs64, 63)
+SIMD_IMPL_INTRIN_2IMM(shri_s64, vs64, vs64, 64)
+#endif // shl_imm
+
+// Bitwise wrappers: vector in, vector out.
+#line 418
+SIMD_IMPL_INTRIN_2(and_s64, vs64, vs64, vs64)
+
+#line 418
+SIMD_IMPL_INTRIN_2(or_s64, vs64, vs64, vs64)
+
+#line 418
+SIMD_IMPL_INTRIN_2(xor_s64, vs64, vs64, vs64)
+
+
+SIMD_IMPL_INTRIN_1(not_s64, vs64, vs64)
+
+// Comparison wrappers: two s64 vectors in, a boolean vector (vb64) out.
+#line 426
+SIMD_IMPL_INTRIN_2(cmpeq_s64, vb64, vs64, vs64)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpneq_s64, vb64, vs64, vs64)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpgt_s64, vb64, vs64, vs64)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpge_s64, vb64, vs64, vs64)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmplt_s64, vb64, vs64, vs64)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmple_s64, vb64, vs64, vs64)
+
+
+// andc/orc/xnor are compiled out for the s64 instantiation (`#if 0`).
+#if 0
+SIMD_IMPL_INTRIN_2(andc_s64, vs64, vs64, vs64)
+SIMD_IMPL_INTRIN_2(andc_b64, vb64, vb64, vb64)
+SIMD_IMPL_INTRIN_2(orc_b64, vb64, vb64, vb64)
+SIMD_IMPL_INTRIN_2(xnor_b64, vb64, vb64, vb64)
+#endif
+
+// tests across all vector lanes; the result is a scalar u8
+#line 440
+SIMD_IMPL_INTRIN_1(any_s64, u8, vs64)
+
+#line 440
+SIMD_IMPL_INTRIN_1(all_s64, u8, vs64)
+
+/***************************
+ * Conversion
+ ***************************/
+// Conversions between the boolean vector type (vb64) and vs64, one wrapper
+// per direction.
+SIMD_IMPL_INTRIN_1(cvt_s64_b64, vs64,  vb64)
+SIMD_IMPL_INTRIN_1(cvt_b64_s64, vb64, vs64)
+// expand is compiled out for the s64 instantiation (`#if 0`).
+#if 0
+SIMD_IMPL_INTRIN_1(expand_s64_s64, vs64x2, vs64)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+// add/sub are the only plain arithmetic wrappers generated unconditionally.
+#line 456
+SIMD_IMPL_INTRIN_2(add_s64, vs64, vs64, vs64)
+
+#line 456
+SIMD_IMPL_INTRIN_2(sub_s64, vs64, vs64, vs64)
+
+
+// adds/subs: compiled out for s64 (`#if 0`)
+#if 0
+#line 463
+SIMD_IMPL_INTRIN_2(adds_s64, vs64, vs64, vs64)
+
+#line 463
+SIMD_IMPL_INTRIN_2(subs_s64, vs64, vs64, vs64)
+
+#endif // sat_sup
+
+// mul: compiled out for s64 (`#if 0`)
+#if 0
+SIMD_IMPL_INTRIN_2(mul_s64, vs64, vs64, vs64)
+#endif // mul_sup
+
+// div: compiled out for s64 (`#if 0`)
+#if 0
+SIMD_IMPL_INTRIN_2(div_s64, vs64, vs64, vs64)
+#endif // div_sup
+
+// Enabled for s64: divisor_s64 turns a scalar into a vs64x3, which is the
+// type divc_s64 takes as its second operand.
+#if 1
+SIMD_IMPL_INTRIN_1(divisor_s64, vs64x3, s64)
+SIMD_IMPL_INTRIN_2(divc_s64, vs64, vs64, vs64x3)
+#endif // intdiv_sup
+
+// fused multiply-add family: compiled out for s64 (`#if 0`)
+#if 0
+#line 484
+SIMD_IMPL_INTRIN_3(muladd_s64, vs64, vs64, vs64, vs64)
+
+#line 484
+SIMD_IMPL_INTRIN_3(mulsub_s64, vs64, vs64, vs64, vs64)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmuladd_s64, vs64, vs64, vs64, vs64)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmulsub_s64, vs64, vs64, vs64, vs64)
+
+#line 484
+SIMD_IMPL_INTRIN_3(muladdsub_s64, vs64, vs64, vs64, vs64)
+
+#endif // fused_sup
+
+// sum: compiled out for s64 (`#if 0`)
+#if 0
+SIMD_IMPL_INTRIN_1(sum_s64, s64, vs64)
+#endif // sum_sup
+
+// sumup: compiled out for s64 (`#if 0`)
+#if 0
+SIMD_IMPL_INTRIN_1(sumup_s64, s64, vs64)
+#endif // sumup_sup
+
+/***************************
+ * Math
+ ***************************/
+// sqrt/recip/abs/square and the rounding wrappers are compiled out for the
+// s64 instantiation (`#if 0`).
+#if 0
+#line 503
+SIMD_IMPL_INTRIN_1(sqrt_s64, vs64, vs64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(recip_s64, vs64, vs64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(abs_s64, vs64, vs64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(square_s64, vs64, vs64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(rint_s64, vs64, vs64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(ceil_s64, vs64, vs64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(trunc_s64, vs64, vs64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(floor_s64, vs64, vs64)
+
+#endif
+
+// max/min: an element-wise wrapper (vector result) plus a reduce_ wrapper
+// (scalar s64 result) for each.
+#line 510
+SIMD_IMPL_INTRIN_2(max_s64, vs64, vs64, vs64)
+SIMD_IMPL_INTRIN_1(reduce_max_s64, s64, vs64)
+
+#line 510
+SIMD_IMPL_INTRIN_2(min_s64, vs64, vs64, vs64)
+SIMD_IMPL_INTRIN_1(reduce_min_s64, s64, vs64)
+
+
+// the maxp/minp/maxn/minn variants are compiled out for s64 (`#if 0`)
+#if 0
+#line 518
+SIMD_IMPL_INTRIN_2(maxp_s64, vs64, vs64, vs64)
+SIMD_IMPL_INTRIN_1(reduce_maxp_s64, s64, vs64)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minp_s64, vs64, vs64, vs64)
+SIMD_IMPL_INTRIN_1(reduce_minp_s64, s64, vs64)
+
+#line 518
+SIMD_IMPL_INTRIN_2(maxn_s64, vs64, vs64, vs64)
+SIMD_IMPL_INTRIN_1(reduce_maxn_s64, s64, vs64)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minn_s64, vs64, vs64, vs64)
+SIMD_IMPL_INTRIN_1(reduce_minn_s64, s64, vs64)
+
+/**end repeat1**/
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+// Masked add/sub wrappers: a boolean vector (vb64) followed by three vs64
+// operands, returning a vs64.
+#line 530
+ SIMD_IMPL_INTRIN_4(ifadd_s64, vs64, vb64, vs64, vs64, vs64)
+
+#line 530
+ SIMD_IMPL_INTRIN_4(ifsub_s64, vs64, vb64, vs64, vs64, vs64)
+
+
+// ifdiv/ifdivz are compiled out for the s64 instantiation (`#if 0`).
+#if 0
+SIMD_IMPL_INTRIN_4(ifdiv_s64, vs64, vb64, vs64, vs64, vs64)
+SIMD_IMPL_INTRIN_3(ifdivz_s64, vs64, vb64, vs64, vs64)
+#endif
+
+#endif // simd_sup
+
+#line 36
+#if NPY_SIMD_F32
+/***************************
+ * Memory
+ ***************************/
+#line 43
+// Contiguous load wrappers: each takes an f32 sequence (qf32) and returns
+// an f32 vector (vf32).
+SIMD_IMPL_INTRIN_1(load_f32, vf32, qf32)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loada_f32, vf32, qf32)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loads_f32, vf32, qf32)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loadl_f32, vf32, qf32)
+
+// same input type, but the result is a pair of vectors (vf32x2)
+SIMD_IMPL_INTRIN_1(load_f32x2, vf32x2, qf32)
+
+#line 51
+// Hand-written wrapper for npyv_store_f32(): the intrinsic writes into its
+// sequence argument, so the result has to be copied back into the Python
+// object that was passed in.
+static PyObject *
+simd__intrin_store_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf32};
+    simd_arg src = {.dtype = simd_data_vf32};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&:store_f32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store_f32(dst.data.qf32, src.data.vf32);
+    // write-back, then free the sequence argument on either outcome
+    const int wrote_back = !simd_sequence_fill_iterable(dst.obj, dst.data.qf32, simd_data_qf32);
+    simd_arg_free(&dst);
+    if (!wrote_back) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Hand-written wrapper for npyv_storea_f32(): the intrinsic writes into its
+// sequence argument, so the result has to be copied back into the Python
+// object that was passed in.
+static PyObject *
+simd__intrin_storea_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf32};
+    simd_arg src = {.dtype = simd_data_vf32};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&:storea_f32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storea_f32(dst.data.qf32, src.data.vf32);
+    // write-back, then free the sequence argument on either outcome
+    const int wrote_back = !simd_sequence_fill_iterable(dst.obj, dst.data.qf32, simd_data_qf32);
+    simd_arg_free(&dst);
+    if (!wrote_back) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Hand-written wrapper for npyv_stores_f32(): the intrinsic writes into its
+// sequence argument, so the result has to be copied back into the Python
+// object that was passed in.
+static PyObject *
+simd__intrin_stores_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf32};
+    simd_arg src = {.dtype = simd_data_vf32};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&:stores_f32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_stores_f32(dst.data.qf32, src.data.vf32);
+    // write-back, then free the sequence argument on either outcome
+    const int wrote_back = !simd_sequence_fill_iterable(dst.obj, dst.data.qf32, simd_data_qf32);
+    simd_arg_free(&dst);
+    if (!wrote_back) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Hand-written wrapper for npyv_storel_f32(): the intrinsic writes into its
+// sequence argument, so the result has to be copied back into the Python
+// object that was passed in.
+static PyObject *
+simd__intrin_storel_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf32};
+    simd_arg src = {.dtype = simd_data_vf32};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&:storel_f32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storel_f32(dst.data.qf32, src.data.vf32);
+    // write-back, then free the sequence argument on either outcome
+    const int wrote_back = !simd_sequence_fill_iterable(dst.obj, dst.data.qf32, simd_data_qf32);
+    simd_arg_free(&dst);
+    if (!wrote_back) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Hand-written wrapper for npyv_storeh_f32(): the intrinsic writes into its
+// sequence argument, so the result has to be copied back into the Python
+// object that was passed in.
+static PyObject *
+simd__intrin_storeh_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf32};
+    simd_arg src = {.dtype = simd_data_vf32};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&:storeh_f32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_storeh_f32(dst.data.qf32, src.data.vf32);
+    // write-back, then free the sequence argument on either outcome
+    const int wrote_back = !simd_sequence_fill_iterable(dst.obj, dst.data.qf32, simd_data_qf32);
+    simd_arg_free(&dst);
+    if (!wrote_back) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Hand-written wrapper for npyv_store_f32x2(): same shape as the other
+// store wrappers, except that the source operand is a pair of vectors
+// (simd_data_vf32x2).
+static PyObject *
+simd__intrin_store_f32x2(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf32};
+    simd_arg src = {.dtype = simd_data_vf32x2};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&:store_f32x2",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store_f32x2(dst.data.qf32, src.data.vf32x2);
+    // write-back, then free the sequence argument on either outcome
+    const int wrote_back = !simd_sequence_fill_iterable(dst.obj, dst.data.qf32, simd_data_qf32);
+    simd_arg_free(&dst);
+    if (!wrote_back) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 1
+// Partial Load
+// load_till takes an extra f32 operand after the u32 lane count, whereas
+// load_tillz does not.
+SIMD_IMPL_INTRIN_3(load_till_f32, vf32, qf32, u32, f32)
+SIMD_IMPL_INTRIN_2(load_tillz_f32, vf32, qf32, u32)
+// Both branches below expand to the same two wrappers in this
+// instantiation (`32 == 32` is true, so the first branch is the one used).
+#if 32 == 32
+    SIMD_IMPL_INTRIN_4(load2_till_f32, vf32, qf32, u32, f32, f32)
+    SIMD_IMPL_INTRIN_2(load2_tillz_f32, vf32, qf32, u32)
+#else
+    SIMD_IMPL_INTRIN_4(load2_till_f32, vf32, qf32, u32, f32, f32)
+    SIMD_IMPL_INTRIN_2(load2_tillz_f32, vf32, qf32, u32)
+#endif
+
+// Partial Store
+#line 95
+#if !0 || 0 == 32
+// Wrapper for npyv_store_till_f32(seq, nlane, vec): partial store followed
+// by a write-back of the lane buffer into the Python sequence object.
+static PyObject *
+simd__intrin_store_till_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf32};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vf32};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&O&:store_till_f32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store_till_f32(dst.data.qf32, count.data.u32, src.data.vf32);
+    // write-back, then free the sequence argument on either outcome
+    const int wrote_back = !simd_sequence_fill_iterable(dst.obj, dst.data.qf32, simd_data_qf32);
+    simd_arg_free(&dst);
+    if (!wrote_back) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+#line 95
+#if !32 || 32 == 32
+// Wrapper for npyv_store2_till_f32(seq, nlane, vec): partial store followed
+// by a write-back of the lane buffer into the Python sequence object.
+// This is the copy that is compiled (`#if !32 || 32 == 32`).
+static PyObject *
+simd__intrin_store2_till_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf32};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vf32};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&O&:store2_till_f32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store2_till_f32(dst.data.qf32, count.data.u32, src.data.vf32);
+    // write-back, then free the sequence argument on either outcome
+    const int wrote_back = !simd_sequence_fill_iterable(dst.obj, dst.data.qf32, simd_data_qf32);
+    simd_arg_free(&dst);
+    if (!wrote_back) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+#line 95
+#if !64 || 64 == 32
+// Wrapper for npyv_store2_till_f32(seq, nlane, vec): partial store followed
+// by a write-back of the lane buffer into the Python sequence object.
+// NOTE: this copy is guarded by `#if !64 || 64 == 32`, which is false, so
+// it is compiled out for the f32 instantiation.
+static PyObject *
+simd__intrin_store2_till_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf32};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vf32};
+    const int parsed = PyArg_ParseTuple(
+        args, "O&O&O&:store2_till_f32",
+        simd_arg_converter, &dst,
+        simd_arg_converter, &count,
+        simd_arg_converter, &src
+    );
+    if (!parsed) {
+        return NULL;
+    }
+    npyv_store2_till_f32(dst.data.qf32, count.data.u32, src.data.vf32);
+    // write-back, then free the sequence argument on either outcome
+    const int wrote_back = !simd_sequence_fill_iterable(dst.obj, dst.data.qf32, simd_data_qf32);
+    simd_arg_free(&dst);
+    if (!wrote_back) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+// Non-contiguous Load
+#line 136
+#if !0 || 0 == 32
+/*
+ * Python wrapper for npyv_loadn_f32(): strided (non-contiguous) load.
+ *
+ * Arguments unpacked from `args`, in order:
+ *   seq    - source sequence of f32 lanes (simd_data_qf32)
+ *   stride - signed step, counted in elements (simd_data_s64)
+ *
+ * Returns the loaded f32 vector converted to a Python object.
+ * Returns NULL with an exception set if parsing fails or if `seq` is
+ * shorter than |stride| * npyv_nlanes_f32 elements (ValueError).
+ */
+static PyObject *
+simd__intrin_loadn_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn_f32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f32 *seq_ptr = seq_arg.data.qf32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f32;
+    if (stride < 0) {
+        // negative stride: start from the last element of the sequence
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    // guard against reading outside the sequence
+    if (cur_seq_len < min_seq_len) {
+        // %zd is the conversion that matches Py_ssize_t-sized arguments
+        PyErr_Format(PyExc_ValueError,
+            "loadn_f32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_f32 rvec = npyv_loadn_f32(seq_ptr, stride);
+    simd_arg ret = {
+        .dtype = simd_data_vf32, .data = {.vf32=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 32
+/*
+ * Python wrapper for npyv_loadn2_f32(): strided load of lane pairs.
+ * This is the copy that is actually compiled (`#if !32 || 32 == 32`).
+ *
+ * Arguments unpacked from `args`, in order:
+ *   seq    - source sequence of f32 lanes (simd_data_qf32)
+ *   stride - signed step, counted in elements (simd_data_s64)
+ *
+ * Returns the loaded f32 vector converted to a Python object.
+ * Returns NULL with an exception set if parsing fails or if `seq` is
+ * shorter than |stride| * npyv_nlanes_f32 elements (ValueError).
+ */
+static PyObject *
+simd__intrin_loadn2_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_f32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f32 *seq_ptr = seq_arg.data.qf32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f32;
+    if (stride < 0) {
+        // negative stride: start from the last pair of elements
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    // guard against reading outside the sequence
+    if (cur_seq_len < min_seq_len) {
+        // %zd is the conversion that matches Py_ssize_t-sized arguments
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_f32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_f32 rvec = npyv_loadn2_f32(seq_ptr, stride);
+    simd_arg ret = {
+        .dtype = simd_data_vf32, .data = {.vf32=rvec}
+    };
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 32 // chksize guard: constant-false (64 != 32), this duplicate is never compiled
+static PyObject * // Python binding for npyv_loadn2_f32(seq, stride); NULL on failure
+simd__intrin_loadn2_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_f32};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_f32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_f32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f32 *seq_ptr = seq_arg.data.qf32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f32;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1 * 2; // == cur_seq_len - 2: base moves to the tail
+        min_seq_len = -min_seq_len;     // compare against |stride| * nlanes
+    }
+    if (cur_seq_len < min_seq_len) { // overflow guard: sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_f32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd takes Py_ssize_t, not int
+        );
+        goto err;
+    }
+    npyv_f32 rvec = npyv_loadn2_f32(
+        seq_ptr, stride
+    #if 0
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.f32
+    #endif
+    #if 0
+        , fill2_arg.data.f32
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vf32, .data = {.vf32=rvec}
+    };
+    simd_arg_free(&seq_arg); // seq_arg is released on both the success and error paths
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 32 // chksize guard: constant-true (!0)
+static PyObject * // Python binding for npyv_loadn_till_f32(seq, stride, nlane, fill); NULL on failure
+simd__intrin_loadn_till_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 1
+    simd_arg fill_arg = {.dtype = simd_data_f32};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_f32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:loadn_till_f32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f32 *seq_ptr = seq_arg.data.qf32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f32;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1 * 1; // == cur_seq_len - 1: base moves to the last element
+        min_seq_len = -min_seq_len;     // compare against |stride| * nlanes
+    }
+    if (cur_seq_len < min_seq_len) { // overflow guard: sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,
+            "loadn_till_f32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd takes Py_ssize_t, not int
+        );
+        goto err;
+    }
+    npyv_f32 rvec = npyv_loadn_till_f32(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 1
+        , fill_arg.data.f32
+    #endif
+    #if 0
+        , fill2_arg.data.f32
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vf32, .data = {.vf32=rvec}
+    };
+    simd_arg_free(&seq_arg); // seq_arg is released on both the success and error paths
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 32 // chksize guard: constant-true (32 == 32)
+static PyObject * // Python binding for npyv_loadn2_till_f32(seq, stride, nlane, fill, fill2); NULL on failure
+simd__intrin_loadn2_till_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 1
+    simd_arg fill_arg = {.dtype = simd_data_f32};
+#endif
+#if 1
+    simd_arg fill2_arg = {.dtype = simd_data_f32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_f32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f32 *seq_ptr = seq_arg.data.qf32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f32;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1 * 2; // == cur_seq_len - 2: base moves to the tail
+        min_seq_len = -min_seq_len;     // compare against |stride| * nlanes
+    }
+    if (cur_seq_len < min_seq_len) { // overflow guard: sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_till_f32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd takes Py_ssize_t, not int
+        );
+        goto err;
+    }
+    npyv_f32 rvec = npyv_loadn2_till_f32(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 1
+        , fill_arg.data.f32
+    #endif
+    #if 1
+        , fill2_arg.data.f32
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vf32, .data = {.vf32=rvec}
+    };
+    simd_arg_free(&seq_arg); // seq_arg is released on both the success and error paths
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 32 // chksize guard: constant-false (64 != 32), this duplicate is never compiled
+static PyObject * // Python binding for npyv_loadn2_till_f32(seq, stride, nlane, fill, fill2); NULL on failure
+simd__intrin_loadn2_till_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 1
+    simd_arg fill_arg = {.dtype = simd_data_f32};
+#endif
+#if 1
+    simd_arg fill2_arg = {.dtype = simd_data_f32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_f32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f32 *seq_ptr = seq_arg.data.qf32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f32;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1 * 2; // == cur_seq_len - 2: base moves to the tail
+        min_seq_len = -min_seq_len;     // compare against |stride| * nlanes
+    }
+    if (cur_seq_len < min_seq_len) { // overflow guard: sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_till_f32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd takes Py_ssize_t, not int
+        );
+        goto err;
+    }
+    npyv_f32 rvec = npyv_loadn2_till_f32(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 1
+        , fill_arg.data.f32
+    #endif
+    #if 1
+        , fill2_arg.data.f32
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vf32, .data = {.vf32=rvec}
+    };
+    simd_arg_free(&seq_arg); // seq_arg is released on both the success and error paths
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 32 // chksize guard: constant-true (!0)
+static PyObject * // Python binding for npyv_loadn_tillz_f32(seq, stride, nlane); NULL on failure
+simd__intrin_loadn_tillz_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_f32};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_f32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn_tillz_f32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f32 *seq_ptr = seq_arg.data.qf32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f32;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1 * 1; // == cur_seq_len - 1: base moves to the last element
+        min_seq_len = -min_seq_len;     // compare against |stride| * nlanes
+    }
+    if (cur_seq_len < min_seq_len) { // overflow guard: sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,
+            "loadn_tillz_f32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd takes Py_ssize_t, not int
+        );
+        goto err;
+    }
+    npyv_f32 rvec = npyv_loadn_tillz_f32(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.f32
+    #endif
+    #if 0
+        , fill2_arg.data.f32
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vf32, .data = {.vf32=rvec}
+    };
+    simd_arg_free(&seq_arg); // seq_arg is released on both the success and error paths
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 32 // chksize guard: constant-true (32 == 32)
+static PyObject * // Python binding for npyv_loadn2_tillz_f32(seq, stride, nlane); NULL on failure
+simd__intrin_loadn2_tillz_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_f32};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_f32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_f32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f32 *seq_ptr = seq_arg.data.qf32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f32;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1 * 2; // == cur_seq_len - 2: base moves to the tail
+        min_seq_len = -min_seq_len;     // compare against |stride| * nlanes
+    }
+    if (cur_seq_len < min_seq_len) { // overflow guard: sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_f32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd takes Py_ssize_t, not int
+        );
+        goto err;
+    }
+    npyv_f32 rvec = npyv_loadn2_tillz_f32(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.f32
+    #endif
+    #if 0
+        , fill2_arg.data.f32
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vf32, .data = {.vf32=rvec}
+    };
+    simd_arg_free(&seq_arg); // seq_arg is released on both the success and error paths
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 32 // chksize guard: constant-false (64 != 32), this duplicate is never compiled
+static PyObject * // Python binding for npyv_loadn2_tillz_f32(seq, stride, nlane); NULL on failure
+simd__intrin_loadn2_tillz_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_f32};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_f32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_f32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f32 *seq_ptr = seq_arg.data.qf32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f32;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1 * 2; // == cur_seq_len - 2: base moves to the tail
+        min_seq_len = -min_seq_len;     // compare against |stride| * nlanes
+    }
+    if (cur_seq_len < min_seq_len) { // overflow guard: sequence too short for this stride
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_f32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd takes Py_ssize_t, not int
+        );
+        goto err;
+    }
+    npyv_f32 rvec = npyv_loadn2_tillz_f32(
+        seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.f32
+    #endif
+    #if 0
+        , fill2_arg.data.f32
+    #endif
+    );
+    simd_arg ret = {
+        .dtype = simd_data_vf32, .data = {.vf32=rvec}
+    };
+    simd_arg_free(&seq_arg); // seq_arg is released on both the success and error paths
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+
+// Non-contiguous Store
+#line 216
+#if !0 || 0 == 32 // chksize guard: constant-true (!0)
+static PyObject * // Python binding for npyv_storen_f32(seq, stride, vec); returns None, NULL on failure
+simd__intrin_storen_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vf32};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen_f32",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f32 *seq_ptr = seq_arg.data.qf32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f32;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*1; // == cur_seq_len - 1: base moves to the last element
+        min_seq_len = -min_seq_len;   // compare against |stride| * nlanes
+    }
+    // overflow guard: the sequence must hold at least |stride| * nlanes elements
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen_f32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd takes Py_ssize_t, not int
+        );
+        goto err;
+    }
+    npyv_storen_f32(
+        seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vf32
+    );
+    // write-back: push the modified C buffer into the caller's sequence object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qf32, simd_data_qf32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg); // seq_arg is released on both the success and error paths
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 32 // chksize guard: constant-true (32 == 32)
+static PyObject * // Python binding for npyv_storen2_f32(seq, stride, vec); returns None, NULL on failure
+simd__intrin_storen2_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vf32};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_f32", // name after ':' is what argument errors report
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f32 *seq_ptr = seq_arg.data.qf32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f32;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*2; // == cur_seq_len - 2: base moves to the tail
+        min_seq_len = -min_seq_len;   // compare against |stride| * nlanes
+    }
+    // overflow guard: the sequence must hold at least |stride| * nlanes elements
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_f32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd takes Py_ssize_t, not int
+        );
+        goto err;
+    }
+    npyv_storen2_f32(
+        seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vf32
+    );
+    // write-back: push the modified C buffer into the caller's sequence object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qf32, simd_data_qf32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg); // seq_arg is released on both the success and error paths
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 32 // chksize guard: constant-false (64 != 32), this duplicate is never compiled
+static PyObject * // Python binding for npyv_storen2_f32(seq, stride, vec); returns None, NULL on failure
+simd__intrin_storen2_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vf32};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_f32", // name after ':' is what argument errors report
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f32 *seq_ptr = seq_arg.data.qf32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f32;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*2; // == cur_seq_len - 2: base moves to the tail
+        min_seq_len = -min_seq_len;   // compare against |stride| * nlanes
+    }
+    // overflow guard: the sequence must hold at least |stride| * nlanes elements
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_f32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd takes Py_ssize_t, not int
+        );
+        goto err;
+    }
+    npyv_storen2_f32(
+        seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vf32
+    );
+    // write-back: push the modified C buffer into the caller's sequence object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qf32, simd_data_qf32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg); // seq_arg is released on both the success and error paths
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !0 || 0 == 32 // chksize guard: constant-true (!0)
+static PyObject * // Python binding for npyv_storen_till_f32(seq, stride, nlane, vec); returns None, NULL on failure
+simd__intrin_storen_till_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vf32};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen_till_f32", // name after ':' is what argument errors report
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f32 *seq_ptr = seq_arg.data.qf32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f32;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*1; // == cur_seq_len - 1: base moves to the last element
+        min_seq_len = -min_seq_len;   // compare against |stride| * nlanes
+    }
+    // overflow guard: the sequence must hold at least |stride| * nlanes elements
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen_till_f32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd takes Py_ssize_t, not int
+        );
+        goto err;
+    }
+    npyv_storen_till_f32(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vf32
+    );
+    // write-back: push the modified C buffer into the caller's sequence object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qf32, simd_data_qf32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg); // seq_arg is released on both the success and error paths
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 32 // chksize guard: constant-true (32 == 32)
+static PyObject * // Python binding for npyv_storen2_till_f32(seq, stride, nlane, vec); returns None, NULL on failure
+simd__intrin_storen2_till_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vf32};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_f32", // name after ':' is what argument errors report
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f32 *seq_ptr = seq_arg.data.qf32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f32;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*2; // == cur_seq_len - 2: base moves to the tail
+        min_seq_len = -min_seq_len;   // compare against |stride| * nlanes
+    }
+    // overflow guard: the sequence must hold at least |stride| * nlanes elements
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_f32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd takes Py_ssize_t, not int
+        );
+        goto err;
+    }
+    npyv_storen2_till_f32(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vf32
+    );
+    // write-back: push the modified C buffer into the caller's sequence object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qf32, simd_data_qf32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg); // seq_arg is released on both the success and error paths
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 32 // chksize guard: constant-false (64 != 32), this duplicate is never compiled
+static PyObject * // Python binding for npyv_storen2_till_f32(seq, stride, nlane, vec); returns None, NULL on failure
+simd__intrin_storen2_till_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf32};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vf32};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_f32", // name after ':' is what argument errors report
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f32 *seq_ptr = seq_arg.data.qf32;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f32;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*2; // == cur_seq_len - 2: base moves to the tail
+        min_seq_len = -min_seq_len;   // compare against |stride| * nlanes
+    }
+    // overflow guard: the sequence must hold at least |stride| * nlanes elements
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_f32(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len // %zd takes Py_ssize_t, not int
+        );
+        goto err;
+    }
+    npyv_storen2_till_f32(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vf32
+    );
+    // write-back: push the modified C buffer into the caller's sequence object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qf32, simd_data_qf32)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg); // seq_arg is released on both the success and error paths
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#endif // 1
+
+/****************************
+ * Lookup tables
+ ****************************/
+#if 32 == 32 // constant-true: the lut32 table lookup is instantiated for f32
+SIMD_IMPL_INTRIN_2(lut32_f32, vf32, qf32, vu32)
+#endif
+#if 32 == 64 // constant-false: the lut16 variant is not instantiated for f32
+SIMD_IMPL_INTRIN_2(lut16_f32, vf32, qf32, vu32)
+#endif
+/***************************
+ * Misc
+ ***************************/
+SIMD_IMPL_INTRIN_0(zero_f32, vf32)
+SIMD_IMPL_INTRIN_1(extract0_f32, f32, vf32)
+SIMD_IMPL_INTRIN_1(setall_f32, vf32, f32)
+SIMD_IMPL_INTRIN_3(select_f32, vf32, vb32, vf32, vf32)
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u8_f32, vu8, vf32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s8_f32, vs8, vf32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u16_f32, vu16, vf32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s16_f32, vs16, vf32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u32_f32, vu32, vf32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s32_f32, vs32, vf32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u64_f32, vu64, vf32)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s64_f32, vs64, vf32)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F32
+SIMD_IMPL_INTRIN_1(reinterpret_f32_f32, vf32, vf32)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F64
+SIMD_IMPL_INTRIN_1(reinterpret_f64_f32, vf64, vf32)
+#endif // simd_sup2
+
+
+/**
+ * Special definitions, required by the nature of the intrinsics
+ * npyv_setf_f32 and npyv_set_f32.
+*/
+#line 308
+static PyObject * // Python binding for npyv_setf_f32; the positional args supply the values
+simd__intrin_setf_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    npyv_lanetype_f32 *seq = simd_sequence_from_iterable(args, simd_data_qf32, npyv_nlanes_f32);
+    if (!seq) { // conversion of the argument tuple failed
+        return NULL;
+    }
+    simd_data out = {.vf32 = npyv_setf_f32(
+        seq[0],  seq[1],  seq[2],  seq[3],  seq[4],  seq[5],  seq[6],  seq[7],
+        seq[8],  seq[9],  seq[10], seq[11], seq[12], seq[13], seq[14], seq[15],
+        seq[16], seq[17], seq[18], seq[19], seq[20], seq[21], seq[22], seq[23],
+        seq[24], seq[25], seq[26], seq[27], seq[28], seq[29], seq[30], seq[31],
+        seq[32], seq[33], seq[34], seq[35], seq[36], seq[37], seq[38], seq[39],
+        seq[40], seq[41], seq[42], seq[43], seq[44], seq[45], seq[46], seq[47],
+        seq[48], seq[49], seq[50], seq[51], seq[52], seq[53], seq[54], seq[55],
+        seq[56], seq[57], seq[58], seq[59], seq[60], seq[61], seq[62], seq[63],
+        seq[64] // extra trailing value, present for the setf form
+    )};
+    simd_sequence_free(seq); // temporary buffer is no longer needed once the vector is built
+    return (PyObject*)PySIMDVector_FromData(out, simd_data_vf32);
+}
+
+#line 308
+static PyObject * // Python binding for npyv_set_f32; the positional args supply the values
+simd__intrin_set_f32(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    npyv_lanetype_f32 *seq = simd_sequence_from_iterable(args, simd_data_qf32, npyv_nlanes_f32);
+    if (!seq) { // conversion of the argument tuple failed
+        return NULL;
+    }
+    simd_data out = {.vf32 = npyv_set_f32(
+        seq[0],  seq[1],  seq[2],  seq[3],  seq[4],  seq[5],  seq[6],  seq[7],
+        seq[8],  seq[9],  seq[10], seq[11], seq[12], seq[13], seq[14], seq[15],
+        seq[16], seq[17], seq[18], seq[19], seq[20], seq[21], seq[22], seq[23],
+        seq[24], seq[25], seq[26], seq[27], seq[28], seq[29], seq[30], seq[31],
+        seq[32], seq[33], seq[34], seq[35], seq[36], seq[37], seq[38], seq[39],
+        seq[40], seq[41], seq[42], seq[43], seq[44], seq[45], seq[46], seq[47],
+        seq[48], seq[49], seq[50], seq[51], seq[52], seq[53], seq[54], seq[55],
+        seq[56], seq[57], seq[58], seq[59], seq[60], seq[61], seq[62], seq[63],
+        seq[64] // extra trailing value, shared with the setf expansion of this template
+    )};
+    simd_sequence_free(seq); // temporary buffer is no longer needed once the vector is built
+    return (PyObject*)PySIMDVector_FromData(out, simd_data_vf32);
+}
+
+
+/***************************
+ * Reorder
+ ***************************/
+#line 337
+SIMD_IMPL_INTRIN_2(combinel_f32, vf32, vf32, vf32)
+
+#line 337
+SIMD_IMPL_INTRIN_2(combineh_f32, vf32, vf32, vf32)
+
+
+#line 343
+SIMD_IMPL_INTRIN_2(combine_f32, vf32x2, vf32, vf32)
+
+#line 343
+SIMD_IMPL_INTRIN_2(zip_f32, vf32x2, vf32, vf32)
+
+#line 343
+SIMD_IMPL_INTRIN_2(unzip_f32, vf32x2, vf32, vf32)
+
+
+#if 1
+SIMD_IMPL_INTRIN_1(rev64_f32, vf32, vf32)
+#endif
+
+// special implementation to convert runtime constants to immediate values
+#if 32 == 32 // constant-true: the four-selector (e0..e3) variant is the one compiled for f32
+// one call per element index, then gather the results into one vector,
+// instead of unrolling all 256 (4^4) possible selector combinations.
+NPY_FINLINE npyv_f32
+npyv_permi128_f32_(npyv_f32 a, unsigned e0, unsigned e1, unsigned e2, unsigned e3)
+{
+   #line 360
+    npyv_f32 ve0;
+    npyv_lanetype_f32 de0[npyv_nlanes_f32];
+    if (0) {} // template anchor so that every generated case can be an "else if"
+   #line 366
+    else if (e0 == 1) {
+        ve0 = npyv_permi128_f32(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e0 == 2) {
+        ve0 = npyv_permi128_f32(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e0 == 3) {
+        ve0 = npyv_permi128_f32(a, 3, 3, 3, 3);
+    }
+    
+    else { // 0 and any selector above 3 fall back to element 0
+        ve0 = npyv_permi128_f32(a, 0, 0, 0, 0);
+    }
+    npyv_store_f32(de0, ve0); // spill the e0 broadcast; de0 becomes the merge target below
+    
+#line 360
+    npyv_f32 ve1;
+    npyv_lanetype_f32 de1[npyv_nlanes_f32];
+    if (0) {} // template anchor so that every generated case can be an "else if"
+   #line 366
+    else if (e1 == 1) {
+        ve1 = npyv_permi128_f32(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e1 == 2) {
+        ve1 = npyv_permi128_f32(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e1 == 3) {
+        ve1 = npyv_permi128_f32(a, 3, 3, 3, 3);
+    }
+    
+    else { // 0 and any selector above 3 fall back to element 0
+        ve1 = npyv_permi128_f32(a, 0, 0, 0, 0);
+    }
+    npyv_store_f32(de1, ve1); // spill the e1 broadcast
+    
+#line 360
+    npyv_f32 ve2;
+    npyv_lanetype_f32 de2[npyv_nlanes_f32];
+    if (0) {} // template anchor so that every generated case can be an "else if"
+   #line 366
+    else if (e2 == 1) {
+        ve2 = npyv_permi128_f32(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e2 == 2) {
+        ve2 = npyv_permi128_f32(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e2 == 3) {
+        ve2 = npyv_permi128_f32(a, 3, 3, 3, 3);
+    }
+    
+    else { // 0 and any selector above 3 fall back to element 0
+        ve2 = npyv_permi128_f32(a, 0, 0, 0, 0);
+    }
+    npyv_store_f32(de2, ve2); // spill the e2 broadcast
+    
+#line 360
+    npyv_f32 ve3;
+    npyv_lanetype_f32 de3[npyv_nlanes_f32];
+    if (0) {} // template anchor so that every generated case can be an "else if"
+   #line 366
+    else if (e3 == 1) {
+        ve3 = npyv_permi128_f32(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e3 == 2) {
+        ve3 = npyv_permi128_f32(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e3 == 3) {
+        ve3 = npyv_permi128_f32(a, 3, 3, 3, 3);
+    }
+    
+    else { // 0 and any selector above 3 fall back to element 0
+        ve3 = npyv_permi128_f32(a, 0, 0, 0, 0);
+    }
+    npyv_store_f32(de3, ve3); // spill the e3 broadcast
+    
+    if (e0 == e1 && e0 == e2 && e0 == e3) { // all selectors equal: the e0 broadcast is the result
+        return ve0;
+    }
+    for (int i = 0; i < npyv_nlanes_f32; i += 4) { // per group of four lanes: lane 0 stays from e0
+        de0[i+1] = de1[i+1]; // lane 1 comes from the e1 broadcast
+        de0[i+2] = de2[i+2]; // lane 2 comes from the e2 broadcast
+        de0[i+3] = de3[i+3]; // lane 3 comes from the e3 broadcast
+    }
+    return npyv_load_f32(de0);
+}
+SIMD_IMPL_INTRIN_5(permi128_f32_, vf32, vf32, u8, u8, u8, u8) // NOTE(review): presumably emits the Python wrapper for the helper above - confirm against the macro
+#elif 32 == 64 // constant-false for f32: the two-selector variant below is not compiled
+NPY_FINLINE npyv_f32
+npyv_permi128_f32_(npyv_f32 a, unsigned e0, unsigned e1)
+{
+    if (e0 == 1 && e1 == 0) {
+        return npyv_permi128_f32(a, 1, 0);
+    }
+    else if (e0 == 0 && e1 == 1) {
+        return npyv_permi128_f32(a, 0, 1);
+    }
+    else if (e0 == 1 && e1 == 1) {
+        return npyv_permi128_f32(a, 1, 1);
+    }
+    return npyv_permi128_f32(a, 0, 0); // (0, 0) and every other selector pair
+}
+SIMD_IMPL_INTRIN_3(permi128_f32_, vf32, vf32, u8, u8)
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if 0 > 0 // constant-false: no shift intrinsics are instantiated for f32
+SIMD_IMPL_INTRIN_2(shl_f32, vf32, vf32, u8)
+SIMD_IMPL_INTRIN_2(shr_f32, vf32, vf32, u8)
+// immediate constant
+SIMD_IMPL_INTRIN_2IMM(shli_f32, vf32, vf32, 0)
+SIMD_IMPL_INTRIN_2IMM(shri_f32, vf32, vf32, 0)
+#endif // shl_imm
+
+#line 418
+SIMD_IMPL_INTRIN_2(and_f32, vf32, vf32, vf32)
+
+#line 418
+SIMD_IMPL_INTRIN_2(or_f32, vf32, vf32, vf32)
+
+#line 418
+SIMD_IMPL_INTRIN_2(xor_f32, vf32, vf32, vf32)
+
+
+SIMD_IMPL_INTRIN_1(not_f32, vf32, vf32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpeq_f32, vb32, vf32, vf32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpneq_f32, vb32, vf32, vf32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpgt_f32, vb32, vf32, vf32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpge_f32, vb32, vf32, vf32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmplt_f32, vb32, vf32, vf32)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmple_f32, vb32, vf32, vf32)
+
+
+#if 0 // disabled in this expansion: andc/orc/xnor are not instantiated for f32
+SIMD_IMPL_INTRIN_2(andc_f32, vf32, vf32, vf32)
+SIMD_IMPL_INTRIN_2(andc_b32, vb32, vb32, vb32)
+SIMD_IMPL_INTRIN_2(orc_b32, vb32, vb32, vb32)
+SIMD_IMPL_INTRIN_2(xnor_b32, vb32, vb32, vb32)
+#endif
+
+// test cross all vector lanes
+#line 440
+SIMD_IMPL_INTRIN_1(any_f32, u8, vf32)
+
+#line 440
+SIMD_IMPL_INTRIN_1(all_f32, u8, vf32)
+
+/***************************
+ * Conversion: between the f32 vector type and the b32 type, in both directions
+ ***************************/
+SIMD_IMPL_INTRIN_1(cvt_f32_b32, vf32,  vb32)
+SIMD_IMPL_INTRIN_1(cvt_b32_f32, vb32, vf32)
+#if 0 // constant-false: the expand wrapper is compiled out in this instantiation
+SIMD_IMPL_INTRIN_1(expand_f32_f32, vf32x2, vf32)
+#endif // expand_sup
+/***************************
+ * Arithmetic: add/sub, mul, div, fused multiply variants and the sum reduction (f32)
+ ***************************/
+#line 456
+SIMD_IMPL_INTRIN_2(add_f32, vf32, vf32, vf32)
+
+#line 456
+SIMD_IMPL_INTRIN_2(sub_f32, vf32, vf32, vf32)
+
+
+#if 0 // constant-false: adds/subs are compiled out in this instantiation
+#line 463
+SIMD_IMPL_INTRIN_2(adds_f32, vf32, vf32, vf32)
+
+#line 463
+SIMD_IMPL_INTRIN_2(subs_f32, vf32, vf32, vf32)
+
+#endif // sat_sup
+
+#if 1
+SIMD_IMPL_INTRIN_2(mul_f32, vf32, vf32, vf32)
+#endif // mul_sup
+
+#if 1
+SIMD_IMPL_INTRIN_2(div_f32, vf32, vf32, vf32)
+#endif // div_sup
+
+#if 0 // constant-false: divisor/divc are compiled out in this instantiation
+SIMD_IMPL_INTRIN_1(divisor_f32, vf32x3, f32)
+SIMD_IMPL_INTRIN_2(divc_f32, vf32, vf32, vf32x3)
+#endif // intdiv_sup
+
+#if 1
+#line 484
+SIMD_IMPL_INTRIN_3(muladd_f32, vf32, vf32, vf32, vf32)
+
+#line 484
+SIMD_IMPL_INTRIN_3(mulsub_f32, vf32, vf32, vf32, vf32)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmuladd_f32, vf32, vf32, vf32, vf32)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmulsub_f32, vf32, vf32, vf32, vf32)
+
+#line 484
+SIMD_IMPL_INTRIN_3(muladdsub_f32, vf32, vf32, vf32, vf32)
+
+#endif // fused_sup
+
+#if 1
+SIMD_IMPL_INTRIN_1(sum_f32, f32, vf32) // declared with the scalar f32 type in the second slot
+#endif // sum_sup
+
+#if 0 // constant-false: sumup is compiled out in this instantiation
+SIMD_IMPL_INTRIN_1(sumup_f32, f32, vf32)
+#endif // sumup_sup
+
+/***************************
+ * Math: unary functions, element-wise max/min and their reduce_* companions (f32)
+ ***************************/
+#if 1
+#line 503
+SIMD_IMPL_INTRIN_1(sqrt_f32, vf32, vf32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(recip_f32, vf32, vf32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(abs_f32, vf32, vf32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(square_f32, vf32, vf32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(rint_f32, vf32, vf32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(ceil_f32, vf32, vf32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(trunc_f32, vf32, vf32)
+
+#line 503
+SIMD_IMPL_INTRIN_1(floor_f32, vf32, vf32)
+
+#endif
+
+#line 510
+SIMD_IMPL_INTRIN_2(max_f32, vf32, vf32, vf32)
+SIMD_IMPL_INTRIN_1(reduce_max_f32, f32, vf32) // each max/min wrapper is paired with a reduce_* wrapper declared with scalar f32
+
+#line 510
+SIMD_IMPL_INTRIN_2(min_f32, vf32, vf32, vf32)
+SIMD_IMPL_INTRIN_1(reduce_min_f32, f32, vf32)
+
+
+#if 1
+#line 518
+SIMD_IMPL_INTRIN_2(maxp_f32, vf32, vf32, vf32) // NOTE(review): the p/n suffixes presumably select a NaN-handling policy - confirm in the npyv headers
+SIMD_IMPL_INTRIN_1(reduce_maxp_f32, f32, vf32)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minp_f32, vf32, vf32, vf32)
+SIMD_IMPL_INTRIN_1(reduce_minp_f32, f32, vf32)
+
+#line 518
+SIMD_IMPL_INTRIN_2(maxn_f32, vf32, vf32, vf32)
+SIMD_IMPL_INTRIN_1(reduce_maxn_f32, f32, vf32)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minn_f32, vf32, vf32, vf32)
+SIMD_IMPL_INTRIN_1(reduce_minn_f32, f32, vf32)
+
+/**end repeat1**/ // NOTE(review): looks like a stray template marker in the generated output - confirm
+#endif
+
+/***************************
+ * Mask operations: if* wrappers, each declared with a leading vb32 argument (f32)
+ ***************************/
+#line 530
+ SIMD_IMPL_INTRIN_4(ifadd_f32, vf32, vb32, vf32, vf32, vf32)
+
+#line 530
+ SIMD_IMPL_INTRIN_4(ifsub_f32, vf32, vb32, vf32, vf32, vf32)
+
+
+#if 1
+SIMD_IMPL_INTRIN_4(ifdiv_f32, vf32, vb32, vf32, vf32, vf32)
+SIMD_IMPL_INTRIN_3(ifdivz_f32, vf32, vb32, vf32, vf32) // one vf32 argument fewer than ifdiv_f32
+#endif
+
+#endif // simd_sup
+
+#line 36
+#if NPY_SIMD_F64
+/***************************
+ * Memory: contiguous load wrappers for f64, each declared with a qf64 argument
+ ***************************/
+#line 43
+SIMD_IMPL_INTRIN_1(load_f64, vf64, qf64)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loada_f64, vf64, qf64)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loads_f64, vf64, qf64)
+
+#line 43
+SIMD_IMPL_INTRIN_1(loadl_f64, vf64, qf64)
+
+SIMD_IMPL_INTRIN_1(load_f64x2, vf64x2, qf64) // same argument type, declared with the vf64x2 type instead of vf64
+
+#line 51
+// Python wrapper for npyv_store_f64(): parses (sequence, vector), stores the
+// vector into the sequence buffer and writes that buffer back through
+// simd_sequence_fill_iterable(). Returns None, or NULL when a step fails.
+static PyObject *
+simd__intrin_store_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf64};
+    simd_arg src = {.dtype = simd_data_vf64};
+    if (!PyArg_ParseTuple(args, "O&O&:store_f64",
+            simd_arg_converter, &dst, simd_arg_converter, &src)) {
+        return NULL;
+    }
+    npyv_store_f64(dst.data.qf64, src.data.vf64);
+    // write-back; the sequence argument is freed once, on success and on
+    // failure alike
+    const int ok = !simd_sequence_fill_iterable(dst.obj, dst.data.qf64, simd_data_qf64);
+    simd_arg_free(&dst);
+    if (!ok) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Python wrapper for npyv_storea_f64(): parses (sequence, vector), stores the
+// vector into the sequence buffer and writes that buffer back through
+// simd_sequence_fill_iterable(). Returns None, or NULL when a step fails.
+static PyObject *
+simd__intrin_storea_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf64};
+    simd_arg src = {.dtype = simd_data_vf64};
+    if (!PyArg_ParseTuple(args, "O&O&:storea_f64",
+            simd_arg_converter, &dst, simd_arg_converter, &src)) {
+        return NULL;
+    }
+    npyv_storea_f64(dst.data.qf64, src.data.vf64);
+    // write-back; the sequence argument is freed once, on success and on
+    // failure alike
+    const int ok = !simd_sequence_fill_iterable(dst.obj, dst.data.qf64, simd_data_qf64);
+    simd_arg_free(&dst);
+    if (!ok) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Python wrapper for npyv_stores_f64(): parses (sequence, vector), stores the
+// vector into the sequence buffer and writes that buffer back through
+// simd_sequence_fill_iterable(). Returns None, or NULL when a step fails.
+static PyObject *
+simd__intrin_stores_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf64};
+    simd_arg src = {.dtype = simd_data_vf64};
+    if (!PyArg_ParseTuple(args, "O&O&:stores_f64",
+            simd_arg_converter, &dst, simd_arg_converter, &src)) {
+        return NULL;
+    }
+    npyv_stores_f64(dst.data.qf64, src.data.vf64);
+    // write-back; the sequence argument is freed once, on success and on
+    // failure alike
+    const int ok = !simd_sequence_fill_iterable(dst.obj, dst.data.qf64, simd_data_qf64);
+    simd_arg_free(&dst);
+    if (!ok) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Python wrapper for npyv_storel_f64(): parses (sequence, vector), stores the
+// vector into the sequence buffer and writes that buffer back through
+// simd_sequence_fill_iterable(). Returns None, or NULL when a step fails.
+static PyObject *
+simd__intrin_storel_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf64};
+    simd_arg src = {.dtype = simd_data_vf64};
+    if (!PyArg_ParseTuple(args, "O&O&:storel_f64",
+            simd_arg_converter, &dst, simd_arg_converter, &src)) {
+        return NULL;
+    }
+    npyv_storel_f64(dst.data.qf64, src.data.vf64);
+    // write-back; the sequence argument is freed once, on success and on
+    // failure alike
+    const int ok = !simd_sequence_fill_iterable(dst.obj, dst.data.qf64, simd_data_qf64);
+    simd_arg_free(&dst);
+    if (!ok) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Python wrapper for npyv_storeh_f64(): parses (sequence, vector), stores the
+// vector into the sequence buffer and writes that buffer back through
+// simd_sequence_fill_iterable(). Returns None, or NULL when a step fails.
+static PyObject *
+simd__intrin_storeh_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf64};
+    simd_arg src = {.dtype = simd_data_vf64};
+    if (!PyArg_ParseTuple(args, "O&O&:storeh_f64",
+            simd_arg_converter, &dst, simd_arg_converter, &src)) {
+        return NULL;
+    }
+    npyv_storeh_f64(dst.data.qf64, src.data.vf64);
+    // write-back; the sequence argument is freed once, on success and on
+    // failure alike
+    const int ok = !simd_sequence_fill_iterable(dst.obj, dst.data.qf64, simd_data_qf64);
+    simd_arg_free(&dst);
+    if (!ok) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+#line 51
+// Python wrapper for npyv_store_f64x2(): parses (sequence, vf64x2 value), stores
+// the value into the sequence buffer and writes that buffer back through
+// simd_sequence_fill_iterable(). Returns None, or NULL when a step fails.
+static PyObject *
+simd__intrin_store_f64x2(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf64};
+    simd_arg src = {.dtype = simd_data_vf64x2};
+    if (!PyArg_ParseTuple(args, "O&O&:store_f64x2",
+            simd_arg_converter, &dst, simd_arg_converter, &src)) {
+        return NULL;
+    }
+    npyv_store_f64x2(dst.data.qf64, src.data.vf64x2);
+    // write-back; the sequence argument is freed once, on success and on
+    // failure alike
+    const int ok = !simd_sequence_fill_iterable(dst.obj, dst.data.qf64, simd_data_qf64);
+    simd_arg_free(&dst);
+    if (!ok) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access (f64 wrappers)
+ ****************************************/
+#if 1
+// Partial Load: the *_till wrappers are declared with extra scalar f64 arguments that *_tillz lacks
+SIMD_IMPL_INTRIN_3(load_till_f64, vf64, qf64, u32, f64)
+SIMD_IMPL_INTRIN_2(load_tillz_f64, vf64, qf64, u32)
+#if 64 == 32 // constant-false here; NOTE(review): both branches are textually identical - confirm the split is still needed
+    SIMD_IMPL_INTRIN_4(load2_till_f64, vf64, qf64, u32, f64, f64)
+    SIMD_IMPL_INTRIN_2(load2_tillz_f64, vf64, qf64, u32)
+#else
+    SIMD_IMPL_INTRIN_4(load2_till_f64, vf64, qf64, u32, f64, f64)
+    SIMD_IMPL_INTRIN_2(load2_tillz_f64, vf64, qf64, u32)
+#endif
+
+// Partial Store
+#line 95
+#if !0 || 0 == 64
+// Python wrapper for npyv_store_till_f64(): parses (sequence, nlane, vector),
+// performs the partial store and writes the buffer back to the Python object.
+// Returns None on success and NULL when parsing or the write-back fails.
+static PyObject *
+simd__intrin_store_till_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf64};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vf64};
+    if (!PyArg_ParseTuple(args, "O&O&O&:store_till_f64",
+            simd_arg_converter, &dst,
+            simd_arg_converter, &count,
+            simd_arg_converter, &src)) {
+        return NULL;
+    }
+    npyv_store_till_f64(dst.data.qf64, count.data.u32, src.data.vf64);
+    // write-back; the sequence argument is freed once, on success and on
+    // failure alike
+    const int ok = !simd_sequence_fill_iterable(dst.obj, dst.data.qf64, simd_data_qf64);
+    simd_arg_free(&dst);
+    if (!ok) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+#line 95
+#if !32 || 32 == 64
+// Python wrapper for npyv_store2_till_f64(): parses (sequence, nlane, vector),
+// performs the partial store and writes the buffer back to the Python object.
+// Returns None on success and NULL when parsing or the write-back fails.
+static PyObject *
+simd__intrin_store2_till_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf64};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vf64};
+    if (!PyArg_ParseTuple(args, "O&O&O&:store2_till_f64",
+            simd_arg_converter, &dst,
+            simd_arg_converter, &count,
+            simd_arg_converter, &src)) {
+        return NULL;
+    }
+    npyv_store2_till_f64(dst.data.qf64, count.data.u32, src.data.vf64);
+    // write-back; the sequence argument is freed once, on success and on
+    // failure alike
+    const int ok = !simd_sequence_fill_iterable(dst.obj, dst.data.qf64, simd_data_qf64);
+    simd_arg_free(&dst);
+    if (!ok) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+#line 95
+#if !64 || 64 == 64
+// Python wrapper for npyv_store2_till_f64(): parses (sequence, nlane, vector),
+// performs the partial store and writes the buffer back to the Python object.
+// Returns None on success and NULL when parsing or the write-back fails.
+static PyObject *
+simd__intrin_store2_till_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg dst = {.dtype = simd_data_qf64};
+    simd_arg count = {.dtype = simd_data_u32};
+    simd_arg src = {.dtype = simd_data_vf64};
+    if (!PyArg_ParseTuple(args, "O&O&O&:store2_till_f64",
+            simd_arg_converter, &dst,
+            simd_arg_converter, &count,
+            simd_arg_converter, &src)) {
+        return NULL;
+    }
+    npyv_store2_till_f64(dst.data.qf64, count.data.u32, src.data.vf64);
+    // write-back; the sequence argument is freed once, on success and on
+    // failure alike
+    const int ok = !simd_sequence_fill_iterable(dst.obj, dst.data.qf64, simd_data_qf64);
+    simd_arg_free(&dst);
+    if (!ok) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+#endif // chksize
+
+
+// Non-contiguous Load
+#line 136
+#if !0 || 0 == 64
+// Python wrapper for npyv_loadn_f64(): parses (sequence, stride) and returns the
+// loaded vector; NULL on a parse failure or (ValueError) a too-short sequence.
+static PyObject *
+simd__intrin_loadn_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_f64};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_f64};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn_f64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f64 *seq_ptr = seq_arg.data.qf64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f64;
+    if (stride < 0) {
+        // negative stride: start from the last element of the sequence
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // npy_intp/Py_ssize_t values need %zd; %d would read them as int
+        PyErr_Format(PyExc_ValueError,
+            "loadn_f64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len);
+        goto err;
+    }
+    npyv_f64 rvec = npyv_loadn_f64(seq_ptr, stride
+    #if 0
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.f64
+    #endif
+    #if 0
+        , fill2_arg.data.f64
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vf64, .data = {.vf64=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 64
+// Python wrapper for npyv_loadn2_f64(): parses (sequence, stride) and returns the
+// loaded vector; NULL on a parse failure or (ValueError) a too-short sequence.
+static PyObject *
+simd__intrin_loadn2_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_f64};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_f64};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_f64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f64 *seq_ptr = seq_arg.data.qf64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f64;
+    if (stride < 0) {
+        // negative stride: start 2 elements before the end of the sequence
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // npy_intp/Py_ssize_t values need %zd; %d would read them as int
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_f64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len);
+        goto err;
+    }
+    npyv_f64 rvec = npyv_loadn2_f64(seq_ptr, stride
+    #if 0
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.f64
+    #endif
+    #if 0
+        , fill2_arg.data.f64
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vf64, .data = {.vf64=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 64
+// Python wrapper for npyv_loadn2_f64(): parses (sequence, stride) and returns the
+// loaded vector; NULL on a parse failure or (ValueError) a too-short sequence.
+static PyObject *
+simd__intrin_loadn2_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_f64};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_f64};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&:loadn2_f64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f64 *seq_ptr = seq_arg.data.qf64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f64;
+    if (stride < 0) {
+        // negative stride: start 2 elements before the end of the sequence
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // npy_intp/Py_ssize_t values need %zd; %d would read them as int
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_f64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len);
+        goto err;
+    }
+    npyv_f64 rvec = npyv_loadn2_f64(seq_ptr, stride
+    #if 0
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.f64
+    #endif
+    #if 0
+        , fill2_arg.data.f64
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vf64, .data = {.vf64=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 64
+// Python wrapper for npyv_loadn_till_f64(): parses (sequence, stride, nlane, fill);
+// returns the vector, NULL on a parse failure or (ValueError) a too-short sequence.
+static PyObject *
+simd__intrin_loadn_till_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 1
+    simd_arg fill_arg = {.dtype = simd_data_f64};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_f64};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:loadn_till_f64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f64 *seq_ptr = seq_arg.data.qf64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f64;
+    if (stride < 0) {
+        // negative stride: start from the last element of the sequence
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // npy_intp/Py_ssize_t values need %zd; %d would read them as int
+        PyErr_Format(PyExc_ValueError,
+            "loadn_till_f64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len);
+        goto err;
+    }
+    npyv_f64 rvec = npyv_loadn_till_f64(seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 1
+        , fill_arg.data.f64
+    #endif
+    #if 0
+        , fill2_arg.data.f64
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vf64, .data = {.vf64=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 64
+// Python wrapper for npyv_loadn2_till_f64(): parses (sequence, stride, nlane, fill,
+// fill2); returns the vector, NULL on a parse failure or a too-short sequence.
+static PyObject *
+simd__intrin_loadn2_till_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 1
+    simd_arg fill_arg = {.dtype = simd_data_f64};
+#endif
+#if 1
+    simd_arg fill2_arg = {.dtype = simd_data_f64};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_f64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f64 *seq_ptr = seq_arg.data.qf64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f64;
+    if (stride < 0) {
+        // negative stride: start 2 elements before the end of the sequence
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // npy_intp/Py_ssize_t values need %zd; %d would read them as int
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_till_f64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len);
+        goto err;
+    }
+    npyv_f64 rvec = npyv_loadn2_till_f64(seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 1
+        , fill_arg.data.f64
+    #endif
+    #if 1
+        , fill2_arg.data.f64
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vf64, .data = {.vf64=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 64
+// Python wrapper for npyv_loadn2_till_f64(): parses (sequence, stride, nlane, fill,
+// fill2); returns the vector, NULL on a parse failure or a too-short sequence.
+static PyObject *
+simd__intrin_loadn2_till_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 1
+    simd_arg fill_arg = {.dtype = simd_data_f64};
+#endif
+#if 1
+    simd_arg fill2_arg = {.dtype = simd_data_f64};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&O&:loadn2_till_f64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 1
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f64 *seq_ptr = seq_arg.data.qf64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f64;
+    if (stride < 0) {
+        // negative stride: start 2 elements before the end of the sequence
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // npy_intp/Py_ssize_t values need %zd; %d would read them as int
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_till_f64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len);
+        goto err;
+    }
+    npyv_f64 rvec = npyv_loadn2_till_f64(seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 1
+        , fill_arg.data.f64
+    #endif
+    #if 1
+        , fill2_arg.data.f64
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vf64, .data = {.vf64=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !0 || 0 == 64
+// Python wrapper for npyv_loadn_tillz_f64(): parses (sequence, stride, nlane);
+// returns the vector, NULL on a parse failure or (ValueError) a too-short sequence.
+static PyObject *
+simd__intrin_loadn_tillz_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_f64};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_f64};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn_tillz_f64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f64 *seq_ptr = seq_arg.data.qf64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f64;
+    if (stride < 0) {
+        // negative stride: start from the last element of the sequence
+        seq_ptr += cur_seq_len - 1 * 1;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // npy_intp/Py_ssize_t values need %zd; %d would read them as int
+        PyErr_Format(PyExc_ValueError,
+            "loadn_tillz_f64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len);
+        goto err;
+    }
+    npyv_f64 rvec = npyv_loadn_tillz_f64(seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.f64
+    #endif
+    #if 0
+        , fill2_arg.data.f64
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vf64, .data = {.vf64=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !32 || 32 == 64
+// Python wrapper for npyv_loadn2_tillz_f64(): parses (sequence, stride, nlane);
+// returns the vector, NULL on a parse failure or (ValueError) a too-short sequence.
+static PyObject *
+simd__intrin_loadn2_tillz_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_f64};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_f64};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_f64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f64 *seq_ptr = seq_arg.data.qf64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f64;
+    if (stride < 0) {
+        // negative stride: start 2 elements before the end of the sequence
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // npy_intp/Py_ssize_t values need %zd; %d would read them as int
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_f64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len);
+        goto err;
+    }
+    npyv_f64 rvec = npyv_loadn2_tillz_f64(seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.f64
+    #endif
+    #if 0
+        , fill2_arg.data.f64
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vf64, .data = {.vf64=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 136
+#if !64 || 64 == 64
+// Python wrapper for npyv_loadn2_tillz_f64(): parses (sequence, stride, nlane);
+// returns the vector, NULL on a parse failure or (ValueError) a too-short sequence.
+static PyObject *
+simd__intrin_loadn2_tillz_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif // till
+#if 0
+    simd_arg fill_arg = {.dtype = simd_data_f64};
+#endif
+#if 0
+    simd_arg fill2_arg = {.dtype = simd_data_f64};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:loadn2_tillz_f64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill_arg
+#endif
+#if 0
+        ,simd_arg_converter, &fill2_arg
+#endif
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f64 *seq_ptr = seq_arg.data.qf64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f64;
+    if (stride < 0) {
+        // negative stride: start 2 elements before the end of the sequence
+        seq_ptr += cur_seq_len - 1 * 2;
+        min_seq_len = -min_seq_len;
+    }
+    if (cur_seq_len < min_seq_len) {
+        // npy_intp/Py_ssize_t values need %zd; %d would read them as int
+        PyErr_Format(PyExc_ValueError,
+            "loadn2_tillz_f64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len);
+        goto err;
+    }
+    npyv_f64 rvec = npyv_loadn2_tillz_f64(seq_ptr, stride
+    #if 1
+        , nlane_arg.data.u32
+    #endif
+    #if 0
+        , fill_arg.data.f64
+    #endif
+    #if 0
+        , fill2_arg.data.f64
+    #endif
+    );
+    simd_arg ret = {.dtype = simd_data_vf64, .data = {.vf64=rvec}};
+    simd_arg_free(&seq_arg);
+    return simd_arg_to_obj(&ret);
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+
+// Non-contiguous Store
+#line 216
+#if !0 || 0 == 64
+// Python wrapper for npyv_storen_f64(): parses (sequence, stride, vector), stores
+// with that stride and writes the buffer back; returns None, or NULL on failure.
+static PyObject *
+simd__intrin_storen_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vf64};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen_f64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f64 *seq_ptr = seq_arg.data.qf64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f64;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*1;  // negative stride: start from the last element
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: the sequence must hold at least |stride| * nlanes elements
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen_f64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len);  // %zd matches Py_ssize_t
+        goto err;
+    }
+    npyv_storen_f64(seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vf64
+    );
+    // write-back of the modified buffer through simd_sequence_fill_iterable()
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qf64, simd_data_qf64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 64
+// Python wrapper for npyv_storen2_f64(): parses (sequence, stride, vector), stores
+// with that stride and writes the buffer back; returns None, or NULL on failure.
+static PyObject *
+simd__intrin_storen2_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vf64};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_f64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f64 *seq_ptr = seq_arg.data.qf64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f64;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*2;  // negative stride: start 2 elements before the end
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: the sequence must hold at least |stride| * nlanes elements
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_f64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len);  // %zd matches Py_ssize_t
+        goto err;
+    }
+    npyv_storen2_f64(seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vf64
+    );
+    // write-back of the modified buffer through simd_sequence_fill_iterable()
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qf64, simd_data_qf64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 64
+// Python wrapper for npyv_storen2_f64(): parses (sequence, stride, vector), stores
+// with that stride and writes the buffer back; returns None, or NULL on failure.
+static PyObject *
+simd__intrin_storen2_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vf64};
+#if 0
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&:storen2_f64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 0
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f64 *seq_ptr = seq_arg.data.qf64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f64;
+    if (stride < 0) {
+        seq_ptr += cur_seq_len - 1*2;  // negative stride: start 2 elements before the end
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard: the sequence must hold at least |stride| * nlanes elements
+    if (cur_seq_len < min_seq_len) {
+        PyErr_Format(PyExc_ValueError,
+            "storen2_f64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len);  // %zd matches Py_ssize_t
+        goto err;
+    }
+    npyv_storen2_f64(seq_ptr, stride
+    #if 0
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vf64
+    );
+    // write-back of the modified buffer through simd_sequence_fill_iterable()
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qf64, simd_data_qf64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !0 || 0 == 64
+/**
+ * Python wrapper around npyv_storen_till_f64(): strided store of an f64
+ * vector into a sequence, with an extra lane-count argument.
+ *
+ * Positional arguments (each converted by simd_arg_converter):
+ *   1. the f64 sequence that receives the data,
+ *   2. the stride, parsed as a signed 64-bit integer,
+ *   3. a lane count (unsigned 32-bit) forwarded to the intrinsic,
+ *   4. the f64 vector to store.
+ *
+ * Returns None on success and NULL on failure: an argument parsing error,
+ * a sequence shorter than |stride| * npyv_nlanes_f64 (ValueError), or a
+ * failed write-back into the original Python object.
+ */
+static PyObject *
+simd__intrin_storen_till_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vf64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    // the text after ':' is the function name reported in parsing errors
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen_till_f64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f64 *seq_ptr = seq_arg.data.qf64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f64;
+    if (stride < 0) {
+        // a negative stride walks backwards: start at the last element and
+        // compare against the absolute length requirement
+        seq_ptr += cur_seq_len - 1*1;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd is the Py_ssize_t conversion; the stride is cast explicitly so
+        // the variadic argument always matches the specifier.
+        PyErr_Format(PyExc_ValueError,
+            "storen_till_f64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen_till_f64(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vf64
+    );
+    // write-back: copy the modified C buffer into the caller's Python object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qf64, simd_data_qf64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !32 || 32 == 64
+/**
+ * Python wrapper around npyv_storen2_till_f64(): strided store of an f64
+ * vector into a sequence, with an extra lane-count argument.
+ *
+ * The guard above evaluates to false for this instantiation, so this copy
+ * is compiled out; it is kept in step with the variant that is compiled.
+ *
+ * Positional arguments (each converted by simd_arg_converter):
+ *   1. the f64 sequence that receives the data,
+ *   2. the stride, parsed as a signed 64-bit integer,
+ *   3. a lane count (unsigned 32-bit) forwarded to the intrinsic,
+ *   4. the f64 vector to store.
+ *
+ * Returns None on success and NULL on failure: an argument parsing error,
+ * a sequence shorter than |stride| * npyv_nlanes_f64 (ValueError), or a
+ * failed write-back into the original Python object.
+ */
+static PyObject *
+simd__intrin_storen2_till_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vf64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    // the text after ':' is the function name reported in parsing errors
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_f64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f64 *seq_ptr = seq_arg.data.qf64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f64;
+    if (stride < 0) {
+        // a negative stride walks backwards: start at the final 2-lane slot
+        // and compare against the absolute length requirement
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd is the Py_ssize_t conversion; the stride is cast explicitly so
+        // the variadic argument always matches the specifier.
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_f64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_till_f64(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vf64
+    );
+    // write-back: copy the modified C buffer into the caller's Python object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qf64, simd_data_qf64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#line 216
+#if !64 || 64 == 64
+/**
+ * Python wrapper around npyv_storen2_till_f64(): strided store of an f64
+ * vector into a sequence, with an extra lane-count argument.
+ *
+ * Positional arguments (each converted by simd_arg_converter):
+ *   1. the f64 sequence that receives the data,
+ *   2. the stride, parsed as a signed 64-bit integer,
+ *   3. a lane count (unsigned 32-bit) forwarded to the intrinsic,
+ *   4. the f64 vector to store.
+ *
+ * Returns None on success and NULL on failure: an argument parsing error,
+ * a sequence shorter than |stride| * npyv_nlanes_f64 (ValueError), or a
+ * failed write-back into the original Python object.
+ */
+static PyObject *
+simd__intrin_storen2_till_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    simd_arg seq_arg = {.dtype = simd_data_qf64};
+    simd_arg stride_arg = {.dtype = simd_data_s64};
+    simd_arg vec_arg = {.dtype = simd_data_vf64};
+#if 1
+    simd_arg nlane_arg = {.dtype = simd_data_u32};
+#endif
+    // the text after ':' is the function name reported in parsing errors
+    if (!PyArg_ParseTuple(
+        args, "O&O&O&O&:storen2_till_f64",
+        simd_arg_converter, &seq_arg,
+        simd_arg_converter, &stride_arg
+#if 1
+        ,simd_arg_converter, &nlane_arg
+#endif
+        ,simd_arg_converter, &vec_arg
+    )) {
+        return NULL;
+    }
+    npyv_lanetype_f64 *seq_ptr = seq_arg.data.qf64;
+    npy_intp stride = (npy_intp)stride_arg.data.s64;
+    Py_ssize_t cur_seq_len = simd_sequence_len(seq_ptr);
+    Py_ssize_t min_seq_len = stride * npyv_nlanes_f64;
+    if (stride < 0) {
+        // a negative stride walks backwards: start at the final 2-lane slot
+        // and compare against the absolute length requirement
+        seq_ptr += cur_seq_len - 1*2;
+        min_seq_len = -min_seq_len;
+    }
+    // overflow guard
+    if (cur_seq_len < min_seq_len) {
+        // %zd is the Py_ssize_t conversion; the stride is cast explicitly so
+        // the variadic argument always matches the specifier.
+        PyErr_Format(PyExc_ValueError,
+            "storen2_till_f64(), according to provided stride %zd, the "
+            "minimum acceptable size of the required sequence is %zd, given(%zd)",
+            (Py_ssize_t)stride, min_seq_len, cur_seq_len
+        );
+        goto err;
+    }
+    npyv_storen2_till_f64(
+        seq_ptr, stride
+    #if 1
+        ,nlane_arg.data.u32
+    #endif
+        ,vec_arg.data.vf64
+    );
+    // write-back: copy the modified C buffer into the caller's Python object
+    if (simd_sequence_fill_iterable(seq_arg.obj, seq_arg.data.qf64, simd_data_qf64)) {
+        goto err;
+    }
+    simd_arg_free(&seq_arg);
+    Py_RETURN_NONE;
+err:
+    simd_arg_free(&seq_arg);
+    return NULL;
+}
+#endif // chksize
+
+#endif // 1
+
+/****************************
+ * Lookup tables
+ ****************************/
+#if 64 == 32
+SIMD_IMPL_INTRIN_2(lut32_f64, vf64, qf64, vu64)
+#endif
+#if 64 == 64
+SIMD_IMPL_INTRIN_2(lut16_f64, vf64, qf64, vu64)
+#endif
+/***************************
+ * Misc
+ ***************************/
+SIMD_IMPL_INTRIN_0(zero_f64, vf64)
+SIMD_IMPL_INTRIN_1(extract0_f64, f64, vf64)
+SIMD_IMPL_INTRIN_1(setall_f64, vf64, f64)
+SIMD_IMPL_INTRIN_3(select_f64, vf64, vb64, vf64, vf64)
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u8_f64, vu8, vf64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s8_f64, vs8, vf64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u16_f64, vu16, vf64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s16_f64, vs16, vf64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u32_f64, vu32, vf64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s32_f64, vs32, vf64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_u64_f64, vu64, vf64)
+#endif // simd_sup2
+
+#line 296
+#if 1
+SIMD_IMPL_INTRIN_1(reinterpret_s64_f64, vs64, vf64)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F32
+SIMD_IMPL_INTRIN_1(reinterpret_f32_f64, vf32, vf64)
+#endif // simd_sup2
+
+#line 296
+#if NPY_SIMD_F64
+SIMD_IMPL_INTRIN_1(reinterpret_f64_f64, vf64, vf64)
+#endif // simd_sup2
+
+
+/**
+ * Special definitions are needed due to the nature of the intrinsics
+ * npyv_setf_f64 and npyv_set_f64.
+*/
+#line 308
+static PyObject *
+simd__intrin_setf_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    // Turn the positional arguments into a C buffer of f64 lanes; the
+    // converter is asked for at least npyv_nlanes_f64 elements.
+    npyv_lanetype_f64 *lanes = simd_sequence_from_iterable(
+        args, simd_data_qf64, npyv_nlanes_f64
+    );
+    if (!lanes) {
+        return NULL;
+    }
+    // NOTE(review): 65 elements are spelled out below while only
+    // npyv_nlanes_f64 are required above; presumably npyv_setf_f64 is a macro
+    // that keeps just the leading arguments it needs -- confirm that a call
+    // with exactly npyv_nlanes_f64 values cannot read past the buffer.
+    simd_data result = {.vf64 = npyv_setf_f64(
+        lanes[0],  lanes[1],  lanes[2],  lanes[3],  lanes[4],  lanes[5],  lanes[6],  lanes[7],
+        lanes[8],  lanes[9],  lanes[10], lanes[11], lanes[12], lanes[13], lanes[14], lanes[15],
+        lanes[16], lanes[17], lanes[18], lanes[19], lanes[20], lanes[21], lanes[22], lanes[23],
+        lanes[24], lanes[25], lanes[26], lanes[27], lanes[28], lanes[29], lanes[30], lanes[31],
+        lanes[32], lanes[33], lanes[34], lanes[35], lanes[36], lanes[37], lanes[38], lanes[39],
+        lanes[40], lanes[41], lanes[42], lanes[43], lanes[44], lanes[45], lanes[46], lanes[47],
+        lanes[48], lanes[49], lanes[50], lanes[51], lanes[52], lanes[53], lanes[54], lanes[55],
+        lanes[56], lanes[57], lanes[58], lanes[59], lanes[60], lanes[61], lanes[62], lanes[63],
+        lanes[64] // extra trailing argument, present for setf
+    )};
+    // the temporary buffer is no longer needed once the vector is built
+    simd_sequence_free(lanes);
+    return (PyObject*)PySIMDVector_FromData(result, simd_data_vf64);
+}
+
+#line 308
+static PyObject *
+simd__intrin_set_f64(PyObject* NPY_UNUSED(self), PyObject *args)
+{
+    // Turn the positional arguments into a C buffer of f64 lanes; the
+    // converter is asked for at least npyv_nlanes_f64 elements.
+    npyv_lanetype_f64 *lanes = simd_sequence_from_iterable(
+        args, simd_data_qf64, npyv_nlanes_f64
+    );
+    if (!lanes) {
+        return NULL;
+    }
+    // Build the vector from the buffer and wrap it below as a Python vector.
+    simd_data result = {.vf64 = npyv_set_f64(
+        lanes[0],  lanes[1],  lanes[2],  lanes[3],  lanes[4],  lanes[5],  lanes[6],  lanes[7],
+        lanes[8],  lanes[9],  lanes[10], lanes[11], lanes[12], lanes[13], lanes[14], lanes[15],
+        lanes[16], lanes[17], lanes[18], lanes[19], lanes[20], lanes[21], lanes[22], lanes[23],
+        lanes[24], lanes[25], lanes[26], lanes[27], lanes[28], lanes[29], lanes[30], lanes[31],
+        lanes[32], lanes[33], lanes[34], lanes[35], lanes[36], lanes[37], lanes[38], lanes[39],
+        lanes[40], lanes[41], lanes[42], lanes[43], lanes[44], lanes[45], lanes[46], lanes[47],
+        lanes[48], lanes[49], lanes[50], lanes[51], lanes[52], lanes[53], lanes[54], lanes[55],
+        lanes[56], lanes[57], lanes[58], lanes[59], lanes[60], lanes[61], lanes[62], lanes[63],
+        lanes[64] // extra trailing argument, present for setf
+    )};
+    // the temporary buffer is no longer needed once the vector is built
+    simd_sequence_free(lanes);
+    return (PyObject*)PySIMDVector_FromData(result, simd_data_vf64);
+}
+
+
+/***************************
+ * Reorder
+ ***************************/
+#line 337
+SIMD_IMPL_INTRIN_2(combinel_f64, vf64, vf64, vf64)
+
+#line 337
+SIMD_IMPL_INTRIN_2(combineh_f64, vf64, vf64, vf64)
+
+
+#line 343
+SIMD_IMPL_INTRIN_2(combine_f64, vf64x2, vf64, vf64)
+
+#line 343
+SIMD_IMPL_INTRIN_2(zip_f64, vf64x2, vf64, vf64)
+
+#line 343
+SIMD_IMPL_INTRIN_2(unzip_f64, vf64x2, vf64, vf64)
+
+
+#if 0
+SIMD_IMPL_INTRIN_1(rev64_f64, vf64, vf64)
+#endif
+
+// special implementation to convert runtime constants to immediate values
+#if 64 == 32
+// one call per element index, then gather the results into a single vector
+// instead of unrolling all 256 (4^4) possible index combinations.
+/*
+ * Runtime-index front end for npyv_permi128_f64 (four-index form).
+ *
+ * For every selector eN (N = 0..3) the matching immediate call
+ * npyv_permi128_f64(a, k, k, k, k) is picked with an if/else chain and its
+ * result is stored into the scratch array deN. Selector values other than
+ * 1, 2 or 3 fall through to the index-0 call. The final vector takes
+ * position 0 of each group of four lanes from de0, position 1 from de1,
+ * position 2 from de2 and position 3 from de3.
+ *
+ * This variant sits under `#if 64 == 32`, so it is not compiled for this
+ * instantiation.
+ */
+NPY_FINLINE npyv_f64
+npyv_permi128_f64_(npyv_f64 a, unsigned e0, unsigned e1, unsigned e2, unsigned e3)
+{
+   #line 360
+    npyv_f64 ve0;
+    npyv_lanetype_f64 de0[npyv_nlanes_f64];
+    if (0) {}
+   #line 366
+    else if (e0 == 1) {
+        ve0 = npyv_permi128_f64(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e0 == 2) {
+        ve0 = npyv_permi128_f64(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e0 == 3) {
+        ve0 = npyv_permi128_f64(a, 3, 3, 3, 3);
+    }
+    
+    else {
+        ve0 = npyv_permi128_f64(a, 0, 0, 0, 0);
+    }
+    npyv_store_f64(de0, ve0);
+    
+#line 360
+    npyv_f64 ve1;
+    npyv_lanetype_f64 de1[npyv_nlanes_f64];
+    if (0) {}
+   #line 366
+    else if (e1 == 1) {
+        ve1 = npyv_permi128_f64(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e1 == 2) {
+        ve1 = npyv_permi128_f64(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e1 == 3) {
+        ve1 = npyv_permi128_f64(a, 3, 3, 3, 3);
+    }
+    
+    else {
+        ve1 = npyv_permi128_f64(a, 0, 0, 0, 0);
+    }
+    npyv_store_f64(de1, ve1);
+    
+#line 360
+    npyv_f64 ve2;
+    npyv_lanetype_f64 de2[npyv_nlanes_f64];
+    if (0) {}
+   #line 366
+    else if (e2 == 1) {
+        ve2 = npyv_permi128_f64(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e2 == 2) {
+        ve2 = npyv_permi128_f64(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e2 == 3) {
+        ve2 = npyv_permi128_f64(a, 3, 3, 3, 3);
+    }
+    
+    else {
+        ve2 = npyv_permi128_f64(a, 0, 0, 0, 0);
+    }
+    npyv_store_f64(de2, ve2);
+    
+#line 360
+    npyv_f64 ve3;
+    npyv_lanetype_f64 de3[npyv_nlanes_f64];
+    if (0) {}
+   #line 366
+    else if (e3 == 1) {
+        ve3 = npyv_permi128_f64(a, 1, 1, 1, 1);
+    }
+    
+#line 366
+    else if (e3 == 2) {
+        ve3 = npyv_permi128_f64(a, 2, 2, 2, 2);
+    }
+    
+#line 366
+    else if (e3 == 3) {
+        ve3 = npyv_permi128_f64(a, 3, 3, 3, 3);
+    }
+    
+    else {
+        ve3 = npyv_permi128_f64(a, 0, 0, 0, 0);
+    }
+    npyv_store_f64(de3, ve3);
+    
+    // fast path: all four selectors are equal, so ve0 is already the answer
+    if (e0 == e1 && e0 == e2 && e0 == e3) {
+        return ve0;
+    }
+    // merge: within each group of four lanes, keep position 0 from de0 and
+    // overwrite positions 1..3 with the values selected by e1..e3
+    for (int i = 0; i < npyv_nlanes_f64; i += 4) {
+        de0[i+1] = de1[i+1];
+        de0[i+2] = de2[i+2];
+        de0[i+3] = de3[i+3];
+    }
+    return npyv_load_f64(de0);
+}
+SIMD_IMPL_INTRIN_5(permi128_f64_, vf64, vf64, u8, u8, u8, u8)
+#elif 64 == 64
+/*
+ * Runtime-index front end for npyv_permi128_f64 (two-index form): each
+ * (e0, e1) pair is routed to a call site that passes the indices as
+ * literal constants.
+ */
+NPY_FINLINE npyv_f64
+npyv_permi128_f64_(npyv_f64 a, unsigned e0, unsigned e1)
+{
+    if (e1 == 1) {
+        if (e0 == 0) {
+            return npyv_permi128_f64(a, 0, 1);
+        }
+        if (e0 == 1) {
+            return npyv_permi128_f64(a, 1, 1);
+        }
+    }
+    else if (e1 == 0 && e0 == 1) {
+        return npyv_permi128_f64(a, 1, 0);
+    }
+    // every remaining combination, including indices other than 0 or 1,
+    // falls back to selecting index 0 for both positions
+    return npyv_permi128_f64(a, 0, 0);
+}
+SIMD_IMPL_INTRIN_3(permi128_f64_, vf64, vf64, u8, u8)
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if 0 > 0
+SIMD_IMPL_INTRIN_2(shl_f64, vf64, vf64, u8)
+SIMD_IMPL_INTRIN_2(shr_f64, vf64, vf64, u8)
+// immediate constant
+SIMD_IMPL_INTRIN_2IMM(shli_f64, vf64, vf64, 0)
+SIMD_IMPL_INTRIN_2IMM(shri_f64, vf64, vf64, 0)
+#endif // shl_imm
+
+#line 418
+SIMD_IMPL_INTRIN_2(and_f64, vf64, vf64, vf64)
+
+#line 418
+SIMD_IMPL_INTRIN_2(or_f64, vf64, vf64, vf64)
+
+#line 418
+SIMD_IMPL_INTRIN_2(xor_f64, vf64, vf64, vf64)
+
+
+SIMD_IMPL_INTRIN_1(not_f64, vf64, vf64)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpeq_f64, vb64, vf64, vf64)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpneq_f64, vb64, vf64, vf64)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpgt_f64, vb64, vf64, vf64)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmpge_f64, vb64, vf64, vf64)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmplt_f64, vb64, vf64, vf64)
+
+#line 426
+SIMD_IMPL_INTRIN_2(cmple_f64, vb64, vf64, vf64)
+
+
+#if 0
+SIMD_IMPL_INTRIN_2(andc_f64, vf64, vf64, vf64)
+SIMD_IMPL_INTRIN_2(andc_b64, vb64, vb64, vb64)
+SIMD_IMPL_INTRIN_2(orc_b64, vb64, vb64, vb64)
+SIMD_IMPL_INTRIN_2(xnor_b64, vb64, vb64, vb64)
+#endif
+
+// test across all vector lanes
+#line 440
+SIMD_IMPL_INTRIN_1(any_f64, u8, vf64)
+
+#line 440
+SIMD_IMPL_INTRIN_1(all_f64, u8, vf64)
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_IMPL_INTRIN_1(cvt_f64_b64, vf64,  vb64)
+SIMD_IMPL_INTRIN_1(cvt_b64_f64, vb64, vf64)
+#if 0
+SIMD_IMPL_INTRIN_1(expand_f64_f64, vf64x2, vf64)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+#line 456
+SIMD_IMPL_INTRIN_2(add_f64, vf64, vf64, vf64)
+
+#line 456
+SIMD_IMPL_INTRIN_2(sub_f64, vf64, vf64, vf64)
+
+
+#if 0
+#line 463
+SIMD_IMPL_INTRIN_2(adds_f64, vf64, vf64, vf64)
+
+#line 463
+SIMD_IMPL_INTRIN_2(subs_f64, vf64, vf64, vf64)
+
+#endif // sat_sup
+
+#if 1
+SIMD_IMPL_INTRIN_2(mul_f64, vf64, vf64, vf64)
+#endif // mul_sup
+
+#if 1
+SIMD_IMPL_INTRIN_2(div_f64, vf64, vf64, vf64)
+#endif // div_sup
+
+#if 0
+SIMD_IMPL_INTRIN_1(divisor_f64, vf64x3, f64)
+SIMD_IMPL_INTRIN_2(divc_f64, vf64, vf64, vf64x3)
+#endif // intdiv_sup
+
+#if 1
+#line 484
+SIMD_IMPL_INTRIN_3(muladd_f64, vf64, vf64, vf64, vf64)
+
+#line 484
+SIMD_IMPL_INTRIN_3(mulsub_f64, vf64, vf64, vf64, vf64)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmuladd_f64, vf64, vf64, vf64, vf64)
+
+#line 484
+SIMD_IMPL_INTRIN_3(nmulsub_f64, vf64, vf64, vf64, vf64)
+
+#line 484
+SIMD_IMPL_INTRIN_3(muladdsub_f64, vf64, vf64, vf64, vf64)
+
+#endif // fused_sup
+
+#if 1
+SIMD_IMPL_INTRIN_1(sum_f64, f64, vf64)
+#endif // sum_sup
+
+#if 0
+SIMD_IMPL_INTRIN_1(sumup_f64, f64, vf64)
+#endif // sumup_sup
+
+/***************************
+ * Math
+ ***************************/
+#if 1
+#line 503
+SIMD_IMPL_INTRIN_1(sqrt_f64, vf64, vf64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(recip_f64, vf64, vf64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(abs_f64, vf64, vf64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(square_f64, vf64, vf64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(rint_f64, vf64, vf64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(ceil_f64, vf64, vf64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(trunc_f64, vf64, vf64)
+
+#line 503
+SIMD_IMPL_INTRIN_1(floor_f64, vf64, vf64)
+
+#endif
+
+#line 510
+SIMD_IMPL_INTRIN_2(max_f64, vf64, vf64, vf64)
+SIMD_IMPL_INTRIN_1(reduce_max_f64, f64, vf64)
+
+#line 510
+SIMD_IMPL_INTRIN_2(min_f64, vf64, vf64, vf64)
+SIMD_IMPL_INTRIN_1(reduce_min_f64, f64, vf64)
+
+
+#if 1
+#line 518
+SIMD_IMPL_INTRIN_2(maxp_f64, vf64, vf64, vf64)
+SIMD_IMPL_INTRIN_1(reduce_maxp_f64, f64, vf64)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minp_f64, vf64, vf64, vf64)
+SIMD_IMPL_INTRIN_1(reduce_minp_f64, f64, vf64)
+
+#line 518
+SIMD_IMPL_INTRIN_2(maxn_f64, vf64, vf64, vf64)
+SIMD_IMPL_INTRIN_1(reduce_maxn_f64, f64, vf64)
+
+#line 518
+SIMD_IMPL_INTRIN_2(minn_f64, vf64, vf64, vf64)
+SIMD_IMPL_INTRIN_1(reduce_minn_f64, f64, vf64)
+
+/**end repeat1**/
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+#line 530
+ SIMD_IMPL_INTRIN_4(ifadd_f64, vf64, vb64, vf64, vf64, vf64)
+
+#line 530
+ SIMD_IMPL_INTRIN_4(ifsub_f64, vf64, vb64, vf64, vf64, vf64)
+
+
+#if 1
+SIMD_IMPL_INTRIN_4(ifdiv_f64, vf64, vb64, vf64, vf64, vf64)
+SIMD_IMPL_INTRIN_3(ifdivz_f64, vf64, vb64, vf64, vf64)
+#endif
+
+#endif // simd_sup
+
+/*************************************************************************
+ * Variant
+ ************************************************************************/
+SIMD_IMPL_INTRIN_0N(cleanup)
+
+/*************************************************************************
+ * A special section for f32/f64 intrinsics outside the main repeater
+ ************************************************************************/
+/***************************
+ * Operators
+ ***************************/
+// check special cases
+#if NPY_SIMD_F32
+    SIMD_IMPL_INTRIN_1(notnan_f32, vb32, vf32)
+#endif
+#if NPY_SIMD_F64
+    SIMD_IMPL_INTRIN_1(notnan_f64, vb64, vf64)
+#endif
+/***************************
+ * Conversions
+ ***************************/
+// round to nearest integer (assume even)
+#if NPY_SIMD_F32
+    SIMD_IMPL_INTRIN_1(round_s32_f32, vs32, vf32)
+#endif
+#if NPY_SIMD_F64
+    SIMD_IMPL_INTRIN_2(round_s32_f64, vs32, vf64, vf64)
+#endif
+
+/*************************************************************************
+ * A special section for boolean intrinsics outside the main repeater
+ ************************************************************************/
+/***************************
+ * Operators
+ ***************************/
+#line 578
+// Logical
+SIMD_IMPL_INTRIN_2(and_b8, vb8, vb8, vb8)
+SIMD_IMPL_INTRIN_2(or_b8,  vb8, vb8, vb8)
+SIMD_IMPL_INTRIN_2(xor_b8, vb8, vb8, vb8)
+SIMD_IMPL_INTRIN_1(not_b8, vb8, vb8)
+// test across the vector's lanes
+#line 587
+SIMD_IMPL_INTRIN_1(any_b8, u8, vb8)
+
+#line 587
+SIMD_IMPL_INTRIN_1(all_b8, u8, vb8)
+
+
+#line 578
+// Logical
+SIMD_IMPL_INTRIN_2(and_b16, vb16, vb16, vb16)
+SIMD_IMPL_INTRIN_2(or_b16,  vb16, vb16, vb16)
+SIMD_IMPL_INTRIN_2(xor_b16, vb16, vb16, vb16)
+SIMD_IMPL_INTRIN_1(not_b16, vb16, vb16)
+// test across the vector's lanes
+#line 587
+SIMD_IMPL_INTRIN_1(any_b16, u8, vb16)
+
+#line 587
+SIMD_IMPL_INTRIN_1(all_b16, u8, vb16)
+
+
+#line 578
+// Logical
+SIMD_IMPL_INTRIN_2(and_b32, vb32, vb32, vb32)
+SIMD_IMPL_INTRIN_2(or_b32,  vb32, vb32, vb32)
+SIMD_IMPL_INTRIN_2(xor_b32, vb32, vb32, vb32)
+SIMD_IMPL_INTRIN_1(not_b32, vb32, vb32)
+// test across the vector's lanes
+#line 587
+SIMD_IMPL_INTRIN_1(any_b32, u8, vb32)
+
+#line 587
+SIMD_IMPL_INTRIN_1(all_b32, u8, vb32)
+
+
+#line 578
+// Logical
+SIMD_IMPL_INTRIN_2(and_b64, vb64, vb64, vb64)
+SIMD_IMPL_INTRIN_2(or_b64,  vb64, vb64, vb64)
+SIMD_IMPL_INTRIN_2(xor_b64, vb64, vb64, vb64)
+SIMD_IMPL_INTRIN_1(not_b64, vb64, vb64)
+// test across the vector's lanes
+#line 587
+SIMD_IMPL_INTRIN_1(any_b64, u8, vb64)
+
+#line 587
+SIMD_IMPL_INTRIN_1(all_b64, u8, vb64)
+
+
+/***************************
+ * Conversions
+ ***************************/
+// Convert mask vector to integer bitfield
+#line 597
+SIMD_IMPL_INTRIN_1(tobits_b8, u64, vb8)
+
+#line 597
+SIMD_IMPL_INTRIN_1(tobits_b16, u64, vb16)
+
+#line 597
+SIMD_IMPL_INTRIN_1(tobits_b32, u64, vb32)
+
+#line 597
+SIMD_IMPL_INTRIN_1(tobits_b64, u64, vb64)
+
+
+SIMD_IMPL_INTRIN_2(pack_b8_b16, vb8, vb16, vb16)
+SIMD_IMPL_INTRIN_4(pack_b8_b32, vb8, vb32, vb32, vb32, vb32)
+SIMD_IMPL_INTRIN_8(pack_b8_b64, vb8, vb64, vb64, vb64, vb64,
+                                     vb64, vb64, vb64, vb64)
+
+//#########################################################################
+//## Attach module functions
+//#########################################################################
+static PyMethodDef simd__intrinsics_methods[] = {
+#line 630
+#if 1
+
+/***************************
+ * Memory
+ ***************************/
+#line 638
+SIMD_INTRIN_DEF(load_u8)
+
+#line 638
+SIMD_INTRIN_DEF(loada_u8)
+
+#line 638
+SIMD_INTRIN_DEF(loads_u8)
+
+#line 638
+SIMD_INTRIN_DEF(loadl_u8)
+
+#line 638
+SIMD_INTRIN_DEF(store_u8)
+
+#line 638
+SIMD_INTRIN_DEF(storea_u8)
+
+#line 638
+SIMD_INTRIN_DEF(stores_u8)
+
+#line 638
+SIMD_INTRIN_DEF(storel_u8)
+
+#line 638
+SIMD_INTRIN_DEF(storeh_u8)
+
+
+#line 644
+SIMD_INTRIN_DEF(load_u8x2)
+
+#line 644
+SIMD_INTRIN_DEF(store_u8x2)
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 0
+#line 655
+SIMD_INTRIN_DEF(load_till_u8)
+
+#line 655
+SIMD_INTRIN_DEF(load_tillz_u8)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_u8)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_till_u8)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_tillz_u8)
+
+#line 655
+SIMD_INTRIN_DEF(store_till_u8)
+
+#line 655
+SIMD_INTRIN_DEF(storen_u8)
+
+#line 655
+SIMD_INTRIN_DEF(storen_till_u8)
+
+#if 8 == 32
+    #line 662
+    SIMD_INTRIN_DEF(load2_till_u8)
+    
+#line 662
+    SIMD_INTRIN_DEF(load2_tillz_u8)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_u8)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_till_u8)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_tillz_u8)
+    
+#line 662
+    SIMD_INTRIN_DEF(store2_till_u8)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_u8)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_till_u8)
+    
+#else
+    #line 669
+    SIMD_INTRIN_DEF(load2_till_u8)
+    
+#line 669
+    SIMD_INTRIN_DEF(load2_tillz_u8)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_u8)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_till_u8)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_tillz_u8)
+    
+#line 669
+    SIMD_INTRIN_DEF(store2_till_u8)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_u8)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_till_u8)
+    
+#endif
+#endif // ncont_sup
+
+/****************************
+ * Lookup tables
+ ****************************/
+#if 8 == 32
+SIMD_INTRIN_DEF(lut32_u8)
+#endif
+#if 8 == 64
+SIMD_INTRIN_DEF(lut16_u8)
+#endif
+/***************************
+ * Misc
+ ***************************/
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u8_u8)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s8_u8)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u16_u8)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s16_u8)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u32_u8)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s32_u8)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u64_u8)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s64_u8)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F32
+SIMD_INTRIN_DEF(reinterpret_f32_u8)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F64
+SIMD_INTRIN_DEF(reinterpret_f64_u8)
+#endif // simd_sup2
+
+
+#line 698
+SIMD_INTRIN_DEF(set_u8)
+
+#line 698
+SIMD_INTRIN_DEF(setf_u8)
+
+#line 698
+SIMD_INTRIN_DEF(setall_u8)
+
+#line 698
+SIMD_INTRIN_DEF(zero_u8)
+
+#line 698
+SIMD_INTRIN_DEF(select_u8)
+
+#line 698
+SIMD_INTRIN_DEF(extract0_u8)
+
+
+/***************************
+ * Reorder
+ ***************************/
+#line 707
+SIMD_INTRIN_DEF(combinel_u8)
+
+#line 707
+SIMD_INTRIN_DEF(combineh_u8)
+
+#line 707
+SIMD_INTRIN_DEF(combine_u8)
+
+#line 707
+SIMD_INTRIN_DEF(zip_u8)
+
+#line 707
+SIMD_INTRIN_DEF(unzip_u8)
+
+
+#if 1
+SIMD_INTRIN_DEF(rev64_u8)
+#endif
+
+#if 8 > 16
+{ "permi128_u8", simd__intrin_permi128_u8_, METH_VARARGS, NULL },
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if 0 > 0
+#line 725
+SIMD_INTRIN_DEF(shl_u8)
+
+#line 725
+SIMD_INTRIN_DEF(shr_u8)
+
+#line 725
+SIMD_INTRIN_DEF(shli_u8)
+
+#line 725
+SIMD_INTRIN_DEF(shri_u8)
+
+#endif // shl_imm
+
+#line 733
+SIMD_INTRIN_DEF(and_u8)
+
+#line 733
+SIMD_INTRIN_DEF(or_u8)
+
+#line 733
+SIMD_INTRIN_DEF(xor_u8)
+
+#line 733
+SIMD_INTRIN_DEF(not_u8)
+
+#line 733
+SIMD_INTRIN_DEF(cmpeq_u8)
+
+#line 733
+SIMD_INTRIN_DEF(cmpneq_u8)
+
+#line 733
+SIMD_INTRIN_DEF(cmpgt_u8)
+
+#line 733
+SIMD_INTRIN_DEF(cmpge_u8)
+
+#line 733
+SIMD_INTRIN_DEF(cmplt_u8)
+
+#line 733
+SIMD_INTRIN_DEF(cmple_u8)
+
+#line 733
+SIMD_INTRIN_DEF(any_u8)
+
+#line 733
+SIMD_INTRIN_DEF(all_u8)
+
+
+#if 1
+SIMD_INTRIN_DEF(andc_u8)
+SIMD_INTRIN_DEF(andc_b8)
+SIMD_INTRIN_DEF(orc_b8)
+SIMD_INTRIN_DEF(xnor_b8)
+#endif
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_INTRIN_DEF(cvt_u8_b8)
+SIMD_INTRIN_DEF(cvt_b8_u8)
+#if 1
+SIMD_INTRIN_DEF(expand_u16_u8)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+#line 757
+SIMD_INTRIN_DEF(add_u8)
+
+#line 757
+SIMD_INTRIN_DEF(sub_u8)
+
+
+#if 1
+#line 764
+SIMD_INTRIN_DEF(adds_u8)
+
+#line 764
+SIMD_INTRIN_DEF(subs_u8)
+
+#endif // sat_sup
+
+#if 1
+SIMD_INTRIN_DEF(mul_u8)
+#endif // mul_sup
+
+#if 0
+SIMD_INTRIN_DEF(div_u8)
+#endif // div_sup
+
+#if 1
+SIMD_INTRIN_DEF(divisor_u8)
+SIMD_INTRIN_DEF(divc_u8)
+#endif // intdiv_sup
+
+#if 0
+#line 785
+SIMD_INTRIN_DEF(muladd_u8)
+
+#line 785
+SIMD_INTRIN_DEF(mulsub_u8)
+
+#line 785
+SIMD_INTRIN_DEF(nmuladd_u8)
+
+#line 785
+SIMD_INTRIN_DEF(nmulsub_u8)
+
+#line 785
+SIMD_INTRIN_DEF(muladdsub_u8)
+
+#endif // fused_sup
+
+#if 0
+SIMD_INTRIN_DEF(sum_u8)
+#endif // sum_sup
+
+#if 1
+SIMD_INTRIN_DEF(sumup_u8)
+#endif // sumup_sup
+/***************************
+ * Math
+ ***************************/
+#if 0
+#line 803
+SIMD_INTRIN_DEF(sqrt_u8)
+
+#line 803
+SIMD_INTRIN_DEF(recip_u8)
+
+#line 803
+SIMD_INTRIN_DEF(abs_u8)
+
+#line 803
+SIMD_INTRIN_DEF(square_u8)
+
+#line 803
+SIMD_INTRIN_DEF(rint_u8)
+
+#line 803
+SIMD_INTRIN_DEF(ceil_u8)
+
+#line 803
+SIMD_INTRIN_DEF(trunc_u8)
+
+#line 803
+SIMD_INTRIN_DEF(floor_u8)
+
+#endif
+
+#line 810
+SIMD_INTRIN_DEF(max_u8)
+SIMD_INTRIN_DEF(reduce_max_u8)
+
+#line 810
+SIMD_INTRIN_DEF(min_u8)
+SIMD_INTRIN_DEF(reduce_min_u8)
+
+
+#if 0
+#line 818
+SIMD_INTRIN_DEF(maxp_u8)
+SIMD_INTRIN_DEF(reduce_maxp_u8)
+
+#line 818
+SIMD_INTRIN_DEF(minp_u8)
+SIMD_INTRIN_DEF(reduce_minp_u8)
+
+#line 818
+SIMD_INTRIN_DEF(maxn_u8)
+SIMD_INTRIN_DEF(reduce_maxn_u8)
+
+#line 818
+SIMD_INTRIN_DEF(minn_u8)
+SIMD_INTRIN_DEF(reduce_minn_u8)
+
+/**end repeat1**/
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+#line 830
+ SIMD_INTRIN_DEF(ifadd_u8)
+
+#line 830
+ SIMD_INTRIN_DEF(ifsub_u8)
+
+
+#if 0
+#line 837
+SIMD_INTRIN_DEF(ifdiv_u8)
+
+#line 837
+SIMD_INTRIN_DEF(ifdivz_u8)
+
+#endif
+
+#endif // simd_sup
+
+#line 630
+#if 1
+
+/***************************
+ * Memory
+ ***************************/
+#line 638
+SIMD_INTRIN_DEF(load_s8)
+
+#line 638
+SIMD_INTRIN_DEF(loada_s8)
+
+#line 638
+SIMD_INTRIN_DEF(loads_s8)
+
+#line 638
+SIMD_INTRIN_DEF(loadl_s8)
+
+#line 638
+SIMD_INTRIN_DEF(store_s8)
+
+#line 638
+SIMD_INTRIN_DEF(storea_s8)
+
+#line 638
+SIMD_INTRIN_DEF(stores_s8)
+
+#line 638
+SIMD_INTRIN_DEF(storel_s8)
+
+#line 638
+SIMD_INTRIN_DEF(storeh_s8)
+
+
+#line 644
+SIMD_INTRIN_DEF(load_s8x2)
+
+#line 644
+SIMD_INTRIN_DEF(store_s8x2)
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 0
+#line 655
+SIMD_INTRIN_DEF(load_till_s8)
+
+#line 655
+SIMD_INTRIN_DEF(load_tillz_s8)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_s8)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_till_s8)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_tillz_s8)
+
+#line 655
+SIMD_INTRIN_DEF(store_till_s8)
+
+#line 655
+SIMD_INTRIN_DEF(storen_s8)
+
+#line 655
+SIMD_INTRIN_DEF(storen_till_s8)
+
+#if 8 == 32
+    #line 662
+    SIMD_INTRIN_DEF(load2_till_s8)
+    
+#line 662
+    SIMD_INTRIN_DEF(load2_tillz_s8)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_s8)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_till_s8)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_tillz_s8)
+    
+#line 662
+    SIMD_INTRIN_DEF(store2_till_s8)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_s8)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_till_s8)
+    
+#else
+    #line 669
+    SIMD_INTRIN_DEF(load2_till_s8)
+    
+#line 669
+    SIMD_INTRIN_DEF(load2_tillz_s8)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_s8)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_till_s8)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_tillz_s8)
+    
+#line 669
+    SIMD_INTRIN_DEF(store2_till_s8)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_s8)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_till_s8)
+    
+#endif
+#endif // ncont_sup
+
+/****************************
+ * Lookup tables
+ ****************************/
+#if 8 == 32
+SIMD_INTRIN_DEF(lut32_s8)
+#endif
+#if 8 == 64
+SIMD_INTRIN_DEF(lut16_s8)
+#endif
+/***************************
+ * Misc
+ ***************************/
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u8_s8)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s8_s8)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u16_s8)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s16_s8)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u32_s8)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s32_s8)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u64_s8)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s64_s8)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F32
+SIMD_INTRIN_DEF(reinterpret_f32_s8)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F64
+SIMD_INTRIN_DEF(reinterpret_f64_s8)
+#endif // simd_sup2
+
+
+#line 698
+SIMD_INTRIN_DEF(set_s8)
+
+#line 698
+SIMD_INTRIN_DEF(setf_s8)
+
+#line 698
+SIMD_INTRIN_DEF(setall_s8)
+
+#line 698
+SIMD_INTRIN_DEF(zero_s8)
+
+#line 698
+SIMD_INTRIN_DEF(select_s8)
+
+#line 698
+SIMD_INTRIN_DEF(extract0_s8)
+
+
+/***************************
+ * Reorder
+ ***************************/
+#line 707
+SIMD_INTRIN_DEF(combinel_s8)
+
+#line 707
+SIMD_INTRIN_DEF(combineh_s8)
+
+#line 707
+SIMD_INTRIN_DEF(combine_s8)
+
+#line 707
+SIMD_INTRIN_DEF(zip_s8)
+
+#line 707
+SIMD_INTRIN_DEF(unzip_s8)
+
+
+#if 1
+SIMD_INTRIN_DEF(rev64_s8)
+#endif
+
+#if 8 > 16
+{ "permi128_s8", simd__intrin_permi128_s8_, METH_VARARGS, NULL },
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if 0 > 0
+#line 725
+SIMD_INTRIN_DEF(shl_s8)
+
+#line 725
+SIMD_INTRIN_DEF(shr_s8)
+
+#line 725
+SIMD_INTRIN_DEF(shli_s8)
+
+#line 725
+SIMD_INTRIN_DEF(shri_s8)
+
+#endif // shl_imm
+
+#line 733
+SIMD_INTRIN_DEF(and_s8)
+
+#line 733
+SIMD_INTRIN_DEF(or_s8)
+
+#line 733
+SIMD_INTRIN_DEF(xor_s8)
+
+#line 733
+SIMD_INTRIN_DEF(not_s8)
+
+#line 733
+SIMD_INTRIN_DEF(cmpeq_s8)
+
+#line 733
+SIMD_INTRIN_DEF(cmpneq_s8)
+
+#line 733
+SIMD_INTRIN_DEF(cmpgt_s8)
+
+#line 733
+SIMD_INTRIN_DEF(cmpge_s8)
+
+#line 733
+SIMD_INTRIN_DEF(cmplt_s8)
+
+#line 733
+SIMD_INTRIN_DEF(cmple_s8)
+
+#line 733
+SIMD_INTRIN_DEF(any_s8)
+
+#line 733
+SIMD_INTRIN_DEF(all_s8)
+
+
+#if 0
+SIMD_INTRIN_DEF(andc_s8)
+SIMD_INTRIN_DEF(andc_b8)
+SIMD_INTRIN_DEF(orc_b8)
+SIMD_INTRIN_DEF(xnor_b8)
+#endif
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_INTRIN_DEF(cvt_s8_b8)
+SIMD_INTRIN_DEF(cvt_b8_s8)
+#if 0
+SIMD_INTRIN_DEF(expand_s8_s8)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+#line 757
+SIMD_INTRIN_DEF(add_s8)
+
+#line 757
+SIMD_INTRIN_DEF(sub_s8)
+
+
+#if 1
+#line 764
+SIMD_INTRIN_DEF(adds_s8)
+
+#line 764
+SIMD_INTRIN_DEF(subs_s8)
+
+#endif // sat_sup
+
+#if 1
+SIMD_INTRIN_DEF(mul_s8)
+#endif // mul_sup
+
+#if 0
+SIMD_INTRIN_DEF(div_s8)
+#endif // div_sup
+
+#if 1
+SIMD_INTRIN_DEF(divisor_s8)
+SIMD_INTRIN_DEF(divc_s8)
+#endif // intdiv_sup
+
+#if 0
+#line 785
+SIMD_INTRIN_DEF(muladd_s8)
+
+#line 785
+SIMD_INTRIN_DEF(mulsub_s8)
+
+#line 785
+SIMD_INTRIN_DEF(nmuladd_s8)
+
+#line 785
+SIMD_INTRIN_DEF(nmulsub_s8)
+
+#line 785
+SIMD_INTRIN_DEF(muladdsub_s8)
+
+#endif // fused_sup
+
+#if 0
+SIMD_INTRIN_DEF(sum_s8)
+#endif // sum_sup
+
+#if 0
+SIMD_INTRIN_DEF(sumup_s8)
+#endif // sumup_sup
+/***************************
+ * Math
+ ***************************/
+#if 0
+#line 803
+SIMD_INTRIN_DEF(sqrt_s8)
+
+#line 803
+SIMD_INTRIN_DEF(recip_s8)
+
+#line 803
+SIMD_INTRIN_DEF(abs_s8)
+
+#line 803
+SIMD_INTRIN_DEF(square_s8)
+
+#line 803
+SIMD_INTRIN_DEF(rint_s8)
+
+#line 803
+SIMD_INTRIN_DEF(ceil_s8)
+
+#line 803
+SIMD_INTRIN_DEF(trunc_s8)
+
+#line 803
+SIMD_INTRIN_DEF(floor_s8)
+
+#endif
+
+#line 810
+SIMD_INTRIN_DEF(max_s8)
+SIMD_INTRIN_DEF(reduce_max_s8)
+
+#line 810
+SIMD_INTRIN_DEF(min_s8)
+SIMD_INTRIN_DEF(reduce_min_s8)
+
+
+#if 0
+#line 818
+SIMD_INTRIN_DEF(maxp_s8)
+SIMD_INTRIN_DEF(reduce_maxp_s8)
+
+#line 818
+SIMD_INTRIN_DEF(minp_s8)
+SIMD_INTRIN_DEF(reduce_minp_s8)
+
+#line 818
+SIMD_INTRIN_DEF(maxn_s8)
+SIMD_INTRIN_DEF(reduce_maxn_s8)
+
+#line 818
+SIMD_INTRIN_DEF(minn_s8)
+SIMD_INTRIN_DEF(reduce_minn_s8)
+
+/* end of maxp/minp/maxn/minn entries for s8 (stray "end repeat1" template marker) */
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+#line 830
+ SIMD_INTRIN_DEF(ifadd_s8)
+
+#line 830
+ SIMD_INTRIN_DEF(ifsub_s8)
+
+
+#if 0
+#line 837
+SIMD_INTRIN_DEF(ifdiv_s8)
+
+#line 837
+SIMD_INTRIN_DEF(ifdivz_s8)
+
+#endif
+
+#endif // simd_sup
+
+#line 630
+#if 1
+
+/***************************
+ * Memory
+ ***************************/
+#line 638
+SIMD_INTRIN_DEF(load_u16)
+
+#line 638
+SIMD_INTRIN_DEF(loada_u16)
+
+#line 638
+SIMD_INTRIN_DEF(loads_u16)
+
+#line 638
+SIMD_INTRIN_DEF(loadl_u16)
+
+#line 638
+SIMD_INTRIN_DEF(store_u16)
+
+#line 638
+SIMD_INTRIN_DEF(storea_u16)
+
+#line 638
+SIMD_INTRIN_DEF(stores_u16)
+
+#line 638
+SIMD_INTRIN_DEF(storel_u16)
+
+#line 638
+SIMD_INTRIN_DEF(storeh_u16)
+
+
+#line 644
+SIMD_INTRIN_DEF(load_u16x2)
+
+#line 644
+SIMD_INTRIN_DEF(store_u16x2)
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 0
+#line 655
+SIMD_INTRIN_DEF(load_till_u16)
+
+#line 655
+SIMD_INTRIN_DEF(load_tillz_u16)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_u16)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_till_u16)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_tillz_u16)
+
+#line 655
+SIMD_INTRIN_DEF(store_till_u16)
+
+#line 655
+SIMD_INTRIN_DEF(storen_u16)
+
+#line 655
+SIMD_INTRIN_DEF(storen_till_u16)
+
+#if 16 == 32
+    #line 662
+    SIMD_INTRIN_DEF(load2_till_u16)
+    
+#line 662
+    SIMD_INTRIN_DEF(load2_tillz_u16)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_u16)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_till_u16)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_tillz_u16)
+    
+#line 662
+    SIMD_INTRIN_DEF(store2_till_u16)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_u16)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_till_u16)
+    
+#else
+    #line 669
+    SIMD_INTRIN_DEF(load2_till_u16)
+    
+#line 669
+    SIMD_INTRIN_DEF(load2_tillz_u16)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_u16)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_till_u16)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_tillz_u16)
+    
+#line 669
+    SIMD_INTRIN_DEF(store2_till_u16)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_u16)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_till_u16)
+    
+#endif
+#endif // ncont_sup
+
+/****************************
+ * Lookup tables
+ ****************************/
+#if 16 == 32
+SIMD_INTRIN_DEF(lut32_u16)
+#endif
+#if 16 == 64
+SIMD_INTRIN_DEF(lut16_u16)
+#endif
+/***************************
+ * Misc
+ ***************************/
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u8_u16)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s8_u16)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u16_u16)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s16_u16)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u32_u16)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s32_u16)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u64_u16)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s64_u16)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F32
+SIMD_INTRIN_DEF(reinterpret_f32_u16)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F64
+SIMD_INTRIN_DEF(reinterpret_f64_u16)
+#endif // simd_sup2
+
+
+#line 698
+SIMD_INTRIN_DEF(set_u16)
+
+#line 698
+SIMD_INTRIN_DEF(setf_u16)
+
+#line 698
+SIMD_INTRIN_DEF(setall_u16)
+
+#line 698
+SIMD_INTRIN_DEF(zero_u16)
+
+#line 698
+SIMD_INTRIN_DEF(select_u16)
+
+#line 698
+SIMD_INTRIN_DEF(extract0_u16)
+
+
+/***************************
+ * Reorder
+ ***************************/
+#line 707
+SIMD_INTRIN_DEF(combinel_u16)
+
+#line 707
+SIMD_INTRIN_DEF(combineh_u16)
+
+#line 707
+SIMD_INTRIN_DEF(combine_u16)
+
+#line 707
+SIMD_INTRIN_DEF(zip_u16)
+
+#line 707
+SIMD_INTRIN_DEF(unzip_u16)
+
+
+#if 1
+SIMD_INTRIN_DEF(rev64_u16)
+#endif
+
+#if 16 > 16
+{ "permi128_u16", simd__intrin_permi128_u16_, METH_VARARGS, NULL },
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if 15 > 0
+#line 725
+SIMD_INTRIN_DEF(shl_u16)
+
+#line 725
+SIMD_INTRIN_DEF(shr_u16)
+
+#line 725
+SIMD_INTRIN_DEF(shli_u16)
+
+#line 725
+SIMD_INTRIN_DEF(shri_u16)
+
+#endif // shl_imm
+
+#line 733
+SIMD_INTRIN_DEF(and_u16)
+
+#line 733
+SIMD_INTRIN_DEF(or_u16)
+
+#line 733
+SIMD_INTRIN_DEF(xor_u16)
+
+#line 733
+SIMD_INTRIN_DEF(not_u16)
+
+#line 733
+SIMD_INTRIN_DEF(cmpeq_u16)
+
+#line 733
+SIMD_INTRIN_DEF(cmpneq_u16)
+
+#line 733
+SIMD_INTRIN_DEF(cmpgt_u16)
+
+#line 733
+SIMD_INTRIN_DEF(cmpge_u16)
+
+#line 733
+SIMD_INTRIN_DEF(cmplt_u16)
+
+#line 733
+SIMD_INTRIN_DEF(cmple_u16)
+
+#line 733
+SIMD_INTRIN_DEF(any_u16)
+
+#line 733
+SIMD_INTRIN_DEF(all_u16)
+
+
+#if 0
+SIMD_INTRIN_DEF(andc_u16)
+SIMD_INTRIN_DEF(andc_b16)
+SIMD_INTRIN_DEF(orc_b16)
+SIMD_INTRIN_DEF(xnor_b16)
+#endif
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_INTRIN_DEF(cvt_u16_b16)
+SIMD_INTRIN_DEF(cvt_b16_u16)
+#if 1
+SIMD_INTRIN_DEF(expand_u32_u16)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+#line 757
+SIMD_INTRIN_DEF(add_u16)
+
+#line 757
+SIMD_INTRIN_DEF(sub_u16)
+
+
+#if 1
+#line 764
+SIMD_INTRIN_DEF(adds_u16)
+
+#line 764
+SIMD_INTRIN_DEF(subs_u16)
+
+#endif // sat_sup
+
+#if 1
+SIMD_INTRIN_DEF(mul_u16)
+#endif // mul_sup
+
+#if 0
+SIMD_INTRIN_DEF(div_u16)
+#endif // div_sup
+
+#if 1
+SIMD_INTRIN_DEF(divisor_u16)
+SIMD_INTRIN_DEF(divc_u16)
+#endif // intdiv_sup
+
+#if 0
+#line 785
+SIMD_INTRIN_DEF(muladd_u16)
+
+#line 785
+SIMD_INTRIN_DEF(mulsub_u16)
+
+#line 785
+SIMD_INTRIN_DEF(nmuladd_u16)
+
+#line 785
+SIMD_INTRIN_DEF(nmulsub_u16)
+
+#line 785
+SIMD_INTRIN_DEF(muladdsub_u16)
+
+#endif // fused_sup
+
+#if 0
+SIMD_INTRIN_DEF(sum_u16)
+#endif // sum_sup
+
+#if 1
+SIMD_INTRIN_DEF(sumup_u16)
+#endif // sumup_sup
+/***************************
+ * Math
+ ***************************/
+#if 0
+#line 803
+SIMD_INTRIN_DEF(sqrt_u16)
+
+#line 803
+SIMD_INTRIN_DEF(recip_u16)
+
+#line 803
+SIMD_INTRIN_DEF(abs_u16)
+
+#line 803
+SIMD_INTRIN_DEF(square_u16)
+
+#line 803
+SIMD_INTRIN_DEF(rint_u16)
+
+#line 803
+SIMD_INTRIN_DEF(ceil_u16)
+
+#line 803
+SIMD_INTRIN_DEF(trunc_u16)
+
+#line 803
+SIMD_INTRIN_DEF(floor_u16)
+
+#endif
+
+#line 810
+SIMD_INTRIN_DEF(max_u16)
+SIMD_INTRIN_DEF(reduce_max_u16)
+
+#line 810
+SIMD_INTRIN_DEF(min_u16)
+SIMD_INTRIN_DEF(reduce_min_u16)
+
+
+#if 0
+#line 818
+SIMD_INTRIN_DEF(maxp_u16)
+SIMD_INTRIN_DEF(reduce_maxp_u16)
+
+#line 818
+SIMD_INTRIN_DEF(minp_u16)
+SIMD_INTRIN_DEF(reduce_minp_u16)
+
+#line 818
+SIMD_INTRIN_DEF(maxn_u16)
+SIMD_INTRIN_DEF(reduce_maxn_u16)
+
+#line 818
+SIMD_INTRIN_DEF(minn_u16)
+SIMD_INTRIN_DEF(reduce_minn_u16)
+
+/* end of maxp/minp/maxn/minn entries for u16 (stray "end repeat1" template marker) */
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+#line 830
+ SIMD_INTRIN_DEF(ifadd_u16)
+
+#line 830
+ SIMD_INTRIN_DEF(ifsub_u16)
+
+
+#if 0
+#line 837
+SIMD_INTRIN_DEF(ifdiv_u16)
+
+#line 837
+SIMD_INTRIN_DEF(ifdivz_u16)
+
+#endif
+
+#endif // simd_sup
+
+#line 630
+#if 1
+
+/***************************
+ * Memory
+ ***************************/
+#line 638
+SIMD_INTRIN_DEF(load_s16)
+
+#line 638
+SIMD_INTRIN_DEF(loada_s16)
+
+#line 638
+SIMD_INTRIN_DEF(loads_s16)
+
+#line 638
+SIMD_INTRIN_DEF(loadl_s16)
+
+#line 638
+SIMD_INTRIN_DEF(store_s16)
+
+#line 638
+SIMD_INTRIN_DEF(storea_s16)
+
+#line 638
+SIMD_INTRIN_DEF(stores_s16)
+
+#line 638
+SIMD_INTRIN_DEF(storel_s16)
+
+#line 638
+SIMD_INTRIN_DEF(storeh_s16)
+
+
+#line 644
+SIMD_INTRIN_DEF(load_s16x2)
+
+#line 644
+SIMD_INTRIN_DEF(store_s16x2)
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 0
+#line 655
+SIMD_INTRIN_DEF(load_till_s16)
+
+#line 655
+SIMD_INTRIN_DEF(load_tillz_s16)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_s16)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_till_s16)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_tillz_s16)
+
+#line 655
+SIMD_INTRIN_DEF(store_till_s16)
+
+#line 655
+SIMD_INTRIN_DEF(storen_s16)
+
+#line 655
+SIMD_INTRIN_DEF(storen_till_s16)
+
+#if 16 == 32
+    #line 662
+    SIMD_INTRIN_DEF(load2_till_s16)
+    
+#line 662
+    SIMD_INTRIN_DEF(load2_tillz_s16)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_s16)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_till_s16)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_tillz_s16)
+    
+#line 662
+    SIMD_INTRIN_DEF(store2_till_s16)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_s16)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_till_s16)
+    
+#else
+    #line 669
+    SIMD_INTRIN_DEF(load2_till_s16)
+    
+#line 669
+    SIMD_INTRIN_DEF(load2_tillz_s16)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_s16)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_till_s16)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_tillz_s16)
+    
+#line 669
+    SIMD_INTRIN_DEF(store2_till_s16)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_s16)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_till_s16)
+    
+#endif
+#endif // ncont_sup
+
+/****************************
+ * Lookup tables
+ ****************************/
+#if 16 == 32
+SIMD_INTRIN_DEF(lut32_s16)
+#endif
+#if 16 == 64
+SIMD_INTRIN_DEF(lut16_s16)
+#endif
+/***************************
+ * Misc
+ ***************************/
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u8_s16)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s8_s16)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u16_s16)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s16_s16)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u32_s16)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s32_s16)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u64_s16)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s64_s16)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F32
+SIMD_INTRIN_DEF(reinterpret_f32_s16)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F64
+SIMD_INTRIN_DEF(reinterpret_f64_s16)
+#endif // simd_sup2
+
+
+#line 698
+SIMD_INTRIN_DEF(set_s16)
+
+#line 698
+SIMD_INTRIN_DEF(setf_s16)
+
+#line 698
+SIMD_INTRIN_DEF(setall_s16)
+
+#line 698
+SIMD_INTRIN_DEF(zero_s16)
+
+#line 698
+SIMD_INTRIN_DEF(select_s16)
+
+#line 698
+SIMD_INTRIN_DEF(extract0_s16)
+
+
+/***************************
+ * Reorder
+ ***************************/
+#line 707
+SIMD_INTRIN_DEF(combinel_s16)
+
+#line 707
+SIMD_INTRIN_DEF(combineh_s16)
+
+#line 707
+SIMD_INTRIN_DEF(combine_s16)
+
+#line 707
+SIMD_INTRIN_DEF(zip_s16)
+
+#line 707
+SIMD_INTRIN_DEF(unzip_s16)
+
+
+#if 1
+SIMD_INTRIN_DEF(rev64_s16)
+#endif
+
+#if 16 > 16
+{ "permi128_s16", simd__intrin_permi128_s16_, METH_VARARGS, NULL },
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if 15 > 0
+#line 725
+SIMD_INTRIN_DEF(shl_s16)
+
+#line 725
+SIMD_INTRIN_DEF(shr_s16)
+
+#line 725
+SIMD_INTRIN_DEF(shli_s16)
+
+#line 725
+SIMD_INTRIN_DEF(shri_s16)
+
+#endif // shl_imm
+
+#line 733
+SIMD_INTRIN_DEF(and_s16)
+
+#line 733
+SIMD_INTRIN_DEF(or_s16)
+
+#line 733
+SIMD_INTRIN_DEF(xor_s16)
+
+#line 733
+SIMD_INTRIN_DEF(not_s16)
+
+#line 733
+SIMD_INTRIN_DEF(cmpeq_s16)
+
+#line 733
+SIMD_INTRIN_DEF(cmpneq_s16)
+
+#line 733
+SIMD_INTRIN_DEF(cmpgt_s16)
+
+#line 733
+SIMD_INTRIN_DEF(cmpge_s16)
+
+#line 733
+SIMD_INTRIN_DEF(cmplt_s16)
+
+#line 733
+SIMD_INTRIN_DEF(cmple_s16)
+
+#line 733
+SIMD_INTRIN_DEF(any_s16)
+
+#line 733
+SIMD_INTRIN_DEF(all_s16)
+
+
+#if 0
+SIMD_INTRIN_DEF(andc_s16)
+SIMD_INTRIN_DEF(andc_b16)
+SIMD_INTRIN_DEF(orc_b16)
+SIMD_INTRIN_DEF(xnor_b16)
+#endif
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_INTRIN_DEF(cvt_s16_b16)
+SIMD_INTRIN_DEF(cvt_b16_s16)
+#if 0
+SIMD_INTRIN_DEF(expand_s16_s16)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+#line 757
+SIMD_INTRIN_DEF(add_s16)
+
+#line 757
+SIMD_INTRIN_DEF(sub_s16)
+
+
+#if 1
+#line 764
+SIMD_INTRIN_DEF(adds_s16)
+
+#line 764
+SIMD_INTRIN_DEF(subs_s16)
+
+#endif // sat_sup
+
+#if 1
+SIMD_INTRIN_DEF(mul_s16)
+#endif // mul_sup
+
+#if 0
+SIMD_INTRIN_DEF(div_s16)
+#endif // div_sup
+
+#if 1
+SIMD_INTRIN_DEF(divisor_s16)
+SIMD_INTRIN_DEF(divc_s16)
+#endif // intdiv_sup
+
+#if 0
+#line 785
+SIMD_INTRIN_DEF(muladd_s16)
+
+#line 785
+SIMD_INTRIN_DEF(mulsub_s16)
+
+#line 785
+SIMD_INTRIN_DEF(nmuladd_s16)
+
+#line 785
+SIMD_INTRIN_DEF(nmulsub_s16)
+
+#line 785
+SIMD_INTRIN_DEF(muladdsub_s16)
+
+#endif // fused_sup
+
+#if 0
+SIMD_INTRIN_DEF(sum_s16)
+#endif // sum_sup
+
+#if 0
+SIMD_INTRIN_DEF(sumup_s16)
+#endif // sumup_sup
+/***************************
+ * Math
+ ***************************/
+#if 0
+#line 803
+SIMD_INTRIN_DEF(sqrt_s16)
+
+#line 803
+SIMD_INTRIN_DEF(recip_s16)
+
+#line 803
+SIMD_INTRIN_DEF(abs_s16)
+
+#line 803
+SIMD_INTRIN_DEF(square_s16)
+
+#line 803
+SIMD_INTRIN_DEF(rint_s16)
+
+#line 803
+SIMD_INTRIN_DEF(ceil_s16)
+
+#line 803
+SIMD_INTRIN_DEF(trunc_s16)
+
+#line 803
+SIMD_INTRIN_DEF(floor_s16)
+
+#endif
+
+#line 810
+SIMD_INTRIN_DEF(max_s16)
+SIMD_INTRIN_DEF(reduce_max_s16)
+
+#line 810
+SIMD_INTRIN_DEF(min_s16)
+SIMD_INTRIN_DEF(reduce_min_s16)
+
+
+#if 0
+#line 818
+SIMD_INTRIN_DEF(maxp_s16)
+SIMD_INTRIN_DEF(reduce_maxp_s16)
+
+#line 818
+SIMD_INTRIN_DEF(minp_s16)
+SIMD_INTRIN_DEF(reduce_minp_s16)
+
+#line 818
+SIMD_INTRIN_DEF(maxn_s16)
+SIMD_INTRIN_DEF(reduce_maxn_s16)
+
+#line 818
+SIMD_INTRIN_DEF(minn_s16)
+SIMD_INTRIN_DEF(reduce_minn_s16)
+
+/* end of maxp/minp/maxn/minn entries for s16 (stray "end repeat1" template marker) */
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+#line 830
+ SIMD_INTRIN_DEF(ifadd_s16)
+
+#line 830
+ SIMD_INTRIN_DEF(ifsub_s16)
+
+
+#if 0
+#line 837
+SIMD_INTRIN_DEF(ifdiv_s16)
+
+#line 837
+SIMD_INTRIN_DEF(ifdivz_s16)
+
+#endif
+
+#endif // simd_sup
+
+#line 630
+#if 1
+
+/***************************
+ * Memory
+ ***************************/
+#line 638
+SIMD_INTRIN_DEF(load_u32)
+
+#line 638
+SIMD_INTRIN_DEF(loada_u32)
+
+#line 638
+SIMD_INTRIN_DEF(loads_u32)
+
+#line 638
+SIMD_INTRIN_DEF(loadl_u32)
+
+#line 638
+SIMD_INTRIN_DEF(store_u32)
+
+#line 638
+SIMD_INTRIN_DEF(storea_u32)
+
+#line 638
+SIMD_INTRIN_DEF(stores_u32)
+
+#line 638
+SIMD_INTRIN_DEF(storel_u32)
+
+#line 638
+SIMD_INTRIN_DEF(storeh_u32)
+
+
+#line 644
+SIMD_INTRIN_DEF(load_u32x2)
+
+#line 644
+SIMD_INTRIN_DEF(store_u32x2)
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 1
+#line 655
+SIMD_INTRIN_DEF(load_till_u32)
+
+#line 655
+SIMD_INTRIN_DEF(load_tillz_u32)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_u32)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_till_u32)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_tillz_u32)
+
+#line 655
+SIMD_INTRIN_DEF(store_till_u32)
+
+#line 655
+SIMD_INTRIN_DEF(storen_u32)
+
+#line 655
+SIMD_INTRIN_DEF(storen_till_u32)
+
+#if 32 == 32
+    #line 662
+    SIMD_INTRIN_DEF(load2_till_u32)
+    
+#line 662
+    SIMD_INTRIN_DEF(load2_tillz_u32)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_u32)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_till_u32)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_tillz_u32)
+    
+#line 662
+    SIMD_INTRIN_DEF(store2_till_u32)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_u32)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_till_u32)
+    
+#else
+    #line 669
+    SIMD_INTRIN_DEF(load2_till_u32)
+    
+#line 669
+    SIMD_INTRIN_DEF(load2_tillz_u32)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_u32)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_till_u32)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_tillz_u32)
+    
+#line 669
+    SIMD_INTRIN_DEF(store2_till_u32)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_u32)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_till_u32)
+    
+#endif
+#endif // ncont_sup
+
+/****************************
+ * Lookup tables
+ ****************************/
+#if 32 == 32
+SIMD_INTRIN_DEF(lut32_u32)
+#endif
+#if 32 == 64
+SIMD_INTRIN_DEF(lut16_u32)
+#endif
+/***************************
+ * Misc
+ ***************************/
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u8_u32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s8_u32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u16_u32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s16_u32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u32_u32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s32_u32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u64_u32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s64_u32)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F32
+SIMD_INTRIN_DEF(reinterpret_f32_u32)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F64
+SIMD_INTRIN_DEF(reinterpret_f64_u32)
+#endif // simd_sup2
+
+
+#line 698
+SIMD_INTRIN_DEF(set_u32)
+
+#line 698
+SIMD_INTRIN_DEF(setf_u32)
+
+#line 698
+SIMD_INTRIN_DEF(setall_u32)
+
+#line 698
+SIMD_INTRIN_DEF(zero_u32)
+
+#line 698
+SIMD_INTRIN_DEF(select_u32)
+
+#line 698
+SIMD_INTRIN_DEF(extract0_u32)
+
+
+/***************************
+ * Reorder
+ ***************************/
+#line 707
+SIMD_INTRIN_DEF(combinel_u32)
+
+#line 707
+SIMD_INTRIN_DEF(combineh_u32)
+
+#line 707
+SIMD_INTRIN_DEF(combine_u32)
+
+#line 707
+SIMD_INTRIN_DEF(zip_u32)
+
+#line 707
+SIMD_INTRIN_DEF(unzip_u32)
+
+
+#if 1
+SIMD_INTRIN_DEF(rev64_u32)
+#endif
+
+#if 32 > 16
+{ "permi128_u32", simd__intrin_permi128_u32_, METH_VARARGS, NULL },
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if 31 > 0
+#line 725
+SIMD_INTRIN_DEF(shl_u32)
+
+#line 725
+SIMD_INTRIN_DEF(shr_u32)
+
+#line 725
+SIMD_INTRIN_DEF(shli_u32)
+
+#line 725
+SIMD_INTRIN_DEF(shri_u32)
+
+#endif // shl_imm
+
+#line 733
+SIMD_INTRIN_DEF(and_u32)
+
+#line 733
+SIMD_INTRIN_DEF(or_u32)
+
+#line 733
+SIMD_INTRIN_DEF(xor_u32)
+
+#line 733
+SIMD_INTRIN_DEF(not_u32)
+
+#line 733
+SIMD_INTRIN_DEF(cmpeq_u32)
+
+#line 733
+SIMD_INTRIN_DEF(cmpneq_u32)
+
+#line 733
+SIMD_INTRIN_DEF(cmpgt_u32)
+
+#line 733
+SIMD_INTRIN_DEF(cmpge_u32)
+
+#line 733
+SIMD_INTRIN_DEF(cmplt_u32)
+
+#line 733
+SIMD_INTRIN_DEF(cmple_u32)
+
+#line 733
+SIMD_INTRIN_DEF(any_u32)
+
+#line 733
+SIMD_INTRIN_DEF(all_u32)
+
+
+#if 0
+SIMD_INTRIN_DEF(andc_u32)
+SIMD_INTRIN_DEF(andc_b32)
+SIMD_INTRIN_DEF(orc_b32)
+SIMD_INTRIN_DEF(xnor_b32)
+#endif
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_INTRIN_DEF(cvt_u32_b32)
+SIMD_INTRIN_DEF(cvt_b32_u32)
+#if 0
+SIMD_INTRIN_DEF(expand_u32_u32)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+#line 757
+SIMD_INTRIN_DEF(add_u32)
+
+#line 757
+SIMD_INTRIN_DEF(sub_u32)
+
+
+#if 0
+#line 764
+SIMD_INTRIN_DEF(adds_u32)
+
+#line 764
+SIMD_INTRIN_DEF(subs_u32)
+
+#endif // sat_sup
+
+#if 1
+SIMD_INTRIN_DEF(mul_u32)
+#endif // mul_sup
+
+#if 0
+SIMD_INTRIN_DEF(div_u32)
+#endif // div_sup
+
+#if 1
+SIMD_INTRIN_DEF(divisor_u32)
+SIMD_INTRIN_DEF(divc_u32)
+#endif // intdiv_sup
+
+#if 0
+#line 785
+SIMD_INTRIN_DEF(muladd_u32)
+
+#line 785
+SIMD_INTRIN_DEF(mulsub_u32)
+
+#line 785
+SIMD_INTRIN_DEF(nmuladd_u32)
+
+#line 785
+SIMD_INTRIN_DEF(nmulsub_u32)
+
+#line 785
+SIMD_INTRIN_DEF(muladdsub_u32)
+
+#endif // fused_sup
+
+#if 1
+SIMD_INTRIN_DEF(sum_u32)
+#endif // sum_sup
+
+#if 0
+SIMD_INTRIN_DEF(sumup_u32)
+#endif // sumup_sup
+/***************************
+ * Math
+ ***************************/
+#if 0
+#line 803
+SIMD_INTRIN_DEF(sqrt_u32)
+
+#line 803
+SIMD_INTRIN_DEF(recip_u32)
+
+#line 803
+SIMD_INTRIN_DEF(abs_u32)
+
+#line 803
+SIMD_INTRIN_DEF(square_u32)
+
+#line 803
+SIMD_INTRIN_DEF(rint_u32)
+
+#line 803
+SIMD_INTRIN_DEF(ceil_u32)
+
+#line 803
+SIMD_INTRIN_DEF(trunc_u32)
+
+#line 803
+SIMD_INTRIN_DEF(floor_u32)
+
+#endif
+
+#line 810
+SIMD_INTRIN_DEF(max_u32)
+SIMD_INTRIN_DEF(reduce_max_u32)
+
+#line 810
+SIMD_INTRIN_DEF(min_u32)
+SIMD_INTRIN_DEF(reduce_min_u32)
+
+
+#if 0
+#line 818
+SIMD_INTRIN_DEF(maxp_u32)
+SIMD_INTRIN_DEF(reduce_maxp_u32)
+
+#line 818
+SIMD_INTRIN_DEF(minp_u32)
+SIMD_INTRIN_DEF(reduce_minp_u32)
+
+#line 818
+SIMD_INTRIN_DEF(maxn_u32)
+SIMD_INTRIN_DEF(reduce_maxn_u32)
+
+#line 818
+SIMD_INTRIN_DEF(minn_u32)
+SIMD_INTRIN_DEF(reduce_minn_u32)
+
+/* end of maxp/minp/maxn/minn entries for u32 (stray "end repeat1" template marker) */
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+#line 830
+ SIMD_INTRIN_DEF(ifadd_u32)
+
+#line 830
+ SIMD_INTRIN_DEF(ifsub_u32)
+
+
+#if 0
+#line 837
+SIMD_INTRIN_DEF(ifdiv_u32)
+
+#line 837
+SIMD_INTRIN_DEF(ifdivz_u32)
+
+#endif
+
+#endif // simd_sup
+
+#line 630
+#if 1
+
+/***************************
+ * Memory
+ ***************************/
+#line 638
+SIMD_INTRIN_DEF(load_s32)
+
+#line 638
+SIMD_INTRIN_DEF(loada_s32)
+
+#line 638
+SIMD_INTRIN_DEF(loads_s32)
+
+#line 638
+SIMD_INTRIN_DEF(loadl_s32)
+
+#line 638
+SIMD_INTRIN_DEF(store_s32)
+
+#line 638
+SIMD_INTRIN_DEF(storea_s32)
+
+#line 638
+SIMD_INTRIN_DEF(stores_s32)
+
+#line 638
+SIMD_INTRIN_DEF(storel_s32)
+
+#line 638
+SIMD_INTRIN_DEF(storeh_s32)
+
+
+#line 644
+SIMD_INTRIN_DEF(load_s32x2)
+
+#line 644
+SIMD_INTRIN_DEF(store_s32x2)
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 1
+#line 655
+SIMD_INTRIN_DEF(load_till_s32)
+
+#line 655
+SIMD_INTRIN_DEF(load_tillz_s32)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_s32)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_till_s32)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_tillz_s32)
+
+#line 655
+SIMD_INTRIN_DEF(store_till_s32)
+
+#line 655
+SIMD_INTRIN_DEF(storen_s32)
+
+#line 655
+SIMD_INTRIN_DEF(storen_till_s32)
+
+#if 32 == 32
+    #line 662
+    SIMD_INTRIN_DEF(load2_till_s32)
+    
+#line 662
+    SIMD_INTRIN_DEF(load2_tillz_s32)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_s32)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_till_s32)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_tillz_s32)
+    
+#line 662
+    SIMD_INTRIN_DEF(store2_till_s32)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_s32)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_till_s32)
+    
+#else
+    #line 669
+    SIMD_INTRIN_DEF(load2_till_s32)
+    
+#line 669
+    SIMD_INTRIN_DEF(load2_tillz_s32)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_s32)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_till_s32)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_tillz_s32)
+    
+#line 669
+    SIMD_INTRIN_DEF(store2_till_s32)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_s32)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_till_s32)
+    
+#endif
+#endif // ncont_sup
+
+/****************************
+ * Lookup tables
+ ****************************/
+#if 32 == 32
+SIMD_INTRIN_DEF(lut32_s32)
+#endif
+#if 32 == 64
+SIMD_INTRIN_DEF(lut16_s32)
+#endif
+/***************************
+ * Misc
+ ***************************/
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u8_s32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s8_s32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u16_s32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s16_s32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u32_s32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s32_s32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u64_s32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s64_s32)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F32
+SIMD_INTRIN_DEF(reinterpret_f32_s32)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F64
+SIMD_INTRIN_DEF(reinterpret_f64_s32)
+#endif // simd_sup2
+
+
+#line 698
+SIMD_INTRIN_DEF(set_s32)
+
+#line 698
+SIMD_INTRIN_DEF(setf_s32)
+
+#line 698
+SIMD_INTRIN_DEF(setall_s32)
+
+#line 698
+SIMD_INTRIN_DEF(zero_s32)
+
+#line 698
+SIMD_INTRIN_DEF(select_s32)
+
+#line 698
+SIMD_INTRIN_DEF(extract0_s32)
+
+
+/***************************
+ * Reorder
+ ***************************/
+#line 707
+SIMD_INTRIN_DEF(combinel_s32)
+
+#line 707
+SIMD_INTRIN_DEF(combineh_s32)
+
+#line 707
+SIMD_INTRIN_DEF(combine_s32)
+
+#line 707
+SIMD_INTRIN_DEF(zip_s32)
+
+#line 707
+SIMD_INTRIN_DEF(unzip_s32)
+
+
+#if 1
+SIMD_INTRIN_DEF(rev64_s32)
+#endif
+
+#if 32 > 16
+{ "permi128_s32", simd__intrin_permi128_s32_, METH_VARARGS, NULL },
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if 31 > 0
+#line 725
+SIMD_INTRIN_DEF(shl_s32)
+
+#line 725
+SIMD_INTRIN_DEF(shr_s32)
+
+#line 725
+SIMD_INTRIN_DEF(shli_s32)
+
+#line 725
+SIMD_INTRIN_DEF(shri_s32)
+
+#endif // shl_imm
+
+#line 733
+SIMD_INTRIN_DEF(and_s32)
+
+#line 733
+SIMD_INTRIN_DEF(or_s32)
+
+#line 733
+SIMD_INTRIN_DEF(xor_s32)
+
+#line 733
+SIMD_INTRIN_DEF(not_s32)
+
+#line 733
+SIMD_INTRIN_DEF(cmpeq_s32)
+
+#line 733
+SIMD_INTRIN_DEF(cmpneq_s32)
+
+#line 733
+SIMD_INTRIN_DEF(cmpgt_s32)
+
+#line 733
+SIMD_INTRIN_DEF(cmpge_s32)
+
+#line 733
+SIMD_INTRIN_DEF(cmplt_s32)
+
+#line 733
+SIMD_INTRIN_DEF(cmple_s32)
+
+#line 733
+SIMD_INTRIN_DEF(any_s32)
+
+#line 733
+SIMD_INTRIN_DEF(all_s32)
+
+
+#if 0
+SIMD_INTRIN_DEF(andc_s32)
+SIMD_INTRIN_DEF(andc_b32)
+SIMD_INTRIN_DEF(orc_b32)
+SIMD_INTRIN_DEF(xnor_b32)
+#endif
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_INTRIN_DEF(cvt_s32_b32)
+SIMD_INTRIN_DEF(cvt_b32_s32)
+#if 0
+SIMD_INTRIN_DEF(expand_s32_s32)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+#line 757
+SIMD_INTRIN_DEF(add_s32)
+
+#line 757
+SIMD_INTRIN_DEF(sub_s32)
+
+
+#if 0
+#line 764
+SIMD_INTRIN_DEF(adds_s32)
+
+#line 764
+SIMD_INTRIN_DEF(subs_s32)
+
+#endif // sat_sup
+
+#if 1
+SIMD_INTRIN_DEF(mul_s32)
+#endif // mul_sup
+
+#if 0
+SIMD_INTRIN_DEF(div_s32)
+#endif // div_sup
+
+#if 1
+SIMD_INTRIN_DEF(divisor_s32)
+SIMD_INTRIN_DEF(divc_s32)
+#endif // intdiv_sup
+
+#if 0
+#line 785
+SIMD_INTRIN_DEF(muladd_s32)
+
+#line 785
+SIMD_INTRIN_DEF(mulsub_s32)
+
+#line 785
+SIMD_INTRIN_DEF(nmuladd_s32)
+
+#line 785
+SIMD_INTRIN_DEF(nmulsub_s32)
+
+#line 785
+SIMD_INTRIN_DEF(muladdsub_s32)
+
+#endif // fused_sup
+
+#if 0
+SIMD_INTRIN_DEF(sum_s32)
+#endif // sum_sup
+
+#if 0
+SIMD_INTRIN_DEF(sumup_s32)
+#endif // sumup_sup
+/***************************
+ * Math
+ ***************************/
+#if 0
+#line 803
+SIMD_INTRIN_DEF(sqrt_s32)
+
+#line 803
+SIMD_INTRIN_DEF(recip_s32)
+
+#line 803
+SIMD_INTRIN_DEF(abs_s32)
+
+#line 803
+SIMD_INTRIN_DEF(square_s32)
+
+#line 803
+SIMD_INTRIN_DEF(rint_s32)
+
+#line 803
+SIMD_INTRIN_DEF(ceil_s32)
+
+#line 803
+SIMD_INTRIN_DEF(trunc_s32)
+
+#line 803
+SIMD_INTRIN_DEF(floor_s32)
+
+#endif
+
+#line 810
+SIMD_INTRIN_DEF(max_s32)
+SIMD_INTRIN_DEF(reduce_max_s32)
+
+#line 810
+SIMD_INTRIN_DEF(min_s32)
+SIMD_INTRIN_DEF(reduce_min_s32)
+
+
+#if 0
+#line 818
+SIMD_INTRIN_DEF(maxp_s32)
+SIMD_INTRIN_DEF(reduce_maxp_s32)
+
+#line 818
+SIMD_INTRIN_DEF(minp_s32)
+SIMD_INTRIN_DEF(reduce_minp_s32)
+
+#line 818
+SIMD_INTRIN_DEF(maxn_s32)
+SIMD_INTRIN_DEF(reduce_maxn_s32)
+
+#line 818
+SIMD_INTRIN_DEF(minn_s32)
+SIMD_INTRIN_DEF(reduce_minn_s32)
+
+/* end of maxp/minp/maxn/minn entries for s32 (stray "end repeat1" template marker) */
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+#line 830
+ SIMD_INTRIN_DEF(ifadd_s32)
+
+#line 830
+ SIMD_INTRIN_DEF(ifsub_s32)
+
+
+#if 0
+#line 837
+SIMD_INTRIN_DEF(ifdiv_s32)
+
+#line 837
+SIMD_INTRIN_DEF(ifdivz_s32)
+
+#endif
+
+#endif // simd_sup
+
+#line 630
+#if 1
+
+/***************************
+ * Memory
+ ***************************/
+#line 638
+SIMD_INTRIN_DEF(load_u64)
+
+#line 638
+SIMD_INTRIN_DEF(loada_u64)
+
+#line 638
+SIMD_INTRIN_DEF(loads_u64)
+
+#line 638
+SIMD_INTRIN_DEF(loadl_u64)
+
+#line 638
+SIMD_INTRIN_DEF(store_u64)
+
+#line 638
+SIMD_INTRIN_DEF(storea_u64)
+
+#line 638
+SIMD_INTRIN_DEF(stores_u64)
+
+#line 638
+SIMD_INTRIN_DEF(storel_u64)
+
+#line 638
+SIMD_INTRIN_DEF(storeh_u64)
+
+
+#line 644
+SIMD_INTRIN_DEF(load_u64x2)
+
+#line 644
+SIMD_INTRIN_DEF(store_u64x2)
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 1
+#line 655
+SIMD_INTRIN_DEF(load_till_u64)
+
+#line 655
+SIMD_INTRIN_DEF(load_tillz_u64)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_u64)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_till_u64)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_tillz_u64)
+
+#line 655
+SIMD_INTRIN_DEF(store_till_u64)
+
+#line 655
+SIMD_INTRIN_DEF(storen_u64)
+
+#line 655
+SIMD_INTRIN_DEF(storen_till_u64)
+
+#if 64 == 32
+    #line 662
+    SIMD_INTRIN_DEF(load2_till_u64)
+    
+#line 662
+    SIMD_INTRIN_DEF(load2_tillz_u64)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_u64)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_till_u64)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_tillz_u64)
+    
+#line 662
+    SIMD_INTRIN_DEF(store2_till_u64)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_u64)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_till_u64)
+    
+#else
+    #line 669
+    SIMD_INTRIN_DEF(load2_till_u64)
+    
+#line 669
+    SIMD_INTRIN_DEF(load2_tillz_u64)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_u64)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_till_u64)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_tillz_u64)
+    
+#line 669
+    SIMD_INTRIN_DEF(store2_till_u64)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_u64)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_till_u64)
+    
+#endif
+#endif // ncont_sup
+
+/****************************
+ * Lookup tables
+ ****************************/
+#if 64 == 32
+SIMD_INTRIN_DEF(lut32_u64)
+#endif
+#if 64 == 64
+SIMD_INTRIN_DEF(lut16_u64)
+#endif
+/***************************
+ * Misc
+ ***************************/
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u8_u64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s8_u64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u16_u64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s16_u64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u32_u64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s32_u64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u64_u64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s64_u64)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F32
+SIMD_INTRIN_DEF(reinterpret_f32_u64)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F64
+SIMD_INTRIN_DEF(reinterpret_f64_u64)
+#endif // simd_sup2
+
+
+#line 698
+SIMD_INTRIN_DEF(set_u64)
+
+#line 698
+SIMD_INTRIN_DEF(setf_u64)
+
+#line 698
+SIMD_INTRIN_DEF(setall_u64)
+
+#line 698
+SIMD_INTRIN_DEF(zero_u64)
+
+#line 698
+SIMD_INTRIN_DEF(select_u64)
+
+#line 698
+SIMD_INTRIN_DEF(extract0_u64)
+
+
+/***************************
+ * Reorder
+ ***************************/
+#line 707
+SIMD_INTRIN_DEF(combinel_u64)
+
+#line 707
+SIMD_INTRIN_DEF(combineh_u64)
+
+#line 707
+SIMD_INTRIN_DEF(combine_u64)
+
+#line 707
+SIMD_INTRIN_DEF(zip_u64)
+
+#line 707
+SIMD_INTRIN_DEF(unzip_u64)
+
+
+#if 0
+SIMD_INTRIN_DEF(rev64_u64)
+#endif
+
+#if 64 > 16
+{ "permi128_u64", simd__intrin_permi128_u64_, METH_VARARGS, NULL },
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if 63 > 0
+#line 725
+SIMD_INTRIN_DEF(shl_u64)
+
+#line 725
+SIMD_INTRIN_DEF(shr_u64)
+
+#line 725
+SIMD_INTRIN_DEF(shli_u64)
+
+#line 725
+SIMD_INTRIN_DEF(shri_u64)
+
+#endif // shl_imm
+
+#line 733
+SIMD_INTRIN_DEF(and_u64)
+
+#line 733
+SIMD_INTRIN_DEF(or_u64)
+
+#line 733
+SIMD_INTRIN_DEF(xor_u64)
+
+#line 733
+SIMD_INTRIN_DEF(not_u64)
+
+#line 733
+SIMD_INTRIN_DEF(cmpeq_u64)
+
+#line 733
+SIMD_INTRIN_DEF(cmpneq_u64)
+
+#line 733
+SIMD_INTRIN_DEF(cmpgt_u64)
+
+#line 733
+SIMD_INTRIN_DEF(cmpge_u64)
+
+#line 733
+SIMD_INTRIN_DEF(cmplt_u64)
+
+#line 733
+SIMD_INTRIN_DEF(cmple_u64)
+
+#line 733
+SIMD_INTRIN_DEF(any_u64)
+
+#line 733
+SIMD_INTRIN_DEF(all_u64)
+
+
+#if 0
+SIMD_INTRIN_DEF(andc_u64)
+SIMD_INTRIN_DEF(andc_b64)
+SIMD_INTRIN_DEF(orc_b64)
+SIMD_INTRIN_DEF(xnor_b64)
+#endif
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_INTRIN_DEF(cvt_u64_b64)
+SIMD_INTRIN_DEF(cvt_b64_u64)
+#if 0
+SIMD_INTRIN_DEF(expand_u64_u64)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+#line 757
+SIMD_INTRIN_DEF(add_u64)
+
+#line 757
+SIMD_INTRIN_DEF(sub_u64)
+
+
+#if 0
+#line 764
+SIMD_INTRIN_DEF(adds_u64)
+
+#line 764
+SIMD_INTRIN_DEF(subs_u64)
+
+#endif // sat_sup
+
+#if 0
+SIMD_INTRIN_DEF(mul_u64)
+#endif // mul_sup
+
+#if 0
+SIMD_INTRIN_DEF(div_u64)
+#endif // div_sup
+
+#if 1
+SIMD_INTRIN_DEF(divisor_u64)
+SIMD_INTRIN_DEF(divc_u64)
+#endif // intdiv_sup
+
+#if 0
+#line 785
+SIMD_INTRIN_DEF(muladd_u64)
+
+#line 785
+SIMD_INTRIN_DEF(mulsub_u64)
+
+#line 785
+SIMD_INTRIN_DEF(nmuladd_u64)
+
+#line 785
+SIMD_INTRIN_DEF(nmulsub_u64)
+
+#line 785
+SIMD_INTRIN_DEF(muladdsub_u64)
+
+#endif // fused_sup
+
+#if 1
+SIMD_INTRIN_DEF(sum_u64)
+#endif // sum_sup
+
+#if 0
+SIMD_INTRIN_DEF(sumup_u64)
+#endif // sumup_sup
+/***************************
+ * Math
+ ***************************/
+#if 0
+#line 803
+SIMD_INTRIN_DEF(sqrt_u64)
+
+#line 803
+SIMD_INTRIN_DEF(recip_u64)
+
+#line 803
+SIMD_INTRIN_DEF(abs_u64)
+
+#line 803
+SIMD_INTRIN_DEF(square_u64)
+
+#line 803
+SIMD_INTRIN_DEF(rint_u64)
+
+#line 803
+SIMD_INTRIN_DEF(ceil_u64)
+
+#line 803
+SIMD_INTRIN_DEF(trunc_u64)
+
+#line 803
+SIMD_INTRIN_DEF(floor_u64)
+
+#endif
+
+#line 810
+SIMD_INTRIN_DEF(max_u64)
+SIMD_INTRIN_DEF(reduce_max_u64)
+
+#line 810
+SIMD_INTRIN_DEF(min_u64)
+SIMD_INTRIN_DEF(reduce_min_u64)
+
+
+#if 0
+#line 818
+SIMD_INTRIN_DEF(maxp_u64)
+SIMD_INTRIN_DEF(reduce_maxp_u64)
+
+#line 818
+SIMD_INTRIN_DEF(minp_u64)
+SIMD_INTRIN_DEF(reduce_minp_u64)
+
+#line 818
+SIMD_INTRIN_DEF(maxn_u64)
+SIMD_INTRIN_DEF(reduce_maxn_u64)
+
+#line 818
+SIMD_INTRIN_DEF(minn_u64)
+SIMD_INTRIN_DEF(reduce_minn_u64)
+
+/**end repeat1**/
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+#line 830
+ SIMD_INTRIN_DEF(ifadd_u64)
+
+#line 830
+ SIMD_INTRIN_DEF(ifsub_u64)
+
+
+#if 0
+#line 837
+SIMD_INTRIN_DEF(ifdiv_u64)
+
+#line 837
+SIMD_INTRIN_DEF(ifdivz_u64)
+
+#endif
+
+#endif // simd_sup
+
+#line 630
+#if 1
+
+/***************************
+ * Memory
+ ***************************/
+#line 638
+SIMD_INTRIN_DEF(load_s64)
+
+#line 638
+SIMD_INTRIN_DEF(loada_s64)
+
+#line 638
+SIMD_INTRIN_DEF(loads_s64)
+
+#line 638
+SIMD_INTRIN_DEF(loadl_s64)
+
+#line 638
+SIMD_INTRIN_DEF(store_s64)
+
+#line 638
+SIMD_INTRIN_DEF(storea_s64)
+
+#line 638
+SIMD_INTRIN_DEF(stores_s64)
+
+#line 638
+SIMD_INTRIN_DEF(storel_s64)
+
+#line 638
+SIMD_INTRIN_DEF(storeh_s64)
+
+
+#line 644
+SIMD_INTRIN_DEF(load_s64x2)
+
+#line 644
+SIMD_INTRIN_DEF(store_s64x2)
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 1
+#line 655
+SIMD_INTRIN_DEF(load_till_s64)
+
+#line 655
+SIMD_INTRIN_DEF(load_tillz_s64)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_s64)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_till_s64)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_tillz_s64)
+
+#line 655
+SIMD_INTRIN_DEF(store_till_s64)
+
+#line 655
+SIMD_INTRIN_DEF(storen_s64)
+
+#line 655
+SIMD_INTRIN_DEF(storen_till_s64)
+
+#if 64 == 32
+    #line 662
+    SIMD_INTRIN_DEF(load2_till_s64)
+    
+#line 662
+    SIMD_INTRIN_DEF(load2_tillz_s64)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_s64)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_till_s64)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_tillz_s64)
+    
+#line 662
+    SIMD_INTRIN_DEF(store2_till_s64)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_s64)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_till_s64)
+    
+#else
+    #line 669
+    SIMD_INTRIN_DEF(load2_till_s64)
+    
+#line 669
+    SIMD_INTRIN_DEF(load2_tillz_s64)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_s64)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_till_s64)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_tillz_s64)
+    
+#line 669
+    SIMD_INTRIN_DEF(store2_till_s64)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_s64)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_till_s64)
+    
+#endif
+#endif // ncont_sup
+
+/****************************
+ * Lookup tables
+ ****************************/
+#if 64 == 32
+SIMD_INTRIN_DEF(lut32_s64)
+#endif
+#if 64 == 64
+SIMD_INTRIN_DEF(lut16_s64)
+#endif
+/***************************
+ * Misc
+ ***************************/
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u8_s64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s8_s64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u16_s64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s16_s64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u32_s64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s32_s64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u64_s64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s64_s64)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F32
+SIMD_INTRIN_DEF(reinterpret_f32_s64)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F64
+SIMD_INTRIN_DEF(reinterpret_f64_s64)
+#endif // simd_sup2
+
+
+#line 698
+SIMD_INTRIN_DEF(set_s64)
+
+#line 698
+SIMD_INTRIN_DEF(setf_s64)
+
+#line 698
+SIMD_INTRIN_DEF(setall_s64)
+
+#line 698
+SIMD_INTRIN_DEF(zero_s64)
+
+#line 698
+SIMD_INTRIN_DEF(select_s64)
+
+#line 698
+SIMD_INTRIN_DEF(extract0_s64)
+
+
+/***************************
+ * Reorder
+ ***************************/
+#line 707
+SIMD_INTRIN_DEF(combinel_s64)
+
+#line 707
+SIMD_INTRIN_DEF(combineh_s64)
+
+#line 707
+SIMD_INTRIN_DEF(combine_s64)
+
+#line 707
+SIMD_INTRIN_DEF(zip_s64)
+
+#line 707
+SIMD_INTRIN_DEF(unzip_s64)
+
+
+#if 0
+SIMD_INTRIN_DEF(rev64_s64)
+#endif
+
+#if 64 > 16
+{ "permi128_s64", simd__intrin_permi128_s64_, METH_VARARGS, NULL },
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if 63 > 0
+#line 725
+SIMD_INTRIN_DEF(shl_s64)
+
+#line 725
+SIMD_INTRIN_DEF(shr_s64)
+
+#line 725
+SIMD_INTRIN_DEF(shli_s64)
+
+#line 725
+SIMD_INTRIN_DEF(shri_s64)
+
+#endif // shl_imm
+
+#line 733
+SIMD_INTRIN_DEF(and_s64)
+
+#line 733
+SIMD_INTRIN_DEF(or_s64)
+
+#line 733
+SIMD_INTRIN_DEF(xor_s64)
+
+#line 733
+SIMD_INTRIN_DEF(not_s64)
+
+#line 733
+SIMD_INTRIN_DEF(cmpeq_s64)
+
+#line 733
+SIMD_INTRIN_DEF(cmpneq_s64)
+
+#line 733
+SIMD_INTRIN_DEF(cmpgt_s64)
+
+#line 733
+SIMD_INTRIN_DEF(cmpge_s64)
+
+#line 733
+SIMD_INTRIN_DEF(cmplt_s64)
+
+#line 733
+SIMD_INTRIN_DEF(cmple_s64)
+
+#line 733
+SIMD_INTRIN_DEF(any_s64)
+
+#line 733
+SIMD_INTRIN_DEF(all_s64)
+
+
+#if 0
+SIMD_INTRIN_DEF(andc_s64)
+SIMD_INTRIN_DEF(andc_b64)
+SIMD_INTRIN_DEF(orc_b64)
+SIMD_INTRIN_DEF(xnor_b64)
+#endif
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_INTRIN_DEF(cvt_s64_b64)
+SIMD_INTRIN_DEF(cvt_b64_s64)
+#if 0
+SIMD_INTRIN_DEF(expand_s64_s64)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+#line 757
+SIMD_INTRIN_DEF(add_s64)
+
+#line 757
+SIMD_INTRIN_DEF(sub_s64)
+
+
+#if 0
+#line 764
+SIMD_INTRIN_DEF(adds_s64)
+
+#line 764
+SIMD_INTRIN_DEF(subs_s64)
+
+#endif // sat_sup
+
+#if 0
+SIMD_INTRIN_DEF(mul_s64)
+#endif // mul_sup
+
+#if 0
+SIMD_INTRIN_DEF(div_s64)
+#endif // div_sup
+
+#if 1
+SIMD_INTRIN_DEF(divisor_s64)
+SIMD_INTRIN_DEF(divc_s64)
+#endif // intdiv_sup
+
+#if 0
+#line 785
+SIMD_INTRIN_DEF(muladd_s64)
+
+#line 785
+SIMD_INTRIN_DEF(mulsub_s64)
+
+#line 785
+SIMD_INTRIN_DEF(nmuladd_s64)
+
+#line 785
+SIMD_INTRIN_DEF(nmulsub_s64)
+
+#line 785
+SIMD_INTRIN_DEF(muladdsub_s64)
+
+#endif // fused_sup
+
+#if 0
+SIMD_INTRIN_DEF(sum_s64)
+#endif // sum_sup
+
+#if 0
+SIMD_INTRIN_DEF(sumup_s64)
+#endif // sumup_sup
+/***************************
+ * Math
+ ***************************/
+#if 0
+#line 803
+SIMD_INTRIN_DEF(sqrt_s64)
+
+#line 803
+SIMD_INTRIN_DEF(recip_s64)
+
+#line 803
+SIMD_INTRIN_DEF(abs_s64)
+
+#line 803
+SIMD_INTRIN_DEF(square_s64)
+
+#line 803
+SIMD_INTRIN_DEF(rint_s64)
+
+#line 803
+SIMD_INTRIN_DEF(ceil_s64)
+
+#line 803
+SIMD_INTRIN_DEF(trunc_s64)
+
+#line 803
+SIMD_INTRIN_DEF(floor_s64)
+
+#endif
+
+#line 810
+SIMD_INTRIN_DEF(max_s64)
+SIMD_INTRIN_DEF(reduce_max_s64)
+
+#line 810
+SIMD_INTRIN_DEF(min_s64)
+SIMD_INTRIN_DEF(reduce_min_s64)
+
+
+#if 0
+#line 818
+SIMD_INTRIN_DEF(maxp_s64)
+SIMD_INTRIN_DEF(reduce_maxp_s64)
+
+#line 818
+SIMD_INTRIN_DEF(minp_s64)
+SIMD_INTRIN_DEF(reduce_minp_s64)
+
+#line 818
+SIMD_INTRIN_DEF(maxn_s64)
+SIMD_INTRIN_DEF(reduce_maxn_s64)
+
+#line 818
+SIMD_INTRIN_DEF(minn_s64)
+SIMD_INTRIN_DEF(reduce_minn_s64)
+
+/**end repeat1**/
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+#line 830
+ SIMD_INTRIN_DEF(ifadd_s64)
+
+#line 830
+ SIMD_INTRIN_DEF(ifsub_s64)
+
+
+#if 0
+#line 837
+SIMD_INTRIN_DEF(ifdiv_s64)
+
+#line 837
+SIMD_INTRIN_DEF(ifdivz_s64)
+
+#endif
+
+#endif // simd_sup
+
+#line 630
+#if NPY_SIMD_F32
+
+/***************************
+ * Memory
+ ***************************/
+#line 638
+SIMD_INTRIN_DEF(load_f32)
+
+#line 638
+SIMD_INTRIN_DEF(loada_f32)
+
+#line 638
+SIMD_INTRIN_DEF(loads_f32)
+
+#line 638
+SIMD_INTRIN_DEF(loadl_f32)
+
+#line 638
+SIMD_INTRIN_DEF(store_f32)
+
+#line 638
+SIMD_INTRIN_DEF(storea_f32)
+
+#line 638
+SIMD_INTRIN_DEF(stores_f32)
+
+#line 638
+SIMD_INTRIN_DEF(storel_f32)
+
+#line 638
+SIMD_INTRIN_DEF(storeh_f32)
+
+
+#line 644
+SIMD_INTRIN_DEF(load_f32x2)
+
+#line 644
+SIMD_INTRIN_DEF(store_f32x2)
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 1
+#line 655
+SIMD_INTRIN_DEF(load_till_f32)
+
+#line 655
+SIMD_INTRIN_DEF(load_tillz_f32)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_f32)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_till_f32)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_tillz_f32)
+
+#line 655
+SIMD_INTRIN_DEF(store_till_f32)
+
+#line 655
+SIMD_INTRIN_DEF(storen_f32)
+
+#line 655
+SIMD_INTRIN_DEF(storen_till_f32)
+
+#if 32 == 32
+    #line 662
+    SIMD_INTRIN_DEF(load2_till_f32)
+    
+#line 662
+    SIMD_INTRIN_DEF(load2_tillz_f32)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_f32)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_till_f32)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_tillz_f32)
+    
+#line 662
+    SIMD_INTRIN_DEF(store2_till_f32)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_f32)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_till_f32)
+    
+#else
+    #line 669
+    SIMD_INTRIN_DEF(load2_till_f32)
+    
+#line 669
+    SIMD_INTRIN_DEF(load2_tillz_f32)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_f32)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_till_f32)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_tillz_f32)
+    
+#line 669
+    SIMD_INTRIN_DEF(store2_till_f32)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_f32)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_till_f32)
+    
+#endif
+#endif // ncont_sup
+
+/****************************
+ * Lookup tables
+ ****************************/
+#if 32 == 32
+SIMD_INTRIN_DEF(lut32_f32)
+#endif
+#if 32 == 64
+SIMD_INTRIN_DEF(lut16_f32)
+#endif
+/***************************
+ * Misc
+ ***************************/
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u8_f32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s8_f32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u16_f32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s16_f32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u32_f32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s32_f32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u64_f32)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s64_f32)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F32
+SIMD_INTRIN_DEF(reinterpret_f32_f32)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F64
+SIMD_INTRIN_DEF(reinterpret_f64_f32)
+#endif // simd_sup2
+
+
+#line 698
+SIMD_INTRIN_DEF(set_f32)
+
+#line 698
+SIMD_INTRIN_DEF(setf_f32)
+
+#line 698
+SIMD_INTRIN_DEF(setall_f32)
+
+#line 698
+SIMD_INTRIN_DEF(zero_f32)
+
+#line 698
+SIMD_INTRIN_DEF(select_f32)
+
+#line 698
+SIMD_INTRIN_DEF(extract0_f32)
+
+
+/***************************
+ * Reorder
+ ***************************/
+#line 707
+SIMD_INTRIN_DEF(combinel_f32)
+
+#line 707
+SIMD_INTRIN_DEF(combineh_f32)
+
+#line 707
+SIMD_INTRIN_DEF(combine_f32)
+
+#line 707
+SIMD_INTRIN_DEF(zip_f32)
+
+#line 707
+SIMD_INTRIN_DEF(unzip_f32)
+
+
+#if 1
+SIMD_INTRIN_DEF(rev64_f32)
+#endif
+
+#if 32 > 16
+{ "permi128_f32", simd__intrin_permi128_f32_, METH_VARARGS, NULL },
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if 0 > 0
+#line 725
+SIMD_INTRIN_DEF(shl_f32)
+
+#line 725
+SIMD_INTRIN_DEF(shr_f32)
+
+#line 725
+SIMD_INTRIN_DEF(shli_f32)
+
+#line 725
+SIMD_INTRIN_DEF(shri_f32)
+
+#endif // shl_imm
+
+#line 733
+SIMD_INTRIN_DEF(and_f32)
+
+#line 733
+SIMD_INTRIN_DEF(or_f32)
+
+#line 733
+SIMD_INTRIN_DEF(xor_f32)
+
+#line 733
+SIMD_INTRIN_DEF(not_f32)
+
+#line 733
+SIMD_INTRIN_DEF(cmpeq_f32)
+
+#line 733
+SIMD_INTRIN_DEF(cmpneq_f32)
+
+#line 733
+SIMD_INTRIN_DEF(cmpgt_f32)
+
+#line 733
+SIMD_INTRIN_DEF(cmpge_f32)
+
+#line 733
+SIMD_INTRIN_DEF(cmplt_f32)
+
+#line 733
+SIMD_INTRIN_DEF(cmple_f32)
+
+#line 733
+SIMD_INTRIN_DEF(any_f32)
+
+#line 733
+SIMD_INTRIN_DEF(all_f32)
+
+
+#if 0
+SIMD_INTRIN_DEF(andc_f32)
+SIMD_INTRIN_DEF(andc_b32)
+SIMD_INTRIN_DEF(orc_b32)
+SIMD_INTRIN_DEF(xnor_b32)
+#endif
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_INTRIN_DEF(cvt_f32_b32)
+SIMD_INTRIN_DEF(cvt_b32_f32)
+#if 0
+SIMD_INTRIN_DEF(expand_f32_f32)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+#line 757
+SIMD_INTRIN_DEF(add_f32)
+
+#line 757
+SIMD_INTRIN_DEF(sub_f32)
+
+
+#if 0
+#line 764
+SIMD_INTRIN_DEF(adds_f32)
+
+#line 764
+SIMD_INTRIN_DEF(subs_f32)
+
+#endif // sat_sup
+
+#if 1
+SIMD_INTRIN_DEF(mul_f32)
+#endif // mul_sup
+
+#if 1
+SIMD_INTRIN_DEF(div_f32)
+#endif // div_sup
+
+#if 0
+SIMD_INTRIN_DEF(divisor_f32)
+SIMD_INTRIN_DEF(divc_f32)
+#endif // intdiv_sup
+
+#if 1
+#line 785
+SIMD_INTRIN_DEF(muladd_f32)
+
+#line 785
+SIMD_INTRIN_DEF(mulsub_f32)
+
+#line 785
+SIMD_INTRIN_DEF(nmuladd_f32)
+
+#line 785
+SIMD_INTRIN_DEF(nmulsub_f32)
+
+#line 785
+SIMD_INTRIN_DEF(muladdsub_f32)
+
+#endif // fused_sup
+
+#if 1
+SIMD_INTRIN_DEF(sum_f32)
+#endif // sum_sup
+
+#if 0
+SIMD_INTRIN_DEF(sumup_f32)
+#endif // sumup_sup
+/***************************
+ * Math
+ ***************************/
+#if 1
+#line 803
+SIMD_INTRIN_DEF(sqrt_f32)
+
+#line 803
+SIMD_INTRIN_DEF(recip_f32)
+
+#line 803
+SIMD_INTRIN_DEF(abs_f32)
+
+#line 803
+SIMD_INTRIN_DEF(square_f32)
+
+#line 803
+SIMD_INTRIN_DEF(rint_f32)
+
+#line 803
+SIMD_INTRIN_DEF(ceil_f32)
+
+#line 803
+SIMD_INTRIN_DEF(trunc_f32)
+
+#line 803
+SIMD_INTRIN_DEF(floor_f32)
+
+#endif
+
+#line 810
+SIMD_INTRIN_DEF(max_f32)
+SIMD_INTRIN_DEF(reduce_max_f32)
+
+#line 810
+SIMD_INTRIN_DEF(min_f32)
+SIMD_INTRIN_DEF(reduce_min_f32)
+
+
+#if 1
+#line 818
+SIMD_INTRIN_DEF(maxp_f32)
+SIMD_INTRIN_DEF(reduce_maxp_f32)
+
+#line 818
+SIMD_INTRIN_DEF(minp_f32)
+SIMD_INTRIN_DEF(reduce_minp_f32)
+
+#line 818
+SIMD_INTRIN_DEF(maxn_f32)
+SIMD_INTRIN_DEF(reduce_maxn_f32)
+
+#line 818
+SIMD_INTRIN_DEF(minn_f32)
+SIMD_INTRIN_DEF(reduce_minn_f32)
+
+/**end repeat1**/
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+#line 830
+ SIMD_INTRIN_DEF(ifadd_f32)
+
+#line 830
+ SIMD_INTRIN_DEF(ifsub_f32)
+
+
+#if 1
+#line 837
+SIMD_INTRIN_DEF(ifdiv_f32)
+
+#line 837
+SIMD_INTRIN_DEF(ifdivz_f32)
+
+#endif
+
+#endif // simd_sup
+
+#line 630
+#if NPY_SIMD_F64
+
+/***************************
+ * Memory
+ ***************************/
+#line 638
+SIMD_INTRIN_DEF(load_f64)
+
+#line 638
+SIMD_INTRIN_DEF(loada_f64)
+
+#line 638
+SIMD_INTRIN_DEF(loads_f64)
+
+#line 638
+SIMD_INTRIN_DEF(loadl_f64)
+
+#line 638
+SIMD_INTRIN_DEF(store_f64)
+
+#line 638
+SIMD_INTRIN_DEF(storea_f64)
+
+#line 638
+SIMD_INTRIN_DEF(stores_f64)
+
+#line 638
+SIMD_INTRIN_DEF(storel_f64)
+
+#line 638
+SIMD_INTRIN_DEF(storeh_f64)
+
+
+#line 644
+SIMD_INTRIN_DEF(load_f64x2)
+
+#line 644
+SIMD_INTRIN_DEF(store_f64x2)
+
+
+/****************************************
+ * Non-contiguous/Partial Memory access
+ ****************************************/
+#if 1
+#line 655
+SIMD_INTRIN_DEF(load_till_f64)
+
+#line 655
+SIMD_INTRIN_DEF(load_tillz_f64)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_f64)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_till_f64)
+
+#line 655
+SIMD_INTRIN_DEF(loadn_tillz_f64)
+
+#line 655
+SIMD_INTRIN_DEF(store_till_f64)
+
+#line 655
+SIMD_INTRIN_DEF(storen_f64)
+
+#line 655
+SIMD_INTRIN_DEF(storen_till_f64)
+
+#if 64 == 32
+    #line 662
+    SIMD_INTRIN_DEF(load2_till_f64)
+    
+#line 662
+    SIMD_INTRIN_DEF(load2_tillz_f64)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_f64)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_till_f64)
+    
+#line 662
+    SIMD_INTRIN_DEF(loadn2_tillz_f64)
+    
+#line 662
+    SIMD_INTRIN_DEF(store2_till_f64)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_f64)
+    
+#line 662
+    SIMD_INTRIN_DEF(storen2_till_f64)
+    
+#else
+    #line 669
+    SIMD_INTRIN_DEF(load2_till_f64)
+    
+#line 669
+    SIMD_INTRIN_DEF(load2_tillz_f64)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_f64)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_till_f64)
+    
+#line 669
+    SIMD_INTRIN_DEF(loadn2_tillz_f64)
+    
+#line 669
+    SIMD_INTRIN_DEF(store2_till_f64)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_f64)
+    
+#line 669
+    SIMD_INTRIN_DEF(storen2_till_f64)
+    
+#endif
+#endif // ncont_sup
+
+/****************************
+ * Lookup tables
+ ****************************/
+#if 64 == 32
+SIMD_INTRIN_DEF(lut32_f64)
+#endif
+#if 64 == 64
+SIMD_INTRIN_DEF(lut16_f64)
+#endif
+/***************************
+ * Misc
+ ***************************/
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u8_f64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s8_f64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u16_f64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s16_f64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u32_f64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s32_f64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_u64_f64)
+#endif // simd_sup2
+
+#line 690
+#if 1
+SIMD_INTRIN_DEF(reinterpret_s64_f64)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F32
+SIMD_INTRIN_DEF(reinterpret_f32_f64)
+#endif // simd_sup2
+
+#line 690
+#if NPY_SIMD_F64
+SIMD_INTRIN_DEF(reinterpret_f64_f64)
+#endif // simd_sup2
+
+
+#line 698
+SIMD_INTRIN_DEF(set_f64)
+
+#line 698
+SIMD_INTRIN_DEF(setf_f64)
+
+#line 698
+SIMD_INTRIN_DEF(setall_f64)
+
+#line 698
+SIMD_INTRIN_DEF(zero_f64)
+
+#line 698
+SIMD_INTRIN_DEF(select_f64)
+
+#line 698
+SIMD_INTRIN_DEF(extract0_f64)
+
+
+/***************************
+ * Reorder
+ ***************************/
+#line 707
+SIMD_INTRIN_DEF(combinel_f64)
+
+#line 707
+SIMD_INTRIN_DEF(combineh_f64)
+
+#line 707
+SIMD_INTRIN_DEF(combine_f64)
+
+#line 707
+SIMD_INTRIN_DEF(zip_f64)
+
+#line 707
+SIMD_INTRIN_DEF(unzip_f64)
+
+
+#if 0
+SIMD_INTRIN_DEF(rev64_f64)
+#endif
+
+#if 64 > 16
+{ "permi128_f64", simd__intrin_permi128_f64_, METH_VARARGS, NULL },
+#endif
+
+/***************************
+ * Operators
+ ***************************/
+#if 0 > 0
+#line 725
+SIMD_INTRIN_DEF(shl_f64)
+
+#line 725
+SIMD_INTRIN_DEF(shr_f64)
+
+#line 725
+SIMD_INTRIN_DEF(shli_f64)
+
+#line 725
+SIMD_INTRIN_DEF(shri_f64)
+
+#endif // shl_imm
+
+#line 733
+SIMD_INTRIN_DEF(and_f64)
+
+#line 733
+SIMD_INTRIN_DEF(or_f64)
+
+#line 733
+SIMD_INTRIN_DEF(xor_f64)
+
+#line 733
+SIMD_INTRIN_DEF(not_f64)
+
+#line 733
+SIMD_INTRIN_DEF(cmpeq_f64)
+
+#line 733
+SIMD_INTRIN_DEF(cmpneq_f64)
+
+#line 733
+SIMD_INTRIN_DEF(cmpgt_f64)
+
+#line 733
+SIMD_INTRIN_DEF(cmpge_f64)
+
+#line 733
+SIMD_INTRIN_DEF(cmplt_f64)
+
+#line 733
+SIMD_INTRIN_DEF(cmple_f64)
+
+#line 733
+SIMD_INTRIN_DEF(any_f64)
+
+#line 733
+SIMD_INTRIN_DEF(all_f64)
+
+
+#if 0
+SIMD_INTRIN_DEF(andc_f64)
+SIMD_INTRIN_DEF(andc_b64)
+SIMD_INTRIN_DEF(orc_b64)
+SIMD_INTRIN_DEF(xnor_b64)
+#endif
+
+/***************************
+ * Conversion
+ ***************************/
+SIMD_INTRIN_DEF(cvt_f64_b64)
+SIMD_INTRIN_DEF(cvt_b64_f64)
+#if 0
+SIMD_INTRIN_DEF(expand_f64_f64)
+#endif // expand_sup
+/***************************
+ * Arithmetic
+ ***************************/
+#line 757
+SIMD_INTRIN_DEF(add_f64)
+
+#line 757
+SIMD_INTRIN_DEF(sub_f64)
+
+
+#if 0
+#line 764
+SIMD_INTRIN_DEF(adds_f64)
+
+#line 764
+SIMD_INTRIN_DEF(subs_f64)
+
+#endif // sat_sup
+
+#if 1
+SIMD_INTRIN_DEF(mul_f64)
+#endif // mul_sup
+
+#if 1
+SIMD_INTRIN_DEF(div_f64)
+#endif // div_sup
+
+#if 0
+SIMD_INTRIN_DEF(divisor_f64)
+SIMD_INTRIN_DEF(divc_f64)
+#endif // intdiv_sup
+
+#if 1
+#line 785
+SIMD_INTRIN_DEF(muladd_f64)
+
+#line 785
+SIMD_INTRIN_DEF(mulsub_f64)
+
+#line 785
+SIMD_INTRIN_DEF(nmuladd_f64)
+
+#line 785
+SIMD_INTRIN_DEF(nmulsub_f64)
+
+#line 785
+SIMD_INTRIN_DEF(muladdsub_f64)
+
+#endif // fused_sup
+
+#if 1
+SIMD_INTRIN_DEF(sum_f64)
+#endif // sum_sup
+
+#if 0
+SIMD_INTRIN_DEF(sumup_f64)
+#endif // sumup_sup
+/***************************
+ * Math
+ ***************************/
+#if 1
+#line 803
+SIMD_INTRIN_DEF(sqrt_f64)
+
+#line 803
+SIMD_INTRIN_DEF(recip_f64)
+
+#line 803
+SIMD_INTRIN_DEF(abs_f64)
+
+#line 803
+SIMD_INTRIN_DEF(square_f64)
+
+#line 803
+SIMD_INTRIN_DEF(rint_f64)
+
+#line 803
+SIMD_INTRIN_DEF(ceil_f64)
+
+#line 803
+SIMD_INTRIN_DEF(trunc_f64)
+
+#line 803
+SIMD_INTRIN_DEF(floor_f64)
+
+#endif
+
+#line 810
+SIMD_INTRIN_DEF(max_f64)
+SIMD_INTRIN_DEF(reduce_max_f64)
+
+#line 810
+SIMD_INTRIN_DEF(min_f64)
+SIMD_INTRIN_DEF(reduce_min_f64)
+
+
+#if 1
+#line 818
+SIMD_INTRIN_DEF(maxp_f64)
+SIMD_INTRIN_DEF(reduce_maxp_f64)
+
+#line 818
+SIMD_INTRIN_DEF(minp_f64)
+SIMD_INTRIN_DEF(reduce_minp_f64)
+
+#line 818
+SIMD_INTRIN_DEF(maxn_f64)
+SIMD_INTRIN_DEF(reduce_maxn_f64)
+
+#line 818
+SIMD_INTRIN_DEF(minn_f64)
+SIMD_INTRIN_DEF(reduce_minn_f64)
+
+/**end repeat1**/
+#endif
+
+/***************************
+ * Mask operations
+ ***************************/
+#line 830
+ SIMD_INTRIN_DEF(ifadd_f64)
+
+#line 830
+ SIMD_INTRIN_DEF(ifsub_f64)
+
+
+#if 1
+#line 837
+SIMD_INTRIN_DEF(ifdiv_f64)
+
+#line 837
+SIMD_INTRIN_DEF(ifdivz_f64)
+
+#endif
+
+#endif // simd_sup
+
+/*************************************************************************
+ * Variant
+ ************************************************************************/
+SIMD_INTRIN_DEF(cleanup)
+
+/*************************************************************************
+ * A special section for f32/f64 intrinsics outside the main repeater
+ ************************************************************************/
+/***************************
+ * Operators
+ ***************************/
+// check special cases
+#if NPY_SIMD_F32
+    SIMD_INTRIN_DEF(notnan_f32)
+#endif
+#if NPY_SIMD_F64
+    SIMD_INTRIN_DEF(notnan_f64)
+#endif
+/***************************
+ * Conversions
+ ***************************/
+// round to nearest integer (assume even)
+#if NPY_SIMD_F32
+    SIMD_INTRIN_DEF(round_s32_f32)
+#endif
+#if NPY_SIMD_F64
+    SIMD_INTRIN_DEF(round_s32_f64)
+#endif
+
+/*************************************************************************
+ * A special section for boolean intrinsics outside the main repeater
+ ************************************************************************/
+/***************************
+ * Operators
+ ***************************/
+#line 881
+// Logical
+SIMD_INTRIN_DEF(and_b8)
+SIMD_INTRIN_DEF(or_b8)
+SIMD_INTRIN_DEF(xor_b8)
+SIMD_INTRIN_DEF(not_b8)
+// test cross vector's lanes
+#line 890
+SIMD_INTRIN_DEF(any_b8)
+
+#line 890
+SIMD_INTRIN_DEF(all_b8)
+
+
+#line 881
+// Logical
+SIMD_INTRIN_DEF(and_b16)
+SIMD_INTRIN_DEF(or_b16)
+SIMD_INTRIN_DEF(xor_b16)
+SIMD_INTRIN_DEF(not_b16)
+// test cross vector's lanes
+#line 890
+SIMD_INTRIN_DEF(any_b16)
+
+#line 890
+SIMD_INTRIN_DEF(all_b16)
+
+
+#line 881
+// Logical
+SIMD_INTRIN_DEF(and_b32)
+SIMD_INTRIN_DEF(or_b32)
+SIMD_INTRIN_DEF(xor_b32)
+SIMD_INTRIN_DEF(not_b32)
+// test cross vector's lanes
+#line 890
+SIMD_INTRIN_DEF(any_b32)
+
+#line 890
+SIMD_INTRIN_DEF(all_b32)
+
+
+#line 881
+// Logical
+SIMD_INTRIN_DEF(and_b64)
+SIMD_INTRIN_DEF(or_b64)
+SIMD_INTRIN_DEF(xor_b64)
+SIMD_INTRIN_DEF(not_b64)
+// test cross vector's lanes
+#line 890
+SIMD_INTRIN_DEF(any_b64)
+
+#line 890
+SIMD_INTRIN_DEF(all_b64)
+
+
+/***************************
+ * Conversions
+ ***************************/
+// Convert mask vector to integer bitfield
+#line 900
+SIMD_INTRIN_DEF(tobits_b8)
+
+#line 900
+SIMD_INTRIN_DEF(tobits_b16)
+
+#line 900
+SIMD_INTRIN_DEF(tobits_b32)
+
+#line 900
+SIMD_INTRIN_DEF(tobits_b64)
+
+
+// Pack multiple vectors into one
+SIMD_INTRIN_DEF(pack_b8_b16)
+SIMD_INTRIN_DEF(pack_b8_b32)
+SIMD_INTRIN_DEF(pack_b8_b64)
+
+/************************************************************************/
+{NULL, NULL, 0, NULL}
+}; // PyMethodDef
+
+#endif // NPY_SIMD
+
+//#########################################################################
+//## Defining a separate module for each target
+//#########################################################################
+/*
+ * Create the Python module object for the CPU dispatch target this file is
+ * being compiled for.
+ *
+ * The module always exports the integer constants `simd`, `simd_f64`,
+ * `simd_f32`, `simd_fma3`, `simd_width` and `simd_bigendian`. When NPY_SIMD
+ * is non-zero it additionally gets the intrinsics method table, whatever
+ * PySIMDVectorType_Init() adds, and one `nlanes_<type>` constant per lane
+ * type.
+ *
+ * Returns the new module, or NULL on failure; a partially initialised module
+ * is released before returning NULL.
+ */
+NPY_VISIBILITY_HIDDEN PyObject *
+NPY_CPU_DISPATCH_CURFX(simd_create_module)(void)
+{
+    /* A module-level integer constant: attribute name and its value. */
+    struct int_constant {
+        const char *name;
+        long value;
+    };
+
+    /* The module name carries the target suffix when the build defines one. */
+    static struct PyModuleDef module_def = {
+        .m_base = PyModuleDef_HEAD_INIT,
+    #if defined(NPY_MTARGETS_CURRENT) // meson build
+        .m_name = "numpy.core._simd." NPY_TOSTRING(NPY_MTARGETS_CURRENT),
+    #elif defined(NPY__CPU_TARGET_CURRENT)
+        .m_name = "numpy.core._simd." NPY_TOSTRING(NPY__CPU_TARGET_CURRENT),
+    #else
+        .m_name = "numpy.core._simd.baseline",
+    #endif
+    #if NPY_SIMD
+        .m_methods = simd__intrinsics_methods,
+    #else
+        .m_methods = NULL,
+    #endif
+        .m_size = -1
+    };
+
+    /* Build-configuration constants, exported for every target. */
+    const struct int_constant config[] = {
+        {"simd", NPY_SIMD},
+        {"simd_f64", NPY_SIMD_F64},
+        {"simd_f32", NPY_SIMD_F32},
+        {"simd_fma3", NPY_SIMD_FMA3},
+        {"simd_width", NPY_SIMD_WIDTH},
+        {"simd_bigendian", NPY_SIMD_BIGENDIAN},
+    };
+
+#if NPY_SIMD
+    /* Lane count of each vector type; exported only when SIMD is enabled. */
+    const struct int_constant lanes[] = {
+        {"nlanes_u8", npyv_nlanes_u8},
+        {"nlanes_s8", npyv_nlanes_s8},
+        {"nlanes_u16", npyv_nlanes_u16},
+        {"nlanes_s16", npyv_nlanes_s16},
+        {"nlanes_u32", npyv_nlanes_u32},
+        {"nlanes_s32", npyv_nlanes_s32},
+        {"nlanes_u64", npyv_nlanes_u64},
+        {"nlanes_s64", npyv_nlanes_s64},
+        {"nlanes_f32", npyv_nlanes_f32},
+        {"nlanes_f64", npyv_nlanes_f64},
+    };
+#endif // NPY_SIMD
+
+    size_t idx;
+    PyObject *mod = PyModule_Create(&module_def);
+
+    if (mod == NULL) {
+        return NULL;
+    }
+
+    /* PyModule_AddIntConstant() reports failure with a non-zero result. */
+    for (idx = 0; idx < sizeof(config) / sizeof(config[0]); ++idx) {
+        const struct int_constant *entry = &config[idx];
+        if (PyModule_AddIntConstant(mod, entry->name, entry->value) != 0) {
+            goto fail;
+        }
+    }
+
+#if NPY_SIMD
+    /* Vector type registration comes first, then the lane counts. */
+    if (PySIMDVectorType_Init(mod) != 0) {
+        goto fail;
+    }
+    for (idx = 0; idx < sizeof(lanes) / sizeof(lanes[0]); ++idx) {
+        const struct int_constant *entry = &lanes[idx];
+        if (PyModule_AddIntConstant(mod, entry->name, entry->value) != 0) {
+            goto fail;
+        }
+    }
+#endif // NPY_SIMD
+
+    return mod;
+
+fail:
+    /* Release our reference to the partially initialised module. */
+    Py_DECREF(mod);
+    return NULL;
+}
+
diff --git a/numpy/core/src/_generated/_simd_data.inc b/numpy/core/src/_generated/_simd_data.inc
new file mode 100644
index 000000000000..3759852c24e5
--- /dev/null
+++ b/numpy/core/src/_generated/_simd_data.inc
@@ -0,0 +1,415 @@
+#line 1 "numpy/core/src/_simd/_simd_data.inc.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/**
+ * This file is included by `_simd.dispatch.c.src`. Its contents are affected by the simd configuration, and
+ * therefore must be built multiple times. Making it a standalone `.c` file with `NPY_VISIBILITY_HIDDEN`
+ * symbols would require judicious use of `NPY_CPU_DISPATCH_DECLARE` and `NPY_CPU_DISPATCH_CURFX`, which was
+ * deemed too harmful to readability.
+ */
+/************************************
+ ** Private Definitions
+ ************************************/
+static simd_data_info simd__data_registry[simd_data_end] =
+{
+    [simd_data_none] = {.pyname="none"},
+    #line 19
+    [simd_data_u8] = {
+        .pyname="int", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_scalar=1, .to_scalar = simd_data_u8, .to_vector = simd_data_vu8,
+        .lane_size = sizeof(npyv_lanetype_u8)
+    },
+    
+#line 19
+    [simd_data_u16] = {
+        .pyname="int", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_scalar=1, .to_scalar = simd_data_u16, .to_vector = simd_data_vu16,
+        .lane_size = sizeof(npyv_lanetype_u16)
+    },
+    
+#line 19
+    [simd_data_u32] = {
+        .pyname="int", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_scalar=1, .to_scalar = simd_data_u32, .to_vector = simd_data_vu32,
+        .lane_size = sizeof(npyv_lanetype_u32)
+    },
+    
+#line 19
+    [simd_data_u64] = {
+        .pyname="int", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_scalar=1, .to_scalar = simd_data_u64, .to_vector = simd_data_vu64,
+        .lane_size = sizeof(npyv_lanetype_u64)
+    },
+    
+#line 19
+    [simd_data_s8] = {
+        .pyname="int", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_scalar=1, .to_scalar = simd_data_s8, .to_vector = simd_data_vs8,
+        .lane_size = sizeof(npyv_lanetype_s8)
+    },
+    
+#line 19
+    [simd_data_s16] = {
+        .pyname="int", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_scalar=1, .to_scalar = simd_data_s16, .to_vector = simd_data_vs16,
+        .lane_size = sizeof(npyv_lanetype_s16)
+    },
+    
+#line 19
+    [simd_data_s32] = {
+        .pyname="int", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_scalar=1, .to_scalar = simd_data_s32, .to_vector = simd_data_vs32,
+        .lane_size = sizeof(npyv_lanetype_s32)
+    },
+    
+#line 19
+    [simd_data_s64] = {
+        .pyname="int", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_scalar=1, .to_scalar = simd_data_s64, .to_vector = simd_data_vs64,
+        .lane_size = sizeof(npyv_lanetype_s64)
+    },
+    
+#line 19
+    [simd_data_f32] = {
+        .pyname="float", .is_unsigned=!0&&!1, .is_signed=0, .is_float=1,
+        .is_scalar=1, .to_scalar = simd_data_f32, .to_vector = simd_data_vf32,
+        .lane_size = sizeof(npyv_lanetype_f32)
+    },
+    
+#line 19
+    [simd_data_f64] = {
+        .pyname="float", .is_unsigned=!0&&!1, .is_signed=0, .is_float=1,
+        .is_scalar=1, .to_scalar = simd_data_f64, .to_vector = simd_data_vf64,
+        .lane_size = sizeof(npyv_lanetype_f64)
+    },
+    
+    // sequences
+    #line 32
+    [simd_data_qu8] = {
+        .pyname="[int]", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_sequence=1, .to_scalar = simd_data_u8, .to_vector = simd_data_vu8,
+        .nlanes=npyv_nlanes_u8, .lane_size = sizeof(npyv_lanetype_u8)
+    },
+    
+#line 32
+    [simd_data_qu16] = {
+        .pyname="[int]", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_sequence=1, .to_scalar = simd_data_u16, .to_vector = simd_data_vu16,
+        .nlanes=npyv_nlanes_u16, .lane_size = sizeof(npyv_lanetype_u16)
+    },
+    
+#line 32
+    [simd_data_qu32] = {
+        .pyname="[int]", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_sequence=1, .to_scalar = simd_data_u32, .to_vector = simd_data_vu32,
+        .nlanes=npyv_nlanes_u32, .lane_size = sizeof(npyv_lanetype_u32)
+    },
+    
+#line 32
+    [simd_data_qu64] = {
+        .pyname="[int]", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_sequence=1, .to_scalar = simd_data_u64, .to_vector = simd_data_vu64,
+        .nlanes=npyv_nlanes_u64, .lane_size = sizeof(npyv_lanetype_u64)
+    },
+    
+#line 32
+    [simd_data_qs8] = {
+        .pyname="[int]", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_sequence=1, .to_scalar = simd_data_s8, .to_vector = simd_data_vs8,
+        .nlanes=npyv_nlanes_s8, .lane_size = sizeof(npyv_lanetype_s8)
+    },
+    
+#line 32
+    [simd_data_qs16] = {
+        .pyname="[int]", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_sequence=1, .to_scalar = simd_data_s16, .to_vector = simd_data_vs16,
+        .nlanes=npyv_nlanes_s16, .lane_size = sizeof(npyv_lanetype_s16)
+    },
+    
+#line 32
+    [simd_data_qs32] = {
+        .pyname="[int]", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_sequence=1, .to_scalar = simd_data_s32, .to_vector = simd_data_vs32,
+        .nlanes=npyv_nlanes_s32, .lane_size = sizeof(npyv_lanetype_s32)
+    },
+    
+#line 32
+    [simd_data_qs64] = {
+        .pyname="[int]", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_sequence=1, .to_scalar = simd_data_s64, .to_vector = simd_data_vs64,
+        .nlanes=npyv_nlanes_s64, .lane_size = sizeof(npyv_lanetype_s64)
+    },
+    
+#line 32
+    [simd_data_qf32] = {
+        .pyname="[float]", .is_unsigned=!0&&!1, .is_signed=0, .is_float=1,
+        .is_sequence=1, .to_scalar = simd_data_f32, .to_vector = simd_data_vf32,
+        .nlanes=npyv_nlanes_f32, .lane_size = sizeof(npyv_lanetype_f32)
+    },
+    
+#line 32
+    [simd_data_qf64] = {
+        .pyname="[float]", .is_unsigned=!0&&!1, .is_signed=0, .is_float=1,
+        .is_sequence=1, .to_scalar = simd_data_f64, .to_vector = simd_data_vf64,
+        .nlanes=npyv_nlanes_f64, .lane_size = sizeof(npyv_lanetype_f64)
+    },
+    
+    // vectors
+    #line 44
+    [simd_data_vu8] = {
+        .pyname="npyv_u8", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_vector=1, .to_scalar = simd_data_u8, .to_vector = simd_data_vu8,
+        .nlanes=npyv_nlanes_u8, .lane_size = sizeof(npyv_lanetype_u8)
+    },
+    
+#line 44
+    [simd_data_vu16] = {
+        .pyname="npyv_u16", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_vector=1, .to_scalar = simd_data_u16, .to_vector = simd_data_vu16,
+        .nlanes=npyv_nlanes_u16, .lane_size = sizeof(npyv_lanetype_u16)
+    },
+    
+#line 44
+    [simd_data_vu32] = {
+        .pyname="npyv_u32", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_vector=1, .to_scalar = simd_data_u32, .to_vector = simd_data_vu32,
+        .nlanes=npyv_nlanes_u32, .lane_size = sizeof(npyv_lanetype_u32)
+    },
+    
+#line 44
+    [simd_data_vu64] = {
+        .pyname="npyv_u64", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_vector=1, .to_scalar = simd_data_u64, .to_vector = simd_data_vu64,
+        .nlanes=npyv_nlanes_u64, .lane_size = sizeof(npyv_lanetype_u64)
+    },
+    
+#line 44
+    [simd_data_vs8] = {
+        .pyname="npyv_s8", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_vector=1, .to_scalar = simd_data_s8, .to_vector = simd_data_vs8,
+        .nlanes=npyv_nlanes_s8, .lane_size = sizeof(npyv_lanetype_s8)
+    },
+    
+#line 44
+    [simd_data_vs16] = {
+        .pyname="npyv_s16", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_vector=1, .to_scalar = simd_data_s16, .to_vector = simd_data_vs16,
+        .nlanes=npyv_nlanes_s16, .lane_size = sizeof(npyv_lanetype_s16)
+    },
+    
+#line 44
+    [simd_data_vs32] = {
+        .pyname="npyv_s32", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_vector=1, .to_scalar = simd_data_s32, .to_vector = simd_data_vs32,
+        .nlanes=npyv_nlanes_s32, .lane_size = sizeof(npyv_lanetype_s32)
+    },
+    
+#line 44
+    [simd_data_vs64] = {
+        .pyname="npyv_s64", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_vector=1, .to_scalar = simd_data_s64, .to_vector = simd_data_vs64,
+        .nlanes=npyv_nlanes_s64, .lane_size = sizeof(npyv_lanetype_s64)
+    },
+    
+#line 44
+    [simd_data_vf32] = {
+        .pyname="npyv_f32", .is_unsigned=!0&&!1, .is_signed=0, .is_float=1,
+        .is_vector=1, .to_scalar = simd_data_f32, .to_vector = simd_data_vf32,
+        .nlanes=npyv_nlanes_f32, .lane_size = sizeof(npyv_lanetype_f32)
+    },
+    
+#line 44
+    [simd_data_vf64] = {
+        .pyname="npyv_f64", .is_unsigned=!0&&!1, .is_signed=0, .is_float=1,
+        .is_vector=1, .to_scalar = simd_data_f64, .to_vector = simd_data_vf64,
+        .nlanes=npyv_nlanes_f64, .lane_size = sizeof(npyv_lanetype_f64)
+    },
+    
+    // boolean vectors, treated as unsigned and converted internally
+    // to add compatibility among all SIMD extensions
+    #line 56
+    [simd_data_vb8] = {
+        .pyname="npyv_b8", .is_bool=1, .is_vector=1,
+        .to_scalar = simd_data_u8, .to_vector = simd_data_vu8,
+        .nlanes=npyv_nlanes_u8, .lane_size = sizeof(npyv_lanetype_u8)
+    },
+    
+#line 56
+    [simd_data_vb16] = {
+        .pyname="npyv_b16", .is_bool=1, .is_vector=1,
+        .to_scalar = simd_data_u16, .to_vector = simd_data_vu16,
+        .nlanes=npyv_nlanes_u16, .lane_size = sizeof(npyv_lanetype_u16)
+    },
+    
+#line 56
+    [simd_data_vb32] = {
+        .pyname="npyv_b32", .is_bool=1, .is_vector=1,
+        .to_scalar = simd_data_u32, .to_vector = simd_data_vu32,
+        .nlanes=npyv_nlanes_u32, .lane_size = sizeof(npyv_lanetype_u32)
+    },
+    
+#line 56
+    [simd_data_vb64] = {
+        .pyname="npyv_b64", .is_bool=1, .is_vector=1,
+        .to_scalar = simd_data_u64, .to_vector = simd_data_vu64,
+        .nlanes=npyv_nlanes_u64, .lane_size = sizeof(npyv_lanetype_u64)
+    },
+    
+    // multi-vectors x2
+    #line 68
+    [simd_data_vu8x2] = {
+        .pyname="npyv_u8x2", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_vectorx=2, .to_scalar = simd_data_u8, .to_vector = simd_data_vu8,
+        .nlanes=2, .lane_size = sizeof(npyv_lanetype_u8)
+    },
+    
+#line 68
+    [simd_data_vu16x2] = {
+        .pyname="npyv_u16x2", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_vectorx=2, .to_scalar = simd_data_u16, .to_vector = simd_data_vu16,
+        .nlanes=2, .lane_size = sizeof(npyv_lanetype_u16)
+    },
+    
+#line 68
+    [simd_data_vu32x2] = {
+        .pyname="npyv_u32x2", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_vectorx=2, .to_scalar = simd_data_u32, .to_vector = simd_data_vu32,
+        .nlanes=2, .lane_size = sizeof(npyv_lanetype_u32)
+    },
+    
+#line 68
+    [simd_data_vu64x2] = {
+        .pyname="npyv_u64x2", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_vectorx=2, .to_scalar = simd_data_u64, .to_vector = simd_data_vu64,
+        .nlanes=2, .lane_size = sizeof(npyv_lanetype_u64)
+    },
+    
+#line 68
+    [simd_data_vs8x2] = {
+        .pyname="npyv_s8x2", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_vectorx=2, .to_scalar = simd_data_s8, .to_vector = simd_data_vs8,
+        .nlanes=2, .lane_size = sizeof(npyv_lanetype_s8)
+    },
+    
+#line 68
+    [simd_data_vs16x2] = {
+        .pyname="npyv_s16x2", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_vectorx=2, .to_scalar = simd_data_s16, .to_vector = simd_data_vs16,
+        .nlanes=2, .lane_size = sizeof(npyv_lanetype_s16)
+    },
+    
+#line 68
+    [simd_data_vs32x2] = {
+        .pyname="npyv_s32x2", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_vectorx=2, .to_scalar = simd_data_s32, .to_vector = simd_data_vs32,
+        .nlanes=2, .lane_size = sizeof(npyv_lanetype_s32)
+    },
+    
+#line 68
+    [simd_data_vs64x2] = {
+        .pyname="npyv_s64x2", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_vectorx=2, .to_scalar = simd_data_s64, .to_vector = simd_data_vs64,
+        .nlanes=2, .lane_size = sizeof(npyv_lanetype_s64)
+    },
+    
+#line 68
+    [simd_data_vf32x2] = {
+        .pyname="npyv_f32x2", .is_unsigned=!0&&!1, .is_signed=0, .is_float=1,
+        .is_vectorx=2, .to_scalar = simd_data_f32, .to_vector = simd_data_vf32,
+        .nlanes=2, .lane_size = sizeof(npyv_lanetype_f32)
+    },
+    
+#line 68
+    [simd_data_vf64x2] = {
+        .pyname="npyv_f64x2", .is_unsigned=!0&&!1, .is_signed=0, .is_float=1,
+        .is_vectorx=2, .to_scalar = simd_data_f64, .to_vector = simd_data_vf64,
+        .nlanes=2, .lane_size = sizeof(npyv_lanetype_f64)
+    },
+    
+    // multi-vectors x3
+    #line 80
+    [simd_data_vu8x3] = {
+        .pyname="npyv_u8x3", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_vectorx=3, .to_scalar = simd_data_u8, .to_vector = simd_data_vu8,
+        .nlanes=3, .lane_size = sizeof(npyv_lanetype_u8)
+    },
+    
+#line 80
+    [simd_data_vu16x3] = {
+        .pyname="npyv_u16x3", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_vectorx=3, .to_scalar = simd_data_u16, .to_vector = simd_data_vu16,
+        .nlanes=3, .lane_size = sizeof(npyv_lanetype_u16)
+    },
+    
+#line 80
+    [simd_data_vu32x3] = {
+        .pyname="npyv_u32x3", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_vectorx=3, .to_scalar = simd_data_u32, .to_vector = simd_data_vu32,
+        .nlanes=3, .lane_size = sizeof(npyv_lanetype_u32)
+    },
+    
+#line 80
+    [simd_data_vu64x3] = {
+        .pyname="npyv_u64x3", .is_unsigned=!0&&!0, .is_signed=0, .is_float=0,
+        .is_vectorx=3, .to_scalar = simd_data_u64, .to_vector = simd_data_vu64,
+        .nlanes=3, .lane_size = sizeof(npyv_lanetype_u64)
+    },
+    
+#line 80
+    [simd_data_vs8x3] = {
+        .pyname="npyv_s8x3", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_vectorx=3, .to_scalar = simd_data_s8, .to_vector = simd_data_vs8,
+        .nlanes=3, .lane_size = sizeof(npyv_lanetype_s8)
+    },
+    
+#line 80
+    [simd_data_vs16x3] = {
+        .pyname="npyv_s16x3", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_vectorx=3, .to_scalar = simd_data_s16, .to_vector = simd_data_vs16,
+        .nlanes=3, .lane_size = sizeof(npyv_lanetype_s16)
+    },
+    
+#line 80
+    [simd_data_vs32x3] = {
+        .pyname="npyv_s32x3", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_vectorx=3, .to_scalar = simd_data_s32, .to_vector = simd_data_vs32,
+        .nlanes=3, .lane_size = sizeof(npyv_lanetype_s32)
+    },
+    
+#line 80
+    [simd_data_vs64x3] = {
+        .pyname="npyv_s64x3", .is_unsigned=!1&&!0, .is_signed=1, .is_float=0,
+        .is_vectorx=3, .to_scalar = simd_data_s64, .to_vector = simd_data_vs64,
+        .nlanes=3, .lane_size = sizeof(npyv_lanetype_s64)
+    },
+    
+#line 80
+    [simd_data_vf32x3] = {
+        .pyname="npyv_f32x3", .is_unsigned=!0&&!1, .is_signed=0, .is_float=1,
+        .is_vectorx=3, .to_scalar = simd_data_f32, .to_vector = simd_data_vf32,
+        .nlanes=3, .lane_size = sizeof(npyv_lanetype_f32)
+    },
+    
+#line 80
+    [simd_data_vf64x3] = {
+        .pyname="npyv_f64x3", .is_unsigned=!0&&!1, .is_signed=0, .is_float=1,
+        .is_vectorx=3, .to_scalar = simd_data_f64, .to_vector = simd_data_vf64,
+        .nlanes=3, .lane_size = sizeof(npyv_lanetype_f64)
+    },
+    
+};
+
+/************************************
+ ** Protected Definitions
+ ************************************/
+static const simd_data_info *
+simd_data_getinfo(simd_data_type dtype)
+{ return &simd__data_registry[dtype]; } // unchecked index: dtype must be < simd_data_end
+
diff --git a/numpy/core/src/_generated/_simd_inc.h b/numpy/core/src/_generated/_simd_inc.h
new file mode 100644
index 000000000000..c6e41ee421cd
--- /dev/null
+++ b/numpy/core/src/_generated/_simd_inc.h
@@ -0,0 +1,693 @@
+#line 1 "numpy/core/src/_simd/_simd_inc.h.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+#ifndef _SIMD_SIMD_INC_H_
+#define _SIMD_SIMD_INC_H_
+
+#include <Python.h>
+#include "simd/simd.h"
+
+#if NPY_SIMD
+/************************************
+ ** Types
+ ************************************/
+/**
+ * Gather all data types supported by the module.
+*/
+typedef union
+{
+    // scalars
+    #line 20
+    npyv_lanetype_u8 u8;
+    
+#line 20
+    npyv_lanetype_u16 u16;
+    
+#line 20
+    npyv_lanetype_u32 u32;
+    
+#line 20
+    npyv_lanetype_u64 u64;
+    
+#line 20
+    npyv_lanetype_s8 s8;
+    
+#line 20
+    npyv_lanetype_s16 s16;
+    
+#line 20
+    npyv_lanetype_s32 s32;
+    
+#line 20
+    npyv_lanetype_s64 s64;
+    
+#line 20
+    npyv_lanetype_f32 f32;
+    
+#line 20
+    npyv_lanetype_f64 f64;
+    
+    // sequence
+    #line 26
+    npyv_lanetype_u8 *qu8;
+    
+#line 26
+    npyv_lanetype_u16 *qu16;
+    
+#line 26
+    npyv_lanetype_u32 *qu32;
+    
+#line 26
+    npyv_lanetype_u64 *qu64;
+    
+#line 26
+    npyv_lanetype_s8 *qs8;
+    
+#line 26
+    npyv_lanetype_s16 *qs16;
+    
+#line 26
+    npyv_lanetype_s32 *qs32;
+    
+#line 26
+    npyv_lanetype_s64 *qs64;
+    
+#line 26
+    npyv_lanetype_f32 *qf32;
+    
+#line 26
+    npyv_lanetype_f64 *qf64;
+    
+    // vectors
+    #line 32
+    npyv_u8 vu8;
+    
+#line 32
+    npyv_u16 vu16;
+    
+#line 32
+    npyv_u32 vu32;
+    
+#line 32
+    npyv_u64 vu64;
+    
+#line 32
+    npyv_s8 vs8;
+    
+#line 32
+    npyv_s16 vs16;
+    
+#line 32
+    npyv_s32 vs32;
+    
+#line 32
+    npyv_s64 vs64;
+    
+#line 32
+    npyv_b8 vb8;
+    
+#line 32
+    npyv_b16 vb16;
+    
+#line 32
+    npyv_b32 vb32;
+    
+#line 32
+    npyv_b64 vb64;
+    
+    // multi-vectors x2
+    #line 38
+    npyv_u8x2 vu8x2;
+    
+#line 38
+    npyv_u16x2 vu16x2;
+    
+#line 38
+    npyv_u32x2 vu32x2;
+    
+#line 38
+    npyv_u64x2 vu64x2;
+    
+#line 38
+    npyv_s8x2 vs8x2;
+    
+#line 38
+    npyv_s16x2 vs16x2;
+    
+#line 38
+    npyv_s32x2 vs32x2;
+    
+#line 38
+    npyv_s64x2 vs64x2;
+    
+    // multi-vectors x3
+    #line 44
+    npyv_u8x3 vu8x3;
+    
+#line 44
+    npyv_u16x3 vu16x3;
+    
+#line 44
+    npyv_u32x3 vu32x3;
+    
+#line 44
+    npyv_u64x3 vu64x3;
+    
+#line 44
+    npyv_s8x3 vs8x3;
+    
+#line 44
+    npyv_s16x3 vs16x3;
+    
+#line 44
+    npyv_s32x3 vs32x3;
+    
+#line 44
+    npyv_s64x3 vs64x3;
+    
+#if NPY_SIMD_F32
+    npyv_f32    vf32;
+    npyv_f32x2  vf32x2;
+    npyv_f32x3  vf32x3;
+#endif
+#if NPY_SIMD_F64
+    npyv_f64    vf64;
+    npyv_f64x2  vf64x2;
+    npyv_f64x3  vf64x3;
+#endif
+} simd_data;
+
+/**
+ * Data types IDs and suffixes. Must be same data types as the ones
+ * in union 'simd_data' to fit the macros in '_simd_inc_easyintrin.h'.
+*/
+typedef enum
+{
+    simd_data_none = 0,
+    // scalars
+    #line 69
+    simd_data_u8,
+    
+#line 69
+    simd_data_u16,
+    
+#line 69
+    simd_data_u32,
+    
+#line 69
+    simd_data_u64,
+    
+#line 69
+    simd_data_s8,
+    
+#line 69
+    simd_data_s16,
+    
+#line 69
+    simd_data_s32,
+    
+#line 69
+    simd_data_s64,
+    
+#line 69
+    simd_data_f32,
+    
+#line 69
+    simd_data_f64,
+    
+    // sequences
+    #line 75
+    simd_data_qu8,
+    
+#line 75
+    simd_data_qu16,
+    
+#line 75
+    simd_data_qu32,
+    
+#line 75
+    simd_data_qu64,
+    
+#line 75
+    simd_data_qs8,
+    
+#line 75
+    simd_data_qs16,
+    
+#line 75
+    simd_data_qs32,
+    
+#line 75
+    simd_data_qs64,
+    
+#line 75
+    simd_data_qf32,
+    
+#line 75
+    simd_data_qf64,
+    
+    // vectors
+    #line 81
+    simd_data_vu8,
+    
+#line 81
+    simd_data_vu16,
+    
+#line 81
+    simd_data_vu32,
+    
+#line 81
+    simd_data_vu64,
+    
+#line 81
+    simd_data_vs8,
+    
+#line 81
+    simd_data_vs16,
+    
+#line 81
+    simd_data_vs32,
+    
+#line 81
+    simd_data_vs64,
+    
+#line 81
+    simd_data_vf32,
+    
+#line 81
+    simd_data_vf64,
+    
+#line 81
+    simd_data_vb8,
+    
+#line 81
+    simd_data_vb16,
+    
+#line 81
+    simd_data_vb32,
+    
+#line 81
+    simd_data_vb64,
+    
+    // multi-vectors x2
+    #line 87
+    simd_data_vu8x2,
+    
+#line 87
+    simd_data_vu16x2,
+    
+#line 87
+    simd_data_vu32x2,
+    
+#line 87
+    simd_data_vu64x2,
+    
+#line 87
+    simd_data_vs8x2,
+    
+#line 87
+    simd_data_vs16x2,
+    
+#line 87
+    simd_data_vs32x2,
+    
+#line 87
+    simd_data_vs64x2,
+    
+#line 87
+    simd_data_vf32x2,
+    
+#line 87
+    simd_data_vf64x2,
+    
+    // multi-vectors x3
+    #line 93
+    simd_data_vu8x3,
+    
+#line 93
+    simd_data_vu16x3,
+    
+#line 93
+    simd_data_vu32x3,
+    
+#line 93
+    simd_data_vu64x3,
+    
+#line 93
+    simd_data_vs8x3,
+    
+#line 93
+    simd_data_vs16x3,
+    
+#line 93
+    simd_data_vs32x3,
+    
+#line 93
+    simd_data_vs64x3,
+    
+#line 93
+    simd_data_vf32x3,
+    
+#line 93
+    simd_data_vf64x3,
+    
+    simd_data_end,
+} simd_data_type;
+/************************************
+ ** Declarations (inc_data)
+ ************************************/
+/**
+ * Per-type information record, one for each simd_data_type ID.
+ */
+typedef struct
+{
+    // type name in Python style, e.g. "int", "[float]" or "npyv_u8"
+    const char *pyname;
+    // '1' if the type represents an unsigned integer
+    unsigned int is_unsigned:1;
+    // '1' if the type represents a signed integer
+    unsigned int is_signed:1;
+    // '1' if the type represents a single or double precision float
+    unsigned int is_float:1;
+    // '1' if the type represents a boolean
+    unsigned int is_bool:1;
+    // '1' if the type represents a sequence
+    unsigned int is_sequence:1;
+    // '1' if the type represents a scalar
+    unsigned int is_scalar:1;
+    // '1' if the type represents a vector
+    unsigned int is_vector:1;
+    // the length of the multi-vector if the type represents an x2 or x3 vector,
+    // otherwise 0, e.g. 2 if the data type is simd_data_vu8x2
+    int is_vectorx;
+    // the equivalent scalar data type e.g. simd_data_vu8 -> simd_data_u8
+    simd_data_type to_scalar;
+    // the equivalent vector data type e.g. simd_data_s8 -> simd_data_vs8
+    // NOTE: holds the equivalent "unsigned" vector type in case of a "boolean" vector
+    // e.g. simd_data_vb8 -> simd_data_vu8
+    simd_data_type to_vector;
+    // number of vector lanes
+    int nlanes;
+    // sizeof lane type
+    int lane_size;
+} simd_data_info;
+
+/**
+ * Returns data info of certain dtype.
+ *
+ * Example:
+ **  const simd_data_info *info = simd_data_getinfo(simd_data_vu8);
+ **  if (info->is_vector && info->is_unsigned) {
+ **     ...
+ **  }
+ */
+static const simd_data_info *
+simd_data_getinfo(simd_data_type dtype);
+
+/************************************
+ ** Declarations (inc_vector)
+ ************************************/
+typedef struct
+{
+    PyObject_HEAD
+    // vector type id
+    simd_data_type dtype;
+    // vector data, aligned for safe casting
+    npyv_lanetype_u8 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) data[NPY_SIMD_WIDTH];
+} PySIMDVectorObject;
+/**
+ * Create a Python obj(PySIMDVectorObject) from a NPYV vector based on the contents
+ * of `data`(simd_data) and according to the vector data type `dtype`
+ * on range(simd_data_[vu8:vf64]).
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ *
+ * Example:
+ ** simd_data data = {.vu8 = npyv_setall_u8(0xff)};
+ ** PySIMDVectorObject *obj = PySIMDVector_FromData(data, simd_data_vu8);
+ ** if (obj != NULL) {
+ **    printf("I have a valid vector obj and first element is %d\n", obj->data[0]);
+ **    Py_DECREF(obj);
+ ** }
+ */
+static PySIMDVectorObject *
+PySIMDVector_FromData(simd_data data, simd_data_type dtype);
+/**
+ * Return a NPYV vector(simd_data) representation of `obj`(PySIMDVectorObject) and
+ * according to the vector data type `dtype` on range (simd_data_[vu8:vf64]).
+ * Raise a Python exception on failure.
+ *
+ * Example:
+ ** simd_data data = PySIMDVector_AsData(vec_obj, simd_data_vf32);
+ ** if (!PyErr_Occurred()) {
+ **    npyv_f32 add_1 = npyv_add_f32(data.vf32, npyv_setall_f32(1));
+ **    ...
+ ** }
+ */
+static simd_data
+PySIMDVector_AsData(PySIMDVectorObject *obj, simd_data_type dtype);
+/**
+ * Initialize and register PySIMDVectorType with the given module `module`;
+ * PySIMDVectorType can then be reached through the attribute 'vector_type'.
+ * Return -1 on error, 0 on success.
+ */
+static int
+PySIMDVectorType_Init(PyObject *module);
+
+/************************************
+ ** Declarations (inc_convert)
+ ************************************/
+/**
+ * Return a C scalar(simd_data) representation of `obj` and
+ * according to the scalar data type `dtype` on range (simd_data_[u8:f64]).
+ * Raise a Python exception on failure.
+ *
+ * Example:
+ ** simd_data data = simd_scalar_from_number(obj, simd_data_f32);
+ ** if (!PyErr_Occurred()) {
+ **    printf("I have a valid float %f\n", data.f32);
+ ** }
+ */
+static simd_data
+simd_scalar_from_number(PyObject *obj, simd_data_type dtype);
+/**
+ * Create a Python scalar from a C scalar based on the contents
+ * of `data`(simd_data) and according to the scalar data type `dtype`
+ * on range(simd_data_[u8:f64]).
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ *
+ * Example:
+ ** simd_data data = {.u32 = 0x7fffffff};
+ ** PyObject *obj = simd_scalar_to_number(data, simd_data_s32);
+ ** if (obj != NULL) {
+ **    printf("I have a valid Python integer %ld\n", PyLong_AsLong(obj));
+ **    Py_DECREF(obj);
+ ** }
+ */
+static PyObject *
+simd_scalar_to_number(simd_data data, simd_data_type dtype);
+/**
+ * Allocate a C array in memory according to number of elements `len`
+ * and sequence data type `dtype` on range(simd_data_[qu8:qf64]).
+ *
+ * Return aligned pointer based on `NPY_SIMD_WIDTH` or NULL
+ * with a Python exception on failure.
+ *
+ * Example:
+ ** npyv_lanetype_f64 *aligned_ptr = simd_sequence_new(npyv_nlanes_f64, simd_data_qf64);
+ ** if (aligned_ptr != NULL) {
+ **    // aligned store
+ **    npyv_storea_f64(aligned_ptr, npyv_setall_f64(1.0));
+ **    printf("The first element of my array %f\n", aligned_ptr[0]);
+ **    simd_sequence_free(aligned_ptr);
+ ** }
+ */
+static void *
+simd_sequence_new(Py_ssize_t len, simd_data_type dtype);
+/**
+ * Return the number of elements of the allocated C array `ptr`
+ * by `simd_sequence_new()` or `simd_sequence_from_iterable()`.
+ */
+static Py_ssize_t
+simd_sequence_len(const void *ptr);
+/**
+ * Free the allocated C array by `simd_sequence_new()` or
+ * `simd_sequence_from_iterable()`.
+ */
+static void
+simd_sequence_free(void *ptr);
+/**
+ * Return a C array representation of a PyObject sequence `obj` and
+ * according to the sequence data type `dtype` on range (simd_data_[qu8:qf64]).
+ *
+ * Note: parameter `min_size` takes the number of minimum acceptable elements.
+ *
+ * Return aligned pointer based on `NPY_SIMD_WIDTH` or NULL
+ * with a Python exception on failure.
+ *
+ * Example:
+ ** npyv_lanetype_u32 *ptr = simd_sequence_from_iterable(seq_obj, simd_data_qu32, npyv_nlanes_u32);
+ ** if (ptr != NULL) {
+ **     npyv_u32 a = npyv_load_u32(ptr);
+ **     ...
+ **     simd_sequence_free(ptr);
+ ** }
+ **
+ */
+static void *
+simd_sequence_from_iterable(PyObject *obj, simd_data_type dtype, Py_ssize_t min_size);
+/**
+ * Fill a Python sequence object `obj` with a C array `ptr` allocated by
+ * `simd_sequence_new()` or `simd_sequence_from_iterable()` according to
+ * the sequence data type `dtype` on range (simd_data_[qu8:qf64]).
+ *
+ * Return 0 on success or -1 with a Python exception on failure.
+ */
+static int
+simd_sequence_fill_iterable(PyObject *obj, const void *ptr, simd_data_type dtype);
+/**
+ * Create a Python list from a C array `ptr` allocated by
+ * `simd_sequence_new()` or `simd_sequence_from_iterable()` according to
+ * the sequence data type `dtype` on range (simd_data_[qu8:qf64]).
+ *
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ */
+static PyObject *
+simd_sequence_to_list(const void *ptr, simd_data_type dtype);
+/**
+ * Return a SIMD multi-vector(simd_data) representation of Python tuple of
+ * (simd_vector*,) `obj` according to the multi-vector data type `dtype`
+ * on range (simd_data_[vu8x2:vf64x2])-(simd_data_[vu8x3:vf64x3]).
+ *
+ * Raise a Python exception on failure.
+ *
+ * Example:
+ ** simd_data data = simd_vectorx_from_tuple(tuple_obj, simd_data_vf32x2);
+ ** if (!PyErr_Occurred()) {
+ **     npyv_f32 sum = npyv_add_f32(data.vf32x2.val[0], data.vf32x2.val[1]);
+ **     ...
+ ** }
+ **
+ */
+static simd_data
+simd_vectorx_from_tuple(PyObject *obj, simd_data_type dtype);
+/**
+ * Create a Python tuple of 'simd_vector' from a SIMD multi-vector
+ * based on the contents of `data`(simd_data) and according to
+ * the multi-vector data type `dtype` on range
+ * (simd_data_[vu8x2:vf64x2])-(simd_data_[vu8x3:vf64x3]).
+ *
+ * Return NULL and a Python exception on failure, otherwise new reference.
+ */
+static PyObject *
+simd_vectorx_to_tuple(simd_data data, simd_data_type dtype);
+
+/************************************
+ ** Declarations (inc_arg)
+ ************************************/
+typedef struct
+{
+    simd_data_type dtype;
+    simd_data data;
+    // set by simd_arg_converter()
+    PyObject *obj;
+} simd_arg;
+/**
+ * The following functions gather all conversions between all data types
+ * and they can be used instead of all the functions above.
+ */
+/**
+ * Convert a Python object `obj` into simd_data `arg->data` according to the
+ * required data type `arg->dtype`.
+ *
+ * Return -1 and raise Python exception on failure, otherwise return 0.
+ *
+ * Notes:
+ *  - requires `simd_arg_free()` or `simd_sequence_free()`
+ *    to free allocated C array, in case of sequence data types.
+ *  - the number of minimum acceptable elements for sequence data
+ *    types is the number of lanes of the equivalent vector data type.
+ *
+ * Example #1:
+ ** simd_arg arg = {.dtype = simd_data_qu8};
+ ** if (simd_arg_from_obj(seq_obj, &arg) < 0) {
+ **     // fails to convert a python sequence object to C array of uint8
+ **     return;
+ ** }
+ ** npyv_u8 v_u8 = npyv_load_u8(arg.data.qu8);
+ ** ...
+ ** simd_arg_free(&arg);
+ *
+ * Example #2:
+ ** simd_arg arg = {.dtype = simd_data_vf32};
+ ** if (simd_arg_from_obj(vector_obj, &arg) < 0) {
+ **     // fails to convert a python simd_vector to NPYV vector
+ **     return;
+ ** }
+ ** npyv_f32 add_one = npyv_add_f32(arg.data.vf32, npyv_setall_f32(1));
+ ** ...
+ */
+static int
+simd_arg_from_obj(PyObject *obj, simd_arg *arg);
+/**
+ * Convert a simd_data `arg->data` into a Python object according to the
+ * required data type `arg->dtype`.
+ *
+ * Return NULL and raise Python exception on failure, otherwise return
+ * new reference.
+ *
+ * Example:
+ ** simd_arg arg = {.dtype = simd_data_u32, .data = {.u32 = 0xffffffff}};
+ ** PyObject *obj = simd_arg_to_obj(&arg);
+ ** if (obj == NULL) {
+ **    // fails to convert C uint32 to Python integer.
+ **    return;
+ ** }
+ **
+ */
+static PyObject *
+simd_arg_to_obj(const simd_arg *arg);
+/**
+ * Converter function similar to simd_arg_from_obj(), intended to be
+ * passed to PyArg_Parse*() together with the "O&" format unit.
+ *
+ * Notes:
+ *  - requires `simd_arg_free()` or `simd_sequence_free()`
+ *    to free allocated C array, in case of sequence data types.
+ *  - the number of minimum acceptable elements for sequence data
+ *    types is the number of lanes of the equivalent vector data type.
+ *  - use 'arg->obj' to retrieve the parameter obj.
+ *
+ * Example:
+ **  simd_arg seq_f32 = {.dtype = simd_data_qf32};
+ **  simd_arg vec_f32 = {.dtype = simd_data_vf32};
+ **  if (!PyArg_ParseTuple(
+ **     args, "O&O&:add_sum_f32",
+ **     simd_arg_converter, &seq_f32,
+ **     simd_arg_converter, &vec_f32
+ **  )) {
+ **     // fail
+ **     return;
+ **  }
+ **  npyv_f32 load_a = npyv_load_f32(seq_f32.data.qf32);
+ **  npyv_f32 sum = npyv_add_f32(load_a, vec_f32.data.vf32);
+ **  ...
+ **  simd_arg_free(&seq_f32);
+ */
+static int
+simd_arg_converter(PyObject *obj, simd_arg *arg);
+/**
+ * Free the allocated C array, if the arg holds a sequence data type.
+ */
+static void
+simd_arg_free(simd_arg *arg);
+
+#endif // NPY_SIMD
+#endif // _SIMD_SIMD_INC_H_
+
diff --git a/numpy/core/src/_generated/_umath_tests.c b/numpy/core/src/_generated/_umath_tests.c
new file mode 100644
index 000000000000..514e756541cd
--- /dev/null
+++ b/numpy/core/src/_generated/_umath_tests.c
@@ -0,0 +1,1122 @@
+#line 1 "numpy/core/src/umath/_umath_tests.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/* -*- c -*- */
+
+/*
+ *****************************************************************************
+ **                            INCLUDES                                     **
+ *****************************************************************************
+ */
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#if defined(NPY_INTERNAL_BUILD)
+#undef NPY_INTERNAL_BUILD
+#endif
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/npy_math.h"
+
+#include "npy_pycompat.h"
+
+#include "npy_config.h"
+#include "npy_cpu_features.h"
+#include "npy_cpu_dispatch.h"
+#include "numpy/npy_cpu.h"
+#include "npy_import.h"
+#include "numpy/experimental_dtype_api.h"
+
+
+/*
+ *****************************************************************************
+ **                            BASICS                                       **
+ *****************************************************************************
+ */
+/* INIT_OUTER_LOOP_k: pop the outer length (dN) and k outer strides (s0..). */
+#define INIT_OUTER_LOOP_1       \
+    npy_intp dN = *dimensions++;\
+    npy_intp N_;                \
+    npy_intp s0 = *steps++;
+
+#define INIT_OUTER_LOOP_2       \
+    INIT_OUTER_LOOP_1           \
+    npy_intp s1 = *steps++;
+
+#define INIT_OUTER_LOOP_3       \
+    INIT_OUTER_LOOP_2           \
+    npy_intp s2 = *steps++;
+
+#define INIT_OUTER_LOOP_4       \
+    INIT_OUTER_LOOP_3           \
+    npy_intp s3 = *steps++;
+/* BEGIN_OUTER_LOOP_k: run dN iterations, advancing args[0..k-1] by s0..s(k-1). */
+#define BEGIN_OUTER_LOOP_2      \
+    for (N_ = 0; N_ < dN; N_++, args[0] += s0, args[1] += s1) {
+
+#define BEGIN_OUTER_LOOP_3      \
+    for (N_ = 0; N_ < dN; N_++, args[0] += s0, args[1] += s1, args[2] += s2) {
+
+#define BEGIN_OUTER_LOOP_4      \
+    for (N_ = 0; N_ < dN; N_++, args[0] += s0, args[1] += s1, args[2] += s2, args[3] += s3) {
+/* END_OUTER_LOOP: closes the `for` body opened by BEGIN_OUTER_LOOP_k. */
+#define END_OUTER_LOOP  }
+
+
+/*
+ *****************************************************************************
+ **                             UFUNC LOOPS                                 **
+ *****************************************************************************
+ */
+
+static void
+always_error_loop(
+        char **NPY_UNUSED(args), npy_intp const *NPY_UNUSED(dimensions),
+        npy_intp const *NPY_UNUSED(steps), void *NPY_UNUSED(func))
+{
+    /* Ignores all operands; only raises RuntimeError under NPY_ALLOW_C_API. */
+    NPY_ALLOW_C_API_DEF
+    NPY_ALLOW_C_API;
+    PyErr_SetString(PyExc_RuntimeError, "How unexpected :)!");
+    NPY_DISABLE_C_API;
+}
+
+
+char *inner1d_signature = "(i),(i)->()";
+
+#line 90
+
+/*
+ *  This implements the function
+ *        out[n] = sum_i { in1[n, i] * in2[n, i] }.
+ */
+
+static void
+LONG_inner1d(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    INIT_OUTER_LOOP_3
+    /* After the macro, dimensions/steps point at the core (i) entries. */
+    const npy_intp core_len = dimensions[0];
+    const npy_intp stride_a = steps[0], stride_b = steps[1];
+    BEGIN_OUTER_LOOP_3
+        const char *a = args[0];
+        const char *b = args[1];
+        npy_long acc = 0;
+        npy_intp k;
+        for (k = 0; k < core_len; k++, a += stride_a, b += stride_b) {
+            acc += (*(const npy_long *)a) * (*(const npy_long *)b);
+        }
+        *(npy_long *)args[2] = acc;
+    END_OUTER_LOOP
+}
+
+
+#line 90
+
+/*
+ *  This implements the function
+ *        out[n] = sum_i { in1[n, i] * in2[n, i] }.
+ */
+
+static void
+DOUBLE_inner1d(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    INIT_OUTER_LOOP_3
+    /* After the macro, dimensions/steps point at the core (i) entries. */
+    const npy_intp core_len = dimensions[0];
+    const npy_intp stride_a = steps[0], stride_b = steps[1];
+    BEGIN_OUTER_LOOP_3
+        const char *a = args[0];
+        const char *b = args[1];
+        npy_double acc = 0;
+        npy_intp k;
+        for (k = 0; k < core_len; k++, a += stride_a, b += stride_b) {
+            acc += (*(const npy_double *)a) * (*(const npy_double *)b);
+        }
+        *(npy_double *)args[2] = acc;
+    END_OUTER_LOOP
+}
+
+
+
+char *innerwt_signature = "(i),(i),(i)->()";
+
+#line 124
+
+
+/*
+ *  This implements the function
+ *        out[n] = sum_i { in1[n, i] * in2[n, i] * in3[n, i] }.
+ */
+
+static void
+LONG_innerwt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    INIT_OUTER_LOOP_4
+    /* After the macro, dimensions/steps point at the core (i) entries. */
+    const npy_intp core_len = dimensions[0];
+    const npy_intp a_step = steps[0], b_step = steps[1], w_step = steps[2];
+    BEGIN_OUTER_LOOP_4
+        const char *a = args[0], *b = args[1];
+        const char *w = args[2];
+        npy_long acc = 0;
+        npy_intp k;
+        for (k = 0; k < core_len; k++, a += a_step, b += b_step, w += w_step) {
+            acc += (*(const npy_long *)a) * (*(const npy_long *)b)
+                                          * (*(const npy_long *)w);
+        }
+        *(npy_long *)args[3] = acc;
+    END_OUTER_LOOP
+}
+
+
+#line 124
+
+
+/*
+ *  This implements the function
+ *        out[n] = sum_i { in1[n, i] * in2[n, i] * in3[n, i] }.
+ */
+
+static void
+DOUBLE_innerwt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    INIT_OUTER_LOOP_4
+    /* After the macro, dimensions/steps point at the core (i) entries. */
+    const npy_intp core_len = dimensions[0];
+    const npy_intp a_step = steps[0], b_step = steps[1], w_step = steps[2];
+    BEGIN_OUTER_LOOP_4
+        const char *a = args[0], *b = args[1];
+        const char *w = args[2];
+        npy_double acc = 0;
+        npy_intp k;
+        for (k = 0; k < core_len; k++, a += a_step, b += b_step, w += w_step) {
+            acc += (*(const npy_double *)a) * (*(const npy_double *)b)
+                                            * (*(const npy_double *)w);
+        }
+        *(npy_double *)args[3] = acc;
+    END_OUTER_LOOP
+}
+
+
+
+char *matrix_multiply_signature = "(m,n),(n,p)->(m,p)";
+/* for use with matrix_multiply code, but different signature */
+char *matmul_signature = "(m?,n),(n,p?)->(m?,p?)";
+
+#line 162
+
+/*
+ *  This implements the function
+ *        out[k, m, p] = sum_n { in1[k, m, n] * in2[k, n, p] }.
+ */
+
+static void
+FLOAT_matrix_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* no BLAS is available; plain triple loop over the m, n and p core axes */
+    INIT_OUTER_LOOP_3
+    npy_intp dm = dimensions[0];
+    npy_intp dn = dimensions[1];
+    npy_intp dp = dimensions[2];
+    npy_intp m,n,p;
+    npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+         os_m=steps[4], os_p=steps[5];
+    npy_intp ib1_n = is1_n*dn;  /* byte span of one full n-sweep of in1 */
+    npy_intp ib2_n = is2_n*dn;  /* byte span of one full n-sweep of in2 */
+    npy_intp ib2_p = is2_p*dp;  /* byte span of one full p-sweep of in2 */
+    npy_intp ob_p  = os_p *dp;  /* byte span of one full p-sweep of out */
+    if (dn == 0) {
+        /* No operand, need to zero the output */
+        BEGIN_OUTER_LOOP_3
+            char *op=args[2];
+            for (m = 0; m < dm; m++) {
+                for (p = 0; p < dp; p++) {
+                    *(npy_float *)op = 0;
+                    op  +=  os_p;
+                }
+                op  +=  os_m - ob_p;  /* rewind the p-sweep, step to next row */
+            }
+        END_OUTER_LOOP
+        return;
+    }
+    BEGIN_OUTER_LOOP_3
+        char *ip1=args[0], *ip2=args[1], *op=args[2];
+        for (m = 0; m < dm; m++) {
+            for (n = 0; n < dn; n++) {
+                npy_float val1 = (*(npy_float *)ip1);  /* in1[m, n] */
+                for (p = 0; p < dp; p++) {
+                    if (n == 0) *(npy_float *)op = 0;  /* first term: reset out[m, p] */
+                    *(npy_float *)op += val1 * (*(npy_float *)ip2);
+                    ip2 += is2_p;
+                    op  +=  os_p;
+                }
+                ip2 -= ib2_p;  /* undo the p-sweep on in2 and out ... */
+                op  -=  ob_p;
+                ip1 += is1_n;  /* ... then advance both inputs along n */
+                ip2 += is2_n;
+            }
+            ip1 -= ib1_n;  /* undo the n-sweep on both inputs ... */
+            ip2 -= ib2_n;
+            ip1 += is1_m;  /* ... then move in1 and out to the next row m */
+            op  +=  os_m;
+        }
+    END_OUTER_LOOP
+}
+
+
+#line 162
+
+/*
+ *  This implements the function
+ *        out[k, m, p] = sum_n { in1[k, m, n] * in2[k, n, p] }.
+ */
+
+static void
+DOUBLE_matrix_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* no BLAS is available; plain triple loop over the m, n and p core axes */
+    INIT_OUTER_LOOP_3
+    npy_intp dm = dimensions[0];
+    npy_intp dn = dimensions[1];
+    npy_intp dp = dimensions[2];
+    npy_intp m,n,p;
+    npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+         os_m=steps[4], os_p=steps[5];
+    npy_intp ib1_n = is1_n*dn;  /* byte span of one full n-sweep of in1 */
+    npy_intp ib2_n = is2_n*dn;  /* byte span of one full n-sweep of in2 */
+    npy_intp ib2_p = is2_p*dp;  /* byte span of one full p-sweep of in2 */
+    npy_intp ob_p  = os_p *dp;  /* byte span of one full p-sweep of out */
+    if (dn == 0) {
+        /* No operand, need to zero the output */
+        BEGIN_OUTER_LOOP_3
+            char *op=args[2];
+            for (m = 0; m < dm; m++) {
+                for (p = 0; p < dp; p++) {
+                    *(npy_double *)op = 0;
+                    op  +=  os_p;
+                }
+                op  +=  os_m - ob_p;  /* rewind the p-sweep, step to next row */
+            }
+        END_OUTER_LOOP
+        return;
+    }
+    BEGIN_OUTER_LOOP_3
+        char *ip1=args[0], *ip2=args[1], *op=args[2];
+        for (m = 0; m < dm; m++) {
+            for (n = 0; n < dn; n++) {
+                npy_double val1 = (*(npy_double *)ip1);  /* in1[m, n] */
+                for (p = 0; p < dp; p++) {
+                    if (n == 0) *(npy_double *)op = 0;  /* first term: reset out[m, p] */
+                    *(npy_double *)op += val1 * (*(npy_double *)ip2);
+                    ip2 += is2_p;
+                    op  +=  os_p;
+                }
+                ip2 -= ib2_p;  /* undo the p-sweep on in2 and out ... */
+                op  -=  ob_p;
+                ip1 += is1_n;  /* ... then advance both inputs along n */
+                ip2 += is2_n;
+            }
+            ip1 -= ib1_n;  /* undo the n-sweep on both inputs ... */
+            ip2 -= ib2_n;
+            ip1 += is1_m;  /* ... then move in1 and out to the next row m */
+            op  +=  os_m;
+        }
+    END_OUTER_LOOP
+}
+
+
+#line 162
+
+/*
+ *  This implements the function
+ *        out[k, m, p] = sum_n { in1[k, m, n] * in2[k, n, p] }.
+ */
+
+static void
+LONG_matrix_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* no BLAS is available; plain triple loop over the m, n and p core axes */
+    INIT_OUTER_LOOP_3
+    npy_intp dm = dimensions[0];
+    npy_intp dn = dimensions[1];
+    npy_intp dp = dimensions[2];
+    npy_intp m,n,p;
+    npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+         os_m=steps[4], os_p=steps[5];
+    npy_intp ib1_n = is1_n*dn;  /* byte span of one full n-sweep of in1 */
+    npy_intp ib2_n = is2_n*dn;  /* byte span of one full n-sweep of in2 */
+    npy_intp ib2_p = is2_p*dp;  /* byte span of one full p-sweep of in2 */
+    npy_intp ob_p  = os_p *dp;  /* byte span of one full p-sweep of out */
+    if (dn == 0) {
+        /* No operand, need to zero the output */
+        BEGIN_OUTER_LOOP_3
+            char *op=args[2];
+            for (m = 0; m < dm; m++) {
+                for (p = 0; p < dp; p++) {
+                    *(npy_long *)op = 0;
+                    op  +=  os_p;
+                }
+                op  +=  os_m - ob_p;  /* rewind the p-sweep, step to next row */
+            }
+        END_OUTER_LOOP
+        return;
+    }
+    BEGIN_OUTER_LOOP_3
+        char *ip1=args[0], *ip2=args[1], *op=args[2];
+        for (m = 0; m < dm; m++) {
+            for (n = 0; n < dn; n++) {
+                npy_long val1 = (*(npy_long *)ip1);  /* in1[m, n] */
+                for (p = 0; p < dp; p++) {
+                    if (n == 0) *(npy_long *)op = 0;  /* first term: reset out[m, p] */
+                    *(npy_long *)op += val1 * (*(npy_long *)ip2);
+                    ip2 += is2_p;
+                    op  +=  os_p;
+                }
+                ip2 -= ib2_p;  /* undo the p-sweep on in2 and out ... */
+                op  -=  ob_p;
+                ip1 += is1_n;  /* ... then advance both inputs along n */
+                ip2 += is2_n;
+            }
+            ip1 -= ib1_n;  /* undo the n-sweep on both inputs ... */
+            ip2 -= ib2_n;
+            ip1 += is1_m;  /* ... then move in1 and out to the next row m */
+            op  +=  os_m;
+        }
+    END_OUTER_LOOP
+}
+
+
+
+char *cross1d_signature = "(3),(3)->(3)";
+
+#line 230
+
+/*
+ *  This implements the cross product:
+ *        out[n, 0] = in1[n, 1]*in2[n, 2] - in1[n, 2]*in2[n, 1]
+ *        out[n, 1] = in1[n, 2]*in2[n, 0] - in1[n, 0]*in2[n, 2]
+ *        out[n, 2] = in1[n, 0]*in2[n, 1] - in1[n, 1]*in2[n, 0]
+ */
+static void
+LONG_cross1d(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    INIT_OUTER_LOOP_3
+    const npy_intp a_step = steps[0], b_step = steps[1], out_step = steps[2];
+    BEGIN_OUTER_LOOP_3
+        /* Load every component of both vectors before any output store. */
+        const char *a = args[0];
+        const char *b = args[1];
+        char *out = args[2];
+        const npy_long ax = *(const npy_long *)a;
+        const npy_long ay = *(const npy_long *)(a + a_step);
+        const npy_long az = *(const npy_long *)(a + 2*a_step);
+        const npy_long bx = *(const npy_long *)b;
+        const npy_long by = *(const npy_long *)(b + b_step);
+        const npy_long bz = *(const npy_long *)(b + 2*b_step);
+
+        *(npy_long *)out                = ay * bz - az * by;
+        *(npy_long *)(out + out_step)   = az * bx - ax * bz;
+        *(npy_long *)(out + 2*out_step) = ax * by - ay * bx;
+    END_OUTER_LOOP
+}
+
+
+#line 230
+
+/*
+ *  This implements the cross product:
+ *        out[n, 0] = in1[n, 1]*in2[n, 2] - in1[n, 2]*in2[n, 1]
+ *        out[n, 1] = in1[n, 2]*in2[n, 0] - in1[n, 0]*in2[n, 2]
+ *        out[n, 2] = in1[n, 0]*in2[n, 1] - in1[n, 1]*in2[n, 0]
+ */
+static void
+DOUBLE_cross1d(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    INIT_OUTER_LOOP_3
+    const npy_intp a_step = steps[0], b_step = steps[1], out_step = steps[2];
+    BEGIN_OUTER_LOOP_3
+        /* Load every component of both vectors before any output store. */
+        const char *a = args[0];
+        const char *b = args[1];
+        char *out = args[2];
+        const npy_double ax = *(const npy_double *)a;
+        const npy_double ay = *(const npy_double *)(a + a_step);
+        const npy_double az = *(const npy_double *)(a + 2*a_step);
+        const npy_double bx = *(const npy_double *)b;
+        const npy_double by = *(const npy_double *)(b + b_step);
+        const npy_double bz = *(const npy_double *)(b + 2*b_step);
+
+        *(npy_double *)out                = ay * bz - az * by;
+        *(npy_double *)(out + out_step)   = az * bx - ax * bz;
+        *(npy_double *)(out + 2*out_step) = ax * by - ay * bx;
+    END_OUTER_LOOP
+}
+
+
+
+char *euclidean_pdist_signature = "(n,d)->(p)";
+
+#line 270
+
+/*
+ *  This implements the function
+ *        out[j*(2*n-3-j)/2+k-1] = sqrt(sum_d { (in1[j, d] - in1[k, d])^2 })
+ *  with 0 <= j < k < n, i.e. computes all unique pairwise euclidean distances.
+ */
+
+static void
+FLOAT_euclidean_pdist(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                       void *NPY_UNUSED(func))
+{
+    INIT_OUTER_LOOP_2
+    npy_intp len_n = *dimensions++;
+    npy_intp len_d = *dimensions++;
+    npy_intp stride_n = *steps++;
+    npy_intp stride_d = *steps++;
+    npy_intp stride_p = *steps;
+    /* the output core dimension (p) must equal the number of unordered pairs */
+    assert(len_n * (len_n - 1) / 2 == *dimensions);
+
+    BEGIN_OUTER_LOOP_2
+        const char *data_this = (const char *)args[0];
+        char *data_out = args[1];
+        npy_intp n;
+        for (n = 0; n < len_n; ++n) {
+            const char *data_that = data_this + stride_n;  /* row n + 1 */
+            npy_intp nn;
+            for (nn = n + 1; nn < len_n; ++nn) {
+                const char *ptr_this = data_this;
+                const char *ptr_that = data_that;
+                npy_float out = 0;  /* squared-distance accumulator */
+                npy_intp d;
+                for (d = 0; d < len_d; ++d) {
+                    const npy_float delta = *(const npy_float *)ptr_this -
+                                        *(const npy_float *)ptr_that;
+                    out += delta * delta;
+                    ptr_this += stride_d;
+                    ptr_that += stride_d;
+                }
+                *(npy_float *)data_out = sqrtf(out);
+                data_that += stride_n;
+                data_out += stride_p;  /* outputs are written in (n, nn) loop order */
+            }
+            data_this += stride_n;
+        }
+    END_OUTER_LOOP
+}
+
+
+#line 270
+
+/*
+ *  This implements the function
+ *        out[j*(2*n-3-j)/2+k-1] = sqrt(sum_d { (in1[j, d] - in1[k, d])^2 })
+ *  with 0 <= j < k < n, i.e. computes all unique pairwise euclidean distances.
+ */
+
+static void
+DOUBLE_euclidean_pdist(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                       void *NPY_UNUSED(func))
+{
+    INIT_OUTER_LOOP_2
+    npy_intp len_n = *dimensions++;
+    npy_intp len_d = *dimensions++;
+    npy_intp stride_n = *steps++;
+    npy_intp stride_d = *steps++;
+    npy_intp stride_p = *steps;
+    /* the output core dimension (p) must equal the number of unordered pairs */
+    assert(len_n * (len_n - 1) / 2 == *dimensions);
+
+    BEGIN_OUTER_LOOP_2
+        const char *data_this = (const char *)args[0];
+        char *data_out = args[1];
+        npy_intp n;
+        for (n = 0; n < len_n; ++n) {
+            const char *data_that = data_this + stride_n;  /* row n + 1 */
+            npy_intp nn;
+            for (nn = n + 1; nn < len_n; ++nn) {
+                const char *ptr_this = data_this;
+                const char *ptr_that = data_that;
+                npy_double out = 0;  /* squared-distance accumulator */
+                npy_intp d;
+                for (d = 0; d < len_d; ++d) {
+                    const npy_double delta = *(const npy_double *)ptr_this -
+                                        *(const npy_double *)ptr_that;
+                    out += delta * delta;
+                    ptr_this += stride_d;
+                    ptr_that += stride_d;
+                }
+                *(npy_double *)data_out = sqrt(out);
+                data_that += stride_n;
+                data_out += stride_p;  /* outputs are written in (n, nn) loop order */
+            }
+            data_this += stride_n;
+        }
+    END_OUTER_LOOP
+}
+
+
+
+char *cumsum_signature = "(i)->(i)";
+
+/*
+ *  This implements the function
+ *        out[n] = sum_i^n in[i]
+ */
+
+#line 332
+
+static void
+LONG_cumsum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    INIT_OUTER_LOOP_2
+    const npy_intp core_len = dimensions[0];
+    const npy_intp in_step = steps[0], out_step = steps[1];
+    BEGIN_OUTER_LOOP_2
+        char *src = args[0], *dst = args[1];
+        npy_long running = 0;  /* prefix sum of the current row */
+        npy_intp k;
+        for (k = 0; k < core_len; k++, src += in_step, dst += out_step) {
+            running += *(npy_long *)src;
+            *(npy_long *)dst = running;
+        }
+    END_OUTER_LOOP
+}
+
+
+#line 332
+
+static void
+DOUBLE_cumsum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    INIT_OUTER_LOOP_2
+    const npy_intp core_len = dimensions[0];
+    const npy_intp in_step = steps[0], out_step = steps[1];
+    BEGIN_OUTER_LOOP_2
+        char *src = args[0], *dst = args[1];
+        npy_double running = 0;  /* prefix sum of the current row */
+        npy_intp k;
+        for (k = 0; k < core_len; k++, src += in_step, dst += out_step) {
+            running += *(npy_double *)src;
+            *(npy_double *)dst = running;
+        }
+    END_OUTER_LOOP
+}
+
+
+
+static int
+INT32_negative(PyArrayMethod_Context *NPY_UNUSED(context),
+               char **args, npy_intp const *dimensions,
+               npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    const npy_intp count = dimensions[0];
+    const npy_intp in_step = steps[0], out_step = steps[1];
+    const char *src = args[0];
+    char *dst = args[1];
+    npy_intp k;
+    /* Element 3 is written as -100 rather than the negated input. */
+    for (k = 0; k < count; k++) {
+        *(int32_t *)dst = (k == 3) ? -100 : - *(const int32_t *)src;
+        src += in_step;
+        dst += out_step;
+    }
+    return 0;
+}
+
+
+static int
+INT32_negative_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                           char * const*args, npy_intp const *dimensions,
+                           npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const base = args[0];
+    const char *index_ptr = args[1];
+    const npy_intp base_step = steps[0];
+    const npy_intp index_step = steps[1];
+    /* steps[3] is the length added to negative indices to wrap them */
+    const npy_intp wrap_len = steps[3];
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)index_ptr;
+        int32_t *slot;
+        if (pos < 0) {
+            pos += wrap_len;
+        }
+        slot = (int32_t *)(base + base_step * pos);
+        /* the index processed at k == 3 stores -200; the rest negate in place */
+        *slot = (k == 3) ? -200 : - *slot;
+        index_ptr += index_step;
+    }
+    return 0;
+}
+
+
+
+/*  The following lines were generated using a slightly modified
+    version of code_generators/generate_umath.py and adding these
+    lines to defdict:
+
+defdict = {
+'inner1d' :
+    Ufunc(2, 1, None_,
+        r'''inner on the last dimension and broadcast on the rest \n"
+        "     \"(i),(i)->()\" \n''',
+        TD('ld'),
+        ),
+'innerwt' :
+    Ufunc(3, 1, None_,
+        r'''inner1d with a weight argument \n"
+        "     \"(i),(i),(i)->()\" \n''',
+        TD('ld'),
+        ),
+}
+
+*/
+
+static PyUFuncGenericFunction always_error_functions[] = { always_error_loop };
+static void *always_error_data[] = { (void *)NULL };
+static char always_error_signatures[] = { NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE };
+static PyUFuncGenericFunction inner1d_functions[] = { LONG_inner1d, DOUBLE_inner1d };
+static void *inner1d_data[] = { (void *)NULL, (void *)NULL };
+static char inner1d_signatures[] = { NPY_LONG, NPY_LONG, NPY_LONG, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE };
+static PyUFuncGenericFunction innerwt_functions[] = { LONG_innerwt, DOUBLE_innerwt };
+static void *innerwt_data[] = { (void *)NULL, (void *)NULL };
+static char innerwt_signatures[] = { NPY_LONG, NPY_LONG, NPY_LONG, NPY_LONG, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE };
+static PyUFuncGenericFunction matrix_multiply_functions[] = { LONG_matrix_multiply, FLOAT_matrix_multiply, DOUBLE_matrix_multiply };
+static void *matrix_multiply_data[] = { (void *)NULL, (void *)NULL, (void *)NULL };
+static char matrix_multiply_signatures[] = { NPY_LONG, NPY_LONG, NPY_LONG,  NPY_FLOAT, NPY_FLOAT, NPY_FLOAT,  NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE };
+static PyUFuncGenericFunction cross1d_functions[] = { LONG_cross1d, DOUBLE_cross1d };
+static void *cross1d_data[] = { (void *)NULL, (void *)NULL };
+static char cross1d_signatures[] = { NPY_LONG, NPY_LONG, NPY_LONG, NPY_DOUBLE, NPY_DOUBLE, NPY_DOUBLE };
+static PyUFuncGenericFunction euclidean_pdist_functions[] =
+                            { FLOAT_euclidean_pdist, DOUBLE_euclidean_pdist };
+static void *euclidean_pdist_data[] = { (void *)NULL, (void *)NULL };
+static char euclidean_pdist_signatures[] = { NPY_FLOAT, NPY_FLOAT,
+                                             NPY_DOUBLE, NPY_DOUBLE };
+
+static PyUFuncGenericFunction cumsum_functions[] = { LONG_cumsum, DOUBLE_cumsum };
+static void *cumsum_data[] = { (void *)NULL, (void *)NULL };
+static char cumsum_signatures[] = { NPY_LONG, NPY_LONG, NPY_DOUBLE, NPY_DOUBLE };
+
+
+/*
+ * Store ufunc `f` under `name` in `dictionary`, consuming the reference to
+ * `f`.  Returns -1 (with an exception set) if `f` is NULL, i.e. its creation
+ * failed, or if the dictionary insertion fails; returns 0 otherwise.
+ */
+static int
+add_ufunc_to_dict(PyObject *dictionary, const char *name, PyObject *f)
+{
+    int ret;
+    if (f == NULL) {
+        return -1;
+    }
+    ret = PyDict_SetItemString(dictionary, name, f);
+    Py_DECREF(f);
+    return ret;
+}
+
+/*
+ * Create every test (g)ufunc from the loop tables above and register it in
+ * `dictionary` under its Python-visible name.
+ * Returns 0 on success, -1 with a Python exception set on the first failure.
+ */
+static int
+addUfuncs(PyObject *dictionary) {
+    PyObject *f;
+
+    f = PyUFunc_FromFuncAndData(always_error_functions, always_error_data,
+            always_error_signatures, 1, 2, 1, PyUFunc_None, "always_error",
+            "simply, broken, ufunc that sets an error (but releases the GIL).",
+            0);
+    if (add_ufunc_to_dict(dictionary, "always_error", f) < 0) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignature(always_error_functions,
+            always_error_data, always_error_signatures, 1, 2, 1, PyUFunc_None,
+            "always_error_gufunc",
+            "simply, broken, gufunc that sets an error (but releases the GIL).",
+            0, "(i),()->()");
+    if (add_ufunc_to_dict(dictionary, "always_error_gufunc", f) < 0) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignature(inner1d_functions, inner1d_data,
+                    inner1d_signatures, 2, 2, 1, PyUFunc_None, "inner1d",
+                    "inner on the last dimension and broadcast on the rest \n"
+                    "     \"(i),(i)->()\" \n",
+                    0, inner1d_signature);
+    /*
+     * yes, this should not happen, but I (MHvK) just spent an hour looking at
+     * segfaults because I screwed up something that seemed totally unrelated.
+     */
+    if (add_ufunc_to_dict(dictionary, "inner1d", f) < 0) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignature(innerwt_functions, innerwt_data,
+                    innerwt_signatures, 2, 3, 1, PyUFunc_None, "innerwt",
+                    "inner1d with a weight argument \n"
+                    "     \"(i),(i),(i)->()\" \n",
+                    0, innerwt_signature);
+    if (add_ufunc_to_dict(dictionary, "innerwt", f) < 0) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignature(matrix_multiply_functions,
+                    matrix_multiply_data, matrix_multiply_signatures,
+                    3, 2, 1, PyUFunc_None, "matrix_multiply",
+                    "matrix multiplication on last two dimensions \n"
+                    "     \"(m,n),(n,p)->(m,p)\" \n",
+                    0, matrix_multiply_signature);
+    if (add_ufunc_to_dict(dictionary, "matrix_multiply", f) < 0) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignature(matrix_multiply_functions,
+                    matrix_multiply_data, matrix_multiply_signatures,
+                    3, 2, 1, PyUFunc_None, "matmul",
+                    "matmul on last two dimensions, with some being optional\n"
+                    "     \"(m?,n),(n,p?)->(m?,p?)\" \n",
+                    0, matmul_signature);
+    if (add_ufunc_to_dict(dictionary, "matmul", f) < 0) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignature(euclidean_pdist_functions,
+                    euclidean_pdist_data, euclidean_pdist_signatures,
+                    2, 1, 1, PyUFunc_None, "euclidean_pdist",
+                    "pairwise euclidean distance on last two dimensions \n"
+                    "     \"(n,d)->(p)\" \n",
+                    0, euclidean_pdist_signature);
+    if (add_ufunc_to_dict(dictionary, "euclidean_pdist", f) < 0) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignature(cumsum_functions,
+                    cumsum_data, cumsum_signatures,
+                    2, 1, 1, PyUFunc_None, "cumsum",
+                    "Cumulative sum of the input (n)->(n)\n",
+                    0, cumsum_signature);
+    if (add_ufunc_to_dict(dictionary, "cumsum", f) < 0) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignature(inner1d_functions, inner1d_data,
+                    inner1d_signatures, 2, 2, 1, PyUFunc_None, "inner1d_no_doc",
+                    NULL,
+                    0, inner1d_signature);
+    if (add_ufunc_to_dict(dictionary, "inner1d_no_doc", f) < 0) {
+        return -1;
+    }
+    f = PyUFunc_FromFuncAndDataAndSignature(cross1d_functions, cross1d_data,
+                    cross1d_signatures, 2, 2, 1, PyUFunc_None, "cross1d",
+                    "cross product on the last dimension and broadcast on the rest \n"\
+                    "     \"(3),(3)->(3)\" \n",
+                    0, cross1d_signature);
+    if (add_ufunc_to_dict(dictionary, "cross1d", f) < 0) {
+        return -1;
+    }
+
+    f = PyUFunc_FromFuncAndDataAndSignature(NULL, NULL,
+            NULL, 0, 0, 0, PyUFunc_None, "_pickleable_module_global.ufunc",
+            "A dotted name for pickle testing, does nothing.", 0, NULL);
+    if (add_ufunc_to_dict(dictionary, "_pickleable_module_global_ufunc", f) < 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
+
+static PyObject *
+UMath_Tests_test_signature(PyObject *NPY_UNUSED(dummy), PyObject *args)
+{
+    /* Parses (nin, nout, signature); returns a tuple of the ufunc core_* data. */
+    int nin, nout, i, core_enabled, core_num_ixs = 0;
+    PyObject *signature=NULL, *sig_str=NULL;
+    PyUFuncObject *f=NULL;
+    PyObject *core_num_dims=NULL, *core_dim_ixs=NULL;
+    PyObject *core_dim_flags=NULL, *core_dim_sizes=NULL;
+
+    if (!PyArg_ParseTuple(args, "iiO", &nin, &nout, &signature)) {
+        return NULL;
+    }
+
+    if (PyBytes_Check(signature)) {
+        sig_str = signature;
+    } else if (PyUnicode_Check(signature)) {
+        sig_str = PyUnicode_AsUTF8String(signature);
+        if (sig_str == NULL) {
+            /* not encodable as UTF-8; must not reach PyBytes_AS_STRING */
+            return NULL;
+        }
+    } else {
+        PyErr_SetString(PyExc_ValueError, "signature should be a string");
+        return NULL;
+    }
+
+    f = (PyUFuncObject*)PyUFunc_FromFuncAndDataAndSignature(
+        NULL, NULL, NULL, 0, nin, nout, PyUFunc_None, "no name", "doc:none",
+        1, PyBytes_AS_STRING(sig_str));
+    if (sig_str != signature) {
+        Py_DECREF(sig_str);
+    }
+    if (f == NULL) {
+        return NULL;
+    }
+    core_enabled = f->core_enabled;
+    /*
+     * Don't presume the core_* arrays are defined; they currently are even
+     * if core_enabled=0, but there's no real reason they should be.
+     */
+    if (f->core_num_dims != NULL) {
+        core_num_dims = PyTuple_New(f->nargs);
+        if (core_num_dims == NULL) {
+            goto fail;
+        }
+        for (i = 0; i < f->nargs; i++) {
+            PyObject *val = PyLong_FromLong(f->core_num_dims[i]);
+            PyTuple_SET_ITEM(core_num_dims, i, val);
+            core_num_ixs += f->core_num_dims[i];
+        }
+    } else {
+        Py_INCREF(Py_None);
+        core_num_dims = Py_None;
+    }
+    if (f->core_dim_ixs != NULL) {
+        core_dim_ixs = PyTuple_New(core_num_ixs);
+        if (core_dim_ixs == NULL) {
+            goto fail;
+        }
+        for (i = 0; i < core_num_ixs; i++) {
+            PyObject *val = PyLong_FromLong(f->core_dim_ixs[i]);
+            PyTuple_SET_ITEM(core_dim_ixs, i, val);
+        }
+    } else {
+        Py_INCREF(Py_None);
+        core_dim_ixs = Py_None;
+    }
+    if (f->core_dim_flags != NULL) {
+        core_dim_flags = PyTuple_New(f->core_num_dim_ix);
+        if (core_dim_flags == NULL) {
+            goto fail;
+        }
+        for (i = 0; i < f->core_num_dim_ix; i++) {
+            PyObject *val = PyLong_FromLong(f->core_dim_flags[i]);
+            PyTuple_SET_ITEM(core_dim_flags, i, val);
+        }
+    } else {
+        Py_INCREF(Py_None);
+        core_dim_flags = Py_None;
+    }
+    if (f->core_dim_sizes != NULL) {
+        core_dim_sizes = PyTuple_New(f->core_num_dim_ix);
+        if (core_dim_sizes == NULL) {
+            goto fail;
+        }
+        for (i = 0; i < f->core_num_dim_ix; i++) {
+            PyObject *val = PyLong_FromLong(f->core_dim_sizes[i]);
+            PyTuple_SET_ITEM(core_dim_sizes, i, val);
+        }
+    } else {
+        Py_INCREF(Py_None);
+        core_dim_sizes = Py_None;
+    }
+    /* a failed PyLong_FromLong() above leaves a NULL slot and an exception */
+    if (PyErr_Occurred()) {
+        goto fail;
+    }
+    Py_DECREF(f);
+    return Py_BuildValue("iNNNN", core_enabled, core_num_dims,
+                         core_dim_ixs, core_dim_flags, core_dim_sizes);
+
+fail:
+    Py_XDECREF(f);
+    Py_XDECREF(core_num_dims);
+    Py_XDECREF(core_dim_ixs);
+    Py_XDECREF(core_dim_flags);
+    Py_XDECREF(core_dim_sizes);
+    return NULL;
+}
+
+// Testing the utilities of the CPU dispatcher
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "_umath_tests.dispatch.h"
+#endif
+NPY_CPU_DISPATCH_DECLARE(extern const char *_umath_tests_dispatch_var)
+NPY_CPU_DISPATCH_DECLARE(const char *_umath_tests_dispatch_func, (void))
+NPY_CPU_DISPATCH_DECLARE(void _umath_tests_dispatch_attach, (PyObject *list))
+
+static PyObject *
+UMath_Tests_test_dispatch(PyObject *NPY_UNUSED(dummy), PyObject *NPY_UNUSED(dummy2))
+{
+    const char *highest_func, *highest_var;
+    NPY_CPU_DISPATCH_CALL(highest_func = _umath_tests_dispatch_func, ());
+    NPY_CPU_DISPATCH_CALL(highest_var  = _umath_tests_dispatch_var);
+    const char *highest_func_xb = "nobase", *highest_var_xb = "nobase";  /* kept if the _XB macros assign nothing */
+    NPY_CPU_DISPATCH_CALL_XB(highest_func_xb = _umath_tests_dispatch_func, ());
+    NPY_CPU_DISPATCH_CALL_XB(highest_var_xb  = _umath_tests_dispatch_var);
+    /* Build the result dict with keys "func", "var", "func_xb", "var_xb" and "all". */
+    PyObject *dict = PyDict_New(), *item;
+    if (dict == NULL) {
+        return NULL;
+    }
+    #line 707
+    item = PyUnicode_FromString(highest_func);
+    if (item == NULL || PyDict_SetItemString(dict, "func", item) < 0) {
+        goto err;
+    }
+    Py_DECREF(item);  /* the dict holds its own reference now */
+    
+#line 707
+    item = PyUnicode_FromString(highest_var);
+    if (item == NULL || PyDict_SetItemString(dict, "var", item) < 0) {
+        goto err;
+    }
+    Py_DECREF(item);
+    
+#line 707
+    item = PyUnicode_FromString(highest_func_xb);
+    if (item == NULL || PyDict_SetItemString(dict, "func_xb", item) < 0) {
+        goto err;
+    }
+    Py_DECREF(item);
+    
+#line 707
+    item = PyUnicode_FromString(highest_var_xb);
+    if (item == NULL || PyDict_SetItemString(dict, "var_xb", item) < 0) {
+        goto err;
+    }
+    Py_DECREF(item);
+    /* "all": an initially empty list handed to the dispatched attach function(s) */
+    item = PyList_New(0);
+    if (item == NULL || PyDict_SetItemString(dict, "all", item) < 0) {
+        goto err;
+    }
+    NPY_CPU_DISPATCH_CALL_ALL(_umath_tests_dispatch_attach, (item));
+    Py_SETREF(item, NULL);  /* drop our reference; the err path then sees NULL */
+    if (PyErr_Occurred()) {
+        goto err;
+    }
+    return dict;
+err:
+    Py_XDECREF(item);
+    Py_DECREF(dict);
+    return NULL;
+}
+
+static int
+add_INT32_negative_indexed(PyObject *module, PyObject *dict) {
+    if (import_experimental_dtype_api(__EXPERIMENTAL_DTYPE_API_VERSION) < 0) {
+        return -1;
+    }
+    /* Create "indexed_negative", attach the int32 loops, store it in `dict`. */
+    PyObject * negative = PyUFunc_FromFuncAndData(NULL, NULL, NULL, 0, 1, 1,
+                                    PyUFunc_Zero, "indexed_negative", NULL, 0);
+    if (negative == NULL) {
+        return -1;
+    }
+    PyArray_DTypeMeta *dtypes[] = {&PyArray_Int32DType, &PyArray_Int32DType};
+    /* `module` is not used; both loops are registered for int32 -> int32. */
+    PyType_Slot slots[] = {
+        {NPY_METH_contiguous_indexed_loop, INT32_negative_indexed},
+        {NPY_METH_strided_loop, INT32_negative},
+        {0, NULL}
+    };
+
+    PyArrayMethod_Spec spec = {
+        .name = "negative_indexed_loop",
+        .nin = 1,
+        .nout = 1,
+        .dtypes = dtypes,
+        .slots = slots,
+        .flags = NPY_METH_NO_FLOATINGPOINT_ERRORS
+    };
+
+    if (PyUFunc_AddLoopFromSpec(negative, &spec) < 0) {
+        Py_DECREF(negative);
+        return -1;
+    }
+    int res = PyDict_SetItemString(dict, "indexed_negative", negative);
+    Py_DECREF(negative);
+    return res;  /* 0 on success, -1 if the dict insertion failed */
+}
+
+static PyMethodDef UMath_TestsMethods[] = {
+    {"test_signature",  UMath_Tests_test_signature, METH_VARARGS,
+     "Test signature parsing of ufunc. \n"
+     "Arguments: nin nout signature \n"
+     "If fails, it returns NULL. Otherwise it returns a tuple of ufunc "
+     "internals. \n",
+     },
+    {"test_dispatch", UMath_Tests_test_dispatch, METH_NOARGS, NULL},  /* takes no arguments; no docstring */
+    {NULL, NULL, 0, NULL}        /* Sentinel */
+};
+
+static struct PyModuleDef moduledef = {
+        PyModuleDef_HEAD_INIT,
+        "_umath_tests",       /* m_name */
+        NULL,                 /* m_doc */
+        -1,                   /* m_size */
+        UMath_TestsMethods,   /* m_methods */
+        NULL,                 /* m_slots */
+        NULL,                 /* m_traverse */
+        NULL,                 /* m_clear */
+        NULL                  /* m_free */
+};
+
+/* Initialization function for the module */
+PyMODINIT_FUNC PyInit__umath_tests(void) {
+    PyObject *m;
+    PyObject *d;
+    PyObject *version;
+
+    // Initialize CPU features
+    if (npy_cpu_init() < 0) {
+        return NULL;
+    }
+
+    /*
+     * Import the NumPy C-APIs before the module object exists, so the
+     * early returns on import failure have no module reference to leak.
+     */
+    import_array();
+    if (PyErr_Occurred()) {
+        return NULL;
+    }
+    import_ufunc();
+    if (PyErr_Occurred()) {
+        return NULL;
+    }
+
+    m = PyModule_Create(&moduledef);
+    if (m == NULL) {
+        return NULL;
+    }
+    d = PyModule_GetDict(m);
+
+    version = PyUnicode_FromString("0.1");
+    if (version == NULL ||
+            PyDict_SetItemString(d, "__version__", version) < 0) {
+        Py_XDECREF(version);
+        Py_DECREF(m);
+        return NULL;
+    }
+    Py_DECREF(version);
+
+    /* Load the ufunc operators into the module's namespace */
+    if (addUfuncs(d) < 0 || add_INT32_negative_indexed(m, d) < 0) {
+        Py_DECREF(m);
+        PyErr_Print();
+        PyErr_SetString(PyExc_RuntimeError,
+                        "cannot load _umath_tests module.");
+        return NULL;
+    }
+    return m;
+}
+
diff --git a/numpy/core/src/_generated/argfunc.dispatch.c b/numpy/core/src/_generated/argfunc.dispatch.c
new file mode 100644
index 000000000000..132fc7deb899
--- /dev/null
+++ b/numpy/core/src/_generated/argfunc.dispatch.c
@@ -0,0 +1,4374 @@
+#line 1 "numpy/core/src/multiarray/argfunc.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/* -*- c -*- */
+/*@targets
+ ** $maxopt baseline
+ ** sse2 sse42 xop avx2 avx512_skx
+ ** vsx2
+ ** neon asimd
+ ** vx vxe
+ **/
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "numpy/npy_math.h"
+
+#include "arraytypes.h"
+
+#define MIN(a,b) (((a)<(b))?(a):(b))
+
+#if NPY_SIMD
+#if NPY_SIMD > 512 || NPY_SIMD < 0
+    #error "the following 8/16-bit argmax kernel isn't applicable for larger SIMD"
+    // TODO: add a special loop for larger SIMD widths,
+    // e.g. avoiding the x4 unroll should be numerically safe up to 2048-bit SIMD width,
+    // or alternatively expand the indices to 32/64-bit vectors (slower).
+#endif
+#line 32
+#line 37
+static inline npy_intp  // returns the index of the first maximum among ip[0..len)
+simd_argmax_u8(npyv_lanetype_u8 *ip, npy_intp len)  // ip[0] is read unconditionally; presumably callers pass len >= 1
+{
+    npyv_lanetype_u8 s_acc = *ip;  // best value found so far, seeded with ip[0]
+    npy_intp ret_idx = 0, i = 0;  // ret_idx: index of s_acc; i: next element to process
+
+    const int vstep = npyv_nlanes_u8;  // elements per vector
+    const int wstep = vstep*4;  // elements per unrolled iteration (four vectors)
+    npyv_lanetype_u8 d_vindices[npyv_nlanes_u8*4];  // offsets 0..wstep-1 inside one unrolled iteration
+    for (int vi = 0; vi < wstep; ++vi) {
+        d_vindices[vi] = vi;
+    }
+    const npyv_u8 vindices_0 = npyv_load_u8(d_vindices);  // offsets paired with vector a
+    const npyv_u8 vindices_1 = npyv_load_u8(d_vindices + vstep);  // offsets paired with vector b
+    const npyv_u8 vindices_2 = npyv_load_u8(d_vindices + vstep*2);  // offsets paired with vector c
+    const npyv_u8 vindices_3 = npyv_load_u8(d_vindices + vstep*3);  // offsets paired with vector d
+
+    const npy_intp max_block = NPY_MAX_UINT8*wstep & -wstep;  // block size cap: at most NPY_MAX_UINT8 iterations, so i2 survives the cast below
+    npy_intp len0 = len & -wstep;  // vectorized span; a multiple of wstep provided wstep is a power of two
+    while (i < len0) {  // one pass per block of at most max_block elements
+        npyv_u8 acc = npyv_setall_u8(s_acc);  // per-lane best value, restarted from the scalar best
+        npyv_u8 acc_indices = npyv_zero_u8();  // per-lane offset of that value within its iteration
+        npyv_u8 acc_indices_scale = npyv_zero_u8();  // per-lane iteration number (i2) of that value
+
+        npy_intp n = i + MIN(len0 - i, max_block);  // end of the current block
+        npy_intp ik = i, i2 = 0;  // ik: block start; i2: iteration counter inside the block
+        for (; i < n; i += wstep, ++i2) {
+            npyv_u8 vi = npyv_setall_u8((npyv_lanetype_u8)i2);  // current iteration number in every lane
+            npyv_u8 a = npyv_load_u8(ip + i);
+            npyv_u8 b = npyv_load_u8(ip + i + vstep);
+            npyv_u8 c = npyv_load_u8(ip + i + vstep*2);
+            npyv_u8 d = npyv_load_u8(ip + i + vstep*3);
+
+            // compare pairwise with a strict test so that ties keep the lower-index operand
+            npyv_b8 m_ba = npyv_cmpgt_u8(b, a);
+            npyv_b8 m_dc = npyv_cmpgt_u8(d, c);
+            npyv_u8  x_ba = npyv_select_u8(m_ba, b, a);  // winner of (a, b)
+            npyv_u8  x_dc = npyv_select_u8(m_dc, d, c);  // winner of (c, d)
+            npyv_b8 m_dcba = npyv_cmpgt_u8(x_dc, x_ba);
+            npyv_u8  x_dcba = npyv_select_u8(m_dcba, x_dc, x_ba);  // winner of all four vectors
+
+            npyv_u8 idx_ba = npyv_select_u8(m_ba, vindices_1, vindices_0);  // offsets follow the same masks as the values
+            npyv_u8 idx_dc = npyv_select_u8(m_dc, vindices_3, vindices_2);
+            npyv_u8 idx_dcba = npyv_select_u8(m_dcba, idx_dc, idx_ba);
+            npyv_b8 m_acc = npyv_cmpgt_u8(x_dcba, acc);  // lanes where this iteration beats the accumulator
+            acc = npyv_select_u8(m_acc, x_dcba, acc);
+            acc_indices = npyv_select_u8(m_acc, idx_dcba, acc_indices);
+            acc_indices_scale = npyv_select_u8(m_acc, vi, acc_indices_scale);
+        }
+        // reduce: spill the vector accumulators and fold the lanes into s_acc/ret_idx
+        npyv_lanetype_u8 dacc[npyv_nlanes_u8];
+        npyv_lanetype_u8 dacc_i[npyv_nlanes_u8];
+        npyv_lanetype_u8 dacc_s[npyv_nlanes_u8];
+        npyv_store_u8(dacc, acc);
+        npyv_store_u8(dacc_i, acc_indices);
+        npyv_store_u8(dacc_s, acc_indices_scale);
+
+        for (int vi = 0; vi < vstep; ++vi) {
+            if (dacc[vi] > s_acc) {
+                s_acc = dacc[vi];
+                ret_idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];  // block start + iteration*wstep + offset
+            }
+        }
+        // several lanes may hold the same best value: keep the lowest index among them
+        for (int vi = 0; vi < vstep; ++vi) {
+            npy_intp idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+            if (s_acc == dacc[vi] && ret_idx > idx) {
+                ret_idx = idx;
+            }
+        }
+    }
+    for (; i < len; ++i) {  // scalar tail: elements at and after len0
+        npyv_lanetype_u8 a = ip[i];
+        if (a > s_acc) {  // strict, so an earlier equal value keeps its index
+            s_acc = a;
+            ret_idx = i;
+        }
+    }
+    return ret_idx;
+}
+
+#line 37
+static inline npy_intp
+simd_argmin_u8(npyv_lanetype_u8 *ip, npy_intp len)
+{
+    npyv_lanetype_u8 s_acc = *ip;
+    npy_intp ret_idx = 0, i = 0;
+
+    const int vstep = npyv_nlanes_u8;
+    const int wstep = vstep*4;
+    npyv_lanetype_u8 d_vindices[npyv_nlanes_u8*4];
+    for (int vi = 0; vi < wstep; ++vi) {
+        d_vindices[vi] = vi;
+    }
+    const npyv_u8 vindices_0 = npyv_load_u8(d_vindices);
+    const npyv_u8 vindices_1 = npyv_load_u8(d_vindices + vstep);
+    const npyv_u8 vindices_2 = npyv_load_u8(d_vindices + vstep*2);
+    const npyv_u8 vindices_3 = npyv_load_u8(d_vindices + vstep*3);
+
+    const npy_intp max_block = NPY_MAX_UINT8*wstep & -wstep;
+    npy_intp len0 = len & -wstep;
+    while (i < len0) {
+        npyv_u8 acc = npyv_setall_u8(s_acc);
+        npyv_u8 acc_indices = npyv_zero_u8();
+        npyv_u8 acc_indices_scale = npyv_zero_u8();
+
+        npy_intp n = i + MIN(len0 - i, max_block);
+        npy_intp ik = i, i2 = 0;
+        for (; i < n; i += wstep, ++i2) {
+            npyv_u8 vi = npyv_setall_u8((npyv_lanetype_u8)i2);
+            npyv_u8 a = npyv_load_u8(ip + i);
+            npyv_u8 b = npyv_load_u8(ip + i + vstep);
+            npyv_u8 c = npyv_load_u8(ip + i + vstep*2);
+            npyv_u8 d = npyv_load_u8(ip + i + vstep*3);
+
+            // reverse to put lowest index first in case of matched values
+            npyv_b8 m_ba = npyv_cmplt_u8(b, a);
+            npyv_b8 m_dc = npyv_cmplt_u8(d, c);
+            npyv_u8  x_ba = npyv_select_u8(m_ba, b, a);
+            npyv_u8  x_dc = npyv_select_u8(m_dc, d, c);
+            npyv_b8 m_dcba = npyv_cmplt_u8(x_dc, x_ba);
+            npyv_u8  x_dcba = npyv_select_u8(m_dcba, x_dc, x_ba);
+
+            npyv_u8 idx_ba = npyv_select_u8(m_ba, vindices_1, vindices_0);
+            npyv_u8 idx_dc = npyv_select_u8(m_dc, vindices_3, vindices_2);
+            npyv_u8 idx_dcba = npyv_select_u8(m_dcba, idx_dc, idx_ba);
+            npyv_b8 m_acc = npyv_cmplt_u8(x_dcba, acc);
+            acc = npyv_select_u8(m_acc, x_dcba, acc);
+            acc_indices = npyv_select_u8(m_acc, idx_dcba, acc_indices);
+            acc_indices_scale = npyv_select_u8(m_acc, vi, acc_indices_scale);
+        }
+        // reduce
+        npyv_lanetype_u8 dacc[npyv_nlanes_u8];
+        npyv_lanetype_u8 dacc_i[npyv_nlanes_u8];
+        npyv_lanetype_u8 dacc_s[npyv_nlanes_u8];
+        npyv_store_u8(dacc, acc);
+        npyv_store_u8(dacc_i, acc_indices);
+        npyv_store_u8(dacc_s, acc_indices_scale);
+
+        for (int vi = 0; vi < vstep; ++vi) {
+            if (dacc[vi] < s_acc) {
+                s_acc = dacc[vi];
+                ret_idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+            }
+        }
+        // get the lowest index in case of matched values
+        for (int vi = 0; vi < vstep; ++vi) {
+            npy_intp idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+            if (s_acc == dacc[vi] && ret_idx > idx) {
+                ret_idx = idx;
+            }
+        }
+    }
+    for (; i < len; ++i) {
+        npyv_lanetype_u8 a = ip[i];
+        if (a < s_acc) {
+            s_acc = a;
+            ret_idx = i;
+        }
+    }
+    return ret_idx;
+}
+
+
+#line 32
+#line 37
+static inline npy_intp
+simd_argmax_s8(npyv_lanetype_s8 *ip, npy_intp len)
+{
+    npyv_lanetype_s8 s_acc = *ip;
+    npy_intp ret_idx = 0, i = 0;
+
+    const int vstep = npyv_nlanes_s8;
+    const int wstep = vstep*4;
+    npyv_lanetype_u8 d_vindices[npyv_nlanes_s8*4];
+    for (int vi = 0; vi < wstep; ++vi) {
+        d_vindices[vi] = vi;
+    }
+    const npyv_u8 vindices_0 = npyv_load_u8(d_vindices);
+    const npyv_u8 vindices_1 = npyv_load_u8(d_vindices + vstep);
+    const npyv_u8 vindices_2 = npyv_load_u8(d_vindices + vstep*2);
+    const npyv_u8 vindices_3 = npyv_load_u8(d_vindices + vstep*3);
+
+    const npy_intp max_block = NPY_MAX_UINT8*wstep & -wstep;
+    npy_intp len0 = len & -wstep;
+    while (i < len0) {
+        npyv_s8 acc = npyv_setall_s8(s_acc);
+        npyv_u8 acc_indices = npyv_zero_u8();
+        npyv_u8 acc_indices_scale = npyv_zero_u8();
+
+        npy_intp n = i + MIN(len0 - i, max_block);
+        npy_intp ik = i, i2 = 0;
+        for (; i < n; i += wstep, ++i2) {
+            npyv_u8 vi = npyv_setall_u8((npyv_lanetype_u8)i2);
+            npyv_s8 a = npyv_load_s8(ip + i);
+            npyv_s8 b = npyv_load_s8(ip + i + vstep);
+            npyv_s8 c = npyv_load_s8(ip + i + vstep*2);
+            npyv_s8 d = npyv_load_s8(ip + i + vstep*3);
+
+            // reverse to put lowest index first in case of matched values
+            npyv_b8 m_ba = npyv_cmpgt_s8(b, a);
+            npyv_b8 m_dc = npyv_cmpgt_s8(d, c);
+            npyv_s8  x_ba = npyv_select_s8(m_ba, b, a);
+            npyv_s8  x_dc = npyv_select_s8(m_dc, d, c);
+            npyv_b8 m_dcba = npyv_cmpgt_s8(x_dc, x_ba);
+            npyv_s8  x_dcba = npyv_select_s8(m_dcba, x_dc, x_ba);
+
+            npyv_u8 idx_ba = npyv_select_u8(m_ba, vindices_1, vindices_0);
+            npyv_u8 idx_dc = npyv_select_u8(m_dc, vindices_3, vindices_2);
+            npyv_u8 idx_dcba = npyv_select_u8(m_dcba, idx_dc, idx_ba);
+            npyv_b8 m_acc = npyv_cmpgt_s8(x_dcba, acc);
+            acc = npyv_select_s8(m_acc, x_dcba, acc);
+            acc_indices = npyv_select_u8(m_acc, idx_dcba, acc_indices);
+            acc_indices_scale = npyv_select_u8(m_acc, vi, acc_indices_scale);
+        }
+        // reduce
+        npyv_lanetype_s8 dacc[npyv_nlanes_s8];
+        npyv_lanetype_u8 dacc_i[npyv_nlanes_s8];
+        npyv_lanetype_u8 dacc_s[npyv_nlanes_s8];
+        npyv_store_s8(dacc, acc);
+        npyv_store_u8(dacc_i, acc_indices);
+        npyv_store_u8(dacc_s, acc_indices_scale);
+
+        for (int vi = 0; vi < vstep; ++vi) {
+            if (dacc[vi] > s_acc) {
+                s_acc = dacc[vi];
+                ret_idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+            }
+        }
+        // get the lowest index in case of matched values
+        for (int vi = 0; vi < vstep; ++vi) {
+            npy_intp idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+            if (s_acc == dacc[vi] && ret_idx > idx) {
+                ret_idx = idx;
+            }
+        }
+    }
+    for (; i < len; ++i) {
+        npyv_lanetype_s8 a = ip[i];
+        if (a > s_acc) {
+            s_acc = a;
+            ret_idx = i;
+        }
+    }
+    return ret_idx;
+}
+
+#line 37
+static inline npy_intp
+simd_argmin_s8(npyv_lanetype_s8 *ip, npy_intp len)
+{
+    npyv_lanetype_s8 s_acc = *ip;
+    npy_intp ret_idx = 0, i = 0;
+
+    const int vstep = npyv_nlanes_s8;
+    const int wstep = vstep*4;
+    npyv_lanetype_u8 d_vindices[npyv_nlanes_s8*4];
+    for (int vi = 0; vi < wstep; ++vi) {
+        d_vindices[vi] = vi;
+    }
+    const npyv_u8 vindices_0 = npyv_load_u8(d_vindices);
+    const npyv_u8 vindices_1 = npyv_load_u8(d_vindices + vstep);
+    const npyv_u8 vindices_2 = npyv_load_u8(d_vindices + vstep*2);
+    const npyv_u8 vindices_3 = npyv_load_u8(d_vindices + vstep*3);
+
+    const npy_intp max_block = NPY_MAX_UINT8*wstep & -wstep;
+    npy_intp len0 = len & -wstep;
+    while (i < len0) {
+        npyv_s8 acc = npyv_setall_s8(s_acc);
+        npyv_u8 acc_indices = npyv_zero_u8();
+        npyv_u8 acc_indices_scale = npyv_zero_u8();
+
+        npy_intp n = i + MIN(len0 - i, max_block);
+        npy_intp ik = i, i2 = 0;
+        for (; i < n; i += wstep, ++i2) {
+            npyv_u8 vi = npyv_setall_u8((npyv_lanetype_u8)i2);
+            npyv_s8 a = npyv_load_s8(ip + i);
+            npyv_s8 b = npyv_load_s8(ip + i + vstep);
+            npyv_s8 c = npyv_load_s8(ip + i + vstep*2);
+            npyv_s8 d = npyv_load_s8(ip + i + vstep*3);
+
+            // reverse to put lowest index first in case of matched values
+            npyv_b8 m_ba = npyv_cmplt_s8(b, a);
+            npyv_b8 m_dc = npyv_cmplt_s8(d, c);
+            npyv_s8  x_ba = npyv_select_s8(m_ba, b, a);
+            npyv_s8  x_dc = npyv_select_s8(m_dc, d, c);
+            npyv_b8 m_dcba = npyv_cmplt_s8(x_dc, x_ba);
+            npyv_s8  x_dcba = npyv_select_s8(m_dcba, x_dc, x_ba);
+
+            npyv_u8 idx_ba = npyv_select_u8(m_ba, vindices_1, vindices_0);
+            npyv_u8 idx_dc = npyv_select_u8(m_dc, vindices_3, vindices_2);
+            npyv_u8 idx_dcba = npyv_select_u8(m_dcba, idx_dc, idx_ba);
+            npyv_b8 m_acc = npyv_cmplt_s8(x_dcba, acc);
+            acc = npyv_select_s8(m_acc, x_dcba, acc);
+            acc_indices = npyv_select_u8(m_acc, idx_dcba, acc_indices);
+            acc_indices_scale = npyv_select_u8(m_acc, vi, acc_indices_scale);
+        }
+        // reduce
+        npyv_lanetype_s8 dacc[npyv_nlanes_s8];
+        npyv_lanetype_u8 dacc_i[npyv_nlanes_s8];
+        npyv_lanetype_u8 dacc_s[npyv_nlanes_s8];
+        npyv_store_s8(dacc, acc);
+        npyv_store_u8(dacc_i, acc_indices);
+        npyv_store_u8(dacc_s, acc_indices_scale);
+
+        for (int vi = 0; vi < vstep; ++vi) {
+            if (dacc[vi] < s_acc) {
+                s_acc = dacc[vi];
+                ret_idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+            }
+        }
+        // get the lowest index in case of matched values
+        for (int vi = 0; vi < vstep; ++vi) {
+            npy_intp idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+            if (s_acc == dacc[vi] && ret_idx > idx) {
+                ret_idx = idx;
+            }
+        }
+    }
+    for (; i < len; ++i) {
+        npyv_lanetype_s8 a = ip[i];
+        if (a < s_acc) {
+            s_acc = a;
+            ret_idx = i;
+        }
+    }
+    return ret_idx;
+}
+
+
+#line 32
+#line 37
+static inline npy_intp
+simd_argmax_u16(npyv_lanetype_u16 *ip, npy_intp len)
+{
+    npyv_lanetype_u16 s_acc = *ip;
+    npy_intp ret_idx = 0, i = 0;
+
+    const int vstep = npyv_nlanes_u16;
+    const int wstep = vstep*4;
+    npyv_lanetype_u16 d_vindices[npyv_nlanes_u16*4];
+    for (int vi = 0; vi < wstep; ++vi) {
+        d_vindices[vi] = vi;
+    }
+    const npyv_u16 vindices_0 = npyv_load_u16(d_vindices);
+    const npyv_u16 vindices_1 = npyv_load_u16(d_vindices + vstep);
+    const npyv_u16 vindices_2 = npyv_load_u16(d_vindices + vstep*2);
+    const npyv_u16 vindices_3 = npyv_load_u16(d_vindices + vstep*3);
+
+    const npy_intp max_block = NPY_MAX_UINT16*wstep & -wstep;
+    npy_intp len0 = len & -wstep;
+    while (i < len0) {
+        npyv_u16 acc = npyv_setall_u16(s_acc);
+        npyv_u16 acc_indices = npyv_zero_u16();
+        npyv_u16 acc_indices_scale = npyv_zero_u16();
+
+        npy_intp n = i + MIN(len0 - i, max_block);
+        npy_intp ik = i, i2 = 0;
+        for (; i < n; i += wstep, ++i2) {
+            npyv_u16 vi = npyv_setall_u16((npyv_lanetype_u16)i2);
+            npyv_u16 a = npyv_load_u16(ip + i);
+            npyv_u16 b = npyv_load_u16(ip + i + vstep);
+            npyv_u16 c = npyv_load_u16(ip + i + vstep*2);
+            npyv_u16 d = npyv_load_u16(ip + i + vstep*3);
+
+            // reverse to put lowest index first in case of matched values
+            npyv_b16 m_ba = npyv_cmpgt_u16(b, a);
+            npyv_b16 m_dc = npyv_cmpgt_u16(d, c);
+            npyv_u16  x_ba = npyv_select_u16(m_ba, b, a);
+            npyv_u16  x_dc = npyv_select_u16(m_dc, d, c);
+            npyv_b16 m_dcba = npyv_cmpgt_u16(x_dc, x_ba);
+            npyv_u16  x_dcba = npyv_select_u16(m_dcba, x_dc, x_ba);
+
+            npyv_u16 idx_ba = npyv_select_u16(m_ba, vindices_1, vindices_0);
+            npyv_u16 idx_dc = npyv_select_u16(m_dc, vindices_3, vindices_2);
+            npyv_u16 idx_dcba = npyv_select_u16(m_dcba, idx_dc, idx_ba);
+            npyv_b16 m_acc = npyv_cmpgt_u16(x_dcba, acc);
+            acc = npyv_select_u16(m_acc, x_dcba, acc);
+            acc_indices = npyv_select_u16(m_acc, idx_dcba, acc_indices);
+            acc_indices_scale = npyv_select_u16(m_acc, vi, acc_indices_scale);
+        }
+        // reduce
+        npyv_lanetype_u16 dacc[npyv_nlanes_u16];
+        npyv_lanetype_u16 dacc_i[npyv_nlanes_u16];
+        npyv_lanetype_u16 dacc_s[npyv_nlanes_u16];
+        npyv_store_u16(dacc, acc);
+        npyv_store_u16(dacc_i, acc_indices);
+        npyv_store_u16(dacc_s, acc_indices_scale);
+
+        for (int vi = 0; vi < vstep; ++vi) {
+            if (dacc[vi] > s_acc) {
+                s_acc = dacc[vi];
+                ret_idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+            }
+        }
+        // get the lowest index in case of matched values
+        for (int vi = 0; vi < vstep; ++vi) {
+            npy_intp idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+            if (s_acc == dacc[vi] && ret_idx > idx) {
+                ret_idx = idx;
+            }
+        }
+    }
+    for (; i < len; ++i) {
+        npyv_lanetype_u16 a = ip[i];
+        if (a > s_acc) {
+            s_acc = a;
+            ret_idx = i;
+        }
+    }
+    return ret_idx;
+}
+
+#line 37
+static inline npy_intp
+simd_argmin_u16(npyv_lanetype_u16 *ip, npy_intp len)
+{
+    npyv_lanetype_u16 s_acc = *ip;
+    npy_intp ret_idx = 0, i = 0;
+
+    const int vstep = npyv_nlanes_u16;
+    const int wstep = vstep*4;
+    npyv_lanetype_u16 d_vindices[npyv_nlanes_u16*4];
+    for (int vi = 0; vi < wstep; ++vi) {
+        d_vindices[vi] = vi;
+    }
+    const npyv_u16 vindices_0 = npyv_load_u16(d_vindices);
+    const npyv_u16 vindices_1 = npyv_load_u16(d_vindices + vstep);
+    const npyv_u16 vindices_2 = npyv_load_u16(d_vindices + vstep*2);
+    const npyv_u16 vindices_3 = npyv_load_u16(d_vindices + vstep*3);
+
+    const npy_intp max_block = NPY_MAX_UINT16*wstep & -wstep;
+    npy_intp len0 = len & -wstep;
+    while (i < len0) {
+        npyv_u16 acc = npyv_setall_u16(s_acc);
+        npyv_u16 acc_indices = npyv_zero_u16();
+        npyv_u16 acc_indices_scale = npyv_zero_u16();
+
+        npy_intp n = i + MIN(len0 - i, max_block);
+        npy_intp ik = i, i2 = 0;
+        for (; i < n; i += wstep, ++i2) {
+            npyv_u16 vi = npyv_setall_u16((npyv_lanetype_u16)i2);
+            npyv_u16 a = npyv_load_u16(ip + i);
+            npyv_u16 b = npyv_load_u16(ip + i + vstep);
+            npyv_u16 c = npyv_load_u16(ip + i + vstep*2);
+            npyv_u16 d = npyv_load_u16(ip + i + vstep*3);
+
+            // reverse to put lowest index first in case of matched values
+            npyv_b16 m_ba = npyv_cmplt_u16(b, a);
+            npyv_b16 m_dc = npyv_cmplt_u16(d, c);
+            npyv_u16  x_ba = npyv_select_u16(m_ba, b, a);
+            npyv_u16  x_dc = npyv_select_u16(m_dc, d, c);
+            npyv_b16 m_dcba = npyv_cmplt_u16(x_dc, x_ba);
+            npyv_u16  x_dcba = npyv_select_u16(m_dcba, x_dc, x_ba);
+
+            npyv_u16 idx_ba = npyv_select_u16(m_ba, vindices_1, vindices_0);
+            npyv_u16 idx_dc = npyv_select_u16(m_dc, vindices_3, vindices_2);
+            npyv_u16 idx_dcba = npyv_select_u16(m_dcba, idx_dc, idx_ba);
+            npyv_b16 m_acc = npyv_cmplt_u16(x_dcba, acc);
+            acc = npyv_select_u16(m_acc, x_dcba, acc);
+            acc_indices = npyv_select_u16(m_acc, idx_dcba, acc_indices);
+            acc_indices_scale = npyv_select_u16(m_acc, vi, acc_indices_scale);
+        }
+        // reduce
+        npyv_lanetype_u16 dacc[npyv_nlanes_u16];
+        npyv_lanetype_u16 dacc_i[npyv_nlanes_u16];
+        npyv_lanetype_u16 dacc_s[npyv_nlanes_u16];
+        npyv_store_u16(dacc, acc);
+        npyv_store_u16(dacc_i, acc_indices);
+        npyv_store_u16(dacc_s, acc_indices_scale);
+
+        for (int vi = 0; vi < vstep; ++vi) {
+            if (dacc[vi] < s_acc) {
+                s_acc = dacc[vi];
+                ret_idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+            }
+        }
+        // get the lowest index in case of matched values
+        for (int vi = 0; vi < vstep; ++vi) {
+            npy_intp idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+            if (s_acc == dacc[vi] && ret_idx > idx) {
+                ret_idx = idx;
+            }
+        }
+    }
+    for (; i < len; ++i) {
+        npyv_lanetype_u16 a = ip[i];
+        if (a < s_acc) {
+            s_acc = a;
+            ret_idx = i;
+        }
+    }
+    return ret_idx;
+}
+
+
+#line 32
+#line 37
+static inline npy_intp
+simd_argmax_s16(npyv_lanetype_s16 *ip, npy_intp len)
+{
+    npyv_lanetype_s16 s_acc = *ip;
+    npy_intp ret_idx = 0, i = 0;
+
+    const int vstep = npyv_nlanes_s16;
+    const int wstep = vstep*4;
+    npyv_lanetype_u16 d_vindices[npyv_nlanes_s16*4];
+    for (int vi = 0; vi < wstep; ++vi) {
+        d_vindices[vi] = vi;
+    }
+    const npyv_u16 vindices_0 = npyv_load_u16(d_vindices);
+    const npyv_u16 vindices_1 = npyv_load_u16(d_vindices + vstep);
+    const npyv_u16 vindices_2 = npyv_load_u16(d_vindices + vstep*2);
+    const npyv_u16 vindices_3 = npyv_load_u16(d_vindices + vstep*3);
+
+    const npy_intp max_block = NPY_MAX_UINT16*wstep & -wstep;
+    npy_intp len0 = len & -wstep;
+    while (i < len0) {
+        npyv_s16 acc = npyv_setall_s16(s_acc);
+        npyv_u16 acc_indices = npyv_zero_u16();
+        npyv_u16 acc_indices_scale = npyv_zero_u16();
+
+        npy_intp n = i + MIN(len0 - i, max_block);
+        npy_intp ik = i, i2 = 0;
+        for (; i < n; i += wstep, ++i2) {
+            npyv_u16 vi = npyv_setall_u16((npyv_lanetype_u16)i2);
+            npyv_s16 a = npyv_load_s16(ip + i);
+            npyv_s16 b = npyv_load_s16(ip + i + vstep);
+            npyv_s16 c = npyv_load_s16(ip + i + vstep*2);
+            npyv_s16 d = npyv_load_s16(ip + i + vstep*3);
+
+            // reverse to put lowest index first in case of matched values
+            npyv_b16 m_ba = npyv_cmpgt_s16(b, a);
+            npyv_b16 m_dc = npyv_cmpgt_s16(d, c);
+            npyv_s16  x_ba = npyv_select_s16(m_ba, b, a);
+            npyv_s16  x_dc = npyv_select_s16(m_dc, d, c);
+            npyv_b16 m_dcba = npyv_cmpgt_s16(x_dc, x_ba);
+            npyv_s16  x_dcba = npyv_select_s16(m_dcba, x_dc, x_ba);
+
+            npyv_u16 idx_ba = npyv_select_u16(m_ba, vindices_1, vindices_0);
+            npyv_u16 idx_dc = npyv_select_u16(m_dc, vindices_3, vindices_2);
+            npyv_u16 idx_dcba = npyv_select_u16(m_dcba, idx_dc, idx_ba);
+            npyv_b16 m_acc = npyv_cmpgt_s16(x_dcba, acc);
+            acc = npyv_select_s16(m_acc, x_dcba, acc);
+            acc_indices = npyv_select_u16(m_acc, idx_dcba, acc_indices);
+            acc_indices_scale = npyv_select_u16(m_acc, vi, acc_indices_scale);
+        }
+        // reduce
+        npyv_lanetype_s16 dacc[npyv_nlanes_s16];
+        npyv_lanetype_u16 dacc_i[npyv_nlanes_s16];
+        npyv_lanetype_u16 dacc_s[npyv_nlanes_s16];
+        npyv_store_s16(dacc, acc);
+        npyv_store_u16(dacc_i, acc_indices);
+        npyv_store_u16(dacc_s, acc_indices_scale);
+
+        for (int vi = 0; vi < vstep; ++vi) {
+            if (dacc[vi] > s_acc) {
+                s_acc = dacc[vi];
+                ret_idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+            }
+        }
+        // get the lowest index in case of matched values
+        for (int vi = 0; vi < vstep; ++vi) {
+            npy_intp idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+            if (s_acc == dacc[vi] && ret_idx > idx) {
+                ret_idx = idx;
+            }
+        }
+    }
+    for (; i < len; ++i) {
+        npyv_lanetype_s16 a = ip[i];
+        if (a > s_acc) {
+            s_acc = a;
+            ret_idx = i;
+        }
+    }
+    return ret_idx;
+}
+
+#line 37
+static inline npy_intp
+simd_argmin_s16(npyv_lanetype_s16 *ip, npy_intp len)
+{
+    npyv_lanetype_s16 s_acc = *ip;
+    npy_intp ret_idx = 0, i = 0;
+
+    const int vstep = npyv_nlanes_s16;
+    const int wstep = vstep*4;
+    npyv_lanetype_u16 d_vindices[npyv_nlanes_s16*4];
+    for (int vi = 0; vi < wstep; ++vi) {
+        d_vindices[vi] = vi;
+    }
+    const npyv_u16 vindices_0 = npyv_load_u16(d_vindices);
+    const npyv_u16 vindices_1 = npyv_load_u16(d_vindices + vstep);
+    const npyv_u16 vindices_2 = npyv_load_u16(d_vindices + vstep*2);
+    const npyv_u16 vindices_3 = npyv_load_u16(d_vindices + vstep*3);
+
+    const npy_intp max_block = NPY_MAX_UINT16*wstep & -wstep;
+    npy_intp len0 = len & -wstep;
+    while (i < len0) {
+        npyv_s16 acc = npyv_setall_s16(s_acc);
+        npyv_u16 acc_indices = npyv_zero_u16();
+        npyv_u16 acc_indices_scale = npyv_zero_u16();
+
+        npy_intp n = i + MIN(len0 - i, max_block);
+        npy_intp ik = i, i2 = 0;
+        for (; i < n; i += wstep, ++i2) {
+            npyv_u16 vi = npyv_setall_u16((npyv_lanetype_u16)i2);
+            npyv_s16 a = npyv_load_s16(ip + i);
+            npyv_s16 b = npyv_load_s16(ip + i + vstep);
+            npyv_s16 c = npyv_load_s16(ip + i + vstep*2);
+            npyv_s16 d = npyv_load_s16(ip + i + vstep*3);
+
+            // reverse to put lowest index first in case of matched values
+            npyv_b16 m_ba = npyv_cmplt_s16(b, a);
+            npyv_b16 m_dc = npyv_cmplt_s16(d, c);
+            npyv_s16  x_ba = npyv_select_s16(m_ba, b, a);
+            npyv_s16  x_dc = npyv_select_s16(m_dc, d, c);
+            npyv_b16 m_dcba = npyv_cmplt_s16(x_dc, x_ba);
+            npyv_s16  x_dcba = npyv_select_s16(m_dcba, x_dc, x_ba);
+
+            npyv_u16 idx_ba = npyv_select_u16(m_ba, vindices_1, vindices_0);
+            npyv_u16 idx_dc = npyv_select_u16(m_dc, vindices_3, vindices_2);
+            npyv_u16 idx_dcba = npyv_select_u16(m_dcba, idx_dc, idx_ba);
+            npyv_b16 m_acc = npyv_cmplt_s16(x_dcba, acc);
+            acc = npyv_select_s16(m_acc, x_dcba, acc);
+            acc_indices = npyv_select_u16(m_acc, idx_dcba, acc_indices);
+            acc_indices_scale = npyv_select_u16(m_acc, vi, acc_indices_scale);
+        }
+        // reduce
+        npyv_lanetype_s16 dacc[npyv_nlanes_s16];
+        npyv_lanetype_u16 dacc_i[npyv_nlanes_s16];
+        npyv_lanetype_u16 dacc_s[npyv_nlanes_s16];
+        npyv_store_s16(dacc, acc);
+        npyv_store_u16(dacc_i, acc_indices);
+        npyv_store_u16(dacc_s, acc_indices_scale);
+
+        for (int vi = 0; vi < vstep; ++vi) {
+            if (dacc[vi] < s_acc) {
+                s_acc = dacc[vi];
+                ret_idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+            }
+        }
+        // get the lowest index in case of matched values
+        for (int vi = 0; vi < vstep; ++vi) {
+            npy_intp idx = ik + (npy_intp)dacc_s[vi]*wstep + dacc_i[vi];
+            if (s_acc == dacc[vi] && ret_idx > idx) {
+                ret_idx = idx;
+            }
+        }
+    }
+    for (; i < len; ++i) {
+        npyv_lanetype_s16 a = ip[i];
+        if (a < s_acc) {
+            s_acc = a;
+            ret_idx = i;
+        }
+    }
+    return ret_idx;
+}
+
+
+#endif
+
+#line 129
+#if NPY_SIMD
+#line 136
+static inline npy_intp  // returns the index of the first maximum among ip[0..len)
+simd_argmax_u32(npyv_lanetype_u32 *ip, npy_intp len)  // ip[0] is read unconditionally; presumably callers pass len >= 1
+{
+    npyv_lanetype_u32 s_acc = *ip;  // best value found so far, seeded with ip[0]
+    npy_intp ret_idx = 0, i = 0;  // ret_idx: index of s_acc; i: next element to process
+    const int vstep = npyv_nlanes_u32;  // elements per vector
+    const int wstep = vstep*4;  // elements per unrolled iteration (four vectors)
+    // inputs shorter than one unrolled iteration go straight to the scalar loop
+    if (len < wstep) {
+        goto scalar_loop;
+    }
+    npy_intp len0 = len;  // upper bound of the vectorized span
+    // guard against wraparound of the 32-bit vector index addition
+    // when the array is longer than NPY_MAX_UINT32 elements (larger than 16gb)
+#if 1  // constant condition in this generated instantiation: the clamp is compiled in
+    if (len0 > NPY_MAX_UINT32) {
+        len0 = NPY_MAX_UINT32;
+    }
+#endif
+    // offsets 0..wstep-1 inside one unrolled iteration, one vector per unrolled load
+    npyv_lanetype_u32 d_vindices[npyv_nlanes_u32*4];
+    for (int vi = 0; vi < wstep; ++vi) {
+        d_vindices[vi] = vi;
+    }
+    const npyv_u32 vindices_0 = npyv_load_u32(d_vindices);  // offsets paired with vector a
+    const npyv_u32 vindices_1 = npyv_load_u32(d_vindices + vstep);  // offsets paired with vector b
+    const npyv_u32 vindices_2 = npyv_load_u32(d_vindices + vstep*2);  // offsets paired with vector c
+    const npyv_u32 vindices_3 = npyv_load_u32(d_vindices + vstep*3);  // offsets paired with vector d
+    // initialize the per-lane accumulators: best values and their absolute indices
+    npyv_u32 acc_indices = npyv_zero_u32();
+    npyv_u32 acc = npyv_setall_u32(s_acc);
+    for (npy_intp n = len0 & -wstep; i < n; i += wstep) {  // main loop: four vectors per iteration
+        npyv_u32 vi = npyv_setall_u32((npyv_lanetype_u32)i);  // base index of this iteration in every lane
+        npyv_u32 a = npyv_load_u32(ip + i);
+        npyv_u32 b = npyv_load_u32(ip + i + vstep);
+        npyv_u32 c = npyv_load_u32(ip + i + vstep*2);
+        npyv_u32 d = npyv_load_u32(ip + i + vstep*3);
+
+        // compare pairwise with a strict test so that ties keep the lower-index operand
+        npyv_b32 m_ba = npyv_cmpgt_u32(b, a);
+        npyv_b32 m_dc = npyv_cmpgt_u32(d, c);
+        npyv_u32  x_ba = npyv_select_u32(m_ba, b, a);  // winner of (a, b)
+        npyv_u32  x_dc = npyv_select_u32(m_dc, d, c);  // winner of (c, d)
+        npyv_b32 m_dcba = npyv_cmpgt_u32(x_dc, x_ba);
+        npyv_u32  x_dcba = npyv_select_u32(m_dcba, x_dc, x_ba);  // winner of all four vectors
+
+        npyv_u32 idx_ba = npyv_select_u32(m_ba, vindices_1, vindices_0);  // offsets follow the same masks as the values
+        npyv_u32 idx_dc = npyv_select_u32(m_dc, vindices_3, vindices_2);
+        npyv_u32 idx_dcba = npyv_select_u32(m_dcba, idx_dc, idx_ba);
+        npyv_b32 m_acc = npyv_cmpgt_u32(x_dcba, acc);  // lanes where this iteration beats the accumulator
+        acc = npyv_select_u32(m_acc, x_dcba, acc);
+        acc_indices = npyv_select_u32(m_acc, npyv_add_u32(vi, idx_dcba), acc_indices);  // base + offset = absolute index
+
+    #if 0  // NaN early-exit, compiled out in this instantiation
+        npyv_b32 nnan_a = npyv_notnan_u32(a);
+        npyv_b32 nnan_b = npyv_notnan_u32(b);
+        npyv_b32 nnan_c = npyv_notnan_u32(c);
+        npyv_b32 nnan_d = npyv_notnan_u32(d);
+        npyv_b32 nnan_ab = npyv_and_b32(nnan_a, nnan_b);
+        npyv_b32 nnan_cd = npyv_and_b32(nnan_c, nnan_d);
+        npy_uint64 nnan = npyv_tobits_b32(npyv_and_b32(nnan_ab, nnan_cd));
+        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
+            npy_uint64 nnan_4[4];
+            nnan_4[0] = npyv_tobits_b32(nnan_a);
+            nnan_4[1] = npyv_tobits_b32(nnan_b);
+            nnan_4[2] = npyv_tobits_b32(nnan_c);
+            nnan_4[3] = npyv_tobits_b32(nnan_d);
+            for (int ni = 0; ni < 4; ++ni) {
+                for (int vi = 0; vi < vstep; ++vi) {
+                    if (!((nnan_4[ni] >> vi) & 1)) {
+                        return i + ni*vstep + vi;
+                    }
+                }
+            }
+        }
+    #endif
+    }
+    for (npy_intp n = len0 & -vstep; i < n; i += vstep) {  // leftover whole vectors, one per iteration
+        npyv_u32 vi = npyv_setall_u32((npyv_lanetype_u32)i);  // base index of this vector in every lane
+        npyv_u32 a = npyv_load_u32(ip + i);
+        npyv_b32 m_acc = npyv_cmpgt_u32(a, acc);
+        acc = npyv_select_u32(m_acc, a, acc);
+        acc_indices = npyv_select_u32(m_acc, npyv_add_u32(vi, vindices_0), acc_indices);
+    #if 0  // NaN early-exit, compiled out in this instantiation
+        npyv_b32 nnan_a = npyv_notnan_u32(a);
+        npy_uint64 nnan = npyv_tobits_b32(nnan_a);
+        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
+            for (int vi = 0; vi < vstep; ++vi) {
+                if (!((nnan >> vi) & 1)) {
+                    return i + vi;
+                }
+            }
+        }
+    #endif
+    }
+
+    // reduce: spill the vector accumulators and fold the lanes into s_acc/ret_idx
+    npyv_lanetype_u32 dacc[npyv_nlanes_u32];
+    npyv_lanetype_u32 dacc_i[npyv_nlanes_u32];
+    npyv_store_u32(dacc_i, acc_indices);
+    npyv_store_u32(dacc, acc);
+
+    s_acc = dacc[0];  // start the fold from lane 0
+    ret_idx = dacc_i[0];
+    for (int vi = 1; vi < vstep; ++vi) {
+        if (dacc[vi] > s_acc) {
+            s_acc = dacc[vi];
+            ret_idx = (npy_intp)dacc_i[vi];
+        }
+    }
+    // several lanes may hold the same best value: keep the lowest index among them
+    for (int vi = 0; vi < vstep; ++vi) {
+        if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) {
+            ret_idx = dacc_i[vi];
+        }
+    }
+scalar_loop:
+    for (; i < len; ++i) {  // scalar path: short inputs and every element the vector loops left over
+        npyv_lanetype_u32 a = ip[i];
+    #if 0  // NaN-aware comparison, compiled out in this instantiation
+        if (!(a <= s_acc)) {  // negated, for correct nan handling
+    #else
+        if (a > s_acc) {  // strict, so an earlier equal value keeps its index
+    #endif
+            s_acc = a;
+            ret_idx = i;
+        #if 0  // NaN short-circuit, compiled out in this instantiation
+            if (npy_isnan(s_acc)) {
+                // nan encountered, it's maximal
+                return ret_idx;
+            }
+        #endif
+        }
+    }
+    return ret_idx;
+}
+
+#line 136
+static inline npy_intp
+simd_argmin_u32(npyv_lanetype_u32 *ip, npy_intp len)
+{
+    // Returns the index of the first occurrence of the smallest element
+    // in ip[0 .. len).
+    // NOTE(review): ip[0] is read before any length check, so len >= 1 is
+    // assumed here -- confirm against the callers.
+    const int lanes = npyv_nlanes_u32;
+    const int block = lanes*4;
+    npyv_lanetype_u32 best = ip[0];
+    npy_intp best_idx = 0;
+    npy_intp pos = 0;
+
+    // inputs shorter than one 4-vector block are left entirely to the
+    // scalar loop at the bottom
+    if (len >= block) {
+        // lane indices are kept in 32-bit lanes: cap the vectorized range
+        // so that base + offset cannot wrap around; whatever lies beyond
+        // the cap is picked up by the scalar loop
+        npy_intp vlen = len;
+        if (vlen > NPY_MAX_UINT32) {
+            vlen = NPY_MAX_UINT32;
+        }
+        // offsets 0 .. block-1, one vector per sub-block of the main loop
+        npyv_lanetype_u32 offsets[npyv_nlanes_u32*4];
+        for (int k = 0; k < block; ++k) {
+            offsets[k] = k;
+        }
+        const npyv_u32 off0 = npyv_load_u32(offsets);
+        const npyv_u32 off1 = npyv_load_u32(offsets + lanes);
+        const npyv_u32 off2 = npyv_load_u32(offsets + lanes*2);
+        const npyv_u32 off3 = npyv_load_u32(offsets + lanes*3);
+        // per-lane running minimum and the index at which it was first seen
+        npyv_u32 v_best = npyv_setall_u32(best);
+        npyv_u32 v_best_idx = npyv_zero_u32();
+
+        // main loop: four vectors per iteration
+        const npy_intp block_end = vlen & -block;
+        while (pos < block_end) {
+            npyv_u32 base = npyv_setall_u32((npyv_lanetype_u32)pos);
+            npyv_u32 v0 = npyv_load_u32(ip + pos);
+            npyv_u32 v1 = npyv_load_u32(ip + pos + lanes);
+            npyv_u32 v2 = npyv_load_u32(ip + pos + lanes*2);
+            npyv_u32 v3 = npyv_load_u32(ip + pos + lanes*3);
+
+            // strict comparisons: on equal values the operand with the
+            // lower index is kept
+            npyv_b32 take1 = npyv_cmplt_u32(v1, v0);
+            npyv_u32 val01 = npyv_select_u32(take1, v1, v0);
+            npyv_u32 idx01 = npyv_select_u32(take1, off1, off0);
+
+            npyv_b32 take3 = npyv_cmplt_u32(v3, v2);
+            npyv_u32 val23 = npyv_select_u32(take3, v3, v2);
+            npyv_u32 idx23 = npyv_select_u32(take3, off3, off2);
+
+            npyv_b32 take23 = npyv_cmplt_u32(val23, val01);
+            npyv_u32 val = npyv_select_u32(take23, val23, val01);
+            npyv_u32 idx = npyv_select_u32(take23, idx23, idx01);
+
+            // fold the block winner into the accumulators
+            npyv_b32 improve = npyv_cmplt_u32(val, v_best);
+            v_best = npyv_select_u32(improve, val, v_best);
+            v_best_idx = npyv_select_u32(improve, npyv_add_u32(base, idx), v_best_idx);
+
+            pos += block;
+        }
+
+        // remaining whole vectors, one per iteration
+        const npy_intp lane_end = vlen & -lanes;
+        while (pos < lane_end) {
+            npyv_u32 base = npyv_setall_u32((npyv_lanetype_u32)pos);
+            npyv_u32 v = npyv_load_u32(ip + pos);
+            npyv_b32 improve = npyv_cmplt_u32(v, v_best);
+            v_best = npyv_select_u32(improve, v, v_best);
+            v_best_idx = npyv_select_u32(improve, npyv_add_u32(base, off0), v_best_idx);
+            pos += lanes;
+        }
+
+        // horizontal reduction: the smallest lane value wins, and among
+        // lanes holding that value the lowest recorded index wins
+        npyv_lanetype_u32 lane_val[npyv_nlanes_u32];
+        npyv_lanetype_u32 lane_idx[npyv_nlanes_u32];
+        npyv_store_u32(lane_idx, v_best_idx);
+        npyv_store_u32(lane_val, v_best);
+
+        best = lane_val[0];
+        best_idx = lane_idx[0];
+        for (int k = 1; k < lanes; ++k) {
+            npy_intp cand = (npy_intp)lane_idx[k];
+            if ((lane_val[k] < best) ||
+                (lane_val[k] == best && cand < best_idx)) {
+                best = lane_val[k];
+                best_idx = cand;
+            }
+        }
+    }
+
+    // scalar loop: the unvectorized tail, or the whole input when short.
+    // Integer lanes carry no NaN, so no NaN early-exit is needed.
+    while (pos < len) {
+        npyv_lanetype_u32 x = ip[pos];
+        if (x < best) {
+            best = x;
+            best_idx = pos;
+        }
+        ++pos;
+    }
+    return best_idx;
+}
+
+#endif // chk_simd
+
+#line 129
+#if NPY_SIMD
+#line 136
+static inline npy_intp
+simd_argmax_s32(npyv_lanetype_s32 *ip, npy_intp len)
+{
+    // Returns the index of the first occurrence of the largest element
+    // in ip[0 .. len).
+    // NOTE(review): ip[0] is read before any length check, so len >= 1 is
+    // assumed here -- confirm against the callers.
+    const int lanes = npyv_nlanes_s32;
+    const int block = lanes*4;
+    npyv_lanetype_s32 best = ip[0];
+    npy_intp best_idx = 0;
+    npy_intp pos = 0;
+
+    // inputs shorter than one 4-vector block are left entirely to the
+    // scalar loop at the bottom
+    if (len >= block) {
+        // lane indices are kept in 32-bit lanes: cap the vectorized range
+        // so that base + offset cannot wrap around; whatever lies beyond
+        // the cap is picked up by the scalar loop
+        npy_intp vlen = len;
+        if (vlen > NPY_MAX_UINT32) {
+            vlen = NPY_MAX_UINT32;
+        }
+        // offsets 0 .. block-1, one vector per sub-block of the main loop
+        npyv_lanetype_u32 offsets[npyv_nlanes_s32*4];
+        for (int k = 0; k < block; ++k) {
+            offsets[k] = k;
+        }
+        const npyv_u32 off0 = npyv_load_u32(offsets);
+        const npyv_u32 off1 = npyv_load_u32(offsets + lanes);
+        const npyv_u32 off2 = npyv_load_u32(offsets + lanes*2);
+        const npyv_u32 off3 = npyv_load_u32(offsets + lanes*3);
+        // per-lane running maximum and the index at which it was first seen
+        npyv_s32 v_best = npyv_setall_s32(best);
+        npyv_u32 v_best_idx = npyv_zero_u32();
+
+        // main loop: four vectors per iteration
+        const npy_intp block_end = vlen & -block;
+        while (pos < block_end) {
+            npyv_u32 base = npyv_setall_u32((npyv_lanetype_u32)pos);
+            npyv_s32 v0 = npyv_load_s32(ip + pos);
+            npyv_s32 v1 = npyv_load_s32(ip + pos + lanes);
+            npyv_s32 v2 = npyv_load_s32(ip + pos + lanes*2);
+            npyv_s32 v3 = npyv_load_s32(ip + pos + lanes*3);
+
+            // strict comparisons: on equal values the operand with the
+            // lower index is kept
+            npyv_b32 take1 = npyv_cmpgt_s32(v1, v0);
+            npyv_s32 val01 = npyv_select_s32(take1, v1, v0);
+            npyv_u32 idx01 = npyv_select_u32(take1, off1, off0);
+
+            npyv_b32 take3 = npyv_cmpgt_s32(v3, v2);
+            npyv_s32 val23 = npyv_select_s32(take3, v3, v2);
+            npyv_u32 idx23 = npyv_select_u32(take3, off3, off2);
+
+            npyv_b32 take23 = npyv_cmpgt_s32(val23, val01);
+            npyv_s32 val = npyv_select_s32(take23, val23, val01);
+            npyv_u32 idx = npyv_select_u32(take23, idx23, idx01);
+
+            // fold the block winner into the accumulators
+            npyv_b32 improve = npyv_cmpgt_s32(val, v_best);
+            v_best = npyv_select_s32(improve, val, v_best);
+            v_best_idx = npyv_select_u32(improve, npyv_add_u32(base, idx), v_best_idx);
+
+            pos += block;
+        }
+
+        // remaining whole vectors, one per iteration
+        const npy_intp lane_end = vlen & -lanes;
+        while (pos < lane_end) {
+            npyv_u32 base = npyv_setall_u32((npyv_lanetype_u32)pos);
+            npyv_s32 v = npyv_load_s32(ip + pos);
+            npyv_b32 improve = npyv_cmpgt_s32(v, v_best);
+            v_best = npyv_select_s32(improve, v, v_best);
+            v_best_idx = npyv_select_u32(improve, npyv_add_u32(base, off0), v_best_idx);
+            pos += lanes;
+        }
+
+        // horizontal reduction: the largest lane value wins, and among
+        // lanes holding that value the lowest recorded index wins
+        npyv_lanetype_s32 lane_val[npyv_nlanes_s32];
+        npyv_lanetype_u32 lane_idx[npyv_nlanes_s32];
+        npyv_store_u32(lane_idx, v_best_idx);
+        npyv_store_s32(lane_val, v_best);
+
+        best = lane_val[0];
+        best_idx = lane_idx[0];
+        for (int k = 1; k < lanes; ++k) {
+            npy_intp cand = (npy_intp)lane_idx[k];
+            if ((lane_val[k] > best) ||
+                (lane_val[k] == best && cand < best_idx)) {
+                best = lane_val[k];
+                best_idx = cand;
+            }
+        }
+    }
+
+    // scalar loop: the unvectorized tail, or the whole input when short.
+    // Integer lanes carry no NaN, so no NaN early-exit is needed.
+    while (pos < len) {
+        npyv_lanetype_s32 x = ip[pos];
+        if (x > best) {
+            best = x;
+            best_idx = pos;
+        }
+        ++pos;
+    }
+    return best_idx;
+}
+
+#line 136
+static inline npy_intp
+simd_argmin_s32(npyv_lanetype_s32 *ip, npy_intp len)
+{
+    // Returns the index of the first occurrence of the smallest element
+    // in ip[0 .. len).
+    // NOTE(review): ip[0] is read before any length check, so len >= 1 is
+    // assumed here -- confirm against the callers.
+    const int lanes = npyv_nlanes_s32;
+    const int block = lanes*4;
+    npyv_lanetype_s32 best = ip[0];
+    npy_intp best_idx = 0;
+    npy_intp pos = 0;
+
+    // inputs shorter than one 4-vector block are left entirely to the
+    // scalar loop at the bottom
+    if (len >= block) {
+        // lane indices are kept in 32-bit lanes: cap the vectorized range
+        // so that base + offset cannot wrap around; whatever lies beyond
+        // the cap is picked up by the scalar loop
+        npy_intp vlen = len;
+        if (vlen > NPY_MAX_UINT32) {
+            vlen = NPY_MAX_UINT32;
+        }
+        // offsets 0 .. block-1, one vector per sub-block of the main loop
+        npyv_lanetype_u32 offsets[npyv_nlanes_s32*4];
+        for (int k = 0; k < block; ++k) {
+            offsets[k] = k;
+        }
+        const npyv_u32 off0 = npyv_load_u32(offsets);
+        const npyv_u32 off1 = npyv_load_u32(offsets + lanes);
+        const npyv_u32 off2 = npyv_load_u32(offsets + lanes*2);
+        const npyv_u32 off3 = npyv_load_u32(offsets + lanes*3);
+        // per-lane running minimum and the index at which it was first seen
+        npyv_s32 v_best = npyv_setall_s32(best);
+        npyv_u32 v_best_idx = npyv_zero_u32();
+
+        // main loop: four vectors per iteration
+        const npy_intp block_end = vlen & -block;
+        while (pos < block_end) {
+            npyv_u32 base = npyv_setall_u32((npyv_lanetype_u32)pos);
+            npyv_s32 v0 = npyv_load_s32(ip + pos);
+            npyv_s32 v1 = npyv_load_s32(ip + pos + lanes);
+            npyv_s32 v2 = npyv_load_s32(ip + pos + lanes*2);
+            npyv_s32 v3 = npyv_load_s32(ip + pos + lanes*3);
+
+            // strict comparisons: on equal values the operand with the
+            // lower index is kept
+            npyv_b32 take1 = npyv_cmplt_s32(v1, v0);
+            npyv_s32 val01 = npyv_select_s32(take1, v1, v0);
+            npyv_u32 idx01 = npyv_select_u32(take1, off1, off0);
+
+            npyv_b32 take3 = npyv_cmplt_s32(v3, v2);
+            npyv_s32 val23 = npyv_select_s32(take3, v3, v2);
+            npyv_u32 idx23 = npyv_select_u32(take3, off3, off2);
+
+            npyv_b32 take23 = npyv_cmplt_s32(val23, val01);
+            npyv_s32 val = npyv_select_s32(take23, val23, val01);
+            npyv_u32 idx = npyv_select_u32(take23, idx23, idx01);
+
+            // fold the block winner into the accumulators
+            npyv_b32 improve = npyv_cmplt_s32(val, v_best);
+            v_best = npyv_select_s32(improve, val, v_best);
+            v_best_idx = npyv_select_u32(improve, npyv_add_u32(base, idx), v_best_idx);
+
+            pos += block;
+        }
+
+        // remaining whole vectors, one per iteration
+        const npy_intp lane_end = vlen & -lanes;
+        while (pos < lane_end) {
+            npyv_u32 base = npyv_setall_u32((npyv_lanetype_u32)pos);
+            npyv_s32 v = npyv_load_s32(ip + pos);
+            npyv_b32 improve = npyv_cmplt_s32(v, v_best);
+            v_best = npyv_select_s32(improve, v, v_best);
+            v_best_idx = npyv_select_u32(improve, npyv_add_u32(base, off0), v_best_idx);
+            pos += lanes;
+        }
+
+        // horizontal reduction: the smallest lane value wins, and among
+        // lanes holding that value the lowest recorded index wins
+        npyv_lanetype_s32 lane_val[npyv_nlanes_s32];
+        npyv_lanetype_u32 lane_idx[npyv_nlanes_s32];
+        npyv_store_u32(lane_idx, v_best_idx);
+        npyv_store_s32(lane_val, v_best);
+
+        best = lane_val[0];
+        best_idx = lane_idx[0];
+        for (int k = 1; k < lanes; ++k) {
+            npy_intp cand = (npy_intp)lane_idx[k];
+            if ((lane_val[k] < best) ||
+                (lane_val[k] == best && cand < best_idx)) {
+                best = lane_val[k];
+                best_idx = cand;
+            }
+        }
+    }
+
+    // scalar loop: the unvectorized tail, or the whole input when short.
+    // Integer lanes carry no NaN, so no NaN early-exit is needed.
+    while (pos < len) {
+        npyv_lanetype_s32 x = ip[pos];
+        if (x < best) {
+            best = x;
+            best_idx = pos;
+        }
+        ++pos;
+    }
+    return best_idx;
+}
+
+#endif // chk_simd
+
+#line 129
+#if NPY_SIMD
+#line 136
+static inline npy_intp
+simd_argmax_u64(npyv_lanetype_u64 *ip, npy_intp len)
+{
+    // Returns the index of the first occurrence of the largest element
+    // in ip[0 .. len).
+    // NOTE(review): ip[0] is read before any length check, so len >= 1 is
+    // assumed here -- confirm against the callers.
+    const int lanes = npyv_nlanes_u64;
+    const int block = lanes*4;
+    npyv_lanetype_u64 best = ip[0];
+    npy_intp best_idx = 0;
+    npy_intp pos = 0;
+
+    // inputs shorter than one 4-vector block are left entirely to the
+    // scalar loop at the bottom
+    if (len >= block) {
+        // lane indices are kept in 64-bit lanes, so the vectorized range
+        // is not capped (unlike the 32-bit kernels)
+        // offsets 0 .. block-1, one vector per sub-block of the main loop
+        npyv_lanetype_u64 offsets[npyv_nlanes_u64*4];
+        for (int k = 0; k < block; ++k) {
+            offsets[k] = k;
+        }
+        const npyv_u64 off0 = npyv_load_u64(offsets);
+        const npyv_u64 off1 = npyv_load_u64(offsets + lanes);
+        const npyv_u64 off2 = npyv_load_u64(offsets + lanes*2);
+        const npyv_u64 off3 = npyv_load_u64(offsets + lanes*3);
+        // per-lane running maximum and the index at which it was first seen
+        npyv_u64 v_best = npyv_setall_u64(best);
+        npyv_u64 v_best_idx = npyv_zero_u64();
+
+        // main loop: four vectors per iteration
+        const npy_intp block_end = len & -block;
+        while (pos < block_end) {
+            npyv_u64 base = npyv_setall_u64((npyv_lanetype_u64)pos);
+            npyv_u64 v0 = npyv_load_u64(ip + pos);
+            npyv_u64 v1 = npyv_load_u64(ip + pos + lanes);
+            npyv_u64 v2 = npyv_load_u64(ip + pos + lanes*2);
+            npyv_u64 v3 = npyv_load_u64(ip + pos + lanes*3);
+
+            // strict comparisons: on equal values the operand with the
+            // lower index is kept
+            npyv_b64 take1 = npyv_cmpgt_u64(v1, v0);
+            npyv_u64 val01 = npyv_select_u64(take1, v1, v0);
+            npyv_u64 idx01 = npyv_select_u64(take1, off1, off0);
+
+            npyv_b64 take3 = npyv_cmpgt_u64(v3, v2);
+            npyv_u64 val23 = npyv_select_u64(take3, v3, v2);
+            npyv_u64 idx23 = npyv_select_u64(take3, off3, off2);
+
+            npyv_b64 take23 = npyv_cmpgt_u64(val23, val01);
+            npyv_u64 val = npyv_select_u64(take23, val23, val01);
+            npyv_u64 idx = npyv_select_u64(take23, idx23, idx01);
+
+            // fold the block winner into the accumulators
+            npyv_b64 improve = npyv_cmpgt_u64(val, v_best);
+            v_best = npyv_select_u64(improve, val, v_best);
+            v_best_idx = npyv_select_u64(improve, npyv_add_u64(base, idx), v_best_idx);
+
+            pos += block;
+        }
+
+        // remaining whole vectors, one per iteration
+        const npy_intp lane_end = len & -lanes;
+        while (pos < lane_end) {
+            npyv_u64 base = npyv_setall_u64((npyv_lanetype_u64)pos);
+            npyv_u64 v = npyv_load_u64(ip + pos);
+            npyv_b64 improve = npyv_cmpgt_u64(v, v_best);
+            v_best = npyv_select_u64(improve, v, v_best);
+            v_best_idx = npyv_select_u64(improve, npyv_add_u64(base, off0), v_best_idx);
+            pos += lanes;
+        }
+
+        // horizontal reduction: the largest lane value wins, and among
+        // lanes holding that value the lowest recorded index wins
+        npyv_lanetype_u64 lane_val[npyv_nlanes_u64];
+        npyv_lanetype_u64 lane_idx[npyv_nlanes_u64];
+        npyv_store_u64(lane_idx, v_best_idx);
+        npyv_store_u64(lane_val, v_best);
+
+        best = lane_val[0];
+        best_idx = lane_idx[0];
+        for (int k = 1; k < lanes; ++k) {
+            npy_intp cand = (npy_intp)lane_idx[k];
+            if ((lane_val[k] > best) ||
+                (lane_val[k] == best && cand < best_idx)) {
+                best = lane_val[k];
+                best_idx = cand;
+            }
+        }
+    }
+
+    // scalar loop: the unvectorized tail, or the whole input when short.
+    // Integer lanes carry no NaN, so no NaN early-exit is needed.
+    while (pos < len) {
+        npyv_lanetype_u64 x = ip[pos];
+        if (x > best) {
+            best = x;
+            best_idx = pos;
+        }
+        ++pos;
+    }
+    return best_idx;
+}
+
+#line 136
+static inline npy_intp
+simd_argmin_u64(npyv_lanetype_u64 *ip, npy_intp len)
+{
+    // Returns the index of the first occurrence of the smallest element
+    // in ip[0 .. len).
+    // NOTE(review): ip[0] is read before any length check, so len >= 1 is
+    // assumed here -- confirm against the callers.
+    const int lanes = npyv_nlanes_u64;
+    const int block = lanes*4;
+    npyv_lanetype_u64 best = ip[0];
+    npy_intp best_idx = 0;
+    npy_intp pos = 0;
+
+    // inputs shorter than one 4-vector block are left entirely to the
+    // scalar loop at the bottom
+    if (len >= block) {
+        // lane indices are kept in 64-bit lanes, so the vectorized range
+        // is not capped (unlike the 32-bit kernels)
+        // offsets 0 .. block-1, one vector per sub-block of the main loop
+        npyv_lanetype_u64 offsets[npyv_nlanes_u64*4];
+        for (int k = 0; k < block; ++k) {
+            offsets[k] = k;
+        }
+        const npyv_u64 off0 = npyv_load_u64(offsets);
+        const npyv_u64 off1 = npyv_load_u64(offsets + lanes);
+        const npyv_u64 off2 = npyv_load_u64(offsets + lanes*2);
+        const npyv_u64 off3 = npyv_load_u64(offsets + lanes*3);
+        // per-lane running minimum and the index at which it was first seen
+        npyv_u64 v_best = npyv_setall_u64(best);
+        npyv_u64 v_best_idx = npyv_zero_u64();
+
+        // main loop: four vectors per iteration
+        const npy_intp block_end = len & -block;
+        while (pos < block_end) {
+            npyv_u64 base = npyv_setall_u64((npyv_lanetype_u64)pos);
+            npyv_u64 v0 = npyv_load_u64(ip + pos);
+            npyv_u64 v1 = npyv_load_u64(ip + pos + lanes);
+            npyv_u64 v2 = npyv_load_u64(ip + pos + lanes*2);
+            npyv_u64 v3 = npyv_load_u64(ip + pos + lanes*3);
+
+            // strict comparisons: on equal values the operand with the
+            // lower index is kept
+            npyv_b64 take1 = npyv_cmplt_u64(v1, v0);
+            npyv_u64 val01 = npyv_select_u64(take1, v1, v0);
+            npyv_u64 idx01 = npyv_select_u64(take1, off1, off0);
+
+            npyv_b64 take3 = npyv_cmplt_u64(v3, v2);
+            npyv_u64 val23 = npyv_select_u64(take3, v3, v2);
+            npyv_u64 idx23 = npyv_select_u64(take3, off3, off2);
+
+            npyv_b64 take23 = npyv_cmplt_u64(val23, val01);
+            npyv_u64 val = npyv_select_u64(take23, val23, val01);
+            npyv_u64 idx = npyv_select_u64(take23, idx23, idx01);
+
+            // fold the block winner into the accumulators
+            npyv_b64 improve = npyv_cmplt_u64(val, v_best);
+            v_best = npyv_select_u64(improve, val, v_best);
+            v_best_idx = npyv_select_u64(improve, npyv_add_u64(base, idx), v_best_idx);
+
+            pos += block;
+        }
+
+        // remaining whole vectors, one per iteration
+        const npy_intp lane_end = len & -lanes;
+        while (pos < lane_end) {
+            npyv_u64 base = npyv_setall_u64((npyv_lanetype_u64)pos);
+            npyv_u64 v = npyv_load_u64(ip + pos);
+            npyv_b64 improve = npyv_cmplt_u64(v, v_best);
+            v_best = npyv_select_u64(improve, v, v_best);
+            v_best_idx = npyv_select_u64(improve, npyv_add_u64(base, off0), v_best_idx);
+            pos += lanes;
+        }
+
+        // horizontal reduction: the smallest lane value wins, and among
+        // lanes holding that value the lowest recorded index wins
+        npyv_lanetype_u64 lane_val[npyv_nlanes_u64];
+        npyv_lanetype_u64 lane_idx[npyv_nlanes_u64];
+        npyv_store_u64(lane_idx, v_best_idx);
+        npyv_store_u64(lane_val, v_best);
+
+        best = lane_val[0];
+        best_idx = lane_idx[0];
+        for (int k = 1; k < lanes; ++k) {
+            npy_intp cand = (npy_intp)lane_idx[k];
+            if ((lane_val[k] < best) ||
+                (lane_val[k] == best && cand < best_idx)) {
+                best = lane_val[k];
+                best_idx = cand;
+            }
+        }
+    }
+
+    // scalar loop: the unvectorized tail, or the whole input when short.
+    // Integer lanes carry no NaN, so no NaN early-exit is needed.
+    while (pos < len) {
+        npyv_lanetype_u64 x = ip[pos];
+        if (x < best) {
+            best = x;
+            best_idx = pos;
+        }
+        ++pos;
+    }
+    return best_idx;
+}
+
+#endif // chk_simd
+
+#line 129
+#if NPY_SIMD
+#line 136
+static inline npy_intp
+simd_argmax_s64(npyv_lanetype_s64 *ip, npy_intp len)
+{
+    // Returns the index of the first occurrence of the largest element
+    // in ip[0 .. len).
+    // NOTE(review): ip[0] is read before any length check, so len >= 1 is
+    // assumed here -- confirm against the callers.
+    const int lanes = npyv_nlanes_s64;
+    const int block = lanes*4;
+    npyv_lanetype_s64 best = ip[0];
+    npy_intp best_idx = 0;
+    npy_intp pos = 0;
+
+    // inputs shorter than one 4-vector block are left entirely to the
+    // scalar loop at the bottom
+    if (len >= block) {
+        // lane indices are kept in 64-bit lanes, so the vectorized range
+        // is not capped (unlike the 32-bit kernels)
+        // offsets 0 .. block-1, one vector per sub-block of the main loop
+        npyv_lanetype_u64 offsets[npyv_nlanes_s64*4];
+        for (int k = 0; k < block; ++k) {
+            offsets[k] = k;
+        }
+        const npyv_u64 off0 = npyv_load_u64(offsets);
+        const npyv_u64 off1 = npyv_load_u64(offsets + lanes);
+        const npyv_u64 off2 = npyv_load_u64(offsets + lanes*2);
+        const npyv_u64 off3 = npyv_load_u64(offsets + lanes*3);
+        // per-lane running maximum and the index at which it was first seen
+        npyv_s64 v_best = npyv_setall_s64(best);
+        npyv_u64 v_best_idx = npyv_zero_u64();
+
+        // main loop: four vectors per iteration
+        const npy_intp block_end = len & -block;
+        while (pos < block_end) {
+            npyv_u64 base = npyv_setall_u64((npyv_lanetype_u64)pos);
+            npyv_s64 v0 = npyv_load_s64(ip + pos);
+            npyv_s64 v1 = npyv_load_s64(ip + pos + lanes);
+            npyv_s64 v2 = npyv_load_s64(ip + pos + lanes*2);
+            npyv_s64 v3 = npyv_load_s64(ip + pos + lanes*3);
+
+            // strict comparisons: on equal values the operand with the
+            // lower index is kept
+            npyv_b64 take1 = npyv_cmpgt_s64(v1, v0);
+            npyv_s64 val01 = npyv_select_s64(take1, v1, v0);
+            npyv_u64 idx01 = npyv_select_u64(take1, off1, off0);
+
+            npyv_b64 take3 = npyv_cmpgt_s64(v3, v2);
+            npyv_s64 val23 = npyv_select_s64(take3, v3, v2);
+            npyv_u64 idx23 = npyv_select_u64(take3, off3, off2);
+
+            npyv_b64 take23 = npyv_cmpgt_s64(val23, val01);
+            npyv_s64 val = npyv_select_s64(take23, val23, val01);
+            npyv_u64 idx = npyv_select_u64(take23, idx23, idx01);
+
+            // fold the block winner into the accumulators
+            npyv_b64 improve = npyv_cmpgt_s64(val, v_best);
+            v_best = npyv_select_s64(improve, val, v_best);
+            v_best_idx = npyv_select_u64(improve, npyv_add_u64(base, idx), v_best_idx);
+
+            pos += block;
+        }
+
+        // remaining whole vectors, one per iteration
+        const npy_intp lane_end = len & -lanes;
+        while (pos < lane_end) {
+            npyv_u64 base = npyv_setall_u64((npyv_lanetype_u64)pos);
+            npyv_s64 v = npyv_load_s64(ip + pos);
+            npyv_b64 improve = npyv_cmpgt_s64(v, v_best);
+            v_best = npyv_select_s64(improve, v, v_best);
+            v_best_idx = npyv_select_u64(improve, npyv_add_u64(base, off0), v_best_idx);
+            pos += lanes;
+        }
+
+        // horizontal reduction: the largest lane value wins, and among
+        // lanes holding that value the lowest recorded index wins
+        npyv_lanetype_s64 lane_val[npyv_nlanes_s64];
+        npyv_lanetype_u64 lane_idx[npyv_nlanes_s64];
+        npyv_store_u64(lane_idx, v_best_idx);
+        npyv_store_s64(lane_val, v_best);
+
+        best = lane_val[0];
+        best_idx = lane_idx[0];
+        for (int k = 1; k < lanes; ++k) {
+            npy_intp cand = (npy_intp)lane_idx[k];
+            if ((lane_val[k] > best) ||
+                (lane_val[k] == best && cand < best_idx)) {
+                best = lane_val[k];
+                best_idx = cand;
+            }
+        }
+    }
+
+    // scalar loop: the unvectorized tail, or the whole input when short.
+    // Integer lanes carry no NaN, so no NaN early-exit is needed.
+    while (pos < len) {
+        npyv_lanetype_s64 x = ip[pos];
+        if (x > best) {
+            best = x;
+            best_idx = pos;
+        }
+        ++pos;
+    }
+    return best_idx;
+}
+
+#line 136
+static inline npy_intp  // returns the index of the first minimum of ip[0..len)
+simd_argmin_s64(npyv_lanetype_s64 *ip, npy_intp len)
+{
+    npyv_lanetype_s64 s_acc = *ip;  // running minimum, seeded with ip[0] (assumes len >= 1 -- TODO confirm)
+    npy_intp ret_idx = 0, i = 0;  // ret_idx: index of s_acc; i: next unprocessed element
+    const int vstep = npyv_nlanes_s64;  // lanes per vector
+    const int wstep = vstep*4;  // elements consumed per pass of the 4-vector loop
+    // small arrays: a plain scalar loop performs better, so skip the vector path
+    if (len < wstep) {
+        goto scalar_loop;
+    }
+    npy_intp len0 = len;  // upper bound used by the vector loops
+    // guard against wraparound vector addition for 32-bit indices
+    // in case of the array length is larger than 16gb
+#if 0  // compiled out in this variant: the index lanes used below are u64
+    if (len0 > NPY_MAX_UINT32) {
+        len0 = NPY_MAX_UINT32;
+    }
+#endif
+    // per-lane offsets 0..wstep-1, loaded below as four vectors of vstep lanes
+    npyv_lanetype_u64 d_vindices[npyv_nlanes_s64*4];
+    for (int vi = 0; vi < wstep; ++vi) {
+        d_vindices[vi] = vi;
+    }
+    const npyv_u64 vindices_0 = npyv_load_u64(d_vindices);
+    const npyv_u64 vindices_1 = npyv_load_u64(d_vindices + vstep);
+    const npyv_u64 vindices_2 = npyv_load_u64(d_vindices + vstep*2);
+    const npyv_u64 vindices_3 = npyv_load_u64(d_vindices + vstep*3);
+    // initialize vector accumulator for lowest values and its indexes
+    npyv_u64 acc_indices = npyv_zero_u64();  // index 0 in every lane, matching the ip[0] seed
+    npyv_s64 acc = npyv_setall_s64(s_acc);
+    for (npy_intp n = len0 & -wstep; i < n; i += wstep) {  // main loop: four vectors per pass
+        npyv_u64 vi = npyv_setall_u64((npyv_lanetype_u64)i);  // base index of this pass in every lane
+        npyv_s64 a = npyv_load_s64(ip + i);
+        npyv_s64 b = npyv_load_s64(ip + i + vstep);
+        npyv_s64 c = npyv_load_s64(ip + i + vstep*2);
+        npyv_s64 d = npyv_load_s64(ip + i + vstep*3);
+
+        // compare later against earlier with a strict '<' so that, on matched values, the lowest index is kept
+        npyv_b64 m_ba = npyv_cmplt_s64(b, a);
+        npyv_b64 m_dc = npyv_cmplt_s64(d, c);
+        npyv_s64  x_ba = npyv_select_s64(m_ba, b, a);
+        npyv_s64  x_dc = npyv_select_s64(m_dc, d, c);
+        npyv_b64 m_dcba = npyv_cmplt_s64(x_dc, x_ba);
+        npyv_s64  x_dcba = npyv_select_s64(m_dcba, x_dc, x_ba);
+
+        npyv_u64 idx_ba = npyv_select_u64(m_ba, vindices_1, vindices_0);  // the same masks select the matching lane offsets
+        npyv_u64 idx_dc = npyv_select_u64(m_dc, vindices_3, vindices_2);
+        npyv_u64 idx_dcba = npyv_select_u64(m_dcba, idx_dc, idx_ba);
+        npyv_b64 m_acc = npyv_cmplt_s64(x_dcba, acc);  // lanes where this pass beats the accumulator
+        acc = npyv_select_s64(m_acc, x_dcba, acc);
+        acc_indices = npyv_select_u64(m_acc, npyv_add_u64(vi, idx_dcba), acc_indices);
+
+    #if 0  // NaN early-exit; compiled out in this integer variant
+        npyv_b64 nnan_a = npyv_notnan_s64(a);
+        npyv_b64 nnan_b = npyv_notnan_s64(b);
+        npyv_b64 nnan_c = npyv_notnan_s64(c);
+        npyv_b64 nnan_d = npyv_notnan_s64(d);
+        npyv_b64 nnan_ab = npyv_and_b64(nnan_a, nnan_b);
+        npyv_b64 nnan_cd = npyv_and_b64(nnan_c, nnan_d);
+        npy_uint64 nnan = npyv_tobits_b64(npyv_and_b64(nnan_ab, nnan_cd));
+        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
+            npy_uint64 nnan_4[4];
+            nnan_4[0] = npyv_tobits_b64(nnan_a);
+            nnan_4[1] = npyv_tobits_b64(nnan_b);
+            nnan_4[2] = npyv_tobits_b64(nnan_c);
+            nnan_4[3] = npyv_tobits_b64(nnan_d);
+            for (int ni = 0; ni < 4; ++ni) {
+                for (int vi = 0; vi < vstep; ++vi) {
+                    if (!((nnan_4[ni] >> vi) & 1)) {
+                        return i + ni*vstep + vi;
+                    }
+                }
+            }
+        }
+    #endif
+    }
+    for (npy_intp n = len0 & -vstep; i < n; i += vstep) {  // leftover whole vectors, one per pass
+        npyv_u64 vi = npyv_setall_u64((npyv_lanetype_u64)i);
+        npyv_s64 a = npyv_load_s64(ip + i);
+        npyv_b64 m_acc = npyv_cmplt_s64(a, acc);
+        acc = npyv_select_s64(m_acc, a, acc);
+        acc_indices = npyv_select_u64(m_acc, npyv_add_u64(vi, vindices_0), acc_indices);
+    #if 0  // NaN early-exit; compiled out in this integer variant
+        npyv_b64 nnan_a = npyv_notnan_s64(a);
+        npy_uint64 nnan = npyv_tobits_b64(nnan_a);
+        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
+            for (int vi = 0; vi < vstep; ++vi) {
+                if (!((nnan >> vi) & 1)) {
+                    return i + vi;
+                }
+            }
+        }
+    #endif
+    }
+
+    // reduce: spill the per-lane minima and their indices, then pick the best lane
+    npyv_lanetype_s64 dacc[npyv_nlanes_s64];
+    npyv_lanetype_u64 dacc_i[npyv_nlanes_s64];
+    npyv_store_u64(dacc_i, acc_indices);
+    npyv_store_s64(dacc, acc);
+
+    s_acc = dacc[0];
+    ret_idx = dacc_i[0];
+    for (int vi = 1; vi < vstep; ++vi) {
+        if (dacc[vi] < s_acc) {
+            s_acc = dacc[vi];
+            ret_idx = (npy_intp)dacc_i[vi];
+        }
+    }
+    // several lanes may hold the same minimum: keep the lowest index among them
+    for (int vi = 0; vi < vstep; ++vi) {
+        if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) {
+            ret_idx = dacc_i[vi];
+        }
+    }
+scalar_loop:  // elements the vector loops did not cover (all of them when len < wstep)
+    for (; i < len; ++i) {
+        npyv_lanetype_s64 a = ip[i];
+    #if 0  // NaN-aware comparison; compiled out in this integer variant
+        if (!(a >= s_acc)) {  // negated, for correct nan handling
+    #else
+        if (a < s_acc) {  // strict compare keeps the earliest index on ties
+    #endif
+            s_acc = a;
+            ret_idx = i;
+        #if 0
+            if (npy_isnan(s_acc)) {
+                // nan encountered, it takes precedence: stop at its index
+                return ret_idx;
+            }
+        #endif
+        }
+    }
+    return ret_idx;
+}
+
+#endif // chk_simd
+
+#line 129
+#if NPY_SIMD_F32
+#line 136
+static inline npy_intp  // returns the index of the first maximum of ip[0..len), or of the first NaN if any
+simd_argmax_f32(npyv_lanetype_f32 *ip, npy_intp len)
+{
+    npyv_lanetype_f32 s_acc = *ip;  // running maximum, seeded with ip[0] (assumes len >= 1 -- TODO confirm)
+    npy_intp ret_idx = 0, i = 0;  // ret_idx: index of s_acc; i: next unprocessed element
+    const int vstep = npyv_nlanes_f32;  // lanes per vector
+    const int wstep = vstep*4;  // elements consumed per pass of the 4-vector loop
+    // small arrays: a plain scalar loop performs better, so skip the vector path
+    if (len < wstep) {
+        goto scalar_loop;
+    }
+    npy_intp len0 = len;  // upper bound used by the vector loops
+    // guard against wraparound vector addition for 32-bit indices
+    // in case of the array length is larger than 16gb
+#if 1  // active here: indices live in u32 lanes; elements past len0 fall to the scalar loop
+    if (len0 > NPY_MAX_UINT32) {
+        len0 = NPY_MAX_UINT32;
+    }
+#endif
+    // per-lane offsets 0..wstep-1, loaded below as four vectors of vstep lanes
+    npyv_lanetype_u32 d_vindices[npyv_nlanes_f32*4];
+    for (int vi = 0; vi < wstep; ++vi) {
+        d_vindices[vi] = vi;
+    }
+    const npyv_u32 vindices_0 = npyv_load_u32(d_vindices);
+    const npyv_u32 vindices_1 = npyv_load_u32(d_vindices + vstep);
+    const npyv_u32 vindices_2 = npyv_load_u32(d_vindices + vstep*2);
+    const npyv_u32 vindices_3 = npyv_load_u32(d_vindices + vstep*3);
+    // initialize vector accumulator for highest values and its indexes
+    npyv_u32 acc_indices = npyv_zero_u32();  // index 0 in every lane, matching the ip[0] seed
+    npyv_f32 acc = npyv_setall_f32(s_acc);
+    for (npy_intp n = len0 & -wstep; i < n; i += wstep) {  // main loop: four vectors per pass
+        npyv_u32 vi = npyv_setall_u32((npyv_lanetype_u32)i);  // base index of this pass in every lane
+        npyv_f32 a = npyv_load_f32(ip + i);
+        npyv_f32 b = npyv_load_f32(ip + i + vstep);
+        npyv_f32 c = npyv_load_f32(ip + i + vstep*2);
+        npyv_f32 d = npyv_load_f32(ip + i + vstep*3);
+
+        // compare later against earlier with a strict '>' so that, on matched values, the lowest index is kept
+        npyv_b32 m_ba = npyv_cmpgt_f32(b, a);
+        npyv_b32 m_dc = npyv_cmpgt_f32(d, c);
+        npyv_f32  x_ba = npyv_select_f32(m_ba, b, a);
+        npyv_f32  x_dc = npyv_select_f32(m_dc, d, c);
+        npyv_b32 m_dcba = npyv_cmpgt_f32(x_dc, x_ba);
+        npyv_f32  x_dcba = npyv_select_f32(m_dcba, x_dc, x_ba);
+
+        npyv_u32 idx_ba = npyv_select_u32(m_ba, vindices_1, vindices_0);  // the same masks select the matching lane offsets
+        npyv_u32 idx_dc = npyv_select_u32(m_dc, vindices_3, vindices_2);
+        npyv_u32 idx_dcba = npyv_select_u32(m_dcba, idx_dc, idx_ba);
+        npyv_b32 m_acc = npyv_cmpgt_f32(x_dcba, acc);  // lanes where this pass beats the accumulator
+        acc = npyv_select_f32(m_acc, x_dcba, acc);
+        acc_indices = npyv_select_u32(m_acc, npyv_add_u32(vi, idx_dcba), acc_indices);
+
+    #if 1  // NaN early-exit: return the index of the first NaN found in this pass
+        npyv_b32 nnan_a = npyv_notnan_f32(a);
+        npyv_b32 nnan_b = npyv_notnan_f32(b);
+        npyv_b32 nnan_c = npyv_notnan_f32(c);
+        npyv_b32 nnan_d = npyv_notnan_f32(d);
+        npyv_b32 nnan_ab = npyv_and_b32(nnan_a, nnan_b);
+        npyv_b32 nnan_cd = npyv_and_b32(nnan_c, nnan_d);
+        npy_uint64 nnan = npyv_tobits_b32(npyv_and_b32(nnan_ab, nnan_cd));
+        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {  // not all vstep lane bits set: some lane is NaN
+            npy_uint64 nnan_4[4];
+            nnan_4[0] = npyv_tobits_b32(nnan_a);
+            nnan_4[1] = npyv_tobits_b32(nnan_b);
+            nnan_4[2] = npyv_tobits_b32(nnan_c);
+            nnan_4[3] = npyv_tobits_b32(nnan_d);
+            for (int ni = 0; ni < 4; ++ni) {  // scan a, b, c, d in memory order
+                for (int vi = 0; vi < vstep; ++vi) {  // note: shadows the index vector 'vi' above
+                    if (!((nnan_4[ni] >> vi) & 1)) {
+                        return i + ni*vstep + vi;
+                    }
+                }
+            }
+        }
+    #endif
+    }
+    for (npy_intp n = len0 & -vstep; i < n; i += vstep) {  // leftover whole vectors, one per pass
+        npyv_u32 vi = npyv_setall_u32((npyv_lanetype_u32)i);
+        npyv_f32 a = npyv_load_f32(ip + i);
+        npyv_b32 m_acc = npyv_cmpgt_f32(a, acc);
+        acc = npyv_select_f32(m_acc, a, acc);
+        acc_indices = npyv_select_u32(m_acc, npyv_add_u32(vi, vindices_0), acc_indices);
+    #if 1  // NaN early-exit: return the index of the first NaN lane in this vector
+        npyv_b32 nnan_a = npyv_notnan_f32(a);
+        npy_uint64 nnan = npyv_tobits_b32(nnan_a);
+        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
+            for (int vi = 0; vi < vstep; ++vi) {  // note: shadows the index vector 'vi' above
+                if (!((nnan >> vi) & 1)) {
+                    return i + vi;
+                }
+            }
+        }
+    #endif
+    }
+
+    // reduce: spill the per-lane maxima and their indices, then pick the best lane
+    npyv_lanetype_f32 dacc[npyv_nlanes_f32];
+    npyv_lanetype_u32 dacc_i[npyv_nlanes_f32];
+    npyv_store_u32(dacc_i, acc_indices);
+    npyv_store_f32(dacc, acc);
+
+    s_acc = dacc[0];
+    ret_idx = dacc_i[0];
+    for (int vi = 1; vi < vstep; ++vi) {
+        if (dacc[vi] > s_acc) {
+            s_acc = dacc[vi];
+            ret_idx = (npy_intp)dacc_i[vi];
+        }
+    }
+    // several lanes may hold the same maximum: keep the lowest index among them
+    for (int vi = 0; vi < vstep; ++vi) {
+        if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) {
+            ret_idx = dacc_i[vi];
+        }
+    }
+scalar_loop:  // elements the vector loops did not cover (all when len < wstep; also anything past the len0 clamp)
+    for (; i < len; ++i) {
+        npyv_lanetype_f32 a = ip[i];
+    #if 1
+        if (!(a <= s_acc)) {  // negated, for correct nan handling
+    #else
+        if (a > s_acc) {
+    #endif
+            s_acc = a;
+            ret_idx = i;
+        #if 1
+            if (npy_isnan(s_acc)) {
+                // nan encountered, it's maximal: stop at its index
+                return ret_idx;
+            }
+        #endif
+        }
+    }
+    return ret_idx;
+}
+
+#line 136
+static inline npy_intp  // returns the index of the first minimum of ip[0..len), or of the first NaN if any
+simd_argmin_f32(npyv_lanetype_f32 *ip, npy_intp len)
+{
+    npyv_lanetype_f32 s_acc = *ip;  // running minimum, seeded with ip[0] (assumes len >= 1 -- TODO confirm)
+    npy_intp ret_idx = 0, i = 0;  // ret_idx: index of s_acc; i: next unprocessed element
+    const int vstep = npyv_nlanes_f32;  // lanes per vector
+    const int wstep = vstep*4;  // elements consumed per pass of the 4-vector loop
+    // small arrays: a plain scalar loop performs better, so skip the vector path
+    if (len < wstep) {
+        goto scalar_loop;
+    }
+    npy_intp len0 = len;  // upper bound used by the vector loops
+    // guard against wraparound vector addition for 32-bit indices
+    // in case of the array length is larger than 16gb
+#if 1  // active here: indices live in u32 lanes; elements past len0 fall to the scalar loop
+    if (len0 > NPY_MAX_UINT32) {
+        len0 = NPY_MAX_UINT32;
+    }
+#endif
+    // per-lane offsets 0..wstep-1, loaded below as four vectors of vstep lanes
+    npyv_lanetype_u32 d_vindices[npyv_nlanes_f32*4];
+    for (int vi = 0; vi < wstep; ++vi) {
+        d_vindices[vi] = vi;
+    }
+    const npyv_u32 vindices_0 = npyv_load_u32(d_vindices);
+    const npyv_u32 vindices_1 = npyv_load_u32(d_vindices + vstep);
+    const npyv_u32 vindices_2 = npyv_load_u32(d_vindices + vstep*2);
+    const npyv_u32 vindices_3 = npyv_load_u32(d_vindices + vstep*3);
+    // initialize vector accumulator for lowest values and its indexes
+    npyv_u32 acc_indices = npyv_zero_u32();  // index 0 in every lane, matching the ip[0] seed
+    npyv_f32 acc = npyv_setall_f32(s_acc);
+    for (npy_intp n = len0 & -wstep; i < n; i += wstep) {  // main loop: four vectors per pass
+        npyv_u32 vi = npyv_setall_u32((npyv_lanetype_u32)i);  // base index of this pass in every lane
+        npyv_f32 a = npyv_load_f32(ip + i);
+        npyv_f32 b = npyv_load_f32(ip + i + vstep);
+        npyv_f32 c = npyv_load_f32(ip + i + vstep*2);
+        npyv_f32 d = npyv_load_f32(ip + i + vstep*3);
+
+        // compare later against earlier with a strict '<' so that, on matched values, the lowest index is kept
+        npyv_b32 m_ba = npyv_cmplt_f32(b, a);
+        npyv_b32 m_dc = npyv_cmplt_f32(d, c);
+        npyv_f32  x_ba = npyv_select_f32(m_ba, b, a);
+        npyv_f32  x_dc = npyv_select_f32(m_dc, d, c);
+        npyv_b32 m_dcba = npyv_cmplt_f32(x_dc, x_ba);
+        npyv_f32  x_dcba = npyv_select_f32(m_dcba, x_dc, x_ba);
+
+        npyv_u32 idx_ba = npyv_select_u32(m_ba, vindices_1, vindices_0);  // the same masks select the matching lane offsets
+        npyv_u32 idx_dc = npyv_select_u32(m_dc, vindices_3, vindices_2);
+        npyv_u32 idx_dcba = npyv_select_u32(m_dcba, idx_dc, idx_ba);
+        npyv_b32 m_acc = npyv_cmplt_f32(x_dcba, acc);  // lanes where this pass beats the accumulator
+        acc = npyv_select_f32(m_acc, x_dcba, acc);
+        acc_indices = npyv_select_u32(m_acc, npyv_add_u32(vi, idx_dcba), acc_indices);
+
+    #if 1  // NaN early-exit: return the index of the first NaN found in this pass
+        npyv_b32 nnan_a = npyv_notnan_f32(a);
+        npyv_b32 nnan_b = npyv_notnan_f32(b);
+        npyv_b32 nnan_c = npyv_notnan_f32(c);
+        npyv_b32 nnan_d = npyv_notnan_f32(d);
+        npyv_b32 nnan_ab = npyv_and_b32(nnan_a, nnan_b);
+        npyv_b32 nnan_cd = npyv_and_b32(nnan_c, nnan_d);
+        npy_uint64 nnan = npyv_tobits_b32(npyv_and_b32(nnan_ab, nnan_cd));
+        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {  // not all vstep lane bits set: some lane is NaN
+            npy_uint64 nnan_4[4];
+            nnan_4[0] = npyv_tobits_b32(nnan_a);
+            nnan_4[1] = npyv_tobits_b32(nnan_b);
+            nnan_4[2] = npyv_tobits_b32(nnan_c);
+            nnan_4[3] = npyv_tobits_b32(nnan_d);
+            for (int ni = 0; ni < 4; ++ni) {  // scan a, b, c, d in memory order
+                for (int vi = 0; vi < vstep; ++vi) {  // note: shadows the index vector 'vi' above
+                    if (!((nnan_4[ni] >> vi) & 1)) {
+                        return i + ni*vstep + vi;
+                    }
+                }
+            }
+        }
+    #endif
+    }
+    for (npy_intp n = len0 & -vstep; i < n; i += vstep) {  // leftover whole vectors, one per pass
+        npyv_u32 vi = npyv_setall_u32((npyv_lanetype_u32)i);
+        npyv_f32 a = npyv_load_f32(ip + i);
+        npyv_b32 m_acc = npyv_cmplt_f32(a, acc);
+        acc = npyv_select_f32(m_acc, a, acc);
+        acc_indices = npyv_select_u32(m_acc, npyv_add_u32(vi, vindices_0), acc_indices);
+    #if 1  // NaN early-exit: return the index of the first NaN lane in this vector
+        npyv_b32 nnan_a = npyv_notnan_f32(a);
+        npy_uint64 nnan = npyv_tobits_b32(nnan_a);
+        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
+            for (int vi = 0; vi < vstep; ++vi) {  // note: shadows the index vector 'vi' above
+                if (!((nnan >> vi) & 1)) {
+                    return i + vi;
+                }
+            }
+        }
+    #endif
+    }
+
+    // reduce: spill the per-lane minima and their indices, then pick the best lane
+    npyv_lanetype_f32 dacc[npyv_nlanes_f32];
+    npyv_lanetype_u32 dacc_i[npyv_nlanes_f32];
+    npyv_store_u32(dacc_i, acc_indices);
+    npyv_store_f32(dacc, acc);
+
+    s_acc = dacc[0];
+    ret_idx = dacc_i[0];
+    for (int vi = 1; vi < vstep; ++vi) {
+        if (dacc[vi] < s_acc) {
+            s_acc = dacc[vi];
+            ret_idx = (npy_intp)dacc_i[vi];
+        }
+    }
+    // several lanes may hold the same minimum: keep the lowest index among them
+    for (int vi = 0; vi < vstep; ++vi) {
+        if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) {
+            ret_idx = dacc_i[vi];
+        }
+    }
+scalar_loop:  // elements the vector loops did not cover (all when len < wstep; also anything past the len0 clamp)
+    for (; i < len; ++i) {
+        npyv_lanetype_f32 a = ip[i];
+    #if 1
+        if (!(a >= s_acc)) {  // negated, for correct nan handling
+    #else
+        if (a < s_acc) {
+    #endif
+            s_acc = a;
+            ret_idx = i;
+        #if 1
+            if (npy_isnan(s_acc)) {
+                // nan encountered, it takes precedence: stop at its index
+                return ret_idx;
+            }
+        #endif
+        }
+    }
+    return ret_idx;
+}
+
+#endif // chk_simd
+
+#line 129
+#if NPY_SIMD_F64
+#line 136
+static inline npy_intp  // returns the index of the first maximum of ip[0..len), or of the first NaN if any
+simd_argmax_f64(npyv_lanetype_f64 *ip, npy_intp len)
+{
+    npyv_lanetype_f64 s_acc = *ip;  // running maximum, seeded with ip[0] (assumes len >= 1 -- TODO confirm)
+    npy_intp ret_idx = 0, i = 0;  // ret_idx: index of s_acc; i: next unprocessed element
+    const int vstep = npyv_nlanes_f64;  // lanes per vector
+    const int wstep = vstep*4;  // elements consumed per pass of the 4-vector loop
+    // small arrays: a plain scalar loop performs better, so skip the vector path
+    if (len < wstep) {
+        goto scalar_loop;
+    }
+    npy_intp len0 = len;  // upper bound used by the vector loops
+    // guard against wraparound vector addition for 32-bit indices
+    // in case of the array length is larger than 16gb
+#if 0  // compiled out in this variant: the index lanes used below are u64
+    if (len0 > NPY_MAX_UINT32) {
+        len0 = NPY_MAX_UINT32;
+    }
+#endif
+    // per-lane offsets 0..wstep-1, loaded below as four vectors of vstep lanes
+    npyv_lanetype_u64 d_vindices[npyv_nlanes_f64*4];
+    for (int vi = 0; vi < wstep; ++vi) {
+        d_vindices[vi] = vi;
+    }
+    const npyv_u64 vindices_0 = npyv_load_u64(d_vindices);
+    const npyv_u64 vindices_1 = npyv_load_u64(d_vindices + vstep);
+    const npyv_u64 vindices_2 = npyv_load_u64(d_vindices + vstep*2);
+    const npyv_u64 vindices_3 = npyv_load_u64(d_vindices + vstep*3);
+    // initialize vector accumulator for highest values and its indexes
+    npyv_u64 acc_indices = npyv_zero_u64();  // index 0 in every lane, matching the ip[0] seed
+    npyv_f64 acc = npyv_setall_f64(s_acc);
+    for (npy_intp n = len0 & -wstep; i < n; i += wstep) {  // main loop: four vectors per pass
+        npyv_u64 vi = npyv_setall_u64((npyv_lanetype_u64)i);  // base index of this pass in every lane
+        npyv_f64 a = npyv_load_f64(ip + i);
+        npyv_f64 b = npyv_load_f64(ip + i + vstep);
+        npyv_f64 c = npyv_load_f64(ip + i + vstep*2);
+        npyv_f64 d = npyv_load_f64(ip + i + vstep*3);
+
+        // compare later against earlier with a strict '>' so that, on matched values, the lowest index is kept
+        npyv_b64 m_ba = npyv_cmpgt_f64(b, a);
+        npyv_b64 m_dc = npyv_cmpgt_f64(d, c);
+        npyv_f64  x_ba = npyv_select_f64(m_ba, b, a);
+        npyv_f64  x_dc = npyv_select_f64(m_dc, d, c);
+        npyv_b64 m_dcba = npyv_cmpgt_f64(x_dc, x_ba);
+        npyv_f64  x_dcba = npyv_select_f64(m_dcba, x_dc, x_ba);
+
+        npyv_u64 idx_ba = npyv_select_u64(m_ba, vindices_1, vindices_0);  // the same masks select the matching lane offsets
+        npyv_u64 idx_dc = npyv_select_u64(m_dc, vindices_3, vindices_2);
+        npyv_u64 idx_dcba = npyv_select_u64(m_dcba, idx_dc, idx_ba);
+        npyv_b64 m_acc = npyv_cmpgt_f64(x_dcba, acc);  // lanes where this pass beats the accumulator
+        acc = npyv_select_f64(m_acc, x_dcba, acc);
+        acc_indices = npyv_select_u64(m_acc, npyv_add_u64(vi, idx_dcba), acc_indices);
+
+    #if 1  // NaN early-exit: return the index of the first NaN found in this pass
+        npyv_b64 nnan_a = npyv_notnan_f64(a);
+        npyv_b64 nnan_b = npyv_notnan_f64(b);
+        npyv_b64 nnan_c = npyv_notnan_f64(c);
+        npyv_b64 nnan_d = npyv_notnan_f64(d);
+        npyv_b64 nnan_ab = npyv_and_b64(nnan_a, nnan_b);
+        npyv_b64 nnan_cd = npyv_and_b64(nnan_c, nnan_d);
+        npy_uint64 nnan = npyv_tobits_b64(npyv_and_b64(nnan_ab, nnan_cd));
+        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {  // not all vstep lane bits set: some lane is NaN
+            npy_uint64 nnan_4[4];
+            nnan_4[0] = npyv_tobits_b64(nnan_a);
+            nnan_4[1] = npyv_tobits_b64(nnan_b);
+            nnan_4[2] = npyv_tobits_b64(nnan_c);
+            nnan_4[3] = npyv_tobits_b64(nnan_d);
+            for (int ni = 0; ni < 4; ++ni) {  // scan a, b, c, d in memory order
+                for (int vi = 0; vi < vstep; ++vi) {  // note: shadows the index vector 'vi' above
+                    if (!((nnan_4[ni] >> vi) & 1)) {
+                        return i + ni*vstep + vi;
+                    }
+                }
+            }
+        }
+    #endif
+    }
+    for (npy_intp n = len0 & -vstep; i < n; i += vstep) {  // leftover whole vectors, one per pass
+        npyv_u64 vi = npyv_setall_u64((npyv_lanetype_u64)i);
+        npyv_f64 a = npyv_load_f64(ip + i);
+        npyv_b64 m_acc = npyv_cmpgt_f64(a, acc);
+        acc = npyv_select_f64(m_acc, a, acc);
+        acc_indices = npyv_select_u64(m_acc, npyv_add_u64(vi, vindices_0), acc_indices);
+    #if 1  // NaN early-exit: return the index of the first NaN lane in this vector
+        npyv_b64 nnan_a = npyv_notnan_f64(a);
+        npy_uint64 nnan = npyv_tobits_b64(nnan_a);
+        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
+            for (int vi = 0; vi < vstep; ++vi) {  // note: shadows the index vector 'vi' above
+                if (!((nnan >> vi) & 1)) {
+                    return i + vi;
+                }
+            }
+        }
+    #endif
+    }
+
+    // reduce: spill the per-lane maxima and their indices, then pick the best lane
+    npyv_lanetype_f64 dacc[npyv_nlanes_f64];
+    npyv_lanetype_u64 dacc_i[npyv_nlanes_f64];
+    npyv_store_u64(dacc_i, acc_indices);
+    npyv_store_f64(dacc, acc);
+
+    s_acc = dacc[0];
+    ret_idx = dacc_i[0];
+    for (int vi = 1; vi < vstep; ++vi) {
+        if (dacc[vi] > s_acc) {
+            s_acc = dacc[vi];
+            ret_idx = (npy_intp)dacc_i[vi];
+        }
+    }
+    // several lanes may hold the same maximum: keep the lowest index among them
+    for (int vi = 0; vi < vstep; ++vi) {
+        if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) {
+            ret_idx = dacc_i[vi];
+        }
+    }
+scalar_loop:  // elements the vector loops did not cover (all of them when len < wstep)
+    for (; i < len; ++i) {
+        npyv_lanetype_f64 a = ip[i];
+    #if 1
+        if (!(a <= s_acc)) {  // negated, for correct nan handling
+    #else
+        if (a > s_acc) {
+    #endif
+            s_acc = a;
+            ret_idx = i;
+        #if 1
+            if (npy_isnan(s_acc)) {
+                // nan encountered, it's maximal: stop at its index
+                return ret_idx;
+            }
+        #endif
+        }
+    }
+    return ret_idx;
+}
+
+#line 136
+static inline npy_intp  // returns the index of the first minimum of ip[0..len), or of the first NaN if any
+simd_argmin_f64(npyv_lanetype_f64 *ip, npy_intp len)
+{
+    npyv_lanetype_f64 s_acc = *ip;  // running minimum, seeded with ip[0] (assumes len >= 1 -- TODO confirm)
+    npy_intp ret_idx = 0, i = 0;  // ret_idx: index of s_acc; i: next unprocessed element
+    const int vstep = npyv_nlanes_f64;  // lanes per vector
+    const int wstep = vstep*4;  // elements consumed per pass of the 4-vector loop
+    // small arrays: a plain scalar loop performs better, so skip the vector path
+    if (len < wstep) {
+        goto scalar_loop;
+    }
+    npy_intp len0 = len;  // upper bound used by the vector loops
+    // guard against wraparound vector addition for 32-bit indices
+    // in case of the array length is larger than 16gb
+#if 0  // compiled out in this variant: the index lanes used below are u64
+    if (len0 > NPY_MAX_UINT32) {
+        len0 = NPY_MAX_UINT32;
+    }
+#endif
+    // per-lane offsets 0..wstep-1, loaded below as four vectors of vstep lanes
+    npyv_lanetype_u64 d_vindices[npyv_nlanes_f64*4];
+    for (int vi = 0; vi < wstep; ++vi) {
+        d_vindices[vi] = vi;
+    }
+    const npyv_u64 vindices_0 = npyv_load_u64(d_vindices);
+    const npyv_u64 vindices_1 = npyv_load_u64(d_vindices + vstep);
+    const npyv_u64 vindices_2 = npyv_load_u64(d_vindices + vstep*2);
+    const npyv_u64 vindices_3 = npyv_load_u64(d_vindices + vstep*3);
+    // initialize vector accumulator for lowest values and its indexes
+    npyv_u64 acc_indices = npyv_zero_u64();  // index 0 in every lane, matching the ip[0] seed
+    npyv_f64 acc = npyv_setall_f64(s_acc);
+    for (npy_intp n = len0 & -wstep; i < n; i += wstep) {  // main loop: four vectors per pass
+        npyv_u64 vi = npyv_setall_u64((npyv_lanetype_u64)i);  // base index of this pass in every lane
+        npyv_f64 a = npyv_load_f64(ip + i);
+        npyv_f64 b = npyv_load_f64(ip + i + vstep);
+        npyv_f64 c = npyv_load_f64(ip + i + vstep*2);
+        npyv_f64 d = npyv_load_f64(ip + i + vstep*3);
+
+        // compare later against earlier with a strict '<' so that, on matched values, the lowest index is kept
+        npyv_b64 m_ba = npyv_cmplt_f64(b, a);
+        npyv_b64 m_dc = npyv_cmplt_f64(d, c);
+        npyv_f64  x_ba = npyv_select_f64(m_ba, b, a);
+        npyv_f64  x_dc = npyv_select_f64(m_dc, d, c);
+        npyv_b64 m_dcba = npyv_cmplt_f64(x_dc, x_ba);
+        npyv_f64  x_dcba = npyv_select_f64(m_dcba, x_dc, x_ba);
+
+        npyv_u64 idx_ba = npyv_select_u64(m_ba, vindices_1, vindices_0);  // the same masks select the matching lane offsets
+        npyv_u64 idx_dc = npyv_select_u64(m_dc, vindices_3, vindices_2);
+        npyv_u64 idx_dcba = npyv_select_u64(m_dcba, idx_dc, idx_ba);
+        npyv_b64 m_acc = npyv_cmplt_f64(x_dcba, acc);  // lanes where this pass beats the accumulator
+        acc = npyv_select_f64(m_acc, x_dcba, acc);
+        acc_indices = npyv_select_u64(m_acc, npyv_add_u64(vi, idx_dcba), acc_indices);
+
+    #if 1  // NaN early-exit: return the index of the first NaN found in this pass
+        npyv_b64 nnan_a = npyv_notnan_f64(a);
+        npyv_b64 nnan_b = npyv_notnan_f64(b);
+        npyv_b64 nnan_c = npyv_notnan_f64(c);
+        npyv_b64 nnan_d = npyv_notnan_f64(d);
+        npyv_b64 nnan_ab = npyv_and_b64(nnan_a, nnan_b);
+        npyv_b64 nnan_cd = npyv_and_b64(nnan_c, nnan_d);
+        npy_uint64 nnan = npyv_tobits_b64(npyv_and_b64(nnan_ab, nnan_cd));
+        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {  // not all vstep lane bits set: some lane is NaN
+            npy_uint64 nnan_4[4];
+            nnan_4[0] = npyv_tobits_b64(nnan_a);
+            nnan_4[1] = npyv_tobits_b64(nnan_b);
+            nnan_4[2] = npyv_tobits_b64(nnan_c);
+            nnan_4[3] = npyv_tobits_b64(nnan_d);
+            for (int ni = 0; ni < 4; ++ni) {  // scan a, b, c, d in memory order
+                for (int vi = 0; vi < vstep; ++vi) {  // note: shadows the index vector 'vi' above
+                    if (!((nnan_4[ni] >> vi) & 1)) {
+                        return i + ni*vstep + vi;
+                    }
+                }
+            }
+        }
+    #endif
+    }
+    for (npy_intp n = len0 & -vstep; i < n; i += vstep) {  // leftover whole vectors, one per pass
+        npyv_u64 vi = npyv_setall_u64((npyv_lanetype_u64)i);
+        npyv_f64 a = npyv_load_f64(ip + i);
+        npyv_b64 m_acc = npyv_cmplt_f64(a, acc);
+        acc = npyv_select_f64(m_acc, a, acc);
+        acc_indices = npyv_select_u64(m_acc, npyv_add_u64(vi, vindices_0), acc_indices);
+    #if 1  // NaN early-exit: return the index of the first NaN lane in this vector
+        npyv_b64 nnan_a = npyv_notnan_f64(a);
+        npy_uint64 nnan = npyv_tobits_b64(nnan_a);
+        if ((unsigned long long int)nnan != ((1ULL << vstep) - 1)) {
+            for (int vi = 0; vi < vstep; ++vi) {  // note: shadows the index vector 'vi' above
+                if (!((nnan >> vi) & 1)) {
+                    return i + vi;
+                }
+            }
+        }
+    #endif
+    }
+
+    // reduce: spill the per-lane minima and their indices, then pick the best lane
+    npyv_lanetype_f64 dacc[npyv_nlanes_f64];
+    npyv_lanetype_u64 dacc_i[npyv_nlanes_f64];
+    npyv_store_u64(dacc_i, acc_indices);
+    npyv_store_f64(dacc, acc);
+
+    s_acc = dacc[0];
+    ret_idx = dacc_i[0];
+    for (int vi = 1; vi < vstep; ++vi) {
+        if (dacc[vi] < s_acc) {
+            s_acc = dacc[vi];
+            ret_idx = (npy_intp)dacc_i[vi];
+        }
+    }
+    // several lanes may hold the same minimum: keep the lowest index among them
+    for (int vi = 0; vi < vstep; ++vi) {
+        if (s_acc == dacc[vi] && ret_idx > (npy_intp)dacc_i[vi]) {
+            ret_idx = dacc_i[vi];
+        }
+    }
+scalar_loop:  // elements the vector loops did not cover (all of them when len < wstep)
+    for (; i < len; ++i) {
+        npyv_lanetype_f64 a = ip[i];
+    #if 1
+        if (!(a >= s_acc)) {  // negated, for correct nan handling
+    #else
+        if (a < s_acc) {
+    #endif
+            s_acc = a;
+            ret_idx = i;
+        #if 1
+            if (npy_isnan(s_acc)) {
+                // nan encountered, it takes precedence: stop at its index
+                return ret_idx;
+            }
+        #endif
+        }
+    }
+    return ret_idx;
+}
+
+#endif // chk_simd
+
+
+#line 291
+#undef TO_SIMD_SFX  // start clean: stays undefined unless one of the width branches below matches
+#if 0
+#line 296
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 8  // pick the lane suffix from the bit width of the C type
+    #if 0  // float suffix branch; compiled out in this expansion
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // taken: unsigned suffix
+        #define TO_SIMD_SFX(X) X##_u8
+    #else  // signed suffix; not reachable after the '#elif 1' above
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 296
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 16
+    #if 0  // float suffix branch; compiled out in this expansion
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // taken: unsigned suffix
+        #define TO_SIMD_SFX(X) X##_u16
+    #else  // signed suffix; not reachable after the '#elif 1' above
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 296
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 32
+    #if 0  // float suffix branch; compiled out in this expansion
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // taken: unsigned suffix
+        #define TO_SIMD_SFX(X) X##_u32
+    #else  // signed suffix; not reachable after the '#elif 1' above
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 296
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 64
+    #if 0  // float suffix branch; compiled out in this expansion
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // taken: unsigned suffix
+        #define TO_SIMD_SFX(X) X##_u64
+    #else  // signed suffix; not reachable after the '#elif 1' above
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif  // the UBYTE wrappers below test '#ifdef TO_SIMD_SFX' to choose SIMD vs scalar code
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_argmax)  // stores the argmax index of ip[0..n) in *mindx; always returns 0
+(npy_ubyte *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+#if 0  // NaN shortcut on the first element; compiled out in this variant
+    if (npy_isnan(*ip)) {
+        // nan encountered; it's maximal|minimal
+        *mindx = 0;
+        return 0;
+    }
+#endif
+#ifdef TO_SIMD_SFX  // a SIMD suffix was selected: delegate to the matching simd_argmax_* kernel
+    *mindx = TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#else  // scalar fallback
+    npy_ubyte mp = *ip;  // best value so far, seeded with ip[0] (assumes n >= 1 -- TODO confirm)
+    *mindx = 0;
+    npy_intp i = 1;
+
+    for (; i < n; ++i) {
+        npy_ubyte a = ip[i];
+    #if 0  // NaN-aware comparison; compiled out in this variant
+        if (!(a <= mp)) {  // negated, for correct nan handling
+    #else
+        if (a > mp) {  // strict compare keeps the earliest index on ties
+    #endif
+            mp = a;
+            *mindx = i;
+        #if 0
+            if (npy_isnan(mp)) {
+                // nan encountered, it's maximal|minimal
+                break;
+            }
+        #endif
+        }
+    }
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_argmin)  // stores the argmin index of ip[0..n) in *mindx; always returns 0
+(npy_ubyte *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+#if 0  // NaN shortcut on the first element; compiled out in this variant
+    if (npy_isnan(*ip)) {
+        // nan encountered; it's maximal|minimal
+        *mindx = 0;
+        return 0;
+    }
+#endif
+#ifdef TO_SIMD_SFX  // a SIMD suffix was selected: delegate to the matching simd_argmin_* kernel
+    *mindx = TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#else  // scalar fallback
+    npy_ubyte mp = *ip;  // best value so far, seeded with ip[0] (assumes n >= 1 -- TODO confirm)
+    *mindx = 0;
+    npy_intp i = 1;
+
+    for (; i < n; ++i) {
+        npy_ubyte a = ip[i];
+    #if 0  // NaN-aware comparison; compiled out in this variant
+        if (!(a >= mp)) {  // negated, for correct nan handling
+    #else
+        if (a < mp) {  // strict compare keeps the earliest index on ties
+    #endif
+            mp = a;
+            *mindx = i;
+        #if 0
+            if (npy_isnan(mp)) {
+                // nan encountered, it's maximal|minimal
+                break;
+            }
+        #endif
+        }
+    }
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+
+#line 291
+#undef TO_SIMD_SFX  // start clean: stays undefined unless one of the width branches below matches
+#if 0
+#line 296
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 8  // pick the lane suffix from the bit width of the C type
+    #if 0  // float suffix branch; compiled out in this expansion
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // taken: unsigned suffix
+        #define TO_SIMD_SFX(X) X##_u8
+    #else  // signed suffix; not reachable after the '#elif 1' above
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 296
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 16
+    #if 0  // float suffix branch; compiled out in this expansion
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // taken: unsigned suffix
+        #define TO_SIMD_SFX(X) X##_u16
+    #else  // signed suffix; not reachable after the '#elif 1' above
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 296
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 32
+    #if 0  // float suffix branch; compiled out in this expansion
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // taken: unsigned suffix
+        #define TO_SIMD_SFX(X) X##_u32
+    #else  // signed suffix; not reachable after the '#elif 1' above
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 296
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 64
+    #if 0  // float suffix branch; compiled out in this expansion
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // taken: unsigned suffix
+        #define TO_SIMD_SFX(X) X##_u64
+    #else  // signed suffix; not reachable after the '#elif 1' above
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif  // the USHORT wrappers below test '#ifdef TO_SIMD_SFX' to choose SIMD vs scalar code
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_argmax)  // stores the argmax index of ip[0..n) in *mindx; always returns 0
+(npy_ushort *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+#if 0  // NaN shortcut on the first element; compiled out in this variant
+    if (npy_isnan(*ip)) {
+        // nan encountered; it's maximal|minimal
+        *mindx = 0;
+        return 0;
+    }
+#endif
+#ifdef TO_SIMD_SFX  // a SIMD suffix was selected: delegate to the matching simd_argmax_* kernel
+    *mindx = TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#else  // scalar fallback
+    npy_ushort mp = *ip;  // best value so far, seeded with ip[0] (assumes n >= 1 -- TODO confirm)
+    *mindx = 0;
+    npy_intp i = 1;
+
+    for (; i < n; ++i) {
+        npy_ushort a = ip[i];
+    #if 0  // NaN-aware comparison; compiled out in this variant
+        if (!(a <= mp)) {  // negated, for correct nan handling
+    #else
+        if (a > mp) {  // strict compare keeps the earliest index on ties
+    #endif
+            mp = a;
+            *mindx = i;
+        #if 0
+            if (npy_isnan(mp)) {
+                // nan encountered, it's maximal|minimal
+                break;
+            }
+        #endif
+        }
+    }
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_argmin)  // stores the argmin index of ip[0..n) in *mindx; always returns 0
+(npy_ushort *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+#if 0  // NaN shortcut on the first element; compiled out in this variant
+    if (npy_isnan(*ip)) {
+        // nan encountered; it's maximal|minimal
+        *mindx = 0;
+        return 0;
+    }
+#endif
+#ifdef TO_SIMD_SFX  // a SIMD suffix was selected: delegate to the matching simd_argmin_* kernel
+    *mindx = TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#else  // scalar fallback
+    npy_ushort mp = *ip;  // best value so far, seeded with ip[0] (assumes n >= 1 -- TODO confirm)
+    *mindx = 0;
+    npy_intp i = 1;
+
+    for (; i < n; ++i) {
+        npy_ushort a = ip[i];
+    #if 0  // NaN-aware comparison; compiled out in this variant
+        if (!(a >= mp)) {  // negated, for correct nan handling
+    #else
+        if (a < mp) {  // strict compare keeps the earliest index on ties
+    #endif
+            mp = a;
+            *mindx = i;
+        #if 0
+            if (npy_isnan(mp)) {
+                // nan encountered, it's maximal|minimal
+                break;
+            }
+        #endif
+        }
+    }
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+
+#line 291
+#undef TO_SIMD_SFX
+// Choose the unsigned lane suffix whose width equals NPY_BITSOF_INT.
+// TO_SIMD_SFX stays undefined when NPY_SIMD is off or no width matches,
+// which makes the routines below use their scalar loops.
+#if NPY_SIMD && NPY_BITSOF_INT == 8
+    #define TO_SIMD_SFX(X) X##_u8
+#elif NPY_SIMD && NPY_BITSOF_INT == 16
+    #define TO_SIMD_SFX(X) X##_u16
+#elif NPY_SIMD && NPY_BITSOF_INT == 32
+    #define TO_SIMD_SFX(X) X##_u32
+#elif NPY_SIMD && NPY_BITSOF_INT == 64
+    #define TO_SIMD_SFX(X) X##_u64
+#endif
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_argmax)
+(npy_uint *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the largest of the n values at ip and
+     * return 0. The scalar path reads ip[0] unconditionally, so callers are
+     * presumably expected to pass n >= 1.
+     */
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: the strict comparison keeps the earliest index on ties.
+    npy_uint best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_uint cur = ip[pos];
+        if (cur > best) {
+            best = cur;
+            *mindx = pos;
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_argmin)
+(npy_uint *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the smallest of the n values at ip and
+     * return 0. The scalar path reads ip[0] unconditionally, so callers are
+     * presumably expected to pass n >= 1.
+     */
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: the strict comparison keeps the earliest index on ties.
+    npy_uint best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_uint cur = ip[pos];
+        if (cur < best) {
+            best = cur;
+            *mindx = pos;
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+
+#line 291
+#undef TO_SIMD_SFX
+// Choose the unsigned lane suffix whose width equals NPY_BITSOF_LONG.
+// TO_SIMD_SFX stays undefined when NPY_SIMD is off or no width matches,
+// which makes the routines below use their scalar loops.
+#if NPY_SIMD && NPY_BITSOF_LONG == 8
+    #define TO_SIMD_SFX(X) X##_u8
+#elif NPY_SIMD && NPY_BITSOF_LONG == 16
+    #define TO_SIMD_SFX(X) X##_u16
+#elif NPY_SIMD && NPY_BITSOF_LONG == 32
+    #define TO_SIMD_SFX(X) X##_u32
+#elif NPY_SIMD && NPY_BITSOF_LONG == 64
+    #define TO_SIMD_SFX(X) X##_u64
+#endif
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_argmax)
+(npy_ulong *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the largest of the n values at ip and
+     * return 0. The scalar path reads ip[0] unconditionally, so callers are
+     * presumably expected to pass n >= 1.
+     */
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: the strict comparison keeps the earliest index on ties.
+    npy_ulong best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_ulong cur = ip[pos];
+        if (cur > best) {
+            best = cur;
+            *mindx = pos;
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_argmin)
+(npy_ulong *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the smallest of the n values at ip and
+     * return 0. The scalar path reads ip[0] unconditionally, so callers are
+     * presumably expected to pass n >= 1.
+     */
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: the strict comparison keeps the earliest index on ties.
+    npy_ulong best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_ulong cur = ip[pos];
+        if (cur < best) {
+            best = cur;
+            *mindx = pos;
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+
+#line 291
+#undef TO_SIMD_SFX
+// Choose the unsigned lane suffix whose width equals NPY_BITSOF_LONGLONG.
+// TO_SIMD_SFX stays undefined when NPY_SIMD is off or no width matches,
+// which makes the routines below use their scalar loops.
+#if NPY_SIMD && NPY_BITSOF_LONGLONG == 8
+    #define TO_SIMD_SFX(X) X##_u8
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16
+    #define TO_SIMD_SFX(X) X##_u16
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32
+    #define TO_SIMD_SFX(X) X##_u32
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64
+    #define TO_SIMD_SFX(X) X##_u64
+#endif
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_argmax)
+(npy_ulonglong *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the largest of the n values at ip and
+     * return 0. The scalar path reads ip[0] unconditionally, so callers are
+     * presumably expected to pass n >= 1.
+     */
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: the strict comparison keeps the earliest index on ties.
+    npy_ulonglong best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_ulonglong cur = ip[pos];
+        if (cur > best) {
+            best = cur;
+            *mindx = pos;
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_argmin)
+(npy_ulonglong *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the smallest of the n values at ip and
+     * return 0. The scalar path reads ip[0] unconditionally, so callers are
+     * presumably expected to pass n >= 1.
+     */
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: the strict comparison keeps the earliest index on ties.
+    npy_ulonglong best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_ulonglong cur = ip[pos];
+        if (cur < best) {
+            best = cur;
+            *mindx = pos;
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+
+#line 291
+#undef TO_SIMD_SFX
+// Choose the signed lane suffix whose width equals NPY_BITSOF_BYTE.
+// TO_SIMD_SFX stays undefined when NPY_SIMD is off or no width matches,
+// which makes the routines below use their scalar loops.
+#if NPY_SIMD && NPY_BITSOF_BYTE == 8
+    #define TO_SIMD_SFX(X) X##_s8
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 16
+    #define TO_SIMD_SFX(X) X##_s16
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 32
+    #define TO_SIMD_SFX(X) X##_s32
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 64
+    #define TO_SIMD_SFX(X) X##_s64
+#endif
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_argmax)
+(npy_byte *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the largest of the n values at ip and
+     * return 0. The scalar path reads ip[0] unconditionally, so callers are
+     * presumably expected to pass n >= 1.
+     */
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: the strict comparison keeps the earliest index on ties.
+    npy_byte best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_byte cur = ip[pos];
+        if (cur > best) {
+            best = cur;
+            *mindx = pos;
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_argmin)
+(npy_byte *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the smallest of the n values at ip and
+     * return 0. The scalar path reads ip[0] unconditionally, so callers are
+     * presumably expected to pass n >= 1.
+     */
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: the strict comparison keeps the earliest index on ties.
+    npy_byte best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_byte cur = ip[pos];
+        if (cur < best) {
+            best = cur;
+            *mindx = pos;
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+
+#line 291
+#undef TO_SIMD_SFX
+// Choose the signed lane suffix whose width equals NPY_BITSOF_SHORT.
+// TO_SIMD_SFX stays undefined when NPY_SIMD is off or no width matches,
+// which makes the routines below use their scalar loops.
+#if NPY_SIMD && NPY_BITSOF_SHORT == 8
+    #define TO_SIMD_SFX(X) X##_s8
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 16
+    #define TO_SIMD_SFX(X) X##_s16
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 32
+    #define TO_SIMD_SFX(X) X##_s32
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 64
+    #define TO_SIMD_SFX(X) X##_s64
+#endif
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_argmax)
+(npy_short *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the largest of the n values at ip and
+     * return 0. The scalar path reads ip[0] unconditionally, so callers are
+     * presumably expected to pass n >= 1.
+     */
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: the strict comparison keeps the earliest index on ties.
+    npy_short best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_short cur = ip[pos];
+        if (cur > best) {
+            best = cur;
+            *mindx = pos;
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_argmin)
+(npy_short *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the smallest of the n values at ip and
+     * return 0. The scalar path reads ip[0] unconditionally, so callers are
+     * presumably expected to pass n >= 1.
+     */
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: the strict comparison keeps the earliest index on ties.
+    npy_short best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_short cur = ip[pos];
+        if (cur < best) {
+            best = cur;
+            *mindx = pos;
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+
+#line 291
+#undef TO_SIMD_SFX
+// Choose the signed lane suffix whose width equals NPY_BITSOF_INT.
+// TO_SIMD_SFX stays undefined when NPY_SIMD is off or no width matches,
+// which makes the routines below use their scalar loops.
+#if NPY_SIMD && NPY_BITSOF_INT == 8
+    #define TO_SIMD_SFX(X) X##_s8
+#elif NPY_SIMD && NPY_BITSOF_INT == 16
+    #define TO_SIMD_SFX(X) X##_s16
+#elif NPY_SIMD && NPY_BITSOF_INT == 32
+    #define TO_SIMD_SFX(X) X##_s32
+#elif NPY_SIMD && NPY_BITSOF_INT == 64
+    #define TO_SIMD_SFX(X) X##_s64
+#endif
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_argmax)
+(npy_int *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the largest of the n values at ip and
+     * return 0. The scalar path reads ip[0] unconditionally, so callers are
+     * presumably expected to pass n >= 1.
+     */
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: the strict comparison keeps the earliest index on ties.
+    npy_int best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_int cur = ip[pos];
+        if (cur > best) {
+            best = cur;
+            *mindx = pos;
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_argmin)
+(npy_int *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the smallest of the n values at ip and
+     * return 0. The scalar path reads ip[0] unconditionally, so callers are
+     * presumably expected to pass n >= 1.
+     */
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: the strict comparison keeps the earliest index on ties.
+    npy_int best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_int cur = ip[pos];
+        if (cur < best) {
+            best = cur;
+            *mindx = pos;
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+
+#line 291
+#undef TO_SIMD_SFX
+// Choose the signed lane suffix whose width equals NPY_BITSOF_LONG.
+// TO_SIMD_SFX stays undefined when NPY_SIMD is off or no width matches,
+// which makes the routines below use their scalar loops.
+#if NPY_SIMD && NPY_BITSOF_LONG == 8
+    #define TO_SIMD_SFX(X) X##_s8
+#elif NPY_SIMD && NPY_BITSOF_LONG == 16
+    #define TO_SIMD_SFX(X) X##_s16
+#elif NPY_SIMD && NPY_BITSOF_LONG == 32
+    #define TO_SIMD_SFX(X) X##_s32
+#elif NPY_SIMD && NPY_BITSOF_LONG == 64
+    #define TO_SIMD_SFX(X) X##_s64
+#endif
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_argmax)
+(npy_long *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the largest of the n values at ip and
+     * return 0. The scalar path reads ip[0] unconditionally, so callers are
+     * presumably expected to pass n >= 1.
+     */
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: the strict comparison keeps the earliest index on ties.
+    npy_long best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_long cur = ip[pos];
+        if (cur > best) {
+            best = cur;
+            *mindx = pos;
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_argmin)
+(npy_long *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the smallest of the n values at ip and
+     * return 0. The scalar path reads ip[0] unconditionally, so callers are
+     * presumably expected to pass n >= 1.
+     */
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: the strict comparison keeps the earliest index on ties.
+    npy_long best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_long cur = ip[pos];
+        if (cur < best) {
+            best = cur;
+            *mindx = pos;
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+
+#line 291
+#undef TO_SIMD_SFX
+// Choose the signed lane suffix whose width equals NPY_BITSOF_LONGLONG.
+// TO_SIMD_SFX stays undefined when NPY_SIMD is off or no width matches,
+// which makes the routines below use their scalar loops.
+#if NPY_SIMD && NPY_BITSOF_LONGLONG == 8
+    #define TO_SIMD_SFX(X) X##_s8
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16
+    #define TO_SIMD_SFX(X) X##_s16
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32
+    #define TO_SIMD_SFX(X) X##_s32
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64
+    #define TO_SIMD_SFX(X) X##_s64
+#endif
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_argmax)
+(npy_longlong *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the largest of the n values at ip and
+     * return 0. The scalar path reads ip[0] unconditionally, so callers are
+     * presumably expected to pass n >= 1.
+     */
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: the strict comparison keeps the earliest index on ties.
+    npy_longlong best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_longlong cur = ip[pos];
+        if (cur > best) {
+            best = cur;
+            *mindx = pos;
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_argmin)
+(npy_longlong *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the smallest of the n values at ip and
+     * return 0. The scalar path reads ip[0] unconditionally, so callers are
+     * presumably expected to pass n >= 1.
+     */
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: the strict comparison keeps the earliest index on ties.
+    npy_longlong best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_longlong cur = ip[pos];
+        if (cur < best) {
+            best = cur;
+            *mindx = pos;
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+
+#line 291
+#undef TO_SIMD_SFX
+// Choose the floating-point lane suffix whose width equals NPY_BITSOF_FLOAT.
+// The 32- and 64-bit suffixes are only defined when NPY_SIMD_F32 /
+// NPY_SIMD_F64 is non-zero; otherwise TO_SIMD_SFX stays undefined and the
+// routines below use their scalar loops.
+#if NPY_SIMD && NPY_BITSOF_FLOAT == 8
+    #define TO_SIMD_SFX(X) X##_f8
+#elif NPY_SIMD && NPY_BITSOF_FLOAT == 16
+    #define TO_SIMD_SFX(X) X##_f16
+#elif NPY_SIMD && NPY_BITSOF_FLOAT == 32
+    #if NPY_SIMD_F32
+        #define TO_SIMD_SFX(X) X##_f32
+    #endif
+#elif NPY_SIMD && NPY_BITSOF_FLOAT == 64
+    #if NPY_SIMD_F64
+        #define TO_SIMD_SFX(X) X##_f64
+    #endif
+#endif
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_argmax)
+(npy_float *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the largest of the n values at ip and
+     * return 0. ip[0] is read unconditionally, so callers are presumably
+     * expected to pass n >= 1.
+     */
+    if (npy_isnan(ip[0])) {
+        // nan encountered; it's maximal|minimal
+        *mindx = 0;
+        return 0;
+    }
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: scan left to right and stop at the first NaN taken.
+    npy_float best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_float cur = ip[pos];
+        // negated, for correct nan handling
+        if (!(cur <= best)) {
+            best = cur;
+            *mindx = pos;
+            if (npy_isnan(best)) {
+                // nan encountered, it's maximal|minimal
+                break;
+            }
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_argmin)
+(npy_float *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+    /*
+     * Store in *mindx the position of the smallest of the n values at ip and
+     * return 0. ip[0] is read unconditionally, so callers are presumably
+     * expected to pass n >= 1.
+     */
+    if (npy_isnan(ip[0])) {
+        // nan encountered; it's maximal|minimal
+        *mindx = 0;
+        return 0;
+    }
+#ifndef TO_SIMD_SFX
+    // Scalar fallback: scan left to right and stop at the first NaN taken.
+    npy_float best = ip[0];
+    npy_intp pos;
+
+    *mindx = 0;
+    for (pos = 1; pos < n; pos++) {
+        const npy_float cur = ip[pos];
+        // negated, for correct nan handling
+        if (!(cur >= best)) {
+            best = cur;
+            *mindx = pos;
+            if (npy_isnan(best)) {
+                // nan encountered, it's maximal|minimal
+                break;
+            }
+        }
+    }
+#else
+    // Vector path: hand the buffer to the lane-typed SIMD kernel.
+    *mindx = TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+// Select the SIMD suffix matching the bit width of npy_double; TO_SIMD_SFX is left undefined when no section applies.
+#line 291
+#undef TO_SIMD_SFX  // forget the suffix chosen for the previous type
+#if 0  // always-false opener so that each width test below can be an #elif
+#line 296
+#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 8
+    #if 1  // floating-point instantiation: the _u/_s branches below are never compiled
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u8
+    #else
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 296
+#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 16
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 296
+#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 32
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX  // NPY_SIMD_F32 is 0: the kernels below take their scalar #else path
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 296
+#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 64
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX  // NPY_SIMD_F64 is 0: the kernels below take their scalar #else path
+        #endif
+        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif
+// DOUBLE argmax kernel: stores the index of the maximum of ip[0..n) in *mindx; returns 0 on every path.
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_argmax)
+(npy_double *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+#if 1
+    if (npy_isnan(*ip)) {  // reads ip[0] unconditionally; presumably callers pass n >= 1 (TODO confirm)
+        // a NaN in front counts as the maximum, so index 0 is final
+        *mindx = 0;
+        return 0;
+    }
+#endif
+#ifdef TO_SIMD_SFX  // vector path: only when a SIMD suffix was selected for this type above
+    *mindx = TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#else
+    npy_double mp = *ip;  // running maximum
+    *mindx = 0;
+    npy_intp i = 1;
+    // scalar fallback: linear scan; the strict test below keeps the earliest index on ties
+    for (; i < n; ++i) {
+        npy_double a = ip[i];
+    #if 1
+        if (!(a <= mp)) {  // negated so that a NaN in a (all comparisons false) is also selected
+    #else
+        if (a > mp) {
+    #endif
+            mp = a;
+            *mindx = i;
+        #if 1
+            if (npy_isnan(mp)) {
+                // first NaN found: stop, no later element is allowed to replace it
+                break;
+            }
+        #endif
+        }
+    }
+#endif // TO_SIMD_SFX
+    return 0;
+}
+// DOUBLE argmin kernel: stores the index of the minimum of ip[0..n) in *mindx; returns 0 on every path.
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_argmin)
+(npy_double *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+#if 1
+    if (npy_isnan(*ip)) {  // reads ip[0] unconditionally; presumably callers pass n >= 1 (TODO confirm)
+        // a NaN in front counts as the minimum, so index 0 is final
+        *mindx = 0;
+        return 0;
+    }
+#endif
+#ifdef TO_SIMD_SFX  // vector path: only when a SIMD suffix was selected for this type above
+    *mindx = TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#else
+    npy_double mp = *ip;  // running minimum
+    *mindx = 0;
+    npy_intp i = 1;
+    // scalar fallback: linear scan; the strict test below keeps the earliest index on ties
+    for (; i < n; ++i) {
+        npy_double a = ip[i];
+    #if 1
+        if (!(a >= mp)) {  // negated so that a NaN in a (all comparisons false) is also selected
+    #else
+        if (a < mp) {
+    #endif
+            mp = a;
+            *mindx = i;
+        #if 1
+            if (npy_isnan(mp)) {
+                // first NaN found: stop, no later element is allowed to replace it
+                break;
+            }
+        #endif
+        }
+    }
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+// Select the SIMD suffix for npy_longdouble; widths other than 8/16/32/64 match no section, leaving TO_SIMD_SFX undefined.
+#line 291
+#undef TO_SIMD_SFX  // forget the suffix chosen for the previous type
+#if 0  // always-false opener so that each width test below can be an #elif
+#line 296
+#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 8
+    #if 1  // floating-point instantiation: the _u/_s branches below are never compiled
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_LONGDOUBLE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGDOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u8
+    #else
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 296
+#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 16
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_LONGDOUBLE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGDOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 296
+#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 32
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_LONGDOUBLE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGDOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX  // NPY_SIMD_F32 is 0: the kernels below take their scalar #else path
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 296
+#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 64
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_LONGDOUBLE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX  // NPY_SIMD_F64 is 0: the kernels below take their scalar #else path
+        #endif
+        #if NPY_BITSOF_LONGDOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif
+// LONGDOUBLE argmax kernel: stores the index of the maximum of ip[0..n) in *mindx; returns 0 on every path.
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_argmax)
+(npy_longdouble *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+#if 1
+    if (npy_isnan(*ip)) {  // reads ip[0] unconditionally; presumably callers pass n >= 1 (TODO confirm)
+        // a NaN in front counts as the maximum, so index 0 is final
+        *mindx = 0;
+        return 0;
+    }
+#endif
+#ifdef TO_SIMD_SFX  // vector path: only when a SIMD suffix was selected for this type above
+    *mindx = TO_SIMD_SFX(simd_argmax)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#else
+    npy_longdouble mp = *ip;  // running maximum
+    *mindx = 0;
+    npy_intp i = 1;
+    // scalar fallback: linear scan; the strict test below keeps the earliest index on ties
+    for (; i < n; ++i) {
+        npy_longdouble a = ip[i];
+    #if 1
+        if (!(a <= mp)) {  // negated so that a NaN in a (all comparisons false) is also selected
+    #else
+        if (a > mp) {
+    #endif
+            mp = a;
+            *mindx = i;
+        #if 1
+            if (npy_isnan(mp)) {
+                // first NaN found: stop, no later element is allowed to replace it
+                break;
+            }
+        #endif
+        }
+    }
+#endif // TO_SIMD_SFX
+    return 0;
+}
+// LONGDOUBLE argmin kernel: stores the index of the minimum of ip[0..n) in *mindx; returns 0 on every path.
+#line 318
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_argmin)
+(npy_longdouble *ip, npy_intp n, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+{
+#if 1
+    if (npy_isnan(*ip)) {  // reads ip[0] unconditionally; presumably callers pass n >= 1 (TODO confirm)
+        // a NaN in front counts as the minimum, so index 0 is final
+        *mindx = 0;
+        return 0;
+    }
+#endif
+#ifdef TO_SIMD_SFX  // vector path: only when a SIMD suffix was selected for this type above
+    *mindx = TO_SIMD_SFX(simd_argmin)((TO_SIMD_SFX(npyv_lanetype)*)ip, n);
+    npyv_cleanup();
+#else
+    npy_longdouble mp = *ip;  // running minimum
+    *mindx = 0;
+    npy_intp i = 1;
+    // scalar fallback: linear scan; the strict test below keeps the earliest index on ties
+    for (; i < n; ++i) {
+        npy_longdouble a = ip[i];
+    #if 1
+        if (!(a >= mp)) {  // negated so that a NaN in a (all comparisons false) is also selected
+    #else
+        if (a < mp) {
+    #endif
+            mp = a;
+            *mindx = i;
+        #if 1
+            if (npy_isnan(mp)) {
+                // first NaN found: stop, no later element is allowed to replace it
+                break;
+            }
+        #endif
+        }
+    }
+#endif // TO_SIMD_SFX
+    return 0;
+}
+
+
+// BOOL argmax kernel: stores the index of the first nonzero byte of ip[0..len) in *mindx, or 0 if none; always returns 0.
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BOOL_argmax)
+(npy_bool *ip, npy_intp len, npy_intp *mindx, PyArrayObject *NPY_UNUSED(aip))
+
+{
+    npy_intp i = 0;
+#if NPY_SIMD  // vector prescan: skip leading blocks of four vectors that contain only zeros
+    const npyv_u8 zero = npyv_zero_u8();
+    const int vstep = npyv_nlanes_u8;
+    const int wstep = vstep * 4;
+    for (npy_intp n = len & -wstep; i < n; i += wstep) {  // n: len rounded down to a multiple of wstep (assumes a power of two)
+        npyv_u8 a = npyv_load_u8(ip + i + vstep*0);
+        npyv_u8 b = npyv_load_u8(ip + i + vstep*1);
+        npyv_u8 c = npyv_load_u8(ip + i + vstep*2);
+        npyv_u8 d = npyv_load_u8(ip + i + vstep*3);
+        npyv_b8 m_a = npyv_cmpeq_u8(a, zero);
+        npyv_b8 m_b = npyv_cmpeq_u8(b, zero);
+        npyv_b8 m_c = npyv_cmpeq_u8(c, zero);
+        npyv_b8 m_d = npyv_cmpeq_u8(d, zero);
+        npyv_b8 m_ab = npyv_and_b8(m_a, m_b);
+        npyv_b8 m_cd = npyv_and_b8(m_c, m_d);
+        npy_uint64 m = npyv_tobits_b8(npyv_and_b8(m_ab, m_cd));  // lanes equal to zero in all four vectors (presumably one bit per lane)
+    #if NPY_SIMD == 512
+        if (m != NPY_MAX_UINT64) {  // presumably vstep is 64 here, where the shift form below would overflow
+    #else
+        if ((npy_int64)m != ((1LL << vstep) - 1)) {  // not every lane bit set: something in the block is nonzero
+    #endif
+            break;  // i stays at the start of this block; the scalar loop below finds the exact index
+        }
+    }
+    npyv_cleanup();
+#endif // NPY_SIMD
+    for (; i < len; ++i) {  // scalar scan from wherever the prescan stopped (from 0 when NPY_SIMD is 0)
+        if (ip[i]) {
+            *mindx = i;
+            return 0;
+        }
+    }
+    *mindx = 0;  // no nonzero element found
+    return 0;
+}
+
diff --git a/numpy/core/src/_generated/arraytypes.c b/numpy/core/src/_generated/arraytypes.c
new file mode 100644
index 000000000000..efa4c271cf65
--- /dev/null
+++ b/numpy/core/src/_generated/arraytypes.c
@@ -0,0 +1,33439 @@
+#line 1 "numpy/core/src/multiarray/arraytypes.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/* -*- c -*- */
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <structmember.h>
+#include <limits.h>
+#include <assert.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#define _UMATHMODULE
+#define _NPY_NO_DEPRECATIONS /* for NPY_CHAR */
+
+#include "numpy/npy_common.h"
+#include "numpy/arrayobject.h"
+#include "numpy/arrayscalars.h"
+#include "npy_pycompat.h"
+#include "numpy/npy_math.h"
+#include "numpy/halffloat.h"
+
+#include "npy_config.h"
+#include "npy_sort.h"
+#include "common.h"
+#include "ctors.h"
+#include "convert_datatype.h"
+#include "dtypemeta.h"
+#include "lowlevel_strided_loops.h"
+#include "usertypes.h"
+#include "_datetime.h"
+#include "arrayobject.h"
+#include "alloc.h"
+#include "typeinfo.h"
+
+#include "npy_longdouble.h"
+#include "numpyos.h"
+#include <string.h>
+
+#include "cblasfuncs.h"
+#include "npy_cblas.h"
+#include "npy_buffer.h"
+
+#include "arraytypes.h"
+
+#include "umathmodule.h"
+
+/*
+ * Define a stack allocated dummy array with only the minimum information set:
+ *   1. The descr, the main field of interest (NOT set here; presumably the caller fills it in).
+ *   2. The flags, which are needed for alignment.
+ *   3. The type is set to NULL and the base is the original array, if this
+ *      is used within a subarray getitem to create a new view, the base
+ *      must be walked until the type is not NULL.
+ *
+ * The following should create errors in debug mode (if deallocated
+ * incorrectly), since base would be incorrectly decref'd as well.
+ * This is especially important for nonzero and copyswap, which may run with
+ * the GIL released.
+ */
+static inline PyArrayObject_fields
+get_dummy_stack_array(PyArrayObject *orig)
+{
+    PyArrayObject_fields new_fields;  /* no initializer: fields not assigned below stay indeterminate */
+    new_fields.flags = PyArray_FLAGS(orig);
+    /* Set to NULL so the dummy object can be distinguished from the real one */
+    Py_SET_TYPE(&new_fields, NULL);
+    new_fields.base = (PyObject *)orig;  /* plain pointer copy; no INCREF is taken here */
+    return new_fields;  /* returned by value */
+}
+
+
+/* True iff op passes PySequence_Check but is not bytes, not str, and not a zero-dim array (types numpy treats as scalars) */
+static inline npy_bool
+PySequence_NoString_Check(PyObject *op) {
+    return
+        PySequence_Check(op) &&
+        !PyBytes_Check(op) &&
+        !PyUnicode_Check(op) &&
+        !PyArray_IsZeroDim(op);
+}
+
+/*
+ *****************************************************************************
+ **                        PYTHON TYPES TO C TYPES                          **
+ *****************************************************************************
+ */
+/* Convert obj to a C double via PyNumber_Float(); None gives NaN, and a failed conversion also gives NaN. */
+static double
+MyPyFloat_AsDouble(PyObject *obj)
+{
+    double ret = 0;
+    PyObject *num;
+
+    if (obj == Py_None) {
+        return NPY_NAN;
+    }
+    num = PyNumber_Float(obj);
+    if (num == NULL) {
+        return NPY_NAN;  /* nothing is cleared here, so the error from PyNumber_Float stays pending */
+    }
+    ret = PyFloat_AS_DOUBLE(num);
+    Py_DECREF(num);  /* the temporary float is no longer needed once its value is copied */
+    return ret;
+}
+
+/* Like MyPyFloat_AsDouble, but narrowed to float; a narrowing that overflows to inf is reported as a "cast" overflow. */
+static float
+MyPyFloat_AsFloat(PyObject *obj)
+{
+    double d_val = MyPyFloat_AsDouble(obj);
+    float res = (float)d_val;
+    if (NPY_UNLIKELY(npy_isinf(res) && !npy_isinf(d_val))) {  /* the inf was produced by the narrowing itself */
+        if (PyUFunc_GiveFloatingpointErrors("cast", NPY_FPE_OVERFLOW) < 0) {
+            return -1;  /* the FP-error handler returned < 0; -1 is the failure value */
+        }
+    }
+    return res;
+}
+
+/* Like MyPyFloat_AsDouble, but converted to npy_half; a conversion that overflows to inf is reported as a "cast" overflow. */
+static npy_half
+MyPyFloat_AsHalf(PyObject *obj)
+{
+    double d_val = MyPyFloat_AsDouble(obj);
+    npy_half res = npy_double_to_half(d_val);
+    if (NPY_UNLIKELY(npy_half_isinf(res) && !npy_isinf(d_val))) {  /* the inf was produced by the conversion itself */
+        if (PyUFunc_GiveFloatingpointErrors("cast", NPY_FPE_OVERFLOW) < 0) {
+            return npy_double_to_half(-1.);  /* failure value: -1 encoded as a half */
+        }
+    }
+    return res;
+}
+/* Build a Python float from a half-precision value by widening it to double first. */
+static PyObject *
+MyPyFloat_FromHalf(npy_half h)
+{
+    return PyFloat_FromDouble(npy_half_to_double(h));  /* result of PyFloat_FromDouble is passed through unchanged */
+}
+
+/* setitem helper for a zero-dim array `op`: convert it to a scalar and call `setitem` on that scalar instead */
+static int
+convert_to_scalar_and_retry(PyObject *op, void *ov, void *vap,
+                      int (*setitem)(PyObject *op, void *ov, void *vap))
+{
+    PyObject *temp;
+
+    assert(PyArray_IsZeroDim(op));  /* precondition; only checked in builds with assertions enabled */
+    temp = PyArray_ToScalar(PyArray_BYTES((PyArrayObject *)op),
+                                      (PyArrayObject *)op);
+    if (temp == NULL) {
+        return -1;  /* scalar creation failed */
+    }
+    else {
+        int res = setitem(temp, ov, vap);  /* ov and vap are forwarded untouched */
+        Py_DECREF(temp);
+        return res;  /* the result of setitem is passed back as is */
+    }
+}
+
+/* Convert obj to npy_long: coerce with PyNumber_Long(), then read with PyLong_AsLong(); -1 if the coercion fails. */
+#line 164
+static npy_long
+MyPyLong_AsLong (PyObject *obj)
+{
+    npy_long ret;
+    PyObject *num = PyNumber_Long(obj);
+
+    if (num == NULL) {
+        return -1;  /* coercion failed; no error is cleared here */
+    }
+    ret = PyLong_AsLong(num);  /* returned as is, including its own -1 error value */
+    Py_DECREF(num);
+    return ret;
+}
+/* Signed counterpart of the ...WithWrap helpers: always clears *wraparound and returns MyPyLong_AsLong(obj). */
+static npy_long
+MyPyLong_AsLongWithWrap(PyObject *obj, int *wraparound)
+{
+    *wraparound = 0;  /* wraparound is never reported by this signed variant */
+    return MyPyLong_AsLong(obj);
+}
+
+/* Convert obj to npy_longlong: coerce with PyNumber_Long(), then read with PyLong_AsLongLong(); -1 if the coercion fails. */
+#line 164
+static npy_longlong
+MyPyLong_AsLongLong (PyObject *obj)
+{
+    npy_longlong ret;
+    PyObject *num = PyNumber_Long(obj);
+
+    if (num == NULL) {
+        return -1;  /* coercion failed; no error is cleared here */
+    }
+    ret = PyLong_AsLongLong(num);  /* returned as is, including its own -1 error value */
+    Py_DECREF(num);
+    return ret;
+}
+/* Signed counterpart of the ...WithWrap helpers: always clears *wraparound and returns MyPyLong_AsLongLong(obj). */
+static npy_longlong
+MyPyLong_AsLongLongWithWrap(PyObject *obj, int *wraparound)
+{
+    *wraparound = 0;  /* wraparound is never reported by this signed variant */
+    return MyPyLong_AsLongLong(obj);
+}
+
+
+/* Convert obj to npy_ulong; if the unsigned read fails, retry with a signed read and set *wraparound to 1. */
+#line 192
+static npy_ulong
+MyPyLong_AsUnsignedLongWithWrap(PyObject *obj, int *wraparound)
+{
+    npy_ulong ret;
+    *wraparound = 0;
+    PyObject *num = PyNumber_Long(obj);
+
+    if (num == NULL) {
+        return -1;  /* i.e. (npy_ulong)-1; the coercion error is not cleared */
+    }
+    ret = PyLong_AsUnsignedLong(num);
+    if (PyErr_Occurred()) {
+        PyErr_Clear();  /* drop the error from the unsigned read before retrying */
+        *wraparound = 1;  /* negative wrapped to positive */
+        ret = PyLong_AsLong(num);  /* signed result is converted to npy_ulong on assignment */
+    }
+    Py_DECREF(num);
+    return ret;
+}
+/* Same conversion as MyPyLong_AsUnsignedLongWithWrap, with the wraparound flag discarded. */
+static npy_ulong
+MyPyLong_AsUnsignedLong(PyObject *obj)
+{
+    int wraparound;  /* receives the flag; never read afterwards */
+    return MyPyLong_AsUnsignedLongWithWrap(obj, &wraparound);
+}
+
+
+/* Convert obj to npy_ulonglong; if the unsigned read fails, retry with a signed read and set *wraparound to 1. */
+#line 192
+static npy_ulonglong
+MyPyLong_AsUnsignedLongLongWithWrap(PyObject *obj, int *wraparound)
+{
+    npy_ulonglong ret;
+    *wraparound = 0;
+    PyObject *num = PyNumber_Long(obj);
+
+    if (num == NULL) {
+        return -1;  /* i.e. (npy_ulonglong)-1; the coercion error is not cleared */
+    }
+    ret = PyLong_AsUnsignedLongLong(num);
+    if (PyErr_Occurred()) {
+        PyErr_Clear();  /* drop the error from the unsigned read before retrying */
+        *wraparound = 1;  /* negative wrapped to positive */
+        ret = PyLong_AsLongLong(num);  /* signed result is converted to npy_ulonglong on assignment */
+    }
+    Py_DECREF(num);
+    return ret;
+}
+/* Same conversion as MyPyLong_AsUnsignedLongLongWithWrap, with the wraparound flag discarded. */
+static npy_ulonglong
+MyPyLong_AsUnsignedLongLong(PyObject *obj)
+{
+    int wraparound;  /* receives the flag; never read afterwards */
+    return MyPyLong_AsUnsignedLongLongWithWrap(obj, &wraparound);
+}
+
+
+
+
+/*
+ *****************************************************************************
+ **                         GETITEM AND SETITEM                             **
+ *****************************************************************************
+ */
+/*
+ * Disable harmless compiler warning "4116: unnamed type definition in
+ * parentheses" which is caused by the _ALIGN macro.
+ */
+#if defined(_MSC_VER)
+#pragma warning(disable:4116)
+#endif
+
+
+#line 250
+
+/*
+ * Store the Python integer `obj` in `*result` as npy_byte.  The conversion
+ * function is the same one used below (for compatibility), with stricter
+ * overflow checks on top: a value that wrapped or does not fit is deprecated
+ * (related to NEP 50, but somewhat independent) and produces a
+ * DeprecationWarning or an OverflowError.  Returns 0, or -1 with an error set.
+ */
+static int
+BYTE_safe_pyint_setitem(PyObject *obj, npy_byte *result)
+{
+    /* Input is guaranteed to be a Python integer */
+    assert(PyLong_Check(obj));
+    int wraparound;  /* MyPyLong_AsLongWithWrap always sets this to 0 */
+    npy_long value = MyPyLong_AsLongWithWrap(obj, &wraparound);
+    if (value == (npy_long)-1 && PyErr_Occurred()) {
+        return -1;
+    }
+    *result = (npy_byte)value;  /* stored before the range check below */
+
+    if (wraparound
+#if NPY_SIZEOF_BYTE < NPY_SIZEOF_LONG
+            || *result != value  /* the narrowing cast changed the value */
+#endif
+            ) {
+        PyArray_Descr *descr = PyArray_DescrFromType(NPY_BYTE);  /* for %S in the messages; released on every exit below */
+
+        if (npy_promotion_state == NPY_USE_LEGACY_PROMOTION || (
+                    npy_promotion_state == NPY_USE_WEAK_PROMOTION_AND_WARN
+                        && !npy_give_promotion_warnings())) {
+            /*
+             * This path will be taken both for the "promotion" case such as
+             * `uint8_arr + 123` as well as the assignment case.
+             * The "legacy" path should only ever be taken for assignment
+             * (legacy promotion will prevent overflows by promoting up)
+             * so a normal deprecation makes sense.
+             * When weak promotion is active, we use "future" behavior unless
+             * warnings were explicitly opt-in.
+             */
+            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                    "NumPy will stop allowing conversion of out-of-bound "
+                    "Python integers to integer arrays.  The conversion "
+                    "of %.100R to %S will fail in the future.\n"
+                    "For the old behavior, usually:\n"
+                    "    np.array(value).astype(dtype)\n"
+                    "will give the desired result (the cast overflows).",
+                    obj, descr) < 0) {
+                Py_DECREF(descr);
+                return -1;  /* PyErr_WarnFormat reported failure (< 0) */
+            }
+            Py_DECREF(descr);
+            return 0;  /* warned only: the converted value stays in *result */
+        }
+        else {
+            /* Live in the future, outright error: */
+            PyErr_Format(PyExc_OverflowError,
+                    "Python integer %R out of bounds for %S", obj, descr);
+            Py_DECREF(descr);
+            return -1;
+            }  /* closes the else branch (indentation is off) */
+        assert(0);  /* unreachable: both branches above return */
+    }
+    return 0;
+}
+
+
+#line 250
+
+/*
+ * Store the Python integer `obj` in `*result` as npy_short.  The conversion
+ * function is the same one used below (for compatibility), with stricter
+ * overflow checks on top: a value that wrapped or does not fit is deprecated
+ * (related to NEP 50, but somewhat independent) and produces a
+ * DeprecationWarning or an OverflowError.  Returns 0, or -1 with an error set.
+ */
+static int
+SHORT_safe_pyint_setitem(PyObject *obj, npy_short *result)
+{
+    /* Input is guaranteed to be a Python integer */
+    assert(PyLong_Check(obj));
+    int wraparound;  /* MyPyLong_AsLongWithWrap always sets this to 0 */
+    npy_long value = MyPyLong_AsLongWithWrap(obj, &wraparound);
+    if (value == (npy_long)-1 && PyErr_Occurred()) {
+        return -1;
+    }
+    *result = (npy_short)value;  /* stored before the range check below */
+
+    if (wraparound
+#if NPY_SIZEOF_SHORT < NPY_SIZEOF_LONG
+            || *result != value  /* the narrowing cast changed the value */
+#endif
+            ) {
+        PyArray_Descr *descr = PyArray_DescrFromType(NPY_SHORT);  /* for %S in the messages; released on every exit below */
+
+        if (npy_promotion_state == NPY_USE_LEGACY_PROMOTION || (
+                    npy_promotion_state == NPY_USE_WEAK_PROMOTION_AND_WARN
+                        && !npy_give_promotion_warnings())) {
+            /*
+             * This path will be taken both for the "promotion" case such as
+             * `uint8_arr + 123` as well as the assignment case.
+             * The "legacy" path should only ever be taken for assignment
+             * (legacy promotion will prevent overflows by promoting up)
+             * so a normal deprecation makes sense.
+             * When weak promotion is active, we use "future" behavior unless
+             * warnings were explicitly opt-in.
+             */
+            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                    "NumPy will stop allowing conversion of out-of-bound "
+                    "Python integers to integer arrays.  The conversion "
+                    "of %.100R to %S will fail in the future.\n"
+                    "For the old behavior, usually:\n"
+                    "    np.array(value).astype(dtype)\n"
+                    "will give the desired result (the cast overflows).",
+                    obj, descr) < 0) {
+                Py_DECREF(descr);
+                return -1;  /* PyErr_WarnFormat reported failure (< 0) */
+            }
+            Py_DECREF(descr);
+            return 0;  /* warned only: the converted value stays in *result */
+        }
+        else {
+            /* Live in the future, outright error: */
+            PyErr_Format(PyExc_OverflowError,
+                    "Python integer %R out of bounds for %S", obj, descr);
+            Py_DECREF(descr);
+            return -1;
+            }  /* closes the else branch (indentation is off) */
+        assert(0);  /* unreachable: both branches above return */
+    }
+    return 0;
+}
+
+
+#line 250
+
+/*
+ * Store the Python integer `obj` in `*result` as npy_int.  The conversion
+ * function is the same one used below (for compatibility), with stricter
+ * overflow checks on top: a value that wrapped or does not fit is deprecated
+ * (related to NEP 50, but somewhat independent) and produces a
+ * DeprecationWarning or an OverflowError.  Returns 0, or -1 with an error set.
+ */
+static int
+INT_safe_pyint_setitem(PyObject *obj, npy_int *result)
+{
+    /* Input is guaranteed to be a Python integer */
+    assert(PyLong_Check(obj));
+    int wraparound;  /* MyPyLong_AsLongWithWrap always sets this to 0 */
+    npy_long value = MyPyLong_AsLongWithWrap(obj, &wraparound);
+    if (value == (npy_long)-1 && PyErr_Occurred()) {
+        return -1;
+    }
+    *result = (npy_int)value;  /* stored before the range check below */
+
+    if (wraparound
+#if NPY_SIZEOF_INT < NPY_SIZEOF_LONG
+            || *result != value  /* the narrowing cast changed the value */
+#endif
+            ) {
+        PyArray_Descr *descr = PyArray_DescrFromType(NPY_INT);  /* for %S in the messages; released on every exit below */
+
+        if (npy_promotion_state == NPY_USE_LEGACY_PROMOTION || (
+                    npy_promotion_state == NPY_USE_WEAK_PROMOTION_AND_WARN
+                        && !npy_give_promotion_warnings())) {
+            /*
+             * This path will be taken both for the "promotion" case such as
+             * `uint8_arr + 123` as well as the assignment case.
+             * The "legacy" path should only ever be taken for assignment
+             * (legacy promotion will prevent overflows by promoting up)
+             * so a normal deprecation makes sense.
+             * When weak promotion is active, we use "future" behavior unless
+             * warnings were explicitly opt-in.
+             */
+            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                    "NumPy will stop allowing conversion of out-of-bound "
+                    "Python integers to integer arrays.  The conversion "
+                    "of %.100R to %S will fail in the future.\n"
+                    "For the old behavior, usually:\n"
+                    "    np.array(value).astype(dtype)\n"
+                    "will give the desired result (the cast overflows).",
+                    obj, descr) < 0) {
+                Py_DECREF(descr);
+                return -1;  /* PyErr_WarnFormat reported failure (< 0) */
+            }
+            Py_DECREF(descr);
+            return 0;  /* warned only: the converted value stays in *result */
+        }
+        else {
+            /* Live in the future, outright error: */
+            PyErr_Format(PyExc_OverflowError,
+                    "Python integer %R out of bounds for %S", obj, descr);
+            Py_DECREF(descr);
+            return -1;
+            }  /* closes the else branch (indentation is off) */
+        assert(0);  /* unreachable: both branches above return */
+    }
+    return 0;
+}
+
+
+#line 250
+
+/*
+ * Store the Python integer `obj` in `*result` as npy_long.  The conversion
+ * function is the same one used below (for compatibility), with stricter
+ * overflow checks on top: a value that wrapped or does not fit is deprecated
+ * (related to NEP 50, but somewhat independent) and produces a
+ * DeprecationWarning or an OverflowError.  Returns 0, or -1 with an error set.
+ */
+static int
+LONG_safe_pyint_setitem(PyObject *obj, npy_long *result)
+{
+    /* Input is guaranteed to be a Python integer */
+    assert(PyLong_Check(obj));
+    int wraparound;  /* MyPyLong_AsLongWithWrap always sets this to 0 */
+    npy_long value = MyPyLong_AsLongWithWrap(obj, &wraparound);
+    if (value == (npy_long)-1 && PyErr_Occurred()) {
+        return -1;  /* out-of-range input surfaces here as the conversion error */
+    }
+    *result = (npy_long)value;  /* same type as value: nothing is lost */
+
+    if (wraparound  /* the only compiled trigger for this type, and it is always 0 here */
+#if NPY_SIZEOF_LONG < NPY_SIZEOF_LONG  /* never true: both sides are the same macro */
+            || *result != value
+#endif
+            ) {
+        PyArray_Descr *descr = PyArray_DescrFromType(NPY_LONG);  /* for %S in the messages; released on every exit below */
+
+        if (npy_promotion_state == NPY_USE_LEGACY_PROMOTION || (
+                    npy_promotion_state == NPY_USE_WEAK_PROMOTION_AND_WARN
+                        && !npy_give_promotion_warnings())) {
+            /*
+             * This path will be taken both for the "promotion" case such as
+             * `uint8_arr + 123` as well as the assignment case.
+             * The "legacy" path should only ever be taken for assignment
+             * (legacy promotion will prevent overflows by promoting up)
+             * so a normal deprecation makes sense.
+             * When weak promotion is active, we use "future" behavior unless
+             * warnings were explicitly opt-in.
+             */
+            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                    "NumPy will stop allowing conversion of out-of-bound "
+                    "Python integers to integer arrays.  The conversion "
+                    "of %.100R to %S will fail in the future.\n"
+                    "For the old behavior, usually:\n"
+                    "    np.array(value).astype(dtype)\n"
+                    "will give the desired result (the cast overflows).",
+                    obj, descr) < 0) {
+                Py_DECREF(descr);
+                return -1;  /* PyErr_WarnFormat reported failure (< 0) */
+            }
+            Py_DECREF(descr);
+            return 0;  /* warned only: the converted value stays in *result */
+        }
+        else {
+            /* Live in the future, outright error: */
+            PyErr_Format(PyExc_OverflowError,
+                    "Python integer %R out of bounds for %S", obj, descr);
+            Py_DECREF(descr);
+            return -1;
+            }  /* closes the else branch (indentation is off) */
+        assert(0);  /* unreachable: both branches above return */
+    }
+    return 0;
+}
+
+
+#line 250
+
+/*
+ * Store the Python integer `obj` in `*result` as npy_longlong.  The conversion
+ * function is the same one used below (for compatibility), with stricter
+ * overflow checks on top: a value that wrapped or does not fit is deprecated
+ * (related to NEP 50, but somewhat independent) and produces a
+ * DeprecationWarning or an OverflowError.  Returns 0, or -1 with an error set.
+ */
+static int
+LONGLONG_safe_pyint_setitem(PyObject *obj, npy_longlong *result)
+{
+    /* Input is guaranteed to be a Python integer */
+    assert(PyLong_Check(obj));
+    int wraparound;  /* MyPyLong_AsLongLongWithWrap always sets this to 0 */
+    npy_longlong value = MyPyLong_AsLongLongWithWrap(obj, &wraparound);
+    if (value == (npy_longlong)-1 && PyErr_Occurred()) {
+        return -1;  /* out-of-range input surfaces here as the conversion error */
+    }
+    *result = (npy_longlong)value;  /* same type as value: nothing is lost */
+
+    if (wraparound  /* the only compiled trigger for this type, and it is always 0 here */
+#if NPY_SIZEOF_LONGLONG < NPY_SIZEOF_LONGLONG  /* never true: both sides are the same macro */
+            || *result != value
+#endif
+            ) {
+        PyArray_Descr *descr = PyArray_DescrFromType(NPY_LONGLONG);  /* for %S in the messages; released on every exit below */
+
+        if (npy_promotion_state == NPY_USE_LEGACY_PROMOTION || (
+                    npy_promotion_state == NPY_USE_WEAK_PROMOTION_AND_WARN
+                        && !npy_give_promotion_warnings())) {
+            /*
+             * This path will be taken both for the "promotion" case such as
+             * `uint8_arr + 123` as well as the assignment case.
+             * The "legacy" path should only ever be taken for assignment
+             * (legacy promotion will prevent overflows by promoting up)
+             * so a normal deprecation makes sense.
+             * When weak promotion is active, we use "future" behavior unless
+             * warnings were explicitly opt-in.
+             */
+            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                    "NumPy will stop allowing conversion of out-of-bound "
+                    "Python integers to integer arrays.  The conversion "
+                    "of %.100R to %S will fail in the future.\n"
+                    "For the old behavior, usually:\n"
+                    "    np.array(value).astype(dtype)\n"
+                    "will give the desired result (the cast overflows).",
+                    obj, descr) < 0) {
+                Py_DECREF(descr);
+                return -1;  /* PyErr_WarnFormat reported failure (< 0) */
+            }
+            Py_DECREF(descr);
+            return 0;  /* warned only: the converted value stays in *result */
+        }
+        else {
+            /* Live in the future, outright error: */
+            PyErr_Format(PyExc_OverflowError,
+                    "Python integer %R out of bounds for %S", obj, descr);
+            Py_DECREF(descr);
+            return -1;
+            }  /* closes the else branch (indentation is off) */
+        assert(0);  /* unreachable: both branches above return */
+    }
+    return 0;
+}
+
+
+#line 250
+
+/*
+ * Store the Python integer `obj` in `*result` as npy_ubyte.  The conversion
+ * function is the same one used below (for compatibility), with stricter
+ * overflow checks on top: a value that wrapped or does not fit is deprecated
+ * (related to NEP 50, but somewhat independent) and produces a
+ * DeprecationWarning or an OverflowError.  Returns 0, or -1 with an error set.
+ */
+static int
+UBYTE_safe_pyint_setitem(PyObject *obj, npy_ubyte *result)
+{
+    /* Input is guaranteed to be a Python integer */
+    assert(PyLong_Check(obj));
+    int wraparound;  /* MyPyLong_AsLongWithWrap always sets this to 0 */
+    npy_ulong value = MyPyLong_AsLongWithWrap(obj, &wraparound);  /* signed read kept as unsigned: negatives become huge values */
+    if (value == (npy_ulong)-1 && PyErr_Occurred()) {
+        return -1;
+    }
+    *result = (npy_ubyte)value;  /* stored before the range check below */
+
+    if (wraparound
+#if NPY_SIZEOF_BYTE < NPY_SIZEOF_LONG
+            || *result != value  /* the narrowing cast changed the value (this also catches negative input) */
+#endif
+            ) {
+        PyArray_Descr *descr = PyArray_DescrFromType(NPY_UBYTE);  /* for %S in the messages; released on every exit below */
+
+        if (npy_promotion_state == NPY_USE_LEGACY_PROMOTION || (
+                    npy_promotion_state == NPY_USE_WEAK_PROMOTION_AND_WARN
+                        && !npy_give_promotion_warnings())) {
+            /*
+             * This path will be taken both for the "promotion" case such as
+             * `uint8_arr + 123` as well as the assignment case.
+             * The "legacy" path should only ever be taken for assignment
+             * (legacy promotion will prevent overflows by promoting up)
+             * so a normal deprecation makes sense.
+             * When weak promotion is active, we use "future" behavior unless
+             * warnings were explicitly opt-in.
+             */
+            if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                    "NumPy will stop allowing conversion of out-of-bound "
+                    "Python integers to integer arrays.  The conversion "
+                    "of %.100R to %S will fail in the future.\n"
+                    "For the old behavior, usually:\n"
+                    "    np.array(value).astype(dtype)\n"
+                    "will give the desired result (the cast overflows).",
+                    obj, descr) < 0) {
+                Py_DECREF(descr);
+                return -1;  /* PyErr_WarnFormat reported failure (< 0) */
+            }
+            Py_DECREF(descr);
+            return 0;  /* warned only: the converted value stays in *result */
+        }
+        else {
+            /* Live in the future, outright error: */
+            PyErr_Format(PyExc_OverflowError,
+                    "Python integer %R out of bounds for %S", obj, descr);
+            Py_DECREF(descr);
+            return -1;
+            }  /* closes the else branch (indentation is off) */
+        assert(0);  /* unreachable: both branches above return */
+    }
+    return 0;
+}
+
+
+#line 250
+
+/*
+ * Strict conversion of a Python integer to npy_ushort (related to NEP 50).
+ * The integer is converted with MyPyLong_AsLongWithWrap() and then
+ * checked.  The cast value is stored in `*result` even when out of bounds;
+ * such input produces a DeprecationWarning or an OverflowError depending
+ * on the promotion state.  Returns 0 on success, -1 on error.
+ */
+static int
+USHORT_safe_pyint_setitem(PyObject *obj, npy_ushort *result)
+{
+    int wrapped;
+    int out_of_bounds;
+    int status;
+    npy_ulong as_c;
+    PyArray_Descr *descr;
+
+    /* Callers only pass Python integers. */
+    assert(PyLong_Check(obj));
+    as_c = MyPyLong_AsLongWithWrap(obj, &wrapped);
+    if (as_c == (npy_ulong)-1 && PyErr_Occurred()) {
+        return -1;
+    }
+    *result = (npy_ushort)as_c;
+
+    out_of_bounds = wrapped;
+#if NPY_SIZEOF_SHORT < NPY_SIZEOF_LONG
+    /* Narrower target type: catch values changed by the cast. */
+    if (*result != as_c) {
+        out_of_bounds = 1;
+    }
+#endif
+    if (!out_of_bounds) {
+        return 0;
+    }
+
+    descr = PyArray_DescrFromType(NPY_USHORT);
+    if (npy_promotion_state == NPY_USE_LEGACY_PROMOTION || (
+                npy_promotion_state == NPY_USE_WEAK_PROMOTION_AND_WARN
+                    && !npy_give_promotion_warnings())) {
+        /* Deprecation path.  With legacy promotion this should only be
+         * reached for assignment (legacy promotion avoids overflow by
+         * promoting up), so a normal DeprecationWarning makes sense. */
+        status = 0;
+        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                "NumPy will stop allowing conversion of out-of-bound "
+                "Python integers to integer arrays.  The conversion "
+                "of %.100R to %S will fail in the future.\n"
+                "For the old behavior, usually:\n"
+                "    np.array(value).astype(dtype)\n"
+                "will give the desired result (the cast overflows).",
+                obj, descr) < 0) {
+            status = -1;
+        }
+    }
+    else {
+        /* Live in the future, outright error: */
+        PyErr_Format(PyExc_OverflowError,
+                "Python integer %R out of bounds for %S", obj, descr);
+        status = -1;
+    }
+    Py_DECREF(descr);
+    return status;
+}
+
+
+#line 250
+
+/*
+ * Strict conversion of a Python integer to npy_uint (related to NEP 50).
+ * The integer is converted with MyPyLong_AsUnsignedLongWithWrap() and then
+ * checked.  The cast value is stored in `*result` even when out of bounds;
+ * such input produces a DeprecationWarning or an OverflowError depending
+ * on the promotion state.  Returns 0 on success, -1 on error.
+ */
+static int
+UINT_safe_pyint_setitem(PyObject *obj, npy_uint *result)
+{
+    int wrapped;
+    int out_of_bounds;
+    int status;
+    npy_ulong as_c;
+    PyArray_Descr *descr;
+
+    /* Callers only pass Python integers. */
+    assert(PyLong_Check(obj));
+    as_c = MyPyLong_AsUnsignedLongWithWrap(obj, &wrapped);
+    if (as_c == (npy_ulong)-1 && PyErr_Occurred()) {
+        return -1;
+    }
+    *result = (npy_uint)as_c;
+
+    out_of_bounds = wrapped;
+#if NPY_SIZEOF_INT < NPY_SIZEOF_LONG
+    /* Narrower target type: catch values changed by the cast. */
+    if (*result != as_c) {
+        out_of_bounds = 1;
+    }
+#endif
+    if (!out_of_bounds) {
+        return 0;
+    }
+
+    descr = PyArray_DescrFromType(NPY_UINT);
+    if (npy_promotion_state == NPY_USE_LEGACY_PROMOTION || (
+                npy_promotion_state == NPY_USE_WEAK_PROMOTION_AND_WARN
+                    && !npy_give_promotion_warnings())) {
+        /* Deprecation path.  With legacy promotion this should only be
+         * reached for assignment (legacy promotion avoids overflow by
+         * promoting up), so a normal DeprecationWarning makes sense. */
+        status = 0;
+        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                "NumPy will stop allowing conversion of out-of-bound "
+                "Python integers to integer arrays.  The conversion "
+                "of %.100R to %S will fail in the future.\n"
+                "For the old behavior, usually:\n"
+                "    np.array(value).astype(dtype)\n"
+                "will give the desired result (the cast overflows).",
+                obj, descr) < 0) {
+            status = -1;
+        }
+    }
+    else {
+        /* Live in the future, outright error: */
+        PyErr_Format(PyExc_OverflowError,
+                "Python integer %R out of bounds for %S", obj, descr);
+        status = -1;
+    }
+    Py_DECREF(descr);
+    return status;
+}
+
+
+#line 250
+
+/*
+ * Strict conversion of a Python integer to npy_ulong (related to NEP 50).
+ * The integer is converted with MyPyLong_AsUnsignedLongWithWrap() and then
+ * checked.  The cast value is stored in `*result` even when out of bounds;
+ * such input produces a DeprecationWarning or an OverflowError depending
+ * on the promotion state.  Returns 0 on success, -1 on error.
+ */
+static int
+ULONG_safe_pyint_setitem(PyObject *obj, npy_ulong *result)
+{
+    int wrapped;
+    int out_of_bounds;
+    int status;
+    npy_ulong as_c;
+    PyArray_Descr *descr;
+
+    /* Callers only pass Python integers. */
+    assert(PyLong_Check(obj));
+    as_c = MyPyLong_AsUnsignedLongWithWrap(obj, &wrapped);
+    if (as_c == (npy_ulong)-1 && PyErr_Occurred()) {
+        return -1;
+    }
+    *result = (npy_ulong)as_c;
+
+    out_of_bounds = wrapped;
+#if NPY_SIZEOF_LONG < NPY_SIZEOF_LONG
+    /* Both sizes are identical, so this block is never compiled. */
+    if (*result != as_c) {
+        out_of_bounds = 1;
+    }
+#endif
+    if (!out_of_bounds) {
+        return 0;
+    }
+
+    descr = PyArray_DescrFromType(NPY_ULONG);
+    if (npy_promotion_state == NPY_USE_LEGACY_PROMOTION || (
+                npy_promotion_state == NPY_USE_WEAK_PROMOTION_AND_WARN
+                    && !npy_give_promotion_warnings())) {
+        /* Deprecation path.  With legacy promotion this should only be
+         * reached for assignment (legacy promotion avoids overflow by
+         * promoting up), so a normal DeprecationWarning makes sense. */
+        status = 0;
+        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                "NumPy will stop allowing conversion of out-of-bound "
+                "Python integers to integer arrays.  The conversion "
+                "of %.100R to %S will fail in the future.\n"
+                "For the old behavior, usually:\n"
+                "    np.array(value).astype(dtype)\n"
+                "will give the desired result (the cast overflows).",
+                obj, descr) < 0) {
+            status = -1;
+        }
+    }
+    else {
+        /* Live in the future, outright error: */
+        PyErr_Format(PyExc_OverflowError,
+                "Python integer %R out of bounds for %S", obj, descr);
+        status = -1;
+    }
+    Py_DECREF(descr);
+    return status;
+}
+
+
+#line 250
+
+/*
+ * Strict conversion of a Python integer to npy_ulonglong (see NEP 50).
+ * The integer is converted with MyPyLong_AsUnsignedLongLongWithWrap() and
+ * then checked.  The cast value is stored in `*result` even when out of
+ * bounds; such input produces a DeprecationWarning or an OverflowError
+ * depending on the promotion state.  Returns 0 on success, -1 on error.
+ */
+static int
+ULONGLONG_safe_pyint_setitem(PyObject *obj, npy_ulonglong *result)
+{
+    int wrapped;
+    int out_of_bounds;
+    int status;
+    npy_ulonglong as_c;
+    PyArray_Descr *descr;
+
+    /* Callers only pass Python integers. */
+    assert(PyLong_Check(obj));
+    as_c = MyPyLong_AsUnsignedLongLongWithWrap(obj, &wrapped);
+    if (as_c == (npy_ulonglong)-1 && PyErr_Occurred()) {
+        return -1;
+    }
+    *result = (npy_ulonglong)as_c;
+
+    out_of_bounds = wrapped;
+#if NPY_SIZEOF_LONGLONG < NPY_SIZEOF_LONGLONG
+    /* Both sizes are identical, so this block is never compiled. */
+    if (*result != as_c) {
+        out_of_bounds = 1;
+    }
+#endif
+    if (!out_of_bounds) {
+        return 0;
+    }
+
+    descr = PyArray_DescrFromType(NPY_ULONGLONG);
+    if (npy_promotion_state == NPY_USE_LEGACY_PROMOTION || (
+                npy_promotion_state == NPY_USE_WEAK_PROMOTION_AND_WARN
+                    && !npy_give_promotion_warnings())) {
+        /* Deprecation path.  With legacy promotion this should only be
+         * reached for assignment (legacy promotion avoids overflow by
+         * promoting up), so a normal DeprecationWarning makes sense. */
+        status = 0;
+        if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
+                "NumPy will stop allowing conversion of out-of-bound "
+                "Python integers to integer arrays.  The conversion "
+                "of %.100R to %S will fail in the future.\n"
+                "For the old behavior, usually:\n"
+                "    np.array(value).astype(dtype)\n"
+                "will give the desired result (the cast overflows).",
+                obj, descr) < 0) {
+            status = -1;
+        }
+    }
+    else {
+        /* Live in the future, outright error: */
+        PyErr_Format(PyExc_OverflowError,
+                "Python integer %R out of bounds for %S", obj, descr);
+        status = -1;
+    }
+    Py_DECREF(descr);
+    return status;
+}
+
+
+
+
+#line 338
+/* Return the npy_bool element stored at `input` as a Python object.
+ * `vap` is the array or NULL; if not ISBEHAVED_RO, read via copyswap. */
+static PyObject *
+BOOL_getitem(void *input, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_bool elem;
+
+    if (arr == NULL || PyArray_ISBEHAVED_RO(arr)) {
+        elem = *((npy_bool *)input);
+        return PyBool_FromLong((long)elem);
+    }
+    PyArray_DESCR(arr)->f->copyswap(&elem, input,
+                                    PyArray_ISBYTESWAPPED(arr), arr);
+    return PyBool_FromLong((long)elem);
+}
+
+/*
+ * Convert the Python object `op` to npy_bool and store it at `ov`.
+ * `vap` is the destination array (or NULL).  Returns 0 or -1 (failure).
+ *
+ * No separate strict path for Python ints is taken in this setter: any
+ * object that is not a matching NumPy scalar goes through PyObject_IsTrue().
+ */
+NPY_NO_EXPORT int
+BOOL_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_bool converted;  /* local of the target type, so suitably aligned */
+
+    if (PyArray_IsScalar(op, Bool)) {
+        /* Matching NumPy scalar: copy its stored value. */
+        converted = PyArrayScalar_VAL(op, Bool);
+    }
+    else {
+        converted = (npy_bool)PyObject_IsTrue(op);
+    }
+
+    /* Conversion failures are detected through the error indicator. */
+    if (PyErr_Occurred()) {
+        PyObject *exc_type, *exc_value, *exc_tb;
+
+        PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+        if (!PySequence_NoString_Check(op)) {
+            /* Not a sequence: put the original error back unchanged. */
+            PyErr_Restore(exc_type, exc_value, exc_tb);
+            return -1;
+        }
+        /* Sequence input: raise ValueError, chaining the original error. */
+        PyErr_SetString(PyExc_ValueError,
+                "setting an array element with a sequence.");
+        npy_PyErr_ChainExceptionsCause(exc_type, exc_value, exc_tb);
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        /* Array fails PyArray_ISBEHAVED: write through copyswap. */
+        PyArray_DESCR(arr)->f->copyswap(ov, &converted,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+        return 0;
+    }
+    /* Direct store into the destination. */
+    assert(npy_is_aligned(ov, NPY_ALIGNOF(npy_bool)));
+    *((npy_bool *)ov) = converted;
+    return 0;
+}
+
+
+#line 338
+/* Return the npy_byte element stored at `input` as a Python object.
+ * `vap` is the array or NULL; if not ISBEHAVED_RO, read via copyswap. */
+static PyObject *
+BYTE_getitem(void *input, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_byte elem;
+
+    if (arr == NULL || PyArray_ISBEHAVED_RO(arr)) {
+        elem = *((npy_byte *)input);
+        return PyLong_FromLong((long)elem);
+    }
+    PyArray_DESCR(arr)->f->copyswap(&elem, input,
+                                    PyArray_ISBYTESWAPPED(arr), arr);
+    return PyLong_FromLong((long)elem);
+}
+
+/*
+ * Convert the Python object `op` to npy_byte and store it at `ov`.
+ * `vap` is the destination array (or NULL).  Returns 0 or -1 (failure).
+ */
+NPY_NO_EXPORT int
+BYTE_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_byte converted;  /* local of the target type, so suitably aligned */
+
+    if (PyLong_Check(op)) {
+        /* Python ints go through the stricter helper (NEP 50 related),
+         * which is where out-of-bound values are caught. */
+        if (BYTE_safe_pyint_setitem(op, &converted) < 0) {
+            return -1;
+        }
+    }
+    else if (PyArray_IsScalar(op, Byte)) {
+        converted = PyArrayScalar_VAL(op, Byte);
+    }
+    else {
+        converted = (npy_byte)MyPyLong_AsLong(op);
+    }
+
+    if (PyErr_Occurred()) {
+        PyObject *exc_type, *exc_value, *exc_tb;
+
+        PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+        if (!PySequence_NoString_Check(op)) {
+            /* Not a sequence: put the original error back unchanged. */
+            PyErr_Restore(exc_type, exc_value, exc_tb);
+            return -1;
+        }
+        /* Sequence input: raise ValueError, chaining the original error. */
+        PyErr_SetString(PyExc_ValueError,
+                "setting an array element with a sequence.");
+        npy_PyErr_ChainExceptionsCause(exc_type, exc_value, exc_tb);
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        PyArray_DESCR(arr)->f->copyswap(ov, &converted,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+        return 0;
+    }
+    assert(npy_is_aligned(ov, NPY_ALIGNOF(npy_byte)));
+    *((npy_byte *)ov) = converted;
+    return 0;
+}
+
+
+#line 338
+/* Return the npy_ubyte element stored at `input` as a Python object.
+ * `vap` is the array or NULL; if not ISBEHAVED_RO, read via copyswap. */
+static PyObject *
+UBYTE_getitem(void *input, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_ubyte elem;
+
+    if (arr == NULL || PyArray_ISBEHAVED_RO(arr)) {
+        elem = *((npy_ubyte *)input);
+        return PyLong_FromLong((long)elem);
+    }
+    PyArray_DESCR(arr)->f->copyswap(&elem, input,
+                                    PyArray_ISBYTESWAPPED(arr), arr);
+    return PyLong_FromLong((long)elem);
+}
+
+/*
+ * Convert the Python object `op` to npy_ubyte and store it at `ov`.
+ * `vap` is the destination array (or NULL).  Returns 0 or -1 (failure).
+ */
+NPY_NO_EXPORT int
+UBYTE_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_ubyte converted;  /* local of the target type, so suitably aligned */
+
+    if (PyLong_Check(op)) {
+        /* Python ints go through the stricter helper (NEP 50 related),
+         * which is where out-of-bound values are caught. */
+        if (UBYTE_safe_pyint_setitem(op, &converted) < 0) {
+            return -1;
+        }
+    }
+    else if (PyArray_IsScalar(op, UByte)) {
+        converted = PyArrayScalar_VAL(op, UByte);
+    }
+    else {
+        converted = (npy_ubyte)MyPyLong_AsLong(op);
+    }
+
+    if (PyErr_Occurred()) {
+        PyObject *exc_type, *exc_value, *exc_tb;
+
+        PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+        if (!PySequence_NoString_Check(op)) {
+            /* Not a sequence: put the original error back unchanged. */
+            PyErr_Restore(exc_type, exc_value, exc_tb);
+            return -1;
+        }
+        /* Sequence input: raise ValueError, chaining the original error. */
+        PyErr_SetString(PyExc_ValueError,
+                "setting an array element with a sequence.");
+        npy_PyErr_ChainExceptionsCause(exc_type, exc_value, exc_tb);
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        PyArray_DESCR(arr)->f->copyswap(ov, &converted,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+        return 0;
+    }
+    assert(npy_is_aligned(ov, NPY_ALIGNOF(npy_ubyte)));
+    *((npy_ubyte *)ov) = converted;
+    return 0;
+}
+
+
+#line 338
+/* Return the npy_short element stored at `input` as a Python object.
+ * `vap` is the array or NULL; if not ISBEHAVED_RO, read via copyswap. */
+static PyObject *
+SHORT_getitem(void *input, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_short elem;
+
+    if (arr == NULL || PyArray_ISBEHAVED_RO(arr)) {
+        elem = *((npy_short *)input);
+        return PyLong_FromLong((long)elem);
+    }
+    PyArray_DESCR(arr)->f->copyswap(&elem, input,
+                                    PyArray_ISBYTESWAPPED(arr), arr);
+    return PyLong_FromLong((long)elem);
+}
+
+/*
+ * Convert the Python object `op` to npy_short and store it at `ov`.
+ * `vap` is the destination array (or NULL).  Returns 0 or -1 (failure).
+ */
+NPY_NO_EXPORT int
+SHORT_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_short converted;  /* local of the target type, so suitably aligned */
+
+    if (PyLong_Check(op)) {
+        /* Python ints go through the stricter helper (NEP 50 related),
+         * which is where out-of-bound values are caught. */
+        if (SHORT_safe_pyint_setitem(op, &converted) < 0) {
+            return -1;
+        }
+    }
+    else if (PyArray_IsScalar(op, Short)) {
+        converted = PyArrayScalar_VAL(op, Short);
+    }
+    else {
+        converted = (npy_short)MyPyLong_AsLong(op);
+    }
+
+    if (PyErr_Occurred()) {
+        PyObject *exc_type, *exc_value, *exc_tb;
+
+        PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+        if (!PySequence_NoString_Check(op)) {
+            /* Not a sequence: put the original error back unchanged. */
+            PyErr_Restore(exc_type, exc_value, exc_tb);
+            return -1;
+        }
+        /* Sequence input: raise ValueError, chaining the original error. */
+        PyErr_SetString(PyExc_ValueError,
+                "setting an array element with a sequence.");
+        npy_PyErr_ChainExceptionsCause(exc_type, exc_value, exc_tb);
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        PyArray_DESCR(arr)->f->copyswap(ov, &converted,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+        return 0;
+    }
+    assert(npy_is_aligned(ov, NPY_ALIGNOF(npy_short)));
+    *((npy_short *)ov) = converted;
+    return 0;
+}
+
+
+#line 338
+/* Return the npy_ushort element stored at `input` as a Python object.
+ * `vap` is the array or NULL; if not ISBEHAVED_RO, read via copyswap. */
+static PyObject *
+USHORT_getitem(void *input, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_ushort elem;
+
+    if (arr == NULL || PyArray_ISBEHAVED_RO(arr)) {
+        elem = *((npy_ushort *)input);
+        return PyLong_FromLong((long)elem);
+    }
+    PyArray_DESCR(arr)->f->copyswap(&elem, input,
+                                    PyArray_ISBYTESWAPPED(arr), arr);
+    return PyLong_FromLong((long)elem);
+}
+
+/*
+ * Convert the Python object `op` to npy_ushort and store it at `ov`.
+ * `vap` is the destination array (or NULL).  Returns 0 or -1 (failure).
+ */
+NPY_NO_EXPORT int
+USHORT_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_ushort converted;  /* local of the target type, so suitably aligned */
+
+    if (PyLong_Check(op)) {
+        /* Python ints go through the stricter helper (NEP 50 related),
+         * which is where out-of-bound values are caught. */
+        if (USHORT_safe_pyint_setitem(op, &converted) < 0) {
+            return -1;
+        }
+    }
+    else if (PyArray_IsScalar(op, UShort)) {
+        converted = PyArrayScalar_VAL(op, UShort);
+    }
+    else {
+        converted = (npy_ushort)MyPyLong_AsLong(op);
+    }
+
+    if (PyErr_Occurred()) {
+        PyObject *exc_type, *exc_value, *exc_tb;
+
+        PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+        if (!PySequence_NoString_Check(op)) {
+            /* Not a sequence: put the original error back unchanged. */
+            PyErr_Restore(exc_type, exc_value, exc_tb);
+            return -1;
+        }
+        /* Sequence input: raise ValueError, chaining the original error. */
+        PyErr_SetString(PyExc_ValueError,
+                "setting an array element with a sequence.");
+        npy_PyErr_ChainExceptionsCause(exc_type, exc_value, exc_tb);
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        PyArray_DESCR(arr)->f->copyswap(ov, &converted,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+        return 0;
+    }
+    assert(npy_is_aligned(ov, NPY_ALIGNOF(npy_ushort)));
+    *((npy_ushort *)ov) = converted;
+    return 0;
+}
+
+
+#line 338
+/* Return the npy_int element stored at `input` as a Python object.
+ * `vap` is the array or NULL; if not ISBEHAVED_RO, read via copyswap. */
+static PyObject *
+INT_getitem(void *input, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_int elem;
+
+    if (arr == NULL || PyArray_ISBEHAVED_RO(arr)) {
+        elem = *((npy_int *)input);
+        return PyLong_FromLong((long)elem);
+    }
+    PyArray_DESCR(arr)->f->copyswap(&elem, input,
+                                    PyArray_ISBYTESWAPPED(arr), arr);
+    return PyLong_FromLong((long)elem);
+}
+
+/*
+ * Convert the Python object `op` to npy_int and store it at `ov`.
+ * `vap` is the destination array (or NULL).  Returns 0 or -1 (failure).
+ */
+NPY_NO_EXPORT int
+INT_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_int converted;  /* local of the target type, so suitably aligned */
+
+    if (PyLong_Check(op)) {
+        /* Python ints go through the stricter helper (NEP 50 related),
+         * which is where out-of-bound values are caught. */
+        if (INT_safe_pyint_setitem(op, &converted) < 0) {
+            return -1;
+        }
+    }
+    else if (PyArray_IsScalar(op, Int)) {
+        converted = PyArrayScalar_VAL(op, Int);
+    }
+    else {
+        converted = (npy_int)MyPyLong_AsLong(op);
+    }
+
+    if (PyErr_Occurred()) {
+        PyObject *exc_type, *exc_value, *exc_tb;
+
+        PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+        if (!PySequence_NoString_Check(op)) {
+            /* Not a sequence: put the original error back unchanged. */
+            PyErr_Restore(exc_type, exc_value, exc_tb);
+            return -1;
+        }
+        /* Sequence input: raise ValueError, chaining the original error. */
+        PyErr_SetString(PyExc_ValueError,
+                "setting an array element with a sequence.");
+        npy_PyErr_ChainExceptionsCause(exc_type, exc_value, exc_tb);
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        PyArray_DESCR(arr)->f->copyswap(ov, &converted,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+        return 0;
+    }
+    assert(npy_is_aligned(ov, NPY_ALIGNOF(npy_int)));
+    *((npy_int *)ov) = converted;
+    return 0;
+}
+
+
+#line 338
+/* Return the npy_long element stored at `input` as a Python object.
+ * `vap` is the array or NULL; if not ISBEHAVED_RO, read via copyswap. */
+static PyObject *
+LONG_getitem(void *input, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_long elem;
+
+    if (arr == NULL || PyArray_ISBEHAVED_RO(arr)) {
+        elem = *((npy_long *)input);
+        return PyLong_FromLong((long)elem);
+    }
+    PyArray_DESCR(arr)->f->copyswap(&elem, input,
+                                    PyArray_ISBYTESWAPPED(arr), arr);
+    return PyLong_FromLong((long)elem);
+}
+
+/*
+ * Convert the Python object `op` to npy_long and store it at `ov`.
+ * `vap` is the destination array (or NULL).  Returns 0 or -1 (failure).
+ */
+NPY_NO_EXPORT int
+LONG_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_long converted;  /* local of the target type, so suitably aligned */
+
+    if (PyLong_Check(op)) {
+        /* Python ints go through the stricter helper (NEP 50 related),
+         * which is where out-of-bound values are caught. */
+        if (LONG_safe_pyint_setitem(op, &converted) < 0) {
+            return -1;
+        }
+    }
+    else if (PyArray_IsScalar(op, Long)) {
+        converted = PyArrayScalar_VAL(op, Long);
+    }
+    else {
+        converted = (npy_long)MyPyLong_AsLong(op);
+    }
+
+    if (PyErr_Occurred()) {
+        PyObject *exc_type, *exc_value, *exc_tb;
+
+        PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+        if (!PySequence_NoString_Check(op)) {
+            /* Not a sequence: put the original error back unchanged. */
+            PyErr_Restore(exc_type, exc_value, exc_tb);
+            return -1;
+        }
+        /* Sequence input: raise ValueError, chaining the original error. */
+        PyErr_SetString(PyExc_ValueError,
+                "setting an array element with a sequence.");
+        npy_PyErr_ChainExceptionsCause(exc_type, exc_value, exc_tb);
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        PyArray_DESCR(arr)->f->copyswap(ov, &converted,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+        return 0;
+    }
+    assert(npy_is_aligned(ov, NPY_ALIGNOF(npy_long)));
+    *((npy_long *)ov) = converted;
+    return 0;
+}
+
+
+#line 338
+/* Return the npy_uint element stored at `input` as a Python object.
+ * `vap` is the array or NULL; if not ISBEHAVED_RO, read via copyswap. */
+static PyObject *
+UINT_getitem(void *input, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_uint elem;
+
+    if (arr == NULL || PyArray_ISBEHAVED_RO(arr)) {
+        elem = *((npy_uint *)input);
+        return PyLong_FromUnsignedLong((npy_ulong)elem);
+    }
+    PyArray_DESCR(arr)->f->copyswap(&elem, input,
+                                    PyArray_ISBYTESWAPPED(arr), arr);
+    return PyLong_FromUnsignedLong((npy_ulong)elem);
+}
+
+/*
+ * Convert the Python object `op` to npy_uint and store it at `ov`.
+ * `vap` is the destination array (or NULL).  Returns 0 or -1 (failure).
+ */
+NPY_NO_EXPORT int
+UINT_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_uint converted;  /* local of the target type, so suitably aligned */
+
+    if (PyLong_Check(op)) {
+        /* Python ints go through the stricter helper (NEP 50 related),
+         * which is where out-of-bound values are caught. */
+        if (UINT_safe_pyint_setitem(op, &converted) < 0) {
+            return -1;
+        }
+    }
+    else if (PyArray_IsScalar(op, UInt)) {
+        converted = PyArrayScalar_VAL(op, UInt);
+    }
+    else {
+        converted = (npy_uint)MyPyLong_AsUnsignedLong(op);
+    }
+
+    if (PyErr_Occurred()) {
+        PyObject *exc_type, *exc_value, *exc_tb;
+
+        PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+        if (!PySequence_NoString_Check(op)) {
+            /* Not a sequence: put the original error back unchanged. */
+            PyErr_Restore(exc_type, exc_value, exc_tb);
+            return -1;
+        }
+        /* Sequence input: raise ValueError, chaining the original error. */
+        PyErr_SetString(PyExc_ValueError,
+                "setting an array element with a sequence.");
+        npy_PyErr_ChainExceptionsCause(exc_type, exc_value, exc_tb);
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        PyArray_DESCR(arr)->f->copyswap(ov, &converted,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+        return 0;
+    }
+    assert(npy_is_aligned(ov, NPY_ALIGNOF(npy_uint)));
+    *((npy_uint *)ov) = converted;
+    return 0;
+}
+
+
+#line 338
+/* Return the npy_ulong element stored at `input` as a Python object.
+ * `vap` is the array or NULL; if not ISBEHAVED_RO, read via copyswap. */
+static PyObject *
+ULONG_getitem(void *input, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_ulong elem;
+
+    if (arr == NULL || PyArray_ISBEHAVED_RO(arr)) {
+        elem = *((npy_ulong *)input);
+        return PyLong_FromUnsignedLong((npy_ulong)elem);
+    }
+    PyArray_DESCR(arr)->f->copyswap(&elem, input,
+                                    PyArray_ISBYTESWAPPED(arr), arr);
+    return PyLong_FromUnsignedLong((npy_ulong)elem);
+}
+
+/*
+ * Convert the Python object `op` to npy_ulong and store it at `ov`.
+ * `vap` is the destination array (or NULL).  Returns 0 or -1 (failure).
+ */
+NPY_NO_EXPORT int
+ULONG_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_ulong converted;  /* local of the target type, so suitably aligned */
+
+    if (PyLong_Check(op)) {
+        /* Python ints go through the stricter helper (NEP 50 related),
+         * which is where out-of-bound values are caught. */
+        if (ULONG_safe_pyint_setitem(op, &converted) < 0) {
+            return -1;
+        }
+    }
+    else if (PyArray_IsScalar(op, ULong)) {
+        converted = PyArrayScalar_VAL(op, ULong);
+    }
+    else {
+        converted = (npy_ulong)MyPyLong_AsUnsignedLong(op);
+    }
+
+    if (PyErr_Occurred()) {
+        PyObject *exc_type, *exc_value, *exc_tb;
+
+        PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+        if (!PySequence_NoString_Check(op)) {
+            /* Not a sequence: put the original error back unchanged. */
+            PyErr_Restore(exc_type, exc_value, exc_tb);
+            return -1;
+        }
+        /* Sequence input: raise ValueError, chaining the original error. */
+        PyErr_SetString(PyExc_ValueError,
+                "setting an array element with a sequence.");
+        npy_PyErr_ChainExceptionsCause(exc_type, exc_value, exc_tb);
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        PyArray_DESCR(arr)->f->copyswap(ov, &converted,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+        return 0;
+    }
+    assert(npy_is_aligned(ov, NPY_ALIGNOF(npy_ulong)));
+    *((npy_ulong *)ov) = converted;
+    return 0;
+}
+
+
+#line 338
+/* Return the npy_longlong element stored at `input` as a Python object.
+ * `vap` is the array or NULL; if not ISBEHAVED_RO, read via copyswap. */
+static PyObject *
+LONGLONG_getitem(void *input, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_longlong elem;
+
+    if (arr == NULL || PyArray_ISBEHAVED_RO(arr)) {
+        elem = *((npy_longlong *)input);
+        return PyLong_FromLongLong((npy_longlong)elem);
+    }
+    PyArray_DESCR(arr)->f->copyswap(&elem, input,
+                                    PyArray_ISBYTESWAPPED(arr), arr);
+    return PyLong_FromLongLong((npy_longlong)elem);
+}
+
+/*
+ * Convert the Python object `op` to npy_longlong and store it at `ov`.
+ * `vap` is the destination array (or NULL).  Returns 0 or -1 (failure).
+ */
+NPY_NO_EXPORT int
+LONGLONG_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_longlong converted;  /* local of the target type, so aligned */
+
+    if (PyLong_Check(op)) {
+        /* Python ints go through the stricter helper (NEP 50 related),
+         * which is where out-of-bound values are caught. */
+        if (LONGLONG_safe_pyint_setitem(op, &converted) < 0) {
+            return -1;
+        }
+    }
+    else if (PyArray_IsScalar(op, LongLong)) {
+        converted = PyArrayScalar_VAL(op, LongLong);
+    }
+    else {
+        converted = (npy_longlong)MyPyLong_AsLongLong(op);
+    }
+
+    if (PyErr_Occurred()) {
+        PyObject *exc_type, *exc_value, *exc_tb;
+
+        PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+        if (!PySequence_NoString_Check(op)) {
+            /* Not a sequence: put the original error back unchanged. */
+            PyErr_Restore(exc_type, exc_value, exc_tb);
+            return -1;
+        }
+        /* Sequence input: raise ValueError, chaining the original error. */
+        PyErr_SetString(PyExc_ValueError,
+                "setting an array element with a sequence.");
+        npy_PyErr_ChainExceptionsCause(exc_type, exc_value, exc_tb);
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        PyArray_DESCR(arr)->f->copyswap(ov, &converted,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+        return 0;
+    }
+    assert(npy_is_aligned(ov, NPY_ALIGNOF(npy_longlong)));
+    *((npy_longlong *)ov) = converted;
+    return 0;
+}
+
+
+#line 338
+/* Return the npy_ulonglong element stored at `input` as a Python object.
+ * `vap` is the array or NULL; if not ISBEHAVED_RO, read via copyswap. */
+static PyObject *
+ULONGLONG_getitem(void *input, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_ulonglong elem;
+
+    if (arr == NULL || PyArray_ISBEHAVED_RO(arr)) {
+        elem = *((npy_ulonglong *)input);
+        return PyLong_FromUnsignedLongLong((npy_ulonglong)elem);
+    }
+    PyArray_DESCR(arr)->f->copyswap(&elem, input,
+                                    PyArray_ISBYTESWAPPED(arr), arr);
+    return PyLong_FromUnsignedLongLong((npy_ulonglong)elem);
+}
+
+/*
+ * Convert the Python object `op` to npy_ulonglong and store it at `ov`.
+ * `vap` is the destination array (or NULL).  Returns 0 or -1 (failure).
+ */
+NPY_NO_EXPORT int
+ULONGLONG_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_ulonglong converted;  /* local of the target type, so aligned */
+
+    if (PyLong_Check(op)) {
+        /* Python ints go through the stricter helper (NEP 50 related),
+         * which is where out-of-bound values are caught. */
+        if (ULONGLONG_safe_pyint_setitem(op, &converted) < 0) {
+            return -1;
+        }
+    }
+    else if (PyArray_IsScalar(op, ULongLong)) {
+        converted = PyArrayScalar_VAL(op, ULongLong);
+    }
+    else {
+        converted = (npy_ulonglong)MyPyLong_AsUnsignedLongLong(op);
+    }
+
+    if (PyErr_Occurred()) {
+        PyObject *exc_type, *exc_value, *exc_tb;
+
+        PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+        if (!PySequence_NoString_Check(op)) {
+            /* Not a sequence: put the original error back unchanged. */
+            PyErr_Restore(exc_type, exc_value, exc_tb);
+            return -1;
+        }
+        /* Sequence input: raise ValueError, chaining the original error. */
+        PyErr_SetString(PyExc_ValueError,
+                "setting an array element with a sequence.");
+        npy_PyErr_ChainExceptionsCause(exc_type, exc_value, exc_tb);
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        PyArray_DESCR(arr)->f->copyswap(ov, &converted,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+        return 0;
+    }
+    assert(npy_is_aligned(ov, NPY_ALIGNOF(npy_ulonglong)));
+    *((npy_ulonglong *)ov) = converted;
+    return 0;
+}
+
+
+#line 338
+/* Return the npy_half element stored at `input` as a Python object.
+ * `vap` is the array or NULL; if not ISBEHAVED_RO, read via copyswap. */
+static PyObject *
+HALF_getitem(void *input, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_half elem;
+
+    if (arr == NULL || PyArray_ISBEHAVED_RO(arr)) {
+        elem = *((npy_half *)input);
+        return MyPyFloat_FromHalf((npy_half)elem);
+    }
+    PyArray_DESCR(arr)->f->copyswap(&elem, input,
+                                    PyArray_ISBYTESWAPPED(arr), arr);
+    return MyPyFloat_FromHalf((npy_half)elem);
+}
+
+/*
+ * Convert `op` to an npy_half and write it to the element at `ov`.
+ *
+ * `vap` is the array the element belongs to; it may be NULL, in which case
+ * `ov` is written directly.  Returns 0 on success and -1 on failure.
+ */
+NPY_NO_EXPORT int
+HALF_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_half value;  /* local staging copy, guarantees alignment */
+
+    /* The strict Python-integer path is compiled out for this type. */
+    if (PyArray_IsScalar(op, Half)) {
+        value = PyArrayScalar_VAL(op, Half);
+    }
+    else {
+        value = (npy_half)MyPyFloat_AsHalf(op);
+    }
+
+    if (PyErr_Occurred()) {
+        PyObject *exc_type, *exc_value, *exc_tb;
+
+        PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+        if (!PySequence_NoString_Check(op)) {
+            /* Not a sequence: report the conversion error unchanged. */
+            PyErr_Restore(exc_type, exc_value, exc_tb);
+        }
+        else {
+            /* Sequences get a dedicated message, chained to the cause. */
+            PyErr_SetString(PyExc_ValueError,
+                    "setting an array element with a sequence.");
+            npy_PyErr_ChainExceptionsCause(exc_type, exc_value, exc_tb);
+        }
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        /* Misbehaved destination: let copyswap do the (swapped) store. */
+        PyArray_DESCR(arr)->f->copyswap(ov, &value,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+    }
+    else {
+        assert(npy_is_aligned(ov, NPY_ALIGNOF(npy_half)));
+        *(npy_half *)ov = value;
+    }
+    return 0;
+}
+
+
+#line 338
+/*
+ * Read one npy_float element from `input` and return it as a Python float.
+ * `vap` is the owning array and may be NULL.
+ */
+static PyObject *
+FLOAT_getitem(void *input, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_float value;
+
+    if (arr != NULL && !PyArray_ISBEHAVED_RO(arr)) {
+        /* Misbehaved source: copy (and byte-swap if needed) into a local. */
+        PyArray_DESCR(arr)->f->copyswap(&value, input,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+    }
+    else {
+        value = *(npy_float *)input;
+    }
+    return PyFloat_FromDouble(value);
+}
+
+/*
+ * Convert `op` to an npy_float and write it to the element at `ov`.
+ *
+ * `vap` is the array the element belongs to; it may be NULL, in which case
+ * `ov` is written directly.  Returns 0 on success and -1 on failure.
+ */
+NPY_NO_EXPORT int
+FLOAT_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_float value;  /* local staging copy, guarantees alignment */
+
+    /* The strict Python-integer path is compiled out for this type. */
+    if (PyArray_IsScalar(op, Float)) {
+        value = PyArrayScalar_VAL(op, Float);
+    }
+    else {
+        value = (npy_float)MyPyFloat_AsFloat(op);
+    }
+
+    if (PyErr_Occurred()) {
+        PyObject *exc_type, *exc_value, *exc_tb;
+
+        PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+        if (!PySequence_NoString_Check(op)) {
+            /* Not a sequence: report the conversion error unchanged. */
+            PyErr_Restore(exc_type, exc_value, exc_tb);
+        }
+        else {
+            /* Sequences get a dedicated message, chained to the cause. */
+            PyErr_SetString(PyExc_ValueError,
+                    "setting an array element with a sequence.");
+            npy_PyErr_ChainExceptionsCause(exc_type, exc_value, exc_tb);
+        }
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        /* Misbehaved destination: let copyswap do the (swapped) store. */
+        PyArray_DESCR(arr)->f->copyswap(ov, &value,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+    }
+    else {
+        assert(npy_is_aligned(ov, NPY_ALIGNOF(npy_float)));
+        *(npy_float *)ov = value;
+    }
+    return 0;
+}
+
+
+#line 338
+/*
+ * Read one npy_double element from `input` and return it as a Python float.
+ * `vap` is the owning array and may be NULL.
+ */
+static PyObject *
+DOUBLE_getitem(void *input, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_double value;
+
+    if (arr != NULL && !PyArray_ISBEHAVED_RO(arr)) {
+        /* Misbehaved source: copy (and byte-swap if needed) into a local. */
+        PyArray_DESCR(arr)->f->copyswap(&value, input,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+    }
+    else {
+        value = *(npy_double *)input;
+    }
+    return PyFloat_FromDouble(value);
+}
+
+/*
+ * Convert `op` to an npy_double and write it to the element at `ov`.
+ *
+ * `vap` is the array the element belongs to; it may be NULL, in which case
+ * `ov` is written directly.  Returns 0 on success and -1 on failure.
+ */
+NPY_NO_EXPORT int
+DOUBLE_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_double value;  /* local staging copy, guarantees alignment */
+
+    /* The strict Python-integer path is compiled out for this type. */
+    if (PyArray_IsScalar(op, Double)) {
+        value = PyArrayScalar_VAL(op, Double);
+    }
+    else {
+        value = (npy_double)MyPyFloat_AsDouble(op);
+    }
+
+    if (PyErr_Occurred()) {
+        PyObject *exc_type, *exc_value, *exc_tb;
+
+        PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+        if (!PySequence_NoString_Check(op)) {
+            /* Not a sequence: report the conversion error unchanged. */
+            PyErr_Restore(exc_type, exc_value, exc_tb);
+        }
+        else {
+            /* Sequences get a dedicated message, chained to the cause. */
+            PyErr_SetString(PyExc_ValueError,
+                    "setting an array element with a sequence.");
+            npy_PyErr_ChainExceptionsCause(exc_type, exc_value, exc_tb);
+        }
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        /* Misbehaved destination: let copyswap do the (swapped) store. */
+        PyArray_DESCR(arr)->f->copyswap(ov, &value,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+    }
+    else {
+        assert(npy_is_aligned(ov, NPY_ALIGNOF(npy_double)));
+        *(npy_double *)ov = value;
+    }
+    return 0;
+}
+
+
+
+
+#line 413
+/*
+ * Read a complex element stored as two consecutive npy_float values
+ * (real, then imaginary) at `input` and return it as a Python complex.
+ * `vap` is the owning array and may be NULL.
+ */
+static PyObject *
+CFLOAT_getitem(void *input, void *vap)
+{
+    PyArrayObject *arr = vap;
+    char *src = input;
+    npy_float re, im;
+
+    if (arr != NULL && !PyArray_ISBEHAVED_RO(arr)) {
+        /* Misbehaved source: copy each component, swapping if required. */
+        int width = sizeof(npy_float);
+        npy_bool swapped = PyArray_ISBYTESWAPPED(arr);
+
+        copy_and_swap(&re, src, width, 1, 0, swapped);
+        copy_and_swap(&im, src + width, width, 1, 0, swapped);
+    }
+    else {
+        const npy_float *pair = (const npy_float *)src;
+
+        re = pair[0];
+        im = pair[1];
+    }
+    return PyComplex_FromDoubles((double)re, (double)im);
+}
+
+
+#line 413
+/*
+ * Read a complex element stored as two consecutive npy_double values
+ * (real, then imaginary) at `input` and return it as a Python complex.
+ * `vap` is the owning array and may be NULL.
+ */
+static PyObject *
+CDOUBLE_getitem(void *input, void *vap)
+{
+    PyArrayObject *arr = vap;
+    char *src = input;
+    npy_double re, im;
+
+    if (arr != NULL && !PyArray_ISBEHAVED_RO(arr)) {
+        /* Misbehaved source: copy each component, swapping if required. */
+        int width = sizeof(npy_double);
+        npy_bool swapped = PyArray_ISBYTESWAPPED(arr);
+
+        copy_and_swap(&re, src, width, 1, 0, swapped);
+        copy_and_swap(&im, src + width, width, 1, 0, swapped);
+    }
+    else {
+        const npy_double *pair = (const npy_double *)src;
+
+        re = pair[0];
+        im = pair[1];
+    }
+    return PyComplex_FromDoubles((double)re, (double)im);
+}
+
+
+
+
+
+#line 445
+/*
+ * Convert `op` to an npy_cfloat and write it to the element at `ov`.
+ *
+ * Zero-dimensional arrays are unwrapped and retried, CFloat scalars are
+ * copied directly, None becomes (nan, nan), bytes/str are parsed with the
+ * Python `complex` constructor, and anything else goes through
+ * PyComplex_AsCComplex.  `vap` is the owning array and may be NULL.
+ * Returns 0 on success and -1 on failure.
+ */
+NPY_NO_EXPORT int
+CFLOAT_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    Py_complex parsed;
+    npy_cfloat value;
+
+    if (PyArray_IsZeroDim(op)) {
+        return convert_to_scalar_and_retry(op, ov, vap, CFLOAT_setitem);
+    }
+
+    if (PyArray_IsScalar(op, CFloat)) {
+        value = PyArrayScalar_VAL(op, CFloat);
+    }
+    else {
+        if (op == Py_None) {
+            parsed.real = NPY_NAN;
+            parsed.imag = NPY_NAN;
+        }
+        else if (PyBytes_Check(op) || PyUnicode_Check(op)) {
+            /*
+             * PyComplex_AsCComplex does not accept strings, so the text is
+             * handed to the `complex` constructor instead.
+             */
+            PyObject *text, *ctor_args, *as_complex;
+
+            if (PyBytes_Check(op)) {
+                /* The constructor wants unicode, so decode bytes first. */
+                text = PyUnicode_FromEncodedObject(op, NULL, NULL);
+                if (text == NULL) {
+                    return -1;
+                }
+            }
+            else {
+                text = op;
+                Py_INCREF(text);
+            }
+            ctor_args = PyTuple_Pack(1, text);
+            Py_DECREF(text);
+            if (ctor_args == NULL) {
+                return -1;
+            }
+            as_complex = PyComplex_Type.tp_new(&PyComplex_Type, ctor_args,
+                                               NULL);
+            Py_DECREF(ctor_args);
+            if (as_complex == NULL) {
+                return -1;
+            }
+            parsed = PyComplex_AsCComplex(as_complex);
+            Py_DECREF(as_complex);
+            if (error_converting(parsed.real)) {
+                return -1;
+            }
+        }
+        else {
+            parsed = PyComplex_AsCComplex(op);
+            if (error_converting(parsed.real)) {
+                return -1;
+            }
+        }
+        value.real = (npy_float) parsed.real;
+        value.imag = (npy_float) parsed.imag;
+
+#if NPY_SIZEOF_CFLOAT < NPY_SIZEOF_CDOUBLE  /* really just float... */
+        /* Narrowing from double may have overflowed a finite value to inf */
+        if (NPY_UNLIKELY((npy_isinf(value.real) && !npy_isinf(parsed.real)) ||
+                         (npy_isinf(value.imag) && !npy_isinf(parsed.imag)))) {
+            if (PyUFunc_GiveFloatingpointErrors("cast", NPY_FPE_OVERFLOW) < 0) {
+                return -1;
+            }
+        }
+#endif
+    }
+
+    /* memcpy makes the store safe for unaligned destinations. */
+    memcpy(ov, &value, NPY_SIZEOF_CFLOAT);
+    if (arr != NULL && PyArray_ISBYTESWAPPED(arr)) {
+        byte_swap_vector(ov, 2, sizeof(npy_float));
+    }
+    return 0;
+}
+
+
+#line 445
+/*
+ * Convert `op` to an npy_cdouble and write it to the element at `ov`.
+ *
+ * Zero-dimensional arrays are unwrapped and retried, CDouble scalars are
+ * copied directly, None becomes (nan, nan), bytes/str are parsed with the
+ * Python `complex` constructor, and anything else goes through
+ * PyComplex_AsCComplex.  `vap` is the owning array and may be NULL.
+ * Returns 0 on success and -1 on failure.
+ */
+NPY_NO_EXPORT int
+CDOUBLE_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    Py_complex parsed;
+    npy_cdouble value;
+
+    if (PyArray_IsZeroDim(op)) {
+        return convert_to_scalar_and_retry(op, ov, vap, CDOUBLE_setitem);
+    }
+
+    if (PyArray_IsScalar(op, CDouble)) {
+        value = PyArrayScalar_VAL(op, CDouble);
+    }
+    else {
+        if (op == Py_None) {
+            parsed.real = NPY_NAN;
+            parsed.imag = NPY_NAN;
+        }
+        else if (PyBytes_Check(op) || PyUnicode_Check(op)) {
+            /*
+             * PyComplex_AsCComplex does not accept strings, so the text is
+             * handed to the `complex` constructor instead.
+             */
+            PyObject *text, *ctor_args, *as_complex;
+
+            if (PyBytes_Check(op)) {
+                /* The constructor wants unicode, so decode bytes first. */
+                text = PyUnicode_FromEncodedObject(op, NULL, NULL);
+                if (text == NULL) {
+                    return -1;
+                }
+            }
+            else {
+                text = op;
+                Py_INCREF(text);
+            }
+            ctor_args = PyTuple_Pack(1, text);
+            Py_DECREF(text);
+            if (ctor_args == NULL) {
+                return -1;
+            }
+            as_complex = PyComplex_Type.tp_new(&PyComplex_Type, ctor_args,
+                                               NULL);
+            Py_DECREF(ctor_args);
+            if (as_complex == NULL) {
+                return -1;
+            }
+            parsed = PyComplex_AsCComplex(as_complex);
+            Py_DECREF(as_complex);
+            if (error_converting(parsed.real)) {
+                return -1;
+            }
+        }
+        else {
+            parsed = PyComplex_AsCComplex(op);
+            if (error_converting(parsed.real)) {
+                return -1;
+            }
+        }
+        value.real = (npy_double) parsed.real;
+        value.imag = (npy_double) parsed.imag;
+
+        /*
+         * No narrowing-overflow check here: for this type the guard compares
+         * NPY_SIZEOF_CDOUBLE with itself, so that check is never compiled in.
+         */
+    }
+
+    /* memcpy makes the store safe for unaligned destinations. */
+    memcpy(ov, &value, NPY_SIZEOF_CDOUBLE);
+    if (arr != NULL && PyArray_ISBYTESWAPPED(arr)) {
+        byte_swap_vector(ov, 2, sizeof(npy_double));
+    }
+    return 0;
+}
+
+
+#line 445
+/*
+ * Convert `op` to an npy_clongdouble and write it to the element at `ov`.
+ *
+ * Zero-dimensional arrays are unwrapped and retried, CLongDouble scalars are
+ * copied directly, None becomes (nan, nan), bytes/str are parsed with the
+ * Python `complex` constructor, and anything else goes through
+ * PyComplex_AsCComplex.  Every non-scalar input passes through a Py_complex
+ * before being widened.  `vap` is the owning array and may be NULL.
+ * Returns 0 on success and -1 on failure.
+ */
+NPY_NO_EXPORT int
+CLONGDOUBLE_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    Py_complex parsed;
+    npy_clongdouble value;
+
+    if (PyArray_IsZeroDim(op)) {
+        return convert_to_scalar_and_retry(op, ov, vap, CLONGDOUBLE_setitem);
+    }
+
+    if (PyArray_IsScalar(op, CLongDouble)) {
+        value = PyArrayScalar_VAL(op, CLongDouble);
+    }
+    else {
+        if (op == Py_None) {
+            parsed.real = NPY_NAN;
+            parsed.imag = NPY_NAN;
+        }
+        else if (PyBytes_Check(op) || PyUnicode_Check(op)) {
+            /*
+             * PyComplex_AsCComplex does not accept strings, so the text is
+             * handed to the `complex` constructor instead.
+             */
+            PyObject *text, *ctor_args, *as_complex;
+
+            if (PyBytes_Check(op)) {
+                /* The constructor wants unicode, so decode bytes first. */
+                text = PyUnicode_FromEncodedObject(op, NULL, NULL);
+                if (text == NULL) {
+                    return -1;
+                }
+            }
+            else {
+                text = op;
+                Py_INCREF(text);
+            }
+            ctor_args = PyTuple_Pack(1, text);
+            Py_DECREF(text);
+            if (ctor_args == NULL) {
+                return -1;
+            }
+            as_complex = PyComplex_Type.tp_new(&PyComplex_Type, ctor_args,
+                                               NULL);
+            Py_DECREF(ctor_args);
+            if (as_complex == NULL) {
+                return -1;
+            }
+            parsed = PyComplex_AsCComplex(as_complex);
+            Py_DECREF(as_complex);
+            if (error_converting(parsed.real)) {
+                return -1;
+            }
+        }
+        else {
+            parsed = PyComplex_AsCComplex(op);
+            if (error_converting(parsed.real)) {
+                return -1;
+            }
+        }
+        value.real = (npy_longdouble) parsed.real;
+        value.imag = (npy_longdouble) parsed.imag;
+
+#if NPY_SIZEOF_CLONGDOUBLE < NPY_SIZEOF_CDOUBLE  /* really just float... */
+        /* Narrowing from double may have overflowed a finite value to inf */
+        if (NPY_UNLIKELY((npy_isinf(value.real) && !npy_isinf(parsed.real)) ||
+                         (npy_isinf(value.imag) && !npy_isinf(parsed.imag)))) {
+            if (PyUFunc_GiveFloatingpointErrors("cast", NPY_FPE_OVERFLOW) < 0) {
+                return -1;
+            }
+        }
+#endif
+    }
+
+    /* memcpy makes the store safe for unaligned destinations. */
+    memcpy(ov, &value, NPY_SIZEOF_CLONGDOUBLE);
+    if (arr != NULL && PyArray_ISBYTESWAPPED(arr)) {
+        byte_swap_vector(ov, 2, sizeof(npy_longdouble));
+    }
+    return 0;
+}
+
+
+
+/*
+ * Convert a Python object to npy_longdouble.
+ *
+ * Python ints (but not bools) are converted directly so that no precision
+ * or range is lost through a double.  str and bytes are parsed with
+ * NumPyOS_ascii_strtold.  Anything else falls back to MyPyFloat_AsDouble.
+ *
+ * On a parse failure the function returns 0 with a Python error set, so
+ * callers have to check PyErr_Occurred().
+ */
+static inline npy_longdouble
+string_to_long_double(PyObject*op)
+{
+    PyObject *bytes;
+    char *text;
+    char *stop;
+    npy_longdouble parsed;
+    int parse_errno;
+
+    if (PyLong_Check(op) && !PyBool_Check(op)) {
+        return npy_longdouble_from_PyLong(op);
+    }
+
+    /* Obtain an owned reference to a bytes-like candidate for parsing. */
+    if (!PyUnicode_Check(op)) {
+        bytes = op;
+        Py_XINCREF(bytes);
+    }
+    else {
+        bytes = PyUnicode_AsUTF8String(op);
+        if (bytes == NULL) {
+            return 0;
+        }
+    }
+
+    text = PyBytes_AsString(bytes);
+    if (text == NULL) {
+        /* Probably not a string at all: go via a Python double instead. */
+        PyErr_Clear();
+        Py_XDECREF(bytes);
+        return (npy_longdouble) MyPyFloat_AsDouble(op);
+    }
+
+    errno = 0;
+    parsed = NumPyOS_ascii_strtold(text, &stop);
+    parse_errno = errno;
+
+    if (parse_errno == ERANGE) {
+        /* Out of range only warns; strtold already returned a signed inf. */
+        if (PyErr_Warn(PyExc_RuntimeWarning,
+                "overflow encountered in conversion from string") < 0) {
+            goto fail;
+        }
+    }
+    else if (parse_errno) {
+        PyErr_Format(PyExc_ValueError,
+                     "invalid literal for long double: %s (%s)",
+                     text,
+                     strerror(parse_errno));
+        goto fail;
+    }
+
+    /* Reject input where nothing was parsed or trailing characters remain. */
+    if (stop == text || *stop) {
+        PyErr_Format(PyExc_ValueError,
+                     "invalid literal for long double: %s",
+                     text);
+        goto fail;
+    }
+
+    Py_XDECREF(bytes);
+    return parsed;
+
+fail:
+    Py_XDECREF(bytes);
+    return 0;
+}
+
+/*
+ * These return array scalars, which is different from the other data-types.
+ */
+
+/* Wrap the element at `ip` in an array scalar built from the dtype of `ap`. */
+static PyObject *
+LONGDOUBLE_getitem(void *ip, void *ap)
+{
+    PyArrayObject *arr = (PyArrayObject *)ap;
+    PyArray_Descr *dtype = PyArray_DESCR(arr);
+
+    return PyArray_Scalar(ip, dtype, NULL);
+}
+
+/*
+ * Convert `op` to an npy_longdouble and write it to the element at `ov`.
+ *
+ * Zero-dimensional arrays are unwrapped and retried, LongDouble scalars are
+ * copied directly, and everything else goes through string_to_long_double.
+ * `vap` is the owning array and may be NULL.
+ * Returns 0 on success and -1 on failure.
+ */
+NPY_NO_EXPORT int
+LONGDOUBLE_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    npy_longdouble value;  /* local staging copy, guarantees alignment */
+
+    if (PyArray_IsZeroDim(op)) {
+        return convert_to_scalar_and_retry(op, ov, vap, LONGDOUBLE_setitem);
+    }
+
+    if (!PyArray_IsScalar(op, LongDouble)) {
+        /* Bail out if the scalar check itself left an error behind. */
+        if (PyErr_Occurred()) {
+            return -1;
+        }
+        value = string_to_long_double(op);
+    }
+    else {
+        value = PyArrayScalar_VAL(op, LongDouble);
+    }
+
+    /* string_to_long_double reports failure only through the error state. */
+    if (PyErr_Occurred()) {
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        copy_and_swap(ov, &value, PyArray_DESCR(arr)->elsize, 1, 0,
+                      PyArray_ISBYTESWAPPED(arr));
+    }
+    else {
+        *(npy_longdouble *)ov = value;
+    }
+    return 0;
+}
+
+/* Wrap the element at `ip` in an array scalar built from the dtype of `ap`. */
+static PyObject *
+CLONGDOUBLE_getitem(void *ip, void *ap)
+{
+    PyArrayObject *arr = (PyArrayObject *)ap;
+    PyArray_Descr *dtype = PyArray_DESCR(arr);
+
+    return PyArray_Scalar(ip, dtype, NULL);
+}
+
+/* UNICODE */
+/*
+ * Build a Python str from the UCS4 element at `ip`.  The item size, the
+ * byte-swap flag and the "needs alignment handling" flag are all taken from
+ * the array `vap` and forwarded to PyUnicode_FromUCS4.
+ */
+static PyObject *
+UNICODE_getitem(void *ip, void *vap)
+{
+    PyArrayObject *arr = vap;
+    Py_ssize_t nbytes = PyArray_ITEMSIZE(arr);
+    int swapped = PyArray_ISBYTESWAPPED(arr);
+    int unaligned = !PyArray_ISALIGNED(arr);
+    PyObject *result = (PyObject *)PyUnicode_FromUCS4(ip, nbytes, swapped,
+                                                      unaligned);
+
+    return result;
+}
+
+/*
+ * Store `op` as a fixed-width UCS4 string in the element at `ov`.
+ *
+ * Zero-dimensional arrays are unwrapped and retried; non-string sequences
+ * are rejected with ValueError.  bytes are decoded as strict ASCII, any
+ * other object is converted with str().  Text longer than the element is
+ * truncated, shorter text is zero padded, and the code points are byte
+ * swapped when the array is byte-swapped.
+ *
+ * Returns 0 on success and -1 with a Python error set on failure.
+ */
+static int
+UNICODE_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *ap = vap;
+
+    if (PyArray_IsZeroDim(op)) {
+        return convert_to_scalar_and_retry(op, ov, vap, UNICODE_setitem);
+    }
+
+    if (PySequence_NoString_Check(op)) {
+        PyErr_SetString(PyExc_ValueError,
+                "setting an array element with a sequence");
+        return -1;
+    }
+
+    PyObject *temp;
+    if (PyBytes_Check(op)) {
+        /* Try to decode from ASCII */
+        temp = PyUnicode_FromEncodedObject(op, "ASCII", "strict");
+        if (temp == NULL) {
+            return -1;
+        }
+    }
+    else if ((temp=PyObject_Str(op)) == NULL) {
+        return -1;
+    }
+
+    /* truncate if needed; each stored code point occupies 4 bytes */
+    Py_ssize_t max_len = PyArray_DESCR(ap)->elsize >> 2;
+    Py_ssize_t actual_len = PyUnicode_GetLength(temp);
+    if (actual_len < 0) {
+        Py_DECREF(temp);
+        return -1;
+    }
+    if (actual_len > max_len) {
+        Py_SETREF(temp, PyUnicode_Substring(temp, 0, max_len));
+        if (temp == NULL) {
+            return -1;
+        }
+        actual_len = max_len;
+    }
+
+    Py_ssize_t num_bytes = actual_len * 4;
+
+    /*
+     * For unaligned arrays the code points are first written to a scratch
+     * buffer and then memcpy'd into place; otherwise they are written
+     * straight into the element.
+     */
+    int use_scratch = !PyArray_ISALIGNED(ap);
+    char *buffer;
+    if (use_scratch) {
+        buffer = PyArray_malloc(num_bytes);
+        if (buffer == NULL) {
+            Py_DECREF(temp);
+            PyErr_NoMemory();
+            return -1;
+        }
+    }
+    else {
+        buffer = ov;
+    }
+    if (PyUnicode_AsUCS4(temp, (Py_UCS4 *)buffer, actual_len, 0) == NULL) {
+        /*
+         * Only the scratch buffer is ours to free.  In the aligned case
+         * `buffer` aliases `ov`, which is the array's own storage.
+         */
+        if (use_scratch) {
+            PyArray_free(buffer);
+        }
+        Py_DECREF(temp);
+        return -1;
+    }
+
+    if (use_scratch) {
+        memcpy(ov, buffer, num_bytes);
+        PyArray_free(buffer);
+    }
+
+    /* Fill in the rest of the space with 0 */
+    if (PyArray_DESCR(ap)->elsize > num_bytes) {
+        memset((char*)ov + num_bytes, 0, (PyArray_DESCR(ap)->elsize - num_bytes));
+    }
+    if (PyArray_ISBYTESWAPPED(ap)) {
+        byte_swap_vector(ov, actual_len, 4);
+    }
+    Py_DECREF(temp);
+    return 0;
+}
+
+/* STRING
+ *
+ * can handle both NULL-terminated and not NULL-terminated cases
+ * will truncate all ending NULLs in returned string.
+ */
+/*
+ * Return the element at `ip` as a Python bytes object with all trailing
+ * NUL bytes removed.  The element width comes from the dtype of `vap`.
+ */
+static PyObject *
+STRING_getitem(void *ip, void *vap)
+{
+    PyArrayObject *arr = vap;
+    const char *data = ip;
+    int length = PyArray_DESCR(arr)->elsize;
+
+    /* Walk back from the end while the last remaining byte is a NUL. */
+    while (length > 0 && data[length - 1] == '\0') {
+        length--;
+    }
+    return PyBytes_FromStringAndSize(ip, length);
+}
+
+/*
+ * Store `op` as a fixed-width byte string in the element at `ov`.
+ *
+ * Zero-dimensional arrays are unwrapped and retried; non-string sequences
+ * are rejected with ValueError.  str is encoded as ASCII, bytes and
+ * memoryview objects are converted with PyObject_Bytes, and any other
+ * object is converted with str() and then encoded as ASCII.  Data longer
+ * than the element is truncated; shorter data is padded with NUL bytes.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+STRING_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    PyObject *encoded = NULL;
+    char *src;
+    Py_ssize_t src_len;
+
+    if (PyArray_IsZeroDim(op)) {
+        return convert_to_scalar_and_retry(op, ov, vap, STRING_setitem);
+    }
+
+    if (PySequence_NoString_Check(op)) {
+        PyErr_SetString(PyExc_ValueError,
+                "setting an array element with a sequence");
+        return -1;
+    }
+
+    if (PyUnicode_Check(op)) {
+        /* Assume the ASCII codec, as Python 2 did. */
+        encoded = PyUnicode_AsASCIIString(op);
+    }
+    else if (PyBytes_Check(op) || PyMemoryView_Check(op)) {
+        encoded = PyObject_Bytes(op);
+    }
+    else {
+        /* Mimic Python 2 casting: stringify first, then encode as ASCII. */
+        PyObject *as_text = PyObject_Str(op);
+
+        if (as_text == NULL) {
+            return -1;
+        }
+        encoded = PyUnicode_AsASCIIString(as_text);
+        Py_DECREF(as_text);
+    }
+    if (encoded == NULL) {
+        return -1;
+    }
+
+    if (PyBytes_AsStringAndSize(encoded, &src, &src_len) < 0) {
+        Py_DECREF(encoded);
+        return -1;
+    }
+
+    memcpy(ov, src, PyArray_MIN(PyArray_DESCR(arr)->elsize, src_len));
+    /* Pad with NUL bytes when the data does not fill the whole element. */
+    if (PyArray_DESCR(arr)->elsize > src_len) {
+        memset((char *)ov + src_len, 0,
+               (PyArray_DESCR(arr)->elsize - src_len));
+    }
+    Py_DECREF(encoded);
+    return 0;
+}
+
+/* OBJECT */
+
+#define NPY__ALIGNED(obj, sz) ((((size_t) obj) % (sz))==0)
+
+/*
+ * Return a new reference to the object whose pointer is stored at `ip`.
+ * A stored NULL pointer is reported as None.
+ */
+static PyObject *
+OBJECT_getitem(void *ip, void *NPY_UNUSED(ap))
+{
+    PyObject *stored;
+
+    /* memcpy so that the pointer may be read from unaligned storage. */
+    memcpy(&stored, ip, sizeof(stored));
+    if (stored != NULL) {
+        Py_INCREF(stored);
+        return stored;
+    }
+    /* We support NULL, but still try to guarantee this never happens! */
+    Py_RETURN_NONE;
+}
+
+
+/*
+ * Replace the object pointer stored at `ov` with `op`: take a reference to
+ * `op`, release the reference to the previous occupant (which may be NULL),
+ * then write the new pointer.  Returns -1 if a Python error is pending
+ * afterwards, 0 otherwise.
+ */
+static int
+OBJECT_setitem(PyObject *op, void *ov, void *NPY_UNUSED(ap))
+{
+    PyObject *previous;
+
+    memcpy(&previous, ov, sizeof(previous));
+
+    Py_INCREF(op);
+    /* A newly created array/buffer may only be NULLed, so XDECREF */
+    Py_XDECREF(previous);
+
+    memcpy(ov, &op, sizeof(op));
+
+    if (PyErr_Occurred()) {
+        return -1;
+    }
+    return 0;
+}
+
+
+/* VOID */
+
+/*
+ * Return the void element at `input` as a Python object.
+ *
+ *  - Structured dtype (has fields): a tuple with one entry per field, each
+ *    obtained with the field dtype's getitem through a stack dummy array.
+ *  - Subarray dtype: an ndarray of the base dtype viewing the element data,
+ *    or a copy of it when no valid base array can be found.
+ *  - Plain void: a bytes object of `elsize` bytes.
+ *
+ * Returns a new reference, or NULL with a Python error set on failure.
+ */
+static PyObject *
+VOID_getitem(void *input, void *vap)
+{
+    PyArrayObject *ap = vap;
+    char *ip = input;
+    PyArray_Descr* descr = PyArray_DESCR(ap);
+
+    if (PyDataType_HASFIELDS(descr)) {
+        PyObject *key;
+        PyObject *names;
+        int i, n;
+        PyObject *ret;
+        PyObject *tup;
+        PyArrayObject_fields dummy_fields = get_dummy_stack_array(ap);
+        PyArrayObject *dummy_arr = (PyArrayObject *)&dummy_fields;
+
+        /* get the names from the fields dictionary*/
+        names = descr->names;
+        n = PyTuple_GET_SIZE(names);
+        ret = PyTuple_New(n);
+        if (ret == NULL) {
+            return NULL;
+        }
+        for (i = 0; i < n; i++) {
+            npy_intp offset;
+            PyArray_Descr *new;
+            PyObject *item;
+
+            key = PyTuple_GET_ITEM(names, i);
+            tup = PyDict_GetItem(descr->fields, key);
+            if (_unpack_field(tup, &new, &offset) < 0) {
+                Py_DECREF(ret);
+                return NULL;
+            }
+            dummy_fields.descr = new;
+            /* update alignment based on offset */
+            if ((new->alignment > 1)
+                    && ((((npy_intp)(ip+offset)) % new->alignment) != 0)) {
+                PyArray_CLEARFLAGS(dummy_arr, NPY_ARRAY_ALIGNED);
+            }
+            else {
+                PyArray_ENABLEFLAGS(dummy_arr, NPY_ARRAY_ALIGNED);
+            }
+            item = PyArray_GETITEM(dummy_arr, ip+offset);
+            if (item == NULL) {
+                /* Never hand back a tuple that contains a NULL slot. */
+                Py_DECREF(ret);
+                return NULL;
+            }
+            PyTuple_SET_ITEM(ret, i, item);
+        }
+        return ret;
+    }
+
+    if (descr->subarray) {
+        /* return an array of the basic type */
+        PyArray_Dims shape = {NULL, -1};
+        PyArrayObject *ret;
+
+        if (!(PyArray_IntpConverter(descr->subarray->shape, &shape))) {
+            npy_free_cache_dim_obj(shape);
+            PyErr_SetString(PyExc_ValueError,
+                    "invalid shape in fixed-type tuple.");
+            return NULL;
+        }
+        Py_INCREF(descr->subarray->base);
+
+        /*
+         * NOTE: There is the possibility of recursive calls from the above
+         *       field branch. These calls use a dummy arr for thread
+         *       (and general) safety. However, we must set the base array,
+         *       so if such a dummy array was passed (its type is NULL),
+         *       we have to walk its base until the initial array is found.
+         *
+         * TODO: This should be fixed, the next "generation" of GETITEM will
+         *       probably need to pass in the original array (in addition
+         *       to the dtype as a method). Alternatively, VOID dtypes
+         *       could have special handling.
+         */
+        PyObject *base = (PyObject *)ap;
+        while (base != NULL && Py_TYPE(base) == NULL) {
+            base = PyArray_BASE((PyArrayObject *)base);
+        }
+        ret = (PyArrayObject *)PyArray_NewFromDescrAndBase(
+                &PyArray_Type, descr->subarray->base,
+                shape.len, shape.ptr, NULL, ip,
+                PyArray_FLAGS(ap) & ~NPY_ARRAY_F_CONTIGUOUS,
+                NULL, base);
+        if (ret != NULL && base == NULL) {
+            /*
+             * Need to create a copy, or we may point to wrong data.  This path
+             * is taken when no "valid" array is passed.  This happens for
+             * casts.  If the copy fails, `ret` becomes NULL and the error is
+             * propagated to the caller.
+             */
+            PyObject *copy = PyArray_FromArray(ret, NULL, NPY_ARRAY_ENSURECOPY);
+            Py_SETREF(ret, (PyArrayObject *)copy);
+        }
+        npy_free_cache_dim_obj(shape);
+        return (PyObject *)ret;
+    }
+
+    return PyBytes_FromStringAndSize(ip, descr->elsize);
+}
+
+
+NPY_NO_EXPORT int PyArray_CopyObject(PyArrayObject *, PyObject *);
+
+/*
+ * Point the array `arr` at the i-th field of the structured dtype `descr`.
+ *
+ * The dtype of `arr` is replaced by the field's dtype, its ALIGNED flag is
+ * recomputed for the address `dstdata + offset`, and the field's byte offset
+ * is returned through `offset_p`.  This prepares `arr` for per-field
+ * copyswap/setitem calls, as done in VOID_setitem.  Compare to the inner
+ * loops in VOID_getitem and VOID_nonzero.
+ *
+ * Returns 0 on success and -1 if the field entry cannot be unpacked.
+ *
+ * WARNING: Clobbers arr's dtype and alignment flag, should not be used
+ *          on the original array!
+ */
+NPY_NO_EXPORT int
+_setup_field(int i, PyArray_Descr *descr, PyArrayObject *arr,
+            npy_intp *offset_p, char *dstdata)
+{
+    PyObject *field_name = PyTuple_GET_ITEM(descr->names, i);
+    PyObject *field_info = PyDict_GetItem(descr->fields, field_name);
+    PyArray_Descr *field_descr;
+    npy_intp field_offset;
+
+    if (_unpack_field(field_info, &field_descr, &field_offset) < 0) {
+        return -1;
+    }
+
+    ((PyArrayObject_fields *)(arr))->descr = field_descr;
+
+    /* A field is aligned if it needs no alignment or its address fits. */
+    if ((field_descr->alignment <= 1) ||
+                ((((uintptr_t)dstdata + field_offset)
+                        % field_descr->alignment) == 0)) {
+        PyArray_ENABLEFLAGS(arr, NPY_ARRAY_ALIGNED);
+    }
+    else {
+        PyArray_CLEARFLAGS(arr, NPY_ARRAY_ALIGNED);
+    }
+
+    *offset_p = field_offset;
+    return 0;
+}
+
+/*
+ * Helper function for VOID_setitem: copy one structured element from
+ * `srcdata` (described by `srcdescr`) to `dstdata` (described by
+ * `dstdescr`).
+ *
+ * When the two dtypes are equivalent, each field is copied with its own
+ * copyswap function; otherwise the generic raw-array casting code is used.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+static int
+_copy_and_return_void_setitem(PyArray_Descr *dstdescr, char *dstdata,
+                              PyArray_Descr *srcdescr, char *srcdata){
+    PyArrayObject_fields dummy_struct;
+    PyArrayObject *dummy_arr = (PyArrayObject *)&dummy_struct;
+    npy_int names_size = PyTuple_GET_SIZE(dstdescr->names);
+    npy_intp offset;
+    npy_int i;
+    int ret;
+
+    /*
+     * Start from an all-zero dummy array.  _setup_field() does a
+     * read-modify-write on the flags, so they must not be left
+     * indeterminate.  A zeroed struct also has a NULL type, which is how
+     * VOID_getitem recognises stack dummy arrays.
+     */
+    memset(&dummy_struct, 0, sizeof(dummy_struct));
+
+    /* Fast path if dtypes are equal */
+    if (PyArray_EquivTypes(srcdescr, dstdescr)) {
+        for (i = 0; i < names_size; i++) {
+            /* neither line can ever fail, in principle */
+            if (_setup_field(i, dstdescr, dummy_arr, &offset, dstdata)) {
+                return -1;
+            }
+            PyArray_DESCR(dummy_arr)->f->copyswap(dstdata + offset,
+                    srcdata + offset, 0, dummy_arr);
+        }
+        return 0;
+    }
+
+    /* Slow path */
+    ret = PyArray_CastRawArrays(1, srcdata, dstdata, 0, 0,
+                                srcdescr, dstdescr, 0);
+    if (ret != NPY_SUCCEED) {
+        return -1;
+    }
+    return 0;
+}
+
+static int
+VOID_setitem(PyObject *op, void *input, void *vap)
+{
+    char *ip = input;
+    PyArrayObject *ap = vap;
+    int itemsize = PyArray_DESCR(ap)->elsize;
+    int res;
+    PyArray_Descr *descr = PyArray_DESCR(ap);
+
+    if (PyDataType_HASFIELDS(descr)) {
+        PyObject *errmsg;
+        npy_int i;
+        npy_intp offset;
+        int failed = 0;
+
+        /* If op is 0d-ndarray or numpy scalar, directly get dtype & data ptr */
+        if (PyArray_Check(op)) {
+            PyArrayObject *oparr = (PyArrayObject *)op;
+            if (PyArray_SIZE(oparr) != 1) {
+                PyErr_SetString(PyExc_ValueError,
+                        "setting an array element with a sequence.");
+                return -1;
+            }
+            return _copy_and_return_void_setitem(descr, ip,
+                                    PyArray_DESCR(oparr), PyArray_DATA(oparr));
+        }
+        else if (PyArray_IsScalar(op, Void)) {
+            PyArray_Descr *srcdescr = ((PyVoidScalarObject *)op)->descr;
+            char *srcdata = ((PyVoidScalarObject *)op)->obval;
+            return _copy_and_return_void_setitem(descr, ip, srcdescr, srcdata);
+        }
+        else if (PyTuple_Check(op)) {
+            /* if it's a tuple, copy field-by-field to ap, */
+            npy_intp names_size = PyTuple_GET_SIZE(descr->names);
+
+            if (names_size != PyTuple_Size(op)) {
+                errmsg = PyUnicode_FromFormat(
+                        "could not assign tuple of length %zd to structure "
+                        "with %" NPY_INTP_FMT " fields.",
+                        PyTuple_Size(op), names_size);
+                PyErr_SetObject(PyExc_ValueError, errmsg);
+                Py_DECREF(errmsg);
+                return -1;
+            }
+
+            PyArrayObject_fields dummy_fields = get_dummy_stack_array(ap);
+            PyArrayObject *dummy_arr = (PyArrayObject *)&dummy_fields;
+
+            for (i = 0; i < names_size; i++) {
+                PyObject *item;
+
+                if (_setup_field(i, descr, dummy_arr, &offset, ip) == -1) {
+                    failed = 1;
+                    break;
+                }
+                item = PyTuple_GetItem(op, i);
+                if (item == NULL) {
+                    failed = 1;
+                    break;
+                }
+                /* use setitem to set this field */
+                if (PyArray_SETITEM(dummy_arr, ip + offset, item) < 0) {
+                    failed = 1;
+                    break;
+                }
+            }
+        }
+        else {
+            /* Otherwise must be non-void scalar. Try to assign to each field */
+            npy_intp names_size = PyTuple_GET_SIZE(descr->names);
+
+            PyArrayObject_fields dummy_fields = get_dummy_stack_array(ap);
+            PyArrayObject *dummy_arr = (PyArrayObject *)&dummy_fields;
+
+            for (i = 0; i < names_size; i++) {
+                /* temporarily make ap have only this field */
+                if (_setup_field(i, descr, dummy_arr, &offset, ip) == -1) {
+                    failed = 1;
+                    break;
+                }
+                /* use setitem to set this field */
+                if (PyArray_SETITEM(dummy_arr, ip + offset, op) < 0) {
+                    failed = 1;
+                    break;
+                }
+            }
+        }
+
+        if (failed) {
+            return -1;
+        }
+        return 0;
+    }
+    else if (PyDataType_HASSUBARRAY(descr)) {
+        /* copy into an array of the same basic type */
+        PyArray_Dims shape = {NULL, -1};
+        if (!(PyArray_IntpConverter(descr->subarray->shape, &shape))) {
+            npy_free_cache_dim_obj(shape);
+            PyErr_SetString(PyExc_ValueError,
+                    "invalid shape in fixed-type tuple.");
+            return -1;
+        }
+        Py_INCREF(descr->subarray->base);
+        /*
+         * Note we set no base object here, as to not rely on the input
+         * being a valid object for base setting. `ret` nevertheless does
+         * not own its data; this is generally not good, but localized.
+         */
+        PyArrayObject *ret = (PyArrayObject *)PyArray_NewFromDescrAndBase(
+                &PyArray_Type, descr->subarray->base,
+                shape.len, shape.ptr, NULL, ip,
+                PyArray_FLAGS(ap), NULL, NULL);
+        npy_free_cache_dim_obj(shape);
+        if (!ret) {
+            return -1;
+        }
+        res = PyArray_CopyObject(ret, op);
+        Py_DECREF(ret);
+        return res;
+    }
+
+    /*
+     * Fall through case - non-structured void datatype. This is a very
+     * undiscerning case: It interprets any object as a buffer
+     * and reads as many bytes as possible, padding with 0.
+     */
+    {
+        Py_buffer view;
+
+        if (PyObject_GetBuffer(op, &view, PyBUF_SIMPLE) < 0) {
+            return -1;
+        }
+        memcpy(ip, view.buf, PyArray_MIN(view.len, itemsize));
+        if (itemsize > view.len) {
+            memset(ip + view.len, 0, itemsize - view.len);
+        }
+        PyBuffer_Release(&view);
+    }
+    return 0;
+}
+
+/*
+ * Read one npy_datetime from `ip` and convert it to a Python object using
+ * the datetime unit metadata of the array passed as `vap`.
+ *
+ * Returns the result of convert_datetime_to_pyobject(), or NULL when the
+ * metadata lookup fails.
+ *
+ * NOTE(review): the descriptor is fetched from the array before the
+ * `arr != NULL` test further down, so that test looks unreachable for a
+ * NULL array -- confirm whether NULL can ever be passed here.
+ */
+static PyObject *
+DATETIME_getitem(void *ip, void *vap)
+{
+    PyArrayObject *arr = vap;
+    PyArray_DatetimeMetaData *unit_meta =
+            get_datetime_metadata_from_dtype(PyArray_DESCR(arr));
+    npy_datetime value;
+
+    if (unit_meta == NULL) {
+        return NULL;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED_RO(arr)) {
+        /* not flagged as well-behaved: go through the dtype's copyswap */
+        PyArray_DESCR(arr)->f->copyswap(&value, ip,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+    }
+    else {
+        /* direct load from the item pointer */
+        value = *((npy_datetime *)ip);
+    }
+
+    return convert_datetime_to_pyobject(value, unit_meta);
+}
+
+
+/*
+ * Read one npy_timedelta from `ip` and convert it to a Python object using
+ * the datetime unit metadata of the array passed as `vap`.
+ *
+ * Returns the result of convert_timedelta_to_pyobject(), or NULL when the
+ * metadata lookup fails.
+ *
+ * NOTE(review): the descriptor is fetched from the array before the
+ * `arr != NULL` test further down, so that test looks unreachable for a
+ * NULL array -- confirm whether NULL can ever be passed here.
+ */
+static PyObject *
+TIMEDELTA_getitem(void *ip, void *vap)
+{
+    PyArrayObject *arr = vap;
+    PyArray_DatetimeMetaData *unit_meta =
+            get_datetime_metadata_from_dtype(PyArray_DESCR(arr));
+    npy_timedelta value;
+
+    if (unit_meta == NULL) {
+        return NULL;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED_RO(arr)) {
+        /* not flagged as well-behaved: go through the dtype's copyswap */
+        PyArray_DESCR(arr)->f->copyswap(&value, ip,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+    }
+    else {
+        /* direct load from the item pointer */
+        value = *((npy_timedelta *)ip);
+    }
+
+    return convert_timedelta_to_pyobject(value, unit_meta);
+}
+
+/*
+ * Convert the Python object `op` to an npy_datetime, using the datetime
+ * unit metadata of the array passed as `vap` and NPY_SAME_KIND_CASTING,
+ * and store the result at `ov`.
+ *
+ * Returns 0 on success and -1 when the metadata lookup or the conversion
+ * fails.
+ *
+ * NOTE(review): the descriptor is fetched from the array before the
+ * `arr != NULL` test further down, so that test looks unreachable for a
+ * NULL array -- confirm whether NULL can ever be passed here.
+ */
+static int
+DATETIME_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    PyArray_DatetimeMetaData *unit_meta =
+            get_datetime_metadata_from_dtype(PyArray_DESCR(arr));
+    /* the conversion writes into this aligned local first */
+    npy_datetime value = 0;
+
+    if (unit_meta == NULL) {
+        return -1;
+    }
+
+    if (convert_pyobject_to_datetime(unit_meta, op,
+                            NPY_SAME_KIND_CASTING, &value) < 0) {
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        /* not flagged as well-behaved: go through the dtype's copyswap */
+        PyArray_DESCR(arr)->f->copyswap(ov, &value,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+    }
+    else {
+        /* direct store to the output pointer */
+        *((npy_datetime *)ov) = value;
+    }
+
+    return 0;
+}
+
+/*
+ * Convert the Python object `op` to an npy_timedelta, using the datetime
+ * unit metadata of the array passed as `vap` and NPY_SAME_KIND_CASTING,
+ * and store the result at `ov`.
+ *
+ * Returns 0 on success and -1 when the metadata lookup or the conversion
+ * fails.
+ *
+ * NOTE(review): the descriptor is fetched from the array before the
+ * `arr != NULL` test further down, so that test looks unreachable for a
+ * NULL array -- confirm whether NULL can ever be passed here.
+ */
+static int
+TIMEDELTA_setitem(PyObject *op, void *ov, void *vap)
+{
+    PyArrayObject *arr = vap;
+    PyArray_DatetimeMetaData *unit_meta =
+            get_datetime_metadata_from_dtype(PyArray_DESCR(arr));
+    /* the conversion writes into this aligned local first */
+    npy_timedelta value = 0;
+
+    if (unit_meta == NULL) {
+        return -1;
+    }
+
+    /* Convert the object into a NumPy timedelta */
+    if (convert_pyobject_to_timedelta(unit_meta, op,
+                            NPY_SAME_KIND_CASTING, &value) < 0) {
+        return -1;
+    }
+
+    if (arr != NULL && !PyArray_ISBEHAVED(arr)) {
+        /* not flagged as well-behaved: go through the dtype's copyswap */
+        PyArray_DESCR(arr)->f->copyswap(ov, &value,
+                                        PyArray_ISBYTESWAPPED(arr), arr);
+    }
+    else {
+        /* direct store to the output pointer */
+        *((npy_timedelta *)ov) = value;
+    }
+
+    return 0;
+}
+
+
+/*
+ *****************************************************************************
+ **                       TYPE TO TYPE CONVERSIONS                          **
+ *****************************************************************************
+ */
+
+
+/* Assumes contiguous, and aligned, from and to */
+
+
+#line 1292
+
+#line 1304
+/*
+ * Copy `n` consecutive npy_byte items from `input` to `output`, passing
+ * each through a (same-type) C cast.  The array arguments `aip` and `aop`
+ * are unused.
+ */
+static void
+BYTE_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_byte *src = input;
+    npy_byte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_byte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_ubyte items at `input` to npy_byte items at
+ * `output` using a plain C cast (no range checking).  The array arguments
+ * `aip` and `aop` are unused.
+ */
+static void
+UBYTE_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ubyte *src = input;
+    npy_byte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_byte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_short items at `input` to npy_byte items at
+ * `output` using a plain C cast (no range checking).  The array arguments
+ * `aip` and `aop` are unused.
+ */
+static void
+SHORT_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_short *src = input;
+    npy_byte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_byte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_ushort items at `input` to npy_byte items at
+ * `output` using a plain C cast (no range checking).  The array arguments
+ * `aip` and `aop` are unused.
+ */
+static void
+USHORT_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ushort *src = input;
+    npy_byte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_byte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_int items at `input` to npy_byte items at
+ * `output` using a plain C cast (no range checking).  The array arguments
+ * `aip` and `aop` are unused.
+ */
+static void
+INT_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_int *src = input;
+    npy_byte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_byte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_uint items at `input` to npy_byte items at
+ * `output` using a plain C cast (no range checking).  The array arguments
+ * `aip` and `aop` are unused.
+ */
+static void
+UINT_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_uint *src = input;
+    npy_byte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_byte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_long items at `input` to npy_byte items at
+ * `output` using a plain C cast (no range checking).  The array arguments
+ * `aip` and `aop` are unused.
+ */
+static void
+LONG_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_long *src = input;
+    npy_byte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_byte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_ulong items at `input` to npy_byte items at
+ * `output` using a plain C cast (no range checking).  The array arguments
+ * `aip` and `aop` are unused.
+ */
+static void
+ULONG_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulong *src = input;
+    npy_byte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_byte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_longlong items at `input` to npy_byte items at
+ * `output` using a plain C cast (no range checking).  The array arguments
+ * `aip` and `aop` are unused.
+ */
+static void
+LONGLONG_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longlong *src = input;
+    npy_byte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_byte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_ulonglong items at `input` to npy_byte items
+ * at `output` using a plain C cast (no range checking).  The array
+ * arguments `aip` and `aop` are unused.
+ */
+static void
+ULONGLONG_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulonglong *src = input;
+    npy_byte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_byte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_float items at `input` to npy_byte items at
+ * `output` using a plain C cast (no NaN or range checking).  The array
+ * arguments `aip` and `aop` are unused.
+ */
+static void
+FLOAT_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_byte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_byte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_double items at `input` to npy_byte items at
+ * `output` using a plain C cast (no NaN or range checking).  The array
+ * arguments `aip` and `aop` are unused.
+ */
+static void
+DOUBLE_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_byte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_byte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_longdouble items at `input` to npy_byte items
+ * at `output` using a plain C cast (no NaN or range checking).  The array
+ * arguments `aip` and `aop` are unused.
+ */
+static void
+LONGDOUBLE_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_byte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_byte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_datetime items at `input` to npy_byte items at
+ * `output` using a plain C cast (no range checking).  The array arguments
+ * `aip` and `aop` are unused.
+ */
+static void
+DATETIME_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_datetime *src = input;
+    npy_byte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_byte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_timedelta items at `input` to npy_byte items
+ * at `output` using a plain C cast (no range checking).  The array
+ * arguments `aip` and `aop` are unused.
+ */
+static void
+TIMEDELTA_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_timedelta *src = input;
+    npy_byte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_byte)*src++;
+    }
+}
+
+
+#line 1339
+/*
+ * Cast `n` items at `input`, read as consecutive pairs of npy_float, to
+ * npy_byte items at `output`.  Only the first value of each pair
+ * (presumably the real part) is converted, with a plain C cast; the second
+ * is skipped.  The array arguments `aip` and `aop` are unused.
+ */
+static void
+CFLOAT_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_byte *dst = output;
+
+    /* advance two source values for every output element */
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_byte)*src;
+    }
+}
+
+#line 1339
+/*
+ * Cast `n` items at `input`, read as consecutive pairs of npy_double, to
+ * npy_byte items at `output`.  Only the first value of each pair
+ * (presumably the real part) is converted, with a plain C cast; the second
+ * is skipped.  The array arguments `aip` and `aop` are unused.
+ */
+static void
+CDOUBLE_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_byte *dst = output;
+
+    /* advance two source values for every output element */
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_byte)*src;
+    }
+}
+
+#line 1339
+/*
+ * Cast `n` items at `input`, read as consecutive pairs of npy_longdouble,
+ * to npy_byte items at `output`.  Only the first value of each pair
+ * (presumably the real part) is converted, with a plain C cast; the second
+ * is skipped.  The array arguments `aip` and `aop` are unused.
+ */
+static void
+CLONGDOUBLE_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_byte *dst = output;
+
+    /* advance two source values for every output element */
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_byte)*src;
+    }
+}
+
+
+
+#line 1292
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_byte items at `input` to npy_ubyte items at
+ * `output` using a plain C cast (no range checking).  The array arguments
+ * `aip` and `aop` are unused.
+ */
+static void
+BYTE_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_byte *src = input;
+    npy_ubyte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_ubyte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Copy `n` consecutive npy_ubyte items from `input` to `output`, passing
+ * each through a (same-type) C cast.  The array arguments `aip` and `aop`
+ * are unused.
+ */
+static void
+UBYTE_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ubyte *src = input;
+    npy_ubyte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_ubyte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_short items at `input` to npy_ubyte items at
+ * `output` using a plain C cast (no range checking).  The array arguments
+ * `aip` and `aop` are unused.
+ */
+static void
+SHORT_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_short *src = input;
+    npy_ubyte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_ubyte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_ushort items at `input` to npy_ubyte items at
+ * `output` using a plain C cast (no range checking).  The array arguments
+ * `aip` and `aop` are unused.
+ */
+static void
+USHORT_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ushort *src = input;
+    npy_ubyte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_ubyte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_int items at `input` to npy_ubyte items at
+ * `output` using a plain C cast (no range checking).  The array arguments
+ * `aip` and `aop` are unused.
+ */
+static void
+INT_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_int *src = input;
+    npy_ubyte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_ubyte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_uint items at `input` to npy_ubyte items at
+ * `output` using a plain C cast (no range checking).  The array arguments
+ * `aip` and `aop` are unused.
+ */
+static void
+UINT_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_uint *src = input;
+    npy_ubyte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_ubyte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_long items at `input` to npy_ubyte items at
+ * `output` using a plain C cast (no range checking).  The array arguments
+ * `aip` and `aop` are unused.
+ */
+static void
+LONG_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_long *src = input;
+    npy_ubyte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_ubyte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_ulong items at `input` to npy_ubyte items at
+ * `output` using a plain C cast (no range checking).  The array arguments
+ * `aip` and `aop` are unused.
+ */
+static void
+ULONG_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulong *src = input;
+    npy_ubyte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_ubyte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_longlong items at `input` to npy_ubyte items
+ * at `output` using a plain C cast (no range checking).  The array
+ * arguments `aip` and `aop` are unused.
+ */
+static void
+LONGLONG_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longlong *src = input;
+    npy_ubyte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_ubyte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_ulonglong items at `input` to npy_ubyte items
+ * at `output` using a plain C cast (no range checking).  The array
+ * arguments `aip` and `aop` are unused.
+ */
+static void
+ULONGLONG_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulonglong *src = input;
+    npy_ubyte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_ubyte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_float items at `input` to npy_ubyte items at
+ * `output` using a plain C cast (no NaN or range checking).  The array
+ * arguments `aip` and `aop` are unused.
+ */
+static void
+FLOAT_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_ubyte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_ubyte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_double items at `input` to npy_ubyte items at
+ * `output` using a plain C cast (no NaN or range checking).  The array
+ * arguments `aip` and `aop` are unused.
+ */
+static void
+DOUBLE_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_ubyte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_ubyte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_longdouble items at `input` to npy_ubyte items
+ * at `output` using a plain C cast (no NaN or range checking).  The array
+ * arguments `aip` and `aop` are unused.
+ */
+static void
+LONGDOUBLE_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_ubyte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_ubyte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_datetime items at `input` to npy_ubyte items
+ * at `output` using a plain C cast (no range checking).  The array
+ * arguments `aip` and `aop` are unused.
+ */
+static void
+DATETIME_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_datetime *src = input;
+    npy_ubyte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_ubyte)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Cast `n` consecutive npy_timedelta items at `input` to npy_ubyte items
+ * at `output` using a plain C cast (no range checking).  The array
+ * arguments `aip` and `aop` are unused.
+ */
+static void
+TIMEDELTA_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_timedelta *src = input;
+    npy_ubyte *dst = output;
+
+    /* count n down to zero, converting one element per step */
+    for (; n != 0; --n) {
+        *dst++ = (npy_ubyte)*src++;
+    }
+}
+
+
+#line 1339
+/*
+ * Cast `n` items at `input`, read as consecutive pairs of npy_float, to
+ * npy_ubyte items at `output`.  Only the first value of each pair
+ * (presumably the real part) is converted, with a plain C cast; the second
+ * is skipped.  The array arguments `aip` and `aop` are unused.
+ */
+static void
+CFLOAT_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_ubyte *dst = output;
+
+    /* advance two source values for every output element */
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_ubyte)*src;
+    }
+}
+
+#line 1339
+/*
+ * Cast `n` items at `input`, read as consecutive pairs of npy_double, to
+ * npy_ubyte items at `output`.  Only the first value of each pair
+ * (presumably the real part) is converted, with a plain C cast; the second
+ * is skipped.  The array arguments `aip` and `aop` are unused.
+ */
+static void
+CDOUBLE_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_ubyte *dst = output;
+
+    /* advance two source values for every output element */
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_ubyte)*src;
+    }
+}
+
+/*
+ * Convert n values from input into npy_ubyte values written to output.
+ * input is walked as npy_longdouble in strides of two and only the first of
+ * each pair is cast (presumably the real component -- confirm).  aip/aop are
+ * unused.
+ */
+#line 1339
+static void
+CLONGDOUBLE_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_ubyte *dst = output;
+
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_ubyte)src[0];
+    }
+}
+
+
+
+#line 1292
+
+/*
+ * Convert n consecutive npy_byte values read from input into npy_short
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+BYTE_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_byte *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_short)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_ubyte values read from input into npy_short
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+UBYTE_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ubyte *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_short)*src++;
+    }
+}
+
+/*
+ * Copy n consecutive npy_short values read from input into npy_short values
+ * written to output.  Each element goes through a plain (same-type) C cast;
+ * no NaN/NaT special-casing is active.  aip/aop are unused.
+ */
+#line 1304
+static void
+SHORT_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_short *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_short)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_ushort values read from input into npy_short
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+USHORT_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ushort *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_short)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_int values read from input into npy_short
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+INT_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_int *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_short)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_uint values read from input into npy_short
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+UINT_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_uint *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_short)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_long values read from input into npy_short
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+LONG_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_long *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_short)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_ulong values read from input into npy_short
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+ULONG_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulong *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_short)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_longlong values read from input into npy_short
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+LONGLONG_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longlong *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_short)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_ulonglong values read from input into npy_short
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+ULONGLONG_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulonglong *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_short)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_float values read from input into npy_short
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+FLOAT_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_short)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_double values read from input into npy_short
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+DOUBLE_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_short)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_longdouble values read from input into npy_short
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+LONGDOUBLE_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_short)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_datetime values read from input into npy_short
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+DATETIME_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_datetime *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_short)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_timedelta values read from input into npy_short
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+TIMEDELTA_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_timedelta *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_short)*src++;
+    }
+}
+
+
+/*
+ * Convert n values from input into npy_short values written to output.
+ * input is walked as npy_float in strides of two and only the first of each
+ * pair is cast (presumably the real component -- confirm).  aip/aop are
+ * unused.
+ */
+#line 1339
+static void
+CFLOAT_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_short)src[0];
+    }
+}
+
+/*
+ * Convert n values from input into npy_short values written to output.
+ * input is walked as npy_double in strides of two and only the first of each
+ * pair is cast (presumably the real component -- confirm).  aip/aop are
+ * unused.
+ */
+#line 1339
+static void
+CDOUBLE_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_short)src[0];
+    }
+}
+
+/*
+ * Convert n values from input into npy_short values written to output.
+ * input is walked as npy_longdouble in strides of two and only the first of
+ * each pair is cast (presumably the real component -- confirm).  aip/aop are
+ * unused.
+ */
+#line 1339
+static void
+CLONGDOUBLE_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_short)src[0];
+    }
+}
+
+
+
+#line 1292
+
+/*
+ * Convert n consecutive npy_byte values read from input into npy_ushort
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+BYTE_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_byte *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_ushort)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_ubyte values read from input into npy_ushort
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+UBYTE_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ubyte *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_ushort)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_short values read from input into npy_ushort
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+SHORT_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_short *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_ushort)*src++;
+    }
+}
+
+/*
+ * Copy n consecutive npy_ushort values read from input into npy_ushort
+ * values written to output.  Each element goes through a plain (same-type)
+ * C cast; no NaN/NaT special-casing is active.  aip/aop are unused.
+ */
+#line 1304
+static void
+USHORT_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ushort *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_ushort)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_int values read from input into npy_ushort
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+INT_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_int *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_ushort)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_uint values read from input into npy_ushort
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+UINT_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_uint *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_ushort)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_long values read from input into npy_ushort
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+LONG_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_long *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_ushort)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_ulong values read from input into npy_ushort
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+ULONG_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulong *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_ushort)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_longlong values read from input into npy_ushort
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+LONGLONG_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longlong *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_ushort)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_ulonglong values read from input into npy_ushort
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+ULONGLONG_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulonglong *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_ushort)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_float values read from input into npy_ushort
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+FLOAT_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_ushort)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_double values read from input into npy_ushort
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+DOUBLE_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_ushort)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_longdouble values read from input into
+ * npy_ushort values written to output.  Each element goes through a plain C
+ * cast; no NaN/NaT special-casing is active for this type pair.  aip/aop
+ * are unused.
+ */
+#line 1304
+static void
+LONGDOUBLE_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_ushort)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_datetime values read from input into npy_ushort
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+DATETIME_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_datetime *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_ushort)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_timedelta values read from input into npy_ushort
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+TIMEDELTA_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_timedelta *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_ushort)*src++;
+    }
+}
+
+
+/*
+ * Convert n values from input into npy_ushort values written to output.
+ * input is walked as npy_float in strides of two and only the first of each
+ * pair is cast (presumably the real component -- confirm).  aip/aop are
+ * unused.
+ */
+#line 1339
+static void
+CFLOAT_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_ushort)src[0];
+    }
+}
+
+/*
+ * Convert n values from input into npy_ushort values written to output.
+ * input is walked as npy_double in strides of two and only the first of each
+ * pair is cast (presumably the real component -- confirm).  aip/aop are
+ * unused.
+ */
+#line 1339
+static void
+CDOUBLE_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_ushort)src[0];
+    }
+}
+
+/*
+ * Convert n values from input into npy_ushort values written to output.
+ * input is walked as npy_longdouble in strides of two and only the first of
+ * each pair is cast (presumably the real component -- confirm).  aip/aop are
+ * unused.
+ */
+#line 1339
+static void
+CLONGDOUBLE_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_ushort)src[0];
+    }
+}
+
+
+
+#line 1292
+
+/*
+ * Convert n consecutive npy_byte values read from input into npy_int values
+ * written to output.  Each element goes through a plain C cast; no NaN/NaT
+ * special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+BYTE_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_byte *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_int)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_ubyte values read from input into npy_int values
+ * written to output.  Each element goes through a plain C cast; no NaN/NaT
+ * special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+UBYTE_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ubyte *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_int)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_short values read from input into npy_int values
+ * written to output.  Each element goes through a plain C cast; no NaN/NaT
+ * special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+SHORT_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_short *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_int)*src++;
+    }
+}
+
+/*
+ * Convert n consecutive npy_ushort values read from input into npy_int
+ * values written to output.  Each element goes through a plain C cast; no
+ * NaN/NaT special-casing is active for this type pair.  aip/aop are unused.
+ */
+#line 1304
+static void
+USHORT_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ushort *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_int)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Copy n consecutive npy_int values from `input` to `output`.  Source
+ * and destination element types are the same, so the cast is a no-op
+ * and this amounts to an element-wise copy.  The aip/aop pointers are
+ * ignored.
+ */
+static void
+INT_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_int *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_int)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_uint values from `input` into npy_int
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+UINT_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_uint *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_int)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_long values from `input` into npy_int
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+LONG_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_long *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_int)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_ulong values from `input` into npy_int
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+ULONG_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulong *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_int)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_longlong values from `input` into npy_int
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+LONGLONG_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longlong *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_int)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_ulonglong values from `input` into npy_int
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+ULONGLONG_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulonglong *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_int)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_float values from `input` into npy_int
+ * values stored in `output`.  Each value goes through a plain C cast;
+ * no NaN or range check is performed.  The aip/aop pointers are
+ * ignored.
+ */
+static void
+FLOAT_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_int)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_double values from `input` into npy_int
+ * values stored in `output`.  Each value goes through a plain C cast;
+ * no NaN or range check is performed.  The aip/aop pointers are
+ * ignored.
+ */
+static void
+DOUBLE_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_int)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_longdouble values from `input` into
+ * npy_int values stored in `output`.  Each value goes through a plain
+ * C cast; no NaN or range check is performed.  The aip/aop pointers
+ * are ignored.
+ */
+static void
+LONGDOUBLE_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_int)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_datetime values from `input` into npy_int
+ * values stored in `output`.  Each value goes through a plain C cast
+ * with no special-casing of any value.  The aip/aop pointers are
+ * ignored.
+ */
+static void
+DATETIME_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_datetime *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_int)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_timedelta values from `input` into npy_int
+ * values stored in `output`.  Each value goes through a plain C cast
+ * with no special-casing of any value.  The aip/aop pointers are
+ * ignored.
+ */
+static void
+TIMEDELTA_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_timedelta *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_int)*src++;
+    }
+}
+
+
+#line 1339
+/*
+ * Produce n npy_int values in `output` from `input`, which is walked
+ * as npy_float values two at a time.  Only the first value of each
+ * pair is cast (presumably the real part of a complex element); the
+ * second is skipped.  No NaN or range check is performed.  The
+ * aip/aop pointers are ignored.
+ */
+static void
+CFLOAT_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_int)src[0];
+    }
+}
+
+#line 1339
+/*
+ * Produce n npy_int values in `output` from `input`, which is walked
+ * as npy_double values two at a time.  Only the first value of each
+ * pair is cast (presumably the real part of a complex element); the
+ * second is skipped.  No NaN or range check is performed.  The
+ * aip/aop pointers are ignored.
+ */
+static void
+CDOUBLE_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_int)src[0];
+    }
+}
+
+#line 1339
+/*
+ * Produce n npy_int values in `output` from `input`, which is walked
+ * as npy_longdouble values two at a time.  Only the first value of
+ * each pair is cast (presumably the real part of a complex element);
+ * the second is skipped.  No NaN or range check is performed.  The
+ * aip/aop pointers are ignored.
+ */
+static void
+CLONGDOUBLE_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_int)src[0];
+    }
+}
+
+
+
+#line 1292
+
+#line 1304
+/*
+ * Convert n consecutive npy_byte values from `input` into npy_uint
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+BYTE_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_byte *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_uint)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_ubyte values from `input` into npy_uint
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+UBYTE_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ubyte *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_uint)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_short values from `input` into npy_uint
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+SHORT_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_short *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_uint)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_ushort values from `input` into npy_uint
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+USHORT_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ushort *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_uint)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_int values from `input` into npy_uint
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+INT_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_int *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_uint)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Copy n consecutive npy_uint values from `input` to `output`.  Source
+ * and destination element types are the same, so the cast is a no-op
+ * and this amounts to an element-wise copy.  The aip/aop pointers are
+ * ignored.
+ */
+static void
+UINT_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_uint *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_uint)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_long values from `input` into npy_uint
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+LONG_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_long *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_uint)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_ulong values from `input` into npy_uint
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+ULONG_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulong *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_uint)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_longlong values from `input` into npy_uint
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+LONGLONG_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longlong *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_uint)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_ulonglong values from `input` into
+ * npy_uint values stored in `output`, one element at a time.  The
+ * aip/aop pointers are ignored.
+ */
+static void
+ULONGLONG_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulonglong *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_uint)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_float values from `input` into npy_uint
+ * values stored in `output`.  Each value goes through a plain C cast;
+ * no NaN or range check is performed.  The aip/aop pointers are
+ * ignored.
+ */
+static void
+FLOAT_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_uint)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_double values from `input` into npy_uint
+ * values stored in `output`.  Each value goes through a plain C cast;
+ * no NaN or range check is performed.  The aip/aop pointers are
+ * ignored.
+ */
+static void
+DOUBLE_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_uint)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_longdouble values from `input` into
+ * npy_uint values stored in `output`.  Each value goes through a plain
+ * C cast; no NaN or range check is performed.  The aip/aop pointers
+ * are ignored.
+ */
+static void
+LONGDOUBLE_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_uint)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_datetime values from `input` into npy_uint
+ * values stored in `output`.  Each value goes through a plain C cast
+ * with no special-casing of any value.  The aip/aop pointers are
+ * ignored.
+ */
+static void
+DATETIME_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_datetime *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_uint)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_timedelta values from `input` into
+ * npy_uint values stored in `output`.  Each value goes through a plain
+ * C cast with no special-casing of any value.  The aip/aop pointers
+ * are ignored.
+ */
+static void
+TIMEDELTA_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_timedelta *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_uint)*src++;
+    }
+}
+
+
+#line 1339
+/*
+ * Produce n npy_uint values in `output` from `input`, which is walked
+ * as npy_float values two at a time.  Only the first value of each
+ * pair is cast (presumably the real part of a complex element); the
+ * second is skipped.  No NaN or range check is performed.  The
+ * aip/aop pointers are ignored.
+ */
+static void
+CFLOAT_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_uint)src[0];
+    }
+}
+
+#line 1339
+/*
+ * Produce n npy_uint values in `output` from `input`, which is walked
+ * as npy_double values two at a time.  Only the first value of each
+ * pair is cast (presumably the real part of a complex element); the
+ * second is skipped.  No NaN or range check is performed.  The
+ * aip/aop pointers are ignored.
+ */
+static void
+CDOUBLE_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_uint)src[0];
+    }
+}
+
+#line 1339
+/*
+ * Produce n npy_uint values in `output` from `input`, which is walked
+ * as npy_longdouble values two at a time.  Only the first value of
+ * each pair is cast (presumably the real part of a complex element);
+ * the second is skipped.  No NaN or range check is performed.  The
+ * aip/aop pointers are ignored.
+ */
+static void
+CLONGDOUBLE_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n, src += 2) {
+        *dst++ = (npy_uint)src[0];
+    }
+}
+
+
+
+#line 1292
+
+#line 1304
+/*
+ * Convert n consecutive npy_byte values from `input` into npy_long
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+BYTE_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_byte *src = input;
+    npy_long *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_long)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_ubyte values from `input` into npy_long
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+UBYTE_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ubyte *src = input;
+    npy_long *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_long)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_short values from `input` into npy_long
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+SHORT_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_short *src = input;
+    npy_long *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_long)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_ushort values from `input` into npy_long
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+USHORT_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ushort *src = input;
+    npy_long *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_long)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_int values from `input` into npy_long
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+INT_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_int *src = input;
+    npy_long *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_long)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_uint values from `input` into npy_long
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+UINT_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_uint *src = input;
+    npy_long *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_long)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Copy n consecutive npy_long values from `input` to `output`.  Source
+ * and destination element types are the same, so the cast is a no-op
+ * and this amounts to an element-wise copy.  The aip/aop pointers are
+ * ignored.
+ */
+static void
+LONG_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_long *src = input;
+    npy_long *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_long)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_ulong values from `input` into npy_long
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+ULONG_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulong *src = input;
+    npy_long *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_long)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_longlong values from `input` into npy_long
+ * values stored in `output`, one element at a time.  The aip/aop
+ * pointers are ignored.
+ */
+static void
+LONGLONG_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longlong *src = input;
+    npy_long *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_long)*src++;
+    }
+}
+
+#line 1304
+/*
+ * Convert n consecutive npy_ulonglong values from `input` into
+ * npy_long values stored in `output`, one element at a time.  The
+ * aip/aop pointers are ignored.
+ */
+static void
+ULONGLONG_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulonglong *src = input;
+    npy_long *dst = output;
+
+    for (; n != 0; --n) {
+        *dst++ = (npy_long)*src++;
+    }
+}
+
+#line 1304
+static void
+FLOAT_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_float *ip = input;
+    npy_long *op = output;
+    /* Cast n npy_float items from input into npy_long output. */
+    while (n--) {
+#if 0 && 1  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_float f = *ip++;
+        npy_long t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_long)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_long)f;
+        }
+#else  /* active path: plain C cast; NaN is not special-cased here */
+        npy_long t = (npy_long)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+DOUBLE_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_double *ip = input;
+    npy_long *op = output;
+    /* Cast n npy_double items from input into npy_long output. */
+    while (n--) {
+#if 0 && 1  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_double f = *ip++;
+        npy_long t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_long)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_long)f;
+        }
+#else  /* active path: plain C cast; NaN is not special-cased here */
+        npy_long t = (npy_long)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+LONGDOUBLE_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_longdouble *ip = input;
+    npy_long *op = output;
+    /* Cast n npy_longdouble items from input into npy_long output. */
+    while (n--) {
+#if 0 && 1  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_longdouble f = *ip++;
+        npy_long t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_long)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_long)f;
+        }
+#else  /* active path: plain C cast; NaN is not special-cased here */
+        npy_long t = (npy_long)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+DATETIME_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_datetime *ip = input;
+    npy_long *op = output;
+    /* Cast n npy_datetime items from input into npy_long output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_datetime f = *ip++;
+        npy_long t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_long)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_long)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_long t = (npy_long)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+TIMEDELTA_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_timedelta *ip = input;
+    npy_long *op = output;
+    /* Cast n npy_timedelta items from input into npy_long output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_timedelta f = *ip++;
+        npy_long t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_long)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_long)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_long t = (npy_long)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+
+#line 1339
+static void
+CFLOAT_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_float *ip = input;
+    npy_long *op = output;
+    /* Cast the first npy_float of each of n pairs into npy_long output. */
+    while (n--) {
+#if 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_float f = *ip;
+        npy_long t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_long)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_long)f;
+        }
+#else  /* active path: plain C cast; NaN is not special-cased here */
+        npy_long t = (npy_long)*ip;
+#endif
+        *op++ = t;
+        ip += 2;  /* step over the second value of the pair */
+    }
+}
+
+#line 1339
+static void
+CDOUBLE_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_double *ip = input;
+    npy_long *op = output;
+    /* Cast the first npy_double of each of n pairs into npy_long output. */
+    while (n--) {
+#if 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_double f = *ip;
+        npy_long t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_long)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_long)f;
+        }
+#else  /* active path: plain C cast; NaN is not special-cased here */
+        npy_long t = (npy_long)*ip;
+#endif
+        *op++ = t;
+        ip += 2;  /* step over the second value of the pair */
+    }
+}
+
+#line 1339
+static void
+CLONGDOUBLE_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_longdouble *ip = input;
+    npy_long *op = output;
+    /* Cast first npy_longdouble of each of n pairs into npy_long output. */
+    while (n--) {
+#if 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_longdouble f = *ip;
+        npy_long t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_long)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_long)f;
+        }
+#else  /* active path: plain C cast; NaN is not special-cased here */
+        npy_long t = (npy_long)*ip;
+#endif
+        *op++ = t;
+        ip += 2;  /* step over the second value of the pair */
+    }
+}
+
+
+
+#line 1292
+
+#line 1304
+static void
+BYTE_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_byte *ip = input;
+    npy_ulong *op = output;
+    /* Cast n npy_byte items from input into npy_ulong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_byte f = *ip++;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_ulong t = (npy_ulong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+UBYTE_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_ubyte *ip = input;
+    npy_ulong *op = output;
+    /* Cast n npy_ubyte items from input into npy_ulong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_ubyte f = *ip++;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_ulong t = (npy_ulong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+SHORT_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_short *ip = input;
+    npy_ulong *op = output;
+    /* Cast n npy_short items from input into npy_ulong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_short f = *ip++;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_ulong t = (npy_ulong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+USHORT_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_ushort *ip = input;
+    npy_ulong *op = output;
+    /* Cast n npy_ushort items from input into npy_ulong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_ushort f = *ip++;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_ulong t = (npy_ulong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+INT_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_int *ip = input;
+    npy_ulong *op = output;
+    /* Cast n npy_int items from input into npy_ulong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_int f = *ip++;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_ulong t = (npy_ulong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+UINT_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_uint *ip = input;
+    npy_ulong *op = output;
+    /* Cast n npy_uint items from input into npy_ulong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_uint f = *ip++;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_ulong t = (npy_ulong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+LONG_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_long *ip = input;
+    npy_ulong *op = output;
+    /* Cast n npy_long items from input into npy_ulong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_long f = *ip++;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_ulong t = (npy_ulong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+ULONG_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_ulong *ip = input;
+    npy_ulong *op = output;
+    /* Copy n npy_ulong items from input to output (same-type cast). */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_ulong f = *ip++;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_ulong t = (npy_ulong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+LONGLONG_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_longlong *ip = input;
+    npy_ulong *op = output;
+    /* Cast n npy_longlong items from input into npy_ulong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_longlong f = *ip++;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_ulong t = (npy_ulong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+ULONGLONG_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_ulonglong *ip = input;
+    npy_ulong *op = output;
+    /* Cast n npy_ulonglong items from input into npy_ulong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_ulonglong f = *ip++;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_ulong t = (npy_ulong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+FLOAT_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_float *ip = input;
+    npy_ulong *op = output;
+    /* Cast n npy_float items from input into npy_ulong output. */
+    while (n--) {
+#if 0 && 1  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_float f = *ip++;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast; NaN is not special-cased here */
+        npy_ulong t = (npy_ulong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+DOUBLE_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_double *ip = input;
+    npy_ulong *op = output;
+    /* Cast n npy_double items from input into npy_ulong output. */
+    while (n--) {
+#if 0 && 1  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_double f = *ip++;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast; NaN is not special-cased here */
+        npy_ulong t = (npy_ulong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+LONGDOUBLE_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_longdouble *ip = input;
+    npy_ulong *op = output;
+    /* Cast n npy_longdouble items from input into npy_ulong output. */
+    while (n--) {
+#if 0 && 1  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_longdouble f = *ip++;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast; NaN is not special-cased here */
+        npy_ulong t = (npy_ulong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+DATETIME_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_datetime *ip = input;
+    npy_ulong *op = output;
+    /* Cast n npy_datetime items from input into npy_ulong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_datetime f = *ip++;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_ulong t = (npy_ulong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+TIMEDELTA_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_timedelta *ip = input;
+    npy_ulong *op = output;
+    /* Cast n npy_timedelta items from input into npy_ulong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_timedelta f = *ip++;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_ulong t = (npy_ulong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+
+#line 1339
+static void
+CFLOAT_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_float *ip = input;
+    npy_ulong *op = output;
+    /* Cast the first npy_float of each of n pairs into npy_ulong output. */
+    while (n--) {
+#if 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_float f = *ip;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast; NaN is not special-cased here */
+        npy_ulong t = (npy_ulong)*ip;
+#endif
+        *op++ = t;
+        ip += 2;  /* step over the second value of the pair */
+    }
+}
+
+#line 1339
+static void
+CDOUBLE_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_double *ip = input;
+    npy_ulong *op = output;
+    /* Cast the first npy_double of each of n pairs into npy_ulong output. */
+    while (n--) {
+#if 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_double f = *ip;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast; NaN is not special-cased here */
+        npy_ulong t = (npy_ulong)*ip;
+#endif
+        *op++ = t;
+        ip += 2;  /* step over the second value of the pair */
+    }
+}
+
+#line 1339
+static void
+CLONGDOUBLE_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_longdouble *ip = input;
+    npy_ulong *op = output;
+    /* Cast first npy_longdouble of each of n pairs into npy_ulong output. */
+    while (n--) {
+#if 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_longdouble f = *ip;
+        npy_ulong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_ulong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_ulong)f;
+        }
+#else  /* active path: plain C cast; NaN is not special-cased here */
+        npy_ulong t = (npy_ulong)*ip;
+#endif
+        *op++ = t;
+        ip += 2;  /* step over the second value of the pair */
+    }
+}
+
+
+
+#line 1292
+
+#line 1304
+static void
+BYTE_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_byte *ip = input;
+    npy_longlong *op = output;
+    /* Cast n npy_byte items from input into npy_longlong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_byte f = *ip++;
+        npy_longlong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_longlong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_longlong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_longlong t = (npy_longlong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+UBYTE_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_ubyte *ip = input;
+    npy_longlong *op = output;
+    /* Cast n npy_ubyte items from input into npy_longlong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_ubyte f = *ip++;
+        npy_longlong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_longlong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_longlong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_longlong t = (npy_longlong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+SHORT_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_short *ip = input;
+    npy_longlong *op = output;
+    /* Cast n npy_short items from input into npy_longlong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_short f = *ip++;
+        npy_longlong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_longlong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_longlong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_longlong t = (npy_longlong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+USHORT_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_ushort *ip = input;
+    npy_longlong *op = output;
+    /* Cast n npy_ushort items from input into npy_longlong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_ushort f = *ip++;
+        npy_longlong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_longlong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_longlong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_longlong t = (npy_longlong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+INT_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_int *ip = input;
+    npy_longlong *op = output;
+    /* Cast n npy_int items from input into npy_longlong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_int f = *ip++;
+        npy_longlong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_longlong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_longlong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_longlong t = (npy_longlong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+UINT_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_uint *ip = input;
+    npy_longlong *op = output;
+    /* Cast n npy_uint items from input into npy_longlong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_uint f = *ip++;
+        npy_longlong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_longlong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_longlong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_longlong t = (npy_longlong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+LONG_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_long *ip = input;
+    npy_longlong *op = output;
+    /* Cast n npy_long items from input into npy_longlong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_long f = *ip++;
+        npy_longlong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_longlong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_longlong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_longlong t = (npy_longlong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+ULONG_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_ulong *ip = input;
+    npy_longlong *op = output;
+    /* Cast n npy_ulong items from input into npy_longlong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_ulong f = *ip++;
+        npy_longlong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_longlong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_longlong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_longlong t = (npy_longlong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+LONGLONG_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_longlong *ip = input;
+    npy_longlong *op = output;
+    /* Copy n npy_longlong items from input to output (same-type cast). */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_longlong f = *ip++;
+        npy_longlong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_longlong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_longlong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_longlong t = (npy_longlong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+ULONGLONG_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_ulonglong *ip = input;
+    npy_longlong *op = output;
+    /* Cast n npy_ulonglong items from input into npy_longlong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_ulonglong f = *ip++;
+        npy_longlong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_longlong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_longlong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_longlong t = (npy_longlong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+FLOAT_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_float *ip = input;
+    npy_longlong *op = output;
+    /* Cast n npy_float items from input into npy_longlong output. */
+    while (n--) {
+#if 0 && 1  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_float f = *ip++;
+        npy_longlong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_longlong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_longlong)f;
+        }
+#else  /* active path: plain C cast; NaN is not special-cased here */
+        npy_longlong t = (npy_longlong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+DOUBLE_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_double *ip = input;
+    npy_longlong *op = output;
+    /* Cast n npy_double items from input into npy_longlong output. */
+    while (n--) {
+#if 0 && 1  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_double f = *ip++;
+        npy_longlong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_longlong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_longlong)f;
+        }
+#else  /* active path: plain C cast; NaN is not special-cased here */
+        npy_longlong t = (npy_longlong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+LONGDOUBLE_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_longdouble *ip = input;
+    npy_longlong *op = output;
+    /* Cast n npy_longdouble items from input into npy_longlong output. */
+    while (n--) {
+#if 0 && 1  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_longdouble f = *ip++;
+        npy_longlong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_longlong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_longlong)f;
+        }
+#else  /* active path: plain C cast; NaN is not special-cased here */
+        npy_longlong t = (npy_longlong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+DATETIME_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_datetime *ip = input;
+    npy_longlong *op = output;
+    /* Cast n npy_datetime items from input into npy_longlong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_datetime f = *ip++;
+        npy_longlong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_longlong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_longlong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_longlong t = (npy_longlong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+TIMEDELTA_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))  /* aip, aop: unused */
+{
+    const npy_timedelta *ip = input;
+    npy_longlong *op = output;
+    /* Cast n npy_timedelta items from input into npy_longlong output. */
+    while (n--) {
+#if 0 && 0  /* constant false: the NaN -> NaT guard is compiled out */
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_timedelta f = *ip++;
+        npy_longlong t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_longlong)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_longlong)f;
+        }
+#else  /* active path: plain C cast of each element */
+        npy_longlong t = (npy_longlong)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+
+#line 1339
+/*
+ * Element-wise cast from pairs of npy_float to npy_longlong.
+ *
+ * `input` is walked two source values at a time: the first value of
+ * each pair is converted with a plain C cast and stored at the next
+ * position of `output`; the second value of each pair is skipped.
+ * (Presumably the real/imaginary parts of a complex value -- confirm.)
+ *
+ * input    source buffer, read as n pairs of const npy_float
+ * output   destination buffer, written as npy_longlong[n]
+ * n        number of pairs to convert
+ * aip/aop  unused
+ *
+ * NOTE: NaN/out-of-range inputs are cast unchecked (undefined in C).
+ */
+static void
+CFLOAT_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_longlong *dst = output;
+
+    /* Advance the source by two per stored element. */
+    for (; n != 0; --n, src += 2) {
+        const npy_float first = *src;
+
+        *dst++ = (npy_longlong)first;
+    }
+}
+
+#line 1339
+/*
+ * Element-wise cast from pairs of npy_double to npy_longlong.
+ *
+ * `input` is walked two source values at a time: the first value of
+ * each pair is converted with a plain C cast and stored at the next
+ * position of `output`; the second value of each pair is skipped.
+ * (Presumably the real/imaginary parts of a complex value -- confirm.)
+ *
+ * input    source buffer, read as n pairs of const npy_double
+ * output   destination buffer, written as npy_longlong[n]
+ * n        number of pairs to convert
+ * aip/aop  unused
+ *
+ * NOTE: NaN/out-of-range inputs are cast unchecked (undefined in C).
+ */
+static void
+CDOUBLE_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_longlong *dst = output;
+
+    /* Advance the source by two per stored element. */
+    for (; n != 0; --n, src += 2) {
+        const npy_double first = *src;
+
+        *dst++ = (npy_longlong)first;
+    }
+}
+
+#line 1339
+/*
+ * Element-wise cast from pairs of npy_longdouble to npy_longlong.
+ *
+ * `input` is walked two source values at a time: the first value of
+ * each pair is converted with a plain C cast and stored at the next
+ * position of `output`; the second value of each pair is skipped.
+ * (Presumably the real/imaginary parts of a complex value -- confirm.)
+ *
+ * input    source buffer, read as n pairs of const npy_longdouble
+ * output   destination buffer, written as npy_longlong[n]
+ * n        number of pairs to convert
+ * aip/aop  unused
+ *
+ * NOTE: NaN/out-of-range inputs are cast unchecked (undefined in C).
+ */
+static void
+CLONGDOUBLE_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_longlong *dst = output;
+
+    /* Advance the source by two per stored element. */
+    for (; n != 0; --n, src += 2) {
+        const npy_longdouble first = *src;
+
+        *dst++ = (npy_longlong)first;
+    }
+}
+
+
+
+#line 1292
+
+#line 1304
+/*
+ * Element-wise cast from npy_byte to npy_ulonglong.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_byte[n]
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+BYTE_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_byte *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_byte value = *src++;
+
+        *dst++ = (npy_ulonglong)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_ubyte to npy_ulonglong.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_ubyte[n]
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+UBYTE_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ubyte *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_ubyte value = *src++;
+
+        *dst++ = (npy_ulonglong)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_short to npy_ulonglong.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_short[n]
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+SHORT_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_short *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_short value = *src++;
+
+        *dst++ = (npy_ulonglong)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_ushort to npy_ulonglong.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_ushort[n]
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+USHORT_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ushort *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_ushort value = *src++;
+
+        *dst++ = (npy_ulonglong)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_int to npy_ulonglong.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_int[n]
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+INT_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_int *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_int value = *src++;
+
+        *dst++ = (npy_ulonglong)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_uint to npy_ulonglong.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_uint[n]
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+UINT_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_uint *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_uint value = *src++;
+
+        *dst++ = (npy_ulonglong)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_long to npy_ulonglong.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_long[n]
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+LONG_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_long *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_long value = *src++;
+
+        *dst++ = (npy_ulonglong)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_ulong to npy_ulonglong.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_ulong[n]
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+ULONG_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulong *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_ulong value = *src++;
+
+        *dst++ = (npy_ulonglong)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_longlong to npy_ulonglong.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_longlong[n]
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+LONGLONG_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longlong *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_longlong value = *src++;
+
+        *dst++ = (npy_ulonglong)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_ulonglong to npy_ulonglong.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_ulonglong[n]
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+ULONGLONG_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulonglong *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_ulonglong value = *src++;
+
+        *dst++ = (npy_ulonglong)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_float to npy_ulonglong.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_float[n]
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * NOTE: NaN/out-of-range inputs are cast unchecked (undefined in C).
+ */
+static void
+FLOAT_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_float value = *src++;
+
+        *dst++ = (npy_ulonglong)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_double to npy_ulonglong.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_double[n]
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * NOTE: NaN/out-of-range inputs are cast unchecked (undefined in C).
+ */
+static void
+DOUBLE_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_double value = *src++;
+
+        *dst++ = (npy_ulonglong)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_longdouble to npy_ulonglong.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_longdouble[n]
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * NOTE: NaN/out-of-range inputs are cast unchecked (undefined in C).
+ */
+static void
+LONGDOUBLE_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_longdouble value = *src++;
+
+        *dst++ = (npy_ulonglong)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_datetime to npy_ulonglong.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_datetime[n]
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+DATETIME_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_datetime *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_datetime value = *src++;
+
+        *dst++ = (npy_ulonglong)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_timedelta to npy_ulonglong.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_timedelta[n]
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+TIMEDELTA_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_timedelta *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_timedelta value = *src++;
+
+        *dst++ = (npy_ulonglong)value;
+    }
+}
+
+
+#line 1339
+/*
+ * Element-wise cast from pairs of npy_float to npy_ulonglong.
+ *
+ * `input` is walked two source values at a time: the first value of
+ * each pair is converted with a plain C cast and stored at the next
+ * position of `output`; the second value of each pair is skipped.
+ * (Presumably the real/imaginary parts of a complex value -- confirm.)
+ *
+ * input    source buffer, read as n pairs of const npy_float
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of pairs to convert
+ * aip/aop  unused
+ *
+ * NOTE: NaN/out-of-range inputs are cast unchecked (undefined in C).
+ */
+static void
+CFLOAT_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Advance the source by two per stored element. */
+    for (; n != 0; --n, src += 2) {
+        const npy_float first = *src;
+
+        *dst++ = (npy_ulonglong)first;
+    }
+}
+
+#line 1339
+/*
+ * Element-wise cast from pairs of npy_double to npy_ulonglong.
+ *
+ * `input` is walked two source values at a time: the first value of
+ * each pair is converted with a plain C cast and stored at the next
+ * position of `output`; the second value of each pair is skipped.
+ * (Presumably the real/imaginary parts of a complex value -- confirm.)
+ *
+ * input    source buffer, read as n pairs of const npy_double
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of pairs to convert
+ * aip/aop  unused
+ *
+ * NOTE: NaN/out-of-range inputs are cast unchecked (undefined in C).
+ */
+static void
+CDOUBLE_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Advance the source by two per stored element. */
+    for (; n != 0; --n, src += 2) {
+        const npy_double first = *src;
+
+        *dst++ = (npy_ulonglong)first;
+    }
+}
+
+#line 1339
+/*
+ * Element-wise cast from pairs of npy_longdouble to npy_ulonglong.
+ *
+ * `input` is walked two source values at a time: the first value of
+ * each pair is converted with a plain C cast and stored at the next
+ * position of `output`; the second value of each pair is skipped.
+ * (Presumably the real/imaginary parts of a complex value -- confirm.)
+ *
+ * input    source buffer, read as n pairs of const npy_longdouble
+ * output   destination buffer, written as npy_ulonglong[n]
+ * n        number of pairs to convert
+ * aip/aop  unused
+ *
+ * NOTE: NaN/out-of-range inputs are cast unchecked (undefined in C).
+ */
+static void
+CLONGDOUBLE_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_ulonglong *dst = output;
+
+    /* Advance the source by two per stored element. */
+    for (; n != 0; --n, src += 2) {
+        const npy_longdouble first = *src;
+
+        *dst++ = (npy_ulonglong)first;
+    }
+}
+
+
+
+#line 1292
+
+#line 1304
+/*
+ * Element-wise cast from npy_byte to npy_float.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_byte[n]
+ * output   destination buffer, written as npy_float[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+BYTE_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_byte *src = input;
+    npy_float *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_byte value = *src++;
+
+        *dst++ = (npy_float)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_ubyte to npy_float.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_ubyte[n]
+ * output   destination buffer, written as npy_float[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+UBYTE_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ubyte *src = input;
+    npy_float *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_ubyte value = *src++;
+
+        *dst++ = (npy_float)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_short to npy_float.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_short[n]
+ * output   destination buffer, written as npy_float[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+SHORT_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_short *src = input;
+    npy_float *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_short value = *src++;
+
+        *dst++ = (npy_float)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_ushort to npy_float.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_ushort[n]
+ * output   destination buffer, written as npy_float[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+USHORT_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ushort *src = input;
+    npy_float *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_ushort value = *src++;
+
+        *dst++ = (npy_float)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_int to npy_float.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_int[n]
+ * output   destination buffer, written as npy_float[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+INT_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_int *src = input;
+    npy_float *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_int value = *src++;
+
+        *dst++ = (npy_float)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_uint to npy_float.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_uint[n]
+ * output   destination buffer, written as npy_float[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+UINT_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_uint *src = input;
+    npy_float *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_uint value = *src++;
+
+        *dst++ = (npy_float)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_long to npy_float.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_long[n]
+ * output   destination buffer, written as npy_float[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+LONG_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_long *src = input;
+    npy_float *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_long value = *src++;
+
+        *dst++ = (npy_float)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_ulong to npy_float.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_ulong[n]
+ * output   destination buffer, written as npy_float[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+ULONG_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulong *src = input;
+    npy_float *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_ulong value = *src++;
+
+        *dst++ = (npy_float)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_longlong to npy_float.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_longlong[n]
+ * output   destination buffer, written as npy_float[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+LONGLONG_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longlong *src = input;
+    npy_float *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_longlong value = *src++;
+
+        *dst++ = (npy_float)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_ulonglong to npy_float.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_ulonglong[n]
+ * output   destination buffer, written as npy_float[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+ULONGLONG_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulonglong *src = input;
+    npy_float *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_ulonglong value = *src++;
+
+        *dst++ = (npy_float)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_float to npy_float.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_float[n]
+ * output   destination buffer, written as npy_float[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+FLOAT_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_float *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_float value = *src++;
+
+        *dst++ = (npy_float)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_double to npy_float.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_double[n]
+ * output   destination buffer, written as npy_float[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+DOUBLE_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_float *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_double value = *src++;
+
+        *dst++ = (npy_float)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_longdouble to npy_float.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_longdouble[n]
+ * output   destination buffer, written as npy_float[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+LONGDOUBLE_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_float *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_longdouble value = *src++;
+
+        *dst++ = (npy_float)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_datetime to npy_float.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_datetime[n]
+ * output   destination buffer, written as npy_float[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+DATETIME_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_datetime *src = input;
+    npy_float *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_datetime value = *src++;
+
+        *dst++ = (npy_float)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_timedelta to npy_float.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_timedelta[n]
+ * output   destination buffer, written as npy_float[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+TIMEDELTA_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_timedelta *src = input;
+    npy_float *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_timedelta value = *src++;
+
+        *dst++ = (npy_float)value;
+    }
+}
+
+
+#line 1339
+/*
+ * Element-wise cast from pairs of npy_float to npy_float.
+ *
+ * `input` is walked two source values at a time: the first value of
+ * each pair is converted with a plain C cast and stored at the next
+ * position of `output`; the second value of each pair is skipped.
+ * (Presumably the real/imaginary parts of a complex value -- confirm.)
+ *
+ * input    source buffer, read as n pairs of const npy_float
+ * output   destination buffer, written as npy_float[n]
+ * n        number of pairs to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+CFLOAT_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_float *dst = output;
+
+    /* Advance the source by two per stored element. */
+    for (; n != 0; --n, src += 2) {
+        const npy_float first = *src;
+
+        *dst++ = (npy_float)first;
+    }
+}
+
+#line 1339
+/*
+ * Element-wise cast from pairs of npy_double to npy_float.
+ *
+ * `input` is walked two source values at a time: the first value of
+ * each pair is converted with a plain C cast and stored at the next
+ * position of `output`; the second value of each pair is skipped.
+ * (Presumably the real/imaginary parts of a complex value -- confirm.)
+ *
+ * input    source buffer, read as n pairs of const npy_double
+ * output   destination buffer, written as npy_float[n]
+ * n        number of pairs to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+CDOUBLE_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_float *dst = output;
+
+    /* Advance the source by two per stored element. */
+    for (; n != 0; --n, src += 2) {
+        const npy_double first = *src;
+
+        *dst++ = (npy_float)first;
+    }
+}
+
+#line 1339
+/*
+ * Element-wise cast from pairs of npy_longdouble to npy_float.
+ *
+ * `input` is walked two source values at a time: the first value of
+ * each pair is converted with a plain C cast and stored at the next
+ * position of `output`; the second value of each pair is skipped.
+ * (Presumably the real/imaginary parts of a complex value -- confirm.)
+ *
+ * input    source buffer, read as n pairs of const npy_longdouble
+ * output   destination buffer, written as npy_float[n]
+ * n        number of pairs to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+CLONGDOUBLE_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_float *dst = output;
+
+    /* Advance the source by two per stored element. */
+    for (; n != 0; --n, src += 2) {
+        const npy_longdouble first = *src;
+
+        *dst++ = (npy_float)first;
+    }
+}
+
+
+
+#line 1292
+
+#line 1304
+/*
+ * Element-wise cast from npy_byte to npy_double.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_byte[n]
+ * output   destination buffer, written as npy_double[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+BYTE_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_byte *src = input;
+    npy_double *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_byte value = *src++;
+
+        *dst++ = (npy_double)value;
+    }
+}
+
+#line 1304
+/*
+ * Element-wise cast from npy_ubyte to npy_double.
+ *
+ * Reads `n` consecutive source elements from `input` and stores each
+ * one, converted with a plain C cast, at the same index of `output`.
+ * No NaN -> NaT special-casing is performed for this type pair.
+ *
+ * input    source buffer, read as const npy_ubyte[n]
+ * output   destination buffer, written as npy_double[n]
+ * n        number of elements to convert
+ * aip/aop  unused
+ *
+ * Nothing is returned; results are delivered through `output`.
+ */
+static void
+UBYTE_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ubyte *src = input;
+    npy_double *dst = output;
+
+    /* Front to back; each element is read before its slot is written. */
+    for (; n != 0; --n) {
+        const npy_ubyte value = *src++;
+
+        *dst++ = (npy_double)value;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_short values to npy_double, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_short pointer
+ * output  destination buffer, written through a npy_double pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+SHORT_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_short *src = input;
+    npy_double *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_double converted = (npy_double)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_ushort values to npy_double, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_ushort pointer
+ * output  destination buffer, written through a npy_double pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+USHORT_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ushort *src = input;
+    npy_double *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_double converted = (npy_double)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_int values to npy_double, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_int pointer
+ * output  destination buffer, written through a npy_double pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+INT_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_int *src = input;
+    npy_double *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_double converted = (npy_double)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_uint values to npy_double, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_uint pointer
+ * output  destination buffer, written through a npy_double pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+UINT_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_uint *src = input;
+    npy_double *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_double converted = (npy_double)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_long values to npy_double, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_long pointer
+ * output  destination buffer, written through a npy_double pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+LONG_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_long *src = input;
+    npy_double *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_double converted = (npy_double)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_ulong values to npy_double, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_ulong pointer
+ * output  destination buffer, written through a npy_double pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+ULONG_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulong *src = input;
+    npy_double *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_double converted = (npy_double)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_longlong values to npy_double, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_longlong pointer
+ * output  destination buffer, written through a npy_double pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+LONGLONG_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longlong *src = input;
+    npy_double *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_double converted = (npy_double)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_ulonglong values to npy_double, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_ulonglong pointer
+ * output  destination buffer, written through a npy_double pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+ULONGLONG_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulonglong *src = input;
+    npy_double *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_double converted = (npy_double)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_float values to npy_double, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_float pointer
+ * output  destination buffer, written through a npy_double pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+FLOAT_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_double *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_double converted = (npy_double)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_double values to npy_double, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_double pointer
+ * output  destination buffer, written through a npy_double pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+DOUBLE_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_double *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_double converted = (npy_double)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_longdouble values to npy_double, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_longdouble pointer
+ * output  destination buffer, written through a npy_double pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+LONGDOUBLE_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_double *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_double converted = (npy_double)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_datetime values to npy_double, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_datetime pointer
+ * output  destination buffer, written through a npy_double pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+DATETIME_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_datetime *src = input;
+    npy_double *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_double converted = (npy_double)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_timedelta values to npy_double, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_timedelta pointer
+ * output  destination buffer, written through a npy_double pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+TIMEDELTA_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_timedelta *src = input;
+    npy_double *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_double converted = (npy_double)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+
+#line 1339
+/*
+ * For each of n consecutive npy_float pairs, cast the first member
+ * to npy_double and skip the second member.
+ *
+ * input   source buffer, viewed as pairs via a const npy_float pointer
+ *         (presumably real/imaginary parts of complex values - confirm)
+ * output  destination buffer, written through a npy_double pointer
+ * n       number of pairs to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled here; each kept value goes
+ * through the plain C conversion.
+ */
+static void
+CFLOAT_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_double *dst = output;
+
+    /* Step the source two scalars at a time, the destination one. */
+    for (; n != 0; n--, src += 2) {
+        /* Convert the leading scalar of the current pair. */
+        const npy_double converted = (npy_double)*src;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1339
+/*
+ * For each of n consecutive npy_double pairs, cast the first member
+ * to npy_double and skip the second member.
+ *
+ * input   source buffer, viewed as pairs via a const npy_double pointer
+ *         (presumably real/imaginary parts of complex values - confirm)
+ * output  destination buffer, written through a npy_double pointer
+ * n       number of pairs to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled here; each kept value goes
+ * through the plain C conversion.
+ */
+static void
+CDOUBLE_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_double *dst = output;
+
+    /* Step the source two scalars at a time, the destination one. */
+    for (; n != 0; n--, src += 2) {
+        /* Convert the leading scalar of the current pair. */
+        const npy_double converted = (npy_double)*src;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1339
+/*
+ * For each of n consecutive npy_longdouble pairs, cast the first member
+ * to npy_double and skip the second member.
+ *
+ * input   source buffer, viewed as pairs via a const npy_longdouble pointer
+ *         (presumably real/imaginary parts of complex values - confirm)
+ * output  destination buffer, written through a npy_double pointer
+ * n       number of pairs to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled here; each kept value goes
+ * through the plain C conversion.
+ */
+static void
+CLONGDOUBLE_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_double *dst = output;
+
+    /* Step the source two scalars at a time, the destination one. */
+    for (; n != 0; n--, src += 2) {
+        /* Convert the leading scalar of the current pair. */
+        const npy_double converted = (npy_double)*src;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+
+
+#line 1292
+
+#line 1304
+/*
+ * Cast n consecutive npy_byte values to npy_longdouble, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_byte pointer
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+BYTE_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_byte *src = input;
+    npy_longdouble *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_longdouble converted = (npy_longdouble)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_ubyte values to npy_longdouble, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_ubyte pointer
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+UBYTE_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ubyte *src = input;
+    npy_longdouble *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_longdouble converted = (npy_longdouble)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_short values to npy_longdouble, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_short pointer
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+SHORT_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_short *src = input;
+    npy_longdouble *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_longdouble converted = (npy_longdouble)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_ushort values to npy_longdouble, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_ushort pointer
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+USHORT_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ushort *src = input;
+    npy_longdouble *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_longdouble converted = (npy_longdouble)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_int values to npy_longdouble, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_int pointer
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+INT_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_int *src = input;
+    npy_longdouble *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_longdouble converted = (npy_longdouble)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_uint values to npy_longdouble, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_uint pointer
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+UINT_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_uint *src = input;
+    npy_longdouble *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_longdouble converted = (npy_longdouble)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_long values to npy_longdouble, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_long pointer
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+LONG_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_long *src = input;
+    npy_longdouble *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_longdouble converted = (npy_longdouble)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_ulong values to npy_longdouble, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_ulong pointer
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+ULONG_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulong *src = input;
+    npy_longdouble *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_longdouble converted = (npy_longdouble)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_longlong values to npy_longdouble, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_longlong pointer
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+LONGLONG_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longlong *src = input;
+    npy_longdouble *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_longdouble converted = (npy_longdouble)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_ulonglong values to npy_longdouble, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_ulonglong pointer
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+ULONGLONG_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ulonglong *src = input;
+    npy_longdouble *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_longdouble converted = (npy_longdouble)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_float values to npy_longdouble, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_float pointer
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+FLOAT_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_longdouble *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_longdouble converted = (npy_longdouble)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_double values to npy_longdouble, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_double pointer
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+DOUBLE_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_longdouble *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_longdouble converted = (npy_longdouble)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_longdouble values to npy_longdouble, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_longdouble pointer
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+LONGDOUBLE_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_longdouble *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_longdouble converted = (npy_longdouble)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_datetime values to npy_longdouble, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_datetime pointer
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+DATETIME_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_datetime *src = input;
+    npy_longdouble *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_longdouble converted = (npy_longdouble)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_timedelta values to npy_longdouble, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_timedelta pointer
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+TIMEDELTA_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_timedelta *src = input;
+    npy_longdouble *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_longdouble converted = (npy_longdouble)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+
+#line 1339
+/*
+ * For each of n consecutive npy_float pairs, cast the first member
+ * to npy_longdouble and skip the second member.
+ *
+ * input   source buffer, viewed as pairs via a const npy_float pointer
+ *         (presumably real/imaginary parts of complex values - confirm)
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of pairs to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled here; each kept value goes
+ * through the plain C conversion.
+ */
+static void
+CFLOAT_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *src = input;
+    npy_longdouble *dst = output;
+
+    /* Step the source two scalars at a time, the destination one. */
+    for (; n != 0; n--, src += 2) {
+        /* Convert the leading scalar of the current pair. */
+        const npy_longdouble converted = (npy_longdouble)*src;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1339
+/*
+ * For each of n consecutive npy_double pairs, cast the first member
+ * to npy_longdouble and skip the second member.
+ *
+ * input   source buffer, viewed as pairs via a const npy_double pointer
+ *         (presumably real/imaginary parts of complex values - confirm)
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of pairs to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled here; each kept value goes
+ * through the plain C conversion.
+ */
+static void
+CDOUBLE_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *src = input;
+    npy_longdouble *dst = output;
+
+    /* Step the source two scalars at a time, the destination one. */
+    for (; n != 0; n--, src += 2) {
+        /* Convert the leading scalar of the current pair. */
+        const npy_longdouble converted = (npy_longdouble)*src;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1339
+/*
+ * For each of n consecutive npy_longdouble pairs, cast the first member
+ * to npy_longdouble and skip the second member.
+ *
+ * input   source buffer, viewed as pairs via a const npy_longdouble pointer
+ *         (presumably real/imaginary parts of complex values - confirm)
+ * output  destination buffer, written through a npy_longdouble pointer
+ * n       number of pairs to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled here; each kept value goes
+ * through the plain C conversion.
+ */
+static void
+CLONGDOUBLE_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *src = input;
+    npy_longdouble *dst = output;
+
+    /* Step the source two scalars at a time, the destination one. */
+    for (; n != 0; n--, src += 2) {
+        /* Convert the leading scalar of the current pair. */
+        const npy_longdouble converted = (npy_longdouble)*src;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+
+
+#line 1292
+
+#line 1304
+/*
+ * Cast n consecutive npy_byte values to npy_datetime, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_byte pointer
+ * output  destination buffer, written through a npy_datetime pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+BYTE_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_byte *src = input;
+    npy_datetime *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_datetime converted = (npy_datetime)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_ubyte values to npy_datetime, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_ubyte pointer
+ * output  destination buffer, written through a npy_datetime pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+UBYTE_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ubyte *src = input;
+    npy_datetime *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_datetime converted = (npy_datetime)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_short values to npy_datetime, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_short pointer
+ * output  destination buffer, written through a npy_datetime pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+SHORT_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_short *src = input;
+    npy_datetime *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_datetime converted = (npy_datetime)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_ushort values to npy_datetime, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_ushort pointer
+ * output  destination buffer, written through a npy_datetime pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+USHORT_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_ushort *src = input;
+    npy_datetime *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_datetime converted = (npy_datetime)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_int values to npy_datetime, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_int pointer
+ * output  destination buffer, written through a npy_datetime pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+INT_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_int *src = input;
+    npy_datetime *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_datetime converted = (npy_datetime)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+/*
+ * Cast n consecutive npy_uint values to npy_datetime, one element
+ * at a time.
+ *
+ * input   source buffer, read through a const npy_uint pointer
+ * output  destination buffer, written through a npy_datetime pointer
+ * n       number of elements to convert
+ * aip/aop unused (declared via NPY_UNUSED)
+ *
+ * No NaN -> NaT special case is compiled for this type pair; every
+ * element goes through the plain C conversion.
+ */
+static void
+UINT_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_uint *src = input;
+    npy_datetime *dst = output;
+
+    /* Advance both cursors together until the count is exhausted. */
+    for (; n != 0; n--) {
+        /* Read one source element and convert it. */
+        const npy_datetime converted = (npy_datetime)*src++;
+
+        *dst = converted;
+        dst++;
+    }
+}
+
+#line 1304
+static void
+LONG_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Store each of the n npy_long values read from input into output
+     * as npy_datetime. The NaN -> NaT branch is preprocessed out for
+     * this source type, leaving a plain C cast per element.
+     */
+    const npy_long *src = input;
+    npy_datetime *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_datetime)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+#line 1304
+static void
+ULONG_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Store each of the n npy_ulong values read from input into output
+     * as npy_datetime. The NaN -> NaT branch is preprocessed out for
+     * this source type, leaving a plain C cast per element.
+     */
+    const npy_ulong *src = input;
+    npy_datetime *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_datetime)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+#line 1304
+static void
+LONGLONG_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Store each of the n npy_longlong values read from input into output
+     * as npy_datetime. The NaN -> NaT branch is preprocessed out for
+     * this source type, leaving a plain C cast per element.
+     */
+    const npy_longlong *src = input;
+    npy_datetime *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_datetime)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+#line 1304
+static void
+ULONGLONG_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Store each of the n npy_ulonglong values read from input into output
+     * as npy_datetime. The NaN -> NaT branch is preprocessed out for
+     * this source type, leaving a plain C cast per element.
+     */
+    const npy_ulonglong *src = input;
+    npy_datetime *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_datetime)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+#line 1304
+/*
+ * Cast n npy_float values from input to npy_datetime in output.
+ * NaN is stored as NPY_DATETIME_NAT; all other values use a plain C cast.
+ * The aip/aop arguments are unused.
+ */
+static void
+FLOAT_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *ip = input;
+    npy_datetime *op = output;
+
+    while (n--) {
+#if 1 && 1
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_float f = *ip++;
+        npy_datetime t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_datetime)NPY_DATETIME_NAT;
+        }
+        else {
+            /*
+             * NOTE(review): only NaN is filtered above; infinities and
+             * out-of-range values reach this cast unchanged.
+             */
+            t = (npy_datetime)f;
+        }
+#else
+        /* Compiled out: unguarded cast used for non-floating sources. */
+        npy_datetime t = (npy_datetime)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+/*
+ * Cast n npy_double values from input to npy_datetime in output.
+ * NaN is stored as NPY_DATETIME_NAT; all other values use a plain C cast.
+ * The aip/aop arguments are unused.
+ */
+static void
+DOUBLE_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *ip = input;
+    npy_datetime *op = output;
+
+    while (n--) {
+#if 1 && 1
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_double f = *ip++;
+        npy_datetime t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_datetime)NPY_DATETIME_NAT;
+        }
+        else {
+            /*
+             * NOTE(review): only NaN is filtered above; infinities and
+             * out-of-range values reach this cast unchanged.
+             */
+            t = (npy_datetime)f;
+        }
+#else
+        /* Compiled out: unguarded cast used for non-floating sources. */
+        npy_datetime t = (npy_datetime)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+/*
+ * Cast n npy_longdouble values from input to npy_datetime in output.
+ * NaN is stored as NPY_DATETIME_NAT; all other values use a plain C cast.
+ * The aip/aop arguments are unused.
+ */
+static void
+LONGDOUBLE_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *ip = input;
+    npy_datetime *op = output;
+
+    while (n--) {
+#if 1 && 1
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_longdouble f = *ip++;
+        npy_datetime t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_datetime)NPY_DATETIME_NAT;
+        }
+        else {
+            /*
+             * NOTE(review): only NaN is filtered above; infinities and
+             * out-of-range values reach this cast unchanged.
+             */
+            t = (npy_datetime)f;
+        }
+#else
+        /* Compiled out: unguarded cast used for non-floating sources. */
+        npy_datetime t = (npy_datetime)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+DATETIME_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Copy n npy_datetime values from input to output one element at a
+     * time. The NaN -> NaT branch is preprocessed out for this source
+     * type, so every value is passed through unchanged.
+     */
+    const npy_datetime *src = input;
+    npy_datetime *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_datetime)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+#line 1304
+static void
+TIMEDELTA_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Store each of the n npy_timedelta values read from input into output
+     * as npy_datetime. The NaN -> NaT branch is preprocessed out for
+     * this source type, leaving a plain C cast per element.
+     */
+    const npy_timedelta *src = input;
+    npy_datetime *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_datetime)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+
+#line 1339
+/*
+ * Cast n elements from input to npy_datetime in output. The input is read
+ * as npy_float pairs: only the first value of each pair is converted
+ * (presumably the real component) and the second is skipped.
+ * NaN is stored as NPY_DATETIME_NAT. The aip/aop arguments are unused.
+ */
+static void
+CFLOAT_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *ip = input;
+    npy_datetime *op = output;
+
+    while (n--) {
+#if 1
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_float f = *ip;
+        npy_datetime t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_datetime)NPY_DATETIME_NAT;
+        }
+        else {
+            /*
+             * NOTE(review): only NaN is filtered above; infinities and
+             * out-of-range values reach this cast unchanged.
+             */
+            t = (npy_datetime)f;
+        }
+#else
+        /* Compiled out: unguarded variant of the cast. */
+        npy_datetime t = (npy_datetime)*ip;
+#endif
+        *op++ = t;
+        /* Step over both values of the pair. */
+        ip += 2;
+    }
+}
+
+#line 1339
+/*
+ * Cast n elements from input to npy_datetime in output. The input is read
+ * as npy_double pairs: only the first value of each pair is converted
+ * (presumably the real component) and the second is skipped.
+ * NaN is stored as NPY_DATETIME_NAT. The aip/aop arguments are unused.
+ */
+static void
+CDOUBLE_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *ip = input;
+    npy_datetime *op = output;
+
+    while (n--) {
+#if 1
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_double f = *ip;
+        npy_datetime t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_datetime)NPY_DATETIME_NAT;
+        }
+        else {
+            /*
+             * NOTE(review): only NaN is filtered above; infinities and
+             * out-of-range values reach this cast unchanged.
+             */
+            t = (npy_datetime)f;
+        }
+#else
+        /* Compiled out: unguarded variant of the cast. */
+        npy_datetime t = (npy_datetime)*ip;
+#endif
+        *op++ = t;
+        /* Step over both values of the pair. */
+        ip += 2;
+    }
+}
+
+#line 1339
+/*
+ * Cast n elements from input to npy_datetime in output. The input is read
+ * as npy_longdouble pairs: only the first value of each pair is converted
+ * (presumably the real component) and the second is skipped.
+ * NaN is stored as NPY_DATETIME_NAT. The aip/aop arguments are unused.
+ */
+static void
+CLONGDOUBLE_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *ip = input;
+    npy_datetime *op = output;
+
+    while (n--) {
+#if 1
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_longdouble f = *ip;
+        npy_datetime t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_datetime)NPY_DATETIME_NAT;
+        }
+        else {
+            /*
+             * NOTE(review): only NaN is filtered above; infinities and
+             * out-of-range values reach this cast unchanged.
+             */
+            t = (npy_datetime)f;
+        }
+#else
+        /* Compiled out: unguarded variant of the cast. */
+        npy_datetime t = (npy_datetime)*ip;
+#endif
+        *op++ = t;
+        /* Step over both values of the pair. */
+        ip += 2;
+    }
+}
+
+
+
+#line 1292
+
+#line 1304
+static void
+BYTE_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Store each of the n npy_byte values read from input into output
+     * as npy_timedelta. The NaN -> NaT branch is preprocessed out for
+     * this source type, leaving a plain C cast per element.
+     */
+    const npy_byte *src = input;
+    npy_timedelta *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_timedelta)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+#line 1304
+static void
+UBYTE_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Store each of the n npy_ubyte values read from input into output
+     * as npy_timedelta. The NaN -> NaT branch is preprocessed out for
+     * this source type, leaving a plain C cast per element.
+     */
+    const npy_ubyte *src = input;
+    npy_timedelta *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_timedelta)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+#line 1304
+static void
+SHORT_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Store each of the n npy_short values read from input into output
+     * as npy_timedelta. The NaN -> NaT branch is preprocessed out for
+     * this source type, leaving a plain C cast per element.
+     */
+    const npy_short *src = input;
+    npy_timedelta *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_timedelta)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+#line 1304
+static void
+USHORT_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Store each of the n npy_ushort values read from input into output
+     * as npy_timedelta. The NaN -> NaT branch is preprocessed out for
+     * this source type, leaving a plain C cast per element.
+     */
+    const npy_ushort *src = input;
+    npy_timedelta *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_timedelta)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+#line 1304
+static void
+INT_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Store each of the n npy_int values read from input into output
+     * as npy_timedelta. The NaN -> NaT branch is preprocessed out for
+     * this source type, leaving a plain C cast per element.
+     */
+    const npy_int *src = input;
+    npy_timedelta *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_timedelta)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+#line 1304
+static void
+UINT_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Store each of the n npy_uint values read from input into output
+     * as npy_timedelta. The NaN -> NaT branch is preprocessed out for
+     * this source type, leaving a plain C cast per element.
+     */
+    const npy_uint *src = input;
+    npy_timedelta *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_timedelta)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+#line 1304
+static void
+LONG_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Store each of the n npy_long values read from input into output
+     * as npy_timedelta. The NaN -> NaT branch is preprocessed out for
+     * this source type, leaving a plain C cast per element.
+     */
+    const npy_long *src = input;
+    npy_timedelta *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_timedelta)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+#line 1304
+static void
+ULONG_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Store each of the n npy_ulong values read from input into output
+     * as npy_timedelta. The NaN -> NaT branch is preprocessed out for
+     * this source type, leaving a plain C cast per element.
+     */
+    const npy_ulong *src = input;
+    npy_timedelta *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_timedelta)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+#line 1304
+static void
+LONGLONG_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Store each of the n npy_longlong values read from input into output
+     * as npy_timedelta. The NaN -> NaT branch is preprocessed out for
+     * this source type, leaving a plain C cast per element.
+     */
+    const npy_longlong *src = input;
+    npy_timedelta *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_timedelta)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+#line 1304
+static void
+ULONGLONG_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Store each of the n npy_ulonglong values read from input into output
+     * as npy_timedelta. The NaN -> NaT branch is preprocessed out for
+     * this source type, leaving a plain C cast per element.
+     */
+    const npy_ulonglong *src = input;
+    npy_timedelta *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_timedelta)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+#line 1304
+/*
+ * Cast n npy_float values from input to npy_timedelta in output.
+ * NaN is stored as NPY_DATETIME_NAT; all other values use a plain C cast.
+ * The aip/aop arguments are unused.
+ */
+static void
+FLOAT_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *ip = input;
+    npy_timedelta *op = output;
+
+    while (n--) {
+#if 1 && 1
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_float f = *ip++;
+        npy_timedelta t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_timedelta)NPY_DATETIME_NAT;
+        }
+        else {
+            /*
+             * NOTE(review): only NaN is filtered above; infinities and
+             * out-of-range values reach this cast unchanged.
+             */
+            t = (npy_timedelta)f;
+        }
+#else
+        /* Compiled out: unguarded cast used for non-floating sources. */
+        npy_timedelta t = (npy_timedelta)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+/*
+ * Cast n npy_double values from input to npy_timedelta in output.
+ * NaN is stored as NPY_DATETIME_NAT; all other values use a plain C cast.
+ * The aip/aop arguments are unused.
+ */
+static void
+DOUBLE_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *ip = input;
+    npy_timedelta *op = output;
+
+    while (n--) {
+#if 1 && 1
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_double f = *ip++;
+        npy_timedelta t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_timedelta)NPY_DATETIME_NAT;
+        }
+        else {
+            /*
+             * NOTE(review): only NaN is filtered above; infinities and
+             * out-of-range values reach this cast unchanged.
+             */
+            t = (npy_timedelta)f;
+        }
+#else
+        /* Compiled out: unguarded cast used for non-floating sources. */
+        npy_timedelta t = (npy_timedelta)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+/*
+ * Cast n npy_longdouble values from input to npy_timedelta in output.
+ * NaN is stored as NPY_DATETIME_NAT; all other values use a plain C cast.
+ * The aip/aop arguments are unused.
+ */
+static void
+LONGDOUBLE_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *ip = input;
+    npy_timedelta *op = output;
+
+    while (n--) {
+#if 1 && 1
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_longdouble f = *ip++;
+        npy_timedelta t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_timedelta)NPY_DATETIME_NAT;
+        }
+        else {
+            /*
+             * NOTE(review): only NaN is filtered above; infinities and
+             * out-of-range values reach this cast unchanged.
+             */
+            t = (npy_timedelta)f;
+        }
+#else
+        /* Compiled out: unguarded cast used for non-floating sources. */
+        npy_timedelta t = (npy_timedelta)*ip++;
+#endif
+        *op++ = t;
+    }
+}
+
+#line 1304
+static void
+DATETIME_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Store each of the n npy_datetime values read from input into output
+     * as npy_timedelta. The NaN -> NaT branch is preprocessed out for
+     * this source type, leaving a plain C cast per element.
+     */
+    const npy_datetime *src = input;
+    npy_timedelta *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_timedelta)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+#line 1304
+static void
+TIMEDELTA_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Copy n npy_timedelta values from input to output one element at a
+     * time. The NaN -> NaT branch is preprocessed out for this source
+     * type, so every value is passed through unchanged.
+     */
+    const npy_timedelta *src = input;
+    npy_timedelta *dst = output;
+
+    for (; n != 0; --n) {
+        *dst = (npy_timedelta)*src;
+        ++src;
+        ++dst;
+    }
+}
+
+
+#line 1339
+/*
+ * Cast n elements from input to npy_timedelta in output. The input is read
+ * as npy_float pairs: only the first value of each pair is converted
+ * (presumably the real component) and the second is skipped.
+ * NaN is stored as NPY_DATETIME_NAT. The aip/aop arguments are unused.
+ */
+static void
+CFLOAT_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_float *ip = input;
+    npy_timedelta *op = output;
+
+    while (n--) {
+#if 1
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_float f = *ip;
+        npy_timedelta t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_timedelta)NPY_DATETIME_NAT;
+        }
+        else {
+            /*
+             * NOTE(review): only NaN is filtered above; infinities and
+             * out-of-range values reach this cast unchanged.
+             */
+            t = (npy_timedelta)f;
+        }
+#else
+        /* Compiled out: unguarded variant of the cast. */
+        npy_timedelta t = (npy_timedelta)*ip;
+#endif
+        *op++ = t;
+        /* Step over both values of the pair. */
+        ip += 2;
+    }
+}
+
+#line 1339
+/*
+ * Cast n elements from input to npy_timedelta in output. The input is read
+ * as npy_double pairs: only the first value of each pair is converted
+ * (presumably the real component) and the second is skipped.
+ * NaN is stored as NPY_DATETIME_NAT. The aip/aop arguments are unused.
+ */
+static void
+CDOUBLE_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_double *ip = input;
+    npy_timedelta *op = output;
+
+    while (n--) {
+#if 1
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_double f = *ip;
+        npy_timedelta t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_timedelta)NPY_DATETIME_NAT;
+        }
+        else {
+            /*
+             * NOTE(review): only NaN is filtered above; infinities and
+             * out-of-range values reach this cast unchanged.
+             */
+            t = (npy_timedelta)f;
+        }
+#else
+        /* Compiled out: unguarded variant of the cast. */
+        npy_timedelta t = (npy_timedelta)*ip;
+#endif
+        *op++ = t;
+        /* Step over both values of the pair. */
+        ip += 2;
+    }
+}
+
+#line 1339
+/*
+ * Cast n elements from input to npy_timedelta in output. The input is read
+ * as npy_longdouble pairs: only the first value of each pair is converted
+ * (presumably the real component) and the second is skipped.
+ * NaN is stored as NPY_DATETIME_NAT. The aip/aop arguments are unused.
+ */
+static void
+CLONGDOUBLE_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *ip = input;
+    npy_timedelta *op = output;
+
+    while (n--) {
+#if 1
+        /*
+         * volatile works around clang (and gcc sometimes) not branching
+         * correctly, leading to floating point errors in the test suite.
+         */
+        volatile npy_longdouble f = *ip;
+        npy_timedelta t;
+        /* Avoid undefined behaviour and warning for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_timedelta)NPY_DATETIME_NAT;
+        }
+        else {
+            /*
+             * NOTE(review): only NaN is filtered above; infinities and
+             * out-of-range values reach this cast unchanged.
+             */
+            t = (npy_timedelta)f;
+        }
+#else
+        /* Compiled out: unguarded variant of the cast. */
+        npy_timedelta t = (npy_timedelta)*ip;
+#endif
+        *op++ = t;
+        /* Step over both values of the pair. */
+        ip += 2;
+    }
+}
+
+
+
+
+
+#line 1383
+
+static void
+BYTE_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Convert n npy_byte values to npy_half, going through float. */
+    const npy_byte *src = input;
+    npy_half *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = (float)*src;
+        *dst = npy_float_to_half(as_float);
+        ++src;
+        ++dst;
+    }
+}
+
+static void
+HALF_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Expand n npy_half values to float, then cast each to npy_byte. */
+    const npy_half *src = input;
+    npy_byte *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = npy_half_to_float(*src);
+        *dst = (npy_byte)as_float;
+        ++src;
+        ++dst;
+    }
+}
+
+
+#line 1383
+
+static void
+UBYTE_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Convert n npy_ubyte values to npy_half, going through float. */
+    const npy_ubyte *src = input;
+    npy_half *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = (float)*src;
+        *dst = npy_float_to_half(as_float);
+        ++src;
+        ++dst;
+    }
+}
+
+static void
+HALF_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Expand n npy_half values to float, then cast each to npy_ubyte. */
+    const npy_half *src = input;
+    npy_ubyte *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = npy_half_to_float(*src);
+        *dst = (npy_ubyte)as_float;
+        ++src;
+        ++dst;
+    }
+}
+
+
+#line 1383
+
+static void
+SHORT_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Convert n npy_short values to npy_half, going through float. */
+    const npy_short *src = input;
+    npy_half *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = (float)*src;
+        *dst = npy_float_to_half(as_float);
+        ++src;
+        ++dst;
+    }
+}
+
+static void
+HALF_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Expand n npy_half values to float, then cast each to npy_short. */
+    const npy_half *src = input;
+    npy_short *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = npy_half_to_float(*src);
+        *dst = (npy_short)as_float;
+        ++src;
+        ++dst;
+    }
+}
+
+
+#line 1383
+
+static void
+USHORT_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Convert n npy_ushort values to npy_half, going through float. */
+    const npy_ushort *src = input;
+    npy_half *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = (float)*src;
+        *dst = npy_float_to_half(as_float);
+        ++src;
+        ++dst;
+    }
+}
+
+static void
+HALF_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Expand n npy_half values to float, then cast each to npy_ushort. */
+    const npy_half *src = input;
+    npy_ushort *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = npy_half_to_float(*src);
+        *dst = (npy_ushort)as_float;
+        ++src;
+        ++dst;
+    }
+}
+
+
+#line 1383
+
+static void
+INT_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Convert n npy_int values to npy_half, going through float. */
+    const npy_int *src = input;
+    npy_half *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = (float)*src;
+        *dst = npy_float_to_half(as_float);
+        ++src;
+        ++dst;
+    }
+}
+
+static void
+HALF_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Expand n npy_half values to float, then cast each to npy_int. */
+    const npy_half *src = input;
+    npy_int *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = npy_half_to_float(*src);
+        *dst = (npy_int)as_float;
+        ++src;
+        ++dst;
+    }
+}
+
+
+#line 1383
+
+static void
+UINT_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Convert n npy_uint values to npy_half, going through float. */
+    const npy_uint *src = input;
+    npy_half *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = (float)*src;
+        *dst = npy_float_to_half(as_float);
+        ++src;
+        ++dst;
+    }
+}
+
+static void
+HALF_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Expand n npy_half values to float, then cast each to npy_uint. */
+    const npy_half *src = input;
+    npy_uint *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = npy_half_to_float(*src);
+        *dst = (npy_uint)as_float;
+        ++src;
+        ++dst;
+    }
+}
+
+
+#line 1383
+
+static void
+LONG_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Convert n npy_long values to npy_half, going through float. */
+    const npy_long *src = input;
+    npy_half *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = (float)*src;
+        *dst = npy_float_to_half(as_float);
+        ++src;
+        ++dst;
+    }
+}
+
+static void
+HALF_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Expand n npy_half values to float, then cast each to npy_long. */
+    const npy_half *src = input;
+    npy_long *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = npy_half_to_float(*src);
+        *dst = (npy_long)as_float;
+        ++src;
+        ++dst;
+    }
+}
+
+
+#line 1383
+
+static void
+ULONG_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Convert n npy_ulong values to npy_half, going through float. */
+    const npy_ulong *src = input;
+    npy_half *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = (float)*src;
+        *dst = npy_float_to_half(as_float);
+        ++src;
+        ++dst;
+    }
+}
+
+static void
+HALF_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Expand n npy_half values to float, then cast each to npy_ulong. */
+    const npy_half *src = input;
+    npy_ulong *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = npy_half_to_float(*src);
+        *dst = (npy_ulong)as_float;
+        ++src;
+        ++dst;
+    }
+}
+
+
+#line 1383
+
+static void
+LONGLONG_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Convert n npy_longlong values to npy_half, going through float. */
+    const npy_longlong *src = input;
+    npy_half *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = (float)*src;
+        *dst = npy_float_to_half(as_float);
+        ++src;
+        ++dst;
+    }
+}
+
+static void
+HALF_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Expand n npy_half values to float, then cast each to npy_longlong. */
+    const npy_half *src = input;
+    npy_longlong *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = npy_half_to_float(*src);
+        *dst = (npy_longlong)as_float;
+        ++src;
+        ++dst;
+    }
+}
+
+
+#line 1383
+
+static void
+ULONGLONG_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Convert n npy_ulonglong values to npy_half, going through float. */
+    const npy_ulonglong *src = input;
+    npy_half *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = (float)*src;
+        *dst = npy_float_to_half(as_float);
+        ++src;
+        ++dst;
+    }
+}
+
+static void
+HALF_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Expand n npy_half values to float, then cast each to npy_ulonglong. */
+    const npy_half *src = input;
+    npy_ulonglong *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = npy_half_to_float(*src);
+        *dst = (npy_ulonglong)as_float;
+        ++src;
+        ++dst;
+    }
+}
+
+
+#line 1383
+
+static void
+LONGDOUBLE_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Convert n npy_longdouble values to npy_half. Each value is first
+     * narrowed to float and that float is then converted to half.
+     */
+    const npy_longdouble *src = input;
+    npy_half *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = (float)*src;
+        *dst = npy_float_to_half(as_float);
+        ++src;
+        ++dst;
+    }
+}
+
+static void
+HALF_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Expand n npy_half values to float, then widen each to npy_longdouble. */
+    const npy_half *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = npy_half_to_float(*src);
+        *dst = (npy_longdouble)as_float;
+        ++src;
+        ++dst;
+    }
+}
+
+
+#line 1383
+
+static void
+DATETIME_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Convert n npy_datetime values to npy_half, going through float.
+     * No value is treated specially: every element takes the same path.
+     */
+    const npy_datetime *src = input;
+    npy_half *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = (float)*src;
+        *dst = npy_float_to_half(as_float);
+        ++src;
+        ++dst;
+    }
+}
+
+/*
+ * Cast n npy_half values from input to npy_datetime in output.
+ * Each half is expanded to float first. A NaN is stored as
+ * NPY_DATETIME_NAT, matching FLOAT_to_DATETIME and DOUBLE_to_DATETIME,
+ * instead of being passed to an integer cast (undefined behaviour in C).
+ * The aip/aop arguments are unused.
+ */
+static void
+HALF_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_half *ip = input;
+    npy_datetime *op = output;
+
+    while (n--) {
+        /*
+         * volatile mirrors the FLOAT/DOUBLE -> DATETIME loops, which use it
+         * to keep the compiler from evaluating the cast ahead of the test.
+         */
+        volatile float f = npy_half_to_float(*ip++);
+        npy_datetime t;
+        /* Avoid undefined behaviour for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_datetime)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_datetime)f;
+        }
+        *op++ = t;
+    }
+}
+
+
+#line 1383
+
+static void
+TIMEDELTA_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /*
+     * Convert n npy_timedelta values to npy_half, going through float.
+     * No value is treated specially: every element takes the same path.
+     */
+    const npy_timedelta *src = input;
+    npy_half *dst = output;
+
+    for (; n != 0; --n) {
+        const float as_float = (float)*src;
+        *dst = npy_float_to_half(as_float);
+        ++src;
+        ++dst;
+    }
+}
+
+/*
+ * Cast n npy_half values from input to npy_timedelta in output.
+ * Each half is expanded to float first. A NaN is stored as
+ * NPY_DATETIME_NAT, matching FLOAT_to_TIMEDELTA and DOUBLE_to_TIMEDELTA,
+ * instead of being passed to an integer cast (undefined behaviour in C).
+ * The aip/aop arguments are unused.
+ */
+static void
+HALF_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_half *ip = input;
+    npy_timedelta *op = output;
+
+    while (n--) {
+        /*
+         * volatile mirrors the FLOAT/DOUBLE -> TIMEDELTA loops, which use it
+         * to keep the compiler from evaluating the cast ahead of the test.
+         */
+        volatile float f = npy_half_to_float(*ip++);
+        npy_timedelta t;
+        /* Avoid undefined behaviour for NaN -> NaT */
+        if (npy_isnan(f)) {
+            t = (npy_timedelta)NPY_DATETIME_NAT;
+        }
+        else {
+            t = (npy_timedelta)f;
+        }
+        *op++ = t;
+    }
+}
+
+
+/*
+ * Alias HALF_to_HALF to the same-type cast routine of a 2-byte integer
+ * type: SHORT_to_SHORT when short is 2 bytes, otherwise INT_to_INT when
+ * int is 2 bytes.
+ * NOTE(review): no alias is defined when neither type is 2 bytes wide.
+ */
+#if NPY_SIZEOF_SHORT == 2
+#define HALF_to_HALF SHORT_to_SHORT
+#elif NPY_SIZEOF_INT == 2
+#define HALF_to_HALF INT_to_INT
+#endif
+
+#line 1422
+
+/*
+ * Legacy cast loop: convert n float values to half, working on the raw
+ * 32-bit patterns (the input is read as npy_uint32 words).
+ */
+static void
+FLOAT_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_uint32 *src = input;
+    npy_half *dst = output;
+    npy_intp i;
+
+    for (i = 0; i < n; i++) {
+        dst[i] = npy_floatbits_to_halfbits(src[i]);
+    }
+}
+
+/*
+ * Legacy cast loop: expand n half values to float, storing the 32-bit
+ * patterns returned by npy_halfbits_to_floatbits().
+ */
+static void
+HALF_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_half *src = input;
+    npy_uint32 *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = npy_halfbits_to_floatbits(*src);
+    }
+}
+
+
+#line 1422
+
+/*
+ * Legacy cast loop: convert n double values to half, working on the raw
+ * 64-bit patterns (the input is read as npy_uint64 words).
+ */
+static void
+DOUBLE_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_uint64 *src = input;
+    npy_half *dst = output;
+    npy_intp i;
+
+    for (i = 0; i < n; i++) {
+        dst[i] = npy_doublebits_to_halfbits(src[i]);
+    }
+}
+
+/*
+ * Legacy cast loop: expand n half values to double, storing the 64-bit
+ * patterns returned by npy_halfbits_to_doublebits().
+ */
+static void
+HALF_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_half *src = input;
+    npy_uint64 *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = npy_halfbits_to_doublebits(*src);
+    }
+}
+
+
+#line 1422
+
+/*
+ * Legacy cast loop: each input element spans two 32-bit words; only the
+ * first word of every pair is converted to half, the second is skipped.
+ */
+static void
+CFLOAT_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_uint32 *src = input;
+    npy_half *dst = output;
+    npy_intp i;
+
+    for (i = 0; i < n; i++) {
+        dst[i] = npy_floatbits_to_halfbits(src[2 * i]);
+    }
+}
+
+/*
+ * Legacy cast loop: each half becomes a (float bit pattern, 0) word pair.
+ */
+static void
+HALF_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_half *src = input;
+    npy_uint32 *dst = output;
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = npy_halfbits_to_floatbits(*src);
+        dst[1] = 0;
+    }
+}
+
+
+#line 1422
+
+/*
+ * Legacy cast loop: each input element spans two 64-bit words; only the
+ * first word of every pair is converted to half, the second is skipped.
+ */
+static void
+CDOUBLE_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_uint64 *src = input;
+    npy_half *dst = output;
+    npy_intp i;
+
+    for (i = 0; i < n; i++) {
+        dst[i] = npy_doublebits_to_halfbits(src[2 * i]);
+    }
+}
+
+/*
+ * Legacy cast loop: each half becomes a (double bit pattern, 0) word pair.
+ */
+static void
+HALF_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_half *src = input;
+    npy_uint64 *dst = output;
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = npy_halfbits_to_doublebits(*src);
+        dst[1] = 0;
+    }
+}
+
+
+
+static void
+CLONGDOUBLE_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    const npy_longdouble *ip = input;
+    npy_half *op = output;
+    /* Convert the first of each pair; step 2 long doubles per element. */
+    while (n--) {
+        *op++ = npy_double_to_half((double) (*ip));
+        ip += 2;
+    }
+}
+
+static void
+HALF_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (half converted via double, 0). */
+    const npy_half *src = input;
+    npy_longdouble *dst = output;
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = npy_half_to_double(*src);
+        dst[1] = 0;
+    }
+}
+
+#line 1496
+static void
+BOOL_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is 1 where the input differs from NPY_FALSE, otherwise 0. */
+    const npy_bool *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(*src != NPY_FALSE);
+    }
+}
+
+#line 1496
+static void
+BYTE_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is 1 where the input differs from NPY_FALSE, otherwise 0. */
+    const npy_byte *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(*src != NPY_FALSE);
+    }
+}
+
+#line 1496
+static void
+UBYTE_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is 1 where the input differs from NPY_FALSE, otherwise 0. */
+    const npy_ubyte *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(*src != NPY_FALSE);
+    }
+}
+
+#line 1496
+static void
+SHORT_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is 1 where the input differs from NPY_FALSE, otherwise 0. */
+    const npy_short *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(*src != NPY_FALSE);
+    }
+}
+
+#line 1496
+static void
+USHORT_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is 1 where the input differs from NPY_FALSE, otherwise 0. */
+    const npy_ushort *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(*src != NPY_FALSE);
+    }
+}
+
+#line 1496
+static void
+INT_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is 1 where the input differs from NPY_FALSE, otherwise 0. */
+    const npy_int *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(*src != NPY_FALSE);
+    }
+}
+
+#line 1496
+static void
+UINT_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is 1 where the input differs from NPY_FALSE, otherwise 0. */
+    const npy_uint *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(*src != NPY_FALSE);
+    }
+}
+
+#line 1496
+static void
+LONG_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is 1 where the input differs from NPY_FALSE, otherwise 0. */
+    const npy_long *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(*src != NPY_FALSE);
+    }
+}
+
+#line 1496
+static void
+ULONG_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is 1 where the input differs from NPY_FALSE, otherwise 0. */
+    const npy_ulong *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(*src != NPY_FALSE);
+    }
+}
+
+#line 1496
+static void
+LONGLONG_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is 1 where the input differs from NPY_FALSE, otherwise 0. */
+    const npy_longlong *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(*src != NPY_FALSE);
+    }
+}
+
+#line 1496
+static void
+ULONGLONG_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is 1 where the input differs from NPY_FALSE, otherwise 0. */
+    const npy_ulonglong *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(*src != NPY_FALSE);
+    }
+}
+
+#line 1496
+static void
+FLOAT_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is 1 where the input differs from NPY_FALSE, otherwise 0. */
+    const npy_float *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(*src != NPY_FALSE);
+    }
+}
+
+#line 1496
+static void
+DOUBLE_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is 1 where the input differs from NPY_FALSE, otherwise 0. */
+    const npy_double *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(*src != NPY_FALSE);
+    }
+}
+
+#line 1496
+static void
+LONGDOUBLE_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is 1 where the input differs from NPY_FALSE, otherwise 0. */
+    const npy_longdouble *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(*src != NPY_FALSE);
+    }
+}
+
+#line 1496
+static void
+DATETIME_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is 1 where the input differs from NPY_FALSE, otherwise 0. */
+    const npy_datetime *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(*src != NPY_FALSE);
+    }
+}
+
+#line 1496
+static void
+TIMEDELTA_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is 1 where the input differs from NPY_FALSE, otherwise 0. */
+    const npy_timedelta *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(*src != NPY_FALSE);
+    }
+}
+
+
+static void
+HALF_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Output is the logical negation of npy_half_iszero() per element. */
+    const npy_half *src = input;
+    npy_bool *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_bool)(!npy_half_iszero(*src));
+    }
+}
+
+#line 1526
+static void
+CFLOAT_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* 1 when the real or the imaginary member differs from NPY_FALSE. */
+    const npy_cfloat *src = input;
+    npy_bool *dst = output;
+    npy_intp i;
+
+    for (i = 0; i < n; i++) {
+        const int re_set = (src[i].real != NPY_FALSE);
+        dst[i] = (npy_bool)(re_set || (src[i].imag != NPY_FALSE));
+    }
+}
+
+#line 1526
+static void
+CDOUBLE_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* 1 when the real or the imaginary member differs from NPY_FALSE. */
+    const npy_cdouble *src = input;
+    npy_bool *dst = output;
+    npy_intp i;
+
+    for (i = 0; i < n; i++) {
+        const int re_set = (src[i].real != NPY_FALSE);
+        dst[i] = (npy_bool)(re_set || (src[i].imag != NPY_FALSE));
+    }
+}
+
+#line 1526
+static void
+CLONGDOUBLE_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* 1 when the real or the imaginary member differs from NPY_FALSE. */
+    const npy_clongdouble *src = input;
+    npy_bool *dst = output;
+    npy_intp i;
+
+    for (i = 0; i < n; i++) {
+        const int re_set = (src[i].real != NPY_FALSE);
+        dst[i] = (npy_bool)(re_set || (src[i].imag != NPY_FALSE));
+    }
+}
+
+
+#line 1554
+static void
+BOOL_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Writes 1 when the stored bool differs from NPY_FALSE, otherwise 0. */
+    const npy_bool *src = input;
+    npy_byte *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_byte)(*src != NPY_FALSE);
+    }
+}
+
+#line 1554
+static void
+BOOL_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Writes 1 when the stored bool differs from NPY_FALSE, otherwise 0. */
+    const npy_bool *src = input;
+    npy_ubyte *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_ubyte)(*src != NPY_FALSE);
+    }
+}
+
+#line 1554
+static void
+BOOL_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Writes 1 when the stored bool differs from NPY_FALSE, otherwise 0. */
+    const npy_bool *src = input;
+    npy_short *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_short)(*src != NPY_FALSE);
+    }
+}
+
+#line 1554
+static void
+BOOL_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Writes 1 when the stored bool differs from NPY_FALSE, otherwise 0. */
+    const npy_bool *src = input;
+    npy_ushort *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_ushort)(*src != NPY_FALSE);
+    }
+}
+
+#line 1554
+static void
+BOOL_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Writes 1 when the stored bool differs from NPY_FALSE, otherwise 0. */
+    const npy_bool *src = input;
+    npy_int *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_int)(*src != NPY_FALSE);
+    }
+}
+
+#line 1554
+static void
+BOOL_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Writes 1 when the stored bool differs from NPY_FALSE, otherwise 0. */
+    const npy_bool *src = input;
+    npy_uint *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_uint)(*src != NPY_FALSE);
+    }
+}
+
+#line 1554
+static void
+BOOL_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Writes 1 when the stored bool differs from NPY_FALSE, otherwise 0. */
+    const npy_bool *src = input;
+    npy_long *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_long)(*src != NPY_FALSE);
+    }
+}
+
+#line 1554
+static void
+BOOL_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Writes 1 when the stored bool differs from NPY_FALSE, otherwise 0. */
+    const npy_bool *src = input;
+    npy_ulong *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_ulong)(*src != NPY_FALSE);
+    }
+}
+
+#line 1554
+static void
+BOOL_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Writes 1 when the stored bool differs from NPY_FALSE, otherwise 0. */
+    const npy_bool *src = input;
+    npy_longlong *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_longlong)(*src != NPY_FALSE);
+    }
+}
+
+#line 1554
+static void
+BOOL_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Writes 1 when the stored bool differs from NPY_FALSE, otherwise 0. */
+    const npy_bool *src = input;
+    npy_ulonglong *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_ulonglong)(*src != NPY_FALSE);
+    }
+}
+
+#line 1554
+static void
+BOOL_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Picks NPY_HALF_ONE or NPY_HALF_ZERO; no float conversion is done. */
+    const npy_bool *src = input;
+    npy_half *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_half)((*src != NPY_FALSE) ? NPY_HALF_ONE : NPY_HALF_ZERO);
+    }
+}
+
+#line 1554
+static void
+BOOL_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Writes 1 when the stored bool differs from NPY_FALSE, otherwise 0. */
+    const npy_bool *src = input;
+    npy_float *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_float)(*src != NPY_FALSE);
+    }
+}
+
+#line 1554
+static void
+BOOL_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Writes 1 when the stored bool differs from NPY_FALSE, otherwise 0. */
+    const npy_bool *src = input;
+    npy_double *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_double)(*src != NPY_FALSE);
+    }
+}
+
+#line 1554
+static void
+BOOL_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Writes 1 when the stored bool differs from NPY_FALSE, otherwise 0. */
+    const npy_bool *src = input;
+    npy_longdouble *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_longdouble)(*src != NPY_FALSE);
+    }
+}
+
+#line 1554
+static void
+BOOL_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Writes 1 when the stored bool differs from NPY_FALSE, otherwise 0. */
+    const npy_bool *src = input;
+    npy_datetime *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_datetime)(*src != NPY_FALSE);
+    }
+}
+
+#line 1554
+static void
+BOOL_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Writes 1 when the stored bool differs from NPY_FALSE, otherwise 0. */
+    const npy_bool *src = input;
+    npy_timedelta *dst = output;
+    for (; n > 0; n--, src++, dst++) {
+        *dst = (npy_timedelta)(*src != NPY_FALSE);
+    }
+}
+
+
+#line 1572
+
+#line 1585
+static void
+BOOL_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_float, 0.0). */
+    const npy_bool *src = input;
+    npy_float *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_float)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+BYTE_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_float, 0.0). */
+    const npy_byte *src = input;
+    npy_float *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_float)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+UBYTE_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_float, 0.0). */
+    const npy_ubyte *src = input;
+    npy_float *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_float)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+SHORT_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_float, 0.0). */
+    const npy_short *src = input;
+    npy_float *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_float)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+USHORT_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_float, 0.0). */
+    const npy_ushort *src = input;
+    npy_float *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_float)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+INT_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_float, 0.0). */
+    const npy_int *src = input;
+    npy_float *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_float)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+UINT_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_float, 0.0). */
+    const npy_uint *src = input;
+    npy_float *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_float)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+LONG_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_float, 0.0). */
+    const npy_long *src = input;
+    npy_float *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_float)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+ULONG_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_float, 0.0). */
+    const npy_ulong *src = input;
+    npy_float *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_float)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+LONGLONG_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_float, 0.0). */
+    const npy_longlong *src = input;
+    npy_float *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_float)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+ULONGLONG_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_float, 0.0). */
+    const npy_ulonglong *src = input;
+    npy_float *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_float)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+FLOAT_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_float, 0.0). */
+    const npy_float *src = input;
+    npy_float *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_float)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+DOUBLE_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_float, 0.0). */
+    const npy_double *src = input;
+    npy_float *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_float)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+LONGDOUBLE_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_float, 0.0). */
+    const npy_longdouble *src = input;
+    npy_float *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_float)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+DATETIME_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_float, 0.0). */
+    const npy_datetime *src = input;
+    npy_float *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_float)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+TIMEDELTA_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_float, 0.0). */
+    const npy_timedelta *src = input;
+    npy_float *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_float)*src;
+        dst[1] = 0.0;
+    }
+}
+
+
+#line 1572
+
+#line 1585
+static void
+BOOL_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_double, 0.0). */
+    const npy_bool *src = input;
+    npy_double *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_double)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+BYTE_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_double, 0.0). */
+    const npy_byte *src = input;
+    npy_double *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_double)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+UBYTE_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_double, 0.0). */
+    const npy_ubyte *src = input;
+    npy_double *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_double)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+SHORT_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_double, 0.0). */
+    const npy_short *src = input;
+    npy_double *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_double)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+USHORT_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_double, 0.0). */
+    const npy_ushort *src = input;
+    npy_double *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_double)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+INT_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_double, 0.0). */
+    const npy_int *src = input;
+    npy_double *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_double)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+UINT_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_double, 0.0). */
+    const npy_uint *src = input;
+    npy_double *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_double)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+LONG_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_double, 0.0). */
+    const npy_long *src = input;
+    npy_double *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_double)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+ULONG_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_double, 0.0). */
+    const npy_ulong *src = input;
+    npy_double *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_double)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+LONGLONG_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_double, 0.0). */
+    const npy_longlong *src = input;
+    npy_double *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_double)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+ULONGLONG_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_double, 0.0). */
+    const npy_ulonglong *src = input;
+    npy_double *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_double)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+FLOAT_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_double, 0.0). */
+    const npy_float *src = input;
+    npy_double *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_double)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+DOUBLE_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_double, 0.0). */
+    const npy_double *src = input;
+    npy_double *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_double)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+LONGDOUBLE_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_double, 0.0). */
+    const npy_longdouble *src = input;
+    npy_double *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_double)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+DATETIME_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_double, 0.0). */
+    const npy_datetime *src = input;
+    npy_double *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_double)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+TIMEDELTA_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_double, 0.0). */
+    const npy_timedelta *src = input;
+    npy_double *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_double)*src;
+        dst[1] = 0.0;
+    }
+}
+
+
+#line 1572
+
+#line 1585
+static void
+BOOL_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_longdouble, 0.0). */
+    const npy_bool *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_longdouble)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+BYTE_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_longdouble, 0.0). */
+    const npy_byte *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_longdouble)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+UBYTE_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_longdouble, 0.0). */
+    const npy_ubyte *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_longdouble)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+SHORT_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_longdouble, 0.0). */
+    const npy_short *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_longdouble)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+USHORT_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_longdouble, 0.0). */
+    const npy_ushort *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_longdouble)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+INT_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_longdouble, 0.0). */
+    const npy_int *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_longdouble)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+UINT_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_longdouble, 0.0). */
+    const npy_uint *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_longdouble)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+LONG_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_longdouble, 0.0). */
+    const npy_long *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_longdouble)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+ULONG_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_longdouble, 0.0). */
+    const npy_ulong *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_longdouble)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+LONGLONG_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_longdouble, 0.0). */
+    const npy_longlong *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_longdouble)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+ULONGLONG_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_longdouble, 0.0). */
+    const npy_ulonglong *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_longdouble)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+FLOAT_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_longdouble, 0.0). */
+    const npy_float *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_longdouble)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+DOUBLE_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_longdouble, 0.0). */
+    const npy_double *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_longdouble)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+LONGDOUBLE_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_longdouble, 0.0). */
+    const npy_longdouble *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_longdouble)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+DATETIME_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_longdouble, 0.0). */
+    const npy_datetime *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_longdouble)*src;
+        dst[1] = 0.0;
+    }
+}
+
+#line 1585
+static void
+TIMEDELTA_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Each output pair is (input cast to npy_longdouble, 0.0). */
+    const npy_timedelta *src = input;
+    npy_longdouble *dst = output;
+
+    for (; n > 0; n--, src++, dst += 2) {
+        dst[0] = (npy_longdouble)*src;
+        dst[1] = 0.0;
+    }
+}
+
+
+
+#line 1606
+
+#line 1611
+static void
+CFLOAT_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Handle the n elements as a flat run of 2*n scalar components. */
+    const npy_float *src = input;
+    npy_float *dst = output;
+    npy_intp remaining = n << 1;
+    for (; remaining > 0; remaining--) {
+        *dst++ = (npy_float)*src++;
+    }
+}
+
+
+#line 1611
+static void
+CDOUBLE_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Handle the n elements as a flat run of 2*n scalar components. */
+    const npy_double *src = input;
+    npy_float *dst = output;
+    npy_intp remaining = n << 1;
+    for (; remaining > 0; remaining--) {
+        *dst++ = (npy_float)*src++;
+    }
+}
+
+
+#line 1611
+static void
+CLONGDOUBLE_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Handle the n elements as a flat run of 2*n scalar components. */
+    const npy_longdouble *src = input;
+    npy_float *dst = output;
+    npy_intp remaining = n << 1;
+    for (; remaining > 0; remaining--) {
+        *dst++ = (npy_float)*src++;
+    }
+}
+
+
+
+#line 1606
+
+#line 1611
+static void
+CFLOAT_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Handle the n elements as a flat run of 2*n scalar components. */
+    const npy_float *src = input;
+    npy_double *dst = output;
+    npy_intp remaining = n << 1;
+    for (; remaining > 0; remaining--) {
+        *dst++ = (npy_double)*src++;
+    }
+}
+
+
+#line 1611
+static void
+CDOUBLE_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Handle the n elements as a flat run of 2*n scalar components. */
+    const npy_double *src = input;
+    npy_double *dst = output;
+    npy_intp remaining = n << 1;
+    for (; remaining > 0; remaining--) {
+        *dst++ = (npy_double)*src++;
+    }
+}
+
+
+#line 1611
+static void
+CLONGDOUBLE_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Handle the n elements as a flat run of 2*n scalar components. */
+    const npy_longdouble *src = input;
+    npy_double *dst = output;
+    npy_intp remaining = n << 1;
+    for (; remaining > 0; remaining--) {
+        *dst++ = (npy_double)*src++;
+    }
+}
+
+
+
+#line 1606
+
+#line 1611
+static void
+CFLOAT_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Handle the n elements as a flat run of 2*n scalar components. */
+    const npy_float *src = input;
+    npy_longdouble *dst = output;
+    npy_intp remaining = n << 1;
+    for (; remaining > 0; remaining--) {
+        *dst++ = (npy_longdouble)*src++;
+    }
+}
+
+
+#line 1611
+static void
+CDOUBLE_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Handle the n elements as a flat run of 2*n scalar components. */
+    const npy_double *src = input;
+    npy_longdouble *dst = output;
+    npy_intp remaining = n << 1;
+    for (; remaining > 0; remaining--) {
+        *dst++ = (npy_longdouble)*src++;
+    }
+}
+
+
+#line 1611
+static void
+CLONGDOUBLE_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *NPY_UNUSED(aop))
+{
+    /* Handle the n elements as a flat run of 2*n scalar components. */
+    const npy_longdouble *src = input;
+    npy_longdouble *dst = output;
+    npy_intp remaining = n << 1;
+    for (; remaining > 0; remaining--) {
+        *dst++ = (npy_longdouble)*src++;
+    }
+}
+
+
+
+
+#line 1645
+static void
+BOOL_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    npy_bool *src = input;
+    PyObject **slot = output;
+    PyArrayObject *arr = vaip;
+    npy_intp remaining;
+
+    /* The replacement object is stored first; only afterwards is the
+     * reference previously held by the slot (possibly NULL) released. */
+    for (remaining = n; remaining > 0; remaining--, src++, slot++) {
+        PyObject *previous = *slot;
+        *slot = BOOL_getitem(src, arr);
+        Py_XDECREF(previous);
+    }
+}
+
+#line 1645
+static void
+BYTE_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    npy_byte *src = input;
+    PyObject **slot = output;
+    PyArrayObject *arr = vaip;
+    npy_intp remaining;
+
+    /* The replacement object is stored first; only afterwards is the
+     * reference previously held by the slot (possibly NULL) released. */
+    for (remaining = n; remaining > 0; remaining--, src++, slot++) {
+        PyObject *previous = *slot;
+        *slot = BYTE_getitem(src, arr);
+        Py_XDECREF(previous);
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of UBYTE_getitem applied to
+ * successive npy_ubyte inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+UBYTE_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_ubyte *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = UBYTE_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of SHORT_getitem applied to
+ * successive npy_short inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+SHORT_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_short *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = SHORT_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of USHORT_getitem applied to
+ * successive npy_ushort inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+USHORT_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_ushort *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = USHORT_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of INT_getitem applied to
+ * successive npy_int inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+INT_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_int *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = INT_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of UINT_getitem applied to
+ * successive npy_uint inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+UINT_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_uint *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = UINT_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of LONG_getitem applied to
+ * successive npy_long inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+LONG_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_long *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = LONG_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of ULONG_getitem applied to
+ * successive npy_ulong inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+ULONG_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_ulong *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = ULONG_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of LONGLONG_getitem applied
+ * to successive npy_longlong inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+LONGLONG_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_longlong *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = LONGLONG_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of ULONGLONG_getitem applied
+ * to successive npy_ulonglong inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+ULONGLONG_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_ulonglong *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = ULONGLONG_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of HALF_getitem applied to
+ * successive npy_half inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+HALF_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_half *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = HALF_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of FLOAT_getitem applied to
+ * successive npy_float inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+FLOAT_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_float *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = FLOAT_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of DOUBLE_getitem applied to
+ * successive npy_double inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+DOUBLE_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_double *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = DOUBLE_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of LONGDOUBLE_getitem applied
+ * to successive npy_longdouble inputs.  The pointer a slot held before
+ * is released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+LONGDOUBLE_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_longdouble *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = LONGDOUBLE_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of CFLOAT_getitem applied to
+ * successive npy_cfloat inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+CFLOAT_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_cfloat *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = CFLOAT_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of CDOUBLE_getitem applied to
+ * successive npy_cdouble inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+CDOUBLE_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_cdouble *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = CDOUBLE_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of CLONGDOUBLE_getitem applied
+ * to successive npy_clongdouble inputs.  The pointer a slot held before
+ * is released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+CLONGDOUBLE_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_clongdouble *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = CLONGDOUBLE_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of STRING_getitem.  The input
+ * is walked as bytes, advancing by the source descriptor's elsize per
+ * element.  The pointer a slot held before is released with Py_XDECREF
+ * only after the new value has been stored.
+ */
+static void
+STRING_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    PyObject **dst = output;
+    const int step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = STRING_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of UNICODE_getitem.  The input
+ * is walked as bytes, advancing by the source descriptor's elsize per
+ * element.  The pointer a slot held before is released with Py_XDECREF
+ * only after the new value has been stored.
+ */
+static void
+UNICODE_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    PyObject **dst = output;
+    const int step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = UNICODE_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of VOID_getitem.  The input
+ * is walked as bytes, advancing by the source descriptor's elsize per
+ * element.  The pointer a slot held before is released with Py_XDECREF
+ * only after the new value has been stored.
+ */
+static void
+VOID_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    PyObject **dst = output;
+    const int step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = VOID_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of OBJECT_getitem applied to
+ * successive PyObject* inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+OBJECT_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    PyObject **src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = OBJECT_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of DATETIME_getitem applied
+ * to successive npy_datetime inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+DATETIME_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_datetime *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = DATETIME_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+#line 1645
+/*
+ * Fill n PyObject* slots with the results of TIMEDELTA_getitem applied
+ * to successive npy_timedelta inputs.  The pointer a slot held before is
+ * released with Py_XDECREF only after the new value has been stored.
+ */
+static void
+TIMEDELTA_to_OBJECT(void *input, void *output, npy_intp n,
+        void *vaip, void *NPY_UNUSED(aop))
+{
+    PyArrayObject *src_arr = vaip;
+    npy_timedelta *src = input;
+    PyObject **dst = output;
+    const int step = 1;
+    npy_intp k = 0;
+
+    while (k < n) {
+        PyObject *previous = *dst;
+
+        *dst = TIMEDELTA_getitem(src, src_arr);
+        Py_XDECREF(previous);
+
+        src += step;
+        dst++;
+        k++;
+    }
+}
+
+
+/*
+ * Per-type object-like macros.  For the fixed-size types each one expands
+ * to NPY_UNUSED; for the flexible types (STRING, VOID, UNICODE) it expands
+ * to nothing.  Presumably this lets generated code mark a parameter as
+ * unused only for types that never read it -- the use sites are not
+ * visible from this part of the file.
+ *
+ * _NPY_UNUSEDHALF used to be listed twice with the same expansion; the
+ * redundant second definition has been dropped.
+ */
+#define _NPY_UNUSEDBOOL  NPY_UNUSED
+#define _NPY_UNUSEDBYTE  NPY_UNUSED
+#define _NPY_UNUSEDUBYTE  NPY_UNUSED
+#define _NPY_UNUSEDSHORT  NPY_UNUSED
+#define _NPY_UNUSEDUSHORT  NPY_UNUSED
+#define _NPY_UNUSEDINT  NPY_UNUSED
+#define _NPY_UNUSEDUINT  NPY_UNUSED
+#define _NPY_UNUSEDLONG  NPY_UNUSED
+#define _NPY_UNUSEDULONG  NPY_UNUSED
+#define _NPY_UNUSEDLONGLONG  NPY_UNUSED
+#define _NPY_UNUSEDULONGLONG  NPY_UNUSED
+#define _NPY_UNUSEDHALF NPY_UNUSED
+#define _NPY_UNUSEDFLOAT  NPY_UNUSED
+#define _NPY_UNUSEDDOUBLE  NPY_UNUSED
+#define _NPY_UNUSEDLONGDOUBLE  NPY_UNUSED
+#define _NPY_UNUSEDCFLOAT  NPY_UNUSED
+#define _NPY_UNUSEDCDOUBLE  NPY_UNUSED
+#define _NPY_UNUSEDCLONGDOUBLE  NPY_UNUSED
+#define _NPY_UNUSEDDATETIME  NPY_UNUSED
+#define _NPY_UNUSEDTIMEDELTA  NPY_UNUSED
+#define _NPY_UNUSEDSTRING
+#define _NPY_UNUSEDVOID
+#define _NPY_UNUSEDUNICODE
+
+#line 1707
+/*
+ * Store n Python objects into an npy_bool buffer through BOOL_setitem.
+ * A NULL input slot is converted as if it held Py_False.  The loop
+ * stops at the first element for which BOOL_setitem returns a negative
+ * value, leaving the remaining outputs untouched.
+ */
+static void
+OBJECT_to_BOOL(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_bool *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (BOOL_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_byte buffer through BYTE_setitem.
+ * A NULL input slot is converted as if it held Py_False.  The loop
+ * stops at the first element for which BYTE_setitem returns a negative
+ * value, leaving the remaining outputs untouched.
+ */
+static void
+OBJECT_to_BYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_byte *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (BYTE_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_ubyte buffer through UBYTE_setitem.
+ * A NULL input slot is converted as if it held Py_False.  The loop
+ * stops at the first element for which UBYTE_setitem returns a negative
+ * value, leaving the remaining outputs untouched.
+ */
+static void
+OBJECT_to_UBYTE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_ubyte *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (UBYTE_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_short buffer through SHORT_setitem.
+ * A NULL input slot is converted as if it held Py_False.  The loop
+ * stops at the first element for which SHORT_setitem returns a negative
+ * value, leaving the remaining outputs untouched.
+ */
+static void
+OBJECT_to_SHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_short *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (SHORT_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_ushort buffer through
+ * USHORT_setitem.  A NULL input slot is converted as if it held
+ * Py_False.  The loop stops at the first element for which
+ * USHORT_setitem returns a negative value.
+ */
+static void
+OBJECT_to_USHORT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_ushort *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (USHORT_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_int buffer through INT_setitem.
+ * A NULL input slot is converted as if it held Py_False.  The loop
+ * stops at the first element for which INT_setitem returns a negative
+ * value, leaving the remaining outputs untouched.
+ */
+static void
+OBJECT_to_INT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_int *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (INT_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_uint buffer through UINT_setitem.
+ * A NULL input slot is converted as if it held Py_False.  The loop
+ * stops at the first element for which UINT_setitem returns a negative
+ * value, leaving the remaining outputs untouched.
+ */
+static void
+OBJECT_to_UINT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_uint *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (UINT_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_long buffer through LONG_setitem.
+ * A NULL input slot is converted as if it held Py_False.  The loop
+ * stops at the first element for which LONG_setitem returns a negative
+ * value, leaving the remaining outputs untouched.
+ */
+static void
+OBJECT_to_LONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_long *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (LONG_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_ulong buffer through ULONG_setitem.
+ * A NULL input slot is converted as if it held Py_False.  The loop
+ * stops at the first element for which ULONG_setitem returns a negative
+ * value, leaving the remaining outputs untouched.
+ */
+static void
+OBJECT_to_ULONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_ulong *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (ULONG_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_longlong buffer through
+ * LONGLONG_setitem.  A NULL input slot is converted as if it held
+ * Py_False.  The loop stops at the first element for which
+ * LONGLONG_setitem returns a negative value.
+ */
+static void
+OBJECT_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_longlong *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (LONGLONG_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_ulonglong buffer through
+ * ULONGLONG_setitem.  A NULL input slot is converted as if it held
+ * Py_False.  The loop stops at the first element for which
+ * ULONGLONG_setitem returns a negative value.
+ */
+static void
+OBJECT_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_ulonglong *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (ULONGLONG_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_half buffer through HALF_setitem.
+ * A NULL input slot is converted as if it held Py_False.  The loop
+ * stops at the first element for which HALF_setitem returns a negative
+ * value, leaving the remaining outputs untouched.
+ */
+static void
+OBJECT_to_HALF(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_half *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (HALF_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_float buffer through FLOAT_setitem.
+ * A NULL input slot is converted as if it held Py_False.  The loop
+ * stops at the first element for which FLOAT_setitem returns a negative
+ * value, leaving the remaining outputs untouched.
+ */
+static void
+OBJECT_to_FLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_float *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (FLOAT_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_double buffer through
+ * DOUBLE_setitem.  A NULL input slot is converted as if it held
+ * Py_False.  The loop stops at the first element for which
+ * DOUBLE_setitem returns a negative value.
+ */
+static void
+OBJECT_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_double *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (DOUBLE_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_longdouble buffer through
+ * LONGDOUBLE_setitem.  A NULL input slot is converted as if it held
+ * Py_False.  The loop stops at the first element for which
+ * LONGDOUBLE_setitem returns a negative value.
+ */
+static void
+OBJECT_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_longdouble *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (LONGDOUBLE_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_cfloat buffer through
+ * CFLOAT_setitem.  A NULL input slot is converted as if it held
+ * Py_False.  The loop stops at the first element for which
+ * CFLOAT_setitem returns a negative value.
+ */
+static void
+OBJECT_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_cfloat *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (CFLOAT_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_cdouble buffer through
+ * CDOUBLE_setitem.  A NULL input slot is converted as if it held
+ * Py_False.  The loop stops at the first element for which
+ * CDOUBLE_setitem returns a negative value.
+ */
+static void
+OBJECT_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_cdouble *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (CDOUBLE_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_clongdouble buffer through
+ * CLONGDOUBLE_setitem.  A NULL input slot is converted as if it held
+ * Py_False.  The loop stops at the first element for which
+ * CLONGDOUBLE_setitem returns a negative value.
+ */
+static void
+OBJECT_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_clongdouble *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (CLONGDOUBLE_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into a byte buffer through STRING_setitem,
+ * advancing the output by the destination descriptor's elsize per
+ * element.  A NULL input slot is converted as if it held Py_False.
+ * The loop stops at the first element for which STRING_setitem
+ * returns a negative value.
+ */
+static void
+OBJECT_to_STRING(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_char *dst = output;
+    const int step = PyArray_DESCR(aop)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (STRING_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into a byte buffer through UNICODE_setitem,
+ * advancing the output by the destination descriptor's elsize per
+ * element.  A NULL input slot is converted as if it held Py_False.
+ * The loop stops at the first element for which UNICODE_setitem
+ * returns a negative value.
+ */
+static void
+OBJECT_to_UNICODE(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_char *dst = output;
+    const int step = PyArray_DESCR(aop)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (UNICODE_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into a byte buffer through VOID_setitem,
+ * advancing the output by the destination descriptor's elsize per
+ * element.  A NULL input slot is converted as if it held Py_False.
+ * The loop stops at the first element for which VOID_setitem
+ * returns a negative value.
+ */
+static void
+OBJECT_to_VOID(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_char *dst = output;
+    const int step = PyArray_DESCR(aop)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (VOID_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_datetime buffer through
+ * DATETIME_setitem.  A NULL input slot is converted as if it held
+ * Py_False.  The loop stops at the first element for which
+ * DATETIME_setitem returns a negative value.
+ */
+static void
+OBJECT_to_DATETIME(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_datetime *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (DATETIME_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+#line 1707
+/*
+ * Store n Python objects into an npy_timedelta buffer through
+ * TIMEDELTA_setitem.  A NULL input slot is converted as if it held
+ * Py_False.  The loop stops at the first element for which
+ * TIMEDELTA_setitem returns a negative value.
+ */
+static void
+OBJECT_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *NPY_UNUSED(aip), void *aop)
+{
+    PyObject **src = input;
+    npy_timedelta *dst = output;
+    const int step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        PyObject *item = (src[k] != NULL) ? src[k] : Py_False;
+
+        if (TIMEDELTA_setitem(item, dst, aop) < 0) {
+            return;
+        }
+        dst += step;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Convert n fixed-width string elements to npy_bool.  Each element is
+ * wrapped with PyArray_Scalar, converted to a Python int with
+ * PyNumber_Long (legacy behaviour), and then stored via BOOL_setitem.
+ * The loop returns early if any step fails; the temporary object is
+ * always released before leaving an iteration.
+ */
+static void
+STRING_to_BOOL(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_bool *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++, src += src_step, dst += dst_step) {
+        int failed;
+        PyObject *scalar = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                          (PyObject *)src_arr);
+
+        if (scalar == NULL) {
+            return;
+        }
+#if 1
+        /* Legacy behaviour converts strings to integers before going to bool */
+        Py_SETREF(scalar, PyNumber_Long(scalar));
+        if (scalar == NULL) {
+            return;
+        }
+#endif
+        failed = BOOL_setitem(scalar, dst, aop);
+        Py_DECREF(scalar);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Convert n fixed-width string elements to npy_byte.  Each element is
+ * wrapped with PyArray_Scalar and handed to BYTE_setitem as-is (no
+ * integer pre-conversion is compiled in for this target).  The loop
+ * returns early on failure; the temporary is always released first.
+ */
+static void
+STRING_to_BYTE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_byte *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++, src += src_step, dst += dst_step) {
+        int failed;
+        PyObject *scalar = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                          (PyObject *)src_arr);
+
+        if (scalar == NULL) {
+            return;
+        }
+        failed = BYTE_setitem(scalar, dst, aop);
+        Py_DECREF(scalar);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Convert n fixed-width string elements to npy_ubyte.  Each element is
+ * wrapped with PyArray_Scalar and handed to UBYTE_setitem as-is (no
+ * integer pre-conversion is compiled in for this target).  The loop
+ * returns early on failure; the temporary is always released first.
+ */
+static void
+STRING_to_UBYTE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_ubyte *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++, src += src_step, dst += dst_step) {
+        int failed;
+        PyObject *scalar = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                          (PyObject *)src_arr);
+
+        if (scalar == NULL) {
+            return;
+        }
+        failed = UBYTE_setitem(scalar, dst, aop);
+        Py_DECREF(scalar);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Convert n fixed-width string elements to npy_short.  Each element is
+ * wrapped with PyArray_Scalar and handed to SHORT_setitem as-is (no
+ * integer pre-conversion is compiled in for this target).  The loop
+ * returns early on failure; the temporary is always released first.
+ */
+static void
+STRING_to_SHORT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_short *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++, src += src_step, dst += dst_step) {
+        int failed;
+        PyObject *scalar = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                          (PyObject *)src_arr);
+
+        if (scalar == NULL) {
+            return;
+        }
+        failed = SHORT_setitem(scalar, dst, aop);
+        Py_DECREF(scalar);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Convert n fixed-width string elements to npy_ushort.  Each element is
+ * wrapped with PyArray_Scalar and handed to USHORT_setitem as-is (no
+ * integer pre-conversion is compiled in for this target).  The loop
+ * returns early on failure; the temporary is always released first.
+ */
+static void
+STRING_to_USHORT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_ushort *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++, src += src_step, dst += dst_step) {
+        int failed;
+        PyObject *scalar = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                          (PyObject *)src_arr);
+
+        if (scalar == NULL) {
+            return;
+        }
+        failed = USHORT_setitem(scalar, dst, aop);
+        Py_DECREF(scalar);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Convert n fixed-width string elements to npy_int.  Each element is
+ * wrapped with PyArray_Scalar and handed to INT_setitem as-is (no
+ * integer pre-conversion is compiled in for this target).  The loop
+ * returns early on failure; the temporary is always released first.
+ */
+static void
+STRING_to_INT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_int *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++, src += src_step, dst += dst_step) {
+        int failed;
+        PyObject *scalar = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                          (PyObject *)src_arr);
+
+        if (scalar == NULL) {
+            return;
+        }
+        failed = INT_setitem(scalar, dst, aop);
+        Py_DECREF(scalar);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Convert n fixed-width string elements to npy_uint.  Each element is
+ * wrapped with PyArray_Scalar and handed to UINT_setitem as-is (no
+ * integer pre-conversion is compiled in for this target).  The loop
+ * returns early on failure; the temporary is always released first.
+ */
+static void
+STRING_to_UINT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_uint *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++, src += src_step, dst += dst_step) {
+        int failed;
+        PyObject *scalar = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                          (PyObject *)src_arr);
+
+        if (scalar == NULL) {
+            return;
+        }
+        failed = UINT_setitem(scalar, dst, aop);
+        Py_DECREF(scalar);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Convert n fixed-width string elements to npy_long.  Each element is
+ * wrapped with PyArray_Scalar and handed to LONG_setitem as-is (no
+ * integer pre-conversion is compiled in for this target).  The loop
+ * returns early on failure; the temporary is always released first.
+ */
+static void
+STRING_to_LONG(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_long *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++, src += src_step, dst += dst_step) {
+        int failed;
+        PyObject *scalar = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                          (PyObject *)src_arr);
+
+        if (scalar == NULL) {
+            return;
+        }
+        failed = LONG_setitem(scalar, dst, aop);
+        Py_DECREF(scalar);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Convert n fixed-width string elements to npy_ulong.  Each element is
+ * wrapped with PyArray_Scalar and handed to ULONG_setitem as-is (no
+ * integer pre-conversion is compiled in for this target).  The loop
+ * returns early on failure; the temporary is always released first.
+ */
+static void
+STRING_to_ULONG(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_ulong *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++, src += src_step, dst += dst_step) {
+        int failed;
+        PyObject *scalar = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                          (PyObject *)src_arr);
+
+        if (scalar == NULL) {
+            return;
+        }
+        failed = ULONG_setitem(scalar, dst, aop);
+        Py_DECREF(scalar);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Convert n fixed-width string elements to npy_longlong.  Each element
+ * is wrapped with PyArray_Scalar and handed to LONGLONG_setitem as-is
+ * (no integer pre-conversion is compiled in for this target).  The loop
+ * returns early on failure; the temporary is always released first.
+ */
+static void
+STRING_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_longlong *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++, src += src_step, dst += dst_step) {
+        int failed;
+        PyObject *scalar = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                          (PyObject *)src_arr);
+
+        if (scalar == NULL) {
+            return;
+        }
+        failed = LONGLONG_setitem(scalar, dst, aop);
+        Py_DECREF(scalar);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Convert n fixed-width string elements to npy_ulonglong.  Each element
+ * is wrapped with PyArray_Scalar and handed to ULONGLONG_setitem as-is
+ * (no integer pre-conversion is compiled in for this target).  The loop
+ * returns early on failure; the temporary is always released first.
+ */
+static void
+STRING_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_ulonglong *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = 1;
+    npy_intp k;
+
+    for (k = 0; k < n; k++, src += src_step, dst += dst_step) {
+        int failed;
+        PyObject *scalar = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                          (PyObject *)src_arr);
+
+        if (scalar == NULL) {
+            return;
+        }
+        failed = ULONGLONG_setitem(scalar, dst, aop);
+        Py_DECREF(scalar);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_half values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * HALF_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+STRING_to_HALF(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_half *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = HALF_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_float values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * FLOAT_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+STRING_to_FLOAT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_float *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = FLOAT_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_double values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * DOUBLE_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+STRING_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_double *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = DOUBLE_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_longdouble values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * LONGDOUBLE_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+STRING_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_longdouble *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = LONGDOUBLE_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_cfloat values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * CFLOAT_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+STRING_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_cfloat *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = CFLOAT_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_cdouble values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * CDOUBLE_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+STRING_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_cdouble *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = CDOUBLE_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_clongdouble values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * CLONGDOUBLE_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+STRING_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_clongdouble *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = CLONGDOUBLE_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into items stored at `output`,
+ * stepping each side by the elsize of its own descr.  Every item becomes an
+ * object via PyArray_Scalar and is written with STRING_setitem.
+ */
+static void
+STRING_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_char *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = (PyArray_DESCR(aop)->elsize);
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = STRING_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        src += src_step;
+        dst += dst_step;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into items stored at `output`,
+ * stepping each side by the elsize of its own descr.  Every item becomes an
+ * object via PyArray_Scalar and is written with UNICODE_setitem.
+ */
+static void
+STRING_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_char *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = (PyArray_DESCR(aop)->elsize);
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = UNICODE_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        src += src_step;
+        dst += dst_step;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into items stored at `output`,
+ * stepping each side by the elsize of its own descr.  Every item becomes an
+ * object via PyArray_Scalar and is written with VOID_setitem.
+ */
+static void
+STRING_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_char *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = (PyArray_DESCR(aop)->elsize);
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = VOID_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        src += src_step;
+        dst += dst_step;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_datetime values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * DATETIME_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+STRING_to_DATETIME(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_datetime *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = DATETIME_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_timedelta values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * TIMEDELTA_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+STRING_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_timedelta *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = TIMEDELTA_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items at `input` to npy_bool at `output`.  Legacy behaviour:
+ * each item goes through PyNumber_Long (string -> integer) before bool.
+ */
+static void
+UNICODE_to_BOOL(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_bool *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++, src += src_step, dst += 1) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        Py_SETREF(item, PyNumber_Long(item));
+        if (item == NULL) {
+            return;
+        }
+        failed = BOOL_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_byte values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * BYTE_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_BYTE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_byte *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = BYTE_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_ubyte values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * UBYTE_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_UBYTE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_ubyte *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = UBYTE_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_short values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * SHORT_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_SHORT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_short *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = SHORT_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_ushort values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * USHORT_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_USHORT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_ushort *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = USHORT_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_int values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * INT_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_INT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_int *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = INT_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_uint values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * UINT_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_UINT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_uint *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = UINT_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_long values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * LONG_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_LONG(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_long *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = LONG_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_ulong values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * ULONG_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_ULONG(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_ulong *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = ULONG_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_longlong values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * LONGLONG_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_longlong *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = LONGLONG_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_ulonglong values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * ULONGLONG_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_ulonglong *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = ULONGLONG_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_half values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * HALF_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_HALF(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_half *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = HALF_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_float values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * FLOAT_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_FLOAT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_float *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = FLOAT_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_double values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * DOUBLE_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_double *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = DOUBLE_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_longdouble values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * LONGDOUBLE_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_longdouble *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = LONGDOUBLE_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_cfloat values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * CFLOAT_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_cfloat *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = CFLOAT_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_cdouble values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * CDOUBLE_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_cdouble *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = CDOUBLE_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_clongdouble values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * CLONGDOUBLE_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_clongdouble *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = CLONGDOUBLE_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into items stored at `output`,
+ * stepping each side by the elsize of its own descr.  Every item becomes an
+ * object via PyArray_Scalar and is written with STRING_setitem.
+ */
+static void
+UNICODE_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_char *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = (PyArray_DESCR(aop)->elsize);
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = STRING_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        src += src_step;
+        dst += dst_step;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into items stored at `output`,
+ * stepping each side by the elsize of its own descr.  Every item becomes an
+ * object via PyArray_Scalar and is written with UNICODE_setitem.
+ */
+static void
+UNICODE_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_char *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = (PyArray_DESCR(aop)->elsize);
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = UNICODE_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        src += src_step;
+        dst += dst_step;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into items stored at `output`,
+ * stepping each side by the elsize of its own descr.  Every item becomes an
+ * object via PyArray_Scalar and is written with VOID_setitem.
+ */
+static void
+UNICODE_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_char *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    const int dst_step = (PyArray_DESCR(aop)->elsize);
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = VOID_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        src += src_step;
+        dst += dst_step;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_datetime values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * DATETIME_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_DATETIME(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_datetime *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = DATETIME_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_timedelta values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * TIMEDELTA_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+UNICODE_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_timedelta *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = TIMEDELTA_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_bool values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * BOOL_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+VOID_to_BOOL(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_bool *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = BOOL_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_byte values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * BYTE_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+VOID_to_BYTE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_byte *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = BYTE_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+/*
+ * Converts n items read from `input` into npy_ubyte values at `output`.
+ * Every item is turned into an object with PyArray_Scalar and written via
+ * UBYTE_setitem; stops at a NULL object or a nonzero setitem result.
+ */
+static void
+VOID_to_UBYTE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    PyArrayObject *src_arr = vaip;
+    npy_char *src = input;
+    npy_ubyte *dst = output;
+    const int src_step = PyArray_DESCR(src_arr)->elsize;
+    npy_intp k;
+
+    for (k = 0; k < n; k++) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr), (PyObject *)src_arr);
+        if (item == NULL) {
+            return;
+        }
+        failed = UBYTE_setitem(item, dst, aop);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+        /* step the input by the source elsize, the output by one element */
+        src += src_step;
+        dst += 1;
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_SHORT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through SHORT_setitem. Returns at the
+     * first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_short *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = SHORT_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_USHORT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through USHORT_setitem. Returns at the
+     * first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_ushort *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = USHORT_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_INT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through INT_setitem. Returns at the
+     * first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_int *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = INT_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_UINT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through UINT_setitem. Returns at the
+     * first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_uint *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = UINT_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_LONG(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through LONG_setitem. Returns at the
+     * first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_long *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = LONG_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_ULONG(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through ULONG_setitem. Returns at the
+     * first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_ulong *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = ULONG_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_LONGLONG(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through LONGLONG_setitem. Returns at
+     * the first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_longlong *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = LONGLONG_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_ULONGLONG(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through ULONGLONG_setitem. Returns at
+     * the first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_ulonglong *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = ULONGLONG_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_HALF(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through HALF_setitem. Returns at the
+     * first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_half *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = HALF_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_FLOAT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through FLOAT_setitem. Returns at the
+     * first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_float *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = FLOAT_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_DOUBLE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through DOUBLE_setitem. Returns at the
+     * first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_double *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = DOUBLE_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_LONGDOUBLE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through LONGDOUBLE_setitem. Returns at
+     * the first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_longdouble *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = LONGDOUBLE_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_CFLOAT(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through CFLOAT_setitem. Returns at the
+     * first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_cfloat *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = CFLOAT_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_CDOUBLE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through CDOUBLE_setitem. Returns at
+     * the first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_cdouble *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = CDOUBLE_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_CLONGDOUBLE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through CLONGDOUBLE_setitem. Returns
+     * at the first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_clongdouble *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = CLONGDOUBLE_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. Both pointers
+     * advance in bytes, by the elsize of their respective descriptors.
+     * Returns at the first failing call without setting an error of its
+     * own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    const int out_step = (PyArray_DESCR(aop)->elsize);
+    npy_char *src = input;
+    npy_char *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst += out_step) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = STRING_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through UNICODE_setitem. Both pointers
+     * advance in bytes, by the elsize of their respective descriptors.
+     * Returns at the first failing call without setting an error of its
+     * own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    const int out_step = (PyArray_DESCR(aop)->elsize);
+    npy_char *src = input;
+    npy_char *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst += out_step) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = UNICODE_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through VOID_setitem. Both pointers
+     * advance in bytes, by the elsize of their respective descriptors.
+     * Returns at the first failing call without setting an error of its
+     * own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    const int out_step = (PyArray_DESCR(aop)->elsize);
+    npy_char *src = input;
+    npy_char *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst += out_step) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = VOID_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_DATETIME(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through DATETIME_setitem. Returns at
+     * the first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_datetime *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = DATETIME_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+#line 1756
+
+static void
+VOID_to_TIMEDELTA(void *input, void *output, npy_intp n,
+        void *vaip, void *aop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through TIMEDELTA_setitem. Returns at
+     * the first failing call without setting an error of its own.
+     */
+    PyArrayObject *src_arr = vaip;
+    const int in_step = PyArray_DESCR(src_arr)->elsize;
+    npy_char *src = input;
+    npy_timedelta *dst = output;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src += in_step, dst++) {
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+        int failed;
+
+        if (item == NULL) {
+            return;
+        }
+        /* The item is stored as-is, with no intermediate conversion. */
+        failed = TIMEDELTA_setitem(item, dst, aop) != 0;
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+
+
+#line 1810
+static void
+BOOL_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_bool *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+BYTE_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_byte *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+UBYTE_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_ubyte *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+SHORT_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_short *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+USHORT_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_ushort *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+INT_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_int *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+UINT_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_uint *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+LONG_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_long *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+ULONG_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_ulong *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+LONGLONG_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_longlong *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+ULONGLONG_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_ulonglong *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+HALF_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_half *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+FLOAT_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_float *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+DOUBLE_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_double *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+LONGDOUBLE_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_longdouble *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+CFLOAT_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_cfloat *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+CDOUBLE_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_cdouble *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+CLONGDOUBLE_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_clongdouble *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+DATETIME_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_datetime *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+TIMEDELTA_to_STRING(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through STRING_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_timedelta *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling STRING_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (STRING_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+static void
+BOOL_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    /*
+     * Obtain a Python object for each of the n source items with
+     * PyArray_Scalar and store it through UNICODE_setitem. The output
+     * pointer advances by the destination descriptor's elsize.
+     *
+     * Stops at the first failing call and leaves any error it raised
+     * for the caller to inspect.
+     */
+    npy_bool *ip = input;
+    npy_char *op = output;
+    PyArrayObject *aip = vaip;
+    PyArrayObject *aop = vaop;
+
+    npy_intp i;
+    PyObject *temp = NULL;
+    int oskip = PyArray_DESCR(aop)->elsize;
+    for (i = 0; i < n; i++, ip++, op += oskip) {
+        temp = PyArray_Scalar(ip, PyArray_DESCR(aip), (PyObject *)aip);
+        if (temp == NULL) {
+            /*
+             * Propagate the failure instead of substituting Py_False:
+             * that wrote a bogus value and kept calling UNICODE_setitem
+             * while an error was presumably still pending.
+             */
+            return;
+        }
+        if (UNICODE_setitem(temp, op, aop)) {
+            Py_DECREF(temp);
+            return;
+        }
+        Py_DECREF(temp);
+    }
+}
+
+
+#line 1810
+/*
+ * BYTE_to_UNICODE: store `n` npy_byte values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+BYTE_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_byte *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * UBYTE_to_UNICODE: store `n` npy_ubyte values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+UBYTE_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_ubyte *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * SHORT_to_UNICODE: store `n` npy_short values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+SHORT_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_short *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * USHORT_to_UNICODE: store `n` npy_ushort values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+USHORT_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_ushort *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * INT_to_UNICODE: store `n` npy_int values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+INT_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_int *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * UINT_to_UNICODE: store `n` npy_uint values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+UINT_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_uint *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * LONG_to_UNICODE: store `n` npy_long values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+LONG_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_long *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * ULONG_to_UNICODE: store `n` npy_ulong values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+ULONG_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_ulong *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * LONGLONG_to_UNICODE: store `n` npy_longlong values from `input` into
+ * `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+LONGLONG_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_longlong *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * ULONGLONG_to_UNICODE: store `n` npy_ulonglong values from `input` into
+ * `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+ULONGLONG_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_ulonglong *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * HALF_to_UNICODE: store `n` npy_half values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+HALF_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_half *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * FLOAT_to_UNICODE: store `n` npy_float values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+FLOAT_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_float *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * DOUBLE_to_UNICODE: store `n` npy_double values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+DOUBLE_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_double *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * LONGDOUBLE_to_UNICODE: store `n` npy_longdouble values from `input` into
+ * `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+LONGDOUBLE_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_longdouble *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * CFLOAT_to_UNICODE: store `n` npy_cfloat values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+CFLOAT_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_cfloat *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * CDOUBLE_to_UNICODE: store `n` npy_cdouble values from `input` into
+ * `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+CDOUBLE_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_cdouble *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * CLONGDOUBLE_to_UNICODE: store `n` npy_clongdouble values from `input`
+ * into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+CLONGDOUBLE_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_clongdouble *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * DATETIME_to_UNICODE: store `n` npy_datetime values from `input` into
+ * `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+DATETIME_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_datetime *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * TIMEDELTA_to_UNICODE: store `n` npy_timedelta values from `input` into
+ * `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through UNICODE_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero UNICODE_setitem() result ends the
+ * loop early.
+ */
+static void
+TIMEDELTA_to_UNICODE(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_timedelta *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = UNICODE_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * BOOL_to_VOID: store `n` npy_bool values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+BOOL_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_bool *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * BYTE_to_VOID: store `n` npy_byte values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+BYTE_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_byte *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * UBYTE_to_VOID: store `n` npy_ubyte values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+UBYTE_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_ubyte *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * SHORT_to_VOID: store `n` npy_short values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+SHORT_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_short *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * USHORT_to_VOID: store `n` npy_ushort values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+USHORT_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_ushort *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * INT_to_VOID: store `n` npy_int values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+INT_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_int *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * UINT_to_VOID: store `n` npy_uint values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+UINT_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_uint *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * LONG_to_VOID: store `n` npy_long values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+LONG_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_long *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * ULONG_to_VOID: store `n` npy_ulong values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+ULONG_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_ulong *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * LONGLONG_to_VOID: store `n` npy_longlong values from `input` into
+ * `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+LONGLONG_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_longlong *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * ULONGLONG_to_VOID: store `n` npy_ulonglong values from `input` into
+ * `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+ULONGLONG_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_ulonglong *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * HALF_to_VOID: store `n` npy_half values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+HALF_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_half *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * FLOAT_to_VOID: store `n` npy_float values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+FLOAT_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_float *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * DOUBLE_to_VOID: store `n` npy_double values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+DOUBLE_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_double *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * LONGDOUBLE_to_VOID: store `n` npy_longdouble values from `input` into
+ * `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+LONGDOUBLE_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_longdouble *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * CFLOAT_to_VOID: store `n` npy_cfloat values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+CFLOAT_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_cfloat *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * CDOUBLE_to_VOID: store `n` npy_cdouble values from `input` into `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+CDOUBLE_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_cdouble *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * CLONGDOUBLE_to_VOID: store `n` npy_clongdouble values from `input` into
+ * `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+CLONGDOUBLE_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_clongdouble *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * DATETIME_to_VOID: store `n` npy_datetime values from `input` into
+ * `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+DATETIME_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_datetime *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+#line 1810
+/*
+ * TIMEDELTA_to_VOID: store `n` npy_timedelta values from `input` into
+ * `output`.
+ *
+ * Every value is wrapped by PyArray_Scalar() with the descriptor of `vaip`
+ * and written through VOID_setitem() against `vaop`; the output pointer
+ * moves by the element size of `vaop`'s descriptor.  If wrapping fails,
+ * Py_False is written instead.  A nonzero VOID_setitem() result ends the
+ * loop early.
+ */
+static void
+TIMEDELTA_to_VOID(void *input, void *output, npy_intp n,
+        void *vaip, void *vaop)
+{
+    PyArrayObject *src_arr = vaip;
+    PyArrayObject *dst_arr = vaop;
+    npy_timedelta *src = input;
+    npy_char *dst = output;
+    const int dst_step = PyArray_DESCR(dst_arr)->elsize;
+    npy_intp left;
+
+    for (left = n; left > 0; left--, src++, dst += dst_step) {
+        int failed;
+        PyObject *item = PyArray_Scalar(src, PyArray_DESCR(src_arr),
+                                        (PyObject *)src_arr);
+
+        if (item == NULL) {
+            /* Wrapping failed: substitute Py_False for this element. */
+            item = Py_False;
+            Py_INCREF(item);
+        }
+        failed = VOID_setitem(item, dst, dst_arr);
+        Py_DECREF(item);
+        if (failed) {
+            return;
+        }
+    }
+}
+
+
+
+
+/*
+ *****************************************************************************
+ **                               SCAN                                      **
+ *****************************************************************************
+ */
+
+
+/*
+ * The first ignore argument is for backwards compatibility.
+ * Should be removed when the API version is bumped up.
+ */
+
+#line 1860
+/*
+ * Read one npy_short from `fp` into `*ip` with fscanf("%hd") and return
+ * fscanf()'s result.  The last two parameters are unused.
+ */
+static int
+SHORT_scan(FILE *fp, npy_short *ip, void *NPY_UNUSED(ignore),
+        PyArray_Descr *NPY_UNUSED(ignored))
+{
+    const int nread = fscanf(fp, "%hd", ip);
+
+    return nread;
+}
+
+#line 1860
+/*
+ * Read one npy_ushort from `fp` into `*ip` with fscanf("%hu") and return
+ * fscanf()'s result.  The last two parameters are unused.
+ */
+static int
+USHORT_scan(FILE *fp, npy_ushort *ip, void *NPY_UNUSED(ignore),
+        PyArray_Descr *NPY_UNUSED(ignored))
+{
+    const int nread = fscanf(fp, "%hu", ip);
+
+    return nread;
+}
+
+#line 1860
+/*
+ * Read one npy_int from `fp` into `*ip` with fscanf("%d") and return
+ * fscanf()'s result.  The last two parameters are unused.
+ */
+static int
+INT_scan(FILE *fp, npy_int *ip, void *NPY_UNUSED(ignore),
+        PyArray_Descr *NPY_UNUSED(ignored))
+{
+    const int nread = fscanf(fp, "%d", ip);
+
+    return nread;
+}
+
+#line 1860
+/*
+ * Read one npy_uint from `fp` into `*ip` with fscanf("%u") and return
+ * fscanf()'s result.  The last two parameters are unused.
+ */
+static int
+UINT_scan(FILE *fp, npy_uint *ip, void *NPY_UNUSED(ignore),
+        PyArray_Descr *NPY_UNUSED(ignored))
+{
+    const int nread = fscanf(fp, "%u", ip);
+
+    return nread;
+}
+
+#line 1860
+/*
+ * Read one npy_long from `fp` into `*ip` with fscanf("%ld") and return
+ * fscanf()'s result.  The last two parameters are unused.
+ */
+static int
+LONG_scan(FILE *fp, npy_long *ip, void *NPY_UNUSED(ignore),
+        PyArray_Descr *NPY_UNUSED(ignored))
+{
+    const int nread = fscanf(fp, "%ld", ip);
+
+    return nread;
+}
+
+#line 1860
+/*
+ * Read one npy_ulong from `fp` into `*ip` with fscanf("%lu") and return
+ * fscanf()'s result.  The last two parameters are unused.
+ */
+static int
+ULONG_scan(FILE *fp, npy_ulong *ip, void *NPY_UNUSED(ignore),
+        PyArray_Descr *NPY_UNUSED(ignored))
+{
+    const int nread = fscanf(fp, "%lu", ip);
+
+    return nread;
+}
+
+#line 1860
+/*
+ * Read one npy_longlong from `fp` into `*ip` with fscanf(), using the
+ * conversion given by NPY_LONGLONG_FMT, and return fscanf()'s result.
+ * The last two parameters are unused.
+ */
+static int
+LONGLONG_scan(FILE *fp, npy_longlong *ip, void *NPY_UNUSED(ignore),
+        PyArray_Descr *NPY_UNUSED(ignored))
+{
+    const int nread = fscanf(fp, "%"NPY_LONGLONG_FMT, ip);
+
+    return nread;
+}
+
+#line 1860
+/*
+ * Read one npy_ulonglong from `fp` into `*ip` with fscanf(), using the
+ * conversion given by NPY_ULONGLONG_FMT, and return fscanf()'s result.
+ * The last two parameters are unused.
+ */
+static int
+ULONGLONG_scan(FILE *fp, npy_ulonglong *ip, void *NPY_UNUSED(ignore),
+        PyArray_Descr *NPY_UNUSED(ignored))
+{
+    const int nread = fscanf(fp, "%"NPY_ULONGLONG_FMT, ip);
+
+    return nread;
+}
+
+
+#line 1872
+/*
+ * Parse one value from `fp` with NumPyOS_ascii_ftolf(), store it in `*ip`
+ * narrowed to npy_float, and return the parser's status.  The last two
+ * parameters are unused.
+ */
+static int
+FLOAT_scan(FILE *fp, npy_float *ip, void *NPY_UNUSED(ignore),
+        PyArray_Descr *NPY_UNUSED(ignored))
+{
+    double parsed;
+    const int status = NumPyOS_ascii_ftolf(fp, &parsed);
+
+    /* The store happens regardless of the status, as callers expect. */
+    *ip = (npy_float) parsed;
+    return status;
+}
+
+#line 1872
+/*
+ * Parse one value from `fp` with NumPyOS_ascii_ftolf(), store it in `*ip`
+ * as npy_double, and return the parser's status.  The last two parameters
+ * are unused.
+ */
+static int
+DOUBLE_scan(FILE *fp, npy_double *ip, void *NPY_UNUSED(ignore),
+        PyArray_Descr *NPY_UNUSED(ignored))
+{
+    double parsed;
+    const int status = NumPyOS_ascii_ftolf(fp, &parsed);
+
+    /* The store happens regardless of the status, as callers expect. */
+    *ip = (npy_double) parsed;
+    return status;
+}
+
+
+/*
+ * Parse one value from `fp` with NumPyOS_ascii_ftoLf(), store it in `*ip`
+ * as npy_longdouble, and return the parser's status.  The last two
+ * parameters are unused.
+ */
+static int
+LONGDOUBLE_scan(FILE *fp, npy_longdouble *ip, void *NPY_UNUSED(ignore),
+        PyArray_Descr *NPY_UNUSED(ignored))
+{
+    long double parsed;
+    const int status = NumPyOS_ascii_ftoLf(fp, &parsed);
+
+    /* The store happens regardless of the status, as callers expect. */
+    *ip = (npy_longdouble) parsed;
+    return status;
+}
+
+/*
+ * Parse one value from `fp` with NumPyOS_ascii_ftolf(), convert it with
+ * npy_double_to_half() into `*ip`, and return the parser's status.  The
+ * last two parameters are unused.
+ */
+static int
+HALF_scan(FILE *fp, npy_half *ip, void *NPY_UNUSED(ignore),
+        PyArray_Descr *NPY_UNUSED(ignored))
+{
+    double parsed;
+    const int status = NumPyOS_ascii_ftolf(fp, &parsed);
+
+    /* The store happens regardless of the status, as callers expect. */
+    *ip = npy_double_to_half(parsed);
+    return status;
+}
+
+#line 1915
+/*
+ * Read one signed decimal integer from fp through an npy_int temporary and
+ * store it in *ip narrowed to npy_byte.  Returns fscanf's result.
+ */
+static int
+BYTE_scan(FILE *fp, npy_byte *ip, void *NPY_UNUSED(ignore),
+        PyArray_Descr *NPY_UNUSED(ignore2))
+{
+    /*
+     * fscanf leaves its argument untouched on a matching failure or EOF, so
+     * start from zero rather than narrowing an indeterminate value below.
+     */
+    npy_int temp = 0;
+    int num;
+
+    num = fscanf(fp, "%""d", &temp);
+    *ip = (npy_byte) temp;
+    return num;
+}
+
+#line 1915
+/*
+ * Read one unsigned decimal integer from fp through an npy_uint temporary
+ * and store it in *ip narrowed to npy_ubyte.  Returns fscanf's result.
+ */
+static int
+UBYTE_scan(FILE *fp, npy_ubyte *ip, void *NPY_UNUSED(ignore),
+        PyArray_Descr *NPY_UNUSED(ignore2))
+{
+    /*
+     * fscanf leaves its argument untouched on a matching failure or EOF, so
+     * start from zero rather than narrowing an indeterminate value below.
+     */
+    npy_uint temp = 0;
+    int num;
+
+    num = fscanf(fp, "%""u", &temp);
+    *ip = (npy_ubyte) temp;
+    return num;
+}
+
+
+/*
+ * Parse one floating-point value from fp via NumPyOS_ascii_ftolf and store
+ * in *ip whether it differs from zero.  The helper's return code is handed
+ * back unchanged.
+ */
+static int
+BOOL_scan(FILE *fp, npy_bool *ip, void *NPY_UNUSED(ignore),
+        PyArray_Descr *NPY_UNUSED(ignore2))
+{
+    double parsed;
+    int status = NumPyOS_ascii_ftolf(fp, &parsed);
+
+    /* The store is unconditional; status is not inspected here. */
+    *ip = (npy_bool) (parsed != 0.0);
+    return status;
+}
+
+#line 1944
+/*
+ * Read one complex value from fp into *ip.
+ *
+ * Spellings handled below: "<real>", "<imag>j" and "<real><+|-><imag>j".
+ * Returns the status of the first NumPyOS_ascii_ftolf call.
+ */
+static int
+CFLOAT_scan(FILE *fp, npy_cfloat *ip, void *NPY_UNUSED(ignore),
+             PyArray_Descr *NPY_UNUSED(ignored))
+{
+    double result;
+    int ret_real, ret_imag;
+
+    ret_real = NumPyOS_ascii_ftolf(fp, &result);
+    npy_cfloat output;
+    /*
+     * Peek next character.  It is kept as int, the type getc() returns, so
+     * EOF stays distinct from every valid byte: ungetc(EOF) is then a no-op
+     * instead of pushing back a bogus byte where plain char is unsigned.
+     */
+    int next = getc(fp);
+    if ((next == '+') || (next == '-')) {
+        // Imaginary component specified
+        output.real = result;
+        // Revert peek and read imaginary component
+        ungetc(next, fp);
+        ret_imag = NumPyOS_ascii_ftolf(fp, &result);
+        // Peek next character
+        next = getc(fp);
+        if ((ret_imag == 1) && (next == 'j')) {
+            // If read is successful and the immediate following char is j
+            output.imag = result;
+        }
+        else {
+            output.imag = 0;
+            // Push an invalid char to trigger the not everything is read error
+            ungetc('a', fp);
+        }
+    }
+    else if (next == 'j') {
+        // Real component not specified
+        output.real = 0;
+        output.imag = result;
+    }
+    else {
+        // Imaginary component not specified
+        output.real = result;
+        output.imag = 0.;
+        // Next character is not + / - / j. Revert peek.
+        ungetc(next, fp);
+    }
+    *(npy_cfloat *)ip = output;
+    return ret_real;
+}
+
+#line 1944
+/*
+ * Read one complex value from fp into *ip.
+ *
+ * Spellings handled below: "<real>", "<imag>j" and "<real><+|-><imag>j".
+ * Returns the status of the first NumPyOS_ascii_ftolf call.
+ */
+static int
+CDOUBLE_scan(FILE *fp, npy_cdouble *ip, void *NPY_UNUSED(ignore),
+             PyArray_Descr *NPY_UNUSED(ignored))
+{
+    double result;
+    int ret_real, ret_imag;
+
+    ret_real = NumPyOS_ascii_ftolf(fp, &result);
+    npy_cdouble output;
+    /*
+     * Peek next character.  It is kept as int, the type getc() returns, so
+     * EOF stays distinct from every valid byte: ungetc(EOF) is then a no-op
+     * instead of pushing back a bogus byte where plain char is unsigned.
+     */
+    int next = getc(fp);
+    if ((next == '+') || (next == '-')) {
+        // Imaginary component specified
+        output.real = result;
+        // Revert peek and read imaginary component
+        ungetc(next, fp);
+        ret_imag = NumPyOS_ascii_ftolf(fp, &result);
+        // Peek next character
+        next = getc(fp);
+        if ((ret_imag == 1) && (next == 'j')) {
+            // If read is successful and the immediate following char is j
+            output.imag = result;
+        }
+        else {
+            output.imag = 0;
+            // Push an invalid char to trigger the not everything is read error
+            ungetc('a', fp);
+        }
+    }
+    else if (next == 'j') {
+        // Real component not specified
+        output.real = 0;
+        output.imag = result;
+    }
+    else {
+        // Imaginary component not specified
+        output.real = result;
+        output.imag = 0.;
+        // Next character is not + / - / j. Revert peek.
+        ungetc(next, fp);
+    }
+    *(npy_cdouble *)ip = output;
+    return ret_real;
+}
+
+
+
+#line 1996
+
+/*
+ * No scan routine is defined for the types below; each <TYPE>_scan name
+ * simply expands to NULL.
+ */
+#define CLONGDOUBLE_scan NULL
+
+
+#line 1996
+
+#define OBJECT_scan NULL
+
+
+#line 1996
+
+#define STRING_scan NULL
+
+
+#line 1996
+
+#define UNICODE_scan NULL
+
+
+#line 1996
+
+#define VOID_scan NULL
+
+
+#line 1996
+
+#define DATETIME_scan NULL
+
+
+#line 1996
+
+#define TIMEDELTA_scan NULL
+
+
+
+
+/*
+ *****************************************************************************
+ **                             FROMSTR                                     **
+ *****************************************************************************
+ */
+
+
+#line 2021
+/*
+ * Parse a base-10 integer from str with PyOS_strtol (endptr is forwarded to
+ * it) and store the value at ip as npy_byte.  Always returns 0.
+ */
+static int
+BYTE_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_byte *out = (npy_byte *)ip;
+    npy_long parsed = PyOS_strtol(str, endptr, 10);
+
+    *out = parsed;
+    return 0;
+}
+
+#line 2021
+/*
+ * Parse a base-10 integer from str with PyOS_strtoul (endptr is forwarded to
+ * it) and store the value at ip as npy_ubyte.  Always returns 0.
+ */
+static int
+UBYTE_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_ubyte *out = (npy_ubyte *)ip;
+    npy_ulong parsed = PyOS_strtoul(str, endptr, 10);
+
+    *out = parsed;
+    return 0;
+}
+
+#line 2021
+/*
+ * Parse a base-10 integer from str with PyOS_strtol (endptr is forwarded to
+ * it) and store the value at ip as npy_short.  Always returns 0.
+ */
+static int
+SHORT_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_short *out = (npy_short *)ip;
+    npy_long parsed = PyOS_strtol(str, endptr, 10);
+
+    *out = parsed;
+    return 0;
+}
+
+#line 2021
+/*
+ * Parse a base-10 integer from str with PyOS_strtoul (endptr is forwarded to
+ * it) and store the value at ip as npy_ushort.  Always returns 0.
+ */
+static int
+USHORT_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_ushort *out = (npy_ushort *)ip;
+    npy_ulong parsed = PyOS_strtoul(str, endptr, 10);
+
+    *out = parsed;
+    return 0;
+}
+
+#line 2021
+/*
+ * Parse a base-10 integer from str with PyOS_strtol (endptr is forwarded to
+ * it) and store the value at ip as npy_int.  Always returns 0.
+ */
+static int
+INT_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_int *out = (npy_int *)ip;
+    npy_long parsed = PyOS_strtol(str, endptr, 10);
+
+    *out = parsed;
+    return 0;
+}
+
+#line 2021
+/*
+ * Parse a base-10 integer from str with PyOS_strtoul (endptr is forwarded to
+ * it) and store the value at ip as npy_uint.  Always returns 0.
+ */
+static int
+UINT_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_uint *out = (npy_uint *)ip;
+    npy_ulong parsed = PyOS_strtoul(str, endptr, 10);
+
+    *out = parsed;
+    return 0;
+}
+
+#line 2021
+/*
+ * Parse a base-10 integer from str with PyOS_strtol (endptr is forwarded to
+ * it) and store the value at ip as npy_long.  Always returns 0.
+ */
+static int
+LONG_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_long *out = (npy_long *)ip;
+    npy_long parsed = PyOS_strtol(str, endptr, 10);
+
+    *out = parsed;
+    return 0;
+}
+
+#line 2021
+/*
+ * Parse a base-10 integer from str with PyOS_strtoul (endptr is forwarded to
+ * it) and store the value at ip as npy_ulong.  Always returns 0.
+ */
+static int
+ULONG_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_ulong *out = (npy_ulong *)ip;
+    npy_ulong parsed = PyOS_strtoul(str, endptr, 10);
+
+    *out = parsed;
+    return 0;
+}
+
+#line 2021
+/*
+ * Parse a base-10 integer from str with NumPyOS_strtoll (endptr is forwarded
+ * to it) and store the value at ip as npy_longlong.  Always returns 0.
+ */
+static int
+LONGLONG_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_longlong *out = (npy_longlong *)ip;
+    npy_longlong parsed = NumPyOS_strtoll(str, endptr, 10);
+
+    *out = parsed;
+    return 0;
+}
+
+#line 2021
+/*
+ * Parse a base-10 integer from str with NumPyOS_strtoull (endptr is
+ * forwarded to it) and store the value at ip as npy_ulonglong.
+ * Always returns 0.
+ */
+static int
+ULONGLONG_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_ulonglong *out = (npy_ulonglong *)ip;
+    npy_ulonglong parsed = NumPyOS_strtoull(str, endptr, 10);
+
+    *out = parsed;
+    return 0;
+}
+
+#line 2021
+/*
+ * Parse a base-10 integer from str with NumPyOS_strtoll (endptr is forwarded
+ * to it) and store the value at ip as npy_datetime.  Always returns 0.
+ */
+static int
+DATETIME_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_datetime *out = (npy_datetime *)ip;
+    npy_longlong parsed = NumPyOS_strtoll(str, endptr, 10);
+
+    *out = parsed;
+    return 0;
+}
+
+#line 2021
+/*
+ * Parse a base-10 integer from str with NumPyOS_strtoll (endptr is forwarded
+ * to it) and store the value at ip as npy_timedelta.  Always returns 0.
+ */
+static int
+TIMEDELTA_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_timedelta *out = (npy_timedelta *)ip;
+    npy_longlong parsed = NumPyOS_strtoll(str, endptr, 10);
+
+    *out = parsed;
+    return 0;
+}
+
+
+#line 2038
+/*
+ * Parse a floating-point value from str with NumPyOS_ascii_strtod (endptr is
+ * forwarded to it) and store it at ip as npy_float.  Always returns 0.
+ */
+static int
+FLOAT_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_float *out = (npy_float *)ip;
+    double parsed = NumPyOS_ascii_strtod(str, endptr);
+
+    *out = parsed;
+    return 0;
+}
+
+#line 2038
+/*
+ * Parse a floating-point value from str with NumPyOS_ascii_strtod (endptr is
+ * forwarded to it) and store it at ip as npy_double.  Always returns 0.
+ */
+static int
+DOUBLE_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_double *out = (npy_double *)ip;
+    double parsed = NumPyOS_ascii_strtod(str, endptr);
+
+    *out = parsed;
+    return 0;
+}
+
+
+/*
+ * Parse a long double from str with NumPyOS_ascii_strtold (endptr is
+ * forwarded to it) and store it at ip as npy_longdouble.  Always returns 0.
+ */
+static int
+LONGDOUBLE_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_longdouble *out = (npy_longdouble *)ip;
+    long double parsed = NumPyOS_ascii_strtold(str, endptr);
+
+    *out = parsed;
+    return 0;
+}
+
+/*
+ * Parse a floating-point value from str with NumPyOS_ascii_strtod (endptr is
+ * forwarded to it) and store it at ip after npy_double_to_half conversion.
+ * Always returns 0.
+ */
+static int
+HALF_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_half *out = (npy_half *)ip;
+    double parsed = NumPyOS_ascii_strtod(str, endptr);
+
+    *out = npy_double_to_half(parsed);
+    return 0;
+}
+
+/*
+ * Parse a floating-point value from str with NumPyOS_ascii_strtod (endptr is
+ * forwarded to it) and store at ip whether it differs from zero.
+ * Always returns 0.
+ */
+static int
+BOOL_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    npy_bool *out = (npy_bool *)ip;
+    double parsed = NumPyOS_ascii_strtod(str, endptr);
+
+    *out = (parsed != 0.0);
+    return 0;
+}
+
+#line 2087
+/*
+ * Parse one complex value from str and store it at ip as npy_cfloat.
+ *
+ * Spellings handled below: "<real>", "<imag>j" and "<real><+|-><imag>j".
+ * When endptr is non-NULL, *endptr is left just past the consumed text; if a
+ * signed second number is not followed by 'j', *endptr is rewound to that
+ * sign so the caller can see that input was left unread.  Always returns 0.
+ */
+static int
+CFLOAT_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    double result;
+
+    result = NumPyOS_ascii_strtod(str, endptr);
+    npy_cfloat output;
+
+    if (endptr && ((*endptr[0] == '+') || (*endptr[0] == '-'))) {
+        // Imaginary component specified
+        output.real = result;
+        /*
+         * Save the position of the sign itself.  Saving the char ** and
+         * assigning it back to the parameter changes nothing the caller
+         * can observe.
+         */
+        char *prev = *endptr;
+        // Reading imaginary component
+        str = *endptr;
+        result = NumPyOS_ascii_strtod(str, endptr);
+        if (endptr && *endptr[0] == 'j') {
+            // Read is successful if the immediate following char is j
+            output.imag = result;
+            // Skip j
+            ++*endptr;
+        }
+        else {
+            /*
+             * Rewind *endptr to the sign character to trigger the not
+             * everything is read error
+             */
+            *endptr = prev;
+            output.imag = 0;
+        }
+    }
+    else if (endptr && *endptr[0] == 'j') {
+        // Real component not specified
+        output.real = 0;
+        output.imag = result;
+        // Skip j
+        ++*endptr;
+    }
+    else {
+        // Imaginary component not specified
+        output.real = result;
+        output.imag = 0.;
+    }
+    *(npy_cfloat *)ip = output;
+    return 0;
+}
+
+#line 2087
+/*
+ * Parse one complex value from str and store it at ip as npy_cdouble.
+ *
+ * Spellings handled below: "<real>", "<imag>j" and "<real><+|-><imag>j".
+ * When endptr is non-NULL, *endptr is left just past the consumed text; if a
+ * signed second number is not followed by 'j', *endptr is rewound to that
+ * sign so the caller can see that input was left unread.  Always returns 0.
+ */
+static int
+CDOUBLE_fromstr(char *str, void *ip, char **endptr,
+        PyArray_Descr *NPY_UNUSED(ignore))
+{
+    double result;
+
+    result = NumPyOS_ascii_strtod(str, endptr);
+    npy_cdouble output;
+
+    if (endptr && ((*endptr[0] == '+') || (*endptr[0] == '-'))) {
+        // Imaginary component specified
+        output.real = result;
+        /*
+         * Save the position of the sign itself.  Saving the char ** and
+         * assigning it back to the parameter changes nothing the caller
+         * can observe.
+         */
+        char *prev = *endptr;
+        // Reading imaginary component
+        str = *endptr;
+        result = NumPyOS_ascii_strtod(str, endptr);
+        if (endptr && *endptr[0] == 'j') {
+            // Read is successful if the immediate following char is j
+            output.imag = result;
+            // Skip j
+            ++*endptr;
+        }
+        else {
+            /*
+             * Rewind *endptr to the sign character to trigger the not
+             * everything is read error
+             */
+            *endptr = prev;
+            output.imag = 0;
+        }
+    }
+    else if (endptr && *endptr[0] == 'j') {
+        // Real component not specified
+        output.real = 0;
+        output.imag = result;
+        // Skip j
+        ++*endptr;
+    }
+    else {
+        // Imaginary component not specified
+        output.real = result;
+        output.imag = 0.;
+    }
+    *(npy_cdouble *)ip = output;
+    return 0;
+}
+
+
+
+#line 2140
+
+/*
+ * No fromstr routine is defined for the types below; each <TYPE>_fromstr
+ * name simply expands to NULL.
+ */
+#define CLONGDOUBLE_fromstr NULL
+
+
+#line 2140
+
+#define OBJECT_fromstr NULL
+
+
+#line 2140
+
+#define STRING_fromstr NULL
+
+
+#line 2140
+
+#define UNICODE_fromstr NULL
+
+
+#line 2140
+
+#define VOID_fromstr NULL
+
+
+
+
+/*
+ *****************************************************************************
+ **                            COPYSWAPN                                    **
+ *****************************************************************************
+ */
+
+
+/*
+ * Copy n elements of elsize bytes each from src to dst.  Nothing is done
+ * when src is NULL.  A single memcpy is used when both strides equal elsize;
+ * any other layout is delegated to _unaligned_strided_byte_copy.
+ */
+static inline void
+_basic_copyn(void *dst, npy_intp dstride, void *src, npy_intp sstride,
+             npy_intp n, int elsize) {
+    if (src != NULL) {
+        if (sstride != elsize || dstride != elsize) {
+            _unaligned_strided_byte_copy(dst, dstride, src, sstride,
+                    n, elsize);
+        }
+        else {
+            memcpy(dst, src, n*elsize);
+        }
+    }
+}
+
+/*
+ * Copy a single element of elsize bytes from src to dst; a NULL src means
+ * there is nothing to copy.
+ */
+static inline void
+_basic_copy(void *dst, void *src, int elsize) {
+    if (src != NULL) {
+        memcpy(dst, src, elsize);
+    }
+}
+
+
+#line 2192
+/*
+ * Strided copy of n npy_short elements from src to dst via _basic_copyn,
+ * then, when swap is non-zero, a byte swap of the n destination elements.
+ */
+static void
+SHORT_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+                   npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    _basic_copyn(dst, dstride, src, sstride, n, sizeof(npy_short));
+    if (!swap) {
+        return;
+    }
+    _strided_byte_swap(dst, dstride, n, sizeof(npy_short));
+}
+
+/*
+ * Copy one npy_short from src to dst via _basic_copy and, when swap is
+ * non-zero, reverse the order of its NPY_SIZEOF_SHORT bytes in dst.
+ */
+static void
+SHORT_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    char *lo, *hi, tmp;
+
+    _basic_copy(dst, src, sizeof(npy_short));
+    if (!swap) {
+        return;
+    }
+
+    /* Exchange bytes pairwise, walking inwards from both ends. */
+    lo = (char *)dst;
+    hi = lo + (NPY_SIZEOF_SHORT - 1);
+    while (lo < hi) {
+        tmp = *lo;
+        *lo++ = *hi;
+        *hi-- = tmp;
+    }
+}
+
+
+#line 2192
+/*
+ * Strided copy of n npy_ushort elements from src to dst via _basic_copyn,
+ * then, when swap is non-zero, a byte swap of the n destination elements.
+ */
+static void
+USHORT_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+                   npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    _basic_copyn(dst, dstride, src, sstride, n, sizeof(npy_ushort));
+    if (!swap) {
+        return;
+    }
+    _strided_byte_swap(dst, dstride, n, sizeof(npy_ushort));
+}
+
+/*
+ * Copy one npy_ushort from src to dst via _basic_copy and, when swap is
+ * non-zero, reverse the order of its NPY_SIZEOF_SHORT bytes in dst.
+ */
+static void
+USHORT_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    char *lo, *hi, tmp;
+
+    _basic_copy(dst, src, sizeof(npy_ushort));
+    if (!swap) {
+        return;
+    }
+
+    /* Exchange bytes pairwise, walking inwards from both ends. */
+    lo = (char *)dst;
+    hi = lo + (NPY_SIZEOF_SHORT - 1);
+    while (lo < hi) {
+        tmp = *lo;
+        *lo++ = *hi;
+        *hi-- = tmp;
+    }
+}
+
+
+#line 2192
+/*
+ * Strided copy of n npy_int elements from src to dst via _basic_copyn,
+ * then, when swap is non-zero, a byte swap of the n destination elements.
+ */
+static void
+INT_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+                   npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    _basic_copyn(dst, dstride, src, sstride, n, sizeof(npy_int));
+    if (!swap) {
+        return;
+    }
+    _strided_byte_swap(dst, dstride, n, sizeof(npy_int));
+}
+
+/*
+ * Copy one npy_int from src to dst via _basic_copy and, when swap is
+ * non-zero, reverse the order of its NPY_SIZEOF_INT bytes in dst.
+ */
+static void
+INT_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    char *lo, *hi, tmp;
+
+    _basic_copy(dst, src, sizeof(npy_int));
+    if (!swap) {
+        return;
+    }
+
+    /* Exchange bytes pairwise, walking inwards from both ends. */
+    lo = (char *)dst;
+    hi = lo + (NPY_SIZEOF_INT - 1);
+    while (lo < hi) {
+        tmp = *lo;
+        *lo++ = *hi;
+        *hi-- = tmp;
+    }
+}
+
+
+#line 2192
+/*
+ * Strided copy of n npy_uint elements from src to dst via _basic_copyn,
+ * then, when swap is non-zero, a byte swap of the n destination elements.
+ */
+static void
+UINT_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+                   npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    _basic_copyn(dst, dstride, src, sstride, n, sizeof(npy_uint));
+    if (!swap) {
+        return;
+    }
+    _strided_byte_swap(dst, dstride, n, sizeof(npy_uint));
+}
+
+/*
+ * Copy one npy_uint from src to dst via _basic_copy and, when swap is
+ * non-zero, reverse the order of its NPY_SIZEOF_INT bytes in dst.
+ */
+static void
+UINT_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    char *lo, *hi, tmp;
+
+    _basic_copy(dst, src, sizeof(npy_uint));
+    if (!swap) {
+        return;
+    }
+
+    /* Exchange bytes pairwise, walking inwards from both ends. */
+    lo = (char *)dst;
+    hi = lo + (NPY_SIZEOF_INT - 1);
+    while (lo < hi) {
+        tmp = *lo;
+        *lo++ = *hi;
+        *hi-- = tmp;
+    }
+}
+
+
+#line 2192
+/*
+ * Strided copy of n npy_long elements from src to dst via _basic_copyn,
+ * then, when swap is non-zero, a byte swap of the n destination elements.
+ */
+static void
+LONG_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+                   npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    _basic_copyn(dst, dstride, src, sstride, n, sizeof(npy_long));
+    if (!swap) {
+        return;
+    }
+    _strided_byte_swap(dst, dstride, n, sizeof(npy_long));
+}
+
+/*
+ * Copy one npy_long from src to dst via _basic_copy and, when swap is
+ * non-zero, reverse the order of its NPY_SIZEOF_LONG bytes in dst.
+ */
+static void
+LONG_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    char *lo, *hi, tmp;
+
+    _basic_copy(dst, src, sizeof(npy_long));
+    if (!swap) {
+        return;
+    }
+
+    /* Exchange bytes pairwise, walking inwards from both ends. */
+    lo = (char *)dst;
+    hi = lo + (NPY_SIZEOF_LONG - 1);
+    while (lo < hi) {
+        tmp = *lo;
+        *lo++ = *hi;
+        *hi-- = tmp;
+    }
+}
+
+
+#line 2192
+/*
+ * Strided copy of n npy_ulong elements from src to dst via _basic_copyn,
+ * then, when swap is non-zero, a byte swap of the n destination elements.
+ */
+static void
+ULONG_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+                   npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    _basic_copyn(dst, dstride, src, sstride, n, sizeof(npy_ulong));
+    if (!swap) {
+        return;
+    }
+    _strided_byte_swap(dst, dstride, n, sizeof(npy_ulong));
+}
+
+/*
+ * Copy one npy_ulong from src to dst via _basic_copy and, when swap is
+ * non-zero, reverse the order of its NPY_SIZEOF_LONG bytes in dst.
+ */
+static void
+ULONG_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    char *lo, *hi, tmp;
+
+    _basic_copy(dst, src, sizeof(npy_ulong));
+    if (!swap) {
+        return;
+    }
+
+    /* Exchange bytes pairwise, walking inwards from both ends. */
+    lo = (char *)dst;
+    hi = lo + (NPY_SIZEOF_LONG - 1);
+    while (lo < hi) {
+        tmp = *lo;
+        *lo++ = *hi;
+        *hi-- = tmp;
+    }
+}
+
+
+#line 2192
+/*
+ * Strided copy of n npy_longlong elements from src to dst via _basic_copyn,
+ * then, when swap is non-zero, a byte swap of the n destination elements.
+ */
+static void
+LONGLONG_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+                   npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    _basic_copyn(dst, dstride, src, sstride, n, sizeof(npy_longlong));
+    if (!swap) {
+        return;
+    }
+    _strided_byte_swap(dst, dstride, n, sizeof(npy_longlong));
+}
+
+/*
+ * Copy one npy_longlong from src to dst via _basic_copy and, when swap is
+ * non-zero, reverse the order of its NPY_SIZEOF_LONGLONG bytes in dst.
+ */
+static void
+LONGLONG_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    char *lo, *hi, tmp;
+
+    _basic_copy(dst, src, sizeof(npy_longlong));
+    if (!swap) {
+        return;
+    }
+
+    /* Exchange bytes pairwise, walking inwards from both ends. */
+    lo = (char *)dst;
+    hi = lo + (NPY_SIZEOF_LONGLONG - 1);
+    while (lo < hi) {
+        tmp = *lo;
+        *lo++ = *hi;
+        *hi-- = tmp;
+    }
+}
+
+
+#line 2192
+/*
+ * Strided copy of n npy_ulonglong elements from src to dst via _basic_copyn,
+ * then, when swap is non-zero, a byte swap of the n destination elements.
+ */
+static void
+ULONGLONG_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+                   npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    _basic_copyn(dst, dstride, src, sstride, n, sizeof(npy_ulonglong));
+    if (!swap) {
+        return;
+    }
+    _strided_byte_swap(dst, dstride, n, sizeof(npy_ulonglong));
+}
+
+/*
+ * Copy one npy_ulonglong from src to dst via _basic_copy and, when swap is
+ * non-zero, reverse the order of its NPY_SIZEOF_LONGLONG bytes in dst.
+ */
+static void
+ULONGLONG_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    char *lo, *hi, tmp;
+
+    _basic_copy(dst, src, sizeof(npy_ulonglong));
+    if (!swap) {
+        return;
+    }
+
+    /* Exchange bytes pairwise, walking inwards from both ends. */
+    lo = (char *)dst;
+    hi = lo + (NPY_SIZEOF_LONGLONG - 1);
+    while (lo < hi) {
+        tmp = *lo;
+        *lo++ = *hi;
+        *hi-- = tmp;
+    }
+}
+
+
+#line 2192
+/*
+ * Strided copy of n npy_half elements from src to dst via _basic_copyn,
+ * then, when swap is non-zero, a byte swap of the n destination elements.
+ */
+static void
+HALF_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+                   npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    _basic_copyn(dst, dstride, src, sstride, n, sizeof(npy_half));
+    if (!swap) {
+        return;
+    }
+    _strided_byte_swap(dst, dstride, n, sizeof(npy_half));
+}
+
+/*
+ * Copy one npy_half from src to dst via _basic_copy and, when swap is
+ * non-zero, reverse the order of its NPY_SIZEOF_HALF bytes in dst.
+ */
+static void
+HALF_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    char *lo, *hi, tmp;
+
+    _basic_copy(dst, src, sizeof(npy_half));
+    if (!swap) {
+        return;
+    }
+
+    /* Exchange bytes pairwise, walking inwards from both ends. */
+    lo = (char *)dst;
+    hi = lo + (NPY_SIZEOF_HALF - 1);
+    while (lo < hi) {
+        tmp = *lo;
+        *lo++ = *hi;
+        *hi-- = tmp;
+    }
+}
+
+
+#line 2192
+/*
+ * Strided copy of n npy_float elements from src to dst via _basic_copyn,
+ * then, when swap is non-zero, a byte swap of the n destination elements.
+ */
+static void
+FLOAT_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+                   npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    _basic_copyn(dst, dstride, src, sstride, n, sizeof(npy_float));
+    if (!swap) {
+        return;
+    }
+    _strided_byte_swap(dst, dstride, n, sizeof(npy_float));
+}
+
+static void
+FLOAT_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    /* Copy the single item first (the helper copies if needed). */
+    _basic_copy(dst, src, sizeof(npy_float));
+
+    if (swap) {
+        /*
+         * Reverse the NPY_SIZEOF_FLOAT bytes of the item in dst by
+         * exchanging bytes pairwise from both ends towards the middle.
+         * The trip count is a compile-time constant.
+         */
+        char *lo = (char *)dst;
+        char *hi = lo + (NPY_SIZEOF_FLOAT - 1);
+        int k;
+
+        for (k = 0; k < NPY_SIZEOF_FLOAT / 2; k++) {
+            char tmp = *lo;
+            *lo++ = *hi;
+            *hi-- = tmp;
+        }
+    }
+}
+
+
+#line 2192
+static void
+DOUBLE_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+                   npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    const size_t itemsize = sizeof(npy_double);
+
+    /* Strided copy of the n items first (the helper copies if needed). */
+    _basic_copyn(dst, dstride, src, sstride, n, itemsize);
+    if (!swap) {
+        return;
+    }
+    /* Swap requested: byte-swap each of the n items where it sits in dst. */
+    _strided_byte_swap(dst, dstride, n, itemsize);
+}
+
+static void
+DOUBLE_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    /* Copy the single item first (the helper copies if needed). */
+    _basic_copy(dst, src, sizeof(npy_double));
+
+    if (swap) {
+        /*
+         * Reverse the NPY_SIZEOF_DOUBLE bytes of the item in dst by
+         * exchanging bytes pairwise from both ends towards the middle.
+         * The trip count is a compile-time constant.
+         */
+        char *lo = (char *)dst;
+        char *hi = lo + (NPY_SIZEOF_DOUBLE - 1);
+        int k;
+
+        for (k = 0; k < NPY_SIZEOF_DOUBLE / 2; k++) {
+            char tmp = *lo;
+            *lo++ = *hi;
+            *hi-- = tmp;
+        }
+    }
+}
+
+
+#line 2192
+static void
+LONGDOUBLE_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+                   npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    const size_t itemsize = sizeof(npy_longdouble);
+
+    /* Strided copy of the n items first (the helper copies if needed). */
+    _basic_copyn(dst, dstride, src, sstride, n, itemsize);
+    if (!swap) {
+        return;
+    }
+    /* Swap requested: byte-swap each of the n items where it sits in dst. */
+    _strided_byte_swap(dst, dstride, n, itemsize);
+}
+
+static void
+LONGDOUBLE_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    /* Copy the single item first (the helper copies if needed). */
+    _basic_copy(dst, src, sizeof(npy_longdouble));
+
+    if (swap) {
+        /*
+         * Reverse the NPY_SIZEOF_LONGDOUBLE bytes of the item in dst by
+         * exchanging bytes pairwise from both ends towards the middle.
+         * The trip count is a compile-time constant.
+         */
+        char *lo = (char *)dst;
+        char *hi = lo + (NPY_SIZEOF_LONGDOUBLE - 1);
+        int k;
+
+        for (k = 0; k < NPY_SIZEOF_LONGDOUBLE / 2; k++) {
+            char tmp = *lo;
+            *lo++ = *hi;
+            *hi-- = tmp;
+        }
+    }
+}
+
+
+#line 2192
+static void
+DATETIME_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+                   npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    const size_t itemsize = sizeof(npy_datetime);
+
+    /* Strided copy of the n items first (the helper copies if needed). */
+    _basic_copyn(dst, dstride, src, sstride, n, itemsize);
+    if (!swap) {
+        return;
+    }
+    /* Swap requested: byte-swap each of the n items where it sits in dst. */
+    _strided_byte_swap(dst, dstride, n, itemsize);
+}
+
+static void
+DATETIME_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    /* Copy the single item first (the helper copies if needed). */
+    _basic_copy(dst, src, sizeof(npy_datetime));
+
+    if (swap) {
+        /*
+         * Reverse the NPY_SIZEOF_DATETIME bytes of the item in dst by
+         * exchanging bytes pairwise from both ends towards the middle.
+         * The trip count is a compile-time constant.
+         */
+        char *lo = (char *)dst;
+        char *hi = lo + (NPY_SIZEOF_DATETIME - 1);
+        int k;
+
+        for (k = 0; k < NPY_SIZEOF_DATETIME / 2; k++) {
+            char tmp = *lo;
+            *lo++ = *hi;
+            *hi-- = tmp;
+        }
+    }
+}
+
+
+#line 2192
+static void
+TIMEDELTA_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+                   npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    const size_t itemsize = sizeof(npy_timedelta);
+
+    /* Strided copy of the n items first (the helper copies if needed). */
+    _basic_copyn(dst, dstride, src, sstride, n, itemsize);
+    if (!swap) {
+        return;
+    }
+    /* Swap requested: byte-swap each of the n items where it sits in dst. */
+    _strided_byte_swap(dst, dstride, n, itemsize);
+}
+
+static void
+TIMEDELTA_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    /* Copy the single item first (the helper copies if needed). */
+    _basic_copy(dst, src, sizeof(npy_timedelta));
+
+    if (swap) {
+        /*
+         * Reverse the NPY_SIZEOF_TIMEDELTA bytes of the item in dst by
+         * exchanging bytes pairwise from both ends towards the middle.
+         * The trip count is a compile-time constant.
+         */
+        char *lo = (char *)dst;
+        char *hi = lo + (NPY_SIZEOF_TIMEDELTA - 1);
+        int k;
+
+        for (k = 0; k < NPY_SIZEOF_TIMEDELTA / 2; k++) {
+            char tmp = *lo;
+            *lo++ = *hi;
+            *hi-- = tmp;
+        }
+    }
+}
+
+
+
+#line 2276
+static void
+BOOL_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+        npy_intp n, int NPY_UNUSED(swap), void *NPY_UNUSED(arr))
+{
+    const size_t itemsize = sizeof(npy_bool);
+
+    /*
+     * Strided copy of the n items (the helper copies if needed).  The swap
+     * argument is deliberately unused by this variant.
+     */
+    _basic_copyn(dst, dstride, src, sstride, n, itemsize);
+}
+
+static void
+BOOL_copyswap (void *dst, void *src, int NPY_UNUSED(swap),
+        void *NPY_UNUSED(arr))
+{
+    const size_t itemsize = sizeof(npy_bool);
+
+    /*
+     * Copy the single item (the helper copies if needed).  The swap
+     * argument is deliberately unused by this variant.
+     */
+    _basic_copy(dst, src, itemsize);
+}
+
+
+#line 2276
+static void
+BYTE_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+        npy_intp n, int NPY_UNUSED(swap), void *NPY_UNUSED(arr))
+{
+    const size_t itemsize = sizeof(npy_byte);
+
+    /*
+     * Strided copy of the n items (the helper copies if needed).  The swap
+     * argument is deliberately unused by this variant.
+     */
+    _basic_copyn(dst, dstride, src, sstride, n, itemsize);
+}
+
+static void
+BYTE_copyswap (void *dst, void *src, int NPY_UNUSED(swap),
+        void *NPY_UNUSED(arr))
+{
+    const size_t itemsize = sizeof(npy_byte);
+
+    /*
+     * Copy the single item (the helper copies if needed).  The swap
+     * argument is deliberately unused by this variant.
+     */
+    _basic_copy(dst, src, itemsize);
+}
+
+
+#line 2276
+static void
+UBYTE_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+        npy_intp n, int NPY_UNUSED(swap), void *NPY_UNUSED(arr))
+{
+    const size_t itemsize = sizeof(npy_ubyte);
+
+    /*
+     * Strided copy of the n items (the helper copies if needed).  The swap
+     * argument is deliberately unused by this variant.
+     */
+    _basic_copyn(dst, dstride, src, sstride, n, itemsize);
+}
+
+static void
+UBYTE_copyswap (void *dst, void *src, int NPY_UNUSED(swap),
+        void *NPY_UNUSED(arr))
+{
+    const size_t itemsize = sizeof(npy_ubyte);
+
+    /*
+     * Copy the single item (the helper copies if needed).  The swap
+     * argument is deliberately unused by this variant.
+     */
+    _basic_copy(dst, src, itemsize);
+}
+
+
+
+
+
+#line 2304
+static void
+CFLOAT_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+        npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    /* Start of the second NPY_SIZEOF_FLOAT-byte component of the first item. */
+    char *second_component = (char *)dst + NPY_SIZEOF_FLOAT;
+
+    /* Strided copy of the n whole items first (the helper copies if needed). */
+    _basic_copyn(dst, dstride, src, sstride, n, sizeof(npy_cfloat));
+    if (!swap) {
+        return;
+    }
+    /*
+     * Each item is swapped as two separate NPY_SIZEOF_FLOAT-byte components,
+     * one strided pass per component.
+     */
+    _strided_byte_swap(dst, dstride, n, NPY_SIZEOF_FLOAT);
+    _strided_byte_swap(second_component, dstride, n, NPY_SIZEOF_FLOAT);
+}
+
+static void
+CFLOAT_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    /* Copy the single item first (the helper copies if needed). */
+    _basic_copy(dst, src, sizeof(npy_cfloat));
+
+    if (swap) {
+        /*
+         * The item is treated as two consecutive NPY_SIZEOF_FLOAT-byte
+         * components.  Each component is byte-reversed on its own; the two
+         * components are never exchanged with each other.
+         */
+        char *lo = (char *)dst;
+        int component, k;
+
+        for (component = 0; component < 2; component++) {
+            char *hi = lo + (NPY_SIZEOF_FLOAT - 1);
+
+            for (k = 0; k < NPY_SIZEOF_FLOAT / 2; k++) {
+                char tmp = *lo;
+                *lo++ = *hi;
+                *hi-- = tmp;
+            }
+            /* lo stopped mid-component; step over the upper half. */
+            lo += NPY_SIZEOF_FLOAT / 2;
+        }
+    }
+}
+
+
+#line 2304
+static void
+CDOUBLE_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+        npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    /* Start of the second NPY_SIZEOF_DOUBLE-byte component of the first item. */
+    char *second_component = (char *)dst + NPY_SIZEOF_DOUBLE;
+
+    /* Strided copy of the n whole items first (the helper copies if needed). */
+    _basic_copyn(dst, dstride, src, sstride, n, sizeof(npy_cdouble));
+    if (!swap) {
+        return;
+    }
+    /*
+     * Each item is swapped as two separate NPY_SIZEOF_DOUBLE-byte components,
+     * one strided pass per component.
+     */
+    _strided_byte_swap(dst, dstride, n, NPY_SIZEOF_DOUBLE);
+    _strided_byte_swap(second_component, dstride, n, NPY_SIZEOF_DOUBLE);
+}
+
+static void
+CDOUBLE_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    /* Copy the single item first (the helper copies if needed). */
+    _basic_copy(dst, src, sizeof(npy_cdouble));
+
+    if (swap) {
+        /*
+         * The item is treated as two consecutive NPY_SIZEOF_DOUBLE-byte
+         * components.  Each component is byte-reversed on its own; the two
+         * components are never exchanged with each other.
+         */
+        char *lo = (char *)dst;
+        int component, k;
+
+        for (component = 0; component < 2; component++) {
+            char *hi = lo + (NPY_SIZEOF_DOUBLE - 1);
+
+            for (k = 0; k < NPY_SIZEOF_DOUBLE / 2; k++) {
+                char tmp = *lo;
+                *lo++ = *hi;
+                *hi-- = tmp;
+            }
+            /* lo stopped mid-component; step over the upper half. */
+            lo += NPY_SIZEOF_DOUBLE / 2;
+        }
+    }
+}
+
+
+#line 2304
+static void
+CLONGDOUBLE_copyswapn (void *dst, npy_intp dstride, void *src, npy_intp sstride,
+        npy_intp n, int swap, void *NPY_UNUSED(arr))
+{
+    /* Start of the second NPY_SIZEOF_LONGDOUBLE-byte component of item 0. */
+    char *second_component = (char *)dst + NPY_SIZEOF_LONGDOUBLE;
+
+    /* Strided copy of the n whole items first (the helper copies if needed). */
+    _basic_copyn(dst, dstride, src, sstride, n, sizeof(npy_clongdouble));
+    if (!swap) {
+        return;
+    }
+    /*
+     * Each item is swapped as two separate NPY_SIZEOF_LONGDOUBLE-byte
+     * components, one strided pass per component.
+     */
+    _strided_byte_swap(dst, dstride, n, NPY_SIZEOF_LONGDOUBLE);
+    _strided_byte_swap(second_component, dstride, n, NPY_SIZEOF_LONGDOUBLE);
+}
+
+static void
+CLONGDOUBLE_copyswap (void *dst, void *src, int swap, void *NPY_UNUSED(arr))
+{
+    /* Copy the single item first (the helper copies if needed). */
+    _basic_copy(dst, src, sizeof(npy_clongdouble));
+
+    if (swap) {
+        /*
+         * The item is treated as two consecutive NPY_SIZEOF_LONGDOUBLE-byte
+         * components.  Each component is byte-reversed on its own; the two
+         * components are never exchanged with each other.
+         */
+        char *lo = (char *)dst;
+        int component, k;
+
+        for (component = 0; component < 2; component++) {
+            char *hi = lo + (NPY_SIZEOF_LONGDOUBLE - 1);
+
+            for (k = 0; k < NPY_SIZEOF_LONGDOUBLE / 2; k++) {
+                char tmp = *lo;
+                *lo++ = *hi;
+                *hi-- = tmp;
+            }
+            /* lo stopped mid-component; step over the upper half. */
+            lo += NPY_SIZEOF_LONGDOUBLE / 2;
+        }
+    }
+}
+
+
+
+static void
+OBJECT_copyswapn(PyObject **dst, npy_intp dstride, PyObject **src,
+        npy_intp sstride, npy_intp n, int NPY_UNUSED(swap),
+        void *NPY_UNUSED(arr))
+{
+    /*
+     * Copy n object references from src to dst; dstride and sstride are in
+     * bytes.  Every reference stored into dst is INCREF'd and the reference
+     * it replaces is DECREF'd (NULL slots are tolerated on both sides).
+     * Nothing is done when src is NULL, and the swap argument is ignored.
+     *
+     * The new reference is stored *before* the old one is released: a
+     * DECREF can run arbitrary code (destructors), which must never find
+     * a pointer to an already released object in dst.
+     */
+    npy_intp i;
+
+    if (src == NULL) {
+        return;
+    }
+    if (NPY__ALIGNED(dst, sizeof(PyObject **))
+            && NPY__ALIGNED(src, sizeof(PyObject **))
+            && NPY__ALIGNED(dstride, sizeof(PyObject **))
+            && NPY__ALIGNED(sstride, sizeof(PyObject **))) {
+        /*
+         * Convert the byte strides to element strides.  The divisor is cast
+         * to npy_intp so the division stays signed; dividing by the bare
+         * size_t would convert a negative stride to unsigned first.
+         */
+        dstride /= (npy_intp)sizeof(PyObject **);
+        sstride /= (npy_intp)sizeof(PyObject **);
+        for (i = 0; i < n; i++) {
+            PyObject *incoming = *src;
+            PyObject *outgoing = *dst;
+
+            Py_XINCREF(incoming);
+            *dst = incoming;
+            Py_XDECREF(outgoing);
+            dst += dstride;
+            src += sstride;
+        }
+    }
+    else {
+        /* Pointers or strides are misaligned: move the pointers bytewise. */
+        unsigned char *dstp = (unsigned char *)dst;
+        unsigned char *srcp = (unsigned char *)src;
+
+        for (i = 0; i < n; i++) {
+            PyObject *incoming, *outgoing;
+
+            memcpy(&incoming, srcp, sizeof(incoming));
+            memcpy(&outgoing, dstp, sizeof(outgoing));
+            Py_XINCREF(incoming);
+            memcpy(dstp, &incoming, sizeof(incoming));
+            Py_XDECREF(outgoing);
+            dstp += dstride;
+            srcp += sstride;
+        }
+    }
+}
+
+static void
+OBJECT_copyswap(PyObject **dst, PyObject **src, int NPY_UNUSED(swap),
+        void *NPY_UNUSED(arr))
+{
+    /*
+     * Replace the object reference stored at dst with the one stored at
+     * src: the incoming reference is INCREF'd and the replaced one is
+     * DECREF'd (either may be NULL).  Nothing is done when src is NULL,
+     * and the swap argument is ignored.
+     *
+     * The new reference is stored *before* the old one is released: a
+     * DECREF can run arbitrary code (destructors), which must never find
+     * a pointer to an already released object in dst.
+     */
+    PyObject *incoming, *outgoing;
+
+    if (src == NULL) {
+        return;
+    }
+    if (NPY__ALIGNED(dst,sizeof(PyObject **)) &&
+            NPY__ALIGNED(src,sizeof(PyObject **))) {
+        incoming = *src;
+        outgoing = *dst;
+        Py_XINCREF(incoming);
+        *dst = incoming;
+    }
+    else {
+        /* Misaligned slots: move the pointers bytewise via memcpy. */
+        memcpy(&incoming, src, sizeof(incoming));
+        memcpy(&outgoing, dst, sizeof(outgoing));
+        Py_XINCREF(incoming);
+        memcpy(dst, &incoming, sizeof(incoming));
+    }
+    Py_XDECREF(outgoing);
+}
+
+/*
+ * Strided copy of n items of PyArray_DESCR(arr)->elsize bytes each.
+ * The swap argument is ignored; arr must not be NULL.
+ */
+static void
+STRING_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride,
+                  npy_intp n, int NPY_UNUSED(swap), PyArrayObject *arr)
+{
+    assert(arr != NULL);
+    if (arr != NULL) {
+        /* The item size comes from the array's descriptor. */
+        _basic_copyn(dst, dstride, src, sstride, n,
+                     PyArray_DESCR(arr)->elsize);
+    }
+}
+
+
+/*
+ * Strided copy (with optional byte swap) of n void-typed items from src to
+ * dst.  Dispatches on the dtype of arr: a dtype with fields is handled
+ * field by field, a subarray dtype through its base dtype, and any other
+ * void dtype is copied as raw bytes.  The function returns void, so the
+ * failure paths (arr == NULL, _unpack_field() < 0) are silent early returns.
+ */
+static void
+VOID_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride,
+                npy_intp n, int swap, PyArrayObject *arr)
+{
+    PyArray_Descr *descr;
+
+    assert(arr != NULL);
+    if (arr == NULL) {
+        return;
+    }
+
+    descr = PyArray_DESCR(arr);
+    /* Case 1: dtype with fields -- delegate to each field's copyswapn. */
+    if (PyArray_HASFIELDS(arr)) {
+        PyObject *key, *value;
+        Py_ssize_t pos = 0;
+        /* Local stand-in array; its descr is repointed for every field. */
+        PyArrayObject_fields dummy_fields = get_dummy_stack_array(arr);
+        PyArrayObject *dummy_arr = (PyArrayObject *)&dummy_fields;
+
+        while (PyDict_Next(descr->fields, &pos, &key, &value)) {
+            npy_intp offset;
+            PyArray_Descr *new;
+            if (NPY_TITLE_KEY(key, value)) {
+                continue;  /* presumably a title alias entry -- TODO confirm */
+            }
+            if (_unpack_field(value, &new, &offset) < 0) {
+                return;
+            }
+            /* Same strides and count, shifted by the field's byte offset. */
+            dummy_fields.descr = new;
+            new->f->copyswapn(dst+offset, dstride,
+                    (src != NULL ? src+offset : NULL),
+                    sstride, n, swap, dummy_arr);
+        }
+        return;
+    }
+    if (PyDataType_HASSUBARRAY(descr)) {
+        PyArray_Descr *new;
+        npy_intp num;
+        npy_intp i;
+        int subitemsize;
+        char *dstptr, *srcptr;
+        /*
+         * Case 2: subarray dtype.
+         * In certain cases subarray copy can be optimized. This is when
+         * swapping is unnecessary and the subarrays data type can certainly
+         * be simply copied (no object, fields, subarray, and not a user dtype).
+         */
+        npy_bool can_optimize_subarray = (!swap &&
+                !PyDataType_HASFIELDS(descr->subarray->base) &&
+                !PyDataType_HASSUBARRAY(descr->subarray->base) &&
+                !PyDataType_REFCHK(descr->subarray->base) &&
+                (descr->subarray->base->type_num < NPY_NTYPES));
+
+        if (can_optimize_subarray) {
+            _basic_copyn(dst, dstride, src, sstride, n, descr->elsize);
+            return;
+        }
+
+        new = descr->subarray->base;
+        dstptr = dst;
+        srcptr = src;
+        subitemsize = new->elsize;
+        if (subitemsize == 0) {
+            /* There cannot be any elements, so return (also keeps the division below safe) */
+            return;
+        }
+        /* Slow path: run the base dtype's copyswapn over each item's num sub-elements. */
+        PyArrayObject_fields dummy_fields = get_dummy_stack_array(arr);
+        PyArrayObject *dummy_arr = (PyArrayObject *)&dummy_fields;
+        ((PyArrayObject_fields *)dummy_arr)->descr = new;
+
+        num = descr->elsize / subitemsize;
+        for (i = 0; i < n; i++) {
+            new->f->copyswapn(dstptr, subitemsize, srcptr,
+                    subitemsize, num, swap, dummy_arr);
+            dstptr += dstride;
+            if (srcptr) {  /* a NULL src is passed through unchanged */
+                srcptr += sstride;
+            }
+        }
+        return;
+    }
+    /* Case 3: Must be a naive Void type (e.g. a "V8") so simple copy is sufficient. */
+    _basic_copyn(dst, dstride, src, sstride, n, descr->elsize);
+    return;
+}
+
+static void
+VOID_copyswap (char *dst, char *src, int swap, PyArrayObject *arr)
+{
+    PyArray_Descr *descr;
+    /*
+     * Copy (with optional byte swap) one void-typed item from src to dst.
+     * Dispatches on the dtype of arr: a dtype with fields is handled field
+     * by field, a subarray dtype through its base dtype, and any other void
+     * dtype is copied as raw bytes.  The function returns void, so the
+     * failure paths (arr == NULL, _unpack_field() < 0) return silently.
+     */
+    assert(arr != NULL);
+    if (arr == NULL) {
+        return;
+    }
+
+    descr = PyArray_DESCR(arr);
+    /* Case 1: dtype with fields -- delegate to each field's copyswap. */
+    if (PyArray_HASFIELDS(arr)) {
+        PyObject *key, *value;
+        Py_ssize_t pos = 0;
+        /* Local stand-in array; its descr is repointed for every field. */
+        PyArrayObject_fields dummy_fields = get_dummy_stack_array(arr);
+        PyArrayObject *dummy_arr = (PyArrayObject *)&dummy_fields;
+
+        while (PyDict_Next(descr->fields, &pos, &key, &value)) {
+            npy_intp offset;
+
+            PyArray_Descr * new;
+            if (NPY_TITLE_KEY(key, value)) {
+                continue;  /* presumably a title alias entry -- TODO confirm */
+            }
+            if (_unpack_field(value, &new, &offset) < 0) {
+                return;
+            }
+            dummy_fields.descr = new;  /* field dtype drives the nested call */
+            new->f->copyswap(dst+offset,
+                    (src != NULL ? src+offset : NULL),
+                    swap, dummy_arr);
+        }
+        return;
+    }
+    if (PyDataType_HASSUBARRAY(descr)) {
+        PyArray_Descr *new;
+        npy_intp num;
+        int subitemsize;
+        /*
+         * Case 2: subarray dtype.
+         * In certain cases subarray copy can be optimized. This is when
+         * swapping is unnecessary and the subarrays data type can certainly
+         * be simply copied (no object, fields, subarray, and not a user dtype).
+         */
+        npy_bool can_optimize_subarray = (!swap &&
+                !PyDataType_HASFIELDS(descr->subarray->base) &&
+                !PyDataType_HASSUBARRAY(descr->subarray->base) &&
+                !PyDataType_REFCHK(descr->subarray->base) &&
+                (descr->subarray->base->type_num < NPY_NTYPES));
+
+        if (can_optimize_subarray) {
+            _basic_copy(dst, src, descr->elsize);
+            return;
+        }
+
+        new = descr->subarray->base;
+        subitemsize = new->elsize;
+        if (subitemsize == 0) {
+            /* There cannot be any elements, so return (also keeps the division below safe) */
+            return;
+        }
+        /* Slow path: treat the item as num contiguous base-dtype elements. */
+        PyArrayObject_fields dummy_fields = get_dummy_stack_array(arr);
+        PyArrayObject *dummy_arr = (PyArrayObject *)&dummy_fields;
+        dummy_fields.descr = new;
+
+        num = descr->elsize / subitemsize;
+        new->f->copyswapn(dst, subitemsize, src,
+                subitemsize, num, swap, dummy_arr);
+        return;
+    }
+    /* Case 3: Must be a naive Void type (e.g. a "V8") so simple copy is sufficient. */
+    _basic_copy(dst, src, descr->elsize);
+    return;
+}
+
+
+static void
+UNICODE_copyswapn (char *dst, npy_intp dstride, char *src, npy_intp sstride,
+                   npy_intp n, int swap, PyArrayObject *arr)
+{
+    int itemsize;
+
+    assert(arr != NULL);
+    if (arr == NULL) {
+        return;
+    }
+    /* The item size in bytes comes from the array's descriptor. */
+    itemsize = PyArray_DESCR(arr)->elsize;
+    _basic_copyn(dst, dstride, src, sstride, n, itemsize);
+
+    if (swap) {
+        /* Each item is swapped as itemsize / 4 consecutive 4-byte units. */
+        const int nunits = itemsize / 4;
+        npy_intp item;
+
+        for (item = 0; item < n; item++, dst += dstride) {
+            char *unit = dst;
+            int k;
+
+            for (k = 0; k < nunits; k++, unit += 4) {
+                npy_bswap4_unaligned(unit);
+            }
+        }
+    }
+}
+
+
+static void
+STRING_copyswap(char *dst, char *src, int NPY_UNUSED(swap), PyArrayObject *arr)
+{
+    assert(arr != NULL);
+    if (arr != NULL) {
+        /*
+         * Copy one item of PyArray_DESCR(arr)->elsize bytes (the helper
+         * copies if needed); the swap argument is ignored.
+         */
+        _basic_copy(dst, src, PyArray_DESCR(arr)->elsize);
+    }
+}
+
+static void
+UNICODE_copyswap (char *dst, char *src, int swap, PyArrayObject *arr)
+{
+    int itemsize;
+
+    assert(arr != NULL);
+    if (arr == NULL) {
+        return;
+    }
+    /* The item size in bytes comes from the array's descriptor. */
+    itemsize = PyArray_DESCR(arr)->elsize;
+    _basic_copy(dst, src, itemsize);
+
+    if (swap) {
+        /* The item is swapped as itemsize / 4 consecutive 4-byte units. */
+        const int nunits = itemsize / 4;
+        char *unit = dst;
+        int k;
+
+        for (k = 0; k < nunits; k++, unit += 4) {
+            npy_bswap4_unaligned(unit);
+        }
+    }
+}
+
+
+/*
+ *****************************************************************************
+ **                                 NONZERO                                 **
+ *****************************************************************************
+ */
+
+#define _NONZERO(a) ((a) != 0)
+
+#line 2758
+/*
+ * Truth test for the npy_bool element stored at `ip`:
+ * returns 1 when the value differs from zero and 0 otherwise.
+ */
+static npy_bool
+BOOL_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_bool value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Not a well-behaved array, so fetch the element with memcpy.
+         * No byte swap is needed: zero is zero in either byte order.
+         */
+        memcpy(&value, ip, sizeof(npy_bool));
+    }
+    else {
+        value = *(npy_bool *)ip;
+    }
+    return (npy_bool) _NONZERO(value);
+}
+
+#line 2758
+/*
+ * Truth test for the npy_byte element stored at `ip`:
+ * returns 1 when the value differs from zero and 0 otherwise.
+ */
+static npy_bool
+BYTE_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_byte value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Not a well-behaved array, so fetch the element with memcpy.
+         * No byte swap is needed: zero is zero in either byte order.
+         */
+        memcpy(&value, ip, sizeof(npy_byte));
+    }
+    else {
+        value = *(npy_byte *)ip;
+    }
+    return (npy_bool) _NONZERO(value);
+}
+
+#line 2758
+/*
+ * Truth test for the npy_ubyte element stored at `ip`:
+ * returns 1 when the value differs from zero and 0 otherwise.
+ */
+static npy_bool
+UBYTE_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_ubyte value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Not a well-behaved array, so fetch the element with memcpy.
+         * No byte swap is needed: zero is zero in either byte order.
+         */
+        memcpy(&value, ip, sizeof(npy_ubyte));
+    }
+    else {
+        value = *(npy_ubyte *)ip;
+    }
+    return (npy_bool) _NONZERO(value);
+}
+
+#line 2758
+/*
+ * Truth test for the npy_short element stored at `ip`:
+ * returns 1 when the value differs from zero and 0 otherwise.
+ */
+static npy_bool
+SHORT_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_short value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Not a well-behaved array, so fetch the element with memcpy.
+         * No byte swap is needed: zero is zero in either byte order.
+         */
+        memcpy(&value, ip, sizeof(npy_short));
+    }
+    else {
+        value = *(npy_short *)ip;
+    }
+    return (npy_bool) _NONZERO(value);
+}
+
+#line 2758
+/*
+ * Truth test for the npy_ushort element stored at `ip`:
+ * returns 1 when the value differs from zero and 0 otherwise.
+ */
+static npy_bool
+USHORT_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_ushort value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Not a well-behaved array, so fetch the element with memcpy.
+         * No byte swap is needed: zero is zero in either byte order.
+         */
+        memcpy(&value, ip, sizeof(npy_ushort));
+    }
+    else {
+        value = *(npy_ushort *)ip;
+    }
+    return (npy_bool) _NONZERO(value);
+}
+
+#line 2758
+/*
+ * Truth test for the npy_int element stored at `ip`:
+ * returns 1 when the value differs from zero and 0 otherwise.
+ */
+static npy_bool
+INT_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_int value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Not a well-behaved array, so fetch the element with memcpy.
+         * No byte swap is needed: zero is zero in either byte order.
+         */
+        memcpy(&value, ip, sizeof(npy_int));
+    }
+    else {
+        value = *(npy_int *)ip;
+    }
+    return (npy_bool) _NONZERO(value);
+}
+
+#line 2758
+/*
+ * Truth test for the npy_uint element stored at `ip`:
+ * returns 1 when the value differs from zero and 0 otherwise.
+ */
+static npy_bool
+UINT_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_uint value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Not a well-behaved array, so fetch the element with memcpy.
+         * No byte swap is needed: zero is zero in either byte order.
+         */
+        memcpy(&value, ip, sizeof(npy_uint));
+    }
+    else {
+        value = *(npy_uint *)ip;
+    }
+    return (npy_bool) _NONZERO(value);
+}
+
+#line 2758
+/*
+ * Truth test for the npy_long element stored at `ip`:
+ * returns 1 when the value differs from zero and 0 otherwise.
+ */
+static npy_bool
+LONG_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_long value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Not a well-behaved array, so fetch the element with memcpy.
+         * No byte swap is needed: zero is zero in either byte order.
+         */
+        memcpy(&value, ip, sizeof(npy_long));
+    }
+    else {
+        value = *(npy_long *)ip;
+    }
+    return (npy_bool) _NONZERO(value);
+}
+
+#line 2758
+/*
+ * Truth test for the npy_ulong element stored at `ip`:
+ * returns 1 when the value differs from zero and 0 otherwise.
+ */
+static npy_bool
+ULONG_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_ulong value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Not a well-behaved array, so fetch the element with memcpy.
+         * No byte swap is needed: zero is zero in either byte order.
+         */
+        memcpy(&value, ip, sizeof(npy_ulong));
+    }
+    else {
+        value = *(npy_ulong *)ip;
+    }
+    return (npy_bool) _NONZERO(value);
+}
+
+#line 2758
+/*
+ * Truth test for the npy_longlong element stored at `ip`:
+ * returns 1 when the value differs from zero and 0 otherwise.
+ */
+static npy_bool
+LONGLONG_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_longlong value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Not a well-behaved array, so fetch the element with memcpy.
+         * No byte swap is needed: zero is zero in either byte order.
+         */
+        memcpy(&value, ip, sizeof(npy_longlong));
+    }
+    else {
+        value = *(npy_longlong *)ip;
+    }
+    return (npy_bool) _NONZERO(value);
+}
+
+#line 2758
+/*
+ * Truth test for the npy_ulonglong element stored at `ip`:
+ * returns 1 when the value differs from zero and 0 otherwise.
+ */
+static npy_bool
+ULONGLONG_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_ulonglong value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Not a well-behaved array, so fetch the element with memcpy.
+         * No byte swap is needed: zero is zero in either byte order.
+         */
+        memcpy(&value, ip, sizeof(npy_ulonglong));
+    }
+    else {
+        value = *(npy_ulonglong *)ip;
+    }
+    return (npy_bool) _NONZERO(value);
+}
+
+#line 2758
+/*
+ * Truth test for the npy_half element stored at `ip`:
+ * returns 1 unless npy_half_iszero() reports the value as zero.
+ */
+static npy_bool
+HALF_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_half value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Fetch through copyswap so the value ends up in native byte
+         * order; because of signed zeros a swapped float cannot simply
+         * be tested against zero.
+         */
+        PyArray_DESCR(ap)->f->copyswap(&value, ip, PyArray_ISBYTESWAPPED(ap),
+                                       ap);
+    }
+    else {
+        value = *(npy_half *)ip;
+    }
+    return (npy_bool) !npy_half_iszero(value);
+}
+
+#line 2758
+/*
+ * Truth test for the npy_float element stored at `ip`:
+ * returns 1 when the value differs from zero and 0 otherwise.
+ */
+static npy_bool
+FLOAT_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_float value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Fetch through copyswap so the value ends up in native byte
+         * order; because of signed zeros a swapped float cannot simply
+         * be tested against zero.
+         */
+        PyArray_DESCR(ap)->f->copyswap(&value, ip, PyArray_ISBYTESWAPPED(ap),
+                                       ap);
+    }
+    else {
+        value = *(npy_float *)ip;
+    }
+    return (npy_bool) _NONZERO(value);
+}
+
+#line 2758
+/*
+ * Truth test for the npy_double element stored at `ip`:
+ * returns 1 when the value differs from zero and 0 otherwise.
+ */
+static npy_bool
+DOUBLE_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_double value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Fetch through copyswap so the value ends up in native byte
+         * order; because of signed zeros a swapped float cannot simply
+         * be tested against zero.
+         */
+        PyArray_DESCR(ap)->f->copyswap(&value, ip, PyArray_ISBYTESWAPPED(ap),
+                                       ap);
+    }
+    else {
+        value = *(npy_double *)ip;
+    }
+    return (npy_bool) _NONZERO(value);
+}
+
+#line 2758
+/*
+ * Truth test for the npy_longdouble element stored at `ip`:
+ * returns 1 when the value differs from zero and 0 otherwise.
+ */
+static npy_bool
+LONGDOUBLE_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_longdouble value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Fetch through copyswap so the value ends up in native byte
+         * order; because of signed zeros a swapped float cannot simply
+         * be tested against zero.
+         */
+        PyArray_DESCR(ap)->f->copyswap(&value, ip, PyArray_ISBYTESWAPPED(ap),
+                                       ap);
+    }
+    else {
+        value = *(npy_longdouble *)ip;
+    }
+    return (npy_bool) _NONZERO(value);
+}
+
+#line 2758
+/*
+ * Truth test for the npy_datetime element stored at `ip`:
+ * returns 1 when the value differs from zero and 0 otherwise.
+ */
+static npy_bool
+DATETIME_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_datetime value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Not a well-behaved array, so fetch the element with memcpy.
+         * No byte swap is needed: zero is zero in either byte order.
+         */
+        memcpy(&value, ip, sizeof(npy_datetime));
+    }
+    else {
+        value = *(npy_datetime *)ip;
+    }
+    return (npy_bool) _NONZERO(value);
+}
+
+#line 2758
+/*
+ * Truth test for the npy_timedelta element stored at `ip`:
+ * returns 1 when the value differs from zero and 0 otherwise.
+ */
+static npy_bool
+TIMEDELTA_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_timedelta value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /*
+         * Not a well-behaved array, so fetch the element with memcpy.
+         * No byte swap is needed: zero is zero in either byte order.
+         */
+        memcpy(&value, ip, sizeof(npy_timedelta));
+    }
+    else {
+        value = *(npy_timedelta *)ip;
+    }
+    return (npy_bool) _NONZERO(value);
+}
+
+
+#line 2788
+/*
+ * Truth test for the npy_cfloat element stored at `ip`:
+ * returns 1 when the real or the imaginary part differs from zero.
+ */
+static npy_bool
+CFLOAT_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_cfloat value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /* not well-behaved: let copyswap deliver a native-order copy */
+        PyArray_DESCR(ap)->f->copyswap(&value, ip, PyArray_ISBYTESWAPPED(ap),
+                                       ap);
+    }
+    else {
+        value = *(npy_cfloat *)ip;
+    }
+    return (npy_bool) ((value.real != 0) || (value.imag != 0));
+}
+
+#line 2788
+/*
+ * Truth test for the npy_cdouble element stored at `ip`:
+ * returns 1 when the real or the imaginary part differs from zero.
+ */
+static npy_bool
+CDOUBLE_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_cdouble value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /* not well-behaved: let copyswap deliver a native-order copy */
+        PyArray_DESCR(ap)->f->copyswap(&value, ip, PyArray_ISBYTESWAPPED(ap),
+                                       ap);
+    }
+    else {
+        value = *(npy_cdouble *)ip;
+    }
+    return (npy_bool) ((value.real != 0) || (value.imag != 0));
+}
+
+#line 2788
+/*
+ * Truth test for the npy_clongdouble element stored at `ip`:
+ * returns 1 when the real or the imaginary part differs from zero.
+ */
+static npy_bool
+CLONGDOUBLE_nonzero (char *ip, PyArrayObject *ap)
+{
+    npy_clongdouble value;
+
+    if (ap != NULL && !PyArray_ISBEHAVED_RO(ap)) {
+        /* not well-behaved: let copyswap deliver a native-order copy */
+        PyArray_DESCR(ap)->f->copyswap(&value, ip, PyArray_ISBYTESWAPPED(ap),
+                                       ap);
+    }
+    else {
+        value = *(npy_clongdouble *)ip;
+    }
+    return (npy_bool) ((value.real != 0) || (value.imag != 0));
+}
+
+
+
+#define WHITESPACE " \t\n\r\v\f"
+#define WHITELEN 6
+
+/*
+ * Return NPY_TRUE when `ch` is one of the WHITELEN characters listed in
+ * WHITESPACE, NPY_FALSE otherwise.
+ */
+static npy_bool
+Py_STRING_ISSPACE(char ch)
+{
+    static const char blanks[] = WHITESPACE;
+
+    /* search only the listed characters, not the terminating NUL */
+    if (memchr(blanks, ch, WHITELEN) != NULL) {
+        return NPY_TRUE;
+    }
+    return NPY_FALSE;
+}
+
+/*
+ * Truth test for a fixed-width byte string of elsize bytes at `ip`.
+ *
+ * The string is true when it holds a character that is neither NUL nor
+ * whitespace, or any non-NUL character located after a NUL.
+ */
+static npy_bool
+STRING_nonzero (char *ip, PyArrayObject *ap)
+{
+    const int nbytes = PyArray_DESCR(ap)->elsize;
+    npy_bool past_null = NPY_FALSE;
+    int k;
+
+    for (k = 0; k < nbytes; k++) {
+        const char ch = ip[k];
+
+        if (ch == '\0') {
+            past_null = NPY_TRUE;
+            continue;
+        }
+        if (past_null || !Py_STRING_ISSPACE(ch)) {
+            return NPY_TRUE;
+        }
+    }
+    return NPY_FALSE;
+}
+
+/*
+ * Truth test for a fixed-width UCS4 string of elsize bytes at `ip`.
+ *
+ * The string is true when it holds a code point that is neither NUL nor
+ * whitespace, or any non-NUL code point located after a NUL.  Byte-swapped
+ * or unaligned data is first copied into a scratch buffer (and swapped if
+ * required); if that allocation fails the result is NPY_FALSE.
+ */
+static npy_bool
+UNICODE_nonzero (npy_ucs4 *ip, PyArrayObject *ap)
+{
+    const int nchars = PyArray_DESCR(ap)->elsize >> 2;
+    npy_bool result = NPY_FALSE;
+    npy_bool past_null = NPY_FALSE;
+    char *scratch = NULL;
+    int k;
+
+    if (PyArray_ISBYTESWAPPED(ap) || !PyArray_ISALIGNED(ap)) {
+        scratch = PyArray_malloc(PyArray_DESCR(ap)->elsize);
+        if (scratch == NULL) {
+            return result;
+        }
+        memcpy(scratch, ip, PyArray_DESCR(ap)->elsize);
+        if (PyArray_ISBYTESWAPPED(ap)) {
+            byte_swap_vector(scratch, nchars, 4);
+        }
+        ip = (npy_ucs4 *)scratch;
+    }
+
+    for (k = 0; k < nchars && !result; k++) {
+        const npy_ucs4 ch = ip[k];
+
+        if (ch == '\0') {
+            past_null = NPY_TRUE;
+        }
+        else if (past_null || !Py_UNICODE_ISSPACE(ch)) {
+            result = NPY_TRUE;
+        }
+    }
+    /* scratch is still NULL when no copy was needed */
+    PyArray_free(scratch);
+    return result;
+}
+
+/*
+ * Truth test for the object pointer stored at `ip`.
+ *
+ * A NULL pointer counts as false; otherwise the result of
+ * PyObject_IsTrue() is cast to npy_bool.
+ */
+static npy_bool
+OBJECT_nonzero (PyObject **ip, PyArrayObject *ap)
+{
+    PyObject *obj;
+
+    if (PyArray_ISALIGNED(ap)) {
+        obj = *ip;
+    }
+    else {
+        /* unaligned slot: fetch the pointer bytes with memcpy */
+        memcpy(&obj, (void *)ip, sizeof(obj));
+    }
+
+    if (obj == NULL) {
+        return NPY_FALSE;
+    }
+    return (npy_bool) PyObject_IsTrue(obj);
+}
+
+/*
+ * Truth test for a void element at `ip`.
+ *
+ * If the dtype has fields, the element is nonzero as soon as ANY one
+ * field is nonzero according to that field's own nonzero function.
+ * Title keys are skipped, and a field that fails to unpack is skipped
+ * after clearing the error.  Without fields, the element is nonzero if
+ * any of its elsize bytes is nonzero.
+ */
+static npy_bool
+VOID_nonzero (char *ip, PyArrayObject *ap)
+{
+    int i;
+    int len;
+    npy_bool nonz = NPY_FALSE;
+
+    if (PyArray_HASFIELDS(ap)) {
+        PyArray_Descr *descr;
+        PyObject *key, *value;
+        Py_ssize_t pos = 0;
+        PyArrayObject_fields dummy_fields = get_dummy_stack_array(ap);
+        PyArrayObject *dummy_arr = (PyArrayObject *)&dummy_fields;
+
+        descr = PyArray_DESCR(ap);
+        while (PyDict_Next(descr->fields, &pos, &key, &value)) {
+            PyArray_Descr * new;
+            npy_intp offset;
+            if (NPY_TITLE_KEY(key, value)) {
+                continue;
+            }
+            if (_unpack_field(value, &new, &offset) < 0) {
+                PyErr_Clear();
+                continue;
+            }
+
+            dummy_fields.descr = new;  /* the field's nonzero sees its own dtype */
+            if ((new->alignment > 1) && !NPY__ALIGNED(ip + offset,
+                        new->alignment)) {
+                PyArray_CLEARFLAGS(dummy_arr, NPY_ARRAY_ALIGNED);
+            }
+            else {
+                PyArray_ENABLEFLAGS(dummy_arr, NPY_ARRAY_ALIGNED);
+            }
+            if (new->f->nonzero(ip+offset, dummy_arr)) {
+                nonz = NPY_TRUE;
+                break;
+            }
+        }
+        return nonz;
+    }
+    len = PyArray_DESCR(ap)->elsize;  /* no fields: scan the raw bytes */
+    for (i = 0; i < len; i++) {
+        if (*ip != '\0') {
+            nonz = NPY_TRUE;
+            break;
+        }
+        ip++;
+    }
+    return nonz;
+}
+
+#undef NPY__ALIGNED
+
+
+/*
+ *****************************************************************************
+ **                                 COMPARE                                 **
+ *****************************************************************************
+ */
+
+
+/* boolean type */
+
+/*
+ * Three-way comparison of two booleans by truth value (false < true).
+ * Returns -1, 0 or 1.
+ */
+static int
+BOOL_compare(npy_bool *ip1, npy_bool *ip2, PyArrayObject *NPY_UNUSED(ap))
+{
+    const int lhs = (*ip1 != 0);
+    const int rhs = (*ip2 != 0);
+
+    return lhs - rhs;
+}
+
+
+/* integer types */
+
+#line 2982
+
+/* Three-way comparison of two npy_byte values: returns -1, 0 or 1. */
+static int
+BYTE_compare (npy_byte *pa, npy_byte *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_byte lhs = *pa;
+    const npy_byte rhs = *pb;
+
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+
+#line 2982
+
+/* Three-way comparison of two npy_ubyte values: returns -1, 0 or 1. */
+static int
+UBYTE_compare (npy_ubyte *pa, npy_ubyte *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_ubyte lhs = *pa;
+    const npy_ubyte rhs = *pb;
+
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+
+#line 2982
+
+/* Three-way comparison of two npy_short values: returns -1, 0 or 1. */
+static int
+SHORT_compare (npy_short *pa, npy_short *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_short lhs = *pa;
+    const npy_short rhs = *pb;
+
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+
+#line 2982
+
+/* Three-way comparison of two npy_ushort values: returns -1, 0 or 1. */
+static int
+USHORT_compare (npy_ushort *pa, npy_ushort *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_ushort lhs = *pa;
+    const npy_ushort rhs = *pb;
+
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+
+#line 2982
+
+/* Three-way comparison of two npy_int values: returns -1, 0 or 1. */
+static int
+INT_compare (npy_int *pa, npy_int *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_int lhs = *pa;
+    const npy_int rhs = *pb;
+
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+
+#line 2982
+
+/* Three-way comparison of two npy_uint values: returns -1, 0 or 1. */
+static int
+UINT_compare (npy_uint *pa, npy_uint *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_uint lhs = *pa;
+    const npy_uint rhs = *pb;
+
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+
+#line 2982
+
+/* Three-way comparison of two npy_long values: returns -1, 0 or 1. */
+static int
+LONG_compare (npy_long *pa, npy_long *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_long lhs = *pa;
+    const npy_long rhs = *pb;
+
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+
+#line 2982
+
+/* Three-way comparison of two npy_ulong values: returns -1, 0 or 1. */
+static int
+ULONG_compare (npy_ulong *pa, npy_ulong *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_ulong lhs = *pa;
+    const npy_ulong rhs = *pb;
+
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+
+#line 2982
+
+/* Three-way comparison of two npy_longlong values: returns -1, 0 or 1. */
+static int
+LONGLONG_compare (npy_longlong *pa, npy_longlong *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_longlong lhs = *pa;
+    const npy_longlong rhs = *pb;
+
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+
+#line 2982
+
+/* Three-way comparison of two npy_ulonglong values: returns -1, 0 or 1. */
+static int
+ULONGLONG_compare (npy_ulonglong *pa, npy_ulonglong *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_ulonglong lhs = *pa;
+    const npy_ulonglong rhs = *pb;
+
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+
+
+
+/* float types */
+
+/*
+ * The real/complex comparison functions are compatible with the new sort
+ * order for nans introduced in numpy 1.4.0. All nan values now compare
+ * larger than non-nan values and are sorted to the end. The comparison
+ * order is:
+ *
+ *      Real: [R, nan]
+ *      Complex: [R + Rj, R + nanj, nan + Rj, nan + nanj]
+ *
+ *  where complex values with the same nan placements are sorted according
+ *  to the non-nan part if it exists. If both the real and imaginary parts
+ *  of complex types are non-nan the order is the same as the real parts
+ *  unless they happen to be equal, in which case the order is that of the
+ *  imaginary parts.
+ */
+
+#line 3018
+
+#define LT(a,b) ((a) < (b) || ((b) != (b) && (a) ==(a)))
+
+/*
+ * Three-way comparison of two npy_float values using the NaN-aware LT
+ * macro, under which a NaN is not less than anything and every non-NaN
+ * value is less than a NaN.  Returns -1, 0 or 1.
+ */
+static int
+FLOAT_compare(npy_float *pa, npy_float *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_float lhs = *pa;
+    const npy_float rhs = *pb;
+
+    if (LT(lhs, rhs)) {
+        return -1;
+    }
+    return LT(rhs, lhs) ? 1 : 0;
+}
+
+
+/*
+ * Three-way comparison of two complex values given as (real, imag) pairs
+ * of npy_float.  Real parts are compared first; the placement of NaNs in
+ * either component decides the remaining cases.  Returns -1, 0 or 1.
+ */
+static int
+CFLOAT_compare(npy_float *pa, npy_float *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_float a_re = pa[0];
+    const npy_float a_im = pa[1];
+    const npy_float b_re = pb[0];
+    const npy_float b_im = pb[1];
+
+    if (a_re < b_re) {
+        /* a wins unless only a has a NaN imaginary part */
+        return (a_im == a_im || b_im != b_im) ? -1 : 1;
+    }
+    if (b_re < a_re) {
+        /* b wins unless only b has a NaN imaginary part */
+        return (b_im == b_im || a_im != a_im) ? 1 : -1;
+    }
+    if (a_re == b_re || (a_re != a_re && b_re != b_re)) {
+        /* real parts tie (equal, or both NaN): order by imaginary part */
+        if (LT(a_im, b_im)) {
+            return -1;
+        }
+        return LT(b_im, a_im) ? 1 : 0;
+    }
+    /* exactly one real part is NaN; the NaN side is the larger one */
+    return (a_re == a_re) ? -1 : 1;
+}
+
+#undef LT
+
+
+#line 3018
+
+#define LT(a,b) ((a) < (b) || ((b) != (b) && (a) ==(a)))
+
+/*
+ * Three-way comparison of two npy_double values using the NaN-aware LT
+ * macro, under which a NaN is not less than anything and every non-NaN
+ * value is less than a NaN.  Returns -1, 0 or 1.
+ */
+static int
+DOUBLE_compare(npy_double *pa, npy_double *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_double lhs = *pa;
+    const npy_double rhs = *pb;
+
+    if (LT(lhs, rhs)) {
+        return -1;
+    }
+    return LT(rhs, lhs) ? 1 : 0;
+}
+
+
+/*
+ * Three-way comparison of two complex values given as (real, imag) pairs
+ * of npy_double.  Real parts are compared first; the placement of NaNs in
+ * either component decides the remaining cases.  Returns -1, 0 or 1.
+ */
+static int
+CDOUBLE_compare(npy_double *pa, npy_double *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_double a_re = pa[0];
+    const npy_double a_im = pa[1];
+    const npy_double b_re = pb[0];
+    const npy_double b_im = pb[1];
+
+    if (a_re < b_re) {
+        /* a wins unless only a has a NaN imaginary part */
+        return (a_im == a_im || b_im != b_im) ? -1 : 1;
+    }
+    if (b_re < a_re) {
+        /* b wins unless only b has a NaN imaginary part */
+        return (b_im == b_im || a_im != a_im) ? 1 : -1;
+    }
+    if (a_re == b_re || (a_re != a_re && b_re != b_re)) {
+        /* real parts tie (equal, or both NaN): order by imaginary part */
+        if (LT(a_im, b_im)) {
+            return -1;
+        }
+        return LT(b_im, a_im) ? 1 : 0;
+    }
+    /* exactly one real part is NaN; the NaN side is the larger one */
+    return (a_re == a_re) ? -1 : 1;
+}
+
+#undef LT
+
+
+#line 3018
+
+#define LT(a,b) ((a) < (b) || ((b) != (b) && (a) ==(a)))
+
+/*
+ * Three-way comparison of two npy_longdouble values using the NaN-aware
+ * LT macro, under which a NaN is not less than anything and every
+ * non-NaN value is less than a NaN.  Returns -1, 0 or 1.
+ */
+static int
+LONGDOUBLE_compare(npy_longdouble *pa, npy_longdouble *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_longdouble lhs = *pa;
+    const npy_longdouble rhs = *pb;
+
+    if (LT(lhs, rhs)) {
+        return -1;
+    }
+    return LT(rhs, lhs) ? 1 : 0;
+}
+
+
+/*
+ * Three-way comparison of two complex values given as (real, imag) pairs
+ * of npy_longdouble.  Real parts are compared first; the placement of
+ * NaNs in either component decides the remaining cases.
+ * Returns -1, 0 or 1.
+ */
+static int
+CLONGDOUBLE_compare(npy_longdouble *pa, npy_longdouble *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_longdouble a_re = pa[0];
+    const npy_longdouble a_im = pa[1];
+    const npy_longdouble b_re = pb[0];
+    const npy_longdouble b_im = pb[1];
+
+    if (a_re < b_re) {
+        /* a wins unless only a has a NaN imaginary part */
+        return (a_im == a_im || b_im != b_im) ? -1 : 1;
+    }
+    if (b_re < a_re) {
+        /* b wins unless only b has a NaN imaginary part */
+        return (b_im == b_im || a_im != a_im) ? 1 : -1;
+    }
+    if (a_re == b_re || (a_re != a_re && b_re != b_re)) {
+        /* real parts tie (equal, or both NaN): order by imaginary part */
+        if (LT(a_im, b_im)) {
+            return -1;
+        }
+        return LT(b_im, a_im) ? 1 : 0;
+    }
+    /* exactly one real part is NaN; the NaN side is the larger one */
+    return (a_re == a_re) ? -1 : 1;
+}
+
+#undef LT
+
+
+
+#line 3095
+
+/*
+ * Three-way comparison of two npy_datetime values.  NPY_DATETIME_NAT
+ * compares greater than every other value and equal to itself.
+ * Returns -1, 0 or 1.
+ */
+static int
+DATETIME_compare(npy_datetime *pa, npy_datetime *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_datetime lhs = *pa;
+    const npy_datetime rhs = *pb;
+    const int lhs_nat = (lhs == NPY_DATETIME_NAT);
+    const int rhs_nat = (rhs == NPY_DATETIME_NAT);
+
+    if (lhs_nat || rhs_nat) {
+        /* 1 if only lhs is NaT, -1 if only rhs is NaT, 0 if both are */
+        return lhs_nat - rhs_nat;
+    }
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+
+#line 3095
+
+/*
+ * Three-way comparison of two npy_timedelta values.  NPY_DATETIME_NAT
+ * compares greater than every other value and equal to itself.
+ * Returns -1, 0 or 1.
+ */
+static int
+TIMEDELTA_compare(npy_timedelta *pa, npy_timedelta *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_timedelta lhs = *pa;
+    const npy_timedelta rhs = *pb;
+    const int lhs_nat = (lhs == NPY_DATETIME_NAT);
+    const int rhs_nat = (rhs == NPY_DATETIME_NAT);
+
+    if (lhs_nat || rhs_nat) {
+        /* 1 if only lhs is NaT, -1 if only rhs is NaT, 0 if both are */
+        return lhs_nat - rhs_nat;
+    }
+    return (lhs > rhs) - (lhs < rhs);
+}
+
+
+
+/*
+ * Three-way comparison of two half-precision values.
+ *
+ * NaN compares greater than every non-NaN value and equal to another
+ * NaN, i.e. the [R, nan] order that FLOAT_compare, DOUBLE_compare and
+ * LONGDOUBLE_compare in this file implement.  (The previous version
+ * ordered NaN before all other values, contradicting that order.)
+ *
+ * Returns -1 if *pa sorts before *pb, 1 if it sorts after, 0 otherwise.
+ */
+static int
+HALF_compare (npy_half *pa, npy_half *pb, PyArrayObject *NPY_UNUSED(ap))
+{
+    const npy_half a = *pa, b = *pb;
+    const npy_bool a_isnan = npy_half_isnan(a);
+    const npy_bool b_isnan = npy_half_isnan(b);
+    int ret;
+
+    if (a_isnan) {
+        /* a NaN is never smaller: tie with another NaN, else larger */
+        ret = b_isnan ? 0 : 1;
+    }
+    else if (b_isnan) {
+        /* a is an ordinary value, b is NaN: a sorts first */
+        ret = -1;
+    }
+    else if (npy_half_lt_nonan(a, b)) {
+        ret = -1;
+    }
+    else if (npy_half_lt_nonan(b, a)) {
+        ret = 1;
+    }
+    else {
+        ret = 0;
+    }
+
+    return ret;
+}
+
+
+/* object type */
+
+/*
+ * Three-way comparison of two object slots via Python rich comparison.
+ *
+ * Returns -1 if *ip1 < *ip2, 1 if *ip1 > *ip2 and 0 otherwise.  0 is also
+ * returned when a Python error is already pending or the `<` comparison
+ * raises, because the caller cannot abort part-way (see gh-3879).
+ *
+ * NULL slots: two NULLs compare equal and a NULL sorts before any
+ * non-NULL object.  The previous code compared the slot addresses
+ * (ip1/ip2) instead of the stored pointers, after both had already been
+ * dereferenced, so its -1 branch was unreachable and the result was
+ * always 1, whichever side held the NULL.
+ */
+static int
+OBJECT_compare(PyObject **ip1, PyObject **ip2, PyArrayObject *NPY_UNUSED(ap))
+{
+    /*
+     * ALIGNMENT NOTE: It seems that PyArray_Sort is already handling
+     * the alignment of pointers, so it doesn't need to be handled
+     * here.
+     */
+
+    int ret;
+    /*
+     * work around gh-3879, we cannot abort an in-progress quicksort
+     * so at least do not raise again
+     */
+    if (PyErr_Occurred()) {
+        return 0;
+    }
+    if ((*ip1 == NULL) || (*ip2 == NULL)) {
+        if (*ip1 == *ip2) {
+            /* both slots are NULL */
+            return 0;
+        }
+        /* exactly one slot is NULL; the NULL side sorts first */
+        return (*ip1 == NULL) ? -1 : 1;
+    }
+
+    ret = PyObject_RichCompareBool(*ip1, *ip2, Py_LT);
+    if (ret < 0) {
+        /* error occurred, avoid the next call to PyObject_RichCompareBool */
+        return 0;
+    }
+    if (ret == 1) {
+        return -1;
+    }
+    else if (PyObject_RichCompareBool(*ip1, *ip2, Py_GT) == 1) {
+        return 1;
+    }
+    else {
+        return 0;
+    }
+}
+
+
+/* string type */
+
+/*
+ * Three-way comparison of two fixed-width byte strings of elsize bytes,
+ * compared as unsigned bytes with memcmp.  Returns -1, 0 or 1.
+ */
+static int
+STRING_compare(char *ip1, char *ip2, PyArrayObject *ap)
+{
+    const size_t nbytes = PyArray_DESCR(ap)->elsize;
+    const int order = memcmp((const unsigned char *)ip1,
+                             (const unsigned char *)ip2, nbytes);
+
+    /* collapse memcmp's arbitrary magnitude to -1 / 0 / 1 */
+    return (order > 0) - (order < 0);
+}
+
+
+/* unicode type */
+
+/*
+ * Three-way comparison of two fixed-width UCS4 strings, code point by
+ * code point over elsize / sizeof(npy_ucs4) positions.  Returns -1 or 1
+ * at the first difference and 0 if there is none (or elsize is negative).
+ */
+static int
+UNICODE_compare(npy_ucs4 *ip1, npy_ucs4 *ip2,
+                PyArrayObject *ap)
+{
+    int nchars = PyArray_DESCR(ap)->elsize;
+    int k;
+
+    if (nchars < 0) {
+        return 0;
+    }
+    nchars /= sizeof(npy_ucs4);
+    for (k = 0; k < nchars; k++) {
+        const npy_ucs4 c1 = ip1[k];
+        const npy_ucs4 c2 = ip2[k];
+
+        if (c1 != c2) {
+            return (c1 < c2) ? -1 : 1;
+        }
+    }
+    return 0;
+}
+
+
+/* void type */
+
+/*
+ * Three-way comparison of two void elements.
+ *
+ * Without fields the elements are compared with STRING_compare.
+ * If fields are defined, then compare on first field and if equal
+ * compare on second field.  Continue, in the order of descr->names,
+ * until done or a comparison reports the fields as not equal; that
+ * field's result is returned.
+ *
+ * Data passed on to sub-comparisons must be aligned, and it must be
+ * byte-swapped when the field's dtype is byte-swapped.  In either case
+ * the field is copied into a temporary buffer obtained from the current
+ * data memory handler, which is released right after the comparison.
+ *
+ * If fetching the handler, unpacking a field or allocating a buffer
+ * fails, the function jumps to `finish` and returns `res`, which is
+ * still 0 at that point.
+ * NOTE(review): whether a Python error is left set on those paths
+ * depends on the failing helper - confirm before relying on it.
+ */
+static int
+VOID_compare(char *ip1, char *ip2, PyArrayObject *ap)
+{
+    PyArray_Descr *descr;
+    PyObject *names, *key;
+    PyObject *tup;
+    PyArrayObject_fields dummy_struct;
+    PyArrayObject *dummy = (PyArrayObject *)&dummy_struct;
+    char *nip1, *nip2;
+    int i, res = 0, swap = 0;
+
+    if (!PyArray_HASFIELDS(ap)) {
+        return STRING_compare(ip1, ip2, ap);
+    }
+    PyObject *mem_handler = PyDataMem_GetHandler();
+    if (mem_handler == NULL) {
+        goto finish;
+    }
+    descr = PyArray_DESCR(ap);
+    /*
+     * Compare on the first-field.  If equal, then
+     * compare on the second-field, etc.
+     */
+    names = descr->names;
+    for (i = 0; i < PyTuple_GET_SIZE(names); i++) {
+        PyArray_Descr *new;
+        npy_intp offset;
+        key = PyTuple_GET_ITEM(names, i);
+        tup = PyDict_GetItem(descr->fields, key);
+        if (_unpack_field(tup, &new, &offset) < 0) {
+            goto finish;
+        }
+        /*
+         * Set the fields needed by compare or copyswap.
+         * NOTE(review): only descr is ever assigned on dummy_struct;
+         * this assumes the sub-functions read nothing else - confirm.
+         */
+        dummy_struct.descr = new;
+
+        swap = PyArray_ISBYTESWAPPED(dummy);
+        nip1 = ip1 + offset;
+        nip2 = ip2 + offset;
+        if (swap || new->alignment > 1) {
+            if (swap || !npy_is_aligned(nip1, new->alignment)) {
+                /*
+                 * create temporary buffer and copy,
+                 * always use the current handler for internal allocations
+                 */
+                nip1 = PyDataMem_UserNEW(new->elsize, mem_handler);
+                if (nip1 == NULL) {
+                    goto finish;
+                }
+                memcpy(nip1, ip1 + offset, new->elsize);
+                if (swap)
+                    new->f->copyswap(nip1, NULL, swap, dummy);
+            }
+            if (swap || !npy_is_aligned(nip2, new->alignment)) {
+                /*
+                 * create temporary buffer and copy,
+                 * always use the current handler for internal allocations
+                 */
+                nip2 = PyDataMem_UserNEW(new->elsize, mem_handler);
+                if (nip2 == NULL) {
+                    if (nip1 != ip1 + offset) {
+                        /* destroy temporary buffer */
+                        PyDataMem_UserFREE(nip1, new->elsize, mem_handler);
+                    }
+                    goto finish;
+                }
+                memcpy(nip2, ip2 + offset, new->elsize);
+                if (swap)
+                    new->f->copyswap(nip2, NULL, swap, dummy);
+            }
+        }
+        res = new->f->compare(nip1, nip2, dummy);
+        if (swap || new->alignment > 1) {
+            if (nip1 != ip1 + offset) {
+                /* destroy temporary buffer */
+                PyDataMem_UserFREE(nip1, new->elsize, mem_handler);
+            }
+            if (nip2 != ip2 + offset) {
+                /* destroy temporary buffer */
+                PyDataMem_UserFREE(nip2, new->elsize, mem_handler);
+            }
+        }
+        if (res != 0) {
+            break;
+        }
+    }
+
+finish:
+    Py_XDECREF(mem_handler);
+    return res;
+}
+
+
+/*
+ *****************************************************************************
+ **                                 ARGFUNC                                 **
+ *****************************************************************************
+ */
+
+#define _LESS_THAN_OR_EQUAL(a,b) ((a) <= (b))  /* plain (a) <= (b); used by the argmax/argmin loops below and #undef'd after them */
+
+#line 3365
+/*
+ * Store in *max_ind the index of the first maximal element among the n
+ * half-precision values starting at ip.  The first NaN met wins and ends
+ * the scan.  ip[0] is read unconditionally.  Always returns 0.
+ */
+static int
+HALF_argmax(npy_half *ip, npy_intp n, npy_intp *max_ind,
+        PyArrayObject *NPY_UNUSED(aip))
+{
+    npy_half best = ip[0];
+    npy_intp k;
+
+    *max_ind = 0;
+    if (npy_half_isnan(best)) {
+        /* a leading NaN is already the answer */
+        return 0;
+    }
+
+    for (k = 1; k < n; ++k) {
+        const npy_half cur = ip[k];
+
+        /* "not (cur <= best)" rather than "cur > best" so that NaN is picked up */
+        if (npy_half_le(cur, best)) {
+            continue;
+        }
+        best = cur;
+        *max_ind = k;
+        if (npy_half_isnan(cur)) {
+            /* NaN propagates like in max(); nothing can replace it */
+            break;
+        }
+    }
+    return 0;
+}
+
+
+#line 3365
+/*
+ * Store in *max_ind the index of the first maximal complex value among n
+ * interleaved (real, imag) float pairs starting at ip.  Ordering is
+ * lexical: real part first, imaginary part breaks ties.  The first element
+ * with a NaN component wins and ends the scan.  Always returns 0.
+ */
+static int
+CFLOAT_argmax(npy_float *ip, npy_intp n, npy_intp *max_ind,
+        PyArrayObject *NPY_UNUSED(aip))
+{
+    npy_float best_re = ip[0];
+    npy_float best_im = ip[1];
+    npy_intp k;
+
+    *max_ind = 0;
+    if (npy_isnan(best_re) || npy_isnan(best_im)) {
+        /* a leading NaN is already the answer */
+        return 0;
+    }
+
+    for (k = 1; k < n; ++k) {
+        const npy_float re = ip[2 * k];
+        const npy_float im = ip[2 * k + 1];
+
+        /* lexical "greater than", or any NaN component */
+        if ((re > best_re) || ((re == best_re) && (im > best_im))
+                || npy_isnan(re) || npy_isnan(im)) {
+            best_re = re;
+            best_im = im;
+            *max_ind = k;
+            if (npy_isnan(re) || npy_isnan(im)) {
+                /* NaN propagates like in max(); stop here */
+                break;
+            }
+        }
+    }
+    return 0;
+}
+
+
+#line 3365
+/*
+ * Store in *max_ind the index of the first maximal complex value among n
+ * interleaved (real, imag) double pairs starting at ip.  Ordering is
+ * lexical: real part first, imaginary part breaks ties.  The first element
+ * with a NaN component wins and ends the scan.  Always returns 0.
+ */
+static int
+CDOUBLE_argmax(npy_double *ip, npy_intp n, npy_intp *max_ind,
+        PyArrayObject *NPY_UNUSED(aip))
+{
+    npy_double best_re = ip[0];
+    npy_double best_im = ip[1];
+    npy_intp k;
+
+    *max_ind = 0;
+    if (npy_isnan(best_re) || npy_isnan(best_im)) {
+        /* a leading NaN is already the answer */
+        return 0;
+    }
+
+    for (k = 1; k < n; ++k) {
+        const npy_double re = ip[2 * k];
+        const npy_double im = ip[2 * k + 1];
+
+        /* lexical "greater than", or any NaN component */
+        if ((re > best_re) || ((re == best_re) && (im > best_im))
+                || npy_isnan(re) || npy_isnan(im)) {
+            best_re = re;
+            best_im = im;
+            *max_ind = k;
+            if (npy_isnan(re) || npy_isnan(im)) {
+                /* NaN propagates like in max(); stop here */
+                break;
+            }
+        }
+    }
+    return 0;
+}
+
+
+#line 3365
+/*
+ * Store in *max_ind the index of the first maximal complex value among n
+ * interleaved (real, imag) long double pairs starting at ip.  Ordering is
+ * lexical: real part first, imaginary part breaks ties.  The first element
+ * with a NaN component wins and ends the scan.  Always returns 0.
+ */
+static int
+CLONGDOUBLE_argmax(npy_longdouble *ip, npy_intp n, npy_intp *max_ind,
+        PyArrayObject *NPY_UNUSED(aip))
+{
+    npy_longdouble best_re = ip[0];
+    npy_longdouble best_im = ip[1];
+    npy_intp k;
+
+    *max_ind = 0;
+    if (npy_isnan(best_re) || npy_isnan(best_im)) {
+        /* a leading NaN is already the answer */
+        return 0;
+    }
+
+    for (k = 1; k < n; ++k) {
+        const npy_longdouble re = ip[2 * k];
+        const npy_longdouble im = ip[2 * k + 1];
+
+        /* lexical "greater than", or any NaN component */
+        if ((re > best_re) || ((re == best_re) && (im > best_im))
+                || npy_isnan(re) || npy_isnan(im)) {
+            best_re = re;
+            best_im = im;
+            *max_ind = k;
+            if (npy_isnan(re) || npy_isnan(im)) {
+                /* NaN propagates like in max(); stop here */
+                break;
+            }
+        }
+    }
+    return 0;
+}
+
+
+#line 3365
+/*
+ * Store in *max_ind the index of the first maximal element among the n
+ * datetime values starting at ip.  The first NPY_DATETIME_NAT met wins and
+ * ends the scan.  ip[0] is read unconditionally.  Always returns 0.
+ */
+static int
+DATETIME_argmax(npy_datetime *ip, npy_intp n, npy_intp *max_ind,
+        PyArrayObject *NPY_UNUSED(aip))
+{
+    npy_datetime best = ip[0];
+    npy_intp k;
+
+    *max_ind = 0;
+    if (best == NPY_DATETIME_NAT) {
+        /* a leading NaT is already the answer */
+        return 0;
+    }
+
+    for (k = 1; k < n; ++k) {
+        const npy_datetime cur = ip[k];
+
+        if (cur == NPY_DATETIME_NAT) {
+            /* NaT is treated as maximal; stop here */
+            *max_ind = k;
+            break;
+        }
+        if (_LESS_THAN_OR_EQUAL(cur, best)) {
+            continue;
+        }
+        best = cur;
+        *max_ind = k;
+    }
+    return 0;
+}
+
+
+#line 3365
+/*
+ * Store in *max_ind the index of the first maximal element among the n
+ * timedelta values starting at ip.  The first NPY_DATETIME_NAT met wins and
+ * ends the scan.  ip[0] is read unconditionally.  Always returns 0.
+ */
+static int
+TIMEDELTA_argmax(npy_timedelta *ip, npy_intp n, npy_intp *max_ind,
+        PyArrayObject *NPY_UNUSED(aip))
+{
+    npy_timedelta best = ip[0];
+    npy_intp k;
+
+    *max_ind = 0;
+    if (best == NPY_DATETIME_NAT) {
+        /* a leading NaT is already the answer */
+        return 0;
+    }
+
+    for (k = 1; k < n; ++k) {
+        const npy_timedelta cur = ip[k];
+
+        if (cur == NPY_DATETIME_NAT) {
+            /* NaT is treated as maximal; stop here */
+            *max_ind = k;
+            break;
+        }
+        if (_LESS_THAN_OR_EQUAL(cur, best)) {
+            continue;
+        }
+        best = cur;
+        *max_ind = k;
+    }
+    return 0;
+}
+
+
+
+/*
+ * Store in *min_ind the index of the first zero byte among the n booleans
+ * starting at ip, or 0 when no zero byte is present.  Always returns 0.
+ */
+static int
+BOOL_argmin(npy_bool *ip, npy_intp n, npy_intp *min_ind,
+            PyArrayObject *NPY_UNUSED(aip))
+
+{
+    /* the first false entry is the first zero byte */
+    npy_bool *first_zero = memchr(ip, 0, n * sizeof(*ip));
+
+    if (first_zero != NULL) {
+        *min_ind = first_zero - ip;
+    }
+    else {
+        *min_ind = 0;
+    }
+    return 0;
+}
+
+#line 3465
+/*
+ * Store in *min_ind the index of the first minimal element among the n
+ * half-precision values starting at ip.  The first NaN met wins and ends
+ * the scan.  ip[0] is read unconditionally.  Always returns 0.
+ */
+static int
+HALF_argmin(npy_half *ip, npy_intp n, npy_intp *min_ind,
+        PyArrayObject *NPY_UNUSED(aip))
+{
+    npy_half best = ip[0];
+    npy_intp k;
+
+    *min_ind = 0;
+    if (npy_half_isnan(best)) {
+        /* a leading NaN is already the answer */
+        return 0;
+    }
+
+    for (k = 1; k < n; ++k) {
+        const npy_half cur = ip[k];
+
+        /* "not (best <= cur)" rather than "cur < best" so that NaN is picked up */
+        if (npy_half_le(best, cur)) {
+            continue;
+        }
+        best = cur;
+        *min_ind = k;
+        if (npy_half_isnan(cur)) {
+            /* NaN propagates like in min(); nothing can replace it */
+            break;
+        }
+    }
+    return 0;
+}
+
+
+#line 3465
+/*
+ * Store in *min_ind the index of the first minimal complex value among n
+ * interleaved (real, imag) float pairs starting at ip.  Ordering is
+ * lexical: real part first, imaginary part breaks ties.  The first element
+ * with a NaN component wins and ends the scan.  Always returns 0.
+ */
+static int
+CFLOAT_argmin(npy_float *ip, npy_intp n, npy_intp *min_ind,
+        PyArrayObject *NPY_UNUSED(aip))
+{
+    npy_float best_re = ip[0];
+    npy_float best_im = ip[1];
+    npy_intp k;
+
+    *min_ind = 0;
+    if (npy_isnan(best_re) || npy_isnan(best_im)) {
+        /* a leading NaN is already the answer */
+        return 0;
+    }
+
+    for (k = 1; k < n; ++k) {
+        const npy_float re = ip[2 * k];
+        const npy_float im = ip[2 * k + 1];
+
+        /* lexical "less than", or any NaN component */
+        if ((best_re > re) || ((re == best_re) && (best_im > im))
+                || npy_isnan(re) || npy_isnan(im)) {
+            best_re = re;
+            best_im = im;
+            *min_ind = k;
+            if (npy_isnan(re) || npy_isnan(im)) {
+                /* NaN propagates like in min(); stop here */
+                break;
+            }
+        }
+    }
+    return 0;
+}
+
+
+#line 3465
+/*
+ * Store in *min_ind the index of the first minimal complex value among n
+ * interleaved (real, imag) double pairs starting at ip.  Ordering is
+ * lexical: real part first, imaginary part breaks ties.  The first element
+ * with a NaN component wins and ends the scan.  Always returns 0.
+ */
+static int
+CDOUBLE_argmin(npy_double *ip, npy_intp n, npy_intp *min_ind,
+        PyArrayObject *NPY_UNUSED(aip))
+{
+    npy_double best_re = ip[0];
+    npy_double best_im = ip[1];
+    npy_intp k;
+
+    *min_ind = 0;
+    if (npy_isnan(best_re) || npy_isnan(best_im)) {
+        /* a leading NaN is already the answer */
+        return 0;
+    }
+
+    for (k = 1; k < n; ++k) {
+        const npy_double re = ip[2 * k];
+        const npy_double im = ip[2 * k + 1];
+
+        /* lexical "less than", or any NaN component */
+        if ((best_re > re) || ((re == best_re) && (best_im > im))
+                || npy_isnan(re) || npy_isnan(im)) {
+            best_re = re;
+            best_im = im;
+            *min_ind = k;
+            if (npy_isnan(re) || npy_isnan(im)) {
+                /* NaN propagates like in min(); stop here */
+                break;
+            }
+        }
+    }
+    return 0;
+}
+
+
+#line 3465
+/*
+ * Store in *min_ind the index of the first minimal complex value among n
+ * interleaved (real, imag) long double pairs starting at ip.  Ordering is
+ * lexical: real part first, imaginary part breaks ties.  The first element
+ * with a NaN component wins and ends the scan.  Always returns 0.
+ */
+static int
+CLONGDOUBLE_argmin(npy_longdouble *ip, npy_intp n, npy_intp *min_ind,
+        PyArrayObject *NPY_UNUSED(aip))
+{
+    npy_longdouble best_re = ip[0];
+    npy_longdouble best_im = ip[1];
+    npy_intp k;
+
+    *min_ind = 0;
+    if (npy_isnan(best_re) || npy_isnan(best_im)) {
+        /* a leading NaN is already the answer */
+        return 0;
+    }
+
+    for (k = 1; k < n; ++k) {
+        const npy_longdouble re = ip[2 * k];
+        const npy_longdouble im = ip[2 * k + 1];
+
+        /* lexical "less than", or any NaN component */
+        if ((best_re > re) || ((re == best_re) && (best_im > im))
+                || npy_isnan(re) || npy_isnan(im)) {
+            best_re = re;
+            best_im = im;
+            *min_ind = k;
+            if (npy_isnan(re) || npy_isnan(im)) {
+                /* NaN propagates like in min(); stop here */
+                break;
+            }
+        }
+    }
+    return 0;
+}
+
+
+#line 3465
+/*
+ * Store in *min_ind the index of the first minimal element among the n
+ * datetime values starting at ip.  The first NPY_DATETIME_NAT met wins and
+ * ends the scan.  ip[0] is read unconditionally.  Always returns 0.
+ */
+static int
+DATETIME_argmin(npy_datetime *ip, npy_intp n, npy_intp *min_ind,
+        PyArrayObject *NPY_UNUSED(aip))
+{
+    npy_datetime best = ip[0];
+    npy_intp k;
+
+    *min_ind = 0;
+    if (best == NPY_DATETIME_NAT) {
+        /* a leading NaT is already the answer */
+        return 0;
+    }
+
+    for (k = 1; k < n; ++k) {
+        const npy_datetime cur = ip[k];
+
+        if (cur == NPY_DATETIME_NAT) {
+            /* NaT is treated as minimal; stop here */
+            *min_ind = k;
+            break;
+        }
+        if (_LESS_THAN_OR_EQUAL(best, cur)) {
+            continue;
+        }
+        best = cur;
+        *min_ind = k;
+    }
+    return 0;
+}
+
+
+#line 3465
+/*
+ * Store in *min_ind the index of the first minimal element among the n
+ * timedelta values starting at ip.  The first NPY_DATETIME_NAT met wins and
+ * ends the scan.  ip[0] is read unconditionally.  Always returns 0.
+ */
+static int
+TIMEDELTA_argmin(npy_timedelta *ip, npy_intp n, npy_intp *min_ind,
+        PyArrayObject *NPY_UNUSED(aip))
+{
+    npy_timedelta best = ip[0];
+    npy_intp k;
+
+    *min_ind = 0;
+    if (best == NPY_DATETIME_NAT) {
+        /* a leading NaT is already the answer */
+        return 0;
+    }
+
+    for (k = 1; k < n; ++k) {
+        const npy_timedelta cur = ip[k];
+
+        if (cur == NPY_DATETIME_NAT) {
+            /* NaT is treated as minimal; stop here */
+            *min_ind = k;
+            break;
+        }
+        if (_LESS_THAN_OR_EQUAL(best, cur)) {
+            continue;
+        }
+        best = cur;
+        *min_ind = k;
+    }
+    return 0;
+}
+
+
+
+#undef _LESS_THAN_OR_EQUAL
+
+/*
+ * Store in *max_ind the index of the first non-NULL object that compares
+ * greater (Py_GT) than every earlier non-NULL object in ip[0..n).  NULL
+ * slots are skipped; if all slots are NULL the index stays 0.  When a
+ * comparison fails the scan stops early, leaving *max_ind at the best index
+ * found so far.  Always returns 0.
+ */
+static int
+OBJECT_argmax(PyObject **ip, npy_intp n, npy_intp *max_ind,
+              PyArrayObject *NPY_UNUSED(aip))
+{
+    PyObject *best;
+    npy_intp k = 0;
+
+    *max_ind = 0;
+
+    /* advance to the first usable (non-NULL) entry */
+    while (k < n && ip[k] == NULL) {
+        ++k;
+    }
+    if (k >= n) {
+        return 0;
+    }
+
+    best = ip[k];
+    *max_ind = k;
+    for (++k; k < n; ++k) {
+        PyObject *candidate = ip[k];
+        int is_greater;
+
+        if (candidate == NULL) {
+            continue;
+        }
+        is_greater = PyObject_RichCompareBool(candidate, best, Py_GT);
+        if (is_greater < 0) {
+            /* comparison raised; give up with the current result */
+            return 0;
+        }
+        if (is_greater) {
+            best = candidate;
+            *max_ind = k;
+        }
+    }
+
+    return 0;
+}
+
+#line 3577
+/*
+ * Store in *max_ind the index of the first maximal element, as ordered by
+ * STRING_compare, among n fixed-width items of PyArray_DESCR(aip)->elsize
+ * bytes starting at ip.  Always returns 0.
+ *
+ * The running maximum is tracked as a pointer into the input rather than a
+ * heap copy.  That removes the allocation and, with it, the failure path
+ * that used to return without ever writing *max_ind.
+ */
+static int
+STRING_argmax(npy_char *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip)
+{
+    npy_intp i;
+    int elsize = PyArray_DESCR(aip)->elsize;
+    npy_char *mp = ip;  /* current maximum; points into the input buffer */
+
+    *max_ind = 0;
+    for (i = 1; i < n; i++) {
+        ip += elsize / sizeof(npy_char);
+        if (STRING_compare(ip, mp, aip) > 0) {
+            mp = ip;
+            *max_ind = i;
+        }
+    }
+    return 0;
+}
+
+
+#line 3577
+/*
+ * Store in *max_ind the index of the first maximal element, as ordered by
+ * UNICODE_compare, among n fixed-width items of PyArray_DESCR(aip)->elsize
+ * bytes starting at ip.  Always returns 0.
+ *
+ * The running maximum is tracked as a pointer into the input rather than a
+ * heap copy.  That removes the allocation and, with it, the failure path
+ * that used to return without ever writing *max_ind.
+ */
+static int
+UNICODE_argmax(npy_ucs4 *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip)
+{
+    npy_intp i;
+    int elsize = PyArray_DESCR(aip)->elsize;
+    npy_ucs4 *mp = ip;  /* current maximum; points into the input buffer */
+
+    *max_ind = 0;
+    for (i = 1; i < n; i++) {
+        ip += elsize / sizeof(npy_ucs4);
+        if (UNICODE_compare(ip, mp, aip) > 0) {
+            mp = ip;
+            *max_ind = i;
+        }
+    }
+    return 0;
+}
+
+
+
+#define VOID_argmax NULL  /* NULL placeholder; presumably means no argmax is registered for VOID -- TODO confirm */
+
+/*
+ * Store in *min_ind the index of the first non-NULL object that compares
+ * less (Py_LT) than every earlier non-NULL object in ip[0..n).  NULL slots
+ * are skipped; if all slots are NULL the index stays 0.  When a comparison
+ * fails the scan stops early, leaving *min_ind at the best index found so
+ * far.  Always returns 0.
+ */
+static int
+OBJECT_argmin(PyObject **ip, npy_intp n, npy_intp *min_ind,
+              PyArrayObject *NPY_UNUSED(aip))
+{
+    PyObject *best;
+    npy_intp k = 0;
+
+    *min_ind = 0;
+
+    /* advance to the first usable (non-NULL) entry */
+    while (k < n && ip[k] == NULL) {
+        ++k;
+    }
+    if (k >= n) {
+        return 0;
+    }
+
+    best = ip[k];
+    *min_ind = k;
+    for (++k; k < n; ++k) {
+        PyObject *candidate = ip[k];
+        int is_less;
+
+        if (candidate == NULL) {
+            continue;
+        }
+        is_less = PyObject_RichCompareBool(candidate, best, Py_LT);
+        if (is_less < 0) {
+            /* comparison raised; give up with the current result */
+            return 0;
+        }
+        if (is_less) {
+            best = candidate;
+            *min_ind = k;
+        }
+    }
+
+    return 0;
+}
+
+#line 3641
+/*
+ * Store in *min_ind the index of the first minimal element, as ordered by
+ * STRING_compare, among n fixed-width items of PyArray_DESCR(aip)->elsize
+ * bytes starting at ip.  Always returns 0.
+ *
+ * The running minimum is tracked as a pointer into the input rather than a
+ * heap copy.  That removes the allocation and, with it, the failure path
+ * that used to return without ever writing *min_ind.
+ */
+static int
+STRING_argmin(npy_char *ip, npy_intp n, npy_intp *min_ind, PyArrayObject *aip)
+{
+    npy_intp i;
+    int elsize = PyArray_DESCR(aip)->elsize;
+    npy_char *mp = ip;  /* current minimum; points into the input buffer */
+
+    *min_ind = 0;
+    for (i = 1; i < n; i++) {
+        ip += elsize / sizeof(npy_char);
+        if (STRING_compare(mp, ip, aip) > 0) {
+            mp = ip;
+            *min_ind = i;
+        }
+    }
+    return 0;
+}
+
+
+#line 3641
+/*
+ * Store in *min_ind the index of the first minimal element, as ordered by
+ * UNICODE_compare, among n fixed-width items of PyArray_DESCR(aip)->elsize
+ * bytes starting at ip.  Always returns 0.
+ *
+ * The running minimum is tracked as a pointer into the input rather than a
+ * heap copy.  That removes the allocation and, with it, the failure path
+ * that used to return without ever writing *min_ind.
+ */
+static int
+UNICODE_argmin(npy_ucs4 *ip, npy_intp n, npy_intp *min_ind, PyArrayObject *aip)
+{
+    npy_intp i;
+    int elsize = PyArray_DESCR(aip)->elsize;
+    npy_ucs4 *mp = ip;  /* current minimum; points into the input buffer */
+
+    *min_ind = 0;
+    for (i = 1; i < n; i++) {
+        ip += elsize / sizeof(npy_ucs4);
+        if (UNICODE_compare(mp, ip, aip) > 0) {
+            mp = ip;
+            *min_ind = i;
+        }
+    }
+    return 0;
+}
+
+
+
+
+#define VOID_argmin NULL  /* NULL placeholder; presumably means no argmin is registered for VOID -- TODO confirm */
+
+
+/*
+ *****************************************************************************
+ **                                  DOT                                    **
+ *****************************************************************************
+ */
+
+/*
+ * dot means inner product
+ */
+
+/************************** MAYBE USE CBLAS *********************************/
+
+
+#line 3687
+/*
+ * Inner product of two strided float vectors of length n; the result is
+ * written to op as npy_float.  is1 and is2 are byte strides.
+ *
+ * With HAVE_CBLAS, and when blas_stride() yields a non-zero stride for both
+ * inputs, the work is handed to cblas_sdot in chunks of at most
+ * NPY_CBLAS_CHUNK elements and the partial results are summed in a double.
+ * Otherwise a plain loop accumulates in npy_float.
+ */
+NPY_NO_EXPORT void
+FLOAT_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op,
+           npy_intp n, void *NPY_UNUSED(ignore))
+{
+#if defined(HAVE_CBLAS)
+    CBLAS_INT stride1 = blas_stride(is1, sizeof(npy_float));
+    CBLAS_INT stride2 = blas_stride(is2, sizeof(npy_float));
+
+    if (stride1 && stride2)
+    {
+        double total = 0.;  /* wider accumulator across chunks */
+
+        for (; n > 0; ) {
+            CBLAS_INT chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
+
+            total += CBLAS_FUNC(cblas_sdot)(chunk,
+                                     (npy_float *) ip1, stride1,
+                                     (npy_float *) ip2, stride2);
+            /* advance by byte strides, not BLAS element strides */
+            ip1 += chunk * is1;
+            ip2 += chunk * is2;
+            n -= chunk;
+        }
+        *((npy_float *)op) = (npy_float)total;
+    }
+    else
+#endif
+    {
+        npy_float acc = (npy_float)0;
+        npy_intp left = n;
+
+        while (left > 0) {
+            const npy_float a = *((npy_float *)ip1);
+            const npy_float b = *((npy_float *)ip2);
+
+            acc += a * b;
+            ip1 += is1;
+            ip2 += is2;
+            left--;
+        }
+        *((npy_float *)op) = acc;
+    }
+}
+
+#line 3687
+/*
+ * Inner product of two strided double vectors of length n; the result is
+ * written to op as npy_double.  is1 and is2 are byte strides.
+ *
+ * With HAVE_CBLAS, and when blas_stride() yields a non-zero stride for both
+ * inputs, the work is handed to cblas_ddot in chunks of at most
+ * NPY_CBLAS_CHUNK elements and the partial results are summed in a double.
+ * Otherwise a plain loop accumulates in npy_double.
+ */
+NPY_NO_EXPORT void
+DOUBLE_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op,
+           npy_intp n, void *NPY_UNUSED(ignore))
+{
+#if defined(HAVE_CBLAS)
+    CBLAS_INT stride1 = blas_stride(is1, sizeof(npy_double));
+    CBLAS_INT stride2 = blas_stride(is2, sizeof(npy_double));
+
+    if (stride1 && stride2)
+    {
+        double total = 0.;  /* accumulator across chunks */
+
+        for (; n > 0; ) {
+            CBLAS_INT chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
+
+            total += CBLAS_FUNC(cblas_ddot)(chunk,
+                                     (npy_double *) ip1, stride1,
+                                     (npy_double *) ip2, stride2);
+            /* advance by byte strides, not BLAS element strides */
+            ip1 += chunk * is1;
+            ip2 += chunk * is2;
+            n -= chunk;
+        }
+        *((npy_double *)op) = (npy_double)total;
+    }
+    else
+#endif
+    {
+        npy_double acc = (npy_double)0;
+        npy_intp left = n;
+
+        while (left > 0) {
+            const npy_double a = *((npy_double *)ip1);
+            const npy_double b = *((npy_double *)ip2);
+
+            acc += a * b;
+            ip1 += is1;
+            ip2 += is2;
+            left--;
+        }
+        *((npy_double *)op) = acc;
+    }
+}
+
+
+#line 3736
+/*
+ * Unconjugated inner product of two strided complex-float vectors of
+ * length n; real and imaginary parts are written to op as two npy_float.
+ * is1 and is2 are byte strides.
+ *
+ * With HAVE_CBLAS, and when blas_stride() yields a non-zero stride for both
+ * inputs, cblas_cdotu_sub is called in chunks of at most NPY_CBLAS_CHUNK
+ * elements and the partial results are summed in doubles.  Otherwise a
+ * plain loop accumulates in npy_float.
+ */
+NPY_NO_EXPORT void
+CFLOAT_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
+           char *op, npy_intp n, void *NPY_UNUSED(ignore))
+{
+#if defined(HAVE_CBLAS)
+    CBLAS_INT stride1 = blas_stride(is1, sizeof(npy_cfloat));
+    CBLAS_INT stride2 = blas_stride(is2, sizeof(npy_cfloat));
+
+    if (stride1 && stride2) {
+        /* wider accumulators across chunks */
+        double total_re = 0.;
+        double total_im = 0.;
+
+        for (; n > 0; ) {
+            CBLAS_INT chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
+            npy_float partial[2];
+
+            CBLAS_FUNC(cblas_cdotu_sub)(
+                    (CBLAS_INT)chunk, ip1, stride1, ip2, stride2, partial);
+            total_re += (double)partial[0];
+            total_im += (double)partial[1];
+            /* advance by byte strides, not BLAS element strides */
+            ip1 += chunk * is1;
+            ip2 += chunk * is2;
+            n -= chunk;
+        }
+        ((npy_float *)op)[0] = (npy_float)total_re;
+        ((npy_float *)op)[1] = (npy_float)total_im;
+    }
+    else
+#endif
+    {
+        npy_float acc_re = (npy_float)0.0;
+        npy_float acc_im = (npy_float)0.0;
+        npy_intp left = n;
+
+        while (left > 0) {
+            const npy_float ar = ((npy_float *)ip1)[0];
+            const npy_float ai = ((npy_float *)ip1)[1];
+            const npy_float br = ((npy_float *)ip2)[0];
+            const npy_float bi = ((npy_float *)ip2)[1];
+
+            /* (ar + i*ai) * (br + i*bi) */
+            acc_re += ar * br - ai * bi;
+            acc_im += ar * bi + ai * br;
+            ip1 += is1;
+            ip2 += is2;
+            left--;
+        }
+        ((npy_float *)op)[0] = acc_re;
+        ((npy_float *)op)[1] = acc_im;
+    }
+}
+
+
+#line 3736
+/*
+ * Unconjugated inner product of two strided complex-double vectors of
+ * length n; real and imaginary parts are written to op as two npy_double.
+ * is1 and is2 are byte strides.
+ *
+ * With HAVE_CBLAS, and when blas_stride() yields a non-zero stride for both
+ * inputs, cblas_zdotu_sub is called in chunks of at most NPY_CBLAS_CHUNK
+ * elements and the partial results are summed in doubles.  Otherwise a
+ * plain loop accumulates in npy_double.
+ */
+NPY_NO_EXPORT void
+CDOUBLE_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
+           char *op, npy_intp n, void *NPY_UNUSED(ignore))
+{
+#if defined(HAVE_CBLAS)
+    CBLAS_INT stride1 = blas_stride(is1, sizeof(npy_cdouble));
+    CBLAS_INT stride2 = blas_stride(is2, sizeof(npy_cdouble));
+
+    if (stride1 && stride2) {
+        /* accumulators across chunks */
+        double total_re = 0.;
+        double total_im = 0.;
+
+        for (; n > 0; ) {
+            CBLAS_INT chunk = n < NPY_CBLAS_CHUNK ? n : NPY_CBLAS_CHUNK;
+            npy_double partial[2];
+
+            CBLAS_FUNC(cblas_zdotu_sub)(
+                    (CBLAS_INT)chunk, ip1, stride1, ip2, stride2, partial);
+            total_re += (double)partial[0];
+            total_im += (double)partial[1];
+            /* advance by byte strides, not BLAS element strides */
+            ip1 += chunk * is1;
+            ip2 += chunk * is2;
+            n -= chunk;
+        }
+        ((npy_double *)op)[0] = (npy_double)total_re;
+        ((npy_double *)op)[1] = (npy_double)total_im;
+    }
+    else
+#endif
+    {
+        npy_double acc_re = (npy_double)0.0;
+        npy_double acc_im = (npy_double)0.0;
+        npy_intp left = n;
+
+        while (left > 0) {
+            const npy_double ar = ((npy_double *)ip1)[0];
+            const npy_double ai = ((npy_double *)ip1)[1];
+            const npy_double br = ((npy_double *)ip2)[0];
+            const npy_double bi = ((npy_double *)ip2)[1];
+
+            /* (ar + i*ai) * (br + i*bi) */
+            acc_re += ar * br - ai * bi;
+            acc_im += ar * bi + ai * br;
+            ip1 += is1;
+            ip2 += is2;
+            left--;
+        }
+        ((npy_double *)op)[0] = acc_re;
+        ((npy_double *)op)[1] = acc_im;
+    }
+}
+
+
+
+/**************************** NO CBLAS VERSIONS *****************************/
+
+/*
+ * Boolean inner product: writes NPY_TRUE to op if some position holds a
+ * non-zero value in both strided inputs, NPY_FALSE otherwise.  is1 and is2
+ * are byte strides; the scan stops at the first such position.
+ */
+static void
+BOOL_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+         void *NPY_UNUSED(ignore))
+{
+    npy_bool any_both = NPY_FALSE;
+    npy_intp k = 0;
+
+    while (k < n) {
+        const npy_bool a = *((npy_bool *)ip1);
+
+        if (a != 0 && *((npy_bool *)ip2) != 0) {
+            /* one hit decides the result */
+            any_both = NPY_TRUE;
+            break;
+        }
+        ip1 += is1;
+        ip2 += is2;
+        k++;
+    }
+    *((npy_bool *)op) = any_both;
+}
+
+/*
+ * `dot` does not make sense for times; for DATETIME it never worked.
+ * For TIMEDELTA it does (or did) work, but it should probably be removed too.
+ */
+#define DATETIME_dot NULL  /* no dot implementation is provided for DATETIME */
+
+#line 3822
+/*
+ * Inner product of two strided npy_byte vectors of length n.  Products are
+ * accumulated in npy_long and the total is cast to npy_byte when stored in
+ * op.  is1 and is2 are byte strides.
+ */
+static void
+BYTE_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+           void *NPY_UNUSED(ignore))
+{
+    npy_long acc = (npy_long)0;
+    npy_intp left = n;
+
+    while (left > 0) {
+        const npy_long a = (npy_long)(*((npy_byte *)ip1));
+        const npy_long b = (npy_long)(*((npy_byte *)ip2));
+
+        acc += a * b;
+        ip1 += is1;
+        ip2 += is2;
+        left--;
+    }
+    *((npy_byte *)op) = (npy_byte) acc;
+}
+
+#line 3822
+/*
+ * Inner product of two strided npy_ubyte vectors of length n.  Products are
+ * accumulated in npy_ulong and the total is cast to npy_ubyte when stored
+ * in op.  is1 and is2 are byte strides.
+ */
+static void
+UBYTE_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+           void *NPY_UNUSED(ignore))
+{
+    npy_ulong acc = (npy_ulong)0;
+    npy_intp left = n;
+
+    while (left > 0) {
+        const npy_ulong a = (npy_ulong)(*((npy_ubyte *)ip1));
+        const npy_ulong b = (npy_ulong)(*((npy_ubyte *)ip2));
+
+        acc += a * b;
+        ip1 += is1;
+        ip2 += is2;
+        left--;
+    }
+    *((npy_ubyte *)op) = (npy_ubyte) acc;
+}
+
+#line 3822
+/*
+ * Inner product of two strided npy_short vectors of length n.  Products are
+ * accumulated in npy_long and the total is cast to npy_short when stored in
+ * op.  is1 and is2 are byte strides.
+ */
+static void
+SHORT_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+           void *NPY_UNUSED(ignore))
+{
+    npy_long acc = (npy_long)0;
+    npy_intp left = n;
+
+    while (left > 0) {
+        const npy_long a = (npy_long)(*((npy_short *)ip1));
+        const npy_long b = (npy_long)(*((npy_short *)ip2));
+
+        acc += a * b;
+        ip1 += is1;
+        ip2 += is2;
+        left--;
+    }
+    *((npy_short *)op) = (npy_short) acc;
+}
+
+#line 3822
+/*
+ * Strided dot product of n npy_ushort values; ip1/ip2 advance by is1/is2
+ * bytes per element.  The sum is accumulated in npy_ulong and narrowed to
+ * npy_ushort when stored at op.
+ */
+static void
+USHORT_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+           void *NPY_UNUSED(ignore))
+{
+    npy_ulong acc = 0;
+    npy_intp left;
+
+    for (left = n; left > 0; left--) {
+        const npy_ulong lhs = *((npy_ushort *)ip1);
+        const npy_ulong rhs = *((npy_ushort *)ip2);
+
+        acc += lhs * rhs;
+        ip1 += is1;
+        ip2 += is2;
+    }
+    *((npy_ushort *)op) = (npy_ushort)acc;
+}
+
+#line 3822
+/*
+ * Strided dot product of n npy_int values; ip1/ip2 advance by is1/is2
+ * bytes per element.  The sum is accumulated in npy_long and narrowed to
+ * npy_int when stored at op.
+ */
+static void
+INT_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+           void *NPY_UNUSED(ignore))
+{
+    npy_long acc = 0;
+    npy_intp left;
+
+    for (left = n; left > 0; left--) {
+        const npy_long lhs = *((npy_int *)ip1);
+        const npy_long rhs = *((npy_int *)ip2);
+
+        acc += lhs * rhs;
+        ip1 += is1;
+        ip2 += is2;
+    }
+    *((npy_int *)op) = (npy_int)acc;
+}
+
+#line 3822
+/*
+ * Strided dot product of n npy_uint values; ip1/ip2 advance by is1/is2
+ * bytes per element.  The sum is accumulated in npy_ulong and narrowed to
+ * npy_uint when stored at op.
+ */
+static void
+UINT_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+           void *NPY_UNUSED(ignore))
+{
+    npy_ulong acc = 0;
+    npy_intp left;
+
+    for (left = n; left > 0; left--) {
+        const npy_ulong lhs = *((npy_uint *)ip1);
+        const npy_ulong rhs = *((npy_uint *)ip2);
+
+        acc += lhs * rhs;
+        ip1 += is1;
+        ip2 += is2;
+    }
+    *((npy_uint *)op) = (npy_uint)acc;
+}
+
+#line 3822
+/*
+ * Strided dot product of n npy_long values; ip1/ip2 advance by is1/is2
+ * bytes per element.  The sum is accumulated in npy_long and stored at op.
+ */
+static void
+LONG_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+           void *NPY_UNUSED(ignore))
+{
+    npy_long acc = 0;
+    npy_intp left;
+
+    for (left = n; left > 0; left--) {
+        const npy_long lhs = *((npy_long *)ip1);
+        const npy_long rhs = *((npy_long *)ip2);
+
+        acc += lhs * rhs;
+        ip1 += is1;
+        ip2 += is2;
+    }
+    *((npy_long *)op) = (npy_long)acc;
+}
+
+#line 3822
+/*
+ * Strided dot product of n npy_ulong values; ip1/ip2 advance by is1/is2
+ * bytes per element.  The sum is accumulated in npy_ulong and stored at op.
+ */
+static void
+ULONG_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+           void *NPY_UNUSED(ignore))
+{
+    npy_ulong acc = 0;
+    npy_intp left;
+
+    for (left = n; left > 0; left--) {
+        const npy_ulong lhs = *((npy_ulong *)ip1);
+        const npy_ulong rhs = *((npy_ulong *)ip2);
+
+        acc += lhs * rhs;
+        ip1 += is1;
+        ip2 += is2;
+    }
+    *((npy_ulong *)op) = (npy_ulong)acc;
+}
+
+#line 3822
+/*
+ * Strided dot product of n npy_longlong values; ip1/ip2 advance by is1/is2
+ * bytes per element.  The sum is accumulated in npy_longlong and stored
+ * at op.
+ */
+static void
+LONGLONG_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+           void *NPY_UNUSED(ignore))
+{
+    npy_longlong acc = 0;
+    npy_intp left;
+
+    for (left = n; left > 0; left--) {
+        const npy_longlong lhs = *((npy_longlong *)ip1);
+        const npy_longlong rhs = *((npy_longlong *)ip2);
+
+        acc += lhs * rhs;
+        ip1 += is1;
+        ip2 += is2;
+    }
+    *((npy_longlong *)op) = (npy_longlong)acc;
+}
+
+#line 3822
+/*
+ * Strided dot product of n npy_ulonglong values; ip1/ip2 advance by
+ * is1/is2 bytes per element.  The sum is accumulated in npy_ulonglong and
+ * stored at op.
+ */
+static void
+ULONGLONG_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+           void *NPY_UNUSED(ignore))
+{
+    npy_ulonglong acc = 0;
+    npy_intp left;
+
+    for (left = n; left > 0; left--) {
+        const npy_ulonglong lhs = *((npy_ulonglong *)ip1);
+        const npy_ulonglong rhs = *((npy_ulonglong *)ip2);
+
+        acc += lhs * rhs;
+        ip1 += is1;
+        ip2 += is2;
+    }
+    *((npy_ulonglong *)op) = (npy_ulonglong)acc;
+}
+
+#line 3822
+/*
+ * Strided dot product of n npy_longdouble values; ip1/ip2 advance by
+ * is1/is2 bytes per element.  The sum is accumulated in npy_longdouble and
+ * stored at op.
+ */
+static void
+LONGDOUBLE_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+           void *NPY_UNUSED(ignore))
+{
+    npy_longdouble acc = 0;
+    npy_intp left;
+
+    for (left = n; left > 0; left--) {
+        const npy_longdouble lhs = *((npy_longdouble *)ip1);
+        const npy_longdouble rhs = *((npy_longdouble *)ip2);
+
+        acc += lhs * rhs;
+        ip1 += is1;
+        ip2 += is2;
+    }
+    *((npy_longdouble *)op) = (npy_longdouble)acc;
+}
+
+#line 3822
+/*
+ * Strided dot product of n npy_timedelta values; ip1/ip2 advance by
+ * is1/is2 bytes per element.  The sum is accumulated in npy_timedelta and
+ * stored at op.
+ */
+static void
+TIMEDELTA_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+           void *NPY_UNUSED(ignore))
+{
+    npy_timedelta acc = 0;
+    npy_intp left;
+
+    for (left = n; left > 0; left--) {
+        const npy_timedelta lhs = *((npy_timedelta *)ip1);
+        const npy_timedelta rhs = *((npy_timedelta *)ip2);
+
+        acc += lhs * rhs;
+        ip1 += is1;
+        ip2 += is2;
+    }
+    *((npy_timedelta *)op) = (npy_timedelta)acc;
+}
+
+
+/*
+ * Strided dot product of n npy_half values.  Each operand is widened with
+ * npy_half_to_float, the sum is kept in a float, and the result is
+ * converted back with npy_float_to_half before being stored at op.
+ */
+static void
+HALF_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op,
+         npy_intp n, void *NPY_UNUSED(ignore))
+{
+    float acc = 0.0f;
+    npy_intp left;
+
+    for (left = n; left > 0; left--) {
+        const float lhs = npy_half_to_float(*((npy_half *)ip1));
+        const float rhs = npy_half_to_float(*((npy_half *)ip2));
+
+        acc += lhs * rhs;
+        ip1 += is1;
+        ip2 += is2;
+    }
+    *((npy_half *)op) = npy_float_to_half(acc);
+}
+
+/*
+ * Strided dot product of n complex long-double values.  Each element is
+ * read as two consecutive npy_longdouble values (real, imaginary); the
+ * complex products are summed without conjugation and the real and
+ * imaginary parts of the total are written to op[0] and op[1].
+ */
+static void
+CLONGDOUBLE_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2,
+                            char *op, npy_intp n, void *NPY_UNUSED(ignore))
+{
+    npy_longdouble *out = (npy_longdouble *)op;
+    npy_longdouble re = 0.0L;
+    npy_longdouble im = 0.0L;
+    npy_intp left;
+
+    for (left = n; left > 0; left--) {
+        const npy_longdouble *a = (npy_longdouble *)ip1;
+        const npy_longdouble *b = (npy_longdouble *)ip2;
+
+        /* (a0 + i*a1) * (b0 + i*b1) */
+        re += a[0] * b[0] - a[1] * b[1];
+        im += a[0] * b[1] + a[1] * b[0];
+        ip1 += is1;
+        ip2 += is2;
+    }
+    out[0] = re;
+    out[1] = im;
+}
+
+/*
+ * Dot product over PyObject* elements using PyNumber_Multiply and
+ * PyNumber_Add.  A NULL entry on either side contributes Py_False for that
+ * term.  On success the result replaces the object stored at op and the
+ * previous occupant is released; if a multiply or add fails the function
+ * returns early and leaves op untouched.
+ */
+static void
+OBJECT_dot(char *ip1, npy_intp is1, char *ip2, npy_intp is2, char *op, npy_intp n,
+           void *NPY_UNUSED(ignore))
+{
+    /*
+     * ALIGNMENT NOTE: np.dot, np.inner etc. enforce that the array is
+     * BEHAVED before getting to this point, so unaligned pointers aren't
+     * handled here.
+     */
+    PyObject **out_slot = (PyObject **)op;
+    PyObject *acc = NULL;
+    PyObject *previous;
+    npy_intp k;
+
+    for (k = 0; k < n; k++, ip1 += is1, ip2 += is2) {
+        PyObject *lhs = *((PyObject **)ip1);
+        PyObject *rhs = *((PyObject **)ip2);
+        PyObject *product;
+        PyObject *sum;
+
+        if (lhs != NULL && rhs != NULL) {
+            product = PyNumber_Multiply(lhs, rhs);
+            if (product == NULL) {
+                /* Drop the partial sum; the error is left set. */
+                Py_XDECREF(acc);
+                return;
+            }
+        }
+        else {
+            product = Py_False;
+            Py_INCREF(Py_False);
+        }
+
+        if (k == 0) {
+            /* First term seeds the accumulator and owns the reference. */
+            acc = product;
+            continue;
+        }
+
+        sum = PyNumber_Add(acc, product);
+        Py_XDECREF(acc);
+        Py_XDECREF(product);
+        if (sum == NULL) {
+            return;
+        }
+        acc = sum;
+    }
+
+    /* Install the result first, then release whatever was there before. */
+    previous = *out_slot;
+    *out_slot = acc;
+    Py_XDECREF(previous);
+}
+
+
+/*
+ *****************************************************************************
+ **                                 FILL                                    **
+ *****************************************************************************
+ */
+
+
+/*
+ * Boolean fill never works, but define it so that it works up to length 2.
+ *
+ * Unconditionally sets a TypeError (bracketed by NPY_ALLOW_C_API /
+ * NPY_DISABLE_C_API) and returns -1; buffer and length are not examined.
+ */
+static int
+BOOL_fill(PyObject **buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const int failure = -1;
+    NPY_ALLOW_C_API_DEF;
+
+    NPY_ALLOW_C_API;
+    PyErr_SetString(PyExc_TypeError,
+            "arange() is only supported for booleans when the result has at "
+            "most length 2.");
+    NPY_DISABLE_C_API;
+
+    return failure;
+}
+
+/*
+ * Extend the arithmetic sequence whose first two terms are buffer[0] and
+ * buffer[1] over buffer[2] .. buffer[length - 1], using PyNumber_Subtract
+ * and PyNumber_Add.
+ *
+ * This requires buffer to be filled with objects or NULL: each slot that is
+ * overwritten has its previous occupant released.
+ *
+ * Returns 0 on success, or -1 if any arithmetic call fails (slots written
+ * before the failure keep their new values).
+ */
+static int
+OBJECT_fill(PyObject **buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    int retval = 0;
+    npy_intp i;
+    PyObject *start = buffer[0];
+    PyObject *delta = buffer[1];
+    PyObject *second;
+    PyObject *old;
+
+    delta = PyNumber_Subtract(delta, start);
+    if (!delta) {
+        return -1;
+    }
+    /*
+     * `second` is a temporary owned here (it is not stored in buffer);
+     * it is released at `finish`.
+     */
+    second = start = PyNumber_Add(start, delta);
+    if (!start) {
+        goto error;
+    }
+    buffer += 2;
+
+    for (i = 2; i < length; i++, buffer++) {
+        start = PyNumber_Add(start, delta);
+        if (!start) {
+            goto error;
+        }
+        /*
+         * Store the new reference before releasing the old one, so that
+         * any code run by the decref never sees a dangling pointer in the
+         * buffer.
+         */
+        old = *buffer;
+        *buffer = start;
+        Py_XDECREF(old);
+    }
+    goto finish;
+
+error:
+    retval = -1;
+
+finish:
+    Py_XDECREF(second);
+    Py_DECREF(delta);
+    return retval;
+}
+
+#line 3986
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1]:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+BYTE_fill(npy_byte *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const npy_byte first = buffer[0];
+    const npy_byte step = buffer[1] - first;
+    npy_intp idx = 2;
+
+    while (idx < length) {
+        buffer[idx] = first + idx*step;
+        idx++;
+    }
+    return 0;
+}
+
+#line 3986
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1]:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+UBYTE_fill(npy_ubyte *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const npy_ubyte first = buffer[0];
+    const npy_ubyte step = buffer[1] - first;
+    npy_intp idx = 2;
+
+    while (idx < length) {
+        buffer[idx] = first + idx*step;
+        idx++;
+    }
+    return 0;
+}
+
+#line 3986
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1]:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+SHORT_fill(npy_short *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const npy_short first = buffer[0];
+    const npy_short step = buffer[1] - first;
+    npy_intp idx = 2;
+
+    while (idx < length) {
+        buffer[idx] = first + idx*step;
+        idx++;
+    }
+    return 0;
+}
+
+#line 3986
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1]:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+USHORT_fill(npy_ushort *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const npy_ushort first = buffer[0];
+    const npy_ushort step = buffer[1] - first;
+    npy_intp idx = 2;
+
+    while (idx < length) {
+        buffer[idx] = first + idx*step;
+        idx++;
+    }
+    return 0;
+}
+
+#line 3986
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1]:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+INT_fill(npy_int *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const npy_int first = buffer[0];
+    const npy_int step = buffer[1] - first;
+    npy_intp idx = 2;
+
+    while (idx < length) {
+        buffer[idx] = first + idx*step;
+        idx++;
+    }
+    return 0;
+}
+
+#line 3986
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1]:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+UINT_fill(npy_uint *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const npy_uint first = buffer[0];
+    const npy_uint step = buffer[1] - first;
+    npy_intp idx = 2;
+
+    while (idx < length) {
+        buffer[idx] = first + idx*step;
+        idx++;
+    }
+    return 0;
+}
+
+#line 3986
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1]:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+LONG_fill(npy_long *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const npy_long first = buffer[0];
+    const npy_long step = buffer[1] - first;
+    npy_intp idx = 2;
+
+    while (idx < length) {
+        buffer[idx] = first + idx*step;
+        idx++;
+    }
+    return 0;
+}
+
+#line 3986
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1]:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+ULONG_fill(npy_ulong *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const npy_ulong first = buffer[0];
+    const npy_ulong step = buffer[1] - first;
+    npy_intp idx = 2;
+
+    while (idx < length) {
+        buffer[idx] = first + idx*step;
+        idx++;
+    }
+    return 0;
+}
+
+#line 3986
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1]:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+LONGLONG_fill(npy_longlong *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const npy_longlong first = buffer[0];
+    const npy_longlong step = buffer[1] - first;
+    npy_intp idx = 2;
+
+    while (idx < length) {
+        buffer[idx] = first + idx*step;
+        idx++;
+    }
+    return 0;
+}
+
+#line 3986
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1]:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+ULONGLONG_fill(npy_ulonglong *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const npy_ulonglong first = buffer[0];
+    const npy_ulonglong step = buffer[1] - first;
+    npy_intp idx = 2;
+
+    while (idx < length) {
+        buffer[idx] = first + idx*step;
+        idx++;
+    }
+    return 0;
+}
+
+#line 3986
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1]:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+FLOAT_fill(npy_float *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const npy_float first = buffer[0];
+    const npy_float step = buffer[1] - first;
+    npy_intp idx = 2;
+
+    while (idx < length) {
+        buffer[idx] = first + idx*step;
+        idx++;
+    }
+    return 0;
+}
+
+#line 3986
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1]:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+DOUBLE_fill(npy_double *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const npy_double first = buffer[0];
+    const npy_double step = buffer[1] - first;
+    npy_intp idx = 2;
+
+    while (idx < length) {
+        buffer[idx] = first + idx*step;
+        idx++;
+    }
+    return 0;
+}
+
+#line 3986
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1]:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+LONGDOUBLE_fill(npy_longdouble *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const npy_longdouble first = buffer[0];
+    const npy_longdouble step = buffer[1] - first;
+    npy_intp idx = 2;
+
+    while (idx < length) {
+        buffer[idx] = first + idx*step;
+        idx++;
+    }
+    return 0;
+}
+
+#line 3986
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1]:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+DATETIME_fill(npy_datetime *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const npy_datetime first = buffer[0];
+    const npy_datetime step = buffer[1] - first;
+    npy_intp idx = 2;
+
+    while (idx < length) {
+        buffer[idx] = first + idx*step;
+        idx++;
+    }
+    return 0;
+}
+
+#line 3986
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1]:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+TIMEDELTA_fill(npy_timedelta *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const npy_timedelta first = buffer[0];
+    const npy_timedelta step = buffer[1] - first;
+    npy_intp idx = 2;
+
+    while (idx < length) {
+        buffer[idx] = first + idx*step;
+        idx++;
+    }
+    return 0;
+}
+
+
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1].
+ * The first term and the step are computed as floats (via
+ * npy_half_to_float); each generated term first + i*step is converted back
+ * with npy_float_to_half.  Always returns 0.
+ */
+static int
+HALF_fill(npy_half *buffer, npy_intp length, void *NPY_UNUSED(ignored))
+{
+    const float first = npy_half_to_float(buffer[0]);
+    const float step = npy_half_to_float(buffer[1]) - first;
+    npy_intp idx = 2;
+
+    while (idx < length) {
+        buffer[idx] = npy_float_to_half(first + idx*step);
+        idx++;
+    }
+    return 0;
+}
+
+#line 4020
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1],
+ * treating the real and imaginary members independently:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+CFLOAT_fill(npy_cfloat *buffer, npy_intp length, void *NPY_UNUSED(ignore))
+{
+    npy_cfloat origin;
+    npy_cfloat step;
+    npy_intp idx;
+
+    origin.real = buffer[0].real;
+    origin.imag = buffer[0].imag;
+    step.real = buffer[1].real - origin.real;
+    step.imag = buffer[1].imag - origin.imag;
+
+    for (idx = 2; idx < length; idx++) {
+        buffer[idx].real = origin.real + idx*step.real;
+        buffer[idx].imag = origin.imag + idx*step.imag;
+    }
+    return 0;
+}
+
+#line 4020
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1],
+ * treating the real and imaginary members independently:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+CDOUBLE_fill(npy_cdouble *buffer, npy_intp length, void *NPY_UNUSED(ignore))
+{
+    npy_cdouble origin;
+    npy_cdouble step;
+    npy_intp idx;
+
+    origin.real = buffer[0].real;
+    origin.imag = buffer[0].imag;
+    step.real = buffer[1].real - origin.real;
+    step.imag = buffer[1].imag - origin.imag;
+
+    for (idx = 2; idx < length; idx++) {
+        buffer[idx].real = origin.real + idx*step.real;
+        buffer[idx].imag = origin.imag + idx*step.imag;
+    }
+    return 0;
+}
+
+#line 4020
+/*
+ * Continue the arithmetic sequence started by buffer[0] and buffer[1],
+ * treating the real and imaginary members independently:
+ * for 2 <= i < length, buffer[i] = buffer[0] + i*(buffer[1] - buffer[0]).
+ * Always returns 0.
+ */
+static int
+CLONGDOUBLE_fill(npy_clongdouble *buffer, npy_intp length, void *NPY_UNUSED(ignore))
+{
+    npy_clongdouble origin;
+    npy_clongdouble step;
+    npy_intp idx;
+
+    origin.real = buffer[0].real;
+    origin.imag = buffer[0].imag;
+    step.real = buffer[1].real - origin.real;
+    step.imag = buffer[1].imag - origin.imag;
+
+    for (idx = 2; idx < length; idx++) {
+        buffer[idx].real = origin.real + idx*step.real;
+        buffer[idx].imag = origin.imag + idx*step.imag;
+    }
+    return 0;
+}
+
+
+
+/*
+ * Make each of the first `length` slots of buffer refer to the object
+ * *value, taking one new reference per slot and releasing the slot's
+ * previous occupant.
+ *
+ * This requires buffer to be filled with objects or NULL.
+ */
+static void
+OBJECT_fillwithscalar(PyObject **buffer, npy_intp length, PyObject **value,
+        void *NPY_UNUSED(ignored))
+{
+    npy_intp i;
+    PyObject *val = *value;
+    for (i = 0; i < length; i++) {
+        /*
+         * Store the new reference before releasing the old one, so that
+         * any code run by the decref never sees a dangling pointer in the
+         * buffer.
+         */
+        PyObject *old = buffer[i];
+        Py_XINCREF(val);
+        buffer[i] = val;
+        Py_XDECREF(old);
+    }
+}
+#line 4061
+/* Set each of the first `length` bytes of buffer to *value using memset. */
+static void
+BOOL_fillwithscalar(npy_bool *buffer, npy_intp length, npy_bool *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_bool fill = *value;
+
+    memset(buffer, fill, length);
+}
+
+#line 4061
+/* Set each of the first `length` bytes of buffer to *value using memset. */
+static void
+BYTE_fillwithscalar(npy_byte *buffer, npy_intp length, npy_byte *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_byte fill = *value;
+
+    memset(buffer, fill, length);
+}
+
+#line 4061
+/* Set each of the first `length` bytes of buffer to *value using memset. */
+static void
+UBYTE_fillwithscalar(npy_ubyte *buffer, npy_intp length, npy_ubyte *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_ubyte fill = *value;
+
+    memset(buffer, fill, length);
+}
+
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+SHORT_fillwithscalar(npy_short *buffer, npy_intp length, npy_short *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_short fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+USHORT_fillwithscalar(npy_ushort *buffer, npy_intp length, npy_ushort *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_ushort fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+INT_fillwithscalar(npy_int *buffer, npy_intp length, npy_int *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_int fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+UINT_fillwithscalar(npy_uint *buffer, npy_intp length, npy_uint *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_uint fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+LONG_fillwithscalar(npy_long *buffer, npy_intp length, npy_long *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_long fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+ULONG_fillwithscalar(npy_ulong *buffer, npy_intp length, npy_ulong *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_ulong fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+LONGLONG_fillwithscalar(npy_longlong *buffer, npy_intp length, npy_longlong *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_longlong fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+ULONGLONG_fillwithscalar(npy_ulonglong *buffer, npy_intp length, npy_ulonglong *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_ulonglong fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+HALF_fillwithscalar(npy_half *buffer, npy_intp length, npy_half *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_half fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+FLOAT_fillwithscalar(npy_float *buffer, npy_intp length, npy_float *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_float fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+DOUBLE_fillwithscalar(npy_double *buffer, npy_intp length, npy_double *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_double fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+LONGDOUBLE_fillwithscalar(npy_longdouble *buffer, npy_intp length, npy_longdouble *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_longdouble fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+CFLOAT_fillwithscalar(npy_cfloat *buffer, npy_intp length, npy_cfloat *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_cfloat fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+CDOUBLE_fillwithscalar(npy_cdouble *buffer, npy_intp length, npy_cdouble *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_cdouble fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+CLONGDOUBLE_fillwithscalar(npy_clongdouble *buffer, npy_intp length, npy_clongdouble *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_clongdouble fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+DATETIME_fillwithscalar(npy_datetime *buffer, npy_intp length, npy_datetime *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_datetime fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+#line 4082
+/* Store a copy of *value in each of the first `length` elements of buffer. */
+static void
+TIMEDELTA_fillwithscalar(npy_timedelta *buffer, npy_intp length, npy_timedelta *value,
+        void *NPY_UNUSED(ignored))
+{
+    const npy_timedelta fill = *value;
+    npy_intp left;
+
+    for (left = length; left > 0; left--) {
+        *buffer++ = fill;
+    }
+}
+
+
+
+/*
+ *****************************************************************************
+ **                       small correlate                                   **
+ *****************************************************************************
+ */
+
+/*
+ * Compute correlation of data with small kernels
+ * Calling a BLAS dot product for the inner loop of the correlation is overkill
+ * for small kernels. It is faster to compute it directly.
+ * Intended to be used by _pyarray_correlate, so no input verification is
+ * done; in particular it does not handle the boundaries, which must be
+ * handled by the caller.
+ * Returns 0 if kernel is considered too large or types are not supported, then
+ * the regular array dot should be used to process the data.
+ *
+ * d_, dstride, nd, dtype: data pointer, its stride in bytes, number of
+ *                         elements and type of data
+ * k_, kstride, nk, ktype: kernel pointer, its stride in bytes, number of
+ *                         elements and type of data
+ * out_, ostride: output data pointer and its stride in bytes
+ */
+NPY_NO_EXPORT int
+small_correlate(const char * d_, npy_intp dstride,
+                npy_intp nd, enum NPY_TYPES dtype,
+                const char * k_, npy_intp kstride,
+                npy_intp nk, enum NPY_TYPES ktype,
+                char * out_, npy_intp ostride)
+{
+    /* only handle small kernels and uniform types */
+    if (nk > 11 || dtype != ktype) {
+        return 0;
+    }
+
+    switch (dtype) {
+#line 4136
+        case NPY_FLOAT:
+            {
+                npy_intp i;
+                const npy_float * d = (npy_float*)d_;
+                const npy_float * k = (npy_float*)k_;
+                npy_float * out = (npy_float*)out_;
+                dstride /= sizeof(npy_float);
+                kstride /= sizeof(npy_float);
+                ostride /= sizeof(npy_float);
+                /* unroll inner loop to optimize register usage of the kernel*/
+                switch (nk) {
+#line 4149
+                    case 1:
+                    {
+#line 4153
+#if 1 <= 1
+                        /* load kernel */
+                        const npy_float k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 1
+                        /* load kernel */
+                        const npy_float k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 1
+                        /* load kernel */
+                        const npy_float k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 1
+                        /* load kernel */
+                        const npy_float k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 1
+                        /* load kernel */
+                        const npy_float k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 1
+                        /* load kernel */
+                        const npy_float k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 1
+                        /* load kernel */
+                        const npy_float k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 1
+                        /* load kernel */
+                        const npy_float k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 1
+                        /* load kernel */
+                        const npy_float k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 1
+                        /* load kernel */
+                        const npy_float k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 1
+                        /* load kernel */
+                        const npy_float k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_float s = 0;
+#line 4162
+#if 1 <= 1
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 1
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 1
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 1
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 1
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 1
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 1
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 1
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 1
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 1
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 1
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 2:
+                    {
+#line 4153
+#if 1 <= 2
+                        /* load kernel */
+                        const npy_float k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 2
+                        /* load kernel */
+                        const npy_float k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 2
+                        /* load kernel */
+                        const npy_float k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 2
+                        /* load kernel */
+                        const npy_float k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 2
+                        /* load kernel */
+                        const npy_float k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 2
+                        /* load kernel */
+                        const npy_float k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 2
+                        /* load kernel */
+                        const npy_float k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 2
+                        /* load kernel */
+                        const npy_float k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 2
+                        /* load kernel */
+                        const npy_float k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 2
+                        /* load kernel */
+                        const npy_float k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 2
+                        /* load kernel */
+                        const npy_float k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_float s = 0;
+#line 4162
+#if 1 <= 2
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 2
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 2
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 2
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 2
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 2
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 2
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 2
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 2
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 2
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 2
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 3:
+                    {
+#line 4153
+#if 1 <= 3
+                        /* load kernel */
+                        const npy_float k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 3
+                        /* load kernel */
+                        const npy_float k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 3
+                        /* load kernel */
+                        const npy_float k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 3
+                        /* load kernel */
+                        const npy_float k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 3
+                        /* load kernel */
+                        const npy_float k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 3
+                        /* load kernel */
+                        const npy_float k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 3
+                        /* load kernel */
+                        const npy_float k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 3
+                        /* load kernel */
+                        const npy_float k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 3
+                        /* load kernel */
+                        const npy_float k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 3
+                        /* load kernel */
+                        const npy_float k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 3
+                        /* load kernel */
+                        const npy_float k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_float s = 0;
+#line 4162
+#if 1 <= 3
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 3
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 3
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 3
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 3
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 3
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 3
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 3
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 3
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 3
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 3
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 4:
+                    {
+#line 4153
+#if 1 <= 4
+                        /* load kernel */
+                        const npy_float k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 4
+                        /* load kernel */
+                        const npy_float k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 4
+                        /* load kernel */
+                        const npy_float k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 4
+                        /* load kernel */
+                        const npy_float k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 4
+                        /* load kernel */
+                        const npy_float k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 4
+                        /* load kernel */
+                        const npy_float k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 4
+                        /* load kernel */
+                        const npy_float k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 4
+                        /* load kernel */
+                        const npy_float k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 4
+                        /* load kernel */
+                        const npy_float k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 4
+                        /* load kernel */
+                        const npy_float k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 4
+                        /* load kernel */
+                        const npy_float k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_float s = 0;
+#line 4162
+#if 1 <= 4
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 4
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 4
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 4
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 4
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 4
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 4
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 4
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 4
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 4
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 4
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 5:
+                    {
+#line 4153
+#if 1 <= 5
+                        /* load kernel */
+                        const npy_float k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 5
+                        /* load kernel */
+                        const npy_float k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 5
+                        /* load kernel */
+                        const npy_float k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 5
+                        /* load kernel */
+                        const npy_float k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 5
+                        /* load kernel */
+                        const npy_float k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 5
+                        /* load kernel */
+                        const npy_float k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 5
+                        /* load kernel */
+                        const npy_float k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 5
+                        /* load kernel */
+                        const npy_float k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 5
+                        /* load kernel */
+                        const npy_float k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 5
+                        /* load kernel */
+                        const npy_float k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 5
+                        /* load kernel */
+                        const npy_float k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_float s = 0;
+#line 4162
+#if 1 <= 5
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 5
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 5
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 5
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 5
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 5
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 5
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 5
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 5
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 5
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 5
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 6:
+                    {
+#line 4153
+#if 1 <= 6
+                        /* load kernel */
+                        const npy_float k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 6
+                        /* load kernel */
+                        const npy_float k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 6
+                        /* load kernel */
+                        const npy_float k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 6
+                        /* load kernel */
+                        const npy_float k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 6
+                        /* load kernel */
+                        const npy_float k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 6
+                        /* load kernel */
+                        const npy_float k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 6
+                        /* load kernel */
+                        const npy_float k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 6
+                        /* load kernel */
+                        const npy_float k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 6
+                        /* load kernel */
+                        const npy_float k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 6
+                        /* load kernel */
+                        const npy_float k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 6
+                        /* load kernel */
+                        const npy_float k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_float s = 0;
+#line 4162
+#if 1 <= 6
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 6
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 6
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 6
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 6
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 6
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 6
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 6
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 6
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 6
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 6
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 7:
+                    {
+#line 4153
+#if 1 <= 7
+                        /* load kernel */
+                        const npy_float k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 7
+                        /* load kernel */
+                        const npy_float k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 7
+                        /* load kernel */
+                        const npy_float k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 7
+                        /* load kernel */
+                        const npy_float k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 7
+                        /* load kernel */
+                        const npy_float k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 7
+                        /* load kernel */
+                        const npy_float k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 7
+                        /* load kernel */
+                        const npy_float k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 7
+                        /* load kernel */
+                        const npy_float k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 7
+                        /* load kernel */
+                        const npy_float k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 7
+                        /* load kernel */
+                        const npy_float k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 7
+                        /* load kernel */
+                        const npy_float k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_float s = 0;
+#line 4162
+#if 1 <= 7
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 7
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 7
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 7
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 7
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 7
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 7
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 7
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 7
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 7
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 7
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 8:
+                    {
+#line 4153
+#if 1 <= 8
+                        /* load kernel */
+                        const npy_float k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 8
+                        /* load kernel */
+                        const npy_float k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 8
+                        /* load kernel */
+                        const npy_float k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 8
+                        /* load kernel */
+                        const npy_float k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 8
+                        /* load kernel */
+                        const npy_float k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 8
+                        /* load kernel */
+                        const npy_float k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 8
+                        /* load kernel */
+                        const npy_float k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 8
+                        /* load kernel */
+                        const npy_float k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 8
+                        /* load kernel */
+                        const npy_float k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 8
+                        /* load kernel */
+                        const npy_float k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 8
+                        /* load kernel */
+                        const npy_float k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_float s = 0;
+#line 4162
+#if 1 <= 8
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 8
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 8
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 8
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 8
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 8
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 8
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 8
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 8
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 8
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 8
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 9:
+                    {
+#line 4153
+#if 1 <= 9
+                        /* load kernel */
+                        const npy_float k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 9
+                        /* load kernel */
+                        const npy_float k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 9
+                        /* load kernel */
+                        const npy_float k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 9
+                        /* load kernel */
+                        const npy_float k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 9
+                        /* load kernel */
+                        const npy_float k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 9
+                        /* load kernel */
+                        const npy_float k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 9
+                        /* load kernel */
+                        const npy_float k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 9
+                        /* load kernel */
+                        const npy_float k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 9
+                        /* load kernel */
+                        const npy_float k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 9
+                        /* load kernel */
+                        const npy_float k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 9
+                        /* load kernel */
+                        const npy_float k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_float s = 0;
+#line 4162
+#if 1 <= 9
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 9
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 9
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 9
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 9
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 9
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 9
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 9
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 9
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 9
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 9
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 10:
+                    {
+#line 4153
+#if 1 <= 10
+                        /* load kernel */
+                        const npy_float k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 10
+                        /* load kernel */
+                        const npy_float k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 10
+                        /* load kernel */
+                        const npy_float k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 10
+                        /* load kernel */
+                        const npy_float k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 10
+                        /* load kernel */
+                        const npy_float k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 10
+                        /* load kernel */
+                        const npy_float k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 10
+                        /* load kernel */
+                        const npy_float k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 10
+                        /* load kernel */
+                        const npy_float k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 10
+                        /* load kernel */
+                        const npy_float k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 10
+                        /* load kernel */
+                        const npy_float k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 10
+                        /* load kernel */
+                        const npy_float k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_float s = 0;
+#line 4162
+#if 1 <= 10
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 10
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 10
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 10
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 10
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 10
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 10
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 10
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 10
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 10
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 10
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 11:
+                    {
+#line 4153
+#if 1 <= 11
+                        /* load kernel */
+                        const npy_float k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 11
+                        /* load kernel */
+                        const npy_float k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 11
+                        /* load kernel */
+                        const npy_float k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 11
+                        /* load kernel */
+                        const npy_float k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 11
+                        /* load kernel */
+                        const npy_float k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 11
+                        /* load kernel */
+                        const npy_float k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 11
+                        /* load kernel */
+                        const npy_float k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 11
+                        /* load kernel */
+                        const npy_float k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 11
+                        /* load kernel */
+                        const npy_float k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 11
+                        /* load kernel */
+                        const npy_float k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 11
+                        /* load kernel */
+                        const npy_float k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_float s = 0;
+#line 4162
+#if 1 <= 11
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 11
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 11
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 11
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 11
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 11
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 11
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 11
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 11
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 11
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 11
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+                    default:
+                        return 0;
+                }
+            }
+
+#line 4136
+        case NPY_DOUBLE:
+            {
+                npy_intp i;
+                const npy_double * d = (npy_double*)d_;
+                const npy_double * k = (npy_double*)k_;
+                npy_double * out = (npy_double*)out_;
+                dstride /= sizeof(npy_double);
+                kstride /= sizeof(npy_double);
+                ostride /= sizeof(npy_double);
+                /* unroll inner loop to optimize register usage of the kernel*/
+                switch (nk) {
+#line 4149
+                    case 1:
+                    {
+#line 4153
+#if 1 <= 1
+                        /* load kernel */
+                        const npy_double k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 1
+                        /* load kernel */
+                        const npy_double k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 1
+                        /* load kernel */
+                        const npy_double k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 1
+                        /* load kernel */
+                        const npy_double k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 1
+                        /* load kernel */
+                        const npy_double k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 1
+                        /* load kernel */
+                        const npy_double k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 1
+                        /* load kernel */
+                        const npy_double k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 1
+                        /* load kernel */
+                        const npy_double k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 1
+                        /* load kernel */
+                        const npy_double k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 1
+                        /* load kernel */
+                        const npy_double k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 1
+                        /* load kernel */
+                        const npy_double k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_double s = 0;
+#line 4162
+#if 1 <= 1
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 1
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 1
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 1
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 1
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 1
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 1
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 1
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 1
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 1
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 1
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 2:
+                    {
+#line 4153
+#if 1 <= 2
+                        /* load kernel */
+                        const npy_double k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 2
+                        /* load kernel */
+                        const npy_double k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 2
+                        /* load kernel */
+                        const npy_double k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 2
+                        /* load kernel */
+                        const npy_double k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 2
+                        /* load kernel */
+                        const npy_double k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 2
+                        /* load kernel */
+                        const npy_double k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 2
+                        /* load kernel */
+                        const npy_double k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 2
+                        /* load kernel */
+                        const npy_double k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 2
+                        /* load kernel */
+                        const npy_double k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 2
+                        /* load kernel */
+                        const npy_double k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 2
+                        /* load kernel */
+                        const npy_double k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_double s = 0;
+#line 4162
+#if 1 <= 2
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 2
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 2
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 2
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 2
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 2
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 2
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 2
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 2
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 2
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 2
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 3:
+                    {
+#line 4153
+#if 1 <= 3
+                        /* load kernel */
+                        const npy_double k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 3
+                        /* load kernel */
+                        const npy_double k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 3
+                        /* load kernel */
+                        const npy_double k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 3
+                        /* load kernel */
+                        const npy_double k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 3
+                        /* load kernel */
+                        const npy_double k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 3
+                        /* load kernel */
+                        const npy_double k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 3
+                        /* load kernel */
+                        const npy_double k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 3
+                        /* load kernel */
+                        const npy_double k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 3
+                        /* load kernel */
+                        const npy_double k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 3
+                        /* load kernel */
+                        const npy_double k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 3
+                        /* load kernel */
+                        const npy_double k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_double s = 0;
+#line 4162
+#if 1 <= 3
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 3
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 3
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 3
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 3
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 3
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 3
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 3
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 3
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 3
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 3
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 4:
+                    {
+#line 4153
+#if 1 <= 4
+                        /* load kernel */
+                        const npy_double k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 4
+                        /* load kernel */
+                        const npy_double k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 4
+                        /* load kernel */
+                        const npy_double k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 4
+                        /* load kernel */
+                        const npy_double k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 4
+                        /* load kernel */
+                        const npy_double k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 4
+                        /* load kernel */
+                        const npy_double k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 4
+                        /* load kernel */
+                        const npy_double k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 4
+                        /* load kernel */
+                        const npy_double k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 4
+                        /* load kernel */
+                        const npy_double k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 4
+                        /* load kernel */
+                        const npy_double k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 4
+                        /* load kernel */
+                        const npy_double k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_double s = 0;
+#line 4162
+#if 1 <= 4
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 4
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 4
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 4
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 4
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 4
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 4
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 4
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 4
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 4
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 4
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 5:
+                    {
+#line 4153
+#if 1 <= 5
+                        /* load kernel */
+                        const npy_double k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 5
+                        /* load kernel */
+                        const npy_double k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 5
+                        /* load kernel */
+                        const npy_double k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 5
+                        /* load kernel */
+                        const npy_double k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 5
+                        /* load kernel */
+                        const npy_double k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 5
+                        /* load kernel */
+                        const npy_double k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 5
+                        /* load kernel */
+                        const npy_double k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 5
+                        /* load kernel */
+                        const npy_double k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 5
+                        /* load kernel */
+                        const npy_double k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 5
+                        /* load kernel */
+                        const npy_double k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 5
+                        /* load kernel */
+                        const npy_double k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_double s = 0;
+#line 4162
+#if 1 <= 5
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 5
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 5
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 5
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 5
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 5
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 5
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 5
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 5
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 5
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 5
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 6:
+                    {
+#line 4153
+#if 1 <= 6
+                        /* load kernel */
+                        const npy_double k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 6
+                        /* load kernel */
+                        const npy_double k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 6
+                        /* load kernel */
+                        const npy_double k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 6
+                        /* load kernel */
+                        const npy_double k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 6
+                        /* load kernel */
+                        const npy_double k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 6
+                        /* load kernel */
+                        const npy_double k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 6
+                        /* load kernel */
+                        const npy_double k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 6
+                        /* load kernel */
+                        const npy_double k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 6
+                        /* load kernel */
+                        const npy_double k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 6
+                        /* load kernel */
+                        const npy_double k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 6
+                        /* load kernel */
+                        const npy_double k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_double s = 0;
+#line 4162
+#if 1 <= 6
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 6
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 6
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 6
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 6
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 6
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 6
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 6
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 6
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 6
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 6
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 7:
+                    {
+#line 4153
+#if 1 <= 7
+                        /* load kernel */
+                        const npy_double k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 7
+                        /* load kernel */
+                        const npy_double k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 7
+                        /* load kernel */
+                        const npy_double k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 7
+                        /* load kernel */
+                        const npy_double k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 7
+                        /* load kernel */
+                        const npy_double k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 7
+                        /* load kernel */
+                        const npy_double k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 7
+                        /* load kernel */
+                        const npy_double k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 7
+                        /* load kernel */
+                        const npy_double k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 7
+                        /* load kernel */
+                        const npy_double k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 7
+                        /* load kernel */
+                        const npy_double k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 7
+                        /* load kernel */
+                        const npy_double k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_double s = 0;
+#line 4162
+#if 1 <= 7
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 7
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 7
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 7
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 7
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 7
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 7
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 7
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 7
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 7
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 7
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 8:
+                    {
+#line 4153
+#if 1 <= 8
+                        /* load kernel */
+                        const npy_double k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 8
+                        /* load kernel */
+                        const npy_double k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 8
+                        /* load kernel */
+                        const npy_double k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 8
+                        /* load kernel */
+                        const npy_double k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 8
+                        /* load kernel */
+                        const npy_double k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 8
+                        /* load kernel */
+                        const npy_double k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 8
+                        /* load kernel */
+                        const npy_double k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 8
+                        /* load kernel */
+                        const npy_double k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 8
+                        /* load kernel */
+                        const npy_double k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 8
+                        /* load kernel */
+                        const npy_double k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 8
+                        /* load kernel */
+                        const npy_double k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_double s = 0;
+#line 4162
+#if 1 <= 8
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 8
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 8
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 8
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 8
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 8
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 8
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 8
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 8
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 8
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 8
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 9:
+                    {
+#line 4153
+#if 1 <= 9
+                        /* load kernel */
+                        const npy_double k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 9
+                        /* load kernel */
+                        const npy_double k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 9
+                        /* load kernel */
+                        const npy_double k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 9
+                        /* load kernel */
+                        const npy_double k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 9
+                        /* load kernel */
+                        const npy_double k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 9
+                        /* load kernel */
+                        const npy_double k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 9
+                        /* load kernel */
+                        const npy_double k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 9
+                        /* load kernel */
+                        const npy_double k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 9
+                        /* load kernel */
+                        const npy_double k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 9
+                        /* load kernel */
+                        const npy_double k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 9
+                        /* load kernel */
+                        const npy_double k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_double s = 0;
+#line 4162
+#if 1 <= 9
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 9
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 9
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 9
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 9
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 9
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 9
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 9
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 9
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 9
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 9
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 10:
+                    {
+#line 4153
+#if 1 <= 10
+                        /* load kernel */
+                        const npy_double k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 10
+                        /* load kernel */
+                        const npy_double k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 10
+                        /* load kernel */
+                        const npy_double k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 10
+                        /* load kernel */
+                        const npy_double k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 10
+                        /* load kernel */
+                        const npy_double k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 10
+                        /* load kernel */
+                        const npy_double k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 10
+                        /* load kernel */
+                        const npy_double k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 10
+                        /* load kernel */
+                        const npy_double k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 10
+                        /* load kernel */
+                        const npy_double k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 10
+                        /* load kernel */
+                        const npy_double k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 10
+                        /* load kernel */
+                        const npy_double k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_double s = 0;
+#line 4162
+#if 1 <= 10
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 10
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 10
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 10
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 10
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 10
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 10
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 10
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 10
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 10
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 10
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+#line 4149
+                    case 11:
+                    {
+#line 4153
+#if 1 <= 11
+                        /* load kernel */
+                        const npy_double k1 = k[(1 - 1) * kstride];
+#endif
+
+#line 4153
+#if 2 <= 11
+                        /* load kernel */
+                        const npy_double k2 = k[(2 - 1) * kstride];
+#endif
+
+#line 4153
+#if 3 <= 11
+                        /* load kernel */
+                        const npy_double k3 = k[(3 - 1) * kstride];
+#endif
+
+#line 4153
+#if 4 <= 11
+                        /* load kernel */
+                        const npy_double k4 = k[(4 - 1) * kstride];
+#endif
+
+#line 4153
+#if 5 <= 11
+                        /* load kernel */
+                        const npy_double k5 = k[(5 - 1) * kstride];
+#endif
+
+#line 4153
+#if 6 <= 11
+                        /* load kernel */
+                        const npy_double k6 = k[(6 - 1) * kstride];
+#endif
+
+#line 4153
+#if 7 <= 11
+                        /* load kernel */
+                        const npy_double k7 = k[(7 - 1) * kstride];
+#endif
+
+#line 4153
+#if 8 <= 11
+                        /* load kernel */
+                        const npy_double k8 = k[(8 - 1) * kstride];
+#endif
+
+#line 4153
+#if 9 <= 11
+                        /* load kernel */
+                        const npy_double k9 = k[(9 - 1) * kstride];
+#endif
+
+#line 4153
+#if 10 <= 11
+                        /* load kernel */
+                        const npy_double k10 = k[(10 - 1) * kstride];
+#endif
+
+#line 4153
+#if 11 <= 11
+                        /* load kernel */
+                        const npy_double k11 = k[(11 - 1) * kstride];
+#endif
+
+                        for (i = 0; i < nd; i++) {
+                            npy_double s = 0;
+#line 4162
+#if 1 <= 11
+                            s += d[(i + 1 - 1) * dstride] * k1;
+#endif
+
+#line 4162
+#if 2 <= 11
+                            s += d[(i + 2 - 1) * dstride] * k2;
+#endif
+
+#line 4162
+#if 3 <= 11
+                            s += d[(i + 3 - 1) * dstride] * k3;
+#endif
+
+#line 4162
+#if 4 <= 11
+                            s += d[(i + 4 - 1) * dstride] * k4;
+#endif
+
+#line 4162
+#if 5 <= 11
+                            s += d[(i + 5 - 1) * dstride] * k5;
+#endif
+
+#line 4162
+#if 6 <= 11
+                            s += d[(i + 6 - 1) * dstride] * k6;
+#endif
+
+#line 4162
+#if 7 <= 11
+                            s += d[(i + 7 - 1) * dstride] * k7;
+#endif
+
+#line 4162
+#if 8 <= 11
+                            s += d[(i + 8 - 1) * dstride] * k8;
+#endif
+
+#line 4162
+#if 9 <= 11
+                            s += d[(i + 9 - 1) * dstride] * k9;
+#endif
+
+#line 4162
+#if 10 <= 11
+                            s += d[(i + 10 - 1) * dstride] * k10;
+#endif
+
+#line 4162
+#if 11 <= 11
+                            s += d[(i + 11 - 1) * dstride] * k11;
+#endif
+
+                            out[i * ostride] = s;
+                        }
+                        return 1;
+                    }
+
+                    default:
+                        return 0;
+                }
+            }
+
+        default:
+            return 0;
+    }
+}
+
+/*
+*/
+
+/* Clone callback for datetime c_metadata: returns a PyArray_malloc'ed byte copy of `data` */
+static NpyAuxData *
+_datetime_dtype_metadata_clone(NpyAuxData *data)
+{
+    const size_t nbytes = sizeof(PyArray_DatetimeDTypeMetaData);
+    PyArray_DatetimeDTypeMetaData *copy;
+
+    copy = (PyArray_DatetimeDTypeMetaData *)PyArray_malloc(nbytes);
+    if (!copy) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    /* Bitwise duplicate of the whole source struct. */
+    memcpy(copy, data, nbytes);
+    return (NpyAuxData *)copy;
+}
+
+/*
+ * Build a zero-filled PyArray_DatetimeDTypeMetaData carrying (base, num).
+ * Returns NULL with a MemoryError set when the allocation fails.
+ */
+static NpyAuxData*
+_create_datetime_metadata(NPY_DATETIMEUNIT base, int num)
+{
+    PyArray_DatetimeDTypeMetaData *meta_obj = PyArray_malloc(sizeof(*meta_obj));
+
+    if (!meta_obj) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    /* Zero everything first so fields not assigned below start cleared. */
+    memset(meta_obj, 0, sizeof(*meta_obj));
+    /* Hook up the free/clone callbacks of the embedded base aux data. */
+    meta_obj->base.clone = _datetime_dtype_metadata_clone;
+    meta_obj->base.free = (NpyAuxData_FreeFunc *)PyArray_free;
+
+    /* Record the caller-supplied base and num. */
+    meta_obj->meta.num = num;
+    meta_obj->meta.base = base;
+
+    return (NpyAuxData*)meta_obj;
+}
+
+
+/*
+ *****************************************************************************
+ **                       SETUP FUNCTION POINTERS                           **
+ *****************************************************************************
+ */
+
+#line 4244
+static PyArray_ArrFuncs _PyVoid_ArrFuncs = {  /* function table that VOID_Descr.f points at */
+    {   /* conversions from VOID to each listed type; positional, order matters */
+        VOID_to_BOOL,
+        VOID_to_BYTE,
+        VOID_to_UBYTE,
+        VOID_to_SHORT,
+        VOID_to_USHORT,
+        VOID_to_INT,
+        VOID_to_UINT,
+        VOID_to_LONG,
+        VOID_to_ULONG,
+        VOID_to_LONGLONG,
+        VOID_to_ULONGLONG,
+        VOID_to_FLOAT,
+        VOID_to_DOUBLE,
+        VOID_to_LONGDOUBLE,
+        VOID_to_CFLOAT,
+        VOID_to_CDOUBLE,
+        VOID_to_CLONGDOUBLE,
+        VOID_to_OBJECT,
+        VOID_to_STRING,
+        VOID_to_UNICODE,
+        VOID_to_VOID
+    },
+    VOID_getitem,
+    VOID_setitem,
+    (PyArray_CopySwapNFunc*)VOID_copyswapn,
+    (PyArray_CopySwapFunc*)VOID_copyswap,
+    (PyArray_CompareFunc*)VOID_compare,
+    (PyArray_ArgFunc*)VOID_argmax,
+    (PyArray_DotFunc*)NULL,  /* no dot function for VOID */
+    (PyArray_ScanFunc*)VOID_scan,
+    VOID_fromstr,
+    (PyArray_NonzeroFunc*)VOID_nonzero,
+    (PyArray_FillFunc*)NULL,  /* no fill function for VOID */
+    (PyArray_FillWithScalarFunc*)NULL,  /* no fill-with-scalar function for VOID */
+#if 0  /* generated constant: the named sort tables below are compiled out for VOID */
+    {
+        quicksort_void,
+        heapsort_void,
+        timsort_void
+    },
+    {
+        aquicksort_void,
+        aheapsort_void,
+        atimsort_void
+    },
+#else
+    {   /* active branch: the three slots of the first (sort) table are empty */
+        NULL, NULL, NULL
+    },
+    {   /* active branch: the three slots of the second (presumably argsort) table are empty */
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,  /* presumably castdict -- confirm against the PyArray_ArrFuncs declaration */
+    (PyArray_ScalarKindFunc*)NULL,  /* no scalar-kind function */
+    NULL,  /* presumably cancastscalarkindto -- confirm against PyArray_ArrFuncs */
+    NULL,  /* presumably cancastto -- confirm against PyArray_ArrFuncs */
+    (PyArray_FastClipFunc *)NULL,  /* no fast-clip function */
+    (PyArray_FastPutmaskFunc *)NULL,  /* no fast-putmask function */
+    (PyArray_FastTakeFunc *)NULL,  /* no fast-take function */
+    (PyArray_ArgFunc*)VOID_argmin
+};
+
+/* Static descriptor for NPY_VOID. Values are positional; the comment above each
+ * one names the field it fills. The `f` slot points at _PyVoid_ArrFuncs.
+ * FIXME: check for PY3K */
+static PyArray_Descr VOID_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyVoidArrType_Type,
+    /* kind */
+    NPY_VOIDLTR,
+    /* type */
+    NPY_VOIDLTR,
+    /* byteorder */
+    '|',
+    /* flags: 0 here; the generated "unicode needs init" note applies to UNICODE_Descr only */
+    0,
+    /* type_num */
+    NPY_VOID,
+    /* elsize */
+    0,
+    /* alignment */
+    NPY_ALIGNOF(char),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f */
+    &_PyVoid_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4244
+static PyArray_ArrFuncs _PyString_ArrFuncs = {  /* function table that STRING_Descr.f points at */
+    {   /* conversions from STRING to each listed type; positional, order matters */
+        STRING_to_BOOL,
+        STRING_to_BYTE,
+        STRING_to_UBYTE,
+        STRING_to_SHORT,
+        STRING_to_USHORT,
+        STRING_to_INT,
+        STRING_to_UINT,
+        STRING_to_LONG,
+        STRING_to_ULONG,
+        STRING_to_LONGLONG,
+        STRING_to_ULONGLONG,
+        STRING_to_FLOAT,
+        STRING_to_DOUBLE,
+        STRING_to_LONGDOUBLE,
+        STRING_to_CFLOAT,
+        STRING_to_CDOUBLE,
+        STRING_to_CLONGDOUBLE,
+        STRING_to_OBJECT,
+        STRING_to_STRING,
+        STRING_to_UNICODE,
+        STRING_to_VOID
+    },
+    STRING_getitem,
+    STRING_setitem,
+    (PyArray_CopySwapNFunc*)STRING_copyswapn,
+    (PyArray_CopySwapFunc*)STRING_copyswap,
+    (PyArray_CompareFunc*)STRING_compare,
+    (PyArray_ArgFunc*)STRING_argmax,
+    (PyArray_DotFunc*)NULL,  /* no dot function for STRING */
+    (PyArray_ScanFunc*)STRING_scan,
+    STRING_fromstr,
+    (PyArray_NonzeroFunc*)STRING_nonzero,
+    (PyArray_FillFunc*)NULL,  /* no fill function for STRING */
+    (PyArray_FillWithScalarFunc*)NULL,  /* no fill-with-scalar function for STRING */
+#if 1  /* generated constant: the named sort tables below are the active branch */
+    {   /* first table: quicksort, heapsort, timsort */
+        quicksort_string,
+        heapsort_string,
+        timsort_string
+    },
+    {   /* second table: the a-prefixed (presumably argsort) variants, same order */
+        aquicksort_string,
+        aheapsort_string,
+        atimsort_string
+    },
+#else
+    {   /* compiled out: empty-table alternative */
+        NULL, NULL, NULL
+    },
+    {   /* compiled out: empty-table alternative */
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,  /* presumably castdict -- confirm against the PyArray_ArrFuncs declaration */
+    (PyArray_ScalarKindFunc*)NULL,  /* no scalar-kind function */
+    NULL,  /* presumably cancastscalarkindto -- confirm against PyArray_ArrFuncs */
+    NULL,  /* presumably cancastto -- confirm against PyArray_ArrFuncs */
+    (PyArray_FastClipFunc *)NULL,  /* no fast-clip function */
+    (PyArray_FastPutmaskFunc *)NULL,  /* no fast-putmask function */
+    (PyArray_FastTakeFunc *)NULL,  /* no fast-take function */
+    (PyArray_ArgFunc*)STRING_argmin
+};
+
+/* Static descriptor for NPY_STRING. Values are positional; the comment above each
+ * one names the field it fills. The `f` slot points at _PyString_ArrFuncs.
+ * FIXME: check for PY3K */
+static PyArray_Descr STRING_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyStringArrType_Type,
+    /* kind */
+    NPY_STRINGLTR,
+    /* type */
+    NPY_STRINGLTR,
+    /* byteorder */
+    '|',
+    /* flags: 0 here; the generated "unicode needs init" note applies to UNICODE_Descr only */
+    0,
+    /* type_num */
+    NPY_STRING,
+    /* elsize */
+    0,
+    /* alignment */
+    NPY_ALIGNOF(char),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f */
+    &_PyString_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4244
+static PyArray_ArrFuncs _PyUnicode_ArrFuncs = {  /* function table that UNICODE_Descr.f points at */
+    {   /* conversions from UNICODE to each listed type; positional, order matters */
+        UNICODE_to_BOOL,
+        UNICODE_to_BYTE,
+        UNICODE_to_UBYTE,
+        UNICODE_to_SHORT,
+        UNICODE_to_USHORT,
+        UNICODE_to_INT,
+        UNICODE_to_UINT,
+        UNICODE_to_LONG,
+        UNICODE_to_ULONG,
+        UNICODE_to_LONGLONG,
+        UNICODE_to_ULONGLONG,
+        UNICODE_to_FLOAT,
+        UNICODE_to_DOUBLE,
+        UNICODE_to_LONGDOUBLE,
+        UNICODE_to_CFLOAT,
+        UNICODE_to_CDOUBLE,
+        UNICODE_to_CLONGDOUBLE,
+        UNICODE_to_OBJECT,
+        UNICODE_to_STRING,
+        UNICODE_to_UNICODE,
+        UNICODE_to_VOID
+    },
+    UNICODE_getitem,
+    UNICODE_setitem,
+    (PyArray_CopySwapNFunc*)UNICODE_copyswapn,
+    (PyArray_CopySwapFunc*)UNICODE_copyswap,
+    (PyArray_CompareFunc*)UNICODE_compare,
+    (PyArray_ArgFunc*)UNICODE_argmax,
+    (PyArray_DotFunc*)NULL,  /* no dot function for UNICODE */
+    (PyArray_ScanFunc*)UNICODE_scan,
+    UNICODE_fromstr,
+    (PyArray_NonzeroFunc*)UNICODE_nonzero,
+    (PyArray_FillFunc*)NULL,  /* no fill function for UNICODE */
+    (PyArray_FillWithScalarFunc*)NULL,  /* no fill-with-scalar function for UNICODE */
+#if 1  /* generated constant: the named sort tables below are the active branch */
+    {   /* first table: quicksort, heapsort, timsort */
+        quicksort_unicode,
+        heapsort_unicode,
+        timsort_unicode
+    },
+    {   /* second table: the a-prefixed (presumably argsort) variants, same order */
+        aquicksort_unicode,
+        aheapsort_unicode,
+        atimsort_unicode
+    },
+#else
+    {   /* compiled out: empty-table alternative */
+        NULL, NULL, NULL
+    },
+    {   /* compiled out: empty-table alternative */
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,  /* presumably castdict -- confirm against the PyArray_ArrFuncs declaration */
+    (PyArray_ScalarKindFunc*)NULL,  /* no scalar-kind function */
+    NULL,  /* presumably cancastscalarkindto -- confirm against PyArray_ArrFuncs */
+    NULL,  /* presumably cancastto -- confirm against PyArray_ArrFuncs */
+    (PyArray_FastClipFunc *)NULL,  /* no fast-clip function */
+    (PyArray_FastPutmaskFunc *)NULL,  /* no fast-putmask function */
+    (PyArray_FastTakeFunc *)NULL,  /* no fast-take function */
+    (PyArray_ArgFunc*)UNICODE_argmin
+};
+
+/*
+ * PyArray_Descr for NPY_UNICODE, using _PyUnicode_ArrFuncs. FIXME: check for PY3K
+ */
+static PyArray_Descr UNICODE_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyUnicodeArrType_Type,
+    /* kind */
+    NPY_UNICODELTR,
+    /* type */
+    NPY_UNICODELTR,
+    /* byteorder */
+    '=',
+    /* flags: unicode needs init as py3.3 does not like printing garbage */
+    NPY_NEEDS_INIT,
+    /* type_num */
+    NPY_UNICODE,
+    /* elsize: 0 in this static instance */
+    0,
+    /* alignment */
+    NPY_ALIGNOF(npy_ucs4),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f */
+    &_PyUnicode_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyBool_ArrFuncs = {  /* function table for BOOL; positional, so order matters */
+    {  /* casts from BOOL, one entry per target type */
+        BOOL_to_BOOL,
+        BOOL_to_BYTE,
+        BOOL_to_UBYTE,
+        BOOL_to_SHORT,
+        BOOL_to_USHORT,
+        BOOL_to_INT,
+        BOOL_to_UINT,
+        BOOL_to_LONG,
+        BOOL_to_ULONG,
+        BOOL_to_LONGLONG,
+        BOOL_to_ULONGLONG,
+        BOOL_to_FLOAT,
+        BOOL_to_DOUBLE,
+        BOOL_to_LONGDOUBLE,
+        BOOL_to_CFLOAT,
+        BOOL_to_CDOUBLE,
+        BOOL_to_CLONGDOUBLE,
+        BOOL_to_OBJECT,
+        BOOL_to_STRING,
+        BOOL_to_UNICODE,
+        BOOL_to_VOID
+    },
+    BOOL_getitem,
+    BOOL_setitem,
+    (PyArray_CopySwapNFunc*)BOOL_copyswapn,
+    (PyArray_CopySwapFunc*)BOOL_copyswap,
+    (PyArray_CompareFunc*)BOOL_compare,
+    (PyArray_ArgFunc*)BOOL_argmax,
+    (PyArray_DotFunc*)BOOL_dot,
+    (PyArray_ScanFunc*)BOOL_scan,
+    BOOL_fromstr,
+    (PyArray_NonzeroFunc*)BOOL_nonzero,
+    (PyArray_FillFunc*)BOOL_fill,
+    (PyArray_FillWithScalarFunc*)BOOL_fillwithscalar,
+#if 1  /* constant condition: the populated sort tables are compiled */
+    {  /* sort routines */
+        quicksort_bool,
+        heapsort_bool,
+        #if 1  /* radix sort fills the third slot */
+            radixsort_bool
+        #else
+            timsort_bool
+        #endif
+    },
+    {  /* a-prefixed counterparts of the sort routines above */
+        aquicksort_bool,
+        aheapsort_bool,
+        #if 1  /* radix variant fills the third slot */
+            aradixsort_bool
+        #else
+            atimsort_bool
+        #endif
+    },
+#else  /* never compiled: all sort slots left NULL */
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)BOOL_argmin
+};
+
+/*
+ * PyArray_Descr for NPY_BOOL, using _PyBool_ArrFuncs. FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr BOOL_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyBoolArrType_Type,
+    /* kind */
+    NPY_GENBOOLLTR,
+    /* type */
+    NPY_BOOLLTR,
+    /* byteorder */
+    '|',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_BOOL,
+    /* elsize */
+    sizeof(npy_bool),
+    /* alignment */
+    NPY_ALIGNOF(npy_bool),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f */
+    &_PyBool_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyByte_ArrFuncs = {  /* function table for BYTE; positional, so order matters */
+    {  /* casts from BYTE, one entry per target type */
+        BYTE_to_BOOL,
+        BYTE_to_BYTE,
+        BYTE_to_UBYTE,
+        BYTE_to_SHORT,
+        BYTE_to_USHORT,
+        BYTE_to_INT,
+        BYTE_to_UINT,
+        BYTE_to_LONG,
+        BYTE_to_ULONG,
+        BYTE_to_LONGLONG,
+        BYTE_to_ULONGLONG,
+        BYTE_to_FLOAT,
+        BYTE_to_DOUBLE,
+        BYTE_to_LONGDOUBLE,
+        BYTE_to_CFLOAT,
+        BYTE_to_CDOUBLE,
+        BYTE_to_CLONGDOUBLE,
+        BYTE_to_OBJECT,
+        BYTE_to_STRING,
+        BYTE_to_UNICODE,
+        BYTE_to_VOID
+    },
+    BYTE_getitem,
+    BYTE_setitem,
+    (PyArray_CopySwapNFunc*)BYTE_copyswapn,
+    (PyArray_CopySwapFunc*)BYTE_copyswap,
+    (PyArray_CompareFunc*)BYTE_compare,
+    (PyArray_ArgFunc*)BYTE_argmax,
+    (PyArray_DotFunc*)BYTE_dot,
+    (PyArray_ScanFunc*)BYTE_scan,
+    BYTE_fromstr,
+    (PyArray_NonzeroFunc*)BYTE_nonzero,
+    (PyArray_FillFunc*)BYTE_fill,
+    (PyArray_FillWithScalarFunc*)BYTE_fillwithscalar,
+#if 1  /* constant condition: the populated sort tables are compiled */
+    {  /* sort routines */
+        quicksort_byte,
+        heapsort_byte,
+        #if 1  /* radix sort fills the third slot */
+            radixsort_byte
+        #else
+            timsort_byte
+        #endif
+    },
+    {  /* a-prefixed counterparts of the sort routines above */
+        aquicksort_byte,
+        aheapsort_byte,
+        #if 1  /* radix variant fills the third slot */
+            aradixsort_byte
+        #else
+            atimsort_byte
+        #endif
+    },
+#else  /* never compiled: all sort slots left NULL */
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)BYTE_argmin
+};
+
+/*
+ * PyArray_Descr for NPY_BYTE, using _PyByte_ArrFuncs. FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr BYTE_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyByteArrType_Type,
+    /* kind */
+    NPY_SIGNEDLTR,
+    /* type */
+    NPY_BYTELTR,
+    /* byteorder */
+    '|',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_BYTE,
+    /* elsize */
+    sizeof(npy_byte),
+    /* alignment */
+    NPY_ALIGNOF(npy_byte),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f */
+    &_PyByte_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyUByte_ArrFuncs = {  /* function table for UBYTE; positional, so order matters */
+    {  /* casts from UBYTE, one entry per target type */
+        UBYTE_to_BOOL,
+        UBYTE_to_BYTE,
+        UBYTE_to_UBYTE,
+        UBYTE_to_SHORT,
+        UBYTE_to_USHORT,
+        UBYTE_to_INT,
+        UBYTE_to_UINT,
+        UBYTE_to_LONG,
+        UBYTE_to_ULONG,
+        UBYTE_to_LONGLONG,
+        UBYTE_to_ULONGLONG,
+        UBYTE_to_FLOAT,
+        UBYTE_to_DOUBLE,
+        UBYTE_to_LONGDOUBLE,
+        UBYTE_to_CFLOAT,
+        UBYTE_to_CDOUBLE,
+        UBYTE_to_CLONGDOUBLE,
+        UBYTE_to_OBJECT,
+        UBYTE_to_STRING,
+        UBYTE_to_UNICODE,
+        UBYTE_to_VOID
+    },
+    UBYTE_getitem,
+    UBYTE_setitem,
+    (PyArray_CopySwapNFunc*)UBYTE_copyswapn,
+    (PyArray_CopySwapFunc*)UBYTE_copyswap,
+    (PyArray_CompareFunc*)UBYTE_compare,
+    (PyArray_ArgFunc*)UBYTE_argmax,
+    (PyArray_DotFunc*)UBYTE_dot,
+    (PyArray_ScanFunc*)UBYTE_scan,
+    UBYTE_fromstr,
+    (PyArray_NonzeroFunc*)UBYTE_nonzero,
+    (PyArray_FillFunc*)UBYTE_fill,
+    (PyArray_FillWithScalarFunc*)UBYTE_fillwithscalar,
+#if 1  /* constant condition: the populated sort tables are compiled */
+    {  /* sort routines */
+        quicksort_ubyte,
+        heapsort_ubyte,
+        #if 1  /* radix sort fills the third slot */
+            radixsort_ubyte
+        #else
+            timsort_ubyte
+        #endif
+    },
+    {  /* a-prefixed counterparts of the sort routines above */
+        aquicksort_ubyte,
+        aheapsort_ubyte,
+        #if 1  /* radix variant fills the third slot */
+            aradixsort_ubyte
+        #else
+            atimsort_ubyte
+        #endif
+    },
+#else  /* never compiled: all sort slots left NULL */
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)UBYTE_argmin
+};
+
+/*
+ * PyArray_Descr for NPY_UBYTE, using _PyUByte_ArrFuncs. FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr UBYTE_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyUByteArrType_Type,
+    /* kind */
+    NPY_UNSIGNEDLTR,
+    /* type */
+    NPY_UBYTELTR,
+    /* byteorder */
+    '|',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_UBYTE,
+    /* elsize */
+    sizeof(npy_ubyte),
+    /* alignment */
+    NPY_ALIGNOF(npy_ubyte),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f */
+    &_PyUByte_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyShort_ArrFuncs = {  /* function table for SHORT; positional, so order matters */
+    {  /* casts from SHORT, one entry per target type */
+        SHORT_to_BOOL,
+        SHORT_to_BYTE,
+        SHORT_to_UBYTE,
+        SHORT_to_SHORT,
+        SHORT_to_USHORT,
+        SHORT_to_INT,
+        SHORT_to_UINT,
+        SHORT_to_LONG,
+        SHORT_to_ULONG,
+        SHORT_to_LONGLONG,
+        SHORT_to_ULONGLONG,
+        SHORT_to_FLOAT,
+        SHORT_to_DOUBLE,
+        SHORT_to_LONGDOUBLE,
+        SHORT_to_CFLOAT,
+        SHORT_to_CDOUBLE,
+        SHORT_to_CLONGDOUBLE,
+        SHORT_to_OBJECT,
+        SHORT_to_STRING,
+        SHORT_to_UNICODE,
+        SHORT_to_VOID
+    },
+    SHORT_getitem,
+    SHORT_setitem,
+    (PyArray_CopySwapNFunc*)SHORT_copyswapn,
+    (PyArray_CopySwapFunc*)SHORT_copyswap,
+    (PyArray_CompareFunc*)SHORT_compare,
+    (PyArray_ArgFunc*)SHORT_argmax,
+    (PyArray_DotFunc*)SHORT_dot,
+    (PyArray_ScanFunc*)SHORT_scan,
+    SHORT_fromstr,
+    (PyArray_NonzeroFunc*)SHORT_nonzero,
+    (PyArray_FillFunc*)SHORT_fill,
+    (PyArray_FillWithScalarFunc*)SHORT_fillwithscalar,
+#if 1  /* constant condition: the populated sort tables are compiled */
+    {  /* sort routines */
+        quicksort_short,
+        heapsort_short,
+        #if 1  /* radix sort fills the third slot */
+            radixsort_short
+        #else
+            timsort_short
+        #endif
+    },
+    {  /* a-prefixed counterparts of the sort routines above */
+        aquicksort_short,
+        aheapsort_short,
+        #if 1  /* radix variant fills the third slot */
+            aradixsort_short
+        #else
+            atimsort_short
+        #endif
+    },
+#else  /* never compiled: all sort slots left NULL */
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)SHORT_argmin
+};
+
+/*
+ * PyArray_Descr for NPY_SHORT, using _PyShort_ArrFuncs. FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr SHORT_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyShortArrType_Type,
+    /* kind */
+    NPY_SIGNEDLTR,
+    /* type */
+    NPY_SHORTLTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_SHORT,
+    /* elsize */
+    sizeof(npy_short),
+    /* alignment */
+    NPY_ALIGNOF(npy_short),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f */
+    &_PyShort_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyUShort_ArrFuncs = {  /* function table for USHORT; positional, so order matters */
+    {  /* casts from USHORT, one entry per target type */
+        USHORT_to_BOOL,
+        USHORT_to_BYTE,
+        USHORT_to_UBYTE,
+        USHORT_to_SHORT,
+        USHORT_to_USHORT,
+        USHORT_to_INT,
+        USHORT_to_UINT,
+        USHORT_to_LONG,
+        USHORT_to_ULONG,
+        USHORT_to_LONGLONG,
+        USHORT_to_ULONGLONG,
+        USHORT_to_FLOAT,
+        USHORT_to_DOUBLE,
+        USHORT_to_LONGDOUBLE,
+        USHORT_to_CFLOAT,
+        USHORT_to_CDOUBLE,
+        USHORT_to_CLONGDOUBLE,
+        USHORT_to_OBJECT,
+        USHORT_to_STRING,
+        USHORT_to_UNICODE,
+        USHORT_to_VOID
+    },
+    USHORT_getitem,
+    USHORT_setitem,
+    (PyArray_CopySwapNFunc*)USHORT_copyswapn,
+    (PyArray_CopySwapFunc*)USHORT_copyswap,
+    (PyArray_CompareFunc*)USHORT_compare,
+    (PyArray_ArgFunc*)USHORT_argmax,
+    (PyArray_DotFunc*)USHORT_dot,
+    (PyArray_ScanFunc*)USHORT_scan,
+    USHORT_fromstr,
+    (PyArray_NonzeroFunc*)USHORT_nonzero,
+    (PyArray_FillFunc*)USHORT_fill,
+    (PyArray_FillWithScalarFunc*)USHORT_fillwithscalar,
+#if 1  /* constant condition: the populated sort tables are compiled */
+    {  /* sort routines */
+        quicksort_ushort,
+        heapsort_ushort,
+        #if 1  /* radix sort fills the third slot */
+            radixsort_ushort
+        #else
+            timsort_ushort
+        #endif
+    },
+    {  /* a-prefixed counterparts of the sort routines above */
+        aquicksort_ushort,
+        aheapsort_ushort,
+        #if 1  /* radix variant fills the third slot */
+            aradixsort_ushort
+        #else
+            atimsort_ushort
+        #endif
+    },
+#else  /* never compiled: all sort slots left NULL */
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)USHORT_argmin
+};
+
+/*
+ * PyArray_Descr for NPY_USHORT, using _PyUShort_ArrFuncs. FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr USHORT_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyUShortArrType_Type,
+    /* kind */
+    NPY_UNSIGNEDLTR,
+    /* type */
+    NPY_USHORTLTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_USHORT,
+    /* elsize */
+    sizeof(npy_ushort),
+    /* alignment */
+    NPY_ALIGNOF(npy_ushort),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f */
+    &_PyUShort_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyInt_ArrFuncs = {  /* function table for INT; positional, so order matters */
+    {  /* casts from INT, one entry per target type */
+        INT_to_BOOL,
+        INT_to_BYTE,
+        INT_to_UBYTE,
+        INT_to_SHORT,
+        INT_to_USHORT,
+        INT_to_INT,
+        INT_to_UINT,
+        INT_to_LONG,
+        INT_to_ULONG,
+        INT_to_LONGLONG,
+        INT_to_ULONGLONG,
+        INT_to_FLOAT,
+        INT_to_DOUBLE,
+        INT_to_LONGDOUBLE,
+        INT_to_CFLOAT,
+        INT_to_CDOUBLE,
+        INT_to_CLONGDOUBLE,
+        INT_to_OBJECT,
+        INT_to_STRING,
+        INT_to_UNICODE,
+        INT_to_VOID
+    },
+    INT_getitem,
+    INT_setitem,
+    (PyArray_CopySwapNFunc*)INT_copyswapn,
+    (PyArray_CopySwapFunc*)INT_copyswap,
+    (PyArray_CompareFunc*)INT_compare,
+    (PyArray_ArgFunc*)INT_argmax,
+    (PyArray_DotFunc*)INT_dot,
+    (PyArray_ScanFunc*)INT_scan,
+    INT_fromstr,
+    (PyArray_NonzeroFunc*)INT_nonzero,
+    (PyArray_FillFunc*)INT_fill,
+    (PyArray_FillWithScalarFunc*)INT_fillwithscalar,
+#if 1  /* constant condition: the populated sort tables are compiled */
+    {  /* sort routines */
+        quicksort_int,
+        heapsort_int,
+        #if 0  /* radix sort disabled; timsort fills the third slot */
+            radixsort_int
+        #else
+            timsort_int
+        #endif
+    },
+    {  /* a-prefixed counterparts of the sort routines above */
+        aquicksort_int,
+        aheapsort_int,
+        #if 0  /* radix variant disabled; atimsort fills the third slot */
+            aradixsort_int
+        #else
+            atimsort_int
+        #endif
+    },
+#else  /* never compiled: all sort slots left NULL */
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)INT_argmin
+};
+
+/*
+ * PyArray_Descr for NPY_INT, using _PyInt_ArrFuncs. FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr INT_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyIntArrType_Type,
+    /* kind */
+    NPY_SIGNEDLTR,
+    /* type */
+    NPY_INTLTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_INT,
+    /* elsize */
+    sizeof(npy_int),
+    /* alignment */
+    NPY_ALIGNOF(npy_int),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f */
+    &_PyInt_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyUInt_ArrFuncs = {  /* function table for UINT; positional, so order matters */
+    {  /* casts from UINT, one entry per target type */
+        UINT_to_BOOL,
+        UINT_to_BYTE,
+        UINT_to_UBYTE,
+        UINT_to_SHORT,
+        UINT_to_USHORT,
+        UINT_to_INT,
+        UINT_to_UINT,
+        UINT_to_LONG,
+        UINT_to_ULONG,
+        UINT_to_LONGLONG,
+        UINT_to_ULONGLONG,
+        UINT_to_FLOAT,
+        UINT_to_DOUBLE,
+        UINT_to_LONGDOUBLE,
+        UINT_to_CFLOAT,
+        UINT_to_CDOUBLE,
+        UINT_to_CLONGDOUBLE,
+        UINT_to_OBJECT,
+        UINT_to_STRING,
+        UINT_to_UNICODE,
+        UINT_to_VOID
+    },
+    UINT_getitem,
+    UINT_setitem,
+    (PyArray_CopySwapNFunc*)UINT_copyswapn,
+    (PyArray_CopySwapFunc*)UINT_copyswap,
+    (PyArray_CompareFunc*)UINT_compare,
+    (PyArray_ArgFunc*)UINT_argmax,
+    (PyArray_DotFunc*)UINT_dot,
+    (PyArray_ScanFunc*)UINT_scan,
+    UINT_fromstr,
+    (PyArray_NonzeroFunc*)UINT_nonzero,
+    (PyArray_FillFunc*)UINT_fill,
+    (PyArray_FillWithScalarFunc*)UINT_fillwithscalar,
+#if 1  /* constant condition: the populated sort tables are compiled */
+    {  /* sort routines */
+        quicksort_uint,
+        heapsort_uint,
+        #if 0  /* radix sort disabled; timsort fills the third slot */
+            radixsort_uint
+        #else
+            timsort_uint
+        #endif
+    },
+    {  /* a-prefixed counterparts of the sort routines above */
+        aquicksort_uint,
+        aheapsort_uint,
+        #if 0  /* radix variant disabled; atimsort fills the third slot */
+            aradixsort_uint
+        #else
+            atimsort_uint
+        #endif
+    },
+#else  /* never compiled: all sort slots left NULL */
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)UINT_argmin
+};
+
+/*
+ * PyArray_Descr for NPY_UINT, using _PyUInt_ArrFuncs. FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr UINT_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyUIntArrType_Type,
+    /* kind */
+    NPY_UNSIGNEDLTR,
+    /* type */
+    NPY_UINTLTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_UINT,
+    /* elsize */
+    sizeof(npy_uint),
+    /* alignment */
+    NPY_ALIGNOF(npy_uint),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f */
+    &_PyUInt_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyLong_ArrFuncs = {  /* function table for LONG; positional, so order matters */
+    {  /* casts from LONG, one entry per target type */
+        LONG_to_BOOL,
+        LONG_to_BYTE,
+        LONG_to_UBYTE,
+        LONG_to_SHORT,
+        LONG_to_USHORT,
+        LONG_to_INT,
+        LONG_to_UINT,
+        LONG_to_LONG,
+        LONG_to_ULONG,
+        LONG_to_LONGLONG,
+        LONG_to_ULONGLONG,
+        LONG_to_FLOAT,
+        LONG_to_DOUBLE,
+        LONG_to_LONGDOUBLE,
+        LONG_to_CFLOAT,
+        LONG_to_CDOUBLE,
+        LONG_to_CLONGDOUBLE,
+        LONG_to_OBJECT,
+        LONG_to_STRING,
+        LONG_to_UNICODE,
+        LONG_to_VOID
+    },
+    LONG_getitem,
+    LONG_setitem,
+    (PyArray_CopySwapNFunc*)LONG_copyswapn,
+    (PyArray_CopySwapFunc*)LONG_copyswap,
+    (PyArray_CompareFunc*)LONG_compare,
+    (PyArray_ArgFunc*)LONG_argmax,
+    (PyArray_DotFunc*)LONG_dot,
+    (PyArray_ScanFunc*)LONG_scan,
+    LONG_fromstr,
+    (PyArray_NonzeroFunc*)LONG_nonzero,
+    (PyArray_FillFunc*)LONG_fill,
+    (PyArray_FillWithScalarFunc*)LONG_fillwithscalar,
+#if 1  /* constant condition: the populated sort tables are compiled */
+    {  /* sort routines */
+        quicksort_long,
+        heapsort_long,
+        #if 0  /* radix sort disabled; timsort fills the third slot */
+            radixsort_long
+        #else
+            timsort_long
+        #endif
+    },
+    {  /* a-prefixed counterparts of the sort routines above */
+        aquicksort_long,
+        aheapsort_long,
+        #if 0  /* radix variant disabled; atimsort fills the third slot */
+            aradixsort_long
+        #else
+            atimsort_long
+        #endif
+    },
+#else  /* never compiled: all sort slots left NULL */
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)LONG_argmin
+};
+
+/*
+ * PyArray_Descr for NPY_LONG, using _PyLong_ArrFuncs. FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr LONG_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyLongArrType_Type,
+    /* kind */
+    NPY_SIGNEDLTR,
+    /* type */
+    NPY_LONGLTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_LONG,
+    /* elsize */
+    sizeof(npy_long),
+    /* alignment */
+    NPY_ALIGNOF(npy_long),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f */
+    &_PyLong_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyULong_ArrFuncs = {  /* function table for ULONG; positional, so order matters */
+    {  /* casts from ULONG, one entry per target type */
+        ULONG_to_BOOL,
+        ULONG_to_BYTE,
+        ULONG_to_UBYTE,
+        ULONG_to_SHORT,
+        ULONG_to_USHORT,
+        ULONG_to_INT,
+        ULONG_to_UINT,
+        ULONG_to_LONG,
+        ULONG_to_ULONG,
+        ULONG_to_LONGLONG,
+        ULONG_to_ULONGLONG,
+        ULONG_to_FLOAT,
+        ULONG_to_DOUBLE,
+        ULONG_to_LONGDOUBLE,
+        ULONG_to_CFLOAT,
+        ULONG_to_CDOUBLE,
+        ULONG_to_CLONGDOUBLE,
+        ULONG_to_OBJECT,
+        ULONG_to_STRING,
+        ULONG_to_UNICODE,
+        ULONG_to_VOID
+    },
+    ULONG_getitem,
+    ULONG_setitem,
+    (PyArray_CopySwapNFunc*)ULONG_copyswapn,
+    (PyArray_CopySwapFunc*)ULONG_copyswap,
+    (PyArray_CompareFunc*)ULONG_compare,
+    (PyArray_ArgFunc*)ULONG_argmax,
+    (PyArray_DotFunc*)ULONG_dot,
+    (PyArray_ScanFunc*)ULONG_scan,
+    ULONG_fromstr,
+    (PyArray_NonzeroFunc*)ULONG_nonzero,
+    (PyArray_FillFunc*)ULONG_fill,
+    (PyArray_FillWithScalarFunc*)ULONG_fillwithscalar,
+#if 1  /* constant condition: the populated sort tables are compiled */
+    {  /* sort routines */
+        quicksort_ulong,
+        heapsort_ulong,
+        #if 0  /* radix sort disabled; timsort fills the third slot */
+            radixsort_ulong
+        #else
+            timsort_ulong
+        #endif
+    },
+    {  /* a-prefixed counterparts of the sort routines above */
+        aquicksort_ulong,
+        aheapsort_ulong,
+        #if 0  /* radix variant disabled; atimsort fills the third slot */
+            aradixsort_ulong
+        #else
+            atimsort_ulong
+        #endif
+    },
+#else  /* never compiled: all sort slots left NULL */
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)ULONG_argmin
+};
+
+/*
+ * PyArray_Descr for NPY_ULONG, using _PyULong_ArrFuncs. FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr ULONG_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyULongArrType_Type,
+    /* kind */
+    NPY_UNSIGNEDLTR,
+    /* type */
+    NPY_ULONGLTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_ULONG,
+    /* elsize */
+    sizeof(npy_ulong),
+    /* alignment */
+    NPY_ALIGNOF(npy_ulong),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f */
+    &_PyULong_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyLongLong_ArrFuncs = {  /* function table for LONGLONG; positional, so order matters */
+    {  /* casts from LONGLONG, one entry per target type */
+        LONGLONG_to_BOOL,
+        LONGLONG_to_BYTE,
+        LONGLONG_to_UBYTE,
+        LONGLONG_to_SHORT,
+        LONGLONG_to_USHORT,
+        LONGLONG_to_INT,
+        LONGLONG_to_UINT,
+        LONGLONG_to_LONG,
+        LONGLONG_to_ULONG,
+        LONGLONG_to_LONGLONG,
+        LONGLONG_to_ULONGLONG,
+        LONGLONG_to_FLOAT,
+        LONGLONG_to_DOUBLE,
+        LONGLONG_to_LONGDOUBLE,
+        LONGLONG_to_CFLOAT,
+        LONGLONG_to_CDOUBLE,
+        LONGLONG_to_CLONGDOUBLE,
+        LONGLONG_to_OBJECT,
+        LONGLONG_to_STRING,
+        LONGLONG_to_UNICODE,
+        LONGLONG_to_VOID
+    },
+    LONGLONG_getitem,
+    LONGLONG_setitem,
+    (PyArray_CopySwapNFunc*)LONGLONG_copyswapn,
+    (PyArray_CopySwapFunc*)LONGLONG_copyswap,
+    (PyArray_CompareFunc*)LONGLONG_compare,
+    (PyArray_ArgFunc*)LONGLONG_argmax,
+    (PyArray_DotFunc*)LONGLONG_dot,
+    (PyArray_ScanFunc*)LONGLONG_scan,
+    LONGLONG_fromstr,
+    (PyArray_NonzeroFunc*)LONGLONG_nonzero,
+    (PyArray_FillFunc*)LONGLONG_fill,
+    (PyArray_FillWithScalarFunc*)LONGLONG_fillwithscalar,
+#if 1  /* constant condition: the populated sort tables are compiled */
+    {  /* sort routines */
+        quicksort_longlong,
+        heapsort_longlong,
+        #if 0  /* radix sort disabled; timsort fills the third slot */
+            radixsort_longlong
+        #else
+            timsort_longlong
+        #endif
+    },
+    {  /* a-prefixed counterparts of the sort routines above */
+        aquicksort_longlong,
+        aheapsort_longlong,
+        #if 0  /* radix variant disabled; atimsort fills the third slot */
+            aradixsort_longlong
+        #else
+            atimsort_longlong
+        #endif
+    },
+#else  /* never compiled: all sort slots left NULL */
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)LONGLONG_argmin
+};
+
+/*
+ * PyArray_Descr for NPY_LONGLONG, using _PyLongLong_ArrFuncs. FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr LONGLONG_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyLongLongArrType_Type,
+    /* kind */
+    NPY_SIGNEDLTR,
+    /* type */
+    NPY_LONGLONGLTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_LONGLONG,
+    /* elsize */
+    sizeof(npy_longlong),
+    /* alignment */
+    NPY_ALIGNOF(npy_longlong),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f */
+    &_PyLongLong_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyULongLong_ArrFuncs = {  /* function table for ULONGLONG; positional, so order matters */
+    {  /* casts from ULONGLONG, one entry per target type */
+        ULONGLONG_to_BOOL,
+        ULONGLONG_to_BYTE,
+        ULONGLONG_to_UBYTE,
+        ULONGLONG_to_SHORT,
+        ULONGLONG_to_USHORT,
+        ULONGLONG_to_INT,
+        ULONGLONG_to_UINT,
+        ULONGLONG_to_LONG,
+        ULONGLONG_to_ULONG,
+        ULONGLONG_to_LONGLONG,
+        ULONGLONG_to_ULONGLONG,
+        ULONGLONG_to_FLOAT,
+        ULONGLONG_to_DOUBLE,
+        ULONGLONG_to_LONGDOUBLE,
+        ULONGLONG_to_CFLOAT,
+        ULONGLONG_to_CDOUBLE,
+        ULONGLONG_to_CLONGDOUBLE,
+        ULONGLONG_to_OBJECT,
+        ULONGLONG_to_STRING,
+        ULONGLONG_to_UNICODE,
+        ULONGLONG_to_VOID
+    },
+    ULONGLONG_getitem,
+    ULONGLONG_setitem,
+    (PyArray_CopySwapNFunc*)ULONGLONG_copyswapn,
+    (PyArray_CopySwapFunc*)ULONGLONG_copyswap,
+    (PyArray_CompareFunc*)ULONGLONG_compare,
+    (PyArray_ArgFunc*)ULONGLONG_argmax,
+    (PyArray_DotFunc*)ULONGLONG_dot,
+    (PyArray_ScanFunc*)ULONGLONG_scan,
+    ULONGLONG_fromstr,
+    (PyArray_NonzeroFunc*)ULONGLONG_nonzero,
+    (PyArray_FillFunc*)ULONGLONG_fill,
+    (PyArray_FillWithScalarFunc*)ULONGLONG_fillwithscalar,
+#if 1  /* constant condition: the populated sort tables are compiled */
+    {  /* sort routines */
+        quicksort_ulonglong,
+        heapsort_ulonglong,
+        #if 0  /* radix sort disabled; timsort fills the third slot */
+            radixsort_ulonglong
+        #else
+            timsort_ulonglong
+        #endif
+    },
+    {  /* a-prefixed counterparts of the sort routines above */
+        aquicksort_ulonglong,
+        aheapsort_ulonglong,
+        #if 0  /* radix variant disabled; atimsort fills the third slot */
+            aradixsort_ulonglong
+        #else
+            atimsort_ulonglong
+        #endif
+    },
+#else  /* never compiled: all sort slots left NULL */
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)ULONGLONG_argmin
+};
+
+/*
+ * PyArray_Descr for NPY_ULONGLONG, using _PyULongLong_ArrFuncs. FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr ULONGLONG_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyULongLongArrType_Type,
+    /* kind */
+    NPY_UNSIGNEDLTR,
+    /* type */
+    NPY_ULONGLONGLTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_ULONGLONG,
+    /* elsize */
+    sizeof(npy_ulonglong),
+    /* alignment */
+    NPY_ALIGNOF(npy_ulonglong),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f */
+    &_PyULongLong_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyHalf_ArrFuncs = {  /* function table for HALF; positional, so order matters */
+    {  /* casts from HALF, one entry per target type */
+        HALF_to_BOOL,
+        HALF_to_BYTE,
+        HALF_to_UBYTE,
+        HALF_to_SHORT,
+        HALF_to_USHORT,
+        HALF_to_INT,
+        HALF_to_UINT,
+        HALF_to_LONG,
+        HALF_to_ULONG,
+        HALF_to_LONGLONG,
+        HALF_to_ULONGLONG,
+        HALF_to_FLOAT,
+        HALF_to_DOUBLE,
+        HALF_to_LONGDOUBLE,
+        HALF_to_CFLOAT,
+        HALF_to_CDOUBLE,
+        HALF_to_CLONGDOUBLE,
+        HALF_to_OBJECT,
+        HALF_to_STRING,
+        HALF_to_UNICODE,
+        HALF_to_VOID
+    },
+    HALF_getitem,
+    HALF_setitem,
+    (PyArray_CopySwapNFunc*)HALF_copyswapn,
+    (PyArray_CopySwapFunc*)HALF_copyswap,
+    (PyArray_CompareFunc*)HALF_compare,
+    (PyArray_ArgFunc*)HALF_argmax,
+    (PyArray_DotFunc*)HALF_dot,
+    (PyArray_ScanFunc*)HALF_scan,
+    HALF_fromstr,
+    (PyArray_NonzeroFunc*)HALF_nonzero,
+    (PyArray_FillFunc*)HALF_fill,
+    (PyArray_FillWithScalarFunc*)HALF_fillwithscalar,
+#if 1  /* constant condition: the populated sort tables are compiled */
+    {  /* sort routines */
+        quicksort_half,
+        heapsort_half,
+        #if 0  /* radix sort disabled; timsort fills the third slot */
+            radixsort_half
+        #else
+            timsort_half
+        #endif
+    },
+    {  /* a-prefixed counterparts of the sort routines above */
+        aquicksort_half,
+        aheapsort_half,
+        #if 0  /* radix variant disabled; atimsort fills the third slot */
+            aradixsort_half
+        #else
+            atimsort_half
+        #endif
+    },
+#else  /* never compiled: all sort slots left NULL */
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)HALF_argmin
+};
+
+/*
+ * PyArray_Descr for NPY_HALF, using _PyHalf_ArrFuncs. FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr HALF_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyHalfArrType_Type,
+    /* kind */
+    NPY_FLOATINGLTR,
+    /* type */
+    NPY_HALFLTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_HALF,
+    /* elsize */
+    sizeof(npy_half),
+    /* alignment */
+    NPY_ALIGNOF(npy_half),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f */
+    &_PyHalf_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyFloat_ArrFuncs = {  /* function table for FLOAT; positional, so order matters */
+    {  /* casts from FLOAT, one entry per target type */
+        FLOAT_to_BOOL,
+        FLOAT_to_BYTE,
+        FLOAT_to_UBYTE,
+        FLOAT_to_SHORT,
+        FLOAT_to_USHORT,
+        FLOAT_to_INT,
+        FLOAT_to_UINT,
+        FLOAT_to_LONG,
+        FLOAT_to_ULONG,
+        FLOAT_to_LONGLONG,
+        FLOAT_to_ULONGLONG,
+        FLOAT_to_FLOAT,
+        FLOAT_to_DOUBLE,
+        FLOAT_to_LONGDOUBLE,
+        FLOAT_to_CFLOAT,
+        FLOAT_to_CDOUBLE,
+        FLOAT_to_CLONGDOUBLE,
+        FLOAT_to_OBJECT,
+        FLOAT_to_STRING,
+        FLOAT_to_UNICODE,
+        FLOAT_to_VOID
+    },
+    FLOAT_getitem,
+    FLOAT_setitem,
+    (PyArray_CopySwapNFunc*)FLOAT_copyswapn,
+    (PyArray_CopySwapFunc*)FLOAT_copyswap,
+    (PyArray_CompareFunc*)FLOAT_compare,
+    (PyArray_ArgFunc*)FLOAT_argmax,
+    (PyArray_DotFunc*)FLOAT_dot,
+    (PyArray_ScanFunc*)FLOAT_scan,
+    FLOAT_fromstr,
+    (PyArray_NonzeroFunc*)FLOAT_nonzero,
+    (PyArray_FillFunc*)FLOAT_fill,
+    (PyArray_FillWithScalarFunc*)FLOAT_fillwithscalar,
+#if 1  /* constant condition: the populated sort tables are compiled */
+    {  /* sort routines */
+        quicksort_float,
+        heapsort_float,
+        #if 0  /* radix sort disabled; timsort fills the third slot */
+            radixsort_float
+        #else
+            timsort_float
+        #endif
+    },
+    {  /* a-prefixed counterparts of the sort routines above */
+        aquicksort_float,
+        aheapsort_float,
+        #if 0  /* radix variant disabled; atimsort fills the third slot */
+            aradixsort_float
+        #else
+            atimsort_float
+        #endif
+    },
+#else  /* never compiled: all sort slots left NULL */
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)FLOAT_argmin
+};
+
+/*
+ * PyArray_Descr for NPY_FLOAT, using _PyFloat_ArrFuncs. FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr FLOAT_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyFloatArrType_Type,
+    /* kind */
+    NPY_FLOATINGLTR,
+    /* type */
+    NPY_FLOATLTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_FLOAT,
+    /* elsize */
+    sizeof(npy_float),
+    /* alignment */
+    NPY_ALIGNOF(npy_float),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f */
+    &_PyFloat_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyDouble_ArrFuncs = {  /* function table for DOUBLE; positional, so order matters */
+    {  /* casts from DOUBLE, one entry per target type */
+        DOUBLE_to_BOOL,
+        DOUBLE_to_BYTE,
+        DOUBLE_to_UBYTE,
+        DOUBLE_to_SHORT,
+        DOUBLE_to_USHORT,
+        DOUBLE_to_INT,
+        DOUBLE_to_UINT,
+        DOUBLE_to_LONG,
+        DOUBLE_to_ULONG,
+        DOUBLE_to_LONGLONG,
+        DOUBLE_to_ULONGLONG,
+        DOUBLE_to_FLOAT,
+        DOUBLE_to_DOUBLE,
+        DOUBLE_to_LONGDOUBLE,
+        DOUBLE_to_CFLOAT,
+        DOUBLE_to_CDOUBLE,
+        DOUBLE_to_CLONGDOUBLE,
+        DOUBLE_to_OBJECT,
+        DOUBLE_to_STRING,
+        DOUBLE_to_UNICODE,
+        DOUBLE_to_VOID
+    },
+    DOUBLE_getitem,
+    DOUBLE_setitem,
+    (PyArray_CopySwapNFunc*)DOUBLE_copyswapn,
+    (PyArray_CopySwapFunc*)DOUBLE_copyswap,
+    (PyArray_CompareFunc*)DOUBLE_compare,
+    (PyArray_ArgFunc*)DOUBLE_argmax,
+    (PyArray_DotFunc*)DOUBLE_dot,
+    (PyArray_ScanFunc*)DOUBLE_scan,
+    DOUBLE_fromstr,
+    (PyArray_NonzeroFunc*)DOUBLE_nonzero,
+    (PyArray_FillFunc*)DOUBLE_fill,
+    (PyArray_FillWithScalarFunc*)DOUBLE_fillwithscalar,
+#if 1  /* constant condition: the populated sort tables are compiled */
+    {  /* sort routines */
+        quicksort_double,
+        heapsort_double,
+        #if 0  /* radix sort disabled; timsort fills the third slot */
+            radixsort_double
+        #else
+            timsort_double
+        #endif
+    },
+    {  /* a-prefixed counterparts of the sort routines above */
+        aquicksort_double,
+        aheapsort_double,
+        #if 0  /* radix variant disabled; atimsort fills the third slot */
+            aradixsort_double
+        #else
+            atimsort_double
+        #endif
+    },
+#else  /* never compiled: all sort slots left NULL */
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)DOUBLE_argmin
+};
+
+/* Static NPY_DOUBLE descriptor; one of the entries of _builtin_descrs.
+ * FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr DOUBLE_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyDoubleArrType_Type,
+    /* kind */
+    NPY_FLOATINGLTR,
+    /* type */
+    NPY_DOUBLELTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_DOUBLE,
+    /* elsize: size of the C element type */
+    sizeof(npy_double),
+    /* alignment: alignment of the C element type */
+    NPY_ALIGNOF(npy_double),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f: function table holding the DOUBLE_* routines */
+    &_PyDouble_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyLongDouble_ArrFuncs = {  /* used as LONGDOUBLE_Descr.f */
+    {   /* conversions from LONGDOUBLE, one entry per target from BOOL to VOID */
+        LONGDOUBLE_to_BOOL,
+        LONGDOUBLE_to_BYTE,
+        LONGDOUBLE_to_UBYTE,
+        LONGDOUBLE_to_SHORT,
+        LONGDOUBLE_to_USHORT,
+        LONGDOUBLE_to_INT,
+        LONGDOUBLE_to_UINT,
+        LONGDOUBLE_to_LONG,
+        LONGDOUBLE_to_ULONG,
+        LONGDOUBLE_to_LONGLONG,
+        LONGDOUBLE_to_ULONGLONG,
+        LONGDOUBLE_to_FLOAT,
+        LONGDOUBLE_to_DOUBLE,
+        LONGDOUBLE_to_LONGDOUBLE,
+        LONGDOUBLE_to_CFLOAT,
+        LONGDOUBLE_to_CDOUBLE,
+        LONGDOUBLE_to_CLONGDOUBLE,
+        LONGDOUBLE_to_OBJECT,
+        LONGDOUBLE_to_STRING,
+        LONGDOUBLE_to_UNICODE,
+        LONGDOUBLE_to_VOID
+    },
+    LONGDOUBLE_getitem,
+    LONGDOUBLE_setitem,
+    (PyArray_CopySwapNFunc*)LONGDOUBLE_copyswapn,
+    (PyArray_CopySwapFunc*)LONGDOUBLE_copyswap,
+    (PyArray_CompareFunc*)LONGDOUBLE_compare,
+    (PyArray_ArgFunc*)LONGDOUBLE_argmax,  /* set_typeinfo() may reassign the argmax slot via NPY_CPU_DISPATCH_CALL_XB */
+    (PyArray_DotFunc*)LONGDOUBLE_dot,
+    (PyArray_ScanFunc*)LONGDOUBLE_scan,
+    LONGDOUBLE_fromstr,
+    (PyArray_NonzeroFunc*)LONGDOUBLE_nonzero,
+    (PyArray_FillFunc*)LONGDOUBLE_fill,
+    (PyArray_FillWithScalarFunc*)LONGDOUBLE_fillwithscalar,
+#if 1
+    {   /* sort table: quicksort, heapsort, timsort (radixsort is under #if 0) */
+        quicksort_longdouble,
+        heapsort_longdouble,
+        #if 0
+            radixsort_longdouble
+        #else
+            timsort_longdouble
+        #endif
+    },
+    {   /* a*-prefixed counterparts, same order */
+        aquicksort_longdouble,
+        aheapsort_longdouble,
+        #if 0
+            aradixsort_longdouble
+        #else
+            atimsort_longdouble
+        #endif
+    },
+#else
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,  /* this slot and the following ones up to argmin are left NULL */
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)LONGDOUBLE_argmin  /* set_typeinfo() may reassign the argmin slot as well */
+};
+
+/* Static NPY_LONGDOUBLE descriptor; one of the entries of _builtin_descrs.
+ * FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr LONGDOUBLE_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyLongDoubleArrType_Type,
+    /* kind */
+    NPY_FLOATINGLTR,
+    /* type */
+    NPY_LONGDOUBLELTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_LONGDOUBLE,
+    /* elsize: size of the C element type */
+    sizeof(npy_longdouble),
+    /* alignment: alignment of the C element type */
+    NPY_ALIGNOF(npy_longdouble),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f: function table holding the LONGDOUBLE_* routines */
+    &_PyLongDouble_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyCFloat_ArrFuncs = {  /* used as CFLOAT_Descr.f */
+    {   /* conversions from CFLOAT, one entry per target from BOOL to VOID */
+        CFLOAT_to_BOOL,
+        CFLOAT_to_BYTE,
+        CFLOAT_to_UBYTE,
+        CFLOAT_to_SHORT,
+        CFLOAT_to_USHORT,
+        CFLOAT_to_INT,
+        CFLOAT_to_UINT,
+        CFLOAT_to_LONG,
+        CFLOAT_to_ULONG,
+        CFLOAT_to_LONGLONG,
+        CFLOAT_to_ULONGLONG,
+        CFLOAT_to_FLOAT,
+        CFLOAT_to_DOUBLE,
+        CFLOAT_to_LONGDOUBLE,
+        CFLOAT_to_CFLOAT,
+        CFLOAT_to_CDOUBLE,
+        CFLOAT_to_CLONGDOUBLE,
+        CFLOAT_to_OBJECT,
+        CFLOAT_to_STRING,
+        CFLOAT_to_UNICODE,
+        CFLOAT_to_VOID
+    },
+    CFLOAT_getitem,
+    CFLOAT_setitem,
+    (PyArray_CopySwapNFunc*)CFLOAT_copyswapn,
+    (PyArray_CopySwapFunc*)CFLOAT_copyswap,
+    (PyArray_CompareFunc*)CFLOAT_compare,
+    (PyArray_ArgFunc*)CFLOAT_argmax,
+    (PyArray_DotFunc*)CFLOAT_dot,
+    (PyArray_ScanFunc*)CFLOAT_scan,
+    CFLOAT_fromstr,
+    (PyArray_NonzeroFunc*)CFLOAT_nonzero,
+    (PyArray_FillFunc*)CFLOAT_fill,
+    (PyArray_FillWithScalarFunc*)CFLOAT_fillwithscalar,
+#if 1
+    {   /* sort table: quicksort, heapsort, timsort (radixsort is under #if 0) */
+        quicksort_cfloat,
+        heapsort_cfloat,
+        #if 0
+            radixsort_cfloat
+        #else
+            timsort_cfloat
+        #endif
+    },
+    {   /* a*-prefixed counterparts, same order */
+        aquicksort_cfloat,
+        aheapsort_cfloat,
+        #if 0
+            aradixsort_cfloat
+        #else
+            atimsort_cfloat
+        #endif
+    },
+#else
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,  /* this slot and the following ones up to argmin are left NULL */
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)CFLOAT_argmin
+};
+
+/* Static NPY_CFLOAT descriptor; one of the entries of _builtin_descrs.
+ * FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr CFLOAT_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyCFloatArrType_Type,
+    /* kind */
+    NPY_COMPLEXLTR,
+    /* type */
+    NPY_CFLOATLTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_CFLOAT,
+    /* elsize: size of the C element type */
+    sizeof(npy_cfloat),
+    /* alignment: alignment of the C element type */
+    NPY_ALIGNOF(npy_cfloat),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f: function table holding the CFLOAT_* routines */
+    &_PyCFloat_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyCDouble_ArrFuncs = {  /* used as CDOUBLE_Descr.f */
+    {   /* conversions from CDOUBLE, one entry per target from BOOL to VOID */
+        CDOUBLE_to_BOOL,
+        CDOUBLE_to_BYTE,
+        CDOUBLE_to_UBYTE,
+        CDOUBLE_to_SHORT,
+        CDOUBLE_to_USHORT,
+        CDOUBLE_to_INT,
+        CDOUBLE_to_UINT,
+        CDOUBLE_to_LONG,
+        CDOUBLE_to_ULONG,
+        CDOUBLE_to_LONGLONG,
+        CDOUBLE_to_ULONGLONG,
+        CDOUBLE_to_FLOAT,
+        CDOUBLE_to_DOUBLE,
+        CDOUBLE_to_LONGDOUBLE,
+        CDOUBLE_to_CFLOAT,
+        CDOUBLE_to_CDOUBLE,
+        CDOUBLE_to_CLONGDOUBLE,
+        CDOUBLE_to_OBJECT,
+        CDOUBLE_to_STRING,
+        CDOUBLE_to_UNICODE,
+        CDOUBLE_to_VOID
+    },
+    CDOUBLE_getitem,
+    CDOUBLE_setitem,
+    (PyArray_CopySwapNFunc*)CDOUBLE_copyswapn,
+    (PyArray_CopySwapFunc*)CDOUBLE_copyswap,
+    (PyArray_CompareFunc*)CDOUBLE_compare,
+    (PyArray_ArgFunc*)CDOUBLE_argmax,
+    (PyArray_DotFunc*)CDOUBLE_dot,
+    (PyArray_ScanFunc*)CDOUBLE_scan,
+    CDOUBLE_fromstr,
+    (PyArray_NonzeroFunc*)CDOUBLE_nonzero,
+    (PyArray_FillFunc*)CDOUBLE_fill,
+    (PyArray_FillWithScalarFunc*)CDOUBLE_fillwithscalar,
+#if 1
+    {   /* sort table: quicksort, heapsort, timsort (radixsort is under #if 0) */
+        quicksort_cdouble,
+        heapsort_cdouble,
+        #if 0
+            radixsort_cdouble
+        #else
+            timsort_cdouble
+        #endif
+    },
+    {   /* a*-prefixed counterparts, same order */
+        aquicksort_cdouble,
+        aheapsort_cdouble,
+        #if 0
+            aradixsort_cdouble
+        #else
+            atimsort_cdouble
+        #endif
+    },
+#else
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,  /* this slot and the following ones up to argmin are left NULL */
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)CDOUBLE_argmin
+};
+
+/* Static NPY_CDOUBLE descriptor; one of the entries of _builtin_descrs.
+ * FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr CDOUBLE_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyCDoubleArrType_Type,
+    /* kind */
+    NPY_COMPLEXLTR,
+    /* type */
+    NPY_CDOUBLELTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_CDOUBLE,
+    /* elsize: size of the C element type */
+    sizeof(npy_cdouble),
+    /* alignment: alignment of the C element type */
+    NPY_ALIGNOF(npy_cdouble),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f: function table holding the CDOUBLE_* routines */
+    &_PyCDouble_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyCLongDouble_ArrFuncs = {  /* used as CLONGDOUBLE_Descr.f */
+    {   /* conversions from CLONGDOUBLE, one entry per target from BOOL to VOID */
+        CLONGDOUBLE_to_BOOL,
+        CLONGDOUBLE_to_BYTE,
+        CLONGDOUBLE_to_UBYTE,
+        CLONGDOUBLE_to_SHORT,
+        CLONGDOUBLE_to_USHORT,
+        CLONGDOUBLE_to_INT,
+        CLONGDOUBLE_to_UINT,
+        CLONGDOUBLE_to_LONG,
+        CLONGDOUBLE_to_ULONG,
+        CLONGDOUBLE_to_LONGLONG,
+        CLONGDOUBLE_to_ULONGLONG,
+        CLONGDOUBLE_to_FLOAT,
+        CLONGDOUBLE_to_DOUBLE,
+        CLONGDOUBLE_to_LONGDOUBLE,
+        CLONGDOUBLE_to_CFLOAT,
+        CLONGDOUBLE_to_CDOUBLE,
+        CLONGDOUBLE_to_CLONGDOUBLE,
+        CLONGDOUBLE_to_OBJECT,
+        CLONGDOUBLE_to_STRING,
+        CLONGDOUBLE_to_UNICODE,
+        CLONGDOUBLE_to_VOID
+    },
+    CLONGDOUBLE_getitem,
+    CLONGDOUBLE_setitem,
+    (PyArray_CopySwapNFunc*)CLONGDOUBLE_copyswapn,
+    (PyArray_CopySwapFunc*)CLONGDOUBLE_copyswap,
+    (PyArray_CompareFunc*)CLONGDOUBLE_compare,
+    (PyArray_ArgFunc*)CLONGDOUBLE_argmax,
+    (PyArray_DotFunc*)CLONGDOUBLE_dot,
+    (PyArray_ScanFunc*)CLONGDOUBLE_scan,
+    CLONGDOUBLE_fromstr,
+    (PyArray_NonzeroFunc*)CLONGDOUBLE_nonzero,
+    (PyArray_FillFunc*)CLONGDOUBLE_fill,
+    (PyArray_FillWithScalarFunc*)CLONGDOUBLE_fillwithscalar,
+#if 1
+    {   /* sort table: quicksort, heapsort, timsort (radixsort is under #if 0) */
+        quicksort_clongdouble,
+        heapsort_clongdouble,
+        #if 0
+            radixsort_clongdouble
+        #else
+            timsort_clongdouble
+        #endif
+    },
+    {   /* a*-prefixed counterparts, same order */
+        aquicksort_clongdouble,
+        aheapsort_clongdouble,
+        #if 0
+            aradixsort_clongdouble
+        #else
+            atimsort_clongdouble
+        #endif
+    },
+#else
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,  /* this slot and the following ones up to argmin are left NULL */
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)CLONGDOUBLE_argmin
+};
+
+/* Static NPY_CLONGDOUBLE descriptor; one of the entries of _builtin_descrs.
+ * FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr CLONGDOUBLE_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyCLongDoubleArrType_Type,
+    /* kind */
+    NPY_COMPLEXLTR,
+    /* type */
+    NPY_CLONGDOUBLELTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_CLONGDOUBLE,
+    /* elsize: size of the C element type */
+    sizeof(npy_clongdouble),
+    /* alignment: alignment of the C element type */
+    NPY_ALIGNOF(npy_clongdouble),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f: function table holding the CLONGDOUBLE_* routines */
+    &_PyCLongDouble_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyObject_ArrFuncs = {  /* used as OBJECT_Descr.f */
+    {   /* conversions from OBJECT, one entry per target from BOOL to VOID */
+        OBJECT_to_BOOL,
+        OBJECT_to_BYTE,
+        OBJECT_to_UBYTE,
+        OBJECT_to_SHORT,
+        OBJECT_to_USHORT,
+        OBJECT_to_INT,
+        OBJECT_to_UINT,
+        OBJECT_to_LONG,
+        OBJECT_to_ULONG,
+        OBJECT_to_LONGLONG,
+        OBJECT_to_ULONGLONG,
+        OBJECT_to_FLOAT,
+        OBJECT_to_DOUBLE,
+        OBJECT_to_LONGDOUBLE,
+        OBJECT_to_CFLOAT,
+        OBJECT_to_CDOUBLE,
+        OBJECT_to_CLONGDOUBLE,
+        OBJECT_to_OBJECT,
+        OBJECT_to_STRING,
+        OBJECT_to_UNICODE,
+        OBJECT_to_VOID
+    },
+    OBJECT_getitem,
+    OBJECT_setitem,
+    (PyArray_CopySwapNFunc*)OBJECT_copyswapn,
+    (PyArray_CopySwapFunc*)OBJECT_copyswap,
+    (PyArray_CompareFunc*)OBJECT_compare,
+    (PyArray_ArgFunc*)OBJECT_argmax,
+    (PyArray_DotFunc*)OBJECT_dot,
+    (PyArray_ScanFunc*)OBJECT_scan,
+    OBJECT_fromstr,
+    (PyArray_NonzeroFunc*)OBJECT_nonzero,
+    (PyArray_FillFunc*)OBJECT_fill,
+    (PyArray_FillWithScalarFunc*)OBJECT_fillwithscalar,
+#if 0
+    {   /* excluded by the enclosing #if 0: no *_object sort functions are referenced */
+        quicksort_object,
+        heapsort_object,
+        #if 0
+            radixsort_object
+        #else
+            timsort_object
+        #endif
+    },
+    {
+        aquicksort_object,
+        aheapsort_object,
+        #if 0
+            aradixsort_object
+        #else
+            atimsort_object
+        #endif
+    },
+#else
+    {   /* compiled branch: all three slots of the first sort table are NULL */
+        NULL, NULL, NULL
+    },
+    {   /* likewise for the second table */
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,  /* this slot and the following ones up to argmin are left NULL */
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)OBJECT_argmin
+};
+
+/* Static NPY_OBJECT descriptor; one of the entries of _builtin_descrs.
+ * FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr OBJECT_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyObjectArrType_Type,
+    /* kind */
+    NPY_OBJECTLTR,
+    /* type (same letter as kind) */
+    NPY_OBJECTLTR,
+    /* byteorder */
+    '|',
+    /* flags */
+    NPY_OBJECT_DTYPE_FLAGS,
+    /* type_num */
+    NPY_OBJECT,
+    /* elsize: size of a PyObject pointer */
+    sizeof(PyObject *),
+    /* alignment: alignment of a PyObject pointer */
+    NPY_ALIGNOF(PyObject *),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f: function table holding the OBJECT_* routines */
+    &_PyObject_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyDatetime_ArrFuncs = {  /* used as DATETIME_Descr.f */
+    {   /* conversions from DATETIME, one entry per target from BOOL to VOID */
+        DATETIME_to_BOOL,
+        DATETIME_to_BYTE,
+        DATETIME_to_UBYTE,
+        DATETIME_to_SHORT,
+        DATETIME_to_USHORT,
+        DATETIME_to_INT,
+        DATETIME_to_UINT,
+        DATETIME_to_LONG,
+        DATETIME_to_ULONG,
+        DATETIME_to_LONGLONG,
+        DATETIME_to_ULONGLONG,
+        DATETIME_to_FLOAT,
+        DATETIME_to_DOUBLE,
+        DATETIME_to_LONGDOUBLE,
+        DATETIME_to_CFLOAT,
+        DATETIME_to_CDOUBLE,
+        DATETIME_to_CLONGDOUBLE,
+        DATETIME_to_OBJECT,
+        DATETIME_to_STRING,
+        DATETIME_to_UNICODE,
+        DATETIME_to_VOID
+    },
+    DATETIME_getitem,
+    DATETIME_setitem,
+    (PyArray_CopySwapNFunc*)DATETIME_copyswapn,
+    (PyArray_CopySwapFunc*)DATETIME_copyswap,
+    (PyArray_CompareFunc*)DATETIME_compare,
+    (PyArray_ArgFunc*)DATETIME_argmax,
+    (PyArray_DotFunc*)DATETIME_dot,
+    (PyArray_ScanFunc*)DATETIME_scan,
+    DATETIME_fromstr,
+    (PyArray_NonzeroFunc*)DATETIME_nonzero,
+    (PyArray_FillFunc*)DATETIME_fill,
+    (PyArray_FillWithScalarFunc*)DATETIME_fillwithscalar,
+#if 1
+    {   /* sort table: quicksort, heapsort, timsort (radixsort is under #if 0) */
+        quicksort_datetime,
+        heapsort_datetime,
+        #if 0
+            radixsort_datetime
+        #else
+            timsort_datetime
+        #endif
+    },
+    {   /* a*-prefixed counterparts, same order */
+        aquicksort_datetime,
+        aheapsort_datetime,
+        #if 0
+            aradixsort_datetime
+        #else
+            atimsort_datetime
+        #endif
+    },
+#else
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,  /* this slot and the following ones up to argmin are left NULL */
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)DATETIME_argmin
+};
+
+/* Static NPY_DATETIME descriptor; one of the entries of _builtin_descrs.
+ * FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr DATETIME_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyDatetimeArrType_Type,
+    /* kind */
+    NPY_DATETIMELTR,
+    /* type (same letter as kind) */
+    NPY_DATETIMELTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_DATETIME,
+    /* elsize: size of the C element type */
+    sizeof(npy_datetime),
+    /* alignment: alignment of the C element type */
+    NPY_ALIGNOF(npy_datetime),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f: function table holding the DATETIME_* routines */
+    &_PyDatetime_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+#line 4385
+
+static PyArray_ArrFuncs _PyTimedelta_ArrFuncs = {  /* used as TIMEDELTA_Descr.f */
+    {   /* conversions from TIMEDELTA, one entry per target from BOOL to VOID */
+        TIMEDELTA_to_BOOL,
+        TIMEDELTA_to_BYTE,
+        TIMEDELTA_to_UBYTE,
+        TIMEDELTA_to_SHORT,
+        TIMEDELTA_to_USHORT,
+        TIMEDELTA_to_INT,
+        TIMEDELTA_to_UINT,
+        TIMEDELTA_to_LONG,
+        TIMEDELTA_to_ULONG,
+        TIMEDELTA_to_LONGLONG,
+        TIMEDELTA_to_ULONGLONG,
+        TIMEDELTA_to_FLOAT,
+        TIMEDELTA_to_DOUBLE,
+        TIMEDELTA_to_LONGDOUBLE,
+        TIMEDELTA_to_CFLOAT,
+        TIMEDELTA_to_CDOUBLE,
+        TIMEDELTA_to_CLONGDOUBLE,
+        TIMEDELTA_to_OBJECT,
+        TIMEDELTA_to_STRING,
+        TIMEDELTA_to_UNICODE,
+        TIMEDELTA_to_VOID
+    },
+    TIMEDELTA_getitem,
+    TIMEDELTA_setitem,
+    (PyArray_CopySwapNFunc*)TIMEDELTA_copyswapn,
+    (PyArray_CopySwapFunc*)TIMEDELTA_copyswap,
+    (PyArray_CompareFunc*)TIMEDELTA_compare,
+    (PyArray_ArgFunc*)TIMEDELTA_argmax,
+    (PyArray_DotFunc*)TIMEDELTA_dot,
+    (PyArray_ScanFunc*)TIMEDELTA_scan,
+    TIMEDELTA_fromstr,
+    (PyArray_NonzeroFunc*)TIMEDELTA_nonzero,
+    (PyArray_FillFunc*)TIMEDELTA_fill,
+    (PyArray_FillWithScalarFunc*)TIMEDELTA_fillwithscalar,
+#if 1
+    {   /* sort table: quicksort, heapsort, timsort (radixsort is under #if 0) */
+        quicksort_timedelta,
+        heapsort_timedelta,
+        #if 0
+            radixsort_timedelta
+        #else
+            timsort_timedelta
+        #endif
+    },
+    {   /* a*-prefixed counterparts, same order */
+        aquicksort_timedelta,
+        aheapsort_timedelta,
+        #if 0
+            aradixsort_timedelta
+        #else
+            atimsort_timedelta
+        #endif
+    },
+#else
+    {
+        NULL, NULL, NULL
+    },
+    {
+        NULL, NULL, NULL
+    },
+#endif
+    NULL,  /* this slot and the following ones up to argmin are left NULL */
+    (PyArray_ScalarKindFunc*)NULL,
+    NULL,
+    NULL,
+    (PyArray_FastClipFunc*)NULL,
+    (PyArray_FastPutmaskFunc*)NULL,
+    (PyArray_FastTakeFunc*)NULL,
+    (PyArray_ArgFunc*)TIMEDELTA_argmin
+};
+
+/* Static NPY_TIMEDELTA descriptor; one of the entries of _builtin_descrs.
+ * FIXME: check for PY3K
+ */
+NPY_NO_EXPORT PyArray_Descr TIMEDELTA_Descr = {
+    PyObject_HEAD_INIT(&PyArrayDescr_Type)
+    /* typeobj */
+    &PyTimedeltaArrType_Type,
+    /* kind */
+    NPY_TIMEDELTALTR,
+    /* type (same letter as kind) */
+    NPY_TIMEDELTALTR,
+    /* byteorder */
+    '=',
+    /* flags */
+    0,
+    /* type_num */
+    NPY_TIMEDELTA,
+    /* elsize: size of the C element type */
+    sizeof(npy_timedelta),
+    /* alignment: alignment of the C element type */
+    NPY_ALIGNOF(npy_timedelta),
+    /* subarray */
+    NULL,
+    /* fields */
+    NULL,
+    /* names */
+    NULL,
+    /* f: function table holding the TIMEDELTA_* routines */
+    &_PyTimedelta_ArrFuncs,
+    /* metadata */
+    NULL,
+    /* c_metadata */
+    NULL,
+    /* hash */
+    -1,
+};
+
+
+
+#define _MAX_LETTER 128  /* number of entries in _letter_to_num */
+static char _letter_to_num[_MAX_LETTER];  /* type character -> type number, read by PyArray_DescrFromType */
+/* Builtin descriptors; PyArray_DescrFromType indexes this by type number, so order matters. */
+static PyArray_Descr *_builtin_descrs[] = {
+    &BOOL_Descr,
+    &BYTE_Descr,
+    &UBYTE_Descr,
+    &SHORT_Descr,
+    &USHORT_Descr,
+    &INT_Descr,
+    &UINT_Descr,
+    &LONG_Descr,
+    &ULONG_Descr,
+    &LONGLONG_Descr,
+    &ULONGLONG_Descr,
+    &FLOAT_Descr,
+    &DOUBLE_Descr,
+    &LONGDOUBLE_Descr,
+    &CFLOAT_Descr,
+    &CDOUBLE_Descr,
+    &CLONGDOUBLE_Descr,
+    &OBJECT_Descr,
+    &STRING_Descr,
+    &UNICODE_Descr,
+    &VOID_Descr,
+    &DATETIME_Descr,
+    &TIMEDELTA_Descr,
+    &HALF_Descr  /* HALF comes last, after the datetime types */
+};
+
+/*NUMPY_API
+ * Get the PyArray_Descr structure for a type.
+ *
+ * `type` is tried, in this order, as:
+ *   - a builtin type number (0 <= type < NPY_NTYPES),
+ *   - NPY_NOTYPE,
+ *   - NPY_CHAR or NPY_CHARLTR,
+ *   - a user-defined type number (PyTypeNum_ISUSERDEF),
+ *   - a type character smaller than _MAX_LETTER.
+ *
+ * On success the returned descriptor has been Py_INCREF'ed, or was
+ * created by PyArray_DescrNew in the NPY_CHAR / NPY_CHARLTR case.
+ * Unknown values return NULL with a ValueError set.  NPY_NOTYPE returns
+ * NULL without raising, so that PyArray_DescrFromType(NPY_NOTYPE) keeps
+ * working for the backwards-compatible C-API.
+ */
+NPY_NO_EXPORT PyArray_Descr *
+PyArray_DescrFromType(int type)
+{
+    PyArray_Descr *found = NULL;
+
+    /*
+     * A negative type is never valid: it skips every lookup below and
+     * ends up in the default error.
+     */
+    if (type >= 0) {
+        if (type < NPY_NTYPES) {
+            found = _builtin_descrs[type];
+        }
+        else if (type == NPY_NOTYPE) {
+            /* Deliberately no error here; see the function comment. */
+            return NULL;
+        }
+        else if (type == NPY_CHAR || type == NPY_CHARLTR) {
+            PyArray_Descr *chardescr;
+
+            /*
+             * warning added 2017-04-25, 1.13
+             * deprecated in 1.7
+             */
+            if (type == NPY_CHAR &&
+                    DEPRECATE("The NPY_CHAR type_num is deprecated. "
+                              "Please port your code to use "
+                              "NPY_STRING instead.") < 0) {
+                return NULL;
+            }
+            /* New descriptor made from the NPY_STRING one, with elsize 1. */
+            chardescr = PyArray_DescrNew(_builtin_descrs[NPY_STRING]);
+            if (chardescr != NULL) {
+                chardescr->elsize = 1;
+                chardescr->type = NPY_CHARLTR;
+            }
+            return chardescr;
+        }
+        else if (PyTypeNum_ISUSERDEF(type)) {
+            found = userdescrs[type - NPY_USERDEF];
+        }
+        else if (type < _MAX_LETTER) {
+            /* Last resort: map a type character to its type number. */
+            int num = (int) _letter_to_num[type];
+            if (num < NPY_NTYPES) {
+                found = _builtin_descrs[num];
+            }
+        }
+    }
+
+    if (found == NULL) {
+        PyErr_SetString(PyExc_ValueError,
+                "Invalid data-type for array");
+        return NULL;
+    }
+    Py_INCREF(found);
+    return found;
+}
+
+/*
+ *****************************************************************************
+ **                             SETUP TYPE INFO                             **
+ *****************************************************************************
+ */
+
+
+/*
+ * This function is called during numpy module initialization,
+ * and is used to initialize internal dtype tables.
+ */
+NPY_NO_EXPORT int
+set_typeinfo(PyObject *dict)
+{
+    PyObject *infodict, *s;
+    int i;
+
+    PyArray_Descr *dtype;
+    PyObject *cobj, *key;
+
+    // SIMD runtime dispatching
+    #ifndef NPY_DISABLE_OPTIMIZATION
+        #include "argfunc.dispatch.h"
+    #endif
+    #line 4636
+    #line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyByte_ArrFuncs.argmax = (PyArray_ArgFunc*)BYTE_argmax);
+    
+#line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyByte_ArrFuncs.argmin = (PyArray_ArgFunc*)BYTE_argmin);
+    
+    
+#line 4636
+    #line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyUByte_ArrFuncs.argmax = (PyArray_ArgFunc*)UBYTE_argmax);
+    
+#line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyUByte_ArrFuncs.argmin = (PyArray_ArgFunc*)UBYTE_argmin);
+    
+    
+#line 4636
+    #line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyShort_ArrFuncs.argmax = (PyArray_ArgFunc*)SHORT_argmax);
+    
+#line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyShort_ArrFuncs.argmin = (PyArray_ArgFunc*)SHORT_argmin);
+    
+    
+#line 4636
+    #line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyUShort_ArrFuncs.argmax = (PyArray_ArgFunc*)USHORT_argmax);
+    
+#line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyUShort_ArrFuncs.argmin = (PyArray_ArgFunc*)USHORT_argmin);
+    
+    
+#line 4636
+    #line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyInt_ArrFuncs.argmax = (PyArray_ArgFunc*)INT_argmax);
+    
+#line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyInt_ArrFuncs.argmin = (PyArray_ArgFunc*)INT_argmin);
+    
+    
+#line 4636
+    #line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyUInt_ArrFuncs.argmax = (PyArray_ArgFunc*)UINT_argmax);
+    
+#line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyUInt_ArrFuncs.argmin = (PyArray_ArgFunc*)UINT_argmin);
+    
+    
+#line 4636
+    #line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyLong_ArrFuncs.argmax = (PyArray_ArgFunc*)LONG_argmax);
+    
+#line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyLong_ArrFuncs.argmin = (PyArray_ArgFunc*)LONG_argmin);
+    
+    
+#line 4636
+    #line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyULong_ArrFuncs.argmax = (PyArray_ArgFunc*)ULONG_argmax);
+    
+#line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyULong_ArrFuncs.argmin = (PyArray_ArgFunc*)ULONG_argmin);
+    
+    
+#line 4636
+    #line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyLongLong_ArrFuncs.argmax = (PyArray_ArgFunc*)LONGLONG_argmax);
+    
+#line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyLongLong_ArrFuncs.argmin = (PyArray_ArgFunc*)LONGLONG_argmin);
+    
+    
+#line 4636
+    #line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyULongLong_ArrFuncs.argmax = (PyArray_ArgFunc*)ULONGLONG_argmax);
+    
+#line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyULongLong_ArrFuncs.argmin = (PyArray_ArgFunc*)ULONGLONG_argmin);
+    
+    
+#line 4636
+    #line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyFloat_ArrFuncs.argmax = (PyArray_ArgFunc*)FLOAT_argmax);
+    
+#line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyFloat_ArrFuncs.argmin = (PyArray_ArgFunc*)FLOAT_argmin);
+    
+    
+#line 4636
+    #line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyDouble_ArrFuncs.argmax = (PyArray_ArgFunc*)DOUBLE_argmax);
+    
+#line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyDouble_ArrFuncs.argmin = (PyArray_ArgFunc*)DOUBLE_argmin);
+    
+    
+#line 4636
+    #line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyLongDouble_ArrFuncs.argmax = (PyArray_ArgFunc*)LONGDOUBLE_argmax);
+    
+#line 4639
+    NPY_CPU_DISPATCH_CALL_XB(_PyLongDouble_ArrFuncs.argmin = (PyArray_ArgFunc*)LONGDOUBLE_argmin);
+    
+    
+    NPY_CPU_DISPATCH_CALL_XB(_PyBool_ArrFuncs.argmax = (PyArray_ArgFunc*)BOOL_argmax);
+
+    /*
+     * Override the base class for all types, eventually all of this logic
+     * should be defined on the class and inherited to the scalar.
+     * (NPY_HALF is the largest builtin one.)
+     */
+    #line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_BOOL],
+            "numpy.dtypes." NPY_BOOL_Name "DType",
+#ifdef NPY_BOOL_alias
+            "numpy.dtypes." NPY_BOOL_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_BYTE],
+            "numpy.dtypes." NPY_BYTE_Name "DType",
+#ifdef NPY_BYTE_alias
+            "numpy.dtypes." NPY_BYTE_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_UBYTE],
+            "numpy.dtypes." NPY_UBYTE_Name "DType",
+#ifdef NPY_UBYTE_alias
+            "numpy.dtypes." NPY_UBYTE_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_SHORT],
+            "numpy.dtypes." NPY_SHORT_Name "DType",
+#ifdef NPY_SHORT_alias
+            "numpy.dtypes." NPY_SHORT_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_USHORT],
+            "numpy.dtypes." NPY_USHORT_Name "DType",
+#ifdef NPY_USHORT_alias
+            "numpy.dtypes." NPY_USHORT_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_INT],
+            "numpy.dtypes." NPY_INT_Name "DType",
+#ifdef NPY_INT_alias
+            "numpy.dtypes." NPY_INT_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_UINT],
+            "numpy.dtypes." NPY_UINT_Name "DType",
+#ifdef NPY_UINT_alias
+            "numpy.dtypes." NPY_UINT_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_LONG],
+            "numpy.dtypes." NPY_LONG_Name "DType",
+#ifdef NPY_LONG_alias
+            "numpy.dtypes." NPY_LONG_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_ULONG],
+            "numpy.dtypes." NPY_ULONG_Name "DType",
+#ifdef NPY_ULONG_alias
+            "numpy.dtypes." NPY_ULONG_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_LONGLONG],
+            "numpy.dtypes." NPY_LONGLONG_Name "DType",
+#ifdef NPY_LONGLONG_alias
+            "numpy.dtypes." NPY_LONGLONG_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_ULONGLONG],
+            "numpy.dtypes." NPY_ULONGLONG_Name "DType",
+#ifdef NPY_ULONGLONG_alias
+            "numpy.dtypes." NPY_ULONGLONG_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_HALF],
+            "numpy.dtypes." NPY_HALF_Name "DType",
+#ifdef NPY_HALF_alias
+            "numpy.dtypes." NPY_HALF_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_FLOAT],
+            "numpy.dtypes." NPY_FLOAT_Name "DType",
+#ifdef NPY_FLOAT_alias
+            "numpy.dtypes." NPY_FLOAT_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_DOUBLE],
+            "numpy.dtypes." NPY_DOUBLE_Name "DType",
+#ifdef NPY_DOUBLE_alias
+            "numpy.dtypes." NPY_DOUBLE_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_LONGDOUBLE],
+            "numpy.dtypes." NPY_LONGDOUBLE_Name "DType",
+#ifdef NPY_LONGDOUBLE_alias
+            "numpy.dtypes." NPY_LONGDOUBLE_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_CFLOAT],
+            "numpy.dtypes." NPY_CFLOAT_Name "DType",
+#ifdef NPY_CFLOAT_alias
+            "numpy.dtypes." NPY_CFLOAT_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_CDOUBLE],
+            "numpy.dtypes." NPY_CDOUBLE_Name "DType",
+#ifdef NPY_CDOUBLE_alias
+            "numpy.dtypes." NPY_CDOUBLE_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_CLONGDOUBLE],
+            "numpy.dtypes." NPY_CLONGDOUBLE_Name "DType",
+#ifdef NPY_CLONGDOUBLE_alias
+            "numpy.dtypes." NPY_CLONGDOUBLE_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_OBJECT],
+            "numpy.dtypes." NPY_OBJECT_Name "DType",
+#ifdef NPY_OBJECT_alias
+            "numpy.dtypes." NPY_OBJECT_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_STRING],
+            "numpy.dtypes." NPY_STRING_Name "DType",
+#ifdef NPY_STRING_alias
+            "numpy.dtypes." NPY_STRING_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_UNICODE],
+            "numpy.dtypes." NPY_UNICODE_Name "DType",
+#ifdef NPY_UNICODE_alias
+            "numpy.dtypes." NPY_UNICODE_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_VOID],
+            "numpy.dtypes." NPY_VOID_Name "DType",
+#ifdef NPY_VOID_alias
+            "numpy.dtypes." NPY_VOID_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_DATETIME],
+            "numpy.dtypes." NPY_DATETIME_Name "DType",
+#ifdef NPY_DATETIME_alias
+            "numpy.dtypes." NPY_DATETIME_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+#line 4659
+    if (dtypemeta_wrap_legacy_descriptor(
+            _builtin_descrs[NPY_TIMEDELTA],
+            "numpy.dtypes." NPY_TIMEDELTA_Name "DType",
+#ifdef NPY_TIMEDELTA_alias
+            "numpy.dtypes." NPY_TIMEDELTA_Alias "DType"
+#else
+            NULL
+#endif
+            ) < 0) {
+        return -1;
+    }
+
+    
+
+    /*
+     * Add cast functions to the new types (HALF, DATETIME, TIMEDELTA)
+     */
+
+    #line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_BOOL];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)BOOL_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_BOOL];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)BOOL_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_BOOL];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)BOOL_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_BYTE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)BYTE_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_BYTE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)BYTE_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_BYTE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)BYTE_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_UBYTE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)UBYTE_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_UBYTE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)UBYTE_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_UBYTE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)UBYTE_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_SHORT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)SHORT_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_SHORT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)SHORT_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_SHORT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)SHORT_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_USHORT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)USHORT_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_USHORT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)USHORT_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_USHORT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)USHORT_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_INT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)INT_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_INT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)INT_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_INT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)INT_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_UINT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)UINT_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_UINT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)UINT_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_UINT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)UINT_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_LONG];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)LONG_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_LONG];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)LONG_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_LONG];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)LONG_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_ULONG];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)ULONG_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_ULONG];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)ULONG_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_ULONG];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)ULONG_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_LONGLONG];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)LONGLONG_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_LONGLONG];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)LONGLONG_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_LONGLONG];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)LONGLONG_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_ULONGLONG];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)ULONGLONG_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_ULONGLONG];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)ULONGLONG_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_ULONGLONG];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)ULONGLONG_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_HALF];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)HALF_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_HALF];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)HALF_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_HALF];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)HALF_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_FLOAT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)FLOAT_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_FLOAT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)FLOAT_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_FLOAT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)FLOAT_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_DOUBLE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)DOUBLE_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_DOUBLE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)DOUBLE_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_DOUBLE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)DOUBLE_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_LONGDOUBLE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)LONGDOUBLE_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_LONGDOUBLE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)LONGDOUBLE_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_LONGDOUBLE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)LONGDOUBLE_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_CFLOAT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)CFLOAT_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_CFLOAT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)CFLOAT_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_CFLOAT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)CFLOAT_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_CDOUBLE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)CDOUBLE_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_CDOUBLE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)CDOUBLE_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_CDOUBLE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)CDOUBLE_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_CLONGDOUBLE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)CLONGDOUBLE_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_CLONGDOUBLE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)CLONGDOUBLE_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_CLONGDOUBLE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)CLONGDOUBLE_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_OBJECT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)OBJECT_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_OBJECT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)OBJECT_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_OBJECT];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)OBJECT_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_STRING];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)STRING_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_STRING];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)STRING_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_STRING];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)STRING_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_UNICODE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)UNICODE_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_UNICODE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)UNICODE_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_UNICODE];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)UNICODE_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_VOID];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)VOID_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_VOID];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)VOID_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_VOID];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)VOID_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_DATETIME];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)DATETIME_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_DATETIME];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)DATETIME_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_DATETIME];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)DATETIME_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+#line 4687
+
+    #line 4692
+
+    dtype = _builtin_descrs[NPY_TIMEDELTA];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_HALF);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)TIMEDELTA_to_HALF, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_TIMEDELTA];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_DATETIME);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)TIMEDELTA_to_DATETIME, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+#line 4692
+
+    dtype = _builtin_descrs[NPY_TIMEDELTA];
+    if (dtype->f->castdict == NULL) {
+        dtype->f->castdict = PyDict_New();
+        if (dtype->f->castdict == NULL) {
+            return -1;
+        }
+    }
+    key = PyLong_FromLong(NPY_TIMEDELTA);
+    if (key == NULL) {
+        return -1;
+    }
+    cobj = NpyCapsule_FromVoidPtr((void *)TIMEDELTA_to_TIMEDELTA, NULL);
+    if (cobj == NULL) {
+        Py_DECREF(key);
+        return -1;
+    }
+    if (PyDict_SetItem(dtype->f->castdict, key, cobj) < 0) {
+        Py_DECREF(key);
+        Py_DECREF(cobj);
+        return -1;
+    }
+    Py_DECREF(key);
+    Py_DECREF(cobj);
+
+    
+
+    
+    /* Create c_metadata for both time dtypes; NULL-check each one itself. */
+    _builtin_descrs[NPY_DATETIME]->c_metadata = _create_datetime_metadata(
+                NPY_DATETIME_DEFAULTUNIT, 1);
+    if (_builtin_descrs[NPY_DATETIME]->c_metadata == NULL) {
+        return -1;
+    }
+    _builtin_descrs[NPY_TIMEDELTA]->c_metadata = _create_datetime_metadata(
+                NPY_DATETIME_DEFAULTUNIT, 1);
+    if (_builtin_descrs[NPY_TIMEDELTA]->c_metadata == NULL) {
+        return -1;
+    }
+
+    for (i = 0; i < _MAX_LETTER; i++) {
+        _letter_to_num[i] = NPY_NTYPES;
+    }
+
+    #line 4747
+
+    _letter_to_num[NPY_BOOLLTR] = NPY_BOOL;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_BYTELTR] = NPY_BYTE;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_UBYTELTR] = NPY_UBYTE;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_SHORTLTR] = NPY_SHORT;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_USHORTLTR] = NPY_USHORT;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_INTLTR] = NPY_INT;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_UINTLTR] = NPY_UINT;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_INTPLTR] = NPY_INTP;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_UINTPLTR] = NPY_UINTP;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_LONGLTR] = NPY_LONG;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_ULONGLTR] = NPY_ULONG;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_LONGLONGLTR] = NPY_LONGLONG;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_ULONGLONGLTR] = NPY_ULONGLONG;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_HALFLTR] = NPY_HALF;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_FLOATLTR] = NPY_FLOAT;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_DOUBLELTR] = NPY_DOUBLE;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_LONGDOUBLELTR] = NPY_LONGDOUBLE;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_CFLOATLTR] = NPY_CFLOAT;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_CDOUBLELTR] = NPY_CDOUBLE;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_CLONGDOUBLELTR] = NPY_CLONGDOUBLE;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_OBJECTLTR] = NPY_OBJECT;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_STRINGLTR] = NPY_STRING;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_UNICODELTR] = NPY_UNICODE;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_VOIDLTR] = NPY_VOID;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_DATETIMELTR] = NPY_DATETIME;
+
+    
+#line 4747
+
+    _letter_to_num[NPY_TIMEDELTALTR] = NPY_TIMEDELTA;
+
+    
+
+    _letter_to_num[NPY_STRINGLTR2] = NPY_STRING;
+
+    #line 4763
+
+    BOOL_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    BYTE_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    UBYTE_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    SHORT_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    USHORT_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    INT_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    UINT_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    LONG_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    ULONG_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    LONGLONG_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    ULONGLONG_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    HALF_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    FLOAT_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    DOUBLE_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    LONGDOUBLE_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    CFLOAT_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    CDOUBLE_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    CLONGDOUBLE_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    OBJECT_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    STRING_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    UNICODE_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    VOID_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    DATETIME_Descr.fields = Py_None;
+
+    
+#line 4763
+
+    TIMEDELTA_Descr.fields = Py_None;
+
+    
+
+
+    #line 4772
+
+    PyDataType_MAKEUNSIZED(&STRING_Descr);
+
+    
+#line 4772
+
+    PyDataType_MAKEUNSIZED(&UNICODE_Descr);
+
+    
+#line 4772
+
+    PyDataType_MAKEUNSIZED(&VOID_Descr);
+
+    
+
+    /* Set a dictionary with type information */
+    infodict = PyDict_New();
+    if (infodict == NULL) return -1;
+
+    int ret;
+    #line 4816
+
+    s = PyArray_typeinforanged(
+        NPY_BOOLLTR, NPY_BOOL, NPY_BITSOF_BOOL, NPY_ALIGNOF(npy_bool),
+        Py_BuildValue("i", 1),
+        Py_BuildValue("i", 0),
+        &PyBoolArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "BOOL", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+
+    
+#line 4816
+
+    s = PyArray_typeinforanged(
+        NPY_BYTELTR, NPY_BYTE, NPY_BITSOF_BYTE, NPY_ALIGNOF(npy_byte),
+        Py_BuildValue("i", NPY_MAX_BYTE),
+        Py_BuildValue("i", NPY_MIN_BYTE),
+        &PyByteArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "BYTE", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+
+    
+#line 4816
+
+    s = PyArray_typeinforanged(
+        NPY_UBYTELTR, NPY_UBYTE, NPY_BITSOF_BYTE, NPY_ALIGNOF(npy_ubyte),
+        Py_BuildValue("i", NPY_MAX_UBYTE),
+        Py_BuildValue("i", 0),
+        &PyUByteArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "UBYTE", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+
+    
+#line 4816
+
+    s = PyArray_typeinforanged(
+        NPY_SHORTLTR, NPY_SHORT, NPY_BITSOF_SHORT, NPY_ALIGNOF(npy_short),
+        Py_BuildValue("i", NPY_MAX_SHORT),
+        Py_BuildValue("i", NPY_MIN_SHORT),
+        &PyShortArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "SHORT", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+
+    
+#line 4816
+
+    s = PyArray_typeinforanged(
+        NPY_USHORTLTR, NPY_USHORT, NPY_BITSOF_SHORT, NPY_ALIGNOF(npy_ushort),
+        Py_BuildValue("i", NPY_MAX_USHORT),
+        Py_BuildValue("i", 0),
+        &PyUShortArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "USHORT", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+
+    
+#line 4816
+
+    s = PyArray_typeinforanged(
+        NPY_INTLTR, NPY_INT, NPY_BITSOF_INT, NPY_ALIGNOF(npy_int),
+        Py_BuildValue("i", NPY_MAX_INT),
+        Py_BuildValue("i", NPY_MIN_INT),
+        &PyIntArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "INT", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+
+    
+#line 4816
+
+    s = PyArray_typeinforanged(
+        NPY_UINTLTR, NPY_UINT, NPY_BITSOF_INT, NPY_ALIGNOF(npy_uint),
+        Py_BuildValue("N", PyLong_FromUnsignedLong(NPY_MAX_UINT)),
+        Py_BuildValue("i", 0),
+        &PyUIntArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "UINT", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+
+    
+#line 4816
+
+    s = PyArray_typeinforanged(
+        NPY_INTPLTR, NPY_INTP, NPY_BITSOF_INTP, NPY_ALIGNOF(npy_intp),
+        Py_BuildValue("N", PyLong_FromLongLong((npy_longlong) NPY_MAX_INTP)),
+        Py_BuildValue("N", PyLong_FromLongLong((npy_longlong) NPY_MIN_INTP)),
+        &PyIntpArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "INTP", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+
+    
+#line 4816
+
+    s = PyArray_typeinforanged(
+        NPY_UINTPLTR, NPY_UINTP, NPY_BITSOF_INTP, NPY_ALIGNOF(npy_uintp),
+        Py_BuildValue("N", PyLong_FromUnsignedLongLong((npy_ulonglong) NPY_MAX_UINTP)),
+        Py_BuildValue("i", 0),
+        &PyUIntpArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "UINTP", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+
+    
+#line 4816
+
+    s = PyArray_typeinforanged(
+        NPY_LONGLTR, NPY_LONG, NPY_BITSOF_LONG, NPY_ALIGNOF(npy_long),
+        Py_BuildValue("l", NPY_MAX_LONG),
+        Py_BuildValue("l", NPY_MIN_LONG),
+        &PyLongArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "LONG", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+
+    
+#line 4816
+
+    s = PyArray_typeinforanged(
+        NPY_ULONGLTR, NPY_ULONG, NPY_BITSOF_LONG, NPY_ALIGNOF(npy_ulong),
+        Py_BuildValue("N", PyLong_FromUnsignedLong((npy_ulong) NPY_MAX_ULONG)),
+        Py_BuildValue("i", 0),
+        &PyULongArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "ULONG", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+
+    
+#line 4816
+
+    s = PyArray_typeinforanged(
+        NPY_LONGLONGLTR, NPY_LONGLONG, NPY_BITSOF_LONGLONG, NPY_ALIGNOF(npy_longlong),
+        Py_BuildValue("N", PyLong_FromLongLong((npy_longlong) NPY_MAX_LONGLONG)),
+        Py_BuildValue("N", PyLong_FromLongLong((npy_longlong) NPY_MIN_LONGLONG)),
+        &PyLongLongArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "LONGLONG", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+
+    
+#line 4816
+
+    s = PyArray_typeinforanged(
+        NPY_ULONGLONGLTR, NPY_ULONGLONG, NPY_BITSOF_LONGLONG, NPY_ALIGNOF(npy_ulonglong),
+        Py_BuildValue("N", PyLong_FromUnsignedLongLong((npy_ulonglong) NPY_MAX_ULONGLONG)),
+        Py_BuildValue("i", 0),
+        &PyULongLongArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "ULONGLONG", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+
+    
+
+
+    #line 4847
+    s = PyArray_typeinfo(
+        NPY_HALFLTR, NPY_HALF, NPY_BITSOF_HALF,
+        NPY_ALIGNOF(npy_half), &PyHalfArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "HALF", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+    
+#line 4847
+    s = PyArray_typeinfo(
+        NPY_FLOATLTR, NPY_FLOAT, NPY_BITSOF_FLOAT,
+        NPY_ALIGNOF(npy_float), &PyFloatArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "FLOAT", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+    
+#line 4847
+    s = PyArray_typeinfo(
+        NPY_DOUBLELTR, NPY_DOUBLE, NPY_BITSOF_DOUBLE,
+        NPY_ALIGNOF(npy_double), &PyDoubleArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "DOUBLE", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+    
+#line 4847
+    s = PyArray_typeinfo(
+        NPY_LONGDOUBLELTR, NPY_LONGDOUBLE, NPY_BITSOF_LONGDOUBLE,
+        NPY_ALIGNOF(npy_longdouble), &PyLongDoubleArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "LONGDOUBLE", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+    
+#line 4847
+    s = PyArray_typeinfo(
+        NPY_CFLOATLTR, NPY_CFLOAT, NPY_BITSOF_CFLOAT,
+        NPY_ALIGNOF(npy_cfloat), &PyCFloatArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "CFLOAT", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+    
+#line 4847
+    s = PyArray_typeinfo(
+        NPY_CDOUBLELTR, NPY_CDOUBLE, NPY_BITSOF_CDOUBLE,
+        NPY_ALIGNOF(npy_cdouble), &PyCDoubleArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "CDOUBLE", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+    
+#line 4847
+    s = PyArray_typeinfo(
+        NPY_CLONGDOUBLELTR, NPY_CLONGDOUBLE, NPY_BITSOF_CLONGDOUBLE,
+        NPY_ALIGNOF(npy_clongdouble), &PyCLongDoubleArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "CLONGDOUBLE", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+    
+
+    s = PyArray_typeinfo(
+        NPY_OBJECTLTR, NPY_OBJECT, sizeof(PyObject *) * CHAR_BIT,
+        NPY_ALIGNOF(PyObject *),
+        &PyObjectArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "OBJECT", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    s = PyArray_typeinfo(
+        NPY_STRINGLTR, NPY_STRING, 0, NPY_ALIGNOF(char),
+        &PyStringArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "STRING", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    s = PyArray_typeinfo(
+        NPY_UNICODELTR, NPY_UNICODE, 0, NPY_ALIGNOF(npy_ucs4),
+        &PyUnicodeArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "UNICODE", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    s = PyArray_typeinfo(
+        NPY_VOIDLTR, NPY_VOID, 0, NPY_ALIGNOF(char),
+        &PyVoidArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "VOID", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    s = PyArray_typeinforanged(
+        NPY_DATETIMELTR, NPY_DATETIME, NPY_BITSOF_DATETIME,
+        NPY_ALIGNOF(npy_datetime),
+        MyPyLong_FromInt64(NPY_MAX_DATETIME),
+        MyPyLong_FromInt64(NPY_MIN_DATETIME),
+        &PyDatetimeArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "DATETIME", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    s = PyArray_typeinforanged(
+        NPY_TIMEDELTALTR, NPY_TIMEDELTA, NPY_BITSOF_TIMEDELTA,
+        NPY_ALIGNOF(npy_timedelta),
+        MyPyLong_FromInt64(NPY_MAX_TIMEDELTA),
+        MyPyLong_FromInt64(NPY_MIN_TIMEDELTA),
+        &PyTimedeltaArrType_Type
+    );
+    if (s == NULL) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+    ret = PyDict_SetItemString(infodict, "TIMEDELTA", s);
+    Py_DECREF(s);
+    if (ret < 0) {
+        Py_DECREF(infodict);
+        return -1;
+    }
+
+#define SETTYPE(name)                                   \
+    Py_INCREF(&Py##name##ArrType_Type);                 \
+    if (PyDict_SetItemString(infodict, #name,           \
+            (PyObject *)&Py##name##ArrType_Type) < 0) { \
+        Py_DECREF(infodict);                            \
+        return -1;                                      \
+    }
+
+    SETTYPE(Generic);
+    SETTYPE(Number);
+    SETTYPE(Integer);
+    SETTYPE(Inexact);
+    SETTYPE(SignedInteger);
+    SETTYPE(UnsignedInteger);
+    SETTYPE(Floating);
+    SETTYPE(ComplexFloating);
+    SETTYPE(Flexible);
+    SETTYPE(Character);
+
+#undef SETTYPE
+
+    ret = PyDict_SetItemString(dict, "typeinfo", infodict);
+    Py_DECREF(infodict);
+    if (ret < 0) {
+        return -1;
+    }
+    return 0;
+}
+
+#undef _MAX_LETTER
+
diff --git a/numpy/core/src/_generated/arraytypes.h b/numpy/core/src/_generated/arraytypes.h
new file mode 100644
index 000000000000..322ac3dc7b32
--- /dev/null
+++ b/numpy/core/src/_generated/arraytypes.h
@@ -0,0 +1,748 @@
+#line 1 "numpy/core/src/multiarray/arraytypes.h.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+#ifndef NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_
+#define NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_
+
+#include "common.h"
+
+NPY_NO_EXPORT int
+set_typeinfo(PyObject *dict);
+
+/* needed for blasfuncs */
+NPY_NO_EXPORT void
+FLOAT_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+CFLOAT_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+DOUBLE_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+NPY_NO_EXPORT void
+CDOUBLE_dot(char *, npy_intp, char *, npy_intp, char *, npy_intp, void *);
+
+
+/* for _pyarray_correlate */
+NPY_NO_EXPORT int
+small_correlate(const char * d_, npy_intp dstride,
+                npy_intp nd, enum NPY_TYPES dtype,
+                const char * k_, npy_intp kstride,
+                npy_intp nk, enum NPY_TYPES ktype,
+                char * out_, npy_intp ostride);
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+BYTE_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+UBYTE_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+SHORT_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+USHORT_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+INT_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+UINT_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+LONG_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+ULONG_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+LONGLONG_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+ULONGLONG_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+HALF_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+FLOAT_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+DOUBLE_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+LONGDOUBLE_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+CFLOAT_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+CDOUBLE_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+#line 37
+/*
+ * The setitem functions are currently directly used in certain branches
+ * of the scalar-math code. (Yes, this would be nice to refactor...)
+ */
+
+NPY_NO_EXPORT int
+CLONGDOUBLE_setitem(PyObject *obj, void *data_ptr, void *arr);
+
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "argfunc.dispatch.h"
+#endif
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int BYTE_argmax,
+    (npy_byte *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int BYTE_argmin,
+    (npy_byte *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int UBYTE_argmax,
+    (npy_ubyte *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int UBYTE_argmin,
+    (npy_ubyte *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int SHORT_argmax,
+    (npy_short *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int SHORT_argmin,
+    (npy_short *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int USHORT_argmax,
+    (npy_ushort *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int USHORT_argmin,
+    (npy_ushort *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int INT_argmax,
+    (npy_int *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int INT_argmin,
+    (npy_int *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int UINT_argmax,
+    (npy_uint *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int UINT_argmin,
+    (npy_uint *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int LONG_argmax,
+    (npy_long *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int LONG_argmin,
+    (npy_long *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int ULONG_argmax,
+    (npy_ulong *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int ULONG_argmin,
+    (npy_ulong *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int LONGLONG_argmax,
+    (npy_longlong *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int LONGLONG_argmin,
+    (npy_longlong *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int ULONGLONG_argmax,
+    (npy_ulonglong *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int ULONGLONG_argmin,
+    (npy_ulonglong *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int FLOAT_argmax,
+    (npy_float *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int FLOAT_argmin,
+    (npy_float *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int DOUBLE_argmax,
+    (npy_double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int DOUBLE_argmin,
+    (npy_double *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+#line 59
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int LONGDOUBLE_argmax,
+    (npy_longdouble *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+#line 62
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int LONGDOUBLE_argmin,
+    (npy_longdouble *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT int BOOL_argmax,
+    (npy_bool *ip, npy_intp n, npy_intp *max_ind, PyArrayObject *aip))
+
+
+/*
+ * Python-facing DType and scalar type names and aliases: NPY_<TYPE>_name
+ * is the lowercase spelling, NPY_<TYPE>_Name the capitalized one.
+ */
+#line 91
+#define NPY_BOOL_name "bool_"
+#define NPY_BOOL_Name "Bool"
+
+
+#line 91
+#define NPY_HALF_name "float16"
+#define NPY_HALF_Name "Float16"
+
+
+#line 91
+#define NPY_FLOAT_name "float32"
+#define NPY_FLOAT_Name "Float32"
+
+
+#line 91
+#define NPY_DOUBLE_name "float64"
+#define NPY_DOUBLE_Name "Float64"
+
+
+#line 91
+#define NPY_LONGDOUBLE_name "longdouble"
+#define NPY_LONGDOUBLE_Name "LongDouble"
+
+
+#line 91
+#define NPY_CFLOAT_name "complex64"
+#define NPY_CFLOAT_Name "Complex64"
+
+
+#line 91
+#define NPY_CDOUBLE_name "complex128"
+#define NPY_CDOUBLE_Name "Complex128"
+
+
+#line 91
+#define NPY_CLONGDOUBLE_name "clongdouble"
+#define NPY_CLONGDOUBLE_Name "CLongDouble"
+
+
+#line 91
+#define NPY_STRING_name "bytes_"
+#define NPY_STRING_Name "Bytes"
+
+
+#line 91
+#define NPY_UNICODE_name "str_"
+#define NPY_UNICODE_Name "Str"
+
+
+#line 91
+#define NPY_VOID_name "void"
+#define NPY_VOID_Name "Void"
+
+
+#line 91
+#define NPY_OBJECT_name "object_"
+#define NPY_OBJECT_Name "Object"
+
+
+#line 91
+#define NPY_DATETIME_name "datetime64"
+#define NPY_DATETIME_Name "DateTime64"
+
+
+#line 91
+#define NPY_TIMEDELTA_name "timedelta64"
+#define NPY_TIMEDELTA_Name "TimeDelta64"
+
+
+
+
+/*
+ * Give integers different names when they are the same size (gh-9799).
+ * `intX` always refers to the first int of that size in the sequence
+ * `['LONG', 'LONGLONG', 'INT', 'SHORT', 'BYTE']`; a type whose size is
+ * already taken gets `<TYPE>_not_size_named` below and keeps its C name.
+ * No flag is defined for LONG here, so LONG is always size-named.
+ * Since the bitsize names are not strictly fixed, the C name is also
+ * added as an alias for every size-named integer type.
+ * Floats currently get no C aliases (they are always the same).
+ */
+
+#if NPY_SIZEOF_BYTE == NPY_SIZEOF_SHORT
+    #define BYTE_not_size_named
+#endif
+#if NPY_SIZEOF_SHORT == NPY_SIZEOF_INT
+    #define SHORT_not_size_named
+#endif
+#if NPY_SIZEOF_INT == NPY_SIZEOF_LONG
+    #define INT_not_size_named
+#endif
+#if NPY_SIZEOF_LONGLONG == NPY_SIZEOF_LONG
+    #define LONGLONG_not_size_named
+#endif
+
+
+#line 133
+/* BYTE is flagged (earlier in this header) when it is as wide as SHORT. */
+#ifdef BYTE_not_size_named
+    #define NPY_BYTE_name "byte"
+    #define NPY_BYTE_Name "Byte"
+#else
+    /* The C name is only an alias; the primary name is the sized one below: */
+    #define NPY_BYTE_alias "byte"
+    #define NPY_BYTE_Alias "Byte"
+
+    /* The bitsof macro includes math, so cannot be stringified */
+    #if NPY_BITSOF_BYTE == 8
+        #define NPY_BYTE_name "int8"
+        #define NPY_BYTE_Name "Int8"
+    #elif NPY_BITSOF_BYTE == 16
+        #define NPY_BYTE_name "int16"
+        #define NPY_BYTE_Name "Int16"
+    #elif NPY_BITSOF_BYTE == 32
+        #define NPY_BYTE_name "int32"
+        #define NPY_BYTE_Name "Int32"
+    #elif NPY_BITSOF_BYTE == 64
+        #define NPY_BYTE_name "int64"
+        #define NPY_BYTE_Name "Int64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* SHORT is flagged (earlier in this header) when it is as wide as INT. */
+#ifdef SHORT_not_size_named
+    #define NPY_SHORT_name "short"
+    #define NPY_SHORT_Name "Short"
+#else
+    /* The C name is only an alias; the primary name is the sized one below: */
+    #define NPY_SHORT_alias "short"
+    #define NPY_SHORT_Alias "Short"
+
+    /* The bitsof macro includes math, so cannot be stringified */
+    #if NPY_BITSOF_SHORT == 8
+        #define NPY_SHORT_name "int8"
+        #define NPY_SHORT_Name "Int8"
+    #elif NPY_BITSOF_SHORT == 16
+        #define NPY_SHORT_name "int16"
+        #define NPY_SHORT_Name "Int16"
+    #elif NPY_BITSOF_SHORT == 32
+        #define NPY_SHORT_name "int32"
+        #define NPY_SHORT_Name "Int32"
+    #elif NPY_BITSOF_SHORT == 64
+        #define NPY_SHORT_name "int64"
+        #define NPY_SHORT_Name "Int64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* INT is flagged (earlier in this header) when it is as wide as LONG. */
+#ifdef INT_not_size_named
+    #define NPY_INT_name "intc"
+    #define NPY_INT_Name "Int"
+#else
+    /* The C name is only an alias; the primary name is the sized one below: */
+    #define NPY_INT_alias "intc"
+    #define NPY_INT_Alias "Int"
+
+    /* The bitsof macro includes math, so cannot be stringified */
+    #if NPY_BITSOF_INT == 8
+        #define NPY_INT_name "int8"
+        #define NPY_INT_Name "Int8"
+    #elif NPY_BITSOF_INT == 16
+        #define NPY_INT_name "int16"
+        #define NPY_INT_Name "Int16"
+    #elif NPY_BITSOF_INT == 32
+        #define NPY_INT_name "int32"
+        #define NPY_INT_Name "Int32"
+    #elif NPY_BITSOF_INT == 64
+        #define NPY_INT_name "int64"
+        #define NPY_INT_Name "Int64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* This header never defines LONG_not_size_named: expect the #else branch. */
+#ifdef LONG_not_size_named
+    #define NPY_LONG_name "int_"
+    #define NPY_LONG_Name "Long"
+#else
+    /* The C name is only an alias; the primary name is the sized one below: */
+    #define NPY_LONG_alias "int_"
+    #define NPY_LONG_Alias "Long"
+
+    /* The bitsof macro includes math, so cannot be stringified */
+    #if NPY_BITSOF_LONG == 8
+        #define NPY_LONG_name "int8"
+        #define NPY_LONG_Name "Int8"
+    #elif NPY_BITSOF_LONG == 16
+        #define NPY_LONG_name "int16"
+        #define NPY_LONG_Name "Int16"
+    #elif NPY_BITSOF_LONG == 32
+        #define NPY_LONG_name "int32"
+        #define NPY_LONG_Name "Int32"
+    #elif NPY_BITSOF_LONG == 64
+        #define NPY_LONG_name "int64"
+        #define NPY_LONG_Name "Int64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* LONGLONG is flagged (earlier in this header) when as wide as LONG. */
+#ifdef LONGLONG_not_size_named
+    #define NPY_LONGLONG_name "longlong"
+    #define NPY_LONGLONG_Name "LongLong"
+#else
+    /* The C name is only an alias; the primary name is the sized one below: */
+    #define NPY_LONGLONG_alias "longlong"
+    #define NPY_LONGLONG_Alias "LongLong"
+
+    /* The bitsof macro includes math, so cannot be stringified */
+    #if NPY_BITSOF_LONGLONG == 8
+        #define NPY_LONGLONG_name "int8"
+        #define NPY_LONGLONG_Name "Int8"
+    #elif NPY_BITSOF_LONGLONG == 16
+        #define NPY_LONGLONG_name "int16"
+        #define NPY_LONGLONG_Name "Int16"
+    #elif NPY_BITSOF_LONGLONG == 32
+        #define NPY_LONGLONG_name "int32"
+        #define NPY_LONGLONG_Name "Int32"
+    #elif NPY_BITSOF_LONGLONG == 64
+        #define NPY_LONGLONG_name "int64"
+        #define NPY_LONGLONG_Name "Int64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* Unsigned twin of BYTE: reuses BYTE's flag and NPY_BITSOF_BYTE. */
+#ifdef BYTE_not_size_named
+    #define NPY_UBYTE_name "ubyte"
+    #define NPY_UBYTE_Name "UByte"
+#else
+    /* The C name is only an alias; the primary name is the sized one below: */
+    #define NPY_UBYTE_alias "ubyte"
+    #define NPY_UBYTE_Alias "UByte"
+
+    /* The bitsof macro includes math, so cannot be stringified */
+    #if NPY_BITSOF_BYTE == 8
+        #define NPY_UBYTE_name "uint8"
+        #define NPY_UBYTE_Name "UInt8"
+    #elif NPY_BITSOF_BYTE == 16
+        #define NPY_UBYTE_name "uint16"
+        #define NPY_UBYTE_Name "UInt16"
+    #elif NPY_BITSOF_BYTE == 32
+        #define NPY_UBYTE_name "uint32"
+        #define NPY_UBYTE_Name "UInt32"
+    #elif NPY_BITSOF_BYTE == 64
+        #define NPY_UBYTE_name "uint64"
+        #define NPY_UBYTE_Name "UInt64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* Unsigned twin of SHORT: reuses SHORT's flag and NPY_BITSOF_SHORT. */
+#ifdef SHORT_not_size_named
+    #define NPY_USHORT_name "ushort"
+    #define NPY_USHORT_Name "UShort"
+#else
+    /* The C name is only an alias; the primary name is the sized one below: */
+    #define NPY_USHORT_alias "ushort"
+    #define NPY_USHORT_Alias "UShort"
+
+    /* The bitsof macro includes math, so cannot be stringified */
+    #if NPY_BITSOF_SHORT == 8
+        #define NPY_USHORT_name "uint8"
+        #define NPY_USHORT_Name "UInt8"
+    #elif NPY_BITSOF_SHORT == 16
+        #define NPY_USHORT_name "uint16"
+        #define NPY_USHORT_Name "UInt16"
+    #elif NPY_BITSOF_SHORT == 32
+        #define NPY_USHORT_name "uint32"
+        #define NPY_USHORT_Name "UInt32"
+    #elif NPY_BITSOF_SHORT == 64
+        #define NPY_USHORT_name "uint64"
+        #define NPY_USHORT_Name "UInt64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* Unsigned twin of INT: reuses INT's flag and NPY_BITSOF_INT. */
+#ifdef INT_not_size_named
+    #define NPY_UINT_name "uintc"
+    #define NPY_UINT_Name "UInt"
+#else
+    /* The C name is only an alias; the primary name is the sized one below: */
+    #define NPY_UINT_alias "uintc"
+    #define NPY_UINT_Alias "UInt"
+
+    /* The bitsof macro includes math, so cannot be stringified */
+    #if NPY_BITSOF_INT == 8
+        #define NPY_UINT_name "uint8"
+        #define NPY_UINT_Name "UInt8"
+    #elif NPY_BITSOF_INT == 16
+        #define NPY_UINT_name "uint16"
+        #define NPY_UINT_Name "UInt16"
+    #elif NPY_BITSOF_INT == 32
+        #define NPY_UINT_name "uint32"
+        #define NPY_UINT_Name "UInt32"
+    #elif NPY_BITSOF_INT == 64
+        #define NPY_UINT_name "uint64"
+        #define NPY_UINT_Name "UInt64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* Unsigned twin of LONG: its flag is never defined in this header. */
+#ifdef LONG_not_size_named
+    #define NPY_ULONG_name "uint"
+    #define NPY_ULONG_Name "ULong"
+#else
+    /* The C name is only an alias; the primary name is the sized one below: */
+    #define NPY_ULONG_alias "uint"
+    #define NPY_ULONG_Alias "ULong"
+
+    /* The bitsof macro includes math, so cannot be stringified */
+    #if NPY_BITSOF_LONG == 8
+        #define NPY_ULONG_name "uint8"
+        #define NPY_ULONG_Name "UInt8"
+    #elif NPY_BITSOF_LONG == 16
+        #define NPY_ULONG_name "uint16"
+        #define NPY_ULONG_Name "UInt16"
+    #elif NPY_BITSOF_LONG == 32
+        #define NPY_ULONG_name "uint32"
+        #define NPY_ULONG_Name "UInt32"
+    #elif NPY_BITSOF_LONG == 64
+        #define NPY_ULONG_name "uint64"
+        #define NPY_ULONG_Name "UInt64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+#line 133
+/* Unsigned twin of LONGLONG: reuses its flag and NPY_BITSOF_LONGLONG. */
+#ifdef LONGLONG_not_size_named
+    #define NPY_ULONGLONG_name "ulonglong"
+    #define NPY_ULONGLONG_Name "ULongLong"
+#else
+    /* The C name is only an alias; the primary name is the sized one below: */
+    #define NPY_ULONGLONG_alias "ulonglong"
+    #define NPY_ULONGLONG_Alias "ULongLong"
+
+    /* The bitsof macro includes math, so cannot be stringified */
+    #if NPY_BITSOF_LONGLONG == 8
+        #define NPY_ULONGLONG_name "uint8"
+        #define NPY_ULONGLONG_Name "UInt8"
+    #elif NPY_BITSOF_LONGLONG == 16
+        #define NPY_ULONGLONG_name "uint16"
+        #define NPY_ULONGLONG_Name "UInt16"
+    #elif NPY_BITSOF_LONGLONG == 32
+        #define NPY_ULONGLONG_name "uint32"
+        #define NPY_ULONGLONG_Name "UInt32"
+    #elif NPY_BITSOF_LONGLONG == 64
+        #define NPY_ULONGLONG_name "uint64"
+        #define NPY_ULONGLONG_Name "UInt64"
+    #else
+        #error "need to fix integer bit-length name code"
+    #endif
+#endif
+
+
+/* The helper flags are only needed above; undefine them again. */
+#undef BYTE_not_size_named
+#undef SHORT_not_size_named
+#undef INT_not_size_named
+#undef LONGLONG_not_size_named
+
+#endif  /* NUMPY_CORE_SRC_MULTIARRAY_ARRAYTYPES_H_ */
+
diff --git a/numpy/core/src/_generated/einsum.c b/numpy/core/src/_generated/einsum.c
new file mode 100644
index 000000000000..a88344d885c0
--- /dev/null
+++ b/numpy/core/src/_generated/einsum.c
@@ -0,0 +1,1187 @@
+#line 1 "numpy/core/src/multiarray/einsum.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*
+ * This file contains the implementation of the 'einsum' function,
+ * which provides an einstein-summation operation.
+ *
+ * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com)
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <structmember.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#include <numpy/npy_common.h>
+#include <numpy/arrayobject.h>
+#include <npy_pycompat.h>
+#include <array_assign.h>   //PyArray_AssignRawScalar
+
+#include <ctype.h>
+
+#include "convert.h"
+#include "common.h"
+#include "ctors.h"
+
+#include "einsum_sumprod.h"
+#include "einsum_debug.h"
+
+
+/*
+ * Converts the subscript string of one einsum operand into the per-axis
+ * labels 'op_labels[0..ndim-1]': the character code for the first use of a
+ * label, the (negative) offset back to that first use for a repeat, and
+ * zero for axes covered by an ellipsis. Every label read also increments
+ * its 'label_counts' entry and widens '*min_label'/'*max_label'.
+ * Returns 0 on success, or -1 with a ValueError set. For example:
+ *  - subscripts="abbcbc",  ndim=6 -> op_labels=[97, 98, -1, 99, -3, -2]
+ *  - subscripts="ab...bc", ndim=6 -> op_labels=[97, 98, 0, 0, -3, 99]
+ */
+static int
+parse_operand_subscripts(char *subscripts, int length,
+                         int ndim, int iop, char *op_labels,
+                         char *label_counts, int *min_label, int *max_label)
+{
+    int pos, axis;
+    int nlabels = 0;       /* number of labels written to op_labels */
+    int ellipsis_at = -1;  /* label position of the '...', -1 if none */
+
+    for (pos = 0; pos < length; ++pos) {
+        int ch = subscripts[pos];
+
+        /* An alphabetic character labels the next axis. */
+        if (ch > 0 && isalpha(ch)) {
+            if (nlabels >= ndim) {
+                PyErr_Format(PyExc_ValueError,
+                             "einstein sum subscripts string contains "
+                             "too many subscripts for operand %d", iop);
+                return -1;
+            }
+            op_labels[nlabels] = ch;
+            ++nlabels;
+            if (*min_label > ch) {
+                *min_label = ch;
+            }
+            if (*max_label < ch) {
+                *max_label = ch;
+            }
+            label_counts[ch] += 1;
+            continue;
+        }
+
+        /* A '.' must open a "..." and only one ellipsis is allowed. */
+        if (ch == '.') {
+            if (ellipsis_at != -1 || pos + 2 >= length
+                    || subscripts[pos + 1] != '.'
+                    || subscripts[pos + 2] != '.') {
+                PyErr_Format(PyExc_ValueError,
+                             "einstein sum subscripts string contains a "
+                             "'.' that is not part of an ellipsis ('...') "
+                             "in operand %d", iop);
+                return -1;
+            }
+
+            /* Skip the two dots just verified. */
+            pos += 2;
+            ellipsis_at = nlabels;
+            continue;
+        }
+
+        /* Blanks are ignored; every other character is an error. */
+        if (ch != ' ') {
+            PyErr_Format(PyExc_ValueError,
+                         "invalid subscript '%c' in einstein sum "
+                         "subscripts string, subscripts must "
+                         "be letters", (char)ch);
+            return -1;
+        }
+    }
+
+    if (ellipsis_at == -1) {
+        /* With no ellipsis, every dimension needs its own label. */
+        if (nlabels != ndim) {
+            PyErr_Format(PyExc_ValueError,
+                         "operand has more dimensions than subscripts "
+                         "given in einstein sum, but no '...' ellipsis "
+                         "provided to broadcast the extra dimensions.");
+            return -1;
+        }
+    }
+    else if (nlabels < ndim) {
+        int src, dst;
+
+        /* Shift the labels that followed the ellipsis to the tail. */
+        for (src = nlabels - 1, dst = ndim - 1; src >= ellipsis_at;
+                --src, --dst) {
+            op_labels[dst] = op_labels[src];
+        }
+        /* Zero the gap: those axes are the broadcast dimensions. */
+        for (dst = ellipsis_at; dst < ellipsis_at + (ndim - nlabels); ++dst) {
+            op_labels[dst] = 0;
+        }
+    }
+
+    /*
+     * Replace each repeat of a label with the negative offset to its
+     * first occurrence. Reading through (signed char) keeps offsets
+     * written earlier negative whether plain char is signed or not.
+     */
+    for (axis = 0; axis < ndim - 1; ++axis) {
+        char *first = op_labels + axis;
+        int ch = (signed char)*first;
+        char *dup;
+
+        if (ch <= 0) {
+            continue;
+        }
+        for (dup = memchr(first + 1, ch, ndim - axis - 1); dup != NULL;
+                dup = memchr(dup + 1, ch, op_labels + ndim - 1 - dup)) {
+            *dup = (char)(first - dup);
+        }
+    }
+
+    return 0;
+}
+
+
+/*
+ * Converts the output subscript string into 'out_labels': the character
+ * code for each labeled axis and zero for each of the 'ndim_broadcast'
+ * axes an ellipsis expands to. A label may appear only once and must have
+ * a nonzero 'label_counts' entry. Returns the total number of output
+ * dimensions, or -1 with a ValueError set.
+ */
+static int
+parse_output_subscripts(char *subscripts, int length,
+                        int ndim_broadcast,
+                        const char *label_counts, char *out_labels)
+{
+    int pos, stop;
+    int nout = 0;           /* entries written to out_labels so far */
+    int seen_ellipsis = 0;
+
+    for (pos = 0; pos < length; ++pos) {
+        int ch = subscripts[pos];
+
+        /* An alphabetic character labels the next output axis. */
+        if (ch > 0 && isalpha(ch)) {
+            /* It must not show up again later in the string. */
+            if (memchr(subscripts + pos + 1, ch, length - pos - 1) != NULL) {
+                PyErr_Format(PyExc_ValueError,
+                             "einstein sum subscripts string includes "
+                             "output subscript '%c' multiple times", (char)ch);
+                return -1;
+            }
+            /* Its count must be nonzero, i.e. some input used it. */
+            if (label_counts[ch] == 0) {
+                PyErr_Format(PyExc_ValueError,
+                             "einstein sum subscripts string included "
+                             "output subscript '%c' which never appeared "
+                             "in an input", (char)ch);
+                return -1;
+            }
+            /* At most NPY_MAXDIMS output labels are accepted. */
+            if (nout >= NPY_MAXDIMS) {
+                PyErr_Format(PyExc_ValueError,
+                             "einstein sum subscripts string contains "
+                             "too many subscripts in the output");
+                return -1;
+            }
+            out_labels[nout++] = ch;
+            continue;
+        }
+
+        /* A '.' must open a "..." and only one ellipsis is allowed. */
+        if (ch == '.') {
+            if (seen_ellipsis || pos + 2 >= length
+                    || subscripts[pos + 1] != '.'
+                    || subscripts[pos + 2] != '.') {
+                PyErr_SetString(PyExc_ValueError,
+                                "einstein sum subscripts string "
+                                "contains a '.' that is not part of "
+                                "an ellipsis ('...') in the output");
+                return -1;
+            }
+            pos += 2;  /* skip the two dots just verified */
+            if (nout + ndim_broadcast > NPY_MAXDIMS) {
+                PyErr_Format(PyExc_ValueError,
+                             "einstein sum subscripts string contains "
+                             "too many subscripts in the output");
+                return -1;
+            }
+            /* Emit one zero label per broadcast dimension. */
+            seen_ellipsis = 1;
+            for (stop = nout + ndim_broadcast; nout < stop; ++nout) {
+                out_labels[nout] = 0;
+            }
+            continue;
+        }
+
+        /* Blanks are ignored; every other character is an error. */
+        if (ch != ' ') {
+            PyErr_Format(PyExc_ValueError,
+                         "invalid subscript '%c' in einstein sum "
+                         "subscripts string, subscripts must "
+                         "be letters", (char)ch);
+            return -1;
+        }
+    }
+
+    /* Broadcast dimensions can only be placed through an ellipsis. */
+    if (ndim_broadcast > 0 && !seen_ellipsis) {
+        PyErr_SetString(PyExc_ValueError,
+                        "output has more dimensions than subscripts "
+                        "given in einstein sum, but no '...' ellipsis "
+                        "provided to broadcast the extra dimensions.");
+        return -1;
+    }
+
+    return nout;
+}
+
+
+/*
+ * Tries to express a one-operand einsum as a view of 'op'. This is only
+ * possible when every operand label also occurs in 'output_labels'.
+ *
+ * If the view can be made, '*ret' receives the new array; if some label
+ * is missing from the output, '*ret' is set to NULL. Both cases return 0.
+ * Returns -1 on error.
+ */
+static int
+get_single_op_view(PyArrayObject *op, char *labels,
+                   int ndim_output, char *output_labels,
+                   PyArrayObject **ret)
+{
+    npy_intp view_strides[NPY_MAXDIMS];
+    npy_intp view_dims[NPY_MAXDIMS];
+    int nd_op = PyArray_NDIM(op);
+    int next_bcast = 0;  /* first output slot not yet checked for a zero */
+    int axis, out_axis;
+
+    /* Start from all-zero shape and strides. */
+    for (out_axis = 0; out_axis < ndim_output; ++out_axis) {
+        view_dims[out_axis] = 0;
+        view_strides[out_axis] = 0;
+    }
+
+    /* Place every operand axis at its position in the output. */
+    for (axis = 0; axis < nd_op; ++axis) {
+        npy_intp dim = PyArray_DIM(op, axis);
+        npy_intp stride = PyArray_STRIDE(op, axis);
+        /* Read as signed so that merge offsets come out negative. */
+        int label = (signed char)labels[axis];
+
+        /* A negative entry points back at the axis holding the label. */
+        if (label < 0) {
+            label = labels[axis + label];
+        }
+
+        if (label != 0) {
+            /* Labeled axis: locate the same label in the output. */
+            char *match = (char *)memchr(output_labels, label, ndim_output);
+
+            /* Absent from the output means a reduction: no view. */
+            if (match == NULL) {
+                *ret = NULL;
+                return 0;
+            }
+
+            /* Axes merged into one output axis must agree in size. */
+            out_axis = match - output_labels;
+            if (view_dims[out_axis] != 0 && view_dims[out_axis] != dim) {
+                PyErr_Format(PyExc_ValueError,
+                        "dimensions in single operand for collapsing "
+                        "index '%c' don't match (%d != %d)",
+                        label, (int)view_dims[out_axis], (int)dim);
+                return -1;
+            }
+
+            /*
+             * Strides of axes sharing a label are summed into the one
+             * output stride.
+             */
+            view_dims[out_axis] = dim;
+            view_strides[out_axis] += stride;
+            continue;
+        }
+
+        /* Unlabeled axis: take the next zero-labeled output slot. */
+        while (next_bcast < ndim_output && output_labels[next_bcast] != 0) {
+            ++next_bcast;
+        }
+        if (next_bcast == ndim_output) {
+            PyErr_SetString(PyExc_ValueError,
+                    "output had too few broadcast dimensions");
+            return -1;
+        }
+        view_dims[next_bcast] = dim;
+        view_strides[next_bcast] = stride;
+        ++next_bcast;
+    }
+
+    /* Every axis was placed: build the array over op's data. */
+    Py_INCREF(PyArray_DESCR(op));
+    *ret = (PyArrayObject *)PyArray_NewFromDescr_int(
+            Py_TYPE(op), PyArray_DESCR(op),
+            ndim_output, view_dims, view_strides, PyArray_DATA(op),
+            PyArray_ISWRITEABLE(op) ? NPY_ARRAY_WRITEABLE : 0,
+            (PyObject *)op, (PyObject *)op, 0);
+    if (*ret == NULL) {
+        return -1;
+    }
+    return 0;
+}
+
+
+/*
+ * Reports whether any of the first 'ndim' entries of 'labels' is
+ * negative: returns 1 if so, 0 otherwise.  The parameter is declared
+ * as signed char because plain char may be signed or unsigned and the
+ * comparison needs it to be signed.
+ */
+static int
+_any_labels_are_negative(signed char *labels, int ndim)
+{
+    int found = 0;
+
+    while (!found && ndim-- > 0) {
+        found = (*labels++ < 0);
+    }
+
+    return found;
+}
+
+/*
+ * Builds a view of 'op' in which every group of axes sharing a label is
+ * collapsed into one axis running along the corresponding diagonal.
+ * 'labels' is rewritten in place to describe the axes of that view.
+ * When no label is repeated, 'op' itself is returned with an extra
+ * reference.  Returns NULL on failure, e.g. mismatched axis lengths.
+ */
+static PyArrayObject *
+get_combined_dims_view(PyArrayObject *op, int iop, char *labels)
+{
+    npy_intp view_strides[NPY_MAXDIMS];
+    npy_intp view_dims[NPY_MAXDIMS];
+    /*
+     * Maps each axis of the original array to its position in the
+     * combined view, or to -1 when the axis was merged into another.
+     */
+    int axis_to_view[NPY_MAXDIMS];
+    int axis, n_view = 0;
+    int n_axes = PyArray_NDIM(op);
+    PyArrayObject *view;
+
+    /* Nothing is repeated: hand back the operand itself. */
+    if (!_any_labels_are_negative((signed char *)labels, n_axes)) {
+        Py_INCREF(op);
+
+        return op;
+    }
+
+    /* Walk the operand axes, folding repeats into their first use. */
+    for (axis = 0; axis < n_axes; ++axis) {
+        /* Plain char may be unsigned, so read the label as signed. */
+        int label = (signed char)labels[axis];
+        npy_intp extent = PyArray_DIM(op, axis);
+        npy_intp step = PyArray_STRIDE(op, axis);
+
+        /* A negative label is an offset back to the axis it repeats. */
+        if (label < 0) {
+            /*
+             * The pragmas silence GCC's uninitialized-variable warnings
+             * for the array reads below (presumably those entries were
+             * always written by an earlier iteration - TODO confirm).
+             */
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wuninitialized"
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+            int target = axis_to_view[axis + label];
+
+            axis_to_view[axis] = -1;
+            if (view_dims[target] != extent) {
+                char orig_label = labels[axis + label];
+                PyErr_Format(PyExc_ValueError,
+                             "dimensions in operand %d for collapsing "
+                             "index '%c' don't match (%d != %d)",
+                             iop, orig_label, (int)view_dims[target],
+                             (int)extent);
+                return NULL;
+            }
+            view_strides[target] += step;
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+        }
+        /* First use of a label: it becomes the next axis of the view. */
+        else {
+            axis_to_view[axis] = n_view;
+            view_dims[n_view] = extent;
+            view_strides[n_view] = step;
+            ++n_view;
+        }
+    }
+
+    /* Compact the labels so they line up with the axes of the view. */
+    for (axis = 0; axis < n_axes; ++axis) {
+        int target = axis_to_view[axis];
+
+        if (target < 0) {
+            continue;
+        }
+        labels[target] = labels[axis];
+    }
+
+    /* Create a view of the operand with the compressed dimensions. */
+    Py_INCREF(PyArray_DESCR(op));
+    view = (PyArrayObject *)PyArray_NewFromDescrAndBase(
+            Py_TYPE(op), PyArray_DESCR(op),
+            n_view, view_dims, view_strides, PyArray_DATA(op),
+            PyArray_ISWRITEABLE(op) ? NPY_ARRAY_WRITEABLE : 0,
+            (PyObject *)op, (PyObject *)op);
+
+    return view;
+}
+
+/*
+ * Computes, for every iterator axis, which axis of one operand feeds it.
+ * 'labels' describes the 'ndim' axes of the operand and 'iter_labels'
+ * the 'ndim_iter' axes of the iterator; 'axes[i]' receives the operand
+ * axis for iterator axis i, or -1 where the operand is broadcast.
+ * 'iop' is not used.  Always returns 0.
+ */
+static int
+prepare_op_axes(int ndim, int iop, char *labels, int *axes,
+            int ndim_iter, char *iter_labels)
+{
+    int iter_axis;
+    /* Unlabeled (broadcast) axes are paired up starting from the right */
+    int next_bcast = ndim - 1;
+
+    for (iter_axis = ndim_iter - 1; iter_axis >= 0; --iter_axis) {
+        int label = iter_labels[iter_axis];
+
+        /* It's a labeled dimension, look for the same label in the op */
+        if (label != 0) {
+            char *hit = memchr(labels, label, ndim);
+
+            /* If the op doesn't have the label, broadcast it */
+            axes[iter_axis] = (hit == NULL) ? -1 : (int)(hit - labels);
+            continue;
+        }
+
+        /*
+         * It's an unlabeled broadcast dimension, move on to the next
+         * broadcast dimension of the operand.
+         */
+        while (next_bcast >= 0 && labels[next_bcast] != 0) {
+            --next_bcast;
+        }
+        if (next_bcast >= 0) {
+            axes[iter_axis] = next_bcast;
+            --next_bcast;
+        }
+        else {
+            /* All used up: extend the operand with a "newaxis" */
+            axes[iter_axis] = -1;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Hand-coded loop for one operand plus the output over an unbuffered 2-D
+ * iterator. Returns 0 on success, -1 with a Python exception set.
+ */
+static int
+unbuffered_loop_nop1_ndim2(NpyIter *iter)
+{
+    npy_intp outer, shape[2], strides[2][2];
+    char *ptrs[2][2];
+    sum_of_products_fn sop;
+    PyArray_Descr *descr;
+    int axis, k, needs_api;
+    NPY_BEGIN_THREADS_DEF;
+
+#if NPY_EINSUM_DBG_TRACING
+    NpyIter_DebugPrint(iter);
+#endif
+    NPY_EINSUM_DBG_PRINT("running hand-coded 1-op 2-dim loop\n");
+
+    NpyIter_GetShape(iter, shape);
+    for (axis = 0; axis < 2; ++axis) {
+        memcpy(strides[axis], NpyIter_GetAxisStrideArray(iter, axis),
+               sizeof(strides[axis]));
+    }
+    /* ptrs[0] is handed to the kernel, ptrs[1] keeps the axis-1 position */
+    memcpy(ptrs[0], NpyIter_GetInitialDataPtrArray(iter), sizeof(ptrs[0]));
+    memcpy(ptrs[1], ptrs[0], sizeof(ptrs[1]));
+
+    descr = NpyIter_GetDescrArray(iter)[0];
+    sop = get_sum_of_products_function(1, descr->type_num, descr->elsize,
+                                       strides[0]);
+    if (sop == NULL) {
+        PyErr_SetString(PyExc_TypeError,
+                    "invalid data type for einsum");
+        return -1;
+    }
+
+    /* Coordinates weren't tracked, so the iterator loop is Fortran-order */
+    needs_api = NpyIter_IterationNeedsAPI(iter);
+    if (!needs_api) {
+        NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]);
+    }
+    for (outer = shape[1]; outer > 0; --outer) {
+        sop(1, ptrs[0], strides[0], shape[0]);
+        if (needs_api && PyErr_Occurred()){
+            return -1;
+        }
+        /* Advance along axis 1 and restart axis 0 from the new position */
+        for (k = 0; k < 2; ++k) {
+            ptrs[1][k] += strides[1][k];
+            ptrs[0][k] = ptrs[1][k];
+        }
+    }
+    NPY_END_THREADS;
+
+    return 0;
+}
+
+/*
+ * Hand-coded loop for one operand plus the output over an unbuffered 3-D
+ * iterator. Returns 0 on success, -1 with a Python exception set.
+ */
+static int
+unbuffered_loop_nop1_ndim3(NpyIter *iter)
+{
+    npy_intp mid, outer, shape[3], strides[3][2];
+    char *ptrs[3][2];
+    sum_of_products_fn sop;
+    PyArray_Descr *descr;
+    int axis, k, needs_api;
+    NPY_BEGIN_THREADS_DEF;
+
+#if NPY_EINSUM_DBG_TRACING
+    NpyIter_DebugPrint(iter);
+#endif
+    NPY_EINSUM_DBG_PRINT("running hand-coded 1-op 3-dim loop\n");
+
+    NpyIter_GetShape(iter, shape);
+    for (axis = 0; axis < 3; ++axis) {
+        memcpy(strides[axis], NpyIter_GetAxisStrideArray(iter, axis),
+               sizeof(strides[axis]));
+    }
+    /* ptrs[0] goes to the kernel, ptrs[1]/ptrs[2] keep the outer positions */
+    memcpy(ptrs[0], NpyIter_GetInitialDataPtrArray(iter), sizeof(ptrs[0]));
+    memcpy(ptrs[1], ptrs[0], sizeof(ptrs[1]));
+    memcpy(ptrs[2], ptrs[0], sizeof(ptrs[2]));
+
+    descr = NpyIter_GetDescrArray(iter)[0];
+    sop = get_sum_of_products_function(1, descr->type_num, descr->elsize,
+                                       strides[0]);
+    if (sop == NULL) {
+        PyErr_SetString(PyExc_TypeError,
+                    "invalid data type for einsum");
+        return -1;
+    }
+
+    /* Coordinates weren't tracked, so the iterator loop is Fortran-order */
+    needs_api = NpyIter_IterationNeedsAPI(iter);
+    if (!needs_api) {
+        NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]);
+    }
+    for (outer = shape[2]; outer > 0; --outer) {
+        for (mid = shape[1]; mid > 0; --mid) {
+            sop(1, ptrs[0], strides[0], shape[0]);
+            if (needs_api && PyErr_Occurred()){
+                return -1;
+            }
+
+            /* Advance along axis 1 and restart axis 0 from there */
+            for (k = 0; k < 2; ++k) {
+                ptrs[1][k] += strides[1][k];
+                ptrs[0][k] = ptrs[1][k];
+            }
+        }
+        /* Advance along axis 2 and restart axes 0 and 1 from there */
+        for (k = 0; k < 2; ++k) {
+            ptrs[2][k] += strides[2][k];
+            ptrs[0][k] = ptrs[1][k] = ptrs[2][k];
+        }
+    }
+    NPY_END_THREADS;
+
+    return 0;
+}
+
+/*
+ * Hand-coded loop for two operands plus the output over an unbuffered
+ * 2-D iterator. Returns 0 on success, -1 with a Python exception set.
+ */
+static int
+unbuffered_loop_nop2_ndim2(NpyIter *iter)
+{
+    npy_intp outer, shape[2], strides[2][3];
+    char *ptrs[2][3];
+    sum_of_products_fn sop;
+    PyArray_Descr *descr;
+    int axis, k, needs_api;
+    NPY_BEGIN_THREADS_DEF;
+
+#if NPY_EINSUM_DBG_TRACING
+    NpyIter_DebugPrint(iter);
+#endif
+    NPY_EINSUM_DBG_PRINT("running hand-coded 2-op 2-dim loop\n");
+
+    NpyIter_GetShape(iter, shape);
+    for (axis = 0; axis < 2; ++axis) {
+        memcpy(strides[axis], NpyIter_GetAxisStrideArray(iter, axis),
+               sizeof(strides[axis]));
+    }
+    /* ptrs[0] is handed to the kernel, ptrs[1] keeps the axis-1 position */
+    memcpy(ptrs[0], NpyIter_GetInitialDataPtrArray(iter), sizeof(ptrs[0]));
+    memcpy(ptrs[1], ptrs[0], sizeof(ptrs[1]));
+
+    descr = NpyIter_GetDescrArray(iter)[0];
+    sop = get_sum_of_products_function(2, descr->type_num, descr->elsize,
+                                       strides[0]);
+    if (sop == NULL) {
+        PyErr_SetString(PyExc_TypeError,
+                    "invalid data type for einsum");
+        return -1;
+    }
+
+    /* Coordinates weren't tracked, so the iterator loop is Fortran-order */
+    needs_api = NpyIter_IterationNeedsAPI(iter);
+    if (!needs_api) {
+        NPY_BEGIN_THREADS_THRESHOLDED(shape[1] * shape[0]);
+    }
+    for (outer = shape[1]; outer > 0; --outer) {
+        sop(2, ptrs[0], strides[0], shape[0]);
+
+        if (needs_api && PyErr_Occurred()) {
+            return -1;
+        }
+
+        /* Advance along axis 1 and restart axis 0 from the new position */
+        for (k = 0; k < 3; ++k) {
+            ptrs[1][k] += strides[1][k];
+            ptrs[0][k] = ptrs[1][k];
+        }
+    }
+    NPY_END_THREADS;
+
+    return 0;
+}
+
+/*
+ * Hand-coded loop for two operands plus the output over an unbuffered
+ * 3-D iterator. Returns 0 on success, -1 with a Python exception set.
+ */
+static int
+unbuffered_loop_nop2_ndim3(NpyIter *iter)
+{
+    npy_intp mid, outer, shape[3], strides[3][3];
+    char *ptrs[3][3];
+    sum_of_products_fn sop;
+    PyArray_Descr *descr;
+    int axis, k, needs_api;
+    NPY_BEGIN_THREADS_DEF;
+
+#if NPY_EINSUM_DBG_TRACING
+    NpyIter_DebugPrint(iter);
+#endif
+    NPY_EINSUM_DBG_PRINT("running hand-coded 2-op 3-dim loop\n");
+
+    NpyIter_GetShape(iter, shape);
+    for (axis = 0; axis < 3; ++axis) {
+        memcpy(strides[axis], NpyIter_GetAxisStrideArray(iter, axis),
+               sizeof(strides[axis]));
+    }
+    /* ptrs[0] goes to the kernel, ptrs[1]/ptrs[2] keep the outer positions */
+    memcpy(ptrs[0], NpyIter_GetInitialDataPtrArray(iter), sizeof(ptrs[0]));
+    memcpy(ptrs[1], ptrs[0], sizeof(ptrs[1]));
+    memcpy(ptrs[2], ptrs[0], sizeof(ptrs[2]));
+
+    descr = NpyIter_GetDescrArray(iter)[0];
+    sop = get_sum_of_products_function(2, descr->type_num, descr->elsize,
+                                       strides[0]);
+    if (sop == NULL) {
+        PyErr_SetString(PyExc_TypeError,
+                    "invalid data type for einsum");
+        return -1;
+    }
+
+    /*
+     * Since the iterator wasn't tracking coordinates, the
+     * loop provided by the iterator is in Fortran-order.
+     */
+    needs_api = NpyIter_IterationNeedsAPI(iter);
+    if (!needs_api) {
+        NPY_BEGIN_THREADS_THRESHOLDED(shape[2] * shape[1] * shape[0]);
+    }
+    for (outer = shape[2]; outer > 0; --outer) {
+        for (mid = shape[1]; mid > 0; --mid) {
+            sop(2, ptrs[0], strides[0], shape[0]);
+
+            if (needs_api && PyErr_Occurred()) {
+                return -1;
+            }
+
+            /* Advance along axis 1 and restart axis 0 from there */
+            for (k = 0; k < 3; ++k) {
+                ptrs[1][k] += strides[1][k];
+                ptrs[0][k] = ptrs[1][k];
+            }
+        }
+        /* Advance along axis 2 and restart axes 0 and 1 from there */
+        for (k = 0; k < 3; ++k) {
+            ptrs[2][k] += strides[2][k];
+            ptrs[0][k] = ptrs[1][k] = ptrs[2][k];
+        }
+    }
+    NPY_END_THREADS;
+
+    return 0;
+}
+
+
+/*NUMPY_API
+ * This function provides summation of array elements according to
+ * the Einstein summation convention.  For example:
+ *  - trace(a)        -> einsum("ii", a)
+ *  - transpose(a)    -> einsum("ji", a)
+ *  - multiply(a,b)   -> einsum(",", a, b)
+ *  - inner(a,b)      -> einsum("i,i", a, b)
+ *  - outer(a,b)      -> einsum("i,j", a, b)
+ *  - matvec(a,b)     -> einsum("ij,j", a, b)
+ *  - matmat(a,b)     -> einsum("ij,jk", a, b)
+ *
+ * subscripts: The string of subscripts for einstein summation.
+ * nop:        The number of operands
+ * op_in:      The array of operands
+ * dtype:      Either NULL, or the data type to force the calculation as.
+ * order:      The order for the calculation/the output axes.
+ * casting:    What kind of casts should be permitted.
+ * out:        Either NULL, or an array into which the output should be placed.
+ *
+ * By default, the labels get placed in alphabetical order
+ * at the end of the output. So, if c = einsum("i,j", a, b)
+ * then c[i,j] == a[i]*b[j], but if c = einsum("j,i", a, b)
+ * then c[i,j] = a[j]*b[i].
+ *
+ * Alternatively, you can control the output order or prevent
+ * an axis from being summed/force an axis to be summed by providing
+ * indices for the output. This allows us to turn 'trace' into
+ * 'diag', for example.
+ *  - diag(a)         -> einsum("ii->i", a)
+ *  - sum(a, axis=0)  -> einsum("i...->...", a)
+ *
+ * Subscripts at the beginning and end may be specified by
+ * putting an ellipsis "..." in the middle.  For example,
+ * the function einsum("i...i", a) takes the diagonal of
+ * the first and last dimensions of the operand, and
+ * einsum("ij...,jk...->ik...") takes the matrix product using
+ * the first two indices of each operand instead of the last two.
+ *
+ * When there is only one operand, no axes being summed, and
+ * no output parameter, this function returns a view
+ * into the operand instead of making a copy.
+ *
+ * Returns a new reference to the result array (to 'out' itself when it
+ * was provided), or NULL on failure.
+ */
+NPY_NO_EXPORT PyArrayObject *
+PyArray_EinsteinSum(char *subscripts, npy_intp nop,
+                    PyArrayObject **op_in,
+                    PyArray_Descr *dtype,
+                    NPY_ORDER order, NPY_CASTING casting,
+                    PyArrayObject *out)
+{
+    int iop, label, min_label = 127, max_label = 0;
+    char label_counts[128];
+    char op_labels[NPY_MAXARGS][NPY_MAXDIMS];
+    char output_labels[NPY_MAXDIMS], *iter_labels;
+    int idim, ndim_output, ndim_broadcast, ndim_iter;
+
+    PyArrayObject *op[NPY_MAXARGS], *ret = NULL;
+    PyArray_Descr *op_dtypes_array[NPY_MAXARGS], **op_dtypes;
+
+    int op_axes_arrays[NPY_MAXARGS][NPY_MAXDIMS];
+    int *op_axes[NPY_MAXARGS];
+    npy_uint32 iter_flags, op_flags[NPY_MAXARGS];
+
+    NpyIter *iter = NULL;
+    sum_of_products_fn sop;
+    npy_intp fixed_strides[NPY_MAXARGS];
+
+    /* nop+1 (+1 is for the output) must fit in NPY_MAXARGS */
+    if (nop >= NPY_MAXARGS) {
+        PyErr_SetString(PyExc_ValueError,
+                    "too many operands provided to einstein sum function");
+        return NULL;
+    }
+    else if (nop < 1) {
+        PyErr_SetString(PyExc_ValueError,
+                    "not enough operands provided to einstein sum function");
+        return NULL;
+    }
+
+    /* Parse the subscripts string into label_counts and op_labels */
+    memset(label_counts, 0, sizeof(label_counts));
+    for (iop = 0; iop < nop; ++iop) {
+        int length = (int)strcspn(subscripts, ",-");
+
+        if (iop == nop-1 && subscripts[length] == ',') {
+            PyErr_SetString(PyExc_ValueError,
+                        "more operands provided to einstein sum function "
+                        "than specified in the subscripts string");
+            return NULL;
+        }
+        else if(iop < nop-1 && subscripts[length] != ',') {
+            PyErr_SetString(PyExc_ValueError,
+                        "fewer operands provided to einstein sum function "
+                        "than specified in the subscripts string");
+            return NULL;
+        }
+
+        if (parse_operand_subscripts(subscripts, length,
+                        PyArray_NDIM(op_in[iop]),
+                        iop, op_labels[iop], label_counts,
+                        &min_label, &max_label) < 0) {
+            return NULL;
+        }
+
+        /* Move subscripts to the start of the labels for the next op */
+        subscripts += length;
+        if (iop < nop-1) {
+            subscripts++;
+        }
+    }
+
+    /*
+     * Find the number of broadcast dimensions, which is the maximum
+     * number of labels == 0 in an op_labels array.
+     */
+    ndim_broadcast = 0;
+    for (iop = 0; iop < nop; ++iop) {
+        npy_intp count_zeros = 0;
+        int ndim;
+        char *labels = op_labels[iop];
+
+        ndim = PyArray_NDIM(op_in[iop]);
+        for (idim = 0; idim < ndim; ++idim) {
+            if (labels[idim] == 0) {
+                ++count_zeros;
+            }
+        }
+
+        if (count_zeros > ndim_broadcast) {
+            ndim_broadcast = count_zeros;
+        }
+    }
+
+    /*
+     * If there is no output signature, fill output_labels and ndim_output
+     * using each label that appeared once, in alphabetical order.
+     */
+    if (subscripts[0] == '\0') {
+        /* If no output was specified, always broadcast left, as usual. */
+        for (ndim_output = 0; ndim_output < ndim_broadcast; ++ndim_output) {
+            output_labels[ndim_output] = 0;
+        }
+        for (label = min_label; label <= max_label; ++label) {
+            if (label_counts[label] == 1) {
+                if (ndim_output < NPY_MAXDIMS) {
+                    output_labels[ndim_output++] = label;
+                }
+                else {
+                    PyErr_SetString(PyExc_ValueError,
+                                "einstein sum subscript string has too many "
+                                "distinct labels");
+                    return NULL;
+                }
+            }
+        }
+    }
+    else {
+        if (subscripts[0] != '-' || subscripts[1] != '>') {
+            PyErr_SetString(PyExc_ValueError,
+                        "einstein sum subscript string does not "
+                        "contain proper '->' output specified");
+            return NULL;
+        }
+        subscripts += 2;
+
+        /* Parse the output subscript string. */
+        ndim_output = parse_output_subscripts(subscripts, strlen(subscripts),
+                                        ndim_broadcast, label_counts,
+                                        output_labels);
+        if (ndim_output < 0) {
+            return NULL;
+        }
+    }
+
+    if (out != NULL && PyArray_NDIM(out) != ndim_output) {
+        PyErr_Format(PyExc_ValueError,
+                "out parameter does not have the correct number of "
+                "dimensions, has %d but should have %d",
+                (int)PyArray_NDIM(out), (int)ndim_output);
+        return NULL;
+    }
+
+    /*
+     * If there's just one operand and no output parameter,
+     * first try remapping the axes to the output to return
+     * a view instead of a copy.
+     */
+    if (nop == 1 && out == NULL) {
+        ret = NULL;
+
+        if (get_single_op_view(op_in[0], op_labels[0], ndim_output,
+                               output_labels, &ret) < 0) {
+            return NULL;
+        }
+
+        if (ret != NULL) {
+            return ret;
+        }
+    }
+
+    /* Set all the op references to NULL */
+    for (iop = 0; iop < nop; ++iop) {
+        op[iop] = NULL;
+    }
+
+    /* Process all the input ops, combining dimensions into diagonals */
+    for (iop = 0; iop < nop; ++iop) {
+        char *labels = op_labels[iop];
+
+        op[iop] = get_combined_dims_view(op_in[iop], iop, labels);
+        if (op[iop] == NULL) {
+            goto fail;
+        }
+    }
+
+    /* Set the output op */
+    op[nop] = out;
+
+    /*
+     * Set up the labels for the iterator (output + combined labels).
+     * Can just share the output_labels memory, because iter_labels
+     * is output_labels with some more labels appended.
+     */
+    iter_labels = output_labels;
+    ndim_iter = ndim_output;
+    for (label = min_label; label <= max_label; ++label) {
+        if (label_counts[label] > 0 &&
+                memchr(output_labels, label, ndim_output) == NULL) {
+            if (ndim_iter >= NPY_MAXDIMS) {
+                PyErr_SetString(PyExc_ValueError,
+                            "too many subscripts in einsum");
+                goto fail;
+            }
+            iter_labels[ndim_iter++] = label;
+        }
+    }
+
+    /* Set up the op_axes for the iterator */
+    for (iop = 0; iop < nop; ++iop) {
+        op_axes[iop] = op_axes_arrays[iop];
+
+        if (prepare_op_axes(PyArray_NDIM(op[iop]), iop, op_labels[iop],
+                    op_axes[iop], ndim_iter, iter_labels) < 0) {
+            goto fail;
+        }
+    }
+
+    /* Set up the op_dtypes if dtype was provided */
+    if (dtype == NULL) {
+        op_dtypes = NULL;
+    }
+    else {
+        op_dtypes = op_dtypes_array;
+        for (iop = 0; iop <= nop; ++iop) {
+            op_dtypes[iop] = dtype;
+        }
+    }
+
+    /* Set the op_axes for the output */
+    op_axes[nop] = op_axes_arrays[nop];
+    for (idim = 0; idim < ndim_output; ++idim) {
+        op_axes[nop][idim] = idim;
+    }
+    for (idim = ndim_output; idim < ndim_iter; ++idim) {
+        op_axes[nop][idim] = NPY_ITER_REDUCTION_AXIS(-1);
+    }
+
+    /* Set the iterator per-op flags */
+    for (iop = 0; iop < nop; ++iop) {
+        op_flags[iop] = NPY_ITER_READONLY|
+                        NPY_ITER_NBO|
+                        NPY_ITER_ALIGNED;
+    }
+    op_flags[nop] = NPY_ITER_READWRITE|
+                    NPY_ITER_NBO|
+                    NPY_ITER_ALIGNED|
+                    NPY_ITER_ALLOCATE;
+    iter_flags = NPY_ITER_EXTERNAL_LOOP|
+            NPY_ITER_BUFFERED|
+            NPY_ITER_DELAY_BUFALLOC|
+            NPY_ITER_GROWINNER|
+            NPY_ITER_REFS_OK|
+            NPY_ITER_ZEROSIZE_OK;
+    if (out != NULL) {
+        iter_flags |= NPY_ITER_COPY_IF_OVERLAP;
+    }
+    if (dtype == NULL) {
+        iter_flags |= NPY_ITER_COMMON_DTYPE;
+    }
+
+    /* Allocate the iterator */
+    iter = NpyIter_AdvancedNew(nop+1, op, iter_flags, order, casting, op_flags,
+                               op_dtypes, ndim_iter, op_axes, NULL, 0);
+
+    if (iter == NULL) {
+        goto fail;
+    }
+
+    /* Initialize the output to all zeros or None */
+    ret = NpyIter_GetOperandArray(iter)[nop];
+    if (PyArray_AssignZero(ret, NULL) < 0) {
+        goto fail;
+    }
+
+    /***************************/
+    /*
+     * Acceleration for some specific loop structures. Note
+     * that with axis coalescing, inputs with more dimensions can
+     * be reduced to fit into these patterns.
+     */
+    if (!NpyIter_RequiresBuffering(iter)) {
+        int ndim = NpyIter_GetNDim(iter);
+        switch (nop) {
+            case 1:
+                if (ndim == 2) {
+                    if (unbuffered_loop_nop1_ndim2(iter) < 0) {
+                        goto fail;
+                    }
+                    goto finish;
+                }
+                else if (ndim == 3) {
+                    if (unbuffered_loop_nop1_ndim3(iter) < 0) {
+                        goto fail;
+                    }
+                    goto finish;
+                }
+                break;
+            case 2:
+                if (ndim == 2) {
+                    if (unbuffered_loop_nop2_ndim2(iter) < 0) {
+                        goto fail;
+                    }
+                    goto finish;
+                }
+                else if (ndim == 3) {
+                    if (unbuffered_loop_nop2_ndim3(iter) < 0) {
+                        goto fail;
+                    }
+                    goto finish;
+                }
+                break;
+        }
+    }
+    /***************************/
+
+    if (NpyIter_Reset(iter, NULL) != NPY_SUCCEED) {
+        goto fail;
+    }
+
+    /*
+     * Get an inner loop function, specializing it based on
+     * the strides that are fixed for the whole loop.
+     */
+    NpyIter_GetInnerFixedStrideArray(iter, fixed_strides);
+    sop = get_sum_of_products_function(nop,
+                        NpyIter_GetDescrArray(iter)[0]->type_num,
+                        NpyIter_GetDescrArray(iter)[0]->elsize,
+                        fixed_strides);
+
+#if NPY_EINSUM_DBG_TRACING
+    NpyIter_DebugPrint(iter);
+#endif
+
+    /* Finally, the main loop */
+    if (sop == NULL) {
+        PyErr_SetString(PyExc_TypeError,
+                    "invalid data type for einsum");
+        goto fail;
+    }
+    else if (NpyIter_GetIterSize(iter) != 0) {
+        NpyIter_IterNextFunc *iternext;
+        char **dataptr;
+        npy_intp *stride;
+        npy_intp *countptr;
+        int needs_api;
+        NPY_BEGIN_THREADS_DEF;
+
+        iternext = NpyIter_GetIterNext(iter, NULL);
+        if (iternext == NULL) {
+            goto fail;
+        }
+        dataptr = NpyIter_GetDataPtrArray(iter);
+        stride = NpyIter_GetInnerStrideArray(iter);
+        countptr = NpyIter_GetInnerLoopSizePtr(iter);
+        needs_api = NpyIter_IterationNeedsAPI(iter);
+
+        NPY_BEGIN_THREADS_NDITER(iter);
+        NPY_EINSUM_DBG_PRINT("Einsum loop\n");
+        do {
+            sop(nop, dataptr, stride, *countptr);
+        } while (!(needs_api && PyErr_Occurred()) && iternext(iter));
+        NPY_END_THREADS;
+
+        /* If the API was needed, it may have thrown an error */
+        if (NpyIter_IterationNeedsAPI(iter) && PyErr_Occurred()) {
+            goto fail;
+        }
+    }
+
+finish:
+    if (out != NULL) {
+        ret = out;
+    }
+    Py_INCREF(ret);
+
+    NpyIter_Deallocate(iter);
+    for (iop = 0; iop < nop; ++iop) {
+        Py_DECREF(op[iop]);
+    }
+
+    return ret;
+
+fail:
+    NpyIter_Deallocate(iter);
+    for (iop = 0; iop < nop; ++iop) {
+        Py_XDECREF(op[iop]);
+    }
+
+    return NULL;
+}
+
diff --git a/numpy/core/src/_generated/einsum_sumprod.c b/numpy/core/src/_generated/einsum_sumprod.c
new file mode 100644
index 000000000000..076cf56e3792
--- /dev/null
+++ b/numpy/core/src/_generated/einsum_sumprod.c
@@ -0,0 +1,84668 @@
+#line 1 "numpy/core/src/multiarray/einsum_sumprod.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*
+ * This file provides optimized sum of product implementations used internally
+ * by einsum.
+ *
+ * Copyright (c) 2011 by Mark Wiebe (mwwiebe@gmail.com)
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+
+#include <numpy/npy_common.h>
+#include <numpy/ndarraytypes.h>  /* for NPY_NTYPES */
+#include <numpy/halffloat.h>
+
+#include "einsum_sumprod.h"
+#include "einsum_debug.h"
+#include "simd/simd.h"
+#include "common.h"
+
+// Nonzero when x may take the aligned SIMD load/store path; always 0 on ARM/NEON, which has no aligned-access instructions
+#ifdef NPY_HAVE_NEON
+    #define EINSUM_IS_ALIGNED(x) 0
+#else
+    #define EINSUM_IS_ALIGNED(x) npy_is_aligned(x, NPY_SIMD_WIDTH)
+#endif // NPY_HAVE_NEON
+
+/**********************************************/
+
+#line 74
+/* Returns the sum of the `count` npy_byte values starting at `data`, accumulated in an npy_byte. */
+#if !0
+static NPY_GCC_OPT_3 npy_byte byte_sum_of_arr(npy_byte *data, npy_intp count)
+{
+    npy_byte accum = 0;
+#if 0 // NPYV check for npy_byte: the condition is a literal 0, so this SIMD branch is compiled out
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data);
+    const int vstep = npyv_nlanes_s8;
+    npyv_s8 v_accum = npyv_zero_s8();
+    const npy_intp vstepx4 = vstep * 4;
+
+    #line 91
+    if(is_aligned) {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_s8 a0 = npyv_loada_s8(data + vstep * 0);
+            
+#line 96
+            npyv_s8 a1 = npyv_loada_s8(data + vstep * 1);
+            
+#line 96
+            npyv_s8 a2 = npyv_loada_s8(data + vstep * 2);
+            
+#line 96
+            npyv_s8 a3 = npyv_loada_s8(data + vstep * 3);
+            
+            npyv_s8 a01   = npyv_add_s8(a0, a1);
+            npyv_s8 a23   = npyv_add_s8(a2, a3);
+            npyv_s8 a0123 = npyv_add_s8(a01, a23);
+                     v_accum = npyv_add_s8(a0123, v_accum);
+        }
+    }
+    
+#line 91
+    else {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_s8 a0 = npyv_load_s8(data + vstep * 0);
+            
+#line 96
+            npyv_s8 a1 = npyv_load_s8(data + vstep * 1);
+            
+#line 96
+            npyv_s8 a2 = npyv_load_s8(data + vstep * 2);
+            
+#line 96
+            npyv_s8 a3 = npyv_load_s8(data + vstep * 3);
+            
+            npyv_s8 a01   = npyv_add_s8(a0, a1);
+            npyv_s8 a23   = npyv_add_s8(a2, a3);
+            npyv_s8 a0123 = npyv_add_s8(a01, a23);
+                     v_accum = npyv_add_s8(a0123, v_accum);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep) {
+        npyv_s8 a = npyv_load_tillz_s8(data, count);
+        v_accum = npyv_add_s8(a, v_accum);
+    }
+    accum = npyv_sum_s8(v_accum);
+    npyv_cleanup();
+#else /* scalar path: the branch that is actually compiled for npy_byte */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data += 4) { /* 4 elements per pass; '>' leaves at most 4 for the tail loop */
+        const npy_byte a01 = (*data) + (data[1]);
+        const npy_byte a23 = (data[2]) + (data[3]);
+        accum +=  a01 + a23;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data++) { /* tail: remaining elements one at a time */
+        accum += (*data);
+    }
+#endif // NPYV check for npy_byte
+    return accum;
+}
+#endif
+/* Strided loop, one input operand: `count` times, add the npy_byte at dataptr[0] into the one at dataptr[1], stepping by strides[0]/strides[1]. */
+#line 131
+static void
+byte_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data_out = dataptr[1]; /* the output pointer follows the single input operand */
+    npy_intp stride_out = strides[1];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_one (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 1 == 1 /* the arm compiled in this instantiation: out += in0 */
+        *(npy_byte *)data_out = ((*(npy_byte *)data0) +
+                                         (*(npy_byte *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1 == 2
+        *(npy_byte *)data_out = ((*(npy_byte *)data0) *
+                                         (*(npy_byte *)data1) +
+                                         (*(npy_byte *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1 == 3
+        *(npy_byte *)data_out = ((*(npy_byte *)data0) *
+                                         (*(npy_byte *)data1) *
+                                         (*(npy_byte *)data2) +
+                                         (*(npy_byte *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_byte temp = (*(npy_byte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_byte *)dataptr[i]);
+        }
+        *(npy_byte *)dataptr[nop] = (temp +
+                                           (*(npy_byte *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        ((npy_byte *)data_out)[0] = ((npy_byte *)data0)[0] +
+                                         ((npy_byte *)data_out)[0];
+        ((npy_byte *)data_out)[1] = ((npy_byte *)data0)[1] +
+                                         ((npy_byte *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_byte re, im, tmp;
+        int i;
+        re = ((npy_byte *)dataptr[0])[0];
+        im = ((npy_byte *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_byte *)dataptr[i])[0] -
+                  im * ((npy_byte *)dataptr[i])[1];
+            im = re * ((npy_byte *)dataptr[i])[1] +
+                 im * ((npy_byte *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_byte *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_byte *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1 == 1
+/* Contiguous loop, one input operand: data_out[i] += data0[i] for i in [0, count), with data0 = dataptr[0] and data_out = dataptr[1]. */
+static void
+byte_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data_out = (npy_byte *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* Placed before the main loop to make small counts faster; the loop jumps back here to finish its remainder */
+finish_after_unrolled_loop:
+    switch (count) { /* counts 0..7: enter at case `count`, fall through to case 0 and return; other counts skip the switch */
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_byte *)data_out + 2*6)[0] =
+                                    ((npy_byte *)data0 + 2*6)[0] +
+                                    ((npy_byte *)data_out + 2*6)[0];
+            ((npy_byte *)data_out + 2*6)[1] =
+                                    ((npy_byte *)data0 + 2*6)[1] +
+                                    ((npy_byte *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_byte *)data_out + 2*5)[0] =
+                                    ((npy_byte *)data0 + 2*5)[0] +
+                                    ((npy_byte *)data_out + 2*5)[0];
+            ((npy_byte *)data_out + 2*5)[1] =
+                                    ((npy_byte *)data0 + 2*5)[1] +
+                                    ((npy_byte *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_byte *)data_out + 2*4)[0] =
+                                    ((npy_byte *)data0 + 2*4)[0] +
+                                    ((npy_byte *)data_out + 2*4)[0];
+            ((npy_byte *)data_out + 2*4)[1] =
+                                    ((npy_byte *)data0 + 2*4)[1] +
+                                    ((npy_byte *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_byte *)data_out + 2*3)[0] =
+                                    ((npy_byte *)data0 + 2*3)[0] +
+                                    ((npy_byte *)data_out + 2*3)[0];
+            ((npy_byte *)data_out + 2*3)[1] =
+                                    ((npy_byte *)data0 + 2*3)[1] +
+                                    ((npy_byte *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_byte *)data_out + 2*2)[0] =
+                                    ((npy_byte *)data0 + 2*2)[0] +
+                                    ((npy_byte *)data_out + 2*2)[0];
+            ((npy_byte *)data_out + 2*2)[1] =
+                                    ((npy_byte *)data0 + 2*2)[1] +
+                                    ((npy_byte *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_byte *)data_out + 2*1)[0] =
+                                    ((npy_byte *)data0 + 2*1)[0] +
+                                    ((npy_byte *)data_out + 2*1)[0];
+            ((npy_byte *)data_out + 2*1)[1] =
+                                    ((npy_byte *)data0 + 2*1)[1] +
+                                    ((npy_byte *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_byte *)data_out + 2*0)[0] =
+                                    ((npy_byte *)data0 + 2*0)[0] +
+                                    ((npy_byte *)data_out + 2*0)[0];
+            ((npy_byte *)data_out + 2*0)[1] =
+                                    ((npy_byte *)data0 + 2*0)[1] +
+                                    ((npy_byte *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*0)[0] =
+                                ((npy_byte *)data0 + 2*0)[0] +
+                                ((npy_byte *)data_out + 2*0)[0];
+        ((npy_byte *)data_out + 2*0)[1] =
+                                ((npy_byte *)data0 + 2*0)[1] +
+                                ((npy_byte *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*1)[0] =
+                                ((npy_byte *)data0 + 2*1)[0] +
+                                ((npy_byte *)data_out + 2*1)[0];
+        ((npy_byte *)data_out + 2*1)[1] =
+                                ((npy_byte *)data0 + 2*1)[1] +
+                                ((npy_byte *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*2)[0] =
+                                ((npy_byte *)data0 + 2*2)[0] +
+                                ((npy_byte *)data_out + 2*2)[0];
+        ((npy_byte *)data_out + 2*2)[1] =
+                                ((npy_byte *)data0 + 2*2)[1] +
+                                ((npy_byte *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*3)[0] =
+                                ((npy_byte *)data0 + 2*3)[0] +
+                                ((npy_byte *)data_out + 2*3)[0];
+        ((npy_byte *)data_out + 2*3)[1] =
+                                ((npy_byte *)data0 + 2*3)[1] +
+                                ((npy_byte *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*4)[0] =
+                                ((npy_byte *)data0 + 2*4)[0] +
+                                ((npy_byte *)data_out + 2*4)[0];
+        ((npy_byte *)data_out + 2*4)[1] =
+                                ((npy_byte *)data0 + 2*4)[1] +
+                                ((npy_byte *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*5)[0] =
+                                ((npy_byte *)data0 + 2*5)[0] +
+                                ((npy_byte *)data_out + 2*5)[0];
+        ((npy_byte *)data_out + 2*5)[1] =
+                                ((npy_byte *)data0 + 2*5)[1] +
+                                ((npy_byte *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*6)[0] =
+                                ((npy_byte *)data0 + 2*6)[0] +
+                                ((npy_byte *)data_out + 2*6)[0];
+        ((npy_byte *)data_out + 2*6)[1] =
+                                ((npy_byte *)data0 + 2*6)[1] +
+                                ((npy_byte *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*7)[0] =
+                                ((npy_byte *)data0 + 2*7)[0] +
+                                ((npy_byte *)data_out + 2*7)[0];
+        ((npy_byte *)data_out + 2*7)[1] =
+                                ((npy_byte *)data0 + 2*7)[1] +
+                                ((npy_byte *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: count is below 8 here, so the switch above handles the rest and returns */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1 == 2 && !0
+
+// Multiply-add helper: data_out[i] = scalar * data[i] + data_out[i] for each i in [0, count)
+static NPY_GCC_OPT_3 void /* NOTE: in the '#elif 1 == 2' arm; the '#if 1 == 1' arm above it is taken, so this is not compiled */
+byte_sum_of_products_muladd(npy_byte *data, npy_byte *data_out, npy_byte scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_byte: the condition is a literal 0, so this SIMD branch is compiled out
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s8;
+    const npyv_s8 v_scalar = npyv_setall_s8(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s8 b0 = npyv_loada_s8(data + vstep * 0);
+            npyv_s8 c0 = npyv_loada_s8(data_out + vstep * 0);
+            
+#line 312
+            npyv_s8 b1 = npyv_loada_s8(data + vstep * 1);
+            npyv_s8 c1 = npyv_loada_s8(data_out + vstep * 1);
+            
+#line 312
+            npyv_s8 b2 = npyv_loada_s8(data + vstep * 2);
+            npyv_s8 c2 = npyv_loada_s8(data_out + vstep * 2);
+            
+#line 312
+            npyv_s8 b3 = npyv_loada_s8(data + vstep * 3);
+            npyv_s8 c3 = npyv_loada_s8(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s8 abc0 = npyv_muladd_s8(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s8 abc1 = npyv_muladd_s8(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s8 abc2 = npyv_muladd_s8(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s8 abc3 = npyv_muladd_s8(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_s8(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_s8(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_s8(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_s8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s8 b0 = npyv_load_s8(data + vstep * 0);
+            npyv_s8 c0 = npyv_load_s8(data_out + vstep * 0);
+            
+#line 312
+            npyv_s8 b1 = npyv_load_s8(data + vstep * 1);
+            npyv_s8 c1 = npyv_load_s8(data_out + vstep * 1);
+            
+#line 312
+            npyv_s8 b2 = npyv_load_s8(data + vstep * 2);
+            npyv_s8 c2 = npyv_load_s8(data_out + vstep * 2);
+            
+#line 312
+            npyv_s8 b3 = npyv_load_s8(data + vstep * 3);
+            npyv_s8 c3 = npyv_load_s8(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s8 abc0 = npyv_muladd_s8(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s8 abc1 = npyv_muladd_s8(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s8 abc2 = npyv_muladd_s8(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s8 abc3 = npyv_muladd_s8(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_s8(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_s8(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_s8(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_s8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_s8 a = npyv_load_tillz_s8(data, count);
+        npyv_s8 b = npyv_load_tillz_s8(data_out, count);
+        npyv_store_till_s8(data_out, count, npyv_muladd_s8(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* scalar path */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { /* 4 elements per pass: load, multiply-add, store */
+        #line 340
+        const npy_byte b0 = (data[0]);
+        const npy_byte c0 = (data_out[0]);
+        
+#line 340
+        const npy_byte b1 = (data[1]);
+        const npy_byte c1 = (data_out[1]);
+        
+#line 340
+        const npy_byte b2 = (data[2]);
+        const npy_byte c2 = (data_out[2]);
+        
+#line 340
+        const npy_byte b3 = (data[3]);
+        const npy_byte c3 = (data_out[3]);
+        
+        #line 346
+        const npy_byte abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_byte abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_byte abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_byte abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* tail: remaining elements one at a time */
+        const npy_byte b = (*data);
+        const npy_byte c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_byte
+}
+/* Contiguous loop, two input operands: data_out[i] += data0[i] * data1[i] for i in [0, count), operands taken from dataptr[0..2]. */
+static void /* NOTE: in the '#elif 1 == 2' arm; the '#if 1 == 1' arm above it is taken, so this is not compiled */
+byte_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data1 = (npy_byte *)dataptr[1];
+    npy_byte *data_out = (npy_byte *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_byte: the '#if 0' below compiles the SIMD branch out
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s8;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s8 a0 = npyv_loada_s8(data0 + vstep * 0);
+            npyv_s8 b0 = npyv_loada_s8(data1 + vstep * 0);
+            npyv_s8 c0 = npyv_loada_s8(data_out + vstep * 0);
+            
+#line 390
+            npyv_s8 a1 = npyv_loada_s8(data0 + vstep * 1);
+            npyv_s8 b1 = npyv_loada_s8(data1 + vstep * 1);
+            npyv_s8 c1 = npyv_loada_s8(data_out + vstep * 1);
+            
+#line 390
+            npyv_s8 a2 = npyv_loada_s8(data0 + vstep * 2);
+            npyv_s8 b2 = npyv_loada_s8(data1 + vstep * 2);
+            npyv_s8 c2 = npyv_loada_s8(data_out + vstep * 2);
+            
+#line 390
+            npyv_s8 a3 = npyv_loada_s8(data0 + vstep * 3);
+            npyv_s8 b3 = npyv_loada_s8(data1 + vstep * 3);
+            npyv_s8 c3 = npyv_loada_s8(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s8 abc0 = npyv_muladd_s8(a0, b0, c0);
+            
+#line 397
+            npyv_s8 abc1 = npyv_muladd_s8(a1, b1, c1);
+            
+#line 397
+            npyv_s8 abc2 = npyv_muladd_s8(a2, b2, c2);
+            
+#line 397
+            npyv_s8 abc3 = npyv_muladd_s8(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_s8(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_s8(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_s8(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_s8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s8 a0 = npyv_load_s8(data0 + vstep * 0);
+            npyv_s8 b0 = npyv_load_s8(data1 + vstep * 0);
+            npyv_s8 c0 = npyv_load_s8(data_out + vstep * 0);
+            
+#line 390
+            npyv_s8 a1 = npyv_load_s8(data0 + vstep * 1);
+            npyv_s8 b1 = npyv_load_s8(data1 + vstep * 1);
+            npyv_s8 c1 = npyv_load_s8(data_out + vstep * 1);
+            
+#line 390
+            npyv_s8 a2 = npyv_load_s8(data0 + vstep * 2);
+            npyv_s8 b2 = npyv_load_s8(data1 + vstep * 2);
+            npyv_s8 c2 = npyv_load_s8(data_out + vstep * 2);
+            
+#line 390
+            npyv_s8 a3 = npyv_load_s8(data0 + vstep * 3);
+            npyv_s8 b3 = npyv_load_s8(data1 + vstep * 3);
+            npyv_s8 c3 = npyv_load_s8(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s8 abc0 = npyv_muladd_s8(a0, b0, c0);
+            
+#line 397
+            npyv_s8 abc1 = npyv_muladd_s8(a1, b1, c1);
+            
+#line 397
+            npyv_s8 abc2 = npyv_muladd_s8(a2, b2, c2);
+            
+#line 397
+            npyv_s8 abc3 = npyv_muladd_s8(a3, b3, c3);
+            
+            #line 402
+            npyv_store_s8(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_s8(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_s8(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_s8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_s8 a = npyv_load_tillz_s8(data0, count);
+        npyv_s8 b = npyv_load_tillz_s8(data1, count);
+        npyv_s8 c = npyv_load_tillz_s8(data_out, count);
+        npyv_store_till_s8(data_out, count, npyv_muladd_s8(a, b, c));
+    }
+    npyv_cleanup();
+#else /* scalar path */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { /* 4 elements per pass */
+        #line 420
+        const npy_byte a0 = (data0[0]);
+        const npy_byte b0 = (data1[0]);
+        const npy_byte c0 = (data_out[0]);
+        
+#line 420
+        const npy_byte a1 = (data0[1]);
+        const npy_byte b1 = (data1[1]);
+        const npy_byte c1 = (data_out[1]);
+        
+#line 420
+        const npy_byte a2 = (data0[2]);
+        const npy_byte b2 = (data1[2]);
+        const npy_byte c2 = (data_out[2]);
+        
+#line 420
+        const npy_byte a3 = (data0[3]);
+        const npy_byte b3 = (data1[3]);
+        const npy_byte c3 = (data_out[3]);
+        
+        #line 427
+        const npy_byte abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_byte abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_byte abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_byte abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* tail: remaining elements one at a time */
+        const npy_byte a = (*data0);
+        const npy_byte b = (*data1);
+        const npy_byte c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_byte
+
+}
+
+/* Some extra specializations for the two operand case. This one: operand 0 is read once as a single value, so data_out[i] += value0 * data1[i]. */
+static void /* NOTE: in the '#elif 1 == 2' arm; the '#if 1 == 1' arm above it is taken, so this is not compiled */
+byte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte value0 = (*(npy_byte *)dataptr[0]);
+    npy_byte *data1 = (npy_byte *)dataptr[1];
+    npy_byte *data_out = (npy_byte *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    byte_sum_of_products_muladd(data1, data_out, value0, count); /* all the work is delegated to the multiply-add helper */
+    
+}
+/* Two-operand specialization: operand 1 is read once as a single value, so data_out[i] += value1 * data0[i] for i in [0, count). */
+static void /* NOTE: in the '#elif 1 == 2' arm; the '#if 1 == 1' arm above it is taken, so this is not compiled */
+byte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte value1 = (*(npy_byte *)dataptr[1]);
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data_out = (npy_byte *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    byte_sum_of_products_muladd(data0, data_out, value1, count); /* all the work is delegated to the multiply-add helper */
+}
+/* Two-operand reduction: adds the sum of data0[i] * data1[i] over i in [0, count) into the single output value at dataptr[2]. */
+static NPY_GCC_OPT_3 void /* NOTE: in the '#elif 1 == 2' arm; the '#if 1 == 1' arm above it is taken, so this is not compiled */
+byte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data1 = (npy_byte *)dataptr[1];
+    npy_byte accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_byte: the condition is a literal 0, so this SIMD branch is compiled out
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_s8;
+    npyv_s8 v_accum = npyv_zero_s8();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s8 a0 = npyv_loada_s8(data0 + vstep * 0);
+            npyv_s8 b0 = npyv_loada_s8(data1 + vstep * 0);
+            
+#line 501
+            npyv_s8 a1 = npyv_loada_s8(data0 + vstep * 1);
+            npyv_s8 b1 = npyv_loada_s8(data1 + vstep * 1);
+            
+#line 501
+            npyv_s8 a2 = npyv_loada_s8(data0 + vstep * 2);
+            npyv_s8 b2 = npyv_loada_s8(data1 + vstep * 2);
+            
+#line 501
+            npyv_s8 a3 = npyv_loada_s8(data0 + vstep * 3);
+            npyv_s8 b3 = npyv_loada_s8(data1 + vstep * 3);
+            
+            npyv_s8 ab3 = npyv_muladd_s8(a3, b3, v_accum);
+            npyv_s8 ab2 = npyv_muladd_s8(a2, b2, ab3);
+            npyv_s8 ab1 = npyv_muladd_s8(a1, b1, ab2);
+                   v_accum = npyv_muladd_s8(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s8 a0 = npyv_load_s8(data0 + vstep * 0);
+            npyv_s8 b0 = npyv_load_s8(data1 + vstep * 0);
+            
+#line 501
+            npyv_s8 a1 = npyv_load_s8(data0 + vstep * 1);
+            npyv_s8 b1 = npyv_load_s8(data1 + vstep * 1);
+            
+#line 501
+            npyv_s8 a2 = npyv_load_s8(data0 + vstep * 2);
+            npyv_s8 b2 = npyv_load_s8(data1 + vstep * 2);
+            
+#line 501
+            npyv_s8 a3 = npyv_load_s8(data0 + vstep * 3);
+            npyv_s8 b3 = npyv_load_s8(data1 + vstep * 3);
+            
+            npyv_s8 ab3 = npyv_muladd_s8(a3, b3, v_accum);
+            npyv_s8 ab2 = npyv_muladd_s8(a2, b2, ab3);
+            npyv_s8 ab1 = npyv_muladd_s8(a1, b1, ab2);
+                   v_accum = npyv_muladd_s8(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_s8 a = npyv_load_tillz_s8(data0, count);
+        npyv_s8 b = npyv_load_tillz_s8(data1, count);
+        v_accum = npyv_muladd_s8(a, b, v_accum);
+    }
+    accum = npyv_sum_s8(v_accum);
+    npyv_cleanup();
+#else /* scalar path */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) { /* 4 products per pass */
+        #line 524
+        const npy_byte ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_byte ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_byte ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_byte ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* tail: remaining products one at a time */
+        const npy_byte a = (*data0);
+        const npy_byte b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_byte
+    *(npy_byte *)dataptr[2] = ((*(npy_byte *)dataptr[2]) + accum); /* fold the local total into the output value */
+}
+/* Two-operand reduction: adds value0 * (sum of the `count` values at dataptr[1]) into the single output value at dataptr[2]. */
+static NPY_GCC_OPT_3 void /* NOTE: in the '#elif 1 == 2' arm; the '#if 1 == 1' arm above it is taken, so this is not compiled */
+byte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data1 = (npy_byte *)dataptr[1];
+    npy_byte value0 = (*(npy_byte *)dataptr[0]); /* operand 0 is read once */
+    npy_byte accum = byte_sum_of_arr(data1, count); /* sum first, then multiply once below */
+    *(npy_byte *)dataptr[2] = ((*(npy_byte *)dataptr[2]) + value0 * accum);
+}
+
+static NPY_GCC_OPT_3 void /* NOTE: in the '#elif 1 == 2' arm; the '#if 1 == 1' arm above it is taken, so this is not compiled */
+byte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* adds value1 * (sum of the `count` values at dataptr[0]) into the single output value at dataptr[2] */
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte value1 = (*(npy_byte *)dataptr[1]); /* operand 1 is read once */
+    npy_byte accum = byte_sum_of_arr(data0, count); /* sum first, then multiply once below */
+    *(npy_byte *)dataptr[2] = ((*(npy_byte *)dataptr[2]) + value1 * accum);
+}
+
+#elif 1 == 3 && !0
+
+static void
+byte_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* data_out[i] += data0[i] * data1[i] * data2[i] for `count` adjacent npy_byte elements */
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data1 = (npy_byte *)dataptr[1];
+    npy_byte *data2 = (npy_byte *)dataptr[2];
+    npy_byte *data_out = (npy_byte *)dataptr[3];
+    /* `nop` is not read here: three inputs (dataptr[0..2]) and one output (dataptr[3]) are hard-wired. */
+    /* Main loop: eight elements per pass, expanded by hand. */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;  /* step all four pointers past the eight elements just handled */
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Tail: the loop above leaves count < 8; each step below returns once count reaches 0. */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {  /* for a non-negative count this always returns: at most 7 elements remain */
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1 > 3 || @complex */
+
+static void
+byte_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Generic-nop fallback: *dataptr[nop] += product of *dataptr[0..nop-1], repeated `count` times */
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+    /* Each pass handles one element and then steps every pointer by sizeof(npy_byte). */
+    while (count--) {
+#if !0  /* real-valued path: the condition is the constant !0, so this branch is the one compiled */
+        npy_byte temp = (*(npy_byte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_byte *)dataptr[i]);
+        }
+        *(npy_byte *)dataptr[nop] = (temp +
+                                           (*(npy_byte *)dataptr[i]));  /* i == nop after the loop when nop >= 1 */
+        for (i = 0; i <= nop; ++i) {  /* inputs and output alike; this writes back into the caller's dataptr array */
+            dataptr[i] += sizeof(npy_byte);
+        }
+#else /* complex */
+#  if 1 <= 3
+#    define _SUMPROD_NOP 1
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_byte re, im, tmp;
+        int i;
+        re = ((npy_byte *)dataptr[0])[0];
+        im = ((npy_byte *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {  /* (re + i*im) *= operand i, using tmp so `re` is not clobbered early */
+            tmp = re * ((npy_byte *)dataptr[i])[0] -
+                  im * ((npy_byte *)dataptr[i])[1];
+            im = re * ((npy_byte *)dataptr[i])[1] +
+                 im * ((npy_byte *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_byte *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_byte *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_byte);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1 */
+
+#if 1 == 1
+
+static NPY_GCC_OPT_3 void
+byte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* *dataptr[1] += sum of `count` adjacent npy_byte values starting at dataptr[0]; `nop`/`strides` are not read */
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0  /* real-valued path: constant condition, always compiled */
+    npy_byte *data = (npy_byte *)dataptr[0];
+    npy_byte accum = byte_sum_of_arr(data, count);  /* the summation itself is delegated to the helper */
+    *((npy_byte *)dataptr[1]) = (accum + (*((npy_byte *)dataptr[1])));
+#else  /* complex layout (pairs of re, im values); compiled out by the constant condition above */
+    npy_byte accum_re = 0, accum_im = 0;
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {  /* strictly > 4: a final group of exactly four is left to the loop below */
+        const npy_byte re01 = data0[0] + data0[2];
+        const npy_byte re23 = data0[4] + data0[6];
+        const npy_byte im13 = data0[1] + data0[3];
+        const npy_byte im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {  /* one (re, im) pair per step */
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_byte *)dataptr[1])[0] += accum_re;
+    ((npy_byte *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1 == 1 */
+
+static void
+byte_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* *dataptr[1] += sum of `count` npy_byte values read from dataptr[0] at a byte stride of strides[0] */
+#if 0
+    npy_byte accum_re = 0, accum_im = 0;
+#else
+    npy_byte accum = 0;  /* running total, stored at npy_byte width */
+#endif
+
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data0 = dataptr[0];  /* kept as char* so the stride is applied in bytes */
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 1 == 1  /* the branch compiled for this one-operand expansion */
+        accum += (*(npy_byte *)data0);
+        data0 += stride0;
+#  elif 1 == 2
+        accum += (*(npy_byte *)data0) *
+                 (*(npy_byte *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1 == 3
+        accum += (*(npy_byte *)data0) *
+                 (*(npy_byte *)data1) *
+                 (*(npy_byte *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_byte temp = (*(npy_byte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_byte *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {  /* inputs only: the output pointer is not advanced in this function */
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        accum_re += ((npy_byte *)data0)[0];
+        accum_im += ((npy_byte *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_byte re, im, tmp;
+        int i;
+        re = ((npy_byte *)dataptr[0])[0];
+        im = ((npy_byte *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_byte *)dataptr[i])[0] -
+                  im * ((npy_byte *)dataptr[i])[1];
+            im = re * ((npy_byte *)dataptr[i])[1] +
+                 im * ((npy_byte *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0
+#  if 1 <= 3
+    ((npy_byte *)dataptr[1])[0] += accum_re;
+    ((npy_byte *)dataptr[1])[1] += accum_im;
+#  else
+    ((npy_byte *)dataptr[nop])[0] += accum_re;
+    ((npy_byte *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1 <= 3
+    *((npy_byte *)dataptr[1]) = (accum +  /* the output is read and written once, after the loop */
+                                    (*((npy_byte *)dataptr[1])));
+#  else
+    *((npy_byte *)dataptr[nop]) = (accum +
+                                    (*((npy_byte *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+byte_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* out += in0 * in1, one npy_byte per step; dataptr[0..2] each advance by their own byte stride strides[0..2] */
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];  /* char* so the strides below are applied in bytes */
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data_out = dataptr[2];  /* the output is the entry after the two inputs */
+    npy_intp stride_out = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_two (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 2 == 1
+        *(npy_byte *)data_out = ((*(npy_byte *)data0) +
+                                         (*(npy_byte *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 2 == 2  /* the branch compiled for this two-operand expansion */
+        *(npy_byte *)data_out = ((*(npy_byte *)data0) *
+                                         (*(npy_byte *)data1) +
+                                         (*(npy_byte *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 2 == 3
+        *(npy_byte *)data_out = ((*(npy_byte *)data0) *
+                                         (*(npy_byte *)data1) *
+                                         (*(npy_byte *)data2) +
+                                         (*(npy_byte *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_byte temp = (*(npy_byte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_byte *)dataptr[i]);
+        }
+        *(npy_byte *)dataptr[nop] = (temp +
+                                           (*(npy_byte *)dataptr[i]));  /* i == nop after the loop when nop >= 1 */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        ((npy_byte *)data_out)[0] = ((npy_byte *)data0)[0] +
+                                         ((npy_byte *)data_out)[0];
+        ((npy_byte *)data_out)[1] = ((npy_byte *)data0)[1] +
+                                         ((npy_byte *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_byte re, im, tmp;
+        int i;
+        re = ((npy_byte *)dataptr[0])[0];
+        im = ((npy_byte *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_byte *)dataptr[i])[0] -
+                  im * ((npy_byte *)dataptr[i])[1];
+            im = re * ((npy_byte *)dataptr[i])[1] +
+                 im * ((npy_byte *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_byte *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_byte *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 2 == 1
+
+static void
+byte_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* data_out[i] += data0[i] for `count` adjacent elements; this copy sits under `#if 2 == 1` */
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data_out = (npy_byte *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* Counts 0-7 are handled here, ahead of the main loop, so small counts return quickly */
+finish_after_unrolled_loop:
+    switch (count) {  /* no breaks: entering at case k+1 handles indices k down to 0, then returns */
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_byte *)data_out + 2*6)[0] =
+                                    ((npy_byte *)data0 + 2*6)[0] +
+                                    ((npy_byte *)data_out + 2*6)[0];
+            ((npy_byte *)data_out + 2*6)[1] =
+                                    ((npy_byte *)data0 + 2*6)[1] +
+                                    ((npy_byte *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_byte *)data_out + 2*5)[0] =
+                                    ((npy_byte *)data0 + 2*5)[0] +
+                                    ((npy_byte *)data_out + 2*5)[0];
+            ((npy_byte *)data_out + 2*5)[1] =
+                                    ((npy_byte *)data0 + 2*5)[1] +
+                                    ((npy_byte *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_byte *)data_out + 2*4)[0] =
+                                    ((npy_byte *)data0 + 2*4)[0] +
+                                    ((npy_byte *)data_out + 2*4)[0];
+            ((npy_byte *)data_out + 2*4)[1] =
+                                    ((npy_byte *)data0 + 2*4)[1] +
+                                    ((npy_byte *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_byte *)data_out + 2*3)[0] =
+                                    ((npy_byte *)data0 + 2*3)[0] +
+                                    ((npy_byte *)data_out + 2*3)[0];
+            ((npy_byte *)data_out + 2*3)[1] =
+                                    ((npy_byte *)data0 + 2*3)[1] +
+                                    ((npy_byte *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_byte *)data_out + 2*2)[0] =
+                                    ((npy_byte *)data0 + 2*2)[0] +
+                                    ((npy_byte *)data_out + 2*2)[0];
+            ((npy_byte *)data_out + 2*2)[1] =
+                                    ((npy_byte *)data0 + 2*2)[1] +
+                                    ((npy_byte *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_byte *)data_out + 2*1)[0] =
+                                    ((npy_byte *)data0 + 2*1)[0] +
+                                    ((npy_byte *)data_out + 2*1)[0];
+            ((npy_byte *)data_out + 2*1)[1] =
+                                    ((npy_byte *)data0 + 2*1)[1] +
+                                    ((npy_byte *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_byte *)data_out + 2*0)[0] =
+                                    ((npy_byte *)data0 + 2*0)[0] +
+                                    ((npy_byte *)data_out + 2*0)[0];
+            ((npy_byte *)data_out + 2*0)[1] =
+                                    ((npy_byte *)data0 + 2*0)[1] +
+                                    ((npy_byte *)data_out + 2*0)[1];
+#endif
+
+        case 0:  /* the only exit from the function */
+            return;
+    }
+
+    /* Reached only when count matched no case above; handle eight elements per pass */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*0)[0] =
+                                ((npy_byte *)data0 + 2*0)[0] +
+                                ((npy_byte *)data_out + 2*0)[0];
+        ((npy_byte *)data_out + 2*0)[1] =
+                                ((npy_byte *)data0 + 2*0)[1] +
+                                ((npy_byte *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*1)[0] =
+                                ((npy_byte *)data0 + 2*1)[0] +
+                                ((npy_byte *)data_out + 2*1)[0];
+        ((npy_byte *)data_out + 2*1)[1] =
+                                ((npy_byte *)data0 + 2*1)[1] +
+                                ((npy_byte *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*2)[0] =
+                                ((npy_byte *)data0 + 2*2)[0] +
+                                ((npy_byte *)data_out + 2*2)[0];
+        ((npy_byte *)data_out + 2*2)[1] =
+                                ((npy_byte *)data0 + 2*2)[1] +
+                                ((npy_byte *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*3)[0] =
+                                ((npy_byte *)data0 + 2*3)[0] +
+                                ((npy_byte *)data_out + 2*3)[0];
+        ((npy_byte *)data_out + 2*3)[1] =
+                                ((npy_byte *)data0 + 2*3)[1] +
+                                ((npy_byte *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*4)[0] =
+                                ((npy_byte *)data0 + 2*4)[0] +
+                                ((npy_byte *)data_out + 2*4)[0];
+        ((npy_byte *)data_out + 2*4)[1] =
+                                ((npy_byte *)data0 + 2*4)[1] +
+                                ((npy_byte *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*5)[0] =
+                                ((npy_byte *)data0 + 2*5)[0] +
+                                ((npy_byte *)data_out + 2*5)[0];
+        ((npy_byte *)data_out + 2*5)[1] =
+                                ((npy_byte *)data0 + 2*5)[1] +
+                                ((npy_byte *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*6)[0] =
+                                ((npy_byte *)data0 + 2*6)[0] +
+                                ((npy_byte *)data_out + 2*6)[0];
+        ((npy_byte *)data_out + 2*6)[1] =
+                                ((npy_byte *)data0 + 2*6)[1] +
+                                ((npy_byte *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*7)[0] =
+                                ((npy_byte *)data0 + 2*7)[0] +
+                                ((npy_byte *)data_out + 2*7)[0];
+        ((npy_byte *)data_out + 2*7)[1] =
+                                ((npy_byte *)data0 + 2*7)[1] +
+                                ((npy_byte *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* count < 8 now: jump back to the switch above to handle the remainder and return */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 2 == 2 && !0
+
+// Multiply-add helper: data_out[i] = scalar * data[i] + data_out[i] for i in [0, count).
+static NPY_GCC_OPT_3 void
+byte_sum_of_products_muladd(npy_byte *data, npy_byte *data_out, npy_byte scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_byte: the constant 0 compiles this vector path out, leaving the scalar path
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s8;
+    const npyv_s8 v_scalar = npyv_setall_s8(scalar);
+    #line 306
+    if(is_aligned) {  /* same loop as the else-branch, but with the loada/storea variants */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s8 b0 = npyv_loada_s8(data + vstep * 0);
+            npyv_s8 c0 = npyv_loada_s8(data_out + vstep * 0);
+            
+#line 312
+            npyv_s8 b1 = npyv_loada_s8(data + vstep * 1);
+            npyv_s8 c1 = npyv_loada_s8(data_out + vstep * 1);
+            
+#line 312
+            npyv_s8 b2 = npyv_loada_s8(data + vstep * 2);
+            npyv_s8 c2 = npyv_loada_s8(data_out + vstep * 2);
+            
+#line 312
+            npyv_s8 b3 = npyv_loada_s8(data + vstep * 3);
+            npyv_s8 c3 = npyv_loada_s8(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s8 abc0 = npyv_muladd_s8(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s8 abc1 = npyv_muladd_s8(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s8 abc2 = npyv_muladd_s8(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s8 abc3 = npyv_muladd_s8(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_s8(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_s8(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_s8(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_s8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s8 b0 = npyv_load_s8(data + vstep * 0);
+            npyv_s8 c0 = npyv_load_s8(data_out + vstep * 0);
+            
+#line 312
+            npyv_s8 b1 = npyv_load_s8(data + vstep * 1);
+            npyv_s8 c1 = npyv_load_s8(data_out + vstep * 1);
+            
+#line 312
+            npyv_s8 b2 = npyv_load_s8(data + vstep * 2);
+            npyv_s8 c2 = npyv_load_s8(data_out + vstep * 2);
+            
+#line 312
+            npyv_s8 b3 = npyv_load_s8(data + vstep * 3);
+            npyv_s8 c3 = npyv_load_s8(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s8 abc0 = npyv_muladd_s8(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s8 abc1 = npyv_muladd_s8(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s8 abc2 = npyv_muladd_s8(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s8 abc3 = npyv_muladd_s8(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_s8(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_s8(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_s8(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_s8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {  /* remainder, one vector at a time via the till/tillz variants */
+        npyv_s8 a = npyv_load_tillz_s8(data, count);
+        npyv_s8 b = npyv_load_tillz_s8(data_out, count);
+        npyv_store_till_s8(data_out, count, npyv_muladd_s8(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {  /* four elements per pass: load all, compute all, store all */
+        #line 340
+        const npy_byte b0 = (data[0]);
+        const npy_byte c0 = (data_out[0]);
+        
+#line 340
+        const npy_byte b1 = (data[1]);
+        const npy_byte c1 = (data_out[1]);
+        
+#line 340
+        const npy_byte b2 = (data[2]);
+        const npy_byte c2 = (data_out[2]);
+        
+#line 340
+        const npy_byte b3 = (data[3]);
+        const npy_byte c3 = (data_out[3]);
+        
+        #line 346
+        const npy_byte abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_byte abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_byte abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_byte abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) {  /* remaining elements (or all of them when the block above is disabled) */
+        const npy_byte b = (*data);
+        const npy_byte c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_byte
+}
+
+static void
+byte_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* data_out[i] = data0[i] * data1[i] + data_out[i] for `count` adjacent npy_byte elements */
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data1 = (npy_byte *)dataptr[1];
+    npy_byte *data_out = (npy_byte *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_byte: the constant 0 below compiles the vector path out
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s8;
+
+    #line 384
+    if(is_aligned) {  /* same loop as the else-branch, but with the loada/storea variants */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s8 a0 = npyv_loada_s8(data0 + vstep * 0);
+            npyv_s8 b0 = npyv_loada_s8(data1 + vstep * 0);
+            npyv_s8 c0 = npyv_loada_s8(data_out + vstep * 0);
+            
+#line 390
+            npyv_s8 a1 = npyv_loada_s8(data0 + vstep * 1);
+            npyv_s8 b1 = npyv_loada_s8(data1 + vstep * 1);
+            npyv_s8 c1 = npyv_loada_s8(data_out + vstep * 1);
+            
+#line 390
+            npyv_s8 a2 = npyv_loada_s8(data0 + vstep * 2);
+            npyv_s8 b2 = npyv_loada_s8(data1 + vstep * 2);
+            npyv_s8 c2 = npyv_loada_s8(data_out + vstep * 2);
+            
+#line 390
+            npyv_s8 a3 = npyv_loada_s8(data0 + vstep * 3);
+            npyv_s8 b3 = npyv_loada_s8(data1 + vstep * 3);
+            npyv_s8 c3 = npyv_loada_s8(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s8 abc0 = npyv_muladd_s8(a0, b0, c0);
+            
+#line 397
+            npyv_s8 abc1 = npyv_muladd_s8(a1, b1, c1);
+            
+#line 397
+            npyv_s8 abc2 = npyv_muladd_s8(a2, b2, c2);
+            
+#line 397
+            npyv_s8 abc3 = npyv_muladd_s8(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_s8(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_s8(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_s8(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_s8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s8 a0 = npyv_load_s8(data0 + vstep * 0);
+            npyv_s8 b0 = npyv_load_s8(data1 + vstep * 0);
+            npyv_s8 c0 = npyv_load_s8(data_out + vstep * 0);
+            
+#line 390
+            npyv_s8 a1 = npyv_load_s8(data0 + vstep * 1);
+            npyv_s8 b1 = npyv_load_s8(data1 + vstep * 1);
+            npyv_s8 c1 = npyv_load_s8(data_out + vstep * 1);
+            
+#line 390
+            npyv_s8 a2 = npyv_load_s8(data0 + vstep * 2);
+            npyv_s8 b2 = npyv_load_s8(data1 + vstep * 2);
+            npyv_s8 c2 = npyv_load_s8(data_out + vstep * 2);
+            
+#line 390
+            npyv_s8 a3 = npyv_load_s8(data0 + vstep * 3);
+            npyv_s8 b3 = npyv_load_s8(data1 + vstep * 3);
+            npyv_s8 c3 = npyv_load_s8(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s8 abc0 = npyv_muladd_s8(a0, b0, c0);
+            
+#line 397
+            npyv_s8 abc1 = npyv_muladd_s8(a1, b1, c1);
+            
+#line 397
+            npyv_s8 abc2 = npyv_muladd_s8(a2, b2, c2);
+            
+#line 397
+            npyv_s8 abc3 = npyv_muladd_s8(a3, b3, c3);
+            
+            #line 402
+            npyv_store_s8(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_s8(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_s8(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_s8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {  /* remainder via the till/tillz variants */
+        npyv_s8 a = npyv_load_tillz_s8(data0, count);
+        npyv_s8 b = npyv_load_tillz_s8(data1, count);
+        npyv_s8 c = npyv_load_tillz_s8(data_out, count);
+        npyv_store_till_s8(data_out, count, npyv_muladd_s8(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {  /* four elements per pass: load all, compute all, store all */
+        #line 420
+        const npy_byte a0 = (data0[0]);
+        const npy_byte b0 = (data1[0]);
+        const npy_byte c0 = (data_out[0]);
+        
+#line 420
+        const npy_byte a1 = (data0[1]);
+        const npy_byte b1 = (data1[1]);
+        const npy_byte c1 = (data_out[1]);
+        
+#line 420
+        const npy_byte a2 = (data0[2]);
+        const npy_byte b2 = (data1[2]);
+        const npy_byte c2 = (data_out[2]);
+        
+#line 420
+        const npy_byte a3 = (data0[3]);
+        const npy_byte b3 = (data1[3]);
+        const npy_byte c3 = (data_out[3]);
+        
+        #line 427
+        const npy_byte abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_byte abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_byte abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_byte abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {  /* remaining elements (or all of them when the block above is disabled) */
+        const npy_byte a = (*data0);
+        const npy_byte b = (*data1);
+        const npy_byte c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_byte
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand kernel: reads a single npy_byte from dataptr[0] and hands it,
+ * as the scalar argument, to byte_sum_of_products_muladd together with
+ * dataptr[1] (input array) and dataptr[2] (output array) for `count`
+ * elements. `nop` is not referenced and `strides` is marked unused.
+ */
+static void
+byte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *const dst = (npy_byte *)dataptr[2];
+    npy_byte *const src = (npy_byte *)dataptr[1];
+    const npy_byte factor = *(npy_byte *)dataptr[0];
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                          (int)count);
+    byte_sum_of_products_muladd(src, dst, factor, count);
+}
+
+/*
+ * Two-operand kernel: reads a single npy_byte from dataptr[1] and hands it,
+ * as the scalar argument, to byte_sum_of_products_muladd together with
+ * dataptr[0] (input array) and dataptr[2] (output array) for `count`
+ * elements. `nop` is not referenced and `strides` is marked unused.
+ */
+static void
+byte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *const dst = (npy_byte *)dataptr[2];
+    npy_byte *const src = (npy_byte *)dataptr[0];
+    const npy_byte factor = *(npy_byte *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                          (int)count);
+    byte_sum_of_products_muladd(src, dst, factor, count);
+}
+
+/*
+ * Two-operand kernel: multiplies data0[i] by data1[i] for `count`
+ * consecutive elements, sums the products into a local npy_byte accumulator,
+ * and finally adds that accumulator to the single npy_byte at dataptr[2].
+ *
+ * dataptr[0] and dataptr[1] are read as npy_byte arrays; dataptr[2] is read
+ * and written exactly once, after the loops. `nop` is not referenced and
+ * `strides` is marked unused.
+ */
+static NPY_GCC_OPT_3 void
+byte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data1 = (npy_byte *)dataptr[1];
+    npy_byte accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+    /* The vector arm below is compiled out for this type (condition is 0). */
+#if 0 // NPYV check for npy_byte
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_s8;
+    npyv_s8 v_accum = npyv_zero_s8();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s8 a0 = npyv_loada_s8(data0 + vstep * 0);
+            npyv_s8 b0 = npyv_loada_s8(data1 + vstep * 0);
+            
+#line 501
+            npyv_s8 a1 = npyv_loada_s8(data0 + vstep * 1);
+            npyv_s8 b1 = npyv_loada_s8(data1 + vstep * 1);
+            
+#line 501
+            npyv_s8 a2 = npyv_loada_s8(data0 + vstep * 2);
+            npyv_s8 b2 = npyv_loada_s8(data1 + vstep * 2);
+            
+#line 501
+            npyv_s8 a3 = npyv_loada_s8(data0 + vstep * 3);
+            npyv_s8 b3 = npyv_loada_s8(data1 + vstep * 3);
+            
+            /* Each muladd result feeds the next call, ending in v_accum. */
+            npyv_s8 ab3 = npyv_muladd_s8(a3, b3, v_accum);
+            npyv_s8 ab2 = npyv_muladd_s8(a2, b2, ab3);
+            npyv_s8 ab1 = npyv_muladd_s8(a1, b1, ab2);
+                   v_accum = npyv_muladd_s8(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s8 a0 = npyv_load_s8(data0 + vstep * 0);
+            npyv_s8 b0 = npyv_load_s8(data1 + vstep * 0);
+            
+#line 501
+            npyv_s8 a1 = npyv_load_s8(data0 + vstep * 1);
+            npyv_s8 b1 = npyv_load_s8(data1 + vstep * 1);
+            
+#line 501
+            npyv_s8 a2 = npyv_load_s8(data0 + vstep * 2);
+            npyv_s8 b2 = npyv_load_s8(data1 + vstep * 2);
+            
+#line 501
+            npyv_s8 a3 = npyv_load_s8(data0 + vstep * 3);
+            npyv_s8 b3 = npyv_load_s8(data1 + vstep * 3);
+            
+            /* Each muladd result feeds the next call, ending in v_accum. */
+            npyv_s8 ab3 = npyv_muladd_s8(a3, b3, v_accum);
+            npyv_s8 ab2 = npyv_muladd_s8(a2, b2, ab3);
+            npyv_s8 ab1 = npyv_muladd_s8(a1, b1, ab2);
+                   v_accum = npyv_muladd_s8(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_s8 a = npyv_load_tillz_s8(data0, count);
+        npyv_s8 b = npyv_load_tillz_s8(data1, count);
+        v_accum = npyv_muladd_s8(a, b, v_accum);
+    }
+    accum = npyv_sum_s8(v_accum);
+    npyv_cleanup();
+#else
+    /* Scalar arm: the 4-way unrolled loop is skipped when
+     * NPY_DISABLE_OPTIMIZATION is defined. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        /* Each product is stored in an npy_byte before being summed. */
+        #line 524
+        const npy_byte ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_byte ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_byte ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_byte ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One element per iteration for whatever count is left. */
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_byte a = (*data0);
+        const npy_byte b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_byte
+    /* Fold the local total into the single output element. */
+    *(npy_byte *)dataptr[2] = ((*(npy_byte *)dataptr[2]) + accum);
+}
+
+/*
+ * Two-operand kernel: reads a single npy_byte from dataptr[0], obtains an
+ * npy_byte total for the `count` elements at dataptr[1] from
+ * byte_sum_of_arr, and adds (value * total) to the single npy_byte at
+ * dataptr[2]. `nop` is not referenced and `strides` is marked unused.
+ *
+ * Emits the same NPY_EINSUM_DBG_PRINT1 trace line as the sibling two-operand
+ * kernels so that this kernel shows up when tracing which loop was chosen.
+ */
+static NPY_GCC_OPT_3 void
+byte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data1 = (npy_byte *)dataptr[1];
+    npy_byte value0 = (*(npy_byte *)dataptr[0]);
+    npy_byte accum;
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+    /* Sum first, multiply once: the scalar factors out of the sum. */
+    accum = byte_sum_of_arr(data1, count);
+    *(npy_byte *)dataptr[2] = ((*(npy_byte *)dataptr[2]) + value0 * accum);
+}
+
+/*
+ * Two-operand kernel: reads a single npy_byte from dataptr[1], obtains an
+ * npy_byte total for the `count` elements at dataptr[0] from
+ * byte_sum_of_arr, and adds (value * total) to the single npy_byte at
+ * dataptr[2]. `nop` is not referenced and `strides` is marked unused.
+ *
+ * Emits the same NPY_EINSUM_DBG_PRINT1 trace line as the sibling two-operand
+ * kernels so that this kernel shows up when tracing which loop was chosen.
+ */
+static NPY_GCC_OPT_3 void
+byte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte value1 = (*(npy_byte *)dataptr[1]);
+    npy_byte accum;
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outstride0_two (%d)\n",
+                                                    (int)count);
+    /* Sum first, multiply once: the scalar factors out of the sum. */
+    accum = byte_sum_of_arr(data0, count);
+    *(npy_byte *)dataptr[2] = ((*(npy_byte *)dataptr[2]) + value1 * accum);
+}
+
+#elif 2 == 3 && !0
+
+/*
+ * Three-operand kernel over npy_byte arrays: for each of `count` elements,
+ * out[i] = in0[i] * in1[i] * in2[i] + out[i], where the inputs come from
+ * dataptr[0..2] and the output from dataptr[3]. `nop` is not referenced and
+ * `strides` is marked unused.
+ */
+static void
+byte_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *in0 = (npy_byte *)dataptr[0];
+    npy_byte *in1 = (npy_byte *)dataptr[1];
+    npy_byte *in2 = (npy_byte *)dataptr[2];
+    npy_byte *out = (npy_byte *)dataptr[3];
+    int lane;
+
+    /* Main part: consume whole groups of eight elements. */
+    for (; count >= 8; count -= 8, in0 += 8, in1 += 8, in2 += 8, out += 8) {
+        for (lane = 0; lane < 8; ++lane) {
+            out[lane] = ((in0[lane]) *
+                         (in1[lane]) *
+                         (in2[lane]) +
+                         (out[lane]));
+        }
+    }
+
+    /* Tail: up to eight more elements, stopping once count is used up. */
+    for (lane = 0; lane < 8; ++lane) {
+        if (count-- == 0) {
+            return;
+        }
+        out[lane] = ((in0[lane]) *
+                     (in1[lane]) *
+                     (in2[lane]) +
+                     (out[lane]));
+    }
+}
+
+#else /* 2 > 3 || complex */
+
+/*
+ * Generic form of the kernel: the number of inputs is taken from `nop` at
+ * run time instead of being fixed. For each of `count` steps it multiplies
+ * the current element of every input together, adds the product to the
+ * current output element at dataptr[nop], and then advances every pointer in
+ * dataptr[0..nop] by sizeof(npy_byte). `strides` is marked unused.
+ *
+ * Note that this function modifies the caller's dataptr array in place.
+ */
+static void
+byte_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+        npy_byte temp = (*(npy_byte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_byte *)dataptr[i]);
+        }
+        /* For nop >= 1 the loop above exits with i == nop, so dataptr[i]
+         * here is the same output element that is being written. */
+        *(npy_byte *)dataptr[nop] = (temp +
+                                           (*(npy_byte *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_byte);
+        }
+#else /* complex */
+        /* This arm is compiled out here. It treats each element as a
+         * (re, im) pair stored at indices [0] and [1]. */
+#  if 2 <= 3
+#    define _SUMPROD_NOP 2
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_byte re, im, tmp;
+        int i;
+        re = ((npy_byte *)dataptr[0])[0];
+        im = ((npy_byte *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            /* tmp holds the new real part so `re` is still the old value
+             * when the new imaginary part is computed. */
+            tmp = re * ((npy_byte *)dataptr[i])[0] -
+                  im * ((npy_byte *)dataptr[i])[1];
+            im = re * ((npy_byte *)dataptr[i])[1] +
+                 im * ((npy_byte *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_byte *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_byte *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_byte);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+
+/*
+ * One-operand kernel: totals the `count` elements at dataptr[0] and adds the
+ * total to the output at dataptr[1]. `nop` and `strides` are not referenced.
+ */
+static NPY_GCC_OPT_3 void
+byte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    /* Single-valued elements: let byte_sum_of_arr total the input. */
+    npy_byte *const dst = (npy_byte *)dataptr[1];
+    const npy_byte total = byte_sum_of_arr((npy_byte *)dataptr[0], count);
+
+    *dst = (total + (*dst));
+#else
+    /* Elements are pairs: even indices and odd indices are summed apart. */
+    npy_byte *src = (npy_byte *)dataptr[0];
+    npy_byte *const dst = (npy_byte *)dataptr[1];
+    npy_byte sum_re = 0, sum_im = 0;
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four pairs (eight values) per iteration. */
+    while (count > 4) {
+        const npy_byte re_lo = src[0] + src[2];
+        const npy_byte re_hi = src[4] + src[6];
+        const npy_byte im_lo = src[1] + src[3];
+        const npy_byte im_hi = src[5] + src[7];
+
+        sum_re += re_lo + re_hi;
+        sum_im += im_lo + im_hi;
+        src += 4*2;
+        count -= 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One pair per iteration for the remainder. */
+    while (count > 0) {
+        sum_re += src[0];
+        sum_im += src[1];
+        src += 2;
+        --count;
+    }
+    dst[0] += sum_re;
+    dst[1] += sum_im;
+#endif // !0
+}
+
+#endif /* 2 == 1 */
+
+/*
+ * Two-operand kernel with arbitrary input strides: walks dataptr[0] and
+ * dataptr[1] by strides[0] and strides[1] bytes for `count` steps, sums the
+ * element products into a local npy_byte accumulator, and finally adds the
+ * accumulator to the single npy_byte at dataptr[2].
+ *
+ * Only the preprocessor arms whose conditions hold for this expansion are
+ * compiled; the others remain in the text but are skipped.
+ */
+static void
+byte_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_byte accum_re = 0, accum_im = 0;
+#else
+    npy_byte accum = 0;
+#endif
+
+    /* Local copies of the pointers/strides; dataptr itself is not advanced
+     * on the paths that use them. */
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 2 == 1
+        accum += (*(npy_byte *)data0);
+        data0 += stride0;
+#  elif 2 == 2
+        /* The arm compiled for this expansion: product of two operands. */
+        accum += (*(npy_byte *)data0) *
+                 (*(npy_byte *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 2 == 3
+        accum += (*(npy_byte *)data0) *
+                 (*(npy_byte *)data1) *
+                 (*(npy_byte *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        /* Run-time operand count: this arm advances dataptr[] in place. */
+        npy_byte temp = (*(npy_byte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_byte *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        accum_re += ((npy_byte *)data0)[0];
+        accum_im += ((npy_byte *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_byte re, im, tmp;
+        int i;
+        re = ((npy_byte *)dataptr[0])[0];
+        im = ((npy_byte *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            /* tmp keeps the new real part until `im` has been updated. */
+            tmp = re * ((npy_byte *)dataptr[i])[0] -
+                  im * ((npy_byte *)dataptr[i])[1];
+            im = re * ((npy_byte *)dataptr[i])[1] +
+                 im * ((npy_byte *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+    /* Fold the local total into the output, written once after the loop. */
+#if 0
+#  if 2 <= 3
+    ((npy_byte *)dataptr[2])[0] += accum_re;
+    ((npy_byte *)dataptr[2])[1] += accum_im;
+#  else
+    ((npy_byte *)dataptr[nop])[0] += accum_re;
+    ((npy_byte *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 2 <= 3
+    *((npy_byte *)dataptr[2]) = (accum +
+                                    (*((npy_byte *)dataptr[2])));
+#  else
+    *((npy_byte *)dataptr[nop]) = (accum +
+                                    (*((npy_byte *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+/*
+ * Three-operand kernel with arbitrary strides: for each of `count` steps,
+ * out = in0 * in1 * in2 + out on npy_byte elements, where the inputs are
+ * dataptr[0..2] and the output is dataptr[3]; every pointer is then advanced
+ * by its own entry in `strides` (a byte offset added to a char pointer).
+ *
+ * Only the preprocessor arms whose conditions hold for this expansion are
+ * compiled; the others remain in the text but are skipped.
+ */
+static void
+byte_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    /* Local copies of the pointers/strides; dataptr itself is not advanced
+     * on the paths that use them. */
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data_out = dataptr[3];
+    npy_intp stride_out = strides[3];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_three (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 3 == 1
+        *(npy_byte *)data_out = ((*(npy_byte *)data0) +
+                                         (*(npy_byte *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 3 == 2
+        *(npy_byte *)data_out = ((*(npy_byte *)data0) *
+                                         (*(npy_byte *)data1) +
+                                         (*(npy_byte *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 3 == 3
+        /* The arm compiled for this expansion: product of three operands. */
+        *(npy_byte *)data_out = ((*(npy_byte *)data0) *
+                                         (*(npy_byte *)data1) *
+                                         (*(npy_byte *)data2) +
+                                         (*(npy_byte *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        /* Run-time operand count: this arm advances dataptr[] in place. */
+        npy_byte temp = (*(npy_byte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_byte *)dataptr[i]);
+        }
+        /* For nop >= 1 the loop above exits with i == nop. */
+        *(npy_byte *)dataptr[nop] = (temp +
+                                           (*(npy_byte *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        ((npy_byte *)data_out)[0] = ((npy_byte *)data0)[0] +
+                                         ((npy_byte *)data_out)[0];
+        ((npy_byte *)data_out)[1] = ((npy_byte *)data0)[1] +
+                                         ((npy_byte *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_byte re, im, tmp;
+        int i;
+        re = ((npy_byte *)dataptr[0])[0];
+        im = ((npy_byte *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            /* tmp keeps the new real part until `im` has been updated. */
+            tmp = re * ((npy_byte *)dataptr[i])[0] -
+                  im * ((npy_byte *)dataptr[i])[1];
+            im = re * ((npy_byte *)dataptr[i])[1] +
+                 im * ((npy_byte *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_byte *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_byte *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 3 == 1
+
+/*
+ * One-operand kernel over npy_byte arrays: data_out[i] = data0[i] +
+ * data_out[i] for `count` elements, with data0 = dataptr[0] and
+ * data_out = dataptr[1]. `nop` is not referenced and `strides` is marked
+ * unused.
+ *
+ * Control flow: the switch handles counts 0..7 through case fallthrough and
+ * returns. It has no default, so a count of 8 or more matches no case and
+ * drops into the unrolled-by-8 loop, after which the goto re-enters the
+ * switch with the remainder.
+ */
+static void
+byte_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data_out = (npy_byte *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+        /* Cases deliberately fall through: entering at case k+1 processes
+         * indices k, k-1, ..., 0 and then reaches `case 0`. */
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_byte *)data_out + 2*6)[0] =
+                                    ((npy_byte *)data0 + 2*6)[0] +
+                                    ((npy_byte *)data_out + 2*6)[0];
+            ((npy_byte *)data_out + 2*6)[1] =
+                                    ((npy_byte *)data0 + 2*6)[1] +
+                                    ((npy_byte *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_byte *)data_out + 2*5)[0] =
+                                    ((npy_byte *)data0 + 2*5)[0] +
+                                    ((npy_byte *)data_out + 2*5)[0];
+            ((npy_byte *)data_out + 2*5)[1] =
+                                    ((npy_byte *)data0 + 2*5)[1] +
+                                    ((npy_byte *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_byte *)data_out + 2*4)[0] =
+                                    ((npy_byte *)data0 + 2*4)[0] +
+                                    ((npy_byte *)data_out + 2*4)[0];
+            ((npy_byte *)data_out + 2*4)[1] =
+                                    ((npy_byte *)data0 + 2*4)[1] +
+                                    ((npy_byte *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_byte *)data_out + 2*3)[0] =
+                                    ((npy_byte *)data0 + 2*3)[0] +
+                                    ((npy_byte *)data_out + 2*3)[0];
+            ((npy_byte *)data_out + 2*3)[1] =
+                                    ((npy_byte *)data0 + 2*3)[1] +
+                                    ((npy_byte *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_byte *)data_out + 2*2)[0] =
+                                    ((npy_byte *)data0 + 2*2)[0] +
+                                    ((npy_byte *)data_out + 2*2)[0];
+            ((npy_byte *)data_out + 2*2)[1] =
+                                    ((npy_byte *)data0 + 2*2)[1] +
+                                    ((npy_byte *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_byte *)data_out + 2*1)[0] =
+                                    ((npy_byte *)data0 + 2*1)[0] +
+                                    ((npy_byte *)data_out + 2*1)[0];
+            ((npy_byte *)data_out + 2*1)[1] =
+                                    ((npy_byte *)data0 + 2*1)[1] +
+                                    ((npy_byte *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_byte *)data_out + 2*0)[0] =
+                                    ((npy_byte *)data0 + 2*0)[0] +
+                                    ((npy_byte *)data_out + 2*0)[0];
+            ((npy_byte *)data_out + 2*0)[1] =
+                                    ((npy_byte *)data0 + 2*0)[1] +
+                                    ((npy_byte *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*0)[0] =
+                                ((npy_byte *)data0 + 2*0)[0] +
+                                ((npy_byte *)data_out + 2*0)[0];
+        ((npy_byte *)data_out + 2*0)[1] =
+                                ((npy_byte *)data0 + 2*0)[1] +
+                                ((npy_byte *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*1)[0] =
+                                ((npy_byte *)data0 + 2*1)[0] +
+                                ((npy_byte *)data_out + 2*1)[0];
+        ((npy_byte *)data_out + 2*1)[1] =
+                                ((npy_byte *)data0 + 2*1)[1] +
+                                ((npy_byte *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*2)[0] =
+                                ((npy_byte *)data0 + 2*2)[0] +
+                                ((npy_byte *)data_out + 2*2)[0];
+        ((npy_byte *)data_out + 2*2)[1] =
+                                ((npy_byte *)data0 + 2*2)[1] +
+                                ((npy_byte *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*3)[0] =
+                                ((npy_byte *)data0 + 2*3)[0] +
+                                ((npy_byte *)data_out + 2*3)[0];
+        ((npy_byte *)data_out + 2*3)[1] =
+                                ((npy_byte *)data0 + 2*3)[1] +
+                                ((npy_byte *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*4)[0] =
+                                ((npy_byte *)data0 + 2*4)[0] +
+                                ((npy_byte *)data_out + 2*4)[0];
+        ((npy_byte *)data_out + 2*4)[1] =
+                                ((npy_byte *)data0 + 2*4)[1] +
+                                ((npy_byte *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*5)[0] =
+                                ((npy_byte *)data0 + 2*5)[0] +
+                                ((npy_byte *)data_out + 2*5)[0];
+        ((npy_byte *)data_out + 2*5)[1] =
+                                ((npy_byte *)data0 + 2*5)[1] +
+                                ((npy_byte *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*6)[0] =
+                                ((npy_byte *)data0 + 2*6)[0] +
+                                ((npy_byte *)data_out + 2*6)[0];
+        ((npy_byte *)data_out + 2*6)[1] =
+                                ((npy_byte *)data0 + 2*6)[1] +
+                                ((npy_byte *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_byte *)data_out + 2*7)[0] =
+                                ((npy_byte *)data0 + 2*7)[0] +
+                                ((npy_byte *)data_out + 2*7)[0];
+        ((npy_byte *)data_out + 2*7)[1] =
+                                ((npy_byte *)data0 + 2*7)[1] +
+                                ((npy_byte *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    /* count is now below 8; the switch above handles it and returns. */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 3 == 2 && !0
+
+/*
+ * Multiply-add helper: for each of `count` elements,
+ * data_out[i] = scalar * data[i] + data_out[i].
+ *
+ *   data     - input npy_byte array, read only
+ *   data_out - npy_byte array that is both read and overwritten
+ *   scalar   - the factor applied to every input element
+ *   count    - number of elements to process
+ */
+static NPY_GCC_OPT_3 void
+byte_sum_of_products_muladd(npy_byte *data, npy_byte *data_out, npy_byte scalar, npy_intp count)
+{
+    /* The vector arm below is compiled out for this type (condition is 0). */
+#if 0 // NPYV check for npy_byte
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s8;
+    const npyv_s8 v_scalar = npyv_setall_s8(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s8 b0 = npyv_loada_s8(data + vstep * 0);
+            npyv_s8 c0 = npyv_loada_s8(data_out + vstep * 0);
+            
+#line 312
+            npyv_s8 b1 = npyv_loada_s8(data + vstep * 1);
+            npyv_s8 c1 = npyv_loada_s8(data_out + vstep * 1);
+            
+#line 312
+            npyv_s8 b2 = npyv_loada_s8(data + vstep * 2);
+            npyv_s8 c2 = npyv_loada_s8(data_out + vstep * 2);
+            
+#line 312
+            npyv_s8 b3 = npyv_loada_s8(data + vstep * 3);
+            npyv_s8 c3 = npyv_loada_s8(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s8 abc0 = npyv_muladd_s8(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s8 abc1 = npyv_muladd_s8(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s8 abc2 = npyv_muladd_s8(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s8 abc3 = npyv_muladd_s8(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_s8(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_s8(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_s8(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_s8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s8 b0 = npyv_load_s8(data + vstep * 0);
+            npyv_s8 c0 = npyv_load_s8(data_out + vstep * 0);
+            
+#line 312
+            npyv_s8 b1 = npyv_load_s8(data + vstep * 1);
+            npyv_s8 c1 = npyv_load_s8(data_out + vstep * 1);
+            
+#line 312
+            npyv_s8 b2 = npyv_load_s8(data + vstep * 2);
+            npyv_s8 c2 = npyv_load_s8(data_out + vstep * 2);
+            
+#line 312
+            npyv_s8 b3 = npyv_load_s8(data + vstep * 3);
+            npyv_s8 c3 = npyv_load_s8(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s8 abc0 = npyv_muladd_s8(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s8 abc1 = npyv_muladd_s8(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s8 abc2 = npyv_muladd_s8(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s8 abc3 = npyv_muladd_s8(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_s8(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_s8(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_s8(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_s8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_s8 a = npyv_load_tillz_s8(data, count);
+        npyv_s8 b = npyv_load_tillz_s8(data_out, count);
+        npyv_store_till_s8(data_out, count, npyv_muladd_s8(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+    /* Scalar arm: the 4-way unrolled loop is skipped when
+     * NPY_DISABLE_OPTIMIZATION is defined. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        /* All four loads happen before any of the four stores. */
+        #line 340
+        const npy_byte b0 = (data[0]);
+        const npy_byte c0 = (data_out[0]);
+        
+#line 340
+        const npy_byte b1 = (data[1]);
+        const npy_byte c1 = (data_out[1]);
+        
+#line 340
+        const npy_byte b2 = (data[2]);
+        const npy_byte c2 = (data_out[2]);
+        
+#line 340
+        const npy_byte b3 = (data[3]);
+        const npy_byte c3 = (data_out[3]);
+        
+        #line 346
+        const npy_byte abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_byte abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_byte abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_byte abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One element per iteration for whatever count is left. */
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_byte b = (*data);
+        const npy_byte c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_byte
+}
+
+/*
+ * Two-operand kernel over contiguous npy_byte buffers:
+ *
+ *     data_out[i] = data0[i] * data1[i] + data_out[i]    for i in [0, count)
+ *
+ * nop      operand count; not read by this kernel.
+ * dataptr  dataptr[0] and dataptr[1] are the inputs, dataptr[2] the output.
+ * strides  not read; every buffer is stepped one npy_byte at a time.
+ * count    number of elements to process.
+ *
+ * The NPYV check for npy_byte evaluates to 0, so this kernel is scalar-only.
+ */
+static void
+byte_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data1 = (npy_byte *)dataptr[1];
+    npy_byte *data_out = (npy_byte *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Four elements per iteration.  All four results are computed before the
+     * first store so that the load/store order is unchanged should the
+     * buffers overlap.
+     */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        const npy_byte abc0 = data0[0] * data1[0] + data_out[0];
+        const npy_byte abc1 = data0[1] * data1[1] + data_out[1];
+        const npy_byte abc2 = data0[2] * data1[2] + data_out[2];
+        const npy_byte abc3 = data0[3] * data1[3] + data_out[3];
+
+        data_out[0] = abc0;
+        data_out[1] = abc1;
+        data_out[2] = abc2;
+        data_out[3] = abc3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Leftover elements (or all of them when the unrolled loop is disabled). */
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        *data_out = (*data0) * (*data1) + (*data_out);
+    }
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand kernel where the first operand contributes a single value:
+ * one npy_byte is read from dataptr[0] and handed, together with the
+ * dataptr[1] input buffer and the dataptr[2] output buffer, to
+ * byte_sum_of_products_muladd (dataout = data*scalar+dataout).
+ *
+ * nop and strides are not read.
+ */
+static void
+byte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_byte factor = *(npy_byte *)dataptr[0];
+    npy_byte *src = (npy_byte *)dataptr[1];
+    npy_byte *dst = (npy_byte *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    byte_sum_of_products_muladd(src, dst, factor, count);
+}
+
+/*
+ * Two-operand kernel where the second operand contributes a single value:
+ * one npy_byte is read from dataptr[1] and handed, together with the
+ * dataptr[0] input buffer and the dataptr[2] output buffer, to
+ * byte_sum_of_products_muladd (dataout = data*scalar+dataout).
+ *
+ * nop and strides are not read.
+ */
+static void
+byte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_byte factor = *(npy_byte *)dataptr[1];
+    npy_byte *src = (npy_byte *)dataptr[0];
+    npy_byte *dst = (npy_byte *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    byte_sum_of_products_muladd(src, dst, factor, count);
+}
+
+/*
+ * Two-operand reduction into a single output element:
+ *
+ *     *dataptr[2] += sum over i in [0, count) of data0[i] * data1[i]
+ *
+ * nop      operand count; not read by this kernel.
+ * dataptr  dataptr[0] and dataptr[1] are contiguous npy_byte inputs,
+ *          dataptr[2] points at the one npy_byte that receives the sum.
+ * strides  not read.
+ * count    number of element pairs to accumulate.
+ *
+ * The NPYV check for npy_byte evaluates to 0, so this kernel is scalar-only.
+ */
+static NPY_GCC_OPT_3 void
+byte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data1 = (npy_byte *)dataptr[1];
+    npy_byte accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four products per iteration, folded into the accumulator together. */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        const npy_byte ab0 = data0[0] * data1[0];
+        const npy_byte ab1 = data0[1] * data1[1];
+        const npy_byte ab2 = data0[2] * data1[2];
+        const npy_byte ab3 = data0[3] * data1[3];
+
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Leftover pairs (or all of them when the unrolled loop is disabled). */
+    for (; count > 0; --count, ++data0, ++data1) {
+        accum += (*data0) * (*data1);
+    }
+    *(npy_byte *)dataptr[2] = ((*(npy_byte *)dataptr[2]) + accum);
+}
+
+/*
+ * Two-operand reduction with a single-valued first operand:
+ *
+ *     *dataptr[2] += (*dataptr[0]) * byte_sum_of_arr(dataptr[1], count)
+ *
+ * byte_sum_of_arr is defined outside this block; presumably it returns the
+ * sum of the `count` elements it is given.  nop and strides are not read.
+ */
+static NPY_GCC_OPT_3 void
+byte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *out = (npy_byte *)dataptr[2];
+    const npy_byte scalar = *(npy_byte *)dataptr[0];
+    const npy_byte total = byte_sum_of_arr((npy_byte *)dataptr[1], count);
+
+    *out += scalar * total;
+}
+
+/*
+ * Two-operand reduction with a single-valued second operand:
+ *
+ *     *dataptr[2] += (*dataptr[1]) * byte_sum_of_arr(dataptr[0], count)
+ *
+ * byte_sum_of_arr is defined outside this block; presumably it returns the
+ * sum of the `count` elements it is given.  nop and strides are not read.
+ */
+static NPY_GCC_OPT_3 void
+byte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *out = (npy_byte *)dataptr[2];
+    const npy_byte scalar = *(npy_byte *)dataptr[1];
+    const npy_byte total = byte_sum_of_arr((npy_byte *)dataptr[0], count);
+
+    *out += scalar * total;
+}
+
+#elif 3 == 3 && !0
+
+/*
+ * Three-operand kernel over contiguous npy_byte buffers:
+ *
+ *     data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]
+ *
+ * nop      operand count; not read by this kernel.
+ * dataptr  dataptr[0..2] are the inputs, dataptr[3] is the output.
+ * strides  not read; every buffer is stepped one npy_byte at a time.
+ * count    number of elements to process.
+ */
+static void
+byte_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data1 = (npy_byte *)dataptr[1];
+    npy_byte *data2 = (npy_byte *)dataptr[2];
+    npy_byte *data_out = (npy_byte *)dataptr[3];
+    int k;
+
+    /* Whole groups of eight elements, visited in ascending index order. */
+    for (; count >= 8; count -= 8) {
+        for (k = 0; k < 8; ++k) {
+            data_out[k] = data0[k] * data1[k] * data2[k] + data_out[k];
+        }
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /*
+     * Leftovers: at most eight further slots are visited, and the walk stops
+     * at the first slot for which `count` has already reached zero.
+     */
+    for (k = 0; k < 8 && count-- != 0; ++k) {
+        data_out[k] = data0[k] * data1[k] * data2[k] + data_out[k];
+    }
+}
+
+#else /* 3 > 3 || @complex */
+
+/*
+ * Generic contiguous kernel written for an arbitrary operand count `nop`.
+ * For each of the `count` elements:
+ *
+ *     *dataptr[nop] += (*dataptr[0]) * (*dataptr[1]) * ... * (*dataptr[nop-1])
+ *
+ * and afterwards every pointer dataptr[0..nop] is advanced in place by
+ * sizeof(npy_byte), so the caller's pointer array is modified.
+ *
+ * nop      number of input operands; dataptr[nop] is the output.
+ * dataptr  operand pointers, updated in place as described above.
+ * strides  not read.
+ * count    number of elements to process.
+ *
+ * NOTE(review): this definition sits in the `#else` arm that follows
+ * `#elif 3 == 3 && !0`, so it does not appear to be compiled for this
+ * instantiation - confirm before relying on it.
+ */
+static void
+byte_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_three (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+        npy_byte temp = (*(npy_byte *)dataptr[0]);
+        int i;
+
+        /* Multiply the remaining inputs into the running product. */
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_byte *)dataptr[i]);
+        }
+        /* Read and write the output through the same dataptr[nop] slot. */
+        *(npy_byte *)dataptr[nop] = (temp +
+                                           (*(npy_byte *)dataptr[nop]));
+        /* Step all inputs and the output to the next element. */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_byte);
+        }
+    }
+}
+
+#endif /* functions for various 3 */
+
+#if 3 == 1
+
+/*
+ * One-operand reduction into a single output element:
+ *
+ *     *dataptr[1] += byte_sum_of_arr(dataptr[0], count)
+ *
+ * byte_sum_of_arr is defined outside this block; presumably it returns the
+ * sum of the `count` elements it is given.  nop and strides are not read.
+ *
+ * NOTE(review): the enclosing guard is `#if 3 == 1`, so this definition
+ * does not appear to be compiled for this instantiation - confirm.
+ */
+static NPY_GCC_OPT_3 void
+byte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+    npy_byte *data = (npy_byte *)dataptr[0];
+    npy_byte accum = byte_sum_of_arr(data, count);
+    *((npy_byte *)dataptr[1]) = (accum + (*((npy_byte *)dataptr[1])));
+}
+
+#endif /* 3 == 1 */
+
+/*
+ * Three-operand reduction into a single output element:
+ *
+ *     *dataptr[3] += sum over `count` steps of (*data0) * (*data1) * (*data2)
+ *
+ * nop      operand count; not read by this kernel.
+ * dataptr  dataptr[0..2] are the inputs, dataptr[3] the output element.
+ *          The array itself is not modified; local cursors are advanced.
+ * strides  strides[0..2] are the per-step byte offsets of the three inputs.
+ * count    number of steps to accumulate.
+ */
+static void
+byte_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    npy_byte accum = 0;
+
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+        accum += (*(npy_byte *)data0) *
+                 (*(npy_byte *)data1) *
+                 (*(npy_byte *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+    }
+
+    /* Fold the local accumulator into the single output element. */
+    *((npy_byte *)dataptr[3]) = (accum +
+                                    (*((npy_byte *)dataptr[3])));
+}
+
+
+#line 131
+/*
+ * Fully generic strided kernel for an arbitrary operand count `nop`.
+ * For each of the `count` steps:
+ *
+ *     *dataptr[nop] += (*dataptr[0]) * (*dataptr[1]) * ... * (*dataptr[nop-1])
+ *
+ * and afterwards every pointer dataptr[0..nop] is advanced in place by its
+ * own strides[i], so the caller's pointer array is modified.
+ *
+ * nop      number of input operands; dataptr[nop] is the output.
+ * dataptr  operand pointers, updated in place as described above.
+ * strides  byte offsets applied to dataptr[0..nop] after each step.
+ * count    number of steps to process.
+ */
+static void
+byte_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_any (%d)\n", (int)count);
+
+    while (count--) {
+        npy_byte temp = (*(npy_byte *)dataptr[0]);
+        int i;
+
+        /* Multiply the remaining inputs into the running product. */
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_byte *)dataptr[i]);
+        }
+        /* Read and write the output through the same dataptr[nop] slot. */
+        *(npy_byte *)dataptr[nop] = (temp +
+                                           (*(npy_byte *)dataptr[nop]));
+        /* Step all inputs and the output by their individual strides. */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+    }
+}
+
+#if 1000 == 1
+
+/*
+ * One-operand kernel over contiguous npy_byte buffers:
+ *
+ *     data_out[i] = data0[i] + data_out[i]    for i in [0, count)
+ *
+ * nop      operand count; not read by this kernel.
+ * dataptr  dataptr[0] is the input, dataptr[1] the output.
+ * strides  not read; both buffers are stepped one npy_byte at a time.
+ * count    number of elements to process.
+ *
+ * Control flow: a switch handles any count from 0 to 7 by falling through
+ * from the highest remaining index down to index 0 and then returns.  Larger
+ * counts skip the switch, are consumed eight at a time, and jump back to the
+ * switch for the remainder.
+ *
+ * NOTE(review): the enclosing guard is `#if 1000 == 1`, so this definition
+ * does not appear to be compiled for this instantiation - confirm.
+ */
+static void
+byte_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data_out = (npy_byte *)dataptr[1];
+    int i;
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+        case 7:
+            data_out[6] = data0[6] + data_out[6];
+            /* fall through */
+        case 6:
+            data_out[5] = data0[5] + data_out[5];
+            /* fall through */
+        case 5:
+            data_out[4] = data0[4] + data_out[4];
+            /* fall through */
+        case 4:
+            data_out[3] = data0[3] + data_out[3];
+            /* fall through */
+        case 3:
+            data_out[2] = data0[2] + data_out[2];
+            /* fall through */
+        case 2:
+            data_out[1] = data0[1] + data_out[1];
+            /* fall through */
+        case 1:
+            data_out[0] = data0[0] + data_out[0];
+            /* fall through */
+        case 0:
+            return;
+    }
+
+    /* Consume whole groups of eight elements in ascending index order. */
+    while (count >= 8) {
+        count -= 8;
+
+        for (i = 0; i < 8; ++i) {
+            data_out[i] = data0[i] + data_out[i];
+        }
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1000 == 2 && !0
+
+// Multiply-add over count elements: data_out[i] = scalar * data[i] + data_out[i]
+static NPY_GCC_OPT_3 void
+byte_sum_of_products_muladd(npy_byte *data, npy_byte *data_out, npy_byte scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_byte: the condition is the literal 0, so only the scalar #else branch is compiled
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s8;
+    const npyv_s8 v_scalar = npyv_setall_s8(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s8 b0 = npyv_loada_s8(data + vstep * 0);
+            npyv_s8 c0 = npyv_loada_s8(data_out + vstep * 0);
+            
+#line 312
+            npyv_s8 b1 = npyv_loada_s8(data + vstep * 1);
+            npyv_s8 c1 = npyv_loada_s8(data_out + vstep * 1);
+            
+#line 312
+            npyv_s8 b2 = npyv_loada_s8(data + vstep * 2);
+            npyv_s8 c2 = npyv_loada_s8(data_out + vstep * 2);
+            
+#line 312
+            npyv_s8 b3 = npyv_loada_s8(data + vstep * 3);
+            npyv_s8 c3 = npyv_loada_s8(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s8 abc0 = npyv_muladd_s8(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s8 abc1 = npyv_muladd_s8(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s8 abc2 = npyv_muladd_s8(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s8 abc3 = npyv_muladd_s8(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_s8(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_s8(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_s8(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_s8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s8 b0 = npyv_load_s8(data + vstep * 0);
+            npyv_s8 c0 = npyv_load_s8(data_out + vstep * 0);
+            
+#line 312
+            npyv_s8 b1 = npyv_load_s8(data + vstep * 1);
+            npyv_s8 c1 = npyv_load_s8(data_out + vstep * 1);
+            
+#line 312
+            npyv_s8 b2 = npyv_load_s8(data + vstep * 2);
+            npyv_s8 c2 = npyv_load_s8(data_out + vstep * 2);
+            
+#line 312
+            npyv_s8 b3 = npyv_load_s8(data + vstep * 3);
+            npyv_s8 c3 = npyv_load_s8(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s8 abc0 = npyv_muladd_s8(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s8 abc1 = npyv_muladd_s8(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s8 abc2 = npyv_muladd_s8(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s8 abc3 = npyv_muladd_s8(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_s8(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_s8(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_s8(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_s8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_s8 a = npyv_load_tillz_s8(data, count);
+        npyv_s8 b = npyv_load_tillz_s8(data_out, count);
+        npyv_store_till_s8(data_out, count, npyv_muladd_s8(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { // manually unrolled: 4 elements per iteration
+        #line 340
+        const npy_byte b0 = (data[0]);
+        const npy_byte c0 = (data_out[0]);
+        
+#line 340
+        const npy_byte b1 = (data[1]);
+        const npy_byte c1 = (data_out[1]);
+        
+#line 340
+        const npy_byte b2 = (data[2]);
+        const npy_byte c2 = (data_out[2]);
+        
+#line 340
+        const npy_byte b3 = (data[3]);
+        const npy_byte c3 = (data_out[3]);
+        
+        #line 346
+        const npy_byte abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_byte abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_byte abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_byte abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { // remaining elements (or all of them when unrolling is disabled), one at a time
+        const npy_byte b = (*data);
+        const npy_byte c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_byte
+}
+
+static void /* Two-operand kernel over packed npy_byte arrays: data_out[i] += data0[i] * data1[i] for count elements */
+byte_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data1 = (npy_byte *)dataptr[1];
+    npy_byte *data_out = (npy_byte *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_byte: the condition below is the literal 0, so only the scalar #else branch is compiled
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s8;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s8 a0 = npyv_loada_s8(data0 + vstep * 0);
+            npyv_s8 b0 = npyv_loada_s8(data1 + vstep * 0);
+            npyv_s8 c0 = npyv_loada_s8(data_out + vstep * 0);
+            
+#line 390
+            npyv_s8 a1 = npyv_loada_s8(data0 + vstep * 1);
+            npyv_s8 b1 = npyv_loada_s8(data1 + vstep * 1);
+            npyv_s8 c1 = npyv_loada_s8(data_out + vstep * 1);
+            
+#line 390
+            npyv_s8 a2 = npyv_loada_s8(data0 + vstep * 2);
+            npyv_s8 b2 = npyv_loada_s8(data1 + vstep * 2);
+            npyv_s8 c2 = npyv_loada_s8(data_out + vstep * 2);
+            
+#line 390
+            npyv_s8 a3 = npyv_loada_s8(data0 + vstep * 3);
+            npyv_s8 b3 = npyv_loada_s8(data1 + vstep * 3);
+            npyv_s8 c3 = npyv_loada_s8(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s8 abc0 = npyv_muladd_s8(a0, b0, c0);
+            
+#line 397
+            npyv_s8 abc1 = npyv_muladd_s8(a1, b1, c1);
+            
+#line 397
+            npyv_s8 abc2 = npyv_muladd_s8(a2, b2, c2);
+            
+#line 397
+            npyv_s8 abc3 = npyv_muladd_s8(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_s8(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_s8(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_s8(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_s8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s8 a0 = npyv_load_s8(data0 + vstep * 0);
+            npyv_s8 b0 = npyv_load_s8(data1 + vstep * 0);
+            npyv_s8 c0 = npyv_load_s8(data_out + vstep * 0);
+            
+#line 390
+            npyv_s8 a1 = npyv_load_s8(data0 + vstep * 1);
+            npyv_s8 b1 = npyv_load_s8(data1 + vstep * 1);
+            npyv_s8 c1 = npyv_load_s8(data_out + vstep * 1);
+            
+#line 390
+            npyv_s8 a2 = npyv_load_s8(data0 + vstep * 2);
+            npyv_s8 b2 = npyv_load_s8(data1 + vstep * 2);
+            npyv_s8 c2 = npyv_load_s8(data_out + vstep * 2);
+            
+#line 390
+            npyv_s8 a3 = npyv_load_s8(data0 + vstep * 3);
+            npyv_s8 b3 = npyv_load_s8(data1 + vstep * 3);
+            npyv_s8 c3 = npyv_load_s8(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s8 abc0 = npyv_muladd_s8(a0, b0, c0);
+            
+#line 397
+            npyv_s8 abc1 = npyv_muladd_s8(a1, b1, c1);
+            
+#line 397
+            npyv_s8 abc2 = npyv_muladd_s8(a2, b2, c2);
+            
+#line 397
+            npyv_s8 abc3 = npyv_muladd_s8(a3, b3, c3);
+            
+            #line 402
+            npyv_store_s8(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_s8(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_s8(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_s8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_s8 a = npyv_load_tillz_s8(data0, count);
+        npyv_s8 b = npyv_load_tillz_s8(data1, count);
+        npyv_s8 c = npyv_load_tillz_s8(data_out, count);
+        npyv_store_till_s8(data_out, count, npyv_muladd_s8(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { // manually unrolled: 4 elements per iteration
+        #line 420
+        const npy_byte a0 = (data0[0]);
+        const npy_byte b0 = (data1[0]);
+        const npy_byte c0 = (data_out[0]);
+        
+#line 420
+        const npy_byte a1 = (data0[1]);
+        const npy_byte b1 = (data1[1]);
+        const npy_byte c1 = (data_out[1]);
+        
+#line 420
+        const npy_byte a2 = (data0[2]);
+        const npy_byte b2 = (data1[2]);
+        const npy_byte c2 = (data_out[2]);
+        
+#line 420
+        const npy_byte a3 = (data0[3]);
+        const npy_byte b3 = (data1[3]);
+        const npy_byte c3 = (data_out[3]);
+        
+        #line 427
+        const npy_byte abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_byte abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_byte abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_byte abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { // remaining elements (or all of them when unrolling is disabled), one at a time
+        const npy_byte a = (*data0);
+        const npy_byte b = (*data1);
+        const npy_byte c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_byte
+
+}
+
+/* Some extra specializations for the two operand case */
+static void /* Operand 0 is read once as a scalar: data_out[i] += value0 * data1[i] for count elements */
+byte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte value0 = (*(npy_byte *)dataptr[0]);
+    npy_byte *data1 = (npy_byte *)dataptr[1];
+    npy_byte *data_out = (npy_byte *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    byte_sum_of_products_muladd(data1, data_out, value0, count); // all element-wise work is delegated to the shared multiply-add helper
+    
+}
+
+static void /* Operand 1 is read once as a scalar: data_out[i] += data0[i] * value1 for count elements */
+byte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte value1 = (*(npy_byte *)dataptr[1]);
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data_out = (npy_byte *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    byte_sum_of_products_muladd(data0, data_out, value1, count); // all element-wise work is delegated to the shared multiply-add helper
+}
+
+static NPY_GCC_OPT_3 void /* Reduction kernel: adds the sum of data0[i] * data1[i] over count elements to the single output element at dataptr[2] */
+byte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data1 = (npy_byte *)dataptr[1];
+    npy_byte accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_byte: the condition is the literal 0, so only the scalar #else branch is compiled
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_s8;
+    npyv_s8 v_accum = npyv_zero_s8();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s8 a0 = npyv_loada_s8(data0 + vstep * 0);
+            npyv_s8 b0 = npyv_loada_s8(data1 + vstep * 0);
+            
+#line 501
+            npyv_s8 a1 = npyv_loada_s8(data0 + vstep * 1);
+            npyv_s8 b1 = npyv_loada_s8(data1 + vstep * 1);
+            
+#line 501
+            npyv_s8 a2 = npyv_loada_s8(data0 + vstep * 2);
+            npyv_s8 b2 = npyv_loada_s8(data1 + vstep * 2);
+            
+#line 501
+            npyv_s8 a3 = npyv_loada_s8(data0 + vstep * 3);
+            npyv_s8 b3 = npyv_loada_s8(data1 + vstep * 3);
+            
+            npyv_s8 ab3 = npyv_muladd_s8(a3, b3, v_accum);
+            npyv_s8 ab2 = npyv_muladd_s8(a2, b2, ab3);
+            npyv_s8 ab1 = npyv_muladd_s8(a1, b1, ab2);
+                   v_accum = npyv_muladd_s8(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s8 a0 = npyv_load_s8(data0 + vstep * 0);
+            npyv_s8 b0 = npyv_load_s8(data1 + vstep * 0);
+            
+#line 501
+            npyv_s8 a1 = npyv_load_s8(data0 + vstep * 1);
+            npyv_s8 b1 = npyv_load_s8(data1 + vstep * 1);
+            
+#line 501
+            npyv_s8 a2 = npyv_load_s8(data0 + vstep * 2);
+            npyv_s8 b2 = npyv_load_s8(data1 + vstep * 2);
+            
+#line 501
+            npyv_s8 a3 = npyv_load_s8(data0 + vstep * 3);
+            npyv_s8 b3 = npyv_load_s8(data1 + vstep * 3);
+            
+            npyv_s8 ab3 = npyv_muladd_s8(a3, b3, v_accum);
+            npyv_s8 ab2 = npyv_muladd_s8(a2, b2, ab3);
+            npyv_s8 ab1 = npyv_muladd_s8(a1, b1, ab2);
+                   v_accum = npyv_muladd_s8(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_s8 a = npyv_load_tillz_s8(data0, count);
+        npyv_s8 b = npyv_load_tillz_s8(data1, count);
+        v_accum = npyv_muladd_s8(a, b, v_accum);
+    }
+    accum = npyv_sum_s8(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) { // manually unrolled: 4 products per iteration
+        #line 524
+        const npy_byte ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_byte ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_byte ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_byte ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { // remaining elements (or all of them when unrolling is disabled), one at a time
+        const npy_byte a = (*data0);
+        const npy_byte b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_byte
+    *(npy_byte *)dataptr[2] = ((*(npy_byte *)dataptr[2]) + accum); // fold the local accumulator into the existing output value
+}
+
+static NPY_GCC_OPT_3 void /* Operand 0 is a scalar and the output is a single element: out += value0 * byte_sum_of_arr(data1, count) */
+byte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data1 = (npy_byte *)dataptr[1];
+    npy_byte value0 = (*(npy_byte *)dataptr[0]);
+    npy_byte accum = byte_sum_of_arr(data1, count); // helper is defined outside this view; presumably returns the sum of count elements - confirm
+    *(npy_byte *)dataptr[2] = ((*(npy_byte *)dataptr[2]) + value0 * accum); // scale the helper result by the scalar operand once, then add it to the output
+}
+
+static NPY_GCC_OPT_3 void /* Operand 1 is a scalar and the output is a single element: out += value1 * byte_sum_of_arr(data0, count) */
+byte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte value1 = (*(npy_byte *)dataptr[1]);
+    npy_byte accum = byte_sum_of_arr(data0, count); // helper is defined outside this view; presumably returns the sum of count elements - confirm
+    *(npy_byte *)dataptr[2] = ((*(npy_byte *)dataptr[2]) + value1 * accum); // scale the helper result by the scalar operand once, then add it to the output
+}
+
+#elif 1000 == 3 && !0
+
+static void /* Three-operand kernel over packed npy_byte arrays: data_out[i] += data0[i] * data1[i] * data2[i] for count elements */
+byte_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+    npy_byte *data1 = (npy_byte *)dataptr[1];
+    npy_byte *data2 = (npy_byte *)dataptr[2];
+    npy_byte *data_out = (npy_byte *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: at most 7 elements remain; each step below returns as soon as count is exhausted */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1000 > 3 || @complex */
+
+static void /* Generic kernel for any nop: per element, multiplies the nop inputs and adds the product to the output at dataptr[nop] */
+byte_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0 /* always true: the non-complex branch is compiled and the complex #else branch is preprocessed out */
+        npy_byte temp = (*(npy_byte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_byte *)dataptr[i]);
+        }
+        *(npy_byte *)dataptr[nop] = (temp +
+                                           (*(npy_byte *)dataptr[i])); /* i == nop after the loop above (for nop >= 1), so this reads the output element */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_byte); /* advance every operand pointer, inputs and output, by one element; the caller's dataptr array is modified */
+        }
+#else /* complex */
+#  if 1000 <= 3
+#    define _SUMPROD_NOP 1000
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_byte re, im, tmp;
+        int i;
+        re = ((npy_byte *)dataptr[0])[0];
+        im = ((npy_byte *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_byte *)dataptr[i])[0] -
+                  im * ((npy_byte *)dataptr[i])[1];
+            im = re * ((npy_byte *)dataptr[i])[1] +
+                 im * ((npy_byte *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_byte *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_byte *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_byte *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_byte);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+
+static NPY_GCC_OPT_3 void /* One-operand reduction: adds byte_sum_of_arr(dataptr[0], count) to the single output element at dataptr[1] */
+byte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0 /* always true: the non-complex branch is compiled and the complex #else branch is preprocessed out */
+    npy_byte *data = (npy_byte *)dataptr[0];
+    npy_byte accum = byte_sum_of_arr(data, count); /* helper is defined outside this view; presumably returns the sum of count elements - confirm */
+    *((npy_byte *)dataptr[1]) = (accum + (*((npy_byte *)dataptr[1])));
+#else
+    npy_byte accum_re = 0, accum_im = 0;
+    npy_byte *data0 = (npy_byte *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_byte re01 = data0[0] + data0[2];
+        const npy_byte re23 = data0[4] + data0[6];
+        const npy_byte im13 = data0[1] + data0[3];
+        const npy_byte im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_byte *)dataptr[1])[0] += accum_re;
+    ((npy_byte *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1000 == 1 */
+
+static void /* Strided reduction for any nop: sums the product of the nop inputs over count steps and adds the total to the output element at dataptr[nop] */
+byte_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_byte accum_re = 0, accum_im = 0;
+#else
+    npy_byte accum = 0;
+#endif
+
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("byte_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        accum += (*(npy_byte *)data0);
+        data0 += stride0;
+#  elif 1000 == 2
+        accum += (*(npy_byte *)data0) *
+                 (*(npy_byte *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1000 == 3
+        accum += (*(npy_byte *)data0) *
+                 (*(npy_byte *)data1) *
+                 (*(npy_byte *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else /* the literal 1000 matches none of 1, 2 or 3, so this generic nop-driven branch is the one compiled */
+        npy_byte temp = (*(npy_byte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_byte *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) { /* only the nop input pointers advance; the output pointer dataptr[nop] stays fixed */
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        accum_re += ((npy_byte *)data0)[0];
+        accum_im += ((npy_byte *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_byte re, im, tmp;
+        int i;
+        re = ((npy_byte *)dataptr[0])[0];
+        im = ((npy_byte *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_byte *)dataptr[i])[0] -
+                  im * ((npy_byte *)dataptr[i])[1];
+            im = re * ((npy_byte *)dataptr[i])[1] +
+                 im * ((npy_byte *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0
+#  if 1000 <= 3
+    ((npy_byte *)dataptr[1000])[0] += accum_re;
+    ((npy_byte *)dataptr[1000])[1] += accum_im;
+#  else
+    ((npy_byte *)dataptr[nop])[0] += accum_re;
+    ((npy_byte *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1000 <= 3
+    *((npy_byte *)dataptr[1000]) = (accum +
+                                    (*((npy_byte *)dataptr[1000])));
+#  else
+    *((npy_byte *)dataptr[nop]) = (accum +
+                                    (*((npy_byte *)dataptr[nop]))); /* single write-back after the loop: the accumulated total is added to the existing output value */
+#  endif
+#endif
+
+}
+
+
+
+
+#line 74
+
+#if !0
+static NPY_GCC_OPT_3 npy_short short_sum_of_arr(npy_short *data, npy_intp count) /* Returns the sum of the count npy_short values starting at data, accumulated in an npy_short */
+{
+    npy_short accum = 0;
+#if 0 // NPYV check for npy_short: the condition is the literal 0, so only the scalar #else branch is compiled
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data);
+    const int vstep = npyv_nlanes_s16;
+    npyv_s16 v_accum = npyv_zero_s16();
+    const npy_intp vstepx4 = vstep * 4;
+
+    #line 91
+    if(is_aligned) {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_s16 a0 = npyv_loada_s16(data + vstep * 0);
+            
+#line 96
+            npyv_s16 a1 = npyv_loada_s16(data + vstep * 1);
+            
+#line 96
+            npyv_s16 a2 = npyv_loada_s16(data + vstep * 2);
+            
+#line 96
+            npyv_s16 a3 = npyv_loada_s16(data + vstep * 3);
+            
+            npyv_s16 a01   = npyv_add_s16(a0, a1);
+            npyv_s16 a23   = npyv_add_s16(a2, a3);
+            npyv_s16 a0123 = npyv_add_s16(a01, a23);
+                     v_accum = npyv_add_s16(a0123, v_accum);
+        }
+    }
+    
+#line 91
+    else {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_s16 a0 = npyv_load_s16(data + vstep * 0);
+            
+#line 96
+            npyv_s16 a1 = npyv_load_s16(data + vstep * 1);
+            
+#line 96
+            npyv_s16 a2 = npyv_load_s16(data + vstep * 2);
+            
+#line 96
+            npyv_s16 a3 = npyv_load_s16(data + vstep * 3);
+            
+            npyv_s16 a01   = npyv_add_s16(a0, a1);
+            npyv_s16 a23   = npyv_add_s16(a2, a3);
+            npyv_s16 a0123 = npyv_add_s16(a01, a23);
+                     v_accum = npyv_add_s16(a0123, v_accum);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep) {
+        npyv_s16 a = npyv_load_tillz_s16(data, count);
+        v_accum = npyv_add_s16(a, v_accum);
+    }
+    accum = npyv_sum_s16(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data += 4) { // 4 elements per iteration; the strict '>' leaves a final group of exactly 4 to the scalar loop below
+        const npy_short a01 = (*data) + (data[1]);
+        const npy_short a23 = (data[2]) + (data[3]);
+        accum +=  a01 + a23;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data++) { // remaining elements (or all of them when unrolling is disabled), one at a time
+        accum += (*data);
+    }
+#endif // NPYV check for npy_short
+    return accum;
+}
+#endif
+
+#line 131
+static void /* Strided one-operand kernel: adds each input element to the matching output element, stepping by strides[0] and strides[1] bytes, count times */
+short_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data_out = dataptr[1];
+    npy_intp stride_out = strides[1];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_one (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 1 == 1 /* always true in this instantiation: only this branch of the chain is compiled */
+        *(npy_short *)data_out = ((*(npy_short *)data0) +
+                                         (*(npy_short *)data_out));
+        data0 += stride0; /* char pointers, so the strides are applied as byte offsets */
+        data_out += stride_out;
+#  elif 1 == 2
+        *(npy_short *)data_out = ((*(npy_short *)data0) *
+                                         (*(npy_short *)data1) +
+                                         (*(npy_short *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1 == 3
+        *(npy_short *)data_out = ((*(npy_short *)data0) *
+                                         (*(npy_short *)data1) *
+                                         (*(npy_short *)data2) +
+                                         (*(npy_short *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_short temp = (*(npy_short *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_short *)dataptr[i]);
+        }
+        *(npy_short *)dataptr[nop] = (temp +
+                                           (*(npy_short *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        ((npy_short *)data_out)[0] = ((npy_short *)data0)[0] +
+                                         ((npy_short *)data_out)[0];
+        ((npy_short *)data_out)[1] = ((npy_short *)data0)[1] +
+                                         ((npy_short *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_short re, im, tmp;
+        int i;
+        re = ((npy_short *)dataptr[0])[0];
+        im = ((npy_short *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_short *)dataptr[i])[0] -
+                  im * ((npy_short *)dataptr[i])[1];
+            im = re * ((npy_short *)dataptr[i])[1] +
+                 im * ((npy_short *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_short *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_short *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_short *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_short *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1 == 1
+/* Contiguous one-operand kernel: performs data_out[i] += data0[i] for count consecutive npy_short elements. */
+static void
+short_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *data0 = (npy_short *)dataptr[0];
+    npy_short *data_out = (npy_short *)dataptr[1];
+    /* nop and strides are not read in this function. */
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+/* Tail handling: each case falls through to the next lower one, so entering at case k+1 updates indices k down to 0 and then returns. */
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0 /* always true, so the real-valued statement is the one compiled; the #else arm is the complex form */
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_short *)data_out + 2*6)[0] =
+                                    ((npy_short *)data0 + 2*6)[0] +
+                                    ((npy_short *)data_out + 2*6)[0];
+            ((npy_short *)data_out + 2*6)[1] =
+                                    ((npy_short *)data0 + 2*6)[1] +
+                                    ((npy_short *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_short *)data_out + 2*5)[0] =
+                                    ((npy_short *)data0 + 2*5)[0] +
+                                    ((npy_short *)data_out + 2*5)[0];
+            ((npy_short *)data_out + 2*5)[1] =
+                                    ((npy_short *)data0 + 2*5)[1] +
+                                    ((npy_short *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_short *)data_out + 2*4)[0] =
+                                    ((npy_short *)data0 + 2*4)[0] +
+                                    ((npy_short *)data_out + 2*4)[0];
+            ((npy_short *)data_out + 2*4)[1] =
+                                    ((npy_short *)data0 + 2*4)[1] +
+                                    ((npy_short *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_short *)data_out + 2*3)[0] =
+                                    ((npy_short *)data0 + 2*3)[0] +
+                                    ((npy_short *)data_out + 2*3)[0];
+            ((npy_short *)data_out + 2*3)[1] =
+                                    ((npy_short *)data0 + 2*3)[1] +
+                                    ((npy_short *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_short *)data_out + 2*2)[0] =
+                                    ((npy_short *)data0 + 2*2)[0] +
+                                    ((npy_short *)data_out + 2*2)[0];
+            ((npy_short *)data_out + 2*2)[1] =
+                                    ((npy_short *)data0 + 2*2)[1] +
+                                    ((npy_short *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_short *)data_out + 2*1)[0] =
+                                    ((npy_short *)data0 + 2*1)[0] +
+                                    ((npy_short *)data_out + 2*1)[0];
+            ((npy_short *)data_out + 2*1)[1] =
+                                    ((npy_short *)data0 + 2*1)[1] +
+                                    ((npy_short *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_short *)data_out + 2*0)[0] =
+                                    ((npy_short *)data0 + 2*0)[0] +
+                                    ((npy_short *)data_out + 2*0)[0];
+            ((npy_short *)data_out + 2*0)[1] =
+                                    ((npy_short *)data0 + 2*0)[1] +
+                                    ((npy_short *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+    /* Reached only when no case above matched, i.e. count is not in 0..7. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_short *)data_out + 2*0)[0] =
+                                ((npy_short *)data0 + 2*0)[0] +
+                                ((npy_short *)data_out + 2*0)[0];
+        ((npy_short *)data_out + 2*0)[1] =
+                                ((npy_short *)data0 + 2*0)[1] +
+                                ((npy_short *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_short *)data_out + 2*1)[0] =
+                                ((npy_short *)data0 + 2*1)[0] +
+                                ((npy_short *)data_out + 2*1)[0];
+        ((npy_short *)data_out + 2*1)[1] =
+                                ((npy_short *)data0 + 2*1)[1] +
+                                ((npy_short *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_short *)data_out + 2*2)[0] =
+                                ((npy_short *)data0 + 2*2)[0] +
+                                ((npy_short *)data_out + 2*2)[0];
+        ((npy_short *)data_out + 2*2)[1] =
+                                ((npy_short *)data0 + 2*2)[1] +
+                                ((npy_short *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_short *)data_out + 2*3)[0] =
+                                ((npy_short *)data0 + 2*3)[0] +
+                                ((npy_short *)data_out + 2*3)[0];
+        ((npy_short *)data_out + 2*3)[1] =
+                                ((npy_short *)data0 + 2*3)[1] +
+                                ((npy_short *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_short *)data_out + 2*4)[0] =
+                                ((npy_short *)data0 + 2*4)[0] +
+                                ((npy_short *)data_out + 2*4)[0];
+        ((npy_short *)data_out + 2*4)[1] =
+                                ((npy_short *)data0 + 2*4)[1] +
+                                ((npy_short *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_short *)data_out + 2*5)[0] =
+                                ((npy_short *)data0 + 2*5)[0] +
+                                ((npy_short *)data_out + 2*5)[0];
+        ((npy_short *)data_out + 2*5)[1] =
+                                ((npy_short *)data0 + 2*5)[1] +
+                                ((npy_short *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_short *)data_out + 2*6)[0] =
+                                ((npy_short *)data0 + 2*6)[0] +
+                                ((npy_short *)data_out + 2*6)[0];
+        ((npy_short *)data_out + 2*6)[1] =
+                                ((npy_short *)data0 + 2*6)[1] +
+                                ((npy_short *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_short *)data_out + 2*7)[0] =
+                                ((npy_short *)data0 + 2*7)[0] +
+                                ((npy_short *)data_out + 2*7)[0];
+        ((npy_short *)data_out + 2*7)[1] =
+                                ((npy_short *)data0 + 2*7)[1] +
+                                ((npy_short *)data_out + 2*7)[1];
+#endif
+        /* Move both pointers past the eight elements just handled. */
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: jump back to the switch for the remaining elements */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1 == 2 && !0
+
+// Multiply-add helper: data_out[i] = scalar * data[i] + data_out[i] for i in [0, count)
+static NPY_GCC_OPT_3 void
+short_sum_of_products_muladd(npy_short *data, npy_short *data_out, npy_short scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_short (0 here, so the vector branch up to the #else is not compiled)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s16;
+    const npyv_s16 v_scalar = npyv_setall_s16(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s16 b0 = npyv_loada_s16(data + vstep * 0);
+            npyv_s16 c0 = npyv_loada_s16(data_out + vstep * 0);
+            
+#line 312
+            npyv_s16 b1 = npyv_loada_s16(data + vstep * 1);
+            npyv_s16 c1 = npyv_loada_s16(data_out + vstep * 1);
+            
+#line 312
+            npyv_s16 b2 = npyv_loada_s16(data + vstep * 2);
+            npyv_s16 c2 = npyv_loada_s16(data_out + vstep * 2);
+            
+#line 312
+            npyv_s16 b3 = npyv_loada_s16(data + vstep * 3);
+            npyv_s16 c3 = npyv_loada_s16(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s16 abc0 = npyv_muladd_s16(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s16 abc1 = npyv_muladd_s16(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s16 abc2 = npyv_muladd_s16(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s16 abc3 = npyv_muladd_s16(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_s16(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_s16(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_s16(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_s16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s16 b0 = npyv_load_s16(data + vstep * 0);
+            npyv_s16 c0 = npyv_load_s16(data_out + vstep * 0);
+            
+#line 312
+            npyv_s16 b1 = npyv_load_s16(data + vstep * 1);
+            npyv_s16 c1 = npyv_load_s16(data_out + vstep * 1);
+            
+#line 312
+            npyv_s16 b2 = npyv_load_s16(data + vstep * 2);
+            npyv_s16 c2 = npyv_load_s16(data_out + vstep * 2);
+            
+#line 312
+            npyv_s16 b3 = npyv_load_s16(data + vstep * 3);
+            npyv_s16 c3 = npyv_load_s16(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s16 abc0 = npyv_muladd_s16(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s16 abc1 = npyv_muladd_s16(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s16 abc2 = npyv_muladd_s16(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s16 abc3 = npyv_muladd_s16(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_s16(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_s16(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_s16(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_s16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_s16 a = npyv_load_tillz_s16(data, count);
+        npyv_s16 b = npyv_load_tillz_s16(data_out, count);
+        npyv_store_till_s16(data_out, count, npyv_muladd_s16(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION /* 4-way unrolled pass; skipped when NPY_DISABLE_OPTIMIZATION is defined */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_short b0 = (data[0]);
+        const npy_short c0 = (data_out[0]);
+        
+#line 340
+        const npy_short b1 = (data[1]);
+        const npy_short c1 = (data_out[1]);
+        
+#line 340
+        const npy_short b2 = (data[2]);
+        const npy_short c2 = (data_out[2]);
+        
+#line 340
+        const npy_short b3 = (data[3]);
+        const npy_short c3 = (data_out[3]);
+        /* All four results are computed before any store to data_out. */
+        #line 346
+        const npy_short abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_short abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_short abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_short abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* remaining elements, one per iteration */
+        const npy_short b = (*data);
+        const npy_short c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_short
+}
+/* Contiguous two-operand kernel: data_out[i] += data0[i] * data1[i] for i in [0, count), with the pointers taken from dataptr[0], dataptr[1] and dataptr[2]. */
+static void
+short_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *data0 = (npy_short *)dataptr[0];
+    npy_short *data1 = (npy_short *)dataptr[1];
+    npy_short *data_out = (npy_short *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_short (0 here, so the vector branch up to the #else is not compiled)
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s16;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s16 a0 = npyv_loada_s16(data0 + vstep * 0);
+            npyv_s16 b0 = npyv_loada_s16(data1 + vstep * 0);
+            npyv_s16 c0 = npyv_loada_s16(data_out + vstep * 0);
+            
+#line 390
+            npyv_s16 a1 = npyv_loada_s16(data0 + vstep * 1);
+            npyv_s16 b1 = npyv_loada_s16(data1 + vstep * 1);
+            npyv_s16 c1 = npyv_loada_s16(data_out + vstep * 1);
+            
+#line 390
+            npyv_s16 a2 = npyv_loada_s16(data0 + vstep * 2);
+            npyv_s16 b2 = npyv_loada_s16(data1 + vstep * 2);
+            npyv_s16 c2 = npyv_loada_s16(data_out + vstep * 2);
+            
+#line 390
+            npyv_s16 a3 = npyv_loada_s16(data0 + vstep * 3);
+            npyv_s16 b3 = npyv_loada_s16(data1 + vstep * 3);
+            npyv_s16 c3 = npyv_loada_s16(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s16 abc0 = npyv_muladd_s16(a0, b0, c0);
+            
+#line 397
+            npyv_s16 abc1 = npyv_muladd_s16(a1, b1, c1);
+            
+#line 397
+            npyv_s16 abc2 = npyv_muladd_s16(a2, b2, c2);
+            
+#line 397
+            npyv_s16 abc3 = npyv_muladd_s16(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_s16(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_s16(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_s16(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_s16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s16 a0 = npyv_load_s16(data0 + vstep * 0);
+            npyv_s16 b0 = npyv_load_s16(data1 + vstep * 0);
+            npyv_s16 c0 = npyv_load_s16(data_out + vstep * 0);
+            
+#line 390
+            npyv_s16 a1 = npyv_load_s16(data0 + vstep * 1);
+            npyv_s16 b1 = npyv_load_s16(data1 + vstep * 1);
+            npyv_s16 c1 = npyv_load_s16(data_out + vstep * 1);
+            
+#line 390
+            npyv_s16 a2 = npyv_load_s16(data0 + vstep * 2);
+            npyv_s16 b2 = npyv_load_s16(data1 + vstep * 2);
+            npyv_s16 c2 = npyv_load_s16(data_out + vstep * 2);
+            
+#line 390
+            npyv_s16 a3 = npyv_load_s16(data0 + vstep * 3);
+            npyv_s16 b3 = npyv_load_s16(data1 + vstep * 3);
+            npyv_s16 c3 = npyv_load_s16(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s16 abc0 = npyv_muladd_s16(a0, b0, c0);
+            
+#line 397
+            npyv_s16 abc1 = npyv_muladd_s16(a1, b1, c1);
+            
+#line 397
+            npyv_s16 abc2 = npyv_muladd_s16(a2, b2, c2);
+            
+#line 397
+            npyv_s16 abc3 = npyv_muladd_s16(a3, b3, c3);
+            
+            #line 402
+            npyv_store_s16(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_s16(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_s16(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_s16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_s16 a = npyv_load_tillz_s16(data0, count);
+        npyv_s16 b = npyv_load_tillz_s16(data1, count);
+        npyv_s16 c = npyv_load_tillz_s16(data_out, count);
+        npyv_store_till_s16(data_out, count, npyv_muladd_s16(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION /* 4-way unrolled pass; skipped when NPY_DISABLE_OPTIMIZATION is defined */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_short a0 = (data0[0]);
+        const npy_short b0 = (data1[0]);
+        const npy_short c0 = (data_out[0]);
+        
+#line 420
+        const npy_short a1 = (data0[1]);
+        const npy_short b1 = (data1[1]);
+        const npy_short c1 = (data_out[1]);
+        
+#line 420
+        const npy_short a2 = (data0[2]);
+        const npy_short b2 = (data1[2]);
+        const npy_short c2 = (data_out[2]);
+        
+#line 420
+        const npy_short a3 = (data0[3]);
+        const npy_short b3 = (data1[3]);
+        const npy_short c3 = (data_out[3]);
+        /* All four results are computed before any store to data_out. */
+        #line 427
+        const npy_short abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_short abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_short abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_short abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* remaining elements, one per iteration */
+        const npy_short a = (*data0);
+        const npy_short b = (*data1);
+        const npy_short c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_short
+
+}
+
+/* Some extra specializations for the two operand case. This one: data_out[i] += value0 * data1[i], with operand 0 read once as a scalar. */
+static void
+short_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short value0 = (*(npy_short *)dataptr[0]);
+    npy_short *data1 = (npy_short *)dataptr[1];
+    npy_short *data_out = (npy_short *)dataptr[2];
+    /* value0 is loaded before the loop; the element-wise work is delegated to short_sum_of_products_muladd. */
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    short_sum_of_products_muladd(data1, data_out, value0, count);
+    
+}
+/* Two-operand specialization: data_out[i] += data0[i] * value1, with operand 1 read once as a scalar from dataptr[1]. */
+static void
+short_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short value1 = (*(npy_short *)dataptr[1]);
+    npy_short *data0 = (npy_short *)dataptr[0];
+    npy_short *data_out = (npy_short *)dataptr[2];
+    /* value1 is loaded before the loop; the element-wise work is delegated to short_sum_of_products_muladd. */
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    short_sum_of_products_muladd(data0, data_out, value1, count);
+}
+/* Two-operand kernel with a single output element: accumulates data0[i] * data1[i] over count elements and adds the total to *dataptr[2]. */
+static NPY_GCC_OPT_3 void
+short_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *data0 = (npy_short *)dataptr[0];
+    npy_short *data1 = (npy_short *)dataptr[1];
+    npy_short accum = 0;
+    /* The running total is an npy_short, the same type the output element is accessed as at the end. */
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_short (0 here, so the vector branch up to the #else is not compiled)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_s16;
+    npyv_s16 v_accum = npyv_zero_s16();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s16 a0 = npyv_loada_s16(data0 + vstep * 0);
+            npyv_s16 b0 = npyv_loada_s16(data1 + vstep * 0);
+            
+#line 501
+            npyv_s16 a1 = npyv_loada_s16(data0 + vstep * 1);
+            npyv_s16 b1 = npyv_loada_s16(data1 + vstep * 1);
+            
+#line 501
+            npyv_s16 a2 = npyv_loada_s16(data0 + vstep * 2);
+            npyv_s16 b2 = npyv_loada_s16(data1 + vstep * 2);
+            
+#line 501
+            npyv_s16 a3 = npyv_loada_s16(data0 + vstep * 3);
+            npyv_s16 b3 = npyv_loada_s16(data1 + vstep * 3);
+            
+            npyv_s16 ab3 = npyv_muladd_s16(a3, b3, v_accum);
+            npyv_s16 ab2 = npyv_muladd_s16(a2, b2, ab3);
+            npyv_s16 ab1 = npyv_muladd_s16(a1, b1, ab2);
+                   v_accum = npyv_muladd_s16(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s16 a0 = npyv_load_s16(data0 + vstep * 0);
+            npyv_s16 b0 = npyv_load_s16(data1 + vstep * 0);
+            
+#line 501
+            npyv_s16 a1 = npyv_load_s16(data0 + vstep * 1);
+            npyv_s16 b1 = npyv_load_s16(data1 + vstep * 1);
+            
+#line 501
+            npyv_s16 a2 = npyv_load_s16(data0 + vstep * 2);
+            npyv_s16 b2 = npyv_load_s16(data1 + vstep * 2);
+            
+#line 501
+            npyv_s16 a3 = npyv_load_s16(data0 + vstep * 3);
+            npyv_s16 b3 = npyv_load_s16(data1 + vstep * 3);
+            
+            npyv_s16 ab3 = npyv_muladd_s16(a3, b3, v_accum);
+            npyv_s16 ab2 = npyv_muladd_s16(a2, b2, ab3);
+            npyv_s16 ab1 = npyv_muladd_s16(a1, b1, ab2);
+                   v_accum = npyv_muladd_s16(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_s16 a = npyv_load_tillz_s16(data0, count);
+        npyv_s16 b = npyv_load_tillz_s16(data1, count);
+        v_accum = npyv_muladd_s16(a, b, v_accum);
+    }
+    accum = npyv_sum_s16(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION /* 4-way unrolled pass; skipped when NPY_DISABLE_OPTIMIZATION is defined */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_short ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_short ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_short ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_short ab3 = (data0[3]) * (data1[3]);
+        /* Each product has already been stored as an npy_short before the four are added to accum. */
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* remaining elements, one per iteration */
+        const npy_short a = (*data0);
+        const npy_short b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_short
+    *(npy_short *)dataptr[2] = ((*(npy_short *)dataptr[2]) + accum);
+}
+/* Two-operand specialization: adds value0 * short_sum_of_arr(data1, count) to *dataptr[2]; short_sum_of_arr is defined outside this section (presumably the sum of count elements - confirm). */
+static NPY_GCC_OPT_3 void
+short_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *data1 = (npy_short *)dataptr[1];
+    npy_short value0 = (*(npy_short *)dataptr[0]);
+    npy_short accum = short_sum_of_arr(data1, count);
+    *(npy_short *)dataptr[2] = ((*(npy_short *)dataptr[2]) + value0 * accum);
+}
+/* Two-operand specialization: adds value1 * short_sum_of_arr(data0, count) to *dataptr[2]; short_sum_of_arr is defined outside this section (presumably the sum of count elements - confirm). */
+static NPY_GCC_OPT_3 void
+short_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *data0 = (npy_short *)dataptr[0];
+    npy_short value1 = (*(npy_short *)dataptr[1]);
+    npy_short accum = short_sum_of_arr(data0, count);
+    *(npy_short *)dataptr[2] = ((*(npy_short *)dataptr[2]) + value1 * accum);
+}
+
+#elif 1 == 3 && !0
+/* Contiguous three-operand kernel: data_out[i] += data0[i] * data1[i] * data2[i] for i in [0, count), with pointers taken from dataptr[0..3]. */
+static void
+short_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *data0 = (npy_short *)dataptr[0];
+    npy_short *data1 = (npy_short *)dataptr[1];
+    npy_short *data2 = (npy_short *)dataptr[2];
+    npy_short *data_out = (npy_short *)dataptr[3];
+    /* nop and strides are not read in this function. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+        /* Move all four pointers past the eight elements just handled. */
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    /* Fewer than 8 elements remain: each step below returns once count has reached 0, otherwise it handles the next index. */
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* nop (1 in this expansion) > 3 || complex type */
+/* Generic contiguous variant selected by the final #else of the surrounding #if/#elif chain; it loops over nop operands instead of using fixed pointers. */
+static void
+short_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+    /* Per element: multiply operands 0..nop-1 together, add the product into operand nop, then advance every pointer. */
+    while (count--) {
+#if !0
+        npy_short temp = (*(npy_short *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_short *)dataptr[i]);
+        }
+        *(npy_short *)dataptr[nop] = (temp +
+                                           (*(npy_short *)dataptr[i])); /* i equals nop here when nop >= 1 */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_short);
+        }
+#else /* complex */
+#  if 1 <= 3
+#    define _SUMPROD_NOP 1
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_short re, im, tmp;
+        int i;
+        re = ((npy_short *)dataptr[0])[0];
+        im = ((npy_short *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_short *)dataptr[i])[0] -
+                  im * ((npy_short *)dataptr[i])[1];
+            im = re * ((npy_short *)dataptr[i])[1] +
+                 im * ((npy_short *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_short *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_short *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_short *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_short *)dataptr[_SUMPROD_NOP])[1];
+        /* Advance the operand pointers and the output pointer (indices 0.._SUMPROD_NOP). */
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_short);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various nop (1 in this expansion) */
+
+#if 1 == 1
+/* Contiguous one-operand reduction: adds short_sum_of_arr(operand 0, count) to the single output element *dataptr[1]. */
+static NPY_GCC_OPT_3 void
+short_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0 /* real-valued arm: the one compiled for this expansion */
+    npy_short *data = (npy_short *)dataptr[0];
+    npy_short accum = short_sum_of_arr(data, count); /* helper defined outside this view; presumably sums count elements - confirm */
+    *((npy_short *)dataptr[1]) = (accum + (*((npy_short *)dataptr[1])));
+#else /* complex arm (disabled): separate running sums for [0] and [1] components */
+    npy_short accum_re = 0, accum_im = 0;
+    npy_short *data0 = (npy_short *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) { /* 4 pairs (8 scalars) per pass; exits with 1..4 pairs left for the loop below */
+        const npy_short re01 = data0[0] + data0[2];
+        const npy_short re23 = data0[4] + data0[6];
+        const npy_short im13 = data0[1] + data0[3];
+        const npy_short im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) { /* one pair per pass for whatever remains */
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_short *)dataptr[1])[0] += accum_re;
+    ((npy_short *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1 == 1 */
+/* Strided one-operand reduction: sums count elements of operand 0 and adds the total to *dataptr[1], which is never advanced. */
+static void
+short_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0 /* complex variant of the accumulator; disabled */
+    npy_short accum_re = 0, accum_im = 0;
+#else
+    npy_short accum = 0;
+#endif
+
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0]; /* added to the char* cursor, so it is a byte step */
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0 /* real-valued arm */
+#  if 1 == 1 /* the arm taken for a single operand */
+        accum += (*(npy_short *)data0);
+        data0 += stride0;
+#  elif 1 == 2
+        accum += (*(npy_short *)data0) *
+                 (*(npy_short *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1 == 3
+        accum += (*(npy_short *)data0) *
+                 (*(npy_short *)data1) *
+                 (*(npy_short *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_short temp = (*(npy_short *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_short *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) { /* < nop: inputs only, the output stays put */
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        accum_re += ((npy_short *)data0)[0];
+        accum_im += ((npy_short *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_short re, im, tmp;
+        int i;
+        re = ((npy_short *)dataptr[0])[0];
+        im = ((npy_short *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_short *)dataptr[i])[0] -
+                  im * ((npy_short *)dataptr[i])[1];
+            im = re * ((npy_short *)dataptr[i])[1] +
+                 im * ((npy_short *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* Fold the accumulated value into the output element exactly once, after the loop. */
+#if 0
+#  if 1 <= 3
+    ((npy_short *)dataptr[1])[0] += accum_re;
+    ((npy_short *)dataptr[1])[1] += accum_im;
+#  else
+    ((npy_short *)dataptr[nop])[0] += accum_re;
+    ((npy_short *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1 <= 3 /* output slot follows the single input: dataptr[1] */
+    *((npy_short *)dataptr[1]) = (accum +
+                                    (*((npy_short *)dataptr[1])));
+#  else
+    *((npy_short *)dataptr[nop]) = (accum +
+                                    (*((npy_short *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+/* Strided two-operand kernel: for each of count steps, *out += (*in0) * (*in1), advancing every pointer by its own stride. */
+#line 131
+static void
+short_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data_out = dataptr[2]; /* output slot follows the two inputs */
+    npy_intp stride_out = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_two (%d)\n", (int)count);
+
+    while (count--) {
+#if !0 /* real-valued arm */
+#  if 2 == 1
+        *(npy_short *)data_out = ((*(npy_short *)data0) +
+                                         (*(npy_short *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 2 == 2 /* the arm taken for two operands */
+        *(npy_short *)data_out = ((*(npy_short *)data0) *
+                                         (*(npy_short *)data1) +
+                                         (*(npy_short *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 2 == 3
+        *(npy_short *)data_out = ((*(npy_short *)data0) *
+                                         (*(npy_short *)data1) *
+                                         (*(npy_short *)data2) +
+                                         (*(npy_short *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_short temp = (*(npy_short *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_short *)dataptr[i]);
+        }
+        *(npy_short *)dataptr[nop] = (temp +
+                                           (*(npy_short *)dataptr[i])); /* i == nop after the loop */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        ((npy_short *)data_out)[0] = ((npy_short *)data0)[0] +
+                                         ((npy_short *)data_out)[0];
+        ((npy_short *)data_out)[1] = ((npy_short *)data0)[1] +
+                                         ((npy_short *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_short re, im, tmp;
+        int i;
+        re = ((npy_short *)dataptr[0])[0];
+        im = ((npy_short *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) { /* (re, im) *= operand i, as a complex product */
+            tmp = re * ((npy_short *)dataptr[i])[0] -
+                  im * ((npy_short *)dataptr[i])[1];
+            im = re * ((npy_short *)dataptr[i])[1] +
+                 im * ((npy_short *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_short *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_short *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_short *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_short *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 2 == 1
+/* Contiguous one-operand accumulate, data_out[k] += data0[k]; guarded by the "#if 2 == 1" just above, so not compiled in this expansion. */
+static void
+short_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *data0 = (npy_short *)dataptr[0];
+    npy_short *data_out = (npy_short *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop: /* re-entered via goto once fewer than 8 elements remain */
+    switch (count) { /* 0..7 are finished here and return; any other count falls out to the unrolled loop */
+#line 246
+        case 6+1: /* count == 7: handle index 6, then fall through to the lower indices */
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_short *)data_out + 2*6)[0] =
+                                    ((npy_short *)data0 + 2*6)[0] +
+                                    ((npy_short *)data_out + 2*6)[0];
+            ((npy_short *)data_out + 2*6)[1] =
+                                    ((npy_short *)data0 + 2*6)[1] +
+                                    ((npy_short *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1: /* no break: falls through */
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_short *)data_out + 2*5)[0] =
+                                    ((npy_short *)data0 + 2*5)[0] +
+                                    ((npy_short *)data_out + 2*5)[0];
+            ((npy_short *)data_out + 2*5)[1] =
+                                    ((npy_short *)data0 + 2*5)[1] +
+                                    ((npy_short *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1: /* no break: falls through */
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_short *)data_out + 2*4)[0] =
+                                    ((npy_short *)data0 + 2*4)[0] +
+                                    ((npy_short *)data_out + 2*4)[0];
+            ((npy_short *)data_out + 2*4)[1] =
+                                    ((npy_short *)data0 + 2*4)[1] +
+                                    ((npy_short *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1: /* no break: falls through */
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_short *)data_out + 2*3)[0] =
+                                    ((npy_short *)data0 + 2*3)[0] +
+                                    ((npy_short *)data_out + 2*3)[0];
+            ((npy_short *)data_out + 2*3)[1] =
+                                    ((npy_short *)data0 + 2*3)[1] +
+                                    ((npy_short *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1: /* no break: falls through */
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_short *)data_out + 2*2)[0] =
+                                    ((npy_short *)data0 + 2*2)[0] +
+                                    ((npy_short *)data_out + 2*2)[0];
+            ((npy_short *)data_out + 2*2)[1] =
+                                    ((npy_short *)data0 + 2*2)[1] +
+                                    ((npy_short *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1: /* no break: falls through */
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_short *)data_out + 2*1)[0] =
+                                    ((npy_short *)data0 + 2*1)[0] +
+                                    ((npy_short *)data_out + 2*1)[0];
+            ((npy_short *)data_out + 2*1)[1] =
+                                    ((npy_short *)data0 + 2*1)[1] +
+                                    ((npy_short *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1: /* no break: falls through to the return */
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_short *)data_out + 2*0)[0] =
+                                    ((npy_short *)data0 + 2*0)[0] +
+                                    ((npy_short *)data_out + 2*0)[0];
+            ((npy_short *)data_out + 2*0)[1] =
+                                    ((npy_short *)data0 + 2*0)[1] +
+                                    ((npy_short *)data_out + 2*0)[1];
+#endif
+
+        case 0: /* nothing left: the only exit from the function */
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_short *)data_out + 2*0)[0] =
+                                ((npy_short *)data0 + 2*0)[0] +
+                                ((npy_short *)data_out + 2*0)[0];
+        ((npy_short *)data_out + 2*0)[1] =
+                                ((npy_short *)data0 + 2*0)[1] +
+                                ((npy_short *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_short *)data_out + 2*1)[0] =
+                                ((npy_short *)data0 + 2*1)[0] +
+                                ((npy_short *)data_out + 2*1)[0];
+        ((npy_short *)data_out + 2*1)[1] =
+                                ((npy_short *)data0 + 2*1)[1] +
+                                ((npy_short *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_short *)data_out + 2*2)[0] =
+                                ((npy_short *)data0 + 2*2)[0] +
+                                ((npy_short *)data_out + 2*2)[0];
+        ((npy_short *)data_out + 2*2)[1] =
+                                ((npy_short *)data0 + 2*2)[1] +
+                                ((npy_short *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_short *)data_out + 2*3)[0] =
+                                ((npy_short *)data0 + 2*3)[0] +
+                                ((npy_short *)data_out + 2*3)[0];
+        ((npy_short *)data_out + 2*3)[1] =
+                                ((npy_short *)data0 + 2*3)[1] +
+                                ((npy_short *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_short *)data_out + 2*4)[0] =
+                                ((npy_short *)data0 + 2*4)[0] +
+                                ((npy_short *)data_out + 2*4)[0];
+        ((npy_short *)data_out + 2*4)[1] =
+                                ((npy_short *)data0 + 2*4)[1] +
+                                ((npy_short *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_short *)data_out + 2*5)[0] =
+                                ((npy_short *)data0 + 2*5)[0] +
+                                ((npy_short *)data_out + 2*5)[0];
+        ((npy_short *)data_out + 2*5)[1] =
+                                ((npy_short *)data0 + 2*5)[1] +
+                                ((npy_short *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_short *)data_out + 2*6)[0] =
+                                ((npy_short *)data0 + 2*6)[0] +
+                                ((npy_short *)data_out + 2*6)[0];
+        ((npy_short *)data_out + 2*6)[1] =
+                                ((npy_short *)data0 + 2*6)[1] +
+                                ((npy_short *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_short *)data_out + 2*7)[0] =
+                                ((npy_short *)data0 + 2*7)[0] +
+                                ((npy_short *)data_out + 2*7)[0];
+        ((npy_short *)data_out + 2*7)[1] =
+                                ((npy_short *)data0 + 2*7)[1] +
+                                ((npy_short *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8; /* slide both windows past the 8 elements just handled */
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop; /* count is now 0..7, so the switch handles it and returns */
+}
+
+#elif 2 == 2 && !0
+
+/* Multiply-add helper: data_out[i] = scalar * data[i] + data_out[i] for each of the count elements. */
+static NPY_GCC_OPT_3 void
+short_sum_of_products_muladd(npy_short *data, npy_short *data_out, npy_short scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_short
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s16;
+    const npyv_s16 v_scalar = npyv_setall_s16(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) { /* four vectors per pass */
+            #line 312
+            npyv_s16 b0 = npyv_loada_s16(data + vstep * 0);
+            npyv_s16 c0 = npyv_loada_s16(data_out + vstep * 0);
+            
+#line 312
+            npyv_s16 b1 = npyv_loada_s16(data + vstep * 1);
+            npyv_s16 c1 = npyv_loada_s16(data_out + vstep * 1);
+            
+#line 312
+            npyv_s16 b2 = npyv_loada_s16(data + vstep * 2);
+            npyv_s16 c2 = npyv_loada_s16(data_out + vstep * 2);
+            
+#line 312
+            npyv_s16 b3 = npyv_loada_s16(data + vstep * 3);
+            npyv_s16 c3 = npyv_loada_s16(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s16 abc0 = npyv_muladd_s16(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s16 abc1 = npyv_muladd_s16(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s16 abc2 = npyv_muladd_s16(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s16 abc3 = npyv_muladd_s16(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_s16(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_s16(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_s16(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_s16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else { /* same loop using the non-"a" load/store variants */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s16 b0 = npyv_load_s16(data + vstep * 0);
+            npyv_s16 c0 = npyv_load_s16(data_out + vstep * 0);
+            
+#line 312
+            npyv_s16 b1 = npyv_load_s16(data + vstep * 1);
+            npyv_s16 c1 = npyv_load_s16(data_out + vstep * 1);
+            
+#line 312
+            npyv_s16 b2 = npyv_load_s16(data + vstep * 2);
+            npyv_s16 c2 = npyv_load_s16(data_out + vstep * 2);
+            
+#line 312
+            npyv_s16 b3 = npyv_load_s16(data + vstep * 3);
+            npyv_s16 c3 = npyv_load_s16(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s16 abc0 = npyv_muladd_s16(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s16 abc1 = npyv_muladd_s16(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s16 abc2 = npyv_muladd_s16(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s16 abc3 = npyv_muladd_s16(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_s16(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_s16(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_s16(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_s16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) { /* remainder; the till/tillz calls presumably mask a partial vector - confirm in the NPYV headers */
+        npyv_s16 a = npyv_load_tillz_s16(data, count);
+        npyv_s16 b = npyv_load_tillz_s16(data_out, count);
+        npyv_store_till_s16(data_out, count, npyv_muladd_s16(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* scalar path: the one compiled, because the guard above is a literal 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { /* manual 4x unroll: load, compute, then store */
+        #line 340
+        const npy_short b0 = (data[0]);
+        const npy_short c0 = (data_out[0]);
+        
+#line 340
+        const npy_short b1 = (data[1]);
+        const npy_short c1 = (data_out[1]);
+        
+#line 340
+        const npy_short b2 = (data[2]);
+        const npy_short c2 = (data_out[2]);
+        
+#line 340
+        const npy_short b3 = (data[3]);
+        const npy_short c3 = (data_out[3]);
+        
+        #line 346
+        const npy_short abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_short abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_short abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_short abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* leftover elements, or all of them when the unrolled loop is disabled */
+        const npy_short b = (*data);
+        const npy_short c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_short
+}
+/* Contiguous two-operand kernel: data_out[i] = data0[i] * data1[i] + data_out[i] for each of the count elements. */
+static void
+short_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *data0 = (npy_short *)dataptr[0];
+    npy_short *data1 = (npy_short *)dataptr[1];
+    npy_short *data_out = (npy_short *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_short
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s16;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s16 a0 = npyv_loada_s16(data0 + vstep * 0);
+            npyv_s16 b0 = npyv_loada_s16(data1 + vstep * 0);
+            npyv_s16 c0 = npyv_loada_s16(data_out + vstep * 0);
+            
+#line 390
+            npyv_s16 a1 = npyv_loada_s16(data0 + vstep * 1);
+            npyv_s16 b1 = npyv_loada_s16(data1 + vstep * 1);
+            npyv_s16 c1 = npyv_loada_s16(data_out + vstep * 1);
+            
+#line 390
+            npyv_s16 a2 = npyv_loada_s16(data0 + vstep * 2);
+            npyv_s16 b2 = npyv_loada_s16(data1 + vstep * 2);
+            npyv_s16 c2 = npyv_loada_s16(data_out + vstep * 2);
+            
+#line 390
+            npyv_s16 a3 = npyv_loada_s16(data0 + vstep * 3);
+            npyv_s16 b3 = npyv_loada_s16(data1 + vstep * 3);
+            npyv_s16 c3 = npyv_loada_s16(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s16 abc0 = npyv_muladd_s16(a0, b0, c0);
+            
+#line 397
+            npyv_s16 abc1 = npyv_muladd_s16(a1, b1, c1);
+            
+#line 397
+            npyv_s16 abc2 = npyv_muladd_s16(a2, b2, c2);
+            
+#line 397
+            npyv_s16 abc3 = npyv_muladd_s16(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_s16(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_s16(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_s16(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_s16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else { /* same loop using the non-"a" load/store variants */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s16 a0 = npyv_load_s16(data0 + vstep * 0);
+            npyv_s16 b0 = npyv_load_s16(data1 + vstep * 0);
+            npyv_s16 c0 = npyv_load_s16(data_out + vstep * 0);
+            
+#line 390
+            npyv_s16 a1 = npyv_load_s16(data0 + vstep * 1);
+            npyv_s16 b1 = npyv_load_s16(data1 + vstep * 1);
+            npyv_s16 c1 = npyv_load_s16(data_out + vstep * 1);
+            
+#line 390
+            npyv_s16 a2 = npyv_load_s16(data0 + vstep * 2);
+            npyv_s16 b2 = npyv_load_s16(data1 + vstep * 2);
+            npyv_s16 c2 = npyv_load_s16(data_out + vstep * 2);
+            
+#line 390
+            npyv_s16 a3 = npyv_load_s16(data0 + vstep * 3);
+            npyv_s16 b3 = npyv_load_s16(data1 + vstep * 3);
+            npyv_s16 c3 = npyv_load_s16(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s16 abc0 = npyv_muladd_s16(a0, b0, c0);
+            
+#line 397
+            npyv_s16 abc1 = npyv_muladd_s16(a1, b1, c1);
+            
+#line 397
+            npyv_s16 abc2 = npyv_muladd_s16(a2, b2, c2);
+            
+#line 397
+            npyv_s16 abc3 = npyv_muladd_s16(a3, b3, c3);
+            
+            #line 402
+            npyv_store_s16(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_s16(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_s16(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_s16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) { /* remainder; till/tillz presumably mask a partial vector - confirm in the NPYV headers */
+        npyv_s16 a = npyv_load_tillz_s16(data0, count);
+        npyv_s16 b = npyv_load_tillz_s16(data1, count);
+        npyv_s16 c = npyv_load_tillz_s16(data_out, count);
+        npyv_store_till_s16(data_out, count, npyv_muladd_s16(a, b, c));
+    }
+    npyv_cleanup();
+#else /* scalar path: the one compiled, because the guard above is a literal 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { /* manual 4x unroll: load, compute, then store */
+        #line 420
+        const npy_short a0 = (data0[0]);
+        const npy_short b0 = (data1[0]);
+        const npy_short c0 = (data_out[0]);
+        
+#line 420
+        const npy_short a1 = (data0[1]);
+        const npy_short b1 = (data1[1]);
+        const npy_short c1 = (data_out[1]);
+        
+#line 420
+        const npy_short a2 = (data0[2]);
+        const npy_short b2 = (data1[2]);
+        const npy_short c2 = (data_out[2]);
+        
+#line 420
+        const npy_short a3 = (data0[3]);
+        const npy_short b3 = (data1[3]);
+        const npy_short c3 = (data_out[3]);
+        
+        #line 427
+        const npy_short abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_short abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_short abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_short abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* leftover elements, or all of them when the unrolled loop is disabled */
+        const npy_short a = (*data0);
+        const npy_short b = (*data1);
+        const npy_short c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_short
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+short_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short value0 = (*(npy_short *)dataptr[0]); /* operand 0 is read once and reused for every element */
+    npy_short *data1 = (npy_short *)dataptr[1];
+    npy_short *data_out = (npy_short *)dataptr[2];
+    /* Net effect: data_out[i] = value0 * data1[i] + data_out[i] over count elements, delegated to the muladd helper. */
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    short_sum_of_products_muladd(data1, data_out, value0, count);
+    
+}
+/* Mirror of the stride0/contig case: data_out[i] = value1 * data0[i] + data_out[i], with operand 1 read once as the scalar. */
+static void
+short_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short value1 = (*(npy_short *)dataptr[1]); /* operand 1 is read once and reused for every element */
+    npy_short *data0 = (npy_short *)dataptr[0];
+    npy_short *data_out = (npy_short *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    short_sum_of_products_muladd(data0, data_out, value1, count);
+}
+/* Dot-product style reduction: accumulates data0[i] * data1[i] over count elements and adds the total to *dataptr[2]. */
+static NPY_GCC_OPT_3 void
+short_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *data0 = (npy_short *)dataptr[0];
+    npy_short *data1 = (npy_short *)dataptr[1];
+    npy_short accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_short
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_s16;
+    npyv_s16 v_accum = npyv_zero_s16();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s16 a0 = npyv_loada_s16(data0 + vstep * 0);
+            npyv_s16 b0 = npyv_loada_s16(data1 + vstep * 0);
+            
+#line 501
+            npyv_s16 a1 = npyv_loada_s16(data0 + vstep * 1);
+            npyv_s16 b1 = npyv_loada_s16(data1 + vstep * 1);
+            
+#line 501
+            npyv_s16 a2 = npyv_loada_s16(data0 + vstep * 2);
+            npyv_s16 b2 = npyv_loada_s16(data1 + vstep * 2);
+            
+#line 501
+            npyv_s16 a3 = npyv_loada_s16(data0 + vstep * 3);
+            npyv_s16 b3 = npyv_loada_s16(data1 + vstep * 3);
+            
+            npyv_s16 ab3 = npyv_muladd_s16(a3, b3, v_accum); /* chain: each muladd takes the previous result as its addend */
+            npyv_s16 ab2 = npyv_muladd_s16(a2, b2, ab3);
+            npyv_s16 ab1 = npyv_muladd_s16(a1, b1, ab2);
+                   v_accum = npyv_muladd_s16(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else { /* same loop using the non-"a" load variants */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s16 a0 = npyv_load_s16(data0 + vstep * 0);
+            npyv_s16 b0 = npyv_load_s16(data1 + vstep * 0);
+            
+#line 501
+            npyv_s16 a1 = npyv_load_s16(data0 + vstep * 1);
+            npyv_s16 b1 = npyv_load_s16(data1 + vstep * 1);
+            
+#line 501
+            npyv_s16 a2 = npyv_load_s16(data0 + vstep * 2);
+            npyv_s16 b2 = npyv_load_s16(data1 + vstep * 2);
+            
+#line 501
+            npyv_s16 a3 = npyv_load_s16(data0 + vstep * 3);
+            npyv_s16 b3 = npyv_load_s16(data1 + vstep * 3);
+            
+            npyv_s16 ab3 = npyv_muladd_s16(a3, b3, v_accum);
+            npyv_s16 ab2 = npyv_muladd_s16(a2, b2, ab3);
+            npyv_s16 ab1 = npyv_muladd_s16(a1, b1, ab2);
+                   v_accum = npyv_muladd_s16(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) { /* remainder; tillz presumably zero-fills lanes past count - confirm in the NPYV headers */
+        npyv_s16 a = npyv_load_tillz_s16(data0, count);
+        npyv_s16 b = npyv_load_tillz_s16(data1, count);
+        v_accum = npyv_muladd_s16(a, b, v_accum);
+    }
+    accum = npyv_sum_s16(v_accum);
+    npyv_cleanup();
+#else /* scalar path: the one compiled, because the guard above is a literal 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) { /* manual 4x unroll */
+        #line 524
+        const npy_short ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_short ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_short ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_short ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* leftover elements, or all of them when the unrolled loop is disabled */
+        const npy_short a = (*data0);
+        const npy_short b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_short
+    *(npy_short *)dataptr[2] = ((*(npy_short *)dataptr[2]) + accum); /* single write to the output, after the whole reduction */
+}
+
+/*
+ * Two-operand kernel: takes a single npy_short from dataptr[0], reduces the
+ * `count` npy_short values starting at dataptr[1] through short_sum_of_arr()
+ * (defined elsewhere in this file; presumably their sum), and accumulates
+ * value * reduction into the single npy_short at dataptr[2].
+ * `nop` and `strides` are not consulted.
+ */
+static NPY_GCC_OPT_3 void
+short_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *out = (npy_short *)dataptr[2];
+    const npy_short scalar = *(npy_short *)dataptr[0];
+    const npy_short total = short_sum_of_arr((npy_short *)dataptr[1], count);
+
+    *out = *out + scalar * total;
+}
+
+/*
+ * Two-operand kernel, mirror image of the stride0/contig variant: reduces the
+ * `count` npy_short values starting at dataptr[0] through short_sum_of_arr()
+ * (defined elsewhere in this file; presumably their sum), multiplies the
+ * result by the single npy_short at dataptr[1], and accumulates the product
+ * into the single npy_short at dataptr[2].
+ * `nop` and `strides` are not consulted.
+ */
+static NPY_GCC_OPT_3 void
+short_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *out = (npy_short *)dataptr[2];
+    const npy_short total = short_sum_of_arr((npy_short *)dataptr[0], count);
+    const npy_short scalar = *(npy_short *)dataptr[1];
+
+    *out = *out + scalar * total;
+}
+
+#elif 2 == 3 && !0
+
+/*
+ * Three-operand kernel over element-indexed npy_short arrays:
+ *
+ *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
+ *
+ * Inputs are taken from dataptr[0..2] and the output from dataptr[3].
+ * Elements are always updated in ascending index order.
+ * `nop` and `strides` are not consulted.
+ */
+static void
+short_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *in0 = (npy_short *)dataptr[0];
+    npy_short *in1 = (npy_short *)dataptr[1];
+    npy_short *in2 = (npy_short *)dataptr[2];
+    npy_short *out = (npy_short *)dataptr[3];
+    int k;
+
+    /* Main part: consume whole groups of 8 elements (fixed trip count) */
+    for (; count >= 8; count -= 8) {
+        for (k = 0; k < 8; ++k) {
+            out[k] = in0[k] * in1[k] * in2[k] + out[k];
+        }
+        in0 += 8;
+        in1 += 8;
+        in2 += 8;
+        out += 8;
+    }
+
+    /*
+     * Remainder: handle one element per step and stop as soon as the
+     * remaining count reaches zero.
+     */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] = in0[k] * in1[k] * in2[k] + out[k];
+    }
+}
+
+#else /* 2 > 3 || @complex */
+
+/*
+ * Generic fallback kernel driven by `nop`: for each of the `count` steps it
+ * multiplies together the npy_short values currently addressed by
+ * dataptr[0..nop-1], adds the product to the npy_short addressed by
+ * dataptr[nop], and then advances every pointer in dataptr[0..nop] by
+ * sizeof(npy_short). The caller's dataptr array is modified in place.
+ *
+ * Only the non-complex code path is emitted: the complex alternative is
+ * guarded by a constant-false preprocessor condition for npy_short.
+ */
+static void
+short_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+
+    for (; count != 0; --count) {
+        int op = 1;
+        npy_short prod = (*(npy_short *)dataptr[0]);
+
+        while (op < nop) {
+            prod *= (*(npy_short *)dataptr[op]);
+            ++op;
+        }
+        /* After the loop `op` equals `nop` whenever nop >= 1 */
+        *(npy_short *)dataptr[nop] = (prod + (*(npy_short *)dataptr[op]));
+
+        for (op = 0; op <= nop; ++op) {
+            dataptr[op] += sizeof(npy_short);
+        }
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+
+/*
+ * One-operand kernel: reduces the `count` npy_short values starting at
+ * dataptr[0] through short_sum_of_arr() (defined elsewhere in this file;
+ * presumably their sum) and accumulates the result into the single
+ * npy_short at dataptr[1].
+ *
+ * Only the non-complex code path is emitted: the complex alternative is
+ * guarded by a constant-false preprocessor condition for npy_short.
+ * `nop` and `strides` are not consulted.
+ */
+static NPY_GCC_OPT_3 void
+short_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+    npy_short *out = (npy_short *)dataptr[1];
+    const npy_short total = short_sum_of_arr((npy_short *)dataptr[0], count);
+
+    *out = (total + *out);
+}
+
+#endif /* 2 == 1 */
+
+/*
+ * Two-operand strided reduction into a single output value.
+ *
+ * Walks `count` steps; at each step the npy_short values addressed by the
+ * two input cursors are multiplied and added to a running npy_short total,
+ * then each cursor is advanced by its byte stride (strides[0], strides[1]).
+ * Finally the total is added to the npy_short at dataptr[2].
+ *
+ * This is the two-operand, non-complex expansion; the other operand counts
+ * and the complex variant are excluded by constant preprocessor conditions.
+ * `nop` is not consulted.
+ */
+static void
+short_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    npy_short total = 0;
+    char *lhs = dataptr[0];
+    char *rhs = dataptr[1];
+    const npy_intp lhs_step = strides[0];
+    const npy_intp rhs_step = strides[1];
+    npy_short *out;
+
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+
+    for (; count != 0; --count) {
+        total += (*(npy_short *)lhs) * (*(npy_short *)rhs);
+        lhs += lhs_step;
+        rhs += rhs_step;
+    }
+
+    out = (npy_short *)dataptr[2];
+    *out = (total + *out);
+}
+
+
+#line 131
+/*
+ * Three-operand strided kernel.
+ *
+ * For each of the `count` steps: multiplies the npy_short values addressed
+ * by the three input cursors, adds the product to the npy_short addressed by
+ * the output cursor, and advances all four cursors by their byte strides
+ * (strides[0..3]). Cursors start at dataptr[0..3].
+ *
+ * This is the three-operand, non-complex expansion; the other operand counts
+ * and the complex variant are excluded by constant preprocessor conditions.
+ * `nop` is not consulted.
+ */
+static void
+short_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    char *p0 = dataptr[0];
+    char *p1 = dataptr[1];
+    char *p2 = dataptr[2];
+    char *p_out = dataptr[3];
+    const npy_intp step0 = strides[0];
+    const npy_intp step1 = strides[1];
+    const npy_intp step2 = strides[2];
+    const npy_intp step_out = strides[3];
+
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_three (%d)\n", (int)count);
+
+    for (; count != 0; --count) {
+        npy_short *dst = (npy_short *)p_out;
+        const npy_short x = *(npy_short *)p0;
+        const npy_short y = *(npy_short *)p1;
+        const npy_short z = *(npy_short *)p2;
+
+        *dst = (x * y * z + *dst);
+
+        p0 += step0;
+        p1 += step1;
+        p2 += step2;
+        p_out += step_out;
+    }
+}
+
+#if 3 == 1
+
+/*
+ * One-operand kernel over element-indexed npy_short arrays:
+ *
+ *     dst[i] = src[i] + dst[i]
+ *
+ * with src = dataptr[0] and dst = dataptr[1].
+ *
+ * Control flow: a switch on the remaining count handles 0..7 leftover
+ * elements (highest index first, via case fall-through) and returns; any
+ * other count falls out of the switch into the 8-way unrolled loop, after
+ * which the switch is re-entered for the remainder.
+ *
+ * Only the non-complex code path is emitted: the complex alternative is
+ * guarded by a constant-false preprocessor condition for npy_short.
+ * `nop` and `strides` are not consulted.
+ */
+static void
+short_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *src = (npy_short *)dataptr[0];
+    npy_short *dst = (npy_short *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* Dispatch sits ahead of the main loop so that small counts exit quickly */
+tail_dispatch:
+    switch (count) {
+        case 7:
+            dst[6] += src[6];
+            /* fall through */
+        case 6:
+            dst[5] += src[5];
+            /* fall through */
+        case 5:
+            dst[4] += src[4];
+            /* fall through */
+        case 4:
+            dst[3] += src[3];
+            /* fall through */
+        case 3:
+            dst[2] += src[2];
+            /* fall through */
+        case 2:
+            dst[1] += src[1];
+            /* fall through */
+        case 1:
+            dst[0] += src[0];
+            /* fall through */
+        case 0:
+            return;
+    }
+
+    /* Eight elements per iteration, ascending index order */
+    while (count >= 8) {
+        dst[0] += src[0];
+        dst[1] += src[1];
+        dst[2] += src[2];
+        dst[3] += src[3];
+        dst[4] += src[4];
+        dst[5] += src[5];
+        dst[6] += src[6];
+        dst[7] += src[7];
+
+        src += 8;
+        dst += 8;
+        count -= 8;
+    }
+
+    /* Whatever is left (fewer than 8) is handled by the switch above */
+    goto tail_dispatch;
+}
+
+#elif 3 == 2 && !0
+
+/*
+ * Multiply-add helper: for i in [0, count)
+ *
+ *     data_out[i] = scalar * data[i] + data_out[i]
+ *
+ * Only the scalar implementation is emitted: the NPYV (SIMD) alternative is
+ * guarded by a constant-false preprocessor condition for npy_short.
+ */
+static NPY_GCC_OPT_3 void
+short_sum_of_products_muladd(npy_short *data, npy_short *data_out, npy_short scalar, npy_intp count)
+{
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Groups of four: every load of the group is done before any store */
+    while (count >= 4) {
+        const npy_short b0 = data[0], b1 = data[1];
+        const npy_short b2 = data[2], b3 = data[3];
+        const npy_short c0 = data_out[0], c1 = data_out[1];
+        const npy_short c2 = data_out[2], c3 = data_out[3];
+
+        data_out[0] = scalar * b0 + c0;
+        data_out[1] = scalar * b1 + c1;
+        data_out[2] = scalar * b2 + c2;
+        data_out[3] = scalar * b3 + c3;
+
+        data += 4;
+        data_out += 4;
+        count -= 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Leftover elements, one at a time */
+    while (count > 0) {
+        *data_out = (scalar * (*data) + (*data_out));
+        ++data;
+        ++data_out;
+        --count;
+    }
+}
+
+/*
+ * Two-operand kernel over element-indexed npy_short arrays:
+ *
+ *     out[i] = lhs[i] * rhs[i] + out[i]   for i in [0, count)
+ *
+ * with lhs = dataptr[0], rhs = dataptr[1] and out = dataptr[2].
+ *
+ * Only the scalar implementation is emitted: the NPYV (SIMD) alternative is
+ * guarded by a constant-false preprocessor condition for npy_short.
+ * `nop` and `strides` are not consulted.
+ */
+static void
+short_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *lhs = (npy_short *)dataptr[0];
+    npy_short *rhs = (npy_short *)dataptr[1];
+    npy_short *out = (npy_short *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Groups of four: every load of the group is done before any store */
+    while (count >= 4) {
+        const npy_short a0 = lhs[0], a1 = lhs[1], a2 = lhs[2], a3 = lhs[3];
+        const npy_short b0 = rhs[0], b1 = rhs[1], b2 = rhs[2], b3 = rhs[3];
+        const npy_short c0 = out[0], c1 = out[1], c2 = out[2], c3 = out[3];
+
+        out[0] = a0 * b0 + c0;
+        out[1] = a1 * b1 + c1;
+        out[2] = a2 * b2 + c2;
+        out[3] = a3 * b3 + c3;
+
+        lhs += 4;
+        rhs += 4;
+        out += 4;
+        count -= 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Leftover elements, one at a time */
+    while (count > 0) {
+        *out = ((*lhs) * (*rhs) + (*out));
+        ++lhs;
+        ++rhs;
+        ++out;
+        --count;
+    }
+}
+
+/* Some extra specializations for the two operand case */
+static void
+short_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 0 is read once as a scalar; operand 1 and the output are walked by the helper. */
+    npy_short *const out = (npy_short *)dataptr[2];
+    npy_short *const vec = (npy_short *)dataptr[1];
+    const npy_short scalar = *(npy_short *)dataptr[0];
+
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    short_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+static void
+short_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 1 is read once as a scalar; operand 0 and the output are walked by the helper. */
+    npy_short *const out = (npy_short *)dataptr[2];
+    npy_short *const vec = (npy_short *)dataptr[0];
+    const npy_short scalar = *(npy_short *)dataptr[1];
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    short_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+static NPY_GCC_OPT_3 void
+short_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *data0 = (npy_short *)dataptr[0];
+    npy_short *data1 = (npy_short *)dataptr[1];
+    npy_short accum = 0;
+    /* Reduction: accum gathers data0[i] * data1[i] for i < count and is added to *dataptr[2] on exit. */
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_short (constant 0: the SIMD branch below is compiled out)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_s16;
+    npyv_s16 v_accum = npyv_zero_s16();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s16 a0 = npyv_loada_s16(data0 + vstep * 0);
+            npyv_s16 b0 = npyv_loada_s16(data1 + vstep * 0);
+            
+#line 501
+            npyv_s16 a1 = npyv_loada_s16(data0 + vstep * 1);
+            npyv_s16 b1 = npyv_loada_s16(data1 + vstep * 1);
+            
+#line 501
+            npyv_s16 a2 = npyv_loada_s16(data0 + vstep * 2);
+            npyv_s16 b2 = npyv_loada_s16(data1 + vstep * 2);
+            
+#line 501
+            npyv_s16 a3 = npyv_loada_s16(data0 + vstep * 3);
+            npyv_s16 b3 = npyv_loada_s16(data1 + vstep * 3);
+            
+            npyv_s16 ab3 = npyv_muladd_s16(a3, b3, v_accum);
+            npyv_s16 ab2 = npyv_muladd_s16(a2, b2, ab3);
+            npyv_s16 ab1 = npyv_muladd_s16(a1, b1, ab2);
+                   v_accum = npyv_muladd_s16(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s16 a0 = npyv_load_s16(data0 + vstep * 0);
+            npyv_s16 b0 = npyv_load_s16(data1 + vstep * 0);
+            
+#line 501
+            npyv_s16 a1 = npyv_load_s16(data0 + vstep * 1);
+            npyv_s16 b1 = npyv_load_s16(data1 + vstep * 1);
+            
+#line 501
+            npyv_s16 a2 = npyv_load_s16(data0 + vstep * 2);
+            npyv_s16 b2 = npyv_load_s16(data1 + vstep * 2);
+            
+#line 501
+            npyv_s16 a3 = npyv_load_s16(data0 + vstep * 3);
+            npyv_s16 b3 = npyv_load_s16(data1 + vstep * 3);
+            
+            npyv_s16 ab3 = npyv_muladd_s16(a3, b3, v_accum);
+            npyv_s16 ab2 = npyv_muladd_s16(a2, b2, ab3);
+            npyv_s16 ab1 = npyv_muladd_s16(a1, b1, ab2);
+                   v_accum = npyv_muladd_s16(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_s16 a = npyv_load_tillz_s16(data0, count);
+        npyv_s16 b = npyv_load_tillz_s16(data1, count);
+        v_accum = npyv_muladd_s16(a, b, v_accum);
+    }
+    accum = npyv_sum_s16(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_short ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_short ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_short ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_short ab3 = (data0[3]) * (data1[3]);
+        /* each product was already narrowed to npy_short when it was stored in abN */
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* remaining elements, one at a time */
+        const npy_short a = (*data0);
+        const npy_short b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_short
+    *(npy_short *)dataptr[2] = ((*(npy_short *)dataptr[2]) + accum);
+}
+
+static NPY_GCC_OPT_3 void
+short_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* *dataptr[2] += scalar(dataptr[0]) * short_sum_of_arr(dataptr[1], count) */
+    const npy_short scalar = *(npy_short *)dataptr[0];
+    const npy_short total = short_sum_of_arr((npy_short *)dataptr[1], count);
+    *(npy_short *)dataptr[2] += scalar * total;
+}
+
+static NPY_GCC_OPT_3 void
+short_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* *dataptr[2] += scalar(dataptr[1]) * short_sum_of_arr(dataptr[0], count) */
+    const npy_short scalar = *(npy_short *)dataptr[1];
+    const npy_short total = short_sum_of_arr((npy_short *)dataptr[0], count);
+    *(npy_short *)dataptr[2] += scalar * total;
+}
+
+#elif 3 == 3 && !0
+
+static void
+short_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *data0 = (npy_short *)dataptr[0];
+    npy_short *data1 = (npy_short *)dataptr[1];
+    npy_short *data2 = (npy_short *)dataptr[2];
+    npy_short *data_out = (npy_short *)dataptr[3];
+    /* data_out[i] += data0[i] * data1[i] * data2[i] for count contiguous elements. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+        /* Multiply in unsigned int: three promoted shorts can overflow int (undefined); unsigned wraps. */
+#line 576
+        data_out[0] = (npy_short)((unsigned int)(data0[0]) *
+                             (unsigned int)(data1[0]) *
+                             (unsigned int)(data2[0]) +
+                             (unsigned int)(data_out[0]));
+
+#line 576
+        data_out[1] = (npy_short)((unsigned int)(data0[1]) *
+                             (unsigned int)(data1[1]) *
+                             (unsigned int)(data2[1]) +
+                             (unsigned int)(data_out[1]));
+
+#line 576
+        data_out[2] = (npy_short)((unsigned int)(data0[2]) *
+                             (unsigned int)(data1[2]) *
+                             (unsigned int)(data2[2]) +
+                             (unsigned int)(data_out[2]));
+
+#line 576
+        data_out[3] = (npy_short)((unsigned int)(data0[3]) *
+                             (unsigned int)(data1[3]) *
+                             (unsigned int)(data2[3]) +
+                             (unsigned int)(data_out[3]));
+
+#line 576
+        data_out[4] = (npy_short)((unsigned int)(data0[4]) *
+                             (unsigned int)(data1[4]) *
+                             (unsigned int)(data2[4]) +
+                             (unsigned int)(data_out[4]));
+
+#line 576
+        data_out[5] = (npy_short)((unsigned int)(data0[5]) *
+                             (unsigned int)(data1[5]) *
+                             (unsigned int)(data2[5]) +
+                             (unsigned int)(data_out[5]));
+
+#line 576
+        data_out[6] = (npy_short)((unsigned int)(data0[6]) *
+                             (unsigned int)(data1[6]) *
+                             (unsigned int)(data2[6]) +
+                             (unsigned int)(data_out[6]));
+
+#line 576
+        data_out[7] = (npy_short)((unsigned int)(data0[7]) *
+                             (unsigned int)(data1[7]) *
+                             (unsigned int)(data2[7]) +
+                             (unsigned int)(data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    /* Fewer than 8 elements remain; each step below returns as soon as count is used up. */
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = (npy_short)((unsigned int)(data0[0]) *
+                         (unsigned int)(data1[0]) *
+                         (unsigned int)(data2[0]) +
+                         (unsigned int)(data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = (npy_short)((unsigned int)(data0[1]) *
+                         (unsigned int)(data1[1]) *
+                         (unsigned int)(data2[1]) +
+                         (unsigned int)(data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = (npy_short)((unsigned int)(data0[2]) *
+                         (unsigned int)(data1[2]) *
+                         (unsigned int)(data2[2]) +
+                         (unsigned int)(data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = (npy_short)((unsigned int)(data0[3]) *
+                         (unsigned int)(data1[3]) *
+                         (unsigned int)(data2[3]) +
+                         (unsigned int)(data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = (npy_short)((unsigned int)(data0[4]) *
+                         (unsigned int)(data1[4]) *
+                         (unsigned int)(data2[4]) +
+                         (unsigned int)(data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = (npy_short)((unsigned int)(data0[5]) *
+                         (unsigned int)(data1[5]) *
+                         (unsigned int)(data2[5]) +
+                         (unsigned int)(data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = (npy_short)((unsigned int)(data0[6]) *
+                         (unsigned int)(data1[6]) *
+                         (unsigned int)(data2[6]) +
+                         (unsigned int)(data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = (npy_short)((unsigned int)(data0[7]) *
+                         (unsigned int)(data1[7]) *
+                         (unsigned int)(data2[7]) +
+                         (unsigned int)(data_out[7]));
+    /* For a count of 0..7 on entry to the tail, one of the checks above has already returned. */
+}
+
+#else /* 3 > 3 || @complex */
+
+static void
+short_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_three (%d)\n",
+                                                    (int)count);
+    /* Generic alternative in the #else arm: one element per pass, stepping the dataptr[] entries in place. */
+    while (count--) {
+#if !0
+        npy_short temp = (*(npy_short *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_short *)dataptr[i]);
+        }
+        *(npy_short *)dataptr[nop] = (temp +
+                                           (*(npy_short *)dataptr[i])); /* i == nop here when nop >= 1 */
+        for (i = 0; i <= nop; ++i) { /* inputs and the output each move by one element */
+            dataptr[i] += sizeof(npy_short);
+        }
+#else /* complex */
+#  if 3 <= 3
+#    define _SUMPROD_NOP 3
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_short re, im, tmp;
+        int i;
+        re = ((npy_short *)dataptr[0])[0];
+        im = ((npy_short *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_short *)dataptr[i])[0] -
+                  im * ((npy_short *)dataptr[i])[1];
+            im = re * ((npy_short *)dataptr[i])[1] +
+                 im * ((npy_short *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_short *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_short *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_short *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_short *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_short);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 3 */
+
+#if 3 == 1
+
+static NPY_GCC_OPT_3 void
+short_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{ /* Adds short_sum_of_arr(dataptr[0], count) to the single output element at dataptr[1]. */
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_short *data = (npy_short *)dataptr[0];
+    npy_short accum = short_sum_of_arr(data, count); /* the summation itself lives in the helper */
+    *((npy_short *)dataptr[1]) = (accum + (*((npy_short *)dataptr[1])));
+#else
+    npy_short accum_re = 0, accum_im = 0; /* alternative body: interleaved (re, im) pairs */
+    npy_short *data0 = (npy_short *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) { /* strict '>': a last group of exactly 4 goes to the loop below */
+        const npy_short re01 = data0[0] + data0[2];
+        const npy_short re23 = data0[4] + data0[6];
+        const npy_short im13 = data0[1] + data0[3];
+        const npy_short im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) { /* one pair per pass */
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_short *)dataptr[1])[0] += accum_re;
+    ((npy_short *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 3 == 1 */
+
+static void
+short_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_short accum_re = 0, accum_im = 0;
+#else
+    npy_short accum = 0;
+#endif
+    /* Strided reduction: accum gathers the per-step product and is added to the single output *dataptr[3]. */
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+    /* 3 == 3 branch: three promoted shorts can overflow int (undefined), so multiply in unsigned int and narrow. */
+    while (count--) {
+#if !0
+#  if 3 == 1
+        accum += (*(npy_short *)data0);
+        data0 += stride0;
+#  elif 3 == 2
+        accum += (*(npy_short *)data0) *
+                 (*(npy_short *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 3 == 3
+        accum += (npy_short)((unsigned int)(*(npy_short *)data0) *
+                 (unsigned int)(*(npy_short *)data1) *
+                 (unsigned int)(*(npy_short *)data2));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_short temp = (*(npy_short *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_short *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        accum_re += ((npy_short *)data0)[0];
+        accum_im += ((npy_short *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_short re, im, tmp;
+        int i;
+        re = ((npy_short *)dataptr[0])[0];
+        im = ((npy_short *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_short *)dataptr[i])[0] -
+                  im * ((npy_short *)dataptr[i])[1];
+            im = re * ((npy_short *)dataptr[i])[1] +
+                 im * ((npy_short *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* The output pointer is never advanced: the whole run folds into one element. */
+#if 0
+#  if 3 <= 3
+    ((npy_short *)dataptr[3])[0] += accum_re;
+    ((npy_short *)dataptr[3])[1] += accum_im;
+#  else
+    ((npy_short *)dataptr[nop])[0] += accum_re;
+    ((npy_short *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 3 <= 3
+    *((npy_short *)dataptr[3]) = (accum +
+                                    (*((npy_short *)dataptr[3])));
+#  else
+    *((npy_short *)dataptr[nop]) = (accum +
+                                    (*((npy_short *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+short_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data_out = dataptr[1000];
+    npy_intp stride_out = strides[1000];
+#endif
+    /* With the literal 1000 none of the conditions above holds, so no per-operand locals are declared. */
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_any (%d)\n", (int)count);
+    /* Runtime-nop form: out += product of the nop inputs, once per pass, stepping by strides[]. */
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        *(npy_short *)data_out = ((*(npy_short *)data0) +
+                                         (*(npy_short *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1000 == 2
+        *(npy_short *)data_out = ((*(npy_short *)data0) *
+                                         (*(npy_short *)data1) +
+                                         (*(npy_short *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1000 == 3
+        *(npy_short *)data_out = ((*(npy_short *)data0) *
+                                         (*(npy_short *)data1) *
+                                         (*(npy_short *)data2) +
+                                         (*(npy_short *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_short temp = (*(npy_short *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_short *)dataptr[i]);
+        }
+        *(npy_short *)dataptr[nop] = (temp +
+                                           (*(npy_short *)dataptr[i])); /* i == nop here when nop >= 1 */
+        for (i = 0; i <= nop; ++i) { /* inputs and the output advance by their own strides */
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        ((npy_short *)data_out)[0] = ((npy_short *)data0)[0] +
+                                         ((npy_short *)data_out)[0];
+        ((npy_short *)data_out)[1] = ((npy_short *)data0)[1] +
+                                         ((npy_short *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_short re, im, tmp;
+        int i;
+        re = ((npy_short *)dataptr[0])[0];
+        im = ((npy_short *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_short *)dataptr[i])[0] -
+                  im * ((npy_short *)dataptr[i])[1];
+            im = re * ((npy_short *)dataptr[i])[1] +
+                 im * ((npy_short *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_short *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_short *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_short *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_short *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1000 == 1
+
+static void
+short_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *data0 = (npy_short *)dataptr[0];
+    npy_short *data_out = (npy_short *)dataptr[1];
+    /* data_out[i] += data0[i] over count contiguous elements (the `#if !0` bodies below). */
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) { /* 1..7 enter at case count and fall through down to index 0; there is no default */
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_short *)data_out + 2*6)[0] =
+                                    ((npy_short *)data0 + 2*6)[0] +
+                                    ((npy_short *)data_out + 2*6)[0];
+            ((npy_short *)data_out + 2*6)[1] =
+                                    ((npy_short *)data0 + 2*6)[1] +
+                                    ((npy_short *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_short *)data_out + 2*5)[0] =
+                                    ((npy_short *)data0 + 2*5)[0] +
+                                    ((npy_short *)data_out + 2*5)[0];
+            ((npy_short *)data_out + 2*5)[1] =
+                                    ((npy_short *)data0 + 2*5)[1] +
+                                    ((npy_short *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_short *)data_out + 2*4)[0] =
+                                    ((npy_short *)data0 + 2*4)[0] +
+                                    ((npy_short *)data_out + 2*4)[0];
+            ((npy_short *)data_out + 2*4)[1] =
+                                    ((npy_short *)data0 + 2*4)[1] +
+                                    ((npy_short *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_short *)data_out + 2*3)[0] =
+                                    ((npy_short *)data0 + 2*3)[0] +
+                                    ((npy_short *)data_out + 2*3)[0];
+            ((npy_short *)data_out + 2*3)[1] =
+                                    ((npy_short *)data0 + 2*3)[1] +
+                                    ((npy_short *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_short *)data_out + 2*2)[0] =
+                                    ((npy_short *)data0 + 2*2)[0] +
+                                    ((npy_short *)data_out + 2*2)[0];
+            ((npy_short *)data_out + 2*2)[1] =
+                                    ((npy_short *)data0 + 2*2)[1] +
+                                    ((npy_short *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_short *)data_out + 2*1)[0] =
+                                    ((npy_short *)data0 + 2*1)[0] +
+                                    ((npy_short *)data_out + 2*1)[0];
+            ((npy_short *)data_out + 2*1)[1] =
+                                    ((npy_short *)data0 + 2*1)[1] +
+                                    ((npy_short *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_short *)data_out + 2*0)[0] =
+                                    ((npy_short *)data0 + 2*0)[0] +
+                                    ((npy_short *)data_out + 2*0)[0];
+            ((npy_short *)data_out + 2*0)[1] =
+                                    ((npy_short *)data0 + 2*0)[1] +
+                                    ((npy_short *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+    /* Reached only when no case matched, i.e. count is outside 0..7. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_short *)data_out + 2*0)[0] =
+                                ((npy_short *)data0 + 2*0)[0] +
+                                ((npy_short *)data_out + 2*0)[0];
+        ((npy_short *)data_out + 2*0)[1] =
+                                ((npy_short *)data0 + 2*0)[1] +
+                                ((npy_short *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_short *)data_out + 2*1)[0] =
+                                ((npy_short *)data0 + 2*1)[0] +
+                                ((npy_short *)data_out + 2*1)[0];
+        ((npy_short *)data_out + 2*1)[1] =
+                                ((npy_short *)data0 + 2*1)[1] +
+                                ((npy_short *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_short *)data_out + 2*2)[0] =
+                                ((npy_short *)data0 + 2*2)[0] +
+                                ((npy_short *)data_out + 2*2)[0];
+        ((npy_short *)data_out + 2*2)[1] =
+                                ((npy_short *)data0 + 2*2)[1] +
+                                ((npy_short *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_short *)data_out + 2*3)[0] =
+                                ((npy_short *)data0 + 2*3)[0] +
+                                ((npy_short *)data_out + 2*3)[0];
+        ((npy_short *)data_out + 2*3)[1] =
+                                ((npy_short *)data0 + 2*3)[1] +
+                                ((npy_short *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_short *)data_out + 2*4)[0] =
+                                ((npy_short *)data0 + 2*4)[0] +
+                                ((npy_short *)data_out + 2*4)[0];
+        ((npy_short *)data_out + 2*4)[1] =
+                                ((npy_short *)data0 + 2*4)[1] +
+                                ((npy_short *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_short *)data_out + 2*5)[0] =
+                                ((npy_short *)data0 + 2*5)[0] +
+                                ((npy_short *)data_out + 2*5)[0];
+        ((npy_short *)data_out + 2*5)[1] =
+                                ((npy_short *)data0 + 2*5)[1] +
+                                ((npy_short *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_short *)data_out + 2*6)[0] =
+                                ((npy_short *)data0 + 2*6)[0] +
+                                ((npy_short *)data_out + 2*6)[0];
+        ((npy_short *)data_out + 2*6)[1] =
+                                ((npy_short *)data0 + 2*6)[1] +
+                                ((npy_short *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_short *)data_out + 2*7)[0] =
+                                ((npy_short *)data0 + 2*7)[0] +
+                                ((npy_short *)data_out + 2*7)[0];
+        ((npy_short *)data_out + 2*7)[1] =
+                                ((npy_short *)data0 + 2*7)[1] +
+                                ((npy_short *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop; /* fewer than 8 left: the switch handles them and returns */
+}
+
+#elif 1000 == 2 && !0
+
+// Multiply-add over count contiguous elements: data_out[i] = scalar * data[i] + data_out[i]
+static NPY_GCC_OPT_3 void
+short_sum_of_products_muladd(npy_short *data, npy_short *data_out, npy_short scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_short (constant 0: the SIMD branch below is compiled out)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s16;
+    const npyv_s16 v_scalar = npyv_setall_s16(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s16 b0 = npyv_loada_s16(data + vstep * 0);
+            npyv_s16 c0 = npyv_loada_s16(data_out + vstep * 0);
+            
+#line 312
+            npyv_s16 b1 = npyv_loada_s16(data + vstep * 1);
+            npyv_s16 c1 = npyv_loada_s16(data_out + vstep * 1);
+            
+#line 312
+            npyv_s16 b2 = npyv_loada_s16(data + vstep * 2);
+            npyv_s16 c2 = npyv_loada_s16(data_out + vstep * 2);
+            
+#line 312
+            npyv_s16 b3 = npyv_loada_s16(data + vstep * 3);
+            npyv_s16 c3 = npyv_loada_s16(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s16 abc0 = npyv_muladd_s16(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s16 abc1 = npyv_muladd_s16(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s16 abc2 = npyv_muladd_s16(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s16 abc3 = npyv_muladd_s16(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_s16(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_s16(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_s16(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_s16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s16 b0 = npyv_load_s16(data + vstep * 0);
+            npyv_s16 c0 = npyv_load_s16(data_out + vstep * 0);
+            
+#line 312
+            npyv_s16 b1 = npyv_load_s16(data + vstep * 1);
+            npyv_s16 c1 = npyv_load_s16(data_out + vstep * 1);
+            
+#line 312
+            npyv_s16 b2 = npyv_load_s16(data + vstep * 2);
+            npyv_s16 c2 = npyv_load_s16(data_out + vstep * 2);
+            
+#line 312
+            npyv_s16 b3 = npyv_load_s16(data + vstep * 3);
+            npyv_s16 c3 = npyv_load_s16(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s16 abc0 = npyv_muladd_s16(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s16 abc1 = npyv_muladd_s16(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s16 abc2 = npyv_muladd_s16(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s16 abc3 = npyv_muladd_s16(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_s16(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_s16(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_s16(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_s16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_s16 a = npyv_load_tillz_s16(data, count);
+        npyv_s16 b = npyv_load_tillz_s16(data_out, count);
+        npyv_store_till_s16(data_out, count, npyv_muladd_s16(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_short b0 = (data[0]);
+        const npy_short c0 = (data_out[0]);
+        
+#line 340
+        const npy_short b1 = (data[1]);
+        const npy_short c1 = (data_out[1]);
+        
+#line 340
+        const npy_short b2 = (data[2]);
+        const npy_short c2 = (data_out[2]);
+        
+#line 340
+        const npy_short b3 = (data[3]);
+        const npy_short c3 = (data_out[3]);
+        /* all four inputs and outputs are read before any result is written back */
+        #line 346
+        const npy_short abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_short abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_short abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_short abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* remaining elements, one at a time */
+        const npy_short b = (*data);
+        const npy_short c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_short
+}
+/*
+ * short_sum_of_products_contig_two: element-wise multiply-accumulate over the
+ * three npy_short arrays reached through dataptr[0], dataptr[1], dataptr[2]:
+ *
+ *     data_out[i] = data0[i] * data1[i] + data_out[i]    for i in [0, count)
+ *
+ * All three pointers advance one element per item. The strides argument is
+ * marked unused and `nop` is never read in the body.
+ */
+static void
+short_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *data0 = (npy_short *)dataptr[0];
+    npy_short *data1 = (npy_short *)dataptr[1];
+    npy_short *data_out = (npy_short *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_short (literal 0 below: the npyv_* branch is compiled out)
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s16;
+    /* Both variants below consume 4 * vstep elements of each array per iteration. */
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s16 a0 = npyv_loada_s16(data0 + vstep * 0);
+            npyv_s16 b0 = npyv_loada_s16(data1 + vstep * 0);
+            npyv_s16 c0 = npyv_loada_s16(data_out + vstep * 0);
+            
+#line 390
+            npyv_s16 a1 = npyv_loada_s16(data0 + vstep * 1);
+            npyv_s16 b1 = npyv_loada_s16(data1 + vstep * 1);
+            npyv_s16 c1 = npyv_loada_s16(data_out + vstep * 1);
+            
+#line 390
+            npyv_s16 a2 = npyv_loada_s16(data0 + vstep * 2);
+            npyv_s16 b2 = npyv_loada_s16(data1 + vstep * 2);
+            npyv_s16 c2 = npyv_loada_s16(data_out + vstep * 2);
+            
+#line 390
+            npyv_s16 a3 = npyv_loada_s16(data0 + vstep * 3);
+            npyv_s16 b3 = npyv_loada_s16(data1 + vstep * 3);
+            npyv_s16 c3 = npyv_loada_s16(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s16 abc0 = npyv_muladd_s16(a0, b0, c0);
+            
+#line 397
+            npyv_s16 abc1 = npyv_muladd_s16(a1, b1, c1);
+            
+#line 397
+            npyv_s16 abc2 = npyv_muladd_s16(a2, b2, c2);
+            
+#line 397
+            npyv_s16 abc3 = npyv_muladd_s16(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_s16(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_s16(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_s16(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_s16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Not all pointers aligned: same loop using npyv_load_s16 / npyv_store_s16. */
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s16 a0 = npyv_load_s16(data0 + vstep * 0);
+            npyv_s16 b0 = npyv_load_s16(data1 + vstep * 0);
+            npyv_s16 c0 = npyv_load_s16(data_out + vstep * 0);
+            
+#line 390
+            npyv_s16 a1 = npyv_load_s16(data0 + vstep * 1);
+            npyv_s16 b1 = npyv_load_s16(data1 + vstep * 1);
+            npyv_s16 c1 = npyv_load_s16(data_out + vstep * 1);
+            
+#line 390
+            npyv_s16 a2 = npyv_load_s16(data0 + vstep * 2);
+            npyv_s16 b2 = npyv_load_s16(data1 + vstep * 2);
+            npyv_s16 c2 = npyv_load_s16(data_out + vstep * 2);
+            
+#line 390
+            npyv_s16 a3 = npyv_load_s16(data0 + vstep * 3);
+            npyv_s16 b3 = npyv_load_s16(data1 + vstep * 3);
+            npyv_s16 c3 = npyv_load_s16(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s16 abc0 = npyv_muladd_s16(a0, b0, c0);
+            
+#line 397
+            npyv_s16 abc1 = npyv_muladd_s16(a1, b1, c1);
+            
+#line 397
+            npyv_s16 abc2 = npyv_muladd_s16(a2, b2, c2);
+            
+#line 397
+            npyv_s16 abc3 = npyv_muladd_s16(a3, b3, c3);
+            
+            #line 402
+            npyv_store_s16(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_s16(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_s16(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_s16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Remainder, one vector step at a time; the *_tillz / *_till calls receive the count still left. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_s16 a = npyv_load_tillz_s16(data0, count);
+        npyv_s16 b = npyv_load_tillz_s16(data1, count);
+        npyv_s16 c = npyv_load_tillz_s16(data_out, count);
+        npyv_store_till_s16(data_out, count, npyv_muladd_s16(a, b, c));
+    }
+    npyv_cleanup();
+#else /* scalar path: the branch actually compiled, given the literal 0 above */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_short a0 = (data0[0]);
+        const npy_short b0 = (data1[0]);
+        const npy_short c0 = (data_out[0]);
+        
+#line 420
+        const npy_short a1 = (data0[1]);
+        const npy_short b1 = (data1[1]);
+        const npy_short c1 = (data_out[1]);
+        
+#line 420
+        const npy_short a2 = (data0[2]);
+        const npy_short b2 = (data1[2]);
+        const npy_short c2 = (data_out[2]);
+        
+#line 420
+        const npy_short a3 = (data0[3]);
+        const npy_short b3 = (data1[3]);
+        const npy_short c3 = (data_out[3]);
+        /* All four triples are loaded before any result is computed or stored. */
+        #line 427
+        const npy_short abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_short abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_short abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_short abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION (the loop below handles whatever count is left, one element at a time)
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_short a = (*data0);
+        const npy_short b = (*data1);
+        const npy_short c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_short
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand kernel where operand 0 supplies a single value: that value is
+ * read once from dataptr[0] and handed, together with the operand-1 array and
+ * the output array, to short_sum_of_products_muladd for `count` elements.
+ */
+static void
+short_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_short scalar = *(npy_short *)dataptr[0];
+    npy_short *src = (npy_short *)dataptr[1];
+    npy_short *dst = (npy_short *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    /* dst[i] is updated in place from src[i] and the scalar by the helper. */
+    short_sum_of_products_muladd(src, dst, scalar, count);
+}
+
+/*
+ * Mirror image of the stride0/contig kernel: here operand 1 supplies the
+ * single value (read once from dataptr[1]) and operand 0 is the array that is
+ * passed with the output array to short_sum_of_products_muladd.
+ */
+static void
+short_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_short scalar = *(npy_short *)dataptr[1];
+    npy_short *src = (npy_short *)dataptr[0];
+    npy_short *dst = (npy_short *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    /* Same helper call as the stride0/contig variant, with the operand roles swapped. */
+    short_sum_of_products_muladd(src, dst, scalar, count);
+}
+/*
+ * short_sum_of_products_contig_contig_outstride0_two: reduction of the
+ * element-wise products of two npy_short arrays into one output element.
+ * data0[i] * data1[i] is accumulated over i in [0, count) into the local
+ * npy_short `accum`, which is then added to the value at dataptr[2].
+ * The strides argument is marked unused and `nop` is never read.
+ */
+static NPY_GCC_OPT_3 void
+short_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *data0 = (npy_short *)dataptr[0];
+    npy_short *data1 = (npy_short *)dataptr[1];
+    npy_short accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_short (literal 0: the npyv_* branch is compiled out)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_s16;
+    npyv_s16 v_accum = npyv_zero_s16();
+    /* Both variants below fold 4 * vstep element pairs into v_accum per iteration. */
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s16 a0 = npyv_loada_s16(data0 + vstep * 0);
+            npyv_s16 b0 = npyv_loada_s16(data1 + vstep * 0);
+            
+#line 501
+            npyv_s16 a1 = npyv_loada_s16(data0 + vstep * 1);
+            npyv_s16 b1 = npyv_loada_s16(data1 + vstep * 1);
+            
+#line 501
+            npyv_s16 a2 = npyv_loada_s16(data0 + vstep * 2);
+            npyv_s16 b2 = npyv_loada_s16(data1 + vstep * 2);
+            
+#line 501
+            npyv_s16 a3 = npyv_loada_s16(data0 + vstep * 3);
+            npyv_s16 b3 = npyv_loada_s16(data1 + vstep * 3);
+            /* The four multiply-adds are chained: each result is the addend of the next. */
+            npyv_s16 ab3 = npyv_muladd_s16(a3, b3, v_accum);
+            npyv_s16 ab2 = npyv_muladd_s16(a2, b2, ab3);
+            npyv_s16 ab1 = npyv_muladd_s16(a1, b1, ab2);
+                   v_accum = npyv_muladd_s16(a0, b0, ab1);
+        }
+    }
+    /* Inputs not both aligned: same loop using npyv_load_s16. */
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s16 a0 = npyv_load_s16(data0 + vstep * 0);
+            npyv_s16 b0 = npyv_load_s16(data1 + vstep * 0);
+            
+#line 501
+            npyv_s16 a1 = npyv_load_s16(data0 + vstep * 1);
+            npyv_s16 b1 = npyv_load_s16(data1 + vstep * 1);
+            
+#line 501
+            npyv_s16 a2 = npyv_load_s16(data0 + vstep * 2);
+            npyv_s16 b2 = npyv_load_s16(data1 + vstep * 2);
+            
+#line 501
+            npyv_s16 a3 = npyv_load_s16(data0 + vstep * 3);
+            npyv_s16 b3 = npyv_load_s16(data1 + vstep * 3);
+            
+            npyv_s16 ab3 = npyv_muladd_s16(a3, b3, v_accum);
+            npyv_s16 ab2 = npyv_muladd_s16(a2, b2, ab3);
+            npyv_s16 ab1 = npyv_muladd_s16(a1, b1, ab2);
+                   v_accum = npyv_muladd_s16(a0, b0, ab1);
+        }
+    }
+    /* Remainder, one vector step at a time; the *_tillz loads receive the count still left. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_s16 a = npyv_load_tillz_s16(data0, count);
+        npyv_s16 b = npyv_load_tillz_s16(data1, count);
+        v_accum = npyv_muladd_s16(a, b, v_accum);
+    }
+    accum = npyv_sum_s16(v_accum);
+    npyv_cleanup();
+#else /* scalar path: the branch actually compiled, given the literal 0 above */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_short ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_short ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_short ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_short ab3 = (data0[3]) * (data1[3]);
+        /* Each product is stored as npy_short before the four are summed into accum. */
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION (the loop below handles whatever count is left)
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_short a = (*data0);
+        const npy_short b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_short
+    *(npy_short *)dataptr[2] = ((*(npy_short *)dataptr[2]) + accum);
+}
+
+/*
+ * Two-operand reduction where operand 0 is a single value: the operand-1
+ * array is reduced with short_sum_of_arr, the result is multiplied by the
+ * value read from dataptr[0], and the product is added to the output element
+ * at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+short_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *out = (npy_short *)dataptr[2];
+    /* The scalar is read before the array is reduced, as in the original ordering. */
+    const npy_short scale = *(npy_short *)dataptr[0];
+    const npy_short total = short_sum_of_arr((npy_short *)dataptr[1], count);
+
+    *out = *out + scale * total;
+}
+
+/*
+ * Two-operand reduction where operand 1 is a single value: the operand-0
+ * array is reduced with short_sum_of_arr, the result is multiplied by the
+ * value read from dataptr[1], and the product is added to the output element
+ * at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+short_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *out = (npy_short *)dataptr[2];
+    npy_short *values = (npy_short *)dataptr[0];
+    /* The scalar is read before the array is reduced, as in the original ordering. */
+    const npy_short scale = *(npy_short *)dataptr[1];
+    const npy_short total = short_sum_of_arr(values, count);
+
+    *out = *out + scale * total;
+}
+
+#elif 1000 == 3 && !0
+/*
+ * short_sum_of_products_contig_three: three-operand multiply-accumulate over
+ * npy_short arrays reached through dataptr[0..3]:
+ *
+ *     data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]
+ *
+ * Elements are processed eight per loop iteration, then the remainder one at
+ * a time. The strides argument is marked unused and `nop` is never read.
+ * The enclosing `#elif 1000 == 3 && !0` condition is false as written, so
+ * this definition is not compiled in this expansion.
+ */
+static void
+short_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_short *data0 = (npy_short *)dataptr[0];
+    npy_short *data1 = (npy_short *)dataptr[1];
+    npy_short *data2 = (npy_short *)dataptr[2];
+    npy_short *data_out = (npy_short *)dataptr[3];
+
+    /* Unroll the loop by 8: indices 0..7 are updated, then all four pointers step by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: each step returns if count is 0, otherwise decrements it and updates the next index */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1000 > 3 || @complex */
+/*
+ * short_sum_of_products_contig_any: generic kernel for `nop` operands.
+ * For each of `count` items it multiplies together the current npy_short
+ * element of dataptr[0] .. dataptr[nop - 1], adds that product to the element
+ * at dataptr[nop], and then advances all nop + 1 entries of dataptr by
+ * sizeof(npy_short). Note that the caller's dataptr array is modified.
+ * The strides argument is marked unused. Only the `#if !0` arm below is
+ * compiled; the `#else` (complex) arm is dead as written.
+ */
+static void
+short_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+        npy_short temp = (*(npy_short *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_short *)dataptr[i]);
+        }
+        *(npy_short *)dataptr[nop] = (temp +
+                                           (*(npy_short *)dataptr[i])); /* i == nop here whenever nop >= 1 */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_short);
+        }
+#else /* complex */
+#  if 1000 <= 3
+#    define _SUMPROD_NOP 1000
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_short re, im, tmp;
+        int i;
+        re = ((npy_short *)dataptr[0])[0];
+        im = ((npy_short *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_short *)dataptr[i])[0] -
+                  im * ((npy_short *)dataptr[i])[1];
+            im = re * ((npy_short *)dataptr[i])[1] +
+                 im * ((npy_short *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_short *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_short *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_short *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_short *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_short);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+/*
+ * short_sum_of_products_contig_outstride0_one: single-operand reduction.
+ * In the compiled `#if !0` arm, the npy_short array at dataptr[0] is passed
+ * with `count` to short_sum_of_arr and the returned value is added to the
+ * output element at dataptr[1]. short_sum_of_arr is defined outside this
+ * view; presumably it returns the sum of the elements -- TODO confirm.
+ * `nop` and `strides` are never read. The enclosing `#if 1000 == 1` is false
+ * as written, so this definition is not compiled in this expansion.
+ */
+static NPY_GCC_OPT_3 void
+short_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_short *data = (npy_short *)dataptr[0];
+    npy_short accum = short_sum_of_arr(data, count);
+    *((npy_short *)dataptr[1]) = (accum + (*((npy_short *)dataptr[1])));
+#else
+    npy_short accum_re = 0, accum_im = 0;
+    npy_short *data0 = (npy_short *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_short re01 = data0[0] + data0[2];
+        const npy_short re23 = data0[4] + data0[6];
+        const npy_short im13 = data0[1] + data0[3];
+        const npy_short im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_short *)dataptr[1])[0] += accum_re;
+    ((npy_short *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1000 == 1 */
+/*
+ * short_sum_of_products_outstride0_any: generic reduction for `nop` operands
+ * into a single output element. With the literal conditions as written, the
+ * compiled path is: for each of `count` items, multiply the current npy_short
+ * element of dataptr[0] .. dataptr[nop - 1], add the product to the local
+ * `accum`, and advance dataptr[i] by strides[i] for i < nop (the output
+ * pointer dataptr[nop] is not advanced). After the loop, `accum` is added to
+ * the npy_short at dataptr[nop]. The caller's dataptr array is modified.
+ */
+static void
+short_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_short accum_re = 0, accum_im = 0;
+#else
+    npy_short accum = 0;
+#endif
+/* None of the three conditions below hold as written, so no per-operand locals are declared. */
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("short_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+    /* One product per iteration; of the arms below only the innermost generic `# else` is compiled. */
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        accum += (*(npy_short *)data0);
+        data0 += stride0;
+#  elif 1000 == 2
+        accum += (*(npy_short *)data0) *
+                 (*(npy_short *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1000 == 3
+        accum += (*(npy_short *)data0) *
+                 (*(npy_short *)data1) *
+                 (*(npy_short *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_short temp = (*(npy_short *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_short *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        accum_re += ((npy_short *)data0)[0];
+        accum_im += ((npy_short *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_short re, im, tmp;
+        int i;
+        re = ((npy_short *)dataptr[0])[0];
+        im = ((npy_short *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_short *)dataptr[i])[0] -
+                  im * ((npy_short *)dataptr[i])[1];
+            im = re * ((npy_short *)dataptr[i])[1] +
+                 im * ((npy_short *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+/* Final write-back: as written, the `#else` / `#  else` arm (dataptr[nop]) is the one compiled. */
+#if 0
+#  if 1000 <= 3
+    ((npy_short *)dataptr[1000])[0] += accum_re;
+    ((npy_short *)dataptr[1000])[1] += accum_im;
+#  else
+    ((npy_short *)dataptr[nop])[0] += accum_re;
+    ((npy_short *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1000 <= 3
+    *((npy_short *)dataptr[1000]) = (accum +
+                                    (*((npy_short *)dataptr[1000])));
+#  else
+    *((npy_short *)dataptr[nop]) = (accum +
+                                    (*((npy_short *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+
+
+#line 74
+/*
+ * int_sum_of_arr: returns the sum of the `count` npy_int values starting at
+ * `data`, accumulated in an npy_int.
+ * The NPYV branch is guarded by a literal 0 and is compiled out; the scalar
+ * branch adds groups of four while count > 4 (unless NPY_DISABLE_OPTIMIZATION
+ * is defined) and then the rest one element at a time.
+ */
+#if !0
+static NPY_GCC_OPT_3 npy_int int_sum_of_arr(npy_int *data, npy_intp count)
+{
+    npy_int accum = 0;
+#if 0 // NPYV check for npy_int (literal 0: the npyv_* branch is compiled out)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data);
+    const int vstep = npyv_nlanes_s32;
+    npyv_s32 v_accum = npyv_zero_s32();
+    const npy_intp vstepx4 = vstep * 4;
+    /* Both variants below add 4 * vstep elements into v_accum per iteration. */
+    #line 91
+    if(is_aligned) {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_s32 a0 = npyv_loada_s32(data + vstep * 0);
+            
+#line 96
+            npyv_s32 a1 = npyv_loada_s32(data + vstep * 1);
+            
+#line 96
+            npyv_s32 a2 = npyv_loada_s32(data + vstep * 2);
+            
+#line 96
+            npyv_s32 a3 = npyv_loada_s32(data + vstep * 3);
+            /* Pairwise adds (a0+a1, a2+a3) are combined before touching v_accum. */
+            npyv_s32 a01   = npyv_add_s32(a0, a1);
+            npyv_s32 a23   = npyv_add_s32(a2, a3);
+            npyv_s32 a0123 = npyv_add_s32(a01, a23);
+                     v_accum = npyv_add_s32(a0123, v_accum);
+        }
+    }
+    /* `data` not aligned: same loop using npyv_load_s32. */
+#line 91
+    else {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_s32 a0 = npyv_load_s32(data + vstep * 0);
+            
+#line 96
+            npyv_s32 a1 = npyv_load_s32(data + vstep * 1);
+            
+#line 96
+            npyv_s32 a2 = npyv_load_s32(data + vstep * 2);
+            
+#line 96
+            npyv_s32 a3 = npyv_load_s32(data + vstep * 3);
+            
+            npyv_s32 a01   = npyv_add_s32(a0, a1);
+            npyv_s32 a23   = npyv_add_s32(a2, a3);
+            npyv_s32 a0123 = npyv_add_s32(a01, a23);
+                     v_accum = npyv_add_s32(a0123, v_accum);
+        }
+    }
+    /* Remainder, one vector step at a time; the *_tillz load receives the count still left. */
+    for (; count > 0; count -= vstep, data += vstep) {
+        npyv_s32 a = npyv_load_tillz_s32(data, count);
+        v_accum = npyv_add_s32(a, v_accum);
+    }
+    accum = npyv_sum_s32(v_accum);
+    npyv_cleanup();
+#else /* scalar path: the branch actually compiled, given the literal 0 above */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data += 4) {
+        const npy_int a01 = (*data) + (data[1]);
+        const npy_int a23 = (data[2]) + (data[3]);
+        accum +=  a01 + a23;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION (count > 4 above leaves a final group of exactly four to the loop below)
+    for (; count > 0; --count, data++) {
+        accum += (*data);
+    }
+#endif // NPYV check for npy_int
+    return accum;
+}
+#endif
+/*
+ * int_sum_of_products_one: single-operand strided kernel for npy_int.
+ * With the literal conditions as written, the compiled path does, `count`
+ * times:
+ *
+ *     *(npy_int *)data_out = *(npy_int *)data0 + *(npy_int *)data_out
+ *
+ * where data0 starts at dataptr[0] and data_out at dataptr[1]; both are
+ * char pointers, advanced by strides[0] and strides[1] respectively after
+ * each item. `nop` is not read on this path.
+ */
+#line 131
+static void
+int_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data_out = dataptr[1];
+    npy_intp stride_out = strides[1];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_one (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 1 == 1
+        *(npy_int *)data_out = ((*(npy_int *)data0) +
+                                         (*(npy_int *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1 == 2
+        *(npy_int *)data_out = ((*(npy_int *)data0) *
+                                         (*(npy_int *)data1) +
+                                         (*(npy_int *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1 == 3
+        *(npy_int *)data_out = ((*(npy_int *)data0) *
+                                         (*(npy_int *)data1) *
+                                         (*(npy_int *)data2) +
+                                         (*(npy_int *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_int temp = (*(npy_int *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_int *)dataptr[i]);
+        }
+        *(npy_int *)dataptr[nop] = (temp +
+                                           (*(npy_int *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        ((npy_int *)data_out)[0] = ((npy_int *)data0)[0] +
+                                         ((npy_int *)data_out)[0];
+        ((npy_int *)data_out)[1] = ((npy_int *)data0)[1] +
+                                         ((npy_int *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_int re, im, tmp;
+        int i;
+        re = ((npy_int *)dataptr[0])[0];
+        im = ((npy_int *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_int *)dataptr[i])[0] -
+                  im * ((npy_int *)dataptr[i])[1];
+            im = re * ((npy_int *)dataptr[i])[1] +
+                 im * ((npy_int *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_int *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_int *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_int *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_int *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1 == 1
+
+static void
+int_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_int *data0 = (npy_int *)dataptr[0];
+    npy_int *data_out = (npy_int *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_int *)data_out + 2*6)[0] =
+                                    ((npy_int *)data0 + 2*6)[0] +
+                                    ((npy_int *)data_out + 2*6)[0];
+            ((npy_int *)data_out + 2*6)[1] =
+                                    ((npy_int *)data0 + 2*6)[1] +
+                                    ((npy_int *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_int *)data_out + 2*5)[0] =
+                                    ((npy_int *)data0 + 2*5)[0] +
+                                    ((npy_int *)data_out + 2*5)[0];
+            ((npy_int *)data_out + 2*5)[1] =
+                                    ((npy_int *)data0 + 2*5)[1] +
+                                    ((npy_int *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_int *)data_out + 2*4)[0] =
+                                    ((npy_int *)data0 + 2*4)[0] +
+                                    ((npy_int *)data_out + 2*4)[0];
+            ((npy_int *)data_out + 2*4)[1] =
+                                    ((npy_int *)data0 + 2*4)[1] +
+                                    ((npy_int *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_int *)data_out + 2*3)[0] =
+                                    ((npy_int *)data0 + 2*3)[0] +
+                                    ((npy_int *)data_out + 2*3)[0];
+            ((npy_int *)data_out + 2*3)[1] =
+                                    ((npy_int *)data0 + 2*3)[1] +
+                                    ((npy_int *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_int *)data_out + 2*2)[0] =
+                                    ((npy_int *)data0 + 2*2)[0] +
+                                    ((npy_int *)data_out + 2*2)[0];
+            ((npy_int *)data_out + 2*2)[1] =
+                                    ((npy_int *)data0 + 2*2)[1] +
+                                    ((npy_int *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_int *)data_out + 2*1)[0] =
+                                    ((npy_int *)data0 + 2*1)[0] +
+                                    ((npy_int *)data_out + 2*1)[0];
+            ((npy_int *)data_out + 2*1)[1] =
+                                    ((npy_int *)data0 + 2*1)[1] +
+                                    ((npy_int *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_int *)data_out + 2*0)[0] =
+                                    ((npy_int *)data0 + 2*0)[0] +
+                                    ((npy_int *)data_out + 2*0)[0];
+            ((npy_int *)data_out + 2*0)[1] =
+                                    ((npy_int *)data0 + 2*0)[1] +
+                                    ((npy_int *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_int *)data_out + 2*0)[0] =
+                                ((npy_int *)data0 + 2*0)[0] +
+                                ((npy_int *)data_out + 2*0)[0];
+        ((npy_int *)data_out + 2*0)[1] =
+                                ((npy_int *)data0 + 2*0)[1] +
+                                ((npy_int *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_int *)data_out + 2*1)[0] =
+                                ((npy_int *)data0 + 2*1)[0] +
+                                ((npy_int *)data_out + 2*1)[0];
+        ((npy_int *)data_out + 2*1)[1] =
+                                ((npy_int *)data0 + 2*1)[1] +
+                                ((npy_int *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_int *)data_out + 2*2)[0] =
+                                ((npy_int *)data0 + 2*2)[0] +
+                                ((npy_int *)data_out + 2*2)[0];
+        ((npy_int *)data_out + 2*2)[1] =
+                                ((npy_int *)data0 + 2*2)[1] +
+                                ((npy_int *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_int *)data_out + 2*3)[0] =
+                                ((npy_int *)data0 + 2*3)[0] +
+                                ((npy_int *)data_out + 2*3)[0];
+        ((npy_int *)data_out + 2*3)[1] =
+                                ((npy_int *)data0 + 2*3)[1] +
+                                ((npy_int *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_int *)data_out + 2*4)[0] =
+                                ((npy_int *)data0 + 2*4)[0] +
+                                ((npy_int *)data_out + 2*4)[0];
+        ((npy_int *)data_out + 2*4)[1] =
+                                ((npy_int *)data0 + 2*4)[1] +
+                                ((npy_int *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_int *)data_out + 2*5)[0] =
+                                ((npy_int *)data0 + 2*5)[0] +
+                                ((npy_int *)data_out + 2*5)[0];
+        ((npy_int *)data_out + 2*5)[1] =
+                                ((npy_int *)data0 + 2*5)[1] +
+                                ((npy_int *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_int *)data_out + 2*6)[0] =
+                                ((npy_int *)data0 + 2*6)[0] +
+                                ((npy_int *)data_out + 2*6)[0];
+        ((npy_int *)data_out + 2*6)[1] =
+                                ((npy_int *)data0 + 2*6)[1] +
+                                ((npy_int *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_int *)data_out + 2*7)[0] =
+                                ((npy_int *)data0 + 2*7)[0] +
+                                ((npy_int *)data_out + 2*7)[0];
+        ((npy_int *)data_out + 2*7)[1] =
+                                ((npy_int *)data0 + 2*7)[1] +
+                                ((npy_int *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1 == 2 && !0
+
+// Multiply-add over contiguous arrays: data_out[i] = scalar * data[i] + data_out[i] for count elements
+static NPY_GCC_OPT_3 void
+int_sum_of_products_muladd(npy_int *data, npy_int *data_out, npy_int scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_int (literal 0 in this expansion: everything up to the #else is compiled out)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s32;
+    const npyv_s32 v_scalar = npyv_setall_s32(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4; /* elements consumed per unrolled iteration */
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s32 b0 = npyv_loada_s32(data + vstep * 0);
+            npyv_s32 c0 = npyv_loada_s32(data_out + vstep * 0);
+            
+#line 312
+            npyv_s32 b1 = npyv_loada_s32(data + vstep * 1);
+            npyv_s32 c1 = npyv_loada_s32(data_out + vstep * 1);
+            
+#line 312
+            npyv_s32 b2 = npyv_loada_s32(data + vstep * 2);
+            npyv_s32 c2 = npyv_loada_s32(data_out + vstep * 2);
+            
+#line 312
+            npyv_s32 b3 = npyv_loada_s32(data + vstep * 3);
+            npyv_s32 c3 = npyv_loada_s32(data_out + vstep * 3);
+            /* combine: one npyv_muladd_s32 per loaded (data, data_out) vector pair */
+            #line 318
+            npyv_s32 abc0 = npyv_muladd_s32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s32 abc1 = npyv_muladd_s32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s32 abc2 = npyv_muladd_s32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s32 abc3 = npyv_muladd_s32(v_scalar, b3, c3);
+            /* write the four results back over data_out (storea variant, is_aligned path only) */
+            #line 323
+            npyv_storea_s32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_s32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_s32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_s32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Same unrolled loop, using the npyv_load_s32/npyv_store_s32 variants instead */
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s32 b0 = npyv_load_s32(data + vstep * 0);
+            npyv_s32 c0 = npyv_load_s32(data_out + vstep * 0);
+            
+#line 312
+            npyv_s32 b1 = npyv_load_s32(data + vstep * 1);
+            npyv_s32 c1 = npyv_load_s32(data_out + vstep * 1);
+            
+#line 312
+            npyv_s32 b2 = npyv_load_s32(data + vstep * 2);
+            npyv_s32 c2 = npyv_load_s32(data_out + vstep * 2);
+            
+#line 312
+            npyv_s32 b3 = npyv_load_s32(data + vstep * 3);
+            npyv_s32 c3 = npyv_load_s32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s32 abc0 = npyv_muladd_s32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s32 abc1 = npyv_muladd_s32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s32 abc2 = npyv_muladd_s32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s32 abc3 = npyv_muladd_s32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_s32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_s32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_s32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_s32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Remainder: one vector per step; the _tillz/_till helpers are handed the remaining count */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_s32 a = npyv_load_tillz_s32(data, count);
+        npyv_s32 b = npyv_load_tillz_s32(data_out, count);
+        npyv_store_till_s32(data_out, count, npyv_muladd_s32(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* scalar implementation: the branch that is compiled in this expansion */
+#ifndef NPY_DISABLE_OPTIMIZATION /* manual 4x unroll, skipped when this macro is defined */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_int b0 = (data[0]);
+        const npy_int c0 = (data_out[0]);
+        
+#line 340
+        const npy_int b1 = (data[1]);
+        const npy_int c1 = (data_out[1]);
+        
+#line 340
+        const npy_int b2 = (data[2]);
+        const npy_int c2 = (data_out[2]);
+        
+#line 340
+        const npy_int b3 = (data[3]);
+        const npy_int c3 = (data_out[3]);
+        /* all four results are computed before any of them is stored */
+        #line 346
+        const npy_int abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_int abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_int abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_int abc3 = scalar * b3 + c3;
+        /* write back over data_out */
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* leftover elements (all of them if unrolling is disabled) */
+        const npy_int b = (*data);
+        const npy_int c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_int
+}
+
+static void
+int_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Contiguous two-operand kernel: data_out[i] = data0[i] * data1[i] + data_out[i] for count elements */
+    npy_int *data0 = (npy_int *)dataptr[0];
+    npy_int *data1 = (npy_int *)dataptr[1];
+    npy_int *data_out = (npy_int *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_int: the condition below is literal 0, so only the #else branch is compiled
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s32;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4; /* elements consumed per unrolled iteration */
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s32 a0 = npyv_loada_s32(data0 + vstep * 0);
+            npyv_s32 b0 = npyv_loada_s32(data1 + vstep * 0);
+            npyv_s32 c0 = npyv_loada_s32(data_out + vstep * 0);
+            
+#line 390
+            npyv_s32 a1 = npyv_loada_s32(data0 + vstep * 1);
+            npyv_s32 b1 = npyv_loada_s32(data1 + vstep * 1);
+            npyv_s32 c1 = npyv_loada_s32(data_out + vstep * 1);
+            
+#line 390
+            npyv_s32 a2 = npyv_loada_s32(data0 + vstep * 2);
+            npyv_s32 b2 = npyv_loada_s32(data1 + vstep * 2);
+            npyv_s32 c2 = npyv_loada_s32(data_out + vstep * 2);
+            
+#line 390
+            npyv_s32 a3 = npyv_loada_s32(data0 + vstep * 3);
+            npyv_s32 b3 = npyv_loada_s32(data1 + vstep * 3);
+            npyv_s32 c3 = npyv_loada_s32(data_out + vstep * 3);
+            /* combine: one npyv_muladd_s32 per loaded (data0, data1, data_out) vector triple */
+            #line 397
+            npyv_s32 abc0 = npyv_muladd_s32(a0, b0, c0);
+            
+#line 397
+            npyv_s32 abc1 = npyv_muladd_s32(a1, b1, c1);
+            
+#line 397
+            npyv_s32 abc2 = npyv_muladd_s32(a2, b2, c2);
+            
+#line 397
+            npyv_s32 abc3 = npyv_muladd_s32(a3, b3, c3);
+            /* write the four results back over data_out (storea variant, is_aligned path only) */
+            #line 402
+            npyv_storea_s32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_s32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_s32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_s32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Same unrolled loop, using the npyv_load_s32/npyv_store_s32 variants instead */
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s32 a0 = npyv_load_s32(data0 + vstep * 0);
+            npyv_s32 b0 = npyv_load_s32(data1 + vstep * 0);
+            npyv_s32 c0 = npyv_load_s32(data_out + vstep * 0);
+            
+#line 390
+            npyv_s32 a1 = npyv_load_s32(data0 + vstep * 1);
+            npyv_s32 b1 = npyv_load_s32(data1 + vstep * 1);
+            npyv_s32 c1 = npyv_load_s32(data_out + vstep * 1);
+            
+#line 390
+            npyv_s32 a2 = npyv_load_s32(data0 + vstep * 2);
+            npyv_s32 b2 = npyv_load_s32(data1 + vstep * 2);
+            npyv_s32 c2 = npyv_load_s32(data_out + vstep * 2);
+            
+#line 390
+            npyv_s32 a3 = npyv_load_s32(data0 + vstep * 3);
+            npyv_s32 b3 = npyv_load_s32(data1 + vstep * 3);
+            npyv_s32 c3 = npyv_load_s32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s32 abc0 = npyv_muladd_s32(a0, b0, c0);
+            
+#line 397
+            npyv_s32 abc1 = npyv_muladd_s32(a1, b1, c1);
+            
+#line 397
+            npyv_s32 abc2 = npyv_muladd_s32(a2, b2, c2);
+            
+#line 397
+            npyv_s32 abc3 = npyv_muladd_s32(a3, b3, c3);
+            
+            #line 402
+            npyv_store_s32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_s32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_s32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_s32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Remainder: one vector per step; the _tillz/_till helpers are handed the remaining count */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_s32 a = npyv_load_tillz_s32(data0, count);
+        npyv_s32 b = npyv_load_tillz_s32(data1, count);
+        npyv_s32 c = npyv_load_tillz_s32(data_out, count);
+        npyv_store_till_s32(data_out, count, npyv_muladd_s32(a, b, c));
+    }
+    npyv_cleanup();
+#else /* scalar implementation: the branch that is compiled in this expansion */
+#ifndef NPY_DISABLE_OPTIMIZATION /* manual 4x unroll, skipped when this macro is defined */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_int a0 = (data0[0]);
+        const npy_int b0 = (data1[0]);
+        const npy_int c0 = (data_out[0]);
+        
+#line 420
+        const npy_int a1 = (data0[1]);
+        const npy_int b1 = (data1[1]);
+        const npy_int c1 = (data_out[1]);
+        
+#line 420
+        const npy_int a2 = (data0[2]);
+        const npy_int b2 = (data1[2]);
+        const npy_int c2 = (data_out[2]);
+        
+#line 420
+        const npy_int a3 = (data0[3]);
+        const npy_int b3 = (data1[3]);
+        const npy_int c3 = (data_out[3]);
+        /* all four results are computed before any of them is stored */
+        #line 427
+        const npy_int abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_int abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_int abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_int abc3 = a3 * b3 + c3;
+        /* write back over data_out */
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* leftover elements (all of them if unrolling is disabled) */
+        const npy_int a = (*data0);
+        const npy_int b = (*data1);
+        const npy_int c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_int
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+int_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Operand 0 supplies a single value; operand 1 and the output are walked as contiguous arrays */
+    npy_int value0 = (*(npy_int *)dataptr[0]); /* read once, then used as the scalar factor */
+    npy_int *data1 = (npy_int *)dataptr[1];
+    npy_int *data_out = (npy_int *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    int_sum_of_products_muladd(data1, data_out, value0, count); /* all arithmetic is delegated to the muladd helper */
+    
+}
+
+static void
+int_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Operand 1 supplies a single value; operand 0 and the output are walked as contiguous arrays */
+    npy_int value1 = (*(npy_int *)dataptr[1]); /* read once, then used as the scalar factor */
+    npy_int *data0 = (npy_int *)dataptr[0];
+    npy_int *data_out = (npy_int *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    int_sum_of_products_muladd(data0, data_out, value1, count); /* all arithmetic is delegated to the muladd helper */
+}
+
+static NPY_GCC_OPT_3 void
+int_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Reduction: adds the sum of data0[i] * data1[i] over count elements to the single value at dataptr[2] */
+    npy_int *data0 = (npy_int *)dataptr[0];
+    npy_int *data1 = (npy_int *)dataptr[1];
+    npy_int accum = 0; /* local running total; the output is touched only once, at the end */
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_int (literal 0 in this expansion: everything up to the #else is compiled out)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_s32;
+    npyv_s32 v_accum = npyv_zero_s32();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4; /* elements consumed per unrolled iteration */
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s32 a0 = npyv_loada_s32(data0 + vstep * 0);
+            npyv_s32 b0 = npyv_loada_s32(data1 + vstep * 0);
+            
+#line 501
+            npyv_s32 a1 = npyv_loada_s32(data0 + vstep * 1);
+            npyv_s32 b1 = npyv_loada_s32(data1 + vstep * 1);
+            
+#line 501
+            npyv_s32 a2 = npyv_loada_s32(data0 + vstep * 2);
+            npyv_s32 b2 = npyv_loada_s32(data1 + vstep * 2);
+            
+#line 501
+            npyv_s32 a3 = npyv_loada_s32(data0 + vstep * 3);
+            npyv_s32 b3 = npyv_loada_s32(data1 + vstep * 3);
+            /* each npyv_muladd_s32 takes the previous result as its third argument, ending in v_accum */
+            npyv_s32 ab3 = npyv_muladd_s32(a3, b3, v_accum);
+            npyv_s32 ab2 = npyv_muladd_s32(a2, b2, ab3);
+            npyv_s32 ab1 = npyv_muladd_s32(a1, b1, ab2);
+                   v_accum = npyv_muladd_s32(a0, b0, ab1);
+        }
+    }
+    /* Same unrolled loop, using npyv_load_s32 instead of npyv_loada_s32 */
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s32 a0 = npyv_load_s32(data0 + vstep * 0);
+            npyv_s32 b0 = npyv_load_s32(data1 + vstep * 0);
+            
+#line 501
+            npyv_s32 a1 = npyv_load_s32(data0 + vstep * 1);
+            npyv_s32 b1 = npyv_load_s32(data1 + vstep * 1);
+            
+#line 501
+            npyv_s32 a2 = npyv_load_s32(data0 + vstep * 2);
+            npyv_s32 b2 = npyv_load_s32(data1 + vstep * 2);
+            
+#line 501
+            npyv_s32 a3 = npyv_load_s32(data0 + vstep * 3);
+            npyv_s32 b3 = npyv_load_s32(data1 + vstep * 3);
+            
+            npyv_s32 ab3 = npyv_muladd_s32(a3, b3, v_accum);
+            npyv_s32 ab2 = npyv_muladd_s32(a2, b2, ab3);
+            npyv_s32 ab1 = npyv_muladd_s32(a1, b1, ab2);
+                   v_accum = npyv_muladd_s32(a0, b0, ab1);
+        }
+    }
+    /* Remainder: one vector per step; the _tillz loads are handed the remaining count */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_s32 a = npyv_load_tillz_s32(data0, count);
+        npyv_s32 b = npyv_load_tillz_s32(data1, count);
+        v_accum = npyv_muladd_s32(a, b, v_accum);
+    }
+    accum = npyv_sum_s32(v_accum); /* collapse the vector accumulator into the scalar total */
+    npyv_cleanup();
+#else /* scalar implementation: the branch that is compiled in this expansion */
+#ifndef NPY_DISABLE_OPTIMIZATION /* manual 4x unroll, skipped when this macro is defined */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_int ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_int ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_int ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_int ab3 = (data0[3]) * (data1[3]);
+        /* the four products are added to the total in a single statement */
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* leftover elements (all of them if unrolling is disabled) */
+        const npy_int a = (*data0);
+        const npy_int b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_int
+    *(npy_int *)dataptr[2] = ((*(npy_int *)dataptr[2]) + accum); /* single read-modify-write of the output */
+}
+
+static NPY_GCC_OPT_3 void
+int_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Adds value0 * int_sum_of_arr(data1, count) to the single output value at dataptr[2] */
+    npy_int *data1 = (npy_int *)dataptr[1];
+    npy_int value0 = (*(npy_int *)dataptr[0]); /* operand 0 is read once */
+    npy_int accum = int_sum_of_arr(data1, count); /* helper defined elsewhere; presumably the element sum - confirm */
+    *(npy_int *)dataptr[2] = ((*(npy_int *)dataptr[2]) + value0 * accum); /* one multiply, applied to the reduced value */
+}
+
+static NPY_GCC_OPT_3 void
+int_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Adds value1 * int_sum_of_arr(data0, count) to the single output value at dataptr[2] */
+    npy_int *data0 = (npy_int *)dataptr[0];
+    npy_int value1 = (*(npy_int *)dataptr[1]); /* operand 1 is read once */
+    npy_int accum = int_sum_of_arr(data0, count); /* helper defined elsewhere; presumably the element sum - confirm */
+    *(npy_int *)dataptr[2] = ((*(npy_int *)dataptr[2]) + value1 * accum); /* one multiply, applied to the reduced value */
+}
+
+#elif 1 == 3 && !0
+
+static void
+int_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Contiguous three-operand kernel: data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i] */
+    npy_int *data0 = (npy_int *)dataptr[0];
+    npy_int *data1 = (npy_int *)dataptr[1];
+    npy_int *data2 = (npy_int *)dataptr[2];
+    npy_int *data_out = (npy_int *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8; /* all four pointers advance together past the eight elements just processed */
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: count is below 8 here; each step returns once nothing is left, else handles one element */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+    /* For a non-negative count at most 7 elements were left, so the check below always returns before index 7 */
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1 > 3 || @complex */
+
+static void
+int_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Generic-nop contiguous kernel: per element, output (dataptr[nop]) += product of the nop inputs */
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) { /* one element per iteration; the dataptr entries themselves are advanced in place */
+#if !0 /* always true: the real-valued path below is the one compiled */
+        npy_int temp = (*(npy_int *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_int *)dataptr[i]);
+        }
+        *(npy_int *)dataptr[nop] = (temp +
+                                           (*(npy_int *)dataptr[i])); /* i == nop after the loop when nop >= 1 */
+        for (i = 0; i <= nop; ++i) { /* inputs and output all step by one npy_int */
+            dataptr[i] += sizeof(npy_int);
+        }
+#else /* complex (compiled out here, since the #if !0 above is always taken) */
+#  if 1 <= 3
+#    define _SUMPROD_NOP 1
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_int re, im, tmp;
+        int i;
+        re = ((npy_int *)dataptr[0])[0];
+        im = ((npy_int *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_int *)dataptr[i])[0] -
+                  im * ((npy_int *)dataptr[i])[1];
+            im = re * ((npy_int *)dataptr[i])[1] +
+                 im * ((npy_int *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_int *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_int *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_int *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_int *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_int);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1 */
+
+#if 1 == 1
+
+static NPY_GCC_OPT_3 void
+int_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{ /* Adds int_sum_of_arr(dataptr[0], count) to the single output value at dataptr[1]; strides is not read */
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0 /* always true: the real-valued path below is the one compiled */
+    npy_int *data = (npy_int *)dataptr[0];
+    npy_int accum = int_sum_of_arr(data, count); /* helper defined elsewhere; presumably the element sum - confirm */
+    *((npy_int *)dataptr[1]) = (accum + (*((npy_int *)dataptr[1])));
+#else /* complex variant with separate re/im accumulators (compiled out here) */
+    npy_int accum_re = 0, accum_im = 0;
+    npy_int *data0 = (npy_int *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) { /* four (re, im) pairs per iteration; note the strict > 4 */
+        const npy_int re01 = data0[0] + data0[2];
+        const npy_int re23 = data0[4] + data0[6];
+        const npy_int im13 = data0[1] + data0[3];
+        const npy_int im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_int *)dataptr[1])[0] += accum_re;
+    ((npy_int *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1 == 1 */
+
+static void
+int_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{ /* Strided one-operand reduction: adds count values read from dataptr[0], stepping strides[0] bytes, to *dataptr[1] */
+#if 0 /* complex accumulators: not compiled in this expansion */
+    npy_int accum_re = 0, accum_im = 0;
+#else
+    npy_int accum = 0;
+#endif
+
+#if (1 == 1) || (1 <= 3 && !0) /* true: the single input operand */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0 /* false: no second operand in this expansion */
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0 /* false: no third operand in this expansion */
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0 /* always true: real-valued path */
+#  if 1 == 1 /* the only branch of this chain that is compiled here */
+        accum += (*(npy_int *)data0);
+        data0 += stride0; /* data0 is a char *, so the stride is applied in bytes */
+#  elif 1 == 2
+        accum += (*(npy_int *)data0) *
+                 (*(npy_int *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1 == 3
+        accum += (*(npy_int *)data0) *
+                 (*(npy_int *)data1) *
+                 (*(npy_int *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_int temp = (*(npy_int *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_int *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex (compiled out here, since the #if !0 above is always taken) */
+#  if 1 == 1
+        accum_re += ((npy_int *)data0)[0];
+        accum_im += ((npy_int *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_int re, im, tmp;
+        int i;
+        re = ((npy_int *)dataptr[0])[0];
+        im = ((npy_int *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_int *)dataptr[i])[0] -
+                  im * ((npy_int *)dataptr[i])[1];
+            im = re * ((npy_int *)dataptr[i])[1] +
+                 im * ((npy_int *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0 /* complex write-back: not compiled in this expansion */
+#  if 1 <= 3
+    ((npy_int *)dataptr[1])[0] += accum_re;
+    ((npy_int *)dataptr[1])[1] += accum_im;
+#  else
+    ((npy_int *)dataptr[nop])[0] += accum_re;
+    ((npy_int *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1 <= 3 /* true: the output operand is dataptr[1] */
+    *((npy_int *)dataptr[1]) = (accum +
+                                    (*((npy_int *)dataptr[1])));
+#  else
+    *((npy_int *)dataptr[nop]) = (accum +
+                                    (*((npy_int *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+int_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    /*
+     * Strided two-operand kernel for npy_int.  For each of `count`
+     * elements it performs  out = in0 * in1 + out  and then steps every
+     * pointer by its own byte stride.  The inputs are dataptr[0] and
+     * dataptr[1], the output is dataptr[2]; strides[] uses the same
+     * indexing.  `nop` is not consulted by this specialization, and the
+     * caller's dataptr[] entries are left untouched.
+     */
+    char *lhs = dataptr[0];
+    char *rhs = dataptr[1];
+    char *dst = dataptr[2];
+    const npy_intp lhs_step = strides[0];
+    const npy_intp rhs_step = strides[1];
+    const npy_intp dst_step = strides[2];
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_two (%d)\n", (int)count);
+
+    /* One pass per element; iterates exactly like `while (count--)`. */
+    for (; count != 0; --count) {
+        const npy_int product = (*(npy_int *)lhs) * (*(npy_int *)rhs);
+
+        *(npy_int *)dst = product + (*(npy_int *)dst);
+
+        lhs += lhs_step;
+        rhs += rhs_step;
+        dst += dst_step;
+    }
+}
+
+#if 2 == 1
+
+static void
+int_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Contiguous one-operand kernel for npy_int: adds the `count` elements
+     * at dataptr[0] element-wise into the elements at dataptr[1].
+     * `nop` is not consulted.
+     */
+    npy_int *src = (npy_int *)dataptr[0];
+    npy_int *dst = (npy_int *)dataptr[1];
+    int lane;
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+    for (;;) {
+        /*
+         * The remainder switch comes before the bulk loop so that small
+         * counts finish without entering it.  Every case deliberately
+         * falls through to the next lower one.
+         */
+        switch (count) {
+            case 7:
+                dst[6] = src[6] + dst[6];
+                /* fall through */
+            case 6:
+                dst[5] = src[5] + dst[5];
+                /* fall through */
+            case 5:
+                dst[4] = src[4] + dst[4];
+                /* fall through */
+            case 4:
+                dst[3] = src[3] + dst[3];
+                /* fall through */
+            case 3:
+                dst[2] = src[2] + dst[2];
+                /* fall through */
+            case 2:
+                dst[1] = src[1] + dst[1];
+                /* fall through */
+            case 1:
+                dst[0] = src[0] + dst[0];
+                /* fall through */
+            case 0:
+                return;
+        }
+
+        /* Bulk part: eight elements per pass while at least eight remain */
+        while (count >= 8) {
+            count -= 8;
+
+            for (lane = 0; lane < 8; ++lane) {
+                dst[lane] = src[lane] + dst[lane];
+            }
+
+            src += 8;
+            dst += 8;
+        }
+
+        /* Back to the switch above to finish the last 0..7 elements */
+    }
+}
+
+#elif 2 == 2 && !0
+
+/*
+ * Scaled accumulate over two contiguous npy_int arrays:
+ *     data_out[k] = scalar * data[k] + data_out[k]   for k in [0, count)
+ *
+ * Only the scalar code path is spelled out here; the universal-intrinsics
+ * variant is compiled out for npy_int in the generated source.
+ */
+static NPY_GCC_OPT_3 void
+int_sum_of_products_muladd(npy_int *data, npy_int *data_out, npy_int scalar, npy_intp count)
+{
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four elements per pass: every load happens before the first store */
+    while (count >= 4) {
+        const npy_int in0 = data[0], acc0 = data_out[0];
+        const npy_int in1 = data[1], acc1 = data_out[1];
+        const npy_int in2 = data[2], acc2 = data_out[2];
+        const npy_int in3 = data[3], acc3 = data_out[3];
+
+        const npy_int fused0 = scalar * in0 + acc0;
+        const npy_int fused1 = scalar * in1 + acc1;
+        const npy_int fused2 = scalar * in2 + acc2;
+        const npy_int fused3 = scalar * in3 + acc3;
+
+        data_out[0] = fused0;
+        data_out[1] = fused1;
+        data_out[2] = fused2;
+        data_out[3] = fused3;
+
+        count -= 4;
+        data += 4;
+        data_out += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Whatever is left (everything, when the block above is disabled) */
+    while (count > 0) {
+        const npy_int in = *data;
+        const npy_int acc = *data_out;
+
+        *data_out = scalar * in + acc;
+
+        --count;
+        ++data;
+        ++data_out;
+    }
+}
+
+static void
+int_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Contiguous two-operand kernel for npy_int:
+     *     out[k] = in0[k] * in1[k] + out[k]   for k in [0, count)
+     * with in0 = dataptr[0], in1 = dataptr[1], out = dataptr[2].
+     * `nop` is not consulted.  Only the scalar path is spelled out; the
+     * universal-intrinsics variant is compiled out for npy_int.
+     */
+    npy_int *lhs = (npy_int *)dataptr[0];
+    npy_int *rhs = (npy_int *)dataptr[1];
+    npy_int *dst = (npy_int *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four elements per pass: every load happens before the first store */
+    while (count >= 4) {
+        const npy_int l0 = lhs[0], r0 = rhs[0], d0 = dst[0];
+        const npy_int l1 = lhs[1], r1 = rhs[1], d1 = dst[1];
+        const npy_int l2 = lhs[2], r2 = rhs[2], d2 = dst[2];
+        const npy_int l3 = lhs[3], r3 = rhs[3], d3 = dst[3];
+
+        const npy_int fused0 = l0 * r0 + d0;
+        const npy_int fused1 = l1 * r1 + d1;
+        const npy_int fused2 = l2 * r2 + d2;
+        const npy_int fused3 = l3 * r3 + d3;
+
+        dst[0] = fused0;
+        dst[1] = fused1;
+        dst[2] = fused2;
+        dst[3] = fused3;
+
+        count -= 4;
+        lhs += 4;
+        rhs += 4;
+        dst += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Whatever is left (everything, when the block above is disabled) */
+    while (count > 0) {
+        const npy_int l = *lhs;
+        const npy_int r = *rhs;
+        const npy_int d = *dst;
+
+        *dst = l * r + d;
+
+        --count;
+        ++lhs;
+        ++rhs;
+        ++dst;
+    }
+}
+
+/* Some extra specializations for the two operand case */
+static void
+int_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * First operand is a single value (read once from dataptr[0]); the
+     * second operand and the output are contiguous.  The work is handed
+     * to int_sum_of_products_muladd with that value as the scalar.
+     */
+    const npy_int factor = (*(npy_int *)dataptr[0]);
+    npy_int *vec = (npy_int *)dataptr[1];
+    npy_int *dst = (npy_int *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+
+    int_sum_of_products_muladd(vec, dst, factor, count);
+}
+
+static void
+int_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Second operand is a single value (read once from dataptr[1]); the
+     * first operand and the output are contiguous.  The work is handed
+     * to int_sum_of_products_muladd with that value as the scalar.
+     */
+    const npy_int factor = (*(npy_int *)dataptr[1]);
+    npy_int *vec = (npy_int *)dataptr[0];
+    npy_int *dst = (npy_int *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+
+    int_sum_of_products_muladd(vec, dst, factor, count);
+}
+
+static NPY_GCC_OPT_3 void
+int_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Both inputs contiguous, output a single npy_int: accumulates
+     * in0[k] * in1[k] over k in [0, count) and adds the total to the
+     * value at dataptr[2].  `nop` is not consulted.  Only the scalar path
+     * is spelled out; the universal-intrinsics variant is compiled out
+     * for npy_int.
+     */
+    npy_int *lhs = (npy_int *)dataptr[0];
+    npy_int *rhs = (npy_int *)dataptr[1];
+    npy_int total = 0;
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four products per pass, summed left to right, then accumulated */
+    while (count >= 4) {
+        const npy_int p0 = (lhs[0]) * (rhs[0]);
+        const npy_int p1 = (lhs[1]) * (rhs[1]);
+        const npy_int p2 = (lhs[2]) * (rhs[2]);
+        const npy_int p3 = (lhs[3]) * (rhs[3]);
+
+        total += p0 + p1 + p2 + p3;
+
+        count -= 4;
+        lhs += 4;
+        rhs += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Whatever is left (everything, when the block above is disabled) */
+    while (count > 0) {
+        total += (*lhs) * (*rhs);
+
+        --count;
+        ++lhs;
+        ++rhs;
+    }
+
+    *(npy_int *)dataptr[2] = ((*(npy_int *)dataptr[2]) + total);
+}
+
+static NPY_GCC_OPT_3 void
+int_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * First operand is a single value, second is contiguous, output is a
+     * single npy_int.  The value at dataptr[0] is multiplied by the
+     * result of int_sum_of_arr over the second operand and the product
+     * is added to the value at dataptr[2].
+     */
+    const npy_int factor = (*(npy_int *)dataptr[0]);
+    const npy_int total = int_sum_of_arr((npy_int *)dataptr[1], count);
+    npy_int *dst = (npy_int *)dataptr[2];
+
+    *dst = ((*dst) + factor * total);
+}
+
+static NPY_GCC_OPT_3 void
+int_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * First operand is contiguous, second is a single value, output is a
+     * single npy_int.  The value at dataptr[1] is multiplied by the
+     * result of int_sum_of_arr over the first operand and the product
+     * is added to the value at dataptr[2].
+     */
+    const npy_int factor = (*(npy_int *)dataptr[1]);
+    const npy_int total = int_sum_of_arr((npy_int *)dataptr[0], count);
+    npy_int *dst = (npy_int *)dataptr[2];
+
+    *dst = ((*dst) + factor * total);
+}
+
+#elif 2 == 3 && !0
+
+static void
+int_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Contiguous three-operand kernel for npy_int:
+     *     out[k] = in0[k] * in1[k] * in2[k] + out[k]   for k in [0, count)
+     * with the inputs at dataptr[0..2] and the output at dataptr[3].
+     * `nop` is not consulted.
+     */
+    npy_int *a = (npy_int *)dataptr[0];
+    npy_int *b = (npy_int *)dataptr[1];
+    npy_int *c = (npy_int *)dataptr[2];
+    npy_int *dst = (npy_int *)dataptr[3];
+    int lane;
+
+    /* Bulk part: eight elements per pass while at least eight remain */
+    while (count >= 8) {
+        count -= 8;
+
+        for (lane = 0; lane < 8; ++lane) {
+            dst[lane] = a[lane] * b[lane] * c[lane] + dst[lane];
+        }
+
+        a += 8;
+        b += 8;
+        c += 8;
+        dst += 8;
+    }
+
+    /*
+     * Remainder: up to eight more slots, leaving as soon as the
+     * post-decremented counter reads zero.
+     */
+    for (lane = 0; lane < 8; ++lane) {
+        if (count-- == 0) {
+            return;
+        }
+        dst[lane] = a[lane] * b[lane] * c[lane] + dst[lane];
+    }
+}
+
+#else /* 2 > 3 || @complex */
+
+static void
+int_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Generic contiguous fallback for npy_int: per element, multiplies
+     * the values at dataptr[0..nop-1], adds the product into the slot at
+     * dataptr[nop], then advances every entry of dataptr[] (inputs and
+     * output) by sizeof(npy_int).  Unlike the specialised kernels it
+     * modifies the caller's dataptr[] array in place.
+     */
+    int op;
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+
+    /* One pass per element; iterates exactly like `while (count--)`. */
+    for (; count != 0; --count) {
+        npy_int product = (*(npy_int *)dataptr[0]);
+
+        for (op = 1; op < nop; ++op) {
+            product *= (*(npy_int *)dataptr[op]);
+        }
+
+        /*
+         * `op` keeps the value the loop above left it at, which is `nop`
+         * whenever nop >= 1, so the read below is the current output.
+         */
+        *(npy_int *)dataptr[nop] = (product +
+                                    (*(npy_int *)dataptr[op]));
+
+        for (op = 0; op <= nop; ++op) {
+            dataptr[op] += sizeof(npy_int);
+        }
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+
+static NPY_GCC_OPT_3 void
+int_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Reduces `count` contiguous inputs at dataptr[0] into the one element at dataptr[1]. */
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_int *data = (npy_int *)dataptr[0];
+    npy_int accum = int_sum_of_arr(data, count); /* presumably the sum of data[0..count) - helper not shown here */
+    *((npy_int *)dataptr[1]) = (accum + (*((npy_int *)dataptr[1])));
+#else
+    npy_int accum_re = 0, accum_im = 0;
+    npy_int *data0 = (npy_int *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) { /* four (re, im) pairs per pass */
+        const npy_int re01 = data0[0] + data0[2];
+        const npy_int re23 = data0[4] + data0[6];
+        const npy_int im13 = data0[1] + data0[3];
+        const npy_int im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) { /* remaining pairs, one at a time */
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_int *)dataptr[1])[0] += accum_re;
+    ((npy_int *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 2 == 1 */
+
+static void
+int_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_int accum_re = 0, accum_im = 0;
+#else
+    npy_int accum = 0;
+#endif
+
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+    /* Strided reduction: the live `2 == 2` arm adds data0[k] * data1[k] to accum per element. */
+    while (count--) {
+#if !0
+#  if 2 == 1
+        accum += (*(npy_int *)data0);
+        data0 += stride0;
+#  elif 2 == 2
+        accum += (*(npy_int *)data0) *
+                 (*(npy_int *)data1);
+        data0 += stride0; /* strides are byte offsets: data0/data1 are char pointers */
+        data1 += stride1;
+#  elif 2 == 3
+        accum += (*(npy_int *)data0) *
+                 (*(npy_int *)data1) *
+                 (*(npy_int *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_int temp = (*(npy_int *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_int *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        accum_re += ((npy_int *)data0)[0];
+        accum_im += ((npy_int *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_int re, im, tmp;
+        int i;
+        re = ((npy_int *)dataptr[0])[0];
+        im = ((npy_int *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_int *)dataptr[i])[0] -
+                  im * ((npy_int *)dataptr[i])[1];
+            im = re * ((npy_int *)dataptr[i])[1] +
+                 im * ((npy_int *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* The output element is read and written once, after the loop; here it is dataptr[2]. */
+#if 0
+#  if 2 <= 3
+    ((npy_int *)dataptr[2])[0] += accum_re;
+    ((npy_int *)dataptr[2])[1] += accum_im;
+#  else
+    ((npy_int *)dataptr[nop])[0] += accum_re;
+    ((npy_int *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 2 <= 3
+    *((npy_int *)dataptr[2]) = (accum +
+                                    (*((npy_int *)dataptr[2])));
+#  else
+    *((npy_int *)dataptr[nop]) = (accum +
+                                    (*((npy_int *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+int_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data_out = dataptr[3];
+    npy_intp stride_out = strides[3];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_three (%d)\n", (int)count);
+    /* Live arm is `3 == 3`: out = in0 * in1 * in2 + out, each pointer moving by its own stride. */
+    while (count--) {
+#if !0
+#  if 3 == 1
+        *(npy_int *)data_out = ((*(npy_int *)data0) +
+                                         (*(npy_int *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 3 == 2
+        *(npy_int *)data_out = ((*(npy_int *)data0) *
+                                         (*(npy_int *)data1) +
+                                         (*(npy_int *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 3 == 3
+        *(npy_int *)data_out = ((*(npy_int *)data0) *
+                                         (*(npy_int *)data1) *
+                                         (*(npy_int *)data2) +
+                                         (*(npy_int *)data_out));
+        data0 += stride0; /* byte offsets: the data pointers are char * */
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_int temp = (*(npy_int *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_int *)dataptr[i]);
+        }
+        *(npy_int *)dataptr[nop] = (temp +
+                                           (*(npy_int *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        ((npy_int *)data_out)[0] = ((npy_int *)data0)[0] +
+                                         ((npy_int *)data_out)[0];
+        ((npy_int *)data_out)[1] = ((npy_int *)data0)[1] +
+                                         ((npy_int *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_int re, im, tmp;
+        int i;
+        re = ((npy_int *)dataptr[0])[0];
+        im = ((npy_int *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_int *)dataptr[i])[0] -
+                  im * ((npy_int *)dataptr[i])[1];
+            im = re * ((npy_int *)dataptr[i])[1] +
+                 im * ((npy_int *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_int *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_int *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_int *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_int *)dataptr[_SUMPROD_NOP])[1];
+        /* Advance every operand pointer, output included, by its stride. */
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 3 == 1
+
+static void
+int_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_int *data0 = (npy_int *)dataptr[0];
+    npy_int *data_out = (npy_int *)dataptr[1];
+    /* Contiguous one-operand kernel: data_out[k] += data0[k] for k in [0, count). */
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_int *)data_out + 2*6)[0] =
+                                    ((npy_int *)data0 + 2*6)[0] +
+                                    ((npy_int *)data_out + 2*6)[0];
+            ((npy_int *)data_out + 2*6)[1] =
+                                    ((npy_int *)data0 + 2*6)[1] +
+                                    ((npy_int *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_int *)data_out + 2*5)[0] =
+                                    ((npy_int *)data0 + 2*5)[0] +
+                                    ((npy_int *)data_out + 2*5)[0];
+            ((npy_int *)data_out + 2*5)[1] =
+                                    ((npy_int *)data0 + 2*5)[1] +
+                                    ((npy_int *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_int *)data_out + 2*4)[0] =
+                                    ((npy_int *)data0 + 2*4)[0] +
+                                    ((npy_int *)data_out + 2*4)[0];
+            ((npy_int *)data_out + 2*4)[1] =
+                                    ((npy_int *)data0 + 2*4)[1] +
+                                    ((npy_int *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_int *)data_out + 2*3)[0] =
+                                    ((npy_int *)data0 + 2*3)[0] +
+                                    ((npy_int *)data_out + 2*3)[0];
+            ((npy_int *)data_out + 2*3)[1] =
+                                    ((npy_int *)data0 + 2*3)[1] +
+                                    ((npy_int *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_int *)data_out + 2*2)[0] =
+                                    ((npy_int *)data0 + 2*2)[0] +
+                                    ((npy_int *)data_out + 2*2)[0];
+            ((npy_int *)data_out + 2*2)[1] =
+                                    ((npy_int *)data0 + 2*2)[1] +
+                                    ((npy_int *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_int *)data_out + 2*1)[0] =
+                                    ((npy_int *)data0 + 2*1)[0] +
+                                    ((npy_int *)data_out + 2*1)[0];
+            ((npy_int *)data_out + 2*1)[1] =
+                                    ((npy_int *)data0 + 2*1)[1] +
+                                    ((npy_int *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_int *)data_out + 2*0)[0] =
+                                    ((npy_int *)data0 + 2*0)[0] +
+                                    ((npy_int *)data_out + 2*0)[0];
+            ((npy_int *)data_out + 2*0)[1] =
+                                    ((npy_int *)data0 + 2*0)[1] +
+                                    ((npy_int *)data_out + 2*0)[1];
+#endif
+        /* No breaks above: entering at `case k` handles elements k-1 down to 0. */
+        case 0:
+            return;
+    }
+    /* A count of 8 or more matches no case and drops through to the loop below. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_int *)data_out + 2*0)[0] =
+                                ((npy_int *)data0 + 2*0)[0] +
+                                ((npy_int *)data_out + 2*0)[0];
+        ((npy_int *)data_out + 2*0)[1] =
+                                ((npy_int *)data0 + 2*0)[1] +
+                                ((npy_int *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_int *)data_out + 2*1)[0] =
+                                ((npy_int *)data0 + 2*1)[0] +
+                                ((npy_int *)data_out + 2*1)[0];
+        ((npy_int *)data_out + 2*1)[1] =
+                                ((npy_int *)data0 + 2*1)[1] +
+                                ((npy_int *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_int *)data_out + 2*2)[0] =
+                                ((npy_int *)data0 + 2*2)[0] +
+                                ((npy_int *)data_out + 2*2)[0];
+        ((npy_int *)data_out + 2*2)[1] =
+                                ((npy_int *)data0 + 2*2)[1] +
+                                ((npy_int *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_int *)data_out + 2*3)[0] =
+                                ((npy_int *)data0 + 2*3)[0] +
+                                ((npy_int *)data_out + 2*3)[0];
+        ((npy_int *)data_out + 2*3)[1] =
+                                ((npy_int *)data0 + 2*3)[1] +
+                                ((npy_int *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_int *)data_out + 2*4)[0] =
+                                ((npy_int *)data0 + 2*4)[0] +
+                                ((npy_int *)data_out + 2*4)[0];
+        ((npy_int *)data_out + 2*4)[1] =
+                                ((npy_int *)data0 + 2*4)[1] +
+                                ((npy_int *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_int *)data_out + 2*5)[0] =
+                                ((npy_int *)data0 + 2*5)[0] +
+                                ((npy_int *)data_out + 2*5)[0];
+        ((npy_int *)data_out + 2*5)[1] =
+                                ((npy_int *)data0 + 2*5)[1] +
+                                ((npy_int *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_int *)data_out + 2*6)[0] =
+                                ((npy_int *)data0 + 2*6)[0] +
+                                ((npy_int *)data_out + 2*6)[0];
+        ((npy_int *)data_out + 2*6)[1] =
+                                ((npy_int *)data0 + 2*6)[1] +
+                                ((npy_int *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_int *)data_out + 2*7)[0] =
+                                ((npy_int *)data0 + 2*7)[0] +
+                                ((npy_int *)data_out + 2*7)[0];
+        ((npy_int *)data_out + 2*7)[1] =
+                                ((npy_int *)data0 + 2*7)[1] +
+                                ((npy_int *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 3 == 2 && !0
+
+// Multiply-add over `count` elements: data_out[i] = scalar * data[i] + data_out[i]
+static NPY_GCC_OPT_3 void
+int_sum_of_products_muladd(npy_int *data, npy_int *data_out, npy_int scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_int
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s32;
+    const npyv_s32 v_scalar = npyv_setall_s32(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s32 b0 = npyv_loada_s32(data + vstep * 0);
+            npyv_s32 c0 = npyv_loada_s32(data_out + vstep * 0);
+            
+#line 312
+            npyv_s32 b1 = npyv_loada_s32(data + vstep * 1);
+            npyv_s32 c1 = npyv_loada_s32(data_out + vstep * 1);
+            
+#line 312
+            npyv_s32 b2 = npyv_loada_s32(data + vstep * 2);
+            npyv_s32 c2 = npyv_loada_s32(data_out + vstep * 2);
+            
+#line 312
+            npyv_s32 b3 = npyv_loada_s32(data + vstep * 3);
+            npyv_s32 c3 = npyv_loada_s32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s32 abc0 = npyv_muladd_s32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s32 abc1 = npyv_muladd_s32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s32 abc2 = npyv_muladd_s32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s32 abc3 = npyv_muladd_s32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_s32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_s32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_s32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_s32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s32 b0 = npyv_load_s32(data + vstep * 0);
+            npyv_s32 c0 = npyv_load_s32(data_out + vstep * 0);
+            
+#line 312
+            npyv_s32 b1 = npyv_load_s32(data + vstep * 1);
+            npyv_s32 c1 = npyv_load_s32(data_out + vstep * 1);
+            
+#line 312
+            npyv_s32 b2 = npyv_load_s32(data + vstep * 2);
+            npyv_s32 c2 = npyv_load_s32(data_out + vstep * 2);
+            
+#line 312
+            npyv_s32 b3 = npyv_load_s32(data + vstep * 3);
+            npyv_s32 c3 = npyv_load_s32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s32 abc0 = npyv_muladd_s32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s32 abc1 = npyv_muladd_s32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s32 abc2 = npyv_muladd_s32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s32 abc3 = npyv_muladd_s32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_s32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_s32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_s32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_s32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Remainder: the *_till* calls take `count`, presumably to limit the lanes touched - TODO confirm. */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_s32 a = npyv_load_tillz_s32(data, count);
+        npyv_s32 b = npyv_load_tillz_s32(data_out, count);
+        npyv_store_till_s32(data_out, count, npyv_muladd_s32(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_int b0 = (data[0]);
+        const npy_int c0 = (data_out[0]);
+        
+#line 340
+        const npy_int b1 = (data[1]);
+        const npy_int c1 = (data_out[1]);
+        
+#line 340
+        const npy_int b2 = (data[2]);
+        const npy_int c2 = (data_out[2]);
+        
+#line 340
+        const npy_int b3 = (data[3]);
+        const npy_int c3 = (data_out[3]);
+        
+        #line 346
+        const npy_int abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_int abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_int abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_int abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* leftover elements, one at a time */
+        const npy_int b = (*data);
+        const npy_int c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_int
+}
+
+static void
+int_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* data_out[i] = data0[i] * data1[i] + data_out[i] for i in [0, count). */
+    npy_int *data0 = (npy_int *)dataptr[0];
+    npy_int *data1 = (npy_int *)dataptr[1];
+    npy_int *data_out = (npy_int *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_int
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s32;
+    /* Both branches process four vectors per pass; they differ only in the load/store calls used. */
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s32 a0 = npyv_loada_s32(data0 + vstep * 0);
+            npyv_s32 b0 = npyv_loada_s32(data1 + vstep * 0);
+            npyv_s32 c0 = npyv_loada_s32(data_out + vstep * 0);
+            
+#line 390
+            npyv_s32 a1 = npyv_loada_s32(data0 + vstep * 1);
+            npyv_s32 b1 = npyv_loada_s32(data1 + vstep * 1);
+            npyv_s32 c1 = npyv_loada_s32(data_out + vstep * 1);
+            
+#line 390
+            npyv_s32 a2 = npyv_loada_s32(data0 + vstep * 2);
+            npyv_s32 b2 = npyv_loada_s32(data1 + vstep * 2);
+            npyv_s32 c2 = npyv_loada_s32(data_out + vstep * 2);
+            
+#line 390
+            npyv_s32 a3 = npyv_loada_s32(data0 + vstep * 3);
+            npyv_s32 b3 = npyv_loada_s32(data1 + vstep * 3);
+            npyv_s32 c3 = npyv_loada_s32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s32 abc0 = npyv_muladd_s32(a0, b0, c0);
+            
+#line 397
+            npyv_s32 abc1 = npyv_muladd_s32(a1, b1, c1);
+            
+#line 397
+            npyv_s32 abc2 = npyv_muladd_s32(a2, b2, c2);
+            
+#line 397
+            npyv_s32 abc3 = npyv_muladd_s32(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_s32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_s32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_s32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_s32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s32 a0 = npyv_load_s32(data0 + vstep * 0);
+            npyv_s32 b0 = npyv_load_s32(data1 + vstep * 0);
+            npyv_s32 c0 = npyv_load_s32(data_out + vstep * 0);
+            
+#line 390
+            npyv_s32 a1 = npyv_load_s32(data0 + vstep * 1);
+            npyv_s32 b1 = npyv_load_s32(data1 + vstep * 1);
+            npyv_s32 c1 = npyv_load_s32(data_out + vstep * 1);
+            
+#line 390
+            npyv_s32 a2 = npyv_load_s32(data0 + vstep * 2);
+            npyv_s32 b2 = npyv_load_s32(data1 + vstep * 2);
+            npyv_s32 c2 = npyv_load_s32(data_out + vstep * 2);
+            
+#line 390
+            npyv_s32 a3 = npyv_load_s32(data0 + vstep * 3);
+            npyv_s32 b3 = npyv_load_s32(data1 + vstep * 3);
+            npyv_s32 c3 = npyv_load_s32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s32 abc0 = npyv_muladd_s32(a0, b0, c0);
+            
+#line 397
+            npyv_s32 abc1 = npyv_muladd_s32(a1, b1, c1);
+            
+#line 397
+            npyv_s32 abc2 = npyv_muladd_s32(a2, b2, c2);
+            
+#line 397
+            npyv_s32 abc3 = npyv_muladd_s32(a3, b3, c3);
+            
+            #line 402
+            npyv_store_s32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_s32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_s32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_s32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Remainder: the *_till* calls take `count`, presumably to limit the lanes touched - TODO confirm. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_s32 a = npyv_load_tillz_s32(data0, count);
+        npyv_s32 b = npyv_load_tillz_s32(data1, count);
+        npyv_s32 c = npyv_load_tillz_s32(data_out, count);
+        npyv_store_till_s32(data_out, count, npyv_muladd_s32(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_int a0 = (data0[0]);
+        const npy_int b0 = (data1[0]);
+        const npy_int c0 = (data_out[0]);
+        
+#line 420
+        const npy_int a1 = (data0[1]);
+        const npy_int b1 = (data1[1]);
+        const npy_int c1 = (data_out[1]);
+        
+#line 420
+        const npy_int a2 = (data0[2]);
+        const npy_int b2 = (data1[2]);
+        const npy_int c2 = (data_out[2]);
+        
+#line 420
+        const npy_int a3 = (data0[3]);
+        const npy_int b3 = (data1[3]);
+        const npy_int c3 = (data_out[3]);
+        
+        #line 427
+        const npy_int abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_int abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_int abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_int abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* leftover elements, one at a time */
+        const npy_int a = (*data0);
+        const npy_int b = (*data1);
+        const npy_int c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_int
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+int_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 0 is read once as a scalar; operand 1 and the output are walked as arrays. */
+    npy_int *const src = (npy_int *)dataptr[1];
+    npy_int *const dst = (npy_int *)dataptr[2];
+    const npy_int factor = *(npy_int *)dataptr[0];
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    int_sum_of_products_muladd(src, dst, factor, count);
+}
+
+static void
+int_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 1 is read once as a scalar; operand 0 and the output are walked as arrays. */
+    npy_int *const src = (npy_int *)dataptr[0];
+    npy_int *const dst = (npy_int *)dataptr[2];
+    const npy_int factor = *(npy_int *)dataptr[1];
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    int_sum_of_products_muladd(src, dst, factor, count);
+}
+
+static NPY_GCC_OPT_3 void
+int_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_int *data0 = (npy_int *)dataptr[0];
+    npy_int *data1 = (npy_int *)dataptr[1];
+    npy_int accum = 0;
+    /* Sums data0[i] * data1[i] over `count` elements and adds the total to *dataptr[2]. */
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_int
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_s32;
+    npyv_s32 v_accum = npyv_zero_s32();
+    /* In each pass v_accum feeds ab3, which feeds ab2, then ab1, then back into v_accum. */
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s32 a0 = npyv_loada_s32(data0 + vstep * 0);
+            npyv_s32 b0 = npyv_loada_s32(data1 + vstep * 0);
+            
+#line 501
+            npyv_s32 a1 = npyv_loada_s32(data0 + vstep * 1);
+            npyv_s32 b1 = npyv_loada_s32(data1 + vstep * 1);
+            
+#line 501
+            npyv_s32 a2 = npyv_loada_s32(data0 + vstep * 2);
+            npyv_s32 b2 = npyv_loada_s32(data1 + vstep * 2);
+            
+#line 501
+            npyv_s32 a3 = npyv_loada_s32(data0 + vstep * 3);
+            npyv_s32 b3 = npyv_loada_s32(data1 + vstep * 3);
+            
+            npyv_s32 ab3 = npyv_muladd_s32(a3, b3, v_accum);
+            npyv_s32 ab2 = npyv_muladd_s32(a2, b2, ab3);
+            npyv_s32 ab1 = npyv_muladd_s32(a1, b1, ab2);
+                   v_accum = npyv_muladd_s32(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s32 a0 = npyv_load_s32(data0 + vstep * 0);
+            npyv_s32 b0 = npyv_load_s32(data1 + vstep * 0);
+            
+#line 501
+            npyv_s32 a1 = npyv_load_s32(data0 + vstep * 1);
+            npyv_s32 b1 = npyv_load_s32(data1 + vstep * 1);
+            
+#line 501
+            npyv_s32 a2 = npyv_load_s32(data0 + vstep * 2);
+            npyv_s32 b2 = npyv_load_s32(data1 + vstep * 2);
+            
+#line 501
+            npyv_s32 a3 = npyv_load_s32(data0 + vstep * 3);
+            npyv_s32 b3 = npyv_load_s32(data1 + vstep * 3);
+            
+            npyv_s32 ab3 = npyv_muladd_s32(a3, b3, v_accum);
+            npyv_s32 ab2 = npyv_muladd_s32(a2, b2, ab3);
+            npyv_s32 ab1 = npyv_muladd_s32(a1, b1, ab2);
+                   v_accum = npyv_muladd_s32(a0, b0, ab1);
+        }
+    }
+    /* Remainder: npyv_load_tillz takes `count`, presumably zero-filling unused lanes - TODO confirm. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_s32 a = npyv_load_tillz_s32(data0, count);
+        npyv_s32 b = npyv_load_tillz_s32(data1, count);
+        v_accum = npyv_muladd_s32(a, b, v_accum);
+    }
+    accum = npyv_sum_s32(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_int ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_int ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_int ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_int ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* leftover elements, one at a time */
+        const npy_int a = (*data0);
+        const npy_int b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_int
+    *(npy_int *)dataptr[2] = ((*(npy_int *)dataptr[2]) + accum);
+}
+
+static NPY_GCC_OPT_3 void
+int_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Operand 0 contributes a single npy_int (read once from dataptr[0]),
+     * operand 1 is handed to int_sum_of_arr together with count, and the
+     * single output value at dataptr[2] is updated as
+     *     out = out + value0 * sum(operand 1).
+     */
+    const npy_int scalar = *(npy_int *)dataptr[0];
+    const npy_int total = int_sum_of_arr((npy_int *)dataptr[1], count);
+    npy_int *out = (npy_int *)dataptr[2];
+
+    *out = *out + scalar * total;
+}
+
+static NPY_GCC_OPT_3 void
+int_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Mirror image of the stride0/contig variant: operand 0 is handed to
+     * int_sum_of_arr together with count, operand 1 contributes a single
+     * npy_int (read once from dataptr[1]), and the single output value at
+     * dataptr[2] is updated as
+     *     out = out + value1 * sum(operand 0).
+     */
+    npy_int *src = (npy_int *)dataptr[0];
+    const npy_int scalar = *(npy_int *)dataptr[1];
+    const npy_int total = int_sum_of_arr(src, count);
+    npy_int *out = (npy_int *)dataptr[2];
+
+    *out = *out + scalar * total;
+}
+
+#elif 3 == 3 && !0
+
+static void
+int_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /*
+     * Element-wise multiply-accumulate over three npy_int inputs:
+     *     data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]
+     * for i in [0, count).  The inputs are dataptr[0..2] and the output is
+     * dataptr[3]; all four are indexed as plain npy_int arrays.  The
+     * strides argument is marked unused and nop is not read here.
+     */
+    npy_int *data0 = (npy_int *)dataptr[0];
+    npy_int *data1 = (npy_int *)dataptr[1];
+    npy_int *data2 = (npy_int *)dataptr[2];
+    npy_int *data_out = (npy_int *)dataptr[3];
+
+    /* Unroll the loop by 8: each pass handles eight consecutive elements */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;  /* advance all four arrays past the block just done */
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: count < 8 here.  Each step below returns as
+     * soon as the remaining count reaches zero, otherwise it handles one
+     * more element and decrements count. */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {  /* with 0 <= count <= 7 leftovers this check always returns */
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 3 > 3 || @complex */
+
+static void
+int_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /*
+     * Generic fallback variant: for each of count steps, multiply the
+     * current npy_int of operands 0..nop-1 together, add the product to
+     * the output element, then advance every pointer in dataptr[0..nop]
+     * by one element.  The dataptr array itself is modified in place.
+     *
+     * This definition sits in the #else arm that follows
+     * `#elif 3 == 3 && !0`; because that condition is true, this arm is
+     * not compiled in this instantiation.
+     */
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_three (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0  /* always true here: only the non-complex branch is compiled */
+        npy_int temp = (*(npy_int *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_int *)dataptr[i]);
+        }
+        *(npy_int *)dataptr[nop] = (temp +
+                                           (*(npy_int *)dataptr[i]));  /* i == nop here whenever nop >= 1 */
+        for (i = 0; i <= nop; ++i) {  /* <= nop: the output pointer advances too */
+            dataptr[i] += sizeof(npy_int);
+        }
+#else /* complex */
+#  if 3 <= 3
+#    define _SUMPROD_NOP 3
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_int re, im, tmp;
+        int i;
+        re = ((npy_int *)dataptr[0])[0];
+        im = ((npy_int *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_int *)dataptr[i])[0] -
+                  im * ((npy_int *)dataptr[i])[1];
+            im = re * ((npy_int *)dataptr[i])[1] +
+                 im * ((npy_int *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_int *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_int *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_int *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_int *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_int);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 3 */
+
+#if 3 == 1
+
+static NPY_GCC_OPT_3 void
+int_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /*
+     * Single-operand reduction into one output value: sums the npy_int
+     * array at dataptr[0] (via int_sum_of_arr) and adds the result to the
+     * value at dataptr[1].  Neither nop nor strides is read in the body.
+     *
+     * The enclosing guard is `#if 3 == 1`, which is false, so this
+     * function is not compiled in this instantiation.
+     */
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0  /* always true: the non-complex branch is the one selected */
+    npy_int *data = (npy_int *)dataptr[0];
+    npy_int accum = int_sum_of_arr(data, count);
+    *((npy_int *)dataptr[1]) = (accum + (*((npy_int *)dataptr[1])));
+#else
+    npy_int accum_re = 0, accum_im = 0;
+    npy_int *data0 = (npy_int *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_int re01 = data0[0] + data0[2];
+        const npy_int re23 = data0[4] + data0[6];
+        const npy_int im13 = data0[1] + data0[3];
+        const npy_int im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_int *)dataptr[1])[0] += accum_re;
+    ((npy_int *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 3 == 1 */
+
+static void
+int_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /*
+     * Three-operand reduction into one output value: accumulates
+     *     sum over count steps of (*data0) * (*data1) * (*data2)
+     * in a local npy_int and finally adds that sum to the npy_int at
+     * dataptr[3].  The input pointers are char pointers advanced by
+     * strides[0..2], i.e. the strides are applied in bytes; the output
+     * pointer is never advanced.
+     */
+#if 0  /* complex accumulators: compiled out */
+    npy_int accum_re = 0, accum_im = 0;
+#else
+    npy_int accum = 0;
+#endif
+
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 3 == 1
+        accum += (*(npy_int *)data0);
+        data0 += stride0;
+#  elif 3 == 2
+        accum += (*(npy_int *)data0) *
+                 (*(npy_int *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 3 == 3  /* the branch selected in this three-operand instantiation */
+        accum += (*(npy_int *)data0) *
+                 (*(npy_int *)data1) *
+                 (*(npy_int *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_int temp = (*(npy_int *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_int *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        accum_re += ((npy_int *)data0)[0];
+        accum_im += ((npy_int *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_int re, im, tmp;
+        int i;
+        re = ((npy_int *)dataptr[0])[0];
+        im = ((npy_int *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_int *)dataptr[i])[0] -
+                  im * ((npy_int *)dataptr[i])[1];
+            im = re * ((npy_int *)dataptr[i])[1] +
+                 im * ((npy_int *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0
+#  if 3 <= 3
+    ((npy_int *)dataptr[3])[0] += accum_re;
+    ((npy_int *)dataptr[3])[1] += accum_im;
+#  else
+    ((npy_int *)dataptr[nop])[0] += accum_re;
+    ((npy_int *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 3 <= 3  /* selected: the output slot is the fixed index 3 */
+    *((npy_int *)dataptr[3]) = (accum +
+                                    (*((npy_int *)dataptr[3])));
+#  else
+    *((npy_int *)dataptr[nop]) = (accum +
+                                    (*((npy_int *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+int_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /*
+     * Sum-of-products step for an arbitrary number of operands.  Every
+     * `1000 == k` / `1000 <= 3` guard below is false, so only the generic
+     * branch is compiled: for each of count steps it multiplies the
+     * current npy_int of operands 0..nop-1, adds the product to the
+     * output element, and advances all nop+1 pointers in dataptr by the
+     * matching entry of strides (applied to char pointers, i.e. in
+     * bytes).  The dataptr array itself is modified in place.
+     */
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data_out = dataptr[1000];
+    npy_intp stride_out = strides[1000];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_any (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        *(npy_int *)data_out = ((*(npy_int *)data0) +
+                                         (*(npy_int *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1000 == 2
+        *(npy_int *)data_out = ((*(npy_int *)data0) *
+                                         (*(npy_int *)data1) +
+                                         (*(npy_int *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1000 == 3
+        *(npy_int *)data_out = ((*(npy_int *)data0) *
+                                         (*(npy_int *)data1) *
+                                         (*(npy_int *)data2) +
+                                         (*(npy_int *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else  /* the branch selected in this instantiation */
+        npy_int temp = (*(npy_int *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_int *)dataptr[i]);
+        }
+        *(npy_int *)dataptr[nop] = (temp +
+                                           (*(npy_int *)dataptr[i]));  /* i == nop here whenever nop >= 1 */
+        for (i = 0; i <= nop; ++i) {  /* <= nop: the output pointer advances too */
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        ((npy_int *)data_out)[0] = ((npy_int *)data0)[0] +
+                                         ((npy_int *)data_out)[0];
+        ((npy_int *)data_out)[1] = ((npy_int *)data0)[1] +
+                                         ((npy_int *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_int re, im, tmp;
+        int i;
+        re = ((npy_int *)dataptr[0])[0];
+        im = ((npy_int *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_int *)dataptr[i])[0] -
+                  im * ((npy_int *)dataptr[i])[1];
+            im = re * ((npy_int *)dataptr[i])[1] +
+                 im * ((npy_int *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_int *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_int *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_int *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_int *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1000 == 1
+
+static void
+int_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /*
+     * Single-operand accumulate over npy_int arrays:
+     *     data_out[i] = data0[i] + data_out[i]   for i in [0, count)
+     * with the input at dataptr[0] and the output at dataptr[1].
+     *
+     * Control flow: the switch handles counts 0..7 and returns; larger
+     * counts skip the switch, run the 8-way unrolled loop, and then jump
+     * back to the switch for the remainder.
+     *
+     * The enclosing guard is `#if 1000 == 1`, which is false, so this
+     * function is not compiled in this instantiation.
+     */
+    npy_int *data0 = (npy_int *)dataptr[0];
+    npy_int *data_out = (npy_int *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {  /* no breaks: case k falls through, updating elements k-1 down to 0 */
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_int *)data_out + 2*6)[0] =
+                                    ((npy_int *)data0 + 2*6)[0] +
+                                    ((npy_int *)data_out + 2*6)[0];
+            ((npy_int *)data_out + 2*6)[1] =
+                                    ((npy_int *)data0 + 2*6)[1] +
+                                    ((npy_int *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_int *)data_out + 2*5)[0] =
+                                    ((npy_int *)data0 + 2*5)[0] +
+                                    ((npy_int *)data_out + 2*5)[0];
+            ((npy_int *)data_out + 2*5)[1] =
+                                    ((npy_int *)data0 + 2*5)[1] +
+                                    ((npy_int *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_int *)data_out + 2*4)[0] =
+                                    ((npy_int *)data0 + 2*4)[0] +
+                                    ((npy_int *)data_out + 2*4)[0];
+            ((npy_int *)data_out + 2*4)[1] =
+                                    ((npy_int *)data0 + 2*4)[1] +
+                                    ((npy_int *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_int *)data_out + 2*3)[0] =
+                                    ((npy_int *)data0 + 2*3)[0] +
+                                    ((npy_int *)data_out + 2*3)[0];
+            ((npy_int *)data_out + 2*3)[1] =
+                                    ((npy_int *)data0 + 2*3)[1] +
+                                    ((npy_int *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_int *)data_out + 2*2)[0] =
+                                    ((npy_int *)data0 + 2*2)[0] +
+                                    ((npy_int *)data_out + 2*2)[0];
+            ((npy_int *)data_out + 2*2)[1] =
+                                    ((npy_int *)data0 + 2*2)[1] +
+                                    ((npy_int *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_int *)data_out + 2*1)[0] =
+                                    ((npy_int *)data0 + 2*1)[0] +
+                                    ((npy_int *)data_out + 2*1)[0];
+            ((npy_int *)data_out + 2*1)[1] =
+                                    ((npy_int *)data0 + 2*1)[1] +
+                                    ((npy_int *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_int *)data_out + 2*0)[0] =
+                                    ((npy_int *)data0 + 2*0)[0] +
+                                    ((npy_int *)data_out + 2*0)[0];
+            ((npy_int *)data_out + 2*0)[1] =
+                                    ((npy_int *)data0 + 2*0)[1] +
+                                    ((npy_int *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8; reached only when count matched no case above */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_int *)data_out + 2*0)[0] =
+                                ((npy_int *)data0 + 2*0)[0] +
+                                ((npy_int *)data_out + 2*0)[0];
+        ((npy_int *)data_out + 2*0)[1] =
+                                ((npy_int *)data0 + 2*0)[1] +
+                                ((npy_int *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_int *)data_out + 2*1)[0] =
+                                ((npy_int *)data0 + 2*1)[0] +
+                                ((npy_int *)data_out + 2*1)[0];
+        ((npy_int *)data_out + 2*1)[1] =
+                                ((npy_int *)data0 + 2*1)[1] +
+                                ((npy_int *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_int *)data_out + 2*2)[0] =
+                                ((npy_int *)data0 + 2*2)[0] +
+                                ((npy_int *)data_out + 2*2)[0];
+        ((npy_int *)data_out + 2*2)[1] =
+                                ((npy_int *)data0 + 2*2)[1] +
+                                ((npy_int *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_int *)data_out + 2*3)[0] =
+                                ((npy_int *)data0 + 2*3)[0] +
+                                ((npy_int *)data_out + 2*3)[0];
+        ((npy_int *)data_out + 2*3)[1] =
+                                ((npy_int *)data0 + 2*3)[1] +
+                                ((npy_int *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_int *)data_out + 2*4)[0] =
+                                ((npy_int *)data0 + 2*4)[0] +
+                                ((npy_int *)data_out + 2*4)[0];
+        ((npy_int *)data_out + 2*4)[1] =
+                                ((npy_int *)data0 + 2*4)[1] +
+                                ((npy_int *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_int *)data_out + 2*5)[0] =
+                                ((npy_int *)data0 + 2*5)[0] +
+                                ((npy_int *)data_out + 2*5)[0];
+        ((npy_int *)data_out + 2*5)[1] =
+                                ((npy_int *)data0 + 2*5)[1] +
+                                ((npy_int *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_int *)data_out + 2*6)[0] =
+                                ((npy_int *)data0 + 2*6)[0] +
+                                ((npy_int *)data_out + 2*6)[0];
+        ((npy_int *)data_out + 2*6)[1] =
+                                ((npy_int *)data0 + 2*6)[1] +
+                                ((npy_int *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_int *)data_out + 2*7)[0] =
+                                ((npy_int *)data0 + 2*7)[0] +
+                                ((npy_int *)data_out + 2*7)[0];
+        ((npy_int *)data_out + 2*7)[1] =
+                                ((npy_int *)data0 + 2*7)[1] +
+                                ((npy_int *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: count < 8 now, so the switch above handles the rest */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1000 == 2 && !0
+
+// Multiply-add over contiguous arrays: data_out[i] = scalar * data[i] + data_out[i] for i in [0, count)
+static NPY_GCC_OPT_3 void
+int_sum_of_products_muladd(npy_int *data, npy_int *data_out, npy_int scalar, npy_intp count)
+{   /*
+     * data      input array, read only
+     * data_out  array updated in place
+     * scalar    factor applied to every element of data
+     * count     number of elements to process
+     *
+     * The SIMD branch is disabled by the `#if 0` below, so the plain C
+     * branch after `#else` is the one selected.
+     */
+#if 0 // NPYV check for npy_int
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s32;
+    const npyv_s32 v_scalar = npyv_setall_s32(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s32 b0 = npyv_loada_s32(data + vstep * 0);
+            npyv_s32 c0 = npyv_loada_s32(data_out + vstep * 0);
+            
+#line 312
+            npyv_s32 b1 = npyv_loada_s32(data + vstep * 1);
+            npyv_s32 c1 = npyv_loada_s32(data_out + vstep * 1);
+            
+#line 312
+            npyv_s32 b2 = npyv_loada_s32(data + vstep * 2);
+            npyv_s32 c2 = npyv_loada_s32(data_out + vstep * 2);
+            
+#line 312
+            npyv_s32 b3 = npyv_loada_s32(data + vstep * 3);
+            npyv_s32 c3 = npyv_loada_s32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s32 abc0 = npyv_muladd_s32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s32 abc1 = npyv_muladd_s32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s32 abc2 = npyv_muladd_s32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s32 abc3 = npyv_muladd_s32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_s32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_s32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_s32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_s32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s32 b0 = npyv_load_s32(data + vstep * 0);
+            npyv_s32 c0 = npyv_load_s32(data_out + vstep * 0);
+            
+#line 312
+            npyv_s32 b1 = npyv_load_s32(data + vstep * 1);
+            npyv_s32 c1 = npyv_load_s32(data_out + vstep * 1);
+            
+#line 312
+            npyv_s32 b2 = npyv_load_s32(data + vstep * 2);
+            npyv_s32 c2 = npyv_load_s32(data_out + vstep * 2);
+            
+#line 312
+            npyv_s32 b3 = npyv_load_s32(data + vstep * 3);
+            npyv_s32 c3 = npyv_load_s32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s32 abc0 = npyv_muladd_s32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s32 abc1 = npyv_muladd_s32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s32 abc2 = npyv_muladd_s32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s32 abc3 = npyv_muladd_s32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_s32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_s32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_s32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_s32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_s32 a = npyv_load_tillz_s32(data, count);
+        npyv_s32 b = npyv_load_tillz_s32(data_out, count);
+        npyv_store_till_s32(data_out, count, npyv_muladd_s32(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION  /* manual 4-way unrolling unless optimizations are disabled */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_int b0 = (data[0]);
+        const npy_int c0 = (data_out[0]);
+        
+#line 340
+        const npy_int b1 = (data[1]);
+        const npy_int c1 = (data_out[1]);
+        
+#line 340
+        const npy_int b2 = (data[2]);
+        const npy_int c2 = (data_out[2]);
+        
+#line 340
+        const npy_int b3 = (data[3]);
+        const npy_int c3 = (data_out[3]);
+        
+        #line 346
+        const npy_int abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_int abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_int abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_int abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) {  /* remaining elements, one at a time */
+        const npy_int b = (*data);
+        const npy_int c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_int
+}
+
+static void
+int_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /*
+     * Element-wise multiply-accumulate over two npy_int inputs:
+     *     data_out[i] = data0[i] * data1[i] + data_out[i]
+     * for i in [0, count).  The inputs are dataptr[0..1] and the output
+     * is dataptr[2]; all three are indexed as plain npy_int arrays.  The
+     * strides argument is marked unused and nop is not read here.
+     *
+     * The SIMD branch is disabled by the `#if 0` below, so the plain C
+     * branch after `#else` is the one selected.
+     */
+    npy_int *data0 = (npy_int *)dataptr[0];
+    npy_int *data1 = (npy_int *)dataptr[1];
+    npy_int *data_out = (npy_int *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_int
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s32;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s32 a0 = npyv_loada_s32(data0 + vstep * 0);
+            npyv_s32 b0 = npyv_loada_s32(data1 + vstep * 0);
+            npyv_s32 c0 = npyv_loada_s32(data_out + vstep * 0);
+            
+#line 390
+            npyv_s32 a1 = npyv_loada_s32(data0 + vstep * 1);
+            npyv_s32 b1 = npyv_loada_s32(data1 + vstep * 1);
+            npyv_s32 c1 = npyv_loada_s32(data_out + vstep * 1);
+            
+#line 390
+            npyv_s32 a2 = npyv_loada_s32(data0 + vstep * 2);
+            npyv_s32 b2 = npyv_loada_s32(data1 + vstep * 2);
+            npyv_s32 c2 = npyv_loada_s32(data_out + vstep * 2);
+            
+#line 390
+            npyv_s32 a3 = npyv_loada_s32(data0 + vstep * 3);
+            npyv_s32 b3 = npyv_loada_s32(data1 + vstep * 3);
+            npyv_s32 c3 = npyv_loada_s32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s32 abc0 = npyv_muladd_s32(a0, b0, c0);
+            
+#line 397
+            npyv_s32 abc1 = npyv_muladd_s32(a1, b1, c1);
+            
+#line 397
+            npyv_s32 abc2 = npyv_muladd_s32(a2, b2, c2);
+            
+#line 397
+            npyv_s32 abc3 = npyv_muladd_s32(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_s32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_s32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_s32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_s32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s32 a0 = npyv_load_s32(data0 + vstep * 0);
+            npyv_s32 b0 = npyv_load_s32(data1 + vstep * 0);
+            npyv_s32 c0 = npyv_load_s32(data_out + vstep * 0);
+            
+#line 390
+            npyv_s32 a1 = npyv_load_s32(data0 + vstep * 1);
+            npyv_s32 b1 = npyv_load_s32(data1 + vstep * 1);
+            npyv_s32 c1 = npyv_load_s32(data_out + vstep * 1);
+            
+#line 390
+            npyv_s32 a2 = npyv_load_s32(data0 + vstep * 2);
+            npyv_s32 b2 = npyv_load_s32(data1 + vstep * 2);
+            npyv_s32 c2 = npyv_load_s32(data_out + vstep * 2);
+            
+#line 390
+            npyv_s32 a3 = npyv_load_s32(data0 + vstep * 3);
+            npyv_s32 b3 = npyv_load_s32(data1 + vstep * 3);
+            npyv_s32 c3 = npyv_load_s32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s32 abc0 = npyv_muladd_s32(a0, b0, c0);
+            
+#line 397
+            npyv_s32 abc1 = npyv_muladd_s32(a1, b1, c1);
+            
+#line 397
+            npyv_s32 abc2 = npyv_muladd_s32(a2, b2, c2);
+            
+#line 397
+            npyv_s32 abc3 = npyv_muladd_s32(a3, b3, c3);
+            
+            #line 402
+            npyv_store_s32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_s32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_s32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_s32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_s32 a = npyv_load_tillz_s32(data0, count);
+        npyv_s32 b = npyv_load_tillz_s32(data1, count);
+        npyv_s32 c = npyv_load_tillz_s32(data_out, count);
+        npyv_store_till_s32(data_out, count, npyv_muladd_s32(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION  /* manual 4-way unrolling unless optimizations are disabled */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_int a0 = (data0[0]);
+        const npy_int b0 = (data1[0]);
+        const npy_int c0 = (data_out[0]);
+        
+#line 420
+        const npy_int a1 = (data0[1]);
+        const npy_int b1 = (data1[1]);
+        const npy_int c1 = (data_out[1]);
+        
+#line 420
+        const npy_int a2 = (data0[2]);
+        const npy_int b2 = (data1[2]);
+        const npy_int c2 = (data_out[2]);
+        
+#line 420
+        const npy_int a3 = (data0[3]);
+        const npy_int b3 = (data1[3]);
+        const npy_int c3 = (data_out[3]);
+        
+        #line 427
+        const npy_int abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_int abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_int abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_int abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {  /* remaining elements, one at a time */
+        const npy_int a = (*data0);
+        const npy_int b = (*data1);
+        const npy_int c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_int
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+int_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Two-operand kernel: operand 0 contributes a single npy_int value,
+     * operand 1 and the output are walked as plain npy_int arrays.  The
+     * element-wise work is delegated to int_sum_of_products_muladd().
+     */
+    const npy_int scalar = *(npy_int *)dataptr[0];
+    npy_int *const vec = (npy_int *)dataptr[1];
+    npy_int *const out = (npy_int *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                          (int)count);
+    int_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+static void
+int_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Two-operand kernel: operand 1 contributes a single npy_int value,
+     * operand 0 and the output are walked as plain npy_int arrays.  The
+     * element-wise work is delegated to int_sum_of_products_muladd().
+     */
+    const npy_int scalar = *(npy_int *)dataptr[1];
+    npy_int *const vec = (npy_int *)dataptr[0];
+    npy_int *const out = (npy_int *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                          (int)count);
+    int_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+static NPY_GCC_OPT_3 void
+int_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /*
+     * Dot-product style kernel: sums data0[i] * data1[i] over `count`
+     * npy_int elements read from dataptr[0] and dataptr[1] (both indexed as
+     * plain arrays), then adds that sum to the single npy_int stored at
+     * dataptr[2].  `nop` is not referenced and `strides` is marked unused.
+     */
+    npy_int *data0 = (npy_int *)dataptr[0];
+    npy_int *data1 = (npy_int *)dataptr[1];
+    npy_int accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_int; the constant 0 compiles this SIMD path out
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_s32;
+    npyv_s32 v_accum = npyv_zero_s32();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s32 a0 = npyv_loada_s32(data0 + vstep * 0);
+            npyv_s32 b0 = npyv_loada_s32(data1 + vstep * 0);
+            
+#line 501
+            npyv_s32 a1 = npyv_loada_s32(data0 + vstep * 1);
+            npyv_s32 b1 = npyv_loada_s32(data1 + vstep * 1);
+            
+#line 501
+            npyv_s32 a2 = npyv_loada_s32(data0 + vstep * 2);
+            npyv_s32 b2 = npyv_loada_s32(data1 + vstep * 2);
+            
+#line 501
+            npyv_s32 a3 = npyv_loada_s32(data0 + vstep * 3);
+            npyv_s32 b3 = npyv_loada_s32(data1 + vstep * 3);
+            
+            npyv_s32 ab3 = npyv_muladd_s32(a3, b3, v_accum);
+            npyv_s32 ab2 = npyv_muladd_s32(a2, b2, ab3);
+            npyv_s32 ab1 = npyv_muladd_s32(a1, b1, ab2);
+                   v_accum = npyv_muladd_s32(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s32 a0 = npyv_load_s32(data0 + vstep * 0);
+            npyv_s32 b0 = npyv_load_s32(data1 + vstep * 0);
+            
+#line 501
+            npyv_s32 a1 = npyv_load_s32(data0 + vstep * 1);
+            npyv_s32 b1 = npyv_load_s32(data1 + vstep * 1);
+            
+#line 501
+            npyv_s32 a2 = npyv_load_s32(data0 + vstep * 2);
+            npyv_s32 b2 = npyv_load_s32(data1 + vstep * 2);
+            
+#line 501
+            npyv_s32 a3 = npyv_load_s32(data0 + vstep * 3);
+            npyv_s32 b3 = npyv_load_s32(data1 + vstep * 3);
+            
+            npyv_s32 ab3 = npyv_muladd_s32(a3, b3, v_accum);
+            npyv_s32 ab2 = npyv_muladd_s32(a2, b2, ab3);
+            npyv_s32 ab1 = npyv_muladd_s32(a1, b1, ab2);
+                   v_accum = npyv_muladd_s32(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_s32 a = npyv_load_tillz_s32(data0, count);
+        npyv_s32 b = npyv_load_tillz_s32(data1, count);
+        v_accum = npyv_muladd_s32(a, b, v_accum);
+    }
+    accum = npyv_sum_s32(v_accum);
+    npyv_cleanup();
+#else /* scalar path: the one actually compiled, because the #if above is 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) { /* manual 4-way unroll */
+        #line 524
+        const npy_int ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_int ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_int ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_int ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* leftover elements, one at a time */
+        const npy_int a = (*data0);
+        const npy_int b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_int
+    *(npy_int *)dataptr[2] = ((*(npy_int *)dataptr[2]) + accum); /* add to, not overwrite, the output */
+}
+
+static NPY_GCC_OPT_3 void
+int_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Operand 0 supplies one npy_int factor; operand 1 is summed over
+     * `count` elements by int_sum_of_arr().  The product of the two is
+     * added to the single npy_int output at dataptr[2].
+     */
+    const npy_int scale = *(npy_int *)dataptr[0];
+    const npy_int total = int_sum_of_arr((npy_int *)dataptr[1], count);
+    npy_int *const out = (npy_int *)dataptr[2];
+
+    *out = *out + scale * total;
+}
+
+static NPY_GCC_OPT_3 void
+int_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Operand 1 supplies one npy_int factor; operand 0 is summed over
+     * `count` elements by int_sum_of_arr().  The product of the two is
+     * added to the single npy_int output at dataptr[2].
+     */
+    const npy_int scale = *(npy_int *)dataptr[1];
+    const npy_int total = int_sum_of_arr((npy_int *)dataptr[0], count);
+    npy_int *const out = (npy_int *)dataptr[2];
+
+    *out = *out + scale * total;
+}
+
+#elif 1000 == 3 && !0
+
+static void
+int_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /*
+     * Three-operand kernel over plain npy_int arrays: for each of `count`
+     * elements computes data_out[i] = data0[i] * data1[i] * data2[i] +
+     * data_out[i].  Inputs come from dataptr[0..2], the output from
+     * dataptr[3].  `nop` is not referenced and `strides` is marked unused.
+     * NOTE(review): in this expansion the enclosing `#elif 1000 == 3`
+     * condition is false, so this definition appears to be compiled out.
+     */
+    npy_int *data0 = (npy_int *)dataptr[0];
+    npy_int *data1 = (npy_int *)dataptr[1];
+    npy_int *data2 = (npy_int *)dataptr[2];
+    npy_int *data_out = (npy_int *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /*
+     * Finish off the loop: once the loop above exits, count is below 8.
+     * Each step below returns as soon as the remaining count is zero
+     * (the post-decrement consumes one element per step), otherwise it
+     * handles the next index relative to the already-advanced pointers.
+     */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1000 > 3 || @complex */
+
+static void
+int_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /*
+     * Generic kernel for an arbitrary number of operands (`nop`).  For each
+     * of `count` elements it multiplies the npy_int values found at
+     * dataptr[0..nop-1], adds the value at dataptr[nop], and stores the
+     * result back to dataptr[nop].  Every pointer in dataptr[0..nop] is then
+     * advanced by sizeof(npy_int), so the caller's dataptr array is
+     * modified in place.  `strides` is marked unused.
+     */
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0 /* always true here: the non-complex branch is the one compiled */
+        npy_int temp = (*(npy_int *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_int *)dataptr[i]);
+        }
+        *(npy_int *)dataptr[nop] = (temp +
+                                           (*(npy_int *)dataptr[i])); /* i == nop once the loop above has run or been skipped with nop >= 1 */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_int);
+        }
+#else /* complex */
+#  if 1000 <= 3
+#    define _SUMPROD_NOP 1000
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_int re, im, tmp;
+        int i;
+        re = ((npy_int *)dataptr[0])[0];
+        im = ((npy_int *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_int *)dataptr[i])[0] -
+                  im * ((npy_int *)dataptr[i])[1];
+            im = re * ((npy_int *)dataptr[i])[1] +
+                 im * ((npy_int *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_int *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_int *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_int *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_int *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_int);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+
+static NPY_GCC_OPT_3 void
+int_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /*
+     * Single-operand reduction: sums `count` npy_int elements starting at
+     * dataptr[0] (via int_sum_of_arr) and adds the total to the single
+     * npy_int at dataptr[1].  `nop` and `strides` are not referenced.
+     * NOTE(review): in this expansion the enclosing `#if 1000 == 1`
+     * condition is false, so this definition appears to be compiled out.
+     */
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0 /* always true here: the non-complex branch is the one compiled */
+    npy_int *data = (npy_int *)dataptr[0];
+    npy_int accum = int_sum_of_arr(data, count);
+    *((npy_int *)dataptr[1]) = (accum + (*((npy_int *)dataptr[1])));
+#else
+    npy_int accum_re = 0, accum_im = 0;
+    npy_int *data0 = (npy_int *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_int re01 = data0[0] + data0[2];
+        const npy_int re23 = data0[4] + data0[6];
+        const npy_int im13 = data0[1] + data0[3];
+        const npy_int im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_int *)dataptr[1])[0] += accum_re;
+    ((npy_int *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1000 == 1 */
+
+static void
+int_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /*
+     * Generic reduction into a single output element for an arbitrary
+     * number of operands (`nop`).  The template constant is 1000 here, so
+     * every `1000 == 1/2/3` and `1000 <= 3` test below is false and only
+     * the generic `#else` arms are compiled: for each of `count` steps the
+     * npy_int values at dataptr[0..nop-1] are multiplied together and added
+     * to `accum`, and each input pointer dataptr[i] (i < nop) is advanced
+     * by strides[i] bytes.  The output pointer dataptr[nop] is not
+     * advanced; the final `accum` is added to the npy_int it points to.
+     * The input entries of the caller's dataptr array are modified in place.
+     */
+#if 0
+    npy_int accum_re = 0, accum_im = 0;
+#else
+    npy_int accum = 0;
+#endif
+
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("int_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        accum += (*(npy_int *)data0);
+        data0 += stride0;
+#  elif 1000 == 2
+        accum += (*(npy_int *)data0) *
+                 (*(npy_int *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1000 == 3
+        accum += (*(npy_int *)data0) *
+                 (*(npy_int *)data1) *
+                 (*(npy_int *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else /* generic arm: the one compiled in this expansion */
+        npy_int temp = (*(npy_int *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_int *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) { /* inputs only; the output pointer stays put */
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        accum_re += ((npy_int *)data0)[0];
+        accum_im += ((npy_int *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_int re, im, tmp;
+        int i;
+        re = ((npy_int *)dataptr[0])[0];
+        im = ((npy_int *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_int *)dataptr[i])[0] -
+                  im * ((npy_int *)dataptr[i])[1];
+            im = re * ((npy_int *)dataptr[i])[1] +
+                 im * ((npy_int *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0
+#  if 1000 <= 3
+    ((npy_int *)dataptr[1000])[0] += accum_re;
+    ((npy_int *)dataptr[1000])[1] += accum_im;
+#  else
+    ((npy_int *)dataptr[nop])[0] += accum_re;
+    ((npy_int *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1000 <= 3
+    *((npy_int *)dataptr[1000]) = (accum +
+                                    (*((npy_int *)dataptr[1000])));
+#  else /* generic arm: the one compiled in this expansion */
+    *((npy_int *)dataptr[nop]) = (accum +
+                                    (*((npy_int *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+
+
+#line 74
+
+#if !0
+static NPY_GCC_OPT_3 npy_long long_sum_of_arr(npy_long *data, npy_intp count)
+{   /*
+     * Returns the sum of the `count` npy_long elements starting at `data`.
+     * A count of zero or less returns 0, since neither scalar loop runs.
+     */
+    npy_long accum = 0;
+#if 0 // NPYV check for npy_long; the constant 0 compiles this SIMD path out
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data);
+    const int vstep = npyv_nlanes_long;
+    npyv_long v_accum = npyv_zero_long();
+    const npy_intp vstepx4 = vstep * 4;
+
+    #line 91
+    if(is_aligned) {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_long a0 = npyv_loada_long(data + vstep * 0);
+            
+#line 96
+            npyv_long a1 = npyv_loada_long(data + vstep * 1);
+            
+#line 96
+            npyv_long a2 = npyv_loada_long(data + vstep * 2);
+            
+#line 96
+            npyv_long a3 = npyv_loada_long(data + vstep * 3);
+            
+            npyv_long a01   = npyv_add_long(a0, a1);
+            npyv_long a23   = npyv_add_long(a2, a3);
+            npyv_long a0123 = npyv_add_long(a01, a23);
+                     v_accum = npyv_add_long(a0123, v_accum);
+        }
+    }
+    
+#line 91
+    else {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_long a0 = npyv_load_long(data + vstep * 0);
+            
+#line 96
+            npyv_long a1 = npyv_load_long(data + vstep * 1);
+            
+#line 96
+            npyv_long a2 = npyv_load_long(data + vstep * 2);
+            
+#line 96
+            npyv_long a3 = npyv_load_long(data + vstep * 3);
+            
+            npyv_long a01   = npyv_add_long(a0, a1);
+            npyv_long a23   = npyv_add_long(a2, a3);
+            npyv_long a0123 = npyv_add_long(a01, a23);
+                     v_accum = npyv_add_long(a0123, v_accum);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep) {
+        npyv_long a = npyv_load_tillz_long(data, count);
+        v_accum = npyv_add_long(a, v_accum);
+    }
+    accum = npyv_sum_long(v_accum);
+    npyv_cleanup();
+#else /* scalar path: the one actually compiled, because the #if above is 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data += 4) { /* strict '>': an exact multiple of 4 leaves its last 4 elements to the tail loop */
+        const npy_long a01 = (*data) + (data[1]);
+        const npy_long a23 = (data[2]) + (data[3]);
+        accum +=  a01 + a23;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data++) { /* remaining elements, one at a time */
+        accum += (*data);
+    }
+#endif // NPYV check for npy_long
+    return accum;
+}
+#endif
+
+#line 131
+static void
+long_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /*
+     * Single-operand strided kernel.  The template constant is 1 here, so
+     * only the `1 == 1` arms below are compiled: for each of `count` steps
+     * the npy_long at data0 is added to the npy_long at data_out, then
+     * data0 and data_out (both char pointers) advance by strides[0] and
+     * strides[1] bytes respectively.  The function works on local copies
+     * of the pointers, so the caller's dataptr array is left untouched.
+     * `nop` is not referenced in the compiled arms.
+     */
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data_out = dataptr[1];
+    npy_intp stride_out = strides[1];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_one (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 1 == 1 /* the arm compiled in this expansion */
+        *(npy_long *)data_out = ((*(npy_long *)data0) +
+                                         (*(npy_long *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1 == 2
+        *(npy_long *)data_out = ((*(npy_long *)data0) *
+                                         (*(npy_long *)data1) +
+                                         (*(npy_long *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1 == 3
+        *(npy_long *)data_out = ((*(npy_long *)data0) *
+                                         (*(npy_long *)data1) *
+                                         (*(npy_long *)data2) +
+                                         (*(npy_long *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_long temp = (*(npy_long *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_long *)dataptr[i]);
+        }
+        *(npy_long *)dataptr[nop] = (temp +
+                                           (*(npy_long *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        ((npy_long *)data_out)[0] = ((npy_long *)data0)[0] +
+                                         ((npy_long *)data_out)[0];
+        ((npy_long *)data_out)[1] = ((npy_long *)data0)[1] +
+                                         ((npy_long *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_long re, im, tmp;
+        int i;
+        re = ((npy_long *)dataptr[0])[0];
+        im = ((npy_long *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_long *)dataptr[i])[0] -
+                  im * ((npy_long *)dataptr[i])[1];
+            im = re * ((npy_long *)dataptr[i])[1] +
+                 im * ((npy_long *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_long *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_long *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_long *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_long *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1 == 1
+
+static void
+long_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /*
+     * Single-operand kernel over plain npy_long arrays: for each of `count`
+     * elements computes data_out[i] = data0[i] + data_out[i], with data0
+     * taken from dataptr[0] and data_out from dataptr[1].  `nop` is not
+     * referenced and `strides` is marked unused.
+     *
+     * Control flow: the switch handles counts 0..7 by falling through from
+     * `case count` down to `case 0`, which returns.  It has no default, so
+     * a count of 8 or more skips it, runs the 8-way unrolled loop, and then
+     * jumps back to the switch to finish the remaining 0..7 elements.
+     * NOTE(review): a negative count matches no case and fails the loop
+     * test, so the goto would spin forever; presumably callers never pass
+     * a negative count -- confirm.
+     */
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long *data_out = (npy_long *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_long *)data_out + 2*6)[0] =
+                                    ((npy_long *)data0 + 2*6)[0] +
+                                    ((npy_long *)data_out + 2*6)[0];
+            ((npy_long *)data_out + 2*6)[1] =
+                                    ((npy_long *)data0 + 2*6)[1] +
+                                    ((npy_long *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_long *)data_out + 2*5)[0] =
+                                    ((npy_long *)data0 + 2*5)[0] +
+                                    ((npy_long *)data_out + 2*5)[0];
+            ((npy_long *)data_out + 2*5)[1] =
+                                    ((npy_long *)data0 + 2*5)[1] +
+                                    ((npy_long *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_long *)data_out + 2*4)[0] =
+                                    ((npy_long *)data0 + 2*4)[0] +
+                                    ((npy_long *)data_out + 2*4)[0];
+            ((npy_long *)data_out + 2*4)[1] =
+                                    ((npy_long *)data0 + 2*4)[1] +
+                                    ((npy_long *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_long *)data_out + 2*3)[0] =
+                                    ((npy_long *)data0 + 2*3)[0] +
+                                    ((npy_long *)data_out + 2*3)[0];
+            ((npy_long *)data_out + 2*3)[1] =
+                                    ((npy_long *)data0 + 2*3)[1] +
+                                    ((npy_long *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_long *)data_out + 2*2)[0] =
+                                    ((npy_long *)data0 + 2*2)[0] +
+                                    ((npy_long *)data_out + 2*2)[0];
+            ((npy_long *)data_out + 2*2)[1] =
+                                    ((npy_long *)data0 + 2*2)[1] +
+                                    ((npy_long *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_long *)data_out + 2*1)[0] =
+                                    ((npy_long *)data0 + 2*1)[0] +
+                                    ((npy_long *)data_out + 2*1)[0];
+            ((npy_long *)data_out + 2*1)[1] =
+                                    ((npy_long *)data0 + 2*1)[1] +
+                                    ((npy_long *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_long *)data_out + 2*0)[0] =
+                                    ((npy_long *)data0 + 2*0)[0] +
+                                    ((npy_long *)data_out + 2*0)[0];
+            ((npy_long *)data_out + 2*0)[1] =
+                                    ((npy_long *)data0 + 2*0)[1] +
+                                    ((npy_long *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_long *)data_out + 2*0)[0] =
+                                ((npy_long *)data0 + 2*0)[0] +
+                                ((npy_long *)data_out + 2*0)[0];
+        ((npy_long *)data_out + 2*0)[1] =
+                                ((npy_long *)data0 + 2*0)[1] +
+                                ((npy_long *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_long *)data_out + 2*1)[0] =
+                                ((npy_long *)data0 + 2*1)[0] +
+                                ((npy_long *)data_out + 2*1)[0];
+        ((npy_long *)data_out + 2*1)[1] =
+                                ((npy_long *)data0 + 2*1)[1] +
+                                ((npy_long *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_long *)data_out + 2*2)[0] =
+                                ((npy_long *)data0 + 2*2)[0] +
+                                ((npy_long *)data_out + 2*2)[0];
+        ((npy_long *)data_out + 2*2)[1] =
+                                ((npy_long *)data0 + 2*2)[1] +
+                                ((npy_long *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_long *)data_out + 2*3)[0] =
+                                ((npy_long *)data0 + 2*3)[0] +
+                                ((npy_long *)data_out + 2*3)[0];
+        ((npy_long *)data_out + 2*3)[1] =
+                                ((npy_long *)data0 + 2*3)[1] +
+                                ((npy_long *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_long *)data_out + 2*4)[0] =
+                                ((npy_long *)data0 + 2*4)[0] +
+                                ((npy_long *)data_out + 2*4)[0];
+        ((npy_long *)data_out + 2*4)[1] =
+                                ((npy_long *)data0 + 2*4)[1] +
+                                ((npy_long *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_long *)data_out + 2*5)[0] =
+                                ((npy_long *)data0 + 2*5)[0] +
+                                ((npy_long *)data_out + 2*5)[0];
+        ((npy_long *)data_out + 2*5)[1] =
+                                ((npy_long *)data0 + 2*5)[1] +
+                                ((npy_long *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_long *)data_out + 2*6)[0] =
+                                ((npy_long *)data0 + 2*6)[0] +
+                                ((npy_long *)data_out + 2*6)[0];
+        ((npy_long *)data_out + 2*6)[1] =
+                                ((npy_long *)data0 + 2*6)[1] +
+                                ((npy_long *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_long *)data_out + 2*7)[0] =
+                                ((npy_long *)data0 + 2*7)[0] +
+                                ((npy_long *)data_out + 2*7)[0];
+        ((npy_long *)data_out + 2*7)[1] =
+                                ((npy_long *)data0 + 2*7)[1] +
+                                ((npy_long *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: count is now below 8, so the switch above handles the rest */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1 == 2 && !0
+// Multiply-add helper: data_out[i] = scalar * data[i] + data_out[i] for i in [0, count).
+// Reads count elements from data and updates data_out in place; nothing is returned.
+static NPY_GCC_OPT_3 void
+long_sum_of_products_muladd(npy_long *data, npy_long *data_out, npy_long scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_long: constant 0 here, so this SIMD branch is not compiled
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_long;
+    const npyv_long v_scalar = npyv_setall_long(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_long b0 = npyv_loada_long(data + vstep * 0);
+            npyv_long c0 = npyv_loada_long(data_out + vstep * 0);
+            
+#line 312
+            npyv_long b1 = npyv_loada_long(data + vstep * 1);
+            npyv_long c1 = npyv_loada_long(data_out + vstep * 1);
+            
+#line 312
+            npyv_long b2 = npyv_loada_long(data + vstep * 2);
+            npyv_long c2 = npyv_loada_long(data_out + vstep * 2);
+            
+#line 312
+            npyv_long b3 = npyv_loada_long(data + vstep * 3);
+            npyv_long c3 = npyv_loada_long(data_out + vstep * 3);
+            
+            #line 318
+            npyv_long abc0 = npyv_muladd_long(v_scalar, b0, c0);
+            
+#line 318
+            npyv_long abc1 = npyv_muladd_long(v_scalar, b1, c1);
+            
+#line 318
+            npyv_long abc2 = npyv_muladd_long(v_scalar, b2, c2);
+            
+#line 318
+            npyv_long abc3 = npyv_muladd_long(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_long(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_long(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_long(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_long(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_long b0 = npyv_load_long(data + vstep * 0);
+            npyv_long c0 = npyv_load_long(data_out + vstep * 0);
+            
+#line 312
+            npyv_long b1 = npyv_load_long(data + vstep * 1);
+            npyv_long c1 = npyv_load_long(data_out + vstep * 1);
+            
+#line 312
+            npyv_long b2 = npyv_load_long(data + vstep * 2);
+            npyv_long c2 = npyv_load_long(data_out + vstep * 2);
+            
+#line 312
+            npyv_long b3 = npyv_load_long(data + vstep * 3);
+            npyv_long c3 = npyv_load_long(data_out + vstep * 3);
+            
+            #line 318
+            npyv_long abc0 = npyv_muladd_long(v_scalar, b0, c0);
+            
+#line 318
+            npyv_long abc1 = npyv_muladd_long(v_scalar, b1, c1);
+            
+#line 318
+            npyv_long abc2 = npyv_muladd_long(v_scalar, b2, c2);
+            
+#line 318
+            npyv_long abc3 = npyv_muladd_long(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_long(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_long(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_long(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_long(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Remainder loop: count is handed to the load_tillz / store_till helpers, presumably to limit the lanes touched -- confirm */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_long a = npyv_load_tillz_long(data, count);
+        npyv_long b = npyv_load_tillz_long(data_out, count);
+        npyv_store_till_long(data_out, count, npyv_muladd_long(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else // scalar path, selected because the #if above is the constant 0
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { /* four elements per iteration */
+        #line 340
+        const npy_long b0 = (data[0]);
+        const npy_long c0 = (data_out[0]);
+        
+#line 340
+        const npy_long b1 = (data[1]);
+        const npy_long c1 = (data_out[1]);
+        
+#line 340
+        const npy_long b2 = (data[2]);
+        const npy_long c2 = (data_out[2]);
+        
+#line 340
+        const npy_long b3 = (data[3]);
+        const npy_long c3 = (data_out[3]);
+        
+        #line 346
+        const npy_long abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_long abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_long abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_long abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* leftovers, one element at a time (everything if the unrolled loop is disabled) */
+        const npy_long b = (*data);
+        const npy_long c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_long
+}
+/* Contiguous two-operand kernel: data_out[i] = data0[i] * data1[i] + data_out[i] for count elements. */
+static void
+long_sum_of_products_contig_two(int nop, char **dataptr, /* nop is not read in this body */
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long *data1 = (npy_long *)dataptr[1];
+    npy_long *data_out = (npy_long *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_long (constant 0 below, so the SIMD branch is not compiled)
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_long;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_long a0 = npyv_loada_long(data0 + vstep * 0);
+            npyv_long b0 = npyv_loada_long(data1 + vstep * 0);
+            npyv_long c0 = npyv_loada_long(data_out + vstep * 0);
+            
+#line 390
+            npyv_long a1 = npyv_loada_long(data0 + vstep * 1);
+            npyv_long b1 = npyv_loada_long(data1 + vstep * 1);
+            npyv_long c1 = npyv_loada_long(data_out + vstep * 1);
+            
+#line 390
+            npyv_long a2 = npyv_loada_long(data0 + vstep * 2);
+            npyv_long b2 = npyv_loada_long(data1 + vstep * 2);
+            npyv_long c2 = npyv_loada_long(data_out + vstep * 2);
+            
+#line 390
+            npyv_long a3 = npyv_loada_long(data0 + vstep * 3);
+            npyv_long b3 = npyv_loada_long(data1 + vstep * 3);
+            npyv_long c3 = npyv_loada_long(data_out + vstep * 3);
+            
+            #line 397
+            npyv_long abc0 = npyv_muladd_long(a0, b0, c0);
+            
+#line 397
+            npyv_long abc1 = npyv_muladd_long(a1, b1, c1);
+            
+#line 397
+            npyv_long abc2 = npyv_muladd_long(a2, b2, c2);
+            
+#line 397
+            npyv_long abc3 = npyv_muladd_long(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_long(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_long(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_long(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_long(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_long a0 = npyv_load_long(data0 + vstep * 0);
+            npyv_long b0 = npyv_load_long(data1 + vstep * 0);
+            npyv_long c0 = npyv_load_long(data_out + vstep * 0);
+            
+#line 390
+            npyv_long a1 = npyv_load_long(data0 + vstep * 1);
+            npyv_long b1 = npyv_load_long(data1 + vstep * 1);
+            npyv_long c1 = npyv_load_long(data_out + vstep * 1);
+            
+#line 390
+            npyv_long a2 = npyv_load_long(data0 + vstep * 2);
+            npyv_long b2 = npyv_load_long(data1 + vstep * 2);
+            npyv_long c2 = npyv_load_long(data_out + vstep * 2);
+            
+#line 390
+            npyv_long a3 = npyv_load_long(data0 + vstep * 3);
+            npyv_long b3 = npyv_load_long(data1 + vstep * 3);
+            npyv_long c3 = npyv_load_long(data_out + vstep * 3);
+            
+            #line 397
+            npyv_long abc0 = npyv_muladd_long(a0, b0, c0);
+            
+#line 397
+            npyv_long abc1 = npyv_muladd_long(a1, b1, c1);
+            
+#line 397
+            npyv_long abc2 = npyv_muladd_long(a2, b2, c2);
+            
+#line 397
+            npyv_long abc3 = npyv_muladd_long(a3, b3, c3);
+            
+            #line 402
+            npyv_store_long(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_long(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_long(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_long(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_long a = npyv_load_tillz_long(data0, count);
+        npyv_long b = npyv_load_tillz_long(data1, count);
+        npyv_long c = npyv_load_tillz_long(data_out, count);
+        npyv_store_till_long(data_out, count, npyv_muladd_long(a, b, c));
+    }
+    npyv_cleanup();
+#else /* scalar path, selected because the #if above is the constant 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { /* four elements per iteration */
+        #line 420
+        const npy_long a0 = (data0[0]);
+        const npy_long b0 = (data1[0]);
+        const npy_long c0 = (data_out[0]);
+        
+#line 420
+        const npy_long a1 = (data0[1]);
+        const npy_long b1 = (data1[1]);
+        const npy_long c1 = (data_out[1]);
+        
+#line 420
+        const npy_long a2 = (data0[2]);
+        const npy_long b2 = (data1[2]);
+        const npy_long c2 = (data_out[2]);
+        
+#line 420
+        const npy_long a3 = (data0[3]);
+        const npy_long b3 = (data1[3]);
+        const npy_long c3 = (data_out[3]);
+        
+        #line 427
+        const npy_long abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_long abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_long abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_long abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* leftovers, one element at a time */
+        const npy_long a = (*data0);
+        const npy_long b = (*data1);
+        const npy_long c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_long
+
+}
+
+/* Some extra specializations for the two operand case. This one: operand 0 is a single value, operand 1 and the output are walked as arrays. */
+static void
+long_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long value0 = (*(npy_long *)dataptr[0]); /* read once, then used as the scalar factor */
+    npy_long *data1 = (npy_long *)dataptr[1];
+    npy_long *data_out = (npy_long *)dataptr[2];
+/* The per-element work is delegated: the muladd helper computes data_out[i] = value0 * data1[i] + data_out[i]. */
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    long_sum_of_products_muladd(data1, data_out, value0, count);
+    
+}
+/* Specialization with a single-value operand 1: data_out[i] = value1 * data0[i] + data_out[i], done by the muladd helper. */
+static void
+long_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long value1 = (*(npy_long *)dataptr[1]); /* read once, then used as the scalar factor */
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long *data_out = (npy_long *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    long_sum_of_products_muladd(data0, data_out, value1, count);
+}
+/* Reduction of two arrays into one value: adds the sum of data0[i] * data1[i] over count elements to the value at dataptr[2]. */
+static NPY_GCC_OPT_3 void
+long_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long *data1 = (npy_long *)dataptr[1];
+    npy_long accum = 0; /* running sum of products; folded into the output once, after the loops */
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_long: constant 0 here, so this SIMD branch is not compiled
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_long;
+    npyv_long v_accum = npyv_zero_long();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_long a0 = npyv_loada_long(data0 + vstep * 0);
+            npyv_long b0 = npyv_loada_long(data1 + vstep * 0);
+            
+#line 501
+            npyv_long a1 = npyv_loada_long(data0 + vstep * 1);
+            npyv_long b1 = npyv_loada_long(data1 + vstep * 1);
+            
+#line 501
+            npyv_long a2 = npyv_loada_long(data0 + vstep * 2);
+            npyv_long b2 = npyv_loada_long(data1 + vstep * 2);
+            
+#line 501
+            npyv_long a3 = npyv_loada_long(data0 + vstep * 3);
+            npyv_long b3 = npyv_loada_long(data1 + vstep * 3);
+            
+            npyv_long ab3 = npyv_muladd_long(a3, b3, v_accum);
+            npyv_long ab2 = npyv_muladd_long(a2, b2, ab3);
+            npyv_long ab1 = npyv_muladd_long(a1, b1, ab2);
+                   v_accum = npyv_muladd_long(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_long a0 = npyv_load_long(data0 + vstep * 0);
+            npyv_long b0 = npyv_load_long(data1 + vstep * 0);
+            
+#line 501
+            npyv_long a1 = npyv_load_long(data0 + vstep * 1);
+            npyv_long b1 = npyv_load_long(data1 + vstep * 1);
+            
+#line 501
+            npyv_long a2 = npyv_load_long(data0 + vstep * 2);
+            npyv_long b2 = npyv_load_long(data1 + vstep * 2);
+            
+#line 501
+            npyv_long a3 = npyv_load_long(data0 + vstep * 3);
+            npyv_long b3 = npyv_load_long(data1 + vstep * 3);
+            
+            npyv_long ab3 = npyv_muladd_long(a3, b3, v_accum);
+            npyv_long ab2 = npyv_muladd_long(a2, b2, ab3);
+            npyv_long ab1 = npyv_muladd_long(a1, b1, ab2);
+                   v_accum = npyv_muladd_long(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_long a = npyv_load_tillz_long(data0, count);
+        npyv_long b = npyv_load_tillz_long(data1, count);
+        v_accum = npyv_muladd_long(a, b, v_accum);
+    }
+    accum = npyv_sum_long(v_accum);
+    npyv_cleanup();
+#else /* scalar path, selected because the #if above is the constant 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) { /* four products per iteration */
+        #line 524
+        const npy_long ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_long ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_long ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_long ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* leftovers, one product at a time */
+        const npy_long a = (*data0);
+        const npy_long b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_long
+    *(npy_long *)dataptr[2] = ((*(npy_long *)dataptr[2]) + accum);
+}
+/* Single-value operand 0, array operand 1, single output: adds value0 * long_sum_of_arr(data1, count) to the value at dataptr[2]. */
+static NPY_GCC_OPT_3 void
+long_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data1 = (npy_long *)dataptr[1];
+    npy_long value0 = (*(npy_long *)dataptr[0]); /* read once */
+    npy_long accum = long_sum_of_arr(data1, count); /* helper defined outside this view; presumably sums count elements -- confirm */
+    *(npy_long *)dataptr[2] = ((*(npy_long *)dataptr[2]) + value0 * accum); /* scalar factor applied once to the helper result */
+}
+/* Array operand 0, single-value operand 1, single output: adds value1 * long_sum_of_arr(data0, count) to the value at dataptr[2]. */
+static NPY_GCC_OPT_3 void
+long_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long value1 = (*(npy_long *)dataptr[1]); /* read once */
+    npy_long accum = long_sum_of_arr(data0, count); /* helper defined outside this view; presumably sums count elements -- confirm */
+    *(npy_long *)dataptr[2] = ((*(npy_long *)dataptr[2]) + value1 * accum); /* scalar factor applied once to the helper result */
+}
+
+#elif 1 == 3 && !0
+/* Contiguous three-operand kernel: data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i] for count elements. */
+static void
+long_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long *data1 = (npy_long *)dataptr[1];
+    npy_long *data2 = (npy_long *)dataptr[2];
+    npy_long *data_out = (npy_long *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: fewer than 8 elements remain; each step below returns as soon as count reaches zero */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1 > 3 || @complex */
+/* Generic contiguous fallback: per element, multiplies one value from each of the nop operands and adds the product to the output. */
+static void
+long_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0 /* constant-true: the complex branch further down is not compiled */
+        npy_long temp = (*(npy_long *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_long *)dataptr[i]);
+        }
+        *(npy_long *)dataptr[nop] = (temp +
+                                           (*(npy_long *)dataptr[i])); /* i == nop after the loop when nop >= 1, i.e. the output slot */
+        for (i = 0; i <= nop; ++i) { /* advance every operand pointer and the output by one element */
+            dataptr[i] += sizeof(npy_long);
+        }
+#else /* complex */
+#  if 1 <= 3
+#    define _SUMPROD_NOP 1
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_long re, im, tmp;
+        int i;
+        re = ((npy_long *)dataptr[0])[0];
+        im = ((npy_long *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_long *)dataptr[i])[0] -
+                  im * ((npy_long *)dataptr[i])[1];
+            im = re * ((npy_long *)dataptr[i])[1] +
+                 im * ((npy_long *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_long *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_long *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_long *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_long *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_long);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1 */
+
+#if 1 == 1
+/* One array operand reduced into one value: adds long_sum_of_arr(data, count) to the value at dataptr[1]. */
+static NPY_GCC_OPT_3 void
+long_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0 /* constant-true: this branch is the one compiled */
+    npy_long *data = (npy_long *)dataptr[0];
+    npy_long accum = long_sum_of_arr(data, count); /* helper defined outside this view; presumably sums count elements -- confirm */
+    *((npy_long *)dataptr[1]) = (accum + (*((npy_long *)dataptr[1])));
+#else /* complex variant (re/im pairs); not compiled because the #if above is constant-true */
+    npy_long accum_re = 0, accum_im = 0;
+    npy_long *data0 = (npy_long *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_long re01 = data0[0] + data0[2];
+        const npy_long re23 = data0[4] + data0[6];
+        const npy_long im13 = data0[1] + data0[3];
+        const npy_long im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_long *)dataptr[1])[0] += accum_re;
+    ((npy_long *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1 == 1 */
+/* Strided single-operand reduction: walks operand 0 in steps of strides[0] bytes and adds the total to the value at dataptr[1]. */
+static void
+long_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0 /* complex accumulators: not compiled */
+    npy_long accum_re = 0, accum_im = 0;
+#else
+    npy_long accum = 0;
+#endif
+
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 1 == 1 /* constant-true: only this two-line branch is compiled into the loop */
+        accum += (*(npy_long *)data0);
+        data0 += stride0;
+#  elif 1 == 2
+        accum += (*(npy_long *)data0) *
+                 (*(npy_long *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1 == 3
+        accum += (*(npy_long *)data0) *
+                 (*(npy_long *)data1) *
+                 (*(npy_long *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_long temp = (*(npy_long *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_long *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        accum_re += ((npy_long *)data0)[0];
+        accum_im += ((npy_long *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_long re, im, tmp;
+        int i;
+        re = ((npy_long *)dataptr[0])[0];
+        im = ((npy_long *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_long *)dataptr[i])[0] -
+                  im * ((npy_long *)dataptr[i])[1];
+            im = re * ((npy_long *)dataptr[i])[1] +
+                 im * ((npy_long *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0 /* complex write-back: not compiled */
+#  if 1 <= 3
+    ((npy_long *)dataptr[1])[0] += accum_re;
+    ((npy_long *)dataptr[1])[1] += accum_im;
+#  else
+    ((npy_long *)dataptr[nop])[0] += accum_re;
+    ((npy_long *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1 <= 3 /* constant-true: the output is addressed as dataptr[1] rather than dataptr[nop] */
+    *((npy_long *)dataptr[1]) = (accum +
+                                    (*((npy_long *)dataptr[1])));
+#  else
+    *((npy_long *)dataptr[nop]) = (accum +
+                                    (*((npy_long *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+/* Strided two-operand kernel: per element, out = in0 * in1 + out, with each of the three pointers advanced by its own byte stride. */
+#line 131
+static void
+long_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data_out = dataptr[2];
+    npy_intp stride_out = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_two (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 2 == 1
+        *(npy_long *)data_out = ((*(npy_long *)data0) +
+                                         (*(npy_long *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 2 == 2 /* constant-true: this is the branch compiled into the loop */
+        *(npy_long *)data_out = ((*(npy_long *)data0) *
+                                         (*(npy_long *)data1) +
+                                         (*(npy_long *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 2 == 3
+        *(npy_long *)data_out = ((*(npy_long *)data0) *
+                                         (*(npy_long *)data1) *
+                                         (*(npy_long *)data2) +
+                                         (*(npy_long *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_long temp = (*(npy_long *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_long *)dataptr[i]);
+        }
+        *(npy_long *)dataptr[nop] = (temp +
+                                           (*(npy_long *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        ((npy_long *)data_out)[0] = ((npy_long *)data0)[0] +
+                                         ((npy_long *)data_out)[0];
+        ((npy_long *)data_out)[1] = ((npy_long *)data0)[1] +
+                                         ((npy_long *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_long re, im, tmp;
+        int i;
+        re = ((npy_long *)dataptr[0])[0];
+        im = ((npy_long *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_long *)dataptr[i])[0] -
+                  im * ((npy_long *)dataptr[i])[1];
+            im = re * ((npy_long *)dataptr[i])[1] +
+                 im * ((npy_long *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_long *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_long *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_long *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_long *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 2 == 1
+
+static void
+long_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long *data_out = (npy_long *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_long *)data_out + 2*6)[0] =
+                                    ((npy_long *)data0 + 2*6)[0] +
+                                    ((npy_long *)data_out + 2*6)[0];
+            ((npy_long *)data_out + 2*6)[1] =
+                                    ((npy_long *)data0 + 2*6)[1] +
+                                    ((npy_long *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_long *)data_out + 2*5)[0] =
+                                    ((npy_long *)data0 + 2*5)[0] +
+                                    ((npy_long *)data_out + 2*5)[0];
+            ((npy_long *)data_out + 2*5)[1] =
+                                    ((npy_long *)data0 + 2*5)[1] +
+                                    ((npy_long *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_long *)data_out + 2*4)[0] =
+                                    ((npy_long *)data0 + 2*4)[0] +
+                                    ((npy_long *)data_out + 2*4)[0];
+            ((npy_long *)data_out + 2*4)[1] =
+                                    ((npy_long *)data0 + 2*4)[1] +
+                                    ((npy_long *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_long *)data_out + 2*3)[0] =
+                                    ((npy_long *)data0 + 2*3)[0] +
+                                    ((npy_long *)data_out + 2*3)[0];
+            ((npy_long *)data_out + 2*3)[1] =
+                                    ((npy_long *)data0 + 2*3)[1] +
+                                    ((npy_long *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_long *)data_out + 2*2)[0] =
+                                    ((npy_long *)data0 + 2*2)[0] +
+                                    ((npy_long *)data_out + 2*2)[0];
+            ((npy_long *)data_out + 2*2)[1] =
+                                    ((npy_long *)data0 + 2*2)[1] +
+                                    ((npy_long *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_long *)data_out + 2*1)[0] =
+                                    ((npy_long *)data0 + 2*1)[0] +
+                                    ((npy_long *)data_out + 2*1)[0];
+            ((npy_long *)data_out + 2*1)[1] =
+                                    ((npy_long *)data0 + 2*1)[1] +
+                                    ((npy_long *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_long *)data_out + 2*0)[0] =
+                                    ((npy_long *)data0 + 2*0)[0] +
+                                    ((npy_long *)data_out + 2*0)[0];
+            ((npy_long *)data_out + 2*0)[1] =
+                                    ((npy_long *)data0 + 2*0)[1] +
+                                    ((npy_long *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_long *)data_out + 2*0)[0] =
+                                ((npy_long *)data0 + 2*0)[0] +
+                                ((npy_long *)data_out + 2*0)[0];
+        ((npy_long *)data_out + 2*0)[1] =
+                                ((npy_long *)data0 + 2*0)[1] +
+                                ((npy_long *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_long *)data_out + 2*1)[0] =
+                                ((npy_long *)data0 + 2*1)[0] +
+                                ((npy_long *)data_out + 2*1)[0];
+        ((npy_long *)data_out + 2*1)[1] =
+                                ((npy_long *)data0 + 2*1)[1] +
+                                ((npy_long *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_long *)data_out + 2*2)[0] =
+                                ((npy_long *)data0 + 2*2)[0] +
+                                ((npy_long *)data_out + 2*2)[0];
+        ((npy_long *)data_out + 2*2)[1] =
+                                ((npy_long *)data0 + 2*2)[1] +
+                                ((npy_long *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_long *)data_out + 2*3)[0] =
+                                ((npy_long *)data0 + 2*3)[0] +
+                                ((npy_long *)data_out + 2*3)[0];
+        ((npy_long *)data_out + 2*3)[1] =
+                                ((npy_long *)data0 + 2*3)[1] +
+                                ((npy_long *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_long *)data_out + 2*4)[0] =
+                                ((npy_long *)data0 + 2*4)[0] +
+                                ((npy_long *)data_out + 2*4)[0];
+        ((npy_long *)data_out + 2*4)[1] =
+                                ((npy_long *)data0 + 2*4)[1] +
+                                ((npy_long *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_long *)data_out + 2*5)[0] =
+                                ((npy_long *)data0 + 2*5)[0] +
+                                ((npy_long *)data_out + 2*5)[0];
+        ((npy_long *)data_out + 2*5)[1] =
+                                ((npy_long *)data0 + 2*5)[1] +
+                                ((npy_long *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_long *)data_out + 2*6)[0] =
+                                ((npy_long *)data0 + 2*6)[0] +
+                                ((npy_long *)data_out + 2*6)[0];
+        ((npy_long *)data_out + 2*6)[1] =
+                                ((npy_long *)data0 + 2*6)[1] +
+                                ((npy_long *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_long *)data_out + 2*7)[0] =
+                                ((npy_long *)data0 + 2*7)[0] +
+                                ((npy_long *)data_out + 2*7)[0];
+        ((npy_long *)data_out + 2*7)[1] =
+                                ((npy_long *)data0 + 2*7)[1] +
+                                ((npy_long *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 2 == 2 && !0
+
+/*
+ * Multiply-accumulate a contiguous run with a scalar:
+ *     data_out[i] = scalar * data[i] + data_out[i]   for i in [0, count).
+ *
+ * The NPYV branch below is guarded by `#if 0` in this expansion, so only
+ * the scalar `#else` branch is compiled: a 4-way unrolled loop (omitted
+ * when NPY_DISABLE_OPTIMIZATION is defined) followed by a remainder loop.
+ * `data` and `data_out` are advanced locally; nothing is returned.
+ */
+static NPY_GCC_OPT_3 void
+long_sum_of_products_muladd(npy_long *data, npy_long *data_out, npy_long scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_long
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_long;
+    const npyv_long v_scalar = npyv_setall_long(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_long b0 = npyv_loada_long(data + vstep * 0);
+            npyv_long c0 = npyv_loada_long(data_out + vstep * 0);
+            
+#line 312
+            npyv_long b1 = npyv_loada_long(data + vstep * 1);
+            npyv_long c1 = npyv_loada_long(data_out + vstep * 1);
+            
+#line 312
+            npyv_long b2 = npyv_loada_long(data + vstep * 2);
+            npyv_long c2 = npyv_loada_long(data_out + vstep * 2);
+            
+#line 312
+            npyv_long b3 = npyv_loada_long(data + vstep * 3);
+            npyv_long c3 = npyv_loada_long(data_out + vstep * 3);
+            
+            #line 318
+            npyv_long abc0 = npyv_muladd_long(v_scalar, b0, c0);
+            
+#line 318
+            npyv_long abc1 = npyv_muladd_long(v_scalar, b1, c1);
+            
+#line 318
+            npyv_long abc2 = npyv_muladd_long(v_scalar, b2, c2);
+            
+#line 318
+            npyv_long abc3 = npyv_muladd_long(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_long(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_long(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_long(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_long(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_long b0 = npyv_load_long(data + vstep * 0);
+            npyv_long c0 = npyv_load_long(data_out + vstep * 0);
+            
+#line 312
+            npyv_long b1 = npyv_load_long(data + vstep * 1);
+            npyv_long c1 = npyv_load_long(data_out + vstep * 1);
+            
+#line 312
+            npyv_long b2 = npyv_load_long(data + vstep * 2);
+            npyv_long c2 = npyv_load_long(data_out + vstep * 2);
+            
+#line 312
+            npyv_long b3 = npyv_load_long(data + vstep * 3);
+            npyv_long c3 = npyv_load_long(data_out + vstep * 3);
+            
+            #line 318
+            npyv_long abc0 = npyv_muladd_long(v_scalar, b0, c0);
+            
+#line 318
+            npyv_long abc1 = npyv_muladd_long(v_scalar, b1, c1);
+            
+#line 318
+            npyv_long abc2 = npyv_muladd_long(v_scalar, b2, c2);
+            
+#line 318
+            npyv_long abc3 = npyv_muladd_long(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_long(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_long(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_long(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_long(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_long a = npyv_load_tillz_long(data, count);
+        npyv_long b = npyv_load_tillz_long(data_out, count);
+        npyv_store_till_long(data_out, count, npyv_muladd_long(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+    /*
+     * Scalar path: four elements per iteration, grouped as all loads, then
+     * all multiply-adds, then all stores.
+     */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_long b0 = (data[0]);
+        const npy_long c0 = (data_out[0]);
+        
+#line 340
+        const npy_long b1 = (data[1]);
+        const npy_long c1 = (data_out[1]);
+        
+#line 340
+        const npy_long b2 = (data[2]);
+        const npy_long c2 = (data_out[2]);
+        
+#line 340
+        const npy_long b3 = (data[3]);
+        const npy_long c3 = (data_out[3]);
+        
+        #line 346
+        const npy_long abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_long abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_long abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_long abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /*
+     * Remainder: whatever the unrolled loop left over, or every element
+     * when the unrolled loop is compiled out.
+     */
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_long b = (*data);
+        const npy_long c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_long
+}
+
+/*
+ * Two-operand sum of products over element-contiguous npy_long arrays:
+ *     data_out[i] = data0[i] * data1[i] + data_out[i]   for i in [0, count)
+ * with data0 = dataptr[0], data1 = dataptr[1], data_out = dataptr[2].
+ *
+ * `strides` is marked unused and `nop` is not read in this body.  The
+ * `#if 0` NPYV branch is compiled out in this expansion, leaving the scalar
+ * branch: a 4-way unrolled loop (omitted when NPY_DISABLE_OPTIMIZATION is
+ * defined) followed by a remainder loop.
+ */
+static void
+long_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long *data1 = (npy_long *)dataptr[1];
+    npy_long *data_out = (npy_long *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_long
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_long;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_long a0 = npyv_loada_long(data0 + vstep * 0);
+            npyv_long b0 = npyv_loada_long(data1 + vstep * 0);
+            npyv_long c0 = npyv_loada_long(data_out + vstep * 0);
+            
+#line 390
+            npyv_long a1 = npyv_loada_long(data0 + vstep * 1);
+            npyv_long b1 = npyv_loada_long(data1 + vstep * 1);
+            npyv_long c1 = npyv_loada_long(data_out + vstep * 1);
+            
+#line 390
+            npyv_long a2 = npyv_loada_long(data0 + vstep * 2);
+            npyv_long b2 = npyv_loada_long(data1 + vstep * 2);
+            npyv_long c2 = npyv_loada_long(data_out + vstep * 2);
+            
+#line 390
+            npyv_long a3 = npyv_loada_long(data0 + vstep * 3);
+            npyv_long b3 = npyv_loada_long(data1 + vstep * 3);
+            npyv_long c3 = npyv_loada_long(data_out + vstep * 3);
+            
+            #line 397
+            npyv_long abc0 = npyv_muladd_long(a0, b0, c0);
+            
+#line 397
+            npyv_long abc1 = npyv_muladd_long(a1, b1, c1);
+            
+#line 397
+            npyv_long abc2 = npyv_muladd_long(a2, b2, c2);
+            
+#line 397
+            npyv_long abc3 = npyv_muladd_long(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_long(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_long(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_long(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_long(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_long a0 = npyv_load_long(data0 + vstep * 0);
+            npyv_long b0 = npyv_load_long(data1 + vstep * 0);
+            npyv_long c0 = npyv_load_long(data_out + vstep * 0);
+            
+#line 390
+            npyv_long a1 = npyv_load_long(data0 + vstep * 1);
+            npyv_long b1 = npyv_load_long(data1 + vstep * 1);
+            npyv_long c1 = npyv_load_long(data_out + vstep * 1);
+            
+#line 390
+            npyv_long a2 = npyv_load_long(data0 + vstep * 2);
+            npyv_long b2 = npyv_load_long(data1 + vstep * 2);
+            npyv_long c2 = npyv_load_long(data_out + vstep * 2);
+            
+#line 390
+            npyv_long a3 = npyv_load_long(data0 + vstep * 3);
+            npyv_long b3 = npyv_load_long(data1 + vstep * 3);
+            npyv_long c3 = npyv_load_long(data_out + vstep * 3);
+            
+            #line 397
+            npyv_long abc0 = npyv_muladd_long(a0, b0, c0);
+            
+#line 397
+            npyv_long abc1 = npyv_muladd_long(a1, b1, c1);
+            
+#line 397
+            npyv_long abc2 = npyv_muladd_long(a2, b2, c2);
+            
+#line 397
+            npyv_long abc3 = npyv_muladd_long(a3, b3, c3);
+            
+            #line 402
+            npyv_store_long(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_long(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_long(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_long(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_long a = npyv_load_tillz_long(data0, count);
+        npyv_long b = npyv_load_tillz_long(data1, count);
+        npyv_long c = npyv_load_tillz_long(data_out, count);
+        npyv_store_till_long(data_out, count, npyv_muladd_long(a, b, c));
+    }
+    npyv_cleanup();
+#else
+    /*
+     * Scalar path: four elements per iteration, grouped as all loads, then
+     * all multiply-adds, then all stores.
+     */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_long a0 = (data0[0]);
+        const npy_long b0 = (data1[0]);
+        const npy_long c0 = (data_out[0]);
+        
+#line 420
+        const npy_long a1 = (data0[1]);
+        const npy_long b1 = (data1[1]);
+        const npy_long c1 = (data_out[1]);
+        
+#line 420
+        const npy_long a2 = (data0[2]);
+        const npy_long b2 = (data1[2]);
+        const npy_long c2 = (data_out[2]);
+        
+#line 420
+        const npy_long a3 = (data0[3]);
+        const npy_long b3 = (data1[3]);
+        const npy_long c3 = (data_out[3]);
+        
+        #line 427
+        const npy_long abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_long abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_long abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_long abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /*
+     * Remainder: whatever the unrolled loop left over, or every element
+     * when the unrolled loop is compiled out.
+     */
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_long a = (*data0);
+        const npy_long b = (*data1);
+        const npy_long c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_long
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Operand 0 is a single value, operand 1 and the output are contiguous:
+ * reads the npy_long at dataptr[0] once and hands the run at dataptr[1],
+ * the output run at dataptr[2], that value and `count` to
+ * long_sum_of_products_muladd.  `strides` is unused; `nop` is not read.
+ */
+static void
+long_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_long factor = *(npy_long *)dataptr[0];
+    npy_long *const vec = (npy_long *)dataptr[1];
+    npy_long *const out = (npy_long *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+
+    long_sum_of_products_muladd(vec, out, factor, count);
+}
+
+/*
+ * Operand 1 is a single value, operand 0 and the output are contiguous:
+ * reads the npy_long at dataptr[1] once and hands the run at dataptr[0],
+ * the output run at dataptr[2], that value and `count` to
+ * long_sum_of_products_muladd.  `strides` is unused; `nop` is not read.
+ */
+static void
+long_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_long factor = *(npy_long *)dataptr[1];
+    npy_long *const vec = (npy_long *)dataptr[0];
+    npy_long *const out = (npy_long *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+
+    long_sum_of_products_muladd(vec, out, factor, count);
+}
+
+/*
+ * Two-operand sum of products reduced into a single output element:
+ *     *dataptr[2] += sum over i in [0, count) of data0[i] * data1[i]
+ * where data0/data1 are dataptr[0]/dataptr[1] viewed as npy_long arrays.
+ *
+ * The `#if 0` NPYV branch is compiled out in this expansion; the scalar
+ * branch accumulates four products per iteration (unless
+ * NPY_DISABLE_OPTIMIZATION is defined) and then the remaining elements.
+ * `strides` is unused and `nop` is not read.
+ */
+static NPY_GCC_OPT_3 void
+long_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long *data1 = (npy_long *)dataptr[1];
+    npy_long accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_long
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_long;
+    npyv_long v_accum = npyv_zero_long();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_long a0 = npyv_loada_long(data0 + vstep * 0);
+            npyv_long b0 = npyv_loada_long(data1 + vstep * 0);
+            
+#line 501
+            npyv_long a1 = npyv_loada_long(data0 + vstep * 1);
+            npyv_long b1 = npyv_loada_long(data1 + vstep * 1);
+            
+#line 501
+            npyv_long a2 = npyv_loada_long(data0 + vstep * 2);
+            npyv_long b2 = npyv_loada_long(data1 + vstep * 2);
+            
+#line 501
+            npyv_long a3 = npyv_loada_long(data0 + vstep * 3);
+            npyv_long b3 = npyv_loada_long(data1 + vstep * 3);
+            
+            npyv_long ab3 = npyv_muladd_long(a3, b3, v_accum);
+            npyv_long ab2 = npyv_muladd_long(a2, b2, ab3);
+            npyv_long ab1 = npyv_muladd_long(a1, b1, ab2);
+                   v_accum = npyv_muladd_long(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_long a0 = npyv_load_long(data0 + vstep * 0);
+            npyv_long b0 = npyv_load_long(data1 + vstep * 0);
+            
+#line 501
+            npyv_long a1 = npyv_load_long(data0 + vstep * 1);
+            npyv_long b1 = npyv_load_long(data1 + vstep * 1);
+            
+#line 501
+            npyv_long a2 = npyv_load_long(data0 + vstep * 2);
+            npyv_long b2 = npyv_load_long(data1 + vstep * 2);
+            
+#line 501
+            npyv_long a3 = npyv_load_long(data0 + vstep * 3);
+            npyv_long b3 = npyv_load_long(data1 + vstep * 3);
+            
+            npyv_long ab3 = npyv_muladd_long(a3, b3, v_accum);
+            npyv_long ab2 = npyv_muladd_long(a2, b2, ab3);
+            npyv_long ab1 = npyv_muladd_long(a1, b1, ab2);
+                   v_accum = npyv_muladd_long(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_long a = npyv_load_tillz_long(data0, count);
+        npyv_long b = npyv_load_tillz_long(data1, count);
+        v_accum = npyv_muladd_long(a, b, v_accum);
+    }
+    accum = npyv_sum_long(v_accum);
+    npyv_cleanup();
+#else
+    /* Scalar path: four independent products, folded into accum once per iteration. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_long ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_long ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_long ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_long ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /*
+     * Remainder: whatever the unrolled loop left over, or every element
+     * when the unrolled loop is compiled out.
+     */
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_long a = (*data0);
+        const npy_long b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_long
+    /* The output is a single element: add the local total to its current value. */
+    *(npy_long *)dataptr[2] = ((*(npy_long *)dataptr[2]) + accum);
+}
+
+/*
+ * Operand 0 and the output are single values, operand 1 is contiguous:
+ * multiplies the npy_long at dataptr[0] by the result of
+ * long_sum_of_arr(dataptr[1], count) and adds the product to the npy_long
+ * at dataptr[2].  `strides` is unused; `nop` is not read.
+ */
+static NPY_GCC_OPT_3 void
+long_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_long factor = *(npy_long *)dataptr[0];
+    const npy_long total = long_sum_of_arr((npy_long *)dataptr[1], count);
+    npy_long *const out = (npy_long *)dataptr[2];
+
+    *out = *out + factor * total;
+}
+
+/*
+ * Operand 1 and the output are single values, operand 0 is contiguous:
+ * multiplies the npy_long at dataptr[1] by the result of
+ * long_sum_of_arr(dataptr[0], count) and adds the product to the npy_long
+ * at dataptr[2].  `strides` is unused; `nop` is not read.
+ */
+static NPY_GCC_OPT_3 void
+long_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_long factor = *(npy_long *)dataptr[1];
+    const npy_long total = long_sum_of_arr((npy_long *)dataptr[0], count);
+    npy_long *const out = (npy_long *)dataptr[2];
+
+    *out = *out + factor * total;
+}
+
+#elif 2 == 3 && !0
+
+/*
+ * Three-operand sum of products over element-contiguous npy_long arrays:
+ *     out[i] = lhs[i] * mid[i] * rhs[i] + out[i]
+ * with lhs/mid/rhs = dataptr[0..2] and out = dataptr[3].
+ *
+ * Elements are handled in groups of eight, then a tail of up to eight more
+ * is processed one at a time, returning as soon as the remaining count
+ * reaches zero.  `strides` is unused; `nop` is not read.
+ */
+static void
+long_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *lhs = (npy_long *)dataptr[0];
+    npy_long *mid = (npy_long *)dataptr[1];
+    npy_long *rhs = (npy_long *)dataptr[2];
+    npy_long *out = (npy_long *)dataptr[3];
+    int k;
+
+    /* Full groups of eight; the fixed trip count keeps the inner loop unrollable. */
+    for (; count >= 8; count -= 8, lhs += 8, mid += 8, rhs += 8, out += 8) {
+#line 576
+        for (k = 0; k < 8; ++k) {
+            out[k] = lhs[k] * mid[k] * rhs[k] + out[k];
+        }
+    }
+
+    /* Finish off the loop: stop at the first slot for which nothing remains. */
+#line 592
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] = lhs[k] * mid[k] * rhs[k] + out[k];
+    }
+
+}
+
+#else /* 2 > 3 || @complex */
+
+/*
+ * Generic contiguous fallback, placed in the `#else` arm of the
+ * operand-count conditional.
+ *
+ * With `#if !0` selecting the first arm, each iteration multiplies the
+ * npy_long values at dataptr[0] .. dataptr[nop-1], adds the product to the
+ * npy_long at dataptr[nop], and then advances all nop+1 pointers in
+ * `dataptr` by sizeof(npy_long).  Note that this updates the caller's
+ * `dataptr` array in place.  The `#else /* complex */` arm is not compiled.
+ *
+ * NOTE(review): `while (count--)` presumes count >= 0 - confirm with callers.
+ */
+static void
+long_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+        npy_long temp = (*(npy_long *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_long *)dataptr[i]);
+        }
+        /* i == nop after the loop, so dataptr[i] is the current output value. */
+        *(npy_long *)dataptr[nop] = (temp +
+                                           (*(npy_long *)dataptr[i]));
+        /* Step every operand and the output (index nop) to the next element. */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_long);
+        }
+#else /* complex */
+#  if 2 <= 3
+#    define _SUMPROD_NOP 2
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_long re, im, tmp;
+        int i;
+        re = ((npy_long *)dataptr[0])[0];
+        im = ((npy_long *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_long *)dataptr[i])[0] -
+                  im * ((npy_long *)dataptr[i])[1];
+            im = re * ((npy_long *)dataptr[i])[1] +
+                 im * ((npy_long *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_long *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_long *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_long *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_long *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_long);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+
+/*
+ * One-operand reduction into a single output element.
+ *
+ * With `#if !0` selecting the first arm, the body passes the contiguous
+ * npy_long run at dataptr[0] and `count` to long_sum_of_arr and adds the
+ * returned value to the npy_long at dataptr[1].  The `#else` arm keeps
+ * separate accumulators for even-indexed and odd-indexed entries (named
+ * re/im) and is not compiled here.  `nop` and `strides` are not read.
+ */
+static NPY_GCC_OPT_3 void
+long_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_long *data = (npy_long *)dataptr[0];
+    npy_long accum = long_sum_of_arr(data, count);
+    *((npy_long *)dataptr[1]) = (accum + (*((npy_long *)dataptr[1])));
+#else
+    npy_long accum_re = 0, accum_im = 0;
+    npy_long *data0 = (npy_long *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Four (re, im) pairs, i.e. eight scalars, per iteration.  The test is
+     * `count > 4`, so a final group of exactly four is left to the loop below.
+     */
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_long re01 = data0[0] + data0[2];
+        const npy_long re23 = data0[4] + data0[6];
+        const npy_long im13 = data0[1] + data0[3];
+        const npy_long im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One (re, im) pair per iteration. */
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_long *)dataptr[1])[0] += accum_re;
+    ((npy_long *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 2 == 1 */
+
+/*
+ * Strided two-operand sum of products reduced into a single output element.
+ *
+ * In this expansion the live preprocessor arms are `#if !0` and
+ * `#  elif 2 == 2`, so the loop computes
+ *     accum += *(npy_long *)data0 * *(npy_long *)data1
+ * `count` times, stepping data0/data1 by strides[0]/strides[1] bytes, and
+ * finally adds accum to the npy_long at dataptr[2].  The remaining arms
+ * (one operand, three operands, N operands, and the re/im variants) are
+ * not compiled here.  `nop` is only referenced inside those dead arms.
+ *
+ * NOTE(review): `while (count--)` presumes count >= 0 - confirm with callers.
+ */
+static void
+long_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_long accum_re = 0, accum_im = 0;
+#else
+    npy_long accum = 0;
+#endif
+
+    /* Local byte cursors and byte strides for the operands used by the live arm. */
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 2 == 1
+        accum += (*(npy_long *)data0);
+        data0 += stride0;
+#  elif 2 == 2
+        /* Live arm: one product per iteration, then advance both cursors. */
+        accum += (*(npy_long *)data0) *
+                 (*(npy_long *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 2 == 3
+        accum += (*(npy_long *)data0) *
+                 (*(npy_long *)data1) *
+                 (*(npy_long *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_long temp = (*(npy_long *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_long *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        accum_re += ((npy_long *)data0)[0];
+        accum_im += ((npy_long *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_long re, im, tmp;
+        int i;
+        re = ((npy_long *)dataptr[0])[0];
+        im = ((npy_long *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_long *)dataptr[i])[0] -
+                  im * ((npy_long *)dataptr[i])[1];
+            im = re * ((npy_long *)dataptr[i])[1] +
+                 im * ((npy_long *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+    /* Fold the local accumulator into the single output element. */
+#if 0
+#  if 2 <= 3
+    ((npy_long *)dataptr[2])[0] += accum_re;
+    ((npy_long *)dataptr[2])[1] += accum_im;
+#  else
+    ((npy_long *)dataptr[nop])[0] += accum_re;
+    ((npy_long *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 2 <= 3
+    *((npy_long *)dataptr[2]) = (accum +
+                                    (*((npy_long *)dataptr[2])));
+#  else
+    *((npy_long *)dataptr[nop]) = (accum +
+                                    (*((npy_long *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void  /* Strided three-operand kernel: each iteration computes out = in0 * in1 * in2 + out. */
+long_sum_of_products_three(int nop, char **dataptr,  /* nop is read only by the generic fallback, which is not compiled here */
+                                npy_intp const *strides, npy_intp count)
+{
+#if (3 == 1) || (3 <= 3 && !0)  /* true in this expansion */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0  /* true in this expansion */
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0  /* true in this expansion */
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (3 == 1) || (3 <= 3 && !0)  /* true in this expansion */
+    char *data_out = dataptr[3];  /* the output operand sits after the three inputs */
+    npy_intp stride_out = strides[3];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_three (%d)\n", (int)count);
+
+    while (count--) {  /* one output element per iteration */
+#if !0  /* non-complex path: this is the branch that is compiled */
+#  if 3 == 1  /* not taken */
+        *(npy_long *)data_out = ((*(npy_long *)data0) +
+                                         (*(npy_long *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 3 == 2  /* not taken */
+        *(npy_long *)data_out = ((*(npy_long *)data0) *
+                                         (*(npy_long *)data1) +
+                                         (*(npy_long *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 3 == 3  /* taken: the three-operand form */
+        *(npy_long *)data_out = ((*(npy_long *)data0) *
+                                         (*(npy_long *)data1) *
+                                         (*(npy_long *)data2) +
+                                         (*(npy_long *)data_out));
+        data0 += stride0;  /* pointers are char *, so each stride is applied as a byte offset */
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else  /* generic nop-operand fallback; unreachable in this expansion */
+        npy_long temp = (*(npy_long *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_long *)dataptr[i]);
+        }
+        *(npy_long *)dataptr[nop] = (temp +
+                                           (*(npy_long *)dataptr[i]));  /* i == nop once the loop above has finished */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        ((npy_long *)data_out)[0] = ((npy_long *)data0)[0] +
+                                         ((npy_long *)data_out)[0];
+        ((npy_long *)data_out)[1] = ((npy_long *)data0)[1] +
+                                         ((npy_long *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_long re, im, tmp;
+        int i;
+        re = ((npy_long *)dataptr[0])[0];
+        im = ((npy_long *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_long *)dataptr[i])[0] -
+                  im * ((npy_long *)dataptr[i])[1];
+            im = re * ((npy_long *)dataptr[i])[1] +
+                 im * ((npy_long *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_long *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_long *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_long *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_long *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 3 == 1
+
+static void  /* Contiguous one-operand kernel: data_out[k] += data0[k]. Excluded from this expansion by the enclosing "#if 3 == 1". */
+long_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long *data_out = (npy_long *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {  /* counts 0..7 finish here via case fallthrough; there is no default, so other counts drop to the loop below */
+#line 246
+        case 6+1:  /* count == 7: handle element 6, then fall through to the lower cases */
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_long *)data_out + 2*6)[0] =
+                                    ((npy_long *)data0 + 2*6)[0] +
+                                    ((npy_long *)data_out + 2*6)[0];
+            ((npy_long *)data_out + 2*6)[1] =
+                                    ((npy_long *)data0 + 2*6)[1] +
+                                    ((npy_long *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_long *)data_out + 2*5)[0] =
+                                    ((npy_long *)data0 + 2*5)[0] +
+                                    ((npy_long *)data_out + 2*5)[0];
+            ((npy_long *)data_out + 2*5)[1] =
+                                    ((npy_long *)data0 + 2*5)[1] +
+                                    ((npy_long *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_long *)data_out + 2*4)[0] =
+                                    ((npy_long *)data0 + 2*4)[0] +
+                                    ((npy_long *)data_out + 2*4)[0];
+            ((npy_long *)data_out + 2*4)[1] =
+                                    ((npy_long *)data0 + 2*4)[1] +
+                                    ((npy_long *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_long *)data_out + 2*3)[0] =
+                                    ((npy_long *)data0 + 2*3)[0] +
+                                    ((npy_long *)data_out + 2*3)[0];
+            ((npy_long *)data_out + 2*3)[1] =
+                                    ((npy_long *)data0 + 2*3)[1] +
+                                    ((npy_long *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_long *)data_out + 2*2)[0] =
+                                    ((npy_long *)data0 + 2*2)[0] +
+                                    ((npy_long *)data_out + 2*2)[0];
+            ((npy_long *)data_out + 2*2)[1] =
+                                    ((npy_long *)data0 + 2*2)[1] +
+                                    ((npy_long *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_long *)data_out + 2*1)[0] =
+                                    ((npy_long *)data0 + 2*1)[0] +
+                                    ((npy_long *)data_out + 2*1)[0];
+            ((npy_long *)data_out + 2*1)[1] =
+                                    ((npy_long *)data0 + 2*1)[1] +
+                                    ((npy_long *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_long *)data_out + 2*0)[0] =
+                                    ((npy_long *)data0 + 2*0)[0] +
+                                    ((npy_long *)data_out + 2*0)[0];
+            ((npy_long *)data_out + 2*0)[1] =
+                                    ((npy_long *)data0 + 2*0)[1] +
+                                    ((npy_long *)data_out + 2*0)[1];
+#endif
+
+        case 0:  /* nothing left: this is the function's only exit */
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0  /* non-complex form; the #else variant below indexes pairs of values per element */
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_long *)data_out + 2*0)[0] =
+                                ((npy_long *)data0 + 2*0)[0] +
+                                ((npy_long *)data_out + 2*0)[0];
+        ((npy_long *)data_out + 2*0)[1] =
+                                ((npy_long *)data0 + 2*0)[1] +
+                                ((npy_long *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_long *)data_out + 2*1)[0] =
+                                ((npy_long *)data0 + 2*1)[0] +
+                                ((npy_long *)data_out + 2*1)[0];
+        ((npy_long *)data_out + 2*1)[1] =
+                                ((npy_long *)data0 + 2*1)[1] +
+                                ((npy_long *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_long *)data_out + 2*2)[0] =
+                                ((npy_long *)data0 + 2*2)[0] +
+                                ((npy_long *)data_out + 2*2)[0];
+        ((npy_long *)data_out + 2*2)[1] =
+                                ((npy_long *)data0 + 2*2)[1] +
+                                ((npy_long *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_long *)data_out + 2*3)[0] =
+                                ((npy_long *)data0 + 2*3)[0] +
+                                ((npy_long *)data_out + 2*3)[0];
+        ((npy_long *)data_out + 2*3)[1] =
+                                ((npy_long *)data0 + 2*3)[1] +
+                                ((npy_long *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_long *)data_out + 2*4)[0] =
+                                ((npy_long *)data0 + 2*4)[0] +
+                                ((npy_long *)data_out + 2*4)[0];
+        ((npy_long *)data_out + 2*4)[1] =
+                                ((npy_long *)data0 + 2*4)[1] +
+                                ((npy_long *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_long *)data_out + 2*5)[0] =
+                                ((npy_long *)data0 + 2*5)[0] +
+                                ((npy_long *)data_out + 2*5)[0];
+        ((npy_long *)data_out + 2*5)[1] =
+                                ((npy_long *)data0 + 2*5)[1] +
+                                ((npy_long *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_long *)data_out + 2*6)[0] =
+                                ((npy_long *)data0 + 2*6)[0] +
+                                ((npy_long *)data_out + 2*6)[0];
+        ((npy_long *)data_out + 2*6)[1] =
+                                ((npy_long *)data0 + 2*6)[1] +
+                                ((npy_long *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_long *)data_out + 2*7)[0] =
+                                ((npy_long *)data0 + 2*7)[0] +
+                                ((npy_long *)data_out + 2*7)[0];
+        ((npy_long *)data_out + 2*7)[1] =
+                                ((npy_long *)data0 + 2*7)[1] +
+                                ((npy_long *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;  /* typed pointers: advance by eight elements */
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;  /* count is below 8 here, so the switch above handles the remainder and returns */
+}
+
+#elif 3 == 2 && !0
+
+// Multiply-add helper: data_out[k] = scalar * data[k] + data_out[k] for k in [0, count).
+static NPY_GCC_OPT_3 void
+long_sum_of_products_muladd(npy_long *data, npy_long *data_out, npy_long scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_long: the SIMD branch below is disabled in this expansion
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_long;
+    const npyv_long v_scalar = npyv_setall_long(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_long b0 = npyv_loada_long(data + vstep * 0);
+            npyv_long c0 = npyv_loada_long(data_out + vstep * 0);
+            
+#line 312
+            npyv_long b1 = npyv_loada_long(data + vstep * 1);
+            npyv_long c1 = npyv_loada_long(data_out + vstep * 1);
+            
+#line 312
+            npyv_long b2 = npyv_loada_long(data + vstep * 2);
+            npyv_long c2 = npyv_loada_long(data_out + vstep * 2);
+            
+#line 312
+            npyv_long b3 = npyv_loada_long(data + vstep * 3);
+            npyv_long c3 = npyv_loada_long(data_out + vstep * 3);
+            
+            #line 318
+            npyv_long abc0 = npyv_muladd_long(v_scalar, b0, c0);
+            
+#line 318
+            npyv_long abc1 = npyv_muladd_long(v_scalar, b1, c1);
+            
+#line 318
+            npyv_long abc2 = npyv_muladd_long(v_scalar, b2, c2);
+            
+#line 318
+            npyv_long abc3 = npyv_muladd_long(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_long(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_long(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_long(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_long(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_long b0 = npyv_load_long(data + vstep * 0);
+            npyv_long c0 = npyv_load_long(data_out + vstep * 0);
+            
+#line 312
+            npyv_long b1 = npyv_load_long(data + vstep * 1);
+            npyv_long c1 = npyv_load_long(data_out + vstep * 1);
+            
+#line 312
+            npyv_long b2 = npyv_load_long(data + vstep * 2);
+            npyv_long c2 = npyv_load_long(data_out + vstep * 2);
+            
+#line 312
+            npyv_long b3 = npyv_load_long(data + vstep * 3);
+            npyv_long c3 = npyv_load_long(data_out + vstep * 3);
+            
+            #line 318
+            npyv_long abc0 = npyv_muladd_long(v_scalar, b0, c0);
+            
+#line 318
+            npyv_long abc1 = npyv_muladd_long(v_scalar, b1, c1);
+            
+#line 318
+            npyv_long abc2 = npyv_muladd_long(v_scalar, b2, c2);
+            
+#line 318
+            npyv_long abc3 = npyv_muladd_long(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_long(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_long(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_long(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_long(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_long a = npyv_load_tillz_long(data, count);
+        npyv_long b = npyv_load_tillz_long(data_out, count);
+        npyv_store_till_long(data_out, count, npyv_muladd_long(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else  /* scalar fallback: the branch selected by the "#if 0" above */
+#ifndef NPY_DISABLE_OPTIMIZATION  /* manual 4x unroll, skipped when NPY_DISABLE_OPTIMIZATION is defined */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_long b0 = (data[0]);
+        const npy_long c0 = (data_out[0]);
+        
+#line 340
+        const npy_long b1 = (data[1]);
+        const npy_long c1 = (data_out[1]);
+        
+#line 340
+        const npy_long b2 = (data[2]);
+        const npy_long c2 = (data_out[2]);
+        
+#line 340
+        const npy_long b3 = (data[3]);
+        const npy_long c3 = (data_out[3]);
+        
+        #line 346
+        const npy_long abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_long abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_long abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_long abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) {  /* remaining elements, one at a time */
+        const npy_long b = (*data);
+        const npy_long c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_long
+}
+
+static void  /* Contiguous two-operand kernel: data_out[k] = data0[k] * data1[k] + data_out[k]. */
+long_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long *data1 = (npy_long *)dataptr[1];
+    npy_long *data_out = (npy_long *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_long
+#if 0  /* SIMD branch disabled in this expansion; the scalar code after #else is used */
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_long;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_long a0 = npyv_loada_long(data0 + vstep * 0);
+            npyv_long b0 = npyv_loada_long(data1 + vstep * 0);
+            npyv_long c0 = npyv_loada_long(data_out + vstep * 0);
+            
+#line 390
+            npyv_long a1 = npyv_loada_long(data0 + vstep * 1);
+            npyv_long b1 = npyv_loada_long(data1 + vstep * 1);
+            npyv_long c1 = npyv_loada_long(data_out + vstep * 1);
+            
+#line 390
+            npyv_long a2 = npyv_loada_long(data0 + vstep * 2);
+            npyv_long b2 = npyv_loada_long(data1 + vstep * 2);
+            npyv_long c2 = npyv_loada_long(data_out + vstep * 2);
+            
+#line 390
+            npyv_long a3 = npyv_loada_long(data0 + vstep * 3);
+            npyv_long b3 = npyv_loada_long(data1 + vstep * 3);
+            npyv_long c3 = npyv_loada_long(data_out + vstep * 3);
+            
+            #line 397
+            npyv_long abc0 = npyv_muladd_long(a0, b0, c0);
+            
+#line 397
+            npyv_long abc1 = npyv_muladd_long(a1, b1, c1);
+            
+#line 397
+            npyv_long abc2 = npyv_muladd_long(a2, b2, c2);
+            
+#line 397
+            npyv_long abc3 = npyv_muladd_long(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_long(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_long(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_long(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_long(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_long a0 = npyv_load_long(data0 + vstep * 0);
+            npyv_long b0 = npyv_load_long(data1 + vstep * 0);
+            npyv_long c0 = npyv_load_long(data_out + vstep * 0);
+            
+#line 390
+            npyv_long a1 = npyv_load_long(data0 + vstep * 1);
+            npyv_long b1 = npyv_load_long(data1 + vstep * 1);
+            npyv_long c1 = npyv_load_long(data_out + vstep * 1);
+            
+#line 390
+            npyv_long a2 = npyv_load_long(data0 + vstep * 2);
+            npyv_long b2 = npyv_load_long(data1 + vstep * 2);
+            npyv_long c2 = npyv_load_long(data_out + vstep * 2);
+            
+#line 390
+            npyv_long a3 = npyv_load_long(data0 + vstep * 3);
+            npyv_long b3 = npyv_load_long(data1 + vstep * 3);
+            npyv_long c3 = npyv_load_long(data_out + vstep * 3);
+            
+            #line 397
+            npyv_long abc0 = npyv_muladd_long(a0, b0, c0);
+            
+#line 397
+            npyv_long abc1 = npyv_muladd_long(a1, b1, c1);
+            
+#line 397
+            npyv_long abc2 = npyv_muladd_long(a2, b2, c2);
+            
+#line 397
+            npyv_long abc3 = npyv_muladd_long(a3, b3, c3);
+            
+            #line 402
+            npyv_store_long(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_long(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_long(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_long(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_long a = npyv_load_tillz_long(data0, count);
+        npyv_long b = npyv_load_tillz_long(data1, count);
+        npyv_long c = npyv_load_tillz_long(data_out, count);
+        npyv_store_till_long(data_out, count, npyv_muladd_long(a, b, c));
+    }
+    npyv_cleanup();
+#else  /* scalar fallback */
+#ifndef NPY_DISABLE_OPTIMIZATION  /* manual 4x unroll, skipped when NPY_DISABLE_OPTIMIZATION is defined */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_long a0 = (data0[0]);
+        const npy_long b0 = (data1[0]);
+        const npy_long c0 = (data_out[0]);
+        
+#line 420
+        const npy_long a1 = (data0[1]);
+        const npy_long b1 = (data1[1]);
+        const npy_long c1 = (data_out[1]);
+        
+#line 420
+        const npy_long a2 = (data0[2]);
+        const npy_long b2 = (data1[2]);
+        const npy_long c2 = (data_out[2]);
+        
+#line 420
+        const npy_long a3 = (data0[3]);
+        const npy_long b3 = (data1[3]);
+        const npy_long c3 = (data_out[3]);
+        
+        #line 427
+        const npy_long abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_long abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_long abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_long abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {  /* remaining elements, one at a time */
+        const npy_long a = (*data0);
+        const npy_long b = (*data1);
+        const npy_long c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_long
+
+}
+
+/* Some extra specializations for the two operand case */
+static void  /* Two-operand case where operand 0 is read once as a scalar: data_out[k] += value0 * data1[k]. */
+long_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long value0 = (*(npy_long *)dataptr[0]);  /* single value, loaded once before the loop */
+    npy_long *data1 = (npy_long *)dataptr[1];
+    npy_long *data_out = (npy_long *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    long_sum_of_products_muladd(data1, data_out, value0, count);  /* all element work is delegated to the muladd helper */
+    
+}
+
+static void  /* Two-operand case where operand 1 is read once as a scalar: data_out[k] += data0[k] * value1. */
+long_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long value1 = (*(npy_long *)dataptr[1]);  /* single value, loaded once before the loop */
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long *data_out = (npy_long *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    long_sum_of_products_muladd(data0, data_out, value1, count);  /* all element work is delegated to the muladd helper */
+}
+
+static NPY_GCC_OPT_3 void  /* Dot-product style reduction: adds the sum of data0[k] * data1[k] to the single output value. */
+long_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long *data1 = (npy_long *)dataptr[1];
+    npy_long accum = 0;  /* running total, written to the output once at the end */
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_long: the SIMD branch below is disabled in this expansion
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_long;
+    npyv_long v_accum = npyv_zero_long();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_long a0 = npyv_loada_long(data0 + vstep * 0);
+            npyv_long b0 = npyv_loada_long(data1 + vstep * 0);
+            
+#line 501
+            npyv_long a1 = npyv_loada_long(data0 + vstep * 1);
+            npyv_long b1 = npyv_loada_long(data1 + vstep * 1);
+            
+#line 501
+            npyv_long a2 = npyv_loada_long(data0 + vstep * 2);
+            npyv_long b2 = npyv_loada_long(data1 + vstep * 2);
+            
+#line 501
+            npyv_long a3 = npyv_loada_long(data0 + vstep * 3);
+            npyv_long b3 = npyv_loada_long(data1 + vstep * 3);
+            
+            npyv_long ab3 = npyv_muladd_long(a3, b3, v_accum);
+            npyv_long ab2 = npyv_muladd_long(a2, b2, ab3);
+            npyv_long ab1 = npyv_muladd_long(a1, b1, ab2);
+                   v_accum = npyv_muladd_long(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_long a0 = npyv_load_long(data0 + vstep * 0);
+            npyv_long b0 = npyv_load_long(data1 + vstep * 0);
+            
+#line 501
+            npyv_long a1 = npyv_load_long(data0 + vstep * 1);
+            npyv_long b1 = npyv_load_long(data1 + vstep * 1);
+            
+#line 501
+            npyv_long a2 = npyv_load_long(data0 + vstep * 2);
+            npyv_long b2 = npyv_load_long(data1 + vstep * 2);
+            
+#line 501
+            npyv_long a3 = npyv_load_long(data0 + vstep * 3);
+            npyv_long b3 = npyv_load_long(data1 + vstep * 3);
+            
+            npyv_long ab3 = npyv_muladd_long(a3, b3, v_accum);
+            npyv_long ab2 = npyv_muladd_long(a2, b2, ab3);
+            npyv_long ab1 = npyv_muladd_long(a1, b1, ab2);
+                   v_accum = npyv_muladd_long(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_long a = npyv_load_tillz_long(data0, count);
+        npyv_long b = npyv_load_tillz_long(data1, count);
+        v_accum = npyv_muladd_long(a, b, v_accum);
+    }
+    accum = npyv_sum_long(v_accum);
+    npyv_cleanup();
+#else  /* scalar fallback */
+#ifndef NPY_DISABLE_OPTIMIZATION  /* manual 4x unroll, skipped when NPY_DISABLE_OPTIMIZATION is defined */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_long ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_long ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_long ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_long ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) {  /* remaining elements, one at a time */
+        const npy_long a = (*data0);
+        const npy_long b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_long
+    *(npy_long *)dataptr[2] = ((*(npy_long *)dataptr[2]) + accum);  /* add to the existing output value rather than overwrite it */
+}
+
+static NPY_GCC_OPT_3 void  /* Scalar operand 0 times a reduction over operand 1, added to the single output value. */
+long_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data1 = (npy_long *)dataptr[1];
+    npy_long value0 = (*(npy_long *)dataptr[0]);
+    npy_long accum = long_sum_of_arr(data1, count);  /* helper defined outside this chunk; presumably sums count elements of data1 - confirm */
+    *(npy_long *)dataptr[2] = ((*(npy_long *)dataptr[2]) + value0 * accum);  /* scalar is applied once, after the reduction */
+}
+
+static NPY_GCC_OPT_3 void  /* Reduction over operand 0 times scalar operand 1, added to the single output value. */
+long_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long value1 = (*(npy_long *)dataptr[1]);
+    npy_long accum = long_sum_of_arr(data0, count);  /* helper defined outside this chunk; presumably sums count elements of data0 - confirm */
+    *(npy_long *)dataptr[2] = ((*(npy_long *)dataptr[2]) + value1 * accum);  /* scalar is applied once, after the reduction */
+}
+
+#elif 3 == 3 && !0
+
+static void  /* Contiguous three-operand kernel: data_out[k] = data0[k] * data1[k] * data2[k] + data_out[k]. */
+long_sum_of_products_contig_three(int nop, char **dataptr,  /* nop is not read in this body */
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long *data1 = (npy_long *)dataptr[1];
+    npy_long *data2 = (npy_long *)dataptr[2];
+    npy_long *data_out = (npy_long *)dataptr[3];  /* the output operand sits after the three inputs */
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;  /* typed pointers: advance by eight elements */
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+
+#line 592
+    if (count-- == 0) {  /* each check below returns as soon as the remainder is used up */
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {  /* for a non-negative count at most 7 elements remain, so this check always returns */
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 3 > 3 || @complex */
+/* Generic contiguous kernel: out += product of the nop inputs; every pointer then advances by one npy_long. */
+static void
+long_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_three (%d)\n",
+                                                    (int)count);
+    /* dataptr[0..nop-1] are the inputs and dataptr[nop] is the output; dataptr[] is advanced in place. */
+    while (count--) {
+#if !0 /* real-valued path; the complex #else branch below is compiled out */
+        npy_long temp = (*(npy_long *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_long *)dataptr[i]);
+        }
+        *(npy_long *)dataptr[nop] = (temp +
+                                           (*(npy_long *)dataptr[nop])); /* name the output slot instead of relying on i == nop after the loop */
+        for (i = 0; i <= nop; ++i) { /* <= nop: the output pointer advances too */
+            dataptr[i] += sizeof(npy_long);
+        }
+#else /* complex */
+#  if 3 <= 3
+#    define _SUMPROD_NOP 3
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_long re, im, tmp;
+        int i;
+        re = ((npy_long *)dataptr[0])[0];
+        im = ((npy_long *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_long *)dataptr[i])[0] -
+                  im * ((npy_long *)dataptr[i])[1];
+            im = re * ((npy_long *)dataptr[i])[1] +
+                 im * ((npy_long *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_long *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_long *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_long *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_long *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_long);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 3 */
+
+#if 3 == 1
+/* Adds long_sum_of_arr(dataptr[0], count) to the single npy_long output element at dataptr[1]. */
+static NPY_GCC_OPT_3 void
+long_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0 /* real-valued path; the #else branch keeps separate re/im accumulators and is compiled out */
+    npy_long *data = (npy_long *)dataptr[0];
+    npy_long accum = long_sum_of_arr(data, count); /* presumably the sum of data[0..count-1]; helper is not shown here - confirm */
+    *((npy_long *)dataptr[1]) = (accum + (*((npy_long *)dataptr[1])));
+#else
+    npy_long accum_re = 0, accum_im = 0;
+    npy_long *data0 = (npy_long *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) { /* four (re, im) pairs, i.e. eight npy_long values, per pass */
+        const npy_long re01 = data0[0] + data0[2];
+        const npy_long re23 = data0[4] + data0[6];
+        const npy_long im13 = data0[1] + data0[3];
+        const npy_long im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) { /* remaining pairs, one at a time */
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_long *)dataptr[1])[0] += accum_re;
+    ((npy_long *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 3 == 1 */
+/* Three-operand strided reduction: sums data0*data1*data2 over count steps and adds the total to the output at dataptr[3]. */
+static void
+long_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_long accum_re = 0, accum_im = 0;
+#else
+    npy_long accum = 0;
+#endif
+    /* Operand pointers and their byte strides are copied into locals; on the compiled path dataptr[] itself is never advanced. */
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+    /* Each step adds one product to the local accumulator; the output element is only touched once, after the loop. */
+    while (count--) {
+#if !0
+#  if 3 == 1
+        accum += (*(npy_long *)data0);
+        data0 += stride0;
+#  elif 3 == 2
+        accum += (*(npy_long *)data0) *
+                 (*(npy_long *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 3 == 3 /* the only branch of this ladder whose condition is true here */
+        accum += (*(npy_long *)data0) *
+                 (*(npy_long *)data1) *
+                 (*(npy_long *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_long temp = (*(npy_long *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_long *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        accum_re += ((npy_long *)data0)[0];
+        accum_im += ((npy_long *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_long re, im, tmp;
+        int i;
+        re = ((npy_long *)dataptr[0])[0];
+        im = ((npy_long *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_long *)dataptr[i])[0] -
+                  im * ((npy_long *)dataptr[i])[1];
+            im = re * ((npy_long *)dataptr[i])[1] +
+                 im * ((npy_long *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* The output operand sits right after the inputs: index 3 here, index nop in the generic branch. */
+#if 0
+#  if 3 <= 3
+    ((npy_long *)dataptr[3])[0] += accum_re;
+    ((npy_long *)dataptr[3])[1] += accum_im;
+#  else
+    ((npy_long *)dataptr[nop])[0] += accum_re;
+    ((npy_long *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 3 <= 3
+    *((npy_long *)dataptr[3]) = (accum +
+                                    (*((npy_long *)dataptr[3])));
+#  else
+    *((npy_long *)dataptr[nop]) = (accum +
+                                    (*((npy_long *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+/* General strided kernel for any operand count: out += product of all nop inputs, one element per step. */
+#line 131
+static void
+long_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data_out = dataptr[1000];
+    npy_intp stride_out = strides[1000];
+#endif
+    /* Every `1000 == k` / `1000 <= 3` test is false, so only the generic nop-driven code below is compiled. */
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_any (%d)\n", (int)count);
+    /* Per step: multiply dataptr[0..nop-1], add into dataptr[nop], then move every pointer by its byte stride. */
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        *(npy_long *)data_out = ((*(npy_long *)data0) +
+                                         (*(npy_long *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1000 == 2
+        *(npy_long *)data_out = ((*(npy_long *)data0) *
+                                         (*(npy_long *)data1) +
+                                         (*(npy_long *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1000 == 3
+        *(npy_long *)data_out = ((*(npy_long *)data0) *
+                                         (*(npy_long *)data1) *
+                                         (*(npy_long *)data2) +
+                                         (*(npy_long *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_long temp = (*(npy_long *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_long *)dataptr[i]);
+        }
+        *(npy_long *)dataptr[nop] = (temp +
+                                           (*(npy_long *)dataptr[nop])); /* name the output slot instead of relying on i == nop after the loop */
+        for (i = 0; i <= nop; ++i) { /* <= nop: the output pointer advances too */
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        ((npy_long *)data_out)[0] = ((npy_long *)data0)[0] +
+                                         ((npy_long *)data_out)[0];
+        ((npy_long *)data_out)[1] = ((npy_long *)data0)[1] +
+                                         ((npy_long *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_long re, im, tmp;
+        int i;
+        re = ((npy_long *)dataptr[0])[0];
+        im = ((npy_long *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_long *)dataptr[i])[0] -
+                  im * ((npy_long *)dataptr[i])[1];
+            im = re * ((npy_long *)dataptr[i])[1] +
+                 im * ((npy_long *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_long *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_long *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_long *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_long *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1000 == 1
+/* One-operand contiguous kernel: data_out[k] = data0[k] + data_out[k] for k in [0, count). */
+static void
+long_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long *data_out = (npy_long *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_long *)data_out + 2*6)[0] =
+                                    ((npy_long *)data0 + 2*6)[0] +
+                                    ((npy_long *)data_out + 2*6)[0];
+            ((npy_long *)data_out + 2*6)[1] =
+                                    ((npy_long *)data0 + 2*6)[1] +
+                                    ((npy_long *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_long *)data_out + 2*5)[0] =
+                                    ((npy_long *)data0 + 2*5)[0] +
+                                    ((npy_long *)data_out + 2*5)[0];
+            ((npy_long *)data_out + 2*5)[1] =
+                                    ((npy_long *)data0 + 2*5)[1] +
+                                    ((npy_long *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_long *)data_out + 2*4)[0] =
+                                    ((npy_long *)data0 + 2*4)[0] +
+                                    ((npy_long *)data_out + 2*4)[0];
+            ((npy_long *)data_out + 2*4)[1] =
+                                    ((npy_long *)data0 + 2*4)[1] +
+                                    ((npy_long *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_long *)data_out + 2*3)[0] =
+                                    ((npy_long *)data0 + 2*3)[0] +
+                                    ((npy_long *)data_out + 2*3)[0];
+            ((npy_long *)data_out + 2*3)[1] =
+                                    ((npy_long *)data0 + 2*3)[1] +
+                                    ((npy_long *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_long *)data_out + 2*2)[0] =
+                                    ((npy_long *)data0 + 2*2)[0] +
+                                    ((npy_long *)data_out + 2*2)[0];
+            ((npy_long *)data_out + 2*2)[1] =
+                                    ((npy_long *)data0 + 2*2)[1] +
+                                    ((npy_long *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_long *)data_out + 2*1)[0] =
+                                    ((npy_long *)data0 + 2*1)[0] +
+                                    ((npy_long *)data_out + 2*1)[0];
+            ((npy_long *)data_out + 2*1)[1] =
+                                    ((npy_long *)data0 + 2*1)[1] +
+                                    ((npy_long *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_long *)data_out + 2*0)[0] =
+                                    ((npy_long *)data0 + 2*0)[0] +
+                                    ((npy_long *)data_out + 2*0)[0];
+            ((npy_long *)data_out + 2*0)[1] =
+                                    ((npy_long *)data0 + 2*0)[1] +
+                                    ((npy_long *)data_out + 2*0)[1];
+#endif
+        /* No case above breaks: entering at `case k+1` falls through and handles elements k down to 0. */
+        case 0:
+            return;
+    }
+    /* Reached only when count matched no case label, i.e. count is 8 or more (or negative). */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_long *)data_out + 2*0)[0] =
+                                ((npy_long *)data0 + 2*0)[0] +
+                                ((npy_long *)data_out + 2*0)[0];
+        ((npy_long *)data_out + 2*0)[1] =
+                                ((npy_long *)data0 + 2*0)[1] +
+                                ((npy_long *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_long *)data_out + 2*1)[0] =
+                                ((npy_long *)data0 + 2*1)[0] +
+                                ((npy_long *)data_out + 2*1)[0];
+        ((npy_long *)data_out + 2*1)[1] =
+                                ((npy_long *)data0 + 2*1)[1] +
+                                ((npy_long *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_long *)data_out + 2*2)[0] =
+                                ((npy_long *)data0 + 2*2)[0] +
+                                ((npy_long *)data_out + 2*2)[0];
+        ((npy_long *)data_out + 2*2)[1] =
+                                ((npy_long *)data0 + 2*2)[1] +
+                                ((npy_long *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_long *)data_out + 2*3)[0] =
+                                ((npy_long *)data0 + 2*3)[0] +
+                                ((npy_long *)data_out + 2*3)[0];
+        ((npy_long *)data_out + 2*3)[1] =
+                                ((npy_long *)data0 + 2*3)[1] +
+                                ((npy_long *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_long *)data_out + 2*4)[0] =
+                                ((npy_long *)data0 + 2*4)[0] +
+                                ((npy_long *)data_out + 2*4)[0];
+        ((npy_long *)data_out + 2*4)[1] =
+                                ((npy_long *)data0 + 2*4)[1] +
+                                ((npy_long *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_long *)data_out + 2*5)[0] =
+                                ((npy_long *)data0 + 2*5)[0] +
+                                ((npy_long *)data_out + 2*5)[0];
+        ((npy_long *)data_out + 2*5)[1] =
+                                ((npy_long *)data0 + 2*5)[1] +
+                                ((npy_long *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_long *)data_out + 2*6)[0] =
+                                ((npy_long *)data0 + 2*6)[0] +
+                                ((npy_long *)data_out + 2*6)[0];
+        ((npy_long *)data_out + 2*6)[1] =
+                                ((npy_long *)data0 + 2*6)[1] +
+                                ((npy_long *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_long *)data_out + 2*7)[0] =
+                                ((npy_long *)data0 + 2*7)[0] +
+                                ((npy_long *)data_out + 2*7)[0];
+        ((npy_long *)data_out + 2*7)[1] =
+                                ((npy_long *)data0 + 2*7)[1] +
+                                ((npy_long *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+    /* NOTE(review): a negative count matches no case and skips the loop, so the goto below would spin forever - confirm callers never pass one. */
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1000 == 2 && !0
+
+// Multiply-add over count elements: data_out[i] = scalar * data[i] + data_out[i]
+static NPY_GCC_OPT_3 void
+long_sum_of_products_muladd(npy_long *data, npy_long *data_out, npy_long scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_long
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_long;
+    const npyv_long v_scalar = npyv_setall_long(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_long b0 = npyv_loada_long(data + vstep * 0);
+            npyv_long c0 = npyv_loada_long(data_out + vstep * 0);
+            
+#line 312
+            npyv_long b1 = npyv_loada_long(data + vstep * 1);
+            npyv_long c1 = npyv_loada_long(data_out + vstep * 1);
+            
+#line 312
+            npyv_long b2 = npyv_loada_long(data + vstep * 2);
+            npyv_long c2 = npyv_loada_long(data_out + vstep * 2);
+            
+#line 312
+            npyv_long b3 = npyv_loada_long(data + vstep * 3);
+            npyv_long c3 = npyv_loada_long(data_out + vstep * 3);
+            
+            #line 318
+            npyv_long abc0 = npyv_muladd_long(v_scalar, b0, c0);
+            
+#line 318
+            npyv_long abc1 = npyv_muladd_long(v_scalar, b1, c1);
+            
+#line 318
+            npyv_long abc2 = npyv_muladd_long(v_scalar, b2, c2);
+            
+#line 318
+            npyv_long abc3 = npyv_muladd_long(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_long(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_long(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_long(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_long(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_long b0 = npyv_load_long(data + vstep * 0);
+            npyv_long c0 = npyv_load_long(data_out + vstep * 0);
+            
+#line 312
+            npyv_long b1 = npyv_load_long(data + vstep * 1);
+            npyv_long c1 = npyv_load_long(data_out + vstep * 1);
+            
+#line 312
+            npyv_long b2 = npyv_load_long(data + vstep * 2);
+            npyv_long c2 = npyv_load_long(data_out + vstep * 2);
+            
+#line 312
+            npyv_long b3 = npyv_load_long(data + vstep * 3);
+            npyv_long c3 = npyv_load_long(data_out + vstep * 3);
+            
+            #line 318
+            npyv_long abc0 = npyv_muladd_long(v_scalar, b0, c0);
+            
+#line 318
+            npyv_long abc1 = npyv_muladd_long(v_scalar, b1, c1);
+            
+#line 318
+            npyv_long abc2 = npyv_muladd_long(v_scalar, b2, c2);
+            
+#line 318
+            npyv_long abc3 = npyv_muladd_long(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_long(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_long(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_long(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_long(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Tail: fewer than 4*vstep elements remain; handled with the partial load_tillz/store_till calls. */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_long a = npyv_load_tillz_long(data, count);
+        npyv_long b = npyv_load_tillz_long(data_out, count);
+        npyv_store_till_long(data_out, count, npyv_muladd_long(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* scalar code; this is the branch compiled here because the NPYV guard above is the literal 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { /* manual 4x unroll: load, multiply-add, store */
+        #line 340
+        const npy_long b0 = (data[0]);
+        const npy_long c0 = (data_out[0]);
+        
+#line 340
+        const npy_long b1 = (data[1]);
+        const npy_long c1 = (data_out[1]);
+        
+#line 340
+        const npy_long b2 = (data[2]);
+        const npy_long c2 = (data_out[2]);
+        
+#line 340
+        const npy_long b3 = (data[3]);
+        const npy_long c3 = (data_out[3]);
+        
+        #line 346
+        const npy_long abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_long abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_long abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_long abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* leftover elements (all of them when the unrolled loop is disabled) */
+        const npy_long b = (*data);
+        const npy_long c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_long
+}
+/* Two-operand contiguous kernel: data_out[i] = data0[i] * data1[i] + data_out[i] for i in [0, count). */
+static void
+long_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data0 = (npy_long *)dataptr[0];
+    npy_long *data1 = (npy_long *)dataptr[1];
+    npy_long *data_out = (npy_long *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_long
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_long;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_long a0 = npyv_loada_long(data0 + vstep * 0);
+            npyv_long b0 = npyv_loada_long(data1 + vstep * 0);
+            npyv_long c0 = npyv_loada_long(data_out + vstep * 0);
+            
+#line 390
+            npyv_long a1 = npyv_loada_long(data0 + vstep * 1);
+            npyv_long b1 = npyv_loada_long(data1 + vstep * 1);
+            npyv_long c1 = npyv_loada_long(data_out + vstep * 1);
+            
+#line 390
+            npyv_long a2 = npyv_loada_long(data0 + vstep * 2);
+            npyv_long b2 = npyv_loada_long(data1 + vstep * 2);
+            npyv_long c2 = npyv_loada_long(data_out + vstep * 2);
+            
+#line 390
+            npyv_long a3 = npyv_loada_long(data0 + vstep * 3);
+            npyv_long b3 = npyv_loada_long(data1 + vstep * 3);
+            npyv_long c3 = npyv_loada_long(data_out + vstep * 3);
+            
+            #line 397
+            npyv_long abc0 = npyv_muladd_long(a0, b0, c0);
+            
+#line 397
+            npyv_long abc1 = npyv_muladd_long(a1, b1, c1);
+            
+#line 397
+            npyv_long abc2 = npyv_muladd_long(a2, b2, c2);
+            
+#line 397
+            npyv_long abc3 = npyv_muladd_long(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_long(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_long(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_long(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_long(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_long a0 = npyv_load_long(data0 + vstep * 0);
+            npyv_long b0 = npyv_load_long(data1 + vstep * 0);
+            npyv_long c0 = npyv_load_long(data_out + vstep * 0);
+            
+#line 390
+            npyv_long a1 = npyv_load_long(data0 + vstep * 1);
+            npyv_long b1 = npyv_load_long(data1 + vstep * 1);
+            npyv_long c1 = npyv_load_long(data_out + vstep * 1);
+            
+#line 390
+            npyv_long a2 = npyv_load_long(data0 + vstep * 2);
+            npyv_long b2 = npyv_load_long(data1 + vstep * 2);
+            npyv_long c2 = npyv_load_long(data_out + vstep * 2);
+            
+#line 390
+            npyv_long a3 = npyv_load_long(data0 + vstep * 3);
+            npyv_long b3 = npyv_load_long(data1 + vstep * 3);
+            npyv_long c3 = npyv_load_long(data_out + vstep * 3);
+            
+            #line 397
+            npyv_long abc0 = npyv_muladd_long(a0, b0, c0);
+            
+#line 397
+            npyv_long abc1 = npyv_muladd_long(a1, b1, c1);
+            
+#line 397
+            npyv_long abc2 = npyv_muladd_long(a2, b2, c2);
+            
+#line 397
+            npyv_long abc3 = npyv_muladd_long(a3, b3, c3);
+            
+            #line 402
+            npyv_store_long(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_long(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_long(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_long(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Tail: fewer than 4*vstep elements remain; handled with the partial load_tillz/store_till calls. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_long a = npyv_load_tillz_long(data0, count);
+        npyv_long b = npyv_load_tillz_long(data1, count);
+        npyv_long c = npyv_load_tillz_long(data_out, count);
+        npyv_store_till_long(data_out, count, npyv_muladd_long(a, b, c));
+    }
+    npyv_cleanup();
+#else /* scalar code; this is the branch compiled here because the guard above is the literal 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { /* manual 4x unroll: load, multiply-add, store */
+        #line 420
+        const npy_long a0 = (data0[0]);
+        const npy_long b0 = (data1[0]);
+        const npy_long c0 = (data_out[0]);
+        
+#line 420
+        const npy_long a1 = (data0[1]);
+        const npy_long b1 = (data1[1]);
+        const npy_long c1 = (data_out[1]);
+        
+#line 420
+        const npy_long a2 = (data0[2]);
+        const npy_long b2 = (data1[2]);
+        const npy_long c2 = (data_out[2]);
+        
+#line 420
+        const npy_long a3 = (data0[3]);
+        const npy_long b3 = (data1[3]);
+        const npy_long c3 = (data_out[3]);
+        
+        #line 427
+        const npy_long abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_long abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_long abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_long abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* leftover elements (all of them when the unrolled loop is disabled) */
+        const npy_long a = (*data0);
+        const npy_long b = (*data1);
+        const npy_long c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_long
+
+}
+
+/* Extra two-operand specialization: operand 0 is a single value that scales contiguous operand 1 into the contiguous output. */
+static void
+long_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *const out = (npy_long *)dataptr[2];
+    npy_long *const vec = (npy_long *)dataptr[1];
+    const npy_long factor = *(const npy_long *)dataptr[0];  /* read once, reused for every element */
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    /* out[i] = factor * vec[i] + out[i] for i in [0, count) */
+    long_sum_of_products_muladd(vec, out, factor, count);
+}
+/* Two-operand specialization: operand 1 is a single value that scales contiguous operand 0 into the contiguous output. */
+static void
+long_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *const out = (npy_long *)dataptr[2];
+    npy_long *const vec = (npy_long *)dataptr[0];
+    const npy_long factor = *(const npy_long *)dataptr[1];  /* read once; out[i] = factor * vec[i] + out[i] */
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    long_sum_of_products_muladd(vec, out, factor, count);
+}
+
+static NPY_GCC_OPT_3 void
+long_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data0 = (npy_long *)dataptr[0]; /* first input, walked as a packed npy_long array */
+    npy_long *data1 = (npy_long *)dataptr[1]; /* second input, walked the same way */
+    npy_long accum = 0; /* running sum of data0[i] * data1[i] */
+    /* Dot-product style kernel: the total is added to the one value at dataptr[2]; nop is not read. */
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_long -- literal 0 here, so this vector variant is preprocessed away
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_long;
+    npyv_long v_accum = npyv_zero_long();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_long a0 = npyv_loada_long(data0 + vstep * 0);
+            npyv_long b0 = npyv_loada_long(data1 + vstep * 0);
+            
+#line 501
+            npyv_long a1 = npyv_loada_long(data0 + vstep * 1);
+            npyv_long b1 = npyv_loada_long(data1 + vstep * 1);
+            
+#line 501
+            npyv_long a2 = npyv_loada_long(data0 + vstep * 2);
+            npyv_long b2 = npyv_loada_long(data1 + vstep * 2);
+            
+#line 501
+            npyv_long a3 = npyv_loada_long(data0 + vstep * 3);
+            npyv_long b3 = npyv_loada_long(data1 + vstep * 3);
+            
+            npyv_long ab3 = npyv_muladd_long(a3, b3, v_accum);
+            npyv_long ab2 = npyv_muladd_long(a2, b2, ab3);
+            npyv_long ab1 = npyv_muladd_long(a1, b1, ab2);
+                   v_accum = npyv_muladd_long(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_long a0 = npyv_load_long(data0 + vstep * 0);
+            npyv_long b0 = npyv_load_long(data1 + vstep * 0);
+            
+#line 501
+            npyv_long a1 = npyv_load_long(data0 + vstep * 1);
+            npyv_long b1 = npyv_load_long(data1 + vstep * 1);
+            
+#line 501
+            npyv_long a2 = npyv_load_long(data0 + vstep * 2);
+            npyv_long b2 = npyv_load_long(data1 + vstep * 2);
+            
+#line 501
+            npyv_long a3 = npyv_load_long(data0 + vstep * 3);
+            npyv_long b3 = npyv_load_long(data1 + vstep * 3);
+            
+            npyv_long ab3 = npyv_muladd_long(a3, b3, v_accum);
+            npyv_long ab2 = npyv_muladd_long(a2, b2, ab3);
+            npyv_long ab1 = npyv_muladd_long(a1, b1, ab2);
+                   v_accum = npyv_muladd_long(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_long a = npyv_load_tillz_long(data0, count);
+        npyv_long b = npyv_load_tillz_long(data1, count);
+        v_accum = npyv_muladd_long(a, b, v_accum);
+    }
+    accum = npyv_sum_long(v_accum);
+    npyv_cleanup();
+#else /* scalar variant: the code that is actually compiled */
+#ifndef NPY_DISABLE_OPTIMIZATION /* manual 4x unroll, dropped when this macro is defined */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_long ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_long ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_long ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_long ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* leftover elements, one at a time */
+        const npy_long a = (*data0);
+        const npy_long b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_long
+    *(npy_long *)dataptr[2] = ((*(npy_long *)dataptr[2]) + accum); /* add to the existing output, do not overwrite */
+}
+
+static NPY_GCC_OPT_3 void
+long_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* out += op0 * sum(op1): operand 0 is read once, operand 1 is summed over count items. */
+    const npy_long scalar = *(npy_long *)dataptr[0];
+    npy_long *out = (npy_long *)dataptr[2];
+    *out += scalar * long_sum_of_arr((npy_long *)dataptr[1], count);
+}
+
+static NPY_GCC_OPT_3 void
+long_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* out += op1 * sum(op0): operand 1 is read once, operand 0 is summed over count items. */
+    const npy_long scalar = *(npy_long *)dataptr[1];
+    npy_long *out = (npy_long *)dataptr[2];
+    *out += scalar * long_sum_of_arr((npy_long *)dataptr[0], count);
+}
+
+#elif 1000 == 3 && !0
+
+static void
+long_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_long *data0 = (npy_long *)dataptr[0]; /* input 0 */
+    npy_long *data1 = (npy_long *)dataptr[1]; /* input 1 */
+    npy_long *data2 = (npy_long *)dataptr[2]; /* input 2 */
+    npy_long *data_out = (npy_long *)dataptr[3]; /* output, updated in place */
+    /* Elementwise kernel: data_out[i] += data0[i] * data1[i] * data2[i] for i < count; nop is not read. */
+    /* Unroll the loop by 8: each iteration handles 8 consecutive elements */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: fewer than 8 elements remain; each step returns once count has reached 0 */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+    /* With 0 <= count < 8 on entry to the tail, the eighth check always returns, so the index-7 step is not reached. */
+}
+
+#else /* 1000 > 3 || @complex */
+
+static void
+long_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+    /* Generic kernel: per element, multiply the nop inputs together and add the product to the output at dataptr[nop]. */
+    while (count--) {
+#if !0 /* always taken; the #else branch is the complex-number variant */
+        npy_long temp = (*(npy_long *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_long *)dataptr[i]);
+        }
+        *(npy_long *)dataptr[nop] = (temp +
+                                           (*(npy_long *)dataptr[i])); /* i == nop after the loop (for nop >= 1), i.e. the output */
+        for (i = 0; i <= nop; ++i) { /* step every pointer, inputs and output, in the caller's dataptr array */
+            dataptr[i] += sizeof(npy_long);
+        }
+#else /* complex */
+#  if 1000 <= 3
+#    define _SUMPROD_NOP 1000
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_long re, im, tmp;
+        int i;
+        re = ((npy_long *)dataptr[0])[0];
+        im = ((npy_long *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_long *)dataptr[i])[0] -
+                  im * ((npy_long *)dataptr[i])[1];
+            im = re * ((npy_long *)dataptr[i])[1] +
+                 im * ((npy_long *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_long *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_long *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_long *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_long *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_long);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+
+static NPY_GCC_OPT_3 void
+long_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0 /* always taken: reduces the count inputs at dataptr[0] and adds the result to the value at dataptr[1] */
+    npy_long *data = (npy_long *)dataptr[0];
+    npy_long accum = long_sum_of_arr(data, count); /* presumably the plain sum of data[0..count) -- helper is defined elsewhere, confirm */
+    *((npy_long *)dataptr[1]) = (accum + (*((npy_long *)dataptr[1]))); /* add to the existing output value */
+#else /* never compiled: variant that sums interleaved (re, im) pairs */
+    npy_long accum_re = 0, accum_im = 0;
+    npy_long *data0 = (npy_long *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_long re01 = data0[0] + data0[2];
+        const npy_long re23 = data0[4] + data0[6];
+        const npy_long im13 = data0[1] + data0[3];
+        const npy_long im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_long *)dataptr[1])[0] += accum_re;
+    ((npy_long *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1000 == 1 */
+
+static void
+long_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0 /* never compiled: separate real/imaginary accumulators */
+    npy_long accum_re = 0, accum_im = 0;
+#else
+    npy_long accum = 0; /* running sum of the per-element products */
+#endif
+    /* Generic reduction kernel: sums the product of the nop inputs over count elements and adds the total to dataptr[nop]. */
+#if (1000 == 1) || (1000 <= 3 && !0) /* false for the literal 1000, so none of the dataN/strideN locals below exist */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("long_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0 /* always taken; the matching #else is the complex-number variant */
+#  if 1000 == 1
+        accum += (*(npy_long *)data0);
+        data0 += stride0;
+#  elif 1000 == 2
+        accum += (*(npy_long *)data0) *
+                 (*(npy_long *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1000 == 3
+        accum += (*(npy_long *)data0) *
+                 (*(npy_long *)data1) *
+                 (*(npy_long *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else /* the branch compiled here: operand count comes from the nop argument */
+        npy_long temp = (*(npy_long *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_long *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) { /* advance the inputs only; dataptr[nop] (the output) is left where it is */
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        accum_re += ((npy_long *)data0)[0];
+        accum_im += ((npy_long *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_long re, im, tmp;
+        int i;
+        re = ((npy_long *)dataptr[0])[0];
+        im = ((npy_long *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_long *)dataptr[i])[0] -
+                  im * ((npy_long *)dataptr[i])[1];
+            im = re * ((npy_long *)dataptr[i])[1] +
+                 im * ((npy_long *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0
+#  if 1000 <= 3
+    ((npy_long *)dataptr[1000])[0] += accum_re;
+    ((npy_long *)dataptr[1000])[1] += accum_im;
+#  else
+    ((npy_long *)dataptr[nop])[0] += accum_re;
+    ((npy_long *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1000 <= 3
+    *((npy_long *)dataptr[1000]) = (accum +
+                                    (*((npy_long *)dataptr[1000])));
+#  else /* the store that is compiled: add the total to the existing output value */
+    *((npy_long *)dataptr[nop]) = (accum +
+                                    (*((npy_long *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+
+
+#line 74
+
+#if !0 /* always true, so the helper below is always compiled */
+static NPY_GCC_OPT_3 npy_longlong longlong_sum_of_arr(npy_longlong *data, npy_intp count)
+{
+    npy_longlong accum = 0; /* return value: sum of data[0] .. data[count - 1] */
+#if 0 // NPYV check for npy_longlong -- literal 0 here, so this vector variant is preprocessed away
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data);
+    const int vstep = npyv_nlanes_s64;
+    npyv_s64 v_accum = npyv_zero_s64();
+    const npy_intp vstepx4 = vstep * 4;
+
+    #line 91
+    if(is_aligned) {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_s64 a0 = npyv_loada_s64(data + vstep * 0);
+            
+#line 96
+            npyv_s64 a1 = npyv_loada_s64(data + vstep * 1);
+            
+#line 96
+            npyv_s64 a2 = npyv_loada_s64(data + vstep * 2);
+            
+#line 96
+            npyv_s64 a3 = npyv_loada_s64(data + vstep * 3);
+            
+            npyv_s64 a01   = npyv_add_s64(a0, a1);
+            npyv_s64 a23   = npyv_add_s64(a2, a3);
+            npyv_s64 a0123 = npyv_add_s64(a01, a23);
+                     v_accum = npyv_add_s64(a0123, v_accum);
+        }
+    }
+    
+#line 91
+    else {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_s64 a0 = npyv_load_s64(data + vstep * 0);
+            
+#line 96
+            npyv_s64 a1 = npyv_load_s64(data + vstep * 1);
+            
+#line 96
+            npyv_s64 a2 = npyv_load_s64(data + vstep * 2);
+            
+#line 96
+            npyv_s64 a3 = npyv_load_s64(data + vstep * 3);
+            
+            npyv_s64 a01   = npyv_add_s64(a0, a1);
+            npyv_s64 a23   = npyv_add_s64(a2, a3);
+            npyv_s64 a0123 = npyv_add_s64(a01, a23);
+                     v_accum = npyv_add_s64(a0123, v_accum);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep) {
+        npyv_s64 a = npyv_load_tillz_s64(data, count);
+        v_accum = npyv_add_s64(a, v_accum);
+    }
+    accum = npyv_sum_s64(v_accum);
+    npyv_cleanup();
+#else /* scalar variant: the code that is actually compiled */
+#ifndef NPY_DISABLE_OPTIMIZATION /* 4x unroll; the strict count > 4 leaves a final group of exactly 4 to the loop below */
+    for (; count > 4; count -= 4, data += 4) {
+        const npy_longlong a01 = (*data) + (data[1]);
+        const npy_longlong a23 = (data[2]) + (data[3]);
+        accum +=  a01 + a23;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data++) { /* leftover elements, one at a time */
+        accum += (*data);
+    }
+#endif // NPYV check for npy_longlong
+    return accum;
+}
+#endif
+
+#line 131
+static void
+longlong_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1 == 1) || (1 <= 3 && !0) /* true: the single input and its byte stride */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1 == 1) || (1 <= 3 && !0) /* true: the output sits right after the one input, at index 1 */
+    char *data_out = dataptr[1];
+    npy_intp stride_out = strides[1];
+#endif
+    /* Strided one-operand kernel: adds each input value to the matching output value, count times; nop is not read. */
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_one (%d)\n", (int)count);
+
+    while (count--) {
+#if !0 /* always taken; the matching #else is the complex-number variant */
+#  if 1 == 1 /* the only branch compiled: with a single operand there is nothing to multiply */
+        *(npy_longlong *)data_out = ((*(npy_longlong *)data0) +
+                                         (*(npy_longlong *)data_out));
+        data0 += stride0; /* char pointers, so the strides are applied in bytes */
+        data_out += stride_out;
+#  elif 1 == 2
+        *(npy_longlong *)data_out = ((*(npy_longlong *)data0) *
+                                         (*(npy_longlong *)data1) +
+                                         (*(npy_longlong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1 == 3
+        *(npy_longlong *)data_out = ((*(npy_longlong *)data0) *
+                                         (*(npy_longlong *)data1) *
+                                         (*(npy_longlong *)data2) +
+                                         (*(npy_longlong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_longlong temp = (*(npy_longlong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longlong *)dataptr[i]);
+        }
+        *(npy_longlong *)dataptr[nop] = (temp +
+                                           (*(npy_longlong *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        ((npy_longlong *)data_out)[0] = ((npy_longlong *)data0)[0] +
+                                         ((npy_longlong *)data_out)[0];
+        ((npy_longlong *)data_out)[1] = ((npy_longlong *)data0)[1] +
+                                         ((npy_longlong *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_longlong re, im, tmp;
+        int i;
+        re = ((npy_longlong *)dataptr[0])[0];
+        im = ((npy_longlong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longlong *)dataptr[i])[0] -
+                  im * ((npy_longlong *)dataptr[i])[1];
+            im = re * ((npy_longlong *)dataptr[i])[1] +
+                 im * ((npy_longlong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_longlong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_longlong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_longlong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_longlong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1 == 1
+
+static void
+longlong_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longlong *data0 = (npy_longlong *)dataptr[0]; /* input */
+    npy_longlong *data_out = (npy_longlong *)dataptr[1]; /* output, updated in place */
+    /* Contiguous one-operand kernel: data_out[i] += data0[i] for i < count; nop is not read. */
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster; the goto at the end re-enters here for the tail */
+finish_after_unrolled_loop:
+    switch (count) { /* cases 7..1 fall through on purpose; any other count matches no case and drops to the loop below */
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_longlong *)data_out + 2*6)[0] =
+                                    ((npy_longlong *)data0 + 2*6)[0] +
+                                    ((npy_longlong *)data_out + 2*6)[0];
+            ((npy_longlong *)data_out + 2*6)[1] =
+                                    ((npy_longlong *)data0 + 2*6)[1] +
+                                    ((npy_longlong *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_longlong *)data_out + 2*5)[0] =
+                                    ((npy_longlong *)data0 + 2*5)[0] +
+                                    ((npy_longlong *)data_out + 2*5)[0];
+            ((npy_longlong *)data_out + 2*5)[1] =
+                                    ((npy_longlong *)data0 + 2*5)[1] +
+                                    ((npy_longlong *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_longlong *)data_out + 2*4)[0] =
+                                    ((npy_longlong *)data0 + 2*4)[0] +
+                                    ((npy_longlong *)data_out + 2*4)[0];
+            ((npy_longlong *)data_out + 2*4)[1] =
+                                    ((npy_longlong *)data0 + 2*4)[1] +
+                                    ((npy_longlong *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_longlong *)data_out + 2*3)[0] =
+                                    ((npy_longlong *)data0 + 2*3)[0] +
+                                    ((npy_longlong *)data_out + 2*3)[0];
+            ((npy_longlong *)data_out + 2*3)[1] =
+                                    ((npy_longlong *)data0 + 2*3)[1] +
+                                    ((npy_longlong *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_longlong *)data_out + 2*2)[0] =
+                                    ((npy_longlong *)data0 + 2*2)[0] +
+                                    ((npy_longlong *)data_out + 2*2)[0];
+            ((npy_longlong *)data_out + 2*2)[1] =
+                                    ((npy_longlong *)data0 + 2*2)[1] +
+                                    ((npy_longlong *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_longlong *)data_out + 2*1)[0] =
+                                    ((npy_longlong *)data0 + 2*1)[0] +
+                                    ((npy_longlong *)data_out + 2*1)[0];
+            ((npy_longlong *)data_out + 2*1)[1] =
+                                    ((npy_longlong *)data0 + 2*1)[1] +
+                                    ((npy_longlong *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_longlong *)data_out + 2*0)[0] =
+                                    ((npy_longlong *)data0 + 2*0)[0] +
+                                    ((npy_longlong *)data_out + 2*0)[0];
+            ((npy_longlong *)data_out + 2*0)[1] =
+                                    ((npy_longlong *)data0 + 2*0)[1] +
+                                    ((npy_longlong *)data_out + 2*0)[1];
+#endif
+
+        case 0: /* nothing left: this is the function's only exit */
+            return;
+    }
+
+    /* Unroll the loop by 8: each iteration handles 8 consecutive elements */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*0)[0] =
+                                ((npy_longlong *)data0 + 2*0)[0] +
+                                ((npy_longlong *)data_out + 2*0)[0];
+        ((npy_longlong *)data_out + 2*0)[1] =
+                                ((npy_longlong *)data0 + 2*0)[1] +
+                                ((npy_longlong *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*1)[0] =
+                                ((npy_longlong *)data0 + 2*1)[0] +
+                                ((npy_longlong *)data_out + 2*1)[0];
+        ((npy_longlong *)data_out + 2*1)[1] =
+                                ((npy_longlong *)data0 + 2*1)[1] +
+                                ((npy_longlong *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*2)[0] =
+                                ((npy_longlong *)data0 + 2*2)[0] +
+                                ((npy_longlong *)data_out + 2*2)[0];
+        ((npy_longlong *)data_out + 2*2)[1] =
+                                ((npy_longlong *)data0 + 2*2)[1] +
+                                ((npy_longlong *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*3)[0] =
+                                ((npy_longlong *)data0 + 2*3)[0] +
+                                ((npy_longlong *)data_out + 2*3)[0];
+        ((npy_longlong *)data_out + 2*3)[1] =
+                                ((npy_longlong *)data0 + 2*3)[1] +
+                                ((npy_longlong *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*4)[0] =
+                                ((npy_longlong *)data0 + 2*4)[0] +
+                                ((npy_longlong *)data_out + 2*4)[0];
+        ((npy_longlong *)data_out + 2*4)[1] =
+                                ((npy_longlong *)data0 + 2*4)[1] +
+                                ((npy_longlong *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*5)[0] =
+                                ((npy_longlong *)data0 + 2*5)[0] +
+                                ((npy_longlong *)data_out + 2*5)[0];
+        ((npy_longlong *)data_out + 2*5)[1] =
+                                ((npy_longlong *)data0 + 2*5)[1] +
+                                ((npy_longlong *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*6)[0] =
+                                ((npy_longlong *)data0 + 2*6)[0] +
+                                ((npy_longlong *)data_out + 2*6)[0];
+        ((npy_longlong *)data_out + 2*6)[1] =
+                                ((npy_longlong *)data0 + 2*6)[1] +
+                                ((npy_longlong *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*7)[0] =
+                                ((npy_longlong *)data0 + 2*7)[0] +
+                                ((npy_longlong *)data_out + 2*7)[0];
+        ((npy_longlong *)data_out + 2*7)[1] =
+                                ((npy_longlong *)data0 + 2*7)[1] +
+                                ((npy_longlong *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: jump back to the switch, which handles the remaining count (now below 8) and returns */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1 == 2 && !0
+
+// Multiply-add over count elements, in place: data_out[i] = scalar * data[i] + data_out[i]
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_muladd(npy_longlong *data, npy_longlong *data_out, npy_longlong scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_longlong -- literal 0 here, so this vector variant is preprocessed away
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s64;
+    const npyv_s64 v_scalar = npyv_setall_s64(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s64 b0 = npyv_loada_s64(data + vstep * 0);
+            npyv_s64 c0 = npyv_loada_s64(data_out + vstep * 0);
+            
+#line 312
+            npyv_s64 b1 = npyv_loada_s64(data + vstep * 1);
+            npyv_s64 c1 = npyv_loada_s64(data_out + vstep * 1);
+            
+#line 312
+            npyv_s64 b2 = npyv_loada_s64(data + vstep * 2);
+            npyv_s64 c2 = npyv_loada_s64(data_out + vstep * 2);
+            
+#line 312
+            npyv_s64 b3 = npyv_loada_s64(data + vstep * 3);
+            npyv_s64 c3 = npyv_loada_s64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s64 abc0 = npyv_muladd_s64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s64 abc1 = npyv_muladd_s64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s64 abc2 = npyv_muladd_s64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s64 abc3 = npyv_muladd_s64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_s64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_s64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_s64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_s64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s64 b0 = npyv_load_s64(data + vstep * 0);
+            npyv_s64 c0 = npyv_load_s64(data_out + vstep * 0);
+            
+#line 312
+            npyv_s64 b1 = npyv_load_s64(data + vstep * 1);
+            npyv_s64 c1 = npyv_load_s64(data_out + vstep * 1);
+            
+#line 312
+            npyv_s64 b2 = npyv_load_s64(data + vstep * 2);
+            npyv_s64 c2 = npyv_load_s64(data_out + vstep * 2);
+            
+#line 312
+            npyv_s64 b3 = npyv_load_s64(data + vstep * 3);
+            npyv_s64 c3 = npyv_load_s64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s64 abc0 = npyv_muladd_s64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s64 abc1 = npyv_muladd_s64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s64 abc2 = npyv_muladd_s64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s64 abc3 = npyv_muladd_s64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_s64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_s64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_s64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_s64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_s64 a = npyv_load_tillz_s64(data, count);
+        npyv_s64 b = npyv_load_tillz_s64(data_out, count);
+        npyv_store_till_s64(data_out, count, npyv_muladd_s64(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* scalar variant: the code that is actually compiled */
+#ifndef NPY_DISABLE_OPTIMIZATION /* manual 4x unroll (load, compute, store), dropped when this macro is defined */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_longlong b0 = (data[0]);
+        const npy_longlong c0 = (data_out[0]);
+        
+#line 340
+        const npy_longlong b1 = (data[1]);
+        const npy_longlong c1 = (data_out[1]);
+        
+#line 340
+        const npy_longlong b2 = (data[2]);
+        const npy_longlong c2 = (data_out[2]);
+        
+#line 340
+        const npy_longlong b3 = (data[3]);
+        const npy_longlong c3 = (data_out[3]);
+        
+        #line 346
+        const npy_longlong abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_longlong abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_longlong abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_longlong abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* leftover elements, one at a time */
+        const npy_longlong b = (*data);
+        const npy_longlong c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_longlong
+}
+
+/*
+ * Element-wise multiply-accumulate over contiguous npy_longlong arrays:
+ *
+ *     out[i] = in0[i] * in1[i] + out[i]        for i in [0, count)
+ *
+ * dataptr[0] and dataptr[1] are the two inputs and dataptr[2] is the output,
+ * which is updated in place.  `nop` is not used, and the strides are ignored
+ * because every operand is walked as a plain contiguous array.
+ *
+ * The NPYV check for npy_longlong is 0 in this expansion, so no SIMD variant
+ * is compiled; only the scalar implementation is provided.
+ */
+static void
+longlong_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longlong *data0 = (npy_longlong *)dataptr[0];
+    npy_longlong *data1 = (npy_longlong *)dataptr[1];
+    npy_longlong *data_out = (npy_longlong *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Four elements per iteration.  All four results are computed before
+     * the first store, so every load of a group precedes its stores.
+     */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        const npy_longlong abc0 = data0[0] * data1[0] + data_out[0];
+        const npy_longlong abc1 = data0[1] * data1[1] + data_out[1];
+        const npy_longlong abc2 = data0[2] * data1[2] + data_out[2];
+        const npy_longlong abc3 = data0[3] * data1[3] + data_out[3];
+
+        data_out[0] = abc0;
+        data_out[1] = abc1;
+        data_out[2] = abc2;
+        data_out[3] = abc3;
+    }
+#endif /* !NPY_DISABLE_OPTIMIZATION */
+    /* Leftover elements, or all of them when the unrolled loop is disabled. */
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        *data_out = (*data0) * (*data1) + (*data_out);
+    }
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand kernel in which operand 0 is read once as a single value while
+ * operand 1 and the output are handed over as npy_longlong arrays.  The work
+ * is delegated to longlong_sum_of_products_muladd (expected to perform
+ * out[i] += value * in[i]; confirm against that helper).
+ * `nop` is not used and the strides are ignored.
+ */
+static void
+longlong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* The multiplier is read exactly once, before anything else happens. */
+    const npy_longlong multiplier = *(npy_longlong *)dataptr[0];
+    npy_longlong *vec = (npy_longlong *)dataptr[1];
+    npy_longlong *out = (npy_longlong *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    longlong_sum_of_products_muladd(vec, out, multiplier, count);
+}
+
+/*
+ * Two-operand kernel in which operand 1 is read once as a single value while
+ * operand 0 and the output are handed over as npy_longlong arrays.  The work
+ * is delegated to longlong_sum_of_products_muladd (expected to perform
+ * out[i] += value * in[i]; confirm against that helper).
+ * `nop` is not used and the strides are ignored.
+ */
+static void
+longlong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* The multiplier is read exactly once, before anything else happens. */
+    const npy_longlong multiplier = *(npy_longlong *)dataptr[1];
+    npy_longlong *vec = (npy_longlong *)dataptr[0];
+    npy_longlong *out = (npy_longlong *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    longlong_sum_of_products_muladd(vec, out, multiplier, count);
+}
+
+/*
+ * Sum of element-wise products of two contiguous npy_longlong arrays, added
+ * to a single output value:
+ *
+ *     *out = *out + sum(in0[i] * in1[i])       for i in [0, count)
+ *
+ * dataptr[0] and dataptr[1] are the inputs; dataptr[2] points at the value
+ * that receives the sum.  `nop` is not used and the strides are ignored.
+ *
+ * The NPYV check for npy_longlong is 0 in this expansion, so no SIMD variant
+ * is compiled; only the scalar implementation is provided.
+ */
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longlong *data0 = (npy_longlong *)dataptr[0];
+    npy_longlong *data1 = (npy_longlong *)dataptr[1];
+    npy_longlong accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four independent products per iteration, folded into the running sum. */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        const npy_longlong ab0 = data0[0] * data1[0];
+        const npy_longlong ab1 = data0[1] * data1[1];
+        const npy_longlong ab2 = data0[2] * data1[2];
+        const npy_longlong ab3 = data0[3] * data1[3];
+
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif /* !NPY_DISABLE_OPTIMIZATION */
+    /* Leftover elements, or all of them when the unrolled loop is disabled. */
+    for (; count > 0; --count, ++data0, ++data1) {
+        accum += (*data0) * (*data1);
+    }
+
+    /* The output is read only after the whole reduction has finished. */
+    *(npy_longlong *)dataptr[2] = ((*(npy_longlong *)dataptr[2]) + accum);
+}
+
+/*
+ * Two-operand reduction in which operand 0 is read once as a single value:
+ *
+ *     *out = *out + value0 * longlong_sum_of_arr(in1, count)
+ *
+ * longlong_sum_of_arr is presumably the plain sum of `count` elements
+ * (confirm against that helper).  dataptr[2] points at the output value.
+ * `nop` is not used and the strides are ignored.
+ */
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Read the single multiplier before the array is reduced. */
+    const npy_longlong factor = *(npy_longlong *)dataptr[0];
+    const npy_longlong total = longlong_sum_of_arr((npy_longlong *)dataptr[1], count);
+    npy_longlong *out = (npy_longlong *)dataptr[2];
+
+    *out = *out + factor * total;
+}
+
+/*
+ * Two-operand reduction in which operand 1 is read once as a single value:
+ *
+ *     *out = *out + value1 * longlong_sum_of_arr(in0, count)
+ *
+ * longlong_sum_of_arr is presumably the plain sum of `count` elements
+ * (confirm against that helper).  dataptr[2] points at the output value.
+ * `nop` is not used and the strides are ignored.
+ */
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Read the single multiplier before the array is reduced. */
+    const npy_longlong factor = *(npy_longlong *)dataptr[1];
+    const npy_longlong total = longlong_sum_of_arr((npy_longlong *)dataptr[0], count);
+    npy_longlong *out = (npy_longlong *)dataptr[2];
+
+    *out = *out + factor * total;
+}
+
+#elif 1 == 3 && !0
+
+/*
+ * Element-wise three-operand multiply-accumulate over contiguous
+ * npy_longlong arrays:
+ *
+ *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
+ *
+ * dataptr[0..2] are the inputs and dataptr[3] is the output, updated in
+ * place.  `nop` is not used and the strides are ignored.
+ */
+static void
+longlong_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longlong *in0 = (npy_longlong *)dataptr[0];
+    npy_longlong *in1 = (npy_longlong *)dataptr[1];
+    npy_longlong *in2 = (npy_longlong *)dataptr[2];
+    npy_longlong *out = (npy_longlong *)dataptr[3];
+    int k;
+
+    /* Whole groups of eight elements, processed in index order. */
+    for (; count >= 8; count -= 8, in0 += 8, in1 += 8, in2 += 8, out += 8) {
+        for (k = 0; k < 8; ++k) {
+            out[k] = in0[k] * in1[k] * in2[k] + out[k];
+        }
+    }
+
+    /*
+     * Remainder: up to eight more slots, leaving as soon as the counter
+     * reaches zero.  The counter is tested (and decremented) before each
+     * element is touched.
+     */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] = in0[k] * in1[k] * in2[k] + out[k];
+    }
+}
+
+#else /* 1 > 3 || @complex */
+
+/*
+ * Generic contiguous kernel working directly on the dataptr[] slots.
+ *
+ * For each of the `count` steps it multiplies the current npy_longlong
+ * element of operands 0 .. nop-1, adds the product to the current element of
+ * operand `nop` (the output), and then advances every pointer
+ * dataptr[0 .. nop] by sizeof(npy_longlong).
+ *
+ * Note that the caller's dataptr[] entries are modified: on return they
+ * point past the processed elements.  The strides are ignored.
+ *
+ * Only the real-valued path exists for npy_longlong; the complex variant of
+ * this kernel can never be compiled in this expansion.
+ */
+static void
+longlong_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+        npy_longlong temp = (*(npy_longlong *)dataptr[0]);
+        int i;
+        /* Product of the current element of every input operand. */
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longlong *)dataptr[i]);
+        }
+        /* After the loop i == nop (for nop >= 1), i.e. the output slot. */
+        *(npy_longlong *)dataptr[nop] = (temp +
+                                           (*(npy_longlong *)dataptr[i]));
+        /* Step inputs and output (note the <=) to the next element. */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_longlong);
+        }
+    }
+}
+
+#endif /* functions for various 1 */
+
+#if 1 == 1
+
+/*
+ * Single-operand reduction of a contiguous npy_longlong array into one
+ * output value:
+ *
+ *     *out = longlong_sum_of_arr(in, count) + *out
+ *
+ * dataptr[0] is the input array and dataptr[1] points at the output value.
+ * longlong_sum_of_arr is presumably the plain sum of `count` elements
+ * (confirm against that helper).  `nop` and `strides` are not used.
+ *
+ * Only the real-valued path exists for npy_longlong; the complex variant of
+ * this kernel can never be compiled in this expansion.
+ */
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+    npy_longlong *data = (npy_longlong *)dataptr[0];
+    npy_longlong accum = longlong_sum_of_arr(data, count);
+    /* The output is read only after the reduction has finished. */
+    *((npy_longlong *)dataptr[1]) = (accum + (*((npy_longlong *)dataptr[1])));
+}
+
+#endif /* 1 == 1 */
+
+/*
+ * Single-operand strided reduction into one output value:
+ *
+ *     *out = (sum of `count` npy_longlong elements read from dataptr[0],
+ *             stepping strides[0] bytes between reads) + *out
+ *
+ * dataptr[1] points at the output value.  The caller's dataptr[] entries are
+ * not modified; a local cursor walks the input.  `nop` is not used.
+ *
+ * This is the one-operand, real-valued expansion of the kernel; the
+ * two-operand, three-operand, generic and complex variants can never be
+ * compiled here and are not present.
+ */
+static void
+longlong_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    npy_longlong accum = 0;
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+        accum += (*(npy_longlong *)data0);
+        /* data0 is a char pointer, so the stride is applied in bytes. */
+        data0 += stride0;
+    }
+
+    /* The output is read only after the whole input has been summed. */
+    *((npy_longlong *)dataptr[1]) = (accum +
+                                    (*((npy_longlong *)dataptr[1])));
+}
+
+
+#line 131
+/*
+ * Fully strided two-operand multiply-accumulate:
+ *
+ *     out = in0 * in1 + out        repeated `count` times
+ *
+ * dataptr[0] and dataptr[1] are the inputs and dataptr[2] is the output;
+ * after each element every cursor advances by its own entry of strides[]
+ * (in bytes, since the cursors are char pointers).  The caller's dataptr[]
+ * entries are not modified.  `nop` is not used.
+ *
+ * This is the two-operand, real-valued expansion of the kernel; the
+ * one-operand, three-operand, generic and complex variants can never be
+ * compiled here and are not present.
+ */
+static void
+longlong_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+    char *data_out = dataptr[2];
+    npy_intp stride_out = strides[2];
+
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_two (%d)\n", (int)count);
+
+    while (count--) {
+        *(npy_longlong *)data_out = ((*(npy_longlong *)data0) *
+                                         (*(npy_longlong *)data1) +
+                                         (*(npy_longlong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+    }
+}
+
+#if 2 == 1
+
+static void
+longlong_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longlong *data0 = (npy_longlong *)dataptr[0];
+    npy_longlong *data_out = (npy_longlong *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_longlong *)data_out + 2*6)[0] =
+                                    ((npy_longlong *)data0 + 2*6)[0] +
+                                    ((npy_longlong *)data_out + 2*6)[0];
+            ((npy_longlong *)data_out + 2*6)[1] =
+                                    ((npy_longlong *)data0 + 2*6)[1] +
+                                    ((npy_longlong *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_longlong *)data_out + 2*5)[0] =
+                                    ((npy_longlong *)data0 + 2*5)[0] +
+                                    ((npy_longlong *)data_out + 2*5)[0];
+            ((npy_longlong *)data_out + 2*5)[1] =
+                                    ((npy_longlong *)data0 + 2*5)[1] +
+                                    ((npy_longlong *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_longlong *)data_out + 2*4)[0] =
+                                    ((npy_longlong *)data0 + 2*4)[0] +
+                                    ((npy_longlong *)data_out + 2*4)[0];
+            ((npy_longlong *)data_out + 2*4)[1] =
+                                    ((npy_longlong *)data0 + 2*4)[1] +
+                                    ((npy_longlong *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_longlong *)data_out + 2*3)[0] =
+                                    ((npy_longlong *)data0 + 2*3)[0] +
+                                    ((npy_longlong *)data_out + 2*3)[0];
+            ((npy_longlong *)data_out + 2*3)[1] =
+                                    ((npy_longlong *)data0 + 2*3)[1] +
+                                    ((npy_longlong *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_longlong *)data_out + 2*2)[0] =
+                                    ((npy_longlong *)data0 + 2*2)[0] +
+                                    ((npy_longlong *)data_out + 2*2)[0];
+            ((npy_longlong *)data_out + 2*2)[1] =
+                                    ((npy_longlong *)data0 + 2*2)[1] +
+                                    ((npy_longlong *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_longlong *)data_out + 2*1)[0] =
+                                    ((npy_longlong *)data0 + 2*1)[0] +
+                                    ((npy_longlong *)data_out + 2*1)[0];
+            ((npy_longlong *)data_out + 2*1)[1] =
+                                    ((npy_longlong *)data0 + 2*1)[1] +
+                                    ((npy_longlong *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_longlong *)data_out + 2*0)[0] =
+                                    ((npy_longlong *)data0 + 2*0)[0] +
+                                    ((npy_longlong *)data_out + 2*0)[0];
+            ((npy_longlong *)data_out + 2*0)[1] =
+                                    ((npy_longlong *)data0 + 2*0)[1] +
+                                    ((npy_longlong *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*0)[0] =
+                                ((npy_longlong *)data0 + 2*0)[0] +
+                                ((npy_longlong *)data_out + 2*0)[0];
+        ((npy_longlong *)data_out + 2*0)[1] =
+                                ((npy_longlong *)data0 + 2*0)[1] +
+                                ((npy_longlong *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*1)[0] =
+                                ((npy_longlong *)data0 + 2*1)[0] +
+                                ((npy_longlong *)data_out + 2*1)[0];
+        ((npy_longlong *)data_out + 2*1)[1] =
+                                ((npy_longlong *)data0 + 2*1)[1] +
+                                ((npy_longlong *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*2)[0] =
+                                ((npy_longlong *)data0 + 2*2)[0] +
+                                ((npy_longlong *)data_out + 2*2)[0];
+        ((npy_longlong *)data_out + 2*2)[1] =
+                                ((npy_longlong *)data0 + 2*2)[1] +
+                                ((npy_longlong *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*3)[0] =
+                                ((npy_longlong *)data0 + 2*3)[0] +
+                                ((npy_longlong *)data_out + 2*3)[0];
+        ((npy_longlong *)data_out + 2*3)[1] =
+                                ((npy_longlong *)data0 + 2*3)[1] +
+                                ((npy_longlong *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*4)[0] =
+                                ((npy_longlong *)data0 + 2*4)[0] +
+                                ((npy_longlong *)data_out + 2*4)[0];
+        ((npy_longlong *)data_out + 2*4)[1] =
+                                ((npy_longlong *)data0 + 2*4)[1] +
+                                ((npy_longlong *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*5)[0] =
+                                ((npy_longlong *)data0 + 2*5)[0] +
+                                ((npy_longlong *)data_out + 2*5)[0];
+        ((npy_longlong *)data_out + 2*5)[1] =
+                                ((npy_longlong *)data0 + 2*5)[1] +
+                                ((npy_longlong *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*6)[0] =
+                                ((npy_longlong *)data0 + 2*6)[0] +
+                                ((npy_longlong *)data_out + 2*6)[0];
+        ((npy_longlong *)data_out + 2*6)[1] =
+                                ((npy_longlong *)data0 + 2*6)[1] +
+                                ((npy_longlong *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*7)[0] =
+                                ((npy_longlong *)data0 + 2*7)[0] +
+                                ((npy_longlong *)data_out + 2*7)[0];
+        ((npy_longlong *)data_out + 2*7)[1] =
+                                ((npy_longlong *)data0 + 2*7)[1] +
+                                ((npy_longlong *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 2 == 2 && !0
+
+// In-place multiply-add kernel: data_out[i] = scalar * data[i] + data_out[i] for i in [0, count)
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_muladd(npy_longlong *data, npy_longlong *data_out, npy_longlong scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_longlong: the guard is the literal 0 here, so this npyv_* branch is not compiled
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s64;
+    const npyv_s64 v_scalar = npyv_setall_s64(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s64 b0 = npyv_loada_s64(data + vstep * 0);
+            npyv_s64 c0 = npyv_loada_s64(data_out + vstep * 0);
+            
+#line 312
+            npyv_s64 b1 = npyv_loada_s64(data + vstep * 1);
+            npyv_s64 c1 = npyv_loada_s64(data_out + vstep * 1);
+            
+#line 312
+            npyv_s64 b2 = npyv_loada_s64(data + vstep * 2);
+            npyv_s64 c2 = npyv_loada_s64(data_out + vstep * 2);
+            
+#line 312
+            npyv_s64 b3 = npyv_loada_s64(data + vstep * 3);
+            npyv_s64 c3 = npyv_loada_s64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s64 abc0 = npyv_muladd_s64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s64 abc1 = npyv_muladd_s64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s64 abc2 = npyv_muladd_s64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s64 abc3 = npyv_muladd_s64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_s64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_s64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_s64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_s64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s64 b0 = npyv_load_s64(data + vstep * 0);
+            npyv_s64 c0 = npyv_load_s64(data_out + vstep * 0);
+            
+#line 312
+            npyv_s64 b1 = npyv_load_s64(data + vstep * 1);
+            npyv_s64 c1 = npyv_load_s64(data_out + vstep * 1);
+            
+#line 312
+            npyv_s64 b2 = npyv_load_s64(data + vstep * 2);
+            npyv_s64 c2 = npyv_load_s64(data_out + vstep * 2);
+            
+#line 312
+            npyv_s64 b3 = npyv_load_s64(data + vstep * 3);
+            npyv_s64 c3 = npyv_load_s64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s64 abc0 = npyv_muladd_s64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s64 abc1 = npyv_muladd_s64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s64 abc2 = npyv_muladd_s64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s64 abc3 = npyv_muladd_s64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_s64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_s64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_s64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_s64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Tail: fewer than 4*vstep elements remain; they are consumed one vstep at a time through the _tillz/_till partial load/store calls. */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_s64 a = npyv_load_tillz_s64(data, count);
+        npyv_s64 b = npyv_load_tillz_s64(data_out, count);
+        npyv_store_till_s64(data_out, count, npyv_muladd_s64(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* scalar fallback: the branch that is compiled in this expansion */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { /* manually unrolled by 4: load, compute, then store */
+        #line 340
+        const npy_longlong b0 = (data[0]);
+        const npy_longlong c0 = (data_out[0]);
+        
+#line 340
+        const npy_longlong b1 = (data[1]);
+        const npy_longlong c1 = (data_out[1]);
+        
+#line 340
+        const npy_longlong b2 = (data[2]);
+        const npy_longlong c2 = (data_out[2]);
+        
+#line 340
+        const npy_longlong b3 = (data[3]);
+        const npy_longlong c3 = (data_out[3]);
+        
+        #line 346
+        const npy_longlong abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_longlong abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_longlong abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_longlong abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* remainder, or every element when the unrolled loop is disabled */
+        const npy_longlong b = (*data);
+        const npy_longlong c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_longlong
+}
+
+static void
+longlong_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* data_out[i] += data0[i] * data1[i] for i in [0, count); operands come from dataptr[0..1], the output from dataptr[2] */
+    npy_longlong *data0 = (npy_longlong *)dataptr[0];
+    npy_longlong *data1 = (npy_longlong *)dataptr[1];
+    npy_longlong *data_out = (npy_longlong *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_longlong: the guard below is the literal 0, so only the scalar #else path is compiled
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s64;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s64 a0 = npyv_loada_s64(data0 + vstep * 0);
+            npyv_s64 b0 = npyv_loada_s64(data1 + vstep * 0);
+            npyv_s64 c0 = npyv_loada_s64(data_out + vstep * 0);
+            
+#line 390
+            npyv_s64 a1 = npyv_loada_s64(data0 + vstep * 1);
+            npyv_s64 b1 = npyv_loada_s64(data1 + vstep * 1);
+            npyv_s64 c1 = npyv_loada_s64(data_out + vstep * 1);
+            
+#line 390
+            npyv_s64 a2 = npyv_loada_s64(data0 + vstep * 2);
+            npyv_s64 b2 = npyv_loada_s64(data1 + vstep * 2);
+            npyv_s64 c2 = npyv_loada_s64(data_out + vstep * 2);
+            
+#line 390
+            npyv_s64 a3 = npyv_loada_s64(data0 + vstep * 3);
+            npyv_s64 b3 = npyv_loada_s64(data1 + vstep * 3);
+            npyv_s64 c3 = npyv_loada_s64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s64 abc0 = npyv_muladd_s64(a0, b0, c0);
+            
+#line 397
+            npyv_s64 abc1 = npyv_muladd_s64(a1, b1, c1);
+            
+#line 397
+            npyv_s64 abc2 = npyv_muladd_s64(a2, b2, c2);
+            
+#line 397
+            npyv_s64 abc3 = npyv_muladd_s64(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_s64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_s64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_s64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_s64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s64 a0 = npyv_load_s64(data0 + vstep * 0);
+            npyv_s64 b0 = npyv_load_s64(data1 + vstep * 0);
+            npyv_s64 c0 = npyv_load_s64(data_out + vstep * 0);
+            
+#line 390
+            npyv_s64 a1 = npyv_load_s64(data0 + vstep * 1);
+            npyv_s64 b1 = npyv_load_s64(data1 + vstep * 1);
+            npyv_s64 c1 = npyv_load_s64(data_out + vstep * 1);
+            
+#line 390
+            npyv_s64 a2 = npyv_load_s64(data0 + vstep * 2);
+            npyv_s64 b2 = npyv_load_s64(data1 + vstep * 2);
+            npyv_s64 c2 = npyv_load_s64(data_out + vstep * 2);
+            
+#line 390
+            npyv_s64 a3 = npyv_load_s64(data0 + vstep * 3);
+            npyv_s64 b3 = npyv_load_s64(data1 + vstep * 3);
+            npyv_s64 c3 = npyv_load_s64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s64 abc0 = npyv_muladd_s64(a0, b0, c0);
+            
+#line 397
+            npyv_s64 abc1 = npyv_muladd_s64(a1, b1, c1);
+            
+#line 397
+            npyv_s64 abc2 = npyv_muladd_s64(a2, b2, c2);
+            
+#line 397
+            npyv_s64 abc3 = npyv_muladd_s64(a3, b3, c3);
+            
+            #line 402
+            npyv_store_s64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_s64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_s64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_s64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Tail: fewer than 4*vstep elements remain; they are consumed one vstep at a time through the _tillz/_till partial load/store calls. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_s64 a = npyv_load_tillz_s64(data0, count);
+        npyv_s64 b = npyv_load_tillz_s64(data1, count);
+        npyv_s64 c = npyv_load_tillz_s64(data_out, count);
+        npyv_store_till_s64(data_out, count, npyv_muladd_s64(a, b, c));
+    }
+    npyv_cleanup();
+#else /* scalar fallback: the branch that is compiled in this expansion */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { /* manually unrolled by 4: load, compute, then store */
+        #line 420
+        const npy_longlong a0 = (data0[0]);
+        const npy_longlong b0 = (data1[0]);
+        const npy_longlong c0 = (data_out[0]);
+        
+#line 420
+        const npy_longlong a1 = (data0[1]);
+        const npy_longlong b1 = (data1[1]);
+        const npy_longlong c1 = (data_out[1]);
+        
+#line 420
+        const npy_longlong a2 = (data0[2]);
+        const npy_longlong b2 = (data1[2]);
+        const npy_longlong c2 = (data_out[2]);
+        
+#line 420
+        const npy_longlong a3 = (data0[3]);
+        const npy_longlong b3 = (data1[3]);
+        const npy_longlong c3 = (data_out[3]);
+        
+        #line 427
+        const npy_longlong abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_longlong abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_longlong abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_longlong abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* remainder, or every element when the unrolled loop is disabled */
+        const npy_longlong a = (*data0);
+        const npy_longlong b = (*data1);
+        const npy_longlong c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_longlong
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+longlong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* out[i] += scalar * vec[i]: operand 0 is read once as the scalar; operand 1 and the output advance one element per step. */
+    npy_longlong *out = (npy_longlong *)dataptr[2];
+    npy_longlong *vec = (npy_longlong *)dataptr[1];
+    const npy_longlong scalar = *(npy_longlong *)dataptr[0];
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    /* All of the arithmetic lives in the shared multiply-add kernel. */
+    longlong_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+static void
+longlong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* out[i] += scalar * vec[i]: operand 1 is read once as the scalar; operand 0 and the output advance one element per step. */
+    npy_longlong *out = (npy_longlong *)dataptr[2];
+    npy_longlong *vec = (npy_longlong *)dataptr[0];
+    const npy_longlong scalar = *(npy_longlong *)dataptr[1];
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    longlong_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Dot-product style reduction: *dataptr[2] += sum over i in [0, count) of data0[i] * data1[i] */
+    npy_longlong *data0 = (npy_longlong *)dataptr[0];
+    npy_longlong *data1 = (npy_longlong *)dataptr[1];
+    npy_longlong accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_longlong: the guard is the literal 0 here, so this npyv_* branch is not compiled
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_s64;
+    npyv_s64 v_accum = npyv_zero_s64();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s64 a0 = npyv_loada_s64(data0 + vstep * 0);
+            npyv_s64 b0 = npyv_loada_s64(data1 + vstep * 0);
+            
+#line 501
+            npyv_s64 a1 = npyv_loada_s64(data0 + vstep * 1);
+            npyv_s64 b1 = npyv_loada_s64(data1 + vstep * 1);
+            
+#line 501
+            npyv_s64 a2 = npyv_loada_s64(data0 + vstep * 2);
+            npyv_s64 b2 = npyv_loada_s64(data1 + vstep * 2);
+            
+#line 501
+            npyv_s64 a3 = npyv_loada_s64(data0 + vstep * 3);
+            npyv_s64 b3 = npyv_loada_s64(data1 + vstep * 3);
+            
+            npyv_s64 ab3 = npyv_muladd_s64(a3, b3, v_accum);
+            npyv_s64 ab2 = npyv_muladd_s64(a2, b2, ab3);
+            npyv_s64 ab1 = npyv_muladd_s64(a1, b1, ab2);
+                   v_accum = npyv_muladd_s64(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s64 a0 = npyv_load_s64(data0 + vstep * 0);
+            npyv_s64 b0 = npyv_load_s64(data1 + vstep * 0);
+            
+#line 501
+            npyv_s64 a1 = npyv_load_s64(data0 + vstep * 1);
+            npyv_s64 b1 = npyv_load_s64(data1 + vstep * 1);
+            
+#line 501
+            npyv_s64 a2 = npyv_load_s64(data0 + vstep * 2);
+            npyv_s64 b2 = npyv_load_s64(data1 + vstep * 2);
+            
+#line 501
+            npyv_s64 a3 = npyv_load_s64(data0 + vstep * 3);
+            npyv_s64 b3 = npyv_load_s64(data1 + vstep * 3);
+            
+            npyv_s64 ab3 = npyv_muladd_s64(a3, b3, v_accum);
+            npyv_s64 ab2 = npyv_muladd_s64(a2, b2, ab3);
+            npyv_s64 ab1 = npyv_muladd_s64(a1, b1, ab2);
+                   v_accum = npyv_muladd_s64(a0, b0, ab1);
+        }
+    }
+    /* Tail: fewer than 4*vstep elements remain; accumulate them one vstep at a time, then reduce v_accum with npyv_sum_s64. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_s64 a = npyv_load_tillz_s64(data0, count);
+        npyv_s64 b = npyv_load_tillz_s64(data1, count);
+        v_accum = npyv_muladd_s64(a, b, v_accum);
+    }
+    accum = npyv_sum_s64(v_accum);
+    npyv_cleanup();
+#else /* scalar fallback: the branch that is compiled in this expansion */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) { /* manually unrolled by 4: four products summed per iteration */
+        #line 524
+        const npy_longlong ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_longlong ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_longlong ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_longlong ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* remainder, or every element when the unrolled loop is disabled */
+        const npy_longlong a = (*data0);
+        const npy_longlong b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_longlong
+    *(npy_longlong *)dataptr[2] = ((*(npy_longlong *)dataptr[2]) + accum); /* the output element is read and written once, after the loops */
+}
+
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* out += scalar * sum(vec): operand 0 is a single value, so it is applied once to the sum of operand 1. */
+    const npy_longlong scalar = *(npy_longlong *)dataptr[0];
+    const npy_longlong total = longlong_sum_of_arr((npy_longlong *)dataptr[1], count);
+    *(npy_longlong *)dataptr[2] += scalar * total;
+}
+
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* out += scalar * sum(vec): operand 1 is a single value, so it is applied once to the sum of operand 0. */
+    const npy_longlong total = longlong_sum_of_arr((npy_longlong *)dataptr[0], count);
+    const npy_longlong scalar = *(npy_longlong *)dataptr[1];
+    *(npy_longlong *)dataptr[2] += scalar * total;
+}
+
+#elif 2 == 3 && !0
+
+static void
+longlong_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* data_out[i] += data0[i] * data1[i] * data2[i] for i in [0, count); the enclosing #elif (2 == 3) is false in this expansion */
+    npy_longlong *data0 = (npy_longlong *)dataptr[0];
+    npy_longlong *data1 = (npy_longlong *)dataptr[1];
+    npy_longlong *data2 = (npy_longlong *)dataptr[2];
+    npy_longlong *data_out = (npy_longlong *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: count < 8 here; each step below returns as soon as count reaches 0 */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 2 > 3 || complex */
+
+static void
+longlong_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Generic fallback: multiplies dataptr[0..nop-1] element-wise and adds the product into dataptr[nop], one element per iteration */
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0 /* real-valued path; the literal !0 makes this the compiled side of the #if */
+        npy_longlong temp = (*(npy_longlong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longlong *)dataptr[i]);
+        }
+        *(npy_longlong *)dataptr[nop] = (temp +
+                                           (*(npy_longlong *)dataptr[i])); /* i == nop after the loop, i.e. the output operand */
+        for (i = 0; i <= nop; ++i) { /* advance every operand and the output by one element */
+            dataptr[i] += sizeof(npy_longlong);
+        }
+#else /* complex */
+#  if 2 <= 3
+#    define _SUMPROD_NOP 2
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_longlong re, im, tmp;
+        int i;
+        re = ((npy_longlong *)dataptr[0])[0];
+        im = ((npy_longlong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) { /* running complex product: (re + i*im) *= operand i */
+            tmp = re * ((npy_longlong *)dataptr[i])[0] -
+                  im * ((npy_longlong *)dataptr[i])[1];
+            im = re * ((npy_longlong *)dataptr[i])[1] +
+                 im * ((npy_longlong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_longlong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_longlong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_longlong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_longlong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_longlong);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Single-operand reduction: adds the sum of count elements of dataptr[0] into *dataptr[1]; the enclosing #if (2 == 1) is false in this expansion */
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0 /* real-valued path: delegate the summation to longlong_sum_of_arr */
+    npy_longlong *data = (npy_longlong *)dataptr[0];
+    npy_longlong accum = longlong_sum_of_arr(data, count);
+    *((npy_longlong *)dataptr[1]) = (accum + (*((npy_longlong *)dataptr[1])));
+#else
+    npy_longlong accum_re = 0, accum_im = 0;
+    npy_longlong *data0 = (npy_longlong *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) { /* 4 (re, im) pairs per pass; the strict > leaves 1 to 4 pairs for the loop below */
+        const npy_longlong re01 = data0[0] + data0[2];
+        const npy_longlong re23 = data0[4] + data0[6];
+        const npy_longlong im13 = data0[1] + data0[3];
+        const npy_longlong im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) { /* remaining pairs: even slot is the real part, odd slot the imaginary part */
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_longlong *)dataptr[1])[0] += accum_re;
+    ((npy_longlong *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 2 == 1 */
+
+static void
+longlong_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Strided reduction: *dataptr[2] += sum over count steps of (*data0) * (*data1), each pointer advancing by its own strides[] entry */
+#if 0 /* complex accumulators; the guard is the literal 0, so the #else declaration is the one compiled */
+    npy_longlong accum_re = 0, accum_im = 0;
+#else
+    npy_longlong accum = 0;
+#endif
+
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 2 == 1
+        accum += (*(npy_longlong *)data0);
+        data0 += stride0;
+#  elif 2 == 2 /* the branch selected in this expansion */
+        accum += (*(npy_longlong *)data0) *
+                 (*(npy_longlong *)data1);
+        data0 += stride0; /* data0/data1 are char *, so the strides are applied in bytes */
+        data1 += stride1;
+#  elif 2 == 3
+        accum += (*(npy_longlong *)data0) *
+                 (*(npy_longlong *)data1) *
+                 (*(npy_longlong *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_longlong temp = (*(npy_longlong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longlong *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        accum_re += ((npy_longlong *)data0)[0];
+        accum_im += ((npy_longlong *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_longlong re, im, tmp;
+        int i;
+        re = ((npy_longlong *)dataptr[0])[0];
+        im = ((npy_longlong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longlong *)dataptr[i])[0] -
+                  im * ((npy_longlong *)dataptr[i])[1];
+            im = re * ((npy_longlong *)dataptr[i])[1] +
+                 im * ((npy_longlong *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0
+#  if 2 <= 3
+    ((npy_longlong *)dataptr[2])[0] += accum_re;
+    ((npy_longlong *)dataptr[2])[1] += accum_im;
+#  else
+    ((npy_longlong *)dataptr[nop])[0] += accum_re;
+    ((npy_longlong *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 2 <= 3 /* the output operand index is the literal 2 rather than the runtime nop */
+    *((npy_longlong *)dataptr[2]) = (accum +
+                                    (*((npy_longlong *)dataptr[2])));
+#  else
+    *((npy_longlong *)dataptr[nop]) = (accum +
+                                    (*((npy_longlong *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+longlong_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Strided three-operand kernel: *data_out += (*data0) * (*data1) * (*data2) per step, every pointer advancing by its own strides[] entry */
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data_out = dataptr[3];
+    npy_intp stride_out = strides[3];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_three (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 3 == 1
+        *(npy_longlong *)data_out = ((*(npy_longlong *)data0) +
+                                         (*(npy_longlong *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 3 == 2
+        *(npy_longlong *)data_out = ((*(npy_longlong *)data0) *
+                                         (*(npy_longlong *)data1) +
+                                         (*(npy_longlong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 3 == 3 /* the branch selected in this expansion */
+        *(npy_longlong *)data_out = ((*(npy_longlong *)data0) *
+                                         (*(npy_longlong *)data1) *
+                                         (*(npy_longlong *)data2) +
+                                         (*(npy_longlong *)data_out));
+        data0 += stride0; /* the pointers are char *, so the strides are applied in bytes */
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_longlong temp = (*(npy_longlong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longlong *)dataptr[i]);
+        }
+        *(npy_longlong *)dataptr[nop] = (temp +
+                                           (*(npy_longlong *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        ((npy_longlong *)data_out)[0] = ((npy_longlong *)data0)[0] +
+                                         ((npy_longlong *)data_out)[0];
+        ((npy_longlong *)data_out)[1] = ((npy_longlong *)data0)[1] +
+                                         ((npy_longlong *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_longlong re, im, tmp;
+        int i;
+        re = ((npy_longlong *)dataptr[0])[0];
+        im = ((npy_longlong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longlong *)dataptr[i])[0] -
+                  im * ((npy_longlong *)dataptr[i])[1];
+            im = re * ((npy_longlong *)dataptr[i])[1] +
+                 im * ((npy_longlong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_longlong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_longlong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_longlong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_longlong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 3 == 1
+
+static void /* One-operand contiguous kernel: data_out[k] = data0[k] + data_out[k]; under `#if 3 == 1`, so not compiled here. */
+longlong_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longlong *data0 = (npy_longlong *)dataptr[0];
+    npy_longlong *data_out = (npy_longlong *)dataptr[1];
+    /* nop is not read; both buffers are indexed directly as npy_longlong arrays. */
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) { /* 0..7 enter at `case count` and fall through to the return; there is no default, so other values skip the switch */
+#line 246
+        case 6+1: /* count == 7: handle index 6, then fall through to the lower indices */
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_longlong *)data_out + 2*6)[0] =
+                                    ((npy_longlong *)data0 + 2*6)[0] +
+                                    ((npy_longlong *)data_out + 2*6)[0];
+            ((npy_longlong *)data_out + 2*6)[1] =
+                                    ((npy_longlong *)data0 + 2*6)[1] +
+                                    ((npy_longlong *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_longlong *)data_out + 2*5)[0] =
+                                    ((npy_longlong *)data0 + 2*5)[0] +
+                                    ((npy_longlong *)data_out + 2*5)[0];
+            ((npy_longlong *)data_out + 2*5)[1] =
+                                    ((npy_longlong *)data0 + 2*5)[1] +
+                                    ((npy_longlong *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_longlong *)data_out + 2*4)[0] =
+                                    ((npy_longlong *)data0 + 2*4)[0] +
+                                    ((npy_longlong *)data_out + 2*4)[0];
+            ((npy_longlong *)data_out + 2*4)[1] =
+                                    ((npy_longlong *)data0 + 2*4)[1] +
+                                    ((npy_longlong *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_longlong *)data_out + 2*3)[0] =
+                                    ((npy_longlong *)data0 + 2*3)[0] +
+                                    ((npy_longlong *)data_out + 2*3)[0];
+            ((npy_longlong *)data_out + 2*3)[1] =
+                                    ((npy_longlong *)data0 + 2*3)[1] +
+                                    ((npy_longlong *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_longlong *)data_out + 2*2)[0] =
+                                    ((npy_longlong *)data0 + 2*2)[0] +
+                                    ((npy_longlong *)data_out + 2*2)[0];
+            ((npy_longlong *)data_out + 2*2)[1] =
+                                    ((npy_longlong *)data0 + 2*2)[1] +
+                                    ((npy_longlong *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_longlong *)data_out + 2*1)[0] =
+                                    ((npy_longlong *)data0 + 2*1)[0] +
+                                    ((npy_longlong *)data_out + 2*1)[0];
+            ((npy_longlong *)data_out + 2*1)[1] =
+                                    ((npy_longlong *)data0 + 2*1)[1] +
+                                    ((npy_longlong *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_longlong *)data_out + 2*0)[0] =
+                                    ((npy_longlong *)data0 + 2*0)[0] +
+                                    ((npy_longlong *)data_out + 2*0)[0];
+            ((npy_longlong *)data_out + 2*0)[1] =
+                                    ((npy_longlong *)data0 + 2*0)[1] +
+                                    ((npy_longlong *)data_out + 2*0)[1];
+#endif
+
+        case 0: /* the only exit from this function */
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*0)[0] =
+                                ((npy_longlong *)data0 + 2*0)[0] +
+                                ((npy_longlong *)data_out + 2*0)[0];
+        ((npy_longlong *)data_out + 2*0)[1] =
+                                ((npy_longlong *)data0 + 2*0)[1] +
+                                ((npy_longlong *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*1)[0] =
+                                ((npy_longlong *)data0 + 2*1)[0] +
+                                ((npy_longlong *)data_out + 2*1)[0];
+        ((npy_longlong *)data_out + 2*1)[1] =
+                                ((npy_longlong *)data0 + 2*1)[1] +
+                                ((npy_longlong *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*2)[0] =
+                                ((npy_longlong *)data0 + 2*2)[0] +
+                                ((npy_longlong *)data_out + 2*2)[0];
+        ((npy_longlong *)data_out + 2*2)[1] =
+                                ((npy_longlong *)data0 + 2*2)[1] +
+                                ((npy_longlong *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*3)[0] =
+                                ((npy_longlong *)data0 + 2*3)[0] +
+                                ((npy_longlong *)data_out + 2*3)[0];
+        ((npy_longlong *)data_out + 2*3)[1] =
+                                ((npy_longlong *)data0 + 2*3)[1] +
+                                ((npy_longlong *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*4)[0] =
+                                ((npy_longlong *)data0 + 2*4)[0] +
+                                ((npy_longlong *)data_out + 2*4)[0];
+        ((npy_longlong *)data_out + 2*4)[1] =
+                                ((npy_longlong *)data0 + 2*4)[1] +
+                                ((npy_longlong *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*5)[0] =
+                                ((npy_longlong *)data0 + 2*5)[0] +
+                                ((npy_longlong *)data_out + 2*5)[0];
+        ((npy_longlong *)data_out + 2*5)[1] =
+                                ((npy_longlong *)data0 + 2*5)[1] +
+                                ((npy_longlong *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*6)[0] =
+                                ((npy_longlong *)data0 + 2*6)[0] +
+                                ((npy_longlong *)data_out + 2*6)[0];
+        ((npy_longlong *)data_out + 2*6)[1] =
+                                ((npy_longlong *)data0 + 2*6)[1] +
+                                ((npy_longlong *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*7)[0] =
+                                ((npy_longlong *)data0 + 2*7)[0] +
+                                ((npy_longlong *)data_out + 2*7)[0];
+        ((npy_longlong *)data_out + 2*7)[1] =
+                                ((npy_longlong *)data0 + 2*7)[1] +
+                                ((npy_longlong *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: jump back so the switch handles the remaining 0..7 elements. NOTE(review): a negative count would never match a case and would loop here forever; assumes callers pass count >= 0 (TODO confirm). */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 3 == 2 && !0
+
+// Multiply-add over count contiguous elements: data_out[i] = scalar * data[i] + data_out[i]
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_muladd(npy_longlong *data, npy_longlong *data_out, npy_longlong scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_longlong: the condition is the constant 0, so this SIMD section is not compiled
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s64;
+    const npyv_s64 v_scalar = npyv_setall_s64(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s64 b0 = npyv_loada_s64(data + vstep * 0);
+            npyv_s64 c0 = npyv_loada_s64(data_out + vstep * 0);
+            
+#line 312
+            npyv_s64 b1 = npyv_loada_s64(data + vstep * 1);
+            npyv_s64 c1 = npyv_loada_s64(data_out + vstep * 1);
+            
+#line 312
+            npyv_s64 b2 = npyv_loada_s64(data + vstep * 2);
+            npyv_s64 c2 = npyv_loada_s64(data_out + vstep * 2);
+            
+#line 312
+            npyv_s64 b3 = npyv_loada_s64(data + vstep * 3);
+            npyv_s64 c3 = npyv_loada_s64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s64 abc0 = npyv_muladd_s64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s64 abc1 = npyv_muladd_s64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s64 abc2 = npyv_muladd_s64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s64 abc3 = npyv_muladd_s64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_s64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_s64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_s64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_s64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s64 b0 = npyv_load_s64(data + vstep * 0);
+            npyv_s64 c0 = npyv_load_s64(data_out + vstep * 0);
+            
+#line 312
+            npyv_s64 b1 = npyv_load_s64(data + vstep * 1);
+            npyv_s64 c1 = npyv_load_s64(data_out + vstep * 1);
+            
+#line 312
+            npyv_s64 b2 = npyv_load_s64(data + vstep * 2);
+            npyv_s64 c2 = npyv_load_s64(data_out + vstep * 2);
+            
+#line 312
+            npyv_s64 b3 = npyv_load_s64(data + vstep * 3);
+            npyv_s64 c3 = npyv_load_s64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s64 abc0 = npyv_muladd_s64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s64 abc1 = npyv_muladd_s64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s64 abc2 = npyv_muladd_s64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s64 abc3 = npyv_muladd_s64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_s64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_s64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_s64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_s64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_s64 a = npyv_load_tillz_s64(data, count);
+        npyv_s64 b = npyv_load_tillz_s64(data_out, count);
+        npyv_store_till_s64(data_out, count, npyv_muladd_s64(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* scalar path: the code that is actually compiled for npy_longlong */
+#ifndef NPY_DISABLE_OPTIMIZATION /* main loop, manually unrolled four elements at a time */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_longlong b0 = (data[0]);
+        const npy_longlong c0 = (data_out[0]);
+        
+#line 340
+        const npy_longlong b1 = (data[1]);
+        const npy_longlong c1 = (data_out[1]);
+        
+#line 340
+        const npy_longlong b2 = (data[2]);
+        const npy_longlong c2 = (data_out[2]);
+        
+#line 340
+        const npy_longlong b3 = (data[3]);
+        const npy_longlong c3 = (data_out[3]);
+        
+        #line 346
+        const npy_longlong abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_longlong abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_longlong abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_longlong abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* remainder, or every element when the unrolled loop is disabled */
+        const npy_longlong b = (*data);
+        const npy_longlong c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_longlong
+}
+
+static void /* Two-operand contiguous kernel: data_out[i] = data0[i] * data1[i] + data_out[i] for count elements. */
+longlong_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longlong *data0 = (npy_longlong *)dataptr[0];
+    npy_longlong *data1 = (npy_longlong *)dataptr[1];
+    npy_longlong *data_out = (npy_longlong *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_longlong: the constant 0 below removes the SIMD section from compilation
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s64;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s64 a0 = npyv_loada_s64(data0 + vstep * 0);
+            npyv_s64 b0 = npyv_loada_s64(data1 + vstep * 0);
+            npyv_s64 c0 = npyv_loada_s64(data_out + vstep * 0);
+            
+#line 390
+            npyv_s64 a1 = npyv_loada_s64(data0 + vstep * 1);
+            npyv_s64 b1 = npyv_loada_s64(data1 + vstep * 1);
+            npyv_s64 c1 = npyv_loada_s64(data_out + vstep * 1);
+            
+#line 390
+            npyv_s64 a2 = npyv_loada_s64(data0 + vstep * 2);
+            npyv_s64 b2 = npyv_loada_s64(data1 + vstep * 2);
+            npyv_s64 c2 = npyv_loada_s64(data_out + vstep * 2);
+            
+#line 390
+            npyv_s64 a3 = npyv_loada_s64(data0 + vstep * 3);
+            npyv_s64 b3 = npyv_loada_s64(data1 + vstep * 3);
+            npyv_s64 c3 = npyv_loada_s64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s64 abc0 = npyv_muladd_s64(a0, b0, c0);
+            
+#line 397
+            npyv_s64 abc1 = npyv_muladd_s64(a1, b1, c1);
+            
+#line 397
+            npyv_s64 abc2 = npyv_muladd_s64(a2, b2, c2);
+            
+#line 397
+            npyv_s64 abc3 = npyv_muladd_s64(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_s64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_s64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_s64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_s64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s64 a0 = npyv_load_s64(data0 + vstep * 0);
+            npyv_s64 b0 = npyv_load_s64(data1 + vstep * 0);
+            npyv_s64 c0 = npyv_load_s64(data_out + vstep * 0);
+            
+#line 390
+            npyv_s64 a1 = npyv_load_s64(data0 + vstep * 1);
+            npyv_s64 b1 = npyv_load_s64(data1 + vstep * 1);
+            npyv_s64 c1 = npyv_load_s64(data_out + vstep * 1);
+            
+#line 390
+            npyv_s64 a2 = npyv_load_s64(data0 + vstep * 2);
+            npyv_s64 b2 = npyv_load_s64(data1 + vstep * 2);
+            npyv_s64 c2 = npyv_load_s64(data_out + vstep * 2);
+            
+#line 390
+            npyv_s64 a3 = npyv_load_s64(data0 + vstep * 3);
+            npyv_s64 b3 = npyv_load_s64(data1 + vstep * 3);
+            npyv_s64 c3 = npyv_load_s64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s64 abc0 = npyv_muladd_s64(a0, b0, c0);
+            
+#line 397
+            npyv_s64 abc1 = npyv_muladd_s64(a1, b1, c1);
+            
+#line 397
+            npyv_s64 abc2 = npyv_muladd_s64(a2, b2, c2);
+            
+#line 397
+            npyv_s64 abc3 = npyv_muladd_s64(a3, b3, c3);
+            
+            #line 402
+            npyv_store_s64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_s64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_s64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_s64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_s64 a = npyv_load_tillz_s64(data0, count);
+        npyv_s64 b = npyv_load_tillz_s64(data1, count);
+        npyv_s64 c = npyv_load_tillz_s64(data_out, count);
+        npyv_store_till_s64(data_out, count, npyv_muladd_s64(a, b, c));
+    }
+    npyv_cleanup();
+#else /* scalar path: the code that is actually compiled for npy_longlong */
+#ifndef NPY_DISABLE_OPTIMIZATION /* main loop, manually unrolled four elements at a time */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_longlong a0 = (data0[0]);
+        const npy_longlong b0 = (data1[0]);
+        const npy_longlong c0 = (data_out[0]);
+        
+#line 420
+        const npy_longlong a1 = (data0[1]);
+        const npy_longlong b1 = (data1[1]);
+        const npy_longlong c1 = (data_out[1]);
+        
+#line 420
+        const npy_longlong a2 = (data0[2]);
+        const npy_longlong b2 = (data1[2]);
+        const npy_longlong c2 = (data_out[2]);
+        
+#line 420
+        const npy_longlong a3 = (data0[3]);
+        const npy_longlong b3 = (data1[3]);
+        const npy_longlong c3 = (data_out[3]);
+        
+        #line 427
+        const npy_longlong abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_longlong abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_longlong abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_longlong abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* remainder, or every element when the unrolled loop is disabled */
+        const npy_longlong a = (*data0);
+        const npy_longlong b = (*data1);
+        const npy_longlong c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_longlong
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+longlong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 0 supplies a single value (read once); operand 1 and the output are passed on as arrays. */
+    const npy_longlong factor = *(npy_longlong *)dataptr[0];
+    npy_longlong *const src = (npy_longlong *)dataptr[1];
+    npy_longlong *const dst = (npy_longlong *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    /* Delegate the element loop to the shared multiply-add helper. */
+    longlong_sum_of_products_muladd(src, dst, factor, count);
+}
+
+static void
+longlong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Mirror of the stride0/contig case: operand 1 supplies the single value, operand 0 is the array. */
+    const npy_longlong factor = *(npy_longlong *)dataptr[1];
+    npy_longlong *const src = (npy_longlong *)dataptr[0];
+    npy_longlong *const dst = (npy_longlong *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    longlong_sum_of_products_muladd(src, dst, factor, count);
+}
+
+static NPY_GCC_OPT_3 void /* Reduction kernel: adds the sum of data0[i] * data1[i] over count elements to the single output at dataptr[2]. */
+longlong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longlong *data0 = (npy_longlong *)dataptr[0];
+    npy_longlong *data1 = (npy_longlong *)dataptr[1];
+    npy_longlong accum = 0;
+    /* nop is not read; products are collected in accum and the output is touched once, at the end. */
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_longlong: the condition is the constant 0, so this SIMD section is not compiled
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_s64;
+    npyv_s64 v_accum = npyv_zero_s64();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s64 a0 = npyv_loada_s64(data0 + vstep * 0);
+            npyv_s64 b0 = npyv_loada_s64(data1 + vstep * 0);
+            
+#line 501
+            npyv_s64 a1 = npyv_loada_s64(data0 + vstep * 1);
+            npyv_s64 b1 = npyv_loada_s64(data1 + vstep * 1);
+            
+#line 501
+            npyv_s64 a2 = npyv_loada_s64(data0 + vstep * 2);
+            npyv_s64 b2 = npyv_loada_s64(data1 + vstep * 2);
+            
+#line 501
+            npyv_s64 a3 = npyv_loada_s64(data0 + vstep * 3);
+            npyv_s64 b3 = npyv_loada_s64(data1 + vstep * 3);
+            
+            npyv_s64 ab3 = npyv_muladd_s64(a3, b3, v_accum);
+            npyv_s64 ab2 = npyv_muladd_s64(a2, b2, ab3);
+            npyv_s64 ab1 = npyv_muladd_s64(a1, b1, ab2);
+                   v_accum = npyv_muladd_s64(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s64 a0 = npyv_load_s64(data0 + vstep * 0);
+            npyv_s64 b0 = npyv_load_s64(data1 + vstep * 0);
+            
+#line 501
+            npyv_s64 a1 = npyv_load_s64(data0 + vstep * 1);
+            npyv_s64 b1 = npyv_load_s64(data1 + vstep * 1);
+            
+#line 501
+            npyv_s64 a2 = npyv_load_s64(data0 + vstep * 2);
+            npyv_s64 b2 = npyv_load_s64(data1 + vstep * 2);
+            
+#line 501
+            npyv_s64 a3 = npyv_load_s64(data0 + vstep * 3);
+            npyv_s64 b3 = npyv_load_s64(data1 + vstep * 3);
+            
+            npyv_s64 ab3 = npyv_muladd_s64(a3, b3, v_accum);
+            npyv_s64 ab2 = npyv_muladd_s64(a2, b2, ab3);
+            npyv_s64 ab1 = npyv_muladd_s64(a1, b1, ab2);
+                   v_accum = npyv_muladd_s64(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_s64 a = npyv_load_tillz_s64(data0, count);
+        npyv_s64 b = npyv_load_tillz_s64(data1, count);
+        v_accum = npyv_muladd_s64(a, b, v_accum);
+    }
+    accum = npyv_sum_s64(v_accum);
+    npyv_cleanup();
+#else /* scalar path: the code that is actually compiled for npy_longlong */
+#ifndef NPY_DISABLE_OPTIMIZATION /* main loop, manually unrolled four products at a time */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_longlong ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_longlong ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_longlong ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_longlong ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* remainder, or every element when the unrolled loop is disabled */
+        const npy_longlong a = (*data0);
+        const npy_longlong b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_longlong
+    *(npy_longlong *)dataptr[2] = ((*(npy_longlong *)dataptr[2]) + accum); /* the only access to the output */
+}
+
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Adds value-of-operand-0 times longlong_sum_of_arr(operand 1, count) to the single output element. */
+    const npy_longlong scale = *(npy_longlong *)dataptr[0];
+    const npy_longlong total = longlong_sum_of_arr((npy_longlong *)dataptr[1], count);
+    *(npy_longlong *)dataptr[2] += scale * total;
+}
+
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Adds value-of-operand-1 times longlong_sum_of_arr(operand 0, count) to the single output element. */
+    const npy_longlong scale = *(npy_longlong *)dataptr[1];
+    const npy_longlong total = longlong_sum_of_arr((npy_longlong *)dataptr[0], count);
+    *(npy_longlong *)dataptr[2] += scale * total;
+}
+
+#elif 3 == 3 && !0
+
+static void /* Three-operand contiguous kernel: data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i] for count elements. */
+longlong_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longlong *data0 = (npy_longlong *)dataptr[0];
+    npy_longlong *data1 = (npy_longlong *)dataptr[1];
+    npy_longlong *data2 = (npy_longlong *)dataptr[2];
+    npy_longlong *data_out = (npy_longlong *)dataptr[3];
+    /* nop is not read, and this variant emits no NPY_EINSUM_DBG_PRINT1 trace. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: count < 8 here; each guard returns once the remainder is used up, otherwise handles one more index */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) { /* with a non-negative remainder (at most 7), this eighth guard returns whenever it is reached */
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 3 > 3 || @complex */
+
+static void /* Generic contiguous kernel; it sits in the #else arm, so it is not compiled when the `3 == 3 && !0` arm above is taken. */
+longlong_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_three (%d)\n",
+                                                    (int)count);
+    /* One output element per iteration; the entries of dataptr[] are advanced in place. */
+    while (count--) {
+#if !0 /* real-valued variant: product of the nop operands, added into the output */
+        npy_longlong temp = (*(npy_longlong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longlong *)dataptr[i]);
+        }
+        *(npy_longlong *)dataptr[nop] = (temp + /* i == nop after the loop, so dataptr[i] below is the output slot */
+                                           (*(npy_longlong *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_longlong); /* step one element; applies to the operands and the output alike */
+        }
+#else /* complex: not selected, because the guard above is the constant !0 */
+#  if 3 <= 3
+#    define _SUMPROD_NOP 3
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_longlong re, im, tmp;
+        int i;
+        re = ((npy_longlong *)dataptr[0])[0];
+        im = ((npy_longlong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longlong *)dataptr[i])[0] -
+                  im * ((npy_longlong *)dataptr[i])[1];
+            im = re * ((npy_longlong *)dataptr[i])[1] +
+                 im * ((npy_longlong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_longlong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_longlong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_longlong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_longlong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_longlong);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 3 */
+
+#if 3 == 1
+
+/*
+ * One-operand contiguous reduction into a single npy_longlong output.
+ *
+ * The live arm hands dataptr[0] and `count` to longlong_sum_of_arr and
+ * adds the returned value to the element at dataptr[1].  `nop` and
+ * `strides` are not read.
+ */
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_longlong *out = (npy_longlong *)dataptr[1];
+    const npy_longlong total =
+            longlong_sum_of_arr((npy_longlong *)dataptr[0], count);
+
+    *out = total + *out;
+#else
+    /* complex arm: the input is walked as interleaved (re, im) pairs */
+    npy_longlong *src = (npy_longlong *)dataptr[0];
+    npy_longlong *out = (npy_longlong *)dataptr[1];
+    npy_longlong sum_re = 0;
+    npy_longlong sum_im = 0;
+#ifndef NPY_DISABLE_OPTIMIZATION
+    while (count > 4) {
+        const npy_longlong re01 = src[0] + src[2];
+        const npy_longlong re23 = src[4] + src[6];
+        const npy_longlong im13 = src[1] + src[3];
+        const npy_longlong im57 = src[5] + src[7];
+
+        sum_re += re01 + re23;
+        sum_im += im13 + im57;
+        count -= 4;
+        src += 4*2;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    while (count > 0) {
+        sum_re += src[0];
+        sum_im += src[1];
+        --count;
+        src += 2;
+    }
+    out[0] += sum_re;
+    out[1] += sum_im;
+#endif // !0
+}
+
+#endif /* 3 == 1 */
+
+/*
+ * Three-operand sum-of-products for npy_longlong that reduces into a single
+ * output element.
+ *
+ * For each of `count` iterations the elements addressed by dataptr[0],
+ * dataptr[1] and dataptr[2] are multiplied together and added to a local
+ * accumulator; each input pointer then advances by its entry in `strides`.
+ * After the loop the accumulator is added once to the element at
+ * dataptr[3].  The output pointer is never advanced in this function.
+ *
+ * All preprocessor conditions below are constants in this expansion: only
+ * the real-valued `3 == 3` arms are compiled, the `3 == 1`, `3 == 2`,
+ * generic-nop and complex arms are dead.  `nop` is not read on the live
+ * path.
+ */
+static void
+longlong_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_longlong accum_re = 0, accum_im = 0;
+#else
+    npy_longlong accum = 0;
+#endif
+
+    /* Local copies of the input cursors/strides used by the live arm. */
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 3 == 1
+        accum += (*(npy_longlong *)data0);
+        data0 += stride0;
+#  elif 3 == 2
+        accum += (*(npy_longlong *)data0) *
+                 (*(npy_longlong *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 3 == 3
+        /* live arm: accumulate a*b*c, then step the three local cursors */
+        accum += (*(npy_longlong *)data0) *
+                 (*(npy_longlong *)data1) *
+                 (*(npy_longlong *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_longlong temp = (*(npy_longlong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longlong *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        accum_re += ((npy_longlong *)data0)[0];
+        accum_im += ((npy_longlong *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_longlong re, im, tmp;
+        int i;
+        re = ((npy_longlong *)dataptr[0])[0];
+        im = ((npy_longlong *)dataptr[0])[1];
+        /* complex multiply: (re + i*im) *= operand i; tmp keeps the new
+         * real part until the imaginary part has used the old `re` */
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longlong *)dataptr[i])[0] -
+                  im * ((npy_longlong *)dataptr[i])[1];
+            im = re * ((npy_longlong *)dataptr[i])[1] +
+                 im * ((npy_longlong *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+    /* Fold the accumulated total into the output element exactly once. */
+#if 0
+#  if 3 <= 3
+    ((npy_longlong *)dataptr[3])[0] += accum_re;
+    ((npy_longlong *)dataptr[3])[1] += accum_im;
+#  else
+    ((npy_longlong *)dataptr[nop])[0] += accum_re;
+    ((npy_longlong *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 3 <= 3
+    *((npy_longlong *)dataptr[3]) = (accum +
+                                    (*((npy_longlong *)dataptr[3])));
+#  else
+    *((npy_longlong *)dataptr[nop]) = (accum +
+                                    (*((npy_longlong *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+/*
+ * Sum-of-products step for npy_longlong with a run-time operand count.
+ *
+ * The operand-count constant in this expansion is 1000, so every
+ * `1000 == 1/2/3` and `1000 <= 3` arm is compiled out and only the generic
+ * real-valued arm runs.  Per iteration it multiplies the elements addressed
+ * by dataptr[0..nop-1], adds the product to the element addressed by
+ * dataptr[nop], and advances all nop + 1 pointers in `dataptr` by their
+ * entries in `strides`.  Note that the caller's `dataptr` array itself is
+ * modified on this path.
+ */
+static void
+longlong_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data_out = dataptr[1000];
+    npy_intp stride_out = strides[1000];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_any (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        *(npy_longlong *)data_out = ((*(npy_longlong *)data0) +
+                                         (*(npy_longlong *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1000 == 2
+        *(npy_longlong *)data_out = ((*(npy_longlong *)data0) *
+                                         (*(npy_longlong *)data1) +
+                                         (*(npy_longlong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1000 == 3
+        *(npy_longlong *)data_out = ((*(npy_longlong *)data0) *
+                                         (*(npy_longlong *)data1) *
+                                         (*(npy_longlong *)data2) +
+                                         (*(npy_longlong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        /* live arm: running product over the nop inputs */
+        npy_longlong temp = (*(npy_longlong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longlong *)dataptr[i]);
+        }
+        /* after the loop i == nop (for nop >= 1), so dataptr[i] below is
+         * the previous output value */
+        *(npy_longlong *)dataptr[nop] = (temp +
+                                           (*(npy_longlong *)dataptr[i]));
+        /* `<=` so the output pointer is advanced together with the inputs */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        ((npy_longlong *)data_out)[0] = ((npy_longlong *)data0)[0] +
+                                         ((npy_longlong *)data_out)[0];
+        ((npy_longlong *)data_out)[1] = ((npy_longlong *)data0)[1] +
+                                         ((npy_longlong *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_longlong re, im, tmp;
+        int i;
+        re = ((npy_longlong *)dataptr[0])[0];
+        im = ((npy_longlong *)dataptr[0])[1];
+        /* complex multiply by operand i; tmp holds the new real part until
+         * the imaginary part has been computed from the old `re` */
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longlong *)dataptr[i])[0] -
+                  im * ((npy_longlong *)dataptr[i])[1];
+            im = re * ((npy_longlong *)dataptr[i])[1] +
+                 im * ((npy_longlong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_longlong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_longlong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_longlong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_longlong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1000 == 1
+
+/*
+ * One-operand contiguous kernel: data_out[k] = data0[k] + data_out[k] for
+ * `count` consecutive npy_longlong elements, with data0 = dataptr[0] and
+ * data_out = dataptr[1].  `nop` and `strides` are not read.
+ *
+ * Control flow: the switch handles 0..7 elements and returns.  Larger
+ * counts match no case, fall out of the switch into the 8-way unrolled
+ * loop, and then jump back to the switch to finish the remainder.
+ *
+ * In every `#if !0` pair below the first arm is the real-valued code that
+ * is compiled for this type; the `#else` arm is the complex variant.
+ *
+ * NOTE(review): a negative `count` would match no case and skip the loop,
+ * so the goto would spin forever; presumably callers only pass
+ * count >= 0 -- confirm.
+ */
+static void
+longlong_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longlong *data0 = (npy_longlong *)dataptr[0];
+    npy_longlong *data_out = (npy_longlong *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    /* Cases intentionally fall through: case k+1 handles element k and
+     * continues downwards until case 0 returns. */
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_longlong *)data_out + 2*6)[0] =
+                                    ((npy_longlong *)data0 + 2*6)[0] +
+                                    ((npy_longlong *)data_out + 2*6)[0];
+            ((npy_longlong *)data_out + 2*6)[1] =
+                                    ((npy_longlong *)data0 + 2*6)[1] +
+                                    ((npy_longlong *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_longlong *)data_out + 2*5)[0] =
+                                    ((npy_longlong *)data0 + 2*5)[0] +
+                                    ((npy_longlong *)data_out + 2*5)[0];
+            ((npy_longlong *)data_out + 2*5)[1] =
+                                    ((npy_longlong *)data0 + 2*5)[1] +
+                                    ((npy_longlong *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_longlong *)data_out + 2*4)[0] =
+                                    ((npy_longlong *)data0 + 2*4)[0] +
+                                    ((npy_longlong *)data_out + 2*4)[0];
+            ((npy_longlong *)data_out + 2*4)[1] =
+                                    ((npy_longlong *)data0 + 2*4)[1] +
+                                    ((npy_longlong *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_longlong *)data_out + 2*3)[0] =
+                                    ((npy_longlong *)data0 + 2*3)[0] +
+                                    ((npy_longlong *)data_out + 2*3)[0];
+            ((npy_longlong *)data_out + 2*3)[1] =
+                                    ((npy_longlong *)data0 + 2*3)[1] +
+                                    ((npy_longlong *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_longlong *)data_out + 2*2)[0] =
+                                    ((npy_longlong *)data0 + 2*2)[0] +
+                                    ((npy_longlong *)data_out + 2*2)[0];
+            ((npy_longlong *)data_out + 2*2)[1] =
+                                    ((npy_longlong *)data0 + 2*2)[1] +
+                                    ((npy_longlong *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_longlong *)data_out + 2*1)[0] =
+                                    ((npy_longlong *)data0 + 2*1)[0] +
+                                    ((npy_longlong *)data_out + 2*1)[0];
+            ((npy_longlong *)data_out + 2*1)[1] =
+                                    ((npy_longlong *)data0 + 2*1)[1] +
+                                    ((npy_longlong *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_longlong *)data_out + 2*0)[0] =
+                                    ((npy_longlong *)data0 + 2*0)[0] +
+                                    ((npy_longlong *)data_out + 2*0)[0];
+            ((npy_longlong *)data_out + 2*0)[1] =
+                                    ((npy_longlong *)data0 + 2*0)[1] +
+                                    ((npy_longlong *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*0)[0] =
+                                ((npy_longlong *)data0 + 2*0)[0] +
+                                ((npy_longlong *)data_out + 2*0)[0];
+        ((npy_longlong *)data_out + 2*0)[1] =
+                                ((npy_longlong *)data0 + 2*0)[1] +
+                                ((npy_longlong *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*1)[0] =
+                                ((npy_longlong *)data0 + 2*1)[0] +
+                                ((npy_longlong *)data_out + 2*1)[0];
+        ((npy_longlong *)data_out + 2*1)[1] =
+                                ((npy_longlong *)data0 + 2*1)[1] +
+                                ((npy_longlong *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*2)[0] =
+                                ((npy_longlong *)data0 + 2*2)[0] +
+                                ((npy_longlong *)data_out + 2*2)[0];
+        ((npy_longlong *)data_out + 2*2)[1] =
+                                ((npy_longlong *)data0 + 2*2)[1] +
+                                ((npy_longlong *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*3)[0] =
+                                ((npy_longlong *)data0 + 2*3)[0] +
+                                ((npy_longlong *)data_out + 2*3)[0];
+        ((npy_longlong *)data_out + 2*3)[1] =
+                                ((npy_longlong *)data0 + 2*3)[1] +
+                                ((npy_longlong *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*4)[0] =
+                                ((npy_longlong *)data0 + 2*4)[0] +
+                                ((npy_longlong *)data_out + 2*4)[0];
+        ((npy_longlong *)data_out + 2*4)[1] =
+                                ((npy_longlong *)data0 + 2*4)[1] +
+                                ((npy_longlong *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*5)[0] =
+                                ((npy_longlong *)data0 + 2*5)[0] +
+                                ((npy_longlong *)data_out + 2*5)[0];
+        ((npy_longlong *)data_out + 2*5)[1] =
+                                ((npy_longlong *)data0 + 2*5)[1] +
+                                ((npy_longlong *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*6)[0] =
+                                ((npy_longlong *)data0 + 2*6)[0] +
+                                ((npy_longlong *)data_out + 2*6)[0];
+        ((npy_longlong *)data_out + 2*6)[1] =
+                                ((npy_longlong *)data0 + 2*6)[1] +
+                                ((npy_longlong *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_longlong *)data_out + 2*7)[0] =
+                                ((npy_longlong *)data0 + 2*7)[0] +
+                                ((npy_longlong *)data_out + 2*7)[0];
+        ((npy_longlong *)data_out + 2*7)[1] =
+                                ((npy_longlong *)data0 + 2*7)[1] +
+                                ((npy_longlong *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1000 == 2 && !0
+
+/*
+ * Element-wise multiply-add over `count` npy_longlong elements:
+ *
+ *     data_out[k] = scalar * data[k] + data_out[k]
+ *
+ * data      input elements, read only
+ * data_out  buffer that is both read and overwritten
+ * scalar    common factor applied to every element of `data`
+ * count     number of elements to process
+ *
+ * The `#if 0` arm is a vectorized (npyv_*) variant that is disabled for
+ * this type.  The compiled arm is scalar code: a 4-way unrolled loop
+ * (omitted when NPY_DISABLE_OPTIMIZATION is defined) followed by a
+ * one-element-at-a-time loop for whatever remains.
+ */
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_muladd(npy_longlong *data, npy_longlong *data_out, npy_longlong scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_longlong
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s64;
+    const npyv_s64 v_scalar = npyv_setall_s64(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s64 b0 = npyv_loada_s64(data + vstep * 0);
+            npyv_s64 c0 = npyv_loada_s64(data_out + vstep * 0);
+            
+#line 312
+            npyv_s64 b1 = npyv_loada_s64(data + vstep * 1);
+            npyv_s64 c1 = npyv_loada_s64(data_out + vstep * 1);
+            
+#line 312
+            npyv_s64 b2 = npyv_loada_s64(data + vstep * 2);
+            npyv_s64 c2 = npyv_loada_s64(data_out + vstep * 2);
+            
+#line 312
+            npyv_s64 b3 = npyv_loada_s64(data + vstep * 3);
+            npyv_s64 c3 = npyv_loada_s64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s64 abc0 = npyv_muladd_s64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s64 abc1 = npyv_muladd_s64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s64 abc2 = npyv_muladd_s64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s64 abc3 = npyv_muladd_s64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_s64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_s64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_s64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_s64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_s64 b0 = npyv_load_s64(data + vstep * 0);
+            npyv_s64 c0 = npyv_load_s64(data_out + vstep * 0);
+            
+#line 312
+            npyv_s64 b1 = npyv_load_s64(data + vstep * 1);
+            npyv_s64 c1 = npyv_load_s64(data_out + vstep * 1);
+            
+#line 312
+            npyv_s64 b2 = npyv_load_s64(data + vstep * 2);
+            npyv_s64 c2 = npyv_load_s64(data_out + vstep * 2);
+            
+#line 312
+            npyv_s64 b3 = npyv_load_s64(data + vstep * 3);
+            npyv_s64 c3 = npyv_load_s64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_s64 abc0 = npyv_muladd_s64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_s64 abc1 = npyv_muladd_s64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_s64 abc2 = npyv_muladd_s64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_s64 abc3 = npyv_muladd_s64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_s64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_s64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_s64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_s64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    /* remainder: partial loads/stores limited to the `count` elements left */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_s64 a = npyv_load_tillz_s64(data, count);
+        npyv_s64 b = npyv_load_tillz_s64(data_out, count);
+        npyv_store_till_s64(data_out, count, npyv_muladd_s64(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* scalar path, 4 elements per iteration: load all, compute all, store all */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_longlong b0 = (data[0]);
+        const npy_longlong c0 = (data_out[0]);
+        
+#line 340
+        const npy_longlong b1 = (data[1]);
+        const npy_longlong c1 = (data_out[1]);
+        
+#line 340
+        const npy_longlong b2 = (data[2]);
+        const npy_longlong c2 = (data_out[2]);
+        
+#line 340
+        const npy_longlong b3 = (data[3]);
+        const npy_longlong c3 = (data_out[3]);
+        
+        #line 346
+        const npy_longlong abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_longlong abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_longlong abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_longlong abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* remaining elements (all of them when the unrolled loop is disabled) */
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_longlong b = (*data);
+        const npy_longlong c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_longlong
+}
+
+/*
+ * Two-operand contiguous kernel for npy_longlong:
+ *
+ *     data_out[k] = data0[k] * data1[k] + data_out[k]
+ *
+ * for `count` consecutive elements, where data0 = dataptr[0],
+ * data1 = dataptr[1] and data_out = dataptr[2].  `nop` and `strides` are
+ * not read.
+ *
+ * The `#if 0` arm is a vectorized (npyv_*) variant that is disabled for
+ * this type.  The compiled arm is scalar code: a 4-way unrolled loop
+ * (omitted when NPY_DISABLE_OPTIMIZATION is defined) followed by a
+ * one-element-at-a-time loop for the remainder.
+ */
+static void
+longlong_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longlong *data0 = (npy_longlong *)dataptr[0];
+    npy_longlong *data1 = (npy_longlong *)dataptr[1];
+    npy_longlong *data_out = (npy_longlong *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_longlong
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_s64;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s64 a0 = npyv_loada_s64(data0 + vstep * 0);
+            npyv_s64 b0 = npyv_loada_s64(data1 + vstep * 0);
+            npyv_s64 c0 = npyv_loada_s64(data_out + vstep * 0);
+            
+#line 390
+            npyv_s64 a1 = npyv_loada_s64(data0 + vstep * 1);
+            npyv_s64 b1 = npyv_loada_s64(data1 + vstep * 1);
+            npyv_s64 c1 = npyv_loada_s64(data_out + vstep * 1);
+            
+#line 390
+            npyv_s64 a2 = npyv_loada_s64(data0 + vstep * 2);
+            npyv_s64 b2 = npyv_loada_s64(data1 + vstep * 2);
+            npyv_s64 c2 = npyv_loada_s64(data_out + vstep * 2);
+            
+#line 390
+            npyv_s64 a3 = npyv_loada_s64(data0 + vstep * 3);
+            npyv_s64 b3 = npyv_loada_s64(data1 + vstep * 3);
+            npyv_s64 c3 = npyv_loada_s64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s64 abc0 = npyv_muladd_s64(a0, b0, c0);
+            
+#line 397
+            npyv_s64 abc1 = npyv_muladd_s64(a1, b1, c1);
+            
+#line 397
+            npyv_s64 abc2 = npyv_muladd_s64(a2, b2, c2);
+            
+#line 397
+            npyv_s64 abc3 = npyv_muladd_s64(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_s64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_s64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_s64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_s64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_s64 a0 = npyv_load_s64(data0 + vstep * 0);
+            npyv_s64 b0 = npyv_load_s64(data1 + vstep * 0);
+            npyv_s64 c0 = npyv_load_s64(data_out + vstep * 0);
+            
+#line 390
+            npyv_s64 a1 = npyv_load_s64(data0 + vstep * 1);
+            npyv_s64 b1 = npyv_load_s64(data1 + vstep * 1);
+            npyv_s64 c1 = npyv_load_s64(data_out + vstep * 1);
+            
+#line 390
+            npyv_s64 a2 = npyv_load_s64(data0 + vstep * 2);
+            npyv_s64 b2 = npyv_load_s64(data1 + vstep * 2);
+            npyv_s64 c2 = npyv_load_s64(data_out + vstep * 2);
+            
+#line 390
+            npyv_s64 a3 = npyv_load_s64(data0 + vstep * 3);
+            npyv_s64 b3 = npyv_load_s64(data1 + vstep * 3);
+            npyv_s64 c3 = npyv_load_s64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_s64 abc0 = npyv_muladd_s64(a0, b0, c0);
+            
+#line 397
+            npyv_s64 abc1 = npyv_muladd_s64(a1, b1, c1);
+            
+#line 397
+            npyv_s64 abc2 = npyv_muladd_s64(a2, b2, c2);
+            
+#line 397
+            npyv_s64 abc3 = npyv_muladd_s64(a3, b3, c3);
+            
+            #line 402
+            npyv_store_s64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_s64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_s64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_s64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    /* remainder: partial loads/stores limited to the `count` elements left */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_s64 a = npyv_load_tillz_s64(data0, count);
+        npyv_s64 b = npyv_load_tillz_s64(data1, count);
+        npyv_s64 c = npyv_load_tillz_s64(data_out, count);
+        npyv_store_till_s64(data_out, count, npyv_muladd_s64(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* scalar path, 4 elements per iteration: load all, compute all, store all */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_longlong a0 = (data0[0]);
+        const npy_longlong b0 = (data1[0]);
+        const npy_longlong c0 = (data_out[0]);
+        
+#line 420
+        const npy_longlong a1 = (data0[1]);
+        const npy_longlong b1 = (data1[1]);
+        const npy_longlong c1 = (data_out[1]);
+        
+#line 420
+        const npy_longlong a2 = (data0[2]);
+        const npy_longlong b2 = (data1[2]);
+        const npy_longlong c2 = (data_out[2]);
+        
+#line 420
+        const npy_longlong a3 = (data0[3]);
+        const npy_longlong b3 = (data1[3]);
+        const npy_longlong c3 = (data_out[3]);
+        
+        #line 427
+        const npy_longlong abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_longlong abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_longlong abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_longlong abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* remaining elements (all of them when the unrolled loop is disabled) */
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_longlong a = (*data0);
+        const npy_longlong b = (*data1);
+        const npy_longlong c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_longlong
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand case in which the first operand is a single value: one
+ * npy_longlong is read from dataptr[0] and the work is forwarded to
+ * longlong_sum_of_products_muladd, with dataptr[1] as the input array and
+ * dataptr[2] as the array that is read and updated.  `nop` and `strides`
+ * are not read.
+ */
+static void
+longlong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longlong *const out = (npy_longlong *)dataptr[2];
+    npy_longlong *const vec = (npy_longlong *)dataptr[1];
+    /* fetched up front, before any output element is written */
+    const npy_longlong factor = *(npy_longlong *)dataptr[0];
+
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    longlong_sum_of_products_muladd(vec, out, factor, count);
+}
+
+/*
+ * Two-operand case in which the second operand is a single value: one
+ * npy_longlong is read from dataptr[1] and the work is forwarded to
+ * longlong_sum_of_products_muladd, with dataptr[0] as the input array and
+ * dataptr[2] as the array that is read and updated.  `nop` and `strides`
+ * are not read.
+ */
+static void
+longlong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longlong *const out = (npy_longlong *)dataptr[2];
+    npy_longlong *const vec = (npy_longlong *)dataptr[0];
+    /* fetched up front, before any output element is written */
+    const npy_longlong factor = *(npy_longlong *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    longlong_sum_of_products_muladd(vec, out, factor, count);
+}
+
+/*
+ * Two-operand contiguous reduction for npy_longlong.
+ *
+ * Sums data0[k] * data1[k] over `count` consecutive elements
+ * (data0 = dataptr[0], data1 = dataptr[1]) into a local accumulator and
+ * then adds that accumulator once to the element at dataptr[2].  `nop` and
+ * `strides` are not read.
+ *
+ * The `#if 0` arm is a vectorized (npyv_*) variant that is disabled for
+ * this type.  The compiled arm is scalar code: a 4-way unrolled loop
+ * (omitted when NPY_DISABLE_OPTIMIZATION is defined) followed by a
+ * one-element-at-a-time loop for the remainder.
+ */
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longlong *data0 = (npy_longlong *)dataptr[0];
+    npy_longlong *data1 = (npy_longlong *)dataptr[1];
+    npy_longlong accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_longlong
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_s64;
+    npyv_s64 v_accum = npyv_zero_s64();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s64 a0 = npyv_loada_s64(data0 + vstep * 0);
+            npyv_s64 b0 = npyv_loada_s64(data1 + vstep * 0);
+            
+#line 501
+            npyv_s64 a1 = npyv_loada_s64(data0 + vstep * 1);
+            npyv_s64 b1 = npyv_loada_s64(data1 + vstep * 1);
+            
+#line 501
+            npyv_s64 a2 = npyv_loada_s64(data0 + vstep * 2);
+            npyv_s64 b2 = npyv_loada_s64(data1 + vstep * 2);
+            
+#line 501
+            npyv_s64 a3 = npyv_loada_s64(data0 + vstep * 3);
+            npyv_s64 b3 = npyv_loada_s64(data1 + vstep * 3);
+            
+            /* chain the four multiply-adds through v_accum */
+            npyv_s64 ab3 = npyv_muladd_s64(a3, b3, v_accum);
+            npyv_s64 ab2 = npyv_muladd_s64(a2, b2, ab3);
+            npyv_s64 ab1 = npyv_muladd_s64(a1, b1, ab2);
+                   v_accum = npyv_muladd_s64(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_s64 a0 = npyv_load_s64(data0 + vstep * 0);
+            npyv_s64 b0 = npyv_load_s64(data1 + vstep * 0);
+            
+#line 501
+            npyv_s64 a1 = npyv_load_s64(data0 + vstep * 1);
+            npyv_s64 b1 = npyv_load_s64(data1 + vstep * 1);
+            
+#line 501
+            npyv_s64 a2 = npyv_load_s64(data0 + vstep * 2);
+            npyv_s64 b2 = npyv_load_s64(data1 + vstep * 2);
+            
+#line 501
+            npyv_s64 a3 = npyv_load_s64(data0 + vstep * 3);
+            npyv_s64 b3 = npyv_load_s64(data1 + vstep * 3);
+            
+            /* chain the four multiply-adds through v_accum */
+            npyv_s64 ab3 = npyv_muladd_s64(a3, b3, v_accum);
+            npyv_s64 ab2 = npyv_muladd_s64(a2, b2, ab3);
+            npyv_s64 ab1 = npyv_muladd_s64(a1, b1, ab2);
+                   v_accum = npyv_muladd_s64(a0, b0, ab1);
+        }
+    }
+    
+    /* remainder: partial loads limited to the `count` elements left */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_s64 a = npyv_load_tillz_s64(data0, count);
+        npyv_s64 b = npyv_load_tillz_s64(data1, count);
+        v_accum = npyv_muladd_s64(a, b, v_accum);
+    }
+    accum = npyv_sum_s64(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* scalar path: four products per iteration, added to accum together */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_longlong ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_longlong ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_longlong ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_longlong ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* remaining elements (all of them when the unrolled loop is disabled) */
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_longlong a = (*data0);
+        const npy_longlong b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_longlong
+    /* single read-modify-write of the output element */
+    *(npy_longlong *)dataptr[2] = ((*(npy_longlong *)dataptr[2]) + accum);
+}
+
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* out += scalar * sum(vec): operand 0 is read once, operand 1 supplies count items */
+    const npy_longlong scalar = *(npy_longlong *)dataptr[0];
+    const npy_longlong vec_sum = longlong_sum_of_arr((npy_longlong *)dataptr[1], count);
+    npy_longlong *out = (npy_longlong *)dataptr[2];
+    *out = *out + scalar * vec_sum;
+}
+
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* out += scalar * sum(vec): operand 0 supplies count items, operand 1 is read once */
+    const npy_longlong scalar = *(npy_longlong *)dataptr[1];
+    const npy_longlong vec_sum = longlong_sum_of_arr((npy_longlong *)dataptr[0], count);
+    npy_longlong *out = (npy_longlong *)dataptr[2];
+    *out = *out + scalar * vec_sum;
+}
+
+#elif 1000 == 3 && !0
+
+static void
+longlong_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* data_out[i] += data0[i] * data1[i] * data2[i] for each i below count. */
+    npy_longlong *data0 = (npy_longlong *)dataptr[0];
+    npy_longlong *data1 = (npy_longlong *)dataptr[1];
+    npy_longlong *data2 = (npy_longlong *)dataptr[2];
+    npy_longlong *data_out = (npy_longlong *)dataptr[3];
+    /* Not compiled in this expansion: the enclosing "#elif 1000 == 3 && !0" guard is false. */
+    /* Main loop: eight elements per iteration, written out explicitly */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: count is below 8 here; each step returns once count reaches 0 */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1000 > 3 || @complex */
+
+static void
+longlong_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+    /*
+     * Per element: multiply the nop inputs together, add the product to the
+     * output slot, then step every pointer forward by one item.
+     */
+    for (; count != 0; --count) {
+#if !0
+        npy_longlong prod = *(npy_longlong *)dataptr[0];
+        int k = 1;
+        while (k < nop) {
+            prod *= *(npy_longlong *)dataptr[k];
+            ++k;
+        }
+        /* k is left at the loop's exit value and selects the addend read back */
+        *(npy_longlong *)dataptr[nop] = prod + *(npy_longlong *)dataptr[k];
+        for (k = 0; k <= nop; ++k) {
+            dataptr[k] += sizeof(npy_longlong);
+        }
+#else /* complex */
+#  if 1000 <= 3
+#    define _SUMPROD_NOP 1000
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        /* running complex product, stored as separate real/imaginary parts */
+        npy_longlong acc_re = ((npy_longlong *)dataptr[0])[0];
+        npy_longlong acc_im = ((npy_longlong *)dataptr[0])[1];
+        int k;
+        for (k = 1; k < _SUMPROD_NOP; ++k) {
+            const npy_longlong *z = (npy_longlong *)dataptr[k];
+            const npy_longlong next_re = acc_re * z[0] - acc_im * z[1];
+            acc_im = acc_re * z[1] + acc_im * z[0];
+            acc_re = next_re;
+        }
+        ((npy_longlong *)dataptr[_SUMPROD_NOP])[0] += acc_re;
+        ((npy_longlong *)dataptr[_SUMPROD_NOP])[1] += acc_im;
+        for (k = 0; k <= _SUMPROD_NOP; ++k) {
+            dataptr[k] += sizeof(npy_longlong);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+
+static NPY_GCC_OPT_3 void
+longlong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Adds the sum of count contiguous input items to the single output item. */
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    const npy_longlong total = longlong_sum_of_arr((npy_longlong *)dataptr[0], count);
+    npy_longlong *out = (npy_longlong *)dataptr[1];
+    *out = total + *out;
+#else
+    npy_longlong *src = (npy_longlong *)dataptr[0], *dst = (npy_longlong *)dataptr[1];
+    npy_longlong sum_re = 0, sum_im = 0;
+#ifndef NPY_DISABLE_OPTIMIZATION
+    while (count > 4) { /* four (re, im) pairs per pass */
+        sum_re += (src[0] + src[2]) + (src[4] + src[6]);
+        sum_im += (src[1] + src[3]) + (src[5] + src[7]);
+        src += 4*2;
+        count -= 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    while (count > 0) {
+        sum_re += src[0];
+        sum_im += src[1];
+        src += 2;
+        --count;
+    }
+    dst[0] += sum_re;
+    dst[1] += sum_im;
+#endif // !0
+}
+
+#endif /* 1000 == 1 */
+
+static void
+longlong_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Sums the product of the nop inputs over count steps into one output element. */
+#if 0 /* complex accumulator pair; this arm is preprocessed out */
+    npy_longlong accum_re = 0, accum_im = 0;
+#else
+    npy_longlong accum = 0;
+#endif
+    /* The three guards below all test 1000 against 1, 2 or 3, so none of these locals is declared. */
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("longlong_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+    /* Compiled arm is the generic "#  else": multiply all nop inputs, add to accum, step by strides[i]. */
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        accum += (*(npy_longlong *)data0);
+        data0 += stride0;
+#  elif 1000 == 2
+        accum += (*(npy_longlong *)data0) *
+                 (*(npy_longlong *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1000 == 3
+        accum += (*(npy_longlong *)data0) *
+                 (*(npy_longlong *)data1) *
+                 (*(npy_longlong *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_longlong temp = (*(npy_longlong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longlong *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) { /* inputs only: dataptr[nop] is not advanced */
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        accum_re += ((npy_longlong *)data0)[0];
+        accum_im += ((npy_longlong *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_longlong re, im, tmp;
+        int i;
+        re = ((npy_longlong *)dataptr[0])[0];
+        im = ((npy_longlong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longlong *)dataptr[i])[0] -
+                  im * ((npy_longlong *)dataptr[i])[1];
+            im = re * ((npy_longlong *)dataptr[i])[1] +
+                 im * ((npy_longlong *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* Fold the total into the output; with these literals the dataptr[nop] store below is the one compiled. */
+#if 0
+#  if 1000 <= 3
+    ((npy_longlong *)dataptr[1000])[0] += accum_re;
+    ((npy_longlong *)dataptr[1000])[1] += accum_im;
+#  else
+    ((npy_longlong *)dataptr[nop])[0] += accum_re;
+    ((npy_longlong *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1000 <= 3
+    *((npy_longlong *)dataptr[1000]) = (accum +
+                                    (*((npy_longlong *)dataptr[1000])));
+#  else
+    *((npy_longlong *)dataptr[nop]) = (accum +
+                                    (*((npy_longlong *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+
+
+#line 74
+
+#if !0
+static NPY_GCC_OPT_3 npy_ubyte ubyte_sum_of_arr(npy_ubyte *data, npy_intp count)
+{   /* Returns the sum of data[0] .. data[count-1], accumulated in an npy_ubyte. */
+    npy_ubyte accum = 0;
+#if 0 // NPYV check for npy_ubyte: this vector path is preprocessed out (#if 0)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data);
+    const int vstep = npyv_nlanes_u8;
+    npyv_u8 v_accum = npyv_zero_u8();
+    const npy_intp vstepx4 = vstep * 4;
+
+    #line 91
+    if(is_aligned) {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_u8 a0 = npyv_loada_u8(data + vstep * 0);
+            
+#line 96
+            npyv_u8 a1 = npyv_loada_u8(data + vstep * 1);
+            
+#line 96
+            npyv_u8 a2 = npyv_loada_u8(data + vstep * 2);
+            
+#line 96
+            npyv_u8 a3 = npyv_loada_u8(data + vstep * 3);
+            
+            npyv_u8 a01   = npyv_add_u8(a0, a1);
+            npyv_u8 a23   = npyv_add_u8(a2, a3);
+            npyv_u8 a0123 = npyv_add_u8(a01, a23);
+                     v_accum = npyv_add_u8(a0123, v_accum);
+        }
+    }
+    
+#line 91
+    else {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_u8 a0 = npyv_load_u8(data + vstep * 0);
+            
+#line 96
+            npyv_u8 a1 = npyv_load_u8(data + vstep * 1);
+            
+#line 96
+            npyv_u8 a2 = npyv_load_u8(data + vstep * 2);
+            
+#line 96
+            npyv_u8 a3 = npyv_load_u8(data + vstep * 3);
+            
+            npyv_u8 a01   = npyv_add_u8(a0, a1);
+            npyv_u8 a23   = npyv_add_u8(a2, a3);
+            npyv_u8 a0123 = npyv_add_u8(a01, a23);
+                     v_accum = npyv_add_u8(a0123, v_accum);
+        }
+    }
+    /* Tail loop for whatever the 4x-unrolled loops above left over. */
+    for (; count > 0; count -= vstep, data += vstep) {
+        npyv_u8 a = npyv_load_tillz_u8(data, count);
+        v_accum = npyv_add_u8(a, v_accum);
+    }
+    accum = npyv_sum_u8(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION /* scalar path: four elements per iteration */
+    for (; count > 4; count -= 4, data += 4) { /* strict '>': a last group of exactly 4 goes to the loop below */
+        const npy_ubyte a01 = (*data) + (data[1]);
+        const npy_ubyte a23 = (data[2]) + (data[3]);
+        accum +=  a01 + a23;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data++) { /* remaining elements, one at a time */
+        accum += (*data);
+    }
+#endif // NPYV check for npy_ubyte
+    return accum;
+}
+#endif
+
+#line 131
+static void
+ubyte_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* For each of count steps: *out += *in, advancing both pointers by their byte strides. */
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data_out = dataptr[1];
+    npy_intp stride_out = strides[1];
+#endif
+    /* With these literals only data0/stride0 and data_out/stride_out (operand index 1) are declared. */
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_one (%d)\n", (int)count);
+    /* Compiled arm: "#if !0" with "#  if 1 == 1"; every other arm in the loop is preprocessed out. */
+    while (count--) {
+#if !0
+#  if 1 == 1
+        *(npy_ubyte *)data_out = ((*(npy_ubyte *)data0) +
+                                         (*(npy_ubyte *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1 == 2
+        *(npy_ubyte *)data_out = ((*(npy_ubyte *)data0) *
+                                         (*(npy_ubyte *)data1) +
+                                         (*(npy_ubyte *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1 == 3
+        *(npy_ubyte *)data_out = ((*(npy_ubyte *)data0) *
+                                         (*(npy_ubyte *)data1) *
+                                         (*(npy_ubyte *)data2) +
+                                         (*(npy_ubyte *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_ubyte temp = (*(npy_ubyte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ubyte *)dataptr[i]);
+        }
+        *(npy_ubyte *)dataptr[nop] = (temp +
+                                           (*(npy_ubyte *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        ((npy_ubyte *)data_out)[0] = ((npy_ubyte *)data0)[0] +
+                                         ((npy_ubyte *)data_out)[0];
+        ((npy_ubyte *)data_out)[1] = ((npy_ubyte *)data0)[1] +
+                                         ((npy_ubyte *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ubyte re, im, tmp;
+        int i;
+        re = ((npy_ubyte *)dataptr[0])[0];
+        im = ((npy_ubyte *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ubyte *)dataptr[i])[0] -
+                  im * ((npy_ubyte *)dataptr[i])[1];
+            im = re * ((npy_ubyte *)dataptr[i])[1] +
+                 im * ((npy_ubyte *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ubyte *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ubyte *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ubyte *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ubyte *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1 == 1
+
+static void
+ubyte_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* data_out[i] += data0[i] for each i below count, both indexed as plain npy_ubyte arrays. */
+    npy_ubyte *data0 = (npy_ubyte *)dataptr[0];
+    npy_ubyte *data_out = (npy_ubyte *)dataptr[1];
+    /* nop is not read in this function. */
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+    /* NOTE(review): a negative count matches no case and fails the loop test, so the goto would spin forever; presumably callers never pass count < 0 - confirm. */
+/* This is placed before the main loop to make small counts faster; counts of 8 or more match no case and skip the switch */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_ubyte *)data_out + 2*6)[0] =
+                                    ((npy_ubyte *)data0 + 2*6)[0] +
+                                    ((npy_ubyte *)data_out + 2*6)[0];
+            ((npy_ubyte *)data_out + 2*6)[1] =
+                                    ((npy_ubyte *)data0 + 2*6)[1] +
+                                    ((npy_ubyte *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_ubyte *)data_out + 2*5)[0] =
+                                    ((npy_ubyte *)data0 + 2*5)[0] +
+                                    ((npy_ubyte *)data_out + 2*5)[0];
+            ((npy_ubyte *)data_out + 2*5)[1] =
+                                    ((npy_ubyte *)data0 + 2*5)[1] +
+                                    ((npy_ubyte *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_ubyte *)data_out + 2*4)[0] =
+                                    ((npy_ubyte *)data0 + 2*4)[0] +
+                                    ((npy_ubyte *)data_out + 2*4)[0];
+            ((npy_ubyte *)data_out + 2*4)[1] =
+                                    ((npy_ubyte *)data0 + 2*4)[1] +
+                                    ((npy_ubyte *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_ubyte *)data_out + 2*3)[0] =
+                                    ((npy_ubyte *)data0 + 2*3)[0] +
+                                    ((npy_ubyte *)data_out + 2*3)[0];
+            ((npy_ubyte *)data_out + 2*3)[1] =
+                                    ((npy_ubyte *)data0 + 2*3)[1] +
+                                    ((npy_ubyte *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_ubyte *)data_out + 2*2)[0] =
+                                    ((npy_ubyte *)data0 + 2*2)[0] +
+                                    ((npy_ubyte *)data_out + 2*2)[0];
+            ((npy_ubyte *)data_out + 2*2)[1] =
+                                    ((npy_ubyte *)data0 + 2*2)[1] +
+                                    ((npy_ubyte *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_ubyte *)data_out + 2*1)[0] =
+                                    ((npy_ubyte *)data0 + 2*1)[0] +
+                                    ((npy_ubyte *)data_out + 2*1)[0];
+            ((npy_ubyte *)data_out + 2*1)[1] =
+                                    ((npy_ubyte *)data0 + 2*1)[1] +
+                                    ((npy_ubyte *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_ubyte *)data_out + 2*0)[0] =
+                                    ((npy_ubyte *)data0 + 2*0)[0] +
+                                    ((npy_ubyte *)data_out + 2*0)[0];
+            ((npy_ubyte *)data_out + 2*0)[1] =
+                                    ((npy_ubyte *)data0 + 2*0)[1] +
+                                    ((npy_ubyte *)data_out + 2*0)[1];
+#endif
+        /* No case above has a break: entering at count == k updates elements k-1 down to 0. */
+        case 0:
+            return;
+    }
+    /* Reached only when no case matched, i.e. count is outside 0..7. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_ubyte *)data_out + 2*0)[0] =
+                                ((npy_ubyte *)data0 + 2*0)[0] +
+                                ((npy_ubyte *)data_out + 2*0)[0];
+        ((npy_ubyte *)data_out + 2*0)[1] =
+                                ((npy_ubyte *)data0 + 2*0)[1] +
+                                ((npy_ubyte *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_ubyte *)data_out + 2*1)[0] =
+                                ((npy_ubyte *)data0 + 2*1)[0] +
+                                ((npy_ubyte *)data_out + 2*1)[0];
+        ((npy_ubyte *)data_out + 2*1)[1] =
+                                ((npy_ubyte *)data0 + 2*1)[1] +
+                                ((npy_ubyte *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_ubyte *)data_out + 2*2)[0] =
+                                ((npy_ubyte *)data0 + 2*2)[0] +
+                                ((npy_ubyte *)data_out + 2*2)[0];
+        ((npy_ubyte *)data_out + 2*2)[1] =
+                                ((npy_ubyte *)data0 + 2*2)[1] +
+                                ((npy_ubyte *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_ubyte *)data_out + 2*3)[0] =
+                                ((npy_ubyte *)data0 + 2*3)[0] +
+                                ((npy_ubyte *)data_out + 2*3)[0];
+        ((npy_ubyte *)data_out + 2*3)[1] =
+                                ((npy_ubyte *)data0 + 2*3)[1] +
+                                ((npy_ubyte *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_ubyte *)data_out + 2*4)[0] =
+                                ((npy_ubyte *)data0 + 2*4)[0] +
+                                ((npy_ubyte *)data_out + 2*4)[0];
+        ((npy_ubyte *)data_out + 2*4)[1] =
+                                ((npy_ubyte *)data0 + 2*4)[1] +
+                                ((npy_ubyte *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_ubyte *)data_out + 2*5)[0] =
+                                ((npy_ubyte *)data0 + 2*5)[0] +
+                                ((npy_ubyte *)data_out + 2*5)[0];
+        ((npy_ubyte *)data_out + 2*5)[1] =
+                                ((npy_ubyte *)data0 + 2*5)[1] +
+                                ((npy_ubyte *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_ubyte *)data_out + 2*6)[0] =
+                                ((npy_ubyte *)data0 + 2*6)[0] +
+                                ((npy_ubyte *)data_out + 2*6)[0];
+        ((npy_ubyte *)data_out + 2*6)[1] =
+                                ((npy_ubyte *)data0 + 2*6)[1] +
+                                ((npy_ubyte *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_ubyte *)data_out + 2*7)[0] =
+                                ((npy_ubyte *)data0 + 2*7)[0] +
+                                ((npy_ubyte *)data_out + 2*7)[0];
+        ((npy_ubyte *)data_out + 2*7)[1] =
+                                ((npy_ubyte *)data0 + 2*7)[1] +
+                                ((npy_ubyte *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: jump back to the switch, which handles the count < 8 left over and returns */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1 == 2 && !0
+
+// Multiply-add helper: for each of count elements, data_out[i] = data[i] * scalar + data_out[i]
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_muladd(npy_ubyte *data, npy_ubyte *data_out, npy_ubyte scalar, npy_intp count)
+{   /* In place: data_out[i] = scalar * data[i] + data_out[i] for each i below count. */
+#if 0 // NPYV check for npy_ubyte: this vector path is preprocessed out (#if 0)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u8;
+    const npyv_u8 v_scalar = npyv_setall_u8(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u8 b0 = npyv_loada_u8(data + vstep * 0);
+            npyv_u8 c0 = npyv_loada_u8(data_out + vstep * 0);
+            
+#line 312
+            npyv_u8 b1 = npyv_loada_u8(data + vstep * 1);
+            npyv_u8 c1 = npyv_loada_u8(data_out + vstep * 1);
+            
+#line 312
+            npyv_u8 b2 = npyv_loada_u8(data + vstep * 2);
+            npyv_u8 c2 = npyv_loada_u8(data_out + vstep * 2);
+            
+#line 312
+            npyv_u8 b3 = npyv_loada_u8(data + vstep * 3);
+            npyv_u8 c3 = npyv_loada_u8(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u8 abc0 = npyv_muladd_u8(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u8 abc1 = npyv_muladd_u8(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u8 abc2 = npyv_muladd_u8(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u8 abc3 = npyv_muladd_u8(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_u8(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_u8(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_u8(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_u8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u8 b0 = npyv_load_u8(data + vstep * 0);
+            npyv_u8 c0 = npyv_load_u8(data_out + vstep * 0);
+            
+#line 312
+            npyv_u8 b1 = npyv_load_u8(data + vstep * 1);
+            npyv_u8 c1 = npyv_load_u8(data_out + vstep * 1);
+            
+#line 312
+            npyv_u8 b2 = npyv_load_u8(data + vstep * 2);
+            npyv_u8 c2 = npyv_load_u8(data_out + vstep * 2);
+            
+#line 312
+            npyv_u8 b3 = npyv_load_u8(data + vstep * 3);
+            npyv_u8 c3 = npyv_load_u8(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u8 abc0 = npyv_muladd_u8(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u8 abc1 = npyv_muladd_u8(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u8 abc2 = npyv_muladd_u8(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u8 abc3 = npyv_muladd_u8(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_u8(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_u8(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_u8(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_u8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Tail loop for whatever the 4x-unrolled loops above left over. */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_u8 a = npyv_load_tillz_u8(data, count);
+        npyv_u8 b = npyv_load_tillz_u8(data_out, count);
+        npyv_store_till_u8(data_out, count, npyv_muladd_u8(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION /* scalar path: load 4, compute 4, store 4 per iteration */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_ubyte b0 = (data[0]);
+        const npy_ubyte c0 = (data_out[0]);
+        
+#line 340
+        const npy_ubyte b1 = (data[1]);
+        const npy_ubyte c1 = (data_out[1]);
+        
+#line 340
+        const npy_ubyte b2 = (data[2]);
+        const npy_ubyte c2 = (data_out[2]);
+        
+#line 340
+        const npy_ubyte b3 = (data[3]);
+        const npy_ubyte c3 = (data_out[3]);
+        
+        #line 346
+        const npy_ubyte abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_ubyte abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_ubyte abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_ubyte abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* remaining elements, one at a time */
+        const npy_ubyte b = (*data);
+        const npy_ubyte c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_ubyte
+}
+
+static void
+ubyte_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte *data0 = (npy_ubyte *)dataptr[0];
+    npy_ubyte *data1 = (npy_ubyte *)dataptr[1];
+    npy_ubyte *data_out = (npy_ubyte *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // data_out[i] += data0[i] * data1[i] for count items. NPYV check for npy_ubyte:
+#if 0  /* constant 0: the vector branch below is never compiled, the #else branch is used */
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u8;
+    /* Main loop consumes 4 vectors (vstepx4 items) per pass; this copy uses aligned load/store. */
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u8 a0 = npyv_loada_u8(data0 + vstep * 0);
+            npyv_u8 b0 = npyv_loada_u8(data1 + vstep * 0);
+            npyv_u8 c0 = npyv_loada_u8(data_out + vstep * 0);
+            
+#line 390
+            npyv_u8 a1 = npyv_loada_u8(data0 + vstep * 1);
+            npyv_u8 b1 = npyv_loada_u8(data1 + vstep * 1);
+            npyv_u8 c1 = npyv_loada_u8(data_out + vstep * 1);
+            
+#line 390
+            npyv_u8 a2 = npyv_loada_u8(data0 + vstep * 2);
+            npyv_u8 b2 = npyv_loada_u8(data1 + vstep * 2);
+            npyv_u8 c2 = npyv_loada_u8(data_out + vstep * 2);
+            
+#line 390
+            npyv_u8 a3 = npyv_loada_u8(data0 + vstep * 3);
+            npyv_u8 b3 = npyv_loada_u8(data1 + vstep * 3);
+            npyv_u8 c3 = npyv_loada_u8(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u8 abc0 = npyv_muladd_u8(a0, b0, c0);
+            
+#line 397
+            npyv_u8 abc1 = npyv_muladd_u8(a1, b1, c1);
+            
+#line 397
+            npyv_u8 abc2 = npyv_muladd_u8(a2, b2, c2);
+            
+#line 397
+            npyv_u8 abc3 = npyv_muladd_u8(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_u8(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_u8(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_u8(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_u8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Identical loop, but with the unaligned npyv_load_u8 / npyv_store_u8 calls. */
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u8 a0 = npyv_load_u8(data0 + vstep * 0);
+            npyv_u8 b0 = npyv_load_u8(data1 + vstep * 0);
+            npyv_u8 c0 = npyv_load_u8(data_out + vstep * 0);
+            
+#line 390
+            npyv_u8 a1 = npyv_load_u8(data0 + vstep * 1);
+            npyv_u8 b1 = npyv_load_u8(data1 + vstep * 1);
+            npyv_u8 c1 = npyv_load_u8(data_out + vstep * 1);
+            
+#line 390
+            npyv_u8 a2 = npyv_load_u8(data0 + vstep * 2);
+            npyv_u8 b2 = npyv_load_u8(data1 + vstep * 2);
+            npyv_u8 c2 = npyv_load_u8(data_out + vstep * 2);
+            
+#line 390
+            npyv_u8 a3 = npyv_load_u8(data0 + vstep * 3);
+            npyv_u8 b3 = npyv_load_u8(data1 + vstep * 3);
+            npyv_u8 c3 = npyv_load_u8(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u8 abc0 = npyv_muladd_u8(a0, b0, c0);
+            
+#line 397
+            npyv_u8 abc1 = npyv_muladd_u8(a1, b1, c1);
+            
+#line 397
+            npyv_u8 abc2 = npyv_muladd_u8(a2, b2, c2);
+            
+#line 397
+            npyv_u8 abc3 = npyv_muladd_u8(a3, b3, c3);
+            
+            #line 402
+            npyv_store_u8(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_u8(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_u8(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_u8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Leftover items, one vector step at a time; the *_till* helpers are passed the remaining count. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_u8 a = npyv_load_tillz_u8(data0, count);
+        npyv_u8 b = npyv_load_tillz_u8(data1, count);
+        npyv_u8 c = npyv_load_tillz_u8(data_out, count);
+        npyv_store_till_u8(data_out, count, npyv_muladd_u8(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION  /* scalar loop unrolled by hand, 4 items per pass */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_ubyte a0 = (data0[0]);
+        const npy_ubyte b0 = (data1[0]);
+        const npy_ubyte c0 = (data_out[0]);
+        
+#line 420
+        const npy_ubyte a1 = (data0[1]);
+        const npy_ubyte b1 = (data1[1]);
+        const npy_ubyte c1 = (data_out[1]);
+        
+#line 420
+        const npy_ubyte a2 = (data0[2]);
+        const npy_ubyte b2 = (data1[2]);
+        const npy_ubyte c2 = (data_out[2]);
+        
+#line 420
+        const npy_ubyte a3 = (data0[3]);
+        const npy_ubyte b3 = (data1[3]);
+        const npy_ubyte c3 = (data_out[3]);
+        /* All four inputs are read before any output is written; results are stored as npy_ubyte. */
+        #line 427
+        const npy_ubyte abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_ubyte abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_ubyte abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_ubyte abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {  /* remaining items, one per pass */
+        const npy_ubyte a = (*data0);
+        const npy_ubyte b = (*data1);
+        const npy_ubyte c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_ubyte
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+ubyte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 0 is read once as a scalar; operand 1 and the output are arrays. */
+    const npy_ubyte scalar0 = *(npy_ubyte *)dataptr[0];
+    npy_ubyte *const src = (npy_ubyte *)dataptr[1];
+    npy_ubyte *const dst = (npy_ubyte *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    /* Hand the multiply-add over count items to the shared helper. */
+    ubyte_sum_of_products_muladd(src, dst, scalar0, count);
+}
+
+static void
+ubyte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 1 is read once as a scalar; operand 0 and the output are arrays. */
+    const npy_ubyte scalar1 = *(npy_ubyte *)dataptr[1];
+    npy_ubyte *const src = (npy_ubyte *)dataptr[0];
+    npy_ubyte *const dst = (npy_ubyte *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    ubyte_sum_of_products_muladd(src, dst, scalar1, count);
+}
+
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte *data0 = (npy_ubyte *)dataptr[0];
+    npy_ubyte *data1 = (npy_ubyte *)dataptr[1];
+    npy_ubyte accum = 0;
+    /* accum gathers data0[i] * data1[i] over count items; it is added to the one output item at the end. */
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_ubyte
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_u8;
+    npyv_u8 v_accum = npyv_zero_u8();
+    /* (compiled out by the constant 0 above) 4 vectors per pass, chained through v_accum. */
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u8 a0 = npyv_loada_u8(data0 + vstep * 0);
+            npyv_u8 b0 = npyv_loada_u8(data1 + vstep * 0);
+            
+#line 501
+            npyv_u8 a1 = npyv_loada_u8(data0 + vstep * 1);
+            npyv_u8 b1 = npyv_loada_u8(data1 + vstep * 1);
+            
+#line 501
+            npyv_u8 a2 = npyv_loada_u8(data0 + vstep * 2);
+            npyv_u8 b2 = npyv_loada_u8(data1 + vstep * 2);
+            
+#line 501
+            npyv_u8 a3 = npyv_loada_u8(data0 + vstep * 3);
+            npyv_u8 b3 = npyv_loada_u8(data1 + vstep * 3);
+            
+            npyv_u8 ab3 = npyv_muladd_u8(a3, b3, v_accum);
+            npyv_u8 ab2 = npyv_muladd_u8(a2, b2, ab3);
+            npyv_u8 ab1 = npyv_muladd_u8(a1, b1, ab2);
+                   v_accum = npyv_muladd_u8(a0, b0, ab1);
+        }
+    }
+    /* Same accumulation with unaligned loads. */
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u8 a0 = npyv_load_u8(data0 + vstep * 0);
+            npyv_u8 b0 = npyv_load_u8(data1 + vstep * 0);
+            
+#line 501
+            npyv_u8 a1 = npyv_load_u8(data0 + vstep * 1);
+            npyv_u8 b1 = npyv_load_u8(data1 + vstep * 1);
+            
+#line 501
+            npyv_u8 a2 = npyv_load_u8(data0 + vstep * 2);
+            npyv_u8 b2 = npyv_load_u8(data1 + vstep * 2);
+            
+#line 501
+            npyv_u8 a3 = npyv_load_u8(data0 + vstep * 3);
+            npyv_u8 b3 = npyv_load_u8(data1 + vstep * 3);
+            
+            npyv_u8 ab3 = npyv_muladd_u8(a3, b3, v_accum);
+            npyv_u8 ab2 = npyv_muladd_u8(a2, b2, ab3);
+            npyv_u8 ab1 = npyv_muladd_u8(a1, b1, ab2);
+                   v_accum = npyv_muladd_u8(a0, b0, ab1);
+        }
+    }
+    /* Leftover items via npyv_load_tillz_u8, then v_accum is reduced to a scalar with npyv_sum_u8. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_u8 a = npyv_load_tillz_u8(data0, count);
+        npyv_u8 b = npyv_load_tillz_u8(data1, count);
+        v_accum = npyv_muladd_u8(a, b, v_accum);
+    }
+    accum = npyv_sum_u8(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION  /* scalar loop unrolled by hand, 4 products per pass */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_ubyte ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_ubyte ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_ubyte ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_ubyte ab3 = (data0[3]) * (data1[3]);
+        /* each product is stored as npy_ubyte before being summed into accum */
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) {  /* remaining items, one per pass */
+        const npy_ubyte a = (*data0);
+        const npy_ubyte b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_ubyte
+    *(npy_ubyte *)dataptr[2] = ((*(npy_ubyte *)dataptr[2]) + accum);
+}
+
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* out += scalar0 * sum(operand 1); operand 0 is read once as a scalar. */
+    npy_ubyte *const vec = (npy_ubyte *)dataptr[1];
+    const npy_ubyte scalar0 = *(npy_ubyte *)dataptr[0];
+    const npy_ubyte total = ubyte_sum_of_arr(vec, count);
+    *(npy_ubyte *)dataptr[2] = *(npy_ubyte *)dataptr[2] + scalar0 * total;
+}
+
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* out += scalar1 * sum(operand 0); operand 1 is read once as a scalar. */
+    npy_ubyte *const vec = (npy_ubyte *)dataptr[0];
+    const npy_ubyte scalar1 = *(npy_ubyte *)dataptr[1];
+    const npy_ubyte total = ubyte_sum_of_arr(vec, count);
+    *(npy_ubyte *)dataptr[2] = *(npy_ubyte *)dataptr[2] + scalar1 * total;
+}
+
+#elif 1 == 3 && !0
+
+static void
+ubyte_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte *data0 = (npy_ubyte *)dataptr[0];
+    npy_ubyte *data1 = (npy_ubyte *)dataptr[1];
+    npy_ubyte *data2 = (npy_ubyte *)dataptr[2];
+    npy_ubyte *data_out = (npy_ubyte *)dataptr[3];
+    /* data_out[i] += data0[i] * data1[i] * data2[i] for count contiguous items. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    /* Each step below returns as soon as count is exhausted, otherwise handles one more item. */
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+    /* NOTE(review): count < 8 after the loop above, so for non-negative count this last step looks unreachable. */
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1 > 3 || @complex */
+
+static void
+ubyte_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+    /* Per item: multiply one value from each of the nop operands, add it to the output, advance all pointers. */
+    while (count--) {
+#if !0  /* always true: this branch is the one compiled */
+        npy_ubyte temp = (*(npy_ubyte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ubyte *)dataptr[i]);
+        }
+        *(npy_ubyte *)dataptr[nop] = (temp +
+                                           (*(npy_ubyte *)dataptr[i]));  /* loop left i >= nop; i == nop when nop >= 1 */
+        for (i = 0; i <= nop; ++i) {  /* i <= nop: the output pointer is advanced too */
+            dataptr[i] += sizeof(npy_ubyte);  /* updates the caller's dataptr array in place */
+        }
+#else /* complex */
+#  if 1 <= 3
+#    define _SUMPROD_NOP 1
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_ubyte re, im, tmp;
+        int i;
+        re = ((npy_ubyte *)dataptr[0])[0];
+        im = ((npy_ubyte *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ubyte *)dataptr[i])[0] -
+                  im * ((npy_ubyte *)dataptr[i])[1];
+            im = re * ((npy_ubyte *)dataptr[i])[1] +
+                 im * ((npy_ubyte *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ubyte *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ubyte *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ubyte *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ubyte *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_ubyte);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1 */
+
+#if 1 == 1
+
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0  /* always true: sum operand 0 with ubyte_sum_of_arr and add it to the single output item */
+    npy_ubyte *data = (npy_ubyte *)dataptr[0];
+    npy_ubyte accum = ubyte_sum_of_arr(data, count);
+    *((npy_ubyte *)dataptr[1]) = (accum + (*((npy_ubyte *)dataptr[1])));
+#else  /* never compiled here: variant that sums interleaved (re, im) pairs */
+    npy_ubyte accum_re = 0, accum_im = 0;
+    npy_ubyte *data0 = (npy_ubyte *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {  /* '>' (not '>='): exactly 4 remaining pairs go to the loop below */
+        const npy_ubyte re01 = data0[0] + data0[2];
+        const npy_ubyte re23 = data0[4] + data0[6];
+        const npy_ubyte im13 = data0[1] + data0[3];
+        const npy_ubyte im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_ubyte *)dataptr[1])[0] += accum_re;
+    ((npy_ubyte *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1 == 1 */
+
+static void
+ubyte_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_ubyte accum_re = 0, accum_im = 0;
+#else
+    npy_ubyte accum = 0;
+#endif
+    /* Sums count items of operand 0, stepping by strides[0] bytes, then adds the total to *dataptr[1]. */
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+    /* All conditions here compare constants; only the "1 == 1" / "!0" branches are compiled. */
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 1 == 1
+        accum += (*(npy_ubyte *)data0);
+        data0 += stride0;  /* data0 is char *, so the stride is applied in bytes */
+#  elif 1 == 2
+        accum += (*(npy_ubyte *)data0) *
+                 (*(npy_ubyte *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1 == 3
+        accum += (*(npy_ubyte *)data0) *
+                 (*(npy_ubyte *)data1) *
+                 (*(npy_ubyte *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_ubyte temp = (*(npy_ubyte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ubyte *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        accum_re += ((npy_ubyte *)data0)[0];
+        accum_im += ((npy_ubyte *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ubyte re, im, tmp;
+        int i;
+        re = ((npy_ubyte *)dataptr[0])[0];
+        im = ((npy_ubyte *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ubyte *)dataptr[i])[0] -
+                  im * ((npy_ubyte *)dataptr[i])[1];
+            im = re * ((npy_ubyte *)dataptr[i])[1] +
+                 im * ((npy_ubyte *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* Write-back: the compiled branch below is the "#else" / "1 <= 3" one, targeting dataptr[1]. */
+#if 0
+#  if 1 <= 3
+    ((npy_ubyte *)dataptr[1])[0] += accum_re;
+    ((npy_ubyte *)dataptr[1])[1] += accum_im;
+#  else
+    ((npy_ubyte *)dataptr[nop])[0] += accum_re;
+    ((npy_ubyte *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1 <= 3
+    *((npy_ubyte *)dataptr[1]) = (accum +
+                                    (*((npy_ubyte *)dataptr[1])));
+#  else
+    *((npy_ubyte *)dataptr[nop]) = (accum +
+                                    (*((npy_ubyte *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+ubyte_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data_out = dataptr[2];
+    npy_intp stride_out = strides[2];
+#endif
+    /* Strided two-operand kernel: *out = (*in0) * (*in1) + *out, each pointer stepping by its own byte stride. */
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_two (%d)\n", (int)count);
+    /* Only the "2 == 2" branch of the loop body is compiled; the others compare constants that do not match. */
+    while (count--) {
+#if !0
+#  if 2 == 1
+        *(npy_ubyte *)data_out = ((*(npy_ubyte *)data0) +
+                                         (*(npy_ubyte *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 2 == 2
+        *(npy_ubyte *)data_out = ((*(npy_ubyte *)data0) *
+                                         (*(npy_ubyte *)data1) +
+                                         (*(npy_ubyte *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 2 == 3
+        *(npy_ubyte *)data_out = ((*(npy_ubyte *)data0) *
+                                         (*(npy_ubyte *)data1) *
+                                         (*(npy_ubyte *)data2) +
+                                         (*(npy_ubyte *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_ubyte temp = (*(npy_ubyte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ubyte *)dataptr[i]);
+        }
+        *(npy_ubyte *)dataptr[nop] = (temp +
+                                           (*(npy_ubyte *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        ((npy_ubyte *)data_out)[0] = ((npy_ubyte *)data0)[0] +
+                                         ((npy_ubyte *)data_out)[0];
+        ((npy_ubyte *)data_out)[1] = ((npy_ubyte *)data0)[1] +
+                                         ((npy_ubyte *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ubyte re, im, tmp;
+        int i;
+        re = ((npy_ubyte *)dataptr[0])[0];
+        im = ((npy_ubyte *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ubyte *)dataptr[i])[0] -
+                  im * ((npy_ubyte *)dataptr[i])[1];
+            im = re * ((npy_ubyte *)dataptr[i])[1] +
+                 im * ((npy_ubyte *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ubyte *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ubyte *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ubyte *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ubyte *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 2 == 1
+
+static void
+ubyte_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte *data0 = (npy_ubyte *)dataptr[0];
+    npy_ubyte *data_out = (npy_ubyte *)dataptr[1];
+    /* data_out[i] += data0[i] for count contiguous items (the "#if !0" branches are the compiled ones). */
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {  /* cases 7..1 fall through on purpose, ending at "case 0: return" */
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_ubyte *)data_out + 2*6)[0] =
+                                    ((npy_ubyte *)data0 + 2*6)[0] +
+                                    ((npy_ubyte *)data_out + 2*6)[0];
+            ((npy_ubyte *)data_out + 2*6)[1] =
+                                    ((npy_ubyte *)data0 + 2*6)[1] +
+                                    ((npy_ubyte *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_ubyte *)data_out + 2*5)[0] =
+                                    ((npy_ubyte *)data0 + 2*5)[0] +
+                                    ((npy_ubyte *)data_out + 2*5)[0];
+            ((npy_ubyte *)data_out + 2*5)[1] =
+                                    ((npy_ubyte *)data0 + 2*5)[1] +
+                                    ((npy_ubyte *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_ubyte *)data_out + 2*4)[0] =
+                                    ((npy_ubyte *)data0 + 2*4)[0] +
+                                    ((npy_ubyte *)data_out + 2*4)[0];
+            ((npy_ubyte *)data_out + 2*4)[1] =
+                                    ((npy_ubyte *)data0 + 2*4)[1] +
+                                    ((npy_ubyte *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_ubyte *)data_out + 2*3)[0] =
+                                    ((npy_ubyte *)data0 + 2*3)[0] +
+                                    ((npy_ubyte *)data_out + 2*3)[0];
+            ((npy_ubyte *)data_out + 2*3)[1] =
+                                    ((npy_ubyte *)data0 + 2*3)[1] +
+                                    ((npy_ubyte *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_ubyte *)data_out + 2*2)[0] =
+                                    ((npy_ubyte *)data0 + 2*2)[0] +
+                                    ((npy_ubyte *)data_out + 2*2)[0];
+            ((npy_ubyte *)data_out + 2*2)[1] =
+                                    ((npy_ubyte *)data0 + 2*2)[1] +
+                                    ((npy_ubyte *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_ubyte *)data_out + 2*1)[0] =
+                                    ((npy_ubyte *)data0 + 2*1)[0] +
+                                    ((npy_ubyte *)data_out + 2*1)[0];
+            ((npy_ubyte *)data_out + 2*1)[1] =
+                                    ((npy_ubyte *)data0 + 2*1)[1] +
+                                    ((npy_ubyte *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_ubyte *)data_out + 2*0)[0] =
+                                    ((npy_ubyte *)data0 + 2*0)[0] +
+                                    ((npy_ubyte *)data_out + 2*0)[0];
+            ((npy_ubyte *)data_out + 2*0)[1] =
+                                    ((npy_ubyte *)data0 + 2*0)[1] +
+                                    ((npy_ubyte *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+    /* Reached only when count is outside 0..7: the switch has no matching case and no default. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_ubyte *)data_out + 2*0)[0] =
+                                ((npy_ubyte *)data0 + 2*0)[0] +
+                                ((npy_ubyte *)data_out + 2*0)[0];
+        ((npy_ubyte *)data_out + 2*0)[1] =
+                                ((npy_ubyte *)data0 + 2*0)[1] +
+                                ((npy_ubyte *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_ubyte *)data_out + 2*1)[0] =
+                                ((npy_ubyte *)data0 + 2*1)[0] +
+                                ((npy_ubyte *)data_out + 2*1)[0];
+        ((npy_ubyte *)data_out + 2*1)[1] =
+                                ((npy_ubyte *)data0 + 2*1)[1] +
+                                ((npy_ubyte *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_ubyte *)data_out + 2*2)[0] =
+                                ((npy_ubyte *)data0 + 2*2)[0] +
+                                ((npy_ubyte *)data_out + 2*2)[0];
+        ((npy_ubyte *)data_out + 2*2)[1] =
+                                ((npy_ubyte *)data0 + 2*2)[1] +
+                                ((npy_ubyte *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_ubyte *)data_out + 2*3)[0] =
+                                ((npy_ubyte *)data0 + 2*3)[0] +
+                                ((npy_ubyte *)data_out + 2*3)[0];
+        ((npy_ubyte *)data_out + 2*3)[1] =
+                                ((npy_ubyte *)data0 + 2*3)[1] +
+                                ((npy_ubyte *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_ubyte *)data_out + 2*4)[0] =
+                                ((npy_ubyte *)data0 + 2*4)[0] +
+                                ((npy_ubyte *)data_out + 2*4)[0];
+        ((npy_ubyte *)data_out + 2*4)[1] =
+                                ((npy_ubyte *)data0 + 2*4)[1] +
+                                ((npy_ubyte *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_ubyte *)data_out + 2*5)[0] =
+                                ((npy_ubyte *)data0 + 2*5)[0] +
+                                ((npy_ubyte *)data_out + 2*5)[0];
+        ((npy_ubyte *)data_out + 2*5)[1] =
+                                ((npy_ubyte *)data0 + 2*5)[1] +
+                                ((npy_ubyte *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_ubyte *)data_out + 2*6)[0] =
+                                ((npy_ubyte *)data0 + 2*6)[0] +
+                                ((npy_ubyte *)data_out + 2*6)[0];
+        ((npy_ubyte *)data_out + 2*6)[1] =
+                                ((npy_ubyte *)data0 + 2*6)[1] +
+                                ((npy_ubyte *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_ubyte *)data_out + 2*7)[0] =
+                                ((npy_ubyte *)data0 + 2*7)[0] +
+                                ((npy_ubyte *)data_out + 2*7)[0];
+        ((npy_ubyte *)data_out + 2*7)[1] =
+                                ((npy_ubyte *)data0 + 2*7)[1] +
+                                ((npy_ubyte *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+    /* Fewer than 8 items are left; jump back to the switch, which handles them and returns. */
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 2 == 2 && !0
+
+/*
+ * Multiply-accumulate helper: for each i in [0, count),
+ *     data_out[i] = scalar * data[i] + data_out[i]
+ *
+ * data     - input elements, read by index
+ * data_out - accumulator elements, read and then overwritten in place
+ * scalar   - factor applied to every element of data
+ * count    - number of elements to process
+ *
+ * The NPYV (SIMD) branch below is guarded by a literal `#if 0`, so for this
+ * npy_ubyte instantiation only the scalar `#else` branch is compiled.
+ */
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_muladd(npy_ubyte *data, npy_ubyte *data_out, npy_ubyte scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_ubyte
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u8;
+    const npyv_u8 v_scalar = npyv_setall_u8(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u8 b0 = npyv_loada_u8(data + vstep * 0);
+            npyv_u8 c0 = npyv_loada_u8(data_out + vstep * 0);
+            
+#line 312
+            npyv_u8 b1 = npyv_loada_u8(data + vstep * 1);
+            npyv_u8 c1 = npyv_loada_u8(data_out + vstep * 1);
+            
+#line 312
+            npyv_u8 b2 = npyv_loada_u8(data + vstep * 2);
+            npyv_u8 c2 = npyv_loada_u8(data_out + vstep * 2);
+            
+#line 312
+            npyv_u8 b3 = npyv_loada_u8(data + vstep * 3);
+            npyv_u8 c3 = npyv_loada_u8(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u8 abc0 = npyv_muladd_u8(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u8 abc1 = npyv_muladd_u8(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u8 abc2 = npyv_muladd_u8(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u8 abc3 = npyv_muladd_u8(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_u8(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_u8(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_u8(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_u8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u8 b0 = npyv_load_u8(data + vstep * 0);
+            npyv_u8 c0 = npyv_load_u8(data_out + vstep * 0);
+            
+#line 312
+            npyv_u8 b1 = npyv_load_u8(data + vstep * 1);
+            npyv_u8 c1 = npyv_load_u8(data_out + vstep * 1);
+            
+#line 312
+            npyv_u8 b2 = npyv_load_u8(data + vstep * 2);
+            npyv_u8 c2 = npyv_load_u8(data_out + vstep * 2);
+            
+#line 312
+            npyv_u8 b3 = npyv_load_u8(data + vstep * 3);
+            npyv_u8 c3 = npyv_load_u8(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u8 abc0 = npyv_muladd_u8(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u8 abc1 = npyv_muladd_u8(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u8 abc2 = npyv_muladd_u8(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u8 abc3 = npyv_muladd_u8(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_u8(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_u8(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_u8(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_u8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_u8 a = npyv_load_tillz_u8(data, count);
+        npyv_u8 b = npyv_load_tillz_u8(data_out, count);
+        npyv_store_till_u8(data_out, count, npyv_muladd_u8(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+    /*
+     * Scalar path. The first loop is manually unrolled by 4 and is only
+     * compiled when NPY_DISABLE_OPTIMIZATION is not defined.
+     */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_ubyte b0 = (data[0]);
+        const npy_ubyte c0 = (data_out[0]);
+        
+#line 340
+        const npy_ubyte b1 = (data[1]);
+        const npy_ubyte c1 = (data_out[1]);
+        
+#line 340
+        const npy_ubyte b2 = (data[2]);
+        const npy_ubyte c2 = (data_out[2]);
+        
+#line 340
+        const npy_ubyte b3 = (data[3]);
+        const npy_ubyte c3 = (data_out[3]);
+        
+        #line 346
+        const npy_ubyte abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_ubyte abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_ubyte abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_ubyte abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /*
+     * One element per iteration: finishes the remainder of the unrolled
+     * loop, or processes everything when that loop is compiled out.
+     */
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_ubyte b = (*data);
+        const npy_ubyte c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_ubyte
+}
+
+/*
+ * Two-operand sum-of-products inner loop that addresses both inputs and the
+ * output by element index: for each i in [0, count),
+ *     data_out[i] = data0[i] * data1[i] + data_out[i]
+ *
+ * nop     - not read by this function
+ * dataptr - dataptr[0] and dataptr[1] are the inputs, dataptr[2] the output
+ * strides - unused (marked NPY_UNUSED)
+ * count   - number of elements to process
+ *
+ * The NPYV (SIMD) branch is guarded by a literal `#if 0`, so only the scalar
+ * `#else` branch is compiled for npy_ubyte.
+ */
+static void
+ubyte_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte *data0 = (npy_ubyte *)dataptr[0];
+    npy_ubyte *data1 = (npy_ubyte *)dataptr[1];
+    npy_ubyte *data_out = (npy_ubyte *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_ubyte
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u8;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u8 a0 = npyv_loada_u8(data0 + vstep * 0);
+            npyv_u8 b0 = npyv_loada_u8(data1 + vstep * 0);
+            npyv_u8 c0 = npyv_loada_u8(data_out + vstep * 0);
+            
+#line 390
+            npyv_u8 a1 = npyv_loada_u8(data0 + vstep * 1);
+            npyv_u8 b1 = npyv_loada_u8(data1 + vstep * 1);
+            npyv_u8 c1 = npyv_loada_u8(data_out + vstep * 1);
+            
+#line 390
+            npyv_u8 a2 = npyv_loada_u8(data0 + vstep * 2);
+            npyv_u8 b2 = npyv_loada_u8(data1 + vstep * 2);
+            npyv_u8 c2 = npyv_loada_u8(data_out + vstep * 2);
+            
+#line 390
+            npyv_u8 a3 = npyv_loada_u8(data0 + vstep * 3);
+            npyv_u8 b3 = npyv_loada_u8(data1 + vstep * 3);
+            npyv_u8 c3 = npyv_loada_u8(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u8 abc0 = npyv_muladd_u8(a0, b0, c0);
+            
+#line 397
+            npyv_u8 abc1 = npyv_muladd_u8(a1, b1, c1);
+            
+#line 397
+            npyv_u8 abc2 = npyv_muladd_u8(a2, b2, c2);
+            
+#line 397
+            npyv_u8 abc3 = npyv_muladd_u8(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_u8(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_u8(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_u8(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_u8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u8 a0 = npyv_load_u8(data0 + vstep * 0);
+            npyv_u8 b0 = npyv_load_u8(data1 + vstep * 0);
+            npyv_u8 c0 = npyv_load_u8(data_out + vstep * 0);
+            
+#line 390
+            npyv_u8 a1 = npyv_load_u8(data0 + vstep * 1);
+            npyv_u8 b1 = npyv_load_u8(data1 + vstep * 1);
+            npyv_u8 c1 = npyv_load_u8(data_out + vstep * 1);
+            
+#line 390
+            npyv_u8 a2 = npyv_load_u8(data0 + vstep * 2);
+            npyv_u8 b2 = npyv_load_u8(data1 + vstep * 2);
+            npyv_u8 c2 = npyv_load_u8(data_out + vstep * 2);
+            
+#line 390
+            npyv_u8 a3 = npyv_load_u8(data0 + vstep * 3);
+            npyv_u8 b3 = npyv_load_u8(data1 + vstep * 3);
+            npyv_u8 c3 = npyv_load_u8(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u8 abc0 = npyv_muladd_u8(a0, b0, c0);
+            
+#line 397
+            npyv_u8 abc1 = npyv_muladd_u8(a1, b1, c1);
+            
+#line 397
+            npyv_u8 abc2 = npyv_muladd_u8(a2, b2, c2);
+            
+#line 397
+            npyv_u8 abc3 = npyv_muladd_u8(a3, b3, c3);
+            
+            #line 402
+            npyv_store_u8(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_u8(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_u8(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_u8(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_u8 a = npyv_load_tillz_u8(data0, count);
+        npyv_u8 b = npyv_load_tillz_u8(data1, count);
+        npyv_u8 c = npyv_load_tillz_u8(data_out, count);
+        npyv_store_till_u8(data_out, count, npyv_muladd_u8(a, b, c));
+    }
+    npyv_cleanup();
+#else
+    /*
+     * Scalar path. The first loop is manually unrolled by 4 and is only
+     * compiled when NPY_DISABLE_OPTIMIZATION is not defined.
+     */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_ubyte a0 = (data0[0]);
+        const npy_ubyte b0 = (data1[0]);
+        const npy_ubyte c0 = (data_out[0]);
+        
+#line 420
+        const npy_ubyte a1 = (data0[1]);
+        const npy_ubyte b1 = (data1[1]);
+        const npy_ubyte c1 = (data_out[1]);
+        
+#line 420
+        const npy_ubyte a2 = (data0[2]);
+        const npy_ubyte b2 = (data1[2]);
+        const npy_ubyte c2 = (data_out[2]);
+        
+#line 420
+        const npy_ubyte a3 = (data0[3]);
+        const npy_ubyte b3 = (data1[3]);
+        const npy_ubyte c3 = (data_out[3]);
+        
+        #line 427
+        const npy_ubyte abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_ubyte abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_ubyte abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_ubyte abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /*
+     * One element per iteration: finishes the remainder of the unrolled
+     * loop, or processes everything when that loop is compiled out.
+     */
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_ubyte a = (*data0);
+        const npy_ubyte b = (*data1);
+        const npy_ubyte c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_ubyte
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand kernel where operand 0 is a single value: reads that value
+ * once from dataptr[0] and delegates to ubyte_sum_of_products_muladd, which
+ * applies it to the dataptr[1] elements and accumulates into dataptr[2].
+ * `nop` is not read and `strides` is unused.
+ */
+static void
+ubyte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ubyte factor = *(npy_ubyte *)dataptr[0];
+    npy_ubyte *const src = (npy_ubyte *)dataptr[1];
+    npy_ubyte *const dst = (npy_ubyte *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    /* dst[i] = factor * src[i] + dst[i] for every i in [0, count) */
+    ubyte_sum_of_products_muladd(src, dst, factor, count);
+}
+
+/*
+ * Two-operand kernel where operand 1 is a single value: reads that value
+ * once from dataptr[1] and delegates to ubyte_sum_of_products_muladd, which
+ * applies it to the dataptr[0] elements and accumulates into dataptr[2].
+ * `nop` is not read and `strides` is unused.
+ */
+static void
+ubyte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ubyte factor = *(npy_ubyte *)dataptr[1];
+    npy_ubyte *const src = (npy_ubyte *)dataptr[0];
+    npy_ubyte *const dst = (npy_ubyte *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    /* dst[i] = factor * src[i] + dst[i] for every i in [0, count) */
+    ubyte_sum_of_products_muladd(src, dst, factor, count);
+}
+
+/*
+ * Two-operand kernel that reduces into a single output element:
+ *     *dataptr[2] = *dataptr[2] + sum over i in [0, count) of
+ *                                 data0[i] * data1[i]
+ * The running sum is kept in the local npy_ubyte `accum` and added to the
+ * output once at the end.
+ *
+ * nop     - not read by this function
+ * dataptr - dataptr[0] and dataptr[1] are the inputs, dataptr[2] the output
+ * strides - unused (marked NPY_UNUSED)
+ * count   - number of elements to process
+ *
+ * The NPYV (SIMD) branch is guarded by a literal `#if 0`, so only the scalar
+ * `#else` branch is compiled for npy_ubyte.
+ */
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte *data0 = (npy_ubyte *)dataptr[0];
+    npy_ubyte *data1 = (npy_ubyte *)dataptr[1];
+    npy_ubyte accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_ubyte
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_u8;
+    npyv_u8 v_accum = npyv_zero_u8();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u8 a0 = npyv_loada_u8(data0 + vstep * 0);
+            npyv_u8 b0 = npyv_loada_u8(data1 + vstep * 0);
+            
+#line 501
+            npyv_u8 a1 = npyv_loada_u8(data0 + vstep * 1);
+            npyv_u8 b1 = npyv_loada_u8(data1 + vstep * 1);
+            
+#line 501
+            npyv_u8 a2 = npyv_loada_u8(data0 + vstep * 2);
+            npyv_u8 b2 = npyv_loada_u8(data1 + vstep * 2);
+            
+#line 501
+            npyv_u8 a3 = npyv_loada_u8(data0 + vstep * 3);
+            npyv_u8 b3 = npyv_loada_u8(data1 + vstep * 3);
+            
+            npyv_u8 ab3 = npyv_muladd_u8(a3, b3, v_accum);
+            npyv_u8 ab2 = npyv_muladd_u8(a2, b2, ab3);
+            npyv_u8 ab1 = npyv_muladd_u8(a1, b1, ab2);
+                   v_accum = npyv_muladd_u8(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u8 a0 = npyv_load_u8(data0 + vstep * 0);
+            npyv_u8 b0 = npyv_load_u8(data1 + vstep * 0);
+            
+#line 501
+            npyv_u8 a1 = npyv_load_u8(data0 + vstep * 1);
+            npyv_u8 b1 = npyv_load_u8(data1 + vstep * 1);
+            
+#line 501
+            npyv_u8 a2 = npyv_load_u8(data0 + vstep * 2);
+            npyv_u8 b2 = npyv_load_u8(data1 + vstep * 2);
+            
+#line 501
+            npyv_u8 a3 = npyv_load_u8(data0 + vstep * 3);
+            npyv_u8 b3 = npyv_load_u8(data1 + vstep * 3);
+            
+            npyv_u8 ab3 = npyv_muladd_u8(a3, b3, v_accum);
+            npyv_u8 ab2 = npyv_muladd_u8(a2, b2, ab3);
+            npyv_u8 ab1 = npyv_muladd_u8(a1, b1, ab2);
+                   v_accum = npyv_muladd_u8(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_u8 a = npyv_load_tillz_u8(data0, count);
+        npyv_u8 b = npyv_load_tillz_u8(data1, count);
+        v_accum = npyv_muladd_u8(a, b, v_accum);
+    }
+    accum = npyv_sum_u8(v_accum);
+    npyv_cleanup();
+#else
+    /*
+     * Scalar path. The first loop is manually unrolled by 4 and is only
+     * compiled when NPY_DISABLE_OPTIMIZATION is not defined.
+     */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_ubyte ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_ubyte ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_ubyte ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_ubyte ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /*
+     * One element per iteration: finishes the remainder of the unrolled
+     * loop, or processes everything when that loop is compiled out.
+     */
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_ubyte a = (*data0);
+        const npy_ubyte b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_ubyte
+    /* Fold the local sum into the single output element. */
+    *(npy_ubyte *)dataptr[2] = ((*(npy_ubyte *)dataptr[2]) + accum);
+}
+
+/*
+ * Two-operand kernel where operand 0 is a single value and the output is a
+ * single element: sums the `count` elements at dataptr[1] with
+ * ubyte_sum_of_arr, multiplies that sum by the value at dataptr[0], and
+ * adds the product to *dataptr[2]. `nop` is not read; `strides` is unused.
+ */
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ubyte scale = *(npy_ubyte *)dataptr[0];
+    const npy_ubyte total = ubyte_sum_of_arr((npy_ubyte *)dataptr[1], count);
+    npy_ubyte *const out = (npy_ubyte *)dataptr[2];
+
+    *out = (*out + scale * total);
+}
+
+/*
+ * Two-operand kernel where operand 1 is a single value and the output is a
+ * single element: sums the `count` elements at dataptr[0] with
+ * ubyte_sum_of_arr, multiplies that sum by the value at dataptr[1], and
+ * adds the product to *dataptr[2]. `nop` is not read; `strides` is unused.
+ */
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ubyte scale = *(npy_ubyte *)dataptr[1];
+    const npy_ubyte total = ubyte_sum_of_arr((npy_ubyte *)dataptr[0], count);
+    npy_ubyte *const out = (npy_ubyte *)dataptr[2];
+
+    *out = (*out + scale * total);
+}
+
+#elif 2 == 3 && !0
+
+/*
+ * Three-operand sum-of-products inner loop addressed by element index:
+ * for each i in [0, count),
+ *     data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]
+ *
+ * nop     - not read by this function
+ * dataptr - dataptr[0..2] are the inputs, dataptr[3] the output
+ * strides - unused (marked NPY_UNUSED)
+ * count   - number of elements to process
+ *
+ * NOTE: this definition sits in the `#elif 2 == 3 && !0` branch, whose
+ * condition is false, so it is not compiled in this instantiation.
+ */
+static void
+ubyte_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte *data0 = (npy_ubyte *)dataptr[0];
+    npy_ubyte *data1 = (npy_ubyte *)dataptr[1];
+    npy_ubyte *data2 = (npy_ubyte *)dataptr[2];
+    npy_ubyte *data_out = (npy_ubyte *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /*
+     * Finish off the loop: fewer than 8 elements remain here. Each step
+     * below returns as soon as the remaining count reaches zero, otherwise
+     * it processes the next index and decrements the count.
+     */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 2 > 3 || @complex */
+
+/*
+ * Generic fallback variant of the contiguous kernel. In the `#if !0`
+ * branch, each iteration multiplies together the values at dataptr[0] ..
+ * dataptr[nop-1], adds the value at dataptr[nop], stores the result at
+ * dataptr[nop], and then advances every pointer in dataptr[0..nop] by
+ * sizeof(npy_ubyte). Note that this modifies the caller's dataptr array.
+ *
+ * nop     - number of input operands
+ * dataptr - nop input pointers followed by one output pointer
+ * strides - unused (marked NPY_UNUSED)
+ * count   - number of elements to process
+ *
+ * NOTE: this definition sits in the final `#else` of a conditional chain
+ * that also contains `#elif 2 == 2 && !0`; since that condition is true,
+ * this `#else` branch is not compiled in this instantiation. The inner
+ * "complex" branch is additionally disabled by `#if !0`.
+ */
+static void
+ubyte_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+        npy_ubyte temp = (*(npy_ubyte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ubyte *)dataptr[i]);
+        }
+        /* After the loop above i == nop, so dataptr[i] is the output. */
+        *(npy_ubyte *)dataptr[nop] = (temp +
+                                           (*(npy_ubyte *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_ubyte);
+        }
+#else /* complex */
+#  if 2 <= 3
+#    define _SUMPROD_NOP 2
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_ubyte re, im, tmp;
+        int i;
+        re = ((npy_ubyte *)dataptr[0])[0];
+        im = ((npy_ubyte *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ubyte *)dataptr[i])[0] -
+                  im * ((npy_ubyte *)dataptr[i])[1];
+            im = re * ((npy_ubyte *)dataptr[i])[1] +
+                 im * ((npy_ubyte *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ubyte *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ubyte *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ubyte *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ubyte *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_ubyte);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+
+/*
+ * One-operand kernel that reduces into a single output element. In the
+ * `#if !0` branch it sums the `count` elements at dataptr[0] with
+ * ubyte_sum_of_arr and adds that sum to *dataptr[1].
+ *
+ * nop     - not read by this function
+ * dataptr - dataptr[0] is the input, dataptr[1] the output
+ * strides - not read by this function
+ * count   - number of elements to process
+ *
+ * NOTE: this definition is enclosed in `#if 2 == 1`, which is false, so it
+ * is not compiled in this instantiation. The `#else` branch (separate
+ * real/imaginary accumulators over interleaved pairs) is additionally
+ * disabled by `#if !0`.
+ */
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_ubyte *data = (npy_ubyte *)dataptr[0];
+    npy_ubyte accum = ubyte_sum_of_arr(data, count);
+    *((npy_ubyte *)dataptr[1]) = (accum + (*((npy_ubyte *)dataptr[1])));
+#else
+    npy_ubyte accum_re = 0, accum_im = 0;
+    npy_ubyte *data0 = (npy_ubyte *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Consumes 4 (re, im) pairs per iteration. The condition is `count > 4`,
+     * so when exactly 4 pairs remain they are left to the loop below.
+     */
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_ubyte re01 = data0[0] + data0[2];
+        const npy_ubyte re23 = data0[4] + data0[6];
+        const npy_ubyte im13 = data0[1] + data0[3];
+        const npy_ubyte im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_ubyte *)dataptr[1])[0] += accum_re;
+    ((npy_ubyte *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 2 == 1 */
+
+/*
+ * Two-operand strided kernel that reduces into a single output element.
+ * With the literal conditions below, the compiled path is the `2 == 2`
+ * non-complex one: each iteration adds (*data0) * (*data1) to the local
+ * npy_ubyte `accum` and advances data0/data1 by strides[0]/strides[1]
+ * bytes (the pointers are char *). After the loop, accum is added to
+ * *dataptr[2].
+ *
+ * nop     - only referenced in branches that are compiled out here
+ * dataptr - dataptr[0] and dataptr[1] are the inputs, dataptr[2] the output
+ * strides - byte steps for the two inputs
+ * count   - number of elements to process
+ *
+ * The many `#if` branches are selected by literal constants; the "complex"
+ * branches are disabled because their guards evaluate `!0` / `0`.
+ */
+static void
+ubyte_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_ubyte accum_re = 0, accum_im = 0;
+#else
+    npy_ubyte accum = 0;
+#endif
+
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 2 == 1
+        accum += (*(npy_ubyte *)data0);
+        data0 += stride0;
+#  elif 2 == 2
+        accum += (*(npy_ubyte *)data0) *
+                 (*(npy_ubyte *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 2 == 3
+        accum += (*(npy_ubyte *)data0) *
+                 (*(npy_ubyte *)data1) *
+                 (*(npy_ubyte *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_ubyte temp = (*(npy_ubyte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ubyte *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        accum_re += ((npy_ubyte *)data0)[0];
+        accum_im += ((npy_ubyte *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ubyte re, im, tmp;
+        int i;
+        re = ((npy_ubyte *)dataptr[0])[0];
+        im = ((npy_ubyte *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ubyte *)dataptr[i])[0] -
+                  im * ((npy_ubyte *)dataptr[i])[1];
+            im = re * ((npy_ubyte *)dataptr[i])[1] +
+                 im * ((npy_ubyte *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+    /* Fold the accumulated value into the single output element. */
+#if 0
+#  if 2 <= 3
+    ((npy_ubyte *)dataptr[2])[0] += accum_re;
+    ((npy_ubyte *)dataptr[2])[1] += accum_im;
+#  else
+    ((npy_ubyte *)dataptr[nop])[0] += accum_re;
+    ((npy_ubyte *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 2 <= 3
+    *((npy_ubyte *)dataptr[2]) = (accum +
+                                    (*((npy_ubyte *)dataptr[2])));
+#  else
+    *((npy_ubyte *)dataptr[nop]) = (accum +
+                                    (*((npy_ubyte *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+/*
+ * Three-operand strided sum-of-products inner loop. With the literal
+ * conditions below, the compiled path is the `3 == 3` non-complex one:
+ * each iteration computes
+ *     *data_out = (*data0) * (*data1) * (*data2) + (*data_out)
+ * and then advances the three input pointers and the output pointer by
+ * strides[0..3] bytes respectively (the pointers are char *).
+ *
+ * nop     - only referenced in branches that are compiled out here
+ * dataptr - dataptr[0..2] are the inputs, dataptr[3] the output
+ * strides - byte steps for the three inputs and the output
+ * count   - number of elements to process
+ *
+ * The many `#if` branches are selected by literal constants; the "complex"
+ * branch is disabled because the outer guard is `#if !0`.
+ */
+static void
+ubyte_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data_out = dataptr[3];
+    npy_intp stride_out = strides[3];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_three (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 3 == 1
+        *(npy_ubyte *)data_out = ((*(npy_ubyte *)data0) +
+                                         (*(npy_ubyte *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 3 == 2
+        *(npy_ubyte *)data_out = ((*(npy_ubyte *)data0) *
+                                         (*(npy_ubyte *)data1) +
+                                         (*(npy_ubyte *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 3 == 3
+        *(npy_ubyte *)data_out = ((*(npy_ubyte *)data0) *
+                                         (*(npy_ubyte *)data1) *
+                                         (*(npy_ubyte *)data2) +
+                                         (*(npy_ubyte *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_ubyte temp = (*(npy_ubyte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ubyte *)dataptr[i]);
+        }
+        *(npy_ubyte *)dataptr[nop] = (temp +
+                                           (*(npy_ubyte *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        ((npy_ubyte *)data_out)[0] = ((npy_ubyte *)data0)[0] +
+                                         ((npy_ubyte *)data_out)[0];
+        ((npy_ubyte *)data_out)[1] = ((npy_ubyte *)data0)[1] +
+                                         ((npy_ubyte *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ubyte re, im, tmp;
+        int i;
+        re = ((npy_ubyte *)dataptr[0])[0];
+        im = ((npy_ubyte *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ubyte *)dataptr[i])[0] -
+                  im * ((npy_ubyte *)dataptr[i])[1];
+            im = re * ((npy_ubyte *)dataptr[i])[1] +
+                 im * ((npy_ubyte *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ubyte *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ubyte *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ubyte *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ubyte *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 3 == 1
+
+/*
+ * Contiguous single-operand kernel: adds each of the `count` npy_ubyte
+ * elements of operand 0 (dataptr[0]) into the matching element of the
+ * output (dataptr[1]), in place.  `nop` and the strides are not consulted.
+ *
+ * Complete groups of eight elements are processed front to back; the final
+ * partial group (fewer than eight elements) is processed back to front.
+ */
+static void
+ubyte_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte *src = (npy_ubyte *)dataptr[0];
+    npy_ubyte *dst = (npy_ubyte *)dataptr[1];
+    npy_intp k;
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+    for (;;) {
+        /*
+         * Checked before the grouped loop so that short inputs take the
+         * quickest route: finish the (at most seven) elements and leave.
+         */
+        if (count >= 0 && count < 8) {
+            for (k = count; k-- > 0;) {
+                dst[k] = src[k] + dst[k];
+            }
+            return;
+        }
+
+        /* Consume every complete group of eight that is available. */
+        while (count >= 8) {
+            for (k = 0; k < 8; ++k) {
+                dst[k] = src[k] + dst[k];
+            }
+            src += 8;
+            dst += 8;
+            count -= 8;
+        }
+    }
+}
+
+#elif 3 == 2 && !0
+
+/*
+ * Multiply-accumulate helper.  For every i in [0, count):
+ *
+ *     data_out[i] = scalar * data[i] + data_out[i]
+ *
+ * with the result narrowed back to npy_ubyte.
+ *
+ * Only the scalar implementation is present: the NPYV (SIMD) variant of
+ * this helper is compiled out for npy_ubyte.
+ */
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_muladd(npy_ubyte *data, npy_ubyte *data_out, npy_ubyte scalar, npy_intp count)
+{
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Groups of four: read every input first, then store the results. */
+    while (count >= 4) {
+        npy_ubyte in[4], acc[4];
+        int k;
+
+        for (k = 0; k < 4; ++k) {
+            in[k] = data[k];
+            acc[k] = data_out[k];
+        }
+        for (k = 0; k < 4; ++k) {
+            data_out[k] = scalar * in[k] + acc[k];
+        }
+
+        count -= 4;
+        data += 4;
+        data_out += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Whatever is left over, one element at a time. */
+    while (count > 0) {
+        *data_out = scalar * (*data) + (*data_out);
+        --count;
+        ++data;
+        ++data_out;
+    }
+}
+
+/*
+ * Contiguous two-operand kernel.  With dataptr[0], dataptr[1] and
+ * dataptr[2] viewed as npy_ubyte arrays lhs, rhs and out, computes for
+ * every i in [0, count):
+ *
+ *     out[i] = lhs[i] * rhs[i] + out[i]
+ *
+ * `nop` and the strides are not consulted.  Only the scalar implementation
+ * is present: the NPYV (SIMD) variant is compiled out for npy_ubyte.
+ */
+static void
+ubyte_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte *lhs = (npy_ubyte *)dataptr[0];
+    npy_ubyte *rhs = (npy_ubyte *)dataptr[1];
+    npy_ubyte *out = (npy_ubyte *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Groups of four: read every input first, then store the results. */
+    while (count >= 4) {
+        npy_ubyte x[4], y[4], z[4];
+        int k;
+
+        for (k = 0; k < 4; ++k) {
+            x[k] = lhs[k];
+            y[k] = rhs[k];
+            z[k] = out[k];
+        }
+        for (k = 0; k < 4; ++k) {
+            out[k] = x[k] * y[k] + z[k];
+        }
+
+        count -= 4;
+        lhs += 4;
+        rhs += 4;
+        out += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Whatever is left over, one element at a time. */
+    while (count > 0) {
+        *out = (*lhs) * (*rhs) + (*out);
+        --count;
+        ++lhs;
+        ++rhs;
+        ++out;
+    }
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand kernel in which operand 0 contributes a single value (read
+ * once from dataptr[0]) while operand 1 and the output are walked as
+ * contiguous npy_ubyte arrays.  The element-wise multiply-accumulate is
+ * delegated to ubyte_sum_of_products_muladd().
+ */
+static void
+ubyte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte factor = *(npy_ubyte *)dataptr[0];
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    ubyte_sum_of_products_muladd((npy_ubyte *)dataptr[1],
+                                 (npy_ubyte *)dataptr[2],
+                                 factor, count);
+}
+
+/*
+ * Two-operand kernel in which operand 1 contributes a single value (read
+ * once from dataptr[1]) while operand 0 and the output are walked as
+ * contiguous npy_ubyte arrays.  The element-wise multiply-accumulate is
+ * delegated to ubyte_sum_of_products_muladd().
+ */
+static void
+ubyte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte factor = *(npy_ubyte *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    ubyte_sum_of_products_muladd((npy_ubyte *)dataptr[0],
+                                 (npy_ubyte *)dataptr[2],
+                                 factor, count);
+}
+
+/*
+ * Two contiguous operands reduced into one output element: the products
+ * lhs[i] * rhs[i] for i in [0, count) are summed in an npy_ubyte
+ * accumulator and the total is added to the single value at dataptr[2].
+ *
+ * The accumulator is unsigned and wraps, so the grouping of the additions
+ * does not affect the stored result.  Only the scalar implementation is
+ * present: the NPYV (SIMD) variant is compiled out for npy_ubyte.
+ */
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte *lhs = (npy_ubyte *)dataptr[0];
+    npy_ubyte *rhs = (npy_ubyte *)dataptr[1];
+    npy_ubyte *out = (npy_ubyte *)dataptr[2];
+    npy_ubyte total = 0;
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four products per iteration while enough elements remain. */
+    while (count >= 4) {
+        int k;
+
+        for (k = 0; k < 4; ++k) {
+            total += lhs[k] * rhs[k];
+        }
+        count -= 4;
+        lhs += 4;
+        rhs += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Whatever is left over, one product at a time. */
+    while (count > 0) {
+        total += (*lhs) * (*rhs);
+        --count;
+        ++lhs;
+        ++rhs;
+    }
+
+    *out = *out + total;
+}
+
+/*
+ * Two-operand reduction into one output element where operand 0 supplies
+ * a single value: the `count` npy_ubyte elements of operand 1 are summed
+ * with ubyte_sum_of_arr(), the sum is multiplied by the operand-0 value,
+ * and the product is added to the value at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte factor = *(npy_ubyte *)dataptr[0];
+    npy_ubyte total = ubyte_sum_of_arr((npy_ubyte *)dataptr[1], count);
+    npy_ubyte *out = (npy_ubyte *)dataptr[2];
+
+    *out = *out + factor * total;
+}
+
+/*
+ * Two-operand reduction into one output element where operand 1 supplies
+ * a single value: the `count` npy_ubyte elements of operand 0 are summed
+ * with ubyte_sum_of_arr(), the sum is multiplied by the operand-1 value,
+ * and the product is added to the value at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte total = ubyte_sum_of_arr((npy_ubyte *)dataptr[0], count);
+    npy_ubyte factor = *(npy_ubyte *)dataptr[1];
+    npy_ubyte *out = (npy_ubyte *)dataptr[2];
+
+    *out = *out + factor * total;
+}
+
+#elif 3 == 3 && !0
+
+/*
+ * Contiguous three-operand kernel.  With dataptr[0..2] viewed as npy_ubyte
+ * arrays a, b, c and dataptr[3] as the output array out, computes for
+ * every i in [0, count), in ascending order:
+ *
+ *     out[i] = a[i] * b[i] * c[i] + out[i]
+ *
+ * `nop` and the strides are not consulted.
+ */
+static void
+ubyte_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte *a = (npy_ubyte *)dataptr[0];
+    npy_ubyte *b = (npy_ubyte *)dataptr[1];
+    npy_ubyte *c = (npy_ubyte *)dataptr[2];
+    npy_ubyte *out = (npy_ubyte *)dataptr[3];
+    int k;
+
+    /* Complete groups of eight elements. */
+    for (; count >= 8; count -= 8) {
+        for (k = 0; k < 8; ++k) {
+            out[k] = a[k] * b[k] * c[k] + out[k];
+        }
+        a += 8;
+        b += 8;
+        c += 8;
+        out += 8;
+    }
+
+    /* Leftover elements: stop as soon as the countdown reaches zero. */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] = a[k] * b[k] * c[k] + out[k];
+    }
+}
+
+#else /* more than three operands, or a complex type */
+
+/*
+ * Generic contiguous kernel for `nop` npy_ubyte operands.  For each of the
+ * `count` positions it multiplies the current element of every operand,
+ * adds the current output element, stores the result through dataptr[nop],
+ * and then advances all nop + 1 pointers in `dataptr` by one element.
+ *
+ * The pointers in `dataptr` are modified in place.  The strides are not
+ * consulted.
+ */
+static void
+ubyte_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_three (%d)\n",
+                                                    (int)count);
+
+    while (count-- != 0) {
+        npy_ubyte product = *(npy_ubyte *)dataptr[0];
+        int op = 1;
+
+        while (op < nop) {
+            product *= *(npy_ubyte *)dataptr[op];
+            ++op;
+        }
+        /* The old output value is read through the index the loop ended on. */
+        *(npy_ubyte *)dataptr[nop] = product + *(npy_ubyte *)dataptr[op];
+
+        for (op = 0; op <= nop; ++op) {
+            dataptr[op] += sizeof(npy_ubyte);
+        }
+    }
+}
+
+#endif /* functions for the various operand counts */
+
+#if 3 == 1
+
+/*
+ * Single contiguous operand reduced into one output element: the `count`
+ * npy_ubyte elements at dataptr[0] are summed with ubyte_sum_of_arr() and
+ * the sum is added to the value at dataptr[1].  `nop` and `strides` are
+ * not consulted.
+ */
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    npy_ubyte *out = (npy_ubyte *)dataptr[1];
+    npy_ubyte total;
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+    total = ubyte_sum_of_arr((npy_ubyte *)dataptr[0], count);
+    *out = total + *out;
+}
+
+#endif /* 3 == 1 */
+
+/*
+ * Three strided operands reduced into one output element.  Each operand
+ * is walked with its own byte stride (strides[0..2]); at every one of the
+ * `count` positions the three npy_ubyte values are multiplied and the
+ * product is added to an npy_ubyte accumulator.  The accumulated total is
+ * finally added to the value at dataptr[3].
+ *
+ * The pointers stored in `dataptr` are left untouched; `nop` is not
+ * consulted.
+ */
+static void
+ubyte_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    npy_ubyte total = 0;
+    char *p0 = dataptr[0], *p1 = dataptr[1], *p2 = dataptr[2];
+    const npy_intp step0 = strides[0];
+    const npy_intp step1 = strides[1];
+    const npy_intp step2 = strides[2];
+    npy_ubyte *out = (npy_ubyte *)dataptr[3];
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+
+    while (count-- != 0) {
+        total += (*(npy_ubyte *)p0) *
+                 (*(npy_ubyte *)p1) *
+                 (*(npy_ubyte *)p2);
+        p0 += step0;
+        p1 += step1;
+        p2 += step2;
+    }
+
+    *out = total + *out;
+}
+
+
+#line 131
+/*
+ * Generic sum-of-products kernel for npy_ubyte taking `nop` operands.
+ *
+ * For each of `count` iterations it multiplies the current elements of
+ * dataptr[0..nop-1] together, adds the product to the element at
+ * dataptr[nop], and then advances every pointer in dataptr (operands and
+ * output alike) by the matching entry of strides.  dataptr is therefore
+ * modified in place.
+ *
+ * The template conditionals of this instantiation are constant (operand
+ * count marker 1000, non-complex), so only the live generic path is
+ * spelled out.
+ */
+static void
+ubyte_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_any (%d)\n", (int)count);
+
+    while (count--) {
+        npy_ubyte temp = (*(npy_ubyte *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ubyte *)dataptr[i]);
+        }
+        /*
+         * Read the accumulator through dataptr[nop], the same slot that is
+         * written, rather than through the leftover loop index (which only
+         * equals nop when nop >= 1).
+         */
+        *(npy_ubyte *)dataptr[nop] = (temp +
+                                           (*(npy_ubyte *)dataptr[nop]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+    }
+}
+
+#if 1000 == 1
+
+/*
+ * One-operand kernel over plain npy_ubyte arrays: adds src[k] into dst[k]
+ * for k in [0, count), where src is dataptr[0] and dst is dataptr[1].
+ * `nop` and the strides are not used.
+ *
+ * Full groups of eight elements are handled by an unrolled loop and the
+ * 0..7 leftovers by a fall-through switch.  The switch is tested first so
+ * that small counts never enter the unrolled loop.  The complex-type
+ * branches of the template are constant-false here and are omitted.
+ */
+static void
+ubyte_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte *src = (npy_ubyte *)dataptr[0];
+    npy_ubyte *dst = (npy_ubyte *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_one (%d)\n",
+                          (int)count);
+
+    for (;;) {
+        /* Tail handling: each case deliberately falls through. */
+        switch (count) {
+            case 7:
+                dst[6] = src[6] + dst[6];
+                /* fall through */
+            case 6:
+                dst[5] = src[5] + dst[5];
+                /* fall through */
+            case 5:
+                dst[4] = src[4] + dst[4];
+                /* fall through */
+            case 4:
+                dst[3] = src[3] + dst[3];
+                /* fall through */
+            case 3:
+                dst[2] = src[2] + dst[2];
+                /* fall through */
+            case 2:
+                dst[1] = src[1] + dst[1];
+                /* fall through */
+            case 1:
+                dst[0] = src[0] + dst[0];
+                /* fall through */
+            case 0:
+                return;
+        }
+
+        /* No case matched: consume groups of eight, then re-check the tail. */
+        while (count >= 8) {
+            count -= 8;
+
+            dst[0] = src[0] + dst[0];
+            dst[1] = src[1] + dst[1];
+            dst[2] = src[2] + dst[2];
+            dst[3] = src[3] + dst[3];
+            dst[4] = src[4] + dst[4];
+            dst[5] = src[5] + dst[5];
+            dst[6] = src[6] + dst[6];
+            dst[7] = src[7] + dst[7];
+
+            src += 8;
+            dst += 8;
+        }
+    }
+}
+
+#elif 1000 == 2 && !0
+
+/*
+ * Multiply-add over npy_ubyte arrays:
+ *     data_out[k] = scalar * data[k] + data_out[k]   for k in [0, count).
+ *
+ * The template's NPYV branch is switched off for npy_ubyte (`#if 0`), so
+ * only the scalar implementation is kept: a four-way unrolled loop, which
+ * is skipped when NPY_DISABLE_OPTIMIZATION is defined, followed by an
+ * element-wise loop for whatever remains.
+ */
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_muladd(npy_ubyte *data, npy_ubyte *data_out, npy_ubyte scalar, npy_intp count)
+{
+#ifndef NPY_DISABLE_OPTIMIZATION
+    while (count >= 4) {
+        /* Every load of the group is done before any of its stores. */
+        const npy_ubyte in0 = data[0];
+        const npy_ubyte in1 = data[1];
+        const npy_ubyte in2 = data[2];
+        const npy_ubyte in3 = data[3];
+        const npy_ubyte acc0 = data_out[0];
+        const npy_ubyte acc1 = data_out[1];
+        const npy_ubyte acc2 = data_out[2];
+        const npy_ubyte acc3 = data_out[3];
+
+        data_out[0] = scalar * in0 + acc0;
+        data_out[1] = scalar * in1 + acc1;
+        data_out[2] = scalar * in2 + acc2;
+        data_out[3] = scalar * in3 + acc3;
+
+        count -= 4;
+        data += 4;
+        data_out += 4;
+    }
+#endif /* !NPY_DISABLE_OPTIMIZATION */
+    while (count > 0) {
+        const npy_ubyte in = *data;
+        const npy_ubyte acc = *data_out;
+
+        *data_out = scalar * in + acc;
+
+        --count;
+        ++data;
+        ++data_out;
+    }
+}
+
+/*
+ * Two-operand kernel over plain npy_ubyte arrays:
+ *     out[k] = lhs[k] * rhs[k] + out[k]   for k in [0, count),
+ * with lhs = dataptr[0], rhs = dataptr[1] and out = dataptr[2].
+ * `nop` and the strides are not used.
+ *
+ * The template's NPYV branch is switched off for npy_ubyte (`#if 0`), so
+ * only the scalar implementation is kept: a four-way unrolled loop, which
+ * is skipped when NPY_DISABLE_OPTIMIZATION is defined, followed by an
+ * element-wise loop for whatever remains.
+ */
+static void
+ubyte_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte *lhs = (npy_ubyte *)dataptr[0];
+    npy_ubyte *rhs = (npy_ubyte *)dataptr[1];
+    npy_ubyte *out = (npy_ubyte *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_two (%d)\n",
+                          (int)count);
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    while (count >= 4) {
+        /* Every load of the group is done before any of its stores. */
+        const npy_ubyte l0 = lhs[0], r0 = rhs[0], o0 = out[0];
+        const npy_ubyte l1 = lhs[1], r1 = rhs[1], o1 = out[1];
+        const npy_ubyte l2 = lhs[2], r2 = rhs[2], o2 = out[2];
+        const npy_ubyte l3 = lhs[3], r3 = rhs[3], o3 = out[3];
+
+        out[0] = l0 * r0 + o0;
+        out[1] = l1 * r1 + o1;
+        out[2] = l2 * r2 + o2;
+        out[3] = l3 * r3 + o3;
+
+        count -= 4;
+        lhs += 4;
+        rhs += 4;
+        out += 4;
+    }
+#endif /* !NPY_DISABLE_OPTIMIZATION */
+    while (count > 0) {
+        const npy_ubyte l = *lhs;
+        const npy_ubyte r = *rhs;
+        const npy_ubyte o = *out;
+
+        *out = l * r + o;
+
+        --count;
+        ++lhs;
+        ++rhs;
+        ++out;
+    }
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand kernel whose first operand is a single value: reads one
+ * npy_ubyte from dataptr[0] and passes it, with dataptr[1] as input array
+ * and dataptr[2] as output array, to ubyte_sum_of_products_muladd for
+ * `count` elements.  `nop` and the strides are not used.
+ */
+static void
+ubyte_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ubyte factor = *(npy_ubyte *)dataptr[0];
+    npy_ubyte *in = (npy_ubyte *)dataptr[1];
+    npy_ubyte *out = (npy_ubyte *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                          (int)count);
+
+    ubyte_sum_of_products_muladd(in, out, factor, count);
+}
+
+/*
+ * Two-operand kernel whose second operand is a single value: reads one
+ * npy_ubyte from dataptr[1] and passes it, with dataptr[0] as input array
+ * and dataptr[2] as output array, to ubyte_sum_of_products_muladd for
+ * `count` elements.  `nop` and the strides are not used.
+ */
+static void
+ubyte_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ubyte factor = *(npy_ubyte *)dataptr[1];
+    npy_ubyte *in = (npy_ubyte *)dataptr[0];
+    npy_ubyte *out = (npy_ubyte *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                          (int)count);
+
+    ubyte_sum_of_products_muladd(in, out, factor, count);
+}
+
+/*
+ * Sums lhs[k] * rhs[k] over k in [0, count), with lhs = dataptr[0] and
+ * rhs = dataptr[1], and adds that sum to the single npy_ubyte stored at
+ * dataptr[2].  `nop` and the strides are not used.
+ *
+ * The template's NPYV branch is switched off for npy_ubyte (`#if 0`), so
+ * only the scalar implementation is kept: a four-way unrolled loop, which
+ * is skipped when NPY_DISABLE_OPTIMIZATION is defined, followed by an
+ * element-wise loop for whatever remains.
+ */
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte *lhs = (npy_ubyte *)dataptr[0];
+    npy_ubyte *rhs = (npy_ubyte *)dataptr[1];
+    npy_ubyte *out = (npy_ubyte *)dataptr[2];
+    npy_ubyte total = 0;
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                          (int)count);
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    while (count >= 4) {
+        /* Each product is narrowed to npy_ubyte before the four are summed. */
+        const npy_ubyte p0 = lhs[0] * rhs[0];
+        const npy_ubyte p1 = lhs[1] * rhs[1];
+        const npy_ubyte p2 = lhs[2] * rhs[2];
+        const npy_ubyte p3 = lhs[3] * rhs[3];
+
+        total += p0 + p1 + p2 + p3;
+
+        count -= 4;
+        lhs += 4;
+        rhs += 4;
+    }
+#endif /* !NPY_DISABLE_OPTIMIZATION */
+    while (count > 0) {
+        const npy_ubyte l = *lhs;
+        const npy_ubyte r = *rhs;
+
+        total += l * r;
+
+        --count;
+        ++lhs;
+        ++rhs;
+    }
+
+    *out = *out + total;
+}
+
+/*
+ * Reads one npy_ubyte factor from dataptr[0], obtains the result of
+ * ubyte_sum_of_arr over `count` elements of dataptr[1], and adds
+ * factor * sum to the single npy_ubyte stored at dataptr[2].
+ * `nop` and the strides are not used.
+ */
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ubyte factor = *(npy_ubyte *)dataptr[0];
+    const npy_ubyte total = ubyte_sum_of_arr((npy_ubyte *)dataptr[1], count);
+    npy_ubyte *out = (npy_ubyte *)dataptr[2];
+
+    *out = *out + factor * total;
+}
+
+/*
+ * Reads one npy_ubyte factor from dataptr[1], obtains the result of
+ * ubyte_sum_of_arr over `count` elements of dataptr[0], and adds
+ * factor * sum to the single npy_ubyte stored at dataptr[2].
+ * `nop` and the strides are not used.
+ */
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ubyte factor = *(npy_ubyte *)dataptr[1];
+    const npy_ubyte total = ubyte_sum_of_arr((npy_ubyte *)dataptr[0], count);
+    npy_ubyte *out = (npy_ubyte *)dataptr[2];
+
+    *out = *out + factor * total;
+}
+
+#elif 1000 == 3 && !0
+
+/*
+ * Element-wise multiply-accumulate over three npy_ubyte inputs:
+ *
+ *     out[k] = in0[k] * in1[k] * in2[k] + out[k]
+ *
+ * dataptr[0], dataptr[1] and dataptr[2] are the inputs and dataptr[3] is the
+ * output; all four are indexed as plain npy_ubyte arrays.  `nop` and
+ * `strides` are not read.
+ */
+static void
+ubyte_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte *in0 = (npy_ubyte *)dataptr[0];
+    npy_ubyte *in1 = (npy_ubyte *)dataptr[1];
+    npy_ubyte *in2 = (npy_ubyte *)dataptr[2];
+    npy_ubyte *out = (npy_ubyte *)dataptr[3];
+    int k;
+
+    /* Main part: consume the data in groups of 8 elements, front to back */
+    for (; count >= 8; count -= 8) {
+        for (k = 0; k < 8; ++k) {
+            out[k] = ((in0[k]) * (in1[k]) * (in2[k]) + (out[k]));
+        }
+        in0 += 8;
+        in1 += 8;
+        in2 += 8;
+        out += 8;
+    }
+
+    /*
+     * Tail: up to 8 guarded steps, each one returning as soon as the
+     * remaining count has reached zero.
+     */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] = ((in0[k]) * (in1[k]) * (in2[k]) + (out[k]));
+    }
+}
+
+#else /* 1000 > 3 || @complex */
+
+/*
+ * Kernel for an arbitrary number of operands (`nop`).
+ *
+ * For each of the `count` steps, the current npy_ubyte element of operands
+ * 0..nop-1 is multiplied together and the product is added into the element
+ * at dataptr[nop].  Afterwards every entry of dataptr[0..nop] is advanced by
+ * sizeof(npy_ubyte), so the caller's dataptr[] array is modified.
+ * `strides` is not read.
+ */
+static void
+ubyte_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ubyte product;
+    int op;
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+
+    for (; count != 0; --count) {
+        product = (*(npy_ubyte *)dataptr[0]);
+        for (op = 1; op < nop; ++op) {
+            product *= (*(npy_ubyte *)dataptr[op]);
+        }
+        /* `op` is left at the index where the factor loop stopped */
+        *(npy_ubyte *)dataptr[nop] = (product +
+                                           (*(npy_ubyte *)dataptr[op]));
+
+        /* Step inputs and output alike to their next element */
+        for (op = 0; op <= nop; ++op) {
+            dataptr[op] += sizeof(npy_ubyte);
+        }
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+
+/*
+ * Reduce a single npy_ubyte input into one output element: the value
+ * returned by ubyte_sum_of_arr() for the `count` elements at dataptr[0] is
+ * added to the npy_ubyte stored at dataptr[1].  `nop` and `strides` are not
+ * read.
+ */
+static NPY_GCC_OPT_3 void
+ubyte_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    npy_ubyte *result;
+    npy_ubyte total;
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+
+    total = ubyte_sum_of_arr((npy_ubyte *)dataptr[0], count);
+    result = (npy_ubyte *)dataptr[1];
+    *result = (total + (*result));
+}
+
+#endif /* 1000 == 1 */
+
+/*
+ * Reduction kernel for an arbitrary number of operands (`nop`) writing to a
+ * single output element.
+ *
+ * Each of the `count` steps multiplies the current npy_ubyte element of
+ * operands 0..nop-1, adds the product to a running npy_ubyte total, and
+ * advances dataptr[i] by strides[i] bytes for every input i (so the input
+ * entries of the caller's dataptr[] array are modified).  Once the loop is
+ * done, the total is added to the element at dataptr[nop]; that pointer is
+ * never advanced.
+ */
+static void
+ubyte_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    npy_ubyte total = 0;
+    npy_ubyte product;
+    int op;
+
+    NPY_EINSUM_DBG_PRINT1("ubyte_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+
+    for (; count != 0; --count) {
+        product = (*(npy_ubyte *)dataptr[0]);
+        for (op = 1; op < nop; ++op) {
+            product *= (*(npy_ubyte *)dataptr[op]);
+        }
+        total += product;
+
+        /* Only the inputs move; the output slot stays where it is */
+        for (op = 0; op < nop; ++op) {
+            dataptr[op] += strides[op];
+        }
+    }
+
+    *((npy_ubyte *)dataptr[nop]) = (total +
+                                    (*((npy_ubyte *)dataptr[nop])));
+
+}
+
+
+
+
+#line 74
+
+#if !0
+/*
+ * Return the sum of the `count` npy_ushort values starting at `data`.
+ * The running total is itself an npy_ushort.
+ */
+static NPY_GCC_OPT_3 npy_ushort ushort_sum_of_arr(npy_ushort *data, npy_intp count)
+{
+    npy_ushort total = 0;
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Add four elements per pass for as long as more than four remain */
+    while (count > 4) {
+        const npy_ushort lower = (data[0]) + (data[1]);
+        const npy_ushort upper = (data[2]) + (data[3]);
+        total +=  lower + upper;
+        data += 4;
+        count -= 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+
+    /* Whatever is left is added one element at a time */
+    while (count > 0) {
+        total += (*data);
+        data++;
+        --count;
+    }
+    return total;
+}
+#endif
+
+#line 131
+/*
+ * Single-operand strided kernel.
+ *
+ * For each of the `count` steps, the npy_ushort at the input cursor is added
+ * to the npy_ushort at the output cursor and the result is stored back to
+ * the output.  The cursors start at dataptr[0] / dataptr[1] and move by
+ * strides[0] / strides[1] bytes per step.  Local copies of the pointers are
+ * walked, so dataptr[] itself is left unchanged.  `nop` is not read.
+ */
+static void
+ushort_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    char *src = dataptr[0];
+    npy_intp src_step = strides[0];
+    char *dst = dataptr[1];
+    npy_intp dst_step = strides[1];
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_one (%d)\n", (int)count);
+
+    for (; count != 0; --count) {
+        npy_ushort *slot = (npy_ushort *)dst;
+
+        *slot = ((*(npy_ushort *)src) +
+                                         (*slot));
+        src += src_step;
+        dst += dst_step;
+    }
+}
+
+#if 1 == 1
+
+/*
+ * Single-operand kernel over npy_ushort arrays:
+ *
+ *     out[k] = in[k] + out[k]
+ *
+ * with dataptr[0] as the input and dataptr[1] as the output.  Groups of 8
+ * elements are handled front to back; the final 0..7 leftover elements are
+ * handled from the highest index down to index 0.  `nop` and `strides` are
+ * not read.
+ */
+static void
+ushort_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *in = (npy_ushort *)dataptr[0];
+    npy_ushort *out = (npy_ushort *)dataptr[1];
+    npy_intp k;
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+    for (;;) {
+        /* This is tested before the main loop to make small counts faster */
+        if (count >= 0 && count <= 7) {
+            for (k = count; k > 0; --k) {
+                out[k - 1] = ((in[k - 1]) +
+                                     (out[k - 1]));
+            }
+            return;
+        }
+
+        /* Consume the data in groups of 8 elements */
+        while (count >= 8) {
+            count -= 8;
+
+            for (k = 0; k < 8; ++k) {
+                out[k] = ((in[k]) +
+                                 (out[k]));
+            }
+
+            in += 8;
+            out += 8;
+        }
+
+        /* Loop back to the small-count test to finish off the remainder */
+    }
+}
+
+#elif 1 == 2 && !0
+
+/*
+ * Multiply-accumulate with a scalar factor:
+ *
+ *     data_out[k] = scalar * data[k] + data_out[k]    for k in [0, count)
+ *
+ * data     - input elements
+ * data_out - elements that are read, updated and written back
+ * scalar   - factor applied to every input element
+ * count    - number of elements to process; nothing happens for count <= 0
+ *
+ * The product is formed in unsigned int.  Two npy_ushort operands would
+ * otherwise be promoted to (signed) int, where a large product overflows,
+ * which is undefined behaviour in C.  Unsigned arithmetic wraps instead and
+ * yields the same low-order bits once the result is stored as npy_ushort.
+ * NOTE(review): assumes npy_ushort is an unsigned type narrower than int
+ * -- confirm against its typedef.
+ */
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_muladd(npy_ushort *data, npy_ushort *data_out, npy_ushort scalar, npy_intp count)
+{
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Groups of 4: all operands of a group are loaded before any result of
+     * that group is stored.
+     */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        npy_ushort src[4];
+        npy_ushort acc[4];
+        int k;
+
+        for (k = 0; k < 4; ++k) {
+            src[k] = (data[k]);
+            acc[k] = (data_out[k]);
+        }
+        for (k = 0; k < 4; ++k) {
+            data_out[k] = (npy_ushort)((unsigned int)scalar * src[k] + acc[k]);
+        }
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Leftover elements, one at a time */
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_ushort b = (*data);
+        const npy_ushort c = (*data_out);
+        *data_out = (npy_ushort)((unsigned int)scalar * b + c);
+    }
+}
+
+/*
+ * Element-wise multiply-accumulate over two npy_ushort inputs:
+ *
+ *     out[k] = in0[k] * in1[k] + out[k]    for k in [0, count)
+ *
+ * dataptr[0] and dataptr[1] are the inputs and dataptr[2] is the output; all
+ * three are indexed as plain npy_ushort arrays.  `nop` and `strides` are not
+ * read.
+ *
+ * The product is formed in unsigned int.  Two npy_ushort operands would
+ * otherwise be promoted to (signed) int, where a large product overflows,
+ * which is undefined behaviour in C.  Unsigned arithmetic wraps instead and
+ * yields the same low-order bits once the result is stored as npy_ushort.
+ * NOTE(review): assumes npy_ushort is an unsigned type narrower than int
+ * -- confirm against its typedef.
+ */
+static void
+ushort_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort *data1 = (npy_ushort *)dataptr[1];
+    npy_ushort *data_out = (npy_ushort *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Groups of 4: all operands of a group are loaded before any result of
+     * that group is stored.
+     */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        npy_ushort lhs[4];
+        npy_ushort rhs[4];
+        npy_ushort acc[4];
+        int k;
+
+        for (k = 0; k < 4; ++k) {
+            lhs[k] = (data0[k]);
+            rhs[k] = (data1[k]);
+            acc[k] = (data_out[k]);
+        }
+        for (k = 0; k < 4; ++k) {
+            data_out[k] = (npy_ushort)((unsigned int)lhs[k] * rhs[k] + acc[k]);
+        }
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Leftover elements, one at a time */
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_ushort a = (*data0);
+        const npy_ushort b = (*data1);
+        const npy_ushort c = (*data_out);
+        *data_out = (npy_ushort)((unsigned int)a * b + c);
+    }
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand kernel: operand 0 is read once as a scalar, while operand 1
+ * and the output are handled as npy_ushort arrays of `count` elements.
+ * The actual work is delegated to ushort_sum_of_products_muladd(), which
+ * receives the array operand, the output, the scalar and the count.
+ * `nop` is not used and `strides` is explicitly unused.
+ */
+static void
+ushort_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *const out = (npy_ushort *)dataptr[2];
+    npy_ushort *const src = (npy_ushort *)dataptr[1];
+    const npy_ushort scalar = *(npy_ushort *)dataptr[0];
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    ushort_sum_of_products_muladd(src, out, scalar, count);
+}
+
+/*
+ * Two-operand kernel: operand 1 is read once as a scalar, while operand 0
+ * and the output are handled as npy_ushort arrays of `count` elements.
+ * The actual work is delegated to ushort_sum_of_products_muladd(), which
+ * receives the array operand, the output, the scalar and the count.
+ * `nop` is not used and `strides` is explicitly unused.
+ */
+static void
+ushort_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *const out = (npy_ushort *)dataptr[2];
+    npy_ushort *const src = (npy_ushort *)dataptr[0];
+    const npy_ushort scalar = *(npy_ushort *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    ushort_sum_of_products_muladd(src, out, scalar, count);
+}
+
+/*
+ * Two-operand kernel for contiguous inputs and a single output element:
+ * accumulates data0[i] * data1[i] over `count` elements and adds the total
+ * to the npy_ushort at dataptr[2].
+ *
+ *   nop      not used by this function
+ *   dataptr  [0], [1]: input arrays read as npy_ushort; [2]: output element
+ *   strides  unused
+ *   count    number of elements to process
+ *
+ * In the scalar path the products are formed in unsigned int.  A plain
+ * `a * b` on two npy_ushort operands is subject to the integer promotions;
+ * where that promotes to signed int, large operands overflow it, which is
+ * undefined behaviour.  Unsigned arithmetic wraps instead, and narrowing
+ * the result back to npy_ushort keeps the low-order bits of the product.
+ */
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort *data1 = (npy_ushort *)dataptr[1];
+    npy_ushort accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+    /* The npyv_* path below is compiled out (#if 0) for this type. */
+#if 0 // NPYV check for npy_ushort
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_u16;
+    npyv_u16 v_accum = npyv_zero_u16();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u16 a0 = npyv_loada_u16(data0 + vstep * 0);
+            npyv_u16 b0 = npyv_loada_u16(data1 + vstep * 0);
+            
+#line 501
+            npyv_u16 a1 = npyv_loada_u16(data0 + vstep * 1);
+            npyv_u16 b1 = npyv_loada_u16(data1 + vstep * 1);
+            
+#line 501
+            npyv_u16 a2 = npyv_loada_u16(data0 + vstep * 2);
+            npyv_u16 b2 = npyv_loada_u16(data1 + vstep * 2);
+            
+#line 501
+            npyv_u16 a3 = npyv_loada_u16(data0 + vstep * 3);
+            npyv_u16 b3 = npyv_loada_u16(data1 + vstep * 3);
+            
+            npyv_u16 ab3 = npyv_muladd_u16(a3, b3, v_accum);
+            npyv_u16 ab2 = npyv_muladd_u16(a2, b2, ab3);
+            npyv_u16 ab1 = npyv_muladd_u16(a1, b1, ab2);
+                   v_accum = npyv_muladd_u16(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u16 a0 = npyv_load_u16(data0 + vstep * 0);
+            npyv_u16 b0 = npyv_load_u16(data1 + vstep * 0);
+            
+#line 501
+            npyv_u16 a1 = npyv_load_u16(data0 + vstep * 1);
+            npyv_u16 b1 = npyv_load_u16(data1 + vstep * 1);
+            
+#line 501
+            npyv_u16 a2 = npyv_load_u16(data0 + vstep * 2);
+            npyv_u16 b2 = npyv_load_u16(data1 + vstep * 2);
+            
+#line 501
+            npyv_u16 a3 = npyv_load_u16(data0 + vstep * 3);
+            npyv_u16 b3 = npyv_load_u16(data1 + vstep * 3);
+            
+            npyv_u16 ab3 = npyv_muladd_u16(a3, b3, v_accum);
+            npyv_u16 ab2 = npyv_muladd_u16(a2, b2, ab3);
+            npyv_u16 ab1 = npyv_muladd_u16(a1, b1, ab2);
+                   v_accum = npyv_muladd_u16(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_u16 a = npyv_load_tillz_u16(data0, count);
+        npyv_u16 b = npyv_load_tillz_u16(data1, count);
+        v_accum = npyv_muladd_u16(a, b, v_accum);
+    }
+    accum = npyv_sum_u16(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four elements per iteration; each product is multiplied as unsigned. */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_ushort ab0 = (unsigned int)(data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_ushort ab1 = (unsigned int)(data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_ushort ab2 = (unsigned int)(data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_ushort ab3 = (unsigned int)(data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining elements (or all of them when the unrolled loop is disabled). */
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_ushort a = (*data0);
+        const npy_ushort b = (*data1);
+        /* Unsigned multiply: avoids overflow of the promoted signed int. */
+        accum += (unsigned int)a * b;
+    }
+#endif // NPYV check for npy_ushort
+    /* Add the accumulated total to the existing output value. */
+    *(npy_ushort *)dataptr[2] = ((*(npy_ushort *)dataptr[2]) + accum);
+}
+
+/*
+ * Two-operand kernel with a scalar first operand and a single output
+ * element: sums `count` elements of operand 1 with ushort_sum_of_arr(),
+ * multiplies that sum by the scalar read from dataptr[0], and adds the
+ * product to the npy_ushort at dataptr[2].
+ *
+ * `nop` is not used and `strides` is explicitly unused.
+ *
+ * The product is formed in unsigned int: `value0 * accum` on two
+ * npy_ushort operands is subject to the integer promotions, and where that
+ * promotes to signed int large operands overflow it (undefined behaviour).
+ * Unsigned arithmetic wraps, and the store narrows back to npy_ushort.
+ */
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data1 = (npy_ushort *)dataptr[1];
+    npy_ushort value0 = (*(npy_ushort *)dataptr[0]);
+    npy_ushort accum = ushort_sum_of_arr(data1, count);
+    *(npy_ushort *)dataptr[2] = ((*(npy_ushort *)dataptr[2]) +
+                                 (unsigned int)value0 * accum);
+}
+
+/*
+ * Two-operand kernel with a scalar second operand and a single output
+ * element: sums `count` elements of operand 0 with ushort_sum_of_arr(),
+ * multiplies that sum by the scalar read from dataptr[1], and adds the
+ * product to the npy_ushort at dataptr[2].
+ *
+ * `nop` is not used and `strides` is explicitly unused.
+ *
+ * The product is formed in unsigned int: `value1 * accum` on two
+ * npy_ushort operands is subject to the integer promotions, and where that
+ * promotes to signed int large operands overflow it (undefined behaviour).
+ * Unsigned arithmetic wraps, and the store narrows back to npy_ushort.
+ */
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort value1 = (*(npy_ushort *)dataptr[1]);
+    npy_ushort accum = ushort_sum_of_arr(data0, count);
+    *(npy_ushort *)dataptr[2] = ((*(npy_ushort *)dataptr[2]) +
+                                 (unsigned int)value1 * accum);
+}
+
+#elif 1 == 3 && !0
+
+/*
+ * Three-operand kernel for contiguous npy_ushort data: for each of `count`
+ * elements computes
+ *     data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]
+ *
+ *   nop      not used by this function
+ *   dataptr  [0]..[2]: input arrays; [3]: output array (read and written)
+ *   strides  unused
+ *   count    number of elements to process
+ *
+ * The enclosing `#elif 1 == 3 && !0` condition is false, so this definition
+ * is not compiled in this expansion.
+ *
+ * NOTE(review): the products are evaluated after the integer promotions; if
+ * npy_ushort is narrower than int, large operands could overflow signed
+ * int -- confirm whether that needs handling before enabling this code.
+ */
+static void
+ushort_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort *data1 = (npy_ushort *)dataptr[1];
+    npy_ushort *data2 = (npy_ushort *)dataptr[2];
+    npy_ushort *data_out = (npy_ushort *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /*
+     * Finish off the loop: handle the remaining elements one at a time,
+     * returning as soon as the remaining count reaches zero.
+     */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1 > 3 || @complex */
+
+/*
+ * Generic contiguous kernel driven by `nop`.  For each of `count` steps it
+ * multiplies together the current npy_ushort element of operands
+ * 0..nop-1, adds the current element of operand `nop`, stores the result
+ * back into operand `nop`, and then advances all nop+1 pointers in
+ * `dataptr` by sizeof(npy_ushort).
+ *
+ *   nop      number of input operands; dataptr[nop] is the output
+ *   dataptr  operand pointers; the entries are advanced in place
+ *   strides  unused
+ *   count    number of elements to process
+ *
+ * Only the `#if !0` branch is compiled; the complex-number branch after
+ * `#else` is never selected.
+ */
+static void
+ushort_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ushort *)dataptr[i]);
+        }
+        /* Reading dataptr[i] after the product loop reads the output operand. */
+        *(npy_ushort *)dataptr[nop] = (temp +
+                                           (*(npy_ushort *)dataptr[i]));
+        /* Step every operand, output included, to its next element. */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_ushort);
+        }
+#else /* complex */
+#  if 1 <= 3
+#    define _SUMPROD_NOP 1
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_ushort re, im, tmp;
+        int i;
+        re = ((npy_ushort *)dataptr[0])[0];
+        im = ((npy_ushort *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ushort *)dataptr[i])[0] -
+                  im * ((npy_ushort *)dataptr[i])[1];
+            im = re * ((npy_ushort *)dataptr[i])[1] +
+                 im * ((npy_ushort *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ushort *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ushort *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_ushort);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1 */
+
+#if 1 == 1
+
+/*
+ * One-operand kernel for a contiguous input and a single output element:
+ * sums `count` npy_ushort elements starting at dataptr[0] with
+ * ushort_sum_of_arr() and adds the total to the npy_ushort at dataptr[1].
+ *
+ * `nop` and `strides` are not used by the compiled branch.  Only the
+ * `#if !0` branch is compiled; the complex-number branch after `#else` is
+ * never selected.
+ */
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_ushort *data = (npy_ushort *)dataptr[0];
+    npy_ushort accum = ushort_sum_of_arr(data, count);
+    /* Accumulate into the existing output value rather than overwriting it. */
+    *((npy_ushort *)dataptr[1]) = (accum + (*((npy_ushort *)dataptr[1])));
+#else
+    npy_ushort accum_re = 0, accum_im = 0;
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_ushort re01 = data0[0] + data0[2];
+        const npy_ushort re23 = data0[4] + data0[6];
+        const npy_ushort im13 = data0[1] + data0[3];
+        const npy_ushort im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_ushort *)dataptr[1])[0] += accum_re;
+    ((npy_ushort *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1 == 1 */
+
+/*
+ * One-operand kernel for a strided input and a single output element.
+ *
+ * With the preprocessor conditions as written, the compiled code walks
+ * `count` npy_ushort elements starting at dataptr[0], stepping the byte
+ * pointer by strides[0] each time, sums them into `accum`, and finally
+ * adds `accum` to the npy_ushort at dataptr[1].
+ *
+ *   nop      not used by the compiled branches
+ *   dataptr  [0]: input; [1]: output element (read and written)
+ *   strides  [0]: byte step between consecutive input elements
+ *   count    number of elements to process
+ *
+ * The remaining branches (two/three operands, generic nop, complex) have
+ * conditions that evaluate to false here and are not compiled.
+ */
+static void
+ushort_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_ushort accum_re = 0, accum_im = 0;
+#else
+    npy_ushort accum = 0;
+#endif
+
+#if (1 == 1) || (1 <= 3 && !0)
+    /* Local copy of the input pointer, so dataptr[0] itself is left as is. */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 1 == 1
+        /* Compiled branch: plain sum of the strided input. */
+        accum += (*(npy_ushort *)data0);
+        data0 += stride0;
+#  elif 1 == 2
+        accum += (*(npy_ushort *)data0) *
+                 (*(npy_ushort *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1 == 3
+        accum += (*(npy_ushort *)data0) *
+                 (*(npy_ushort *)data1) *
+                 (*(npy_ushort *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ushort *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        accum_re += ((npy_ushort *)data0)[0];
+        accum_im += ((npy_ushort *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ushort re, im, tmp;
+        int i;
+        re = ((npy_ushort *)dataptr[0])[0];
+        im = ((npy_ushort *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ushort *)dataptr[i])[0] -
+                  im * ((npy_ushort *)dataptr[i])[1];
+            im = re * ((npy_ushort *)dataptr[i])[1] +
+                 im * ((npy_ushort *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0
+#  if 1 <= 3
+    ((npy_ushort *)dataptr[1])[0] += accum_re;
+    ((npy_ushort *)dataptr[1])[1] += accum_im;
+#  else
+    ((npy_ushort *)dataptr[nop])[0] += accum_re;
+    ((npy_ushort *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1 <= 3
+    /* Compiled branch: add the total to the existing output value. */
+    *((npy_ushort *)dataptr[1]) = (accum +
+                                    (*((npy_ushort *)dataptr[1])));
+#  else
+    *((npy_ushort *)dataptr[nop]) = (accum +
+                                    (*((npy_ushort *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+/*
+ * Two-operand strided kernel.
+ *
+ * With the preprocessor conditions as written, the compiled code performs,
+ * `count` times,
+ *     *out = (*in0) * (*in1) + (*out)
+ * on npy_ushort values and then advances the three byte pointers by
+ * strides[0], strides[1] and strides[2] respectively.
+ *
+ *   nop      not used by the compiled branches
+ *   dataptr  [0], [1]: inputs; [2]: output (read and written)
+ *   strides  byte steps for dataptr[0], dataptr[1] and dataptr[2]
+ *   count    number of elements to process
+ *
+ * The product in the compiled branch is formed in unsigned int.  A plain
+ * product of two npy_ushort operands is subject to the integer promotions;
+ * where that promotes to signed int, large operands overflow it, which is
+ * undefined behaviour.  Unsigned arithmetic wraps, and the store narrows
+ * the result back to npy_ushort.  The other branches are not compiled in
+ * this expansion and are left as generated.
+ */
+#line 131
+static void
+ushort_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data_out = dataptr[2];
+    npy_intp stride_out = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_two (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 2 == 1
+        *(npy_ushort *)data_out = ((*(npy_ushort *)data0) +
+                                         (*(npy_ushort *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 2 == 2
+        /* Compiled branch; unsigned multiply avoids signed-int overflow. */
+        *(npy_ushort *)data_out = ((unsigned int)(*(npy_ushort *)data0) *
+                                         (*(npy_ushort *)data1) +
+                                         (*(npy_ushort *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 2 == 3
+        *(npy_ushort *)data_out = ((*(npy_ushort *)data0) *
+                                         (*(npy_ushort *)data1) *
+                                         (*(npy_ushort *)data2) +
+                                         (*(npy_ushort *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ushort *)dataptr[i]);
+        }
+        *(npy_ushort *)dataptr[nop] = (temp +
+                                           (*(npy_ushort *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        ((npy_ushort *)data_out)[0] = ((npy_ushort *)data0)[0] +
+                                         ((npy_ushort *)data_out)[0];
+        ((npy_ushort *)data_out)[1] = ((npy_ushort *)data0)[1] +
+                                         ((npy_ushort *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ushort re, im, tmp;
+        int i;
+        re = ((npy_ushort *)dataptr[0])[0];
+        im = ((npy_ushort *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ushort *)dataptr[i])[0] -
+                  im * ((npy_ushort *)dataptr[i])[1];
+            im = re * ((npy_ushort *)dataptr[i])[1] +
+                 im * ((npy_ushort *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ushort *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ushort *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 2 == 1
+
+/*
+ * One-operand kernel for contiguous npy_ushort data: for each of `count`
+ * elements computes data_out[i] = data0[i] + data_out[i].
+ *
+ *   nop      not used by this function
+ *   dataptr  [0]: input array; [1]: output array (read and written)
+ *   strides  unused
+ *   count    number of elements to process
+ *
+ * Control flow: the switch sits before the unrolled loop.  Counts 0..7 are
+ * handled entirely by the switch, which enters at `case count` and falls
+ * through the lower cases down to `case 0`, where the function returns.
+ * A count of 8 or more matches no case, so execution continues into the
+ * loop, which consumes blocks of 8 and then jumps back to the label to
+ * finish the remainder through the switch.
+ *
+ * The enclosing `#if 2 == 1` condition is false, so this definition is not
+ * compiled in this expansion.  Within the body only the `#if !0` branches
+ * would be selected; the complex-number `#else` branches never are.
+ */
+static void
+ushort_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort *data_out = (npy_ushort *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_ushort *)data_out + 2*6)[0] =
+                                    ((npy_ushort *)data0 + 2*6)[0] +
+                                    ((npy_ushort *)data_out + 2*6)[0];
+            ((npy_ushort *)data_out + 2*6)[1] =
+                                    ((npy_ushort *)data0 + 2*6)[1] +
+                                    ((npy_ushort *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_ushort *)data_out + 2*5)[0] =
+                                    ((npy_ushort *)data0 + 2*5)[0] +
+                                    ((npy_ushort *)data_out + 2*5)[0];
+            ((npy_ushort *)data_out + 2*5)[1] =
+                                    ((npy_ushort *)data0 + 2*5)[1] +
+                                    ((npy_ushort *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_ushort *)data_out + 2*4)[0] =
+                                    ((npy_ushort *)data0 + 2*4)[0] +
+                                    ((npy_ushort *)data_out + 2*4)[0];
+            ((npy_ushort *)data_out + 2*4)[1] =
+                                    ((npy_ushort *)data0 + 2*4)[1] +
+                                    ((npy_ushort *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_ushort *)data_out + 2*3)[0] =
+                                    ((npy_ushort *)data0 + 2*3)[0] +
+                                    ((npy_ushort *)data_out + 2*3)[0];
+            ((npy_ushort *)data_out + 2*3)[1] =
+                                    ((npy_ushort *)data0 + 2*3)[1] +
+                                    ((npy_ushort *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_ushort *)data_out + 2*2)[0] =
+                                    ((npy_ushort *)data0 + 2*2)[0] +
+                                    ((npy_ushort *)data_out + 2*2)[0];
+            ((npy_ushort *)data_out + 2*2)[1] =
+                                    ((npy_ushort *)data0 + 2*2)[1] +
+                                    ((npy_ushort *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_ushort *)data_out + 2*1)[0] =
+                                    ((npy_ushort *)data0 + 2*1)[0] +
+                                    ((npy_ushort *)data_out + 2*1)[0];
+            ((npy_ushort *)data_out + 2*1)[1] =
+                                    ((npy_ushort *)data0 + 2*1)[1] +
+                                    ((npy_ushort *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_ushort *)data_out + 2*0)[0] =
+                                    ((npy_ushort *)data0 + 2*0)[0] +
+                                    ((npy_ushort *)data_out + 2*0)[0];
+            ((npy_ushort *)data_out + 2*0)[1] =
+                                    ((npy_ushort *)data0 + 2*0)[1] +
+                                    ((npy_ushort *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_ushort *)data_out + 2*0)[0] =
+                                ((npy_ushort *)data0 + 2*0)[0] +
+                                ((npy_ushort *)data_out + 2*0)[0];
+        ((npy_ushort *)data_out + 2*0)[1] =
+                                ((npy_ushort *)data0 + 2*0)[1] +
+                                ((npy_ushort *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_ushort *)data_out + 2*1)[0] =
+                                ((npy_ushort *)data0 + 2*1)[0] +
+                                ((npy_ushort *)data_out + 2*1)[0];
+        ((npy_ushort *)data_out + 2*1)[1] =
+                                ((npy_ushort *)data0 + 2*1)[1] +
+                                ((npy_ushort *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_ushort *)data_out + 2*2)[0] =
+                                ((npy_ushort *)data0 + 2*2)[0] +
+                                ((npy_ushort *)data_out + 2*2)[0];
+        ((npy_ushort *)data_out + 2*2)[1] =
+                                ((npy_ushort *)data0 + 2*2)[1] +
+                                ((npy_ushort *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_ushort *)data_out + 2*3)[0] =
+                                ((npy_ushort *)data0 + 2*3)[0] +
+                                ((npy_ushort *)data_out + 2*3)[0];
+        ((npy_ushort *)data_out + 2*3)[1] =
+                                ((npy_ushort *)data0 + 2*3)[1] +
+                                ((npy_ushort *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_ushort *)data_out + 2*4)[0] =
+                                ((npy_ushort *)data0 + 2*4)[0] +
+                                ((npy_ushort *)data_out + 2*4)[0];
+        ((npy_ushort *)data_out + 2*4)[1] =
+                                ((npy_ushort *)data0 + 2*4)[1] +
+                                ((npy_ushort *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_ushort *)data_out + 2*5)[0] =
+                                ((npy_ushort *)data0 + 2*5)[0] +
+                                ((npy_ushort *)data_out + 2*5)[0];
+        ((npy_ushort *)data_out + 2*5)[1] =
+                                ((npy_ushort *)data0 + 2*5)[1] +
+                                ((npy_ushort *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_ushort *)data_out + 2*6)[0] =
+                                ((npy_ushort *)data0 + 2*6)[0] +
+                                ((npy_ushort *)data_out + 2*6)[0];
+        ((npy_ushort *)data_out + 2*6)[1] =
+                                ((npy_ushort *)data0 + 2*6)[1] +
+                                ((npy_ushort *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_ushort *)data_out + 2*7)[0] =
+                                ((npy_ushort *)data0 + 2*7)[0] +
+                                ((npy_ushort *)data_out + 2*7)[0];
+        ((npy_ushort *)data_out + 2*7)[1] =
+                                ((npy_ushort *)data0 + 2*7)[1] +
+                                ((npy_ushort *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 2 == 2 && !0
+
+// Multiply-add over `count` elements: data_out[i] = data[i] * scalar + data_out[i]
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_muladd(npy_ushort *data, npy_ushort *data_out, npy_ushort scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_ushort
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u16;
+    const npyv_u16 v_scalar = npyv_setall_u16(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u16 b0 = npyv_loada_u16(data + vstep * 0);
+            npyv_u16 c0 = npyv_loada_u16(data_out + vstep * 0);
+            
+#line 312
+            npyv_u16 b1 = npyv_loada_u16(data + vstep * 1);
+            npyv_u16 c1 = npyv_loada_u16(data_out + vstep * 1);
+            
+#line 312
+            npyv_u16 b2 = npyv_loada_u16(data + vstep * 2);
+            npyv_u16 c2 = npyv_loada_u16(data_out + vstep * 2);
+            
+#line 312
+            npyv_u16 b3 = npyv_loada_u16(data + vstep * 3);
+            npyv_u16 c3 = npyv_loada_u16(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u16 abc0 = npyv_muladd_u16(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u16 abc1 = npyv_muladd_u16(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u16 abc2 = npyv_muladd_u16(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u16 abc3 = npyv_muladd_u16(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_u16(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_u16(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_u16(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_u16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u16 b0 = npyv_load_u16(data + vstep * 0);
+            npyv_u16 c0 = npyv_load_u16(data_out + vstep * 0);
+            
+#line 312
+            npyv_u16 b1 = npyv_load_u16(data + vstep * 1);
+            npyv_u16 c1 = npyv_load_u16(data_out + vstep * 1);
+            
+#line 312
+            npyv_u16 b2 = npyv_load_u16(data + vstep * 2);
+            npyv_u16 c2 = npyv_load_u16(data_out + vstep * 2);
+            
+#line 312
+            npyv_u16 b3 = npyv_load_u16(data + vstep * 3);
+            npyv_u16 c3 = npyv_load_u16(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u16 abc0 = npyv_muladd_u16(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u16 abc1 = npyv_muladd_u16(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u16 abc2 = npyv_muladd_u16(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u16 abc3 = npyv_muladd_u16(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_u16(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_u16(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_u16(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_u16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_u16 a = npyv_load_tillz_u16(data, count);
+        npyv_u16 b = npyv_load_tillz_u16(data_out, count);
+        npyv_store_till_u16(data_out, count, npyv_muladd_u16(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_ushort b0 = (data[0]);
+        const npy_ushort c0 = (data_out[0]);
+        
+#line 340
+        const npy_ushort b1 = (data[1]);
+        const npy_ushort c1 = (data_out[1]);
+        
+#line 340
+        const npy_ushort b2 = (data[2]);
+        const npy_ushort c2 = (data_out[2]);
+        
+#line 340
+        const npy_ushort b3 = (data[3]);
+        const npy_ushort c3 = (data_out[3]);
+        
+        #line 346
+        const npy_ushort abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_ushort abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_ushort abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_ushort abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_ushort b = (*data);
+        const npy_ushort c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_ushort
+}
+
+/*
+ * Contiguous two-operand kernel: for i in [0, count)
+ *     data_out[i] = data0[i] * data1[i] + data_out[i]
+ * with dataptr[0], dataptr[1] the inputs and dataptr[2] the output, all
+ * accessed as packed npy_ushort arrays.  `nop` is not referenced and
+ * `strides` is marked unused.
+ *
+ * The NPYV (SIMD) branch is compiled out (`#if 0`) for this type, so only
+ * the scalar path below is built.
+ *
+ * The multiplications are done in `unsigned int`: two npy_ushort operands
+ * would otherwise be promoted to signed `int`, and a product above INT_MAX
+ * (e.g. 65535 * 65535) is undefined behaviour.  Narrowing back to
+ * npy_ushort yields the intended modulo-2^16 result.
+ */
+static void
+ushort_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort *data1 = (npy_ushort *)dataptr[1];
+    npy_ushort *data_out = (npy_ushort *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_ushort
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u16;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u16 a0 = npyv_loada_u16(data0 + vstep * 0);
+            npyv_u16 b0 = npyv_loada_u16(data1 + vstep * 0);
+            npyv_u16 c0 = npyv_loada_u16(data_out + vstep * 0);
+
+#line 390
+            npyv_u16 a1 = npyv_loada_u16(data0 + vstep * 1);
+            npyv_u16 b1 = npyv_loada_u16(data1 + vstep * 1);
+            npyv_u16 c1 = npyv_loada_u16(data_out + vstep * 1);
+
+#line 390
+            npyv_u16 a2 = npyv_loada_u16(data0 + vstep * 2);
+            npyv_u16 b2 = npyv_loada_u16(data1 + vstep * 2);
+            npyv_u16 c2 = npyv_loada_u16(data_out + vstep * 2);
+
+#line 390
+            npyv_u16 a3 = npyv_loada_u16(data0 + vstep * 3);
+            npyv_u16 b3 = npyv_loada_u16(data1 + vstep * 3);
+            npyv_u16 c3 = npyv_loada_u16(data_out + vstep * 3);
+
+            #line 397
+            npyv_u16 abc0 = npyv_muladd_u16(a0, b0, c0);
+
+#line 397
+            npyv_u16 abc1 = npyv_muladd_u16(a1, b1, c1);
+
+#line 397
+            npyv_u16 abc2 = npyv_muladd_u16(a2, b2, c2);
+
+#line 397
+            npyv_u16 abc3 = npyv_muladd_u16(a3, b3, c3);
+
+            #line 402
+            npyv_storea_u16(data_out + vstep * 0, abc0);
+
+#line 402
+            npyv_storea_u16(data_out + vstep * 1, abc1);
+
+#line 402
+            npyv_storea_u16(data_out + vstep * 2, abc2);
+
+#line 402
+            npyv_storea_u16(data_out + vstep * 3, abc3);
+
+        }
+    }
+
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u16 a0 = npyv_load_u16(data0 + vstep * 0);
+            npyv_u16 b0 = npyv_load_u16(data1 + vstep * 0);
+            npyv_u16 c0 = npyv_load_u16(data_out + vstep * 0);
+
+#line 390
+            npyv_u16 a1 = npyv_load_u16(data0 + vstep * 1);
+            npyv_u16 b1 = npyv_load_u16(data1 + vstep * 1);
+            npyv_u16 c1 = npyv_load_u16(data_out + vstep * 1);
+
+#line 390
+            npyv_u16 a2 = npyv_load_u16(data0 + vstep * 2);
+            npyv_u16 b2 = npyv_load_u16(data1 + vstep * 2);
+            npyv_u16 c2 = npyv_load_u16(data_out + vstep * 2);
+
+#line 390
+            npyv_u16 a3 = npyv_load_u16(data0 + vstep * 3);
+            npyv_u16 b3 = npyv_load_u16(data1 + vstep * 3);
+            npyv_u16 c3 = npyv_load_u16(data_out + vstep * 3);
+
+            #line 397
+            npyv_u16 abc0 = npyv_muladd_u16(a0, b0, c0);
+
+#line 397
+            npyv_u16 abc1 = npyv_muladd_u16(a1, b1, c1);
+
+#line 397
+            npyv_u16 abc2 = npyv_muladd_u16(a2, b2, c2);
+
+#line 397
+            npyv_u16 abc3 = npyv_muladd_u16(a3, b3, c3);
+
+            #line 402
+            npyv_store_u16(data_out + vstep * 0, abc0);
+
+#line 402
+            npyv_store_u16(data_out + vstep * 1, abc1);
+
+#line 402
+            npyv_store_u16(data_out + vstep * 2, abc2);
+
+#line 402
+            npyv_store_u16(data_out + vstep * 3, abc3);
+
+        }
+    }
+
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_u16 a = npyv_load_tillz_u16(data0, count);
+        npyv_u16 b = npyv_load_tillz_u16(data1, count);
+        npyv_u16 c = npyv_load_tillz_u16(data_out, count);
+        npyv_store_till_u16(data_out, count, npyv_muladd_u16(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Scalar path, manually unrolled by 4: load, multiply-add, store. */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_ushort a0 = (data0[0]);
+        const npy_ushort b0 = (data1[0]);
+        const npy_ushort c0 = (data_out[0]);
+
+#line 420
+        const npy_ushort a1 = (data0[1]);
+        const npy_ushort b1 = (data1[1]);
+        const npy_ushort c1 = (data_out[1]);
+
+#line 420
+        const npy_ushort a2 = (data0[2]);
+        const npy_ushort b2 = (data1[2]);
+        const npy_ushort c2 = (data_out[2]);
+
+#line 420
+        const npy_ushort a3 = (data0[3]);
+        const npy_ushort b3 = (data1[3]);
+        const npy_ushort c3 = (data_out[3]);
+
+        /* Multiply in unsigned int to avoid signed-int overflow (UB). */
+        #line 427
+        const npy_ushort abc0 = (npy_ushort)((unsigned int)a0 * b0 + c0);
+
+#line 427
+        const npy_ushort abc1 = (npy_ushort)((unsigned int)a1 * b1 + c1);
+
+#line 427
+        const npy_ushort abc2 = (npy_ushort)((unsigned int)a2 * b2 + c2);
+
+#line 427
+        const npy_ushort abc3 = (npy_ushort)((unsigned int)a3 * b3 + c3);
+
+        #line 432
+        data_out[0] = (abc0);
+
+#line 432
+        data_out[1] = (abc1);
+
+#line 432
+        data_out[2] = (abc2);
+
+#line 432
+        data_out[3] = (abc3);
+
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining 0..3 elements (or all of them when unrolling is disabled). */
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_ushort a = (*data0);
+        const npy_ushort b = (*data1);
+        const npy_ushort c = (*data_out);
+        *data_out = (npy_ushort)((unsigned int)a * b + c);
+    }
+#endif // NPYV check for npy_ushort
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand specialization which, going by its name, handles a stride-0
+ * first operand with a contiguous second operand and contiguous output.
+ * A single npy_ushort is read from dataptr[0] and passed as the scalar to
+ * ushort_sum_of_products_muladd() together with the dataptr[1] array and
+ * the dataptr[2] output.  `nop` is not referenced; `strides` is marked
+ * unused.
+ */
+static void
+ushort_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ushort scalar = *(npy_ushort *)dataptr[0];
+    npy_ushort *const src = (npy_ushort *)dataptr[1];
+    npy_ushort *const dst = (npy_ushort *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    ushort_sum_of_products_muladd(src, dst, scalar, count);
+}
+
+/*
+ * Mirror image of the stride0/contig variant: here the single npy_ushort
+ * scalar comes from dataptr[1] and the array operand from dataptr[0]; the
+ * output is dataptr[2].  The work is delegated to
+ * ushort_sum_of_products_muladd().  `nop` is not referenced; `strides` is
+ * marked unused.
+ */
+static void
+ushort_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ushort scalar = *(npy_ushort *)dataptr[1];
+    npy_ushort *const src = (npy_ushort *)dataptr[0];
+    npy_ushort *const dst = (npy_ushort *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    ushort_sum_of_products_muladd(src, dst, scalar, count);
+}
+
+/*
+ * Dot-product style reduction: accumulates data0[i] * data1[i] over
+ * i in [0, count) and adds the total to the single npy_ushort at
+ * dataptr[2].  Both inputs are walked as packed npy_ushort arrays.
+ * `nop` is not referenced and `strides` is marked unused.
+ *
+ * The NPYV (SIMD) branch is compiled out (`#if 0`) for this type.
+ *
+ * Products are formed in `unsigned int`; multiplying two npy_ushort values
+ * directly promotes them to signed `int`, where a product above INT_MAX is
+ * undefined behaviour.  The narrowed result is the same modulo 2^16.
+ */
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort *data1 = (npy_ushort *)dataptr[1];
+    npy_ushort accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_ushort
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_u16;
+    npyv_u16 v_accum = npyv_zero_u16();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u16 a0 = npyv_loada_u16(data0 + vstep * 0);
+            npyv_u16 b0 = npyv_loada_u16(data1 + vstep * 0);
+
+#line 501
+            npyv_u16 a1 = npyv_loada_u16(data0 + vstep * 1);
+            npyv_u16 b1 = npyv_loada_u16(data1 + vstep * 1);
+
+#line 501
+            npyv_u16 a2 = npyv_loada_u16(data0 + vstep * 2);
+            npyv_u16 b2 = npyv_loada_u16(data1 + vstep * 2);
+
+#line 501
+            npyv_u16 a3 = npyv_loada_u16(data0 + vstep * 3);
+            npyv_u16 b3 = npyv_loada_u16(data1 + vstep * 3);
+
+            npyv_u16 ab3 = npyv_muladd_u16(a3, b3, v_accum);
+            npyv_u16 ab2 = npyv_muladd_u16(a2, b2, ab3);
+            npyv_u16 ab1 = npyv_muladd_u16(a1, b1, ab2);
+                   v_accum = npyv_muladd_u16(a0, b0, ab1);
+        }
+    }
+
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u16 a0 = npyv_load_u16(data0 + vstep * 0);
+            npyv_u16 b0 = npyv_load_u16(data1 + vstep * 0);
+
+#line 501
+            npyv_u16 a1 = npyv_load_u16(data0 + vstep * 1);
+            npyv_u16 b1 = npyv_load_u16(data1 + vstep * 1);
+
+#line 501
+            npyv_u16 a2 = npyv_load_u16(data0 + vstep * 2);
+            npyv_u16 b2 = npyv_load_u16(data1 + vstep * 2);
+
+#line 501
+            npyv_u16 a3 = npyv_load_u16(data0 + vstep * 3);
+            npyv_u16 b3 = npyv_load_u16(data1 + vstep * 3);
+
+            npyv_u16 ab3 = npyv_muladd_u16(a3, b3, v_accum);
+            npyv_u16 ab2 = npyv_muladd_u16(a2, b2, ab3);
+            npyv_u16 ab1 = npyv_muladd_u16(a1, b1, ab2);
+                   v_accum = npyv_muladd_u16(a0, b0, ab1);
+        }
+    }
+
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_u16 a = npyv_load_tillz_u16(data0, count);
+        npyv_u16 b = npyv_load_tillz_u16(data1, count);
+        v_accum = npyv_muladd_u16(a, b, v_accum);
+    }
+    accum = npyv_sum_u16(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Scalar path, unrolled by 4; products use unsigned int (see header). */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_ushort ab0 = (npy_ushort)((unsigned int)(data0[0]) * (data1[0]));
+
+#line 524
+        const npy_ushort ab1 = (npy_ushort)((unsigned int)(data0[1]) * (data1[1]));
+
+#line 524
+        const npy_ushort ab2 = (npy_ushort)((unsigned int)(data0[2]) * (data1[2]));
+
+#line 524
+        const npy_ushort ab3 = (npy_ushort)((unsigned int)(data0[3]) * (data1[3]));
+
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining elements, one at a time. */
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_ushort a = (*data0);
+        const npy_ushort b = (*data1);
+        accum += (npy_ushort)((unsigned int)a * b);
+    }
+#endif // NPYV check for npy_ushort
+    *(npy_ushort *)dataptr[2] = ((*(npy_ushort *)dataptr[2]) + accum);
+}
+
+/*
+ * Reduction with a scalar first operand: reads one npy_ushort from
+ * dataptr[0], obtains `accum` from ushort_sum_of_arr() over the dataptr[1]
+ * array (presumably the sum of its `count` elements; the helper is defined
+ * elsewhere), and adds value0 * accum to the npy_ushort at dataptr[2].
+ * `nop` is not referenced and `strides` is marked unused.
+ *
+ * The product is formed in `unsigned int`: two npy_ushort operands would be
+ * promoted to signed `int`, and a product above INT_MAX is undefined
+ * behaviour.  Narrowing gives the same value modulo 2^16.
+ */
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data1 = (npy_ushort *)dataptr[1];
+    npy_ushort value0 = (*(npy_ushort *)dataptr[0]);
+    npy_ushort accum = ushort_sum_of_arr(data1, count);
+    *(npy_ushort *)dataptr[2] = (npy_ushort)((*(npy_ushort *)dataptr[2]) +
+                                             (unsigned int)value0 * accum);
+}
+
+/*
+ * Reduction with a scalar second operand: reads one npy_ushort from
+ * dataptr[1], obtains `accum` from ushort_sum_of_arr() over the dataptr[0]
+ * array (presumably the sum of its `count` elements; the helper is defined
+ * elsewhere), and adds value1 * accum to the npy_ushort at dataptr[2].
+ * `nop` is not referenced and `strides` is marked unused.
+ *
+ * The product is formed in `unsigned int`: two npy_ushort operands would be
+ * promoted to signed `int`, and a product above INT_MAX is undefined
+ * behaviour.  Narrowing gives the same value modulo 2^16.
+ */
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort value1 = (*(npy_ushort *)dataptr[1]);
+    npy_ushort accum = ushort_sum_of_arr(data0, count);
+    *(npy_ushort *)dataptr[2] = (npy_ushort)((*(npy_ushort *)dataptr[2]) +
+                                             (unsigned int)value1 * accum);
+}
+
+#elif 2 == 3 && !0
+
+/*
+ * Contiguous three-operand kernel:
+ *     data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]
+ * with the inputs at dataptr[0..2] and the output at dataptr[3], all walked
+ * as packed npy_ushort arrays.  The main loop handles 8 elements per
+ * iteration; the tail handles the remaining ones with a countdown of early
+ * returns.  `nop` is not referenced and `strides` is marked unused.
+ *
+ * This definition sits under `#elif 2 == 3 && !0`, so it is not compiled in
+ * this instantiation.
+ *
+ * The products are formed in `unsigned int`: npy_ushort operands would be
+ * promoted to signed `int`, where a product above INT_MAX is undefined
+ * behaviour.  Narrowing gives the same value modulo 2^16.
+ */
+static void
+ushort_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort *data1 = (npy_ushort *)dataptr[1];
+    npy_ushort *data2 = (npy_ushort *)dataptr[2];
+    npy_ushort *data_out = (npy_ushort *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = (npy_ushort)((unsigned int)(data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = (npy_ushort)((unsigned int)(data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = (npy_ushort)((unsigned int)(data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = (npy_ushort)((unsigned int)(data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = (npy_ushort)((unsigned int)(data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = (npy_ushort)((unsigned int)(data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = (npy_ushort)((unsigned int)(data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = (npy_ushort)((unsigned int)(data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = (npy_ushort)((unsigned int)(data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = (npy_ushort)((unsigned int)(data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = (npy_ushort)((unsigned int)(data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = (npy_ushort)((unsigned int)(data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = (npy_ushort)((unsigned int)(data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = (npy_ushort)((unsigned int)(data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = (npy_ushort)((unsigned int)(data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = (npy_ushort)((unsigned int)(data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 2 > 3 || @complex */
+
+/*
+ * Fallback contiguous kernel from the `#else` arm of the per-operand-count
+ * conditional.  In the real-valued path (`#if !0`) each iteration multiplies
+ * the current npy_ushort of operands 0..nop-1, adds the current output
+ * value at dataptr[nop], stores the result there, and advances every
+ * pointer in dataptr (inputs and output) by sizeof(npy_ushort).
+ * Note that dataptr itself is modified.  The complex path is compiled out.
+ *
+ * The running product is formed in `unsigned int`: `temp *= x` on
+ * npy_ushort operands multiplies in signed `int`, where a product above
+ * INT_MAX is undefined behaviour.  Narrowing gives the same value
+ * modulo 2^16.
+ */
+static void
+ushort_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp = (npy_ushort)((unsigned int)temp *
+                                (*(npy_ushort *)dataptr[i]));
+        }
+        /* The loop above leaves i == nop, so dataptr[i] is the output. */
+        *(npy_ushort *)dataptr[nop] = (temp +
+                                           (*(npy_ushort *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_ushort);
+        }
+#else /* complex */
+#  if 2 <= 3
+#    define _SUMPROD_NOP 2
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_ushort re, im, tmp;
+        int i;
+        re = ((npy_ushort *)dataptr[0])[0];
+        im = ((npy_ushort *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ushort *)dataptr[i])[0] -
+                  im * ((npy_ushort *)dataptr[i])[1];
+            im = re * ((npy_ushort *)dataptr[i])[1] +
+                 im * ((npy_ushort *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ushort *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ushort *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_ushort);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+
+/*
+ * One-operand reduction into a single output element.  Guarded by the
+ * enclosing `#if 2 == 1`, so it is not compiled in this instantiation.
+ *
+ * Real-valued path (`#if !0`): the value returned by ushort_sum_of_arr()
+ * for the dataptr[0] array is added to the npy_ushort at dataptr[1].
+ * Complex path (compiled out): sums interleaved (re, im) pairs, four pairs
+ * per iteration while more than four remain, then one pair at a time.
+ * `nop` and `strides` are not referenced.
+ */
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_ushort *const dst = (npy_ushort *)dataptr[1];
+    const npy_ushort total = ushort_sum_of_arr((npy_ushort *)dataptr[0], count);
+    *dst = (total + *dst);
+#else
+    npy_ushort sum_re = 0, sum_im = 0;
+    npy_ushort *src = (npy_ushort *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    while (count > 4) {
+        const npy_ushort re_lo = src[0] + src[2];
+        const npy_ushort re_hi = src[4] + src[6];
+        const npy_ushort im_lo = src[1] + src[3];
+        const npy_ushort im_hi = src[5] + src[7];
+        sum_re += re_lo + re_hi;
+        sum_im += im_lo + im_hi;
+        count -= 4;
+        src += 4*2;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    while (count > 0) {
+        sum_re += src[0];
+        sum_im += src[1];
+        --count;
+        src += 2;
+    }
+    ((npy_ushort *)dataptr[1])[0] += sum_re;
+    ((npy_ushort *)dataptr[1])[1] += sum_im;
+#endif // !0
+}
+
+#endif /* 2 == 1 */
+
+/*
+ * Strided two-operand reduction into a single output element.
+ *
+ * In the branch compiled here (`2 == 2`, real-valued) each of the `count`
+ * iterations adds (*data0) * (*data1) to `accum` and advances the two
+ * char pointers by strides[0] and strides[1] (byte strides, since the
+ * pointers are `char *`).  Afterwards `accum` is added to the npy_ushort
+ * at dataptr[2].  The other preprocessor branches (one/three/N operands,
+ * complex) come from the same template and are compiled out.
+ *
+ * Products in the real-valued branches are formed in `unsigned int`:
+ * npy_ushort operands would be promoted to signed `int`, where a product
+ * above INT_MAX is undefined behaviour.  Narrowing gives the same value
+ * modulo 2^16.  The compiled-out complex branches are left untouched.
+ */
+static void
+ushort_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_ushort accum_re = 0, accum_im = 0;
+#else
+    npy_ushort accum = 0;
+#endif
+
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 2 == 1
+        accum += (*(npy_ushort *)data0);
+        data0 += stride0;
+#  elif 2 == 2
+        /* Multiply in unsigned int to avoid signed-int overflow (UB). */
+        accum += (npy_ushort)((unsigned int)(*(npy_ushort *)data0) *
+                 (*(npy_ushort *)data1));
+        data0 += stride0;
+        data1 += stride1;
+#  elif 2 == 3
+        accum += (npy_ushort)((unsigned int)(*(npy_ushort *)data0) *
+                 (*(npy_ushort *)data1) *
+                 (*(npy_ushort *)data2));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp = (npy_ushort)((unsigned int)temp *
+                                (*(npy_ushort *)dataptr[i]));
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        accum_re += ((npy_ushort *)data0)[0];
+        accum_im += ((npy_ushort *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ushort re, im, tmp;
+        int i;
+        re = ((npy_ushort *)dataptr[0])[0];
+        im = ((npy_ushort *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ushort *)dataptr[i])[0] -
+                  im * ((npy_ushort *)dataptr[i])[1];
+            im = re * ((npy_ushort *)dataptr[i])[1] +
+                 im * ((npy_ushort *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+    /* Fold the accumulated total into the single output element. */
+#if 0
+#  if 2 <= 3
+    ((npy_ushort *)dataptr[2])[0] += accum_re;
+    ((npy_ushort *)dataptr[2])[1] += accum_im;
+#  else
+    ((npy_ushort *)dataptr[nop])[0] += accum_re;
+    ((npy_ushort *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 2 <= 3
+    *((npy_ushort *)dataptr[2]) = (accum +
+                                    (*((npy_ushort *)dataptr[2])));
+#  else
+    *((npy_ushort *)dataptr[nop]) = (accum +
+                                    (*((npy_ushort *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+/*
+ * Generic strided three-operand kernel.
+ *
+ * In the branch compiled here (`3 == 3`, real-valued) each of the `count`
+ * iterations computes
+ *     *out = (*data0) * (*data1) * (*data2) + *out
+ * on npy_ushort values and then advances the three input pointers and the
+ * output pointer by strides[0..3] (byte strides, since the pointers are
+ * `char *`).  The other preprocessor branches (one/two/N operands, complex)
+ * come from the same template and are compiled out.
+ *
+ * Products in the real-valued branches are formed in `unsigned int`:
+ * npy_ushort operands would be promoted to signed `int`, where a product
+ * above INT_MAX is undefined behaviour.  Narrowing gives the same value
+ * modulo 2^16.  The compiled-out complex branches are left untouched.
+ */
+static void
+ushort_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data_out = dataptr[3];
+    npy_intp stride_out = strides[3];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_three (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 3 == 1
+        *(npy_ushort *)data_out = ((*(npy_ushort *)data0) +
+                                         (*(npy_ushort *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 3 == 2
+        *(npy_ushort *)data_out = (npy_ushort)(
+                                         (unsigned int)(*(npy_ushort *)data0) *
+                                         (*(npy_ushort *)data1) +
+                                         (*(npy_ushort *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 3 == 3
+        /* Multiply in unsigned int to avoid signed-int overflow (UB). */
+        *(npy_ushort *)data_out = (npy_ushort)(
+                                         (unsigned int)(*(npy_ushort *)data0) *
+                                         (*(npy_ushort *)data1) *
+                                         (*(npy_ushort *)data2) +
+                                         (*(npy_ushort *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp = (npy_ushort)((unsigned int)temp *
+                                (*(npy_ushort *)dataptr[i]));
+        }
+        *(npy_ushort *)dataptr[nop] = (temp +
+                                           (*(npy_ushort *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        ((npy_ushort *)data_out)[0] = ((npy_ushort *)data0)[0] +
+                                         ((npy_ushort *)data_out)[0];
+        ((npy_ushort *)data_out)[1] = ((npy_ushort *)data0)[1] +
+                                         ((npy_ushort *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ushort re, im, tmp;
+        int i;
+        re = ((npy_ushort *)dataptr[0])[0];
+        im = ((npy_ushort *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ushort *)dataptr[i])[0] -
+                  im * ((npy_ushort *)dataptr[i])[1];
+            im = re * ((npy_ushort *)dataptr[i])[1] +
+                 im * ((npy_ushort *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ushort *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ushort *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 3 == 1
+
+static void
+ushort_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort *data_out = (npy_ushort *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_ushort *)data_out + 2*6)[0] =
+                                    ((npy_ushort *)data0 + 2*6)[0] +
+                                    ((npy_ushort *)data_out + 2*6)[0];
+            ((npy_ushort *)data_out + 2*6)[1] =
+                                    ((npy_ushort *)data0 + 2*6)[1] +
+                                    ((npy_ushort *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_ushort *)data_out + 2*5)[0] =
+                                    ((npy_ushort *)data0 + 2*5)[0] +
+                                    ((npy_ushort *)data_out + 2*5)[0];
+            ((npy_ushort *)data_out + 2*5)[1] =
+                                    ((npy_ushort *)data0 + 2*5)[1] +
+                                    ((npy_ushort *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_ushort *)data_out + 2*4)[0] =
+                                    ((npy_ushort *)data0 + 2*4)[0] +
+                                    ((npy_ushort *)data_out + 2*4)[0];
+            ((npy_ushort *)data_out + 2*4)[1] =
+                                    ((npy_ushort *)data0 + 2*4)[1] +
+                                    ((npy_ushort *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_ushort *)data_out + 2*3)[0] =
+                                    ((npy_ushort *)data0 + 2*3)[0] +
+                                    ((npy_ushort *)data_out + 2*3)[0];
+            ((npy_ushort *)data_out + 2*3)[1] =
+                                    ((npy_ushort *)data0 + 2*3)[1] +
+                                    ((npy_ushort *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_ushort *)data_out + 2*2)[0] =
+                                    ((npy_ushort *)data0 + 2*2)[0] +
+                                    ((npy_ushort *)data_out + 2*2)[0];
+            ((npy_ushort *)data_out + 2*2)[1] =
+                                    ((npy_ushort *)data0 + 2*2)[1] +
+                                    ((npy_ushort *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_ushort *)data_out + 2*1)[0] =
+                                    ((npy_ushort *)data0 + 2*1)[0] +
+                                    ((npy_ushort *)data_out + 2*1)[0];
+            ((npy_ushort *)data_out + 2*1)[1] =
+                                    ((npy_ushort *)data0 + 2*1)[1] +
+                                    ((npy_ushort *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_ushort *)data_out + 2*0)[0] =
+                                    ((npy_ushort *)data0 + 2*0)[0] +
+                                    ((npy_ushort *)data_out + 2*0)[0];
+            ((npy_ushort *)data_out + 2*0)[1] =
+                                    ((npy_ushort *)data0 + 2*0)[1] +
+                                    ((npy_ushort *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_ushort *)data_out + 2*0)[0] =
+                                ((npy_ushort *)data0 + 2*0)[0] +
+                                ((npy_ushort *)data_out + 2*0)[0];
+        ((npy_ushort *)data_out + 2*0)[1] =
+                                ((npy_ushort *)data0 + 2*0)[1] +
+                                ((npy_ushort *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_ushort *)data_out + 2*1)[0] =
+                                ((npy_ushort *)data0 + 2*1)[0] +
+                                ((npy_ushort *)data_out + 2*1)[0];
+        ((npy_ushort *)data_out + 2*1)[1] =
+                                ((npy_ushort *)data0 + 2*1)[1] +
+                                ((npy_ushort *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_ushort *)data_out + 2*2)[0] =
+                                ((npy_ushort *)data0 + 2*2)[0] +
+                                ((npy_ushort *)data_out + 2*2)[0];
+        ((npy_ushort *)data_out + 2*2)[1] =
+                                ((npy_ushort *)data0 + 2*2)[1] +
+                                ((npy_ushort *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_ushort *)data_out + 2*3)[0] =
+                                ((npy_ushort *)data0 + 2*3)[0] +
+                                ((npy_ushort *)data_out + 2*3)[0];
+        ((npy_ushort *)data_out + 2*3)[1] =
+                                ((npy_ushort *)data0 + 2*3)[1] +
+                                ((npy_ushort *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_ushort *)data_out + 2*4)[0] =
+                                ((npy_ushort *)data0 + 2*4)[0] +
+                                ((npy_ushort *)data_out + 2*4)[0];
+        ((npy_ushort *)data_out + 2*4)[1] =
+                                ((npy_ushort *)data0 + 2*4)[1] +
+                                ((npy_ushort *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_ushort *)data_out + 2*5)[0] =
+                                ((npy_ushort *)data0 + 2*5)[0] +
+                                ((npy_ushort *)data_out + 2*5)[0];
+        ((npy_ushort *)data_out + 2*5)[1] =
+                                ((npy_ushort *)data0 + 2*5)[1] +
+                                ((npy_ushort *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_ushort *)data_out + 2*6)[0] =
+                                ((npy_ushort *)data0 + 2*6)[0] +
+                                ((npy_ushort *)data_out + 2*6)[0];
+        ((npy_ushort *)data_out + 2*6)[1] =
+                                ((npy_ushort *)data0 + 2*6)[1] +
+                                ((npy_ushort *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_ushort *)data_out + 2*7)[0] =
+                                ((npy_ushort *)data0 + 2*7)[0] +
+                                ((npy_ushort *)data_out + 2*7)[0];
+        ((npy_ushort *)data_out + 2*7)[1] =
+                                ((npy_ushort *)data0 + 2*7)[1] +
+                                ((npy_ushort *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 3 == 2 && !0
+
+// Multiply-add helper: data_out[i] = data[i] * scalar + data_out[i] for every i in [0, count).
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_muladd(npy_ushort *data, npy_ushort *data_out, npy_ushort scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_ushort (constant 0 here: the vector path is compiled out)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u16;
+    const npyv_u16 v_scalar = npyv_setall_u16(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u16 b0 = npyv_loada_u16(data + vstep * 0);
+            npyv_u16 c0 = npyv_loada_u16(data_out + vstep * 0);
+            
+#line 312
+            npyv_u16 b1 = npyv_loada_u16(data + vstep * 1);
+            npyv_u16 c1 = npyv_loada_u16(data_out + vstep * 1);
+            
+#line 312
+            npyv_u16 b2 = npyv_loada_u16(data + vstep * 2);
+            npyv_u16 c2 = npyv_loada_u16(data_out + vstep * 2);
+            
+#line 312
+            npyv_u16 b3 = npyv_loada_u16(data + vstep * 3);
+            npyv_u16 c3 = npyv_loada_u16(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u16 abc0 = npyv_muladd_u16(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u16 abc1 = npyv_muladd_u16(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u16 abc2 = npyv_muladd_u16(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u16 abc3 = npyv_muladd_u16(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_u16(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_u16(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_u16(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_u16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u16 b0 = npyv_load_u16(data + vstep * 0);
+            npyv_u16 c0 = npyv_load_u16(data_out + vstep * 0);
+            
+#line 312
+            npyv_u16 b1 = npyv_load_u16(data + vstep * 1);
+            npyv_u16 c1 = npyv_load_u16(data_out + vstep * 1);
+            
+#line 312
+            npyv_u16 b2 = npyv_load_u16(data + vstep * 2);
+            npyv_u16 c2 = npyv_load_u16(data_out + vstep * 2);
+            
+#line 312
+            npyv_u16 b3 = npyv_load_u16(data + vstep * 3);
+            npyv_u16 c3 = npyv_load_u16(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u16 abc0 = npyv_muladd_u16(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u16 abc1 = npyv_muladd_u16(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u16 abc2 = npyv_muladd_u16(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u16 abc3 = npyv_muladd_u16(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_u16(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_u16(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_u16(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_u16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Remainder: fewer than 4 * vstep elements left; the *_till* helpers presumably touch only `count` lanes (TODO confirm). */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_u16 a = npyv_load_tillz_u16(data, count);
+        npyv_u16 b = npyv_load_tillz_u16(data_out, count);
+        npyv_store_till_u16(data_out, count, npyv_muladd_u16(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION // manual 4x unrolling unless optimizations are disabled
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_ushort b0 = (data[0]);
+        const npy_ushort c0 = (data_out[0]);
+        
+#line 340
+        const npy_ushort b1 = (data[1]);
+        const npy_ushort c1 = (data_out[1]);
+        
+#line 340
+        const npy_ushort b2 = (data[2]);
+        const npy_ushort c2 = (data_out[2]);
+        
+#line 340
+        const npy_ushort b3 = (data[3]);
+        const npy_ushort c3 = (data_out[3]);
+        /* Multiply in unsigned int: npy_ushort operands would otherwise promote to signed int, where a large product overflows (undefined behavior). */
+        #line 346
+        const npy_ushort abc0 = (npy_ushort)((unsigned int)scalar * b0 + c0);
+        
+#line 346
+        const npy_ushort abc1 = (npy_ushort)((unsigned int)scalar * b1 + c1);
+        
+#line 346
+        const npy_ushort abc2 = (npy_ushort)((unsigned int)scalar * b2 + c2);
+        
+#line 346
+        const npy_ushort abc3 = (npy_ushort)((unsigned int)scalar * b3 + c3);
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { // elements left over after the unrolled loop
+        const npy_ushort b = (*data);
+        const npy_ushort c = (*data_out);
+        *data_out = (npy_ushort)((unsigned int)scalar * b + c); /* unsigned product, see above */
+    }
+#endif // NPYV check for npy_ushort
+}
+
+static void
+ushort_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort *data1 = (npy_ushort *)dataptr[1];
+    npy_ushort *data_out = (npy_ushort *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // Computes data_out[i] += data0[i] * data1[i] for i in [0, count). NPYV check for npy_ushort:
+#if 0 /* constant 0 here: the vector path is compiled out and the scalar path after #else is used */
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u16;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u16 a0 = npyv_loada_u16(data0 + vstep * 0);
+            npyv_u16 b0 = npyv_loada_u16(data1 + vstep * 0);
+            npyv_u16 c0 = npyv_loada_u16(data_out + vstep * 0);
+            
+#line 390
+            npyv_u16 a1 = npyv_loada_u16(data0 + vstep * 1);
+            npyv_u16 b1 = npyv_loada_u16(data1 + vstep * 1);
+            npyv_u16 c1 = npyv_loada_u16(data_out + vstep * 1);
+            
+#line 390
+            npyv_u16 a2 = npyv_loada_u16(data0 + vstep * 2);
+            npyv_u16 b2 = npyv_loada_u16(data1 + vstep * 2);
+            npyv_u16 c2 = npyv_loada_u16(data_out + vstep * 2);
+            
+#line 390
+            npyv_u16 a3 = npyv_loada_u16(data0 + vstep * 3);
+            npyv_u16 b3 = npyv_loada_u16(data1 + vstep * 3);
+            npyv_u16 c3 = npyv_loada_u16(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u16 abc0 = npyv_muladd_u16(a0, b0, c0);
+            
+#line 397
+            npyv_u16 abc1 = npyv_muladd_u16(a1, b1, c1);
+            
+#line 397
+            npyv_u16 abc2 = npyv_muladd_u16(a2, b2, c2);
+            
+#line 397
+            npyv_u16 abc3 = npyv_muladd_u16(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_u16(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_u16(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_u16(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_u16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u16 a0 = npyv_load_u16(data0 + vstep * 0);
+            npyv_u16 b0 = npyv_load_u16(data1 + vstep * 0);
+            npyv_u16 c0 = npyv_load_u16(data_out + vstep * 0);
+            
+#line 390
+            npyv_u16 a1 = npyv_load_u16(data0 + vstep * 1);
+            npyv_u16 b1 = npyv_load_u16(data1 + vstep * 1);
+            npyv_u16 c1 = npyv_load_u16(data_out + vstep * 1);
+            
+#line 390
+            npyv_u16 a2 = npyv_load_u16(data0 + vstep * 2);
+            npyv_u16 b2 = npyv_load_u16(data1 + vstep * 2);
+            npyv_u16 c2 = npyv_load_u16(data_out + vstep * 2);
+            
+#line 390
+            npyv_u16 a3 = npyv_load_u16(data0 + vstep * 3);
+            npyv_u16 b3 = npyv_load_u16(data1 + vstep * 3);
+            npyv_u16 c3 = npyv_load_u16(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u16 abc0 = npyv_muladd_u16(a0, b0, c0);
+            
+#line 397
+            npyv_u16 abc1 = npyv_muladd_u16(a1, b1, c1);
+            
+#line 397
+            npyv_u16 abc2 = npyv_muladd_u16(a2, b2, c2);
+            
+#line 397
+            npyv_u16 abc3 = npyv_muladd_u16(a3, b3, c3);
+            
+            #line 402
+            npyv_store_u16(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_u16(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_u16(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_u16(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_u16 a = npyv_load_tillz_u16(data0, count);
+        npyv_u16 b = npyv_load_tillz_u16(data1, count);
+        npyv_u16 c = npyv_load_tillz_u16(data_out, count);
+        npyv_store_till_u16(data_out, count, npyv_muladd_u16(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION // manual 4x unrolling unless optimizations are disabled
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_ushort a0 = (data0[0]);
+        const npy_ushort b0 = (data1[0]);
+        const npy_ushort c0 = (data_out[0]);
+        
+#line 420
+        const npy_ushort a1 = (data0[1]);
+        const npy_ushort b1 = (data1[1]);
+        const npy_ushort c1 = (data_out[1]);
+        
+#line 420
+        const npy_ushort a2 = (data0[2]);
+        const npy_ushort b2 = (data1[2]);
+        const npy_ushort c2 = (data_out[2]);
+        
+#line 420
+        const npy_ushort a3 = (data0[3]);
+        const npy_ushort b3 = (data1[3]);
+        const npy_ushort c3 = (data_out[3]);
+        /* Multiply in unsigned int: npy_ushort operands would otherwise promote to signed int, where a large product overflows (undefined behavior). */
+        #line 427
+        const npy_ushort abc0 = (npy_ushort)((unsigned int)a0 * b0 + c0);
+        
+#line 427
+        const npy_ushort abc1 = (npy_ushort)((unsigned int)a1 * b1 + c1);
+        
+#line 427
+        const npy_ushort abc2 = (npy_ushort)((unsigned int)a2 * b2 + c2);
+        
+#line 427
+        const npy_ushort abc3 = (npy_ushort)((unsigned int)a3 * b3 + c3);
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { // elements left over after the unrolled loop
+        const npy_ushort a = (*data0);
+        const npy_ushort b = (*data1);
+        const npy_ushort c = (*data_out);
+        *data_out = (npy_ushort)((unsigned int)a * b + c); /* unsigned product, see above */
+    }
+#endif // NPYV check for npy_ushort
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+ushort_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 0 is read once as a scalar; operand 1 and the output are handed on as arrays of `count` elements. */
+    npy_ushort *const dst = (npy_ushort *)dataptr[2];
+    npy_ushort *const src = (npy_ushort *)dataptr[1];
+    const npy_ushort factor = *(npy_ushort *)dataptr[0];
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    ushort_sum_of_products_muladd(src, dst, factor, count);
+}
+
+static void
+ushort_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 1 is read once as a scalar; operand 0 and the output are handed on as arrays of `count` elements. */
+    npy_ushort *const dst = (npy_ushort *)dataptr[2];
+    npy_ushort *const src = (npy_ushort *)dataptr[0];
+    const npy_ushort factor = *(npy_ushort *)dataptr[1];
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    ushort_sum_of_products_muladd(src, dst, factor, count);
+}
+
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort *data1 = (npy_ushort *)dataptr[1];
+    npy_ushort accum = 0;
+    /* Reduction: adds the sum of data0[i] * data1[i] over `count` elements to the single output element at dataptr[2]. */
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_ushort (constant 0 here: the vector path is compiled out)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_u16;
+    npyv_u16 v_accum = npyv_zero_u16();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u16 a0 = npyv_loada_u16(data0 + vstep * 0);
+            npyv_u16 b0 = npyv_loada_u16(data1 + vstep * 0);
+            
+#line 501
+            npyv_u16 a1 = npyv_loada_u16(data0 + vstep * 1);
+            npyv_u16 b1 = npyv_loada_u16(data1 + vstep * 1);
+            
+#line 501
+            npyv_u16 a2 = npyv_loada_u16(data0 + vstep * 2);
+            npyv_u16 b2 = npyv_loada_u16(data1 + vstep * 2);
+            
+#line 501
+            npyv_u16 a3 = npyv_loada_u16(data0 + vstep * 3);
+            npyv_u16 b3 = npyv_loada_u16(data1 + vstep * 3);
+            
+            npyv_u16 ab3 = npyv_muladd_u16(a3, b3, v_accum);
+            npyv_u16 ab2 = npyv_muladd_u16(a2, b2, ab3);
+            npyv_u16 ab1 = npyv_muladd_u16(a1, b1, ab2);
+                   v_accum = npyv_muladd_u16(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u16 a0 = npyv_load_u16(data0 + vstep * 0);
+            npyv_u16 b0 = npyv_load_u16(data1 + vstep * 0);
+            
+#line 501
+            npyv_u16 a1 = npyv_load_u16(data0 + vstep * 1);
+            npyv_u16 b1 = npyv_load_u16(data1 + vstep * 1);
+            
+#line 501
+            npyv_u16 a2 = npyv_load_u16(data0 + vstep * 2);
+            npyv_u16 b2 = npyv_load_u16(data1 + vstep * 2);
+            
+#line 501
+            npyv_u16 a3 = npyv_load_u16(data0 + vstep * 3);
+            npyv_u16 b3 = npyv_load_u16(data1 + vstep * 3);
+            
+            npyv_u16 ab3 = npyv_muladd_u16(a3, b3, v_accum);
+            npyv_u16 ab2 = npyv_muladd_u16(a2, b2, ab3);
+            npyv_u16 ab1 = npyv_muladd_u16(a1, b1, ab2);
+                   v_accum = npyv_muladd_u16(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_u16 a = npyv_load_tillz_u16(data0, count);
+        npyv_u16 b = npyv_load_tillz_u16(data1, count);
+        v_accum = npyv_muladd_u16(a, b, v_accum);
+    }
+    accum = npyv_sum_u16(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION // manual 4x unrolling unless optimizations are disabled
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) { // unsigned products: promoted signed int could overflow (UB)
+        #line 524
+        const npy_ushort ab0 = (npy_ushort)((unsigned int)(data0[0]) * (data1[0]));
+        
+#line 524
+        const npy_ushort ab1 = (npy_ushort)((unsigned int)(data0[1]) * (data1[1]));
+        
+#line 524
+        const npy_ushort ab2 = (npy_ushort)((unsigned int)(data0[2]) * (data1[2]));
+        
+#line 524
+        const npy_ushort ab3 = (npy_ushort)((unsigned int)(data0[3]) * (data1[3]));
+        /* accum is an npy_ushort, so the running total is narrowed to that type on every update */
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { // elements left over after the unrolled loop
+        const npy_ushort a = (*data0);
+        const npy_ushort b = (*data1);
+        accum += (npy_ushort)((unsigned int)a * b);
+    }
+#endif // NPYV check for npy_ushort
+    *(npy_ushort *)dataptr[2] = ((*(npy_ushort *)dataptr[2]) + accum);
+}
+
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* out += value0 * (ushort_sum_of_arr over `count` elements of operand 1); the product is taken in unsigned int so promoted signed int cannot overflow. */
+    npy_ushort value0 = (*(npy_ushort *)dataptr[0]);
+    npy_ushort accum = ushort_sum_of_arr((npy_ushort *)dataptr[1], count);
+    *(npy_ushort *)dataptr[2] = (npy_ushort)((*(npy_ushort *)dataptr[2]) + (unsigned int)value0 * accum);
+}
+
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* out += value1 * (ushort_sum_of_arr over `count` elements of operand 0); the product is taken in unsigned int so promoted signed int cannot overflow. */
+    npy_ushort value1 = (*(npy_ushort *)dataptr[1]);
+    npy_ushort accum = ushort_sum_of_arr((npy_ushort *)dataptr[0], count);
+    *(npy_ushort *)dataptr[2] = (npy_ushort)((*(npy_ushort *)dataptr[2]) + (unsigned int)value1 * accum);
+}
+
+#elif 3 == 3 && !0
+
+static void
+ushort_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort *data1 = (npy_ushort *)dataptr[1];
+    npy_ushort *data2 = (npy_ushort *)dataptr[2];
+    npy_ushort *data_out = (npy_ushort *)dataptr[3];
+    /* data_out[i] += data0[i] * data1[i] * data2[i]; products are taken in unsigned int because promoted signed int could overflow (undefined behavior). */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = (npy_ushort)((unsigned int)(data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = (npy_ushort)((unsigned int)(data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = (npy_ushort)((unsigned int)(data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = (npy_ushort)((unsigned int)(data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = (npy_ushort)((unsigned int)(data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = (npy_ushort)((unsigned int)(data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = (npy_ushort)((unsigned int)(data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = (npy_ushort)((unsigned int)(data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: handle the leftover elements one at a time, returning as soon as count reaches zero */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = (npy_ushort)((unsigned int)(data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = (npy_ushort)((unsigned int)(data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = (npy_ushort)((unsigned int)(data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = (npy_ushort)((unsigned int)(data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = (npy_ushort)((unsigned int)(data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = (npy_ushort)((unsigned int)(data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = (npy_ushort)((unsigned int)(data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = (npy_ushort)((unsigned int)(data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 3 > 3 || @complex */
+
+static void
+ushort_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_three (%d)\n",
+                                                    (int)count);
+    /* Generic fallback: per element, out += product of operands 0..nop-1, then every pointer advances by one npy_ushort. */
+    while (count--) {
+#if !0
+        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) { /* multiply in unsigned int: promoted signed int could overflow (undefined behavior) */
+            temp = (npy_ushort)((unsigned int)temp * (*(npy_ushort *)dataptr[i]));
+        }
+        *(npy_ushort *)dataptr[nop] = (temp +
+                                           (*(npy_ushort *)dataptr[nop])); /* output slot named explicitly rather than via the leftover loop index */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_ushort);
+        }
+#else /* complex */
+#  if 3 <= 3
+#    define _SUMPROD_NOP 3
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_ushort re, im, tmp;
+        int i;
+        re = ((npy_ushort *)dataptr[0])[0];
+        im = ((npy_ushort *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ushort *)dataptr[i])[0] -
+                  im * ((npy_ushort *)dataptr[i])[1];
+            im = re * ((npy_ushort *)dataptr[i])[1] +
+                 im * ((npy_ushort *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ushort *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ushort *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_ushort);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 3 */
+
+#if 3 == 1
+
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    /* Real case: add the ushort_sum_of_arr result for the `count` input elements to the single output element. */
+    npy_ushort total = ushort_sum_of_arr((npy_ushort *)dataptr[0], count);
+    *((npy_ushort *)dataptr[1]) = (total + (*((npy_ushort *)dataptr[1])));
+#else
+    npy_ushort *src = (npy_ushort *)dataptr[0];
+    npy_ushort sum_re = 0, sum_im = 0;
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, src += 4*2) {
+        const npy_ushort re_lo = src[0] + src[2];
+        const npy_ushort re_hi = src[4] + src[6];
+        const npy_ushort im_lo = src[1] + src[3];
+        const npy_ushort im_hi = src[5] + src[7];
+        sum_re += re_lo + re_hi;
+        sum_im += im_lo + im_hi;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, src += 2) {
+        sum_re += src[0];
+        sum_im += src[1];
+    }
+    ((npy_ushort *)dataptr[1])[0] += sum_re;
+    ((npy_ushort *)dataptr[1])[1] += sum_im;
+#endif // !0
+}
+
+#endif /* 3 == 1 */
+
+static void
+ushort_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_ushort accum_re = 0, accum_im = 0;
+#else
+    npy_ushort accum = 0;
+#endif
+    /* Reduction over strided inputs: adds the sum of the three-operand products over `count` steps to the single output element at dataptr[3]. */
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+    /* Input pointers are kept as char * so they can be advanced by the per-operand strides below. */
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+    /* Products are taken in unsigned int: npy_ushort operands would promote to signed int, where a large product overflows (undefined behavior). */
+    while (count--) {
+#if !0
+#  if 3 == 1
+        accum += (*(npy_ushort *)data0);
+        data0 += stride0;
+#  elif 3 == 2
+        accum += (npy_ushort)((unsigned int)(*(npy_ushort *)data0) *
+                              (*(npy_ushort *)data1));
+        data0 += stride0;
+        data1 += stride1;
+#  elif 3 == 3
+        accum += (npy_ushort)((unsigned int)(*(npy_ushort *)data0) *
+                              (*(npy_ushort *)data1) *
+                              (*(npy_ushort *)data2));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp = (npy_ushort)((unsigned int)temp * (*(npy_ushort *)dataptr[i]));
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        accum_re += ((npy_ushort *)data0)[0];
+        accum_im += ((npy_ushort *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ushort re, im, tmp;
+        int i;
+        re = ((npy_ushort *)dataptr[0])[0];
+        im = ((npy_ushort *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ushort *)dataptr[i])[0] -
+                  im * ((npy_ushort *)dataptr[i])[1];
+            im = re * ((npy_ushort *)dataptr[i])[1] +
+                 im * ((npy_ushort *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0
+#  if 3 <= 3
+    ((npy_ushort *)dataptr[3])[0] += accum_re;
+    ((npy_ushort *)dataptr[3])[1] += accum_im;
+#  else
+    ((npy_ushort *)dataptr[nop])[0] += accum_re;
+    ((npy_ushort *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 3 <= 3
+    *((npy_ushort *)dataptr[3]) = (accum +
+                                    (*((npy_ushort *)dataptr[3])));
+#  else
+    *((npy_ushort *)dataptr[nop]) = (accum +
+                                    (*((npy_ushort *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+ushort_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data_out = dataptr[1000];
+    npy_intp stride_out = strides[1000];
+#endif
+    /* Generic kernel for any operand count: per step, out += product of operands 0..nop-1, then every pointer advances by its stride. */
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_any (%d)\n", (int)count);
+    /* Products are taken in unsigned int: npy_ushort operands would promote to signed int, where a large product overflows (undefined behavior). */
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        *(npy_ushort *)data_out = ((*(npy_ushort *)data0) +
+                                         (*(npy_ushort *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1000 == 2
+        *(npy_ushort *)data_out = (npy_ushort)((unsigned int)(*(npy_ushort *)data0) *
+                                         (*(npy_ushort *)data1) +
+                                         (*(npy_ushort *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1000 == 3
+        *(npy_ushort *)data_out = (npy_ushort)((unsigned int)(*(npy_ushort *)data0) *
+                                         (*(npy_ushort *)data1) *
+                                         (*(npy_ushort *)data2) +
+                                         (*(npy_ushort *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp = (npy_ushort)((unsigned int)temp * (*(npy_ushort *)dataptr[i]));
+        }
+        *(npy_ushort *)dataptr[nop] = (temp +
+                                           (*(npy_ushort *)dataptr[nop])); /* output slot named explicitly rather than via the leftover loop index */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        ((npy_ushort *)data_out)[0] = ((npy_ushort *)data0)[0] +
+                                         ((npy_ushort *)data_out)[0];
+        ((npy_ushort *)data_out)[1] = ((npy_ushort *)data0)[1] +
+                                         ((npy_ushort *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ushort re, im, tmp;
+        int i;
+        re = ((npy_ushort *)dataptr[0])[0];
+        im = ((npy_ushort *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ushort *)dataptr[i])[0] -
+                  im * ((npy_ushort *)dataptr[i])[1];
+            im = re * ((npy_ushort *)dataptr[i])[1] +
+                 im * ((npy_ushort *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ushort *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ushort *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ushort *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1000 == 1
+
+/*
+ * One-operand kernel over element-contiguous data: adds data0[i] into
+ * data_out[i] for i in [0, count), where data0 is dataptr[0] and data_out
+ * is dataptr[1].  `nop` and the strides are not used.
+ *
+ * Element order matches the original unrolled code, which matters only if
+ * the two buffers overlap: full groups of eight are processed lowest
+ * index first, then the remaining 0..7 elements highest index first (the
+ * order of the original fall-through switch).
+ *
+ * The original reached the tail through a `switch` with no default and a
+ * backward `goto`; a negative count matched no case, skipped the unrolled
+ * loop and jumped back forever.  With plain loops a non-positive count
+ * simply does nothing.  The `#else` (complex) arms of `#if !0` could
+ * never be compiled and are not repeated.
+ */
+static void
+ushort_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort *data_out = (npy_ushort *)dataptr[1];
+    npy_intp k;
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+    /* Groups of 8; the fixed trip count keeps the inner loop unrollable. */
+    for (; count >= 8; count -= 8, data0 += 8, data_out += 8) {
+        for (k = 0; k < 8; ++k) {
+            data_out[k] = ((data0[k]) +
+                                 (data_out[k]));
+        }
+    }
+
+    /* Finish off the loop: at most 7 elements, from the top index down. */
+    k = count;
+    while (k > 0) {
+        --k;
+        data_out[k] = ((data0[k]) +
+                             (data_out[k]));
+    }
+}
+
+#elif 1000 == 2 && !0
+
+/*
+ * Multiply-add over two element-contiguous buffers:
+ *
+ *     data_out[i] = scalar * data[i] + data_out[i]    for i in [0, count)
+ *
+ *   data      multiplicand elements (read only)
+ *   data_out  accumulator elements (read and written)
+ *   scalar    factor applied to every element of `data`
+ *   count     number of elements; a non-positive count does nothing
+ *
+ * The arithmetic is done in unsigned int.  If npy_ushort is narrower than
+ * int (presumably a 16-bit unsigned short -- confirm against the NumPy
+ * headers), `scalar * b + c` is evaluated in signed int and large
+ * operands overflow it, which is undefined.  The unsigned form wraps
+ * instead, and narrowing it gives the same value the signed form produced
+ * whenever it was defined.
+ *
+ * The SIMD variant of the template sat under `#if 0` in this
+ * instantiation, so it could never be compiled and is not repeated.
+ */
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_muladd(npy_ushort *data, npy_ushort *data_out, npy_ushort scalar, npy_intp count)
+{
+    const unsigned int factor = scalar;
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Four elements per pass.  As in the original, all four results are
+     * computed before the first store.
+     */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        const npy_ushort abc0 = (npy_ushort)(factor * data[0] + data_out[0]);
+        const npy_ushort abc1 = (npy_ushort)(factor * data[1] + data_out[1]);
+        const npy_ushort abc2 = (npy_ushort)(factor * data[2] + data_out[2]);
+        const npy_ushort abc3 = (npy_ushort)(factor * data[3] + data_out[3]);
+
+        data_out[0] = abc0;
+        data_out[1] = abc1;
+        data_out[2] = abc2;
+        data_out[3] = abc3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining elements (or all of them when the unrolled loop is off). */
+    for (; count > 0; --count, ++data, ++data_out) {
+        *data_out = (npy_ushort)(factor * (*data) + (*data_out));
+    }
+}
+
+/*
+ * Two-operand kernel over element-contiguous data:
+ *
+ *     out[i] = in0[i] * in1[i] + out[i]    for i in [0, count)
+ *
+ * with in0 = dataptr[0], in1 = dataptr[1] and out = dataptr[2].  `nop`
+ * and the strides are not used.
+ *
+ * Products are formed in unsigned int.  If npy_ushort is narrower than
+ * int (presumably a 16-bit unsigned short -- confirm against the NumPy
+ * headers), `a * b + c` is evaluated in signed int and large operands
+ * overflow it, which is undefined.  The unsigned form wraps instead, and
+ * narrowing it gives the same value the signed form produced whenever it
+ * was defined.
+ *
+ * The SIMD variant of the template sat under `#if 0` in this
+ * instantiation, so it could never be compiled and is not repeated.
+ */
+static void
+ushort_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort *data1 = (npy_ushort *)dataptr[1];
+    npy_ushort *data_out = (npy_ushort *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Four elements per pass.  As in the original, all four results are
+     * computed before the first store.
+     */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        const npy_ushort abc0 =
+                (npy_ushort)((unsigned int)data0[0] * data1[0] + data_out[0]);
+        const npy_ushort abc1 =
+                (npy_ushort)((unsigned int)data0[1] * data1[1] + data_out[1]);
+        const npy_ushort abc2 =
+                (npy_ushort)((unsigned int)data0[2] * data1[2] + data_out[2]);
+        const npy_ushort abc3 =
+                (npy_ushort)((unsigned int)data0[3] * data1[3] + data_out[3]);
+
+        data_out[0] = abc0;
+        data_out[1] = abc1;
+        data_out[2] = abc2;
+        data_out[3] = abc3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining elements (or all of them when the unrolled loop is off). */
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        *data_out = (npy_ushort)((unsigned int)(*data0) * (*data1) +
+                                 (*data_out));
+    }
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand case where operand 0 is a single value: reads that value
+ * once from dataptr[0] and hands the work to
+ * ushort_sum_of_products_muladd with dataptr[1] as the element buffer and
+ * dataptr[2] as the accumulated output.  `nop` and the strides are not
+ * used.
+ */
+static void
+ushort_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *out = (npy_ushort *)dataptr[2];
+    npy_ushort *in1 = (npy_ushort *)dataptr[1];
+    const npy_ushort factor = *(npy_ushort *)dataptr[0];
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    ushort_sum_of_products_muladd(in1, out, factor, count);
+}
+
+/*
+ * Two-operand case where operand 1 is a single value: reads that value
+ * once from dataptr[1] and hands the work to
+ * ushort_sum_of_products_muladd with dataptr[0] as the element buffer and
+ * dataptr[2] as the accumulated output.  `nop` and the strides are not
+ * used.
+ */
+static void
+ushort_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *out = (npy_ushort *)dataptr[2];
+    npy_ushort *in0 = (npy_ushort *)dataptr[0];
+    const npy_ushort factor = *(npy_ushort *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    ushort_sum_of_products_muladd(in0, out, factor, count);
+}
+
+/*
+ * Two-operand reduction into a single output element: accumulates
+ * in0[i] * in1[i] for i in [0, count) over the element-contiguous buffers
+ * dataptr[0] and dataptr[1], then adds the total into the one element at
+ * dataptr[2].  `nop` and the strides are not used.
+ *
+ * Products are formed in unsigned int.  If npy_ushort is narrower than
+ * int (presumably a 16-bit unsigned short -- confirm against the NumPy
+ * headers), `a * b` is evaluated in signed int and large operands
+ * overflow it, which is undefined.  The unsigned form wraps instead, and
+ * narrowing it gives the same value the signed form produced whenever it
+ * was defined.
+ *
+ * The SIMD variant of the template sat under `#if 0` in this
+ * instantiation, so it could never be compiled and is not repeated.
+ */
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort *data1 = (npy_ushort *)dataptr[1];
+    npy_ushort accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        const npy_ushort ab0 = (npy_ushort)((unsigned int)data0[0] * data1[0]);
+        const npy_ushort ab1 = (npy_ushort)((unsigned int)data0[1] * data1[1]);
+        const npy_ushort ab2 = (npy_ushort)((unsigned int)data0[2] * data1[2]);
+        const npy_ushort ab3 = (npy_ushort)((unsigned int)data0[3] * data1[3]);
+
+        /* Each term is already narrowed, so this sum cannot overflow int. */
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining elements (or all of them when the unrolled loop is off). */
+    for (; count > 0; --count, ++data0, ++data1) {
+        accum = (npy_ushort)(accum + (unsigned int)(*data0) * (*data1));
+    }
+    *(npy_ushort *)dataptr[2] = ((*(npy_ushort *)dataptr[2]) + accum);
+}
+
+/*
+ * Two-operand reduction where operand 0 is a single value: sums the
+ * `count` elements at dataptr[1] with ushort_sum_of_arr, multiplies the
+ * sum by the value at dataptr[0], and adds the result into the single
+ * output element at dataptr[2].  `nop` and the strides are not used.
+ *
+ * The product is formed in unsigned int.  If npy_ushort is narrower than
+ * int (presumably a 16-bit unsigned short -- confirm against the NumPy
+ * headers), `value0 * accum` is evaluated in signed int and large
+ * operands overflow it, which is undefined.  The unsigned form wraps
+ * instead, and narrowing it gives the same value the signed form produced
+ * whenever it was defined.
+ */
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data1 = (npy_ushort *)dataptr[1];
+    npy_ushort value0 = (*(npy_ushort *)dataptr[0]);
+    npy_ushort accum = ushort_sum_of_arr(data1, count);
+    npy_ushort *out = (npy_ushort *)dataptr[2];
+
+    *out = (npy_ushort)((*out) + (unsigned int)value0 * accum);
+}
+
+/*
+ * Two-operand reduction where operand 1 is a single value: sums the
+ * `count` elements at dataptr[0] with ushort_sum_of_arr, multiplies the
+ * sum by the value at dataptr[1], and adds the result into the single
+ * output element at dataptr[2].  `nop` and the strides are not used.
+ *
+ * The product is formed in unsigned int.  If npy_ushort is narrower than
+ * int (presumably a 16-bit unsigned short -- confirm against the NumPy
+ * headers), `value1 * accum` is evaluated in signed int and large
+ * operands overflow it, which is undefined.  The unsigned form wraps
+ * instead, and narrowing it gives the same value the signed form produced
+ * whenever it was defined.
+ */
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort value1 = (*(npy_ushort *)dataptr[1]);
+    npy_ushort accum = ushort_sum_of_arr(data0, count);
+    npy_ushort *out = (npy_ushort *)dataptr[2];
+
+    *out = (npy_ushort)((*out) + (unsigned int)value1 * accum);
+}
+
+#elif 1000 == 3 && !0
+
+/*
+ * Three-operand kernel over element-contiguous data:
+ *
+ *     out[i] = in0[i] * in1[i] * in2[i] + out[i]    for i in [0, count)
+ *
+ * with in0..in2 = dataptr[0..2] and out = dataptr[3].  Elements are
+ * processed in increasing index order, as in the original.  `nop` and the
+ * strides are not used.
+ *
+ * Two changes from the expanded template:
+ *  - Products are formed in unsigned int.  If npy_ushort is narrower than
+ *    int (presumably a 16-bit unsigned short -- confirm against the NumPy
+ *    headers), the product is evaluated in signed int and large operands
+ *    overflow it, which is undefined.  The unsigned form wraps instead,
+ *    and narrowing it gives the same value the signed form produced
+ *    whenever it was defined.
+ *  - The original tail tested `count-- == 0` before each element, so a
+ *    negative count never matched and eight elements were read and
+ *    written.  A non-positive count now does nothing.
+ */
+static void
+ushort_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ushort *data0 = (npy_ushort *)dataptr[0];
+    npy_ushort *data1 = (npy_ushort *)dataptr[1];
+    npy_ushort *data2 = (npy_ushort *)dataptr[2];
+    npy_ushort *data_out = (npy_ushort *)dataptr[3];
+    npy_intp k;
+
+    /* Groups of 8; the fixed trip count keeps the inner loop unrollable. */
+    for (; count >= 8;
+            count -= 8, data0 += 8, data1 += 8, data2 += 8, data_out += 8) {
+        for (k = 0; k < 8; ++k) {
+            data_out[k] = (npy_ushort)((unsigned int)data0[k] *
+                                       data1[k] *
+                                       data2[k] +
+                                       data_out[k]);
+        }
+    }
+
+    /* Finish off the loop: the remaining 0..7 elements. */
+    for (k = 0; k < count; ++k) {
+        data_out[k] = (npy_ushort)((unsigned int)data0[k] *
+                                   data1[k] *
+                                   data2[k] +
+                                   data_out[k]);
+    }
+}
+
+#else /* 1000 > 3 || @complex */
+
+/*
+ * Sum-of-products kernel for npy_ushort with a run-time operand count,
+ * for operands that all advance by one element per iteration.  Each of
+ * the `count` iterations multiplies the current elements of operands
+ * 0 .. nop-1 and adds the product into the current element of operand
+ * `nop`, then moves every pointer forward by sizeof(npy_ushort).
+ *
+ *   nop      number of multiplied operands; dataptr[nop] is accumulated into
+ *   dataptr  nop + 1 byte pointers, advanced in place by this function
+ *   count    number of iterations
+ *
+ * The strides argument is not used.  The `#else` (complex) arm of
+ * `#if !0` could never be compiled and is not repeated.
+ */
+static void
+ushort_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            /*
+             * Multiply as unsigned int.  If npy_ushort is narrower than
+             * int (presumably a 16-bit unsigned short -- confirm against
+             * the NumPy headers), `temp *= x` is evaluated in signed int
+             * and large operands overflow it, which is undefined.  The
+             * unsigned product wraps instead, and narrowing it gives the
+             * same value the signed form produced whenever it was defined.
+             */
+            temp = (npy_ushort)((unsigned int)temp *
+                                (unsigned int)(*(npy_ushort *)dataptr[i]));
+        }
+        /* For nop >= 1 the loop leaves i == nop, i.e. the output operand. */
+        *(npy_ushort *)dataptr[nop] = (temp +
+                                           (*(npy_ushort *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_ushort);
+        }
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+
+/*
+ * Single-operand reduction over a contiguous npy_ushort input into one
+ * output element: the sum of the `count` values at dataptr[0] is added to
+ * the value at dataptr[1].  `nop` and `strides` are not used.
+ */
+static NPY_GCC_OPT_3 void
+ushort_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    /* real-valued path: delegate the summation, then fold into the output */
+    npy_ushort total = ushort_sum_of_arr((npy_ushort *)dataptr[0], count);
+    npy_ushort *out = (npy_ushort *)dataptr[1];
+    *out += total;
+#else
+    /* complex path: interleaved (re, im) pairs */
+    npy_ushort *src = (npy_ushort *)dataptr[0];
+    npy_ushort sum_re = 0, sum_im = 0;
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* four complex values (eight scalars) per pass */
+    while (count > 4) {
+        const npy_ushort re01 = src[0] + src[2];
+        const npy_ushort re23 = src[4] + src[6];
+        const npy_ushort im13 = src[1] + src[3];
+        const npy_ushort im57 = src[5] + src[7];
+        sum_re += re01 + re23;
+        sum_im += im13 + im57;
+        count -= 4;
+        src += 4*2;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    while (count > 0) {
+        sum_re += src[0];
+        sum_im += src[1];
+        --count;
+        src += 2;
+    }
+    ((npy_ushort *)dataptr[1])[0] += sum_re;
+    ((npy_ushort *)dataptr[1])[1] += sum_im;
+#endif // !0
+}
+
+#endif /* 1000 == 1 */
+
+/*
+ * Reduction kernel for npy_ushort with an arbitrary operand count and a
+ * single output element.
+ *
+ * For each of the `count` steps the current elements of all operands are
+ * multiplied together and the product is accumulated locally; after the loop
+ * the accumulated total is added into the element at dataptr[nop].  In the
+ * branch compiled for this instantiation the operand pointers stored in
+ * `dataptr` are advanced in place by `strides[i]` bytes per step.
+ *
+ * Products are formed in unsigned int: npy_ushort operands would otherwise
+ * be promoted to signed int, and 65535 * 65535 does not fit in a 32-bit int
+ * (signed overflow is undefined behaviour).  Results are truncated back to
+ * npy_ushort, i.e. they wrap modulo 2**16 as before.
+ */
+static void
+ushort_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_ushort accum_re = 0, accum_im = 0;
+#else
+    npy_ushort accum = 0;
+#endif
+
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ushort_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        accum += (*(npy_ushort *)data0);
+        data0 += stride0;
+#  elif 1000 == 2
+        /* unsigned multiply: well-defined wrap-around, then truncate */
+        accum += (npy_ushort)((unsigned int)(*(npy_ushort *)data0) *
+                              (unsigned int)(*(npy_ushort *)data1));
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1000 == 3
+        /* unsigned multiply: well-defined wrap-around, then truncate */
+        accum += (npy_ushort)((unsigned int)(*(npy_ushort *)data0) *
+                              (unsigned int)(*(npy_ushort *)data1) *
+                              (unsigned int)(*(npy_ushort *)data2));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_ushort temp = (*(npy_ushort *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            /* unsigned multiply: well-defined wrap-around, then truncate */
+            temp = (npy_ushort)((unsigned int)temp *
+                        (unsigned int)(*(npy_ushort *)dataptr[i]));
+        }
+        accum += temp;
+        /* only the inputs advance; the output at dataptr[nop] stays put */
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        accum_re += ((npy_ushort *)data0)[0];
+        accum_im += ((npy_ushort *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ushort re, im, tmp;
+        int i;
+        re = ((npy_ushort *)dataptr[0])[0];
+        im = ((npy_ushort *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ushort *)dataptr[i])[0] -
+                  im * ((npy_ushort *)dataptr[i])[1];
+            im = re * ((npy_ushort *)dataptr[i])[1] +
+                 im * ((npy_ushort *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+    /* fold the locally accumulated total into the single output element */
+#if 0
+#  if 1000 <= 3
+    ((npy_ushort *)dataptr[1000])[0] += accum_re;
+    ((npy_ushort *)dataptr[1000])[1] += accum_im;
+#  else
+    ((npy_ushort *)dataptr[nop])[0] += accum_re;
+    ((npy_ushort *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1000 <= 3
+    *((npy_ushort *)dataptr[1000]) = (accum +
+                                    (*((npy_ushort *)dataptr[1000])));
+#  else
+    *((npy_ushort *)dataptr[nop]) = (accum +
+                                    (*((npy_ushort *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+
+
+#line 74
+
+#if !0
+/*
+ * Return the sum of the `count` contiguous npy_uint values starting at
+ * `data`.  Arithmetic is unsigned, so the total wraps on overflow.
+ */
+static NPY_GCC_OPT_3 npy_uint uint_sum_of_arr(npy_uint *data, npy_intp count)
+{
+    npy_uint total = 0;
+#if 0 // NPYV check for npy_uint
+    /* Vector path (disabled here): prefer aligned loads when possible */
+    const int aligned = EINSUM_IS_ALIGNED(data);
+    const int lanes = npyv_nlanes_u32;
+    const npy_intp lanes_x4 = lanes * 4;
+    npyv_u32 v_total = npyv_zero_u32();
+
+    if (aligned) {
+        for (; count >= lanes_x4; count -= lanes_x4, data += lanes_x4) {
+            npyv_u32 a0 = npyv_loada_u32(data + lanes * 0);
+            npyv_u32 a1 = npyv_loada_u32(data + lanes * 1);
+            npyv_u32 a2 = npyv_loada_u32(data + lanes * 2);
+            npyv_u32 a3 = npyv_loada_u32(data + lanes * 3);
+
+            npyv_u32 a01   = npyv_add_u32(a0, a1);
+            npyv_u32 a23   = npyv_add_u32(a2, a3);
+            npyv_u32 a0123 = npyv_add_u32(a01, a23);
+            v_total = npyv_add_u32(a0123, v_total);
+        }
+    }
+    else {
+        for (; count >= lanes_x4; count -= lanes_x4, data += lanes_x4) {
+            npyv_u32 a0 = npyv_load_u32(data + lanes * 0);
+            npyv_u32 a1 = npyv_load_u32(data + lanes * 1);
+            npyv_u32 a2 = npyv_load_u32(data + lanes * 2);
+            npyv_u32 a3 = npyv_load_u32(data + lanes * 3);
+
+            npyv_u32 a01   = npyv_add_u32(a0, a1);
+            npyv_u32 a23   = npyv_add_u32(a2, a3);
+            npyv_u32 a0123 = npyv_add_u32(a01, a23);
+            v_total = npyv_add_u32(a0123, v_total);
+        }
+    }
+
+    /* partial loads cover whatever is left */
+    for (; count > 0; count -= lanes, data += lanes) {
+        npyv_u32 a = npyv_load_tillz_u32(data, count);
+        v_total = npyv_add_u32(a, v_total);
+    }
+    total = npyv_sum_u32(v_total);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* four values per pass while more than four remain */
+    while (count > 4) {
+        const npy_uint pair_lo = data[0] + data[1];
+        const npy_uint pair_hi = data[2] + data[3];
+        total += pair_lo + pair_hi;
+        count -= 4;
+        data += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* one value at a time for the rest */
+    while (count > 0) {
+        total += data[0];
+        --count;
+        ++data;
+    }
+#endif // NPYV check for npy_uint
+    return total;
+}
+#endif
+
+#line 131
+/*
+ * Strided kernel for a single npy_uint operand: each of the `count` input
+ * elements (starting at dataptr[0], stepping strides[0] bytes) is added to
+ * the matching output element (starting at dataptr[1], stepping strides[1]
+ * bytes).  Branches whose preprocessor condition is false for this
+ * instantiation (more operands, complex values) are never compiled.
+ */
+static void
+uint_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1 == 1) || (1 <= 3 && !0)
+    char *src0 = dataptr[0];
+    const npy_intp step0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *src1 = dataptr[1];
+    const npy_intp step1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *src2 = dataptr[2];
+    const npy_intp step2 = strides[2];
+#endif
+#if (1 == 1) || (1 <= 3 && !0)
+    char *dst = dataptr[1];
+    const npy_intp step_dst = strides[1];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_one (%d)\n", (int)count);
+
+    for (; count != 0; --count) {
+#if !0
+#  if 1 == 1
+        *(npy_uint *)dst += *(npy_uint *)src0;
+        src0 += step0;
+        dst += step_dst;
+#  elif 1 == 2
+        *(npy_uint *)dst += (*(npy_uint *)src0) * (*(npy_uint *)src1);
+        src0 += step0;
+        src1 += step1;
+        dst += step_dst;
+#  elif 1 == 3
+        *(npy_uint *)dst += (*(npy_uint *)src0) *
+                            (*(npy_uint *)src1) *
+                            (*(npy_uint *)src2);
+        src0 += step0;
+        src1 += step1;
+        src2 += step2;
+        dst += step_dst;
+#  else
+        int k;
+        npy_uint prod = *(npy_uint *)dataptr[0];
+        for (k = 1; k < nop; ++k) {
+            prod *= *(npy_uint *)dataptr[k];
+        }
+        /* k is left at the index read back for the output here */
+        *(npy_uint *)dataptr[nop] = prod + *(npy_uint *)dataptr[k];
+        for (k = 0; k <= nop; ++k) {
+            dataptr[k] += strides[k];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        ((npy_uint *)dst)[0] += ((npy_uint *)src0)[0];
+        ((npy_uint *)dst)[1] += ((npy_uint *)src0)[1];
+        src0 += step0;
+        dst += step_dst;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        int k;
+        npy_uint acc_re = ((npy_uint *)dataptr[0])[0];
+        npy_uint acc_im = ((npy_uint *)dataptr[0])[1];
+        for (k = 1; k < _SUMPROD_NOP; ++k) {
+            const npy_uint op_re = ((npy_uint *)dataptr[k])[0];
+            const npy_uint op_im = ((npy_uint *)dataptr[k])[1];
+            const npy_uint next_re = acc_re * op_re - acc_im * op_im;
+            acc_im = acc_re * op_im + acc_im * op_re;
+            acc_re = next_re;
+        }
+        ((npy_uint *)dataptr[_SUMPROD_NOP])[0] += acc_re;
+        ((npy_uint *)dataptr[_SUMPROD_NOP])[1] += acc_im;
+
+        for (k = 0; k <= _SUMPROD_NOP; ++k) {
+            dataptr[k] += strides[k];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1 == 1
+/*
+ * Contiguous single-operand kernel for npy_uint: adds data0[k] into
+ * data_out[k] for each of the `count` elements, where data0 is dataptr[0]
+ * and data_out is dataptr[1].  `nop` and `strides` are not used.
+ *
+ * Control flow: the switch handles counts 0..7 directly via case
+ * fall-through and returns; larger counts skip the switch, run the 8-way
+ * unrolled loop, and then jump back to the switch for the remainder.
+ * The `#else` arms of every `#if !0` are the complex-number variants and
+ * are not compiled in this instantiation.
+ */
+static void
+uint_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+    npy_uint *data_out = (npy_uint *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/*
+ * The remainder switch sits before the main loop so that small counts
+ * (0..7) are handled without entering the loop at all.
+ */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_uint *)data_out + 2*6)[0] =
+                                    ((npy_uint *)data0 + 2*6)[0] +
+                                    ((npy_uint *)data_out + 2*6)[0];
+            ((npy_uint *)data_out + 2*6)[1] =
+                                    ((npy_uint *)data0 + 2*6)[1] +
+                                    ((npy_uint *)data_out + 2*6)[1];
+#endif
+            /* fall through */
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_uint *)data_out + 2*5)[0] =
+                                    ((npy_uint *)data0 + 2*5)[0] +
+                                    ((npy_uint *)data_out + 2*5)[0];
+            ((npy_uint *)data_out + 2*5)[1] =
+                                    ((npy_uint *)data0 + 2*5)[1] +
+                                    ((npy_uint *)data_out + 2*5)[1];
+#endif
+            /* fall through */
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_uint *)data_out + 2*4)[0] =
+                                    ((npy_uint *)data0 + 2*4)[0] +
+                                    ((npy_uint *)data_out + 2*4)[0];
+            ((npy_uint *)data_out + 2*4)[1] =
+                                    ((npy_uint *)data0 + 2*4)[1] +
+                                    ((npy_uint *)data_out + 2*4)[1];
+#endif
+            /* fall through */
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_uint *)data_out + 2*3)[0] =
+                                    ((npy_uint *)data0 + 2*3)[0] +
+                                    ((npy_uint *)data_out + 2*3)[0];
+            ((npy_uint *)data_out + 2*3)[1] =
+                                    ((npy_uint *)data0 + 2*3)[1] +
+                                    ((npy_uint *)data_out + 2*3)[1];
+#endif
+            /* fall through */
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_uint *)data_out + 2*2)[0] =
+                                    ((npy_uint *)data0 + 2*2)[0] +
+                                    ((npy_uint *)data_out + 2*2)[0];
+            ((npy_uint *)data_out + 2*2)[1] =
+                                    ((npy_uint *)data0 + 2*2)[1] +
+                                    ((npy_uint *)data_out + 2*2)[1];
+#endif
+            /* fall through */
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_uint *)data_out + 2*1)[0] =
+                                    ((npy_uint *)data0 + 2*1)[0] +
+                                    ((npy_uint *)data_out + 2*1)[0];
+            ((npy_uint *)data_out + 2*1)[1] =
+                                    ((npy_uint *)data0 + 2*1)[1] +
+                                    ((npy_uint *)data_out + 2*1)[1];
+#endif
+            /* fall through */
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_uint *)data_out + 2*0)[0] =
+                                    ((npy_uint *)data0 + 2*0)[0] +
+                                    ((npy_uint *)data_out + 2*0)[0];
+            ((npy_uint *)data_out + 2*0)[1] =
+                                    ((npy_uint *)data0 + 2*0)[1] +
+                                    ((npy_uint *)data_out + 2*0)[1];
+#endif
+            /* fall through */
+        case 0:
+            return;
+    }
+
+    /* Main loop, unrolled by 8: reached only when count >= 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*0)[0] =
+                                ((npy_uint *)data0 + 2*0)[0] +
+                                ((npy_uint *)data_out + 2*0)[0];
+        ((npy_uint *)data_out + 2*0)[1] =
+                                ((npy_uint *)data0 + 2*0)[1] +
+                                ((npy_uint *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*1)[0] =
+                                ((npy_uint *)data0 + 2*1)[0] +
+                                ((npy_uint *)data_out + 2*1)[0];
+        ((npy_uint *)data_out + 2*1)[1] =
+                                ((npy_uint *)data0 + 2*1)[1] +
+                                ((npy_uint *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*2)[0] =
+                                ((npy_uint *)data0 + 2*2)[0] +
+                                ((npy_uint *)data_out + 2*2)[0];
+        ((npy_uint *)data_out + 2*2)[1] =
+                                ((npy_uint *)data0 + 2*2)[1] +
+                                ((npy_uint *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*3)[0] =
+                                ((npy_uint *)data0 + 2*3)[0] +
+                                ((npy_uint *)data_out + 2*3)[0];
+        ((npy_uint *)data_out + 2*3)[1] =
+                                ((npy_uint *)data0 + 2*3)[1] +
+                                ((npy_uint *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*4)[0] =
+                                ((npy_uint *)data0 + 2*4)[0] +
+                                ((npy_uint *)data_out + 2*4)[0];
+        ((npy_uint *)data_out + 2*4)[1] =
+                                ((npy_uint *)data0 + 2*4)[1] +
+                                ((npy_uint *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*5)[0] =
+                                ((npy_uint *)data0 + 2*5)[0] +
+                                ((npy_uint *)data_out + 2*5)[0];
+        ((npy_uint *)data_out + 2*5)[1] =
+                                ((npy_uint *)data0 + 2*5)[1] +
+                                ((npy_uint *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*6)[0] =
+                                ((npy_uint *)data0 + 2*6)[0] +
+                                ((npy_uint *)data_out + 2*6)[0];
+        ((npy_uint *)data_out + 2*6)[1] =
+                                ((npy_uint *)data0 + 2*6)[1] +
+                                ((npy_uint *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*7)[0] =
+                                ((npy_uint *)data0 + 2*7)[0] +
+                                ((npy_uint *)data_out + 2*7)[0];
+        ((npy_uint *)data_out + 2*7)[1] =
+                                ((npy_uint *)data0 + 2*7)[1] +
+                                ((npy_uint *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Fewer than 8 elements remain: let the switch above finish them */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1 == 2 && !0
+
+/*
+ * Multiply-add helper: for each of the `count` contiguous npy_uint
+ * elements, stores scalar * data[i] + data_out[i] back into data_out[i].
+ *
+ * The first `#if 0` branch is the vector (npyv) variant and is not compiled
+ * for this type; the scalar branch processes four elements per pass unless
+ * NPY_DISABLE_OPTIMIZATION is defined, then finishes one element at a time.
+ */
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_muladd(npy_uint *data, npy_uint *data_out, npy_uint scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_uint
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u32;
+    const npyv_u32 v_scalar = npyv_setall_u32(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u32 b0 = npyv_loada_u32(data + vstep * 0);
+            npyv_u32 c0 = npyv_loada_u32(data_out + vstep * 0);
+            
+#line 312
+            npyv_u32 b1 = npyv_loada_u32(data + vstep * 1);
+            npyv_u32 c1 = npyv_loada_u32(data_out + vstep * 1);
+            
+#line 312
+            npyv_u32 b2 = npyv_loada_u32(data + vstep * 2);
+            npyv_u32 c2 = npyv_loada_u32(data_out + vstep * 2);
+            
+#line 312
+            npyv_u32 b3 = npyv_loada_u32(data + vstep * 3);
+            npyv_u32 c3 = npyv_loada_u32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u32 abc0 = npyv_muladd_u32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u32 abc1 = npyv_muladd_u32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u32 abc2 = npyv_muladd_u32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u32 abc3 = npyv_muladd_u32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_u32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_u32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_u32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_u32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* same loop as above, using the non-aligned load/store variants */
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u32 b0 = npyv_load_u32(data + vstep * 0);
+            npyv_u32 c0 = npyv_load_u32(data_out + vstep * 0);
+            
+#line 312
+            npyv_u32 b1 = npyv_load_u32(data + vstep * 1);
+            npyv_u32 c1 = npyv_load_u32(data_out + vstep * 1);
+            
+#line 312
+            npyv_u32 b2 = npyv_load_u32(data + vstep * 2);
+            npyv_u32 c2 = npyv_load_u32(data_out + vstep * 2);
+            
+#line 312
+            npyv_u32 b3 = npyv_load_u32(data + vstep * 3);
+            npyv_u32 c3 = npyv_load_u32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u32 abc0 = npyv_muladd_u32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u32 abc1 = npyv_muladd_u32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u32 abc2 = npyv_muladd_u32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u32 abc3 = npyv_muladd_u32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_u32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_u32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_u32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_u32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* remaining elements, one vector step at a time with partial load/store */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_u32 a = npyv_load_tillz_u32(data, count);
+        npyv_u32 b = npyv_load_tillz_u32(data_out, count);
+        npyv_store_till_u32(data_out, count, npyv_muladd_u32(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { /* four elements per pass */
+        #line 340
+        const npy_uint b0 = (data[0]);
+        const npy_uint c0 = (data_out[0]);
+        
+#line 340
+        const npy_uint b1 = (data[1]);
+        const npy_uint c1 = (data_out[1]);
+        
+#line 340
+        const npy_uint b2 = (data[2]);
+        const npy_uint c2 = (data_out[2]);
+        
+#line 340
+        const npy_uint b3 = (data[3]);
+        const npy_uint c3 = (data_out[3]);
+        
+        #line 346
+        const npy_uint abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_uint abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_uint abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_uint abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* leftover elements */
+        const npy_uint b = (*data);
+        const npy_uint c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_uint
+}
+/*
+ * Contiguous two-operand kernel for npy_uint: for each of the `count`
+ * elements, stores data0[i] * data1[i] + data_out[i] back into data_out[i],
+ * where data0, data1 and data_out are dataptr[0], dataptr[1] and dataptr[2].
+ * `nop` and `strides` are not used.
+ *
+ * The `#if 0` branch is the vector (npyv) variant and is not compiled for
+ * this type; the scalar branch processes four elements per pass unless
+ * NPY_DISABLE_OPTIMIZATION is defined, then finishes one element at a time.
+ */
+static void
+uint_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+    npy_uint *data1 = (npy_uint *)dataptr[1];
+    npy_uint *data_out = (npy_uint *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_uint
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u32;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u32 a0 = npyv_loada_u32(data0 + vstep * 0);
+            npyv_u32 b0 = npyv_loada_u32(data1 + vstep * 0);
+            npyv_u32 c0 = npyv_loada_u32(data_out + vstep * 0);
+            
+#line 390
+            npyv_u32 a1 = npyv_loada_u32(data0 + vstep * 1);
+            npyv_u32 b1 = npyv_loada_u32(data1 + vstep * 1);
+            npyv_u32 c1 = npyv_loada_u32(data_out + vstep * 1);
+            
+#line 390
+            npyv_u32 a2 = npyv_loada_u32(data0 + vstep * 2);
+            npyv_u32 b2 = npyv_loada_u32(data1 + vstep * 2);
+            npyv_u32 c2 = npyv_loada_u32(data_out + vstep * 2);
+            
+#line 390
+            npyv_u32 a3 = npyv_loada_u32(data0 + vstep * 3);
+            npyv_u32 b3 = npyv_loada_u32(data1 + vstep * 3);
+            npyv_u32 c3 = npyv_loada_u32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u32 abc0 = npyv_muladd_u32(a0, b0, c0);
+            
+#line 397
+            npyv_u32 abc1 = npyv_muladd_u32(a1, b1, c1);
+            
+#line 397
+            npyv_u32 abc2 = npyv_muladd_u32(a2, b2, c2);
+            
+#line 397
+            npyv_u32 abc3 = npyv_muladd_u32(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_u32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_u32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_u32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_u32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* same loop as above, using the non-aligned load/store variants */
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u32 a0 = npyv_load_u32(data0 + vstep * 0);
+            npyv_u32 b0 = npyv_load_u32(data1 + vstep * 0);
+            npyv_u32 c0 = npyv_load_u32(data_out + vstep * 0);
+            
+#line 390
+            npyv_u32 a1 = npyv_load_u32(data0 + vstep * 1);
+            npyv_u32 b1 = npyv_load_u32(data1 + vstep * 1);
+            npyv_u32 c1 = npyv_load_u32(data_out + vstep * 1);
+            
+#line 390
+            npyv_u32 a2 = npyv_load_u32(data0 + vstep * 2);
+            npyv_u32 b2 = npyv_load_u32(data1 + vstep * 2);
+            npyv_u32 c2 = npyv_load_u32(data_out + vstep * 2);
+            
+#line 390
+            npyv_u32 a3 = npyv_load_u32(data0 + vstep * 3);
+            npyv_u32 b3 = npyv_load_u32(data1 + vstep * 3);
+            npyv_u32 c3 = npyv_load_u32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u32 abc0 = npyv_muladd_u32(a0, b0, c0);
+            
+#line 397
+            npyv_u32 abc1 = npyv_muladd_u32(a1, b1, c1);
+            
+#line 397
+            npyv_u32 abc2 = npyv_muladd_u32(a2, b2, c2);
+            
+#line 397
+            npyv_u32 abc3 = npyv_muladd_u32(a3, b3, c3);
+            
+            #line 402
+            npyv_store_u32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_u32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_u32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_u32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* remaining elements, one vector step at a time with partial load/store */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_u32 a = npyv_load_tillz_u32(data0, count);
+        npyv_u32 b = npyv_load_tillz_u32(data1, count);
+        npyv_u32 c = npyv_load_tillz_u32(data_out, count);
+        npyv_store_till_u32(data_out, count, npyv_muladd_u32(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { /* four elements per pass */
+        #line 420
+        const npy_uint a0 = (data0[0]);
+        const npy_uint b0 = (data1[0]);
+        const npy_uint c0 = (data_out[0]);
+        
+#line 420
+        const npy_uint a1 = (data0[1]);
+        const npy_uint b1 = (data1[1]);
+        const npy_uint c1 = (data_out[1]);
+        
+#line 420
+        const npy_uint a2 = (data0[2]);
+        const npy_uint b2 = (data1[2]);
+        const npy_uint c2 = (data_out[2]);
+        
+#line 420
+        const npy_uint a3 = (data0[3]);
+        const npy_uint b3 = (data1[3]);
+        const npy_uint c3 = (data_out[3]);
+        
+        #line 427
+        const npy_uint abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_uint abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_uint abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_uint abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* leftover elements */
+        const npy_uint a = (*data0);
+        const npy_uint b = (*data1);
+        const npy_uint c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_uint
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand case where the first operand is a single value: reads that
+ * value from dataptr[0] and hands the contiguous second operand
+ * (dataptr[1]) and output (dataptr[2]) to uint_sum_of_products_muladd().
+ */
+static void
+uint_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_uint scalar = *(npy_uint *)dataptr[0];
+    npy_uint *vec = (npy_uint *)dataptr[1];
+    npy_uint *out = (npy_uint *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    uint_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+/*
+ * Two-operand case where the second operand is a single value: reads that
+ * value from dataptr[1] and hands the contiguous first operand
+ * (dataptr[0]) and output (dataptr[2]) to uint_sum_of_products_muladd().
+ */
+static void
+uint_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_uint scalar = *(npy_uint *)dataptr[1];
+    npy_uint *vec = (npy_uint *)dataptr[0];
+    npy_uint *out = (npy_uint *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    uint_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+    npy_uint *data1 = (npy_uint *)dataptr[1];
+    npy_uint accum = 0;
+    /* Reduction: dataptr[2][0] += sum over i < count of data0[i] * data1[i]. */
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_uint
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_u32;
+    npyv_u32 v_accum = npyv_zero_u32();
+    /* Main loop: four vectors per iteration; the two variants differ only in loada vs load. */
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u32 a0 = npyv_loada_u32(data0 + vstep * 0);
+            npyv_u32 b0 = npyv_loada_u32(data1 + vstep * 0);
+            
+#line 501
+            npyv_u32 a1 = npyv_loada_u32(data0 + vstep * 1);
+            npyv_u32 b1 = npyv_loada_u32(data1 + vstep * 1);
+            
+#line 501
+            npyv_u32 a2 = npyv_loada_u32(data0 + vstep * 2);
+            npyv_u32 b2 = npyv_loada_u32(data1 + vstep * 2);
+            
+#line 501
+            npyv_u32 a3 = npyv_loada_u32(data0 + vstep * 3);
+            npyv_u32 b3 = npyv_loada_u32(data1 + vstep * 3);
+            /* Chain the four multiply-adds through v_accum (a3*b3 first, a0*b0 last). */
+            npyv_u32 ab3 = npyv_muladd_u32(a3, b3, v_accum);
+            npyv_u32 ab2 = npyv_muladd_u32(a2, b2, ab3);
+            npyv_u32 ab1 = npyv_muladd_u32(a1, b1, ab2);
+                   v_accum = npyv_muladd_u32(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u32 a0 = npyv_load_u32(data0 + vstep * 0);
+            npyv_u32 b0 = npyv_load_u32(data1 + vstep * 0);
+            
+#line 501
+            npyv_u32 a1 = npyv_load_u32(data0 + vstep * 1);
+            npyv_u32 b1 = npyv_load_u32(data1 + vstep * 1);
+            
+#line 501
+            npyv_u32 a2 = npyv_load_u32(data0 + vstep * 2);
+            npyv_u32 b2 = npyv_load_u32(data1 + vstep * 2);
+            
+#line 501
+            npyv_u32 a3 = npyv_load_u32(data0 + vstep * 3);
+            npyv_u32 b3 = npyv_load_u32(data1 + vstep * 3);
+            
+            npyv_u32 ab3 = npyv_muladd_u32(a3, b3, v_accum);
+            npyv_u32 ab2 = npyv_muladd_u32(a2, b2, ab3);
+            npyv_u32 ab1 = npyv_muladd_u32(a1, b1, ab2);
+                   v_accum = npyv_muladd_u32(a0, b0, ab1);
+        }
+    }
+    /* Remainder: one vector per step, with the loads given the remaining count. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_u32 a = npyv_load_tillz_u32(data0, count);
+        npyv_u32 b = npyv_load_tillz_u32(data1, count);
+        v_accum = npyv_muladd_u32(a, b, v_accum);
+    }
+    accum = npyv_sum_u32(v_accum);
+    npyv_cleanup();
+#else /* the check above is a literal 0, so this scalar path is the one compiled */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_uint ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_uint ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_uint ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_uint ab3 = (data0[3]) * (data1[3]);
+        /* Fold the four products of this unrolled step into the running total. */
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_uint a = (*data0);
+        const npy_uint b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_uint
+    *(npy_uint *)dataptr[2] = ((*(npy_uint *)dataptr[2]) + accum);
+}
+
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Scalar operand 0 is factored out: *out += scalar0 * uint_sum_of_arr(operand 1, count). */
+    const npy_uint scalar0 = *(npy_uint *)dataptr[0];
+    const npy_uint total = uint_sum_of_arr((npy_uint *)dataptr[1], count);
+    *(npy_uint *)dataptr[2] += scalar0 * total;
+}
+
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Scalar operand 1 is factored out: *out += scalar1 * uint_sum_of_arr(operand 0, count). */
+    const npy_uint total = uint_sum_of_arr((npy_uint *)dataptr[0], count);
+    const npy_uint scalar1 = *(npy_uint *)dataptr[1];
+    *(npy_uint *)dataptr[2] += scalar1 * total;
+}
+
+#elif 1 == 3 && !0
+
+static void
+uint_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+    npy_uint *data1 = (npy_uint *)dataptr[1];
+    npy_uint *data2 = (npy_uint *)dataptr[2];
+    npy_uint *data_out = (npy_uint *)dataptr[3];
+    /* Element-wise: data_out[i] += data0[i] * data1[i] * data2[i] for count contiguous items. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+        /* Step all four pointers past the eight elements just handled. */
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    /* Fewer than 8 items remain; each stanza handles one and returns once count reaches 0. */
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1 > 3 || @complex */
+
+static void
+uint_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+    /* Generic path: per item, multiply the nop inputs together and add into dataptr[nop]. */
+    while (count--) {
+#if !0
+        npy_uint temp = (*(npy_uint *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_uint *)dataptr[i]);
+        }
+        *(npy_uint *)dataptr[nop] = (temp +
+                                           (*(npy_uint *)dataptr[i])); /* i == nop after the loop above */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_uint); /* advance inputs and output by one element */
+        }
+#else /* complex */
+#  if 1 <= 3
+#    define _SUMPROD_NOP 1
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_uint re, im, tmp;
+        int i;
+        re = ((npy_uint *)dataptr[0])[0];
+        im = ((npy_uint *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_uint *)dataptr[i])[0] -
+                  im * ((npy_uint *)dataptr[i])[1];
+            im = re * ((npy_uint *)dataptr[i])[1] +
+                 im * ((npy_uint *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_uint *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_uint *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_uint *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_uint *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_uint);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1 */
+
+#if 1 == 1
+
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0 /* always true in this expansion: the three lines below are the live body */
+    npy_uint *data = (npy_uint *)dataptr[0];
+    npy_uint accum = uint_sum_of_arr(data, count);
+    *((npy_uint *)dataptr[1]) = (accum + (*((npy_uint *)dataptr[1]))); /* add helper result into the single output */
+#else /* paired re/im accumulation; skipped by the preprocessor in this expansion */
+    npy_uint accum_re = 0, accum_im = 0;
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_uint re01 = data0[0] + data0[2];
+        const npy_uint re23 = data0[4] + data0[6];
+        const npy_uint im13 = data0[1] + data0[3];
+        const npy_uint im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_uint *)dataptr[1])[0] += accum_re;
+    ((npy_uint *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1 == 1 */
+
+static void
+uint_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_uint accum_re = 0, accum_im = 0;
+#else
+    npy_uint accum = 0;
+#endif
+    /* Strided reduction: inputs step by their strides; the output is only written once, at the end. */
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+    /* With the conditions as expanded here, only the "1 == 1" stanza of the first branch is live. */
+    while (count--) {
+#if !0
+#  if 1 == 1
+        accum += (*(npy_uint *)data0);
+        data0 += stride0;
+#  elif 1 == 2
+        accum += (*(npy_uint *)data0) *
+                 (*(npy_uint *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1 == 3
+        accum += (*(npy_uint *)data0) *
+                 (*(npy_uint *)data1) *
+                 (*(npy_uint *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_uint temp = (*(npy_uint *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_uint *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        accum_re += ((npy_uint *)data0)[0];
+        accum_im += ((npy_uint *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_uint re, im, tmp;
+        int i;
+        re = ((npy_uint *)dataptr[0])[0];
+        im = ((npy_uint *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_uint *)dataptr[i])[0] -
+                  im * ((npy_uint *)dataptr[i])[1];
+            im = re * ((npy_uint *)dataptr[i])[1] +
+                 im * ((npy_uint *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* Add the accumulated total into the output slot (dataptr[1] in this expansion). */
+#if 0
+#  if 1 <= 3
+    ((npy_uint *)dataptr[1])[0] += accum_re;
+    ((npy_uint *)dataptr[1])[1] += accum_im;
+#  else
+    ((npy_uint *)dataptr[nop])[0] += accum_re;
+    ((npy_uint *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1 <= 3
+    *((npy_uint *)dataptr[1]) = (accum +
+                                    (*((npy_uint *)dataptr[1])));
+#  else
+    *((npy_uint *)dataptr[nop]) = (accum +
+                                    (*((npy_uint *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+uint_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data_out = dataptr[2];
+    npy_intp stride_out = strides[2];
+#endif
+    /* Fully strided two-operand case: *out += *in0 * *in1, each pointer stepping by its own stride. */
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_two (%d)\n", (int)count);
+    /* With the conditions as expanded here, only the "2 == 2" stanza of the first branch is live. */
+    while (count--) {
+#if !0
+#  if 2 == 1
+        *(npy_uint *)data_out = ((*(npy_uint *)data0) +
+                                         (*(npy_uint *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 2 == 2
+        *(npy_uint *)data_out = ((*(npy_uint *)data0) *
+                                         (*(npy_uint *)data1) +
+                                         (*(npy_uint *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 2 == 3
+        *(npy_uint *)data_out = ((*(npy_uint *)data0) *
+                                         (*(npy_uint *)data1) *
+                                         (*(npy_uint *)data2) +
+                                         (*(npy_uint *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_uint temp = (*(npy_uint *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_uint *)dataptr[i]);
+        }
+        *(npy_uint *)dataptr[nop] = (temp +
+                                           (*(npy_uint *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        ((npy_uint *)data_out)[0] = ((npy_uint *)data0)[0] +
+                                         ((npy_uint *)data_out)[0];
+        ((npy_uint *)data_out)[1] = ((npy_uint *)data0)[1] +
+                                         ((npy_uint *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_uint re, im, tmp;
+        int i;
+        re = ((npy_uint *)dataptr[0])[0];
+        im = ((npy_uint *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_uint *)dataptr[i])[0] -
+                  im * ((npy_uint *)dataptr[i])[1];
+            im = re * ((npy_uint *)dataptr[i])[1] +
+                 im * ((npy_uint *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_uint *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_uint *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_uint *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_uint *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 2 == 1
+
+static void
+uint_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+    npy_uint *data_out = (npy_uint *)dataptr[1];
+    /* data_out[i] += data0[i] over count contiguous items, handled eight at a time. */
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_uint *)data_out + 2*6)[0] =
+                                    ((npy_uint *)data0 + 2*6)[0] +
+                                    ((npy_uint *)data_out + 2*6)[0];
+            ((npy_uint *)data_out + 2*6)[1] =
+                                    ((npy_uint *)data0 + 2*6)[1] +
+                                    ((npy_uint *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_uint *)data_out + 2*5)[0] =
+                                    ((npy_uint *)data0 + 2*5)[0] +
+                                    ((npy_uint *)data_out + 2*5)[0];
+            ((npy_uint *)data_out + 2*5)[1] =
+                                    ((npy_uint *)data0 + 2*5)[1] +
+                                    ((npy_uint *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_uint *)data_out + 2*4)[0] =
+                                    ((npy_uint *)data0 + 2*4)[0] +
+                                    ((npy_uint *)data_out + 2*4)[0];
+            ((npy_uint *)data_out + 2*4)[1] =
+                                    ((npy_uint *)data0 + 2*4)[1] +
+                                    ((npy_uint *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_uint *)data_out + 2*3)[0] =
+                                    ((npy_uint *)data0 + 2*3)[0] +
+                                    ((npy_uint *)data_out + 2*3)[0];
+            ((npy_uint *)data_out + 2*3)[1] =
+                                    ((npy_uint *)data0 + 2*3)[1] +
+                                    ((npy_uint *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_uint *)data_out + 2*2)[0] =
+                                    ((npy_uint *)data0 + 2*2)[0] +
+                                    ((npy_uint *)data_out + 2*2)[0];
+            ((npy_uint *)data_out + 2*2)[1] =
+                                    ((npy_uint *)data0 + 2*2)[1] +
+                                    ((npy_uint *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_uint *)data_out + 2*1)[0] =
+                                    ((npy_uint *)data0 + 2*1)[0] +
+                                    ((npy_uint *)data_out + 2*1)[0];
+            ((npy_uint *)data_out + 2*1)[1] =
+                                    ((npy_uint *)data0 + 2*1)[1] +
+                                    ((npy_uint *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_uint *)data_out + 2*0)[0] =
+                                    ((npy_uint *)data0 + 2*0)[0] +
+                                    ((npy_uint *)data_out + 2*0)[0];
+            ((npy_uint *)data_out + 2*0)[1] =
+                                    ((npy_uint *)data0 + 2*0)[1] +
+                                    ((npy_uint *)data_out + 2*0)[1];
+#endif
+        /* No case above ends in break, so entry at "k+1" also runs every lower case. */
+        case 0:
+            return;
+    }
+    /* Reached only when no case label matched count (the labels cover 0 through 7). */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*0)[0] =
+                                ((npy_uint *)data0 + 2*0)[0] +
+                                ((npy_uint *)data_out + 2*0)[0];
+        ((npy_uint *)data_out + 2*0)[1] =
+                                ((npy_uint *)data0 + 2*0)[1] +
+                                ((npy_uint *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*1)[0] =
+                                ((npy_uint *)data0 + 2*1)[0] +
+                                ((npy_uint *)data_out + 2*1)[0];
+        ((npy_uint *)data_out + 2*1)[1] =
+                                ((npy_uint *)data0 + 2*1)[1] +
+                                ((npy_uint *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*2)[0] =
+                                ((npy_uint *)data0 + 2*2)[0] +
+                                ((npy_uint *)data_out + 2*2)[0];
+        ((npy_uint *)data_out + 2*2)[1] =
+                                ((npy_uint *)data0 + 2*2)[1] +
+                                ((npy_uint *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*3)[0] =
+                                ((npy_uint *)data0 + 2*3)[0] +
+                                ((npy_uint *)data_out + 2*3)[0];
+        ((npy_uint *)data_out + 2*3)[1] =
+                                ((npy_uint *)data0 + 2*3)[1] +
+                                ((npy_uint *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*4)[0] =
+                                ((npy_uint *)data0 + 2*4)[0] +
+                                ((npy_uint *)data_out + 2*4)[0];
+        ((npy_uint *)data_out + 2*4)[1] =
+                                ((npy_uint *)data0 + 2*4)[1] +
+                                ((npy_uint *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*5)[0] =
+                                ((npy_uint *)data0 + 2*5)[0] +
+                                ((npy_uint *)data_out + 2*5)[0];
+        ((npy_uint *)data_out + 2*5)[1] =
+                                ((npy_uint *)data0 + 2*5)[1] +
+                                ((npy_uint *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*6)[0] =
+                                ((npy_uint *)data0 + 2*6)[0] +
+                                ((npy_uint *)data_out + 2*6)[0];
+        ((npy_uint *)data_out + 2*6)[1] =
+                                ((npy_uint *)data0 + 2*6)[1] +
+                                ((npy_uint *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*7)[0] =
+                                ((npy_uint *)data0 + 2*7)[0] +
+                                ((npy_uint *)data_out + 2*7)[0];
+        ((npy_uint *)data_out + 2*7)[1] =
+                                ((npy_uint *)data0 + 2*7)[1] +
+                                ((npy_uint *)data_out + 2*7)[1];
+#endif
+        /* Advance both pointers past the eight items just processed. */
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 2 == 2 && !0
+
+// Multiply-accumulate by a scalar: data_out[i] = scalar * data[i] + data_out[i] for each i in [0, count).
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_muladd(npy_uint *data, npy_uint *data_out, npy_uint scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_uint
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u32;
+    const npyv_u32 v_scalar = npyv_setall_u32(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u32 b0 = npyv_loada_u32(data + vstep * 0);
+            npyv_u32 c0 = npyv_loada_u32(data_out + vstep * 0);
+            
+#line 312
+            npyv_u32 b1 = npyv_loada_u32(data + vstep * 1);
+            npyv_u32 c1 = npyv_loada_u32(data_out + vstep * 1);
+            
+#line 312
+            npyv_u32 b2 = npyv_loada_u32(data + vstep * 2);
+            npyv_u32 c2 = npyv_loada_u32(data_out + vstep * 2);
+            
+#line 312
+            npyv_u32 b3 = npyv_loada_u32(data + vstep * 3);
+            npyv_u32 c3 = npyv_loada_u32(data_out + vstep * 3);
+            /* Multiply-add each loaded (data, data_out) vector pair against v_scalar. */
+            #line 318
+            npyv_u32 abc0 = npyv_muladd_u32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u32 abc1 = npyv_muladd_u32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u32 abc2 = npyv_muladd_u32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u32 abc3 = npyv_muladd_u32(v_scalar, b3, c3);
+            /* Store the four results back over data_out (storea variant in this branch). */
+            #line 323
+            npyv_storea_u32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_u32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_u32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_u32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u32 b0 = npyv_load_u32(data + vstep * 0);
+            npyv_u32 c0 = npyv_load_u32(data_out + vstep * 0);
+            
+#line 312
+            npyv_u32 b1 = npyv_load_u32(data + vstep * 1);
+            npyv_u32 c1 = npyv_load_u32(data_out + vstep * 1);
+            
+#line 312
+            npyv_u32 b2 = npyv_load_u32(data + vstep * 2);
+            npyv_u32 c2 = npyv_load_u32(data_out + vstep * 2);
+            
+#line 312
+            npyv_u32 b3 = npyv_load_u32(data + vstep * 3);
+            npyv_u32 c3 = npyv_load_u32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u32 abc0 = npyv_muladd_u32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u32 abc1 = npyv_muladd_u32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u32 abc2 = npyv_muladd_u32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u32 abc3 = npyv_muladd_u32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_u32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_u32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_u32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_u32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Remainder: one vector per step, with loads and the store given the remaining count. */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_u32 a = npyv_load_tillz_u32(data, count);
+        npyv_u32 b = npyv_load_tillz_u32(data_out, count);
+        npyv_store_till_u32(data_out, count, npyv_muladd_u32(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* the check above is a literal 0, so this scalar path is the one compiled */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_uint b0 = (data[0]);
+        const npy_uint c0 = (data_out[0]);
+        
+#line 340
+        const npy_uint b1 = (data[1]);
+        const npy_uint c1 = (data_out[1]);
+        
+#line 340
+        const npy_uint b2 = (data[2]);
+        const npy_uint c2 = (data_out[2]);
+        
+#line 340
+        const npy_uint b3 = (data[3]);
+        const npy_uint c3 = (data_out[3]);
+        /* All four inputs are read above before any result is computed or stored. */
+        #line 346
+        const npy_uint abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_uint abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_uint abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_uint abc3 = scalar * b3 + c3;
+        /* Write the four results back to data_out. */
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_uint b = (*data);
+        const npy_uint c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_uint
+}
+
+static void
+uint_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /*
+     * Two-operand kernel over densely indexed npy_uint arrays: for each of the
+     * `count` elements, data_out[i] = data0[i] * data1[i] + data_out[i], with
+     * data0, data1 and data_out taken from dataptr[0], dataptr[1] and
+     * dataptr[2]. `nop` is not referenced and `strides` is marked NPY_UNUSED.
+     */
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+    npy_uint *data1 = (npy_uint *)dataptr[1];
+    npy_uint *data_out = (npy_uint *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_uint: the condition below is the literal 0, so the SIMD branch is compiled out
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u32;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u32 a0 = npyv_loada_u32(data0 + vstep * 0);
+            npyv_u32 b0 = npyv_loada_u32(data1 + vstep * 0);
+            npyv_u32 c0 = npyv_loada_u32(data_out + vstep * 0);
+            
+#line 390
+            npyv_u32 a1 = npyv_loada_u32(data0 + vstep * 1);
+            npyv_u32 b1 = npyv_loada_u32(data1 + vstep * 1);
+            npyv_u32 c1 = npyv_loada_u32(data_out + vstep * 1);
+            
+#line 390
+            npyv_u32 a2 = npyv_loada_u32(data0 + vstep * 2);
+            npyv_u32 b2 = npyv_loada_u32(data1 + vstep * 2);
+            npyv_u32 c2 = npyv_loada_u32(data_out + vstep * 2);
+            
+#line 390
+            npyv_u32 a3 = npyv_loada_u32(data0 + vstep * 3);
+            npyv_u32 b3 = npyv_loada_u32(data1 + vstep * 3);
+            npyv_u32 c3 = npyv_loada_u32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u32 abc0 = npyv_muladd_u32(a0, b0, c0);
+            
+#line 397
+            npyv_u32 abc1 = npyv_muladd_u32(a1, b1, c1);
+            
+#line 397
+            npyv_u32 abc2 = npyv_muladd_u32(a2, b2, c2);
+            
+#line 397
+            npyv_u32 abc3 = npyv_muladd_u32(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_u32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_u32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_u32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_u32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u32 a0 = npyv_load_u32(data0 + vstep * 0);
+            npyv_u32 b0 = npyv_load_u32(data1 + vstep * 0);
+            npyv_u32 c0 = npyv_load_u32(data_out + vstep * 0);
+            
+#line 390
+            npyv_u32 a1 = npyv_load_u32(data0 + vstep * 1);
+            npyv_u32 b1 = npyv_load_u32(data1 + vstep * 1);
+            npyv_u32 c1 = npyv_load_u32(data_out + vstep * 1);
+            
+#line 390
+            npyv_u32 a2 = npyv_load_u32(data0 + vstep * 2);
+            npyv_u32 b2 = npyv_load_u32(data1 + vstep * 2);
+            npyv_u32 c2 = npyv_load_u32(data_out + vstep * 2);
+            
+#line 390
+            npyv_u32 a3 = npyv_load_u32(data0 + vstep * 3);
+            npyv_u32 b3 = npyv_load_u32(data1 + vstep * 3);
+            npyv_u32 c3 = npyv_load_u32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u32 abc0 = npyv_muladd_u32(a0, b0, c0);
+            
+#line 397
+            npyv_u32 abc1 = npyv_muladd_u32(a1, b1, c1);
+            
+#line 397
+            npyv_u32 abc2 = npyv_muladd_u32(a2, b2, c2);
+            
+#line 397
+            npyv_u32 abc3 = npyv_muladd_u32(a3, b3, c3);
+            
+            #line 402
+            npyv_store_u32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_u32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_u32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_u32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_u32 a = npyv_load_tillz_u32(data0, count);
+        npyv_u32 b = npyv_load_tillz_u32(data1, count);
+        npyv_u32 c = npyv_load_tillz_u32(data_out, count);
+        npyv_store_till_u32(data_out, count, npyv_muladd_u32(a, b, c));
+    }
+    npyv_cleanup();
+#else /* scalar path: the branch that is actually compiled here */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_uint a0 = (data0[0]);
+        const npy_uint b0 = (data1[0]);
+        const npy_uint c0 = (data_out[0]);
+        
+#line 420
+        const npy_uint a1 = (data0[1]);
+        const npy_uint b1 = (data1[1]);
+        const npy_uint c1 = (data_out[1]);
+        
+#line 420
+        const npy_uint a2 = (data0[2]);
+        const npy_uint b2 = (data1[2]);
+        const npy_uint c2 = (data_out[2]);
+        
+#line 420
+        const npy_uint a3 = (data0[3]);
+        const npy_uint b3 = (data1[3]);
+        const npy_uint c3 = (data_out[3]);
+        
+        #line 427
+        const npy_uint abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_uint abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_uint abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_uint abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* leftover elements, one at a time */
+        const npy_uint a = (*data0);
+        const npy_uint b = (*data1);
+        const npy_uint c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_uint
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand specialization where operand 0 is a single npy_uint read once
+ * from dataptr[0], and operand 1 (dataptr[1]) and the output (dataptr[2]) are
+ * handed to uint_sum_of_products_muladd as arrays of `count` elements, i.e.
+ * out = operand1 * scalar + out. `nop` is not referenced and `strides` is
+ * marked NPY_UNUSED.
+ */
+static void
+uint_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *const out = (npy_uint *)dataptr[2];
+    npy_uint *const vec = (npy_uint *)dataptr[1];
+    const npy_uint scalar = *(npy_uint *)dataptr[0];
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    uint_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+/*
+ * Mirror image of the stride0/contig case: operand 1 is a single npy_uint
+ * read once from dataptr[1], while operand 0 (dataptr[0]) and the output
+ * (dataptr[2]) are handed to uint_sum_of_products_muladd as arrays of `count`
+ * elements, i.e. out = operand0 * scalar + out. `nop` is not referenced and
+ * `strides` is marked NPY_UNUSED.
+ */
+static void
+uint_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *const out = (npy_uint *)dataptr[2];
+    npy_uint *const vec = (npy_uint *)dataptr[0];
+    const npy_uint scalar = *(npy_uint *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    uint_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /*
+     * Dot-product style kernel: accumulates data0[i] * data1[i] over `count`
+     * densely indexed npy_uint elements (dataptr[0] and dataptr[1]) into a
+     * local `accum`, then adds that total to the single npy_uint stored at
+     * dataptr[2]. `nop` is not referenced and `strides` is marked NPY_UNUSED.
+     */
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+    npy_uint *data1 = (npy_uint *)dataptr[1];
+    npy_uint accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_uint: literal 0, so the SIMD branch below is compiled out
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_u32;
+    npyv_u32 v_accum = npyv_zero_u32();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u32 a0 = npyv_loada_u32(data0 + vstep * 0);
+            npyv_u32 b0 = npyv_loada_u32(data1 + vstep * 0);
+            
+#line 501
+            npyv_u32 a1 = npyv_loada_u32(data0 + vstep * 1);
+            npyv_u32 b1 = npyv_loada_u32(data1 + vstep * 1);
+            
+#line 501
+            npyv_u32 a2 = npyv_loada_u32(data0 + vstep * 2);
+            npyv_u32 b2 = npyv_loada_u32(data1 + vstep * 2);
+            
+#line 501
+            npyv_u32 a3 = npyv_loada_u32(data0 + vstep * 3);
+            npyv_u32 b3 = npyv_loada_u32(data1 + vstep * 3);
+            
+            npyv_u32 ab3 = npyv_muladd_u32(a3, b3, v_accum);
+            npyv_u32 ab2 = npyv_muladd_u32(a2, b2, ab3);
+            npyv_u32 ab1 = npyv_muladd_u32(a1, b1, ab2);
+                   v_accum = npyv_muladd_u32(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u32 a0 = npyv_load_u32(data0 + vstep * 0);
+            npyv_u32 b0 = npyv_load_u32(data1 + vstep * 0);
+            
+#line 501
+            npyv_u32 a1 = npyv_load_u32(data0 + vstep * 1);
+            npyv_u32 b1 = npyv_load_u32(data1 + vstep * 1);
+            
+#line 501
+            npyv_u32 a2 = npyv_load_u32(data0 + vstep * 2);
+            npyv_u32 b2 = npyv_load_u32(data1 + vstep * 2);
+            
+#line 501
+            npyv_u32 a3 = npyv_load_u32(data0 + vstep * 3);
+            npyv_u32 b3 = npyv_load_u32(data1 + vstep * 3);
+            
+            npyv_u32 ab3 = npyv_muladd_u32(a3, b3, v_accum);
+            npyv_u32 ab2 = npyv_muladd_u32(a2, b2, ab3);
+            npyv_u32 ab1 = npyv_muladd_u32(a1, b1, ab2);
+                   v_accum = npyv_muladd_u32(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_u32 a = npyv_load_tillz_u32(data0, count);
+        npyv_u32 b = npyv_load_tillz_u32(data1, count);
+        v_accum = npyv_muladd_u32(a, b, v_accum);
+    }
+    accum = npyv_sum_u32(v_accum);
+    npyv_cleanup();
+#else /* scalar path: the branch that is actually compiled here */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_uint ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_uint ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_uint ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_uint ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* leftover elements, one at a time */
+        const npy_uint a = (*data0);
+        const npy_uint b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_uint
+    *(npy_uint *)dataptr[2] = ((*(npy_uint *)dataptr[2]) + accum); /* add the total to the existing output value */
+}
+
+/*
+ * Scalar-times-array reduction: reads one npy_uint from dataptr[0], obtains
+ * uint_sum_of_arr() over the `count` elements at dataptr[1] (presumably their
+ * sum - confirm against that helper), and adds scalar * sum to the single
+ * npy_uint at dataptr[2]. `nop` is not referenced and `strides` is marked
+ * NPY_UNUSED.
+ */
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *const out = (npy_uint *)dataptr[2];
+    const npy_uint scalar = *(npy_uint *)dataptr[0];
+    const npy_uint total = uint_sum_of_arr((npy_uint *)dataptr[1], count);
+
+    *out = *out + scalar * total;
+}
+
+/*
+ * Array-times-scalar reduction: obtains uint_sum_of_arr() over the `count`
+ * elements at dataptr[0] (presumably their sum - confirm against that
+ * helper), reads one npy_uint from dataptr[1], and adds scalar * sum to the
+ * single npy_uint at dataptr[2]. `nop` is not referenced and `strides` is
+ * marked NPY_UNUSED.
+ */
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *const out = (npy_uint *)dataptr[2];
+    const npy_uint scalar = *(npy_uint *)dataptr[1];
+    const npy_uint total = uint_sum_of_arr((npy_uint *)dataptr[0], count);
+
+    *out = *out + scalar * total;
+}
+
+#elif 2 == 3 && !0
+
+static void
+uint_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /*
+     * Three-operand kernel over densely indexed npy_uint arrays:
+     * data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i], with the
+     * operands taken from dataptr[0..2] and the output from dataptr[3].
+     * `nop` is not referenced and `strides` is marked NPY_UNUSED.
+     * NOTE(review): this definition sits under `#elif 2 == 3 && !0`, which is
+     * false as written, so it looks excluded from compilation - confirm.
+     */
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+    npy_uint *data1 = (npy_uint *)dataptr[1];
+    npy_uint *data2 = (npy_uint *)dataptr[2];
+    npy_uint *data_out = (npy_uint *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: count < 8 here; each step below returns as soon as count reaches 0 */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 2 > 3 || @complex */
+
+static void
+uint_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /*
+     * Generic contiguous kernel. In the `#if !0` branch, each of the `count`
+     * iterations multiplies the npy_uint values at dataptr[0..nop-1], adds the
+     * value at dataptr[nop], stores the result back at dataptr[nop], and then
+     * advances all nop+1 entries of the caller's dataptr array by
+     * sizeof(npy_uint). `strides` is marked NPY_UNUSED.
+     * NOTE(review): this shares its name with another definition in the file
+     * and sits in the trailing `#else` of a preprocessor conditional, so
+     * presumably only one of the two is ever compiled - confirm.
+     */
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+        npy_uint temp = (*(npy_uint *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_uint *)dataptr[i]);
+        }
+        *(npy_uint *)dataptr[nop] = (temp +
+                                           (*(npy_uint *)dataptr[i])); /* i == nop after the loop: the current output value */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_uint);
+        }
+#else /* complex */
+#  if 2 <= 3
+#    define _SUMPROD_NOP 2
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_uint re, im, tmp;
+        int i;
+        re = ((npy_uint *)dataptr[0])[0];
+        im = ((npy_uint *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_uint *)dataptr[i])[0] -
+                  im * ((npy_uint *)dataptr[i])[1];
+            im = re * ((npy_uint *)dataptr[i])[1] +
+                 im * ((npy_uint *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_uint *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_uint *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_uint *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_uint *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_uint);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /*
+     * One-operand reduction. In the `#if !0` branch it passes the `count`
+     * npy_uint elements at dataptr[0] to uint_sum_of_arr() (presumably their
+     * sum - confirm against that helper) and adds the result to the single
+     * npy_uint at dataptr[1]. `nop` and `strides` are not referenced.
+     * NOTE(review): the surrounding guard is `#if 2 == 1`, which is false as
+     * written, so this definition looks excluded from compilation - confirm.
+     */
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_uint *data = (npy_uint *)dataptr[0];
+    npy_uint accum = uint_sum_of_arr(data, count);
+    *((npy_uint *)dataptr[1]) = (accum + (*((npy_uint *)dataptr[1])));
+#else
+    npy_uint accum_re = 0, accum_im = 0;
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_uint re01 = data0[0] + data0[2];
+        const npy_uint re23 = data0[4] + data0[6];
+        const npy_uint im13 = data0[1] + data0[3];
+        const npy_uint im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_uint *)dataptr[1])[0] += accum_re;
+    ((npy_uint *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 2 == 1 */
+
+static void
+uint_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /*
+     * Strided two-operand reduction into a single output value. With the
+     * preprocessor constants as written, the compiled path walks dataptr[0]
+     * and dataptr[1] as char pointers, stepping each by strides[0] and
+     * strides[1] bytes per iteration, accumulates the npy_uint product of the
+     * two current elements over `count` iterations, and finally adds the
+     * total to the npy_uint at dataptr[2]. `nop` is only used by branches
+     * that are not selected here.
+     */
+#if 0 /* real/imaginary accumulator pair; this branch is compiled out */
+    npy_uint accum_re = 0, accum_im = 0;
+#else
+    npy_uint accum = 0;
+#endif
+
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 2 == 1
+        accum += (*(npy_uint *)data0);
+        data0 += stride0;
+#  elif 2 == 2 /* the condition that holds for this two-operand expansion */
+        accum += (*(npy_uint *)data0) *
+                 (*(npy_uint *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 2 == 3
+        accum += (*(npy_uint *)data0) *
+                 (*(npy_uint *)data1) *
+                 (*(npy_uint *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_uint temp = (*(npy_uint *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_uint *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        accum_re += ((npy_uint *)data0)[0];
+        accum_im += ((npy_uint *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_uint re, im, tmp;
+        int i;
+        re = ((npy_uint *)dataptr[0])[0];
+        im = ((npy_uint *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_uint *)dataptr[i])[0] -
+                  im * ((npy_uint *)dataptr[i])[1];
+            im = re * ((npy_uint *)dataptr[i])[1] +
+                 im * ((npy_uint *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0
+#  if 2 <= 3
+    ((npy_uint *)dataptr[2])[0] += accum_re;
+    ((npy_uint *)dataptr[2])[1] += accum_im;
+#  else
+    ((npy_uint *)dataptr[nop])[0] += accum_re;
+    ((npy_uint *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 2 <= 3
+    *((npy_uint *)dataptr[2]) = (accum +
+                                    (*((npy_uint *)dataptr[2])));
+#  else
+    *((npy_uint *)dataptr[nop]) = (accum +
+                                    (*((npy_uint *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+uint_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /*
+     * Strided three-operand kernel. With the preprocessor constants as
+     * written, the compiled path walks dataptr[0..2] and the output
+     * dataptr[3] as char pointers, and on each of the `count` iterations
+     * stores (*data0) * (*data1) * (*data2) + (*data_out) back to *data_out
+     * as npy_uint, then steps every pointer by its entry in `strides`
+     * (byte offsets, since the pointers are char *). `nop` is only used by
+     * branches that are not selected here.
+     */
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data_out = dataptr[3];
+    npy_intp stride_out = strides[3];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_three (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 3 == 1
+        *(npy_uint *)data_out = ((*(npy_uint *)data0) +
+                                         (*(npy_uint *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 3 == 2
+        *(npy_uint *)data_out = ((*(npy_uint *)data0) *
+                                         (*(npy_uint *)data1) +
+                                         (*(npy_uint *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 3 == 3 /* the condition that holds for this three-operand expansion */
+        *(npy_uint *)data_out = ((*(npy_uint *)data0) *
+                                         (*(npy_uint *)data1) *
+                                         (*(npy_uint *)data2) +
+                                         (*(npy_uint *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_uint temp = (*(npy_uint *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_uint *)dataptr[i]);
+        }
+        *(npy_uint *)dataptr[nop] = (temp +
+                                           (*(npy_uint *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        ((npy_uint *)data_out)[0] = ((npy_uint *)data0)[0] +
+                                         ((npy_uint *)data_out)[0];
+        ((npy_uint *)data_out)[1] = ((npy_uint *)data0)[1] +
+                                         ((npy_uint *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_uint re, im, tmp;
+        int i;
+        re = ((npy_uint *)dataptr[0])[0];
+        im = ((npy_uint *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_uint *)dataptr[i])[0] -
+                  im * ((npy_uint *)dataptr[i])[1];
+            im = re * ((npy_uint *)dataptr[i])[1] +
+                 im * ((npy_uint *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_uint *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_uint *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_uint *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_uint *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 3 == 1
+
+static void
+uint_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /*
+     * One-operand kernel over densely indexed npy_uint arrays: in the
+     * `#if !0` branches, data_out[i] = data0[i] + data_out[i], with data0 from
+     * dataptr[0] and data_out from dataptr[1]. Control flow: the switch at the
+     * top handles counts 0..7 and returns; any other count falls past the
+     * switch into the 8x unrolled loop, which then jumps back to the switch to
+     * handle what is left. `nop` is not referenced and `strides` is marked
+     * NPY_UNUSED.
+     * NOTE(review): the surrounding guard is `#if 3 == 1`, which is false as
+     * written, so this definition looks excluded from compilation - confirm.
+     */
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+    npy_uint *data_out = (npy_uint *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) { /* no break between cases: entering at case k+1 processes elements k down to 0 */
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_uint *)data_out + 2*6)[0] =
+                                    ((npy_uint *)data0 + 2*6)[0] +
+                                    ((npy_uint *)data_out + 2*6)[0];
+            ((npy_uint *)data_out + 2*6)[1] =
+                                    ((npy_uint *)data0 + 2*6)[1] +
+                                    ((npy_uint *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_uint *)data_out + 2*5)[0] =
+                                    ((npy_uint *)data0 + 2*5)[0] +
+                                    ((npy_uint *)data_out + 2*5)[0];
+            ((npy_uint *)data_out + 2*5)[1] =
+                                    ((npy_uint *)data0 + 2*5)[1] +
+                                    ((npy_uint *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_uint *)data_out + 2*4)[0] =
+                                    ((npy_uint *)data0 + 2*4)[0] +
+                                    ((npy_uint *)data_out + 2*4)[0];
+            ((npy_uint *)data_out + 2*4)[1] =
+                                    ((npy_uint *)data0 + 2*4)[1] +
+                                    ((npy_uint *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_uint *)data_out + 2*3)[0] =
+                                    ((npy_uint *)data0 + 2*3)[0] +
+                                    ((npy_uint *)data_out + 2*3)[0];
+            ((npy_uint *)data_out + 2*3)[1] =
+                                    ((npy_uint *)data0 + 2*3)[1] +
+                                    ((npy_uint *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_uint *)data_out + 2*2)[0] =
+                                    ((npy_uint *)data0 + 2*2)[0] +
+                                    ((npy_uint *)data_out + 2*2)[0];
+            ((npy_uint *)data_out + 2*2)[1] =
+                                    ((npy_uint *)data0 + 2*2)[1] +
+                                    ((npy_uint *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_uint *)data_out + 2*1)[0] =
+                                    ((npy_uint *)data0 + 2*1)[0] +
+                                    ((npy_uint *)data_out + 2*1)[0];
+            ((npy_uint *)data_out + 2*1)[1] =
+                                    ((npy_uint *)data0 + 2*1)[1] +
+                                    ((npy_uint *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_uint *)data_out + 2*0)[0] =
+                                    ((npy_uint *)data0 + 2*0)[0] +
+                                    ((npy_uint *)data_out + 2*0)[0];
+            ((npy_uint *)data_out + 2*0)[1] =
+                                    ((npy_uint *)data0 + 2*0)[1] +
+                                    ((npy_uint *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*0)[0] =
+                                ((npy_uint *)data0 + 2*0)[0] +
+                                ((npy_uint *)data_out + 2*0)[0];
+        ((npy_uint *)data_out + 2*0)[1] =
+                                ((npy_uint *)data0 + 2*0)[1] +
+                                ((npy_uint *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*1)[0] =
+                                ((npy_uint *)data0 + 2*1)[0] +
+                                ((npy_uint *)data_out + 2*1)[0];
+        ((npy_uint *)data_out + 2*1)[1] =
+                                ((npy_uint *)data0 + 2*1)[1] +
+                                ((npy_uint *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*2)[0] =
+                                ((npy_uint *)data0 + 2*2)[0] +
+                                ((npy_uint *)data_out + 2*2)[0];
+        ((npy_uint *)data_out + 2*2)[1] =
+                                ((npy_uint *)data0 + 2*2)[1] +
+                                ((npy_uint *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*3)[0] =
+                                ((npy_uint *)data0 + 2*3)[0] +
+                                ((npy_uint *)data_out + 2*3)[0];
+        ((npy_uint *)data_out + 2*3)[1] =
+                                ((npy_uint *)data0 + 2*3)[1] +
+                                ((npy_uint *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*4)[0] =
+                                ((npy_uint *)data0 + 2*4)[0] +
+                                ((npy_uint *)data_out + 2*4)[0];
+        ((npy_uint *)data_out + 2*4)[1] =
+                                ((npy_uint *)data0 + 2*4)[1] +
+                                ((npy_uint *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*5)[0] =
+                                ((npy_uint *)data0 + 2*5)[0] +
+                                ((npy_uint *)data_out + 2*5)[0];
+        ((npy_uint *)data_out + 2*5)[1] =
+                                ((npy_uint *)data0 + 2*5)[1] +
+                                ((npy_uint *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*6)[0] =
+                                ((npy_uint *)data0 + 2*6)[0] +
+                                ((npy_uint *)data_out + 2*6)[0];
+        ((npy_uint *)data_out + 2*6)[1] =
+                                ((npy_uint *)data0 + 2*6)[1] +
+                                ((npy_uint *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*7)[0] =
+                                ((npy_uint *)data0 + 2*7)[0] +
+                                ((npy_uint *)data_out + 2*7)[0];
+        ((npy_uint *)data_out + 2*7)[1] =
+                                ((npy_uint *)data0 + 2*7)[1] +
+                                ((npy_uint *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: count < 8 here, so the switch above handles the rest and returns */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 3 == 2 && !0
+
+/*
+ * In-place multiply-accumulate over `count` contiguous npy_uint elements:
+ *
+ *     data_out[i] = scalar * data[i] + data_out[i]
+ *
+ * Parameters:
+ *   data      input elements; only read
+ *   data_out  accumulator; each element is read, then overwritten
+ *   scalar    factor applied to every element of `data`
+ *   count     number of elements to process
+ *
+ * The vector branch is disabled here (`#if 0`), so the scalar branch is the
+ * one that gets compiled: four elements per iteration unless
+ * NPY_DISABLE_OPTIMIZATION is defined, then a one-element-at-a-time loop for
+ * whatever is left.
+ *
+ * NOTE: this definition sits under `#elif 3 == 2 && !0`, which evaluates to
+ * false, so the preprocessor discards it in this expansion.
+ */
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_muladd(npy_uint *data, npy_uint *data_out, npy_uint scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_uint
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u32;
+    const npyv_u32 v_scalar = npyv_setall_u32(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u32 b0 = npyv_loada_u32(data + vstep * 0);
+            npyv_u32 c0 = npyv_loada_u32(data_out + vstep * 0);
+            
+#line 312
+            npyv_u32 b1 = npyv_loada_u32(data + vstep * 1);
+            npyv_u32 c1 = npyv_loada_u32(data_out + vstep * 1);
+            
+#line 312
+            npyv_u32 b2 = npyv_loada_u32(data + vstep * 2);
+            npyv_u32 c2 = npyv_loada_u32(data_out + vstep * 2);
+            
+#line 312
+            npyv_u32 b3 = npyv_loada_u32(data + vstep * 3);
+            npyv_u32 c3 = npyv_loada_u32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u32 abc0 = npyv_muladd_u32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u32 abc1 = npyv_muladd_u32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u32 abc2 = npyv_muladd_u32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u32 abc3 = npyv_muladd_u32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_u32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_u32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_u32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_u32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u32 b0 = npyv_load_u32(data + vstep * 0);
+            npyv_u32 c0 = npyv_load_u32(data_out + vstep * 0);
+            
+#line 312
+            npyv_u32 b1 = npyv_load_u32(data + vstep * 1);
+            npyv_u32 c1 = npyv_load_u32(data_out + vstep * 1);
+            
+#line 312
+            npyv_u32 b2 = npyv_load_u32(data + vstep * 2);
+            npyv_u32 c2 = npyv_load_u32(data_out + vstep * 2);
+            
+#line 312
+            npyv_u32 b3 = npyv_load_u32(data + vstep * 3);
+            npyv_u32 c3 = npyv_load_u32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u32 abc0 = npyv_muladd_u32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u32 abc1 = npyv_muladd_u32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u32 abc2 = npyv_muladd_u32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u32 abc3 = npyv_muladd_u32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_u32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_u32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_u32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_u32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_u32 a = npyv_load_tillz_u32(data, count);
+        npyv_u32 b = npyv_load_tillz_u32(data_out, count);
+        npyv_store_till_u32(data_out, count, npyv_muladd_u32(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { /* four elements per iteration */
+        #line 340
+        const npy_uint b0 = (data[0]);
+        const npy_uint c0 = (data_out[0]);
+        
+#line 340
+        const npy_uint b1 = (data[1]);
+        const npy_uint c1 = (data_out[1]);
+        
+#line 340
+        const npy_uint b2 = (data[2]);
+        const npy_uint c2 = (data_out[2]);
+        
+#line 340
+        const npy_uint b3 = (data[3]);
+        const npy_uint c3 = (data_out[3]);
+        
+        #line 346
+        const npy_uint abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_uint abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_uint abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_uint abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* leftover elements, one at a time */
+        const npy_uint b = (*data);
+        const npy_uint c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_uint
+}
+/*
+ * Two-operand sum of products over contiguous npy_uint arrays:
+ *
+ *     out[i] = in0[i] * in1[i] + out[i]      for i in [0, count)
+ *
+ * Parameters:
+ *   nop      not referenced in the body
+ *   dataptr  dataptr[0] and dataptr[1] are the inputs, dataptr[2] is the
+ *            output accumulator; all three are accessed as npy_uint arrays
+ *   strides  unused (declared NPY_UNUSED)
+ *   count    number of elements to process
+ *
+ * The vector branch is disabled here (`#if 0`); the compiled scalar branch
+ * handles four elements per iteration unless NPY_DISABLE_OPTIMIZATION is
+ * defined, then finishes one element at a time.
+ *
+ * NOTE: this definition sits under `#elif 3 == 2 && !0`, which evaluates to
+ * false, so the preprocessor discards it in this expansion.
+ */
+static void
+uint_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+    npy_uint *data1 = (npy_uint *)dataptr[1];
+    npy_uint *data_out = (npy_uint *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_uint
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u32;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u32 a0 = npyv_loada_u32(data0 + vstep * 0);
+            npyv_u32 b0 = npyv_loada_u32(data1 + vstep * 0);
+            npyv_u32 c0 = npyv_loada_u32(data_out + vstep * 0);
+            
+#line 390
+            npyv_u32 a1 = npyv_loada_u32(data0 + vstep * 1);
+            npyv_u32 b1 = npyv_loada_u32(data1 + vstep * 1);
+            npyv_u32 c1 = npyv_loada_u32(data_out + vstep * 1);
+            
+#line 390
+            npyv_u32 a2 = npyv_loada_u32(data0 + vstep * 2);
+            npyv_u32 b2 = npyv_loada_u32(data1 + vstep * 2);
+            npyv_u32 c2 = npyv_loada_u32(data_out + vstep * 2);
+            
+#line 390
+            npyv_u32 a3 = npyv_loada_u32(data0 + vstep * 3);
+            npyv_u32 b3 = npyv_loada_u32(data1 + vstep * 3);
+            npyv_u32 c3 = npyv_loada_u32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u32 abc0 = npyv_muladd_u32(a0, b0, c0);
+            
+#line 397
+            npyv_u32 abc1 = npyv_muladd_u32(a1, b1, c1);
+            
+#line 397
+            npyv_u32 abc2 = npyv_muladd_u32(a2, b2, c2);
+            
+#line 397
+            npyv_u32 abc3 = npyv_muladd_u32(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_u32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_u32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_u32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_u32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u32 a0 = npyv_load_u32(data0 + vstep * 0);
+            npyv_u32 b0 = npyv_load_u32(data1 + vstep * 0);
+            npyv_u32 c0 = npyv_load_u32(data_out + vstep * 0);
+            
+#line 390
+            npyv_u32 a1 = npyv_load_u32(data0 + vstep * 1);
+            npyv_u32 b1 = npyv_load_u32(data1 + vstep * 1);
+            npyv_u32 c1 = npyv_load_u32(data_out + vstep * 1);
+            
+#line 390
+            npyv_u32 a2 = npyv_load_u32(data0 + vstep * 2);
+            npyv_u32 b2 = npyv_load_u32(data1 + vstep * 2);
+            npyv_u32 c2 = npyv_load_u32(data_out + vstep * 2);
+            
+#line 390
+            npyv_u32 a3 = npyv_load_u32(data0 + vstep * 3);
+            npyv_u32 b3 = npyv_load_u32(data1 + vstep * 3);
+            npyv_u32 c3 = npyv_load_u32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u32 abc0 = npyv_muladd_u32(a0, b0, c0);
+            
+#line 397
+            npyv_u32 abc1 = npyv_muladd_u32(a1, b1, c1);
+            
+#line 397
+            npyv_u32 abc2 = npyv_muladd_u32(a2, b2, c2);
+            
+#line 397
+            npyv_u32 abc3 = npyv_muladd_u32(a3, b3, c3);
+            
+            #line 402
+            npyv_store_u32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_u32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_u32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_u32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_u32 a = npyv_load_tillz_u32(data0, count);
+        npyv_u32 b = npyv_load_tillz_u32(data1, count);
+        npyv_u32 c = npyv_load_tillz_u32(data_out, count);
+        npyv_store_till_u32(data_out, count, npyv_muladd_u32(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { /* four elements per iteration */
+        #line 420
+        const npy_uint a0 = (data0[0]);
+        const npy_uint b0 = (data1[0]);
+        const npy_uint c0 = (data_out[0]);
+        
+#line 420
+        const npy_uint a1 = (data0[1]);
+        const npy_uint b1 = (data1[1]);
+        const npy_uint c1 = (data_out[1]);
+        
+#line 420
+        const npy_uint a2 = (data0[2]);
+        const npy_uint b2 = (data1[2]);
+        const npy_uint c2 = (data_out[2]);
+        
+#line 420
+        const npy_uint a3 = (data0[3]);
+        const npy_uint b3 = (data1[3]);
+        const npy_uint c3 = (data_out[3]);
+        
+        #line 427
+        const npy_uint abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_uint abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_uint abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_uint abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* leftover elements, one at a time */
+        const npy_uint a = (*data0);
+        const npy_uint b = (*data1);
+        const npy_uint c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_uint
+
+}
+
+/*
+ * Extra specializations for the two-operand case start here.
+ *
+ * This one reads a single npy_uint from dataptr[0] and uses it as the
+ * factor for every element of the contiguous array at dataptr[1],
+ * accumulating into the contiguous output at dataptr[2]. The element loop
+ * itself lives in uint_sum_of_products_muladd.
+ */
+static void
+uint_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_uint factor = *(npy_uint *)dataptr[0];
+    npy_uint *const src = (npy_uint *)dataptr[1];
+    npy_uint *const dst = (npy_uint *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                          (int)count);
+
+    /* The factor is read before the call, so later writes to dst cannot change it */
+    uint_sum_of_products_muladd(src, dst, factor, count);
+}
+
+/*
+ * Two-operand specialization: reads a single npy_uint from dataptr[1] and
+ * uses it as the factor for every element of the contiguous array at
+ * dataptr[0], accumulating into the contiguous output at dataptr[2]. The
+ * element loop itself lives in uint_sum_of_products_muladd.
+ */
+static void
+uint_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_uint factor = *(npy_uint *)dataptr[1];
+    npy_uint *const src = (npy_uint *)dataptr[0];
+    npy_uint *const dst = (npy_uint *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                          (int)count);
+
+    /* The factor is read before the call, so later writes to dst cannot change it */
+    uint_sum_of_products_muladd(src, dst, factor, count);
+}
+/*
+ * Two-operand sum of products with contiguous inputs, reduced into a single
+ * output element:
+ *
+ *     *out = *out + sum(in0[i] * in1[i])     for i in [0, count)
+ *
+ * Parameters:
+ *   nop      not referenced in the body
+ *   dataptr  dataptr[0] and dataptr[1] are the inputs (npy_uint arrays);
+ *            dataptr[2] points at the one output element
+ *   strides  unused (declared NPY_UNUSED)
+ *   count    number of input elements
+ *
+ * The products are collected in the local `accum`, and the output element is
+ * read and written exactly once, after the loops. The vector branch is
+ * disabled here (`#if 0`); the compiled scalar branch sums four products per
+ * iteration unless NPY_DISABLE_OPTIMIZATION is defined.
+ *
+ * NOTE: this definition sits under `#elif 3 == 2 && !0`, which evaluates to
+ * false, so the preprocessor discards it in this expansion.
+ */
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+    npy_uint *data1 = (npy_uint *)dataptr[1];
+    npy_uint accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_uint
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_u32;
+    npyv_u32 v_accum = npyv_zero_u32();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u32 a0 = npyv_loada_u32(data0 + vstep * 0);
+            npyv_u32 b0 = npyv_loada_u32(data1 + vstep * 0);
+            
+#line 501
+            npyv_u32 a1 = npyv_loada_u32(data0 + vstep * 1);
+            npyv_u32 b1 = npyv_loada_u32(data1 + vstep * 1);
+            
+#line 501
+            npyv_u32 a2 = npyv_loada_u32(data0 + vstep * 2);
+            npyv_u32 b2 = npyv_loada_u32(data1 + vstep * 2);
+            
+#line 501
+            npyv_u32 a3 = npyv_loada_u32(data0 + vstep * 3);
+            npyv_u32 b3 = npyv_loada_u32(data1 + vstep * 3);
+            
+            npyv_u32 ab3 = npyv_muladd_u32(a3, b3, v_accum);
+            npyv_u32 ab2 = npyv_muladd_u32(a2, b2, ab3);
+            npyv_u32 ab1 = npyv_muladd_u32(a1, b1, ab2);
+                   v_accum = npyv_muladd_u32(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u32 a0 = npyv_load_u32(data0 + vstep * 0);
+            npyv_u32 b0 = npyv_load_u32(data1 + vstep * 0);
+            
+#line 501
+            npyv_u32 a1 = npyv_load_u32(data0 + vstep * 1);
+            npyv_u32 b1 = npyv_load_u32(data1 + vstep * 1);
+            
+#line 501
+            npyv_u32 a2 = npyv_load_u32(data0 + vstep * 2);
+            npyv_u32 b2 = npyv_load_u32(data1 + vstep * 2);
+            
+#line 501
+            npyv_u32 a3 = npyv_load_u32(data0 + vstep * 3);
+            npyv_u32 b3 = npyv_load_u32(data1 + vstep * 3);
+            
+            npyv_u32 ab3 = npyv_muladd_u32(a3, b3, v_accum);
+            npyv_u32 ab2 = npyv_muladd_u32(a2, b2, ab3);
+            npyv_u32 ab1 = npyv_muladd_u32(a1, b1, ab2);
+                   v_accum = npyv_muladd_u32(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_u32 a = npyv_load_tillz_u32(data0, count);
+        npyv_u32 b = npyv_load_tillz_u32(data1, count);
+        v_accum = npyv_muladd_u32(a, b, v_accum);
+    }
+    accum = npyv_sum_u32(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) { /* four products per iteration */
+        #line 524
+        const npy_uint ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_uint ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_uint ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_uint ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* leftover elements, one at a time */
+        const npy_uint a = (*data0);
+        const npy_uint b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_uint
+    *(npy_uint *)dataptr[2] = ((*(npy_uint *)dataptr[2]) + accum); /* single read-modify-write of the output */
+}
+
+/*
+ * Two-operand reduction into one output element: a single npy_uint read
+ * from dataptr[0] multiplies the value that uint_sum_of_arr returns for
+ * the `count` contiguous elements at dataptr[1]; the product is added to
+ * the element at dataptr[2]. `nop` and `strides` are not used.
+ */
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *const src = (npy_uint *)dataptr[1];
+    npy_uint *const out = (npy_uint *)dataptr[2];
+    const npy_uint factor = *(npy_uint *)dataptr[0];
+    const npy_uint total = uint_sum_of_arr(src, count);
+
+    *out += factor * total;
+}
+
+/*
+ * Two-operand reduction into one output element: a single npy_uint read
+ * from dataptr[1] multiplies the value that uint_sum_of_arr returns for
+ * the `count` contiguous elements at dataptr[0]; the product is added to
+ * the element at dataptr[2]. `nop` and `strides` are not used.
+ */
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *const src = (npy_uint *)dataptr[0];
+    npy_uint *const out = (npy_uint *)dataptr[2];
+    const npy_uint factor = *(npy_uint *)dataptr[1];
+    const npy_uint total = uint_sum_of_arr(src, count);
+
+    *out += factor * total;
+}
+
+#elif 3 == 3 && !0
+
+/*
+ * Three-operand sum of products over contiguous npy_uint arrays:
+ *
+ *     out[i] = in0[i] * in1[i] * in2[i] + out[i]      for i in [0, count)
+ *
+ * dataptr[0..2] are the inputs and dataptr[3] is the output accumulator.
+ * `nop` and `strides` are not used. Elements are handled in groups of
+ * eight, followed by a remainder pass of at most eight elements.
+ */
+static void
+uint_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *in0 = (npy_uint *)dataptr[0];
+    npy_uint *in1 = (npy_uint *)dataptr[1];
+    npy_uint *in2 = (npy_uint *)dataptr[2];
+    npy_uint *out = (npy_uint *)dataptr[3];
+    int k;
+
+    /* Full groups of eight elements, processed in ascending index order */
+    for (; count >= 8; count -= 8, in0 += 8, in1 += 8, in2 += 8, out += 8) {
+        for (k = 0; k < 8; ++k) {
+            out[k] = in0[k] * in1[k] * in2[k] + out[k];
+        }
+    }
+
+    /* Remainder: stop as soon as the count is used up, or after eight elements */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] = in0[k] * in1[k] * in2[k] + out[k];
+    }
+}
+
+#else /* 3 > 3 || @complex */
+/*
+ * Generic variant of uint_sum_of_products_contig_three. For each of `count`
+ * steps it multiplies the current npy_uint of operands 0..nop-1, adds that
+ * product to the element at dataptr[nop], and then advances all nop+1
+ * pointers by sizeof(npy_uint).
+ *
+ * The entries of `dataptr` themselves are advanced, so the caller's pointer
+ * array is modified. `strides` is unused (declared NPY_UNUSED).
+ *
+ * `#if !0` is true, so the "complex" half of the body is never compiled.
+ *
+ * NOTE: this is the `#else` arm of a chain that also contains
+ * `#elif 3 == 3 && !0`, whose condition is true, so this arm is never
+ * selected in this expansion.
+ */
+static void
+uint_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_three (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+        npy_uint temp = (*(npy_uint *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_uint *)dataptr[i]);
+        }
+        *(npy_uint *)dataptr[nop] = (temp +
+                                           (*(npy_uint *)dataptr[i])); /* i == nop after the loop (for nop >= 1) */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_uint);
+        }
+#else /* complex */
+#  if 3 <= 3
+#    define _SUMPROD_NOP 3
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_uint re, im, tmp;
+        int i;
+        re = ((npy_uint *)dataptr[0])[0];
+        im = ((npy_uint *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_uint *)dataptr[i])[0] -
+                  im * ((npy_uint *)dataptr[i])[1];
+            im = re * ((npy_uint *)dataptr[i])[1] +
+                 im * ((npy_uint *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_uint *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_uint *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_uint *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_uint *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_uint);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 3 */
+
+#if 3 == 1
+/*
+ * One-operand reduction of a contiguous npy_uint array into a single output
+ * element:
+ *
+ *     *dataptr[1] = uint_sum_of_arr(data, count) + *dataptr[1]
+ *
+ * uint_sum_of_arr is defined elsewhere in the file; presumably it returns
+ * the sum of the `count` elements (confirm against its definition).
+ * `nop` and `strides` are not referenced.
+ *
+ * `#if !0` is true, so the `#else` half (separate real/imaginary
+ * accumulators over interleaved pairs) is never compiled.
+ *
+ * NOTE: this definition is guarded by `#if 3 == 1`, which is false, so the
+ * preprocessor discards it in this expansion.
+ */
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_uint *data = (npy_uint *)dataptr[0];
+    npy_uint accum = uint_sum_of_arr(data, count);
+    *((npy_uint *)dataptr[1]) = (accum + (*((npy_uint *)dataptr[1])));
+#else
+    npy_uint accum_re = 0, accum_im = 0;
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_uint re01 = data0[0] + data0[2];
+        const npy_uint re23 = data0[4] + data0[6];
+        const npy_uint im13 = data0[1] + data0[3];
+        const npy_uint im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_uint *)dataptr[1])[0] += accum_re;
+    ((npy_uint *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 3 == 1 */
+/*
+ * Three-operand sum of products reduced into a single output element, with
+ * a separate stride per input:
+ *
+ *     *out = *out + sum(in0 * in1 * in2)      over `count` steps
+ *
+ * Parameters:
+ *   nop      not referenced by the branches compiled in this expansion
+ *   dataptr  dataptr[0..2] are the inputs; dataptr[3] points at the output
+ *            element
+ *   strides  strides[0..2] are added to the char* input cursors after every
+ *            step, i.e. they are byte offsets
+ *   count    number of steps
+ *
+ * The function walks local copies of the input pointers, so the entries of
+ * `dataptr` are left untouched by the compiled branches. Products are
+ * collected in the local `accum` and the output is updated once at the end.
+ *
+ * Preprocessor selection: `#if 0` picks the single `accum`, `#if !0` picks
+ * the non-complex loop body, and within it `3 == 3` picks the three-operand
+ * form; every other arm is compiled out.
+ */
+static void
+uint_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_uint accum_re = 0, accum_im = 0;
+#else
+    npy_uint accum = 0;
+#endif
+
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 3 == 1
+        accum += (*(npy_uint *)data0);
+        data0 += stride0;
+#  elif 3 == 2
+        accum += (*(npy_uint *)data0) *
+                 (*(npy_uint *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 3 == 3
+        accum += (*(npy_uint *)data0) *
+                 (*(npy_uint *)data1) *
+                 (*(npy_uint *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_uint temp = (*(npy_uint *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_uint *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        accum_re += ((npy_uint *)data0)[0];
+        accum_im += ((npy_uint *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_uint re, im, tmp;
+        int i;
+        re = ((npy_uint *)dataptr[0])[0];
+        im = ((npy_uint *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_uint *)dataptr[i])[0] -
+                  im * ((npy_uint *)dataptr[i])[1];
+            im = re * ((npy_uint *)dataptr[i])[1] +
+                 im * ((npy_uint *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0
+#  if 3 <= 3
+    ((npy_uint *)dataptr[3])[0] += accum_re;
+    ((npy_uint *)dataptr[3])[1] += accum_im;
+#  else
+    ((npy_uint *)dataptr[nop])[0] += accum_re;
+    ((npy_uint *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 3 <= 3
+    *((npy_uint *)dataptr[3]) = (accum +
+                                    (*((npy_uint *)dataptr[3])));
+#  else
+    *((npy_uint *)dataptr[nop]) = (accum +
+                                    (*((npy_uint *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+/*
+ * Sum-of-products kernel for an arbitrary number of operands. On each of
+ * `count` steps:
+ *
+ *     temp          = product of *dataptr[0] ... *dataptr[nop-1]
+ *     *dataptr[nop] = temp + *dataptr[nop]
+ *     dataptr[i]   += strides[i]        for i in [0, nop]
+ *
+ * Parameters:
+ *   nop      number of input operands; the output is entry `nop`
+ *   dataptr  nop+1 char* cursors, each dereferenced as npy_uint; the entries
+ *            are advanced in place, so the caller's array is modified
+ *   strides  nop+1 per-step increments added to the char* cursors, i.e.
+ *            byte offsets
+ *   count    number of steps
+ *
+ * Preprocessor selection: this expansion uses the placeholder operand count
+ * 1000, so every `1000 == k` and `1000 <= 3` test is false. No local
+ * cursor/stride variables are declared, and inside `#if !0` only the final
+ * generic `#else` arm is compiled.
+ */
+#line 131
+static void
+uint_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data_out = dataptr[1000];
+    npy_intp stride_out = strides[1000];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_any (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        *(npy_uint *)data_out = ((*(npy_uint *)data0) +
+                                         (*(npy_uint *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1000 == 2
+        *(npy_uint *)data_out = ((*(npy_uint *)data0) *
+                                         (*(npy_uint *)data1) +
+                                         (*(npy_uint *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1000 == 3
+        *(npy_uint *)data_out = ((*(npy_uint *)data0) *
+                                         (*(npy_uint *)data1) *
+                                         (*(npy_uint *)data2) +
+                                         (*(npy_uint *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_uint temp = (*(npy_uint *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_uint *)dataptr[i]);
+        }
+        *(npy_uint *)dataptr[nop] = (temp +
+                                           (*(npy_uint *)dataptr[i])); /* i == nop after the loop (for nop >= 1) */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        ((npy_uint *)data_out)[0] = ((npy_uint *)data0)[0] +
+                                         ((npy_uint *)data_out)[0];
+        ((npy_uint *)data_out)[1] = ((npy_uint *)data0)[1] +
+                                         ((npy_uint *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_uint re, im, tmp;
+        int i;
+        re = ((npy_uint *)dataptr[0])[0];
+        im = ((npy_uint *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_uint *)dataptr[i])[0] -
+                  im * ((npy_uint *)dataptr[i])[1];
+            im = re * ((npy_uint *)dataptr[i])[1] +
+                 im * ((npy_uint *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_uint *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_uint *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_uint *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_uint *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1000 == 1
+
+static void
+uint_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+    npy_uint *data_out = (npy_uint *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_uint *)data_out + 2*6)[0] =
+                                    ((npy_uint *)data0 + 2*6)[0] +
+                                    ((npy_uint *)data_out + 2*6)[0];
+            ((npy_uint *)data_out + 2*6)[1] =
+                                    ((npy_uint *)data0 + 2*6)[1] +
+                                    ((npy_uint *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_uint *)data_out + 2*5)[0] =
+                                    ((npy_uint *)data0 + 2*5)[0] +
+                                    ((npy_uint *)data_out + 2*5)[0];
+            ((npy_uint *)data_out + 2*5)[1] =
+                                    ((npy_uint *)data0 + 2*5)[1] +
+                                    ((npy_uint *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_uint *)data_out + 2*4)[0] =
+                                    ((npy_uint *)data0 + 2*4)[0] +
+                                    ((npy_uint *)data_out + 2*4)[0];
+            ((npy_uint *)data_out + 2*4)[1] =
+                                    ((npy_uint *)data0 + 2*4)[1] +
+                                    ((npy_uint *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_uint *)data_out + 2*3)[0] =
+                                    ((npy_uint *)data0 + 2*3)[0] +
+                                    ((npy_uint *)data_out + 2*3)[0];
+            ((npy_uint *)data_out + 2*3)[1] =
+                                    ((npy_uint *)data0 + 2*3)[1] +
+                                    ((npy_uint *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_uint *)data_out + 2*2)[0] =
+                                    ((npy_uint *)data0 + 2*2)[0] +
+                                    ((npy_uint *)data_out + 2*2)[0];
+            ((npy_uint *)data_out + 2*2)[1] =
+                                    ((npy_uint *)data0 + 2*2)[1] +
+                                    ((npy_uint *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_uint *)data_out + 2*1)[0] =
+                                    ((npy_uint *)data0 + 2*1)[0] +
+                                    ((npy_uint *)data_out + 2*1)[0];
+            ((npy_uint *)data_out + 2*1)[1] =
+                                    ((npy_uint *)data0 + 2*1)[1] +
+                                    ((npy_uint *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_uint *)data_out + 2*0)[0] =
+                                    ((npy_uint *)data0 + 2*0)[0] +
+                                    ((npy_uint *)data_out + 2*0)[0];
+            ((npy_uint *)data_out + 2*0)[1] =
+                                    ((npy_uint *)data0 + 2*0)[1] +
+                                    ((npy_uint *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*0)[0] =
+                                ((npy_uint *)data0 + 2*0)[0] +
+                                ((npy_uint *)data_out + 2*0)[0];
+        ((npy_uint *)data_out + 2*0)[1] =
+                                ((npy_uint *)data0 + 2*0)[1] +
+                                ((npy_uint *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*1)[0] =
+                                ((npy_uint *)data0 + 2*1)[0] +
+                                ((npy_uint *)data_out + 2*1)[0];
+        ((npy_uint *)data_out + 2*1)[1] =
+                                ((npy_uint *)data0 + 2*1)[1] +
+                                ((npy_uint *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*2)[0] =
+                                ((npy_uint *)data0 + 2*2)[0] +
+                                ((npy_uint *)data_out + 2*2)[0];
+        ((npy_uint *)data_out + 2*2)[1] =
+                                ((npy_uint *)data0 + 2*2)[1] +
+                                ((npy_uint *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*3)[0] =
+                                ((npy_uint *)data0 + 2*3)[0] +
+                                ((npy_uint *)data_out + 2*3)[0];
+        ((npy_uint *)data_out + 2*3)[1] =
+                                ((npy_uint *)data0 + 2*3)[1] +
+                                ((npy_uint *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*4)[0] =
+                                ((npy_uint *)data0 + 2*4)[0] +
+                                ((npy_uint *)data_out + 2*4)[0];
+        ((npy_uint *)data_out + 2*4)[1] =
+                                ((npy_uint *)data0 + 2*4)[1] +
+                                ((npy_uint *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*5)[0] =
+                                ((npy_uint *)data0 + 2*5)[0] +
+                                ((npy_uint *)data_out + 2*5)[0];
+        ((npy_uint *)data_out + 2*5)[1] =
+                                ((npy_uint *)data0 + 2*5)[1] +
+                                ((npy_uint *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*6)[0] =
+                                ((npy_uint *)data0 + 2*6)[0] +
+                                ((npy_uint *)data_out + 2*6)[0];
+        ((npy_uint *)data_out + 2*6)[1] =
+                                ((npy_uint *)data0 + 2*6)[1] +
+                                ((npy_uint *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_uint *)data_out + 2*7)[0] =
+                                ((npy_uint *)data0 + 2*7)[0] +
+                                ((npy_uint *)data_out + 2*7)[0];
+        ((npy_uint *)data_out + 2*7)[1] =
+                                ((npy_uint *)data0 + 2*7)[1] +
+                                ((npy_uint *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1000 == 2 && !0
+
+/* Multiply-add helper: for each of count elements, data_out[i] = scalar * data[i] + data_out[i]. */
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_muladd(npy_uint *data, npy_uint *data_out, npy_uint scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_uint
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u32;
+    const npyv_u32 v_scalar = npyv_setall_u32(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u32 b0 = npyv_loada_u32(data + vstep * 0);
+            npyv_u32 c0 = npyv_loada_u32(data_out + vstep * 0);
+            
+#line 312
+            npyv_u32 b1 = npyv_loada_u32(data + vstep * 1);
+            npyv_u32 c1 = npyv_loada_u32(data_out + vstep * 1);
+            
+#line 312
+            npyv_u32 b2 = npyv_loada_u32(data + vstep * 2);
+            npyv_u32 c2 = npyv_loada_u32(data_out + vstep * 2);
+            
+#line 312
+            npyv_u32 b3 = npyv_loada_u32(data + vstep * 3);
+            npyv_u32 c3 = npyv_loada_u32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u32 abc0 = npyv_muladd_u32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u32 abc1 = npyv_muladd_u32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u32 abc2 = npyv_muladd_u32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u32 abc3 = npyv_muladd_u32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_u32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_u32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_u32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_u32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u32 b0 = npyv_load_u32(data + vstep * 0);
+            npyv_u32 c0 = npyv_load_u32(data_out + vstep * 0);
+            
+#line 312
+            npyv_u32 b1 = npyv_load_u32(data + vstep * 1);
+            npyv_u32 c1 = npyv_load_u32(data_out + vstep * 1);
+            
+#line 312
+            npyv_u32 b2 = npyv_load_u32(data + vstep * 2);
+            npyv_u32 c2 = npyv_load_u32(data_out + vstep * 2);
+            
+#line 312
+            npyv_u32 b3 = npyv_load_u32(data + vstep * 3);
+            npyv_u32 c3 = npyv_load_u32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u32 abc0 = npyv_muladd_u32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u32 abc1 = npyv_muladd_u32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u32 abc2 = npyv_muladd_u32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u32 abc3 = npyv_muladd_u32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_u32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_u32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_u32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_u32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Leftover elements: one vector step per pass via the npyv_*_till* calls, which are passed the remaining count. */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_u32 a = npyv_load_tillz_u32(data, count);
+        npyv_u32 b = npyv_load_tillz_u32(data_out, count);
+        npyv_store_till_u32(data_out, count, npyv_muladd_u32(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* scalar path: the only one compiled here, since the guard above is a literal 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_uint b0 = (data[0]);
+        const npy_uint c0 = (data_out[0]);
+        
+#line 340
+        const npy_uint b1 = (data[1]);
+        const npy_uint c1 = (data_out[1]);
+        
+#line 340
+        const npy_uint b2 = (data[2]);
+        const npy_uint c2 = (data_out[2]);
+        
+#line 340
+        const npy_uint b3 = (data[3]);
+        const npy_uint c3 = (data_out[3]);
+        /* All four results are computed before any of them is stored. */
+        #line 346
+        const npy_uint abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_uint abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_uint abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_uint abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_uint b = (*data);
+        const npy_uint c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_uint
+}
+
+static void
+uint_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+    npy_uint *data1 = (npy_uint *)dataptr[1];
+    npy_uint *data_out = (npy_uint *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    /* data_out[i] += data0[i] * data1[i] for count contiguous items. The 0 below is the NPYV check for npy_uint. */
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u32;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u32 a0 = npyv_loada_u32(data0 + vstep * 0);
+            npyv_u32 b0 = npyv_loada_u32(data1 + vstep * 0);
+            npyv_u32 c0 = npyv_loada_u32(data_out + vstep * 0);
+            
+#line 390
+            npyv_u32 a1 = npyv_loada_u32(data0 + vstep * 1);
+            npyv_u32 b1 = npyv_loada_u32(data1 + vstep * 1);
+            npyv_u32 c1 = npyv_loada_u32(data_out + vstep * 1);
+            
+#line 390
+            npyv_u32 a2 = npyv_loada_u32(data0 + vstep * 2);
+            npyv_u32 b2 = npyv_loada_u32(data1 + vstep * 2);
+            npyv_u32 c2 = npyv_loada_u32(data_out + vstep * 2);
+            
+#line 390
+            npyv_u32 a3 = npyv_loada_u32(data0 + vstep * 3);
+            npyv_u32 b3 = npyv_loada_u32(data1 + vstep * 3);
+            npyv_u32 c3 = npyv_loada_u32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u32 abc0 = npyv_muladd_u32(a0, b0, c0);
+            
+#line 397
+            npyv_u32 abc1 = npyv_muladd_u32(a1, b1, c1);
+            
+#line 397
+            npyv_u32 abc2 = npyv_muladd_u32(a2, b2, c2);
+            
+#line 397
+            npyv_u32 abc3 = npyv_muladd_u32(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_u32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_u32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_u32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_u32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u32 a0 = npyv_load_u32(data0 + vstep * 0);
+            npyv_u32 b0 = npyv_load_u32(data1 + vstep * 0);
+            npyv_u32 c0 = npyv_load_u32(data_out + vstep * 0);
+            
+#line 390
+            npyv_u32 a1 = npyv_load_u32(data0 + vstep * 1);
+            npyv_u32 b1 = npyv_load_u32(data1 + vstep * 1);
+            npyv_u32 c1 = npyv_load_u32(data_out + vstep * 1);
+            
+#line 390
+            npyv_u32 a2 = npyv_load_u32(data0 + vstep * 2);
+            npyv_u32 b2 = npyv_load_u32(data1 + vstep * 2);
+            npyv_u32 c2 = npyv_load_u32(data_out + vstep * 2);
+            
+#line 390
+            npyv_u32 a3 = npyv_load_u32(data0 + vstep * 3);
+            npyv_u32 b3 = npyv_load_u32(data1 + vstep * 3);
+            npyv_u32 c3 = npyv_load_u32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u32 abc0 = npyv_muladd_u32(a0, b0, c0);
+            
+#line 397
+            npyv_u32 abc1 = npyv_muladd_u32(a1, b1, c1);
+            
+#line 397
+            npyv_u32 abc2 = npyv_muladd_u32(a2, b2, c2);
+            
+#line 397
+            npyv_u32 abc3 = npyv_muladd_u32(a3, b3, c3);
+            
+            #line 402
+            npyv_store_u32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_u32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_u32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_u32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Leftover elements: one vector step per pass via the npyv_*_till* calls, which are passed the remaining count. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_u32 a = npyv_load_tillz_u32(data0, count);
+        npyv_u32 b = npyv_load_tillz_u32(data1, count);
+        npyv_u32 c = npyv_load_tillz_u32(data_out, count);
+        npyv_store_till_u32(data_out, count, npyv_muladd_u32(a, b, c));
+    }
+    npyv_cleanup();
+#else /* scalar path: the only one compiled here, since the guard above is a literal 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_uint a0 = (data0[0]);
+        const npy_uint b0 = (data1[0]);
+        const npy_uint c0 = (data_out[0]);
+        
+#line 420
+        const npy_uint a1 = (data0[1]);
+        const npy_uint b1 = (data1[1]);
+        const npy_uint c1 = (data_out[1]);
+        
+#line 420
+        const npy_uint a2 = (data0[2]);
+        const npy_uint b2 = (data1[2]);
+        const npy_uint c2 = (data_out[2]);
+        
+#line 420
+        const npy_uint a3 = (data0[3]);
+        const npy_uint b3 = (data1[3]);
+        const npy_uint c3 = (data_out[3]);
+        /* All four results are computed before any of them is stored. */
+        #line 427
+        const npy_uint abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_uint abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_uint abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_uint abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_uint a = (*data0);
+        const npy_uint b = (*data1);
+        const npy_uint c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_uint
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+uint_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* dst[i] += factor * src[i]; the factor is read once from operand 0. */
+    npy_uint *const dst = (npy_uint *)dataptr[2];
+    npy_uint *const src = (npy_uint *)dataptr[1];
+    const npy_uint factor = *(npy_uint *)dataptr[0];
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    /* Delegate the whole pass to the shared multiply-add helper. */
+    uint_sum_of_products_muladd(src, dst, factor, count);
+}
+
+static void
+uint_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* dst[i] += src[i] * factor; the factor is read once from operand 1. */
+    npy_uint *const src = (npy_uint *)dataptr[0];
+    npy_uint *const dst = (npy_uint *)dataptr[2];
+    const npy_uint factor = *(npy_uint *)dataptr[1];
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    uint_sum_of_products_muladd(src, dst, factor, count);
+}
+
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+    npy_uint *data1 = (npy_uint *)dataptr[1];
+    npy_uint accum = 0;
+    /* Sums data0[i] * data1[i] over count elements, then adds the total to *dataptr[2] once. */
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_uint
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_u32;
+    npyv_u32 v_accum = npyv_zero_u32();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u32 a0 = npyv_loada_u32(data0 + vstep * 0);
+            npyv_u32 b0 = npyv_loada_u32(data1 + vstep * 0);
+            
+#line 501
+            npyv_u32 a1 = npyv_loada_u32(data0 + vstep * 1);
+            npyv_u32 b1 = npyv_loada_u32(data1 + vstep * 1);
+            
+#line 501
+            npyv_u32 a2 = npyv_loada_u32(data0 + vstep * 2);
+            npyv_u32 b2 = npyv_loada_u32(data1 + vstep * 2);
+            
+#line 501
+            npyv_u32 a3 = npyv_loada_u32(data0 + vstep * 3);
+            npyv_u32 b3 = npyv_loada_u32(data1 + vstep * 3);
+            
+            npyv_u32 ab3 = npyv_muladd_u32(a3, b3, v_accum);
+            npyv_u32 ab2 = npyv_muladd_u32(a2, b2, ab3);
+            npyv_u32 ab1 = npyv_muladd_u32(a1, b1, ab2);
+                   v_accum = npyv_muladd_u32(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u32 a0 = npyv_load_u32(data0 + vstep * 0);
+            npyv_u32 b0 = npyv_load_u32(data1 + vstep * 0);
+            
+#line 501
+            npyv_u32 a1 = npyv_load_u32(data0 + vstep * 1);
+            npyv_u32 b1 = npyv_load_u32(data1 + vstep * 1);
+            
+#line 501
+            npyv_u32 a2 = npyv_load_u32(data0 + vstep * 2);
+            npyv_u32 b2 = npyv_load_u32(data1 + vstep * 2);
+            
+#line 501
+            npyv_u32 a3 = npyv_load_u32(data0 + vstep * 3);
+            npyv_u32 b3 = npyv_load_u32(data1 + vstep * 3);
+            
+            npyv_u32 ab3 = npyv_muladd_u32(a3, b3, v_accum);
+            npyv_u32 ab2 = npyv_muladd_u32(a2, b2, ab3);
+            npyv_u32 ab1 = npyv_muladd_u32(a1, b1, ab2);
+                   v_accum = npyv_muladd_u32(a0, b0, ab1);
+        }
+    }
+    /* Leftover elements: one vector step per pass via npyv_load_tillz_u32, which is passed the remaining count. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_u32 a = npyv_load_tillz_u32(data0, count);
+        npyv_u32 b = npyv_load_tillz_u32(data1, count);
+        v_accum = npyv_muladd_u32(a, b, v_accum);
+    }
+    accum = npyv_sum_u32(v_accum);
+    npyv_cleanup();
+#else /* scalar path: the only one compiled here, since the guard above is a literal 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_uint ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_uint ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_uint ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_uint ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_uint a = (*data0);
+        const npy_uint b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_uint
+    *(npy_uint *)dataptr[2] = ((*(npy_uint *)dataptr[2]) + accum);
+}
+
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Scale the helper's result for operand 1 by the single operand-0 value, then add it to the output item. */
+    npy_uint *const result = (npy_uint *)dataptr[2];
+    const npy_uint scale = *(npy_uint *)dataptr[0];
+    *result += scale * uint_sum_of_arr((npy_uint *)dataptr[1], count);
+}
+
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Scale the helper's result for operand 0 by the single operand-1 value, then add it to the output item. */
+    npy_uint *const result = (npy_uint *)dataptr[2];
+    const npy_uint scale = *(npy_uint *)dataptr[1];
+    *result += scale * uint_sum_of_arr((npy_uint *)dataptr[0], count);
+}
+
+#elif 1000 == 3 && !0
+
+static void
+uint_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+    npy_uint *data1 = (npy_uint *)dataptr[1];
+    npy_uint *data2 = (npy_uint *)dataptr[2];
+    npy_uint *data_out = (npy_uint *)dataptr[3];
+    /* data_out[i] += data0[i] * data1[i] * data2[i] over count contiguous elements. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: one element per step, returning once count reaches zero */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1000 > 3 || @complex */
+
+static void
+uint_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+    /* Per item: multiply the nop inputs together and add the product to the output at dataptr[nop]. */
+    while (count--) {
+#if !0
+        npy_uint temp = (*(npy_uint *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_uint *)dataptr[i]);
+        }
+        *(npy_uint *)dataptr[nop] = (temp +
+                                           (*(npy_uint *)dataptr[i]));  /* i == nop after the loop above */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_uint);
+        }
+#else /* complex */
+#  if 1000 <= 3
+#    define _SUMPROD_NOP 1000
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_uint re, im, tmp;
+        int i;
+        re = ((npy_uint *)dataptr[0])[0];
+        im = ((npy_uint *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_uint *)dataptr[i])[0] -
+                  im * ((npy_uint *)dataptr[i])[1];
+            im = re * ((npy_uint *)dataptr[i])[1] +
+                 im * ((npy_uint *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_uint *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_uint *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_uint *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_uint *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_uint);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+
+static NPY_GCC_OPT_3 void
+uint_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0 /* non-complex path: add uint_sum_of_arr(data, count) to the output item at dataptr[1] */
+    npy_uint *data = (npy_uint *)dataptr[0];
+    npy_uint accum = uint_sum_of_arr(data, count);
+    *((npy_uint *)dataptr[1]) = (accum + (*((npy_uint *)dataptr[1])));
+#else /* complex: separate accumulators for the [0] and [1] halves of each item */
+    npy_uint accum_re = 0, accum_im = 0;
+    npy_uint *data0 = (npy_uint *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_uint re01 = data0[0] + data0[2];
+        const npy_uint re23 = data0[4] + data0[6];
+        const npy_uint im13 = data0[1] + data0[3];
+        const npy_uint im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_uint *)dataptr[1])[0] += accum_re;
+    ((npy_uint *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1000 == 1 */
+
+static void
+uint_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_uint accum_re = 0, accum_im = 0;
+#else
+    npy_uint accum = 0;
+#endif
+    /* The 1-3 operand variants cache pointers/strides in locals; none of these guards hold for 1000. */
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("uint_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+    /* One product per item goes into the accumulator; the output is updated once, after the loop. */
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        accum += (*(npy_uint *)data0);
+        data0 += stride0;
+#  elif 1000 == 2
+        accum += (*(npy_uint *)data0) *
+                 (*(npy_uint *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1000 == 3
+        accum += (*(npy_uint *)data0) *
+                 (*(npy_uint *)data1) *
+                 (*(npy_uint *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_uint temp = (*(npy_uint *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_uint *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        accum_re += ((npy_uint *)data0)[0];
+        accum_im += ((npy_uint *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_uint re, im, tmp;
+        int i;
+        re = ((npy_uint *)dataptr[0])[0];
+        im = ((npy_uint *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_uint *)dataptr[i])[0] -
+                  im * ((npy_uint *)dataptr[i])[1];
+            im = re * ((npy_uint *)dataptr[i])[1] +
+                 im * ((npy_uint *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0
+#  if 1000 <= 3
+    ((npy_uint *)dataptr[1000])[0] += accum_re;
+    ((npy_uint *)dataptr[1000])[1] += accum_im;
+#  else
+    ((npy_uint *)dataptr[nop])[0] += accum_re;
+    ((npy_uint *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1000 <= 3
+    *((npy_uint *)dataptr[1000]) = (accum +
+                                    (*((npy_uint *)dataptr[1000])));
+#  else
+    *((npy_uint *)dataptr[nop]) = (accum +
+                                    (*((npy_uint *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+
+
+#line 74
+
+#if !0
+/*
+ * Returns the sum of `count` consecutive npy_ulong values starting at `data`.
+ *
+ * The total is accumulated in an npy_ulong. A non-positive `count` yields 0
+ * because neither loop body runs.
+ *
+ * The generator's NPYV (SIMD) variant is switched off for npy_ulong
+ * (`#if 0`), so only the scalar code is spelled out here.
+ */
+static NPY_GCC_OPT_3 npy_ulong ulong_sum_of_arr(npy_ulong *data, npy_intp count)
+{
+    npy_ulong total = 0;
+    npy_ulong *cursor = data;
+    npy_intp remaining = count;
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Four elements per pass, added pairwise. The strict `>` leaves a final
+     * group of exactly four elements to the scalar loop below.
+     */
+    while (remaining > 4) {
+        const npy_ulong pair_lo = cursor[0] + cursor[1];
+        const npy_ulong pair_hi = cursor[2] + cursor[3];
+
+        total += pair_lo + pair_hi;
+        cursor += 4;
+        remaining -= 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One element at a time for whatever is left. */
+    while (remaining > 0) {
+        total += *cursor;
+        ++cursor;
+        --remaining;
+    }
+    return total;
+}
+#endif
+
+#line 131
+/*
+ * Single-operand einsum inner loop for npy_ulong: adds every input element
+ * into the matching output element, out = in + out.
+ *
+ *   dataptr[0], strides[0] - input pointer and its step in bytes
+ *   dataptr[1], strides[1] - output pointer and its step in bytes
+ *   count                  - number of elements to process
+ *
+ * `nop` is not read. Only local copies of the pointers are advanced; the
+ * `dataptr` array itself is left untouched.
+ *
+ * The generator's multi-operand and complex variants are compiled out for
+ * this instantiation (the conditions are the literals `1 == 1` and `!0`),
+ * so only the single-operand real case is spelled out here.
+ */
+static void
+ulong_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    char *src = dataptr[0];
+    char *dst = dataptr[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_one (%d)\n", (int)count);
+
+    for (; count != 0; --count) {
+        npy_ulong *out = (npy_ulong *)dst;
+
+        *out = *(npy_ulong *)src + *out;
+        src += src_step;
+        dst += dst_step;
+    }
+}
+
+#if 1 == 1
+
+/*
+ * Contiguous single-operand einsum inner loop for npy_ulong:
+ *     out[i] = in[i] + out[i]   for i in [0, count)
+ *
+ *   dataptr[0] - input, read as a packed npy_ulong array
+ *   dataptr[1] - output, read and written as a packed npy_ulong array
+ *   count      - number of elements; a value <= 0 does nothing
+ *
+ * `nop` and `strides` are not read.
+ *
+ * Full groups of eight are processed front to back, then the remaining
+ * 0..7 elements from the highest index down to index 0. This is the same
+ * element order as the generated switch/goto form it replaces. That form
+ * had no switch case for a negative `count`, skipped its unrolled loop and
+ * jumped straight back to the switch, so a negative `count` never returned.
+ * Here such a value skips the loop, lands on `default` and returns.
+ *
+ * The generator's complex-number variant is compiled out (`#if !0`), so only
+ * the real-valued statements are spelled out.
+ */
+static void
+ulong_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *data0 = (npy_ulong *)dataptr[0];
+    npy_ulong *data_out = (npy_ulong *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+    /* Unroll the loop by 8 */
+    for (; count >= 8; count -= 8, data0 += 8, data_out += 8) {
+        data_out[0] = data0[0] + data_out[0];
+        data_out[1] = data0[1] + data_out[1];
+        data_out[2] = data0[2] + data_out[2];
+        data_out[3] = data0[3] + data_out[3];
+        data_out[4] = data0[4] + data_out[4];
+        data_out[5] = data0[5] + data_out[5];
+        data_out[6] = data0[6] + data_out[6];
+        data_out[7] = data0[7] + data_out[7];
+    }
+
+    /*
+     * Finish off the remaining elements. A switch keeps the tail a single
+     * dispatch, so small counts stay cheap; each case deliberately falls
+     * into the next one.
+     */
+    switch (count) {
+        case 7:
+            data_out[6] = data0[6] + data_out[6];
+            /* fall through */
+        case 6:
+            data_out[5] = data0[5] + data_out[5];
+            /* fall through */
+        case 5:
+            data_out[4] = data0[4] + data_out[4];
+            /* fall through */
+        case 4:
+            data_out[3] = data0[3] + data_out[3];
+            /* fall through */
+        case 3:
+            data_out[2] = data0[2] + data_out[2];
+            /* fall through */
+        case 2:
+            data_out[1] = data0[1] + data_out[1];
+            /* fall through */
+        case 1:
+            data_out[0] = data0[0] + data_out[0];
+            /* fall through */
+        default:
+            /* nothing left, or `count` was not positive to begin with */
+            break;
+    }
+}
+
+#elif 1 == 2 && !0
+
+/*
+ * Scaled accumulate over `count` contiguous npy_ulong elements:
+ *     data_out[i] = scalar * data[i] + data_out[i]
+ *
+ * A non-positive `count` leaves `data_out` untouched.
+ *
+ * The generator's NPYV (SIMD) variant is switched off for npy_ulong
+ * (`#if 0`), so only the scalar code is spelled out here.
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_muladd(npy_ulong *data, npy_ulong *data_out, npy_ulong scalar, npy_intp count)
+{
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Groups of four: all eight loads happen before the first store. */
+    while (count >= 4) {
+        const npy_ulong in0 = data[0], acc0 = data_out[0];
+        const npy_ulong in1 = data[1], acc1 = data_out[1];
+        const npy_ulong in2 = data[2], acc2 = data_out[2];
+        const npy_ulong in3 = data[3], acc3 = data_out[3];
+
+        data_out[0] = scalar * in0 + acc0;
+        data_out[1] = scalar * in1 + acc1;
+        data_out[2] = scalar * in2 + acc2;
+        data_out[3] = scalar * in3 + acc3;
+
+        count -= 4;
+        data += 4;
+        data_out += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Leftover elements, one at a time. */
+    while (count > 0) {
+        const npy_ulong in = *data;
+        const npy_ulong acc = *data_out;
+
+        *data_out = scalar * in + acc;
+        --count;
+        ++data;
+        ++data_out;
+    }
+}
+
+/*
+ * Contiguous two-operand einsum inner loop for npy_ulong:
+ *     out[i] = in0[i] * in1[i] + out[i]   for i in [0, count)
+ *
+ *   dataptr[0], dataptr[1] - the two inputs, read as packed npy_ulong arrays
+ *   dataptr[2]             - output, read and written as a packed array
+ *
+ * `nop` and `strides` are not read.
+ *
+ * The generator's NPYV (SIMD) variant is switched off for npy_ulong
+ * (`#if 0`), so only the scalar code is spelled out here.
+ */
+static void
+ulong_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *lhs = (npy_ulong *)dataptr[0];
+    npy_ulong *rhs = (npy_ulong *)dataptr[1];
+    npy_ulong *out = (npy_ulong *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Groups of four: every load happens before the first store. */
+    while (count >= 4) {
+        const npy_ulong l0 = lhs[0], r0 = rhs[0], o0 = out[0];
+        const npy_ulong l1 = lhs[1], r1 = rhs[1], o1 = out[1];
+        const npy_ulong l2 = lhs[2], r2 = rhs[2], o2 = out[2];
+        const npy_ulong l3 = lhs[3], r3 = rhs[3], o3 = out[3];
+
+        out[0] = l0 * r0 + o0;
+        out[1] = l1 * r1 + o1;
+        out[2] = l2 * r2 + o2;
+        out[3] = l3 * r3 + o3;
+
+        count -= 4;
+        lhs += 4;
+        rhs += 4;
+        out += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Leftover elements, one at a time. */
+    while (count > 0) {
+        const npy_ulong l = *lhs;
+        const npy_ulong r = *rhs;
+        const npy_ulong o = *out;
+
+        *out = l * r + o;
+        --count;
+        ++lhs;
+        ++rhs;
+        ++out;
+    }
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand npy_ulong kernel in which operand 0 is read once as a single
+ * value while operand 1 and the output are walked as packed arrays.
+ *
+ *   dataptr[0] - the single value used as the multiplier
+ *   dataptr[1] - input array
+ *   dataptr[2] - output array
+ *
+ * The element work is delegated to ulong_sum_of_products_muladd.
+ * `nop` and `strides` are not read.
+ */
+static void
+ulong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ulong factor = *(npy_ulong *)dataptr[0];
+    npy_ulong *vec = (npy_ulong *)dataptr[1];
+    npy_ulong *out = (npy_ulong *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    ulong_sum_of_products_muladd(vec, out, factor, count);
+}
+
+/*
+ * Two-operand npy_ulong kernel in which operand 1 is read once as a single
+ * value while operand 0 and the output are walked as packed arrays.
+ *
+ *   dataptr[0] - input array
+ *   dataptr[1] - the single value used as the multiplier
+ *   dataptr[2] - output array
+ *
+ * The element work is delegated to ulong_sum_of_products_muladd.
+ * `nop` and `strides` are not read.
+ */
+static void
+ulong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ulong factor = *(npy_ulong *)dataptr[1];
+    npy_ulong *vec = (npy_ulong *)dataptr[0];
+    npy_ulong *out = (npy_ulong *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    ulong_sum_of_products_muladd(vec, out, factor, count);
+}
+
+/*
+ * Two-operand npy_ulong kernel that reduces two packed arrays into one
+ * output element:
+ *     *out = *out + sum over i in [0, count) of in0[i] * in1[i]
+ *
+ *   dataptr[0], dataptr[1] - the two inputs, read as packed npy_ulong arrays
+ *   dataptr[2]             - the single output element, updated once at
+ *                            the end
+ *
+ * `nop` and `strides` are not read.
+ *
+ * The generator's NPYV (SIMD) variant is switched off for npy_ulong
+ * (`#if 0`), so only the scalar code is spelled out here.
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *lhs = (npy_ulong *)dataptr[0];
+    npy_ulong *rhs = (npy_ulong *)dataptr[1];
+    npy_ulong dot = 0;
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four products per pass. */
+    while (count >= 4) {
+        const npy_ulong p0 = lhs[0] * rhs[0];
+        const npy_ulong p1 = lhs[1] * rhs[1];
+        const npy_ulong p2 = lhs[2] * rhs[2];
+        const npy_ulong p3 = lhs[3] * rhs[3];
+
+        dot += p0 + p1 + p2 + p3;
+        count -= 4;
+        lhs += 4;
+        rhs += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Leftover products, one at a time. */
+    while (count > 0) {
+        const npy_ulong l = *lhs;
+        const npy_ulong r = *rhs;
+
+        dot += l * r;
+        --count;
+        ++lhs;
+        ++rhs;
+    }
+    *(npy_ulong *)dataptr[2] = (*(npy_ulong *)dataptr[2]) + dot;
+}
+
+/*
+ * Two-operand npy_ulong kernel with a single-value operand 0, a packed
+ * operand 1 and a single output element:
+ *     *out = *out + value0 * (sum of the `count` elements of operand 1)
+ *
+ * The array sum comes from ulong_sum_of_arr. `nop` and `strides` are not
+ * read.
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ulong factor = *(npy_ulong *)dataptr[0];
+    const npy_ulong total = ulong_sum_of_arr((npy_ulong *)dataptr[1], count);
+    npy_ulong *out = (npy_ulong *)dataptr[2];
+
+    *out = *out + factor * total;
+}
+
+/*
+ * Two-operand npy_ulong kernel with a packed operand 0, a single-value
+ * operand 1 and a single output element:
+ *     *out = *out + value1 * (sum of the `count` elements of operand 0)
+ *
+ * The array sum comes from ulong_sum_of_arr. `nop` and `strides` are not
+ * read.
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ulong factor = *(npy_ulong *)dataptr[1];
+    const npy_ulong total = ulong_sum_of_arr((npy_ulong *)dataptr[0], count);
+    npy_ulong *out = (npy_ulong *)dataptr[2];
+
+    *out = *out + factor * total;
+}
+
+#elif 1 == 3 && !0
+
+/*
+ * Three-operand element-wise kernel over consecutive npy_ulong elements:
+ *
+ *     out[k] = op0[k] * op1[k] * op2[k] + out[k]
+ *
+ * Operands are taken from dataptr[0..2] and the output from dataptr[3].
+ * `nop` and `strides` are not consulted.
+ */
+static void
+ulong_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+/* Multiply-accumulate for the element at offset k of every array. */
+#define ULONG_PROD3_AT(k) \
+    dst[k] = ((src0[k]) * \
+              (src1[k]) * \
+              (src2[k]) + \
+              (dst[k]))
+
+    npy_ulong *src0 = (npy_ulong *)dataptr[0];
+    npy_ulong *src1 = (npy_ulong *)dataptr[1];
+    npy_ulong *src2 = (npy_ulong *)dataptr[2];
+    npy_ulong *dst = (npy_ulong *)dataptr[3];
+    int k;
+
+    /* Main body: eight elements per iteration, written out by hand. */
+    for (; count >= 8;
+            count -= 8, src0 += 8, src1 += 8, src2 += 8, dst += 8) {
+        ULONG_PROD3_AT(0);
+        ULONG_PROD3_AT(1);
+        ULONG_PROD3_AT(2);
+        ULONG_PROD3_AT(3);
+        ULONG_PROD3_AT(4);
+        ULONG_PROD3_AT(5);
+        ULONG_PROD3_AT(6);
+        ULONG_PROD3_AT(7);
+    }
+
+    /*
+     * Remainder: handle one element at a time, leaving as soon as the
+     * counter reaches zero. The exit test is evaluated before each of the
+     * eight possible steps.
+     */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        ULONG_PROD3_AT(k);
+    }
+
+#undef ULONG_PROD3_AT
+}
+
+#else /* 1 > 3 || @complex */
+
+/*
+ * Generic N-operand variant: for each of `count` steps, multiplies together
+ * one element from each of dataptr[0 .. nop-1], adds the product to the
+ * element at dataptr[nop], and then advances all nop + 1 pointers in
+ * `dataptr` by sizeof(npy_ulong).
+ *
+ * Note that the entries of `dataptr` themselves are modified. `strides` is
+ * not consulted.
+ */
+static void
+ulong_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+        /* Running product of the current element of every input operand. */
+        npy_ulong temp = (*(npy_ulong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulong *)dataptr[i]);
+        }
+        /* For nop >= 1 the loop above leaves i == nop, i.e. the output slot. */
+        *(npy_ulong *)dataptr[nop] = (temp +
+                                           (*(npy_ulong *)dataptr[i]));
+        /* Step every operand and the output to the next element. */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_ulong);
+        }
+#else /* complex */
+        /* Not compiled: the enclosing condition `!0` is always true. */
+#  if 1 <= 3
+#    define _SUMPROD_NOP 1
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_ulong re, im, tmp;
+        int i;
+        re = ((npy_ulong *)dataptr[0])[0];
+        im = ((npy_ulong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulong *)dataptr[i])[0] -
+                  im * ((npy_ulong *)dataptr[i])[1];
+            im = re * ((npy_ulong *)dataptr[i])[1] +
+                 im * ((npy_ulong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ulong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ulong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ulong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ulong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_ulong);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1 */
+
+#if 1 == 1
+
+/*
+ * One-operand reduction into a single output element.
+ *
+ * In the compiled (`!0`) path, dataptr[0] and `count` are handed to
+ * ulong_sum_of_arr() and the returned value is added to the element at
+ * dataptr[1]. `nop` and `strides` are not consulted.
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    const npy_ulong total = ulong_sum_of_arr((npy_ulong *)dataptr[0], count);
+
+    *(npy_ulong *)dataptr[1] += total;
+#else
+    /* Not compiled. Treats the input as interleaved (re, im) pairs. */
+    npy_ulong sum_re = 0;
+    npy_ulong sum_im = 0;
+    npy_ulong *src = (npy_ulong *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four pairs (eight slots) per iteration. */
+    while (count > 4) {
+        const npy_ulong re_lo = src[0] + src[2];
+        const npy_ulong re_hi = src[4] + src[6];
+        const npy_ulong im_lo = src[1] + src[3];
+        const npy_ulong im_hi = src[5] + src[7];
+        sum_re += re_lo + re_hi;
+        sum_im += im_lo + im_hi;
+        count -= 4;
+        src += 4*2;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Whatever is left, one pair at a time. */
+    while (count > 0) {
+        sum_re += src[0];
+        sum_im += src[1];
+        --count;
+        src += 2;
+    }
+    ((npy_ulong *)dataptr[1])[0] += sum_re;
+    ((npy_ulong *)dataptr[1])[1] += sum_im;
+#endif // !0
+}
+
+#endif /* 1 == 1 */
+
+/*
+ * One-operand strided reduction into a single output element.
+ *
+ * Walks `count` elements starting at dataptr[0], stepping strides[0] bytes
+ * between them, sums them into a local accumulator and finally adds that
+ * total to the element at dataptr[1].
+ *
+ * All preprocessor conditions below are constant expressions. In this
+ * instantiation only the `1 == 1`, non-complex paths are compiled; the
+ * two-operand, three-operand, N-operand and complex branches are inactive.
+ */
+static void
+ulong_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_ulong accum_re = 0, accum_im = 0;
+#else
+    npy_ulong accum = 0;
+#endif
+
+    /* Local copies of the operand pointers/strides that this variant uses. */
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 1 == 1
+        /* Compiled path: data0 is a char pointer, so stride0 is in bytes. */
+        accum += (*(npy_ulong *)data0);
+        data0 += stride0;
+#  elif 1 == 2
+        accum += (*(npy_ulong *)data0) *
+                 (*(npy_ulong *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1 == 3
+        accum += (*(npy_ulong *)data0) *
+                 (*(npy_ulong *)data1) *
+                 (*(npy_ulong *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_ulong temp = (*(npy_ulong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulong *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        accum_re += ((npy_ulong *)data0)[0];
+        accum_im += ((npy_ulong *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ulong re, im, tmp;
+        int i;
+        re = ((npy_ulong *)dataptr[0])[0];
+        im = ((npy_ulong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulong *)dataptr[i])[0] -
+                  im * ((npy_ulong *)dataptr[i])[1];
+            im = re * ((npy_ulong *)dataptr[i])[1] +
+                 im * ((npy_ulong *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+    /* Fold the local total into the output element. */
+#if 0
+#  if 1 <= 3
+    ((npy_ulong *)dataptr[1])[0] += accum_re;
+    ((npy_ulong *)dataptr[1])[1] += accum_im;
+#  else
+    ((npy_ulong *)dataptr[nop])[0] += accum_re;
+    ((npy_ulong *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1 <= 3
+    /* Compiled path: the output lives at dataptr[1]. */
+    *((npy_ulong *)dataptr[1]) = (accum +
+                                    (*((npy_ulong *)dataptr[1])));
+#  else
+    *((npy_ulong *)dataptr[nop]) = (accum +
+                                    (*((npy_ulong *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+/*
+ * Two-operand strided element-wise kernel. For each of `count` steps:
+ *
+ *     *out = (*op0) * (*op1) + (*out)
+ *
+ * with op0 = dataptr[0], op1 = dataptr[1], out = dataptr[2], each advanced
+ * by its own entry of `strides` (in bytes, since the cursors are char
+ * pointers) after every step.
+ *
+ * All preprocessor conditions below are constant expressions. In this
+ * instantiation only the `2 == 2`, non-complex paths are compiled; the other
+ * branches are inactive.
+ */
+static void
+ulong_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    /* Local cursors and strides for the operands this variant uses. */
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (2 == 1) || (2 <= 3 && !0)
+    /* The output is the slot right after the two inputs. */
+    char *data_out = dataptr[2];
+    npy_intp stride_out = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_two (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 2 == 1
+        *(npy_ulong *)data_out = ((*(npy_ulong *)data0) +
+                                         (*(npy_ulong *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 2 == 2
+        /* Compiled path: multiply the two inputs and accumulate in place. */
+        *(npy_ulong *)data_out = ((*(npy_ulong *)data0) *
+                                         (*(npy_ulong *)data1) +
+                                         (*(npy_ulong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 2 == 3
+        *(npy_ulong *)data_out = ((*(npy_ulong *)data0) *
+                                         (*(npy_ulong *)data1) *
+                                         (*(npy_ulong *)data2) +
+                                         (*(npy_ulong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_ulong temp = (*(npy_ulong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulong *)dataptr[i]);
+        }
+        *(npy_ulong *)dataptr[nop] = (temp +
+                                           (*(npy_ulong *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        ((npy_ulong *)data_out)[0] = ((npy_ulong *)data0)[0] +
+                                         ((npy_ulong *)data_out)[0];
+        ((npy_ulong *)data_out)[1] = ((npy_ulong *)data0)[1] +
+                                         ((npy_ulong *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ulong re, im, tmp;
+        int i;
+        re = ((npy_ulong *)dataptr[0])[0];
+        im = ((npy_ulong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulong *)dataptr[i])[0] -
+                  im * ((npy_ulong *)dataptr[i])[1];
+            im = re * ((npy_ulong *)dataptr[i])[1] +
+                 im * ((npy_ulong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ulong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ulong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ulong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ulong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 2 == 1
+
+/*
+ * One-operand element-wise accumulation over consecutive elements:
+ *
+ *     out[k] = in[k] + out[k]
+ *
+ * with in = dataptr[0] and out = dataptr[1]. Counts below eight are handled
+ * entirely by the fall-through switch; larger counts are consumed eight at a
+ * time and the leftover goes back through the same switch. `nop` and
+ * `strides` are not consulted.
+ */
+static void
+ulong_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+/* Accumulate element k of the input into element k of the output. */
+#if !0
+#  define ULONG_ACCUM_AT(k) \
+        dst[k] = ((src[k]) + \
+                  (dst[k]))
+#else /* complex: two npy_ulong slots per element */
+#  define ULONG_ACCUM_AT(k) \
+        do { \
+            ((npy_ulong *)dst + 2*(k))[0] = \
+                                    ((npy_ulong *)src + 2*(k))[0] + \
+                                    ((npy_ulong *)dst + 2*(k))[0]; \
+            ((npy_ulong *)dst + 2*(k))[1] = \
+                                    ((npy_ulong *)src + 2*(k))[1] + \
+                                    ((npy_ulong *)dst + 2*(k))[1]; \
+        } while (0)
+#endif
+
+    npy_ulong *src = (npy_ulong *)dataptr[0];
+    npy_ulong *dst = (npy_ulong *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+    for (;;) {
+        /*
+         * Checked before the unrolled loop so that small counts finish
+         * quickly. Every case deliberately falls through to the next one,
+         * ending at `case 0`, which is the only way out of the function.
+         */
+        switch (count) {
+            case 7:
+                ULONG_ACCUM_AT(6);
+            case 6:
+                ULONG_ACCUM_AT(5);
+            case 5:
+                ULONG_ACCUM_AT(4);
+            case 4:
+                ULONG_ACCUM_AT(3);
+            case 3:
+                ULONG_ACCUM_AT(2);
+            case 2:
+                ULONG_ACCUM_AT(1);
+            case 1:
+                ULONG_ACCUM_AT(0);
+            case 0:
+                return;
+        }
+
+        /* Eight elements per iteration, written out by hand. */
+        for (; count >= 8; count -= 8, src += 8, dst += 8) {
+            ULONG_ACCUM_AT(0);
+            ULONG_ACCUM_AT(1);
+            ULONG_ACCUM_AT(2);
+            ULONG_ACCUM_AT(3);
+            ULONG_ACCUM_AT(4);
+            ULONG_ACCUM_AT(5);
+            ULONG_ACCUM_AT(6);
+            ULONG_ACCUM_AT(7);
+        }
+        /* Loop around to the switch to finish the remainder. */
+    }
+
+#undef ULONG_ACCUM_AT
+}
+
+#elif 2 == 2 && !0
+
+/*
+ * Multiply-add helper: for every i in [0, count),
+ *
+ *     data_out[i] = scalar * data[i] + data_out[i]
+ *
+ * Only the scalar code after the `#else` is compiled in this instantiation;
+ * the vectorised section is guarded by `#if 0`.
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_muladd(npy_ulong *data, npy_ulong *data_out, npy_ulong scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_ulong
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_ulong;
+    const npyv_ulong v_scalar = npyv_setall_ulong(scalar);
+    #line 306
+    if(is_aligned) {
+        /* Both pointers aligned: four vectors per iteration via loada/storea. */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_ulong b0 = npyv_loada_ulong(data + vstep * 0);
+            npyv_ulong c0 = npyv_loada_ulong(data_out + vstep * 0);
+            
+#line 312
+            npyv_ulong b1 = npyv_loada_ulong(data + vstep * 1);
+            npyv_ulong c1 = npyv_loada_ulong(data_out + vstep * 1);
+            
+#line 312
+            npyv_ulong b2 = npyv_loada_ulong(data + vstep * 2);
+            npyv_ulong c2 = npyv_loada_ulong(data_out + vstep * 2);
+            
+#line 312
+            npyv_ulong b3 = npyv_loada_ulong(data + vstep * 3);
+            npyv_ulong c3 = npyv_loada_ulong(data_out + vstep * 3);
+            
+            #line 318
+            npyv_ulong abc0 = npyv_muladd_ulong(v_scalar, b0, c0);
+            
+#line 318
+            npyv_ulong abc1 = npyv_muladd_ulong(v_scalar, b1, c1);
+            
+#line 318
+            npyv_ulong abc2 = npyv_muladd_ulong(v_scalar, b2, c2);
+            
+#line 318
+            npyv_ulong abc3 = npyv_muladd_ulong(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_ulong(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_ulong(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_ulong(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_ulong(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        /* Same loop body, but with the plain load/store calls. */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_ulong b0 = npyv_load_ulong(data + vstep * 0);
+            npyv_ulong c0 = npyv_load_ulong(data_out + vstep * 0);
+            
+#line 312
+            npyv_ulong b1 = npyv_load_ulong(data + vstep * 1);
+            npyv_ulong c1 = npyv_load_ulong(data_out + vstep * 1);
+            
+#line 312
+            npyv_ulong b2 = npyv_load_ulong(data + vstep * 2);
+            npyv_ulong c2 = npyv_load_ulong(data_out + vstep * 2);
+            
+#line 312
+            npyv_ulong b3 = npyv_load_ulong(data + vstep * 3);
+            npyv_ulong c3 = npyv_load_ulong(data_out + vstep * 3);
+            
+            #line 318
+            npyv_ulong abc0 = npyv_muladd_ulong(v_scalar, b0, c0);
+            
+#line 318
+            npyv_ulong abc1 = npyv_muladd_ulong(v_scalar, b1, c1);
+            
+#line 318
+            npyv_ulong abc2 = npyv_muladd_ulong(v_scalar, b2, c2);
+            
+#line 318
+            npyv_ulong abc3 = npyv_muladd_ulong(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_ulong(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_ulong(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_ulong(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_ulong(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    /*
+     * Leftover elements, one vector step at a time.
+     * NOTE(review): presumably load_tillz/store_till touch at most `count`
+     * lanes -- confirm against the NPYV documentation.
+     */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_ulong a = npyv_load_tillz_ulong(data, count);
+        npyv_ulong b = npyv_load_tillz_ulong(data_out, count);
+        npyv_store_till_ulong(data_out, count, npyv_muladd_ulong(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Four elements per iteration: all loads first, then the arithmetic,
+     * then all stores.
+     */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_ulong b0 = (data[0]);
+        const npy_ulong c0 = (data_out[0]);
+        
+#line 340
+        const npy_ulong b1 = (data[1]);
+        const npy_ulong c1 = (data_out[1]);
+        
+#line 340
+        const npy_ulong b2 = (data[2]);
+        const npy_ulong c2 = (data_out[2]);
+        
+#line 340
+        const npy_ulong b3 = (data[3]);
+        const npy_ulong c3 = (data_out[3]);
+        
+        #line 346
+        const npy_ulong abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_ulong abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_ulong abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_ulong abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining elements (or all of them when the block above is disabled). */
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_ulong b = (*data);
+        const npy_ulong c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_ulong
+}
+
+/*
+ * Two-operand element-wise kernel over consecutive elements:
+ *
+ *     out[i] = op0[i] * op1[i] + out[i]      for i in [0, count)
+ *
+ * with op0 = dataptr[0], op1 = dataptr[1] and out = dataptr[2]. `nop` and
+ * `strides` are not consulted.
+ *
+ * Only the scalar code after the `#else` is compiled in this instantiation;
+ * the vectorised section is guarded by `#if 0`.
+ */
+static void
+ulong_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *data0 = (npy_ulong *)dataptr[0];
+    npy_ulong *data1 = (npy_ulong *)dataptr[1];
+    npy_ulong *data_out = (npy_ulong *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_ulong
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_ulong;
+
+    #line 384
+    if(is_aligned) {
+        /* All three pointers aligned: four vectors per iteration via loada/storea. */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_ulong a0 = npyv_loada_ulong(data0 + vstep * 0);
+            npyv_ulong b0 = npyv_loada_ulong(data1 + vstep * 0);
+            npyv_ulong c0 = npyv_loada_ulong(data_out + vstep * 0);
+            
+#line 390
+            npyv_ulong a1 = npyv_loada_ulong(data0 + vstep * 1);
+            npyv_ulong b1 = npyv_loada_ulong(data1 + vstep * 1);
+            npyv_ulong c1 = npyv_loada_ulong(data_out + vstep * 1);
+            
+#line 390
+            npyv_ulong a2 = npyv_loada_ulong(data0 + vstep * 2);
+            npyv_ulong b2 = npyv_loada_ulong(data1 + vstep * 2);
+            npyv_ulong c2 = npyv_loada_ulong(data_out + vstep * 2);
+            
+#line 390
+            npyv_ulong a3 = npyv_loada_ulong(data0 + vstep * 3);
+            npyv_ulong b3 = npyv_loada_ulong(data1 + vstep * 3);
+            npyv_ulong c3 = npyv_loada_ulong(data_out + vstep * 3);
+            
+            #line 397
+            npyv_ulong abc0 = npyv_muladd_ulong(a0, b0, c0);
+            
+#line 397
+            npyv_ulong abc1 = npyv_muladd_ulong(a1, b1, c1);
+            
+#line 397
+            npyv_ulong abc2 = npyv_muladd_ulong(a2, b2, c2);
+            
+#line 397
+            npyv_ulong abc3 = npyv_muladd_ulong(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_ulong(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_ulong(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_ulong(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_ulong(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        /* Same loop body, but with the plain load/store calls. */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_ulong a0 = npyv_load_ulong(data0 + vstep * 0);
+            npyv_ulong b0 = npyv_load_ulong(data1 + vstep * 0);
+            npyv_ulong c0 = npyv_load_ulong(data_out + vstep * 0);
+            
+#line 390
+            npyv_ulong a1 = npyv_load_ulong(data0 + vstep * 1);
+            npyv_ulong b1 = npyv_load_ulong(data1 + vstep * 1);
+            npyv_ulong c1 = npyv_load_ulong(data_out + vstep * 1);
+            
+#line 390
+            npyv_ulong a2 = npyv_load_ulong(data0 + vstep * 2);
+            npyv_ulong b2 = npyv_load_ulong(data1 + vstep * 2);
+            npyv_ulong c2 = npyv_load_ulong(data_out + vstep * 2);
+            
+#line 390
+            npyv_ulong a3 = npyv_load_ulong(data0 + vstep * 3);
+            npyv_ulong b3 = npyv_load_ulong(data1 + vstep * 3);
+            npyv_ulong c3 = npyv_load_ulong(data_out + vstep * 3);
+            
+            #line 397
+            npyv_ulong abc0 = npyv_muladd_ulong(a0, b0, c0);
+            
+#line 397
+            npyv_ulong abc1 = npyv_muladd_ulong(a1, b1, c1);
+            
+#line 397
+            npyv_ulong abc2 = npyv_muladd_ulong(a2, b2, c2);
+            
+#line 397
+            npyv_ulong abc3 = npyv_muladd_ulong(a3, b3, c3);
+            
+            #line 402
+            npyv_store_ulong(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_ulong(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_ulong(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_ulong(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    /*
+     * Leftover elements, one vector step at a time.
+     * NOTE(review): presumably load_tillz/store_till touch at most `count`
+     * lanes -- confirm against the NPYV documentation.
+     */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_ulong a = npyv_load_tillz_ulong(data0, count);
+        npyv_ulong b = npyv_load_tillz_ulong(data1, count);
+        npyv_ulong c = npyv_load_tillz_ulong(data_out, count);
+        npyv_store_till_ulong(data_out, count, npyv_muladd_ulong(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Four elements per iteration: all loads first, then the arithmetic,
+     * then all stores.
+     */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_ulong a0 = (data0[0]);
+        const npy_ulong b0 = (data1[0]);
+        const npy_ulong c0 = (data_out[0]);
+        
+#line 420
+        const npy_ulong a1 = (data0[1]);
+        const npy_ulong b1 = (data1[1]);
+        const npy_ulong c1 = (data_out[1]);
+        
+#line 420
+        const npy_ulong a2 = (data0[2]);
+        const npy_ulong b2 = (data1[2]);
+        const npy_ulong c2 = (data_out[2]);
+        
+#line 420
+        const npy_ulong a3 = (data0[3]);
+        const npy_ulong b3 = (data1[3]);
+        const npy_ulong c3 = (data_out[3]);
+        
+        #line 427
+        const npy_ulong abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_ulong abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_ulong abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_ulong abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining elements (or all of them when the block above is disabled). */
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_ulong a = (*data0);
+        const npy_ulong b = (*data1);
+        const npy_ulong c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_ulong
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand kernel: reads a single npy_ulong from dataptr[0] and passes
+ * it, with the arrays at dataptr[1] (input) and dataptr[2] (output), to
+ * ulong_sum_of_products_muladd for `count` elements. `nop` is not read.
+ */
+static void
+ulong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ulong scalar = *(npy_ulong *)dataptr[0];
+    npy_ulong *const vec = (npy_ulong *)dataptr[1];
+    npy_ulong *const out = (npy_ulong *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    ulong_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+/*
+ * Two-operand kernel: reads a single npy_ulong from dataptr[1] and passes
+ * it, with the arrays at dataptr[0] (input) and dataptr[2] (output), to
+ * ulong_sum_of_products_muladd for `count` elements. `nop` is not read.
+ */
+static void
+ulong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ulong scalar = *(npy_ulong *)dataptr[1];
+    npy_ulong *const vec = (npy_ulong *)dataptr[0];
+    npy_ulong *const out = (npy_ulong *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    ulong_sum_of_products_muladd(vec, out, scalar, count);
+}
+/*
+ * Two-operand kernel with a single output value: sums data0[i] * data1[i]
+ * over `count` elements of the npy_ulong arrays at dataptr[0] and
+ * dataptr[1], then adds that sum to the npy_ulong stored at dataptr[2].
+ * `nop` is not read; `strides` is marked NPY_UNUSED.
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *data0 = (npy_ulong *)dataptr[0];
+    npy_ulong *data1 = (npy_ulong *)dataptr[1];
+    npy_ulong accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_ulong (constant 0 here, so this vector path is compiled out)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_ulong;
+    npyv_ulong v_accum = npyv_zero_ulong();
+    /* The same 4x-unrolled loop appears twice: npyv_loada_* when is_aligned, npyv_load_* otherwise. */
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_ulong a0 = npyv_loada_ulong(data0 + vstep * 0);
+            npyv_ulong b0 = npyv_loada_ulong(data1 + vstep * 0);
+            
+#line 501
+            npyv_ulong a1 = npyv_loada_ulong(data0 + vstep * 1);
+            npyv_ulong b1 = npyv_loada_ulong(data1 + vstep * 1);
+            
+#line 501
+            npyv_ulong a2 = npyv_loada_ulong(data0 + vstep * 2);
+            npyv_ulong b2 = npyv_loada_ulong(data1 + vstep * 2);
+            
+#line 501
+            npyv_ulong a3 = npyv_loada_ulong(data0 + vstep * 3);
+            npyv_ulong b3 = npyv_loada_ulong(data1 + vstep * 3);
+            /* Chain the four multiply-adds through v_accum, starting from the last pair. */
+            npyv_ulong ab3 = npyv_muladd_ulong(a3, b3, v_accum);
+            npyv_ulong ab2 = npyv_muladd_ulong(a2, b2, ab3);
+            npyv_ulong ab1 = npyv_muladd_ulong(a1, b1, ab2);
+                   v_accum = npyv_muladd_ulong(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_ulong a0 = npyv_load_ulong(data0 + vstep * 0);
+            npyv_ulong b0 = npyv_load_ulong(data1 + vstep * 0);
+            
+#line 501
+            npyv_ulong a1 = npyv_load_ulong(data0 + vstep * 1);
+            npyv_ulong b1 = npyv_load_ulong(data1 + vstep * 1);
+            
+#line 501
+            npyv_ulong a2 = npyv_load_ulong(data0 + vstep * 2);
+            npyv_ulong b2 = npyv_load_ulong(data1 + vstep * 2);
+            
+#line 501
+            npyv_ulong a3 = npyv_load_ulong(data0 + vstep * 3);
+            npyv_ulong b3 = npyv_load_ulong(data1 + vstep * 3);
+            
+            npyv_ulong ab3 = npyv_muladd_ulong(a3, b3, v_accum);
+            npyv_ulong ab2 = npyv_muladd_ulong(a2, b2, ab3);
+            npyv_ulong ab1 = npyv_muladd_ulong(a1, b1, ab2);
+                   v_accum = npyv_muladd_ulong(a0, b0, ab1);
+        }
+    }
+    /* Leftover elements, vstep per pass; the *_tillz_* loads are given the remaining count (presumably unused lanes read as zero - TODO confirm against the NPYV docs). */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_ulong a = npyv_load_tillz_ulong(data0, count);
+        npyv_ulong b = npyv_load_tillz_ulong(data1, count);
+        v_accum = npyv_muladd_ulong(a, b, v_accum);
+    }
+    accum = npyv_sum_ulong(v_accum);
+    npyv_cleanup();
+#else /* scalar path */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_ulong ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_ulong ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_ulong ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_ulong ab3 = (data0[3]) * (data1[3]);
+        /* Four products per pass are folded into the running sum. */
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* one element per pass for whatever is left */
+        const npy_ulong a = (*data0);
+        const npy_ulong b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_ulong
+    *(npy_ulong *)dataptr[2] = ((*(npy_ulong *)dataptr[2]) + accum);
+}
+
+/*
+ * Two-operand kernel with a single output value: multiplies the npy_ulong
+ * at dataptr[0] by the result of ulong_sum_of_arr over `count` elements of
+ * dataptr[1], and adds the product to the npy_ulong at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ulong scalar = *(npy_ulong *)dataptr[0];
+    const npy_ulong total = ulong_sum_of_arr((npy_ulong *)dataptr[1], count);
+    npy_ulong *const out = (npy_ulong *)dataptr[2];
+
+    *out = *out + scalar * total;
+}
+
+/*
+ * Two-operand kernel with a single output value: multiplies the npy_ulong
+ * at dataptr[1] by the result of ulong_sum_of_arr over `count` elements of
+ * dataptr[0], and adds the product to the npy_ulong at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ulong total = ulong_sum_of_arr((npy_ulong *)dataptr[0], count);
+    const npy_ulong scalar = *(npy_ulong *)dataptr[1];
+    npy_ulong *const out = (npy_ulong *)dataptr[2];
+
+    *out = *out + scalar * total;
+}
+
+#elif 2 == 3 && !0
+
+/*
+ * Three-operand kernel for contiguous npy_ulong arrays. For each element,
+ * in ascending index order:
+ *     out[i] = dataptr[0][i] * dataptr[1][i] * dataptr[2][i] + out[i]
+ * where out is dataptr[3]. `nop` is not read. This definition sits under
+ * `#elif 2 == 3 && !0`, whose condition is constant false.
+ */
+static void
+ulong_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *in_a = (npy_ulong *)dataptr[0];
+    npy_ulong *in_b = (npy_ulong *)dataptr[1];
+    npy_ulong *in_c = (npy_ulong *)dataptr[2];
+    npy_ulong *out = (npy_ulong *)dataptr[3];
+    int k;
+
+    /* Bulk of the work: eight elements per pass, written out by hand. */
+    for (; count >= 8; count -= 8, in_a += 8, in_b += 8, in_c += 8, out += 8) {
+#line 576
+        out[0] = in_a[0] * in_b[0] * in_c[0] + out[0];
+        out[1] = in_a[1] * in_b[1] * in_c[1] + out[1];
+        out[2] = in_a[2] * in_b[2] * in_c[2] + out[2];
+        out[3] = in_a[3] * in_b[3] * in_c[3] + out[3];
+        out[4] = in_a[4] * in_b[4] * in_c[4] + out[4];
+        out[5] = in_a[5] * in_b[5] * in_c[5] + out[5];
+        out[6] = in_a[6] * in_b[6] * in_c[6] + out[6];
+        out[7] = in_a[7] * in_b[7] * in_c[7] + out[7];
+    }
+
+    /*
+     * Leftovers: at most eight more elements, leaving as soon as the
+     * post-decremented count reads as zero.
+     */
+#line 592
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] = in_a[k] * in_b[k] * in_c[k] + out[k];
+    }
+}
+
+#else /* 2 > 3 || @complex */
+/*
+ * Element-by-element kernel. In the branch selected by `#if !0`, each of
+ * the `count` iterations multiplies together the current npy_ulong of
+ * operands 0..nop-1, adds the product to the current npy_ulong of operand
+ * `nop`, and then advances dataptr[0..nop] by sizeof(npy_ulong). The
+ * entries of dataptr[] are modified in place; `strides` is NPY_UNUSED.
+ */
+static void
+ulong_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+    /* Runs until the post-decremented count reads as zero. */
+    while (count--) {
+#if !0
+        npy_ulong temp = (*(npy_ulong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulong *)dataptr[i]);
+        }
+        *(npy_ulong *)dataptr[nop] = (temp +
+                                           (*(npy_ulong *)dataptr[i])); /* i == nop here whenever nop >= 1 */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_ulong);
+        }
+#else /* complex */
+#  if 2 <= 3
+#    define _SUMPROD_NOP 2
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_ulong re, im, tmp;
+        int i;
+        re = ((npy_ulong *)dataptr[0])[0];
+        im = ((npy_ulong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulong *)dataptr[i])[0] -
+                  im * ((npy_ulong *)dataptr[i])[1];
+            im = re * ((npy_ulong *)dataptr[i])[1] +
+                 im * ((npy_ulong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ulong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ulong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ulong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ulong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_ulong);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+
+/*
+ * One-operand kernel with a single output value. In the `#if !0` branch it
+ * adds the result of ulong_sum_of_arr over `count` elements of dataptr[0]
+ * to the npy_ulong at dataptr[1]. The `#else` branch instead sums
+ * interleaved (first, second) pairs into two separate accumulators.
+ * `nop` and `strides` are not read.
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    const npy_ulong total = ulong_sum_of_arr((npy_ulong *)dataptr[0], count);
+    npy_ulong *const out = (npy_ulong *)dataptr[1];
+
+    *out = total + *out;
+#else
+    npy_ulong sum_re = 0;
+    npy_ulong sum_im = 0;
+    npy_ulong *src = (npy_ulong *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four pairs per pass, while more than four pairs remain. */
+    while (count > 4) {
+        sum_re += (src[0] + src[2]) + (src[4] + src[6]);
+        sum_im += (src[1] + src[3]) + (src[5] + src[7]);
+        count -= 4;
+        src += 4*2;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One pair per pass for the rest. */
+    while (count > 0) {
+        sum_re += src[0];
+        sum_im += src[1];
+        --count;
+        src += 2;
+    }
+    ((npy_ulong *)dataptr[1])[0] += sum_re;
+    ((npy_ulong *)dataptr[1])[1] += sum_im;
+#endif // !0
+}
+
+#endif /* 2 == 1 */
+/*
+ * Strided two-operand kernel with a single output value. With the constant
+ * conditions below, the compiled code walks dataptr[0] and dataptr[1] by
+ * strides[0] and strides[1] bytes, sums the `count` npy_ulong products, and
+ * adds the sum to the npy_ulong at dataptr[2]. `nop` is only read in
+ * branches whose conditions are false here.
+ */
+static void
+ulong_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_ulong accum_re = 0, accum_im = 0;
+#else
+    npy_ulong accum = 0;
+#endif
+    /* Operand cursors: only the blocks whose constant condition holds are compiled. */
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+    /* One product per iteration; `# elif 2 == 2` is the branch that matches. */
+    while (count--) {
+#if !0
+#  if 2 == 1
+        accum += (*(npy_ulong *)data0);
+        data0 += stride0;
+#  elif 2 == 2
+        accum += (*(npy_ulong *)data0) *
+                 (*(npy_ulong *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 2 == 3
+        accum += (*(npy_ulong *)data0) *
+                 (*(npy_ulong *)data1) *
+                 (*(npy_ulong *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_ulong temp = (*(npy_ulong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulong *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        accum_re += ((npy_ulong *)data0)[0];
+        accum_im += ((npy_ulong *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ulong re, im, tmp;
+        int i;
+        re = ((npy_ulong *)dataptr[0])[0];
+        im = ((npy_ulong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulong *)dataptr[i])[0] -
+                  im * ((npy_ulong *)dataptr[i])[1];
+            im = re * ((npy_ulong *)dataptr[i])[1] +
+                 im * ((npy_ulong *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* Fold the accumulator into the output element (dataptr[2] in the compiled branch). */
+#if 0
+#  if 2 <= 3
+    ((npy_ulong *)dataptr[2])[0] += accum_re;
+    ((npy_ulong *)dataptr[2])[1] += accum_im;
+#  else
+    ((npy_ulong *)dataptr[nop])[0] += accum_re;
+    ((npy_ulong *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 2 <= 3
+    *((npy_ulong *)dataptr[2]) = (accum +
+                                    (*((npy_ulong *)dataptr[2])));
+#  else
+    *((npy_ulong *)dataptr[nop]) = (accum +
+                                    (*((npy_ulong *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+/*
+ * Strided three-operand kernel. With the constant conditions below, each of
+ * the `count` iterations computes
+ *     *data_out = *data0 * *data1 * *data2 + *data_out
+ * on npy_ulong values and then advances the four cursors by strides[0..3]
+ * bytes. `nop` is only read in branches whose conditions are false here.
+ */
+#line 131
+static void
+ulong_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data_out = dataptr[3];
+    npy_intp stride_out = strides[3];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_three (%d)\n", (int)count);
+    /* One output element per iteration; `# elif 3 == 3` is the branch that matches. */
+    while (count--) {
+#if !0
+#  if 3 == 1
+        *(npy_ulong *)data_out = ((*(npy_ulong *)data0) +
+                                         (*(npy_ulong *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 3 == 2
+        *(npy_ulong *)data_out = ((*(npy_ulong *)data0) *
+                                         (*(npy_ulong *)data1) +
+                                         (*(npy_ulong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 3 == 3
+        *(npy_ulong *)data_out = ((*(npy_ulong *)data0) *
+                                         (*(npy_ulong *)data1) *
+                                         (*(npy_ulong *)data2) +
+                                         (*(npy_ulong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_ulong temp = (*(npy_ulong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulong *)dataptr[i]);
+        }
+        *(npy_ulong *)dataptr[nop] = (temp +
+                                           (*(npy_ulong *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        ((npy_ulong *)data_out)[0] = ((npy_ulong *)data0)[0] +
+                                         ((npy_ulong *)data_out)[0];
+        ((npy_ulong *)data_out)[1] = ((npy_ulong *)data0)[1] +
+                                         ((npy_ulong *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ulong re, im, tmp;
+        int i;
+        re = ((npy_ulong *)dataptr[0])[0];
+        im = ((npy_ulong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulong *)dataptr[i])[0] -
+                  im * ((npy_ulong *)dataptr[i])[1];
+            im = re * ((npy_ulong *)dataptr[i])[1] +
+                 im * ((npy_ulong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ulong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ulong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ulong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ulong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 3 == 1
+/*
+ * One-operand kernel for contiguous npy_ulong arrays: in the `#if !0`
+ * branches, data_out[i] = data0[i] + data_out[i], with data0 = dataptr[0]
+ * and data_out = dataptr[1]. A count of 0..7 is handled entirely by the
+ * switch (entered at `case count`, falling through down to element 0, then
+ * returning). Larger counts skip the switch, run the 8x-unrolled loop, and
+ * jump back to the switch for the remainder. This definition sits under
+ * `#if 3 == 1`, whose condition is constant false. `nop` is not read.
+ * NOTE(review): a negative count matches no case and fails the while test,
+ * so the goto would repeat forever; callers presumably never pass one -
+ * confirm.
+ */
+static void
+ulong_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *data0 = (npy_ulong *)dataptr[0];
+    npy_ulong *data_out = (npy_ulong *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_ulong *)data_out + 2*6)[0] =
+                                    ((npy_ulong *)data0 + 2*6)[0] +
+                                    ((npy_ulong *)data_out + 2*6)[0];
+            ((npy_ulong *)data_out + 2*6)[1] =
+                                    ((npy_ulong *)data0 + 2*6)[1] +
+                                    ((npy_ulong *)data_out + 2*6)[1];
+#endif
+            /* fall through */
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_ulong *)data_out + 2*5)[0] =
+                                    ((npy_ulong *)data0 + 2*5)[0] +
+                                    ((npy_ulong *)data_out + 2*5)[0];
+            ((npy_ulong *)data_out + 2*5)[1] =
+                                    ((npy_ulong *)data0 + 2*5)[1] +
+                                    ((npy_ulong *)data_out + 2*5)[1];
+#endif
+            /* fall through */
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_ulong *)data_out + 2*4)[0] =
+                                    ((npy_ulong *)data0 + 2*4)[0] +
+                                    ((npy_ulong *)data_out + 2*4)[0];
+            ((npy_ulong *)data_out + 2*4)[1] =
+                                    ((npy_ulong *)data0 + 2*4)[1] +
+                                    ((npy_ulong *)data_out + 2*4)[1];
+#endif
+            /* fall through */
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_ulong *)data_out + 2*3)[0] =
+                                    ((npy_ulong *)data0 + 2*3)[0] +
+                                    ((npy_ulong *)data_out + 2*3)[0];
+            ((npy_ulong *)data_out + 2*3)[1] =
+                                    ((npy_ulong *)data0 + 2*3)[1] +
+                                    ((npy_ulong *)data_out + 2*3)[1];
+#endif
+            /* fall through */
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_ulong *)data_out + 2*2)[0] =
+                                    ((npy_ulong *)data0 + 2*2)[0] +
+                                    ((npy_ulong *)data_out + 2*2)[0];
+            ((npy_ulong *)data_out + 2*2)[1] =
+                                    ((npy_ulong *)data0 + 2*2)[1] +
+                                    ((npy_ulong *)data_out + 2*2)[1];
+#endif
+            /* fall through */
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_ulong *)data_out + 2*1)[0] =
+                                    ((npy_ulong *)data0 + 2*1)[0] +
+                                    ((npy_ulong *)data_out + 2*1)[0];
+            ((npy_ulong *)data_out + 2*1)[1] =
+                                    ((npy_ulong *)data0 + 2*1)[1] +
+                                    ((npy_ulong *)data_out + 2*1)[1];
+#endif
+            /* fall through */
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_ulong *)data_out + 2*0)[0] =
+                                    ((npy_ulong *)data0 + 2*0)[0] +
+                                    ((npy_ulong *)data_out + 2*0)[0];
+            ((npy_ulong *)data_out + 2*0)[1] =
+                                    ((npy_ulong *)data0 + 2*0)[1] +
+                                    ((npy_ulong *)data_out + 2*0)[1];
+#endif
+            /* fall through */
+        case 0:
+            return;
+    }
+    /* Reached only when count matched no case above. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_ulong *)data_out + 2*0)[0] =
+                                ((npy_ulong *)data0 + 2*0)[0] +
+                                ((npy_ulong *)data_out + 2*0)[0];
+        ((npy_ulong *)data_out + 2*0)[1] =
+                                ((npy_ulong *)data0 + 2*0)[1] +
+                                ((npy_ulong *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_ulong *)data_out + 2*1)[0] =
+                                ((npy_ulong *)data0 + 2*1)[0] +
+                                ((npy_ulong *)data_out + 2*1)[0];
+        ((npy_ulong *)data_out + 2*1)[1] =
+                                ((npy_ulong *)data0 + 2*1)[1] +
+                                ((npy_ulong *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_ulong *)data_out + 2*2)[0] =
+                                ((npy_ulong *)data0 + 2*2)[0] +
+                                ((npy_ulong *)data_out + 2*2)[0];
+        ((npy_ulong *)data_out + 2*2)[1] =
+                                ((npy_ulong *)data0 + 2*2)[1] +
+                                ((npy_ulong *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_ulong *)data_out + 2*3)[0] =
+                                ((npy_ulong *)data0 + 2*3)[0] +
+                                ((npy_ulong *)data_out + 2*3)[0];
+        ((npy_ulong *)data_out + 2*3)[1] =
+                                ((npy_ulong *)data0 + 2*3)[1] +
+                                ((npy_ulong *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_ulong *)data_out + 2*4)[0] =
+                                ((npy_ulong *)data0 + 2*4)[0] +
+                                ((npy_ulong *)data_out + 2*4)[0];
+        ((npy_ulong *)data_out + 2*4)[1] =
+                                ((npy_ulong *)data0 + 2*4)[1] +
+                                ((npy_ulong *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_ulong *)data_out + 2*5)[0] =
+                                ((npy_ulong *)data0 + 2*5)[0] +
+                                ((npy_ulong *)data_out + 2*5)[0];
+        ((npy_ulong *)data_out + 2*5)[1] =
+                                ((npy_ulong *)data0 + 2*5)[1] +
+                                ((npy_ulong *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_ulong *)data_out + 2*6)[0] =
+                                ((npy_ulong *)data0 + 2*6)[0] +
+                                ((npy_ulong *)data_out + 2*6)[0];
+        ((npy_ulong *)data_out + 2*6)[1] =
+                                ((npy_ulong *)data0 + 2*6)[1] +
+                                ((npy_ulong *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_ulong *)data_out + 2*7)[0] =
+                                ((npy_ulong *)data0 + 2*7)[0] +
+                                ((npy_ulong *)data_out + 2*7)[0];
+        ((npy_ulong *)data_out + 2*7)[1] =
+                                ((npy_ulong *)data0 + 2*7)[1] +
+                                ((npy_ulong *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 3 == 2 && !0
+
+/*
+ * Multiply-add helper: for each of the `count` elements,
+ *     data_out[i] = scalar * data[i] + data_out[i]
+ * on npy_ulong values. The `#if 0` NPYV path is compiled out, so the scalar
+ * loops after `#else` are what gets built. This definition sits under
+ * `#elif 3 == 2 && !0`, whose condition is constant false.
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_muladd(npy_ulong *data, npy_ulong *data_out, npy_ulong scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_ulong (constant 0 here, so this vector path is compiled out)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_ulong;
+    const npyv_ulong v_scalar = npyv_setall_ulong(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_ulong b0 = npyv_loada_ulong(data + vstep * 0);
+            npyv_ulong c0 = npyv_loada_ulong(data_out + vstep * 0);
+            
+#line 312
+            npyv_ulong b1 = npyv_loada_ulong(data + vstep * 1);
+            npyv_ulong c1 = npyv_loada_ulong(data_out + vstep * 1);
+            
+#line 312
+            npyv_ulong b2 = npyv_loada_ulong(data + vstep * 2);
+            npyv_ulong c2 = npyv_loada_ulong(data_out + vstep * 2);
+            
+#line 312
+            npyv_ulong b3 = npyv_loada_ulong(data + vstep * 3);
+            npyv_ulong c3 = npyv_loada_ulong(data_out + vstep * 3);
+            
+            #line 318
+            npyv_ulong abc0 = npyv_muladd_ulong(v_scalar, b0, c0);
+            
+#line 318
+            npyv_ulong abc1 = npyv_muladd_ulong(v_scalar, b1, c1);
+            
+#line 318
+            npyv_ulong abc2 = npyv_muladd_ulong(v_scalar, b2, c2);
+            
+#line 318
+            npyv_ulong abc3 = npyv_muladd_ulong(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_ulong(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_ulong(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_ulong(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_ulong(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Same loop again, using npyv_load_* / npyv_store_* instead of the aligned loada/storea calls. */
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_ulong b0 = npyv_load_ulong(data + vstep * 0);
+            npyv_ulong c0 = npyv_load_ulong(data_out + vstep * 0);
+            
+#line 312
+            npyv_ulong b1 = npyv_load_ulong(data + vstep * 1);
+            npyv_ulong c1 = npyv_load_ulong(data_out + vstep * 1);
+            
+#line 312
+            npyv_ulong b2 = npyv_load_ulong(data + vstep * 2);
+            npyv_ulong c2 = npyv_load_ulong(data_out + vstep * 2);
+            
+#line 312
+            npyv_ulong b3 = npyv_load_ulong(data + vstep * 3);
+            npyv_ulong c3 = npyv_load_ulong(data_out + vstep * 3);
+            
+            #line 318
+            npyv_ulong abc0 = npyv_muladd_ulong(v_scalar, b0, c0);
+            
+#line 318
+            npyv_ulong abc1 = npyv_muladd_ulong(v_scalar, b1, c1);
+            
+#line 318
+            npyv_ulong abc2 = npyv_muladd_ulong(v_scalar, b2, c2);
+            
+#line 318
+            npyv_ulong abc3 = npyv_muladd_ulong(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_ulong(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_ulong(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_ulong(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_ulong(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Leftover elements, vstep per pass; here `a` holds data and `b` holds data_out. The till/tillz calls are given the remaining count (presumably limiting the lanes touched - TODO confirm against the NPYV docs). */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_ulong a = npyv_load_tillz_ulong(data, count);
+        npyv_ulong b = npyv_load_tillz_ulong(data_out, count);
+        npyv_store_till_ulong(data_out, count, npyv_muladd_ulong(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* scalar path */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_ulong b0 = (data[0]);
+        const npy_ulong c0 = (data_out[0]);
+        
+#line 340
+        const npy_ulong b1 = (data[1]);
+        const npy_ulong c1 = (data_out[1]);
+        
+#line 340
+        const npy_ulong b2 = (data[2]);
+        const npy_ulong c2 = (data_out[2]);
+        
+#line 340
+        const npy_ulong b3 = (data[3]);
+        const npy_ulong c3 = (data_out[3]);
+        /* All eight loads above happen before any of the four stores below. */
+        #line 346
+        const npy_ulong abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_ulong abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_ulong abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_ulong abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* one element per pass for whatever is left */
+        const npy_ulong b = (*data);
+        const npy_ulong c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_ulong
+}
+
+/*
+ * Element-wise multiply-accumulate over contiguous npy_ulong data:
+ *     out[i] = in0[i] * in1[i] + out[i]    for `count` elements,
+ * with in0 = dataptr[0], in1 = dataptr[1] and out = dataptr[2].
+ *
+ * `nop` and the strides are not consulted.  No NPYV vector branch is
+ * emitted for npy_ulong, so only the scalar loops below are compiled.
+ */
+static void
+ulong_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *lhs = (npy_ulong *)dataptr[0];
+    npy_ulong *rhs = (npy_ulong *)dataptr[1];
+    npy_ulong *out = (npy_ulong *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Four elements per iteration.  Every load of the group happens
+     * before the first store of the group, so the staging is the same
+     * as a load / compute / store sequence.
+     */
+    while (count >= 4) {
+        const npy_ulong r0 = lhs[0] * rhs[0] + out[0];
+        const npy_ulong r1 = lhs[1] * rhs[1] + out[1];
+        const npy_ulong r2 = lhs[2] * rhs[2] + out[2];
+        const npy_ulong r3 = lhs[3] * rhs[3] + out[3];
+
+        out[0] = r0;
+        out[1] = r1;
+        out[2] = r2;
+        out[3] = r3;
+
+        lhs += 4;
+        rhs += 4;
+        out += 4;
+        count -= 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Leftover elements, or all of them when the unrolled loop is disabled */
+    while (count > 0) {
+        *out = (*lhs) * (*rhs) + (*out);
+        ++lhs;
+        ++rhs;
+        ++out;
+        --count;
+    }
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand case where operand 0 supplies a single value: that value is
+ * read once from dataptr[0] and handed, together with the npy_ulong arrays
+ * at dataptr[1] (input) and dataptr[2] (output), to
+ * ulong_sum_of_products_muladd, which performs
+ * data_out[i] = scalar * data[i] + data_out[i] over `count` elements.
+ */
+static void
+ulong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *src = (npy_ulong *)dataptr[1];
+    npy_ulong *dst = (npy_ulong *)dataptr[2];
+    const npy_ulong factor = *(npy_ulong *)dataptr[0];
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    ulong_sum_of_products_muladd(src, dst, factor, count);
+}
+
+/*
+ * Mirror of the stride0/contig variant: here operand 1 supplies the single
+ * value (read once from dataptr[1]) while dataptr[0] is the npy_ulong input
+ * array and dataptr[2] the output array.  The work is delegated to
+ * ulong_sum_of_products_muladd over `count` elements.
+ */
+static void
+ulong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *src = (npy_ulong *)dataptr[0];
+    npy_ulong *dst = (npy_ulong *)dataptr[2];
+    const npy_ulong factor = *(npy_ulong *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    ulong_sum_of_products_muladd(src, dst, factor, count);
+}
+
+/*
+ * Reduction of two contiguous npy_ulong inputs into one output value:
+ *     *out += sum over i of in0[i] * in1[i]    for `count` elements,
+ * with in0 = dataptr[0], in1 = dataptr[1] and out = dataptr[2].
+ *
+ * No NPYV vector branch is emitted for npy_ulong, so only the scalar
+ * loops below are compiled.
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *lhs = (npy_ulong *)dataptr[0];
+    npy_ulong *rhs = (npy_ulong *)dataptr[1];
+    npy_ulong *out = (npy_ulong *)dataptr[2];
+    npy_ulong total = 0;
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four products per iteration, added together before accumulating */
+    while (count >= 4) {
+        const npy_ulong p0 = lhs[0] * rhs[0];
+        const npy_ulong p1 = lhs[1] * rhs[1];
+        const npy_ulong p2 = lhs[2] * rhs[2];
+        const npy_ulong p3 = lhs[3] * rhs[3];
+
+        total += p0 + p1 + p2 + p3;
+
+        lhs += 4;
+        rhs += 4;
+        count -= 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Leftover elements, or all of them when the unrolled loop is disabled */
+    while (count > 0) {
+        total += (*lhs) * (*rhs);
+        ++lhs;
+        ++rhs;
+        --count;
+    }
+    *out += total;
+}
+
+/*
+ * Operand 0 supplies one value and operand 1 an npy_ulong array; the single
+ * output at dataptr[2] is incremented by value0 * ulong_sum_of_arr(data1,
+ * count).  ulong_sum_of_arr is defined elsewhere in this file; presumably
+ * it returns the sum of the `count` elements -- confirm there.
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ulong scale = *(npy_ulong *)dataptr[0];
+    npy_ulong *out = (npy_ulong *)dataptr[2];
+    /* Reduce the array first; the output is only read afterwards */
+    const npy_ulong reduced = ulong_sum_of_arr((npy_ulong *)dataptr[1], count);
+
+    *out += scale * reduced;
+}
+
+/*
+ * Operand 0 is an npy_ulong array and operand 1 supplies one value; the
+ * single output at dataptr[2] is incremented by value1 *
+ * ulong_sum_of_arr(data0, count).  ulong_sum_of_arr is defined elsewhere in
+ * this file; presumably it returns the sum of the `count` elements --
+ * confirm there.
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *out = (npy_ulong *)dataptr[2];
+    const npy_ulong scale = *(npy_ulong *)dataptr[1];
+    /* Reduce the array first; the output is only read afterwards */
+    const npy_ulong reduced = ulong_sum_of_arr((npy_ulong *)dataptr[0], count);
+
+    *out += scale * reduced;
+}
+
+#elif 3 == 3 && !0
+
+/*
+ * Three-operand multiply-accumulate over contiguous npy_ulong data:
+ *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
+ * with in0..in2 = dataptr[0..2] and out = dataptr[3].  `nop` and the
+ * strides are not consulted.
+ */
+static void
+ulong_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *in0 = (npy_ulong *)dataptr[0];
+    npy_ulong *in1 = (npy_ulong *)dataptr[1];
+    npy_ulong *in2 = (npy_ulong *)dataptr[2];
+    npy_ulong *out = (npy_ulong *)dataptr[3];
+    int k;
+
+    /* Whole groups of eight elements, updated in index order */
+    for (; count >= 8; count -= 8, in0 += 8, in1 += 8, in2 += 8, out += 8) {
+        for (k = 0; k < 8; ++k) {
+            out[k] += in0[k] * in1[k] * in2[k];
+        }
+    }
+
+    /*
+     * Remainder: the counter is tested and decremented before each
+     * element, and the function returns as soon as it reads zero.
+     */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] += in0[k] * in1[k] * in2[k];
+    }
+}
+
+#else /* 3 > 3 || complex */
+
+/*
+ * Loop-over-operands form of the contiguous multiply-accumulate.  For each
+ * of `count` steps the current npy_ulong of operands 0..nop-1 are
+ * multiplied together and added to the output operand dataptr[nop]; then
+ * every entry of dataptr[0..nop] is advanced by sizeof(npy_ulong).
+ *
+ * Note that the dataptr array itself is modified in place.
+ */
+static void
+ulong_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    int op;
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_three (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+        npy_ulong product = *(npy_ulong *)dataptr[0];
+
+        for (op = 1; op < nop; ++op) {
+            product *= *(npy_ulong *)dataptr[op];
+        }
+        /* op is left at the loop's exit value (nop whenever nop >= 1) */
+        *(npy_ulong *)dataptr[nop] = product + *(npy_ulong *)dataptr[op];
+
+        for (op = 0; op <= nop; ++op) {
+            dataptr[op] += sizeof(npy_ulong);
+        }
+    }
+}
+
+#endif /* functions for various 3 */
+
+#if 3 == 1
+
+/*
+ * Single-operand reduction: the value at dataptr[1] is incremented by
+ * ulong_sum_of_arr(dataptr[0], count).  ulong_sum_of_arr is defined
+ * elsewhere in this file; presumably it returns the sum of the `count`
+ * npy_ulong elements -- confirm there.  `nop` and `strides` are not used.
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+    npy_ulong *out = (npy_ulong *)dataptr[1];
+    /* Reduce the input first; the output is only read afterwards */
+    const npy_ulong reduced = ulong_sum_of_arr((npy_ulong *)dataptr[0], count);
+
+    *out += reduced;
+}
+
+#endif /* 3 == 1 */
+
+/*
+ * Strided three-operand reduction into a single output value:
+ *     *out += sum over `count` steps of (*p0) * (*p1) * (*p2)
+ * where p0..p2 start at dataptr[0..2] and advance by strides[0..2] bytes
+ * per step, and out = dataptr[3].  `nop` is not consulted, and dataptr[]
+ * is left unmodified (local byte pointers are advanced instead).
+ */
+static void
+ulong_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    char *ptr0 = dataptr[0];
+    char *ptr1 = dataptr[1];
+    char *ptr2 = dataptr[2];
+    const npy_intp step0 = strides[0];
+    const npy_intp step1 = strides[1];
+    const npy_intp step2 = strides[2];
+    npy_ulong total = 0;
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+
+    for (; count != 0; --count) {
+        total += (*(npy_ulong *)ptr0) *
+                 (*(npy_ulong *)ptr1) *
+                 (*(npy_ulong *)ptr2);
+        ptr0 += step0;
+        ptr1 += step1;
+        ptr2 += step2;
+    }
+
+    /* Fold the accumulated sum into the existing output value */
+    *((npy_ulong *)dataptr[3]) += total;
+}
+
+
+#line 131
+/*
+ * Multiply-accumulate for an arbitrary number of operands.  For each of
+ * `count` steps the current npy_ulong of operands 0..nop-1 are multiplied
+ * together and added to the output operand dataptr[nop]; then every entry
+ * of dataptr[0..nop] is advanced by the matching strides[] entry (bytes).
+ *
+ * Note that the dataptr array itself is modified in place.
+ */
+static void
+ulong_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    int op;
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_any (%d)\n", (int)count);
+
+    while (count--) {
+        npy_ulong product = *(npy_ulong *)dataptr[0];
+
+        for (op = 1; op < nop; ++op) {
+            product *= *(npy_ulong *)dataptr[op];
+        }
+        /* op is left at the loop's exit value (nop whenever nop >= 1) */
+        *(npy_ulong *)dataptr[nop] = product + *(npy_ulong *)dataptr[op];
+
+        for (op = 0; op <= nop; ++op) {
+            dataptr[op] += strides[op];
+        }
+    }
+}
+
+#if 1000 == 1
+
+static void
+ulong_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *data0 = (npy_ulong *)dataptr[0];
+    npy_ulong *data_out = (npy_ulong *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_ulong *)data_out + 2*6)[0] =
+                                    ((npy_ulong *)data0 + 2*6)[0] +
+                                    ((npy_ulong *)data_out + 2*6)[0];
+            ((npy_ulong *)data_out + 2*6)[1] =
+                                    ((npy_ulong *)data0 + 2*6)[1] +
+                                    ((npy_ulong *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_ulong *)data_out + 2*5)[0] =
+                                    ((npy_ulong *)data0 + 2*5)[0] +
+                                    ((npy_ulong *)data_out + 2*5)[0];
+            ((npy_ulong *)data_out + 2*5)[1] =
+                                    ((npy_ulong *)data0 + 2*5)[1] +
+                                    ((npy_ulong *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_ulong *)data_out + 2*4)[0] =
+                                    ((npy_ulong *)data0 + 2*4)[0] +
+                                    ((npy_ulong *)data_out + 2*4)[0];
+            ((npy_ulong *)data_out + 2*4)[1] =
+                                    ((npy_ulong *)data0 + 2*4)[1] +
+                                    ((npy_ulong *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_ulong *)data_out + 2*3)[0] =
+                                    ((npy_ulong *)data0 + 2*3)[0] +
+                                    ((npy_ulong *)data_out + 2*3)[0];
+            ((npy_ulong *)data_out + 2*3)[1] =
+                                    ((npy_ulong *)data0 + 2*3)[1] +
+                                    ((npy_ulong *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_ulong *)data_out + 2*2)[0] =
+                                    ((npy_ulong *)data0 + 2*2)[0] +
+                                    ((npy_ulong *)data_out + 2*2)[0];
+            ((npy_ulong *)data_out + 2*2)[1] =
+                                    ((npy_ulong *)data0 + 2*2)[1] +
+                                    ((npy_ulong *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_ulong *)data_out + 2*1)[0] =
+                                    ((npy_ulong *)data0 + 2*1)[0] +
+                                    ((npy_ulong *)data_out + 2*1)[0];
+            ((npy_ulong *)data_out + 2*1)[1] =
+                                    ((npy_ulong *)data0 + 2*1)[1] +
+                                    ((npy_ulong *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_ulong *)data_out + 2*0)[0] =
+                                    ((npy_ulong *)data0 + 2*0)[0] +
+                                    ((npy_ulong *)data_out + 2*0)[0];
+            ((npy_ulong *)data_out + 2*0)[1] =
+                                    ((npy_ulong *)data0 + 2*0)[1] +
+                                    ((npy_ulong *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_ulong *)data_out + 2*0)[0] =
+                                ((npy_ulong *)data0 + 2*0)[0] +
+                                ((npy_ulong *)data_out + 2*0)[0];
+        ((npy_ulong *)data_out + 2*0)[1] =
+                                ((npy_ulong *)data0 + 2*0)[1] +
+                                ((npy_ulong *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_ulong *)data_out + 2*1)[0] =
+                                ((npy_ulong *)data0 + 2*1)[0] +
+                                ((npy_ulong *)data_out + 2*1)[0];
+        ((npy_ulong *)data_out + 2*1)[1] =
+                                ((npy_ulong *)data0 + 2*1)[1] +
+                                ((npy_ulong *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_ulong *)data_out + 2*2)[0] =
+                                ((npy_ulong *)data0 + 2*2)[0] +
+                                ((npy_ulong *)data_out + 2*2)[0];
+        ((npy_ulong *)data_out + 2*2)[1] =
+                                ((npy_ulong *)data0 + 2*2)[1] +
+                                ((npy_ulong *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_ulong *)data_out + 2*3)[0] =
+                                ((npy_ulong *)data0 + 2*3)[0] +
+                                ((npy_ulong *)data_out + 2*3)[0];
+        ((npy_ulong *)data_out + 2*3)[1] =
+                                ((npy_ulong *)data0 + 2*3)[1] +
+                                ((npy_ulong *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_ulong *)data_out + 2*4)[0] =
+                                ((npy_ulong *)data0 + 2*4)[0] +
+                                ((npy_ulong *)data_out + 2*4)[0];
+        ((npy_ulong *)data_out + 2*4)[1] =
+                                ((npy_ulong *)data0 + 2*4)[1] +
+                                ((npy_ulong *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_ulong *)data_out + 2*5)[0] =
+                                ((npy_ulong *)data0 + 2*5)[0] +
+                                ((npy_ulong *)data_out + 2*5)[0];
+        ((npy_ulong *)data_out + 2*5)[1] =
+                                ((npy_ulong *)data0 + 2*5)[1] +
+                                ((npy_ulong *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_ulong *)data_out + 2*6)[0] =
+                                ((npy_ulong *)data0 + 2*6)[0] +
+                                ((npy_ulong *)data_out + 2*6)[0];
+        ((npy_ulong *)data_out + 2*6)[1] =
+                                ((npy_ulong *)data0 + 2*6)[1] +
+                                ((npy_ulong *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_ulong *)data_out + 2*7)[0] =
+                                ((npy_ulong *)data0 + 2*7)[0] +
+                                ((npy_ulong *)data_out + 2*7)[0];
+        ((npy_ulong *)data_out + 2*7)[1] =
+                                ((npy_ulong *)data0 + 2*7)[1] +
+                                ((npy_ulong *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1000 == 2 && !0
+
+/*
+ * Multiply-accumulate over `count` contiguous elements:
+ *     data_out[i] = scalar * data[i] + data_out[i]
+ * The `#if 0` branch holds the npyv_* variant, which is compiled out for
+ * npy_ulong; the `#else` branch is the plain C code that is built.
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_muladd(npy_ulong *data, npy_ulong *data_out, npy_ulong scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_ulong
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_ulong;
+    const npyv_ulong v_scalar = npyv_setall_ulong(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;  /* elements consumed per loop iteration */
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_ulong b0 = npyv_loada_ulong(data + vstep * 0);
+            npyv_ulong c0 = npyv_loada_ulong(data_out + vstep * 0);
+            
+#line 312
+            npyv_ulong b1 = npyv_loada_ulong(data + vstep * 1);
+            npyv_ulong c1 = npyv_loada_ulong(data_out + vstep * 1);
+            
+#line 312
+            npyv_ulong b2 = npyv_loada_ulong(data + vstep * 2);
+            npyv_ulong c2 = npyv_loada_ulong(data_out + vstep * 2);
+            
+#line 312
+            npyv_ulong b3 = npyv_loada_ulong(data + vstep * 3);
+            npyv_ulong c3 = npyv_loada_ulong(data_out + vstep * 3);
+            
+            #line 318
+            npyv_ulong abc0 = npyv_muladd_ulong(v_scalar, b0, c0);
+            
+#line 318
+            npyv_ulong abc1 = npyv_muladd_ulong(v_scalar, b1, c1);
+            
+#line 318
+            npyv_ulong abc2 = npyv_muladd_ulong(v_scalar, b2, c2);
+            
+#line 318
+            npyv_ulong abc3 = npyv_muladd_ulong(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_ulong(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_ulong(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_ulong(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_ulong(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;  /* elements consumed per loop iteration */
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_ulong b0 = npyv_load_ulong(data + vstep * 0);
+            npyv_ulong c0 = npyv_load_ulong(data_out + vstep * 0);
+            
+#line 312
+            npyv_ulong b1 = npyv_load_ulong(data + vstep * 1);
+            npyv_ulong c1 = npyv_load_ulong(data_out + vstep * 1);
+            
+#line 312
+            npyv_ulong b2 = npyv_load_ulong(data + vstep * 2);
+            npyv_ulong c2 = npyv_load_ulong(data_out + vstep * 2);
+            
+#line 312
+            npyv_ulong b3 = npyv_load_ulong(data + vstep * 3);
+            npyv_ulong c3 = npyv_load_ulong(data_out + vstep * 3);
+            
+            #line 318
+            npyv_ulong abc0 = npyv_muladd_ulong(v_scalar, b0, c0);
+            
+#line 318
+            npyv_ulong abc1 = npyv_muladd_ulong(v_scalar, b1, c1);
+            
+#line 318
+            npyv_ulong abc2 = npyv_muladd_ulong(v_scalar, b2, c2);
+            
+#line 318
+            npyv_ulong abc3 = npyv_muladd_ulong(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_ulong(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_ulong(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_ulong(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_ulong(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Tail: the *_tillz / *_till calls are given the remaining count */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_ulong a = npyv_load_tillz_ulong(data, count);
+        npyv_ulong b = npyv_load_tillz_ulong(data_out, count);
+        npyv_store_till_ulong(data_out, count, npyv_muladd_ulong(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {  /* four elements per iteration */
+        #line 340
+        const npy_ulong b0 = (data[0]);
+        const npy_ulong c0 = (data_out[0]);
+        
+#line 340
+        const npy_ulong b1 = (data[1]);
+        const npy_ulong c1 = (data_out[1]);
+        
+#line 340
+        const npy_ulong b2 = (data[2]);
+        const npy_ulong c2 = (data_out[2]);
+        
+#line 340
+        const npy_ulong b3 = (data[3]);
+        const npy_ulong c3 = (data_out[3]);
+        
+        #line 346
+        const npy_ulong abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_ulong abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_ulong abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_ulong abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) {  /* whatever is left, one element at a time */
+        const npy_ulong b = (*data);
+        const npy_ulong c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_ulong
+}
+/*
+ * Two-operand sum-of-products kernel for contiguous data:
+ *     out[i] = in0[i] * in1[i] + out[i]   for i in [0, count)
+ * where in0, in1 and out are dataptr[0], dataptr[1] and dataptr[2] viewed
+ * as npy_ulong arrays. `nop` and `strides` are not read.
+ * The `#if 0` branch is the npyv_* variant (compiled out here); the
+ * `#else` branch is the plain C code that is built.
+ */
+static void
+ulong_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *data0 = (npy_ulong *)dataptr[0];
+    npy_ulong *data1 = (npy_ulong *)dataptr[1];
+    npy_ulong *data_out = (npy_ulong *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_ulong
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_ulong;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;  /* elements consumed per loop iteration */
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_ulong a0 = npyv_loada_ulong(data0 + vstep * 0);
+            npyv_ulong b0 = npyv_loada_ulong(data1 + vstep * 0);
+            npyv_ulong c0 = npyv_loada_ulong(data_out + vstep * 0);
+            
+#line 390
+            npyv_ulong a1 = npyv_loada_ulong(data0 + vstep * 1);
+            npyv_ulong b1 = npyv_loada_ulong(data1 + vstep * 1);
+            npyv_ulong c1 = npyv_loada_ulong(data_out + vstep * 1);
+            
+#line 390
+            npyv_ulong a2 = npyv_loada_ulong(data0 + vstep * 2);
+            npyv_ulong b2 = npyv_loada_ulong(data1 + vstep * 2);
+            npyv_ulong c2 = npyv_loada_ulong(data_out + vstep * 2);
+            
+#line 390
+            npyv_ulong a3 = npyv_loada_ulong(data0 + vstep * 3);
+            npyv_ulong b3 = npyv_loada_ulong(data1 + vstep * 3);
+            npyv_ulong c3 = npyv_loada_ulong(data_out + vstep * 3);
+            
+            #line 397
+            npyv_ulong abc0 = npyv_muladd_ulong(a0, b0, c0);
+            
+#line 397
+            npyv_ulong abc1 = npyv_muladd_ulong(a1, b1, c1);
+            
+#line 397
+            npyv_ulong abc2 = npyv_muladd_ulong(a2, b2, c2);
+            
+#line 397
+            npyv_ulong abc3 = npyv_muladd_ulong(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_ulong(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_ulong(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_ulong(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_ulong(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;  /* elements consumed per loop iteration */
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_ulong a0 = npyv_load_ulong(data0 + vstep * 0);
+            npyv_ulong b0 = npyv_load_ulong(data1 + vstep * 0);
+            npyv_ulong c0 = npyv_load_ulong(data_out + vstep * 0);
+            
+#line 390
+            npyv_ulong a1 = npyv_load_ulong(data0 + vstep * 1);
+            npyv_ulong b1 = npyv_load_ulong(data1 + vstep * 1);
+            npyv_ulong c1 = npyv_load_ulong(data_out + vstep * 1);
+            
+#line 390
+            npyv_ulong a2 = npyv_load_ulong(data0 + vstep * 2);
+            npyv_ulong b2 = npyv_load_ulong(data1 + vstep * 2);
+            npyv_ulong c2 = npyv_load_ulong(data_out + vstep * 2);
+            
+#line 390
+            npyv_ulong a3 = npyv_load_ulong(data0 + vstep * 3);
+            npyv_ulong b3 = npyv_load_ulong(data1 + vstep * 3);
+            npyv_ulong c3 = npyv_load_ulong(data_out + vstep * 3);
+            
+            #line 397
+            npyv_ulong abc0 = npyv_muladd_ulong(a0, b0, c0);
+            
+#line 397
+            npyv_ulong abc1 = npyv_muladd_ulong(a1, b1, c1);
+            
+#line 397
+            npyv_ulong abc2 = npyv_muladd_ulong(a2, b2, c2);
+            
+#line 397
+            npyv_ulong abc3 = npyv_muladd_ulong(a3, b3, c3);
+            
+            #line 402
+            npyv_store_ulong(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_ulong(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_ulong(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_ulong(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Tail: the *_tillz / *_till calls are given the remaining count */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_ulong a = npyv_load_tillz_ulong(data0, count);
+        npyv_ulong b = npyv_load_tillz_ulong(data1, count);
+        npyv_ulong c = npyv_load_tillz_ulong(data_out, count);
+        npyv_store_till_ulong(data_out, count, npyv_muladd_ulong(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {  /* four elements per iteration */
+        #line 420
+        const npy_ulong a0 = (data0[0]);
+        const npy_ulong b0 = (data1[0]);
+        const npy_ulong c0 = (data_out[0]);
+        
+#line 420
+        const npy_ulong a1 = (data0[1]);
+        const npy_ulong b1 = (data1[1]);
+        const npy_ulong c1 = (data_out[1]);
+        
+#line 420
+        const npy_ulong a2 = (data0[2]);
+        const npy_ulong b2 = (data1[2]);
+        const npy_ulong c2 = (data_out[2]);
+        
+#line 420
+        const npy_ulong a3 = (data0[3]);
+        const npy_ulong b3 = (data1[3]);
+        const npy_ulong c3 = (data_out[3]);
+        
+        #line 427
+        const npy_ulong abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_ulong abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_ulong abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_ulong abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {  /* whatever is left, one element at a time */
+        const npy_ulong a = (*data0);
+        const npy_ulong b = (*data1);
+        const npy_ulong c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_ulong
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand kernel where operand 0 is a single value (read once from
+ * dataptr[0]) and operand 1 and the output are contiguous:
+ *     out[i] = value * in1[i] + out[i]
+ * The element loop is delegated to ulong_sum_of_products_muladd().
+ */
+static void
+ulong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *dst = (npy_ulong *)dataptr[2];
+    npy_ulong *src = (npy_ulong *)dataptr[1];
+    const npy_ulong factor = *(npy_ulong *)dataptr[0];
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    ulong_sum_of_products_muladd(src, dst, factor, count);
+}
+
+/*
+ * Two-operand kernel where operand 1 is a single value (read once from
+ * dataptr[1]) and operand 0 and the output are contiguous:
+ *     out[i] = value * in0[i] + out[i]
+ * The element loop is delegated to ulong_sum_of_products_muladd().
+ */
+static void
+ulong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *dst = (npy_ulong *)dataptr[2];
+    npy_ulong *src = (npy_ulong *)dataptr[0];
+    const npy_ulong factor = *(npy_ulong *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    ulong_sum_of_products_muladd(src, dst, factor, count);
+}
+/*
+ * Two-operand reduction: both inputs are contiguous and the output is a
+ * single element. Accumulates sum(in0[i] * in1[i]) over `count` elements
+ * in a local and then adds that total to *(npy_ulong *)dataptr[2].
+ * The `#if 0` branch is the npyv_* variant (compiled out here); the
+ * `#else` branch is the plain C code that is built.
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *data0 = (npy_ulong *)dataptr[0];
+    npy_ulong *data1 = (npy_ulong *)dataptr[1];
+    npy_ulong accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_ulong
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_ulong;
+    npyv_ulong v_accum = npyv_zero_ulong();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_ulong a0 = npyv_loada_ulong(data0 + vstep * 0);
+            npyv_ulong b0 = npyv_loada_ulong(data1 + vstep * 0);
+            
+#line 501
+            npyv_ulong a1 = npyv_loada_ulong(data0 + vstep * 1);
+            npyv_ulong b1 = npyv_loada_ulong(data1 + vstep * 1);
+            
+#line 501
+            npyv_ulong a2 = npyv_loada_ulong(data0 + vstep * 2);
+            npyv_ulong b2 = npyv_loada_ulong(data1 + vstep * 2);
+            
+#line 501
+            npyv_ulong a3 = npyv_loada_ulong(data0 + vstep * 3);
+            npyv_ulong b3 = npyv_loada_ulong(data1 + vstep * 3);
+            /* Chain the four products into v_accum, starting from pair 3 */
+            npyv_ulong ab3 = npyv_muladd_ulong(a3, b3, v_accum);
+            npyv_ulong ab2 = npyv_muladd_ulong(a2, b2, ab3);
+            npyv_ulong ab1 = npyv_muladd_ulong(a1, b1, ab2);
+                   v_accum = npyv_muladd_ulong(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_ulong a0 = npyv_load_ulong(data0 + vstep * 0);
+            npyv_ulong b0 = npyv_load_ulong(data1 + vstep * 0);
+            
+#line 501
+            npyv_ulong a1 = npyv_load_ulong(data0 + vstep * 1);
+            npyv_ulong b1 = npyv_load_ulong(data1 + vstep * 1);
+            
+#line 501
+            npyv_ulong a2 = npyv_load_ulong(data0 + vstep * 2);
+            npyv_ulong b2 = npyv_load_ulong(data1 + vstep * 2);
+            
+#line 501
+            npyv_ulong a3 = npyv_load_ulong(data0 + vstep * 3);
+            npyv_ulong b3 = npyv_load_ulong(data1 + vstep * 3);
+            /* Chain the four products into v_accum, starting from pair 3 */
+            npyv_ulong ab3 = npyv_muladd_ulong(a3, b3, v_accum);
+            npyv_ulong ab2 = npyv_muladd_ulong(a2, b2, ab3);
+            npyv_ulong ab1 = npyv_muladd_ulong(a1, b1, ab2);
+                   v_accum = npyv_muladd_ulong(a0, b0, ab1);
+        }
+    }
+    /* Tail: the *_tillz loads are given the remaining count */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_ulong a = npyv_load_tillz_ulong(data0, count);
+        npyv_ulong b = npyv_load_tillz_ulong(data1, count);
+        v_accum = npyv_muladd_ulong(a, b, v_accum);
+    }
+    accum = npyv_sum_ulong(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {  /* four products per iteration */
+        #line 524
+        const npy_ulong ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_ulong ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_ulong ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_ulong ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) {  /* whatever is left, one element at a time */
+        const npy_ulong a = (*data0);
+        const npy_ulong b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_ulong
+    *(npy_ulong *)dataptr[2] = ((*(npy_ulong *)dataptr[2]) + accum);  /* add onto the existing output value */
+}
+
+/*
+ * Two-operand reduction where operand 0 is a single value and operand 1 is
+ * contiguous: the output element at dataptr[2] is increased by
+ * value0 * sum(in1[0..count-1]). The sum comes from ulong_sum_of_arr().
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ulong factor = *(npy_ulong *)dataptr[0];
+    npy_ulong *out = (npy_ulong *)dataptr[2];
+    const npy_ulong total = ulong_sum_of_arr((npy_ulong *)dataptr[1], count);
+
+    *out += factor * total;
+}
+
+/*
+ * Two-operand reduction where operand 1 is a single value and operand 0 is
+ * contiguous: the output element at dataptr[2] is increased by
+ * value1 * sum(in0[0..count-1]). The sum comes from ulong_sum_of_arr().
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ulong factor = *(npy_ulong *)dataptr[1];
+    npy_ulong *out = (npy_ulong *)dataptr[2];
+    const npy_ulong total = ulong_sum_of_arr((npy_ulong *)dataptr[0], count);
+
+    *out += factor * total;
+}
+
+#elif 1000 == 3 && !0
+
+/*
+ * Three-operand sum-of-products kernel for contiguous data:
+ *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
+ * Elements are handled in chunks of eight, followed by the leftover
+ * elements. `nop` and `strides` are not read.
+ */
+static void
+ulong_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulong *in0 = (npy_ulong *)dataptr[0];
+    npy_ulong *in1 = (npy_ulong *)dataptr[1];
+    npy_ulong *in2 = (npy_ulong *)dataptr[2];
+    npy_ulong *out = (npy_ulong *)dataptr[3];
+    int k;
+
+    /* Full chunks of eight elements, processed in index order */
+    for (; count >= 8; count -= 8, in0 += 8, in1 += 8, in2 += 8, out += 8) {
+        for (k = 0; k < 8; ++k) {
+            out[k] = in0[k] * in1[k] * in2[k] + out[k];
+        }
+    }
+
+    /* Leftover elements: check the count before each one, up to eight times */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] = in0[k] * in1[k] * in2[k] + out[k];
+    }
+}
+
+#else /* 1000 > 3 || @complex */
+/*
+ * Sum-of-products kernel for any number of contiguous operands.
+ * For each of the `count` elements: multiply the current values of
+ * dataptr[0..nop-1] together, add the product to the value at dataptr[nop],
+ * then advance every dataptr[] entry (inputs and output) by one element.
+ * Note that the dataptr[] entries themselves are modified.
+ * The `#else` branch is the complex (re/im pair) form and is compiled out
+ * here because the guard is `#if !0`.
+ */
+static void
+ulong_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+        npy_ulong temp = (*(npy_ulong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulong *)dataptr[i]);
+        }
+        *(npy_ulong *)dataptr[nop] = (temp +
+                                           (*(npy_ulong *)dataptr[i]));  /* i == nop here when nop >= 1 */
+        for (i = 0; i <= nop; ++i) {  /* inputs and output */
+            dataptr[i] += sizeof(npy_ulong);
+        }
+#else /* complex */
+#  if 1000 <= 3
+#    define _SUMPROD_NOP 1000
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_ulong re, im, tmp;
+        int i;
+        re = ((npy_ulong *)dataptr[0])[0];
+        im = ((npy_ulong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulong *)dataptr[i])[0] -
+                  im * ((npy_ulong *)dataptr[i])[1];
+            im = re * ((npy_ulong *)dataptr[i])[1] +
+                 im * ((npy_ulong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ulong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ulong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ulong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ulong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_ulong);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+/*
+ * One-operand reduction: sums `count` contiguous elements starting at
+ * dataptr[0] (via ulong_sum_of_arr) and adds the total to the single
+ * output element at dataptr[1]. `nop` and `strides` are not read.
+ * The `#else` branch accumulates separate re/im totals from interleaved
+ * pairs; it is compiled out here because the guard is `#if !0`.
+ */
+static NPY_GCC_OPT_3 void
+ulong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_ulong *data = (npy_ulong *)dataptr[0];
+    npy_ulong accum = ulong_sum_of_arr(data, count);
+    *((npy_ulong *)dataptr[1]) = (accum + (*((npy_ulong *)dataptr[1])));
+#else
+    npy_ulong accum_re = 0, accum_im = 0;
+    npy_ulong *data0 = (npy_ulong *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {  /* four re/im pairs per iteration */
+        const npy_ulong re01 = data0[0] + data0[2];
+        const npy_ulong re23 = data0[4] + data0[6];
+        const npy_ulong im13 = data0[1] + data0[3];
+        const npy_ulong im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_ulong *)dataptr[1])[0] += accum_re;
+    ((npy_ulong *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1000 == 1 */
+/*
+ * Sum-of-products reduction into a single output element, for any number
+ * of operands with arbitrary strides. Each iteration multiplies the current
+ * elements of dataptr[0..nop-1], adds the product to a local accumulator
+ * and steps every input pointer by its stride; after the loop the
+ * accumulated total is added to the output element.
+ * The preprocessor conditions compare the literal 1000 against 1, 2 and 3,
+ * so in this expansion the generic `nop`-driven branches are the ones that
+ * get compiled, and the dataptr[] entries themselves are advanced.
+ */
+static void
+ulong_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_ulong accum_re = 0, accum_im = 0;
+#else
+    npy_ulong accum = 0;
+#endif
+
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ulong_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        accum += (*(npy_ulong *)data0);
+        data0 += stride0;
+#  elif 1000 == 2
+        accum += (*(npy_ulong *)data0) *
+                 (*(npy_ulong *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1000 == 3
+        accum += (*(npy_ulong *)data0) *
+                 (*(npy_ulong *)data1) *
+                 (*(npy_ulong *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_ulong temp = (*(npy_ulong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulong *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {  /* inputs only; dataptr[nop] is not advanced */
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        accum_re += ((npy_ulong *)data0)[0];
+        accum_im += ((npy_ulong *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ulong re, im, tmp;
+        int i;
+        re = ((npy_ulong *)dataptr[0])[0];
+        im = ((npy_ulong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulong *)dataptr[i])[0] -
+                  im * ((npy_ulong *)dataptr[i])[1];
+            im = re * ((npy_ulong *)dataptr[i])[1] +
+                 im * ((npy_ulong *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0
+#  if 1000 <= 3
+    ((npy_ulong *)dataptr[1000])[0] += accum_re;
+    ((npy_ulong *)dataptr[1000])[1] += accum_im;
+#  else
+    ((npy_ulong *)dataptr[nop])[0] += accum_re;
+    ((npy_ulong *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1000 <= 3
+    *((npy_ulong *)dataptr[1000]) = (accum +
+                                    (*((npy_ulong *)dataptr[1000])));
+#  else
+    *((npy_ulong *)dataptr[nop]) = (accum +
+                                    (*((npy_ulong *)dataptr[nop])));  /* add onto the existing output value */
+#  endif
+#endif
+
+}
+
+
+
+
+#line 74
+/*
+ * Returns the sum of `count` contiguous npy_ulonglong values starting at
+ * `data`. The npyv_* branch is disabled (`#if 0`); the compiled path adds
+ * four elements per iteration while more than four remain (unless
+ * NPY_DISABLE_OPTIMIZATION is defined), then finishes one element at a
+ * time.
+ */
+#if !0
+static NPY_GCC_OPT_3 npy_ulonglong ulonglong_sum_of_arr(npy_ulonglong *data, npy_intp count)
+{
+    npy_ulonglong accum = 0;
+#if 0 // NPYV check for npy_ulonglong
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data);
+    const int vstep = npyv_nlanes_u64;
+    npyv_u64 v_accum = npyv_zero_u64();
+    const npy_intp vstepx4 = vstep * 4;
+
+    #line 91
+    if(is_aligned) {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_u64 a0 = npyv_loada_u64(data + vstep * 0);
+            
+#line 96
+            npyv_u64 a1 = npyv_loada_u64(data + vstep * 1);
+            
+#line 96
+            npyv_u64 a2 = npyv_loada_u64(data + vstep * 2);
+            
+#line 96
+            npyv_u64 a3 = npyv_loada_u64(data + vstep * 3);
+            /* Add the four loads pairwise, then fold into v_accum */
+            npyv_u64 a01   = npyv_add_u64(a0, a1);
+            npyv_u64 a23   = npyv_add_u64(a2, a3);
+            npyv_u64 a0123 = npyv_add_u64(a01, a23);
+                     v_accum = npyv_add_u64(a0123, v_accum);
+        }
+    }
+    
+#line 91
+    else {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_u64 a0 = npyv_load_u64(data + vstep * 0);
+            
+#line 96
+            npyv_u64 a1 = npyv_load_u64(data + vstep * 1);
+            
+#line 96
+            npyv_u64 a2 = npyv_load_u64(data + vstep * 2);
+            
+#line 96
+            npyv_u64 a3 = npyv_load_u64(data + vstep * 3);
+            /* Add the four loads pairwise, then fold into v_accum */
+            npyv_u64 a01   = npyv_add_u64(a0, a1);
+            npyv_u64 a23   = npyv_add_u64(a2, a3);
+            npyv_u64 a0123 = npyv_add_u64(a01, a23);
+                     v_accum = npyv_add_u64(a0123, v_accum);
+        }
+    }
+    /* Tail: the *_tillz load is given the remaining count */
+    for (; count > 0; count -= vstep, data += vstep) {
+        npyv_u64 a = npyv_load_tillz_u64(data, count);
+        v_accum = npyv_add_u64(a, v_accum);
+    }
+    accum = npyv_sum_u64(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data += 4) {  /* strict '>': a final group of exactly four goes to the loop below */
+        const npy_ulonglong a01 = (*data) + (data[1]);
+        const npy_ulonglong a23 = (data[2]) + (data[3]);
+        accum +=  a01 + a23;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data++) {
+        accum += (*data);
+    }
+#endif // NPYV check for npy_ulonglong
+    return accum;
+}
+#endif
+/* Strided one-operand kernel: for each of `count` steps do *out += *in0, then advance both pointers by their byte strides. */
+#line 131
+static void
+ulonglong_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1 == 1) || (1 <= 3 && !0) /* constant-true in this expansion */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0 /* constant-false: data1/stride1 are not declared */
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0 /* constant-false: data2/stride2 are not declared */
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data_out = dataptr[1]; /* the output slot comes right after the single input */
+    npy_intp stride_out = strides[1];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_one (%d)\n", (int)count);
+
+    while (count--) {
+#if !0 /* always taken; the "complex" #else arm further down is compiled out */
+#  if 1 == 1 /* the only live arm of this chain */
+        *(npy_ulonglong *)data_out = ((*(npy_ulonglong *)data0) +
+                                         (*(npy_ulonglong *)data_out));
+        data0 += stride0; /* char* arithmetic: strides are in bytes */
+        data_out += stride_out;
+#  elif 1 == 2
+        *(npy_ulonglong *)data_out = ((*(npy_ulonglong *)data0) *
+                                         (*(npy_ulonglong *)data1) +
+                                         (*(npy_ulonglong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1 == 3
+        *(npy_ulonglong *)data_out = ((*(npy_ulonglong *)data0) *
+                                         (*(npy_ulonglong *)data1) *
+                                         (*(npy_ulonglong *)data2) +
+                                         (*(npy_ulonglong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else /* generic-nop arm: the only place `nop` is used on the non-complex side */
+        npy_ulonglong temp = (*(npy_ulonglong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulonglong *)dataptr[i]);
+        }
+        *(npy_ulonglong *)dataptr[nop] = (temp +
+                                           (*(npy_ulonglong *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        ((npy_ulonglong *)data_out)[0] = ((npy_ulonglong *)data0)[0] +
+                                         ((npy_ulonglong *)data_out)[0];
+        ((npy_ulonglong *)data_out)[1] = ((npy_ulonglong *)data0)[1] +
+                                         ((npy_ulonglong *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ulonglong re, im, tmp;
+        int i;
+        re = ((npy_ulonglong *)dataptr[0])[0];
+        im = ((npy_ulonglong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulonglong *)dataptr[i])[0] -
+                  im * ((npy_ulonglong *)dataptr[i])[1];
+            im = re * ((npy_ulonglong *)dataptr[i])[1] +
+                 im * ((npy_ulonglong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1 == 1
+/* Contiguous one-operand kernel: data_out[k] += data0[k] for k in [0, count), eight elements per unrolled pass. */
+static void
+ulonglong_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *data_out = (npy_ulonglong *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) { /* counts 0..7: enter at case `count` and fall through down to element 0, then return */
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else /* complex-layout variant; never compiled because the condition above is constant-true */
+            ((npy_ulonglong *)data_out + 2*6)[0] =
+                                    ((npy_ulonglong *)data0 + 2*6)[0] +
+                                    ((npy_ulonglong *)data_out + 2*6)[0];
+            ((npy_ulonglong *)data_out + 2*6)[1] =
+                                    ((npy_ulonglong *)data0 + 2*6)[1] +
+                                    ((npy_ulonglong *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_ulonglong *)data_out + 2*5)[0] =
+                                    ((npy_ulonglong *)data0 + 2*5)[0] +
+                                    ((npy_ulonglong *)data_out + 2*5)[0];
+            ((npy_ulonglong *)data_out + 2*5)[1] =
+                                    ((npy_ulonglong *)data0 + 2*5)[1] +
+                                    ((npy_ulonglong *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_ulonglong *)data_out + 2*4)[0] =
+                                    ((npy_ulonglong *)data0 + 2*4)[0] +
+                                    ((npy_ulonglong *)data_out + 2*4)[0];
+            ((npy_ulonglong *)data_out + 2*4)[1] =
+                                    ((npy_ulonglong *)data0 + 2*4)[1] +
+                                    ((npy_ulonglong *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_ulonglong *)data_out + 2*3)[0] =
+                                    ((npy_ulonglong *)data0 + 2*3)[0] +
+                                    ((npy_ulonglong *)data_out + 2*3)[0];
+            ((npy_ulonglong *)data_out + 2*3)[1] =
+                                    ((npy_ulonglong *)data0 + 2*3)[1] +
+                                    ((npy_ulonglong *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_ulonglong *)data_out + 2*2)[0] =
+                                    ((npy_ulonglong *)data0 + 2*2)[0] +
+                                    ((npy_ulonglong *)data_out + 2*2)[0];
+            ((npy_ulonglong *)data_out + 2*2)[1] =
+                                    ((npy_ulonglong *)data0 + 2*2)[1] +
+                                    ((npy_ulonglong *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_ulonglong *)data_out + 2*1)[0] =
+                                    ((npy_ulonglong *)data0 + 2*1)[0] +
+                                    ((npy_ulonglong *)data_out + 2*1)[0];
+            ((npy_ulonglong *)data_out + 2*1)[1] =
+                                    ((npy_ulonglong *)data0 + 2*1)[1] +
+                                    ((npy_ulonglong *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_ulonglong *)data_out + 2*0)[0] =
+                                    ((npy_ulonglong *)data0 + 2*0)[0] +
+                                    ((npy_ulonglong *)data_out + 2*0)[0];
+            ((npy_ulonglong *)data_out + 2*0)[1] =
+                                    ((npy_ulonglong *)data0 + 2*0)[1] +
+                                    ((npy_ulonglong *)data_out + 2*0)[1];
+#endif
+
+        case 0: /* the only exit from this function */
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*0)[0] =
+                                ((npy_ulonglong *)data0 + 2*0)[0] +
+                                ((npy_ulonglong *)data_out + 2*0)[0];
+        ((npy_ulonglong *)data_out + 2*0)[1] =
+                                ((npy_ulonglong *)data0 + 2*0)[1] +
+                                ((npy_ulonglong *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*1)[0] =
+                                ((npy_ulonglong *)data0 + 2*1)[0] +
+                                ((npy_ulonglong *)data_out + 2*1)[0];
+        ((npy_ulonglong *)data_out + 2*1)[1] =
+                                ((npy_ulonglong *)data0 + 2*1)[1] +
+                                ((npy_ulonglong *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*2)[0] =
+                                ((npy_ulonglong *)data0 + 2*2)[0] +
+                                ((npy_ulonglong *)data_out + 2*2)[0];
+        ((npy_ulonglong *)data_out + 2*2)[1] =
+                                ((npy_ulonglong *)data0 + 2*2)[1] +
+                                ((npy_ulonglong *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*3)[0] =
+                                ((npy_ulonglong *)data0 + 2*3)[0] +
+                                ((npy_ulonglong *)data_out + 2*3)[0];
+        ((npy_ulonglong *)data_out + 2*3)[1] =
+                                ((npy_ulonglong *)data0 + 2*3)[1] +
+                                ((npy_ulonglong *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*4)[0] =
+                                ((npy_ulonglong *)data0 + 2*4)[0] +
+                                ((npy_ulonglong *)data_out + 2*4)[0];
+        ((npy_ulonglong *)data_out + 2*4)[1] =
+                                ((npy_ulonglong *)data0 + 2*4)[1] +
+                                ((npy_ulonglong *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*5)[0] =
+                                ((npy_ulonglong *)data0 + 2*5)[0] +
+                                ((npy_ulonglong *)data_out + 2*5)[0];
+        ((npy_ulonglong *)data_out + 2*5)[1] =
+                                ((npy_ulonglong *)data0 + 2*5)[1] +
+                                ((npy_ulonglong *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*6)[0] =
+                                ((npy_ulonglong *)data0 + 2*6)[0] +
+                                ((npy_ulonglong *)data_out + 2*6)[0];
+        ((npy_ulonglong *)data_out + 2*6)[1] =
+                                ((npy_ulonglong *)data0 + 2*6)[1] +
+                                ((npy_ulonglong *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*7)[0] =
+                                ((npy_ulonglong *)data0 + 2*7)[0] +
+                                ((npy_ulonglong *)data_out + 2*7)[0];
+        ((npy_ulonglong *)data_out + 2*7)[1] =
+                                ((npy_ulonglong *)data0 + 2*7)[1] +
+                                ((npy_ulonglong *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop; /* fewer than 8 remain: the switch above processes them and returns */
+}
+
+#elif 1 == 2 && !0
+
+// Multiply-add over contiguous arrays: data_out[k] = scalar * data[k] + data_out[k] for k in [0, count)
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_muladd(npy_ulonglong *data, npy_ulonglong *data_out, npy_ulonglong scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_ulonglong
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u64;
+    const npyv_u64 v_scalar = npyv_setall_u64(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u64 b0 = npyv_loada_u64(data + vstep * 0);
+            npyv_u64 c0 = npyv_loada_u64(data_out + vstep * 0);
+            
+#line 312
+            npyv_u64 b1 = npyv_loada_u64(data + vstep * 1);
+            npyv_u64 c1 = npyv_loada_u64(data_out + vstep * 1);
+            
+#line 312
+            npyv_u64 b2 = npyv_loada_u64(data + vstep * 2);
+            npyv_u64 c2 = npyv_loada_u64(data_out + vstep * 2);
+            
+#line 312
+            npyv_u64 b3 = npyv_loada_u64(data + vstep * 3);
+            npyv_u64 c3 = npyv_loada_u64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u64 abc0 = npyv_muladd_u64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u64 abc1 = npyv_muladd_u64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u64 abc2 = npyv_muladd_u64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u64 abc3 = npyv_muladd_u64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_u64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_u64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_u64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_u64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u64 b0 = npyv_load_u64(data + vstep * 0);
+            npyv_u64 c0 = npyv_load_u64(data_out + vstep * 0);
+            
+#line 312
+            npyv_u64 b1 = npyv_load_u64(data + vstep * 1);
+            npyv_u64 c1 = npyv_load_u64(data_out + vstep * 1);
+            
+#line 312
+            npyv_u64 b2 = npyv_load_u64(data + vstep * 2);
+            npyv_u64 c2 = npyv_load_u64(data_out + vstep * 2);
+            
+#line 312
+            npyv_u64 b3 = npyv_load_u64(data + vstep * 3);
+            npyv_u64 c3 = npyv_load_u64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u64 abc0 = npyv_muladd_u64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u64 abc1 = npyv_muladd_u64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u64 abc2 = npyv_muladd_u64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u64 abc3 = npyv_muladd_u64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_u64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_u64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_u64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_u64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_u64 a = npyv_load_tillz_u64(data, count);
+        npyv_u64 b = npyv_load_tillz_u64(data_out, count);
+        npyv_store_till_u64(data_out, count, npyv_muladd_u64(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* scalar fallback: the only path compiled, since the NPYV condition above is a literal 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { /* 4-way unrolled: load all, compute all, then store all */
+        #line 340
+        const npy_ulonglong b0 = (data[0]);
+        const npy_ulonglong c0 = (data_out[0]);
+        
+#line 340
+        const npy_ulonglong b1 = (data[1]);
+        const npy_ulonglong c1 = (data_out[1]);
+        
+#line 340
+        const npy_ulonglong b2 = (data[2]);
+        const npy_ulonglong c2 = (data_out[2]);
+        
+#line 340
+        const npy_ulonglong b3 = (data[3]);
+        const npy_ulonglong c3 = (data_out[3]);
+        
+        #line 346
+        const npy_ulonglong abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_ulonglong abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_ulonglong abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_ulonglong abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* remaining 0-3 elements, or all of them when unrolling is disabled */
+        const npy_ulonglong b = (*data);
+        const npy_ulonglong c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_ulonglong
+}
+/* Contiguous two-operand kernel: data_out[k] += data0[k] * data1[k] for k in [0, count). */
+static void
+ulonglong_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *data1 = (npy_ulonglong *)dataptr[1];
+    npy_ulonglong *data_out = (npy_ulonglong *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_ulonglong
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u64;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u64 a0 = npyv_loada_u64(data0 + vstep * 0);
+            npyv_u64 b0 = npyv_loada_u64(data1 + vstep * 0);
+            npyv_u64 c0 = npyv_loada_u64(data_out + vstep * 0);
+            
+#line 390
+            npyv_u64 a1 = npyv_loada_u64(data0 + vstep * 1);
+            npyv_u64 b1 = npyv_loada_u64(data1 + vstep * 1);
+            npyv_u64 c1 = npyv_loada_u64(data_out + vstep * 1);
+            
+#line 390
+            npyv_u64 a2 = npyv_loada_u64(data0 + vstep * 2);
+            npyv_u64 b2 = npyv_loada_u64(data1 + vstep * 2);
+            npyv_u64 c2 = npyv_loada_u64(data_out + vstep * 2);
+            
+#line 390
+            npyv_u64 a3 = npyv_loada_u64(data0 + vstep * 3);
+            npyv_u64 b3 = npyv_loada_u64(data1 + vstep * 3);
+            npyv_u64 c3 = npyv_loada_u64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u64 abc0 = npyv_muladd_u64(a0, b0, c0);
+            
+#line 397
+            npyv_u64 abc1 = npyv_muladd_u64(a1, b1, c1);
+            
+#line 397
+            npyv_u64 abc2 = npyv_muladd_u64(a2, b2, c2);
+            
+#line 397
+            npyv_u64 abc3 = npyv_muladd_u64(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_u64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_u64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_u64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_u64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u64 a0 = npyv_load_u64(data0 + vstep * 0);
+            npyv_u64 b0 = npyv_load_u64(data1 + vstep * 0);
+            npyv_u64 c0 = npyv_load_u64(data_out + vstep * 0);
+            
+#line 390
+            npyv_u64 a1 = npyv_load_u64(data0 + vstep * 1);
+            npyv_u64 b1 = npyv_load_u64(data1 + vstep * 1);
+            npyv_u64 c1 = npyv_load_u64(data_out + vstep * 1);
+            
+#line 390
+            npyv_u64 a2 = npyv_load_u64(data0 + vstep * 2);
+            npyv_u64 b2 = npyv_load_u64(data1 + vstep * 2);
+            npyv_u64 c2 = npyv_load_u64(data_out + vstep * 2);
+            
+#line 390
+            npyv_u64 a3 = npyv_load_u64(data0 + vstep * 3);
+            npyv_u64 b3 = npyv_load_u64(data1 + vstep * 3);
+            npyv_u64 c3 = npyv_load_u64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u64 abc0 = npyv_muladd_u64(a0, b0, c0);
+            
+#line 397
+            npyv_u64 abc1 = npyv_muladd_u64(a1, b1, c1);
+            
+#line 397
+            npyv_u64 abc2 = npyv_muladd_u64(a2, b2, c2);
+            
+#line 397
+            npyv_u64 abc3 = npyv_muladd_u64(a3, b3, c3);
+            
+            #line 402
+            npyv_store_u64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_u64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_u64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_u64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_u64 a = npyv_load_tillz_u64(data0, count);
+        npyv_u64 b = npyv_load_tillz_u64(data1, count);
+        npyv_u64 c = npyv_load_tillz_u64(data_out, count);
+        npyv_store_till_u64(data_out, count, npyv_muladd_u64(a, b, c));
+    }
+    npyv_cleanup();
+#else /* scalar fallback: the only path compiled, since the NPYV condition above is a literal 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { /* 4-way unrolled: load all, compute all, then store all */
+        #line 420
+        const npy_ulonglong a0 = (data0[0]);
+        const npy_ulonglong b0 = (data1[0]);
+        const npy_ulonglong c0 = (data_out[0]);
+        
+#line 420
+        const npy_ulonglong a1 = (data0[1]);
+        const npy_ulonglong b1 = (data1[1]);
+        const npy_ulonglong c1 = (data_out[1]);
+        
+#line 420
+        const npy_ulonglong a2 = (data0[2]);
+        const npy_ulonglong b2 = (data1[2]);
+        const npy_ulonglong c2 = (data_out[2]);
+        
+#line 420
+        const npy_ulonglong a3 = (data0[3]);
+        const npy_ulonglong b3 = (data1[3]);
+        const npy_ulonglong c3 = (data_out[3]);
+        
+        #line 427
+        const npy_ulonglong abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_ulonglong abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_ulonglong abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_ulonglong abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* remaining 0-3 elements, or all of them when unrolling is disabled */
+        const npy_ulonglong a = (*data0);
+        const npy_ulonglong b = (*data1);
+        const npy_ulonglong c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_ulonglong
+
+}
+
+/* Extra two-operand specializations. This one: out[k] += s * in1[k], with s read once from operand 0. */
+static void
+ulonglong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 1 and the output are walked element by element; operand 0 is dereferenced a single time. */
+    npy_ulonglong *vec = (npy_ulonglong *)dataptr[1];
+    npy_ulonglong *acc = (npy_ulonglong *)dataptr[2];
+    const npy_ulonglong factor = *(npy_ulonglong *)dataptr[0];
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    /* The shared multiply-add helper performs acc[k] = factor * vec[k] + acc[k]. */
+    ulonglong_sum_of_products_muladd(vec, acc, factor, count);
+}
+/* out[k] += in0[k] * s for k in [0, count), where s is the single value read from operand 1. */
+static void
+ulonglong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulonglong *vec = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *acc = (npy_ulonglong *)dataptr[2];
+    const npy_ulonglong factor = *(npy_ulonglong *)dataptr[1];
+    /* Operand 1 is dereferenced once; only operand 0 and the output are walked by the helper. */
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    ulonglong_sum_of_products_muladd(vec, acc, factor, count);
+}
+/* Reduction kernel: *out += sum over k in [0, count) of data0[k] * data1[k]; the output is a single element. */
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *data1 = (npy_ulonglong *)dataptr[1];
+    npy_ulonglong accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_ulonglong
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_u64;
+    npyv_u64 v_accum = npyv_zero_u64();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u64 a0 = npyv_loada_u64(data0 + vstep * 0);
+            npyv_u64 b0 = npyv_loada_u64(data1 + vstep * 0);
+            
+#line 501
+            npyv_u64 a1 = npyv_loada_u64(data0 + vstep * 1);
+            npyv_u64 b1 = npyv_loada_u64(data1 + vstep * 1);
+            
+#line 501
+            npyv_u64 a2 = npyv_loada_u64(data0 + vstep * 2);
+            npyv_u64 b2 = npyv_loada_u64(data1 + vstep * 2);
+            
+#line 501
+            npyv_u64 a3 = npyv_loada_u64(data0 + vstep * 3);
+            npyv_u64 b3 = npyv_loada_u64(data1 + vstep * 3);
+            
+            npyv_u64 ab3 = npyv_muladd_u64(a3, b3, v_accum);
+            npyv_u64 ab2 = npyv_muladd_u64(a2, b2, ab3);
+            npyv_u64 ab1 = npyv_muladd_u64(a1, b1, ab2);
+                   v_accum = npyv_muladd_u64(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u64 a0 = npyv_load_u64(data0 + vstep * 0);
+            npyv_u64 b0 = npyv_load_u64(data1 + vstep * 0);
+            
+#line 501
+            npyv_u64 a1 = npyv_load_u64(data0 + vstep * 1);
+            npyv_u64 b1 = npyv_load_u64(data1 + vstep * 1);
+            
+#line 501
+            npyv_u64 a2 = npyv_load_u64(data0 + vstep * 2);
+            npyv_u64 b2 = npyv_load_u64(data1 + vstep * 2);
+            
+#line 501
+            npyv_u64 a3 = npyv_load_u64(data0 + vstep * 3);
+            npyv_u64 b3 = npyv_load_u64(data1 + vstep * 3);
+            
+            npyv_u64 ab3 = npyv_muladd_u64(a3, b3, v_accum);
+            npyv_u64 ab2 = npyv_muladd_u64(a2, b2, ab3);
+            npyv_u64 ab1 = npyv_muladd_u64(a1, b1, ab2);
+                   v_accum = npyv_muladd_u64(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_u64 a = npyv_load_tillz_u64(data0, count);
+        npyv_u64 b = npyv_load_tillz_u64(data1, count);
+        v_accum = npyv_muladd_u64(a, b, v_accum);
+    }
+    accum = npyv_sum_u64(v_accum);
+    npyv_cleanup();
+#else /* scalar fallback: the only path compiled, since the NPYV condition above is a literal 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) { /* 4-way unrolled partial products */
+        #line 524
+        const npy_ulonglong ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_ulonglong ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_ulonglong ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_ulonglong ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* remaining 0-3 elements, or all of them when unrolling is disabled */
+        const npy_ulonglong a = (*data0);
+        const npy_ulonglong b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_ulonglong
+    *(npy_ulonglong *)dataptr[2] = ((*(npy_ulonglong *)dataptr[2]) + accum); /* one read-modify-write of the output, after the whole reduction */
+}
+/* *out += s * (in1[0] + ... + in1[count-1]), where s is the single value read from operand 0. */
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulonglong *out = (npy_ulonglong *)dataptr[2];
+    const npy_ulonglong factor = *(npy_ulonglong *)dataptr[0];
+    const npy_ulonglong total = ulonglong_sum_of_arr((npy_ulonglong *)dataptr[1], count);
+    *out += factor * total;
+}
+/* *out += s * (in0[0] + ... + in0[count-1]), where s is the single value read from operand 1. */
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulonglong *out = (npy_ulonglong *)dataptr[2];
+    const npy_ulonglong factor = *(npy_ulonglong *)dataptr[1];
+    const npy_ulonglong total = ulonglong_sum_of_arr((npy_ulonglong *)dataptr[0], count);
+    *out += factor * total;
+}
+
+#elif 1 == 3 && !0
+
+static void
+ulonglong_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *data1 = (npy_ulonglong *)dataptr[1];
+    npy_ulonglong *data2 = (npy_ulonglong *)dataptr[2];
+    npy_ulonglong *data_out = (npy_ulonglong *)dataptr[3];
+    /* data_out[i] += data0[i] * data1[i] * data2[i] for count elements; nop is not read */
+    /* Main loop, unrolled: eight elements per iteration */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Remainder (count < 8 here): one element per step, returning once count hits zero */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+    /* NOTE(review): for a non-negative count this last step looks unreachable - confirm */
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1 > 3 || @complex */
+
+static void
+ulonglong_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+    /* Each pass: the product of operands 0..nop-1 is added into operand nop, then all pointers step */
+    while (count--) {
+#if !0
+        npy_ulonglong temp = (*(npy_ulonglong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulonglong *)dataptr[i]);
+        }
+        *(npy_ulonglong *)dataptr[nop] = (temp +
+                                           (*(npy_ulonglong *)dataptr[i])); /* i == nop here */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_ulonglong);
+        }
+#else /* complex arm; never compiled because the guard above is "#if !0" */
+#  if 1 <= 3
+#    define _SUMPROD_NOP 1
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_ulonglong re, im, tmp;
+        int i;
+        re = ((npy_ulonglong *)dataptr[0])[0];
+        im = ((npy_ulonglong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulonglong *)dataptr[i])[0] -
+                  im * ((npy_ulonglong *)dataptr[i])[1];
+            im = re * ((npy_ulonglong *)dataptr[i])[1] +
+                 im * ((npy_ulonglong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_ulonglong);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1 */
+
+#if 1 == 1
+
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    /* Real-valued path: fold the sum of the contiguous input into the one output slot. */
+    npy_ulonglong *const out = (npy_ulonglong *)dataptr[1];
+    const npy_ulonglong total = ulonglong_sum_of_arr((npy_ulonglong *)dataptr[0], count);
+    *out = total + *out;
+#else
+    /* Complex path (compiled out by the "#if !0" above): interleaved re/im pairs. */
+    npy_ulonglong *src = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong sum_re = 0, sum_im = 0;
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four pairs per pass; whatever is left (at most four pairs) goes to the loop below. */
+    for (; count > 4; count -= 4, src += 4*2) {
+        sum_re += (src[0] + src[2]) + (src[4] + src[6]);
+        sum_im += (src[1] + src[3]) + (src[5] + src[7]);
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One pair per pass. */
+    for (; count > 0; --count, src += 2) {
+        sum_re += src[0];
+        sum_im += src[1];
+    }
+    ((npy_ulonglong *)dataptr[1])[0] += sum_re;
+    ((npy_ulonglong *)dataptr[1])[1] += sum_im;
+#endif // !0
+}
+
+#endif /* 1 == 1 */
+
+static void
+ulonglong_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_ulonglong accum_re = 0, accum_im = 0;
+#else
+    npy_ulonglong accum = 0;
+#endif
+    /* Sums a strided input into a local total and adds it to the output once, after the loop */
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+    /* Of the three guards above only the first is true, so only data0/stride0 are declared */
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+    /* The "1 == 1" arm is the one compiled: accum += *data0, then data0 moves by stride0 bytes */
+    while (count--) {
+#if !0
+#  if 1 == 1
+        accum += (*(npy_ulonglong *)data0);
+        data0 += stride0;
+#  elif 1 == 2
+        accum += (*(npy_ulonglong *)data0) *
+                 (*(npy_ulonglong *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1 == 3
+        accum += (*(npy_ulonglong *)data0) *
+                 (*(npy_ulonglong *)data1) *
+                 (*(npy_ulonglong *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_ulonglong temp = (*(npy_ulonglong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulonglong *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        accum_re += ((npy_ulonglong *)data0)[0];
+        accum_im += ((npy_ulonglong *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ulonglong re, im, tmp;
+        int i;
+        re = ((npy_ulonglong *)dataptr[0])[0];
+        im = ((npy_ulonglong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulonglong *)dataptr[i])[0] -
+                  im * ((npy_ulonglong *)dataptr[i])[1];
+            im = re * ((npy_ulonglong *)dataptr[i])[1] +
+                 im * ((npy_ulonglong *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* Write-back: the non-complex "1 <= 3" arm adds accum into the value at dataptr[1] */
+#if 0
+#  if 1 <= 3
+    ((npy_ulonglong *)dataptr[1])[0] += accum_re;
+    ((npy_ulonglong *)dataptr[1])[1] += accum_im;
+#  else
+    ((npy_ulonglong *)dataptr[nop])[0] += accum_re;
+    ((npy_ulonglong *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1 <= 3
+    *((npy_ulonglong *)dataptr[1]) = (accum +
+                                    (*((npy_ulonglong *)dataptr[1])));
+#  else
+    *((npy_ulonglong *)dataptr[nop]) = (accum +
+                                    (*((npy_ulonglong *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+ulonglong_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data_out = dataptr[2];
+    npy_intp stride_out = strides[2];
+#endif
+    /* Strided two-operand kernel: *data_out += (*data0) * (*data1) once per element */
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_two (%d)\n", (int)count);
+    /* Only the non-complex "2 == 2" arm below is compiled; each cursor moves by its byte stride */
+    while (count--) {
+#if !0
+#  if 2 == 1
+        *(npy_ulonglong *)data_out = ((*(npy_ulonglong *)data0) +
+                                         (*(npy_ulonglong *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 2 == 2
+        *(npy_ulonglong *)data_out = ((*(npy_ulonglong *)data0) *
+                                         (*(npy_ulonglong *)data1) +
+                                         (*(npy_ulonglong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 2 == 3
+        *(npy_ulonglong *)data_out = ((*(npy_ulonglong *)data0) *
+                                         (*(npy_ulonglong *)data1) *
+                                         (*(npy_ulonglong *)data2) +
+                                         (*(npy_ulonglong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_ulonglong temp = (*(npy_ulonglong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulonglong *)dataptr[i]);
+        }
+        *(npy_ulonglong *)dataptr[nop] = (temp +
+                                           (*(npy_ulonglong *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        ((npy_ulonglong *)data_out)[0] = ((npy_ulonglong *)data0)[0] +
+                                         ((npy_ulonglong *)data_out)[0];
+        ((npy_ulonglong *)data_out)[1] = ((npy_ulonglong *)data0)[1] +
+                                         ((npy_ulonglong *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ulonglong re, im, tmp;
+        int i;
+        re = ((npy_ulonglong *)dataptr[0])[0];
+        im = ((npy_ulonglong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulonglong *)dataptr[i])[0] -
+                  im * ((npy_ulonglong *)dataptr[i])[1];
+            im = re * ((npy_ulonglong *)dataptr[i])[1] +
+                 im * ((npy_ulonglong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 2 == 1
+
+static void
+ulonglong_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *data_out = (npy_ulonglong *)dataptr[1];
+    /* data_out[i] += data0[i] over count contiguous elements; nop is not read */
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* Tail handler, placed first so small counts skip the unrolled loop; cases fall through to "case 0" */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_ulonglong *)data_out + 2*6)[0] =
+                                    ((npy_ulonglong *)data0 + 2*6)[0] +
+                                    ((npy_ulonglong *)data_out + 2*6)[0];
+            ((npy_ulonglong *)data_out + 2*6)[1] =
+                                    ((npy_ulonglong *)data0 + 2*6)[1] +
+                                    ((npy_ulonglong *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_ulonglong *)data_out + 2*5)[0] =
+                                    ((npy_ulonglong *)data0 + 2*5)[0] +
+                                    ((npy_ulonglong *)data_out + 2*5)[0];
+            ((npy_ulonglong *)data_out + 2*5)[1] =
+                                    ((npy_ulonglong *)data0 + 2*5)[1] +
+                                    ((npy_ulonglong *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_ulonglong *)data_out + 2*4)[0] =
+                                    ((npy_ulonglong *)data0 + 2*4)[0] +
+                                    ((npy_ulonglong *)data_out + 2*4)[0];
+            ((npy_ulonglong *)data_out + 2*4)[1] =
+                                    ((npy_ulonglong *)data0 + 2*4)[1] +
+                                    ((npy_ulonglong *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_ulonglong *)data_out + 2*3)[0] =
+                                    ((npy_ulonglong *)data0 + 2*3)[0] +
+                                    ((npy_ulonglong *)data_out + 2*3)[0];
+            ((npy_ulonglong *)data_out + 2*3)[1] =
+                                    ((npy_ulonglong *)data0 + 2*3)[1] +
+                                    ((npy_ulonglong *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_ulonglong *)data_out + 2*2)[0] =
+                                    ((npy_ulonglong *)data0 + 2*2)[0] +
+                                    ((npy_ulonglong *)data_out + 2*2)[0];
+            ((npy_ulonglong *)data_out + 2*2)[1] =
+                                    ((npy_ulonglong *)data0 + 2*2)[1] +
+                                    ((npy_ulonglong *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_ulonglong *)data_out + 2*1)[0] =
+                                    ((npy_ulonglong *)data0 + 2*1)[0] +
+                                    ((npy_ulonglong *)data_out + 2*1)[0];
+            ((npy_ulonglong *)data_out + 2*1)[1] =
+                                    ((npy_ulonglong *)data0 + 2*1)[1] +
+                                    ((npy_ulonglong *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_ulonglong *)data_out + 2*0)[0] =
+                                    ((npy_ulonglong *)data0 + 2*0)[0] +
+                                    ((npy_ulonglong *)data_out + 2*0)[0];
+            ((npy_ulonglong *)data_out + 2*0)[1] =
+                                    ((npy_ulonglong *)data0 + 2*0)[1] +
+                                    ((npy_ulonglong *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Main loop, unrolled: eight elements per iteration (reached when no case above matched) */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*0)[0] =
+                                ((npy_ulonglong *)data0 + 2*0)[0] +
+                                ((npy_ulonglong *)data_out + 2*0)[0];
+        ((npy_ulonglong *)data_out + 2*0)[1] =
+                                ((npy_ulonglong *)data0 + 2*0)[1] +
+                                ((npy_ulonglong *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*1)[0] =
+                                ((npy_ulonglong *)data0 + 2*1)[0] +
+                                ((npy_ulonglong *)data_out + 2*1)[0];
+        ((npy_ulonglong *)data_out + 2*1)[1] =
+                                ((npy_ulonglong *)data0 + 2*1)[1] +
+                                ((npy_ulonglong *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*2)[0] =
+                                ((npy_ulonglong *)data0 + 2*2)[0] +
+                                ((npy_ulonglong *)data_out + 2*2)[0];
+        ((npy_ulonglong *)data_out + 2*2)[1] =
+                                ((npy_ulonglong *)data0 + 2*2)[1] +
+                                ((npy_ulonglong *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*3)[0] =
+                                ((npy_ulonglong *)data0 + 2*3)[0] +
+                                ((npy_ulonglong *)data_out + 2*3)[0];
+        ((npy_ulonglong *)data_out + 2*3)[1] =
+                                ((npy_ulonglong *)data0 + 2*3)[1] +
+                                ((npy_ulonglong *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*4)[0] =
+                                ((npy_ulonglong *)data0 + 2*4)[0] +
+                                ((npy_ulonglong *)data_out + 2*4)[0];
+        ((npy_ulonglong *)data_out + 2*4)[1] =
+                                ((npy_ulonglong *)data0 + 2*4)[1] +
+                                ((npy_ulonglong *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*5)[0] =
+                                ((npy_ulonglong *)data0 + 2*5)[0] +
+                                ((npy_ulonglong *)data_out + 2*5)[0];
+        ((npy_ulonglong *)data_out + 2*5)[1] =
+                                ((npy_ulonglong *)data0 + 2*5)[1] +
+                                ((npy_ulonglong *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*6)[0] =
+                                ((npy_ulonglong *)data0 + 2*6)[0] +
+                                ((npy_ulonglong *)data_out + 2*6)[0];
+        ((npy_ulonglong *)data_out + 2*6)[1] =
+                                ((npy_ulonglong *)data0 + 2*6)[1] +
+                                ((npy_ulonglong *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*7)[0] =
+                                ((npy_ulonglong *)data0 + 2*7)[0] +
+                                ((npy_ulonglong *)data_out + 2*7)[0];
+        ((npy_ulonglong *)data_out + 2*7)[1] =
+                                ((npy_ulonglong *)data0 + 2*7)[1] +
+                                ((npy_ulonglong *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Fewer than eight elements are left: re-enter the switch, which handles them and returns */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 2 == 2 && !0
+
+// Multiply-add helper: data_out[i] = scalar * data[i] + data_out[i] for i in [0, count)
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_muladd(npy_ulonglong *data, npy_ulonglong *data_out, npy_ulonglong scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_ulonglong (0 here, so the npyv_* branch below is not compiled)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u64;
+    const npyv_u64 v_scalar = npyv_setall_u64(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u64 b0 = npyv_loada_u64(data + vstep * 0);
+            npyv_u64 c0 = npyv_loada_u64(data_out + vstep * 0);
+            
+#line 312
+            npyv_u64 b1 = npyv_loada_u64(data + vstep * 1);
+            npyv_u64 c1 = npyv_loada_u64(data_out + vstep * 1);
+            
+#line 312
+            npyv_u64 b2 = npyv_loada_u64(data + vstep * 2);
+            npyv_u64 c2 = npyv_loada_u64(data_out + vstep * 2);
+            
+#line 312
+            npyv_u64 b3 = npyv_loada_u64(data + vstep * 3);
+            npyv_u64 c3 = npyv_loada_u64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u64 abc0 = npyv_muladd_u64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u64 abc1 = npyv_muladd_u64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u64 abc2 = npyv_muladd_u64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u64 abc3 = npyv_muladd_u64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_u64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_u64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_u64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_u64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u64 b0 = npyv_load_u64(data + vstep * 0);
+            npyv_u64 c0 = npyv_load_u64(data_out + vstep * 0);
+            
+#line 312
+            npyv_u64 b1 = npyv_load_u64(data + vstep * 1);
+            npyv_u64 c1 = npyv_load_u64(data_out + vstep * 1);
+            
+#line 312
+            npyv_u64 b2 = npyv_load_u64(data + vstep * 2);
+            npyv_u64 c2 = npyv_load_u64(data_out + vstep * 2);
+            
+#line 312
+            npyv_u64 b3 = npyv_load_u64(data + vstep * 3);
+            npyv_u64 c3 = npyv_load_u64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u64 abc0 = npyv_muladd_u64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u64 abc1 = npyv_muladd_u64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u64 abc2 = npyv_muladd_u64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u64 abc3 = npyv_muladd_u64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_u64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_u64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_u64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_u64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_u64 a = npyv_load_tillz_u64(data, count);
+        npyv_u64 b = npyv_load_tillz_u64(data_out, count);
+        npyv_store_till_u64(data_out, count, npyv_muladd_u64(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { // four elements per pass
+        #line 340
+        const npy_ulonglong b0 = (data[0]);
+        const npy_ulonglong c0 = (data_out[0]);
+        
+#line 340
+        const npy_ulonglong b1 = (data[1]);
+        const npy_ulonglong c1 = (data_out[1]);
+        
+#line 340
+        const npy_ulonglong b2 = (data[2]);
+        const npy_ulonglong c2 = (data_out[2]);
+        
+#line 340
+        const npy_ulonglong b3 = (data[3]);
+        const npy_ulonglong c3 = (data_out[3]);
+        
+        #line 346
+        const npy_ulonglong abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_ulonglong abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_ulonglong abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_ulonglong abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { // remaining elements, one per pass
+        const npy_ulonglong b = (*data);
+        const npy_ulonglong c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_ulonglong
+}
+
+static void
+ulonglong_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *data1 = (npy_ulonglong *)dataptr[1];
+    npy_ulonglong *data_out = (npy_ulonglong *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // data_out[i] += data0[i] * data1[i]; the npyv_* branch is compiled out for npy_ulonglong ("#if 0")
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u64;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u64 a0 = npyv_loada_u64(data0 + vstep * 0);
+            npyv_u64 b0 = npyv_loada_u64(data1 + vstep * 0);
+            npyv_u64 c0 = npyv_loada_u64(data_out + vstep * 0);
+            
+#line 390
+            npyv_u64 a1 = npyv_loada_u64(data0 + vstep * 1);
+            npyv_u64 b1 = npyv_loada_u64(data1 + vstep * 1);
+            npyv_u64 c1 = npyv_loada_u64(data_out + vstep * 1);
+            
+#line 390
+            npyv_u64 a2 = npyv_loada_u64(data0 + vstep * 2);
+            npyv_u64 b2 = npyv_loada_u64(data1 + vstep * 2);
+            npyv_u64 c2 = npyv_loada_u64(data_out + vstep * 2);
+            
+#line 390
+            npyv_u64 a3 = npyv_loada_u64(data0 + vstep * 3);
+            npyv_u64 b3 = npyv_loada_u64(data1 + vstep * 3);
+            npyv_u64 c3 = npyv_loada_u64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u64 abc0 = npyv_muladd_u64(a0, b0, c0);
+            
+#line 397
+            npyv_u64 abc1 = npyv_muladd_u64(a1, b1, c1);
+            
+#line 397
+            npyv_u64 abc2 = npyv_muladd_u64(a2, b2, c2);
+            
+#line 397
+            npyv_u64 abc3 = npyv_muladd_u64(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_u64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_u64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_u64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_u64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u64 a0 = npyv_load_u64(data0 + vstep * 0);
+            npyv_u64 b0 = npyv_load_u64(data1 + vstep * 0);
+            npyv_u64 c0 = npyv_load_u64(data_out + vstep * 0);
+            
+#line 390
+            npyv_u64 a1 = npyv_load_u64(data0 + vstep * 1);
+            npyv_u64 b1 = npyv_load_u64(data1 + vstep * 1);
+            npyv_u64 c1 = npyv_load_u64(data_out + vstep * 1);
+            
+#line 390
+            npyv_u64 a2 = npyv_load_u64(data0 + vstep * 2);
+            npyv_u64 b2 = npyv_load_u64(data1 + vstep * 2);
+            npyv_u64 c2 = npyv_load_u64(data_out + vstep * 2);
+            
+#line 390
+            npyv_u64 a3 = npyv_load_u64(data0 + vstep * 3);
+            npyv_u64 b3 = npyv_load_u64(data1 + vstep * 3);
+            npyv_u64 c3 = npyv_load_u64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u64 abc0 = npyv_muladd_u64(a0, b0, c0);
+            
+#line 397
+            npyv_u64 abc1 = npyv_muladd_u64(a1, b1, c1);
+            
+#line 397
+            npyv_u64 abc2 = npyv_muladd_u64(a2, b2, c2);
+            
+#line 397
+            npyv_u64 abc3 = npyv_muladd_u64(a3, b3, c3);
+            
+            #line 402
+            npyv_store_u64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_u64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_u64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_u64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_u64 a = npyv_load_tillz_u64(data0, count);
+        npyv_u64 b = npyv_load_tillz_u64(data1, count);
+        npyv_u64 c = npyv_load_tillz_u64(data_out, count);
+        npyv_store_till_u64(data_out, count, npyv_muladd_u64(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { // four elements per pass
+        #line 420
+        const npy_ulonglong a0 = (data0[0]);
+        const npy_ulonglong b0 = (data1[0]);
+        const npy_ulonglong c0 = (data_out[0]);
+        
+#line 420
+        const npy_ulonglong a1 = (data0[1]);
+        const npy_ulonglong b1 = (data1[1]);
+        const npy_ulonglong c1 = (data_out[1]);
+        
+#line 420
+        const npy_ulonglong a2 = (data0[2]);
+        const npy_ulonglong b2 = (data1[2]);
+        const npy_ulonglong c2 = (data_out[2]);
+        
+#line 420
+        const npy_ulonglong a3 = (data0[3]);
+        const npy_ulonglong b3 = (data1[3]);
+        const npy_ulonglong c3 = (data_out[3]);
+        
+        #line 427
+        const npy_ulonglong abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_ulonglong abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_ulonglong abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_ulonglong abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { // remaining elements, one per pass
+        const npy_ulonglong a = (*data0);
+        const npy_ulonglong b = (*data1);
+        const npy_ulonglong c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_ulonglong
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand kernel where operand 0 is read once as a scalar from
+ * dataptr[0], while operand 1 (dataptr[1]) and the output (dataptr[2]) are
+ * handed on as npy_ulonglong arrays of `count` elements. The per-element work
+ * is delegated to ulonglong_sum_of_products_muladd. `nop` and `strides` are
+ * not used here.
+ */
+static void
+ulonglong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ulonglong scalar0 = *(npy_ulonglong *)dataptr[0];
+    npy_ulonglong *const in1 = (npy_ulonglong *)dataptr[1];
+    npy_ulonglong *const out = (npy_ulonglong *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+
+    /* array operand first, then the output, then the broadcast scalar */
+    ulonglong_sum_of_products_muladd(in1, out, scalar0, count);
+}
+
+/*
+ * Two-operand kernel where operand 1 is read once as a scalar from
+ * dataptr[1], while operand 0 (dataptr[0]) and the output (dataptr[2]) are
+ * handed on as npy_ulonglong arrays of `count` elements. The per-element work
+ * is delegated to ulonglong_sum_of_products_muladd. `nop` and `strides` are
+ * not used here.
+ */
+static void
+ulonglong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_ulonglong scalar1 = *(npy_ulonglong *)dataptr[1];
+    npy_ulonglong *const in0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *const out = (npy_ulonglong *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+
+    /* array operand first, then the output, then the broadcast scalar */
+    ulonglong_sum_of_products_muladd(in0, out, scalar1, count);
+}
+
+/*
+ * Two-operand reduction into a single output element: accumulates
+ * data0[i] * data1[i] over `count` elements of the npy_ulonglong arrays at
+ * dataptr[0] and dataptr[1], then adds the total to the element at
+ * dataptr[2]. `nop` and `strides` are not used.
+ */
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *data1 = (npy_ulonglong *)dataptr[1];
+    npy_ulonglong accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+    /*
+     * The vector (npyv) branch below is compiled out by the `#if 0`; only the
+     * scalar `#else` branch is built for this type.
+     */
+#if 0 // NPYV check for npy_ulonglong
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_u64;
+    npyv_u64 v_accum = npyv_zero_u64();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u64 a0 = npyv_loada_u64(data0 + vstep * 0);
+            npyv_u64 b0 = npyv_loada_u64(data1 + vstep * 0);
+            
+#line 501
+            npyv_u64 a1 = npyv_loada_u64(data0 + vstep * 1);
+            npyv_u64 b1 = npyv_loada_u64(data1 + vstep * 1);
+            
+#line 501
+            npyv_u64 a2 = npyv_loada_u64(data0 + vstep * 2);
+            npyv_u64 b2 = npyv_loada_u64(data1 + vstep * 2);
+            
+#line 501
+            npyv_u64 a3 = npyv_loada_u64(data0 + vstep * 3);
+            npyv_u64 b3 = npyv_loada_u64(data1 + vstep * 3);
+            
+            /* chain the four multiply-adds through the running accumulator */
+            npyv_u64 ab3 = npyv_muladd_u64(a3, b3, v_accum);
+            npyv_u64 ab2 = npyv_muladd_u64(a2, b2, ab3);
+            npyv_u64 ab1 = npyv_muladd_u64(a1, b1, ab2);
+                   v_accum = npyv_muladd_u64(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u64 a0 = npyv_load_u64(data0 + vstep * 0);
+            npyv_u64 b0 = npyv_load_u64(data1 + vstep * 0);
+            
+#line 501
+            npyv_u64 a1 = npyv_load_u64(data0 + vstep * 1);
+            npyv_u64 b1 = npyv_load_u64(data1 + vstep * 1);
+            
+#line 501
+            npyv_u64 a2 = npyv_load_u64(data0 + vstep * 2);
+            npyv_u64 b2 = npyv_load_u64(data1 + vstep * 2);
+            
+#line 501
+            npyv_u64 a3 = npyv_load_u64(data0 + vstep * 3);
+            npyv_u64 b3 = npyv_load_u64(data1 + vstep * 3);
+            
+            /* chain the four multiply-adds through the running accumulator */
+            npyv_u64 ab3 = npyv_muladd_u64(a3, b3, v_accum);
+            npyv_u64 ab2 = npyv_muladd_u64(a2, b2, ab3);
+            npyv_u64 ab1 = npyv_muladd_u64(a1, b1, ab2);
+                   v_accum = npyv_muladd_u64(a0, b0, ab1);
+        }
+    }
+    
+    /* remaining elements, one vector step at a time using the `tillz` loads */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_u64 a = npyv_load_tillz_u64(data0, count);
+        npyv_u64 b = npyv_load_tillz_u64(data1, count);
+        v_accum = npyv_muladd_u64(a, b, v_accum);
+    }
+    accum = npyv_sum_u64(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* scalar path, unrolled four elements per iteration */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_ulonglong ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_ulonglong ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_ulonglong ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_ulonglong ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* leftover elements (or all of them when the unrolled loop is disabled) */
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_ulonglong a = (*data0);
+        const npy_ulonglong b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_ulonglong
+    /* fold the local total into the single output element */
+    *(npy_ulonglong *)dataptr[2] = ((*(npy_ulonglong *)dataptr[2]) + accum);
+}
+
+/*
+ * Two-operand reduction where operand 0 is a single scalar (dataptr[0]) and
+ * operand 1 (dataptr[1]) is an npy_ulonglong array of `count` elements. The
+ * array is reduced with ulonglong_sum_of_arr, the result is multiplied by the
+ * scalar, and the product is added to the output element at dataptr[2].
+ * `nop` and `strides` are not used.
+ */
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* read the scalar before the array is reduced, as the original did */
+    const npy_ulonglong scalar0 = *(npy_ulonglong *)dataptr[0];
+    const npy_ulonglong total = ulonglong_sum_of_arr((npy_ulonglong *)dataptr[1], count);
+    npy_ulonglong *const out = (npy_ulonglong *)dataptr[2];
+
+    *out = *out + scalar0 * total;
+}
+
+/*
+ * Two-operand reduction where operand 1 is a single scalar (dataptr[1]) and
+ * operand 0 (dataptr[0]) is an npy_ulonglong array of `count` elements. The
+ * array is reduced with ulonglong_sum_of_arr, the result is multiplied by the
+ * scalar, and the product is added to the output element at dataptr[2].
+ * `nop` and `strides` are not used.
+ */
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* read the scalar before the array is reduced, as the original did */
+    const npy_ulonglong scalar1 = *(npy_ulonglong *)dataptr[1];
+    const npy_ulonglong total = ulonglong_sum_of_arr((npy_ulonglong *)dataptr[0], count);
+    npy_ulonglong *const out = (npy_ulonglong *)dataptr[2];
+
+    *out = *out + scalar1 * total;
+}
+
+#elif 2 == 3 && !0
+
+/*
+ * Three-operand kernel over npy_ulonglong arrays indexed element by element:
+ * out[i] = in0[i] * in1[i] * in2[i] + out[i], with the inputs taken from
+ * dataptr[0..2] and the output from dataptr[3]. `nop` and `strides` are not
+ * used.
+ */
+static void
+ulonglong_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulonglong *in0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *in1 = (npy_ulonglong *)dataptr[1];
+    npy_ulonglong *in2 = (npy_ulonglong *)dataptr[2];
+    npy_ulonglong *out = (npy_ulonglong *)dataptr[3];
+    int k;
+
+    /* Bulk of the work: blocks of 8 elements, processed in index order */
+    for (; count >= 8; count -= 8) {
+        for (k = 0; k < 8; ++k) {
+            out[k] = ((in0[k]) * (in1[k]) * (in2[k]) + (out[k]));
+        }
+        in0 += 8;
+        in1 += 8;
+        in2 += 8;
+        out += 8;
+    }
+
+    /*
+     * Remainder: up to 8 guarded steps, each one stopping as soon as the
+     * remaining count reaches exactly zero.
+     */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] = ((in0[k]) * (in1[k]) * (in2[k]) + (out[k]));
+    }
+}
+
+#else /* 2 > 3 || @complex */
+
+/*
+ * Generic element-by-element fallback: for each of `count` steps, multiplies
+ * the current element of every input operand (dataptr[0..nop-1]), adds the
+ * product to the output element at dataptr[nop], and then advances all
+ * nop + 1 pointers by sizeof(npy_ulonglong).
+ *
+ * The entries of `dataptr` are advanced in place, so the caller's pointer
+ * array is modified. `strides` is not used.
+ */
+static void
+ulonglong_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+/* `!0` is true, so the real-valued branch is the one that is compiled */
+#if !0
+        npy_ulonglong temp = (*(npy_ulonglong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulonglong *)dataptr[i]);
+        }
+        /* the loop above leaves i == nop, so dataptr[i] is the output operand */
+        *(npy_ulonglong *)dataptr[nop] = (temp +
+                                           (*(npy_ulonglong *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_ulonglong);
+        }
+#else /* complex */
+#  if 2 <= 3
+#    define _SUMPROD_NOP 2
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_ulonglong re, im, tmp;
+        int i;
+        re = ((npy_ulonglong *)dataptr[0])[0];
+        im = ((npy_ulonglong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulonglong *)dataptr[i])[0] -
+                  im * ((npy_ulonglong *)dataptr[i])[1];
+            im = re * ((npy_ulonglong *)dataptr[i])[1] +
+                 im * ((npy_ulonglong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_ulonglong);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+
+/*
+ * Single-operand reduction: reduces the `count` npy_ulonglong elements at
+ * dataptr[0] with ulonglong_sum_of_arr and adds the result to the element at
+ * dataptr[1]. `nop` and `strides` are not used.
+ *
+ * The `#else` arm (separate re/im accumulators) is template residue for
+ * complex types and is not compiled because `!0` is true.
+ */
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_ulonglong *data = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong accum = ulonglong_sum_of_arr(data, count);
+    *((npy_ulonglong *)dataptr[1]) = (accum + (*((npy_ulonglong *)dataptr[1])));
+#else
+    npy_ulonglong accum_re = 0, accum_im = 0;
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* four (re, im) pairs per iteration: even offsets are re, odd are im */
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_ulonglong re01 = data0[0] + data0[2];
+        const npy_ulonglong re23 = data0[4] + data0[6];
+        const npy_ulonglong im13 = data0[1] + data0[3];
+        const npy_ulonglong im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_ulonglong *)dataptr[1])[0] += accum_re;
+    ((npy_ulonglong *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 2 == 1 */
+
+/*
+ * Strided two-operand reduction into a single output element: sums
+ * (*data0) * (*data1) over `count` steps, advancing each input pointer by
+ * its byte stride from `strides`, then adds the total to the element at
+ * dataptr[2].
+ *
+ * The preprocessor conditions are template residue. With the constants shown
+ * here only the real-valued `2 == 2` paths are compiled; the complex arms and
+ * the generic `nop` arms are not.
+ */
+static void
+ulonglong_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_ulonglong accum_re = 0, accum_im = 0;
+#else
+    npy_ulonglong accum = 0;
+#endif
+
+/* local copies of the input pointers and strides; data2 is not declared here */
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 2 == 1
+        accum += (*(npy_ulonglong *)data0);
+        data0 += stride0;
+#  elif 2 == 2
+        /* this is the arm compiled for the two-operand case */
+        accum += (*(npy_ulonglong *)data0) *
+                 (*(npy_ulonglong *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 2 == 3
+        accum += (*(npy_ulonglong *)data0) *
+                 (*(npy_ulonglong *)data1) *
+                 (*(npy_ulonglong *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_ulonglong temp = (*(npy_ulonglong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulonglong *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        accum_re += ((npy_ulonglong *)data0)[0];
+        accum_im += ((npy_ulonglong *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ulonglong re, im, tmp;
+        int i;
+        re = ((npy_ulonglong *)dataptr[0])[0];
+        im = ((npy_ulonglong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulonglong *)dataptr[i])[0] -
+                  im * ((npy_ulonglong *)dataptr[i])[1];
+            im = re * ((npy_ulonglong *)dataptr[i])[1] +
+                 im * ((npy_ulonglong *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+/* write-back: with `#if 0` false and `2 <= 3` true, dataptr[2] is the output */
+#if 0
+#  if 2 <= 3
+    ((npy_ulonglong *)dataptr[2])[0] += accum_re;
+    ((npy_ulonglong *)dataptr[2])[1] += accum_im;
+#  else
+    ((npy_ulonglong *)dataptr[nop])[0] += accum_re;
+    ((npy_ulonglong *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 2 <= 3
+    *((npy_ulonglong *)dataptr[2]) = (accum +
+                                    (*((npy_ulonglong *)dataptr[2])));
+#  else
+    *((npy_ulonglong *)dataptr[nop]) = (accum +
+                                    (*((npy_ulonglong *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+/*
+ * Strided three-operand kernel: for each of `count` steps computes
+ * *out = (*data0) * (*data1) * (*data2) + *out on npy_ulonglong values, then
+ * advances the three input pointers and the output pointer (dataptr[3]) by
+ * their byte strides from `strides`.
+ *
+ * The preprocessor conditions are template residue. With the constants shown
+ * here only the real-valued `3 == 3` paths are compiled; the complex arms and
+ * the generic `nop` arms are not.
+ */
+static void
+ulonglong_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+/* local copies of the pointers and strides for all three inputs and the output */
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data_out = dataptr[3];
+    npy_intp stride_out = strides[3];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_three (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 3 == 1
+        *(npy_ulonglong *)data_out = ((*(npy_ulonglong *)data0) +
+                                         (*(npy_ulonglong *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 3 == 2
+        *(npy_ulonglong *)data_out = ((*(npy_ulonglong *)data0) *
+                                         (*(npy_ulonglong *)data1) +
+                                         (*(npy_ulonglong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 3 == 3
+        /* this is the arm compiled for the three-operand case */
+        *(npy_ulonglong *)data_out = ((*(npy_ulonglong *)data0) *
+                                         (*(npy_ulonglong *)data1) *
+                                         (*(npy_ulonglong *)data2) +
+                                         (*(npy_ulonglong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_ulonglong temp = (*(npy_ulonglong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulonglong *)dataptr[i]);
+        }
+        *(npy_ulonglong *)dataptr[nop] = (temp +
+                                           (*(npy_ulonglong *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        ((npy_ulonglong *)data_out)[0] = ((npy_ulonglong *)data0)[0] +
+                                         ((npy_ulonglong *)data_out)[0];
+        ((npy_ulonglong *)data_out)[1] = ((npy_ulonglong *)data0)[1] +
+                                         ((npy_ulonglong *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ulonglong re, im, tmp;
+        int i;
+        re = ((npy_ulonglong *)dataptr[0])[0];
+        im = ((npy_ulonglong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulonglong *)dataptr[i])[0] -
+                  im * ((npy_ulonglong *)dataptr[i])[1];
+            im = re * ((npy_ulonglong *)dataptr[i])[1] +
+                 im * ((npy_ulonglong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 3 == 1
+
+/*
+ * Single-operand kernel over npy_ulonglong arrays indexed element by
+ * element: data_out[i] = data0[i] + data_out[i] for `count` elements, with
+ * the input at dataptr[0] and the output at dataptr[1]. `nop` and `strides`
+ * are not used.
+ *
+ * Control flow: the switch handles counts 0..7 by falling through from the
+ * matching case down to `case 0`, which returns. Counts of 8 or more match no
+ * case, drop into the 8-way unrolled loop, and then jump back to the switch
+ * to finish the remainder.
+ *
+ * In every `#if !0` pair the real-valued arm is the one compiled; the
+ * `#else` arms are template residue for complex types.
+ *
+ * NOTE(review): a negative `count` matches no case and skips the loop, so
+ * the `goto` would spin forever; callers presumably never pass one - confirm.
+ */
+static void
+ulonglong_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *data_out = (npy_ulonglong *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        /* each case handles index (count - 1) and falls through to the next */
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_ulonglong *)data_out + 2*6)[0] =
+                                    ((npy_ulonglong *)data0 + 2*6)[0] +
+                                    ((npy_ulonglong *)data_out + 2*6)[0];
+            ((npy_ulonglong *)data_out + 2*6)[1] =
+                                    ((npy_ulonglong *)data0 + 2*6)[1] +
+                                    ((npy_ulonglong *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_ulonglong *)data_out + 2*5)[0] =
+                                    ((npy_ulonglong *)data0 + 2*5)[0] +
+                                    ((npy_ulonglong *)data_out + 2*5)[0];
+            ((npy_ulonglong *)data_out + 2*5)[1] =
+                                    ((npy_ulonglong *)data0 + 2*5)[1] +
+                                    ((npy_ulonglong *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_ulonglong *)data_out + 2*4)[0] =
+                                    ((npy_ulonglong *)data0 + 2*4)[0] +
+                                    ((npy_ulonglong *)data_out + 2*4)[0];
+            ((npy_ulonglong *)data_out + 2*4)[1] =
+                                    ((npy_ulonglong *)data0 + 2*4)[1] +
+                                    ((npy_ulonglong *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_ulonglong *)data_out + 2*3)[0] =
+                                    ((npy_ulonglong *)data0 + 2*3)[0] +
+                                    ((npy_ulonglong *)data_out + 2*3)[0];
+            ((npy_ulonglong *)data_out + 2*3)[1] =
+                                    ((npy_ulonglong *)data0 + 2*3)[1] +
+                                    ((npy_ulonglong *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_ulonglong *)data_out + 2*2)[0] =
+                                    ((npy_ulonglong *)data0 + 2*2)[0] +
+                                    ((npy_ulonglong *)data_out + 2*2)[0];
+            ((npy_ulonglong *)data_out + 2*2)[1] =
+                                    ((npy_ulonglong *)data0 + 2*2)[1] +
+                                    ((npy_ulonglong *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_ulonglong *)data_out + 2*1)[0] =
+                                    ((npy_ulonglong *)data0 + 2*1)[0] +
+                                    ((npy_ulonglong *)data_out + 2*1)[0];
+            ((npy_ulonglong *)data_out + 2*1)[1] =
+                                    ((npy_ulonglong *)data0 + 2*1)[1] +
+                                    ((npy_ulonglong *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_ulonglong *)data_out + 2*0)[0] =
+                                    ((npy_ulonglong *)data0 + 2*0)[0] +
+                                    ((npy_ulonglong *)data_out + 2*0)[0];
+            ((npy_ulonglong *)data_out + 2*0)[1] =
+                                    ((npy_ulonglong *)data0 + 2*0)[1] +
+                                    ((npy_ulonglong *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*0)[0] =
+                                ((npy_ulonglong *)data0 + 2*0)[0] +
+                                ((npy_ulonglong *)data_out + 2*0)[0];
+        ((npy_ulonglong *)data_out + 2*0)[1] =
+                                ((npy_ulonglong *)data0 + 2*0)[1] +
+                                ((npy_ulonglong *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*1)[0] =
+                                ((npy_ulonglong *)data0 + 2*1)[0] +
+                                ((npy_ulonglong *)data_out + 2*1)[0];
+        ((npy_ulonglong *)data_out + 2*1)[1] =
+                                ((npy_ulonglong *)data0 + 2*1)[1] +
+                                ((npy_ulonglong *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*2)[0] =
+                                ((npy_ulonglong *)data0 + 2*2)[0] +
+                                ((npy_ulonglong *)data_out + 2*2)[0];
+        ((npy_ulonglong *)data_out + 2*2)[1] =
+                                ((npy_ulonglong *)data0 + 2*2)[1] +
+                                ((npy_ulonglong *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*3)[0] =
+                                ((npy_ulonglong *)data0 + 2*3)[0] +
+                                ((npy_ulonglong *)data_out + 2*3)[0];
+        ((npy_ulonglong *)data_out + 2*3)[1] =
+                                ((npy_ulonglong *)data0 + 2*3)[1] +
+                                ((npy_ulonglong *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*4)[0] =
+                                ((npy_ulonglong *)data0 + 2*4)[0] +
+                                ((npy_ulonglong *)data_out + 2*4)[0];
+        ((npy_ulonglong *)data_out + 2*4)[1] =
+                                ((npy_ulonglong *)data0 + 2*4)[1] +
+                                ((npy_ulonglong *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*5)[0] =
+                                ((npy_ulonglong *)data0 + 2*5)[0] +
+                                ((npy_ulonglong *)data_out + 2*5)[0];
+        ((npy_ulonglong *)data_out + 2*5)[1] =
+                                ((npy_ulonglong *)data0 + 2*5)[1] +
+                                ((npy_ulonglong *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*6)[0] =
+                                ((npy_ulonglong *)data0 + 2*6)[0] +
+                                ((npy_ulonglong *)data_out + 2*6)[0];
+        ((npy_ulonglong *)data_out + 2*6)[1] =
+                                ((npy_ulonglong *)data0 + 2*6)[1] +
+                                ((npy_ulonglong *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*7)[0] =
+                                ((npy_ulonglong *)data0 + 2*7)[0] +
+                                ((npy_ulonglong *)data_out + 2*7)[0];
+        ((npy_ulonglong *)data_out + 2*7)[1] =
+                                ((npy_ulonglong *)data0 + 2*7)[1] +
+                                ((npy_ulonglong *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 3 == 2 && !0
+
+/*
+ * Multiply-add helper: for each of `count` elements computes
+ * data_out[i] = scalar * data[i] + data_out[i].
+ *
+ *   data      input array of npy_ulonglong
+ *   data_out  array that is both read and updated
+ *   scalar    value multiplied with every element of `data`
+ *   count     number of elements to process
+ *
+ * The vector (npyv) branch is compiled out by the `#if 0`; only the scalar
+ * `#else` branch is built for this type.
+ */
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_muladd(npy_ulonglong *data, npy_ulonglong *data_out, npy_ulonglong scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_ulonglong
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u64;
+    const npyv_u64 v_scalar = npyv_setall_u64(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u64 b0 = npyv_loada_u64(data + vstep * 0);
+            npyv_u64 c0 = npyv_loada_u64(data_out + vstep * 0);
+            
+#line 312
+            npyv_u64 b1 = npyv_loada_u64(data + vstep * 1);
+            npyv_u64 c1 = npyv_loada_u64(data_out + vstep * 1);
+            
+#line 312
+            npyv_u64 b2 = npyv_loada_u64(data + vstep * 2);
+            npyv_u64 c2 = npyv_loada_u64(data_out + vstep * 2);
+            
+#line 312
+            npyv_u64 b3 = npyv_loada_u64(data + vstep * 3);
+            npyv_u64 c3 = npyv_loada_u64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u64 abc0 = npyv_muladd_u64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u64 abc1 = npyv_muladd_u64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u64 abc2 = npyv_muladd_u64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u64 abc3 = npyv_muladd_u64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_u64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_u64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_u64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_u64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u64 b0 = npyv_load_u64(data + vstep * 0);
+            npyv_u64 c0 = npyv_load_u64(data_out + vstep * 0);
+            
+#line 312
+            npyv_u64 b1 = npyv_load_u64(data + vstep * 1);
+            npyv_u64 c1 = npyv_load_u64(data_out + vstep * 1);
+            
+#line 312
+            npyv_u64 b2 = npyv_load_u64(data + vstep * 2);
+            npyv_u64 c2 = npyv_load_u64(data_out + vstep * 2);
+            
+#line 312
+            npyv_u64 b3 = npyv_load_u64(data + vstep * 3);
+            npyv_u64 c3 = npyv_load_u64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u64 abc0 = npyv_muladd_u64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u64 abc1 = npyv_muladd_u64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u64 abc2 = npyv_muladd_u64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u64 abc3 = npyv_muladd_u64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_u64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_u64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_u64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_u64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    /* remaining elements, one vector step at a time using the partial load/store forms */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_u64 a = npyv_load_tillz_u64(data, count);
+        npyv_u64 b = npyv_load_tillz_u64(data_out, count);
+        npyv_store_till_u64(data_out, count, npyv_muladd_u64(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* scalar path, unrolled four elements per iteration: load, compute, store */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_ulonglong b0 = (data[0]);
+        const npy_ulonglong c0 = (data_out[0]);
+        
+#line 340
+        const npy_ulonglong b1 = (data[1]);
+        const npy_ulonglong c1 = (data_out[1]);
+        
+#line 340
+        const npy_ulonglong b2 = (data[2]);
+        const npy_ulonglong c2 = (data_out[2]);
+        
+#line 340
+        const npy_ulonglong b3 = (data[3]);
+        const npy_ulonglong c3 = (data_out[3]);
+        
+        #line 346
+        const npy_ulonglong abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_ulonglong abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_ulonglong abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_ulonglong abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* leftover elements (or all of them when the unrolled loop is disabled) */
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_ulonglong b = (*data);
+        const npy_ulonglong c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_ulonglong
+}
+
+static void
+ulonglong_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Element-wise multiply-accumulate: data_out[i] = data0[i] * data1[i] + data_out[i] for count elements; nop is not referenced. */
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];  /* first input operand */
+    npy_ulonglong *data1 = (npy_ulonglong *)dataptr[1];  /* second input operand */
+    npy_ulonglong *data_out = (npy_ulonglong *)dataptr[2];  /* output, read and updated in place */
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_ulonglong: the condition below was generated as 0, so the SIMD branch is compiled out
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u64;
+    /* Both main loops below consume vstep * 4 elements per iteration; they differ only in aligned vs. unaligned load/store calls. */
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u64 a0 = npyv_loada_u64(data0 + vstep * 0);
+            npyv_u64 b0 = npyv_loada_u64(data1 + vstep * 0);
+            npyv_u64 c0 = npyv_loada_u64(data_out + vstep * 0);
+            
+#line 390
+            npyv_u64 a1 = npyv_loada_u64(data0 + vstep * 1);
+            npyv_u64 b1 = npyv_loada_u64(data1 + vstep * 1);
+            npyv_u64 c1 = npyv_loada_u64(data_out + vstep * 1);
+            
+#line 390
+            npyv_u64 a2 = npyv_loada_u64(data0 + vstep * 2);
+            npyv_u64 b2 = npyv_loada_u64(data1 + vstep * 2);
+            npyv_u64 c2 = npyv_loada_u64(data_out + vstep * 2);
+            
+#line 390
+            npyv_u64 a3 = npyv_loada_u64(data0 + vstep * 3);
+            npyv_u64 b3 = npyv_loada_u64(data1 + vstep * 3);
+            npyv_u64 c3 = npyv_loada_u64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u64 abc0 = npyv_muladd_u64(a0, b0, c0);
+            
+#line 397
+            npyv_u64 abc1 = npyv_muladd_u64(a1, b1, c1);
+            
+#line 397
+            npyv_u64 abc2 = npyv_muladd_u64(a2, b2, c2);
+            
+#line 397
+            npyv_u64 abc3 = npyv_muladd_u64(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_u64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_u64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_u64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_u64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u64 a0 = npyv_load_u64(data0 + vstep * 0);
+            npyv_u64 b0 = npyv_load_u64(data1 + vstep * 0);
+            npyv_u64 c0 = npyv_load_u64(data_out + vstep * 0);
+            
+#line 390
+            npyv_u64 a1 = npyv_load_u64(data0 + vstep * 1);
+            npyv_u64 b1 = npyv_load_u64(data1 + vstep * 1);
+            npyv_u64 c1 = npyv_load_u64(data_out + vstep * 1);
+            
+#line 390
+            npyv_u64 a2 = npyv_load_u64(data0 + vstep * 2);
+            npyv_u64 b2 = npyv_load_u64(data1 + vstep * 2);
+            npyv_u64 c2 = npyv_load_u64(data_out + vstep * 2);
+            
+#line 390
+            npyv_u64 a3 = npyv_load_u64(data0 + vstep * 3);
+            npyv_u64 b3 = npyv_load_u64(data1 + vstep * 3);
+            npyv_u64 c3 = npyv_load_u64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u64 abc0 = npyv_muladd_u64(a0, b0, c0);
+            
+#line 397
+            npyv_u64 abc1 = npyv_muladd_u64(a1, b1, c1);
+            
+#line 397
+            npyv_u64 abc2 = npyv_muladd_u64(a2, b2, c2);
+            
+#line 397
+            npyv_u64 abc3 = npyv_muladd_u64(a3, b3, c3);
+            
+            #line 402
+            npyv_store_u64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_u64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_u64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_u64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Whatever is left goes one vector at a time through the partial load (tillz) and partial store (till) helpers. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_u64 a = npyv_load_tillz_u64(data0, count);
+        npyv_u64 b = npyv_load_tillz_u64(data1, count);
+        npyv_u64 c = npyv_load_tillz_u64(data_out, count);
+        npyv_store_till_u64(data_out, count, npyv_muladd_u64(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {  /* manually unrolled: 4 elements per iteration */
+        #line 420
+        const npy_ulonglong a0 = (data0[0]);
+        const npy_ulonglong b0 = (data1[0]);
+        const npy_ulonglong c0 = (data_out[0]);
+        
+#line 420
+        const npy_ulonglong a1 = (data0[1]);
+        const npy_ulonglong b1 = (data1[1]);
+        const npy_ulonglong c1 = (data_out[1]);
+        
+#line 420
+        const npy_ulonglong a2 = (data0[2]);
+        const npy_ulonglong b2 = (data1[2]);
+        const npy_ulonglong c2 = (data_out[2]);
+        
+#line 420
+        const npy_ulonglong a3 = (data0[3]);
+        const npy_ulonglong b3 = (data1[3]);
+        const npy_ulonglong c3 = (data_out[3]);
+        /* Multiply-add each of the four lanes; every read for this iteration has already happened above. */
+        #line 427
+        const npy_ulonglong abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_ulonglong abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_ulonglong abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_ulonglong abc3 = a3 * b3 + c3;
+        /* Write the four results back to the output. */
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {  /* leftover elements (all of them when NPY_DISABLE_OPTIMIZATION is defined) */
+        const npy_ulonglong a = (*data0);
+        const npy_ulonglong b = (*data1);
+        const npy_ulonglong c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_ulonglong
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+ulonglong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 0 is read once as a scalar; operand 1 and the output are arrays. */
+    const npy_ulonglong scalar = *(npy_ulonglong *)dataptr[0];
+    npy_ulonglong *src = (npy_ulonglong *)dataptr[1];
+    npy_ulonglong *dst = (npy_ulonglong *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    ulonglong_sum_of_products_muladd(src, dst, scalar, count);  /* dst[i] += scalar * src[i] */
+}
+
+static void
+ulonglong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 1 is read once as a scalar; operand 0 and the output are arrays. */
+    const npy_ulonglong scalar = *(npy_ulonglong *)dataptr[1];
+    npy_ulonglong *src = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *dst = (npy_ulonglong *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    ulonglong_sum_of_products_muladd(src, dst, scalar, count);  /* dst[i] += scalar * src[i] */
+}
+
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Reduction: adds the sum of data0[i] * data1[i] over count elements to the single value at dataptr[2]; nop is not referenced. */
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];  /* first input operand */
+    npy_ulonglong *data1 = (npy_ulonglong *)dataptr[1];  /* second input operand */
+    npy_ulonglong accum = 0;  /* running sum of products, folded into the output once at the end */
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_ulonglong
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_u64;
+    npyv_u64 v_accum = npyv_zero_u64();
+    /* Both main loops below chain four multiply-adds into v_accum per iteration; they differ only in aligned vs. unaligned loads. */
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u64 a0 = npyv_loada_u64(data0 + vstep * 0);
+            npyv_u64 b0 = npyv_loada_u64(data1 + vstep * 0);
+            
+#line 501
+            npyv_u64 a1 = npyv_loada_u64(data0 + vstep * 1);
+            npyv_u64 b1 = npyv_loada_u64(data1 + vstep * 1);
+            
+#line 501
+            npyv_u64 a2 = npyv_loada_u64(data0 + vstep * 2);
+            npyv_u64 b2 = npyv_loada_u64(data1 + vstep * 2);
+            
+#line 501
+            npyv_u64 a3 = npyv_loada_u64(data0 + vstep * 3);
+            npyv_u64 b3 = npyv_loada_u64(data1 + vstep * 3);
+            
+            npyv_u64 ab3 = npyv_muladd_u64(a3, b3, v_accum);
+            npyv_u64 ab2 = npyv_muladd_u64(a2, b2, ab3);
+            npyv_u64 ab1 = npyv_muladd_u64(a1, b1, ab2);
+                   v_accum = npyv_muladd_u64(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u64 a0 = npyv_load_u64(data0 + vstep * 0);
+            npyv_u64 b0 = npyv_load_u64(data1 + vstep * 0);
+            
+#line 501
+            npyv_u64 a1 = npyv_load_u64(data0 + vstep * 1);
+            npyv_u64 b1 = npyv_load_u64(data1 + vstep * 1);
+            
+#line 501
+            npyv_u64 a2 = npyv_load_u64(data0 + vstep * 2);
+            npyv_u64 b2 = npyv_load_u64(data1 + vstep * 2);
+            
+#line 501
+            npyv_u64 a3 = npyv_load_u64(data0 + vstep * 3);
+            npyv_u64 b3 = npyv_load_u64(data1 + vstep * 3);
+            
+            npyv_u64 ab3 = npyv_muladd_u64(a3, b3, v_accum);
+            npyv_u64 ab2 = npyv_muladd_u64(a2, b2, ab3);
+            npyv_u64 ab1 = npyv_muladd_u64(a1, b1, ab2);
+                   v_accum = npyv_muladd_u64(a0, b0, ab1);
+        }
+    }
+    /* Whatever is left goes one vector at a time through the partial load (tillz) helper. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_u64 a = npyv_load_tillz_u64(data0, count);
+        npyv_u64 b = npyv_load_tillz_u64(data1, count);
+        v_accum = npyv_muladd_u64(a, b, v_accum);
+    }
+    accum = npyv_sum_u64(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {  /* manually unrolled: 4 products per iteration */
+        #line 524
+        const npy_ulonglong ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_ulonglong ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_ulonglong ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_ulonglong ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) {  /* leftover elements (all of them when NPY_DISABLE_OPTIMIZATION is defined) */
+        const npy_ulonglong a = (*data0);
+        const npy_ulonglong b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_ulonglong
+    *(npy_ulonglong *)dataptr[2] = ((*(npy_ulonglong *)dataptr[2]) + accum);  /* single read-modify-write of the output element */
+}
+
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Output element += (scalar read from operand 0) * ulonglong_sum_of_arr(operand 1, count). */
+    const npy_ulonglong scalar = *(npy_ulonglong *)dataptr[0];
+    const npy_ulonglong total = ulonglong_sum_of_arr((npy_ulonglong *)dataptr[1], count);
+    *(npy_ulonglong *)dataptr[2] += scalar * total;
+}
+
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Output element += (scalar read from operand 1) * ulonglong_sum_of_arr(operand 0, count). */
+    const npy_ulonglong scalar = *(npy_ulonglong *)dataptr[1];
+    const npy_ulonglong total = ulonglong_sum_of_arr((npy_ulonglong *)dataptr[0], count);
+    *(npy_ulonglong *)dataptr[2] += scalar * total;
+}
+
+#elif 3 == 3 && !0
+
+static void
+ulonglong_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i] for count elements; nop is not referenced. */
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *data1 = (npy_ulonglong *)dataptr[1];
+    npy_ulonglong *data2 = (npy_ulonglong *)dataptr[2];
+    npy_ulonglong *data_out = (npy_ulonglong *)dataptr[3];  /* output, read and updated in place */
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+        /* Advance every pointer past the 8 elements just processed. */
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: each step below returns as soon as count is exhausted, otherwise handles one more element */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+    /* NOTE(review): count < 8 after the unrolled loop, so for a non-negative count the check below always returns and the index-7 store is never executed. */
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 3 > 3 || @complex */
+
+static void
+ulonglong_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Generic variant. It sits in the #else of a chain whose "#elif 3 == 3 && !0" arm is true, so this definition is never compiled in this instance. */
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_three (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+        npy_ulonglong temp = (*(npy_ulonglong *)dataptr[0]);  /* running product, seeded with operand 0 */
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulonglong *)dataptr[i]);
+        }
+        *(npy_ulonglong *)dataptr[nop] = (temp +
+                                           (*(npy_ulonglong *)dataptr[i]));  /* i == nop after the loop (for nop >= 1): reads the output element */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_ulonglong);  /* fixed element-size step instead of strides[i]; mutates dataptr[] in place, output included */
+        }
+#else /* complex */
+#  if 3 <= 3
+#    define _SUMPROD_NOP 3
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_ulonglong re, im, tmp;
+        int i;
+        re = ((npy_ulonglong *)dataptr[0])[0];
+        im = ((npy_ulonglong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {  /* complex multiply of (re, im) by operand i */
+            tmp = re * ((npy_ulonglong *)dataptr[i])[0] -
+                  im * ((npy_ulonglong *)dataptr[i])[1];
+            im = re * ((npy_ulonglong *)dataptr[i])[1] +
+                 im * ((npy_ulonglong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_ulonglong);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 3 */
+
+#if 3 == 1
+
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Adds the sum of count input elements to the single value at dataptr[1]. Enclosed in "#if 3 == 1", so it is not compiled in this instance. */
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_ulonglong *data = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong accum = ulonglong_sum_of_arr(data, count);  /* summation is delegated to the helper */
+    *((npy_ulonglong *)dataptr[1]) = (accum + (*((npy_ulonglong *)dataptr[1])));
+#else
+    npy_ulonglong accum_re = 0, accum_im = 0;  /* separate sums for the two interleaved components */
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {  /* 4 (re, im) pairs, i.e. 8 scalars, per iteration */
+        const npy_ulonglong re01 = data0[0] + data0[2];
+        const npy_ulonglong re23 = data0[4] + data0[6];
+        const npy_ulonglong im13 = data0[1] + data0[3];
+        const npy_ulonglong im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {  /* leftover pairs: even index feeds accum_re, odd index feeds accum_im */
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_ulonglong *)dataptr[1])[0] += accum_re;
+    ((npy_ulonglong *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 3 == 1 */
+
+static void
+ulonglong_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Strided reduction: adds the sum over count steps of the three-operand product to the single value at dataptr[3]. */
+#if 0
+    npy_ulonglong accum_re = 0, accum_im = 0;
+#else
+    npy_ulonglong accum = 0;
+#endif
+    /* Every condition below is a generated constant; in this instance all three evaluate true, so data0..data2 and their strides are declared. */
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+    /* Of the loop-body arms below, only "#  elif 3 == 3" under "#if !0" is compiled. */
+    while (count--) {
+#if !0
+#  if 3 == 1
+        accum += (*(npy_ulonglong *)data0);
+        data0 += stride0;
+#  elif 3 == 2
+        accum += (*(npy_ulonglong *)data0) *
+                 (*(npy_ulonglong *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 3 == 3
+        accum += (*(npy_ulonglong *)data0) *
+                 (*(npy_ulonglong *)data1) *
+                 (*(npy_ulonglong *)data2);
+        data0 += stride0;  /* char* pointers, so stride0..stride2 are added as byte offsets */
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_ulonglong temp = (*(npy_ulonglong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulonglong *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        accum_re += ((npy_ulonglong *)data0)[0];
+        accum_im += ((npy_ulonglong *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ulonglong re, im, tmp;
+        int i;
+        re = ((npy_ulonglong *)dataptr[0])[0];
+        im = ((npy_ulonglong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulonglong *)dataptr[i])[0] -
+                  im * ((npy_ulonglong *)dataptr[i])[1];
+            im = re * ((npy_ulonglong *)dataptr[i])[1] +
+                 im * ((npy_ulonglong *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* Fold the accumulator into the output element once, after the loop; the non-complex "3 <= 3" arm below is the compiled one. */
+#if 0
+#  if 3 <= 3
+    ((npy_ulonglong *)dataptr[3])[0] += accum_re;
+    ((npy_ulonglong *)dataptr[3])[1] += accum_im;
+#  else
+    ((npy_ulonglong *)dataptr[nop])[0] += accum_re;
+    ((npy_ulonglong *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 3 <= 3
+    *((npy_ulonglong *)dataptr[3]) = (accum +
+                                    (*((npy_ulonglong *)dataptr[3])));
+#  else
+    *((npy_ulonglong *)dataptr[nop]) = (accum +
+                                    (*((npy_ulonglong *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+ulonglong_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Generic strided kernel: per step, output = product of the nop input operands + output, then every pointer advances by its stride. */
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data_out = dataptr[1000];
+    npy_intp stride_out = strides[1000];
+#endif
+    /* With the generated constant 1000 every condition above is false, so no local aliases exist and the loop works on dataptr/strides directly. */
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_any (%d)\n", (int)count);
+    /* Only the final "#  else" arm under "#if !0" is compiled. */
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        *(npy_ulonglong *)data_out = ((*(npy_ulonglong *)data0) +
+                                         (*(npy_ulonglong *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1000 == 2
+        *(npy_ulonglong *)data_out = ((*(npy_ulonglong *)data0) *
+                                         (*(npy_ulonglong *)data1) +
+                                         (*(npy_ulonglong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1000 == 3
+        *(npy_ulonglong *)data_out = ((*(npy_ulonglong *)data0) *
+                                         (*(npy_ulonglong *)data1) *
+                                         (*(npy_ulonglong *)data2) +
+                                         (*(npy_ulonglong *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_ulonglong temp = (*(npy_ulonglong *)dataptr[0]);  /* running product, seeded with operand 0 */
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulonglong *)dataptr[i]);
+        }
+        *(npy_ulonglong *)dataptr[nop] = (temp +
+                                           (*(npy_ulonglong *)dataptr[i]));  /* i == nop after the loop (for nop >= 1): reads the output element */
+        for (i = 0; i <= nop; ++i) {  /* mutates the caller's dataptr[] in place, output pointer included */
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        ((npy_ulonglong *)data_out)[0] = ((npy_ulonglong *)data0)[0] +
+                                         ((npy_ulonglong *)data_out)[0];
+        ((npy_ulonglong *)data_out)[1] = ((npy_ulonglong *)data0)[1] +
+                                         ((npy_ulonglong *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ulonglong re, im, tmp;
+        int i;
+        re = ((npy_ulonglong *)dataptr[0])[0];
+        im = ((npy_ulonglong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulonglong *)dataptr[i])[0] -
+                  im * ((npy_ulonglong *)dataptr[i])[1];
+            im = re * ((npy_ulonglong *)dataptr[i])[1] +
+                 im * ((npy_ulonglong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1000 == 1
+
+static void
+ulonglong_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *data_out = (npy_ulonglong *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_ulonglong *)data_out + 2*6)[0] =
+                                    ((npy_ulonglong *)data0 + 2*6)[0] +
+                                    ((npy_ulonglong *)data_out + 2*6)[0];
+            ((npy_ulonglong *)data_out + 2*6)[1] =
+                                    ((npy_ulonglong *)data0 + 2*6)[1] +
+                                    ((npy_ulonglong *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_ulonglong *)data_out + 2*5)[0] =
+                                    ((npy_ulonglong *)data0 + 2*5)[0] +
+                                    ((npy_ulonglong *)data_out + 2*5)[0];
+            ((npy_ulonglong *)data_out + 2*5)[1] =
+                                    ((npy_ulonglong *)data0 + 2*5)[1] +
+                                    ((npy_ulonglong *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_ulonglong *)data_out + 2*4)[0] =
+                                    ((npy_ulonglong *)data0 + 2*4)[0] +
+                                    ((npy_ulonglong *)data_out + 2*4)[0];
+            ((npy_ulonglong *)data_out + 2*4)[1] =
+                                    ((npy_ulonglong *)data0 + 2*4)[1] +
+                                    ((npy_ulonglong *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_ulonglong *)data_out + 2*3)[0] =
+                                    ((npy_ulonglong *)data0 + 2*3)[0] +
+                                    ((npy_ulonglong *)data_out + 2*3)[0];
+            ((npy_ulonglong *)data_out + 2*3)[1] =
+                                    ((npy_ulonglong *)data0 + 2*3)[1] +
+                                    ((npy_ulonglong *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_ulonglong *)data_out + 2*2)[0] =
+                                    ((npy_ulonglong *)data0 + 2*2)[0] +
+                                    ((npy_ulonglong *)data_out + 2*2)[0];
+            ((npy_ulonglong *)data_out + 2*2)[1] =
+                                    ((npy_ulonglong *)data0 + 2*2)[1] +
+                                    ((npy_ulonglong *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_ulonglong *)data_out + 2*1)[0] =
+                                    ((npy_ulonglong *)data0 + 2*1)[0] +
+                                    ((npy_ulonglong *)data_out + 2*1)[0];
+            ((npy_ulonglong *)data_out + 2*1)[1] =
+                                    ((npy_ulonglong *)data0 + 2*1)[1] +
+                                    ((npy_ulonglong *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_ulonglong *)data_out + 2*0)[0] =
+                                    ((npy_ulonglong *)data0 + 2*0)[0] +
+                                    ((npy_ulonglong *)data_out + 2*0)[0];
+            ((npy_ulonglong *)data_out + 2*0)[1] =
+                                    ((npy_ulonglong *)data0 + 2*0)[1] +
+                                    ((npy_ulonglong *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*0)[0] =
+                                ((npy_ulonglong *)data0 + 2*0)[0] +
+                                ((npy_ulonglong *)data_out + 2*0)[0];
+        ((npy_ulonglong *)data_out + 2*0)[1] =
+                                ((npy_ulonglong *)data0 + 2*0)[1] +
+                                ((npy_ulonglong *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*1)[0] =
+                                ((npy_ulonglong *)data0 + 2*1)[0] +
+                                ((npy_ulonglong *)data_out + 2*1)[0];
+        ((npy_ulonglong *)data_out + 2*1)[1] =
+                                ((npy_ulonglong *)data0 + 2*1)[1] +
+                                ((npy_ulonglong *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*2)[0] =
+                                ((npy_ulonglong *)data0 + 2*2)[0] +
+                                ((npy_ulonglong *)data_out + 2*2)[0];
+        ((npy_ulonglong *)data_out + 2*2)[1] =
+                                ((npy_ulonglong *)data0 + 2*2)[1] +
+                                ((npy_ulonglong *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*3)[0] =
+                                ((npy_ulonglong *)data0 + 2*3)[0] +
+                                ((npy_ulonglong *)data_out + 2*3)[0];
+        ((npy_ulonglong *)data_out + 2*3)[1] =
+                                ((npy_ulonglong *)data0 + 2*3)[1] +
+                                ((npy_ulonglong *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*4)[0] =
+                                ((npy_ulonglong *)data0 + 2*4)[0] +
+                                ((npy_ulonglong *)data_out + 2*4)[0];
+        ((npy_ulonglong *)data_out + 2*4)[1] =
+                                ((npy_ulonglong *)data0 + 2*4)[1] +
+                                ((npy_ulonglong *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*5)[0] =
+                                ((npy_ulonglong *)data0 + 2*5)[0] +
+                                ((npy_ulonglong *)data_out + 2*5)[0];
+        ((npy_ulonglong *)data_out + 2*5)[1] =
+                                ((npy_ulonglong *)data0 + 2*5)[1] +
+                                ((npy_ulonglong *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*6)[0] =
+                                ((npy_ulonglong *)data0 + 2*6)[0] +
+                                ((npy_ulonglong *)data_out + 2*6)[0];
+        ((npy_ulonglong *)data_out + 2*6)[1] =
+                                ((npy_ulonglong *)data0 + 2*6)[1] +
+                                ((npy_ulonglong *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_ulonglong *)data_out + 2*7)[0] =
+                                ((npy_ulonglong *)data0 + 2*7)[0] +
+                                ((npy_ulonglong *)data_out + 2*7)[0];
+        ((npy_ulonglong *)data_out + 2*7)[1] =
+                                ((npy_ulonglong *)data0 + 2*7)[1] +
+                                ((npy_ulonglong *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1000 == 2 && !0
+
+// Multiply-accumulate kernel: data_out[i] = scalar * data[i] + data_out[i] for i in [0, count).
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_muladd(npy_ulonglong *data, npy_ulonglong *data_out, npy_ulonglong scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_ulonglong: the guard is the literal 0, so this vector path is compiled out
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u64;
+    const npyv_u64 v_scalar = npyv_setall_u64(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u64 b0 = npyv_loada_u64(data + vstep * 0);
+            npyv_u64 c0 = npyv_loada_u64(data_out + vstep * 0);
+            
+#line 312
+            npyv_u64 b1 = npyv_loada_u64(data + vstep * 1);
+            npyv_u64 c1 = npyv_loada_u64(data_out + vstep * 1);
+            
+#line 312
+            npyv_u64 b2 = npyv_loada_u64(data + vstep * 2);
+            npyv_u64 c2 = npyv_loada_u64(data_out + vstep * 2);
+            
+#line 312
+            npyv_u64 b3 = npyv_loada_u64(data + vstep * 3);
+            npyv_u64 c3 = npyv_loada_u64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u64 abc0 = npyv_muladd_u64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u64 abc1 = npyv_muladd_u64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u64 abc2 = npyv_muladd_u64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u64 abc3 = npyv_muladd_u64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_u64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_u64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_u64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_u64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_u64 b0 = npyv_load_u64(data + vstep * 0);
+            npyv_u64 c0 = npyv_load_u64(data_out + vstep * 0);
+            
+#line 312
+            npyv_u64 b1 = npyv_load_u64(data + vstep * 1);
+            npyv_u64 c1 = npyv_load_u64(data_out + vstep * 1);
+            
+#line 312
+            npyv_u64 b2 = npyv_load_u64(data + vstep * 2);
+            npyv_u64 c2 = npyv_load_u64(data_out + vstep * 2);
+            
+#line 312
+            npyv_u64 b3 = npyv_load_u64(data + vstep * 3);
+            npyv_u64 c3 = npyv_load_u64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_u64 abc0 = npyv_muladd_u64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_u64 abc1 = npyv_muladd_u64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_u64 abc2 = npyv_muladd_u64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_u64 abc3 = npyv_muladd_u64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_u64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_u64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_u64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_u64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) { /* remainder: steps by vstep and hands the remaining count to the *_till* helpers */
+        npyv_u64 a = npyv_load_tillz_u64(data, count);
+        npyv_u64 b = npyv_load_tillz_u64(data_out, count);
+        npyv_store_till_u64(data_out, count, npyv_muladd_u64(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* scalar path: the branch that is kept, since the #if guard above is 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { /* manual 4x unroll: load, compute, then store four elements */
+        #line 340
+        const npy_ulonglong b0 = (data[0]);
+        const npy_ulonglong c0 = (data_out[0]);
+        
+#line 340
+        const npy_ulonglong b1 = (data[1]);
+        const npy_ulonglong c1 = (data_out[1]);
+        
+#line 340
+        const npy_ulonglong b2 = (data[2]);
+        const npy_ulonglong c2 = (data_out[2]);
+        
+#line 340
+        const npy_ulonglong b3 = (data[3]);
+        const npy_ulonglong c3 = (data_out[3]);
+        
+        #line 346
+        const npy_ulonglong abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_ulonglong abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_ulonglong abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_ulonglong abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* leftover elements (or all of them when the unrolled loop is disabled) */
+        const npy_ulonglong b = (*data);
+        const npy_ulonglong c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_ulonglong
+}
+
+static void
+ulonglong_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Two-operand kernel: data_out[i] = data0[i] * data1[i] + data_out[i] for i in [0, count); all three arrays are walked with unit stride and strides is unused. */
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *data1 = (npy_ulonglong *)dataptr[1];
+    npy_ulonglong *data_out = (npy_ulonglong *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_ulonglong: the #if below is the literal 0, so only the scalar #else branch is kept
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_u64;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u64 a0 = npyv_loada_u64(data0 + vstep * 0);
+            npyv_u64 b0 = npyv_loada_u64(data1 + vstep * 0);
+            npyv_u64 c0 = npyv_loada_u64(data_out + vstep * 0);
+            
+#line 390
+            npyv_u64 a1 = npyv_loada_u64(data0 + vstep * 1);
+            npyv_u64 b1 = npyv_loada_u64(data1 + vstep * 1);
+            npyv_u64 c1 = npyv_loada_u64(data_out + vstep * 1);
+            
+#line 390
+            npyv_u64 a2 = npyv_loada_u64(data0 + vstep * 2);
+            npyv_u64 b2 = npyv_loada_u64(data1 + vstep * 2);
+            npyv_u64 c2 = npyv_loada_u64(data_out + vstep * 2);
+            
+#line 390
+            npyv_u64 a3 = npyv_loada_u64(data0 + vstep * 3);
+            npyv_u64 b3 = npyv_loada_u64(data1 + vstep * 3);
+            npyv_u64 c3 = npyv_loada_u64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u64 abc0 = npyv_muladd_u64(a0, b0, c0);
+            
+#line 397
+            npyv_u64 abc1 = npyv_muladd_u64(a1, b1, c1);
+            
+#line 397
+            npyv_u64 abc2 = npyv_muladd_u64(a2, b2, c2);
+            
+#line 397
+            npyv_u64 abc3 = npyv_muladd_u64(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_u64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_u64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_u64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_u64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_u64 a0 = npyv_load_u64(data0 + vstep * 0);
+            npyv_u64 b0 = npyv_load_u64(data1 + vstep * 0);
+            npyv_u64 c0 = npyv_load_u64(data_out + vstep * 0);
+            
+#line 390
+            npyv_u64 a1 = npyv_load_u64(data0 + vstep * 1);
+            npyv_u64 b1 = npyv_load_u64(data1 + vstep * 1);
+            npyv_u64 c1 = npyv_load_u64(data_out + vstep * 1);
+            
+#line 390
+            npyv_u64 a2 = npyv_load_u64(data0 + vstep * 2);
+            npyv_u64 b2 = npyv_load_u64(data1 + vstep * 2);
+            npyv_u64 c2 = npyv_load_u64(data_out + vstep * 2);
+            
+#line 390
+            npyv_u64 a3 = npyv_load_u64(data0 + vstep * 3);
+            npyv_u64 b3 = npyv_load_u64(data1 + vstep * 3);
+            npyv_u64 c3 = npyv_load_u64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_u64 abc0 = npyv_muladd_u64(a0, b0, c0);
+            
+#line 397
+            npyv_u64 abc1 = npyv_muladd_u64(a1, b1, c1);
+            
+#line 397
+            npyv_u64 abc2 = npyv_muladd_u64(a2, b2, c2);
+            
+#line 397
+            npyv_u64 abc3 = npyv_muladd_u64(a3, b3, c3);
+            
+            #line 402
+            npyv_store_u64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_u64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_u64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_u64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) { /* remainder: steps by vstep and hands the remaining count to the *_till* helpers */
+        npyv_u64 a = npyv_load_tillz_u64(data0, count);
+        npyv_u64 b = npyv_load_tillz_u64(data1, count);
+        npyv_u64 c = npyv_load_tillz_u64(data_out, count);
+        npyv_store_till_u64(data_out, count, npyv_muladd_u64(a, b, c));
+    }
+    npyv_cleanup();
+#else /* scalar path: the branch that is kept */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { /* manual 4x unroll: all loads first, then the products, then the stores */
+        #line 420
+        const npy_ulonglong a0 = (data0[0]);
+        const npy_ulonglong b0 = (data1[0]);
+        const npy_ulonglong c0 = (data_out[0]);
+        
+#line 420
+        const npy_ulonglong a1 = (data0[1]);
+        const npy_ulonglong b1 = (data1[1]);
+        const npy_ulonglong c1 = (data_out[1]);
+        
+#line 420
+        const npy_ulonglong a2 = (data0[2]);
+        const npy_ulonglong b2 = (data1[2]);
+        const npy_ulonglong c2 = (data_out[2]);
+        
+#line 420
+        const npy_ulonglong a3 = (data0[3]);
+        const npy_ulonglong b3 = (data1[3]);
+        const npy_ulonglong c3 = (data_out[3]);
+        
+        #line 427
+        const npy_ulonglong abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_ulonglong abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_ulonglong abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_ulonglong abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* leftover elements (or all of them when the unrolled loop is disabled) */
+        const npy_ulonglong a = (*data0);
+        const npy_ulonglong b = (*data1);
+        const npy_ulonglong c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_ulonglong
+
+}
+
+/* Some extra specializations for the two operand case. This one: operand 0 is a single value, operand 1 and the output are arrays. */
+static void
+ulonglong_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_ulonglong value0 = (*(npy_ulonglong *)dataptr[0]); /* read once; used as the scalar multiplier below */
+    npy_ulonglong *data1 = (npy_ulonglong *)dataptr[1];
+    npy_ulonglong *data_out = (npy_ulonglong *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    ulonglong_sum_of_products_muladd(data1, data_out, value0, count); /* data_out[i] = value0 * data1[i] + data_out[i] */
+    
+}
+
+static void
+ulonglong_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Two-operand kernel where operand 1 is a single value and operand 0 and the output are arrays; strides is unused. */
+    npy_ulonglong value1 = (*(npy_ulonglong *)dataptr[1]); /* read once; used as the scalar multiplier below */
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *data_out = (npy_ulonglong *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    ulonglong_sum_of_products_muladd(data0, data_out, value1, count); /* data_out[i] = value1 * data0[i] + data_out[i] */
+}
+
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Dot-product style kernel: adds the sum over i in [0, count) of data0[i] * data1[i] to the single output element at dataptr[2]. */
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *data1 = (npy_ulonglong *)dataptr[1];
+    npy_ulonglong accum = 0; /* local accumulator; the output is only touched once, at the end */
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_ulonglong: the guard is the literal 0, so this vector path is compiled out
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_u64;
+    npyv_u64 v_accum = npyv_zero_u64();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u64 a0 = npyv_loada_u64(data0 + vstep * 0);
+            npyv_u64 b0 = npyv_loada_u64(data1 + vstep * 0);
+            
+#line 501
+            npyv_u64 a1 = npyv_loada_u64(data0 + vstep * 1);
+            npyv_u64 b1 = npyv_loada_u64(data1 + vstep * 1);
+            
+#line 501
+            npyv_u64 a2 = npyv_loada_u64(data0 + vstep * 2);
+            npyv_u64 b2 = npyv_loada_u64(data1 + vstep * 2);
+            
+#line 501
+            npyv_u64 a3 = npyv_loada_u64(data0 + vstep * 3);
+            npyv_u64 b3 = npyv_loada_u64(data1 + vstep * 3);
+            
+            npyv_u64 ab3 = npyv_muladd_u64(a3, b3, v_accum);
+            npyv_u64 ab2 = npyv_muladd_u64(a2, b2, ab3);
+            npyv_u64 ab1 = npyv_muladd_u64(a1, b1, ab2);
+                   v_accum = npyv_muladd_u64(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_u64 a0 = npyv_load_u64(data0 + vstep * 0);
+            npyv_u64 b0 = npyv_load_u64(data1 + vstep * 0);
+            
+#line 501
+            npyv_u64 a1 = npyv_load_u64(data0 + vstep * 1);
+            npyv_u64 b1 = npyv_load_u64(data1 + vstep * 1);
+            
+#line 501
+            npyv_u64 a2 = npyv_load_u64(data0 + vstep * 2);
+            npyv_u64 b2 = npyv_load_u64(data1 + vstep * 2);
+            
+#line 501
+            npyv_u64 a3 = npyv_load_u64(data0 + vstep * 3);
+            npyv_u64 b3 = npyv_load_u64(data1 + vstep * 3);
+            
+            npyv_u64 ab3 = npyv_muladd_u64(a3, b3, v_accum);
+            npyv_u64 ab2 = npyv_muladd_u64(a2, b2, ab3);
+            npyv_u64 ab1 = npyv_muladd_u64(a1, b1, ab2);
+                   v_accum = npyv_muladd_u64(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_u64 a = npyv_load_tillz_u64(data0, count);
+        npyv_u64 b = npyv_load_tillz_u64(data1, count);
+        v_accum = npyv_muladd_u64(a, b, v_accum);
+    }
+    accum = npyv_sum_u64(v_accum);
+    npyv_cleanup();
+#else /* scalar path: the branch that is kept */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) { /* manual 4x unroll: four products are formed, then added to accum together */
+        #line 524
+        const npy_ulonglong ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_ulonglong ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_ulonglong ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_ulonglong ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* leftover elements (or all of them when the unrolled loop is disabled) */
+        const npy_ulonglong a = (*data0);
+        const npy_ulonglong b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_ulonglong
+    *(npy_ulonglong *)dataptr[2] = ((*(npy_ulonglong *)dataptr[2]) + accum); /* single read-modify-write of the output element */
+}
+
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Operand 0 is a single value, operand 1 is an array, and the output is a single element at dataptr[2]. */
+    npy_ulonglong *data1 = (npy_ulonglong *)dataptr[1];
+    npy_ulonglong value0 = (*(npy_ulonglong *)dataptr[0]);
+    npy_ulonglong accum = ulonglong_sum_of_arr(data1, count); /* helper defined outside this block; presumably the sum of the count elements - confirm */
+    *(npy_ulonglong *)dataptr[2] = ((*(npy_ulonglong *)dataptr[2]) + value0 * accum); /* the constant factor is applied once to the reduced value rather than per element */
+}
+
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Operand 0 is an array, operand 1 is a single value, and the output is a single element at dataptr[2]. */
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong value1 = (*(npy_ulonglong *)dataptr[1]);
+    npy_ulonglong accum = ulonglong_sum_of_arr(data0, count); /* helper defined outside this block; presumably the sum of the count elements - confirm */
+    *(npy_ulonglong *)dataptr[2] = ((*(npy_ulonglong *)dataptr[2]) + value1 * accum); /* the constant factor is applied once to the reduced value rather than per element */
+}
+
+#elif 1000 == 3 && !0
+
+static void
+ulonglong_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Three-operand kernel: data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i] for i in [0, count); strides is unused. */
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong *data1 = (npy_ulonglong *)dataptr[1];
+    npy_ulonglong *data2 = (npy_ulonglong *)dataptr[2];
+    npy_ulonglong *data_out = (npy_ulonglong *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: one guarded step per leftover element; pointers are not advanced, indices 0..7 are used directly */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) { /* for a non-negative incoming count at most 7 elements remain after the unrolled loop, so this check returns whenever it is reached */
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1000 > 3 || @complex */
+
+static void
+ulonglong_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Generic kernel for a runtime operand count: per element, multiplies the nop inputs together and adds the product into the output at dataptr[nop]. */
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+        npy_ulonglong temp = (*(npy_ulonglong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulonglong *)dataptr[i]);
+        }
+        *(npy_ulonglong *)dataptr[nop] = (temp +
+                                           (*(npy_ulonglong *)dataptr[i])); /* i == nop after the loop above (given nop >= 1), so this reads the output slot */
+        for (i = 0; i <= nop; ++i) { /* note <=: the output pointer advances too; the caller's dataptr array is modified in place */
+            dataptr[i] += sizeof(npy_ulonglong);
+        }
+#else /* complex: excluded here because the guard above, !0, is true */
+#  if 1000 <= 3
+#    define _SUMPROD_NOP 1000
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_ulonglong re, im, tmp;
+        int i;
+        re = ((npy_ulonglong *)dataptr[0])[0];
+        im = ((npy_ulonglong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulonglong *)dataptr[i])[0] -
+                  im * ((npy_ulonglong *)dataptr[i])[1];
+            im = re * ((npy_ulonglong *)dataptr[i])[1] +
+                 im * ((npy_ulonglong *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_ulonglong *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_ulonglong);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+
+static NPY_GCC_OPT_3 void
+ulonglong_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{ /* One-operand reduction: adds the result of ulonglong_sum_of_arr over the input into the single output element at dataptr[1]; strides is not referenced in the body. */
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_ulonglong *data = (npy_ulonglong *)dataptr[0];
+    npy_ulonglong accum = ulonglong_sum_of_arr(data, count); /* helper defined outside this block; presumably the sum of the count elements - confirm */
+    *((npy_ulonglong *)dataptr[1]) = (accum + (*((npy_ulonglong *)dataptr[1])));
+#else /* complex variant (interleaved re/im pairs): excluded here because the guard above, !0, is true */
+    npy_ulonglong accum_re = 0, accum_im = 0;
+    npy_ulonglong *data0 = (npy_ulonglong *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_ulonglong re01 = data0[0] + data0[2];
+        const npy_ulonglong re23 = data0[4] + data0[6];
+        const npy_ulonglong im13 = data0[1] + data0[3];
+        const npy_ulonglong im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_ulonglong *)dataptr[1])[0] += accum_re;
+    ((npy_ulonglong *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1000 == 1 */
+
+static void
+ulonglong_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{ /* Strided reduction for a runtime operand count: accumulates the per-element product of the nop inputs and adds the total to the single output element at dataptr[nop]. */
+#if 0 /* complex accumulators: excluded, the guard is the literal 0 */
+    npy_ulonglong accum_re = 0, accum_im = 0;
+#else
+    npy_ulonglong accum = 0;
+#endif
+
+#if (1000 == 1) || (1000 <= 3 && !0) /* false in this expansion (1000 is neither 1 nor <= 3), so no data0/stride0 locals exist */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("ulonglong_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        accum += (*(npy_ulonglong *)data0);
+        data0 += stride0;
+#  elif 1000 == 2
+        accum += (*(npy_ulonglong *)data0) *
+                 (*(npy_ulonglong *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1000 == 3
+        accum += (*(npy_ulonglong *)data0) *
+                 (*(npy_ulonglong *)data1) *
+                 (*(npy_ulonglong *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else /* branch selected in this expansion: 1000 matches none of 1, 2, 3 */
+        npy_ulonglong temp = (*(npy_ulonglong *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_ulonglong *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) { /* only the nop inputs advance (by their byte strides); dataptr[nop], the output, is left where it is */
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        accum_re += ((npy_ulonglong *)data0)[0];
+        accum_im += ((npy_ulonglong *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_ulonglong re, im, tmp;
+        int i;
+        re = ((npy_ulonglong *)dataptr[0])[0];
+        im = ((npy_ulonglong *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_ulonglong *)dataptr[i])[0] -
+                  im * ((npy_ulonglong *)dataptr[i])[1];
+            im = re * ((npy_ulonglong *)dataptr[i])[1] +
+                 im * ((npy_ulonglong *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0
+#  if 1000 <= 3
+    ((npy_ulonglong *)dataptr[1000])[0] += accum_re;
+    ((npy_ulonglong *)dataptr[1000])[1] += accum_im;
+#  else
+    ((npy_ulonglong *)dataptr[nop])[0] += accum_re;
+    ((npy_ulonglong *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1000 <= 3
+    *((npy_ulonglong *)dataptr[1000]) = (accum +
+                                    (*((npy_ulonglong *)dataptr[1000])));
+#  else /* selected in this expansion since 1000 > 3: the output slot is indexed by the runtime nop */
+    *((npy_ulonglong *)dataptr[nop]) = (accum +
+                                    (*((npy_ulonglong *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+
+
+#line 74
+
+#if !0
+static NPY_GCC_OPT_3 npy_float half_sum_of_arr(npy_half *data, npy_intp count)
+{ /* Returns the npy_float sum of count npy_half values, each converted with npy_half_to_float before being added. */
+    npy_float accum = 0;
+#if 0 // NPYV check for npy_half: the guard is the literal 0, so this vector path is compiled out
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data);
+    const int vstep = npyv_nlanes_half;
+    npyv_half v_accum = npyv_zero_half();
+    const npy_intp vstepx4 = vstep * 4;
+
+    #line 91
+    if(is_aligned) {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_half a0 = npyv_loada_half(data + vstep * 0);
+            
+#line 96
+            npyv_half a1 = npyv_loada_half(data + vstep * 1);
+            
+#line 96
+            npyv_half a2 = npyv_loada_half(data + vstep * 2);
+            
+#line 96
+            npyv_half a3 = npyv_loada_half(data + vstep * 3);
+            
+            npyv_half a01   = npyv_add_half(a0, a1);
+            npyv_half a23   = npyv_add_half(a2, a3);
+            npyv_half a0123 = npyv_add_half(a01, a23);
+                     v_accum = npyv_add_half(a0123, v_accum);
+        }
+    }
+    
+#line 91
+    else {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_half a0 = npyv_load_half(data + vstep * 0);
+            
+#line 96
+            npyv_half a1 = npyv_load_half(data + vstep * 1);
+            
+#line 96
+            npyv_half a2 = npyv_load_half(data + vstep * 2);
+            
+#line 96
+            npyv_half a3 = npyv_load_half(data + vstep * 3);
+            
+            npyv_half a01   = npyv_add_half(a0, a1);
+            npyv_half a23   = npyv_add_half(a2, a3);
+            npyv_half a0123 = npyv_add_half(a01, a23);
+                     v_accum = npyv_add_half(a0123, v_accum);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep) {
+        npyv_half a = npyv_load_tillz_half(data, count);
+        v_accum = npyv_add_half(a, v_accum);
+    }
+    accum = npyv_sum_half(v_accum);
+    npyv_cleanup();
+#else /* scalar path: the branch that is kept */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data += 4) { /* strictly greater than 4: when exactly 4 elements remain they go through the one-at-a-time loop below */
+        const npy_float a01 = npy_half_to_float(*data) + npy_half_to_float(data[1]);
+        const npy_float a23 = npy_half_to_float(data[2]) + npy_half_to_float(data[3]);
+        accum +=  a01 + a23;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data++) { /* leftover elements (or all of them when the unrolled loop is disabled) */
+        accum += npy_half_to_float(*data);
+    }
+#endif // NPYV check for npy_half
+    return accum;
+}
+#endif
+/*
+ * Strided inner loop for a single operand.  For each of `count` elements:
+ *     out = half(float(in) + float(out))
+ *
+ * dataptr[0]/strides[0] address the input and dataptr[1]/strides[1] the
+ * output.  The strides are added to char pointers, i.e. they are byte
+ * steps.  The compiled branch walks local copies of the pointers, so the
+ * dataptr[] array itself is left untouched, and `nop` is not referenced.
+ *
+ * All #if conditions below are constant expressions; as written, only the
+ * `1 == 1` arm inside `#if !0` is compiled.  The other arms (two/three
+ * operands, generic nop, and the `complex` half) are excluded.
+ */
+#line 131
+static void
+half_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1 == 1) || (1 <= 3 && !0) /* true here: input pointer and stride */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0 /* false here */
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0 /* false here */
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1 == 1) || (1 <= 3 && !0) /* true here: output pointer and stride */
+    char *data_out = dataptr[1];
+    npy_intp stride_out = strides[1];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_one (%d)\n", (int)count);
+
+    while (count--) { /* one element per iteration */
+#if !0
+#  if 1 == 1 /* active arm: accumulate the input into the output */
+        *(npy_half *)data_out = npy_float_to_half(npy_half_to_float(*(npy_half *)data0) +
+                                         npy_half_to_float(*(npy_half *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1 == 2
+        *(npy_half *)data_out = npy_float_to_half(npy_half_to_float(*(npy_half *)data0) *
+                                         npy_half_to_float(*(npy_half *)data1) +
+                                         npy_half_to_float(*(npy_half *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1 == 3
+        *(npy_half *)data_out = npy_float_to_half(npy_half_to_float(*(npy_half *)data0) *
+                                         npy_half_to_float(*(npy_half *)data1) *
+                                         npy_half_to_float(*(npy_half *)data2) +
+                                         npy_half_to_float(*(npy_half *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_float temp = npy_half_to_float(*(npy_half *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= npy_half_to_float(*(npy_half *)dataptr[i]);
+        }
+        *(npy_half *)dataptr[nop] = npy_float_to_half(temp +
+                                           npy_half_to_float(*(npy_half *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        ((npy_float *)data_out)[0] = ((npy_float *)data0)[0] +
+                                         ((npy_float *)data_out)[0];
+        ((npy_float *)data_out)[1] = ((npy_float *)data0)[1] +
+                                         ((npy_float *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1 == 1
+/*
+ * Contiguous inner loop for a single operand:
+ *     out[i] = half(float(in[i]) + float(out[i]))   for i in [0, count)
+ *
+ * dataptr[0] is indexed as the npy_half input array and dataptr[1] as the
+ * npy_half output array; `strides` is unused and `nop` is not referenced.
+ *
+ * Control flow: the switch finishes 0..7 remaining elements by entering at
+ * `case count` and falling through down to `case 0`, which returns (so the
+ * tail is processed from the highest index to index 0).  A count of 8 or
+ * more matches no case, drops out of the switch into the 8-way unrolled
+ * loop, and then jumps back to the switch to finish the remainder.
+ *
+ * NOTE(review): a negative count matches no case and also skips the loop,
+ * so the trailing goto would spin forever; callers are presumably expected
+ * to pass count >= 0 -- confirm.
+ *
+ * The `#else` halves of the `#if !0` pairs are the complex variants and
+ * are compiled out.
+ */
+static void
+half_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *data0 = (npy_half *)dataptr[0];
+    npy_half *data_out = (npy_half *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) { /* no breaks: case k+1 handles element k, then falls through */
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = npy_float_to_half(npy_half_to_float(data0[6]) +
+                                 npy_half_to_float(data_out[6]));
+#else
+            ((npy_float *)data_out + 2*6)[0] =
+                                    ((npy_float *)data0 + 2*6)[0] +
+                                    ((npy_float *)data_out + 2*6)[0];
+            ((npy_float *)data_out + 2*6)[1] =
+                                    ((npy_float *)data0 + 2*6)[1] +
+                                    ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = npy_float_to_half(npy_half_to_float(data0[5]) +
+                                 npy_half_to_float(data_out[5]));
+#else
+            ((npy_float *)data_out + 2*5)[0] =
+                                    ((npy_float *)data0 + 2*5)[0] +
+                                    ((npy_float *)data_out + 2*5)[0];
+            ((npy_float *)data_out + 2*5)[1] =
+                                    ((npy_float *)data0 + 2*5)[1] +
+                                    ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = npy_float_to_half(npy_half_to_float(data0[4]) +
+                                 npy_half_to_float(data_out[4]));
+#else
+            ((npy_float *)data_out + 2*4)[0] =
+                                    ((npy_float *)data0 + 2*4)[0] +
+                                    ((npy_float *)data_out + 2*4)[0];
+            ((npy_float *)data_out + 2*4)[1] =
+                                    ((npy_float *)data0 + 2*4)[1] +
+                                    ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = npy_float_to_half(npy_half_to_float(data0[3]) +
+                                 npy_half_to_float(data_out[3]));
+#else
+            ((npy_float *)data_out + 2*3)[0] =
+                                    ((npy_float *)data0 + 2*3)[0] +
+                                    ((npy_float *)data_out + 2*3)[0];
+            ((npy_float *)data_out + 2*3)[1] =
+                                    ((npy_float *)data0 + 2*3)[1] +
+                                    ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = npy_float_to_half(npy_half_to_float(data0[2]) +
+                                 npy_half_to_float(data_out[2]));
+#else
+            ((npy_float *)data_out + 2*2)[0] =
+                                    ((npy_float *)data0 + 2*2)[0] +
+                                    ((npy_float *)data_out + 2*2)[0];
+            ((npy_float *)data_out + 2*2)[1] =
+                                    ((npy_float *)data0 + 2*2)[1] +
+                                    ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = npy_float_to_half(npy_half_to_float(data0[1]) +
+                                 npy_half_to_float(data_out[1]));
+#else
+            ((npy_float *)data_out + 2*1)[0] =
+                                    ((npy_float *)data0 + 2*1)[0] +
+                                    ((npy_float *)data_out + 2*1)[0];
+            ((npy_float *)data_out + 2*1)[1] =
+                                    ((npy_float *)data0 + 2*1)[1] +
+                                    ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = npy_float_to_half(npy_half_to_float(data0[0]) +
+                                 npy_half_to_float(data_out[0]));
+#else
+            ((npy_float *)data_out + 2*0)[0] =
+                                    ((npy_float *)data0 + 2*0)[0] +
+                                    ((npy_float *)data_out + 2*0)[0];
+            ((npy_float *)data_out + 2*0)[1] =
+                                    ((npy_float *)data0 + 2*0)[1] +
+                                    ((npy_float *)data_out + 2*0)[1];
+#endif
+
+        case 0: /* nothing left: this is the function's only exit */
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = npy_float_to_half(npy_half_to_float(data0[0]) +
+                             npy_half_to_float(data_out[0]));
+#else /* complex */
+        ((npy_float *)data_out + 2*0)[0] =
+                                ((npy_float *)data0 + 2*0)[0] +
+                                ((npy_float *)data_out + 2*0)[0];
+        ((npy_float *)data_out + 2*0)[1] =
+                                ((npy_float *)data0 + 2*0)[1] +
+                                ((npy_float *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = npy_float_to_half(npy_half_to_float(data0[1]) +
+                             npy_half_to_float(data_out[1]));
+#else /* complex */
+        ((npy_float *)data_out + 2*1)[0] =
+                                ((npy_float *)data0 + 2*1)[0] +
+                                ((npy_float *)data_out + 2*1)[0];
+        ((npy_float *)data_out + 2*1)[1] =
+                                ((npy_float *)data0 + 2*1)[1] +
+                                ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = npy_float_to_half(npy_half_to_float(data0[2]) +
+                             npy_half_to_float(data_out[2]));
+#else /* complex */
+        ((npy_float *)data_out + 2*2)[0] =
+                                ((npy_float *)data0 + 2*2)[0] +
+                                ((npy_float *)data_out + 2*2)[0];
+        ((npy_float *)data_out + 2*2)[1] =
+                                ((npy_float *)data0 + 2*2)[1] +
+                                ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = npy_float_to_half(npy_half_to_float(data0[3]) +
+                             npy_half_to_float(data_out[3]));
+#else /* complex */
+        ((npy_float *)data_out + 2*3)[0] =
+                                ((npy_float *)data0 + 2*3)[0] +
+                                ((npy_float *)data_out + 2*3)[0];
+        ((npy_float *)data_out + 2*3)[1] =
+                                ((npy_float *)data0 + 2*3)[1] +
+                                ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = npy_float_to_half(npy_half_to_float(data0[4]) +
+                             npy_half_to_float(data_out[4]));
+#else /* complex */
+        ((npy_float *)data_out + 2*4)[0] =
+                                ((npy_float *)data0 + 2*4)[0] +
+                                ((npy_float *)data_out + 2*4)[0];
+        ((npy_float *)data_out + 2*4)[1] =
+                                ((npy_float *)data0 + 2*4)[1] +
+                                ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = npy_float_to_half(npy_half_to_float(data0[5]) +
+                             npy_half_to_float(data_out[5]));
+#else /* complex */
+        ((npy_float *)data_out + 2*5)[0] =
+                                ((npy_float *)data0 + 2*5)[0] +
+                                ((npy_float *)data_out + 2*5)[0];
+        ((npy_float *)data_out + 2*5)[1] =
+                                ((npy_float *)data0 + 2*5)[1] +
+                                ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = npy_float_to_half(npy_half_to_float(data0[6]) +
+                             npy_half_to_float(data_out[6]));
+#else /* complex */
+        ((npy_float *)data_out + 2*6)[0] =
+                                ((npy_float *)data0 + 2*6)[0] +
+                                ((npy_float *)data_out + 2*6)[0];
+        ((npy_float *)data_out + 2*6)[1] =
+                                ((npy_float *)data0 + 2*6)[1] +
+                                ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = npy_float_to_half(npy_half_to_float(data0[7]) +
+                             npy_half_to_float(data_out[7]));
+#else /* complex */
+        ((npy_float *)data_out + 2*7)[0] =
+                                ((npy_float *)data0 + 2*7)[0] +
+                                ((npy_float *)data_out + 2*7)[0];
+        ((npy_float *)data_out + 2*7)[1] =
+                                ((npy_float *)data0 + 2*7)[1] +
+                                ((npy_float *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8; /* advance both arrays past the 8 elements just done */
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1 == 2 && !0
+
+/*
+ * Multiply-add over contiguous half arrays:
+ *     data_out[i] = half(scalar * float(data[i]) + float(data_out[i]))
+ * for i in [0, count).
+ *
+ * In the scalar branch, inputs are widened with npy_half_to_float(), the
+ * arithmetic is done in npy_float, and every result is narrowed with
+ * npy_float_to_half() before being stored.  Nothing is returned; data_out
+ * is updated in place.
+ *
+ * The `#if 0` NPYV branch is compiled out.  The scalar branch handles four
+ * elements per pass unless NPY_DISABLE_OPTIMIZATION is defined, then
+ * finishes any remainder one element at a time.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_muladd(npy_half *data, npy_half *data_out, npy_float scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_half (constant 0: the SIMD path below is compiled out)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_half;
+    const npyv_half v_scalar = npyv_setall_half(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_half b0 = npyv_loada_half(data + vstep * 0);
+            npyv_half c0 = npyv_loada_half(data_out + vstep * 0);
+            
+#line 312
+            npyv_half b1 = npyv_loada_half(data + vstep * 1);
+            npyv_half c1 = npyv_loada_half(data_out + vstep * 1);
+            
+#line 312
+            npyv_half b2 = npyv_loada_half(data + vstep * 2);
+            npyv_half c2 = npyv_loada_half(data_out + vstep * 2);
+            
+#line 312
+            npyv_half b3 = npyv_loada_half(data + vstep * 3);
+            npyv_half c3 = npyv_loada_half(data_out + vstep * 3);
+            
+            #line 318
+            npyv_half abc0 = npyv_muladd_half(v_scalar, b0, c0);
+            
+#line 318
+            npyv_half abc1 = npyv_muladd_half(v_scalar, b1, c1);
+            
+#line 318
+            npyv_half abc2 = npyv_muladd_half(v_scalar, b2, c2);
+            
+#line 318
+            npyv_half abc3 = npyv_muladd_half(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_half(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_half(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_half(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_half(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_half b0 = npyv_load_half(data + vstep * 0);
+            npyv_half c0 = npyv_load_half(data_out + vstep * 0);
+            
+#line 312
+            npyv_half b1 = npyv_load_half(data + vstep * 1);
+            npyv_half c1 = npyv_load_half(data_out + vstep * 1);
+            
+#line 312
+            npyv_half b2 = npyv_load_half(data + vstep * 2);
+            npyv_half c2 = npyv_load_half(data_out + vstep * 2);
+            
+#line 312
+            npyv_half b3 = npyv_load_half(data + vstep * 3);
+            npyv_half c3 = npyv_load_half(data_out + vstep * 3);
+            
+            #line 318
+            npyv_half abc0 = npyv_muladd_half(v_scalar, b0, c0);
+            
+#line 318
+            npyv_half abc1 = npyv_muladd_half(v_scalar, b1, c1);
+            
+#line 318
+            npyv_half abc2 = npyv_muladd_half(v_scalar, b2, c2);
+            
+#line 318
+            npyv_half abc3 = npyv_muladd_half(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_half(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_half(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_half(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_half(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_half a = npyv_load_tillz_half(data, count);
+        npyv_half b = npyv_load_tillz_half(data_out, count);
+        npyv_store_till_half(data_out, count, npyv_muladd_half(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* scalar path: this is the branch that is actually compiled */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { /* four elements per pass */
+        #line 340
+        const npy_float b0 = npy_half_to_float(data[0]);
+        const npy_float c0 = npy_half_to_float(data_out[0]);
+        
+#line 340
+        const npy_float b1 = npy_half_to_float(data[1]);
+        const npy_float c1 = npy_half_to_float(data_out[1]);
+        
+#line 340
+        const npy_float b2 = npy_half_to_float(data[2]);
+        const npy_float c2 = npy_half_to_float(data_out[2]);
+        
+#line 340
+        const npy_float b3 = npy_half_to_float(data[3]);
+        const npy_float c3 = npy_half_to_float(data_out[3]);
+        
+        #line 346
+        const npy_float abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_float abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_float abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_float abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = npy_float_to_half(abc0);
+        
+#line 351
+        data_out[1] = npy_float_to_half(abc1);
+        
+#line 351
+        data_out[2] = npy_float_to_half(abc2);
+        
+#line 351
+        data_out[3] = npy_float_to_half(abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* remainder, one at a time */
+        const npy_float b = npy_half_to_float(*data);
+        const npy_float c = npy_half_to_float(*data_out);
+        *data_out = npy_float_to_half(scalar * b + c);
+    }
+#endif // NPYV check for npy_half
+}
+/*
+ * Contiguous inner loop for two operands:
+ *     out[i] = half(float(a[i]) * float(b[i]) + float(out[i]))
+ * for i in [0, count), with a = dataptr[0], b = dataptr[1] and
+ * out = dataptr[2], all indexed as npy_half arrays.
+ *
+ * `strides` is unused and `nop` is not referenced.  The `#if 0` NPYV
+ * branch is compiled out; the scalar branch handles four elements per pass
+ * unless NPY_DISABLE_OPTIMIZATION is defined, then finishes any remainder
+ * one element at a time.
+ */
+static void
+half_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *data0 = (npy_half *)dataptr[0];
+    npy_half *data1 = (npy_half *)dataptr[1];
+    npy_half *data_out = (npy_half *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_half (constant 0: the SIMD path below is compiled out)
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_half;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_half a0 = npyv_loada_half(data0 + vstep * 0);
+            npyv_half b0 = npyv_loada_half(data1 + vstep * 0);
+            npyv_half c0 = npyv_loada_half(data_out + vstep * 0);
+            
+#line 390
+            npyv_half a1 = npyv_loada_half(data0 + vstep * 1);
+            npyv_half b1 = npyv_loada_half(data1 + vstep * 1);
+            npyv_half c1 = npyv_loada_half(data_out + vstep * 1);
+            
+#line 390
+            npyv_half a2 = npyv_loada_half(data0 + vstep * 2);
+            npyv_half b2 = npyv_loada_half(data1 + vstep * 2);
+            npyv_half c2 = npyv_loada_half(data_out + vstep * 2);
+            
+#line 390
+            npyv_half a3 = npyv_loada_half(data0 + vstep * 3);
+            npyv_half b3 = npyv_loada_half(data1 + vstep * 3);
+            npyv_half c3 = npyv_loada_half(data_out + vstep * 3);
+            
+            #line 397
+            npyv_half abc0 = npyv_muladd_half(a0, b0, c0);
+            
+#line 397
+            npyv_half abc1 = npyv_muladd_half(a1, b1, c1);
+            
+#line 397
+            npyv_half abc2 = npyv_muladd_half(a2, b2, c2);
+            
+#line 397
+            npyv_half abc3 = npyv_muladd_half(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_half(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_half(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_half(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_half(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_half a0 = npyv_load_half(data0 + vstep * 0);
+            npyv_half b0 = npyv_load_half(data1 + vstep * 0);
+            npyv_half c0 = npyv_load_half(data_out + vstep * 0);
+            
+#line 390
+            npyv_half a1 = npyv_load_half(data0 + vstep * 1);
+            npyv_half b1 = npyv_load_half(data1 + vstep * 1);
+            npyv_half c1 = npyv_load_half(data_out + vstep * 1);
+            
+#line 390
+            npyv_half a2 = npyv_load_half(data0 + vstep * 2);
+            npyv_half b2 = npyv_load_half(data1 + vstep * 2);
+            npyv_half c2 = npyv_load_half(data_out + vstep * 2);
+            
+#line 390
+            npyv_half a3 = npyv_load_half(data0 + vstep * 3);
+            npyv_half b3 = npyv_load_half(data1 + vstep * 3);
+            npyv_half c3 = npyv_load_half(data_out + vstep * 3);
+            
+            #line 397
+            npyv_half abc0 = npyv_muladd_half(a0, b0, c0);
+            
+#line 397
+            npyv_half abc1 = npyv_muladd_half(a1, b1, c1);
+            
+#line 397
+            npyv_half abc2 = npyv_muladd_half(a2, b2, c2);
+            
+#line 397
+            npyv_half abc3 = npyv_muladd_half(a3, b3, c3);
+            
+            #line 402
+            npyv_store_half(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_half(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_half(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_half(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_half a = npyv_load_tillz_half(data0, count);
+        npyv_half b = npyv_load_tillz_half(data1, count);
+        npyv_half c = npyv_load_tillz_half(data_out, count);
+        npyv_store_till_half(data_out, count, npyv_muladd_half(a, b, c));
+    }
+    npyv_cleanup();
+#else /* scalar path: this is the branch that is actually compiled */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { /* four elements per pass */
+        #line 420
+        const npy_float a0 = npy_half_to_float(data0[0]);
+        const npy_float b0 = npy_half_to_float(data1[0]);
+        const npy_float c0 = npy_half_to_float(data_out[0]);
+        
+#line 420
+        const npy_float a1 = npy_half_to_float(data0[1]);
+        const npy_float b1 = npy_half_to_float(data1[1]);
+        const npy_float c1 = npy_half_to_float(data_out[1]);
+        
+#line 420
+        const npy_float a2 = npy_half_to_float(data0[2]);
+        const npy_float b2 = npy_half_to_float(data1[2]);
+        const npy_float c2 = npy_half_to_float(data_out[2]);
+        
+#line 420
+        const npy_float a3 = npy_half_to_float(data0[3]);
+        const npy_float b3 = npy_half_to_float(data1[3]);
+        const npy_float c3 = npy_half_to_float(data_out[3]);
+        
+        #line 427
+        const npy_float abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_float abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_float abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_float abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = npy_float_to_half(abc0);
+        
+#line 432
+        data_out[1] = npy_float_to_half(abc1);
+        
+#line 432
+        data_out[2] = npy_float_to_half(abc2);
+        
+#line 432
+        data_out[3] = npy_float_to_half(abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* remainder, one at a time */
+        const npy_float a = npy_half_to_float(*data0);
+        const npy_float b = npy_half_to_float(*data1);
+        const npy_float c = npy_half_to_float(*data_out);
+        *data_out = npy_float_to_half(a * b + c);
+    }
+#endif // NPYV check for npy_half
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand specialization: operand 0 contributes a single half value
+ * that is read once, while operand 1 and the output are passed on as
+ * contiguous npy_half arrays.  Delegates to half_sum_of_products_muladd(),
+ * which adds operand0 * operand1[i] into out[i] for `count` elements.
+ */
+static void
+half_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Widen the lone operand-0 value first, before any output is written. */
+    const npy_float scalar = npy_half_to_float(*(npy_half *)dataptr[0]);
+    npy_half *vec = (npy_half *)dataptr[1];
+    npy_half *out = (npy_half *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+
+    half_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+/*
+ * Two-operand specialization: operand 0 and the output are passed on as
+ * contiguous npy_half arrays, while operand 1 contributes a single half
+ * value that is read once.  Delegates to half_sum_of_products_muladd(),
+ * which adds operand1 * operand0[i] into out[i] for `count` elements.
+ */
+static void
+half_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Widen the lone operand-1 value first, before any output is written. */
+    const npy_float scalar = npy_half_to_float(*(npy_half *)dataptr[1]);
+    npy_half *vec = (npy_half *)dataptr[0];
+    npy_half *out = (npy_half *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+
+    half_sum_of_products_muladd(vec, out, scalar, count);
+}
+/*
+ * Two-operand reduction into a single output element:
+ *     *out = half(float(*out) + sum_i float(a[i]) * float(b[i]))
+ * for i in [0, count), with a = dataptr[0] and b = dataptr[1] indexed as
+ * contiguous npy_half arrays and out = dataptr[2].
+ *
+ * Products are accumulated in an npy_float; the output element is read,
+ * updated and narrowed to half exactly once, after the loops.  `strides`
+ * is unused and `nop` is not referenced.  The `#if 0` NPYV branch is
+ * compiled out.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *data0 = (npy_half *)dataptr[0];
+    npy_half *data1 = (npy_half *)dataptr[1];
+    npy_float accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_half (constant 0: the SIMD path below is compiled out)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_half;
+    npyv_half v_accum = npyv_zero_half();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_half a0 = npyv_loada_half(data0 + vstep * 0);
+            npyv_half b0 = npyv_loada_half(data1 + vstep * 0);
+            
+#line 501
+            npyv_half a1 = npyv_loada_half(data0 + vstep * 1);
+            npyv_half b1 = npyv_loada_half(data1 + vstep * 1);
+            
+#line 501
+            npyv_half a2 = npyv_loada_half(data0 + vstep * 2);
+            npyv_half b2 = npyv_loada_half(data1 + vstep * 2);
+            
+#line 501
+            npyv_half a3 = npyv_loada_half(data0 + vstep * 3);
+            npyv_half b3 = npyv_loada_half(data1 + vstep * 3);
+            
+            npyv_half ab3 = npyv_muladd_half(a3, b3, v_accum);
+            npyv_half ab2 = npyv_muladd_half(a2, b2, ab3);
+            npyv_half ab1 = npyv_muladd_half(a1, b1, ab2);
+                   v_accum = npyv_muladd_half(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_half a0 = npyv_load_half(data0 + vstep * 0);
+            npyv_half b0 = npyv_load_half(data1 + vstep * 0);
+            
+#line 501
+            npyv_half a1 = npyv_load_half(data0 + vstep * 1);
+            npyv_half b1 = npyv_load_half(data1 + vstep * 1);
+            
+#line 501
+            npyv_half a2 = npyv_load_half(data0 + vstep * 2);
+            npyv_half b2 = npyv_load_half(data1 + vstep * 2);
+            
+#line 501
+            npyv_half a3 = npyv_load_half(data0 + vstep * 3);
+            npyv_half b3 = npyv_load_half(data1 + vstep * 3);
+            
+            npyv_half ab3 = npyv_muladd_half(a3, b3, v_accum);
+            npyv_half ab2 = npyv_muladd_half(a2, b2, ab3);
+            npyv_half ab1 = npyv_muladd_half(a1, b1, ab2);
+                   v_accum = npyv_muladd_half(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_half a = npyv_load_tillz_half(data0, count);
+        npyv_half b = npyv_load_tillz_half(data1, count);
+        v_accum = npyv_muladd_half(a, b, v_accum);
+    }
+    accum = npyv_sum_half(v_accum);
+    npyv_cleanup();
+#else /* scalar path: this is the branch that is actually compiled */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) { /* four products per pass */
+        #line 524
+        const npy_float ab0 = npy_half_to_float(data0[0]) * npy_half_to_float(data1[0]);
+        
+#line 524
+        const npy_float ab1 = npy_half_to_float(data0[1]) * npy_half_to_float(data1[1]);
+        
+#line 524
+        const npy_float ab2 = npy_half_to_float(data0[2]) * npy_half_to_float(data1[2]);
+        
+#line 524
+        const npy_float ab3 = npy_half_to_float(data0[3]) * npy_half_to_float(data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3; /* the four products are summed before touching accum */
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* remainder, one at a time */
+        const npy_float a = npy_half_to_float(*data0);
+        const npy_float b = npy_half_to_float(*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_half
+    *(npy_half *)dataptr[2] = npy_float_to_half(npy_half_to_float(*(npy_half *)dataptr[2]) + accum);
+}
+
+/*
+ * Two-operand reduction into a single output element where operand 0
+ * contributes one half value: the elements of operand 1 are summed with
+ * half_sum_of_arr(), the sum is scaled by the operand-0 value, and the
+ * product is added (in float) to the existing output before narrowing
+ * back to half.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *out = (npy_half *)dataptr[2];
+    /* Read the scalar operand before running the reduction. */
+    const npy_float scalar = npy_half_to_float(*(npy_half *)dataptr[0]);
+    const npy_float total = half_sum_of_arr((npy_half *)dataptr[1], count);
+
+    /* The output is only read here, after the reduction has finished. */
+    *out = npy_float_to_half(npy_half_to_float(*out) + scalar * total);
+}
+
+/*
+ * Two-operand kernel with a single output element.
+ *
+ * Operand 0 is reduced over `count` elements by half_sum_of_arr(),
+ * operand 1 is read once as a single value, and the product of the two is
+ * added to the value stored at dataptr[2].  The arithmetic is done in
+ * npy_float; only the final result is converted back to npy_half.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *out = (npy_half *)dataptr[2];
+    npy_half *series = (npy_half *)dataptr[0];
+    const npy_float scalar = npy_half_to_float(*(npy_half *)dataptr[1]);
+    const npy_float total = half_sum_of_arr(series, count);
+
+    /* Accumulate onto whatever the output element already holds */
+    *out = npy_float_to_half(npy_half_to_float(*out) + scalar * total);
+}
+
+#elif 1 == 3 && !0
+
+/*
+ * Three-operand kernel over contiguous npy_half arrays.
+ *
+ * For each of the `count` elements it evaluates
+ *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
+ * in npy_float and stores the result back as npy_half.  The inputs are
+ * dataptr[0..2] and the output is dataptr[3].
+ */
+static void
+half_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *in0 = (npy_half *)dataptr[0];
+    npy_half *in1 = (npy_half *)dataptr[1];
+    npy_half *in2 = (npy_half *)dataptr[2];
+    npy_half *out = (npy_half *)dataptr[3];
+    int k;
+
+    /* Whole groups of 8 elements, visited in ascending index order */
+    for (; count >= 8; count -= 8, in0 += 8, in1 += 8, in2 += 8, out += 8) {
+        for (k = 0; k < 8; ++k) {
+            out[k] = npy_float_to_half(npy_half_to_float(in0[k]) *
+                                       npy_half_to_float(in1[k]) *
+                                       npy_half_to_float(in2[k]) +
+                                       npy_half_to_float(out[k]));
+        }
+    }
+
+    /* Leftover elements: leave as soon as the remaining count hits zero */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] = npy_float_to_half(npy_half_to_float(in0[k]) *
+                                   npy_half_to_float(in1[k]) *
+                                   npy_half_to_float(in2[k]) +
+                                   npy_half_to_float(out[k]));
+    }
+}
+
+#else /* 1 > 3 || @complex */
+
+/*
+ * Generic contiguous kernel working directly on the dataptr array.
+ *
+ * For each of the `count` elements it multiplies the current npy_half
+ * values of operands 0..nop-1 (as npy_float), adds the current output
+ * value, stores the sum at dataptr[nop], and then advances every pointer
+ * in dataptr[0..nop] by sizeof(npy_half).  Note that dataptr itself is
+ * modified by this function.
+ */
+static void
+half_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    int op;
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+
+    for (; count != 0; --count) {
+        npy_float product = npy_half_to_float(*(npy_half *)dataptr[0]);
+
+        for (op = 1; op < nop; ++op) {
+            product *= npy_half_to_float(*(npy_half *)dataptr[op]);
+        }
+        /* op is left at nop here whenever nop >= 1, i.e. the output slot */
+        *(npy_half *)dataptr[nop] = npy_float_to_half(product +
+                                           npy_half_to_float(*(npy_half *)dataptr[op]));
+
+        /* Step all operands and the output to the next element */
+        for (op = 0; op <= nop; ++op) {
+            dataptr[op] += sizeof(npy_half);
+        }
+    }
+}
+
+#endif /* functions for various 1 */
+
+#if 1 == 1
+
+/*
+ * Single-operand kernel with one output element.
+ *
+ * Reduces the `count` npy_half values at dataptr[0] with half_sum_of_arr()
+ * and adds the npy_float total to the value stored at dataptr[1].
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    npy_half *out = (npy_half *)dataptr[1];
+    npy_float total;
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+
+    total = half_sum_of_arr((npy_half *)dataptr[0], count);
+    *out = npy_float_to_half(total + npy_half_to_float(*out));
+}
+
+#endif /* 1 == 1 */
+
+/*
+ * Single-operand strided kernel with one output element.
+ *
+ * Walks `count` npy_half values starting at dataptr[0], moving the byte
+ * pointer by strides[0] after each one, sums them in an npy_float
+ * accumulator, and adds that total to the value stored at dataptr[1].
+ */
+static void
+half_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    npy_float total = 0;
+    char *src = dataptr[0];
+    const npy_intp step = strides[0];
+    npy_half *out;
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+
+    for (; count != 0; --count, src += step) {
+        total += npy_half_to_float(*(npy_half *)src);
+    }
+
+    /* Fold the running total into the existing output value */
+    out = (npy_half *)dataptr[1];
+    *out = npy_float_to_half(total +
+                             npy_half_to_float(*out));
+
+}
+
+
+#line 131
+/*
+ * Two-operand strided kernel.
+ *
+ * For each of the `count` elements it evaluates
+ *     out = lhs * rhs + out
+ * in npy_float and stores the result as npy_half.  The operands come from
+ * dataptr[0] and dataptr[1], the output is dataptr[2], and each byte
+ * pointer is advanced by its matching entry in `strides` after every
+ * element.
+ */
+static void
+half_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    char *lhs = dataptr[0];
+    char *rhs = dataptr[1];
+    char *out = dataptr[2];
+    const npy_intp lhs_step = strides[0];
+    const npy_intp rhs_step = strides[1];
+    const npy_intp out_step = strides[2];
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_two (%d)\n", (int)count);
+
+    for (; count != 0; --count) {
+        *(npy_half *)out = npy_float_to_half(npy_half_to_float(*(npy_half *)lhs) *
+                                             npy_half_to_float(*(npy_half *)rhs) +
+                                             npy_half_to_float(*(npy_half *)out));
+        lhs += lhs_step;
+        rhs += rhs_step;
+        out += out_step;
+    }
+}
+
+#if 2 == 1
+
+/*
+ * Single-operand kernel over contiguous npy_half arrays.
+ *
+ * Adds each of the `count` elements at dataptr[0] to the matching element
+ * at dataptr[1], performing the addition in npy_float.
+ */
+static void
+half_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *src = (npy_half *)dataptr[0];
+    npy_half *dst = (npy_half *)dataptr[1];
+    npy_intp k;
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+    for (;;) {
+        /*
+         * Checked before the main loop so that small counts finish quickly.
+         * With fewer than 8 elements left, handle them from the highest
+         * index down to index 0 and leave.
+         */
+        if (count >= 0 && count < 8) {
+            for (k = count; k > 0; --k) {
+                dst[k - 1] = npy_float_to_half(npy_half_to_float(src[k - 1]) +
+                                               npy_half_to_float(dst[k - 1]));
+            }
+            return;
+        }
+
+        /* Whole groups of 8 elements, lowest index first */
+        while (count >= 8) {
+            for (k = 0; k < 8; ++k) {
+                dst[k] = npy_float_to_half(npy_half_to_float(src[k]) +
+                                           npy_half_to_float(dst[k]));
+            }
+            count -= 8;
+            src += 8;
+            dst += 8;
+        }
+    }
+}
+
+#elif 2 == 2 && !0
+
+/*
+ * Multiply-add helper: data_out[i] = scalar * data[i] + data_out[i] for
+ * each of the `count` elements.  Values are converted from npy_half to
+ * npy_float for the arithmetic and back to npy_half when stored.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_muladd(npy_half *data, npy_half *data_out, npy_float scalar, npy_intp count)
+{
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Groups of 4: read every input first, then compute, then write back */
+    while (count >= 4) {
+        const npy_float in0 = npy_half_to_float(data[0]);
+        const npy_float acc0 = npy_half_to_float(data_out[0]);
+        const npy_float in1 = npy_half_to_float(data[1]);
+        const npy_float acc1 = npy_half_to_float(data_out[1]);
+        const npy_float in2 = npy_half_to_float(data[2]);
+        const npy_float acc2 = npy_half_to_float(data_out[2]);
+        const npy_float in3 = npy_half_to_float(data[3]);
+        const npy_float acc3 = npy_half_to_float(data_out[3]);
+
+        const npy_float res0 = scalar * in0 + acc0;
+        const npy_float res1 = scalar * in1 + acc1;
+        const npy_float res2 = scalar * in2 + acc2;
+        const npy_float res3 = scalar * in3 + acc3;
+
+        data_out[0] = npy_float_to_half(res0);
+        data_out[1] = npy_float_to_half(res1);
+        data_out[2] = npy_float_to_half(res2);
+        data_out[3] = npy_float_to_half(res3);
+
+        count -= 4;
+        data += 4;
+        data_out += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One element at a time for whatever is left */
+    while (count > 0) {
+        const npy_float in = npy_half_to_float(*data);
+        const npy_float acc = npy_half_to_float(*data_out);
+
+        *data_out = npy_float_to_half(scalar * in + acc);
+        --count;
+        ++data;
+        ++data_out;
+    }
+}
+
+/*
+ * npy_half kernel for two contiguous inputs and a contiguous output.
+ *
+ * For each of the `count` elements it stores
+ *     out[i] = half(float(in0[i]) * float(in1[i]) + float(out[i]))
+ * with in0 = dataptr[0], in1 = dataptr[1] and out = dataptr[2].
+ * `nop` and `strides` are not read.
+ *
+ * The npyv_* branch is guarded by `#if 0`, so only the scalar `#else`
+ * branch is compiled.  Its 4-way unrolled loop is dropped when
+ * NPY_DISABLE_OPTIMIZATION is defined, leaving the one-element loop to
+ * process every element.
+ */
+static void
+half_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *data0 = (npy_half *)dataptr[0];
+    npy_half *data1 = (npy_half *)dataptr[1];
+    npy_half *data_out = (npy_half *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_half
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_half;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_half a0 = npyv_loada_half(data0 + vstep * 0);
+            npyv_half b0 = npyv_loada_half(data1 + vstep * 0);
+            npyv_half c0 = npyv_loada_half(data_out + vstep * 0);
+            
+#line 390
+            npyv_half a1 = npyv_loada_half(data0 + vstep * 1);
+            npyv_half b1 = npyv_loada_half(data1 + vstep * 1);
+            npyv_half c1 = npyv_loada_half(data_out + vstep * 1);
+            
+#line 390
+            npyv_half a2 = npyv_loada_half(data0 + vstep * 2);
+            npyv_half b2 = npyv_loada_half(data1 + vstep * 2);
+            npyv_half c2 = npyv_loada_half(data_out + vstep * 2);
+            
+#line 390
+            npyv_half a3 = npyv_loada_half(data0 + vstep * 3);
+            npyv_half b3 = npyv_loada_half(data1 + vstep * 3);
+            npyv_half c3 = npyv_loada_half(data_out + vstep * 3);
+            
+            #line 397
+            npyv_half abc0 = npyv_muladd_half(a0, b0, c0);
+            
+#line 397
+            npyv_half abc1 = npyv_muladd_half(a1, b1, c1);
+            
+#line 397
+            npyv_half abc2 = npyv_muladd_half(a2, b2, c2);
+            
+#line 397
+            npyv_half abc3 = npyv_muladd_half(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_half(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_half(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_half(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_half(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_half a0 = npyv_load_half(data0 + vstep * 0);
+            npyv_half b0 = npyv_load_half(data1 + vstep * 0);
+            npyv_half c0 = npyv_load_half(data_out + vstep * 0);
+            
+#line 390
+            npyv_half a1 = npyv_load_half(data0 + vstep * 1);
+            npyv_half b1 = npyv_load_half(data1 + vstep * 1);
+            npyv_half c1 = npyv_load_half(data_out + vstep * 1);
+            
+#line 390
+            npyv_half a2 = npyv_load_half(data0 + vstep * 2);
+            npyv_half b2 = npyv_load_half(data1 + vstep * 2);
+            npyv_half c2 = npyv_load_half(data_out + vstep * 2);
+            
+#line 390
+            npyv_half a3 = npyv_load_half(data0 + vstep * 3);
+            npyv_half b3 = npyv_load_half(data1 + vstep * 3);
+            npyv_half c3 = npyv_load_half(data_out + vstep * 3);
+            
+            #line 397
+            npyv_half abc0 = npyv_muladd_half(a0, b0, c0);
+            
+#line 397
+            npyv_half abc1 = npyv_muladd_half(a1, b1, c1);
+            
+#line 397
+            npyv_half abc2 = npyv_muladd_half(a2, b2, c2);
+            
+#line 397
+            npyv_half abc3 = npyv_muladd_half(a3, b3, c3);
+            
+            #line 402
+            npyv_store_half(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_half(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_half(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_half(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_half a = npyv_load_tillz_half(data0, count);
+        npyv_half b = npyv_load_tillz_half(data1, count);
+        npyv_half c = npyv_load_tillz_half(data_out, count);
+        npyv_store_till_half(data_out, count, npyv_muladd_half(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_float a0 = npy_half_to_float(data0[0]);
+        const npy_float b0 = npy_half_to_float(data1[0]);
+        const npy_float c0 = npy_half_to_float(data_out[0]);
+        
+#line 420
+        const npy_float a1 = npy_half_to_float(data0[1]);
+        const npy_float b1 = npy_half_to_float(data1[1]);
+        const npy_float c1 = npy_half_to_float(data_out[1]);
+        
+#line 420
+        const npy_float a2 = npy_half_to_float(data0[2]);
+        const npy_float b2 = npy_half_to_float(data1[2]);
+        const npy_float c2 = npy_half_to_float(data_out[2]);
+        
+#line 420
+        const npy_float a3 = npy_half_to_float(data0[3]);
+        const npy_float b3 = npy_half_to_float(data1[3]);
+        const npy_float c3 = npy_half_to_float(data_out[3]);
+        
+        #line 427
+        const npy_float abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_float abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_float abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_float abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = npy_float_to_half(abc0);
+        
+#line 432
+        data_out[1] = npy_float_to_half(abc1);
+        
+#line 432
+        data_out[2] = npy_float_to_half(abc2);
+        
+#line 432
+        data_out[3] = npy_float_to_half(abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One element per iteration: the leftover after the unrolled loop, or everything when that loop is compiled out. */
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_float a = npy_half_to_float(*data0);
+        const npy_float b = npy_half_to_float(*data1);
+        const npy_float c = npy_half_to_float(*data_out);
+        *data_out = npy_float_to_half(a * b + c);
+    }
+#endif // NPYV check for npy_half
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand npy_half kernel in which operand 0 is read once as a scalar
+ * while operand 1 (dataptr[1]) and the output (dataptr[2]) are walked as
+ * npy_half arrays.  The scalar is widened to npy_float and the element-wise
+ * work is delegated to half_sum_of_products_muladd(), which is defined
+ * elsewhere in this file (presumably out[i] += scalar * in[i] -- confirm
+ * against that helper).  `nop` and `strides` are not read.
+ */
+static void
+half_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_float scalar = npy_half_to_float(*(npy_half *)dataptr[0]);
+    npy_half *const vec = (npy_half *)dataptr[1];
+    npy_half *const out = (npy_half *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                          (int)count);
+
+    half_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+/*
+ * Two-operand npy_half kernel in which operand 1 is read once as a scalar
+ * while operand 0 (dataptr[0]) and the output (dataptr[2]) are walked as
+ * npy_half arrays.  The scalar is widened to npy_float and the element-wise
+ * work is delegated to half_sum_of_products_muladd(), which is defined
+ * elsewhere in this file (presumably out[i] += scalar * in[i] -- confirm
+ * against that helper).  `nop` and `strides` are not read.
+ */
+static void
+half_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_float scalar = npy_half_to_float(*(npy_half *)dataptr[1]);
+    npy_half *const vec = (npy_half *)dataptr[0];
+    npy_half *const out = (npy_half *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                          (int)count);
+
+    half_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+/*
+ * npy_half kernel for two contiguous inputs reduced into a single output
+ * element.
+ *
+ * Accumulates sum(float(in0[i]) * float(in1[i])) over `count` elements in an
+ * npy_float, then adds that sum to the npy_half stored at dataptr[2] and
+ * writes the result back as npy_half.  in0 = dataptr[0], in1 = dataptr[1];
+ * `nop` and `strides` are not read.
+ *
+ * The npyv_* branch is guarded by `#if 0`, so only the scalar `#else`
+ * branch is compiled.  Its 4-way unrolled loop is dropped when
+ * NPY_DISABLE_OPTIMIZATION is defined.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *data0 = (npy_half *)dataptr[0];
+    npy_half *data1 = (npy_half *)dataptr[1];
+    npy_float accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_half
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_half;
+    npyv_half v_accum = npyv_zero_half();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_half a0 = npyv_loada_half(data0 + vstep * 0);
+            npyv_half b0 = npyv_loada_half(data1 + vstep * 0);
+            
+#line 501
+            npyv_half a1 = npyv_loada_half(data0 + vstep * 1);
+            npyv_half b1 = npyv_loada_half(data1 + vstep * 1);
+            
+#line 501
+            npyv_half a2 = npyv_loada_half(data0 + vstep * 2);
+            npyv_half b2 = npyv_loada_half(data1 + vstep * 2);
+            
+#line 501
+            npyv_half a3 = npyv_loada_half(data0 + vstep * 3);
+            npyv_half b3 = npyv_loada_half(data1 + vstep * 3);
+            
+            npyv_half ab3 = npyv_muladd_half(a3, b3, v_accum);
+            npyv_half ab2 = npyv_muladd_half(a2, b2, ab3);
+            npyv_half ab1 = npyv_muladd_half(a1, b1, ab2);
+                   v_accum = npyv_muladd_half(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_half a0 = npyv_load_half(data0 + vstep * 0);
+            npyv_half b0 = npyv_load_half(data1 + vstep * 0);
+            
+#line 501
+            npyv_half a1 = npyv_load_half(data0 + vstep * 1);
+            npyv_half b1 = npyv_load_half(data1 + vstep * 1);
+            
+#line 501
+            npyv_half a2 = npyv_load_half(data0 + vstep * 2);
+            npyv_half b2 = npyv_load_half(data1 + vstep * 2);
+            
+#line 501
+            npyv_half a3 = npyv_load_half(data0 + vstep * 3);
+            npyv_half b3 = npyv_load_half(data1 + vstep * 3);
+            
+            npyv_half ab3 = npyv_muladd_half(a3, b3, v_accum);
+            npyv_half ab2 = npyv_muladd_half(a2, b2, ab3);
+            npyv_half ab1 = npyv_muladd_half(a1, b1, ab2);
+                   v_accum = npyv_muladd_half(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_half a = npyv_load_tillz_half(data0, count);
+        npyv_half b = npyv_load_tillz_half(data1, count);
+        v_accum = npyv_muladd_half(a, b, v_accum);
+    }
+    accum = npyv_sum_half(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_float ab0 = npy_half_to_float(data0[0]) * npy_half_to_float(data1[0]);
+        
+#line 524
+        const npy_float ab1 = npy_half_to_float(data0[1]) * npy_half_to_float(data1[1]);
+        
+#line 524
+        const npy_float ab2 = npy_half_to_float(data0[2]) * npy_half_to_float(data1[2]);
+        
+#line 524
+        const npy_float ab3 = npy_half_to_float(data0[3]) * npy_half_to_float(data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One product per iteration: the leftover after the unrolled loop, or everything when that loop is compiled out. */
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_float a = npy_half_to_float(*data0);
+        const npy_float b = npy_half_to_float(*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_half
+    /* Single read-modify-write of the output: the sum is rounded to npy_half only here. */
+    *(npy_half *)dataptr[2] = npy_float_to_half(npy_half_to_float(*(npy_half *)dataptr[2]) + accum);
+}
+
+/*
+ * Two-operand npy_half kernel reducing into one output element, with
+ * operand 0 read once as a scalar.  The npy_half array at dataptr[1] is
+ * passed to half_sum_of_arr() (defined elsewhere in this file) together with
+ * `count`; the returned npy_float is multiplied by the scalar and added to
+ * the npy_half stored at dataptr[2].  `nop` and `strides` are not read.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *const out = (npy_half *)dataptr[2];
+    const npy_float scalar = npy_half_to_float(*(npy_half *)dataptr[0]);
+    const npy_float total = half_sum_of_arr((npy_half *)dataptr[1], count);
+
+    /* Rounded back to npy_half only once, at the final store. */
+    *out = npy_float_to_half(npy_half_to_float(*out) + scalar * total);
+}
+
+/*
+ * Two-operand npy_half kernel reducing into one output element, with
+ * operand 1 read once as a scalar.  The npy_half array at dataptr[0] is
+ * passed to half_sum_of_arr() (defined elsewhere in this file) together with
+ * `count`; the returned npy_float is multiplied by the scalar and added to
+ * the npy_half stored at dataptr[2].  `nop` and `strides` are not read.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *const out = (npy_half *)dataptr[2];
+    npy_half *const vec = (npy_half *)dataptr[0];
+    const npy_float scalar = npy_half_to_float(*(npy_half *)dataptr[1]);
+    const npy_float total = half_sum_of_arr(vec, count);
+
+    /* Rounded back to npy_half only once, at the final store. */
+    *out = npy_float_to_half(npy_half_to_float(*out) + scalar * total);
+}
+
+#elif 2 == 3 && !0
+
+/*
+ * npy_half kernel for three contiguous inputs and a contiguous output:
+ *     out[i] = half(float(in0[i]) * float(in1[i]) * float(in2[i]) + float(out[i]))
+ * with in0..in2 = dataptr[0..2] and out = dataptr[3].  `nop` and `strides`
+ * are not read.
+ *
+ * Full groups of 8 elements go through the unrolled while loop; whatever is
+ * left is handled by the chain of `if (count-- == 0) return;` steps below it.
+ *
+ * NOTE: this definition sits under `#elif 2 == 3 && !0`, which evaluates to
+ * false in this expansion, so it is not compiled here.
+ */
+static void
+half_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *data0 = (npy_half *)dataptr[0];
+    npy_half *data1 = (npy_half *)dataptr[1];
+    npy_half *data2 = (npy_half *)dataptr[2];
+    npy_half *data_out = (npy_half *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = npy_float_to_half(npy_half_to_float(data0[0]) *
+                             npy_half_to_float(data1[0]) *
+                             npy_half_to_float(data2[0]) +
+                             npy_half_to_float(data_out[0]));
+
+#line 576
+        data_out[1] = npy_float_to_half(npy_half_to_float(data0[1]) *
+                             npy_half_to_float(data1[1]) *
+                             npy_half_to_float(data2[1]) +
+                             npy_half_to_float(data_out[1]));
+
+#line 576
+        data_out[2] = npy_float_to_half(npy_half_to_float(data0[2]) *
+                             npy_half_to_float(data1[2]) *
+                             npy_half_to_float(data2[2]) +
+                             npy_half_to_float(data_out[2]));
+
+#line 576
+        data_out[3] = npy_float_to_half(npy_half_to_float(data0[3]) *
+                             npy_half_to_float(data1[3]) *
+                             npy_half_to_float(data2[3]) +
+                             npy_half_to_float(data_out[3]));
+
+#line 576
+        data_out[4] = npy_float_to_half(npy_half_to_float(data0[4]) *
+                             npy_half_to_float(data1[4]) *
+                             npy_half_to_float(data2[4]) +
+                             npy_half_to_float(data_out[4]));
+
+#line 576
+        data_out[5] = npy_float_to_half(npy_half_to_float(data0[5]) *
+                             npy_half_to_float(data1[5]) *
+                             npy_half_to_float(data2[5]) +
+                             npy_half_to_float(data_out[5]));
+
+#line 576
+        data_out[6] = npy_float_to_half(npy_half_to_float(data0[6]) *
+                             npy_half_to_float(data1[6]) *
+                             npy_half_to_float(data2[6]) +
+                             npy_half_to_float(data_out[6]));
+
+#line 576
+        data_out[7] = npy_float_to_half(npy_half_to_float(data0[7]) *
+                             npy_half_to_float(data1[7]) *
+                             npy_half_to_float(data2[7]) +
+                             npy_half_to_float(data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    /* Each step below returns once the remaining count reaches zero, otherwise handles one more element. */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = npy_float_to_half(npy_half_to_float(data0[0]) *
+                         npy_half_to_float(data1[0]) *
+                         npy_half_to_float(data2[0]) +
+                         npy_half_to_float(data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = npy_float_to_half(npy_half_to_float(data0[1]) *
+                         npy_half_to_float(data1[1]) *
+                         npy_half_to_float(data2[1]) +
+                         npy_half_to_float(data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = npy_float_to_half(npy_half_to_float(data0[2]) *
+                         npy_half_to_float(data1[2]) *
+                         npy_half_to_float(data2[2]) +
+                         npy_half_to_float(data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = npy_float_to_half(npy_half_to_float(data0[3]) *
+                         npy_half_to_float(data1[3]) *
+                         npy_half_to_float(data2[3]) +
+                         npy_half_to_float(data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = npy_float_to_half(npy_half_to_float(data0[4]) *
+                         npy_half_to_float(data1[4]) *
+                         npy_half_to_float(data2[4]) +
+                         npy_half_to_float(data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = npy_float_to_half(npy_half_to_float(data0[5]) *
+                         npy_half_to_float(data1[5]) *
+                         npy_half_to_float(data2[5]) +
+                         npy_half_to_float(data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = npy_float_to_half(npy_half_to_float(data0[6]) *
+                         npy_half_to_float(data1[6]) *
+                         npy_half_to_float(data2[6]) +
+                         npy_half_to_float(data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = npy_float_to_half(npy_half_to_float(data0[7]) *
+                         npy_half_to_float(data1[7]) *
+                         npy_half_to_float(data2[7]) +
+                         npy_half_to_float(data_out[7]));
+
+}
+
+#else /* 2 > 3 || @complex */
+
+/*
+ * Generic contiguous npy_half kernel: for each of `count` steps, multiplies
+ * the npy_half values at dataptr[0..nop-1] (as npy_float), adds the value at
+ * dataptr[nop], stores the result back at dataptr[nop], and then advances
+ * every pointer in dataptr[0..nop] by sizeof(npy_half).  Note that the
+ * caller's dataptr array itself is modified.  `strides` is not read.
+ *
+ * Only the `#if !0` branch inside the loop is compiled; the complex-number
+ * `#else` branch is inactive.
+ *
+ * NOTE(review): this definition lives in the final `#else` of a conditional
+ * whose opening `#if` is outside this view; it presumably is not compiled in
+ * this expansion because a same-named specialised kernel is defined earlier
+ * in the chain -- confirm against the opening condition.
+ */
+static void
+half_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+        npy_float temp = npy_half_to_float(*(npy_half *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= npy_half_to_float(*(npy_half *)dataptr[i]);
+        }
+        /* i == nop after the loop above, so dataptr[i] is the output operand. */
+        *(npy_half *)dataptr[nop] = npy_float_to_half(temp +
+                                           npy_half_to_float(*(npy_half *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_half);
+        }
+#else /* complex */
+#  if 2 <= 3
+#    define _SUMPROD_NOP 2
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_half);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+
+/*
+ * Single-operand npy_half kernel reducing a contiguous array into one output
+ * element.  In the compiled `#if !0` branch the array at dataptr[0] is passed
+ * to half_sum_of_arr() (defined elsewhere in this file) with `count`, and the
+ * returned npy_float is added to the npy_half stored at dataptr[1].
+ * `nop` and `strides` are not read in that branch.
+ *
+ * The `#else` branch (separate real/imaginary accumulators) is inactive.
+ *
+ * NOTE: this definition sits under `#if 2 == 1`, which evaluates to false in
+ * this expansion, so it is not compiled here.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_half *data = (npy_half *)dataptr[0];
+    npy_float accum = half_sum_of_arr(data, count);
+    *((npy_half *)dataptr[1]) = npy_float_to_half(accum + npy_half_to_float(*((npy_half *)dataptr[1])));
+#else
+    npy_float accum_re = 0, accum_im = 0;
+    npy_float *data0 = (npy_float *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_float re01 = data0[0] + data0[2];
+        const npy_float re23 = data0[4] + data0[6];
+        const npy_float im13 = data0[1] + data0[3];
+        const npy_float im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_float *)dataptr[1])[0] += accum_re;
+    ((npy_float *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 2 == 1 */
+
+/*
+ * npy_half kernel for two strided inputs reduced into a single output
+ * element.
+ *
+ * In the branches that are compiled here (`#if !0` together with
+ * `# elif 2 == 2`), each of the `count` steps multiplies the npy_half values
+ * at data0 and data1 as npy_float, adds the product to `accum`, and advances
+ * the two char pointers by strides[0] and strides[1] bytes.  After the loop
+ * the accumulated sum is added to the npy_half at dataptr[2] and stored back.
+ * `nop` is not read in those branches.
+ *
+ * All other preprocessor branches (one/three/generic operand counts and the
+ * complex-number variants) are inactive in this expansion.
+ */
+static void
+half_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_float accum_re = 0, accum_im = 0;
+#else
+    npy_float accum = 0;
+#endif
+
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 2 == 1
+        accum += npy_half_to_float(*(npy_half *)data0);
+        data0 += stride0;
+#  elif 2 == 2
+        accum += npy_half_to_float(*(npy_half *)data0) *
+                 npy_half_to_float(*(npy_half *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 2 == 3
+        accum += npy_half_to_float(*(npy_half *)data0) *
+                 npy_half_to_float(*(npy_half *)data1) *
+                 npy_half_to_float(*(npy_half *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_float temp = npy_half_to_float(*(npy_half *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= npy_half_to_float(*(npy_half *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        accum_re += ((npy_float *)data0)[0];
+        accum_im += ((npy_float *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0
+#  if 2 <= 3
+    ((npy_float *)dataptr[2])[0] += accum_re;
+    ((npy_float *)dataptr[2])[1] += accum_im;
+#  else
+    ((npy_float *)dataptr[nop])[0] += accum_re;
+    ((npy_float *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 2 <= 3
+    /* The output is read and written exactly once, after the whole reduction. */
+    *((npy_half *)dataptr[2]) = npy_float_to_half(accum +
+                                    npy_half_to_float(*((npy_half *)dataptr[2])));
+#  else
+    *((npy_half *)dataptr[nop]) = npy_float_to_half(accum +
+                                    npy_half_to_float(*((npy_half *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+/*
+ * npy_half kernel for three strided inputs and a strided output.
+ *
+ * In the branches that are compiled here (`#if !0` together with
+ * `# elif 3 == 3`), each of the `count` steps stores
+ *     *out = half(float(*in0) * float(*in1) * float(*in2) + float(*out))
+ * and then advances the four char pointers by strides[0..3] bytes, where
+ * in0..in2 start at dataptr[0..2] and out starts at dataptr[3].
+ * `nop` is not read in those branches.
+ *
+ * All other preprocessor branches (one/two/generic operand counts and the
+ * complex-number variants) are inactive in this expansion.
+ */
+static void
+half_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data_out = dataptr[3];
+    npy_intp stride_out = strides[3];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_three (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 3 == 1
+        *(npy_half *)data_out = npy_float_to_half(npy_half_to_float(*(npy_half *)data0) +
+                                         npy_half_to_float(*(npy_half *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 3 == 2
+        *(npy_half *)data_out = npy_float_to_half(npy_half_to_float(*(npy_half *)data0) *
+                                         npy_half_to_float(*(npy_half *)data1) +
+                                         npy_half_to_float(*(npy_half *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 3 == 3
+        *(npy_half *)data_out = npy_float_to_half(npy_half_to_float(*(npy_half *)data0) *
+                                         npy_half_to_float(*(npy_half *)data1) *
+                                         npy_half_to_float(*(npy_half *)data2) +
+                                         npy_half_to_float(*(npy_half *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_float temp = npy_half_to_float(*(npy_half *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= npy_half_to_float(*(npy_half *)dataptr[i]);
+        }
+        *(npy_half *)dataptr[nop] = npy_float_to_half(temp +
+                                           npy_half_to_float(*(npy_half *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        ((npy_float *)data_out)[0] = ((npy_float *)data0)[0] +
+                                         ((npy_float *)data_out)[0];
+        ((npy_float *)data_out)[1] = ((npy_float *)data0)[1] +
+                                         ((npy_float *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 3 == 1
+
+static void
+half_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *data0 = (npy_half *)dataptr[0];
+    npy_half *data_out = (npy_half *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = npy_float_to_half(npy_half_to_float(data0[6]) +
+                                 npy_half_to_float(data_out[6]));
+#else
+            ((npy_float *)data_out + 2*6)[0] =
+                                    ((npy_float *)data0 + 2*6)[0] +
+                                    ((npy_float *)data_out + 2*6)[0];
+            ((npy_float *)data_out + 2*6)[1] =
+                                    ((npy_float *)data0 + 2*6)[1] +
+                                    ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = npy_float_to_half(npy_half_to_float(data0[5]) +
+                                 npy_half_to_float(data_out[5]));
+#else
+            ((npy_float *)data_out + 2*5)[0] =
+                                    ((npy_float *)data0 + 2*5)[0] +
+                                    ((npy_float *)data_out + 2*5)[0];
+            ((npy_float *)data_out + 2*5)[1] =
+                                    ((npy_float *)data0 + 2*5)[1] +
+                                    ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = npy_float_to_half(npy_half_to_float(data0[4]) +
+                                 npy_half_to_float(data_out[4]));
+#else
+            ((npy_float *)data_out + 2*4)[0] =
+                                    ((npy_float *)data0 + 2*4)[0] +
+                                    ((npy_float *)data_out + 2*4)[0];
+            ((npy_float *)data_out + 2*4)[1] =
+                                    ((npy_float *)data0 + 2*4)[1] +
+                                    ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = npy_float_to_half(npy_half_to_float(data0[3]) +
+                                 npy_half_to_float(data_out[3]));
+#else
+            ((npy_float *)data_out + 2*3)[0] =
+                                    ((npy_float *)data0 + 2*3)[0] +
+                                    ((npy_float *)data_out + 2*3)[0];
+            ((npy_float *)data_out + 2*3)[1] =
+                                    ((npy_float *)data0 + 2*3)[1] +
+                                    ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = npy_float_to_half(npy_half_to_float(data0[2]) +
+                                 npy_half_to_float(data_out[2]));
+#else
+            ((npy_float *)data_out + 2*2)[0] =
+                                    ((npy_float *)data0 + 2*2)[0] +
+                                    ((npy_float *)data_out + 2*2)[0];
+            ((npy_float *)data_out + 2*2)[1] =
+                                    ((npy_float *)data0 + 2*2)[1] +
+                                    ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = npy_float_to_half(npy_half_to_float(data0[1]) +
+                                 npy_half_to_float(data_out[1]));
+#else
+            ((npy_float *)data_out + 2*1)[0] =
+                                    ((npy_float *)data0 + 2*1)[0] +
+                                    ((npy_float *)data_out + 2*1)[0];
+            ((npy_float *)data_out + 2*1)[1] =
+                                    ((npy_float *)data0 + 2*1)[1] +
+                                    ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = npy_float_to_half(npy_half_to_float(data0[0]) +
+                                 npy_half_to_float(data_out[0]));
+#else
+            ((npy_float *)data_out + 2*0)[0] =
+                                    ((npy_float *)data0 + 2*0)[0] +
+                                    ((npy_float *)data_out + 2*0)[0];
+            ((npy_float *)data_out + 2*0)[1] =
+                                    ((npy_float *)data0 + 2*0)[1] +
+                                    ((npy_float *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = npy_float_to_half(npy_half_to_float(data0[0]) +
+                             npy_half_to_float(data_out[0]));
+#else /* complex */
+        ((npy_float *)data_out + 2*0)[0] =
+                                ((npy_float *)data0 + 2*0)[0] +
+                                ((npy_float *)data_out + 2*0)[0];
+        ((npy_float *)data_out + 2*0)[1] =
+                                ((npy_float *)data0 + 2*0)[1] +
+                                ((npy_float *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = npy_float_to_half(npy_half_to_float(data0[1]) +
+                             npy_half_to_float(data_out[1]));
+#else /* complex */
+        ((npy_float *)data_out + 2*1)[0] =
+                                ((npy_float *)data0 + 2*1)[0] +
+                                ((npy_float *)data_out + 2*1)[0];
+        ((npy_float *)data_out + 2*1)[1] =
+                                ((npy_float *)data0 + 2*1)[1] +
+                                ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = npy_float_to_half(npy_half_to_float(data0[2]) +
+                             npy_half_to_float(data_out[2]));
+#else /* complex */
+        ((npy_float *)data_out + 2*2)[0] =
+                                ((npy_float *)data0 + 2*2)[0] +
+                                ((npy_float *)data_out + 2*2)[0];
+        ((npy_float *)data_out + 2*2)[1] =
+                                ((npy_float *)data0 + 2*2)[1] +
+                                ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = npy_float_to_half(npy_half_to_float(data0[3]) +
+                             npy_half_to_float(data_out[3]));
+#else /* complex */
+        ((npy_float *)data_out + 2*3)[0] =
+                                ((npy_float *)data0 + 2*3)[0] +
+                                ((npy_float *)data_out + 2*3)[0];
+        ((npy_float *)data_out + 2*3)[1] =
+                                ((npy_float *)data0 + 2*3)[1] +
+                                ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = npy_float_to_half(npy_half_to_float(data0[4]) +
+                             npy_half_to_float(data_out[4]));
+#else /* complex */
+        ((npy_float *)data_out + 2*4)[0] =
+                                ((npy_float *)data0 + 2*4)[0] +
+                                ((npy_float *)data_out + 2*4)[0];
+        ((npy_float *)data_out + 2*4)[1] =
+                                ((npy_float *)data0 + 2*4)[1] +
+                                ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = npy_float_to_half(npy_half_to_float(data0[5]) +
+                             npy_half_to_float(data_out[5]));
+#else /* complex */
+        ((npy_float *)data_out + 2*5)[0] =
+                                ((npy_float *)data0 + 2*5)[0] +
+                                ((npy_float *)data_out + 2*5)[0];
+        ((npy_float *)data_out + 2*5)[1] =
+                                ((npy_float *)data0 + 2*5)[1] +
+                                ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = npy_float_to_half(npy_half_to_float(data0[6]) +
+                             npy_half_to_float(data_out[6]));
+#else /* complex */
+        ((npy_float *)data_out + 2*6)[0] =
+                                ((npy_float *)data0 + 2*6)[0] +
+                                ((npy_float *)data_out + 2*6)[0];
+        ((npy_float *)data_out + 2*6)[1] =
+                                ((npy_float *)data0 + 2*6)[1] +
+                                ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = npy_float_to_half(npy_half_to_float(data0[7]) +
+                             npy_half_to_float(data_out[7]));
+#else /* complex */
+        ((npy_float *)data_out + 2*7)[0] =
+                                ((npy_float *)data0 + 2*7)[0] +
+                                ((npy_float *)data_out + 2*7)[0];
+        ((npy_float *)data_out + 2*7)[1] =
+                                ((npy_float *)data0 + 2*7)[1] +
+                                ((npy_float *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 3 == 2 && !0
+
+/*
+ * Multiply-accumulate over two contiguous half-precision arrays:
+ *
+ *     data_out[i] = scalar * data[i] + data_out[i]      for 0 <= i < count
+ *
+ * Every element is widened with npy_half_to_float, combined in npy_float
+ * and narrowed back with npy_float_to_half.
+ *
+ *   data      elements that get scaled (only read here)
+ *   data_out  accumulator elements, updated in place
+ *   scalar    multiplier, already an npy_float
+ *   count     number of elements; nothing is touched when count <= 0
+ *
+ * The template's NPYV branch is compiled out for npy_half (its guard
+ * expands to `#if 0`), so only the scalar implementation is kept.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_muladd(npy_half *data, npy_half *data_out, npy_float scalar, npy_intp count)
+{
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Manually unrolled by four.  All eight loads are issued before the
+     * first store, exactly as in the generated original, so the result does
+     * not change even if the two buffers were to overlap.
+     */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        const npy_float b0 = npy_half_to_float(data[0]);
+        const npy_float c0 = npy_half_to_float(data_out[0]);
+        const npy_float b1 = npy_half_to_float(data[1]);
+        const npy_float c1 = npy_half_to_float(data_out[1]);
+        const npy_float b2 = npy_half_to_float(data[2]);
+        const npy_float c2 = npy_half_to_float(data_out[2]);
+        const npy_float b3 = npy_half_to_float(data[3]);
+        const npy_float c3 = npy_half_to_float(data_out[3]);
+
+        const npy_float abc0 = scalar * b0 + c0;
+        const npy_float abc1 = scalar * b1 + c1;
+        const npy_float abc2 = scalar * b2 + c2;
+        const npy_float abc3 = scalar * b3 + c3;
+
+        data_out[0] = npy_float_to_half(abc0);
+        data_out[1] = npy_float_to_half(abc1);
+        data_out[2] = npy_float_to_half(abc2);
+        data_out[3] = npy_float_to_half(abc3);
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining 0..3 elements (or all of them when unrolling is disabled) */
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_float b = npy_half_to_float(*data);
+        const npy_float c = npy_half_to_float(*data_out);
+        *data_out = npy_float_to_half(scalar * b + c);
+    }
+}
+
+/*
+ * Two-operand inner loop over contiguous half-precision data:
+ *
+ *     out[i] = a[i] * b[i] + out[i]      for 0 <= i < count
+ *
+ *   nop      not read by this function
+ *   dataptr  dataptr[0] and dataptr[1] are the operands, dataptr[2] is the
+ *            output that is accumulated into
+ *   strides  unused; elements are addressed as consecutive npy_half values
+ *   count    number of elements; nothing is touched when count <= 0
+ *
+ * The arithmetic is done in npy_float.  The template's NPYV branch is
+ * compiled out for npy_half (guard expands to `#if 0`), so only the scalar
+ * implementation is kept.
+ */
+static void
+half_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *data0 = (npy_half *)dataptr[0];
+    npy_half *data1 = (npy_half *)dataptr[1];
+    npy_half *data_out = (npy_half *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Manually unrolled by four.  Loads, arithmetic and stores stay in three
+     * separate phases, matching the generated original.
+     */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        const npy_float a0 = npy_half_to_float(data0[0]);
+        const npy_float b0 = npy_half_to_float(data1[0]);
+        const npy_float c0 = npy_half_to_float(data_out[0]);
+        const npy_float a1 = npy_half_to_float(data0[1]);
+        const npy_float b1 = npy_half_to_float(data1[1]);
+        const npy_float c1 = npy_half_to_float(data_out[1]);
+        const npy_float a2 = npy_half_to_float(data0[2]);
+        const npy_float b2 = npy_half_to_float(data1[2]);
+        const npy_float c2 = npy_half_to_float(data_out[2]);
+        const npy_float a3 = npy_half_to_float(data0[3]);
+        const npy_float b3 = npy_half_to_float(data1[3]);
+        const npy_float c3 = npy_half_to_float(data_out[3]);
+
+        const npy_float abc0 = a0 * b0 + c0;
+        const npy_float abc1 = a1 * b1 + c1;
+        const npy_float abc2 = a2 * b2 + c2;
+        const npy_float abc3 = a3 * b3 + c3;
+
+        data_out[0] = npy_float_to_half(abc0);
+        data_out[1] = npy_float_to_half(abc1);
+        data_out[2] = npy_float_to_half(abc2);
+        data_out[3] = npy_float_to_half(abc3);
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining 0..3 elements (or all of them when unrolling is disabled) */
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_float a = npy_half_to_float(*data0);
+        const npy_float b = npy_half_to_float(*data1);
+        const npy_float c = npy_half_to_float(*data_out);
+        *data_out = npy_float_to_half(a * b + c);
+    }
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand specialization in which operand 0 contributes one value:
+ * a single npy_half is read from dataptr[0], widened to npy_float, and then
+ * used to scale the contiguous elements at dataptr[1] while accumulating
+ * into the contiguous output at dataptr[2].  The element loop itself lives
+ * in half_sum_of_products_muladd.
+ */
+static void
+half_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_float fixed_operand = npy_half_to_float(*(npy_half *)dataptr[0]);
+    npy_half *contig_operand = (npy_half *)dataptr[1];
+    npy_half *accumulated = (npy_half *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    half_sum_of_products_muladd(contig_operand, accumulated, fixed_operand, count);
+}
+
+/*
+ * Two-operand specialization in which operand 1 contributes one value:
+ * a single npy_half is read from dataptr[1], widened to npy_float, and then
+ * used to scale the contiguous elements at dataptr[0] while accumulating
+ * into the contiguous output at dataptr[2].  The element loop itself lives
+ * in half_sum_of_products_muladd.
+ */
+static void
+half_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_float fixed_operand = npy_half_to_float(*(npy_half *)dataptr[1]);
+    npy_half *contig_operand = (npy_half *)dataptr[0];
+    npy_half *accumulated = (npy_half *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    half_sum_of_products_muladd(contig_operand, accumulated, fixed_operand, count);
+}
+
+/*
+ * Dot-product style reduction of two contiguous half-precision operands
+ * into one output element:
+ *
+ *     *out = *out + sum(a[i] * b[i])      for 0 <= i < count
+ *
+ *   nop      not read by this function
+ *   dataptr  dataptr[0] and dataptr[1] are the operands, dataptr[2] points
+ *            at the single npy_half that receives the result
+ *   strides  unused; operands are addressed as consecutive npy_half values
+ *   count    number of element pairs; with count <= 0 the output is simply
+ *            converted to npy_float and back with 0 added
+ *
+ * Products and the running sum are kept in npy_float; the output is read
+ * and written exactly once, after the loops.  The template's NPYV branch is
+ * compiled out for npy_half (guard expands to `#if 0`), so only the scalar
+ * implementation is kept.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *data0 = (npy_half *)dataptr[0];
+    npy_half *data1 = (npy_half *)dataptr[1];
+    npy_float accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Manually unrolled by four.  The four products are summed left to right
+     * before being added to the accumulator; that grouping is part of the
+     * floating-point result and must not be rearranged.
+     */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        const npy_float ab0 = npy_half_to_float(data0[0]) * npy_half_to_float(data1[0]);
+        const npy_float ab1 = npy_half_to_float(data0[1]) * npy_half_to_float(data1[1]);
+        const npy_float ab2 = npy_half_to_float(data0[2]) * npy_half_to_float(data1[2]);
+        const npy_float ab3 = npy_half_to_float(data0[3]) * npy_half_to_float(data1[3]);
+
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining 0..3 pairs (or all of them when unrolling is disabled) */
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_float a = npy_half_to_float(*data0);
+        const npy_float b = npy_half_to_float(*data1);
+        accum += a * b;
+    }
+    *(npy_half *)dataptr[2] = npy_float_to_half(npy_half_to_float(*(npy_half *)dataptr[2]) + accum);
+}
+
+/*
+ * Reduction with one value from operand 0: reads a single npy_half from
+ * dataptr[0], sums the `count` contiguous elements at dataptr[1] with
+ * half_sum_of_arr, and adds value * sum to the single output element at
+ * dataptr[2].  The combination is done in npy_float.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *result = (npy_half *)dataptr[2];
+    const npy_float factor = npy_half_to_float(*(npy_half *)dataptr[0]);
+    const npy_float total = half_sum_of_arr((npy_half *)dataptr[1], count);
+
+    *result = npy_float_to_half(npy_half_to_float(*result) + factor * total);
+}
+
+/*
+ * Reduction with one value from operand 1: sums the `count` contiguous
+ * elements at dataptr[0] with half_sum_of_arr, reads a single npy_half from
+ * dataptr[1], and adds value * sum to the single output element at
+ * dataptr[2].  The combination is done in npy_float.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *result = (npy_half *)dataptr[2];
+    const npy_float factor = npy_half_to_float(*(npy_half *)dataptr[1]);
+    const npy_float total = half_sum_of_arr((npy_half *)dataptr[0], count);
+
+    *result = npy_float_to_half(npy_half_to_float(*result) + factor * total);
+}
+
+#elif 3 == 3 && !0
+
+/*
+ * Three-operand inner loop over contiguous half-precision data:
+ *
+ *     out[i] = a[i] * b[i] * c[i] + out[i]
+ *
+ * dataptr[0..2] are the operands and dataptr[3] is the output.  Elements
+ * are handled strictly one after another (load, compute, store) in
+ * ascending index order, first in groups of eight and then one at a time.
+ */
+static void
+half_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+/* One element: widen the four inputs, multiply-add in npy_float, narrow. */
+#define HALF_CONTIG_THREE_STEP(k) \
+    dst[k] = npy_float_to_half(npy_half_to_float(src0[k]) * \
+                               npy_half_to_float(src1[k]) * \
+                               npy_half_to_float(src2[k]) + \
+                               npy_half_to_float(dst[k]))
+
+    npy_half *src0 = (npy_half *)dataptr[0];
+    npy_half *src1 = (npy_half *)dataptr[1];
+    npy_half *src2 = (npy_half *)dataptr[2];
+    npy_half *dst = (npy_half *)dataptr[3];
+    int k;
+
+    /* Full groups of eight, written out explicitly */
+    for (; count >= 8; count -= 8, src0 += 8, src1 += 8, src2 += 8, dst += 8) {
+        HALF_CONTIG_THREE_STEP(0);
+        HALF_CONTIG_THREE_STEP(1);
+        HALF_CONTIG_THREE_STEP(2);
+        HALF_CONTIG_THREE_STEP(3);
+        HALF_CONTIG_THREE_STEP(4);
+        HALF_CONTIG_THREE_STEP(5);
+        HALF_CONTIG_THREE_STEP(6);
+        HALF_CONTIG_THREE_STEP(7);
+    }
+
+    /*
+     * Leftovers: at most eight further steps, stopping as soon as the
+     * remaining count is used up.
+     */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        HALF_CONTIG_THREE_STEP(k);
+    }
+
+#undef HALF_CONTIG_THREE_STEP
+}
+
+#else /* 3 > 3 || @complex */
+
+/*
+ * Generic contiguous inner loop driven by `nop`.  For each of `count`
+ * elements it multiplies the current values of operands 0..nop-1, adds the
+ * product to the output slot, and then steps the pointers stored in
+ * `dataptr` forward by sizeof(npy_half).  Note that the caller's `dataptr`
+ * array is modified in place.
+ */
+static void
+half_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_three (%d)\n",
+                                                    (int)count);
+
+    for (; count != 0; --count) {
+#if !0
+        int op;
+        npy_float product = npy_half_to_float(*(npy_half *)dataptr[0]);
+
+        for (op = 1; op < nop; ++op) {
+            product *= npy_half_to_float(*(npy_half *)dataptr[op]);
+        }
+        /* The loop leaves `op` at the slot whose old value is added in. */
+        *(npy_half *)dataptr[nop] = npy_float_to_half(product +
+                                           npy_half_to_float(*(npy_half *)dataptr[op]));
+        /* Advance every operand pointer and the output pointer. */
+        for (op = 0; op <= nop; ++op) {
+            dataptr[op] += sizeof(npy_half);
+        }
+#else /* complex */
+#  if 3 <= 3
+#    define _SUMPROD_NOP 3
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        int op;
+        npy_float real, imag, scratch;
+
+        real = ((npy_float *)dataptr[0])[0];
+        imag = ((npy_float *)dataptr[0])[1];
+        /* Running complex product: (real, imag) *= operand[op] */
+        for (op = 1; op < _SUMPROD_NOP; ++op) {
+            scratch = real * ((npy_float *)dataptr[op])[0] -
+                      imag * ((npy_float *)dataptr[op])[1];
+            imag = real * ((npy_float *)dataptr[op])[1] +
+                   imag * ((npy_float *)dataptr[op])[0];
+            real = scratch;
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = real +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = imag +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+
+        for (op = 0; op <= _SUMPROD_NOP; ++op) {
+            dataptr[op] += sizeof(npy_half);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 3 */
+
+#if 3 == 1
+
+/*
+ * Single-operand reduction: adds the sum of the `count` contiguous
+ * elements at dataptr[0] to the single output element at dataptr[1].
+ * In the real-valued branch the summation itself is delegated to
+ * half_sum_of_arr and the final addition is done in npy_float.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_half *result = (npy_half *)dataptr[1];
+    const npy_float total = half_sum_of_arr((npy_half *)dataptr[0], count);
+
+    *result = npy_float_to_half(total + npy_half_to_float(*result));
+#else
+    npy_float sum_re = 0, sum_im = 0;
+    npy_float *src = (npy_float *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four (re, im) pairs per pass, summed pairwise before accumulating */
+    for (; count > 4; count -= 4, src += 4*2) {
+        const npy_float re_lo = src[0] + src[2];
+        const npy_float re_hi = src[4] + src[6];
+        const npy_float im_lo = src[1] + src[3];
+        const npy_float im_hi = src[5] + src[7];
+        sum_re += re_lo + re_hi;
+        sum_im += im_lo + im_hi;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, src += 2) {
+        sum_re += src[0];
+        sum_im += src[1];
+    }
+    ((npy_float *)dataptr[1])[0] += sum_re;
+    ((npy_float *)dataptr[1])[1] += sum_im;
+#endif // !0
+}
+
+#endif /* 3 == 1 */
+
+/*
+ * Strided three-operand reduction into a single output element:
+ *
+ *     *out = *out + sum(a[i] * b[i] * c[i])      for 0 <= i < count
+ *
+ *   nop      not read by this function
+ *   dataptr  dataptr[0..2] are the operand base pointers, dataptr[3] points
+ *            at the single npy_half that receives the result
+ *   strides  byte strides of the three operands (strides[0..2])
+ *   count    number of element triples to accumulate
+ *
+ * Products and the running sum are kept in npy_float; the output is read
+ * and written once, after the loop.  `dataptr` itself is not modified.
+ *
+ * In the generated original every preprocessor condition compared literal
+ * constants, so only this real-valued three-operand path was ever compiled;
+ * the other branches have been dropped.
+ */
+static void
+half_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    npy_float accum = 0;
+
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+        accum += npy_half_to_float(*(npy_half *)data0) *
+                 npy_half_to_float(*(npy_half *)data1) *
+                 npy_half_to_float(*(npy_half *)data2);
+        /* Strides are in bytes, hence the char * cursors. */
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+    }
+
+    *((npy_half *)dataptr[3]) = npy_float_to_half(accum +
+                                    npy_half_to_float(*((npy_half *)dataptr[3])));
+}
+
+
+#line 131
+/*
+ * Generic half-precision sum-of-products kernel for any number of operands.
+ *
+ * For each of the `count` iterations, the current elements of the `nop`
+ * input operands (dataptr[0] .. dataptr[nop-1]) are multiplied together in
+ * single precision, the current element of the output operand
+ * (dataptr[nop]) is added, and the result is stored back to the output as
+ * a half.  All nop+1 entries of `dataptr` are then advanced by the
+ * matching entry of `strides`, so `dataptr` is modified in place.
+ *
+ * In this expansion the operand-count constant is 1000 and the complex
+ * flag is 0, so only the final `#else` of the non-complex section is
+ * compiled; every other preprocessor branch below is inactive.
+ */
+static void
+half_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data_out = dataptr[1000];
+    npy_intp stride_out = strides[1000];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_any (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        *(npy_half *)data_out = npy_float_to_half(npy_half_to_float(*(npy_half *)data0) +
+                                         npy_half_to_float(*(npy_half *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1000 == 2
+        *(npy_half *)data_out = npy_float_to_half(npy_half_to_float(*(npy_half *)data0) *
+                                         npy_half_to_float(*(npy_half *)data1) +
+                                         npy_half_to_float(*(npy_half *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1000 == 3
+        *(npy_half *)data_out = npy_float_to_half(npy_half_to_float(*(npy_half *)data0) *
+                                         npy_half_to_float(*(npy_half *)data1) *
+                                         npy_half_to_float(*(npy_half *)data2) +
+                                         npy_half_to_float(*(npy_half *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        /* Product of all input operands, accumulated in single precision. */
+        npy_float temp = npy_half_to_float(*(npy_half *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= npy_half_to_float(*(npy_half *)dataptr[i]);
+        }
+        /*
+         * Add the product to the output operand.  The output is indexed as
+         * dataptr[nop] for both the load and the store, instead of relying
+         * on the loop variable having finished at nop.
+         */
+        *(npy_half *)dataptr[nop] = npy_float_to_half(temp +
+                                           npy_half_to_float(*(npy_half *)dataptr[nop]));
+        /* Step every operand, inputs and output alike (hence `<=`). */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        ((npy_float *)data_out)[0] = ((npy_float *)data0)[0] +
+                                         ((npy_float *)data_out)[0];
+        ((npy_float *)data_out)[1] = ((npy_float *)data0)[1] +
+                                         ((npy_float *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1000 == 1
+
+/*
+ * Single-operand kernel over densely indexed half arrays: each of the
+ * `count` elements of operand 0 (dataptr[0]) is added to the matching
+ * element of the output (dataptr[1]).  The sum is formed in single
+ * precision and converted back to half.  `strides` is not consulted.
+ *
+ * Control flow: the switch handles 0..7 remaining elements through
+ * fall-through cases and returns.  When count >= 8 no case matches, the
+ * unrolled loop consumes blocks of 8, and the goto re-enters the switch
+ * for the remainder.  `count` must be non-negative: a negative value
+ * matches no case and fails the loop test, so the goto would repeat
+ * forever.
+ *
+ * The `#else` halves of the `#if !0` blocks are the complex-number layout
+ * and are inactive here.  The whole function sits under `#if 1000 == 1`,
+ * which is false in this expansion, so it is not compiled.
+ */
+static void
+half_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *data0 = (npy_half *)dataptr[0];
+    npy_half *data_out = (npy_half *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+        /*
+         * Cases fall through on purpose: `case k+1` updates element k and
+         * continues down to element 0 before reaching `case 0`.
+         */
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = npy_float_to_half(npy_half_to_float(data0[6]) +
+                                 npy_half_to_float(data_out[6]));
+#else
+            ((npy_float *)data_out + 2*6)[0] =
+                                    ((npy_float *)data0 + 2*6)[0] +
+                                    ((npy_float *)data_out + 2*6)[0];
+            ((npy_float *)data_out + 2*6)[1] =
+                                    ((npy_float *)data0 + 2*6)[1] +
+                                    ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = npy_float_to_half(npy_half_to_float(data0[5]) +
+                                 npy_half_to_float(data_out[5]));
+#else
+            ((npy_float *)data_out + 2*5)[0] =
+                                    ((npy_float *)data0 + 2*5)[0] +
+                                    ((npy_float *)data_out + 2*5)[0];
+            ((npy_float *)data_out + 2*5)[1] =
+                                    ((npy_float *)data0 + 2*5)[1] +
+                                    ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = npy_float_to_half(npy_half_to_float(data0[4]) +
+                                 npy_half_to_float(data_out[4]));
+#else
+            ((npy_float *)data_out + 2*4)[0] =
+                                    ((npy_float *)data0 + 2*4)[0] +
+                                    ((npy_float *)data_out + 2*4)[0];
+            ((npy_float *)data_out + 2*4)[1] =
+                                    ((npy_float *)data0 + 2*4)[1] +
+                                    ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = npy_float_to_half(npy_half_to_float(data0[3]) +
+                                 npy_half_to_float(data_out[3]));
+#else
+            ((npy_float *)data_out + 2*3)[0] =
+                                    ((npy_float *)data0 + 2*3)[0] +
+                                    ((npy_float *)data_out + 2*3)[0];
+            ((npy_float *)data_out + 2*3)[1] =
+                                    ((npy_float *)data0 + 2*3)[1] +
+                                    ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = npy_float_to_half(npy_half_to_float(data0[2]) +
+                                 npy_half_to_float(data_out[2]));
+#else
+            ((npy_float *)data_out + 2*2)[0] =
+                                    ((npy_float *)data0 + 2*2)[0] +
+                                    ((npy_float *)data_out + 2*2)[0];
+            ((npy_float *)data_out + 2*2)[1] =
+                                    ((npy_float *)data0 + 2*2)[1] +
+                                    ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = npy_float_to_half(npy_half_to_float(data0[1]) +
+                                 npy_half_to_float(data_out[1]));
+#else
+            ((npy_float *)data_out + 2*1)[0] =
+                                    ((npy_float *)data0 + 2*1)[0] +
+                                    ((npy_float *)data_out + 2*1)[0];
+            ((npy_float *)data_out + 2*1)[1] =
+                                    ((npy_float *)data0 + 2*1)[1] +
+                                    ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = npy_float_to_half(npy_half_to_float(data0[0]) +
+                                 npy_half_to_float(data_out[0]));
+#else
+            ((npy_float *)data_out + 2*0)[0] =
+                                    ((npy_float *)data0 + 2*0)[0] +
+                                    ((npy_float *)data_out + 2*0)[0];
+            ((npy_float *)data_out + 2*0)[1] =
+                                    ((npy_float *)data0 + 2*0)[1] +
+                                    ((npy_float *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = npy_float_to_half(npy_half_to_float(data0[0]) +
+                             npy_half_to_float(data_out[0]));
+#else /* complex */
+        ((npy_float *)data_out + 2*0)[0] =
+                                ((npy_float *)data0 + 2*0)[0] +
+                                ((npy_float *)data_out + 2*0)[0];
+        ((npy_float *)data_out + 2*0)[1] =
+                                ((npy_float *)data0 + 2*0)[1] +
+                                ((npy_float *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = npy_float_to_half(npy_half_to_float(data0[1]) +
+                             npy_half_to_float(data_out[1]));
+#else /* complex */
+        ((npy_float *)data_out + 2*1)[0] =
+                                ((npy_float *)data0 + 2*1)[0] +
+                                ((npy_float *)data_out + 2*1)[0];
+        ((npy_float *)data_out + 2*1)[1] =
+                                ((npy_float *)data0 + 2*1)[1] +
+                                ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = npy_float_to_half(npy_half_to_float(data0[2]) +
+                             npy_half_to_float(data_out[2]));
+#else /* complex */
+        ((npy_float *)data_out + 2*2)[0] =
+                                ((npy_float *)data0 + 2*2)[0] +
+                                ((npy_float *)data_out + 2*2)[0];
+        ((npy_float *)data_out + 2*2)[1] =
+                                ((npy_float *)data0 + 2*2)[1] +
+                                ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = npy_float_to_half(npy_half_to_float(data0[3]) +
+                             npy_half_to_float(data_out[3]));
+#else /* complex */
+        ((npy_float *)data_out + 2*3)[0] =
+                                ((npy_float *)data0 + 2*3)[0] +
+                                ((npy_float *)data_out + 2*3)[0];
+        ((npy_float *)data_out + 2*3)[1] =
+                                ((npy_float *)data0 + 2*3)[1] +
+                                ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = npy_float_to_half(npy_half_to_float(data0[4]) +
+                             npy_half_to_float(data_out[4]));
+#else /* complex */
+        ((npy_float *)data_out + 2*4)[0] =
+                                ((npy_float *)data0 + 2*4)[0] +
+                                ((npy_float *)data_out + 2*4)[0];
+        ((npy_float *)data_out + 2*4)[1] =
+                                ((npy_float *)data0 + 2*4)[1] +
+                                ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = npy_float_to_half(npy_half_to_float(data0[5]) +
+                             npy_half_to_float(data_out[5]));
+#else /* complex */
+        ((npy_float *)data_out + 2*5)[0] =
+                                ((npy_float *)data0 + 2*5)[0] +
+                                ((npy_float *)data_out + 2*5)[0];
+        ((npy_float *)data_out + 2*5)[1] =
+                                ((npy_float *)data0 + 2*5)[1] +
+                                ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = npy_float_to_half(npy_half_to_float(data0[6]) +
+                             npy_half_to_float(data_out[6]));
+#else /* complex */
+        ((npy_float *)data_out + 2*6)[0] =
+                                ((npy_float *)data0 + 2*6)[0] +
+                                ((npy_float *)data_out + 2*6)[0];
+        ((npy_float *)data_out + 2*6)[1] =
+                                ((npy_float *)data0 + 2*6)[1] +
+                                ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = npy_float_to_half(npy_half_to_float(data0[7]) +
+                             npy_half_to_float(data_out[7]));
+#else /* complex */
+        ((npy_float *)data_out + 2*7)[0] =
+                                ((npy_float *)data0 + 2*7)[0] +
+                                ((npy_float *)data_out + 2*7)[0];
+        ((npy_float *)data_out + 2*7)[1] =
+                                ((npy_float *)data0 + 2*7)[1] +
+                                ((npy_float *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1000 == 2 && !0
+
+/*
+ * Multiply-add helper: for each of the `count` consecutive elements,
+ *     data_out[i] = scalar * data[i] + data_out[i]
+ * Operands are widened from half to float for the arithmetic and the
+ * result is narrowed back to half on store.  `data` and `data_out` are
+ * indexed as dense npy_half arrays.
+ *
+ * The NPYV (SIMD) implementation is disabled by `#if 0`, so the scalar
+ * path after the matching `#else` is the one that would be built: a loop
+ * unrolled 4 times (left out when NPY_DISABLE_OPTIMIZATION is defined)
+ * followed by a loop that handles the remaining elements one at a time.
+ *
+ * This function sits under `#elif 1000 == 2 && !0`, which is false in this
+ * expansion, so it is not compiled.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_muladd(npy_half *data, npy_half *data_out, npy_float scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_half
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_half;
+    const npyv_half v_scalar = npyv_setall_half(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_half b0 = npyv_loada_half(data + vstep * 0);
+            npyv_half c0 = npyv_loada_half(data_out + vstep * 0);
+            
+#line 312
+            npyv_half b1 = npyv_loada_half(data + vstep * 1);
+            npyv_half c1 = npyv_loada_half(data_out + vstep * 1);
+            
+#line 312
+            npyv_half b2 = npyv_loada_half(data + vstep * 2);
+            npyv_half c2 = npyv_loada_half(data_out + vstep * 2);
+            
+#line 312
+            npyv_half b3 = npyv_loada_half(data + vstep * 3);
+            npyv_half c3 = npyv_loada_half(data_out + vstep * 3);
+            
+            #line 318
+            npyv_half abc0 = npyv_muladd_half(v_scalar, b0, c0);
+            
+#line 318
+            npyv_half abc1 = npyv_muladd_half(v_scalar, b1, c1);
+            
+#line 318
+            npyv_half abc2 = npyv_muladd_half(v_scalar, b2, c2);
+            
+#line 318
+            npyv_half abc3 = npyv_muladd_half(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_half(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_half(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_half(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_half(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_half b0 = npyv_load_half(data + vstep * 0);
+            npyv_half c0 = npyv_load_half(data_out + vstep * 0);
+            
+#line 312
+            npyv_half b1 = npyv_load_half(data + vstep * 1);
+            npyv_half c1 = npyv_load_half(data_out + vstep * 1);
+            
+#line 312
+            npyv_half b2 = npyv_load_half(data + vstep * 2);
+            npyv_half c2 = npyv_load_half(data_out + vstep * 2);
+            
+#line 312
+            npyv_half b3 = npyv_load_half(data + vstep * 3);
+            npyv_half c3 = npyv_load_half(data_out + vstep * 3);
+            
+            #line 318
+            npyv_half abc0 = npyv_muladd_half(v_scalar, b0, c0);
+            
+#line 318
+            npyv_half abc1 = npyv_muladd_half(v_scalar, b1, c1);
+            
+#line 318
+            npyv_half abc2 = npyv_muladd_half(v_scalar, b2, c2);
+            
+#line 318
+            npyv_half abc3 = npyv_muladd_half(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_half(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_half(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_half(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_half(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_half a = npyv_load_tillz_half(data, count);
+        npyv_half b = npyv_load_tillz_half(data_out, count);
+        npyv_store_till_half(data_out, count, npyv_muladd_half(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Scalar path, 4 elements per iteration: load all, compute, then store. */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_float b0 = npy_half_to_float(data[0]);
+        const npy_float c0 = npy_half_to_float(data_out[0]);
+        
+#line 340
+        const npy_float b1 = npy_half_to_float(data[1]);
+        const npy_float c1 = npy_half_to_float(data_out[1]);
+        
+#line 340
+        const npy_float b2 = npy_half_to_float(data[2]);
+        const npy_float c2 = npy_half_to_float(data_out[2]);
+        
+#line 340
+        const npy_float b3 = npy_half_to_float(data[3]);
+        const npy_float c3 = npy_half_to_float(data_out[3]);
+        
+        #line 346
+        const npy_float abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_float abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_float abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_float abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = npy_float_to_half(abc0);
+        
+#line 351
+        data_out[1] = npy_float_to_half(abc1);
+        
+#line 351
+        data_out[2] = npy_float_to_half(abc2);
+        
+#line 351
+        data_out[3] = npy_float_to_half(abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining elements (all of them when the unrolled loop is omitted). */
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_float b = npy_half_to_float(*data);
+        const npy_float c = npy_half_to_float(*data_out);
+        *data_out = npy_float_to_half(scalar * b + c);
+    }
+#endif // NPYV check for npy_half
+}
+
+/*
+ * Two-operand kernel over densely indexed half arrays: for each of the
+ * `count` elements,
+ *     out[i] = in0[i] * in1[i] + out[i]
+ * with in0 = dataptr[0], in1 = dataptr[1], out = dataptr[2].  The
+ * arithmetic is done in single precision and the result is converted back
+ * to half.  `strides` is not consulted.
+ *
+ * The NPYV (SIMD) implementation is disabled by `#if 0`; the scalar path
+ * after the matching `#else` is a loop unrolled 4 times (left out when
+ * NPY_DISABLE_OPTIMIZATION is defined) followed by a one-at-a-time loop
+ * for the remaining elements.
+ *
+ * This function sits under `#elif 1000 == 2 && !0`, which is false in this
+ * expansion, so it is not compiled.
+ */
+static void
+half_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *data0 = (npy_half *)dataptr[0];
+    npy_half *data1 = (npy_half *)dataptr[1];
+    npy_half *data_out = (npy_half *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_half
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_half;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_half a0 = npyv_loada_half(data0 + vstep * 0);
+            npyv_half b0 = npyv_loada_half(data1 + vstep * 0);
+            npyv_half c0 = npyv_loada_half(data_out + vstep * 0);
+            
+#line 390
+            npyv_half a1 = npyv_loada_half(data0 + vstep * 1);
+            npyv_half b1 = npyv_loada_half(data1 + vstep * 1);
+            npyv_half c1 = npyv_loada_half(data_out + vstep * 1);
+            
+#line 390
+            npyv_half a2 = npyv_loada_half(data0 + vstep * 2);
+            npyv_half b2 = npyv_loada_half(data1 + vstep * 2);
+            npyv_half c2 = npyv_loada_half(data_out + vstep * 2);
+            
+#line 390
+            npyv_half a3 = npyv_loada_half(data0 + vstep * 3);
+            npyv_half b3 = npyv_loada_half(data1 + vstep * 3);
+            npyv_half c3 = npyv_loada_half(data_out + vstep * 3);
+            
+            #line 397
+            npyv_half abc0 = npyv_muladd_half(a0, b0, c0);
+            
+#line 397
+            npyv_half abc1 = npyv_muladd_half(a1, b1, c1);
+            
+#line 397
+            npyv_half abc2 = npyv_muladd_half(a2, b2, c2);
+            
+#line 397
+            npyv_half abc3 = npyv_muladd_half(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_half(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_half(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_half(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_half(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_half a0 = npyv_load_half(data0 + vstep * 0);
+            npyv_half b0 = npyv_load_half(data1 + vstep * 0);
+            npyv_half c0 = npyv_load_half(data_out + vstep * 0);
+            
+#line 390
+            npyv_half a1 = npyv_load_half(data0 + vstep * 1);
+            npyv_half b1 = npyv_load_half(data1 + vstep * 1);
+            npyv_half c1 = npyv_load_half(data_out + vstep * 1);
+            
+#line 390
+            npyv_half a2 = npyv_load_half(data0 + vstep * 2);
+            npyv_half b2 = npyv_load_half(data1 + vstep * 2);
+            npyv_half c2 = npyv_load_half(data_out + vstep * 2);
+            
+#line 390
+            npyv_half a3 = npyv_load_half(data0 + vstep * 3);
+            npyv_half b3 = npyv_load_half(data1 + vstep * 3);
+            npyv_half c3 = npyv_load_half(data_out + vstep * 3);
+            
+            #line 397
+            npyv_half abc0 = npyv_muladd_half(a0, b0, c0);
+            
+#line 397
+            npyv_half abc1 = npyv_muladd_half(a1, b1, c1);
+            
+#line 397
+            npyv_half abc2 = npyv_muladd_half(a2, b2, c2);
+            
+#line 397
+            npyv_half abc3 = npyv_muladd_half(a3, b3, c3);
+            
+            #line 402
+            npyv_store_half(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_half(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_half(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_half(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_half a = npyv_load_tillz_half(data0, count);
+        npyv_half b = npyv_load_tillz_half(data1, count);
+        npyv_half c = npyv_load_tillz_half(data_out, count);
+        npyv_store_till_half(data_out, count, npyv_muladd_half(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Scalar path, 4 elements per iteration: load all, compute, then store. */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_float a0 = npy_half_to_float(data0[0]);
+        const npy_float b0 = npy_half_to_float(data1[0]);
+        const npy_float c0 = npy_half_to_float(data_out[0]);
+        
+#line 420
+        const npy_float a1 = npy_half_to_float(data0[1]);
+        const npy_float b1 = npy_half_to_float(data1[1]);
+        const npy_float c1 = npy_half_to_float(data_out[1]);
+        
+#line 420
+        const npy_float a2 = npy_half_to_float(data0[2]);
+        const npy_float b2 = npy_half_to_float(data1[2]);
+        const npy_float c2 = npy_half_to_float(data_out[2]);
+        
+#line 420
+        const npy_float a3 = npy_half_to_float(data0[3]);
+        const npy_float b3 = npy_half_to_float(data1[3]);
+        const npy_float c3 = npy_half_to_float(data_out[3]);
+        
+        #line 427
+        const npy_float abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_float abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_float abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_float abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = npy_float_to_half(abc0);
+        
+#line 432
+        data_out[1] = npy_float_to_half(abc1);
+        
+#line 432
+        data_out[2] = npy_float_to_half(abc2);
+        
+#line 432
+        data_out[3] = npy_float_to_half(abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining elements (all of them when the unrolled loop is omitted). */
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_float a = npy_half_to_float(*data0);
+        const npy_float b = npy_half_to_float(*data1);
+        const npy_float c = npy_half_to_float(*data_out);
+        *data_out = npy_float_to_half(a * b + c);
+    }
+#endif // NPYV check for npy_half
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand case where operand 0 supplies a single value: that value is
+ * read once from dataptr[0], widened to float, and handed to
+ * half_sum_of_products_muladd() as the scalar, with dataptr[1] as the
+ * input array and dataptr[2] as the array that is accumulated into.
+ */
+static void
+half_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_half *scalar_src = (npy_half *)dataptr[0];
+    npy_float scalar = npy_half_to_float(*scalar_src);
+    npy_half *vec = (npy_half *)dataptr[1];
+    npy_half *out = (npy_half *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                          (int)count);
+
+    /* out[i] = scalar * vec[i] + out[i] for i in [0, count) */
+    half_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+/*
+ * Two-operand case where operand 1 supplies a single value: that value is
+ * read once from dataptr[1], widened to float, and handed to
+ * half_sum_of_products_muladd() as the scalar, with dataptr[0] as the
+ * input array and dataptr[2] as the array that is accumulated into.
+ */
+static void
+half_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_half *scalar_src = (npy_half *)dataptr[1];
+    npy_float scalar = npy_half_to_float(*scalar_src);
+    npy_half *vec = (npy_half *)dataptr[0];
+    npy_half *out = (npy_half *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                          (int)count);
+
+    /* out[i] = scalar * vec[i] + out[i] for i in [0, count) */
+    half_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+/*
+ * Two-operand reduction to a single output value: accumulates
+ *     sum over i of in0[i] * in1[i]      (i in [0, count))
+ * in a float, with in0 = dataptr[0] and in1 = dataptr[1] indexed as dense
+ * half arrays, then adds that total to the half stored at dataptr[2] and
+ * writes the result back as a half.  `strides` is not consulted.
+ *
+ * The NPYV (SIMD) implementation is disabled by `#if 0`; the scalar path
+ * after the matching `#else` is a loop unrolled 4 times (left out when
+ * NPY_DISABLE_OPTIMIZATION is defined) followed by a one-at-a-time loop
+ * for the remaining elements.
+ *
+ * This function sits under `#elif 1000 == 2 && !0`, which is false in this
+ * expansion, so it is not compiled.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *data0 = (npy_half *)dataptr[0];
+    npy_half *data1 = (npy_half *)dataptr[1];
+    npy_float accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_half
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_half;
+    npyv_half v_accum = npyv_zero_half();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_half a0 = npyv_loada_half(data0 + vstep * 0);
+            npyv_half b0 = npyv_loada_half(data1 + vstep * 0);
+            
+#line 501
+            npyv_half a1 = npyv_loada_half(data0 + vstep * 1);
+            npyv_half b1 = npyv_loada_half(data1 + vstep * 1);
+            
+#line 501
+            npyv_half a2 = npyv_loada_half(data0 + vstep * 2);
+            npyv_half b2 = npyv_loada_half(data1 + vstep * 2);
+            
+#line 501
+            npyv_half a3 = npyv_loada_half(data0 + vstep * 3);
+            npyv_half b3 = npyv_loada_half(data1 + vstep * 3);
+            
+            npyv_half ab3 = npyv_muladd_half(a3, b3, v_accum);
+            npyv_half ab2 = npyv_muladd_half(a2, b2, ab3);
+            npyv_half ab1 = npyv_muladd_half(a1, b1, ab2);
+                   v_accum = npyv_muladd_half(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_half a0 = npyv_load_half(data0 + vstep * 0);
+            npyv_half b0 = npyv_load_half(data1 + vstep * 0);
+            
+#line 501
+            npyv_half a1 = npyv_load_half(data0 + vstep * 1);
+            npyv_half b1 = npyv_load_half(data1 + vstep * 1);
+            
+#line 501
+            npyv_half a2 = npyv_load_half(data0 + vstep * 2);
+            npyv_half b2 = npyv_load_half(data1 + vstep * 2);
+            
+#line 501
+            npyv_half a3 = npyv_load_half(data0 + vstep * 3);
+            npyv_half b3 = npyv_load_half(data1 + vstep * 3);
+            
+            npyv_half ab3 = npyv_muladd_half(a3, b3, v_accum);
+            npyv_half ab2 = npyv_muladd_half(a2, b2, ab3);
+            npyv_half ab1 = npyv_muladd_half(a1, b1, ab2);
+                   v_accum = npyv_muladd_half(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_half a = npyv_load_tillz_half(data0, count);
+        npyv_half b = npyv_load_tillz_half(data1, count);
+        v_accum = npyv_muladd_half(a, b, v_accum);
+    }
+    accum = npyv_sum_half(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Scalar path: four products per iteration, summed into accum. */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_float ab0 = npy_half_to_float(data0[0]) * npy_half_to_float(data1[0]);
+        
+#line 524
+        const npy_float ab1 = npy_half_to_float(data0[1]) * npy_half_to_float(data1[1]);
+        
+#line 524
+        const npy_float ab2 = npy_half_to_float(data0[2]) * npy_half_to_float(data1[2]);
+        
+#line 524
+        const npy_float ab3 = npy_half_to_float(data0[3]) * npy_half_to_float(data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining elements (all of them when the unrolled loop is omitted). */
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_float a = npy_half_to_float(*data0);
+        const npy_float b = npy_half_to_float(*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_half
+    /* Fold the float total into the single half output value. */
+    *(npy_half *)dataptr[2] = npy_float_to_half(npy_half_to_float(*(npy_half *)dataptr[2]) + accum);
+}
+
+/*
+ * Two-operand reduction where operand 0 supplies a single value: the half
+ * at dataptr[2] is replaced by itself plus (value at dataptr[0]) times the
+ * result of half_sum_of_arr() over the `count` elements at dataptr[1].
+ * half_sum_of_arr() is defined elsewhere; presumably it returns the float
+ * sum of the array -- TODO confirm.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *out = (npy_half *)dataptr[2];
+    const npy_float scale = npy_half_to_float(*(npy_half *)dataptr[0]);
+    const npy_float total = half_sum_of_arr((npy_half *)dataptr[1], count);
+    const npy_float previous = npy_half_to_float(*out);
+
+    *out = npy_float_to_half(previous + scale * total);
+}
+
+/*
+ * Two-operand reduction where operand 1 supplies a single value: the half
+ * at dataptr[2] is replaced by itself plus (value at dataptr[1]) times the
+ * result of half_sum_of_arr() over the `count` elements at dataptr[0].
+ * half_sum_of_arr() is defined elsewhere; presumably it returns the float
+ * sum of the array -- TODO confirm.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *out = (npy_half *)dataptr[2];
+    const npy_float scale = npy_half_to_float(*(npy_half *)dataptr[1]);
+    const npy_float total = half_sum_of_arr((npy_half *)dataptr[0], count);
+    const npy_float previous = npy_half_to_float(*out);
+
+    *out = npy_float_to_half(previous + scale * total);
+}
+
+#elif 1000 == 3 && !0
+
+/*
+ * Three-operand npy_half kernel for contiguous operands and output:
+ *
+ *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
+ *
+ * Every element is converted to npy_float for the arithmetic and back to
+ * npy_half when stored. `nop` and `strides` are not used.
+ */
+static void
+half_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_half *in0 = (npy_half *)dataptr[0];
+    npy_half *in1 = (npy_half *)dataptr[1];
+    npy_half *in2 = (npy_half *)dataptr[2];
+    npy_half *out = (npy_half *)dataptr[3];
+    int k;
+
+    /* Main part: consume the data in groups of 8 elements */
+    while (count >= 8) {
+        count -= 8;
+
+        for (k = 0; k < 8; ++k) {
+            out[k] = npy_float_to_half(npy_half_to_float(in0[k]) *
+                                       npy_half_to_float(in1[k]) *
+                                       npy_half_to_float(in2[k]) +
+                                       npy_half_to_float(out[k]));
+        }
+
+        in0 += 8;
+        in1 += 8;
+        in2 += 8;
+        out += 8;
+    }
+
+    /* Tail: up to 8 guarded steps, leaving as soon as count hits zero */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] = npy_float_to_half(npy_half_to_float(in0[k]) *
+                                   npy_half_to_float(in1[k]) *
+                                   npy_half_to_float(in2[k]) +
+                                   npy_half_to_float(out[k]));
+    }
+}
+
+#else /* 1000 > 3 || @complex */
+
+/*
+ * Generic npy_half kernel for any number of operands laid out
+ * contiguously. For each of the `count` elements it multiplies the
+ * current value of operands 0..nop-1 (in npy_float), adds the current
+ * output value and stores the result as npy_half in dataptr[nop].
+ *
+ * The entries of `dataptr` (inputs and output) are advanced in place by
+ * sizeof(npy_half) after every element. `strides` is not used.
+ */
+static void
+half_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+
+    for (; count--; ) {
+        int op = 1;
+        npy_float prod = npy_half_to_float(*(npy_half *)dataptr[0]);
+
+        while (op < nop) {
+            prod *= npy_half_to_float(*(npy_half *)dataptr[op]);
+            ++op;
+        }
+        /* op is left one past the last input, i.e. at nop when nop >= 1 */
+        *(npy_half *)dataptr[nop] = npy_float_to_half(prod +
+                                    npy_half_to_float(*(npy_half *)dataptr[op]));
+
+        /* step all inputs and the output to the next element */
+        for (op = 0; op <= nop; ++op) {
+            dataptr[op] += sizeof(npy_half);
+        }
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+
+/*
+ * Single-operand npy_half kernel with a contiguous input and a single
+ * output value: the result of half_sum_of_arr() over `count` input
+ * elements is added (in npy_float) to the current output and stored back
+ * as npy_half. `nop` and `strides` are not read.
+ */
+static NPY_GCC_OPT_3 void
+half_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    npy_half *out;
+    npy_float total;
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+
+    total = half_sum_of_arr((npy_half *)dataptr[0], count);
+    out = (npy_half *)dataptr[1];
+    *out = npy_float_to_half(total + npy_half_to_float(*out));
+}
+
+#endif /* 1000 == 1 */
+
+/*
+ * Generic npy_half kernel for any number of strided operands reducing
+ * into a single output value. For each of the `count` steps the current
+ * values of operands 0..nop-1 are multiplied together in npy_float and
+ * added to a running total; afterwards the total is added to the value at
+ * dataptr[nop] and stored back as npy_half.
+ *
+ * The input entries of `dataptr` (indices 0..nop-1) are advanced in place
+ * by their respective strides; dataptr[nop] is left untouched.
+ */
+static void
+half_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    npy_float total = 0;
+    npy_half *out;
+
+    NPY_EINSUM_DBG_PRINT1("half_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+
+    for (; count--; ) {
+        int op;
+        npy_float prod = npy_half_to_float(*(npy_half *)dataptr[0]);
+
+        for (op = 1; op < nop; ++op) {
+            prod *= npy_half_to_float(*(npy_half *)dataptr[op]);
+        }
+        total += prod;
+
+        /* only the inputs move; the output pointer stays put */
+        for (op = 0; op < nop; ++op) {
+            dataptr[op] += strides[op];
+        }
+    }
+
+    out = (npy_half *)dataptr[nop];
+    *out = npy_float_to_half(total + npy_half_to_float(*out));
+
+}
+
+
+
+
+#line 74
+
+#if !0
+/*
+ * Return the sum of `count` npy_float values starting at `data`.
+ *
+ * When NPY_SIMD_F32 is enabled the sum is accumulated in a vector register
+ * (four vectors per iteration, then a per-vector tail) and reduced at the
+ * end; otherwise a scalar loop is used, optionally preceded by a 4-way
+ * grouped loop. The grouping means the additions are not performed in
+ * strict left-to-right order.
+ */
+static NPY_GCC_OPT_3 npy_float float_sum_of_arr(npy_float *data, npy_intp count)
+{
+    npy_float accum = 0;
+#if NPY_SIMD_F32 // NPYV check for npy_float
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data);
+    const int vstep = npyv_nlanes_f32;
+    npyv_f32 v_accum = npyv_zero_f32();
+    const npy_intp vstepx4 = vstep * 4;
+
+    /*
+     * Both branches below are identical except for the load intrinsic
+     * (npyv_loada_f32 vs npyv_load_f32): four vectors are loaded, summed
+     * pairwise and added to v_accum.
+     */
+    #line 91
+    if(is_aligned) {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_f32 a0 = npyv_loada_f32(data + vstep * 0);
+            
+#line 96
+            npyv_f32 a1 = npyv_loada_f32(data + vstep * 1);
+            
+#line 96
+            npyv_f32 a2 = npyv_loada_f32(data + vstep * 2);
+            
+#line 96
+            npyv_f32 a3 = npyv_loada_f32(data + vstep * 3);
+            
+            npyv_f32 a01   = npyv_add_f32(a0, a1);
+            npyv_f32 a23   = npyv_add_f32(a2, a3);
+            npyv_f32 a0123 = npyv_add_f32(a01, a23);
+                     v_accum = npyv_add_f32(a0123, v_accum);
+        }
+    }
+    
+#line 91
+    else {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_f32 a0 = npyv_load_f32(data + vstep * 0);
+            
+#line 96
+            npyv_f32 a1 = npyv_load_f32(data + vstep * 1);
+            
+#line 96
+            npyv_f32 a2 = npyv_load_f32(data + vstep * 2);
+            
+#line 96
+            npyv_f32 a3 = npyv_load_f32(data + vstep * 3);
+            
+            npyv_f32 a01   = npyv_add_f32(a0, a1);
+            npyv_f32 a23   = npyv_add_f32(a2, a3);
+            npyv_f32 a0123 = npyv_add_f32(a01, a23);
+                     v_accum = npyv_add_f32(a0123, v_accum);
+        }
+    }
+    
+    /*
+     * Remaining elements, one vector at a time.
+     * NOTE(review): npyv_load_tillz_f32 presumably loads at most `count`
+     * lanes and zero-fills the rest, so the final partial vector does not
+     * read past the data -- confirm against the NPYV documentation.
+     */
+    for (; count > 0; count -= vstep, data += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data, count);
+        v_accum = npyv_add_f32(a, v_accum);
+    }
+    /* Collapse the vector accumulator into the scalar result */
+    accum = npyv_sum_f32(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * 4-way grouped scalar loop; the strict `count > 4` leaves up to four
+     * elements for the plain loop below.
+     */
+    for (; count > 4; count -= 4, data += 4) {
+        const npy_float a01 = (*data) + (data[1]);
+        const npy_float a23 = (data[2]) + (data[3]);
+        accum +=  a01 + a23;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Plain scalar loop for whatever is left */
+    for (; count > 0; --count, data++) {
+        accum += (*data);
+    }
+#endif // NPYV check for npy_float
+    return accum;
+}
+#endif
+
+#line 131
+/*
+ * Single-operand npy_float kernel for arbitrary strides: for each of the
+ * `count` steps the input element is added to the output element,
+ *
+ *     *out = *in + *out
+ *
+ * and both cursors move on by their byte strides (strides[0] for the
+ * input, strides[1] for the output). The cursors are local copies, so the
+ * entries of `dataptr` are not modified. `nop` is not used.
+ */
+static void
+float_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    char *src = dataptr[0];
+    char *dst = dataptr[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_one (%d)\n", (int)count);
+
+    for (; count--; ) {
+        *(npy_float *)dst = ((*(npy_float *)src) +
+                             (*(npy_float *)dst));
+        src += src_step;
+        dst += dst_step;
+    }
+}
+
+#if 1 == 1
+
+/*
+ * Single-operand npy_float kernel for contiguous input and output:
+ *
+ *     dst[i] = src[i] + dst[i]      for `count` elements
+ *
+ * Full groups of 8 are processed in ascending index order; a remainder of
+ * 0..7 elements is processed from the highest index down to index 0. The
+ * remainder test is placed first so that small counts finish without
+ * entering the group loop. `nop` and `strides` are not used.
+ */
+static void
+float_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *src = (npy_float *)dataptr[0];
+    npy_float *dst = (npy_float *)dataptr[1];
+    npy_intp k;
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+    for (;;) {
+        /* Fewer than 8 left: finish them off and leave */
+        if (count >= 0 && count <= 7) {
+            for (k = count - 1; k >= 0; --k) {
+                dst[k] = ((src[k]) +
+                          (dst[k]));
+            }
+            return;
+        }
+
+        /* Groups of 8 */
+        while (count >= 8) {
+            count -= 8;
+
+            for (k = 0; k < 8; ++k) {
+                dst[k] = ((src[k]) +
+                          (dst[k]));
+            }
+
+            src += 8;
+            dst += 8;
+        }
+    }
+}
+
+#elif 1 == 2 && !0
+
+/*
+ * Multiply-add helper: for `count` contiguous npy_float elements compute
+ *
+ *     data_out[i] = scalar * data[i] + data_out[i]
+ *
+ * data     : input elements (read only by this function)
+ * data_out : elements that are read, updated and written back in place
+ * scalar   : the common multiplier
+ * count    : number of elements to process
+ *
+ * With NPY_SIMD_F32 enabled the work is done four vectors at a time,
+ * followed by a per-vector tail; otherwise a scalar loop is used,
+ * optionally preceded by a 4-way unrolled loop.
+ */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_muladd(npy_float *data, npy_float *data_out, npy_float scalar, npy_intp count)
+{
+#if NPY_SIMD_F32 // NPYV check for npy_float
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f32;
+    /* The scalar replicated into every lane */
+    const npyv_f32 v_scalar = npyv_setall_f32(scalar);
+    /*
+     * The two branches below differ only in the load/store intrinsics
+     * (npyv_loada/npyv_storea vs npyv_load/npyv_store). Each iteration
+     * loads four vectors from both arrays, applies npyv_muladd_f32 and
+     * stores the four results to data_out.
+     */
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f32 b0 = npyv_loada_f32(data + vstep * 0);
+            npyv_f32 c0 = npyv_loada_f32(data_out + vstep * 0);
+            
+#line 312
+            npyv_f32 b1 = npyv_loada_f32(data + vstep * 1);
+            npyv_f32 c1 = npyv_loada_f32(data_out + vstep * 1);
+            
+#line 312
+            npyv_f32 b2 = npyv_loada_f32(data + vstep * 2);
+            npyv_f32 c2 = npyv_loada_f32(data_out + vstep * 2);
+            
+#line 312
+            npyv_f32 b3 = npyv_loada_f32(data + vstep * 3);
+            npyv_f32 c3 = npyv_loada_f32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f32 abc0 = npyv_muladd_f32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f32 abc1 = npyv_muladd_f32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f32 abc2 = npyv_muladd_f32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f32 abc3 = npyv_muladd_f32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_f32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f32 b0 = npyv_load_f32(data + vstep * 0);
+            npyv_f32 c0 = npyv_load_f32(data_out + vstep * 0);
+            
+#line 312
+            npyv_f32 b1 = npyv_load_f32(data + vstep * 1);
+            npyv_f32 c1 = npyv_load_f32(data_out + vstep * 1);
+            
+#line 312
+            npyv_f32 b2 = npyv_load_f32(data + vstep * 2);
+            npyv_f32 c2 = npyv_load_f32(data_out + vstep * 2);
+            
+#line 312
+            npyv_f32 b3 = npyv_load_f32(data + vstep * 3);
+            npyv_f32 c3 = npyv_load_f32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f32 abc0 = npyv_muladd_f32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f32 abc1 = npyv_muladd_f32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f32 abc2 = npyv_muladd_f32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f32 abc3 = npyv_muladd_f32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_f32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    /*
+     * Remaining elements, one vector at a time.
+     * NOTE(review): npyv_load_tillz_f32 / npyv_store_till_f32 presumably
+     * touch at most `count` lanes, so the final partial vector stays
+     * within bounds -- confirm against the NPYV documentation.
+     */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data, count);
+        npyv_f32 b = npyv_load_tillz_f32(data_out, count);
+        npyv_store_till_f32(data_out, count, npyv_muladd_f32(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* 4-way unrolled scalar loop: load, multiply-add, then store */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_float b0 = (data[0]);
+        const npy_float c0 = (data_out[0]);
+        
+#line 340
+        const npy_float b1 = (data[1]);
+        const npy_float c1 = (data_out[1]);
+        
+#line 340
+        const npy_float b2 = (data[2]);
+        const npy_float c2 = (data_out[2]);
+        
+#line 340
+        const npy_float b3 = (data[3]);
+        const npy_float c3 = (data_out[3]);
+        
+        #line 346
+        const npy_float abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_float abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_float abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_float abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Plain scalar loop for whatever is left */
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_float b = (*data);
+        const npy_float c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_float
+}
+
+static void
+float_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float *data1 = (npy_float *)dataptr[1];
+    npy_float *data_out = (npy_float *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_float
+#if NPY_SIMD_F32
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f32;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f32 a0 = npyv_loada_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_loada_f32(data1 + vstep * 0);
+            npyv_f32 c0 = npyv_loada_f32(data_out + vstep * 0);
+            
+#line 390
+            npyv_f32 a1 = npyv_loada_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_loada_f32(data1 + vstep * 1);
+            npyv_f32 c1 = npyv_loada_f32(data_out + vstep * 1);
+            
+#line 390
+            npyv_f32 a2 = npyv_loada_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_loada_f32(data1 + vstep * 2);
+            npyv_f32 c2 = npyv_loada_f32(data_out + vstep * 2);
+            
+#line 390
+            npyv_f32 a3 = npyv_loada_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_loada_f32(data1 + vstep * 3);
+            npyv_f32 c3 = npyv_loada_f32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f32 abc0 = npyv_muladd_f32(a0, b0, c0);
+            
+#line 397
+            npyv_f32 abc1 = npyv_muladd_f32(a1, b1, c1);
+            
+#line 397
+            npyv_f32 abc2 = npyv_muladd_f32(a2, b2, c2);
+            
+#line 397
+            npyv_f32 abc3 = npyv_muladd_f32(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_f32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f32 a0 = npyv_load_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_load_f32(data1 + vstep * 0);
+            npyv_f32 c0 = npyv_load_f32(data_out + vstep * 0);
+            
+#line 390
+            npyv_f32 a1 = npyv_load_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_load_f32(data1 + vstep * 1);
+            npyv_f32 c1 = npyv_load_f32(data_out + vstep * 1);
+            
+#line 390
+            npyv_f32 a2 = npyv_load_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_load_f32(data1 + vstep * 2);
+            npyv_f32 c2 = npyv_load_f32(data_out + vstep * 2);
+            
+#line 390
+            npyv_f32 a3 = npyv_load_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_load_f32(data1 + vstep * 3);
+            npyv_f32 c3 = npyv_load_f32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f32 abc0 = npyv_muladd_f32(a0, b0, c0);
+            
+#line 397
+            npyv_f32 abc1 = npyv_muladd_f32(a1, b1, c1);
+            
+#line 397
+            npyv_f32 abc2 = npyv_muladd_f32(a2, b2, c2);
+            
+#line 397
+            npyv_f32 abc3 = npyv_muladd_f32(a3, b3, c3);
+            
+            #line 402
+            npyv_store_f32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data0, count);
+        npyv_f32 b = npyv_load_tillz_f32(data1, count);
+        npyv_f32 c = npyv_load_tillz_f32(data_out, count);
+        npyv_store_till_f32(data_out, count, npyv_muladd_f32(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_float a0 = (data0[0]);
+        const npy_float b0 = (data1[0]);
+        const npy_float c0 = (data_out[0]);
+        
+#line 420
+        const npy_float a1 = (data0[1]);
+        const npy_float b1 = (data1[1]);
+        const npy_float c1 = (data_out[1]);
+        
+#line 420
+        const npy_float a2 = (data0[2]);
+        const npy_float b2 = (data1[2]);
+        const npy_float c2 = (data_out[2]);
+        
+#line 420
+        const npy_float a3 = (data0[3]);
+        const npy_float b3 = (data1[3]);
+        const npy_float c3 = (data_out[3]);
+        
+        #line 427
+        const npy_float abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_float abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_float abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_float abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_float a = (*data0);
+        const npy_float b = (*data1);
+        const npy_float c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_float
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand kernel in which operand 0 contributes a single npy_float value
+ * while operand 1 and the output are handed on as npy_float arrays.  The
+ * per-element work is delegated to float_sum_of_products_muladd.
+ *
+ * nop and strides are not used by the body.
+ */
+static void
+float_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* The scalar operand is read exactly once, before the helper runs. */
+    const npy_float scalar = *(npy_float *)dataptr[0];
+    npy_float *const vec = (npy_float *)dataptr[1];
+    npy_float *const out = (npy_float *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    float_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+/*
+ * Two-operand kernel in which operand 1 contributes a single npy_float value
+ * while operand 0 and the output are handed on as npy_float arrays.  The
+ * per-element work is delegated to float_sum_of_products_muladd.
+ *
+ * nop and strides are not used by the body.
+ */
+static void
+float_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* The scalar operand is read exactly once, before the helper runs. */
+    const npy_float scalar = *(npy_float *)dataptr[1];
+    npy_float *const vec = (npy_float *)dataptr[0];
+    npy_float *const out = (npy_float *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    float_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+/*
+ * Two-operand kernel that reduces into a single output element: both inputs
+ * are walked element by element as npy_float arrays, their products are
+ * summed into a local accumulator, and that total is added once to the value
+ * at dataptr[2].
+ *
+ * nop and strides are not used by the body.  Which of the two
+ * implementations below is compiled depends on NPY_SIMD_F32.
+ */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float *data1 = (npy_float *)dataptr[1];
+    npy_float accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if NPY_SIMD_F32 // NPYV check for npy_float
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_f32;
+    /* Vector accumulator; collapsed into the scalar accum after the loops. */
+    npyv_f32 v_accum = npyv_zero_f32();
+
+    #line 495
+    if(is_aligned) {
+        /* Each iteration consumes four vectors from each input. */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f32 a0 = npyv_loada_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_loada_f32(data1 + vstep * 0);
+            
+#line 501
+            npyv_f32 a1 = npyv_loada_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_loada_f32(data1 + vstep * 1);
+            
+#line 501
+            npyv_f32 a2 = npyv_loada_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_loada_f32(data1 + vstep * 2);
+            
+#line 501
+            npyv_f32 a3 = npyv_loada_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_loada_f32(data1 + vstep * 3);
+            
+            /*
+             * The four multiply-adds are chained: each one takes the previous
+             * result as its addend, starting from v_accum and ending back in
+             * v_accum (pairs are folded in the order 3, 2, 1, 0).
+             */
+            npyv_f32 ab3 = npyv_muladd_f32(a3, b3, v_accum);
+            npyv_f32 ab2 = npyv_muladd_f32(a2, b2, ab3);
+            npyv_f32 ab1 = npyv_muladd_f32(a1, b1, ab2);
+                   v_accum = npyv_muladd_f32(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        /* Same loop as above, using the non-aligned load variant. */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f32 a0 = npyv_load_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_load_f32(data1 + vstep * 0);
+            
+#line 501
+            npyv_f32 a1 = npyv_load_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_load_f32(data1 + vstep * 1);
+            
+#line 501
+            npyv_f32 a2 = npyv_load_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_load_f32(data1 + vstep * 2);
+            
+#line 501
+            npyv_f32 a3 = npyv_load_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_load_f32(data1 + vstep * 3);
+            
+            npyv_f32 ab3 = npyv_muladd_f32(a3, b3, v_accum);
+            npyv_f32 ab2 = npyv_muladd_f32(a2, b2, ab3);
+            npyv_f32 ab1 = npyv_muladd_f32(a1, b1, ab2);
+                   v_accum = npyv_muladd_f32(a0, b0, ab1);
+        }
+    }
+    
+    /*
+     * Remainder: one vector per iteration, with the remaining element count
+     * passed to the partial load.  NOTE(review): presumably lanes past that
+     * count are zero-filled so they add nothing -- confirm against the NPYV
+     * documentation for npyv_load_tillz_f32.
+     */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data0, count);
+        npyv_f32 b = npyv_load_tillz_f32(data1, count);
+        v_accum = npyv_muladd_f32(a, b, v_accum);
+    }
+    accum = npyv_sum_f32(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Scalar fallback, unrolled by four; the four products are summed
+     * together before being added to accum. */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_float ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_float ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_float ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_float ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One element at a time for whatever is left. */
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_float a = (*data0);
+        const npy_float b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_float
+    /* Single read-modify-write of the output element. */
+    *(npy_float *)dataptr[2] = ((*(npy_float *)dataptr[2]) + accum);
+}
+
+/*
+ * Two-operand reduction with a scalar first operand: the elements of operand
+ * 1 are totalled by float_sum_of_arr, the total is multiplied by the single
+ * value read from operand 0, and the product is added to the output element
+ * at dataptr[2].
+ *
+ * nop and strides are not used by the body.
+ */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *const out = (npy_float *)dataptr[2];
+    const npy_float scalar = *(npy_float *)dataptr[0];
+    const npy_float total = float_sum_of_arr((npy_float *)dataptr[1], count);
+
+    *out = *out + scalar * total;
+}
+
+/*
+ * Two-operand reduction with a scalar second operand: the elements of operand
+ * 0 are totalled by float_sum_of_arr, the total is multiplied by the single
+ * value read from operand 1, and the product is added to the output element
+ * at dataptr[2].
+ *
+ * nop and strides are not used by the body.
+ */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *const out = (npy_float *)dataptr[2];
+    const npy_float scalar = *(npy_float *)dataptr[1];
+    const npy_float total = float_sum_of_arr((npy_float *)dataptr[0], count);
+
+    *out = *out + scalar * total;
+}
+
+#elif 1 == 3 && !0
+
+/*
+ * Three-operand kernel over element-contiguous npy_float arrays.  For each
+ * index it computes
+ *
+ *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
+ *
+ * visiting the indices in increasing order.  Elements are handled in groups
+ * of eight first, then up to seven leftovers.
+ *
+ * nop and strides are not used by the body.
+ */
+static void
+float_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *in0 = (npy_float *)dataptr[0];
+    npy_float *in1 = (npy_float *)dataptr[1];
+    npy_float *in2 = (npy_float *)dataptr[2];
+    npy_float *out = (npy_float *)dataptr[3];
+    int k;
+
+    /* Whole groups of eight; the inner loop has a fixed trip count */
+    for (; count >= 8; in0 += 8, in1 += 8, in2 += 8, out += 8) {
+        count -= 8;
+        for (k = 0; k < 8; ++k) {
+            out[k] = in0[k] * in1[k] * in2[k] + out[k];
+        }
+    }
+
+    /*
+     * Leftovers: stop as soon as the remaining count is used up.  The
+     * test-then-decrement form is kept so the exit condition is evaluated
+     * exactly once per element slot.
+     */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] = in0[k] * in1[k] * in2[k] + out[k];
+    }
+}
+
+#else /* 1 > 3 || @complex */
+
+/*
+ * Contiguous kernel written for an arbitrary operand count.  Each iteration
+ * multiplies together the current element of operands 0 .. nop-1, adds the
+ * current output element, stores the result through dataptr[nop], and then
+ * advances all nop + 1 entries of dataptr by sizeof(npy_float).
+ *
+ * The entries of dataptr[] are modified in place.  strides is not used.
+ * Because the condition `!0` is a fixed literal, only the first branch of the
+ * conditional inside the loop is compiled.
+ */
+static void
+float_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+        /* Running product of the input operands for this element. */
+        npy_float temp = (*(npy_float *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_float *)dataptr[i]);
+        }
+        /* After the loop i == nop (for nop >= 1), so dataptr[i] below is the
+         * same output element that is being written. */
+        *(npy_float *)dataptr[nop] = (temp +
+                                           (*(npy_float *)dataptr[i]));
+        /* Step every operand and the output to the next element. */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_float);
+        }
+#else /* complex */
+#  if 1 <= 3
+#    define _SUMPROD_NOP 1
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        /* Values are read as (re, im) pairs at indices [0] and [1]. */
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            /* Complex multiply of (re, im) by operand i; tmp holds the new
+             * real part until the imaginary part has used the old one. */
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_float);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1 */
+
+#if 1 == 1
+
+/*
+ * Single-operand reduction: totals count values from dataptr[0] and adds the
+ * total to the output at dataptr[1].
+ *
+ * The condition `!0` is a fixed literal, so the first branch (which delegates
+ * the summation to float_sum_of_arr) is the one that is compiled; the second
+ * branch treats the data as interleaved (re, im) pairs.  nop and strides are
+ * not used by the body.
+ */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_float *const out = (npy_float *)dataptr[1];
+    const npy_float total = float_sum_of_arr((npy_float *)dataptr[0], count);
+
+    *out = total + *out;
+#else
+    npy_float *src = (npy_float *)dataptr[0];
+    npy_float sum_re = 0, sum_im = 0;
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four (re, im) pairs, i.e. eight scalars, per iteration */
+    while (count > 4) {
+        const npy_float re_lo = src[0] + src[2];
+        const npy_float re_hi = src[4] + src[6];
+        const npy_float im_lo = src[1] + src[3];
+        const npy_float im_hi = src[5] + src[7];
+        sum_re += re_lo + re_hi;
+        sum_im += im_lo + im_hi;
+        count -= 4;
+        src += 4*2;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One (re, im) pair at a time for the rest */
+    while (count > 0) {
+        sum_re += src[0];
+        sum_im += src[1];
+        --count;
+        src += 2;
+    }
+    ((npy_float *)dataptr[1])[0] += sum_re;
+    ((npy_float *)dataptr[1])[1] += sum_im;
+#endif // !0
+}
+
+#endif /* 1 == 1 */
+
+/*
+ * Strided reduction into a single output element.
+ *
+ * Every preprocessor condition in this function compares fixed literals
+ * (`1 == 1`, `!0`, ...), so exactly one branch of each conditional is
+ * compiled.  With the literals as written, the compiled code reads count
+ * npy_float values starting at dataptr[0], stepping by strides[0] bytes
+ * (data0 is a char pointer), sums them into accum, and finally adds accum to
+ * the value at dataptr[1].  nop is only referenced in branches that are not
+ * selected here.
+ */
+static void
+float_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_float accum_re = 0, accum_im = 0;
+#else
+    npy_float accum = 0;
+#endif
+
+    /* Local copies of the operand pointers and byte strides; which ones
+     * exist depends on the literal conditions below. */
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 1 == 1
+        /* One operand: plain strided sum. */
+        accum += (*(npy_float *)data0);
+        data0 += stride0;
+#  elif 1 == 2
+        /* Two operands: sum of pairwise products. */
+        accum += (*(npy_float *)data0) *
+                 (*(npy_float *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1 == 3
+        /* Three operands: sum of triple products. */
+        accum += (*(npy_float *)data0) *
+                 (*(npy_float *)data1) *
+                 (*(npy_float *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        /* Any operand count: multiply across dataptr[0..nop-1], then advance
+         * the dataptr entries themselves by their strides. */
+        npy_float temp = (*(npy_float *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_float *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        accum_re += ((npy_float *)data0)[0];
+        accum_im += ((npy_float *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            /* Complex multiply of (re, im) by operand i; tmp keeps the new
+             * real part until the old one is no longer needed. */
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+    /* Fold the accumulated total into the output element exactly once. */
+#if 0
+#  if 1 <= 3
+    ((npy_float *)dataptr[1])[0] += accum_re;
+    ((npy_float *)dataptr[1])[1] += accum_im;
+#  else
+    ((npy_float *)dataptr[nop])[0] += accum_re;
+    ((npy_float *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1 <= 3
+    *((npy_float *)dataptr[1]) = (accum +
+                                    (*((npy_float *)dataptr[1])));
+#  else
+    *((npy_float *)dataptr[nop]) = (accum +
+                                    (*((npy_float *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+/*
+ * Fully strided two-operand kernel.
+ *
+ * Every preprocessor condition in this function compares fixed literals
+ * (`2 == 2`, `!0`, ...), so exactly one branch of each conditional is
+ * compiled.  With the literals as written, each of the count iterations
+ * computes
+ *
+ *     *out = (*in0) * (*in1) + (*out)
+ *
+ * on npy_float values and then advances the three char pointers by
+ * strides[0], strides[1] and strides[2] bytes respectively.  nop is only
+ * referenced in branches that are not selected here.
+ */
+static void
+float_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    /* Local copies of the operand/output pointers and their byte strides;
+     * which ones exist depends on the literal conditions below. */
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data_out = dataptr[2];
+    npy_intp stride_out = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_two (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 2 == 1
+        /* One operand: out += in0. */
+        *(npy_float *)data_out = ((*(npy_float *)data0) +
+                                         (*(npy_float *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 2 == 2
+        /* Two operands: out += in0 * in1. */
+        *(npy_float *)data_out = ((*(npy_float *)data0) *
+                                         (*(npy_float *)data1) +
+                                         (*(npy_float *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 2 == 3
+        /* Three operands: out += in0 * in1 * in2. */
+        *(npy_float *)data_out = ((*(npy_float *)data0) *
+                                         (*(npy_float *)data1) *
+                                         (*(npy_float *)data2) +
+                                         (*(npy_float *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        /* Any operand count: multiply across dataptr[0..nop-1]; after the
+         * loop i == nop (for nop >= 1), so dataptr[i] is the output. */
+        npy_float temp = (*(npy_float *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_float *)dataptr[i]);
+        }
+        *(npy_float *)dataptr[nop] = (temp +
+                                           (*(npy_float *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        ((npy_float *)data_out)[0] = ((npy_float *)data0)[0] +
+                                         ((npy_float *)data_out)[0];
+        ((npy_float *)data_out)[1] = ((npy_float *)data0)[1] +
+                                         ((npy_float *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            /* Complex multiply of (re, im) by operand i; tmp keeps the new
+             * real part until the old one is no longer needed. */
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 2 == 1
+
+/*
+ * Single-operand contiguous kernel: data_out[i] = data0[i] + data_out[i].
+ *
+ * Control flow: the switch handles any count from 0 to 7 by jumping to the
+ * matching case and falling through the cases below it, so those elements
+ * are processed from the highest index down to index 0 before returning.
+ * Larger counts skip the switch, run the loop that handles eight elements per
+ * iteration in increasing index order, and then jump back to the switch to
+ * finish the remainder.
+ *
+ * In every `#if !0` pair below the first branch is the compiled one, since
+ * the condition is a fixed literal.  nop and strides are not used.
+ *
+ * NOTE(review): a negative count matches no case and does not enter the
+ * while loop, so the goto would repeat indefinitely; callers presumably
+ * never pass one -- confirm.
+ */
+static void
+float_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float *data_out = (npy_float *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+        /* Cases are written as index+1 and intentionally fall through. */
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_float *)data_out + 2*6)[0] =
+                                    ((npy_float *)data0 + 2*6)[0] +
+                                    ((npy_float *)data_out + 2*6)[0];
+            ((npy_float *)data_out + 2*6)[1] =
+                                    ((npy_float *)data0 + 2*6)[1] +
+                                    ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_float *)data_out + 2*5)[0] =
+                                    ((npy_float *)data0 + 2*5)[0] +
+                                    ((npy_float *)data_out + 2*5)[0];
+            ((npy_float *)data_out + 2*5)[1] =
+                                    ((npy_float *)data0 + 2*5)[1] +
+                                    ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_float *)data_out + 2*4)[0] =
+                                    ((npy_float *)data0 + 2*4)[0] +
+                                    ((npy_float *)data_out + 2*4)[0];
+            ((npy_float *)data_out + 2*4)[1] =
+                                    ((npy_float *)data0 + 2*4)[1] +
+                                    ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_float *)data_out + 2*3)[0] =
+                                    ((npy_float *)data0 + 2*3)[0] +
+                                    ((npy_float *)data_out + 2*3)[0];
+            ((npy_float *)data_out + 2*3)[1] =
+                                    ((npy_float *)data0 + 2*3)[1] +
+                                    ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_float *)data_out + 2*2)[0] =
+                                    ((npy_float *)data0 + 2*2)[0] +
+                                    ((npy_float *)data_out + 2*2)[0];
+            ((npy_float *)data_out + 2*2)[1] =
+                                    ((npy_float *)data0 + 2*2)[1] +
+                                    ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_float *)data_out + 2*1)[0] =
+                                    ((npy_float *)data0 + 2*1)[0] +
+                                    ((npy_float *)data_out + 2*1)[0];
+            ((npy_float *)data_out + 2*1)[1] =
+                                    ((npy_float *)data0 + 2*1)[1] +
+                                    ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_float *)data_out + 2*0)[0] =
+                                    ((npy_float *)data0 + 2*0)[0] +
+                                    ((npy_float *)data_out + 2*0)[0];
+            ((npy_float *)data_out + 2*0)[1] =
+                                    ((npy_float *)data0 + 2*0)[1] +
+                                    ((npy_float *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_float *)data_out + 2*0)[0] =
+                                ((npy_float *)data0 + 2*0)[0] +
+                                ((npy_float *)data_out + 2*0)[0];
+        ((npy_float *)data_out + 2*0)[1] =
+                                ((npy_float *)data0 + 2*0)[1] +
+                                ((npy_float *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_float *)data_out + 2*1)[0] =
+                                ((npy_float *)data0 + 2*1)[0] +
+                                ((npy_float *)data_out + 2*1)[0];
+        ((npy_float *)data_out + 2*1)[1] =
+                                ((npy_float *)data0 + 2*1)[1] +
+                                ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_float *)data_out + 2*2)[0] =
+                                ((npy_float *)data0 + 2*2)[0] +
+                                ((npy_float *)data_out + 2*2)[0];
+        ((npy_float *)data_out + 2*2)[1] =
+                                ((npy_float *)data0 + 2*2)[1] +
+                                ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_float *)data_out + 2*3)[0] =
+                                ((npy_float *)data0 + 2*3)[0] +
+                                ((npy_float *)data_out + 2*3)[0];
+        ((npy_float *)data_out + 2*3)[1] =
+                                ((npy_float *)data0 + 2*3)[1] +
+                                ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_float *)data_out + 2*4)[0] =
+                                ((npy_float *)data0 + 2*4)[0] +
+                                ((npy_float *)data_out + 2*4)[0];
+        ((npy_float *)data_out + 2*4)[1] =
+                                ((npy_float *)data0 + 2*4)[1] +
+                                ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_float *)data_out + 2*5)[0] =
+                                ((npy_float *)data0 + 2*5)[0] +
+                                ((npy_float *)data_out + 2*5)[0];
+        ((npy_float *)data_out + 2*5)[1] =
+                                ((npy_float *)data0 + 2*5)[1] +
+                                ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_float *)data_out + 2*6)[0] =
+                                ((npy_float *)data0 + 2*6)[0] +
+                                ((npy_float *)data_out + 2*6)[0];
+        ((npy_float *)data_out + 2*6)[1] =
+                                ((npy_float *)data0 + 2*6)[1] +
+                                ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_float *)data_out + 2*7)[0] =
+                                ((npy_float *)data0 + 2*7)[0] +
+                                ((npy_float *)data_out + 2*7)[0];
+        ((npy_float *)data_out + 2*7)[1] =
+                                ((npy_float *)data0 + 2*7)[1] +
+                                ((npy_float *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 2 == 2 && !0
+
+// In-place scaled accumulate over count floats: data_out[i] = scalar * data[i] + data_out[i]
+static NPY_GCC_OPT_3 void
+float_sum_of_products_muladd(npy_float *data, npy_float *data_out, npy_float scalar, npy_intp count)
+{
+#if NPY_SIMD_F32 // NPYV check for npy_float
+    /* Take the loada/storea path only when both data and data_out pass EINSUM_IS_ALIGNED */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f32;
+    const npyv_f32 v_scalar = npyv_setall_f32(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4; /* the main loop consumes four vectors per iteration */
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f32 b0 = npyv_loada_f32(data + vstep * 0);
+            npyv_f32 c0 = npyv_loada_f32(data_out + vstep * 0);
+            
+#line 312
+            npyv_f32 b1 = npyv_loada_f32(data + vstep * 1);
+            npyv_f32 c1 = npyv_loada_f32(data_out + vstep * 1);
+            
+#line 312
+            npyv_f32 b2 = npyv_loada_f32(data + vstep * 2);
+            npyv_f32 c2 = npyv_loada_f32(data_out + vstep * 2);
+            
+#line 312
+            npyv_f32 b3 = npyv_loada_f32(data + vstep * 3);
+            npyv_f32 c3 = npyv_loada_f32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f32 abc0 = npyv_muladd_f32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f32 abc1 = npyv_muladd_f32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f32 abc2 = npyv_muladd_f32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f32 abc3 = npyv_muladd_f32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_f32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Same unrolled loop, using npyv_load/npyv_store instead of the loada/storea variants */
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f32 b0 = npyv_load_f32(data + vstep * 0);
+            npyv_f32 c0 = npyv_load_f32(data_out + vstep * 0);
+            
+#line 312
+            npyv_f32 b1 = npyv_load_f32(data + vstep * 1);
+            npyv_f32 c1 = npyv_load_f32(data_out + vstep * 1);
+            
+#line 312
+            npyv_f32 b2 = npyv_load_f32(data + vstep * 2);
+            npyv_f32 c2 = npyv_load_f32(data_out + vstep * 2);
+            
+#line 312
+            npyv_f32 b3 = npyv_load_f32(data + vstep * 3);
+            npyv_f32 c3 = npyv_load_f32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f32 abc0 = npyv_muladd_f32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f32 abc1 = npyv_muladd_f32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f32 abc2 = npyv_muladd_f32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f32 abc3 = npyv_muladd_f32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_f32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Tail (< 4*vstep left): one vector per pass, handing the remaining count to load_tillz/store_till */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data, count);
+        npyv_f32 b = npyv_load_tillz_f32(data_out, count);
+        npyv_store_till_f32(data_out, count, npyv_muladd_f32(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else // NPY_SIMD_F32 is not set: plain C fallback
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_float b0 = (data[0]);
+        const npy_float c0 = (data_out[0]);
+        
+#line 340
+        const npy_float b1 = (data[1]);
+        const npy_float c1 = (data_out[1]);
+        
+#line 340
+        const npy_float b2 = (data[2]);
+        const npy_float c2 = (data_out[2]);
+        
+#line 340
+        const npy_float b3 = (data[3]);
+        const npy_float c3 = (data_out[3]);
+        
+        #line 346
+        const npy_float abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_float abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_float abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_float abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* leftover elements, one at a time */
+        const npy_float b = (*data);
+        const npy_float c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_float
+}
+/* Two-operand kernel over count floats: data_out[i] = data0[i] * data1[i] + data_out[i]; strides is unused. */
+static void
+float_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float *data1 = (npy_float *)dataptr[1];
+    npy_float *data_out = (npy_float *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_float
+#if NPY_SIMD_F32
+    /* Take the loada/storea path only when all three pointers pass EINSUM_IS_ALIGNED */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f32;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4; /* the main loop consumes four vectors per iteration */
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f32 a0 = npyv_loada_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_loada_f32(data1 + vstep * 0);
+            npyv_f32 c0 = npyv_loada_f32(data_out + vstep * 0);
+            
+#line 390
+            npyv_f32 a1 = npyv_loada_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_loada_f32(data1 + vstep * 1);
+            npyv_f32 c1 = npyv_loada_f32(data_out + vstep * 1);
+            
+#line 390
+            npyv_f32 a2 = npyv_loada_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_loada_f32(data1 + vstep * 2);
+            npyv_f32 c2 = npyv_loada_f32(data_out + vstep * 2);
+            
+#line 390
+            npyv_f32 a3 = npyv_loada_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_loada_f32(data1 + vstep * 3);
+            npyv_f32 c3 = npyv_loada_f32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f32 abc0 = npyv_muladd_f32(a0, b0, c0);
+            
+#line 397
+            npyv_f32 abc1 = npyv_muladd_f32(a1, b1, c1);
+            
+#line 397
+            npyv_f32 abc2 = npyv_muladd_f32(a2, b2, c2);
+            
+#line 397
+            npyv_f32 abc3 = npyv_muladd_f32(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_f32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Same unrolled loop, using npyv_load/npyv_store instead of the loada/storea variants */
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f32 a0 = npyv_load_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_load_f32(data1 + vstep * 0);
+            npyv_f32 c0 = npyv_load_f32(data_out + vstep * 0);
+            
+#line 390
+            npyv_f32 a1 = npyv_load_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_load_f32(data1 + vstep * 1);
+            npyv_f32 c1 = npyv_load_f32(data_out + vstep * 1);
+            
+#line 390
+            npyv_f32 a2 = npyv_load_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_load_f32(data1 + vstep * 2);
+            npyv_f32 c2 = npyv_load_f32(data_out + vstep * 2);
+            
+#line 390
+            npyv_f32 a3 = npyv_load_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_load_f32(data1 + vstep * 3);
+            npyv_f32 c3 = npyv_load_f32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f32 abc0 = npyv_muladd_f32(a0, b0, c0);
+            
+#line 397
+            npyv_f32 abc1 = npyv_muladd_f32(a1, b1, c1);
+            
+#line 397
+            npyv_f32 abc2 = npyv_muladd_f32(a2, b2, c2);
+            
+#line 397
+            npyv_f32 abc3 = npyv_muladd_f32(a3, b3, c3);
+            
+            #line 402
+            npyv_store_f32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Tail (< 4*vstep left): one vector per pass, handing the remaining count to load_tillz/store_till */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data0, count);
+        npyv_f32 b = npyv_load_tillz_f32(data1, count);
+        npyv_f32 c = npyv_load_tillz_f32(data_out, count);
+        npyv_store_till_f32(data_out, count, npyv_muladd_f32(a, b, c));
+    }
+    npyv_cleanup();
+#else // NPY_SIMD_F32 is not set: plain C fallback
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_float a0 = (data0[0]);
+        const npy_float b0 = (data1[0]);
+        const npy_float c0 = (data_out[0]);
+        
+#line 420
+        const npy_float a1 = (data0[1]);
+        const npy_float b1 = (data1[1]);
+        const npy_float c1 = (data_out[1]);
+        
+#line 420
+        const npy_float a2 = (data0[2]);
+        const npy_float b2 = (data1[2]);
+        const npy_float c2 = (data_out[2]);
+        
+#line 420
+        const npy_float a3 = (data0[3]);
+        const npy_float b3 = (data1[3]);
+        const npy_float c3 = (data_out[3]);
+        
+        #line 427
+        const npy_float abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_float abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_float abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_float abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* leftover elements, one at a time */
+        const npy_float a = (*data0);
+        const npy_float b = (*data1);
+        const npy_float c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_float
+
+}
+
+/* Some extra specializations for the two operand case. Here operand 0 is read once as a scalar and scaled into operand 1. */
+static void
+float_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float value0 = (*(npy_float *)dataptr[0]);
+    npy_float *data1 = (npy_float *)dataptr[1];
+    npy_float *data_out = (npy_float *)dataptr[2];
+    /* Only the first element of operand 0 is read; it becomes the scalar of the muladd helper below. */
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    float_sum_of_products_muladd(data1, data_out, value0, count);
+    
+}
+/* Two-operand case where operand 1 is read once as a scalar: data_out[i] = value1 * data0[i] + data_out[i]. */
+static void
+float_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float value1 = (*(npy_float *)dataptr[1]);
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float *data_out = (npy_float *)dataptr[2];
+    /* Only the first element of operand 1 is read; it becomes the scalar of the muladd helper below. */
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    float_sum_of_products_muladd(data0, data_out, value1, count);
+}
+/* Dot-product style reduction: *dataptr[2] = *dataptr[2] + sum over count of data0[i] * data1[i]. */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float *data1 = (npy_float *)dataptr[1];
+    npy_float accum = 0;
+    /* accum holds the sum of products; it is folded into the output only once, on the last line. */
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if NPY_SIMD_F32 // NPYV check for npy_float
+    /* Take the loada path only when both inputs pass EINSUM_IS_ALIGNED */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_f32;
+    npyv_f32 v_accum = npyv_zero_f32();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4; /* the main loop consumes four vectors per iteration */
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f32 a0 = npyv_loada_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_loada_f32(data1 + vstep * 0);
+            
+#line 501
+            npyv_f32 a1 = npyv_loada_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_loada_f32(data1 + vstep * 1);
+            
+#line 501
+            npyv_f32 a2 = npyv_loada_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_loada_f32(data1 + vstep * 2);
+            
+#line 501
+            npyv_f32 a3 = npyv_loada_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_loada_f32(data1 + vstep * 3);
+            /* Chain the four multiply-adds through v_accum (products 3, 2, 1, then 0). */
+            npyv_f32 ab3 = npyv_muladd_f32(a3, b3, v_accum);
+            npyv_f32 ab2 = npyv_muladd_f32(a2, b2, ab3);
+            npyv_f32 ab1 = npyv_muladd_f32(a1, b1, ab2);
+                   v_accum = npyv_muladd_f32(a0, b0, ab1);
+        }
+    }
+    /* Same unrolled loop, using npyv_load instead of the loada variant */
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f32 a0 = npyv_load_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_load_f32(data1 + vstep * 0);
+            
+#line 501
+            npyv_f32 a1 = npyv_load_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_load_f32(data1 + vstep * 1);
+            
+#line 501
+            npyv_f32 a2 = npyv_load_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_load_f32(data1 + vstep * 2);
+            
+#line 501
+            npyv_f32 a3 = npyv_load_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_load_f32(data1 + vstep * 3);
+            
+            npyv_f32 ab3 = npyv_muladd_f32(a3, b3, v_accum);
+            npyv_f32 ab2 = npyv_muladd_f32(a2, b2, ab3);
+            npyv_f32 ab1 = npyv_muladd_f32(a1, b1, ab2);
+                   v_accum = npyv_muladd_f32(a0, b0, ab1);
+        }
+    }
+    /* Tail (< 4*vstep left): one vector per pass, handing the remaining count to load_tillz */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data0, count);
+        npyv_f32 b = npyv_load_tillz_f32(data1, count);
+        v_accum = npyv_muladd_f32(a, b, v_accum);
+    }
+    accum = npyv_sum_f32(v_accum); /* collapse the vector accumulator into the scalar accum */
+    npyv_cleanup();
+#else // NPY_SIMD_F32 is not set: plain C fallback
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_float ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_float ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_float ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_float ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* leftover elements, one at a time */
+        const npy_float a = (*data0);
+        const npy_float b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_float
+    *(npy_float *)dataptr[2] = ((*(npy_float *)dataptr[2]) + accum);
+}
+/* *dataptr[2] += value0 * float_sum_of_arr(data1, count); helper is defined outside this chunk (presumably a plain sum -- confirm). */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data1 = (npy_float *)dataptr[1];
+    npy_float value0 = (*(npy_float *)dataptr[0]); /* operand 0 is read once, as a scalar */
+    npy_float accum = float_sum_of_arr(data1, count);
+    *(npy_float *)dataptr[2] = ((*(npy_float *)dataptr[2]) + value0 * accum);
+}
+/* *dataptr[2] += value1 * float_sum_of_arr(data0, count); helper is defined outside this chunk (presumably a plain sum -- confirm). */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float value1 = (*(npy_float *)dataptr[1]); /* operand 1 is read once, as a scalar */
+    npy_float accum = float_sum_of_arr(data0, count);
+    *(npy_float *)dataptr[2] = ((*(npy_float *)dataptr[2]) + value1 * accum);
+}
+
+#elif 2 == 3 && !0
+/* data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]; sits under '#elif 2 == 3 && !0', which is false in this expansion. */
+static void
+float_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float *data1 = (npy_float *)dataptr[1];
+    npy_float *data2 = (npy_float *)dataptr[2];
+    npy_float *data_out = (npy_float *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    /* count < 8 here: each step returns once count hits 0, otherwise handles one more element. */
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 2 > 3 || @complex */
+/* Generic nop-operand fallback; it sits in the '#else' arm after a true '#elif 2 == 2 && !0', so this expansion does not compile it. */
+static void
+float_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+    /* One output element per pass; the dataptr[] entries themselves are advanced in place. */
+    while (count--) {
+#if !0 /* real-valued arm; the '#else' below is the complex variant */
+        npy_float temp = (*(npy_float *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_float *)dataptr[i]);
+        }
+        *(npy_float *)dataptr[nop] = (temp +
+                                           (*(npy_float *)dataptr[i])); /* i == nop after the loop */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_float);
+        }
+#else /* complex */
+#  if 2 <= 3
+#    define _SUMPROD_NOP 2
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_float);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+/* Single-operand sum added into *dataptr[1]; guarded by '#if 2 == 1', which is false in this expansion. */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0 /* real-valued arm; float_sum_of_arr is defined outside this chunk */
+    npy_float *data = (npy_float *)dataptr[0];
+    npy_float accum = float_sum_of_arr(data, count);
+    *((npy_float *)dataptr[1]) = (accum + (*((npy_float *)dataptr[1])));
+#else /* complex arm: interleaved (re, im) pairs, two floats per element */
+    npy_float accum_re = 0, accum_im = 0;
+    npy_float *data0 = (npy_float *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_float re01 = data0[0] + data0[2];
+        const npy_float re23 = data0[4] + data0[6];
+        const npy_float im13 = data0[1] + data0[3];
+        const npy_float im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_float *)dataptr[1])[0] += accum_re;
+    ((npy_float *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 2 == 1 */
+/* Strided two-operand reduction: accumulates (*data0) * (*data1) over count steps, then adds the total to *dataptr[2]. */
+static void
+float_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0 /* complex instantiation only: separate real/imaginary accumulators */
+    npy_float accum_re = 0, accum_im = 0;
+#else
+    npy_float accum = 0;
+#endif
+
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0 /* real-valued arm; only the '2 == 2' case below is compiled in this expansion */
+#  if 2 == 1
+        accum += (*(npy_float *)data0);
+        data0 += stride0;
+#  elif 2 == 2
+        accum += (*(npy_float *)data0) *
+                 (*(npy_float *)data1);
+        data0 += stride0; /* char * arithmetic: the strides are applied as byte offsets */
+        data1 += stride1;
+#  elif 2 == 3
+        accum += (*(npy_float *)data0) *
+                 (*(npy_float *)data1) *
+                 (*(npy_float *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_float temp = (*(npy_float *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_float *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        accum_re += ((npy_float *)data0)[0];
+        accum_im += ((npy_float *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0
+#  if 2 <= 3
+    ((npy_float *)dataptr[2])[0] += accum_re;
+    ((npy_float *)dataptr[2])[1] += accum_im;
+#  else
+    ((npy_float *)dataptr[nop])[0] += accum_re;
+    ((npy_float *)dataptr[nop])[1] += accum_im;
+#  endif
+#else /* real-valued arm: fold the accumulated sum into the single output element */
+#  if 2 <= 3
+    *((npy_float *)dataptr[2]) = (accum +
+                                    (*((npy_float *)dataptr[2])));
+#  else
+    *((npy_float *)dataptr[nop]) = (accum +
+                                    (*((npy_float *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+/* Strided three-operand kernel: *data_out = (*data0) * (*data1) * (*data2) + *data_out, each pointer advanced by its stride. */
+#line 131
+static void
+float_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data_out = dataptr[3];
+    npy_intp stride_out = strides[3];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_three (%d)\n", (int)count);
+
+    while (count--) {
+#if !0 /* real-valued arm; only the '3 == 3' case below is compiled in this expansion */
+#  if 3 == 1
+        *(npy_float *)data_out = ((*(npy_float *)data0) +
+                                         (*(npy_float *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 3 == 2
+        *(npy_float *)data_out = ((*(npy_float *)data0) *
+                                         (*(npy_float *)data1) +
+                                         (*(npy_float *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 3 == 3
+        *(npy_float *)data_out = ((*(npy_float *)data0) *
+                                         (*(npy_float *)data1) *
+                                         (*(npy_float *)data2) +
+                                         (*(npy_float *)data_out));
+        data0 += stride0; /* char * arithmetic: the strides are applied as byte offsets */
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_float temp = (*(npy_float *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_float *)dataptr[i]);
+        }
+        *(npy_float *)dataptr[nop] = (temp +
+                                           (*(npy_float *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        ((npy_float *)data_out)[0] = ((npy_float *)data0)[0] +
+                                         ((npy_float *)data_out)[0];
+        ((npy_float *)data_out)[1] = ((npy_float *)data0)[1] +
+                                         ((npy_float *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 3 == 1
+
+static void
+float_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float *data_out = (npy_float *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_float *)data_out + 2*6)[0] =
+                                    ((npy_float *)data0 + 2*6)[0] +
+                                    ((npy_float *)data_out + 2*6)[0];
+            ((npy_float *)data_out + 2*6)[1] =
+                                    ((npy_float *)data0 + 2*6)[1] +
+                                    ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_float *)data_out + 2*5)[0] =
+                                    ((npy_float *)data0 + 2*5)[0] +
+                                    ((npy_float *)data_out + 2*5)[0];
+            ((npy_float *)data_out + 2*5)[1] =
+                                    ((npy_float *)data0 + 2*5)[1] +
+                                    ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_float *)data_out + 2*4)[0] =
+                                    ((npy_float *)data0 + 2*4)[0] +
+                                    ((npy_float *)data_out + 2*4)[0];
+            ((npy_float *)data_out + 2*4)[1] =
+                                    ((npy_float *)data0 + 2*4)[1] +
+                                    ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_float *)data_out + 2*3)[0] =
+                                    ((npy_float *)data0 + 2*3)[0] +
+                                    ((npy_float *)data_out + 2*3)[0];
+            ((npy_float *)data_out + 2*3)[1] =
+                                    ((npy_float *)data0 + 2*3)[1] +
+                                    ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_float *)data_out + 2*2)[0] =
+                                    ((npy_float *)data0 + 2*2)[0] +
+                                    ((npy_float *)data_out + 2*2)[0];
+            ((npy_float *)data_out + 2*2)[1] =
+                                    ((npy_float *)data0 + 2*2)[1] +
+                                    ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_float *)data_out + 2*1)[0] =
+                                    ((npy_float *)data0 + 2*1)[0] +
+                                    ((npy_float *)data_out + 2*1)[0];
+            ((npy_float *)data_out + 2*1)[1] =
+                                    ((npy_float *)data0 + 2*1)[1] +
+                                    ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_float *)data_out + 2*0)[0] =
+                                    ((npy_float *)data0 + 2*0)[0] +
+                                    ((npy_float *)data_out + 2*0)[0];
+            ((npy_float *)data_out + 2*0)[1] =
+                                    ((npy_float *)data0 + 2*0)[1] +
+                                    ((npy_float *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_float *)data_out + 2*0)[0] =
+                                ((npy_float *)data0 + 2*0)[0] +
+                                ((npy_float *)data_out + 2*0)[0];
+        ((npy_float *)data_out + 2*0)[1] =
+                                ((npy_float *)data0 + 2*0)[1] +
+                                ((npy_float *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_float *)data_out + 2*1)[0] =
+                                ((npy_float *)data0 + 2*1)[0] +
+                                ((npy_float *)data_out + 2*1)[0];
+        ((npy_float *)data_out + 2*1)[1] =
+                                ((npy_float *)data0 + 2*1)[1] +
+                                ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_float *)data_out + 2*2)[0] =
+                                ((npy_float *)data0 + 2*2)[0] +
+                                ((npy_float *)data_out + 2*2)[0];
+        ((npy_float *)data_out + 2*2)[1] =
+                                ((npy_float *)data0 + 2*2)[1] +
+                                ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_float *)data_out + 2*3)[0] =
+                                ((npy_float *)data0 + 2*3)[0] +
+                                ((npy_float *)data_out + 2*3)[0];
+        ((npy_float *)data_out + 2*3)[1] =
+                                ((npy_float *)data0 + 2*3)[1] +
+                                ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_float *)data_out + 2*4)[0] =
+                                ((npy_float *)data0 + 2*4)[0] +
+                                ((npy_float *)data_out + 2*4)[0];
+        ((npy_float *)data_out + 2*4)[1] =
+                                ((npy_float *)data0 + 2*4)[1] +
+                                ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_float *)data_out + 2*5)[0] =
+                                ((npy_float *)data0 + 2*5)[0] +
+                                ((npy_float *)data_out + 2*5)[0];
+        ((npy_float *)data_out + 2*5)[1] =
+                                ((npy_float *)data0 + 2*5)[1] +
+                                ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_float *)data_out + 2*6)[0] =
+                                ((npy_float *)data0 + 2*6)[0] +
+                                ((npy_float *)data_out + 2*6)[0];
+        ((npy_float *)data_out + 2*6)[1] =
+                                ((npy_float *)data0 + 2*6)[1] +
+                                ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_float *)data_out + 2*7)[0] =
+                                ((npy_float *)data0 + 2*7)[0] +
+                                ((npy_float *)data_out + 2*7)[0];
+        ((npy_float *)data_out + 2*7)[1] =
+                                ((npy_float *)data0 + 2*7)[1] +
+                                ((npy_float *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 3 == 2 && !0
+/* NOTE: the "#elif 3 == 2 && !0" just above is false (3 != 2), so the preprocessor discards this region here. */
+// Multiply-add helper: data_out[i] = scalar * data[i] + data_out[i] for i in [0, count) (see the scalar fallback).
+static NPY_GCC_OPT_3 void
+float_sum_of_products_muladd(npy_float *data, npy_float *data_out, npy_float scalar, npy_intp count)
+{
+#if NPY_SIMD_F32 // NPYV check for npy_float
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f32;
+    const npyv_f32 v_scalar = npyv_setall_f32(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f32 b0 = npyv_loada_f32(data + vstep * 0);
+            npyv_f32 c0 = npyv_loada_f32(data_out + vstep * 0);
+            
+#line 312
+            npyv_f32 b1 = npyv_loada_f32(data + vstep * 1);
+            npyv_f32 c1 = npyv_loada_f32(data_out + vstep * 1);
+            
+#line 312
+            npyv_f32 b2 = npyv_loada_f32(data + vstep * 2);
+            npyv_f32 c2 = npyv_loada_f32(data_out + vstep * 2);
+            
+#line 312
+            npyv_f32 b3 = npyv_loada_f32(data + vstep * 3);
+            npyv_f32 c3 = npyv_loada_f32(data_out + vstep * 3);
+            /* abcN combines v_scalar, bN and cN via npyv_muladd (the vector form of scalar * b + c below) */
+            #line 318
+            npyv_f32 abc0 = npyv_muladd_f32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f32 abc1 = npyv_muladd_f32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f32 abc2 = npyv_muladd_f32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f32 abc3 = npyv_muladd_f32(v_scalar, b3, c3);
+            /* write the four results back over the data_out positions that were loaded */
+            #line 323
+            npyv_storea_f32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* otherwise: the same 4-vector loop, written with the npyv_load / npyv_store variants */
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f32 b0 = npyv_load_f32(data + vstep * 0);
+            npyv_f32 c0 = npyv_load_f32(data_out + vstep * 0);
+            
+#line 312
+            npyv_f32 b1 = npyv_load_f32(data + vstep * 1);
+            npyv_f32 c1 = npyv_load_f32(data_out + vstep * 1);
+            
+#line 312
+            npyv_f32 b2 = npyv_load_f32(data + vstep * 2);
+            npyv_f32 c2 = npyv_load_f32(data_out + vstep * 2);
+            
+#line 312
+            npyv_f32 b3 = npyv_load_f32(data + vstep * 3);
+            npyv_f32 c3 = npyv_load_f32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f32 abc0 = npyv_muladd_f32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f32 abc1 = npyv_muladd_f32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f32 abc2 = npyv_muladd_f32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f32 abc3 = npyv_muladd_f32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_f32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* tail: fewer than 4 * vstep elements remain; tillz/till receive count, presumably to limit the lanes touched (TODO confirm in NPYV docs) */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data, count);
+        npyv_f32 b = npyv_load_tillz_f32(data_out, count);
+        npyv_store_till_f32(data_out, count, npyv_muladd_f32(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* scalar fallback, used when NPY_SIMD_F32 evaluates to 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_float b0 = (data[0]);
+        const npy_float c0 = (data_out[0]);
+        
+#line 340
+        const npy_float b1 = (data[1]);
+        const npy_float c1 = (data_out[1]);
+        
+#line 340
+        const npy_float b2 = (data[2]);
+        const npy_float c2 = (data_out[2]);
+        
+#line 340
+        const npy_float b3 = (data[3]);
+        const npy_float c3 = (data_out[3]);
+        /* all eight inputs are read before any of the four outputs is written */
+        #line 346
+        const npy_float abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_float abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_float abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_float abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_float b = (*data);
+        const npy_float c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_float
+}
+/* Two-operand kernel over unit-stride arrays: data_out[i] = data0[i] * data1[i] + data_out[i] for i in [0, count); strides is unused. */
+static void
+float_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float *data1 = (npy_float *)dataptr[1];
+    npy_float *data_out = (npy_float *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_float
+#if NPY_SIMD_F32
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f32;
+    /* main loop handles 4 * vstep elements per iteration; loada/storea are used only when all three pointers pass the check */
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f32 a0 = npyv_loada_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_loada_f32(data1 + vstep * 0);
+            npyv_f32 c0 = npyv_loada_f32(data_out + vstep * 0);
+            
+#line 390
+            npyv_f32 a1 = npyv_loada_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_loada_f32(data1 + vstep * 1);
+            npyv_f32 c1 = npyv_loada_f32(data_out + vstep * 1);
+            
+#line 390
+            npyv_f32 a2 = npyv_loada_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_loada_f32(data1 + vstep * 2);
+            npyv_f32 c2 = npyv_loada_f32(data_out + vstep * 2);
+            
+#line 390
+            npyv_f32 a3 = npyv_loada_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_loada_f32(data1 + vstep * 3);
+            npyv_f32 c3 = npyv_loada_f32(data_out + vstep * 3);
+            /* abcN combines aN, bN and cN via npyv_muladd (vector form of a * b + c in the scalar fallback) */
+            #line 397
+            npyv_f32 abc0 = npyv_muladd_f32(a0, b0, c0);
+            
+#line 397
+            npyv_f32 abc1 = npyv_muladd_f32(a1, b1, c1);
+            
+#line 397
+            npyv_f32 abc2 = npyv_muladd_f32(a2, b2, c2);
+            
+#line 397
+            npyv_f32 abc3 = npyv_muladd_f32(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_f32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* otherwise: the same loop, written with the npyv_load / npyv_store variants */
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f32 a0 = npyv_load_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_load_f32(data1 + vstep * 0);
+            npyv_f32 c0 = npyv_load_f32(data_out + vstep * 0);
+            
+#line 390
+            npyv_f32 a1 = npyv_load_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_load_f32(data1 + vstep * 1);
+            npyv_f32 c1 = npyv_load_f32(data_out + vstep * 1);
+            
+#line 390
+            npyv_f32 a2 = npyv_load_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_load_f32(data1 + vstep * 2);
+            npyv_f32 c2 = npyv_load_f32(data_out + vstep * 2);
+            
+#line 390
+            npyv_f32 a3 = npyv_load_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_load_f32(data1 + vstep * 3);
+            npyv_f32 c3 = npyv_load_f32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f32 abc0 = npyv_muladd_f32(a0, b0, c0);
+            
+#line 397
+            npyv_f32 abc1 = npyv_muladd_f32(a1, b1, c1);
+            
+#line 397
+            npyv_f32 abc2 = npyv_muladd_f32(a2, b2, c2);
+            
+#line 397
+            npyv_f32 abc3 = npyv_muladd_f32(a3, b3, c3);
+            
+            #line 402
+            npyv_store_f32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* tail: fewer than 4 * vstep elements remain; tillz/till receive count, presumably to limit the lanes touched (TODO confirm in NPYV docs) */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data0, count);
+        npyv_f32 b = npyv_load_tillz_f32(data1, count);
+        npyv_f32 c = npyv_load_tillz_f32(data_out, count);
+        npyv_store_till_f32(data_out, count, npyv_muladd_f32(a, b, c));
+    }
+    npyv_cleanup();
+#else /* scalar fallback, used when NPY_SIMD_F32 evaluates to 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_float a0 = (data0[0]);
+        const npy_float b0 = (data1[0]);
+        const npy_float c0 = (data_out[0]);
+        
+#line 420
+        const npy_float a1 = (data0[1]);
+        const npy_float b1 = (data1[1]);
+        const npy_float c1 = (data_out[1]);
+        
+#line 420
+        const npy_float a2 = (data0[2]);
+        const npy_float b2 = (data1[2]);
+        const npy_float c2 = (data_out[2]);
+        
+#line 420
+        const npy_float a3 = (data0[3]);
+        const npy_float b3 = (data1[3]);
+        const npy_float c3 = (data_out[3]);
+        /* all twelve inputs are read before any of the four outputs is written */
+        #line 427
+        const npy_float abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_float abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_float abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_float abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_float a = (*data0);
+        const npy_float b = (*data1);
+        const npy_float c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_float
+
+}
+/* Some extra specializations for the two operand case */
+/* Operand 0 is read once as a single value; the work is delegated to the muladd helper with (data1, data_out, value0, count). */
+static void
+float_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float value0 = (*(npy_float *)dataptr[0]);
+    npy_float *data1 = (npy_float *)dataptr[1];
+    npy_float *data_out = (npy_float *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    float_sum_of_products_muladd(data1, data_out, value0, count);
+    /* nop and strides are not consulted; dataptr[] itself is left unmodified */
+}
+/* Operand 1 is read once as a single value; the work is delegated to the muladd helper with (data0, data_out, value1, count). */
+static void
+float_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float value1 = (*(npy_float *)dataptr[1]);
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float *data_out = (npy_float *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    float_sum_of_products_muladd(data0, data_out, value1, count);
+}
+/* Dot-product style kernel: adds the sum over i in [0, count) of data0[i] * data1[i] to the single value at dataptr[2]. */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float *data1 = (npy_float *)dataptr[1];
+    npy_float accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if NPY_SIMD_F32 // NPYV check for npy_float
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_f32;
+    npyv_f32 v_accum = npyv_zero_f32();
+    /* v_accum is a vector accumulator; it is collapsed into the scalar accum by npyv_sum_f32 after the loops */
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f32 a0 = npyv_loada_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_loada_f32(data1 + vstep * 0);
+            
+#line 501
+            npyv_f32 a1 = npyv_loada_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_loada_f32(data1 + vstep * 1);
+            
+#line 501
+            npyv_f32 a2 = npyv_loada_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_loada_f32(data1 + vstep * 2);
+            
+#line 501
+            npyv_f32 a3 = npyv_loada_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_loada_f32(data1 + vstep * 3);
+            /* each muladd takes the previous result as its addend, chaining pair 3 down to pair 0 into v_accum */
+            npyv_f32 ab3 = npyv_muladd_f32(a3, b3, v_accum);
+            npyv_f32 ab2 = npyv_muladd_f32(a2, b2, ab3);
+            npyv_f32 ab1 = npyv_muladd_f32(a1, b1, ab2);
+                   v_accum = npyv_muladd_f32(a0, b0, ab1);
+        }
+    }
+    /* otherwise: the same loop, written with the npyv_load variant */
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f32 a0 = npyv_load_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_load_f32(data1 + vstep * 0);
+            
+#line 501
+            npyv_f32 a1 = npyv_load_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_load_f32(data1 + vstep * 1);
+            
+#line 501
+            npyv_f32 a2 = npyv_load_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_load_f32(data1 + vstep * 2);
+            
+#line 501
+            npyv_f32 a3 = npyv_load_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_load_f32(data1 + vstep * 3);
+            
+            npyv_f32 ab3 = npyv_muladd_f32(a3, b3, v_accum);
+            npyv_f32 ab2 = npyv_muladd_f32(a2, b2, ab3);
+            npyv_f32 ab1 = npyv_muladd_f32(a1, b1, ab2);
+                   v_accum = npyv_muladd_f32(a0, b0, ab1);
+        }
+    }
+    /* tail: tillz loads receive count; lanes past it presumably read as zero and so add nothing (TODO confirm in NPYV docs) */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data0, count);
+        npyv_f32 b = npyv_load_tillz_f32(data1, count);
+        v_accum = npyv_muladd_f32(a, b, v_accum);
+    }
+    accum = npyv_sum_f32(v_accum);
+    npyv_cleanup();
+#else /* scalar fallback, used when NPY_SIMD_F32 evaluates to 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_float ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_float ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_float ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_float ab3 = (data0[3]) * (data1[3]);
+        /* the four products are summed together first, then added to accum */
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_float a = (*data0);
+        const npy_float b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_float
+    *(npy_float *)dataptr[2] = ((*(npy_float *)dataptr[2]) + accum);
+}
+/* Operand 0 is one value: *dataptr[2] += value0 * float_sum_of_arr(data1, count). The helper is defined outside this excerpt. */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data1 = (npy_float *)dataptr[1];
+    npy_float value0 = (*(npy_float *)dataptr[0]);
+    npy_float accum = float_sum_of_arr(data1, count);
+    *(npy_float *)dataptr[2] = ((*(npy_float *)dataptr[2]) + value0 * accum);
+}
+/* Operand 1 is one value: *dataptr[2] += value1 * float_sum_of_arr(data0, count). The helper is defined outside this excerpt. */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float value1 = (*(npy_float *)dataptr[1]);
+    npy_float accum = float_sum_of_arr(data0, count);
+    *(npy_float *)dataptr[2] = ((*(npy_float *)dataptr[2]) + value1 * accum);
+}
+
+#elif 3 == 3 && !0
+/* Three-operand kernel over unit-stride arrays: data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]; strides is unused. */
+static void
+float_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float *data1 = (npy_float *)dataptr[1];
+    npy_float *data2 = (npy_float *)dataptr[2];
+    npy_float *data_out = (npy_float *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: each step below returns once count has reached 0, otherwise handles one more element */
+    /* NOTE: for a non-negative count at most 7 elements remain here, so the final data_out[7] statement is never reached */
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 3 > 3 || @complex */
+/* Generic variant in the "#else" arm: the preceding "#elif 3 == 3 && !0" is true, so this definition is not compiled here. */
+static void
+float_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_three (%d)\n",
+                                                    (int)count);
+    /* Per element: multiply the nop inputs, add the current output value, store to dataptr[nop], then advance every pointer */
+    while (count--) {
+#if !0
+        npy_float temp = (*(npy_float *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_float *)dataptr[i]);
+        }
+        *(npy_float *)dataptr[nop] = (temp +
+                                           (*(npy_float *)dataptr[i])); /* i == nop once the loop above has ended */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_float);
+        }
+#else /* complex */
+#  if 3 <= 3
+#    define _SUMPROD_NOP 3
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_float);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 3 */
+
+#if 3 == 1
+/* Single-operand reduction: *dataptr[1] = float_sum_of_arr(data, count) + *dataptr[1]. Inside "#if 3 == 1" (false): not compiled here. */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_float *data = (npy_float *)dataptr[0];
+    npy_float accum = float_sum_of_arr(data, count);
+    *((npy_float *)dataptr[1]) = (accum + (*((npy_float *)dataptr[1])));
+#else /* interleaved (re, im) variant; dropped by the preprocessor because "!0" is true */
+    npy_float accum_re = 0, accum_im = 0;
+    npy_float *data0 = (npy_float *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_float re01 = data0[0] + data0[2];
+        const npy_float re23 = data0[4] + data0[6];
+        const npy_float im13 = data0[1] + data0[3];
+        const npy_float im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_float *)dataptr[1])[0] += accum_re;
+    ((npy_float *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 3 == 1 */
+/* Strided three-operand reduction: adds the sum over count steps of (*data0) * (*data1) * (*data2) to the value at dataptr[3]. */
+static void
+float_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_float accum_re = 0, accum_im = 0;
+#else
+    npy_float accum = 0;
+#endif
+    /* With the constants expanded here ("3" operands, "!0" true) all three conditions below hold, so data0..data2 are declared */
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+    /* Only the "#if !0" / "#  elif 3 == 3" arm of the loop body is compiled; the char pointers advance by strides[] in bytes */
+    while (count--) {
+#if !0
+#  if 3 == 1
+        accum += (*(npy_float *)data0);
+        data0 += stride0;
+#  elif 3 == 2
+        accum += (*(npy_float *)data0) *
+                 (*(npy_float *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 3 == 3
+        accum += (*(npy_float *)data0) *
+                 (*(npy_float *)data1) *
+                 (*(npy_float *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_float temp = (*(npy_float *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_float *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        accum_re += ((npy_float *)data0)[0];
+        accum_im += ((npy_float *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* "#if 0" is false, so the "#else" / "#  if 3 <= 3" store below is the one compiled: the total is added into dataptr[3] */
+#if 0
+#  if 3 <= 3
+    ((npy_float *)dataptr[3])[0] += accum_re;
+    ((npy_float *)dataptr[3])[1] += accum_im;
+#  else
+    ((npy_float *)dataptr[nop])[0] += accum_re;
+    ((npy_float *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 3 <= 3
+    *((npy_float *)dataptr[3]) = (accum +
+                                    (*((npy_float *)dataptr[3])));
+#  else
+    *((npy_float *)dataptr[nop]) = (accum +
+                                    (*((npy_float *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+/*
+ * Generic strided kernel for an arbitrary number of operands.
+ *
+ * For each of `count` steps, the current npy_float element of operands
+ * 0 .. nop-1 is multiplied together and the product is added into the
+ * output element referenced by dataptr[nop].  Afterwards all nop + 1
+ * pointers are advanced by their entry in `strides`; the pointers stored
+ * in `dataptr` are modified in place.
+ */
+static void
+float_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    int iop;
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_any (%d)\n", (int)count);
+
+    for (; count != 0; --count) {
+        npy_float product = *(npy_float *)dataptr[0];
+
+        iop = 1;
+        while (iop < nop) {
+            product *= *(npy_float *)dataptr[iop];
+            ++iop;
+        }
+
+        /*
+         * The operand loop stops with iop == nop (whenever nop >= 1), so
+         * dataptr[iop] is the output element being accumulated into.
+         */
+        *(npy_float *)dataptr[nop] = product + *(npy_float *)dataptr[iop];
+
+        /* Step every operand and the output to the next element */
+        iop = 0;
+        while (iop <= nop) {
+            dataptr[iop] += strides[iop];
+            ++iop;
+        }
+    }
+}
+
+#if 1000 == 1
+
+/*
+ * Contiguous single-operand kernel: data_out[i] = data0[i] + data_out[i]
+ * for `count` consecutive npy_float elements, where data0 is dataptr[0]
+ * and data_out is dataptr[1].  `nop` and the strides are not used.
+ *
+ * Element order matches the previous switch/goto implementation: full
+ * groups of eight are processed front to back, and the remaining (< 8)
+ * elements are processed from the highest index down to index 0.
+ *
+ * Unlike the switch/goto form, which had no case for a negative `count`
+ * and therefore kept jumping back to the switch forever, a count <= 0
+ * now simply returns without touching any data.
+ */
+static void
+float_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float *data_out = (npy_float *)dataptr[1];
+    npy_intp k;
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+    /* Whole groups of 8 elements, ascending within each group */
+    for (; count >= 8; count -= 8, data0 += 8, data_out += 8) {
+        for (k = 0; k < 8; ++k) {
+            data_out[k] = ((data0[k]) +
+                                 (data_out[k]));
+        }
+    }
+
+    /* Finish off the remaining 0..7 elements, highest index first */
+    for (k = count; k-- > 0; ) {
+        data_out[k] = ((data0[k]) +
+                             (data_out[k]));
+    }
+}
+
+#elif 1000 == 2 && !0
+
+/*
+ * In-place multiply-accumulate over two contiguous npy_float runs:
+ * data_out[i] = scalar * data[i] + data_out[i] for i in [0, count).
+ */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_muladd(npy_float *data, npy_float *data_out, npy_float scalar, npy_intp count)
+{
+#if NPY_SIMD_F32 // NPYV check for npy_float
+    const int lanes = npyv_nlanes_f32;
+    const npy_intp quad = lanes * 4;
+    const npyv_f32 factor = npyv_setall_f32(scalar);
+
+    /* Use aligned instructions if possible */
+    if (EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out)) {
+        while (count >= quad) {
+            /* Read all four vectors of both arrays before any store */
+            const npyv_f32 in0 = npyv_loada_f32(data);
+            const npyv_f32 in1 = npyv_loada_f32(data + lanes);
+            const npyv_f32 in2 = npyv_loada_f32(data + lanes * 2);
+            const npyv_f32 in3 = npyv_loada_f32(data + lanes * 3);
+            const npyv_f32 acc0 = npyv_loada_f32(data_out);
+            const npyv_f32 acc1 = npyv_loada_f32(data_out + lanes);
+            const npyv_f32 acc2 = npyv_loada_f32(data_out + lanes * 2);
+            const npyv_f32 acc3 = npyv_loada_f32(data_out + lanes * 3);
+
+            const npyv_f32 res0 = npyv_muladd_f32(factor, in0, acc0);
+            const npyv_f32 res1 = npyv_muladd_f32(factor, in1, acc1);
+            const npyv_f32 res2 = npyv_muladd_f32(factor, in2, acc2);
+            const npyv_f32 res3 = npyv_muladd_f32(factor, in3, acc3);
+
+            npyv_storea_f32(data_out, res0);
+            npyv_storea_f32(data_out + lanes, res1);
+            npyv_storea_f32(data_out + lanes * 2, res2);
+            npyv_storea_f32(data_out + lanes * 3, res3);
+
+            count -= quad;
+            data += quad;
+            data_out += quad;
+        }
+    }
+    else {
+        while (count >= quad) {
+            /* Same as above, with unaligned loads and stores */
+            const npyv_f32 in0 = npyv_load_f32(data);
+            const npyv_f32 in1 = npyv_load_f32(data + lanes);
+            const npyv_f32 in2 = npyv_load_f32(data + lanes * 2);
+            const npyv_f32 in3 = npyv_load_f32(data + lanes * 3);
+            const npyv_f32 acc0 = npyv_load_f32(data_out);
+            const npyv_f32 acc1 = npyv_load_f32(data_out + lanes);
+            const npyv_f32 acc2 = npyv_load_f32(data_out + lanes * 2);
+            const npyv_f32 acc3 = npyv_load_f32(data_out + lanes * 3);
+
+            const npyv_f32 res0 = npyv_muladd_f32(factor, in0, acc0);
+            const npyv_f32 res1 = npyv_muladd_f32(factor, in1, acc1);
+            const npyv_f32 res2 = npyv_muladd_f32(factor, in2, acc2);
+            const npyv_f32 res3 = npyv_muladd_f32(factor, in3, acc3);
+
+            npyv_store_f32(data_out, res0);
+            npyv_store_f32(data_out + lanes, res1);
+            npyv_store_f32(data_out + lanes * 2, res2);
+            npyv_store_f32(data_out + lanes * 3, res3);
+
+            count -= quad;
+            data += quad;
+            data_out += quad;
+        }
+    }
+
+    /* Remaining elements, one (possibly partial) vector at a time */
+    while (count > 0) {
+        npyv_f32 in = npyv_load_tillz_f32(data, count);
+        npyv_f32 acc = npyv_load_tillz_f32(data_out, count);
+        npyv_store_till_f32(data_out, count, npyv_muladd_f32(in, factor, acc));
+        count -= lanes;
+        data += lanes;
+        data_out += lanes;
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Scalar path, four elements per iteration; loads precede stores */
+    while (count >= 4) {
+        const npy_float in0 = (data[0]);
+        const npy_float in1 = (data[1]);
+        const npy_float in2 = (data[2]);
+        const npy_float in3 = (data[3]);
+        const npy_float acc0 = (data_out[0]);
+        const npy_float acc1 = (data_out[1]);
+        const npy_float acc2 = (data_out[2]);
+        const npy_float acc3 = (data_out[3]);
+
+        const npy_float res0 = scalar * in0 + acc0;
+        const npy_float res1 = scalar * in1 + acc1;
+        const npy_float res2 = scalar * in2 + acc2;
+        const npy_float res3 = scalar * in3 + acc3;
+
+        data_out[0] = (res0);
+        data_out[1] = (res1);
+        data_out[2] = (res2);
+        data_out[3] = (res3);
+
+        count -= 4;
+        data += 4;
+        data_out += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One element at a time for whatever is left */
+    while (count > 0) {
+        const npy_float in = (*data);
+        const npy_float acc = (*data_out);
+        *data_out = (scalar * in + acc);
+        --count;
+        ++data;
+        ++data_out;
+    }
+#endif // NPYV check for npy_float
+}
+
+/*
+ * Contiguous two-operand kernel:
+ * data_out[i] = data0[i] * data1[i] + data_out[i] for `count` consecutive
+ * npy_float elements, with data0/data1/data_out taken from dataptr[0..2].
+ * `nop` and the strides are not used.
+ */
+static void
+float_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float *data1 = (npy_float *)dataptr[1];
+    npy_float *data_out = (npy_float *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_float
+#if NPY_SIMD_F32
+    const int lanes = npyv_nlanes_f32;
+    const npy_intp quad = lanes * 4;
+
+    /* Use aligned instructions if possible */
+    if (EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out)) {
+        while (count >= quad) {
+            /* Read four vectors from each array before any store */
+            const npyv_f32 x0 = npyv_loada_f32(data0);
+            const npyv_f32 x1 = npyv_loada_f32(data0 + lanes);
+            const npyv_f32 x2 = npyv_loada_f32(data0 + lanes * 2);
+            const npyv_f32 x3 = npyv_loada_f32(data0 + lanes * 3);
+            const npyv_f32 y0 = npyv_loada_f32(data1);
+            const npyv_f32 y1 = npyv_loada_f32(data1 + lanes);
+            const npyv_f32 y2 = npyv_loada_f32(data1 + lanes * 2);
+            const npyv_f32 y3 = npyv_loada_f32(data1 + lanes * 3);
+            const npyv_f32 acc0 = npyv_loada_f32(data_out);
+            const npyv_f32 acc1 = npyv_loada_f32(data_out + lanes);
+            const npyv_f32 acc2 = npyv_loada_f32(data_out + lanes * 2);
+            const npyv_f32 acc3 = npyv_loada_f32(data_out + lanes * 3);
+
+            const npyv_f32 res0 = npyv_muladd_f32(x0, y0, acc0);
+            const npyv_f32 res1 = npyv_muladd_f32(x1, y1, acc1);
+            const npyv_f32 res2 = npyv_muladd_f32(x2, y2, acc2);
+            const npyv_f32 res3 = npyv_muladd_f32(x3, y3, acc3);
+
+            npyv_storea_f32(data_out, res0);
+            npyv_storea_f32(data_out + lanes, res1);
+            npyv_storea_f32(data_out + lanes * 2, res2);
+            npyv_storea_f32(data_out + lanes * 3, res3);
+
+            count -= quad;
+            data0 += quad;
+            data1 += quad;
+            data_out += quad;
+        }
+    }
+    else {
+        while (count >= quad) {
+            /* Same as above, with unaligned loads and stores */
+            const npyv_f32 x0 = npyv_load_f32(data0);
+            const npyv_f32 x1 = npyv_load_f32(data0 + lanes);
+            const npyv_f32 x2 = npyv_load_f32(data0 + lanes * 2);
+            const npyv_f32 x3 = npyv_load_f32(data0 + lanes * 3);
+            const npyv_f32 y0 = npyv_load_f32(data1);
+            const npyv_f32 y1 = npyv_load_f32(data1 + lanes);
+            const npyv_f32 y2 = npyv_load_f32(data1 + lanes * 2);
+            const npyv_f32 y3 = npyv_load_f32(data1 + lanes * 3);
+            const npyv_f32 acc0 = npyv_load_f32(data_out);
+            const npyv_f32 acc1 = npyv_load_f32(data_out + lanes);
+            const npyv_f32 acc2 = npyv_load_f32(data_out + lanes * 2);
+            const npyv_f32 acc3 = npyv_load_f32(data_out + lanes * 3);
+
+            const npyv_f32 res0 = npyv_muladd_f32(x0, y0, acc0);
+            const npyv_f32 res1 = npyv_muladd_f32(x1, y1, acc1);
+            const npyv_f32 res2 = npyv_muladd_f32(x2, y2, acc2);
+            const npyv_f32 res3 = npyv_muladd_f32(x3, y3, acc3);
+
+            npyv_store_f32(data_out, res0);
+            npyv_store_f32(data_out + lanes, res1);
+            npyv_store_f32(data_out + lanes * 2, res2);
+            npyv_store_f32(data_out + lanes * 3, res3);
+
+            count -= quad;
+            data0 += quad;
+            data1 += quad;
+            data_out += quad;
+        }
+    }
+
+    /* Remaining elements, one (possibly partial) vector at a time */
+    while (count > 0) {
+        npyv_f32 x = npyv_load_tillz_f32(data0, count);
+        npyv_f32 y = npyv_load_tillz_f32(data1, count);
+        npyv_f32 acc = npyv_load_tillz_f32(data_out, count);
+        npyv_store_till_f32(data_out, count, npyv_muladd_f32(x, y, acc));
+        count -= lanes;
+        data0 += lanes;
+        data1 += lanes;
+        data_out += lanes;
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Scalar path, four elements per iteration; loads precede stores */
+    while (count >= 4) {
+        const npy_float x0 = (data0[0]);
+        const npy_float x1 = (data0[1]);
+        const npy_float x2 = (data0[2]);
+        const npy_float x3 = (data0[3]);
+        const npy_float y0 = (data1[0]);
+        const npy_float y1 = (data1[1]);
+        const npy_float y2 = (data1[2]);
+        const npy_float y3 = (data1[3]);
+        const npy_float acc0 = (data_out[0]);
+        const npy_float acc1 = (data_out[1]);
+        const npy_float acc2 = (data_out[2]);
+        const npy_float acc3 = (data_out[3]);
+
+        const npy_float res0 = x0 * y0 + acc0;
+        const npy_float res1 = x1 * y1 + acc1;
+        const npy_float res2 = x2 * y2 + acc2;
+        const npy_float res3 = x3 * y3 + acc3;
+
+        data_out[0] = (res0);
+        data_out[1] = (res1);
+        data_out[2] = (res2);
+        data_out[3] = (res3);
+
+        count -= 4;
+        data0 += 4;
+        data1 += 4;
+        data_out += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One element at a time for whatever is left */
+    while (count > 0) {
+        const npy_float x = (*data0);
+        const npy_float y = (*data1);
+        const npy_float acc = (*data_out);
+        *data_out = (x * y + acc);
+        --count;
+        ++data0;
+        ++data1;
+        ++data_out;
+    }
+#endif // NPYV check for npy_float
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand case where operand 0 is a single value (read once from
+ * dataptr[0]) and operand 1 and the output are contiguous:
+ * out[i] = value0 * data1[i] + out[i], delegated to the muladd helper.
+ */
+static void
+float_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_float scale = (*(npy_float *)dataptr[0]);
+    npy_float *src = (npy_float *)dataptr[1];
+    npy_float *dst = (npy_float *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    float_sum_of_products_muladd(src, dst, scale, count);
+}
+
+/*
+ * Two-operand case where operand 1 is a single value (read once from
+ * dataptr[1]) and operand 0 and the output are contiguous:
+ * out[i] = value1 * data0[i] + out[i], delegated to the muladd helper.
+ */
+static void
+float_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_float scale = (*(npy_float *)dataptr[1]);
+    npy_float *src = (npy_float *)dataptr[0];
+    npy_float *dst = (npy_float *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    float_sum_of_products_muladd(src, dst, scale, count);
+}
+
+/*
+ * Dot-product style kernel: both operands are contiguous and the output
+ * is a single element.  Accumulates sum(data0[i] * data1[i]) over `count`
+ * elements and adds the total to the npy_float at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float *data1 = (npy_float *)dataptr[1];
+    npy_float accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if NPY_SIMD_F32 // NPYV check for npy_float
+    const int lanes = npyv_nlanes_f32;
+    const npy_intp quad = lanes * 4;
+    npyv_f32 vsum = npyv_zero_f32();
+
+    /* Use aligned instructions if possible */
+    if (EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1)) {
+        while (count >= quad) {
+            const npyv_f32 x0 = npyv_loada_f32(data0);
+            const npyv_f32 y0 = npyv_loada_f32(data1);
+            const npyv_f32 x1 = npyv_loada_f32(data0 + lanes);
+            const npyv_f32 y1 = npyv_loada_f32(data1 + lanes);
+            const npyv_f32 x2 = npyv_loada_f32(data0 + lanes * 2);
+            const npyv_f32 y2 = npyv_loada_f32(data1 + lanes * 2);
+            const npyv_f32 x3 = npyv_loada_f32(data0 + lanes * 3);
+            const npyv_f32 y3 = npyv_loada_f32(data1 + lanes * 3);
+
+            /* Chain the four products into the accumulator: 3, 2, 1, 0 */
+            vsum = npyv_muladd_f32(x3, y3, vsum);
+            vsum = npyv_muladd_f32(x2, y2, vsum);
+            vsum = npyv_muladd_f32(x1, y1, vsum);
+            vsum = npyv_muladd_f32(x0, y0, vsum);
+
+            count -= quad;
+            data0 += quad;
+            data1 += quad;
+        }
+    }
+    else {
+        while (count >= quad) {
+            const npyv_f32 x0 = npyv_load_f32(data0);
+            const npyv_f32 y0 = npyv_load_f32(data1);
+            const npyv_f32 x1 = npyv_load_f32(data0 + lanes);
+            const npyv_f32 y1 = npyv_load_f32(data1 + lanes);
+            const npyv_f32 x2 = npyv_load_f32(data0 + lanes * 2);
+            const npyv_f32 y2 = npyv_load_f32(data1 + lanes * 2);
+            const npyv_f32 x3 = npyv_load_f32(data0 + lanes * 3);
+            const npyv_f32 y3 = npyv_load_f32(data1 + lanes * 3);
+
+            /* Chain the four products into the accumulator: 3, 2, 1, 0 */
+            vsum = npyv_muladd_f32(x3, y3, vsum);
+            vsum = npyv_muladd_f32(x2, y2, vsum);
+            vsum = npyv_muladd_f32(x1, y1, vsum);
+            vsum = npyv_muladd_f32(x0, y0, vsum);
+
+            count -= quad;
+            data0 += quad;
+            data1 += quad;
+        }
+    }
+
+    /* Remaining elements, one (possibly partial) vector at a time */
+    while (count > 0) {
+        npyv_f32 x = npyv_load_tillz_f32(data0, count);
+        npyv_f32 y = npyv_load_tillz_f32(data1, count);
+        vsum = npyv_muladd_f32(x, y, vsum);
+        count -= lanes;
+        data0 += lanes;
+        data1 += lanes;
+    }
+    /* Reduce the vector accumulator to a single value */
+    accum = npyv_sum_f32(vsum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Scalar path, four products per iteration */
+    while (count >= 4) {
+        const npy_float p0 = (data0[0]) * (data1[0]);
+        const npy_float p1 = (data0[1]) * (data1[1]);
+        const npy_float p2 = (data0[2]) * (data1[2]);
+        const npy_float p3 = (data0[3]) * (data1[3]);
+
+        accum += p0 + p1 + p2 + p3;
+
+        count -= 4;
+        data0 += 4;
+        data1 += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One element at a time for whatever is left */
+    while (count > 0) {
+        const npy_float x = (*data0);
+        const npy_float y = (*data1);
+        accum += x * y;
+        --count;
+        ++data0;
+        ++data1;
+    }
+#endif // NPYV check for npy_float
+    *(npy_float *)dataptr[2] = ((*(npy_float *)dataptr[2]) + accum);
+}
+
+/*
+ * Operand 0 is a single value, operand 1 is contiguous and the output is
+ * a single element: adds value0 * float_sum_of_arr(data1, count) to the
+ * npy_float at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_float scale = (*(npy_float *)dataptr[0]);
+    const npy_float total = float_sum_of_arr((npy_float *)dataptr[1], count);
+    npy_float *dst = (npy_float *)dataptr[2];
+
+    *dst = ((*dst) + scale * total);
+}
+
+/*
+ * Operand 0 is contiguous, operand 1 is a single value and the output is
+ * a single element: adds value1 * float_sum_of_arr(data0, count) to the
+ * npy_float at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_float scale = (*(npy_float *)dataptr[1]);
+    const npy_float total = float_sum_of_arr((npy_float *)dataptr[0], count);
+    npy_float *dst = (npy_float *)dataptr[2];
+
+    *dst = ((*dst) + scale * total);
+}
+
+#elif 1000 == 3 && !0
+
+/*
+ * Contiguous three-operand kernel:
+ * data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i] for `count`
+ * consecutive npy_float elements, with the arrays taken from
+ * dataptr[0..3].  `nop` and the strides are not used.
+ */
+static void
+float_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float *data0 = (npy_float *)dataptr[0];
+    npy_float *data1 = (npy_float *)dataptr[1];
+    npy_float *data2 = (npy_float *)dataptr[2];
+    npy_float *data_out = (npy_float *)dataptr[3];
+    int k;
+
+    /* Whole groups of 8 elements */
+    for (; count >= 8; count -= 8) {
+        for (k = 0; k < 8; ++k) {
+            data_out[k] = ((data0[k]) *
+                                 (data1[k]) *
+                                 (data2[k]) +
+                                 (data_out[k]));
+        }
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /*
+     * Finish off the loop: visit up to 8 further slots in ascending order,
+     * returning as soon as the remaining count reaches zero.
+     */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        data_out[k] = ((data0[k]) *
+                             (data1[k]) *
+                             (data2[k]) +
+                             (data_out[k]));
+    }
+
+}
+
+#else /* 1000 > 3 || @complex */
+
+static void
+float_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+    /* Each pass: item at dataptr[nop] += product of the items at dataptr[0..nop-1]. */
+    while (count--) {
+#if !0 /* branch compiled in this instantiation; the #else branch is compiled out */
+        npy_float temp = (*(npy_float *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_float *)dataptr[i]);
+        }
+        *(npy_float *)dataptr[nop] = (temp +
+                                           (*(npy_float *)dataptr[i])); /* i == nop after the loop (given nop >= 1) */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_float); /* advance every input and the output by one item */
+        }
+#else /* complex */
+#  if 1000 <= 3
+#    define _SUMPROD_NOP 1000
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_float);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+/* The guard above is false in this instantiation, so this function is not compiled here. */
+static NPY_GCC_OPT_3 void
+float_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_float *data = (npy_float *)dataptr[0];
+    npy_float accum = float_sum_of_arr(data, count); /* presumably the sum of count items -- confirm in float_sum_of_arr */
+    *((npy_float *)dataptr[1]) = (accum + (*((npy_float *)dataptr[1]))); /* add to the single output item */
+#else
+    npy_float accum_re = 0, accum_im = 0;
+    npy_float *data0 = (npy_float *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) { /* four (re, im) pairs per pass */
+        const npy_float re01 = data0[0] + data0[2];
+        const npy_float re23 = data0[4] + data0[6];
+        const npy_float im13 = data0[1] + data0[3];
+        const npy_float im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) { /* remaining pairs one at a time */
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_float *)dataptr[1])[0] += accum_re;
+    ((npy_float *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1000 == 1 */
+
+static void
+float_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_float accum_re = 0, accum_im = 0;
+#else
+    npy_float accum = 0;
+#endif
+    /* accum gathers the per-item products and is added to the output item once, after the loop. */
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+    /* None of the data0/data1/data2 guards above hold for 1000, so the loop works on dataptr directly. */
+    NPY_EINSUM_DBG_PRINT1("float_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        accum += (*(npy_float *)data0);
+        data0 += stride0;
+#  elif 1000 == 2
+        accum += (*(npy_float *)data0) *
+                 (*(npy_float *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1000 == 3
+        accum += (*(npy_float *)data0) *
+                 (*(npy_float *)data1) *
+                 (*(npy_float *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else /* branch compiled here: operand count comes from nop at run time */
+        npy_float temp = (*(npy_float *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_float *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i]; /* inputs only (i < nop); the output pointer is not advanced */
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        accum_re += ((npy_float *)data0)[0];
+        accum_im += ((npy_float *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* Add the total to the output item; only the dataptr[nop] variant below is compiled here. */
+#if 0
+#  if 1000 <= 3
+    ((npy_float *)dataptr[1000])[0] += accum_re;
+    ((npy_float *)dataptr[1000])[1] += accum_im;
+#  else
+    ((npy_float *)dataptr[nop])[0] += accum_re;
+    ((npy_float *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1000 <= 3
+    *((npy_float *)dataptr[1000]) = (accum +
+                                    (*((npy_float *)dataptr[1000])));
+#  else
+    *((npy_float *)dataptr[nop]) = (accum +
+                                    (*((npy_float *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+
+
+#line 74
+
+#if !0 /* helper emitted for the non-complex instantiation only */
+static NPY_GCC_OPT_3 npy_double double_sum_of_arr(npy_double *data, npy_intp count)
+{
+    npy_double accum = 0; /* result: sum of data[0..count-1] */
+#if NPY_SIMD_F64 // NPYV check for npy_double
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data);
+    const int vstep = npyv_nlanes_f64;
+    npyv_f64 v_accum = npyv_zero_f64();
+    const npy_intp vstepx4 = vstep * 4;
+    /* Main loops: load 4 vectors, add them pairwise, then add the result into v_accum. */
+    #line 91
+    if(is_aligned) {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_f64 a0 = npyv_loada_f64(data + vstep * 0);
+            
+#line 96
+            npyv_f64 a1 = npyv_loada_f64(data + vstep * 1);
+            
+#line 96
+            npyv_f64 a2 = npyv_loada_f64(data + vstep * 2);
+            
+#line 96
+            npyv_f64 a3 = npyv_loada_f64(data + vstep * 3);
+            
+            npyv_f64 a01   = npyv_add_f64(a0, a1);
+            npyv_f64 a23   = npyv_add_f64(a2, a3);
+            npyv_f64 a0123 = npyv_add_f64(a01, a23);
+                     v_accum = npyv_add_f64(a0123, v_accum);
+        }
+    }
+    
+#line 91
+    else {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_f64 a0 = npyv_load_f64(data + vstep * 0);
+            
+#line 96
+            npyv_f64 a1 = npyv_load_f64(data + vstep * 1);
+            
+#line 96
+            npyv_f64 a2 = npyv_load_f64(data + vstep * 2);
+            
+#line 96
+            npyv_f64 a3 = npyv_load_f64(data + vstep * 3);
+            
+            npyv_f64 a01   = npyv_add_f64(a0, a1);
+            npyv_f64 a23   = npyv_add_f64(a2, a3);
+            npyv_f64 a0123 = npyv_add_f64(a01, a23);
+                     v_accum = npyv_add_f64(a0123, v_accum);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep) { /* tail: fewer than 4*vstep items left */
+        npyv_f64 a = npyv_load_tillz_f64(data, count); /* NOTE(review): assumed to load at most count lanes and zero the rest -- confirm */
+        v_accum = npyv_add_f64(a, v_accum);
+    }
+    accum = npyv_sum_f64(v_accum); /* collapse the vector accumulator into one scalar */
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data += 4) { /* strict '>': a final group of exactly 4 is left to the loop below */
+        const npy_double a01 = (*data) + (data[1]);
+        const npy_double a23 = (data[2]) + (data[3]);
+        accum +=  a01 + a23;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data++) { /* remaining items one at a time */
+        accum += (*data);
+    }
+#endif // NPYV check for npy_double
+    return accum;
+}
+#endif
+
+#line 131
+static void
+double_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data_out = dataptr[1];
+    npy_intp stride_out = strides[1];
+#endif
+    /* Strided single-operand case: each pass does *data_out += *data0, then steps both by their byte strides. */
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_one (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 1 == 1 /* the only branch compiled in this instantiation */
+        *(npy_double *)data_out = ((*(npy_double *)data0) +
+                                         (*(npy_double *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1 == 2
+        *(npy_double *)data_out = ((*(npy_double *)data0) *
+                                         (*(npy_double *)data1) +
+                                         (*(npy_double *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1 == 3
+        *(npy_double *)data_out = ((*(npy_double *)data0) *
+                                         (*(npy_double *)data1) *
+                                         (*(npy_double *)data2) +
+                                         (*(npy_double *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_double temp = (*(npy_double *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_double *)dataptr[i]);
+        }
+        *(npy_double *)dataptr[nop] = (temp +
+                                           (*(npy_double *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        ((npy_double *)data_out)[0] = ((npy_double *)data0)[0] +
+                                         ((npy_double *)data_out)[0];
+        ((npy_double *)data_out)[1] = ((npy_double *)data0)[1] +
+                                         ((npy_double *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_double re, im, tmp;
+        int i;
+        re = ((npy_double *)dataptr[0])[0];
+        im = ((npy_double *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_double *)dataptr[i])[0] -
+                  im * ((npy_double *)dataptr[i])[1];
+            im = re * ((npy_double *)dataptr[i])[1] +
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_double *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_double *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1 == 1
+
+static void
+double_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double *data0 = (npy_double *)dataptr[0];
+    npy_double *data_out = (npy_double *)dataptr[1];
+    /* Contiguous single-operand case: data_out[k] += data0[k] for each of the count items. */
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_double *)data_out + 2*6)[0] =
+                                    ((npy_double *)data0 + 2*6)[0] +
+                                    ((npy_double *)data_out + 2*6)[0];
+            ((npy_double *)data_out + 2*6)[1] =
+                                    ((npy_double *)data0 + 2*6)[1] +
+                                    ((npy_double *)data_out + 2*6)[1];
+#endif
+            /* fall through */
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_double *)data_out + 2*5)[0] =
+                                    ((npy_double *)data0 + 2*5)[0] +
+                                    ((npy_double *)data_out + 2*5)[0];
+            ((npy_double *)data_out + 2*5)[1] =
+                                    ((npy_double *)data0 + 2*5)[1] +
+                                    ((npy_double *)data_out + 2*5)[1];
+#endif
+            /* fall through */
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_double *)data_out + 2*4)[0] =
+                                    ((npy_double *)data0 + 2*4)[0] +
+                                    ((npy_double *)data_out + 2*4)[0];
+            ((npy_double *)data_out + 2*4)[1] =
+                                    ((npy_double *)data0 + 2*4)[1] +
+                                    ((npy_double *)data_out + 2*4)[1];
+#endif
+            /* fall through */
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_double *)data_out + 2*3)[0] =
+                                    ((npy_double *)data0 + 2*3)[0] +
+                                    ((npy_double *)data_out + 2*3)[0];
+            ((npy_double *)data_out + 2*3)[1] =
+                                    ((npy_double *)data0 + 2*3)[1] +
+                                    ((npy_double *)data_out + 2*3)[1];
+#endif
+            /* fall through */
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_double *)data_out + 2*2)[0] =
+                                    ((npy_double *)data0 + 2*2)[0] +
+                                    ((npy_double *)data_out + 2*2)[0];
+            ((npy_double *)data_out + 2*2)[1] =
+                                    ((npy_double *)data0 + 2*2)[1] +
+                                    ((npy_double *)data_out + 2*2)[1];
+#endif
+            /* fall through */
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_double *)data_out + 2*1)[0] =
+                                    ((npy_double *)data0 + 2*1)[0] +
+                                    ((npy_double *)data_out + 2*1)[0];
+            ((npy_double *)data_out + 2*1)[1] =
+                                    ((npy_double *)data0 + 2*1)[1] +
+                                    ((npy_double *)data_out + 2*1)[1];
+#endif
+            /* fall through */
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_double *)data_out + 2*0)[0] =
+                                    ((npy_double *)data0 + 2*0)[0] +
+                                    ((npy_double *)data_out + 2*0)[0];
+            ((npy_double *)data_out + 2*0)[1] =
+                                    ((npy_double *)data0 + 2*0)[1] +
+                                    ((npy_double *)data_out + 2*0)[1];
+#endif
+            /* fall through */
+        case 0:
+            return;
+    }
+    /* Counts of 8 or more match no case label above and continue into the unrolled loop. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_double *)data_out + 2*0)[0] =
+                                ((npy_double *)data0 + 2*0)[0] +
+                                ((npy_double *)data_out + 2*0)[0];
+        ((npy_double *)data_out + 2*0)[1] =
+                                ((npy_double *)data0 + 2*0)[1] +
+                                ((npy_double *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_double *)data_out + 2*1)[0] =
+                                ((npy_double *)data0 + 2*1)[0] +
+                                ((npy_double *)data_out + 2*1)[0];
+        ((npy_double *)data_out + 2*1)[1] =
+                                ((npy_double *)data0 + 2*1)[1] +
+                                ((npy_double *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_double *)data_out + 2*2)[0] =
+                                ((npy_double *)data0 + 2*2)[0] +
+                                ((npy_double *)data_out + 2*2)[0];
+        ((npy_double *)data_out + 2*2)[1] =
+                                ((npy_double *)data0 + 2*2)[1] +
+                                ((npy_double *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_double *)data_out + 2*3)[0] =
+                                ((npy_double *)data0 + 2*3)[0] +
+                                ((npy_double *)data_out + 2*3)[0];
+        ((npy_double *)data_out + 2*3)[1] =
+                                ((npy_double *)data0 + 2*3)[1] +
+                                ((npy_double *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_double *)data_out + 2*4)[0] =
+                                ((npy_double *)data0 + 2*4)[0] +
+                                ((npy_double *)data_out + 2*4)[0];
+        ((npy_double *)data_out + 2*4)[1] =
+                                ((npy_double *)data0 + 2*4)[1] +
+                                ((npy_double *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_double *)data_out + 2*5)[0] =
+                                ((npy_double *)data0 + 2*5)[0] +
+                                ((npy_double *)data_out + 2*5)[0];
+        ((npy_double *)data_out + 2*5)[1] =
+                                ((npy_double *)data0 + 2*5)[1] +
+                                ((npy_double *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_double *)data_out + 2*6)[0] =
+                                ((npy_double *)data0 + 2*6)[0] +
+                                ((npy_double *)data_out + 2*6)[0];
+        ((npy_double *)data_out + 2*6)[1] =
+                                ((npy_double *)data0 + 2*6)[1] +
+                                ((npy_double *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_double *)data_out + 2*7)[0] =
+                                ((npy_double *)data0 + 2*7)[0] +
+                                ((npy_double *)data_out + 2*7)[0];
+        ((npy_double *)data_out + 2*7)[1] =
+                                ((npy_double *)data0 + 2*7)[1] +
+                                ((npy_double *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop; /* fewer than 8 items remain; the switch handles them and returns */
+}
+
+#elif 1 == 2 && !0
+
+// Multiply-add over count contiguous items: data_out[k] = scalar * data[k] + data_out[k]
+static NPY_GCC_OPT_3 void
+double_sum_of_products_muladd(npy_double *data, npy_double *data_out, npy_double scalar, npy_intp count)
+{
+#if NPY_SIMD_F64 // NPYV check for npy_double
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f64;
+    const npyv_f64 v_scalar = npyv_setall_f64(scalar); /* scalar placed in a vector for the vector multiply-adds below */
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f64 b0 = npyv_loada_f64(data + vstep * 0);
+            npyv_f64 c0 = npyv_loada_f64(data_out + vstep * 0);
+            
+#line 312
+            npyv_f64 b1 = npyv_loada_f64(data + vstep * 1);
+            npyv_f64 c1 = npyv_loada_f64(data_out + vstep * 1);
+            
+#line 312
+            npyv_f64 b2 = npyv_loada_f64(data + vstep * 2);
+            npyv_f64 c2 = npyv_loada_f64(data_out + vstep * 2);
+            
+#line 312
+            npyv_f64 b3 = npyv_loada_f64(data + vstep * 3);
+            npyv_f64 c3 = npyv_loada_f64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f64 abc0 = npyv_muladd_f64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f64 abc1 = npyv_muladd_f64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f64 abc2 = npyv_muladd_f64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f64 abc3 = npyv_muladd_f64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_f64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f64 b0 = npyv_load_f64(data + vstep * 0);
+            npyv_f64 c0 = npyv_load_f64(data_out + vstep * 0);
+            
+#line 312
+            npyv_f64 b1 = npyv_load_f64(data + vstep * 1);
+            npyv_f64 c1 = npyv_load_f64(data_out + vstep * 1);
+            
+#line 312
+            npyv_f64 b2 = npyv_load_f64(data + vstep * 2);
+            npyv_f64 c2 = npyv_load_f64(data_out + vstep * 2);
+            
+#line 312
+            npyv_f64 b3 = npyv_load_f64(data + vstep * 3);
+            npyv_f64 c3 = npyv_load_f64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f64 abc0 = npyv_muladd_f64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f64 abc1 = npyv_muladd_f64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f64 abc2 = npyv_muladd_f64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f64 abc3 = npyv_muladd_f64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_f64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) { /* tail: fewer than 4*vstep items left */
+        npyv_f64 a = npyv_load_tillz_f64(data, count);
+        npyv_f64 b = npyv_load_tillz_f64(data_out, count);
+        npyv_store_till_f64(data_out, count, npyv_muladd_f64(a, v_scalar, b)); /* NOTE(review): presumably stores only the first count lanes -- confirm */
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { /* four items per pass */
+        #line 340
+        const npy_double b0 = (data[0]);
+        const npy_double c0 = (data_out[0]);
+        
+#line 340
+        const npy_double b1 = (data[1]);
+        const npy_double c1 = (data_out[1]);
+        
+#line 340
+        const npy_double b2 = (data[2]);
+        const npy_double c2 = (data_out[2]);
+        
+#line 340
+        const npy_double b3 = (data[3]);
+        const npy_double c3 = (data_out[3]);
+        
+        #line 346
+        const npy_double abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_double abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_double abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_double abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* remaining items one at a time */
+        const npy_double b = (*data);
+        const npy_double c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_double
+}
+
+static void
+double_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double *data0 = (npy_double *)dataptr[0];
+    npy_double *data1 = (npy_double *)dataptr[1];
+    npy_double *data_out = (npy_double *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_double
+#if NPY_SIMD_F64
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f64;
+    /* data_out[k] = data0[k] * data1[k] + data_out[k]; the main loops handle four vectors per pass. */
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f64 a0 = npyv_loada_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_loada_f64(data1 + vstep * 0);
+            npyv_f64 c0 = npyv_loada_f64(data_out + vstep * 0);
+            
+#line 390
+            npyv_f64 a1 = npyv_loada_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_loada_f64(data1 + vstep * 1);
+            npyv_f64 c1 = npyv_loada_f64(data_out + vstep * 1);
+            
+#line 390
+            npyv_f64 a2 = npyv_loada_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_loada_f64(data1 + vstep * 2);
+            npyv_f64 c2 = npyv_loada_f64(data_out + vstep * 2);
+            
+#line 390
+            npyv_f64 a3 = npyv_loada_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_loada_f64(data1 + vstep * 3);
+            npyv_f64 c3 = npyv_loada_f64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f64 abc0 = npyv_muladd_f64(a0, b0, c0);
+            
+#line 397
+            npyv_f64 abc1 = npyv_muladd_f64(a1, b1, c1);
+            
+#line 397
+            npyv_f64 abc2 = npyv_muladd_f64(a2, b2, c2);
+            
+#line 397
+            npyv_f64 abc3 = npyv_muladd_f64(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_f64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f64 a0 = npyv_load_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_load_f64(data1 + vstep * 0);
+            npyv_f64 c0 = npyv_load_f64(data_out + vstep * 0);
+            
+#line 390
+            npyv_f64 a1 = npyv_load_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_load_f64(data1 + vstep * 1);
+            npyv_f64 c1 = npyv_load_f64(data_out + vstep * 1);
+            
+#line 390
+            npyv_f64 a2 = npyv_load_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_load_f64(data1 + vstep * 2);
+            npyv_f64 c2 = npyv_load_f64(data_out + vstep * 2);
+            
+#line 390
+            npyv_f64 a3 = npyv_load_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_load_f64(data1 + vstep * 3);
+            npyv_f64 c3 = npyv_load_f64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f64 abc0 = npyv_muladd_f64(a0, b0, c0);
+            
+#line 397
+            npyv_f64 abc1 = npyv_muladd_f64(a1, b1, c1);
+            
+#line 397
+            npyv_f64 abc2 = npyv_muladd_f64(a2, b2, c2);
+            
+#line 397
+            npyv_f64 abc3 = npyv_muladd_f64(a3, b3, c3);
+            
+            #line 402
+            npyv_store_f64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) { /* tail: fewer than 4*vstep items left */
+        npyv_f64 a = npyv_load_tillz_f64(data0, count);
+        npyv_f64 b = npyv_load_tillz_f64(data1, count);
+        npyv_f64 c = npyv_load_tillz_f64(data_out, count);
+        npyv_store_till_f64(data_out, count, npyv_muladd_f64(a, b, c)); /* NOTE(review): presumably stores only the first count lanes -- confirm */
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { /* four items per pass */
+        #line 420
+        const npy_double a0 = (data0[0]);
+        const npy_double b0 = (data1[0]);
+        const npy_double c0 = (data_out[0]);
+        
+#line 420
+        const npy_double a1 = (data0[1]);
+        const npy_double b1 = (data1[1]);
+        const npy_double c1 = (data_out[1]);
+        
+#line 420
+        const npy_double a2 = (data0[2]);
+        const npy_double b2 = (data1[2]);
+        const npy_double c2 = (data_out[2]);
+        
+#line 420
+        const npy_double a3 = (data0[3]);
+        const npy_double b3 = (data1[3]);
+        const npy_double c3 = (data_out[3]);
+        
+        #line 427
+        const npy_double abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_double abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_double abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_double abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* remaining items one at a time */
+        const npy_double a = (*data0);
+        const npy_double b = (*data1);
+        const npy_double c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_double
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand kernel: operand 0 is read once as a single npy_double, while
+ * operand 1 and the output are handed to the muladd helper as npy_double
+ * arrays of `count` elements.  The strides argument is not consulted.
+ */
+static void
+double_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_double factor = *(npy_double *)dataptr[0];
+    npy_double *const vec = (npy_double *)dataptr[1];
+    npy_double *const dst = (npy_double *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    /* All of the arithmetic lives in the shared helper. */
+    double_sum_of_products_muladd(vec, dst, factor, count);
+}
+
+/*
+ * Two-operand kernel: operand 1 is read once as a single npy_double, while
+ * operand 0 and the output are handed to the muladd helper as npy_double
+ * arrays of `count` elements.  The strides argument is not consulted.
+ */
+static void
+double_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_double factor = *(npy_double *)dataptr[1];
+    npy_double *const vec = (npy_double *)dataptr[0];
+    npy_double *const dst = (npy_double *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    /* All of the arithmetic lives in the shared helper. */
+    double_sum_of_products_muladd(vec, dst, factor, count);
+}
+
+/*
+ * Accumulates the sum over k of data0[k] * data1[k] for `count` npy_double
+ * elements and adds that total to the single value stored at dataptr[2].
+ *
+ * nop     - not referenced in this function.
+ * dataptr - [0] and [1] are the two inputs, indexed as dense npy_double
+ *           arrays; [2] points at the scalar that receives the result.
+ * strides - unused (wrapped in NPY_UNUSED).
+ * count   - number of elements to process.
+ *
+ * Which implementation is compiled depends on NPY_SIMD_F64 and, for the
+ * scalar fallback, on NPY_DISABLE_OPTIMIZATION.
+ */
+static NPY_GCC_OPT_3 void
+double_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double *data0 = (npy_double *)dataptr[0];
+    npy_double *data1 = (npy_double *)dataptr[1];
+    npy_double accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if NPY_SIMD_F64 // NPYV check for npy_double
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_f64;
+    npyv_f64 v_accum = npyv_zero_f64();
+
+    #line 495
+    /* Both inputs passed the alignment check: use the npyv_loada_f64 loads. */
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        /* Four vectors from each input are consumed per iteration. */
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f64 a0 = npyv_loada_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_loada_f64(data1 + vstep * 0);
+            
+#line 501
+            npyv_f64 a1 = npyv_loada_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_loada_f64(data1 + vstep * 1);
+            
+#line 501
+            npyv_f64 a2 = npyv_loada_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_loada_f64(data1 + vstep * 2);
+            
+#line 501
+            npyv_f64 a3 = npyv_loada_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_loada_f64(data1 + vstep * 3);
+            
+            /*
+             * The multiply-adds form a chain: pair 3 is folded into v_accum
+             * first, then pairs 2, 1 and 0, each taking the previous result
+             * as its addend.
+             */
+            npyv_f64 ab3 = npyv_muladd_f64(a3, b3, v_accum);
+            npyv_f64 ab2 = npyv_muladd_f64(a2, b2, ab3);
+            npyv_f64 ab1 = npyv_muladd_f64(a1, b1, ab2);
+                   v_accum = npyv_muladd_f64(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    /* Otherwise the same loop is run with the npyv_load_f64 loads. */
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f64 a0 = npyv_load_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_load_f64(data1 + vstep * 0);
+            
+#line 501
+            npyv_f64 a1 = npyv_load_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_load_f64(data1 + vstep * 1);
+            
+#line 501
+            npyv_f64 a2 = npyv_load_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_load_f64(data1 + vstep * 2);
+            
+#line 501
+            npyv_f64 a3 = npyv_load_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_load_f64(data1 + vstep * 3);
+            
+            /* Same accumulation chain as in the aligned branch. */
+            npyv_f64 ab3 = npyv_muladd_f64(a3, b3, v_accum);
+            npyv_f64 ab2 = npyv_muladd_f64(a2, b2, ab3);
+            npyv_f64 ab1 = npyv_muladd_f64(a1, b1, ab2);
+                   v_accum = npyv_muladd_f64(a0, b0, ab1);
+        }
+    }
+    
+    /*
+     * Remainder, one vector per iteration.  The remaining count is passed to
+     * npyv_load_tillz_f64; presumably it loads at most that many lanes and
+     * zero-fills the rest so the padding adds nothing to the sum -- confirm
+     * against the NPYV documentation.
+     */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data0, count);
+        npyv_f64 b = npyv_load_tillz_f64(data1, count);
+        v_accum = npyv_muladd_f64(a, b, v_accum);
+    }
+    /* Collapse the vector accumulator into the scalar total. */
+    accum = npyv_sum_f64(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Scalar fallback: four products are formed and summed per iteration. */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_double ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_double ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_double ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_double ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Whatever is left (or everything, when the loop above is compiled out). */
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_double a = (*data0);
+        const npy_double b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_double
+    /* The result is added to the existing output value, not stored over it. */
+    *(npy_double *)dataptr[2] = ((*(npy_double *)dataptr[2]) + accum);
+}
+
+/*
+ * Two-operand reduction into a single output value: operand 0 is read once
+ * as a scalar, operand 1 is summed over `count` elements by
+ * double_sum_of_arr, and scalar * sum is added to the value at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+double_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_double scale = *(npy_double *)dataptr[0];
+    const npy_double total = double_sum_of_arr((npy_double *)dataptr[1], count);
+    npy_double *const out = (npy_double *)dataptr[2];
+
+    *out = *out + scale * total;
+}
+
+/*
+ * Two-operand reduction into a single output value: operand 1 is read once
+ * as a scalar, operand 0 is summed over `count` elements by
+ * double_sum_of_arr, and scalar * sum is added to the value at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+double_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_double scale = *(npy_double *)dataptr[1];
+    const npy_double total = double_sum_of_arr((npy_double *)dataptr[0], count);
+    npy_double *const out = (npy_double *)dataptr[2];
+
+    *out = *out + scale * total;
+}
+
+#elif 1 == 3 && !0
+
+/*
+ * Three-operand kernel over dense npy_double arrays:
+ *     out[k] = in0[k] * in1[k] * in2[k] + out[k]
+ * for `count` elements, with the output taken from dataptr[3].
+ * The strides argument is not consulted.
+ */
+static void
+double_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double *in0 = (npy_double *)dataptr[0];
+    npy_double *in1 = (npy_double *)dataptr[1];
+    npy_double *in2 = (npy_double *)dataptr[2];
+    npy_double *out = (npy_double *)dataptr[3];
+    int k;
+
+    /* Whole groups of eight elements, visited in ascending index order. */
+    for (; count >= 8; count -= 8, in0 += 8, in1 += 8, in2 += 8, out += 8) {
+        for (k = 0; k < 8; ++k) {
+            out[k] = in0[k] * in1[k] * in2[k] + out[k];
+        }
+    }
+
+    /*
+     * Leftover elements: the budget is tested before every element and the
+     * function returns as soon as it reaches zero.
+     */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] = in0[k] * in1[k] * in2[k] + out[k];
+    }
+}
+
+#else /* 1 > 3 || @complex */
+
+/*
+ * Generic kernel for `nop` operands of npy_double: for each of `count`
+ * steps, multiplies the current element of every operand together, adds the
+ * product into the output slot dataptr[nop], then advances all nop + 1
+ * pointers in dataptr by sizeof(npy_double).  The pointers stored in
+ * dataptr are modified in place.  The strides argument is not consulted.
+ */
+static void
+double_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    int op;
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+
+    for (; count != 0; --count) {
+        npy_double product = *(npy_double *)dataptr[0];
+
+        for (op = 1; op < nop; ++op) {
+            product *= *(npy_double *)dataptr[op];
+        }
+        /*
+         * `op` keeps the value the loop above stopped at, which equals nop
+         * whenever nop >= 1; the addend is read through that index.
+         */
+        *(npy_double *)dataptr[nop] = product + *(npy_double *)dataptr[op];
+
+        for (op = 0; op <= nop; ++op) {
+            dataptr[op] += sizeof(npy_double);
+        }
+    }
+}
+
+#endif /* functions for various 1 */
+
+#if 1 == 1
+
+/*
+ * One-operand reduction into a single output value: `count` npy_double
+ * elements starting at dataptr[0] are summed by double_sum_of_arr and the
+ * total is added to the value at dataptr[1].  Neither nop nor strides is
+ * read.
+ */
+static NPY_GCC_OPT_3 void
+double_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    npy_double *const out = (npy_double *)dataptr[1];
+    npy_double total;
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+    total = double_sum_of_arr((npy_double *)dataptr[0], count);
+    *out = total + *out;
+}
+
+#endif /* 1 == 1 */
+
+/*
+ * One-operand strided reduction: reads `count` npy_double values starting
+ * at dataptr[0], stepping strides[0] bytes between reads, and adds their
+ * total to the single npy_double stored at dataptr[1].  nop is not read.
+ */
+static void
+double_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    const npy_intp step = strides[0];
+    char *cursor = dataptr[0];
+    npy_double total = 0;
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+
+    /* The cursor is a byte pointer, so the stride is applied unscaled. */
+    for (; count != 0; --count, cursor += step) {
+        total += *(npy_double *)cursor;
+    }
+
+    *(npy_double *)dataptr[1] = total + *(npy_double *)dataptr[1];
+}
+
+
+#line 131
+/*
+ * Two-operand strided kernel: for each of `count` steps performs
+ *     *out = (*in0) * (*in1) + (*out)
+ * on npy_double values, then moves the three byte pointers forward by
+ * strides[0], strides[1] and strides[2] respectively.  The output pointer
+ * comes from dataptr[2].  nop is not read.
+ */
+static void
+double_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    char *lhs = dataptr[0];
+    char *rhs = dataptr[1];
+    char *out = dataptr[2];
+    const npy_intp lhs_step = strides[0];
+    const npy_intp rhs_step = strides[1];
+    const npy_intp out_step = strides[2];
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_two (%d)\n", (int)count);
+
+    for (; count != 0; --count) {
+        npy_double *const dst = (npy_double *)out;
+
+        *dst = (*(npy_double *)lhs) * (*(npy_double *)rhs) + (*dst);
+
+        /* Byte pointers: the strides are applied unscaled. */
+        lhs += lhs_step;
+        rhs += rhs_step;
+        out += out_step;
+    }
+}
+
+#if 2 == 1
+
+/*
+ * One-operand kernel over dense npy_double arrays:
+ *     out[k] = in[k] + out[k]
+ * for `count` elements, with the input at dataptr[0] and the output at
+ * dataptr[1].  The strides argument is not consulted.
+ */
+static void
+double_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double *src = (npy_double *)dataptr[0];
+    npy_double *dst = (npy_double *)dataptr[1];
+    int k;
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+    for (;;) {
+        /*
+         * Counts of 0..7 are finished here, before the bulk loop, so that
+         * short inputs return quickly.  Each case deliberately falls
+         * through, handling indices from high to low.
+         */
+        switch (count) {
+            case 7:
+                dst[6] = src[6] + dst[6];
+                /* fall through */
+            case 6:
+                dst[5] = src[5] + dst[5];
+                /* fall through */
+            case 5:
+                dst[4] = src[4] + dst[4];
+                /* fall through */
+            case 4:
+                dst[3] = src[3] + dst[3];
+                /* fall through */
+            case 3:
+                dst[2] = src[2] + dst[2];
+                /* fall through */
+            case 2:
+                dst[1] = src[1] + dst[1];
+                /* fall through */
+            case 1:
+                dst[0] = src[0] + dst[0];
+                /* fall through */
+            case 0:
+                return;
+        }
+
+        /* Groups of eight; the switch above is then re-entered. */
+        while (count >= 8) {
+            count -= 8;
+            for (k = 0; k < 8; ++k) {
+                dst[k] = src[k] + dst[k];
+            }
+            src += 8;
+            dst += 8;
+        }
+    }
+}
+
+#elif 2 == 2 && !0
+
+/*
+ * Multiply-add helper: for `count` npy_double elements computes
+ *     data_out[k] = data[k] * scalar + data_out[k]
+ * storing the result back into data_out.
+ *
+ * data     - input array, read only.
+ * data_out - array that is both read (as the addend) and written.
+ * scalar   - factor applied to every element of data.
+ * count    - number of elements to process.
+ *
+ * Which implementation is compiled depends on NPY_SIMD_F64 and, for the
+ * scalar fallback, on NPY_DISABLE_OPTIMIZATION.
+ */
+static NPY_GCC_OPT_3 void
+double_sum_of_products_muladd(npy_double *data, npy_double *data_out, npy_double scalar, npy_intp count)
+{
+#if NPY_SIMD_F64 // NPYV check for npy_double
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f64;
+    /* The scalar is expanded into a vector once, outside the loops. */
+    const npyv_f64 v_scalar = npyv_setall_f64(scalar);
+    #line 306
+    /* Both arrays passed the alignment check: use loada/storea. */
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        /* Four vectors are loaded, combined and stored per iteration. */
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f64 b0 = npyv_loada_f64(data + vstep * 0);
+            npyv_f64 c0 = npyv_loada_f64(data_out + vstep * 0);
+            
+#line 312
+            npyv_f64 b1 = npyv_loada_f64(data + vstep * 1);
+            npyv_f64 c1 = npyv_loada_f64(data_out + vstep * 1);
+            
+#line 312
+            npyv_f64 b2 = npyv_loada_f64(data + vstep * 2);
+            npyv_f64 c2 = npyv_loada_f64(data_out + vstep * 2);
+            
+#line 312
+            npyv_f64 b3 = npyv_loada_f64(data + vstep * 3);
+            npyv_f64 c3 = npyv_loada_f64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f64 abc0 = npyv_muladd_f64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f64 abc1 = npyv_muladd_f64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f64 abc2 = npyv_muladd_f64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f64 abc3 = npyv_muladd_f64(v_scalar, b3, c3);
+            
+            /* All four loads precede the stores back into data_out. */
+            #line 323
+            npyv_storea_f64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    /* Otherwise the same loop is run with npyv_load_f64 / npyv_store_f64. */
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f64 b0 = npyv_load_f64(data + vstep * 0);
+            npyv_f64 c0 = npyv_load_f64(data_out + vstep * 0);
+            
+#line 312
+            npyv_f64 b1 = npyv_load_f64(data + vstep * 1);
+            npyv_f64 c1 = npyv_load_f64(data_out + vstep * 1);
+            
+#line 312
+            npyv_f64 b2 = npyv_load_f64(data + vstep * 2);
+            npyv_f64 c2 = npyv_load_f64(data_out + vstep * 2);
+            
+#line 312
+            npyv_f64 b3 = npyv_load_f64(data + vstep * 3);
+            npyv_f64 c3 = npyv_load_f64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f64 abc0 = npyv_muladd_f64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f64 abc1 = npyv_muladd_f64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f64 abc2 = npyv_muladd_f64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f64 abc3 = npyv_muladd_f64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_f64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    /*
+     * Remainder, one vector per iteration.  The remaining count is passed to
+     * the tillz load and the till store; presumably they touch at most that
+     * many lanes -- confirm against the NPYV documentation.  Note that the
+     * operands are passed as (data, scalar, data_out) here, versus
+     * (scalar, data, data_out) in the main loops.
+     */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data, count);
+        npyv_f64 b = npyv_load_tillz_f64(data_out, count);
+        npyv_store_till_f64(data_out, count, npyv_muladd_f64(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Scalar fallback: four elements are read, combined and written per pass. */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_double b0 = (data[0]);
+        const npy_double c0 = (data_out[0]);
+        
+#line 340
+        const npy_double b1 = (data[1]);
+        const npy_double c1 = (data_out[1]);
+        
+#line 340
+        const npy_double b2 = (data[2]);
+        const npy_double c2 = (data_out[2]);
+        
+#line 340
+        const npy_double b3 = (data[3]);
+        const npy_double c3 = (data_out[3]);
+        
+        #line 346
+        const npy_double abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_double abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_double abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_double abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Whatever is left (or everything, when the loop above is compiled out). */
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_double b = (*data);
+        const npy_double c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_double
+}
+/* Two-operand contiguous kernel: data_out[i] = data0[i] * data1[i] + data_out[i] for each i in [0, count). */
+static void
+double_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double *data0 = (npy_double *)dataptr[0];  /* first input array */
+    npy_double *data1 = (npy_double *)dataptr[1];  /* second input array */
+    npy_double *data_out = (npy_double *)dataptr[2];  /* output array: read, then overwritten */
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_double
+#if NPY_SIMD_F64
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f64;  /* element offset between consecutive vector loads */
+    /* The same 4x-unrolled loop appears twice: loada/storea when all three pointers pass EINSUM_IS_ALIGNED, load/store otherwise. */
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;  /* elements consumed per unrolled iteration */
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f64 a0 = npyv_loada_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_loada_f64(data1 + vstep * 0);
+            npyv_f64 c0 = npyv_loada_f64(data_out + vstep * 0);
+            
+#line 390
+            npyv_f64 a1 = npyv_loada_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_loada_f64(data1 + vstep * 1);
+            npyv_f64 c1 = npyv_loada_f64(data_out + vstep * 1);
+            
+#line 390
+            npyv_f64 a2 = npyv_loada_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_loada_f64(data1 + vstep * 2);
+            npyv_f64 c2 = npyv_loada_f64(data_out + vstep * 2);
+            
+#line 390
+            npyv_f64 a3 = npyv_loada_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_loada_f64(data1 + vstep * 3);
+            npyv_f64 c3 = npyv_loada_f64(data_out + vstep * 3);
+            /* abcN = npyv_muladd_f64(aN, bN, cN); presumably aN * bN + cN, matching the scalar fallback below */
+            #line 397
+            npyv_f64 abc0 = npyv_muladd_f64(a0, b0, c0);
+            
+#line 397
+            npyv_f64 abc1 = npyv_muladd_f64(a1, b1, c1);
+            
+#line 397
+            npyv_f64 abc2 = npyv_muladd_f64(a2, b2, c2);
+            
+#line 397
+            npyv_f64 abc3 = npyv_muladd_f64(a3, b3, c3);
+            /* store the four results over the output values that were just loaded */
+            #line 402
+            npyv_storea_f64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Unaligned twin of the loop above. */
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;  /* elements consumed per unrolled iteration */
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f64 a0 = npyv_load_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_load_f64(data1 + vstep * 0);
+            npyv_f64 c0 = npyv_load_f64(data_out + vstep * 0);
+            
+#line 390
+            npyv_f64 a1 = npyv_load_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_load_f64(data1 + vstep * 1);
+            npyv_f64 c1 = npyv_load_f64(data_out + vstep * 1);
+            
+#line 390
+            npyv_f64 a2 = npyv_load_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_load_f64(data1 + vstep * 2);
+            npyv_f64 c2 = npyv_load_f64(data_out + vstep * 2);
+            
+#line 390
+            npyv_f64 a3 = npyv_load_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_load_f64(data1 + vstep * 3);
+            npyv_f64 c3 = npyv_load_f64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f64 abc0 = npyv_muladd_f64(a0, b0, c0);
+            
+#line 397
+            npyv_f64 abc1 = npyv_muladd_f64(a1, b1, c1);
+            
+#line 397
+            npyv_f64 abc2 = npyv_muladd_f64(a2, b2, c2);
+            
+#line 397
+            npyv_f64 abc3 = npyv_muladd_f64(a3, b3, c3);
+            
+            #line 402
+            npyv_store_f64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Leftover elements (fewer than vstepx4): one vector per pass, with the remaining count handed to the tillz loads and till store. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data0, count);
+        npyv_f64 b = npyv_load_tillz_f64(data1, count);
+        npyv_f64 c = npyv_load_tillz_f64(data_out, count);
+        npyv_store_till_f64(data_out, count, npyv_muladd_f64(a, b, c));
+    }
+    npyv_cleanup();
+#else  /* NPY_SIMD_F64 not enabled: scalar implementation */
+#ifndef NPY_DISABLE_OPTIMIZATION  /* manual 4x unroll unless optimizations are disabled */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_double a0 = (data0[0]);
+        const npy_double b0 = (data1[0]);
+        const npy_double c0 = (data_out[0]);
+        
+#line 420
+        const npy_double a1 = (data0[1]);
+        const npy_double b1 = (data1[1]);
+        const npy_double c1 = (data_out[1]);
+        
+#line 420
+        const npy_double a2 = (data0[2]);
+        const npy_double b2 = (data1[2]);
+        const npy_double c2 = (data_out[2]);
+        
+#line 420
+        const npy_double a3 = (data0[3]);
+        const npy_double b3 = (data1[3]);
+        const npy_double c3 = (data_out[3]);
+        /* multiply-add on each of the four loaded triples */
+        #line 427
+        const npy_double abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_double abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_double abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_double abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {  /* remaining elements, one at a time */
+        const npy_double a = (*data0);
+        const npy_double b = (*data1);
+        const npy_double c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_double
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+double_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 0 contributes one value (read once); operand 1 and the output are arrays. */
+    npy_double *const src = (npy_double *)dataptr[1];
+    npy_double *const dst = (npy_double *)dataptr[2];
+    const npy_double factor = *(npy_double *)dataptr[0];
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    /* All element work is delegated to the shared muladd helper. */
+    double_sum_of_products_muladd(src, dst, factor, count);
+}
+
+static void
+double_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 1 contributes one value (read once); operand 0 and the output are arrays. */
+    npy_double *const src = (npy_double *)dataptr[0];
+    npy_double *const dst = (npy_double *)dataptr[2];
+    const npy_double factor = *(npy_double *)dataptr[1];
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    double_sum_of_products_muladd(src, dst, factor, count);
+}
+/* Two-operand kernel with a single output element: *dataptr[2] += sum over i in [0, count) of data0[i] * data1[i]. */
+static NPY_GCC_OPT_3 void
+double_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double *data0 = (npy_double *)dataptr[0];
+    npy_double *data1 = (npy_double *)dataptr[1];
+    npy_double accum = 0;  /* scalar total, added to the output on the last line */
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if NPY_SIMD_F64 // NPYV check for npy_double
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_f64;  /* element offset between consecutive vector loads */
+    npyv_f64 v_accum = npyv_zero_f64();  /* vector accumulator carried across all SIMD loops */
+    /* Aligned and unaligned branches share the same 4x-unrolled body; only the load flavor differs. */
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f64 a0 = npyv_loada_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_loada_f64(data1 + vstep * 0);
+            
+#line 501
+            npyv_f64 a1 = npyv_loada_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_loada_f64(data1 + vstep * 1);
+            
+#line 501
+            npyv_f64 a2 = npyv_loada_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_loada_f64(data1 + vstep * 2);
+            
+#line 501
+            npyv_f64 a3 = npyv_loada_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_loada_f64(data1 + vstep * 3);
+            /* Dependent chain v_accum -> ab3 -> ab2 -> ab1 -> v_accum: the products are folded in the order 3, 2, 1, 0. */
+            npyv_f64 ab3 = npyv_muladd_f64(a3, b3, v_accum);
+            npyv_f64 ab2 = npyv_muladd_f64(a2, b2, ab3);
+            npyv_f64 ab1 = npyv_muladd_f64(a1, b1, ab2);
+                   v_accum = npyv_muladd_f64(a0, b0, ab1);
+        }
+    }
+    /* Unaligned twin of the loop above. */
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f64 a0 = npyv_load_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_load_f64(data1 + vstep * 0);
+            
+#line 501
+            npyv_f64 a1 = npyv_load_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_load_f64(data1 + vstep * 1);
+            
+#line 501
+            npyv_f64 a2 = npyv_load_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_load_f64(data1 + vstep * 2);
+            
+#line 501
+            npyv_f64 a3 = npyv_load_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_load_f64(data1 + vstep * 3);
+            /* same dependent muladd chain as in the aligned loop */
+            npyv_f64 ab3 = npyv_muladd_f64(a3, b3, v_accum);
+            npyv_f64 ab2 = npyv_muladd_f64(a2, b2, ab3);
+            npyv_f64 ab1 = npyv_muladd_f64(a1, b1, ab2);
+                   v_accum = npyv_muladd_f64(a0, b0, ab1);
+        }
+    }
+    /* Leftover elements: tillz loads get the remaining count. NOTE(review): unused lanes are presumably zero-filled so they add nothing - confirm. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data0, count);
+        npyv_f64 b = npyv_load_tillz_f64(data1, count);
+        v_accum = npyv_muladd_f64(a, b, v_accum);
+    }
+    accum = npyv_sum_f64(v_accum);  /* collapse the vector accumulator into one scalar */
+    npyv_cleanup();
+#else  /* NPY_SIMD_F64 not enabled: scalar implementation */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_double ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_double ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_double ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_double ab3 = (data0[3]) * (data1[3]);
+        /* the four products are summed with each other first, then added to accum */
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) {  /* remaining elements, one at a time */
+        const npy_double a = (*data0);
+        const npy_double b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_double
+    *(npy_double *)dataptr[2] = ((*(npy_double *)dataptr[2]) + accum);
+}
+
+static NPY_GCC_OPT_3 void
+double_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Output element gains (single value of operand 0) * (double_sum_of_arr over operand 1). */
+    const npy_double factor = *(npy_double *)dataptr[0];
+    const npy_double total = double_sum_of_arr((npy_double *)dataptr[1], count);
+    *(npy_double *)dataptr[2] = *(npy_double *)dataptr[2] + factor * total;
+}
+
+static NPY_GCC_OPT_3 void
+double_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Output element gains (single value of operand 1) * (double_sum_of_arr over operand 0). */
+    const npy_double total = double_sum_of_arr((npy_double *)dataptr[0], count);
+    const npy_double factor = *(npy_double *)dataptr[1];
+    *(npy_double *)dataptr[2] = *(npy_double *)dataptr[2] + factor * total;
+}
+
+#elif 2 == 3 && !0
+/* Three-operand contiguous kernel: data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]. Sits under "#elif 2 == 3 && !0", which is constant-false in this expansion. */
+static void
+double_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double *data0 = (npy_double *)dataptr[0];
+    npy_double *data1 = (npy_double *)dataptr[1];
+    npy_double *data2 = (npy_double *)dataptr[2];
+    npy_double *data_out = (npy_double *)dataptr[3];  /* output array: read, then overwritten */
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+        /* eight independent element updates, indices 0..7 relative to the current pointers */
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    /* count is below 8 here; each step returns once count reaches 0, otherwise it updates one more element. */
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+    /* With a non-negative count at most seven elements remain, so this eighth check always returns and the store after it is not reached. */
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 2 > 3 || @complex */
+/* Generic fallback: per element, multiplies the nop inputs and adds the product into the output at dataptr[nop]; the dataptr[] entries are advanced in place. */
+static void
+double_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+    /* NOTE(review): this definition is in the #else arm of a conditional opened above this chunk; whether it is compiled depends on that condition. */
+    while (count--) {
+#if !0  /* constant-true: the real-valued body below is the one compiled */
+        npy_double temp = (*(npy_double *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_double *)dataptr[i]);
+        }
+        *(npy_double *)dataptr[nop] = (temp +
+                                           (*(npy_double *)dataptr[i]));  /* i == nop after the loop, so this reads the output */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_double);  /* step every pointer, output included, by one element */
+        }
+#else /* complex */
+#  if 2 <= 3
+#    define _SUMPROD_NOP 2
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_double re, im, tmp;
+        int i;
+        re = ((npy_double *)dataptr[0])[0];
+        im = ((npy_double *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_double *)dataptr[i])[0] -
+                  im * ((npy_double *)dataptr[i])[1];
+            im = re * ((npy_double *)dataptr[i])[1] +
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_double *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_double *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_double);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+/* One-operand reduction: adds the total of count elements of dataptr[0] into *dataptr[1]. Guarded by "#if 2 == 1", which is constant-false in this expansion. */
+static NPY_GCC_OPT_3 void
+double_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0  /* constant-true: real-valued body */
+    npy_double *data = (npy_double *)dataptr[0];
+    npy_double accum = double_sum_of_arr(data, count);  /* summation delegated to double_sum_of_arr (defined elsewhere) */
+    *((npy_double *)dataptr[1]) = (accum + (*((npy_double *)dataptr[1])));
+#else  /* complex body: interleaved (re, im) pairs, two npy_double per element */
+    npy_double accum_re = 0, accum_im = 0;
+    npy_double *data0 = (npy_double *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {  /* four complex elements per pass; exactly four left over go to the loop below */
+        const npy_double re01 = data0[0] + data0[2];
+        const npy_double re23 = data0[4] + data0[6];
+        const npy_double im13 = data0[1] + data0[3];
+        const npy_double im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_double *)dataptr[1])[0] += accum_re;
+    ((npy_double *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 2 == 1 */
+/* Strided two-operand kernel with a single output element: *dataptr[2] += sum over count steps of (*data0) * (*data1). */
+static void
+double_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0  /* complex accumulators: disabled in this expansion */
+    npy_double accum_re = 0, accum_im = 0;
+#else
+    npy_double accum = 0;
+#endif
+    /* Byte cursors and byte strides for the inputs; only the data0 and data1 blocks are enabled in this expansion. */
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0  /* false here: no third operand */
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+    /* Accumulate locally; the output element is touched once, after the loop. */
+    while (count--) {
+#if !0
+#  if 2 == 1
+        accum += (*(npy_double *)data0);
+        data0 += stride0;
+#  elif 2 == 2  /* branch compiled in this expansion */
+        accum += (*(npy_double *)data0) *
+                 (*(npy_double *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 2 == 3
+        accum += (*(npy_double *)data0) *
+                 (*(npy_double *)data1) *
+                 (*(npy_double *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_double temp = (*(npy_double *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_double *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        accum_re += ((npy_double *)data0)[0];
+        accum_im += ((npy_double *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_double re, im, tmp;
+        int i;
+        re = ((npy_double *)dataptr[0])[0];
+        im = ((npy_double *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_double *)dataptr[i])[0] -
+                  im * ((npy_double *)dataptr[i])[1];
+            im = re * ((npy_double *)dataptr[i])[1] +
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* Fold the local total into the output element; the "#else" / "2 <= 3" arm below (dataptr[2]) is the one compiled. */
+#if 0
+#  if 2 <= 3
+    ((npy_double *)dataptr[2])[0] += accum_re;
+    ((npy_double *)dataptr[2])[1] += accum_im;
+#  else
+    ((npy_double *)dataptr[nop])[0] += accum_re;
+    ((npy_double *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 2 <= 3
+    *((npy_double *)dataptr[2]) = (accum +
+                                    (*((npy_double *)dataptr[2])));
+#  else
+    *((npy_double *)dataptr[nop]) = (accum +
+                                    (*((npy_double *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+/* Strided three-operand kernel: per step, *data_out = (*data0) * (*data1) * (*data2) + (*data_out), then every cursor advances by its own stride. */
+#line 131
+static void
+double_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data_out = dataptr[3];
+    npy_intp stride_out = strides[3];
+#endif
+    /* All four guards above are true in this expansion; the cursors are char pointers, so the strides are applied in bytes. */
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_three (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 3 == 1
+        *(npy_double *)data_out = ((*(npy_double *)data0) +
+                                         (*(npy_double *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 3 == 2
+        *(npy_double *)data_out = ((*(npy_double *)data0) *
+                                         (*(npy_double *)data1) +
+                                         (*(npy_double *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 3 == 3  /* branch compiled in this expansion */
+        *(npy_double *)data_out = ((*(npy_double *)data0) *
+                                         (*(npy_double *)data1) *
+                                         (*(npy_double *)data2) +
+                                         (*(npy_double *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_double temp = (*(npy_double *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_double *)dataptr[i]);
+        }
+        *(npy_double *)dataptr[nop] = (temp +
+                                           (*(npy_double *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        ((npy_double *)data_out)[0] = ((npy_double *)data0)[0] +
+                                         ((npy_double *)data_out)[0];
+        ((npy_double *)data_out)[1] = ((npy_double *)data0)[1] +
+                                         ((npy_double *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_double re, im, tmp;
+        int i;
+        re = ((npy_double *)dataptr[0])[0];
+        im = ((npy_double *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_double *)dataptr[i])[0] -
+                  im * ((npy_double *)dataptr[i])[1];
+            im = re * ((npy_double *)dataptr[i])[1] +
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_double *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_double *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 3 == 1
+
+static void
+double_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double *data0 = (npy_double *)dataptr[0];
+    npy_double *data_out = (npy_double *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_double *)data_out + 2*6)[0] =
+                                    ((npy_double *)data0 + 2*6)[0] +
+                                    ((npy_double *)data_out + 2*6)[0];
+            ((npy_double *)data_out + 2*6)[1] =
+                                    ((npy_double *)data0 + 2*6)[1] +
+                                    ((npy_double *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_double *)data_out + 2*5)[0] =
+                                    ((npy_double *)data0 + 2*5)[0] +
+                                    ((npy_double *)data_out + 2*5)[0];
+            ((npy_double *)data_out + 2*5)[1] =
+                                    ((npy_double *)data0 + 2*5)[1] +
+                                    ((npy_double *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_double *)data_out + 2*4)[0] =
+                                    ((npy_double *)data0 + 2*4)[0] +
+                                    ((npy_double *)data_out + 2*4)[0];
+            ((npy_double *)data_out + 2*4)[1] =
+                                    ((npy_double *)data0 + 2*4)[1] +
+                                    ((npy_double *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_double *)data_out + 2*3)[0] =
+                                    ((npy_double *)data0 + 2*3)[0] +
+                                    ((npy_double *)data_out + 2*3)[0];
+            ((npy_double *)data_out + 2*3)[1] =
+                                    ((npy_double *)data0 + 2*3)[1] +
+                                    ((npy_double *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_double *)data_out + 2*2)[0] =
+                                    ((npy_double *)data0 + 2*2)[0] +
+                                    ((npy_double *)data_out + 2*2)[0];
+            ((npy_double *)data_out + 2*2)[1] =
+                                    ((npy_double *)data0 + 2*2)[1] +
+                                    ((npy_double *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_double *)data_out + 2*1)[0] =
+                                    ((npy_double *)data0 + 2*1)[0] +
+                                    ((npy_double *)data_out + 2*1)[0];
+            ((npy_double *)data_out + 2*1)[1] =
+                                    ((npy_double *)data0 + 2*1)[1] +
+                                    ((npy_double *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_double *)data_out + 2*0)[0] =
+                                    ((npy_double *)data0 + 2*0)[0] +
+                                    ((npy_double *)data_out + 2*0)[0];
+            ((npy_double *)data_out + 2*0)[1] =
+                                    ((npy_double *)data0 + 2*0)[1] +
+                                    ((npy_double *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_double *)data_out + 2*0)[0] =
+                                ((npy_double *)data0 + 2*0)[0] +
+                                ((npy_double *)data_out + 2*0)[0];
+        ((npy_double *)data_out + 2*0)[1] =
+                                ((npy_double *)data0 + 2*0)[1] +
+                                ((npy_double *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_double *)data_out + 2*1)[0] =
+                                ((npy_double *)data0 + 2*1)[0] +
+                                ((npy_double *)data_out + 2*1)[0];
+        ((npy_double *)data_out + 2*1)[1] =
+                                ((npy_double *)data0 + 2*1)[1] +
+                                ((npy_double *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_double *)data_out + 2*2)[0] =
+                                ((npy_double *)data0 + 2*2)[0] +
+                                ((npy_double *)data_out + 2*2)[0];
+        ((npy_double *)data_out + 2*2)[1] =
+                                ((npy_double *)data0 + 2*2)[1] +
+                                ((npy_double *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_double *)data_out + 2*3)[0] =
+                                ((npy_double *)data0 + 2*3)[0] +
+                                ((npy_double *)data_out + 2*3)[0];
+        ((npy_double *)data_out + 2*3)[1] =
+                                ((npy_double *)data0 + 2*3)[1] +
+                                ((npy_double *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_double *)data_out + 2*4)[0] =
+                                ((npy_double *)data0 + 2*4)[0] +
+                                ((npy_double *)data_out + 2*4)[0];
+        ((npy_double *)data_out + 2*4)[1] =
+                                ((npy_double *)data0 + 2*4)[1] +
+                                ((npy_double *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_double *)data_out + 2*5)[0] =
+                                ((npy_double *)data0 + 2*5)[0] +
+                                ((npy_double *)data_out + 2*5)[0];
+        ((npy_double *)data_out + 2*5)[1] =
+                                ((npy_double *)data0 + 2*5)[1] +
+                                ((npy_double *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_double *)data_out + 2*6)[0] =
+                                ((npy_double *)data0 + 2*6)[0] +
+                                ((npy_double *)data_out + 2*6)[0];
+        ((npy_double *)data_out + 2*6)[1] =
+                                ((npy_double *)data0 + 2*6)[1] +
+                                ((npy_double *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_double *)data_out + 2*7)[0] =
+                                ((npy_double *)data0 + 2*7)[0] +
+                                ((npy_double *)data_out + 2*7)[0];
+        ((npy_double *)data_out + 2*7)[1] =
+                                ((npy_double *)data0 + 2*7)[1] +
+                                ((npy_double *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 3 == 2 && !0
+
+// Multiply-add helper: data_out[i] = data[i] * scalar + data_out[i] for every i in [0, count).
+static NPY_GCC_OPT_3 void
+double_sum_of_products_muladd(npy_double *data, npy_double *data_out, npy_double scalar, npy_intp count)
+{
+#if NPY_SIMD_F64 // NPYV check for npy_double
+    /* Take the aligned load/store path only when both data and data_out pass EINSUM_IS_ALIGNED */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f64; /* element offset between consecutive vector loads/stores */
+    const npyv_f64 v_scalar = npyv_setall_f64(scalar); /* presumably scalar copied into every lane -- TODO confirm */
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4; /* elements consumed per unrolled iteration */
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f64 b0 = npyv_loada_f64(data + vstep * 0);
+            npyv_f64 c0 = npyv_loada_f64(data_out + vstep * 0);
+            
+#line 312
+            npyv_f64 b1 = npyv_loada_f64(data + vstep * 1);
+            npyv_f64 c1 = npyv_loada_f64(data_out + vstep * 1);
+            
+#line 312
+            npyv_f64 b2 = npyv_loada_f64(data + vstep * 2);
+            npyv_f64 c2 = npyv_loada_f64(data_out + vstep * 2);
+            
+#line 312
+            npyv_f64 b3 = npyv_loada_f64(data + vstep * 3);
+            npyv_f64 c3 = npyv_loada_f64(data_out + vstep * 3);
+            /* abcN = v_scalar * bN + cN, the vector counterpart of the scalar fallback further down */
+            #line 318
+            npyv_f64 abc0 = npyv_muladd_f64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f64 abc1 = npyv_muladd_f64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f64 abc2 = npyv_muladd_f64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f64 abc3 = npyv_muladd_f64(v_scalar, b3, c3);
+            /* Write the four results back over the data_out values that were just read */
+            #line 323
+            npyv_storea_f64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Same 4x-unrolled loop, but with the npyv_load/npyv_store variants instead of loada/storea */
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f64 b0 = npyv_load_f64(data + vstep * 0);
+            npyv_f64 c0 = npyv_load_f64(data_out + vstep * 0);
+            
+#line 312
+            npyv_f64 b1 = npyv_load_f64(data + vstep * 1);
+            npyv_f64 c1 = npyv_load_f64(data_out + vstep * 1);
+            
+#line 312
+            npyv_f64 b2 = npyv_load_f64(data + vstep * 2);
+            npyv_f64 c2 = npyv_load_f64(data_out + vstep * 2);
+            
+#line 312
+            npyv_f64 b3 = npyv_load_f64(data + vstep * 3);
+            npyv_f64 c3 = npyv_load_f64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f64 abc0 = npyv_muladd_f64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f64 abc1 = npyv_muladd_f64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f64 abc2 = npyv_muladd_f64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f64 abc3 = npyv_muladd_f64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_f64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Remainder (< vstepx4 elements), one vector per pass. NOTE(review): the *_till(z) calls presumably touch at most `count` lanes -- confirm */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data, count);
+        npyv_f64 b = npyv_load_tillz_f64(data_out, count);
+        npyv_store_till_f64(data_out, count, npyv_muladd_f64(a, v_scalar, b));
+    }
+    npyv_cleanup(); /* NOTE(review): opaque NPYV call made after the vector loops -- purpose not visible here */
+#else /* scalar fallback, compiled when NPY_SIMD_F64 is 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_double b0 = (data[0]);
+        const npy_double c0 = (data_out[0]);
+        
+#line 340
+        const npy_double b1 = (data[1]);
+        const npy_double c1 = (data_out[1]);
+        
+#line 340
+        const npy_double b2 = (data[2]);
+        const npy_double c2 = (data_out[2]);
+        
+#line 340
+        const npy_double b3 = (data[3]);
+        const npy_double c3 = (data_out[3]);
+        
+        #line 346
+        const npy_double abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_double abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_double abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_double abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* leftovers (or everything when the unroll is disabled) */
+        const npy_double b = (*data);
+        const npy_double c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_double
+}
+
+static void
+double_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Two-operand kernel over element-indexed arrays: data_out[i] = data0[i] * data1[i] + data_out[i], i in [0, count). */
+    npy_double *data0 = (npy_double *)dataptr[0];
+    npy_double *data1 = (npy_double *)dataptr[1];
+    npy_double *data_out = (npy_double *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // SIMD path below is compiled only when NPY_SIMD_F64 is nonzero (NPYV check for npy_double)
+#if NPY_SIMD_F64
+    /* Take the aligned load/store path only when all three pointers pass EINSUM_IS_ALIGNED */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f64; /* element offset between consecutive vector loads/stores */
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4; /* elements consumed per unrolled iteration */
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f64 a0 = npyv_loada_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_loada_f64(data1 + vstep * 0);
+            npyv_f64 c0 = npyv_loada_f64(data_out + vstep * 0);
+            
+#line 390
+            npyv_f64 a1 = npyv_loada_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_loada_f64(data1 + vstep * 1);
+            npyv_f64 c1 = npyv_loada_f64(data_out + vstep * 1);
+            
+#line 390
+            npyv_f64 a2 = npyv_loada_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_loada_f64(data1 + vstep * 2);
+            npyv_f64 c2 = npyv_loada_f64(data_out + vstep * 2);
+            
+#line 390
+            npyv_f64 a3 = npyv_loada_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_loada_f64(data1 + vstep * 3);
+            npyv_f64 c3 = npyv_loada_f64(data_out + vstep * 3);
+            /* abcN = aN * bN + cN, the vector counterpart of the scalar fallback further down */
+            #line 397
+            npyv_f64 abc0 = npyv_muladd_f64(a0, b0, c0);
+            
+#line 397
+            npyv_f64 abc1 = npyv_muladd_f64(a1, b1, c1);
+            
+#line 397
+            npyv_f64 abc2 = npyv_muladd_f64(a2, b2, c2);
+            
+#line 397
+            npyv_f64 abc3 = npyv_muladd_f64(a3, b3, c3);
+            /* Write the four results back over the data_out values that were just read */
+            #line 402
+            npyv_storea_f64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Same 4x-unrolled loop, but with the npyv_load/npyv_store variants instead of loada/storea */
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f64 a0 = npyv_load_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_load_f64(data1 + vstep * 0);
+            npyv_f64 c0 = npyv_load_f64(data_out + vstep * 0);
+            
+#line 390
+            npyv_f64 a1 = npyv_load_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_load_f64(data1 + vstep * 1);
+            npyv_f64 c1 = npyv_load_f64(data_out + vstep * 1);
+            
+#line 390
+            npyv_f64 a2 = npyv_load_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_load_f64(data1 + vstep * 2);
+            npyv_f64 c2 = npyv_load_f64(data_out + vstep * 2);
+            
+#line 390
+            npyv_f64 a3 = npyv_load_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_load_f64(data1 + vstep * 3);
+            npyv_f64 c3 = npyv_load_f64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f64 abc0 = npyv_muladd_f64(a0, b0, c0);
+            
+#line 397
+            npyv_f64 abc1 = npyv_muladd_f64(a1, b1, c1);
+            
+#line 397
+            npyv_f64 abc2 = npyv_muladd_f64(a2, b2, c2);
+            
+#line 397
+            npyv_f64 abc3 = npyv_muladd_f64(a3, b3, c3);
+            
+            #line 402
+            npyv_store_f64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Remainder (< vstepx4 elements), one vector per pass. NOTE(review): the *_till(z) calls presumably touch at most `count` lanes -- confirm */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data0, count);
+        npyv_f64 b = npyv_load_tillz_f64(data1, count);
+        npyv_f64 c = npyv_load_tillz_f64(data_out, count);
+        npyv_store_till_f64(data_out, count, npyv_muladd_f64(a, b, c));
+    }
+    npyv_cleanup(); /* NOTE(review): opaque NPYV call made after the vector loops -- purpose not visible here */
+#else /* scalar fallback, compiled when NPY_SIMD_F64 is 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_double a0 = (data0[0]);
+        const npy_double b0 = (data1[0]);
+        const npy_double c0 = (data_out[0]);
+        
+#line 420
+        const npy_double a1 = (data0[1]);
+        const npy_double b1 = (data1[1]);
+        const npy_double c1 = (data_out[1]);
+        
+#line 420
+        const npy_double a2 = (data0[2]);
+        const npy_double b2 = (data1[2]);
+        const npy_double c2 = (data_out[2]);
+        
+#line 420
+        const npy_double a3 = (data0[3]);
+        const npy_double b3 = (data1[3]);
+        const npy_double c3 = (data_out[3]);
+        
+        #line 427
+        const npy_double abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_double abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_double abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_double abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* leftovers (or everything when the unroll is disabled) */
+        const npy_double a = (*data0);
+        const npy_double b = (*data1);
+        const npy_double c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_double
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+double_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Operand 0 is read once as a scalar; the array work is delegated to double_sum_of_products_muladd. */
+    npy_double value0 = (*(npy_double *)dataptr[0]); /* single value taken from operand 0 */
+    npy_double *data1 = (npy_double *)dataptr[1];
+    npy_double *data_out = (npy_double *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    double_sum_of_products_muladd(data1, data_out, value0, count); /* data_out[i] = data1[i] * value0 + data_out[i] */
+    
+}
+
+static void
+double_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Operand 1 is read once as a scalar; the array work is delegated to double_sum_of_products_muladd. */
+    npy_double value1 = (*(npy_double *)dataptr[1]); /* single value taken from operand 1 */
+    npy_double *data0 = (npy_double *)dataptr[0];
+    npy_double *data_out = (npy_double *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    double_sum_of_products_muladd(data0, data_out, value1, count); /* data_out[i] = data0[i] * value1 + data_out[i] */
+}
+
+static NPY_GCC_OPT_3 void
+double_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Dot-product style reduction: adds the sum of data0[i] * data1[i], i in [0, count), to the single value at dataptr[2]. */
+    npy_double *data0 = (npy_double *)dataptr[0];
+    npy_double *data1 = (npy_double *)dataptr[1];
+    npy_double accum = 0; /* total of the products; folded into the output at the very end */
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if NPY_SIMD_F64 // NPYV check for npy_double
+    /* Take the aligned load path only when both inputs pass EINSUM_IS_ALIGNED */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_f64; /* element offset between consecutive vector loads */
+    npyv_f64 v_accum = npyv_zero_f64(); /* vector accumulator, reduced into accum after the loops */
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4; /* elements consumed per unrolled iteration */
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f64 a0 = npyv_loada_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_loada_f64(data1 + vstep * 0);
+            
+#line 501
+            npyv_f64 a1 = npyv_loada_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_loada_f64(data1 + vstep * 1);
+            
+#line 501
+            npyv_f64 a2 = npyv_loada_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_loada_f64(data1 + vstep * 2);
+            
+#line 501
+            npyv_f64 a3 = npyv_loada_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_loada_f64(data1 + vstep * 3);
+            /* Chain four multiply-adds through v_accum, feeding each result into the next (group 3 first, group 0 last) */
+            npyv_f64 ab3 = npyv_muladd_f64(a3, b3, v_accum);
+            npyv_f64 ab2 = npyv_muladd_f64(a2, b2, ab3);
+            npyv_f64 ab1 = npyv_muladd_f64(a1, b1, ab2);
+                   v_accum = npyv_muladd_f64(a0, b0, ab1);
+        }
+    }
+    /* Same 4x-unrolled loop, but with npyv_load instead of npyv_loada */
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f64 a0 = npyv_load_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_load_f64(data1 + vstep * 0);
+            
+#line 501
+            npyv_f64 a1 = npyv_load_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_load_f64(data1 + vstep * 1);
+            
+#line 501
+            npyv_f64 a2 = npyv_load_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_load_f64(data1 + vstep * 2);
+            
+#line 501
+            npyv_f64 a3 = npyv_load_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_load_f64(data1 + vstep * 3);
+            
+            npyv_f64 ab3 = npyv_muladd_f64(a3, b3, v_accum);
+            npyv_f64 ab2 = npyv_muladd_f64(a2, b2, ab3);
+            npyv_f64 ab1 = npyv_muladd_f64(a1, b1, ab2);
+                   v_accum = npyv_muladd_f64(a0, b0, ab1);
+        }
+    }
+    /* Remainder (< vstepx4 elements). NOTE(review): load_tillz presumably zero-fills lanes past `count`, so they add nothing -- confirm */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data0, count);
+        npyv_f64 b = npyv_load_tillz_f64(data1, count);
+        v_accum = npyv_muladd_f64(a, b, v_accum);
+    }
+    accum = npyv_sum_f64(v_accum); /* presumably the sum across all lanes of v_accum -- TODO confirm */
+    npyv_cleanup(); /* NOTE(review): opaque NPYV call made after the vector loops -- purpose not visible here */
+#else /* scalar fallback, compiled when NPY_SIMD_F64 is 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_double ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_double ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_double ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_double ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* leftovers (or everything when the unroll is disabled) */
+        const npy_double a = (*data0);
+        const npy_double b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_double
+    *(npy_double *)dataptr[2] = ((*(npy_double *)dataptr[2]) + accum); /* add the total to the existing output value */
+}
+
+static NPY_GCC_OPT_3 void
+double_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Scalar-times-reduction: *dataptr[2] += value0 * (result of double_sum_of_arr over operand 1). */
+    npy_double *data1 = (npy_double *)dataptr[1];
+    npy_double value0 = (*(npy_double *)dataptr[0]); /* single value taken from operand 0 */
+    npy_double accum = double_sum_of_arr(data1, count); /* presumably the sum of count elements; helper defined outside this chunk -- confirm */
+    *(npy_double *)dataptr[2] = ((*(npy_double *)dataptr[2]) + value0 * accum); /* scale once, after reducing */
+}
+
+static NPY_GCC_OPT_3 void
+double_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Scalar-times-reduction: *dataptr[2] += value1 * (result of double_sum_of_arr over operand 0). */
+    npy_double *data0 = (npy_double *)dataptr[0];
+    npy_double value1 = (*(npy_double *)dataptr[1]); /* single value taken from operand 1 */
+    npy_double accum = double_sum_of_arr(data0, count); /* presumably the sum of count elements; helper defined outside this chunk -- confirm */
+    *(npy_double *)dataptr[2] = ((*(npy_double *)dataptr[2]) + value1 * accum); /* scale once, after reducing */
+}
+
+#elif 3 == 3 && !0
+
+static void
+double_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Three-operand kernel: data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i], i in [0, count). */
+    npy_double *data0 = (npy_double *)dataptr[0];
+    npy_double *data1 = (npy_double *)dataptr[1];
+    npy_double *data2 = (npy_double *)dataptr[2];
+    npy_double *data_out = (npy_double *)dataptr[3];
+
+    /* Main loop: 8 elements per pass, written out explicitly for offsets 0..7 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Remainder: count < 8 here; handle offsets 0,1,2,... in turn and return as soon as count reaches 0 */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) { /* for a non-negative entry count, count is always 0 when this check is reached */
+        return;
+    }
+    data_out[7] = ((data0[7]) * /* so this last update only runs if count was negative on entry */
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 3 > 3 || @complex */
+
+static void
+double_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Generic nop-driven form; it sits in the #else arm after "#elif 3 == 3 && !0", so it is preprocessed out here. */
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_three (%d)\n",
+                                                    (int)count);
+
+    while (count--) { /* one output element per pass */
+#if !0 /* real-valued arm */
+        npy_double temp = (*(npy_double *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) { /* temp becomes the product of all nop inputs */
+            temp *= (*(npy_double *)dataptr[i]);
+        }
+        *(npy_double *)dataptr[nop] = (temp +
+                                           (*(npy_double *)dataptr[i])); /* i == nop after the loop: the current output value */
+        for (i = 0; i <= nop; ++i) { /* step every input and the output forward; this mutates the caller's dataptr[] */
+            dataptr[i] += sizeof(npy_double);
+        }
+#else /* complex */
+#  if 3 <= 3
+#    define _SUMPROD_NOP 3
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_double re, im, tmp;
+        int i;
+        re = ((npy_double *)dataptr[0])[0];
+        im = ((npy_double *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) { /* complex multiply: (re + i*im) *= operand i */
+            tmp = re * ((npy_double *)dataptr[i])[0] -
+                  im * ((npy_double *)dataptr[i])[1];
+            im = re * ((npy_double *)dataptr[i])[1] +
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_double *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_double *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_double);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 3 */
+
+#if 3 == 1
+
+static NPY_GCC_OPT_3 void
+double_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{ /* Single-operand reduction into *dataptr[1]; enclosed in "#if 3 == 1", so it is preprocessed out of this expansion. */
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0 /* real-valued arm */
+    npy_double *data = (npy_double *)dataptr[0];
+    npy_double accum = double_sum_of_arr(data, count); /* presumably the sum of count elements; helper defined outside this chunk -- confirm */
+    *((npy_double *)dataptr[1]) = (accum + (*((npy_double *)dataptr[1])));
+#else /* complex arm: data0 is read as interleaved (re, im) pairs */
+    npy_double accum_re = 0, accum_im = 0;
+    npy_double *data0 = (npy_double *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) { /* 4 pairs (8 doubles) per pass; strict '>' leaves 1..4 pairs for the loop below */
+        const npy_double re01 = data0[0] + data0[2];
+        const npy_double re23 = data0[4] + data0[6];
+        const npy_double im13 = data0[1] + data0[3];
+        const npy_double im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) { /* remaining pairs, one at a time */
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_double *)dataptr[1])[0] += accum_re;
+    ((npy_double *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 3 == 1 */
+
+static void
+double_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{ /* Strided three-operand reduction: adds the sum over count steps of (*data0) * (*data1) * (*data2) to *dataptr[3]. */
+#if 0 /* complex accumulators: disabled in this expansion */
+    npy_double accum_re = 0, accum_im = 0;
+#else
+    npy_double accum = 0; /* running total of the per-step products */
+#endif
+
+#if (3 == 1) || (3 <= 3 && !0)
+    char *data0 = dataptr[0]; /* char* cursor, so stride0 is applied as a byte offset */
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+
+    while (count--) { /* one product per pass; the output pointer is never advanced */
+#if !0
+#  if 3 == 1
+        accum += (*(npy_double *)data0);
+        data0 += stride0;
+#  elif 3 == 2
+        accum += (*(npy_double *)data0) *
+                 (*(npy_double *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 3 == 3 /* the arm that is compiled for this expansion */
+        accum += (*(npy_double *)data0) *
+                 (*(npy_double *)data1) *
+                 (*(npy_double *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_double temp = (*(npy_double *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_double *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        accum_re += ((npy_double *)data0)[0];
+        accum_im += ((npy_double *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_double re, im, tmp;
+        int i;
+        re = ((npy_double *)dataptr[0])[0];
+        im = ((npy_double *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_double *)dataptr[i])[0] -
+                  im * ((npy_double *)dataptr[i])[1];
+            im = re * ((npy_double *)dataptr[i])[1] +
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0
+#  if 3 <= 3
+    ((npy_double *)dataptr[3])[0] += accum_re;
+    ((npy_double *)dataptr[3])[1] += accum_im;
+#  else
+    ((npy_double *)dataptr[nop])[0] += accum_re;
+    ((npy_double *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 3 <= 3 /* compiled arm: the output is operand slot 3 */
+    *((npy_double *)dataptr[3]) = (accum +
+                                    (*((npy_double *)dataptr[3])));
+#  else
+    *((npy_double *)dataptr[nop]) = (accum +
+                                    (*((npy_double *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+double_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{ /* Generic strided kernel for any operand count: per step, dataptr[nop] += product of dataptr[0..nop-1]. */
+#if (1000 == 1) || (1000 <= 3 && !0) /* false: the literal 1000 matches none of the 1/2/3 specialisations */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data_out = dataptr[1000];
+    npy_intp stride_out = strides[1000];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_any (%d)\n", (int)count);
+
+    while (count--) { /* one output element per pass */
+#if !0
+#  if 1000 == 1
+        *(npy_double *)data_out = ((*(npy_double *)data0) +
+                                         (*(npy_double *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1000 == 2
+        *(npy_double *)data_out = ((*(npy_double *)data0) *
+                                         (*(npy_double *)data1) +
+                                         (*(npy_double *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1000 == 3
+        *(npy_double *)data_out = ((*(npy_double *)data0) *
+                                         (*(npy_double *)data1) *
+                                         (*(npy_double *)data2) +
+                                         (*(npy_double *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else /* the only arm compiled here: loop over the runtime operand count nop */
+        npy_double temp = (*(npy_double *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) { /* temp becomes the product of all nop inputs */
+            temp *= (*(npy_double *)dataptr[i]);
+        }
+        *(npy_double *)dataptr[nop] = (temp +
+                                           (*(npy_double *)dataptr[i])); /* i == nop after the loop: the current output value */
+        for (i = 0; i <= nop; ++i) { /* advance inputs and output by their byte strides; this mutates the caller's dataptr[] */
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        ((npy_double *)data_out)[0] = ((npy_double *)data0)[0] +
+                                         ((npy_double *)data_out)[0];
+        ((npy_double *)data_out)[1] = ((npy_double *)data0)[1] +
+                                         ((npy_double *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_double re, im, tmp;
+        int i;
+        re = ((npy_double *)dataptr[0])[0];
+        im = ((npy_double *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_double *)dataptr[i])[0] -
+                  im * ((npy_double *)dataptr[i])[1];
+            im = re * ((npy_double *)dataptr[i])[1] +
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_double *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_double *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1000 == 1
+
+/*
+ * Contiguous single-operand kernel over npy_double:
+ *     out[i] = in[i] + out[i]
+ * where `in` is dataptr[0] and `out` is dataptr[1], for `count`
+ * consecutive elements.  `nop` is not used here.
+ */
+static void
+double_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double *src = (npy_double *)dataptr[0];
+    npy_double *dst = (npy_double *)dataptr[1];
+    npy_intp k;
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+    for (;;) {
+        /*
+         * A remainder of 0..7 elements is tested ahead of the bulk loop so
+         * that small counts finish quickly.  The remainder is walked from
+         * its highest index down to index 0.
+         */
+        if (count >= 0 && count <= 7) {
+            for (k = count - 1; k >= 0; --k) {
+                dst[k] = ((src[k]) + (dst[k]));
+            }
+            return;
+        }
+
+        /* Bulk part: eight elements per pass, lowest index first */
+        while (count >= 8) {
+            count -= 8;
+            for (k = 0; k < 8; ++k) {
+                dst[k] = ((src[k]) + (dst[k]));
+            }
+            src += 8;
+            dst += 8;
+        }
+    }
+}
+
+#elif 1000 == 2 && !0
+
+/*
+ * Scale-and-accumulate over `count` contiguous npy_double elements:
+ *     data_out[i] = scalar * data[i] + data_out[i]
+ */
+static NPY_GCC_OPT_3 void
+double_sum_of_products_muladd(npy_double *data, npy_double *data_out, npy_double scalar, npy_intp count)
+{
+#if NPY_SIMD_F64 // NPYV path for npy_double
+    const int lanes = npyv_nlanes_f64;
+    const npy_intp chunk = lanes * 4;
+    const npyv_f64 v_scale = npyv_setall_f64(scalar);
+
+    /*
+     * Four vectors per pass.  The aligned load/store variants are chosen
+     * only when EINSUM_IS_ALIGNED holds for both pointers.  Within a pass
+     * every load is issued before the first store.
+     */
+    if (EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out)) {
+        while (count >= chunk) {
+            npyv_f64 in0 = npyv_loada_f64(data);
+            npyv_f64 acc0 = npyv_loada_f64(data_out);
+            npyv_f64 in1 = npyv_loada_f64(data + lanes);
+            npyv_f64 acc1 = npyv_loada_f64(data_out + lanes);
+            npyv_f64 in2 = npyv_loada_f64(data + lanes * 2);
+            npyv_f64 acc2 = npyv_loada_f64(data_out + lanes * 2);
+            npyv_f64 in3 = npyv_loada_f64(data + lanes * 3);
+            npyv_f64 acc3 = npyv_loada_f64(data_out + lanes * 3);
+
+            acc0 = npyv_muladd_f64(v_scale, in0, acc0);
+            acc1 = npyv_muladd_f64(v_scale, in1, acc1);
+            acc2 = npyv_muladd_f64(v_scale, in2, acc2);
+            acc3 = npyv_muladd_f64(v_scale, in3, acc3);
+
+            npyv_storea_f64(data_out, acc0);
+            npyv_storea_f64(data_out + lanes, acc1);
+            npyv_storea_f64(data_out + lanes * 2, acc2);
+            npyv_storea_f64(data_out + lanes * 3, acc3);
+
+            count -= chunk;
+            data += chunk;
+            data_out += chunk;
+        }
+    }
+    else {
+        while (count >= chunk) {
+            npyv_f64 in0 = npyv_load_f64(data);
+            npyv_f64 acc0 = npyv_load_f64(data_out);
+            npyv_f64 in1 = npyv_load_f64(data + lanes);
+            npyv_f64 acc1 = npyv_load_f64(data_out + lanes);
+            npyv_f64 in2 = npyv_load_f64(data + lanes * 2);
+            npyv_f64 acc2 = npyv_load_f64(data_out + lanes * 2);
+            npyv_f64 in3 = npyv_load_f64(data + lanes * 3);
+            npyv_f64 acc3 = npyv_load_f64(data_out + lanes * 3);
+
+            acc0 = npyv_muladd_f64(v_scale, in0, acc0);
+            acc1 = npyv_muladd_f64(v_scale, in1, acc1);
+            acc2 = npyv_muladd_f64(v_scale, in2, acc2);
+            acc3 = npyv_muladd_f64(v_scale, in3, acc3);
+
+            npyv_store_f64(data_out, acc0);
+            npyv_store_f64(data_out + lanes, acc1);
+            npyv_store_f64(data_out + lanes * 2, acc2);
+            npyv_store_f64(data_out + lanes * 3, acc3);
+
+            count -= chunk;
+            data += chunk;
+            data_out += chunk;
+        }
+    }
+
+    /* Whatever is left goes through the count-limited load/store helpers */
+    while (count > 0) {
+        npyv_f64 in = npyv_load_tillz_f64(data, count);
+        npyv_f64 acc = npyv_load_tillz_f64(data_out, count);
+        npyv_store_till_f64(data_out, count, npyv_muladd_f64(in, v_scale, acc));
+
+        count -= lanes;
+        data += lanes;
+        data_out += lanes;
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four elements per pass; all eight reads happen before any write */
+    while (count >= 4) {
+        npy_double in[4];
+        npy_double acc[4];
+        int k;
+
+        for (k = 0; k < 4; ++k) {
+            in[k] = (data[k]);
+            acc[k] = (data_out[k]);
+        }
+        for (k = 0; k < 4; ++k) {
+            data_out[k] = scalar * in[k] + acc[k];
+        }
+
+        count -= 4;
+        data += 4;
+        data_out += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One element at a time for the remainder */
+    while (count > 0) {
+        const npy_double in = (*data);
+        const npy_double acc = (*data_out);
+        *data_out = (scalar * in + acc);
+
+        --count;
+        ++data;
+        ++data_out;
+    }
+#endif // NPYV path for npy_double
+}
+
+/*
+ * Contiguous two-operand kernel over npy_double:
+ *     out[i] = a[i] * b[i] + out[i]
+ * with a = dataptr[0], b = dataptr[1] and out = dataptr[2], for `count`
+ * consecutive elements.  `nop` is not used here.
+ */
+static void
+double_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double *lhs = (npy_double *)dataptr[0];
+    npy_double *rhs = (npy_double *)dataptr[1];
+    npy_double *out = (npy_double *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+#if NPY_SIMD_F64 // NPYV path for npy_double
+    const int lanes = npyv_nlanes_f64;
+    const npy_intp chunk = lanes * 4;
+
+    /*
+     * Four vectors per pass.  The aligned load/store variants are chosen
+     * only when EINSUM_IS_ALIGNED holds for all three pointers.  Within a
+     * pass every load is issued before the first store.
+     */
+    if (EINSUM_IS_ALIGNED(lhs) && EINSUM_IS_ALIGNED(rhs) &&
+                        EINSUM_IS_ALIGNED(out)) {
+        while (count >= chunk) {
+            npyv_f64 l0 = npyv_loada_f64(lhs);
+            npyv_f64 r0 = npyv_loada_f64(rhs);
+            npyv_f64 s0 = npyv_loada_f64(out);
+            npyv_f64 l1 = npyv_loada_f64(lhs + lanes);
+            npyv_f64 r1 = npyv_loada_f64(rhs + lanes);
+            npyv_f64 s1 = npyv_loada_f64(out + lanes);
+            npyv_f64 l2 = npyv_loada_f64(lhs + lanes * 2);
+            npyv_f64 r2 = npyv_loada_f64(rhs + lanes * 2);
+            npyv_f64 s2 = npyv_loada_f64(out + lanes * 2);
+            npyv_f64 l3 = npyv_loada_f64(lhs + lanes * 3);
+            npyv_f64 r3 = npyv_loada_f64(rhs + lanes * 3);
+            npyv_f64 s3 = npyv_loada_f64(out + lanes * 3);
+
+            s0 = npyv_muladd_f64(l0, r0, s0);
+            s1 = npyv_muladd_f64(l1, r1, s1);
+            s2 = npyv_muladd_f64(l2, r2, s2);
+            s3 = npyv_muladd_f64(l3, r3, s3);
+
+            npyv_storea_f64(out, s0);
+            npyv_storea_f64(out + lanes, s1);
+            npyv_storea_f64(out + lanes * 2, s2);
+            npyv_storea_f64(out + lanes * 3, s3);
+
+            count -= chunk;
+            lhs += chunk;
+            rhs += chunk;
+            out += chunk;
+        }
+    }
+    else {
+        while (count >= chunk) {
+            npyv_f64 l0 = npyv_load_f64(lhs);
+            npyv_f64 r0 = npyv_load_f64(rhs);
+            npyv_f64 s0 = npyv_load_f64(out);
+            npyv_f64 l1 = npyv_load_f64(lhs + lanes);
+            npyv_f64 r1 = npyv_load_f64(rhs + lanes);
+            npyv_f64 s1 = npyv_load_f64(out + lanes);
+            npyv_f64 l2 = npyv_load_f64(lhs + lanes * 2);
+            npyv_f64 r2 = npyv_load_f64(rhs + lanes * 2);
+            npyv_f64 s2 = npyv_load_f64(out + lanes * 2);
+            npyv_f64 l3 = npyv_load_f64(lhs + lanes * 3);
+            npyv_f64 r3 = npyv_load_f64(rhs + lanes * 3);
+            npyv_f64 s3 = npyv_load_f64(out + lanes * 3);
+
+            s0 = npyv_muladd_f64(l0, r0, s0);
+            s1 = npyv_muladd_f64(l1, r1, s1);
+            s2 = npyv_muladd_f64(l2, r2, s2);
+            s3 = npyv_muladd_f64(l3, r3, s3);
+
+            npyv_store_f64(out, s0);
+            npyv_store_f64(out + lanes, s1);
+            npyv_store_f64(out + lanes * 2, s2);
+            npyv_store_f64(out + lanes * 3, s3);
+
+            count -= chunk;
+            lhs += chunk;
+            rhs += chunk;
+            out += chunk;
+        }
+    }
+
+    /* Whatever is left goes through the count-limited load/store helpers */
+    while (count > 0) {
+        npyv_f64 l = npyv_load_tillz_f64(lhs, count);
+        npyv_f64 r = npyv_load_tillz_f64(rhs, count);
+        npyv_f64 s = npyv_load_tillz_f64(out, count);
+        npyv_store_till_f64(out, count, npyv_muladd_f64(l, r, s));
+
+        count -= lanes;
+        lhs += lanes;
+        rhs += lanes;
+        out += lanes;
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four elements per pass; all twelve reads happen before any write */
+    while (count >= 4) {
+        npy_double l[4];
+        npy_double r[4];
+        npy_double s[4];
+        int k;
+
+        for (k = 0; k < 4; ++k) {
+            l[k] = (lhs[k]);
+            r[k] = (rhs[k]);
+            s[k] = (out[k]);
+        }
+        for (k = 0; k < 4; ++k) {
+            out[k] = l[k] * r[k] + s[k];
+        }
+
+        count -= 4;
+        lhs += 4;
+        rhs += 4;
+        out += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One element at a time for the remainder */
+    while (count > 0) {
+        const npy_double l = (*lhs);
+        const npy_double r = (*rhs);
+        const npy_double s = (*out);
+        *out = (l * r + s);
+
+        --count;
+        ++lhs;
+        ++rhs;
+        ++out;
+    }
+#endif // NPYV path for npy_double
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand case where operand 0 is a single value: reads that value
+ * once from dataptr[0] and hands the contiguous operand 1 and the
+ * contiguous output (dataptr[2]) to double_sum_of_products_muladd.
+ */
+static void
+double_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_double scale = (*(npy_double *)dataptr[0]);
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    double_sum_of_products_muladd((npy_double *)dataptr[1],
+                                  (npy_double *)dataptr[2],
+                                  scale, count);
+}
+
+/*
+ * Two-operand case where operand 1 is a single value: reads that value
+ * once from dataptr[1] and hands the contiguous operand 0 and the
+ * contiguous output (dataptr[2]) to double_sum_of_products_muladd.
+ */
+static void
+double_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_double scale = (*(npy_double *)dataptr[1]);
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    double_sum_of_products_muladd((npy_double *)dataptr[0],
+                                  (npy_double *)dataptr[2],
+                                  scale, count);
+}
+
+/*
+ * Two contiguous operands reduced into a single output element:
+ *     *out = *out + sum_i(a[i] * b[i])
+ * with a = dataptr[0], b = dataptr[1] and out = dataptr[2].  The sum is
+ * built in a local accumulator and added to the output once at the end.
+ */
+static NPY_GCC_OPT_3 void
+double_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double *lhs = (npy_double *)dataptr[0];
+    npy_double *rhs = (npy_double *)dataptr[1];
+    npy_double *out = (npy_double *)dataptr[2];
+    npy_double total = 0;
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if NPY_SIMD_F64 // NPYV path for npy_double
+    const int lanes = npyv_nlanes_f64;
+    const npy_intp chunk = lanes * 4;
+    npyv_f64 v_total = npyv_zero_f64();
+
+    /*
+     * Four vectors per pass, folded into the running vector sum starting
+     * with the fourth pair and ending with the first.  The aligned loads
+     * are chosen only when EINSUM_IS_ALIGNED holds for both inputs.
+     */
+    if (EINSUM_IS_ALIGNED(lhs) && EINSUM_IS_ALIGNED(rhs)) {
+        while (count >= chunk) {
+            npyv_f64 l0 = npyv_loada_f64(lhs);
+            npyv_f64 r0 = npyv_loada_f64(rhs);
+            npyv_f64 l1 = npyv_loada_f64(lhs + lanes);
+            npyv_f64 r1 = npyv_loada_f64(rhs + lanes);
+            npyv_f64 l2 = npyv_loada_f64(lhs + lanes * 2);
+            npyv_f64 r2 = npyv_loada_f64(rhs + lanes * 2);
+            npyv_f64 l3 = npyv_loada_f64(lhs + lanes * 3);
+            npyv_f64 r3 = npyv_loada_f64(rhs + lanes * 3);
+
+            v_total = npyv_muladd_f64(l3, r3, v_total);
+            v_total = npyv_muladd_f64(l2, r2, v_total);
+            v_total = npyv_muladd_f64(l1, r1, v_total);
+            v_total = npyv_muladd_f64(l0, r0, v_total);
+
+            count -= chunk;
+            lhs += chunk;
+            rhs += chunk;
+        }
+    }
+    else {
+        while (count >= chunk) {
+            npyv_f64 l0 = npyv_load_f64(lhs);
+            npyv_f64 r0 = npyv_load_f64(rhs);
+            npyv_f64 l1 = npyv_load_f64(lhs + lanes);
+            npyv_f64 r1 = npyv_load_f64(rhs + lanes);
+            npyv_f64 l2 = npyv_load_f64(lhs + lanes * 2);
+            npyv_f64 r2 = npyv_load_f64(rhs + lanes * 2);
+            npyv_f64 l3 = npyv_load_f64(lhs + lanes * 3);
+            npyv_f64 r3 = npyv_load_f64(rhs + lanes * 3);
+
+            v_total = npyv_muladd_f64(l3, r3, v_total);
+            v_total = npyv_muladd_f64(l2, r2, v_total);
+            v_total = npyv_muladd_f64(l1, r1, v_total);
+            v_total = npyv_muladd_f64(l0, r0, v_total);
+
+            count -= chunk;
+            lhs += chunk;
+            rhs += chunk;
+        }
+    }
+
+    /* Whatever is left goes through the count-limited load helper */
+    while (count > 0) {
+        npyv_f64 l = npyv_load_tillz_f64(lhs, count);
+        npyv_f64 r = npyv_load_tillz_f64(rhs, count);
+        v_total = npyv_muladd_f64(l, r, v_total);
+
+        count -= lanes;
+        lhs += lanes;
+        rhs += lanes;
+    }
+    total = npyv_sum_f64(v_total);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four products per pass, summed left to right, then accumulated */
+    while (count >= 4) {
+        const npy_double p0 = (lhs[0]) * (rhs[0]);
+        const npy_double p1 = (lhs[1]) * (rhs[1]);
+        const npy_double p2 = (lhs[2]) * (rhs[2]);
+        const npy_double p3 = (lhs[3]) * (rhs[3]);
+
+        total += p0 + p1 + p2 + p3;
+
+        count -= 4;
+        lhs += 4;
+        rhs += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* One product at a time for the remainder */
+    while (count > 0) {
+        const npy_double l = (*lhs);
+        const npy_double r = (*rhs);
+        total += l * r;
+
+        --count;
+        ++lhs;
+        ++rhs;
+    }
+#endif // NPYV path for npy_double
+    *out = ((*out) + total);
+}
+
+/*
+ * Operand 0 is a single value, operand 1 is contiguous and the output is
+ * a single element: the sum of operand 1 (via double_sum_of_arr) is
+ * multiplied by the operand-0 value and added to *dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+double_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double *out = (npy_double *)dataptr[2];
+    const npy_double scale = (*(npy_double *)dataptr[0]);
+    const npy_double total = double_sum_of_arr((npy_double *)dataptr[1], count);
+
+    *out = ((*out) + scale * total);
+}
+
+/*
+ * Operand 0 is contiguous, operand 1 is a single value and the output is
+ * a single element: the sum of operand 0 (via double_sum_of_arr) is
+ * multiplied by the operand-1 value and added to *dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+double_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double *out = (npy_double *)dataptr[2];
+    const npy_double scale = (*(npy_double *)dataptr[1]);
+    const npy_double total = double_sum_of_arr((npy_double *)dataptr[0], count);
+
+    *out = ((*out) + scale * total);
+}
+
+#elif 1000 == 3 && !0
+
+/*
+ * Contiguous three-operand kernel over npy_double:
+ *     out[i] = a[i] * b[i] * c[i] + out[i]
+ * with a, b, c = dataptr[0..2] and out = dataptr[3], for `count`
+ * consecutive elements.  `nop` is not used here.
+ */
+static void
+double_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double *a = (npy_double *)dataptr[0];
+    npy_double *b = (npy_double *)dataptr[1];
+    npy_double *c = (npy_double *)dataptr[2];
+    npy_double *out = (npy_double *)dataptr[3];
+    int k;
+
+    /* Bulk part: eight elements per pass, lowest index first */
+    while (count >= 8) {
+        count -= 8;
+
+        for (k = 0; k < 8; ++k) {
+            out[k] = ((a[k]) *
+                      (b[k]) *
+                      (c[k]) +
+                      (out[k]));
+        }
+
+        a += 8;
+        b += 8;
+        c += 8;
+        out += 8;
+    }
+
+    /*
+     * Remainder: up to eight more slots are visited in ascending order,
+     * and the function returns as soon as the remaining count reaches 0.
+     */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] = ((a[k]) *
+                  (b[k]) *
+                  (c[k]) +
+                  (out[k]));
+    }
+}
+
+#else /* 1000 > 3 || @complex */
+
+/*
+ * Contiguous kernel for an arbitrary number of operands.  For each of the
+ * `count` elements it multiplies together the current npy_double of
+ * operands 0..nop-1, adds that product to the output element at
+ * dataptr[nop], and then advances all nop+1 entries of `dataptr` by
+ * sizeof(npy_double).  The entries of `dataptr` are modified in place.
+ */
+static void
+double_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+
+    for (; count != 0; --count) {
+        int op = 1;
+        npy_double product = (*(npy_double *)dataptr[0]);
+
+        while (op < nop) {
+            product *= (*(npy_double *)dataptr[op]);
+            ++op;
+        }
+        /* op now indexes the output slot (op == nop whenever nop >= 1) */
+        *(npy_double *)dataptr[nop] = (product +
+                                           (*(npy_double *)dataptr[op]));
+
+        /* step every operand pointer and the output pointer */
+        for (op = 0; op <= nop; ++op) {
+            dataptr[op] += sizeof(npy_double);
+        }
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+/* Sums `count` contiguous elements of dataptr[0] and adds the total to the single output element at dataptr[1]. */
+static NPY_GCC_OPT_3 void
+double_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0 /* real-valued branch */
+    npy_double *data = (npy_double *)dataptr[0];
+    npy_double accum = double_sum_of_arr(data, count); /* helper defined outside this block; presumably returns the sum of `count` elements */
+    *((npy_double *)dataptr[1]) = (accum + (*((npy_double *)dataptr[1])));
+#else /* complex branch: elements are read as interleaved (re, im) pairs */
+    npy_double accum_re = 0, accum_im = 0;
+    npy_double *data0 = (npy_double *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) { /* four (re, im) pairs per pass */
+        const npy_double re01 = data0[0] + data0[2];
+        const npy_double re23 = data0[4] + data0[6];
+        const npy_double im13 = data0[1] + data0[3];
+        const npy_double im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) { /* remaining pairs, one at a time */
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_double *)dataptr[1])[0] += accum_re;
+    ((npy_double *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1000 == 1 */
+/* Reduction kernel: accumulates, over `count` elements, the product of the operands into a local sum, then adds that sum once to the output element. */
+static void
+double_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_double accum_re = 0, accum_im = 0;
+#else
+    npy_double accum = 0;
+#endif
+/* Every `1000 == k` / `1000 <= 3` test in this function is false, so only the generic dataptr[]/strides[] paths get compiled. */
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("double_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0 /* real-valued branch */
+#  if 1000 == 1
+        accum += (*(npy_double *)data0);
+        data0 += stride0;
+#  elif 1000 == 2
+        accum += (*(npy_double *)data0) *
+                 (*(npy_double *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1000 == 3
+        accum += (*(npy_double *)data0) *
+                 (*(npy_double *)data1) *
+                 (*(npy_double *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_double temp = (*(npy_double *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_double *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) { /* < nop: only the operand pointers move; dataptr[nop] is left alone */
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        accum_re += ((npy_double *)data0)[0];
+        accum_im += ((npy_double *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_double re, im, tmp;
+        int i;
+        re = ((npy_double *)dataptr[0])[0];
+        im = ((npy_double *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_double *)dataptr[i])[0] -
+                  im * ((npy_double *)dataptr[i])[1];
+            im = re * ((npy_double *)dataptr[i])[1] +
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* Fold the accumulated total into the output element (a single write after the loop). */
+#if 0
+#  if 1000 <= 3
+    ((npy_double *)dataptr[1000])[0] += accum_re;
+    ((npy_double *)dataptr[1000])[1] += accum_im;
+#  else
+    ((npy_double *)dataptr[nop])[0] += accum_re;
+    ((npy_double *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1000 <= 3
+    *((npy_double *)dataptr[1000]) = (accum +
+                                    (*((npy_double *)dataptr[1000])));
+#  else
+    *((npy_double *)dataptr[nop]) = (accum +
+                                    (*((npy_double *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+
+
+#line 74
+/* Returns the sum of `count` contiguous npy_longdouble values starting at `data`. */
+#if !0
+static NPY_GCC_OPT_3 npy_longdouble longdouble_sum_of_arr(npy_longdouble *data, npy_intp count)
+{
+    npy_longdouble accum = 0;
+#if 0 // NPYV check for npy_longdouble (constant 0 here, so the scalar #else path below is the one compiled)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data);
+    const int vstep = npyv_nlanes_longdouble;
+    npyv_longdouble v_accum = npyv_zero_longdouble();
+    const npy_intp vstepx4 = vstep * 4;
+
+    #line 91
+    if(is_aligned) {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_longdouble a0 = npyv_loada_longdouble(data + vstep * 0);
+            
+#line 96
+            npyv_longdouble a1 = npyv_loada_longdouble(data + vstep * 1);
+            
+#line 96
+            npyv_longdouble a2 = npyv_loada_longdouble(data + vstep * 2);
+            
+#line 96
+            npyv_longdouble a3 = npyv_loada_longdouble(data + vstep * 3);
+            
+            npyv_longdouble a01   = npyv_add_longdouble(a0, a1);
+            npyv_longdouble a23   = npyv_add_longdouble(a2, a3);
+            npyv_longdouble a0123 = npyv_add_longdouble(a01, a23);
+                     v_accum = npyv_add_longdouble(a0123, v_accum);
+        }
+    }
+    
+#line 91
+    else {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_longdouble a0 = npyv_load_longdouble(data + vstep * 0);
+            
+#line 96
+            npyv_longdouble a1 = npyv_load_longdouble(data + vstep * 1);
+            
+#line 96
+            npyv_longdouble a2 = npyv_load_longdouble(data + vstep * 2);
+            
+#line 96
+            npyv_longdouble a3 = npyv_load_longdouble(data + vstep * 3);
+            
+            npyv_longdouble a01   = npyv_add_longdouble(a0, a1);
+            npyv_longdouble a23   = npyv_add_longdouble(a2, a3);
+            npyv_longdouble a0123 = npyv_add_longdouble(a01, a23);
+                     v_accum = npyv_add_longdouble(a0123, v_accum);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep) {
+        npyv_longdouble a = npyv_load_tillz_longdouble(data, count);
+        v_accum = npyv_add_longdouble(a, v_accum);
+    }
+    accum = npyv_sum_longdouble(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data += 4) { /* 4 elements per pass as two pairwise partial sums; stops with at most 4 left */
+        const npy_longdouble a01 = (*data) + (data[1]);
+        const npy_longdouble a23 = (data[2]) + (data[3]);
+        accum +=  a01 + a23;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data++) { /* tail (or everything, when the unrolled loop is compiled out) */
+        accum += (*data);
+    }
+#endif // NPYV check for npy_longdouble
+    return accum;
+}
+#endif
+/* Single-operand strided kernel: for each of `count` elements, adds the input element (dataptr[0]) to the output element (dataptr[1]). */
+#line 131
+static void
+longdouble_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data_out = dataptr[1];
+    npy_intp stride_out = strides[1];
+#endif
+    /* data0/data_out are local byte pointers; they advance by stride0/stride_out (values taken from strides[0]/strides[1]). */
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_one (%d)\n", (int)count);
+
+    while (count--) {
+#if !0 /* real-valued branch */
+#  if 1 == 1 /* the branch actually compiled for this one-operand instantiation */
+        *(npy_longdouble *)data_out = ((*(npy_longdouble *)data0) +
+                                         (*(npy_longdouble *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1 == 2
+        *(npy_longdouble *)data_out = ((*(npy_longdouble *)data0) *
+                                         (*(npy_longdouble *)data1) +
+                                         (*(npy_longdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1 == 3
+        *(npy_longdouble *)data_out = ((*(npy_longdouble *)data0) *
+                                         (*(npy_longdouble *)data1) *
+                                         (*(npy_longdouble *)data2) +
+                                         (*(npy_longdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_longdouble temp = (*(npy_longdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longdouble *)dataptr[i]);
+        }
+        *(npy_longdouble *)dataptr[nop] = (temp +
+                                           (*(npy_longdouble *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        ((npy_longdouble *)data_out)[0] = ((npy_longdouble *)data0)[0] +
+                                         ((npy_longdouble *)data_out)[0];
+        ((npy_longdouble *)data_out)[1] = ((npy_longdouble *)data0)[1] +
+                                         ((npy_longdouble *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_longdouble re, im, tmp;
+        int i;
+        re = ((npy_longdouble *)dataptr[0])[0];
+        im = ((npy_longdouble *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longdouble *)dataptr[i])[0] -
+                  im * ((npy_longdouble *)dataptr[i])[1];
+            im = re * ((npy_longdouble *)dataptr[i])[1] +
+                 im * ((npy_longdouble *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1 == 1
+/* Single-operand contiguous kernel: data_out[k] += data0[k] for k in [0, count), processed 8 at a time with a fall-through switch for the remainder. */
+static void
+longdouble_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+    npy_longdouble *data_out = (npy_longdouble *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+    /* The cases below fall through on purpose: entering at `case k` handles elements k-1 down to 0, then returns. */
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_longdouble *)data_out + 2*6)[0] =
+                                    ((npy_longdouble *)data0 + 2*6)[0] +
+                                    ((npy_longdouble *)data_out + 2*6)[0];
+            ((npy_longdouble *)data_out + 2*6)[1] =
+                                    ((npy_longdouble *)data0 + 2*6)[1] +
+                                    ((npy_longdouble *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_longdouble *)data_out + 2*5)[0] =
+                                    ((npy_longdouble *)data0 + 2*5)[0] +
+                                    ((npy_longdouble *)data_out + 2*5)[0];
+            ((npy_longdouble *)data_out + 2*5)[1] =
+                                    ((npy_longdouble *)data0 + 2*5)[1] +
+                                    ((npy_longdouble *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_longdouble *)data_out + 2*4)[0] =
+                                    ((npy_longdouble *)data0 + 2*4)[0] +
+                                    ((npy_longdouble *)data_out + 2*4)[0];
+            ((npy_longdouble *)data_out + 2*4)[1] =
+                                    ((npy_longdouble *)data0 + 2*4)[1] +
+                                    ((npy_longdouble *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_longdouble *)data_out + 2*3)[0] =
+                                    ((npy_longdouble *)data0 + 2*3)[0] +
+                                    ((npy_longdouble *)data_out + 2*3)[0];
+            ((npy_longdouble *)data_out + 2*3)[1] =
+                                    ((npy_longdouble *)data0 + 2*3)[1] +
+                                    ((npy_longdouble *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_longdouble *)data_out + 2*2)[0] =
+                                    ((npy_longdouble *)data0 + 2*2)[0] +
+                                    ((npy_longdouble *)data_out + 2*2)[0];
+            ((npy_longdouble *)data_out + 2*2)[1] =
+                                    ((npy_longdouble *)data0 + 2*2)[1] +
+                                    ((npy_longdouble *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_longdouble *)data_out + 2*1)[0] =
+                                    ((npy_longdouble *)data0 + 2*1)[0] +
+                                    ((npy_longdouble *)data_out + 2*1)[0];
+            ((npy_longdouble *)data_out + 2*1)[1] =
+                                    ((npy_longdouble *)data0 + 2*1)[1] +
+                                    ((npy_longdouble *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_longdouble *)data_out + 2*0)[0] =
+                                    ((npy_longdouble *)data0 + 2*0)[0] +
+                                    ((npy_longdouble *)data_out + 2*0)[0];
+            ((npy_longdouble *)data_out + 2*0)[1] =
+                                    ((npy_longdouble *)data0 + 2*0)[1] +
+                                    ((npy_longdouble *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+    /* A count of 8 or more matches no case above and drops out of the switch into the unrolled loop. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*0)[0] =
+                                ((npy_longdouble *)data0 + 2*0)[0] +
+                                ((npy_longdouble *)data_out + 2*0)[0];
+        ((npy_longdouble *)data_out + 2*0)[1] =
+                                ((npy_longdouble *)data0 + 2*0)[1] +
+                                ((npy_longdouble *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*1)[0] =
+                                ((npy_longdouble *)data0 + 2*1)[0] +
+                                ((npy_longdouble *)data_out + 2*1)[0];
+        ((npy_longdouble *)data_out + 2*1)[1] =
+                                ((npy_longdouble *)data0 + 2*1)[1] +
+                                ((npy_longdouble *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*2)[0] =
+                                ((npy_longdouble *)data0 + 2*2)[0] +
+                                ((npy_longdouble *)data_out + 2*2)[0];
+        ((npy_longdouble *)data_out + 2*2)[1] =
+                                ((npy_longdouble *)data0 + 2*2)[1] +
+                                ((npy_longdouble *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*3)[0] =
+                                ((npy_longdouble *)data0 + 2*3)[0] +
+                                ((npy_longdouble *)data_out + 2*3)[0];
+        ((npy_longdouble *)data_out + 2*3)[1] =
+                                ((npy_longdouble *)data0 + 2*3)[1] +
+                                ((npy_longdouble *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*4)[0] =
+                                ((npy_longdouble *)data0 + 2*4)[0] +
+                                ((npy_longdouble *)data_out + 2*4)[0];
+        ((npy_longdouble *)data_out + 2*4)[1] =
+                                ((npy_longdouble *)data0 + 2*4)[1] +
+                                ((npy_longdouble *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*5)[0] =
+                                ((npy_longdouble *)data0 + 2*5)[0] +
+                                ((npy_longdouble *)data_out + 2*5)[0];
+        ((npy_longdouble *)data_out + 2*5)[1] =
+                                ((npy_longdouble *)data0 + 2*5)[1] +
+                                ((npy_longdouble *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*6)[0] =
+                                ((npy_longdouble *)data0 + 2*6)[0] +
+                                ((npy_longdouble *)data_out + 2*6)[0];
+        ((npy_longdouble *)data_out + 2*6)[1] =
+                                ((npy_longdouble *)data0 + 2*6)[1] +
+                                ((npy_longdouble *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*7)[0] =
+                                ((npy_longdouble *)data0 + 2*7)[0] +
+                                ((npy_longdouble *)data_out + 2*7)[0];
+        ((npy_longdouble *)data_out + 2*7)[1] =
+                                ((npy_longdouble *)data0 + 2*7)[1] +
+                                ((npy_longdouble *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+    /* NOTE(review): a negative count matches no case and fails `count >= 8`, so the goto below would repeat forever; presumably callers never pass count < 0. */
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1 == 2 && !0
+
+// Multiply-add over `count` contiguous elements: data_out[i] = scalar * data[i] + data_out[i]
+static NPY_GCC_OPT_3 void
+longdouble_sum_of_products_muladd(npy_longdouble *data, npy_longdouble *data_out, npy_longdouble scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_longdouble (constant 0 here, so the scalar #else path below is the one compiled)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_longdouble;
+    const npyv_longdouble v_scalar = npyv_setall_longdouble(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_longdouble b0 = npyv_loada_longdouble(data + vstep * 0);
+            npyv_longdouble c0 = npyv_loada_longdouble(data_out + vstep * 0);
+            
+#line 312
+            npyv_longdouble b1 = npyv_loada_longdouble(data + vstep * 1);
+            npyv_longdouble c1 = npyv_loada_longdouble(data_out + vstep * 1);
+            
+#line 312
+            npyv_longdouble b2 = npyv_loada_longdouble(data + vstep * 2);
+            npyv_longdouble c2 = npyv_loada_longdouble(data_out + vstep * 2);
+            
+#line 312
+            npyv_longdouble b3 = npyv_loada_longdouble(data + vstep * 3);
+            npyv_longdouble c3 = npyv_loada_longdouble(data_out + vstep * 3);
+            
+            #line 318
+            npyv_longdouble abc0 = npyv_muladd_longdouble(v_scalar, b0, c0);
+            
+#line 318
+            npyv_longdouble abc1 = npyv_muladd_longdouble(v_scalar, b1, c1);
+            
+#line 318
+            npyv_longdouble abc2 = npyv_muladd_longdouble(v_scalar, b2, c2);
+            
+#line 318
+            npyv_longdouble abc3 = npyv_muladd_longdouble(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_longdouble(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_longdouble(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_longdouble(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_longdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_longdouble b0 = npyv_load_longdouble(data + vstep * 0);
+            npyv_longdouble c0 = npyv_load_longdouble(data_out + vstep * 0);
+            
+#line 312
+            npyv_longdouble b1 = npyv_load_longdouble(data + vstep * 1);
+            npyv_longdouble c1 = npyv_load_longdouble(data_out + vstep * 1);
+            
+#line 312
+            npyv_longdouble b2 = npyv_load_longdouble(data + vstep * 2);
+            npyv_longdouble c2 = npyv_load_longdouble(data_out + vstep * 2);
+            
+#line 312
+            npyv_longdouble b3 = npyv_load_longdouble(data + vstep * 3);
+            npyv_longdouble c3 = npyv_load_longdouble(data_out + vstep * 3);
+            
+            #line 318
+            npyv_longdouble abc0 = npyv_muladd_longdouble(v_scalar, b0, c0);
+            
+#line 318
+            npyv_longdouble abc1 = npyv_muladd_longdouble(v_scalar, b1, c1);
+            
+#line 318
+            npyv_longdouble abc2 = npyv_muladd_longdouble(v_scalar, b2, c2);
+            
+#line 318
+            npyv_longdouble abc3 = npyv_muladd_longdouble(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_longdouble(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_longdouble(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_longdouble(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_longdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_longdouble a = npyv_load_tillz_longdouble(data, count);
+        npyv_longdouble b = npyv_load_tillz_longdouble(data_out, count);
+        npyv_store_till_longdouble(data_out, count, npyv_muladd_longdouble(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { /* 4 elements per pass: all loads, then all multiply-adds, then all stores */
+        #line 340
+        const npy_longdouble b0 = (data[0]);
+        const npy_longdouble c0 = (data_out[0]);
+        
+#line 340
+        const npy_longdouble b1 = (data[1]);
+        const npy_longdouble c1 = (data_out[1]);
+        
+#line 340
+        const npy_longdouble b2 = (data[2]);
+        const npy_longdouble c2 = (data_out[2]);
+        
+#line 340
+        const npy_longdouble b3 = (data[3]);
+        const npy_longdouble c3 = (data_out[3]);
+        
+        #line 346
+        const npy_longdouble abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_longdouble abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_longdouble abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_longdouble abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* remaining 0-3 elements (or all of them when the unrolled loop is compiled out) */
+        const npy_longdouble b = (*data);
+        const npy_longdouble c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_longdouble
+}
+/* Two-operand contiguous kernel: data_out[i] = data0[i] * data1[i] + data_out[i] for `count` elements (operands from dataptr[0] and dataptr[1], output at dataptr[2]). */
+static void
+longdouble_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+    npy_longdouble *data1 = (npy_longdouble *)dataptr[1];
+    npy_longdouble *data_out = (npy_longdouble *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_longdouble (the #if below is a constant 0, so the scalar #else path is the one compiled)
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_longdouble;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_longdouble a0 = npyv_loada_longdouble(data0 + vstep * 0);
+            npyv_longdouble b0 = npyv_loada_longdouble(data1 + vstep * 0);
+            npyv_longdouble c0 = npyv_loada_longdouble(data_out + vstep * 0);
+            
+#line 390
+            npyv_longdouble a1 = npyv_loada_longdouble(data0 + vstep * 1);
+            npyv_longdouble b1 = npyv_loada_longdouble(data1 + vstep * 1);
+            npyv_longdouble c1 = npyv_loada_longdouble(data_out + vstep * 1);
+            
+#line 390
+            npyv_longdouble a2 = npyv_loada_longdouble(data0 + vstep * 2);
+            npyv_longdouble b2 = npyv_loada_longdouble(data1 + vstep * 2);
+            npyv_longdouble c2 = npyv_loada_longdouble(data_out + vstep * 2);
+            
+#line 390
+            npyv_longdouble a3 = npyv_loada_longdouble(data0 + vstep * 3);
+            npyv_longdouble b3 = npyv_loada_longdouble(data1 + vstep * 3);
+            npyv_longdouble c3 = npyv_loada_longdouble(data_out + vstep * 3);
+            
+            #line 397
+            npyv_longdouble abc0 = npyv_muladd_longdouble(a0, b0, c0);
+            
+#line 397
+            npyv_longdouble abc1 = npyv_muladd_longdouble(a1, b1, c1);
+            
+#line 397
+            npyv_longdouble abc2 = npyv_muladd_longdouble(a2, b2, c2);
+            
+#line 397
+            npyv_longdouble abc3 = npyv_muladd_longdouble(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_longdouble(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_longdouble(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_longdouble(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_longdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_longdouble a0 = npyv_load_longdouble(data0 + vstep * 0);
+            npyv_longdouble b0 = npyv_load_longdouble(data1 + vstep * 0);
+            npyv_longdouble c0 = npyv_load_longdouble(data_out + vstep * 0);
+            
+#line 390
+            npyv_longdouble a1 = npyv_load_longdouble(data0 + vstep * 1);
+            npyv_longdouble b1 = npyv_load_longdouble(data1 + vstep * 1);
+            npyv_longdouble c1 = npyv_load_longdouble(data_out + vstep * 1);
+            
+#line 390
+            npyv_longdouble a2 = npyv_load_longdouble(data0 + vstep * 2);
+            npyv_longdouble b2 = npyv_load_longdouble(data1 + vstep * 2);
+            npyv_longdouble c2 = npyv_load_longdouble(data_out + vstep * 2);
+            
+#line 390
+            npyv_longdouble a3 = npyv_load_longdouble(data0 + vstep * 3);
+            npyv_longdouble b3 = npyv_load_longdouble(data1 + vstep * 3);
+            npyv_longdouble c3 = npyv_load_longdouble(data_out + vstep * 3);
+            
+            #line 397
+            npyv_longdouble abc0 = npyv_muladd_longdouble(a0, b0, c0);
+            
+#line 397
+            npyv_longdouble abc1 = npyv_muladd_longdouble(a1, b1, c1);
+            
+#line 397
+            npyv_longdouble abc2 = npyv_muladd_longdouble(a2, b2, c2);
+            
+#line 397
+            npyv_longdouble abc3 = npyv_muladd_longdouble(a3, b3, c3);
+            
+            #line 402
+            npyv_store_longdouble(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_longdouble(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_longdouble(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_longdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_longdouble a = npyv_load_tillz_longdouble(data0, count);
+        npyv_longdouble b = npyv_load_tillz_longdouble(data1, count);
+        npyv_longdouble c = npyv_load_tillz_longdouble(data_out, count);
+        npyv_store_till_longdouble(data_out, count, npyv_muladd_longdouble(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { /* 4 elements per pass: all loads, then all multiply-adds, then all stores */
+        #line 420
+        const npy_longdouble a0 = (data0[0]);
+        const npy_longdouble b0 = (data1[0]);
+        const npy_longdouble c0 = (data_out[0]);
+        
+#line 420
+        const npy_longdouble a1 = (data0[1]);
+        const npy_longdouble b1 = (data1[1]);
+        const npy_longdouble c1 = (data_out[1]);
+        
+#line 420
+        const npy_longdouble a2 = (data0[2]);
+        const npy_longdouble b2 = (data1[2]);
+        const npy_longdouble c2 = (data_out[2]);
+        
+#line 420
+        const npy_longdouble a3 = (data0[3]);
+        const npy_longdouble b3 = (data1[3]);
+        const npy_longdouble c3 = (data_out[3]);
+        
+        #line 427
+        const npy_longdouble abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_longdouble abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_longdouble abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_longdouble abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* remaining 0-3 elements (or all of them when the unrolled loop is compiled out) */
+        const npy_longdouble a = (*data0);
+        const npy_longdouble b = (*data1);
+        const npy_longdouble c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_longdouble
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+longdouble_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 0 is read once up front and used as the scalar multiplier. */
+    npy_longdouble *const out_vec = (npy_longdouble *)dataptr[2];
+    npy_longdouble *const in_vec = (npy_longdouble *)dataptr[1];
+    const npy_longdouble factor = *(npy_longdouble *)dataptr[0];
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    /* out_vec[k] = factor * in_vec[k] + out_vec[k], for each of `count` elements */
+    longdouble_sum_of_products_muladd(in_vec, out_vec, factor, count);
+}
+
+/*
+ * Two-operand kernel: operand 1 is read once as a scalar, while operand 0
+ * and the output are handed over as npy_longdouble arrays.  The arithmetic is
+ * delegated to longdouble_sum_of_products_muladd() for `count` elements.
+ */
+static void
+longdouble_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    const npy_longdouble scalar = *(npy_longdouble *)dataptr[1];
+    npy_longdouble *const vec = (npy_longdouble *)dataptr[0];
+    npy_longdouble *const out = (npy_longdouble *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    longdouble_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+/*
+ * Two-operand kernel with a single output element: accumulates
+ * data0[i] * data1[i] over `count` consecutive npy_longdouble elements and
+ * adds the total to the value stored at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+longdouble_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+    npy_longdouble *data1 = (npy_longdouble *)dataptr[1];
+    npy_longdouble accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+    /* The condition is a literal 0, so the NPYV branch below is never compiled. */
+#if 0 // NPYV check for npy_longdouble
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_longdouble;
+    npyv_longdouble v_accum = npyv_zero_longdouble();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_longdouble a0 = npyv_loada_longdouble(data0 + vstep * 0);
+            npyv_longdouble b0 = npyv_loada_longdouble(data1 + vstep * 0);
+            
+#line 501
+            npyv_longdouble a1 = npyv_loada_longdouble(data0 + vstep * 1);
+            npyv_longdouble b1 = npyv_loada_longdouble(data1 + vstep * 1);
+            
+#line 501
+            npyv_longdouble a2 = npyv_loada_longdouble(data0 + vstep * 2);
+            npyv_longdouble b2 = npyv_loada_longdouble(data1 + vstep * 2);
+            
+#line 501
+            npyv_longdouble a3 = npyv_loada_longdouble(data0 + vstep * 3);
+            npyv_longdouble b3 = npyv_loada_longdouble(data1 + vstep * 3);
+            
+            npyv_longdouble ab3 = npyv_muladd_longdouble(a3, b3, v_accum);
+            npyv_longdouble ab2 = npyv_muladd_longdouble(a2, b2, ab3);
+            npyv_longdouble ab1 = npyv_muladd_longdouble(a1, b1, ab2);
+                   v_accum = npyv_muladd_longdouble(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_longdouble a0 = npyv_load_longdouble(data0 + vstep * 0);
+            npyv_longdouble b0 = npyv_load_longdouble(data1 + vstep * 0);
+            
+#line 501
+            npyv_longdouble a1 = npyv_load_longdouble(data0 + vstep * 1);
+            npyv_longdouble b1 = npyv_load_longdouble(data1 + vstep * 1);
+            
+#line 501
+            npyv_longdouble a2 = npyv_load_longdouble(data0 + vstep * 2);
+            npyv_longdouble b2 = npyv_load_longdouble(data1 + vstep * 2);
+            
+#line 501
+            npyv_longdouble a3 = npyv_load_longdouble(data0 + vstep * 3);
+            npyv_longdouble b3 = npyv_load_longdouble(data1 + vstep * 3);
+            
+            npyv_longdouble ab3 = npyv_muladd_longdouble(a3, b3, v_accum);
+            npyv_longdouble ab2 = npyv_muladd_longdouble(a2, b2, ab3);
+            npyv_longdouble ab1 = npyv_muladd_longdouble(a1, b1, ab2);
+                   v_accum = npyv_muladd_longdouble(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_longdouble a = npyv_load_tillz_longdouble(data0, count);
+        npyv_longdouble b = npyv_load_tillz_longdouble(data1, count);
+        v_accum = npyv_muladd_longdouble(a, b, v_accum);
+    }
+    accum = npyv_sum_longdouble(v_accum);
+    npyv_cleanup();
+#else
+    /* Scalar path: four products per iteration unless NPY_DISABLE_OPTIMIZATION is defined. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_longdouble ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_longdouble ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_longdouble ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_longdouble ab3 = (data0[3]) * (data1[3]);
+        
+        /* The four products are summed first, then added to the running total. */
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining elements (or all of them when the unrolled loop is disabled). */
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_longdouble a = (*data0);
+        const npy_longdouble b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_longdouble
+    /* Fold the local total into the existing output value. */
+    *(npy_longdouble *)dataptr[2] = ((*(npy_longdouble *)dataptr[2]) + accum);
+}
+
+/*
+ * Two-operand kernel with a single output element: operand 0 is a scalar,
+ * operand 1 is summed over `count` elements by longdouble_sum_of_arr(), and
+ * scalar * sum is added to the value stored at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+longdouble_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *const out = (npy_longdouble *)dataptr[2];
+    const npy_longdouble scalar = *(npy_longdouble *)dataptr[0];
+    const npy_longdouble total =
+            longdouble_sum_of_arr((npy_longdouble *)dataptr[1], count);
+
+    *out = *out + scalar * total;
+}
+
+/*
+ * Two-operand kernel with a single output element: operand 1 is a scalar,
+ * operand 0 is summed over `count` elements by longdouble_sum_of_arr(), and
+ * scalar * sum is added to the value stored at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+longdouble_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *const out = (npy_longdouble *)dataptr[2];
+    const npy_longdouble scalar = *(npy_longdouble *)dataptr[1];
+    const npy_longdouble total =
+            longdouble_sum_of_arr((npy_longdouble *)dataptr[0], count);
+
+    *out = *out + scalar * total;
+}
+
+#elif 1 == 3 && !0
+
+/*
+ * Three-operand kernel over consecutive npy_longdouble elements:
+ *     out[i] = in0[i] * in1[i] * in2[i] + out[i]
+ * Elements are processed in groups of eight; the remainder is then handled
+ * one element at a time until `count` is used up.
+ */
+static void
+longdouble_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *in0 = (npy_longdouble *)dataptr[0];
+    npy_longdouble *in1 = (npy_longdouble *)dataptr[1];
+    npy_longdouble *in2 = (npy_longdouble *)dataptr[2];
+    npy_longdouble *out = (npy_longdouble *)dataptr[3];
+    int k;
+
+    /* Full groups of eight elements */
+    for (; count >= 8; count -= 8) {
+        for (k = 0; k < 8; ++k) {
+            out[k] = in0[k] * in1[k] * in2[k] + out[k];
+        }
+        in0 += 8;
+        in1 += 8;
+        in2 += 8;
+        out += 8;
+    }
+
+    /* Remainder: leave as soon as no elements are left */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] = in0[k] * in1[k] * in2[k] + out[k];
+    }
+}
+
+#else /* 1 > 3 || @complex */
+
+/*
+ * Generic kernel for packed npy_longdouble operands: for each of `count`
+ * elements, multiplies the current values of operands 0..nop-1 and adds the
+ * product to the output operand.  The entries of dataptr are advanced in
+ * place by sizeof(npy_longdouble) after every element.
+ */
+static void
+longdouble_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+        /* `!0` is always true, so this first branch is the one compiled. */
+#if !0
+        npy_longdouble temp = (*(npy_longdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longdouble *)dataptr[i]);
+        }
+        /* For nop >= 1 the loop above leaves i == nop, i.e. the output slot. */
+        *(npy_longdouble *)dataptr[nop] = (temp +
+                                           (*(npy_longdouble *)dataptr[i]));
+        /* Step every pointer, output included (note the <=). */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_longdouble);
+        }
+#else /* complex */
+        /* Compiled-out variant that works on (re, im) pairs. */
+#  if 1 <= 3
+#    define _SUMPROD_NOP 1
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_longdouble re, im, tmp;
+        int i;
+        re = ((npy_longdouble *)dataptr[0])[0];
+        im = ((npy_longdouble *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longdouble *)dataptr[i])[0] -
+                  im * ((npy_longdouble *)dataptr[i])[1];
+            im = re * ((npy_longdouble *)dataptr[i])[1] +
+                 im * ((npy_longdouble *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_longdouble);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1 */
+
+#if 1 == 1
+
+/*
+ * Single-operand reduction into one output element: adds the sum of the
+ * `count` npy_longdouble values starting at dataptr[0] to the value stored at
+ * dataptr[1].  The summation itself is done by longdouble_sum_of_arr().
+ */
+static NPY_GCC_OPT_3 void
+longdouble_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+    /* `!0` is always true, so only the three statements below are compiled. */
+#if !0
+    npy_longdouble *data = (npy_longdouble *)dataptr[0];
+    npy_longdouble accum = longdouble_sum_of_arr(data, count);
+    *((npy_longdouble *)dataptr[1]) = (accum + (*((npy_longdouble *)dataptr[1])));
+#else
+    /* Compiled-out variant keeping separate accum_re / accum_im totals over (re, im) pairs. */
+    npy_longdouble accum_re = 0, accum_im = 0;
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_longdouble re01 = data0[0] + data0[2];
+        const npy_longdouble re23 = data0[4] + data0[6];
+        const npy_longdouble im13 = data0[1] + data0[3];
+        const npy_longdouble im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_longdouble *)dataptr[1])[0] += accum_re;
+    ((npy_longdouble *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1 == 1 */
+
+/*
+ * Strided single-operand reduction into one output element: sums `count`
+ * npy_longdouble values read from dataptr[0], stepping the char pointer by
+ * strides[0] bytes each time, and adds the total to the value at dataptr[1].
+ * Most preprocessor branches below have constant conditions; the comments
+ * mark which ones are compiled.
+ */
+static void
+longdouble_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    /* Literal 0: the single `accum` declaration in the #else is used. */
+#if 0
+    npy_longdouble accum_re = 0, accum_im = 0;
+#else
+    npy_longdouble accum = 0;
+#endif
+
+    /* True (1 == 1): operand 0 is tracked through a local pointer/stride. */
+#if (1 == 1) || (1 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+    /* False: no second operand in this expansion. */
+#if (1 == 2 || 1 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+    /* False: no third operand in this expansion. */
+#if (1 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+        /* Compiled branch (1 == 1): accumulate one value and advance by the byte stride. */
+#  if 1 == 1
+        accum += (*(npy_longdouble *)data0);
+        data0 += stride0;
+#  elif 1 == 2
+        accum += (*(npy_longdouble *)data0) *
+                 (*(npy_longdouble *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1 == 3
+        accum += (*(npy_longdouble *)data0) *
+                 (*(npy_longdouble *)data1) *
+                 (*(npy_longdouble *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_longdouble temp = (*(npy_longdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longdouble *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+        /* Compiled out: variant operating on (re, im) pairs. */
+#  if 1 == 1
+        accum_re += ((npy_longdouble *)data0)[0];
+        accum_im += ((npy_longdouble *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_longdouble re, im, tmp;
+        int i;
+        re = ((npy_longdouble *)dataptr[0])[0];
+        im = ((npy_longdouble *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longdouble *)dataptr[i])[0] -
+                  im * ((npy_longdouble *)dataptr[i])[1];
+            im = re * ((npy_longdouble *)dataptr[i])[1] +
+                 im * ((npy_longdouble *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+    /* Literal 0 again: the #else below writes the result to dataptr[1]. */
+#if 0
+#  if 1 <= 3
+    ((npy_longdouble *)dataptr[1])[0] += accum_re;
+    ((npy_longdouble *)dataptr[1])[1] += accum_im;
+#  else
+    ((npy_longdouble *)dataptr[nop])[0] += accum_re;
+    ((npy_longdouble *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1 <= 3
+    *((npy_longdouble *)dataptr[1]) = (accum +
+                                    (*((npy_longdouble *)dataptr[1])));
+#  else
+    *((npy_longdouble *)dataptr[nop]) = (accum +
+                                    (*((npy_longdouble *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+/*
+ * Strided two-operand kernel: for each of `count` elements computes
+ *     out = in0 * in1 + out
+ * on npy_longdouble values, advancing the three char pointers by the byte
+ * strides strides[0], strides[1] and strides[2].  Most preprocessor branches
+ * below have constant conditions; the comments mark which ones are compiled.
+ */
+static void
+longdouble_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    /* True (2 <= 3 && !0): operand 0 uses a local pointer/stride. */
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+    /* True (2 == 2): operand 1 uses a local pointer/stride. */
+#if (2 == 2 || 2 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+    /* False: no third input operand in this expansion. */
+#if (2 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+    /* True: the output is operand index 2. */
+#if (2 == 1) || (2 <= 3 && !0)
+    char *data_out = dataptr[2];
+    npy_intp stride_out = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_two (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 2 == 1
+        *(npy_longdouble *)data_out = ((*(npy_longdouble *)data0) +
+                                         (*(npy_longdouble *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+        /* Compiled branch (2 == 2): multiply-accumulate, then step all three pointers. */
+#  elif 2 == 2
+        *(npy_longdouble *)data_out = ((*(npy_longdouble *)data0) *
+                                         (*(npy_longdouble *)data1) +
+                                         (*(npy_longdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 2 == 3
+        *(npy_longdouble *)data_out = ((*(npy_longdouble *)data0) *
+                                         (*(npy_longdouble *)data1) *
+                                         (*(npy_longdouble *)data2) +
+                                         (*(npy_longdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_longdouble temp = (*(npy_longdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longdouble *)dataptr[i]);
+        }
+        *(npy_longdouble *)dataptr[nop] = (temp +
+                                           (*(npy_longdouble *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+        /* Compiled out: variant operating on (re, im) pairs. */
+#  if 2 == 1
+        ((npy_longdouble *)data_out)[0] = ((npy_longdouble *)data0)[0] +
+                                         ((npy_longdouble *)data_out)[0];
+        ((npy_longdouble *)data_out)[1] = ((npy_longdouble *)data0)[1] +
+                                         ((npy_longdouble *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_longdouble re, im, tmp;
+        int i;
+        re = ((npy_longdouble *)dataptr[0])[0];
+        im = ((npy_longdouble *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longdouble *)dataptr[i])[0] -
+                  im * ((npy_longdouble *)dataptr[i])[1];
+            im = re * ((npy_longdouble *)dataptr[i])[1] +
+                 im * ((npy_longdouble *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 2 == 1
+
+/*
+ * Single-operand kernel over consecutive npy_longdouble elements:
+ *     data_out[i] = data0[i] + data_out[i]
+ * Counts below eight are handled by the fall-through switch; larger counts
+ * run the eight-wide loop and then jump back to the switch for the rest.
+ * Every `#if !0` below is always true, so the two-line real-valued statement
+ * is the one compiled and each `#else` (re, im) variant is skipped.
+ * NOTE(review): the `#if 2 == 1` guard directly above this definition is
+ * false, so this copy appears to be compiled out in this expansion.
+ */
+static void
+longdouble_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+    npy_longdouble *data_out = (npy_longdouble *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    /*
+     * No case has a break: entering at `case k+1` handles elements
+     * k, k-1, ..., 0 and then returns through `case 0`.  Values of count
+     * without a matching case skip the switch entirely.
+     */
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_longdouble *)data_out + 2*6)[0] =
+                                    ((npy_longdouble *)data0 + 2*6)[0] +
+                                    ((npy_longdouble *)data_out + 2*6)[0];
+            ((npy_longdouble *)data_out + 2*6)[1] =
+                                    ((npy_longdouble *)data0 + 2*6)[1] +
+                                    ((npy_longdouble *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_longdouble *)data_out + 2*5)[0] =
+                                    ((npy_longdouble *)data0 + 2*5)[0] +
+                                    ((npy_longdouble *)data_out + 2*5)[0];
+            ((npy_longdouble *)data_out + 2*5)[1] =
+                                    ((npy_longdouble *)data0 + 2*5)[1] +
+                                    ((npy_longdouble *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_longdouble *)data_out + 2*4)[0] =
+                                    ((npy_longdouble *)data0 + 2*4)[0] +
+                                    ((npy_longdouble *)data_out + 2*4)[0];
+            ((npy_longdouble *)data_out + 2*4)[1] =
+                                    ((npy_longdouble *)data0 + 2*4)[1] +
+                                    ((npy_longdouble *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_longdouble *)data_out + 2*3)[0] =
+                                    ((npy_longdouble *)data0 + 2*3)[0] +
+                                    ((npy_longdouble *)data_out + 2*3)[0];
+            ((npy_longdouble *)data_out + 2*3)[1] =
+                                    ((npy_longdouble *)data0 + 2*3)[1] +
+                                    ((npy_longdouble *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_longdouble *)data_out + 2*2)[0] =
+                                    ((npy_longdouble *)data0 + 2*2)[0] +
+                                    ((npy_longdouble *)data_out + 2*2)[0];
+            ((npy_longdouble *)data_out + 2*2)[1] =
+                                    ((npy_longdouble *)data0 + 2*2)[1] +
+                                    ((npy_longdouble *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_longdouble *)data_out + 2*1)[0] =
+                                    ((npy_longdouble *)data0 + 2*1)[0] +
+                                    ((npy_longdouble *)data_out + 2*1)[0];
+            ((npy_longdouble *)data_out + 2*1)[1] =
+                                    ((npy_longdouble *)data0 + 2*1)[1] +
+                                    ((npy_longdouble *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_longdouble *)data_out + 2*0)[0] =
+                                    ((npy_longdouble *)data0 + 2*0)[0] +
+                                    ((npy_longdouble *)data_out + 2*0)[0];
+            ((npy_longdouble *)data_out + 2*0)[1] =
+                                    ((npy_longdouble *)data0 + 2*0)[1] +
+                                    ((npy_longdouble *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*0)[0] =
+                                ((npy_longdouble *)data0 + 2*0)[0] +
+                                ((npy_longdouble *)data_out + 2*0)[0];
+        ((npy_longdouble *)data_out + 2*0)[1] =
+                                ((npy_longdouble *)data0 + 2*0)[1] +
+                                ((npy_longdouble *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*1)[0] =
+                                ((npy_longdouble *)data0 + 2*1)[0] +
+                                ((npy_longdouble *)data_out + 2*1)[0];
+        ((npy_longdouble *)data_out + 2*1)[1] =
+                                ((npy_longdouble *)data0 + 2*1)[1] +
+                                ((npy_longdouble *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*2)[0] =
+                                ((npy_longdouble *)data0 + 2*2)[0] +
+                                ((npy_longdouble *)data_out + 2*2)[0];
+        ((npy_longdouble *)data_out + 2*2)[1] =
+                                ((npy_longdouble *)data0 + 2*2)[1] +
+                                ((npy_longdouble *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*3)[0] =
+                                ((npy_longdouble *)data0 + 2*3)[0] +
+                                ((npy_longdouble *)data_out + 2*3)[0];
+        ((npy_longdouble *)data_out + 2*3)[1] =
+                                ((npy_longdouble *)data0 + 2*3)[1] +
+                                ((npy_longdouble *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*4)[0] =
+                                ((npy_longdouble *)data0 + 2*4)[0] +
+                                ((npy_longdouble *)data_out + 2*4)[0];
+        ((npy_longdouble *)data_out + 2*4)[1] =
+                                ((npy_longdouble *)data0 + 2*4)[1] +
+                                ((npy_longdouble *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*5)[0] =
+                                ((npy_longdouble *)data0 + 2*5)[0] +
+                                ((npy_longdouble *)data_out + 2*5)[0];
+        ((npy_longdouble *)data_out + 2*5)[1] =
+                                ((npy_longdouble *)data0 + 2*5)[1] +
+                                ((npy_longdouble *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*6)[0] =
+                                ((npy_longdouble *)data0 + 2*6)[0] +
+                                ((npy_longdouble *)data_out + 2*6)[0];
+        ((npy_longdouble *)data_out + 2*6)[1] =
+                                ((npy_longdouble *)data0 + 2*6)[1] +
+                                ((npy_longdouble *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*7)[0] =
+                                ((npy_longdouble *)data0 + 2*7)[0] +
+                                ((npy_longdouble *)data_out + 2*7)[0];
+        ((npy_longdouble *)data_out + 2*7)[1] =
+                                ((npy_longdouble *)data0 + 2*7)[1] +
+                                ((npy_longdouble *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    /* count is now below 8; re-enter the switch above for the remainder. */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 2 == 2 && !0
+
+// calculate the multiply and add operation such as dataout = data*scalar+dataout
+static NPY_GCC_OPT_3 void
+longdouble_sum_of_products_muladd(npy_longdouble *data, npy_longdouble *data_out, npy_longdouble scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_longdouble
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_longdouble;
+    const npyv_longdouble v_scalar = npyv_setall_longdouble(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_longdouble b0 = npyv_loada_longdouble(data + vstep * 0);
+            npyv_longdouble c0 = npyv_loada_longdouble(data_out + vstep * 0);
+            
+#line 312
+            npyv_longdouble b1 = npyv_loada_longdouble(data + vstep * 1);
+            npyv_longdouble c1 = npyv_loada_longdouble(data_out + vstep * 1);
+            
+#line 312
+            npyv_longdouble b2 = npyv_loada_longdouble(data + vstep * 2);
+            npyv_longdouble c2 = npyv_loada_longdouble(data_out + vstep * 2);
+            
+#line 312
+            npyv_longdouble b3 = npyv_loada_longdouble(data + vstep * 3);
+            npyv_longdouble c3 = npyv_loada_longdouble(data_out + vstep * 3);
+            
+            #line 318
+            npyv_longdouble abc0 = npyv_muladd_longdouble(v_scalar, b0, c0);
+            
+#line 318
+            npyv_longdouble abc1 = npyv_muladd_longdouble(v_scalar, b1, c1);
+            
+#line 318
+            npyv_longdouble abc2 = npyv_muladd_longdouble(v_scalar, b2, c2);
+            
+#line 318
+            npyv_longdouble abc3 = npyv_muladd_longdouble(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_longdouble(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_longdouble(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_longdouble(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_longdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_longdouble b0 = npyv_load_longdouble(data + vstep * 0);
+            npyv_longdouble c0 = npyv_load_longdouble(data_out + vstep * 0);
+            
+#line 312
+            npyv_longdouble b1 = npyv_load_longdouble(data + vstep * 1);
+            npyv_longdouble c1 = npyv_load_longdouble(data_out + vstep * 1);
+            
+#line 312
+            npyv_longdouble b2 = npyv_load_longdouble(data + vstep * 2);
+            npyv_longdouble c2 = npyv_load_longdouble(data_out + vstep * 2);
+            
+#line 312
+            npyv_longdouble b3 = npyv_load_longdouble(data + vstep * 3);
+            npyv_longdouble c3 = npyv_load_longdouble(data_out + vstep * 3);
+            
+            #line 318
+            npyv_longdouble abc0 = npyv_muladd_longdouble(v_scalar, b0, c0);
+            
+#line 318
+            npyv_longdouble abc1 = npyv_muladd_longdouble(v_scalar, b1, c1);
+            
+#line 318
+            npyv_longdouble abc2 = npyv_muladd_longdouble(v_scalar, b2, c2);
+            
+#line 318
+            npyv_longdouble abc3 = npyv_muladd_longdouble(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_longdouble(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_longdouble(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_longdouble(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_longdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_longdouble a = npyv_load_tillz_longdouble(data, count);
+        npyv_longdouble b = npyv_load_tillz_longdouble(data_out, count);
+        npyv_store_till_longdouble(data_out, count, npyv_muladd_longdouble(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_longdouble b0 = (data[0]);
+        const npy_longdouble c0 = (data_out[0]);
+        
+#line 340
+        const npy_longdouble b1 = (data[1]);
+        const npy_longdouble c1 = (data_out[1]);
+        
+#line 340
+        const npy_longdouble b2 = (data[2]);
+        const npy_longdouble c2 = (data_out[2]);
+        
+#line 340
+        const npy_longdouble b3 = (data[3]);
+        const npy_longdouble c3 = (data_out[3]);
+        
+        #line 346
+        const npy_longdouble abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_longdouble abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_longdouble abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_longdouble abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_longdouble b = (*data);
+        const npy_longdouble c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_longdouble
+}
+
+static void
+longdouble_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Contiguous two-operand kernel: data_out[i] = data0[i] * data1[i] + data_out[i] for i in [0, count); nop is not read. */
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];      /* first input array */
+    npy_longdouble *data1 = (npy_longdouble *)dataptr[1];      /* second input array */
+    npy_longdouble *data_out = (npy_longdouble *)dataptr[2];   /* output array: each element is read, then rewritten */
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_longdouble: the condition below is the literal 0, so the vector branch is never compiled.
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_longdouble;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_longdouble a0 = npyv_loada_longdouble(data0 + vstep * 0);
+            npyv_longdouble b0 = npyv_loada_longdouble(data1 + vstep * 0);
+            npyv_longdouble c0 = npyv_loada_longdouble(data_out + vstep * 0);
+            
+#line 390
+            npyv_longdouble a1 = npyv_loada_longdouble(data0 + vstep * 1);
+            npyv_longdouble b1 = npyv_loada_longdouble(data1 + vstep * 1);
+            npyv_longdouble c1 = npyv_loada_longdouble(data_out + vstep * 1);
+            
+#line 390
+            npyv_longdouble a2 = npyv_loada_longdouble(data0 + vstep * 2);
+            npyv_longdouble b2 = npyv_loada_longdouble(data1 + vstep * 2);
+            npyv_longdouble c2 = npyv_loada_longdouble(data_out + vstep * 2);
+            
+#line 390
+            npyv_longdouble a3 = npyv_loada_longdouble(data0 + vstep * 3);
+            npyv_longdouble b3 = npyv_loada_longdouble(data1 + vstep * 3);
+            npyv_longdouble c3 = npyv_loada_longdouble(data_out + vstep * 3);
+            
+            #line 397
+            npyv_longdouble abc0 = npyv_muladd_longdouble(a0, b0, c0);
+            
+#line 397
+            npyv_longdouble abc1 = npyv_muladd_longdouble(a1, b1, c1);
+            
+#line 397
+            npyv_longdouble abc2 = npyv_muladd_longdouble(a2, b2, c2);
+            
+#line 397
+            npyv_longdouble abc3 = npyv_muladd_longdouble(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_longdouble(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_longdouble(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_longdouble(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_longdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_longdouble a0 = npyv_load_longdouble(data0 + vstep * 0);
+            npyv_longdouble b0 = npyv_load_longdouble(data1 + vstep * 0);
+            npyv_longdouble c0 = npyv_load_longdouble(data_out + vstep * 0);
+            
+#line 390
+            npyv_longdouble a1 = npyv_load_longdouble(data0 + vstep * 1);
+            npyv_longdouble b1 = npyv_load_longdouble(data1 + vstep * 1);
+            npyv_longdouble c1 = npyv_load_longdouble(data_out + vstep * 1);
+            
+#line 390
+            npyv_longdouble a2 = npyv_load_longdouble(data0 + vstep * 2);
+            npyv_longdouble b2 = npyv_load_longdouble(data1 + vstep * 2);
+            npyv_longdouble c2 = npyv_load_longdouble(data_out + vstep * 2);
+            
+#line 390
+            npyv_longdouble a3 = npyv_load_longdouble(data0 + vstep * 3);
+            npyv_longdouble b3 = npyv_load_longdouble(data1 + vstep * 3);
+            npyv_longdouble c3 = npyv_load_longdouble(data_out + vstep * 3);
+            
+            #line 397
+            npyv_longdouble abc0 = npyv_muladd_longdouble(a0, b0, c0);
+            
+#line 397
+            npyv_longdouble abc1 = npyv_muladd_longdouble(a1, b1, c1);
+            
+#line 397
+            npyv_longdouble abc2 = npyv_muladd_longdouble(a2, b2, c2);
+            
+#line 397
+            npyv_longdouble abc3 = npyv_muladd_longdouble(a3, b3, c3);
+            
+            #line 402
+            npyv_store_longdouble(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_longdouble(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_longdouble(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_longdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_longdouble a = npyv_load_tillz_longdouble(data0, count);
+        npyv_longdouble b = npyv_load_tillz_longdouble(data1, count);
+        npyv_longdouble c = npyv_load_tillz_longdouble(data_out, count);
+        npyv_store_till_longdouble(data_out, count, npyv_muladd_longdouble(a, b, c));
+    }
+    npyv_cleanup();
+#else /* scalar code: the branch that is actually compiled, because the #if condition above is 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION /* main loop, manually unrolled to four elements per iteration */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_longdouble a0 = (data0[0]);
+        const npy_longdouble b0 = (data1[0]);
+        const npy_longdouble c0 = (data_out[0]);
+        
+#line 420
+        const npy_longdouble a1 = (data0[1]);
+        const npy_longdouble b1 = (data1[1]);
+        const npy_longdouble c1 = (data_out[1]);
+        
+#line 420
+        const npy_longdouble a2 = (data0[2]);
+        const npy_longdouble b2 = (data1[2]);
+        const npy_longdouble c2 = (data_out[2]);
+        
+#line 420
+        const npy_longdouble a3 = (data0[3]);
+        const npy_longdouble b3 = (data1[3]);
+        const npy_longdouble c3 = (data_out[3]);
+        
+        #line 427
+        const npy_longdouble abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_longdouble abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_longdouble abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_longdouble abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* leftover elements (or all of them when the unrolled loop is disabled), one per iteration */
+        const npy_longdouble a = (*data0);
+        const npy_longdouble b = (*data1);
+        const npy_longdouble c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_longdouble
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+longdouble_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* dataptr[0] supplies one scalar; dataptr[1] and dataptr[2] are handed on as arrays. */
+    npy_longdouble *const src = (npy_longdouble *)dataptr[1];
+    npy_longdouble *const dst = (npy_longdouble *)dataptr[2];
+    const npy_longdouble factor = *(npy_longdouble *)dataptr[0];
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_stride0_contig_outcontig_two (%d)\n", (int)count);
+    /* The element loop lives in the shared helper (presumably dst[i] += factor * src[i]; see its definition). */
+    longdouble_sum_of_products_muladd(src, dst, factor, count);
+}
+
+static void
+longdouble_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Mirror of the stride0_contig case: here the scalar is read from dataptr[1] and the array is dataptr[0]. */
+    npy_longdouble *const src = (npy_longdouble *)dataptr[0];
+    npy_longdouble *const dst = (npy_longdouble *)dataptr[2];
+    const npy_longdouble factor = *(npy_longdouble *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_stride0_outcontig_two (%d)\n", (int)count);
+    longdouble_sum_of_products_muladd(src, dst, factor, count);
+}
+
+static NPY_GCC_OPT_3 void
+longdouble_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Adds the sum of data0[i] * data1[i] over i in [0, count) to the single element at dataptr[2]; nop is not read. */
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];   /* first contiguous input */
+    npy_longdouble *data1 = (npy_longdouble *)dataptr[1];   /* second contiguous input */
+    npy_longdouble accum = 0;                               /* running sum of products, folded into the output once at the end */
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_longdouble: literal 0, so this vector branch is never compiled
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_longdouble;
+    npyv_longdouble v_accum = npyv_zero_longdouble();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_longdouble a0 = npyv_loada_longdouble(data0 + vstep * 0);
+            npyv_longdouble b0 = npyv_loada_longdouble(data1 + vstep * 0);
+            
+#line 501
+            npyv_longdouble a1 = npyv_loada_longdouble(data0 + vstep * 1);
+            npyv_longdouble b1 = npyv_loada_longdouble(data1 + vstep * 1);
+            
+#line 501
+            npyv_longdouble a2 = npyv_loada_longdouble(data0 + vstep * 2);
+            npyv_longdouble b2 = npyv_loada_longdouble(data1 + vstep * 2);
+            
+#line 501
+            npyv_longdouble a3 = npyv_loada_longdouble(data0 + vstep * 3);
+            npyv_longdouble b3 = npyv_loada_longdouble(data1 + vstep * 3);
+            
+            npyv_longdouble ab3 = npyv_muladd_longdouble(a3, b3, v_accum);
+            npyv_longdouble ab2 = npyv_muladd_longdouble(a2, b2, ab3);
+            npyv_longdouble ab1 = npyv_muladd_longdouble(a1, b1, ab2);
+                   v_accum = npyv_muladd_longdouble(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_longdouble a0 = npyv_load_longdouble(data0 + vstep * 0);
+            npyv_longdouble b0 = npyv_load_longdouble(data1 + vstep * 0);
+            
+#line 501
+            npyv_longdouble a1 = npyv_load_longdouble(data0 + vstep * 1);
+            npyv_longdouble b1 = npyv_load_longdouble(data1 + vstep * 1);
+            
+#line 501
+            npyv_longdouble a2 = npyv_load_longdouble(data0 + vstep * 2);
+            npyv_longdouble b2 = npyv_load_longdouble(data1 + vstep * 2);
+            
+#line 501
+            npyv_longdouble a3 = npyv_load_longdouble(data0 + vstep * 3);
+            npyv_longdouble b3 = npyv_load_longdouble(data1 + vstep * 3);
+            
+            npyv_longdouble ab3 = npyv_muladd_longdouble(a3, b3, v_accum);
+            npyv_longdouble ab2 = npyv_muladd_longdouble(a2, b2, ab3);
+            npyv_longdouble ab1 = npyv_muladd_longdouble(a1, b1, ab2);
+                   v_accum = npyv_muladd_longdouble(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_longdouble a = npyv_load_tillz_longdouble(data0, count);
+        npyv_longdouble b = npyv_load_tillz_longdouble(data1, count);
+        v_accum = npyv_muladd_longdouble(a, b, v_accum);
+    }
+    accum = npyv_sum_longdouble(v_accum);
+    npyv_cleanup();
+#else /* scalar code: the branch that is actually compiled, because the #if condition above is 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION /* main loop, manually unrolled to four products per iteration */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_longdouble ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_longdouble ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_longdouble ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_longdouble ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3; /* the four products are added together first, then folded into accum */
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* leftover elements, one product per iteration */
+        const npy_longdouble a = (*data0);
+        const npy_longdouble b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_longdouble
+    *(npy_longdouble *)dataptr[2] = ((*(npy_longdouble *)dataptr[2]) + accum); /* one read-modify-write of the output element */
+}
+
+static NPY_GCC_OPT_3 void
+longdouble_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* The scalar at dataptr[0] is factored out: reduce dataptr[1] once, scale the result, add it to *dataptr[2]. */
+    const npy_longdouble scale = *(npy_longdouble *)dataptr[0];
+    const npy_longdouble total = longdouble_sum_of_arr((npy_longdouble *)dataptr[1], count);
+    *(npy_longdouble *)dataptr[2] += scale * total;
+}
+
+static NPY_GCC_OPT_3 void
+longdouble_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Same shape as the stride0_contig variant with the roles swapped: array at dataptr[0], scalar at dataptr[1]. */
+    const npy_longdouble total = longdouble_sum_of_arr((npy_longdouble *)dataptr[0], count);
+    const npy_longdouble scale = *(npy_longdouble *)dataptr[1];
+    *(npy_longdouble *)dataptr[2] += scale * total;
+}
+
+#elif 2 == 3 && !0
+
+static void
+longdouble_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Contiguous three-operand kernel: data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]. Sits under `#elif 2 == 3`, which is false, so this copy is never compiled. */
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+    npy_longdouble *data1 = (npy_longdouble *)dataptr[1];
+    npy_longdouble *data2 = (npy_longdouble *)dataptr[2];
+    npy_longdouble *data_out = (npy_longdouble *)dataptr[3]; /* output follows the three inputs in dataptr */
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: count is below 8 here; each step returns once count reaches 0, otherwise handles one more element */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 2 > 3 || @complex */
+
+static void
+longdouble_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Generic contiguous variant: multiplies one element from each of dataptr[0..nop-1] and adds it into dataptr[nop]. NOTE(review): same name as the specialised definition earlier in the file, so only one can be compiled; the opening #if is outside this view -- confirm. */
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0 /* always true: the real-valued body below is the one compiled */
+        npy_longdouble temp = (*(npy_longdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longdouble *)dataptr[i]);
+        }
+        *(npy_longdouble *)dataptr[nop] = (temp +
+                                           (*(npy_longdouble *)dataptr[i])); /* i == nop after the loop, so this reads the current output element */
+        for (i = 0; i <= nop; ++i) { /* advance every operand and the output by one element */
+            dataptr[i] += sizeof(npy_longdouble);
+        }
+#else /* complex */
+#  if 2 <= 3
+#    define _SUMPROD_NOP 2
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_longdouble re, im, tmp;
+        int i;
+        re = ((npy_longdouble *)dataptr[0])[0];
+        im = ((npy_longdouble *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longdouble *)dataptr[i])[0] -
+                  im * ((npy_longdouble *)dataptr[i])[1];
+            im = re * ((npy_longdouble *)dataptr[i])[1] +
+                 im * ((npy_longdouble *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_longdouble);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+
+static NPY_GCC_OPT_3 void
+longdouble_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* One-operand reduction into the single element at dataptr[1]. Guarded by `#if 2 == 1`, which is false, so this copy is never compiled; strides is not read by the body. */
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0 /* always true: real-valued body */
+    npy_longdouble *data = (npy_longdouble *)dataptr[0];
+    npy_longdouble accum = longdouble_sum_of_arr(data, count); /* presumably the sum of count contiguous elements; helper is defined elsewhere */
+    *((npy_longdouble *)dataptr[1]) = (accum + (*((npy_longdouble *)dataptr[1])));
+#else
+    npy_longdouble accum_re = 0, accum_im = 0;
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) { /* four (re, im) pairs per iteration; strict > leaves a final group of four to the tail loop */
+        const npy_longdouble re01 = data0[0] + data0[2];
+        const npy_longdouble re23 = data0[4] + data0[6];
+        const npy_longdouble im13 = data0[1] + data0[3];
+        const npy_longdouble im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_longdouble *)dataptr[1])[0] += accum_re;
+    ((npy_longdouble *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 2 == 1 */
+
+static void
+longdouble_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Strided two-operand reduction: adds the sum over count steps of (*data0) * (*data1) to the single element at dataptr[2]. */
+#if 0 /* complex accumulators: never compiled */
+    npy_longdouble accum_re = 0, accum_im = 0;
+#else
+    npy_longdouble accum = 0; /* running sum of products */
+#endif
+
+#if (2 == 1) || (2 <= 3 && !0) /* true */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !0 /* true */
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !0 /* false: there is no third operand here */
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0
+#  if 2 == 1
+        accum += (*(npy_longdouble *)data0);
+        data0 += stride0;
+#  elif 2 == 2 /* the branch that is actually compiled */
+        accum += (*(npy_longdouble *)data0) *
+                 (*(npy_longdouble *)data1);
+        data0 += stride0; /* data0/data1 are char pointers, so the strides advance them in bytes */
+        data1 += stride1;
+#  elif 2 == 3
+        accum += (*(npy_longdouble *)data0) *
+                 (*(npy_longdouble *)data1) *
+                 (*(npy_longdouble *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_longdouble temp = (*(npy_longdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longdouble *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        accum_re += ((npy_longdouble *)data0)[0];
+        accum_im += ((npy_longdouble *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_longdouble re, im, tmp;
+        int i;
+        re = ((npy_longdouble *)dataptr[0])[0];
+        im = ((npy_longdouble *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longdouble *)dataptr[i])[0] -
+                  im * ((npy_longdouble *)dataptr[i])[1];
+            im = re * ((npy_longdouble *)dataptr[i])[1] +
+                 im * ((npy_longdouble *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 0 /* complex write-back: never compiled */
+#  if 2 <= 3
+    ((npy_longdouble *)dataptr[2])[0] += accum_re;
+    ((npy_longdouble *)dataptr[2])[1] += accum_im;
+#  else
+    ((npy_longdouble *)dataptr[nop])[0] += accum_re;
+    ((npy_longdouble *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 2 <= 3 /* true: the output element is dataptr[2], updated once after the loop */
+    *((npy_longdouble *)dataptr[2]) = (accum +
+                                    (*((npy_longdouble *)dataptr[2])));
+#  else
+    *((npy_longdouble *)dataptr[nop]) = (accum +
+                                    (*((npy_longdouble *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+longdouble_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Strided three-operand kernel: for each of count steps, *data_out = (*data0) * (*data1) * (*data2) + *data_out, then every pointer advances by its stride. */
+#if (3 == 1) || (3 <= 3 && !0) /* true */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !0 /* true */
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !0 /* true */
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (3 == 1) || (3 <= 3 && !0) /* true: the output is the fourth entry of dataptr/strides */
+    char *data_out = dataptr[3];
+    npy_intp stride_out = strides[3];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_three (%d)\n", (int)count);
+
+    while (count--) {
+#if !0
+#  if 3 == 1
+        *(npy_longdouble *)data_out = ((*(npy_longdouble *)data0) +
+                                         (*(npy_longdouble *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 3 == 2
+        *(npy_longdouble *)data_out = ((*(npy_longdouble *)data0) *
+                                         (*(npy_longdouble *)data1) +
+                                         (*(npy_longdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 3 == 3 /* the branch that is actually compiled */
+        *(npy_longdouble *)data_out = ((*(npy_longdouble *)data0) *
+                                         (*(npy_longdouble *)data1) *
+                                         (*(npy_longdouble *)data2) +
+                                         (*(npy_longdouble *)data_out));
+        data0 += stride0; /* pointers are char *, so the strides advance them in bytes */
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_longdouble temp = (*(npy_longdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longdouble *)dataptr[i]);
+        }
+        *(npy_longdouble *)dataptr[nop] = (temp +
+                                           (*(npy_longdouble *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        ((npy_longdouble *)data_out)[0] = ((npy_longdouble *)data0)[0] +
+                                         ((npy_longdouble *)data_out)[0];
+        ((npy_longdouble *)data_out)[1] = ((npy_longdouble *)data0)[1] +
+                                         ((npy_longdouble *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_longdouble re, im, tmp;
+        int i;
+        re = ((npy_longdouble *)dataptr[0])[0];
+        im = ((npy_longdouble *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longdouble *)dataptr[i])[0] -
+                  im * ((npy_longdouble *)dataptr[i])[1];
+            im = re * ((npy_longdouble *)dataptr[i])[1] +
+                 im * ((npy_longdouble *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 3 == 1
+
+static void
+longdouble_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+    npy_longdouble *data_out = (npy_longdouble *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !0
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_longdouble *)data_out + 2*6)[0] =
+                                    ((npy_longdouble *)data0 + 2*6)[0] +
+                                    ((npy_longdouble *)data_out + 2*6)[0];
+            ((npy_longdouble *)data_out + 2*6)[1] =
+                                    ((npy_longdouble *)data0 + 2*6)[1] +
+                                    ((npy_longdouble *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !0
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_longdouble *)data_out + 2*5)[0] =
+                                    ((npy_longdouble *)data0 + 2*5)[0] +
+                                    ((npy_longdouble *)data_out + 2*5)[0];
+            ((npy_longdouble *)data_out + 2*5)[1] =
+                                    ((npy_longdouble *)data0 + 2*5)[1] +
+                                    ((npy_longdouble *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !0
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_longdouble *)data_out + 2*4)[0] =
+                                    ((npy_longdouble *)data0 + 2*4)[0] +
+                                    ((npy_longdouble *)data_out + 2*4)[0];
+            ((npy_longdouble *)data_out + 2*4)[1] =
+                                    ((npy_longdouble *)data0 + 2*4)[1] +
+                                    ((npy_longdouble *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !0
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_longdouble *)data_out + 2*3)[0] =
+                                    ((npy_longdouble *)data0 + 2*3)[0] +
+                                    ((npy_longdouble *)data_out + 2*3)[0];
+            ((npy_longdouble *)data_out + 2*3)[1] =
+                                    ((npy_longdouble *)data0 + 2*3)[1] +
+                                    ((npy_longdouble *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !0
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_longdouble *)data_out + 2*2)[0] =
+                                    ((npy_longdouble *)data0 + 2*2)[0] +
+                                    ((npy_longdouble *)data_out + 2*2)[0];
+            ((npy_longdouble *)data_out + 2*2)[1] =
+                                    ((npy_longdouble *)data0 + 2*2)[1] +
+                                    ((npy_longdouble *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !0
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_longdouble *)data_out + 2*1)[0] =
+                                    ((npy_longdouble *)data0 + 2*1)[0] +
+                                    ((npy_longdouble *)data_out + 2*1)[0];
+            ((npy_longdouble *)data_out + 2*1)[1] =
+                                    ((npy_longdouble *)data0 + 2*1)[1] +
+                                    ((npy_longdouble *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !0
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_longdouble *)data_out + 2*0)[0] =
+                                    ((npy_longdouble *)data0 + 2*0)[0] +
+                                    ((npy_longdouble *)data_out + 2*0)[0];
+            ((npy_longdouble *)data_out + 2*0)[1] =
+                                    ((npy_longdouble *)data0 + 2*0)[1] +
+                                    ((npy_longdouble *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !0
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*0)[0] =
+                                ((npy_longdouble *)data0 + 2*0)[0] +
+                                ((npy_longdouble *)data_out + 2*0)[0];
+        ((npy_longdouble *)data_out + 2*0)[1] =
+                                ((npy_longdouble *)data0 + 2*0)[1] +
+                                ((npy_longdouble *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*1)[0] =
+                                ((npy_longdouble *)data0 + 2*1)[0] +
+                                ((npy_longdouble *)data_out + 2*1)[0];
+        ((npy_longdouble *)data_out + 2*1)[1] =
+                                ((npy_longdouble *)data0 + 2*1)[1] +
+                                ((npy_longdouble *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*2)[0] =
+                                ((npy_longdouble *)data0 + 2*2)[0] +
+                                ((npy_longdouble *)data_out + 2*2)[0];
+        ((npy_longdouble *)data_out + 2*2)[1] =
+                                ((npy_longdouble *)data0 + 2*2)[1] +
+                                ((npy_longdouble *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*3)[0] =
+                                ((npy_longdouble *)data0 + 2*3)[0] +
+                                ((npy_longdouble *)data_out + 2*3)[0];
+        ((npy_longdouble *)data_out + 2*3)[1] =
+                                ((npy_longdouble *)data0 + 2*3)[1] +
+                                ((npy_longdouble *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*4)[0] =
+                                ((npy_longdouble *)data0 + 2*4)[0] +
+                                ((npy_longdouble *)data_out + 2*4)[0];
+        ((npy_longdouble *)data_out + 2*4)[1] =
+                                ((npy_longdouble *)data0 + 2*4)[1] +
+                                ((npy_longdouble *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*5)[0] =
+                                ((npy_longdouble *)data0 + 2*5)[0] +
+                                ((npy_longdouble *)data_out + 2*5)[0];
+        ((npy_longdouble *)data_out + 2*5)[1] =
+                                ((npy_longdouble *)data0 + 2*5)[1] +
+                                ((npy_longdouble *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*6)[0] =
+                                ((npy_longdouble *)data0 + 2*6)[0] +
+                                ((npy_longdouble *)data_out + 2*6)[0];
+        ((npy_longdouble *)data_out + 2*6)[1] =
+                                ((npy_longdouble *)data0 + 2*6)[1] +
+                                ((npy_longdouble *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !0
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*7)[0] =
+                                ((npy_longdouble *)data0 + 2*7)[0] +
+                                ((npy_longdouble *)data_out + 2*7)[0];
+        ((npy_longdouble *)data_out + 2*7)[1] =
+                                ((npy_longdouble *)data0 + 2*7)[1] +
+                                ((npy_longdouble *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 3 == 2 && !0
+
+// Multiply-add over contiguous arrays: data_out[i] = scalar * data[i] + data_out[i] for each of the `count` elements.
+static NPY_GCC_OPT_3 void
+longdouble_sum_of_products_muladd(npy_longdouble *data, npy_longdouble *data_out, npy_longdouble scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_longdouble: the condition is the constant 0, so the npyv_* code up to the matching #else is compiled out
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_longdouble;
+    const npyv_longdouble v_scalar = npyv_setall_longdouble(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_longdouble b0 = npyv_loada_longdouble(data + vstep * 0);
+            npyv_longdouble c0 = npyv_loada_longdouble(data_out + vstep * 0);
+            
+#line 312
+            npyv_longdouble b1 = npyv_loada_longdouble(data + vstep * 1);
+            npyv_longdouble c1 = npyv_loada_longdouble(data_out + vstep * 1);
+            
+#line 312
+            npyv_longdouble b2 = npyv_loada_longdouble(data + vstep * 2);
+            npyv_longdouble c2 = npyv_loada_longdouble(data_out + vstep * 2);
+            
+#line 312
+            npyv_longdouble b3 = npyv_loada_longdouble(data + vstep * 3);
+            npyv_longdouble c3 = npyv_loada_longdouble(data_out + vstep * 3);
+            
+            #line 318
+            npyv_longdouble abc0 = npyv_muladd_longdouble(v_scalar, b0, c0);
+            
+#line 318
+            npyv_longdouble abc1 = npyv_muladd_longdouble(v_scalar, b1, c1);
+            
+#line 318
+            npyv_longdouble abc2 = npyv_muladd_longdouble(v_scalar, b2, c2);
+            
+#line 318
+            npyv_longdouble abc3 = npyv_muladd_longdouble(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_longdouble(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_longdouble(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_longdouble(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_longdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_longdouble b0 = npyv_load_longdouble(data + vstep * 0);
+            npyv_longdouble c0 = npyv_load_longdouble(data_out + vstep * 0);
+            
+#line 312
+            npyv_longdouble b1 = npyv_load_longdouble(data + vstep * 1);
+            npyv_longdouble c1 = npyv_load_longdouble(data_out + vstep * 1);
+            
+#line 312
+            npyv_longdouble b2 = npyv_load_longdouble(data + vstep * 2);
+            npyv_longdouble c2 = npyv_load_longdouble(data_out + vstep * 2);
+            
+#line 312
+            npyv_longdouble b3 = npyv_load_longdouble(data + vstep * 3);
+            npyv_longdouble c3 = npyv_load_longdouble(data_out + vstep * 3);
+            
+            #line 318
+            npyv_longdouble abc0 = npyv_muladd_longdouble(v_scalar, b0, c0);
+            
+#line 318
+            npyv_longdouble abc1 = npyv_muladd_longdouble(v_scalar, b1, c1);
+            
+#line 318
+            npyv_longdouble abc2 = npyv_muladd_longdouble(v_scalar, b2, c2);
+            
+#line 318
+            npyv_longdouble abc3 = npyv_muladd_longdouble(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_longdouble(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_longdouble(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_longdouble(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_longdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_longdouble a = npyv_load_tillz_longdouble(data, count);
+        npyv_longdouble b = npyv_load_tillz_longdouble(data_out, count);
+        npyv_store_till_longdouble(data_out, count, npyv_muladd_longdouble(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else // plain C path, selected because the NPYV check above is 0
+#ifndef NPY_DISABLE_OPTIMIZATION // main loop manually unrolled to handle 4 elements per iteration
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_longdouble b0 = (data[0]);
+        const npy_longdouble c0 = (data_out[0]);
+        
+#line 340
+        const npy_longdouble b1 = (data[1]);
+        const npy_longdouble c1 = (data_out[1]);
+        
+#line 340
+        const npy_longdouble b2 = (data[2]);
+        const npy_longdouble c2 = (data_out[2]);
+        
+#line 340
+        const npy_longdouble b3 = (data[3]);
+        const npy_longdouble c3 = (data_out[3]);
+        
+        #line 346
+        const npy_longdouble abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_longdouble abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_longdouble abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_longdouble abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { // leftover elements (all of them when the unrolled loop is disabled)
+        const npy_longdouble b = (*data);
+        const npy_longdouble c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_longdouble
+}
+
+static void /* Two contiguous inputs, contiguous output: data_out[i] = data0[i] * data1[i] + data_out[i] for count elements. */
+longdouble_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* nop is not referenced in this body; the operands are taken from dataptr[0], dataptr[1] and dataptr[2] */
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+    npy_longdouble *data1 = (npy_longdouble *)dataptr[1];
+    npy_longdouble *data_out = (npy_longdouble *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_longdouble
+#if 0 /* constant 0: the npyv_* code up to the matching #else is compiled out */
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_longdouble;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_longdouble a0 = npyv_loada_longdouble(data0 + vstep * 0);
+            npyv_longdouble b0 = npyv_loada_longdouble(data1 + vstep * 0);
+            npyv_longdouble c0 = npyv_loada_longdouble(data_out + vstep * 0);
+            
+#line 390
+            npyv_longdouble a1 = npyv_loada_longdouble(data0 + vstep * 1);
+            npyv_longdouble b1 = npyv_loada_longdouble(data1 + vstep * 1);
+            npyv_longdouble c1 = npyv_loada_longdouble(data_out + vstep * 1);
+            
+#line 390
+            npyv_longdouble a2 = npyv_loada_longdouble(data0 + vstep * 2);
+            npyv_longdouble b2 = npyv_loada_longdouble(data1 + vstep * 2);
+            npyv_longdouble c2 = npyv_loada_longdouble(data_out + vstep * 2);
+            
+#line 390
+            npyv_longdouble a3 = npyv_loada_longdouble(data0 + vstep * 3);
+            npyv_longdouble b3 = npyv_loada_longdouble(data1 + vstep * 3);
+            npyv_longdouble c3 = npyv_loada_longdouble(data_out + vstep * 3);
+            
+            #line 397
+            npyv_longdouble abc0 = npyv_muladd_longdouble(a0, b0, c0);
+            
+#line 397
+            npyv_longdouble abc1 = npyv_muladd_longdouble(a1, b1, c1);
+            
+#line 397
+            npyv_longdouble abc2 = npyv_muladd_longdouble(a2, b2, c2);
+            
+#line 397
+            npyv_longdouble abc3 = npyv_muladd_longdouble(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_longdouble(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_longdouble(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_longdouble(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_longdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_longdouble a0 = npyv_load_longdouble(data0 + vstep * 0);
+            npyv_longdouble b0 = npyv_load_longdouble(data1 + vstep * 0);
+            npyv_longdouble c0 = npyv_load_longdouble(data_out + vstep * 0);
+            
+#line 390
+            npyv_longdouble a1 = npyv_load_longdouble(data0 + vstep * 1);
+            npyv_longdouble b1 = npyv_load_longdouble(data1 + vstep * 1);
+            npyv_longdouble c1 = npyv_load_longdouble(data_out + vstep * 1);
+            
+#line 390
+            npyv_longdouble a2 = npyv_load_longdouble(data0 + vstep * 2);
+            npyv_longdouble b2 = npyv_load_longdouble(data1 + vstep * 2);
+            npyv_longdouble c2 = npyv_load_longdouble(data_out + vstep * 2);
+            
+#line 390
+            npyv_longdouble a3 = npyv_load_longdouble(data0 + vstep * 3);
+            npyv_longdouble b3 = npyv_load_longdouble(data1 + vstep * 3);
+            npyv_longdouble c3 = npyv_load_longdouble(data_out + vstep * 3);
+            
+            #line 397
+            npyv_longdouble abc0 = npyv_muladd_longdouble(a0, b0, c0);
+            
+#line 397
+            npyv_longdouble abc1 = npyv_muladd_longdouble(a1, b1, c1);
+            
+#line 397
+            npyv_longdouble abc2 = npyv_muladd_longdouble(a2, b2, c2);
+            
+#line 397
+            npyv_longdouble abc3 = npyv_muladd_longdouble(a3, b3, c3);
+            
+            #line 402
+            npyv_store_longdouble(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_longdouble(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_longdouble(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_longdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_longdouble a = npyv_load_tillz_longdouble(data0, count);
+        npyv_longdouble b = npyv_load_tillz_longdouble(data1, count);
+        npyv_longdouble c = npyv_load_tillz_longdouble(data_out, count);
+        npyv_store_till_longdouble(data_out, count, npyv_muladd_longdouble(a, b, c));
+    }
+    npyv_cleanup();
+#else // plain C path, selected because the NPYV check above is 0
+#ifndef NPY_DISABLE_OPTIMIZATION // main loop manually unrolled to handle 4 elements per iteration
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_longdouble a0 = (data0[0]);
+        const npy_longdouble b0 = (data1[0]);
+        const npy_longdouble c0 = (data_out[0]);
+        
+#line 420
+        const npy_longdouble a1 = (data0[1]);
+        const npy_longdouble b1 = (data1[1]);
+        const npy_longdouble c1 = (data_out[1]);
+        
+#line 420
+        const npy_longdouble a2 = (data0[2]);
+        const npy_longdouble b2 = (data1[2]);
+        const npy_longdouble c2 = (data_out[2]);
+        
+#line 420
+        const npy_longdouble a3 = (data0[3]);
+        const npy_longdouble b3 = (data1[3]);
+        const npy_longdouble c3 = (data_out[3]);
+        
+        #line 427
+        const npy_longdouble abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_longdouble abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_longdouble abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_longdouble abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { // leftover elements (all of them when the unrolled loop is disabled)
+        const npy_longdouble a = (*data0);
+        const npy_longdouble b = (*data1);
+        const npy_longdouble c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_longdouble
+
+}
+
+/* Some extra specializations for the two operand case */
+static void /* dataptr[0] is read once as a scalar; dataptr[1] and dataptr[2] are handed to the muladd kernel as arrays of count elements */
+longdouble_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble value0 = (*(npy_longdouble *)dataptr[0]);
+    npy_longdouble *data1 = (npy_longdouble *)dataptr[1];
+    npy_longdouble *data_out = (npy_longdouble *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    longdouble_sum_of_products_muladd(data1, data_out, value0, count); /* data_out[i] = value0 * data1[i] + data_out[i] */
+    
+}
+
+static void /* dataptr[1] is read once as a scalar; dataptr[0] and dataptr[2] are handed to the muladd kernel as arrays of count elements */
+longdouble_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble value1 = (*(npy_longdouble *)dataptr[1]);
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+    npy_longdouble *data_out = (npy_longdouble *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    longdouble_sum_of_products_muladd(data0, data_out, value1, count); /* data_out[i] = value1 * data0[i] + data_out[i] */
+}
+
+static NPY_GCC_OPT_3 void /* Accumulates sum(data0[i] * data1[i]) over count elements and adds it to the single value at dataptr[2]. */
+longdouble_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+    npy_longdouble *data1 = (npy_longdouble *)dataptr[1];
+    npy_longdouble accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_longdouble: the condition is the constant 0, so the npyv_* code up to the matching #else is compiled out
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_longdouble;
+    npyv_longdouble v_accum = npyv_zero_longdouble();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_longdouble a0 = npyv_loada_longdouble(data0 + vstep * 0);
+            npyv_longdouble b0 = npyv_loada_longdouble(data1 + vstep * 0);
+            
+#line 501
+            npyv_longdouble a1 = npyv_loada_longdouble(data0 + vstep * 1);
+            npyv_longdouble b1 = npyv_loada_longdouble(data1 + vstep * 1);
+            
+#line 501
+            npyv_longdouble a2 = npyv_loada_longdouble(data0 + vstep * 2);
+            npyv_longdouble b2 = npyv_loada_longdouble(data1 + vstep * 2);
+            
+#line 501
+            npyv_longdouble a3 = npyv_loada_longdouble(data0 + vstep * 3);
+            npyv_longdouble b3 = npyv_loada_longdouble(data1 + vstep * 3);
+            
+            npyv_longdouble ab3 = npyv_muladd_longdouble(a3, b3, v_accum);
+            npyv_longdouble ab2 = npyv_muladd_longdouble(a2, b2, ab3);
+            npyv_longdouble ab1 = npyv_muladd_longdouble(a1, b1, ab2);
+                   v_accum = npyv_muladd_longdouble(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_longdouble a0 = npyv_load_longdouble(data0 + vstep * 0);
+            npyv_longdouble b0 = npyv_load_longdouble(data1 + vstep * 0);
+            
+#line 501
+            npyv_longdouble a1 = npyv_load_longdouble(data0 + vstep * 1);
+            npyv_longdouble b1 = npyv_load_longdouble(data1 + vstep * 1);
+            
+#line 501
+            npyv_longdouble a2 = npyv_load_longdouble(data0 + vstep * 2);
+            npyv_longdouble b2 = npyv_load_longdouble(data1 + vstep * 2);
+            
+#line 501
+            npyv_longdouble a3 = npyv_load_longdouble(data0 + vstep * 3);
+            npyv_longdouble b3 = npyv_load_longdouble(data1 + vstep * 3);
+            
+            npyv_longdouble ab3 = npyv_muladd_longdouble(a3, b3, v_accum);
+            npyv_longdouble ab2 = npyv_muladd_longdouble(a2, b2, ab3);
+            npyv_longdouble ab1 = npyv_muladd_longdouble(a1, b1, ab2);
+                   v_accum = npyv_muladd_longdouble(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_longdouble a = npyv_load_tillz_longdouble(data0, count);
+        npyv_longdouble b = npyv_load_tillz_longdouble(data1, count);
+        v_accum = npyv_muladd_longdouble(a, b, v_accum);
+    }
+    accum = npyv_sum_longdouble(v_accum);
+    npyv_cleanup();
+#else // plain C path, selected because the NPYV check above is 0
+#ifndef NPY_DISABLE_OPTIMIZATION // main loop manually unrolled: four products are summed and added to accum per iteration
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_longdouble ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_longdouble ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_longdouble ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_longdouble ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { // leftover elements (all of them when the unrolled loop is disabled)
+        const npy_longdouble a = (*data0);
+        const npy_longdouble b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_longdouble
+    *(npy_longdouble *)dataptr[2] = ((*(npy_longdouble *)dataptr[2]) + accum); /* add the local total to the single output value */
+}
+
+static NPY_GCC_OPT_3 void /* Adds value0 * accum to the single value at dataptr[2], where value0 is read once from dataptr[0]. */
+longdouble_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *data1 = (npy_longdouble *)dataptr[1];
+    npy_longdouble value0 = (*(npy_longdouble *)dataptr[0]);
+    npy_longdouble accum = longdouble_sum_of_arr(data1, count); /* helper defined outside this chunk; presumably the sum of count elements of data1 - confirm */
+    *(npy_longdouble *)dataptr[2] = ((*(npy_longdouble *)dataptr[2]) + value0 * accum);
+}
+
+static NPY_GCC_OPT_3 void /* Adds value1 * accum to the single value at dataptr[2], where value1 is read once from dataptr[1]. */
+longdouble_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+    npy_longdouble value1 = (*(npy_longdouble *)dataptr[1]);
+    npy_longdouble accum = longdouble_sum_of_arr(data0, count); /* helper defined outside this chunk; presumably the sum of count elements of data0 - confirm */
+    *(npy_longdouble *)dataptr[2] = ((*(npy_longdouble *)dataptr[2]) + value1 * accum);
+}
+
+#elif 3 == 3 && !0
+
+static void /* Three contiguous inputs, contiguous output: data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i] for count elements. */
+longdouble_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+    npy_longdouble *data1 = (npy_longdouble *)dataptr[1];
+    npy_longdouble *data2 = (npy_longdouble *)dataptr[2];
+    npy_longdouble *data_out = (npy_longdouble *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: handle the elements left over, one per step, returning as soon as count reaches 0 */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+    /* NOTE: for count >= 0 on entry the data_out[7] step above is never reached - at most 7 elements remain after the unrolled loop. */
+}
+
+#else /* 3 > 3 || @complex */
+
+static void /* Generic definition under the #else branch: one output element per iteration, advancing the dataptr[] entries in place. */
+longdouble_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_three (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !0 /* !0 evaluates to 1, so the real-valued path below is the one selected */
+        npy_longdouble temp = (*(npy_longdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) { /* product of the current element of inputs 0..nop-1 */
+            temp *= (*(npy_longdouble *)dataptr[i]);
+        }
+        *(npy_longdouble *)dataptr[nop] = (temp +
+                                           (*(npy_longdouble *)dataptr[i])); /* i == nop after the loop, so this reads the output operand */
+        for (i = 0; i <= nop; ++i) { /* step every input and the output forward by one element */
+            dataptr[i] += sizeof(npy_longdouble);
+        }
+#else /* complex */
+#  if 3 <= 3
+#    define _SUMPROD_NOP 3
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_longdouble re, im, tmp;
+        int i;
+        re = ((npy_longdouble *)dataptr[0])[0];
+        im = ((npy_longdouble *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longdouble *)dataptr[i])[0] -
+                  im * ((npy_longdouble *)dataptr[i])[1];
+            im = re * ((npy_longdouble *)dataptr[i])[1] +
+                 im * ((npy_longdouble *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_longdouble);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 3 */
+
+#if 3 == 1
+
+static NPY_GCC_OPT_3 void /* Reduces one contiguous input of count elements into the single output value at dataptr[1]. */
+longdouble_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0 /* !0 evaluates to 1, so the real-valued path below is the one selected */
+    npy_longdouble *data = (npy_longdouble *)dataptr[0];
+    npy_longdouble accum = longdouble_sum_of_arr(data, count); /* helper defined outside this chunk; presumably the sum of count elements - confirm */
+    *((npy_longdouble *)dataptr[1]) = (accum + (*((npy_longdouble *)dataptr[1])));
+#else /* interleaved path: each element is a (data0[0], data0[1]) pair, accumulated into accum_re / accum_im */
+    npy_longdouble accum_re = 0, accum_im = 0;
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) { /* NOTE: the test is `> 4`, so a final group of exactly 4 is left to the loop below */
+        const npy_longdouble re01 = data0[0] + data0[2];
+        const npy_longdouble re23 = data0[4] + data0[6];
+        const npy_longdouble im13 = data0[1] + data0[3];
+        const npy_longdouble im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_longdouble *)dataptr[1])[0] += accum_re;
+    ((npy_longdouble *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 3 == 1 */
+
+/*
+ * Three-operand sum of products reduced into a single output element.
+ *
+ * For each of `count` steps one npy_longdouble is read from each of
+ * dataptr[0], dataptr[1] and dataptr[2]; their product is added to a local
+ * accumulator.  After the loop the accumulated total is added to the element
+ * stored at dataptr[3].
+ *
+ * The inputs are walked through local byte pointers advanced by strides[0..2]
+ * (strides are added to `char *`), so the caller's `dataptr` array itself is
+ * left untouched.  `nop` is not read: the operand count is fixed at three.
+ */
+static void
+longdouble_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    npy_longdouble accum = 0;
+
+    char *data0 = dataptr[0];
+    char *data1 = dataptr[1];
+    char *data2 = dataptr[2];
+    npy_intp stride0 = strides[0];
+    npy_intp stride1 = strides[1];
+    npy_intp stride2 = strides[2];
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+        accum += (*(npy_longdouble *)data0) *
+                 (*(npy_longdouble *)data1) *
+                 (*(npy_longdouble *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+    }
+
+    /* Fold the reduction into the existing output value. */
+    *((npy_longdouble *)dataptr[3]) = (accum +
+                                    (*((npy_longdouble *)dataptr[3])));
+}
+
+
+#line 131
+/*
+ * Generic sum of products for an arbitrary number of operands.
+ *
+ * dataptr[0 .. nop-1] are the inputs and dataptr[nop] is the output.  For
+ * each of `count` steps the current npy_longdouble of every input is
+ * multiplied together and the product is added to the current output
+ * element.
+ *
+ * Side effect: the entries dataptr[0 .. nop] are advanced in place by
+ * strides[0 .. nop] (byte offsets, added to `char *`) on every step.
+ */
+static void
+longdouble_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_any (%d)\n", (int)count);
+
+    while (count--) {
+        npy_longdouble temp = (*(npy_longdouble *)dataptr[0]);
+        int i;
+
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longdouble *)dataptr[i]);
+        }
+        /*
+         * Index the output with `nop` on both sides instead of relying on
+         * the loop counter happening to equal `nop` after the loop.
+         */
+        *(npy_longdouble *)dataptr[nop] = (temp +
+                                           (*(npy_longdouble *)dataptr[nop]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+    }
+}
+
+#if 1000 == 1
+
+/*
+ * One contiguous operand accumulated into a contiguous output:
+ *     data_out[i] = data0[i] + data_out[i]   for i in [0, count)
+ * with data0 = dataptr[0] and data_out = dataptr[1], both treated as
+ * npy_longdouble arrays.  `nop` and `strides` are not read.
+ *
+ * Full groups of eight are handled by a manually unrolled loop; the 0..7
+ * leftover elements are handled by a fall-through switch that updates the
+ * highest remaining index first.  A `count` outside that range after the
+ * loop (i.e. a negative count) does nothing instead of looping forever.
+ */
+static void
+longdouble_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+    npy_longdouble *data_out = (npy_longdouble *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+    /* Unroll the loop by 8 */
+    for (; count >= 8; count -= 8, data0 += 8, data_out += 8) {
+        data_out[0] = data0[0] + data_out[0];
+        data_out[1] = data0[1] + data_out[1];
+        data_out[2] = data0[2] + data_out[2];
+        data_out[3] = data0[3] + data_out[3];
+        data_out[4] = data0[4] + data_out[4];
+        data_out[5] = data0[5] + data_out[5];
+        data_out[6] = data0[6] + data_out[6];
+        data_out[7] = data0[7] + data_out[7];
+    }
+
+    /* Finish off the remainder: enter at `count`, fall through down to 1 */
+    switch (count) {
+        case 7:
+            data_out[6] = data0[6] + data_out[6];
+            /* fall through */
+        case 6:
+            data_out[5] = data0[5] + data_out[5];
+            /* fall through */
+        case 5:
+            data_out[4] = data0[4] + data_out[4];
+            /* fall through */
+        case 4:
+            data_out[3] = data0[3] + data_out[3];
+            /* fall through */
+        case 3:
+            data_out[2] = data0[2] + data_out[2];
+            /* fall through */
+        case 2:
+            data_out[1] = data0[1] + data_out[1];
+            /* fall through */
+        case 1:
+            data_out[0] = data0[0] + data_out[0];
+            /* fall through */
+        default:
+            return;
+    }
+}
+
+#elif 1000 == 2 && !0
+
+/*
+ * Scaled accumulate over two contiguous npy_longdouble arrays:
+ *     data_out[i] = scalar * data[i] + data_out[i]   for i in [0, count)
+ *
+ * No NPYV (vectorised) path is provided for npy_longdouble, so only scalar
+ * loops are used.  Unless NPY_DISABLE_OPTIMIZATION is defined, groups of
+ * four elements are processed per iteration; within a group all loads
+ * happen before any store.  A one-element-at-a-time loop finishes the rest.
+ */
+static NPY_GCC_OPT_3 void
+longdouble_sum_of_products_muladd(npy_longdouble *data, npy_longdouble *data_out, npy_longdouble scalar, npy_intp count)
+{
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        /* load */
+        const npy_longdouble b0 = data[0];
+        const npy_longdouble c0 = data_out[0];
+        const npy_longdouble b1 = data[1];
+        const npy_longdouble c1 = data_out[1];
+        const npy_longdouble b2 = data[2];
+        const npy_longdouble c2 = data_out[2];
+        const npy_longdouble b3 = data[3];
+        const npy_longdouble c3 = data_out[3];
+
+        /* compute */
+        const npy_longdouble abc0 = scalar * b0 + c0;
+        const npy_longdouble abc1 = scalar * b1 + c1;
+        const npy_longdouble abc2 = scalar * b2 + c2;
+        const npy_longdouble abc3 = scalar * b3 + c3;
+
+        /* store */
+        data_out[0] = abc0;
+        data_out[1] = abc1;
+        data_out[2] = abc2;
+        data_out[3] = abc3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_longdouble b = (*data);
+        const npy_longdouble c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+}
+
+/*
+ * Two contiguous operands multiplied element-wise and accumulated into a
+ * contiguous output:
+ *     data_out[i] = data0[i] * data1[i] + data_out[i]   for i in [0, count)
+ * with data0 = dataptr[0], data1 = dataptr[1], data_out = dataptr[2], all
+ * treated as npy_longdouble arrays.  `nop` and `strides` are not read.
+ *
+ * No NPYV (vectorised) path is provided for npy_longdouble, so only scalar
+ * loops are used.  Unless NPY_DISABLE_OPTIMIZATION is defined, groups of
+ * four elements are processed per iteration; within a group all loads
+ * happen before any store.  A one-element-at-a-time loop finishes the rest.
+ */
+static void
+longdouble_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+    npy_longdouble *data1 = (npy_longdouble *)dataptr[1];
+    npy_longdouble *data_out = (npy_longdouble *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        /* load */
+        const npy_longdouble a0 = data0[0];
+        const npy_longdouble b0 = data1[0];
+        const npy_longdouble c0 = data_out[0];
+        const npy_longdouble a1 = data0[1];
+        const npy_longdouble b1 = data1[1];
+        const npy_longdouble c1 = data_out[1];
+        const npy_longdouble a2 = data0[2];
+        const npy_longdouble b2 = data1[2];
+        const npy_longdouble c2 = data_out[2];
+        const npy_longdouble a3 = data0[3];
+        const npy_longdouble b3 = data1[3];
+        const npy_longdouble c3 = data_out[3];
+
+        /* compute */
+        const npy_longdouble abc0 = a0 * b0 + c0;
+        const npy_longdouble abc1 = a1 * b1 + c1;
+        const npy_longdouble abc2 = a2 * b2 + c2;
+        const npy_longdouble abc3 = a3 * b3 + c3;
+
+        /* store */
+        data_out[0] = abc0;
+        data_out[1] = abc1;
+        data_out[2] = abc2;
+        data_out[3] = abc3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_longdouble a = (*data0);
+        const npy_longdouble b = (*data1);
+        const npy_longdouble c = (*data_out);
+        *data_out = (a * b + c);
+    }
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand case where the first operand is a single value and both the
+ * second operand and the output are contiguous.  Reads one npy_longdouble
+ * from dataptr[0] and hands it, together with the arrays at dataptr[1]
+ * (input) and dataptr[2] (output), to longdouble_sum_of_products_muladd().
+ */
+static void
+longdouble_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *src = (npy_longdouble *)dataptr[1];
+    npy_longdouble *dst = (npy_longdouble *)dataptr[2];
+    const npy_longdouble factor = *(npy_longdouble *)dataptr[0];
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+
+    longdouble_sum_of_products_muladd(src, dst, factor, count);
+}
+
+/*
+ * Two-operand case where the second operand is a single value and both the
+ * first operand and the output are contiguous.  Reads one npy_longdouble
+ * from dataptr[1] and hands it, together with the arrays at dataptr[0]
+ * (input) and dataptr[2] (output), to longdouble_sum_of_products_muladd().
+ */
+static void
+longdouble_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *src = (npy_longdouble *)dataptr[0];
+    npy_longdouble *dst = (npy_longdouble *)dataptr[2];
+    const npy_longdouble factor = *(npy_longdouble *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+
+    longdouble_sum_of_products_muladd(src, dst, factor, count);
+}
+
+/*
+ * Dot-product style reduction of two contiguous operands into a single
+ * output element: accumulates data0[i] * data1[i] for i in [0, count) and
+ * adds the total to the npy_longdouble stored at dataptr[2].
+ * data0 = dataptr[0], data1 = dataptr[1].  `nop` and `strides` are not read.
+ *
+ * No NPYV (vectorised) path is provided for npy_longdouble, so only scalar
+ * loops are used.  Unless NPY_DISABLE_OPTIMIZATION is defined, four products
+ * are formed per iteration and added to the accumulator as
+ * ab0 + ab1 + ab2 + ab3; a one-element-at-a-time loop finishes the rest.
+ */
+static NPY_GCC_OPT_3 void
+longdouble_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+    npy_longdouble *data1 = (npy_longdouble *)dataptr[1];
+    npy_longdouble accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        const npy_longdouble ab0 = (data0[0]) * (data1[0]);
+        const npy_longdouble ab1 = (data0[1]) * (data1[1]);
+        const npy_longdouble ab2 = (data0[2]) * (data1[2]);
+        const npy_longdouble ab3 = (data0[3]) * (data1[3]);
+
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_longdouble a = (*data0);
+        const npy_longdouble b = (*data1);
+        accum += a * b;
+    }
+
+    /* Fold the reduction into the existing output value. */
+    *(npy_longdouble *)dataptr[2] = ((*(npy_longdouble *)dataptr[2]) + accum);
+}
+
+static NPY_GCC_OPT_3 void
+longdouble_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* *dataptr[2] += (*dataptr[0]) * sum(dataptr[1][0..count)); operand 0 is read once as a scalar. */
+    const npy_longdouble scalar = *(npy_longdouble *)dataptr[0];
+    const npy_longdouble total = longdouble_sum_of_arr((npy_longdouble *)dataptr[1], count);
+    *(npy_longdouble *)dataptr[2] += scalar * total;
+}
+
+static NPY_GCC_OPT_3 void
+longdouble_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* *dataptr[2] += (*dataptr[1]) * sum(dataptr[0][0..count)); operand 1 is read once as a scalar. */
+    const npy_longdouble scalar = *(npy_longdouble *)dataptr[1];
+    const npy_longdouble total = longdouble_sum_of_arr((npy_longdouble *)dataptr[0], count);
+    *(npy_longdouble *)dataptr[2] += scalar * total;
+}
+
+#elif 1000 == 3 && !0
+/* data_out[i] += data0[i] * data1[i] * data2[i] for i in [0, count); all four arrays are indexed contiguously. */
+static void
+longdouble_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+    npy_longdouble *data1 = (npy_longdouble *)dataptr[1];
+    npy_longdouble *data2 = (npy_longdouble *)dataptr[2];
+    npy_longdouble *data_out = (npy_longdouble *)dataptr[3];
+    /* dataptr[0..2] are the three inputs and dataptr[3] the output; nop and strides are never read here. */
+    /* Main loop: eight elements per pass, written out by hand */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    /* count < 8 here; each step below returns once count reaches zero, otherwise handles one more element. */
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1000 > 3 || @complex */
+/* Per element: *dataptr[nop] += product of *dataptr[0..nop-1]; every pointer then steps by one element. */
+static void
+longdouble_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+    /* '#if !0' is always true, so only the first (real-valued) branch of the loop body is compiled. */
+    while (count--) {
+#if !0
+        npy_longdouble temp = (*(npy_longdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longdouble *)dataptr[i]);
+        }
+        *(npy_longdouble *)dataptr[nop] = (temp +
+                                           (*(npy_longdouble *)dataptr[i])); /* i == nop here when nop >= 1 */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_longdouble); /* advances the caller's pointer array in place */
+        }
+#else /* complex */
+#  if 1000 <= 3
+#    define _SUMPROD_NOP 1000
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_longdouble re, im, tmp;
+        int i;
+        re = ((npy_longdouble *)dataptr[0])[0];
+        im = ((npy_longdouble *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longdouble *)dataptr[i])[0] -
+                  im * ((npy_longdouble *)dataptr[i])[1];
+            im = re * ((npy_longdouble *)dataptr[i])[1] +
+                 im * ((npy_longdouble *)dataptr[i])[0];
+            re = tmp; /* tmp keeps the old re alive while im is updated */
+        }
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_longdouble);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+/* Adds the sum of the count values at dataptr[0] to the single value at dataptr[1]; sits under '#if 1000 == 1'. */
+static NPY_GCC_OPT_3 void
+longdouble_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !0
+    npy_longdouble *data = (npy_longdouble *)dataptr[0];
+    npy_longdouble accum = longdouble_sum_of_arr(data, count);
+    *((npy_longdouble *)dataptr[1]) = (accum + (*((npy_longdouble *)dataptr[1])));
+#else /* two-component variant; never compiled because '#if !0' above is always true */
+    npy_longdouble accum_re = 0, accum_im = 0;
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) { /* four (re, im) pairs, i.e. eight scalars, per pass */
+        const npy_longdouble re01 = data0[0] + data0[2];
+        const npy_longdouble re23 = data0[4] + data0[6];
+        const npy_longdouble im13 = data0[1] + data0[3];
+        const npy_longdouble im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) { /* leftovers, one pair at a time */
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_longdouble *)dataptr[1])[0] += accum_re;
+    ((npy_longdouble *)dataptr[1])[1] += accum_im;
+#endif // !0
+}
+
+#endif /* 1000 == 1 */
+/* Sums the product of the nop inputs over count steps, then adds that total to the value at dataptr[nop]. */
+static void
+longdouble_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 0
+    npy_longdouble accum_re = 0, accum_im = 0;
+#else
+    npy_longdouble accum = 0;
+#endif
+    /* With the constant 1000, none of the three '#if' groups below declares anything (1000 is not 1, 2 or 3). */
+#if (1000 == 1) || (1000 <= 3 && !0)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !0
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !0
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("longdouble_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+    /* Of the loop-body variants below, only the generic '#  else' under '#if !0' is compiled here. */
+    while (count--) {
+#if !0
+#  if 1000 == 1
+        accum += (*(npy_longdouble *)data0);
+        data0 += stride0;
+#  elif 1000 == 2
+        accum += (*(npy_longdouble *)data0) *
+                 (*(npy_longdouble *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1000 == 3
+        accum += (*(npy_longdouble *)data0) *
+                 (*(npy_longdouble *)data1) *
+                 (*(npy_longdouble *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_longdouble temp = (*(npy_longdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_longdouble *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i]; /* inputs only: dataptr[nop], the output, is not advanced */
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        accum_re += ((npy_longdouble *)data0)[0];
+        accum_im += ((npy_longdouble *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_longdouble re, im, tmp;
+        int i;
+        re = ((npy_longdouble *)dataptr[0])[0];
+        im = ((npy_longdouble *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longdouble *)dataptr[i])[0] -
+                  im * ((npy_longdouble *)dataptr[i])[1];
+            im = re * ((npy_longdouble *)dataptr[i])[1] +
+                 im * ((npy_longdouble *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* The output is written once, after the loop; here that is the final '#  else' (dataptr[nop]) form. */
+#if 0
+#  if 1000 <= 3
+    ((npy_longdouble *)dataptr[1000])[0] += accum_re;
+    ((npy_longdouble *)dataptr[1000])[1] += accum_im;
+#  else
+    ((npy_longdouble *)dataptr[nop])[0] += accum_re;
+    ((npy_longdouble *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1000 <= 3
+    *((npy_longdouble *)dataptr[1000]) = (accum +
+                                    (*((npy_longdouble *)dataptr[1000])));
+#  else
+    *((npy_longdouble *)dataptr[nop]) = (accum +
+                                    (*((npy_longdouble *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+
+
+#line 74
+/* Returns the sum of data[0..count). The whole definition is under '#if !1', so it is never compiled here. */
+#if !1
+static NPY_GCC_OPT_3 npy_float cfloat_sum_of_arr(npy_cfloat *data, npy_intp count)
+{
+    npy_float accum = 0;
+#if 0 // NPYV check for npy_cfloat
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data);
+    const int vstep = npyv_nlanes_f32;
+    npyv_f32 v_accum = npyv_zero_f32();
+    const npy_intp vstepx4 = vstep * 4;
+    /* Both branches below are identical except for aligned (loada) versus plain (load) vector loads. */
+    #line 91
+    if(is_aligned) {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_f32 a0 = npyv_loada_f32(data + vstep * 0);
+            
+#line 96
+            npyv_f32 a1 = npyv_loada_f32(data + vstep * 1);
+            
+#line 96
+            npyv_f32 a2 = npyv_loada_f32(data + vstep * 2);
+            
+#line 96
+            npyv_f32 a3 = npyv_loada_f32(data + vstep * 3);
+            
+            npyv_f32 a01   = npyv_add_f32(a0, a1);
+            npyv_f32 a23   = npyv_add_f32(a2, a3);
+            npyv_f32 a0123 = npyv_add_f32(a01, a23);
+                     v_accum = npyv_add_f32(a0123, v_accum);
+        }
+    }
+    
+#line 91
+    else {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_f32 a0 = npyv_load_f32(data + vstep * 0);
+            
+#line 96
+            npyv_f32 a1 = npyv_load_f32(data + vstep * 1);
+            
+#line 96
+            npyv_f32 a2 = npyv_load_f32(data + vstep * 2);
+            
+#line 96
+            npyv_f32 a3 = npyv_load_f32(data + vstep * 3);
+            
+            npyv_f32 a01   = npyv_add_f32(a0, a1);
+            npyv_f32 a23   = npyv_add_f32(a2, a3);
+            npyv_f32 a0123 = npyv_add_f32(a01, a23);
+                     v_accum = npyv_add_f32(a0123, v_accum);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep) { /* remainder, loaded with the count-limited tillz load */
+        npyv_f32 a = npyv_load_tillz_f32(data, count);
+        v_accum = npyv_add_f32(a, v_accum);
+    }
+    accum = npyv_sum_f32(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data += 4) { /* strict '>' leaves 1..4 elements for the loop below */
+        const npy_float a01 = (*data) + (data[1]);
+        const npy_float a23 = (data[2]) + (data[3]);
+        accum +=  a01 + a23;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data++) {
+        accum += (*data);
+    }
+#endif // NPYV check for npy_cfloat
+    return accum;
+}
+#endif
+/* For count steps: adds the two npy_float parts at data0 into data_out, stepping by strides[0] / strides[1] bytes. */
+#line 131
+static void
+cfloat_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1 == 1) || (1 <= 3 && !1)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !1
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !1
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1 == 1) || (1 <= 3 && !1)
+    char *data_out = dataptr[1];
+    npy_intp stride_out = strides[1];
+#endif
+    /* Only data0/stride0 and data_out/stride_out exist: the first and last '#if' above hold, the middle two do not. */
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_one (%d)\n", (int)count);
+    /* '#if !1' is false, so the loop body compiles to the '#  if 1 == 1' part of the complex branch only. */
+    while (count--) {
+#if !1
+#  if 1 == 1
+        *(npy_cfloat *)data_out = ((*(npy_cfloat *)data0) +
+                                         (*(npy_cfloat *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1 == 2
+        *(npy_cfloat *)data_out = ((*(npy_cfloat *)data0) *
+                                         (*(npy_cfloat *)data1) +
+                                         (*(npy_cfloat *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1 == 3
+        *(npy_cfloat *)data_out = ((*(npy_cfloat *)data0) *
+                                         (*(npy_cfloat *)data1) *
+                                         (*(npy_cfloat *)data2) +
+                                         (*(npy_cfloat *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_float temp = (*(npy_cfloat *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cfloat *)dataptr[i]);
+        }
+        *(npy_cfloat *)dataptr[nop] = (temp +
+                                           (*(npy_cfloat *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        ((npy_float *)data_out)[0] = ((npy_float *)data0)[0] +
+                                         ((npy_float *)data_out)[0]; /* first part (presumably real) */
+        ((npy_float *)data_out)[1] = ((npy_float *)data0)[1] +
+                                         ((npy_float *)data_out)[1]; /* second part (presumably imaginary) */
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1 == 1
+/* data_out[i] += data0[i] for i in [0, count), adding the two npy_float parts of each element separately. */
+static void
+cfloat_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0];
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[1];
+    /* '#if !1' is always false below, so every element update compiles to its '#else' two-float form. */
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+    /* Cases fall through on purpose: entering at 'case k' updates elements k-1 down to 0, then returns. */
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !1
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_float *)data_out + 2*6)[0] =
+                                    ((npy_float *)data0 + 2*6)[0] +
+                                    ((npy_float *)data_out + 2*6)[0]; /* element k starts at float offset 2*k */
+            ((npy_float *)data_out + 2*6)[1] =
+                                    ((npy_float *)data0 + 2*6)[1] +
+                                    ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !1
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_float *)data_out + 2*5)[0] =
+                                    ((npy_float *)data0 + 2*5)[0] +
+                                    ((npy_float *)data_out + 2*5)[0];
+            ((npy_float *)data_out + 2*5)[1] =
+                                    ((npy_float *)data0 + 2*5)[1] +
+                                    ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !1
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_float *)data_out + 2*4)[0] =
+                                    ((npy_float *)data0 + 2*4)[0] +
+                                    ((npy_float *)data_out + 2*4)[0];
+            ((npy_float *)data_out + 2*4)[1] =
+                                    ((npy_float *)data0 + 2*4)[1] +
+                                    ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !1
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_float *)data_out + 2*3)[0] =
+                                    ((npy_float *)data0 + 2*3)[0] +
+                                    ((npy_float *)data_out + 2*3)[0];
+            ((npy_float *)data_out + 2*3)[1] =
+                                    ((npy_float *)data0 + 2*3)[1] +
+                                    ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !1
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_float *)data_out + 2*2)[0] =
+                                    ((npy_float *)data0 + 2*2)[0] +
+                                    ((npy_float *)data_out + 2*2)[0];
+            ((npy_float *)data_out + 2*2)[1] =
+                                    ((npy_float *)data0 + 2*2)[1] +
+                                    ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !1
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_float *)data_out + 2*1)[0] =
+                                    ((npy_float *)data0 + 2*1)[0] +
+                                    ((npy_float *)data_out + 2*1)[0];
+            ((npy_float *)data_out + 2*1)[1] =
+                                    ((npy_float *)data0 + 2*1)[1] +
+                                    ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !1
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_float *)data_out + 2*0)[0] =
+                                    ((npy_float *)data0 + 2*0)[0] +
+                                    ((npy_float *)data_out + 2*0)[0];
+            ((npy_float *)data_out + 2*0)[1] =
+                                    ((npy_float *)data0 + 2*0)[1] +
+                                    ((npy_float *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+    /* count >= 8 matched no case: handle 8 elements per pass, then jump back to the switch for the rest. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !1
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_float *)data_out + 2*0)[0] =
+                                ((npy_float *)data0 + 2*0)[0] +
+                                ((npy_float *)data_out + 2*0)[0];
+        ((npy_float *)data_out + 2*0)[1] =
+                                ((npy_float *)data0 + 2*0)[1] +
+                                ((npy_float *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_float *)data_out + 2*1)[0] =
+                                ((npy_float *)data0 + 2*1)[0] +
+                                ((npy_float *)data_out + 2*1)[0];
+        ((npy_float *)data_out + 2*1)[1] =
+                                ((npy_float *)data0 + 2*1)[1] +
+                                ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_float *)data_out + 2*2)[0] =
+                                ((npy_float *)data0 + 2*2)[0] +
+                                ((npy_float *)data_out + 2*2)[0];
+        ((npy_float *)data_out + 2*2)[1] =
+                                ((npy_float *)data0 + 2*2)[1] +
+                                ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_float *)data_out + 2*3)[0] =
+                                ((npy_float *)data0 + 2*3)[0] +
+                                ((npy_float *)data_out + 2*3)[0];
+        ((npy_float *)data_out + 2*3)[1] =
+                                ((npy_float *)data0 + 2*3)[1] +
+                                ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_float *)data_out + 2*4)[0] =
+                                ((npy_float *)data0 + 2*4)[0] +
+                                ((npy_float *)data_out + 2*4)[0];
+        ((npy_float *)data_out + 2*4)[1] =
+                                ((npy_float *)data0 + 2*4)[1] +
+                                ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_float *)data_out + 2*5)[0] =
+                                ((npy_float *)data0 + 2*5)[0] +
+                                ((npy_float *)data_out + 2*5)[0];
+        ((npy_float *)data_out + 2*5)[1] =
+                                ((npy_float *)data0 + 2*5)[1] +
+                                ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_float *)data_out + 2*6)[0] =
+                                ((npy_float *)data0 + 2*6)[0] +
+                                ((npy_float *)data_out + 2*6)[0];
+        ((npy_float *)data_out + 2*6)[1] =
+                                ((npy_float *)data0 + 2*6)[1] +
+                                ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_float *)data_out + 2*7)[0] =
+                                ((npy_float *)data0 + 2*7)[0] +
+                                ((npy_float *)data_out + 2*7)[0];
+        ((npy_float *)data_out + 2*7)[1] =
+                                ((npy_float *)data0 + 2*7)[1] +
+                                ((npy_float *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+    /* NOTE(review): a negative count matches no case and would cycle through this goto forever; presumably callers never pass one. */
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1 == 2 && !1
+
+// Computes data_out[i] = scalar * data[i] + data_out[i] for i in [0, count); sits in the '#elif 1 == 2 && !1' branch.
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_muladd(npy_cfloat *data, npy_cfloat *data_out, npy_float scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_cfloat (constant 0 here, so the vector path below is not compiled)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f32;
+    const npyv_f32 v_scalar = npyv_setall_f32(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f32 b0 = npyv_loada_f32(data + vstep * 0);
+            npyv_f32 c0 = npyv_loada_f32(data_out + vstep * 0);
+            
+#line 312
+            npyv_f32 b1 = npyv_loada_f32(data + vstep * 1);
+            npyv_f32 c1 = npyv_loada_f32(data_out + vstep * 1);
+            
+#line 312
+            npyv_f32 b2 = npyv_loada_f32(data + vstep * 2);
+            npyv_f32 c2 = npyv_loada_f32(data_out + vstep * 2);
+            
+#line 312
+            npyv_f32 b3 = npyv_loada_f32(data + vstep * 3);
+            npyv_f32 c3 = npyv_loada_f32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f32 abc0 = npyv_muladd_f32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f32 abc1 = npyv_muladd_f32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f32 abc2 = npyv_muladd_f32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f32 abc3 = npyv_muladd_f32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_f32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f32 b0 = npyv_load_f32(data + vstep * 0);
+            npyv_f32 c0 = npyv_load_f32(data_out + vstep * 0);
+            
+#line 312
+            npyv_f32 b1 = npyv_load_f32(data + vstep * 1);
+            npyv_f32 c1 = npyv_load_f32(data_out + vstep * 1);
+            
+#line 312
+            npyv_f32 b2 = npyv_load_f32(data + vstep * 2);
+            npyv_f32 c2 = npyv_load_f32(data_out + vstep * 2);
+            
+#line 312
+            npyv_f32 b3 = npyv_load_f32(data + vstep * 3);
+            npyv_f32 c3 = npyv_load_f32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f32 abc0 = npyv_muladd_f32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f32 abc1 = npyv_muladd_f32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f32 abc2 = npyv_muladd_f32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f32 abc3 = npyv_muladd_f32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_f32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) { /* remainder via count-limited load/store */
+        npyv_f32 a = npyv_load_tillz_f32(data, count);
+        npyv_f32 b = npyv_load_tillz_f32(data_out, count);
+        npyv_store_till_f32(data_out, count, npyv_muladd_f32(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { /* four elements per pass: load, multiply-add, store */
+        #line 340
+        const npy_float b0 = (data[0]);
+        const npy_float c0 = (data_out[0]);
+        
+#line 340
+        const npy_float b1 = (data[1]);
+        const npy_float c1 = (data_out[1]);
+        
+#line 340
+        const npy_float b2 = (data[2]);
+        const npy_float c2 = (data_out[2]);
+        
+#line 340
+        const npy_float b3 = (data[3]);
+        const npy_float c3 = (data_out[3]);
+        
+        #line 346
+        const npy_float abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_float abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_float abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_float abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* leftovers (or everything when the block above is disabled) */
+        const npy_float b = (*data);
+        const npy_float c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_cfloat
+}
+
+static void
+cfloat_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Accumulates data0[i] * data1[i] into data_out[i] for count contiguous elements; nop is not read in this body. */
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0]; /* first input operand */
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1]; /* second input operand */
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[2]; /* output operand: read, then overwritten with the sum */
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_cfloat
+#if 0 /* constant-false: the npyv vector path below is never compiled for this type */
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f32;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f32 a0 = npyv_loada_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_loada_f32(data1 + vstep * 0);
+            npyv_f32 c0 = npyv_loada_f32(data_out + vstep * 0);
+            
+#line 390
+            npyv_f32 a1 = npyv_loada_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_loada_f32(data1 + vstep * 1);
+            npyv_f32 c1 = npyv_loada_f32(data_out + vstep * 1);
+            
+#line 390
+            npyv_f32 a2 = npyv_loada_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_loada_f32(data1 + vstep * 2);
+            npyv_f32 c2 = npyv_loada_f32(data_out + vstep * 2);
+            
+#line 390
+            npyv_f32 a3 = npyv_loada_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_loada_f32(data1 + vstep * 3);
+            npyv_f32 c3 = npyv_loada_f32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f32 abc0 = npyv_muladd_f32(a0, b0, c0);
+            
+#line 397
+            npyv_f32 abc1 = npyv_muladd_f32(a1, b1, c1);
+            
+#line 397
+            npyv_f32 abc2 = npyv_muladd_f32(a2, b2, c2);
+            
+#line 397
+            npyv_f32 abc3 = npyv_muladd_f32(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_f32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f32 a0 = npyv_load_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_load_f32(data1 + vstep * 0);
+            npyv_f32 c0 = npyv_load_f32(data_out + vstep * 0);
+            
+#line 390
+            npyv_f32 a1 = npyv_load_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_load_f32(data1 + vstep * 1);
+            npyv_f32 c1 = npyv_load_f32(data_out + vstep * 1);
+            
+#line 390
+            npyv_f32 a2 = npyv_load_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_load_f32(data1 + vstep * 2);
+            npyv_f32 c2 = npyv_load_f32(data_out + vstep * 2);
+            
+#line 390
+            npyv_f32 a3 = npyv_load_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_load_f32(data1 + vstep * 3);
+            npyv_f32 c3 = npyv_load_f32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f32 abc0 = npyv_muladd_f32(a0, b0, c0);
+            
+#line 397
+            npyv_f32 abc1 = npyv_muladd_f32(a1, b1, c1);
+            
+#line 397
+            npyv_f32 abc2 = npyv_muladd_f32(a2, b2, c2);
+            
+#line 397
+            npyv_f32 abc3 = npyv_muladd_f32(a3, b3, c3);
+            
+            #line 402
+            npyv_store_f32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data0, count);
+        npyv_f32 b = npyv_load_tillz_f32(data1, count);
+        npyv_f32 c = npyv_load_tillz_f32(data_out, count);
+        npyv_store_till_f32(data_out, count, npyv_muladd_f32(a, b, c));
+    }
+    npyv_cleanup();
+#else /* NOTE(review): this path copies npy_cfloat elements into npy_float locals; presumably an enclosing #if (not visible here) keeps it from compiling for complex types - confirm */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { /* four elements per iteration: load, multiply-add, store */
+        #line 420
+        const npy_float a0 = (data0[0]);
+        const npy_float b0 = (data1[0]);
+        const npy_float c0 = (data_out[0]);
+        
+#line 420
+        const npy_float a1 = (data0[1]);
+        const npy_float b1 = (data1[1]);
+        const npy_float c1 = (data_out[1]);
+        
+#line 420
+        const npy_float a2 = (data0[2]);
+        const npy_float b2 = (data1[2]);
+        const npy_float c2 = (data_out[2]);
+        
+#line 420
+        const npy_float a3 = (data0[3]);
+        const npy_float b3 = (data1[3]);
+        const npy_float c3 = (data_out[3]);
+        
+        #line 427
+        const npy_float abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_float abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_float abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_float abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* remaining elements, one per iteration */
+        const npy_float a = (*data0);
+        const npy_float b = (*data1);
+        const npy_float c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_cfloat
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+cfloat_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Operand 0 is read once as a scalar; the multiply-add over operand 1 and the output is delegated to the muladd helper. nop is not read. */
+    npy_float value0 = (*(npy_cfloat *)dataptr[0]); /* NOTE(review): npy_cfloat value stored in an npy_float - presumably this branch is not compiled for complex types; confirm */
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1]; /* contiguous input operand */
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[2]; /* contiguous output operand */
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    cfloat_sum_of_products_muladd(data1, data_out, value0, count); /* helper arguments: (data, data_out, scalar, count) */
+    
+}
+
+static void
+cfloat_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Operand 1 is read once as a scalar; the multiply-add over operand 0 and the output is delegated to the muladd helper. nop is not read. */
+    npy_float value1 = (*(npy_cfloat *)dataptr[1]); /* NOTE(review): npy_cfloat value stored in an npy_float - presumably this branch is not compiled for complex types; confirm */
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0]; /* contiguous input operand */
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[2]; /* contiguous output operand */
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    cfloat_sum_of_products_muladd(data0, data_out, value1, count); /* helper arguments: (data, data_out, scalar, count) */
+}
+
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Sums data0[i] * data1[i] over count contiguous elements and adds the total to the single element at dataptr[2]; nop is not read. */
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0]; /* first input operand */
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1]; /* second input operand */
+    npy_float accum = 0; /* running sum of products */
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_cfloat
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_f32;
+    npyv_f32 v_accum = npyv_zero_f32();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f32 a0 = npyv_loada_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_loada_f32(data1 + vstep * 0);
+            
+#line 501
+            npyv_f32 a1 = npyv_loada_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_loada_f32(data1 + vstep * 1);
+            
+#line 501
+            npyv_f32 a2 = npyv_loada_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_loada_f32(data1 + vstep * 2);
+            
+#line 501
+            npyv_f32 a3 = npyv_loada_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_loada_f32(data1 + vstep * 3);
+            
+            npyv_f32 ab3 = npyv_muladd_f32(a3, b3, v_accum);
+            npyv_f32 ab2 = npyv_muladd_f32(a2, b2, ab3);
+            npyv_f32 ab1 = npyv_muladd_f32(a1, b1, ab2);
+                   v_accum = npyv_muladd_f32(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f32 a0 = npyv_load_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_load_f32(data1 + vstep * 0);
+            
+#line 501
+            npyv_f32 a1 = npyv_load_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_load_f32(data1 + vstep * 1);
+            
+#line 501
+            npyv_f32 a2 = npyv_load_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_load_f32(data1 + vstep * 2);
+            
+#line 501
+            npyv_f32 a3 = npyv_load_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_load_f32(data1 + vstep * 3);
+            
+            npyv_f32 ab3 = npyv_muladd_f32(a3, b3, v_accum);
+            npyv_f32 ab2 = npyv_muladd_f32(a2, b2, ab3);
+            npyv_f32 ab1 = npyv_muladd_f32(a1, b1, ab2);
+                   v_accum = npyv_muladd_f32(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data0, count);
+        npyv_f32 b = npyv_load_tillz_f32(data1, count);
+        v_accum = npyv_muladd_f32(a, b, v_accum);
+    }
+    accum = npyv_sum_f32(v_accum);
+    npyv_cleanup();
+#else /* path used because the #if above is constant-false */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) { /* four products per iteration, summed before being added to accum */
+        #line 524
+        const npy_float ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_float ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_float ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_float ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* remaining elements, one per iteration */
+        const npy_float a = (*data0);
+        const npy_float b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_cfloat
+    *(npy_cfloat *)dataptr[2] = ((*(npy_cfloat *)dataptr[2]) + accum); /* add the total to the existing output value */
+}
+
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Adds value0 * accum to the single output element at dataptr[2]; nop is not read. */
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1]; /* contiguous input operand */
+    npy_float value0 = (*(npy_cfloat *)dataptr[0]); /* operand 0 is read once and used as a scalar */
+    npy_float accum = cfloat_sum_of_arr(data1, count); /* presumably the sum of the count elements of data1; helper is defined outside this view - confirm */
+    *(npy_cfloat *)dataptr[2] = ((*(npy_cfloat *)dataptr[2]) + value0 * accum);
+}
+
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Adds value1 * accum to the single output element at dataptr[2]; nop is not read. */
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0]; /* contiguous input operand */
+    npy_float value1 = (*(npy_cfloat *)dataptr[1]); /* operand 1 is read once and used as a scalar */
+    npy_float accum = cfloat_sum_of_arr(data0, count); /* presumably the sum of the count elements of data0; helper is defined outside this view - confirm */
+    *(npy_cfloat *)dataptr[2] = ((*(npy_cfloat *)dataptr[2]) + value1 * accum);
+}
+
+#elif 1 == 3 && !1
+
+static void
+cfloat_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Accumulates data0[i] * data1[i] * data2[i] into data_out[i] for count contiguous elements; nop is not read. */
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0]; /* first input operand */
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1]; /* second input operand */
+    npy_cfloat *data2 = (npy_cfloat *)dataptr[2]; /* third input operand */
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[3]; /* output operand: read, then overwritten with the sum */
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+
+#line 592
+    if (count-- == 0) { /* each step below returns once the remaining count is used up, otherwise handles one more element */
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1 > 3 || @complex */
+
+static void
+cfloat_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Per element: adds the complex value at dataptr[0] to the one at dataptr[1], then advances both pointers by sizeof(npy_cfloat). */
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !1 /* constant-false: the real-type body below is not compiled */
+        npy_float temp = (*(npy_cfloat *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cfloat *)dataptr[i]);
+        }
+        *(npy_cfloat *)dataptr[nop] = (temp +
+                                           (*(npy_cfloat *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_cfloat);
+        }
+#else /* complex */
+#  if 1 <= 3 /* constant-true: the operand count is fixed at 1 instead of using nop */
+#    define _SUMPROD_NOP 1
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_float re, im, tmp; /* real and imaginary parts of the running product */
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) { /* runs zero times when _SUMPROD_NOP is 1 */
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) { /* <= so the output pointer advances as well */
+            dataptr[i] += sizeof(npy_cfloat);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1 */
+
+#if 1 == 1
+
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{ /* Sums count contiguous complex values from dataptr[0] and adds the total to the single value at dataptr[1]; nop and strides are not read. */
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !1 /* constant-false: the real-type body below is not compiled */
+    npy_cfloat *data = (npy_cfloat *)dataptr[0];
+    npy_float accum = cfloat_sum_of_arr(data, count);
+    *((npy_cfloat *)dataptr[1]) = (accum + (*((npy_cfloat *)dataptr[1])));
+#else
+    npy_float accum_re = 0, accum_im = 0; /* separate running sums for real and imaginary parts */
+    npy_float *data0 = (npy_float *)dataptr[0]; /* input viewed as interleaved floats: even index = real, odd index = imaginary */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) { /* four complex values (eight floats) per iteration; stops with up to 4 left for the loop below */
+        const npy_float re01 = data0[0] + data0[2];
+        const npy_float re23 = data0[4] + data0[6];
+        const npy_float im13 = data0[1] + data0[3];
+        const npy_float im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) { /* remaining values, one complex value per iteration */
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_float *)dataptr[1])[0] += accum_re;
+    ((npy_float *)dataptr[1])[1] += accum_im;
+#endif // !1
+}
+
+#endif /* 1 == 1 */
+
+static void
+cfloat_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{ /* Sums count complex values read from dataptr[0] at a step of strides[0] bytes and adds the total to the single value at dataptr[1]. */
+#if 1 /* constant-true: complex accumulators are used */
+    npy_float accum_re = 0, accum_im = 0;
+#else
+    npy_float accum = 0;
+#endif
+
+#if (1 == 1) || (1 <= 3 && !1) /* constant-true */
+    char *data0 = dataptr[0]; /* byte pointer so the stride can be added directly */
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !1
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !1
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !1 /* constant-false: the real-type bodies below are not compiled */
+#  if 1 == 1
+        accum += (*(npy_cfloat *)data0);
+        data0 += stride0;
+#  elif 1 == 2
+        accum += (*(npy_cfloat *)data0) *
+                 (*(npy_cfloat *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1 == 3
+        accum += (*(npy_cfloat *)data0) *
+                 (*(npy_cfloat *)data1) *
+                 (*(npy_cfloat *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_float temp = (*(npy_cfloat *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cfloat *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1 /* constant-true: this is the loop body that is compiled */
+        accum_re += ((npy_float *)data0)[0];
+        accum_im += ((npy_float *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 1
+#  if 1 <= 3 /* constant-true: the output is at the fixed index 1 rather than dataptr[nop] */
+    ((npy_float *)dataptr[1])[0] += accum_re;
+    ((npy_float *)dataptr[1])[1] += accum_im;
+#  else
+    ((npy_float *)dataptr[nop])[0] += accum_re;
+    ((npy_float *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1 <= 3
+    *((npy_cfloat *)dataptr[1]) = (accum +
+                                    (*((npy_cfloat *)dataptr[1])));
+#  else
+    *((npy_cfloat *)dataptr[nop]) = (accum +
+                                    (*((npy_cfloat *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+cfloat_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{ /* Per element: complex-multiplies the values at dataptr[0] and dataptr[1], adds the product to dataptr[2], then advances dataptr[0..2] by their strides. */
+#if (2 == 1) || (2 <= 3 && !1) /* constant-false, like the three conditions after it: no local pointer copies are declared */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !1
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !1
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (2 == 1) || (2 <= 3 && !1)
+    char *data_out = dataptr[2];
+    npy_intp stride_out = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_two (%d)\n", (int)count);
+
+    while (count--) {
+#if !1 /* constant-false: the real-type bodies below are not compiled */
+#  if 2 == 1
+        *(npy_cfloat *)data_out = ((*(npy_cfloat *)data0) +
+                                         (*(npy_cfloat *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 2 == 2
+        *(npy_cfloat *)data_out = ((*(npy_cfloat *)data0) *
+                                         (*(npy_cfloat *)data1) +
+                                         (*(npy_cfloat *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 2 == 3
+        *(npy_cfloat *)data_out = ((*(npy_cfloat *)data0) *
+                                         (*(npy_cfloat *)data1) *
+                                         (*(npy_cfloat *)data2) +
+                                         (*(npy_cfloat *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_float temp = (*(npy_cfloat *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cfloat *)dataptr[i]);
+        }
+        *(npy_cfloat *)dataptr[nop] = (temp +
+                                           (*(npy_cfloat *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        ((npy_float *)data_out)[0] = ((npy_float *)data0)[0] +
+                                         ((npy_float *)data_out)[0];
+        ((npy_float *)data_out)[1] = ((npy_float *)data0)[1] +
+                                         ((npy_float *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 2 <= 3 /* constant-true: the operand count is fixed at 2 instead of using nop */
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp; /* real and imaginary parts of the running product */
+        int i;
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) { /* complex multiply by operand i; tmp holds the new real part until im has been updated */
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) { /* <= so the output pointer advances as well */
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 2 == 1
+
+static void
+cfloat_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Adds each of count contiguous complex values in data0 to the matching value in data_out; nop is not read. Sits under a constant-false "#if 2 == 1". */
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0]; /* input operand */
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[1]; /* output operand: read, then overwritten with the sum */
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) { /* counts 1..7 enter at the matching case and fall through to case 0, which returns; other counts skip the switch */
+#line 246
+        case 6+1:
+#if !1 /* constant-false here and in every case below: the complex #else body is the one compiled */
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_float *)data_out + 2*6)[0] =
+                                    ((npy_float *)data0 + 2*6)[0] +
+                                    ((npy_float *)data_out + 2*6)[0];
+            ((npy_float *)data_out + 2*6)[1] =
+                                    ((npy_float *)data0 + 2*6)[1] +
+                                    ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !1
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_float *)data_out + 2*5)[0] =
+                                    ((npy_float *)data0 + 2*5)[0] +
+                                    ((npy_float *)data_out + 2*5)[0];
+            ((npy_float *)data_out + 2*5)[1] =
+                                    ((npy_float *)data0 + 2*5)[1] +
+                                    ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !1
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_float *)data_out + 2*4)[0] =
+                                    ((npy_float *)data0 + 2*4)[0] +
+                                    ((npy_float *)data_out + 2*4)[0];
+            ((npy_float *)data_out + 2*4)[1] =
+                                    ((npy_float *)data0 + 2*4)[1] +
+                                    ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !1
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_float *)data_out + 2*3)[0] =
+                                    ((npy_float *)data0 + 2*3)[0] +
+                                    ((npy_float *)data_out + 2*3)[0];
+            ((npy_float *)data_out + 2*3)[1] =
+                                    ((npy_float *)data0 + 2*3)[1] +
+                                    ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !1
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_float *)data_out + 2*2)[0] =
+                                    ((npy_float *)data0 + 2*2)[0] +
+                                    ((npy_float *)data_out + 2*2)[0];
+            ((npy_float *)data_out + 2*2)[1] =
+                                    ((npy_float *)data0 + 2*2)[1] +
+                                    ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !1
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_float *)data_out + 2*1)[0] =
+                                    ((npy_float *)data0 + 2*1)[0] +
+                                    ((npy_float *)data_out + 2*1)[0];
+            ((npy_float *)data_out + 2*1)[1] =
+                                    ((npy_float *)data0 + 2*1)[1] +
+                                    ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !1
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_float *)data_out + 2*0)[0] =
+                                    ((npy_float *)data0 + 2*0)[0] +
+                                    ((npy_float *)data_out + 2*0)[0];
+            ((npy_float *)data_out + 2*0)[1] =
+                                    ((npy_float *)data0 + 2*0)[1] +
+                                    ((npy_float *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !1
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_float *)data_out + 2*0)[0] =
+                                ((npy_float *)data0 + 2*0)[0] +
+                                ((npy_float *)data_out + 2*0)[0];
+        ((npy_float *)data_out + 2*0)[1] =
+                                ((npy_float *)data0 + 2*0)[1] +
+                                ((npy_float *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_float *)data_out + 2*1)[0] =
+                                ((npy_float *)data0 + 2*1)[0] +
+                                ((npy_float *)data_out + 2*1)[0];
+        ((npy_float *)data_out + 2*1)[1] =
+                                ((npy_float *)data0 + 2*1)[1] +
+                                ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_float *)data_out + 2*2)[0] =
+                                ((npy_float *)data0 + 2*2)[0] +
+                                ((npy_float *)data_out + 2*2)[0];
+        ((npy_float *)data_out + 2*2)[1] =
+                                ((npy_float *)data0 + 2*2)[1] +
+                                ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_float *)data_out + 2*3)[0] =
+                                ((npy_float *)data0 + 2*3)[0] +
+                                ((npy_float *)data_out + 2*3)[0];
+        ((npy_float *)data_out + 2*3)[1] =
+                                ((npy_float *)data0 + 2*3)[1] +
+                                ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_float *)data_out + 2*4)[0] =
+                                ((npy_float *)data0 + 2*4)[0] +
+                                ((npy_float *)data_out + 2*4)[0];
+        ((npy_float *)data_out + 2*4)[1] =
+                                ((npy_float *)data0 + 2*4)[1] +
+                                ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_float *)data_out + 2*5)[0] =
+                                ((npy_float *)data0 + 2*5)[0] +
+                                ((npy_float *)data_out + 2*5)[0];
+        ((npy_float *)data_out + 2*5)[1] =
+                                ((npy_float *)data0 + 2*5)[1] +
+                                ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_float *)data_out + 2*6)[0] =
+                                ((npy_float *)data0 + 2*6)[0] +
+                                ((npy_float *)data_out + 2*6)[0];
+        ((npy_float *)data_out + 2*6)[1] =
+                                ((npy_float *)data0 + 2*6)[1] +
+                                ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_float *)data_out + 2*7)[0] =
+                                ((npy_float *)data0 + 2*7)[0] +
+                                ((npy_float *)data_out + 2*7)[0];
+        ((npy_float *)data_out + 2*7)[1] =
+                                ((npy_float *)data0 + 2*7)[1] +
+                                ((npy_float *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop; /* count is now below 8, so the switch handles the remainder and returns */
+}
+
+#elif 2 == 2 && !1
+
+// Multiply-add helper: data_out[i] = scalar * data[i] + data_out[i] for `count` elements (updates data_out in place).
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_muladd(npy_cfloat *data, npy_cfloat *data_out, npy_float scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_cfloat: constant 0 in this instantiation, so the SIMD branch below is compiled out
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f32;
+    const npyv_f32 v_scalar = npyv_setall_f32(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f32 b0 = npyv_loada_f32(data + vstep * 0);
+            npyv_f32 c0 = npyv_loada_f32(data_out + vstep * 0);
+            
+#line 312
+            npyv_f32 b1 = npyv_loada_f32(data + vstep * 1);
+            npyv_f32 c1 = npyv_loada_f32(data_out + vstep * 1);
+            
+#line 312
+            npyv_f32 b2 = npyv_loada_f32(data + vstep * 2);
+            npyv_f32 c2 = npyv_loada_f32(data_out + vstep * 2);
+            
+#line 312
+            npyv_f32 b3 = npyv_loada_f32(data + vstep * 3);
+            npyv_f32 c3 = npyv_loada_f32(data_out + vstep * 3);
+            /* combine each loaded pair with the broadcast scalar via npyv_muladd_f32 */
+            #line 318
+            npyv_f32 abc0 = npyv_muladd_f32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f32 abc1 = npyv_muladd_f32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f32 abc2 = npyv_muladd_f32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f32 abc3 = npyv_muladd_f32(v_scalar, b3, c3);
+            /* store the four result vectors back over data_out */
+            #line 323
+            npyv_storea_f32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* otherwise: identical 4-vector unrolling, using npyv_load_f32/npyv_store_f32 instead of the *a variants */
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f32 b0 = npyv_load_f32(data + vstep * 0);
+            npyv_f32 c0 = npyv_load_f32(data_out + vstep * 0);
+            
+#line 312
+            npyv_f32 b1 = npyv_load_f32(data + vstep * 1);
+            npyv_f32 c1 = npyv_load_f32(data_out + vstep * 1);
+            
+#line 312
+            npyv_f32 b2 = npyv_load_f32(data + vstep * 2);
+            npyv_f32 c2 = npyv_load_f32(data_out + vstep * 2);
+            
+#line 312
+            npyv_f32 b3 = npyv_load_f32(data + vstep * 3);
+            npyv_f32 c3 = npyv_load_f32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f32 abc0 = npyv_muladd_f32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f32 abc1 = npyv_muladd_f32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f32 abc2 = npyv_muladd_f32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f32 abc3 = npyv_muladd_f32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_f32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* tail: one vector per pass through the *_tillz / *_till helpers (presumably limited to `count` lanes - confirm) */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data, count);
+        npyv_f32 b = npyv_load_tillz_f32(data_out, count);
+        npyv_store_till_f32(data_out, count, npyv_muladd_f32(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* scalar path: the only one kept, because the #if above is 0 */
+#ifndef NPY_DISABLE_OPTIMIZATION /* manual 4x unrolling, skipped when optimization is disabled */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_float b0 = (data[0]);
+        const npy_float c0 = (data_out[0]);
+        
+#line 340
+        const npy_float b1 = (data[1]);
+        const npy_float c1 = (data_out[1]);
+        
+#line 340
+        const npy_float b2 = (data[2]);
+        const npy_float c2 = (data_out[2]);
+        
+#line 340
+        const npy_float b3 = (data[3]);
+        const npy_float c3 = (data_out[3]);
+        /* scalar * bN + cN for the four pairs loaded above */
+        #line 346
+        const npy_float abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_float abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_float abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_float abc3 = scalar * b3 + c3;
+        /* write the four results back to data_out */
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* leftovers (or every element when the unrolled loop is disabled) */
+        const npy_float b = (*data);
+        const npy_float c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_cfloat
+}
+
+static void
+cfloat_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* data_out[i] = data0[i] * data1[i] + data_out[i] for count elements; this copy sits in the constant-false `#elif 2 == 2 && !1` branch */
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0];
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1];
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_cfloat: the #if below is constant 0, so only the scalar #else path remains
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f32;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f32 a0 = npyv_loada_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_loada_f32(data1 + vstep * 0);
+            npyv_f32 c0 = npyv_loada_f32(data_out + vstep * 0);
+            
+#line 390
+            npyv_f32 a1 = npyv_loada_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_loada_f32(data1 + vstep * 1);
+            npyv_f32 c1 = npyv_loada_f32(data_out + vstep * 1);
+            
+#line 390
+            npyv_f32 a2 = npyv_loada_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_loada_f32(data1 + vstep * 2);
+            npyv_f32 c2 = npyv_loada_f32(data_out + vstep * 2);
+            
+#line 390
+            npyv_f32 a3 = npyv_loada_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_loada_f32(data1 + vstep * 3);
+            npyv_f32 c3 = npyv_loada_f32(data_out + vstep * 3);
+            /* combine each (aN, bN, cN) triple via npyv_muladd_f32 */
+            #line 397
+            npyv_f32 abc0 = npyv_muladd_f32(a0, b0, c0);
+            
+#line 397
+            npyv_f32 abc1 = npyv_muladd_f32(a1, b1, c1);
+            
+#line 397
+            npyv_f32 abc2 = npyv_muladd_f32(a2, b2, c2);
+            
+#line 397
+            npyv_f32 abc3 = npyv_muladd_f32(a3, b3, c3);
+            /* store the four result vectors back over data_out */
+            #line 402
+            npyv_storea_f32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* otherwise: identical 4-vector unrolling, using npyv_load_f32/npyv_store_f32 instead of the *a variants */
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f32 a0 = npyv_load_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_load_f32(data1 + vstep * 0);
+            npyv_f32 c0 = npyv_load_f32(data_out + vstep * 0);
+            
+#line 390
+            npyv_f32 a1 = npyv_load_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_load_f32(data1 + vstep * 1);
+            npyv_f32 c1 = npyv_load_f32(data_out + vstep * 1);
+            
+#line 390
+            npyv_f32 a2 = npyv_load_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_load_f32(data1 + vstep * 2);
+            npyv_f32 c2 = npyv_load_f32(data_out + vstep * 2);
+            
+#line 390
+            npyv_f32 a3 = npyv_load_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_load_f32(data1 + vstep * 3);
+            npyv_f32 c3 = npyv_load_f32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f32 abc0 = npyv_muladd_f32(a0, b0, c0);
+            
+#line 397
+            npyv_f32 abc1 = npyv_muladd_f32(a1, b1, c1);
+            
+#line 397
+            npyv_f32 abc2 = npyv_muladd_f32(a2, b2, c2);
+            
+#line 397
+            npyv_f32 abc3 = npyv_muladd_f32(a3, b3, c3);
+            
+            #line 402
+            npyv_store_f32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* tail: one vector per pass through the *_tillz / *_till helpers (presumably limited to `count` lanes - confirm) */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data0, count);
+        npyv_f32 b = npyv_load_tillz_f32(data1, count);
+        npyv_f32 c = npyv_load_tillz_f32(data_out, count);
+        npyv_store_till_f32(data_out, count, npyv_muladd_f32(a, b, c));
+    }
+    npyv_cleanup();
+#else /* scalar path */
+#ifndef NPY_DISABLE_OPTIMIZATION /* manual 4x unrolling, skipped when optimization is disabled */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_float a0 = (data0[0]);
+        const npy_float b0 = (data1[0]);
+        const npy_float c0 = (data_out[0]);
+        
+#line 420
+        const npy_float a1 = (data0[1]);
+        const npy_float b1 = (data1[1]);
+        const npy_float c1 = (data_out[1]);
+        
+#line 420
+        const npy_float a2 = (data0[2]);
+        const npy_float b2 = (data1[2]);
+        const npy_float c2 = (data_out[2]);
+        
+#line 420
+        const npy_float a3 = (data0[3]);
+        const npy_float b3 = (data1[3]);
+        const npy_float c3 = (data_out[3]);
+        /* aN * bN + cN for the four triples loaded above */
+        #line 427
+        const npy_float abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_float abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_float abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_float abc3 = a3 * b3 + c3;
+        /* write the four results back to data_out */
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* leftovers (or every element when the unrolled loop is disabled) */
+        const npy_float a = (*data0);
+        const npy_float b = (*data1);
+        const npy_float c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_cfloat
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+cfloat_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Operand 0 is read once as a single value; operand 1 and the output are walked as arrays of count elements */
+    npy_float value0 = (*(npy_cfloat *)dataptr[0]);
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1];
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    cfloat_sum_of_products_muladd(data1, data_out, value0, count); /* data_out[i] = value0 * data1[i] + data_out[i] */
+    
+}
+
+static void
+cfloat_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Mirror of the stride0_contig variant: operand 1 is the single value, operand 0 is the array */
+    npy_float value1 = (*(npy_cfloat *)dataptr[1]);
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0];
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    cfloat_sum_of_products_muladd(data0, data_out, value1, count); /* data_out[i] = value1 * data0[i] + data_out[i] */
+}
+
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Accumulates data0[i] * data1[i] over count elements and adds the total to the single element at dataptr[2] */
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0];
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1];
+    npy_float accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_cfloat: constant 0 in this instantiation, so the SIMD branch below is compiled out
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_f32;
+    npyv_f32 v_accum = npyv_zero_f32();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f32 a0 = npyv_loada_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_loada_f32(data1 + vstep * 0);
+            
+#line 501
+            npyv_f32 a1 = npyv_loada_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_loada_f32(data1 + vstep * 1);
+            
+#line 501
+            npyv_f32 a2 = npyv_loada_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_loada_f32(data1 + vstep * 2);
+            
+#line 501
+            npyv_f32 a3 = npyv_loada_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_loada_f32(data1 + vstep * 3);
+            /* chain the four npyv_muladd_f32 calls through v_accum (pair 3 first, pair 0 last) */
+            npyv_f32 ab3 = npyv_muladd_f32(a3, b3, v_accum);
+            npyv_f32 ab2 = npyv_muladd_f32(a2, b2, ab3);
+            npyv_f32 ab1 = npyv_muladd_f32(a1, b1, ab2);
+                   v_accum = npyv_muladd_f32(a0, b0, ab1);
+        }
+    }
+    /* otherwise: identical unrolling with npyv_load_f32 instead of npyv_loada_f32 */
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f32 a0 = npyv_load_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_load_f32(data1 + vstep * 0);
+            
+#line 501
+            npyv_f32 a1 = npyv_load_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_load_f32(data1 + vstep * 1);
+            
+#line 501
+            npyv_f32 a2 = npyv_load_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_load_f32(data1 + vstep * 2);
+            
+#line 501
+            npyv_f32 a3 = npyv_load_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_load_f32(data1 + vstep * 3);
+            
+            npyv_f32 ab3 = npyv_muladd_f32(a3, b3, v_accum);
+            npyv_f32 ab2 = npyv_muladd_f32(a2, b2, ab3);
+            npyv_f32 ab1 = npyv_muladd_f32(a1, b1, ab2);
+                   v_accum = npyv_muladd_f32(a0, b0, ab1);
+        }
+    }
+    /* tail: one vector per pass through npyv_load_tillz_f32 (presumably limited to `count` lanes - confirm) */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data0, count);
+        npyv_f32 b = npyv_load_tillz_f32(data1, count);
+        v_accum = npyv_muladd_f32(a, b, v_accum);
+    }
+    accum = npyv_sum_f32(v_accum);
+    npyv_cleanup();
+#else /* scalar path */
+#ifndef NPY_DISABLE_OPTIMIZATION /* manual 4x unrolling, skipped when optimization is disabled */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_float ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_float ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_float ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_float ab3 = (data0[3]) * (data1[3]);
+        /* the four products are summed together before being added to accum */
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* leftovers (or every element when the unrolled loop is disabled) */
+        const npy_float a = (*data0);
+        const npy_float b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_cfloat
+    *(npy_cfloat *)dataptr[2] = ((*(npy_cfloat *)dataptr[2]) + accum); /* single read-modify-write of the output element */
+}
+
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Adds value0 * cfloat_sum_of_arr(data1, count) to the single output element at dataptr[2] */
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1];
+    npy_float value0 = (*(npy_cfloat *)dataptr[0]);
+    npy_float accum = cfloat_sum_of_arr(data1, count); /* helper defined outside this chunk; presumably the sum of the count elements - confirm */
+    *(npy_cfloat *)dataptr[2] = ((*(npy_cfloat *)dataptr[2]) + value0 * accum);
+}
+
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Adds value1 * cfloat_sum_of_arr(data0, count) to the single output element at dataptr[2] */
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0];
+    npy_float value1 = (*(npy_cfloat *)dataptr[1]);
+    npy_float accum = cfloat_sum_of_arr(data0, count); /* helper defined outside this chunk; presumably the sum of the count elements - confirm */
+    *(npy_cfloat *)dataptr[2] = ((*(npy_cfloat *)dataptr[2]) + value1 * accum);
+}
+
+#elif 2 == 3 && !1
+
+static void
+cfloat_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]; this copy sits in the constant-false `#elif 2 == 3 && !1` branch */
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0];
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1];
+    npy_cfloat *data2 = (npy_cfloat *)dataptr[2];
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: each step below returns once count reaches 0, otherwise handles one more element */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) { /* always taken when reached: count is below 8 after the unrolled loop and seven steps precede this one */
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 2 > 3 || @complex */
+
+static void
+cfloat_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Complex multiply-accumulate: out += op0 * op1 per element, every pointer stepping by sizeof(npy_cfloat) */
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+    /* The loop advances the dataptr[] entries themselves, so the caller's pointer array is modified. */
+    while (count--) {
+#if !1 /* constant-false: real-valued template branch, not compiled here */
+        npy_float temp = (*(npy_cfloat *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cfloat *)dataptr[i]);
+        }
+        *(npy_cfloat *)dataptr[nop] = (temp +
+                                           (*(npy_cfloat *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_cfloat);
+        }
+#else /* complex */
+#  if 2 <= 3
+#    define _SUMPROD_NOP 2 /* operand count fixed at 2, so the nop argument is not consulted */
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0]; /* each operand is read as a (real, imaginary) pair of npy_float */
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1]; /* real part of the running product */
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0]; /* imaginary part; still needs the old re */
+            re = tmp; /* commit the real part only after im has been computed */
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+        /* `<=` so the output pointer at index _SUMPROD_NOP advances together with the inputs */
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_cfloat);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Sums count contiguous complex values from dataptr[0] into the single element at dataptr[1]; guarded by the constant-false `#if 2 == 1` here */
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !1 /* constant-false: real-valued template branch, not compiled */
+    npy_cfloat *data = (npy_cfloat *)dataptr[0];
+    npy_float accum = cfloat_sum_of_arr(data, count);
+    *((npy_cfloat *)dataptr[1]) = (accum + (*((npy_cfloat *)dataptr[1])));
+#else
+    npy_float accum_re = 0, accum_im = 0;
+    npy_float *data0 = (npy_float *)dataptr[0]; /* viewed as npy_float pairs: even index = real, odd index = imaginary */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) { /* 4 complex values (8 floats) per pass; `>` leaves a final group of exactly 4 to the tail loop */
+        const npy_float re01 = data0[0] + data0[2];
+        const npy_float re23 = data0[4] + data0[6];
+        const npy_float im13 = data0[1] + data0[3];
+        const npy_float im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) { /* remaining values, one complex element per pass */
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_float *)dataptr[1])[0] += accum_re;
+    ((npy_float *)dataptr[1])[1] += accum_im;
+#endif // !1
+}
+
+#endif /* 2 == 1 */
+
+static void
+cfloat_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Strided complex reduction: accumulates op0 * op1 over count steps, then adds the total to the element at dataptr[2] */
+#if 1 /* complex instantiation: separate real and imaginary accumulators */
+    npy_float accum_re = 0, accum_im = 0;
+#else
+    npy_float accum = 0;
+#endif
+
+#if (2 == 1) || (2 <= 3 && !1) /* false here: the data0/stride0 locals belong to the real-valued paths */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !1 /* false here */
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !1 /* false here */
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+    /* The loop advances dataptr[0] and dataptr[1] in place; dataptr[2] is left where it is. */
+    while (count--) {
+#if !1 /* constant-false: real-valued template branch, not compiled */
+#  if 2 == 1
+        accum += (*(npy_cfloat *)data0);
+        data0 += stride0;
+#  elif 2 == 2
+        accum += (*(npy_cfloat *)data0) *
+                 (*(npy_cfloat *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 2 == 3
+        accum += (*(npy_cfloat *)data0) *
+                 (*(npy_cfloat *)data1) *
+                 (*(npy_cfloat *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_float temp = (*(npy_cfloat *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cfloat *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        accum_re += ((npy_float *)data0)[0];
+        accum_im += ((npy_float *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2 /* operand count fixed at 2, so the nop argument is not consulted */
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0]; /* each operand is read as a (real, imaginary) pair of npy_float */
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1]; /* real part of the running product */
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0]; /* imaginary part; still needs the old re */
+            re = tmp; /* commit the real part only after im has been computed */
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) { /* `<` rather than `<=`: only the input pointers advance */
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* fold the accumulated total into the output element once, after the loop */
+#if 1
+#  if 2 <= 3
+    ((npy_float *)dataptr[2])[0] += accum_re;
+    ((npy_float *)dataptr[2])[1] += accum_im;
+#  else
+    ((npy_float *)dataptr[nop])[0] += accum_re;
+    ((npy_float *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 2 <= 3
+    *((npy_cfloat *)dataptr[2]) = (accum +
+                                    (*((npy_cfloat *)dataptr[2])));
+#  else
+    *((npy_cfloat *)dataptr[nop]) = (accum +
+                                    (*((npy_cfloat *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+cfloat_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Strided complex multiply-accumulate over three operands: out += op0 * op1 * op2 for each of count steps */
+#if (3 == 1) || (3 <= 3 && !1) /* false here: the dataN/strideN locals belong to the real-valued paths */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !1 /* false here */
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !1 /* false here */
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (3 == 1) || (3 <= 3 && !1) /* false here */
+    char *data_out = dataptr[3];
+    npy_intp stride_out = strides[3];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_three (%d)\n", (int)count);
+    /* The loop advances the dataptr[] entries themselves, so the caller's pointer array is modified. */
+    while (count--) {
+#if !1 /* constant-false: real-valued template branch, not compiled */
+#  if 3 == 1
+        *(npy_cfloat *)data_out = ((*(npy_cfloat *)data0) +
+                                         (*(npy_cfloat *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 3 == 2
+        *(npy_cfloat *)data_out = ((*(npy_cfloat *)data0) *
+                                         (*(npy_cfloat *)data1) +
+                                         (*(npy_cfloat *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 3 == 3
+        *(npy_cfloat *)data_out = ((*(npy_cfloat *)data0) *
+                                         (*(npy_cfloat *)data1) *
+                                         (*(npy_cfloat *)data2) +
+                                         (*(npy_cfloat *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_float temp = (*(npy_cfloat *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cfloat *)dataptr[i]);
+        }
+        *(npy_cfloat *)dataptr[nop] = (temp +
+                                           (*(npy_cfloat *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        ((npy_float *)data_out)[0] = ((npy_float *)data0)[0] +
+                                         ((npy_float *)data_out)[0];
+        ((npy_float *)data_out)[1] = ((npy_float *)data0)[1] +
+                                         ((npy_float *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3 /* operand count fixed at 3, so the nop argument is not consulted */
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0]; /* each operand is read as a (real, imaginary) pair of npy_float */
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1]; /* real part of the running product */
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0]; /* imaginary part; still needs the old re */
+            re = tmp; /* commit the real part only after im has been computed */
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+        /* `<=` so the output pointer at index _SUMPROD_NOP advances by its own stride as well */
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 3 == 1
+
+static void
+cfloat_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Adds `count` consecutive complex inputs into the output: data_out[k] += data0[k]. */
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0];       /* input operand */
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[1];    /* output, updated in place */
+    /* `nop` and `strides` are not read anywhere in this body. */
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {    /* cases fall through, so count == k updates elements k-1 down to 0 and then returns */
+#line 246
+        case 6+1:
+#if !1  /* never true: only the #else statements are compiled */
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else   /* real and imaginary parts are added as separate npy_float values */
+            ((npy_float *)data_out + 2*6)[0] =
+                                    ((npy_float *)data0 + 2*6)[0] +
+                                    ((npy_float *)data_out + 2*6)[0];
+            ((npy_float *)data_out + 2*6)[1] =
+                                    ((npy_float *)data0 + 2*6)[1] +
+                                    ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !1
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_float *)data_out + 2*5)[0] =
+                                    ((npy_float *)data0 + 2*5)[0] +
+                                    ((npy_float *)data_out + 2*5)[0];
+            ((npy_float *)data_out + 2*5)[1] =
+                                    ((npy_float *)data0 + 2*5)[1] +
+                                    ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !1
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_float *)data_out + 2*4)[0] =
+                                    ((npy_float *)data0 + 2*4)[0] +
+                                    ((npy_float *)data_out + 2*4)[0];
+            ((npy_float *)data_out + 2*4)[1] =
+                                    ((npy_float *)data0 + 2*4)[1] +
+                                    ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !1
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_float *)data_out + 2*3)[0] =
+                                    ((npy_float *)data0 + 2*3)[0] +
+                                    ((npy_float *)data_out + 2*3)[0];
+            ((npy_float *)data_out + 2*3)[1] =
+                                    ((npy_float *)data0 + 2*3)[1] +
+                                    ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !1
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_float *)data_out + 2*2)[0] =
+                                    ((npy_float *)data0 + 2*2)[0] +
+                                    ((npy_float *)data_out + 2*2)[0];
+            ((npy_float *)data_out + 2*2)[1] =
+                                    ((npy_float *)data0 + 2*2)[1] +
+                                    ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !1
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_float *)data_out + 2*1)[0] =
+                                    ((npy_float *)data0 + 2*1)[0] +
+                                    ((npy_float *)data_out + 2*1)[0];
+            ((npy_float *)data_out + 2*1)[1] =
+                                    ((npy_float *)data0 + 2*1)[1] +
+                                    ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !1
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_float *)data_out + 2*0)[0] =
+                                    ((npy_float *)data0 + 2*0)[0] +
+                                    ((npy_float *)data_out + 2*0)[0];
+            ((npy_float *)data_out + 2*0)[1] =
+                                    ((npy_float *)data0 + 2*0)[1] +
+                                    ((npy_float *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+    /* Only reached when no case matched, i.e. count is outside 0..7. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !1
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_float *)data_out + 2*0)[0] =
+                                ((npy_float *)data0 + 2*0)[0] +
+                                ((npy_float *)data_out + 2*0)[0];
+        ((npy_float *)data_out + 2*0)[1] =
+                                ((npy_float *)data0 + 2*0)[1] +
+                                ((npy_float *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_float *)data_out + 2*1)[0] =
+                                ((npy_float *)data0 + 2*1)[0] +
+                                ((npy_float *)data_out + 2*1)[0];
+        ((npy_float *)data_out + 2*1)[1] =
+                                ((npy_float *)data0 + 2*1)[1] +
+                                ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_float *)data_out + 2*2)[0] =
+                                ((npy_float *)data0 + 2*2)[0] +
+                                ((npy_float *)data_out + 2*2)[0];
+        ((npy_float *)data_out + 2*2)[1] =
+                                ((npy_float *)data0 + 2*2)[1] +
+                                ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_float *)data_out + 2*3)[0] =
+                                ((npy_float *)data0 + 2*3)[0] +
+                                ((npy_float *)data_out + 2*3)[0];
+        ((npy_float *)data_out + 2*3)[1] =
+                                ((npy_float *)data0 + 2*3)[1] +
+                                ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_float *)data_out + 2*4)[0] =
+                                ((npy_float *)data0 + 2*4)[0] +
+                                ((npy_float *)data_out + 2*4)[0];
+        ((npy_float *)data_out + 2*4)[1] =
+                                ((npy_float *)data0 + 2*4)[1] +
+                                ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_float *)data_out + 2*5)[0] =
+                                ((npy_float *)data0 + 2*5)[0] +
+                                ((npy_float *)data_out + 2*5)[0];
+        ((npy_float *)data_out + 2*5)[1] =
+                                ((npy_float *)data0 + 2*5)[1] +
+                                ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_float *)data_out + 2*6)[0] =
+                                ((npy_float *)data0 + 2*6)[0] +
+                                ((npy_float *)data_out + 2*6)[0];
+        ((npy_float *)data_out + 2*6)[1] =
+                                ((npy_float *)data0 + 2*6)[1] +
+                                ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_float *)data_out + 2*7)[0] =
+                                ((npy_float *)data0 + 2*7)[0] +
+                                ((npy_float *)data_out + 2*7)[0];
+        ((npy_float *)data_out + 2*7)[1] =
+                                ((npy_float *)data0 + 2*7)[1] +
+                                ((npy_float *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+    /* NOTE(review): a negative count skips the loop above and re-enters the switch forever; presumably callers never pass one. */
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 3 == 2 && !1
+
+// Multiply-add kernel: data_out[i] = scalar * data[i] + data_out[i] for i in [0, count)
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_muladd(npy_cfloat *data, npy_cfloat *data_out, npy_float scalar, npy_intp count)
+{   // NOTE(review): elements are handled as npy_float scalars; the enclosing `#elif 3 == 2 && !1` is false, so this body is not compiled here
+#if 0 // NPYV check for npy_cfloat -- evaluates to 0, so the SIMD path below is skipped
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f32;
+    const npyv_f32 v_scalar = npyv_setall_f32(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f32 b0 = npyv_loada_f32(data + vstep * 0);
+            npyv_f32 c0 = npyv_loada_f32(data_out + vstep * 0);
+            
+#line 312
+            npyv_f32 b1 = npyv_loada_f32(data + vstep * 1);
+            npyv_f32 c1 = npyv_loada_f32(data_out + vstep * 1);
+            
+#line 312
+            npyv_f32 b2 = npyv_loada_f32(data + vstep * 2);
+            npyv_f32 c2 = npyv_loada_f32(data_out + vstep * 2);
+            
+#line 312
+            npyv_f32 b3 = npyv_loada_f32(data + vstep * 3);
+            npyv_f32 c3 = npyv_loada_f32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f32 abc0 = npyv_muladd_f32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f32 abc1 = npyv_muladd_f32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f32 abc2 = npyv_muladd_f32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f32 abc3 = npyv_muladd_f32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_f32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f32 b0 = npyv_load_f32(data + vstep * 0);
+            npyv_f32 c0 = npyv_load_f32(data_out + vstep * 0);
+            
+#line 312
+            npyv_f32 b1 = npyv_load_f32(data + vstep * 1);
+            npyv_f32 c1 = npyv_load_f32(data_out + vstep * 1);
+            
+#line 312
+            npyv_f32 b2 = npyv_load_f32(data + vstep * 2);
+            npyv_f32 c2 = npyv_load_f32(data_out + vstep * 2);
+            
+#line 312
+            npyv_f32 b3 = npyv_load_f32(data + vstep * 3);
+            npyv_f32 c3 = npyv_load_f32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f32 abc0 = npyv_muladd_f32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f32 abc1 = npyv_muladd_f32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f32 abc2 = npyv_muladd_f32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f32 abc3 = npyv_muladd_f32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_f32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data, count);
+        npyv_f32 b = npyv_load_tillz_f32(data_out, count);
+        npyv_store_till_f32(data_out, count, npyv_muladd_f32(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else // scalar path
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { // four elements per iteration
+        #line 340
+        const npy_float b0 = (data[0]);
+        const npy_float c0 = (data_out[0]);
+        
+#line 340
+        const npy_float b1 = (data[1]);
+        const npy_float c1 = (data_out[1]);
+        
+#line 340
+        const npy_float b2 = (data[2]);
+        const npy_float c2 = (data_out[2]);
+        
+#line 340
+        const npy_float b3 = (data[3]);
+        const npy_float c3 = (data_out[3]);
+        
+        #line 346
+        const npy_float abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_float abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_float abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_float abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { // leftover elements, one at a time
+        const npy_float b = (*data);
+        const npy_float c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_cfloat
+}
+
+static void
+cfloat_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   // data_out[i] = data0[i] * data1[i] + data_out[i] for i in [0, count); every pointer advances one element per step
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0];
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1];
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_cfloat
+#if 0 // the SIMD path below is skipped
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f32;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f32 a0 = npyv_loada_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_loada_f32(data1 + vstep * 0);
+            npyv_f32 c0 = npyv_loada_f32(data_out + vstep * 0);
+            
+#line 390
+            npyv_f32 a1 = npyv_loada_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_loada_f32(data1 + vstep * 1);
+            npyv_f32 c1 = npyv_loada_f32(data_out + vstep * 1);
+            
+#line 390
+            npyv_f32 a2 = npyv_loada_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_loada_f32(data1 + vstep * 2);
+            npyv_f32 c2 = npyv_loada_f32(data_out + vstep * 2);
+            
+#line 390
+            npyv_f32 a3 = npyv_loada_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_loada_f32(data1 + vstep * 3);
+            npyv_f32 c3 = npyv_loada_f32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f32 abc0 = npyv_muladd_f32(a0, b0, c0);
+            
+#line 397
+            npyv_f32 abc1 = npyv_muladd_f32(a1, b1, c1);
+            
+#line 397
+            npyv_f32 abc2 = npyv_muladd_f32(a2, b2, c2);
+            
+#line 397
+            npyv_f32 abc3 = npyv_muladd_f32(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_f32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f32 a0 = npyv_load_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_load_f32(data1 + vstep * 0);
+            npyv_f32 c0 = npyv_load_f32(data_out + vstep * 0);
+            
+#line 390
+            npyv_f32 a1 = npyv_load_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_load_f32(data1 + vstep * 1);
+            npyv_f32 c1 = npyv_load_f32(data_out + vstep * 1);
+            
+#line 390
+            npyv_f32 a2 = npyv_load_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_load_f32(data1 + vstep * 2);
+            npyv_f32 c2 = npyv_load_f32(data_out + vstep * 2);
+            
+#line 390
+            npyv_f32 a3 = npyv_load_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_load_f32(data1 + vstep * 3);
+            npyv_f32 c3 = npyv_load_f32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f32 abc0 = npyv_muladd_f32(a0, b0, c0);
+            
+#line 397
+            npyv_f32 abc1 = npyv_muladd_f32(a1, b1, c1);
+            
+#line 397
+            npyv_f32 abc2 = npyv_muladd_f32(a2, b2, c2);
+            
+#line 397
+            npyv_f32 abc3 = npyv_muladd_f32(a3, b3, c3);
+            
+            #line 402
+            npyv_store_f32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data0, count);
+        npyv_f32 b = npyv_load_tillz_f32(data1, count);
+        npyv_f32 c = npyv_load_tillz_f32(data_out, count);
+        npyv_store_till_f32(data_out, count, npyv_muladd_f32(a, b, c));
+    }
+    npyv_cleanup();
+#else // scalar path
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) { // four elements per iteration
+        #line 420
+        const npy_float a0 = (data0[0]);
+        const npy_float b0 = (data1[0]);
+        const npy_float c0 = (data_out[0]);
+        
+#line 420
+        const npy_float a1 = (data0[1]);
+        const npy_float b1 = (data1[1]);
+        const npy_float c1 = (data_out[1]);
+        
+#line 420
+        const npy_float a2 = (data0[2]);
+        const npy_float b2 = (data1[2]);
+        const npy_float c2 = (data_out[2]);
+        
+#line 420
+        const npy_float a3 = (data0[3]);
+        const npy_float b3 = (data1[3]);
+        const npy_float c3 = (data_out[3]);
+        
+        #line 427
+        const npy_float abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_float abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_float abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_float abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { // leftover elements, one at a time
+        const npy_float a = (*data0);
+        const npy_float b = (*data1);
+        const npy_float c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_cfloat
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+cfloat_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* out[i] += op0[0] * op1[i]: operand 0 is read once and used as the scalar. */
+    npy_cfloat *const vec = (npy_cfloat *)dataptr[1];
+    npy_cfloat *const dst = (npy_cfloat *)dataptr[2];
+    const npy_float factor = (*(npy_cfloat *)dataptr[0]);
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    cfloat_sum_of_products_muladd(vec, dst, factor, count);
+}
+
+static void
+cfloat_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* out[i] += op0[i] * op1[0]: operand 1 is read once and used as the scalar. */
+    npy_cfloat *const vec = (npy_cfloat *)dataptr[0];
+    npy_cfloat *const dst = (npy_cfloat *)dataptr[2];
+    const npy_float factor = (*(npy_cfloat *)dataptr[1]);
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    cfloat_sum_of_products_muladd(vec, dst, factor, count);
+}
+
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   // Reduction: *dataptr[2] += sum over i in [0, count) of data0[i] * data1[i]
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0];
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1];
+    npy_float accum = 0; // running sum of products, written back once at the end
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_cfloat -- evaluates to 0, so the SIMD path below is skipped
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_f32;
+    npyv_f32 v_accum = npyv_zero_f32();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f32 a0 = npyv_loada_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_loada_f32(data1 + vstep * 0);
+            
+#line 501
+            npyv_f32 a1 = npyv_loada_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_loada_f32(data1 + vstep * 1);
+            
+#line 501
+            npyv_f32 a2 = npyv_loada_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_loada_f32(data1 + vstep * 2);
+            
+#line 501
+            npyv_f32 a3 = npyv_loada_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_loada_f32(data1 + vstep * 3);
+            
+            npyv_f32 ab3 = npyv_muladd_f32(a3, b3, v_accum);
+            npyv_f32 ab2 = npyv_muladd_f32(a2, b2, ab3);
+            npyv_f32 ab1 = npyv_muladd_f32(a1, b1, ab2);
+                   v_accum = npyv_muladd_f32(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f32 a0 = npyv_load_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_load_f32(data1 + vstep * 0);
+            
+#line 501
+            npyv_f32 a1 = npyv_load_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_load_f32(data1 + vstep * 1);
+            
+#line 501
+            npyv_f32 a2 = npyv_load_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_load_f32(data1 + vstep * 2);
+            
+#line 501
+            npyv_f32 a3 = npyv_load_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_load_f32(data1 + vstep * 3);
+            
+            npyv_f32 ab3 = npyv_muladd_f32(a3, b3, v_accum);
+            npyv_f32 ab2 = npyv_muladd_f32(a2, b2, ab3);
+            npyv_f32 ab1 = npyv_muladd_f32(a1, b1, ab2);
+                   v_accum = npyv_muladd_f32(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_f32 a = npyv_load_tillz_f32(data0, count);
+        npyv_f32 b = npyv_load_tillz_f32(data1, count);
+        v_accum = npyv_muladd_f32(a, b, v_accum);
+    }
+    accum = npyv_sum_f32(v_accum);
+    npyv_cleanup();
+#else // scalar path
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) { // four products per iteration
+        #line 524
+        const npy_float ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_float ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_float ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_float ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3; // the four products are summed first, then added to accum
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { // leftover elements, one at a time
+        const npy_float a = (*data0);
+        const npy_float b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_cfloat
+    *(npy_cfloat *)dataptr[2] = ((*(npy_cfloat *)dataptr[2]) + accum); // single write-back to the one output element
+}
+
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* out[0] += op0[0] * cfloat_sum_of_arr(op1, count); op0 is read once as a scalar. */
+    npy_cfloat *const out = (npy_cfloat *)dataptr[2];
+    const npy_float scale = (*(npy_cfloat *)dataptr[0]);
+    const npy_float total = cfloat_sum_of_arr((npy_cfloat *)dataptr[1], count);
+    *out = ((*out) + scale * total);
+}
+
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* out[0] += op1[0] * cfloat_sum_of_arr(op0, count); op1 is read once as a scalar. */
+    npy_cfloat *const out = (npy_cfloat *)dataptr[2];
+    const npy_float scale = (*(npy_cfloat *)dataptr[1]);
+    const npy_float total = cfloat_sum_of_arr((npy_cfloat *)dataptr[0], count);
+    *out = ((*out) + scale * total);
+}
+
+#elif 3 == 3 && !1
+
+static void
+cfloat_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* data_out[k] = data0[k] * data1[k] * data2[k] + data_out[k] for `count` consecutive elements */
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0];
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1];
+    npy_cfloat *data2 = (npy_cfloat *)dataptr[2];
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[3];
+    /* NOTE(review): the enclosing `#elif 3 == 3 && !1` is false, so this variant is not compiled here. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    /* Each step below returns as soon as the remaining count reaches zero. */
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {  /* for count >= 0 this is always taken: at most 7 elements are left after the loop */
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 3 > 3 || complex */
+
+static void
+cfloat_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_three (%d)\n",
+                                                    (int)count);
+
+    /* One output element per pass: out += op0 * op1 * op2 (complex product). */
+    while (count-- != 0) {
+#if !1
+        /* Real-valued form; `!1` is false, so this branch is never compiled. */
+        int k = 1;
+        npy_float prod = (*(npy_cfloat *)dataptr[0]);
+        for (; k < nop; ++k) {
+            prod *= (*(npy_cfloat *)dataptr[k]);
+        }
+        *(npy_cfloat *)dataptr[nop] = (prod + (*(npy_cfloat *)dataptr[k]));
+        for (k = 0; k <= nop; ++k) {
+            dataptr[k] += sizeof(npy_cfloat);
+        }
+#else /* complex */
+#  if 3 <= 3
+#    define _SUMPROD_NOP 3
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        /* Running product, seeded from operand 0 as separate (re, im) parts. */
+        npy_float *const out = (npy_float *)dataptr[_SUMPROD_NOP];
+        npy_float acc_re = ((npy_float *)dataptr[0])[0];
+        npy_float acc_im = ((npy_float *)dataptr[0])[1];
+        int k;
+        for (k = 1; k < _SUMPROD_NOP; ++k) {
+            const npy_float *const rhs = (const npy_float *)dataptr[k];
+            const npy_float next_re = acc_re * rhs[0] - acc_im * rhs[1];
+            acc_im = acc_re * rhs[1] + acc_im * rhs[0];
+            acc_re = next_re;
+        }
+        out[0] = acc_re + out[0];
+        out[1] = acc_im + out[1];
+
+        /* Contiguous operands: every pointer steps by one npy_cfloat. */
+        for (k = 0; k <= _SUMPROD_NOP; ++k) {
+            dataptr[k] += sizeof(npy_cfloat);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 3 */
+
+#if 3 == 1
+
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Adds the sum of `count` contiguous inputs into the single output element. */
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !1 /* real-valued form; the condition is false, so this branch is skipped */
+    npy_cfloat *const out = (npy_cfloat *)dataptr[1];
+    const npy_float total = cfloat_sum_of_arr((npy_cfloat *)dataptr[0], count);
+    *out = (total + (*out));
+#else /* complex form: operands viewed as interleaved (re, im) npy_float pairs */
+    npy_float *src = (npy_float *)dataptr[0], *const out = (npy_float *)dataptr[1];
+    npy_float sum_re = 0, sum_im = 0;
+#ifndef NPY_DISABLE_OPTIMIZATION
+    while (count > 4) {  /* four complex values per pass */
+        const npy_float re_lo = src[0] + src[2], re_hi = src[4] + src[6];
+        const npy_float im_lo = src[1] + src[3], im_hi = src[5] + src[7];
+        sum_re += re_lo + re_hi;
+        sum_im += im_lo + im_hi;
+        src += 4*2;
+        count -= 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count) {  /* whatever is left, one value at a time */
+        sum_re += *src++;
+        sum_im += *src++;
+    }
+    out[0] += sum_re;
+    out[1] += sum_im;
+#endif // !1
+}
+
+#endif /* 3 == 1 */
+/* Sums the complex product of three strided operands over `count` steps and adds the total into the one output element at dataptr[3]. */
+static void
+cfloat_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 1  /* constant-true: separate real and imaginary accumulators are used */
+    npy_float accum_re = 0, accum_im = 0;
+#else
+    npy_float accum = 0;
+#endif
+
+#if (3 == 1) || (3 <= 3 && !1)  /* false here (3 != 1 and !1 is 0): data0/stride0 are not declared */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !1  /* false here because of `!1` */
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !1  /* false here because of `!1` */
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !1  /* constant-false: everything up to `#else` below is compiled out */
+#  if 3 == 1
+        accum += (*(npy_cfloat *)data0);
+        data0 += stride0;
+#  elif 3 == 2
+        accum += (*(npy_cfloat *)data0) *
+                 (*(npy_cfloat *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 3 == 3
+        accum += (*(npy_cfloat *)data0) *
+                 (*(npy_cfloat *)data1) *
+                 (*(npy_cfloat *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_float temp = (*(npy_cfloat *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cfloat *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1  /* false: the general product loop in the `#  else` branch is compiled */
+        accum_re += ((npy_float *)data0)[0];
+        accum_im += ((npy_float *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 3 <= 3  /* true: the operand count is the literal 3, so the `nop` argument is not read */
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];  /* running product starts as operand 0 */
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {  /* complex-multiply the running product by operand i */
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];  /* new real part, held in tmp so `re` stays intact for the next line */
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];  /* new imaginary part, computed from the old re and im */
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {  /* advances the three inputs only; dataptr[3] (output) is not moved */
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 1  /* constant-true: write back real and imaginary sums separately */
+#  if 3 <= 3  /* true: the output is operand index 3 */
+    ((npy_float *)dataptr[3])[0] += accum_re;
+    ((npy_float *)dataptr[3])[1] += accum_im;
+#  else
+    ((npy_float *)dataptr[nop])[0] += accum_re;
+    ((npy_float *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 3 <= 3
+    *((npy_cfloat *)dataptr[3]) = (accum +
+                                    (*((npy_cfloat *)dataptr[3])));
+#  else
+    *((npy_cfloat *)dataptr[nop]) = (accum +
+                                    (*((npy_cfloat *)dataptr[nop])));
+#  endif
+#endif
+
+}
+/* Handles an arbitrary operand count: on each of `count` steps, multiplies the `nop` complex inputs together, */
+/* adds that product into the output element dataptr[nop], then advances every pointer (inputs and output) by its stride. */
+#line 131
+static void
+cfloat_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1000 == 1) || (1000 <= 3 && !1)  /* false for the literal 1000: no cached data0/stride0 */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !1  /* false */
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !1  /* false */
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1000 == 1) || (1000 <= 3 && !1)  /* false: data_out/stride_out are not declared either */
+    char *data_out = dataptr[1000];
+    npy_intp stride_out = strides[1000];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_any (%d)\n", (int)count);
+
+    while (count--) {
+#if !1  /* constant-false: everything up to `#else` below is compiled out */
+#  if 1000 == 1
+        *(npy_cfloat *)data_out = ((*(npy_cfloat *)data0) +
+                                         (*(npy_cfloat *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1000 == 2
+        *(npy_cfloat *)data_out = ((*(npy_cfloat *)data0) *
+                                         (*(npy_cfloat *)data1) +
+                                         (*(npy_cfloat *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1000 == 3
+        *(npy_cfloat *)data_out = ((*(npy_cfloat *)data0) *
+                                         (*(npy_cfloat *)data1) *
+                                         (*(npy_cfloat *)data2) +
+                                         (*(npy_cfloat *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_float temp = (*(npy_cfloat *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cfloat *)dataptr[i]);
+        }
+        *(npy_cfloat *)dataptr[nop] = (temp +
+                                           (*(npy_cfloat *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1  /* false: the general product loop in the `#  else` branch is compiled */
+        ((npy_float *)data_out)[0] = ((npy_float *)data0)[0] +
+                                         ((npy_float *)data_out)[0];
+        ((npy_float *)data_out)[1] = ((npy_float *)data0)[1] +
+                                         ((npy_float *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 1000 <= 3  /* false: the operand count comes from the runtime `nop` argument */
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        re = ((npy_float *)dataptr[0])[0];  /* running product starts as operand 0 */
+        im = ((npy_float *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {  /* complex-multiply the running product by operand i */
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];  /* new real part, held in tmp so `re` stays intact for the next line */
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];  /* new imaginary part, computed from the old re and im */
+            re = tmp;
+        }
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];  /* output real part += product real part */
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];  /* output imaginary part += product imaginary part */
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {  /* `<=`: the output pointer at index nop is advanced too */
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1000 == 1
+/* Compiled out in this expansion (the `#if 1000 == 1` guard above is constant-false). Adds `count` consecutive complex values from dataptr[0] element-wise into dataptr[1]. */
+static void
+cfloat_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0];
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {  /* counts 1..7 enter at their case and fall through to `case 0`; counts >= 8 match no case and drop to the unrolled loop */
+#line 246
+        case 6+1:  /* count == 7: element 6 is handled here, then control falls through */
+#if !1  /* constant-false in every case below: the #else re/im form is the one kept */
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_float *)data_out + 2*6)[0] =
+                                    ((npy_float *)data0 + 2*6)[0] +
+                                    ((npy_float *)data_out + 2*6)[0];
+            ((npy_float *)data_out + 2*6)[1] =
+                                    ((npy_float *)data0 + 2*6)[1] +
+                                    ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !1
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_float *)data_out + 2*5)[0] =
+                                    ((npy_float *)data0 + 2*5)[0] +
+                                    ((npy_float *)data_out + 2*5)[0];
+            ((npy_float *)data_out + 2*5)[1] =
+                                    ((npy_float *)data0 + 2*5)[1] +
+                                    ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !1
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_float *)data_out + 2*4)[0] =
+                                    ((npy_float *)data0 + 2*4)[0] +
+                                    ((npy_float *)data_out + 2*4)[0];
+            ((npy_float *)data_out + 2*4)[1] =
+                                    ((npy_float *)data0 + 2*4)[1] +
+                                    ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !1
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_float *)data_out + 2*3)[0] =
+                                    ((npy_float *)data0 + 2*3)[0] +
+                                    ((npy_float *)data_out + 2*3)[0];
+            ((npy_float *)data_out + 2*3)[1] =
+                                    ((npy_float *)data0 + 2*3)[1] +
+                                    ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !1
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_float *)data_out + 2*2)[0] =
+                                    ((npy_float *)data0 + 2*2)[0] +
+                                    ((npy_float *)data_out + 2*2)[0];
+            ((npy_float *)data_out + 2*2)[1] =
+                                    ((npy_float *)data0 + 2*2)[1] +
+                                    ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !1
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_float *)data_out + 2*1)[0] =
+                                    ((npy_float *)data0 + 2*1)[0] +
+                                    ((npy_float *)data_out + 2*1)[0];
+            ((npy_float *)data_out + 2*1)[1] =
+                                    ((npy_float *)data0 + 2*1)[1] +
+                                    ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !1
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_float *)data_out + 2*0)[0] =
+                                    ((npy_float *)data0 + 2*0)[0] +
+                                    ((npy_float *)data_out + 2*0)[0];
+            ((npy_float *)data_out + 2*0)[1] =
+                                    ((npy_float *)data0 + 2*0)[1] +
+                                    ((npy_float *)data_out + 2*0)[1];
+#endif
+
+        case 0:  /* the only exit from the function: reached directly or by fall-through */
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !1
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_float *)data_out + 2*0)[0] =
+                                ((npy_float *)data0 + 2*0)[0] +
+                                ((npy_float *)data_out + 2*0)[0];
+        ((npy_float *)data_out + 2*0)[1] =
+                                ((npy_float *)data0 + 2*0)[1] +
+                                ((npy_float *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_float *)data_out + 2*1)[0] =
+                                ((npy_float *)data0 + 2*1)[0] +
+                                ((npy_float *)data_out + 2*1)[0];
+        ((npy_float *)data_out + 2*1)[1] =
+                                ((npy_float *)data0 + 2*1)[1] +
+                                ((npy_float *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_float *)data_out + 2*2)[0] =
+                                ((npy_float *)data0 + 2*2)[0] +
+                                ((npy_float *)data_out + 2*2)[0];
+        ((npy_float *)data_out + 2*2)[1] =
+                                ((npy_float *)data0 + 2*2)[1] +
+                                ((npy_float *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_float *)data_out + 2*3)[0] =
+                                ((npy_float *)data0 + 2*3)[0] +
+                                ((npy_float *)data_out + 2*3)[0];
+        ((npy_float *)data_out + 2*3)[1] =
+                                ((npy_float *)data0 + 2*3)[1] +
+                                ((npy_float *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_float *)data_out + 2*4)[0] =
+                                ((npy_float *)data0 + 2*4)[0] +
+                                ((npy_float *)data_out + 2*4)[0];
+        ((npy_float *)data_out + 2*4)[1] =
+                                ((npy_float *)data0 + 2*4)[1] +
+                                ((npy_float *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_float *)data_out + 2*5)[0] =
+                                ((npy_float *)data0 + 2*5)[0] +
+                                ((npy_float *)data_out + 2*5)[0];
+        ((npy_float *)data_out + 2*5)[1] =
+                                ((npy_float *)data0 + 2*5)[1] +
+                                ((npy_float *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_float *)data_out + 2*6)[0] =
+                                ((npy_float *)data0 + 2*6)[0] +
+                                ((npy_float *)data_out + 2*6)[0];
+        ((npy_float *)data_out + 2*6)[1] =
+                                ((npy_float *)data0 + 2*6)[1] +
+                                ((npy_float *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_float *)data_out + 2*7)[0] =
+                                ((npy_float *)data0 + 2*7)[0] +
+                                ((npy_float *)data_out + 2*7)[0];
+        ((npy_float *)data_out + 2*7)[1] =
+                                ((npy_float *)data0 + 2*7)[1] +
+                                ((npy_float *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;  /* fewer than 8 elements remain, so the switch above handles them and returns */
+}
+
+#elif 1000 == 2 && !1
+
+// Compiled out here (`#elif 1000 == 2 && !1` above is constant-false). Computes data_out[i] = scalar * data[i] + data_out[i] for i in [0, count).
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_muladd(npy_cfloat *data, npy_cfloat *data_out, npy_float scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_cfloat: the literal 0 disables the SIMD path, so the scalar `#else` path is used
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f32;
+    const npyv_f32 v_scalar = npyv_setall_f32(scalar);
+    #line 306
+    if(is_aligned) {  /* same body as the else branch, but with the loada/storea variants */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f32 b0 = npyv_loada_f32(data + vstep * 0);
+            npyv_f32 c0 = npyv_loada_f32(data_out + vstep * 0);
+            
+#line 312
+            npyv_f32 b1 = npyv_loada_f32(data + vstep * 1);
+            npyv_f32 c1 = npyv_loada_f32(data_out + vstep * 1);
+            
+#line 312
+            npyv_f32 b2 = npyv_loada_f32(data + vstep * 2);
+            npyv_f32 c2 = npyv_loada_f32(data_out + vstep * 2);
+            
+#line 312
+            npyv_f32 b3 = npyv_loada_f32(data + vstep * 3);
+            npyv_f32 c3 = npyv_loada_f32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f32 abc0 = npyv_muladd_f32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f32 abc1 = npyv_muladd_f32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f32 abc2 = npyv_muladd_f32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f32 abc3 = npyv_muladd_f32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_f32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f32 b0 = npyv_load_f32(data + vstep * 0);
+            npyv_f32 c0 = npyv_load_f32(data_out + vstep * 0);
+            
+#line 312
+            npyv_f32 b1 = npyv_load_f32(data + vstep * 1);
+            npyv_f32 c1 = npyv_load_f32(data_out + vstep * 1);
+            
+#line 312
+            npyv_f32 b2 = npyv_load_f32(data + vstep * 2);
+            npyv_f32 c2 = npyv_load_f32(data_out + vstep * 2);
+            
+#line 312
+            npyv_f32 b3 = npyv_load_f32(data + vstep * 3);
+            npyv_f32 c3 = npyv_load_f32(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f32 abc0 = npyv_muladd_f32(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f32 abc1 = npyv_muladd_f32(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f32 abc2 = npyv_muladd_f32(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f32 abc3 = npyv_muladd_f32(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_f32(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {  /* tail: the *_till helpers get the remaining count (presumably to limit the lanes touched; confirm in NPYV docs) */
+        npyv_f32 a = npyv_load_tillz_f32(data, count);
+        npyv_f32 b = npyv_load_tillz_f32(data_out, count);
+        npyv_store_till_f32(data_out, count, npyv_muladd_f32(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {  /* scalar path, 4 elements per pass: load all, compute all, store all */
+        #line 340
+        const npy_float b0 = (data[0]);
+        const npy_float c0 = (data_out[0]);
+        
+#line 340
+        const npy_float b1 = (data[1]);
+        const npy_float c1 = (data_out[1]);
+        
+#line 340
+        const npy_float b2 = (data[2]);
+        const npy_float c2 = (data_out[2]);
+        
+#line 340
+        const npy_float b3 = (data[3]);
+        const npy_float c3 = (data_out[3]);
+        
+        #line 346
+        const npy_float abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_float abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_float abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_float abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) {  /* remaining elements one at a time (all of them if the unrolled loop is disabled) */
+        const npy_float b = (*data);
+        const npy_float c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_cfloat
+}
+/* Compiled out here (enclosing `#elif 1000 == 2 && !1` is constant-false). Computes data_out[i] = data0[i] * data1[i] + data_out[i] over `count` consecutive elements. */
+static void
+cfloat_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0];
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1];
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_cfloat: the literal 0 below disables the SIMD path, so the scalar `#else` path is used
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f32;
+
+    #line 384
+    if(is_aligned) {  /* same body as the else branch, but with the loada/storea variants */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f32 a0 = npyv_loada_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_loada_f32(data1 + vstep * 0);
+            npyv_f32 c0 = npyv_loada_f32(data_out + vstep * 0);
+            
+#line 390
+            npyv_f32 a1 = npyv_loada_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_loada_f32(data1 + vstep * 1);
+            npyv_f32 c1 = npyv_loada_f32(data_out + vstep * 1);
+            
+#line 390
+            npyv_f32 a2 = npyv_loada_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_loada_f32(data1 + vstep * 2);
+            npyv_f32 c2 = npyv_loada_f32(data_out + vstep * 2);
+            
+#line 390
+            npyv_f32 a3 = npyv_loada_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_loada_f32(data1 + vstep * 3);
+            npyv_f32 c3 = npyv_loada_f32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f32 abc0 = npyv_muladd_f32(a0, b0, c0);
+            
+#line 397
+            npyv_f32 abc1 = npyv_muladd_f32(a1, b1, c1);
+            
+#line 397
+            npyv_f32 abc2 = npyv_muladd_f32(a2, b2, c2);
+            
+#line 397
+            npyv_f32 abc3 = npyv_muladd_f32(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_f32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f32 a0 = npyv_load_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_load_f32(data1 + vstep * 0);
+            npyv_f32 c0 = npyv_load_f32(data_out + vstep * 0);
+            
+#line 390
+            npyv_f32 a1 = npyv_load_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_load_f32(data1 + vstep * 1);
+            npyv_f32 c1 = npyv_load_f32(data_out + vstep * 1);
+            
+#line 390
+            npyv_f32 a2 = npyv_load_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_load_f32(data1 + vstep * 2);
+            npyv_f32 c2 = npyv_load_f32(data_out + vstep * 2);
+            
+#line 390
+            npyv_f32 a3 = npyv_load_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_load_f32(data1 + vstep * 3);
+            npyv_f32 c3 = npyv_load_f32(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f32 abc0 = npyv_muladd_f32(a0, b0, c0);
+            
+#line 397
+            npyv_f32 abc1 = npyv_muladd_f32(a1, b1, c1);
+            
+#line 397
+            npyv_f32 abc2 = npyv_muladd_f32(a2, b2, c2);
+            
+#line 397
+            npyv_f32 abc3 = npyv_muladd_f32(a3, b3, c3);
+            
+            #line 402
+            npyv_store_f32(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_f32(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {  /* tail: the *_till helpers get the remaining count (presumably to limit the lanes touched; confirm in NPYV docs) */
+        npyv_f32 a = npyv_load_tillz_f32(data0, count);
+        npyv_f32 b = npyv_load_tillz_f32(data1, count);
+        npyv_f32 c = npyv_load_tillz_f32(data_out, count);
+        npyv_store_till_f32(data_out, count, npyv_muladd_f32(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {  /* scalar path, 4 elements per pass: load all, compute all, store all */
+        #line 420
+        const npy_float a0 = (data0[0]);
+        const npy_float b0 = (data1[0]);
+        const npy_float c0 = (data_out[0]);
+        
+#line 420
+        const npy_float a1 = (data0[1]);
+        const npy_float b1 = (data1[1]);
+        const npy_float c1 = (data_out[1]);
+        
+#line 420
+        const npy_float a2 = (data0[2]);
+        const npy_float b2 = (data1[2]);
+        const npy_float c2 = (data_out[2]);
+        
+#line 420
+        const npy_float a3 = (data0[3]);
+        const npy_float b3 = (data1[3]);
+        const npy_float c3 = (data_out[3]);
+        
+        #line 427
+        const npy_float abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_float abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_float abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_float abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {  /* remaining elements one at a time (all of them if the unrolled loop is disabled) */
+        const npy_float a = (*data0);
+        const npy_float b = (*data1);
+        const npy_float c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_cfloat
+
+}
+/* Some extra specializations for the two operand case (all compiled out here: `#elif 1000 == 2 && !1` is constant-false) */
+/* This one: operand 0 is read once as a single value and multiplied into `count` consecutive elements of operand 1, accumulating into the output. */
+static void
+cfloat_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float value0 = (*(npy_cfloat *)dataptr[0]);  /* operand 0 is dereferenced exactly once */
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1];
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    cfloat_sum_of_products_muladd(data1, data_out, value0, count);  /* data_out[i] = value0 * data1[i] + data_out[i] */
+    
+}
+/* Compiled out here (`#elif 1000 == 2 && !1`). Operand 1 is read once as a single value and multiplied into `count` consecutive elements of operand 0, accumulating into the output. */
+static void
+cfloat_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_float value1 = (*(npy_cfloat *)dataptr[1]);  /* operand 1 is dereferenced exactly once */
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0];
+    npy_cfloat *data_out = (npy_cfloat *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    cfloat_sum_of_products_muladd(data0, data_out, value1, count);  /* data_out[i] = value1 * data0[i] + data_out[i] */
+}
+/* Compiled out here (`#elif 1000 == 2 && !1`). Sums data0[i] * data1[i] over `count` consecutive elements and adds the total to the single output at dataptr[2]. */
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0];
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1];
+    npy_float accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_cfloat: the literal 0 disables the SIMD path, so the scalar `#else` path is used
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_f32;
+    npyv_f32 v_accum = npyv_zero_f32();
+
+    #line 495
+    if(is_aligned) {  /* same body as the else branch, but with the loada variant */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f32 a0 = npyv_loada_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_loada_f32(data1 + vstep * 0);
+            
+#line 501
+            npyv_f32 a1 = npyv_loada_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_loada_f32(data1 + vstep * 1);
+            
+#line 501
+            npyv_f32 a2 = npyv_loada_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_loada_f32(data1 + vstep * 2);
+            
+#line 501
+            npyv_f32 a3 = npyv_loada_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_loada_f32(data1 + vstep * 3);
+            
+            npyv_f32 ab3 = npyv_muladd_f32(a3, b3, v_accum);  /* the four multiply-adds are chained: each result feeds the next call */
+            npyv_f32 ab2 = npyv_muladd_f32(a2, b2, ab3);
+            npyv_f32 ab1 = npyv_muladd_f32(a1, b1, ab2);
+                   v_accum = npyv_muladd_f32(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f32 a0 = npyv_load_f32(data0 + vstep * 0);
+            npyv_f32 b0 = npyv_load_f32(data1 + vstep * 0);
+            
+#line 501
+            npyv_f32 a1 = npyv_load_f32(data0 + vstep * 1);
+            npyv_f32 b1 = npyv_load_f32(data1 + vstep * 1);
+            
+#line 501
+            npyv_f32 a2 = npyv_load_f32(data0 + vstep * 2);
+            npyv_f32 b2 = npyv_load_f32(data1 + vstep * 2);
+            
+#line 501
+            npyv_f32 a3 = npyv_load_f32(data0 + vstep * 3);
+            npyv_f32 b3 = npyv_load_f32(data1 + vstep * 3);
+            
+            npyv_f32 ab3 = npyv_muladd_f32(a3, b3, v_accum);  /* the four multiply-adds are chained: each result feeds the next call */
+            npyv_f32 ab2 = npyv_muladd_f32(a2, b2, ab3);
+            npyv_f32 ab1 = npyv_muladd_f32(a1, b1, ab2);
+                   v_accum = npyv_muladd_f32(a0, b0, ab1);
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {  /* tail: load_tillz gets the remaining count (presumably zero-filling unused lanes; confirm in NPYV docs) */
+        npyv_f32 a = npyv_load_tillz_f32(data0, count);
+        npyv_f32 b = npyv_load_tillz_f32(data1, count);
+        v_accum = npyv_muladd_f32(a, b, v_accum);
+    }
+    accum = npyv_sum_f32(v_accum);  /* the vector accumulator is reduced to the scalar `accum` */
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {  /* scalar path: four products per pass, summed together before being added to accum */
+        #line 524
+        const npy_float ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_float ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_float ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_float ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) {  /* remaining elements one at a time (all of them if the unrolled loop is disabled) */
+        const npy_float a = (*data0);
+        const npy_float b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_cfloat
+    *(npy_cfloat *)dataptr[2] = ((*(npy_cfloat *)dataptr[2]) + accum);  /* the output is read and written once, after the loops */
+}
+/* Compiled out here (`#elif 1000 == 2 && !1`). Adds value0 * cfloat_sum_of_arr(data1, count) to the single output at dataptr[2]. */
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cfloat *data1 = (npy_cfloat *)dataptr[1];
+    npy_float value0 = (*(npy_cfloat *)dataptr[0]);  /* operand 0 is dereferenced exactly once */
+    npy_float accum = cfloat_sum_of_arr(data1, count);  /* helper is defined outside this view; presumably sums `count` elements (TODO confirm) */
+    *(npy_cfloat *)dataptr[2] = ((*(npy_cfloat *)dataptr[2]) + value0 * accum);  /* one multiply applied to the helper's result, not per element */
+}
+/* Compiled out here (`#elif 1000 == 2 && !1`). Adds value1 * cfloat_sum_of_arr(data0, count) to the single output at dataptr[2]. */
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cfloat *data0 = (npy_cfloat *)dataptr[0];
+    npy_float value1 = (*(npy_cfloat *)dataptr[1]);  /* operand 1 is dereferenced exactly once */
+    npy_float accum = cfloat_sum_of_arr(data0, count);  /* helper is defined outside this view; presumably sums `count` elements (TODO confirm) */
+    *(npy_cfloat *)dataptr[2] = ((*(npy_cfloat *)dataptr[2]) + value1 * accum);  /* one multiply applied to the helper's result, not per element */
+}
+
+#elif 1000 == 3 && !1
+
+/*
+ * Three-operand contiguous kernel: for each of `count` elements,
+ * out[k] = in0[k] * in1[k] * in2[k] + out[k], with the output taken from
+ * dataptr[3]. `strides` is unused.
+ */
+static void
+cfloat_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cfloat *src_a = (npy_cfloat *)dataptr[0];
+    npy_cfloat *src_b = (npy_cfloat *)dataptr[1];
+    npy_cfloat *src_c = (npy_cfloat *)dataptr[2];
+    npy_cfloat *dst = (npy_cfloat *)dataptr[3];
+    int k;
+
+    /* Consume whole groups of eight elements, lowest index first. */
+    for (; count >= 8; count -= 8) {
+        for (k = 0; k < 8; ++k) {
+            dst[k] = ((src_a[k]) *
+                      (src_b[k]) *
+                      (src_c[k]) +
+                      (dst[k]));
+        }
+
+        src_a += 8;
+        src_b += 8;
+        src_c += 8;
+        dst += 8;
+    }
+
+    /*
+     * Tail: before each of up to eight further elements, stop if the
+     * remaining count is exactly zero (the count is decremented either way).
+     */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        dst[k] = ((src_a[k]) *
+                  (src_b[k]) *
+                  (src_c[k]) +
+                  (dst[k]));
+    }
+
+}
+
+#else /* 1000 > 3 || complex */
+
+/*
+ * Sum-of-products inner loop for an arbitrary number of complex float
+ * operands that are each stepped by one element at a time.
+ *
+ * nop:     number of input operands; dataptr[nop] is the output.
+ * dataptr: operand pointers; each one (output included) is advanced in
+ *          place by sizeof(npy_cfloat) per processed element.
+ * strides: unused.
+ * count:   number of elements to process.
+ */
+static void
+cfloat_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+        /* `!1` is constant false, so only the complex arm below is built. */
+#if !1
+        npy_float temp = (*(npy_cfloat *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cfloat *)dataptr[i]);
+        }
+        *(npy_cfloat *)dataptr[nop] = (temp +
+                                           (*(npy_cfloat *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_cfloat);
+        }
+#else /* complex */
+        /* 1000 <= 3 is false, so _SUMPROD_NOP expands to the runtime `nop`. */
+#  if 1000 <= 3
+#    define _SUMPROD_NOP 1000
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_float re, im, tmp;
+        int i;
+        /* Seed the running product with operand 0 as a (real, imag) pair. */
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        /* Fold in each further operand: (re + i*im) *= (x + i*y). */
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            /* Keep the new real part in tmp; `im` still needs the old `re`. */
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        /* Add the product to the current output element. */
+        ((npy_float *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_float *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_float *)dataptr[_SUMPROD_NOP])[1];
+
+        /* Step every operand, output included, to its next element. */
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_cfloat);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+
+/*
+ * Sum `count` contiguous complex floats from dataptr[0] and add the total
+ * to the single complex element at dataptr[1].
+ *
+ * NOTE: the surrounding `#if 1000 == 1` guard is constant false, so this
+ * definition is compiled out in this expansion.
+ */
+static NPY_GCC_OPT_3 void
+cfloat_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+    /* `!1` is constant false: the real-valued arm is never selected. */
+#if !1
+    npy_cfloat *data = (npy_cfloat *)dataptr[0];
+    npy_float accum = cfloat_sum_of_arr(data, count);
+    *((npy_cfloat *)dataptr[1]) = (accum + (*((npy_cfloat *)dataptr[1])));
+#else
+    /* Real and imaginary parts are accumulated separately. */
+    npy_float accum_re = 0, accum_im = 0;
+    /* View the input as interleaved floats: even index real, odd imaginary. */
+    npy_float *data0 = (npy_float *)dataptr[0];
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Four complex values (eight floats) per iteration. The test is
+     * `count > 4`, so a remainder of exactly four is left to the loop below.
+     */
+    for (; count > 4; count -= 4, data0 += 4*2) {
+        const npy_float re01 = data0[0] + data0[2];
+        const npy_float re23 = data0[4] + data0[6];
+        const npy_float im13 = data0[1] + data0[3];
+        const npy_float im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining elements, one complex value (two floats) at a time. */
+    for (; count > 0; --count, data0 += 2) {
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    /* Fold both partial sums into the output element. */
+    ((npy_float *)dataptr[1])[0] += accum_re;
+    ((npy_float *)dataptr[1])[1] += accum_im;
+#endif // !1
+}
+
+#endif /* 1000 == 1 */
+
+/*
+ * Sum-of-products reduction of `nop` strided complex float operands into a
+ * single output element.
+ *
+ * nop:     number of input operands; dataptr[nop] is the output element.
+ * dataptr: operand pointers; in the arm that is built here the input
+ *          pointers are advanced in place, the output pointer is not.
+ * strides: byte step applied to each input pointer per element.
+ * count:   number of elements to reduce.
+ *
+ * The products are accumulated in locals and added to the output once,
+ * after the loop.
+ */
+static void
+cfloat_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    /* `#if 1` selects the complex accumulators (separate real/imag sums). */
+#if 1
+    npy_float accum_re = 0, accum_im = 0;
+#else
+    npy_float accum = 0;
+#endif
+
+    /* All three guards below are constant false, so no locals are declared. */
+#if (1000 == 1) || (1000 <= 3 && !1)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !1
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !1
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("cfloat_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+        /* `!1` is constant false: the real-valued arm is compiled out. */
+#if !1
+#  if 1000 == 1
+        accum += (*(npy_cfloat *)data0);
+        data0 += stride0;
+#  elif 1000 == 2
+        accum += (*(npy_cfloat *)data0) *
+                 (*(npy_cfloat *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1000 == 3
+        accum += (*(npy_cfloat *)data0) *
+                 (*(npy_cfloat *)data1) *
+                 (*(npy_cfloat *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_float temp = (*(npy_cfloat *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cfloat *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1
+        accum_re += ((npy_float *)data0)[0];
+        accum_im += ((npy_float *)data0)[1];
+        data0 += stride0;
+#  else
+        /* 1000 <= 3 is false, so _SUMPROD_NOP expands to the runtime `nop`. */
+#    if 1000 <= 3
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_float re, im, tmp;
+        int i;
+        /* Seed the running product with operand 0 as a (real, imag) pair. */
+        re = ((npy_float *)dataptr[0])[0];
+        im = ((npy_float *)dataptr[0])[1];
+        /* Fold in each further operand: (re + i*im) *= (x + i*y). */
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            /* Keep the new real part in tmp; `im` still needs the old `re`. */
+            tmp = re * ((npy_float *)dataptr[i])[0] -
+                  im * ((npy_float *)dataptr[i])[1];
+            im = re * ((npy_float *)dataptr[i])[1] +
+                 im * ((npy_float *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        /* Advance the inputs only (i < nop); the output stays where it is. */
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+    /* Single write-back: the `#if 1` / `#else` (nop) arm is the one built. */
+#if 1
+#  if 1000 <= 3
+    ((npy_float *)dataptr[1000])[0] += accum_re;
+    ((npy_float *)dataptr[1000])[1] += accum_im;
+#  else
+    ((npy_float *)dataptr[nop])[0] += accum_re;
+    ((npy_float *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1000 <= 3
+    *((npy_cfloat *)dataptr[1000]) = (accum +
+                                    (*((npy_cfloat *)dataptr[1000])));
+#  else
+    *((npy_cfloat *)dataptr[nop]) = (accum +
+                                    (*((npy_cfloat *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+
+
+#line 74
+
+#if !1
+/*
+ * Return the sum of `count` contiguous values starting at `data`.
+ *
+ * NOTE: this definition sits inside `#if !1`, which is constant false, so
+ * it is compiled out in this expansion.
+ */
+static NPY_GCC_OPT_3 npy_double cdouble_sum_of_arr(npy_cdouble *data, npy_intp count)
+{
+    npy_double accum = 0;
+    /* The vector arm is disabled by `#if 0`; the scalar arm below is used. */
+#if 0 // NPYV check for npy_cdouble
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data);
+    const int vstep = npyv_nlanes_f64;
+    npyv_f64 v_accum = npyv_zero_f64();
+    const npy_intp vstepx4 = vstep * 4;
+
+    #line 91
+    if(is_aligned) {
+        /* Four vectors per iteration, combined pairwise before accumulating. */
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_f64 a0 = npyv_loada_f64(data + vstep * 0);
+            
+#line 96
+            npyv_f64 a1 = npyv_loada_f64(data + vstep * 1);
+            
+#line 96
+            npyv_f64 a2 = npyv_loada_f64(data + vstep * 2);
+            
+#line 96
+            npyv_f64 a3 = npyv_loada_f64(data + vstep * 3);
+            
+            npyv_f64 a01   = npyv_add_f64(a0, a1);
+            npyv_f64 a23   = npyv_add_f64(a2, a3);
+            npyv_f64 a0123 = npyv_add_f64(a01, a23);
+                     v_accum = npyv_add_f64(a0123, v_accum);
+        }
+    }
+    
+#line 91
+    else {
+        /* Same reduction as above, using the unaligned load variant. */
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_f64 a0 = npyv_load_f64(data + vstep * 0);
+            
+#line 96
+            npyv_f64 a1 = npyv_load_f64(data + vstep * 1);
+            
+#line 96
+            npyv_f64 a2 = npyv_load_f64(data + vstep * 2);
+            
+#line 96
+            npyv_f64 a3 = npyv_load_f64(data + vstep * 3);
+            
+            npyv_f64 a01   = npyv_add_f64(a0, a1);
+            npyv_f64 a23   = npyv_add_f64(a2, a3);
+            npyv_f64 a0123 = npyv_add_f64(a01, a23);
+                     v_accum = npyv_add_f64(a0123, v_accum);
+        }
+    }
+    
+    /*
+     * Remainder, one vector at a time; presumably npyv_load_tillz_f64 reads
+     * at most `count` lanes and zero-fills the rest -- TODO confirm.
+     */
+    for (; count > 0; count -= vstep, data += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data, count);
+        v_accum = npyv_add_f64(a, v_accum);
+    }
+    accum = npyv_sum_f64(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Four values per iteration, summed pairwise. The test is `count > 4`,
+     * so a remainder of exactly four is left to the loop below.
+     */
+    for (; count > 4; count -= 4, data += 4) {
+        const npy_double a01 = (*data) + (data[1]);
+        const npy_double a23 = (data[2]) + (data[3]);
+        accum +=  a01 + a23;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining values one at a time. */
+    for (; count > 0; --count, data++) {
+        accum += (*data);
+    }
+#endif // NPYV check for npy_cdouble
+    return accum;
+}
+#endif
+
+#line 131
+/*
+ * One-operand strided kernel for complex doubles: for each of `count`
+ * elements, add the input element to the output element.
+ *
+ * nop:     not read in the arm that is built here.
+ * dataptr: dataptr[0] is the input, dataptr[1] the output; both are copied
+ *          into locals, so the caller's array is left unchanged.
+ * strides: byte steps for the input (strides[0]) and output (strides[1]).
+ * count:   number of elements to process.
+ */
+static void
+cdouble_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    /* `1 == 1` holds, so the input cursor and its stride are declared. */
+#if (1 == 1) || (1 <= 3 && !1)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+    /* Constant false: there is no second or third operand here. */
+#if (1 == 2 || 1 == 3) && !1
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !1
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+    /* `1 == 1` holds, so the output cursor and its stride are declared. */
+#if (1 == 1) || (1 <= 3 && !1)
+    char *data_out = dataptr[1];
+    npy_intp stride_out = strides[1];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_one (%d)\n", (int)count);
+
+    while (count--) {
+        /* `!1` is constant false: the real-valued arm is compiled out. */
+#if !1
+#  if 1 == 1
+        *(npy_cdouble *)data_out = ((*(npy_cdouble *)data0) +
+                                         (*(npy_cdouble *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1 == 2
+        *(npy_cdouble *)data_out = ((*(npy_cdouble *)data0) *
+                                         (*(npy_cdouble *)data1) +
+                                         (*(npy_cdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1 == 3
+        *(npy_cdouble *)data_out = ((*(npy_cdouble *)data0) *
+                                         (*(npy_cdouble *)data1) *
+                                         (*(npy_cdouble *)data2) +
+                                         (*(npy_cdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_double temp = (*(npy_cdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cdouble *)dataptr[i]);
+        }
+        *(npy_cdouble *)dataptr[nop] = (temp +
+                                           (*(npy_cdouble *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+        /* Built arm: add real and imaginary parts separately, then step. */
+#  if 1 == 1
+        ((npy_double *)data_out)[0] = ((npy_double *)data0)[0] +
+                                         ((npy_double *)data_out)[0];
+        ((npy_double *)data_out)[1] = ((npy_double *)data0)[1] +
+                                         ((npy_double *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+        /* Generic complex product arm; not selected because `1 == 1`. */
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_double re, im, tmp;
+        int i;
+        re = ((npy_double *)dataptr[0])[0];
+        im = ((npy_double *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_double *)dataptr[i])[0] -
+                  im * ((npy_double *)dataptr[i])[1];
+            im = re * ((npy_double *)dataptr[i])[1] +
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_double *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_double *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1 == 1
+
+/*
+ * One-operand contiguous kernel for complex doubles: adds `count` input
+ * elements from dataptr[0] to the matching output elements at dataptr[1].
+ * `strides` is unused.
+ *
+ * Control flow: a switch on `count` handles 0..7 elements through case
+ * fall-through and returns. A count of 8 or more matches no case, drops
+ * into the loop that processes eight elements per pass, then jumps back to
+ * the switch for the remainder.
+ *
+ * NOTE(review): a negative `count` matches no case and fails the loop
+ * test, so the trailing goto would repeat forever; callers presumably
+ * never pass one -- confirm.
+ */
+static void
+cdouble_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *data0 = (npy_cdouble *)dataptr[0];
+    npy_cdouble *data_out = (npy_cdouble *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    /*
+     * Each case handles the element at index (case value - 1) and falls
+     * through to the next lower one. In every `#if !1` pair only the
+     * `#else` arm is built: it adds the real part ([0]) and the imaginary
+     * part ([1]) through npy_double views offset by 2*index.
+     */
+    switch (count) {
+#line 246
+        case 6+1:
+#if !1
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_double *)data_out + 2*6)[0] =
+                                    ((npy_double *)data0 + 2*6)[0] +
+                                    ((npy_double *)data_out + 2*6)[0];
+            ((npy_double *)data_out + 2*6)[1] =
+                                    ((npy_double *)data0 + 2*6)[1] +
+                                    ((npy_double *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !1
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_double *)data_out + 2*5)[0] =
+                                    ((npy_double *)data0 + 2*5)[0] +
+                                    ((npy_double *)data_out + 2*5)[0];
+            ((npy_double *)data_out + 2*5)[1] =
+                                    ((npy_double *)data0 + 2*5)[1] +
+                                    ((npy_double *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !1
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_double *)data_out + 2*4)[0] =
+                                    ((npy_double *)data0 + 2*4)[0] +
+                                    ((npy_double *)data_out + 2*4)[0];
+            ((npy_double *)data_out + 2*4)[1] =
+                                    ((npy_double *)data0 + 2*4)[1] +
+                                    ((npy_double *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !1
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_double *)data_out + 2*3)[0] =
+                                    ((npy_double *)data0 + 2*3)[0] +
+                                    ((npy_double *)data_out + 2*3)[0];
+            ((npy_double *)data_out + 2*3)[1] =
+                                    ((npy_double *)data0 + 2*3)[1] +
+                                    ((npy_double *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !1
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_double *)data_out + 2*2)[0] =
+                                    ((npy_double *)data0 + 2*2)[0] +
+                                    ((npy_double *)data_out + 2*2)[0];
+            ((npy_double *)data_out + 2*2)[1] =
+                                    ((npy_double *)data0 + 2*2)[1] +
+                                    ((npy_double *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !1
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_double *)data_out + 2*1)[0] =
+                                    ((npy_double *)data0 + 2*1)[0] +
+                                    ((npy_double *)data_out + 2*1)[0];
+            ((npy_double *)data_out + 2*1)[1] =
+                                    ((npy_double *)data0 + 2*1)[1] +
+                                    ((npy_double *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !1
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_double *)data_out + 2*0)[0] =
+                                    ((npy_double *)data0 + 2*0)[0] +
+                                    ((npy_double *)data_out + 2*0)[0];
+            ((npy_double *)data_out + 2*0)[1] =
+                                    ((npy_double *)data0 + 2*0)[1] +
+                                    ((npy_double *)data_out + 2*0)[1];
+#endif
+
+        /* Nothing left (or the fall-through chain has finished). */
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !1
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_double *)data_out + 2*0)[0] =
+                                ((npy_double *)data0 + 2*0)[0] +
+                                ((npy_double *)data_out + 2*0)[0];
+        ((npy_double *)data_out + 2*0)[1] =
+                                ((npy_double *)data0 + 2*0)[1] +
+                                ((npy_double *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_double *)data_out + 2*1)[0] =
+                                ((npy_double *)data0 + 2*1)[0] +
+                                ((npy_double *)data_out + 2*1)[0];
+        ((npy_double *)data_out + 2*1)[1] =
+                                ((npy_double *)data0 + 2*1)[1] +
+                                ((npy_double *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_double *)data_out + 2*2)[0] =
+                                ((npy_double *)data0 + 2*2)[0] +
+                                ((npy_double *)data_out + 2*2)[0];
+        ((npy_double *)data_out + 2*2)[1] =
+                                ((npy_double *)data0 + 2*2)[1] +
+                                ((npy_double *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_double *)data_out + 2*3)[0] =
+                                ((npy_double *)data0 + 2*3)[0] +
+                                ((npy_double *)data_out + 2*3)[0];
+        ((npy_double *)data_out + 2*3)[1] =
+                                ((npy_double *)data0 + 2*3)[1] +
+                                ((npy_double *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_double *)data_out + 2*4)[0] =
+                                ((npy_double *)data0 + 2*4)[0] +
+                                ((npy_double *)data_out + 2*4)[0];
+        ((npy_double *)data_out + 2*4)[1] =
+                                ((npy_double *)data0 + 2*4)[1] +
+                                ((npy_double *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_double *)data_out + 2*5)[0] =
+                                ((npy_double *)data0 + 2*5)[0] +
+                                ((npy_double *)data_out + 2*5)[0];
+        ((npy_double *)data_out + 2*5)[1] =
+                                ((npy_double *)data0 + 2*5)[1] +
+                                ((npy_double *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_double *)data_out + 2*6)[0] =
+                                ((npy_double *)data0 + 2*6)[0] +
+                                ((npy_double *)data_out + 2*6)[0];
+        ((npy_double *)data_out + 2*6)[1] =
+                                ((npy_double *)data0 + 2*6)[1] +
+                                ((npy_double *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_double *)data_out + 2*7)[0] =
+                                ((npy_double *)data0 + 2*7)[0] +
+                                ((npy_double *)data_out + 2*7)[0];
+        ((npy_double *)data_out + 2*7)[1] =
+                                ((npy_double *)data0 + 2*7)[1] +
+                                ((npy_double *)data_out + 2*7)[1];
+#endif
+
+        /* Move both cursors past the eight elements just processed. */
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1 == 2 && !1
+
+/*
+ * Scaled accumulate over `count` contiguous elements:
+ *     data_out[i] = scalar * data[i] + data_out[i]
+ *
+ * NOTE: the enclosing `#elif 1 == 2 && !1` condition is constant false, so
+ * this definition is compiled out in this expansion.
+ */
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_muladd(npy_cdouble *data, npy_cdouble *data_out, npy_double scalar, npy_intp count)
+{
+    /* The vector arm is disabled by `#if 0`; the scalar arm below is used. */
+#if 0 // NPYV check for npy_cdouble
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f64;
+    /* The scalar is replicated into a vector once, outside the loops. */
+    const npyv_f64 v_scalar = npyv_setall_f64(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        /* Four vectors per iteration: load all, multiply-add all, store all. */
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f64 b0 = npyv_loada_f64(data + vstep * 0);
+            npyv_f64 c0 = npyv_loada_f64(data_out + vstep * 0);
+            
+#line 312
+            npyv_f64 b1 = npyv_loada_f64(data + vstep * 1);
+            npyv_f64 c1 = npyv_loada_f64(data_out + vstep * 1);
+            
+#line 312
+            npyv_f64 b2 = npyv_loada_f64(data + vstep * 2);
+            npyv_f64 c2 = npyv_loada_f64(data_out + vstep * 2);
+            
+#line 312
+            npyv_f64 b3 = npyv_loada_f64(data + vstep * 3);
+            npyv_f64 c3 = npyv_loada_f64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f64 abc0 = npyv_muladd_f64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f64 abc1 = npyv_muladd_f64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f64 abc2 = npyv_muladd_f64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f64 abc3 = npyv_muladd_f64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_f64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        /* Same as above, using the unaligned load/store variants. */
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f64 b0 = npyv_load_f64(data + vstep * 0);
+            npyv_f64 c0 = npyv_load_f64(data_out + vstep * 0);
+            
+#line 312
+            npyv_f64 b1 = npyv_load_f64(data + vstep * 1);
+            npyv_f64 c1 = npyv_load_f64(data_out + vstep * 1);
+            
+#line 312
+            npyv_f64 b2 = npyv_load_f64(data + vstep * 2);
+            npyv_f64 c2 = npyv_load_f64(data_out + vstep * 2);
+            
+#line 312
+            npyv_f64 b3 = npyv_load_f64(data + vstep * 3);
+            npyv_f64 c3 = npyv_load_f64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f64 abc0 = npyv_muladd_f64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f64 abc1 = npyv_muladd_f64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f64 abc2 = npyv_muladd_f64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f64 abc3 = npyv_muladd_f64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_f64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    /*
+     * Remainder, one vector at a time; presumably the *_tillz / *_till
+     * helpers touch at most `count` lanes -- TODO confirm.
+     */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data, count);
+        npyv_f64 b = npyv_load_tillz_f64(data_out, count);
+        npyv_store_till_f64(data_out, count, npyv_muladd_f64(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four elements per iteration: read all, compute all, then write all. */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_double b0 = (data[0]);
+        const npy_double c0 = (data_out[0]);
+        
+#line 340
+        const npy_double b1 = (data[1]);
+        const npy_double c1 = (data_out[1]);
+        
+#line 340
+        const npy_double b2 = (data[2]);
+        const npy_double c2 = (data_out[2]);
+        
+#line 340
+        const npy_double b3 = (data[3]);
+        const npy_double c3 = (data_out[3]);
+        
+        #line 346
+        const npy_double abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_double abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_double abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_double abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining elements one at a time. */
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_double b = (*data);
+        const npy_double c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_cdouble
+}
+
+static void
+cdouble_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *data0 = (npy_cdouble *)dataptr[0];
+    npy_cdouble *data1 = (npy_cdouble *)dataptr[1];
+    npy_cdouble *data_out = (npy_cdouble *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_cdouble
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f64;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f64 a0 = npyv_loada_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_loada_f64(data1 + vstep * 0);
+            npyv_f64 c0 = npyv_loada_f64(data_out + vstep * 0);
+            
+#line 390
+            npyv_f64 a1 = npyv_loada_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_loada_f64(data1 + vstep * 1);
+            npyv_f64 c1 = npyv_loada_f64(data_out + vstep * 1);
+            
+#line 390
+            npyv_f64 a2 = npyv_loada_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_loada_f64(data1 + vstep * 2);
+            npyv_f64 c2 = npyv_loada_f64(data_out + vstep * 2);
+            
+#line 390
+            npyv_f64 a3 = npyv_loada_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_loada_f64(data1 + vstep * 3);
+            npyv_f64 c3 = npyv_loada_f64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f64 abc0 = npyv_muladd_f64(a0, b0, c0);
+            
+#line 397
+            npyv_f64 abc1 = npyv_muladd_f64(a1, b1, c1);
+            
+#line 397
+            npyv_f64 abc2 = npyv_muladd_f64(a2, b2, c2);
+            
+#line 397
+            npyv_f64 abc3 = npyv_muladd_f64(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_f64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f64 a0 = npyv_load_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_load_f64(data1 + vstep * 0);
+            npyv_f64 c0 = npyv_load_f64(data_out + vstep * 0);
+            
+#line 390
+            npyv_f64 a1 = npyv_load_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_load_f64(data1 + vstep * 1);
+            npyv_f64 c1 = npyv_load_f64(data_out + vstep * 1);
+            
+#line 390
+            npyv_f64 a2 = npyv_load_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_load_f64(data1 + vstep * 2);
+            npyv_f64 c2 = npyv_load_f64(data_out + vstep * 2);
+            
+#line 390
+            npyv_f64 a3 = npyv_load_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_load_f64(data1 + vstep * 3);
+            npyv_f64 c3 = npyv_load_f64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f64 abc0 = npyv_muladd_f64(a0, b0, c0);
+            
+#line 397
+            npyv_f64 abc1 = npyv_muladd_f64(a1, b1, c1);
+            
+#line 397
+            npyv_f64 abc2 = npyv_muladd_f64(a2, b2, c2);
+            
+#line 397
+            npyv_f64 abc3 = npyv_muladd_f64(a3, b3, c3);
+            
+            #line 402
+            npyv_store_f64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data0, count);
+        npyv_f64 b = npyv_load_tillz_f64(data1, count);
+        npyv_f64 c = npyv_load_tillz_f64(data_out, count);
+        npyv_store_till_f64(data_out, count, npyv_muladd_f64(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_double a0 = (data0[0]);
+        const npy_double b0 = (data1[0]);
+        const npy_double c0 = (data_out[0]);
+        
+#line 420
+        const npy_double a1 = (data0[1]);
+        const npy_double b1 = (data1[1]);
+        const npy_double c1 = (data_out[1]);
+        
+#line 420
+        const npy_double a2 = (data0[2]);
+        const npy_double b2 = (data1[2]);
+        const npy_double c2 = (data_out[2]);
+        
+#line 420
+        const npy_double a3 = (data0[3]);
+        const npy_double b3 = (data1[3]);
+        const npy_double c3 = (data_out[3]);
+        
+        #line 427
+        const npy_double abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_double abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_double abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_double abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_double a = (*data0);
+        const npy_double b = (*data1);
+        const npy_double c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_cdouble
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+cdouble_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Two-operand kernel: forwards operand 1, the output and one value read from operand 0 to the muladd helper. */
+    npy_double value0 = (*(npy_cdouble *)dataptr[0]);   /* read once, before the helper call */
+    npy_cdouble *data1 = (npy_cdouble *)dataptr[1];     /* dataptr[1] viewed as an npy_cdouble array */
+    npy_cdouble *data_out = (npy_cdouble *)dataptr[2];  /* dataptr[2] viewed as an npy_cdouble array */
+    /* NOTE(review): value0 is an npy_double initialised from an npy_cdouble; presumably this expansion sits in a compiled-out #if branch -- confirm. */
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    cdouble_sum_of_products_muladd(data1, data_out, value0, count);
+    /* nop is not referenced here and strides is marked NPY_UNUSED. */
+}
+/* Two-operand kernel: forwards operand 0, the output and one value read from operand 1 to the muladd helper. */
+static void
+cdouble_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double value1 = (*(npy_cdouble *)dataptr[1]);   /* read once, before the helper call */
+    npy_cdouble *data0 = (npy_cdouble *)dataptr[0];     /* dataptr[0] viewed as an npy_cdouble array */
+    npy_cdouble *data_out = (npy_cdouble *)dataptr[2];  /* dataptr[2] viewed as an npy_cdouble array */
+    /* NOTE(review): value1 is an npy_double initialised from an npy_cdouble; presumably this expansion sits in a compiled-out #if branch -- confirm. */
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    cdouble_sum_of_products_muladd(data0, data_out, value1, count);  /* nop and strides are not used */
+}
+/* Two-operand kernel with one output element: adds the sum of data0[i] * data1[i] over count elements to the value at dataptr[2]. */
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *data0 = (npy_cdouble *)dataptr[0];
+    npy_cdouble *data1 = (npy_cdouble *)dataptr[1];
+    npy_double accum = 0;  /* running sum of products; folded into the output on the last line */
+    /* NOTE(review): products of npy_cdouble elements are held in npy_double below; presumably this expansion is compiled out -- confirm. */
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_cdouble
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_f64;
+    npyv_f64 v_accum = npyv_zero_f64();
+    /* This whole NPYV section is under `#if 0`; only the #else path further down is compiled. */
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f64 a0 = npyv_loada_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_loada_f64(data1 + vstep * 0);
+            
+#line 501
+            npyv_f64 a1 = npyv_loada_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_loada_f64(data1 + vstep * 1);
+            
+#line 501
+            npyv_f64 a2 = npyv_loada_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_loada_f64(data1 + vstep * 2);
+            
+#line 501
+            npyv_f64 a3 = npyv_loada_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_loada_f64(data1 + vstep * 3);
+            /* Chain the four multiply-adds so each one feeds the next, ending back in v_accum. */
+            npyv_f64 ab3 = npyv_muladd_f64(a3, b3, v_accum);
+            npyv_f64 ab2 = npyv_muladd_f64(a2, b2, ab3);
+            npyv_f64 ab1 = npyv_muladd_f64(a1, b1, ab2);
+                   v_accum = npyv_muladd_f64(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f64 a0 = npyv_load_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_load_f64(data1 + vstep * 0);
+            
+#line 501
+            npyv_f64 a1 = npyv_load_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_load_f64(data1 + vstep * 1);
+            
+#line 501
+            npyv_f64 a2 = npyv_load_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_load_f64(data1 + vstep * 2);
+            
+#line 501
+            npyv_f64 a3 = npyv_load_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_load_f64(data1 + vstep * 3);
+            /* Same multiply-add chain as the aligned branch; only the load intrinsic differs. */
+            npyv_f64 ab3 = npyv_muladd_f64(a3, b3, v_accum);
+            npyv_f64 ab2 = npyv_muladd_f64(a2, b2, ab3);
+            npyv_f64 ab1 = npyv_muladd_f64(a1, b1, ab2);
+                   v_accum = npyv_muladd_f64(a0, b0, ab1);
+        }
+    }
+    /* Tail: npyv_load_tillz_f64 is handed the remaining count (presumably it zero-fills lanes past it -- confirm). */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data0, count);
+        npyv_f64 b = npyv_load_tillz_f64(data1, count);
+        v_accum = npyv_muladd_f64(a, b, v_accum);
+    }
+    accum = npyv_sum_f64(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_double ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_double ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_double ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_double ab3 = (data0[3]) * (data1[3]);
+        /* Four products per pass are summed together before being added to accum. */
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) {  /* remaining elements, one at a time */
+        const npy_double a = (*data0);
+        const npy_double b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_cdouble
+    *(npy_cdouble *)dataptr[2] = ((*(npy_cdouble *)dataptr[2]) + accum);  /* add accum to the existing output value */
+}
+/* Two-operand kernel with one output element: adds value0 * cdouble_sum_of_arr(data1, count) to the value at dataptr[2]. */
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *data1 = (npy_cdouble *)dataptr[1];       /* operand 1, passed whole to the sum helper */
+    npy_double value0 = (*(npy_cdouble *)dataptr[0]);     /* NOTE(review): npy_cdouble read into npy_double; presumably compiled out -- confirm */
+    npy_double accum = cdouble_sum_of_arr(data1, count);  /* helper defined outside this block */
+    *(npy_cdouble *)dataptr[2] = ((*(npy_cdouble *)dataptr[2]) + value0 * accum);  /* scale once after summing, then accumulate */
+}
+/* Two-operand kernel with one output element: adds value1 * cdouble_sum_of_arr(data0, count) to the value at dataptr[2]. */
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *data0 = (npy_cdouble *)dataptr[0];       /* operand 0, passed whole to the sum helper */
+    npy_double value1 = (*(npy_cdouble *)dataptr[1]);     /* NOTE(review): npy_cdouble read into npy_double; presumably compiled out -- confirm */
+    npy_double accum = cdouble_sum_of_arr(data0, count);  /* helper defined outside this block */
+    *(npy_cdouble *)dataptr[2] = ((*(npy_cdouble *)dataptr[2]) + value1 * accum);  /* scale once after summing, then accumulate */
+}
+
+#elif 1 == 3 && !1
+/* Three-operand kernel, under `#elif 1 == 3 && !1` (constant-false): data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]. */
+static void
+cdouble_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *data0 = (npy_cdouble *)dataptr[0];
+    npy_cdouble *data1 = (npy_cdouble *)dataptr[1];
+    npy_cdouble *data2 = (npy_cdouble *)dataptr[2];
+    npy_cdouble *data_out = (npy_cdouble *)dataptr[3];
+    /* dataptr[0..2] are the three inputs and dataptr[3] is the output; nop is not referenced. */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+        /* Step all four pointers past the eight elements just processed. */
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    /* count is below 8 here: each step returns once the remainder hits zero, otherwise handles one more element. */
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1 > 3 || @complex */
+/* One-operand complex kernel (the #else arm of the chain): per element, adds the value at dataptr[0] into dataptr[1]. */
+static void
+cdouble_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+    /* NOTE(review): whether this #else arm is compiled depends on the first #if of the chain, which is outside this view. */
+    while (count--) {
+#if !1
+        npy_double temp = (*(npy_cdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cdouble *)dataptr[i]);
+        }
+        *(npy_cdouble *)dataptr[nop] = (temp +
+                                           (*(npy_cdouble *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_cdouble);
+        }
+#else /* complex */
+#  if 1 <= 3
+#    define _SUMPROD_NOP 1
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_double re, im, tmp;
+        int i;
+        re = ((npy_double *)dataptr[0])[0];  /* element [0] of the pair is treated as the real part */
+        im = ((npy_double *)dataptr[0])[1];  /* element [1] of the pair is treated as the imaginary part */
+        for (i = 1; i < _SUMPROD_NOP; ++i) {  /* with _SUMPROD_NOP == 1 this loop body never runs */
+            tmp = re * ((npy_double *)dataptr[i])[0] -
+                  im * ((npy_double *)dataptr[i])[1];
+            im = re * ((npy_double *)dataptr[i])[1] +
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_double *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_double *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[1];
+        /* Advance the input and the output pointer by one npy_cdouble; dataptr[] itself is modified. */
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_cdouble);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1 */
+
+#if 1 == 1
+/* One-operand complex reduction: sums count (re, im) pairs read from dataptr[0] and adds the totals to the pair at dataptr[1]. */
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !1
+    npy_cdouble *data = (npy_cdouble *)dataptr[0];
+    npy_double accum = cdouble_sum_of_arr(data, count);
+    *((npy_cdouble *)dataptr[1]) = (accum + (*((npy_cdouble *)dataptr[1])));
+#else
+    npy_double accum_re = 0, accum_im = 0;  /* separate running sums for the two components */
+    npy_double *data0 = (npy_double *)dataptr[0];  /* input viewed as interleaved doubles: [re, im, re, im, ...] */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {  /* 4 pairs (8 doubles) per pass; strict '>' leaves exactly 4 to the loop below */
+        const npy_double re01 = data0[0] + data0[2];  /* real parts of pairs 0 and 1 */
+        const npy_double re23 = data0[4] + data0[6];  /* real parts of pairs 2 and 3 */
+        const npy_double im13 = data0[1] + data0[3];  /* imaginary parts of pairs 0 and 1 (name follows the double indices) */
+        const npy_double im57 = data0[5] + data0[7];  /* imaginary parts of pairs 2 and 3 */
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {  /* remaining pairs, one at a time */
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_double *)dataptr[1])[0] += accum_re;  /* nop and strides are not read on this path */
+    ((npy_double *)dataptr[1])[1] += accum_im;
+#endif // !1
+}
+
+#endif /* 1 == 1 */
+/* One-operand strided complex reduction: sums count (re, im) pairs spaced strides[0] bytes apart into the pair at dataptr[1]. */
+static void
+cdouble_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 1
+    npy_double accum_re = 0, accum_im = 0;
+#else
+    npy_double accum = 0;
+#endif
+    /* Of the three declaration guards below only the first is true, so just data0 and stride0 exist. */
+#if (1 == 1) || (1 <= 3 && !1)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !1
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !1
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+    /* Inside the loop `#if !1` is false, so the `#else` part runs, and within it the `1 == 1` case. */
+    while (count--) {
+#if !1
+#  if 1 == 1
+        accum += (*(npy_cdouble *)data0);
+        data0 += stride0;
+#  elif 1 == 2
+        accum += (*(npy_cdouble *)data0) *
+                 (*(npy_cdouble *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1 == 3
+        accum += (*(npy_cdouble *)data0) *
+                 (*(npy_cdouble *)data1) *
+                 (*(npy_cdouble *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_double temp = (*(npy_cdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cdouble *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1
+        accum_re += ((npy_double *)data0)[0];
+        accum_im += ((npy_double *)data0)[1];
+        data0 += stride0;  /* data0 is char *, so this advances by stride0 bytes */
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_double re, im, tmp;
+        int i;
+        re = ((npy_double *)dataptr[0])[0];
+        im = ((npy_double *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_double *)dataptr[i])[0] -
+                  im * ((npy_double *)dataptr[i])[1];
+            im = re * ((npy_double *)dataptr[i])[1] +
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* Write-back: `#if 1` and `1 <= 3` select the first pair of statements, which update dataptr[1]. */
+#if 1
+#  if 1 <= 3
+    ((npy_double *)dataptr[1])[0] += accum_re;
+    ((npy_double *)dataptr[1])[1] += accum_im;
+#  else
+    ((npy_double *)dataptr[nop])[0] += accum_re;
+    ((npy_double *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1 <= 3
+    *((npy_cdouble *)dataptr[1]) = (accum +
+                                    (*((npy_cdouble *)dataptr[1])));
+#  else
+    *((npy_cdouble *)dataptr[nop]) = (accum +
+                                    (*((npy_cdouble *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+cdouble_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Strided two-operand complex kernel: per element, dataptr[2] += dataptr[0] * dataptr[1], then each pointer moves by its stride. */
+#if (2 == 1) || (2 <= 3 && !1)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !1
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !1
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (2 == 1) || (2 <= 3 && !1)
+    char *data_out = dataptr[2];
+    npy_intp stride_out = strides[2];
+#endif
+    /* None of the four guards above is true, so no locals are declared; the active code uses dataptr[] and strides[] directly. */
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_two (%d)\n", (int)count);
+    /* Inside the loop `#if !1` is false and `2 == 1` is false, so the generic complex branch with _SUMPROD_NOP == 2 runs. */
+    while (count--) {
+#if !1
+#  if 2 == 1
+        *(npy_cdouble *)data_out = ((*(npy_cdouble *)data0) +
+                                         (*(npy_cdouble *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 2 == 2
+        *(npy_cdouble *)data_out = ((*(npy_cdouble *)data0) *
+                                         (*(npy_cdouble *)data1) +
+                                         (*(npy_cdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 2 == 3
+        *(npy_cdouble *)data_out = ((*(npy_cdouble *)data0) *
+                                         (*(npy_cdouble *)data1) *
+                                         (*(npy_cdouble *)data2) +
+                                         (*(npy_cdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_double temp = (*(npy_cdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cdouble *)dataptr[i]);
+        }
+        *(npy_cdouble *)dataptr[nop] = (temp +
+                                           (*(npy_cdouble *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        ((npy_double *)data_out)[0] = ((npy_double *)data0)[0] +
+                                         ((npy_double *)data_out)[0];
+        ((npy_double *)data_out)[1] = ((npy_double *)data0)[1] +
+                                         ((npy_double *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_double re, im, tmp;
+        int i;
+        re = ((npy_double *)dataptr[0])[0];  /* start from operand 0 as (re, im) */
+        im = ((npy_double *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {  /* runs once (i == 1): complex multiply by operand 1 */
+            tmp = re * ((npy_double *)dataptr[i])[0] -
+                  im * ((npy_double *)dataptr[i])[1];  /* new real part, held in tmp because im still needs the old re */
+            im = re * ((npy_double *)dataptr[i])[1] +
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_double *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_double *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[1];
+        /* Advance both inputs and the output (indices 0..2) by their byte strides; dataptr[] itself is modified. */
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 2 == 1
+/* One-operand contiguous kernel, under `#if 2 == 1` (constant-false): adds data0[i] into data_out[i] for count elements. */
+static void
+cdouble_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *data0 = (npy_cdouble *)dataptr[0];
+    npy_cdouble *data_out = (npy_cdouble *)dataptr[1];
+    /* In every `#if !1` / `#else` pair below the `#else` half is the selected one: it adds the two doubles of each element separately. */
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+    /* Counts 0..7 are finished entirely by the switch; larger counts match no case and continue to the unrolled loop. */
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !1
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_double *)data_out + 2*6)[0] =
+                                    ((npy_double *)data0 + 2*6)[0] +
+                                    ((npy_double *)data_out + 2*6)[0];
+            ((npy_double *)data_out + 2*6)[1] =
+                                    ((npy_double *)data0 + 2*6)[1] +
+                                    ((npy_double *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !1
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_double *)data_out + 2*5)[0] =
+                                    ((npy_double *)data0 + 2*5)[0] +
+                                    ((npy_double *)data_out + 2*5)[0];
+            ((npy_double *)data_out + 2*5)[1] =
+                                    ((npy_double *)data0 + 2*5)[1] +
+                                    ((npy_double *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !1
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_double *)data_out + 2*4)[0] =
+                                    ((npy_double *)data0 + 2*4)[0] +
+                                    ((npy_double *)data_out + 2*4)[0];
+            ((npy_double *)data_out + 2*4)[1] =
+                                    ((npy_double *)data0 + 2*4)[1] +
+                                    ((npy_double *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !1
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_double *)data_out + 2*3)[0] =
+                                    ((npy_double *)data0 + 2*3)[0] +
+                                    ((npy_double *)data_out + 2*3)[0];
+            ((npy_double *)data_out + 2*3)[1] =
+                                    ((npy_double *)data0 + 2*3)[1] +
+                                    ((npy_double *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !1
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_double *)data_out + 2*2)[0] =
+                                    ((npy_double *)data0 + 2*2)[0] +
+                                    ((npy_double *)data_out + 2*2)[0];
+            ((npy_double *)data_out + 2*2)[1] =
+                                    ((npy_double *)data0 + 2*2)[1] +
+                                    ((npy_double *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !1
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_double *)data_out + 2*1)[0] =
+                                    ((npy_double *)data0 + 2*1)[0] +
+                                    ((npy_double *)data_out + 2*1)[0];
+            ((npy_double *)data_out + 2*1)[1] =
+                                    ((npy_double *)data0 + 2*1)[1] +
+                                    ((npy_double *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !1
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_double *)data_out + 2*0)[0] =
+                                    ((npy_double *)data0 + 2*0)[0] +
+                                    ((npy_double *)data_out + 2*0)[0];
+            ((npy_double *)data_out + 2*0)[1] =
+                                    ((npy_double *)data0 + 2*0)[1] +
+                                    ((npy_double *)data_out + 2*0)[1];
+#endif
+        /* No case above ends in break: entering at `case k+1` handles elements k down to 0 by fall-through, then returns here. */
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !1
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_double *)data_out + 2*0)[0] =
+                                ((npy_double *)data0 + 2*0)[0] +
+                                ((npy_double *)data_out + 2*0)[0];
+        ((npy_double *)data_out + 2*0)[1] =
+                                ((npy_double *)data0 + 2*0)[1] +
+                                ((npy_double *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_double *)data_out + 2*1)[0] =
+                                ((npy_double *)data0 + 2*1)[0] +
+                                ((npy_double *)data_out + 2*1)[0];
+        ((npy_double *)data_out + 2*1)[1] =
+                                ((npy_double *)data0 + 2*1)[1] +
+                                ((npy_double *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_double *)data_out + 2*2)[0] =
+                                ((npy_double *)data0 + 2*2)[0] +
+                                ((npy_double *)data_out + 2*2)[0];
+        ((npy_double *)data_out + 2*2)[1] =
+                                ((npy_double *)data0 + 2*2)[1] +
+                                ((npy_double *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_double *)data_out + 2*3)[0] =
+                                ((npy_double *)data0 + 2*3)[0] +
+                                ((npy_double *)data_out + 2*3)[0];
+        ((npy_double *)data_out + 2*3)[1] =
+                                ((npy_double *)data0 + 2*3)[1] +
+                                ((npy_double *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_double *)data_out + 2*4)[0] =
+                                ((npy_double *)data0 + 2*4)[0] +
+                                ((npy_double *)data_out + 2*4)[0];
+        ((npy_double *)data_out + 2*4)[1] =
+                                ((npy_double *)data0 + 2*4)[1] +
+                                ((npy_double *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_double *)data_out + 2*5)[0] =
+                                ((npy_double *)data0 + 2*5)[0] +
+                                ((npy_double *)data_out + 2*5)[0];
+        ((npy_double *)data_out + 2*5)[1] =
+                                ((npy_double *)data0 + 2*5)[1] +
+                                ((npy_double *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_double *)data_out + 2*6)[0] =
+                                ((npy_double *)data0 + 2*6)[0] +
+                                ((npy_double *)data_out + 2*6)[0];
+        ((npy_double *)data_out + 2*6)[1] =
+                                ((npy_double *)data0 + 2*6)[1] +
+                                ((npy_double *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_double *)data_out + 2*7)[0] =
+                                ((npy_double *)data0 + 2*7)[0] +
+                                ((npy_double *)data_out + 2*7)[0];
+        ((npy_double *)data_out + 2*7)[1] =
+                                ((npy_double *)data0 + 2*7)[1] +
+                                ((npy_double *)data_out + 2*7)[1];
+#endif
+        /* Step both pointers past the eight elements just processed. */
+        data0 += 8;
+        data_out += 8;
+    }
+    /* count is now below 8, so the switch at the label handles the remainder and returns. */
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 2 == 2 && !1
+
+// Multiply-add kernel: data_out[i] = scalar * data[i] + data_out[i] for each of the `count` elements.
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_muladd(npy_cdouble *data, npy_cdouble *data_out, npy_double scalar, npy_intp count)
+{ // NOTE: the enclosing `#elif 2 == 2 && !1` is false, so this definition is preprocessed out.
+#if 0 // NPYV check for npy_cdouble (constant 0: the SIMD path below is skipped)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f64;
+    const npyv_f64 v_scalar = npyv_setall_f64(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f64 b0 = npyv_loada_f64(data + vstep * 0);
+            npyv_f64 c0 = npyv_loada_f64(data_out + vstep * 0);
+            
+#line 312
+            npyv_f64 b1 = npyv_loada_f64(data + vstep * 1);
+            npyv_f64 c1 = npyv_loada_f64(data_out + vstep * 1);
+            
+#line 312
+            npyv_f64 b2 = npyv_loada_f64(data + vstep * 2);
+            npyv_f64 c2 = npyv_loada_f64(data_out + vstep * 2);
+            
+#line 312
+            npyv_f64 b3 = npyv_loada_f64(data + vstep * 3);
+            npyv_f64 c3 = npyv_loada_f64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f64 abc0 = npyv_muladd_f64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f64 abc1 = npyv_muladd_f64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f64 abc2 = npyv_muladd_f64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f64 abc3 = npyv_muladd_f64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_f64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f64 b0 = npyv_load_f64(data + vstep * 0);
+            npyv_f64 c0 = npyv_load_f64(data_out + vstep * 0);
+            
+#line 312
+            npyv_f64 b1 = npyv_load_f64(data + vstep * 1);
+            npyv_f64 c1 = npyv_load_f64(data_out + vstep * 1);
+            
+#line 312
+            npyv_f64 b2 = npyv_load_f64(data + vstep * 2);
+            npyv_f64 c2 = npyv_load_f64(data_out + vstep * 2);
+            
+#line 312
+            npyv_f64 b3 = npyv_load_f64(data + vstep * 3);
+            npyv_f64 c3 = npyv_load_f64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f64 abc0 = npyv_muladd_f64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f64 abc1 = npyv_muladd_f64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f64 abc2 = npyv_muladd_f64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f64 abc3 = npyv_muladd_f64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_f64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Tail of the SIMD path: the *_till* helpers are passed the remaining count. */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data, count);
+        npyv_f64 b = npyv_load_tillz_f64(data_out, count);
+        npyv_store_till_f64(data_out, count, npyv_muladd_f64(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else // scalar path: the one selected by the `#if 0` above
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_double b0 = (data[0]);
+        const npy_double c0 = (data_out[0]);
+        
+#line 340
+        const npy_double b1 = (data[1]);
+        const npy_double c1 = (data_out[1]);
+        
+#line 340
+        const npy_double b2 = (data[2]);
+        const npy_double c2 = (data_out[2]);
+        
+#line 340
+        const npy_double b3 = (data[3]);
+        const npy_double c3 = (data_out[3]);
+        
+        #line 346
+        const npy_double abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_double abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_double abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_double abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* elements not handled by the unrolled loop above */
+        const npy_double b = (*data);
+        const npy_double c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_cdouble
+}
+
+static void
+cdouble_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ // Element-wise data_out[i] = data0[i] * data1[i] + data_out[i]; the enclosing `#elif 2 == 2 && !1` is false.
+    npy_cdouble *data0 = (npy_cdouble *)dataptr[0];
+    npy_cdouble *data1 = (npy_cdouble *)dataptr[1];
+    npy_cdouble *data_out = (npy_cdouble *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_cdouble
+#if 0 // constant 0: the SIMD path below is skipped
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f64;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f64 a0 = npyv_loada_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_loada_f64(data1 + vstep * 0);
+            npyv_f64 c0 = npyv_loada_f64(data_out + vstep * 0);
+            
+#line 390
+            npyv_f64 a1 = npyv_loada_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_loada_f64(data1 + vstep * 1);
+            npyv_f64 c1 = npyv_loada_f64(data_out + vstep * 1);
+            
+#line 390
+            npyv_f64 a2 = npyv_loada_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_loada_f64(data1 + vstep * 2);
+            npyv_f64 c2 = npyv_loada_f64(data_out + vstep * 2);
+            
+#line 390
+            npyv_f64 a3 = npyv_loada_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_loada_f64(data1 + vstep * 3);
+            npyv_f64 c3 = npyv_loada_f64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f64 abc0 = npyv_muladd_f64(a0, b0, c0);
+            
+#line 397
+            npyv_f64 abc1 = npyv_muladd_f64(a1, b1, c1);
+            
+#line 397
+            npyv_f64 abc2 = npyv_muladd_f64(a2, b2, c2);
+            
+#line 397
+            npyv_f64 abc3 = npyv_muladd_f64(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_f64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f64 a0 = npyv_load_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_load_f64(data1 + vstep * 0);
+            npyv_f64 c0 = npyv_load_f64(data_out + vstep * 0);
+            
+#line 390
+            npyv_f64 a1 = npyv_load_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_load_f64(data1 + vstep * 1);
+            npyv_f64 c1 = npyv_load_f64(data_out + vstep * 1);
+            
+#line 390
+            npyv_f64 a2 = npyv_load_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_load_f64(data1 + vstep * 2);
+            npyv_f64 c2 = npyv_load_f64(data_out + vstep * 2);
+            
+#line 390
+            npyv_f64 a3 = npyv_load_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_load_f64(data1 + vstep * 3);
+            npyv_f64 c3 = npyv_load_f64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f64 abc0 = npyv_muladd_f64(a0, b0, c0);
+            
+#line 397
+            npyv_f64 abc1 = npyv_muladd_f64(a1, b1, c1);
+            
+#line 397
+            npyv_f64 abc2 = npyv_muladd_f64(a2, b2, c2);
+            
+#line 397
+            npyv_f64 abc3 = npyv_muladd_f64(a3, b3, c3);
+            
+            #line 402
+            npyv_store_f64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Tail of the SIMD path: the *_till* helpers are passed the remaining count. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data0, count);
+        npyv_f64 b = npyv_load_tillz_f64(data1, count);
+        npyv_f64 c = npyv_load_tillz_f64(data_out, count);
+        npyv_store_till_f64(data_out, count, npyv_muladd_f64(a, b, c));
+    }
+    npyv_cleanup();
+#else // scalar path: the one selected by the `#if 0` above
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_double a0 = (data0[0]);
+        const npy_double b0 = (data1[0]);
+        const npy_double c0 = (data_out[0]);
+        
+#line 420
+        const npy_double a1 = (data0[1]);
+        const npy_double b1 = (data1[1]);
+        const npy_double c1 = (data_out[1]);
+        
+#line 420
+        const npy_double a2 = (data0[2]);
+        const npy_double b2 = (data1[2]);
+        const npy_double c2 = (data_out[2]);
+        
+#line 420
+        const npy_double a3 = (data0[3]);
+        const npy_double b3 = (data1[3]);
+        const npy_double c3 = (data_out[3]);
+        
+        #line 427
+        const npy_double abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_double abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_double abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_double abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) { /* elements not handled by the unrolled loop above */
+        const npy_double a = (*data0);
+        const npy_double b = (*data1);
+        const npy_double c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_cdouble
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+cdouble_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 0 is read once as a single value; operand 1 and the output are walked as arrays. */
+    npy_cdouble *const contig_in = (npy_cdouble *)dataptr[1];
+    npy_cdouble *const accum_out = (npy_cdouble *)dataptr[2];
+    const npy_double factor = (*(npy_cdouble *)dataptr[0]);
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    /* Hand off to the shared multiply-add helper. */
+    cdouble_sum_of_products_muladd(contig_in, accum_out, factor, count);
+}
+
+static void
+cdouble_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 1 is read once as a single value; operand 0 and the output are walked as arrays. */
+    npy_cdouble *const contig_in = (npy_cdouble *)dataptr[0];
+    npy_cdouble *const accum_out = (npy_cdouble *)dataptr[2];
+    const npy_double factor = (*(npy_cdouble *)dataptr[1]);
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    cdouble_sum_of_products_muladd(contig_in, accum_out, factor, count);
+}
+
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ // Adds sum(data0[i] * data1[i]) over `count` elements to the single output element at dataptr[2].
+    npy_cdouble *data0 = (npy_cdouble *)dataptr[0];
+    npy_cdouble *data1 = (npy_cdouble *)dataptr[1];
+    npy_double accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_cdouble (constant 0: the SIMD path below is skipped)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_f64;
+    npyv_f64 v_accum = npyv_zero_f64();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f64 a0 = npyv_loada_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_loada_f64(data1 + vstep * 0);
+            
+#line 501
+            npyv_f64 a1 = npyv_loada_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_loada_f64(data1 + vstep * 1);
+            
+#line 501
+            npyv_f64 a2 = npyv_loada_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_loada_f64(data1 + vstep * 2);
+            
+#line 501
+            npyv_f64 a3 = npyv_loada_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_loada_f64(data1 + vstep * 3);
+            
+            npyv_f64 ab3 = npyv_muladd_f64(a3, b3, v_accum);
+            npyv_f64 ab2 = npyv_muladd_f64(a2, b2, ab3);
+            npyv_f64 ab1 = npyv_muladd_f64(a1, b1, ab2);
+                   v_accum = npyv_muladd_f64(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f64 a0 = npyv_load_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_load_f64(data1 + vstep * 0);
+            
+#line 501
+            npyv_f64 a1 = npyv_load_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_load_f64(data1 + vstep * 1);
+            
+#line 501
+            npyv_f64 a2 = npyv_load_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_load_f64(data1 + vstep * 2);
+            
+#line 501
+            npyv_f64 a3 = npyv_load_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_load_f64(data1 + vstep * 3);
+            
+            npyv_f64 ab3 = npyv_muladd_f64(a3, b3, v_accum);
+            npyv_f64 ab2 = npyv_muladd_f64(a2, b2, ab3);
+            npyv_f64 ab1 = npyv_muladd_f64(a1, b1, ab2);
+                   v_accum = npyv_muladd_f64(a0, b0, ab1);
+        }
+    }
+    /* Tail of the SIMD path: the tillz loads are passed the remaining count. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data0, count);
+        npyv_f64 b = npyv_load_tillz_f64(data1, count);
+        v_accum = npyv_muladd_f64(a, b, v_accum);
+    }
+    accum = npyv_sum_f64(v_accum);
+    npyv_cleanup();
+#else // scalar path: the one selected by the `#if 0` above
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_double ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_double ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_double ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_double ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) { /* elements not handled by the unrolled loop above */
+        const npy_double a = (*data0);
+        const npy_double b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_cdouble
+    *(npy_cdouble *)dataptr[2] = ((*(npy_cdouble *)dataptr[2]) + accum); /* fold the local sum into the output */
+}
+
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* output += operand0 * sum(operand1[0..count)); operand 0 is read once, before the summation. */
+    const npy_double factor = (*(npy_cdouble *)dataptr[0]);
+    const npy_double total = cdouble_sum_of_arr((npy_cdouble *)dataptr[1], count);
+    *(npy_cdouble *)dataptr[2] = ((*(npy_cdouble *)dataptr[2]) + factor * total);
+}
+
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* output += operand1 * sum(operand0[0..count)); operand 1 is read once, before the summation. */
+    const npy_double factor = (*(npy_cdouble *)dataptr[1]);
+    const npy_double total = cdouble_sum_of_arr((npy_cdouble *)dataptr[0], count);
+    *(npy_cdouble *)dataptr[2] = ((*(npy_cdouble *)dataptr[2]) + factor * total);
+}
+
+#elif 2 == 3 && !1
+
+static void
+cdouble_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ // data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]; the enclosing `#elif 2 == 3 && !1` is false.
+    npy_cdouble *data0 = (npy_cdouble *)dataptr[0];
+    npy_cdouble *data1 = (npy_cdouble *)dataptr[1];
+    npy_cdouble *data2 = (npy_cdouble *)dataptr[2];
+    npy_cdouble *data_out = (npy_cdouble *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: fewer than 8 elements remain; return as soon as count reaches 0 */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 2 > 3 || @complex */
+
+static void
+cdouble_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{ /* Contiguous two-operand complex kernel: output element += operand0 element * operand1 element. */
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+    /* One pass per element; each operand pointer then steps forward by one item. */
+    for (; count != 0; --count) {
+#if !1
+        int op;
+        npy_double prod = (*(npy_cdouble *)dataptr[0]);
+        for (op = 1; op < nop; ++op) {
+            prod *= (*(npy_cdouble *)dataptr[op]);
+        }
+        /* op == nop here, so both sides address the output operand */
+        *(npy_cdouble *)dataptr[nop] = (prod + (*(npy_cdouble *)dataptr[op]));
+        for (op = 0; op <= nop; ++op) {
+            dataptr[op] += sizeof(npy_cdouble);
+        }
+#else /* complex */
+#  if 2 <= 3
+#    define _SUMPROD_NOP 2
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        int op;
+        npy_double *out;
+        /* Running complex product, seeded with operand 0 */
+        npy_double acc_re = ((npy_double *)dataptr[0])[0];
+        npy_double acc_im = ((npy_double *)dataptr[0])[1];
+        for (op = 1; op < _SUMPROD_NOP; ++op) {
+            const npy_double *rhs = (npy_double *)dataptr[op];
+            const npy_double next_re = acc_re * rhs[0] - acc_im * rhs[1];
+            acc_im = acc_re * rhs[1] + acc_im * rhs[0];
+            acc_re = next_re;
+        }
+        /* Accumulate the product into the output element */
+        out = (npy_double *)dataptr[_SUMPROD_NOP];
+        out[0] = acc_re + out[0];
+        out[1] = acc_im + out[1];
+        /* Advance the inputs and the output together */
+        for (op = 0; op <= _SUMPROD_NOP; ++op) {
+            dataptr[op] += sizeof(npy_cdouble);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{ /* Adds the sum of `count` contiguous inputs to the single output element at dataptr[1]. */
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !1
+    npy_double total = cdouble_sum_of_arr((npy_cdouble *)dataptr[0], count);
+    npy_cdouble *out = (npy_cdouble *)dataptr[1];
+    *out = (total + (*out));
+#else
+    npy_double *src = (npy_double *)dataptr[0]; /* interleaved (re, im) pairs */
+    npy_double sum_re = 0, sum_im = 0;
+#ifndef NPY_DISABLE_OPTIMIZATION
+    while (count > 4) { /* four complex values per pass */
+        const npy_double re_lo = src[0] + src[2], re_hi = src[4] + src[6];
+        const npy_double im_lo = src[1] + src[3], im_hi = src[5] + src[7];
+        sum_re += re_lo + re_hi;
+        sum_im += im_lo + im_hi;
+        count -= 4;
+        src += 4*2;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, src += 2) { /* whatever is left, one value at a time */
+        sum_re += src[0];
+        sum_im += src[1];
+    }
+    ((npy_double *)dataptr[1])[0] += sum_re;
+    ((npy_double *)dataptr[1])[1] += sum_im;
+#endif // !1
+}
+
+#endif /* 2 == 1 */
+
+static void
+cdouble_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{ // Strided two-operand kernel with one output element: dataptr[2] += sum of the complex products.
+#if 1 // complex variant: separate real and imaginary accumulators
+    npy_double accum_re = 0, accum_im = 0;
+#else
+    npy_double accum = 0;
+#endif
+
+#if (2 == 1) || (2 <= 3 && !1)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !1
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !1
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+    /* All three conditions above are false, so no data0/data1/data2 locals are declared here. */
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !1 // real-valued variants (condition is 0, so these are skipped)
+#  if 2 == 1
+        accum += (*(npy_cdouble *)data0);
+        data0 += stride0;
+#  elif 2 == 2
+        accum += (*(npy_cdouble *)data0) *
+                 (*(npy_cdouble *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 2 == 3
+        accum += (*(npy_cdouble *)data0) *
+                 (*(npy_cdouble *)data1) *
+                 (*(npy_cdouble *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_double temp = (*(npy_cdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cdouble *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        accum_re += ((npy_double *)data0)[0];
+        accum_im += ((npy_double *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_double re, im, tmp;
+        int i;
+        re = ((npy_double *)dataptr[0])[0];
+        im = ((npy_double *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_double *)dataptr[i])[0] -
+                  im * ((npy_double *)dataptr[i])[1];
+            im = re * ((npy_double *)dataptr[i])[1] +
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) { /* only the inputs advance; the output pointer stays put */
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 1
+#  if 2 <= 3
+    ((npy_double *)dataptr[2])[0] += accum_re;
+    ((npy_double *)dataptr[2])[1] += accum_im;
+#  else
+    ((npy_double *)dataptr[nop])[0] += accum_re;
+    ((npy_double *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 2 <= 3
+    *((npy_cdouble *)dataptr[2]) = (accum +
+                                    (*((npy_cdouble *)dataptr[2])));
+#  else
+    *((npy_cdouble *)dataptr[nop]) = (accum +
+                                    (*((npy_cdouble *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+cdouble_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{ // Strided three-operand complex kernel: output element += product of the three input elements.
+#if (3 == 1) || (3 <= 3 && !1)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !1
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !1
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (3 == 1) || (3 <= 3 && !1)
+    char *data_out = dataptr[3];
+    npy_intp stride_out = strides[3];
+#endif
+    /* All four conditions above are false, so no data0/data1/data2/data_out locals are declared here. */
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_three (%d)\n", (int)count);
+
+    while (count--) {
+#if !1 // real-valued variants (condition is 0, so these are skipped)
+#  if 3 == 1
+        *(npy_cdouble *)data_out = ((*(npy_cdouble *)data0) +
+                                         (*(npy_cdouble *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 3 == 2
+        *(npy_cdouble *)data_out = ((*(npy_cdouble *)data0) *
+                                         (*(npy_cdouble *)data1) +
+                                         (*(npy_cdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 3 == 3
+        *(npy_cdouble *)data_out = ((*(npy_cdouble *)data0) *
+                                         (*(npy_cdouble *)data1) *
+                                         (*(npy_cdouble *)data2) +
+                                         (*(npy_cdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_double temp = (*(npy_cdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cdouble *)dataptr[i]);
+        }
+        *(npy_cdouble *)dataptr[nop] = (temp +
+                                           (*(npy_cdouble *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        ((npy_double *)data_out)[0] = ((npy_double *)data0)[0] +
+                                         ((npy_double *)data_out)[0];
+        ((npy_double *)data_out)[1] = ((npy_double *)data0)[1] +
+                                         ((npy_double *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_double re, im, tmp;
+        int i;
+        re = ((npy_double *)dataptr[0])[0];
+        im = ((npy_double *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_double *)dataptr[i])[0] -
+                  im * ((npy_double *)dataptr[i])[1];
+            im = re * ((npy_double *)dataptr[i])[1] +
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_double *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_double *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) { /* inputs and output all advance by their own stride */
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 3 == 1
+
+static void
+cdouble_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *data0 = (npy_cdouble *)dataptr[0];
+    npy_cdouble *data_out = (npy_cdouble *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 246
+        case 6+1:
+#if !1
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_double *)data_out + 2*6)[0] =
+                                    ((npy_double *)data0 + 2*6)[0] +
+                                    ((npy_double *)data_out + 2*6)[0];
+            ((npy_double *)data_out + 2*6)[1] =
+                                    ((npy_double *)data0 + 2*6)[1] +
+                                    ((npy_double *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !1
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_double *)data_out + 2*5)[0] =
+                                    ((npy_double *)data0 + 2*5)[0] +
+                                    ((npy_double *)data_out + 2*5)[0];
+            ((npy_double *)data_out + 2*5)[1] =
+                                    ((npy_double *)data0 + 2*5)[1] +
+                                    ((npy_double *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !1
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_double *)data_out + 2*4)[0] =
+                                    ((npy_double *)data0 + 2*4)[0] +
+                                    ((npy_double *)data_out + 2*4)[0];
+            ((npy_double *)data_out + 2*4)[1] =
+                                    ((npy_double *)data0 + 2*4)[1] +
+                                    ((npy_double *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !1
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_double *)data_out + 2*3)[0] =
+                                    ((npy_double *)data0 + 2*3)[0] +
+                                    ((npy_double *)data_out + 2*3)[0];
+            ((npy_double *)data_out + 2*3)[1] =
+                                    ((npy_double *)data0 + 2*3)[1] +
+                                    ((npy_double *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !1
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_double *)data_out + 2*2)[0] =
+                                    ((npy_double *)data0 + 2*2)[0] +
+                                    ((npy_double *)data_out + 2*2)[0];
+            ((npy_double *)data_out + 2*2)[1] =
+                                    ((npy_double *)data0 + 2*2)[1] +
+                                    ((npy_double *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !1
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_double *)data_out + 2*1)[0] =
+                                    ((npy_double *)data0 + 2*1)[0] +
+                                    ((npy_double *)data_out + 2*1)[0];
+            ((npy_double *)data_out + 2*1)[1] =
+                                    ((npy_double *)data0 + 2*1)[1] +
+                                    ((npy_double *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !1
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_double *)data_out + 2*0)[0] =
+                                    ((npy_double *)data0 + 2*0)[0] +
+                                    ((npy_double *)data_out + 2*0)[0];
+            ((npy_double *)data_out + 2*0)[1] =
+                                    ((npy_double *)data0 + 2*0)[1] +
+                                    ((npy_double *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !1
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_double *)data_out + 2*0)[0] =
+                                ((npy_double *)data0 + 2*0)[0] +
+                                ((npy_double *)data_out + 2*0)[0];
+        ((npy_double *)data_out + 2*0)[1] =
+                                ((npy_double *)data0 + 2*0)[1] +
+                                ((npy_double *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_double *)data_out + 2*1)[0] =
+                                ((npy_double *)data0 + 2*1)[0] +
+                                ((npy_double *)data_out + 2*1)[0];
+        ((npy_double *)data_out + 2*1)[1] =
+                                ((npy_double *)data0 + 2*1)[1] +
+                                ((npy_double *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_double *)data_out + 2*2)[0] =
+                                ((npy_double *)data0 + 2*2)[0] +
+                                ((npy_double *)data_out + 2*2)[0];
+        ((npy_double *)data_out + 2*2)[1] =
+                                ((npy_double *)data0 + 2*2)[1] +
+                                ((npy_double *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_double *)data_out + 2*3)[0] =
+                                ((npy_double *)data0 + 2*3)[0] +
+                                ((npy_double *)data_out + 2*3)[0];
+        ((npy_double *)data_out + 2*3)[1] =
+                                ((npy_double *)data0 + 2*3)[1] +
+                                ((npy_double *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_double *)data_out + 2*4)[0] =
+                                ((npy_double *)data0 + 2*4)[0] +
+                                ((npy_double *)data_out + 2*4)[0];
+        ((npy_double *)data_out + 2*4)[1] =
+                                ((npy_double *)data0 + 2*4)[1] +
+                                ((npy_double *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_double *)data_out + 2*5)[0] =
+                                ((npy_double *)data0 + 2*5)[0] +
+                                ((npy_double *)data_out + 2*5)[0];
+        ((npy_double *)data_out + 2*5)[1] =
+                                ((npy_double *)data0 + 2*5)[1] +
+                                ((npy_double *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_double *)data_out + 2*6)[0] =
+                                ((npy_double *)data0 + 2*6)[0] +
+                                ((npy_double *)data_out + 2*6)[0];
+        ((npy_double *)data_out + 2*6)[1] =
+                                ((npy_double *)data0 + 2*6)[1] +
+                                ((npy_double *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_double *)data_out + 2*7)[0] =
+                                ((npy_double *)data0 + 2*7)[0] +
+                                ((npy_double *)data_out + 2*7)[0];
+        ((npy_double *)data_out + 2*7)[1] =
+                                ((npy_double *)data0 + 2*7)[1] +
+                                ((npy_double *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 3 == 2 && !1
+
+// data_out[i] = scalar * data[i] + data_out[i] for i in [0, count); the enclosing "#elif 3 == 2 && !1" is false, so this is not compiled here.
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_muladd(npy_cdouble *data, npy_cdouble *data_out, npy_double scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_cdouble: the test is 0 here, so the vector path below is not compiled
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f64;
+    const npyv_f64 v_scalar = npyv_setall_f64(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f64 b0 = npyv_loada_f64(data + vstep * 0);
+            npyv_f64 c0 = npyv_loada_f64(data_out + vstep * 0);
+            
+#line 312
+            npyv_f64 b1 = npyv_loada_f64(data + vstep * 1);
+            npyv_f64 c1 = npyv_loada_f64(data_out + vstep * 1);
+            
+#line 312
+            npyv_f64 b2 = npyv_loada_f64(data + vstep * 2);
+            npyv_f64 c2 = npyv_loada_f64(data_out + vstep * 2);
+            
+#line 312
+            npyv_f64 b3 = npyv_loada_f64(data + vstep * 3);
+            npyv_f64 c3 = npyv_loada_f64(data_out + vstep * 3);
+            /* feed v_scalar, bN and cN to npyv_muladd_f64 */
+            #line 318
+            npyv_f64 abc0 = npyv_muladd_f64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f64 abc1 = npyv_muladd_f64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f64 abc2 = npyv_muladd_f64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f64 abc3 = npyv_muladd_f64(v_scalar, b3, c3);
+            /* write the four results back to data_out with the npyv_storea_f64 variant */
+            #line 323
+            npyv_storea_f64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* otherwise: same unrolled body with the npyv_load_f64 / npyv_store_f64 variants */
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_f64 b0 = npyv_load_f64(data + vstep * 0);
+            npyv_f64 c0 = npyv_load_f64(data_out + vstep * 0);
+            
+#line 312
+            npyv_f64 b1 = npyv_load_f64(data + vstep * 1);
+            npyv_f64 c1 = npyv_load_f64(data_out + vstep * 1);
+            
+#line 312
+            npyv_f64 b2 = npyv_load_f64(data + vstep * 2);
+            npyv_f64 c2 = npyv_load_f64(data_out + vstep * 2);
+            
+#line 312
+            npyv_f64 b3 = npyv_load_f64(data + vstep * 3);
+            npyv_f64 c3 = npyv_load_f64(data_out + vstep * 3);
+            
+            #line 318
+            npyv_f64 abc0 = npyv_muladd_f64(v_scalar, b0, c0);
+            
+#line 318
+            npyv_f64 abc1 = npyv_muladd_f64(v_scalar, b1, c1);
+            
+#line 318
+            npyv_f64 abc2 = npyv_muladd_f64(v_scalar, b2, c2);
+            
+#line 318
+            npyv_f64 abc3 = npyv_muladd_f64(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_f64(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* leftover elements: the *_till* helpers are handed the remaining count (presumably a partial load/store; confirm in the NPYV headers) */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data, count);
+        npyv_f64 b = npyv_load_tillz_f64(data_out, count);
+        npyv_store_till_f64(data_out, count, npyv_muladd_f64(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* plain C path */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_double b0 = (data[0]);
+        const npy_double c0 = (data_out[0]);
+        
+#line 340
+        const npy_double b1 = (data[1]);
+        const npy_double c1 = (data_out[1]);
+        
+#line 340
+        const npy_double b2 = (data[2]);
+        const npy_double c2 = (data_out[2]);
+        
+#line 340
+        const npy_double b3 = (data[3]);
+        const npy_double c3 = (data_out[3]);
+        /* abcN = scalar * bN + cN for the four loaded elements */
+        #line 346
+        const npy_double abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_double abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_double abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_double abc3 = scalar * b3 + c3;
+        /* store the four results */
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) {  /* one element per pass for whatever count remains */
+        const npy_double b = (*data);
+        const npy_double c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_cdouble
+}
+
+static void
+cdouble_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *data0 = (npy_cdouble *)dataptr[0];
+    npy_cdouble *data1 = (npy_cdouble *)dataptr[1];
+    npy_cdouble *data_out = (npy_cdouble *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // data_out[i] = data0[i] * data1[i] + data_out[i] for count elements. NPYV check for npy_cdouble: 0 below, so the plain C loops are used.
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_f64;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f64 a0 = npyv_loada_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_loada_f64(data1 + vstep * 0);
+            npyv_f64 c0 = npyv_loada_f64(data_out + vstep * 0);
+            
+#line 390
+            npyv_f64 a1 = npyv_loada_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_loada_f64(data1 + vstep * 1);
+            npyv_f64 c1 = npyv_loada_f64(data_out + vstep * 1);
+            
+#line 390
+            npyv_f64 a2 = npyv_loada_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_loada_f64(data1 + vstep * 2);
+            npyv_f64 c2 = npyv_loada_f64(data_out + vstep * 2);
+            
+#line 390
+            npyv_f64 a3 = npyv_loada_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_loada_f64(data1 + vstep * 3);
+            npyv_f64 c3 = npyv_loada_f64(data_out + vstep * 3);
+            /* feed aN, bN and cN to npyv_muladd_f64 */
+            #line 397
+            npyv_f64 abc0 = npyv_muladd_f64(a0, b0, c0);
+            
+#line 397
+            npyv_f64 abc1 = npyv_muladd_f64(a1, b1, c1);
+            
+#line 397
+            npyv_f64 abc2 = npyv_muladd_f64(a2, b2, c2);
+            
+#line 397
+            npyv_f64 abc3 = npyv_muladd_f64(a3, b3, c3);
+            /* write the four results back to data_out with the npyv_storea_f64 variant */
+            #line 402
+            npyv_storea_f64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* otherwise: same unrolled body with the npyv_load_f64 / npyv_store_f64 variants */
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_f64 a0 = npyv_load_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_load_f64(data1 + vstep * 0);
+            npyv_f64 c0 = npyv_load_f64(data_out + vstep * 0);
+            
+#line 390
+            npyv_f64 a1 = npyv_load_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_load_f64(data1 + vstep * 1);
+            npyv_f64 c1 = npyv_load_f64(data_out + vstep * 1);
+            
+#line 390
+            npyv_f64 a2 = npyv_load_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_load_f64(data1 + vstep * 2);
+            npyv_f64 c2 = npyv_load_f64(data_out + vstep * 2);
+            
+#line 390
+            npyv_f64 a3 = npyv_load_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_load_f64(data1 + vstep * 3);
+            npyv_f64 c3 = npyv_load_f64(data_out + vstep * 3);
+            
+            #line 397
+            npyv_f64 abc0 = npyv_muladd_f64(a0, b0, c0);
+            
+#line 397
+            npyv_f64 abc1 = npyv_muladd_f64(a1, b1, c1);
+            
+#line 397
+            npyv_f64 abc2 = npyv_muladd_f64(a2, b2, c2);
+            
+#line 397
+            npyv_f64 abc3 = npyv_muladd_f64(a3, b3, c3);
+            
+            #line 402
+            npyv_store_f64(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_f64(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* leftover elements: the *_till* helpers are handed the remaining count (presumably a partial load/store; confirm in the NPYV headers) */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data0, count);
+        npyv_f64 b = npyv_load_tillz_f64(data1, count);
+        npyv_f64 c = npyv_load_tillz_f64(data_out, count);
+        npyv_store_till_f64(data_out, count, npyv_muladd_f64(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_double a0 = (data0[0]);
+        const npy_double b0 = (data1[0]);
+        const npy_double c0 = (data_out[0]);
+        
+#line 420
+        const npy_double a1 = (data0[1]);
+        const npy_double b1 = (data1[1]);
+        const npy_double c1 = (data_out[1]);
+        
+#line 420
+        const npy_double a2 = (data0[2]);
+        const npy_double b2 = (data1[2]);
+        const npy_double c2 = (data_out[2]);
+        
+#line 420
+        const npy_double a3 = (data0[3]);
+        const npy_double b3 = (data1[3]);
+        const npy_double c3 = (data_out[3]);
+        /* abcN = aN * bN + cN for the four loaded elements */
+        #line 427
+        const npy_double abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_double abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_double abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_double abc3 = a3 * b3 + c3;
+        /* store the four results */
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {  /* one element per pass for whatever count remains */
+        const npy_double a = (*data0);
+        const npy_double b = (*data1);
+        const npy_double c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_cdouble
+
+}
+
+/* Some extra specializations for the two operand case; this one passes the single value at dataptr[0] as the muladd scalar */
+static void
+cdouble_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *out_vec = (npy_cdouble *)dataptr[2];
+    npy_cdouble *in_vec = (npy_cdouble *)dataptr[1];
+    /* read once up front; cdouble_sum_of_products_muladd applies it to every element */
+    npy_double factor = (*(npy_cdouble *)dataptr[0]);
+
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    cdouble_sum_of_products_muladd(in_vec, out_vec, factor, count);
+}
+
+static void
+cdouble_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* The single value at dataptr[1] becomes the scalar handed to cdouble_sum_of_products_muladd */
+    npy_cdouble *in_vec = (npy_cdouble *)dataptr[0];
+    npy_cdouble *out_vec = (npy_cdouble *)dataptr[2];
+    npy_double factor = (*(npy_cdouble *)dataptr[1]);
+
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_stride0_outcontig_two (%d)\n", (int)count);
+    cdouble_sum_of_products_muladd(in_vec, out_vec, factor, count);
+}
+
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *data0 = (npy_cdouble *)dataptr[0];
+    npy_cdouble *data1 = (npy_cdouble *)dataptr[1];
+    npy_double accum = 0;
+    /* Accumulates data0[i] * data1[i] over count elements, then adds the total to the value at dataptr[2] */
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_cdouble: the test is 0 here, so the vector path below is not compiled
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_f64;
+    npyv_f64 v_accum = npyv_zero_f64();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f64 a0 = npyv_loada_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_loada_f64(data1 + vstep * 0);
+            
+#line 501
+            npyv_f64 a1 = npyv_loada_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_loada_f64(data1 + vstep * 1);
+            
+#line 501
+            npyv_f64 a2 = npyv_loada_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_loada_f64(data1 + vstep * 2);
+            
+#line 501
+            npyv_f64 a3 = npyv_loada_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_loada_f64(data1 + vstep * 3);
+            /* the four muladds are chained: each takes the previous result as its addend, ending in v_accum */
+            npyv_f64 ab3 = npyv_muladd_f64(a3, b3, v_accum);
+            npyv_f64 ab2 = npyv_muladd_f64(a2, b2, ab3);
+            npyv_f64 ab1 = npyv_muladd_f64(a1, b1, ab2);
+                   v_accum = npyv_muladd_f64(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_f64 a0 = npyv_load_f64(data0 + vstep * 0);
+            npyv_f64 b0 = npyv_load_f64(data1 + vstep * 0);
+            
+#line 501
+            npyv_f64 a1 = npyv_load_f64(data0 + vstep * 1);
+            npyv_f64 b1 = npyv_load_f64(data1 + vstep * 1);
+            
+#line 501
+            npyv_f64 a2 = npyv_load_f64(data0 + vstep * 2);
+            npyv_f64 b2 = npyv_load_f64(data1 + vstep * 2);
+            
+#line 501
+            npyv_f64 a3 = npyv_load_f64(data0 + vstep * 3);
+            npyv_f64 b3 = npyv_load_f64(data1 + vstep * 3);
+            
+            npyv_f64 ab3 = npyv_muladd_f64(a3, b3, v_accum);
+            npyv_f64 ab2 = npyv_muladd_f64(a2, b2, ab3);
+            npyv_f64 ab1 = npyv_muladd_f64(a1, b1, ab2);
+                   v_accum = npyv_muladd_f64(a0, b0, ab1);
+        }
+    }
+    /* leftover elements: npyv_load_tillz_f64 is handed the remaining count (presumably a zero-filled partial load; confirm in the NPYV headers) */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_f64 a = npyv_load_tillz_f64(data0, count);
+        npyv_f64 b = npyv_load_tillz_f64(data1, count);
+        v_accum = npyv_muladd_f64(a, b, v_accum);
+    }
+    accum = npyv_sum_f64(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_double ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_double ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_double ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_double ab3 = (data0[3]) * (data1[3]);
+        /* the four products are summed together before being added to accum */
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) {  /* one element per pass for whatever count remains */
+        const npy_double a = (*data0);
+        const npy_double b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_cdouble
+    *(npy_cdouble *)dataptr[2] = ((*(npy_cdouble *)dataptr[2]) + accum);
+}
+
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* element at dataptr[2] += value at dataptr[0] * cdouble_sum_of_arr(array at dataptr[1], count) */
+    const npy_double scale = (*(npy_cdouble *)dataptr[0]);
+    const npy_double total = cdouble_sum_of_arr((npy_cdouble *)dataptr[1], count);
+    *(npy_cdouble *)dataptr[2] = ((*(npy_cdouble *)dataptr[2]) + scale * total);
+}
+
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* element at dataptr[2] += value at dataptr[1] * cdouble_sum_of_arr(array at dataptr[0], count) */
+    const npy_double scale = (*(npy_cdouble *)dataptr[1]);
+    const npy_double total = cdouble_sum_of_arr((npy_cdouble *)dataptr[0], count);
+    *(npy_cdouble *)dataptr[2] = ((*(npy_cdouble *)dataptr[2]) + scale * total);
+}
+
+#elif 3 == 3 && !1
+
+static void
+cdouble_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *data0 = (npy_cdouble *)dataptr[0];
+    npy_cdouble *data1 = (npy_cdouble *)dataptr[1];
+    npy_cdouble *data2 = (npy_cdouble *)dataptr[2];
+    npy_cdouble *data_out = (npy_cdouble *)dataptr[3];
+    /* data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]; the enclosing "#elif 3 == 3 && !1" is false, so this is not compiled here */
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    /* count is below 8 after the loop above; each step returns once count hits 0, else handles one more element */
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 3 > 3 || @complex */
+
+static void
+cdouble_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_three (%d)\n", (int)count);
+
+    /* Per pass: out += op0 * op1 * op2 as complex numbers, then every pointer moves on by one npy_cdouble */
+    while (count--) {
+#if !1
+        int i;
+        npy_double temp = (*(npy_cdouble *)dataptr[0]);
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cdouble *)dataptr[i]);
+        }
+        *(npy_cdouble *)dataptr[nop] = (temp +
+                                           (*(npy_cdouble *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_cdouble);
+        }
+#else /* complex */
+#  if 3 <= 3
+#    define _SUMPROD_NOP 3
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_double *out = (npy_double *)dataptr[_SUMPROD_NOP];
+        npy_double prod_re, prod_im;
+        int op;
+        /* seed the running product with operand 0 (index 0 = real part, index 1 = imaginary part) */
+        prod_re = ((npy_double *)dataptr[0])[0];
+        prod_im = ((npy_double *)dataptr[0])[1];
+        for (op = 1; op < _SUMPROD_NOP; ++op) {
+            const npy_double *rhs = (npy_double *)dataptr[op];
+            const npy_double next_re = prod_re * rhs[0] - prod_im * rhs[1];
+            prod_im = prod_re * rhs[1] + prod_im * rhs[0];
+            prod_re = next_re;
+        }
+        /* accumulate the product into the output element */
+        out[0] = prod_re + out[0];
+        out[1] = prod_im + out[1];
+
+        for (op = 0; op <= _SUMPROD_NOP; ++op) {
+            dataptr[op] += sizeof(npy_cdouble);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 3 */
+
+#if 3 == 1
+
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+    /* Sums count interleaved (re, im) pairs read from dataptr[0] and adds the totals to the pair at dataptr[1] */
+#if !1
+    npy_double accum = cdouble_sum_of_arr((npy_cdouble *)dataptr[0], count);
+    *((npy_cdouble *)dataptr[1]) = (accum + (*((npy_cdouble *)dataptr[1])));
+#else
+    npy_double *src = (npy_double *)dataptr[0], *dst = (npy_double *)dataptr[1];
+    npy_double sum_re = 0, sum_im = 0;
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, src += 4*2) {
+        const npy_double re_lo = src[0] + src[2];
+        const npy_double re_hi = src[4] + src[6];
+        const npy_double im_lo = src[1] + src[3];
+        const npy_double im_hi = src[5] + src[7];
+        sum_re += re_lo + re_hi;
+        sum_im += im_lo + im_hi;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, src += 2) {
+        sum_re += src[0];
+        sum_im += src[1];
+    }
+    dst[0] += sum_re;
+    dst[1] += sum_im;
+#endif // !1
+}
+
+#endif /* 3 == 1 */
+
+static void
+cdouble_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 1
+    npy_double accum_re = 0, accum_im = 0;
+#else
+    npy_double accum = 0;
+#endif
+    /* Sums the complex product of the three operands over count steps, then adds that sum to the element at dataptr[3] */
+#if (3 == 1) || (3 <= 3 && !1)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !1
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !1
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+    /* all three tests above are false in this expansion, so no data0/data1/data2 locals exist; the loop works on dataptr[] directly */
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+    /* only the "complex" arm with _SUMPROD_NOP defined as 3 is compiled inside this loop */
+    while (count--) {
+#if !1
+#  if 3 == 1
+        accum += (*(npy_cdouble *)data0);
+        data0 += stride0;
+#  elif 3 == 2
+        accum += (*(npy_cdouble *)data0) *
+                 (*(npy_cdouble *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 3 == 3
+        accum += (*(npy_cdouble *)data0) *
+                 (*(npy_cdouble *)data1) *
+                 (*(npy_cdouble *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_double temp = (*(npy_cdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cdouble *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        accum_re += ((npy_double *)data0)[0];
+        accum_im += ((npy_double *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_double re, im, tmp;
+        int i;
+        re = ((npy_double *)dataptr[0])[0];
+        im = ((npy_double *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_double *)dataptr[i])[0] -
+                  im * ((npy_double *)dataptr[i])[1];
+            im = re * ((npy_double *)dataptr[i])[1] +
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;  /* tmp held the new real part so the old re could still be used for im above */
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {  /* input pointers only: index _SUMPROD_NOP (the output) is not advanced */
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* fold the accumulated real and imaginary sums into the output element */
+#if 1
+#  if 3 <= 3
+    ((npy_double *)dataptr[3])[0] += accum_re;
+    ((npy_double *)dataptr[3])[1] += accum_im;
+#  else
+    ((npy_double *)dataptr[nop])[0] += accum_re;
+    ((npy_double *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 3 <= 3
+    *((npy_cdouble *)dataptr[3]) = (accum +
+                                    (*((npy_cdouble *)dataptr[3])));
+#  else
+    *((npy_cdouble *)dataptr[nop]) = (accum +
+                                    (*((npy_cdouble *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+/*
+ * Generic strided sum-of-products kernel for complex doubles with an
+ * arbitrary operand count `nop`.
+ *
+ * Per iteration, the complex values at dataptr[0] .. dataptr[nop-1] are
+ * multiplied together and the product is added to the complex value at
+ * dataptr[nop]; then every pointer, output included, is advanced by its
+ * entry in `strides`.
+ *
+ * nop      number of input operands; the output is operand index `nop`
+ * dataptr  nop+1 byte pointers (inputs, then output); advanced in place
+ * strides  nop+1 per-operand strides, added to the char pointers
+ * count    number of iterations to run
+ *
+ * The expansion's operand-count selector is the literal 1000 and its
+ * complex flag is the literal 1, so the specialised 1/2/3-operand arms and
+ * the real-valued arms could never be compiled; only this generic complex
+ * path is kept.
+ */
+static void
+cdouble_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_any (%d)\n", (int)count);
+
+    while (count--) {
+        /* Each complex value is read as a (real, imaginary) pair of doubles. */
+        const npy_double *operand = (const npy_double *)dataptr[0];
+        npy_double *out = (npy_double *)dataptr[nop];
+        npy_double re = operand[0];
+        npy_double im = operand[1];
+        int i;
+
+        /* Running complex product: (re + i*im) *= operand. */
+        for (i = 1; i < nop; ++i) {
+            npy_double tmp;
+
+            operand = (const npy_double *)dataptr[i];
+            /* tmp holds the new real part until im no longer needs the old one */
+            tmp = re * operand[0] -
+                  im * operand[1];
+            im = re * operand[1] +
+                 im * operand[0];
+            re = tmp;
+        }
+
+        /* Accumulate the product into the output element. */
+        out[0] = re + out[0];
+        out[1] = im + out[1];
+
+        /* Step all nop inputs and the output to their next element. */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+    }
+}
+
+#if 1000 == 1
+
+/*
+ * One-operand contiguous kernel: adds `count` consecutive complex values
+ * read through dataptr[0] onto the values at dataptr[1], element by element.
+ * Runs shorter than eight elements are finished by the fall-through switch;
+ * longer runs go through the 8-way unrolled loop first and then jump back
+ * to the switch for the remainder.
+ */
+static void
+cdouble_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *src = (npy_cdouble *)dataptr[0];
+    npy_cdouble *dst = (npy_cdouble *)dataptr[1];
+
+/* Add element K of src onto element K of dst. */
+#if !1
+#define CDOUBLE_CONTIG_ONE_ADD(K)                                       \
+    dst[K] = ((src[K]) +                                                \
+                    (dst[K]))
+#else /* complex: real and imaginary parts are updated one at a time */
+#define CDOUBLE_CONTIG_ONE_ADD(K)                                       \
+    do {                                                                \
+        ((npy_double *)dst + 2*K)[0] =                                  \
+                                ((npy_double *)src + 2*K)[0] +          \
+                                ((npy_double *)dst + 2*K)[0];           \
+        ((npy_double *)dst + 2*K)[1] =                                  \
+                                ((npy_double *)src + 2*K)[1] +          \
+                                ((npy_double *)dst + 2*K)[1];           \
+    } while (0)
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* The remainder handling comes first so that small counts exit quickly */
+handle_remainder:
+    switch (count) {
+        /* every case deliberately falls through to the one below it */
+        case 7:
+            CDOUBLE_CONTIG_ONE_ADD(6);
+        case 6:
+            CDOUBLE_CONTIG_ONE_ADD(5);
+        case 5:
+            CDOUBLE_CONTIG_ONE_ADD(4);
+        case 4:
+            CDOUBLE_CONTIG_ONE_ADD(3);
+        case 3:
+            CDOUBLE_CONTIG_ONE_ADD(2);
+        case 2:
+            CDOUBLE_CONTIG_ONE_ADD(1);
+        case 1:
+            CDOUBLE_CONTIG_ONE_ADD(0);
+        case 0:
+            return;
+    }
+
+    /* Eight elements per pass */
+    while (count >= 8) {
+        count -= 8;
+
+        CDOUBLE_CONTIG_ONE_ADD(0);
+        CDOUBLE_CONTIG_ONE_ADD(1);
+        CDOUBLE_CONTIG_ONE_ADD(2);
+        CDOUBLE_CONTIG_ONE_ADD(3);
+        CDOUBLE_CONTIG_ONE_ADD(4);
+        CDOUBLE_CONTIG_ONE_ADD(5);
+        CDOUBLE_CONTIG_ONE_ADD(6);
+        CDOUBLE_CONTIG_ONE_ADD(7);
+
+        src += 8;
+        dst += 8;
+    }
+
+    /* Fewer than eight elements are left: let the switch finish them */
+    goto handle_remainder;
+
+#undef CDOUBLE_CONTIG_ONE_ADD
+}
+
+#elif 1000 == 2 && !1
+
+// multiply-add over a contiguous run: data_out[i] = scalar * data[i] + data_out[i]
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_muladd(npy_cdouble *data, npy_cdouble *data_out, npy_double scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_cdouble
+    const int lanes = npyv_nlanes_f64;
+    const npy_intp lanes_x4 = lanes * 4;
+    const npyv_f64 v_scalar = npyv_setall_f64(scalar);
+
+/*
+ * Four vectors per pass: all eight loads happen before the four fused
+ * multiply-adds, and all four stores come last. LD/ST select the aligned
+ * or unaligned intrinsics.
+ */
+#define CDOUBLE_MULADD_X4(LD, ST)                                          \
+    for (; count >= lanes_x4;                                              \
+           count -= lanes_x4, data += lanes_x4, data_out += lanes_x4) {    \
+        npyv_f64 in0 = LD(data + lanes * 0);                               \
+        npyv_f64 old0 = LD(data_out + lanes * 0);                          \
+        npyv_f64 in1 = LD(data + lanes * 1);                               \
+        npyv_f64 old1 = LD(data_out + lanes * 1);                          \
+        npyv_f64 in2 = LD(data + lanes * 2);                               \
+        npyv_f64 old2 = LD(data_out + lanes * 2);                          \
+        npyv_f64 in3 = LD(data + lanes * 3);                               \
+        npyv_f64 old3 = LD(data_out + lanes * 3);                          \
+        npyv_f64 new0 = npyv_muladd_f64(v_scalar, in0, old0);              \
+        npyv_f64 new1 = npyv_muladd_f64(v_scalar, in1, old1);              \
+        npyv_f64 new2 = npyv_muladd_f64(v_scalar, in2, old2);              \
+        npyv_f64 new3 = npyv_muladd_f64(v_scalar, in3, old3);              \
+        ST(data_out + lanes * 0, new0);                                    \
+        ST(data_out + lanes * 1, new1);                                    \
+        ST(data_out + lanes * 2, new2);                                    \
+        ST(data_out + lanes * 3, new3);                                    \
+    }
+
+    /* Aligned loads/stores are used only when both buffers qualify */
+    if (EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out)) {
+        CDOUBLE_MULADD_X4(npyv_loada_f64, npyv_storea_f64)
+    }
+    else {
+        CDOUBLE_MULADD_X4(npyv_load_f64, npyv_store_f64)
+    }
+#undef CDOUBLE_MULADD_X4
+
+    /* Leftover elements, one partially filled vector at a time */
+    while (count > 0) {
+        npyv_f64 in = npyv_load_tillz_f64(data, count);
+        npyv_f64 old = npyv_load_tillz_f64(data_out, count);
+        npyv_store_till_f64(data_out, count, npyv_muladd_f64(in, v_scalar, old));
+        count -= lanes;
+        data += lanes;
+        data_out += lanes;
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four elements per pass; every load precedes the first store */
+    while (count >= 4) {
+        const npy_double in0 = (data[0]);
+        const npy_double old0 = (data_out[0]);
+        const npy_double in1 = (data[1]);
+        const npy_double old1 = (data_out[1]);
+        const npy_double in2 = (data[2]);
+        const npy_double old2 = (data_out[2]);
+        const npy_double in3 = (data[3]);
+        const npy_double old3 = (data_out[3]);
+
+        data_out[0] = (scalar * in0 + old0);
+        data_out[1] = (scalar * in1 + old1);
+        data_out[2] = (scalar * in2 + old2);
+        data_out[3] = (scalar * in3 + old3);
+
+        count -= 4;
+        data += 4;
+        data_out += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Whatever is left, one element at a time */
+    while (count > 0) {
+        const npy_double in = (*data);
+        const npy_double old = (*data_out);
+        *data_out = (scalar * in + old);
+        --count;
+        ++data;
+        ++data_out;
+    }
+#endif // NPYV check for npy_cdouble
+}
+
+/*
+ * Two-operand contiguous kernel: for each of `count` consecutive elements,
+ * out[i] = lhs[i] * rhs[i] + out[i], with lhs, rhs and out taken from
+ * dataptr[0], dataptr[1] and dataptr[2].
+ */
+static void
+cdouble_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *lhs = (npy_cdouble *)dataptr[0];
+    npy_cdouble *rhs = (npy_cdouble *)dataptr[1];
+    npy_cdouble *out = (npy_cdouble *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_cdouble
+#if 0
+    const int lanes = npyv_nlanes_f64;
+    const npy_intp lanes_x4 = lanes * 4;
+
+/*
+ * Four vectors per pass: twelve loads, then four fused multiply-adds, then
+ * four stores. LD/ST select the aligned or unaligned intrinsics.
+ */
+#define CDOUBLE_CONTIG_TWO_X4(LD, ST)                                      \
+    for (; count >= lanes_x4; count -= lanes_x4, lhs += lanes_x4,          \
+                              rhs += lanes_x4, out += lanes_x4) {          \
+        npyv_f64 x0 = LD(lhs + lanes * 0);                                 \
+        npyv_f64 y0 = LD(rhs + lanes * 0);                                 \
+        npyv_f64 z0 = LD(out + lanes * 0);                                 \
+        npyv_f64 x1 = LD(lhs + lanes * 1);                                 \
+        npyv_f64 y1 = LD(rhs + lanes * 1);                                 \
+        npyv_f64 z1 = LD(out + lanes * 1);                                 \
+        npyv_f64 x2 = LD(lhs + lanes * 2);                                 \
+        npyv_f64 y2 = LD(rhs + lanes * 2);                                 \
+        npyv_f64 z2 = LD(out + lanes * 2);                                 \
+        npyv_f64 x3 = LD(lhs + lanes * 3);                                 \
+        npyv_f64 y3 = LD(rhs + lanes * 3);                                 \
+        npyv_f64 z3 = LD(out + lanes * 3);                                 \
+        npyv_f64 r0 = npyv_muladd_f64(x0, y0, z0);                         \
+        npyv_f64 r1 = npyv_muladd_f64(x1, y1, z1);                         \
+        npyv_f64 r2 = npyv_muladd_f64(x2, y2, z2);                         \
+        npyv_f64 r3 = npyv_muladd_f64(x3, y3, z3);                         \
+        ST(out + lanes * 0, r0);                                           \
+        ST(out + lanes * 1, r1);                                           \
+        ST(out + lanes * 2, r2);                                           \
+        ST(out + lanes * 3, r3);                                           \
+    }
+
+    /* Aligned loads/stores are used only when all three buffers qualify */
+    if (EINSUM_IS_ALIGNED(lhs) && EINSUM_IS_ALIGNED(rhs) &&
+                        EINSUM_IS_ALIGNED(out)) {
+        CDOUBLE_CONTIG_TWO_X4(npyv_loada_f64, npyv_storea_f64)
+    }
+    else {
+        CDOUBLE_CONTIG_TWO_X4(npyv_load_f64, npyv_store_f64)
+    }
+#undef CDOUBLE_CONTIG_TWO_X4
+
+    /* Leftover elements, one partially filled vector at a time */
+    while (count > 0) {
+        npyv_f64 x = npyv_load_tillz_f64(lhs, count);
+        npyv_f64 y = npyv_load_tillz_f64(rhs, count);
+        npyv_f64 z = npyv_load_tillz_f64(out, count);
+        npyv_store_till_f64(out, count, npyv_muladd_f64(x, y, z));
+        count -= lanes;
+        lhs += lanes;
+        rhs += lanes;
+        out += lanes;
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four elements per pass; every load precedes the first store */
+    while (count >= 4) {
+        const npy_double x0 = (lhs[0]);
+        const npy_double y0 = (rhs[0]);
+        const npy_double z0 = (out[0]);
+        const npy_double x1 = (lhs[1]);
+        const npy_double y1 = (rhs[1]);
+        const npy_double z1 = (out[1]);
+        const npy_double x2 = (lhs[2]);
+        const npy_double y2 = (rhs[2]);
+        const npy_double z2 = (out[2]);
+        const npy_double x3 = (lhs[3]);
+        const npy_double y3 = (rhs[3]);
+        const npy_double z3 = (out[3]);
+
+        out[0] = (x0 * y0 + z0);
+        out[1] = (x1 * y1 + z1);
+        out[2] = (x2 * y2 + z2);
+        out[3] = (x3 * y3 + z3);
+
+        count -= 4;
+        lhs += 4;
+        rhs += 4;
+        out += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Whatever is left, one element at a time */
+    while (count > 0) {
+        const npy_double x = (*lhs);
+        const npy_double y = (*rhs);
+        const npy_double z = (*out);
+        *out = (x * y + z);
+        --count;
+        ++lhs;
+        ++rhs;
+        ++out;
+    }
+#endif // NPYV check for npy_cdouble
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand case where operand 0 is a single value read once from
+ * dataptr[0]: hands the run at dataptr[1], the output run at dataptr[2],
+ * that value and `count` to cdouble_sum_of_products_muladd.
+ */
+static void
+cdouble_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double factor = (*(npy_cdouble *)dataptr[0]);
+    npy_cdouble *run = (npy_cdouble *)dataptr[1];
+    npy_cdouble *out = (npy_cdouble *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+
+    cdouble_sum_of_products_muladd(run, out, factor, count);
+}
+
+/*
+ * Two-operand case where operand 1 is a single value read once from
+ * dataptr[1]: hands the run at dataptr[0], the output run at dataptr[2],
+ * that value and `count` to cdouble_sum_of_products_muladd.
+ */
+static void
+cdouble_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_double factor = (*(npy_cdouble *)dataptr[1]);
+    npy_cdouble *run = (npy_cdouble *)dataptr[0];
+    npy_cdouble *out = (npy_cdouble *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+
+    cdouble_sum_of_products_muladd(run, out, factor, count);
+}
+
+/*
+ * Two contiguous operands reduced into one output value: sums
+ * lhs[i] * rhs[i] over `count` elements (lhs from dataptr[0], rhs from
+ * dataptr[1]) and adds the total onto the single value at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *lhs = (npy_cdouble *)dataptr[0];
+    npy_cdouble *rhs = (npy_cdouble *)dataptr[1];
+    npy_double total = 0;
+
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_cdouble
+    const int lanes = npyv_nlanes_f64;
+    const npy_intp lanes_x4 = lanes * 4;
+    npyv_f64 v_total = npyv_zero_f64();
+
+/*
+ * Four vector pairs per pass. The fused multiply-adds are chained from
+ * pair 3 down to pair 0, each one taking the previous result as addend.
+ * LD selects the aligned or unaligned load intrinsic.
+ */
+#define CDOUBLE_DOT_X4(LD)                                                 \
+    for (; count >= lanes_x4;                                              \
+           count -= lanes_x4, lhs += lanes_x4, rhs += lanes_x4) {          \
+        npyv_f64 x0 = LD(lhs + lanes * 0);                                 \
+        npyv_f64 y0 = LD(rhs + lanes * 0);                                 \
+        npyv_f64 x1 = LD(lhs + lanes * 1);                                 \
+        npyv_f64 y1 = LD(rhs + lanes * 1);                                 \
+        npyv_f64 x2 = LD(lhs + lanes * 2);                                 \
+        npyv_f64 y2 = LD(rhs + lanes * 2);                                 \
+        npyv_f64 x3 = LD(lhs + lanes * 3);                                 \
+        npyv_f64 y3 = LD(rhs + lanes * 3);                                 \
+        v_total = npyv_muladd_f64(x0, y0,                                  \
+                  npyv_muladd_f64(x1, y1,                                  \
+                  npyv_muladd_f64(x2, y2,                                  \
+                  npyv_muladd_f64(x3, y3, v_total))));                     \
+    }
+
+    /* Aligned loads are used only when both inputs qualify */
+    if (EINSUM_IS_ALIGNED(lhs) && EINSUM_IS_ALIGNED(rhs)) {
+        CDOUBLE_DOT_X4(npyv_loada_f64)
+    }
+    else {
+        CDOUBLE_DOT_X4(npyv_load_f64)
+    }
+#undef CDOUBLE_DOT_X4
+
+    /* Leftover elements, one partially filled vector at a time */
+    while (count > 0) {
+        npyv_f64 x = npyv_load_tillz_f64(lhs, count);
+        npyv_f64 y = npyv_load_tillz_f64(rhs, count);
+        v_total = npyv_muladd_f64(x, y, v_total);
+        count -= lanes;
+        lhs += lanes;
+        rhs += lanes;
+    }
+    total = npyv_sum_f64(v_total);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four products per pass, summed left to right before accumulating */
+    while (count >= 4) {
+        const npy_double p0 = (lhs[0]) * (rhs[0]);
+        const npy_double p1 = (lhs[1]) * (rhs[1]);
+        const npy_double p2 = (lhs[2]) * (rhs[2]);
+        const npy_double p3 = (lhs[3]) * (rhs[3]);
+
+        total += p0 + p1 + p2 + p3;
+
+        count -= 4;
+        lhs += 4;
+        rhs += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Whatever is left, one product at a time */
+    while (count > 0) {
+        const npy_double x = (*lhs);
+        const npy_double y = (*rhs);
+        total += x * y;
+        --count;
+        ++lhs;
+        ++rhs;
+    }
+#endif // NPYV check for npy_cdouble
+    /* Fold the reduction into the existing output value */
+    *(npy_cdouble *)dataptr[2] = ((*(npy_cdouble *)dataptr[2]) + total);
+}
+
+/*
+ * Operand 0 is a single value and the output is a single value: passes the
+ * run at dataptr[1] and `count` to cdouble_sum_of_arr, multiplies the value
+ * read from dataptr[0] by the result, and adds that onto the value at
+ * dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *run = (npy_cdouble *)dataptr[1];
+    npy_double factor = (*(npy_cdouble *)dataptr[0]);
+    npy_double run_total = cdouble_sum_of_arr(run, count);
+
+    *(npy_cdouble *)dataptr[2] = ((*(npy_cdouble *)dataptr[2]) + factor * run_total);
+}
+
+/*
+ * Operand 1 is a single value and the output is a single value: passes the
+ * run at dataptr[0] and `count` to cdouble_sum_of_arr, multiplies the value
+ * read from dataptr[1] by the result, and adds that onto the value at
+ * dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *run = (npy_cdouble *)dataptr[0];
+    npy_double factor = (*(npy_cdouble *)dataptr[1]);
+    npy_double run_total = cdouble_sum_of_arr(run, count);
+
+    *(npy_cdouble *)dataptr[2] = ((*(npy_cdouble *)dataptr[2]) + factor * run_total);
+}
+
+#elif 1000 == 3 && !1
+
+/*
+ * Three-operand contiguous kernel: for each of `count` consecutive elements,
+ * out[i] = a[i] * b[i] * c[i] + out[i], with a, b, c and out taken from
+ * dataptr[0] .. dataptr[3].
+ */
+static void
+cdouble_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_cdouble *a = (npy_cdouble *)dataptr[0];
+    npy_cdouble *b = (npy_cdouble *)dataptr[1];
+    npy_cdouble *c = (npy_cdouble *)dataptr[2];
+    npy_cdouble *out = (npy_cdouble *)dataptr[3];
+    int k;
+
+/* Multiply-accumulate for element K. */
+#define CDOUBLE_CONTIG_THREE_STEP(K)                                    \
+    out[K] = ((a[K]) *                                                  \
+                    (b[K]) *                                            \
+                    (c[K]) +                                            \
+                    (out[K]))
+
+    /* Eight elements per pass */
+    while (count >= 8) {
+        count -= 8;
+
+        CDOUBLE_CONTIG_THREE_STEP(0);
+        CDOUBLE_CONTIG_THREE_STEP(1);
+        CDOUBLE_CONTIG_THREE_STEP(2);
+        CDOUBLE_CONTIG_THREE_STEP(3);
+        CDOUBLE_CONTIG_THREE_STEP(4);
+        CDOUBLE_CONTIG_THREE_STEP(5);
+        CDOUBLE_CONTIG_THREE_STEP(6);
+        CDOUBLE_CONTIG_THREE_STEP(7);
+
+        a += 8;
+        b += 8;
+        c += 8;
+        out += 8;
+    }
+
+    /*
+     * Remainder: up to eight guarded steps in ascending order, leaving as
+     * soon as the count is used up.
+     */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        CDOUBLE_CONTIG_THREE_STEP(k);
+    }
+
+#undef CDOUBLE_CONTIG_THREE_STEP
+}
+
+#else /* 1000 > 3 || @complex */
+
+static void
+cdouble_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+    /* Per element: dataptr[nop] += complex product of dataptr[0] .. dataptr[nop-1]. */
+    while (count--) {
+#if !1 /* real-valued variant; the condition is constant-false, so this is compiled out */
+        npy_double temp = (*(npy_cdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cdouble *)dataptr[i]);
+        }
+        *(npy_cdouble *)dataptr[nop] = (temp +
+                                           (*(npy_cdouble *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_cdouble);
+        }
+#else /* complex */
+#  if 1000 <= 3 /* constant-false, so _SUMPROD_NOP expands to the run-time `nop` */
+#    define _SUMPROD_NOP 1000
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_double re, im, tmp; /* (re, im) is the running product */
+        int i;
+        re = ((npy_double *)dataptr[0])[0]; /* operands are read as [re, im] double pairs */
+        im = ((npy_double *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) { /* multiply the running product by operand i */
+            tmp = re * ((npy_double *)dataptr[i])[0] - /* new real part, parked in tmp ... */
+                  im * ((npy_double *)dataptr[i])[1];
+            im = re * ((npy_double *)dataptr[i])[1] + /* ... because this still needs the old re */
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_double *)dataptr[_SUMPROD_NOP])[0] = re + /* accumulate into the output slot */
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_double *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_double *)dataptr[_SUMPROD_NOP])[1];
+        /* Advance all inputs and the output (note `<=`) by one npy_cdouble; dataptr is mutated. */
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += sizeof(npy_cdouble);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+
+static NPY_GCC_OPT_3 void
+cdouble_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{ /* Sums `count` contiguous complex values from dataptr[0] and adds the total to dataptr[1]. */
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !1 /* real-valued variant; the condition is constant-false, so this is compiled out */
+    npy_cdouble *data = (npy_cdouble *)dataptr[0];
+    npy_double accum = cdouble_sum_of_arr(data, count);
+    *((npy_cdouble *)dataptr[1]) = (accum + (*((npy_cdouble *)dataptr[1])));
+#else
+    npy_double accum_re = 0, accum_im = 0; /* real and imaginary sums are kept separately */
+    npy_double *data0 = (npy_double *)dataptr[0]; /* viewed as doubles: even slot = re, odd = im */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) { /* 4 complex values (8 doubles) per pass */
+        const npy_double re01 = data0[0] + data0[2]; /* real parts of elements 0 and 1 */
+        const npy_double re23 = data0[4] + data0[6]; /* real parts of elements 2 and 3 */
+        const npy_double im13 = data0[1] + data0[3]; /* imaginary parts of elements 0 and 1 (slots 1, 3) */
+        const npy_double im57 = data0[5] + data0[7]; /* imaginary parts of elements 2 and 3 (slots 5, 7) */
+        accum_re += re01 + re23; /* pairwise partial sums are combined before touching the accumulator */
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) { /* tail: whatever the unrolled loop left (at most 4) */
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_double *)dataptr[1])[0] += accum_re; /* single write-back to the output element */
+    ((npy_double *)dataptr[1])[1] += accum_im;
+#endif // !1
+}
+
+#endif /* 1000 == 1 */
+
+static void
+cdouble_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 1 /* complex variant: separate real and imaginary accumulators */
+    npy_double accum_re = 0, accum_im = 0;
+#else
+    npy_double accum = 0;
+#endif
+    /* The three guards below are constant-false here, so no data0/data1/data2 locals exist. */
+#if (1000 == 1) || (1000 <= 3 && !1)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1000 == 2 || 1000 == 3) && !1
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1000 == 3) && !1
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("cdouble_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+    /* Sum, over `count` steps, the complex product of the nop strided input operands. */
+    while (count--) {
+#if !1 /* real-valued variants; constant-false, so everything up to `#else` is compiled out */
+#  if 1000 == 1
+        accum += (*(npy_cdouble *)data0);
+        data0 += stride0;
+#  elif 1000 == 2
+        accum += (*(npy_cdouble *)data0) *
+                 (*(npy_cdouble *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1000 == 3
+        accum += (*(npy_cdouble *)data0) *
+                 (*(npy_cdouble *)data1) *
+                 (*(npy_cdouble *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_double temp = (*(npy_cdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_cdouble *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1000 == 1 /* constant-false: the generic product loop in the `#else` below is used */
+        accum_re += ((npy_double *)data0)[0];
+        accum_im += ((npy_double *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1000 <= 3 /* constant-false, so _SUMPROD_NOP expands to the run-time `nop` */
+#define _SUMPROD_NOP 1000
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_double re, im, tmp; /* (re, im) is the running product for this step */
+        int i;
+        re = ((npy_double *)dataptr[0])[0]; /* operands are read as [re, im] double pairs */
+        im = ((npy_double *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) { /* multiply the running product by operand i */
+            tmp = re * ((npy_double *)dataptr[i])[0] - /* new real part, parked in tmp ... */
+                  im * ((npy_double *)dataptr[i])[1];
+            im = re * ((npy_double *)dataptr[i])[1] + /* ... because this still needs the old re */
+                 im * ((npy_double *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) { /* `<`, not `<=`: only inputs move, the output stays put */
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+    /* Fold the accumulated sum into the single output element after the loop. */
+#if 1
+#  if 1000 <= 3 /* constant-false: the output operand is dataptr[nop] */
+    ((npy_double *)dataptr[1000])[0] += accum_re;
+    ((npy_double *)dataptr[1000])[1] += accum_im;
+#  else
+    ((npy_double *)dataptr[nop])[0] += accum_re;
+    ((npy_double *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1000 <= 3
+    *((npy_cdouble *)dataptr[1000]) = (accum +
+                                    (*((npy_cdouble *)dataptr[1000])));
+#  else
+    *((npy_cdouble *)dataptr[nop]) = (accum +
+                                    (*((npy_cdouble *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+
+
+#line 74
+
+#if !1
+static NPY_GCC_OPT_3 npy_longdouble clongdouble_sum_of_arr(npy_clongdouble *data, npy_intp count)
+{ /* Returns the sum of `count` elements of `data`; the enclosing `#if !1` compiles this helper out. */
+    npy_longdouble accum = 0; /* NOTE(review): scalar accumulator for npy_clongdouble data; moot while compiled out */
+#if 0 // NPYV check for npy_clongdouble (constant-false: the vector branch below is compiled out)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data);
+    const int vstep = npyv_nlanes_clongdouble;
+    npyv_clongdouble v_accum = npyv_zero_clongdouble();
+    const npy_intp vstepx4 = vstep * 4;
+    /* Same 4x-unrolled loop twice: npyv_loada_* when is_aligned, npyv_load_* otherwise. */
+    #line 91
+    if(is_aligned) {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_clongdouble a0 = npyv_loada_clongdouble(data + vstep * 0);
+            
+#line 96
+            npyv_clongdouble a1 = npyv_loada_clongdouble(data + vstep * 1);
+            
+#line 96
+            npyv_clongdouble a2 = npyv_loada_clongdouble(data + vstep * 2);
+            
+#line 96
+            npyv_clongdouble a3 = npyv_loada_clongdouble(data + vstep * 3);
+            
+            npyv_clongdouble a01   = npyv_add_clongdouble(a0, a1);
+            npyv_clongdouble a23   = npyv_add_clongdouble(a2, a3);
+            npyv_clongdouble a0123 = npyv_add_clongdouble(a01, a23);
+                     v_accum = npyv_add_clongdouble(a0123, v_accum);
+        }
+    }
+    
+#line 91
+    else {
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4) {
+            #line 96
+            npyv_clongdouble a0 = npyv_load_clongdouble(data + vstep * 0);
+            
+#line 96
+            npyv_clongdouble a1 = npyv_load_clongdouble(data + vstep * 1);
+            
+#line 96
+            npyv_clongdouble a2 = npyv_load_clongdouble(data + vstep * 2);
+            
+#line 96
+            npyv_clongdouble a3 = npyv_load_clongdouble(data + vstep * 3);
+            
+            npyv_clongdouble a01   = npyv_add_clongdouble(a0, a1);
+            npyv_clongdouble a23   = npyv_add_clongdouble(a2, a3);
+            npyv_clongdouble a0123 = npyv_add_clongdouble(a01, a23);
+                     v_accum = npyv_add_clongdouble(a0123, v_accum);
+        }
+    }
+    /* Tail: one vector step at a time; npyv_load_tillz_* presumably zero-fills past `count` (confirm). */
+    for (; count > 0; count -= vstep, data += vstep) {
+        npyv_clongdouble a = npyv_load_tillz_clongdouble(data, count);
+        v_accum = npyv_add_clongdouble(a, v_accum);
+    }
+    accum = npyv_sum_clongdouble(v_accum);
+    npyv_cleanup();
+#else /* scalar fallback: the only branch that survives the `#if 0` above */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data += 4) { /* 4 elements per pass; leaves at most 4 for the tail */
+        const npy_longdouble a01 = (*data) + (data[1]);
+        const npy_longdouble a23 = (data[2]) + (data[3]);
+        accum +=  a01 + a23; /* pairwise partial sums are combined before touching the accumulator */
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data++) { /* tail: remaining elements one at a time */
+        accum += (*data);
+    }
+#endif // NPYV check for npy_clongdouble
+    return accum;
+}
+#endif
+
+#line 131
+static void
+clongdouble_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{ /* Strided single-operand kernel: adds each complex element of dataptr[0] into dataptr[1]. */
+#if (1 == 1) || (1 <= 3 && !1) /* true via `1 == 1`: the input pointer and stride are cached */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !1 /* constant-false: no second operand */
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !1 /* constant-false: no third operand */
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (1 == 1) || (1 <= 3 && !1) /* true: with one input operand the output is dataptr[1] */
+    char *data_out = dataptr[1];
+    npy_intp stride_out = strides[1];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_one (%d)\n", (int)count);
+    /* Works on the local copies above; the dataptr[] entries themselves are not advanced. */
+    while (count--) {
+#if !1 /* real-valued variants; constant-false, so everything up to `#else` is compiled out */
+#  if 1 == 1
+        *(npy_clongdouble *)data_out = ((*(npy_clongdouble *)data0) +
+                                         (*(npy_clongdouble *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 1 == 2
+        *(npy_clongdouble *)data_out = ((*(npy_clongdouble *)data0) *
+                                         (*(npy_clongdouble *)data1) +
+                                         (*(npy_clongdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 1 == 3
+        *(npy_clongdouble *)data_out = ((*(npy_clongdouble *)data0) *
+                                         (*(npy_clongdouble *)data1) *
+                                         (*(npy_clongdouble *)data2) +
+                                         (*(npy_clongdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_longdouble temp = (*(npy_clongdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_clongdouble *)dataptr[i]);
+        }
+        *(npy_clongdouble *)dataptr[nop] = (temp +
+                                           (*(npy_clongdouble *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1 /* true: this is the branch that is actually compiled */
+        ((npy_longdouble *)data_out)[0] = ((npy_longdouble *)data0)[0] + /* real part */
+                                         ((npy_longdouble *)data_out)[0];
+        ((npy_longdouble *)data_out)[1] = ((npy_longdouble *)data0)[1] + /* imaginary part */
+                                         ((npy_longdouble *)data_out)[1];
+        data0 += stride0; /* strides are added to char pointers, i.e. they are byte offsets */
+        data_out += stride_out;
+#  else /* generic multi-operand product; compiled out for this single-operand kernel */
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_longdouble re, im, tmp;
+        int i;
+        re = ((npy_longdouble *)dataptr[0])[0];
+        im = ((npy_longdouble *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longdouble *)dataptr[i])[0] -
+                  im * ((npy_longdouble *)dataptr[i])[1];
+            im = re * ((npy_longdouble *)dataptr[i])[1] +
+                 im * ((npy_longdouble *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 1 == 1
+
+static void
+clongdouble_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *data0 = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *data_out = (npy_clongdouble *)dataptr[1];
+    /* Contiguous kernel: data_out[k] += data0[k] for k in [0, count), on re and im separately. */
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+    /* Flow: switch handles 0..7 and returns; otherwise unrolled loop by 8, then jump back here. */
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) { /* no default: a count outside 0..7 matches nothing and continues below */
+#line 246
+        case 6+1: /* count == 7: handle element 6, then fall through to elements 5..0 */
+#if !1 /* real-valued variant; constant-false, so the `#else` complex form is what compiles */
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_longdouble *)data_out + 2*6)[0] = /* element k: slot 2*k is re, slot 2*k+1 is im */
+                                    ((npy_longdouble *)data0 + 2*6)[0] +
+                                    ((npy_longdouble *)data_out + 2*6)[0];
+            ((npy_longdouble *)data_out + 2*6)[1] =
+                                    ((npy_longdouble *)data0 + 2*6)[1] +
+                                    ((npy_longdouble *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !1
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_longdouble *)data_out + 2*5)[0] =
+                                    ((npy_longdouble *)data0 + 2*5)[0] +
+                                    ((npy_longdouble *)data_out + 2*5)[0];
+            ((npy_longdouble *)data_out + 2*5)[1] =
+                                    ((npy_longdouble *)data0 + 2*5)[1] +
+                                    ((npy_longdouble *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !1
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_longdouble *)data_out + 2*4)[0] =
+                                    ((npy_longdouble *)data0 + 2*4)[0] +
+                                    ((npy_longdouble *)data_out + 2*4)[0];
+            ((npy_longdouble *)data_out + 2*4)[1] =
+                                    ((npy_longdouble *)data0 + 2*4)[1] +
+                                    ((npy_longdouble *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !1
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_longdouble *)data_out + 2*3)[0] =
+                                    ((npy_longdouble *)data0 + 2*3)[0] +
+                                    ((npy_longdouble *)data_out + 2*3)[0];
+            ((npy_longdouble *)data_out + 2*3)[1] =
+                                    ((npy_longdouble *)data0 + 2*3)[1] +
+                                    ((npy_longdouble *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !1
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_longdouble *)data_out + 2*2)[0] =
+                                    ((npy_longdouble *)data0 + 2*2)[0] +
+                                    ((npy_longdouble *)data_out + 2*2)[0];
+            ((npy_longdouble *)data_out + 2*2)[1] =
+                                    ((npy_longdouble *)data0 + 2*2)[1] +
+                                    ((npy_longdouble *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !1
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_longdouble *)data_out + 2*1)[0] =
+                                    ((npy_longdouble *)data0 + 2*1)[0] +
+                                    ((npy_longdouble *)data_out + 2*1)[0];
+            ((npy_longdouble *)data_out + 2*1)[1] =
+                                    ((npy_longdouble *)data0 + 2*1)[1] +
+                                    ((npy_longdouble *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !1
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_longdouble *)data_out + 2*0)[0] =
+                                    ((npy_longdouble *)data0 + 2*0)[0] +
+                                    ((npy_longdouble *)data_out + 2*0)[0];
+            ((npy_longdouble *)data_out + 2*0)[1] =
+                                    ((npy_longdouble *)data0 + 2*0)[1] +
+                                    ((npy_longdouble *)data_out + 2*0)[1];
+#endif
+            /* Every case above falls through to here, so any count in 0..7 ends with this return. */
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+        /* Eight generated copies of the same element-wise add follow, for offsets 0..7. */
+#line 270
+#if !1
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*0)[0] =
+                                ((npy_longdouble *)data0 + 2*0)[0] +
+                                ((npy_longdouble *)data_out + 2*0)[0];
+        ((npy_longdouble *)data_out + 2*0)[1] =
+                                ((npy_longdouble *)data0 + 2*0)[1] +
+                                ((npy_longdouble *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*1)[0] =
+                                ((npy_longdouble *)data0 + 2*1)[0] +
+                                ((npy_longdouble *)data_out + 2*1)[0];
+        ((npy_longdouble *)data_out + 2*1)[1] =
+                                ((npy_longdouble *)data0 + 2*1)[1] +
+                                ((npy_longdouble *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*2)[0] =
+                                ((npy_longdouble *)data0 + 2*2)[0] +
+                                ((npy_longdouble *)data_out + 2*2)[0];
+        ((npy_longdouble *)data_out + 2*2)[1] =
+                                ((npy_longdouble *)data0 + 2*2)[1] +
+                                ((npy_longdouble *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*3)[0] =
+                                ((npy_longdouble *)data0 + 2*3)[0] +
+                                ((npy_longdouble *)data_out + 2*3)[0];
+        ((npy_longdouble *)data_out + 2*3)[1] =
+                                ((npy_longdouble *)data0 + 2*3)[1] +
+                                ((npy_longdouble *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*4)[0] =
+                                ((npy_longdouble *)data0 + 2*4)[0] +
+                                ((npy_longdouble *)data_out + 2*4)[0];
+        ((npy_longdouble *)data_out + 2*4)[1] =
+                                ((npy_longdouble *)data0 + 2*4)[1] +
+                                ((npy_longdouble *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*5)[0] =
+                                ((npy_longdouble *)data0 + 2*5)[0] +
+                                ((npy_longdouble *)data_out + 2*5)[0];
+        ((npy_longdouble *)data_out + 2*5)[1] =
+                                ((npy_longdouble *)data0 + 2*5)[1] +
+                                ((npy_longdouble *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*6)[0] =
+                                ((npy_longdouble *)data0 + 2*6)[0] +
+                                ((npy_longdouble *)data_out + 2*6)[0];
+        ((npy_longdouble *)data_out + 2*6)[1] =
+                                ((npy_longdouble *)data0 + 2*6)[1] +
+                                ((npy_longdouble *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*7)[0] =
+                                ((npy_longdouble *)data0 + 2*7)[0] +
+                                ((npy_longdouble *)data_out + 2*7)[0];
+        ((npy_longdouble *)data_out + 2*7)[1] =
+                                ((npy_longdouble *)data0 + 2*7)[1] +
+                                ((npy_longdouble *)data_out + 2*7)[1];
+#endif
+        /* Step both typed pointers past the 8 complex elements just processed. */
+        data0 += 8;
+        data_out += 8;
+    }
+    /* NOTE(review): a negative count would match no case and spin here; callers presumably pass >= 0. */
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1 == 2 && !1
+
+// Multiply-add helper: data_out[i] = scalar * data[i] + data_out[i] for each i in [0, count).
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_muladd(npy_clongdouble *data, npy_clongdouble *data_out, npy_longdouble scalar, npy_intp count)
+{ /* The enclosing `#elif 1 == 2 && !1` is constant-false, so this helper is never compiled. */
+#if 0 // NPYV check for npy_clongdouble (constant-false: the vector branch below is compiled out)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_clongdouble;
+    const npyv_clongdouble v_scalar = npyv_setall_clongdouble(scalar);
+    #line 306
+    if(is_aligned) { /* both buffers aligned: npyv_loada_* / npyv_storea_* variants are used */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_clongdouble b0 = npyv_loada_clongdouble(data + vstep * 0);
+            npyv_clongdouble c0 = npyv_loada_clongdouble(data_out + vstep * 0);
+            
+#line 312
+            npyv_clongdouble b1 = npyv_loada_clongdouble(data + vstep * 1);
+            npyv_clongdouble c1 = npyv_loada_clongdouble(data_out + vstep * 1);
+            
+#line 312
+            npyv_clongdouble b2 = npyv_loada_clongdouble(data + vstep * 2);
+            npyv_clongdouble c2 = npyv_loada_clongdouble(data_out + vstep * 2);
+            
+#line 312
+            npyv_clongdouble b3 = npyv_loada_clongdouble(data + vstep * 3);
+            npyv_clongdouble c3 = npyv_loada_clongdouble(data_out + vstep * 3);
+            
+            #line 318
+            npyv_clongdouble abc0 = npyv_muladd_clongdouble(v_scalar, b0, c0);
+            
+#line 318
+            npyv_clongdouble abc1 = npyv_muladd_clongdouble(v_scalar, b1, c1);
+            
+#line 318
+            npyv_clongdouble abc2 = npyv_muladd_clongdouble(v_scalar, b2, c2);
+            
+#line 318
+            npyv_clongdouble abc3 = npyv_muladd_clongdouble(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_clongdouble(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_clongdouble(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_clongdouble(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_clongdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else { /* otherwise the same loop with npyv_load_* / npyv_store_* */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_clongdouble b0 = npyv_load_clongdouble(data + vstep * 0);
+            npyv_clongdouble c0 = npyv_load_clongdouble(data_out + vstep * 0);
+            
+#line 312
+            npyv_clongdouble b1 = npyv_load_clongdouble(data + vstep * 1);
+            npyv_clongdouble c1 = npyv_load_clongdouble(data_out + vstep * 1);
+            
+#line 312
+            npyv_clongdouble b2 = npyv_load_clongdouble(data + vstep * 2);
+            npyv_clongdouble c2 = npyv_load_clongdouble(data_out + vstep * 2);
+            
+#line 312
+            npyv_clongdouble b3 = npyv_load_clongdouble(data + vstep * 3);
+            npyv_clongdouble c3 = npyv_load_clongdouble(data_out + vstep * 3);
+            
+            #line 318
+            npyv_clongdouble abc0 = npyv_muladd_clongdouble(v_scalar, b0, c0);
+            
+#line 318
+            npyv_clongdouble abc1 = npyv_muladd_clongdouble(v_scalar, b1, c1);
+            
+#line 318
+            npyv_clongdouble abc2 = npyv_muladd_clongdouble(v_scalar, b2, c2);
+            
+#line 318
+            npyv_clongdouble abc3 = npyv_muladd_clongdouble(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_clongdouble(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_clongdouble(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_clongdouble(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_clongdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    /* Tail: one vector step at a time through the *_tillz / *_till partial load and store calls. */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_clongdouble a = npyv_load_tillz_clongdouble(data, count);
+        npyv_clongdouble b = npyv_load_tillz_clongdouble(data_out, count);
+        npyv_store_till_clongdouble(data_out, count, npyv_muladd_clongdouble(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else /* scalar fallback: the only branch that survives the `#if 0` above */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) { /* 4 elements per pass: load, compute, store */
+        #line 340
+        const npy_longdouble b0 = (data[0]);
+        const npy_longdouble c0 = (data_out[0]);
+        
+#line 340
+        const npy_longdouble b1 = (data[1]);
+        const npy_longdouble c1 = (data_out[1]);
+        
+#line 340
+        const npy_longdouble b2 = (data[2]);
+        const npy_longdouble c2 = (data_out[2]);
+        
+#line 340
+        const npy_longdouble b3 = (data[3]);
+        const npy_longdouble c3 = (data_out[3]);
+        
+        #line 346
+        const npy_longdouble abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_longdouble abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_longdouble abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_longdouble abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data, ++data_out) { /* tail: remaining 0..3 elements one at a time */
+        const npy_longdouble b = (*data);
+        const npy_longdouble c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_clongdouble
+}
+
+static void
+clongdouble_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *data0 = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *data1 = (npy_clongdouble *)dataptr[1];
+    npy_clongdouble *data_out = (npy_clongdouble *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_clongdouble
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_clongdouble;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_clongdouble a0 = npyv_loada_clongdouble(data0 + vstep * 0);
+            npyv_clongdouble b0 = npyv_loada_clongdouble(data1 + vstep * 0);
+            npyv_clongdouble c0 = npyv_loada_clongdouble(data_out + vstep * 0);
+            
+#line 390
+            npyv_clongdouble a1 = npyv_loada_clongdouble(data0 + vstep * 1);
+            npyv_clongdouble b1 = npyv_loada_clongdouble(data1 + vstep * 1);
+            npyv_clongdouble c1 = npyv_loada_clongdouble(data_out + vstep * 1);
+            
+#line 390
+            npyv_clongdouble a2 = npyv_loada_clongdouble(data0 + vstep * 2);
+            npyv_clongdouble b2 = npyv_loada_clongdouble(data1 + vstep * 2);
+            npyv_clongdouble c2 = npyv_loada_clongdouble(data_out + vstep * 2);
+            
+#line 390
+            npyv_clongdouble a3 = npyv_loada_clongdouble(data0 + vstep * 3);
+            npyv_clongdouble b3 = npyv_loada_clongdouble(data1 + vstep * 3);
+            npyv_clongdouble c3 = npyv_loada_clongdouble(data_out + vstep * 3);
+            
+            #line 397
+            npyv_clongdouble abc0 = npyv_muladd_clongdouble(a0, b0, c0);
+            
+#line 397
+            npyv_clongdouble abc1 = npyv_muladd_clongdouble(a1, b1, c1);
+            
+#line 397
+            npyv_clongdouble abc2 = npyv_muladd_clongdouble(a2, b2, c2);
+            
+#line 397
+            npyv_clongdouble abc3 = npyv_muladd_clongdouble(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_clongdouble(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_clongdouble(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_clongdouble(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_clongdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_clongdouble a0 = npyv_load_clongdouble(data0 + vstep * 0);
+            npyv_clongdouble b0 = npyv_load_clongdouble(data1 + vstep * 0);
+            npyv_clongdouble c0 = npyv_load_clongdouble(data_out + vstep * 0);
+            
+#line 390
+            npyv_clongdouble a1 = npyv_load_clongdouble(data0 + vstep * 1);
+            npyv_clongdouble b1 = npyv_load_clongdouble(data1 + vstep * 1);
+            npyv_clongdouble c1 = npyv_load_clongdouble(data_out + vstep * 1);
+            
+#line 390
+            npyv_clongdouble a2 = npyv_load_clongdouble(data0 + vstep * 2);
+            npyv_clongdouble b2 = npyv_load_clongdouble(data1 + vstep * 2);
+            npyv_clongdouble c2 = npyv_load_clongdouble(data_out + vstep * 2);
+            
+#line 390
+            npyv_clongdouble a3 = npyv_load_clongdouble(data0 + vstep * 3);
+            npyv_clongdouble b3 = npyv_load_clongdouble(data1 + vstep * 3);
+            npyv_clongdouble c3 = npyv_load_clongdouble(data_out + vstep * 3);
+            
+            #line 397
+            npyv_clongdouble abc0 = npyv_muladd_clongdouble(a0, b0, c0);
+            
+#line 397
+            npyv_clongdouble abc1 = npyv_muladd_clongdouble(a1, b1, c1);
+            
+#line 397
+            npyv_clongdouble abc2 = npyv_muladd_clongdouble(a2, b2, c2);
+            
+#line 397
+            npyv_clongdouble abc3 = npyv_muladd_clongdouble(a3, b3, c3);
+            
+            #line 402
+            npyv_store_clongdouble(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_clongdouble(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_clongdouble(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_clongdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_clongdouble a = npyv_load_tillz_clongdouble(data0, count);
+        npyv_clongdouble b = npyv_load_tillz_clongdouble(data1, count);
+        npyv_clongdouble c = npyv_load_tillz_clongdouble(data_out, count);
+        npyv_store_till_clongdouble(data_out, count, npyv_muladd_clongdouble(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_longdouble a0 = (data0[0]);
+        const npy_longdouble b0 = (data1[0]);
+        const npy_longdouble c0 = (data_out[0]);
+        
+#line 420
+        const npy_longdouble a1 = (data0[1]);
+        const npy_longdouble b1 = (data1[1]);
+        const npy_longdouble c1 = (data_out[1]);
+        
+#line 420
+        const npy_longdouble a2 = (data0[2]);
+        const npy_longdouble b2 = (data1[2]);
+        const npy_longdouble c2 = (data_out[2]);
+        
+#line 420
+        const npy_longdouble a3 = (data0[3]);
+        const npy_longdouble b3 = (data1[3]);
+        const npy_longdouble c3 = (data_out[3]);
+        
+        #line 427
+        const npy_longdouble abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_longdouble abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_longdouble abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_longdouble abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_longdouble a = (*data0);
+        const npy_longdouble b = (*data1);
+        const npy_longdouble c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_clongdouble
+
+}
+
+/* Some extra specializations for the two operand case */
+static void
+clongdouble_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 0 is read once as a scalar; operand 1 and the output are walked as arrays. */
+    npy_longdouble scalar = (*(npy_clongdouble *)dataptr[0]);
+    npy_clongdouble *vec = (npy_clongdouble *)dataptr[1];
+    npy_clongdouble *out = (npy_clongdouble *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    /* The element-wise work (out = vec*scalar + out, per the helper's comment) is delegated. */
+    clongdouble_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+static void
+clongdouble_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Operand 1 is read once as a scalar; operand 0 and the output are walked as arrays. */
+    npy_longdouble scalar = (*(npy_clongdouble *)dataptr[1]);
+    npy_clongdouble *vec = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *out = (npy_clongdouble *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    clongdouble_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Adds sum(data0[k] * data1[k]) over count elements to the single output at dataptr[2]. */
+    npy_clongdouble *data0 = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *data1 = (npy_clongdouble *)dataptr[1];
+    npy_longdouble accum = 0;  /* running total, folded into dataptr[2] on the last line */
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_clongdouble (constant 0: this vector path is compiled out)
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_clongdouble;
+    npyv_clongdouble v_accum = npyv_zero_clongdouble();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_clongdouble a0 = npyv_loada_clongdouble(data0 + vstep * 0);
+            npyv_clongdouble b0 = npyv_loada_clongdouble(data1 + vstep * 0);
+            
+#line 501
+            npyv_clongdouble a1 = npyv_loada_clongdouble(data0 + vstep * 1);
+            npyv_clongdouble b1 = npyv_loada_clongdouble(data1 + vstep * 1);
+            
+#line 501
+            npyv_clongdouble a2 = npyv_loada_clongdouble(data0 + vstep * 2);
+            npyv_clongdouble b2 = npyv_loada_clongdouble(data1 + vstep * 2);
+            
+#line 501
+            npyv_clongdouble a3 = npyv_loada_clongdouble(data0 + vstep * 3);
+            npyv_clongdouble b3 = npyv_loada_clongdouble(data1 + vstep * 3);
+            /* chain the four multiply-adds so each feeds the next, ending in v_accum */
+            npyv_clongdouble ab3 = npyv_muladd_clongdouble(a3, b3, v_accum);
+            npyv_clongdouble ab2 = npyv_muladd_clongdouble(a2, b2, ab3);
+            npyv_clongdouble ab1 = npyv_muladd_clongdouble(a1, b1, ab2);
+                   v_accum = npyv_muladd_clongdouble(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_clongdouble a0 = npyv_load_clongdouble(data0 + vstep * 0);
+            npyv_clongdouble b0 = npyv_load_clongdouble(data1 + vstep * 0);
+            
+#line 501
+            npyv_clongdouble a1 = npyv_load_clongdouble(data0 + vstep * 1);
+            npyv_clongdouble b1 = npyv_load_clongdouble(data1 + vstep * 1);
+            
+#line 501
+            npyv_clongdouble a2 = npyv_load_clongdouble(data0 + vstep * 2);
+            npyv_clongdouble b2 = npyv_load_clongdouble(data1 + vstep * 2);
+            
+#line 501
+            npyv_clongdouble a3 = npyv_load_clongdouble(data0 + vstep * 3);
+            npyv_clongdouble b3 = npyv_load_clongdouble(data1 + vstep * 3);
+            /* same multiply-add chain as the aligned branch, using unaligned loads */
+            npyv_clongdouble ab3 = npyv_muladd_clongdouble(a3, b3, v_accum);
+            npyv_clongdouble ab2 = npyv_muladd_clongdouble(a2, b2, ab3);
+            npyv_clongdouble ab1 = npyv_muladd_clongdouble(a1, b1, ab2);
+                   v_accum = npyv_muladd_clongdouble(a0, b0, ab1);
+        }
+    }
+    /* remainder: one vector per pass, loaded with the count-limited load_tillz calls */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_clongdouble a = npyv_load_tillz_clongdouble(data0, count);
+        npyv_clongdouble b = npyv_load_tillz_clongdouble(data1, count);
+        v_accum = npyv_muladd_clongdouble(a, b, v_accum);
+    }
+    accum = npyv_sum_clongdouble(v_accum);
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION  /* manual 4x unroll of the scalar loop below */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_longdouble ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_longdouble ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_longdouble ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_longdouble ab3 = (data0[3]) * (data1[3]);
+        /* the four products are summed together first, then added to accum */
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, ++data0, ++data1) {  /* scalar remainder, one element per pass */
+        const npy_longdouble a = (*data0);
+        const npy_longdouble b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_clongdouble
+    *(npy_clongdouble *)dataptr[2] = ((*(npy_clongdouble *)dataptr[2]) + accum);
+}
+
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* dataptr[2] += scalar(dataptr[0]) * clongdouble_sum_of_arr(dataptr[1], count) */
+    npy_longdouble scalar = (*(npy_clongdouble *)dataptr[0]);
+    npy_longdouble total = clongdouble_sum_of_arr((npy_clongdouble *)dataptr[1], count);
+    npy_clongdouble *out = (npy_clongdouble *)dataptr[2];
+    *out = ((*out) + scalar * total);
+}
+
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* dataptr[2] += scalar(dataptr[1]) * clongdouble_sum_of_arr(dataptr[0], count) */
+    npy_longdouble scalar = (*(npy_clongdouble *)dataptr[1]);
+    npy_longdouble total = clongdouble_sum_of_arr((npy_clongdouble *)dataptr[0], count);
+    npy_clongdouble *out = (npy_clongdouble *)dataptr[2];
+    *out = ((*out) + scalar * total);
+}
+
+#elif 1 == 3 && !1
+
+static void  /* sits under `#elif 1 == 3 && !1`, a constant-false condition, so it is compiled out */
+clongdouble_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Contiguous three-operand kernel: data_out[k] += data0[k] * data1[k] * data2[k]. */
+    npy_clongdouble *data0 = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *data1 = (npy_clongdouble *)dataptr[1];
+    npy_clongdouble *data2 = (npy_clongdouble *)dataptr[2];
+    npy_clongdouble *data_out = (npy_clongdouble *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop: each step below returns as soon as the leftover count hits 0 */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {  /* for a non-negative count fewer than 8 remain, so the [7] update below is not reached */
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 1 > 3 || complex type */
+
+static void
+clongdouble_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Contiguous single-operand complex kernel: adds each input (re, im) pair into the output pair. */
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !1  /* constant-false: only the complex #else branch below is compiled */
+        npy_longdouble temp = (*(npy_clongdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_clongdouble *)dataptr[i]);
+        }
+        *(npy_clongdouble *)dataptr[nop] = (temp +
+                                           (*(npy_clongdouble *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_clongdouble);
+        }
+#else /* complex */
+#  if 1 <= 3
+#    define _SUMPROD_NOP 1
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        npy_longdouble re, im, tmp;
+        int i;
+        re = ((npy_longdouble *)dataptr[0])[0];
+        im = ((npy_longdouble *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {  /* complex multiply by operand i; zero passes when _SUMPROD_NOP is 1 */
+            tmp = re * ((npy_longdouble *)dataptr[i])[0] -
+                  im * ((npy_longdouble *)dataptr[i])[1];
+            im = re * ((npy_longdouble *)dataptr[i])[1] +
+                 im * ((npy_longdouble *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {  /* advance the input and output pointers by one complex element */
+            dataptr[i] += sizeof(npy_clongdouble);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 1 */
+
+#if 1 == 1
+
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Adds the sum of count contiguous complex values at dataptr[0] into the single output at dataptr[1]. */
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !1  /* constant-false: only the complex #else branch below is compiled */
+    npy_clongdouble *data = (npy_clongdouble *)dataptr[0];
+    npy_longdouble accum = clongdouble_sum_of_arr(data, count);
+    *((npy_clongdouble *)dataptr[1]) = (accum + (*((npy_clongdouble *)dataptr[1])));
+#else
+    npy_longdouble accum_re = 0, accum_im = 0;
+    npy_longdouble *data0 = (npy_longdouble *)dataptr[0];  /* input viewed as interleaved re, im scalars */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    for (; count > 4; count -= 4, data0 += 4*2) {  /* 4 complex values (8 scalars) per pass; with the strict '>' a final group of exactly 4 goes to the loop below */
+        const npy_longdouble re01 = data0[0] + data0[2];
+        const npy_longdouble re23 = data0[4] + data0[6];
+        const npy_longdouble im13 = data0[1] + data0[3];
+        const npy_longdouble im57 = data0[5] + data0[7];
+        accum_re += re01 + re23;
+        accum_im += im13 + im57;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; count > 0; --count, data0 += 2) {  /* remainder: one complex value per pass */
+        accum_re += data0[0];
+        accum_im += data0[1];
+    }
+    ((npy_longdouble *)dataptr[1])[0] += accum_re;
+    ((npy_longdouble *)dataptr[1])[1] += accum_im;
+#endif // !1
+}
+
+#endif /* 1 == 1 */
+
+static void
+clongdouble_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Strided single-operand reduction: adds count complex values from dataptr[0] into the output at dataptr[1]. */
+#if 1  /* complex instantiation: separate real and imaginary accumulators */
+    npy_longdouble accum_re = 0, accum_im = 0;
+#else
+    npy_longdouble accum = 0;
+#endif
+
+#if (1 == 1) || (1 <= 3 && !1)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (1 == 2 || 1 == 3) && !1
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (1 == 3) && !1
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_outstride0_one (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !1  /* constant-false: the real-type variants below are compiled out */
+#  if 1 == 1
+        accum += (*(npy_clongdouble *)data0);
+        data0 += stride0;
+#  elif 1 == 2
+        accum += (*(npy_clongdouble *)data0) *
+                 (*(npy_clongdouble *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 1 == 3
+        accum += (*(npy_clongdouble *)data0) *
+                 (*(npy_clongdouble *)data1) *
+                 (*(npy_clongdouble *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_longdouble temp = (*(npy_clongdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_clongdouble *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 1 == 1  /* compiled branch: add one (re, im) pair, then step the char cursor by stride0 */
+        accum_re += ((npy_longdouble *)data0)[0];
+        accum_im += ((npy_longdouble *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 1 <= 3
+#define _SUMPROD_NOP 1
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_longdouble re, im, tmp;
+        int i;
+        re = ((npy_longdouble *)dataptr[0])[0];
+        im = ((npy_longdouble *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longdouble *)dataptr[i])[0] -
+                  im * ((npy_longdouble *)dataptr[i])[1];
+            im = re * ((npy_longdouble *)dataptr[i])[1] +
+                 im * ((npy_longdouble *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+#if 1  /* complex: fold both accumulators into the output pair */
+#  if 1 <= 3
+    ((npy_longdouble *)dataptr[1])[0] += accum_re;
+    ((npy_longdouble *)dataptr[1])[1] += accum_im;
+#  else
+    ((npy_longdouble *)dataptr[nop])[0] += accum_re;
+    ((npy_longdouble *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 1 <= 3
+    *((npy_clongdouble *)dataptr[1]) = (accum +
+                                    (*((npy_clongdouble *)dataptr[1])));
+#  else
+    *((npy_clongdouble *)dataptr[nop]) = (accum +
+                                    (*((npy_clongdouble *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+static void
+clongdouble_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{   /* Strided two-operand complex kernel: per element, output += operand0 * operand1 (complex product). */
+#if (2 == 1) || (2 <= 3 && !1)  /* constant-false, like the three guards below: no local cursors are declared */
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !1
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !1
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (2 == 1) || (2 <= 3 && !1)
+    char *data_out = dataptr[2];
+    npy_intp stride_out = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_two (%d)\n", (int)count);
+
+    while (count--) {
+#if !1  /* constant-false: the real-type variants below are compiled out */
+#  if 2 == 1
+        *(npy_clongdouble *)data_out = ((*(npy_clongdouble *)data0) +
+                                         (*(npy_clongdouble *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 2 == 2
+        *(npy_clongdouble *)data_out = ((*(npy_clongdouble *)data0) *
+                                         (*(npy_clongdouble *)data1) +
+                                         (*(npy_clongdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 2 == 3
+        *(npy_clongdouble *)data_out = ((*(npy_clongdouble *)data0) *
+                                         (*(npy_clongdouble *)data1) *
+                                         (*(npy_clongdouble *)data2) +
+                                         (*(npy_clongdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_longdouble temp = (*(npy_clongdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_clongdouble *)dataptr[i]);
+        }
+        *(npy_clongdouble *)dataptr[nop] = (temp +
+                                           (*(npy_clongdouble *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        ((npy_longdouble *)data_out)[0] = ((npy_longdouble *)data0)[0] +
+                                         ((npy_longdouble *)data_out)[0];
+        ((npy_longdouble *)data_out)[1] = ((npy_longdouble *)data0)[1] +
+                                         ((npy_longdouble *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        npy_longdouble re, im, tmp;
+        int i;
+        re = ((npy_longdouble *)dataptr[0])[0];
+        im = ((npy_longdouble *)dataptr[0])[1];
+        for (i = 1; i < _SUMPROD_NOP; ++i) {  /* complex multiply of (re, im) by operand i; tmp holds the new real part */
+            tmp = re * ((npy_longdouble *)dataptr[i])[0] -
+                  im * ((npy_longdouble *)dataptr[i])[1];
+            im = re * ((npy_longdouble *)dataptr[i])[1] +
+                 im * ((npy_longdouble *)dataptr[i])[0];
+            re = tmp;
+        }
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1];
+
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {  /* step both operands and the output by their own strides */
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 2 == 1
+
+static void  /* sits under `#if 2 == 1`, a constant-false condition, so this copy is compiled out */
+clongdouble_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{   /* Contiguous single-operand kernel: data_out[k] += data0[k], written on (re, im) pairs in the #else branches. */
+    npy_clongdouble *data0 = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *data_out = (npy_clongdouble *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {  /* cases fall through: count == k runs the updates for indices k-1 down to 0, then returns */
+#line 246
+        case 6+1:
+#if !1
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_longdouble *)data_out + 2*6)[0] =
+                                    ((npy_longdouble *)data0 + 2*6)[0] +
+                                    ((npy_longdouble *)data_out + 2*6)[0];
+            ((npy_longdouble *)data_out + 2*6)[1] =
+                                    ((npy_longdouble *)data0 + 2*6)[1] +
+                                    ((npy_longdouble *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !1
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_longdouble *)data_out + 2*5)[0] =
+                                    ((npy_longdouble *)data0 + 2*5)[0] +
+                                    ((npy_longdouble *)data_out + 2*5)[0];
+            ((npy_longdouble *)data_out + 2*5)[1] =
+                                    ((npy_longdouble *)data0 + 2*5)[1] +
+                                    ((npy_longdouble *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !1
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_longdouble *)data_out + 2*4)[0] =
+                                    ((npy_longdouble *)data0 + 2*4)[0] +
+                                    ((npy_longdouble *)data_out + 2*4)[0];
+            ((npy_longdouble *)data_out + 2*4)[1] =
+                                    ((npy_longdouble *)data0 + 2*4)[1] +
+                                    ((npy_longdouble *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !1
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_longdouble *)data_out + 2*3)[0] =
+                                    ((npy_longdouble *)data0 + 2*3)[0] +
+                                    ((npy_longdouble *)data_out + 2*3)[0];
+            ((npy_longdouble *)data_out + 2*3)[1] =
+                                    ((npy_longdouble *)data0 + 2*3)[1] +
+                                    ((npy_longdouble *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !1
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_longdouble *)data_out + 2*2)[0] =
+                                    ((npy_longdouble *)data0 + 2*2)[0] +
+                                    ((npy_longdouble *)data_out + 2*2)[0];
+            ((npy_longdouble *)data_out + 2*2)[1] =
+                                    ((npy_longdouble *)data0 + 2*2)[1] +
+                                    ((npy_longdouble *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !1
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_longdouble *)data_out + 2*1)[0] =
+                                    ((npy_longdouble *)data0 + 2*1)[0] +
+                                    ((npy_longdouble *)data_out + 2*1)[0];
+            ((npy_longdouble *)data_out + 2*1)[1] =
+                                    ((npy_longdouble *)data0 + 2*1)[1] +
+                                    ((npy_longdouble *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !1
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_longdouble *)data_out + 2*0)[0] =
+                                    ((npy_longdouble *)data0 + 2*0)[0] +
+                                    ((npy_longdouble *)data_out + 2*0)[0];
+            ((npy_longdouble *)data_out + 2*0)[1] =
+                                    ((npy_longdouble *)data0 + 2*0)[1] +
+                                    ((npy_longdouble *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !1
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*0)[0] =
+                                ((npy_longdouble *)data0 + 2*0)[0] +
+                                ((npy_longdouble *)data_out + 2*0)[0];
+        ((npy_longdouble *)data_out + 2*0)[1] =
+                                ((npy_longdouble *)data0 + 2*0)[1] +
+                                ((npy_longdouble *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*1)[0] =
+                                ((npy_longdouble *)data0 + 2*1)[0] +
+                                ((npy_longdouble *)data_out + 2*1)[0];
+        ((npy_longdouble *)data_out + 2*1)[1] =
+                                ((npy_longdouble *)data0 + 2*1)[1] +
+                                ((npy_longdouble *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*2)[0] =
+                                ((npy_longdouble *)data0 + 2*2)[0] +
+                                ((npy_longdouble *)data_out + 2*2)[0];
+        ((npy_longdouble *)data_out + 2*2)[1] =
+                                ((npy_longdouble *)data0 + 2*2)[1] +
+                                ((npy_longdouble *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*3)[0] =
+                                ((npy_longdouble *)data0 + 2*3)[0] +
+                                ((npy_longdouble *)data_out + 2*3)[0];
+        ((npy_longdouble *)data_out + 2*3)[1] =
+                                ((npy_longdouble *)data0 + 2*3)[1] +
+                                ((npy_longdouble *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*4)[0] =
+                                ((npy_longdouble *)data0 + 2*4)[0] +
+                                ((npy_longdouble *)data_out + 2*4)[0];
+        ((npy_longdouble *)data_out + 2*4)[1] =
+                                ((npy_longdouble *)data0 + 2*4)[1] +
+                                ((npy_longdouble *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*5)[0] =
+                                ((npy_longdouble *)data0 + 2*5)[0] +
+                                ((npy_longdouble *)data_out + 2*5)[0];
+        ((npy_longdouble *)data_out + 2*5)[1] =
+                                ((npy_longdouble *)data0 + 2*5)[1] +
+                                ((npy_longdouble *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*6)[0] =
+                                ((npy_longdouble *)data0 + 2*6)[0] +
+                                ((npy_longdouble *)data_out + 2*6)[0];
+        ((npy_longdouble *)data_out + 2*6)[1] =
+                                ((npy_longdouble *)data0 + 2*6)[1] +
+                                ((npy_longdouble *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*7)[0] =
+                                ((npy_longdouble *)data0 + 2*7)[0] +
+                                ((npy_longdouble *)data_out + 2*7)[0];
+        ((npy_longdouble *)data_out + 2*7)[1] =
+                                ((npy_longdouble *)data0 + 2*7)[1] +
+                                ((npy_longdouble *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;  /* re-enter the switch above with the count left after the unrolled loop */
+}
+
+#elif 2 == 2 && !1
+
+/*
+ * Multiply-add helper: for each i in [0, count),
+ *     data_out[i] = scalar * data[i] + data_out[i]
+ *
+ * This definition sits inside the `#elif 2 == 2 && !1` region, whose
+ * condition is constant-false, so it is not compiled in this instantiation.
+ *
+ * data      input elements (only read)
+ * data_out  elements that are read, combined with the product, and rewritten
+ * scalar    factor applied to every element of `data`
+ * count     number of elements to process
+ */
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_muladd(npy_clongdouble *data, npy_clongdouble *data_out, npy_longdouble scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_clongdouble
+    /* Vector path: excluded from the build by the literal 0 above. */
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_clongdouble;
+    const npyv_clongdouble v_scalar = npyv_setall_clongdouble(scalar);
+    #line 306
+    if(is_aligned) {
+        /* Four vectors per iteration: load, multiply-add, store. */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_clongdouble b0 = npyv_loada_clongdouble(data + vstep * 0);
+            npyv_clongdouble c0 = npyv_loada_clongdouble(data_out + vstep * 0);
+            
+#line 312
+            npyv_clongdouble b1 = npyv_loada_clongdouble(data + vstep * 1);
+            npyv_clongdouble c1 = npyv_loada_clongdouble(data_out + vstep * 1);
+            
+#line 312
+            npyv_clongdouble b2 = npyv_loada_clongdouble(data + vstep * 2);
+            npyv_clongdouble c2 = npyv_loada_clongdouble(data_out + vstep * 2);
+            
+#line 312
+            npyv_clongdouble b3 = npyv_loada_clongdouble(data + vstep * 3);
+            npyv_clongdouble c3 = npyv_loada_clongdouble(data_out + vstep * 3);
+            
+            #line 318
+            npyv_clongdouble abc0 = npyv_muladd_clongdouble(v_scalar, b0, c0);
+            
+#line 318
+            npyv_clongdouble abc1 = npyv_muladd_clongdouble(v_scalar, b1, c1);
+            
+#line 318
+            npyv_clongdouble abc2 = npyv_muladd_clongdouble(v_scalar, b2, c2);
+            
+#line 318
+            npyv_clongdouble abc3 = npyv_muladd_clongdouble(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_clongdouble(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_clongdouble(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_clongdouble(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_clongdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        /* Same loop shape, using the npyv_load/npyv_store variants. */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_clongdouble b0 = npyv_load_clongdouble(data + vstep * 0);
+            npyv_clongdouble c0 = npyv_load_clongdouble(data_out + vstep * 0);
+            
+#line 312
+            npyv_clongdouble b1 = npyv_load_clongdouble(data + vstep * 1);
+            npyv_clongdouble c1 = npyv_load_clongdouble(data_out + vstep * 1);
+            
+#line 312
+            npyv_clongdouble b2 = npyv_load_clongdouble(data + vstep * 2);
+            npyv_clongdouble c2 = npyv_load_clongdouble(data_out + vstep * 2);
+            
+#line 312
+            npyv_clongdouble b3 = npyv_load_clongdouble(data + vstep * 3);
+            npyv_clongdouble c3 = npyv_load_clongdouble(data_out + vstep * 3);
+            
+            #line 318
+            npyv_clongdouble abc0 = npyv_muladd_clongdouble(v_scalar, b0, c0);
+            
+#line 318
+            npyv_clongdouble abc1 = npyv_muladd_clongdouble(v_scalar, b1, c1);
+            
+#line 318
+            npyv_clongdouble abc2 = npyv_muladd_clongdouble(v_scalar, b2, c2);
+            
+#line 318
+            npyv_clongdouble abc3 = npyv_muladd_clongdouble(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_clongdouble(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_clongdouble(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_clongdouble(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_clongdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    /* Remainder: one vector step at a time, bounded by the remaining count. */
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_clongdouble a = npyv_load_tillz_clongdouble(data, count);
+        npyv_clongdouble b = npyv_load_tillz_clongdouble(data_out, count);
+        npyv_store_till_clongdouble(data_out, count, npyv_muladd_clongdouble(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+    /* Scalar path. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four elements per iteration: load every input, compute, then store. */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_longdouble b0 = (data[0]);
+        const npy_longdouble c0 = (data_out[0]);
+        
+#line 340
+        const npy_longdouble b1 = (data[1]);
+        const npy_longdouble c1 = (data_out[1]);
+        
+#line 340
+        const npy_longdouble b2 = (data[2]);
+        const npy_longdouble c2 = (data_out[2]);
+        
+#line 340
+        const npy_longdouble b3 = (data[3]);
+        const npy_longdouble c3 = (data_out[3]);
+        
+        #line 346
+        const npy_longdouble abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_longdouble abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_longdouble abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_longdouble abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining elements (all of them when the unrolled loop is disabled). */
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_longdouble b = (*data);
+        const npy_longdouble c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_clongdouble
+}
+
+/*
+ * Element-wise multiply-accumulate of two densely indexed arrays:
+ *     data_out[i] = data0[i] * data1[i] + data_out[i],  i in [0, count)
+ *
+ * This copy lives in the constant-false `#elif 2 == 2 && !1` region and is
+ * therefore not compiled; the definition of the same name in the `#else`
+ * branch further down is the one that is built.
+ *
+ * nop      not referenced in the body
+ * dataptr  dataptr[0] and dataptr[1] are the inputs, dataptr[2] the output
+ * strides  unused (marked NPY_UNUSED)
+ * count    number of elements to process
+ */
+static void
+clongdouble_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *data0 = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *data1 = (npy_clongdouble *)dataptr[1];
+    npy_clongdouble *data_out = (npy_clongdouble *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_clongdouble
+#if 0
+    /* Vector path: excluded from the build by the literal 0 above. */
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_clongdouble;
+
+    #line 384
+    if(is_aligned) {
+        /* Four vectors per iteration: load, multiply-add, store. */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_clongdouble a0 = npyv_loada_clongdouble(data0 + vstep * 0);
+            npyv_clongdouble b0 = npyv_loada_clongdouble(data1 + vstep * 0);
+            npyv_clongdouble c0 = npyv_loada_clongdouble(data_out + vstep * 0);
+            
+#line 390
+            npyv_clongdouble a1 = npyv_loada_clongdouble(data0 + vstep * 1);
+            npyv_clongdouble b1 = npyv_loada_clongdouble(data1 + vstep * 1);
+            npyv_clongdouble c1 = npyv_loada_clongdouble(data_out + vstep * 1);
+            
+#line 390
+            npyv_clongdouble a2 = npyv_loada_clongdouble(data0 + vstep * 2);
+            npyv_clongdouble b2 = npyv_loada_clongdouble(data1 + vstep * 2);
+            npyv_clongdouble c2 = npyv_loada_clongdouble(data_out + vstep * 2);
+            
+#line 390
+            npyv_clongdouble a3 = npyv_loada_clongdouble(data0 + vstep * 3);
+            npyv_clongdouble b3 = npyv_loada_clongdouble(data1 + vstep * 3);
+            npyv_clongdouble c3 = npyv_loada_clongdouble(data_out + vstep * 3);
+            
+            #line 397
+            npyv_clongdouble abc0 = npyv_muladd_clongdouble(a0, b0, c0);
+            
+#line 397
+            npyv_clongdouble abc1 = npyv_muladd_clongdouble(a1, b1, c1);
+            
+#line 397
+            npyv_clongdouble abc2 = npyv_muladd_clongdouble(a2, b2, c2);
+            
+#line 397
+            npyv_clongdouble abc3 = npyv_muladd_clongdouble(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_clongdouble(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_clongdouble(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_clongdouble(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_clongdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        /* Same loop shape, using the npyv_load/npyv_store variants. */
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_clongdouble a0 = npyv_load_clongdouble(data0 + vstep * 0);
+            npyv_clongdouble b0 = npyv_load_clongdouble(data1 + vstep * 0);
+            npyv_clongdouble c0 = npyv_load_clongdouble(data_out + vstep * 0);
+            
+#line 390
+            npyv_clongdouble a1 = npyv_load_clongdouble(data0 + vstep * 1);
+            npyv_clongdouble b1 = npyv_load_clongdouble(data1 + vstep * 1);
+            npyv_clongdouble c1 = npyv_load_clongdouble(data_out + vstep * 1);
+            
+#line 390
+            npyv_clongdouble a2 = npyv_load_clongdouble(data0 + vstep * 2);
+            npyv_clongdouble b2 = npyv_load_clongdouble(data1 + vstep * 2);
+            npyv_clongdouble c2 = npyv_load_clongdouble(data_out + vstep * 2);
+            
+#line 390
+            npyv_clongdouble a3 = npyv_load_clongdouble(data0 + vstep * 3);
+            npyv_clongdouble b3 = npyv_load_clongdouble(data1 + vstep * 3);
+            npyv_clongdouble c3 = npyv_load_clongdouble(data_out + vstep * 3);
+            
+            #line 397
+            npyv_clongdouble abc0 = npyv_muladd_clongdouble(a0, b0, c0);
+            
+#line 397
+            npyv_clongdouble abc1 = npyv_muladd_clongdouble(a1, b1, c1);
+            
+#line 397
+            npyv_clongdouble abc2 = npyv_muladd_clongdouble(a2, b2, c2);
+            
+#line 397
+            npyv_clongdouble abc3 = npyv_muladd_clongdouble(a3, b3, c3);
+            
+            #line 402
+            npyv_store_clongdouble(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_clongdouble(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_clongdouble(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_clongdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    /* Remainder: one vector step at a time, bounded by the remaining count. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_clongdouble a = npyv_load_tillz_clongdouble(data0, count);
+        npyv_clongdouble b = npyv_load_tillz_clongdouble(data1, count);
+        npyv_clongdouble c = npyv_load_tillz_clongdouble(data_out, count);
+        npyv_store_till_clongdouble(data_out, count, npyv_muladd_clongdouble(a, b, c));
+    }
+    npyv_cleanup();
+#else
+    /* Scalar path. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four elements per iteration: load every input, compute, then store. */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_longdouble a0 = (data0[0]);
+        const npy_longdouble b0 = (data1[0]);
+        const npy_longdouble c0 = (data_out[0]);
+        
+#line 420
+        const npy_longdouble a1 = (data0[1]);
+        const npy_longdouble b1 = (data1[1]);
+        const npy_longdouble c1 = (data_out[1]);
+        
+#line 420
+        const npy_longdouble a2 = (data0[2]);
+        const npy_longdouble b2 = (data1[2]);
+        const npy_longdouble c2 = (data_out[2]);
+        
+#line 420
+        const npy_longdouble a3 = (data0[3]);
+        const npy_longdouble b3 = (data1[3]);
+        const npy_longdouble c3 = (data_out[3]);
+        
+        #line 427
+        const npy_longdouble abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_longdouble abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_longdouble abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_longdouble abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining elements (all of them when the unrolled loop is disabled). */
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_longdouble a = (*data0);
+        const npy_longdouble b = (*data1);
+        const npy_longdouble c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_clongdouble
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Reads a single value through dataptr[0] and hands it, together with the
+ * arrays at dataptr[1] (input) and dataptr[2] (output), to
+ * clongdouble_sum_of_products_muladd for `count` elements.
+ * `nop` and `strides` are not consulted.
+ */
+static void
+clongdouble_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble factor = (*(npy_clongdouble *)dataptr[0]);
+    npy_clongdouble *vec = (npy_clongdouble *)dataptr[1];
+    npy_clongdouble *acc = (npy_clongdouble *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+
+    /* acc[i] = factor * vec[i] + acc[i] */
+    clongdouble_sum_of_products_muladd(vec, acc, factor, count);
+}
+
+/*
+ * Mirror of the stride0/contig wrapper with the operand roles swapped: the
+ * single value is read through dataptr[1], the input array is dataptr[0],
+ * and the output array is dataptr[2]. The work is delegated to
+ * clongdouble_sum_of_products_muladd for `count` elements.
+ */
+static void
+clongdouble_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_longdouble factor = (*(npy_clongdouble *)dataptr[1]);
+    npy_clongdouble *vec = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *acc = (npy_clongdouble *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+
+    /* acc[i] = factor * vec[i] + acc[i] */
+    clongdouble_sum_of_products_muladd(vec, acc, factor, count);
+}
+
+/*
+ * Dot-product style reduction: accumulates data0[i] * data1[i] over
+ * i in [0, count) and adds the total to the single element at dataptr[2].
+ *
+ * Located in the constant-false `#elif 2 == 2 && !1` region, so it is not
+ * compiled in this instantiation.
+ *
+ * nop      not referenced in the body
+ * dataptr  dataptr[0] and dataptr[1] are the inputs, dataptr[2] the output
+ * strides  unused (marked NPY_UNUSED)
+ * count    number of elements to reduce
+ */
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *data0 = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *data1 = (npy_clongdouble *)dataptr[1];
+    npy_longdouble accum = 0;
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#if 0 // NPYV check for npy_clongdouble
+    /* Vector path: excluded from the build by the literal 0 above. */
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1);
+    const int vstep = npyv_nlanes_clongdouble;
+    npyv_clongdouble v_accum = npyv_zero_clongdouble();
+
+    #line 495
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_clongdouble a0 = npyv_loada_clongdouble(data0 + vstep * 0);
+            npyv_clongdouble b0 = npyv_loada_clongdouble(data1 + vstep * 0);
+            
+#line 501
+            npyv_clongdouble a1 = npyv_loada_clongdouble(data0 + vstep * 1);
+            npyv_clongdouble b1 = npyv_loada_clongdouble(data1 + vstep * 1);
+            
+#line 501
+            npyv_clongdouble a2 = npyv_loada_clongdouble(data0 + vstep * 2);
+            npyv_clongdouble b2 = npyv_loada_clongdouble(data1 + vstep * 2);
+            
+#line 501
+            npyv_clongdouble a3 = npyv_loada_clongdouble(data0 + vstep * 3);
+            npyv_clongdouble b3 = npyv_loada_clongdouble(data1 + vstep * 3);
+            
+            /* Chain the four multiply-adds through v_accum (3, 2, 1, 0). */
+            npyv_clongdouble ab3 = npyv_muladd_clongdouble(a3, b3, v_accum);
+            npyv_clongdouble ab2 = npyv_muladd_clongdouble(a2, b2, ab3);
+            npyv_clongdouble ab1 = npyv_muladd_clongdouble(a1, b1, ab2);
+                   v_accum = npyv_muladd_clongdouble(a0, b0, ab1);
+        }
+    }
+    
+#line 495
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4) {
+            #line 501
+            npyv_clongdouble a0 = npyv_load_clongdouble(data0 + vstep * 0);
+            npyv_clongdouble b0 = npyv_load_clongdouble(data1 + vstep * 0);
+            
+#line 501
+            npyv_clongdouble a1 = npyv_load_clongdouble(data0 + vstep * 1);
+            npyv_clongdouble b1 = npyv_load_clongdouble(data1 + vstep * 1);
+            
+#line 501
+            npyv_clongdouble a2 = npyv_load_clongdouble(data0 + vstep * 2);
+            npyv_clongdouble b2 = npyv_load_clongdouble(data1 + vstep * 2);
+            
+#line 501
+            npyv_clongdouble a3 = npyv_load_clongdouble(data0 + vstep * 3);
+            npyv_clongdouble b3 = npyv_load_clongdouble(data1 + vstep * 3);
+            
+            /* Chain the four multiply-adds through v_accum (3, 2, 1, 0). */
+            npyv_clongdouble ab3 = npyv_muladd_clongdouble(a3, b3, v_accum);
+            npyv_clongdouble ab2 = npyv_muladd_clongdouble(a2, b2, ab3);
+            npyv_clongdouble ab1 = npyv_muladd_clongdouble(a1, b1, ab2);
+                   v_accum = npyv_muladd_clongdouble(a0, b0, ab1);
+        }
+    }
+    
+    /* Remainder: one vector step at a time, bounded by the remaining count. */
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep) {
+        npyv_clongdouble a = npyv_load_tillz_clongdouble(data0, count);
+        npyv_clongdouble b = npyv_load_tillz_clongdouble(data1, count);
+        v_accum = npyv_muladd_clongdouble(a, b, v_accum);
+    }
+    accum = npyv_sum_clongdouble(v_accum);
+    npyv_cleanup();
+#else
+    /* Scalar path. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four products per iteration, summed together before touching accum. */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4) {
+        #line 524
+        const npy_longdouble ab0 = (data0[0]) * (data1[0]);
+        
+#line 524
+        const npy_longdouble ab1 = (data0[1]) * (data1[1]);
+        
+#line 524
+        const npy_longdouble ab2 = (data0[2]) * (data1[2]);
+        
+#line 524
+        const npy_longdouble ab3 = (data0[3]) * (data1[3]);
+        
+        accum += ab0 + ab1 + ab2 + ab3;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remaining elements (all of them when the unrolled loop is disabled). */
+    for (; count > 0; --count, ++data0, ++data1) {
+        const npy_longdouble a = (*data0);
+        const npy_longdouble b = (*data1);
+        accum += a * b;
+    }
+#endif // NPYV check for npy_clongdouble
+    /* Fold the reduction into the existing output value. */
+    *(npy_clongdouble *)dataptr[2] = ((*(npy_clongdouble *)dataptr[2]) + accum);
+}
+
+/*
+ * Adds value * sum(array) to the single output element at dataptr[2], where
+ * `value` is read through dataptr[0] and the array of `count` elements at
+ * dataptr[1] is summed by clongdouble_sum_of_arr.
+ * `nop` and `strides` are not consulted.
+ */
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *vec = (npy_clongdouble *)dataptr[1];
+    npy_longdouble factor = (*(npy_clongdouble *)dataptr[0]);
+    npy_longdouble total = clongdouble_sum_of_arr(vec, count);
+    npy_clongdouble *out = (npy_clongdouble *)dataptr[2];
+
+    *out = ((*out) + factor * total);
+}
+
+/*
+ * Adds value * sum(array) to the single output element at dataptr[2], where
+ * the array of `count` elements at dataptr[0] is summed by
+ * clongdouble_sum_of_arr and `value` is read through dataptr[1].
+ * `nop` and `strides` are not consulted.
+ */
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *vec = (npy_clongdouble *)dataptr[0];
+    npy_longdouble factor = (*(npy_clongdouble *)dataptr[1]);
+    npy_longdouble total = clongdouble_sum_of_arr(vec, count);
+    npy_clongdouble *out = (npy_clongdouble *)dataptr[2];
+
+    *out = ((*out) + factor * total);
+}
+
+#elif 2 == 3 && !1
+
+/*
+ * Three-operand element-wise multiply-accumulate:
+ *     data_out[i] = data0[i] * data1[i] * data2[i] + data_out[i]
+ *
+ * Located in the `#elif 2 == 3 && !1` region, whose condition is
+ * constant-false, so it is not compiled in this instantiation.
+ *
+ * nop      not referenced in the body
+ * dataptr  dataptr[0..2] are the inputs, dataptr[3] the output
+ * strides  unused (marked NPY_UNUSED)
+ * count    number of elements to process
+ */
+static void
+clongdouble_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *data0 = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *data1 = (npy_clongdouble *)dataptr[1];
+    npy_clongdouble *data2 = (npy_clongdouble *)dataptr[2];
+    npy_clongdouble *data_out = (npy_clongdouble *)dataptr[3];
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 576
+        data_out[0] = ((data0[0]) *
+                             (data1[0]) *
+                             (data2[0]) +
+                             (data_out[0]));
+
+#line 576
+        data_out[1] = ((data0[1]) *
+                             (data1[1]) *
+                             (data2[1]) +
+                             (data_out[1]));
+
+#line 576
+        data_out[2] = ((data0[2]) *
+                             (data1[2]) *
+                             (data2[2]) +
+                             (data_out[2]));
+
+#line 576
+        data_out[3] = ((data0[3]) *
+                             (data1[3]) *
+                             (data2[3]) +
+                             (data_out[3]));
+
+#line 576
+        data_out[4] = ((data0[4]) *
+                             (data1[4]) *
+                             (data2[4]) +
+                             (data_out[4]));
+
+#line 576
+        data_out[5] = ((data0[5]) *
+                             (data1[5]) *
+                             (data2[5]) +
+                             (data_out[5]));
+
+#line 576
+        data_out[6] = ((data0[6]) *
+                             (data1[6]) *
+                             (data2[6]) +
+                             (data_out[6]));
+
+#line 576
+        data_out[7] = ((data0[7]) *
+                             (data1[7]) *
+                             (data2[7]) +
+                             (data_out[7]));
+
+        data0 += 8;
+        data1 += 8;
+        data2 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    /*
+     * Each step below returns once the remaining count reaches zero and
+     * otherwise handles the next index. With 0 <= count < 8 left over from
+     * the loop above, the chain returns before the index-7 store.
+     */
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[0] = ((data0[0]) *
+                         (data1[0]) *
+                         (data2[0]) +
+                         (data_out[0]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[1] = ((data0[1]) *
+                         (data1[1]) *
+                         (data2[1]) +
+                         (data_out[1]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[2] = ((data0[2]) *
+                         (data1[2]) *
+                         (data2[2]) +
+                         (data_out[2]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[3] = ((data0[3]) *
+                         (data1[3]) *
+                         (data2[3]) +
+                         (data_out[3]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[4] = ((data0[4]) *
+                         (data1[4]) *
+                         (data2[4]) +
+                         (data_out[4]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[5] = ((data0[5]) *
+                         (data1[5]) *
+                         (data2[5]) +
+                         (data_out[5]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[6] = ((data0[6]) *
+                         (data1[6]) *
+                         (data2[6]) +
+                         (data_out[6]));
+
+#line 592
+    if (count-- == 0) {
+        return;
+    }
+    data_out[7] = ((data0[7]) *
+                         (data1[7]) *
+                         (data2[7]) +
+                         (data_out[7]));
+
+}
+
+#else /* 2 > 3 || @complex */
+
+/*
+ * Complex long double multiply-accumulate over two operands:
+ *     out += op0 * op1
+ * repeated `count` times. dataptr[0] and dataptr[1] are the operands and
+ * dataptr[2] is the output; each value is read as a (real, imaginary) pair
+ * of npy_longdouble. After every element all three dataptr entries are
+ * advanced in place by sizeof(npy_clongdouble). `nop` is not consulted by
+ * the compiled path because the operand count is fixed at 2 here.
+ */
+static void
+clongdouble_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_two (%d)\n",
+                                                    (int)count);
+
+    for (; count != 0; --count) {
+#if !1
+        /* Non-complex variant of the template; never compiled here. */
+        npy_longdouble temp = (*(npy_clongdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_clongdouble *)dataptr[i]);
+        }
+        *(npy_clongdouble *)dataptr[nop] = (temp +
+                                           (*(npy_clongdouble *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_clongdouble);
+        }
+#else /* complex */
+#  if 2 <= 3
+#    define _SUMPROD_NOP 2
+#  else
+#    define _SUMPROD_NOP nop
+#  endif
+        /* Running product, seeded with the first operand. */
+        npy_longdouble prod_re = ((npy_longdouble *)dataptr[0])[0];
+        npy_longdouble prod_im = ((npy_longdouble *)dataptr[0])[1];
+        npy_longdouble *out;
+        int op;
+
+        for (op = 1; op < _SUMPROD_NOP; ++op) {
+            const npy_longdouble *factor = (npy_longdouble *)dataptr[op];
+            /* The new real part needs the old prod_re, so stage it first. */
+            const npy_longdouble next_re = prod_re * factor[0] -
+                                           prod_im * factor[1];
+            prod_im = prod_re * factor[1] +
+                      prod_im * factor[0];
+            prod_re = next_re;
+        }
+
+        out = (npy_longdouble *)dataptr[_SUMPROD_NOP];
+        out[0] = prod_re + out[0];
+        out[1] = prod_im + out[1];
+
+        /* Step both operands and the output to the next element. */
+        for (op = 0; op <= _SUMPROD_NOP; ++op) {
+            dataptr[op] += sizeof(npy_clongdouble);
+        }
+#  undef _SUMPROD_NOP
+#endif
+    }
+}
+
+#endif /* functions for various 2 */
+
+#if 2 == 1
+
+/*
+ * Sums `count` complex values read from dataptr[0] as (real, imaginary)
+ * npy_longdouble pairs and adds the totals to the pair at dataptr[1].
+ * Guarded by `#if 2 == 1`, so it is not compiled in this instantiation.
+ */
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#if !1
+    npy_clongdouble *data = (npy_clongdouble *)dataptr[0];
+    npy_longdouble accum = clongdouble_sum_of_arr(data, count);
+    *((npy_clongdouble *)dataptr[1]) = (accum + (*((npy_clongdouble *)dataptr[1])));
+#else
+    npy_longdouble *src = (npy_longdouble *)dataptr[0];
+    npy_longdouble sum_re = 0, sum_im = 0;
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four complex values (eight scalars) per pass, added pairwise. */
+    while (count > 4) {
+        const npy_longdouble re_lo = src[0] + src[2];
+        const npy_longdouble re_hi = src[4] + src[6];
+        const npy_longdouble im_lo = src[1] + src[3];
+        const npy_longdouble im_hi = src[5] + src[7];
+        sum_re += re_lo + re_hi;
+        sum_im += im_lo + im_hi;
+        count -= 4;
+        src += 4*2;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Whatever is left, one complex value at a time. */
+    while (count > 0) {
+        sum_re += src[0];
+        sum_im += src[1];
+        --count;
+        src += 2;
+    }
+    ((npy_longdouble *)dataptr[1])[0] += sum_re;
+    ((npy_longdouble *)dataptr[1])[1] += sum_im;
+#endif // !1
+}
+
+#endif /* 2 == 1 */
+
+/*
+ * Reduction kernel: accumulates the complex product op0 * op1 over `count`
+ * steps and adds the total to the single output element at dataptr[2].
+ *
+ * With the constants substituted here only the complex branches are
+ * compiled (the `#if 1` sections and the `#else` side of `#if !1`); the
+ * other preprocessor branches are never built.
+ *
+ * nop      not referenced by the compiled path (operand count fixed at 2)
+ * dataptr  dataptr[0], dataptr[1]: operands, each advanced in place by its
+ *          stride on every step; dataptr[2]: output, read as two
+ *          npy_longdouble slots ([0] real, [1] imaginary) and incremented
+ *          once after the loop
+ * strides  byte increments applied to dataptr[0] and dataptr[1]
+ * count    number of steps
+ */
+static void
+clongdouble_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if 1
+    npy_longdouble accum_re = 0, accum_im = 0;
+#else
+    npy_longdouble accum = 0;
+#endif
+
+    /* The three declaration groups below are all conditioned out here. */
+#if (2 == 1) || (2 <= 3 && !1)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (2 == 2 || 2 == 3) && !1
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (2 == 3) && !1
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_outstride0_two (%d)\n",
+                                                    (int)count);
+
+    while (count--) {
+#if !1
+#  if 2 == 1
+        accum += (*(npy_clongdouble *)data0);
+        data0 += stride0;
+#  elif 2 == 2
+        accum += (*(npy_clongdouble *)data0) *
+                 (*(npy_clongdouble *)data1);
+        data0 += stride0;
+        data1 += stride1;
+#  elif 2 == 3
+        accum += (*(npy_clongdouble *)data0) *
+                 (*(npy_clongdouble *)data1) *
+                 (*(npy_clongdouble *)data2);
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+#  else
+        npy_longdouble temp = (*(npy_clongdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_clongdouble *)dataptr[i]);
+        }
+        accum += temp;
+        for (i = 0; i < nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 2 == 1
+        accum_re += ((npy_longdouble *)data0)[0];
+        accum_im += ((npy_longdouble *)data0)[1];
+        data0 += stride0;
+#  else
+#    if 2 <= 3
+#define _SUMPROD_NOP 2
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        /* Compiled path: (re, im) starts as operand 0 ... */
+        npy_longdouble re, im, tmp;
+        int i;
+        re = ((npy_longdouble *)dataptr[0])[0];
+        im = ((npy_longdouble *)dataptr[0])[1];
+        /* ... and is multiplied by each further operand; tmp keeps the old
+         * `re` available while the new `im` is computed. */
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longdouble *)dataptr[i])[0] -
+                  im * ((npy_longdouble *)dataptr[i])[1];
+            im = re * ((npy_longdouble *)dataptr[i])[1] +
+                 im * ((npy_longdouble *)dataptr[i])[0];
+            re = tmp;
+        }
+        accum_re += re;
+        accum_im += im;
+        /* Advance the operands only (`<`, not `<=`): the output stays put. */
+        for (i = 0; i < _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+
+    /* Fold the accumulated total into the output element. */
+#if 1
+#  if 2 <= 3
+    ((npy_longdouble *)dataptr[2])[0] += accum_re;
+    ((npy_longdouble *)dataptr[2])[1] += accum_im;
+#  else
+    ((npy_longdouble *)dataptr[nop])[0] += accum_re;
+    ((npy_longdouble *)dataptr[nop])[1] += accum_im;
+#  endif
+#else
+#  if 2 <= 3
+    *((npy_clongdouble *)dataptr[2]) = (accum +
+                                    (*((npy_clongdouble *)dataptr[2])));
+#  else
+    *((npy_clongdouble *)dataptr[nop]) = (accum +
+                                    (*((npy_clongdouble *)dataptr[nop])));
+#  endif
+#endif
+
+}
+
+
+#line 131
+/*
+ * Strided three-operand complex multiply-accumulate:
+ *     out += op0 * op1 * op2
+ * repeated `count` times.
+ *
+ * With the constants substituted here only the `#else` (complex) side of
+ * `#if !1` is compiled, and within it the `#else` side of `#if 3 == 1`;
+ * the leading declaration groups are all conditioned out.
+ *
+ * nop      not referenced by the compiled path (operand count fixed at 3)
+ * dataptr  dataptr[0..2]: operands; dataptr[3]: output. Each value is read
+ *          as two npy_longdouble slots ([0] real, [1] imaginary). All four
+ *          entries are advanced in place on every step.
+ * strides  byte increments applied to dataptr[0..3]
+ * count    number of steps
+ */
+static void
+clongdouble_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (3 == 1) || (3 <= 3 && !1)
+    char *data0 = dataptr[0];
+    npy_intp stride0 = strides[0];
+#endif
+#if (3 == 2 || 3 == 3) && !1
+    char *data1 = dataptr[1];
+    npy_intp stride1 = strides[1];
+#endif
+#if (3 == 3) && !1
+    char *data2 = dataptr[2];
+    npy_intp stride2 = strides[2];
+#endif
+#if (3 == 1) || (3 <= 3 && !1)
+    char *data_out = dataptr[3];
+    npy_intp stride_out = strides[3];
+#endif
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_three (%d)\n", (int)count);
+
+    while (count--) {
+#if !1
+#  if 3 == 1
+        *(npy_clongdouble *)data_out = ((*(npy_clongdouble *)data0) +
+                                         (*(npy_clongdouble *)data_out));
+        data0 += stride0;
+        data_out += stride_out;
+#  elif 3 == 2
+        *(npy_clongdouble *)data_out = ((*(npy_clongdouble *)data0) *
+                                         (*(npy_clongdouble *)data1) +
+                                         (*(npy_clongdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data_out += stride_out;
+#  elif 3 == 3
+        *(npy_clongdouble *)data_out = ((*(npy_clongdouble *)data0) *
+                                         (*(npy_clongdouble *)data1) *
+                                         (*(npy_clongdouble *)data2) +
+                                         (*(npy_clongdouble *)data_out));
+        data0 += stride0;
+        data1 += stride1;
+        data2 += stride2;
+        data_out += stride_out;
+#  else
+        npy_longdouble temp = (*(npy_clongdouble *)dataptr[0]);
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp *= (*(npy_clongdouble *)dataptr[i]);
+        }
+        *(npy_clongdouble *)dataptr[nop] = (temp +
+                                           (*(npy_clongdouble *)dataptr[i]));
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];
+        }
+#  endif
+#else /* complex */
+#  if 3 == 1
+        ((npy_longdouble *)data_out)[0] = ((npy_longdouble *)data0)[0] +
+                                         ((npy_longdouble *)data_out)[0];
+        ((npy_longdouble *)data_out)[1] = ((npy_longdouble *)data0)[1] +
+                                         ((npy_longdouble *)data_out)[1];
+        data0 += stride0;
+        data_out += stride_out;
+#  else
+#    if 3 <= 3
+#define _SUMPROD_NOP 3
+#    else
+#define _SUMPROD_NOP nop
+#    endif
+        /* Compiled path: (re, im) starts as operand 0 ... */
+        npy_longdouble re, im, tmp;
+        int i;
+        re = ((npy_longdouble *)dataptr[0])[0];
+        im = ((npy_longdouble *)dataptr[0])[1];
+        /* ... and is multiplied by operands 1 and 2 in turn; tmp keeps the
+         * old `re` available while the new `im` is computed. */
+        for (i = 1; i < _SUMPROD_NOP; ++i) {
+            tmp = re * ((npy_longdouble *)dataptr[i])[0] -
+                  im * ((npy_longdouble *)dataptr[i])[1];
+            im = re * ((npy_longdouble *)dataptr[i])[1] +
+                 im * ((npy_longdouble *)dataptr[i])[0];
+            re = tmp;
+        }
+        /* Add the product to the current output element. */
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0] = re +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[0];
+        ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1] = im +
+                                     ((npy_longdouble *)dataptr[_SUMPROD_NOP])[1];
+
+        /* `<=` so the output pointer advances along with the operands. */
+        for (i = 0; i <= _SUMPROD_NOP; ++i) {
+            dataptr[i] += strides[i];
+        }
+#undef _SUMPROD_NOP
+#  endif
+#endif
+    }
+}
+
+#if 3 == 1
+
+/*
+ * Contiguous one-operand kernel for npy_clongdouble: every element of
+ * dataptr[0] is added into the matching element of dataptr[1].  The real
+ * and imaginary components are updated one after the other through
+ * npy_longdouble views of the two buffers.
+ */
+static void
+clongdouble_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *src = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *dst = (npy_clongdouble *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* dst[idx] = src[idx] + dst[idx], real component first, then imaginary */
+#define _CLD_ADD_AT(idx) \
+    do { \
+        npy_longdouble *out_ = (npy_longdouble *)dst + 2*(idx); \
+        npy_longdouble *in_ = (npy_longdouble *)src + 2*(idx); \
+        out_[0] = in_[0] + out_[0]; \
+        out_[1] = in_[1] + out_[1]; \
+    } while (0)
+
+    for (;;) {
+        /*
+         * Tested ahead of the unrolled loop so that small counts are fast;
+         * the same switch also finishes whatever the unrolled loop leaves
+         * behind.  Cases fall through from the highest index down to 0.
+         */
+        switch (count) {
+            case 7: _CLD_ADD_AT(6); /* fall through */
+            case 6: _CLD_ADD_AT(5); /* fall through */
+            case 5: _CLD_ADD_AT(4); /* fall through */
+            case 4: _CLD_ADD_AT(3); /* fall through */
+            case 3: _CLD_ADD_AT(2); /* fall through */
+            case 2: _CLD_ADD_AT(1); /* fall through */
+            case 1: _CLD_ADD_AT(0); /* fall through */
+            case 0:
+                return;
+        }
+
+        /* Eight elements per pass, lowest index first */
+        while (count >= 8) {
+            _CLD_ADD_AT(0);
+            _CLD_ADD_AT(1);
+            _CLD_ADD_AT(2);
+            _CLD_ADD_AT(3);
+            _CLD_ADD_AT(4);
+            _CLD_ADD_AT(5);
+            _CLD_ADD_AT(6);
+            _CLD_ADD_AT(7);
+
+            src += 8;
+            dst += 8;
+            count -= 8;
+        }
+    }
+
+#undef _CLD_ADD_AT
+}
+
+#elif 3 == 2 && !1
+
+/*
+ * Multiply-accumulate helper:
+ *     data_out[i] = scalar * data[i] + data_out[i]    for i in [0, count)
+ * The generated NPYV check for npy_clongdouble is 0, so only the scalar
+ * loops are provided here.
+ */
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_muladd(npy_clongdouble *data, npy_clongdouble *data_out, npy_longdouble scalar, npy_intp count)
+{
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four elements per pass: read everything, compute, then store */
+    while (count >= 4) {
+        const npy_longdouble in0 = (data[0]);
+        const npy_longdouble acc0 = (data_out[0]);
+        const npy_longdouble in1 = (data[1]);
+        const npy_longdouble acc1 = (data_out[1]);
+        const npy_longdouble in2 = (data[2]);
+        const npy_longdouble acc2 = (data_out[2]);
+        const npy_longdouble in3 = (data[3]);
+        const npy_longdouble acc3 = (data_out[3]);
+
+        const npy_longdouble res0 = scalar * in0 + acc0;
+        const npy_longdouble res1 = scalar * in1 + acc1;
+        const npy_longdouble res2 = scalar * in2 + acc2;
+        const npy_longdouble res3 = scalar * in3 + acc3;
+
+        data_out[0] = (res0);
+        data_out[1] = (res1);
+        data_out[2] = (res2);
+        data_out[3] = (res3);
+
+        count -= 4;
+        data += 4;
+        data_out += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Leftover elements, one at a time */
+    while (count > 0) {
+        const npy_longdouble in = (*data);
+        const npy_longdouble acc = (*data_out);
+        *data_out = (scalar * in + acc);
+
+        --count;
+        ++data;
+        ++data_out;
+    }
+}
+
+/*
+ * Contiguous two-operand kernel:
+ *     out[i] = in0[i] * in1[i] + out[i]    for i in [0, count)
+ * with in0 = dataptr[0], in1 = dataptr[1] and out = dataptr[2].  The
+ * generated NPYV check for npy_clongdouble is 0, so only the scalar loops
+ * are provided here.
+ */
+static void
+clongdouble_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *lhs = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *rhs = (npy_clongdouble *)dataptr[1];
+    npy_clongdouble *dst = (npy_clongdouble *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four elements per pass: read everything, compute, then store */
+    while (count >= 4) {
+        const npy_longdouble x0 = (lhs[0]);
+        const npy_longdouble y0 = (rhs[0]);
+        const npy_longdouble acc0 = (dst[0]);
+        const npy_longdouble x1 = (lhs[1]);
+        const npy_longdouble y1 = (rhs[1]);
+        const npy_longdouble acc1 = (dst[1]);
+        const npy_longdouble x2 = (lhs[2]);
+        const npy_longdouble y2 = (rhs[2]);
+        const npy_longdouble acc2 = (dst[2]);
+        const npy_longdouble x3 = (lhs[3]);
+        const npy_longdouble y3 = (rhs[3]);
+        const npy_longdouble acc3 = (dst[3]);
+
+        const npy_longdouble res0 = x0 * y0 + acc0;
+        const npy_longdouble res1 = x1 * y1 + acc1;
+        const npy_longdouble res2 = x2 * y2 + acc2;
+        const npy_longdouble res3 = x3 * y3 + acc3;
+
+        dst[0] = (res0);
+        dst[1] = (res1);
+        dst[2] = (res2);
+        dst[3] = (res3);
+
+        count -= 4;
+        lhs += 4;
+        rhs += 4;
+        dst += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Leftover elements, one at a time */
+    while (count > 0) {
+        const npy_longdouble x = (*lhs);
+        const npy_longdouble y = (*rhs);
+        const npy_longdouble acc = (*dst);
+        *dst = (x * y + acc);
+
+        --count;
+        ++lhs;
+        ++rhs;
+        ++dst;
+    }
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand case where operand 0 is read once as a single value while
+ * operand 1 and the output are walked as arrays: forwards to the muladd
+ * helper with the value from dataptr[0] as the scale factor.
+ */
+static void
+clongdouble_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *dst = (npy_clongdouble *)dataptr[2];
+    npy_clongdouble *src = (npy_clongdouble *)dataptr[1];
+    npy_longdouble factor = (*(npy_clongdouble *)dataptr[0]);
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+    clongdouble_sum_of_products_muladd(src, dst, factor, count);
+}
+
+/*
+ * Two-operand case where operand 1 is read once as a single value while
+ * operand 0 and the output are walked as arrays: forwards to the muladd
+ * helper with the value from dataptr[1] as the scale factor.
+ */
+static void
+clongdouble_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *dst = (npy_clongdouble *)dataptr[2];
+    npy_clongdouble *src = (npy_clongdouble *)dataptr[0];
+    npy_longdouble factor = (*(npy_clongdouble *)dataptr[1]);
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    clongdouble_sum_of_products_muladd(src, dst, factor, count);
+}
+
+/*
+ * Two-operand reduction into a single output element: accumulates the
+ * products of matching elements of dataptr[0] and dataptr[1] over count
+ * elements and adds the total to the value stored at dataptr[2].  The
+ * generated NPYV check for npy_clongdouble is 0, so only the scalar loops
+ * are provided here.
+ */
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *lhs = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *rhs = (npy_clongdouble *)dataptr[1];
+    npy_clongdouble *dst = (npy_clongdouble *)dataptr[2];
+    npy_longdouble total = 0;
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four products per pass, summed left to right before accumulating */
+    while (count >= 4) {
+        const npy_longdouble p0 = (lhs[0]) * (rhs[0]);
+        const npy_longdouble p1 = (lhs[1]) * (rhs[1]);
+        const npy_longdouble p2 = (lhs[2]) * (rhs[2]);
+        const npy_longdouble p3 = (lhs[3]) * (rhs[3]);
+
+        total += p0 + p1 + p2 + p3;
+
+        count -= 4;
+        lhs += 4;
+        rhs += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Leftover elements, one at a time */
+    while (count > 0) {
+        const npy_longdouble x = (*lhs);
+        const npy_longdouble y = (*rhs);
+        total += x * y;
+
+        --count;
+        ++lhs;
+        ++rhs;
+    }
+
+    *dst = ((*dst) + total);
+}
+
+/*
+ * Two-operand reduction where operand 0 is read once as a single value:
+ * the sum of the count elements at dataptr[1] (via clongdouble_sum_of_arr)
+ * is scaled by that value and added to the element stored at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *dst = (npy_clongdouble *)dataptr[2];
+    npy_longdouble factor = (*(npy_clongdouble *)dataptr[0]);
+    npy_longdouble total = clongdouble_sum_of_arr((npy_clongdouble *)dataptr[1], count);
+
+    *dst = ((*dst) + factor * total);
+}
+
+/*
+ * Two-operand reduction where operand 1 is read once as a single value:
+ * the sum of the count elements at dataptr[0] (via clongdouble_sum_of_arr)
+ * is scaled by that value and added to the element stored at dataptr[2].
+ */
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *dst = (npy_clongdouble *)dataptr[2];
+    npy_longdouble factor = (*(npy_clongdouble *)dataptr[1]);
+    npy_longdouble total = clongdouble_sum_of_arr((npy_clongdouble *)dataptr[0], count);
+
+    *dst = ((*dst) + factor * total);
+}
+
+#elif 3 == 3 && !1
+
+/*
+ * Contiguous three-operand kernel:
+ *     out[i] = in0[i] * in1[i] * in2[i] + out[i]    for i in [0, count)
+ * with in0..in2 = dataptr[0..2] and out = dataptr[3].  Full groups of
+ * eight are handled by an unrolled loop; the remainder is finished one
+ * element at a time.
+ */
+static void
+clongdouble_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *in0 = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *in1 = (npy_clongdouble *)dataptr[1];
+    npy_clongdouble *in2 = (npy_clongdouble *)dataptr[2];
+    npy_clongdouble *out = (npy_clongdouble *)dataptr[3];
+    int k;
+
+/* out[idx] = in0[idx] * in1[idx] * in2[idx] + out[idx] */
+#define _CLD_MULADD3_AT(idx) \
+    out[idx] = ((in0[idx]) * \
+                      (in1[idx]) * \
+                      (in2[idx]) + \
+                      (out[idx]))
+
+    /* Eight elements per pass, lowest index first */
+    while (count >= 8) {
+        _CLD_MULADD3_AT(0);
+        _CLD_MULADD3_AT(1);
+        _CLD_MULADD3_AT(2);
+        _CLD_MULADD3_AT(3);
+        _CLD_MULADD3_AT(4);
+        _CLD_MULADD3_AT(5);
+        _CLD_MULADD3_AT(6);
+        _CLD_MULADD3_AT(7);
+
+        in0 += 8;
+        in1 += 8;
+        in2 += 8;
+        out += 8;
+        count -= 8;
+    }
+
+    /* Finish off the remainder: stop as soon as the count runs out */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        _CLD_MULADD3_AT(k);
+    }
+
+#undef _CLD_MULADD3_AT
+}
+
+#else /* 3 > 3 || complex */
+
+/*
+ * Complex long double sum-of-products for three input operands.
+ *
+ * For each of `count` elements the three inputs dataptr[0..2] are
+ * multiplied together as complex numbers (stored as re/im pairs of
+ * npy_longdouble) and the product is added onto the element at
+ * dataptr[3]. Every pointer in dataptr[0..3] is then advanced by
+ * sizeof(npy_clongdouble); the strides argument is not used, and neither
+ * is `nop`, because the operand count is fixed at three here.
+ */
+static void
+clongdouble_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /* Number of inputs; the output pointer sits right after them. */
+    enum { n_in = 3 };
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_three (%d)\n",
+                                                    (int)count);
+
+    for (; count != 0; --count) {
+        const npy_longdouble *first = (npy_longdouble *)dataptr[0];
+        npy_longdouble *out = (npy_longdouble *)dataptr[n_in];
+        npy_longdouble prod_re = first[0];
+        npy_longdouble prod_im = first[1];
+        int k;
+
+        /* Fold the remaining inputs into the running complex product. */
+        for (k = 1; k < n_in; ++k) {
+            const npy_longdouble *factor = (npy_longdouble *)dataptr[k];
+            const npy_longdouble next_re = prod_re * factor[0] -
+                                           prod_im * factor[1];
+            prod_im = prod_re * factor[1] +
+                      prod_im * factor[0];
+            prod_re = next_re;
+        }
+
+        out[0] = prod_re + out[0];
+        out[1] = prod_im + out[1];
+
+        /* Step the inputs and the output by one complex element. */
+        for (k = 0; k <= n_in; ++k) {
+            dataptr[k] += sizeof(npy_clongdouble);
+        }
+    }
+}
+
+#endif /* functions for various 3 */
+
+#if 3 == 1
+
+/*
+ * Sum `count` complex long double values laid out back-to-back at
+ * dataptr[0] and add the total onto the single complex value at
+ * dataptr[1]. Real and imaginary parts are accumulated separately.
+ * `nop` and `strides` are not consulted.
+ */
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+
+    const npy_longdouble *src = (npy_longdouble *)dataptr[0];
+    npy_longdouble *dst = (npy_longdouble *)dataptr[1];
+    npy_longdouble sum_re = 0;
+    npy_longdouble sum_im = 0;
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four complex values (eight scalars) per pass, summed pairwise. */
+    while (count > 4) {
+        const npy_longdouble re_lo = src[0] + src[2];
+        const npy_longdouble re_hi = src[4] + src[6];
+        const npy_longdouble im_lo = src[1] + src[3];
+        const npy_longdouble im_hi = src[5] + src[7];
+        sum_re += re_lo + re_hi;
+        sum_im += im_lo + im_hi;
+        count -= 4;
+        src += 4*2;
+    }
+#endif /* !NPY_DISABLE_OPTIMIZATION */
+
+    /* Whatever is left, one complex value at a time. */
+    while (count > 0) {
+        sum_re += src[0];
+        sum_im += src[1];
+        --count;
+        src += 2;
+    }
+
+    dst[0] += sum_re;
+    dst[1] += sum_im;
+}
+
+#endif /* 3 == 1 */
+
+/*
+ * Complex long double sum-of-products for three inputs reduced into a
+ * single output element.
+ *
+ * For each of `count` steps the complex values at dataptr[0..2] are
+ * multiplied together and the product is added to a local accumulator;
+ * the input pointers then advance by strides[0..2]. After the loop the
+ * accumulated real and imaginary parts are added onto the complex value
+ * at dataptr[3], which is never advanced. `nop` is not used because the
+ * operand count is fixed at three here.
+ */
+static void
+clongdouble_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    /* Number of inputs; the output pointer sits right after them. */
+    enum { n_in = 3 };
+    npy_longdouble total_re = 0;
+    npy_longdouble total_im = 0;
+    npy_longdouble *out;
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_outstride0_three (%d)\n",
+                                                    (int)count);
+
+    for (; count != 0; --count) {
+        const npy_longdouble *first = (npy_longdouble *)dataptr[0];
+        npy_longdouble prod_re = first[0];
+        npy_longdouble prod_im = first[1];
+        int k;
+
+        /* Fold the remaining inputs into the running complex product. */
+        for (k = 1; k < n_in; ++k) {
+            const npy_longdouble *factor = (npy_longdouble *)dataptr[k];
+            const npy_longdouble next_re = prod_re * factor[0] -
+                                           prod_im * factor[1];
+            prod_im = prod_re * factor[1] +
+                      prod_im * factor[0];
+            prod_re = next_re;
+        }
+        total_re += prod_re;
+        total_im += prod_im;
+
+        /* Only the inputs move; the output pointer stays where it is. */
+        for (k = 0; k < n_in; ++k) {
+            dataptr[k] += strides[k];
+        }
+    }
+
+    out = (npy_longdouble *)dataptr[n_in];
+    out[0] += total_re;
+    out[1] += total_im;
+}
+
+
+#line 131
+/*
+ * Complex long double sum-of-products for an arbitrary operand count.
+ *
+ * For each of `count` steps the complex values at dataptr[0..nop-1] are
+ * multiplied together and the product is added onto the complex value
+ * at dataptr[nop]. All nop + 1 pointers then advance by their entry in
+ * `strides`.
+ */
+static void
+clongdouble_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_any (%d)\n", (int)count);
+
+    for (; count != 0; --count) {
+        const npy_longdouble *first = (npy_longdouble *)dataptr[0];
+        npy_longdouble *out = (npy_longdouble *)dataptr[nop];
+        npy_longdouble prod_re = first[0];
+        npy_longdouble prod_im = first[1];
+        int k;
+
+        /* Fold the remaining inputs into the running complex product. */
+        for (k = 1; k < nop; ++k) {
+            const npy_longdouble *factor = (npy_longdouble *)dataptr[k];
+            const npy_longdouble next_re = prod_re * factor[0] -
+                                           prod_im * factor[1];
+            prod_im = prod_re * factor[1] +
+                      prod_im * factor[0];
+            prod_re = next_re;
+        }
+
+        out[0] = prod_re + out[0];
+        out[1] = prod_im + out[1];
+
+        /* Step the inputs and the output by their own strides. */
+        for (k = 0; k <= nop; ++k) {
+            dataptr[k] += strides[k];
+        }
+    }
+}
+
+#if 1000 == 1
+
+/*
+ * Element-wise complex accumulation for a single operand:
+ * data_out[i] = data0[i] + data_out[i], with the real and imaginary
+ * npy_longdouble components added separately. `nop` and `strides` are
+ * not read.
+ *
+ * Control flow: the switch handles counts 0..7 through deliberate case
+ * fall-through (highest index first) and returns. The switch has no
+ * default label, so any other count drops into the unrolled loop, which
+ * consumes 8 elements per pass and then jumps back to the switch to
+ * finish the remainder.
+ *
+ * NOTE(review): a negative count matches no case and fails the
+ * `count >= 8` test, so the goto would loop without progress; presumably
+ * callers never pass a negative count -- confirm.
+ */
+static void
+clongdouble_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *data0 = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *data_out = (npy_clongdouble *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_one (%d)\n",
+                                                            (int)count);
+
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    /* Each case falls through to the next lower index; only case 0 returns. */
+    switch (count) {
+#line 246
+        case 6+1:
+#if !1
+            data_out[6] = ((data0[6]) +
+                                 (data_out[6]));
+#else
+            ((npy_longdouble *)data_out + 2*6)[0] =
+                                    ((npy_longdouble *)data0 + 2*6)[0] +
+                                    ((npy_longdouble *)data_out + 2*6)[0];
+            ((npy_longdouble *)data_out + 2*6)[1] =
+                                    ((npy_longdouble *)data0 + 2*6)[1] +
+                                    ((npy_longdouble *)data_out + 2*6)[1];
+#endif
+
+#line 246
+        case 5+1:
+#if !1
+            data_out[5] = ((data0[5]) +
+                                 (data_out[5]));
+#else
+            ((npy_longdouble *)data_out + 2*5)[0] =
+                                    ((npy_longdouble *)data0 + 2*5)[0] +
+                                    ((npy_longdouble *)data_out + 2*5)[0];
+            ((npy_longdouble *)data_out + 2*5)[1] =
+                                    ((npy_longdouble *)data0 + 2*5)[1] +
+                                    ((npy_longdouble *)data_out + 2*5)[1];
+#endif
+
+#line 246
+        case 4+1:
+#if !1
+            data_out[4] = ((data0[4]) +
+                                 (data_out[4]));
+#else
+            ((npy_longdouble *)data_out + 2*4)[0] =
+                                    ((npy_longdouble *)data0 + 2*4)[0] +
+                                    ((npy_longdouble *)data_out + 2*4)[0];
+            ((npy_longdouble *)data_out + 2*4)[1] =
+                                    ((npy_longdouble *)data0 + 2*4)[1] +
+                                    ((npy_longdouble *)data_out + 2*4)[1];
+#endif
+
+#line 246
+        case 3+1:
+#if !1
+            data_out[3] = ((data0[3]) +
+                                 (data_out[3]));
+#else
+            ((npy_longdouble *)data_out + 2*3)[0] =
+                                    ((npy_longdouble *)data0 + 2*3)[0] +
+                                    ((npy_longdouble *)data_out + 2*3)[0];
+            ((npy_longdouble *)data_out + 2*3)[1] =
+                                    ((npy_longdouble *)data0 + 2*3)[1] +
+                                    ((npy_longdouble *)data_out + 2*3)[1];
+#endif
+
+#line 246
+        case 2+1:
+#if !1
+            data_out[2] = ((data0[2]) +
+                                 (data_out[2]));
+#else
+            ((npy_longdouble *)data_out + 2*2)[0] =
+                                    ((npy_longdouble *)data0 + 2*2)[0] +
+                                    ((npy_longdouble *)data_out + 2*2)[0];
+            ((npy_longdouble *)data_out + 2*2)[1] =
+                                    ((npy_longdouble *)data0 + 2*2)[1] +
+                                    ((npy_longdouble *)data_out + 2*2)[1];
+#endif
+
+#line 246
+        case 1+1:
+#if !1
+            data_out[1] = ((data0[1]) +
+                                 (data_out[1]));
+#else
+            ((npy_longdouble *)data_out + 2*1)[0] =
+                                    ((npy_longdouble *)data0 + 2*1)[0] +
+                                    ((npy_longdouble *)data_out + 2*1)[0];
+            ((npy_longdouble *)data_out + 2*1)[1] =
+                                    ((npy_longdouble *)data0 + 2*1)[1] +
+                                    ((npy_longdouble *)data_out + 2*1)[1];
+#endif
+
+#line 246
+        case 0+1:
+#if !1
+            data_out[0] = ((data0[0]) +
+                                 (data_out[0]));
+#else
+            ((npy_longdouble *)data_out + 2*0)[0] =
+                                    ((npy_longdouble *)data0 + 2*0)[0] +
+                                    ((npy_longdouble *)data_out + 2*0)[0];
+            ((npy_longdouble *)data_out + 2*0)[1] =
+                                    ((npy_longdouble *)data0 + 2*0)[1] +
+                                    ((npy_longdouble *)data_out + 2*0)[1];
+#endif
+
+        case 0:
+            return;
+    }
+
+    /* Unroll the loop by 8 */
+    while (count >= 8) {
+        count -= 8;
+
+#line 270
+#if !1
+        data_out[0] = ((data0[0]) +
+                             (data_out[0]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*0)[0] =
+                                ((npy_longdouble *)data0 + 2*0)[0] +
+                                ((npy_longdouble *)data_out + 2*0)[0];
+        ((npy_longdouble *)data_out + 2*0)[1] =
+                                ((npy_longdouble *)data0 + 2*0)[1] +
+                                ((npy_longdouble *)data_out + 2*0)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[1] = ((data0[1]) +
+                             (data_out[1]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*1)[0] =
+                                ((npy_longdouble *)data0 + 2*1)[0] +
+                                ((npy_longdouble *)data_out + 2*1)[0];
+        ((npy_longdouble *)data_out + 2*1)[1] =
+                                ((npy_longdouble *)data0 + 2*1)[1] +
+                                ((npy_longdouble *)data_out + 2*1)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[2] = ((data0[2]) +
+                             (data_out[2]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*2)[0] =
+                                ((npy_longdouble *)data0 + 2*2)[0] +
+                                ((npy_longdouble *)data_out + 2*2)[0];
+        ((npy_longdouble *)data_out + 2*2)[1] =
+                                ((npy_longdouble *)data0 + 2*2)[1] +
+                                ((npy_longdouble *)data_out + 2*2)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[3] = ((data0[3]) +
+                             (data_out[3]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*3)[0] =
+                                ((npy_longdouble *)data0 + 2*3)[0] +
+                                ((npy_longdouble *)data_out + 2*3)[0];
+        ((npy_longdouble *)data_out + 2*3)[1] =
+                                ((npy_longdouble *)data0 + 2*3)[1] +
+                                ((npy_longdouble *)data_out + 2*3)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[4] = ((data0[4]) +
+                             (data_out[4]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*4)[0] =
+                                ((npy_longdouble *)data0 + 2*4)[0] +
+                                ((npy_longdouble *)data_out + 2*4)[0];
+        ((npy_longdouble *)data_out + 2*4)[1] =
+                                ((npy_longdouble *)data0 + 2*4)[1] +
+                                ((npy_longdouble *)data_out + 2*4)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[5] = ((data0[5]) +
+                             (data_out[5]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*5)[0] =
+                                ((npy_longdouble *)data0 + 2*5)[0] +
+                                ((npy_longdouble *)data_out + 2*5)[0];
+        ((npy_longdouble *)data_out + 2*5)[1] =
+                                ((npy_longdouble *)data0 + 2*5)[1] +
+                                ((npy_longdouble *)data_out + 2*5)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[6] = ((data0[6]) +
+                             (data_out[6]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*6)[0] =
+                                ((npy_longdouble *)data0 + 2*6)[0] +
+                                ((npy_longdouble *)data_out + 2*6)[0];
+        ((npy_longdouble *)data_out + 2*6)[1] =
+                                ((npy_longdouble *)data0 + 2*6)[1] +
+                                ((npy_longdouble *)data_out + 2*6)[1];
+#endif
+
+#line 270
+#if !1
+        data_out[7] = ((data0[7]) +
+                             (data_out[7]));
+#else /* complex */
+        ((npy_longdouble *)data_out + 2*7)[0] =
+                                ((npy_longdouble *)data0 + 2*7)[0] +
+                                ((npy_longdouble *)data_out + 2*7)[0];
+        ((npy_longdouble *)data_out + 2*7)[1] =
+                                ((npy_longdouble *)data0 + 2*7)[1] +
+                                ((npy_longdouble *)data_out + 2*7)[1];
+#endif
+
+        data0 += 8;
+        data_out += 8;
+    }
+
+    /* Finish off the loop */
+    goto finish_after_unrolled_loop;
+}
+
+#elif 1000 == 2 && !1
+
+/*
+ * Multiply-add over `count` elements:
+ *     data_out[i] = scalar * data[i] + data_out[i]
+ *
+ * The NPYV (SIMD) section is guarded by `#if 0` and is never compiled.
+ * The scalar fallback handles four elements per iteration unless
+ * NPY_DISABLE_OPTIMIZATION is defined, then finishes the remainder one
+ * element at a time.
+ *
+ * NOTE(review): the scalar path loads npy_clongdouble elements into
+ * npy_longdouble temporaries. This function sits inside an
+ * `#elif 1000 == 2 && !1` region, which is constant-false, so presumably
+ * this instantiation is never compiled -- confirm against the template.
+ */
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_muladd(npy_clongdouble *data, npy_clongdouble *data_out, npy_longdouble scalar, npy_intp count)
+{
+#if 0 // NPYV check for npy_clongdouble
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data) && EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_clongdouble;
+    const npyv_clongdouble v_scalar = npyv_setall_clongdouble(scalar);
+    #line 306
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_clongdouble b0 = npyv_loada_clongdouble(data + vstep * 0);
+            npyv_clongdouble c0 = npyv_loada_clongdouble(data_out + vstep * 0);
+            
+#line 312
+            npyv_clongdouble b1 = npyv_loada_clongdouble(data + vstep * 1);
+            npyv_clongdouble c1 = npyv_loada_clongdouble(data_out + vstep * 1);
+            
+#line 312
+            npyv_clongdouble b2 = npyv_loada_clongdouble(data + vstep * 2);
+            npyv_clongdouble c2 = npyv_loada_clongdouble(data_out + vstep * 2);
+            
+#line 312
+            npyv_clongdouble b3 = npyv_loada_clongdouble(data + vstep * 3);
+            npyv_clongdouble c3 = npyv_loada_clongdouble(data_out + vstep * 3);
+            
+            #line 318
+            npyv_clongdouble abc0 = npyv_muladd_clongdouble(v_scalar, b0, c0);
+            
+#line 318
+            npyv_clongdouble abc1 = npyv_muladd_clongdouble(v_scalar, b1, c1);
+            
+#line 318
+            npyv_clongdouble abc2 = npyv_muladd_clongdouble(v_scalar, b2, c2);
+            
+#line 318
+            npyv_clongdouble abc3 = npyv_muladd_clongdouble(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_storea_clongdouble(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_storea_clongdouble(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_storea_clongdouble(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_storea_clongdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 306
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data += vstepx4, data_out += vstepx4) {
+            #line 312
+            npyv_clongdouble b0 = npyv_load_clongdouble(data + vstep * 0);
+            npyv_clongdouble c0 = npyv_load_clongdouble(data_out + vstep * 0);
+            
+#line 312
+            npyv_clongdouble b1 = npyv_load_clongdouble(data + vstep * 1);
+            npyv_clongdouble c1 = npyv_load_clongdouble(data_out + vstep * 1);
+            
+#line 312
+            npyv_clongdouble b2 = npyv_load_clongdouble(data + vstep * 2);
+            npyv_clongdouble c2 = npyv_load_clongdouble(data_out + vstep * 2);
+            
+#line 312
+            npyv_clongdouble b3 = npyv_load_clongdouble(data + vstep * 3);
+            npyv_clongdouble c3 = npyv_load_clongdouble(data_out + vstep * 3);
+            
+            #line 318
+            npyv_clongdouble abc0 = npyv_muladd_clongdouble(v_scalar, b0, c0);
+            
+#line 318
+            npyv_clongdouble abc1 = npyv_muladd_clongdouble(v_scalar, b1, c1);
+            
+#line 318
+            npyv_clongdouble abc2 = npyv_muladd_clongdouble(v_scalar, b2, c2);
+            
+#line 318
+            npyv_clongdouble abc3 = npyv_muladd_clongdouble(v_scalar, b3, c3);
+            
+            #line 323
+            npyv_store_clongdouble(data_out + vstep * 0, abc0);
+            
+#line 323
+            npyv_store_clongdouble(data_out + vstep * 1, abc1);
+            
+#line 323
+            npyv_store_clongdouble(data_out + vstep * 2, abc2);
+            
+#line 323
+            npyv_store_clongdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data += vstep, data_out += vstep) {
+        npyv_clongdouble a = npyv_load_tillz_clongdouble(data, count);
+        npyv_clongdouble b = npyv_load_tillz_clongdouble(data_out, count);
+        npyv_store_till_clongdouble(data_out, count, npyv_muladd_clongdouble(a, v_scalar, b));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Scalar path, four elements per pass: load, multiply-add, store. */
+    for (; count >= 4; count -= 4, data += 4, data_out += 4) {
+        #line 340
+        const npy_longdouble b0 = (data[0]);
+        const npy_longdouble c0 = (data_out[0]);
+        
+#line 340
+        const npy_longdouble b1 = (data[1]);
+        const npy_longdouble c1 = (data_out[1]);
+        
+#line 340
+        const npy_longdouble b2 = (data[2]);
+        const npy_longdouble c2 = (data_out[2]);
+        
+#line 340
+        const npy_longdouble b3 = (data[3]);
+        const npy_longdouble c3 = (data_out[3]);
+        
+        #line 346
+        const npy_longdouble abc0 = scalar * b0 + c0;
+        
+#line 346
+        const npy_longdouble abc1 = scalar * b1 + c1;
+        
+#line 346
+        const npy_longdouble abc2 = scalar * b2 + c2;
+        
+#line 346
+        const npy_longdouble abc3 = scalar * b3 + c3;
+        
+        #line 351
+        data_out[0] = (abc0);
+        
+#line 351
+        data_out[1] = (abc1);
+        
+#line 351
+        data_out[2] = (abc2);
+        
+#line 351
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remainder (or everything, when the unrolled loop is disabled). */
+    for (; count > 0; --count, ++data, ++data_out) {
+        const npy_longdouble b = (*data);
+        const npy_longdouble c = (*data_out);
+        *data_out = (scalar * b + c);
+    }
+#endif // NPYV check for npy_clongdouble
+}
+
+/*
+ * Element-wise multiply-accumulate over two inputs and one output:
+ *     data_out[i] = data0[i] * data1[i] + data_out[i]
+ * with data0 = dataptr[0], data1 = dataptr[1], data_out = dataptr[2].
+ * `nop` and `strides` are not read.
+ *
+ * The NPYV (SIMD) section is guarded by `#if 0` and is never compiled.
+ * The scalar fallback handles four elements per iteration unless
+ * NPY_DISABLE_OPTIMIZATION is defined, then finishes the remainder one
+ * element at a time.
+ *
+ * NOTE(review): the scalar path loads npy_clongdouble elements into
+ * npy_longdouble temporaries. This function sits inside an
+ * `#elif 1000 == 2 && !1` region, which is constant-false, so presumably
+ * this instantiation is never compiled -- confirm against the template.
+ */
+static void
+clongdouble_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *data0 = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *data1 = (npy_clongdouble *)dataptr[1];
+    npy_clongdouble *data_out = (npy_clongdouble *)dataptr[2];
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_two (%d)\n",
+                                                            (int)count);
+    // NPYV check for npy_clongdouble
+#if 0
+    /* Use aligned instructions if possible */
+    const int is_aligned = EINSUM_IS_ALIGNED(data0) && EINSUM_IS_ALIGNED(data1) &&
+                        EINSUM_IS_ALIGNED(data_out);
+    const int vstep = npyv_nlanes_clongdouble;
+
+    #line 384
+    if(is_aligned) {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_clongdouble a0 = npyv_loada_clongdouble(data0 + vstep * 0);
+            npyv_clongdouble b0 = npyv_loada_clongdouble(data1 + vstep * 0);
+            npyv_clongdouble c0 = npyv_loada_clongdouble(data_out + vstep * 0);
+            
+#line 390
+            npyv_clongdouble a1 = npyv_loada_clongdouble(data0 + vstep * 1);
+            npyv_clongdouble b1 = npyv_loada_clongdouble(data1 + vstep * 1);
+            npyv_clongdouble c1 = npyv_loada_clongdouble(data_out + vstep * 1);
+            
+#line 390
+            npyv_clongdouble a2 = npyv_loada_clongdouble(data0 + vstep * 2);
+            npyv_clongdouble b2 = npyv_loada_clongdouble(data1 + vstep * 2);
+            npyv_clongdouble c2 = npyv_loada_clongdouble(data_out + vstep * 2);
+            
+#line 390
+            npyv_clongdouble a3 = npyv_loada_clongdouble(data0 + vstep * 3);
+            npyv_clongdouble b3 = npyv_loada_clongdouble(data1 + vstep * 3);
+            npyv_clongdouble c3 = npyv_loada_clongdouble(data_out + vstep * 3);
+            
+            #line 397
+            npyv_clongdouble abc0 = npyv_muladd_clongdouble(a0, b0, c0);
+            
+#line 397
+            npyv_clongdouble abc1 = npyv_muladd_clongdouble(a1, b1, c1);
+            
+#line 397
+            npyv_clongdouble abc2 = npyv_muladd_clongdouble(a2, b2, c2);
+            
+#line 397
+            npyv_clongdouble abc3 = npyv_muladd_clongdouble(a3, b3, c3);
+            
+            #line 402
+            npyv_storea_clongdouble(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_storea_clongdouble(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_storea_clongdouble(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_storea_clongdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+#line 384
+    else {
+        const npy_intp vstepx4 = vstep * 4;
+        for (; count >= vstepx4; count -= vstepx4, data0 += vstepx4, data1 += vstepx4, data_out += vstepx4) {
+            #line 390
+            npyv_clongdouble a0 = npyv_load_clongdouble(data0 + vstep * 0);
+            npyv_clongdouble b0 = npyv_load_clongdouble(data1 + vstep * 0);
+            npyv_clongdouble c0 = npyv_load_clongdouble(data_out + vstep * 0);
+            
+#line 390
+            npyv_clongdouble a1 = npyv_load_clongdouble(data0 + vstep * 1);
+            npyv_clongdouble b1 = npyv_load_clongdouble(data1 + vstep * 1);
+            npyv_clongdouble c1 = npyv_load_clongdouble(data_out + vstep * 1);
+            
+#line 390
+            npyv_clongdouble a2 = npyv_load_clongdouble(data0 + vstep * 2);
+            npyv_clongdouble b2 = npyv_load_clongdouble(data1 + vstep * 2);
+            npyv_clongdouble c2 = npyv_load_clongdouble(data_out + vstep * 2);
+            
+#line 390
+            npyv_clongdouble a3 = npyv_load_clongdouble(data0 + vstep * 3);
+            npyv_clongdouble b3 = npyv_load_clongdouble(data1 + vstep * 3);
+            npyv_clongdouble c3 = npyv_load_clongdouble(data_out + vstep * 3);
+            
+            #line 397
+            npyv_clongdouble abc0 = npyv_muladd_clongdouble(a0, b0, c0);
+            
+#line 397
+            npyv_clongdouble abc1 = npyv_muladd_clongdouble(a1, b1, c1);
+            
+#line 397
+            npyv_clongdouble abc2 = npyv_muladd_clongdouble(a2, b2, c2);
+            
+#line 397
+            npyv_clongdouble abc3 = npyv_muladd_clongdouble(a3, b3, c3);
+            
+            #line 402
+            npyv_store_clongdouble(data_out + vstep * 0, abc0);
+            
+#line 402
+            npyv_store_clongdouble(data_out + vstep * 1, abc1);
+            
+#line 402
+            npyv_store_clongdouble(data_out + vstep * 2, abc2);
+            
+#line 402
+            npyv_store_clongdouble(data_out + vstep * 3, abc3);
+            
+        }
+    }
+    
+    for (; count > 0; count -= vstep, data0 += vstep, data1 += vstep, data_out += vstep) {
+        npyv_clongdouble a = npyv_load_tillz_clongdouble(data0, count);
+        npyv_clongdouble b = npyv_load_tillz_clongdouble(data1, count);
+        npyv_clongdouble c = npyv_load_tillz_clongdouble(data_out, count);
+        npyv_store_till_clongdouble(data_out, count, npyv_muladd_clongdouble(a, b, c));
+    }
+    npyv_cleanup();
+#else
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Scalar path, four elements per pass: load, multiply-add, store. */
+    for (; count >= 4; count -= 4, data0 += 4, data1 += 4, data_out += 4) {
+        #line 420
+        const npy_longdouble a0 = (data0[0]);
+        const npy_longdouble b0 = (data1[0]);
+        const npy_longdouble c0 = (data_out[0]);
+        
+#line 420
+        const npy_longdouble a1 = (data0[1]);
+        const npy_longdouble b1 = (data1[1]);
+        const npy_longdouble c1 = (data_out[1]);
+        
+#line 420
+        const npy_longdouble a2 = (data0[2]);
+        const npy_longdouble b2 = (data1[2]);
+        const npy_longdouble c2 = (data_out[2]);
+        
+#line 420
+        const npy_longdouble a3 = (data0[3]);
+        const npy_longdouble b3 = (data1[3]);
+        const npy_longdouble c3 = (data_out[3]);
+        
+        #line 427
+        const npy_longdouble abc0 = a0 * b0 + c0;
+        
+#line 427
+        const npy_longdouble abc1 = a1 * b1 + c1;
+        
+#line 427
+        const npy_longdouble abc2 = a2 * b2 + c2;
+        
+#line 427
+        const npy_longdouble abc3 = a3 * b3 + c3;
+        
+        #line 432
+        data_out[0] = (abc0);
+        
+#line 432
+        data_out[1] = (abc1);
+        
+#line 432
+        data_out[2] = (abc2);
+        
+#line 432
+        data_out[3] = (abc3);
+        
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Remainder (or everything, when the unrolled loop is disabled). */
+    for (; count > 0; --count, ++data0, ++data1, ++data_out) {
+        const npy_longdouble a = (*data0);
+        const npy_longdouble b = (*data1);
+        const npy_longdouble c = (*data_out);
+        *data_out = (a * b + c);
+    }
+#endif // NPYV check for npy_clongdouble
+
+}
+
+/* Some extra specializations for the two operand case */
+/*
+ * Two-operand case in which the first operand is read once, from
+ * dataptr[0], and used as the scalar factor: the work is handed to
+ * clongdouble_sum_of_products_muladd with dataptr[1] as the input buffer
+ * and dataptr[2] as the output buffer. `nop` and `strides` are not read.
+ */
+static void
+clongdouble_sum_of_products_stride0_contig_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    npy_clongdouble *dst = (npy_clongdouble *)dataptr[2];
+    npy_clongdouble *src = (npy_clongdouble *)dataptr[1];
+    npy_longdouble factor = (*(npy_clongdouble *)dataptr[0]);
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_stride0_contig_outcontig_two (%d)\n",
+                                                    (int)count);
+
+    clongdouble_sum_of_products_muladd(src, dst, factor, count);
+}
+
+static void
+clongdouble_sum_of_products_contig_stride0_outcontig_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Two-operand case where operand 1 is read once as a single value and
+     * operand 0 plus the output (dataptr[2]) are handed on as arrays; the
+     * actual work is delegated to clongdouble_sum_of_products_muladd().
+     *
+     * NOTE(review): the single value is read through an npy_clongdouble
+     * pointer but kept in an npy_longdouble -- confirm whether this
+     * instantiation is ever compiled.
+     */
+    npy_longdouble scalar = (*(npy_clongdouble *)dataptr[1]);
+    npy_clongdouble *vec = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *out = (npy_clongdouble *)dataptr[2];
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_stride0_outcontig_two (%d)\n",
+                                                    (int)count);
+    clongdouble_sum_of_products_muladd(vec, out, scalar, count);
+}
+
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_contig_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Reduce the element-wise products of operands 0 and 1 (both walked as
+     * contiguous arrays) into a single accumulator and add it to the one
+     * output element at dataptr[2].  Only the scalar path is written out:
+     * the vectorised variant is switched off for this type.
+     */
+    npy_clongdouble *lhs = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *rhs = (npy_clongdouble *)dataptr[1];
+    npy_longdouble total = 0;
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_contig_outstride0_two (%d)\n",
+                                                    (int)count);
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /* Four products per pass, summed together before touching the total */
+    while (count >= 4) {
+        const npy_longdouble p0 = (lhs[0]) * (rhs[0]);
+        const npy_longdouble p1 = (lhs[1]) * (rhs[1]);
+        const npy_longdouble p2 = (lhs[2]) * (rhs[2]);
+        const npy_longdouble p3 = (lhs[3]) * (rhs[3]);
+
+        total += p0 + p1 + p2 + p3;
+        count -= 4;
+        lhs += 4;
+        rhs += 4;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    /* Whatever is left (or everything, when the unrolled loop is disabled) */
+    while (count > 0) {
+        const npy_longdouble x = (*lhs);
+        const npy_longdouble y = (*rhs);
+
+        total += x * y;
+        --count;
+        ++lhs;
+        ++rhs;
+    }
+    *(npy_clongdouble *)dataptr[2] = ((*(npy_clongdouble *)dataptr[2]) + total);
+}
+
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_stride0_contig_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Operand 0 is a single value, operand 1 an array of `count` elements:
+     * sum the array with clongdouble_sum_of_arr(), scale the sum by the
+     * single value and add the result to the output element dataptr[2].
+     */
+    npy_longdouble scalar = (*(npy_clongdouble *)dataptr[0]);
+    npy_longdouble vec_sum = clongdouble_sum_of_arr((npy_clongdouble *)dataptr[1], count);
+    npy_clongdouble *out = (npy_clongdouble *)dataptr[2];
+
+    *out = ((*out) + scalar * vec_sum);
+}
+
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_contig_stride0_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Operand 1 is a single value, operand 0 an array of `count` elements:
+     * sum the array with clongdouble_sum_of_arr(), scale the sum by the
+     * single value and add the result to the output element dataptr[2].
+     */
+    npy_longdouble scalar = (*(npy_clongdouble *)dataptr[1]);
+    npy_longdouble vec_sum = clongdouble_sum_of_arr((npy_clongdouble *)dataptr[0], count);
+    npy_clongdouble *out = (npy_clongdouble *)dataptr[2];
+
+    *out = ((*out) + scalar * vec_sum);
+}
+
+#elif 1000 == 3 && !1
+
+static void
+clongdouble_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Three-operand contiguous kernel: out[i] = a[i] * b[i] * c[i] + out[i]
+     * for `count` elements, with the output taken from dataptr[3].
+     */
+    npy_clongdouble *a = (npy_clongdouble *)dataptr[0];
+    npy_clongdouble *b = (npy_clongdouble *)dataptr[1];
+    npy_clongdouble *c = (npy_clongdouble *)dataptr[2];
+    npy_clongdouble *out = (npy_clongdouble *)dataptr[3];
+    int k;
+
+    /* Main loop, kept explicitly unrolled: eight elements per pass */
+    for (; count >= 8; count -= 8, a += 8, b += 8, c += 8, out += 8) {
+        out[0] = ((a[0]) * (b[0]) * (c[0]) + (out[0]));
+        out[1] = ((a[1]) * (b[1]) * (c[1]) + (out[1]));
+        out[2] = ((a[2]) * (b[2]) * (c[2]) + (out[2]));
+        out[3] = ((a[3]) * (b[3]) * (c[3]) + (out[3]));
+        out[4] = ((a[4]) * (b[4]) * (c[4]) + (out[4]));
+        out[5] = ((a[5]) * (b[5]) * (c[5]) + (out[5]));
+        out[6] = ((a[6]) * (b[6]) * (c[6]) + (out[6]));
+        out[7] = ((a[7]) * (b[7]) * (c[7]) + (out[7]));
+    }
+
+    /*
+     * Remainder: process one element at a time, stopping as soon as the
+     * count is exhausted (at most eight slots are visited).
+     */
+    for (k = 0; k < 8; ++k) {
+        if (count-- == 0) {
+            return;
+        }
+        out[k] = ((a[k]) * (b[k]) * (c[k]) + (out[k]));
+    }
+}
+
+#else /* 1000 > 3 || @complex */
+
+static void
+clongdouble_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *NPY_UNUSED(strides), npy_intp count)
+{
+    /*
+     * Generic contiguous kernel for any operand count.  Each element is
+     * accessed as a pair of npy_longdouble values (index 0 used as the real
+     * part, index 1 as the imaginary part).  For every one of the `count`
+     * elements the operands 0..nop-1 are multiplied together as complex
+     * numbers and the product is added into the output operand
+     * dataptr[nop].  All pointers dataptr[0..nop] are advanced in place by
+     * one npy_clongdouble per iteration.
+     */
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_any (%d)\n",
+                                                    (int)count);
+
+    for (; count--; ) {
+        npy_longdouble *out = (npy_longdouble *)dataptr[nop];
+        npy_longdouble prod_re = ((npy_longdouble *)dataptr[0])[0];
+        npy_longdouble prod_im = ((npy_longdouble *)dataptr[0])[1];
+        int op;
+
+        /* Fold in the remaining operands: (re + i*im) *= factor */
+        for (op = 1; op < nop; ++op) {
+            const npy_longdouble *factor = (npy_longdouble *)dataptr[op];
+            const npy_longdouble next_re = prod_re * factor[0] -
+                                           prod_im * factor[1];
+
+            prod_im = prod_re * factor[1] +
+                      prod_im * factor[0];
+            prod_re = next_re;
+        }
+        out[0] = prod_re + out[0];
+        out[1] = prod_im + out[1];
+
+        /* Step every operand and the output to the next element */
+        for (op = 0; op <= nop; ++op) {
+            dataptr[op] += sizeof(npy_clongdouble);
+        }
+    }
+}
+
+#endif /* functions for various 1000 */
+
+#if 1000 == 1
+
+static NPY_GCC_OPT_3 void
+clongdouble_sum_of_products_contig_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    /*
+     * Sum `count` contiguous elements of operand 0 into the single output
+     * element at dataptr[1].  The input is walked as interleaved
+     * npy_longdouble pairs: even indices feed the real accumulator, odd
+     * indices the imaginary one.
+     */
+    npy_longdouble sum_re = 0, sum_im = 0;
+    npy_longdouble *src = (npy_longdouble *)dataptr[0];
+    npy_longdouble *out = (npy_longdouble *)dataptr[1];
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_contig_outstride0_one (%d)\n", (int)count);
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * Four elements (eight scalars) per pass, added pairwise.  The test is
+     * a strict `> 4`, so a final group of exactly four is left to the
+     * scalar loop below.
+     */
+    while (count > 4) {
+        const npy_longdouble re_lo = src[0] + src[2];
+        const npy_longdouble re_hi = src[4] + src[6];
+        const npy_longdouble im_lo = src[1] + src[3];
+        const npy_longdouble im_hi = src[5] + src[7];
+
+        sum_re += re_lo + re_hi;
+        sum_im += im_lo + im_hi;
+        count -= 4;
+        src += 4*2;
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    while (count > 0) {
+        sum_re += src[0];
+        sum_im += src[1];
+        --count;
+        src += 2;
+    }
+    out[0] += sum_re;
+    out[1] += sum_im;
+}
+
+#endif /* 1000 == 1 */
+
+static void
+clongdouble_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    /*
+     * Generic reduction kernel for any operand count.  Each element is
+     * accessed as a pair of npy_longdouble values (index 0 used as the real
+     * part, index 1 as the imaginary part).  For every one of the `count`
+     * elements the operands 0..nop-1 are multiplied together as complex
+     * numbers; the products are accumulated locally and added once, at the
+     * end, to the output element dataptr[nop].  The input pointers
+     * dataptr[0..nop-1] are advanced in place by their strides; the output
+     * pointer is not moved.
+     */
+    npy_longdouble sum_re = 0, sum_im = 0;
+    npy_longdouble *out;
+
+    NPY_EINSUM_DBG_PRINT1("clongdouble_sum_of_products_outstride0_any (%d)\n",
+                                                    (int)count);
+
+    for (; count--; ) {
+        npy_longdouble prod_re = ((npy_longdouble *)dataptr[0])[0];
+        npy_longdouble prod_im = ((npy_longdouble *)dataptr[0])[1];
+        int op;
+
+        /* Fold in the remaining operands: (re + i*im) *= factor */
+        for (op = 1; op < nop; ++op) {
+            const npy_longdouble *factor = (npy_longdouble *)dataptr[op];
+            const npy_longdouble next_re = prod_re * factor[0] -
+                                           prod_im * factor[1];
+
+            prod_im = prod_re * factor[1] +
+                      prod_im * factor[0];
+            prod_re = next_re;
+        }
+        sum_re += prod_re;
+        sum_im += prod_im;
+
+        /* Step each input operand by its own stride */
+        for (op = 0; op < nop; ++op) {
+            dataptr[op] += strides[op];
+        }
+    }
+
+    out = (npy_longdouble *)dataptr[nop];
+    out[0] += sum_re;
+    out[1] += sum_im;
+}
+
+
+
+
+
+
+/* Do OR of ANDs for the boolean type */
+
+#line 807
+
+static void
+bool_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    /*
+     * Single-operand boolean kernel with arbitrary strides:
+     * out = in || out for each of the `count` elements, where the input is
+     * dataptr[0] and the output is dataptr[1].
+     */
+    char *src = dataptr[0];
+    char *dst = dataptr[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    for (; count--; src += src_step, dst += dst_step) {
+        *(npy_bool *)dst = *(npy_bool *)src ||
+                           *(npy_bool *)dst;
+    }
+}
+
+static void
+bool_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    /*
+     * Single-operand boolean kernel for contiguous data:
+     * out[i] = in[i] || out[i], with the input at dataptr[0] and the output
+     * at dataptr[1].  Full groups of eight go through an unrolled loop; the
+     * 0..7 leftover elements are handled by a fall-through switch.
+     */
+    npy_bool *src = (npy_bool *)dataptr[0];
+    npy_bool *dst = (npy_bool *)dataptr[1];
+
+    for (;;) {
+        /*
+         * The remainder switch is tested first so that small counts finish
+         * without entering the unrolled loop.  Counts outside 0..7 match no
+         * case and drop through to the loop below.
+         */
+        switch (count) {
+            case 7:
+                dst[6] = src[6] || dst[6];
+                /* fall through */
+            case 6:
+                dst[5] = src[5] || dst[5];
+                /* fall through */
+            case 5:
+                dst[4] = src[4] || dst[4];
+                /* fall through */
+            case 4:
+                dst[3] = src[3] || dst[3];
+                /* fall through */
+            case 3:
+                dst[2] = src[2] || dst[2];
+                /* fall through */
+            case 2:
+                dst[1] = src[1] || dst[1];
+                /* fall through */
+            case 1:
+                dst[0] = src[0] || dst[0];
+                /* fall through */
+            case 0:
+                return;
+        }
+
+        /* Unrolled by 8 */
+        while (count >= 8) {
+            count -= 8;
+
+            dst[0] = (src[0]) || (dst[0]);
+            dst[1] = (src[1]) || (dst[1]);
+            dst[2] = (src[2]) || (dst[2]);
+            dst[3] = (src[3]) || (dst[3]);
+            dst[4] = (src[4]) || (dst[4]);
+            dst[5] = (src[5]) || (dst[5]);
+            dst[6] = (src[6]) || (dst[6]);
+            dst[7] = (src[7]) || (dst[7]);
+
+            src += 8;
+            dst += 8;
+        }
+        /* Loop back to the switch to finish off what is left */
+    }
+}
+
+static void
+bool_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    /*
+     * Single-operand boolean reduction: OR together `count` strided
+     * elements of dataptr[0] and OR the result into the single output
+     * element at dataptr[1].
+     */
+    npy_bool any_set = 0;
+    char *src = dataptr[0];
+    const npy_intp src_step = strides[0];
+
+    for (; count--; src += src_step) {
+        any_set = *(npy_bool *)src || any_set;
+    }
+
+    *((npy_bool *)dataptr[1]) = any_set || *((npy_bool *)dataptr[1]);
+}
+
+
+#line 807
+
+static void
+bool_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    /*
+     * Two-operand boolean kernel with arbitrary strides:
+     * out = (a && b) || out for each of the `count` elements, where the
+     * inputs are dataptr[0] and dataptr[1] and the output is dataptr[2].
+     */
+    char *lhs = dataptr[0];
+    char *rhs = dataptr[1];
+    char *dst = dataptr[2];
+    const npy_intp lhs_step = strides[0];
+    const npy_intp rhs_step = strides[1];
+    const npy_intp dst_step = strides[2];
+
+    for (; count--; lhs += lhs_step, rhs += rhs_step, dst += dst_step) {
+        *(npy_bool *)dst = (*(npy_bool *)lhs &&
+                            *(npy_bool *)rhs) ||
+                            *(npy_bool *)dst;
+    }
+}
+
+/*
+ * Boolean sum-of-products, two input operands, contiguous npy_bool data.
+ *
+ * For each of `count` elements computes
+ *     out[k] = (in0[k] && in1[k]) || out[k]
+ * with in0/in1/out starting at dataptr[0], dataptr[1] and dataptr[2].
+ *
+ * Structure: a fall-through switch handles a remainder of 0..7 elements
+ * (highest index first) and returns; it sits ahead of the 8-way unrolled
+ * loop so that small counts never enter that loop.  Larger counts skip
+ * the switch, run the unrolled loop until fewer than 8 elements remain,
+ * and then come back around to the switch.
+ *
+ * `nop` and `strides` are not used.  A negative `count` matches neither
+ * the switch nor the loop condition, so nothing terminates the routine
+ * in that case; callers are expected to pass count >= 0.
+ */
+static void
+bool_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+/* One element of the boolean multiply-accumulate at offset k */
+#define BOOL_CONTIG_TWO_STEP(k) \
+    (out[(k)] = (in0[(k)] && in1[(k)]) || out[(k)])
+
+    npy_bool *in0 = (npy_bool *)dataptr[0];
+    npy_bool *in1 = (npy_bool *)dataptr[1];
+    npy_bool *out = (npy_bool *)dataptr[2];
+
+    for (;;) {
+        /* Remainder handling; every case deliberately falls through */
+        switch (count) {
+            case 7:
+                BOOL_CONTIG_TWO_STEP(6);
+                /* fall through */
+            case 6:
+                BOOL_CONTIG_TWO_STEP(5);
+                /* fall through */
+            case 5:
+                BOOL_CONTIG_TWO_STEP(4);
+                /* fall through */
+            case 4:
+                BOOL_CONTIG_TWO_STEP(3);
+                /* fall through */
+            case 3:
+                BOOL_CONTIG_TWO_STEP(2);
+                /* fall through */
+            case 2:
+                BOOL_CONTIG_TWO_STEP(1);
+                /* fall through */
+            case 1:
+                BOOL_CONTIG_TWO_STEP(0);
+                /* fall through */
+            case 0:
+                return;
+        }
+
+        /* Full groups of eight elements, lowest index first */
+        while (count >= 8) {
+            count -= 8;
+
+            BOOL_CONTIG_TWO_STEP(0);
+            BOOL_CONTIG_TWO_STEP(1);
+            BOOL_CONTIG_TWO_STEP(2);
+            BOOL_CONTIG_TWO_STEP(3);
+            BOOL_CONTIG_TWO_STEP(4);
+            BOOL_CONTIG_TWO_STEP(5);
+            BOOL_CONTIG_TWO_STEP(6);
+            BOOL_CONTIG_TWO_STEP(7);
+
+            in0 += 8;
+            in1 += 8;
+            out += 8;
+        }
+    }
+
+#undef BOOL_CONTIG_TWO_STEP
+}
+
+/*
+ * Boolean sum-of-products, two input operands, reduced into a single
+ * output element.
+ *
+ * Walks `count` element pairs from dataptr[0] and dataptr[1] (advancing
+ * by strides[0] and strides[1] bytes respectively), ORs together the
+ * AND of each pair, and finally ORs the result into the element at
+ * dataptr[2].  The output pointer is never advanced and the dataptr
+ * array itself is not modified.  `nop` is not used by this fixed-arity
+ * specialization.
+ */
+static void
+bool_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    const char *lhs = dataptr[0];
+    const char *rhs = dataptr[1];
+    const npy_intp lhs_step = strides[0];
+    const npy_intp rhs_step = strides[1];
+    npy_bool *out = (npy_bool *)dataptr[2];
+    npy_bool any_true = 0;
+
+    for (; count != 0; --count) {
+        any_true = (*(const npy_bool *)lhs && *(const npy_bool *)rhs) ||
+                   any_true;
+        lhs += lhs_step;
+        rhs += rhs_step;
+    }
+
+    /* Fold the local accumulator into whatever the output already holds */
+    *out = any_true || *out;
+}
+
+
+#line 807
+
+/*
+ * Boolean sum-of-products, three input operands, arbitrary strides.
+ *
+ * For each of `count` elements computes
+ *     out = (in0 && in1 && in2) || out
+ * where in0/in1/in2/out are read through dataptr[0..3], each advancing
+ * by the matching strides[] entry (in bytes) after every element.  Only
+ * local copies of the pointers are advanced; the dataptr array is left
+ * untouched.  `nop` is not used by this fixed-arity specialization.
+ */
+static void
+bool_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    const char *in0 = dataptr[0];
+    const char *in1 = dataptr[1];
+    const char *in2 = dataptr[2];
+    char *out = dataptr[3];
+    const npy_intp in0_step = strides[0];
+    const npy_intp in1_step = strides[1];
+    const npy_intp in2_step = strides[2];
+    const npy_intp out_step = strides[3];
+
+    for (; count != 0; --count) {
+        npy_bool *dst = (npy_bool *)out;
+
+        /* The output is only read back when the product is false */
+        *dst = (*(const npy_bool *)in0 &&
+                *(const npy_bool *)in1 &&
+                *(const npy_bool *)in2) || *dst;
+
+        in0 += in0_step;
+        in1 += in1_step;
+        in2 += in2_step;
+        out += out_step;
+    }
+}
+
+/*
+ * Boolean sum-of-products, three input operands, contiguous npy_bool
+ * data.
+ *
+ * For each of `count` elements computes
+ *     out[k] = (in0[k] && in1[k] && in2[k]) || out[k]
+ * with in0/in1/in2/out starting at dataptr[0..3].
+ *
+ * Structure: a fall-through switch handles a remainder of 0..7 elements
+ * (highest index first) and returns; it sits ahead of the 8-way unrolled
+ * loop so that small counts never enter that loop.  Larger counts skip
+ * the switch, run the unrolled loop until fewer than 8 elements remain,
+ * and then come back around to the switch.
+ *
+ * `nop` and `strides` are not used.  A negative `count` matches neither
+ * the switch nor the loop condition, so nothing terminates the routine
+ * in that case; callers are expected to pass count >= 0.
+ */
+static void
+bool_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+/* One element of the boolean multiply-accumulate at offset k */
+#define BOOL_CONTIG_THREE_STEP(k) \
+    (out[(k)] = (in0[(k)] && in1[(k)] && in2[(k)]) || out[(k)])
+
+    npy_bool *in0 = (npy_bool *)dataptr[0];
+    npy_bool *in1 = (npy_bool *)dataptr[1];
+    npy_bool *in2 = (npy_bool *)dataptr[2];
+    npy_bool *out = (npy_bool *)dataptr[3];
+
+    for (;;) {
+        /* Remainder handling; every case deliberately falls through */
+        switch (count) {
+            case 7:
+                BOOL_CONTIG_THREE_STEP(6);
+                /* fall through */
+            case 6:
+                BOOL_CONTIG_THREE_STEP(5);
+                /* fall through */
+            case 5:
+                BOOL_CONTIG_THREE_STEP(4);
+                /* fall through */
+            case 4:
+                BOOL_CONTIG_THREE_STEP(3);
+                /* fall through */
+            case 3:
+                BOOL_CONTIG_THREE_STEP(2);
+                /* fall through */
+            case 2:
+                BOOL_CONTIG_THREE_STEP(1);
+                /* fall through */
+            case 1:
+                BOOL_CONTIG_THREE_STEP(0);
+                /* fall through */
+            case 0:
+                return;
+        }
+
+        /* Full groups of eight elements, lowest index first */
+        while (count >= 8) {
+            count -= 8;
+
+            BOOL_CONTIG_THREE_STEP(0);
+            BOOL_CONTIG_THREE_STEP(1);
+            BOOL_CONTIG_THREE_STEP(2);
+            BOOL_CONTIG_THREE_STEP(3);
+            BOOL_CONTIG_THREE_STEP(4);
+            BOOL_CONTIG_THREE_STEP(5);
+            BOOL_CONTIG_THREE_STEP(6);
+            BOOL_CONTIG_THREE_STEP(7);
+
+            in0 += 8;
+            in1 += 8;
+            in2 += 8;
+            out += 8;
+        }
+    }
+
+#undef BOOL_CONTIG_THREE_STEP
+}
+
+/*
+ * Boolean sum-of-products, three input operands, reduced into a single
+ * output element.
+ *
+ * Walks `count` element triples from dataptr[0], dataptr[1] and
+ * dataptr[2] (advancing by strides[0..2] bytes respectively), ORs
+ * together the AND of each triple, and finally ORs the result into the
+ * element at dataptr[3].  The output pointer is never advanced and the
+ * dataptr array itself is not modified.  `nop` is not used by this
+ * fixed-arity specialization.
+ */
+static void
+bool_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    const char *in0 = dataptr[0];
+    const char *in1 = dataptr[1];
+    const char *in2 = dataptr[2];
+    const npy_intp in0_step = strides[0];
+    const npy_intp in1_step = strides[1];
+    const npy_intp in2_step = strides[2];
+    npy_bool *out = (npy_bool *)dataptr[3];
+    npy_bool any_true = 0;
+
+    for (; count != 0; --count) {
+        any_true = (*(const npy_bool *)in0 &&
+                    *(const npy_bool *)in1 &&
+                    *(const npy_bool *)in2) || any_true;
+        in0 += in0_step;
+        in1 += in1_step;
+        in2 += in2_step;
+    }
+
+    /* Fold the local accumulator into whatever the output already holds */
+    *out = any_true || *out;
+}
+
+
+#line 807
+
+/*
+ * Boolean sum-of-products for an arbitrary number of input operands.
+ *
+ * dataptr[0 .. nop-1] are the inputs and dataptr[nop] is the output;
+ * strides[] holds the per-operand byte step.  For each of `count`
+ * elements the inputs are ANDed together and the result is ORed into
+ * the output element.
+ *
+ * Unlike the fixed-arity variants, this routine advances the entries of
+ * the caller's dataptr array in place (all nop + 1 of them) after every
+ * element.
+ */
+static void
+bool_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    for (; count != 0; --count) {
+        npy_bool product = *(npy_bool *)dataptr[0];
+        int op = 1;
+
+        /* Short-circuits: once product is false no further input is read */
+        while (op < nop) {
+            product = product && *(npy_bool *)dataptr[op];
+            ++op;
+        }
+
+        /*
+         * The accumulator is read through the post-loop index, which
+         * equals nop (i.e. the output operand) whenever nop >= 1.
+         */
+        *(npy_bool *)dataptr[nop] = product || *(npy_bool *)dataptr[op];
+
+        /* Step every operand, output included, to its next element */
+        op = 0;
+        while (op <= nop) {
+            dataptr[op] += strides[op];
+            ++op;
+        }
+    }
+}
+
+static void
+bool_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+#if (1000 <= 3)
+    char *data0 = dataptr[0];
+#endif
+#if (1000 == 2 || 1000 == 3)
+    char *data1 = dataptr[1];
+#endif
+#if (1000 == 3)
+    char *data2 = dataptr[2];
+#endif
+#if (1000 <= 3)
+    char *data_out = dataptr[1000];
+#endif
+    /* Per element: dataptr[nop] = (AND of dataptr[0..nop-1]) || dataptr[nop]; strides is never read here. */
+#if (1000 <= 3)
+/* This is placed before the main loop to make small counts faster */
+finish_after_unrolled_loop:
+    switch (count) {
+#line 889
+        case 6+1:
+#  if 1000 == 1
+            ((npy_bool *)data_out)[6] = ((npy_bool *)data0)[6] ||
+                                            ((npy_bool *)data_out)[6];
+#  elif 1000 == 2
+            ((npy_bool *)data_out)[6] =
+                            (((npy_bool *)data0)[6] &&
+                             ((npy_bool *)data1)[6]) ||
+                                ((npy_bool *)data_out)[6];
+#  elif 1000 == 3
+            ((npy_bool *)data_out)[6] =
+                           (((npy_bool *)data0)[6] &&
+                            ((npy_bool *)data1)[6] &&
+                            ((npy_bool *)data2)[6]) ||
+                                ((npy_bool *)data_out)[6];
+#  endif
+
+#line 889
+        case 5+1:
+#  if 1000 == 1
+            ((npy_bool *)data_out)[5] = ((npy_bool *)data0)[5] ||
+                                            ((npy_bool *)data_out)[5];
+#  elif 1000 == 2
+            ((npy_bool *)data_out)[5] =
+                            (((npy_bool *)data0)[5] &&
+                             ((npy_bool *)data1)[5]) ||
+                                ((npy_bool *)data_out)[5];
+#  elif 1000 == 3
+            ((npy_bool *)data_out)[5] =
+                           (((npy_bool *)data0)[5] &&
+                            ((npy_bool *)data1)[5] &&
+                            ((npy_bool *)data2)[5]) ||
+                                ((npy_bool *)data_out)[5];
+#  endif
+
+#line 889
+        case 4+1:
+#  if 1000 == 1
+            ((npy_bool *)data_out)[4] = ((npy_bool *)data0)[4] ||
+                                            ((npy_bool *)data_out)[4];
+#  elif 1000 == 2
+            ((npy_bool *)data_out)[4] =
+                            (((npy_bool *)data0)[4] &&
+                             ((npy_bool *)data1)[4]) ||
+                                ((npy_bool *)data_out)[4];
+#  elif 1000 == 3
+            ((npy_bool *)data_out)[4] =
+                           (((npy_bool *)data0)[4] &&
+                            ((npy_bool *)data1)[4] &&
+                            ((npy_bool *)data2)[4]) ||
+                                ((npy_bool *)data_out)[4];
+#  endif
+
+#line 889
+        case 3+1:
+#  if 1000 == 1
+            ((npy_bool *)data_out)[3] = ((npy_bool *)data0)[3] ||
+                                            ((npy_bool *)data_out)[3];
+#  elif 1000 == 2
+            ((npy_bool *)data_out)[3] =
+                            (((npy_bool *)data0)[3] &&
+                             ((npy_bool *)data1)[3]) ||
+                                ((npy_bool *)data_out)[3];
+#  elif 1000 == 3
+            ((npy_bool *)data_out)[3] =
+                           (((npy_bool *)data0)[3] &&
+                            ((npy_bool *)data1)[3] &&
+                            ((npy_bool *)data2)[3]) ||
+                                ((npy_bool *)data_out)[3];
+#  endif
+
+#line 889
+        case 2+1:
+#  if 1000 == 1
+            ((npy_bool *)data_out)[2] = ((npy_bool *)data0)[2] ||
+                                            ((npy_bool *)data_out)[2];
+#  elif 1000 == 2
+            ((npy_bool *)data_out)[2] =
+                            (((npy_bool *)data0)[2] &&
+                             ((npy_bool *)data1)[2]) ||
+                                ((npy_bool *)data_out)[2];
+#  elif 1000 == 3
+            ((npy_bool *)data_out)[2] =
+                           (((npy_bool *)data0)[2] &&
+                            ((npy_bool *)data1)[2] &&
+                            ((npy_bool *)data2)[2]) ||
+                                ((npy_bool *)data_out)[2];
+#  endif
+
+#line 889
+        case 1+1:
+#  if 1000 == 1
+            ((npy_bool *)data_out)[1] = ((npy_bool *)data0)[1] ||
+                                            ((npy_bool *)data_out)[1];
+#  elif 1000 == 2
+            ((npy_bool *)data_out)[1] =
+                            (((npy_bool *)data0)[1] &&
+                             ((npy_bool *)data1)[1]) ||
+                                ((npy_bool *)data_out)[1];
+#  elif 1000 == 3
+            ((npy_bool *)data_out)[1] =
+                           (((npy_bool *)data0)[1] &&
+                            ((npy_bool *)data1)[1] &&
+                            ((npy_bool *)data2)[1]) ||
+                                ((npy_bool *)data_out)[1];
+#  endif
+
+#line 889
+        case 0+1:
+#  if 1000 == 1
+            ((npy_bool *)data_out)[0] = ((npy_bool *)data0)[0] ||
+                                            ((npy_bool *)data_out)[0];
+#  elif 1000 == 2
+            ((npy_bool *)data_out)[0] =
+                            (((npy_bool *)data0)[0] &&
+                             ((npy_bool *)data1)[0]) ||
+                                ((npy_bool *)data_out)[0];
+#  elif 1000 == 3
+            ((npy_bool *)data_out)[0] =
+                           (((npy_bool *)data0)[0] &&
+                            ((npy_bool *)data1)[0] &&
+                            ((npy_bool *)data2)[0]) ||
+                                ((npy_bool *)data_out)[0];
+#  endif
+
+        case 0:
+            return;
+    }
+#endif
+
+/* Unroll the loop by 8 for fixed-size nop; 1000 <= 3 is false, so here the plain while (count--) loop compiles */
+#if (1000 <= 3)
+    while (count >= 8) {
+        count -= 8;
+#else
+    while (count--) {  /* generic path: one element per pass */
+#endif
+
+#  if 1000 == 1
+#line 923
+        *((npy_bool *)data_out + 0) = (*((npy_bool *)data0 + 0)) ||
+                                        (*((npy_bool *)data_out + 0));
+
+#line 923
+        *((npy_bool *)data_out + 1) = (*((npy_bool *)data0 + 1)) ||
+                                        (*((npy_bool *)data_out + 1));
+
+#line 923
+        *((npy_bool *)data_out + 2) = (*((npy_bool *)data0 + 2)) ||
+                                        (*((npy_bool *)data_out + 2));
+
+#line 923
+        *((npy_bool *)data_out + 3) = (*((npy_bool *)data0 + 3)) ||
+                                        (*((npy_bool *)data_out + 3));
+
+#line 923
+        *((npy_bool *)data_out + 4) = (*((npy_bool *)data0 + 4)) ||
+                                        (*((npy_bool *)data_out + 4));
+
+#line 923
+        *((npy_bool *)data_out + 5) = (*((npy_bool *)data0 + 5)) ||
+                                        (*((npy_bool *)data_out + 5));
+
+#line 923
+        *((npy_bool *)data_out + 6) = (*((npy_bool *)data0 + 6)) ||
+                                        (*((npy_bool *)data_out + 6));
+
+#line 923
+        *((npy_bool *)data_out + 7) = (*((npy_bool *)data0 + 7)) ||
+                                        (*((npy_bool *)data_out + 7));
+
+        data0 += 8*sizeof(npy_bool);
+        data_out += 8*sizeof(npy_bool);
+#  elif 1000 == 2
+#line 932
+        *((npy_bool *)data_out + 0) =
+                        ((*((npy_bool *)data0 + 0)) &&
+                         (*((npy_bool *)data1 + 0))) ||
+                            (*((npy_bool *)data_out + 0));
+
+#line 932
+        *((npy_bool *)data_out + 1) =
+                        ((*((npy_bool *)data0 + 1)) &&
+                         (*((npy_bool *)data1 + 1))) ||
+                            (*((npy_bool *)data_out + 1));
+
+#line 932
+        *((npy_bool *)data_out + 2) =
+                        ((*((npy_bool *)data0 + 2)) &&
+                         (*((npy_bool *)data1 + 2))) ||
+                            (*((npy_bool *)data_out + 2));
+
+#line 932
+        *((npy_bool *)data_out + 3) =
+                        ((*((npy_bool *)data0 + 3)) &&
+                         (*((npy_bool *)data1 + 3))) ||
+                            (*((npy_bool *)data_out + 3));
+
+#line 932
+        *((npy_bool *)data_out + 4) =
+                        ((*((npy_bool *)data0 + 4)) &&
+                         (*((npy_bool *)data1 + 4))) ||
+                            (*((npy_bool *)data_out + 4));
+
+#line 932
+        *((npy_bool *)data_out + 5) =
+                        ((*((npy_bool *)data0 + 5)) &&
+                         (*((npy_bool *)data1 + 5))) ||
+                            (*((npy_bool *)data_out + 5));
+
+#line 932
+        *((npy_bool *)data_out + 6) =
+                        ((*((npy_bool *)data0 + 6)) &&
+                         (*((npy_bool *)data1 + 6))) ||
+                            (*((npy_bool *)data_out + 6));
+
+#line 932
+        *((npy_bool *)data_out + 7) =
+                        ((*((npy_bool *)data0 + 7)) &&
+                         (*((npy_bool *)data1 + 7))) ||
+                            (*((npy_bool *)data_out + 7));
+
+        data0 += 8*sizeof(npy_bool);
+        data1 += 8*sizeof(npy_bool);
+        data_out += 8*sizeof(npy_bool);
+#  elif 1000 == 3
+#line 944
+        *((npy_bool *)data_out + 0) =
+                       ((*((npy_bool *)data0 + 0)) &&
+                        (*((npy_bool *)data1 + 0)) &&
+                        (*((npy_bool *)data2 + 0))) ||
+                            (*((npy_bool *)data_out + 0));
+
+#line 944
+        *((npy_bool *)data_out + 1) =
+                       ((*((npy_bool *)data0 + 1)) &&
+                        (*((npy_bool *)data1 + 1)) &&
+                        (*((npy_bool *)data2 + 1))) ||
+                            (*((npy_bool *)data_out + 1));
+
+#line 944
+        *((npy_bool *)data_out + 2) =
+                       ((*((npy_bool *)data0 + 2)) &&
+                        (*((npy_bool *)data1 + 2)) &&
+                        (*((npy_bool *)data2 + 2))) ||
+                            (*((npy_bool *)data_out + 2));
+
+#line 944
+        *((npy_bool *)data_out + 3) =
+                       ((*((npy_bool *)data0 + 3)) &&
+                        (*((npy_bool *)data1 + 3)) &&
+                        (*((npy_bool *)data2 + 3))) ||
+                            (*((npy_bool *)data_out + 3));
+
+#line 944
+        *((npy_bool *)data_out + 4) =
+                       ((*((npy_bool *)data0 + 4)) &&
+                        (*((npy_bool *)data1 + 4)) &&
+                        (*((npy_bool *)data2 + 4))) ||
+                            (*((npy_bool *)data_out + 4));
+
+#line 944
+        *((npy_bool *)data_out + 5) =
+                       ((*((npy_bool *)data0 + 5)) &&
+                        (*((npy_bool *)data1 + 5)) &&
+                        (*((npy_bool *)data2 + 5))) ||
+                            (*((npy_bool *)data_out + 5));
+
+#line 944
+        *((npy_bool *)data_out + 6) =
+                       ((*((npy_bool *)data0 + 6)) &&
+                        (*((npy_bool *)data1 + 6)) &&
+                        (*((npy_bool *)data2 + 6))) ||
+                            (*((npy_bool *)data_out + 6));
+
+#line 944
+        *((npy_bool *)data_out + 7) =
+                       ((*((npy_bool *)data0 + 7)) &&
+                        (*((npy_bool *)data1 + 7)) &&
+                        (*((npy_bool *)data2 + 7))) ||
+                            (*((npy_bool *)data_out + 7));
+
+        data0 += 8*sizeof(npy_bool);
+        data1 += 8*sizeof(npy_bool);
+        data2 += 8*sizeof(npy_bool);
+        data_out += 8*sizeof(npy_bool);
+#  else  /* generic nop: the branch selected when the literal is 1000 */
+        npy_bool temp = *(npy_bool *)dataptr[0];
+        int i;
+        for (i = 1; i < nop; ++i) {
+            temp = temp && *(npy_bool *)dataptr[i];
+        }
+        *(npy_bool *)dataptr[nop] = temp || *(npy_bool *)dataptr[i];  /* i == nop after the loop (nop >= 1) */
+        for (i = 0; i <= nop; ++i) {
+            dataptr[i] += sizeof(npy_bool);  /* fixed one-element step for inputs and output */
+        }
+#  endif
+    }
+
+    /* If the loop was unrolled, we need to finish it off */
+#if (1000 <= 3)
+    goto finish_after_unrolled_loop;
+#endif
+}
+
+static void
+bool_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    /* Running OR over all count elements; stored to the output once at the end. */
+    npy_bool any_hit = 0;
+#if (1000 <= 3)
+    npy_intp step0 = strides[0];
+    char *in0 = dataptr[0];
+#endif
+#if (1000 == 2 || 1000 == 3)
+    npy_intp step1 = strides[1];
+    char *in1 = dataptr[1];
+#endif
+#if (1000 == 3)
+    npy_intp step2 = strides[2];
+    char *in2 = dataptr[2];
+#endif
+
+    while (count--) {
+#if 1000 == 1
+        any_hit = *(npy_bool *)in0 || any_hit;
+        in0 += step0;
+#elif 1000 == 2
+        any_hit = (*(npy_bool *)in0 && *(npy_bool *)in1) || any_hit;
+        in0 += step0;
+        in1 += step1;
+#elif 1000 == 3
+        any_hit = (*(npy_bool *)in0 && *(npy_bool *)in1 &&
+                   *(npy_bool *)in2) || any_hit;
+        in0 += step0;
+        in1 += step1;
+        in2 += step2;
+#else
+        /* Generic nop (the branch selected for the literal 1000): AND every operand. */
+        int op = 1;
+        npy_bool all_set = *(npy_bool *)dataptr[0];
+        for (; op < nop; ++op) {
+            all_set = all_set && *(npy_bool *)dataptr[op];
+        }
+        any_hit = all_set || any_hit;
+        for (op = 0; op <= nop; ++op) {
+            dataptr[op] += strides[op];
+        }
+#endif
+    }
+    /* NOTE(review): dataptr[nop] was stepped by strides[nop] on every pass; presumably that stride is 0 - confirm. */
+#  if 1000 <= 3
+    *((npy_bool *)dataptr[1000]) = any_hit || *((npy_bool *)dataptr[1000]);
+#  else
+    *((npy_bool *)dataptr[nop]) = any_hit || *((npy_bool *)dataptr[nop]);
+#  endif
+}
+
+
+
+#line 1044
+/* Object-dtype inner loop: for each of count elements, multiply the nop
+ * operands together and add the product into the output slot dataptr[nop]. */
+static void
+object_sum_of_products_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    while (count--) {
+        /* Convention: a NULL slot stands for None (inputs and output alike). */
+        PyObject *prod = *(PyObject **)dataptr[0];
+        prod = prod ? prod : Py_None;
+        Py_INCREF(prod);  /* prod is an owned reference from here on */
+        for (int i = 1; i < nop; ++i) {
+            PyObject *curr = *(PyObject **)dataptr[i];
+            /* Py_SETREF drops the old product once the new one is computed. */
+            Py_SETREF(prod, PyNumber_Multiply(prod, curr ? curr : Py_None));
+            if (!prod) {
+                return;  /* multiplication failed */
+            }
+        }
+
+        /* Fix: never hand a NULL output to PyNumber_Add; use None instead. */
+        PyObject *prev = *(PyObject **)dataptr[nop];
+        PyObject *sum = PyNumber_Add(prev ? prev : Py_None, prod);
+        Py_DECREF(prod);
+        if (!sum) {
+            return;  /* addition failed; the output slot is left untouched */
+        }
+        Py_XDECREF(prev);  /* X variant: prev may have been NULL */
+        *(PyObject **)dataptr[nop] = sum;
+        for (int i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];  /* advance every operand and the output */
+        }
+    }
+}
+
+#line 1044
+/* Object-dtype inner loop: for each of count elements, multiply the nop
+ * operands together and add the product into the output slot dataptr[nop]. */
+static void
+object_sum_of_products_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    while (count--) {
+        /* Convention: a NULL slot stands for None (inputs and output alike). */
+        PyObject *prod = *(PyObject **)dataptr[0];
+        prod = prod ? prod : Py_None;
+        Py_INCREF(prod);  /* prod is an owned reference from here on */
+        for (int i = 1; i < nop; ++i) {
+            PyObject *curr = *(PyObject **)dataptr[i];
+            /* Py_SETREF drops the old product once the new one is computed. */
+            Py_SETREF(prod, PyNumber_Multiply(prod, curr ? curr : Py_None));
+            if (!prod) {
+                return;  /* multiplication failed */
+            }
+        }
+
+        /* Fix: never hand a NULL output to PyNumber_Add; use None instead. */
+        PyObject *prev = *(PyObject **)dataptr[nop];
+        PyObject *sum = PyNumber_Add(prev ? prev : Py_None, prod);
+        Py_DECREF(prod);
+        if (!sum) {
+            return;  /* addition failed; the output slot is left untouched */
+        }
+        Py_XDECREF(prev);  /* X variant: prev may have been NULL */
+        *(PyObject **)dataptr[nop] = sum;
+        for (int i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];  /* advance every operand and the output */
+        }
+    }
+}
+
+#line 1044
+/* Object-dtype inner loop: for each of count elements, multiply the nop
+ * operands together and add the product into the output slot dataptr[nop]. */
+static void
+object_sum_of_products_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    while (count--) {
+        /* Convention: a NULL slot stands for None (inputs and output alike). */
+        PyObject *prod = *(PyObject **)dataptr[0];
+        prod = prod ? prod : Py_None;
+        Py_INCREF(prod);  /* prod is an owned reference from here on */
+        for (int i = 1; i < nop; ++i) {
+            PyObject *curr = *(PyObject **)dataptr[i];
+            /* Py_SETREF drops the old product once the new one is computed. */
+            Py_SETREF(prod, PyNumber_Multiply(prod, curr ? curr : Py_None));
+            if (!prod) {
+                return;  /* multiplication failed */
+            }
+        }
+
+        /* Fix: never hand a NULL output to PyNumber_Add; use None instead. */
+        PyObject *prev = *(PyObject **)dataptr[nop];
+        PyObject *sum = PyNumber_Add(prev ? prev : Py_None, prod);
+        Py_DECREF(prod);
+        if (!sum) {
+            return;  /* addition failed; the output slot is left untouched */
+        }
+        Py_XDECREF(prev);  /* X variant: prev may have been NULL */
+        *(PyObject **)dataptr[nop] = sum;
+        for (int i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];  /* advance every operand and the output */
+        }
+    }
+}
+
+#line 1044
+/* Object-dtype inner loop: for each of count elements, multiply the nop
+ * operands together and add the product into the output slot dataptr[nop]. */
+static void
+object_sum_of_products_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    while (count--) {
+        /* Convention: a NULL slot stands for None (inputs and output alike). */
+        PyObject *prod = *(PyObject **)dataptr[0];
+        prod = prod ? prod : Py_None;
+        Py_INCREF(prod);  /* prod is an owned reference from here on */
+        for (int i = 1; i < nop; ++i) {
+            PyObject *curr = *(PyObject **)dataptr[i];
+            /* Py_SETREF drops the old product once the new one is computed. */
+            Py_SETREF(prod, PyNumber_Multiply(prod, curr ? curr : Py_None));
+            if (!prod) {
+                return;  /* multiplication failed */
+            }
+        }
+
+        /* Fix: never hand a NULL output to PyNumber_Add; use None instead. */
+        PyObject *prev = *(PyObject **)dataptr[nop];
+        PyObject *sum = PyNumber_Add(prev ? prev : Py_None, prod);
+        Py_DECREF(prod);
+        if (!sum) {
+            return;  /* addition failed; the output slot is left untouched */
+        }
+        Py_XDECREF(prev);  /* X variant: prev may have been NULL */
+        *(PyObject **)dataptr[nop] = sum;
+        for (int i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];  /* advance every operand and the output */
+        }
+    }
+}
+
+#line 1044
+/* Object-dtype inner loop: for each of count elements, multiply the nop
+ * operands together and add the product into the output slot dataptr[nop]. */
+static void
+object_sum_of_products_contig_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    while (count--) {
+        /* Convention: a NULL slot stands for None (inputs and output alike). */
+        PyObject *prod = *(PyObject **)dataptr[0];
+        prod = prod ? prod : Py_None;
+        Py_INCREF(prod);  /* prod is an owned reference from here on */
+        for (int i = 1; i < nop; ++i) {
+            PyObject *curr = *(PyObject **)dataptr[i];
+            /* Py_SETREF drops the old product once the new one is computed. */
+            Py_SETREF(prod, PyNumber_Multiply(prod, curr ? curr : Py_None));
+            if (!prod) {
+                return;  /* multiplication failed */
+            }
+        }
+
+        /* Fix: never hand a NULL output to PyNumber_Add; use None instead. */
+        PyObject *prev = *(PyObject **)dataptr[nop];
+        PyObject *sum = PyNumber_Add(prev ? prev : Py_None, prod);
+        Py_DECREF(prod);
+        if (!sum) {
+            return;  /* addition failed; the output slot is left untouched */
+        }
+        Py_XDECREF(prev);  /* X variant: prev may have been NULL */
+        *(PyObject **)dataptr[nop] = sum;
+        for (int i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];  /* advance every operand and the output */
+        }
+    }
+}
+
+#line 1044
+/* Object-dtype inner loop: for each of count elements, multiply the nop
+ * operands together and add the product into the output slot dataptr[nop]. */
+static void
+object_sum_of_products_contig_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    while (count--) {
+        /* Convention: a NULL slot stands for None (inputs and output alike). */
+        PyObject *prod = *(PyObject **)dataptr[0];
+        prod = prod ? prod : Py_None;
+        Py_INCREF(prod);  /* prod is an owned reference from here on */
+        for (int i = 1; i < nop; ++i) {
+            PyObject *curr = *(PyObject **)dataptr[i];
+            /* Py_SETREF drops the old product once the new one is computed. */
+            Py_SETREF(prod, PyNumber_Multiply(prod, curr ? curr : Py_None));
+            if (!prod) {
+                return;  /* multiplication failed */
+            }
+        }
+
+        /* Fix: never hand a NULL output to PyNumber_Add; use None instead. */
+        PyObject *prev = *(PyObject **)dataptr[nop];
+        PyObject *sum = PyNumber_Add(prev ? prev : Py_None, prod);
+        Py_DECREF(prod);
+        if (!sum) {
+            return;  /* addition failed; the output slot is left untouched */
+        }
+        Py_XDECREF(prev);  /* X variant: prev may have been NULL */
+        *(PyObject **)dataptr[nop] = sum;
+        for (int i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];  /* advance every operand and the output */
+        }
+    }
+}
+
+#line 1044
+/* Object-dtype inner loop: for each of count elements, multiply the nop
+ * operands together and add the product into the output slot dataptr[nop]. */
+static void
+object_sum_of_products_contig_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    while (count--) {
+        /* Convention: a NULL slot stands for None (inputs and output alike). */
+        PyObject *prod = *(PyObject **)dataptr[0];
+        prod = prod ? prod : Py_None;
+        Py_INCREF(prod);  /* prod is an owned reference from here on */
+        for (int i = 1; i < nop; ++i) {
+            PyObject *curr = *(PyObject **)dataptr[i];
+            /* Py_SETREF drops the old product once the new one is computed. */
+            Py_SETREF(prod, PyNumber_Multiply(prod, curr ? curr : Py_None));
+            if (!prod) {
+                return;  /* multiplication failed */
+            }
+        }
+
+        /* Fix: never hand a NULL output to PyNumber_Add; use None instead. */
+        PyObject *prev = *(PyObject **)dataptr[nop];
+        PyObject *sum = PyNumber_Add(prev ? prev : Py_None, prod);
+        Py_DECREF(prod);
+        if (!sum) {
+            return;  /* addition failed; the output slot is left untouched */
+        }
+        Py_XDECREF(prev);  /* X variant: prev may have been NULL */
+        *(PyObject **)dataptr[nop] = sum;
+        for (int i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];  /* advance every operand and the output */
+        }
+    }
+}
+
+#line 1044
+/* Object-dtype inner loop: for each of count elements, multiply the nop
+ * operands together and add the product into the output slot dataptr[nop]. */
+static void
+object_sum_of_products_contig_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    while (count--) {
+        /* Convention: a NULL slot stands for None (inputs and output alike). */
+        PyObject *prod = *(PyObject **)dataptr[0];
+        prod = prod ? prod : Py_None;
+        Py_INCREF(prod);  /* prod is an owned reference from here on */
+        for (int i = 1; i < nop; ++i) {
+            PyObject *curr = *(PyObject **)dataptr[i];
+            /* Py_SETREF drops the old product once the new one is computed. */
+            Py_SETREF(prod, PyNumber_Multiply(prod, curr ? curr : Py_None));
+            if (!prod) {
+                return;  /* multiplication failed */
+            }
+        }
+
+        /* Fix: never hand a NULL output to PyNumber_Add; use None instead. */
+        PyObject *prev = *(PyObject **)dataptr[nop];
+        PyObject *sum = PyNumber_Add(prev ? prev : Py_None, prod);
+        Py_DECREF(prod);
+        if (!sum) {
+            return;  /* addition failed; the output slot is left untouched */
+        }
+        Py_XDECREF(prev);  /* X variant: prev may have been NULL */
+        *(PyObject **)dataptr[nop] = sum;
+        for (int i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];  /* advance every operand and the output */
+        }
+    }
+}
+
+#line 1044
+/* Object-dtype inner loop: for each of count elements, multiply the nop
+ * operands together and add the product into the output slot dataptr[nop]. */
+static void
+object_sum_of_products_outstride0_any(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    while (count--) {
+        /* Convention: a NULL slot stands for None (inputs and output alike). */
+        PyObject *prod = *(PyObject **)dataptr[0];
+        prod = prod ? prod : Py_None;
+        Py_INCREF(prod);  /* prod is an owned reference from here on */
+        for (int i = 1; i < nop; ++i) {
+            PyObject *curr = *(PyObject **)dataptr[i];
+            /* Py_SETREF drops the old product once the new one is computed. */
+            Py_SETREF(prod, PyNumber_Multiply(prod, curr ? curr : Py_None));
+            if (!prod) {
+                return;  /* multiplication failed */
+            }
+        }
+
+        /* Fix: never hand a NULL output to PyNumber_Add; use None instead. */
+        PyObject *prev = *(PyObject **)dataptr[nop];
+        PyObject *sum = PyNumber_Add(prev ? prev : Py_None, prod);
+        Py_DECREF(prod);
+        if (!sum) {
+            return;  /* addition failed; the output slot is left untouched */
+        }
+        Py_XDECREF(prev);  /* X variant: prev may have been NULL */
+        *(PyObject **)dataptr[nop] = sum;
+        for (int i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];  /* advance every operand and the output */
+        }
+    }
+}
+
+#line 1044
+/* Object-dtype inner loop: for each of count elements, multiply the nop
+ * operands together and add the product into the output slot dataptr[nop]. */
+static void
+object_sum_of_products_outstride0_one(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    while (count--) {
+        /* Convention: a NULL slot stands for None (inputs and output alike). */
+        PyObject *prod = *(PyObject **)dataptr[0];
+        prod = prod ? prod : Py_None;
+        Py_INCREF(prod);  /* prod is an owned reference from here on */
+        for (int i = 1; i < nop; ++i) {
+            PyObject *curr = *(PyObject **)dataptr[i];
+            /* Py_SETREF drops the old product once the new one is computed. */
+            Py_SETREF(prod, PyNumber_Multiply(prod, curr ? curr : Py_None));
+            if (!prod) {
+                return;  /* multiplication failed */
+            }
+        }
+
+        /* Fix: never hand a NULL output to PyNumber_Add; use None instead. */
+        PyObject *prev = *(PyObject **)dataptr[nop];
+        PyObject *sum = PyNumber_Add(prev ? prev : Py_None, prod);
+        Py_DECREF(prod);
+        if (!sum) {
+            return;  /* addition failed; the output slot is left untouched */
+        }
+        Py_XDECREF(prev);  /* X variant: prev may have been NULL */
+        *(PyObject **)dataptr[nop] = sum;
+        for (int i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];  /* advance every operand and the output */
+        }
+    }
+}
+
+#line 1044
+/* Object-dtype inner loop: for each of count elements, multiply the nop
+ * operands together and add the product into the output slot dataptr[nop]. */
+static void
+object_sum_of_products_outstride0_two(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    while (count--) {
+        /* Convention: a NULL slot stands for None (inputs and output alike). */
+        PyObject *prod = *(PyObject **)dataptr[0];
+        prod = prod ? prod : Py_None;
+        Py_INCREF(prod);  /* prod is an owned reference from here on */
+        for (int i = 1; i < nop; ++i) {
+            PyObject *curr = *(PyObject **)dataptr[i];
+            /* Py_SETREF drops the old product once the new one is computed. */
+            Py_SETREF(prod, PyNumber_Multiply(prod, curr ? curr : Py_None));
+            if (!prod) {
+                return;  /* multiplication failed */
+            }
+        }
+
+        /* Fix: never hand a NULL output to PyNumber_Add; use None instead. */
+        PyObject *prev = *(PyObject **)dataptr[nop];
+        PyObject *sum = PyNumber_Add(prev ? prev : Py_None, prod);
+        Py_DECREF(prod);
+        if (!sum) {
+            return;  /* addition failed; the output slot is left untouched */
+        }
+        Py_XDECREF(prev);  /* X variant: prev may have been NULL */
+        *(PyObject **)dataptr[nop] = sum;
+        for (int i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];  /* advance every operand and the output */
+        }
+    }
+}
+
+#line 1044
+/* Object-dtype inner loop: for each of count elements, multiply the nop
+ * operands together and add the product into the output slot dataptr[nop]. */
+static void
+object_sum_of_products_outstride0_three(int nop, char **dataptr,
+                                npy_intp const *strides, npy_intp count)
+{
+    while (count--) {
+        /* Convention: a NULL slot stands for None (inputs and output alike). */
+        PyObject *prod = *(PyObject **)dataptr[0];
+        prod = prod ? prod : Py_None;
+        Py_INCREF(prod);  /* prod is an owned reference from here on */
+        for (int i = 1; i < nop; ++i) {
+            PyObject *curr = *(PyObject **)dataptr[i];
+            /* Py_SETREF drops the old product once the new one is computed. */
+            Py_SETREF(prod, PyNumber_Multiply(prod, curr ? curr : Py_None));
+            if (!prod) {
+                return;  /* multiplication failed */
+            }
+        }
+
+        /* Fix: never hand a NULL output to PyNumber_Add; use None instead. */
+        PyObject *prev = *(PyObject **)dataptr[nop];
+        PyObject *sum = PyNumber_Add(prev ? prev : Py_None, prod);
+        Py_DECREF(prod);
+        if (!sum) {
+            return;  /* addition failed; the output slot is left untouched */
+        }
+        Py_XDECREF(prev);  /* X variant: prev may have been NULL */
+        *(PyObject **)dataptr[nop] = sum;
+        for (int i = 0; i <= nop; ++i) {
+            dataptr[i] += strides[i];  /* advance every operand and the output */
+        }
+    }
+}
+
+
+/* These tables need to match up with the type enum: entries are positional, and NULL marks a slot with no specialized kernel */
+static sum_of_products_fn
+_contig_outstride0_unary_specialization_table[NPY_NTYPES] = {
+#line 1105
+#if 0  /* slot 0 (bool): disabled, entry is NULL */
+    &bool_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 1 (byte) */
+    &byte_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 2 (ubyte) */
+    &ubyte_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 3 (short) */
+    &short_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 4 (ushort) */
+    &ushort_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 5 (int) */
+    &int_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 6 (uint) */
+    &uint_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 7 (long) */
+    &long_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 8 (ulong) */
+    &ulong_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 9 (longlong) */
+    &longlong_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 10 (ulonglong) */
+    &ulonglong_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 11 (float) */
+    &float_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 12 (double) */
+    &double_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 13 (longdouble) */
+    &longdouble_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 14 (cfloat) */
+    &cfloat_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 15 (cdouble) */
+    &cdouble_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 16 (clongdouble) */
+    &clongdouble_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 0  /* slot 17 (object): disabled, entry is NULL */
+    &object_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 0  /* slot 18 (string): disabled, entry is NULL */
+    &string_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 0  /* slot 19 (unicode): disabled, entry is NULL */
+    &unicode_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 0  /* slot 20 (void): disabled, entry is NULL */
+    &void_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 0  /* slot 21 (datetime): disabled, entry is NULL */
+    &datetime_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 0  /* slot 22 (timedelta): disabled, entry is NULL */
+    &timedelta_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+#line 1105
+#if 1  /* slot 23 (half) */
+    &half_sum_of_products_contig_outstride0_one,
+#else
+    NULL,
+#endif
+
+}; /* End of _contig_outstride0_unary_specialization_table */
+
+static sum_of_products_fn _binary_specialization_table[NPY_NTYPES][5] = {
+#line 1136
+#if 0
+{
+    &bool_sum_of_products_stride0_contig_outstride0_two,
+    &bool_sum_of_products_stride0_contig_outcontig_two,
+    &bool_sum_of_products_contig_stride0_outstride0_two,
+    &bool_sum_of_products_contig_stride0_outcontig_two,
+    &bool_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 1
+{
+    &byte_sum_of_products_stride0_contig_outstride0_two,
+    &byte_sum_of_products_stride0_contig_outcontig_two,
+    &byte_sum_of_products_contig_stride0_outstride0_two,
+    &byte_sum_of_products_contig_stride0_outcontig_two,
+    &byte_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 1
+{
+    &ubyte_sum_of_products_stride0_contig_outstride0_two,
+    &ubyte_sum_of_products_stride0_contig_outcontig_two,
+    &ubyte_sum_of_products_contig_stride0_outstride0_two,
+    &ubyte_sum_of_products_contig_stride0_outcontig_two,
+    &ubyte_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 1
+{
+    &short_sum_of_products_stride0_contig_outstride0_two,
+    &short_sum_of_products_stride0_contig_outcontig_two,
+    &short_sum_of_products_contig_stride0_outstride0_two,
+    &short_sum_of_products_contig_stride0_outcontig_two,
+    &short_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 1
+{
+    &ushort_sum_of_products_stride0_contig_outstride0_two,
+    &ushort_sum_of_products_stride0_contig_outcontig_two,
+    &ushort_sum_of_products_contig_stride0_outstride0_two,
+    &ushort_sum_of_products_contig_stride0_outcontig_two,
+    &ushort_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 1
+{
+    &int_sum_of_products_stride0_contig_outstride0_two,
+    &int_sum_of_products_stride0_contig_outcontig_two,
+    &int_sum_of_products_contig_stride0_outstride0_two,
+    &int_sum_of_products_contig_stride0_outcontig_two,
+    &int_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 1
+{
+    &uint_sum_of_products_stride0_contig_outstride0_two,
+    &uint_sum_of_products_stride0_contig_outcontig_two,
+    &uint_sum_of_products_contig_stride0_outstride0_two,
+    &uint_sum_of_products_contig_stride0_outcontig_two,
+    &uint_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 1
+{
+    &long_sum_of_products_stride0_contig_outstride0_two,
+    &long_sum_of_products_stride0_contig_outcontig_two,
+    &long_sum_of_products_contig_stride0_outstride0_two,
+    &long_sum_of_products_contig_stride0_outcontig_two,
+    &long_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 1
+{
+    &ulong_sum_of_products_stride0_contig_outstride0_two,
+    &ulong_sum_of_products_stride0_contig_outcontig_two,
+    &ulong_sum_of_products_contig_stride0_outstride0_two,
+    &ulong_sum_of_products_contig_stride0_outcontig_two,
+    &ulong_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 1
+{
+    &longlong_sum_of_products_stride0_contig_outstride0_two,
+    &longlong_sum_of_products_stride0_contig_outcontig_two,
+    &longlong_sum_of_products_contig_stride0_outstride0_two,
+    &longlong_sum_of_products_contig_stride0_outcontig_two,
+    &longlong_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 1
+{
+    &ulonglong_sum_of_products_stride0_contig_outstride0_two,
+    &ulonglong_sum_of_products_stride0_contig_outcontig_two,
+    &ulonglong_sum_of_products_contig_stride0_outstride0_two,
+    &ulonglong_sum_of_products_contig_stride0_outcontig_two,
+    &ulonglong_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 1
+{
+    &float_sum_of_products_stride0_contig_outstride0_two,
+    &float_sum_of_products_stride0_contig_outcontig_two,
+    &float_sum_of_products_contig_stride0_outstride0_two,
+    &float_sum_of_products_contig_stride0_outcontig_two,
+    &float_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 1
+{
+    &double_sum_of_products_stride0_contig_outstride0_two,
+    &double_sum_of_products_stride0_contig_outcontig_two,
+    &double_sum_of_products_contig_stride0_outstride0_two,
+    &double_sum_of_products_contig_stride0_outcontig_two,
+    &double_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 1
+{
+    &longdouble_sum_of_products_stride0_contig_outstride0_two,
+    &longdouble_sum_of_products_stride0_contig_outcontig_two,
+    &longdouble_sum_of_products_contig_stride0_outstride0_two,
+    &longdouble_sum_of_products_contig_stride0_outcontig_two,
+    &longdouble_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 0
+{
+    &cfloat_sum_of_products_stride0_contig_outstride0_two,
+    &cfloat_sum_of_products_stride0_contig_outcontig_two,
+    &cfloat_sum_of_products_contig_stride0_outstride0_two,
+    &cfloat_sum_of_products_contig_stride0_outcontig_two,
+    &cfloat_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 0
+{
+    &cdouble_sum_of_products_stride0_contig_outstride0_two,
+    &cdouble_sum_of_products_stride0_contig_outcontig_two,
+    &cdouble_sum_of_products_contig_stride0_outstride0_two,
+    &cdouble_sum_of_products_contig_stride0_outcontig_two,
+    &cdouble_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 0
+{
+    &clongdouble_sum_of_products_stride0_contig_outstride0_two,
+    &clongdouble_sum_of_products_stride0_contig_outcontig_two,
+    &clongdouble_sum_of_products_contig_stride0_outstride0_two,
+    &clongdouble_sum_of_products_contig_stride0_outcontig_two,
+    &clongdouble_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 0
+{
+    &object_sum_of_products_stride0_contig_outstride0_two,
+    &object_sum_of_products_stride0_contig_outcontig_two,
+    &object_sum_of_products_contig_stride0_outstride0_two,
+    &object_sum_of_products_contig_stride0_outcontig_two,
+    &object_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 0
+{
+    &string_sum_of_products_stride0_contig_outstride0_two,
+    &string_sum_of_products_stride0_contig_outcontig_two,
+    &string_sum_of_products_contig_stride0_outstride0_two,
+    &string_sum_of_products_contig_stride0_outcontig_two,
+    &string_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 0
+{
+    &unicode_sum_of_products_stride0_contig_outstride0_two,
+    &unicode_sum_of_products_stride0_contig_outcontig_two,
+    &unicode_sum_of_products_contig_stride0_outstride0_two,
+    &unicode_sum_of_products_contig_stride0_outcontig_two,
+    &unicode_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 0
+{
+    &void_sum_of_products_stride0_contig_outstride0_two,
+    &void_sum_of_products_stride0_contig_outcontig_two,
+    &void_sum_of_products_contig_stride0_outstride0_two,
+    &void_sum_of_products_contig_stride0_outcontig_two,
+    &void_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 0
+{
+    &datetime_sum_of_products_stride0_contig_outstride0_two,
+    &datetime_sum_of_products_stride0_contig_outcontig_two,
+    &datetime_sum_of_products_contig_stride0_outstride0_two,
+    &datetime_sum_of_products_contig_stride0_outcontig_two,
+    &datetime_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 0
+{
+    &timedelta_sum_of_products_stride0_contig_outstride0_two,
+    &timedelta_sum_of_products_stride0_contig_outcontig_two,
+    &timedelta_sum_of_products_contig_stride0_outstride0_two,
+    &timedelta_sum_of_products_contig_stride0_outcontig_two,
+    &timedelta_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+#line 1136
+#if 1
+{
+    &half_sum_of_products_stride0_contig_outstride0_two,
+    &half_sum_of_products_stride0_contig_outcontig_two,
+    &half_sum_of_products_contig_stride0_outstride0_two,
+    &half_sum_of_products_contig_stride0_outcontig_two,
+    &half_sum_of_products_contig_contig_outstride0_two,
+},
+#else
+    {NULL, NULL, NULL, NULL, NULL},
+#endif
+
+}; /* End of _binary_specialization_table */
+
+static sum_of_products_fn _outstride0_specialized_table[NPY_NTYPES][4] = { /* [type_num][nop]; column 0 is the any-nop loop */
+#line 1173
+#if 1
+{
+    &bool_sum_of_products_outstride0_any,
+    &bool_sum_of_products_outstride0_one,
+    &bool_sum_of_products_outstride0_two,
+    &bool_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &byte_sum_of_products_outstride0_any,
+    &byte_sum_of_products_outstride0_one,
+    &byte_sum_of_products_outstride0_two,
+    &byte_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &ubyte_sum_of_products_outstride0_any,
+    &ubyte_sum_of_products_outstride0_one,
+    &ubyte_sum_of_products_outstride0_two,
+    &ubyte_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &short_sum_of_products_outstride0_any,
+    &short_sum_of_products_outstride0_one,
+    &short_sum_of_products_outstride0_two,
+    &short_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &ushort_sum_of_products_outstride0_any,
+    &ushort_sum_of_products_outstride0_one,
+    &ushort_sum_of_products_outstride0_two,
+    &ushort_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &int_sum_of_products_outstride0_any,
+    &int_sum_of_products_outstride0_one,
+    &int_sum_of_products_outstride0_two,
+    &int_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &uint_sum_of_products_outstride0_any,
+    &uint_sum_of_products_outstride0_one,
+    &uint_sum_of_products_outstride0_two,
+    &uint_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &long_sum_of_products_outstride0_any,
+    &long_sum_of_products_outstride0_one,
+    &long_sum_of_products_outstride0_two,
+    &long_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &ulong_sum_of_products_outstride0_any,
+    &ulong_sum_of_products_outstride0_one,
+    &ulong_sum_of_products_outstride0_two,
+    &ulong_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &longlong_sum_of_products_outstride0_any,
+    &longlong_sum_of_products_outstride0_one,
+    &longlong_sum_of_products_outstride0_two,
+    &longlong_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &ulonglong_sum_of_products_outstride0_any,
+    &ulonglong_sum_of_products_outstride0_one,
+    &ulonglong_sum_of_products_outstride0_two,
+    &ulonglong_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &float_sum_of_products_outstride0_any,
+    &float_sum_of_products_outstride0_one,
+    &float_sum_of_products_outstride0_two,
+    &float_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &double_sum_of_products_outstride0_any,
+    &double_sum_of_products_outstride0_one,
+    &double_sum_of_products_outstride0_two,
+    &double_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &longdouble_sum_of_products_outstride0_any,
+    &longdouble_sum_of_products_outstride0_one,
+    &longdouble_sum_of_products_outstride0_two,
+    &longdouble_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &cfloat_sum_of_products_outstride0_any,
+    &cfloat_sum_of_products_outstride0_one,
+    &cfloat_sum_of_products_outstride0_two,
+    &cfloat_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &cdouble_sum_of_products_outstride0_any,
+    &cdouble_sum_of_products_outstride0_one,
+    &cdouble_sum_of_products_outstride0_two,
+    &cdouble_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &clongdouble_sum_of_products_outstride0_any,
+    &clongdouble_sum_of_products_outstride0_one,
+    &clongdouble_sum_of_products_outstride0_two,
+    &clongdouble_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &object_sum_of_products_outstride0_any,
+    &object_sum_of_products_outstride0_one,
+    &object_sum_of_products_outstride0_two,
+    &object_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 0
+{
+    &string_sum_of_products_outstride0_any,
+    &string_sum_of_products_outstride0_one,
+    &string_sum_of_products_outstride0_two,
+    &string_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 0
+{
+    &unicode_sum_of_products_outstride0_any,
+    &unicode_sum_of_products_outstride0_one,
+    &unicode_sum_of_products_outstride0_two,
+    &unicode_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 0
+{
+    &void_sum_of_products_outstride0_any,
+    &void_sum_of_products_outstride0_one,
+    &void_sum_of_products_outstride0_two,
+    &void_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 0
+{
+    &datetime_sum_of_products_outstride0_any,
+    &datetime_sum_of_products_outstride0_one,
+    &datetime_sum_of_products_outstride0_two,
+    &datetime_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 0
+{
+    &timedelta_sum_of_products_outstride0_any,
+    &timedelta_sum_of_products_outstride0_one,
+    &timedelta_sum_of_products_outstride0_two,
+    &timedelta_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1173
+#if 1
+{
+    &half_sum_of_products_outstride0_any,
+    &half_sum_of_products_outstride0_one,
+    &half_sum_of_products_outstride0_two,
+    &half_sum_of_products_outstride0_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+}; /* End of _outstride0_specialized_table */
+
+static sum_of_products_fn _allcontig_specialized_table[NPY_NTYPES][4] = { /* [type_num][nop]; column 0 is the any-nop loop */
+#line 1209
+#if 1
+{
+    &bool_sum_of_products_contig_any,
+    &bool_sum_of_products_contig_one,
+    &bool_sum_of_products_contig_two,
+    &bool_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &byte_sum_of_products_contig_any,
+    &byte_sum_of_products_contig_one,
+    &byte_sum_of_products_contig_two,
+    &byte_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &ubyte_sum_of_products_contig_any,
+    &ubyte_sum_of_products_contig_one,
+    &ubyte_sum_of_products_contig_two,
+    &ubyte_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &short_sum_of_products_contig_any,
+    &short_sum_of_products_contig_one,
+    &short_sum_of_products_contig_two,
+    &short_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &ushort_sum_of_products_contig_any,
+    &ushort_sum_of_products_contig_one,
+    &ushort_sum_of_products_contig_two,
+    &ushort_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &int_sum_of_products_contig_any,
+    &int_sum_of_products_contig_one,
+    &int_sum_of_products_contig_two,
+    &int_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &uint_sum_of_products_contig_any,
+    &uint_sum_of_products_contig_one,
+    &uint_sum_of_products_contig_two,
+    &uint_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &long_sum_of_products_contig_any,
+    &long_sum_of_products_contig_one,
+    &long_sum_of_products_contig_two,
+    &long_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &ulong_sum_of_products_contig_any,
+    &ulong_sum_of_products_contig_one,
+    &ulong_sum_of_products_contig_two,
+    &ulong_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &longlong_sum_of_products_contig_any,
+    &longlong_sum_of_products_contig_one,
+    &longlong_sum_of_products_contig_two,
+    &longlong_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &ulonglong_sum_of_products_contig_any,
+    &ulonglong_sum_of_products_contig_one,
+    &ulonglong_sum_of_products_contig_two,
+    &ulonglong_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &float_sum_of_products_contig_any,
+    &float_sum_of_products_contig_one,
+    &float_sum_of_products_contig_two,
+    &float_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &double_sum_of_products_contig_any,
+    &double_sum_of_products_contig_one,
+    &double_sum_of_products_contig_two,
+    &double_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &longdouble_sum_of_products_contig_any,
+    &longdouble_sum_of_products_contig_one,
+    &longdouble_sum_of_products_contig_two,
+    &longdouble_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &cfloat_sum_of_products_contig_any,
+    &cfloat_sum_of_products_contig_one,
+    &cfloat_sum_of_products_contig_two,
+    &cfloat_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &cdouble_sum_of_products_contig_any,
+    &cdouble_sum_of_products_contig_one,
+    &cdouble_sum_of_products_contig_two,
+    &cdouble_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &clongdouble_sum_of_products_contig_any,
+    &clongdouble_sum_of_products_contig_one,
+    &clongdouble_sum_of_products_contig_two,
+    &clongdouble_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &object_sum_of_products_contig_any,
+    &object_sum_of_products_contig_one,
+    &object_sum_of_products_contig_two,
+    &object_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 0
+{
+    &string_sum_of_products_contig_any,
+    &string_sum_of_products_contig_one,
+    &string_sum_of_products_contig_two,
+    &string_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 0
+{
+    &unicode_sum_of_products_contig_any,
+    &unicode_sum_of_products_contig_one,
+    &unicode_sum_of_products_contig_two,
+    &unicode_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 0
+{
+    &void_sum_of_products_contig_any,
+    &void_sum_of_products_contig_one,
+    &void_sum_of_products_contig_two,
+    &void_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 0
+{
+    &datetime_sum_of_products_contig_any,
+    &datetime_sum_of_products_contig_one,
+    &datetime_sum_of_products_contig_two,
+    &datetime_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 0
+{
+    &timedelta_sum_of_products_contig_any,
+    &timedelta_sum_of_products_contig_one,
+    &timedelta_sum_of_products_contig_two,
+    &timedelta_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1209
+#if 1
+{
+    &half_sum_of_products_contig_any,
+    &half_sum_of_products_contig_one,
+    &half_sum_of_products_contig_two,
+    &half_sum_of_products_contig_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+}; /* End of _allcontig_specialized_table */
+
+static sum_of_products_fn _unspecialized_table[NPY_NTYPES][4] = { /* [type_num][nop]; column 0 is the any-nop loop */
+#line 1245
+#if 1
+{
+    &bool_sum_of_products_any,
+    &bool_sum_of_products_one,
+    &bool_sum_of_products_two,
+    &bool_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &byte_sum_of_products_any,
+    &byte_sum_of_products_one,
+    &byte_sum_of_products_two,
+    &byte_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &ubyte_sum_of_products_any,
+    &ubyte_sum_of_products_one,
+    &ubyte_sum_of_products_two,
+    &ubyte_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &short_sum_of_products_any,
+    &short_sum_of_products_one,
+    &short_sum_of_products_two,
+    &short_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &ushort_sum_of_products_any,
+    &ushort_sum_of_products_one,
+    &ushort_sum_of_products_two,
+    &ushort_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &int_sum_of_products_any,
+    &int_sum_of_products_one,
+    &int_sum_of_products_two,
+    &int_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &uint_sum_of_products_any,
+    &uint_sum_of_products_one,
+    &uint_sum_of_products_two,
+    &uint_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &long_sum_of_products_any,
+    &long_sum_of_products_one,
+    &long_sum_of_products_two,
+    &long_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &ulong_sum_of_products_any,
+    &ulong_sum_of_products_one,
+    &ulong_sum_of_products_two,
+    &ulong_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &longlong_sum_of_products_any,
+    &longlong_sum_of_products_one,
+    &longlong_sum_of_products_two,
+    &longlong_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &ulonglong_sum_of_products_any,
+    &ulonglong_sum_of_products_one,
+    &ulonglong_sum_of_products_two,
+    &ulonglong_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &float_sum_of_products_any,
+    &float_sum_of_products_one,
+    &float_sum_of_products_two,
+    &float_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &double_sum_of_products_any,
+    &double_sum_of_products_one,
+    &double_sum_of_products_two,
+    &double_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &longdouble_sum_of_products_any,
+    &longdouble_sum_of_products_one,
+    &longdouble_sum_of_products_two,
+    &longdouble_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &cfloat_sum_of_products_any,
+    &cfloat_sum_of_products_one,
+    &cfloat_sum_of_products_two,
+    &cfloat_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &cdouble_sum_of_products_any,
+    &cdouble_sum_of_products_one,
+    &cdouble_sum_of_products_two,
+    &cdouble_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &clongdouble_sum_of_products_any,
+    &clongdouble_sum_of_products_one,
+    &clongdouble_sum_of_products_two,
+    &clongdouble_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &object_sum_of_products_any,
+    &object_sum_of_products_one,
+    &object_sum_of_products_two,
+    &object_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 0
+{
+    &string_sum_of_products_any,
+    &string_sum_of_products_one,
+    &string_sum_of_products_two,
+    &string_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 0
+{
+    &unicode_sum_of_products_any,
+    &unicode_sum_of_products_one,
+    &unicode_sum_of_products_two,
+    &unicode_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 0
+{
+    &void_sum_of_products_any,
+    &void_sum_of_products_one,
+    &void_sum_of_products_two,
+    &void_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 0
+{
+    &datetime_sum_of_products_any,
+    &datetime_sum_of_products_one,
+    &datetime_sum_of_products_two,
+    &datetime_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 0
+{
+    &timedelta_sum_of_products_any,
+    &timedelta_sum_of_products_one,
+    &timedelta_sum_of_products_two,
+    &timedelta_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+#line 1245
+#if 1
+{
+    &half_sum_of_products_any,
+    &half_sum_of_products_one,
+    &half_sum_of_products_two,
+    &half_sum_of_products_three
+},
+#else
+    {NULL, NULL, NULL, NULL},
+#endif
+
+}; /* End of _unspecialized_table */
+
+/* Pick the einsum inner loop for nop inputs of type_num (NULL if none). */
+NPY_VISIBILITY_HIDDEN sum_of_products_fn
+get_sum_of_products_function(int nop, int type_num,
+                             npy_intp itemsize, npy_intp const *fixed_strides)
+{
+    int iop;
+    if (type_num < 0 || type_num >= NPY_NTYPES) {
+        return NULL;  /* id has no row in the per-type tables */
+    }
+
+    /* One contiguous input reduced into a stride-0 output */
+    if (nop == 1 && fixed_strides[0] == itemsize && fixed_strides[1] == 0) {
+        sum_of_products_fn ret =
+            _contig_outstride0_unary_specialization_table[type_num];
+        if (ret != NULL) {
+            return ret;
+        }
+    }
+
+    /* nop of 2 has more specializations */
+    if (nop == 2) {
+        /* Encode strides: 0 -> 0, contiguous -> 4 (in0), 2 (in1), 1 (out); */
+        /* any other stride adds 8, which fails the 2..6 range test below */
+        int code = (fixed_strides[0] == 0) ? 0 :
+                    (fixed_strides[0] == itemsize) ? 2*2*1 : 8;
+        code += (fixed_strides[1] == 0) ? 0 :
+                    (fixed_strides[1] == itemsize) ? 2*1 : 8;
+        code += (fixed_strides[2] == 0) ? 0 :
+                    (fixed_strides[2] == itemsize) ? 1 : 8;
+        if (code >= 2 && code < 7) {
+            sum_of_products_fn ret =
+                        _binary_specialization_table[type_num][code-2];
+            if (ret != NULL) {
+                return ret;
+            }
+        }
+    }
+
+    /* Inner loop with an output stride of 0 (the output is entry nop) */
+    if (fixed_strides[nop] == 0) {
+        return _outstride0_specialized_table[type_num][nop <= 3 ? nop : 0];
+    }
+
+    /* Find the first operand (inputs, then output) that is not contiguous */
+    for (iop = 0; iop < nop + 1; ++iop) {
+        if (fixed_strides[iop] != itemsize) {
+            break;
+        }
+    }
+
+    /* The scan ran off the end, so every operand is contiguous */
+    if (iop == nop + 1) {
+        return _allcontig_specialized_table[type_num][nop <= 3 ? nop : 0];
+    }
+
+    /* None of the above specializations caught it, general loops */
+    return _unspecialized_table[type_num][nop <= 3 ? nop : 0];
+}
+
diff --git a/numpy/core/src/_generated/funcs.inc b/numpy/core/src/_generated/funcs.inc
new file mode 100644
index 000000000000..35efc10d81c7
--- /dev/null
+++ b/numpy/core/src/_generated/funcs.inc
@@ -0,0 +1,867 @@
+#line 1 "numpy/core/src/umath/funcs.inc.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/* -*- c -*- */
+
+/*
+ * This file is for the definitions of the non-c99 functions used in ufuncs.
+ * All the complex ufuncs are defined here along with a smattering of real and
+ * object functions.
+ */
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include "npy_pycompat.h"
+#include "npy_import.h"
+
+
+/*
+ *****************************************************************************
+ **                        PYTHON OBJECT FUNCTIONS                          **
+ *****************************************************************************
+ */
+
+static PyObject *
+Py_square(PyObject *o)
+{
+    return PyNumber_Multiply(o, o);  /* o * o through the number protocol */
+}
+
+static PyObject *
+Py_get_one(PyObject *NPY_UNUSED(o))
+{
+    return PyLong_FromLong(1);  /* the argument is ignored; always int 1 */
+}
+
+/* Object-dtype reciprocal: true-divides the Python int 1 by o. */
+static PyObject *
+Py_reciprocal(PyObject *o)
+{
+    PyObject *quotient = NULL;
+    PyObject *unit = PyLong_FromLong(1);
+    if (unit != NULL) {
+        quotient = PyNumber_TrueDivide(unit, o);
+        /* the temporary numerator is released whether or not that worked */
+        Py_DECREF(unit);
+    }
+    return quotient;
+}
+
+/*
+ * Binary form of PyNumber_Power: the third argument is always Py_None.
+ */
+static PyObject *
+npy_ObjectPower(PyObject *x, PyObject *y)
+{
+    return PyNumber_Power(x, y, Py_None);
+}
+
+#line 59
+/*
+ * Object-dtype maximum: a new reference to i1 when i1 >= i2 holds,
+ * otherwise a new reference to i2.  NULL if the comparison failed.
+ */
+static PyObject *
+npy_ObjectMax(PyObject *i1, PyObject *i2)
+{
+    PyObject *winner;
+    int i1_ge_i2 = PyObject_RichCompareBool(i1, i2, Py_GE);
+
+    /* a negative result means the comparison did not complete */
+    if (i1_ge_i2 < 0) {
+        return NULL;
+    }
+    /* only an explicit true result keeps i1 */
+    winner = (i1_ge_i2 == 1) ? i1 : i2;
+    Py_INCREF(winner);
+    return winner;
+}
+
+#line 59
+/*
+ * Object-dtype minimum: a new reference to i1 when i1 <= i2 holds,
+ * otherwise a new reference to i2.  NULL if the comparison failed.
+ */
+static PyObject *
+npy_ObjectMin(PyObject *i1, PyObject *i2)
+{
+    PyObject *winner;
+    int i1_le_i2 = PyObject_RichCompareBool(i1, i2, Py_LE);
+
+    /* a negative result means the comparison did not complete */
+    if (i1_le_i2 < 0) {
+        return NULL;
+    }
+    /* only an explicit true result keeps i1 */
+    winner = (i1_le_i2 == 1) ? i1 : i2;
+    Py_INCREF(winner);
+    return winner;
+}
+
+
+/*
+ * Emulates Python's 'a or b' behavior: returns a new reference to i1 if it
+ * is truthy, else to i2.  A NULL operand yields the other operand (which
+ * may itself be NULL); NULL is also returned if the truth test fails.
+ */
+static PyObject *
+npy_ObjectLogicalOr(PyObject *i1, PyObject *i2)
+{
+    PyObject *chosen;
+    int truth;
+
+    if (i1 == NULL) {
+        Py_XINCREF(i2);
+        return i2;
+    }
+    if (i2 == NULL) {
+        Py_INCREF(i1);
+        return i1;
+    }
+    truth = PyObject_IsTrue(i1);
+    if (truth == -1) {
+        return NULL;
+    }
+    chosen = truth ? i1 : i2;
+    Py_INCREF(chosen);
+    return chosen;
+}
+
+/*
+ * Emulates Python's 'a and b' behavior: a new reference to i1 when it is
+ * falsy, otherwise to i2.  Returns NULL if either operand is NULL or the
+ * truth test fails.
+ */
+static PyObject *
+npy_ObjectLogicalAnd(PyObject *i1, PyObject *i2)
+{
+    PyObject *chosen;
+    int truth;
+
+    if (i1 == NULL || i2 == NULL) {
+        return NULL;
+    }
+
+    truth = PyObject_IsTrue(i1);
+    if (truth == -1) {
+        return NULL;
+    }
+
+    /* a falsy first operand is the result; otherwise the second one is */
+    chosen = truth ? i2 : i1;
+    Py_INCREF(chosen);
+    return chosen;
+}
+
+
+/*
+ * Emulates Python's 'not b' behavior: a new reference to Py_True when
+ * PyObject_Not(i1) is true and to Py_False otherwise.  A NULL input or a
+ * failed test gives NULL.
+ */
+static PyObject *
+npy_ObjectLogicalNot(PyObject *i1)
+{
+    PyObject *answer;
+    int negated;
+
+    if (i1 == NULL) {
+        return NULL;
+    }
+    negated = PyObject_Not(i1);
+    if (negated == -1) {
+        return NULL;
+    }
+    answer = negated ? Py_True : Py_False;
+    Py_INCREF(answer);
+    return answer;
+}
+
+/* Object-dtype floor: calls the cached math.floor on obj. */
+static PyObject *
+npy_ObjectFloor(PyObject *obj) {
+    static PyObject *cached_floor = NULL;
+    npy_cache_import("math", "floor", &cached_floor);
+    if (!cached_floor) {
+        return NULL;  /* nothing was cached by the import */
+    }
+    return PyObject_CallFunction(cached_floor, "O", obj);
+}
+
+/* Object-dtype ceil: calls the cached math.ceil on obj. */
+static PyObject *
+npy_ObjectCeil(PyObject *obj) {
+    static PyObject *cached_ceil = NULL;
+    npy_cache_import("math", "ceil", &cached_ceil);
+    if (!cached_ceil) {
+        return NULL;  /* nothing was cached by the import */
+    }
+    return PyObject_CallFunction(cached_ceil, "O", obj);
+}
+
+/* Object-dtype trunc: calls the cached math.trunc on obj. */
+static PyObject *
+npy_ObjectTrunc(PyObject *obj) {
+    static PyObject *cached_trunc = NULL;
+    npy_cache_import("math", "trunc", &cached_trunc);
+    if (!cached_trunc) {
+        return NULL;  /* nothing was cached by the import */
+    }
+    return PyObject_CallFunction(cached_trunc, "O", obj);
+}
+
+/*
+ * Object-dtype gcd.  math.gcd is tried first; when that call fails, its
+ * error is cleared and numpy.core._internal._gcd is used instead, with the
+ * result passed through PyNumber_Absolute.  Returns NULL on failure.
+ */
+static PyObject *
+npy_ObjectGCD(PyObject *i1, PyObject *i2)
+{
+    static PyObject *fast_gcd = NULL;      /* cached math.gcd */
+    static PyObject *fallback_gcd = NULL;  /* cached numpy.core._internal._gcd */
+    PyObject *raw, *result;
+
+    /* first choice: math.gcd, when it accepts the provided types */
+    npy_cache_import("math", "gcd", &fast_gcd);
+    if (!fast_gcd) {
+        return NULL;
+    }
+    result = PyObject_CallFunction(fast_gcd, "OO", i1, i2);
+    if (result) {
+        return result;
+    }
+    /* drop the pending error before trying the pure-python version */
+    PyErr_Clear();
+
+    npy_cache_import("numpy.core._internal", "_gcd", &fallback_gcd);
+    if (!fallback_gcd) {
+        return NULL;
+    }
+    raw = PyObject_CallFunction(fallback_gcd, "OO", i1, i2);
+    if (!raw) {
+        return NULL;
+    }
+
+    /* _gcd has some unusual behaviour regarding sign, so normalise it */
+    result = PyNumber_Absolute(raw);
+    Py_DECREF(raw);
+    return result;
+}
+
+/* Object-dtype lcm(a, b) = abs(a // gcd(a, b) * b), with lcm(0, 0) = 0. */
+static PyObject *
+npy_ObjectLCM(PyObject *i1, PyObject *i2)
+{
+    PyObject *tmp, *gcd = npy_ObjectGCD(i1, i2);
+    int gcd_nonzero = (gcd == NULL) ? -1 : PyObject_IsTrue(gcd);
+
+    if (gcd_nonzero <= 0) {
+        /* failure -> NULL; zero gcd -> return that zero, never divide by it */
+        if (gcd_nonzero < 0) {
+            Py_CLEAR(gcd);
+        }
+        return gcd;
+    }
+    /* Floor divide preserves integer types; the division has no remainder */
+    tmp = PyNumber_FloorDivide(i1, gcd);
+    Py_DECREF(gcd);
+    if (tmp == NULL) {
+        return NULL;
+    }
+    Py_SETREF(tmp, PyNumber_Multiply(tmp, i2));
+    if (tmp == NULL) {
+        return NULL;
+    }
+    /* even though we fix gcd to be positive, we need to do it again here */
+    Py_SETREF(tmp, PyNumber_Absolute(tmp));
+    return tmp;
+}
+
+
+static PyObject *
+npy_ObjectClip(PyObject *arr, PyObject *min, PyObject *max) {
+    PyObject *clipped = NULL, *lower = npy_ObjectMax(arr, min);
+    if (lower != NULL) {
+        clipped = npy_ObjectMin(lower, max);  /* then cap the result at max */
+        Py_DECREF(lower);
+    }
+    return clipped;
+}
+
+/*
+ *****************************************************************************
+ **                           COMPLEX FUNCTIONS                             **
+ *****************************************************************************
+ */
+
+
+/*
+ * Don't pass structures between functions (only pointers) because how
+ * structures are passed is compiler dependent and could cause segfaults if
+ * umath_ufunc_object.inc is compiled with a different compiler than an
+ * extension that makes use of the UFUNC API
+ */
+
+#line 290
+
+/* r = -a for single-precision complex: negates both components. */
+static void
+nc_negf(npy_cfloat *a, npy_cfloat *r)
+{
+    r->real = -(a->real);
+    r->imag = -(a->imag);
+}
+
+/* Unary plus: *r receives +(*a), one component at a time. */
+static void
+nc_posf(npy_cfloat *a, npy_cfloat *r)
+{
+    r->real = +(a->real);
+    r->imag = +(a->imag);
+}
+
+/* Pointer-based wrapper: *r = npy_csqrtf(*x). */
+static void
+nc_sqrtf(npy_cfloat *x, npy_cfloat *r)
+{
+    *r = npy_csqrtf(*x);
+}
+
+static void
+nc_rintf(npy_cfloat *x, npy_cfloat *r)
+{
+    r->real = npy_rintf(x->real);   /* each component is rounded on its own */
+    r->imag = npy_rintf(x->imag);
+}
+
+/* Pointer-based wrapper: *r = npy_clogf(*x). */
+static void
+nc_logf(npy_cfloat *x, npy_cfloat *r)
+{
+    *r = npy_clogf(*x);
+}
+
+/* log(1 + x): real part is log(hypot(re + 1, im)), imaginary part is atan2. */
+static void
+nc_log1pf(npy_cfloat *x, npy_cfloat *r)
+{
+    npy_float modulus = npy_hypotf(x->real + 1, x->imag);
+    r->imag = npy_atan2f(x->imag, x->real + 1);
+    r->real = npy_logf(modulus);
+}
+
+/* Pointer-based wrapper: *r = npy_cexpf(*x). */
+static void
+nc_expf(npy_cfloat *x, npy_cfloat *r)
+{
+    *r = npy_cexpf(*x);
+}
+
+/* Scales both components of *x by NPY_LOGE2f and hands the result to nc_expf. */
+static void
+nc_exp2f(npy_cfloat *x, npy_cfloat *r)
+{
+    npy_cfloat scaled;
+    scaled.real = x->real*NPY_LOGE2f;
+    scaled.imag = x->imag*NPY_LOGE2f;
+    nc_expf(&scaled, r);
+}
+
+/* exp(x) - 1: real part is expm1(re)*cos(im) - 2*sin(im/2)^2, imag is exp(re)*sin(im). */
+static void
+nc_expm1f(npy_cfloat *x, npy_cfloat *r)
+{
+    npy_float half_sin = npy_sinf(x->imag / 2);
+    r->real = npy_expm1f(x->real) * npy_cosf(x->imag) - 2 * half_sin * half_sin;
+    r->imag = npy_expf(x->real) * npy_sinf(x->imag);
+}
+
+/* Pointer-based wrapper: *r = npy_cpowf(*a, *b). */
+static void
+nc_powf(npy_cfloat *a, npy_cfloat *b, npy_cfloat *r)
+{
+    *r = npy_cpowf(*a, *b);
+}
+
+/* Pointer-based wrapper: *r = npy_cacosf(*x). */
+static void
+nc_acosf(npy_cfloat *x, npy_cfloat *r)
+{
+    *r = npy_cacosf(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_cacoshf(*x). */
+static void
+nc_acoshf(npy_cfloat *x, npy_cfloat *r)
+{
+    *r = npy_cacoshf(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_casinf(*x). */
+static void
+nc_asinf(npy_cfloat *x, npy_cfloat *r)
+{
+    *r = npy_casinf(*x);
+}
+
+
+/* Pointer-based wrapper: *r = npy_casinhf(*x). */
+static void
+nc_asinhf(npy_cfloat *x, npy_cfloat *r)
+{
+    *r = npy_casinhf(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_catanf(*x). */
+static void
+nc_atanf(npy_cfloat *x, npy_cfloat *r)
+{
+    *r = npy_catanf(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_catanhf(*x). */
+static void
+nc_atanhf(npy_cfloat *x, npy_cfloat *r)
+{
+    *r = npy_catanhf(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_ccosf(*x). */
+static void
+nc_cosf(npy_cfloat *x, npy_cfloat *r)
+{
+    *r = npy_ccosf(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_ccoshf(*x). */
+static void
+nc_coshf(npy_cfloat *x, npy_cfloat *r)
+{
+    *r = npy_ccoshf(*x);
+}
+
+/* Runs nc_logf, then scales both components of the result by NPY_LOG10Ef. */
+static void
+nc_log10f(npy_cfloat *x, npy_cfloat *r)
+{
+    nc_logf(x, r);
+    r->real *= NPY_LOG10Ef;
+    r->imag *= NPY_LOG10Ef;
+}
+
+/* Runs nc_logf, then scales both components of the result by NPY_LOG2Ef. */
+static void
+nc_log2f(npy_cfloat *x, npy_cfloat *r)
+{
+    nc_logf(x, r);
+    r->real *= NPY_LOG2Ef;
+    r->imag *= NPY_LOG2Ef;
+}
+
+/* Pointer-based wrapper: *r = npy_csinf(*x). */
+static void
+nc_sinf(npy_cfloat *x, npy_cfloat *r)
+{
+    *r = npy_csinf(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_csinhf(*x). */
+static void
+nc_sinhf(npy_cfloat *x, npy_cfloat *r)
+{
+    *r = npy_csinhf(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_ctanf(*x). */
+static void
+nc_tanf(npy_cfloat *x, npy_cfloat *r)
+{
+    *r = npy_ctanf(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_ctanhf(*x). */
+static void
+nc_tanhf(npy_cfloat *x, npy_cfloat *r)
+{
+    *r = npy_ctanhf(*x);
+}
+
+
+#line 290
+
+/* Complex negation: *r receives -(*a), one component at a time. */
+static void
+nc_neg(npy_cdouble *a, npy_cdouble *r)
+{
+    r->real = -(a->real);
+    r->imag = -(a->imag);
+}
+
+/* Unary plus: *r receives +(*a), one component at a time. */
+static void
+nc_pos(npy_cdouble *a, npy_cdouble *r)
+{
+    r->real = +(a->real);
+    r->imag = +(a->imag);
+}
+
+/* Pointer-based wrapper: *r = npy_csqrt(*x). */
+static void
+nc_sqrt(npy_cdouble *x, npy_cdouble *r)
+{
+    *r = npy_csqrt(*x);
+}
+
+static void
+nc_rint(npy_cdouble *x, npy_cdouble *r)
+{
+    r->real = npy_rint(x->real);    /* each component is rounded on its own */
+    r->imag = npy_rint(x->imag);
+}
+
+/* Pointer-based wrapper: *r = npy_clog(*x). */
+static void
+nc_log(npy_cdouble *x, npy_cdouble *r)
+{
+    *r = npy_clog(*x);
+}
+
+/* log(1 + x): real part is log(hypot(re + 1, im)), imaginary part is atan2. */
+static void
+nc_log1p(npy_cdouble *x, npy_cdouble *r)
+{
+    npy_double modulus = npy_hypot(x->real + 1, x->imag);
+    r->imag = npy_atan2(x->imag, x->real + 1);
+    r->real = npy_log(modulus);
+}
+
+/* Pointer-based wrapper: *r = npy_cexp(*x). */
+static void
+nc_exp(npy_cdouble *x, npy_cdouble *r)
+{
+    *r = npy_cexp(*x);
+}
+
+/* Scales both components of *x by NPY_LOGE2 and hands the result to nc_exp. */
+static void
+nc_exp2(npy_cdouble *x, npy_cdouble *r)
+{
+    npy_cdouble scaled;
+    scaled.real = x->real*NPY_LOGE2;
+    scaled.imag = x->imag*NPY_LOGE2;
+    nc_exp(&scaled, r);
+}
+
+/* exp(x) - 1: real part is expm1(re)*cos(im) - 2*sin(im/2)^2, imag is exp(re)*sin(im). */
+static void
+nc_expm1(npy_cdouble *x, npy_cdouble *r)
+{
+    npy_double half_sin = npy_sin(x->imag / 2);
+    r->real = npy_expm1(x->real) * npy_cos(x->imag) - 2 * half_sin * half_sin;
+    r->imag = npy_exp(x->real) * npy_sin(x->imag);
+}
+
+/* Pointer-based wrapper: *r = npy_cpow(*a, *b). */
+static void
+nc_pow(npy_cdouble *a, npy_cdouble *b, npy_cdouble *r)
+{
+    *r = npy_cpow(*a, *b);
+}
+
+/* Pointer-based wrapper: *r = npy_cacos(*x). */
+static void
+nc_acos(npy_cdouble *x, npy_cdouble *r)
+{
+    *r = npy_cacos(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_cacosh(*x). */
+static void
+nc_acosh(npy_cdouble *x, npy_cdouble *r)
+{
+    *r = npy_cacosh(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_casin(*x). */
+static void
+nc_asin(npy_cdouble *x, npy_cdouble *r)
+{
+    *r = npy_casin(*x);
+}
+
+
+/* Pointer-based wrapper: *r = npy_casinh(*x). */
+static void
+nc_asinh(npy_cdouble *x, npy_cdouble *r)
+{
+    *r = npy_casinh(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_catan(*x). */
+static void
+nc_atan(npy_cdouble *x, npy_cdouble *r)
+{
+    *r = npy_catan(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_catanh(*x). */
+static void
+nc_atanh(npy_cdouble *x, npy_cdouble *r)
+{
+    *r = npy_catanh(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_ccos(*x). */
+static void
+nc_cos(npy_cdouble *x, npy_cdouble *r)
+{
+    *r = npy_ccos(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_ccosh(*x). */
+static void
+nc_cosh(npy_cdouble *x, npy_cdouble *r)
+{
+    *r = npy_ccosh(*x);
+}
+
+/* Runs nc_log, then scales both components of the result by NPY_LOG10E. */
+static void
+nc_log10(npy_cdouble *x, npy_cdouble *r)
+{
+    nc_log(x, r);
+    r->real *= NPY_LOG10E;
+    r->imag *= NPY_LOG10E;
+}
+
+/* Runs nc_log, then scales both components of the result by NPY_LOG2E. */
+static void
+nc_log2(npy_cdouble *x, npy_cdouble *r)
+{
+    nc_log(x, r);
+    r->real *= NPY_LOG2E;
+    r->imag *= NPY_LOG2E;
+}
+
+/* Pointer-based wrapper: *r = npy_csin(*x). */
+static void
+nc_sin(npy_cdouble *x, npy_cdouble *r)
+{
+    *r = npy_csin(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_csinh(*x). */
+static void
+nc_sinh(npy_cdouble *x, npy_cdouble *r)
+{
+    *r = npy_csinh(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_ctan(*x). */
+static void
+nc_tan(npy_cdouble *x, npy_cdouble *r)
+{
+    *r = npy_ctan(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_ctanh(*x). */
+static void
+nc_tanh(npy_cdouble *x, npy_cdouble *r)
+{
+    *r = npy_ctanh(*x);
+}
+
+
+#line 290
+
+/* Complex negation: *r receives -(*a), one component at a time. */
+static void
+nc_negl(npy_clongdouble *a, npy_clongdouble *r)
+{
+    r->real = -(a->real);
+    r->imag = -(a->imag);
+}
+
+/* Unary plus: *r receives +(*a), one component at a time. */
+static void
+nc_posl(npy_clongdouble *a, npy_clongdouble *r)
+{
+    r->real = +(a->real);
+    r->imag = +(a->imag);
+}
+
+/* Pointer-based wrapper: *r = npy_csqrtl(*x). */
+static void
+nc_sqrtl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    *r = npy_csqrtl(*x);
+}
+
+static void
+nc_rintl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    r->real = npy_rintl(x->real);   /* each component is rounded on its own */
+    r->imag = npy_rintl(x->imag);
+}
+
+/* Pointer-based wrapper: *r = npy_clogl(*x). */
+static void
+nc_logl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    *r = npy_clogl(*x);
+}
+
+/* log(1 + x): real part is log(hypot(re + 1, im)), imaginary part is atan2. */
+static void
+nc_log1pl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    npy_longdouble modulus = npy_hypotl(x->real + 1, x->imag);
+    r->imag = npy_atan2l(x->imag, x->real + 1);
+    r->real = npy_logl(modulus);
+}
+
+/* Pointer-based wrapper: *r = npy_cexpl(*x). */
+static void
+nc_expl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    *r = npy_cexpl(*x);
+}
+
+/* Scales both components of *x by NPY_LOGE2l and hands the result to nc_expl. */
+static void
+nc_exp2l(npy_clongdouble *x, npy_clongdouble *r)
+{
+    npy_clongdouble scaled;
+    scaled.real = x->real*NPY_LOGE2l;
+    scaled.imag = x->imag*NPY_LOGE2l;
+    nc_expl(&scaled, r);
+}
+
+/* exp(x) - 1: real part is expm1(re)*cos(im) - 2*sin(im/2)^2, imag is exp(re)*sin(im). */
+static void
+nc_expm1l(npy_clongdouble *x, npy_clongdouble *r)
+{
+    npy_longdouble half_sin = npy_sinl(x->imag / 2);
+    r->real = npy_expm1l(x->real) * npy_cosl(x->imag) - 2 * half_sin * half_sin;
+    r->imag = npy_expl(x->real) * npy_sinl(x->imag);
+}
+
+/* Pointer-based wrapper: *r = npy_cpowl(*a, *b). */
+static void
+nc_powl(npy_clongdouble *a, npy_clongdouble *b, npy_clongdouble *r)
+{
+    *r = npy_cpowl(*a, *b);
+}
+
+/* Pointer-based wrapper: *r = npy_cacosl(*x). */
+static void
+nc_acosl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    *r = npy_cacosl(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_cacoshl(*x). */
+static void
+nc_acoshl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    *r = npy_cacoshl(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_casinl(*x). */
+static void
+nc_asinl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    *r = npy_casinl(*x);
+}
+
+
+/* Pointer-based wrapper: *r = npy_casinhl(*x). */
+static void
+nc_asinhl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    *r = npy_casinhl(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_catanl(*x). */
+static void
+nc_atanl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    *r = npy_catanl(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_catanhl(*x). */
+static void
+nc_atanhl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    *r = npy_catanhl(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_ccosl(*x). */
+static void
+nc_cosl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    *r = npy_ccosl(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_ccoshl(*x). */
+static void
+nc_coshl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    *r = npy_ccoshl(*x);
+}
+
+/* Runs nc_logl, then scales both components of the result by NPY_LOG10El. */
+static void
+nc_log10l(npy_clongdouble *x, npy_clongdouble *r)
+{
+    nc_logl(x, r);
+    r->real *= NPY_LOG10El;
+    r->imag *= NPY_LOG10El;
+}
+
+/* Runs nc_logl, then scales both components of the result by NPY_LOG2El. */
+static void
+nc_log2l(npy_clongdouble *x, npy_clongdouble *r)
+{
+    nc_logl(x, r);
+    r->real *= NPY_LOG2El;
+    r->imag *= NPY_LOG2El;
+}
+
+/* Pointer-based wrapper: *r = npy_csinl(*x). */
+static void
+nc_sinl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    *r = npy_csinl(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_csinhl(*x). */
+static void
+nc_sinhl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    *r = npy_csinhl(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_ctanl(*x). */
+static void
+nc_tanl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    *r = npy_ctanl(*x);
+}
+
+/* Pointer-based wrapper: *r = npy_ctanhl(*x). */
+static void
+nc_tanhl(npy_clongdouble *x, npy_clongdouble *r)
+{
+    *r = npy_ctanhl(*x);
+}
+
+
+
diff --git a/numpy/core/src/_generated/ieee754.c b/numpy/core/src/_generated/ieee754.c
new file mode 100644
index 000000000000..ee9bf8532fa7
--- /dev/null
+++ b/numpy/core/src/_generated/ieee754.c
@@ -0,0 +1,435 @@
+#line 1 "numpy/core/src/npymath/ieee754.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/* -*- c -*- */
+/*
+ * vim:syntax=c
+ *
+ * Low-level routines related to IEEE-754 format
+ */
+#include "npy_math_common.h"
+#include "npy_math_private.h"
+#include "numpy/utils.h"
+
+/*
+ The below code is provided for compilers which do not yet provide C11 compatibility (gcc 4.5 and older)
+ */
+#ifndef LDBL_TRUE_MIN
+#define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
+#endif
+
+/*
+ * FIXME: There is a lot of redundancy between _next* and npy_nextafter*.
+ * refactor this at some point
+ *
+ * p >= 0, returns x + nulp
+ * p < 0, returns x - nulp
+ */
+static double _next(double x, int p)
+{
+    volatile double t;
+    npy_int32 hx, hy, ix;
+    npy_uint32 lx;
+    /* NaN is returned as is; zero becomes the smallest subnormal signed by p; otherwise the (hx, lx) word pair is stepped by one. */
+    EXTRACT_WORDS(hx, lx, x);
+    ix = hx & 0x7fffffff;       /* |x| */
+
+    if (((ix >= 0x7ff00000) && ((ix - 0x7ff00000) | lx) != 0))        /* x is nan */
+        return x;
+    if ((ix | lx) == 0) {       /* x == 0 */
+        if (p >= 0) {
+            INSERT_WORDS(x, 0x0, 1);    /* return +minsubnormal */
+        } else {
+            INSERT_WORDS(x, 0x80000000, 1);    /* return -minsubnormal */
+        }
+        t = x * x;
+        if (t == x)
+            return t;
+        else
+            return x;           /* raise underflow flag */
+    }
+    if (p < 0) {     /* x -= ulp */
+        if (lx == 0)            /* low word is about to wrap: borrow from hx */
+            hx -= 1;
+        lx -= 1;
+    } else {         /* x += ulp */
+        lx += 1;
+        if (lx == 0)            /* low word wrapped: carry into hx */
+            hx += 1;
+    }
+    hy = hx & 0x7ff00000;       /* same mask as the nan test above, applied to the stepped hx */
+    if (hy >= 0x7ff00000)
+        return x + x;           /* overflow  */
+    if (hy < 0x00100000) {      /* underflow */
+        t = x * x;
+        if (t != x) {           /* raise underflow flag */
+            INSERT_WORDS(x, hx, lx);
+            return x;
+        }
+    }
+    INSERT_WORDS(x, hx, lx);
+    return x;
+}
+
+static float _nextf(float x, int p)
+{
+    volatile float t;
+    npy_int32 hx, hy, ix;
+    /* NaN is returned as is; zero becomes the smallest subnormal signed by p; otherwise the word hx is stepped by one. */
+    GET_FLOAT_WORD(hx, x);
+    ix = hx & 0x7fffffff;       /* |x| */
+
+    if ((ix > 0x7f800000))      /* x is nan */
+        return x;
+    if (ix == 0) {              /* x == 0 */
+        if (p >= 0) {
+            SET_FLOAT_WORD(x, 0x0 | 1); /* return +minsubnormal */
+        } else {
+            SET_FLOAT_WORD(x, 0x80000000 | 1); /* return -minsubnormal */
+        }
+        t = x * x;
+        if (t == x)
+            return t;
+        else
+            return x;           /* raise underflow flag */
+    }
+    if (p < 0) {            /* x -= ulp */
+        hx -= 1;
+    } else {                /* x += ulp */
+        hx += 1;
+    }
+    hy = hx & 0x7f800000;       /* same mask as the nan test above, applied to the stepped hx */
+    if (hy >= 0x7f800000)
+        return x + x;           /* overflow  */
+    if (hy < 0x00800000) {      /* underflow */
+        t = x * x;
+        if (t != x) {           /* raise underflow flag */
+            SET_FLOAT_WORD(x, hx);
+            return x;
+        }
+    }
+    SET_FLOAT_WORD(x, hx);
+    return x;
+}
+
+#if defined(HAVE_LDOUBLE_DOUBLE_DOUBLE_BE) || \
+    defined(HAVE_LDOUBLE_DOUBLE_DOUBLE_LE)
+
+/*
+ * FIXME: this is ugly and untested. The asm part only works with gcc, and we
+ * should consolidate the GET_LDOUBLE* / SET_LDOUBLE macros
+ */
+#define math_opt_barrier(x) \
+        ({ __typeof (x) __x = x; __asm ("" : "+m" (__x)); __x; })
+#define math_force_eval(x) __asm __volatile ("" : : "m" (x))
+
+/* only works for big endian */
+typedef union
+{
+    npy_longdouble value;
+    struct
+    {
+        npy_uint64 msw;
+        npy_uint64 lsw;
+    } parts64;
+    struct
+    {
+        npy_uint32 w0, w1, w2, w3;
+    } parts32;
+} ieee854_long_double_shape_type;
+
+/* Get two 64 bit ints from a long double.  */
+
+#define GET_LDOUBLE_WORDS64(ix0,ix1,d) \
+do {                                   \
+  ieee854_long_double_shape_type qw_u; \
+  qw_u.value = (d);                    \
+  (ix0) = qw_u.parts64.msw;            \
+  (ix1) = qw_u.parts64.lsw;            \
+} while (0)
+
+/* Set a long double from two 64 bit ints.  */
+
+#define SET_LDOUBLE_WORDS64(d,ix0,ix1) \
+do {                                   \
+  ieee854_long_double_shape_type qw_u; \
+  qw_u.parts64.msw = (ix0);            \
+  qw_u.parts64.lsw = (ix1);            \
+  (d) = qw_u.value;                    \
+} while (0)
+
+static npy_longdouble _nextl(npy_longdouble x, int p)
+{
+    npy_int64 hx,ihx,ilx;
+    npy_uint64 lx;
+    npy_longdouble u;
+    /* Double-double variant: p < 0 returns x - u, p >= 0 returns x + u, with u computed per branch below. */
+    GET_LDOUBLE_WORDS64(hx, lx, x);
+    ihx = hx & 0x7fffffffffffffffLL;      /* |hx| */
+    ilx = lx & 0x7fffffffffffffffLL;      /* |lx| */
+
+    if(((ihx & 0x7ff0000000000000LL)==0x7ff0000000000000LL)&&
+       ((ihx & 0x000fffffffffffffLL)!=0)) {
+        return x; /* signal the nan */
+    }
+    if(ihx == 0 && ilx == 0) {          /* x == 0 */
+        SET_LDOUBLE_WORDS64(x, p, 0ULL);/* return +-minsubnormal; NOTE(review): msw is set to p itself - confirm for p < 0 */
+        u = x * x;
+        if (u == x) {
+            return u;
+        } else {
+            return x;           /* raise underflow flag */
+        }
+    }
+
+    if(p < 0) { /* p < 0, x -= ulp */
+        if((hx==0xffefffffffffffffLL)&&(lx==0xfc8ffffffffffffeLL))
+            return x+x; /* overflow, return -inf */
+        if (hx >= 0x7ff0000000000000LL) {
+            SET_LDOUBLE_WORDS64(u,0x7fefffffffffffffLL,0x7c8ffffffffffffeLL);
+            return u;
+        }
+        if(ihx <= 0x0360000000000000LL) {  /* x <= LDBL_MIN */
+            u = math_opt_barrier (x);
+            x -= LDBL_TRUE_MIN;
+            if (ihx < 0x0360000000000000LL
+                    || (hx > 0 && (npy_int64) lx <= 0)
+                    || (hx < 0 && (npy_int64) lx > 1)) {
+                u = u * u;
+                math_force_eval (u);        /* raise underflow flag */
+            }
+            return x;
+        }
+        if (ihx < 0x06a0000000000000LL) { /* ulp will denormal */
+            SET_LDOUBLE_WORDS64(u,(hx&0x7ff0000000000000LL),0ULL);
+            u *= 0x1.0000000000000p-105L;
+        } else
+            SET_LDOUBLE_WORDS64(u,(hx&0x7ff0000000000000LL)-0x0690000000000000LL,0ULL);
+        return x - u;
+    } else {                /* p >= 0, x += ulp */
+        if((hx==0x7fefffffffffffffLL)&&(lx==0x7c8ffffffffffffeLL))
+            return x+x; /* overflow, return +inf */
+        if ((npy_uint64) hx >= 0xfff0000000000000ULL) {
+            SET_LDOUBLE_WORDS64(u,0xffefffffffffffffLL,0xfc8ffffffffffffeLL);
+            return u;
+        }
+        if(ihx <= 0x0360000000000000LL) {  /* x <= LDBL_MIN */
+            u = math_opt_barrier (x);
+            x += LDBL_TRUE_MIN;
+            if (ihx < 0x0360000000000000LL
+                    || (hx > 0 && (npy_int64) lx < 0 && lx != 0x8000000000000001LL)
+                    || (hx < 0 && (npy_int64) lx >= 0)) {
+                u = u * u;
+                math_force_eval (u);        /* raise underflow flag */
+            }
+            if (x == 0.0L)  /* handle negative LDBL_TRUE_MIN case */
+                x = -0.0L;
+            return x;
+        }
+        if (ihx < 0x06a0000000000000LL) { /* ulp will denormal */
+            SET_LDOUBLE_WORDS64(u,(hx&0x7ff0000000000000LL),0ULL);
+            u *= 0x1.0000000000000p-105L;
+        } else
+            SET_LDOUBLE_WORDS64(u,(hx&0x7ff0000000000000LL)-0x0690000000000000LL,0ULL);
+        return x + u;
+    }
+}
+#else
+static npy_longdouble _nextl(npy_longdouble x, int p)
+{
+    volatile npy_longdouble t;
+    union IEEEl2bitsrep ux;
+    /* Generic variant: steps the MANH/MANL mantissa words of ux by one (down for p < 0, up otherwise). */
+    ux.e = x;
+
+    if ((GET_LDOUBLE_EXP(ux) == 0x7fff &&
+         ((GET_LDOUBLE_MANH(ux) & ~LDBL_NBIT) | GET_LDOUBLE_MANL(ux)) != 0)) {
+        return ux.e;        /* x is nan */
+    }
+    if (ux.e == 0.0) {
+        SET_LDOUBLE_MANH(ux, 0);              /* return +-minsubnormal */
+        SET_LDOUBLE_MANL(ux, 1);
+        if (p >= 0) {
+            SET_LDOUBLE_SIGN(ux, 0);
+        } else {
+            SET_LDOUBLE_SIGN(ux, 1);
+        }
+        t = ux.e * ux.e;
+        if (t == ux.e) {
+            return t;
+        } else {
+            return ux.e;           /* raise underflow flag */
+        }
+    }
+    if (p < 0) {      /* x -= ulp */
+        if (GET_LDOUBLE_MANL(ux) == 0) {       /* low word will wrap: borrow from MANH */
+            if ((GET_LDOUBLE_MANH(ux) & ~LDBL_NBIT) == 0) {
+                SET_LDOUBLE_EXP(ux, GET_LDOUBLE_EXP(ux) - 1);
+            }
+            SET_LDOUBLE_MANH(ux,
+                             (GET_LDOUBLE_MANH(ux) - 1) |
+                             (GET_LDOUBLE_MANH(ux) & LDBL_NBIT));
+        }
+        SET_LDOUBLE_MANL(ux, GET_LDOUBLE_MANL(ux) - 1);
+    } else {                    /* x += ulp */
+        SET_LDOUBLE_MANL(ux, GET_LDOUBLE_MANL(ux) + 1);
+        if (GET_LDOUBLE_MANL(ux) == 0) {       /* low word wrapped: carry into MANH */
+            SET_LDOUBLE_MANH(ux,
+                             (GET_LDOUBLE_MANH(ux) + 1) |
+                             (GET_LDOUBLE_MANH(ux) & LDBL_NBIT));
+            if ((GET_LDOUBLE_MANH(ux) & ~LDBL_NBIT) == 0) {
+                SET_LDOUBLE_EXP(ux, GET_LDOUBLE_EXP(ux) + 1);
+            }
+        }
+    }
+    if (GET_LDOUBLE_EXP(ux) == 0x7fff) {
+        return ux.e + ux.e;           /* overflow  */
+    }
+    if (GET_LDOUBLE_EXP(ux) == 0) {            /* underflow */
+        if (LDBL_NBIT) {
+            SET_LDOUBLE_MANH(ux, GET_LDOUBLE_MANH(ux) & ~LDBL_NBIT);
+        }
+        t = ux.e * ux.e;
+        if (t != ux.e) {           /* raise underflow flag */
+            return ux.e;
+        }
+    }
+
+    return ux.e;
+}
+#endif
+
+#line 304
+npy_float npy_spacingf(npy_float x)
+{
+    /*
+     * The spacing is the difference between _nextf(x, 1) and x itself;
+     * an infinite x yields NaN instead.
+     * XXX: npy isnan/isinf may be optimized by bit twiddling
+     */
+    return npy_isinf(x) ? NPY_NANF : _nextf(x, 1) - x;
+}
+
+#line 304
+npy_double npy_spacing(npy_double x)
+{
+    /*
+     * The spacing is the difference between _next(x, 1) and x itself;
+     * an infinite x yields NaN instead.
+     * XXX: npy isnan/isinf may be optimized by bit twiddling
+     */
+    return npy_isinf(x) ? NPY_NAN : _next(x, 1) - x;
+}
+
+#line 304
+npy_longdouble npy_spacingl(npy_longdouble x)
+{
+    /*
+     * The spacing is the difference between _nextl(x, 1) and x itself;
+     * an infinite x yields NaN instead.
+     * XXX: npy isnan/isinf may be optimized by bit twiddling
+     */
+    return npy_isinf(x) ? NPY_NANL : _nextl(x, 1) - x;
+}
+
+
+int npy_clear_floatstatus(void) {   /* (void): a real prototype, not an unspecified parameter list */
+    char x=0;   /* dummy object; only its address is handed to the barrier variant */
+    return npy_clear_floatstatus_barrier(&x);
+}
+int npy_get_floatstatus(void) {     /* (void): a real prototype, not an unspecified parameter list */
+    char x=0;   /* dummy object; only its address is handed to the barrier variant */
+    return npy_get_floatstatus_barrier(&x);
+}
+
+
+/*
+ * General C99 code for floating point error handling.  These functions mainly
+ * exist because `fenv.h` was not standardized in C89, so they gave better
+ * portability.  This should be unnecessary with C99/C++11 and further
+ * functionality can be used from `fenv.h` directly.
+ */
+#  include <fenv.h>
+
+/*
+ * According to the C99 standard FE_DIVBYZERO, etc. may not be provided when
+ * unsupported.  In such cases NumPy will not report these correctly, but we
+ * should still allow compiling (whether tests pass or not).
+ * By defining them as 0 locally, we make them no-ops.  Unlike these defines,
+ * for example `musl` still defines all of the functions (as no-ops):
+ *     https://git.musl-libc.org/cgit/musl/tree/src/fenv/fenv.c
+ * and does similar replacement in its tests:
+ * http://nsz.repo.hu/git/?p=libc-test;a=blob;f=src/common/mtest.h;h=706c1ba23ea8989b17a2f72ed1a919e187c06b6a;hb=HEAD#l30
+ */
+#ifndef FE_DIVBYZERO
+    #define FE_DIVBYZERO 0
+#endif
+#ifndef FE_OVERFLOW
+    #define FE_OVERFLOW 0
+#endif
+#ifndef FE_UNDERFLOW
+    #define FE_UNDERFLOW 0
+#endif
+#ifndef FE_INVALID
+    #define FE_INVALID 0
+#endif
+
+
+int npy_get_floatstatus_barrier(char* param)
+{
+    /* Only the four exception flags translated below are queried. */
+    const int raised = fetestexcept(FE_DIVBYZERO | FE_OVERFLOW |
+                                    FE_UNDERFLOW | FE_INVALID);
+    int status = 0;
+    /* By using a volatile, the compiler cannot reorder this call */
+    if (param != NULL) {
+        volatile char NPY_UNUSED(c) = *(char*)param;
+    }
+    status |= (raised & FE_DIVBYZERO) ? NPY_FPE_DIVIDEBYZERO : 0;
+    status |= (raised & FE_OVERFLOW) ? NPY_FPE_OVERFLOW : 0;
+    status |= (raised & FE_UNDERFLOW) ? NPY_FPE_UNDERFLOW : 0;
+    status |= (raised & FE_INVALID) ? NPY_FPE_INVALID : 0;
+    return status;
+}
+
+int npy_clear_floatstatus_barrier(char * param)
+{
+    /* testing float status is 50-100 times faster than clearing on x86 */
+    const int pending = npy_get_floatstatus_barrier(param);
+    if (pending == 0) {
+        return 0;   /* nothing is set, so the clear is skipped */
+    }
+    feclearexcept(FE_DIVBYZERO | FE_OVERFLOW |
+                  FE_UNDERFLOW | FE_INVALID);
+    return pending;
+}
+
+
+void npy_set_floatstatus_divbyzero(void)
+{
+    feraiseexcept(FE_DIVBYZERO);    /* FE_DIVBYZERO is defined as 0 in this file when <fenv.h> lacks it */
+}
+
+void npy_set_floatstatus_overflow(void)
+{
+    feraiseexcept(FE_OVERFLOW);     /* FE_OVERFLOW is defined as 0 in this file when <fenv.h> lacks it */
+}
+
+void npy_set_floatstatus_underflow(void)
+{
+    feraiseexcept(FE_UNDERFLOW);    /* FE_UNDERFLOW is defined as 0 in this file when <fenv.h> lacks it */
+}
+
+void npy_set_floatstatus_invalid(void)
+{
+    feraiseexcept(FE_INVALID);      /* FE_INVALID is defined as 0 in this file when <fenv.h> lacks it */
+}
+
+
diff --git a/numpy/core/src/_generated/loops.c b/numpy/core/src/_generated/loops.c
new file mode 100644
index 000000000000..b2e03f7d4616
--- /dev/null
+++ b/numpy/core/src/_generated/loops.c
@@ -0,0 +1,7923 @@
+#line 1 "numpy/core/src/umath/loops.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/* -*- c -*- */
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "npy_config.h"
+#include "numpy/npy_common.h"
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/npy_math.h"
+#include "numpy/halffloat.h"
+#include "lowlevel_strided_loops.h"
+#include "loops_utils.h"
+
+#include "npy_pycompat.h"
+
+#include "ufunc_object.h"
+
+#include <string.h> /* for memchr */
+
+/* Use Libdivide for faster division */
+#include "numpy/libdivide/libdivide.h"
+
+/*
+ * cutoff blocksize for pairwise summation
+ * decreasing it decreases errors slightly as more pairs are summed but
+ * also lowers performance, as the inner loop is unrolled eight times it is
+ * effectively 16
+ */
+#define PW_BLOCKSIZE    128
+
+/** Provides the various *_LOOP macros */
+#include "fast_loop_macros.h"
+
+/******************************************************************************
+ **                          GENERIC FLOAT LOOPS                             **
+ *****************************************************************************/
+
+/* direct loops using a suitable callback */
+
+#line 48
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_e_e(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* func is called as npy_half(npy_half), once per loop element */
+    typedef npy_half unary_fn(npy_half);
+    unary_fn *const kernel = (unary_fn *)func;
+    UNARY_LOOP {
+        *(npy_half *)op1 = kernel(*(npy_half *)ip1);
+    }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_ee_e(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* func is called as npy_half(npy_half, npy_half), once per operand pair */
+    typedef npy_half binary_fn(npy_half, npy_half);
+    binary_fn *const kernel = (binary_fn *)func;
+    BINARY_LOOP {
+        *(npy_half *)op1 = kernel(*(npy_half *)ip1,
+                                  *(npy_half *)ip2);
+    }
+}
+
+
+#line 48
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_f_f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* func is called as npy_float(npy_float), once per loop element */
+    typedef npy_float unary_fn(npy_float);
+    unary_fn *const kernel = (unary_fn *)func;
+    UNARY_LOOP {
+        *(npy_float *)op1 = kernel(*(npy_float *)ip1);
+    }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_ff_f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* func is called as npy_float(npy_float, npy_float), once per operand pair */
+    typedef npy_float binary_fn(npy_float, npy_float);
+    binary_fn *const kernel = (binary_fn *)func;
+    BINARY_LOOP {
+        *(npy_float *)op1 = kernel(*(npy_float *)ip1,
+                                   *(npy_float *)ip2);
+    }
+}
+
+
+#line 48
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_d_d(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* func is called as npy_double(npy_double), once per loop element */
+    typedef npy_double unary_fn(npy_double);
+    unary_fn *const kernel = (unary_fn *)func;
+    UNARY_LOOP {
+        *(npy_double *)op1 = kernel(*(npy_double *)ip1);
+    }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_dd_d(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* func is called as npy_double(npy_double, npy_double), once per operand pair */
+    typedef npy_double binary_fn(npy_double, npy_double);
+    binary_fn *const kernel = (binary_fn *)func;
+    BINARY_LOOP {
+        *(npy_double *)op1 = kernel(*(npy_double *)ip1,
+                                    *(npy_double *)ip2);
+    }
+}
+
+
+#line 48
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_g_g(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* func is called as npy_longdouble(npy_longdouble), once per loop element */
+    typedef npy_longdouble unary_fn(npy_longdouble);
+    unary_fn *const kernel = (unary_fn *)func;
+    UNARY_LOOP {
+        *(npy_longdouble *)op1 = kernel(*(npy_longdouble *)ip1);
+    }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_gg_g(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* func takes two npy_longdouble values and returns one, once per operand pair */
+    typedef npy_longdouble binary_fn(npy_longdouble, npy_longdouble);
+    binary_fn *const kernel = (binary_fn *)func;
+    BINARY_LOOP {
+        *(npy_longdouble *)op1 = kernel(*(npy_longdouble *)ip1,
+                                        *(npy_longdouble *)ip2);
+    }
+}
+
+
+
+/* indirect loops with casting */
+#line 86
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_e_e_As_f_f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* the half input is converted to float for func, and the result converted back */
+    npy_float (*const kernel)(npy_float) = (npy_float (*)(npy_float))func;
+    UNARY_LOOP {
+        const npy_float widened = npy_half_to_float(*(npy_half *)ip1);
+        *(npy_half *)op1 = npy_float_to_half(kernel(widened));
+    }
+}
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_ee_e_As_ff_f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* both half operands are converted to float for func; the result is converted back */
+    npy_float (*const kernel)(npy_float, npy_float) = (npy_float (*)(npy_float, npy_float))func;
+    BINARY_LOOP {
+        const npy_float lhs = npy_half_to_float(*(npy_half *)ip1);
+        const npy_float rhs = npy_half_to_float(*(npy_half *)ip2);
+        *(npy_half *)op1 = npy_float_to_half(kernel(lhs, rhs));
+    }
+}
+
+
+#line 86
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_e_e_As_d_d(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* the half input is converted to double for func, and the result converted back */
+    npy_double (*const kernel)(npy_double) = (npy_double (*)(npy_double))func;
+    UNARY_LOOP {
+        const npy_double widened = npy_half_to_double(*(npy_half *)ip1);
+        *(npy_half *)op1 = npy_double_to_half(kernel(widened));
+    }
+}
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_ee_e_As_dd_d(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* both half operands are converted to double for func; the result is converted back */
+    npy_double (*const kernel)(npy_double, npy_double) = (npy_double (*)(npy_double, npy_double))func;
+    BINARY_LOOP {
+        const npy_double lhs = npy_half_to_double(*(npy_half *)ip1);
+        const npy_double rhs = npy_half_to_double(*(npy_half *)ip2);
+        *(npy_half *)op1 = npy_double_to_half(kernel(lhs, rhs));
+    }
+}
+
+
+#line 86
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_f_f_As_d_d(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* the float input is cast to double for func, and the result cast back to float */
+    npy_double (*const kernel)(npy_double) = (npy_double (*)(npy_double))func;
+    UNARY_LOOP {
+        const npy_double widened = (double)(*(npy_float *)ip1);
+        *(npy_float *)op1 = (float)(kernel(widened));
+    }
+}
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_ff_f_As_dd_d(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* both float operands are cast to double for func; the result is cast back to float */
+    npy_double (*const kernel)(npy_double, npy_double) = (npy_double (*)(npy_double, npy_double))func;
+    BINARY_LOOP {
+        const npy_double lhs = (double)(*(npy_float *)ip1);
+        const npy_double rhs = (double)(*(npy_float *)ip2);
+        *(npy_float *)op1 = (float)(kernel(lhs, rhs));
+    }
+}
+
+
+
+/******************************************************************************
+ **                          GENERIC COMPLEX LOOPS                           **
+ *****************************************************************************/
+
+/* direct loops using a suitable callback */
+#line 122
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_F_F(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* func(in, out): receives a pointer to a local copy of the input and to the output slot */
+    typedef void cunary_fn(npy_cfloat *, npy_cfloat *);
+    cunary_fn *const kernel = (cunary_fn *)func;
+    UNARY_LOOP {
+        npy_cfloat operand = *(npy_cfloat *)ip1;
+        kernel(&operand, (npy_cfloat *)op1);
+    }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_FF_F(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* func(in1, in2, out): receives pointers to local copies of both inputs and to the output slot */
+    typedef void cbinary_fn(npy_cfloat *, npy_cfloat *, npy_cfloat *);
+    cbinary_fn *const kernel = (cbinary_fn *)func;
+    BINARY_LOOP {
+        npy_cfloat lhs = *(npy_cfloat *)ip1;
+        npy_cfloat rhs = *(npy_cfloat *)ip2;
+        kernel(&lhs, &rhs, (npy_cfloat *)op1);
+    }
+}
+
+#line 122
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_D_D(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* func(in, out): receives a pointer to a local copy of the input and to the output slot */
+    typedef void cunary_fn(npy_cdouble *, npy_cdouble *);
+    cunary_fn *const kernel = (cunary_fn *)func;
+    UNARY_LOOP {
+        npy_cdouble operand = *(npy_cdouble *)ip1;
+        kernel(&operand, (npy_cdouble *)op1);
+    }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_DD_D(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* func(in1, in2, out): receives pointers to local copies of both inputs and to the output slot */
+    typedef void cbinary_fn(npy_cdouble *, npy_cdouble *, npy_cdouble *);
+    cbinary_fn *const kernel = (cbinary_fn *)func;
+    BINARY_LOOP {
+        npy_cdouble lhs = *(npy_cdouble *)ip1;
+        npy_cdouble rhs = *(npy_cdouble *)ip2;
+        kernel(&lhs, &rhs, (npy_cdouble *)op1);
+    }
+}
+
+#line 122
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_G_G(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* func(in, out): receives a pointer to a local copy of the input and to the output slot */
+    typedef void cunary_fn(npy_clongdouble *, npy_clongdouble *);
+    cunary_fn *const kernel = (cunary_fn *)func;
+    UNARY_LOOP {
+        npy_clongdouble operand = *(npy_clongdouble *)ip1;
+        kernel(&operand, (npy_clongdouble *)op1);
+    }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_GG_G(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* func(in1, in2, out): receives pointers to local copies of both inputs and to the output slot */
+    typedef void cbinary_fn(npy_clongdouble *, npy_clongdouble *, npy_clongdouble *);
+    cbinary_fn *const kernel = (cbinary_fn *)func;
+    BINARY_LOOP {
+        npy_clongdouble lhs = *(npy_clongdouble *)ip1;
+        npy_clongdouble rhs = *(npy_clongdouble *)ip2;
+        kernel(&lhs, &rhs, (npy_clongdouble *)op1);
+    }
+}
+
+
+
+/* indirect loops with casting */
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_F_F_As_D_D(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* Widen each float (real, imag) input to npy_cdouble, call `func`, narrow back. */
+    typedef void cdouble_unary(npy_cdouble *, npy_cdouble *);
+    cdouble_unary *kernel = (cdouble_unary *)func;
+    UNARY_LOOP {
+        const float *src = (const float *)ip1;
+        npy_cdouble wide_out, wide_in = {.real = (double)src[0], .imag = (double)src[1]};
+        kernel(&wide_in, &wide_out);
+        ((float *)op1)[0] = (float)wide_out.real;
+        ((float *)op1)[1] = (float)wide_out.imag;
+    }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_FF_F_As_DD_D(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* Widen both float (real, imag) inputs to npy_cdouble, call `func`, narrow back. */
+    typedef void cdouble_binary(npy_cdouble *, npy_cdouble *, npy_cdouble *);
+    cdouble_binary *kernel = (cdouble_binary *)func;
+    BINARY_LOOP {
+        const float *a = (const float *)ip1, *b = (const float *)ip2;
+        npy_cdouble lhs = {.real = (double)a[0], .imag = (double)a[1]};
+        npy_cdouble rhs = {.real = (double)b[0], .imag = (double)b[1]};
+        npy_cdouble wide_out;
+        kernel(&lhs, &rhs, &wide_out);
+        ((float *)op1)[0] = (float)wide_out.real;
+        ((float *)op1)[1] = (float)wide_out.imag;
+    }
+}
+
+
+/******************************************************************************
+ **                         GENERIC OBJECT LOOPS                             **
+ *****************************************************************************/
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_O_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    unaryfunc apply = (unaryfunc)func;
+    UNARY_LOOP {
+        PyObject **slot = (PyObject **)op1;
+        PyObject *arg = *(PyObject **)ip1;
+        /* A NULL entry trips the assert in debug builds and acts as None otherwise. */
+        assert(arg != NULL);
+        PyObject *value = apply(arg != NULL ? arg : Py_None);
+        if (!value) {
+            return;  /* give up at the first failed call */
+        }
+        Py_XDECREF(*slot);  /* drop whatever object the slot held before */
+        *slot = value;
+    }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_O_O_method(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    char *meth = (char *)func;  /* `func` carries the method name as a C string */
+    UNARY_LOOP {
+        PyObject *in1 = *(PyObject **)ip1;
+        PyObject **out = (PyObject **)op1;
+        /* We allow NULL, but try to guarantee non-NULL to downstream */
+        assert(in1 != NULL);
+        PyObject *ret, *method;
+        method = PyObject_GetAttrString(in1 ? in1 : Py_None, meth);
+        if (method != NULL && !PyCallable_Check(method)) {
+            Py_DECREF(method);
+            method = NULL;
+        }
+        if (method == NULL) {
+            /* Attribute missing or not callable: report a TypeError and stop. */
+            PyObject *exc, *val, *tb;
+            PyTypeObject *type = in1 ? Py_TYPE(in1) : Py_TYPE(Py_None);
+            PyErr_Fetch(&exc, &val, &tb);
+            PyErr_Format(PyExc_TypeError,
+                         "loop of ufunc does not support argument %zd of "
+                         "type %s which has no callable %s method",
+                         (Py_ssize_t)i, type->tp_name, meth);
+            npy_PyErr_ChainExceptionsCause(exc, val, tb);
+            return;
+        }
+        ret = PyObject_CallObject(method, NULL);
+        Py_DECREF(method);
+        if (ret == NULL) {
+            return;
+        }
+        Py_XDECREF(*out);
+        *out = ret;
+    }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_OO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    binaryfunc apply = (binaryfunc)func;
+    BINARY_LOOP {
+        PyObject **slot = (PyObject **)op1;
+        PyObject *lhs = *(PyObject **)ip1;
+        PyObject *rhs = *(PyObject **)ip2;
+        /* NULL operands trip the asserts in debug builds and act as None otherwise. */
+        assert(lhs != NULL);
+        assert(rhs != NULL);
+        PyObject *value = apply(lhs != NULL ? lhs : Py_None, rhs != NULL ? rhs : Py_None);
+        if (!value) {
+            return;  /* give up at the first failed call */
+        }
+        Py_XDECREF(*slot);  /* drop whatever object the slot held before */
+        *slot = value;
+    }
+}
+
+NPY_NO_EXPORT void
+PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    /* Call the ternaryfunc behind `func` once per loop iteration. */
+    ternaryfunc apply = (ternaryfunc)func;
+    TERNARY_LOOP {
+        PyObject **slot = (PyObject **)op1;
+        PyObject *a = *(PyObject **)ip1;
+        PyObject *b = *(PyObject **)ip2;
+        PyObject *c = *(PyObject **)ip3;
+        /* NULL operands trip the asserts in debug builds and act as None otherwise. */
+        assert(a != NULL);
+        assert(b != NULL);
+        assert(c != NULL);
+        a = (a != NULL) ? a : Py_None;
+        b = (b != NULL) ? b : Py_None;
+        c = (c != NULL) ? c : Py_None;
+        PyObject *value = apply(a, b, c);
+        if (!value) {
+            return;
+        }
+        Py_XDECREF(*slot);
+        *slot = value;
+    }
+}
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_OO_O_method(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    char *meth = (char *)func;  /* `func` carries the method name as a C string */
+    BINARY_LOOP {
+        PyObject *in1 = *(PyObject **)ip1;
+        PyObject *in2 = *(PyObject **)ip2;
+        PyObject **out = (PyObject **)op1;
+        /* NULL operands are tolerated: both are replaced by None before the call */
+        assert(in1 != NULL);
+        assert(in2 != NULL);
+        PyObject *ret = PyObject_CallMethod(in1 ? in1 : Py_None, meth, "(O)",
+                                            in2 ? in2 : Py_None);
+        if (ret == NULL) {
+            return;
+        }
+        Py_XDECREF(*out);
+        *out = ret;
+    }
+}
+
+/*
+ * A general-purpose ufunc that deals with general-purpose Python callable.
+ * func is a structure with nin, nout, and a Python callable function
+ */
+
+/*UFUNC_API*/
+NPY_NO_EXPORT void
+PyUFunc_On_Om(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    PyUFunc_PyFuncData *info = (PyUFunc_PyFuncData *)func;
+    PyObject *callable = info->callable;
+    const int nin = info->nin;
+    const int nout = info->nout;
+    const npy_intp count = dimensions[0];
+    const npy_intp nargs = nin + nout;
+    char *cursor[NPY_MAXARGS];
+    npy_intp elem, k;
+
+    /* Private copies of the data pointers; they are stepped once per element. */
+    for (k = 0; k < nargs; k++) {
+        cursor[k] = args[k];
+    }
+    for (elem = 0; elem < count; elem++) {
+        PyObject *result;
+        PyObject *packed = PyTuple_New(nin);
+        if (packed == NULL) {
+            return;
+        }
+        /* Fill the argument tuple; a NULL entry is passed on as None. */
+        for (k = 0; k < nin; k++) {
+            PyObject *item = *((PyObject **)cursor[k]);
+            assert(item != NULL);
+            if (item == NULL) {
+                item = Py_None;
+            }
+            Py_INCREF(item);
+            PyTuple_SET_ITEM(packed, k, item);
+        }
+        result = PyObject_CallObject(callable, packed);
+        Py_DECREF(packed);
+        if (result == NULL) {
+            return;
+        }
+        if (nout == 0 && result == Py_None) {
+            /* nothing expected, nothing returned */
+            Py_DECREF(result);
+        }
+        else if (nout == 1) {
+            /* the single output slot takes over the reference to result */
+            PyObject **slot = (PyObject **)cursor[nin];
+            Py_XDECREF(*slot);
+            *slot = result;
+        }
+        else if (PyTuple_Check(result) && nout == PyTuple_Size(result)) {
+            /*
+             * One tuple item per output slot; an empty tuple is accepted
+             * when nout == 0.
+             */
+            for (k = 0; k < nout; k++) {
+                PyObject **slot = (PyObject **)cursor[nin + k];
+                PyObject *piece = PyTuple_GET_ITEM(result, k);
+                Py_XDECREF(*slot);
+                Py_INCREF(piece);
+                *slot = piece;
+            }
+            Py_DECREF(result);
+        }
+        else {
+            /* result does not match the declared number of outputs: bail out */
+            Py_DECREF(result);
+            return;
+        }
+        for (k = 0; k < nargs; k++) {
+            cursor[k] += steps[k];
+        }
+    }
+}
+
+/*
+ *****************************************************************************
+ **                             BOOLEAN LOOPS                               **
+ *****************************************************************************
+ */
+
+NPY_NO_EXPORT void
+BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    OUTPUT_LOOP {
+        /* store 1 in the npy_bool element op1 currently points at */
+        ((npy_bool *)op1)[0] = 1;
+    }
+}
+
+/*
+ *****************************************************************************
+ **                           INTEGER LOOPS
+ *****************************************************************************
+ */
+
+#line 426
+
+#define BYTE_floor_divide BYTE_divide  /* floor_divide is an alias of the divide loop */
+#define BYTE_floor_divide_indexed BYTE_divide_indexed
+#define BYTE_fmax BYTE_maximum  /* fmax is an alias of the maximum loop */
+#define BYTE_fmax_indexed BYTE_maximum_indexed
+#define BYTE_fmin BYTE_minimum  /* fmin is an alias of the minimum loop */
+#define BYTE_fmin_indexed BYTE_minimum_indexed
+
+NPY_NO_EXPORT void
+BYTE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    OUTPUT_LOOP {
+        /* store 1 in the npy_byte element op1 currently points at */
+        ((npy_byte *)op1)[0] = 1;
+    }
+}
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+BYTE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] += value[k] on npy_byte; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_byte *target = (npy_byte *)(base + bstep * pos);
+        *target += *(const npy_byte *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+BYTE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] -= value[k] on npy_byte; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_byte *target = (npy_byte *)(base + bstep * pos);
+        *target -= *(const npy_byte *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+BYTE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] *= value[k] on npy_byte; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_byte *target = (npy_byte *)(base + bstep * pos);
+        *target *= *(const npy_byte *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+BYTE_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] &= value[k] on npy_byte; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_byte *target = (npy_byte *)(base + bstep * pos);
+        *target &= *(const npy_byte *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+BYTE_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] |= value[k] on npy_byte; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_byte *target = (npy_byte *)(base + bstep * pos);
+        *target |= *(const npy_byte *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+BYTE_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] ^= value[k] on npy_byte; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_byte *target = (npy_byte *)(base + bstep * pos);
+        *target ^= *(const npy_byte *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+
+NPY_NO_EXPORT void
+BYTE_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {  /* out = in1 ** in2 for the current element */
+        npy_byte *dst = (npy_byte *)op1;
+        npy_byte base = *(npy_byte *)ip1;
+        npy_byte expo = *(npy_byte *)ip2;
+        npy_byte acc;
+
+        if (expo < 0) {
+            /* negative exponents are rejected with a ValueError */
+            NPY_ALLOW_C_API_DEF
+            NPY_ALLOW_C_API;
+            PyErr_SetString(PyExc_ValueError,
+                    "Integers to negative integer powers are not allowed.");
+            NPY_DISABLE_C_API;
+            return;
+        }
+
+        /* x**0 and 1**y are 1; skip the multiplication loop for them */
+        if (expo == 0 || base == 1) {
+            *dst = 1;
+            continue;
+        }
+
+        /*
+         * Square-and-multiply: the lowest exponent bit seeds the accumulator,
+         * each further set bit multiplies in the next repeated square.
+         */
+        acc = (expo & 1) ? base : 1;
+        for (expo >>= 1; expo > 0; expo >>= 1) {
+            base *= base;
+            if (expo & 1) {
+                acc *= base;
+            }
+        }
+        *dst = acc;
+    }
+}
+
+#line 426
+
+#define UBYTE_floor_divide UBYTE_divide  /* floor_divide is an alias of the divide loop */
+#define UBYTE_floor_divide_indexed UBYTE_divide_indexed
+#define UBYTE_fmax UBYTE_maximum  /* fmax is an alias of the maximum loop */
+#define UBYTE_fmax_indexed UBYTE_maximum_indexed
+#define UBYTE_fmin UBYTE_minimum  /* fmin is an alias of the minimum loop */
+#define UBYTE_fmin_indexed UBYTE_minimum_indexed
+
+NPY_NO_EXPORT void
+UBYTE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    OUTPUT_LOOP {
+        /* store 1 in the npy_ubyte element op1 currently points at */
+        ((npy_ubyte *)op1)[0] = 1;
+    }
+}
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+UBYTE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] += value[k] on npy_ubyte; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_ubyte *target = (npy_ubyte *)(base + bstep * pos);
+        *target += *(const npy_ubyte *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+UBYTE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] -= value[k] on npy_ubyte; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_ubyte *target = (npy_ubyte *)(base + bstep * pos);
+        *target -= *(const npy_ubyte *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+UBYTE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] *= value[k] on npy_ubyte; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_ubyte *target = (npy_ubyte *)(base + bstep * pos);
+        *target *= *(const npy_ubyte *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+UBYTE_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] &= value[k] on npy_ubyte; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_ubyte *target = (npy_ubyte *)(base + bstep * pos);
+        *target &= *(const npy_ubyte *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+UBYTE_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] |= value[k] on npy_ubyte; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_ubyte *target = (npy_ubyte *)(base + bstep * pos);
+        *target |= *(const npy_ubyte *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+UBYTE_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] ^= value[k] on npy_ubyte; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_ubyte *target = (npy_ubyte *)(base + bstep * pos);
+        *target ^= *(const npy_ubyte *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+
+NPY_NO_EXPORT void
+UBYTE_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {  /* out = in1 ** in2 for the current element */
+        npy_ubyte *dst = (npy_ubyte *)op1;
+        npy_ubyte base = *(npy_ubyte *)ip1;
+        npy_ubyte expo = *(npy_ubyte *)ip2;
+        npy_ubyte acc;
+
+        /*
+         * No negative-exponent check is performed in this loop: the
+         * exponent is read as an npy_ubyte value, and this function
+         * never sets a Python error.
+         */
+        if (expo == 0) {
+            /* x**0 is 1 */
+            *dst = 1;
+            continue;
+        }
+        if (base == 1) {
+            /* 1**y is 1 */
+            *dst = 1;
+            continue;
+        }
+
+        /*
+         * Square-and-multiply: the lowest exponent bit seeds the accumulator,
+         * each further set bit multiplies in the next repeated square.
+         */
+        acc = (expo & 1) ? base : 1;
+        for (expo >>= 1; expo > 0; expo >>= 1) {
+            base *= base;
+            if (expo & 1) {
+                acc *= base;
+            }
+        }
+        *dst = acc;
+    }
+}
+
+#line 426
+
+#define SHORT_floor_divide SHORT_divide  /* floor_divide is an alias of the divide loop */
+#define SHORT_floor_divide_indexed SHORT_divide_indexed
+#define SHORT_fmax SHORT_maximum  /* fmax is an alias of the maximum loop */
+#define SHORT_fmax_indexed SHORT_maximum_indexed
+#define SHORT_fmin SHORT_minimum  /* fmin is an alias of the minimum loop */
+#define SHORT_fmin_indexed SHORT_minimum_indexed
+
+NPY_NO_EXPORT void
+SHORT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    OUTPUT_LOOP {
+        /* store 1 in the npy_short element op1 currently points at */
+        ((npy_short *)op1)[0] = 1;
+    }
+}
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+SHORT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] += value[k] on npy_short; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_short *target = (npy_short *)(base + bstep * pos);
+        *target += *(const npy_short *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+SHORT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] -= value[k] on npy_short; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_short *target = (npy_short *)(base + bstep * pos);
+        *target -= *(const npy_short *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+SHORT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] *= value[k] on npy_short; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_short *target = (npy_short *)(base + bstep * pos);
+        *target *= *(const npy_short *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+SHORT_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] &= value[k] on npy_short; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_short *target = (npy_short *)(base + bstep * pos);
+        *target &= *(const npy_short *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+SHORT_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] |= value[k] on npy_short; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_short *target = (npy_short *)(base + bstep * pos);
+        *target |= *(const npy_short *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+SHORT_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] ^= value[k] on npy_short; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_short *target = (npy_short *)(base + bstep * pos);
+        *target ^= *(const npy_short *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+
+NPY_NO_EXPORT void
+SHORT_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {  /* out = in1 ** in2 for the current element */
+        npy_short *dst = (npy_short *)op1;
+        npy_short base = *(npy_short *)ip1;
+        npy_short expo = *(npy_short *)ip2;
+        npy_short acc;
+
+        if (expo < 0) {
+            /* negative exponents are rejected with a ValueError */
+            NPY_ALLOW_C_API_DEF
+            NPY_ALLOW_C_API;
+            PyErr_SetString(PyExc_ValueError,
+                    "Integers to negative integer powers are not allowed.");
+            NPY_DISABLE_C_API;
+            return;
+        }
+
+        /* x**0 and 1**y are 1; skip the multiplication loop for them */
+        if (expo == 0 || base == 1) {
+            *dst = 1;
+            continue;
+        }
+
+        /*
+         * Square-and-multiply: the lowest exponent bit seeds the accumulator,
+         * each further set bit multiplies in the next repeated square.
+         */
+        acc = (expo & 1) ? base : 1;
+        for (expo >>= 1; expo > 0; expo >>= 1) {
+            base *= base;
+            if (expo & 1) {
+                acc *= base;
+            }
+        }
+        *dst = acc;
+    }
+}
+
+#line 426
+
+#define USHORT_floor_divide USHORT_divide  /* floor_divide is an alias of the divide loop */
+#define USHORT_floor_divide_indexed USHORT_divide_indexed
+#define USHORT_fmax USHORT_maximum  /* fmax is an alias of the maximum loop */
+#define USHORT_fmax_indexed USHORT_maximum_indexed
+#define USHORT_fmin USHORT_minimum  /* fmin is an alias of the minimum loop */
+#define USHORT_fmin_indexed USHORT_minimum_indexed
+
+NPY_NO_EXPORT void
+USHORT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    OUTPUT_LOOP {
+        /* store 1 in the npy_ushort element op1 currently points at */
+        ((npy_ushort *)op1)[0] = 1;
+    }
+}
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+USHORT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] += value[k] on npy_ushort; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_ushort *target = (npy_ushort *)(base + bstep * pos);
+        *target += *(const npy_ushort *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+USHORT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] -= value[k] on npy_ushort; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_ushort *target = (npy_ushort *)(base + bstep * pos);
+        *target -= *(const npy_ushort *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+USHORT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] *= value[k] on npy_ushort; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++, idx_ptr += istep, val_ptr += vstep) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_ushort *target = (npy_ushort *)(base + bstep * pos);
+        /* Multiply as unsigned int: after promotion to signed int the product may overflow. */
+        *target = (npy_ushort)((unsigned int)*target *
+                               (unsigned int)*(const npy_ushort *)val_ptr);
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+USHORT_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] &= value[k] on npy_ushort; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_ushort *target = (npy_ushort *)(base + bstep * pos);
+        *target &= *(const npy_ushort *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+USHORT_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each index k: base[index[k]] |= value[k] on npy_ushort; always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp bstep = steps[0], istep = steps[1], vstep = steps[2];
+    const npy_intp extent = steps[3];  /* added to negative indices to wrap them */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; k++) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_ushort *target = (npy_ushort *)(base + bstep * pos);
+        *target |= *(const npy_ushort *)val_ptr;
+        idx_ptr += istep;
+        val_ptr += vstep;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+USHORT_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] ^ value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_ushort *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_ushort *)(base + elem_stride * pos);
+        *slot = *slot ^ *(npy_ushort *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+
+NPY_NO_EXPORT void
+USHORT_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Element-wise npy_ushort power by binary exponentiation: the base is
+     * squared once per remaining exponent bit and multiplied into the result
+     * for every set bit.  The negative-exponent check below is compiled out
+     * (#if 0) for this type.
+     */
+    BINARY_LOOP {
+        npy_ushort in1 = *(npy_ushort *)ip1;
+        npy_ushort in2 = *(npy_ushort *)ip2;
+        npy_ushort out;
+
+#if 0
+        if (in2 < 0) {
+            NPY_ALLOW_C_API_DEF
+            NPY_ALLOW_C_API;
+            PyErr_SetString(PyExc_ValueError,
+                    "Integers to negative integer powers are not allowed.");
+            NPY_DISABLE_C_API;
+            return;
+        }
+#endif
+        /* x**0 is 1. */
+        if (in2 == 0) {
+            *((npy_ushort *)op1) = 1;
+            continue;
+        }
+        /* 1**y is 1. */
+        if (in1 == 1) {
+            *((npy_ushort *)op1) = 1;
+            continue;
+        }
+
+        out = in2 & 1 ? in1 : 1;
+        in2 >>= 1;
+        while (in2 > 0) {
+            /*
+             * Multiply in unsigned int.  A plain `in1 *= in1` promotes both
+             * operands to signed int wherever int is wider than npy_ushort,
+             * and a product above INT_MAX (e.g. 65535 * 65535, assuming a
+             * 16-bit npy_ushort and 32-bit int) is signed overflow, which is
+             * undefined behaviour.  Unsigned arithmetic wraps, and the cast
+             * back to npy_ushort keeps the low-order bits.
+             */
+            in1 = (npy_ushort)((unsigned int)in1 * (unsigned int)in1);
+            if (in2 & 1) {
+                out = (npy_ushort)((unsigned int)out * (unsigned int)in1);
+            }
+            in2 >>= 1;
+        }
+        *((npy_ushort *) op1) = out;
+    }
+}
+
+#line 426
+
+/*
+ * Alias the INT floor_divide, fmax and fmin loops (and their _indexed
+ * variants) to the divide, maximum and minimum loops respectively.
+ */
+#define INT_floor_divide INT_divide
+#define INT_floor_divide_indexed INT_divide_indexed
+#define INT_fmax INT_maximum
+#define INT_fmax_indexed INT_maximum_indexed
+#define INT_fmin INT_minimum
+#define INT_fmin_indexed INT_minimum_indexed
+
+NPY_NO_EXPORT void
+INT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    /* Write the constant 1 through op1 on every OUTPUT_LOOP iteration. */
+    OUTPUT_LOOP {
+        npy_int *dst = (npy_int *)op1;
+
+        *dst = 1;
+    }
+}
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+INT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] + value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_int *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_int *)(base + elem_stride * pos);
+        *slot = *slot + *(npy_int *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+INT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] - value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_int *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_int *)(base + elem_stride * pos);
+        *slot = *slot - *(npy_int *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+INT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] * value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_int *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_int *)(base + elem_stride * pos);
+        *slot = *slot * *(npy_int *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+INT_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] & value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_int *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_int *)(base + elem_stride * pos);
+        *slot = *slot & *(npy_int *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+INT_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] | value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_int *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_int *)(base + elem_stride * pos);
+        *slot = *slot | *(npy_int *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+INT_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] ^ value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_int *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_int *)(base + elem_stride * pos);
+        *slot = *slot ^ *(npy_int *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+
+NPY_NO_EXPORT void
+INT_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Element-wise npy_int power by binary exponentiation: the base is
+     * squared once per remaining exponent bit and multiplied into the result
+     * for every set bit.  A negative exponent sets a Python ValueError and
+     * returns from the function without processing further elements.
+     */
+    BINARY_LOOP {
+        npy_int base = *(npy_int *)ip1;
+        npy_int exponent = *(npy_int *)ip2;
+        npy_int result;
+
+#if 1
+        if (exponent < 0) {
+            NPY_ALLOW_C_API_DEF
+            NPY_ALLOW_C_API;
+            PyErr_SetString(PyExc_ValueError,
+                    "Integers to negative integer powers are not allowed.");
+            NPY_DISABLE_C_API;
+            return;
+        }
+#endif
+        /* x**0 and 1**y are both 1; no multiplication needed. */
+        if (exponent == 0 || base == 1) {
+            *((npy_int *)op1) = 1;
+            continue;
+        }
+
+        /* Seed from the lowest exponent bit, then consume the higher bits. */
+        result = (exponent & 1) ? base : 1;
+        for (exponent >>= 1; exponent > 0; exponent >>= 1) {
+            base *= base;
+            if (exponent & 1) {
+                result *= base;
+            }
+        }
+        *((npy_int *)op1) = result;
+    }
+}
+
+#line 426
+
+/*
+ * Alias the UINT floor_divide, fmax and fmin loops (and their _indexed
+ * variants) to the divide, maximum and minimum loops respectively.
+ */
+#define UINT_floor_divide UINT_divide
+#define UINT_floor_divide_indexed UINT_divide_indexed
+#define UINT_fmax UINT_maximum
+#define UINT_fmax_indexed UINT_maximum_indexed
+#define UINT_fmin UINT_minimum
+#define UINT_fmin_indexed UINT_minimum_indexed
+
+NPY_NO_EXPORT void
+UINT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    /* Write the constant 1 through op1 on every OUTPUT_LOOP iteration. */
+    OUTPUT_LOOP {
+        npy_uint *dst = (npy_uint *)op1;
+
+        *dst = 1;
+    }
+}
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+UINT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] + value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_uint *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_uint *)(base + elem_stride * pos);
+        *slot = *slot + *(npy_uint *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+UINT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] - value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_uint *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_uint *)(base + elem_stride * pos);
+        *slot = *slot - *(npy_uint *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+UINT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] * value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_uint *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_uint *)(base + elem_stride * pos);
+        *slot = *slot * *(npy_uint *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+UINT_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] & value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_uint *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_uint *)(base + elem_stride * pos);
+        *slot = *slot & *(npy_uint *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+UINT_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] | value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_uint *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_uint *)(base + elem_stride * pos);
+        *slot = *slot | *(npy_uint *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+UINT_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] ^ value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_uint *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_uint *)(base + elem_stride * pos);
+        *slot = *slot ^ *(npy_uint *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+
+NPY_NO_EXPORT void
+UINT_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Element-wise npy_uint power by binary exponentiation: the base is
+     * squared once per remaining exponent bit and multiplied into the result
+     * for every set bit.  The negative-exponent check below is compiled out
+     * (#if 0) for this type.
+     */
+    BINARY_LOOP {
+        npy_uint base = *(npy_uint *)ip1;
+        npy_uint exponent = *(npy_uint *)ip2;
+        npy_uint result;
+
+#if 0
+        if (exponent < 0) {
+            NPY_ALLOW_C_API_DEF
+            NPY_ALLOW_C_API;
+            PyErr_SetString(PyExc_ValueError,
+                    "Integers to negative integer powers are not allowed.");
+            NPY_DISABLE_C_API;
+            return;
+        }
+#endif
+        /* x**0 and 1**y are both 1; no multiplication needed. */
+        if (exponent == 0 || base == 1) {
+            *((npy_uint *)op1) = 1;
+            continue;
+        }
+
+        /* Seed from the lowest exponent bit, then consume the higher bits. */
+        result = (exponent & 1) ? base : 1;
+        for (exponent >>= 1; exponent > 0; exponent >>= 1) {
+            base *= base;
+            if (exponent & 1) {
+                result *= base;
+            }
+        }
+        *((npy_uint *)op1) = result;
+    }
+}
+
+#line 426
+
+/*
+ * Alias the LONG floor_divide, fmax and fmin loops (and their _indexed
+ * variants) to the divide, maximum and minimum loops respectively.
+ */
+#define LONG_floor_divide LONG_divide
+#define LONG_floor_divide_indexed LONG_divide_indexed
+#define LONG_fmax LONG_maximum
+#define LONG_fmax_indexed LONG_maximum_indexed
+#define LONG_fmin LONG_minimum
+#define LONG_fmin_indexed LONG_minimum_indexed
+
+NPY_NO_EXPORT void
+LONG__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    /* Write the constant 1 through op1 on every OUTPUT_LOOP iteration. */
+    OUTPUT_LOOP {
+        npy_long *dst = (npy_long *)op1;
+
+        *dst = 1;
+    }
+}
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+LONG_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] + value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_long *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_long *)(base + elem_stride * pos);
+        *slot = *slot + *(npy_long *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+LONG_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] - value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_long *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_long *)(base + elem_stride * pos);
+        *slot = *slot - *(npy_long *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+LONG_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] * value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_long *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_long *)(base + elem_stride * pos);
+        *slot = *slot * *(npy_long *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+LONG_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] & value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_long *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_long *)(base + elem_stride * pos);
+        *slot = *slot & *(npy_long *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+LONG_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] | value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_long *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_long *)(base + elem_stride * pos);
+        *slot = *slot | *(npy_long *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+LONG_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] ^ value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_long *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_long *)(base + elem_stride * pos);
+        *slot = *slot ^ *(npy_long *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+
+NPY_NO_EXPORT void
+LONG_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Element-wise npy_long power by binary exponentiation: the base is
+     * squared once per remaining exponent bit and multiplied into the result
+     * for every set bit.  A negative exponent sets a Python ValueError and
+     * returns from the function without processing further elements.
+     */
+    BINARY_LOOP {
+        npy_long base = *(npy_long *)ip1;
+        npy_long exponent = *(npy_long *)ip2;
+        npy_long result;
+
+#if 1
+        if (exponent < 0) {
+            NPY_ALLOW_C_API_DEF
+            NPY_ALLOW_C_API;
+            PyErr_SetString(PyExc_ValueError,
+                    "Integers to negative integer powers are not allowed.");
+            NPY_DISABLE_C_API;
+            return;
+        }
+#endif
+        /* x**0 and 1**y are both 1; no multiplication needed. */
+        if (exponent == 0 || base == 1) {
+            *((npy_long *)op1) = 1;
+            continue;
+        }
+
+        /* Seed from the lowest exponent bit, then consume the higher bits. */
+        result = (exponent & 1) ? base : 1;
+        for (exponent >>= 1; exponent > 0; exponent >>= 1) {
+            base *= base;
+            if (exponent & 1) {
+                result *= base;
+            }
+        }
+        *((npy_long *)op1) = result;
+    }
+}
+
+#line 426
+
+/*
+ * Alias the ULONG floor_divide, fmax and fmin loops (and their _indexed
+ * variants) to the divide, maximum and minimum loops respectively.
+ */
+#define ULONG_floor_divide ULONG_divide
+#define ULONG_floor_divide_indexed ULONG_divide_indexed
+#define ULONG_fmax ULONG_maximum
+#define ULONG_fmax_indexed ULONG_maximum_indexed
+#define ULONG_fmin ULONG_minimum
+#define ULONG_fmin_indexed ULONG_minimum_indexed
+
+NPY_NO_EXPORT void
+ULONG__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    /* Write the constant 1 through op1 on every OUTPUT_LOOP iteration. */
+    OUTPUT_LOOP {
+        npy_ulong *dst = (npy_ulong *)op1;
+
+        *dst = 1;
+    }
+}
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+ULONG_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] + value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_ulong *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_ulong *)(base + elem_stride * pos);
+        *slot = *slot + *(npy_ulong *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+ULONG_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] - value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_ulong *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_ulong *)(base + elem_stride * pos);
+        *slot = *slot - *(npy_ulong *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+ULONG_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] * value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_ulong *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_ulong *)(base + elem_stride * pos);
+        *slot = *slot * *(npy_ulong *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+ULONG_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] & value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_ulong *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_ulong *)(base + elem_stride * pos);
+        *slot = *slot & *(npy_ulong *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+ULONG_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] | value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_ulong *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_ulong *)(base + elem_stride * pos);
+        *slot = *slot | *(npy_ulong *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+ULONG_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] ^ value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_ulong *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_ulong *)(base + elem_stride * pos);
+        *slot = *slot ^ *(npy_ulong *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+
+NPY_NO_EXPORT void
+ULONG_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Element-wise npy_ulong power by binary exponentiation: the base is
+     * squared once per remaining exponent bit and multiplied into the result
+     * for every set bit.  The negative-exponent check below is compiled out
+     * (#if 0) for this type.
+     */
+    BINARY_LOOP {
+        npy_ulong base = *(npy_ulong *)ip1;
+        npy_ulong exponent = *(npy_ulong *)ip2;
+        npy_ulong result;
+
+#if 0
+        if (exponent < 0) {
+            NPY_ALLOW_C_API_DEF
+            NPY_ALLOW_C_API;
+            PyErr_SetString(PyExc_ValueError,
+                    "Integers to negative integer powers are not allowed.");
+            NPY_DISABLE_C_API;
+            return;
+        }
+#endif
+        /* x**0 and 1**y are both 1; no multiplication needed. */
+        if (exponent == 0 || base == 1) {
+            *((npy_ulong *)op1) = 1;
+            continue;
+        }
+
+        /* Seed from the lowest exponent bit, then consume the higher bits. */
+        result = (exponent & 1) ? base : 1;
+        for (exponent >>= 1; exponent > 0; exponent >>= 1) {
+            base *= base;
+            if (exponent & 1) {
+                result *= base;
+            }
+        }
+        *((npy_ulong *)op1) = result;
+    }
+}
+
+#line 426
+
+/*
+ * Alias the LONGLONG floor_divide, fmax and fmin loops (and their _indexed
+ * variants) to the divide, maximum and minimum loops respectively.
+ */
+#define LONGLONG_floor_divide LONGLONG_divide
+#define LONGLONG_floor_divide_indexed LONGLONG_divide_indexed
+#define LONGLONG_fmax LONGLONG_maximum
+#define LONGLONG_fmax_indexed LONGLONG_maximum_indexed
+#define LONGLONG_fmin LONGLONG_minimum
+#define LONGLONG_fmin_indexed LONGLONG_minimum_indexed
+
+NPY_NO_EXPORT void
+LONGLONG__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    /* Write the constant 1 through op1 on every OUTPUT_LOOP iteration. */
+    OUTPUT_LOOP {
+        npy_longlong *dst = (npy_longlong *)op1;
+
+        *dst = 1;
+    }
+}
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+LONGLONG_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] + value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_longlong *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_longlong *)(base + elem_stride * pos);
+        *slot = *slot + *(npy_longlong *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+LONGLONG_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] - value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_longlong *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_longlong *)(base + elem_stride * pos);
+        *slot = *slot - *(npy_longlong *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+LONGLONG_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] * value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_longlong *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_longlong *)(base + elem_stride * pos);
+        *slot = *slot * *(npy_longlong *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+LONGLONG_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] & value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_longlong *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_longlong *)(base + elem_stride * pos);
+        *slot = *slot & *(npy_longlong *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+LONGLONG_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] | value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_longlong *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_longlong *)(base + elem_stride * pos);
+        *slot = *slot | *(npy_longlong *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+LONGLONG_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair: target[index] = target[index] ^ value. */
+    char *base = args[0];
+    char *idx_cursor = args[1];
+    char *val_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_intp pos = *(npy_intp *)idx_cursor;
+        npy_longlong *slot;
+
+        pos = (pos < 0) ? pos + wrap : pos;
+        slot = (npy_longlong *)(base + elem_stride * pos);
+        *slot = *slot ^ *(npy_longlong *)val_cursor;
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+
+NPY_NO_EXPORT void
+LONGLONG_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Element-wise integer power by binary exponentiation. A negative
+     * exponent sets ValueError and returns without finishing the loop.
+     * Products are formed in npy_ulonglong so that overflow wraps instead of
+     * being signed-overflow undefined behaviour.
+     */
+    BINARY_LOOP {
+        const npy_longlong in1 = *(npy_longlong *)ip1;
+        npy_longlong in2 = *(npy_longlong *)ip2;
+        npy_ulonglong base = (npy_ulonglong)in1, out;
+
+#if 1
+        if (in2 < 0) {
+            NPY_ALLOW_C_API_DEF
+            NPY_ALLOW_C_API;
+            PyErr_SetString(PyExc_ValueError,
+                    "Integers to negative integer powers are not allowed.");
+            NPY_DISABLE_C_API;
+            return;
+        }
+#endif
+        if (in2 == 0 || in1 == 1) {
+            *((npy_longlong *)op1) = 1;
+            continue;
+        }
+        /* Seed with the lowest exponent bit, then square per remaining bit. */
+        out = (in2 & 1) ? base : 1;
+        for (in2 >>= 1; in2 > 0; in2 >>= 1) {
+            base *= base;
+            if (in2 & 1) {
+                out *= base;
+            }
+        }
+        *((npy_longlong *) op1) = (npy_longlong)out;
+    }
+}
+
+#line 426
+/* Aliases: floor_divide -> divide, fmax -> maximum, fmin -> minimum (plus _indexed). */
+#define ULONGLONG_floor_divide ULONGLONG_divide
+#define ULONGLONG_floor_divide_indexed ULONGLONG_divide_indexed
+#define ULONGLONG_fmax ULONGLONG_maximum
+#define ULONGLONG_fmax_indexed ULONGLONG_maximum_indexed
+#define ULONGLONG_fmin ULONGLONG_minimum
+#define ULONGLONG_fmin_indexed ULONGLONG_minimum_indexed
+
+NPY_NO_EXPORT void
+ULONGLONG__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Stores the constant 1 into each output element visited by OUTPUT_LOOP. */
+    OUTPUT_LOOP {
+        *((npy_ulonglong *)op1) = (npy_ulonglong)1;
+    }
+}
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+ULONGLONG_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* In-place `a[idx] += b` for dimensions[0] (index, operand) pairs read from
+     * args[1] / args[2]. A negative index has steps[3] added to it; no range
+     * check is made here. */
+    char *base = args[0];
+    char *idx_ptr = args[1], *operand_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1];
+    const npy_intp operand_stride = steps[2], wrap = steps[3];
+    npy_intp remaining;
+    for (remaining = dimensions[0]; remaining > 0; remaining--) {
+        const npy_intp raw = *(npy_intp *)idx_ptr;
+        const npy_intp idx = (raw < 0) ? raw + wrap : raw;
+        npy_ulonglong *slot = (npy_ulonglong *)(base + base_stride * idx);
+        *slot += *(npy_ulonglong *)operand_ptr;
+        idx_ptr += idx_stride;
+        operand_ptr += operand_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+ULONGLONG_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* In-place `a[idx] -= b` for dimensions[0] (index, operand) pairs read from
+     * args[1] / args[2]. A negative index has steps[3] added to it; no range
+     * check is made here. */
+    char *base = args[0];
+    char *idx_ptr = args[1], *operand_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1];
+    const npy_intp operand_stride = steps[2], wrap = steps[3];
+    npy_intp remaining;
+    for (remaining = dimensions[0]; remaining > 0; remaining--) {
+        const npy_intp raw = *(npy_intp *)idx_ptr;
+        const npy_intp idx = (raw < 0) ? raw + wrap : raw;
+        npy_ulonglong *slot = (npy_ulonglong *)(base + base_stride * idx);
+        *slot -= *(npy_ulonglong *)operand_ptr;
+        idx_ptr += idx_stride;
+        operand_ptr += operand_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+ULONGLONG_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* In-place `a[idx] *= b` for dimensions[0] (index, operand) pairs read from
+     * args[1] / args[2]. A negative index has steps[3] added to it; no range
+     * check is made here. */
+    char *base = args[0];
+    char *idx_ptr = args[1], *operand_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1];
+    const npy_intp operand_stride = steps[2], wrap = steps[3];
+    npy_intp remaining;
+    for (remaining = dimensions[0]; remaining > 0; remaining--) {
+        const npy_intp raw = *(npy_intp *)idx_ptr;
+        const npy_intp idx = (raw < 0) ? raw + wrap : raw;
+        npy_ulonglong *slot = (npy_ulonglong *)(base + base_stride * idx);
+        *slot *= *(npy_ulonglong *)operand_ptr;
+        idx_ptr += idx_stride;
+        operand_ptr += operand_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+ULONGLONG_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* In-place `a[idx] &= b` for dimensions[0] (index, operand) pairs read from
+     * args[1] / args[2]. A negative index has steps[3] added to it; no range
+     * check is made here. */
+    char *base = args[0];
+    char *idx_ptr = args[1], *operand_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1];
+    const npy_intp operand_stride = steps[2], wrap = steps[3];
+    npy_intp remaining;
+    for (remaining = dimensions[0]; remaining > 0; remaining--) {
+        const npy_intp raw = *(npy_intp *)idx_ptr;
+        const npy_intp idx = (raw < 0) ? raw + wrap : raw;
+        npy_ulonglong *slot = (npy_ulonglong *)(base + base_stride * idx);
+        *slot &= *(npy_ulonglong *)operand_ptr;
+        idx_ptr += idx_stride;
+        operand_ptr += operand_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+ULONGLONG_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* In-place `a[idx] |= b` for dimensions[0] (index, operand) pairs read from
+     * args[1] / args[2]. A negative index has steps[3] added to it; no range
+     * check is made here. */
+    char *base = args[0];
+    char *idx_ptr = args[1], *operand_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1];
+    const npy_intp operand_stride = steps[2], wrap = steps[3];
+    npy_intp remaining;
+    for (remaining = dimensions[0]; remaining > 0; remaining--) {
+        const npy_intp raw = *(npy_intp *)idx_ptr;
+        const npy_intp idx = (raw < 0) ? raw + wrap : raw;
+        npy_ulonglong *slot = (npy_ulonglong *)(base + base_stride * idx);
+        *slot |= *(npy_ulonglong *)operand_ptr;
+        idx_ptr += idx_stride;
+        operand_ptr += operand_stride;
+    }
+    return 0;
+}
+
+#line 446
+
+NPY_NO_EXPORT NPY_GCC_OPT_3 int
+ULONGLONG_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+                      char **args, npy_intp const *dimensions, npy_intp const *steps,
+                      void *NPY_UNUSED(func))
+{
+    /* In-place `a[idx] ^= b` for dimensions[0] (index, operand) pairs read from
+     * args[1] / args[2]. A negative index has steps[3] added to it; no range
+     * check is made here. */
+    char *base = args[0];
+    char *idx_ptr = args[1], *operand_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1];
+    const npy_intp operand_stride = steps[2], wrap = steps[3];
+    npy_intp remaining;
+    for (remaining = dimensions[0]; remaining > 0; remaining--) {
+        const npy_intp raw = *(npy_intp *)idx_ptr;
+        const npy_intp idx = (raw < 0) ? raw + wrap : raw;
+        npy_ulonglong *slot = (npy_ulonglong *)(base + base_stride * idx);
+        *slot ^= *(npy_ulonglong *)operand_ptr;
+        idx_ptr += idx_stride;
+        operand_ptr += operand_stride;
+    }
+    return 0;
+}
+
+
+NPY_NO_EXPORT void
+ULONGLONG_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Element-wise unsigned power by binary exponentiation (square the base
+     * once per exponent bit, multiply it in when that bit is set).
+     */
+    BINARY_LOOP {
+        npy_ulonglong base = *(npy_ulonglong *)ip1;
+        npy_ulonglong exponent = *(npy_ulonglong *)ip2;
+        npy_ulonglong result;
+
+#if 0
+        if (exponent < 0) {
+            NPY_ALLOW_C_API_DEF
+            NPY_ALLOW_C_API;
+            PyErr_SetString(PyExc_ValueError,
+                    "Integers to negative integer powers are not allowed.");
+            NPY_DISABLE_C_API;
+            return;
+        }
+#endif
+        /* x**0 and 1**y are both 1. */
+        if (exponent == 0 || base == 1) {
+            *((npy_ulonglong *)op1) = 1;
+            continue;
+        }
+
+        /* Seed with the lowest exponent bit, then square per remaining bit. */
+        result = (exponent & 1) ? base : 1;
+        for (exponent >>= 1; exponent > 0; exponent >>= 1) {
+            base *= base;
+            if (exponent & 1) {
+                result *= base;
+            }
+        }
+        *((npy_ulonglong *) op1) = result;
+    }
+}
+
+
+#line 518
+#line 521
+NPY_NO_EXPORT void
+BYTE_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_gcd(ip1, ip2) on npy_byte operands. */
+    BINARY_LOOP {
+        const npy_byte lhs = *(npy_byte *)ip1, rhs = *(npy_byte *)ip2;
+        *((npy_byte *)op1) = npy_gcd(lhs, rhs);
+    }
+}
+
+#line 521
+NPY_NO_EXPORT void
+BYTE_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_lcm(ip1, ip2) on npy_byte operands. */
+    BINARY_LOOP {
+        const npy_byte lhs = *(npy_byte *)ip1, rhs = *(npy_byte *)ip2;
+        *((npy_byte *)op1) = npy_lcm(lhs, rhs);
+    }
+}
+
+
+#line 518
+#line 521
+NPY_NO_EXPORT void
+SHORT_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_gcd(ip1, ip2) on npy_short operands. */
+    BINARY_LOOP {
+        const npy_short lhs = *(npy_short *)ip1, rhs = *(npy_short *)ip2;
+        *((npy_short *)op1) = npy_gcd(lhs, rhs);
+    }
+}
+
+#line 521
+NPY_NO_EXPORT void
+SHORT_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_lcm(ip1, ip2) on npy_short operands. */
+    BINARY_LOOP {
+        const npy_short lhs = *(npy_short *)ip1, rhs = *(npy_short *)ip2;
+        *((npy_short *)op1) = npy_lcm(lhs, rhs);
+    }
+}
+
+
+#line 518
+#line 521
+NPY_NO_EXPORT void
+INT_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_gcd(ip1, ip2) on npy_int operands. */
+    BINARY_LOOP {
+        const npy_int lhs = *(npy_int *)ip1, rhs = *(npy_int *)ip2;
+        *((npy_int *)op1) = npy_gcd(lhs, rhs);
+    }
+}
+
+#line 521
+NPY_NO_EXPORT void
+INT_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_lcm(ip1, ip2) on npy_int operands. */
+    BINARY_LOOP {
+        const npy_int lhs = *(npy_int *)ip1, rhs = *(npy_int *)ip2;
+        *((npy_int *)op1) = npy_lcm(lhs, rhs);
+    }
+}
+
+
+#line 518
+#line 521
+NPY_NO_EXPORT void
+LONG_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_gcdl(ip1, ip2) on npy_long operands. */
+    BINARY_LOOP {
+        const npy_long lhs = *(npy_long *)ip1, rhs = *(npy_long *)ip2;
+        *((npy_long *)op1) = npy_gcdl(lhs, rhs);
+    }
+}
+
+#line 521
+NPY_NO_EXPORT void
+LONG_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_lcml(ip1, ip2) on npy_long operands. */
+    BINARY_LOOP {
+        const npy_long lhs = *(npy_long *)ip1, rhs = *(npy_long *)ip2;
+        *((npy_long *)op1) = npy_lcml(lhs, rhs);
+    }
+}
+
+
+#line 518
+#line 521
+NPY_NO_EXPORT void
+LONGLONG_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_gcdll(ip1, ip2) on npy_longlong operands. */
+    BINARY_LOOP {
+        const npy_longlong lhs = *(npy_longlong *)ip1, rhs = *(npy_longlong *)ip2;
+        *((npy_longlong *)op1) = npy_gcdll(lhs, rhs);
+    }
+}
+
+#line 521
+NPY_NO_EXPORT void
+LONGLONG_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_lcmll(ip1, ip2) on npy_longlong operands. */
+    BINARY_LOOP {
+        const npy_longlong lhs = *(npy_longlong *)ip1, rhs = *(npy_longlong *)ip2;
+        *((npy_longlong *)op1) = npy_lcmll(lhs, rhs);
+    }
+}
+
+
+
+#line 538
+#line 541
+NPY_NO_EXPORT void
+UBYTE_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_gcdu(ip1, ip2) on npy_ubyte operands. */
+    BINARY_LOOP {
+        const npy_ubyte lhs = *(npy_ubyte *)ip1, rhs = *(npy_ubyte *)ip2;
+        *((npy_ubyte *)op1) = npy_gcdu(lhs, rhs);
+    }
+}
+
+#line 541
+NPY_NO_EXPORT void
+UBYTE_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_lcmu(ip1, ip2) on npy_ubyte operands. */
+    BINARY_LOOP {
+        const npy_ubyte lhs = *(npy_ubyte *)ip1, rhs = *(npy_ubyte *)ip2;
+        *((npy_ubyte *)op1) = npy_lcmu(lhs, rhs);
+    }
+}
+
+
+#line 538
+#line 541
+NPY_NO_EXPORT void
+USHORT_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_gcdu(ip1, ip2) on npy_ushort operands. */
+    BINARY_LOOP {
+        const npy_ushort lhs = *(npy_ushort *)ip1, rhs = *(npy_ushort *)ip2;
+        *((npy_ushort *)op1) = npy_gcdu(lhs, rhs);
+    }
+}
+
+#line 541
+NPY_NO_EXPORT void
+USHORT_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_lcmu(ip1, ip2) on npy_ushort operands. */
+    BINARY_LOOP {
+        const npy_ushort lhs = *(npy_ushort *)ip1, rhs = *(npy_ushort *)ip2;
+        *((npy_ushort *)op1) = npy_lcmu(lhs, rhs);
+    }
+}
+
+
+#line 538
+#line 541
+NPY_NO_EXPORT void
+UINT_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_gcdu(ip1, ip2) on npy_uint operands. */
+    BINARY_LOOP {
+        const npy_uint lhs = *(npy_uint *)ip1, rhs = *(npy_uint *)ip2;
+        *((npy_uint *)op1) = npy_gcdu(lhs, rhs);
+    }
+}
+
+#line 541
+NPY_NO_EXPORT void
+UINT_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_lcmu(ip1, ip2) on npy_uint operands. */
+    BINARY_LOOP {
+        const npy_uint lhs = *(npy_uint *)ip1, rhs = *(npy_uint *)ip2;
+        *((npy_uint *)op1) = npy_lcmu(lhs, rhs);
+    }
+}
+
+
+#line 538
+#line 541
+NPY_NO_EXPORT void
+ULONG_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_gcdul(ip1, ip2) on npy_ulong operands. */
+    BINARY_LOOP {
+        const npy_ulong lhs = *(npy_ulong *)ip1, rhs = *(npy_ulong *)ip2;
+        *((npy_ulong *)op1) = npy_gcdul(lhs, rhs);
+    }
+}
+
+#line 541
+NPY_NO_EXPORT void
+ULONG_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_lcmul(ip1, ip2) on npy_ulong operands. */
+    BINARY_LOOP {
+        const npy_ulong lhs = *(npy_ulong *)ip1, rhs = *(npy_ulong *)ip2;
+        *((npy_ulong *)op1) = npy_lcmul(lhs, rhs);
+    }
+}
+
+
+#line 538
+#line 541
+NPY_NO_EXPORT void
+ULONGLONG_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_gcdull(ip1, ip2) on npy_ulonglong operands. */
+    BINARY_LOOP {
+        const npy_ulonglong lhs = *(npy_ulonglong *)ip1, rhs = *(npy_ulonglong *)ip2;
+        *((npy_ulonglong *)op1) = npy_gcdull(lhs, rhs);
+    }
+}
+
+#line 541
+NPY_NO_EXPORT void
+ULONGLONG_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise op1 = npy_lcmull(ip1, ip2) on npy_ulonglong operands. */
+    BINARY_LOOP {
+        const npy_ulonglong lhs = *(npy_ulonglong *)ip1, rhs = *(npy_ulonglong *)ip2;
+        *((npy_ulonglong *)op1) = npy_lcmull(lhs, rhs);
+    }
+}
+
+
+
+
+/*
+ * NOTE: It may be nice to vectorize these, OTOH, these are still faster
+ *       than the cast we used to do.
+ */
+
+#line 563
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Mixed compare: unsigned lhs == signed rhs. A negative rhs is compared
+     * against 0; otherwise rhs is cast to unsigned and compared directly.
+     */
+    BINARY_LOOP {
+        const npy_ulonglong lhs = *(npy_ulonglong *)ip1;
+        const npy_longlong rhs = *(npy_longlong *)ip2;
+        *(npy_bool *)op1 = (rhs < 0) ? (0 == rhs)
+                                     : (lhs == (npy_ulonglong)rhs);
+    }
+}
+
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Mixed compare: signed lhs == unsigned rhs. A negative lhs is compared
+     * against 0; otherwise lhs is cast to unsigned and compared directly.
+     */
+    BINARY_LOOP {
+        const npy_longlong lhs = *(npy_longlong *)ip1;
+        const npy_ulonglong rhs = *(npy_ulonglong *)ip2;
+        *(npy_bool *)op1 = (lhs < 0) ? (lhs == 0)
+                                     : ((npy_ulonglong)lhs == rhs);
+    }
+}
+
+#line 563
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Mixed compare: unsigned lhs != signed rhs. A negative rhs is compared
+     * against 0; otherwise rhs is cast to unsigned and compared directly.
+     */
+    BINARY_LOOP {
+        const npy_ulonglong lhs = *(npy_ulonglong *)ip1;
+        const npy_longlong rhs = *(npy_longlong *)ip2;
+        *(npy_bool *)op1 = (rhs < 0) ? (0 != rhs)
+                                     : (lhs != (npy_ulonglong)rhs);
+    }
+}
+
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Mixed compare: signed lhs != unsigned rhs. A negative lhs is compared
+     * against 0; otherwise lhs is cast to unsigned and compared directly.
+     */
+    BINARY_LOOP {
+        const npy_longlong lhs = *(npy_longlong *)ip1;
+        const npy_ulonglong rhs = *(npy_ulonglong *)ip2;
+        *(npy_bool *)op1 = (lhs < 0) ? (lhs != 0)
+                                     : ((npy_ulonglong)lhs != rhs);
+    }
+}
+
+#line 563
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Mixed compare: unsigned lhs < signed rhs. A negative rhs is compared
+     * against 0; otherwise rhs is cast to unsigned and compared directly.
+     */
+    BINARY_LOOP {
+        const npy_ulonglong lhs = *(npy_ulonglong *)ip1;
+        const npy_longlong rhs = *(npy_longlong *)ip2;
+        *(npy_bool *)op1 = (rhs < 0) ? (0 < rhs)
+                                     : (lhs < (npy_ulonglong)rhs);
+    }
+}
+
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Mixed compare: signed lhs < unsigned rhs. A negative lhs is compared
+     * against 0; otherwise lhs is cast to unsigned and compared directly.
+     */
+    BINARY_LOOP {
+        const npy_longlong lhs = *(npy_longlong *)ip1;
+        const npy_ulonglong rhs = *(npy_ulonglong *)ip2;
+        *(npy_bool *)op1 = (lhs < 0) ? (lhs < 0)
+                                     : ((npy_ulonglong)lhs < rhs);
+    }
+}
+
+#line 563
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Mixed compare: unsigned lhs <= signed rhs. A negative rhs is compared
+     * against 0; otherwise rhs is cast to unsigned and compared directly.
+     */
+    BINARY_LOOP {
+        const npy_ulonglong lhs = *(npy_ulonglong *)ip1;
+        const npy_longlong rhs = *(npy_longlong *)ip2;
+        *(npy_bool *)op1 = (rhs < 0) ? (0 <= rhs)
+                                     : (lhs <= (npy_ulonglong)rhs);
+    }
+}
+
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Mixed compare: signed lhs <= unsigned rhs. A negative lhs is compared
+     * against 0; otherwise lhs is cast to unsigned and compared directly.
+     */
+    BINARY_LOOP {
+        const npy_longlong lhs = *(npy_longlong *)ip1;
+        const npy_ulonglong rhs = *(npy_ulonglong *)ip2;
+        *(npy_bool *)op1 = (lhs < 0) ? (lhs <= 0)
+                                     : ((npy_ulonglong)lhs <= rhs);
+    }
+}
+
+#line 563
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Mixed compare: unsigned lhs > signed rhs. A negative rhs is compared
+     * against 0; otherwise rhs is cast to unsigned and compared directly.
+     */
+    BINARY_LOOP {
+        const npy_ulonglong lhs = *(npy_ulonglong *)ip1;
+        const npy_longlong rhs = *(npy_longlong *)ip2;
+        *(npy_bool *)op1 = (rhs < 0) ? (0 > rhs)
+                                     : (lhs > (npy_ulonglong)rhs);
+    }
+}
+
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Mixed compare: signed lhs > unsigned rhs. A negative lhs is compared
+     * against 0; otherwise lhs is cast to unsigned and compared directly.
+     */
+    BINARY_LOOP {
+        const npy_longlong lhs = *(npy_longlong *)ip1;
+        const npy_ulonglong rhs = *(npy_ulonglong *)ip2;
+        *(npy_bool *)op1 = (lhs < 0) ? (lhs > 0)
+                                     : ((npy_ulonglong)lhs > rhs);
+    }
+}
+
+#line 563
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Mixed compare: unsigned lhs >= signed rhs. A negative rhs is compared
+     * against 0; otherwise rhs is cast to unsigned and compared directly.
+     */
+    BINARY_LOOP {
+        const npy_ulonglong lhs = *(npy_ulonglong *)ip1;
+        const npy_longlong rhs = *(npy_longlong *)ip2;
+        *(npy_bool *)op1 = (rhs < 0) ? (0 >= rhs)
+                                     : (lhs >= (npy_ulonglong)rhs);
+    }
+}
+
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Mixed compare: signed lhs >= unsigned rhs. A negative lhs is compared
+     * against 0; otherwise lhs is cast to unsigned and compared directly.
+     */
+    BINARY_LOOP {
+        const npy_longlong lhs = *(npy_longlong *)ip1;
+        const npy_ulonglong rhs = *(npy_ulonglong *)ip2;
+        *(npy_bool *)op1 = (lhs < 0) ? (lhs >= 0)
+                                     : ((npy_ulonglong)lhs >= rhs);
+    }
+}
+
+
+
+/*
+ *****************************************************************************
+ **                           DATETIME LOOPS                                **
+ *****************************************************************************
+ */
+
+NPY_NO_EXPORT void
+TIMEDELTA_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Element-wise negation. NPY_DATETIME_NAT is written back unchanged
+     * rather than being negated.
+     */
+    UNARY_LOOP {
+        const npy_timedelta delta = *(npy_timedelta *)ip1;
+        *((npy_timedelta *)op1) = (delta == NPY_DATETIME_NAT) ? NPY_DATETIME_NAT
+                                                              : -delta;
+    }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Unary plus: each input element is stored to the output unchanged. */
+    UNARY_LOOP {
+        *((npy_timedelta *)op1) = +*(npy_timedelta *)ip1;
+    }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Element-wise absolute value; NPY_DATETIME_NAT is stored back unchanged. */
+    UNARY_LOOP {
+        const npy_timedelta delta = *(npy_timedelta *)ip1;
+        npy_timedelta magnitude = delta;
+        if (delta != NPY_DATETIME_NAT && delta < 0) {
+            magnitude = -delta;
+        }
+        *((npy_timedelta *)op1) = magnitude;
+    }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Element-wise sign as -1, 0 or 1; NPY_DATETIME_NAT gets no special case. */
+    UNARY_LOOP {
+        const npy_timedelta delta = *(npy_timedelta *)ip1;
+        *((npy_timedelta *)op1) = (delta > 0) - (delta < 0);
+    }
+}
+
+#line 651
+
+NPY_NO_EXPORT void
+DATETIME_isnat(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Output is true exactly where the element equals NPY_DATETIME_NAT. */
+    UNARY_LOOP {
+        *((npy_bool *)op1) = (*(npy_datetime *)ip1 == NPY_DATETIME_NAT);
+    }
+}
+
+NPY_NO_EXPORT void
+DATETIME_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Output is true exactly where the element differs from NPY_DATETIME_NAT. */
+    UNARY_LOOP {
+        *((npy_bool *)op1) = (*(npy_datetime *)ip1 != NPY_DATETIME_NAT);
+    }
+}
+
+NPY_NO_EXPORT void
+DATETIME__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Stores the constant 1 into each output element visited by OUTPUT_LOOP. */
+    OUTPUT_LOOP {
+        *((npy_datetime *)op1) = (npy_datetime)1;
+    }
+}
+
+#line 682
+NPY_NO_EXPORT void
+DATETIME_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* lhs == rhs, but always false when either operand is NPY_DATETIME_NAT. */
+    BINARY_LOOP {
+        const npy_datetime lhs = *(npy_datetime *)ip1;
+        const npy_datetime rhs = *(npy_datetime *)ip2;
+        const int has_nat = (lhs == NPY_DATETIME_NAT || rhs == NPY_DATETIME_NAT);
+        *((npy_bool *)op1) = (!has_nat && lhs == rhs);
+    }
+}
+
+#line 682
+NPY_NO_EXPORT void
+DATETIME_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* lhs > rhs, but always false when either operand is NPY_DATETIME_NAT. */
+    BINARY_LOOP {
+        const npy_datetime lhs = *(npy_datetime *)ip1;
+        const npy_datetime rhs = *(npy_datetime *)ip2;
+        const int has_nat = (lhs == NPY_DATETIME_NAT || rhs == NPY_DATETIME_NAT);
+        *((npy_bool *)op1) = (!has_nat && lhs > rhs);
+    }
+}
+
+#line 682
+NPY_NO_EXPORT void
+DATETIME_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* lhs >= rhs, but always false when either operand is NPY_DATETIME_NAT. */
+    BINARY_LOOP {
+        const npy_datetime lhs = *(npy_datetime *)ip1;
+        const npy_datetime rhs = *(npy_datetime *)ip2;
+        const int has_nat = (lhs == NPY_DATETIME_NAT || rhs == NPY_DATETIME_NAT);
+        *((npy_bool *)op1) = (!has_nat && lhs >= rhs);
+    }
+}
+
+#line 682
+NPY_NO_EXPORT void
+DATETIME_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* lhs < rhs, but always false when either operand is NPY_DATETIME_NAT. */
+    BINARY_LOOP {
+        const npy_datetime lhs = *(npy_datetime *)ip1;
+        const npy_datetime rhs = *(npy_datetime *)ip2;
+        const int has_nat = (lhs == NPY_DATETIME_NAT || rhs == NPY_DATETIME_NAT);
+        *((npy_bool *)op1) = (!has_nat && lhs < rhs);
+    }
+}
+
+#line 682
+NPY_NO_EXPORT void
+DATETIME_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* lhs <= rhs, but always false when either operand is NPY_DATETIME_NAT. */
+    BINARY_LOOP {
+        const npy_datetime lhs = *(npy_datetime *)ip1;
+        const npy_datetime rhs = *(npy_datetime *)ip2;
+        const int has_nat = (lhs == NPY_DATETIME_NAT || rhs == NPY_DATETIME_NAT);
+        *((npy_bool *)op1) = (!has_nat && lhs <= rhs);
+    }
+}
+
+
+NPY_NO_EXPORT void
+DATETIME_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* lhs != rhs, and always true when either operand is NPY_DATETIME_NAT. */
+    BINARY_LOOP {
+        const npy_datetime lhs = *(npy_datetime *)ip1;
+        const npy_datetime rhs = *(npy_datetime *)ip2;
+        const int has_nat = (lhs == NPY_DATETIME_NAT || rhs == NPY_DATETIME_NAT);
+        *((npy_bool *)op1) = (has_nat || lhs != rhs);
+    }
+}
+
+
+#line 712
+NPY_NO_EXPORT void
+DATETIME_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Element-wise maximum that propagates NaT: if either operand equals
+     * NPY_DATETIME_NAT the result is NPY_DATETIME_NAT.
+     */
+    BINARY_LOOP {
+        const npy_datetime lhs = *(npy_datetime *)ip1;
+        const npy_datetime rhs = *(npy_datetime *)ip2;
+        npy_datetime picked = (lhs > rhs) ? lhs : rhs;
+        if (lhs == NPY_DATETIME_NAT || rhs == NPY_DATETIME_NAT) {
+            picked = NPY_DATETIME_NAT;
+        }
+        *((npy_datetime *)op1) = picked;
+    }
+}
+
+
+#line 712
+NPY_NO_EXPORT void
+DATETIME_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Element-wise minimum that propagates NaT: if either operand equals
+     * NPY_DATETIME_NAT the result is NPY_DATETIME_NAT.
+     */
+    BINARY_LOOP {
+        const npy_datetime lhs = *(npy_datetime *)ip1;
+        const npy_datetime rhs = *(npy_datetime *)ip2;
+        npy_datetime picked = (lhs < rhs) ? lhs : rhs;
+        if (lhs == NPY_DATETIME_NAT || rhs == NPY_DATETIME_NAT) {
+            picked = NPY_DATETIME_NAT;
+        }
+        *((npy_datetime *)op1) = picked;
+    }
+}
+
+
+
+#line 736
+NPY_NO_EXPORT void
+DATETIME_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Maximum that skips NaT: a NaT operand yields the other operand, so the
+     * result is NPY_DATETIME_NAT only when both operands are. */
+    BINARY_LOOP {
+        const npy_datetime lhs = *(npy_datetime *)ip1, rhs = *(npy_datetime *)ip2;
+        npy_datetime picked = (lhs >= rhs) ? lhs : rhs;
+        if (lhs == NPY_DATETIME_NAT) {
+            picked = rhs;
+        }
+        else if (rhs == NPY_DATETIME_NAT) {
+            picked = lhs;
+        }
+        *((npy_datetime *)op1) = picked;
+    }
+}
+
+#line 736
+NPY_NO_EXPORT void
+DATETIME_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Minimum that skips NaT: a NaT operand yields the other operand, so the
+     * result is NPY_DATETIME_NAT only when both operands are. */
+    BINARY_LOOP {
+        const npy_datetime lhs = *(npy_datetime *)ip1, rhs = *(npy_datetime *)ip2;
+        npy_datetime picked = (lhs <= rhs) ? lhs : rhs;
+        if (lhs == NPY_DATETIME_NAT) {
+            picked = rhs;
+        }
+        else if (rhs == NPY_DATETIME_NAT) {
+            picked = lhs;
+        }
+        *((npy_datetime *)op1) = picked;
+    }
+}
+
+
+
+#line 651
+
+NPY_NO_EXPORT void
+TIMEDELTA_isnat(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Output is true exactly where the element equals NPY_DATETIME_NAT. */
+    UNARY_LOOP {
+        *((npy_bool *)op1) = (*(npy_timedelta *)ip1 == NPY_DATETIME_NAT);
+    }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Output is true exactly where the element differs from NPY_DATETIME_NAT. */
+    UNARY_LOOP {
+        *((npy_bool *)op1) = (*(npy_timedelta *)ip1 != NPY_DATETIME_NAT);
+    }
+}
+
+NPY_NO_EXPORT void
+TIMEDELTA__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Stores the constant 1 into each output element visited by OUTPUT_LOOP. */
+    OUTPUT_LOOP {
+        *((npy_timedelta *)op1) = (npy_timedelta)1;
+    }
+}
+
+#line 682
+NPY_NO_EXPORT void
+TIMEDELTA_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* lhs == rhs, but always false when either operand is NPY_DATETIME_NAT. */
+    BINARY_LOOP {
+        const npy_timedelta lhs = *(npy_timedelta *)ip1;
+        const npy_timedelta rhs = *(npy_timedelta *)ip2;
+        const int has_nat = (lhs == NPY_DATETIME_NAT || rhs == NPY_DATETIME_NAT);
+        *((npy_bool *)op1) = (!has_nat && lhs == rhs);
+    }
+}
+
+#line 682
+NPY_NO_EXPORT void
+TIMEDELTA_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* lhs > rhs, but always false when either operand is NPY_DATETIME_NAT. */
+    BINARY_LOOP {
+        const npy_timedelta lhs = *(npy_timedelta *)ip1;
+        const npy_timedelta rhs = *(npy_timedelta *)ip2;
+        const int has_nat = (lhs == NPY_DATETIME_NAT || rhs == NPY_DATETIME_NAT);
+        *((npy_bool *)op1) = (!has_nat && lhs > rhs);
+    }
+}
+
+#line 682
+NPY_NO_EXPORT void
+TIMEDELTA_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* lhs >= rhs, but always false when either operand is NPY_DATETIME_NAT. */
+    BINARY_LOOP {
+        const npy_timedelta lhs = *(npy_timedelta *)ip1;
+        const npy_timedelta rhs = *(npy_timedelta *)ip2;
+        const int has_nat = (lhs == NPY_DATETIME_NAT || rhs == NPY_DATETIME_NAT);
+        *((npy_bool *)op1) = (!has_nat && lhs >= rhs);
+    }
+}
+
+#line 682
+NPY_NO_EXPORT void
+TIMEDELTA_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* lhs < rhs, but always false when either operand is NPY_DATETIME_NAT. */
+    BINARY_LOOP {
+        const npy_timedelta lhs = *(npy_timedelta *)ip1;
+        const npy_timedelta rhs = *(npy_timedelta *)ip2;
+        const int has_nat = (lhs == NPY_DATETIME_NAT || rhs == NPY_DATETIME_NAT);
+        *((npy_bool *)op1) = (!has_nat && lhs < rhs);
+    }
+}
+
+#line 682
+NPY_NO_EXPORT void
+TIMEDELTA_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* lhs <= rhs, but always false when either operand is NPY_DATETIME_NAT. */
+    BINARY_LOOP {
+        const npy_timedelta lhs = *(npy_timedelta *)ip1;
+        const npy_timedelta rhs = *(npy_timedelta *)ip2;
+        const int has_nat = (lhs == NPY_DATETIME_NAT || rhs == NPY_DATETIME_NAT);
+        *((npy_bool *)op1) = (!has_nat && lhs <= rhs);
+    }
+}
+
+
+NPY_NO_EXPORT void
+TIMEDELTA_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* lhs != rhs, and always true when either operand is NPY_DATETIME_NAT. */
+    BINARY_LOOP {
+        const npy_timedelta lhs = *(npy_timedelta *)ip1;
+        const npy_timedelta rhs = *(npy_timedelta *)ip2;
+        const int has_nat = (lhs == NPY_DATETIME_NAT || rhs == NPY_DATETIME_NAT);
+        *((npy_bool *)op1) = (has_nat || lhs != rhs);
+    }
+}
+
+
+#line 712
+NPY_NO_EXPORT void
+TIMEDELTA_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Element-wise maximum that propagates NaT: if either operand equals
+     * NPY_DATETIME_NAT the result is NPY_DATETIME_NAT.
+     */
+    BINARY_LOOP {
+        const npy_timedelta lhs = *(npy_timedelta *)ip1;
+        const npy_timedelta rhs = *(npy_timedelta *)ip2;
+        npy_timedelta picked = (lhs > rhs) ? lhs : rhs;
+        if (lhs == NPY_DATETIME_NAT || rhs == NPY_DATETIME_NAT) {
+            picked = NPY_DATETIME_NAT;
+        }
+        *((npy_timedelta *)op1) = picked;
+    }
+}
+
+
+#line 712
+NPY_NO_EXPORT void
+TIMEDELTA_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Element-wise minimum that propagates NaT: if either operand equals
+     * NPY_DATETIME_NAT the result is NPY_DATETIME_NAT.
+     */
+    BINARY_LOOP {
+        const npy_timedelta lhs = *(npy_timedelta *)ip1;
+        const npy_timedelta rhs = *(npy_timedelta *)ip2;
+        npy_timedelta picked = (lhs < rhs) ? lhs : rhs;
+        if (lhs == NPY_DATETIME_NAT || rhs == NPY_DATETIME_NAT) {
+            picked = NPY_DATETIME_NAT;
+        }
+        *((npy_timedelta *)op1) = picked;
+    }
+}
+
+
+
+#line 736
+/*
+ * TIMEDELTA_fmax: element-wise maximum that skips NaT.
+ * When one operand is NPY_DATETIME_NAT the other operand is stored; the
+ * result is NaT only if both operands are NaT.
+ */
+NPY_NO_EXPORT void
+TIMEDELTA_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_timedelta lhs = *(npy_timedelta *)ip1;
+        const npy_timedelta rhs = *(npy_timedelta *)ip2;
+        npy_timedelta *out = (npy_timedelta *)op1;
+        *out = (lhs == NPY_DATETIME_NAT) ? rhs
+             : (rhs == NPY_DATETIME_NAT) ? lhs
+             : (lhs >= rhs) ? lhs : rhs;
+    }
+}
+
+#line 736
+/*
+ * TIMEDELTA_fmin: element-wise minimum that skips NaT.
+ * When one operand is NPY_DATETIME_NAT the other operand is stored; the
+ * result is NaT only if both operands are NaT.
+ */
+NPY_NO_EXPORT void
+TIMEDELTA_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_timedelta lhs = *(npy_timedelta *)ip1;
+        const npy_timedelta rhs = *(npy_timedelta *)ip2;
+        npy_timedelta *out = (npy_timedelta *)op1;
+        *out = (lhs == NPY_DATETIME_NAT) ? rhs
+             : (rhs == NPY_DATETIME_NAT) ? lhs
+             : (lhs <= rhs) ? lhs : rhs;
+    }
+}
+
+
+
+
+/*
+ * DATETIME_Mm_M_add: datetime + timedelta -> datetime.
+ * A NaT in either operand makes the result NPY_DATETIME_NAT.
+ */
+NPY_NO_EXPORT void
+DATETIME_Mm_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    BINARY_LOOP {
+        const npy_datetime when = *(npy_datetime *)ip1;
+        const npy_timedelta delta = *(npy_timedelta *)ip2;
+        npy_datetime *out = (npy_datetime *)op1;
+        if (when != NPY_DATETIME_NAT && delta != NPY_DATETIME_NAT) {
+            *out = when + delta;
+        }
+        else {
+            *out = NPY_DATETIME_NAT;
+        }
+    }
+}
+
+/*
+ * DATETIME_mM_M_add: timedelta + datetime -> datetime.
+ * A NaT in either operand makes the result NPY_DATETIME_NAT.
+ */
+NPY_NO_EXPORT void
+DATETIME_mM_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_timedelta delta = *(npy_timedelta *)ip1;
+        const npy_datetime when = *(npy_datetime *)ip2;
+        npy_datetime *out = (npy_datetime *)op1;
+        if (delta != NPY_DATETIME_NAT && when != NPY_DATETIME_NAT) {
+            *out = delta + when;
+        }
+        else {
+            *out = NPY_DATETIME_NAT;
+        }
+    }
+}
+
+/*
+ * TIMEDELTA_mm_m_add: timedelta + timedelta -> timedelta.
+ * A NaT in either operand makes the result NPY_DATETIME_NAT.
+ */
+NPY_NO_EXPORT void
+TIMEDELTA_mm_m_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_timedelta lhs = *(npy_timedelta *)ip1;
+        const npy_timedelta rhs = *(npy_timedelta *)ip2;
+        npy_timedelta *out = (npy_timedelta *)op1;
+        if (lhs != NPY_DATETIME_NAT && rhs != NPY_DATETIME_NAT) {
+            *out = lhs + rhs;
+        }
+        else {
+            *out = NPY_DATETIME_NAT;
+        }
+    }
+}
+
+/*
+ * DATETIME_Mm_M_subtract: datetime - timedelta -> datetime.
+ * A NaT in either operand makes the result NPY_DATETIME_NAT.
+ */
+NPY_NO_EXPORT void
+DATETIME_Mm_M_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_datetime when = *(npy_datetime *)ip1;
+        const npy_timedelta delta = *(npy_timedelta *)ip2;
+        npy_datetime *out = (npy_datetime *)op1;
+        if (when != NPY_DATETIME_NAT && delta != NPY_DATETIME_NAT) {
+            *out = when - delta;
+        }
+        else {
+            *out = NPY_DATETIME_NAT;
+        }
+    }
+}
+
+/*
+ * DATETIME_MM_m_subtract: datetime - datetime -> timedelta.
+ * A NaT in either operand makes the result NPY_DATETIME_NAT.
+ */
+NPY_NO_EXPORT void
+DATETIME_MM_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_datetime later = *(npy_datetime *)ip1;
+        const npy_datetime earlier = *(npy_datetime *)ip2;
+        npy_timedelta *out = (npy_timedelta *)op1;
+        if (later != NPY_DATETIME_NAT && earlier != NPY_DATETIME_NAT) {
+            *out = later - earlier;
+        }
+        else {
+            *out = NPY_DATETIME_NAT;
+        }
+    }
+}
+
+/*
+ * TIMEDELTA_mm_m_subtract: timedelta - timedelta -> timedelta.
+ * A NaT in either operand makes the result NPY_DATETIME_NAT.
+ */
+NPY_NO_EXPORT void
+TIMEDELTA_mm_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_timedelta lhs = *(npy_timedelta *)ip1;
+        const npy_timedelta rhs = *(npy_timedelta *)ip2;
+        npy_timedelta *out = (npy_timedelta *)op1;
+        if (lhs != NPY_DATETIME_NAT && rhs != NPY_DATETIME_NAT) {
+            *out = lhs - rhs;
+        }
+        else {
+            *out = NPY_DATETIME_NAT;
+        }
+    }
+}
+
+/*
+ * TIMEDELTA_mq_m_multiply: timedelta * 64-bit integer -> timedelta.
+ * A NaT timedelta stays NPY_DATETIME_NAT; the integer factor is not checked.
+ *
+ * Note: Assuming 'q' == NPY_LONGLONG
+ */
+NPY_NO_EXPORT void
+TIMEDELTA_mq_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_timedelta delta = *(npy_timedelta *)ip1;
+        const npy_int64 factor = *(npy_int64 *)ip2;
+        npy_timedelta *out = (npy_timedelta *)op1;
+        if (delta != NPY_DATETIME_NAT) {
+            *out = delta * factor;
+        }
+        else {
+            *out = NPY_DATETIME_NAT;
+        }
+    }
+}
+
+/*
+ * TIMEDELTA_qm_m_multiply: 64-bit integer * timedelta -> timedelta.
+ * A NaT timedelta stays NPY_DATETIME_NAT; the integer factor is not checked.
+ *
+ * Note: Assuming 'q' == NPY_LONGLONG
+ */
+NPY_NO_EXPORT void
+TIMEDELTA_qm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_int64 factor = *(npy_int64 *)ip1;
+        const npy_timedelta delta = *(npy_timedelta *)ip2;
+        npy_timedelta *out = (npy_timedelta *)op1;
+        if (delta != NPY_DATETIME_NAT) {
+            *out = factor * delta;
+        }
+        else {
+            *out = NPY_DATETIME_NAT;
+        }
+    }
+}
+
+/*
+ * TIMEDELTA_md_m_multiply: timedelta (in1) * double (in2) -> timedelta.
+ *
+ * The stored value is NPY_DATETIME_NAT when in1 is NaT, when the double
+ * product is not finite, or when the product lies outside the range of a
+ * signed 64-bit integer.  Converting an out-of-range double to an integer is
+ * undefined behaviour in C, so that case is detected explicitly: it stores
+ * NaT and raises the invalid floating-point status.
+ *
+ * NOTE(review): the range check assumes npy_timedelta is a 64-bit signed
+ * integer, as its interchangeable use with npy_int64 in this file suggests.
+ */
+NPY_NO_EXPORT void
+TIMEDELTA_md_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_timedelta in1 = *(npy_timedelta *)ip1;
+        const double in2 = *(double *)ip2;
+        if (in1 == NPY_DATETIME_NAT) {
+            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+        }
+        else {
+            const double result = in1 * in2;
+            if (!npy_isfinite(result)) {
+                *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+            }
+            /* [-2^63, 2^63) is the set of doubles a 64-bit integer can hold */
+            else if (result >= -9223372036854775808.0 &&
+                     result < 9223372036854775808.0) {
+                *((npy_timedelta *)op1) = (npy_timedelta)result;
+            }
+            else {
+                npy_set_floatstatus_invalid();
+                *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+            }
+        }
+    }
+}
+
+/*
+ * TIMEDELTA_dm_m_multiply: double (in1) * timedelta (in2) -> timedelta.
+ *
+ * The stored value is NPY_DATETIME_NAT when in2 is NaT, when the double
+ * product is not finite, or when the product lies outside the range of a
+ * signed 64-bit integer.  Converting an out-of-range double to an integer is
+ * undefined behaviour in C, so that case is detected explicitly: it stores
+ * NaT and raises the invalid floating-point status.
+ *
+ * NOTE(review): the range check assumes npy_timedelta is a 64-bit signed
+ * integer, as its interchangeable use with npy_int64 in this file suggests.
+ */
+NPY_NO_EXPORT void
+TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const double in1 = *(double *)ip1;
+        const npy_timedelta in2 = *(npy_timedelta *)ip2;
+        if (in2 == NPY_DATETIME_NAT) {
+            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+        }
+        else {
+            const double result = in1 * in2;
+            if (!npy_isfinite(result)) {
+                *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+            }
+            /* [-2^63, 2^63) is the set of doubles a 64-bit integer can hold */
+            else if (result >= -9223372036854775808.0 &&
+                     result < 9223372036854775808.0) {
+                *((npy_timedelta *)op1) = (npy_timedelta)result;
+            }
+            else {
+                npy_set_floatstatus_invalid();
+                *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+            }
+        }
+    }
+}
+
+/*
+ * TIMEDELTA_mq_m_divide: timedelta (in1) / 64-bit integer (in2) -> timedelta.
+ *
+ * A NaT dividend or a zero divisor stores NPY_DATETIME_NAT.  A zero divisor
+ * also raises the divide-by-zero floating-point status on BOTH code paths;
+ * the per-element path used to store NaT silently, unlike the
+ * constant-divisor path.
+ *
+ * Note: Assuming 'q' == NPY_LONGLONG
+ */
+NPY_NO_EXPORT void
+TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* NOTE: This code is similar to array floor divide */
+    BINARY_DEFS
+
+    /* When the divisor is a constant, use libdivide for faster division */
+    if (steps[1] == 0) {
+        /* In case of empty array, just return */
+        if (n == 0) {
+            return;
+        }
+
+        const npy_int64 in2 = *(npy_int64 *)ip2;
+
+        /* If divisor is 0, we need not compute anything */
+        if (in2 == 0) {
+            npy_set_floatstatus_divbyzero();
+            BINARY_LOOP_SLIDING {
+                *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+            }
+        }
+        else {
+            struct libdivide_s64_t fast_d = libdivide_s64_gen(in2);
+            BINARY_LOOP_SLIDING {
+                const npy_timedelta in1 = *(npy_timedelta *)ip1;
+                if (in1 == NPY_DATETIME_NAT) {
+                    *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+                }
+                else {
+                    *((npy_timedelta *)op1) = libdivide_s64_do(in1, &fast_d);
+                }
+            }
+        }
+    }
+    else {
+        BINARY_LOOP_SLIDING {
+            const npy_timedelta in1 = *(npy_timedelta *)ip1;
+            const npy_int64 in2 = *(npy_int64 *)ip2;
+            if (in2 == 0) {
+                /* Same signalling as the constant-divisor branch above */
+                npy_set_floatstatus_divbyzero();
+                *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+            }
+            else if (in1 == NPY_DATETIME_NAT) {
+                *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+            }
+            else {
+                *((npy_timedelta *)op1) = in1 / in2;
+            }
+        }
+    }
+}
+
+/*
+ * TIMEDELTA_md_m_divide: timedelta (in1) / double (in2) -> timedelta.
+ *
+ * The stored value is NPY_DATETIME_NAT when in1 is NaT, when the double
+ * quotient is not finite, or when the quotient lies outside the range of a
+ * signed 64-bit integer.  Converting an out-of-range double to an integer is
+ * undefined behaviour in C, so that case is detected explicitly: it stores
+ * NaT and raises the invalid floating-point status.
+ *
+ * NOTE(review): the range check assumes npy_timedelta is a 64-bit signed
+ * integer, as its interchangeable use with npy_int64 in this file suggests.
+ */
+NPY_NO_EXPORT void
+TIMEDELTA_md_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_timedelta in1 = *(npy_timedelta *)ip1;
+        const double in2 = *(double *)ip2;
+        if (in1 == NPY_DATETIME_NAT) {
+            *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+        }
+        else {
+            const double result = in1 / in2;
+            if (!npy_isfinite(result)) {
+                *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+            }
+            /* [-2^63, 2^63) is the set of doubles a 64-bit integer can hold */
+            else if (result >= -9223372036854775808.0 &&
+                     result < 9223372036854775808.0) {
+                *((npy_timedelta *)op1) = (npy_timedelta)result;
+            }
+            else {
+                npy_set_floatstatus_invalid();
+                *((npy_timedelta *)op1) = NPY_DATETIME_NAT;
+            }
+        }
+    }
+}
+
+/*
+ * TIMEDELTA_mm_d_divide: timedelta / timedelta -> double.
+ * Stores NPY_NAN when either operand is NPY_DATETIME_NAT; otherwise both
+ * operands are converted to double before dividing.
+ */
+NPY_NO_EXPORT void
+TIMEDELTA_mm_d_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_timedelta num = *(npy_timedelta *)ip1;
+        const npy_timedelta den = *(npy_timedelta *)ip2;
+        double *out = (double *)op1;
+        if (num != NPY_DATETIME_NAT && den != NPY_DATETIME_NAT) {
+            *out = (double)num / (double)den;
+        }
+        else {
+            *out = NPY_NAN;
+        }
+    }
+}
+
+/*
+ * TIMEDELTA_mm_m_remainder: timedelta % timedelta -> timedelta.
+ * NaT operands give NPY_DATETIME_NAT.  A zero divisor gives NaT and raises
+ * the divide-by-zero status.  Otherwise the C remainder is shifted by the
+ * divisor when the operand signs differ, so the result follows Python's
+ * convention.
+ */
+NPY_NO_EXPORT void
+TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_timedelta num = *(npy_timedelta *)ip1;
+        const npy_timedelta den = *(npy_timedelta *)ip2;
+        npy_timedelta *out = (npy_timedelta *)op1;
+        if (num == NPY_DATETIME_NAT || den == NPY_DATETIME_NAT) {
+            *out = NPY_DATETIME_NAT;
+        }
+        else if (den == 0) {
+            npy_set_floatstatus_divbyzero();
+            *out = NPY_DATETIME_NAT;
+        }
+        else {
+            npy_timedelta rem = num % den;
+            /* Mixed signs with a non-zero remainder: adjust like Python */
+            if (rem != 0 && (num > 0) != (den > 0)) {
+                rem += den;
+            }
+            *out = rem;
+        }
+    }
+}
+
+/*
+ * TIMEDELTA_mm_q_floor_divide: timedelta // timedelta -> 64-bit integer.
+ *
+ * A NaT operand stores 0 and raises the invalid status; a zero divisor
+ * stores 0 and raises the divide-by-zero status.  Otherwise the truncated
+ * quotient is decremented when the operand signs differ and the division
+ * is inexact, which rounds the result down.  A divisor with stride 0 is
+ * handled once up front and divided through libdivide.
+ */
+NPY_NO_EXPORT void
+TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* NOTE: This code is similar to array floor divide */
+    BINARY_DEFS
+
+    if (steps[1] != 0) {
+        /* The divisor varies per element: plain division */
+        BINARY_LOOP_SLIDING {
+            const npy_timedelta num = *(npy_timedelta *)ip1;
+            const npy_timedelta den = *(npy_timedelta *)ip2;
+            npy_int64 quo = 0;
+            if (num == NPY_DATETIME_NAT || den == NPY_DATETIME_NAT) {
+                npy_set_floatstatus_invalid();
+            }
+            else if (den == 0) {
+                npy_set_floatstatus_divbyzero();
+            }
+            else {
+                quo = num / den;
+                /* Negative quotients need to be rounded down */
+                if (((num > 0) != (den > 0)) && (quo * den != num)) {
+                    quo -= 1;
+                }
+            }
+            *((npy_int64 *)op1) = quo;
+        }
+        return;
+    }
+
+    /* Constant divisor from here on; nothing to do for an empty array */
+    if (n == 0) {
+        return;
+    }
+
+    const npy_timedelta den = *(npy_timedelta *)ip2;
+
+    /* A zero or NaT divisor zero-fills the output without dividing */
+    if (den == 0 || den == NPY_DATETIME_NAT) {
+        if (den == 0) {
+            npy_set_floatstatus_divbyzero();
+        }
+        else {
+            npy_set_floatstatus_invalid();
+        }
+        BINARY_LOOP_SLIDING {
+            *((npy_int64 *)op1) = 0;
+        }
+        return;
+    }
+
+    struct libdivide_s64_t fast_d = libdivide_s64_gen(den);
+    BINARY_LOOP_SLIDING {
+        const npy_timedelta num = *(npy_timedelta *)ip1;
+        npy_int64 quo = 0;
+        if (num == NPY_DATETIME_NAT) {
+            npy_set_floatstatus_invalid();
+        }
+        else {
+            quo = libdivide_s64_do(num, &fast_d);
+            /* Negative quotients need to be rounded down */
+            if (((num > 0) != (den > 0)) && (quo * den != num)) {
+                quo -= 1;
+            }
+        }
+        *((npy_int64 *)op1) = quo;
+    }
+}
+
+/*
+ * TIMEDELTA_mm_qm_divmod: timedelta divmod timedelta -> (int64, timedelta).
+ *
+ * A NaT operand yields (0, NaT) and raises the invalid status; a zero
+ * divisor yields (0, NaT) and raises the divide-by-zero status.  Otherwise
+ * the C quotient and remainder are adjusted together (quotient - 1,
+ * remainder + divisor) when the signs differ and the remainder is non-zero.
+ */
+NPY_NO_EXPORT void
+TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_TWO_OUT {
+        const npy_timedelta num = *(npy_timedelta *)ip1;
+        const npy_timedelta den = *(npy_timedelta *)ip2;
+        /* Defaults cover both error cases below */
+        npy_int64 quo = 0;
+        npy_timedelta rem = NPY_DATETIME_NAT;
+        if (num == NPY_DATETIME_NAT || den == NPY_DATETIME_NAT) {
+            npy_set_floatstatus_invalid();
+        }
+        else if (den == 0) {
+            npy_set_floatstatus_divbyzero();
+        }
+        else {
+            quo = num / den;
+            rem = num % den;
+            if (rem != 0 && (num > 0) != (den > 0)) {
+                quo -= 1;
+                rem += den;
+            }
+        }
+        *((npy_int64 *)op1) = quo;
+        *((npy_timedelta *)op2) = rem;
+    }
+}
+
+/*
+ *****************************************************************************
+ **                             FLOAT LOOPS                                 **
+ *****************************************************************************
+ */
+
+#line 1152
+#line 1156
+/*
+ * FLOAT_logical_and: stores the truth value of `in1 && in2` for each pair
+ * of npy_float inputs, then calls npy_clear_floatstatus_barrier once the
+ * loop is done.
+ */
+NPY_NO_EXPORT void
+FLOAT_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_float lhs = *(npy_float *)ip1;
+        const npy_float rhs = *(npy_float *)ip2;
+        npy_bool *out = (npy_bool *)op1;
+        *out = (lhs && rhs);
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 1156
+/*
+ * FLOAT_logical_or: stores the truth value of `in1 || in2` for each pair
+ * of npy_float inputs, then calls npy_clear_floatstatus_barrier once the
+ * loop is done.
+ */
+NPY_NO_EXPORT void
+FLOAT_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_float lhs = *(npy_float *)ip1;
+        const npy_float rhs = *(npy_float *)ip2;
+        npy_bool *out = (npy_bool *)op1;
+        *out = (lhs || rhs);
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+
+/*
+ * FLOAT_logical_xor: true when exactly one of the two npy_float inputs is
+ * non-zero.
+ */
+NPY_NO_EXPORT void
+FLOAT_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        /* Reduce each operand to 0/1 before comparing */
+        const int lhs_set = (*(npy_float *)ip1 != 0);
+        const int rhs_set = (*(npy_float *)ip2 != 0);
+        *((npy_bool *)op1) = (lhs_set ^ rhs_set);
+    }
+}
+
+/*
+ * FLOAT_logical_not: true when the npy_float input compares equal to zero.
+ */
+NPY_NO_EXPORT void
+FLOAT_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_float value = *(npy_float *)ip1;
+        npy_bool *out = (npy_bool *)op1;
+        *out = (value == 0);
+    }
+}
+
+#if !1
+#line 1192
+/*
+ * FLOAT_isnan: stores whether npy_isnan reports non-zero for each
+ * npy_float input, then calls npy_clear_floatstatus_barrier.
+ * (This definition sits inside an `#if !1` region.)
+ */
+NPY_NO_EXPORT void
+FLOAT_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_float value = *(npy_float *)ip1;
+        npy_bool *out = (npy_bool *)op1;
+        *out = npy_isnan(value) ? 1 : 0;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 1192
+/*
+ * FLOAT_isinf: stores whether npy_isinf reports non-zero for each
+ * npy_float input, then calls npy_clear_floatstatus_barrier.
+ * (This definition sits inside an `#if !1` region.)
+ */
+NPY_NO_EXPORT void
+FLOAT_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_float value = *(npy_float *)ip1;
+        npy_bool *out = (npy_bool *)op1;
+        *out = npy_isinf(value) ? 1 : 0;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 1192
+/*
+ * FLOAT_isfinite: stores whether npy_isfinite reports non-zero for each
+ * npy_float input, then calls npy_clear_floatstatus_barrier.
+ * (This definition sits inside an `#if !1` region.)
+ */
+NPY_NO_EXPORT void
+FLOAT_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_float value = *(npy_float *)ip1;
+        npy_bool *out = (npy_bool *)op1;
+        *out = npy_isfinite(value) ? 1 : 0;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 1192
+/*
+ * FLOAT_signbit: stores whether npy_signbit reports non-zero for each
+ * npy_float input, then calls npy_clear_floatstatus_barrier.
+ * (This definition sits inside an `#if !1` region.)
+ */
+NPY_NO_EXPORT void
+FLOAT_signbit(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_float value = *(npy_float *)ip1;
+        npy_bool *out = (npy_bool *)op1;
+        *out = npy_signbit(value) ? 1 : 0;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#endif
+
+/*
+ * FLOAT_spacing: applies npy_spacingf to each npy_float input.
+ */
+NPY_NO_EXPORT void
+FLOAT_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_float value = *(npy_float *)ip1;
+        npy_float *out = (npy_float *)op1;
+        *out = npy_spacingf(value);
+    }
+}
+
+/*
+ * FLOAT_copysign: stores npy_copysignf(in1, in2) for each pair of
+ * npy_float inputs.
+ */
+NPY_NO_EXPORT void
+FLOAT_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_float magnitude = *(npy_float *)ip1;
+        const npy_float sign_src = *(npy_float *)ip2;
+        npy_float *out = (npy_float *)op1;
+        *out = npy_copysignf(magnitude, sign_src);
+    }
+}
+
+/*
+ * FLOAT_nextafter: stores npy_nextafterf(in1, in2) for each pair of
+ * npy_float inputs.
+ */
+NPY_NO_EXPORT void
+FLOAT_nextafter(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_float start = *(npy_float *)ip1;
+        const npy_float toward = *(npy_float *)ip2;
+        npy_float *out = (npy_float *)op1;
+        *out = npy_nextafterf(start, toward);
+    }
+}
+
+/*
+ * FLOAT_floor_divide: stores npy_floor_dividef(in1, in2) for each pair of
+ * npy_float inputs.
+ */
+NPY_NO_EXPORT void
+FLOAT_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_float num = *(npy_float *)ip1;
+        const npy_float den = *(npy_float *)ip2;
+        npy_float *out = (npy_float *)op1;
+        *out = npy_floor_dividef(num, den);
+    }
+}
+
+/*
+ * FLOAT_floor_divide_indexed: in-place indexed floor division.
+ *
+ * For each of dimensions[0] entries, reads an npy_intp index from args[1]
+ * and an npy_float divisor from args[2], then replaces the npy_float at
+ * args[0] + steps[0] * index with npy_floor_dividef(element, divisor).
+ * Negative indices are offset by steps[3] first.  Always returns 0.
+ */
+NPY_NO_EXPORT int
+FLOAT_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    npy_intp remaining = dimensions[0];
+
+    while (remaining-- > 0) {
+        npy_intp idx = *(const npy_intp *)idx_ptr;
+        if (idx < 0) {
+            idx += wrap;
+        }
+        npy_float *slot = (npy_float *)(base + base_stride * idx);
+        *slot = npy_floor_dividef(*slot, *(const npy_float *)val_ptr);
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+/*
+ * FLOAT_remainder: stores npy_remainderf(in1, in2) for each pair of
+ * npy_float inputs.
+ */
+NPY_NO_EXPORT void
+FLOAT_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_float num = *(npy_float *)ip1;
+        const npy_float den = *(npy_float *)ip2;
+        npy_float *out = (npy_float *)op1;
+        *out = npy_remainderf(num, den);
+    }
+}
+
+/*
+ * FLOAT_divmod: calls npy_divmodf(in1, in2, &second_output) for each pair
+ * of npy_float inputs; the return value goes to the first output and the
+ * value written through the pointer goes to the second.
+ */
+NPY_NO_EXPORT void
+FLOAT_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_TWO_OUT {
+        const npy_float num = *(npy_float *)ip1;
+        const npy_float den = *(npy_float *)ip2;
+        npy_float *first_out = (npy_float *)op1;
+        npy_float *second_out = (npy_float *)op2;
+        *first_out = npy_divmodf(num, den, second_out);
+    }
+}
+
+/*
+ * FLOAT__ones_like: fills every npy_float output element with 1.
+ */
+NPY_NO_EXPORT void
+FLOAT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    OUTPUT_LOOP {
+        npy_float *out = (npy_float *)op1;
+        *out = 1;
+    }
+}
+
+/*
+ * FLOAT_conjugate: copies each npy_float input to the output unchanged.
+ */
+NPY_NO_EXPORT void
+FLOAT_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_float value = *(npy_float *)ip1;
+        npy_float *out = (npy_float *)op1;
+        *out = value;
+    }
+}
+
+/*
+ * FLOAT_positive: stores unary plus of each npy_float input.
+ */
+NPY_NO_EXPORT void
+FLOAT_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_float value = *(npy_float *)ip1;
+        npy_float *out = (npy_float *)op1;
+        *out = +value;
+    }
+}
+
+/*
+ * FLOAT_sign: stores 1 for positive inputs, -1 for negative inputs and 0
+ * for inputs equal to zero.  An input that satisfies none of these
+ * comparisons (NaN) is stored unchanged, so the sign of nan is nan.
+ * npy_clear_floatstatus_barrier is called after the loop.
+ */
+NPY_NO_EXPORT void
+FLOAT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_float value = *(npy_float *)ip1;
+        npy_float sign;
+        if (value > 0) {
+            sign = 1;
+        }
+        else if (value < 0) {
+            sign = -1;
+        }
+        else if (value == 0) {
+            sign = 0;
+        }
+        else {
+            sign = value;
+        }
+        *((npy_float *)op1) = sign;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+/*
+ * FLOAT_modf: calls npy_modff(in1, &second_output) for each npy_float
+ * input; the return value goes to the first output and the value written
+ * through the pointer goes to the second.
+ */
+NPY_NO_EXPORT void
+FLOAT_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_TWO_OUT {
+        const npy_float value = *(npy_float *)ip1;
+        npy_float *first_out = (npy_float *)op1;
+        npy_float *second_out = (npy_float *)op2;
+        *first_out = npy_modff(value, second_out);
+    }
+}
+
+/*
+ * FLOAT_ldexp_long: npy_ldexpf with a `long` exponent operand.
+ *
+ * Additional loop to handle npy_long integer inputs (cf. #866, #1633).
+ * npy_long != npy_int on many 64-bit platforms, so this second loop handles
+ * the default integer type.  Exponents that do not survive a round trip
+ * through `int` are replaced by NPY_MAX_INT or NPY_MIN_INT according to
+ * their sign.
+ */
+NPY_NO_EXPORT void
+FLOAT_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_float value = *(npy_float *)ip1;
+        const long wide_exp = *(long *)ip2;
+        int exp;
+        if (((int)wide_exp) == wide_exp) {
+            /* Range OK */
+            exp = (int)wide_exp;
+        }
+        else if (wide_exp > 0) {
+            /*
+             * Outside npy_int range -- ldexp will overflow anyway, given
+             * that the exponent has fewer bits than npy_int.
+             */
+            exp = NPY_MAX_INT;
+        }
+        else {
+            exp = NPY_MIN_INT;
+        }
+        *((npy_float *)op1) = npy_ldexpf(value, exp);
+    }
+}
+
+
+#line 1152
+#line 1156
+/*
+ * DOUBLE_logical_and: stores the truth value of `in1 && in2` for each pair
+ * of npy_double inputs, then calls npy_clear_floatstatus_barrier once the
+ * loop is done.
+ */
+NPY_NO_EXPORT void
+DOUBLE_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double lhs = *(npy_double *)ip1;
+        const npy_double rhs = *(npy_double *)ip2;
+        npy_bool *out = (npy_bool *)op1;
+        *out = (lhs && rhs);
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 1156
+/*
+ * DOUBLE_logical_or: stores the truth value of `in1 || in2` for each pair
+ * of npy_double inputs, then calls npy_clear_floatstatus_barrier once the
+ * loop is done.
+ */
+NPY_NO_EXPORT void
+DOUBLE_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double lhs = *(npy_double *)ip1;
+        const npy_double rhs = *(npy_double *)ip2;
+        npy_bool *out = (npy_bool *)op1;
+        *out = (lhs || rhs);
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+
+/*
+ * DOUBLE_logical_xor: true when exactly one of the two npy_double inputs
+ * is non-zero.
+ */
+NPY_NO_EXPORT void
+DOUBLE_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        /* Reduce each operand to 0/1 before comparing */
+        const int lhs_set = (*(npy_double *)ip1 != 0);
+        const int rhs_set = (*(npy_double *)ip2 != 0);
+        *((npy_bool *)op1) = (lhs_set ^ rhs_set);
+    }
+}
+
+/*
+ * DOUBLE_logical_not: true when the npy_double input compares equal to
+ * zero.
+ */
+NPY_NO_EXPORT void
+DOUBLE_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_double value = *(npy_double *)ip1;
+        npy_bool *out = (npy_bool *)op1;
+        *out = (value == 0);
+    }
+}
+
+#if !1
+#line 1192
+/*
+ * DOUBLE_isnan: stores whether npy_isnan reports non-zero for each
+ * npy_double input, then calls npy_clear_floatstatus_barrier.
+ * (This definition sits inside an `#if !1` region.)
+ */
+NPY_NO_EXPORT void
+DOUBLE_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_double value = *(npy_double *)ip1;
+        npy_bool *out = (npy_bool *)op1;
+        *out = npy_isnan(value) ? 1 : 0;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 1192
+/*
+ * DOUBLE_isinf: stores whether npy_isinf reports non-zero for each
+ * npy_double input, then calls npy_clear_floatstatus_barrier.
+ * (This definition sits inside an `#if !1` region.)
+ */
+NPY_NO_EXPORT void
+DOUBLE_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_double value = *(npy_double *)ip1;
+        npy_bool *out = (npy_bool *)op1;
+        *out = npy_isinf(value) ? 1 : 0;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 1192
+/*
+ * DOUBLE_isfinite: stores whether npy_isfinite reports non-zero for each
+ * npy_double input, then calls npy_clear_floatstatus_barrier.
+ * (This definition sits inside an `#if !1` region.)
+ */
+NPY_NO_EXPORT void
+DOUBLE_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_double value = *(npy_double *)ip1;
+        npy_bool *out = (npy_bool *)op1;
+        *out = npy_isfinite(value) ? 1 : 0;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 1192
+/*
+ * DOUBLE_signbit: stores whether npy_signbit reports non-zero for each
+ * npy_double input, then calls npy_clear_floatstatus_barrier.
+ * (This definition sits inside an `#if !1` region.)
+ */
+NPY_NO_EXPORT void
+DOUBLE_signbit(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_double value = *(npy_double *)ip1;
+        npy_bool *out = (npy_bool *)op1;
+        *out = npy_signbit(value) ? 1 : 0;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#endif
+
+/*
+ * DOUBLE_spacing: applies npy_spacing to each npy_double input.
+ */
+NPY_NO_EXPORT void
+DOUBLE_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_double value = *(npy_double *)ip1;
+        npy_double *out = (npy_double *)op1;
+        *out = npy_spacing(value);
+    }
+}
+
+/*
+ * DOUBLE_copysign: stores npy_copysign(in1, in2) for each pair of
+ * npy_double inputs.
+ */
+NPY_NO_EXPORT void
+DOUBLE_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double magnitude = *(npy_double *)ip1;
+        const npy_double sign_src = *(npy_double *)ip2;
+        npy_double *out = (npy_double *)op1;
+        *out = npy_copysign(magnitude, sign_src);
+    }
+}
+
+/*
+ * DOUBLE_nextafter: stores npy_nextafter(in1, in2) for each pair of
+ * npy_double inputs.
+ */
+NPY_NO_EXPORT void
+DOUBLE_nextafter(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double start = *(npy_double *)ip1;
+        const npy_double toward = *(npy_double *)ip2;
+        npy_double *out = (npy_double *)op1;
+        *out = npy_nextafter(start, toward);
+    }
+}
+
+/*
+ * DOUBLE_floor_divide: stores npy_floor_divide(in1, in2) for each pair of
+ * npy_double inputs.
+ */
+NPY_NO_EXPORT void
+DOUBLE_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double num = *(npy_double *)ip1;
+        const npy_double den = *(npy_double *)ip2;
+        npy_double *out = (npy_double *)op1;
+        *out = npy_floor_divide(num, den);
+    }
+}
+
+/*
+ * DOUBLE_floor_divide_indexed: in-place indexed floor division.
+ *
+ * For each of dimensions[0] entries, reads an npy_intp index from args[1]
+ * and an npy_double divisor from args[2], then replaces the npy_double at
+ * args[0] + steps[0] * index with npy_floor_divide(element, divisor).
+ * Negative indices are offset by steps[3] first.  Always returns 0.
+ */
+NPY_NO_EXPORT int
+DOUBLE_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    npy_intp remaining = dimensions[0];
+
+    while (remaining-- > 0) {
+        npy_intp idx = *(const npy_intp *)idx_ptr;
+        if (idx < 0) {
+            idx += wrap;
+        }
+        npy_double *slot = (npy_double *)(base + base_stride * idx);
+        *slot = npy_floor_divide(*slot, *(const npy_double *)val_ptr);
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+/*
+ * DOUBLE_remainder: stores npy_remainder(in1, in2) for each pair of
+ * npy_double inputs.
+ */
+NPY_NO_EXPORT void
+DOUBLE_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double num = *(npy_double *)ip1;
+        const npy_double den = *(npy_double *)ip2;
+        npy_double *out = (npy_double *)op1;
+        *out = npy_remainder(num, den);
+    }
+}
+
+/*
+ * DOUBLE_divmod: calls npy_divmod(in1, in2, &second_output) for each pair
+ * of npy_double inputs; the return value goes to the first output and the
+ * value written through the pointer goes to the second.
+ */
+NPY_NO_EXPORT void
+DOUBLE_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_TWO_OUT {
+        const npy_double num = *(npy_double *)ip1;
+        const npy_double den = *(npy_double *)ip2;
+        npy_double *first_out = (npy_double *)op1;
+        npy_double *second_out = (npy_double *)op2;
+        *first_out = npy_divmod(num, den, second_out);
+    }
+}
+
+/*
+ * DOUBLE__ones_like: fills every npy_double output element with 1.
+ */
+NPY_NO_EXPORT void
+DOUBLE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    OUTPUT_LOOP {
+        npy_double *out = (npy_double *)op1;
+        *out = 1;
+    }
+}
+
+/*
+ * DOUBLE_conjugate: copies each npy_double input to the output unchanged.
+ */
+NPY_NO_EXPORT void
+DOUBLE_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_double value = *(npy_double *)ip1;
+        npy_double *out = (npy_double *)op1;
+        *out = value;
+    }
+}
+
+/*
+ * DOUBLE_positive: stores unary plus of each npy_double input.
+ */
+NPY_NO_EXPORT void
+DOUBLE_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_double value = *(npy_double *)ip1;
+        npy_double *out = (npy_double *)op1;
+        *out = +value;
+    }
+}
+
+/*
+ * DOUBLE_sign: stores 1 for positive inputs, -1 for negative inputs and 0
+ * for inputs equal to zero.  An input that satisfies none of these
+ * comparisons (NaN) is stored unchanged, so the sign of nan is nan.
+ * npy_clear_floatstatus_barrier is called after the loop.
+ */
+NPY_NO_EXPORT void
+DOUBLE_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_double value = *(npy_double *)ip1;
+        npy_double sign;
+        if (value > 0) {
+            sign = 1;
+        }
+        else if (value < 0) {
+            sign = -1;
+        }
+        else if (value == 0) {
+            sign = 0;
+        }
+        else {
+            sign = value;
+        }
+        *((npy_double *)op1) = sign;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+/*
+ * DOUBLE_modf: calls npy_modf(in1, &second_output) for each npy_double
+ * input; the return value goes to the first output and the value written
+ * through the pointer goes to the second.
+ */
+NPY_NO_EXPORT void
+DOUBLE_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_TWO_OUT {
+        const npy_double value = *(npy_double *)ip1;
+        npy_double *first_out = (npy_double *)op1;
+        npy_double *second_out = (npy_double *)op2;
+        *first_out = npy_modf(value, second_out);
+    }
+}
+
+/*
+ * DOUBLE_ldexp_long: npy_ldexp with a `long` exponent operand.
+ *
+ * Additional loop to handle npy_long integer inputs (cf. #866, #1633).
+ * npy_long != npy_int on many 64-bit platforms, so this second loop handles
+ * the default integer type.  Exponents that do not survive a round trip
+ * through `int` are replaced by NPY_MAX_INT or NPY_MIN_INT according to
+ * their sign.
+ */
+NPY_NO_EXPORT void
+DOUBLE_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double value = *(npy_double *)ip1;
+        const long wide_exp = *(long *)ip2;
+        int exp;
+        if (((int)wide_exp) == wide_exp) {
+            /* Range OK */
+            exp = (int)wide_exp;
+        }
+        else if (wide_exp > 0) {
+            /*
+             * Outside npy_int range -- ldexp will overflow anyway, given
+             * that the exponent has fewer bits than npy_int.
+             */
+            exp = NPY_MAX_INT;
+        }
+        else {
+            exp = NPY_MIN_INT;
+        }
+        *((npy_double *)op1) = npy_ldexp(value, exp);
+    }
+}
+
+
+#line 1152
+#line 1156
+/*
+ * LONGDOUBLE_logical_and: stores the truth value of `in1 && in2` for each
+ * pair of npy_longdouble inputs, then calls npy_clear_floatstatus_barrier
+ * once the loop is done.
+ */
+NPY_NO_EXPORT void
+LONGDOUBLE_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble lhs = *(npy_longdouble *)ip1;
+        const npy_longdouble rhs = *(npy_longdouble *)ip2;
+        npy_bool *out = (npy_bool *)op1;
+        *out = (lhs && rhs);
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 1156
+/*
+ * LONGDOUBLE_logical_or: stores the truth value of `in1 || in2` for each
+ * pair of npy_longdouble inputs, then calls npy_clear_floatstatus_barrier
+ * once the loop is done.
+ */
+NPY_NO_EXPORT void
+LONGDOUBLE_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble lhs = *(npy_longdouble *)ip1;
+        const npy_longdouble rhs = *(npy_longdouble *)ip2;
+        npy_bool *out = (npy_bool *)op1;
+        *out = (lhs || rhs);
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+
+/*
+ * LONGDOUBLE_logical_xor: true when exactly one of the two npy_longdouble
+ * inputs is non-zero.
+ */
+NPY_NO_EXPORT void
+LONGDOUBLE_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        /* Reduce each operand to 0/1 before comparing */
+        const int lhs_set = (*(npy_longdouble *)ip1 != 0);
+        const int rhs_set = (*(npy_longdouble *)ip2 != 0);
+        *((npy_bool *)op1) = (lhs_set ^ rhs_set);
+    }
+}
+
+/*
+ * LONGDOUBLE_logical_not: true when the npy_longdouble input compares
+ * equal to zero.
+ */
+NPY_NO_EXPORT void
+LONGDOUBLE_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_longdouble value = *(npy_longdouble *)ip1;
+        npy_bool *out = (npy_bool *)op1;
+        *out = (value == 0);
+    }
+}
+
+#if !0
+#line 1192
+/*
+ * LONGDOUBLE_isnan: stores whether npy_isnan reports non-zero for each
+ * npy_longdouble input, then calls npy_clear_floatstatus_barrier.
+ */
+NPY_NO_EXPORT void
+LONGDOUBLE_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_longdouble value = *(npy_longdouble *)ip1;
+        npy_bool *out = (npy_bool *)op1;
+        *out = npy_isnan(value) ? 1 : 0;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 1192
+/*
+ * LONGDOUBLE_isinf: stores whether npy_isinf reports non-zero for each
+ * npy_longdouble input, then calls npy_clear_floatstatus_barrier.
+ */
+NPY_NO_EXPORT void
+LONGDOUBLE_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_longdouble value = *(npy_longdouble *)ip1;
+        npy_bool *out = (npy_bool *)op1;
+        *out = npy_isinf(value) ? 1 : 0;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 1192
+/*
+ * LONGDOUBLE_isfinite: stores whether npy_isfinite reports non-zero for
+ * each npy_longdouble input, then calls npy_clear_floatstatus_barrier.
+ */
+NPY_NO_EXPORT void
+LONGDOUBLE_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_longdouble value = *(npy_longdouble *)ip1;
+        npy_bool *out = (npy_bool *)op1;
+        *out = npy_isfinite(value) ? 1 : 0;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 1192
+/*
+ * LONGDOUBLE_signbit: stores whether npy_signbit reports non-zero for
+ * each npy_longdouble input, then calls npy_clear_floatstatus_barrier.
+ */
+NPY_NO_EXPORT void
+LONGDOUBLE_signbit(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_longdouble value = *(npy_longdouble *)ip1;
+        npy_bool *out = (npy_bool *)op1;
+        *out = npy_signbit(value) ? 1 : 0;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#endif
+
+NPY_NO_EXPORT void
+LONGDOUBLE_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = npy_spacingl(ip1[i]) */
+    UNARY_LOOP {
+        *(npy_longdouble *)op1 = npy_spacingl(*(const npy_longdouble *)ip1);
+    }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = npy_copysignl(ip1[i], ip2[i]) */
+    BINARY_LOOP {
+        const npy_longdouble lhs = *(const npy_longdouble *)ip1;
+        *(npy_longdouble *)op1 = npy_copysignl(lhs, *(const npy_longdouble *)ip2);
+    }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_nextafter(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = npy_nextafterl(ip1[i], ip2[i]) */
+    BINARY_LOOP {
+        const npy_longdouble lhs = *(const npy_longdouble *)ip1;
+        *(npy_longdouble *)op1 = npy_nextafterl(lhs, *(const npy_longdouble *)ip2);
+    }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = npy_floor_dividel(ip1[i], ip2[i]) */
+    BINARY_LOOP {
+        const npy_longdouble lhs = *(const npy_longdouble *)ip1;
+        *(npy_longdouble *)op1 = npy_floor_dividel(lhs, *(const npy_longdouble *)ip2);
+    }
+}
+
+NPY_NO_EXPORT int
+LONGDOUBLE_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    /* For each k: target[index[k]] = npy_floor_dividel(target[index[k]], operand[k]); always returns 0. */
+    char *const target = args[0];
+    const char *index_cursor = args[1];
+    const char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], operand_stride = steps[2];
+    const npy_intp wrap_len = steps[3];  /* added to a negative index before it is used */
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        const npy_intp raw = *(const npy_intp *)index_cursor;
+        const npy_intp pos = (raw < 0) ? raw + wrap_len : raw;
+        npy_longdouble *const slot = (npy_longdouble *)(target + target_stride * pos);
+        *slot = npy_floor_dividel(*slot, *(const npy_longdouble *)operand_cursor);
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = npy_remainderl(ip1[i], ip2[i]) */
+    BINARY_LOOP {
+        const npy_longdouble lhs = *(const npy_longdouble *)ip1;
+        *(npy_longdouble *)op1 = npy_remainderl(lhs, *(const npy_longdouble *)ip2);
+    }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] receives npy_divmodl's return value; its pointer argument targets op2[i]. */
+    BINARY_LOOP_TWO_OUT {
+        npy_longdouble *const second_out = (npy_longdouble *)op2;
+        *(npy_longdouble *)op1 = npy_divmodl(*(const npy_longdouble *)ip1, *(const npy_longdouble *)ip2, second_out);
+    }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    OUTPUT_LOOP {  /* fill: each visited output element becomes 1 */
+        *(npy_longdouble *)op1 = (npy_longdouble)1;
+    }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* The input element is copied to the output unchanged. */
+    UNARY_LOOP {
+        *(npy_longdouble *)op1 = *(const npy_longdouble *)ip1;
+    }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = +ip1[i] (unary plus applied to each element). */
+    UNARY_LOOP {
+        *(npy_longdouble *)op1 = +(*(const npy_longdouble *)ip1);
+    }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {  /* 1, -1 or 0 by sign; a NaN (no comparison holds) is passed through, so sign(nan) is nan */
+        const npy_longdouble value = *(const npy_longdouble *)ip1;
+        const npy_longdouble not_positive = (value < 0) ? -1 : ((value == 0) ? 0 : value);
+        *(npy_longdouble *)op1 = (value > 0) ? 1 : not_positive;
+    }
+    npy_clear_floatstatus_barrier((char *)dimensions);
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] receives npy_modfl's return value; its pointer argument targets op2[i]. */
+    UNARY_LOOP_TWO_OUT {
+        *(npy_longdouble *)op1 = npy_modfl(*(const npy_longdouble *)ip1, (npy_longdouble *)op2);
+    }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * ldexp loop whose exponent operand is read as a C long instead of an
+     * int (cf. #866, #1633). npy_long != npy_int on many 64-bit platforms,
+     * so this second loop handles the default integer type.
+     *
+     * npy_ldexpl is always handed an int: an exponent that survives the
+     * round trip through int is passed through, anything else is clamped
+     * to NPY_MAX_INT or NPY_MIN_INT according to its sign.
+     */
+    BINARY_LOOP {
+        const npy_longdouble operand = *(const npy_longdouble *)ip1;
+        const long wide_exp = *(const long *)ip2;
+        const int narrow_exp = (int)wide_exp;
+        int exponent;
+
+        if (narrow_exp == wide_exp) {
+            exponent = narrow_exp;
+        }
+        else {
+            /* Outside the int range; ldexp overflows here anyway, since */
+            /* the exponent field has fewer bits than an int. */
+            exponent = (wide_exp > 0) ? NPY_MAX_INT : NPY_MIN_INT;
+        }
+        *(npy_longdouble *)op1 = npy_ldexpl(operand, exponent);
+    }
+}
+
+
+
+/*
+ *****************************************************************************
+ **                          LONGDOUBLE LOOPS                               **
+ *****************************************************************************
+ */
+
+#line 1377
+NPY_NO_EXPORT void
+LONGDOUBLE_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{  /* long double addition loop: one path when IS_BINARY_REDUCE holds, an elementwise path otherwise */
+    if (IS_BINARY_REDUCE) {  /* presumably the reduction call pattern; confirm against the macro's definition */
+#if 1
+        npy_longdouble * iop1 = (npy_longdouble *)args[0];  /* accumulator is the long double at args[0] */
+        npy_intp n = dimensions[0];
+
+        *iop1 += LONGDOUBLE_pairwise_sum(args[1], n, steps[1]);  /* compiled arm: add the helper's result for the args[1] operand */
+#else
+        BINARY_REDUCE_LOOP(npy_longdouble) {  /* dead arm: excluded by the `#if 1` above, never compiled */
+            io1 += *(npy_longdouble *)ip2;
+        }
+        *((npy_longdouble *)iop1) = io1;
+#endif
+    }
+    else {
+        BINARY_LOOP {  /* op1[i] = ip1[i] + ip2[i] */
+            const npy_longdouble in1 = *(npy_longdouble *)ip1;
+            const npy_longdouble in2 = *(npy_longdouble *)ip2;
+            *((npy_longdouble *)op1) = in1 + in2;
+        }
+    }
+}
+
+NPY_NO_EXPORT int
+LONGDOUBLE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    /* For each k: target[index[k]] = target[index[k]] + operand[k]; always returns 0. */
+    char *const target = args[0];
+    const char *index_cursor = args[1];
+    const char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], operand_stride = steps[2];
+    const npy_intp wrap_len = steps[3];  /* added to a negative index before it is used */
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        const npy_intp raw = *(const npy_intp *)index_cursor;
+        const npy_intp pos = (raw < 0) ? raw + wrap_len : raw;
+        npy_longdouble *const slot = (npy_longdouble *)(target + target_stride * pos);
+        *slot += *(const npy_longdouble *)operand_cursor;
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+
+#line 1377
+NPY_NO_EXPORT void
+LONGDOUBLE_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{  /* long double subtraction loop: one path when IS_BINARY_REDUCE holds, an elementwise path otherwise */
+    if (IS_BINARY_REDUCE) {  /* presumably the reduction call pattern; confirm against the macro's definition */
+#if 0
+        npy_longdouble * iop1 = (npy_longdouble *)args[0];  /* dead arm: excluded by `#if 0`, never compiled */
+        npy_intp n = dimensions[0];
+
+        *iop1 -= LONGDOUBLE_pairwise_sum(args[1], n, steps[1]);
+#else
+        BINARY_REDUCE_LOOP(npy_longdouble) {  /* compiled arm; io1/iop1/ip2 are not declared here, presumably by this macro */
+            io1 -= *(npy_longdouble *)ip2;  /* running value minus each operand element */
+        }
+        *((npy_longdouble *)iop1) = io1;  /* store the running value through iop1 */
+#endif
+    }
+    else {
+        BINARY_LOOP {  /* op1[i] = ip1[i] - ip2[i] */
+            const npy_longdouble in1 = *(npy_longdouble *)ip1;
+            const npy_longdouble in2 = *(npy_longdouble *)ip2;
+            *((npy_longdouble *)op1) = in1 - in2;
+        }
+    }
+}
+
+NPY_NO_EXPORT int
+LONGDOUBLE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    /* For each k: target[index[k]] = target[index[k]] - operand[k]; always returns 0. */
+    char *const target = args[0];
+    const char *index_cursor = args[1];
+    const char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], operand_stride = steps[2];
+    const npy_intp wrap_len = steps[3];  /* added to a negative index before it is used */
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        const npy_intp raw = *(const npy_intp *)index_cursor;
+        const npy_intp pos = (raw < 0) ? raw + wrap_len : raw;
+        npy_longdouble *const slot = (npy_longdouble *)(target + target_stride * pos);
+        *slot -= *(const npy_longdouble *)operand_cursor;
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+
+#line 1377
+NPY_NO_EXPORT void
+LONGDOUBLE_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{  /* long double multiplication loop: one path when IS_BINARY_REDUCE holds, an elementwise path otherwise */
+    if (IS_BINARY_REDUCE) {  /* presumably the reduction call pattern; confirm against the macro's definition */
+#if 0
+        npy_longdouble * iop1 = (npy_longdouble *)args[0];  /* dead arm: excluded by `#if 0`, never compiled */
+        npy_intp n = dimensions[0];
+
+        *iop1 *= LONGDOUBLE_pairwise_sum(args[1], n, steps[1]);
+#else
+        BINARY_REDUCE_LOOP(npy_longdouble) {  /* compiled arm; io1/iop1/ip2 are not declared here, presumably by this macro */
+            io1 *= *(npy_longdouble *)ip2;  /* running value times each operand element */
+        }
+        *((npy_longdouble *)iop1) = io1;  /* store the running value through iop1 */
+#endif
+    }
+    else {
+        BINARY_LOOP {  /* op1[i] = ip1[i] * ip2[i] */
+            const npy_longdouble in1 = *(npy_longdouble *)ip1;
+            const npy_longdouble in2 = *(npy_longdouble *)ip2;
+            *((npy_longdouble *)op1) = in1 * in2;
+        }
+    }
+}
+
+NPY_NO_EXPORT int
+LONGDOUBLE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    /* For each k: target[index[k]] = target[index[k]] * operand[k]; always returns 0. */
+    char *const target = args[0];
+    const char *index_cursor = args[1];
+    const char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], operand_stride = steps[2];
+    const npy_intp wrap_len = steps[3];  /* added to a negative index before it is used */
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        const npy_intp raw = *(const npy_intp *)index_cursor;
+        const npy_intp pos = (raw < 0) ? raw + wrap_len : raw;
+        npy_longdouble *const slot = (npy_longdouble *)(target + target_stride * pos);
+        *slot *= *(const npy_longdouble *)operand_cursor;
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+
+#line 1377
+NPY_NO_EXPORT void
+LONGDOUBLE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{  /* long double division loop: one path when IS_BINARY_REDUCE holds, an elementwise path otherwise */
+    if (IS_BINARY_REDUCE) {  /* presumably the reduction call pattern; confirm against the macro's definition */
+#if 0
+        npy_longdouble * iop1 = (npy_longdouble *)args[0];  /* dead arm: excluded by `#if 0`, never compiled */
+        npy_intp n = dimensions[0];
+
+        *iop1 /= LONGDOUBLE_pairwise_sum(args[1], n, steps[1]);
+#else
+        BINARY_REDUCE_LOOP(npy_longdouble) {  /* compiled arm; io1/iop1/ip2 are not declared here, presumably by this macro */
+            io1 /= *(npy_longdouble *)ip2;  /* running value divided by each operand element */
+        }
+        *((npy_longdouble *)iop1) = io1;  /* store the running value through iop1 */
+#endif
+    }
+    else {
+        BINARY_LOOP {  /* op1[i] = ip1[i] / ip2[i] */
+            const npy_longdouble in1 = *(npy_longdouble *)ip1;
+            const npy_longdouble in2 = *(npy_longdouble *)ip2;
+            *((npy_longdouble *)op1) = in1 / in2;
+        }
+    }
+}
+
+NPY_NO_EXPORT int
+LONGDOUBLE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    /* For each k: target[index[k]] = target[index[k]] / operand[k]; always returns 0. */
+    char *const target = args[0];
+    const char *index_cursor = args[1];
+    const char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], operand_stride = steps[2];
+    const npy_intp wrap_len = steps[3];  /* added to a negative index before it is used */
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        const npy_intp raw = *(const npy_intp *)index_cursor;
+        const npy_intp pos = (raw < 0) ? raw + wrap_len : raw;
+        npy_longdouble *const slot = (npy_longdouble *)(target + target_stride * pos);
+        *slot /= *(const npy_longdouble *)operand_cursor;
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+
+
+#line 1432
+NPY_NO_EXPORT void
+LONGDOUBLE_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = (ip1[i] == ip2[i]); npy_clear_floatstatus_barrier runs once after the loop. */
+    BINARY_LOOP {
+        const npy_longdouble lhs = *(const npy_longdouble *)ip1, rhs = *(const npy_longdouble *)ip2;
+        *(npy_bool *)op1 = (npy_bool)(lhs == rhs);
+    }
+    npy_clear_floatstatus_barrier((char *)dimensions);
+}
+
+#line 1432
+NPY_NO_EXPORT void
+LONGDOUBLE_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = (ip1[i] != ip2[i]); npy_clear_floatstatus_barrier runs once after the loop. */
+    BINARY_LOOP {
+        const npy_longdouble lhs = *(const npy_longdouble *)ip1, rhs = *(const npy_longdouble *)ip2;
+        *(npy_bool *)op1 = (npy_bool)(lhs != rhs);
+    }
+    npy_clear_floatstatus_barrier((char *)dimensions);
+}
+
+#line 1432
+NPY_NO_EXPORT void
+LONGDOUBLE_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = (ip1[i] < ip2[i]); npy_clear_floatstatus_barrier runs once after the loop. */
+    BINARY_LOOP {
+        const npy_longdouble lhs = *(const npy_longdouble *)ip1, rhs = *(const npy_longdouble *)ip2;
+        *(npy_bool *)op1 = (npy_bool)(lhs < rhs);
+    }
+    npy_clear_floatstatus_barrier((char *)dimensions);
+}
+
+#line 1432
+NPY_NO_EXPORT void
+LONGDOUBLE_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = (ip1[i] <= ip2[i]); npy_clear_floatstatus_barrier runs once after the loop. */
+    BINARY_LOOP {
+        const npy_longdouble lhs = *(const npy_longdouble *)ip1, rhs = *(const npy_longdouble *)ip2;
+        *(npy_bool *)op1 = (npy_bool)(lhs <= rhs);
+    }
+    npy_clear_floatstatus_barrier((char *)dimensions);
+}
+
+#line 1432
+NPY_NO_EXPORT void
+LONGDOUBLE_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = (ip1[i] > ip2[i]); npy_clear_floatstatus_barrier runs once after the loop. */
+    BINARY_LOOP {
+        const npy_longdouble lhs = *(const npy_longdouble *)ip1, rhs = *(const npy_longdouble *)ip2;
+        *(npy_bool *)op1 = (npy_bool)(lhs > rhs);
+    }
+    npy_clear_floatstatus_barrier((char *)dimensions);
+}
+
+#line 1432
+NPY_NO_EXPORT void
+LONGDOUBLE_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = (ip1[i] >= ip2[i]); npy_clear_floatstatus_barrier runs once after the loop. */
+    BINARY_LOOP {
+        const npy_longdouble lhs = *(const npy_longdouble *)ip1, rhs = *(const npy_longdouble *)ip2;
+        *(npy_bool *)op1 = (npy_bool)(lhs >= rhs);
+    }
+    npy_clear_floatstatus_barrier((char *)dimensions);
+}
+
+
+NPY_NO_EXPORT void
+LONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP {  /* op1[i] = 1 / ip1[i] */
+        const npy_longdouble divisor = *(const npy_longdouble *)ip1;
+        *(npy_longdouble *)op1 = 1 / divisor;
+    }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Absolute value via a sign test; npy_clear_floatstatus_barrier runs once after the loop. */
+    UNARY_LOOP {
+        const npy_longdouble value = *(const npy_longdouble *)ip1;
+        const npy_longdouble magnitude = (value > 0) ? value : -value;
+        *(npy_longdouble *)op1 = magnitude + 0;  /* adding 0 clears a -0.0 result */
+    }
+    npy_clear_floatstatus_barrier((char *)dimensions);
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP {  /* op1[i] = ip1[i] * ip1[i] */
+        const npy_longdouble value = *(const npy_longdouble *)ip1;
+        *(npy_longdouble *)op1 = value * value;
+    }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] receives npy_frexpl's return value; its int pointer argument targets op2[i]. */
+    UNARY_LOOP_TWO_OUT {
+        *(npy_longdouble *)op1 = npy_frexpl(*(const npy_longdouble *)ip1, (int *)op2);
+    }
+}
+
+NPY_NO_EXPORT void
+LONGDOUBLE_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = npy_ldexpl(ip1[i], e), where e is read from ip2[i] as a C int. */
+    BINARY_LOOP {
+        const int exponent = *(const int *)ip2;
+        *(npy_longdouble *)op1 = npy_ldexpl(*(const npy_longdouble *)ip1, exponent);
+    }
+}
+
+/*
+ *****************************************************************************
+ **                          HALF-FLOAT LOOPS                               **
+ *****************************************************************************
+ */
+
+
+#line 1506
+NPY_NO_EXPORT void
+HALF_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{  /* half-precision addition loop; arithmetic is carried out in float */
+    if (IS_BINARY_REDUCE) {  /* presumably the reduction call pattern; confirm against the macro's definition */
+        char *iop1 = args[0];
+        float io1 = npy_half_to_float(*(npy_half *)iop1);  /* float accumulator seeded from the half at args[0] */
+#if 1
+        npy_intp n = dimensions[0];
+
+        io1 += HALF_pairwise_sum(args[1], n, steps[1]);  /* compiled arm: add the helper's result for the args[1] operand */
+#else
+        BINARY_REDUCE_LOOP_INNER {  /* dead arm: excluded by the `#if 1` above, never compiled */
+            io1 += npy_half_to_float(*(npy_half *)ip2);
+        }
+#endif
+        *((npy_half *)iop1) = npy_float_to_half(io1);  /* convert the accumulator back to half and store it */
+    }
+    else {
+        BINARY_LOOP {  /* op1[i] = half(float(ip1[i]) + float(ip2[i])) */
+            const float in1 = npy_half_to_float(*(npy_half *)ip1);
+            const float in2 = npy_half_to_float(*(npy_half *)ip2);
+            *((npy_half *)op1) = npy_float_to_half(in1 + in2);
+        }
+    }
+}
+
+NPY_NO_EXPORT int
+HALF_add_indexed(void *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    /* For each k: target[index[k]] += operand[k], computed in float and converted back to half; returns 0. */
+    char *const target = args[0];
+    const char *index_cursor = args[1];
+    const char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], operand_stride = steps[2];
+    const npy_intp wrap_len = steps[3];  /* added to a negative index before it is used */
+    const npy_intp count = dimensions[0];
+    /* NOTE(review): context is `void *` here but `PyArrayMethod_Context *` in HALF_maximum_indexed; confirm against the prototype. */
+    for (npy_intp k = 0; k < count; ++k) {
+        const npy_intp raw = *(const npy_intp *)index_cursor;
+        const npy_intp pos = (raw < 0) ? raw + wrap_len : raw;
+        npy_half *const slot = (npy_half *)(target + target_stride * pos);
+        const float rhs = npy_half_to_float(*(const npy_half *)operand_cursor);
+        *slot = npy_float_to_half(npy_half_to_float(*slot) + rhs);
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+#line 1506
+NPY_NO_EXPORT void
+HALF_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{  /* half-precision subtraction loop; arithmetic is carried out in float */
+    if (IS_BINARY_REDUCE) {  /* presumably the reduction call pattern; confirm against the macro's definition */
+        char *iop1 = args[0];
+        float io1 = npy_half_to_float(*(npy_half *)iop1);  /* float accumulator seeded from the half at args[0] */
+#if 0
+        npy_intp n = dimensions[0];  /* dead arm: excluded by `#if 0`, never compiled */
+
+        io1 -= HALF_pairwise_sum(args[1], n, steps[1]);
+#else
+        BINARY_REDUCE_LOOP_INNER {  /* compiled arm; ip2 is not declared here, presumably by this macro */
+            io1 -= npy_half_to_float(*(npy_half *)ip2);
+        }
+#endif
+        *((npy_half *)iop1) = npy_float_to_half(io1);  /* convert the accumulator back to half and store it */
+    }
+    else {
+        BINARY_LOOP {  /* op1[i] = half(float(ip1[i]) - float(ip2[i])) */
+            const float in1 = npy_half_to_float(*(npy_half *)ip1);
+            const float in2 = npy_half_to_float(*(npy_half *)ip2);
+            *((npy_half *)op1) = npy_float_to_half(in1 - in2);
+        }
+    }
+}
+
+NPY_NO_EXPORT int
+HALF_subtract_indexed(void *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    /* For each k: target[index[k]] -= operand[k], computed in float and converted back to half; returns 0. */
+    char *const target = args[0];
+    const char *index_cursor = args[1];
+    const char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], operand_stride = steps[2];
+    const npy_intp wrap_len = steps[3];  /* added to a negative index before it is used */
+    const npy_intp count = dimensions[0];
+    /* NOTE(review): context is `void *` here but `PyArrayMethod_Context *` in HALF_maximum_indexed; confirm against the prototype. */
+    for (npy_intp k = 0; k < count; ++k) {
+        const npy_intp raw = *(const npy_intp *)index_cursor;
+        const npy_intp pos = (raw < 0) ? raw + wrap_len : raw;
+        npy_half *const slot = (npy_half *)(target + target_stride * pos);
+        const float rhs = npy_half_to_float(*(const npy_half *)operand_cursor);
+        *slot = npy_float_to_half(npy_half_to_float(*slot) - rhs);
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+#line 1506
+NPY_NO_EXPORT void
+HALF_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{  /* half-precision multiplication loop; arithmetic is carried out in float */
+    if (IS_BINARY_REDUCE) {  /* presumably the reduction call pattern; confirm against the macro's definition */
+        char *iop1 = args[0];
+        float io1 = npy_half_to_float(*(npy_half *)iop1);  /* float accumulator seeded from the half at args[0] */
+#if 0
+        npy_intp n = dimensions[0];  /* dead arm: excluded by `#if 0`, never compiled */
+
+        io1 *= HALF_pairwise_sum(args[1], n, steps[1]);
+#else
+        BINARY_REDUCE_LOOP_INNER {  /* compiled arm; ip2 is not declared here, presumably by this macro */
+            io1 *= npy_half_to_float(*(npy_half *)ip2);
+        }
+#endif
+        *((npy_half *)iop1) = npy_float_to_half(io1);  /* convert the accumulator back to half and store it */
+    }
+    else {
+        BINARY_LOOP {  /* op1[i] = half(float(ip1[i]) * float(ip2[i])) */
+            const float in1 = npy_half_to_float(*(npy_half *)ip1);
+            const float in2 = npy_half_to_float(*(npy_half *)ip2);
+            *((npy_half *)op1) = npy_float_to_half(in1 * in2);
+        }
+    }
+}
+
+NPY_NO_EXPORT int
+HALF_multiply_indexed(void *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    /* For each k: target[index[k]] *= operand[k], computed in float and converted back to half; returns 0. */
+    char *const target = args[0];
+    const char *index_cursor = args[1];
+    const char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], operand_stride = steps[2];
+    const npy_intp wrap_len = steps[3];  /* added to a negative index before it is used */
+    const npy_intp count = dimensions[0];
+    /* NOTE(review): context is `void *` here but `PyArrayMethod_Context *` in HALF_maximum_indexed; confirm against the prototype. */
+    for (npy_intp k = 0; k < count; ++k) {
+        const npy_intp raw = *(const npy_intp *)index_cursor;
+        const npy_intp pos = (raw < 0) ? raw + wrap_len : raw;
+        npy_half *const slot = (npy_half *)(target + target_stride * pos);
+        const float rhs = npy_half_to_float(*(const npy_half *)operand_cursor);
+        *slot = npy_float_to_half(npy_half_to_float(*slot) * rhs);
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+#line 1506
+NPY_NO_EXPORT void
+HALF_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{  /* half-precision division loop; arithmetic is carried out in float */
+    if (IS_BINARY_REDUCE) {  /* presumably the reduction call pattern; confirm against the macro's definition */
+        char *iop1 = args[0];
+        float io1 = npy_half_to_float(*(npy_half *)iop1);  /* float accumulator seeded from the half at args[0] */
+#if 0
+        npy_intp n = dimensions[0];  /* dead arm: excluded by `#if 0`, never compiled */
+
+        io1 /= HALF_pairwise_sum(args[1], n, steps[1]);
+#else
+        BINARY_REDUCE_LOOP_INNER {  /* compiled arm; ip2 is not declared here, presumably by this macro */
+            io1 /= npy_half_to_float(*(npy_half *)ip2);
+        }
+#endif
+        *((npy_half *)iop1) = npy_float_to_half(io1);  /* convert the accumulator back to half and store it */
+    }
+    else {
+        BINARY_LOOP {  /* op1[i] = half(float(ip1[i]) / float(ip2[i])) */
+            const float in1 = npy_half_to_float(*(npy_half *)ip1);
+            const float in2 = npy_half_to_float(*(npy_half *)ip2);
+            *((npy_half *)op1) = npy_float_to_half(in1 / in2);
+        }
+    }
+}
+
+NPY_NO_EXPORT int
+HALF_divide_indexed(void *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    /* For each k: target[index[k]] /= operand[k], computed in float and converted back to half; returns 0. */
+    char *const target = args[0];
+    const char *index_cursor = args[1];
+    const char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], operand_stride = steps[2];
+    const npy_intp wrap_len = steps[3];  /* added to a negative index before it is used */
+    const npy_intp count = dimensions[0];
+    /* NOTE(review): context is `void *` here but `PyArrayMethod_Context *` in HALF_maximum_indexed; confirm against the prototype. */
+    for (npy_intp k = 0; k < count; ++k) {
+        const npy_intp raw = *(const npy_intp *)index_cursor;
+        const npy_intp pos = (raw < 0) ? raw + wrap_len : raw;
+        npy_half *const slot = (npy_half *)(target + target_stride * pos);
+        const float rhs = npy_half_to_float(*(const npy_half *)operand_cursor);
+        *slot = npy_float_to_half(npy_half_to_float(*slot) / rhs);
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+
+#define _HALF_LOGICAL_AND(a,b) (!npy_half_iszero(a) && !npy_half_iszero(b))  /* nonzero when npy_half_iszero is false for both a and b */
+#define _HALF_LOGICAL_OR(a,b) (!npy_half_iszero(a) || !npy_half_iszero(b))  /* nonzero when npy_half_iszero is false for a or for b */
+#line 1566
+NPY_NO_EXPORT void
+HALF_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = npy_half_eq(ip1[i], ip2[i]) */
+    BINARY_LOOP {
+        const npy_half lhs = *(const npy_half *)ip1;
+        *(npy_bool *)op1 = (npy_bool)npy_half_eq(lhs, *(const npy_half *)ip2);
+    }
+}
+
+#line 1566
+NPY_NO_EXPORT void
+HALF_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = npy_half_ne(ip1[i], ip2[i]) */
+    BINARY_LOOP {
+        const npy_half lhs = *(const npy_half *)ip1;
+        *(npy_bool *)op1 = (npy_bool)npy_half_ne(lhs, *(const npy_half *)ip2);
+    }
+}
+
+#line 1566
+NPY_NO_EXPORT void
+HALF_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = npy_half_lt(ip1[i], ip2[i]) */
+    BINARY_LOOP {
+        const npy_half lhs = *(const npy_half *)ip1;
+        *(npy_bool *)op1 = (npy_bool)npy_half_lt(lhs, *(const npy_half *)ip2);
+    }
+}
+
+#line 1566
+NPY_NO_EXPORT void
+HALF_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = npy_half_le(ip1[i], ip2[i]) */
+    BINARY_LOOP {
+        const npy_half lhs = *(const npy_half *)ip1;
+        *(npy_bool *)op1 = (npy_bool)npy_half_le(lhs, *(const npy_half *)ip2);
+    }
+}
+
+#line 1566
+NPY_NO_EXPORT void
+HALF_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = npy_half_gt(ip1[i], ip2[i]) */
+    BINARY_LOOP {
+        const npy_half lhs = *(const npy_half *)ip1;
+        *(npy_bool *)op1 = (npy_bool)npy_half_gt(lhs, *(const npy_half *)ip2);
+    }
+}
+
+#line 1566
+NPY_NO_EXPORT void
+HALF_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = npy_half_ge(ip1[i], ip2[i]) */
+    BINARY_LOOP {
+        const npy_half lhs = *(const npy_half *)ip1;
+        *(npy_bool *)op1 = (npy_bool)npy_half_ge(lhs, *(const npy_half *)ip2);
+    }
+}
+
+#line 1566
+NPY_NO_EXPORT void
+HALF_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = _HALF_LOGICAL_AND(ip1[i], ip2[i]) */
+    BINARY_LOOP {
+        const npy_half lhs = *(const npy_half *)ip1, rhs = *(const npy_half *)ip2;
+        *(npy_bool *)op1 = (npy_bool)_HALF_LOGICAL_AND(lhs, rhs);
+    }
+}
+
+#line 1566
+NPY_NO_EXPORT void
+HALF_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = _HALF_LOGICAL_OR(ip1[i], ip2[i]) */
+    BINARY_LOOP {
+        const npy_half lhs = *(const npy_half *)ip1, rhs = *(const npy_half *)ip2;
+        *(npy_bool *)op1 = (npy_bool)_HALF_LOGICAL_OR(lhs, rhs);
+    }
+}
+
+#undef _HALF_LOGICAL_AND
+#undef _HALF_LOGICAL_OR
+
+NPY_NO_EXPORT void
+HALF_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = 1 when npy_half_iszero is false for exactly one of the two operands. */
+    BINARY_LOOP {
+        const int lhs_zero = npy_half_iszero(*(const npy_half *)ip1);
+        *(npy_bool *)op1 = (npy_bool)(!lhs_zero != !npy_half_iszero(*(const npy_half *)ip2));
+    }
+}
+
+NPY_NO_EXPORT void
+HALF_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = npy_half_iszero(ip1[i]) */
+    UNARY_LOOP {
+        *(npy_bool *)op1 = (npy_bool)npy_half_iszero(*(const npy_half *)ip1);
+    }
+}
+
+#line 1602
+NPY_NO_EXPORT void
+HALF_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = 1 when npy_half_isnan(ip1[i]) is nonzero, else 0. */
+    UNARY_LOOP {
+        *(npy_bool *)op1 = (npy_bool)(npy_half_isnan(*(const npy_half *)ip1) != 0);
+    }
+    npy_clear_floatstatus_barrier((char *)dimensions);
+}
+
+#line 1602
+NPY_NO_EXPORT void
+HALF_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = 1 when npy_half_isinf(ip1[i]) is nonzero, else 0. */
+    UNARY_LOOP {
+        *(npy_bool *)op1 = (npy_bool)(npy_half_isinf(*(const npy_half *)ip1) != 0);
+    }
+    npy_clear_floatstatus_barrier((char *)dimensions);
+}
+
+#line 1602
+NPY_NO_EXPORT void
+HALF_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = 1 when npy_half_isfinite(ip1[i]) is nonzero, else 0. */
+    UNARY_LOOP {
+        *(npy_bool *)op1 = (npy_bool)(npy_half_isfinite(*(const npy_half *)ip1) != 0);
+    }
+    npy_clear_floatstatus_barrier((char *)dimensions);
+}
+
+#line 1602
+NPY_NO_EXPORT void
+HALF_signbit(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = 1 when npy_half_signbit(ip1[i]) is nonzero, else 0. */
+    UNARY_LOOP {
+        *(npy_bool *)op1 = (npy_bool)(npy_half_signbit(*(const npy_half *)ip1) != 0);
+    }
+    npy_clear_floatstatus_barrier((char *)dimensions);
+}
+
+
+NPY_NO_EXPORT void
+HALF_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = npy_half_spacing(ip1[i]) */
+    UNARY_LOOP {
+        *(npy_half *)op1 = npy_half_spacing(*(const npy_half *)ip1);
+    }
+}
+
+NPY_NO_EXPORT void
+HALF_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = npy_half_copysign(ip1[i], ip2[i]) */
+    BINARY_LOOP {
+        const npy_half lhs = *(const npy_half *)ip1;
+        *(npy_half *)op1 = npy_half_copysign(lhs, *(const npy_half *)ip2);
+    }
+}
+
+NPY_NO_EXPORT void
+HALF_nextafter(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* op1[i] = npy_half_nextafter(ip1[i], ip2[i]) */
+    BINARY_LOOP {
+        const npy_half lhs = *(const npy_half *)ip1;
+        *(npy_half *)op1 = npy_half_nextafter(lhs, *(const npy_half *)ip2);
+    }
+}
+
+#line 1646
+NPY_NO_EXPORT void
+HALF_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Picks the first operand when npy_half_ge(first, second) holds or the first is NaN; otherwise the second. */
+    BINARY_LOOP {
+        const npy_half lhs = *(const npy_half *)ip1, rhs = *(const npy_half *)ip2;
+        const int keep_lhs = npy_half_ge(lhs, rhs) || npy_half_isnan(lhs);
+        *(npy_half *)op1 = keep_lhs ? lhs : rhs;
+    }
+    /* npy_half_isnan will never set floatstatus_invalid, so do not clear */
+}
+
+NPY_NO_EXPORT int
+HALF_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    /* For each k: target[index[k]] keeps its value when it is >= operand[k] or is NaN, else takes operand[k]; returns 0. */
+    char *const target = args[0];
+    const char *index_cursor = args[1];
+    const char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], operand_stride = steps[2];
+    const npy_intp wrap_len = steps[3];  /* added to a negative index before it is used */
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        const npy_intp raw = *(const npy_intp *)index_cursor;
+        const npy_intp pos = (raw < 0) ? raw + wrap_len : raw;
+        npy_half *const slot = (npy_half *)(target + target_stride * pos);
+        const npy_half candidate = *(const npy_half *)operand_cursor, current = *slot;
+        *slot = (npy_half_ge(current, candidate) || npy_half_isnan(current)) ? current : candidate;
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+
+#line 1646
+NPY_NO_EXPORT void
+HALF_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Picks the first operand when npy_half_le(first, second) holds or the first is NaN; otherwise the second. */
+    BINARY_LOOP {
+        const npy_half lhs = *(const npy_half *)ip1, rhs = *(const npy_half *)ip2;
+        const int keep_lhs = npy_half_le(lhs, rhs) || npy_half_isnan(lhs);
+        *(npy_half *)op1 = keep_lhs ? lhs : rhs;
+    }
+    /* npy_half_isnan will never set floatstatus_invalid, so do not clear */
+}
+
+NPY_NO_EXPORT int
+HALF_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    /* For each k: target[index[k]] keeps its value when it is <= operand[k] or is NaN, else takes operand[k]; returns 0. */
+    char *const target = args[0];
+    const char *index_cursor = args[1];
+    const char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], operand_stride = steps[2];
+    const npy_intp wrap_len = steps[3];  /* added to a negative index before it is used */
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        const npy_intp raw = *(const npy_intp *)index_cursor;
+        const npy_intp pos = (raw < 0) ? raw + wrap_len : raw;
+        npy_half *const slot = (npy_half *)(target + target_stride * pos);
+        const npy_half candidate = *(const npy_half *)operand_cursor, current = *slot;
+        *slot = (npy_half_le(current, candidate) || npy_half_isnan(current)) ? current : candidate;
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+
+
+#line 1689
+NPY_NO_EXPORT void
+HALF_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Picks the first operand when npy_half_ge(first, second) holds or the second is NaN; otherwise the second. */
+    BINARY_LOOP {
+        const npy_half lhs = *(const npy_half *)ip1, rhs = *(const npy_half *)ip2;
+        const int keep_lhs = npy_half_ge(lhs, rhs) || npy_half_isnan(rhs);
+        *(npy_half *)op1 = keep_lhs ? lhs : rhs;
+    }
+    /* no need to clear floatstatus_invalid */
+}
+
+NPY_NO_EXPORT int
+HALF_fmax_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{
+    /* For each k: target[index[k]] keeps its value when it is >= operand[k] or operand[k] is NaN, else takes operand[k]; returns 0. */
+    char *const target = args[0];
+    const char *index_cursor = args[1];
+    const char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], operand_stride = steps[2];
+    const npy_intp wrap_len = steps[3];  /* added to a negative index before it is used */
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        const npy_intp raw = *(const npy_intp *)index_cursor;
+        const npy_intp pos = (raw < 0) ? raw + wrap_len : raw;
+        npy_half *const slot = (npy_half *)(target + target_stride * pos);
+        const npy_half candidate = *(const npy_half *)operand_cursor, current = *slot;
+        *slot = (npy_half_ge(current, candidate) || npy_half_isnan(candidate)) ? current : candidate;
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+
+#line 1689
+NPY_NO_EXPORT void
+HALF_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Picks the first operand when npy_half_le(first, second) holds or the second is NaN; otherwise the second. */
+    BINARY_LOOP {
+        const npy_half lhs = *(const npy_half *)ip1, rhs = *(const npy_half *)ip2;
+        const int keep_lhs = npy_half_le(lhs, rhs) || npy_half_isnan(rhs);
+        *(npy_half *)op1 = keep_lhs ? lhs : rhs;
+    }
+    /* no need to clear floatstatus_invalid */
+}
+
+NPY_NO_EXPORT int
+HALF_fmin_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{   /* Indexed in-place fmin on npy_half: for each i, base[idx[i]] = fmin(base[idx[i]], value[i]); returns 0. */
+    char *ip1 = args[0];  /* base address of the array updated in place */
+    char *indxp = args[1];  /* cursor over the npy_intp indices */
+    char *value = args[2];  /* cursor over the right-hand operands */
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp shape = steps[3];  /* steps[3] is added to negative indices to wrap them */
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    npy_half *indexed;
+    for (i = 0; i < n; i++, indxp += isindex, value += isb) {
+        npy_intp indx = *(npy_intp *)indxp;
+        if (indx < 0) {  /* no further range check is done in this function */
+            indx += shape;
+        }
+        indexed = (npy_half *)(ip1 + is1 * indx);
+        npy_half v = *(npy_half *)value;
+        *indexed = (npy_half_le(*indexed, v) || npy_half_isnan(v)) ? *indexed: v;  /* NaN v leaves the slot unchanged */
+    }
+    return 0;
+}
+
+
+
+NPY_NO_EXPORT void
+HALF_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Floor division for npy_half, evaluated in single precision through npy_floor_dividef. */
+    BINARY_LOOP {
+        const npy_half in1 = *(npy_half *)ip1;
+        const npy_half in2 = *(npy_half *)ip2;
+
+        float fh1 = npy_half_to_float(in1);  /* widen both operands to float */
+        float fh2 = npy_half_to_float(in2);
+        float div;
+
+        div = npy_floor_dividef(fh1, fh2);
+        *((npy_half *)op1) = npy_float_to_half(div);  /* narrow the quotient back to half */
+    }
+}
+
+NPY_NO_EXPORT int
+HALF_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
+         char **args, npy_intp const *dimensions, npy_intp const *steps,
+         void *NPY_UNUSED(func))
+{   /* Indexed in-place floor division on npy_half: base[idx[i]] = floor_divide(base[idx[i]], value[i]); returns 0. */
+    char *ip1 = args[0];  /* base address of the array updated in place */
+    char *indxp = args[1];  /* cursor over the npy_intp indices */
+    char *value = args[2];  /* cursor over the divisors */
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp shape = steps[3];  /* steps[3] is added to negative indices to wrap them */
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    npy_half *indexed;
+    for(i = 0; i < n; i++, indxp += isindex, value += isb) {
+        npy_intp indx = *(npy_intp *)indxp;
+        if (indx < 0) {  /* no further range check is done in this function */
+            indx += shape;
+        }
+        indexed = (npy_half *)(ip1 + is1 * indx);
+        float v = npy_half_to_float(*(npy_half *)value);  /* the division itself runs in float */
+        float div = npy_floor_dividef(npy_half_to_float(*indexed), v);
+        *indexed = npy_float_to_half(div);
+    }
+    return 0;
+}
+
+NPY_NO_EXPORT void
+HALF_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Remainder for npy_half, evaluated in single precision through npy_remainderf. */
+    BINARY_LOOP {
+        const npy_half in1 = *(npy_half *)ip1;
+        const npy_half in2 = *(npy_half *)ip2;
+        float fh1 = npy_half_to_float(in1);  /* widen both operands to float */
+        float fh2 = npy_half_to_float(in2);
+        float mod;
+        mod = npy_remainderf(fh1, fh2);
+        *((npy_half *)op1) = npy_float_to_half(mod);  /* narrow the result back to half */
+    }
+}
+
+NPY_NO_EXPORT void
+HALF_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Two-output loop: op1 gets npy_half_divmod's return value, op2 is handed to it as the output pointer. */
+    BINARY_LOOP_TWO_OUT {
+        const npy_half in1 = *(npy_half *)ip1;
+        const npy_half in2 = *(npy_half *)ip2;
+        *((npy_half *)op1) = npy_half_divmod(in1, in2, (npy_half *)op2);  /* presumably quotient -> op1, modulus -> op2; confirm against npy_half_divmod */
+    }
+}
+
+NPY_NO_EXPORT void
+HALF_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Squares each npy_half element; the product is formed in float and then narrowed. */
+    UNARY_LOOP {
+        const float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(in1*in1);
+    }
+}
+
+NPY_NO_EXPORT void
+HALF_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Stores 1/x for each npy_half element; the division is done in float and then narrowed. */
+    UNARY_LOOP {
+        const float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(1/in1);  /* 1 is converted to float, so this is a float division */
+    }
+}
+
+NPY_NO_EXPORT void
+HALF__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Writes NPY_HALF_ONE to every output element; no input value is read in the loop body. */
+    OUTPUT_LOOP {
+        *((npy_half *)op1) = NPY_HALF_ONE;
+    }
+}
+
+NPY_NO_EXPORT void
+HALF_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Conjugate of a real half value is the value itself, so this loop is a plain copy. */
+    UNARY_LOOP {
+        const npy_half in1 = *(npy_half *)ip1;
+        *((npy_half *)op1) = in1;
+    }
+}
+
+NPY_NO_EXPORT void
+HALF_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Negates each npy_half purely on its bit pattern, without converting to float. */
+    UNARY_LOOP {
+        const npy_half in1 = *(npy_half *)ip1;
+        *((npy_half *)op1) = in1^0x8000u;  /* toggle bit 15, the sign bit of the 16-bit pattern */
+    }
+}
+
+NPY_NO_EXPORT void
+HALF_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Unary plus for npy_half: each element is copied to the output with its bits unchanged. */
+    UNARY_LOOP {
+        const npy_half in1 = *(npy_half *)ip1;
+        *((npy_half *)op1) = +in1;  /* unary + on the raw bit pattern; the stored value equals in1 */
+    }
+}
+
+NPY_NO_EXPORT void
+HALF_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Sign of each npy_half, decided on the bit pattern: NaN -> same NaN, zero -> 0, else +1 or -1. */
+    /* Sign of nan is nan */
+    UNARY_LOOP {
+        const npy_half in1 = *(npy_half *)ip1;
+        *((npy_half *)op1) = npy_half_isnan(in1) ? in1 :
+                    (((in1&0x7fffu) == 0) ? 0 :  /* all non-sign bits clear: +0 or -0 */
+                      (((in1&0x8000u) == 0) ? NPY_HALF_ONE : NPY_HALF_NEGONE));  /* bit 15 selects the sign */
+    }
+}
+
+NPY_NO_EXPORT void
+HALF_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Two-output loop that splits each npy_half with npy_modff, working in float. */
+    float temp;  /* receives the value npy_modff writes through its pointer argument */
+
+    UNARY_LOOP_TWO_OUT {
+        const float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_modff(in1, &temp));  /* return value (presumably the fractional part) -> op1 */
+        *((npy_half *)op2) = npy_float_to_half(temp);  /* pointer result (presumably the integral part) -> op2 */
+    }
+}
+
+NPY_NO_EXPORT void
+HALF_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Two-output loop: npy_frexpf's return value is stored as half in op1; op2 is written as a C int. */
+    UNARY_LOOP_TWO_OUT {
+        const float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_frexpf(in1, (int *)op2));  /* npy_frexpf writes its int result straight into op2 */
+    }
+}
+
+NPY_NO_EXPORT void
+HALF_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* ldexp for a half first operand and a C int second operand, computed in float via npy_ldexpf. */
+    BINARY_LOOP {
+        const float in1 = npy_half_to_float(*(npy_half *)ip1);
+        const int in2 = *(int *)ip2;  /* second operand is read as a C int */
+        *((npy_half *)op1) = npy_float_to_half(npy_ldexpf(in1, in2));
+    }
+}
+
+NPY_NO_EXPORT void
+HALF_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* ldexp for a half first operand and a C long second operand; out-of-int-range values are clamped. */
+    /*
+     * Additional loop to handle npy_long integer inputs (cf. #866, #1633).
+     * npy_long != npy_int on many 64-bit platforms, so we need this second loop
+     * to handle the default integer type.
+     */
+    BINARY_LOOP {
+        const float in1 = npy_half_to_float(*(npy_half *)ip1);
+        const long in2 = *(long *)ip2;
+        if (((int)in2) == in2) {  /* true only when in2 survives the round trip through int */
+            /* Range OK */
+            *((npy_half *)op1) = npy_float_to_half(npy_ldexpf(in1, ((int)in2)));
+        }
+        else {
+            /*
+             * Outside npy_int range -- also ldexp will overflow in this case,
+             * given that exponent has less bits than npy_int.
+             */
+            if (in2 > 0) {  /* clamp to the largest int exponent */
+                *((npy_half *)op1) = npy_float_to_half(npy_ldexpf(in1, NPY_MAX_INT));
+            }
+            else {  /* clamp to the smallest int exponent */
+                *((npy_half *)op1) = npy_float_to_half(npy_ldexpf(in1, NPY_MIN_INT));
+            }
+        }
+    }
+}
+
+/*
+ *****************************************************************************
+ **                           COMPLEX LOOPS                                 **
+ *****************************************************************************
+ */
+
+#define CGE(xr,xi,yr,yi) (((xr) > (yr) && !npy_isnan(xi) && !npy_isnan(yi)) \
+                          || ((xr) == (yr) && (xi) >= (yi)))  /* (xr,xi) >= (yr,yi): real parts first, imaginary parts break ties */
+#define CLE(xr,xi,yr,yi) (((xr) < (yr) && !npy_isnan(xi) && !npy_isnan(yi)) \
+                          || ((xr) == (yr) && (xi) <= (yi)))  /* (xr,xi) <= (yr,yi), same ordering */
+#define CGT(xr,xi,yr,yi) (((xr) > (yr) && !npy_isnan(xi) && !npy_isnan(yi)) \
+                          || ((xr) == (yr) && (xi) > (yi)))  /* strict >; a NaN imaginary part makes it false */
+#define CLT(xr,xi,yr,yi) (((xr) < (yr) && !npy_isnan(xi) && !npy_isnan(yi)) \
+                          || ((xr) == (yr) && (xi) < (yi)))  /* strict <; a NaN imaginary part makes it false */
+#define CEQ(xr,xi,yr,yi) ((xr) == (yr) && (xi) == (yi))  /* both components equal; every parameter is parenthesized so expression arguments expand safely */
+#define CNE(xr,xi,yr,yi) ((xr) != (yr) || (xi) != (yi))  /* either component differs; arguments may still be evaluated more than once */
+
+#line 1945
+
+#if !1
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
+#line 1954
+NPY_NO_EXPORT void
+CFLOAT_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Complex-float add loop. Sits inside the enclosing `#if !1`, so the preprocessor drops this definition. */
+    // Parenthesis around 1 tells clang dead code is intentional
+    if (IS_BINARY_REDUCE && (1)) {  /* reduction case: add the summed input into args[0] */
+        npy_intp n = dimensions[0];
+        npy_float * or = ((npy_float *)args[0]);  /* real part of the accumulator */
+        npy_float * oi = ((npy_float *)args[0]) + 1;  /* imaginary part of the accumulator */
+        npy_float rr, ri;
+
+        CFLOAT_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2);  /* n * 2 floats, half the byte step */
+        *or += rr;
+        *oi += ri;
+        return;
+    }
+    else {
+        BINARY_LOOP {  /* element-wise case: add real and imaginary parts separately */
+            const npy_float in1r = ((npy_float *)ip1)[0];
+            const npy_float in1i = ((npy_float *)ip1)[1];
+            const npy_float in2r = ((npy_float *)ip2)[0];
+            const npy_float in2i = ((npy_float *)ip2)[1];
+            ((npy_float *)op1)[0] = in1r + in2r;
+            ((npy_float *)op1)[1] = in1i + in2i;
+        }
+    }
+}
+
+NPY_NO_EXPORT int CFLOAT_add_indexed
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{   /* Indexed in-place complex-float add: base[idx[i]] += value[i]; returns 0. Inside the enclosing `#if !1`. */
+    char *ip1 = args[0];  /* base address of the array updated in place */
+    char *indxp = args[1];  /* cursor over the npy_intp indices */
+    char *value = args[2];  /* cursor over the addends */
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp shape = steps[3];  /* steps[3] is added to negative indices to wrap them */
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    npy_float *indexed;
+    for(i = 0; i < n; i++, indxp += isindex, value += isb) {
+        npy_intp indx = *(npy_intp *)indxp;
+        if (indx < 0) {  /* no further range check is done in this function */
+            indx += shape;
+        }
+        indexed = (npy_float *)(ip1 + is1 * indx);
+        const npy_float b_r = ((npy_float *)value)[0];
+        const npy_float b_i = ((npy_float *)value)[1];
+        indexed[0] += b_r;  /* real part */
+        indexed[1] += b_i;  /* imaginary part */
+    }
+    return 0;
+}
+
+#line 1954
+NPY_NO_EXPORT void
+CFLOAT_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Complex-float subtract loop. Sits inside the enclosing `#if !1`, so the preprocessor drops this definition. */
+    // Parenthesis around 0 tells clang dead code is intentional
+    if (IS_BINARY_REDUCE && (0)) {  /* the constant 0 means this reduction branch is never taken */
+        npy_intp n = dimensions[0];
+        npy_float * or = ((npy_float *)args[0]);
+        npy_float * oi = ((npy_float *)args[0]) + 1;
+        npy_float rr, ri;
+
+        CFLOAT_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2);
+        *or -= rr;
+        *oi -= ri;
+        return;
+    }
+    else {
+        BINARY_LOOP {  /* element-wise case: subtract real and imaginary parts separately */
+            const npy_float in1r = ((npy_float *)ip1)[0];
+            const npy_float in1i = ((npy_float *)ip1)[1];
+            const npy_float in2r = ((npy_float *)ip2)[0];
+            const npy_float in2i = ((npy_float *)ip2)[1];
+            ((npy_float *)op1)[0] = in1r - in2r;
+            ((npy_float *)op1)[1] = in1i - in2i;
+        }
+    }
+}
+
+NPY_NO_EXPORT int CFLOAT_subtract_indexed
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{   /* Indexed in-place complex-float subtract: base[idx[i]] -= value[i]; returns 0. Inside the enclosing `#if !1`. */
+    char *ip1 = args[0];  /* base address of the array updated in place */
+    char *indxp = args[1];  /* cursor over the npy_intp indices */
+    char *value = args[2];  /* cursor over the subtrahends */
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp shape = steps[3];  /* steps[3] is added to negative indices to wrap them */
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    npy_float *indexed;
+    for(i = 0; i < n; i++, indxp += isindex, value += isb) {
+        npy_intp indx = *(npy_intp *)indxp;
+        if (indx < 0) {  /* no further range check is done in this function */
+            indx += shape;
+        }
+        indexed = (npy_float *)(ip1 + is1 * indx);
+        const npy_float b_r = ((npy_float *)value)[0];
+        const npy_float b_i = ((npy_float *)value)[1];
+        indexed[0] -= b_r;  /* real part */
+        indexed[1] -= b_i;  /* imaginary part */
+    }
+    return 0;
+}
+
+
+NPY_NO_EXPORT void
+CFLOAT_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Complex-float multiply loop. Sits inside the enclosing `#if !1`, so the preprocessor drops this definition. */
+    BINARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        const npy_float in2r = ((npy_float *)ip2)[0];
+        const npy_float in2i = ((npy_float *)ip2)[1];
+        ((npy_float *)op1)[0] = in1r*in2r - in1i*in2i;  /* Re = ac - bd */
+        ((npy_float *)op1)[1] = in1r*in2i + in1i*in2r;  /* Im = ad + bc */
+    }
+}
+
+NPY_NO_EXPORT int CFLOAT_multiply_indexed
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{   /* Indexed in-place complex-float multiply: base[idx[i]] *= value[i]; returns 0. Inside the enclosing `#if !1`. */
+    char *ip1 = args[0];  /* base address of the array updated in place */
+    char *indxp = args[1];  /* cursor over the npy_intp indices */
+    char *value = args[2];  /* cursor over the multipliers */
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp shape = steps[3];  /* steps[3] is added to negative indices to wrap them */
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    npy_float *indexed;
+    for(i = 0; i < n; i++, indxp += isindex, value += isb) {
+        npy_intp indx = *(npy_intp *)indxp;
+        if (indx < 0) {  /* no further range check is done in this function */
+            indx += shape;
+        }
+        indexed = (npy_float *)(ip1 + is1 * indx);
+        const npy_float a_r = indexed[0];  /* copy the old value first: both outputs need both old parts */
+        const npy_float a_i = indexed[1];
+        const npy_float b_r = ((npy_float *)value)[0];
+        const npy_float b_i = ((npy_float *)value)[1];
+        indexed[0] = a_r*b_r - a_i*b_i;
+        indexed[1] = a_r*b_i + a_i*b_r;
+    }
+    return 0;
+}
+#endif // !SIMD
+
+NPY_NO_EXPORT void
+CFLOAT_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Complex-float division; scales by the larger-magnitude divisor component so |rat| <= 1. */
+    BINARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        const npy_float in2r = ((npy_float *)ip2)[0];
+        const npy_float in2i = ((npy_float *)ip2)[1];
+        const npy_float in2r_abs = npy_fabsf(in2r);
+        const npy_float in2i_abs = npy_fabsf(in2i);
+        if (in2r_abs >= in2i_abs) {  /* real part of the divisor dominates (or both are zero) */
+            if (in2r_abs == 0 && in2i_abs == 0) {
+                /* divide by zero should yield a complex inf or nan */
+                ((npy_float *)op1)[0] = in1r/in2r_abs;  /* divides by the absolute value, i.e. by +0 */
+                ((npy_float *)op1)[1] = in1i/in2i_abs;
+            }
+            else {
+                const npy_float rat = in2i/in2r;
+                const npy_float scl = 1.0f/(in2r + in2i*rat);
+                ((npy_float *)op1)[0] = (in1r + in1i*rat)*scl;
+                ((npy_float *)op1)[1] = (in1i - in1r*rat)*scl;
+            }
+        }
+        else {  /* imaginary part of the divisor dominates */
+            const npy_float rat = in2r/in2i;
+            const npy_float scl = 1.0f/(in2i + in2r*rat);
+            ((npy_float *)op1)[0] = (in1r*rat + in1i)*scl;
+            ((npy_float *)op1)[1] = (in1i*rat - in1r)*scl;
+        }
+    }
+}
+
+
+#line 2085
+NPY_NO_EXPORT void
+CFLOAT_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Writes an npy_bool per element: in1 > in2 as defined by the CGT macro. */
+    BINARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        const npy_float in2r = ((npy_float *)ip2)[0];
+        const npy_float in2i = ((npy_float *)ip2)[1];
+        *((npy_bool *)op1) = CGT(in1r,in1i,in2r,in2i);  /* real parts decide; imaginary parts break ties */
+    }
+}
+
+#line 2085
+NPY_NO_EXPORT void
+CFLOAT_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Writes an npy_bool per element: in1 >= in2 as defined by the CGE macro. */
+    BINARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        const npy_float in2r = ((npy_float *)ip2)[0];
+        const npy_float in2i = ((npy_float *)ip2)[1];
+        *((npy_bool *)op1) = CGE(in1r,in1i,in2r,in2i);  /* real parts decide; imaginary parts break ties */
+    }
+}
+
+#line 2085
+NPY_NO_EXPORT void
+CFLOAT_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Writes an npy_bool per element: in1 < in2 as defined by the CLT macro. */
+    BINARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        const npy_float in2r = ((npy_float *)ip2)[0];
+        const npy_float in2i = ((npy_float *)ip2)[1];
+        *((npy_bool *)op1) = CLT(in1r,in1i,in2r,in2i);  /* real parts decide; imaginary parts break ties */
+    }
+}
+
+#line 2085
+NPY_NO_EXPORT void
+CFLOAT_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Writes an npy_bool per element: in1 <= in2 as defined by the CLE macro. */
+    BINARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        const npy_float in2r = ((npy_float *)ip2)[0];
+        const npy_float in2i = ((npy_float *)ip2)[1];
+        *((npy_bool *)op1) = CLE(in1r,in1i,in2r,in2i);  /* real parts decide; imaginary parts break ties */
+    }
+}
+
+#line 2085
+NPY_NO_EXPORT void
+CFLOAT_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Writes an npy_bool per element: true when both real and imaginary parts compare equal (CEQ). */
+    BINARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        const npy_float in2r = ((npy_float *)ip2)[0];
+        const npy_float in2i = ((npy_float *)ip2)[1];
+        *((npy_bool *)op1) = CEQ(in1r,in1i,in2r,in2i);
+    }
+}
+
+#line 2085
+NPY_NO_EXPORT void
+CFLOAT_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Writes an npy_bool per element: true when the real or the imaginary parts differ (CNE). */
+    BINARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        const npy_float in2r = ((npy_float *)ip2)[0];
+        const npy_float in2i = ((npy_float *)ip2)[1];
+        *((npy_bool *)op1) = CNE(in1r,in1i,in2r,in2i);
+    }
+}
+
+
+#line 2103
+NPY_NO_EXPORT void
+CFLOAT_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical AND of two complex floats; a value is truthy when either of its components is nonzero. */
+    BINARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        const npy_float in2r = ((npy_float *)ip2)[0];
+        const npy_float in2i = ((npy_float *)ip2)[1];
+        *((npy_bool *)op1) = (in1r || in1i) && (in2r || in2i);
+    }
+}
+
+#line 2103
+NPY_NO_EXPORT void
+CFLOAT_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical OR of two complex floats; a value is truthy when either of its components is nonzero. */
+    BINARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        const npy_float in2r = ((npy_float *)ip2)[0];
+        const npy_float in2i = ((npy_float *)ip2)[1];
+        *((npy_bool *)op1) = (in1r || in1i) || (in2r || in2i);
+    }
+}
+
+
+NPY_NO_EXPORT void
+CFLOAT_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical XOR of two complex floats: true when exactly one operand is truthy. */
+    BINARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        const npy_float in2r = ((npy_float *)ip2)[0];
+        const npy_float in2i = ((npy_float *)ip2)[1];
+        const npy_bool tmp1 = (in1r || in1i);  /* truthiness of each operand, reduced to 0 or 1 */
+        const npy_bool tmp2 = (in2r || in2i);
+        *((npy_bool *)op1) = tmp1 != tmp2;
+    }
+}
+
+NPY_NO_EXPORT void
+CFLOAT_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical NOT of a complex float: true only when both components are zero. */
+    UNARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        *((npy_bool *)op1) = !(in1r || in1i);
+    }
+}
+
+#line 2145
+NPY_NO_EXPORT void
+CFLOAT_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Writes an npy_bool per element: true when the real or the imaginary part is NaN. */
+    UNARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        *((npy_bool *)op1) = npy_isnan(in1r) || npy_isnan(in1i);
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);  /* presumably discards FP status flags raised by the loop */
+}
+
+#line 2145
+NPY_NO_EXPORT void
+CFLOAT_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Writes an npy_bool per element: true when the real or the imaginary part is infinite. */
+    UNARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        *((npy_bool *)op1) = npy_isinf(in1r) || npy_isinf(in1i);
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);  /* presumably discards FP status flags raised by the loop */
+}
+
+#line 2145
+NPY_NO_EXPORT void
+CFLOAT_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Writes an npy_bool per element: true only when both the real and the imaginary part are finite. */
+    UNARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        *((npy_bool *)op1) = npy_isfinite(in1r) && npy_isfinite(in1i);  /* note &&, unlike isnan/isinf which use || */
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);  /* presumably discards FP status flags raised by the loop */
+}
+
+
+#if !1
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
+NPY_NO_EXPORT void
+CFLOAT_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Complex-float square. Sits inside the enclosing `#if !1`, so the preprocessor drops this definition. */
+    UNARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        ((npy_float *)op1)[0] = in1r*in1r - in1i*in1i;  /* Re = a*a - b*b */
+        ((npy_float *)op1)[1] = in1r*in1i + in1i*in1r;  /* Im = a*b + b*a */
+    }
+}
+#endif
+
+NPY_NO_EXPORT void
+CFLOAT_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Complex-float 1/z; divides by the larger-magnitude component so the ratio r has magnitude <= 1. */
+    UNARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        if (npy_fabsf(in1i) <= npy_fabsf(in1r)) {  /* real part dominates (also taken when both are zero) */
+            const npy_float r = in1i/in1r;
+            const npy_float d = in1r + in1i*r;
+            ((npy_float *)op1)[0] = 1/d;
+            ((npy_float *)op1)[1] = -r/d;
+        } else {  /* imaginary part dominates */
+            const npy_float r = in1r/in1i;
+            const npy_float d = in1r*r + in1i;
+            ((npy_float *)op1)[0] = r/d;
+            ((npy_float *)op1)[1] = -1/d;
+        }
+    }
+}
+
+NPY_NO_EXPORT void
+CFLOAT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Writes the complex value 1+0j to every output element; no input value is read in the loop body. */
+    OUTPUT_LOOP {
+        ((npy_float *)op1)[0] = 1;  /* real part */
+        ((npy_float *)op1)[1] = 0;  /* imaginary part */
+    }
+}
+
+#if !1
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
+NPY_NO_EXPORT void
+CFLOAT_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    UNARY_LOOP {  /* Complex conjugate. Inside the enclosing `#if !1`, so the preprocessor drops this definition. */
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        ((npy_float *)op1)[0] = in1r;  /* real part is copied */
+        ((npy_float *)op1)[1] = -in1i;  /* imaginary part is negated */
+    }
+}
+
+NPY_NO_EXPORT void
+CFLOAT_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Complex magnitude via npy_hypotf. Inside the enclosing `#if !1`, so the preprocessor drops this definition. */
+    UNARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        *((npy_float *)op1) = npy_hypotf(in1r, in1i);  /* output is a single real npy_float, not a complex pair */
+    }
+}
+#endif
+
+NPY_NO_EXPORT void
+CFLOAT__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Argument (phase angle) of each complex float, computed as atan2(imag, real). */
+    UNARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        *((npy_float *)op1) = npy_atan2f(in1i, in1r);  /* output is a single real npy_float */
+    }
+}
+
+NPY_NO_EXPORT void
+CFLOAT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Sign of a complex float by comparison with 0+0j using the CGT/CLT/CEQ macros. */
+    /* Real part is 1, -1 or 0; it is NPY_NANF when none of the comparisons holds (e.g. a NaN component). */
+    UNARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        ((npy_float *)op1)[0] = CGT(in1r, in1i, 0.0, 0.0) ?  1 :
+                            (CLT(in1r, in1i, 0.0, 0.0) ? -1 :
+                            (CEQ(in1r, in1i, 0.0, 0.0) ?  0 : NPY_NANF));
+        ((npy_float *)op1)[1] = 0;  /* imaginary part of the result is always 0 */
+    }
+}
+
+#line 2251
+NPY_NO_EXPORT void
+CFLOAT_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Complex-float maximum under the CGE ordering; an in1 with a NaN component is always kept. */
+    BINARY_LOOP {
+        npy_float in1r = ((npy_float *)ip1)[0];
+        npy_float in1i = ((npy_float *)ip1)[1];
+        const npy_float in2r = ((npy_float *)ip2)[0];
+        const npy_float in2i = ((npy_float *)ip2)[1];
+        if ( !(npy_isnan(in1r) || npy_isnan(in1i) || CGE(in1r, in1i, in2r, in2i))) {  /* in1 has no NaN and is not >= in2 */
+            in1r = in2r;  /* take in2 instead */
+            in1i = in2i;
+        }
+        ((npy_float *)op1)[0] = in1r;
+        ((npy_float *)op1)[1] = in1i;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);  /* presumably discards FP status flags raised by the comparisons */
+}
+
+#line 2251
+NPY_NO_EXPORT void
+CFLOAT_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Complex-float minimum under the CLE ordering; an in1 with a NaN component is always kept. */
+    BINARY_LOOP {
+        npy_float in1r = ((npy_float *)ip1)[0];
+        npy_float in1i = ((npy_float *)ip1)[1];
+        const npy_float in2r = ((npy_float *)ip2)[0];
+        const npy_float in2i = ((npy_float *)ip2)[1];
+        if ( !(npy_isnan(in1r) || npy_isnan(in1i) || CLE(in1r, in1i, in2r, in2i))) {  /* in1 has no NaN and is not <= in2 */
+            in1r = in2r;  /* take in2 instead */
+            in1i = in2i;
+        }
+        ((npy_float *)op1)[0] = in1r;
+        ((npy_float *)op1)[1] = in1i;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);  /* presumably discards FP status flags raised by the comparisons */
+}
+
+
+#line 2274
+NPY_NO_EXPORT void
+CFLOAT_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Complex-float fmax under the CGE ordering; an in2 with a NaN component is never selected. */
+    BINARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        const npy_float in2r = ((npy_float *)ip2)[0];
+        const npy_float in2i = ((npy_float *)ip2)[1];
+        if (npy_isnan(in2r) || npy_isnan(in2i) || CGE(in1r, in1i, in2r, in2i)) {  /* keep in1 */
+            ((npy_float *)op1)[0] = in1r;
+            ((npy_float *)op1)[1] = in1i;
+        }
+        else {  /* in2 has no NaN and in1 is not >= in2 */
+            ((npy_float *)op1)[0] = in2r;
+            ((npy_float *)op1)[1] = in2i;
+        }
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);  /* presumably discards FP status flags raised by the comparisons */
+}
+
+#line 2274
+NPY_NO_EXPORT void
+CFLOAT_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Complex-float fmin under the CLE ordering; an in2 with a NaN component is never selected. */
+    BINARY_LOOP {
+        const npy_float in1r = ((npy_float *)ip1)[0];
+        const npy_float in1i = ((npy_float *)ip1)[1];
+        const npy_float in2r = ((npy_float *)ip2)[0];
+        const npy_float in2i = ((npy_float *)ip2)[1];
+        if (npy_isnan(in2r) || npy_isnan(in2i) || CLE(in1r, in1i, in2r, in2i)) {  /* keep in1 */
+            ((npy_float *)op1)[0] = in1r;
+            ((npy_float *)op1)[1] = in1i;
+        }
+        else {  /* in2 has no NaN and in1 is not <= in2 */
+            ((npy_float *)op1)[0] = in2r;
+            ((npy_float *)op1)[1] = in2i;
+        }
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);  /* presumably discards FP status flags raised by the comparisons */
+}
+
+
+
+#line 1945
+
+#if !1
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
+#line 1954
+NPY_NO_EXPORT void
+CDOUBLE_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Complex-double add loop. Sits inside the enclosing `#if !1`, so the preprocessor drops this definition. */
+    // Parenthesis around 1 tells clang dead code is intentional
+    if (IS_BINARY_REDUCE && (1)) {  /* reduction case: add the summed input into args[0] */
+        npy_intp n = dimensions[0];
+        npy_double * or = ((npy_double *)args[0]);  /* real part of the accumulator */
+        npy_double * oi = ((npy_double *)args[0]) + 1;  /* imaginary part of the accumulator */
+        npy_double rr, ri;
+
+        CDOUBLE_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2);  /* n * 2 doubles, half the byte step */
+        *or += rr;
+        *oi += ri;
+        return;
+    }
+    else {
+        BINARY_LOOP {  /* element-wise case: add real and imaginary parts separately */
+            const npy_double in1r = ((npy_double *)ip1)[0];
+            const npy_double in1i = ((npy_double *)ip1)[1];
+            const npy_double in2r = ((npy_double *)ip2)[0];
+            const npy_double in2i = ((npy_double *)ip2)[1];
+            ((npy_double *)op1)[0] = in1r + in2r;
+            ((npy_double *)op1)[1] = in1i + in2i;
+        }
+    }
+}
+
+NPY_NO_EXPORT int CDOUBLE_add_indexed
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{   /* Indexed in-place complex-double add: base[idx[i]] += value[i]; returns 0. Inside the enclosing `#if !1`. */
+    char *ip1 = args[0];  /* base address of the array updated in place */
+    char *indxp = args[1];  /* cursor over the npy_intp indices */
+    char *value = args[2];  /* cursor over the addends */
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp shape = steps[3];  /* steps[3] is added to negative indices to wrap them */
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    npy_double *indexed;
+    for(i = 0; i < n; i++, indxp += isindex, value += isb) {
+        npy_intp indx = *(npy_intp *)indxp;
+        if (indx < 0) {  /* no further range check is done in this function */
+            indx += shape;
+        }
+        indexed = (npy_double *)(ip1 + is1 * indx);
+        const npy_double b_r = ((npy_double *)value)[0];
+        const npy_double b_i = ((npy_double *)value)[1];
+        indexed[0] += b_r;  /* real part */
+        indexed[1] += b_i;  /* imaginary part */
+    }
+    return 0;
+}
+
+#line 1954
+NPY_NO_EXPORT void
+CDOUBLE_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Complex-double subtract loop. Sits inside the enclosing `#if !1`, so the preprocessor drops this definition. */
+    // Parenthesis around 0 tells clang dead code is intentional
+    if (IS_BINARY_REDUCE && (0)) {  /* the constant 0 means this reduction branch is never taken */
+        npy_intp n = dimensions[0];
+        npy_double * or = ((npy_double *)args[0]);
+        npy_double * oi = ((npy_double *)args[0]) + 1;
+        npy_double rr, ri;
+
+        CDOUBLE_pairwise_sum(&rr, &ri, args[1], n * 2, steps[1] / 2);
+        *or -= rr;
+        *oi -= ri;
+        return;
+    }
+    else {
+        BINARY_LOOP {  /* element-wise case: subtract real and imaginary parts separately */
+            const npy_double in1r = ((npy_double *)ip1)[0];
+            const npy_double in1i = ((npy_double *)ip1)[1];
+            const npy_double in2r = ((npy_double *)ip2)[0];
+            const npy_double in2i = ((npy_double *)ip2)[1];
+            ((npy_double *)op1)[0] = in1r - in2r;
+            ((npy_double *)op1)[1] = in1i - in2i;
+        }
+    }
+}
+
+NPY_NO_EXPORT int CDOUBLE_subtract_indexed
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{   /* Indexed in-place complex-double subtract: base[idx[i]] -= value[i]; returns 0. Inside the enclosing `#if !1`. */
+    char *ip1 = args[0];  /* base address of the array updated in place */
+    char *indxp = args[1];  /* cursor over the npy_intp indices */
+    char *value = args[2];  /* cursor over the subtrahends */
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp shape = steps[3];  /* steps[3] is added to negative indices to wrap them */
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    npy_double *indexed;
+    for(i = 0; i < n; i++, indxp += isindex, value += isb) {
+        npy_intp indx = *(npy_intp *)indxp;
+        if (indx < 0) {  /* no further range check is done in this function */
+            indx += shape;
+        }
+        indexed = (npy_double *)(ip1 + is1 * indx);
+        const npy_double b_r = ((npy_double *)value)[0];
+        const npy_double b_i = ((npy_double *)value)[1];
+        indexed[0] -= b_r;  /* real part */
+        indexed[1] -= b_i;  /* imaginary part */
+    }
+    return 0;
+}
+
+
+NPY_NO_EXPORT void
+CDOUBLE_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Complex-double multiply loop. Sits inside the enclosing `#if !1`, so the preprocessor drops this definition. */
+    BINARY_LOOP {
+        const npy_double in1r = ((npy_double *)ip1)[0];
+        const npy_double in1i = ((npy_double *)ip1)[1];
+        const npy_double in2r = ((npy_double *)ip2)[0];
+        const npy_double in2i = ((npy_double *)ip2)[1];
+        ((npy_double *)op1)[0] = in1r*in2r - in1i*in2i;  /* Re = ac - bd */
+        ((npy_double *)op1)[1] = in1r*in2i + in1i*in2r;  /* Im = ad + bc */
+    }
+}
+
+NPY_NO_EXPORT int CDOUBLE_multiply_indexed
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{   /* Indexed in-place complex-double multiply: base[idx[i]] *= value[i]; returns 0. Inside the enclosing `#if !1`. */
+    char *ip1 = args[0];  /* base address of the array updated in place */
+    char *indxp = args[1];  /* cursor over the npy_intp indices */
+    char *value = args[2];  /* cursor over the multipliers */
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp shape = steps[3];  /* steps[3] is added to negative indices to wrap them */
+    npy_intp n = dimensions[0];
+    npy_intp i;
+    npy_double *indexed;
+    for(i = 0; i < n; i++, indxp += isindex, value += isb) {
+        npy_intp indx = *(npy_intp *)indxp;
+        if (indx < 0) {  /* no further range check is done in this function */
+            indx += shape;
+        }
+        indexed = (npy_double *)(ip1 + is1 * indx);
+        const npy_double a_r = indexed[0];  /* copy the old value first: both outputs need both old parts */
+        const npy_double a_i = indexed[1];
+        const npy_double b_r = ((npy_double *)value)[0];
+        const npy_double b_i = ((npy_double *)value)[1];
+        indexed[0] = a_r*b_r - a_i*b_i;
+        indexed[1] = a_r*b_i + a_i*b_r;
+    }
+    return 0;
+}
+#endif // !SIMD
+
+NPY_NO_EXPORT void
+CDOUBLE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Complex-double division; scales by the larger-magnitude divisor component so |rat| <= 1. */
+    BINARY_LOOP {
+        const npy_double in1r = ((npy_double *)ip1)[0];
+        const npy_double in1i = ((npy_double *)ip1)[1];
+        const npy_double in2r = ((npy_double *)ip2)[0];
+        const npy_double in2i = ((npy_double *)ip2)[1];
+        const npy_double in2r_abs = npy_fabs(in2r);
+        const npy_double in2i_abs = npy_fabs(in2i);
+        if (in2r_abs >= in2i_abs) {  /* real part of the divisor dominates (or both are zero) */
+            if (in2r_abs == 0 && in2i_abs == 0) {
+                /* divide by zero should yield a complex inf or nan */
+                ((npy_double *)op1)[0] = in1r/in2r_abs;  /* divides by the absolute value, i.e. by +0 */
+                ((npy_double *)op1)[1] = in1i/in2i_abs;
+            }
+            else {
+                const npy_double rat = in2i/in2r;
+                const npy_double scl = 1.0/(in2r + in2i*rat);
+                ((npy_double *)op1)[0] = (in1r + in1i*rat)*scl;
+                ((npy_double *)op1)[1] = (in1i - in1r*rat)*scl;
+            }
+        }
+        else {  /* imaginary part of the divisor dominates */
+            const npy_double rat = in2r/in2i;
+            const npy_double scl = 1.0/(in2i + in2r*rat);
+            ((npy_double *)op1)[0] = (in1r*rat + in1i)*scl;
+            ((npy_double *)op1)[1] = (in1i*rat - in1r)*scl;
+        }
+    }
+}
+
+
+#line 2085
+/* Element-wise complex `>`: stores CGT(in1, in2) for each pair as an npy_bool. */
+NPY_NO_EXPORT void
+CDOUBLE_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double *lhs = (const npy_double *)ip1;
+        const npy_double *rhs = (const npy_double *)ip2;
+        const npy_double a_re = lhs[0], a_im = lhs[1], b_re = rhs[0], b_im = rhs[1];
+        *((npy_bool *)op1) = CGT(a_re, a_im, b_re, b_im);
+    }
+}
+
+#line 2085
+/* Element-wise complex `>=`: stores CGE(in1, in2) for each pair as an npy_bool. */
+NPY_NO_EXPORT void
+CDOUBLE_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double *lhs = (const npy_double *)ip1;
+        const npy_double *rhs = (const npy_double *)ip2;
+        const npy_double a_re = lhs[0], a_im = lhs[1], b_re = rhs[0], b_im = rhs[1];
+        *((npy_bool *)op1) = CGE(a_re, a_im, b_re, b_im);
+    }
+}
+
+#line 2085
+/* Element-wise complex `<`: stores CLT(in1, in2) for each pair as an npy_bool. */
+NPY_NO_EXPORT void
+CDOUBLE_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double *lhs = (const npy_double *)ip1;
+        const npy_double *rhs = (const npy_double *)ip2;
+        const npy_double a_re = lhs[0], a_im = lhs[1], b_re = rhs[0], b_im = rhs[1];
+        *((npy_bool *)op1) = CLT(a_re, a_im, b_re, b_im);
+    }
+}
+
+#line 2085
+/* Element-wise complex `<=`: stores CLE(in1, in2) for each pair as an npy_bool. */
+NPY_NO_EXPORT void
+CDOUBLE_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double *lhs = (const npy_double *)ip1;
+        const npy_double *rhs = (const npy_double *)ip2;
+        const npy_double a_re = lhs[0], a_im = lhs[1], b_re = rhs[0], b_im = rhs[1];
+        *((npy_bool *)op1) = CLE(a_re, a_im, b_re, b_im);
+    }
+}
+
+#line 2085
+/* Element-wise complex `==`: stores CEQ(in1, in2) for each pair as an npy_bool. */
+NPY_NO_EXPORT void
+CDOUBLE_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double *lhs = (const npy_double *)ip1;
+        const npy_double *rhs = (const npy_double *)ip2;
+        const npy_double a_re = lhs[0], a_im = lhs[1], b_re = rhs[0], b_im = rhs[1];
+        *((npy_bool *)op1) = CEQ(a_re, a_im, b_re, b_im);
+    }
+}
+
+#line 2085
+/* Element-wise complex `!=`: stores CNE(in1, in2) for each pair as an npy_bool. */
+NPY_NO_EXPORT void
+CDOUBLE_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double *lhs = (const npy_double *)ip1;
+        const npy_double *rhs = (const npy_double *)ip2;
+        const npy_double a_re = lhs[0], a_im = lhs[1], b_re = rhs[0], b_im = rhs[1];
+        *((npy_bool *)op1) = CNE(a_re, a_im, b_re, b_im);
+    }
+}
+
+
+#line 2103
+/* Element-wise logical AND; a complex operand counts as true when either part is nonzero. */
+NPY_NO_EXPORT void
+CDOUBLE_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double *lhs = (const npy_double *)ip1, *rhs = (const npy_double *)ip2;
+        const npy_bool lhs_true = (lhs[0] || lhs[1]);
+        const npy_bool rhs_true = (rhs[0] || rhs[1]);
+        *((npy_bool *)op1) = lhs_true && rhs_true;
+    }
+}
+
+#line 2103
+/* Element-wise logical OR; a complex operand counts as true when either part is nonzero. */
+NPY_NO_EXPORT void
+CDOUBLE_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double *lhs = (const npy_double *)ip1, *rhs = (const npy_double *)ip2;
+        const npy_bool lhs_true = (lhs[0] || lhs[1]);
+        const npy_bool rhs_true = (rhs[0] || rhs[1]);
+        *((npy_bool *)op1) = lhs_true || rhs_true;
+    }
+}
+
+
+/* Element-wise logical XOR of the truth values (either part nonzero) of two complex operands. */
+NPY_NO_EXPORT void
+CDOUBLE_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double *lhs = (const npy_double *)ip1;
+        const npy_double *rhs = (const npy_double *)ip2;
+        const npy_bool lhs_true = (lhs[0] || lhs[1]);
+        const npy_bool rhs_true = (rhs[0] || rhs[1]);
+        /* Both flags are exactly 0 or 1, so inequality acts as XOR. */
+        *((npy_bool *)op1) = (lhs_true != rhs_true);
+    }
+}
+
+/* Element-wise logical NOT: true only when neither part of the operand is nonzero. */
+NPY_NO_EXPORT void
+CDOUBLE_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_double *z = (const npy_double *)ip1;
+        *((npy_bool *)op1) = !(z[0] || z[1]);
+    }
+}
+
+#line 2145
+/* Element-wise isnan: true when either part is NaN; npy_clear_floatstatus_barrier is called after the loop. */
+NPY_NO_EXPORT void
+CDOUBLE_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_double re = ((npy_double *)ip1)[0], im = ((npy_double *)ip1)[1];
+        *((npy_bool *)op1) = npy_isnan(re) || npy_isnan(im);
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 2145
+/* Element-wise isinf: true when either part is infinite; npy_clear_floatstatus_barrier is called after the loop. */
+NPY_NO_EXPORT void
+CDOUBLE_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_double re = ((npy_double *)ip1)[0], im = ((npy_double *)ip1)[1];
+        *((npy_bool *)op1) = npy_isinf(re) || npy_isinf(im);
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 2145
+/* Element-wise isfinite: true only when both parts are finite; npy_clear_floatstatus_barrier is called after the loop. */
+NPY_NO_EXPORT void
+CDOUBLE_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_double re = ((npy_double *)ip1)[0], im = ((npy_double *)ip1)[1];
+        *((npy_bool *)op1) = npy_isfinite(re) && npy_isfinite(im);
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+
+#if !1
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
+/* Element-wise complex square: (re*re - im*im) + i*(re*im + im*re). Sits inside an `#if !1` region here. */
+NPY_NO_EXPORT void
+CDOUBLE_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP {
+        const npy_double re = ((npy_double *)ip1)[0], im = ((npy_double *)ip1)[1];
+        ((npy_double *)op1)[0] = re*re - im*im;
+        ((npy_double *)op1)[1] = re*im + im*re;
+    }
+}
+#endif
+
+/* Element-wise complex reciprocal 1/z. The |im| <= |re| test selects which part is the divisor used to form the ratio. */
+NPY_NO_EXPORT void
+CDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP {
+        const npy_double re = ((npy_double *)ip1)[0], im = ((npy_double *)ip1)[1];
+        if (npy_fabs(im) <= npy_fabs(re)) {
+            const npy_double ratio = im/re;
+            const npy_double denom = re + im*ratio;
+            ((npy_double *)op1)[0] = 1/denom;
+            ((npy_double *)op1)[1] = -ratio/denom;
+        } else {
+            const npy_double ratio = re/im;
+            const npy_double denom = re*ratio + im;
+            ((npy_double *)op1)[0] = ratio/denom;
+            ((npy_double *)op1)[1] = -1/denom;
+        }
+    }
+}
+
+NPY_NO_EXPORT void
+CDOUBLE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    OUTPUT_LOOP {  /* store the complex constant 1 + 0i in each output element */
+        *((npy_double *)op1) = 1;
+        *((npy_double *)op1 + 1) = 0;
+    }
+}
+
+#if !1
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
+/* Element-wise complex conjugate: copies the real part and negates the imaginary part. Sits inside an `#if !1` region here. */
+NPY_NO_EXPORT void
+CDOUBLE_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    UNARY_LOOP {
+        const npy_double re = ((npy_double *)ip1)[0], im = ((npy_double *)ip1)[1];
+        ((npy_double *)op1)[0] = re;
+        ((npy_double *)op1)[1] = -im;
+    }
+}
+
+/* Element-wise complex magnitude: writes npy_hypot(re, im) as a single real output value. */
+NPY_NO_EXPORT void
+CDOUBLE_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_double *z = (const npy_double *)ip1;
+        *((npy_double *)op1) = npy_hypot(z[0], z[1]);
+    }
+}
+#endif
+
+/* Element-wise complex argument: writes npy_atan2(im, re) as a single real output value. */
+NPY_NO_EXPORT void
+CDOUBLE__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_double *z = (const npy_double *)ip1;
+        *((npy_double *)op1) = npy_atan2(z[1], z[0]);
+    }
+}
+
+NPY_NO_EXPORT void
+CDOUBLE_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Real output is 1 if CGT(z, 0), -1 if CLT(z, 0), 0 if CEQ(z, 0), else NPY_NAN; imaginary output is 0. NOTE(review): the old "sign of nan is 0" fixme looks stale - confirm. */
+    UNARY_LOOP {
+        const npy_double re = ((npy_double *)ip1)[0], im = ((npy_double *)ip1)[1];
+        npy_double *out = (npy_double *)op1;
+        out[1] = 0;
+        if (CGT(re, im, 0.0, 0.0)) { out[0] = 1; }
+        else if (CLT(re, im, 0.0, 0.0)) { out[0] = -1; }
+        else { out[0] = CEQ(re, im, 0.0, 0.0) ? 0 : NPY_NAN; }
+    }
+}
+
+#line 2251
+/* Element-wise complex maximum: in1 is kept when it has a NaN part or CGE(in1, in2) holds; otherwise in2 is taken. */
+NPY_NO_EXPORT void
+CDOUBLE_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double *lhs = (const npy_double *)ip1;
+        const npy_double *rhs = (const npy_double *)ip2;
+        const npy_double a_re = lhs[0], a_im = lhs[1];
+        const npy_double b_re = rhs[0], b_im = rhs[1];
+        /* All four parts are loaded before any store. */
+        const int keep_first = npy_isnan(a_re) || npy_isnan(a_im) || CGE(a_re, a_im, b_re, b_im);
+        npy_double *out = (npy_double *)op1;
+        out[0] = keep_first ? a_re : b_re;
+        out[1] = keep_first ? a_im : b_im;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 2251
+/* Element-wise complex minimum: in1 is kept when it has a NaN part or CLE(in1, in2) holds; otherwise in2 is taken. */
+NPY_NO_EXPORT void
+CDOUBLE_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double *lhs = (const npy_double *)ip1;
+        const npy_double *rhs = (const npy_double *)ip2;
+        const npy_double a_re = lhs[0], a_im = lhs[1];
+        const npy_double b_re = rhs[0], b_im = rhs[1];
+        /* All four parts are loaded before any store. */
+        const int keep_first = npy_isnan(a_re) || npy_isnan(a_im) || CLE(a_re, a_im, b_re, b_im);
+        npy_double *out = (npy_double *)op1;
+        out[0] = keep_first ? a_re : b_re;
+        out[1] = keep_first ? a_im : b_im;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+
+#line 2274
+/*
+ * Element-wise complex fmax: in1 is kept when in2 has a NaN part or CGE(in1, in2) holds;
+ * otherwise in2 is taken. */
+NPY_NO_EXPORT void
+CDOUBLE_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double *lhs = (const npy_double *)ip1;
+        const npy_double *rhs = (const npy_double *)ip2;
+        const npy_double a_re = lhs[0], a_im = lhs[1];
+        const npy_double b_re = rhs[0], b_im = rhs[1];
+        /* All four parts are loaded before any store. */
+        const int keep_first = npy_isnan(b_re) || npy_isnan(b_im) || CGE(a_re, a_im, b_re, b_im);
+        npy_double *out = (npy_double *)op1;
+        out[0] = keep_first ? a_re : b_re;
+        out[1] = keep_first ? a_im : b_im;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 2274
+/*
+ * Element-wise complex fmin: in1 is kept when in2 has a NaN part or CLE(in1, in2) holds;
+ * otherwise in2 is taken. */
+NPY_NO_EXPORT void
+CDOUBLE_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_double *lhs = (const npy_double *)ip1;
+        const npy_double *rhs = (const npy_double *)ip2;
+        const npy_double a_re = lhs[0], a_im = lhs[1];
+        const npy_double b_re = rhs[0], b_im = rhs[1];
+        /* All four parts are loaded before any store. */
+        const int keep_first = npy_isnan(b_re) || npy_isnan(b_im) || CLE(a_re, a_im, b_re, b_im);
+        npy_double *out = (npy_double *)op1;
+        out[0] = keep_first ? a_re : b_re;
+        out[1] = keep_first ? a_im : b_im;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+
+
+#line 1945
+
+#if !0
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
+#line 1954
+/* Element-wise complex long double addition. When IS_BINARY_REDUCE holds, the second operand is summed
+ * with CLONGDOUBLE_pairwise_sum and the total is added to the value at args[0] instead. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    // Parenthesis around 1 tells clang dead code is intentional
+    if (IS_BINARY_REDUCE && (1)) {
+        npy_longdouble *acc = (npy_longdouble *)args[0];
+        npy_longdouble sum_re, sum_im;
+
+        /* Count and stride are passed as dimensions[0] * 2 and steps[1] / 2. */
+        CLONGDOUBLE_pairwise_sum(&sum_re, &sum_im, args[1], dimensions[0] * 2, steps[1] / 2);
+        acc[0] += sum_re;
+        acc[1] += sum_im;
+        return;
+    }
+    else {
+        BINARY_LOOP {
+            const npy_longdouble a_re = ((npy_longdouble *)ip1)[0], a_im = ((npy_longdouble *)ip1)[1];
+            const npy_longdouble b_re = ((npy_longdouble *)ip2)[0], b_im = ((npy_longdouble *)ip2)[1];
+            npy_longdouble *out = (npy_longdouble *)op1;
+            out[0] = a_re + b_re;
+            out[1] = a_im + b_im;
+        }
+    }
+}
+
+/* In-place indexed complex add: for each of dimensions[0] (index, value) pairs, the value is added to the element of args[0] at that index. */
+NPY_NO_EXPORT int CLONGDOUBLE_add_indexed
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *base = args[0];
+    char *idx_ptr = args[1];
+    char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    const npy_intp count = dimensions[0];
+    npy_intp k = 0;
+    while (k < count) {
+        npy_intp pos = *(npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + wrap : pos;
+        npy_longdouble *target = (npy_longdouble *)(base + base_stride * pos);
+        const npy_longdouble b_re = ((npy_longdouble *)val_ptr)[0], b_im = ((npy_longdouble *)val_ptr)[1];
+        target[0] += b_re;
+        target[1] += b_im;
+        k++;
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#line 1954
+/* Element-wise complex long double subtraction. The pairwise-sum reduce shortcut is present but
+ * switched off by the constant (0) in its condition, so the element-wise loop always runs. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    // Parenthesis around 0 tells clang dead code is intentional
+    if (IS_BINARY_REDUCE && (0)) {
+        npy_longdouble *acc = (npy_longdouble *)args[0];
+        npy_longdouble sum_re, sum_im;
+
+        /* Count and stride are passed as dimensions[0] * 2 and steps[1] / 2. */
+        CLONGDOUBLE_pairwise_sum(&sum_re, &sum_im, args[1], dimensions[0] * 2, steps[1] / 2);
+        acc[0] -= sum_re;
+        acc[1] -= sum_im;
+        return;
+    }
+    else {
+        BINARY_LOOP {
+            const npy_longdouble a_re = ((npy_longdouble *)ip1)[0], a_im = ((npy_longdouble *)ip1)[1];
+            const npy_longdouble b_re = ((npy_longdouble *)ip2)[0], b_im = ((npy_longdouble *)ip2)[1];
+            npy_longdouble *out = (npy_longdouble *)op1;
+            out[0] = a_re - b_re;
+            out[1] = a_im - b_im;
+        }
+    }
+}
+
+/* In-place indexed complex subtract: for each of dimensions[0] (index, value) pairs, the value is subtracted from the element of args[0] at that index. */
+NPY_NO_EXPORT int CLONGDOUBLE_subtract_indexed
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *base = args[0];
+    char *idx_ptr = args[1];
+    char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    const npy_intp count = dimensions[0];
+    npy_intp k = 0;
+    while (k < count) {
+        npy_intp pos = *(npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + wrap : pos;
+        npy_longdouble *target = (npy_longdouble *)(base + base_stride * pos);
+        const npy_longdouble b_re = ((npy_longdouble *)val_ptr)[0], b_im = ((npy_longdouble *)val_ptr)[1];
+        target[0] -= b_re;
+        target[1] -= b_im;
+        k++;
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+
+/* Element-wise complex long double product: (a_re*b_re - a_im*b_im) + i*(a_re*b_im + a_im*b_re). */
+NPY_NO_EXPORT void
+CLONGDOUBLE_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble a_re = ((npy_longdouble *)ip1)[0], a_im = ((npy_longdouble *)ip1)[1];
+        const npy_longdouble b_re = ((npy_longdouble *)ip2)[0], b_im = ((npy_longdouble *)ip2)[1];
+        npy_longdouble *out = (npy_longdouble *)op1;
+        out[0] = a_re*b_re - a_im*b_im;
+        out[1] = a_re*b_im + a_im*b_re;
+    }
+}
+
+/* In-place indexed complex multiply: for each of dimensions[0] (index, value) pairs, the element of args[0] at that index is multiplied by the value. */
+NPY_NO_EXPORT int CLONGDOUBLE_multiply_indexed
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *base = args[0];
+    char *idx_ptr = args[1];
+    char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    const npy_intp count = dimensions[0];
+    npy_intp k = 0;
+    while (k < count) {
+        npy_intp pos = *(npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + wrap : pos;
+        npy_longdouble *target = (npy_longdouble *)(base + base_stride * pos);
+        /* Load both operands before storing: each output part needs both old parts. */
+        const npy_longdouble a_re = target[0], a_im = target[1];
+        const npy_longdouble b_re = ((npy_longdouble *)val_ptr)[0], b_im = ((npy_longdouble *)val_ptr)[1];
+        target[0] = a_re*b_re - a_im*b_im;
+        target[1] = a_re*b_im + a_im*b_re;
+        k++;
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+#endif // !SIMD
+
+/* Element-wise complex division in1 / in2. The |re| >= |im| test on the denominator selects which part is the
+ * divisor used to form the ratio; an all-zero denominator is divided through directly. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble num_re = ((npy_longdouble *)ip1)[0], num_im = ((npy_longdouble *)ip1)[1];
+        const npy_longdouble den_re = ((npy_longdouble *)ip2)[0], den_im = ((npy_longdouble *)ip2)[1];
+        const npy_longdouble den_re_abs = npy_fabsl(den_re);
+        const npy_longdouble den_im_abs = npy_fabsl(den_im);
+        npy_longdouble *out = (npy_longdouble *)op1;
+        /* Written as !(>=) rather than (<) so a NaN comparison still lands in this branch. */
+        if (!(den_re_abs >= den_im_abs)) {
+            const npy_longdouble rat = den_re/den_im;
+            const npy_longdouble scl = 1.0l/(den_im + den_re*rat);
+            out[0] = (num_re*rat + num_im)*scl;
+            out[1] = (num_im*rat - num_re)*scl;
+        }
+        else if (den_re_abs == 0 && den_im_abs == 0) {
+            /* divide by zero should yield a complex inf or nan */
+            out[0] = num_re/den_re_abs;
+            out[1] = num_im/den_im_abs;
+        }
+        else {
+            const npy_longdouble rat = den_im/den_re;
+            const npy_longdouble scl = 1.0l/(den_re + den_im*rat);
+            out[0] = (num_re + num_im*rat)*scl;
+            out[1] = (num_im - num_re*rat)*scl;
+        }
+    }
+}
+
+
+#line 2085
+/* Element-wise complex `>`: stores CGT(in1, in2) for each pair as an npy_bool. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble *lhs = (const npy_longdouble *)ip1;
+        const npy_longdouble *rhs = (const npy_longdouble *)ip2;
+        const npy_longdouble a_re = lhs[0], a_im = lhs[1], b_re = rhs[0], b_im = rhs[1];
+        *((npy_bool *)op1) = CGT(a_re, a_im, b_re, b_im);
+    }
+}
+
+#line 2085
+/* Element-wise complex `>=`: stores CGE(in1, in2) for each pair as an npy_bool. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble *lhs = (const npy_longdouble *)ip1;
+        const npy_longdouble *rhs = (const npy_longdouble *)ip2;
+        const npy_longdouble a_re = lhs[0], a_im = lhs[1], b_re = rhs[0], b_im = rhs[1];
+        *((npy_bool *)op1) = CGE(a_re, a_im, b_re, b_im);
+    }
+}
+
+#line 2085
+/* Element-wise complex `<`: stores CLT(in1, in2) for each pair as an npy_bool. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble *lhs = (const npy_longdouble *)ip1;
+        const npy_longdouble *rhs = (const npy_longdouble *)ip2;
+        const npy_longdouble a_re = lhs[0], a_im = lhs[1], b_re = rhs[0], b_im = rhs[1];
+        *((npy_bool *)op1) = CLT(a_re, a_im, b_re, b_im);
+    }
+}
+
+#line 2085
+/* Element-wise complex `<=`: stores CLE(in1, in2) for each pair as an npy_bool. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble *lhs = (const npy_longdouble *)ip1;
+        const npy_longdouble *rhs = (const npy_longdouble *)ip2;
+        const npy_longdouble a_re = lhs[0], a_im = lhs[1], b_re = rhs[0], b_im = rhs[1];
+        *((npy_bool *)op1) = CLE(a_re, a_im, b_re, b_im);
+    }
+}
+
+#line 2085
+/* Element-wise complex `==`: stores CEQ(in1, in2) for each pair as an npy_bool. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble *lhs = (const npy_longdouble *)ip1;
+        const npy_longdouble *rhs = (const npy_longdouble *)ip2;
+        const npy_longdouble a_re = lhs[0], a_im = lhs[1], b_re = rhs[0], b_im = rhs[1];
+        *((npy_bool *)op1) = CEQ(a_re, a_im, b_re, b_im);
+    }
+}
+
+#line 2085
+/* Element-wise complex `!=`: stores CNE(in1, in2) for each pair as an npy_bool. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble *lhs = (const npy_longdouble *)ip1;
+        const npy_longdouble *rhs = (const npy_longdouble *)ip2;
+        const npy_longdouble a_re = lhs[0], a_im = lhs[1], b_re = rhs[0], b_im = rhs[1];
+        *((npy_bool *)op1) = CNE(a_re, a_im, b_re, b_im);
+    }
+}
+
+
+#line 2103
+/* Element-wise logical AND; a complex operand counts as true when either part is nonzero. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble *lhs = (const npy_longdouble *)ip1, *rhs = (const npy_longdouble *)ip2;
+        const npy_bool lhs_true = (lhs[0] || lhs[1]);
+        const npy_bool rhs_true = (rhs[0] || rhs[1]);
+        *((npy_bool *)op1) = lhs_true && rhs_true;
+    }
+}
+
+#line 2103
+/* Element-wise logical OR; a complex operand counts as true when either part is nonzero. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble *lhs = (const npy_longdouble *)ip1, *rhs = (const npy_longdouble *)ip2;
+        const npy_bool lhs_true = (lhs[0] || lhs[1]);
+        const npy_bool rhs_true = (rhs[0] || rhs[1]);
+        *((npy_bool *)op1) = lhs_true || rhs_true;
+    }
+}
+
+
+/* Element-wise logical XOR of the truth values (either part nonzero) of two complex operands. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble *lhs = (const npy_longdouble *)ip1;
+        const npy_longdouble *rhs = (const npy_longdouble *)ip2;
+        const npy_bool lhs_true = (lhs[0] || lhs[1]);
+        const npy_bool rhs_true = (rhs[0] || rhs[1]);
+        /* Both flags are exactly 0 or 1, so inequality acts as XOR. */
+        *((npy_bool *)op1) = (lhs_true != rhs_true);
+    }
+}
+
+/* Element-wise logical NOT: true only when neither part of the operand is nonzero. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_longdouble *z = (const npy_longdouble *)ip1;
+        *((npy_bool *)op1) = !(z[0] || z[1]);
+    }
+}
+
+#line 2145
+/* Element-wise isnan: true when either part is NaN; npy_clear_floatstatus_barrier is called after the loop. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_longdouble re = ((npy_longdouble *)ip1)[0], im = ((npy_longdouble *)ip1)[1];
+        *((npy_bool *)op1) = npy_isnan(re) || npy_isnan(im);
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 2145
+/* Element-wise isinf: true when either part is infinite; npy_clear_floatstatus_barrier is called after the loop. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_longdouble re = ((npy_longdouble *)ip1)[0], im = ((npy_longdouble *)ip1)[1];
+        *((npy_bool *)op1) = npy_isinf(re) || npy_isinf(im);
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 2145
+/* Element-wise isfinite: true only when both parts are finite; npy_clear_floatstatus_barrier is called after the loop. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_longdouble re = ((npy_longdouble *)ip1)[0], im = ((npy_longdouble *)ip1)[1];
+        *((npy_bool *)op1) = npy_isfinite(re) && npy_isfinite(im);
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+
+#if !0
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
+/* Element-wise complex square: (re*re - im*im) + i*(re*im + im*re). */
+NPY_NO_EXPORT void
+CLONGDOUBLE_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP {
+        const npy_longdouble re = ((npy_longdouble *)ip1)[0], im = ((npy_longdouble *)ip1)[1];
+        ((npy_longdouble *)op1)[0] = re*re - im*im;
+        ((npy_longdouble *)op1)[1] = re*im + im*re;
+    }
+}
+#endif
+
+/* Element-wise complex reciprocal 1/z. The |im| <= |re| test selects which part is the divisor used to form the ratio. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP {
+        const npy_longdouble re = ((npy_longdouble *)ip1)[0], im = ((npy_longdouble *)ip1)[1];
+        if (npy_fabsl(im) <= npy_fabsl(re)) {
+            const npy_longdouble ratio = im/re;
+            const npy_longdouble denom = re + im*ratio;
+            ((npy_longdouble *)op1)[0] = 1/denom;
+            ((npy_longdouble *)op1)[1] = -ratio/denom;
+        } else {
+            const npy_longdouble ratio = re/im;
+            const npy_longdouble denom = re*ratio + im;
+            ((npy_longdouble *)op1)[0] = ratio/denom;
+            ((npy_longdouble *)op1)[1] = -1/denom;
+        }
+    }
+}
+
+NPY_NO_EXPORT void
+CLONGDOUBLE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    OUTPUT_LOOP {  /* store the complex constant 1 + 0i in each output element */
+        *((npy_longdouble *)op1) = 1;
+        *((npy_longdouble *)op1 + 1) = 0;
+    }
+}
+
+#if !0
+// CFLOAT & CDOUBLE defined by 'loops_arithm_fp.dispatch.c.src'
+/* Element-wise complex conjugate: copies the real part and negates the imaginary part. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    UNARY_LOOP {
+        const npy_longdouble re = ((npy_longdouble *)ip1)[0], im = ((npy_longdouble *)ip1)[1];
+        ((npy_longdouble *)op1)[0] = re;
+        ((npy_longdouble *)op1)[1] = -im;
+    }
+}
+
+/* Element-wise complex magnitude: writes npy_hypotl(re, im) as a single real output value. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_longdouble *z = (const npy_longdouble *)ip1;
+        *((npy_longdouble *)op1) = npy_hypotl(z[0], z[1]);
+    }
+}
+#endif
+
+/* Element-wise complex argument: writes npy_atan2l(im, re) as a single real output value. */
+NPY_NO_EXPORT void
+CLONGDOUBLE__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP {
+        const npy_longdouble *z = (const npy_longdouble *)ip1;
+        *((npy_longdouble *)op1) = npy_atan2l(z[1], z[0]);
+    }
+}
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Real output is 1 if CGT(z, 0), -1 if CLT(z, 0), 0 if CEQ(z, 0), else NPY_NANL; imaginary output is 0. NOTE(review): the old "sign of nan is 0" fixme looks stale - confirm. */
+    UNARY_LOOP {
+        const npy_longdouble re = ((npy_longdouble *)ip1)[0], im = ((npy_longdouble *)ip1)[1];
+        npy_longdouble *out = (npy_longdouble *)op1;
+        out[1] = 0;
+        if (CGT(re, im, 0.0, 0.0)) { out[0] = 1; }
+        else if (CLT(re, im, 0.0, 0.0)) { out[0] = -1; }
+        else { out[0] = CEQ(re, im, 0.0, 0.0) ? 0 : NPY_NANL; }
+    }
+}
+
+#line 2251
+/* Element-wise complex maximum: in1 is kept when it has a NaN part or CGE(in1, in2) holds; otherwise in2 is taken. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble *lhs = (const npy_longdouble *)ip1;
+        const npy_longdouble *rhs = (const npy_longdouble *)ip2;
+        const npy_longdouble a_re = lhs[0], a_im = lhs[1];
+        const npy_longdouble b_re = rhs[0], b_im = rhs[1];
+        /* All four parts are loaded before any store. */
+        const int keep_first = npy_isnan(a_re) || npy_isnan(a_im) || CGE(a_re, a_im, b_re, b_im);
+        npy_longdouble *out = (npy_longdouble *)op1;
+        out[0] = keep_first ? a_re : b_re;
+        out[1] = keep_first ? a_im : b_im;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 2251
+/* Element-wise complex minimum: in1 is kept when it has a NaN part or CLE(in1, in2) holds; otherwise in2 is taken. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble *lhs = (const npy_longdouble *)ip1;
+        const npy_longdouble *rhs = (const npy_longdouble *)ip2;
+        const npy_longdouble a_re = lhs[0], a_im = lhs[1];
+        const npy_longdouble b_re = rhs[0], b_im = rhs[1];
+        /* All four parts are loaded before any store. */
+        const int keep_first = npy_isnan(a_re) || npy_isnan(a_im) || CLE(a_re, a_im, b_re, b_im);
+        npy_longdouble *out = (npy_longdouble *)op1;
+        out[0] = keep_first ? a_re : b_re;
+        out[1] = keep_first ? a_im : b_im;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+
+#line 2274
+/*
+ * Element-wise complex fmax: in1 is kept when in2 has a NaN part or CGE(in1, in2) holds;
+ * otherwise in2 is taken. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble *lhs = (const npy_longdouble *)ip1;
+        const npy_longdouble *rhs = (const npy_longdouble *)ip2;
+        const npy_longdouble a_re = lhs[0], a_im = lhs[1];
+        const npy_longdouble b_re = rhs[0], b_im = rhs[1];
+        /* All four parts are loaded before any store. */
+        const int keep_first = npy_isnan(b_re) || npy_isnan(b_im) || CGE(a_re, a_im, b_re, b_im);
+        npy_longdouble *out = (npy_longdouble *)op1;
+        out[0] = keep_first ? a_re : b_re;
+        out[1] = keep_first ? a_im : b_im;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+#line 2274
+/*
+ * Element-wise complex fmin: in1 is kept when in2 has a NaN part or CLE(in1, in2) holds;
+ * otherwise in2 is taken. */
+NPY_NO_EXPORT void
+CLONGDOUBLE_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP {
+        const npy_longdouble *lhs = (const npy_longdouble *)ip1;
+        const npy_longdouble *rhs = (const npy_longdouble *)ip2;
+        const npy_longdouble a_re = lhs[0], a_im = lhs[1];
+        const npy_longdouble b_re = rhs[0], b_im = rhs[1];
+        /* All four parts are loaded before any store. */
+        const int keep_first = npy_isnan(b_re) || npy_isnan(b_im) || CLE(a_re, a_im, b_re, b_im);
+        npy_longdouble *out = (npy_longdouble *)op1;
+        out[0] = keep_first ? a_re : b_re;
+        out[1] = keep_first ? a_im : b_im;
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+
+
+
+#undef CGE
+#undef CLE
+#undef CGT
+#undef CLT
+#undef CEQ
+#undef CNE
+
+/*
+ *****************************************************************************
+ **                            OBJECT LOOPS                                 **
+ *****************************************************************************
+ */
+
+#line 2315
+
+#line 2320
+/* Element-wise object `==` producing npy_bool: stores the truth value of PyObject_RichCompare(in1, in2, Py_EQ); returns early if either call fails. */
+NPY_NO_EXPORT void
+OBJECT_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    BINARY_LOOP {
+        PyObject *lhs = *(PyObject **)ip1;
+        PyObject *rhs = *(PyObject **)ip2;
+        PyObject *cmp;
+        /* NULL slots are compared as None. */
+        if (lhs == NULL) { lhs = Py_None; }
+        if (rhs == NULL) { rhs = Py_None; }
+        /*
+         * RichCompareBool is deliberately avoided: it includes an identity check
+         * for == and !=, which is wrong element-wise because NaN would then equal
+         * NaN and an array would equal itself.
+         */
+        cmp = PyObject_RichCompare(lhs, rhs, Py_EQ);
+        if (cmp == NULL) {
+            return;
+        }
+#if 1
+        {
+            const int truth = PyObject_IsTrue(cmp);
+            Py_DECREF(cmp);
+            if (truth == -1) {
+                return;
+            }
+            *((npy_bool *)op1) = (npy_bool)truth;
+        }
+#else
+        *((PyObject **)op1) = cmp;
+#endif
+    }
+}
+/* OBJECT_OO_O_equal: compares each pair of objects with Py_EQ and stores the result object itself. */
+#line 2320
+NPY_NO_EXPORT void
+OBJECT_OO_O_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    BINARY_LOOP {  /* macro defined outside this view; ip1, ip2 and op1 presumably come from it */
+        PyObject *ret_obj;
+        PyObject *in1 = *(PyObject **)ip1;
+        PyObject *in2 = *(PyObject **)ip2;
+        /* A NULL operand slot is compared as if it held None. */
+        in1 = in1 ? in1 : Py_None;
+        in2 = in2 ? in2 : Py_None;
+
+        /*
+         * Do not use RichCompareBool because it includes an identity check for
+         * == and !=. This is wrong for elementwise behaviour, since it means
+         * that NaN can be equal to NaN and an array is equal to itself.
+         */
+        ret_obj = PyObject_RichCompare(in1, in2, Py_EQ);
+        if (ret_obj == NULL) {
+            return;  /* no result object: abandon the remaining elements */
+        }
+#if 0  /* compiled out in this variant: boolean-output path */
+        {
+            int ret = PyObject_IsTrue(ret_obj);
+            Py_DECREF(ret_obj);
+            if (ret == -1) {
+                return;
+            }
+            *((npy_bool *)op1) = (npy_bool)ret;
+        }
+#else  /* object-output path */
+        Py_XSETREF(*((PyObject **)op1), ret_obj);  /* store the new reference and release any object already in the slot */
+#endif
+    }
+}
+
+
+#line 2315
+/* OBJECT_not_equal: compares each pair of objects with Py_NE and stores the truth value as npy_bool. */
+#line 2320
+NPY_NO_EXPORT void
+OBJECT_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    BINARY_LOOP {  /* macro defined outside this view; ip1, ip2 and op1 presumably come from it */
+        PyObject *ret_obj;
+        PyObject *in1 = *(PyObject **)ip1;
+        PyObject *in2 = *(PyObject **)ip2;
+        /* A NULL operand slot is compared as if it held None. */
+        in1 = in1 ? in1 : Py_None;
+        in2 = in2 ? in2 : Py_None;
+
+        /*
+         * Do not use RichCompareBool because it includes an identity check for
+         * == and !=. This is wrong for elementwise behaviour, since it means
+         * that NaN can be equal to NaN and an array is equal to itself.
+         */
+        ret_obj = PyObject_RichCompare(in1, in2, Py_NE);
+        if (ret_obj == NULL) {
+            return;  /* no result object: abandon the remaining elements */
+        }
+#if 1  /* always compiled in this variant: reduce the result to a boolean */
+        {
+            int ret = PyObject_IsTrue(ret_obj);
+            Py_DECREF(ret_obj);  /* only the truth value is kept */
+            if (ret == -1) {
+                return;  /* truth test reported -1: abandon the remaining elements */
+            }
+            *((npy_bool *)op1) = (npy_bool)ret;
+        }
+#else  /* compiled out here: would store the result object itself */
+        *((PyObject **)op1) = ret_obj;
+#endif
+    }
+}
+/* OBJECT_OO_O_not_equal: compares each pair of objects with Py_NE and stores the result object itself. */
+#line 2320
+NPY_NO_EXPORT void
+OBJECT_OO_O_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    BINARY_LOOP {  /* macro defined outside this view; ip1, ip2 and op1 presumably come from it */
+        PyObject *ret_obj;
+        PyObject *in1 = *(PyObject **)ip1;
+        PyObject *in2 = *(PyObject **)ip2;
+        /* A NULL operand slot is compared as if it held None. */
+        in1 = in1 ? in1 : Py_None;
+        in2 = in2 ? in2 : Py_None;
+
+        /*
+         * Do not use RichCompareBool because it includes an identity check for
+         * == and !=. This is wrong for elementwise behaviour, since it means
+         * that NaN can be equal to NaN and an array is equal to itself.
+         */
+        ret_obj = PyObject_RichCompare(in1, in2, Py_NE);
+        if (ret_obj == NULL) {
+            return;  /* no result object: abandon the remaining elements */
+        }
+#if 0  /* compiled out in this variant: boolean-output path */
+        {
+            int ret = PyObject_IsTrue(ret_obj);
+            Py_DECREF(ret_obj);
+            if (ret == -1) {
+                return;
+            }
+            *((npy_bool *)op1) = (npy_bool)ret;
+        }
+#else  /* object-output path */
+        Py_XSETREF(*((PyObject **)op1), ret_obj);  /* store the new reference and release any object already in the slot */
+#endif
+    }
+}
+
+
+#line 2315
+/* OBJECT_greater: compares each pair of objects with Py_GT and stores the truth value as npy_bool. */
+#line 2320
+NPY_NO_EXPORT void
+OBJECT_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    BINARY_LOOP {  /* macro defined outside this view; ip1, ip2 and op1 presumably come from it */
+        PyObject *ret_obj;
+        PyObject *in1 = *(PyObject **)ip1;
+        PyObject *in2 = *(PyObject **)ip2;
+        /* A NULL operand slot is compared as if it held None. */
+        in1 = in1 ? in1 : Py_None;
+        in2 = in2 ? in2 : Py_None;
+
+        /*
+         * Do not use RichCompareBool because it includes an identity check for
+         * == and !=. This is wrong for elementwise behaviour, since it means
+         * that NaN can be equal to NaN and an array is equal to itself.
+         */
+        ret_obj = PyObject_RichCompare(in1, in2, Py_GT);
+        if (ret_obj == NULL) {
+            return;  /* no result object: abandon the remaining elements */
+        }
+#if 1  /* always compiled in this variant: reduce the result to a boolean */
+        {
+            int ret = PyObject_IsTrue(ret_obj);
+            Py_DECREF(ret_obj);  /* only the truth value is kept */
+            if (ret == -1) {
+                return;  /* truth test reported -1: abandon the remaining elements */
+            }
+            *((npy_bool *)op1) = (npy_bool)ret;
+        }
+#else  /* compiled out here: would store the result object itself */
+        *((PyObject **)op1) = ret_obj;
+#endif
+    }
+}
+/* OBJECT_OO_O_greater: compares each pair of objects with Py_GT and stores the result object itself. */
+#line 2320
+NPY_NO_EXPORT void
+OBJECT_OO_O_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    BINARY_LOOP {  /* macro defined outside this view; ip1, ip2 and op1 presumably come from it */
+        PyObject *ret_obj;
+        PyObject *in1 = *(PyObject **)ip1;
+        PyObject *in2 = *(PyObject **)ip2;
+        /* A NULL operand slot is compared as if it held None. */
+        in1 = in1 ? in1 : Py_None;
+        in2 = in2 ? in2 : Py_None;
+
+        /*
+         * Do not use RichCompareBool because it includes an identity check for
+         * == and !=. This is wrong for elementwise behaviour, since it means
+         * that NaN can be equal to NaN and an array is equal to itself.
+         */
+        ret_obj = PyObject_RichCompare(in1, in2, Py_GT);
+        if (ret_obj == NULL) {
+            return;  /* no result object: abandon the remaining elements */
+        }
+#if 0  /* compiled out in this variant: boolean-output path */
+        {
+            int ret = PyObject_IsTrue(ret_obj);
+            Py_DECREF(ret_obj);
+            if (ret == -1) {
+                return;
+            }
+            *((npy_bool *)op1) = (npy_bool)ret;
+        }
+#else  /* object-output path */
+        Py_XSETREF(*((PyObject **)op1), ret_obj);  /* store the new reference and release any object already in the slot */
+#endif
+    }
+}
+
+
+#line 2315
+/* OBJECT_greater_equal: compares each pair of objects with Py_GE and stores the truth value as npy_bool. */
+#line 2320
+NPY_NO_EXPORT void
+OBJECT_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    BINARY_LOOP {  /* macro defined outside this view; ip1, ip2 and op1 presumably come from it */
+        PyObject *ret_obj;
+        PyObject *in1 = *(PyObject **)ip1;
+        PyObject *in2 = *(PyObject **)ip2;
+        /* A NULL operand slot is compared as if it held None. */
+        in1 = in1 ? in1 : Py_None;
+        in2 = in2 ? in2 : Py_None;
+
+        /*
+         * Do not use RichCompareBool because it includes an identity check for
+         * == and !=. This is wrong for elementwise behaviour, since it means
+         * that NaN can be equal to NaN and an array is equal to itself.
+         */
+        ret_obj = PyObject_RichCompare(in1, in2, Py_GE);
+        if (ret_obj == NULL) {
+            return;  /* no result object: abandon the remaining elements */
+        }
+#if 1  /* always compiled in this variant: reduce the result to a boolean */
+        {
+            int ret = PyObject_IsTrue(ret_obj);
+            Py_DECREF(ret_obj);  /* only the truth value is kept */
+            if (ret == -1) {
+                return;  /* truth test reported -1: abandon the remaining elements */
+            }
+            *((npy_bool *)op1) = (npy_bool)ret;
+        }
+#else  /* compiled out here: would store the result object itself */
+        *((PyObject **)op1) = ret_obj;
+#endif
+    }
+}
+/* OBJECT_OO_O_greater_equal: compares each pair of objects with Py_GE and stores the result object itself. */
+#line 2320
+NPY_NO_EXPORT void
+OBJECT_OO_O_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    BINARY_LOOP {  /* macro defined outside this view; ip1, ip2 and op1 presumably come from it */
+        PyObject *ret_obj;
+        PyObject *in1 = *(PyObject **)ip1;
+        PyObject *in2 = *(PyObject **)ip2;
+        /* A NULL operand slot is compared as if it held None. */
+        in1 = in1 ? in1 : Py_None;
+        in2 = in2 ? in2 : Py_None;
+
+        /*
+         * Do not use RichCompareBool because it includes an identity check for
+         * == and !=. This is wrong for elementwise behaviour, since it means
+         * that NaN can be equal to NaN and an array is equal to itself.
+         */
+        ret_obj = PyObject_RichCompare(in1, in2, Py_GE);
+        if (ret_obj == NULL) {
+            return;  /* no result object: abandon the remaining elements */
+        }
+#if 0  /* compiled out in this variant: boolean-output path */
+        {
+            int ret = PyObject_IsTrue(ret_obj);
+            Py_DECREF(ret_obj);
+            if (ret == -1) {
+                return;
+            }
+            *((npy_bool *)op1) = (npy_bool)ret;
+        }
+#else  /* object-output path */
+        Py_XSETREF(*((PyObject **)op1), ret_obj);  /* store the new reference and release any object already in the slot */
+#endif
+    }
+}
+
+
+#line 2315
+/* OBJECT_less: compares each pair of objects with Py_LT and stores the truth value as npy_bool. */
+#line 2320
+NPY_NO_EXPORT void
+OBJECT_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    BINARY_LOOP {  /* macro defined outside this view; ip1, ip2 and op1 presumably come from it */
+        PyObject *ret_obj;
+        PyObject *in1 = *(PyObject **)ip1;
+        PyObject *in2 = *(PyObject **)ip2;
+        /* A NULL operand slot is compared as if it held None. */
+        in1 = in1 ? in1 : Py_None;
+        in2 = in2 ? in2 : Py_None;
+
+        /*
+         * Do not use RichCompareBool because it includes an identity check for
+         * == and !=. This is wrong for elementwise behaviour, since it means
+         * that NaN can be equal to NaN and an array is equal to itself.
+         */
+        ret_obj = PyObject_RichCompare(in1, in2, Py_LT);
+        if (ret_obj == NULL) {
+            return;  /* no result object: abandon the remaining elements */
+        }
+#if 1  /* always compiled in this variant: reduce the result to a boolean */
+        {
+            int ret = PyObject_IsTrue(ret_obj);
+            Py_DECREF(ret_obj);  /* only the truth value is kept */
+            if (ret == -1) {
+                return;  /* truth test reported -1: abandon the remaining elements */
+            }
+            *((npy_bool *)op1) = (npy_bool)ret;
+        }
+#else  /* compiled out here: would store the result object itself */
+        *((PyObject **)op1) = ret_obj;
+#endif
+    }
+}
+/* OBJECT_OO_O_less: compares each pair of objects with Py_LT and stores the result object itself. */
+#line 2320
+NPY_NO_EXPORT void
+OBJECT_OO_O_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    BINARY_LOOP {  /* macro defined outside this view; ip1, ip2 and op1 presumably come from it */
+        PyObject *ret_obj;
+        PyObject *in1 = *(PyObject **)ip1;
+        PyObject *in2 = *(PyObject **)ip2;
+        /* A NULL operand slot is compared as if it held None. */
+        in1 = in1 ? in1 : Py_None;
+        in2 = in2 ? in2 : Py_None;
+
+        /*
+         * Do not use RichCompareBool because it includes an identity check for
+         * == and !=. This is wrong for elementwise behaviour, since it means
+         * that NaN can be equal to NaN and an array is equal to itself.
+         */
+        ret_obj = PyObject_RichCompare(in1, in2, Py_LT);
+        if (ret_obj == NULL) {
+            return;  /* no result object: abandon the remaining elements */
+        }
+#if 0  /* compiled out in this variant: boolean-output path */
+        {
+            int ret = PyObject_IsTrue(ret_obj);
+            Py_DECREF(ret_obj);
+            if (ret == -1) {
+                return;
+            }
+            *((npy_bool *)op1) = (npy_bool)ret;
+        }
+#else  /* object-output path */
+        Py_XSETREF(*((PyObject **)op1), ret_obj);  /* store the new reference and release any object already in the slot */
+#endif
+    }
+}
+
+
+#line 2315
+/* OBJECT_less_equal: compares each pair of objects with Py_LE and stores the truth value as npy_bool. */
+#line 2320
+NPY_NO_EXPORT void
+OBJECT_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    BINARY_LOOP {  /* macro defined outside this view; ip1, ip2 and op1 presumably come from it */
+        PyObject *ret_obj;
+        PyObject *in1 = *(PyObject **)ip1;
+        PyObject *in2 = *(PyObject **)ip2;
+        /* A NULL operand slot is compared as if it held None. */
+        in1 = in1 ? in1 : Py_None;
+        in2 = in2 ? in2 : Py_None;
+
+        /*
+         * Do not use RichCompareBool because it includes an identity check for
+         * == and !=. This is wrong for elementwise behaviour, since it means
+         * that NaN can be equal to NaN and an array is equal to itself.
+         */
+        ret_obj = PyObject_RichCompare(in1, in2, Py_LE);
+        if (ret_obj == NULL) {
+            return;  /* no result object: abandon the remaining elements */
+        }
+#if 1  /* always compiled in this variant: reduce the result to a boolean */
+        {
+            int ret = PyObject_IsTrue(ret_obj);
+            Py_DECREF(ret_obj);  /* only the truth value is kept */
+            if (ret == -1) {
+                return;  /* truth test reported -1: abandon the remaining elements */
+            }
+            *((npy_bool *)op1) = (npy_bool)ret;
+        }
+#else  /* compiled out here: would store the result object itself */
+        *((PyObject **)op1) = ret_obj;
+#endif
+    }
+}
+/* OBJECT_OO_O_less_equal: compares each pair of objects with Py_LE and stores the result object itself. */
+#line 2320
+NPY_NO_EXPORT void
+OBJECT_OO_O_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
+    BINARY_LOOP {  /* macro defined outside this view; ip1, ip2 and op1 presumably come from it */
+        PyObject *ret_obj;
+        PyObject *in1 = *(PyObject **)ip1;
+        PyObject *in2 = *(PyObject **)ip2;
+        /* A NULL operand slot is compared as if it held None. */
+        in1 = in1 ? in1 : Py_None;
+        in2 = in2 ? in2 : Py_None;
+
+        /*
+         * Do not use RichCompareBool because it includes an identity check for
+         * == and !=. This is wrong for elementwise behaviour, since it means
+         * that NaN can be equal to NaN and an array is equal to itself.
+         */
+        ret_obj = PyObject_RichCompare(in1, in2, Py_LE);
+        if (ret_obj == NULL) {
+            return;  /* no result object: abandon the remaining elements */
+        }
+#if 0  /* compiled out in this variant: boolean-output path */
+        {
+            int ret = PyObject_IsTrue(ret_obj);
+            Py_DECREF(ret_obj);
+            if (ret == -1) {
+                return;
+            }
+            *((npy_bool *)op1) = (npy_bool)ret;
+        }
+#else  /* object-output path */
+        Py_XSETREF(*((PyObject **)op1), ret_obj);  /* store the new reference and release any object already in the slot */
+#endif
+    }
+}
+
+/* OBJECT_sign: stores the Python int -1, 1 or 0 for each object that compares <, > or == to the int 0. */
+/* If none of the three comparisons holds a TypeError is set; the loop stops at the first element with no result. */
+NPY_NO_EXPORT void
+OBJECT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    PyObject *zero = PyLong_FromLong(0);  /* right-hand operand of every comparison; released after the loop */
+
+    UNARY_LOOP {  /* macro defined outside this view; ip1 and op1 presumably come from it */
+        PyObject *in1 = *(PyObject **)ip1;
+        PyObject **out = (PyObject **)op1;
+        PyObject *ret = NULL;
+        int v;
+        /* A NULL input slot is treated as None. */
+        if (in1 == NULL) {
+            in1 = Py_None;
+        }
+        /* Probe <, then >, then ==; a later probe runs only if the previous one returned 0. */
+        if ((v = PyObject_RichCompareBool(in1, zero, Py_LT)) == 1) {
+            ret = PyLong_FromLong(-1);
+        }
+        else if (v == 0 &&
+                (v = PyObject_RichCompareBool(in1, zero, Py_GT)) == 1) {
+            ret = PyLong_FromLong(1);
+        }
+        else if (v == 0 &&
+                (v = PyObject_RichCompareBool(in1, zero, Py_EQ)) == 1) {
+            ret = PyLong_FromLong(0);
+        }
+        else if (v == 0) {
+            /* all three comparisons returned 0, e.g. in1 is NaN */
+            PyErr_SetString(PyExc_TypeError,
+                    "unorderable types for comparison");
+        }
+        /* ret is still NULL if a probe returned neither 0 nor 1, nothing matched, or PyLong_FromLong returned NULL. */
+        if (ret == NULL) {
+            break;
+        }
+        Py_XDECREF(*out);  /* release whatever the output slot held before overwriting it */
+        *out = ret;
+    }
+    Py_XDECREF(zero);
+}
+
+/*
+ *****************************************************************************
+ **                              END LOOPS                                  **
+ *****************************************************************************
+ */
+
diff --git a/numpy/core/src/_generated/loops.h b/numpy/core/src/_generated/loops.h
new file mode 100644
index 000000000000..6df7c4c634e4
--- /dev/null
+++ b/numpy/core/src/_generated/loops.h
@@ -0,0 +1,4985 @@
+#line 1 "numpy/core/src/umath/loops.h.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/* -*- c -*- */
+/*
+ * vim:syntax=c
+ */
+
+#ifndef _NPY_UMATH_LOOPS_H_
+#define _NPY_UMATH_LOOPS_H_
+
+#ifndef NPY_NO_EXPORT
+    #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN
+#endif
+
+/*
+ *****************************************************************************
+ **                             BOOLEAN LOOPS                               **
+ *****************************************************************************
+ */
+
+/*
+ * The following functions are defined by the umath generator
+ * to enable runtime dispatching without the need
+ * to redefine them within dispatch-able sources.
+ */
+// #define BOOL_invert BOOL_logical_not
+// #define BOOL_add BOOL_logical_or
+// #define BOOL_bitwise_and BOOL_logical_and
+// #define BOOL_bitwise_or BOOL_logical_or
+// #define BOOL_logical_xor BOOL_not_equal
+// #define BOOL_bitwise_xor BOOL_logical_xor
+// #define BOOL_multiply BOOL_logical_and
+// #define BOOL_maximum BOOL_logical_or
+// #define BOOL_minimum BOOL_logical_and
+// #define BOOL_fmax BOOL_maximum
+// #define BOOL_fmin BOOL_minimum
+
+typedef struct PyArrayMethod_Context_tag PyArrayMethod_Context;
+typedef struct NpyAuxData_tag NpyAuxData;
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_comparison.dispatch.h"
+#endif
+
+#line 46
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 46
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 46
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 46
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 46
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 46
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_logical.dispatch.h"
+#endif
+
+#line 57
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_logical_and,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 57
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_logical_or,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 57
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_logical_not,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 57
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_absolute,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+NPY_NO_EXPORT void
+BOOL__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+#line 67
+NPY_NO_EXPORT void
+BOOL_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 67
+NPY_NO_EXPORT void
+BOOL_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 67
+NPY_NO_EXPORT void
+BOOL_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_autovec.dispatch.h"
+#endif
+#line 78
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 78
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 78
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BOOL_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+/*
+ *****************************************************************************
+ **                           INTEGER LOOPS
+ *****************************************************************************
+ */
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_arithmetic.dispatch.h"
+#endif
+
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+UBYTE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+USHORT_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+UINT_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+ULONG_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+ULONGLONG_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+BYTE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+SHORT_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+INT_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+LONG_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+
+#line 96
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+LONGLONG_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+/**end repeat3**/
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_modulo.dispatch.h"
+#endif
+
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 113
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_divmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_fmod,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 116
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_remainder,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_comparison.dispatch.h"
+#endif
+
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* ULONGLONG comparison loops (equal, not_equal, greater, greater_equal, less, less_equal), each declared through NPY_CPU_DISPATCH_DECLARE with the same (args, dimensions, steps, func) parameter list. */
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* BYTE comparison loops (equal, not_equal, greater, greater_equal, less, less_equal), each declared through NPY_CPU_DISPATCH_DECLARE with the same (args, dimensions, steps, func) parameter list. */
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* SHORT comparison loops (equal, not_equal, greater, greater_equal, less, less_equal), each declared through NPY_CPU_DISPATCH_DECLARE with the same (args, dimensions, steps, func) parameter list. */
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* INT comparison loops (equal, not_equal, greater, greater_equal, less, less_equal), each declared through NPY_CPU_DISPATCH_DECLARE with the same (args, dimensions, steps, func) parameter list. */
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* LONG comparison loops (equal, not_equal, greater, greater_equal, less, less_equal), each declared through NPY_CPU_DISPATCH_DECLARE with the same (args, dimensions, steps, func) parameter list. */
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* LONGLONG comparison loops (equal, not_equal, greater, greater_equal, less, less_equal), each declared through NPY_CPU_DISPATCH_DECLARE with the same (args, dimensions, steps, func) parameter list. */
+#line 129
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_not_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_greater,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_greater_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_less,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 132
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_less_equal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+
+/* loops_autovec.dispatch.h is included unless NPY_DISABLE_OPTIMIZATION is defined; the UBYTE group that follows declares 21 loops, invert through sign, all with the same (args, dimensions, steps, func) parameter list. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_autovec.dispatch.h"
+#endif
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* USHORT loops declared through NPY_CPU_DISPATCH_DECLARE: 21 operations, invert through sign, all with the same (args, dimensions, steps, func) parameter list. */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* UINT loops declared through NPY_CPU_DISPATCH_DECLARE: 21 operations, invert through sign, all with the same (args, dimensions, steps, func) parameter list. */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* ULONG loops declared through NPY_CPU_DISPATCH_DECLARE: 21 operations, invert through sign, all with the same (args, dimensions, steps, func) parameter list. */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* ULONGLONG loops declared through NPY_CPU_DISPATCH_DECLARE: 21 operations, invert through sign, all with the same (args, dimensions, steps, func) parameter list. */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* BYTE loops declared through NPY_CPU_DISPATCH_DECLARE: 21 operations, invert through sign, all with the same (args, dimensions, steps, func) parameter list. */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* SHORT loops declared through NPY_CPU_DISPATCH_DECLARE: 21 operations, invert through sign, all with the same (args, dimensions, steps, func) parameter list. */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* INT_* functions declared through NPY_CPU_DISPATCH_DECLARE; each returns void and takes (args, dimensions, steps, func). */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* LONG_* functions declared through NPY_CPU_DISPATCH_DECLARE; each returns void and takes (args, dimensions, steps, func). */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/* LONGLONG_* functions declared through NPY_CPU_DISPATCH_DECLARE; each returns void and takes (args, dimensions, steps, func). */
+#line 145
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_invert,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_logical_not,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_conjugate,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_reciprocal,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_square,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_bitwise_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_bitwise_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_bitwise_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_left_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_right_shift,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_logical_and,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_logical_or,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_logical_xor,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_isnan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_isfinite,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+#line 152
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_sign,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+/* BYTE: floor_divide, fmax and fmin (and their _indexed forms) are macro aliases for divide, maximum and minimum. */
+#line 160
+#line 165
+#define BYTE_floor_divide BYTE_divide
+#define BYTE_floor_divide_indexed BYTE_divide_indexed
+#define BYTE_fmax BYTE_maximum
+#define BYTE_fmin BYTE_minimum
+#define BYTE_fmax_indexed BYTE_maximum_indexed
+#define BYTE_fmin_indexed BYTE_minimum_indexed
+/* Plain NPY_NO_EXPORT prototypes follow; none is wrapped in NPY_CPU_DISPATCH_DECLARE. */
+NPY_NO_EXPORT void
+BYTE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+BYTE_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* The <op>_indexed variants return int and take a PyArrayMethod_Context first and an NpyAuxData pointer last. */
+#line 184
+NPY_NO_EXPORT int
+BYTE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+BYTE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+BYTE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+BYTE_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+BYTE_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+BYTE_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+BYTE_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+BYTE_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+#line 193
+NPY_NO_EXPORT void
+BYTE_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+BYTE_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+BYTE_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+BYTE_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+BYTE_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+BYTE_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+BYTE_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+/* UBYTE: floor_divide, fmax and fmin (and their _indexed forms) are macro aliases for divide, maximum and minimum. */
+#line 165
+#define UBYTE_floor_divide UBYTE_divide
+#define UBYTE_floor_divide_indexed UBYTE_divide_indexed
+#define UBYTE_fmax UBYTE_maximum
+#define UBYTE_fmin UBYTE_minimum
+#define UBYTE_fmax_indexed UBYTE_maximum_indexed
+#define UBYTE_fmin_indexed UBYTE_minimum_indexed
+/* Plain NPY_NO_EXPORT prototypes follow; none is wrapped in NPY_CPU_DISPATCH_DECLARE. */
+NPY_NO_EXPORT void
+UBYTE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+UBYTE_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* The <op>_indexed variants return int and take a PyArrayMethod_Context first and an NpyAuxData pointer last. */
+#line 184
+NPY_NO_EXPORT int
+UBYTE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UBYTE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UBYTE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UBYTE_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UBYTE_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UBYTE_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UBYTE_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UBYTE_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+#line 193
+NPY_NO_EXPORT void
+UBYTE_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+UBYTE_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+UBYTE_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+UBYTE_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+UBYTE_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+UBYTE_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+UBYTE_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+
+/* SHORT: floor_divide, fmax and fmin (and their _indexed forms) are macro aliases for divide, maximum and minimum. */
+#line 160
+#line 165
+#define SHORT_floor_divide SHORT_divide
+#define SHORT_floor_divide_indexed SHORT_divide_indexed
+#define SHORT_fmax SHORT_maximum
+#define SHORT_fmin SHORT_minimum
+#define SHORT_fmax_indexed SHORT_maximum_indexed
+#define SHORT_fmin_indexed SHORT_minimum_indexed
+/* Plain NPY_NO_EXPORT prototypes follow; none is wrapped in NPY_CPU_DISPATCH_DECLARE. */
+NPY_NO_EXPORT void
+SHORT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+SHORT_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* The <op>_indexed variants return int and take a PyArrayMethod_Context first and an NpyAuxData pointer last. */
+#line 184
+NPY_NO_EXPORT int
+SHORT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+SHORT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+SHORT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+SHORT_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+SHORT_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+SHORT_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+SHORT_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+SHORT_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+#line 193
+NPY_NO_EXPORT void
+SHORT_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+SHORT_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+SHORT_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+SHORT_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+SHORT_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+SHORT_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+SHORT_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+/* USHORT: floor_divide, fmax and fmin (and their _indexed forms) are macro aliases for divide, maximum and minimum. */
+#line 165
+#define USHORT_floor_divide USHORT_divide
+#define USHORT_floor_divide_indexed USHORT_divide_indexed
+#define USHORT_fmax USHORT_maximum
+#define USHORT_fmin USHORT_minimum
+#define USHORT_fmax_indexed USHORT_maximum_indexed
+#define USHORT_fmin_indexed USHORT_minimum_indexed
+/* Plain NPY_NO_EXPORT prototypes follow; none is wrapped in NPY_CPU_DISPATCH_DECLARE. */
+NPY_NO_EXPORT void
+USHORT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+USHORT_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* The <op>_indexed variants return int and take a PyArrayMethod_Context first and an NpyAuxData pointer last. */
+#line 184
+NPY_NO_EXPORT int
+USHORT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+USHORT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+USHORT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+USHORT_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+USHORT_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+USHORT_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+USHORT_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+USHORT_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+#line 193
+NPY_NO_EXPORT void
+USHORT_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+USHORT_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+USHORT_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+USHORT_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+USHORT_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+USHORT_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+USHORT_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+
+/* INT: floor_divide, fmax and fmin (and their _indexed forms) are macro aliases for divide, maximum and minimum. */
+#line 160
+#line 165
+#define INT_floor_divide INT_divide
+#define INT_floor_divide_indexed INT_divide_indexed
+#define INT_fmax INT_maximum
+#define INT_fmin INT_minimum
+#define INT_fmax_indexed INT_maximum_indexed
+#define INT_fmin_indexed INT_minimum_indexed
+/* Plain NPY_NO_EXPORT prototypes follow; none is wrapped in NPY_CPU_DISPATCH_DECLARE. */
+NPY_NO_EXPORT void
+INT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+INT_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* The <op>_indexed variants return int and take a PyArrayMethod_Context first and an NpyAuxData pointer last. */
+#line 184
+NPY_NO_EXPORT int
+INT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+INT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+INT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+INT_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+INT_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+INT_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+INT_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+INT_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+#line 193
+NPY_NO_EXPORT void
+INT_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+INT_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+INT_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+INT_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+INT_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+INT_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+INT_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+/* UINT: floor_divide, fmax and fmin (and their _indexed forms) are macro aliases for divide, maximum and minimum. */
+#line 165
+#define UINT_floor_divide UINT_divide
+#define UINT_floor_divide_indexed UINT_divide_indexed
+#define UINT_fmax UINT_maximum
+#define UINT_fmin UINT_minimum
+#define UINT_fmax_indexed UINT_maximum_indexed
+#define UINT_fmin_indexed UINT_minimum_indexed
+/* Plain NPY_NO_EXPORT prototypes follow; none is wrapped in NPY_CPU_DISPATCH_DECLARE. */
+NPY_NO_EXPORT void
+UINT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+UINT_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* The <op>_indexed variants return int and take a PyArrayMethod_Context first and an NpyAuxData pointer last. */
+#line 184
+NPY_NO_EXPORT int
+UINT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UINT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UINT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UINT_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UINT_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UINT_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UINT_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+UINT_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+#line 193
+NPY_NO_EXPORT void
+UINT_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+UINT_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+UINT_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+UINT_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+UINT_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+UINT_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+UINT_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+
+/* LONG: floor_divide, fmax and fmin (and their _indexed forms) are macro aliases for divide, maximum and minimum. */
+#line 160
+#line 165
+#define LONG_floor_divide LONG_divide
+#define LONG_floor_divide_indexed LONG_divide_indexed
+#define LONG_fmax LONG_maximum
+#define LONG_fmin LONG_minimum
+#define LONG_fmax_indexed LONG_maximum_indexed
+#define LONG_fmin_indexed LONG_minimum_indexed
+/* Plain NPY_NO_EXPORT prototypes follow; none is wrapped in NPY_CPU_DISPATCH_DECLARE. */
+NPY_NO_EXPORT void
+LONG__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+LONG_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* The <op>_indexed variants return int and take a PyArrayMethod_Context first and an NpyAuxData pointer last. */
+#line 184
+NPY_NO_EXPORT int
+LONG_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONG_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONG_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONG_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONG_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONG_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONG_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONG_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+#line 193
+NPY_NO_EXPORT void
+LONG_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONG_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+LONG_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONG_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+LONG_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONG_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONG_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/
+
+
+#line 165
+#define ULONG_floor_divide ULONG_divide
+#define ULONG_floor_divide_indexed ULONG_divide_indexed
+#define ULONG_fmax ULONG_maximum
+#define ULONG_fmin ULONG_minimum
+#define ULONG_fmax_indexed ULONG_maximum_indexed
+#define ULONG_fmin_indexed ULONG_minimum_indexed
+/* The six macros above alias ULONG floor_divide to divide and fmax/fmin to maximum/minimum (plain and _indexed). NOTE(review): ULONG_divide and ULONG_divide_indexed are not declared in this group; presumably declared elsewhere -- confirm. */
+NPY_NO_EXPORT void
+ULONG__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+ULONG_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* The ULONG _indexed prototypes below return int and take a PyArrayMethod_Context and an NpyAuxData, both wrapped in NPY_UNUSED. */
+#line 184
+NPY_NO_EXPORT int
+ULONG_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONG_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONG_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONG_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONG_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONG_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONG_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONG_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* ULONG maximum and minimum: each has a void loop plus an int-returning _indexed companion. */
+#line 193
+NPY_NO_EXPORT void
+ULONG_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+ULONG_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+ULONG_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+ULONG_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* ULONG power, gcd and lcm: void loops only; no _indexed variants are declared for them in this group. */
+NPY_NO_EXPORT void
+ULONG_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+ULONG_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+ULONG_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/ /* NOTE(review): presumably a leftover template marker; as a comment it does not affect compilation. */
+
+
+
+#line 160
+#line 165
+#define LONGLONG_floor_divide LONGLONG_divide
+#define LONGLONG_floor_divide_indexed LONGLONG_divide_indexed
+#define LONGLONG_fmax LONGLONG_maximum
+#define LONGLONG_fmin LONGLONG_minimum
+#define LONGLONG_fmax_indexed LONGLONG_maximum_indexed
+#define LONGLONG_fmin_indexed LONGLONG_minimum_indexed
+/* The six macros above alias LONGLONG floor_divide to divide and fmax/fmin to maximum/minimum (plain and _indexed). NOTE(review): LONGLONG_divide and LONGLONG_divide_indexed are not declared in this group; presumably declared elsewhere -- confirm. */
+NPY_NO_EXPORT void
+LONGLONG__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+LONGLONG_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* The LONGLONG _indexed prototypes below return int and take a PyArrayMethod_Context and an NpyAuxData, both wrapped in NPY_UNUSED. */
+#line 184
+NPY_NO_EXPORT int
+LONGLONG_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONGLONG_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONGLONG_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONGLONG_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONGLONG_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONGLONG_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONGLONG_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+LONGLONG_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* LONGLONG maximum and minimum: each has a void loop plus an int-returning _indexed companion. */
+#line 193
+NPY_NO_EXPORT void
+LONGLONG_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGLONG_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+LONGLONG_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGLONG_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* LONGLONG power, gcd and lcm: void loops only; no _indexed variants are declared for them in this group. */
+NPY_NO_EXPORT void
+LONGLONG_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGLONG_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGLONG_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/ /* NOTE(review): presumably a leftover template marker; as a comment it does not affect compilation. */
+
+
+#line 165
+#define ULONGLONG_floor_divide ULONGLONG_divide
+#define ULONGLONG_floor_divide_indexed ULONGLONG_divide_indexed
+#define ULONGLONG_fmax ULONGLONG_maximum
+#define ULONGLONG_fmin ULONGLONG_minimum
+#define ULONGLONG_fmax_indexed ULONGLONG_maximum_indexed
+#define ULONGLONG_fmin_indexed ULONGLONG_minimum_indexed
+/* The six macros above alias ULONGLONG floor_divide to divide and fmax/fmin to maximum/minimum (plain and _indexed). NOTE(review): ULONGLONG_divide and ULONGLONG_divide_indexed are not declared in this group; presumably declared elsewhere -- confirm. */
+NPY_NO_EXPORT void
+ULONGLONG__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+ULONGLONG_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* The ULONGLONG _indexed prototypes below return int and take a PyArrayMethod_Context and an NpyAuxData, both wrapped in NPY_UNUSED. */
+#line 184
+NPY_NO_EXPORT int
+ULONGLONG_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONGLONG_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONGLONG_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONGLONG_bitwise_and_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONGLONG_bitwise_or_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONGLONG_bitwise_xor_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONGLONG_left_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 184
+NPY_NO_EXPORT int
+ULONGLONG_right_shift_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args,
+                         npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* ULONGLONG maximum and minimum: each has a void loop plus an int-returning _indexed companion. */
+#line 193
+NPY_NO_EXPORT void
+ULONGLONG_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+ULONGLONG_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 193
+NPY_NO_EXPORT void
+ULONGLONG_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+ULONGLONG_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* ULONGLONG power, gcd and lcm: void loops only; no _indexed variants are declared for them in this group. */
+NPY_NO_EXPORT void
+ULONGLONG_power(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+ULONGLONG_gcd(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+ULONGLONG_lcm(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/**end repeat2**/ /* NOTE(review): presumably a leftover template marker; as a comment it does not affect compilation. */
+
+
+
+
+#line 218
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Each comparison in this group is declared twice, as LONGLONG_Qq_bool_<op> and LONGLONG_qQ_bool_<op>, both void loops with identical parameters. */
+/* NOTE(review): Qq / qQ presumably encode the two operand orders of a mixed unsigned/signed long long comparison -- confirm. */
+#line 218
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 218
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 218
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 218
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 218
+NPY_NO_EXPORT void
+LONGLONG_Qq_bool_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+LONGLONG_qQ_bool_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION  /* the dispatch header is skipped when this macro is defined */
+    #include "loops_unary.dispatch.h"
+#endif /* NPY_DISABLE_OPTIMIZATION */
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/* Each NPY_CPU_DISPATCH_DECLARE in this group receives a "NPY_NO_EXPORT void NAME" prefix and a parenthesised parameter list; no semicolon follows the invocation. */
+/* NOTE(review): the macro itself is not defined in this chunk; presumably it comes from the CPU dispatch machinery -- confirm. */
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 233
+#line 236
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+
+
+/*
+ *****************************************************************************
+ **                             FLOAT LOOPS                                 **
+ *****************************************************************************
+ */
+#ifndef NPY_DISABLE_OPTIMIZATION  /* the dispatch header is skipped when this macro is defined */
+    #include "loops_unary_fp.dispatch.h"
+#endif /* NPY_DISABLE_OPTIMIZATION */
+#line 253
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_rint,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/* FLOAT group: rint, floor, trunc, ceil, sqrt, absolute, square, reciprocal -- all declared through NPY_CPU_DISPATCH_DECLARE with the same parameter list. */
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_floor,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_trunc,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_ceil,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_sqrt,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_absolute,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_square,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_reciprocal,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+/* DOUBLE group: the same eight operations as the FLOAT group, in the same order. */
+#line 253
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_rint,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_floor,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_trunc,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_ceil,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_sqrt,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_absolute,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_square,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 256
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_reciprocal,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION  /* the dispatch header is skipped when this macro is defined */
+    #include "loops_unary_fp_le.dispatch.h"
+#endif /* NPY_DISABLE_OPTIMIZATION */
+#line 267
+#line 270
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_isnan,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/* FLOAT group: isnan, isinf, isfinite, signbit -- all declared through NPY_CPU_DISPATCH_DECLARE with the same parameter list. */
+#line 270
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_isinf,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 270
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_isfinite,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 270
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_signbit,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+/* DOUBLE group: the same four operations as the FLOAT group, in the same order. */
+#line 267
+#line 270
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_isnan,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 270
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_isinf,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 270
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_isfinite,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 270
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_signbit,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION  /* the dispatch header is skipped when this macro is defined */
+    #include "loops_unary.dispatch.h"
+#endif /* NPY_DISABLE_OPTIMIZATION */
+#line 281
+#line 284
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+/* NOTE(review): loops_unary.dispatch.h is also included earlier in this header, before the integer _negative declarations; the repeated include is presumably needed to re-select that dispatch context after other dispatch headers -- confirm. */
+
+#line 281
+#line 284
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 281
+#line 284
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGDOUBLE_negative,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION  /* the dispatch header is skipped when this macro is defined */
+    #include "loops_arithm_fp.dispatch.h"
+#endif /* NPY_DISABLE_OPTIMIZATION */
+#line 295
+#line 299
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/* Each operation here (add, subtract, multiply, divide) pairs a dispatch-declared void loop with an ordinary int-returning _indexed prototype that is not wrapped in the dispatch macro. */
+NPY_NO_EXPORT int
+FLOAT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 299
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+FLOAT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 299
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+FLOAT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 299
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+FLOAT_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/* DOUBLE group: the same four operations and _indexed companions as the FLOAT group. */
+#line 295
+#line 299
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_add,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+DOUBLE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 299
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_subtract,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+DOUBLE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 299
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_multiply,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+DOUBLE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 299
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_divide,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+NPY_NO_EXPORT int
+DOUBLE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION  /* the dispatch header is skipped when this macro is defined */
+    #include "loops_hyperbolic.dispatch.h"
+#endif /* NPY_DISABLE_OPTIMIZATION */
+#line 314
+#line 317
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_tanh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/* NOTE(review): FLOAT_tanh and DOUBLE_tanh are dispatch-declared a second time later in this header, after loops_umath_fp.dispatch.h is included; confirm both declarations are intended. */
+
+#line 314
+#line 317
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_tanh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+
+// SVML -- NOTE(review): presumably these loops are backed by an SVML implementation selected via loops_umath_fp.dispatch.h; confirm.
+#ifndef NPY_DISABLE_OPTIMIZATION  /* the dispatch header is skipped when this macro is defined */
+    #include "loops_umath_fp.dispatch.h"
+#endif /* NPY_DISABLE_OPTIMIZATION */
+/* NOTE(review): FLOAT_tanh and DOUBLE_tanh below are also dispatch-declared earlier in this header, after loops_hyperbolic.dispatch.h; confirm both declarations are intended. */
+#line 330
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_tanh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+/* FLOAT group: tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh. */
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_exp2,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_log2,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_log10,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_expm1,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_log1p,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_cbrt,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_tan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_arcsin,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_arccos,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_arctan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_sinh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_cosh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_arcsinh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_arccosh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_arctanh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+/* DOUBLE group: the same sixteen operations as the FLOAT group, in the same order. */
+#line 330
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_tanh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_exp2,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_log2,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_log10,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_expm1,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_log1p,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_cbrt,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_tan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_arcsin,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_arccos,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_arctan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_sinh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_cosh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_arcsinh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_arccosh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 333
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_arctanh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+
+
+#line 343
+/* HALF group (20 loops): no dispatch header is included between loops_umath_fp.dispatch.h and these declarations. */
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_sin,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_cos,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_tan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_exp,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_exp2,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_log,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_log2,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_log10,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_expm1,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_log1p,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_cbrt,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_arcsin,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_arccos,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_arctan,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_sinh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_cosh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_tanh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_arcsinh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_arccosh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 343
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_arctanh,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+/* CPU-dispatch declarations for the power and arctan2 loops, emitted once for FLOAT and once for DOUBLE. */
+#line 352
+#line 355
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_power,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 355
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_arctan2,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+
+#line 352
+#line 355
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_power,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 355
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_arctan2,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+
+/* Trigonometric loops: the dispatch header is included unless NPY_DISABLE_OPTIMIZATION is defined; sin and cos are then declared for FLOAT and DOUBLE. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_trigonometric.dispatch.h"
+#endif
+
+#line 369
+#line 372
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_sin, (
+    char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 372
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_cos, (
+    char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+
+#line 369
+#line 372
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_sin, (
+    char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 372
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_cos, (
+    char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+
+/* Exponent/log loops: the dispatch header is included unless NPY_DISABLE_OPTIMIZATION is defined; exp, log, frexp and ldexp are then declared for FLOAT and DOUBLE. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_exponent_log.dispatch.h"
+#endif
+#line 384
+#line 387
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_exp, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 387
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_log, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 387
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_frexp, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 387
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_ldexp, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+
+#line 384
+#line 387
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_exp, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 387
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_log, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 387
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_frexp, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 387
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_ldexp, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+
+/* Comparison loops: the dispatch header is included unless NPY_DISABLE_OPTIMIZATION is defined; the six comparisons (equal .. greater_equal) are then declared for FLOAT and DOUBLE. */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_comparison.dispatch.h"
+#endif
+#line 399
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_equal, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_not_equal, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_less, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_less_equal, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_greater, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_greater_equal, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+
+#line 399
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_equal, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_not_equal, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_less, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_less_equal, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_greater, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+#line 402
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_greater_equal, (
+  char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)
+))
+
+
+/* HALF loop prototypes, declared directly rather than through NPY_CPU_DISPATCH_DECLARE. The *_indexed variants return int and take a PyArrayMethod_Context, `char *const *args` and an NpyAuxData pointer. */
+#line 416
+
+#line 422
+NPY_NO_EXPORT void
+HALF_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+HALF_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+HALF_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+HALF_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+HALF_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+HALF_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+HALF_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+HALF_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/**end repeat1**/
+
+#line 435
+NPY_NO_EXPORT void
+HALF_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 435
+NPY_NO_EXPORT void
+HALF_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+HALF_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 450
+/* Constant guard: `!0 || !1` evaluates to 1, so HALF_isnan, HALF_isinf, HALF_isfinite and HALF_signbit are declared here. */
+#if !0 || !1
+NPY_NO_EXPORT void
+HALF_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !1
+NPY_NO_EXPORT void
+HALF_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !1
+NPY_NO_EXPORT void
+HALF_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !1
+NPY_NO_EXPORT void
+HALF_signbit(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+/* Constant guard: `!0 || !0` evaluates to 1, so HALF_copysign, HALF_nextafter and HALF_spacing are declared here. */
+#if !0 || !0
+NPY_NO_EXPORT void
+HALF_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !0
+NPY_NO_EXPORT void
+HALF_nextafter(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !0
+NPY_NO_EXPORT void
+HALF_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+
+#line 461
+NPY_NO_EXPORT void
+HALF_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+HALF_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 461
+NPY_NO_EXPORT void
+HALF_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+HALF_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+#line 472
+NPY_NO_EXPORT void
+HALF_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+HALF_fmax_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 472
+NPY_NO_EXPORT void
+HALF_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+HALF_fmin_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+HALF_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+HALF_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+HALF_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+HALF__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+HALF_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Constant guard `#if 1`: the HALF_negative prototype is kept in this expansion. */
+#if 1
+NPY_NO_EXPORT void
+HALF_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+
+NPY_NO_EXPORT void
+HALF_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+HALF_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* FLOAT loop prototypes, declared directly rather than through NPY_CPU_DISPATCH_DECLARE. The *_indexed variants return int and take a PyArrayMethod_Context, `char *const *args` and an NpyAuxData pointer. */
+#line 416
+
+#line 422
+NPY_NO_EXPORT void
+FLOAT_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+FLOAT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+FLOAT_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+FLOAT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+FLOAT_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+FLOAT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+FLOAT_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+FLOAT_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/**end repeat1**/
+
+#line 435
+NPY_NO_EXPORT void
+FLOAT_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 435
+NPY_NO_EXPORT void
+FLOAT_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+FLOAT_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 450
+/* Constant guard: `!1 || !1` evaluates to 0, so the FLOAT_isnan, FLOAT_isinf, FLOAT_isfinite and FLOAT_signbit prototypes below are preprocessed away. */
+#if !1 || !1
+NPY_NO_EXPORT void
+FLOAT_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !1
+NPY_NO_EXPORT void
+FLOAT_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !1
+NPY_NO_EXPORT void
+FLOAT_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !1
+NPY_NO_EXPORT void
+FLOAT_signbit(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+/* Constant guard: `!1 || !0` evaluates to 1, so FLOAT_copysign, FLOAT_nextafter and FLOAT_spacing are declared here. */
+#if !1 || !0
+NPY_NO_EXPORT void
+FLOAT_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !0
+NPY_NO_EXPORT void
+FLOAT_nextafter(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !0
+NPY_NO_EXPORT void
+FLOAT_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+
+#line 461
+NPY_NO_EXPORT void
+FLOAT_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+FLOAT_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 461
+NPY_NO_EXPORT void
+FLOAT_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+FLOAT_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+#line 472
+NPY_NO_EXPORT void
+FLOAT_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+FLOAT_fmax_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 472
+NPY_NO_EXPORT void
+FLOAT_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+FLOAT_fmin_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+FLOAT_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+FLOAT_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+FLOAT_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+FLOAT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+FLOAT_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Constant guard `#if 0`: the FLOAT_negative prototype is compiled out of this list. */
+#if 0
+NPY_NO_EXPORT void
+FLOAT_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+
+NPY_NO_EXPORT void
+FLOAT_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+FLOAT_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* DOUBLE loop prototypes, declared directly rather than through NPY_CPU_DISPATCH_DECLARE. The *_indexed variants return int and take a PyArrayMethod_Context, `char *const *args` and an NpyAuxData pointer. */
+#line 416
+
+#line 422
+NPY_NO_EXPORT void
+DOUBLE_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+DOUBLE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+DOUBLE_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+DOUBLE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+DOUBLE_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+DOUBLE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+DOUBLE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+DOUBLE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/**end repeat1**/
+
+#line 435
+NPY_NO_EXPORT void
+DOUBLE_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 435
+NPY_NO_EXPORT void
+DOUBLE_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+DOUBLE_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 450
+/* Constant guard: `!1 || !1` evaluates to 0, so the DOUBLE_isnan, DOUBLE_isinf, DOUBLE_isfinite and DOUBLE_signbit prototypes below are preprocessed away. */
+#if !1 || !1
+NPY_NO_EXPORT void
+DOUBLE_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !1
+NPY_NO_EXPORT void
+DOUBLE_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !1
+NPY_NO_EXPORT void
+DOUBLE_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !1
+NPY_NO_EXPORT void
+DOUBLE_signbit(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+/* Constant guard: `!1 || !0` evaluates to 1, so DOUBLE_copysign, DOUBLE_nextafter and DOUBLE_spacing are declared here. */
+#if !1 || !0
+NPY_NO_EXPORT void
+DOUBLE_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !0
+NPY_NO_EXPORT void
+DOUBLE_nextafter(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !1 || !0
+NPY_NO_EXPORT void
+DOUBLE_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+
+#line 461
+NPY_NO_EXPORT void
+DOUBLE_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+DOUBLE_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 461
+NPY_NO_EXPORT void
+DOUBLE_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+DOUBLE_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+#line 472
+NPY_NO_EXPORT void
+DOUBLE_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+DOUBLE_fmax_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 472
+NPY_NO_EXPORT void
+DOUBLE_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+DOUBLE_fmin_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+DOUBLE_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+DOUBLE_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+DOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+DOUBLE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+DOUBLE_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Constant guard `#if 0`: the DOUBLE_negative prototype is compiled out of this list. */
+#if 0
+NPY_NO_EXPORT void
+DOUBLE_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+
+NPY_NO_EXPORT void
+DOUBLE_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* LONGDOUBLE loop prototypes, declared directly rather than through NPY_CPU_DISPATCH_DECLARE. The *_indexed variants return int and take a PyArrayMethod_Context, `char *const *args` and an NpyAuxData pointer. */
+#line 416
+
+#line 422
+NPY_NO_EXPORT void
+LONGDOUBLE_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGDOUBLE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+LONGDOUBLE_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGDOUBLE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+LONGDOUBLE_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGDOUBLE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 422
+NPY_NO_EXPORT void
+LONGDOUBLE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGDOUBLE_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+/**end repeat1**/
+
+#line 435
+NPY_NO_EXPORT void
+LONGDOUBLE_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 435
+NPY_NO_EXPORT void
+LONGDOUBLE_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+LONGDOUBLE_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 450
+/* Constant guard: `!0 || !1` evaluates to 1, so LONGDOUBLE_isnan, LONGDOUBLE_isinf, LONGDOUBLE_isfinite and LONGDOUBLE_signbit are declared here. */
+#if !0 || !1
+NPY_NO_EXPORT void
+LONGDOUBLE_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !1
+NPY_NO_EXPORT void
+LONGDOUBLE_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !1
+NPY_NO_EXPORT void
+LONGDOUBLE_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !1
+NPY_NO_EXPORT void
+LONGDOUBLE_signbit(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+/* Constant guard: `!0 || !0` evaluates to 1, so LONGDOUBLE_copysign, LONGDOUBLE_nextafter and LONGDOUBLE_spacing are declared here. */
+#if !0 || !0
+NPY_NO_EXPORT void
+LONGDOUBLE_copysign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !0
+NPY_NO_EXPORT void
+LONGDOUBLE_nextafter(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+#line 450
+
+#if !0 || !0
+NPY_NO_EXPORT void
+LONGDOUBLE_spacing(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+/**end repeat2**/
+
+
+#line 461
+NPY_NO_EXPORT void
+LONGDOUBLE_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGDOUBLE_maximum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 461
+NPY_NO_EXPORT void
+LONGDOUBLE_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGDOUBLE_minimum_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+#line 472
+NPY_NO_EXPORT void
+LONGDOUBLE_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGDOUBLE_fmax_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+#line 472
+NPY_NO_EXPORT void
+LONGDOUBLE_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGDOUBLE_fmin_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+LONGDOUBLE_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+LONGDOUBLE_floor_divide_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_square(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+LONGDOUBLE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_conjugate(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* Constant guard `#if 0`: the LONGDOUBLE_negative prototype is compiled out of this list. */
+#if 0
+NPY_NO_EXPORT void
+LONGDOUBLE_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#endif
+
+NPY_NO_EXPORT void
+LONGDOUBLE_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_modf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_frexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_ldexp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+LONGDOUBLE_ldexp_long(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 534
+#line 537
+NPY_NO_EXPORT void
+HALF_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+HALF_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+HALF_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+HALF_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+HALF_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+HALF_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 534
+#line 537
+NPY_NO_EXPORT void
+LONGDOUBLE_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+LONGDOUBLE_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+LONGDOUBLE_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+LONGDOUBLE_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+LONGDOUBLE_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 537
+NPY_NO_EXPORT void
+LONGDOUBLE_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_autovec.dispatch.h"
+#endif
+#line 548
+#line 551
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void HALF_absolute,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+/*
+ *****************************************************************************
+ **                           COMPLEX LOOPS                                 **
+ *****************************************************************************
+ */
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_arithm_fp.dispatch.h"
+#endif
+#line 566
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CFLOAT_add,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CFLOAT_subtract,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CFLOAT_multiply,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CFLOAT_conjugate,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CFLOAT_square,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 566
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CDOUBLE_add,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CDOUBLE_subtract,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CDOUBLE_multiply,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CDOUBLE_conjugate,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 569
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CDOUBLE_square,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_unary_complex.dispatch.h"
+#endif
+#line 580
+#line 583
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CFLOAT_absolute,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 580
+#line 583
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void CDOUBLE_absolute,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+/* Lexicographic complex comparisons (real parts first, imaginary parts break ties); pure expressions, so no trailing ';'. */
+#define CGE(xr,xi,yr,yi) ((xr) > (yr) || ((xr) == (yr) && (xi) >= (yi)))
+#define CLE(xr,xi,yr,yi) ((xr) < (yr) || ((xr) == (yr) && (xi) <= (yi)))
+#define CGT(xr,xi,yr,yi) ((xr) > (yr) || ((xr) == (yr) && (xi) > (yi)))
+#define CLT(xr,xi,yr,yi) ((xr) < (yr) || ((xr) == (yr) && (xi) < (yi)))
+#define CEQ(xr,xi,yr,yi) ((xr) == (yr) && (xi) == (yi))
+#define CNE(xr,xi,yr,yi) ((xr) != (yr) || (xi) != (yi))
+
+#line 601
+
+#line 606
+NPY_NO_EXPORT void
+CFLOAT_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CFLOAT_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+#line 606
+NPY_NO_EXPORT void
+CFLOAT_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CFLOAT_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+#line 606
+NPY_NO_EXPORT void
+CFLOAT_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CFLOAT_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CFLOAT_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CFLOAT_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CFLOAT_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CFLOAT_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CFLOAT_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CFLOAT_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CFLOAT_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 629
+NPY_NO_EXPORT void
+CFLOAT_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 629
+NPY_NO_EXPORT void
+CFLOAT_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CFLOAT_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CFLOAT_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#line 643
+NPY_NO_EXPORT void
+CFLOAT_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 643
+NPY_NO_EXPORT void
+CFLOAT_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 643
+NPY_NO_EXPORT void
+CFLOAT_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CFLOAT_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CFLOAT__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CFLOAT_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CFLOAT_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CFLOAT_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CFLOAT__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CFLOAT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 672
+NPY_NO_EXPORT void
+CFLOAT_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 672
+NPY_NO_EXPORT void
+CFLOAT_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 680
+NPY_NO_EXPORT void
+CFLOAT_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 680
+NPY_NO_EXPORT void
+CFLOAT_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+#line 601
+
+#line 606
+NPY_NO_EXPORT void
+CDOUBLE_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CDOUBLE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+#line 606
+NPY_NO_EXPORT void
+CDOUBLE_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CDOUBLE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+#line 606
+NPY_NO_EXPORT void
+CDOUBLE_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CDOUBLE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CDOUBLE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CDOUBLE_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CDOUBLE_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CDOUBLE_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CDOUBLE_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CDOUBLE_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CDOUBLE_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 629
+NPY_NO_EXPORT void
+CDOUBLE_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 629
+NPY_NO_EXPORT void
+CDOUBLE_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CDOUBLE_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CDOUBLE_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#line 643
+NPY_NO_EXPORT void
+CDOUBLE_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 643
+NPY_NO_EXPORT void
+CDOUBLE_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 643
+NPY_NO_EXPORT void
+CDOUBLE_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CDOUBLE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CDOUBLE_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CDOUBLE_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CDOUBLE_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CDOUBLE__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CDOUBLE_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 672
+NPY_NO_EXPORT void
+CDOUBLE_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 672
+NPY_NO_EXPORT void
+CDOUBLE_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 680
+NPY_NO_EXPORT void
+CDOUBLE_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 680
+NPY_NO_EXPORT void
+CDOUBLE_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+#line 601
+
+#line 606
+NPY_NO_EXPORT void
+CLONGDOUBLE_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CLONGDOUBLE_add_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+#line 606
+NPY_NO_EXPORT void
+CLONGDOUBLE_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CLONGDOUBLE_subtract_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+#line 606
+NPY_NO_EXPORT void
+CLONGDOUBLE_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT int
+CLONGDOUBLE_multiply_indexed(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CLONGDOUBLE_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CLONGDOUBLE_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CLONGDOUBLE_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CLONGDOUBLE_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CLONGDOUBLE_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 620
+NPY_NO_EXPORT void
+CLONGDOUBLE_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 629
+NPY_NO_EXPORT void
+CLONGDOUBLE_logical_and(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 629
+NPY_NO_EXPORT void
+CLONGDOUBLE_logical_or(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_logical_xor(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_logical_not(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+#line 643
+NPY_NO_EXPORT void
+CLONGDOUBLE_isnan(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 643
+NPY_NO_EXPORT void
+CLONGDOUBLE_isinf(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 643
+NPY_NO_EXPORT void
+CLONGDOUBLE_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_reciprocal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CLONGDOUBLE__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_conjugate(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_absolute(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_square(char **args, const npy_intp *dimensions, const npy_intp *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+CLONGDOUBLE__arg(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 672
+NPY_NO_EXPORT void
+CLONGDOUBLE_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 672
+NPY_NO_EXPORT void
+CLONGDOUBLE_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 680
+NPY_NO_EXPORT void
+CLONGDOUBLE_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 680
+NPY_NO_EXPORT void
+CLONGDOUBLE_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+
+#undef CGE
+#undef CLE
+#undef CGT
+#undef CLT
+#undef CEQ
+#undef CNE
+
+/*
+ *****************************************************************************
+ **                            DATETIME LOOPS                               **
+ *****************************************************************************
+ */
+
+NPY_NO_EXPORT void
+TIMEDELTA_negative(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_positive(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_absolute(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 714
+
+NPY_NO_EXPORT void
+DATETIME_isnat(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DATETIME_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* DATETIME_isnan is only an alias: the name resolves to the DATETIME_isnat loop declared above. */
+#define DATETIME_isnan DATETIME_isnat
+
+NPY_NO_EXPORT void
+DATETIME__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+#line 730
+NPY_NO_EXPORT void
+DATETIME_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+DATETIME_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+DATETIME_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+DATETIME_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+DATETIME_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+DATETIME_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 737
+NPY_NO_EXPORT void
+DATETIME_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 737
+NPY_NO_EXPORT void
+DATETIME_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 737
+NPY_NO_EXPORT void
+DATETIME_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 737
+NPY_NO_EXPORT void
+DATETIME_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+#line 714
+
+NPY_NO_EXPORT void
+TIMEDELTA_isnat(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_isfinite(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+/* TIMEDELTA_isnan is only an alias: the name resolves to the TIMEDELTA_isnat loop declared above. */
+#define TIMEDELTA_isnan TIMEDELTA_isnat
+
+NPY_NO_EXPORT void
+TIMEDELTA__ones_like(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+#line 730
+NPY_NO_EXPORT void
+TIMEDELTA_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+TIMEDELTA_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+TIMEDELTA_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+TIMEDELTA_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+TIMEDELTA_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 730
+NPY_NO_EXPORT void
+TIMEDELTA_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 737
+NPY_NO_EXPORT void
+TIMEDELTA_maximum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 737
+NPY_NO_EXPORT void
+TIMEDELTA_minimum(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 737
+NPY_NO_EXPORT void
+TIMEDELTA_fmin(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 737
+NPY_NO_EXPORT void
+TIMEDELTA_fmax(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+
+NPY_NO_EXPORT void
+DATETIME_Mm_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data));
+
+NPY_NO_EXPORT void
+DATETIME_mM_M_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_m_add(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DATETIME_Mm_M_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DATETIME_MM_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_m_subtract(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mq_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_qm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_md_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_dm_m_multiply(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mq_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_md_m_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_d_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_q_floor_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_m_remainder(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+TIMEDELTA_mm_qm_divmod(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+/* Special-case aliases: these floor_divide loop names resolve to the matching divide loops declared above */
+#define TIMEDELTA_mq_m_floor_divide TIMEDELTA_mq_m_divide
+#define TIMEDELTA_md_m_floor_divide TIMEDELTA_md_m_divide
+/* #define TIMEDELTA_mm_d_floor_divide TIMEDELTA_mm_d_divide */
+
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_autovec.dispatch.h"
+#endif
+#line 803
+#line 806
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void TIMEDELTA_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+#line 803
+#line 806
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DATETIME_isinf,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+
+
+/*
+ *****************************************************************************
+ **                            OBJECT LOOPS                                 **
+ *****************************************************************************
+ */
+
+#line 821
+#line 824
+NPY_NO_EXPORT void
+OBJECT_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 824
+NPY_NO_EXPORT void
+OBJECT_OO_O_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 821
+#line 824
+NPY_NO_EXPORT void
+OBJECT_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 824
+NPY_NO_EXPORT void
+OBJECT_OO_O_not_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 821
+#line 824
+NPY_NO_EXPORT void
+OBJECT_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 824
+NPY_NO_EXPORT void
+OBJECT_OO_O_greater(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 821
+#line 824
+NPY_NO_EXPORT void
+OBJECT_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 824
+NPY_NO_EXPORT void
+OBJECT_OO_O_greater_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 821
+#line 824
+NPY_NO_EXPORT void
+OBJECT_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 824
+NPY_NO_EXPORT void
+OBJECT_OO_O_less(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+#line 821
+#line 824
+NPY_NO_EXPORT void
+OBJECT_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 824
+NPY_NO_EXPORT void
+OBJECT_OO_O_less_equal(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+NPY_NO_EXPORT void
+OBJECT_sign(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+PyUFunc_OOO_O(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func);
+
+/*
+ *****************************************************************************
+ **                            MIN/MAX LOOPS                                **
+ *****************************************************************************
+ */
+
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_minmax.dispatch.h"
+#endif
+
+//---------- Integers ----------
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void BYTE_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UBYTE_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void SHORT_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void USHORT_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void INT_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void UINT_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONG_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONG_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGLONG_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 851
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 854
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void ULONGLONG_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+
+//---------- Float ----------
+
+ #line 864
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_fmax,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void FLOAT_fmin,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 864
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_fmax,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_fmin,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+#line 864
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGDOUBLE_maximum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGDOUBLE_minimum,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGDOUBLE_fmax,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+#line 867
+ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void LONGDOUBLE_fmin,
+   (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)))
+
+
+
+/*
+ *****************************************************************************
+ **                              END LOOPS                                  **
+ *****************************************************************************
+ */
+
+#endif
+
diff --git a/numpy/core/src/_generated/loops_arithm_fp.dispatch.c b/numpy/core/src/_generated/loops_arithm_fp.dispatch.c
new file mode 100644
index 000000000000..e71513dc142c
--- /dev/null
+++ b/numpy/core/src/_generated/loops_arithm_fp.dispatch.c
@@ -0,0 +1,3377 @@
+#line 1 "numpy/core/src/umath/loops_arithm_fp.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*@targets
+ ** $maxopt baseline
+ ** sse2 (avx2 fma3)
+ ** neon asimd
+ ** vsx2 vsx3
+ ** vx vxe
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/**
+ * TODO:
+ *  - Improve the implementation of SIMD complex absolute;
+ *    the current one is rather slow and can be optimized by
+ *    at least avoiding the division and keeping the sqrt.
+ *  - Vectorize reductions
+ *  - Add support for ASIMD/VCMLA through universal intrinsics.
+ */
+
+//###############################################################################
+//## Real Single/Double precision
+//###############################################################################
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+#line 43
+#line 52
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_add)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{ // Inner loop for npy_float addition: dst[i] = src0[i] + src1[i] over dimensions[0] elements.
+    npy_intp len = dimensions[0]; // number of elements to process
+    char *src0 = args[0], *src1 = args[1], *dst = args[2]; // operand pointers, advanced bytewise
+    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2]; // strides in bytes (added to the char pointers)
+    // reduce: first input and output are the same zero-stride location, so src1 is folded into *src0
+    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
+    #if 1 // template constant: add reduces through FLOAT_pairwise_sum
+        *((npy_float*)src0) += FLOAT_pairwise_sum(src1, len, ssrc1);
+    #else // compiled out for add: plain sequential accumulation
+        npy_float acc = *((npy_float*)src0);
+        if (ssrc1 == sizeof(npy_float)) {
+            for (; len > 0; --len, src1 += sizeof(npy_float)) {
+                acc += *(npy_float *)src1;
+            }
+        } else {
+            for (; len > 0; --len, src1 += ssrc1) {
+                acc += *(npy_float *)src1;
+            }
+        }
+        *((npy_float*)src0) = acc;
+    #endif
+        return;
+    }
+#if 0 && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64 // leading 0: always false for add; the note below is shared template text
+    /**
+     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
+     * support for single-precision floating-point division. Only scalar division is
+     * supported natively, and without hardware for performance and accuracy comparison,
+     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
+     * native scalar division.
+     *
+     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
+     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
+     * precision. However, this approach has limitations:
+     *
+     * - It can cause unexpected floating-point overflows in special cases, such as when
+     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
+     *
+     * - The precision may vary between the emulated SIMD and scalar division due to
+     *   non-uniform branches (non-contiguous) in the code, leading to precision
+     *   inconsistencies.
+     *
+     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
+     *   gain may not sufficiently offset these drawbacks.
+     */
+#elif NPY_SIMD_F32 // vectorized path, compiled only when NPY_SIMD_F32 is non-zero
+    if (len > npyv_nlanes_f32*2 && // more than two vectors' worth of elements
+        !is_mem_overlap(src0, ssrc0, dst, sdst, len) && // is_mem_overlap must report false for src0 vs dst
+        !is_mem_overlap(src1, ssrc1, dst, sdst, len) // ...and for src1 vs dst
+    ) {
+        const int vstep = npyv_nlanes_u8; // byte offset from one vector to the next
+        const int wstep = vstep * 2; // bytes advanced per unrolled iteration (two vectors)
+        const int hstep = npyv_nlanes_f32; // elements handled per vector
+        const int lstep = hstep * 2; // elements handled per unrolled iteration
+        // lots of specializations, to squeeze out max performance
+        if (ssrc0 == sizeof(npy_float) && ssrc0 == ssrc1 && ssrc0 == sdst) { // case 1: all operands contiguous
+            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) { // main loop, two vectors at a time
+                npyv_f32 a0 = npyv_load_f32((const npy_float*)src0);
+                npyv_f32 a1 = npyv_load_f32((const npy_float*)(src0 + vstep));
+                npyv_f32 b0 = npyv_load_f32((const npy_float*)src1);
+                npyv_f32 b1 = npyv_load_f32((const npy_float*)(src1 + vstep));
+                npyv_f32 r0 = npyv_add_f32(a0, b0);
+                npyv_f32 r1 = npyv_add_f32(a1, b1);
+                npyv_store_f32((npy_float*)dst, r0);
+                npyv_store_f32((npy_float*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) { // tail: one vector per pass, limited to len elements
+            #if 0 // compiled out for add: would fill the unused lanes with 1.0f
+                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
+                npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
+            #else // active: partial load (tillz variant; presumably zero-fills the unused lanes)
+                npyv_f32 a = npyv_load_tillz_f32((const npy_float*)src0, len);
+                npyv_f32 b = npyv_load_tillz_f32((const npy_float*)src1, len);
+            #endif
+                npyv_f32 r = npyv_add_f32(a, b);
+                npyv_store_till_f32((npy_float*)dst, len, r);
+            }
+        }
+        else if (ssrc0 == 0 && ssrc1 == sizeof(npy_float) && sdst == ssrc1) { // case 2: src0 is a zero-stride scalar
+            npyv_f32 a = npyv_setall_f32(*((npy_float*)src0)); // scalar operand turned into a vector once, outside the loops
+            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
+                npyv_f32 b0 = npyv_load_f32((const npy_float*)src1);
+                npyv_f32 b1 = npyv_load_f32((const npy_float*)(src1 + vstep));
+                npyv_f32 r0 = npyv_add_f32(a, b0);
+                npyv_f32 r1 = npyv_add_f32(a, b1);
+                npyv_store_f32((npy_float*)dst, r0);
+                npyv_store_f32((npy_float*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+            #if 0 || 0 // compiled out for add
+                npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
+            #else // active for add
+                npyv_f32 b = npyv_load_tillz_f32((const npy_float*)src1, len);
+            #endif
+                npyv_f32 r = npyv_add_f32(a, b);
+                npyv_store_till_f32((npy_float*)dst, len, r);
+            }
+        }
+        else if (ssrc1 == 0 && ssrc0 == sizeof(npy_float) && sdst == ssrc0) { // case 3: src1 is a zero-stride scalar
+            npyv_f32 b = npyv_setall_f32(*((npy_float*)src1)); // scalar operand turned into a vector once, outside the loops
+            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
+                npyv_f32 a0 = npyv_load_f32((const npy_float*)src0);
+                npyv_f32 a1 = npyv_load_f32((const npy_float*)(src0 + vstep));
+                npyv_f32 r0 = npyv_add_f32(a0, b);
+                npyv_f32 r1 = npyv_add_f32(a1, b);
+                npyv_store_f32((npy_float*)dst, r0);
+                npyv_store_f32((npy_float*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+            #if 0 // compiled out for add
+                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
+            #elif 0 // compiled out for add
+                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, NPY_NANF);
+            #else // active for add
+                npyv_f32 a = npyv_load_tillz_f32((const npy_float*)src0, len);
+            #endif
+                npyv_f32 r = npyv_add_f32(a, b);
+                npyv_store_till_f32((npy_float*)dst, len, r);
+            }
+        } else { // any other stride combination is left to the scalar loop
+            goto loop_scalar;
+        }
+        npyv_cleanup();
+        return;
+    }
+loop_scalar: // label exists only in the SIMD build; also reached by fallthrough when the guard above fails
+#endif
+    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) { // generic strided loop, one element per iteration
+        const npy_float a = *((npy_float*)src0);
+        const npy_float b = *((npy_float*)src1);
+        *((npy_float*)dst) = a + b;
+    }
+}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_add_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* In-place indexed add: target[index[k]] += value[k] for k in [0, dimensions[0]);
+     * negative indices are wrapped by adding steps[3]. Always returns 0. */
+    char *const target = args[0];
+    const char *index_cursor = args[1];
+    const char *value_cursor = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], value_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    npy_intp remaining;
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        const npy_intp raw = *(const npy_intp *)index_cursor;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_float *const slot = (npy_float *)(target + target_stride * pos);
+        *slot = *slot + *(const npy_float *)value_cursor;
+        index_cursor += index_stride;
+        value_cursor += value_stride;
+    }
+    return 0;
+}
+
+
+#line 52
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_subtract)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{ // Inner loop for npy_float subtraction: dst[i] = src0[i] - src1[i] over dimensions[0] elements.
+    npy_intp len = dimensions[0]; // number of elements to process
+    char *src0 = args[0], *src1 = args[1], *dst = args[2]; // operand pointers, advanced bytewise
+    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2]; // strides in bytes (added to the char pointers)
+    // reduce: first input and output are the same zero-stride location, so src1 is folded into *src0
+    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
+    #if 0 // compiled out for subtract: the pairwise-sum form is not used
+        *((npy_float*)src0) -= FLOAT_pairwise_sum(src1, len, ssrc1);
+    #else // active: sequential accumulation in a local, written back once
+        npy_float acc = *((npy_float*)src0);
+        if (ssrc1 == sizeof(npy_float)) { // contiguous input: constant pointer increment
+            for (; len > 0; --len, src1 += sizeof(npy_float)) {
+                acc -= *(npy_float *)src1;
+            }
+        } else { // arbitrary byte stride
+            for (; len > 0; --len, src1 += ssrc1) {
+                acc -= *(npy_float *)src1;
+            }
+        }
+        *((npy_float*)src0) = acc;
+    #endif
+        return;
+    }
+#if 0 && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64 // leading 0: always false for subtract; the note below is shared template text
+    /**
+     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
+     * support for single-precision floating-point division. Only scalar division is
+     * supported natively, and without hardware for performance and accuracy comparison,
+     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
+     * native scalar division.
+     *
+     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
+     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
+     * precision. However, this approach has limitations:
+     *
+     * - It can cause unexpected floating-point overflows in special cases, such as when
+     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
+     *
+     * - The precision may vary between the emulated SIMD and scalar division due to
+     *   non-uniform branches (non-contiguous) in the code, leading to precision
+     *   inconsistencies.
+     *
+     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
+     *   gain may not sufficiently offset these drawbacks.
+     */
+#elif NPY_SIMD_F32 // vectorized path, compiled only when NPY_SIMD_F32 is non-zero
+    if (len > npyv_nlanes_f32*2 && // more than two vectors' worth of elements
+        !is_mem_overlap(src0, ssrc0, dst, sdst, len) && // is_mem_overlap must report false for src0 vs dst
+        !is_mem_overlap(src1, ssrc1, dst, sdst, len) // ...and for src1 vs dst
+    ) {
+        const int vstep = npyv_nlanes_u8; // byte offset from one vector to the next
+        const int wstep = vstep * 2; // bytes advanced per unrolled iteration (two vectors)
+        const int hstep = npyv_nlanes_f32; // elements handled per vector
+        const int lstep = hstep * 2; // elements handled per unrolled iteration
+        // lots of specializations, to squeeze out max performance
+        if (ssrc0 == sizeof(npy_float) && ssrc0 == ssrc1 && ssrc0 == sdst) { // case 1: all operands contiguous
+            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) { // main loop, two vectors at a time
+                npyv_f32 a0 = npyv_load_f32((const npy_float*)src0);
+                npyv_f32 a1 = npyv_load_f32((const npy_float*)(src0 + vstep));
+                npyv_f32 b0 = npyv_load_f32((const npy_float*)src1);
+                npyv_f32 b1 = npyv_load_f32((const npy_float*)(src1 + vstep));
+                npyv_f32 r0 = npyv_sub_f32(a0, b0);
+                npyv_f32 r1 = npyv_sub_f32(a1, b1);
+                npyv_store_f32((npy_float*)dst, r0);
+                npyv_store_f32((npy_float*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) { // tail: one vector per pass, limited to len elements
+            #if 0 // compiled out for subtract: would fill the unused lanes with 1.0f
+                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
+                npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
+            #else // active: partial load (tillz variant; presumably zero-fills the unused lanes)
+                npyv_f32 a = npyv_load_tillz_f32((const npy_float*)src0, len);
+                npyv_f32 b = npyv_load_tillz_f32((const npy_float*)src1, len);
+            #endif
+                npyv_f32 r = npyv_sub_f32(a, b);
+                npyv_store_till_f32((npy_float*)dst, len, r);
+            }
+        }
+        else if (ssrc0 == 0 && ssrc1 == sizeof(npy_float) && sdst == ssrc1) { // case 2: src0 is a zero-stride scalar
+            npyv_f32 a = npyv_setall_f32(*((npy_float*)src0)); // scalar operand turned into a vector once, outside the loops
+            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
+                npyv_f32 b0 = npyv_load_f32((const npy_float*)src1);
+                npyv_f32 b1 = npyv_load_f32((const npy_float*)(src1 + vstep));
+                npyv_f32 r0 = npyv_sub_f32(a, b0);
+                npyv_f32 r1 = npyv_sub_f32(a, b1);
+                npyv_store_f32((npy_float*)dst, r0);
+                npyv_store_f32((npy_float*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+            #if 0 || 0 // compiled out for subtract
+                npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
+            #else // active for subtract
+                npyv_f32 b = npyv_load_tillz_f32((const npy_float*)src1, len);
+            #endif
+                npyv_f32 r = npyv_sub_f32(a, b);
+                npyv_store_till_f32((npy_float*)dst, len, r);
+            }
+        }
+        else if (ssrc1 == 0 && ssrc0 == sizeof(npy_float) && sdst == ssrc0) { // case 3: src1 is a zero-stride scalar
+            npyv_f32 b = npyv_setall_f32(*((npy_float*)src1)); // scalar operand turned into a vector once, outside the loops
+            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
+                npyv_f32 a0 = npyv_load_f32((const npy_float*)src0);
+                npyv_f32 a1 = npyv_load_f32((const npy_float*)(src0 + vstep));
+                npyv_f32 r0 = npyv_sub_f32(a0, b);
+                npyv_f32 r1 = npyv_sub_f32(a1, b);
+                npyv_store_f32((npy_float*)dst, r0);
+                npyv_store_f32((npy_float*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+            #if 0 // compiled out for subtract
+                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
+            #elif 0 // compiled out for subtract
+                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, NPY_NANF);
+            #else // active for subtract
+                npyv_f32 a = npyv_load_tillz_f32((const npy_float*)src0, len);
+            #endif
+                npyv_f32 r = npyv_sub_f32(a, b);
+                npyv_store_till_f32((npy_float*)dst, len, r);
+            }
+        } else { // any other stride combination is left to the scalar loop
+            goto loop_scalar;
+        }
+        npyv_cleanup();
+        return;
+    }
+loop_scalar: // label exists only in the SIMD build; also reached by fallthrough when the guard above fails
+#endif
+    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) { // generic strided loop, one element per iteration
+        const npy_float a = *((npy_float*)src0);
+        const npy_float b = *((npy_float*)src1);
+        *((npy_float*)dst) = a - b;
+    }
+}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_subtract_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* In-place indexed subtract: target[index[k]] -= value[k] for k in [0, dimensions[0]);
+     * negative indices are wrapped by adding steps[3]. Always returns 0. */
+    char *const target = args[0];
+    const char *index_cursor = args[1];
+    const char *value_cursor = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], value_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    npy_intp remaining;
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        const npy_intp raw = *(const npy_intp *)index_cursor;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_float *const slot = (npy_float *)(target + target_stride * pos);
+        *slot = *slot - *(const npy_float *)value_cursor;
+        index_cursor += index_stride;
+        value_cursor += value_stride;
+    }
+    return 0;
+}
+
+
+#line 52
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_multiply)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{ // Inner loop for npy_float multiplication: dst[i] = src0[i] * src1[i] over dimensions[0] elements.
+    npy_intp len = dimensions[0]; // number of elements to process
+    char *src0 = args[0], *src1 = args[1], *dst = args[2]; // operand pointers, advanced bytewise
+    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2]; // strides in bytes (added to the char pointers)
+    // reduce: first input and output are the same zero-stride location, so src1 is folded into *src0
+    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
+    #if 0 // compiled out for multiply: the pairwise-sum form is not used
+        *((npy_float*)src0) *= FLOAT_pairwise_sum(src1, len, ssrc1);
+    #else // active: sequential accumulation in a local, written back once
+        npy_float acc = *((npy_float*)src0);
+        if (ssrc1 == sizeof(npy_float)) { // contiguous input: constant pointer increment
+            for (; len > 0; --len, src1 += sizeof(npy_float)) {
+                acc *= *(npy_float *)src1;
+            }
+        } else { // arbitrary byte stride
+            for (; len > 0; --len, src1 += ssrc1) {
+                acc *= *(npy_float *)src1;
+            }
+        }
+        *((npy_float*)src0) = acc;
+    #endif
+        return;
+    }
+#if 0 && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64 // leading 0: always false for multiply; the note below is shared template text
+    /**
+     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
+     * support for single-precision floating-point division. Only scalar division is
+     * supported natively, and without hardware for performance and accuracy comparison,
+     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
+     * native scalar division.
+     *
+     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
+     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
+     * precision. However, this approach has limitations:
+     *
+     * - It can cause unexpected floating-point overflows in special cases, such as when
+     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
+     *
+     * - The precision may vary between the emulated SIMD and scalar division due to
+     *   non-uniform branches (non-contiguous) in the code, leading to precision
+     *   inconsistencies.
+     *
+     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
+     *   gain may not sufficiently offset these drawbacks.
+     */
+#elif NPY_SIMD_F32 // vectorized path, compiled only when NPY_SIMD_F32 is non-zero
+    if (len > npyv_nlanes_f32*2 && // more than two vectors' worth of elements
+        !is_mem_overlap(src0, ssrc0, dst, sdst, len) && // is_mem_overlap must report false for src0 vs dst
+        !is_mem_overlap(src1, ssrc1, dst, sdst, len) // ...and for src1 vs dst
+    ) {
+        const int vstep = npyv_nlanes_u8; // byte offset from one vector to the next
+        const int wstep = vstep * 2; // bytes advanced per unrolled iteration (two vectors)
+        const int hstep = npyv_nlanes_f32; // elements handled per vector
+        const int lstep = hstep * 2; // elements handled per unrolled iteration
+        // lots of specializations, to squeeze out max performance
+        if (ssrc0 == sizeof(npy_float) && ssrc0 == ssrc1 && ssrc0 == sdst) { // case 1: all operands contiguous
+            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) { // main loop, two vectors at a time
+                npyv_f32 a0 = npyv_load_f32((const npy_float*)src0);
+                npyv_f32 a1 = npyv_load_f32((const npy_float*)(src0 + vstep));
+                npyv_f32 b0 = npyv_load_f32((const npy_float*)src1);
+                npyv_f32 b1 = npyv_load_f32((const npy_float*)(src1 + vstep));
+                npyv_f32 r0 = npyv_mul_f32(a0, b0);
+                npyv_f32 r1 = npyv_mul_f32(a1, b1);
+                npyv_store_f32((npy_float*)dst, r0);
+                npyv_store_f32((npy_float*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) { // tail: one vector per pass, limited to len elements
+            #if 0 // compiled out for multiply in this case: would fill the unused lanes with 1.0f
+                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
+                npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
+            #else // active: partial load (tillz variant; presumably zero-fills the unused lanes)
+                npyv_f32 a = npyv_load_tillz_f32((const npy_float*)src0, len);
+                npyv_f32 b = npyv_load_tillz_f32((const npy_float*)src1, len);
+            #endif
+                npyv_f32 r = npyv_mul_f32(a, b);
+                npyv_store_till_f32((npy_float*)dst, len, r);
+            }
+        }
+        else if (ssrc0 == 0 && ssrc1 == sizeof(npy_float) && sdst == ssrc1) { // case 2: src0 is a zero-stride scalar
+            npyv_f32 a = npyv_setall_f32(*((npy_float*)src0)); // scalar operand turned into a vector once, outside the loops
+            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
+                npyv_f32 b0 = npyv_load_f32((const npy_float*)src1);
+                npyv_f32 b1 = npyv_load_f32((const npy_float*)(src1 + vstep));
+                npyv_f32 r0 = npyv_mul_f32(a, b0);
+                npyv_f32 r1 = npyv_mul_f32(a, b1);
+                npyv_store_f32((npy_float*)dst, r0);
+                npyv_store_f32((npy_float*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+            #if 0 || 1 // active for multiply: unused lanes of b are filled with 1.0f (presumably so scalar * fill stays benign)
+                npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
+            #else // compiled out for multiply
+                npyv_f32 b = npyv_load_tillz_f32((const npy_float*)src1, len);
+            #endif
+                npyv_f32 r = npyv_mul_f32(a, b);
+                npyv_store_till_f32((npy_float*)dst, len, r);
+            }
+        }
+        else if (ssrc1 == 0 && ssrc0 == sizeof(npy_float) && sdst == ssrc0) { // case 3: src1 is a zero-stride scalar
+            npyv_f32 b = npyv_setall_f32(*((npy_float*)src1)); // scalar operand turned into a vector once, outside the loops
+            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
+                npyv_f32 a0 = npyv_load_f32((const npy_float*)src0);
+                npyv_f32 a1 = npyv_load_f32((const npy_float*)(src0 + vstep));
+                npyv_f32 r0 = npyv_mul_f32(a0, b);
+                npyv_f32 r1 = npyv_mul_f32(a1, b);
+                npyv_store_f32((npy_float*)dst, r0);
+                npyv_store_f32((npy_float*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+            #if 1 // active for multiply: unused lanes of a are filled with 1.0f
+                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
+            #elif 0 // compiled out for multiply
+                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, NPY_NANF);
+            #else // compiled out for multiply
+                npyv_f32 a = npyv_load_tillz_f32((const npy_float*)src0, len);
+            #endif
+                npyv_f32 r = npyv_mul_f32(a, b);
+                npyv_store_till_f32((npy_float*)dst, len, r);
+            }
+        } else { // any other stride combination is left to the scalar loop
+            goto loop_scalar;
+        }
+        npyv_cleanup();
+        return;
+    }
+loop_scalar: // label exists only in the SIMD build; also reached by fallthrough when the guard above fails
+#endif
+    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) { // generic strided loop, one element per iteration
+        const npy_float a = *((npy_float*)src0);
+        const npy_float b = *((npy_float*)src1);
+        *((npy_float*)dst) = a * b;
+    }
+}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_multiply_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* In-place indexed multiply: target[index[k]] *= value[k] for k in [0, dimensions[0]);
+     * negative indices are wrapped by adding steps[3]. Always returns 0. */
+    char *const target = args[0];
+    const char *index_cursor = args[1];
+    const char *value_cursor = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], value_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    npy_intp remaining;
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        const npy_intp raw = *(const npy_intp *)index_cursor;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_float *const slot = (npy_float *)(target + target_stride * pos);
+        *slot = *slot * *(const npy_float *)value_cursor;
+        index_cursor += index_stride;
+        value_cursor += value_stride;
+    }
+    return 0;
+}
+
+
+#line 52
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_divide)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{ // Inner loop for npy_float division: dst[i] = src0[i] / src1[i] over dimensions[0] elements.
+    npy_intp len = dimensions[0]; // number of elements to process
+    char *src0 = args[0], *src1 = args[1], *dst = args[2]; // operand pointers, advanced bytewise
+    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2]; // strides in bytes (added to the char pointers)
+    // reduce: first input and output are the same zero-stride location, so src1 is folded into *src0
+    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
+    #if 0 // compiled out for divide: the pairwise-sum form is not used
+        *((npy_float*)src0) /= FLOAT_pairwise_sum(src1, len, ssrc1);
+    #else // active: sequential accumulation in a local, written back once
+        npy_float acc = *((npy_float*)src0);
+        if (ssrc1 == sizeof(npy_float)) { // contiguous input: constant pointer increment
+            for (; len > 0; --len, src1 += sizeof(npy_float)) {
+                acc /= *(npy_float *)src1;
+            }
+        } else { // arbitrary byte stride
+            for (; len > 0; --len, src1 += ssrc1) {
+                acc /= *(npy_float *)src1;
+            }
+        }
+        *((npy_float*)src0) = acc;
+    #endif
+        return;
+    }
+#if 1 && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64 // live guard for divide: when true, the SIMD branch and its label are not compiled
+    /**
+     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
+     * support for single-precision floating-point division. Only scalar division is
+     * supported natively, and without hardware for performance and accuracy comparison,
+     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
+     * native scalar division.
+     *
+     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
+     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
+     * precision. However, this approach has limitations:
+     *
+     * - It can cause unexpected floating-point overflows in special cases, such as when
+     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
+     *
+     * - The precision may vary between the emulated SIMD and scalar division due to
+     *   non-uniform branches (non-contiguous) in the code, leading to precision
+     *   inconsistencies.
+     *
+     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
+     *   gain may not sufficiently offset these drawbacks.
+     */
+#elif NPY_SIMD_F32 // vectorized path, compiled only when the guard above is false and NPY_SIMD_F32 is non-zero
+    if (len > npyv_nlanes_f32*2 && // more than two vectors' worth of elements
+        !is_mem_overlap(src0, ssrc0, dst, sdst, len) && // is_mem_overlap must report false for src0 vs dst
+        !is_mem_overlap(src1, ssrc1, dst, sdst, len) // ...and for src1 vs dst
+    ) {
+        const int vstep = npyv_nlanes_u8; // byte offset from one vector to the next
+        const int wstep = vstep * 2; // bytes advanced per unrolled iteration (two vectors)
+        const int hstep = npyv_nlanes_f32; // elements handled per vector
+        const int lstep = hstep * 2; // elements handled per unrolled iteration
+        // lots of specializations, to squeeze out max performance
+        if (ssrc0 == sizeof(npy_float) && ssrc0 == ssrc1 && ssrc0 == sdst) { // case 1: all operands contiguous
+            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) { // main loop, two vectors at a time
+                npyv_f32 a0 = npyv_load_f32((const npy_float*)src0);
+                npyv_f32 a1 = npyv_load_f32((const npy_float*)(src0 + vstep));
+                npyv_f32 b0 = npyv_load_f32((const npy_float*)src1);
+                npyv_f32 b1 = npyv_load_f32((const npy_float*)(src1 + vstep));
+                npyv_f32 r0 = npyv_div_f32(a0, b0);
+                npyv_f32 r1 = npyv_div_f32(a1, b1);
+                npyv_store_f32((npy_float*)dst, r0);
+                npyv_store_f32((npy_float*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) { // tail: one vector per pass, limited to len elements
+            #if 1 // active for divide: unused lanes of both operands are filled with 1.0f (presumably so they compute 1/1)
+                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
+                npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
+            #else // compiled out for divide
+                npyv_f32 a = npyv_load_tillz_f32((const npy_float*)src0, len);
+                npyv_f32 b = npyv_load_tillz_f32((const npy_float*)src1, len);
+            #endif
+                npyv_f32 r = npyv_div_f32(a, b);
+                npyv_store_till_f32((npy_float*)dst, len, r);
+            }
+        }
+        else if (ssrc0 == 0 && ssrc1 == sizeof(npy_float) && sdst == ssrc1) { // case 2: dividend src0 is a zero-stride scalar
+            npyv_f32 a = npyv_setall_f32(*((npy_float*)src0)); // scalar operand turned into a vector once, outside the loops
+            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
+                npyv_f32 b0 = npyv_load_f32((const npy_float*)src1);
+                npyv_f32 b1 = npyv_load_f32((const npy_float*)(src1 + vstep));
+                npyv_f32 r0 = npyv_div_f32(a, b0);
+                npyv_f32 r1 = npyv_div_f32(a, b1);
+                npyv_store_f32((npy_float*)dst, r0);
+                npyv_store_f32((npy_float*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+            #if 1 || 0 // active for divide: unused divisor lanes are filled with 1.0f rather than zero
+                npyv_f32 b = npyv_load_till_f32((const npy_float*)src1, len, 1.0f);
+            #else // compiled out for divide
+                npyv_f32 b = npyv_load_tillz_f32((const npy_float*)src1, len);
+            #endif
+                npyv_f32 r = npyv_div_f32(a, b);
+                npyv_store_till_f32((npy_float*)dst, len, r);
+            }
+        }
+        else if (ssrc1 == 0 && ssrc0 == sizeof(npy_float) && sdst == ssrc0) { // case 3: divisor src1 is a zero-stride scalar
+            npyv_f32 b = npyv_setall_f32(*((npy_float*)src1)); // scalar operand turned into a vector once, outside the loops
+            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
+                npyv_f32 a0 = npyv_load_f32((const npy_float*)src0);
+                npyv_f32 a1 = npyv_load_f32((const npy_float*)(src0 + vstep));
+                npyv_f32 r0 = npyv_div_f32(a0, b);
+                npyv_f32 r1 = npyv_div_f32(a1, b);
+                npyv_store_f32((npy_float*)dst, r0);
+                npyv_store_f32((npy_float*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+            #if 0 // compiled out for divide
+                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, 1.0f);
+            #elif 1 // active for divide: unused dividend lanes are filled with NPY_NANF
+                npyv_f32 a = npyv_load_till_f32((const npy_float*)src0, len, NPY_NANF);
+            #else // compiled out for divide
+                npyv_f32 a = npyv_load_tillz_f32((const npy_float*)src0, len);
+            #endif
+                npyv_f32 r = npyv_div_f32(a, b);
+                npyv_store_till_f32((npy_float*)dst, len, r);
+            }
+        } else { // any other stride combination is left to the scalar loop
+            goto loop_scalar;
+        }
+        npyv_cleanup();
+        return;
+    }
+loop_scalar: // label exists only in the SIMD build; also reached by fallthrough when the guard above fails
+#endif
+    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) { // generic strided loop, one element per iteration
+        const npy_float a = *((npy_float*)src0);
+        const npy_float b = *((npy_float*)src1);
+        *((npy_float*)dst) = a / b;
+    }
+}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* In-place indexed divide: target[index[k]] /= value[k] for k in [0, dimensions[0]);
+     * negative indices are wrapped by adding steps[3]. Always returns 0. */
+    char *const target = args[0];
+    const char *index_cursor = args[1];
+    const char *value_cursor = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], value_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    npy_intp remaining;
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        const npy_intp raw = *(const npy_intp *)index_cursor;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_float *const slot = (npy_float *)(target + target_stride * pos);
+        *slot = *slot / *(const npy_float *)value_cursor;
+        index_cursor += index_stride;
+        value_cursor += value_stride;
+    }
+    return 0;
+}
+
+
+
+#line 43
+#line 52
+/*
+ * Inner loop for element-wise addition of npy_double values:
+ * dst[i] = src0[i] + src1[i].
+ *
+ * args       - args[0] and args[1] are the input byte pointers, args[2] is
+ *              the output byte pointer.
+ * dimensions - dimensions[0] is the number of elements to process.
+ * steps      - byte strides for args[0], args[1] and args[2] respectively.
+ *
+ * Three paths are visible below:
+ *   1. an in-place reduction, taken when the first input and the output are
+ *      the same location and both have a zero stride;
+ *   2. a SIMD path (compiled only when NPY_SIMD_F64 is enabled) for the
+ *      contiguous and scalar-broadcast layouts, used when neither input
+ *      overlaps the output according to is_mem_overlap();
+ *   3. a generic strided scalar loop used for everything else.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_add)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    npy_intp len = dimensions[0];
+    char *src0 = args[0], *src1 = args[1], *dst = args[2];
+    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
+    // reduce
+    // (src0 and dst are the same zero-stride accumulator; only src1 advances)
+    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
+    #if 1
+        // For add, the whole of src1 is summed by DOUBLE_pairwise_sum() and
+        // the result is added to the accumulator in one step.
+        *((npy_double*)src0) += DOUBLE_pairwise_sum(src1, len, ssrc1);
+    #else
+        npy_double acc = *((npy_double*)src0);
+        if (ssrc1 == sizeof(npy_double)) {
+            for (; len > 0; --len, src1 += sizeof(npy_double)) {
+                acc += *(npy_double *)src1;
+            }
+        } else {
+            for (; len > 0; --len, src1 += ssrc1) {
+                acc += *(npy_double *)src1;
+            }
+        }
+        *((npy_double*)src0) = acc;
+    #endif
+        return;
+    }
+    // NOTE: the leading `0` makes the condition below always false for add, so
+    // this guard never selects the empty branch here. The comment inside talks
+    // about single-precision division; it appears to be carried over from a
+    // shared template rather than describing this function.
+#if 0 && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64
+    /**
+     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
+     * support for single-precision floating-point division. Only scalar division is
+     * supported natively, and without hardware for performance and accuracy comparison,
+     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
+     * native scalar division.
+     *
+     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
+     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
+     * precision. However, this approach has limitations:
+     *
+     * - It can cause unexpected floating-point overflows in special cases, such as when
+     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
+     *
+     * - The precision may vary between the emulated SIMD and scalar division due to
+     *   non-uniform branches (non-contiguous) in the code, leading to precision
+     *   inconsistencies.
+     *
+     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
+     *   gain may not sufficiently offset these drawbacks.
+     */
+#elif NPY_SIMD_F64
+    // SIMD is attempted only for more than two vectors' worth of elements and
+    // only when neither input is reported as overlapping the output.
+    if (len > npyv_nlanes_f64*2 &&
+        !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
+        !is_mem_overlap(src1, ssrc1, dst, sdst, len)
+    ) {
+        // vstep/wstep are used as byte advances on the char pointers (one and
+        // two vectors); hstep/lstep are the matching element counts taken
+        // off `len`.
+        const int vstep = npyv_nlanes_u8;
+        const int wstep = vstep * 2;
+        const int hstep = npyv_nlanes_f64;
+        const int lstep = hstep * 2;
+        // lots of specializations, to squeeze out max performance
+        // Case 1: both inputs and the output are contiguous doubles.
+        if (ssrc0 == sizeof(npy_double) && ssrc0 == ssrc1 && ssrc0 == sdst) {
+            // Main loop: two vectors per iteration.
+            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
+                npyv_f64 a0 = npyv_load_f64((const npy_double*)src0);
+                npyv_f64 a1 = npyv_load_f64((const npy_double*)(src0 + vstep));
+                npyv_f64 b0 = npyv_load_f64((const npy_double*)src1);
+                npyv_f64 b1 = npyv_load_f64((const npy_double*)(src1 + vstep));
+                npyv_f64 r0 = npyv_add_f64(a0, b0);
+                npyv_f64 r1 = npyv_add_f64(a1, b1);
+                npyv_store_f64((npy_double*)dst, r0);
+                npyv_store_f64((npy_double*)(dst + vstep), r1);
+            }
+            // Tail: one vector per iteration using the length-limited
+            // load/store intrinsics (presumably only the first `len` lanes
+            // are read and written -- confirm against the npyv API).
+            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
+            #if 0
+                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, 1.0);
+                npyv_f64 b = npyv_load_till_f64((const npy_double*)src1, len, 1.0);
+            #else
+                npyv_f64 a = npyv_load_tillz_f64((const npy_double*)src0, len);
+                npyv_f64 b = npyv_load_tillz_f64((const npy_double*)src1, len);
+            #endif
+                npyv_f64 r = npyv_add_f64(a, b);
+                npyv_store_till_f64((npy_double*)dst, len, r);
+            }
+        }
+        // Case 2: first input is a broadcast scalar (zero stride), second
+        // input and output are contiguous.
+        else if (ssrc0 == 0 && ssrc1 == sizeof(npy_double) && sdst == ssrc1) {
+            npyv_f64 a = npyv_setall_f64(*((npy_double*)src0));
+            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
+                npyv_f64 b0 = npyv_load_f64((const npy_double*)src1);
+                npyv_f64 b1 = npyv_load_f64((const npy_double*)(src1 + vstep));
+                npyv_f64 r0 = npyv_add_f64(a, b0);
+                npyv_f64 r1 = npyv_add_f64(a, b1);
+                npyv_store_f64((npy_double*)dst, r0);
+                npyv_store_f64((npy_double*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+            #if 0 || 0
+                npyv_f64 b = npyv_load_till_f64((const npy_double*)src1, len, 1.0);
+            #else
+                npyv_f64 b = npyv_load_tillz_f64((const npy_double*)src1, len);
+            #endif
+                npyv_f64 r = npyv_add_f64(a, b);
+                npyv_store_till_f64((npy_double*)dst, len, r);
+            }
+        }
+        // Case 3: second input is a broadcast scalar (zero stride), first
+        // input and output are contiguous.
+        else if (ssrc1 == 0 && ssrc0 == sizeof(npy_double) && sdst == ssrc0) {
+            npyv_f64 b = npyv_setall_f64(*((npy_double*)src1));
+            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
+                npyv_f64 a0 = npyv_load_f64((const npy_double*)src0);
+                npyv_f64 a1 = npyv_load_f64((const npy_double*)(src0 + vstep));
+                npyv_f64 r0 = npyv_add_f64(a0, b);
+                npyv_f64 r1 = npyv_add_f64(a1, b);
+                npyv_store_f64((npy_double*)dst, r0);
+                npyv_store_f64((npy_double*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+            #if 0
+                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, 1.0);
+            #elif 0
+                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, NPY_NAN);
+            #else
+                npyv_f64 a = npyv_load_tillz_f64((const npy_double*)src0, len);
+            #endif
+                npyv_f64 r = npyv_add_f64(a, b);
+                npyv_store_till_f64((npy_double*)dst, len, r);
+            }
+        } else {
+            // Any other stride combination is handled by the scalar loop.
+            goto loop_scalar;
+        }
+        npyv_cleanup();
+        return;
+    }
+loop_scalar:
+#endif
+    // Generic fallback: one element per iteration with arbitrary byte strides.
+    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
+        const npy_double a = *((npy_double*)src0);
+        const npy_double b = *((npy_double*)src1);
+        *((npy_double*)dst) = a + b;
+    }
+}
+
+/*
+ * Indexed in-place addition for npy_double.
+ *
+ * For each of the dimensions[0] entries, an npy_intp index is read from
+ * args[1] and an npy_double operand from args[2]; the operand is added to the
+ * element of args[0] located at byte offset steps[0] * index. Negative
+ * indices are shifted by steps[3] before use. steps[1] and steps[2] are the
+ * byte strides of the index and operand streams. Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_add_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_double *slot;
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        if (pos < 0) {
+            // wrap a negative index around using the value from steps[3]
+            pos += wrap;
+        }
+        slot = (npy_double *)(base + base_stride * pos);
+        *slot = *slot + *(const npy_double *)val_ptr;
+
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+
+#line 52
+/*
+ * Inner loop for element-wise subtraction of npy_double values:
+ * dst[i] = src0[i] - src1[i].
+ *
+ * args       - args[0] and args[1] are the input byte pointers, args[2] is
+ *              the output byte pointer.
+ * dimensions - dimensions[0] is the number of elements to process.
+ * steps      - byte strides for args[0], args[1] and args[2] respectively.
+ *
+ * Three paths are visible below:
+ *   1. an in-place reduction, taken when the first input and the output are
+ *      the same location and both have a zero stride;
+ *   2. a SIMD path (compiled only when NPY_SIMD_F64 is enabled) for the
+ *      contiguous and scalar-broadcast layouts, used when neither input
+ *      overlaps the output according to is_mem_overlap();
+ *   3. a generic strided scalar loop used for everything else.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_subtract)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    npy_intp len = dimensions[0];
+    char *src0 = args[0], *src1 = args[1], *dst = args[2];
+    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
+    // reduce
+    // (src0 and dst are the same zero-stride accumulator; only src1 advances)
+    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
+    #if 0
+        *((npy_double*)src0) -= DOUBLE_pairwise_sum(src1, len, ssrc1);
+    #else
+        // The pairwise-sum variant above is compiled out for subtract; the
+        // accumulator is updated sequentially, with a separate loop for the
+        // contiguous src1 case.
+        npy_double acc = *((npy_double*)src0);
+        if (ssrc1 == sizeof(npy_double)) {
+            for (; len > 0; --len, src1 += sizeof(npy_double)) {
+                acc -= *(npy_double *)src1;
+            }
+        } else {
+            for (; len > 0; --len, src1 += ssrc1) {
+                acc -= *(npy_double *)src1;
+            }
+        }
+        *((npy_double*)src0) = acc;
+    #endif
+        return;
+    }
+    // NOTE: the leading `0` makes the condition below always false for
+    // subtract, so this guard never selects the empty branch here. The comment
+    // inside talks about single-precision division; it appears to be carried
+    // over from a shared template rather than describing this function.
+#if 0 && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64
+    /**
+     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
+     * support for single-precision floating-point division. Only scalar division is
+     * supported natively, and without hardware for performance and accuracy comparison,
+     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
+     * native scalar division.
+     *
+     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
+     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
+     * precision. However, this approach has limitations:
+     *
+     * - It can cause unexpected floating-point overflows in special cases, such as when
+     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
+     *
+     * - The precision may vary between the emulated SIMD and scalar division due to
+     *   non-uniform branches (non-contiguous) in the code, leading to precision
+     *   inconsistencies.
+     *
+     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
+     *   gain may not sufficiently offset these drawbacks.
+     */
+#elif NPY_SIMD_F64
+    // SIMD is attempted only for more than two vectors' worth of elements and
+    // only when neither input is reported as overlapping the output.
+    if (len > npyv_nlanes_f64*2 &&
+        !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
+        !is_mem_overlap(src1, ssrc1, dst, sdst, len)
+    ) {
+        // vstep/wstep are used as byte advances on the char pointers (one and
+        // two vectors); hstep/lstep are the matching element counts taken
+        // off `len`.
+        const int vstep = npyv_nlanes_u8;
+        const int wstep = vstep * 2;
+        const int hstep = npyv_nlanes_f64;
+        const int lstep = hstep * 2;
+        // lots of specializations, to squeeze out max performance
+        // Case 1: both inputs and the output are contiguous doubles.
+        if (ssrc0 == sizeof(npy_double) && ssrc0 == ssrc1 && ssrc0 == sdst) {
+            // Main loop: two vectors per iteration.
+            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
+                npyv_f64 a0 = npyv_load_f64((const npy_double*)src0);
+                npyv_f64 a1 = npyv_load_f64((const npy_double*)(src0 + vstep));
+                npyv_f64 b0 = npyv_load_f64((const npy_double*)src1);
+                npyv_f64 b1 = npyv_load_f64((const npy_double*)(src1 + vstep));
+                npyv_f64 r0 = npyv_sub_f64(a0, b0);
+                npyv_f64 r1 = npyv_sub_f64(a1, b1);
+                npyv_store_f64((npy_double*)dst, r0);
+                npyv_store_f64((npy_double*)(dst + vstep), r1);
+            }
+            // Tail: one vector per iteration using the length-limited
+            // load/store intrinsics (presumably only the first `len` lanes
+            // are read and written -- confirm against the npyv API).
+            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
+            #if 0
+                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, 1.0);
+                npyv_f64 b = npyv_load_till_f64((const npy_double*)src1, len, 1.0);
+            #else
+                npyv_f64 a = npyv_load_tillz_f64((const npy_double*)src0, len);
+                npyv_f64 b = npyv_load_tillz_f64((const npy_double*)src1, len);
+            #endif
+                npyv_f64 r = npyv_sub_f64(a, b);
+                npyv_store_till_f64((npy_double*)dst, len, r);
+            }
+        }
+        // Case 2: first input is a broadcast scalar (zero stride), second
+        // input and output are contiguous.
+        else if (ssrc0 == 0 && ssrc1 == sizeof(npy_double) && sdst == ssrc1) {
+            npyv_f64 a = npyv_setall_f64(*((npy_double*)src0));
+            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
+                npyv_f64 b0 = npyv_load_f64((const npy_double*)src1);
+                npyv_f64 b1 = npyv_load_f64((const npy_double*)(src1 + vstep));
+                npyv_f64 r0 = npyv_sub_f64(a, b0);
+                npyv_f64 r1 = npyv_sub_f64(a, b1);
+                npyv_store_f64((npy_double*)dst, r0);
+                npyv_store_f64((npy_double*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+            #if 0 || 0
+                npyv_f64 b = npyv_load_till_f64((const npy_double*)src1, len, 1.0);
+            #else
+                npyv_f64 b = npyv_load_tillz_f64((const npy_double*)src1, len);
+            #endif
+                npyv_f64 r = npyv_sub_f64(a, b);
+                npyv_store_till_f64((npy_double*)dst, len, r);
+            }
+        }
+        // Case 3: second input is a broadcast scalar (zero stride), first
+        // input and output are contiguous.
+        else if (ssrc1 == 0 && ssrc0 == sizeof(npy_double) && sdst == ssrc0) {
+            npyv_f64 b = npyv_setall_f64(*((npy_double*)src1));
+            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
+                npyv_f64 a0 = npyv_load_f64((const npy_double*)src0);
+                npyv_f64 a1 = npyv_load_f64((const npy_double*)(src0 + vstep));
+                npyv_f64 r0 = npyv_sub_f64(a0, b);
+                npyv_f64 r1 = npyv_sub_f64(a1, b);
+                npyv_store_f64((npy_double*)dst, r0);
+                npyv_store_f64((npy_double*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+            #if 0
+                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, 1.0);
+            #elif 0
+                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, NPY_NAN);
+            #else
+                npyv_f64 a = npyv_load_tillz_f64((const npy_double*)src0, len);
+            #endif
+                npyv_f64 r = npyv_sub_f64(a, b);
+                npyv_store_till_f64((npy_double*)dst, len, r);
+            }
+        } else {
+            // Any other stride combination is handled by the scalar loop.
+            goto loop_scalar;
+        }
+        npyv_cleanup();
+        return;
+    }
+loop_scalar:
+#endif
+    // Generic fallback: one element per iteration with arbitrary byte strides.
+    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
+        const npy_double a = *((npy_double*)src0);
+        const npy_double b = *((npy_double*)src1);
+        *((npy_double*)dst) = a - b;
+    }
+}
+
+/*
+ * Indexed in-place subtraction for npy_double.
+ *
+ * For each of the dimensions[0] entries, an npy_intp index is read from
+ * args[1] and an npy_double operand from args[2]; the operand is subtracted
+ * from the element of args[0] located at byte offset steps[0] * index.
+ * Negative indices are shifted by steps[3] before use. steps[1] and steps[2]
+ * are the byte strides of the index and operand streams. Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_subtract_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_double *slot;
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        if (pos < 0) {
+            // wrap a negative index around using the value from steps[3]
+            pos += wrap;
+        }
+        slot = (npy_double *)(base + base_stride * pos);
+        *slot = *slot - *(const npy_double *)val_ptr;
+
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+
+#line 52
+/*
+ * Inner loop for element-wise multiplication of npy_double values:
+ * dst[i] = src0[i] * src1[i].
+ *
+ * args       - args[0] and args[1] are the input byte pointers, args[2] is
+ *              the output byte pointer.
+ * dimensions - dimensions[0] is the number of elements to process.
+ * steps      - byte strides for args[0], args[1] and args[2] respectively.
+ *
+ * Three paths are visible below:
+ *   1. an in-place reduction, taken when the first input and the output are
+ *      the same location and both have a zero stride;
+ *   2. a SIMD path (compiled only when NPY_SIMD_F64 is enabled) for the
+ *      contiguous and scalar-broadcast layouts, used when neither input
+ *      overlaps the output according to is_mem_overlap();
+ *   3. a generic strided scalar loop used for everything else.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_multiply)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    npy_intp len = dimensions[0];
+    char *src0 = args[0], *src1 = args[1], *dst = args[2];
+    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
+    // reduce
+    // (src0 and dst are the same zero-stride accumulator; only src1 advances)
+    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
+    #if 0
+        *((npy_double*)src0) *= DOUBLE_pairwise_sum(src1, len, ssrc1);
+    #else
+        // The pairwise-sum variant above is compiled out for multiply; the
+        // accumulator is updated sequentially, with a separate loop for the
+        // contiguous src1 case.
+        npy_double acc = *((npy_double*)src0);
+        if (ssrc1 == sizeof(npy_double)) {
+            for (; len > 0; --len, src1 += sizeof(npy_double)) {
+                acc *= *(npy_double *)src1;
+            }
+        } else {
+            for (; len > 0; --len, src1 += ssrc1) {
+                acc *= *(npy_double *)src1;
+            }
+        }
+        *((npy_double*)src0) = acc;
+    #endif
+        return;
+    }
+    // NOTE: the leading `0` makes the condition below always false for
+    // multiply, so this guard never selects the empty branch here. The comment
+    // inside talks about single-precision division; it appears to be carried
+    // over from a shared template rather than describing this function.
+#if 0 && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64
+    /**
+     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
+     * support for single-precision floating-point division. Only scalar division is
+     * supported natively, and without hardware for performance and accuracy comparison,
+     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
+     * native scalar division.
+     *
+     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
+     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
+     * precision. However, this approach has limitations:
+     *
+     * - It can cause unexpected floating-point overflows in special cases, such as when
+     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
+     *
+     * - The precision may vary between the emulated SIMD and scalar division due to
+     *   non-uniform branches (non-contiguous) in the code, leading to precision
+     *   inconsistencies.
+     *
+     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
+     *   gain may not sufficiently offset these drawbacks.
+     */
+#elif NPY_SIMD_F64
+    // SIMD is attempted only for more than two vectors' worth of elements and
+    // only when neither input is reported as overlapping the output.
+    if (len > npyv_nlanes_f64*2 &&
+        !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
+        !is_mem_overlap(src1, ssrc1, dst, sdst, len)
+    ) {
+        // vstep/wstep are used as byte advances on the char pointers (one and
+        // two vectors); hstep/lstep are the matching element counts taken
+        // off `len`.
+        const int vstep = npyv_nlanes_u8;
+        const int wstep = vstep * 2;
+        const int hstep = npyv_nlanes_f64;
+        const int lstep = hstep * 2;
+        // lots of specializations, to squeeze out max performance
+        // Case 1: both inputs and the output are contiguous doubles.
+        if (ssrc0 == sizeof(npy_double) && ssrc0 == ssrc1 && ssrc0 == sdst) {
+            // Main loop: two vectors per iteration.
+            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
+                npyv_f64 a0 = npyv_load_f64((const npy_double*)src0);
+                npyv_f64 a1 = npyv_load_f64((const npy_double*)(src0 + vstep));
+                npyv_f64 b0 = npyv_load_f64((const npy_double*)src1);
+                npyv_f64 b1 = npyv_load_f64((const npy_double*)(src1 + vstep));
+                npyv_f64 r0 = npyv_mul_f64(a0, b0);
+                npyv_f64 r1 = npyv_mul_f64(a1, b1);
+                npyv_store_f64((npy_double*)dst, r0);
+                npyv_store_f64((npy_double*)(dst + vstep), r1);
+            }
+            // Tail: one vector per iteration using the length-limited
+            // load/store intrinsics (presumably only the first `len` lanes
+            // are read and written -- confirm against the npyv API).
+            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
+            #if 0
+                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, 1.0);
+                npyv_f64 b = npyv_load_till_f64((const npy_double*)src1, len, 1.0);
+            #else
+                npyv_f64 a = npyv_load_tillz_f64((const npy_double*)src0, len);
+                npyv_f64 b = npyv_load_tillz_f64((const npy_double*)src1, len);
+            #endif
+                npyv_f64 r = npyv_mul_f64(a, b);
+                npyv_store_till_f64((npy_double*)dst, len, r);
+            }
+        }
+        // Case 2: first input is a broadcast scalar (zero stride), second
+        // input and output are contiguous.
+        else if (ssrc0 == 0 && ssrc1 == sizeof(npy_double) && sdst == ssrc1) {
+            npyv_f64 a = npyv_setall_f64(*((npy_double*)src0));
+            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
+                npyv_f64 b0 = npyv_load_f64((const npy_double*)src1);
+                npyv_f64 b1 = npyv_load_f64((const npy_double*)(src1 + vstep));
+                npyv_f64 r0 = npyv_mul_f64(a, b0);
+                npyv_f64 r1 = npyv_mul_f64(a, b1);
+                npyv_store_f64((npy_double*)dst, r0);
+                npyv_store_f64((npy_double*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+            // For multiply the tail load passes 1.0 as its extra argument
+            // (presumably the fill value for lanes beyond `len` -- confirm
+            // against the npyv API).
+            #if 0 || 1
+                npyv_f64 b = npyv_load_till_f64((const npy_double*)src1, len, 1.0);
+            #else
+                npyv_f64 b = npyv_load_tillz_f64((const npy_double*)src1, len);
+            #endif
+                npyv_f64 r = npyv_mul_f64(a, b);
+                npyv_store_till_f64((npy_double*)dst, len, r);
+            }
+        }
+        // Case 3: second input is a broadcast scalar (zero stride), first
+        // input and output are contiguous.
+        else if (ssrc1 == 0 && ssrc0 == sizeof(npy_double) && sdst == ssrc0) {
+            npyv_f64 b = npyv_setall_f64(*((npy_double*)src1));
+            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
+                npyv_f64 a0 = npyv_load_f64((const npy_double*)src0);
+                npyv_f64 a1 = npyv_load_f64((const npy_double*)(src0 + vstep));
+                npyv_f64 r0 = npyv_mul_f64(a0, b);
+                npyv_f64 r1 = npyv_mul_f64(a1, b);
+                npyv_store_f64((npy_double*)dst, r0);
+                npyv_store_f64((npy_double*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+            // As in case 2, the 1.0-argument variant of the tail load is the
+            // one compiled in for multiply.
+            #if 1
+                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, 1.0);
+            #elif 0
+                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, NPY_NAN);
+            #else
+                npyv_f64 a = npyv_load_tillz_f64((const npy_double*)src0, len);
+            #endif
+                npyv_f64 r = npyv_mul_f64(a, b);
+                npyv_store_till_f64((npy_double*)dst, len, r);
+            }
+        } else {
+            // Any other stride combination is handled by the scalar loop.
+            goto loop_scalar;
+        }
+        npyv_cleanup();
+        return;
+    }
+loop_scalar:
+#endif
+    // Generic fallback: one element per iteration with arbitrary byte strides.
+    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
+        const npy_double a = *((npy_double*)src0);
+        const npy_double b = *((npy_double*)src1);
+        *((npy_double*)dst) = a * b;
+    }
+}
+
+/*
+ * Indexed in-place multiplication for npy_double.
+ *
+ * For each of the dimensions[0] entries, an npy_intp index is read from
+ * args[1] and an npy_double operand from args[2]; the element of args[0]
+ * located at byte offset steps[0] * index is multiplied by the operand.
+ * Negative indices are shifted by steps[3] before use. steps[1] and steps[2]
+ * are the byte strides of the index and operand streams. Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_multiply_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_double *slot;
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        if (pos < 0) {
+            // wrap a negative index around using the value from steps[3]
+            pos += wrap;
+        }
+        slot = (npy_double *)(base + base_stride * pos);
+        *slot = *slot * *(const npy_double *)val_ptr;
+
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+
+#line 52
+/*
+ * Inner loop for element-wise division of npy_double values:
+ * dst[i] = src0[i] / src1[i].
+ *
+ * args       - args[0] and args[1] are the input byte pointers, args[2] is
+ *              the output byte pointer.
+ * dimensions - dimensions[0] is the number of elements to process.
+ * steps      - byte strides for args[0], args[1] and args[2] respectively.
+ *
+ * Three paths are visible below:
+ *   1. an in-place reduction, taken when the first input and the output are
+ *      the same location and both have a zero stride;
+ *   2. a SIMD path (compiled only when NPY_SIMD_F64 is enabled) for the
+ *      contiguous and scalar-broadcast layouts, used when neither input
+ *      overlaps the output according to is_mem_overlap();
+ *   3. a generic strided scalar loop used for everything else.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_divide)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    npy_intp len = dimensions[0];
+    char *src0 = args[0], *src1 = args[1], *dst = args[2];
+    npy_intp ssrc0 = steps[0], ssrc1 = steps[1], sdst = steps[2];
+    // reduce
+    // (src0 and dst are the same zero-stride accumulator; only src1 advances)
+    if (ssrc0 == 0 && ssrc0 == sdst && src0 == dst) {
+    #if 0
+        *((npy_double*)src0) /= DOUBLE_pairwise_sum(src1, len, ssrc1);
+    #else
+        // The pairwise-sum variant above is compiled out for divide; the
+        // accumulator is updated sequentially, with a separate loop for the
+        // contiguous src1 case.
+        npy_double acc = *((npy_double*)src0);
+        if (ssrc1 == sizeof(npy_double)) {
+            for (; len > 0; --len, src1 += sizeof(npy_double)) {
+                acc /= *(npy_double *)src1;
+            }
+        } else {
+            for (; len > 0; --len, src1 += ssrc1) {
+                acc /= *(npy_double *)src1;
+            }
+        }
+        *((npy_double*)src0) = acc;
+    #endif
+        return;
+    }
+    // For divide the leading `1` leaves this guard active: when NPY_HAVE_NEON
+    // is defined and NPY_SIMD_F64 is 0, the empty branch is selected and
+    // execution falls straight through to the scalar loop at the bottom.
+    // NOTE(review): the comment inside discusses single-precision division and
+    // `npyv_div_f32`; the wording looks carried over from the float variant.
+#if 1 && defined(NPY_HAVE_NEON) && !NPY_SIMD_F64
+    /**
+     * The SIMD branch is disabled on armhf(armv7) due to the absence of native SIMD
+     * support for single-precision floating-point division. Only scalar division is
+     * supported natively, and without hardware for performance and accuracy comparison,
+     * it's challenging to evaluate the benefits of emulated SIMD intrinsic versus
+     * native scalar division.
+     *
+     * The `npyv_div_f32` universal intrinsic emulates the division operation using an
+     * approximate reciprocal combined with 3 Newton-Raphson iterations for enhanced
+     * precision. However, this approach has limitations:
+     *
+     * - It can cause unexpected floating-point overflows in special cases, such as when
+     *   the divisor is subnormal (refer: https://github.com/numpy/numpy/issues/25097).
+     *
+     * - The precision may vary between the emulated SIMD and scalar division due to
+     *   non-uniform branches (non-contiguous) in the code, leading to precision
+     *   inconsistencies.
+     *
+     * - Considering the necessity of multiple Newton-Raphson iterations, the performance
+     *   gain may not sufficiently offset these drawbacks.
+     */
+#elif NPY_SIMD_F64
+    // SIMD is attempted only for more than two vectors' worth of elements and
+    // only when neither input is reported as overlapping the output.
+    if (len > npyv_nlanes_f64*2 &&
+        !is_mem_overlap(src0, ssrc0, dst, sdst, len) &&
+        !is_mem_overlap(src1, ssrc1, dst, sdst, len)
+    ) {
+        // vstep/wstep are used as byte advances on the char pointers (one and
+        // two vectors); hstep/lstep are the matching element counts taken
+        // off `len`.
+        const int vstep = npyv_nlanes_u8;
+        const int wstep = vstep * 2;
+        const int hstep = npyv_nlanes_f64;
+        const int lstep = hstep * 2;
+        // lots of specializations, to squeeze out max performance
+        // Case 1: both inputs and the output are contiguous doubles.
+        if (ssrc0 == sizeof(npy_double) && ssrc0 == ssrc1 && ssrc0 == sdst) {
+            // Main loop: two vectors per iteration.
+            for (; len >= lstep; len -= lstep, src0 += wstep, src1 += wstep, dst += wstep) {
+                npyv_f64 a0 = npyv_load_f64((const npy_double*)src0);
+                npyv_f64 a1 = npyv_load_f64((const npy_double*)(src0 + vstep));
+                npyv_f64 b0 = npyv_load_f64((const npy_double*)src1);
+                npyv_f64 b1 = npyv_load_f64((const npy_double*)(src1 + vstep));
+                npyv_f64 r0 = npyv_div_f64(a0, b0);
+                npyv_f64 r1 = npyv_div_f64(a1, b1);
+                npyv_store_f64((npy_double*)dst, r0);
+                npyv_store_f64((npy_double*)(dst + vstep), r1);
+            }
+            // Tail: one vector per iteration using the length-limited
+            // load/store intrinsics. For divide both operands are loaded with
+            // 1.0 as the extra argument (presumably the fill value for lanes
+            // beyond `len`, so those lanes compute 1.0/1.0 rather than 0/0 --
+            // confirm against the npyv API).
+            for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
+            #if 1
+                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, 1.0);
+                npyv_f64 b = npyv_load_till_f64((const npy_double*)src1, len, 1.0);
+            #else
+                npyv_f64 a = npyv_load_tillz_f64((const npy_double*)src0, len);
+                npyv_f64 b = npyv_load_tillz_f64((const npy_double*)src1, len);
+            #endif
+                npyv_f64 r = npyv_div_f64(a, b);
+                npyv_store_till_f64((npy_double*)dst, len, r);
+            }
+        }
+        // Case 2: first input is a broadcast scalar (zero stride), second
+        // input and output are contiguous.
+        else if (ssrc0 == 0 && ssrc1 == sizeof(npy_double) && sdst == ssrc1) {
+            npyv_f64 a = npyv_setall_f64(*((npy_double*)src0));
+            for (; len >= lstep; len -= lstep, src1 += wstep, dst += wstep) {
+                npyv_f64 b0 = npyv_load_f64((const npy_double*)src1);
+                npyv_f64 b1 = npyv_load_f64((const npy_double*)(src1 + vstep));
+                npyv_f64 r0 = npyv_div_f64(a, b0);
+                npyv_f64 r1 = npyv_div_f64(a, b1);
+                npyv_store_f64((npy_double*)dst, r0);
+                npyv_store_f64((npy_double*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+            // The divisor tail load uses the 1.0-argument variant here.
+            #if 1 || 0
+                npyv_f64 b = npyv_load_till_f64((const npy_double*)src1, len, 1.0);
+            #else
+                npyv_f64 b = npyv_load_tillz_f64((const npy_double*)src1, len);
+            #endif
+                npyv_f64 r = npyv_div_f64(a, b);
+                npyv_store_till_f64((npy_double*)dst, len, r);
+            }
+        }
+        // Case 3: second input is a broadcast scalar (zero stride), first
+        // input and output are contiguous.
+        else if (ssrc1 == 0 && ssrc0 == sizeof(npy_double) && sdst == ssrc0) {
+            npyv_f64 b = npyv_setall_f64(*((npy_double*)src1));
+            for (; len >= lstep; len -= lstep, src0 += wstep, dst += wstep) {
+                npyv_f64 a0 = npyv_load_f64((const npy_double*)src0);
+                npyv_f64 a1 = npyv_load_f64((const npy_double*)(src0 + vstep));
+                npyv_f64 r0 = npyv_div_f64(a0, b);
+                npyv_f64 r1 = npyv_div_f64(a1, b);
+                npyv_store_f64((npy_double*)dst, r0);
+                npyv_store_f64((npy_double*)(dst + vstep), r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+            // The dividend tail load passes NPY_NAN as its extra argument
+            // (the scalar divisor may be anything, including zero).
+            // NOTE(review): presumably NaN is chosen so the unused lanes do
+            // not raise floating-point exceptions -- confirm.
+            #if 0
+                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, 1.0);
+            #elif 1
+                npyv_f64 a = npyv_load_till_f64((const npy_double*)src0, len, NPY_NAN);
+            #else
+                npyv_f64 a = npyv_load_tillz_f64((const npy_double*)src0, len);
+            #endif
+                npyv_f64 r = npyv_div_f64(a, b);
+                npyv_store_till_f64((npy_double*)dst, len, r);
+            }
+        } else {
+            // Any other stride combination is handled by the scalar loop.
+            goto loop_scalar;
+        }
+        npyv_cleanup();
+        return;
+    }
+loop_scalar:
+#endif
+    // Generic fallback: one element per iteration with arbitrary byte strides.
+    for (; len > 0; --len, src0 += ssrc0, src1 += ssrc1, dst += sdst) {
+        const npy_double a = *((npy_double*)src0);
+        const npy_double b = *((npy_double*)src1);
+        *((npy_double*)dst) = a / b;
+    }
+}
+
+/*
+ * Indexed in-place division for npy_double.
+ *
+ * For each of the dimensions[0] entries, an npy_intp index is read from
+ * args[1] and an npy_double operand from args[2]; the element of args[0]
+ * located at byte offset steps[0] * index is divided by the operand.
+ * Negative indices are shifted by steps[3] before use. steps[1] and steps[2]
+ * are the byte strides of the index and operand streams. Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining > 0; --remaining) {
+        npy_double *slot;
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        if (pos < 0) {
+            // wrap a negative index around using the value from steps[3]
+            pos += wrap;
+        }
+        slot = (npy_double *)(base + base_stride * pos);
+        *slot = *slot / *(const npy_double *)val_ptr;
+
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+
+
+
+//###############################################################################
+//## Complex Single/Double precision
+//###############################################################################
+
+/********************************************************************************
+ ** op intrinsics
+ ********************************************************************************/
+
+#if NPY_SIMD_F32
+// Reads the 64 bits at `a` (two consecutive floats) as a single u64,
+// broadcasts that value across a vector and returns the same vector in both
+// halves of an npyv_f32x2.
+NPY_FINLINE npyv_f32x2 simd_set2_f32(const float *a)
+{
+    const npy_uint64 packed = *(npy_uint64*)a;
+    npyv_f32x2 pair;
+    pair.val[0] = npyv_reinterpret_f32_u64(npyv_setall_u64(packed));
+    pair.val[1] = pair.val[0];
+    return pair;
+}
+
+// XORs `x` with a mask that has one float sign bit set in every 64-bit lane.
+// Which 32-bit half of the lane carries the bit depends on
+// NPY_SIMD_BIGENDIAN (presumably so that it always lands on the imaginary
+// component of an interleaved re/im pair -- confirm against callers).
+NPY_FINLINE npyv_f32
+simd_cconjugate_f32(npyv_f32 x)
+{
+#if NPY_SIMD_BIGENDIAN
+    const npyv_u64 sign_bits = npyv_setall_u64(0x80000000);
+#else
+    const npyv_u64 sign_bits = npyv_setall_u64(0x8000000000000000ULL);
+#endif
+    return npyv_xor_f32(x, npyv_reinterpret_f32_u64(sign_bits));
+}
+
+// Multiplies the packed values in `a` and `b` lane-pair by lane-pair.
+// `a` is split into a vector of duplicated even lanes and a vector of
+// duplicated odd lanes, `b` has each adjacent lane pair swapped, and the
+// partial products are merged with npyv_muladdsub_f32.
+// NOTE(review): assumes interleaved (re, im) layout, i.e. a complex multiply.
+NPY_FINLINE npyv_f32
+simd_cmul_f32(npyv_f32 a, npyv_f32 b)
+{
+    const npyv_f32 a_real = npyv_permi128_f32(a, 0, 0, 2, 2);
+    const npyv_f32 a_imag = npyv_permi128_f32(a, 1, 1, 3, 3);
+    const npyv_f32 b_swapped = npyv_permi128_f32(b, 1, 0, 3, 2);
+    // per pair: a_im * b_im, a_im * b_re
+    const npyv_f32 cross = npyv_mul_f32(a_imag, b_swapped);
+    return npyv_muladdsub_f32(a_real, b, cross);
+}
+
+// Squares `x` by passing it as both operands of simd_cmul_f32.
+NPY_FINLINE npyv_f32
+simd_csquare_f32(npyv_f32 x)
+{
+    const npyv_f32 squared = simd_cmul_f32(x, x);
+    return squared;
+}
+#endif
+
+#if NPY_SIMD_F64
+
+// Broadcasts a[0] and a[1] into two separate vectors and returns the result
+// of zipping them together with npyv_zip_f64.
+NPY_FINLINE npyv_f64x2 simd_set2_f64(const double *a)
+{
+    const npyv_f64 first_all = npyv_setall_f64(a[0]);
+    const npyv_f64 second_all = npyv_setall_f64(a[1]);
+    const npyv_f64x2 zipped = npyv_zip_f64(first_all, second_all);
+    return zipped;
+}
+
+// XORs `x` with a mask whose 64-bit lanes alternate between 0 and the double
+// sign bit, so the sign of every odd-indexed lane is flipped
+// (presumably the imaginary components of interleaved re/im pairs).
+NPY_FINLINE npyv_f64
+simd_cconjugate_f64(npyv_f64 x)
+{
+    // Arguments come in (0, sign-bit) pairs: 32 values in total.
+    const npyv_u64 odd_lane_sign = npyv_set_u64(
+       0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+       0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+       0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+       0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+       0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+       0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+       0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL,
+       0, 0x8000000000000000ULL, 0, 0x8000000000000000ULL
+    );
+    const npyv_f64 flip = npyv_reinterpret_f64_u64(odd_lane_sign);
+    return npyv_xor_f64(x, flip);
+}
+
+// Multiplies the packed values in `a` and `b` lane-pair by lane-pair.
+// `a` is split into a vector of duplicated even lanes and a vector of
+// duplicated odd lanes, `b` has each adjacent lane pair swapped, and the
+// partial products are merged with npyv_muladdsub_f64.
+// NOTE(review): assumes interleaved (re, im) layout, i.e. a complex multiply.
+NPY_FINLINE npyv_f64
+simd_cmul_f64(npyv_f64 a, npyv_f64 b)
+{
+    const npyv_f64 a_real = npyv_permi128_f64(a, 0, 0);
+    const npyv_f64 a_imag = npyv_permi128_f64(a, 1, 1);
+    const npyv_f64 b_swapped = npyv_permi128_f64(b, 1, 0);
+    // per pair: a_im * b_im, a_im * b_re
+    const npyv_f64 cross = npyv_mul_f64(a_imag, b_swapped);
+    return npyv_muladdsub_f64(a_real, b, cross);
+}
+
+// Squares `x` by passing it as both operands of simd_cmul_f64.
+NPY_FINLINE npyv_f64
+simd_csquare_f64(npyv_f64 x)
+{
+    const npyv_f64 squared = simd_cmul_f64(x, x);
+    return squared;
+}
+#endif
+
+#line 310
+#if NPY_SIMD_F32
+/*
+ * Magnitude of packed complex values given separate real (`re`) and
+ * imaginary (`im`) vectors.
+ *
+ * The result is computed as larger * sqrt(1 + (smaller / larger)^2), where
+ * larger/smaller are the max/min of |re| and |im| after the inf/nan
+ * normalisation steps below.
+ */
+NPY_FINLINE npyv_f32
+simd_cabsolute_f32(npyv_f32 re, npyv_f32 im)
+{
+    const npyv_f32 v_inf = npyv_setall_f32(NPY_INFINITYF);
+    const npyv_f32 v_nan = npyv_setall_f32(NPY_NANF);
+    const npyv_f32 v_one = npyv_setall_f32(1.0f);
+
+    npyv_f32 mag_re = npyv_abs_f32(re);
+    npyv_f32 mag_im = npyv_abs_f32(im);
+
+    // An infinite component makes the other component infinite as well,
+    // which covers inf + j*nan and nan + j*inf.
+    const npyv_b32 re_is_inf = npyv_cmpeq_f32(mag_re, v_inf);
+    const npyv_b32 im_is_inf = npyv_cmpeq_f32(mag_im, v_inf);
+    mag_im = npyv_select_f32(re_is_inf, v_inf, mag_im);
+    mag_re = npyv_select_f32(im_is_inf, v_inf, mag_re);
+
+    // A remaining NaN component makes the other component NaN as well,
+    // which covers x + j*nan and nan + j*x.
+    const npyv_b32 re_not_nan = npyv_notnan_f32(mag_re);
+    const npyv_b32 im_not_nan = npyv_notnan_f32(mag_im);
+    mag_im = npyv_select_f32(re_not_nan, mag_im, v_nan);
+    mag_re = npyv_select_f32(im_not_nan, mag_re, v_nan);
+
+    // Operand order of max/min is kept exactly as (re, im) and (im, re).
+    const npyv_f32 hi = npyv_max_f32(mag_re, mag_im);
+    const npyv_f32 lo = npyv_min_f32(mag_im, mag_re);
+
+    // Lanes where hi == 0 or lo == inf are excluded from the division so
+    // that 0./0. and inf/inf are never evaluated.
+    const npyv_b32 hi_is_zero = npyv_cmpeq_f32(hi, npyv_zero_f32());
+    const npyv_b32 lo_is_inf = npyv_cmpeq_f32(lo, v_inf);
+    const npyv_b32 can_divide = npyv_not_b32(npyv_or_b32(hi_is_zero, lo_is_inf));
+
+    const npyv_f32 quotient = npyv_ifdivz_f32(can_divide, lo, hi);
+    const npyv_f32 scale = npyv_sqrt_f32(npyv_muladd_f32(quotient, quotient, v_one));
+    return npyv_mul_f32(scale, hi);
+}
+#endif // VECTOR
+
+#line 310
+#if NPY_SIMD_F64
+/*
+ * Magnitude of packed complex values given separate real (`re`) and
+ * imaginary (`im`) vectors.
+ *
+ * The result is computed as larger * sqrt(1 + (smaller / larger)^2), where
+ * larger/smaller are the max/min of |re| and |im| after the inf/nan
+ * normalisation steps below.
+ */
+NPY_FINLINE npyv_f64
+simd_cabsolute_f64(npyv_f64 re, npyv_f64 im)
+{
+    const npyv_f64 v_inf = npyv_setall_f64(NPY_INFINITY);
+    const npyv_f64 v_nan = npyv_setall_f64(NPY_NAN);
+    const npyv_f64 v_one = npyv_setall_f64(1.0);
+
+    npyv_f64 mag_re = npyv_abs_f64(re);
+    npyv_f64 mag_im = npyv_abs_f64(im);
+
+    // An infinite component makes the other component infinite as well,
+    // which covers inf + j*nan and nan + j*inf.
+    const npyv_b64 re_is_inf = npyv_cmpeq_f64(mag_re, v_inf);
+    const npyv_b64 im_is_inf = npyv_cmpeq_f64(mag_im, v_inf);
+    mag_im = npyv_select_f64(re_is_inf, v_inf, mag_im);
+    mag_re = npyv_select_f64(im_is_inf, v_inf, mag_re);
+
+    // A remaining NaN component makes the other component NaN as well,
+    // which covers x + j*nan and nan + j*x.
+    const npyv_b64 re_not_nan = npyv_notnan_f64(mag_re);
+    const npyv_b64 im_not_nan = npyv_notnan_f64(mag_im);
+    mag_im = npyv_select_f64(re_not_nan, mag_im, v_nan);
+    mag_re = npyv_select_f64(im_not_nan, mag_re, v_nan);
+
+    // Operand order of max/min is kept exactly as (re, im) and (im, re).
+    const npyv_f64 hi = npyv_max_f64(mag_re, mag_im);
+    const npyv_f64 lo = npyv_min_f64(mag_im, mag_re);
+
+    // Lanes where hi == 0 or lo == inf are excluded from the division so
+    // that 0./0. and inf/inf are never evaluated.
+    const npyv_b64 hi_is_zero = npyv_cmpeq_f64(hi, npyv_zero_f64());
+    const npyv_b64 lo_is_inf = npyv_cmpeq_f64(lo, v_inf);
+    const npyv_b64 can_divide = npyv_not_b64(npyv_or_b64(hi_is_zero, lo_is_inf));
+
+    const npyv_f64 quotient = npyv_ifdivz_f64(can_divide, lo, hi);
+    const npyv_f64 scale = npyv_sqrt_f64(npyv_muladd_f64(quotient, quotient, v_one));
+    return npyv_mul_f64(scale, hi);
+}
+#endif // VECTOR
+
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+#line 366
+#line 374
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CFLOAT_add)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // Inner loop for complex-float addition: args[2][i] = args[0][i] + args[1][i].
+    npy_intp len = dimensions[0];  // number of complex elements to process
+    char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];  // "b_" names are byte-addressed
+    npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];  // strides in bytes
+#if 1  // the reduction shortcut below is compiled in for this loop
+    // reduce: src0 and dst are the same zero-stride element, so fold src1 into it
+    if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
+        b_ssrc1 % (sizeof(npy_float)*2) == 0
+    ) {
+        npy_float *rl_im = (npy_float *)b_src0;  // accumulator viewed as {real, imag}
+        npy_float rr, ri;  // real / imaginary totals filled in by the helper
+        CFLOAT_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);  // helper defined outside this block
+        rl_im[0] += rr;
+        rl_im[1] += ri;
+        return;
+    }
+#endif
+#if NPY_SIMD_F32
+    // Certain versions of Apple clang (commonly used in CI images) produce
+    // non-deterministic output in the mul path with AVX2 enabled on x86_64.
+    // Work around by scalarising (the leading 0 keeps this guard off here).
+    #if 0 \
+            && defined(NPY_CPU_AMD64) && defined(__clang__) \
+            && defined(__apple_build_version__) \
+            && __apple_build_version__ >= 14000000 \
+            && __apple_build_version__ < 14030000
+        goto loop_scalar;
+    #endif  // end affected Apple clang.
+    if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||  // scalar loop if an input is flagged against dst,
+        is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
+        b_sdst  % sizeof(npy_float) != 0 || b_sdst == 0 ||  // or a stride is not a whole number of floats,
+        b_ssrc0 % sizeof(npy_float) != 0 ||  // or the output does not advance (b_sdst == 0)
+        b_ssrc1 % sizeof(npy_float) != 0
+    ) {
+        goto loop_scalar;
+    }
+    const npy_float *src0 = (npy_float*)b_src0;  // same buffers, now addressed in floats
+    const npy_float *src1 = (npy_float*)b_src1;
+          npy_float *dst  = (npy_float*)b_dst;
+
+    const npy_intp ssrc0 = b_ssrc0 / (npy_intp)sizeof(npy_float);  // signed divide: a bare sizeof would
+    const npy_intp ssrc1 = b_ssrc1 / (npy_intp)sizeof(npy_float);  // convert a negative byte stride to a
+    const npy_intp sdst  = b_sdst / (npy_intp)sizeof(npy_float);  // huge unsigned value before dividing
+
+    const int vstep = npyv_nlanes_f32;  // floats per vector
+    const int wstep = vstep * 2;  // floats in two vectors, i.e. vstep complex numbers
+    const int hstep = vstep / 2;  // complex numbers held by one vector
+
+    const int loadable0 = npyv_loadable_stride_s64(ssrc0);  // gate the strided (loadn2/storen2) paths below
+    const int loadable1 = npyv_loadable_stride_s64(ssrc1);
+    const int storable = npyv_storable_stride_s64(sdst);
+
+    // lots**lots of specializations, to squeeze out max performance
+    // contig: all three operands advance by 2 floats (one packed complex) per element
+    if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
+        for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {  // two vectors per pass
+            npyv_f32 a0 = npyv_load_f32(src0);
+            npyv_f32 a1 = npyv_load_f32(src0 + vstep);
+            npyv_f32 b0 = npyv_load_f32(src1);
+            npyv_f32 b1 = npyv_load_f32(src1 + vstep);
+            npyv_f32 r0 = npyv_add_f32(a0, b0);
+            npyv_f32 r1 = npyv_add_f32(a1, b1);
+            npyv_store_f32(dst, r0);
+            npyv_store_f32(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {  // tail: one partial vector per pass
+            npyv_f32 a = npyv_load2_tillz_f32(src0, len);  // limited to the remaining len pairs
+            npyv_f32 b = npyv_load2_tillz_f32(src1, len);
+            npyv_f32 r = npyv_add_f32(a, b);
+            npyv_store2_till_f32(dst, len, r);
+        }
+    }
+    // scalar 0: the first operand does not advance, so it is set up once
+    else if (ssrc0 == 0) {
+        npyv_f32x2 a = simd_set2_f32(src0);  // helper defined outside this block
+        // contig: second operand and output are packed
+        if (ssrc1 == 2 && sdst == ssrc1) {
+            for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
+                npyv_f32 b0 = npyv_load_f32(src1);
+                npyv_f32 b1 = npyv_load_f32(src1 + vstep);
+                npyv_f32 r0 = npyv_add_f32(a.val[0], b0);
+                npyv_f32 r1 = npyv_add_f32(a.val[1], b1);
+                npyv_store_f32(dst, r0);
+                npyv_store_f32(dst + vstep, r1);
+            }
+            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+            #if 0  // variant that pads with 1.0f; compiled out in this loop
+                npyv_f32 b = npyv_load2_till_f32(src1, len, 1.0f, 1.0f);
+            #else
+                npyv_f32 b = npyv_load2_tillz_f32(src1, len);
+            #endif
+                npyv_f32 r = npyv_add_f32(a.val[0], b);
+                npyv_store2_till_f32(dst, len, r);
+            }
+        }
+        // non-contig: strided second operand and/or output
+        else if (loadable1 && storable) {
+            for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
+                npyv_f32 b0 = npyv_loadn2_f32(src1, ssrc1);
+                npyv_f32 b1 = npyv_loadn2_f32(src1 + ssrc1*hstep, ssrc1);  // second vector starts hstep elements later
+                npyv_f32 r0 = npyv_add_f32(a.val[0], b0);
+                npyv_f32 r1 = npyv_add_f32(a.val[1], b1);
+                npyv_storen2_f32(dst, sdst, r0);
+                npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
+            }
+            for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {
+            #if 0  // variant that pads with 1.0f; compiled out in this loop
+                npyv_f32 b = npyv_loadn2_till_f32(src1, ssrc1, len, 1.0f, 1.0f);
+            #else
+                npyv_f32 b = npyv_loadn2_tillz_f32(src1, ssrc1, len);
+            #endif
+                npyv_f32 r = npyv_add_f32(a.val[0], b);
+                npyv_storen2_till_f32(dst, sdst, len, r);
+            }
+        }
+        else {
+            goto loop_scalar;
+        }
+    }
+    // scalar 1: mirror image of "scalar 0" with the second operand fixed
+    else if (ssrc1 == 0) {
+        npyv_f32x2 b = simd_set2_f32(src1);
+        if (ssrc0 == 2 && sdst == ssrc0) {  // contig first operand and output
+            for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
+                npyv_f32 a0 = npyv_load_f32(src0);
+                npyv_f32 a1 = npyv_load_f32(src0 + vstep);
+                npyv_f32 r0 = npyv_add_f32(a0, b.val[0]);
+                npyv_f32 r1 = npyv_add_f32(a1, b.val[1]);
+                npyv_store_f32(dst, r0);
+                npyv_store_f32(dst + vstep, r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+            #if 0  // variant that pads with 1.0f; compiled out in this loop
+                npyv_f32 a = npyv_load2_till_f32(src0, len, 1.0f, 1.0f);
+            #else
+                npyv_f32 a = npyv_load2_tillz_f32(src0, len);
+            #endif
+                npyv_f32 r = npyv_add_f32(a, b.val[0]);
+                npyv_store2_till_f32(dst, len, r);
+            }
+        }
+        // non-contig: strided first operand and/or output
+        else if (loadable0 && storable) {
+            for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
+                npyv_f32 a0 = npyv_loadn2_f32(src0, ssrc0);
+                npyv_f32 a1 = npyv_loadn2_f32(src0 + ssrc0*hstep, ssrc0);
+                npyv_f32 r0 = npyv_add_f32(a0, b.val[0]);
+                npyv_f32 r1 = npyv_add_f32(a1, b.val[1]);
+                npyv_storen2_f32(dst, sdst, r0);
+                npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
+            }
+            for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {
+            #if 0  // variant that pads with 1.0f; compiled out in this loop
+                npyv_f32 a = npyv_loadn2_till_f32(src0, ssrc0, len, 1.0f, 1.0f);
+            #else
+                npyv_f32 a = npyv_loadn2_tillz_f32(src0, ssrc0, len);
+            #endif
+                npyv_f32 r = npyv_add_f32(a, b.val[0]);
+                npyv_storen2_till_f32(dst, sdst, len, r);
+            }
+        }
+        else {
+            goto loop_scalar;
+        }
+    }
+    #if 0  // fully strided SIMD path is compiled out here, so such inputs reach the scalar loop
+    // non-contig
+    else if (loadable0 && loadable1 && storable) {
+        for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
+                            src1 += ssrc1*vstep, dst += sdst*vstep
+        ) {
+            npyv_f32 a0 = npyv_loadn2_f32(src0, ssrc0);
+            npyv_f32 a1 = npyv_loadn2_f32(src0 + ssrc0*hstep, ssrc0);
+            npyv_f32 b0 = npyv_loadn2_f32(src1, ssrc1);
+            npyv_f32 b1 = npyv_loadn2_f32(src1 + ssrc1*hstep, ssrc1);
+            npyv_f32 r0 = npyv_add_f32(a0, b0);
+            npyv_f32 r1 = npyv_add_f32(a1, b1);
+            npyv_storen2_f32(dst, sdst, r0);
+            npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
+        }
+        for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
+                       src1 += ssrc1*hstep, dst += sdst*hstep
+        ) {
+        #if 0
+            npyv_f32 a = npyv_loadn2_till_f32(src0, ssrc0, len, 1.0f, 1.0f);
+            npyv_f32 b = npyv_loadn2_till_f32(src1, ssrc1, len, 1.0f, 1.0f);
+        #else
+            npyv_f32 a = npyv_loadn2_tillz_f32(src0, ssrc0, len);
+            npyv_f32 b = npyv_loadn2_tillz_f32(src1, ssrc1, len);
+        #endif
+            npyv_f32 r = npyv_add_f32(a, b);
+            npyv_storen2_till_f32(dst, sdst, len, r);
+        }
+    }
+    #endif
+    else {
+        goto loop_scalar;
+    }
+    npyv_cleanup();
+    return;
+loop_scalar:  // generic fallback; also the only path when NPY_SIMD_F32 is 0
+#endif
+    for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {  // one complex element per pass, byte strides
+        const npy_float a_r = ((npy_float *)b_src0)[0];  // all four inputs are read before dst is written
+        const npy_float a_i = ((npy_float *)b_src0)[1];
+        const npy_float b_r = ((npy_float *)b_src1)[0];
+        const npy_float b_i = ((npy_float *)b_src1)[1];
+    #if 0  // complex-product form; compiled out in this loop
+        ((npy_float *)b_dst)[0] = a_r*b_r - a_i*b_i;
+        ((npy_float *)b_dst)[1] = a_r*b_i + a_i*b_r;
+    #else
+        ((npy_float *)b_dst)[0] = a_r + b_r;
+        ((npy_float *)b_dst)[1] = a_i + b_i;
+    #endif
+    }
+}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(CFLOAT_add_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    // Indexed in-place add: for every (index, value) pair, the complex value
+    // is added to the element of args[0] that the index selects. Returns 0.
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    // steps[3] is the amount added to a negative index before it is used.
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + wrap : pos;
+        npy_float *target = (npy_float *)(base + base_stride * pos);
+        // Fetch both halves of the operand before touching the target.
+        const npy_float *operand = (const npy_float *)val_ptr;
+        const npy_float op_re = operand[0];
+        const npy_float op_im = operand[1];
+        target[0] += op_re;
+        target[1] += op_im;
+
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#line 374
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CFLOAT_subtract)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // Inner loop for complex-float subtraction: args[2][i] = args[0][i] - args[1][i].
+    npy_intp len = dimensions[0];  // number of complex elements to process
+    char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];  // "b_" names are byte-addressed
+    npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];  // strides in bytes
+#if 0  // the reduction shortcut below is compiled out in this loop
+    // reduce
+    if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
+        b_ssrc1 % (sizeof(npy_float)*2) == 0
+    ) {
+        npy_float *rl_im = (npy_float *)b_src0;
+        npy_float rr, ri;
+        CFLOAT_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);
+        rl_im[0] -= rr;
+        rl_im[1] -= ri;
+        return;
+    }
+#endif
+#if NPY_SIMD_F32
+    // Certain versions of Apple clang (commonly used in CI images) produce
+    // non-deterministic output in the mul path with AVX2 enabled on x86_64.
+    // Work around by scalarising (the leading 0 keeps this guard off here).
+    #if 0 \
+            && defined(NPY_CPU_AMD64) && defined(__clang__) \
+            && defined(__apple_build_version__) \
+            && __apple_build_version__ >= 14000000 \
+            && __apple_build_version__ < 14030000
+        goto loop_scalar;
+    #endif  // end affected Apple clang.
+    if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||  // scalar loop if an input is flagged against dst,
+        is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
+        b_sdst  % sizeof(npy_float) != 0 || b_sdst == 0 ||  // or a stride is not a whole number of floats,
+        b_ssrc0 % sizeof(npy_float) != 0 ||  // or the output does not advance (b_sdst == 0)
+        b_ssrc1 % sizeof(npy_float) != 0
+    ) {
+        goto loop_scalar;
+    }
+    const npy_float *src0 = (npy_float*)b_src0;  // same buffers, now addressed in floats
+    const npy_float *src1 = (npy_float*)b_src1;
+          npy_float *dst  = (npy_float*)b_dst;
+
+    const npy_intp ssrc0 = b_ssrc0 / (npy_intp)sizeof(npy_float);  // signed divide: a bare sizeof would
+    const npy_intp ssrc1 = b_ssrc1 / (npy_intp)sizeof(npy_float);  // convert a negative byte stride to a
+    const npy_intp sdst  = b_sdst / (npy_intp)sizeof(npy_float);  // huge unsigned value before dividing
+
+    const int vstep = npyv_nlanes_f32;  // floats per vector
+    const int wstep = vstep * 2;  // floats in two vectors, i.e. vstep complex numbers
+    const int hstep = vstep / 2;  // complex numbers held by one vector
+
+    const int loadable0 = npyv_loadable_stride_s64(ssrc0);  // gate the strided (loadn2/storen2) paths below
+    const int loadable1 = npyv_loadable_stride_s64(ssrc1);
+    const int storable = npyv_storable_stride_s64(sdst);
+
+    // lots**lots of specializations, to squeeze out max performance
+    // contig: all three operands advance by 2 floats (one packed complex) per element
+    if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
+        for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {  // two vectors per pass
+            npyv_f32 a0 = npyv_load_f32(src0);
+            npyv_f32 a1 = npyv_load_f32(src0 + vstep);
+            npyv_f32 b0 = npyv_load_f32(src1);
+            npyv_f32 b1 = npyv_load_f32(src1 + vstep);
+            npyv_f32 r0 = npyv_sub_f32(a0, b0);
+            npyv_f32 r1 = npyv_sub_f32(a1, b1);
+            npyv_store_f32(dst, r0);
+            npyv_store_f32(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {  // tail: one partial vector per pass
+            npyv_f32 a = npyv_load2_tillz_f32(src0, len);  // limited to the remaining len pairs
+            npyv_f32 b = npyv_load2_tillz_f32(src1, len);
+            npyv_f32 r = npyv_sub_f32(a, b);
+            npyv_store2_till_f32(dst, len, r);
+        }
+    }
+    // scalar 0: the first operand does not advance, so it is set up once
+    else if (ssrc0 == 0) {
+        npyv_f32x2 a = simd_set2_f32(src0);  // helper defined outside this block
+        // contig: second operand and output are packed
+        if (ssrc1 == 2 && sdst == ssrc1) {
+            for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
+                npyv_f32 b0 = npyv_load_f32(src1);
+                npyv_f32 b1 = npyv_load_f32(src1 + vstep);
+                npyv_f32 r0 = npyv_sub_f32(a.val[0], b0);
+                npyv_f32 r1 = npyv_sub_f32(a.val[1], b1);
+                npyv_store_f32(dst, r0);
+                npyv_store_f32(dst + vstep, r1);
+            }
+            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+            #if 0  // variant that pads with 1.0f; compiled out in this loop
+                npyv_f32 b = npyv_load2_till_f32(src1, len, 1.0f, 1.0f);
+            #else
+                npyv_f32 b = npyv_load2_tillz_f32(src1, len);
+            #endif
+                npyv_f32 r = npyv_sub_f32(a.val[0], b);
+                npyv_store2_till_f32(dst, len, r);
+            }
+        }
+        // non-contig: strided second operand and/or output
+        else if (loadable1 && storable) {
+            for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
+                npyv_f32 b0 = npyv_loadn2_f32(src1, ssrc1);
+                npyv_f32 b1 = npyv_loadn2_f32(src1 + ssrc1*hstep, ssrc1);  // second vector starts hstep elements later
+                npyv_f32 r0 = npyv_sub_f32(a.val[0], b0);
+                npyv_f32 r1 = npyv_sub_f32(a.val[1], b1);
+                npyv_storen2_f32(dst, sdst, r0);
+                npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
+            }
+            for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {
+            #if 0  // variant that pads with 1.0f; compiled out in this loop
+                npyv_f32 b = npyv_loadn2_till_f32(src1, ssrc1, len, 1.0f, 1.0f);
+            #else
+                npyv_f32 b = npyv_loadn2_tillz_f32(src1, ssrc1, len);
+            #endif
+                npyv_f32 r = npyv_sub_f32(a.val[0], b);
+                npyv_storen2_till_f32(dst, sdst, len, r);
+            }
+        }
+        else {
+            goto loop_scalar;
+        }
+    }
+    // scalar 1: mirror image of "scalar 0" with the second operand fixed
+    else if (ssrc1 == 0) {
+        npyv_f32x2 b = simd_set2_f32(src1);
+        if (ssrc0 == 2 && sdst == ssrc0) {  // contig first operand and output
+            for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
+                npyv_f32 a0 = npyv_load_f32(src0);
+                npyv_f32 a1 = npyv_load_f32(src0 + vstep);
+                npyv_f32 r0 = npyv_sub_f32(a0, b.val[0]);
+                npyv_f32 r1 = npyv_sub_f32(a1, b.val[1]);
+                npyv_store_f32(dst, r0);
+                npyv_store_f32(dst + vstep, r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+            #if 0  // variant that pads with 1.0f; compiled out in this loop
+                npyv_f32 a = npyv_load2_till_f32(src0, len, 1.0f, 1.0f);
+            #else
+                npyv_f32 a = npyv_load2_tillz_f32(src0, len);
+            #endif
+                npyv_f32 r = npyv_sub_f32(a, b.val[0]);
+                npyv_store2_till_f32(dst, len, r);
+            }
+        }
+        // non-contig: strided first operand and/or output
+        else if (loadable0 && storable) {
+            for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
+                npyv_f32 a0 = npyv_loadn2_f32(src0, ssrc0);
+                npyv_f32 a1 = npyv_loadn2_f32(src0 + ssrc0*hstep, ssrc0);
+                npyv_f32 r0 = npyv_sub_f32(a0, b.val[0]);
+                npyv_f32 r1 = npyv_sub_f32(a1, b.val[1]);
+                npyv_storen2_f32(dst, sdst, r0);
+                npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
+            }
+            for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {
+            #if 0  // variant that pads with 1.0f; compiled out in this loop
+                npyv_f32 a = npyv_loadn2_till_f32(src0, ssrc0, len, 1.0f, 1.0f);
+            #else
+                npyv_f32 a = npyv_loadn2_tillz_f32(src0, ssrc0, len);
+            #endif
+                npyv_f32 r = npyv_sub_f32(a, b.val[0]);
+                npyv_storen2_till_f32(dst, sdst, len, r);
+            }
+        }
+        else {
+            goto loop_scalar;
+        }
+    }
+    #if 0  // fully strided SIMD path is compiled out here, so such inputs reach the scalar loop
+    // non-contig
+    else if (loadable0 && loadable1 && storable) {
+        for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
+                            src1 += ssrc1*vstep, dst += sdst*vstep
+        ) {
+            npyv_f32 a0 = npyv_loadn2_f32(src0, ssrc0);
+            npyv_f32 a1 = npyv_loadn2_f32(src0 + ssrc0*hstep, ssrc0);
+            npyv_f32 b0 = npyv_loadn2_f32(src1, ssrc1);
+            npyv_f32 b1 = npyv_loadn2_f32(src1 + ssrc1*hstep, ssrc1);
+            npyv_f32 r0 = npyv_sub_f32(a0, b0);
+            npyv_f32 r1 = npyv_sub_f32(a1, b1);
+            npyv_storen2_f32(dst, sdst, r0);
+            npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
+        }
+        for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
+                       src1 += ssrc1*hstep, dst += sdst*hstep
+        ) {
+        #if 0
+            npyv_f32 a = npyv_loadn2_till_f32(src0, ssrc0, len, 1.0f, 1.0f);
+            npyv_f32 b = npyv_loadn2_till_f32(src1, ssrc1, len, 1.0f, 1.0f);
+        #else
+            npyv_f32 a = npyv_loadn2_tillz_f32(src0, ssrc0, len);
+            npyv_f32 b = npyv_loadn2_tillz_f32(src1, ssrc1, len);
+        #endif
+            npyv_f32 r = npyv_sub_f32(a, b);
+            npyv_storen2_till_f32(dst, sdst, len, r);
+        }
+    }
+    #endif
+    else {
+        goto loop_scalar;
+    }
+    npyv_cleanup();
+    return;
+loop_scalar:  // generic fallback; also the only path when NPY_SIMD_F32 is 0
+#endif
+    for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {  // one complex element per pass, byte strides
+        const npy_float a_r = ((npy_float *)b_src0)[0];  // all four inputs are read before dst is written
+        const npy_float a_i = ((npy_float *)b_src0)[1];
+        const npy_float b_r = ((npy_float *)b_src1)[0];
+        const npy_float b_i = ((npy_float *)b_src1)[1];
+    #if 0  // complex-product form; compiled out in this loop
+        ((npy_float *)b_dst)[0] = a_r*b_r - a_i*b_i;
+        ((npy_float *)b_dst)[1] = a_r*b_i + a_i*b_r;
+    #else
+        ((npy_float *)b_dst)[0] = a_r - b_r;
+        ((npy_float *)b_dst)[1] = a_i - b_i;
+    #endif
+    }
+}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(CFLOAT_subtract_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    // Indexed in-place subtract: for every (index, value) pair, the complex
+    // value is subtracted from the element of args[0] the index selects.
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    // steps[3] is the amount added to a negative index before it is used.
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + wrap : pos;
+        npy_float *target = (npy_float *)(base + base_stride * pos);
+        // Fetch both halves of the operand before touching the target.
+        const npy_float *operand = (const npy_float *)val_ptr;
+        const npy_float op_re = operand[0];
+        const npy_float op_im = operand[1];
+        target[0] -= op_re;
+        target[1] -= op_im;
+
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#line 374
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CFLOAT_multiply)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // Inner loop for complex-float multiplication: args[2][i] = args[0][i] * args[1][i].
+    npy_intp len = dimensions[0];  // number of complex elements to process
+    char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];  // "b_" names are byte-addressed
+    npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];  // strides in bytes
+#if 0  // the reduction shortcut below is compiled out in this loop
+    // reduce
+    if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
+        b_ssrc1 % (sizeof(npy_float)*2) == 0
+    ) {
+        npy_float *rl_im = (npy_float *)b_src0;
+        npy_float rr, ri;
+        CFLOAT_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);
+        rl_im[0] *= rr;
+        rl_im[1] *= ri;
+        return;
+    }
+#endif
+#if NPY_SIMD_F32
+    // Certain versions of Apple clang (commonly used in CI images) produce
+    // non-deterministic output in the mul path with AVX2 enabled on x86_64.
+    // Work around by scalarising (the leading 1 arms this guard here).
+    #if 1 \
+            && defined(NPY_CPU_AMD64) && defined(__clang__) \
+            && defined(__apple_build_version__) \
+            && __apple_build_version__ >= 14000000 \
+            && __apple_build_version__ < 14030000
+        goto loop_scalar;
+    #endif  // end affected Apple clang.
+    if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||  // scalar loop if an input is flagged against dst,
+        is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
+        b_sdst  % sizeof(npy_float) != 0 || b_sdst == 0 ||  // or a stride is not a whole number of floats,
+        b_ssrc0 % sizeof(npy_float) != 0 ||  // or the output does not advance (b_sdst == 0)
+        b_ssrc1 % sizeof(npy_float) != 0
+    ) {
+        goto loop_scalar;
+    }
+    const npy_float *src0 = (npy_float*)b_src0;  // same buffers, now addressed in floats
+    const npy_float *src1 = (npy_float*)b_src1;
+          npy_float *dst  = (npy_float*)b_dst;
+
+    const npy_intp ssrc0 = b_ssrc0 / (npy_intp)sizeof(npy_float);  // signed divide: a bare sizeof would
+    const npy_intp ssrc1 = b_ssrc1 / (npy_intp)sizeof(npy_float);  // convert a negative byte stride to a
+    const npy_intp sdst  = b_sdst / (npy_intp)sizeof(npy_float);  // huge unsigned value before dividing
+
+    const int vstep = npyv_nlanes_f32;  // floats per vector
+    const int wstep = vstep * 2;  // floats in two vectors, i.e. vstep complex numbers
+    const int hstep = vstep / 2;  // complex numbers held by one vector
+
+    const int loadable0 = npyv_loadable_stride_s64(ssrc0);  // gate the strided (loadn2/storen2) paths below
+    const int loadable1 = npyv_loadable_stride_s64(ssrc1);
+    const int storable = npyv_storable_stride_s64(sdst);
+
+    // lots**lots of specializations, to squeeze out max performance
+    // contig: all three operands advance by 2 floats (one packed complex) per element
+    if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
+        for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {  // two vectors per pass
+            npyv_f32 a0 = npyv_load_f32(src0);
+            npyv_f32 a1 = npyv_load_f32(src0 + vstep);
+            npyv_f32 b0 = npyv_load_f32(src1);
+            npyv_f32 b1 = npyv_load_f32(src1 + vstep);
+            npyv_f32 r0 = simd_cmul_f32(a0, b0);  // complex product helper defined outside this block
+            npyv_f32 r1 = simd_cmul_f32(a1, b1);
+            npyv_store_f32(dst, r0);
+            npyv_store_f32(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {  // tail: one partial vector per pass
+            npyv_f32 a = npyv_load2_tillz_f32(src0, len);  // limited to the remaining len pairs
+            npyv_f32 b = npyv_load2_tillz_f32(src1, len);
+            npyv_f32 r = simd_cmul_f32(a, b);
+            npyv_store2_till_f32(dst, len, r);
+        }
+    }
+    // scalar 0: the first operand does not advance, so it is set up once
+    else if (ssrc0 == 0) {
+        npyv_f32x2 a = simd_set2_f32(src0);  // helper defined outside this block
+        // contig: second operand and output are packed
+        if (ssrc1 == 2 && sdst == ssrc1) {
+            for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
+                npyv_f32 b0 = npyv_load_f32(src1);
+                npyv_f32 b1 = npyv_load_f32(src1 + vstep);
+                npyv_f32 r0 = simd_cmul_f32(a.val[0], b0);
+                npyv_f32 r1 = simd_cmul_f32(a.val[1], b1);
+                npyv_store_f32(dst, r0);
+                npyv_store_f32(dst + vstep, r1);
+            }
+            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+            #if 1  // this loop uses the variant that takes 1.0f fill values for the tail
+                npyv_f32 b = npyv_load2_till_f32(src1, len, 1.0f, 1.0f);
+            #else
+                npyv_f32 b = npyv_load2_tillz_f32(src1, len);
+            #endif
+                npyv_f32 r = simd_cmul_f32(a.val[0], b);
+                npyv_store2_till_f32(dst, len, r);
+            }
+        }
+        // non-contig: strided second operand and/or output
+        else if (loadable1 && storable) {
+            for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
+                npyv_f32 b0 = npyv_loadn2_f32(src1, ssrc1);
+                npyv_f32 b1 = npyv_loadn2_f32(src1 + ssrc1*hstep, ssrc1);  // second vector starts hstep elements later
+                npyv_f32 r0 = simd_cmul_f32(a.val[0], b0);
+                npyv_f32 r1 = simd_cmul_f32(a.val[1], b1);
+                npyv_storen2_f32(dst, sdst, r0);
+                npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
+            }
+            for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {
+            #if 1  // this loop uses the variant that takes 1.0f fill values for the tail
+                npyv_f32 b = npyv_loadn2_till_f32(src1, ssrc1, len, 1.0f, 1.0f);
+            #else
+                npyv_f32 b = npyv_loadn2_tillz_f32(src1, ssrc1, len);
+            #endif
+                npyv_f32 r = simd_cmul_f32(a.val[0], b);
+                npyv_storen2_till_f32(dst, sdst, len, r);
+            }
+        }
+        else {
+            goto loop_scalar;
+        }
+    }
+    // scalar 1: mirror image of "scalar 0" with the second operand fixed
+    else if (ssrc1 == 0) {
+        npyv_f32x2 b = simd_set2_f32(src1);
+        if (ssrc0 == 2 && sdst == ssrc0) {  // contig first operand and output
+            for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
+                npyv_f32 a0 = npyv_load_f32(src0);
+                npyv_f32 a1 = npyv_load_f32(src0 + vstep);
+                npyv_f32 r0 = simd_cmul_f32(a0, b.val[0]);
+                npyv_f32 r1 = simd_cmul_f32(a1, b.val[1]);
+                npyv_store_f32(dst, r0);
+                npyv_store_f32(dst + vstep, r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+            #if 1  // this loop uses the variant that takes 1.0f fill values for the tail
+                npyv_f32 a = npyv_load2_till_f32(src0, len, 1.0f, 1.0f);
+            #else
+                npyv_f32 a = npyv_load2_tillz_f32(src0, len);
+            #endif
+                npyv_f32 r = simd_cmul_f32(a, b.val[0]);
+                npyv_store2_till_f32(dst, len, r);
+            }
+        }
+        // non-contig: strided first operand and/or output
+        else if (loadable0 && storable) {
+            for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
+                npyv_f32 a0 = npyv_loadn2_f32(src0, ssrc0);
+                npyv_f32 a1 = npyv_loadn2_f32(src0 + ssrc0*hstep, ssrc0);
+                npyv_f32 r0 = simd_cmul_f32(a0, b.val[0]);
+                npyv_f32 r1 = simd_cmul_f32(a1, b.val[1]);
+                npyv_storen2_f32(dst, sdst, r0);
+                npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
+            }
+            for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {
+            #if 1  // this loop uses the variant that takes 1.0f fill values for the tail
+                npyv_f32 a = npyv_loadn2_till_f32(src0, ssrc0, len, 1.0f, 1.0f);
+            #else
+                npyv_f32 a = npyv_loadn2_tillz_f32(src0, ssrc0, len);
+            #endif
+                npyv_f32 r = simd_cmul_f32(a, b.val[0]);
+                npyv_storen2_till_f32(dst, sdst, len, r);
+            }
+        }
+        else {
+            goto loop_scalar;
+        }
+    }
+    #if 1  // the fully strided SIMD path is compiled in for this loop
+    // non-contig: every operand is strided
+    else if (loadable0 && loadable1 && storable) {
+        for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
+                            src1 += ssrc1*vstep, dst += sdst*vstep
+        ) {
+            npyv_f32 a0 = npyv_loadn2_f32(src0, ssrc0);
+            npyv_f32 a1 = npyv_loadn2_f32(src0 + ssrc0*hstep, ssrc0);
+            npyv_f32 b0 = npyv_loadn2_f32(src1, ssrc1);
+            npyv_f32 b1 = npyv_loadn2_f32(src1 + ssrc1*hstep, ssrc1);
+            npyv_f32 r0 = simd_cmul_f32(a0, b0);
+            npyv_f32 r1 = simd_cmul_f32(a1, b1);
+            npyv_storen2_f32(dst, sdst, r0);
+            npyv_storen2_f32(dst + sdst*hstep, sdst, r1);
+        }
+        for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
+                       src1 += ssrc1*hstep, dst += sdst*hstep
+        ) {
+        #if 1  // this loop uses the variant that takes 1.0f fill values for the tail
+            npyv_f32 a = npyv_loadn2_till_f32(src0, ssrc0, len, 1.0f, 1.0f);
+            npyv_f32 b = npyv_loadn2_till_f32(src1, ssrc1, len, 1.0f, 1.0f);
+        #else
+            npyv_f32 a = npyv_loadn2_tillz_f32(src0, ssrc0, len);
+            npyv_f32 b = npyv_loadn2_tillz_f32(src1, ssrc1, len);
+        #endif
+            npyv_f32 r = simd_cmul_f32(a, b);
+            npyv_storen2_till_f32(dst, sdst, len, r);
+        }
+    }
+    #endif
+    else {
+        goto loop_scalar;
+    }
+    npyv_cleanup();
+    return;
+loop_scalar:  // generic fallback; also the only path when NPY_SIMD_F32 is 0
+#endif
+    for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {  // one complex element per pass, byte strides
+        const npy_float a_r = ((npy_float *)b_src0)[0];  // all four inputs are read before dst is written
+        const npy_float a_i = ((npy_float *)b_src0)[1];
+        const npy_float b_r = ((npy_float *)b_src1)[0];
+        const npy_float b_i = ((npy_float *)b_src1)[1];
+    #if 1  // complex product: (a_r + j*a_i) * (b_r + j*b_i)
+        ((npy_float *)b_dst)[0] = a_r*b_r - a_i*b_i;
+        ((npy_float *)b_dst)[1] = a_r*b_i + a_i*b_r;
+    #else
+        ((npy_float *)b_dst)[0] = a_r * b_r;
+        ((npy_float *)b_dst)[1] = a_i * b_i;
+    #endif
+    }
+}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(CFLOAT_multiply_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    // Indexed in-place multiply: for every (index, value) pair, the element
+    // of args[0] the index selects is replaced by its complex product with value.
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];  // added to a negative index before use
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + wrap : pos;
+        npy_float *target = (npy_float *)(base + base_stride * pos);
+        // Snapshot operand first, then target, before either half is written.
+        const npy_float *operand = (const npy_float *)val_ptr;
+        const npy_float b_re = operand[0];
+        const npy_float b_im = operand[1];
+        const npy_float a_re = target[0];
+        const npy_float a_im = target[1];
+        target[0] = a_re*b_re - a_im*b_im;
+        target[1] = a_re*b_im + a_im*b_re;
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+
+#line 630
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CFLOAT_conjugate)  // unary loop: dst = conj(src) over (real, imag) npy_float pairs
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    npy_intp len = dimensions[0];  // number of complex elements to process
+    char *b_src = args[0], *b_dst = args[1];  // byte pointers to input and output
+    npy_intp b_ssrc = steps[0], b_sdst = steps[1];  // strides in bytes
+#if NPY_SIMD_F32  // vector path; the scalar loop below is the fallback
+    if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) ||  // overlap (per is_mem_overlap) or strides that are
+        b_sdst % sizeof(npy_float) != 0 ||                     // not whole multiples of a float
+        b_ssrc % sizeof(npy_float) != 0                        // go to the scalar loop
+    ) {
+        goto loop_scalar;
+    }
+    const npy_float *src  = (npy_float*)b_src;
+          npy_float *dst  = (npy_float*)b_dst;
+    const npy_intp ssrc = b_ssrc / sizeof(npy_float);  // strides converted from bytes to npy_float units
+    const npy_intp sdst = b_sdst / sizeof(npy_float);
+
+    const int vstep = npyv_nlanes_f32;  // float lanes per vector; also elements consumed per main-loop pass
+    const int wstep = vstep * 2;  // floats spanned by two contiguous vectors
+    const int hstep = vstep / 2;  // complex elements covered by one vector
+
+    if (ssrc == 2 && ssrc == sdst) {  // input and output both contiguous (2 floats per element)
+        for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) {  // two full vectors per pass
+            npyv_f32 a0 = npyv_load_f32(src);
+            npyv_f32 a1 = npyv_load_f32(src + vstep);
+            npyv_f32 r0 = simd_cconjugate_f32(a0);
+            npyv_f32 r1 = simd_cconjugate_f32(a1);
+            npyv_store_f32(dst, r0);
+            npyv_store_f32(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src += vstep, dst += vstep) {  // remainder: one vector per pass, bounded by len
+            npyv_f32 a = npyv_load2_tillz_f32(src, len);
+            npyv_f32 r = simd_cconjugate_f32(a);
+            npyv_store2_till_f32(dst, len, r);
+        }
+    }
+    else if (ssrc == 2 && npyv_storable_stride_s64(sdst)) {  // contiguous input, strided output
+        for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) {
+            npyv_f32 a0 = npyv_load_f32(src);
+            npyv_f32 a1 = npyv_load_f32(src + vstep);
+            npyv_f32 r0 = simd_cconjugate_f32(a0);
+            npyv_f32 r1 = simd_cconjugate_f32(a1);
+            npyv_storen2_f32(dst, sdst, r0);
+            npyv_storen2_f32(dst + sdst*hstep, sdst, r1);  // second vector starts hstep elements further on
+        }
+        for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) {  // remainder
+            npyv_f32 a = npyv_load2_tillz_f32(src, len);
+            npyv_f32 r = simd_cconjugate_f32(a);
+            npyv_storen2_till_f32(dst, sdst, len, r);
+        }
+    }
+    else if (sdst == 2 && npyv_loadable_stride_s64(ssrc)) {  // strided input, contiguous output
+        for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) {
+            npyv_f32 a0 = npyv_loadn2_f32(src, ssrc);
+            npyv_f32 a1 = npyv_loadn2_f32(src + ssrc*hstep, ssrc);
+            npyv_f32 r0 = simd_cconjugate_f32(a0);
+            npyv_f32 r1 = simd_cconjugate_f32(a1);
+            npyv_store_f32(dst, r0);
+            npyv_store_f32(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) {  // remainder
+            npyv_f32 a = npyv_loadn2_tillz_f32((npy_float*)src, ssrc, len);  // cast drops the const on src
+            npyv_f32 r = simd_cconjugate_f32(a);
+            npyv_store2_till_f32(dst, len, r);
+        }
+    }
+    else {  // no vector specialisation for this stride combination
+        goto loop_scalar;
+    }
+    npyv_cleanup();
+    return;  // vector path handled every element
+loop_scalar:
+#endif
+    for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) {  // scalar path, one element per pass, byte strides
+        const npy_float rl = ((npy_float *)b_src)[0];
+        const npy_float im = ((npy_float *)b_src)[1];
+    #if 0  // compiled out: squaring form
+        ((npy_float *)b_dst)[0] = rl*rl - im*im;
+        ((npy_float *)b_dst)[1] = rl*im + im*rl;
+    #else  // compiled branch: conjugate keeps the real part, negates the imaginary part
+        ((npy_float *)b_dst)[0] = rl;
+        ((npy_float *)b_dst)[1] = -im;
+    #endif
+    }
+}
+
+#line 630
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CFLOAT_square)  // unary loop: dst = src * src over (real, imag) npy_float pairs
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    npy_intp len = dimensions[0];  // number of complex elements to process
+    char *b_src = args[0], *b_dst = args[1];  // byte pointers to input and output
+    npy_intp b_ssrc = steps[0], b_sdst = steps[1];  // strides in bytes
+#if NPY_SIMD_F32  // vector path; the scalar loop below is the fallback
+    if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) ||  // overlap (per is_mem_overlap) or strides that are
+        b_sdst % sizeof(npy_float) != 0 ||                     // not whole multiples of a float
+        b_ssrc % sizeof(npy_float) != 0                        // go to the scalar loop
+    ) {
+        goto loop_scalar;
+    }
+    const npy_float *src  = (npy_float*)b_src;
+          npy_float *dst  = (npy_float*)b_dst;
+    const npy_intp ssrc = b_ssrc / sizeof(npy_float);  // strides converted from bytes to npy_float units
+    const npy_intp sdst = b_sdst / sizeof(npy_float);
+
+    const int vstep = npyv_nlanes_f32;  // float lanes per vector; also elements consumed per main-loop pass
+    const int wstep = vstep * 2;  // floats spanned by two contiguous vectors
+    const int hstep = vstep / 2;  // complex elements covered by one vector
+
+    if (ssrc == 2 && ssrc == sdst) {  // input and output both contiguous (2 floats per element)
+        for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) {  // two full vectors per pass
+            npyv_f32 a0 = npyv_load_f32(src);
+            npyv_f32 a1 = npyv_load_f32(src + vstep);
+            npyv_f32 r0 = simd_csquare_f32(a0);
+            npyv_f32 r1 = simd_csquare_f32(a1);
+            npyv_store_f32(dst, r0);
+            npyv_store_f32(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src += vstep, dst += vstep) {  // remainder: one vector per pass, bounded by len
+            npyv_f32 a = npyv_load2_tillz_f32(src, len);
+            npyv_f32 r = simd_csquare_f32(a);
+            npyv_store2_till_f32(dst, len, r);
+        }
+    }
+    else if (ssrc == 2 && npyv_storable_stride_s64(sdst)) {  // contiguous input, strided output
+        for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) {
+            npyv_f32 a0 = npyv_load_f32(src);
+            npyv_f32 a1 = npyv_load_f32(src + vstep);
+            npyv_f32 r0 = simd_csquare_f32(a0);
+            npyv_f32 r1 = simd_csquare_f32(a1);
+            npyv_storen2_f32(dst, sdst, r0);
+            npyv_storen2_f32(dst + sdst*hstep, sdst, r1);  // second vector starts hstep elements further on
+        }
+        for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) {  // remainder
+            npyv_f32 a = npyv_load2_tillz_f32(src, len);
+            npyv_f32 r = simd_csquare_f32(a);
+            npyv_storen2_till_f32(dst, sdst, len, r);
+        }
+    }
+    else if (sdst == 2 && npyv_loadable_stride_s64(ssrc)) {  // strided input, contiguous output
+        for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) {
+            npyv_f32 a0 = npyv_loadn2_f32(src, ssrc);
+            npyv_f32 a1 = npyv_loadn2_f32(src + ssrc*hstep, ssrc);
+            npyv_f32 r0 = simd_csquare_f32(a0);
+            npyv_f32 r1 = simd_csquare_f32(a1);
+            npyv_store_f32(dst, r0);
+            npyv_store_f32(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) {  // remainder
+            npyv_f32 a = npyv_loadn2_tillz_f32((npy_float*)src, ssrc, len);  // cast drops the const on src
+            npyv_f32 r = simd_csquare_f32(a);
+            npyv_store2_till_f32(dst, len, r);
+        }
+    }
+    else {  // no vector specialisation for this stride combination
+        goto loop_scalar;
+    }
+    npyv_cleanup();
+    return;  // vector path handled every element
+loop_scalar:
+#endif
+    for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) {  // scalar path, one element per pass, byte strides
+        const npy_float rl = ((npy_float *)b_src)[0];  // both parts are read before any store
+        const npy_float im = ((npy_float *)b_src)[1];
+    #if 1  // compiled branch: (rl + i*im)^2
+        ((npy_float *)b_dst)[0] = rl*rl - im*im;  // real part
+        ((npy_float *)b_dst)[1] = rl*im + im*rl;  // imaginary part
+    #else  // compiled out: conjugate form
+        ((npy_float *)b_dst)[0] = rl;
+        ((npy_float *)b_dst)[1] = -im;
+    #endif
+    }
+}
+
+
+#line 366
+#line 374
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CDOUBLE_add)  // binary loop: dst = src0 + src1 over (real, imag) npy_double pairs
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    npy_intp len = dimensions[0];  // number of complex elements to process
+    char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];  // byte pointers: two inputs, one output
+    npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];  // strides in bytes
+#if 1  // reduction shortcut is compiled in for add
+    // reduce: output aliases the first input with zero stride, so accumulate src1 into that one element
+    if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
+        b_ssrc1 % (sizeof(npy_double)*2) == 0  // src1 stride must be a whole number of complex elements
+    ) {
+        npy_double *rl_im = (npy_double *)b_src0;  // the accumulator as (real, imag)
+        npy_double rr, ri;  // sums of the real and imaginary parts of src1
+        CDOUBLE_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);
+        rl_im[0] += rr;
+        rl_im[1] += ri;
+        return;
+    }
+#endif
+#if NPY_SIMD_F64  // vector path; the scalar loop below is the fallback
+    // Certain versions of Apple clang (commonly used in CI images) produce
+    // non-deterministic output in the mul path with AVX2 enabled on x86_64.
+    // Work around by scalarising. The guard starts with `#if 0` here, so it is inactive for add.
+    #if 0 \
+            && defined(NPY_CPU_AMD64) && defined(__clang__) \
+            && defined(__apple_build_version__) \
+            && __apple_build_version__ >= 14000000 \
+            && __apple_build_version__ < 14030000
+        goto loop_scalar;
+    #endif  // end affected Apple clang.
+    if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||  // either input overlapping the output (per is_mem_overlap),
+        is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||  // a zero output stride, or strides that are not whole
+        b_sdst  % sizeof(npy_double) != 0 || b_sdst == 0 ||     // multiples of a double all go to the scalar loop
+        b_ssrc0 % sizeof(npy_double) != 0 ||
+        b_ssrc1 % sizeof(npy_double) != 0
+    ) {
+        goto loop_scalar;
+    }
+    const npy_double *src0 = (npy_double*)b_src0;
+    const npy_double *src1 = (npy_double*)b_src1;
+          npy_double *dst  = (npy_double*)b_dst;
+
+    const npy_intp ssrc0 = b_ssrc0 / sizeof(npy_double);  // strides converted from bytes to npy_double units
+    const npy_intp ssrc1 = b_ssrc1 / sizeof(npy_double);
+    const npy_intp sdst  = b_sdst / sizeof(npy_double);
+
+    const int vstep = npyv_nlanes_f64;  // double lanes per vector; also elements consumed per main-loop pass
+    const int wstep = vstep * 2;  // doubles spanned by two contiguous vectors
+    const int hstep = vstep / 2;  // complex elements covered by one vector
+
+    const int loadable0 = npyv_loadable_stride_s64(ssrc0);  // whether the strided load/store helpers accept each stride
+    const int loadable1 = npyv_loadable_stride_s64(ssrc1);
+    const int storable = npyv_storable_stride_s64(sdst);
+
+    // lots**lots of specializations, to squeeze out max performance
+    // contig: both inputs and the output advance by 2 doubles (one complex element)
+    if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
+        for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {  // two full vectors per pass
+            npyv_f64 a0 = npyv_load_f64(src0);
+            npyv_f64 a1 = npyv_load_f64(src0 + vstep);
+            npyv_f64 b0 = npyv_load_f64(src1);
+            npyv_f64 b1 = npyv_load_f64(src1 + vstep);
+            npyv_f64 r0 = npyv_add_f64(a0, b0);
+            npyv_f64 r1 = npyv_add_f64(a1, b1);
+            npyv_store_f64(dst, r0);
+            npyv_store_f64(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {  // remainder, bounded by len
+            npyv_f64 a = npyv_load2_tillz_f64(src0, len);
+            npyv_f64 b = npyv_load2_tillz_f64(src1, len);
+            npyv_f64 r = npyv_add_f64(a, b);
+            npyv_store2_till_f64(dst, len, r);
+        }
+    }
+    // scalar 0: first operand has zero stride (one value reused for every element)
+    else if (ssrc0 == 0) {
+        npyv_f64x2 a = simd_set2_f64(src0);  // presumably replicates the complex value at src0 into both vectors -- confirm in helper
+        // contig: second input and output are contiguous
+        if (ssrc1 == 2 && sdst == ssrc1) {
+            for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
+                npyv_f64 b0 = npyv_load_f64(src1);
+                npyv_f64 b1 = npyv_load_f64(src1 + vstep);
+                npyv_f64 r0 = npyv_add_f64(a.val[0], b0);
+                npyv_f64 r1 = npyv_add_f64(a.val[1], b1);
+                npyv_store_f64(dst, r0);
+                npyv_store_f64(dst + vstep, r1);
+            }
+            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {  // remainder
+            #if 0  // compiled out: variant passing 1.0 fill values
+                npyv_f64 b = npyv_load2_till_f64(src1, len, 1.0, 1.0);
+            #else  // compiled branch for add
+                npyv_f64 b = npyv_load2_tillz_f64(src1, len);
+            #endif
+                npyv_f64 r = npyv_add_f64(a.val[0], b);
+                npyv_store2_till_f64(dst, len, r);
+            }
+        }
+        // non-contig: strided second input and/or output, when both strides are accepted
+        else if (loadable1 && storable) {
+            for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
+                npyv_f64 b0 = npyv_loadn2_f64(src1, ssrc1);
+                npyv_f64 b1 = npyv_loadn2_f64(src1 + ssrc1*hstep, ssrc1);
+                npyv_f64 r0 = npyv_add_f64(a.val[0], b0);
+                npyv_f64 r1 = npyv_add_f64(a.val[1], b1);
+                npyv_storen2_f64(dst, sdst, r0);
+                npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
+            }
+            for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {  // remainder
+            #if 0  // compiled out: variant passing 1.0 fill values
+                npyv_f64 b = npyv_loadn2_till_f64(src1, ssrc1, len, 1.0, 1.0);
+            #else  // compiled branch for add
+                npyv_f64 b = npyv_loadn2_tillz_f64(src1, ssrc1, len);
+            #endif
+                npyv_f64 r = npyv_add_f64(a.val[0], b);
+                npyv_storen2_till_f64(dst, sdst, len, r);
+            }
+        }
+        else {
+            goto loop_scalar;
+        }
+    }
+    // scalar 1: second operand has zero stride (mirror of the "scalar 0" case)
+    else if (ssrc1 == 0) {
+        npyv_f64x2 b = simd_set2_f64(src1);  // presumably replicates the complex value at src1 into both vectors -- confirm in helper
+        if (ssrc0 == 2 && sdst == ssrc0) {  // contig: first input and output are contiguous
+            for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
+                npyv_f64 a0 = npyv_load_f64(src0);
+                npyv_f64 a1 = npyv_load_f64(src0 + vstep);
+                npyv_f64 r0 = npyv_add_f64(a0, b.val[0]);
+                npyv_f64 r1 = npyv_add_f64(a1, b.val[1]);
+                npyv_store_f64(dst, r0);
+                npyv_store_f64(dst + vstep, r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {  // remainder
+            #if 0  // compiled out: variant passing 1.0 fill values
+                npyv_f64 a = npyv_load2_till_f64(src0, len, 1.0, 1.0);
+            #else  // compiled branch for add
+                npyv_f64 a = npyv_load2_tillz_f64(src0, len);
+            #endif
+                npyv_f64 r = npyv_add_f64(a, b.val[0]);
+                npyv_store2_till_f64(dst, len, r);
+            }
+        }
+        // non-contig: strided first input and/or output, when both strides are accepted
+        else if (loadable0 && storable) {
+            for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
+                npyv_f64 a0 = npyv_loadn2_f64(src0, ssrc0);
+                npyv_f64 a1 = npyv_loadn2_f64(src0 + ssrc0*hstep, ssrc0);
+                npyv_f64 r0 = npyv_add_f64(a0, b.val[0]);
+                npyv_f64 r1 = npyv_add_f64(a1, b.val[1]);
+                npyv_storen2_f64(dst, sdst, r0);
+                npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
+            }
+            for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {  // remainder
+            #if 0  // compiled out: variant passing 1.0 fill values
+                npyv_f64 a = npyv_loadn2_till_f64(src0, ssrc0, len, 1.0, 1.0);
+            #else  // compiled branch for add
+                npyv_f64 a = npyv_loadn2_tillz_f64(src0, ssrc0, len);
+            #endif
+                npyv_f64 r = npyv_add_f64(a, b.val[0]);
+                npyv_storen2_till_f64(dst, sdst, len, r);
+            }
+        }
+        else {
+            goto loop_scalar;
+        }
+    }
+    #if 0  // compiled out for add: two strided inputs fall through to the scalar loop instead
+    // non-contig
+    else if (loadable0 && loadable1 && storable) {
+        for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
+                            src1 += ssrc1*vstep, dst += sdst*vstep
+        ) {
+            npyv_f64 a0 = npyv_loadn2_f64(src0, ssrc0);
+            npyv_f64 a1 = npyv_loadn2_f64(src0 + ssrc0*hstep, ssrc0);
+            npyv_f64 b0 = npyv_loadn2_f64(src1, ssrc1);
+            npyv_f64 b1 = npyv_loadn2_f64(src1 + ssrc1*hstep, ssrc1);
+            npyv_f64 r0 = npyv_add_f64(a0, b0);
+            npyv_f64 r1 = npyv_add_f64(a1, b1);
+            npyv_storen2_f64(dst, sdst, r0);
+            npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
+        }
+        for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
+                       src1 += ssrc1*hstep, dst += sdst*hstep
+        ) {
+        #if 0
+            npyv_f64 a = npyv_loadn2_till_f64(src0, ssrc0, len, 1.0, 1.0);
+            npyv_f64 b = npyv_loadn2_till_f64(src1, ssrc1, len, 1.0, 1.0);
+        #else
+            npyv_f64 a = npyv_loadn2_tillz_f64(src0, ssrc0, len);
+            npyv_f64 b = npyv_loadn2_tillz_f64(src1, ssrc1, len);
+        #endif
+            npyv_f64 r = npyv_add_f64(a, b);
+            npyv_storen2_till_f64(dst, sdst, len, r);
+        }
+    }
+    #endif
+    else {  // no vector specialisation for this stride combination
+        goto loop_scalar;
+    }
+    npyv_cleanup();
+    return;  // vector path handled every element
+loop_scalar:
+#endif
+    for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {  // scalar path, byte strides
+        const npy_double a_r = ((npy_double *)b_src0)[0];  // all four parts are read before any store
+        const npy_double a_i = ((npy_double *)b_src0)[1];
+        const npy_double b_r = ((npy_double *)b_src1)[0];
+        const npy_double b_i = ((npy_double *)b_src1)[1];
+    #if 0  // compiled out: complex product form
+        ((npy_double *)b_dst)[0] = a_r*b_r - a_i*b_i;
+        ((npy_double *)b_dst)[1] = a_r*b_i + a_i*b_r;
+    #else  // compiled branch: component-wise sum
+        ((npy_double *)b_dst)[0] = a_r + b_r;
+        ((npy_double *)b_dst)[1] = a_i + b_i;
+    #endif
+    }
+}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(CDOUBLE_add_indexed)  // in-place complex-double add at indexed slots: ip1[indx] += value
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];  // base byte pointer of the array updated in place
+    char *indxp = args[1];  // cursor over the npy_intp indices
+    char *value = args[2];  // cursor over the (real, imag) operands
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];  // byte strides: target element, index, operand
+    npy_intp shape = steps[3];  // amount added to a negative index to wrap it
+    npy_intp n = dimensions[0];  // number of (index, value) pairs to apply
+    npy_intp i;
+    npy_double *indexed;
+    for(i = 0; i < n; i++, indxp += isindex, value += isb) {
+        npy_intp indx = *(npy_intp *)indxp;
+        if (indx < 0) {  // wrap once; no bounds check is done in this function
+            indx += shape;
+        }
+        indexed = (npy_double *)(ip1 + is1 * indx);  // byte offset first, then view as doubles
+        const npy_double b_r = ((npy_double *)value)[0];
+        const npy_double b_i = ((npy_double *)value)[1];
+    #if 0  // compiled out: complex product form
+        const npy_double a_r = indexed[0];
+        const npy_double a_i = indexed[1];
+        indexed[0] = a_r*b_r - a_i*b_i;
+        indexed[1] = a_r*b_i + a_i*b_r;
+    #else  // compiled branch: component-wise accumulation
+        indexed[0] += b_r;
+        indexed[1] += b_i;
+    #endif
+    }
+    return 0;  // no failure path; always returns 0
+}
+
+#line 374
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CDOUBLE_subtract)  // binary loop: dst = src0 - src1 over (real, imag) npy_double pairs
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    npy_intp len = dimensions[0];  // number of complex elements to process
+    char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];  // byte pointers: two inputs, one output
+    npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];  // strides in bytes
+#if 0  // reduction shortcut is compiled out for subtract
+    // reduce
+    if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
+        b_ssrc1 % (sizeof(npy_double)*2) == 0
+    ) {
+        npy_double *rl_im = (npy_double *)b_src0;
+        npy_double rr, ri;
+        CDOUBLE_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);
+        rl_im[0] -= rr;
+        rl_im[1] -= ri;
+        return;
+    }
+#endif
+#if NPY_SIMD_F64  // vector path; the scalar loop below is the fallback
+    // Certain versions of Apple clang (commonly used in CI images) produce
+    // non-deterministic output in the mul path with AVX2 enabled on x86_64.
+    // Work around by scalarising. The guard starts with `#if 0` here, so it is inactive for subtract.
+    #if 0 \
+            && defined(NPY_CPU_AMD64) && defined(__clang__) \
+            && defined(__apple_build_version__) \
+            && __apple_build_version__ >= 14000000 \
+            && __apple_build_version__ < 14030000
+        goto loop_scalar;
+    #endif  // end affected Apple clang.
+    if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||  // either input overlapping the output (per is_mem_overlap),
+        is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||  // a zero output stride, or strides that are not whole
+        b_sdst  % sizeof(npy_double) != 0 || b_sdst == 0 ||     // multiples of a double all go to the scalar loop
+        b_ssrc0 % sizeof(npy_double) != 0 ||
+        b_ssrc1 % sizeof(npy_double) != 0
+    ) {
+        goto loop_scalar;
+    }
+    const npy_double *src0 = (npy_double*)b_src0;
+    const npy_double *src1 = (npy_double*)b_src1;
+          npy_double *dst  = (npy_double*)b_dst;
+
+    const npy_intp ssrc0 = b_ssrc0 / sizeof(npy_double);  // strides converted from bytes to npy_double units
+    const npy_intp ssrc1 = b_ssrc1 / sizeof(npy_double);
+    const npy_intp sdst  = b_sdst / sizeof(npy_double);
+
+    const int vstep = npyv_nlanes_f64;  // double lanes per vector; also elements consumed per main-loop pass
+    const int wstep = vstep * 2;  // doubles spanned by two contiguous vectors
+    const int hstep = vstep / 2;  // complex elements covered by one vector
+
+    const int loadable0 = npyv_loadable_stride_s64(ssrc0);  // whether the strided load/store helpers accept each stride
+    const int loadable1 = npyv_loadable_stride_s64(ssrc1);
+    const int storable = npyv_storable_stride_s64(sdst);
+
+    // lots**lots of specializations, to squeeze out max performance
+    // contig: both inputs and the output advance by 2 doubles (one complex element)
+    if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
+        for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {  // two full vectors per pass
+            npyv_f64 a0 = npyv_load_f64(src0);
+            npyv_f64 a1 = npyv_load_f64(src0 + vstep);
+            npyv_f64 b0 = npyv_load_f64(src1);
+            npyv_f64 b1 = npyv_load_f64(src1 + vstep);
+            npyv_f64 r0 = npyv_sub_f64(a0, b0);
+            npyv_f64 r1 = npyv_sub_f64(a1, b1);
+            npyv_store_f64(dst, r0);
+            npyv_store_f64(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {  // remainder, bounded by len
+            npyv_f64 a = npyv_load2_tillz_f64(src0, len);
+            npyv_f64 b = npyv_load2_tillz_f64(src1, len);
+            npyv_f64 r = npyv_sub_f64(a, b);
+            npyv_store2_till_f64(dst, len, r);
+        }
+    }
+    // scalar 0: first operand (the minuend) has zero stride, one value reused for every element
+    else if (ssrc0 == 0) {
+        npyv_f64x2 a = simd_set2_f64(src0);  // presumably replicates the complex value at src0 into both vectors -- confirm in helper
+        // contig: second input and output are contiguous
+        if (ssrc1 == 2 && sdst == ssrc1) {
+            for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
+                npyv_f64 b0 = npyv_load_f64(src1);
+                npyv_f64 b1 = npyv_load_f64(src1 + vstep);
+                npyv_f64 r0 = npyv_sub_f64(a.val[0], b0);
+                npyv_f64 r1 = npyv_sub_f64(a.val[1], b1);
+                npyv_store_f64(dst, r0);
+                npyv_store_f64(dst + vstep, r1);
+            }
+            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {  // remainder
+            #if 0  // compiled out: variant passing 1.0 fill values
+                npyv_f64 b = npyv_load2_till_f64(src1, len, 1.0, 1.0);
+            #else  // compiled branch for subtract
+                npyv_f64 b = npyv_load2_tillz_f64(src1, len);
+            #endif
+                npyv_f64 r = npyv_sub_f64(a.val[0], b);
+                npyv_store2_till_f64(dst, len, r);
+            }
+        }
+        // non-contig: strided second input and/or output, when both strides are accepted
+        else if (loadable1 && storable) {
+            for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
+                npyv_f64 b0 = npyv_loadn2_f64(src1, ssrc1);
+                npyv_f64 b1 = npyv_loadn2_f64(src1 + ssrc1*hstep, ssrc1);
+                npyv_f64 r0 = npyv_sub_f64(a.val[0], b0);
+                npyv_f64 r1 = npyv_sub_f64(a.val[1], b1);
+                npyv_storen2_f64(dst, sdst, r0);
+                npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
+            }
+            for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {  // remainder
+            #if 0  // compiled out: variant passing 1.0 fill values
+                npyv_f64 b = npyv_loadn2_till_f64(src1, ssrc1, len, 1.0, 1.0);
+            #else  // compiled branch for subtract
+                npyv_f64 b = npyv_loadn2_tillz_f64(src1, ssrc1, len);
+            #endif
+                npyv_f64 r = npyv_sub_f64(a.val[0], b);
+                npyv_storen2_till_f64(dst, sdst, len, r);
+            }
+        }
+        else {
+            goto loop_scalar;
+        }
+    }
+    // scalar 1: second operand (the subtrahend) has zero stride
+    else if (ssrc1 == 0) {
+        npyv_f64x2 b = simd_set2_f64(src1);  // presumably replicates the complex value at src1 into both vectors -- confirm in helper
+        if (ssrc0 == 2 && sdst == ssrc0) {  // contig: first input and output are contiguous
+            for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
+                npyv_f64 a0 = npyv_load_f64(src0);
+                npyv_f64 a1 = npyv_load_f64(src0 + vstep);
+                npyv_f64 r0 = npyv_sub_f64(a0, b.val[0]);
+                npyv_f64 r1 = npyv_sub_f64(a1, b.val[1]);
+                npyv_store_f64(dst, r0);
+                npyv_store_f64(dst + vstep, r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {  // remainder
+            #if 0  // compiled out: variant passing 1.0 fill values
+                npyv_f64 a = npyv_load2_till_f64(src0, len, 1.0, 1.0);
+            #else  // compiled branch for subtract
+                npyv_f64 a = npyv_load2_tillz_f64(src0, len);
+            #endif
+                npyv_f64 r = npyv_sub_f64(a, b.val[0]);
+                npyv_store2_till_f64(dst, len, r);
+            }
+        }
+        // non-contig: strided first input and/or output, when both strides are accepted
+        else if (loadable0 && storable) {
+            for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
+                npyv_f64 a0 = npyv_loadn2_f64(src0, ssrc0);
+                npyv_f64 a1 = npyv_loadn2_f64(src0 + ssrc0*hstep, ssrc0);
+                npyv_f64 r0 = npyv_sub_f64(a0, b.val[0]);
+                npyv_f64 r1 = npyv_sub_f64(a1, b.val[1]);
+                npyv_storen2_f64(dst, sdst, r0);
+                npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
+            }
+            for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {  // remainder
+            #if 0  // compiled out: variant passing 1.0 fill values
+                npyv_f64 a = npyv_loadn2_till_f64(src0, ssrc0, len, 1.0, 1.0);
+            #else  // compiled branch for subtract
+                npyv_f64 a = npyv_loadn2_tillz_f64(src0, ssrc0, len);
+            #endif
+                npyv_f64 r = npyv_sub_f64(a, b.val[0]);
+                npyv_storen2_till_f64(dst, sdst, len, r);
+            }
+        }
+        else {
+            goto loop_scalar;
+        }
+    }
+    #if 0  // compiled out for subtract: two strided inputs fall through to the scalar loop instead
+    // non-contig
+    else if (loadable0 && loadable1 && storable) {
+        for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
+                            src1 += ssrc1*vstep, dst += sdst*vstep
+        ) {
+            npyv_f64 a0 = npyv_loadn2_f64(src0, ssrc0);
+            npyv_f64 a1 = npyv_loadn2_f64(src0 + ssrc0*hstep, ssrc0);
+            npyv_f64 b0 = npyv_loadn2_f64(src1, ssrc1);
+            npyv_f64 b1 = npyv_loadn2_f64(src1 + ssrc1*hstep, ssrc1);
+            npyv_f64 r0 = npyv_sub_f64(a0, b0);
+            npyv_f64 r1 = npyv_sub_f64(a1, b1);
+            npyv_storen2_f64(dst, sdst, r0);
+            npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
+        }
+        for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
+                       src1 += ssrc1*hstep, dst += sdst*hstep
+        ) {
+        #if 0
+            npyv_f64 a = npyv_loadn2_till_f64(src0, ssrc0, len, 1.0, 1.0);
+            npyv_f64 b = npyv_loadn2_till_f64(src1, ssrc1, len, 1.0, 1.0);
+        #else
+            npyv_f64 a = npyv_loadn2_tillz_f64(src0, ssrc0, len);
+            npyv_f64 b = npyv_loadn2_tillz_f64(src1, ssrc1, len);
+        #endif
+            npyv_f64 r = npyv_sub_f64(a, b);
+            npyv_storen2_till_f64(dst, sdst, len, r);
+        }
+    }
+    #endif
+    else {  // no vector specialisation for this stride combination
+        goto loop_scalar;
+    }
+    npyv_cleanup();
+    return;  // vector path handled every element
+loop_scalar:
+#endif
+    for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {  // scalar path, byte strides
+        const npy_double a_r = ((npy_double *)b_src0)[0];  // all four parts are read before any store
+        const npy_double a_i = ((npy_double *)b_src0)[1];
+        const npy_double b_r = ((npy_double *)b_src1)[0];
+        const npy_double b_i = ((npy_double *)b_src1)[1];
+    #if 0  // compiled out: complex product form
+        ((npy_double *)b_dst)[0] = a_r*b_r - a_i*b_i;
+        ((npy_double *)b_dst)[1] = a_r*b_i + a_i*b_r;
+    #else  // compiled branch: component-wise difference
+        ((npy_double *)b_dst)[0] = a_r - b_r;
+        ((npy_double *)b_dst)[1] = a_i - b_i;
+    #endif
+    }
+}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(CDOUBLE_subtract_indexed)  // in-place complex-double subtract at indexed slots: ip1[indx] -= value
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *ip1 = args[0];  // base byte pointer of the array updated in place
+    char *indxp = args[1];  // cursor over the npy_intp indices
+    char *value = args[2];  // cursor over the (real, imag) operands
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];  // byte strides: target element, index, operand
+    npy_intp shape = steps[3];  // amount added to a negative index to wrap it
+    npy_intp n = dimensions[0];  // number of (index, value) pairs to apply
+    npy_intp i;
+    npy_double *indexed;
+    for(i = 0; i < n; i++, indxp += isindex, value += isb) {
+        npy_intp indx = *(npy_intp *)indxp;
+        if (indx < 0) {  // wrap once; no bounds check is done in this function
+            indx += shape;
+        }
+        indexed = (npy_double *)(ip1 + is1 * indx);  // byte offset first, then view as doubles
+        const npy_double b_r = ((npy_double *)value)[0];
+        const npy_double b_i = ((npy_double *)value)[1];
+    #if 0  // compiled out: complex product form
+        const npy_double a_r = indexed[0];
+        const npy_double a_i = indexed[1];
+        indexed[0] = a_r*b_r - a_i*b_i;
+        indexed[1] = a_r*b_i + a_i*b_r;
+    #else  // compiled branch: component-wise subtraction
+        indexed[0] -= b_r;
+        indexed[1] -= b_i;
+    #endif
+    }
+    return 0;  // no failure path; always returns 0
+}
+
+#line 374
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CDOUBLE_multiply)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    npy_intp len = dimensions[0];
+    char *b_src0 = args[0], *b_src1 = args[1], *b_dst = args[2];
+    npy_intp b_ssrc0 = steps[0], b_ssrc1 = steps[1], b_sdst = steps[2];
+#if 0
+    // reduce
+    if (b_ssrc0 == 0 && b_ssrc0 == b_sdst && b_src0 == b_dst &&
+        b_ssrc1 % (sizeof(npy_double)*2) == 0
+    ) {
+        npy_double *rl_im = (npy_double *)b_src0;
+        npy_double rr, ri;
+        CDOUBLE_pairwise_sum(&rr, &ri, b_src1, len * 2, b_ssrc1 / 2);
+        rl_im[0] *= rr;
+        rl_im[1] *= ri;
+        return;
+    }
+#endif
+#if NPY_SIMD_F64
+    // Certain versions of Apple clang (commonly used in CI images) produce
+    // non-deterministic output in the mul path with AVX2 enabled on x86_64.
+    // Work around by scalarising.
+    #if 1 \
+            && defined(NPY_CPU_AMD64) && defined(__clang__) \
+            && defined(__apple_build_version__) \
+            && __apple_build_version__ >= 14000000 \
+            && __apple_build_version__ < 14030000
+        goto loop_scalar;
+    #endif  // end affected Apple clang.
+    if (is_mem_overlap(b_src0, b_ssrc0, b_dst, b_sdst, len) ||
+        is_mem_overlap(b_src1, b_ssrc1, b_dst, b_sdst, len) ||
+        b_sdst  % sizeof(npy_double) != 0 || b_sdst == 0 ||
+        b_ssrc0 % sizeof(npy_double) != 0 ||
+        b_ssrc1 % sizeof(npy_double) != 0
+    ) {
+        goto loop_scalar;
+    }
+    const npy_double *src0 = (npy_double*)b_src0;
+    const npy_double *src1 = (npy_double*)b_src1;
+          npy_double *dst  = (npy_double*)b_dst;
+
+    const npy_intp ssrc0 = b_ssrc0 / sizeof(npy_double);
+    const npy_intp ssrc1 = b_ssrc1 / sizeof(npy_double);
+    const npy_intp sdst  = b_sdst / sizeof(npy_double);
+
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 2;
+    const int hstep = vstep / 2;
+
+    const int loadable0 = npyv_loadable_stride_s64(ssrc0);
+    const int loadable1 = npyv_loadable_stride_s64(ssrc1);
+    const int storable = npyv_storable_stride_s64(sdst);
+
+    // lots**lots of specializations, to squeeze out max performance
+    // contig
+    if (ssrc0 == 2 && ssrc0 == ssrc1 && ssrc0 == sdst) {
+        for (; len >= vstep; len -= vstep, src0 += wstep, src1 += wstep, dst += wstep) {
+            npyv_f64 a0 = npyv_load_f64(src0);
+            npyv_f64 a1 = npyv_load_f64(src0 + vstep);
+            npyv_f64 b0 = npyv_load_f64(src1);
+            npyv_f64 b1 = npyv_load_f64(src1 + vstep);
+            npyv_f64 r0 = simd_cmul_f64(a0, b0);
+            npyv_f64 r1 = simd_cmul_f64(a1, b1);
+            npyv_store_f64(dst, r0);
+            npyv_store_f64(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src0 += vstep, src1 += vstep, dst += vstep) {
+            npyv_f64 a = npyv_load2_tillz_f64(src0, len);
+            npyv_f64 b = npyv_load2_tillz_f64(src1, len);
+            npyv_f64 r = simd_cmul_f64(a, b);
+            npyv_store2_till_f64(dst, len, r);
+        }
+    }
+    // scalar 0
+    else if (ssrc0 == 0) {
+        npyv_f64x2 a = simd_set2_f64(src0);
+        // contig
+        if (ssrc1 == 2 && sdst == ssrc1) {
+            for (; len >= vstep; len -= vstep, src1 += wstep, dst += wstep) {
+                npyv_f64 b0 = npyv_load_f64(src1);
+                npyv_f64 b1 = npyv_load_f64(src1 + vstep);
+                npyv_f64 r0 = simd_cmul_f64(a.val[0], b0);
+                npyv_f64 r1 = simd_cmul_f64(a.val[1], b1);
+                npyv_store_f64(dst, r0);
+                npyv_store_f64(dst + vstep, r1);
+            }
+            for (; len > 0; len -= hstep, src1 += vstep, dst += vstep) {
+            #if 1
+                npyv_f64 b = npyv_load2_till_f64(src1, len, 1.0, 1.0);
+            #else
+                npyv_f64 b = npyv_load2_tillz_f64(src1, len);
+            #endif
+                npyv_f64 r = simd_cmul_f64(a.val[0], b);
+                npyv_store2_till_f64(dst, len, r);
+            }
+        }
+        // non-contig
+        else if (loadable1 && storable) {
+            for (; len >= vstep; len -= vstep, src1 += ssrc1*vstep, dst += sdst*vstep) {
+                npyv_f64 b0 = npyv_loadn2_f64(src1, ssrc1);
+                npyv_f64 b1 = npyv_loadn2_f64(src1 + ssrc1*hstep, ssrc1);
+                npyv_f64 r0 = simd_cmul_f64(a.val[0], b0);
+                npyv_f64 r1 = simd_cmul_f64(a.val[1], b1);
+                npyv_storen2_f64(dst, sdst, r0);
+                npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
+            }
+            for (; len > 0; len -= hstep, src1 += ssrc1*hstep, dst += sdst*hstep) {
+            #if 1
+                npyv_f64 b = npyv_loadn2_till_f64(src1, ssrc1, len, 1.0, 1.0);
+            #else
+                npyv_f64 b = npyv_loadn2_tillz_f64(src1, ssrc1, len);
+            #endif
+                npyv_f64 r = simd_cmul_f64(a.val[0], b);
+                npyv_storen2_till_f64(dst, sdst, len, r);
+            }
+        }
+        else {
+            goto loop_scalar;
+        }
+    }
+    // scalar 1
+    else if (ssrc1 == 0) {
+        npyv_f64x2 b = simd_set2_f64(src1);
+        if (ssrc0 == 2 && sdst == ssrc0) {
+            for (; len >= vstep; len -= vstep, src0 += wstep, dst += wstep) {
+                npyv_f64 a0 = npyv_load_f64(src0);
+                npyv_f64 a1 = npyv_load_f64(src0 + vstep);
+                npyv_f64 r0 = simd_cmul_f64(a0, b.val[0]);
+                npyv_f64 r1 = simd_cmul_f64(a1, b.val[1]);
+                npyv_store_f64(dst, r0);
+                npyv_store_f64(dst + vstep, r1);
+            }
+            for (; len > 0; len -= hstep, src0 += vstep, dst += vstep) {
+            #if 1
+                npyv_f64 a = npyv_load2_till_f64(src0, len, 1.0, 1.0);
+            #else
+                npyv_f64 a = npyv_load2_tillz_f64(src0, len);
+            #endif
+                npyv_f64 r = simd_cmul_f64(a, b.val[0]);
+                npyv_store2_till_f64(dst, len, r);
+            }
+        }
+        // non-contig
+        else if (loadable0 && storable) {
+            for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep, dst += sdst*vstep) {
+                npyv_f64 a0 = npyv_loadn2_f64(src0, ssrc0);
+                npyv_f64 a1 = npyv_loadn2_f64(src0 + ssrc0*hstep, ssrc0);
+                npyv_f64 r0 = simd_cmul_f64(a0, b.val[0]);
+                npyv_f64 r1 = simd_cmul_f64(a1, b.val[1]);
+                npyv_storen2_f64(dst, sdst, r0);
+                npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
+            }
+            for (; len > 0; len -= hstep, src0 += ssrc0*hstep, dst += sdst*hstep) {
+            #if 1
+                npyv_f64 a = npyv_loadn2_till_f64(src0, ssrc0, len, 1.0, 1.0);
+            #else
+                npyv_f64 a = npyv_loadn2_tillz_f64(src0, ssrc0, len);
+            #endif
+                npyv_f64 r = simd_cmul_f64(a, b.val[0]);
+                npyv_storen2_till_f64(dst, sdst, len, r);
+            }
+        }
+        else {
+            goto loop_scalar;
+        }
+    }
+    #if 1
+    // non-contig
+    else if (loadable0 && loadable1 && storable) {
+        for (; len >= vstep; len -= vstep, src0 += ssrc0*vstep,
+                            src1 += ssrc1*vstep, dst += sdst*vstep
+        ) {
+            npyv_f64 a0 = npyv_loadn2_f64(src0, ssrc0);
+            npyv_f64 a1 = npyv_loadn2_f64(src0 + ssrc0*hstep, ssrc0);
+            npyv_f64 b0 = npyv_loadn2_f64(src1, ssrc1);
+            npyv_f64 b1 = npyv_loadn2_f64(src1 + ssrc1*hstep, ssrc1);
+            npyv_f64 r0 = simd_cmul_f64(a0, b0);
+            npyv_f64 r1 = simd_cmul_f64(a1, b1);
+            npyv_storen2_f64(dst, sdst, r0);
+            npyv_storen2_f64(dst + sdst*hstep, sdst, r1);
+        }
+        for (; len > 0; len -= hstep, src0 += ssrc0*hstep,
+                       src1 += ssrc1*hstep, dst += sdst*hstep
+        ) {
+        #if 1
+            npyv_f64 a = npyv_loadn2_till_f64(src0, ssrc0, len, 1.0, 1.0);
+            npyv_f64 b = npyv_loadn2_till_f64(src1, ssrc1, len, 1.0, 1.0);
+        #else
+            npyv_f64 a = npyv_loadn2_tillz_f64(src0, ssrc0, len);
+            npyv_f64 b = npyv_loadn2_tillz_f64(src1, ssrc1, len);
+        #endif
+            npyv_f64 r = simd_cmul_f64(a, b);
+            npyv_storen2_till_f64(dst, sdst, len, r);
+        }
+    }
+    #endif
+    else {
+        goto loop_scalar;
+    }
+    npyv_cleanup();
+    return;
+loop_scalar:
+#endif
+    for (; len > 0; --len, b_src0 += b_ssrc0, b_src1 += b_ssrc1, b_dst += b_sdst) {
+        const npy_double a_r = ((npy_double *)b_src0)[0];
+        const npy_double a_i = ((npy_double *)b_src0)[1];
+        const npy_double b_r = ((npy_double *)b_src1)[0];
+        const npy_double b_i = ((npy_double *)b_src1)[1];
+    #if 1
+        ((npy_double *)b_dst)[0] = a_r*b_r - a_i*b_i;
+        ((npy_double *)b_dst)[1] = a_r*b_i + a_i*b_r;
+    #else
+        ((npy_double *)b_dst)[0] = a_r * b_r;
+        ((npy_double *)b_dst)[1] = a_i * b_i;
+    #endif
+    }
+}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(CDOUBLE_multiply_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{   // Indexed in-place complex multiply: target[index[k]] *= value[k] for k in [0, dimensions[0]); always returns 0.
+    char *target = args[0];         // base of the array that is updated in place
+    char *index_cur = args[1];      // walks the npy_intp indices
+    char *value_cur = args[2];      // walks the complex multipliers
+    const npy_intp target_step = steps[0], index_step = steps[1], value_step = steps[2];
+    const npy_intp wrap = steps[3]; // added to negative indices; presumably the indexed length -- TODO confirm
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    npy_double *cell;
+    for (k = 0; k < count; ++k, index_cur += index_step, value_cur += value_step) {
+        npy_intp pos = *(npy_intp *)index_cur;
+        if (pos < 0) {
+            pos += wrap;
+        }
+        cell = (npy_double *)(target + target_step * pos);
+        const npy_double mul_re = ((npy_double *)value_cur)[0];
+        const npy_double mul_im = ((npy_double *)value_cur)[1];
+    #if 1
+        const npy_double cur_re = cell[0];  // snapshot both parts first: each output needs both inputs
+        const npy_double cur_im = cell[1];
+        cell[0] = cur_re*mul_re - cur_im*mul_im;
+        cell[1] = cur_re*mul_im + cur_im*mul_re;
+    #else
+        cell[0] *= mul_re;
+        cell[1] *= mul_im;
+    #endif
+    }
+    return 0;
+}
+
+
+#line 630
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CDOUBLE_conjugate)  // Unary loop: writes (re, -im) for every complex double read from the input.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    npy_intp len = dimensions[0];  // number of complex elements still to process
+    char *b_src = args[0], *b_dst = args[1];  // input / output as byte pointers
+    npy_intp b_ssrc = steps[0], b_sdst = steps[1];  // input / output strides in bytes
+#if NPY_SIMD_F64
+    if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) ||  // SIMD path is skipped when is_mem_overlap() reports overlap...
+        b_sdst % sizeof(npy_double) != 0 ||  // ...or when a stride is not a whole number of doubles
+        b_ssrc % sizeof(npy_double) != 0
+    ) {
+        goto loop_scalar;
+    }
+    const npy_double *src  = (npy_double*)b_src;
+          npy_double *dst  = (npy_double*)b_dst;
+    const npy_intp ssrc = b_ssrc / sizeof(npy_double);  // strides re-expressed in units of npy_double
+    const npy_intp sdst = b_sdst / sizeof(npy_double);
+
+    const int vstep = npyv_nlanes_f64;  // doubles per vector
+    const int wstep = vstep * 2;  // doubles covered by two vectors
+    const int hstep = vstep / 2;  // complex (re, im) pairs per vector
+
+    if (ssrc == 2 && ssrc == sdst) {  // both sides contiguous: one complex value every 2 doubles
+        for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) {  // two vectors = vstep complex values per iteration
+            npyv_f64 a0 = npyv_load_f64(src);
+            npyv_f64 a1 = npyv_load_f64(src + vstep);
+            npyv_f64 r0 = simd_cconjugate_f64(a0);
+            npyv_f64 r1 = simd_cconjugate_f64(a1);
+            npyv_store_f64(dst, r0);
+            npyv_store_f64(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src += vstep, dst += vstep) {  // remainder, one vector at a time
+            npyv_f64 a = npyv_load2_tillz_f64(src, len);  // partial load bounded by `len` (presumably zero-filled past it -- confirm in simd.h)
+            npyv_f64 r = simd_cconjugate_f64(a);
+            npyv_store2_till_f64(dst, len, r);  // partial store bounded by `len`
+        }
+    }
+    else if (ssrc == 2 && npyv_storable_stride_s64(sdst)) {  // contiguous input, strided output
+        for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) {
+            npyv_f64 a0 = npyv_load_f64(src);
+            npyv_f64 a1 = npyv_load_f64(src + vstep);
+            npyv_f64 r0 = simd_cconjugate_f64(a0);
+            npyv_f64 r1 = simd_cconjugate_f64(a1);
+            npyv_storen2_f64(dst, sdst, r0);
+            npyv_storen2_f64(dst + sdst*hstep, sdst, r1);  // second vector starts hstep complex values further on
+        }
+        for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) {  // remainder, one vector at a time
+            npyv_f64 a = npyv_load2_tillz_f64(src, len);
+            npyv_f64 r = simd_cconjugate_f64(a);
+            npyv_storen2_till_f64(dst, sdst, len, r);
+        }
+    }
+    else if (sdst == 2 && npyv_loadable_stride_s64(ssrc)) {  // strided input, contiguous output
+        for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) {
+            npyv_f64 a0 = npyv_loadn2_f64(src, ssrc);
+            npyv_f64 a1 = npyv_loadn2_f64(src + ssrc*hstep, ssrc);  // second vector starts hstep complex values further on
+            npyv_f64 r0 = simd_cconjugate_f64(a0);
+            npyv_f64 r1 = simd_cconjugate_f64(a1);
+            npyv_store_f64(dst, r0);
+            npyv_store_f64(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) {  // remainder, one vector at a time
+            npyv_f64 a = npyv_loadn2_tillz_f64((npy_double*)src, ssrc, len);
+            npyv_f64 r = simd_cconjugate_f64(a);
+            npyv_store2_till_f64(dst, len, r);
+        }
+    }
+    else {  // strided on both sides, or a stride the load/store helpers reject
+        goto loop_scalar;
+    }
+    npyv_cleanup();
+    return;
+loop_scalar:
+#endif
+    for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) {  // scalar fallback, stepping by the byte strides
+        const npy_double rl = ((npy_double *)b_src)[0];
+        const npy_double im = ((npy_double *)b_src)[1];
+    #if 0  // `square` formula from the shared template; compiled out in this instantiation
+        ((npy_double *)b_dst)[0] = rl*rl - im*im;
+        ((npy_double *)b_dst)[1] = rl*im + im*rl;
+    #else  // conjugate: keep the real part, negate the imaginary part
+        ((npy_double *)b_dst)[0] = rl;
+        ((npy_double *)b_dst)[1] = -im;
+    #endif
+    }
+}
+
+#line 630
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CDOUBLE_square)  // Unary loop: writes z*z for every complex double z read from the input.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    npy_intp len = dimensions[0];  // number of complex elements still to process
+    char *b_src = args[0], *b_dst = args[1];  // input / output as byte pointers
+    npy_intp b_ssrc = steps[0], b_sdst = steps[1];  // input / output strides in bytes
+#if NPY_SIMD_F64
+    if (is_mem_overlap(b_src, b_ssrc, b_dst, b_sdst, len) ||  // SIMD path is skipped when is_mem_overlap() reports overlap...
+        b_sdst % sizeof(npy_double) != 0 ||  // ...or when a stride is not a whole number of doubles
+        b_ssrc % sizeof(npy_double) != 0
+    ) {
+        goto loop_scalar;
+    }
+    const npy_double *src  = (npy_double*)b_src;
+          npy_double *dst  = (npy_double*)b_dst;
+    const npy_intp ssrc = b_ssrc / sizeof(npy_double);  // strides re-expressed in units of npy_double
+    const npy_intp sdst = b_sdst / sizeof(npy_double);
+
+    const int vstep = npyv_nlanes_f64;  // doubles per vector
+    const int wstep = vstep * 2;  // doubles covered by two vectors
+    const int hstep = vstep / 2;  // complex (re, im) pairs per vector
+
+    if (ssrc == 2 && ssrc == sdst) {  // both sides contiguous: one complex value every 2 doubles
+        for (; len >= vstep; len -= vstep, src += wstep, dst += wstep) {  // two vectors = vstep complex values per iteration
+            npyv_f64 a0 = npyv_load_f64(src);
+            npyv_f64 a1 = npyv_load_f64(src + vstep);
+            npyv_f64 r0 = simd_csquare_f64(a0);
+            npyv_f64 r1 = simd_csquare_f64(a1);
+            npyv_store_f64(dst, r0);
+            npyv_store_f64(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src += vstep, dst += vstep) {  // remainder, one vector at a time
+            npyv_f64 a = npyv_load2_tillz_f64(src, len);  // partial load bounded by `len` (presumably zero-filled past it -- confirm in simd.h)
+            npyv_f64 r = simd_csquare_f64(a);
+            npyv_store2_till_f64(dst, len, r);  // partial store bounded by `len`
+        }
+    }
+    else if (ssrc == 2 && npyv_storable_stride_s64(sdst)) {  // contiguous input, strided output
+        for (; len >= vstep; len -= vstep, src += wstep, dst += sdst*vstep) {
+            npyv_f64 a0 = npyv_load_f64(src);
+            npyv_f64 a1 = npyv_load_f64(src + vstep);
+            npyv_f64 r0 = simd_csquare_f64(a0);
+            npyv_f64 r1 = simd_csquare_f64(a1);
+            npyv_storen2_f64(dst, sdst, r0);
+            npyv_storen2_f64(dst + sdst*hstep, sdst, r1);  // second vector starts hstep complex values further on
+        }
+        for (; len > 0; len -= hstep, src += vstep, dst += sdst*hstep) {  // remainder, one vector at a time
+            npyv_f64 a = npyv_load2_tillz_f64(src, len);
+            npyv_f64 r = simd_csquare_f64(a);
+            npyv_storen2_till_f64(dst, sdst, len, r);
+        }
+    }
+    else if (sdst == 2 && npyv_loadable_stride_s64(ssrc)) {  // strided input, contiguous output
+        for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += wstep) {
+            npyv_f64 a0 = npyv_loadn2_f64(src, ssrc);
+            npyv_f64 a1 = npyv_loadn2_f64(src + ssrc*hstep, ssrc);  // second vector starts hstep complex values further on
+            npyv_f64 r0 = simd_csquare_f64(a0);
+            npyv_f64 r1 = simd_csquare_f64(a1);
+            npyv_store_f64(dst, r0);
+            npyv_store_f64(dst + vstep, r1);
+        }
+        for (; len > 0; len -= hstep, src += ssrc*hstep, dst += vstep) {  // remainder, one vector at a time
+            npyv_f64 a = npyv_loadn2_tillz_f64((npy_double*)src, ssrc, len);
+            npyv_f64 r = simd_csquare_f64(a);
+            npyv_store2_till_f64(dst, len, r);
+        }
+    }
+    else {  // strided on both sides, or a stride the load/store helpers reject
+        goto loop_scalar;
+    }
+    npyv_cleanup();
+    return;
+loop_scalar:
+#endif
+    for (; len > 0; --len, b_src += b_ssrc, b_dst += b_sdst) {  // scalar fallback, stepping by the byte strides
+        const npy_double rl = ((npy_double *)b_src)[0];
+        const npy_double im = ((npy_double *)b_src)[1];
+    #if 1  // square: (rl + i*im)^2 = (rl*rl - im*im) + i*(rl*im + im*rl)
+        ((npy_double *)b_dst)[0] = rl*rl - im*im;
+        ((npy_double *)b_dst)[1] = rl*im + im*rl;
+    #else  // `conjugate` formula from the shared template; compiled out in this instantiation
+        ((npy_double *)b_dst)[0] = rl;
+        ((npy_double *)b_dst)[1] = -im;
+    #endif
+    }
+}
+
+
+
diff --git a/numpy/core/src/_generated/loops_arithmetic.dispatch.c b/numpy/core/src/_generated/loops_arithmetic.dispatch.c
new file mode 100644
index 000000000000..e213d7111876
--- /dev/null
+++ b/numpy/core/src/_generated/loops_arithmetic.dispatch.c
@@ -0,0 +1,1985 @@
+#line 1 "numpy/core/src/umath/loops_arithmetic.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*@targets
+ ** $maxopt baseline
+ ** sse2 sse41 avx2 avx512f avx512_skx
+ ** vsx2 vsx4
+ ** neon
+ ** vx
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+//###############################################################################
+//## Division
+//###############################################################################
+/********************************************************************************
+ ** Defining the SIMD kernels
+ *
+ * Floor division of signed integers is based on T. Granlund and P. L. Montgomery,
+ * "Division by invariant integers using multiplication" (see [Figure 6.1],
+ * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.1.2556).
+ * For details on TRUNC division, see simd/intdiv.h.
+ ***********************************************************************************
+ ** Figure 6.1: Signed division by run-time invariant divisor, rounded towards -INF
+ ***********************************************************************************
+ * For q = FLOOR(a/d), all sword:
+ *     sword -dsign = SRL(d, N - 1);
+ *     uword -nsign = (n < -dsign);
+ *     uword -qsign = EOR(-nsign, -dsign);
+ *     q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign);
+ ********************************************************************************/
+
+#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON)
+    // Due to integer 128-bit multiplication emulation, SIMD 64-bit division
+    // may not perform well on both neon and up to VSX3 compared to scalar
+    // division.
+    #define SIMD_DISABLE_DIV64_OPT
+#endif
+
+#if NPY_SIMD
+#line 52
+#if 8 < 64 || (8 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
+static inline void
+simd_divide_by_scalar_contig_s8(char **args, npy_intp len)
+{   // Floor-divides `len` contiguous s8 values at args[0] by the scalar *args[1]; quotients go to args[2].
+    npyv_lanetype_s8 *src   = (npyv_lanetype_s8 *) args[0];
+    npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[1];  // not checked for 0 here -- NOTE(review): presumably excluded by the caller
+    npyv_lanetype_s8 *dst   = (npyv_lanetype_s8 *) args[2];
+    const int vstep            = npyv_nlanes_s8;
+    const npyv_s8x3 divisor = npyv_divisor_s8(scalar);  // precomputed once for the invariant divisor
+
+    if (scalar == -1) {  // dividing by -1 is a negation; only NPY_MIN_INT8 has no representable negation
+        npyv_b8 noverflow  = npyv_cvt_b8_s8(npyv_setall_s8(-1));  // starts all-true; AND-ed with `a > min` on every iteration
+        const npyv_s8 vzero = npyv_zero_s8();
+        const npyv_s8 vmin  = npyv_setall_s8(NPY_MIN_INT8);
+        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+            npyv_s8 a       = npyv_load_s8(src);
+            npyv_b8 gt_min = npyv_cmpgt_s8(a, vmin);  // reuse the hoisted constant instead of rebuilding it
+            noverflow          = npyv_and_b8(noverflow, gt_min);
+            npyv_s8 neg     = npyv_ifsub_s8(gt_min, vzero, a, vmin);  // presumably 0 - a where gt_min holds, else min (matches the scalar tail)
+            npyv_store_s8(dst, neg);
+        }
+
+        int raise_err = npyv_tobits_b8(npyv_not_b8(noverflow)) != 0;  // non-zero if any vector lane equalled the minimum
+        for (; len > 0; --len, ++src, ++dst) {
+            npyv_lanetype_s8 a = *src;
+            if (a == NPY_MIN_INT8) {
+                raise_err = 1;
+                *dst  = NPY_MIN_INT8;
+            } else {
+                *dst = -a;
+            }
+        }
+        if (raise_err) {
+            npy_set_floatstatus_overflow();
+        }
+    } else {
+        const npyv_s8 nsign_d = npyv_setall_s8(scalar < 0);  // loop-invariant: 1 in every lane if the divisor is negative, else 0
+        const npyv_s8 vone    = npyv_setall_s8(1);           // loop-invariant mask that reduces a comparison result to 0/1
+        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+            npyv_s8  a         = npyv_load_s8(src);
+            npyv_s8  nsign_a   = npyv_and_s8(npyv_cvt_s8_b8(npyv_cmplt_s8(a, nsign_d)), vone);  // 1 where a < nsign_d, else 0
+            npyv_s8  diff_sign = npyv_sub_s8(nsign_a, nsign_d);
+            npyv_s8  to_ninf   = npyv_xor_s8(nsign_a, nsign_d);
+            npyv_s8  trunc     = npyv_divc_s8(npyv_add_s8(a, diff_sign), divisor);  // TRUNC((n + diff_sign) / d), as in Figure 6.1 of the file header
+            npyv_s8  floor     = npyv_sub_s8(trunc, to_ninf);
+            npyv_store_s8(dst, floor);
+        }
+
+        for (; len > 0; --len, ++src, ++dst) {
+            const npyv_lanetype_s8 a = *src;
+            npyv_lanetype_s8 r = a / scalar;
+            // Negative quotients need to be rounded down
+            if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) {
+                r--;
+            }
+            *dst = r;
+        }
+    }
+    npyv_cleanup();
+}
+#endif
+
+#line 52
+#if 16 < 64 || (16 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
+static inline void
+simd_divide_by_scalar_contig_s16(char **args, npy_intp len)
+{   // Floor-divides `len` contiguous s16 values at args[0] by the scalar *args[1]; quotients go to args[2].
+    npyv_lanetype_s16 *src   = (npyv_lanetype_s16 *) args[0];
+    npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[1];  // not checked for 0 here -- NOTE(review): presumably excluded by the caller
+    npyv_lanetype_s16 *dst   = (npyv_lanetype_s16 *) args[2];
+    const int vstep            = npyv_nlanes_s16;
+    const npyv_s16x3 divisor = npyv_divisor_s16(scalar);  // precomputed once for the invariant divisor
+
+    if (scalar == -1) {  // dividing by -1 is a negation; only NPY_MIN_INT16 has no representable negation
+        npyv_b16 noverflow  = npyv_cvt_b16_s16(npyv_setall_s16(-1));  // starts all-true; AND-ed with `a > min` on every iteration
+        const npyv_s16 vzero = npyv_zero_s16();
+        const npyv_s16 vmin  = npyv_setall_s16(NPY_MIN_INT16);
+        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+            npyv_s16 a       = npyv_load_s16(src);
+            npyv_b16 gt_min = npyv_cmpgt_s16(a, vmin);  // reuse the hoisted constant instead of rebuilding it
+            noverflow          = npyv_and_b16(noverflow, gt_min);
+            npyv_s16 neg     = npyv_ifsub_s16(gt_min, vzero, a, vmin);  // presumably 0 - a where gt_min holds, else min (matches the scalar tail)
+            npyv_store_s16(dst, neg);
+        }
+
+        int raise_err = npyv_tobits_b16(npyv_not_b16(noverflow)) != 0;  // non-zero if any vector lane equalled the minimum
+        for (; len > 0; --len, ++src, ++dst) {
+            npyv_lanetype_s16 a = *src;
+            if (a == NPY_MIN_INT16) {
+                raise_err = 1;
+                *dst  = NPY_MIN_INT16;
+            } else {
+                *dst = -a;
+            }
+        }
+        if (raise_err) {
+            npy_set_floatstatus_overflow();
+        }
+    } else {
+        const npyv_s16 nsign_d = npyv_setall_s16(scalar < 0);  // loop-invariant: 1 in every lane if the divisor is negative, else 0
+        const npyv_s16 vone    = npyv_setall_s16(1);           // loop-invariant mask that reduces a comparison result to 0/1
+        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+            npyv_s16  a         = npyv_load_s16(src);
+            npyv_s16  nsign_a   = npyv_and_s16(npyv_cvt_s16_b16(npyv_cmplt_s16(a, nsign_d)), vone);  // 1 where a < nsign_d, else 0
+            npyv_s16  diff_sign = npyv_sub_s16(nsign_a, nsign_d);
+            npyv_s16  to_ninf   = npyv_xor_s16(nsign_a, nsign_d);
+            npyv_s16  trunc     = npyv_divc_s16(npyv_add_s16(a, diff_sign), divisor);  // TRUNC((n + diff_sign) / d), as in Figure 6.1 of the file header
+            npyv_s16  floor     = npyv_sub_s16(trunc, to_ninf);
+            npyv_store_s16(dst, floor);
+        }
+
+        for (; len > 0; --len, ++src, ++dst) {
+            const npyv_lanetype_s16 a = *src;
+            npyv_lanetype_s16 r = a / scalar;
+            // Negative quotients need to be rounded down
+            if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) {
+                r--;
+            }
+            *dst = r;
+        }
+    }
+    npyv_cleanup();
+}
+#endif
+
+#line 52
+#if 32 < 64 || (32 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
+static inline void
+simd_divide_by_scalar_contig_s32(char **args, npy_intp len)
+{   // Floor-divides `len` contiguous s32 values at args[0] by the scalar *args[1]; quotients go to args[2].
+    npyv_lanetype_s32 *src   = (npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 scalar = *(npyv_lanetype_s32 *) args[1];  // not checked for 0 here -- NOTE(review): presumably excluded by the caller
+    npyv_lanetype_s32 *dst   = (npyv_lanetype_s32 *) args[2];
+    const int vstep            = npyv_nlanes_s32;
+    const npyv_s32x3 divisor = npyv_divisor_s32(scalar);  // precomputed once for the invariant divisor
+
+    if (scalar == -1) {  // dividing by -1 is a negation; only NPY_MIN_INT32 has no representable negation
+        npyv_b32 noverflow  = npyv_cvt_b32_s32(npyv_setall_s32(-1));  // starts all-true; AND-ed with `a > min` on every iteration
+        const npyv_s32 vzero = npyv_zero_s32();
+        const npyv_s32 vmin  = npyv_setall_s32(NPY_MIN_INT32);
+        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+            npyv_s32 a       = npyv_load_s32(src);
+            npyv_b32 gt_min = npyv_cmpgt_s32(a, vmin);  // reuse the hoisted constant instead of rebuilding it
+            noverflow          = npyv_and_b32(noverflow, gt_min);
+            npyv_s32 neg     = npyv_ifsub_s32(gt_min, vzero, a, vmin);  // presumably 0 - a where gt_min holds, else min (matches the scalar tail)
+            npyv_store_s32(dst, neg);
+        }
+
+        int raise_err = npyv_tobits_b32(npyv_not_b32(noverflow)) != 0;  // non-zero if any vector lane equalled the minimum
+        for (; len > 0; --len, ++src, ++dst) {
+            npyv_lanetype_s32 a = *src;
+            if (a == NPY_MIN_INT32) {
+                raise_err = 1;
+                *dst  = NPY_MIN_INT32;
+            } else {
+                *dst = -a;
+            }
+        }
+        if (raise_err) {
+            npy_set_floatstatus_overflow();
+        }
+    } else {
+        const npyv_s32 nsign_d = npyv_setall_s32(scalar < 0);  // loop-invariant: 1 in every lane if the divisor is negative, else 0
+        const npyv_s32 vone    = npyv_setall_s32(1);           // loop-invariant mask that reduces a comparison result to 0/1
+        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+            npyv_s32  a         = npyv_load_s32(src);
+            npyv_s32  nsign_a   = npyv_and_s32(npyv_cvt_s32_b32(npyv_cmplt_s32(a, nsign_d)), vone);  // 1 where a < nsign_d, else 0
+            npyv_s32  diff_sign = npyv_sub_s32(nsign_a, nsign_d);
+            npyv_s32  to_ninf   = npyv_xor_s32(nsign_a, nsign_d);
+            npyv_s32  trunc     = npyv_divc_s32(npyv_add_s32(a, diff_sign), divisor);  // TRUNC((n + diff_sign) / d), as in Figure 6.1 of the file header
+            npyv_s32  floor     = npyv_sub_s32(trunc, to_ninf);
+            npyv_store_s32(dst, floor);
+        }
+
+        for (; len > 0; --len, ++src, ++dst) {
+            const npyv_lanetype_s32 a = *src;
+            npyv_lanetype_s32 r = a / scalar;
+            // Negative quotients need to be rounded down
+            if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) {
+                r--;
+            }
+            *dst = r;
+        }
+    }
+    npyv_cleanup();
+}
+#endif
+
+#line 52
+#if 64 < 64 || (64 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
+static inline void
+simd_divide_by_scalar_contig_s64(char **args, npy_intp len)
+{   // Floor-divides `len` contiguous s64 values at args[0] by the scalar *args[1]; quotients go to args[2].
+    npyv_lanetype_s64 *src   = (npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 scalar = *(npyv_lanetype_s64 *) args[1];  // not checked for 0 here -- NOTE(review): presumably excluded by the caller
+    npyv_lanetype_s64 *dst   = (npyv_lanetype_s64 *) args[2];
+    const int vstep            = npyv_nlanes_s64;
+    const npyv_s64x3 divisor = npyv_divisor_s64(scalar);  // precomputed once for the invariant divisor
+
+    if (scalar == -1) {  // dividing by -1 is a negation; only NPY_MIN_INT64 has no representable negation
+        npyv_b64 noverflow  = npyv_cvt_b64_s64(npyv_setall_s64(-1));  // starts all-true; AND-ed with `a > min` on every iteration
+        const npyv_s64 vzero = npyv_zero_s64();
+        const npyv_s64 vmin  = npyv_setall_s64(NPY_MIN_INT64);
+        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+            npyv_s64 a       = npyv_load_s64(src);
+            npyv_b64 gt_min = npyv_cmpgt_s64(a, vmin);  // reuse the hoisted constant instead of rebuilding it
+            noverflow          = npyv_and_b64(noverflow, gt_min);
+            npyv_s64 neg     = npyv_ifsub_s64(gt_min, vzero, a, vmin);  // presumably 0 - a where gt_min holds, else min (matches the scalar tail)
+            npyv_store_s64(dst, neg);
+        }
+
+        int raise_err = npyv_tobits_b64(npyv_not_b64(noverflow)) != 0;  // non-zero if any vector lane equalled the minimum
+        for (; len > 0; --len, ++src, ++dst) {
+            npyv_lanetype_s64 a = *src;
+            if (a == NPY_MIN_INT64) {
+                raise_err = 1;
+                *dst  = NPY_MIN_INT64;
+            } else {
+                *dst = -a;
+            }
+        }
+        if (raise_err) {
+            npy_set_floatstatus_overflow();
+        }
+    } else {
+        const npyv_s64 nsign_d = npyv_setall_s64(scalar < 0);  // loop-invariant: 1 in every lane if the divisor is negative, else 0
+        const npyv_s64 vone    = npyv_setall_s64(1);           // loop-invariant mask that reduces a comparison result to 0/1
+        for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+            npyv_s64  a         = npyv_load_s64(src);
+            npyv_s64  nsign_a   = npyv_and_s64(npyv_cvt_s64_b64(npyv_cmplt_s64(a, nsign_d)), vone);  // 1 where a < nsign_d, else 0
+            npyv_s64  diff_sign = npyv_sub_s64(nsign_a, nsign_d);
+            npyv_s64  to_ninf   = npyv_xor_s64(nsign_a, nsign_d);
+            npyv_s64  trunc     = npyv_divc_s64(npyv_add_s64(a, diff_sign), divisor);  // TRUNC((n + diff_sign) / d), as in Figure 6.1 of the file header
+            npyv_s64  floor     = npyv_sub_s64(trunc, to_ninf);
+            npyv_store_s64(dst, floor);
+        }
+
+        for (; len > 0; --len, ++src, ++dst) {
+            const npyv_lanetype_s64 a = *src;
+            npyv_lanetype_s64 r = a / scalar;
+            // Negative quotients need to be rounded down
+            if (((a > 0) != (scalar > 0)) && ((r * scalar) != a)) {
+                r--;
+            }
+            *dst = r;
+        }
+    }
+    npyv_cleanup();
+}
+#endif
+
+
+#line 120
+#if 8 < 64 || (8 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
+static inline void
+simd_divide_by_scalar_contig_u8(char **args, npy_intp len)
+{   // Divides `len` contiguous u8 values at args[0] by the scalar *args[1]; quotients go to args[2].
+    const int lanes = npyv_nlanes_u8;
+    npyv_lanetype_u8 *in  = (npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 *out = (npyv_lanetype_u8 *) args[2];
+    const npyv_lanetype_u8 denom = *(npyv_lanetype_u8 *) args[1];
+    const npyv_u8x3 magic = npyv_divisor_u8(denom);  // precomputed once for the invariant divisor
+
+    for (; len >= lanes; len -= lanes, in += lanes, out += lanes) {  // one full vector per iteration
+        const npyv_u8 num  = npyv_load_u8(in);
+        const npyv_u8 quot = npyv_divc_u8(num, magic);
+        npyv_store_u8(out, quot);
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (; len > 0; --len) {
+        *out++ = *in++ / denom;
+    }
+    npyv_cleanup();
+}
+#endif
+
+#line 120
+#if 16 < 64 || (16 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
+static inline void
+simd_divide_by_scalar_contig_u16(char **args, npy_intp len)
+{   // Divides `len` contiguous u16 values at args[0] by the scalar *args[1]; quotients go to args[2].
+    const int lanes = npyv_nlanes_u16;
+    npyv_lanetype_u16 *in  = (npyv_lanetype_u16 *) args[0];
+    npyv_lanetype_u16 *out = (npyv_lanetype_u16 *) args[2];
+    const npyv_lanetype_u16 denom = *(npyv_lanetype_u16 *) args[1];
+    const npyv_u16x3 magic = npyv_divisor_u16(denom);  // precomputed once for the invariant divisor
+
+    for (; len >= lanes; len -= lanes, in += lanes, out += lanes) {  // one full vector per iteration
+        const npyv_u16 num  = npyv_load_u16(in);
+        const npyv_u16 quot = npyv_divc_u16(num, magic);
+        npyv_store_u16(out, quot);
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (; len > 0; --len) {
+        *out++ = *in++ / denom;
+    }
+    npyv_cleanup();
+}
+#endif
+
+#line 120
+#if 32 < 64 || (32 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
+static inline void
+simd_divide_by_scalar_contig_u32(char **args, npy_intp len)
+{   // Divides `len` contiguous u32 values at args[0] by the scalar *args[1]; quotients go to args[2].
+    const int lanes = npyv_nlanes_u32;
+    npyv_lanetype_u32 *in  = (npyv_lanetype_u32 *) args[0];
+    npyv_lanetype_u32 *out = (npyv_lanetype_u32 *) args[2];
+    const npyv_lanetype_u32 denom = *(npyv_lanetype_u32 *) args[1];
+    const npyv_u32x3 magic = npyv_divisor_u32(denom);  // precomputed once for the invariant divisor
+
+    for (; len >= lanes; len -= lanes, in += lanes, out += lanes) {  // one full vector per iteration
+        const npyv_u32 num  = npyv_load_u32(in);
+        const npyv_u32 quot = npyv_divc_u32(num, magic);
+        npyv_store_u32(out, quot);
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (; len > 0; --len) {
+        *out++ = *in++ / denom;
+    }
+    npyv_cleanup();
+}
+#endif
+
+#line 120
+#if 64 < 64 || (64 == 64 && !defined(SIMD_DISABLE_DIV64_OPT))
+static inline void
+simd_divide_by_scalar_contig_u64(char **args, npy_intp len)
+{   // Divides `len` contiguous u64 values at args[0] by the scalar *args[1]; quotients go to args[2].
+    const int lanes = npyv_nlanes_u64;
+    npyv_lanetype_u64 *in  = (npyv_lanetype_u64 *) args[0];
+    npyv_lanetype_u64 *out = (npyv_lanetype_u64 *) args[2];
+    const npyv_lanetype_u64 denom = *(npyv_lanetype_u64 *) args[1];
+    const npyv_u64x3 magic = npyv_divisor_u64(denom);  // precomputed once for the invariant divisor
+
+    for (; len >= lanes; len -= lanes, in += lanes, out += lanes) {  // one full vector per iteration
+        const npyv_u64 num  = npyv_load_u64(in);
+        const npyv_u64 quot = npyv_divc_u64(num, magic);
+        npyv_store_u64(out, quot);
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (; len > 0; --len) {
+        *out++ = *in++ / denom;
+    }
+    npyv_cleanup();
+}
+#endif
+
+
+#if defined(NPY_HAVE_VSX4)
+
+#line 151
+/*
+ * Computes the lane-wise quotient a / b of two unsigned 8-bit integer vectors.
+ *
+ * As Power10 only supports integer vector division for data of 32 bits or
+ * greater, we have to convert each npyv_u8 into 4x npyv_u32, execute the integer
+ * vector division instruction, and then convert the result back to npyv_u8.
+ */
+NPY_FINLINE npyv_u8
+vsx4_div_u8(npyv_u8 a, npyv_u8 b)
+{
+#if 0  // signed-lane expansion from the shared template; compiled out in this unsigned instantiation
+    npyv_s16x2 ta, tb;
+    npyv_s32x2 ahi, alo, bhi, blo;
+    ta.val[0] = vec_unpackh(a);
+    ta.val[1] = vec_unpackl(a);
+    tb.val[0] = vec_unpackh(b);
+    tb.val[1] = vec_unpackl(b);
+    ahi.val[0] = vec_unpackh(ta.val[0]);
+    ahi.val[1] = vec_unpackl(ta.val[0]);
+    alo.val[0] = vec_unpackh(ta.val[1]);
+    alo.val[1] = vec_unpackl(ta.val[1]);
+    bhi.val[0] = vec_unpackh(tb.val[0]);
+    bhi.val[1] = vec_unpackl(tb.val[0]);
+    blo.val[0] = vec_unpackh(tb.val[1]);
+    blo.val[1] = vec_unpackl(tb.val[1]);
+#else  // unsigned path: widen u8 -> u16 -> u32 with the npyv expand helpers
+    npyv_u16x2 a_expand = npyv_expand_u16_u8(a);
+    npyv_u16x2 b_expand = npyv_expand_u16_u8(b);
+    npyv_u32x2 ahi = npyv_expand_u32_u16(a_expand.val[0]);
+    npyv_u32x2 alo = npyv_expand_u32_u16(a_expand.val[1]);
+    npyv_u32x2 bhi = npyv_expand_u32_u16(b_expand.val[0]);
+    npyv_u32x2 blo = npyv_expand_u32_u16(b_expand.val[1]);
+#endif
+    npyv_u32 v1 = vec_div(ahi.val[0], bhi.val[0]);  // four 32-bit vector divisions cover all 8-bit lanes
+    npyv_u32 v2 = vec_div(ahi.val[1], bhi.val[1]);
+    npyv_u32 v3 = vec_div(alo.val[0], blo.val[0]);
+    npyv_u32 v4 = vec_div(alo.val[1], blo.val[1]);
+    npyv_u16 hi = vec_pack(v1, v2);  // narrow the quotients back: u32 -> u16
+    npyv_u16 lo = vec_pack(v3, v4);
+    return vec_pack(hi, lo);  // u16 -> u8
+}
+
+NPY_FINLINE npyv_u16
+vsx4_div_u16(npyv_u16 a, npyv_u16 b)  // Lane-wise a / b for unsigned 16-bit vectors, computed through 32-bit vec_div.
+{
+#if 0  // signed-lane expansion from the shared template; compiled out in this unsigned instantiation
+    npyv_s32x2 a_expand;
+    npyv_s32x2 b_expand;
+    a_expand.val[0] = vec_unpackh(a);
+    a_expand.val[1] = vec_unpackl(a);
+    b_expand.val[0] = vec_unpackh(b);
+    b_expand.val[1] = vec_unpackl(b);
+#else  // unsigned path: widen u16 -> u32 with the npyv expand helper
+    npyv_u32x2 a_expand = npyv_expand_u32_u16(a);
+    npyv_u32x2 b_expand = npyv_expand_u32_u16(b);
+#endif
+    npyv_u32 v1 = vec_div(a_expand.val[0], b_expand.val[0]);
+    npyv_u32 v2 = vec_div(a_expand.val[1], b_expand.val[1]);
+    return vec_pack(v1, v2);  // narrow the two u32 quotient vectors back to one u16 vector
+}
+
+#define vsx4_div_u32 vec_div  // 32-bit lanes: vec_div is used directly
+#define vsx4_div_u64 vec_div  // 64-bit lanes: vec_div is used directly
+
+#line 151
+/*
+ * Computes the lane-wise quotient a / b of two signed 8-bit integer vectors.
+ *
+ * As Power10 only supports integer vector division for data of 32 bits or
+ * greater, we have to convert each npyv_s8 into 4x npyv_s32, execute the integer
+ * vector division instruction, and then convert the result back to npyv_s8.
+ */
+NPY_FINLINE npyv_s8
+vsx4_div_s8(npyv_s8 a, npyv_s8 b)
+{
+#if 1  // signed path: widen s8 -> s16 -> s32 with vec_unpackh / vec_unpackl
+    npyv_s16x2 ta, tb;
+    npyv_s32x2 ahi, alo, bhi, blo;
+    ta.val[0] = vec_unpackh(a);
+    ta.val[1] = vec_unpackl(a);
+    tb.val[0] = vec_unpackh(b);
+    tb.val[1] = vec_unpackl(b);
+    ahi.val[0] = vec_unpackh(ta.val[0]);
+    ahi.val[1] = vec_unpackl(ta.val[0]);
+    alo.val[0] = vec_unpackh(ta.val[1]);
+    alo.val[1] = vec_unpackl(ta.val[1]);
+    bhi.val[0] = vec_unpackh(tb.val[0]);
+    bhi.val[1] = vec_unpackl(tb.val[0]);
+    blo.val[0] = vec_unpackh(tb.val[1]);
+    blo.val[1] = vec_unpackl(tb.val[1]);
+#else  // unsigned expansion from the shared template; compiled out in this signed instantiation
+    npyv_u16x2 a_expand = npyv_expand_u16_u8(a);
+    npyv_u16x2 b_expand = npyv_expand_u16_u8(b);
+    npyv_u32x2 ahi = npyv_expand_u32_u16(a_expand.val[0]);
+    npyv_u32x2 alo = npyv_expand_u32_u16(a_expand.val[1]);
+    npyv_u32x2 bhi = npyv_expand_u32_u16(b_expand.val[0]);
+    npyv_u32x2 blo = npyv_expand_u32_u16(b_expand.val[1]);
+#endif
+    npyv_s32 v1 = vec_div(ahi.val[0], bhi.val[0]);  // four 32-bit vector divisions cover all 8-bit lanes
+    npyv_s32 v2 = vec_div(ahi.val[1], bhi.val[1]);
+    npyv_s32 v3 = vec_div(alo.val[0], blo.val[0]);
+    npyv_s32 v4 = vec_div(alo.val[1], blo.val[1]);
+    npyv_s16 hi = vec_pack(v1, v2);  // narrow the quotients back: s32 -> s16
+    npyv_s16 lo = vec_pack(v3, v4);
+    return vec_pack(hi, lo);  // s16 -> s8
+}
+
+/*
+ * Computes division of two vectors of 16-bit signed integers by widening
+ * both operands to 32-bit lanes, dividing with vec_div, and packing the two
+ * 32-bit quotient vectors back into a single 16-bit vector.
+ */
+NPY_FINLINE npyv_s16
+vsx4_div_s16(npyv_s16 a, npyv_s16 b)
+{
+#if 1
+    npyv_s32x2 a_expand;
+    npyv_s32x2 b_expand;
+    // Widen each operand into a high and a low 32-bit half.
+    a_expand.val[0] = vec_unpackh(a);
+    a_expand.val[1] = vec_unpackl(a);
+    b_expand.val[0] = vec_unpackh(b);
+    b_expand.val[1] = vec_unpackl(b);
+#else
+    // This branch is excluded by the `#if 1` above and is never compiled.
+    npyv_u32x2 a_expand = npyv_expand_s32_s16(a);
+    npyv_u32x2 b_expand = npyv_expand_s32_s16(b);
+#endif
+    // Divide the halves independently, then narrow back to 16-bit lanes.
+    npyv_s32 v1 = vec_div(a_expand.val[0], b_expand.val[0]);
+    npyv_s32 v2 = vec_div(a_expand.val[1], b_expand.val[1]);
+    return vec_pack(v1, v2);
+}
+
+// 32-bit and 64-bit signed lanes need no widening: the division helpers are
+// plain aliases of vec_div.
+#define vsx4_div_s32 vec_div
+#define vsx4_div_s64 vec_div
+
+
+#line 221
+/*
+ * Element-wise division of two contiguous arrays of 8-bit unsigned integers.
+ *
+ * args[0] is the dividend array, args[1] the divisor array and args[2] the
+ * output array; len is the number of elements to process.
+ *
+ * Every element whose divisor is zero is written as 0 and the divide-by-zero
+ * floating-point status flag is raised. This holds for both the vector loop
+ * and the scalar tail.
+ */
+static inline void
+vsx4_simd_divide_contig_u8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 vzero = npyv_zero_u8();
+    const int vstep     = npyv_nlanes_u8;
+
+    // Vector loop: one full vector of lanes per iteration.
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) {
+        npyv_u8 a = npyv_load_u8(src1);
+        npyv_u8 b = npyv_load_u8(src2);
+        npyv_b8 bzero = npyv_cmpeq_u8(b, vzero);
+        npyv_u8 c = vsx4_div_u8(a, b);
+        // The quotient of a lane divided by zero is not defined by the vector
+        // divide instruction; force it to 0 so the result matches the scalar
+        // tail below.
+        c = npyv_select_u8(bzero, vzero, c);
+        npyv_store_u8(dst1, c);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
+            npy_set_floatstatus_divbyzero();
+        }
+    }
+
+    // Scalar tail: remaining elements that do not fill a whole vector.
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
+        const npyv_lanetype_u8 a = *src1;
+        const npyv_lanetype_u8 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+        } else{
+            *dst1 = a / b;
+        }
+    }
+    npyv_cleanup();
+}
+
+#line 221
+/*
+ * Element-wise division of two contiguous arrays of 16-bit unsigned integers.
+ *
+ * args[0] is the dividend array, args[1] the divisor array and args[2] the
+ * output array; len is the number of elements to process.
+ *
+ * Every element whose divisor is zero is written as 0 and the divide-by-zero
+ * floating-point status flag is raised. This holds for both the vector loop
+ * and the scalar tail.
+ */
+static inline void
+vsx4_simd_divide_contig_u16(char **args, npy_intp len)
+{
+    npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0];
+    npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1];
+    npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2];
+    const npyv_u16 vzero = npyv_zero_u16();
+    const int vstep      = npyv_nlanes_u16;
+
+    // Vector loop: one full vector of lanes per iteration.
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) {
+        npyv_u16 a = npyv_load_u16(src1);
+        npyv_u16 b = npyv_load_u16(src2);
+        npyv_b16 bzero = npyv_cmpeq_u16(b, vzero);
+        npyv_u16 c = vsx4_div_u16(a, b);
+        // The quotient of a lane divided by zero is not defined by the vector
+        // divide instruction; force it to 0 so the result matches the scalar
+        // tail below.
+        c = npyv_select_u16(bzero, vzero, c);
+        npyv_store_u16(dst1, c);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
+            npy_set_floatstatus_divbyzero();
+        }
+    }
+
+    // Scalar tail: remaining elements that do not fill a whole vector.
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
+        const npyv_lanetype_u16 a = *src1;
+        const npyv_lanetype_u16 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+        } else{
+            *dst1 = a / b;
+        }
+    }
+    npyv_cleanup();
+}
+
+#line 221
+/*
+ * Element-wise division of two contiguous arrays of 32-bit unsigned integers.
+ *
+ * args[0] is the dividend array, args[1] the divisor array and args[2] the
+ * output array; len is the number of elements to process.
+ *
+ * Every element whose divisor is zero is written as 0 and the divide-by-zero
+ * floating-point status flag is raised. This holds for both the vector loop
+ * and the scalar tail.
+ */
+static inline void
+vsx4_simd_divide_contig_u32(char **args, npy_intp len)
+{
+    npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0];
+    npyv_lanetype_u32 *src2 = (npyv_lanetype_u32 *) args[1];
+    npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2];
+    const npyv_u32 vzero = npyv_zero_u32();
+    const int vstep      = npyv_nlanes_u32;
+
+    // Vector loop: one full vector of lanes per iteration.
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) {
+        npyv_u32 a = npyv_load_u32(src1);
+        npyv_u32 b = npyv_load_u32(src2);
+        npyv_b32 bzero = npyv_cmpeq_u32(b, vzero);
+        npyv_u32 c = vsx4_div_u32(a, b);
+        // The quotient of a lane divided by zero is not defined by the vector
+        // divide instruction; force it to 0 so the result matches the scalar
+        // tail below.
+        c = npyv_select_u32(bzero, vzero, c);
+        npyv_store_u32(dst1, c);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
+            npy_set_floatstatus_divbyzero();
+        }
+    }
+
+    // Scalar tail: remaining elements that do not fill a whole vector.
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
+        const npyv_lanetype_u32 a = *src1;
+        const npyv_lanetype_u32 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+        } else{
+            *dst1 = a / b;
+        }
+    }
+    npyv_cleanup();
+}
+
+#line 221
+/*
+ * Element-wise division of two contiguous arrays of 64-bit unsigned integers.
+ *
+ * args[0] is the dividend array, args[1] the divisor array and args[2] the
+ * output array; len is the number of elements to process.
+ *
+ * Every element whose divisor is zero is written as 0 and the divide-by-zero
+ * floating-point status flag is raised. This holds for both the vector loop
+ * and the scalar tail.
+ */
+static inline void
+vsx4_simd_divide_contig_u64(char **args, npy_intp len)
+{
+    npyv_lanetype_u64 *src1 = (npyv_lanetype_u64 *) args[0];
+    npyv_lanetype_u64 *src2 = (npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u64 *dst1 = (npyv_lanetype_u64 *) args[2];
+    const npyv_u64 vzero = npyv_zero_u64();
+    const int vstep      = npyv_nlanes_u64;
+
+    // Vector loop: one full vector of lanes per iteration.
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) {
+        npyv_u64 a = npyv_load_u64(src1);
+        npyv_u64 b = npyv_load_u64(src2);
+        npyv_b64 bzero = npyv_cmpeq_u64(b, vzero);
+        npyv_u64 c = vsx4_div_u64(a, b);
+        // The quotient of a lane divided by zero is not defined by the vector
+        // divide instruction; force it to 0 so the result matches the scalar
+        // tail below.
+        c = npyv_select_u64(bzero, vzero, c);
+        npyv_store_u64(dst1, c);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
+            npy_set_floatstatus_divbyzero();
+        }
+    }
+
+    // Scalar tail: remaining elements that do not fill a whole vector.
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
+        const npyv_lanetype_u64 a = *src1;
+        const npyv_lanetype_u64 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+        } else{
+            *dst1 = a / b;
+        }
+    }
+    npyv_cleanup();
+}
+
+
+#line 260
+/*
+ * Floor division of two contiguous arrays of 8-bit signed integers.
+ *
+ * args[0] holds the dividends, args[1] the divisors and args[2] receives the
+ * quotients; len is the element count. A zero divisor yields 0 and raises the
+ * divide-by-zero status; NPY_MIN_INT8 / -1 yields NPY_MIN_INT8 and raises the
+ * overflow status.
+ */
+static inline void
+vsx4_simd_divide_contig_s8(char **args, npy_intp len)
+{
+    npyv_lanetype_s8 *lhs = (npyv_lanetype_s8 *) args[0];
+    npyv_lanetype_s8 *rhs = (npyv_lanetype_s8 *) args[1];
+    npyv_lanetype_s8 *out = (npyv_lanetype_s8 *) args[2];
+    const int nlanes        = npyv_nlanes_s8;
+    const npyv_s8 zeros     = npyv_zero_s8();
+    const npyv_s8 minus_one = npyv_setall_s8(-1);
+    const npyv_s8 int_min   = npyv_setall_s8(NPY_MIN_INT8);
+    // Sticky per-lane masks, turned into status flags after the vector loop.
+    npyv_b8 seen_zero     = npyv_cvt_b8_s8(npyv_zero_s8());
+    npyv_b8 seen_overflow = npyv_cvt_b8_s8(npyv_zero_s8());
+
+    while (len >= nlanes) {
+        npyv_s8 a = npyv_load_s8(lhs);
+        npyv_s8 b = npyv_load_s8(rhs);
+        npyv_s8 q = vsx4_div_s8(a, b);
+        npyv_s8 r = npyv_sub_s8(a, vec_mul(b, q));
+        // Special lanes: b == 0, or a == NPY_MIN_INT8 with b == -1.
+        npyv_b8 b_is_zero   = npyv_cmpeq_s8(b, zeros);
+        npyv_b8 is_overflow = npyv_and_s8(npyv_cmpeq_s8(b, minus_one),
+                                          npyv_cmpeq_s8(a, int_min));
+        seen_zero     = npyv_or_s8(b_is_zero, seen_zero);
+        seen_overflow = npyv_or_s8(is_overflow, seen_overflow);
+        // The truncated quotient is already the floor when
+        // (a > 0) == (b > 0) or the remainder is zero; otherwise subtract 1.
+        npyv_b8 a_pos     = npyv_cmpgt_s8(a, zeros);
+        npyv_b8 b_pos     = npyv_cmpgt_s8(b, zeros);
+        npyv_b8 same_sign = npyv_cmpeq_s8(a_pos, b_pos);
+        npyv_b8 exact     = npyv_cmpeq_s8(r, zeros);
+        npyv_b8 keep      = npyv_or_s8(same_sign, exact);
+        npyv_s8 adjust    = npyv_select_s8(keep, zeros, minus_one);
+        q = npyv_add_s8(q, adjust);
+        // Overwrite the special lanes with their fixed results.
+        q = npyv_select_s8(b_is_zero, zeros, q);
+        q = npyv_select_s8(is_overflow, int_min, q);
+        npyv_store_s8(out, q);
+
+        len -= nlanes;
+        lhs += nlanes;
+        rhs += nlanes;
+        out += nlanes;
+    }
+
+    if (!vec_all_eq(seen_zero, zeros)) {
+        npy_set_floatstatus_divbyzero();
+    }
+    if (!vec_all_eq(seen_overflow, zeros)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    while (len > 0) {
+        const npyv_lanetype_s8 a = *lhs;
+        const npyv_lanetype_s8 b = *rhs;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *out = 0;
+        } else if (NPY_UNLIKELY((a == NPY_MIN_INT8) && (b == -1))) {
+            npy_set_floatstatus_overflow();
+            *out = NPY_MIN_INT8;
+        } else {
+            npyv_lanetype_s8 q = a / b;
+            if (((a > 0) != (b > 0)) && ((q * b) != a)) {
+                q -= 1;
+            }
+            *out = q;
+        }
+        --len;
+        ++lhs;
+        ++rhs;
+        ++out;
+    }
+    npyv_cleanup();
+}
+
+#line 260
+/*
+ * Floor division of two contiguous arrays of 16-bit signed integers.
+ *
+ * args[0] holds the dividends, args[1] the divisors and args[2] receives the
+ * quotients; len is the element count. A zero divisor yields 0 and raises the
+ * divide-by-zero status; NPY_MIN_INT16 / -1 yields NPY_MIN_INT16 and raises
+ * the overflow status.
+ */
+static inline void
+vsx4_simd_divide_contig_s16(char **args, npy_intp len)
+{
+    npyv_lanetype_s16 *lhs = (npyv_lanetype_s16 *) args[0];
+    npyv_lanetype_s16 *rhs = (npyv_lanetype_s16 *) args[1];
+    npyv_lanetype_s16 *out = (npyv_lanetype_s16 *) args[2];
+    const int nlanes         = npyv_nlanes_s16;
+    const npyv_s16 zeros     = npyv_zero_s16();
+    const npyv_s16 minus_one = npyv_setall_s16(-1);
+    const npyv_s16 int_min   = npyv_setall_s16(NPY_MIN_INT16);
+    // Sticky per-lane masks, turned into status flags after the vector loop.
+    npyv_b16 seen_zero     = npyv_cvt_b16_s16(npyv_zero_s16());
+    npyv_b16 seen_overflow = npyv_cvt_b16_s16(npyv_zero_s16());
+
+    while (len >= nlanes) {
+        npyv_s16 a = npyv_load_s16(lhs);
+        npyv_s16 b = npyv_load_s16(rhs);
+        npyv_s16 q = vsx4_div_s16(a, b);
+        npyv_s16 r = npyv_sub_s16(a, vec_mul(b, q));
+        // Special lanes: b == 0, or a == NPY_MIN_INT16 with b == -1.
+        npyv_b16 b_is_zero   = npyv_cmpeq_s16(b, zeros);
+        npyv_b16 is_overflow = npyv_and_s16(npyv_cmpeq_s16(b, minus_one),
+                                            npyv_cmpeq_s16(a, int_min));
+        seen_zero     = npyv_or_s16(b_is_zero, seen_zero);
+        seen_overflow = npyv_or_s16(is_overflow, seen_overflow);
+        // The truncated quotient is already the floor when
+        // (a > 0) == (b > 0) or the remainder is zero; otherwise subtract 1.
+        npyv_b16 a_pos     = npyv_cmpgt_s16(a, zeros);
+        npyv_b16 b_pos     = npyv_cmpgt_s16(b, zeros);
+        npyv_b16 same_sign = npyv_cmpeq_s16(a_pos, b_pos);
+        npyv_b16 exact     = npyv_cmpeq_s16(r, zeros);
+        npyv_b16 keep      = npyv_or_s16(same_sign, exact);
+        npyv_s16 adjust    = npyv_select_s16(keep, zeros, minus_one);
+        q = npyv_add_s16(q, adjust);
+        // Overwrite the special lanes with their fixed results.
+        q = npyv_select_s16(b_is_zero, zeros, q);
+        q = npyv_select_s16(is_overflow, int_min, q);
+        npyv_store_s16(out, q);
+
+        len -= nlanes;
+        lhs += nlanes;
+        rhs += nlanes;
+        out += nlanes;
+    }
+
+    if (!vec_all_eq(seen_zero, zeros)) {
+        npy_set_floatstatus_divbyzero();
+    }
+    if (!vec_all_eq(seen_overflow, zeros)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    while (len > 0) {
+        const npyv_lanetype_s16 a = *lhs;
+        const npyv_lanetype_s16 b = *rhs;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *out = 0;
+        } else if (NPY_UNLIKELY((a == NPY_MIN_INT16) && (b == -1))) {
+            npy_set_floatstatus_overflow();
+            *out = NPY_MIN_INT16;
+        } else {
+            npyv_lanetype_s16 q = a / b;
+            if (((a > 0) != (b > 0)) && ((q * b) != a)) {
+                q -= 1;
+            }
+            *out = q;
+        }
+        --len;
+        ++lhs;
+        ++rhs;
+        ++out;
+    }
+    npyv_cleanup();
+}
+
+#line 260
+/*
+ * Floor division of two contiguous arrays of 32-bit signed integers.
+ *
+ * args[0] holds the dividends, args[1] the divisors and args[2] receives the
+ * quotients; len is the element count. A zero divisor yields 0 and raises the
+ * divide-by-zero status; NPY_MIN_INT32 / -1 yields NPY_MIN_INT32 and raises
+ * the overflow status.
+ */
+static inline void
+vsx4_simd_divide_contig_s32(char **args, npy_intp len)
+{
+    npyv_lanetype_s32 *lhs = (npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 *rhs = (npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_s32 *out = (npyv_lanetype_s32 *) args[2];
+    const int nlanes         = npyv_nlanes_s32;
+    const npyv_s32 zeros     = npyv_zero_s32();
+    const npyv_s32 minus_one = npyv_setall_s32(-1);
+    const npyv_s32 int_min   = npyv_setall_s32(NPY_MIN_INT32);
+    // Sticky per-lane masks, turned into status flags after the vector loop.
+    npyv_b32 seen_zero     = npyv_cvt_b32_s32(npyv_zero_s32());
+    npyv_b32 seen_overflow = npyv_cvt_b32_s32(npyv_zero_s32());
+
+    while (len >= nlanes) {
+        npyv_s32 a = npyv_load_s32(lhs);
+        npyv_s32 b = npyv_load_s32(rhs);
+        npyv_s32 q = vsx4_div_s32(a, b);
+        npyv_s32 r = npyv_sub_s32(a, vec_mul(b, q));
+        // Special lanes: b == 0, or a == NPY_MIN_INT32 with b == -1.
+        npyv_b32 b_is_zero   = npyv_cmpeq_s32(b, zeros);
+        npyv_b32 is_overflow = npyv_and_s32(npyv_cmpeq_s32(b, minus_one),
+                                            npyv_cmpeq_s32(a, int_min));
+        seen_zero     = npyv_or_s32(b_is_zero, seen_zero);
+        seen_overflow = npyv_or_s32(is_overflow, seen_overflow);
+        // The truncated quotient is already the floor when
+        // (a > 0) == (b > 0) or the remainder is zero; otherwise subtract 1.
+        npyv_b32 a_pos     = npyv_cmpgt_s32(a, zeros);
+        npyv_b32 b_pos     = npyv_cmpgt_s32(b, zeros);
+        npyv_b32 same_sign = npyv_cmpeq_s32(a_pos, b_pos);
+        npyv_b32 exact     = npyv_cmpeq_s32(r, zeros);
+        npyv_b32 keep      = npyv_or_s32(same_sign, exact);
+        npyv_s32 adjust    = npyv_select_s32(keep, zeros, minus_one);
+        q = npyv_add_s32(q, adjust);
+        // Overwrite the special lanes with their fixed results.
+        q = npyv_select_s32(b_is_zero, zeros, q);
+        q = npyv_select_s32(is_overflow, int_min, q);
+        npyv_store_s32(out, q);
+
+        len -= nlanes;
+        lhs += nlanes;
+        rhs += nlanes;
+        out += nlanes;
+    }
+
+    if (!vec_all_eq(seen_zero, zeros)) {
+        npy_set_floatstatus_divbyzero();
+    }
+    if (!vec_all_eq(seen_overflow, zeros)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    while (len > 0) {
+        const npyv_lanetype_s32 a = *lhs;
+        const npyv_lanetype_s32 b = *rhs;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *out = 0;
+        } else if (NPY_UNLIKELY((a == NPY_MIN_INT32) && (b == -1))) {
+            npy_set_floatstatus_overflow();
+            *out = NPY_MIN_INT32;
+        } else {
+            npyv_lanetype_s32 q = a / b;
+            if (((a > 0) != (b > 0)) && ((q * b) != a)) {
+                q -= 1;
+            }
+            *out = q;
+        }
+        --len;
+        ++lhs;
+        ++rhs;
+        ++out;
+    }
+    npyv_cleanup();
+}
+
+#line 260
+/*
+ * Floor division of two contiguous arrays of 64-bit signed integers.
+ *
+ * args[0] holds the dividends, args[1] the divisors and args[2] receives the
+ * quotients; len is the element count. A zero divisor yields 0 and raises the
+ * divide-by-zero status; NPY_MIN_INT64 / -1 yields NPY_MIN_INT64 and raises
+ * the overflow status.
+ */
+static inline void
+vsx4_simd_divide_contig_s64(char **args, npy_intp len)
+{
+    npyv_lanetype_s64 *lhs = (npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 *rhs = (npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_s64 *out = (npyv_lanetype_s64 *) args[2];
+    const int nlanes         = npyv_nlanes_s64;
+    const npyv_s64 zeros     = npyv_zero_s64();
+    const npyv_s64 minus_one = npyv_setall_s64(-1);
+    const npyv_s64 int_min   = npyv_setall_s64(NPY_MIN_INT64);
+    // Sticky per-lane masks, turned into status flags after the vector loop.
+    npyv_b64 seen_zero     = npyv_cvt_b64_s64(npyv_zero_s64());
+    npyv_b64 seen_overflow = npyv_cvt_b64_s64(npyv_zero_s64());
+
+    while (len >= nlanes) {
+        npyv_s64 a = npyv_load_s64(lhs);
+        npyv_s64 b = npyv_load_s64(rhs);
+        npyv_s64 q = vsx4_div_s64(a, b);
+        npyv_s64 r = npyv_sub_s64(a, vec_mul(b, q));
+        // Special lanes: b == 0, or a == NPY_MIN_INT64 with b == -1.
+        npyv_b64 b_is_zero   = npyv_cmpeq_s64(b, zeros);
+        npyv_b64 is_overflow = npyv_and_s64(npyv_cmpeq_s64(b, minus_one),
+                                            npyv_cmpeq_s64(a, int_min));
+        seen_zero     = npyv_or_s64(b_is_zero, seen_zero);
+        seen_overflow = npyv_or_s64(is_overflow, seen_overflow);
+        // The truncated quotient is already the floor when
+        // (a > 0) == (b > 0) or the remainder is zero; otherwise subtract 1.
+        npyv_b64 a_pos     = npyv_cmpgt_s64(a, zeros);
+        npyv_b64 b_pos     = npyv_cmpgt_s64(b, zeros);
+        npyv_b64 same_sign = npyv_cmpeq_s64(a_pos, b_pos);
+        npyv_b64 exact     = npyv_cmpeq_s64(r, zeros);
+        npyv_b64 keep      = npyv_or_s64(same_sign, exact);
+        npyv_s64 adjust    = npyv_select_s64(keep, zeros, minus_one);
+        q = npyv_add_s64(q, adjust);
+        // Overwrite the special lanes with their fixed results.
+        q = npyv_select_s64(b_is_zero, zeros, q);
+        q = npyv_select_s64(is_overflow, int_min, q);
+        npyv_store_s64(out, q);
+
+        len -= nlanes;
+        lhs += nlanes;
+        rhs += nlanes;
+        out += nlanes;
+    }
+
+    if (!vec_all_eq(seen_zero, zeros)) {
+        npy_set_floatstatus_divbyzero();
+    }
+    if (!vec_all_eq(seen_overflow, zeros)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    while (len > 0) {
+        const npyv_lanetype_s64 a = *lhs;
+        const npyv_lanetype_s64 b = *rhs;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *out = 0;
+        } else if (NPY_UNLIKELY((a == NPY_MIN_INT64) && (b == -1))) {
+            npy_set_floatstatus_overflow();
+            *out = NPY_MIN_INT64;
+        } else {
+            npyv_lanetype_s64 q = a / b;
+            if (((a > 0) != (b > 0)) && ((q * b) != a)) {
+                q -= 1;
+            }
+            *out = q;
+        }
+        --len;
+        ++lhs;
+        ++rhs;
+        ++out;
+    }
+    npyv_cleanup();
+}
+
+#endif // NPY_HAVE_VSX4
+#endif // NPY_SIMD
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+
+// Pick the signed SIMD suffix that matches the bit width of npy_byte, so that
+// TO_SIMD_SFX(name) expands to name##_s8 / _s16 / _s32 / _s64.
+#line 340
+#undef TO_SIMD_SFX
+#if 0
+#line 345
+#elif NPY_BITSOF_BYTE == 8
+    #define TO_SIMD_SFX(X) X##_s8
+
+#line 345
+#elif NPY_BITSOF_BYTE == 16
+    #define TO_SIMD_SFX(X) X##_s16
+
+#line 345
+#elif NPY_BITSOF_BYTE == 32
+    #define TO_SIMD_SFX(X) X##_s32
+
+#line 345
+#elif NPY_BITSOF_BYTE == 64
+    #define TO_SIMD_SFX(X) X##_s64
+
+#endif
+// Leave TO_SIMD_SFX undefined for 64-bit types when SIMD_DISABLE_DIV64_OPT is
+// set; the SIMD branches below are guarded by defined(TO_SIMD_SFX).
+#if NPY_BITSOF_BYTE == 64 && defined(SIMD_DISABLE_DIV64_OPT)
+    #undef TO_SIMD_SFX
+#endif
+
+/*
+ * Floor division of two npy_byte values.
+ *
+ * Returns 0 and raises the divide-by-zero status when d == 0; returns
+ * NPY_MIN_BYTE and raises the overflow status for NPY_MIN_BYTE / -1.
+ */
+NPY_FINLINE npy_byte floor_div_BYTE(const npy_byte n, const npy_byte d)
+{
+    /*
+     * FIXME: On x86 at least, dividing the smallest representable integer
+     * by -1 causes a SIGFPE (division overflow). We treat this case here
+     * (to avoid a SIGFPE crash at python level), but a good solution would
+     * be to treat integer division problems separately from FPU exceptions
+     * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+     */
+    if (NPY_UNLIKELY(d == 0)) {
+        npy_set_floatstatus_divbyzero();
+        return 0;
+    }
+    if (NPY_UNLIKELY(n == NPY_MIN_BYTE && d == -1)) {
+        npy_set_floatstatus_overflow();
+        return NPY_MIN_BYTE;
+    }
+    npy_byte quot = n / d;
+    // C division truncates toward zero; step down by one when the operands
+    // differ in (n > 0) / (d > 0) and the division was inexact.
+    const int signs_differ = (n > 0) != (d > 0);
+    if (signs_differ && (quot * d) != n) {
+        quot--;
+    }
+    return quot;
+}
+
+/*
+ * Inner loop for floor division of npy_byte operands.
+ *
+ * Chooses between a reduction loop, the SIMD kernels (only when compiled in
+ * and the layout checks pass) and a generic element-by-element loop.
+ * The locals io1, ip1, ip2, iop1 and op1 are provided by the loop macros.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_divide)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    // Reduction: repeatedly divide the running value io1 by each input.
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(npy_byte) {
+            io1 = floor_div_BYTE(io1, *(npy_byte*)ip2);
+        }
+        *((npy_byte *)iop1) = io1;
+    }
+#if NPY_SIMD && defined(TO_SIMD_SFX)
+#if defined(NPY_HAVE_VSX4)
+    // both arguments are arrays of the same size
+    else if (IS_BLOCKABLE_BINARY(sizeof(npy_byte), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
+    }
+#endif
+    // for contiguous block of memory, divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_byte), NPY_SIMD_WIDTH) &&
+             (*(npy_byte *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    // Generic fallback: one floor_div_BYTE call per element.
+    else {
+        BINARY_LOOP {
+            *((npy_byte *)op1) = floor_div_BYTE(*(npy_byte*)ip1, *(npy_byte*)ip2);
+        }
+    }
+}
+
+/*
+ * Indexed in-place floor division for npy_byte.
+ *
+ * For each of dimensions[0] entries, reads an npy_intp index from args[1] and
+ * a divisor from args[2], then divides the element of args[0] at that index
+ * in place. Negative indices are offset by steps[3]. Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *base = args[0];
+    char *index_cursor = args[1];
+    char *value_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp index_stride = steps[1];
+    const npy_intp value_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    npy_intp remaining = dimensions[0];
+
+    while (remaining-- > 0) {
+        npy_intp pos = *(npy_intp *)index_cursor;
+        pos = (pos < 0) ? pos + wrap : pos;
+        npy_byte *slot = (npy_byte *)(base + elem_stride * pos);
+        *slot = floor_div_BYTE(*slot, *(npy_byte *)value_cursor);
+        index_cursor += index_stride;
+        value_cursor += value_stride;
+    }
+    return 0;
+}
+
+
+// Pick the signed SIMD suffix that matches the bit width of npy_short, so
+// that TO_SIMD_SFX(name) expands to name##_s8 / _s16 / _s32 / _s64.
+#line 340
+#undef TO_SIMD_SFX
+#if 0
+#line 345
+#elif NPY_BITSOF_SHORT == 8
+    #define TO_SIMD_SFX(X) X##_s8
+
+#line 345
+#elif NPY_BITSOF_SHORT == 16
+    #define TO_SIMD_SFX(X) X##_s16
+
+#line 345
+#elif NPY_BITSOF_SHORT == 32
+    #define TO_SIMD_SFX(X) X##_s32
+
+#line 345
+#elif NPY_BITSOF_SHORT == 64
+    #define TO_SIMD_SFX(X) X##_s64
+
+#endif
+// Leave TO_SIMD_SFX undefined for 64-bit types when SIMD_DISABLE_DIV64_OPT is
+// set; the SIMD branches below are guarded by defined(TO_SIMD_SFX).
+#if NPY_BITSOF_SHORT == 64 && defined(SIMD_DISABLE_DIV64_OPT)
+    #undef TO_SIMD_SFX
+#endif
+
+/*
+ * Floor division of two npy_short values.
+ *
+ * Returns 0 and raises the divide-by-zero status when d == 0; returns
+ * NPY_MIN_SHORT and raises the overflow status for NPY_MIN_SHORT / -1.
+ */
+NPY_FINLINE npy_short floor_div_SHORT(const npy_short n, const npy_short d)
+{
+    /*
+     * FIXME: On x86 at least, dividing the smallest representable integer
+     * by -1 causes a SIGFPE (division overflow). We treat this case here
+     * (to avoid a SIGFPE crash at python level), but a good solution would
+     * be to treat integer division problems separately from FPU exceptions
+     * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+     */
+    if (NPY_UNLIKELY(d == 0)) {
+        npy_set_floatstatus_divbyzero();
+        return 0;
+    }
+    if (NPY_UNLIKELY(n == NPY_MIN_SHORT && d == -1)) {
+        npy_set_floatstatus_overflow();
+        return NPY_MIN_SHORT;
+    }
+    npy_short quot = n / d;
+    // C division truncates toward zero; step down by one when the operands
+    // differ in (n > 0) / (d > 0) and the division was inexact.
+    const int signs_differ = (n > 0) != (d > 0);
+    if (signs_differ && (quot * d) != n) {
+        quot--;
+    }
+    return quot;
+}
+
+/*
+ * Inner loop for floor division of npy_short operands.
+ *
+ * Chooses between a reduction loop, the SIMD kernels (only when compiled in
+ * and the layout checks pass) and a generic element-by-element loop.
+ * The locals io1, ip1, ip2, iop1 and op1 are provided by the loop macros.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_divide)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    // Reduction: repeatedly divide the running value io1 by each input.
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(npy_short) {
+            io1 = floor_div_SHORT(io1, *(npy_short*)ip2);
+        }
+        *((npy_short *)iop1) = io1;
+    }
+#if NPY_SIMD && defined(TO_SIMD_SFX)
+#if defined(NPY_HAVE_VSX4)
+    // both arguments are arrays of the same size
+    else if (IS_BLOCKABLE_BINARY(sizeof(npy_short), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
+    }
+#endif
+    // for contiguous block of memory, divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_short), NPY_SIMD_WIDTH) &&
+             (*(npy_short *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    // Generic fallback: one floor_div_SHORT call per element.
+    else {
+        BINARY_LOOP {
+            *((npy_short *)op1) = floor_div_SHORT(*(npy_short*)ip1, *(npy_short*)ip2);
+        }
+    }
+}
+
+/*
+ * Indexed in-place floor division for npy_short.
+ *
+ * For each of dimensions[0] entries, reads an npy_intp index from args[1] and
+ * a divisor from args[2], then divides the element of args[0] at that index
+ * in place. Negative indices are offset by steps[3]. Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *base = args[0];
+    char *index_cursor = args[1];
+    char *value_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp index_stride = steps[1];
+    const npy_intp value_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    npy_intp remaining = dimensions[0];
+
+    while (remaining-- > 0) {
+        npy_intp pos = *(npy_intp *)index_cursor;
+        pos = (pos < 0) ? pos + wrap : pos;
+        npy_short *slot = (npy_short *)(base + elem_stride * pos);
+        *slot = floor_div_SHORT(*slot, *(npy_short *)value_cursor);
+        index_cursor += index_stride;
+        value_cursor += value_stride;
+    }
+    return 0;
+}
+
+
+// Pick the signed SIMD suffix that matches the bit width of npy_int, so that
+// TO_SIMD_SFX(name) expands to name##_s8 / _s16 / _s32 / _s64.
+#line 340
+#undef TO_SIMD_SFX
+#if 0
+#line 345
+#elif NPY_BITSOF_INT == 8
+    #define TO_SIMD_SFX(X) X##_s8
+
+#line 345
+#elif NPY_BITSOF_INT == 16
+    #define TO_SIMD_SFX(X) X##_s16
+
+#line 345
+#elif NPY_BITSOF_INT == 32
+    #define TO_SIMD_SFX(X) X##_s32
+
+#line 345
+#elif NPY_BITSOF_INT == 64
+    #define TO_SIMD_SFX(X) X##_s64
+
+#endif
+// Leave TO_SIMD_SFX undefined for 64-bit types when SIMD_DISABLE_DIV64_OPT is
+// set; the SIMD branches below are guarded by defined(TO_SIMD_SFX).
+#if NPY_BITSOF_INT == 64 && defined(SIMD_DISABLE_DIV64_OPT)
+    #undef TO_SIMD_SFX
+#endif
+
+/*
+ * Floor division of two npy_int values.
+ *
+ * Returns 0 and raises the divide-by-zero status when d == 0; returns
+ * NPY_MIN_INT and raises the overflow status for NPY_MIN_INT / -1.
+ */
+NPY_FINLINE npy_int floor_div_INT(const npy_int n, const npy_int d)
+{
+    /*
+     * FIXME: On x86 at least, dividing the smallest representable integer
+     * by -1 causes a SIGFPE (division overflow). We treat this case here
+     * (to avoid a SIGFPE crash at python level), but a good solution would
+     * be to treat integer division problems separately from FPU exceptions
+     * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+     */
+    if (NPY_UNLIKELY(d == 0)) {
+        npy_set_floatstatus_divbyzero();
+        return 0;
+    }
+    if (NPY_UNLIKELY(n == NPY_MIN_INT && d == -1)) {
+        npy_set_floatstatus_overflow();
+        return NPY_MIN_INT;
+    }
+    npy_int quot = n / d;
+    // C division truncates toward zero; step down by one when the operands
+    // differ in (n > 0) / (d > 0) and the division was inexact.
+    const int signs_differ = (n > 0) != (d > 0);
+    if (signs_differ && (quot * d) != n) {
+        quot--;
+    }
+    return quot;
+}
+
+/*
+ * Inner loop for floor division of npy_int operands.
+ *
+ * Chooses between a reduction loop, the SIMD kernels (only when compiled in
+ * and the layout checks pass) and a generic element-by-element loop.
+ * The locals io1, ip1, ip2, iop1 and op1 are provided by the loop macros.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_divide)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    // Reduction: repeatedly divide the running value io1 by each input.
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(npy_int) {
+            io1 = floor_div_INT(io1, *(npy_int*)ip2);
+        }
+        *((npy_int *)iop1) = io1;
+    }
+#if NPY_SIMD && defined(TO_SIMD_SFX)
+#if defined(NPY_HAVE_VSX4)
+    // both arguments are arrays of the same size
+    else if (IS_BLOCKABLE_BINARY(sizeof(npy_int), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
+    }
+#endif
+    // for contiguous block of memory, divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_int), NPY_SIMD_WIDTH) &&
+             (*(npy_int *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    // Generic fallback: one floor_div_INT call per element.
+    else {
+        BINARY_LOOP {
+            *((npy_int *)op1) = floor_div_INT(*(npy_int*)ip1, *(npy_int*)ip2);
+        }
+    }
+}
+
+/*
+ * Indexed in-place floor division for npy_int.
+ *
+ * For each of dimensions[0] entries, reads an npy_intp index from args[1] and
+ * a divisor from args[2], then divides the element of args[0] at that index
+ * in place. Negative indices are offset by steps[3]. Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *base = args[0];
+    char *index_cursor = args[1];
+    char *value_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp index_stride = steps[1];
+    const npy_intp value_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    npy_intp remaining = dimensions[0];
+
+    while (remaining-- > 0) {
+        npy_intp pos = *(npy_intp *)index_cursor;
+        pos = (pos < 0) ? pos + wrap : pos;
+        npy_int *slot = (npy_int *)(base + elem_stride * pos);
+        *slot = floor_div_INT(*slot, *(npy_int *)value_cursor);
+        index_cursor += index_stride;
+        value_cursor += value_stride;
+    }
+    return 0;
+}
+
+
+// Pick the signed SIMD suffix that matches the bit width of npy_long, so that
+// TO_SIMD_SFX(name) expands to name##_s8 / _s16 / _s32 / _s64.
+#line 340
+#undef TO_SIMD_SFX
+#if 0
+#line 345
+#elif NPY_BITSOF_LONG == 8
+    #define TO_SIMD_SFX(X) X##_s8
+
+#line 345
+#elif NPY_BITSOF_LONG == 16
+    #define TO_SIMD_SFX(X) X##_s16
+
+#line 345
+#elif NPY_BITSOF_LONG == 32
+    #define TO_SIMD_SFX(X) X##_s32
+
+#line 345
+#elif NPY_BITSOF_LONG == 64
+    #define TO_SIMD_SFX(X) X##_s64
+
+#endif
+// Leave TO_SIMD_SFX undefined for 64-bit types when SIMD_DISABLE_DIV64_OPT is
+// set; the SIMD branches below are guarded by defined(TO_SIMD_SFX).
+#if NPY_BITSOF_LONG == 64 && defined(SIMD_DISABLE_DIV64_OPT)
+    #undef TO_SIMD_SFX
+#endif
+
+/*
+ * Floor division of two npy_long values.
+ *
+ * Returns 0 and raises the divide-by-zero status when d == 0; returns
+ * NPY_MIN_LONG and raises the overflow status for NPY_MIN_LONG / -1.
+ */
+NPY_FINLINE npy_long floor_div_LONG(const npy_long n, const npy_long d)
+{
+    /*
+     * FIXME: On x86 at least, dividing the smallest representable integer
+     * by -1 causes a SIGFPE (division overflow). We treat this case here
+     * (to avoid a SIGFPE crash at python level), but a good solution would
+     * be to treat integer division problems separately from FPU exceptions
+     * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+     */
+    if (NPY_UNLIKELY(d == 0)) {
+        npy_set_floatstatus_divbyzero();
+        return 0;
+    }
+    if (NPY_UNLIKELY(n == NPY_MIN_LONG && d == -1)) {
+        npy_set_floatstatus_overflow();
+        return NPY_MIN_LONG;
+    }
+    npy_long quot = n / d;
+    // C division truncates toward zero; step down by one when the operands
+    // differ in (n > 0) / (d > 0) and the division was inexact.
+    const int signs_differ = (n > 0) != (d > 0);
+    if (signs_differ && (quot * d) != n) {
+        quot--;
+    }
+    return quot;
+}
+
+/*
+ * Inner loop for floor division of npy_long operands.
+ *
+ * Chooses between a reduction loop, the SIMD kernels (only when compiled in
+ * and the layout checks pass) and a generic element-by-element loop.
+ * The locals io1, ip1, ip2, iop1 and op1 are provided by the loop macros.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_divide)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    // Reduction: repeatedly divide the running value io1 by each input.
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(npy_long) {
+            io1 = floor_div_LONG(io1, *(npy_long*)ip2);
+        }
+        *((npy_long *)iop1) = io1;
+    }
+#if NPY_SIMD && defined(TO_SIMD_SFX)
+#if defined(NPY_HAVE_VSX4)
+    // both arguments are arrays of the same size
+    else if (IS_BLOCKABLE_BINARY(sizeof(npy_long), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
+    }
+#endif
+    // for contiguous block of memory, divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_long), NPY_SIMD_WIDTH) &&
+             (*(npy_long *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    // Generic fallback: one floor_div_LONG call per element.
+    else {
+        BINARY_LOOP {
+            *((npy_long *)op1) = floor_div_LONG(*(npy_long*)ip1, *(npy_long*)ip2);
+        }
+    }
+}
+
+/*
+ * Indexed in-place floor division for npy_long.
+ *
+ * For each of dimensions[0] entries, reads an npy_intp index from args[1] and
+ * a divisor from args[2], then divides the element of args[0] at that index
+ * in place. Negative indices are offset by steps[3]. Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *base = args[0];
+    char *index_cursor = args[1];
+    char *value_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp index_stride = steps[1];
+    const npy_intp value_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    npy_intp remaining = dimensions[0];
+
+    while (remaining-- > 0) {
+        npy_intp pos = *(npy_intp *)index_cursor;
+        pos = (pos < 0) ? pos + wrap : pos;
+        npy_long *slot = (npy_long *)(base + elem_stride * pos);
+        *slot = floor_div_LONG(*slot, *(npy_long *)value_cursor);
+        index_cursor += index_stride;
+        value_cursor += value_stride;
+    }
+    return 0;
+}
+
+
+// Pick the signed SIMD suffix that matches the bit width of npy_longlong, so
+// that TO_SIMD_SFX(name) expands to name##_s8 / _s16 / _s32 / _s64.
+#line 340
+#undef TO_SIMD_SFX
+#if 0
+#line 345
+#elif NPY_BITSOF_LONGLONG == 8
+    #define TO_SIMD_SFX(X) X##_s8
+
+#line 345
+#elif NPY_BITSOF_LONGLONG == 16
+    #define TO_SIMD_SFX(X) X##_s16
+
+#line 345
+#elif NPY_BITSOF_LONGLONG == 32
+    #define TO_SIMD_SFX(X) X##_s32
+
+#line 345
+#elif NPY_BITSOF_LONGLONG == 64
+    #define TO_SIMD_SFX(X) X##_s64
+
+#endif
+// Leave TO_SIMD_SFX undefined for 64-bit types when SIMD_DISABLE_DIV64_OPT is
+// set; the SIMD branches below are guarded by defined(TO_SIMD_SFX).
+#if NPY_BITSOF_LONGLONG == 64 && defined(SIMD_DISABLE_DIV64_OPT)
+    #undef TO_SIMD_SFX
+#endif
+
+/*
+ * Floor division of two npy_longlong values.
+ *
+ * Returns 0 and raises the divide-by-zero status when d == 0; returns
+ * NPY_MIN_LONGLONG and raises the overflow status for NPY_MIN_LONGLONG / -1.
+ */
+NPY_FINLINE npy_longlong floor_div_LONGLONG(const npy_longlong n, const npy_longlong d)
+{
+    /*
+     * FIXME: On x86 at least, dividing the smallest representable integer
+     * by -1 causes a SIGFPE (division overflow). We treat this case here
+     * (to avoid a SIGFPE crash at python level), but a good solution would
+     * be to treat integer division problems separately from FPU exceptions
+     * (i.e. a different approach than npy_set_floatstatus_divbyzero()).
+     */
+    if (NPY_UNLIKELY(d == 0)) {
+        npy_set_floatstatus_divbyzero();
+        return 0;
+    }
+    if (NPY_UNLIKELY(n == NPY_MIN_LONGLONG && d == -1)) {
+        npy_set_floatstatus_overflow();
+        return NPY_MIN_LONGLONG;
+    }
+    npy_longlong quot = n / d;
+    // C division truncates toward zero; step down by one when the operands
+    // differ in (n > 0) / (d > 0) and the division was inexact.
+    const int signs_differ = (n > 0) != (d > 0);
+    if (signs_differ && (quot * d) != n) {
+        quot--;
+    }
+    return quot;
+}
+
+/*
+ * Inner loop for floor division of npy_longlong operands.
+ *
+ * Chooses between a reduction loop, the SIMD kernels (only when compiled in
+ * and the layout checks pass) and a generic element-by-element loop.
+ * The locals io1, ip1, ip2, iop1 and op1 are provided by the loop macros.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_divide)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    // Reduction: repeatedly divide the running value io1 by each input.
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP(npy_longlong) {
+            io1 = floor_div_LONGLONG(io1, *(npy_longlong*)ip2);
+        }
+        *((npy_longlong *)iop1) = io1;
+    }
+#if NPY_SIMD && defined(TO_SIMD_SFX)
+#if defined(NPY_HAVE_VSX4)
+    // both arguments are arrays of the same size
+    else if (IS_BLOCKABLE_BINARY(sizeof(npy_longlong), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
+    }
+#endif
+    // for contiguous block of memory, divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_longlong), NPY_SIMD_WIDTH) &&
+             (*(npy_longlong *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    // Generic fallback: one floor_div_LONGLONG call per element.
+    else {
+        BINARY_LOOP {
+            *((npy_longlong *)op1) = floor_div_LONGLONG(*(npy_longlong*)ip1, *(npy_longlong*)ip2);
+        }
+    }
+}
+
+/*
+ * Indexed in-place floor division for npy_longlong.
+ *
+ * For each of dimensions[0] entries, reads an npy_intp index from args[1] and
+ * a divisor from args[2], then divides the element of args[0] at that index
+ * in place. Negative indices are offset by steps[3]. Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *base = args[0];
+    char *index_cursor = args[1];
+    char *value_cursor = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp index_stride = steps[1];
+    const npy_intp value_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    npy_intp remaining = dimensions[0];
+
+    while (remaining-- > 0) {
+        npy_intp pos = *(npy_intp *)index_cursor;
+        pos = (pos < 0) ? pos + wrap : pos;
+        npy_longlong *slot = (npy_longlong *)(base + elem_stride * pos);
+        *slot = floor_div_LONGLONG(*slot, *(npy_longlong *)value_cursor);
+        index_cursor += index_stride;
+        value_cursor += value_stride;
+    }
+    return 0;
+}
+
+
+
+#line 439
+#undef TO_SIMD_SFX
+#if 0  // always false; lets every width case below be written as an #elif
+#line 444
+#elif NPY_BITSOF_BYTE == 8
+    #define TO_SIMD_SFX(X) X##_u8
+
+#line 444
+#elif NPY_BITSOF_BYTE == 16
+    #define TO_SIMD_SFX(X) X##_u16
+
+#line 444
+#elif NPY_BITSOF_BYTE == 32
+    #define TO_SIMD_SFX(X) X##_u32
+
+#line 444
+#elif NPY_BITSOF_BYTE == 64
+    #define TO_SIMD_SFX(X) X##_u64
+
+#endif
+/*
+ * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV falls back to scalar division
+ * because emulating multiply-high on these architectures would be expensive compared
+ * to the native scalar dividers.
+ * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
+ * Power10 (VSX4) is an exception here since it has native support for integer vector division.
+ */
+#if NPY_BITSOF_BYTE == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+    #undef TO_SIMD_SFX
+#endif
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_divide)  // Integer division loop for npy_ubyte operands.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction: repeatedly divide the accumulator io1 by each ip2 value
+        BINARY_REDUCE_LOOP(npy_ubyte) {
+            const npy_ubyte d = *(npy_ubyte *)ip2;
+            if (NPY_UNLIKELY(d == 0)) {  // zero divisor: set the divide-by-zero flag; accumulator becomes 0
+                npy_set_floatstatus_divbyzero();
+                io1 = 0;
+            } else {
+                io1 /= d;
+            }
+        }
+        *((npy_ubyte *)iop1) = io1;  // store the final accumulator value
+    }
+#if NPY_SIMD && defined(TO_SIMD_SFX)  // SIMD branches compile only when TO_SIMD_SFX is still defined
+#if defined(NPY_HAVE_VSX4)
+    // both arguments are arrays of the same size
+    else if (IS_BLOCKABLE_BINARY(sizeof(npy_ubyte), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
+    }
+#endif
+    // for a contiguous block of memory where the divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ubyte), NPY_SIMD_WIDTH) &&
+             (*(npy_ubyte *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    else {  // element-by-element fallback; also taken when the scalar divisor is 0
+        BINARY_LOOP {
+            const npy_ubyte in1 = *(npy_ubyte *)ip1;
+            const npy_ubyte in2 = *(npy_ubyte *)ip2;
+            if (NPY_UNLIKELY(in2 == 0)) {  // zero divisor: set the divide-by-zero flag and store 0
+                npy_set_floatstatus_divbyzero();
+                *((npy_ubyte *)op1) = 0;
+            } else{
+                *((npy_ubyte *)op1) = in1 / in2;
+            }
+        }
+    }
+}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    // Indexed in-place division: target[index[k]] /= value[k] for k in [0, dimensions[0]).
+    char *target = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp target_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp extent = steps[3];  // added to negative indices to wrap them
+    const npy_intp count = dimensions[0];
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_ubyte *slot = (npy_ubyte *)(target + target_stride * pos);
+        const npy_ubyte divisor = *(const npy_ubyte *)val_ptr;
+        if (NPY_UNLIKELY(divisor == 0)) {
+            // Division by zero: raise the divide-by-zero status flag and store 0.
+            npy_set_floatstatus_divbyzero();
+            *slot = 0;
+        } else {
+            *slot /= divisor;
+        }
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+
+#line 439
+#undef TO_SIMD_SFX
+#if 0  // always false; lets every width case below be written as an #elif
+#line 444
+#elif NPY_BITSOF_SHORT == 8
+    #define TO_SIMD_SFX(X) X##_u8
+
+#line 444
+#elif NPY_BITSOF_SHORT == 16
+    #define TO_SIMD_SFX(X) X##_u16
+
+#line 444
+#elif NPY_BITSOF_SHORT == 32
+    #define TO_SIMD_SFX(X) X##_u32
+
+#line 444
+#elif NPY_BITSOF_SHORT == 64
+    #define TO_SIMD_SFX(X) X##_u64
+
+#endif
+/*
+ * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV falls back to scalar division
+ * because emulating multiply-high on these architectures would be expensive compared
+ * to the native scalar dividers.
+ * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
+ * Power10 (VSX4) is an exception here since it has native support for integer vector division.
+ */
+#if NPY_BITSOF_SHORT == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+    #undef TO_SIMD_SFX
+#endif
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_divide)  // Integer division loop for npy_ushort operands.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction: repeatedly divide the accumulator io1 by each ip2 value
+        BINARY_REDUCE_LOOP(npy_ushort) {
+            const npy_ushort d = *(npy_ushort *)ip2;
+            if (NPY_UNLIKELY(d == 0)) {  // zero divisor: set the divide-by-zero flag; accumulator becomes 0
+                npy_set_floatstatus_divbyzero();
+                io1 = 0;
+            } else {
+                io1 /= d;
+            }
+        }
+        *((npy_ushort *)iop1) = io1;  // store the final accumulator value
+    }
+#if NPY_SIMD && defined(TO_SIMD_SFX)  // SIMD branches compile only when TO_SIMD_SFX is still defined
+#if defined(NPY_HAVE_VSX4)
+    // both arguments are arrays of the same size
+    else if (IS_BLOCKABLE_BINARY(sizeof(npy_ushort), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
+    }
+#endif
+    // for a contiguous block of memory where the divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ushort), NPY_SIMD_WIDTH) &&
+             (*(npy_ushort *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    else {  // element-by-element fallback; also taken when the scalar divisor is 0
+        BINARY_LOOP {
+            const npy_ushort in1 = *(npy_ushort *)ip1;
+            const npy_ushort in2 = *(npy_ushort *)ip2;
+            if (NPY_UNLIKELY(in2 == 0)) {  // zero divisor: set the divide-by-zero flag and store 0
+                npy_set_floatstatus_divbyzero();
+                *((npy_ushort *)op1) = 0;
+            } else{
+                *((npy_ushort *)op1) = in1 / in2;
+            }
+        }
+    }
+}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    // Indexed in-place division: target[index[k]] /= value[k] for k in [0, dimensions[0]).
+    char *target = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp target_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp extent = steps[3];  // added to negative indices to wrap them
+    const npy_intp count = dimensions[0];
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_ushort *slot = (npy_ushort *)(target + target_stride * pos);
+        const npy_ushort divisor = *(const npy_ushort *)val_ptr;
+        if (NPY_UNLIKELY(divisor == 0)) {
+            // Division by zero: raise the divide-by-zero status flag and store 0.
+            npy_set_floatstatus_divbyzero();
+            *slot = 0;
+        } else {
+            *slot /= divisor;
+        }
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+
+#line 439
+#undef TO_SIMD_SFX
+#if 0  // always false; lets every width case below be written as an #elif
+#line 444
+#elif NPY_BITSOF_INT == 8
+    #define TO_SIMD_SFX(X) X##_u8
+
+#line 444
+#elif NPY_BITSOF_INT == 16
+    #define TO_SIMD_SFX(X) X##_u16
+
+#line 444
+#elif NPY_BITSOF_INT == 32
+    #define TO_SIMD_SFX(X) X##_u32
+
+#line 444
+#elif NPY_BITSOF_INT == 64
+    #define TO_SIMD_SFX(X) X##_u64
+
+#endif
+/*
+ * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV falls back to scalar division
+ * because emulating multiply-high on these architectures would be expensive compared
+ * to the native scalar dividers.
+ * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
+ * Power10 (VSX4) is an exception here since it has native support for integer vector division.
+ */
+#if NPY_BITSOF_INT == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+    #undef TO_SIMD_SFX
+#endif
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_divide)  // Integer division loop for npy_uint operands.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction: repeatedly divide the accumulator io1 by each ip2 value
+        BINARY_REDUCE_LOOP(npy_uint) {
+            const npy_uint d = *(npy_uint *)ip2;
+            if (NPY_UNLIKELY(d == 0)) {  // zero divisor: set the divide-by-zero flag; accumulator becomes 0
+                npy_set_floatstatus_divbyzero();
+                io1 = 0;
+            } else {
+                io1 /= d;
+            }
+        }
+        *((npy_uint *)iop1) = io1;  // store the final accumulator value
+    }
+#if NPY_SIMD && defined(TO_SIMD_SFX)  // SIMD branches compile only when TO_SIMD_SFX is still defined
+#if defined(NPY_HAVE_VSX4)
+    // both arguments are arrays of the same size
+    else if (IS_BLOCKABLE_BINARY(sizeof(npy_uint), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
+    }
+#endif
+    // for a contiguous block of memory where the divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_uint), NPY_SIMD_WIDTH) &&
+             (*(npy_uint *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    else {  // element-by-element fallback; also taken when the scalar divisor is 0
+        BINARY_LOOP {
+            const npy_uint in1 = *(npy_uint *)ip1;
+            const npy_uint in2 = *(npy_uint *)ip2;
+            if (NPY_UNLIKELY(in2 == 0)) {  // zero divisor: set the divide-by-zero flag and store 0
+                npy_set_floatstatus_divbyzero();
+                *((npy_uint *)op1) = 0;
+            } else{
+                *((npy_uint *)op1) = in1 / in2;
+            }
+        }
+    }
+}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    // Indexed in-place division: target[index[k]] /= value[k] for k in [0, dimensions[0]).
+    char *target = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp target_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp extent = steps[3];  // added to negative indices to wrap them
+    const npy_intp count = dimensions[0];
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_uint *slot = (npy_uint *)(target + target_stride * pos);
+        const npy_uint divisor = *(const npy_uint *)val_ptr;
+        if (NPY_UNLIKELY(divisor == 0)) {
+            // Division by zero: raise the divide-by-zero status flag and store 0.
+            npy_set_floatstatus_divbyzero();
+            *slot = 0;
+        } else {
+            *slot /= divisor;
+        }
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+
+#line 439
+#undef TO_SIMD_SFX
+#if 0  // always false; lets every width case below be written as an #elif
+#line 444
+#elif NPY_BITSOF_LONG == 8
+    #define TO_SIMD_SFX(X) X##_u8
+
+#line 444
+#elif NPY_BITSOF_LONG == 16
+    #define TO_SIMD_SFX(X) X##_u16
+
+#line 444
+#elif NPY_BITSOF_LONG == 32
+    #define TO_SIMD_SFX(X) X##_u32
+
+#line 444
+#elif NPY_BITSOF_LONG == 64
+    #define TO_SIMD_SFX(X) X##_u64
+
+#endif
+/*
+ * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV falls back to scalar division
+ * because emulating multiply-high on these architectures would be expensive compared
+ * to the native scalar dividers.
+ * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
+ * Power10 (VSX4) is an exception here since it has native support for integer vector division.
+ */
+#if NPY_BITSOF_LONG == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+    #undef TO_SIMD_SFX
+#endif
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_divide)  // Integer division loop for npy_ulong operands.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction: repeatedly divide the accumulator io1 by each ip2 value
+        BINARY_REDUCE_LOOP(npy_ulong) {
+            const npy_ulong d = *(npy_ulong *)ip2;
+            if (NPY_UNLIKELY(d == 0)) {  // zero divisor: set the divide-by-zero flag; accumulator becomes 0
+                npy_set_floatstatus_divbyzero();
+                io1 = 0;
+            } else {
+                io1 /= d;
+            }
+        }
+        *((npy_ulong *)iop1) = io1;  // store the final accumulator value
+    }
+#if NPY_SIMD && defined(TO_SIMD_SFX)  // SIMD branches compile only when TO_SIMD_SFX is still defined
+#if defined(NPY_HAVE_VSX4)
+    // both arguments are arrays of the same size
+    else if (IS_BLOCKABLE_BINARY(sizeof(npy_ulong), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
+    }
+#endif
+    // for a contiguous block of memory where the divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulong), NPY_SIMD_WIDTH) &&
+             (*(npy_ulong *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    else {  // element-by-element fallback; also taken when the scalar divisor is 0
+        BINARY_LOOP {
+            const npy_ulong in1 = *(npy_ulong *)ip1;
+            const npy_ulong in2 = *(npy_ulong *)ip2;
+            if (NPY_UNLIKELY(in2 == 0)) {  // zero divisor: set the divide-by-zero flag and store 0
+                npy_set_floatstatus_divbyzero();
+                *((npy_ulong *)op1) = 0;
+            } else{
+                *((npy_ulong *)op1) = in1 / in2;
+            }
+        }
+    }
+}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    // Indexed in-place division: target[index[k]] /= value[k] for k in [0, dimensions[0]).
+    char *target = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp target_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp extent = steps[3];  // added to negative indices to wrap them
+    const npy_intp count = dimensions[0];
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_ulong *slot = (npy_ulong *)(target + target_stride * pos);
+        const npy_ulong divisor = *(const npy_ulong *)val_ptr;
+        if (NPY_UNLIKELY(divisor == 0)) {
+            // Division by zero: raise the divide-by-zero status flag and store 0.
+            npy_set_floatstatus_divbyzero();
+            *slot = 0;
+        } else {
+            *slot /= divisor;
+        }
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+
+#line 439
+#undef TO_SIMD_SFX
+#if 0  // always false; lets every width case below be written as an #elif
+#line 444
+#elif NPY_BITSOF_LONGLONG == 8
+    #define TO_SIMD_SFX(X) X##_u8
+
+#line 444
+#elif NPY_BITSOF_LONGLONG == 16
+    #define TO_SIMD_SFX(X) X##_u16
+
+#line 444
+#elif NPY_BITSOF_LONGLONG == 32
+    #define TO_SIMD_SFX(X) X##_u32
+
+#line 444
+#elif NPY_BITSOF_LONGLONG == 64
+    #define TO_SIMD_SFX(X) X##_u64
+
+#endif
+/*
+ * For 64-bit division on Armv7, Aarch64, and IBM/Power, NPYV falls back to scalar division
+ * because emulating multiply-high on these architectures would be expensive compared
+ * to the native scalar dividers.
+ * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles.
+ * Power10 (VSX4) is an exception here since it has native support for integer vector division.
+ */
+#if NPY_BITSOF_LONGLONG == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON))
+    #undef TO_SIMD_SFX
+#endif
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_divide)  // Integer division loop for npy_ulonglong operands.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction: repeatedly divide the accumulator io1 by each ip2 value
+        BINARY_REDUCE_LOOP(npy_ulonglong) {
+            const npy_ulonglong d = *(npy_ulonglong *)ip2;
+            if (NPY_UNLIKELY(d == 0)) {  // zero divisor: set the divide-by-zero flag; accumulator becomes 0
+                npy_set_floatstatus_divbyzero();
+                io1 = 0;
+            } else {
+                io1 /= d;
+            }
+        }
+        *((npy_ulonglong *)iop1) = io1;  // store the final accumulator value
+    }
+#if NPY_SIMD && defined(TO_SIMD_SFX)  // SIMD branches compile only when TO_SIMD_SFX is still defined
+#if defined(NPY_HAVE_VSX4)
+    // both arguments are arrays of the same size
+    else if (IS_BLOCKABLE_BINARY(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divide_contig)(args, dimensions[0]);
+    }
+#endif
+    // for a contiguous block of memory where the divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulonglong), NPY_SIMD_WIDTH) &&
+             (*(npy_ulonglong *)args[1]) != 0) {
+        TO_SIMD_SFX(simd_divide_by_scalar_contig)(args, dimensions[0]);
+    }
+#endif
+    else {  // element-by-element fallback; also taken when the scalar divisor is 0
+        BINARY_LOOP {
+            const npy_ulonglong in1 = *(npy_ulonglong *)ip1;
+            const npy_ulonglong in2 = *(npy_ulonglong *)ip2;
+            if (NPY_UNLIKELY(in2 == 0)) {  // zero divisor: set the divide-by-zero flag and store 0
+                npy_set_floatstatus_divbyzero();
+                *((npy_ulonglong *)op1) = 0;
+            } else{
+                *((npy_ulonglong *)op1) = in1 / in2;
+            }
+        }
+    }
+}
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_divide_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char * const*args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    // Indexed in-place division: target[index[k]] /= value[k] for k in [0, dimensions[0]).
+    char *target = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp target_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp extent = steps[3];  // added to negative indices to wrap them
+    const npy_intp count = dimensions[0];
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + extent : pos;
+        npy_ulonglong *slot = (npy_ulonglong *)(target + target_stride * pos);
+        const npy_ulonglong divisor = *(const npy_ulonglong *)val_ptr;
+        if (NPY_UNLIKELY(divisor == 0)) {
+            // Division by zero: raise the divide-by-zero status flag and store 0.
+            npy_set_floatstatus_divbyzero();
+            *slot = 0;
+        } else {
+            *slot /= divisor;
+        }
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+
+
diff --git a/numpy/core/src/_generated/loops_autovec.dispatch.c b/numpy/core/src/_generated/loops_autovec.dispatch.c
new file mode 100644
index 000000000000..db5cb1ca393d
--- /dev/null
+++ b/numpy/core/src/_generated/loops_autovec.dispatch.c
@@ -0,0 +1,2113 @@
+#line 1 "numpy/core/src/umath/loops_autovec.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*@targets
+ ** $maxopt $autovec baseline
+ ** sse2 avx2
+ ** neon
+ ** vsx2
+ ** vx
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*
+ *****************************************************************************
+ **                           INTEGER LOOPS
+ *****************************************************************************
+ */
+/*
+ * Arithmetic bit shift operations.
+ *
+ * Intel hardware masks bit shift values, so large shifts wrap around
+ * and can produce surprising results. The special handling ensures that
+ * behavior is independent of compiler or hardware.
+ * TODO: We could implement consistent behavior for negative shifts,
+ *       which is undefined in C.
+ */
+#define INT_left_shift_needs_clear_floatstatus
+#define UINT_left_shift_needs_clear_floatstatus
+
+#line 45
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_positive)  // Unary-plus loop for npy_byte.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_byte, npy_byte, *out = +in);  // `in`/`out` are not declared here; the loop macro provides them
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_square)  // Squaring loop for npy_byte.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP_FAST(npy_byte, npy_byte, *out = in * in);  // per-element statement; `in`/`out` come from the loop macro
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_reciprocal)  // Reciprocal loop for npy_byte.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP_FAST(npy_byte, npy_byte, *out = 1.0 / in);  // 1.0 is a double literal, so the division is floating point; NOTE(review): in == 0 is not special-cased here
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_add)  // Addition loop for npy_byte.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction case: accumulate each in2 into io1
+        BINARY_REDUCE_LOOP_FAST(npy_byte, io1 += in2);
+    }
+    else {  // general case: out = in1 + in2
+        BINARY_LOOP_FAST(npy_byte, npy_byte, *out = in1 + in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_subtract)  // Subtraction loop for npy_byte.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction case: subtract each in2 from io1
+        BINARY_REDUCE_LOOP_FAST(npy_byte, io1 -= in2);
+    }
+    else {  // general case: out = in1 - in2
+        BINARY_LOOP_FAST(npy_byte, npy_byte, *out = in1 - in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_multiply)  // Multiplication loop for npy_byte.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction case: multiply io1 by each in2
+        BINARY_REDUCE_LOOP_FAST(npy_byte, io1 *= in2);
+    }
+    else {  // general case: out = in1 * in2
+        BINARY_LOOP_FAST(npy_byte, npy_byte, *out = in1 * in2);
+    }
+}
+
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_left_shift)  // Left-shift loop for npy_byte.
+(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_FAST(npy_byte, npy_byte, *out = npy_lshifthh(in1, in2));  // the shift itself is delegated to npy_lshifthh
+#ifdef BYTE_left_shift_needs_clear_floatstatus  // the visible top of this file #defines only the INT_ and UINT_ variants of this macro
+    // For some reason, our macOS CI sets an "invalid" flag here, but only
+    // for some types.
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_right_shift)  // Right-shift loop for npy_byte.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#ifndef NPY_DO_NOT_OPTIMIZE_BYTE_right_shift  // default path: macro-based fast loop
+    BINARY_LOOP_FAST(npy_byte, npy_byte, *out = npy_rshifthh(in1, in2));
+#else  // opt-out path: plain BINARY_LOOP with explicit loads and store
+    BINARY_LOOP {
+        npy_byte in1 = *(npy_byte *)ip1;
+        npy_byte in2 = *(npy_byte *)ip2;
+        *(npy_byte *)op1 = npy_rshifthh(in1, in2);
+    }
+#endif
+}
+
+#line 45
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_positive)  // Unary-plus loop for npy_ubyte.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_ubyte, npy_ubyte, *out = +in);  // `in`/`out` are not declared here; the loop macro provides them
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_square)  // Squaring loop for npy_ubyte.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP_FAST(npy_ubyte, npy_ubyte, *out = in * in);  // per-element statement; `in`/`out` come from the loop macro
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_reciprocal)  // Reciprocal loop for npy_ubyte.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP_FAST(npy_ubyte, npy_ubyte, *out = 1.0 / in);  // 1.0 is a double literal, so the division is floating point; NOTE(review): in == 0 is not special-cased here
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_add)  // Addition loop for npy_ubyte.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction case: accumulate each in2 into io1
+        BINARY_REDUCE_LOOP_FAST(npy_ubyte, io1 += in2);
+    }
+    else {  // general case: out = in1 + in2
+        BINARY_LOOP_FAST(npy_ubyte, npy_ubyte, *out = in1 + in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_subtract)  // Subtraction loop for npy_ubyte.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction case: subtract each in2 from io1
+        BINARY_REDUCE_LOOP_FAST(npy_ubyte, io1 -= in2);
+    }
+    else {  // general case: out = in1 - in2
+        BINARY_LOOP_FAST(npy_ubyte, npy_ubyte, *out = in1 - in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_multiply)  // Multiplication loop for npy_ubyte.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction case: multiply io1 by each in2
+        BINARY_REDUCE_LOOP_FAST(npy_ubyte, io1 *= in2);
+    }
+    else {  // general case: out = in1 * in2
+        BINARY_LOOP_FAST(npy_ubyte, npy_ubyte, *out = in1 * in2);
+    }
+}
+
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_left_shift)  // Left-shift loop for npy_ubyte.
+(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_FAST(npy_ubyte, npy_ubyte, *out = npy_lshiftuhh(in1, in2));  // the shift itself is delegated to npy_lshiftuhh
+#ifdef UBYTE_left_shift_needs_clear_floatstatus  // the visible top of this file #defines only the INT_ and UINT_ variants of this macro
+    // For some reason, our macOS CI sets an "invalid" flag here, but only
+    // for some types.
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_right_shift)  // Right-shift loop for npy_ubyte.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#ifndef NPY_DO_NOT_OPTIMIZE_UBYTE_right_shift  // default path: macro-based fast loop
+    BINARY_LOOP_FAST(npy_ubyte, npy_ubyte, *out = npy_rshiftuhh(in1, in2));
+#else  // opt-out path: plain BINARY_LOOP with explicit loads and store
+    BINARY_LOOP {
+        npy_ubyte in1 = *(npy_ubyte *)ip1;
+        npy_ubyte in2 = *(npy_ubyte *)ip2;
+        *(npy_ubyte *)op1 = npy_rshiftuhh(in1, in2);
+    }
+#endif
+}
+
+#line 45
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_positive)  // Unary-plus loop for npy_short.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_short, npy_short, *out = +in);  // `in`/`out` are not declared here; the loop macro provides them
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_square)  // Squaring loop for npy_short.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP_FAST(npy_short, npy_short, *out = in * in);  // per-element statement; `in`/`out` come from the loop macro
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_reciprocal)  // Reciprocal loop for npy_short.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP_FAST(npy_short, npy_short, *out = 1.0 / in);  // 1.0 is a double literal, so the division is floating point; NOTE(review): in == 0 is not special-cased here
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_add)  // Addition loop for npy_short.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction case: accumulate each in2 into io1
+        BINARY_REDUCE_LOOP_FAST(npy_short, io1 += in2);
+    }
+    else {  // general case: out = in1 + in2
+        BINARY_LOOP_FAST(npy_short, npy_short, *out = in1 + in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_subtract)  // Subtraction loop for npy_short.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction case: subtract each in2 from io1
+        BINARY_REDUCE_LOOP_FAST(npy_short, io1 -= in2);
+    }
+    else {  // general case: out = in1 - in2
+        BINARY_LOOP_FAST(npy_short, npy_short, *out = in1 - in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_multiply)  // Multiplication loop for npy_short.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction case: multiply io1 by each in2
+        BINARY_REDUCE_LOOP_FAST(npy_short, io1 *= in2);
+    }
+    else {  // general case: out = in1 * in2
+        BINARY_LOOP_FAST(npy_short, npy_short, *out = in1 * in2);
+    }
+}
+
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_left_shift)  // Left-shift loop for npy_short.
+(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_FAST(npy_short, npy_short, *out = npy_lshifth(in1, in2));  // the shift itself is delegated to npy_lshifth
+#ifdef SHORT_left_shift_needs_clear_floatstatus  // the visible top of this file #defines only the INT_ and UINT_ variants of this macro
+    // For some reason, our macOS CI sets an "invalid" flag here, but only
+    // for some types.
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_right_shift)  // Right-shift loop for npy_short.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#ifndef NPY_DO_NOT_OPTIMIZE_SHORT_right_shift  // default path: macro-based fast loop
+    BINARY_LOOP_FAST(npy_short, npy_short, *out = npy_rshifth(in1, in2));
+#else  // opt-out path: plain BINARY_LOOP with explicit loads and store
+    BINARY_LOOP {
+        npy_short in1 = *(npy_short *)ip1;
+        npy_short in2 = *(npy_short *)ip2;
+        *(npy_short *)op1 = npy_rshifth(in1, in2);
+    }
+#endif
+}
+
+#line 45
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_positive)  // Unary-plus loop for npy_ushort.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_ushort, npy_ushort, *out = +in);  // `in`/`out` are not declared here; the loop macro provides them
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_square)  // Squaring loop for npy_ushort.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP_FAST(npy_ushort, npy_ushort, *out = in * in);  // per-element statement; `in`/`out` come from the loop macro
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_reciprocal)  // Reciprocal loop for npy_ushort.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP_FAST(npy_ushort, npy_ushort, *out = 1.0 / in);  // 1.0 is a double literal, so the division is floating point; NOTE(review): in == 0 is not special-cased here
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_add)  // Addition loop for npy_ushort.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction case: accumulate each in2 into io1
+        BINARY_REDUCE_LOOP_FAST(npy_ushort, io1 += in2);
+    }
+    else {  // general case: out = in1 + in2
+        BINARY_LOOP_FAST(npy_ushort, npy_ushort, *out = in1 + in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_subtract)  // Subtraction loop for npy_ushort.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction case: subtract each in2 from io1
+        BINARY_REDUCE_LOOP_FAST(npy_ushort, io1 -= in2);
+    }
+    else {  // general case: out = in1 - in2
+        BINARY_LOOP_FAST(npy_ushort, npy_ushort, *out = in1 - in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_multiply)  // Multiplication loop for npy_ushort.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction case: multiply io1 by each in2
+        BINARY_REDUCE_LOOP_FAST(npy_ushort, io1 *= in2);
+    }
+    else {  // general case: out = in1 * in2
+        BINARY_LOOP_FAST(npy_ushort, npy_ushort, *out = in1 * in2);
+    }
+}
+
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_left_shift)  // Left-shift loop for npy_ushort.
+(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_FAST(npy_ushort, npy_ushort, *out = npy_lshiftuh(in1, in2));  // the shift itself is delegated to npy_lshiftuh
+#ifdef USHORT_left_shift_needs_clear_floatstatus  // the visible top of this file #defines only the INT_ and UINT_ variants of this macro
+    // For some reason, our macOS CI sets an "invalid" flag here, but only
+    // for some types.
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_right_shift)  // Right-shift loop for npy_ushort.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#ifndef NPY_DO_NOT_OPTIMIZE_USHORT_right_shift  // default path: macro-based fast loop
+    BINARY_LOOP_FAST(npy_ushort, npy_ushort, *out = npy_rshiftuh(in1, in2));
+#else  // opt-out path: plain BINARY_LOOP with explicit loads and store
+    BINARY_LOOP {
+        npy_ushort in1 = *(npy_ushort *)ip1;
+        npy_ushort in2 = *(npy_ushort *)ip2;
+        *(npy_ushort *)op1 = npy_rshiftuh(in1, in2);
+    }
+#endif
+}
+
+#line 45
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_positive)  // Unary-plus loop for npy_int.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_int, npy_int, *out = +in);  // `in`/`out` are not declared here; the loop macro provides them
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_square)  // Squaring loop for npy_int.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP_FAST(npy_int, npy_int, *out = in * in);  // per-element statement; `in`/`out` come from the loop macro
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_reciprocal)  // Reciprocal loop for npy_int.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    UNARY_LOOP_FAST(npy_int, npy_int, *out = 1.0 / in);  // 1.0 is a double literal, so the division is floating point; NOTE(review): in == 0 is not special-cased here
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_add)  // Addition loop for npy_int.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction case: accumulate each in2 into io1
+        BINARY_REDUCE_LOOP_FAST(npy_int, io1 += in2);
+    }
+    else {  // general case: out = in1 + in2
+        BINARY_LOOP_FAST(npy_int, npy_int, *out = in1 + in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_subtract)  // Subtraction loop for npy_int.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction case: subtract each in2 from io1
+        BINARY_REDUCE_LOOP_FAST(npy_int, io1 -= in2);
+    }
+    else {  // general case: out = in1 - in2
+        BINARY_LOOP_FAST(npy_int, npy_int, *out = in1 - in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_multiply)  // Multiplication loop for npy_int.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {  // reduction case: multiply io1 by each in2
+        BINARY_REDUCE_LOOP_FAST(npy_int, io1 *= in2);
+    }
+    else {  // general case: out = in1 * in2
+        BINARY_LOOP_FAST(npy_int, npy_int, *out = in1 * in2);
+    }
+}
+
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_left_shift)  // Left-shift loop for npy_int.
+(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_FAST(npy_int, npy_int, *out = npy_lshift(in1, in2));  // the shift itself is delegated to npy_lshift
+#ifdef INT_left_shift_needs_clear_floatstatus  // this macro is #defined near the top of this file, so the clear below is compiled in
+    // For some reason, our macOS CI sets an "invalid" flag here, but only
+    // for some types.
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_right_shift)  // Right-shift loop for npy_int.
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#ifndef NPY_DO_NOT_OPTIMIZE_INT_right_shift  // default path: macro-based fast loop
+    BINARY_LOOP_FAST(npy_int, npy_int, *out = npy_rshift(in1, in2));
+#else  // opt-out path: plain BINARY_LOOP with explicit loads and store
+    BINARY_LOOP {
+        npy_int in1 = *(npy_int *)ip1;
+        npy_int in2 = *(npy_int *)ip2;
+        *(npy_int *)op1 = npy_rshift(in1, in2);
+    }
+#endif
+}
+
+#line 45
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_positive)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Unary-plus loop for npy_uint; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_uint, npy_uint, *out = +in);  /* each output receives +in */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_square)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Squaring loop for npy_uint; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_uint, npy_uint, *out = in * in);  /* each output receives in * in */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_reciprocal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Reciprocal loop for npy_uint: the quotient 1.0 / in is formed in double and stored through *out. */
+    UNARY_LOOP_FAST(npy_uint, npy_uint, *out = 1.0 / in);  /* NOTE(review): no guard for in == 0 is visible here */
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_add)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Addition loop for npy_uint; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: every in2 is added into the accumulator io1 */
+        BINARY_REDUCE_LOOP_FAST(npy_uint, io1 += in2);
+    }
+    else {  /* element-wise form: one *out = in1 + in2 per operand pair */
+        BINARY_LOOP_FAST(npy_uint, npy_uint, *out = in1 + in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_subtract)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Subtraction loop for npy_uint; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: every in2 is subtracted from the accumulator io1 */
+        BINARY_REDUCE_LOOP_FAST(npy_uint, io1 -= in2);
+    }
+    else {  /* element-wise form: one *out = in1 - in2 per operand pair */
+        BINARY_LOOP_FAST(npy_uint, npy_uint, *out = in1 - in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_multiply)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Multiplication loop for npy_uint; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is multiplied by every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_uint, io1 *= in2);
+    }
+    else {  /* element-wise form: one *out = in1 * in2 per operand pair */
+        BINARY_LOOP_FAST(npy_uint, npy_uint, *out = in1 * in2);
+    }
+}
+
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_left_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
+{   /* Left-shift loop for npy_uint: each output is npy_lshiftu(in1, in2). */
+    BINARY_LOOP_FAST(npy_uint, npy_uint, *out = npy_lshiftu(in1, in2));
+#ifdef UINT_left_shift_needs_clear_floatstatus  /* workaround compiled only when this macro is defined */
+    // For some reason, our macOS CI sets an "invalid" flag here, but only
+    // for some types.
+    npy_clear_floatstatus_barrier((char*)dimensions);  /* presumably clears the FP status flags raised by the loop -- confirm */
+#endif
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_right_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Right-shift loop for npy_uint: each output is npy_rshiftu(in1, in2). */
+#ifndef NPY_DO_NOT_OPTIMIZE_UINT_right_shift  /* default build: use the _FAST loop macro */
+    BINARY_LOOP_FAST(npy_uint, npy_uint, *out = npy_rshiftu(in1, in2));
+#else  /* opt-out build: plain BINARY_LOOP with explicit loads from ip1/ip2 and store to op1 */
+    BINARY_LOOP {
+        npy_uint in1 = *(npy_uint *)ip1;
+        npy_uint in2 = *(npy_uint *)ip2;
+        *(npy_uint *)op1 = npy_rshiftu(in1, in2);
+    }
+#endif
+}
+
+#line 45
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_positive)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Unary-plus loop for npy_long; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_long, npy_long, *out = +in);  /* each output receives +in */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_square)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Squaring loop for npy_long; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_long, npy_long, *out = in * in);  /* each output receives in * in */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_reciprocal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Reciprocal loop for npy_long: the quotient 1.0 / in is formed in double and stored through *out. */
+    UNARY_LOOP_FAST(npy_long, npy_long, *out = 1.0 / in);  /* NOTE(review): no guard for in == 0 is visible here */
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_add)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Addition loop for npy_long; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: every in2 is added into the accumulator io1 */
+        BINARY_REDUCE_LOOP_FAST(npy_long, io1 += in2);
+    }
+    else {  /* element-wise form: one *out = in1 + in2 per operand pair */
+        BINARY_LOOP_FAST(npy_long, npy_long, *out = in1 + in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_subtract)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Subtraction loop for npy_long; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: every in2 is subtracted from the accumulator io1 */
+        BINARY_REDUCE_LOOP_FAST(npy_long, io1 -= in2);
+    }
+    else {  /* element-wise form: one *out = in1 - in2 per operand pair */
+        BINARY_LOOP_FAST(npy_long, npy_long, *out = in1 - in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_multiply)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Multiplication loop for npy_long; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is multiplied by every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_long, io1 *= in2);
+    }
+    else {  /* element-wise form: one *out = in1 * in2 per operand pair */
+        BINARY_LOOP_FAST(npy_long, npy_long, *out = in1 * in2);
+    }
+}
+
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_left_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
+{   /* Left-shift loop for npy_long: each output is npy_lshiftl(in1, in2). */
+    BINARY_LOOP_FAST(npy_long, npy_long, *out = npy_lshiftl(in1, in2));
+#ifdef LONG_left_shift_needs_clear_floatstatus  /* workaround compiled only when this macro is defined */
+    // For some reason, our macOS CI sets an "invalid" flag here, but only
+    // for some types.
+    npy_clear_floatstatus_barrier((char*)dimensions);  /* presumably clears the FP status flags raised by the loop -- confirm */
+#endif
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_right_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Right-shift loop for npy_long: each output is npy_rshiftl(in1, in2). */
+#ifndef NPY_DO_NOT_OPTIMIZE_LONG_right_shift  /* default build: use the _FAST loop macro */
+    BINARY_LOOP_FAST(npy_long, npy_long, *out = npy_rshiftl(in1, in2));
+#else  /* opt-out build: plain BINARY_LOOP with explicit loads from ip1/ip2 and store to op1 */
+    BINARY_LOOP {
+        npy_long in1 = *(npy_long *)ip1;
+        npy_long in2 = *(npy_long *)ip2;
+        *(npy_long *)op1 = npy_rshiftl(in1, in2);
+    }
+#endif
+}
+
+#line 45
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_positive)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Unary-plus loop for npy_ulong; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ulong, npy_ulong, *out = +in);  /* each output receives +in */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_square)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Squaring loop for npy_ulong; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ulong, npy_ulong, *out = in * in);  /* each output receives in * in */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_reciprocal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Reciprocal loop for npy_ulong: the quotient 1.0 / in is formed in double and stored through *out. */
+    UNARY_LOOP_FAST(npy_ulong, npy_ulong, *out = 1.0 / in);  /* NOTE(review): no guard for in == 0 is visible here */
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_add)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Addition loop for npy_ulong; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: every in2 is added into the accumulator io1 */
+        BINARY_REDUCE_LOOP_FAST(npy_ulong, io1 += in2);
+    }
+    else {  /* element-wise form: one *out = in1 + in2 per operand pair */
+        BINARY_LOOP_FAST(npy_ulong, npy_ulong, *out = in1 + in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_subtract)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Subtraction loop for npy_ulong; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: every in2 is subtracted from the accumulator io1 */
+        BINARY_REDUCE_LOOP_FAST(npy_ulong, io1 -= in2);
+    }
+    else {  /* element-wise form: one *out = in1 - in2 per operand pair */
+        BINARY_LOOP_FAST(npy_ulong, npy_ulong, *out = in1 - in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_multiply)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Multiplication loop for npy_ulong; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is multiplied by every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_ulong, io1 *= in2);
+    }
+    else {  /* element-wise form: one *out = in1 * in2 per operand pair */
+        BINARY_LOOP_FAST(npy_ulong, npy_ulong, *out = in1 * in2);
+    }
+}
+
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_left_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
+{   /* Left-shift loop for npy_ulong: each output is npy_lshiftul(in1, in2). */
+    BINARY_LOOP_FAST(npy_ulong, npy_ulong, *out = npy_lshiftul(in1, in2));
+#ifdef ULONG_left_shift_needs_clear_floatstatus  /* workaround compiled only when this macro is defined */
+    // For some reason, our macOS CI sets an "invalid" flag here, but only
+    // for some types.
+    npy_clear_floatstatus_barrier((char*)dimensions);  /* presumably clears the FP status flags raised by the loop -- confirm */
+#endif
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_right_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Right-shift loop for npy_ulong: each output is npy_rshiftul(in1, in2). */
+#ifndef NPY_DO_NOT_OPTIMIZE_ULONG_right_shift  /* default build: use the _FAST loop macro */
+    BINARY_LOOP_FAST(npy_ulong, npy_ulong, *out = npy_rshiftul(in1, in2));
+#else  /* opt-out build: plain BINARY_LOOP with explicit loads from ip1/ip2 and store to op1 */
+    BINARY_LOOP {
+        npy_ulong in1 = *(npy_ulong *)ip1;
+        npy_ulong in2 = *(npy_ulong *)ip2;
+        *(npy_ulong *)op1 = npy_rshiftul(in1, in2);
+    }
+#endif
+}
+
+#line 45
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_positive)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Unary-plus loop for npy_longlong; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_longlong, npy_longlong, *out = +in);  /* each output receives +in */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_square)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Squaring loop for npy_longlong; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_longlong, npy_longlong, *out = in * in);  /* each output receives in * in */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_reciprocal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Reciprocal loop for npy_longlong: the quotient 1.0 / in is formed in double and stored through *out. */
+    UNARY_LOOP_FAST(npy_longlong, npy_longlong, *out = 1.0 / in);  /* NOTE(review): no guard for in == 0 is visible here */
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_add)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Addition loop for npy_longlong; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: every in2 is added into the accumulator io1 */
+        BINARY_REDUCE_LOOP_FAST(npy_longlong, io1 += in2);
+    }
+    else {  /* element-wise form: one *out = in1 + in2 per operand pair */
+        BINARY_LOOP_FAST(npy_longlong, npy_longlong, *out = in1 + in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_subtract)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Subtraction loop for npy_longlong; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: every in2 is subtracted from the accumulator io1 */
+        BINARY_REDUCE_LOOP_FAST(npy_longlong, io1 -= in2);
+    }
+    else {  /* element-wise form: one *out = in1 - in2 per operand pair */
+        BINARY_LOOP_FAST(npy_longlong, npy_longlong, *out = in1 - in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_multiply)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Multiplication loop for npy_longlong; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is multiplied by every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_longlong, io1 *= in2);
+    }
+    else {  /* element-wise form: one *out = in1 * in2 per operand pair */
+        BINARY_LOOP_FAST(npy_longlong, npy_longlong, *out = in1 * in2);
+    }
+}
+
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_left_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
+{   /* Left-shift loop for npy_longlong: each output is npy_lshiftll(in1, in2). */
+    BINARY_LOOP_FAST(npy_longlong, npy_longlong, *out = npy_lshiftll(in1, in2));
+#ifdef LONGLONG_left_shift_needs_clear_floatstatus  /* workaround compiled only when this macro is defined */
+    // For some reason, our macOS CI sets an "invalid" flag here, but only
+    // for some types.
+    npy_clear_floatstatus_barrier((char*)dimensions);  /* presumably clears the FP status flags raised by the loop -- confirm */
+#endif
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_right_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Right-shift loop for npy_longlong: each output is npy_rshiftll(in1, in2). */
+#ifndef NPY_DO_NOT_OPTIMIZE_LONGLONG_right_shift  /* default build: use the _FAST loop macro */
+    BINARY_LOOP_FAST(npy_longlong, npy_longlong, *out = npy_rshiftll(in1, in2));
+#else  /* opt-out build: plain BINARY_LOOP with explicit loads from ip1/ip2 and store to op1 */
+    BINARY_LOOP {
+        npy_longlong in1 = *(npy_longlong *)ip1;
+        npy_longlong in2 = *(npy_longlong *)ip2;
+        *(npy_longlong *)op1 = npy_rshiftll(in1, in2);
+    }
+#endif
+}
+
+#line 45
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_positive)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Unary-plus loop for npy_ulonglong; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ulonglong, npy_ulonglong, *out = +in);  /* each output receives +in */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_square)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Squaring loop for npy_ulonglong; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ulonglong, npy_ulonglong, *out = in * in);  /* each output receives in * in */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_reciprocal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* Reciprocal loop for npy_ulonglong: the quotient 1.0 / in is formed in double and stored through *out. */
+    UNARY_LOOP_FAST(npy_ulonglong, npy_ulonglong, *out = 1.0 / in);  /* NOTE(review): no guard for in == 0 is visible here */
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_add)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Addition loop for npy_ulonglong; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: every in2 is added into the accumulator io1 */
+        BINARY_REDUCE_LOOP_FAST(npy_ulonglong, io1 += in2);
+    }
+    else {  /* element-wise form: one *out = in1 + in2 per operand pair */
+        BINARY_LOOP_FAST(npy_ulonglong, npy_ulonglong, *out = in1 + in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_subtract)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Subtraction loop for npy_ulonglong; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: every in2 is subtracted from the accumulator io1 */
+        BINARY_REDUCE_LOOP_FAST(npy_ulonglong, io1 -= in2);
+    }
+    else {  /* element-wise form: one *out = in1 - in2 per operand pair */
+        BINARY_LOOP_FAST(npy_ulonglong, npy_ulonglong, *out = in1 - in2);
+    }
+}
+
+#line 69
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_multiply)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Multiplication loop for npy_ulonglong; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is multiplied by every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_ulonglong, io1 *= in2);
+    }
+    else {  /* element-wise form: one *out = in1 * in2 per operand pair */
+        BINARY_LOOP_FAST(npy_ulonglong, npy_ulonglong, *out = in1 * in2);
+    }
+}
+
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_left_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps,
+                  void *NPY_UNUSED(func))
+{   /* Left-shift loop for npy_ulonglong: each output is npy_lshiftull(in1, in2). */
+    BINARY_LOOP_FAST(npy_ulonglong, npy_ulonglong, *out = npy_lshiftull(in1, in2));
+#ifdef ULONGLONG_left_shift_needs_clear_floatstatus  /* workaround compiled only when this macro is defined */
+    // For some reason, our macOS CI sets an "invalid" flag here, but only
+    // for some types.
+    npy_clear_floatstatus_barrier((char*)dimensions);  /* presumably clears the FP status flags raised by the loop -- confirm */
+#endif
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_right_shift)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Right-shift loop for npy_ulonglong: each output is npy_rshiftull(in1, in2). */
+#ifndef NPY_DO_NOT_OPTIMIZE_ULONGLONG_right_shift  /* default build: use the _FAST loop macro */
+    BINARY_LOOP_FAST(npy_ulonglong, npy_ulonglong, *out = npy_rshiftull(in1, in2));
+#else  /* opt-out build: plain BINARY_LOOP with explicit loads from ip1/ip2 and store to op1 */
+    BINARY_LOOP {
+        npy_ulonglong in1 = *(npy_ulonglong *)ip1;
+        npy_ulonglong in2 = *(npy_ulonglong *)ip2;
+        *(npy_ulonglong *)op1 = npy_rshiftull(in1, in2);
+    }
+#endif
+}
+
+
+/*
+ *****************************************************************************
+ **                         UNSIGNED INTEGER LOOPS
+ *****************************************************************************
+ */
+#line 118
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Absolute-value loop for npy_ubyte; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ubyte, npy_ubyte, *out = in);  /* unsigned type: the input is stored unchanged */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Sign loop for npy_ubyte; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ubyte, npy_ubyte, *out = in > 0 ? 1 : 0);  /* 1 when in > 0, otherwise 0 */
+}
+
+#line 135
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_bitwise_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-AND loop for npy_ubyte; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is AND-ed with every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_ubyte, io1 &= in2);
+    }
+    else {  /* element-wise form: one *out = in1 & in2 per operand pair */
+        BINARY_LOOP_FAST(npy_ubyte, npy_ubyte, *out = in1 & in2);
+    }
+}
+
+#line 135
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_bitwise_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-OR loop for npy_ubyte; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is OR-ed with every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_ubyte, io1 |= in2);
+    }
+    else {  /* element-wise form: one *out = in1 | in2 per operand pair */
+        BINARY_LOOP_FAST(npy_ubyte, npy_ubyte, *out = in1 | in2);
+    }
+}
+
+#line 135
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_bitwise_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-XOR loop for npy_ubyte; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is XOR-ed with every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_ubyte, io1 ^= in2);
+    }
+    else {  /* element-wise form: one *out = in1 ^ in2 per operand pair */
+        BINARY_LOOP_FAST(npy_ubyte, npy_ubyte, *out = in1 ^ in2);
+    }
+}
+
+
+#line 151
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_logical_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical-AND loop for npy_ubyte operands; the 4th argument is unused. */
+    /*
+     * gcc vectorization of this is not good (PR60575) but manual integer
+     * vectorization is too tedious to be worthwhile
+     */
+    BINARY_LOOP_FAST(npy_ubyte, npy_bool, *out = in1 && in2);  /* type arguments: npy_ubyte in, npy_bool out */
+}
+
+#line 151
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_logical_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical-OR loop for npy_ubyte operands; the 4th argument is unused. */
+    /*
+     * gcc vectorization of this is not good (PR60575) but manual integer
+     * vectorization is too tedious to be worthwhile
+     */
+    BINARY_LOOP_FAST(npy_ubyte, npy_bool, *out = in1 || in2);  /* type arguments: npy_ubyte in, npy_bool out */
+}
+
+
+NPY_FINLINE npy_bool UBYTE_logical_xor_(npy_ubyte in1, npy_ubyte in2)
+{ return (in1 != 0) ^ (in2 != 0); /* truth-value XOR: 1 iff exactly one operand is nonzero */ }
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_logical_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical-XOR loop for npy_ubyte operands; the 4th argument is unused. */
+    BINARY_LOOP_FAST(npy_ubyte, npy_bool, *out = UBYTE_logical_xor_(in1, in2));  /* per-pair work is delegated to the helper */
+}
+
+#line 176
+NPY_NO_EXPORT void  NPY_CPU_DISPATCH_CURFX(UBYTE_isnan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* isnan loop for npy_ubyte: the input value is ignored and every output is NPY_FALSE. */
+    /*
+     * The (void)in; suppresses an unused variable warning raised by gcc and allows
+     * us to re-use this macro even though we do not depend on in
+     */
+    UNARY_LOOP_FAST(npy_ubyte, npy_bool, (void)in; *out = NPY_FALSE);
+}
+
+#line 176
+NPY_NO_EXPORT void  NPY_CPU_DISPATCH_CURFX(UBYTE_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* isinf loop for npy_ubyte: the input value is ignored and every output is NPY_FALSE. */
+    /*
+     * The (void)in; suppresses an unused variable warning raised by gcc and allows
+     * us to re-use this macro even though we do not depend on in
+     */
+    UNARY_LOOP_FAST(npy_ubyte, npy_bool, (void)in; *out = NPY_FALSE);
+}
+
+#line 176
+NPY_NO_EXPORT void  NPY_CPU_DISPATCH_CURFX(UBYTE_isfinite)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* isfinite loop for npy_ubyte: the input value is ignored and every output is NPY_TRUE. */
+    /*
+     * The (void)in; suppresses an unused variable warning raised by gcc and allows
+     * us to re-use this macro even though we do not depend on in
+     */
+    UNARY_LOOP_FAST(npy_ubyte, npy_bool, (void)in; *out = NPY_TRUE);
+}
+
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_conjugate)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Conjugate loop for npy_ubyte; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ubyte, npy_ubyte, *out = in);  /* the input is stored unchanged */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical-NOT loop for npy_ubyte operands; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ubyte, npy_bool, *out = !in);  /* type arguments: npy_ubyte in, npy_bool out */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_invert)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-NOT loop for npy_ubyte; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ubyte, npy_ubyte, *out = ~in);  /* each output receives ~in */
+}
+
+#line 118
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Absolute-value loop for npy_ushort; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ushort, npy_ushort, *out = in);  /* unsigned type: the input is stored unchanged */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Sign loop for npy_ushort; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ushort, npy_ushort, *out = in > 0 ? 1 : 0);  /* 1 when in > 0, otherwise 0 */
+}
+
+#line 135
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_bitwise_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-AND loop for npy_ushort; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is AND-ed with every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_ushort, io1 &= in2);
+    }
+    else {  /* element-wise form: one *out = in1 & in2 per operand pair */
+        BINARY_LOOP_FAST(npy_ushort, npy_ushort, *out = in1 & in2);
+    }
+}
+
+#line 135
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_bitwise_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-OR loop for npy_ushort; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is OR-ed with every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_ushort, io1 |= in2);
+    }
+    else {  /* element-wise form: one *out = in1 | in2 per operand pair */
+        BINARY_LOOP_FAST(npy_ushort, npy_ushort, *out = in1 | in2);
+    }
+}
+
+#line 135
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_bitwise_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-XOR loop for npy_ushort; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is XOR-ed with every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_ushort, io1 ^= in2);
+    }
+    else {  /* element-wise form: one *out = in1 ^ in2 per operand pair */
+        BINARY_LOOP_FAST(npy_ushort, npy_ushort, *out = in1 ^ in2);
+    }
+}
+
+
+#line 151
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_logical_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical-AND loop for npy_ushort operands; the 4th argument is unused. */
+    /*
+     * gcc vectorization of this is not good (PR60575) but manual integer
+     * vectorization is too tedious to be worthwhile
+     */
+    BINARY_LOOP_FAST(npy_ushort, npy_bool, *out = in1 && in2);  /* type arguments: npy_ushort in, npy_bool out */
+}
+
+#line 151
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_logical_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical-OR loop for npy_ushort operands; the 4th argument is unused. */
+    /*
+     * gcc vectorization of this is not good (PR60575) but manual integer
+     * vectorization is too tedious to be worthwhile
+     */
+    BINARY_LOOP_FAST(npy_ushort, npy_bool, *out = in1 || in2);  /* type arguments: npy_ushort in, npy_bool out */
+}
+
+
+NPY_FINLINE npy_bool USHORT_logical_xor_(npy_ushort in1, npy_ushort in2)
+{ return (in1 != 0) ^ (in2 != 0); /* truth-value XOR: 1 iff exactly one operand is nonzero */ }
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_logical_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical-XOR loop for npy_ushort operands; the 4th argument is unused. */
+    BINARY_LOOP_FAST(npy_ushort, npy_bool, *out = USHORT_logical_xor_(in1, in2));  /* per-pair work is delegated to the helper */
+}
+
+#line 176
+NPY_NO_EXPORT void  NPY_CPU_DISPATCH_CURFX(USHORT_isnan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* isnan loop for npy_ushort: the input value is ignored and every output is NPY_FALSE. */
+    /*
+     * The (void)in; suppresses an unused variable warning raised by gcc and allows
+     * us to re-use this macro even though we do not depend on in
+     */
+    UNARY_LOOP_FAST(npy_ushort, npy_bool, (void)in; *out = NPY_FALSE);
+}
+
+#line 176
+NPY_NO_EXPORT void  NPY_CPU_DISPATCH_CURFX(USHORT_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* isinf loop for npy_ushort: the input value is ignored and every output is NPY_FALSE. */
+    /*
+     * The (void)in; suppresses an unused variable warning raised by gcc and allows
+     * us to re-use this macro even though we do not depend on in
+     */
+    UNARY_LOOP_FAST(npy_ushort, npy_bool, (void)in; *out = NPY_FALSE);
+}
+
+#line 176
+NPY_NO_EXPORT void  NPY_CPU_DISPATCH_CURFX(USHORT_isfinite)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* isfinite loop for npy_ushort: the input value is ignored and every output is NPY_TRUE. */
+    /*
+     * The (void)in; suppresses an unused variable warning raised by gcc and allows
+     * us to re-use this macro even though we do not depend on in
+     */
+    UNARY_LOOP_FAST(npy_ushort, npy_bool, (void)in; *out = NPY_TRUE);
+}
+
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_conjugate)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Conjugate loop for npy_ushort; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ushort, npy_ushort, *out = in);  /* the input is stored unchanged */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical-NOT loop for npy_ushort operands; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ushort, npy_bool, *out = !in);  /* type arguments: npy_ushort in, npy_bool out */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_invert)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-NOT loop for npy_ushort; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ushort, npy_ushort, *out = ~in);  /* each output receives ~in */
+}
+
+#line 118
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Absolute-value loop for npy_uint; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_uint, npy_uint, *out = in);  /* unsigned type: the input is stored unchanged */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Sign loop for npy_uint; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_uint, npy_uint, *out = in > 0 ? 1 : 0);  /* 1 when in > 0, otherwise 0 */
+}
+
+#line 135
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_bitwise_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-AND loop for npy_uint; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is AND-ed with every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_uint, io1 &= in2);
+    }
+    else {  /* element-wise form: one *out = in1 & in2 per operand pair */
+        BINARY_LOOP_FAST(npy_uint, npy_uint, *out = in1 & in2);
+    }
+}
+
+#line 135
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_bitwise_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-OR loop for npy_uint; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is OR-ed with every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_uint, io1 |= in2);
+    }
+    else {  /* element-wise form: one *out = in1 | in2 per operand pair */
+        BINARY_LOOP_FAST(npy_uint, npy_uint, *out = in1 | in2);
+    }
+}
+
+#line 135
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_bitwise_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-XOR loop for npy_uint; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is XOR-ed with every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_uint, io1 ^= in2);
+    }
+    else {  /* element-wise form: one *out = in1 ^ in2 per operand pair */
+        BINARY_LOOP_FAST(npy_uint, npy_uint, *out = in1 ^ in2);
+    }
+}
+
+
+#line 151
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_logical_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical-AND loop for npy_uint operands; the 4th argument is unused. */
+    /*
+     * gcc vectorization of this is not good (PR60575) but manual integer
+     * vectorization is too tedious to be worthwhile
+     */
+    BINARY_LOOP_FAST(npy_uint, npy_bool, *out = in1 && in2);  /* type arguments: npy_uint in, npy_bool out */
+}
+
+#line 151
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_logical_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical-OR loop for npy_uint operands; the 4th argument is unused. */
+    /*
+     * gcc vectorization of this is not good (PR60575) but manual integer
+     * vectorization is too tedious to be worthwhile
+     */
+    BINARY_LOOP_FAST(npy_uint, npy_bool, *out = in1 || in2);  /* type arguments: npy_uint in, npy_bool out */
+}
+
+
+NPY_FINLINE npy_bool UINT_logical_xor_(npy_uint in1, npy_uint in2)
+{ return (in1 != 0) ^ (in2 != 0); /* truth-value XOR: 1 iff exactly one operand is nonzero */ }
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_logical_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical-XOR loop for npy_uint operands; the 4th argument is unused. */
+    BINARY_LOOP_FAST(npy_uint, npy_bool, *out = UINT_logical_xor_(in1, in2));  /* per-pair work is delegated to the helper */
+}
+
+#line 176
+NPY_NO_EXPORT void  NPY_CPU_DISPATCH_CURFX(UINT_isnan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* isnan loop for npy_uint: the input value is ignored and every output is NPY_FALSE. */
+    /*
+     * The (void)in; suppresses an unused variable warning raised by gcc and allows
+     * us to re-use this macro even though we do not depend on in
+     */
+    UNARY_LOOP_FAST(npy_uint, npy_bool, (void)in; *out = NPY_FALSE);
+}
+
+#line 176
+NPY_NO_EXPORT void  NPY_CPU_DISPATCH_CURFX(UINT_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* isinf loop for npy_uint: the input value is ignored and every output is NPY_FALSE. */
+    /*
+     * The (void)in; suppresses an unused variable warning raised by gcc and allows
+     * us to re-use this macro even though we do not depend on in
+     */
+    UNARY_LOOP_FAST(npy_uint, npy_bool, (void)in; *out = NPY_FALSE);
+}
+
+#line 176
+NPY_NO_EXPORT void  NPY_CPU_DISPATCH_CURFX(UINT_isfinite)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* isfinite loop for npy_uint: the input value is ignored and every output is NPY_TRUE. */
+    /*
+     * The (void)in; suppresses an unused variable warning raised by gcc and allows
+     * us to re-use this macro even though we do not depend on in
+     */
+    UNARY_LOOP_FAST(npy_uint, npy_bool, (void)in; *out = NPY_TRUE);
+}
+
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_conjugate)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Conjugate loop for npy_uint; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_uint, npy_uint, *out = in);  /* the input is stored unchanged */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical-NOT loop for npy_uint operands; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_uint, npy_bool, *out = !in);  /* type arguments: npy_uint in, npy_bool out */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_invert)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-NOT loop for npy_uint; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_uint, npy_uint, *out = ~in);  /* each output receives ~in */
+}
+
+#line 118
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Absolute-value loop for npy_ulong; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ulong, npy_ulong, *out = in);  /* unsigned type: the input is stored unchanged */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Sign loop for npy_ulong; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ulong, npy_ulong, *out = in > 0 ? 1 : 0);  /* 1 when in > 0, otherwise 0 */
+}
+
+#line 135
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_bitwise_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-AND loop for npy_ulong; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is AND-ed with every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_ulong, io1 &= in2);
+    }
+    else {  /* element-wise form: one *out = in1 & in2 per operand pair */
+        BINARY_LOOP_FAST(npy_ulong, npy_ulong, *out = in1 & in2);
+    }
+}
+
+#line 135
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_bitwise_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-OR loop for npy_ulong; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is OR-ed with every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_ulong, io1 |= in2);
+    }
+    else {  /* element-wise form: one *out = in1 | in2 per operand pair */
+        BINARY_LOOP_FAST(npy_ulong, npy_ulong, *out = in1 | in2);
+    }
+}
+
+#line 135
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_bitwise_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-XOR loop for npy_ulong; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is XOR-ed with every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_ulong, io1 ^= in2);
+    }
+    else {  /* element-wise form: one *out = in1 ^ in2 per operand pair */
+        BINARY_LOOP_FAST(npy_ulong, npy_ulong, *out = in1 ^ in2);
+    }
+}
+
+
+#line 151
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_logical_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical-AND loop for npy_ulong operands; the 4th argument is unused. */
+    /*
+     * gcc vectorization of this is not good (PR60575) but manual integer
+     * vectorization is too tedious to be worthwhile
+     */
+    BINARY_LOOP_FAST(npy_ulong, npy_bool, *out = in1 && in2);  /* type arguments: npy_ulong in, npy_bool out */
+}
+
+#line 151
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_logical_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical-OR loop for npy_ulong operands; the 4th argument is unused. */
+    /*
+     * gcc vectorization of this is not good (PR60575) but manual integer
+     * vectorization is too tedious to be worthwhile
+     */
+    BINARY_LOOP_FAST(npy_ulong, npy_bool, *out = in1 || in2);  /* type arguments: npy_ulong in, npy_bool out */
+}
+
+
+NPY_FINLINE npy_bool ULONG_logical_xor_(npy_ulong in1, npy_ulong in2)
+{ return (in1 != 0) ^ (in2 != 0); /* truth-value XOR: 1 iff exactly one operand is nonzero */ }
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_logical_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical-XOR loop for npy_ulong operands; the 4th argument is unused. */
+    BINARY_LOOP_FAST(npy_ulong, npy_bool, *out = ULONG_logical_xor_(in1, in2));  /* per-pair work is delegated to the helper */
+}
+
+#line 176
+NPY_NO_EXPORT void  NPY_CPU_DISPATCH_CURFX(ULONG_isnan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* isnan loop for npy_ulong: the input value is ignored and every output is NPY_FALSE. */
+    /*
+     * The (void)in; suppresses an unused variable warning raised by gcc and allows
+     * us to re-use this macro even though we do not depend on in
+     */
+    UNARY_LOOP_FAST(npy_ulong, npy_bool, (void)in; *out = NPY_FALSE);
+}
+
+#line 176
+NPY_NO_EXPORT void  NPY_CPU_DISPATCH_CURFX(ULONG_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* isinf loop for npy_ulong: the input value is ignored and every output is NPY_FALSE. */
+    /*
+     * The (void)in; suppresses an unused variable warning raised by gcc and allows
+     * us to re-use this macro even though we do not depend on in
+     */
+    UNARY_LOOP_FAST(npy_ulong, npy_bool, (void)in; *out = NPY_FALSE);
+}
+
+#line 176
+NPY_NO_EXPORT void  NPY_CPU_DISPATCH_CURFX(ULONG_isfinite)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* isfinite loop for npy_ulong: the input value is ignored and every output is NPY_TRUE. */
+    /*
+     * The (void)in; suppresses an unused variable warning raised by gcc and allows
+     * us to re-use this macro even though we do not depend on in
+     */
+    UNARY_LOOP_FAST(npy_ulong, npy_bool, (void)in; *out = NPY_TRUE);
+}
+
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_conjugate)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Conjugate loop for npy_ulong; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ulong, npy_ulong, *out = in);  /* the input is stored unchanged */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Logical-NOT loop for npy_ulong operands; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ulong, npy_bool, *out = !in);  /* type arguments: npy_ulong in, npy_bool out */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_invert)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-NOT loop for npy_ulong; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ulong, npy_ulong, *out = ~in);  /* each output receives ~in */
+}
+
+#line 118
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Absolute-value loop for npy_ulonglong; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ulonglong, npy_ulonglong, *out = in);  /* unsigned type: the input is stored unchanged */
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Sign loop for npy_ulonglong; the 4th argument is unused. */
+    UNARY_LOOP_FAST(npy_ulonglong, npy_ulonglong, *out = in > 0 ? 1 : 0);  /* 1 when in > 0, otherwise 0 */
+}
+
+#line 135
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_bitwise_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Bitwise-AND loop for npy_ulonglong; the 4th argument is unused. Two paths, chosen by IS_BINARY_REDUCE. */
+    if (IS_BINARY_REDUCE) {  /* reduction form: the accumulator io1 is AND-ed with every in2 */
+        BINARY_REDUCE_LOOP_FAST(npy_ulonglong, io1 &= in2);
+    }
+    else {  /* element-wise form: one *out = in1 & in2 per operand pair */
+        BINARY_LOOP_FAST(npy_ulonglong, npy_ulonglong, *out = in1 & in2);
+    }
+}
+
+#line 135
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_bitwise_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP_FAST(npy_ulonglong, io1 |= in2);
+    }
+    else {
+        BINARY_LOOP_FAST(npy_ulonglong, npy_ulonglong, *out = in1 | in2);
+    }
+}
+
+#line 135
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_bitwise_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (IS_BINARY_REDUCE) {
+        BINARY_REDUCE_LOOP_FAST(npy_ulonglong, io1 ^= in2);
+    }
+    else {
+        BINARY_LOOP_FAST(npy_ulonglong, npy_ulonglong, *out = in1 ^ in2);
+    }
+}
+
+
+#line 151
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_logical_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Stores `in1 && in2` as npy_bool for each element pair. gcc vectorizes this
+     * poorly (PR60575), but manual integer vectorization was deemed too tedious.
+     */
+    BINARY_LOOP_FAST(npy_ulonglong, npy_bool, *out = in1 && in2);
+}
+
+#line 151
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_logical_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Stores `in1 || in2` as npy_bool for each element pair. gcc vectorizes this
+     * poorly (PR60575), but manual integer vectorization was deemed too tedious.
+     */
+    BINARY_LOOP_FAST(npy_ulonglong, npy_bool, *out = in1 || in2);
+}
+
+
+NPY_FINLINE npy_bool ULONGLONG_logical_xor_(npy_ulonglong in1, npy_ulonglong in2)
+{ return (!!in1) != (!!in2); }
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_logical_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    BINARY_LOOP_FAST(npy_ulonglong, npy_bool, *out = ULONGLONG_logical_xor_(in1, in2));
+}
+
+#line 176
+NPY_NO_EXPORT void  NPY_CPU_DISPATCH_CURFX(ULONGLONG_isnan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Every output element is set to NPY_FALSE; the input value is not consulted.
+     * The (void)in; silences gcc's unused-variable warning so the macro is reusable.
+     */
+    UNARY_LOOP_FAST(npy_ulonglong, npy_bool, (void)in; *out = NPY_FALSE);
+}
+
+#line 176
+NPY_NO_EXPORT void  NPY_CPU_DISPATCH_CURFX(ULONGLONG_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Every output element is set to NPY_FALSE; the input value is not consulted.
+     * The (void)in; silences gcc's unused-variable warning so the macro is reusable.
+     */
+    UNARY_LOOP_FAST(npy_ulonglong, npy_bool, (void)in; *out = NPY_FALSE);
+}
+
+#line 176
+NPY_NO_EXPORT void  NPY_CPU_DISPATCH_CURFX(ULONGLONG_isfinite)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*
+     * Every output element is set to NPY_TRUE; the input value is not consulted.
+     * The (void)in; silences gcc's unused-variable warning so the macro is reusable.
+     */
+    UNARY_LOOP_FAST(npy_ulonglong, npy_bool, (void)in; *out = NPY_TRUE);
+}
+
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_conjugate)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_ulonglong, npy_ulonglong, *out = in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_ulonglong, npy_bool, *out = !in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_invert)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_ulonglong, npy_ulonglong, *out = ~in);
+}
+
+
+/*
+ *****************************************************************************
+ **                         SIGNED INTEGER LOOPS                            **
+ *****************************************************************************
+ */
+
+#line 217
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_byte, npy_byte, *out = (in >= 0) ? in : -in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_byte, npy_byte, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_conjugate)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_conjugate)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_invert)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_invert)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_isnan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_isnan)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_isinf)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_isfinite)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_isfinite)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_logical_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_logical_and)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_logical_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_logical_or)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_logical_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_logical_xor)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_logical_not)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_bitwise_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_bitwise_and)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_bitwise_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_bitwise_or)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_bitwise_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_bitwise_xor)(args, dimensions, steps, func);
+}
+
+
+#line 217
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_short, npy_short, *out = (in >= 0) ? in : -in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_short, npy_short, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_conjugate)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(USHORT_conjugate)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_invert)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(USHORT_invert)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_isnan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(USHORT_isnan)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(USHORT_isinf)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_isfinite)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(USHORT_isfinite)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_logical_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(USHORT_logical_and)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_logical_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(USHORT_logical_or)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_logical_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(USHORT_logical_xor)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(USHORT_logical_not)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_bitwise_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(USHORT_bitwise_and)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_bitwise_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(USHORT_bitwise_or)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_bitwise_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(USHORT_bitwise_xor)(args, dimensions, steps, func);
+}
+
+
+#line 217
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_int, npy_int, *out = (in >= 0) ? in : -in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_int, npy_int, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_conjugate)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UINT_conjugate)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_invert)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UINT_invert)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_isnan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UINT_isnan)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UINT_isinf)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_isfinite)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UINT_isfinite)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_logical_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UINT_logical_and)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_logical_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UINT_logical_or)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_logical_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UINT_logical_xor)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UINT_logical_not)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_bitwise_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UINT_bitwise_and)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_bitwise_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UINT_bitwise_or)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_bitwise_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UINT_bitwise_xor)(args, dimensions, steps, func);
+}
+
+
+#line 217
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_long, npy_long, *out = (in >= 0) ? in : -in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_long, npy_long, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_conjugate)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONG_conjugate)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_invert)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONG_invert)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_isnan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONG_isnan)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONG_isinf)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_isfinite)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONG_isfinite)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_logical_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONG_logical_and)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_logical_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONG_logical_or)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_logical_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONG_logical_xor)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONG_logical_not)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_bitwise_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONG_bitwise_and)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_bitwise_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONG_bitwise_or)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_bitwise_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONG_bitwise_xor)(args, dimensions, steps, func);
+}
+
+
+#line 217
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_longlong, npy_longlong, *out = (in >= 0) ? in : -in);
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_sign)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_longlong, npy_longlong, *out = in > 0 ? 1 : (in < 0 ? -1 : 0));
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_conjugate)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONGLONG_conjugate)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_invert)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONGLONG_invert)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_isnan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONGLONG_isnan)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONGLONG_isinf)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_isfinite)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONGLONG_isfinite)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_logical_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONGLONG_logical_and)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_logical_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONGLONG_logical_or)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_logical_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONGLONG_logical_xor)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONGLONG_logical_not)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_bitwise_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONGLONG_bitwise_and)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_bitwise_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONGLONG_bitwise_or)(args, dimensions, steps, func);
+}
+
+#line 235
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_bitwise_xor)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONGLONG_bitwise_xor)(args, dimensions, steps, func);
+}
+
+
+
+/*
+ *****************************************************************************
+ **                             BOOLEAN LOOPS                               **
+ *****************************************************************************
+ */
+#line 253
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_isnan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_isnan)(args, dimensions, steps, func);
+}
+
+#line 253
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_isinf)(args, dimensions, steps, func);
+}
+
+#line 253
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_isfinite)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(UBYTE_isfinite)(args, dimensions, steps, func);
+}
+
+
+/*
+ *****************************************************************************
+ **                          HALF-FLOAT LOOPS                               **
+ *****************************************************************************
+ */
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    UNARY_LOOP_FAST(npy_half, npy_half, *out = in&0x7fffu);
+}
+
+/*
+ *****************************************************************************
+ **                           DATETIME LOOPS                                **
+ *****************************************************************************
+ */
+
+#line 282
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DATETIME_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONGLONG_isinf)(args, dimensions, steps, func);
+}
+
+#line 282
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(TIMEDELTA_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *func)
+{
+    NPY_CPU_DISPATCH_CURFX(ULONGLONG_isinf)(args, dimensions, steps, func);
+}
+
+
diff --git a/numpy/core/src/_generated/loops_comparison.dispatch.c b/numpy/core/src/_generated/loops_comparison.dispatch.c
new file mode 100644
index 000000000000..b22c57e1c9cd
--- /dev/null
+++ b/numpy/core/src/_generated/loops_comparison.dispatch.c
@@ -0,0 +1,10299 @@
+#line 1 "numpy/core/src/umath/loops_comparison.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*@targets
+ ** $maxopt baseline
+ ** sse2 sse42 avx2 avx512f avx512_skx
+ ** vsx2 vsx3
+ ** neon
+ ** vx vxe
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+#line 28
+#line 35
+#if NPY_SIMD && !((1 || 0) && 0)
+static void simd_binary_equal_u8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 truemask    = npyv_setall_u8(0x1);
+    const int vstep           = npyv_nlanes_u8;
+
+    // Vector loop: each pass compares 'vstep' elements of src1 with src2 (a == b).
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 8 >= 8
+        npyv_u8  a1 = npyv_load_u8(src1 + npyv_nlanes_u8 * 0);
+        npyv_u8  b1 = npyv_load_u8(src2 + npyv_nlanes_u8 * 0);
+        npyv_b8 c1 = npyv_cmpeq_u8(a1, b1);
+#if 8 >= 16
+        npyv_u8  a2 = npyv_load_u8(src1 + npyv_nlanes_u8 * 1);
+        npyv_u8  b2 = npyv_load_u8(src2 + npyv_nlanes_u8 * 1);
+        npyv_b8 c2 = npyv_cmpeq_u8(a2, b2);
+#if 8 >= 32
+        npyv_u8  a3 = npyv_load_u8(src1 + npyv_nlanes_u8 * 2);
+        npyv_u8  b3 = npyv_load_u8(src2 + npyv_nlanes_u8 * 2);
+        npyv_u8  a4 = npyv_load_u8(src1 + npyv_nlanes_u8 * 3);
+        npyv_u8  b4 = npyv_load_u8(src2 + npyv_nlanes_u8 * 3);
+        npyv_b8 c3 = npyv_cmpeq_u8(a3, b3);
+        npyv_b8 c4 = npyv_cmpeq_u8(a4, b4);
+#if 8 == 64
+        npyv_u8  a5 = npyv_load_u8(src1 + npyv_nlanes_u8 * 4);
+        npyv_u8  b5 = npyv_load_u8(src2 + npyv_nlanes_u8 * 4);
+        npyv_u8  a6 = npyv_load_u8(src1 + npyv_nlanes_u8 * 5);
+        npyv_u8  b6 = npyv_load_u8(src2 + npyv_nlanes_u8 * 5);
+        npyv_u8  a7 = npyv_load_u8(src1 + npyv_nlanes_u8 * 6);
+        npyv_u8  b7 = npyv_load_u8(src2 + npyv_nlanes_u8 * 6);
+        npyv_u8  a8 = npyv_load_u8(src1 + npyv_nlanes_u8 * 7);
+        npyv_u8  b8 = npyv_load_u8(src2 + npyv_nlanes_u8 * 7);
+        npyv_b8 c5 = npyv_cmpeq_u8(a5, b5);
+        npyv_b8 c6 = npyv_cmpeq_u8(a6, b6);
+        npyv_b8 c7 = npyv_cmpeq_u8(a7, b7);
+        npyv_b8 c8 = npyv_cmpeq_u8(a8, b8);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+        // Convert the comparison mask(s) 'c' into a single u8 vector 'r'
+#if 8 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+    // Scalar tail: handle the remaining (fewer than 'vstep') elements one by one.
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {
+        const npyv_lanetype_u8 a = *src1;
+        const npyv_lanetype_u8 b = *src2;
+        *dst = a == b;
+    }
+}
+
+static void simd_binary_scalar1_equal_u8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 a         = npyv_setall_u8(scalar);
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);
+    const int vstep            = npyv_nlanes_u8;
+    // Vector loop: compare the broadcast args[0] scalar with 'vstep' elements of src.
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 8 >= 8
+        npyv_u8  b1 = npyv_load_u8(src + npyv_nlanes_u8 * 0);
+        npyv_b8 c1 = npyv_cmpeq_u8(a, b1);
+#if 8 >= 16
+        npyv_u8  b2 = npyv_load_u8(src + npyv_nlanes_u8 * 1);
+        npyv_b8 c2 = npyv_cmpeq_u8(a, b2);
+#if 8 >= 32
+        npyv_u8  b3 = npyv_load_u8(src + npyv_nlanes_u8 * 2);
+        npyv_u8  b4 = npyv_load_u8(src + npyv_nlanes_u8 * 3);
+        npyv_b8 c3 = npyv_cmpeq_u8(a, b3);
+        npyv_b8 c4 = npyv_cmpeq_u8(a, b4);
+#if 8 == 64
+        npyv_u8  b5 = npyv_load_u8(src + npyv_nlanes_u8 * 4);
+        npyv_u8  b6 = npyv_load_u8(src + npyv_nlanes_u8 * 5);
+        npyv_u8  b7 = npyv_load_u8(src + npyv_nlanes_u8 * 6);
+        npyv_u8  b8 = npyv_load_u8(src + npyv_nlanes_u8 * 7);
+        npyv_b8 c5 = npyv_cmpeq_u8(a, b5);
+        npyv_b8 c6 = npyv_cmpeq_u8(a, b6);
+        npyv_b8 c7 = npyv_cmpeq_u8(a, b7);
+        npyv_b8 c8 = npyv_cmpeq_u8(a, b8);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+#if 8 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+    // Scalar tail: handle the remaining (fewer than 'vstep') elements one by one.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_u8 b = *src;
+        *dst = scalar == b;
+    }
+}
+
+static void simd_binary_scalar2_equal_u8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 b         = npyv_setall_u8(scalar);
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);
+    const int vstep            = npyv_nlanes_u8;
+    // Vector loop: compare 'vstep' elements of src with the broadcast args[1] scalar.
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 8 >= 8
+        npyv_u8  a1 = npyv_load_u8(src + npyv_nlanes_u8 * 0);
+        npyv_b8 c1 = npyv_cmpeq_u8(a1, b);
+#if 8 >= 16
+        npyv_u8  a2 = npyv_load_u8(src + npyv_nlanes_u8 * 1);
+        npyv_b8 c2 = npyv_cmpeq_u8(a2, b);
+#if 8 >= 32
+        npyv_u8  a3 = npyv_load_u8(src + npyv_nlanes_u8 * 2);
+        npyv_u8  a4 = npyv_load_u8(src + npyv_nlanes_u8 * 3);
+        npyv_b8 c3 = npyv_cmpeq_u8(a3, b);
+        npyv_b8 c4 = npyv_cmpeq_u8(a4, b);
+#if 8 == 64
+        npyv_u8  a5 = npyv_load_u8(src + npyv_nlanes_u8 * 4);
+        npyv_u8  a6 = npyv_load_u8(src + npyv_nlanes_u8 * 5);
+        npyv_u8  a7 = npyv_load_u8(src + npyv_nlanes_u8 * 6);
+        npyv_u8  a8 = npyv_load_u8(src + npyv_nlanes_u8 * 7);
+        npyv_b8 c5 = npyv_cmpeq_u8(a5, b);
+        npyv_b8 c6 = npyv_cmpeq_u8(a6, b);
+        npyv_b8 c7 = npyv_cmpeq_u8(a7, b);
+        npyv_b8 c8 = npyv_cmpeq_u8(a8, b);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+#if 8 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+    // Scalar tail: handle the remaining (fewer than 'vstep') elements one by one.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_u8 a = *src;
+        *dst = a == scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 1) && 0)
+static void simd_binary_not_equal_u8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 truemask    = npyv_setall_u8(0x1);
+    const int vstep           = npyv_nlanes_u8;
+
+    // Vector loop: each pass compares 'vstep' elements of src1 with src2 (a != b).
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 8 >= 8
+        npyv_u8  a1 = npyv_load_u8(src1 + npyv_nlanes_u8 * 0);
+        npyv_u8  b1 = npyv_load_u8(src2 + npyv_nlanes_u8 * 0);
+        npyv_b8 c1 = npyv_cmpneq_u8(a1, b1);
+#if 8 >= 16
+        npyv_u8  a2 = npyv_load_u8(src1 + npyv_nlanes_u8 * 1);
+        npyv_u8  b2 = npyv_load_u8(src2 + npyv_nlanes_u8 * 1);
+        npyv_b8 c2 = npyv_cmpneq_u8(a2, b2);
+#if 8 >= 32
+        npyv_u8  a3 = npyv_load_u8(src1 + npyv_nlanes_u8 * 2);
+        npyv_u8  b3 = npyv_load_u8(src2 + npyv_nlanes_u8 * 2);
+        npyv_u8  a4 = npyv_load_u8(src1 + npyv_nlanes_u8 * 3);
+        npyv_u8  b4 = npyv_load_u8(src2 + npyv_nlanes_u8 * 3);
+        npyv_b8 c3 = npyv_cmpneq_u8(a3, b3);
+        npyv_b8 c4 = npyv_cmpneq_u8(a4, b4);
+#if 8 == 64
+        npyv_u8  a5 = npyv_load_u8(src1 + npyv_nlanes_u8 * 4);
+        npyv_u8  b5 = npyv_load_u8(src2 + npyv_nlanes_u8 * 4);
+        npyv_u8  a6 = npyv_load_u8(src1 + npyv_nlanes_u8 * 5);
+        npyv_u8  b6 = npyv_load_u8(src2 + npyv_nlanes_u8 * 5);
+        npyv_u8  a7 = npyv_load_u8(src1 + npyv_nlanes_u8 * 6);
+        npyv_u8  b7 = npyv_load_u8(src2 + npyv_nlanes_u8 * 6);
+        npyv_u8  a8 = npyv_load_u8(src1 + npyv_nlanes_u8 * 7);
+        npyv_u8  b8 = npyv_load_u8(src2 + npyv_nlanes_u8 * 7);
+        npyv_b8 c5 = npyv_cmpneq_u8(a5, b5);
+        npyv_b8 c6 = npyv_cmpneq_u8(a6, b6);
+        npyv_b8 c7 = npyv_cmpneq_u8(a7, b7);
+        npyv_b8 c8 = npyv_cmpneq_u8(a8, b8);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+        // Convert the comparison mask(s) 'c' into a single u8 vector 'r'
+#if 8 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+    // Scalar tail: handle the remaining (fewer than 'vstep') elements one by one.
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {
+        const npyv_lanetype_u8 a = *src1;
+        const npyv_lanetype_u8 b = *src2;
+        *dst = a != b;
+    }
+}
+
+static void simd_binary_scalar1_not_equal_u8(char **args, npy_intp len)
+{   // Writes (scalar != src[i]) as a 0/1 byte to dst[i] for each of the `len` elements.
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0];  // left operand: one value read from args[0]
+    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[1];  // right operand array
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // output array
+    const npyv_u8 a         = npyv_setall_u8(scalar);  // vector form of the scalar operand
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed with each result vector before storing
+    const int vstep            = npyv_nlanes_u8;  // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {  // vector loop
+#if 8 >= 8  // true; the nested 8 >= 16 / 8 >= 32 / 8 == 64 levels below are compiled out
+        npyv_u8  b1 = npyv_load_u8(src + npyv_nlanes_u8 * 0);
+        npyv_b8 c1 = npyv_cmpneq_u8(a, b1);
+#if 8 >= 16
+        npyv_u8  b2 = npyv_load_u8(src + npyv_nlanes_u8 * 1);
+        npyv_b8 c2 = npyv_cmpneq_u8(a, b2);
+#if 8 >= 32
+        npyv_u8  b3 = npyv_load_u8(src + npyv_nlanes_u8 * 2);
+        npyv_u8  b4 = npyv_load_u8(src + npyv_nlanes_u8 * 3);
+        npyv_b8 c3 = npyv_cmpneq_u8(a, b3);
+        npyv_b8 c4 = npyv_cmpneq_u8(a, b4);
+#if 8 == 64
+        npyv_u8  b5 = npyv_load_u8(src + npyv_nlanes_u8 * 4);
+        npyv_u8  b6 = npyv_load_u8(src + npyv_nlanes_u8 * 5);
+        npyv_u8  b7 = npyv_load_u8(src + npyv_nlanes_u8 * 6);
+        npyv_u8  b8 = npyv_load_u8(src + npyv_nlanes_u8 * 7);
+        npyv_b8 c5 = npyv_cmpneq_u8(a, b5);
+        npyv_b8 c6 = npyv_cmpneq_u8(a, b6);
+        npyv_b8 c7 = npyv_cmpneq_u8(a, b7);
+        npyv_b8 c8 = npyv_cmpneq_u8(a, b8);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+#if 8 == 8  // active branch: a single mask, converted without packing
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // result masked with truemask (0x1)
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u8 b = *src;
+        *dst = scalar != b;
+    }
+}
+
+static void simd_binary_scalar2_not_equal_u8(char **args, npy_intp len)
+{   // Writes (src[i] != scalar) as a 0/1 byte to dst[i] for each of the `len` elements.
+    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[0];  // left operand array
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];  // right operand: one value read from args[1]
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // output array
+    const npyv_u8 b         = npyv_setall_u8(scalar);  // vector form of the scalar operand
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed with each result vector before storing
+    const int vstep            = npyv_nlanes_u8;  // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {  // vector loop
+#if 8 >= 8  // true; the nested 8 >= 16 / 8 >= 32 / 8 == 64 levels below are compiled out
+        npyv_u8  a1 = npyv_load_u8(src + npyv_nlanes_u8 * 0);
+        npyv_b8 c1 = npyv_cmpneq_u8(a1, b);
+#if 8 >= 16
+        npyv_u8  a2 = npyv_load_u8(src + npyv_nlanes_u8 * 1);
+        npyv_b8 c2 = npyv_cmpneq_u8(a2, b);
+#if 8 >= 32
+        npyv_u8  a3 = npyv_load_u8(src + npyv_nlanes_u8 * 2);
+        npyv_u8  a4 = npyv_load_u8(src + npyv_nlanes_u8 * 3);
+        npyv_b8 c3 = npyv_cmpneq_u8(a3, b);
+        npyv_b8 c4 = npyv_cmpneq_u8(a4, b);
+#if 8 == 64
+        npyv_u8  a5 = npyv_load_u8(src + npyv_nlanes_u8 * 4);
+        npyv_u8  a6 = npyv_load_u8(src + npyv_nlanes_u8 * 5);
+        npyv_u8  a7 = npyv_load_u8(src + npyv_nlanes_u8 * 6);
+        npyv_u8  a8 = npyv_load_u8(src + npyv_nlanes_u8 * 7);
+        npyv_b8 c5 = npyv_cmpneq_u8(a5, b);
+        npyv_b8 c6 = npyv_cmpneq_u8(a6, b);
+        npyv_b8 c7 = npyv_cmpneq_u8(a7, b);
+        npyv_b8 c8 = npyv_cmpneq_u8(a8, b);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+#if 8 == 8  // active branch: a single mask, converted without packing
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // result masked with truemask (0x1)
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u8 a = *src;
+        *dst = a != scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 0) && 0)
+static void simd_binary_less_u8(char **args, npy_intp len)
+{   // Writes (src1[i] < src2[i]) as a 0/1 byte to dst[i] for each of the `len` elements.
+    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0];  // left operand array
+    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1];  // right operand array
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2];  // output array
+    const npyv_u8 truemask    = npyv_setall_u8(0x1);  // ANDed with each result vector before storing
+    const int vstep           = npyv_nlanes_u8;  // elements consumed per vector iteration
+
+    // Vector loop: each iteration produces one result vector of 'vstep' elements.
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 8 >= 8  // true; the nested 8 >= 16 / 8 >= 32 / 8 == 64 levels below are compiled out
+        npyv_u8  a1 = npyv_load_u8(src1 + npyv_nlanes_u8 * 0);
+        npyv_u8  b1 = npyv_load_u8(src2 + npyv_nlanes_u8 * 0);
+        npyv_b8 c1 = npyv_cmplt_u8(a1, b1);
+#if 8 >= 16
+        npyv_u8  a2 = npyv_load_u8(src1 + npyv_nlanes_u8 * 1);
+        npyv_u8  b2 = npyv_load_u8(src2 + npyv_nlanes_u8 * 1);
+        npyv_b8 c2 = npyv_cmplt_u8(a2, b2);
+#if 8 >= 32
+        npyv_u8  a3 = npyv_load_u8(src1 + npyv_nlanes_u8 * 2);
+        npyv_u8  b3 = npyv_load_u8(src2 + npyv_nlanes_u8 * 2);
+        npyv_u8  a4 = npyv_load_u8(src1 + npyv_nlanes_u8 * 3);
+        npyv_u8  b4 = npyv_load_u8(src2 + npyv_nlanes_u8 * 3);
+        npyv_b8 c3 = npyv_cmplt_u8(a3, b3);
+        npyv_b8 c4 = npyv_cmplt_u8(a4, b4);
+#if 8 == 64
+        npyv_u8  a5 = npyv_load_u8(src1 + npyv_nlanes_u8 * 4);
+        npyv_u8  b5 = npyv_load_u8(src2 + npyv_nlanes_u8 * 4);
+        npyv_u8  a6 = npyv_load_u8(src1 + npyv_nlanes_u8 * 5);
+        npyv_u8  b6 = npyv_load_u8(src2 + npyv_nlanes_u8 * 5);
+        npyv_u8  a7 = npyv_load_u8(src1 + npyv_nlanes_u8 * 6);
+        npyv_u8  b7 = npyv_load_u8(src2 + npyv_nlanes_u8 * 6);
+        npyv_u8  a8 = npyv_load_u8(src1 + npyv_nlanes_u8 * 7);
+        npyv_u8  b8 = npyv_load_u8(src2 + npyv_nlanes_u8 * 7);
+        npyv_b8 c5 = npyv_cmplt_u8(a5, b5);
+        npyv_b8 c6 = npyv_cmplt_u8(a6, b6);
+        npyv_b8 c7 = npyv_cmplt_u8(a7, b7);
+        npyv_b8 c8 = npyv_cmplt_u8(a8, b8);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+        // Turn the comparison mask(s) 'c' into a single u8 vector 'r'
+#if 8 == 8  // active branch: a single mask, converted without packing
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // result masked with truemask (0x1)
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u8 a = *src1;
+        const npyv_lanetype_u8 b = *src2;
+        *dst = a < b;
+    }
+}
+
+static void simd_binary_scalar1_less_u8(char **args, npy_intp len)
+{   // Writes (scalar < src[i]) as a 0/1 byte to dst[i] for each of the `len` elements.
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0];  // left operand: one value read from args[0]
+    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[1];  // right operand array
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // output array
+    const npyv_u8 a         = npyv_setall_u8(scalar);  // vector form of the scalar operand
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed with each result vector before storing
+    const int vstep            = npyv_nlanes_u8;  // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {  // vector loop
+#if 8 >= 8  // true; the nested 8 >= 16 / 8 >= 32 / 8 == 64 levels below are compiled out
+        npyv_u8  b1 = npyv_load_u8(src + npyv_nlanes_u8 * 0);
+        npyv_b8 c1 = npyv_cmplt_u8(a, b1);
+#if 8 >= 16
+        npyv_u8  b2 = npyv_load_u8(src + npyv_nlanes_u8 * 1);
+        npyv_b8 c2 = npyv_cmplt_u8(a, b2);
+#if 8 >= 32
+        npyv_u8  b3 = npyv_load_u8(src + npyv_nlanes_u8 * 2);
+        npyv_u8  b4 = npyv_load_u8(src + npyv_nlanes_u8 * 3);
+        npyv_b8 c3 = npyv_cmplt_u8(a, b3);
+        npyv_b8 c4 = npyv_cmplt_u8(a, b4);
+#if 8 == 64
+        npyv_u8  b5 = npyv_load_u8(src + npyv_nlanes_u8 * 4);
+        npyv_u8  b6 = npyv_load_u8(src + npyv_nlanes_u8 * 5);
+        npyv_u8  b7 = npyv_load_u8(src + npyv_nlanes_u8 * 6);
+        npyv_u8  b8 = npyv_load_u8(src + npyv_nlanes_u8 * 7);
+        npyv_b8 c5 = npyv_cmplt_u8(a, b5);
+        npyv_b8 c6 = npyv_cmplt_u8(a, b6);
+        npyv_b8 c7 = npyv_cmplt_u8(a, b7);
+        npyv_b8 c8 = npyv_cmplt_u8(a, b8);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+#if 8 == 8  // active branch: a single mask, converted without packing
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // result masked with truemask (0x1)
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u8 b = *src;
+        *dst = scalar < b;
+    }
+}
+
+static void simd_binary_scalar2_less_u8(char **args, npy_intp len)
+{   // Writes (src[i] < scalar) as a 0/1 byte to dst[i] for each of the `len` elements.
+    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[0];  // left operand array
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];  // right operand: one value read from args[1]
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // output array
+    const npyv_u8 b         = npyv_setall_u8(scalar);  // vector form of the scalar operand
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed with each result vector before storing
+    const int vstep            = npyv_nlanes_u8;  // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {  // vector loop
+#if 8 >= 8  // true; the nested 8 >= 16 / 8 >= 32 / 8 == 64 levels below are compiled out
+        npyv_u8  a1 = npyv_load_u8(src + npyv_nlanes_u8 * 0);
+        npyv_b8 c1 = npyv_cmplt_u8(a1, b);
+#if 8 >= 16
+        npyv_u8  a2 = npyv_load_u8(src + npyv_nlanes_u8 * 1);
+        npyv_b8 c2 = npyv_cmplt_u8(a2, b);
+#if 8 >= 32
+        npyv_u8  a3 = npyv_load_u8(src + npyv_nlanes_u8 * 2);
+        npyv_u8  a4 = npyv_load_u8(src + npyv_nlanes_u8 * 3);
+        npyv_b8 c3 = npyv_cmplt_u8(a3, b);
+        npyv_b8 c4 = npyv_cmplt_u8(a4, b);
+#if 8 == 64
+        npyv_u8  a5 = npyv_load_u8(src + npyv_nlanes_u8 * 4);
+        npyv_u8  a6 = npyv_load_u8(src + npyv_nlanes_u8 * 5);
+        npyv_u8  a7 = npyv_load_u8(src + npyv_nlanes_u8 * 6);
+        npyv_u8  a8 = npyv_load_u8(src + npyv_nlanes_u8 * 7);
+        npyv_b8 c5 = npyv_cmplt_u8(a5, b);
+        npyv_b8 c6 = npyv_cmplt_u8(a6, b);
+        npyv_b8 c7 = npyv_cmplt_u8(a7, b);
+        npyv_b8 c8 = npyv_cmplt_u8(a8, b);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+#if 8 == 8  // active branch: a single mask, converted without packing
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // result masked with truemask (0x1)
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u8 a = *src;
+        *dst = a < scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 0) && 0)
+static void simd_binary_less_equal_u8(char **args, npy_intp len)
+{   // Writes (src1[i] <= src2[i]) as a 0/1 byte to dst[i] for each of the `len` elements.
+    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0];  // left operand array
+    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1];  // right operand array
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2];  // output array
+    const npyv_u8 truemask    = npyv_setall_u8(0x1);  // ANDed with each result vector before storing
+    const int vstep           = npyv_nlanes_u8;  // elements consumed per vector iteration
+
+    // Vector loop: each iteration produces one result vector of 'vstep' elements.
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 8 >= 8  // true; the nested 8 >= 16 / 8 >= 32 / 8 == 64 levels below are compiled out
+        npyv_u8  a1 = npyv_load_u8(src1 + npyv_nlanes_u8 * 0);
+        npyv_u8  b1 = npyv_load_u8(src2 + npyv_nlanes_u8 * 0);
+        npyv_b8 c1 = npyv_cmple_u8(a1, b1);
+#if 8 >= 16
+        npyv_u8  a2 = npyv_load_u8(src1 + npyv_nlanes_u8 * 1);
+        npyv_u8  b2 = npyv_load_u8(src2 + npyv_nlanes_u8 * 1);
+        npyv_b8 c2 = npyv_cmple_u8(a2, b2);
+#if 8 >= 32
+        npyv_u8  a3 = npyv_load_u8(src1 + npyv_nlanes_u8 * 2);
+        npyv_u8  b3 = npyv_load_u8(src2 + npyv_nlanes_u8 * 2);
+        npyv_u8  a4 = npyv_load_u8(src1 + npyv_nlanes_u8 * 3);
+        npyv_u8  b4 = npyv_load_u8(src2 + npyv_nlanes_u8 * 3);
+        npyv_b8 c3 = npyv_cmple_u8(a3, b3);
+        npyv_b8 c4 = npyv_cmple_u8(a4, b4);
+#if 8 == 64
+        npyv_u8  a5 = npyv_load_u8(src1 + npyv_nlanes_u8 * 4);
+        npyv_u8  b5 = npyv_load_u8(src2 + npyv_nlanes_u8 * 4);
+        npyv_u8  a6 = npyv_load_u8(src1 + npyv_nlanes_u8 * 5);
+        npyv_u8  b6 = npyv_load_u8(src2 + npyv_nlanes_u8 * 5);
+        npyv_u8  a7 = npyv_load_u8(src1 + npyv_nlanes_u8 * 6);
+        npyv_u8  b7 = npyv_load_u8(src2 + npyv_nlanes_u8 * 6);
+        npyv_u8  a8 = npyv_load_u8(src1 + npyv_nlanes_u8 * 7);
+        npyv_u8  b8 = npyv_load_u8(src2 + npyv_nlanes_u8 * 7);
+        npyv_b8 c5 = npyv_cmple_u8(a5, b5);
+        npyv_b8 c6 = npyv_cmple_u8(a6, b6);
+        npyv_b8 c7 = npyv_cmple_u8(a7, b7);
+        npyv_b8 c8 = npyv_cmple_u8(a8, b8);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+        // Turn the comparison mask(s) 'c' into a single u8 vector 'r'
+#if 8 == 8  // active branch: a single mask, converted without packing
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // result masked with truemask (0x1)
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u8 a = *src1;
+        const npyv_lanetype_u8 b = *src2;
+        *dst = a <= b;
+    }
+}
+
+static void simd_binary_scalar1_less_equal_u8(char **args, npy_intp len)
+{   // Writes (scalar <= src[i]) as a 0/1 byte to dst[i] for each of the `len` elements.
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0];  // left operand: one value read from args[0]
+    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[1];  // right operand array
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // output array
+    const npyv_u8 a         = npyv_setall_u8(scalar);  // vector form of the scalar operand
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed with each result vector before storing
+    const int vstep            = npyv_nlanes_u8;  // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {  // vector loop
+#if 8 >= 8  // true; the nested 8 >= 16 / 8 >= 32 / 8 == 64 levels below are compiled out
+        npyv_u8  b1 = npyv_load_u8(src + npyv_nlanes_u8 * 0);
+        npyv_b8 c1 = npyv_cmple_u8(a, b1);
+#if 8 >= 16
+        npyv_u8  b2 = npyv_load_u8(src + npyv_nlanes_u8 * 1);
+        npyv_b8 c2 = npyv_cmple_u8(a, b2);
+#if 8 >= 32
+        npyv_u8  b3 = npyv_load_u8(src + npyv_nlanes_u8 * 2);
+        npyv_u8  b4 = npyv_load_u8(src + npyv_nlanes_u8 * 3);
+        npyv_b8 c3 = npyv_cmple_u8(a, b3);
+        npyv_b8 c4 = npyv_cmple_u8(a, b4);
+#if 8 == 64
+        npyv_u8  b5 = npyv_load_u8(src + npyv_nlanes_u8 * 4);
+        npyv_u8  b6 = npyv_load_u8(src + npyv_nlanes_u8 * 5);
+        npyv_u8  b7 = npyv_load_u8(src + npyv_nlanes_u8 * 6);
+        npyv_u8  b8 = npyv_load_u8(src + npyv_nlanes_u8 * 7);
+        npyv_b8 c5 = npyv_cmple_u8(a, b5);
+        npyv_b8 c6 = npyv_cmple_u8(a, b6);
+        npyv_b8 c7 = npyv_cmple_u8(a, b7);
+        npyv_b8 c8 = npyv_cmple_u8(a, b8);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+#if 8 == 8  // active branch: a single mask, converted without packing
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // result masked with truemask (0x1)
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u8 b = *src;
+        *dst = scalar <= b;
+    }
+}
+
+static void simd_binary_scalar2_less_equal_u8(char **args, npy_intp len)
+{   // Writes (src[i] <= scalar) as a 0/1 byte to dst[i] for each of the `len` elements.
+    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[0];  // left operand array
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];  // right operand: one value read from args[1]
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // output array
+    const npyv_u8 b         = npyv_setall_u8(scalar);  // vector form of the scalar operand
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed with each result vector before storing
+    const int vstep            = npyv_nlanes_u8;  // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {  // vector loop
+#if 8 >= 8  // true; the nested 8 >= 16 / 8 >= 32 / 8 == 64 levels below are compiled out
+        npyv_u8  a1 = npyv_load_u8(src + npyv_nlanes_u8 * 0);
+        npyv_b8 c1 = npyv_cmple_u8(a1, b);
+#if 8 >= 16
+        npyv_u8  a2 = npyv_load_u8(src + npyv_nlanes_u8 * 1);
+        npyv_b8 c2 = npyv_cmple_u8(a2, b);
+#if 8 >= 32
+        npyv_u8  a3 = npyv_load_u8(src + npyv_nlanes_u8 * 2);
+        npyv_u8  a4 = npyv_load_u8(src + npyv_nlanes_u8 * 3);
+        npyv_b8 c3 = npyv_cmple_u8(a3, b);
+        npyv_b8 c4 = npyv_cmple_u8(a4, b);
+#if 8 == 64
+        npyv_u8  a5 = npyv_load_u8(src + npyv_nlanes_u8 * 4);
+        npyv_u8  a6 = npyv_load_u8(src + npyv_nlanes_u8 * 5);
+        npyv_u8  a7 = npyv_load_u8(src + npyv_nlanes_u8 * 6);
+        npyv_u8  a8 = npyv_load_u8(src + npyv_nlanes_u8 * 7);
+        npyv_b8 c5 = npyv_cmple_u8(a5, b);
+        npyv_b8 c6 = npyv_cmple_u8(a6, b);
+        npyv_b8 c7 = npyv_cmple_u8(a7, b);
+        npyv_b8 c8 = npyv_cmple_u8(a8, b);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+#if 8 == 8  // active branch: a single mask, converted without packing
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // result masked with truemask (0x1)
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u8 a = *src;
+        *dst = a <= scalar;
+    }
+}
+#endif
+
+
+
+#line 28
+#line 35
+#if NPY_SIMD && !((1 || 0) && 1)
+static void simd_binary_equal_s8(char **args, npy_intp len)  // NOTE: enclosing #if evaluates to 0, so this is compiled out
+{   // Writes (src1[i] == src2[i]) as a 0/1 byte to dst[i] for each of the `len` elements.
+    npyv_lanetype_s8 *src1 = (npyv_lanetype_s8 *) args[0];  // left operand array (signed lanes)
+    npyv_lanetype_s8 *src2 = (npyv_lanetype_s8 *) args[1];  // right operand array (signed lanes)
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2];  // output array (unsigned bytes)
+    const npyv_u8 truemask    = npyv_setall_u8(0x1);  // ANDed with each result vector before storing
+    const int vstep           = npyv_nlanes_u8;  // elements consumed per vector iteration
+
+    // Vector loop: each iteration produces one result vector of 'vstep' elements.
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 8 >= 8  // true; the nested 8 >= 16 / 8 >= 32 / 8 == 64 levels below are compiled out
+        npyv_s8  a1 = npyv_load_s8(src1 + npyv_nlanes_s8 * 0);
+        npyv_s8  b1 = npyv_load_s8(src2 + npyv_nlanes_s8 * 0);
+        npyv_b8 c1 = npyv_cmpeq_s8(a1, b1);
+#if 8 >= 16
+        npyv_s8  a2 = npyv_load_s8(src1 + npyv_nlanes_s8 * 1);
+        npyv_s8  b2 = npyv_load_s8(src2 + npyv_nlanes_s8 * 1);
+        npyv_b8 c2 = npyv_cmpeq_s8(a2, b2);
+#if 8 >= 32
+        npyv_s8  a3 = npyv_load_s8(src1 + npyv_nlanes_s8 * 2);
+        npyv_s8  b3 = npyv_load_s8(src2 + npyv_nlanes_s8 * 2);
+        npyv_s8  a4 = npyv_load_s8(src1 + npyv_nlanes_s8 * 3);
+        npyv_s8  b4 = npyv_load_s8(src2 + npyv_nlanes_s8 * 3);
+        npyv_b8 c3 = npyv_cmpeq_s8(a3, b3);
+        npyv_b8 c4 = npyv_cmpeq_s8(a4, b4);
+#if 8 == 64
+        npyv_s8  a5 = npyv_load_s8(src1 + npyv_nlanes_s8 * 4);
+        npyv_s8  b5 = npyv_load_s8(src2 + npyv_nlanes_s8 * 4);
+        npyv_s8  a6 = npyv_load_s8(src1 + npyv_nlanes_s8 * 5);
+        npyv_s8  b6 = npyv_load_s8(src2 + npyv_nlanes_s8 * 5);
+        npyv_s8  a7 = npyv_load_s8(src1 + npyv_nlanes_s8 * 6);
+        npyv_s8  b7 = npyv_load_s8(src2 + npyv_nlanes_s8 * 6);
+        npyv_s8  a8 = npyv_load_s8(src1 + npyv_nlanes_s8 * 7);
+        npyv_s8  b8 = npyv_load_s8(src2 + npyv_nlanes_s8 * 7);
+        npyv_b8 c5 = npyv_cmpeq_s8(a5, b5);
+        npyv_b8 c6 = npyv_cmpeq_s8(a6, b6);
+        npyv_b8 c7 = npyv_cmpeq_s8(a7, b7);
+        npyv_b8 c8 = npyv_cmpeq_s8(a8, b8);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+        // Turn the comparison mask(s) 'c' into a single u8 vector 'r'
+#if 8 == 8  // active branch: a single mask, converted without packing
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // result masked with truemask (0x1)
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s8 a = *src1;
+        const npyv_lanetype_s8 b = *src2;
+        *dst = a == b;
+    }
+}
+
+static void simd_binary_scalar1_equal_s8(char **args, npy_intp len)  // NOTE: enclosing #if evaluates to 0, so this is compiled out
+{   // Writes (scalar == src[i]) as a 0/1 byte to dst[i] for each of the `len` elements.
+    npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[0];  // left operand: one value read from args[0]
+    npyv_lanetype_s8 *src   = (npyv_lanetype_s8 *) args[1];  // right operand array (signed lanes)
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // output array (unsigned bytes)
+    const npyv_s8 a         = npyv_setall_s8(scalar);  // vector form of the scalar operand
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed with each result vector before storing
+    const int vstep            = npyv_nlanes_u8;  // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {  // vector loop
+#if 8 >= 8  // true; the nested 8 >= 16 / 8 >= 32 / 8 == 64 levels below are compiled out
+        npyv_s8  b1 = npyv_load_s8(src + npyv_nlanes_s8 * 0);
+        npyv_b8 c1 = npyv_cmpeq_s8(a, b1);
+#if 8 >= 16
+        npyv_s8  b2 = npyv_load_s8(src + npyv_nlanes_s8 * 1);
+        npyv_b8 c2 = npyv_cmpeq_s8(a, b2);
+#if 8 >= 32
+        npyv_s8  b3 = npyv_load_s8(src + npyv_nlanes_s8 * 2);
+        npyv_s8  b4 = npyv_load_s8(src + npyv_nlanes_s8 * 3);
+        npyv_b8 c3 = npyv_cmpeq_s8(a, b3);
+        npyv_b8 c4 = npyv_cmpeq_s8(a, b4);
+#if 8 == 64
+        npyv_s8  b5 = npyv_load_s8(src + npyv_nlanes_s8 * 4);
+        npyv_s8  b6 = npyv_load_s8(src + npyv_nlanes_s8 * 5);
+        npyv_s8  b7 = npyv_load_s8(src + npyv_nlanes_s8 * 6);
+        npyv_s8  b8 = npyv_load_s8(src + npyv_nlanes_s8 * 7);
+        npyv_b8 c5 = npyv_cmpeq_s8(a, b5);
+        npyv_b8 c6 = npyv_cmpeq_s8(a, b6);
+        npyv_b8 c7 = npyv_cmpeq_s8(a, b7);
+        npyv_b8 c8 = npyv_cmpeq_s8(a, b8);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+#if 8 == 8  // active branch: a single mask, converted without packing
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // result masked with truemask (0x1)
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s8 b = *src;
+        *dst = scalar == b;
+    }
+}
+
+static void simd_binary_scalar2_equal_s8(char **args, npy_intp len)  // NOTE: enclosing #if evaluates to 0, so this is compiled out
+{   // Writes (src[i] == scalar) as a 0/1 byte to dst[i] for each of the `len` elements.
+    npyv_lanetype_s8 *src   = (npyv_lanetype_s8 *) args[0];  // left operand array (signed lanes)
+    npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[1];  // right operand: one value read from args[1]
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // output array (unsigned bytes)
+    const npyv_s8 b         = npyv_setall_s8(scalar);  // vector form of the scalar operand
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed with each result vector before storing
+    const int vstep            = npyv_nlanes_u8;  // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {  // vector loop
+#if 8 >= 8  // true; the nested 8 >= 16 / 8 >= 32 / 8 == 64 levels below are compiled out
+        npyv_s8  a1 = npyv_load_s8(src + npyv_nlanes_s8 * 0);
+        npyv_b8 c1 = npyv_cmpeq_s8(a1, b);
+#if 8 >= 16
+        npyv_s8  a2 = npyv_load_s8(src + npyv_nlanes_s8 * 1);
+        npyv_b8 c2 = npyv_cmpeq_s8(a2, b);
+#if 8 >= 32
+        npyv_s8  a3 = npyv_load_s8(src + npyv_nlanes_s8 * 2);
+        npyv_s8  a4 = npyv_load_s8(src + npyv_nlanes_s8 * 3);
+        npyv_b8 c3 = npyv_cmpeq_s8(a3, b);
+        npyv_b8 c4 = npyv_cmpeq_s8(a4, b);
+#if 8 == 64
+        npyv_s8  a5 = npyv_load_s8(src + npyv_nlanes_s8 * 4);
+        npyv_s8  a6 = npyv_load_s8(src + npyv_nlanes_s8 * 5);
+        npyv_s8  a7 = npyv_load_s8(src + npyv_nlanes_s8 * 6);
+        npyv_s8  a8 = npyv_load_s8(src + npyv_nlanes_s8 * 7);
+        npyv_b8 c5 = npyv_cmpeq_s8(a5, b);
+        npyv_b8 c6 = npyv_cmpeq_s8(a6, b);
+        npyv_b8 c7 = npyv_cmpeq_s8(a7, b);
+        npyv_b8 c8 = npyv_cmpeq_s8(a8, b);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+#if 8 == 8  // active branch: a single mask, converted without packing
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // result masked with truemask (0x1)
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s8 a = *src;
+        *dst = a == scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 1) && 1)
+static void simd_binary_not_equal_s8(char **args, npy_intp len)  // NOTE: enclosing #if evaluates to 0, so this is compiled out
+{   // Writes (src1[i] != src2[i]) as a 0/1 byte to dst[i] for each of the `len` elements.
+    npyv_lanetype_s8 *src1 = (npyv_lanetype_s8 *) args[0];  // left operand array (signed lanes)
+    npyv_lanetype_s8 *src2 = (npyv_lanetype_s8 *) args[1];  // right operand array (signed lanes)
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2];  // output array (unsigned bytes)
+    const npyv_u8 truemask    = npyv_setall_u8(0x1);  // ANDed with each result vector before storing
+    const int vstep           = npyv_nlanes_u8;  // elements consumed per vector iteration
+
+    // Vector loop: each iteration produces one result vector of 'vstep' elements.
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 8 >= 8  // true; the nested 8 >= 16 / 8 >= 32 / 8 == 64 levels below are compiled out
+        npyv_s8  a1 = npyv_load_s8(src1 + npyv_nlanes_s8 * 0);
+        npyv_s8  b1 = npyv_load_s8(src2 + npyv_nlanes_s8 * 0);
+        npyv_b8 c1 = npyv_cmpneq_s8(a1, b1);
+#if 8 >= 16
+        npyv_s8  a2 = npyv_load_s8(src1 + npyv_nlanes_s8 * 1);
+        npyv_s8  b2 = npyv_load_s8(src2 + npyv_nlanes_s8 * 1);
+        npyv_b8 c2 = npyv_cmpneq_s8(a2, b2);
+#if 8 >= 32
+        npyv_s8  a3 = npyv_load_s8(src1 + npyv_nlanes_s8 * 2);
+        npyv_s8  b3 = npyv_load_s8(src2 + npyv_nlanes_s8 * 2);
+        npyv_s8  a4 = npyv_load_s8(src1 + npyv_nlanes_s8 * 3);
+        npyv_s8  b4 = npyv_load_s8(src2 + npyv_nlanes_s8 * 3);
+        npyv_b8 c3 = npyv_cmpneq_s8(a3, b3);
+        npyv_b8 c4 = npyv_cmpneq_s8(a4, b4);
+#if 8 == 64
+        npyv_s8  a5 = npyv_load_s8(src1 + npyv_nlanes_s8 * 4);
+        npyv_s8  b5 = npyv_load_s8(src2 + npyv_nlanes_s8 * 4);
+        npyv_s8  a6 = npyv_load_s8(src1 + npyv_nlanes_s8 * 5);
+        npyv_s8  b6 = npyv_load_s8(src2 + npyv_nlanes_s8 * 5);
+        npyv_s8  a7 = npyv_load_s8(src1 + npyv_nlanes_s8 * 6);
+        npyv_s8  b7 = npyv_load_s8(src2 + npyv_nlanes_s8 * 6);
+        npyv_s8  a8 = npyv_load_s8(src1 + npyv_nlanes_s8 * 7);
+        npyv_s8  b8 = npyv_load_s8(src2 + npyv_nlanes_s8 * 7);
+        npyv_b8 c5 = npyv_cmpneq_s8(a5, b5);
+        npyv_b8 c6 = npyv_cmpneq_s8(a6, b6);
+        npyv_b8 c7 = npyv_cmpneq_s8(a7, b7);
+        npyv_b8 c8 = npyv_cmpneq_s8(a8, b8);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+        // Turn the comparison mask(s) 'c' into a single u8 vector 'r'
+#if 8 == 8  // active branch: a single mask, converted without packing
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // result masked with truemask (0x1)
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s8 a = *src1;
+        const npyv_lanetype_s8 b = *src2;
+        *dst = a != b;
+    }
+}
+
+static void simd_binary_scalar1_not_equal_s8(char **args, npy_intp len)  // NOTE: enclosing #if evaluates to 0, so this is compiled out
+{   // Writes (scalar != src[i]) as a 0/1 byte to dst[i] for each of the `len` elements.
+    npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[0];  // left operand: one value read from args[0]
+    npyv_lanetype_s8 *src   = (npyv_lanetype_s8 *) args[1];  // right operand array (signed lanes)
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // output array (unsigned bytes)
+    const npyv_s8 a         = npyv_setall_s8(scalar);  // vector form of the scalar operand
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed with each result vector before storing
+    const int vstep            = npyv_nlanes_u8;  // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {  // vector loop
+#if 8 >= 8  // true; the nested 8 >= 16 / 8 >= 32 / 8 == 64 levels below are compiled out
+        npyv_s8  b1 = npyv_load_s8(src + npyv_nlanes_s8 * 0);
+        npyv_b8 c1 = npyv_cmpneq_s8(a, b1);
+#if 8 >= 16
+        npyv_s8  b2 = npyv_load_s8(src + npyv_nlanes_s8 * 1);
+        npyv_b8 c2 = npyv_cmpneq_s8(a, b2);
+#if 8 >= 32
+        npyv_s8  b3 = npyv_load_s8(src + npyv_nlanes_s8 * 2);
+        npyv_s8  b4 = npyv_load_s8(src + npyv_nlanes_s8 * 3);
+        npyv_b8 c3 = npyv_cmpneq_s8(a, b3);
+        npyv_b8 c4 = npyv_cmpneq_s8(a, b4);
+#if 8 == 64
+        npyv_s8  b5 = npyv_load_s8(src + npyv_nlanes_s8 * 4);
+        npyv_s8  b6 = npyv_load_s8(src + npyv_nlanes_s8 * 5);
+        npyv_s8  b7 = npyv_load_s8(src + npyv_nlanes_s8 * 6);
+        npyv_s8  b8 = npyv_load_s8(src + npyv_nlanes_s8 * 7);
+        npyv_b8 c5 = npyv_cmpneq_s8(a, b5);
+        npyv_b8 c6 = npyv_cmpneq_s8(a, b6);
+        npyv_b8 c7 = npyv_cmpneq_s8(a, b7);
+        npyv_b8 c8 = npyv_cmpneq_s8(a, b8);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+#if 8 == 8  // active branch: a single mask, converted without packing
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // result masked with truemask (0x1)
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s8 b = *src;
+        *dst = scalar != b;
+    }
+}
+
+static void simd_binary_scalar2_not_equal_s8(char **args, npy_intp len)  // NOTE: enclosing #if evaluates to 0, so this is compiled out
+{   // Writes (src[i] != scalar) as a 0/1 byte to dst[i] for each of the `len` elements.
+    npyv_lanetype_s8 *src   = (npyv_lanetype_s8 *) args[0];  // left operand array (signed lanes)
+    npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[1];  // right operand: one value read from args[1]
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // output array (unsigned bytes)
+    const npyv_s8 b         = npyv_setall_s8(scalar);  // vector form of the scalar operand
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed with each result vector before storing
+    const int vstep            = npyv_nlanes_u8;  // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {  // vector loop
+#if 8 >= 8  // true; the nested 8 >= 16 / 8 >= 32 / 8 == 64 levels below are compiled out
+        npyv_s8  a1 = npyv_load_s8(src + npyv_nlanes_s8 * 0);
+        npyv_b8 c1 = npyv_cmpneq_s8(a1, b);
+#if 8 >= 16
+        npyv_s8  a2 = npyv_load_s8(src + npyv_nlanes_s8 * 1);
+        npyv_b8 c2 = npyv_cmpneq_s8(a2, b);
+#if 8 >= 32
+        npyv_s8  a3 = npyv_load_s8(src + npyv_nlanes_s8 * 2);
+        npyv_s8  a4 = npyv_load_s8(src + npyv_nlanes_s8 * 3);
+        npyv_b8 c3 = npyv_cmpneq_s8(a3, b);
+        npyv_b8 c4 = npyv_cmpneq_s8(a4, b);
+#if 8 == 64
+        npyv_s8  a5 = npyv_load_s8(src + npyv_nlanes_s8 * 4);
+        npyv_s8  a6 = npyv_load_s8(src + npyv_nlanes_s8 * 5);
+        npyv_s8  a7 = npyv_load_s8(src + npyv_nlanes_s8 * 6);
+        npyv_s8  a8 = npyv_load_s8(src + npyv_nlanes_s8 * 7);
+        npyv_b8 c5 = npyv_cmpneq_s8(a5, b);
+        npyv_b8 c6 = npyv_cmpneq_s8(a6, b);
+        npyv_b8 c7 = npyv_cmpneq_s8(a7, b);
+        npyv_b8 c8 = npyv_cmpneq_s8(a8, b);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+#if 8 == 8  // active branch: a single mask, converted without packing
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // result masked with truemask (0x1)
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s8 a = *src;
+        *dst = a != scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 0) && 1)
+static void simd_binary_less_s8(char **args, npy_intp len)
+{   // Writes dst[i] = (src1[i] < src2[i]) for i in [0, len), one byte per result.
+    npyv_lanetype_s8 *src1 = (npyv_lanetype_s8 *) args[0];  // left-hand operand array
+    npyv_lanetype_s8 *src2 = (npyv_lanetype_s8 *) args[1];  // right-hand operand array
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2];  // byte-wide result array
+    const npyv_u8 truemask    = npyv_setall_u8(0x1);  // 0x1 broadcast to every u8 lane
+    const int vstep           = npyv_nlanes_u8;  // elements handled per vector pass
+
+    // Vector loop: unrolled so that each pass yields one result vector of 'vstep' elements.
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 8 >= 8  // always true: load one vector from each input and compare them
+        npyv_s8  a1 = npyv_load_s8(src1 + npyv_nlanes_s8 * 0);
+        npyv_s8  b1 = npyv_load_s8(src2 + npyv_nlanes_s8 * 0);
+        npyv_b8 c1 = npyv_cmplt_s8(a1, b1);
+#if 8 >= 16  // false for 8-bit elements: skipped through the matching #endif
+        npyv_s8  a2 = npyv_load_s8(src1 + npyv_nlanes_s8 * 1);
+        npyv_s8  b2 = npyv_load_s8(src2 + npyv_nlanes_s8 * 1);
+        npyv_b8 c2 = npyv_cmplt_s8(a2, b2);
+#if 8 >= 32
+        npyv_s8  a3 = npyv_load_s8(src1 + npyv_nlanes_s8 * 2);
+        npyv_s8  b3 = npyv_load_s8(src2 + npyv_nlanes_s8 * 2);
+        npyv_s8  a4 = npyv_load_s8(src1 + npyv_nlanes_s8 * 3);
+        npyv_s8  b4 = npyv_load_s8(src2 + npyv_nlanes_s8 * 3);
+        npyv_b8 c3 = npyv_cmplt_s8(a3, b3);
+        npyv_b8 c4 = npyv_cmplt_s8(a4, b4);
+#if 8 == 64
+        npyv_s8  a5 = npyv_load_s8(src1 + npyv_nlanes_s8 * 4);
+        npyv_s8  b5 = npyv_load_s8(src2 + npyv_nlanes_s8 * 4);
+        npyv_s8  a6 = npyv_load_s8(src1 + npyv_nlanes_s8 * 5);
+        npyv_s8  b6 = npyv_load_s8(src2 + npyv_nlanes_s8 * 5);
+        npyv_s8  a7 = npyv_load_s8(src1 + npyv_nlanes_s8 * 6);
+        npyv_s8  b7 = npyv_load_s8(src2 + npyv_nlanes_s8 * 6);
+        npyv_s8  a8 = npyv_load_s8(src1 + npyv_nlanes_s8 * 7);
+        npyv_s8  b8 = npyv_load_s8(src2 + npyv_nlanes_s8 * 7);
+        npyv_b8 c5 = npyv_cmplt_s8(a5, b5);
+        npyv_b8 c6 = npyv_cmplt_s8(a6, b6);
+        npyv_b8 c7 = npyv_cmplt_s8(a7, b7);
+        npyv_b8 c8 = npyv_cmplt_s8(a8, b8);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+        // Turn the 'c' vector(s) into a single u8 vector 'r'; only '8 == 8' is compiled here.
+#if 8 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // AND with truemask, then store
+    }
+    // Scalar tail: the fewer-than-'vstep' elements left over, one at a time.
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {
+        const npyv_lanetype_s8 a = *src1;
+        const npyv_lanetype_s8 b = *src2;
+        *dst = a < b;
+    }
+}
+
+static void simd_binary_scalar1_less_s8(char **args, npy_intp len)
+{   // Writes dst[i] = (scalar < src[i]) for i in [0, len), one byte per result.
+    npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[0];  // left-hand side, read once
+    npyv_lanetype_s8 *src   = (npyv_lanetype_s8 *) args[1];  // array operand (right-hand side)
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // byte-wide result array
+    const npyv_s8 a         = npyv_setall_s8(scalar);  // scalar broadcast to every lane
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // 0x1 broadcast to every u8 lane
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector pass
+    // Vector loop: runs while at least 'vstep' elements remain.
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 8 >= 8  // always true: load one vector and compare 'a' against it
+        npyv_s8  b1 = npyv_load_s8(src + npyv_nlanes_s8 * 0);
+        npyv_b8 c1 = npyv_cmplt_s8(a, b1);
+#if 8 >= 16  // false for 8-bit elements: skipped through the matching #endif
+        npyv_s8  b2 = npyv_load_s8(src + npyv_nlanes_s8 * 1);
+        npyv_b8 c2 = npyv_cmplt_s8(a, b2);
+#if 8 >= 32
+        npyv_s8  b3 = npyv_load_s8(src + npyv_nlanes_s8 * 2);
+        npyv_s8  b4 = npyv_load_s8(src + npyv_nlanes_s8 * 3);
+        npyv_b8 c3 = npyv_cmplt_s8(a, b3);
+        npyv_b8 c4 = npyv_cmplt_s8(a, b4);
+#if 8 == 64
+        npyv_s8  b5 = npyv_load_s8(src + npyv_nlanes_s8 * 4);
+        npyv_s8  b6 = npyv_load_s8(src + npyv_nlanes_s8 * 5);
+        npyv_s8  b7 = npyv_load_s8(src + npyv_nlanes_s8 * 6);
+        npyv_s8  b8 = npyv_load_s8(src + npyv_nlanes_s8 * 7);
+        npyv_b8 c5 = npyv_cmplt_s8(a, b5);
+        npyv_b8 c6 = npyv_cmplt_s8(a, b6);
+        npyv_b8 c7 = npyv_cmplt_s8(a, b7);
+        npyv_b8 c8 = npyv_cmplt_s8(a, b8);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+        // Only one branch below is compiled; for 8-bit elements it is '8 == 8'.
+#if 8 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // AND with truemask, then store
+    }
+    // Scalar tail: the fewer-than-'vstep' elements left over, one at a time.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_s8 b = *src;
+        *dst = scalar < b;
+    }
+}
+
+static void simd_binary_scalar2_less_s8(char **args, npy_intp len)
+{   // Writes dst[i] = (src[i] < scalar) for i in [0, len), one byte per result.
+    npyv_lanetype_s8 *src   = (npyv_lanetype_s8 *) args[0];  // array operand (left-hand side)
+    npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[1];  // right-hand side, read once
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // byte-wide result array
+    const npyv_s8 b         = npyv_setall_s8(scalar);  // scalar broadcast to every lane
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // 0x1 broadcast to every u8 lane
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector pass
+    // Vector loop: runs while at least 'vstep' elements remain.
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 8 >= 8  // always true: load one vector and compare it against 'b'
+        npyv_s8  a1 = npyv_load_s8(src + npyv_nlanes_s8 * 0);
+        npyv_b8 c1 = npyv_cmplt_s8(a1, b);
+#if 8 >= 16  // false for 8-bit elements: skipped through the matching #endif
+        npyv_s8  a2 = npyv_load_s8(src + npyv_nlanes_s8 * 1);
+        npyv_b8 c2 = npyv_cmplt_s8(a2, b);
+#if 8 >= 32
+        npyv_s8  a3 = npyv_load_s8(src + npyv_nlanes_s8 * 2);
+        npyv_s8  a4 = npyv_load_s8(src + npyv_nlanes_s8 * 3);
+        npyv_b8 c3 = npyv_cmplt_s8(a3, b);
+        npyv_b8 c4 = npyv_cmplt_s8(a4, b);
+#if 8 == 64
+        npyv_s8  a5 = npyv_load_s8(src + npyv_nlanes_s8 * 4);
+        npyv_s8  a6 = npyv_load_s8(src + npyv_nlanes_s8 * 5);
+        npyv_s8  a7 = npyv_load_s8(src + npyv_nlanes_s8 * 6);
+        npyv_s8  a8 = npyv_load_s8(src + npyv_nlanes_s8 * 7);
+        npyv_b8 c5 = npyv_cmplt_s8(a5, b);
+        npyv_b8 c6 = npyv_cmplt_s8(a6, b);
+        npyv_b8 c7 = npyv_cmplt_s8(a7, b);
+        npyv_b8 c8 = npyv_cmplt_s8(a8, b);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+        // Only one branch below is compiled; for 8-bit elements it is '8 == 8'.
+#if 8 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // AND with truemask, then store
+    }
+    // Scalar tail: the fewer-than-'vstep' elements left over, one at a time.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_s8 a = *src;
+        *dst = a < scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 0) && 1)
+static void simd_binary_less_equal_s8(char **args, npy_intp len)
+{   // Writes dst[i] = (src1[i] <= src2[i]) for i in [0, len), one byte per result.
+    npyv_lanetype_s8 *src1 = (npyv_lanetype_s8 *) args[0];  // left-hand operand array
+    npyv_lanetype_s8 *src2 = (npyv_lanetype_s8 *) args[1];  // right-hand operand array
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2];  // byte-wide result array
+    const npyv_u8 truemask    = npyv_setall_u8(0x1);  // 0x1 broadcast to every u8 lane
+    const int vstep           = npyv_nlanes_u8;  // elements handled per vector pass
+
+    // Vector loop: unrolled so that each pass yields one result vector of 'vstep' elements.
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 8 >= 8  // always true: load one vector from each input and compare them
+        npyv_s8  a1 = npyv_load_s8(src1 + npyv_nlanes_s8 * 0);
+        npyv_s8  b1 = npyv_load_s8(src2 + npyv_nlanes_s8 * 0);
+        npyv_b8 c1 = npyv_cmple_s8(a1, b1);
+#if 8 >= 16  // false for 8-bit elements: skipped through the matching #endif
+        npyv_s8  a2 = npyv_load_s8(src1 + npyv_nlanes_s8 * 1);
+        npyv_s8  b2 = npyv_load_s8(src2 + npyv_nlanes_s8 * 1);
+        npyv_b8 c2 = npyv_cmple_s8(a2, b2);
+#if 8 >= 32
+        npyv_s8  a3 = npyv_load_s8(src1 + npyv_nlanes_s8 * 2);
+        npyv_s8  b3 = npyv_load_s8(src2 + npyv_nlanes_s8 * 2);
+        npyv_s8  a4 = npyv_load_s8(src1 + npyv_nlanes_s8 * 3);
+        npyv_s8  b4 = npyv_load_s8(src2 + npyv_nlanes_s8 * 3);
+        npyv_b8 c3 = npyv_cmple_s8(a3, b3);
+        npyv_b8 c4 = npyv_cmple_s8(a4, b4);
+#if 8 == 64
+        npyv_s8  a5 = npyv_load_s8(src1 + npyv_nlanes_s8 * 4);
+        npyv_s8  b5 = npyv_load_s8(src2 + npyv_nlanes_s8 * 4);
+        npyv_s8  a6 = npyv_load_s8(src1 + npyv_nlanes_s8 * 5);
+        npyv_s8  b6 = npyv_load_s8(src2 + npyv_nlanes_s8 * 5);
+        npyv_s8  a7 = npyv_load_s8(src1 + npyv_nlanes_s8 * 6);
+        npyv_s8  b7 = npyv_load_s8(src2 + npyv_nlanes_s8 * 6);
+        npyv_s8  a8 = npyv_load_s8(src1 + npyv_nlanes_s8 * 7);
+        npyv_s8  b8 = npyv_load_s8(src2 + npyv_nlanes_s8 * 7);
+        npyv_b8 c5 = npyv_cmple_s8(a5, b5);
+        npyv_b8 c6 = npyv_cmple_s8(a6, b6);
+        npyv_b8 c7 = npyv_cmple_s8(a7, b7);
+        npyv_b8 c8 = npyv_cmple_s8(a8, b8);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+
+        // Turn the 'c' vector(s) into a single u8 vector 'r'; only '8 == 8' is compiled here.
+#if 8 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // AND with truemask, then store
+    }
+    // Scalar tail: the fewer-than-'vstep' elements left over, one at a time.
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {
+        const npyv_lanetype_s8 a = *src1;
+        const npyv_lanetype_s8 b = *src2;
+        *dst = a <= b;
+    }
+}
+
+static void simd_binary_scalar1_less_equal_s8(char **args, npy_intp len)
+{   // Writes dst[i] = (scalar <= src[i]) for i in [0, len), one byte per result.
+    npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[0];  // left-hand side, read once
+    npyv_lanetype_s8 *src   = (npyv_lanetype_s8 *) args[1];  // array operand (right-hand side)
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // byte-wide result array
+    const npyv_s8 a         = npyv_setall_s8(scalar);  // scalar broadcast to every lane
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // 0x1 broadcast to every u8 lane
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector pass
+    // Vector loop: runs while at least 'vstep' elements remain.
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 8 >= 8  // always true: load one vector and compare 'a' against it
+        npyv_s8  b1 = npyv_load_s8(src + npyv_nlanes_s8 * 0);
+        npyv_b8 c1 = npyv_cmple_s8(a, b1);
+#if 8 >= 16  // false for 8-bit elements: skipped through the matching #endif
+        npyv_s8  b2 = npyv_load_s8(src + npyv_nlanes_s8 * 1);
+        npyv_b8 c2 = npyv_cmple_s8(a, b2);
+#if 8 >= 32
+        npyv_s8  b3 = npyv_load_s8(src + npyv_nlanes_s8 * 2);
+        npyv_s8  b4 = npyv_load_s8(src + npyv_nlanes_s8 * 3);
+        npyv_b8 c3 = npyv_cmple_s8(a, b3);
+        npyv_b8 c4 = npyv_cmple_s8(a, b4);
+#if 8 == 64
+        npyv_s8  b5 = npyv_load_s8(src + npyv_nlanes_s8 * 4);
+        npyv_s8  b6 = npyv_load_s8(src + npyv_nlanes_s8 * 5);
+        npyv_s8  b7 = npyv_load_s8(src + npyv_nlanes_s8 * 6);
+        npyv_s8  b8 = npyv_load_s8(src + npyv_nlanes_s8 * 7);
+        npyv_b8 c5 = npyv_cmple_s8(a, b5);
+        npyv_b8 c6 = npyv_cmple_s8(a, b6);
+        npyv_b8 c7 = npyv_cmple_s8(a, b7);
+        npyv_b8 c8 = npyv_cmple_s8(a, b8);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+        // Only one branch below is compiled; for 8-bit elements it is '8 == 8'.
+#if 8 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // AND with truemask, then store
+    }
+    // Scalar tail: the fewer-than-'vstep' elements left over, one at a time.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_s8 b = *src;
+        *dst = scalar <= b;
+    }
+}
+
+static void simd_binary_scalar2_less_equal_s8(char **args, npy_intp len)
+{   // Writes dst[i] = (src[i] <= scalar) for i in [0, len), one byte per result.
+    npyv_lanetype_s8 *src   = (npyv_lanetype_s8 *) args[0];  // array operand (left-hand side)
+    npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[1];  // right-hand side, read once
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // byte-wide result array
+    const npyv_s8 b         = npyv_setall_s8(scalar);  // scalar broadcast to every lane
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // 0x1 broadcast to every u8 lane
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector pass
+    // Vector loop: runs while at least 'vstep' elements remain.
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 8 >= 8  // always true: load one vector and compare it against 'b'
+        npyv_s8  a1 = npyv_load_s8(src + npyv_nlanes_s8 * 0);
+        npyv_b8 c1 = npyv_cmple_s8(a1, b);
+#if 8 >= 16  // false for 8-bit elements: skipped through the matching #endif
+        npyv_s8  a2 = npyv_load_s8(src + npyv_nlanes_s8 * 1);
+        npyv_b8 c2 = npyv_cmple_s8(a2, b);
+#if 8 >= 32
+        npyv_s8  a3 = npyv_load_s8(src + npyv_nlanes_s8 * 2);
+        npyv_s8  a4 = npyv_load_s8(src + npyv_nlanes_s8 * 3);
+        npyv_b8 c3 = npyv_cmple_s8(a3, b);
+        npyv_b8 c4 = npyv_cmple_s8(a4, b);
+#if 8 == 64
+        npyv_s8  a5 = npyv_load_s8(src + npyv_nlanes_s8 * 4);
+        npyv_s8  a6 = npyv_load_s8(src + npyv_nlanes_s8 * 5);
+        npyv_s8  a7 = npyv_load_s8(src + npyv_nlanes_s8 * 6);
+        npyv_s8  a8 = npyv_load_s8(src + npyv_nlanes_s8 * 7);
+        npyv_b8 c5 = npyv_cmple_s8(a5, b);
+        npyv_b8 c6 = npyv_cmple_s8(a6, b);
+        npyv_b8 c7 = npyv_cmple_s8(a7, b);
+        npyv_b8 c8 = npyv_cmple_s8(a8, b);
+#endif // 8 == 64
+#endif // 8 >= 32
+#endif // 8 >= 16
+#endif // 8 >= 8
+        // Only one branch below is compiled; for 8-bit elements it is '8 == 8'.
+#if 8 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 8 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 8 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 8 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // AND with truemask, then store
+    }
+    // Scalar tail: the fewer-than-'vstep' elements left over, one at a time.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_s8 a = *src;
+        *dst = a <= scalar;
+    }
+}
+#endif
+
+
+
+#line 28
+#line 35
+#if NPY_SIMD && !((1 || 0) && 0)
+static void simd_binary_equal_u16(char **args, npy_intp len)
+{   // Writes dst[i] = (src1[i] == src2[i]) for i in [0, len), one byte per result.
+    npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0];  // left-hand operand array
+    npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1];  // right-hand operand array
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2];  // byte-wide result array
+    const npyv_u8 truemask    = npyv_setall_u8(0x1);  // 0x1 broadcast to every u8 lane
+    const int vstep           = npyv_nlanes_u8;  // elements handled per vector pass
+
+    // Vector loop: unrolled so that each pass yields one result vector of 'vstep' elements.
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 16 >= 8  // always true: load the first vector from each input and compare them
+        npyv_u16  a1 = npyv_load_u16(src1 + npyv_nlanes_u16 * 0);
+        npyv_u16  b1 = npyv_load_u16(src2 + npyv_nlanes_u16 * 0);
+        npyv_b16 c1 = npyv_cmpeq_u16(a1, b1);
+#if 16 >= 16  // true for 16-bit elements: a second vector pair is loaded and compared
+        npyv_u16  a2 = npyv_load_u16(src1 + npyv_nlanes_u16 * 1);
+        npyv_u16  b2 = npyv_load_u16(src2 + npyv_nlanes_u16 * 1);
+        npyv_b16 c2 = npyv_cmpeq_u16(a2, b2);
+#if 16 >= 32  // false for 16-bit elements: skipped through the matching #endif
+        npyv_u16  a3 = npyv_load_u16(src1 + npyv_nlanes_u16 * 2);
+        npyv_u16  b3 = npyv_load_u16(src2 + npyv_nlanes_u16 * 2);
+        npyv_u16  a4 = npyv_load_u16(src1 + npyv_nlanes_u16 * 3);
+        npyv_u16  b4 = npyv_load_u16(src2 + npyv_nlanes_u16 * 3);
+        npyv_b16 c3 = npyv_cmpeq_u16(a3, b3);
+        npyv_b16 c4 = npyv_cmpeq_u16(a4, b4);
+#if 16 == 64
+        npyv_u16  a5 = npyv_load_u16(src1 + npyv_nlanes_u16 * 4);
+        npyv_u16  b5 = npyv_load_u16(src2 + npyv_nlanes_u16 * 4);
+        npyv_u16  a6 = npyv_load_u16(src1 + npyv_nlanes_u16 * 5);
+        npyv_u16  b6 = npyv_load_u16(src2 + npyv_nlanes_u16 * 5);
+        npyv_u16  a7 = npyv_load_u16(src1 + npyv_nlanes_u16 * 6);
+        npyv_u16  b7 = npyv_load_u16(src2 + npyv_nlanes_u16 * 6);
+        npyv_u16  a8 = npyv_load_u16(src1 + npyv_nlanes_u16 * 7);
+        npyv_u16  b8 = npyv_load_u16(src2 + npyv_nlanes_u16 * 7);
+        npyv_b16 c5 = npyv_cmpeq_u16(a5, b5);
+        npyv_b16 c6 = npyv_cmpeq_u16(a6, b6);
+        npyv_b16 c7 = npyv_cmpeq_u16(a7, b7);
+        npyv_b16 c8 = npyv_cmpeq_u16(a8, b8);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+        // Pack the 'c' vectors into a single vector 'r'; only '16 == 16' is compiled here.
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // AND with truemask, then store
+    }
+    // Scalar tail: the fewer-than-'vstep' elements left over, one at a time.
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {
+        const npyv_lanetype_u16 a = *src1;
+        const npyv_lanetype_u16 b = *src2;
+        *dst = a == b;
+    }
+}
+
+static void simd_binary_scalar1_equal_u16(char **args, npy_intp len)
+{   // Writes dst[i] = (scalar == src[i]) for i in [0, len), one byte per result.
+    npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[0];  // left-hand side, read once
+    npyv_lanetype_u16 *src   = (npyv_lanetype_u16 *) args[1];  // array operand (right-hand side)
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // byte-wide result array
+    const npyv_u16 a         = npyv_setall_u16(scalar);  // scalar broadcast to every lane
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // 0x1 broadcast to every u8 lane
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector pass
+    // Vector loop: runs while at least 'vstep' elements remain.
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 16 >= 8  // always true: load the first vector and compare 'a' against it
+        npyv_u16  b1 = npyv_load_u16(src + npyv_nlanes_u16 * 0);
+        npyv_b16 c1 = npyv_cmpeq_u16(a, b1);
+#if 16 >= 16  // true for 16-bit elements: a second vector is loaded and compared
+        npyv_u16  b2 = npyv_load_u16(src + npyv_nlanes_u16 * 1);
+        npyv_b16 c2 = npyv_cmpeq_u16(a, b2);
+#if 16 >= 32  // false for 16-bit elements: skipped through the matching #endif
+        npyv_u16  b3 = npyv_load_u16(src + npyv_nlanes_u16 * 2);
+        npyv_u16  b4 = npyv_load_u16(src + npyv_nlanes_u16 * 3);
+        npyv_b16 c3 = npyv_cmpeq_u16(a, b3);
+        npyv_b16 c4 = npyv_cmpeq_u16(a, b4);
+#if 16 == 64
+        npyv_u16  b5 = npyv_load_u16(src + npyv_nlanes_u16 * 4);
+        npyv_u16  b6 = npyv_load_u16(src + npyv_nlanes_u16 * 5);
+        npyv_u16  b7 = npyv_load_u16(src + npyv_nlanes_u16 * 6);
+        npyv_u16  b8 = npyv_load_u16(src + npyv_nlanes_u16 * 7);
+        npyv_b16 c5 = npyv_cmpeq_u16(a, b5);
+        npyv_b16 c6 = npyv_cmpeq_u16(a, b6);
+        npyv_b16 c7 = npyv_cmpeq_u16(a, b7);
+        npyv_b16 c8 = npyv_cmpeq_u16(a, b8);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+        // Only one branch below is compiled; for 16-bit elements it is '16 == 16'.
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // AND with truemask, then store
+    }
+    // Scalar tail: the fewer-than-'vstep' elements left over, one at a time.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_u16 b = *src;
+        *dst = scalar == b;
+    }
+}
+
+static void simd_binary_scalar2_equal_u16(char **args, npy_intp len)
+{   // Writes dst[i] = (src[i] == scalar) for i in [0, len), one byte per result.
+    npyv_lanetype_u16 *src   = (npyv_lanetype_u16 *) args[0];  // array operand (left-hand side)
+    npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[1];  // right-hand side, read once
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // byte-wide result array
+    const npyv_u16 b         = npyv_setall_u16(scalar);  // scalar broadcast to every lane
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // 0x1 broadcast to every u8 lane
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector pass
+    // Vector loop: runs while at least 'vstep' elements remain.
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 16 >= 8  // always true: load the first vector and compare it against 'b'
+        npyv_u16  a1 = npyv_load_u16(src + npyv_nlanes_u16 * 0);
+        npyv_b16 c1 = npyv_cmpeq_u16(a1, b);
+#if 16 >= 16  // true for 16-bit elements: a second vector is loaded and compared
+        npyv_u16  a2 = npyv_load_u16(src + npyv_nlanes_u16 * 1);
+        npyv_b16 c2 = npyv_cmpeq_u16(a2, b);
+#if 16 >= 32  // false for 16-bit elements: skipped through the matching #endif
+        npyv_u16  a3 = npyv_load_u16(src + npyv_nlanes_u16 * 2);
+        npyv_u16  a4 = npyv_load_u16(src + npyv_nlanes_u16 * 3);
+        npyv_b16 c3 = npyv_cmpeq_u16(a3, b);
+        npyv_b16 c4 = npyv_cmpeq_u16(a4, b);
+#if 16 == 64
+        npyv_u16  a5 = npyv_load_u16(src + npyv_nlanes_u16 * 4);
+        npyv_u16  a6 = npyv_load_u16(src + npyv_nlanes_u16 * 5);
+        npyv_u16  a7 = npyv_load_u16(src + npyv_nlanes_u16 * 6);
+        npyv_u16  a8 = npyv_load_u16(src + npyv_nlanes_u16 * 7);
+        npyv_b16 c5 = npyv_cmpeq_u16(a5, b);
+        npyv_b16 c6 = npyv_cmpeq_u16(a6, b);
+        npyv_b16 c7 = npyv_cmpeq_u16(a7, b);
+        npyv_b16 c8 = npyv_cmpeq_u16(a8, b);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+        // Only one branch below is compiled; for 16-bit elements it is '16 == 16'.
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // AND with truemask, then store
+    }
+    // Scalar tail: the fewer-than-'vstep' elements left over, one at a time.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_u16 a = *src;
+        *dst = a == scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 1) && 0)
+static void simd_binary_not_equal_u16(char **args, npy_intp len)
+{   // Writes dst[i] = (src1[i] != src2[i]) for i in [0, len), one byte per result.
+    npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0];  // left-hand operand array
+    npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1];  // right-hand operand array
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2];  // byte-wide result array
+    const npyv_u8 truemask    = npyv_setall_u8(0x1);  // 0x1 broadcast to every u8 lane
+    const int vstep           = npyv_nlanes_u8;  // elements handled per vector pass
+
+    // Vector loop: unrolled so that each pass yields one result vector of 'vstep' elements.
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 16 >= 8  // always true: load the first vector from each input and compare them
+        npyv_u16  a1 = npyv_load_u16(src1 + npyv_nlanes_u16 * 0);
+        npyv_u16  b1 = npyv_load_u16(src2 + npyv_nlanes_u16 * 0);
+        npyv_b16 c1 = npyv_cmpneq_u16(a1, b1);
+#if 16 >= 16  // true for 16-bit elements: a second vector pair is loaded and compared
+        npyv_u16  a2 = npyv_load_u16(src1 + npyv_nlanes_u16 * 1);
+        npyv_u16  b2 = npyv_load_u16(src2 + npyv_nlanes_u16 * 1);
+        npyv_b16 c2 = npyv_cmpneq_u16(a2, b2);
+#if 16 >= 32  // false for 16-bit elements: skipped through the matching #endif
+        npyv_u16  a3 = npyv_load_u16(src1 + npyv_nlanes_u16 * 2);
+        npyv_u16  b3 = npyv_load_u16(src2 + npyv_nlanes_u16 * 2);
+        npyv_u16  a4 = npyv_load_u16(src1 + npyv_nlanes_u16 * 3);
+        npyv_u16  b4 = npyv_load_u16(src2 + npyv_nlanes_u16 * 3);
+        npyv_b16 c3 = npyv_cmpneq_u16(a3, b3);
+        npyv_b16 c4 = npyv_cmpneq_u16(a4, b4);
+#if 16 == 64
+        npyv_u16  a5 = npyv_load_u16(src1 + npyv_nlanes_u16 * 4);
+        npyv_u16  b5 = npyv_load_u16(src2 + npyv_nlanes_u16 * 4);
+        npyv_u16  a6 = npyv_load_u16(src1 + npyv_nlanes_u16 * 5);
+        npyv_u16  b6 = npyv_load_u16(src2 + npyv_nlanes_u16 * 5);
+        npyv_u16  a7 = npyv_load_u16(src1 + npyv_nlanes_u16 * 6);
+        npyv_u16  b7 = npyv_load_u16(src2 + npyv_nlanes_u16 * 6);
+        npyv_u16  a8 = npyv_load_u16(src1 + npyv_nlanes_u16 * 7);
+        npyv_u16  b8 = npyv_load_u16(src2 + npyv_nlanes_u16 * 7);
+        npyv_b16 c5 = npyv_cmpneq_u16(a5, b5);
+        npyv_b16 c6 = npyv_cmpneq_u16(a6, b6);
+        npyv_b16 c7 = npyv_cmpneq_u16(a7, b7);
+        npyv_b16 c8 = npyv_cmpneq_u16(a8, b8);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+        // Pack the 'c' vectors into a single vector 'r'; only '16 == 16' is compiled here.
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // AND with truemask, then store
+    }
+    // Scalar tail: the fewer-than-'vstep' elements left over, one at a time.
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {
+        const npyv_lanetype_u16 a = *src1;
+        const npyv_lanetype_u16 b = *src2;
+        *dst = a != b;
+    }
+}
+
+static void simd_binary_scalar1_not_equal_u16(char **args, npy_intp len)
+{   // Writes dst[i] = (scalar != src[i]) for i in [0, len), one byte per result.
+    npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[0];  // left-hand side, read once
+    npyv_lanetype_u16 *src   = (npyv_lanetype_u16 *) args[1];  // array operand (right-hand side)
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // byte-wide result array
+    const npyv_u16 a         = npyv_setall_u16(scalar);  // scalar broadcast to every lane
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // 0x1 broadcast to every u8 lane
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector pass
+    // Vector loop: runs while at least 'vstep' elements remain.
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 16 >= 8  // always true: load the first vector and compare 'a' against it
+        npyv_u16  b1 = npyv_load_u16(src + npyv_nlanes_u16 * 0);
+        npyv_b16 c1 = npyv_cmpneq_u16(a, b1);
+#if 16 >= 16  // true for 16-bit elements: a second vector is loaded and compared
+        npyv_u16  b2 = npyv_load_u16(src + npyv_nlanes_u16 * 1);
+        npyv_b16 c2 = npyv_cmpneq_u16(a, b2);
+#if 16 >= 32  // false for 16-bit elements: skipped through the matching #endif
+        npyv_u16  b3 = npyv_load_u16(src + npyv_nlanes_u16 * 2);
+        npyv_u16  b4 = npyv_load_u16(src + npyv_nlanes_u16 * 3);
+        npyv_b16 c3 = npyv_cmpneq_u16(a, b3);
+        npyv_b16 c4 = npyv_cmpneq_u16(a, b4);
+#if 16 == 64
+        npyv_u16  b5 = npyv_load_u16(src + npyv_nlanes_u16 * 4);
+        npyv_u16  b6 = npyv_load_u16(src + npyv_nlanes_u16 * 5);
+        npyv_u16  b7 = npyv_load_u16(src + npyv_nlanes_u16 * 6);
+        npyv_u16  b8 = npyv_load_u16(src + npyv_nlanes_u16 * 7);
+        npyv_b16 c5 = npyv_cmpneq_u16(a, b5);
+        npyv_b16 c6 = npyv_cmpneq_u16(a, b6);
+        npyv_b16 c7 = npyv_cmpneq_u16(a, b7);
+        npyv_b16 c8 = npyv_cmpneq_u16(a, b8);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+        // Only one branch below is compiled; for 16-bit elements it is '16 == 16'.
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // AND with truemask, then store
+    }
+    // Scalar tail: the fewer-than-'vstep' elements left over, one at a time.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_u16 b = *src;
+        *dst = scalar != b;
+    }
+}
+
+static void simd_binary_scalar2_not_equal_u16(char **args, npy_intp len)
+{   // Writes dst[i] = (src[i] != scalar) for i in [0, len), one byte per result.
+    npyv_lanetype_u16 *src   = (npyv_lanetype_u16 *) args[0];  // array operand (left-hand side)
+    npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[1];  // right-hand side, read once
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // byte-wide result array
+    const npyv_u16 b         = npyv_setall_u16(scalar);  // scalar broadcast to every lane
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // 0x1 broadcast to every u8 lane
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector pass
+    // Vector loop: runs while at least 'vstep' elements remain.
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 16 >= 8  // always true: load the first vector and compare it against 'b'
+        npyv_u16  a1 = npyv_load_u16(src + npyv_nlanes_u16 * 0);
+        npyv_b16 c1 = npyv_cmpneq_u16(a1, b);
+#if 16 >= 16  // true for 16-bit elements: a second vector is loaded and compared
+        npyv_u16  a2 = npyv_load_u16(src + npyv_nlanes_u16 * 1);
+        npyv_b16 c2 = npyv_cmpneq_u16(a2, b);
+#if 16 >= 32  // false for 16-bit elements: skipped through the matching #endif
+        npyv_u16  a3 = npyv_load_u16(src + npyv_nlanes_u16 * 2);
+        npyv_u16  a4 = npyv_load_u16(src + npyv_nlanes_u16 * 3);
+        npyv_b16 c3 = npyv_cmpneq_u16(a3, b);
+        npyv_b16 c4 = npyv_cmpneq_u16(a4, b);
+#if 16 == 64
+        npyv_u16  a5 = npyv_load_u16(src + npyv_nlanes_u16 * 4);
+        npyv_u16  a6 = npyv_load_u16(src + npyv_nlanes_u16 * 5);
+        npyv_u16  a7 = npyv_load_u16(src + npyv_nlanes_u16 * 6);
+        npyv_u16  a8 = npyv_load_u16(src + npyv_nlanes_u16 * 7);
+        npyv_b16 c5 = npyv_cmpneq_u16(a5, b);
+        npyv_b16 c6 = npyv_cmpneq_u16(a6, b);
+        npyv_b16 c7 = npyv_cmpneq_u16(a7, b);
+        npyv_b16 c8 = npyv_cmpneq_u16(a8, b);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+        // Only one branch below is compiled; for 16-bit elements it is '16 == 16'.
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // AND with truemask, then store
+    }
+    // Scalar tail: the fewer-than-'vstep' elements left over, one at a time.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_u16 a = *src;
+        *dst = a != scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 0) && 0)
+static void simd_binary_less_u16(char **args, npy_intp len)
+{   // Writes dst[i] = (src1[i] < src2[i]) for i in [0, len), one byte per result.
+    npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0];  // left-hand operand array
+    npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1];  // right-hand operand array
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2];  // byte-wide result array
+    const npyv_u8 truemask    = npyv_setall_u8(0x1);  // 0x1 broadcast to every u8 lane
+    const int vstep           = npyv_nlanes_u8;  // elements handled per vector pass
+
+    // Vector loop: unrolled so that each pass yields one result vector of 'vstep' elements.
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 16 >= 8  // always true: load the first vector from each input and compare them
+        npyv_u16  a1 = npyv_load_u16(src1 + npyv_nlanes_u16 * 0);
+        npyv_u16  b1 = npyv_load_u16(src2 + npyv_nlanes_u16 * 0);
+        npyv_b16 c1 = npyv_cmplt_u16(a1, b1);
+#if 16 >= 16  // true for 16-bit elements: a second vector pair is loaded and compared
+        npyv_u16  a2 = npyv_load_u16(src1 + npyv_nlanes_u16 * 1);
+        npyv_u16  b2 = npyv_load_u16(src2 + npyv_nlanes_u16 * 1);
+        npyv_b16 c2 = npyv_cmplt_u16(a2, b2);
+#if 16 >= 32  // false for 16-bit elements: skipped through the matching #endif
+        npyv_u16  a3 = npyv_load_u16(src1 + npyv_nlanes_u16 * 2);
+        npyv_u16  b3 = npyv_load_u16(src2 + npyv_nlanes_u16 * 2);
+        npyv_u16  a4 = npyv_load_u16(src1 + npyv_nlanes_u16 * 3);
+        npyv_u16  b4 = npyv_load_u16(src2 + npyv_nlanes_u16 * 3);
+        npyv_b16 c3 = npyv_cmplt_u16(a3, b3);
+        npyv_b16 c4 = npyv_cmplt_u16(a4, b4);
+#if 16 == 64
+        npyv_u16  a5 = npyv_load_u16(src1 + npyv_nlanes_u16 * 4);
+        npyv_u16  b5 = npyv_load_u16(src2 + npyv_nlanes_u16 * 4);
+        npyv_u16  a6 = npyv_load_u16(src1 + npyv_nlanes_u16 * 5);
+        npyv_u16  b6 = npyv_load_u16(src2 + npyv_nlanes_u16 * 5);
+        npyv_u16  a7 = npyv_load_u16(src1 + npyv_nlanes_u16 * 6);
+        npyv_u16  b7 = npyv_load_u16(src2 + npyv_nlanes_u16 * 6);
+        npyv_u16  a8 = npyv_load_u16(src1 + npyv_nlanes_u16 * 7);
+        npyv_u16  b8 = npyv_load_u16(src2 + npyv_nlanes_u16 * 7);
+        npyv_b16 c5 = npyv_cmplt_u16(a5, b5);
+        npyv_b16 c6 = npyv_cmplt_u16(a6, b6);
+        npyv_b16 c7 = npyv_cmplt_u16(a7, b7);
+        npyv_b16 c8 = npyv_cmplt_u16(a8, b8);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+        // Pack the 'c' vectors into a single vector 'r'; only '16 == 16' is compiled here.
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // AND with truemask, then store
+    }
+    // Scalar tail: the fewer-than-'vstep' elements left over, one at a time.
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {
+        const npyv_lanetype_u16 a = *src1;
+        const npyv_lanetype_u16 b = *src2;
+        *dst = a < b;
+    }
+}
+
+static void simd_binary_scalar1_less_u16(char **args, npy_intp len)
+{   // Writes dst[i] = (scalar < src[i]) for i in [0, len), one byte per result.
+    npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[0];  // left-hand side, read once
+    npyv_lanetype_u16 *src   = (npyv_lanetype_u16 *) args[1];  // array operand (right-hand side)
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // byte-wide result array
+    const npyv_u16 a         = npyv_setall_u16(scalar);  // scalar broadcast to every lane
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // 0x1 broadcast to every u8 lane
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector pass
+    // Vector loop: runs while at least 'vstep' elements remain.
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 16 >= 8  // always true: load the first vector and compare 'a' against it
+        npyv_u16  b1 = npyv_load_u16(src + npyv_nlanes_u16 * 0);
+        npyv_b16 c1 = npyv_cmplt_u16(a, b1);
+#if 16 >= 16  // true for 16-bit elements: a second vector is loaded and compared
+        npyv_u16  b2 = npyv_load_u16(src + npyv_nlanes_u16 * 1);
+        npyv_b16 c2 = npyv_cmplt_u16(a, b2);
+#if 16 >= 32  // false for 16-bit elements: skipped through the matching #endif
+        npyv_u16  b3 = npyv_load_u16(src + npyv_nlanes_u16 * 2);
+        npyv_u16  b4 = npyv_load_u16(src + npyv_nlanes_u16 * 3);
+        npyv_b16 c3 = npyv_cmplt_u16(a, b3);
+        npyv_b16 c4 = npyv_cmplt_u16(a, b4);
+#if 16 == 64
+        npyv_u16  b5 = npyv_load_u16(src + npyv_nlanes_u16 * 4);
+        npyv_u16  b6 = npyv_load_u16(src + npyv_nlanes_u16 * 5);
+        npyv_u16  b7 = npyv_load_u16(src + npyv_nlanes_u16 * 6);
+        npyv_u16  b8 = npyv_load_u16(src + npyv_nlanes_u16 * 7);
+        npyv_b16 c5 = npyv_cmplt_u16(a, b5);
+        npyv_b16 c6 = npyv_cmplt_u16(a, b6);
+        npyv_b16 c7 = npyv_cmplt_u16(a, b7);
+        npyv_b16 c8 = npyv_cmplt_u16(a, b8);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+        // Only one branch below is compiled; for 16-bit elements it is '16 == 16'.
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));  // AND with truemask, then store
+    }
+    // Scalar tail: the fewer-than-'vstep' elements left over, one at a time.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_u16 b = *src;
+        *dst = scalar < b;
+    }
+}
+
+static void simd_binary_scalar2_less_u16(char **args, npy_intp len)
+{   // dst[i] = (src[i] < scalar) for `len` consecutive u16 elements; one result byte per element.
+    npyv_lanetype_u16 *src   = (npyv_lanetype_u16 *) args[0];  // args[0]: left-operand array
+    npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[1];  // args[1]: right operand, read once as a single value
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // args[2]: output bytes
+    const npyv_u16 b         = npyv_setall_u16(scalar);  // scalar as a vector operand for the compares below
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed into each vector result before the store
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 16 >= 8  // constant-true
+        npyv_u16  a1 = npyv_load_u16(src + npyv_nlanes_u16 * 0);
+        npyv_b16 c1 = npyv_cmplt_u16(a1, b);
+#if 16 >= 16  // constant-true: second group of npyv_nlanes_u16 elements
+        npyv_u16  a2 = npyv_load_u16(src + npyv_nlanes_u16 * 1);
+        npyv_b16 c2 = npyv_cmplt_u16(a2, b);
+#if 16 >= 32  // constant-false: everything down to the matching #endif is skipped
+        npyv_u16  a3 = npyv_load_u16(src + npyv_nlanes_u16 * 2);
+        npyv_u16  a4 = npyv_load_u16(src + npyv_nlanes_u16 * 3);
+        npyv_b16 c3 = npyv_cmplt_u16(a3, b);
+        npyv_b16 c4 = npyv_cmplt_u16(a4, b);
+#if 16 == 64
+        npyv_u16  a5 = npyv_load_u16(src + npyv_nlanes_u16 * 4);
+        npyv_u16  a6 = npyv_load_u16(src + npyv_nlanes_u16 * 5);
+        npyv_u16  a7 = npyv_load_u16(src + npyv_nlanes_u16 * 6);
+        npyv_u16  a8 = npyv_load_u16(src + npyv_nlanes_u16 * 7);
+        npyv_b16 c5 = npyv_cmplt_u16(a5, b);
+        npyv_b16 c6 = npyv_cmplt_u16(a6, b);
+        npyv_b16 c7 = npyv_cmplt_u16(a7, b);
+        npyv_b16 c8 = npyv_cmplt_u16(a8, b);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16  // the only branch of this chain that is compiled
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u16 a = *src;
+        *dst = a < scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 0) && 0)
+static void simd_binary_less_equal_u16(char **args, npy_intp len)
+{   // dst[i] = (src1[i] <= src2[i]) for `len` consecutive u16 elements; one result byte per element.
+    npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0];  // args[0]: left-operand array
+    npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1];  // args[1]: right-operand array
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2];  // args[2]: output bytes
+    const npyv_u8 truemask    = npyv_setall_u8(0x1);  // ANDed into each vector result before the store
+    const int vstep           = npyv_nlanes_u8;  // elements handled per vector iteration
+
+    // Unrolled so that each iteration fills one whole u8 result vector ('vstep' elements).
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 16 >= 8  // constant-true
+        npyv_u16  a1 = npyv_load_u16(src1 + npyv_nlanes_u16 * 0);
+        npyv_u16  b1 = npyv_load_u16(src2 + npyv_nlanes_u16 * 0);
+        npyv_b16 c1 = npyv_cmple_u16(a1, b1);
+#if 16 >= 16  // constant-true: second group of npyv_nlanes_u16 elements
+        npyv_u16  a2 = npyv_load_u16(src1 + npyv_nlanes_u16 * 1);
+        npyv_u16  b2 = npyv_load_u16(src2 + npyv_nlanes_u16 * 1);
+        npyv_b16 c2 = npyv_cmple_u16(a2, b2);
+#if 16 >= 32  // constant-false: everything down to the matching #endif is skipped
+        npyv_u16  a3 = npyv_load_u16(src1 + npyv_nlanes_u16 * 2);
+        npyv_u16  b3 = npyv_load_u16(src2 + npyv_nlanes_u16 * 2);
+        npyv_u16  a4 = npyv_load_u16(src1 + npyv_nlanes_u16 * 3);
+        npyv_u16  b4 = npyv_load_u16(src2 + npyv_nlanes_u16 * 3);
+        npyv_b16 c3 = npyv_cmple_u16(a3, b3);
+        npyv_b16 c4 = npyv_cmple_u16(a4, b4);
+#if 16 == 64
+        npyv_u16  a5 = npyv_load_u16(src1 + npyv_nlanes_u16 * 4);
+        npyv_u16  b5 = npyv_load_u16(src2 + npyv_nlanes_u16 * 4);
+        npyv_u16  a6 = npyv_load_u16(src1 + npyv_nlanes_u16 * 5);
+        npyv_u16  b6 = npyv_load_u16(src2 + npyv_nlanes_u16 * 5);
+        npyv_u16  a7 = npyv_load_u16(src1 + npyv_nlanes_u16 * 6);
+        npyv_u16  b7 = npyv_load_u16(src2 + npyv_nlanes_u16 * 6);
+        npyv_u16  a8 = npyv_load_u16(src1 + npyv_nlanes_u16 * 7);
+        npyv_u16  b8 = npyv_load_u16(src2 + npyv_nlanes_u16 * 7);
+        npyv_b16 c5 = npyv_cmple_u16(a5, b5);
+        npyv_b16 c6 = npyv_cmple_u16(a6, b6);
+        npyv_b16 c7 = npyv_cmple_u16(a7, b7);
+        npyv_b16 c8 = npyv_cmple_u16(a8, b8);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+        // Pack the 'c' mask vectors into a single u8 vector 'r'.
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16  // the only branch of this chain that is compiled
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u16 a = *src1;
+        const npyv_lanetype_u16 b = *src2;
+        *dst = a <= b;
+    }
+}
+
+static void simd_binary_scalar1_less_equal_u16(char **args, npy_intp len)
+{   // dst[i] = (scalar <= src[i]) for `len` consecutive u16 elements; one result byte per element.
+    npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[0];  // args[0]: left operand, read once as a single value
+    npyv_lanetype_u16 *src   = (npyv_lanetype_u16 *) args[1];  // args[1]: right-operand array
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // args[2]: output bytes
+    const npyv_u16 a         = npyv_setall_u16(scalar);  // scalar as a vector operand for the compares below
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed into each vector result before the store
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 16 >= 8  // constant-true
+        npyv_u16  b1 = npyv_load_u16(src + npyv_nlanes_u16 * 0);
+        npyv_b16 c1 = npyv_cmple_u16(a, b1);
+#if 16 >= 16  // constant-true: second group of npyv_nlanes_u16 elements
+        npyv_u16  b2 = npyv_load_u16(src + npyv_nlanes_u16 * 1);
+        npyv_b16 c2 = npyv_cmple_u16(a, b2);
+#if 16 >= 32  // constant-false: everything down to the matching #endif is skipped
+        npyv_u16  b3 = npyv_load_u16(src + npyv_nlanes_u16 * 2);
+        npyv_u16  b4 = npyv_load_u16(src + npyv_nlanes_u16 * 3);
+        npyv_b16 c3 = npyv_cmple_u16(a, b3);
+        npyv_b16 c4 = npyv_cmple_u16(a, b4);
+#if 16 == 64
+        npyv_u16  b5 = npyv_load_u16(src + npyv_nlanes_u16 * 4);
+        npyv_u16  b6 = npyv_load_u16(src + npyv_nlanes_u16 * 5);
+        npyv_u16  b7 = npyv_load_u16(src + npyv_nlanes_u16 * 6);
+        npyv_u16  b8 = npyv_load_u16(src + npyv_nlanes_u16 * 7);
+        npyv_b16 c5 = npyv_cmple_u16(a, b5);
+        npyv_b16 c6 = npyv_cmple_u16(a, b6);
+        npyv_b16 c7 = npyv_cmple_u16(a, b7);
+        npyv_b16 c8 = npyv_cmple_u16(a, b8);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16  // the only branch of this chain that is compiled
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u16 b = *src;
+        *dst = scalar <= b;
+    }
+}
+
+static void simd_binary_scalar2_less_equal_u16(char **args, npy_intp len)
+{   // dst[i] = (src[i] <= scalar) for `len` consecutive u16 elements; one result byte per element.
+    npyv_lanetype_u16 *src   = (npyv_lanetype_u16 *) args[0];  // args[0]: left-operand array
+    npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[1];  // args[1]: right operand, read once as a single value
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // args[2]: output bytes
+    const npyv_u16 b         = npyv_setall_u16(scalar);  // scalar as a vector operand for the compares below
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed into each vector result before the store
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 16 >= 8  // constant-true
+        npyv_u16  a1 = npyv_load_u16(src + npyv_nlanes_u16 * 0);
+        npyv_b16 c1 = npyv_cmple_u16(a1, b);
+#if 16 >= 16  // constant-true: second group of npyv_nlanes_u16 elements
+        npyv_u16  a2 = npyv_load_u16(src + npyv_nlanes_u16 * 1);
+        npyv_b16 c2 = npyv_cmple_u16(a2, b);
+#if 16 >= 32  // constant-false: everything down to the matching #endif is skipped
+        npyv_u16  a3 = npyv_load_u16(src + npyv_nlanes_u16 * 2);
+        npyv_u16  a4 = npyv_load_u16(src + npyv_nlanes_u16 * 3);
+        npyv_b16 c3 = npyv_cmple_u16(a3, b);
+        npyv_b16 c4 = npyv_cmple_u16(a4, b);
+#if 16 == 64
+        npyv_u16  a5 = npyv_load_u16(src + npyv_nlanes_u16 * 4);
+        npyv_u16  a6 = npyv_load_u16(src + npyv_nlanes_u16 * 5);
+        npyv_u16  a7 = npyv_load_u16(src + npyv_nlanes_u16 * 6);
+        npyv_u16  a8 = npyv_load_u16(src + npyv_nlanes_u16 * 7);
+        npyv_b16 c5 = npyv_cmple_u16(a5, b);
+        npyv_b16 c6 = npyv_cmple_u16(a6, b);
+        npyv_b16 c7 = npyv_cmple_u16(a7, b);
+        npyv_b16 c8 = npyv_cmple_u16(a8, b);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16  // the only branch of this chain that is compiled
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u16 a = *src;
+        *dst = a <= scalar;
+    }
+}
+#endif
+
+
+
+#line 28
+#line 35
+#if NPY_SIMD && !((1 || 0) && 1)
+static void simd_binary_equal_s16(char **args, npy_intp len)  // never compiled: enclosing guard '#if NPY_SIMD && !((1 || 0) && 1)' is constant-false
+{   // dst[i] = (src1[i] == src2[i]) for `len` consecutive s16 elements; one result byte per element.
+    npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0];  // args[0]: left-operand array
+    npyv_lanetype_s16 *src2 = (npyv_lanetype_s16 *) args[1];  // args[1]: right-operand array
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2];  // args[2]: output bytes
+    const npyv_u8 truemask    = npyv_setall_u8(0x1);  // ANDed into each vector result before the store
+    const int vstep           = npyv_nlanes_u8;  // elements handled per vector iteration
+
+    // Unrolled so that each iteration fills one whole u8 result vector ('vstep' elements).
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 16 >= 8  // constant-true
+        npyv_s16  a1 = npyv_load_s16(src1 + npyv_nlanes_s16 * 0);
+        npyv_s16  b1 = npyv_load_s16(src2 + npyv_nlanes_s16 * 0);
+        npyv_b16 c1 = npyv_cmpeq_s16(a1, b1);
+#if 16 >= 16  // constant-true: second group of npyv_nlanes_s16 elements
+        npyv_s16  a2 = npyv_load_s16(src1 + npyv_nlanes_s16 * 1);
+        npyv_s16  b2 = npyv_load_s16(src2 + npyv_nlanes_s16 * 1);
+        npyv_b16 c2 = npyv_cmpeq_s16(a2, b2);
+#if 16 >= 32  // constant-false: everything down to the matching #endif is skipped
+        npyv_s16  a3 = npyv_load_s16(src1 + npyv_nlanes_s16 * 2);
+        npyv_s16  b3 = npyv_load_s16(src2 + npyv_nlanes_s16 * 2);
+        npyv_s16  a4 = npyv_load_s16(src1 + npyv_nlanes_s16 * 3);
+        npyv_s16  b4 = npyv_load_s16(src2 + npyv_nlanes_s16 * 3);
+        npyv_b16 c3 = npyv_cmpeq_s16(a3, b3);
+        npyv_b16 c4 = npyv_cmpeq_s16(a4, b4);
+#if 16 == 64
+        npyv_s16  a5 = npyv_load_s16(src1 + npyv_nlanes_s16 * 4);
+        npyv_s16  b5 = npyv_load_s16(src2 + npyv_nlanes_s16 * 4);
+        npyv_s16  a6 = npyv_load_s16(src1 + npyv_nlanes_s16 * 5);
+        npyv_s16  b6 = npyv_load_s16(src2 + npyv_nlanes_s16 * 5);
+        npyv_s16  a7 = npyv_load_s16(src1 + npyv_nlanes_s16 * 6);
+        npyv_s16  b7 = npyv_load_s16(src2 + npyv_nlanes_s16 * 6);
+        npyv_s16  a8 = npyv_load_s16(src1 + npyv_nlanes_s16 * 7);
+        npyv_s16  b8 = npyv_load_s16(src2 + npyv_nlanes_s16 * 7);
+        npyv_b16 c5 = npyv_cmpeq_s16(a5, b5);
+        npyv_b16 c6 = npyv_cmpeq_s16(a6, b6);
+        npyv_b16 c7 = npyv_cmpeq_s16(a7, b7);
+        npyv_b16 c8 = npyv_cmpeq_s16(a8, b8);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+        // Pack the 'c' mask vectors into a single u8 vector 'r'.
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16  // the only branch of this chain that would be selected
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s16 a = *src1;
+        const npyv_lanetype_s16 b = *src2;
+        *dst = a == b;
+    }
+}
+
+static void simd_binary_scalar1_equal_s16(char **args, npy_intp len)  // never compiled: enclosing guard '#if NPY_SIMD && !((1 || 0) && 1)' is constant-false
+{   // dst[i] = (scalar == src[i]) for `len` consecutive s16 elements; one result byte per element.
+    npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[0];  // args[0]: left operand, read once as a single value
+    npyv_lanetype_s16 *src   = (npyv_lanetype_s16 *) args[1];  // args[1]: right-operand array
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // args[2]: output bytes
+    const npyv_s16 a         = npyv_setall_s16(scalar);  // scalar as a vector operand for the compares below
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed into each vector result before the store
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 16 >= 8  // constant-true
+        npyv_s16  b1 = npyv_load_s16(src + npyv_nlanes_s16 * 0);
+        npyv_b16 c1 = npyv_cmpeq_s16(a, b1);
+#if 16 >= 16  // constant-true: second group of npyv_nlanes_s16 elements
+        npyv_s16  b2 = npyv_load_s16(src + npyv_nlanes_s16 * 1);
+        npyv_b16 c2 = npyv_cmpeq_s16(a, b2);
+#if 16 >= 32  // constant-false: everything down to the matching #endif is skipped
+        npyv_s16  b3 = npyv_load_s16(src + npyv_nlanes_s16 * 2);
+        npyv_s16  b4 = npyv_load_s16(src + npyv_nlanes_s16 * 3);
+        npyv_b16 c3 = npyv_cmpeq_s16(a, b3);
+        npyv_b16 c4 = npyv_cmpeq_s16(a, b4);
+#if 16 == 64
+        npyv_s16  b5 = npyv_load_s16(src + npyv_nlanes_s16 * 4);
+        npyv_s16  b6 = npyv_load_s16(src + npyv_nlanes_s16 * 5);
+        npyv_s16  b7 = npyv_load_s16(src + npyv_nlanes_s16 * 6);
+        npyv_s16  b8 = npyv_load_s16(src + npyv_nlanes_s16 * 7);
+        npyv_b16 c5 = npyv_cmpeq_s16(a, b5);
+        npyv_b16 c6 = npyv_cmpeq_s16(a, b6);
+        npyv_b16 c7 = npyv_cmpeq_s16(a, b7);
+        npyv_b16 c8 = npyv_cmpeq_s16(a, b8);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16  // the only branch of this chain that would be selected
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s16 b = *src;
+        *dst = scalar == b;
+    }
+}
+
+static void simd_binary_scalar2_equal_s16(char **args, npy_intp len)  // never compiled: enclosing guard '#if NPY_SIMD && !((1 || 0) && 1)' is constant-false
+{   // dst[i] = (src[i] == scalar) for `len` consecutive s16 elements; one result byte per element.
+    npyv_lanetype_s16 *src   = (npyv_lanetype_s16 *) args[0];  // args[0]: left-operand array
+    npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[1];  // args[1]: right operand, read once as a single value
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // args[2]: output bytes
+    const npyv_s16 b         = npyv_setall_s16(scalar);  // scalar as a vector operand for the compares below
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed into each vector result before the store
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 16 >= 8  // constant-true
+        npyv_s16  a1 = npyv_load_s16(src + npyv_nlanes_s16 * 0);
+        npyv_b16 c1 = npyv_cmpeq_s16(a1, b);
+#if 16 >= 16  // constant-true: second group of npyv_nlanes_s16 elements
+        npyv_s16  a2 = npyv_load_s16(src + npyv_nlanes_s16 * 1);
+        npyv_b16 c2 = npyv_cmpeq_s16(a2, b);
+#if 16 >= 32  // constant-false: everything down to the matching #endif is skipped
+        npyv_s16  a3 = npyv_load_s16(src + npyv_nlanes_s16 * 2);
+        npyv_s16  a4 = npyv_load_s16(src + npyv_nlanes_s16 * 3);
+        npyv_b16 c3 = npyv_cmpeq_s16(a3, b);
+        npyv_b16 c4 = npyv_cmpeq_s16(a4, b);
+#if 16 == 64
+        npyv_s16  a5 = npyv_load_s16(src + npyv_nlanes_s16 * 4);
+        npyv_s16  a6 = npyv_load_s16(src + npyv_nlanes_s16 * 5);
+        npyv_s16  a7 = npyv_load_s16(src + npyv_nlanes_s16 * 6);
+        npyv_s16  a8 = npyv_load_s16(src + npyv_nlanes_s16 * 7);
+        npyv_b16 c5 = npyv_cmpeq_s16(a5, b);
+        npyv_b16 c6 = npyv_cmpeq_s16(a6, b);
+        npyv_b16 c7 = npyv_cmpeq_s16(a7, b);
+        npyv_b16 c8 = npyv_cmpeq_s16(a8, b);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16  // the only branch of this chain that would be selected
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s16 a = *src;
+        *dst = a == scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 1) && 1)
+static void simd_binary_not_equal_s16(char **args, npy_intp len)  // never compiled: enclosing guard '#if NPY_SIMD && !((0 || 1) && 1)' is constant-false
+{   // dst[i] = (src1[i] != src2[i]) for `len` consecutive s16 elements; one result byte per element.
+    npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0];  // args[0]: left-operand array
+    npyv_lanetype_s16 *src2 = (npyv_lanetype_s16 *) args[1];  // args[1]: right-operand array
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2];  // args[2]: output bytes
+    const npyv_u8 truemask    = npyv_setall_u8(0x1);  // ANDed into each vector result before the store
+    const int vstep           = npyv_nlanes_u8;  // elements handled per vector iteration
+
+    // Unrolled so that each iteration fills one whole u8 result vector ('vstep' elements).
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 16 >= 8  // constant-true
+        npyv_s16  a1 = npyv_load_s16(src1 + npyv_nlanes_s16 * 0);
+        npyv_s16  b1 = npyv_load_s16(src2 + npyv_nlanes_s16 * 0);
+        npyv_b16 c1 = npyv_cmpneq_s16(a1, b1);
+#if 16 >= 16  // constant-true: second group of npyv_nlanes_s16 elements
+        npyv_s16  a2 = npyv_load_s16(src1 + npyv_nlanes_s16 * 1);
+        npyv_s16  b2 = npyv_load_s16(src2 + npyv_nlanes_s16 * 1);
+        npyv_b16 c2 = npyv_cmpneq_s16(a2, b2);
+#if 16 >= 32  // constant-false: everything down to the matching #endif is skipped
+        npyv_s16  a3 = npyv_load_s16(src1 + npyv_nlanes_s16 * 2);
+        npyv_s16  b3 = npyv_load_s16(src2 + npyv_nlanes_s16 * 2);
+        npyv_s16  a4 = npyv_load_s16(src1 + npyv_nlanes_s16 * 3);
+        npyv_s16  b4 = npyv_load_s16(src2 + npyv_nlanes_s16 * 3);
+        npyv_b16 c3 = npyv_cmpneq_s16(a3, b3);
+        npyv_b16 c4 = npyv_cmpneq_s16(a4, b4);
+#if 16 == 64
+        npyv_s16  a5 = npyv_load_s16(src1 + npyv_nlanes_s16 * 4);
+        npyv_s16  b5 = npyv_load_s16(src2 + npyv_nlanes_s16 * 4);
+        npyv_s16  a6 = npyv_load_s16(src1 + npyv_nlanes_s16 * 5);
+        npyv_s16  b6 = npyv_load_s16(src2 + npyv_nlanes_s16 * 5);
+        npyv_s16  a7 = npyv_load_s16(src1 + npyv_nlanes_s16 * 6);
+        npyv_s16  b7 = npyv_load_s16(src2 + npyv_nlanes_s16 * 6);
+        npyv_s16  a8 = npyv_load_s16(src1 + npyv_nlanes_s16 * 7);
+        npyv_s16  b8 = npyv_load_s16(src2 + npyv_nlanes_s16 * 7);
+        npyv_b16 c5 = npyv_cmpneq_s16(a5, b5);
+        npyv_b16 c6 = npyv_cmpneq_s16(a6, b6);
+        npyv_b16 c7 = npyv_cmpneq_s16(a7, b7);
+        npyv_b16 c8 = npyv_cmpneq_s16(a8, b8);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+        // Pack the 'c' mask vectors into a single u8 vector 'r'.
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16  // the only branch of this chain that would be selected
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s16 a = *src1;
+        const npyv_lanetype_s16 b = *src2;
+        *dst = a != b;
+    }
+}
+
+static void simd_binary_scalar1_not_equal_s16(char **args, npy_intp len)  // never compiled: enclosing guard '#if NPY_SIMD && !((0 || 1) && 1)' is constant-false
+{   // dst[i] = (scalar != src[i]) for `len` consecutive s16 elements; one result byte per element.
+    npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[0];  // args[0]: left operand, read once as a single value
+    npyv_lanetype_s16 *src   = (npyv_lanetype_s16 *) args[1];  // args[1]: right-operand array
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // args[2]: output bytes
+    const npyv_s16 a         = npyv_setall_s16(scalar);  // scalar as a vector operand for the compares below
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed into each vector result before the store
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 16 >= 8  // constant-true
+        npyv_s16  b1 = npyv_load_s16(src + npyv_nlanes_s16 * 0);
+        npyv_b16 c1 = npyv_cmpneq_s16(a, b1);
+#if 16 >= 16  // constant-true: second group of npyv_nlanes_s16 elements
+        npyv_s16  b2 = npyv_load_s16(src + npyv_nlanes_s16 * 1);
+        npyv_b16 c2 = npyv_cmpneq_s16(a, b2);
+#if 16 >= 32  // constant-false: everything down to the matching #endif is skipped
+        npyv_s16  b3 = npyv_load_s16(src + npyv_nlanes_s16 * 2);
+        npyv_s16  b4 = npyv_load_s16(src + npyv_nlanes_s16 * 3);
+        npyv_b16 c3 = npyv_cmpneq_s16(a, b3);
+        npyv_b16 c4 = npyv_cmpneq_s16(a, b4);
+#if 16 == 64
+        npyv_s16  b5 = npyv_load_s16(src + npyv_nlanes_s16 * 4);
+        npyv_s16  b6 = npyv_load_s16(src + npyv_nlanes_s16 * 5);
+        npyv_s16  b7 = npyv_load_s16(src + npyv_nlanes_s16 * 6);
+        npyv_s16  b8 = npyv_load_s16(src + npyv_nlanes_s16 * 7);
+        npyv_b16 c5 = npyv_cmpneq_s16(a, b5);
+        npyv_b16 c6 = npyv_cmpneq_s16(a, b6);
+        npyv_b16 c7 = npyv_cmpneq_s16(a, b7);
+        npyv_b16 c8 = npyv_cmpneq_s16(a, b8);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16  // the only branch of this chain that would be selected
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s16 b = *src;
+        *dst = scalar != b;
+    }
+}
+
+static void simd_binary_scalar2_not_equal_s16(char **args, npy_intp len)  // never compiled: enclosing guard '#if NPY_SIMD && !((0 || 1) && 1)' is constant-false
+{   // dst[i] = (src[i] != scalar) for `len` consecutive s16 elements; one result byte per element.
+    npyv_lanetype_s16 *src   = (npyv_lanetype_s16 *) args[0];  // args[0]: left-operand array
+    npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[1];  // args[1]: right operand, read once as a single value
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // args[2]: output bytes
+    const npyv_s16 b         = npyv_setall_s16(scalar);  // scalar as a vector operand for the compares below
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed into each vector result before the store
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 16 >= 8  // constant-true
+        npyv_s16  a1 = npyv_load_s16(src + npyv_nlanes_s16 * 0);
+        npyv_b16 c1 = npyv_cmpneq_s16(a1, b);
+#if 16 >= 16  // constant-true: second group of npyv_nlanes_s16 elements
+        npyv_s16  a2 = npyv_load_s16(src + npyv_nlanes_s16 * 1);
+        npyv_b16 c2 = npyv_cmpneq_s16(a2, b);
+#if 16 >= 32  // constant-false: everything down to the matching #endif is skipped
+        npyv_s16  a3 = npyv_load_s16(src + npyv_nlanes_s16 * 2);
+        npyv_s16  a4 = npyv_load_s16(src + npyv_nlanes_s16 * 3);
+        npyv_b16 c3 = npyv_cmpneq_s16(a3, b);
+        npyv_b16 c4 = npyv_cmpneq_s16(a4, b);
+#if 16 == 64
+        npyv_s16  a5 = npyv_load_s16(src + npyv_nlanes_s16 * 4);
+        npyv_s16  a6 = npyv_load_s16(src + npyv_nlanes_s16 * 5);
+        npyv_s16  a7 = npyv_load_s16(src + npyv_nlanes_s16 * 6);
+        npyv_s16  a8 = npyv_load_s16(src + npyv_nlanes_s16 * 7);
+        npyv_b16 c5 = npyv_cmpneq_s16(a5, b);
+        npyv_b16 c6 = npyv_cmpneq_s16(a6, b);
+        npyv_b16 c7 = npyv_cmpneq_s16(a7, b);
+        npyv_b16 c8 = npyv_cmpneq_s16(a8, b);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16  // the only branch of this chain that would be selected
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s16 a = *src;
+        *dst = a != scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 0) && 1)
+static void simd_binary_less_s16(char **args, npy_intp len)
+{   // dst[i] = (src1[i] < src2[i]) for `len` consecutive s16 elements; one result byte per element.
+    npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0];  // args[0]: left-operand array
+    npyv_lanetype_s16 *src2 = (npyv_lanetype_s16 *) args[1];  // args[1]: right-operand array
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2];  // args[2]: output bytes
+    const npyv_u8 truemask    = npyv_setall_u8(0x1);  // ANDed into each vector result before the store
+    const int vstep           = npyv_nlanes_u8;  // elements handled per vector iteration
+
+    // Unrolled so that each iteration fills one whole u8 result vector ('vstep' elements).
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 16 >= 8  // constant-true
+        npyv_s16  a1 = npyv_load_s16(src1 + npyv_nlanes_s16 * 0);
+        npyv_s16  b1 = npyv_load_s16(src2 + npyv_nlanes_s16 * 0);
+        npyv_b16 c1 = npyv_cmplt_s16(a1, b1);
+#if 16 >= 16  // constant-true: second group of npyv_nlanes_s16 elements
+        npyv_s16  a2 = npyv_load_s16(src1 + npyv_nlanes_s16 * 1);
+        npyv_s16  b2 = npyv_load_s16(src2 + npyv_nlanes_s16 * 1);
+        npyv_b16 c2 = npyv_cmplt_s16(a2, b2);
+#if 16 >= 32  // constant-false: everything down to the matching #endif is skipped
+        npyv_s16  a3 = npyv_load_s16(src1 + npyv_nlanes_s16 * 2);
+        npyv_s16  b3 = npyv_load_s16(src2 + npyv_nlanes_s16 * 2);
+        npyv_s16  a4 = npyv_load_s16(src1 + npyv_nlanes_s16 * 3);
+        npyv_s16  b4 = npyv_load_s16(src2 + npyv_nlanes_s16 * 3);
+        npyv_b16 c3 = npyv_cmplt_s16(a3, b3);
+        npyv_b16 c4 = npyv_cmplt_s16(a4, b4);
+#if 16 == 64
+        npyv_s16  a5 = npyv_load_s16(src1 + npyv_nlanes_s16 * 4);
+        npyv_s16  b5 = npyv_load_s16(src2 + npyv_nlanes_s16 * 4);
+        npyv_s16  a6 = npyv_load_s16(src1 + npyv_nlanes_s16 * 5);
+        npyv_s16  b6 = npyv_load_s16(src2 + npyv_nlanes_s16 * 5);
+        npyv_s16  a7 = npyv_load_s16(src1 + npyv_nlanes_s16 * 6);
+        npyv_s16  b7 = npyv_load_s16(src2 + npyv_nlanes_s16 * 6);
+        npyv_s16  a8 = npyv_load_s16(src1 + npyv_nlanes_s16 * 7);
+        npyv_s16  b8 = npyv_load_s16(src2 + npyv_nlanes_s16 * 7);
+        npyv_b16 c5 = npyv_cmplt_s16(a5, b5);
+        npyv_b16 c6 = npyv_cmplt_s16(a6, b6);
+        npyv_b16 c7 = npyv_cmplt_s16(a7, b7);
+        npyv_b16 c8 = npyv_cmplt_s16(a8, b8);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+        // Pack the 'c' mask vectors into a single u8 vector 'r'.
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16  // the only branch of this chain that is compiled
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s16 a = *src1;
+        const npyv_lanetype_s16 b = *src2;
+        *dst = a < b;
+    }
+}
+
+static void simd_binary_scalar1_less_s16(char **args, npy_intp len)
+{   // dst[i] = (scalar < src[i]) for `len` consecutive s16 elements; one result byte per element.
+    npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[0];  // args[0]: left operand, read once as a single value
+    npyv_lanetype_s16 *src   = (npyv_lanetype_s16 *) args[1];  // args[1]: right-operand array
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // args[2]: output bytes
+    const npyv_s16 a         = npyv_setall_s16(scalar);  // scalar as a vector operand for the compares below
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed into each vector result before the store
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 16 >= 8  // constant-true
+        npyv_s16  b1 = npyv_load_s16(src + npyv_nlanes_s16 * 0);
+        npyv_b16 c1 = npyv_cmplt_s16(a, b1);
+#if 16 >= 16  // constant-true: second group of npyv_nlanes_s16 elements
+        npyv_s16  b2 = npyv_load_s16(src + npyv_nlanes_s16 * 1);
+        npyv_b16 c2 = npyv_cmplt_s16(a, b2);
+#if 16 >= 32  // constant-false: everything down to the matching #endif is skipped
+        npyv_s16  b3 = npyv_load_s16(src + npyv_nlanes_s16 * 2);
+        npyv_s16  b4 = npyv_load_s16(src + npyv_nlanes_s16 * 3);
+        npyv_b16 c3 = npyv_cmplt_s16(a, b3);
+        npyv_b16 c4 = npyv_cmplt_s16(a, b4);
+#if 16 == 64
+        npyv_s16  b5 = npyv_load_s16(src + npyv_nlanes_s16 * 4);
+        npyv_s16  b6 = npyv_load_s16(src + npyv_nlanes_s16 * 5);
+        npyv_s16  b7 = npyv_load_s16(src + npyv_nlanes_s16 * 6);
+        npyv_s16  b8 = npyv_load_s16(src + npyv_nlanes_s16 * 7);
+        npyv_b16 c5 = npyv_cmplt_s16(a, b5);
+        npyv_b16 c6 = npyv_cmplt_s16(a, b6);
+        npyv_b16 c7 = npyv_cmplt_s16(a, b7);
+        npyv_b16 c8 = npyv_cmplt_s16(a, b8);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16  // the only branch of this chain that is compiled
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s16 b = *src;
+        *dst = scalar < b;
+    }
+}
+
+static void simd_binary_scalar2_less_s16(char **args, npy_intp len)
+{   // dst[i] = (src[i] < scalar) for `len` consecutive s16 elements; one result byte per element.
+    npyv_lanetype_s16 *src   = (npyv_lanetype_s16 *) args[0];  // args[0]: left-operand array
+    npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[1];  // args[1]: right operand, read once as a single value
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2];  // args[2]: output bytes
+    const npyv_s16 b         = npyv_setall_s16(scalar);  // scalar as a vector operand for the compares below
+    const npyv_u8 truemask     = npyv_setall_u8(0x1);  // ANDed into each vector result before the store
+    const int vstep            = npyv_nlanes_u8;  // elements handled per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 16 >= 8  // constant-true
+        npyv_s16  a1 = npyv_load_s16(src + npyv_nlanes_s16 * 0);
+        npyv_b16 c1 = npyv_cmplt_s16(a1, b);
+#if 16 >= 16  // constant-true: second group of npyv_nlanes_s16 elements
+        npyv_s16  a2 = npyv_load_s16(src + npyv_nlanes_s16 * 1);
+        npyv_b16 c2 = npyv_cmplt_s16(a2, b);
+#if 16 >= 32  // constant-false: everything down to the matching #endif is skipped
+        npyv_s16  a3 = npyv_load_s16(src + npyv_nlanes_s16 * 2);
+        npyv_s16  a4 = npyv_load_s16(src + npyv_nlanes_s16 * 3);
+        npyv_b16 c3 = npyv_cmplt_s16(a3, b);
+        npyv_b16 c4 = npyv_cmplt_s16(a4, b);
+#if 16 == 64
+        npyv_s16  a5 = npyv_load_s16(src + npyv_nlanes_s16 * 4);
+        npyv_s16  a6 = npyv_load_s16(src + npyv_nlanes_s16 * 5);
+        npyv_s16  a7 = npyv_load_s16(src + npyv_nlanes_s16 * 6);
+        npyv_s16  a8 = npyv_load_s16(src + npyv_nlanes_s16 * 7);
+        npyv_b16 c5 = npyv_cmplt_s16(a5, b);
+        npyv_b16 c6 = npyv_cmplt_s16(a6, b);
+        npyv_b16 c7 = npyv_cmplt_s16(a7, b);
+        npyv_b16 c8 = npyv_cmplt_s16(a8, b);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16  // the only branch of this chain that is compiled
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s16 a = *src;
+        *dst = a < scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 0) && 1)
+static void simd_binary_less_equal_s16(char **args, npy_intp len)
+{   // dst[i] = (src1[i] <= src2[i]) for `len` consecutive s16 elements; one result byte per element.
+    npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0];  // args[0]: left-operand array
+    npyv_lanetype_s16 *src2 = (npyv_lanetype_s16 *) args[1];  // args[1]: right-operand array
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2];  // args[2]: output bytes
+    const npyv_u8 truemask    = npyv_setall_u8(0x1);  // ANDed into each vector result before the store
+    const int vstep           = npyv_nlanes_u8;  // elements handled per vector iteration
+
+    // Unrolled so that each iteration fills one whole u8 result vector ('vstep' elements).
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 16 >= 8  // constant-true
+        npyv_s16  a1 = npyv_load_s16(src1 + npyv_nlanes_s16 * 0);
+        npyv_s16  b1 = npyv_load_s16(src2 + npyv_nlanes_s16 * 0);
+        npyv_b16 c1 = npyv_cmple_s16(a1, b1);
+#if 16 >= 16  // constant-true: second group of npyv_nlanes_s16 elements
+        npyv_s16  a2 = npyv_load_s16(src1 + npyv_nlanes_s16 * 1);
+        npyv_s16  b2 = npyv_load_s16(src2 + npyv_nlanes_s16 * 1);
+        npyv_b16 c2 = npyv_cmple_s16(a2, b2);
+#if 16 >= 32  // constant-false: everything down to the matching #endif is skipped
+        npyv_s16  a3 = npyv_load_s16(src1 + npyv_nlanes_s16 * 2);
+        npyv_s16  b3 = npyv_load_s16(src2 + npyv_nlanes_s16 * 2);
+        npyv_s16  a4 = npyv_load_s16(src1 + npyv_nlanes_s16 * 3);
+        npyv_s16  b4 = npyv_load_s16(src2 + npyv_nlanes_s16 * 3);
+        npyv_b16 c3 = npyv_cmple_s16(a3, b3);
+        npyv_b16 c4 = npyv_cmple_s16(a4, b4);
+#if 16 == 64
+        npyv_s16  a5 = npyv_load_s16(src1 + npyv_nlanes_s16 * 4);
+        npyv_s16  b5 = npyv_load_s16(src2 + npyv_nlanes_s16 * 4);
+        npyv_s16  a6 = npyv_load_s16(src1 + npyv_nlanes_s16 * 5);
+        npyv_s16  b6 = npyv_load_s16(src2 + npyv_nlanes_s16 * 5);
+        npyv_s16  a7 = npyv_load_s16(src1 + npyv_nlanes_s16 * 6);
+        npyv_s16  b7 = npyv_load_s16(src2 + npyv_nlanes_s16 * 6);
+        npyv_s16  a8 = npyv_load_s16(src1 + npyv_nlanes_s16 * 7);
+        npyv_s16  b8 = npyv_load_s16(src2 + npyv_nlanes_s16 * 7);
+        npyv_b16 c5 = npyv_cmple_s16(a5, b5);
+        npyv_b16 c6 = npyv_cmple_s16(a6, b6);
+        npyv_b16 c7 = npyv_cmple_s16(a7, b7);
+        npyv_b16 c8 = npyv_cmple_s16(a8, b8);
+#endif // 16 == 64
+#endif // 16 >= 32
+#endif // 16 >= 16
+#endif // 16 >= 8
+
+        // Pack the 'c' mask vectors into a single u8 vector 'r'.
+#if 16 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 16 == 16  // the only branch of this chain that is compiled
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 16 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 16 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s16 a = *src1;
+        const npyv_lanetype_s16 b = *src2;
+        *dst = a <= b;
+    }
+}
+
+static void simd_binary_scalar1_less_equal_s16(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `scalar <= x` for each of `len` consecutive s16 elements.
+     *
+     *   args[0] - pointer to the single s16 left operand
+     *   args[1] - s16 array supplying the right operands
+     *   args[2] - u8 result array, one byte (0 or 1) per element
+     */
+    const npyv_lanetype_s16 lhs = *(npyv_lanetype_s16 *) args[0];
+    npyv_lanetype_s16 *data     = (npyv_lanetype_s16 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_s16 vlhs         = npyv_setall_s16(lhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: two s16 vectors are compared against the broadcast
+    // scalar and packed into one u8 vector of results per pass.
+    while (len >= step) {
+        npyv_s16 y0 = npyv_load_s16(data + npyv_nlanes_s16 * 0);
+        npyv_s16 y1 = npyv_load_s16(data + npyv_nlanes_s16 * 1);
+        npyv_b16 m0 = npyv_cmple_s16(vlhs, y0);
+        npyv_b16 m1 = npyv_cmple_s16(vlhs, y1);
+
+        // Narrow the 16-bit masks to bytes; keep only bit 0 of each byte.
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
+        npyv_store_u8(out, npyv_and_u8(r, one));
+
+        len -= step;
+        data += step;
+        out += step;
+    }
+
+    // Scalar tail: whatever is left after the last full vector.
+    while (len > 0) {
+        *out++ = (lhs <= *data++);
+        --len;
+    }
+}
+
+static void simd_binary_scalar2_less_equal_s16(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `x <= scalar` for each of `len` consecutive s16 elements.
+     *
+     *   args[0] - s16 array supplying the left operands
+     *   args[1] - pointer to the single s16 right operand
+     *   args[2] - u8 result array, one byte (0 or 1) per element
+     */
+    npyv_lanetype_s16 *data     = (npyv_lanetype_s16 *) args[0];
+    const npyv_lanetype_s16 rhs = *(npyv_lanetype_s16 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_s16 vrhs         = npyv_setall_s16(rhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: two s16 vectors are compared against the broadcast
+    // scalar and packed into one u8 vector of results per pass.
+    while (len >= step) {
+        npyv_s16 x0 = npyv_load_s16(data + npyv_nlanes_s16 * 0);
+        npyv_s16 x1 = npyv_load_s16(data + npyv_nlanes_s16 * 1);
+        npyv_b16 m0 = npyv_cmple_s16(x0, vrhs);
+        npyv_b16 m1 = npyv_cmple_s16(x1, vrhs);
+
+        // Narrow the 16-bit masks to bytes; keep only bit 0 of each byte.
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(m0, m1));
+        npyv_store_u8(out, npyv_and_u8(r, one));
+
+        len -= step;
+        data += step;
+        out += step;
+    }
+
+    // Scalar tail: whatever is left after the last full vector.
+    while (len > 0) {
+        *out++ = (*data++ <= rhs);
+        --len;
+    }
+}
+#endif
+
+
+
+#line 28
+#line 35
+#if NPY_SIMD && !((1 || 0) && 0)
+static void simd_binary_equal_u32(char **args, npy_intp len)
+{
+    /*
+     * Element-wise `lhs == rhs` on two u32 arrays of `len` consecutive
+     * elements; one result byte (0 or 1) is written per element.
+     *
+     *   args[0], args[1] - left and right u32 operand arrays
+     *   args[2]          - u8 result array
+     */
+    npyv_lanetype_u32 *lhs = (npyv_lanetype_u32 *) args[0];
+    npyv_lanetype_u32 *rhs = (npyv_lanetype_u32 *) args[1];
+    npyv_lanetype_u8  *out = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int step         = npyv_nlanes_u8;
+
+    // Vector body: every pass loads four u32 vectors per operand, compares
+    // them, packs the four masks into one u8 vector of results, and
+    // advances all three pointers by npyv_nlanes_u8 elements.
+    while (len >= step) {
+        npyv_u32 x0 = npyv_load_u32(lhs + npyv_nlanes_u32 * 0);
+        npyv_u32 y0 = npyv_load_u32(rhs + npyv_nlanes_u32 * 0);
+        npyv_u32 x1 = npyv_load_u32(lhs + npyv_nlanes_u32 * 1);
+        npyv_u32 y1 = npyv_load_u32(rhs + npyv_nlanes_u32 * 1);
+        npyv_u32 x2 = npyv_load_u32(lhs + npyv_nlanes_u32 * 2);
+        npyv_u32 y2 = npyv_load_u32(rhs + npyv_nlanes_u32 * 2);
+        npyv_u32 x3 = npyv_load_u32(lhs + npyv_nlanes_u32 * 3);
+        npyv_u32 y3 = npyv_load_u32(rhs + npyv_nlanes_u32 * 3);
+        npyv_b32 m0 = npyv_cmpeq_u32(x0, y0);
+        npyv_b32 m1 = npyv_cmpeq_u32(x1, y1);
+        npyv_b32 m2 = npyv_cmpeq_u32(x2, y2);
+        npyv_b32 m3 = npyv_cmpeq_u32(x3, y3);
+
+        // Narrow the 32-bit masks to bytes; keep only bit 0 of each byte.
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(out, npyv_and_u8(r, one));
+
+        len -= step;
+        lhs += step;
+        rhs += step;
+        out += step;
+    }
+
+    // Scalar tail: whatever is left after the last full vector.
+    while (len > 0) {
+        *out++ = (*lhs++ == *rhs++);
+        --len;
+    }
+}
+
+static void simd_binary_scalar1_equal_u32(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `scalar == x` for each of `len` consecutive u32 elements.
+     *
+     *   args[0] - pointer to the single u32 left operand
+     *   args[1] - u32 array supplying the right operands
+     *   args[2] - u8 result array, one byte (0 or 1) per element
+     */
+    const npyv_lanetype_u32 lhs = *(npyv_lanetype_u32 *) args[0];
+    npyv_lanetype_u32 *data     = (npyv_lanetype_u32 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_u32 vlhs         = npyv_setall_u32(lhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: four u32 vectors are compared against the broadcast
+    // scalar and packed into one u8 vector of results per pass.
+    while (len >= step) {
+        npyv_u32 y0 = npyv_load_u32(data + npyv_nlanes_u32 * 0);
+        npyv_u32 y1 = npyv_load_u32(data + npyv_nlanes_u32 * 1);
+        npyv_u32 y2 = npyv_load_u32(data + npyv_nlanes_u32 * 2);
+        npyv_u32 y3 = npyv_load_u32(data + npyv_nlanes_u32 * 3);
+        npyv_b32 m0 = npyv_cmpeq_u32(vlhs, y0);
+        npyv_b32 m1 = npyv_cmpeq_u32(vlhs, y1);
+        npyv_b32 m2 = npyv_cmpeq_u32(vlhs, y2);
+        npyv_b32 m3 = npyv_cmpeq_u32(vlhs, y3);
+
+        // Narrow the 32-bit masks to bytes; keep only bit 0 of each byte.
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(out, npyv_and_u8(r, one));
+
+        len -= step;
+        data += step;
+        out += step;
+    }
+
+    // Scalar tail: whatever is left after the last full vector.
+    while (len > 0) {
+        *out++ = (lhs == *data++);
+        --len;
+    }
+}
+
+static void simd_binary_scalar2_equal_u32(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `x == scalar` for each of `len` consecutive u32 elements.
+     *
+     *   args[0] - u32 array supplying the left operands
+     *   args[1] - pointer to the single u32 right operand
+     *   args[2] - u8 result array, one byte (0 or 1) per element
+     */
+    npyv_lanetype_u32 *data     = (npyv_lanetype_u32 *) args[0];
+    const npyv_lanetype_u32 rhs = *(npyv_lanetype_u32 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_u32 vrhs         = npyv_setall_u32(rhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: four u32 vectors are compared against the broadcast
+    // scalar and packed into one u8 vector of results per pass.
+    while (len >= step) {
+        npyv_u32 x0 = npyv_load_u32(data + npyv_nlanes_u32 * 0);
+        npyv_u32 x1 = npyv_load_u32(data + npyv_nlanes_u32 * 1);
+        npyv_u32 x2 = npyv_load_u32(data + npyv_nlanes_u32 * 2);
+        npyv_u32 x3 = npyv_load_u32(data + npyv_nlanes_u32 * 3);
+        npyv_b32 m0 = npyv_cmpeq_u32(x0, vrhs);
+        npyv_b32 m1 = npyv_cmpeq_u32(x1, vrhs);
+        npyv_b32 m2 = npyv_cmpeq_u32(x2, vrhs);
+        npyv_b32 m3 = npyv_cmpeq_u32(x3, vrhs);
+
+        // Narrow the 32-bit masks to bytes; keep only bit 0 of each byte.
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(out, npyv_and_u8(r, one));
+
+        len -= step;
+        data += step;
+        out += step;
+    }
+
+    // Scalar tail: whatever is left after the last full vector.
+    while (len > 0) {
+        *out++ = (*data++ == rhs);
+        --len;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 1) && 0)
+static void simd_binary_not_equal_u32(char **args, npy_intp len)
+{
+    /*
+     * Element-wise `lhs != rhs` on two u32 arrays of `len` consecutive
+     * elements; one result byte (0 or 1) is written per element.
+     *
+     *   args[0], args[1] - left and right u32 operand arrays
+     *   args[2]          - u8 result array
+     */
+    npyv_lanetype_u32 *lhs = (npyv_lanetype_u32 *) args[0];
+    npyv_lanetype_u32 *rhs = (npyv_lanetype_u32 *) args[1];
+    npyv_lanetype_u8  *out = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int step         = npyv_nlanes_u8;
+
+    // Vector body: every pass loads four u32 vectors per operand, compares
+    // them, packs the four masks into one u8 vector of results, and
+    // advances all three pointers by npyv_nlanes_u8 elements.
+    while (len >= step) {
+        npyv_u32 x0 = npyv_load_u32(lhs + npyv_nlanes_u32 * 0);
+        npyv_u32 y0 = npyv_load_u32(rhs + npyv_nlanes_u32 * 0);
+        npyv_u32 x1 = npyv_load_u32(lhs + npyv_nlanes_u32 * 1);
+        npyv_u32 y1 = npyv_load_u32(rhs + npyv_nlanes_u32 * 1);
+        npyv_u32 x2 = npyv_load_u32(lhs + npyv_nlanes_u32 * 2);
+        npyv_u32 y2 = npyv_load_u32(rhs + npyv_nlanes_u32 * 2);
+        npyv_u32 x3 = npyv_load_u32(lhs + npyv_nlanes_u32 * 3);
+        npyv_u32 y3 = npyv_load_u32(rhs + npyv_nlanes_u32 * 3);
+        npyv_b32 m0 = npyv_cmpneq_u32(x0, y0);
+        npyv_b32 m1 = npyv_cmpneq_u32(x1, y1);
+        npyv_b32 m2 = npyv_cmpneq_u32(x2, y2);
+        npyv_b32 m3 = npyv_cmpneq_u32(x3, y3);
+
+        // Narrow the 32-bit masks to bytes; keep only bit 0 of each byte.
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(out, npyv_and_u8(r, one));
+
+        len -= step;
+        lhs += step;
+        rhs += step;
+        out += step;
+    }
+
+    // Scalar tail: whatever is left after the last full vector.
+    while (len > 0) {
+        *out++ = (*lhs++ != *rhs++);
+        --len;
+    }
+}
+
+static void simd_binary_scalar1_not_equal_u32(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `scalar != x` for each of `len` consecutive u32 elements.
+     *
+     *   args[0] - pointer to the single u32 left operand
+     *   args[1] - u32 array supplying the right operands
+     *   args[2] - u8 result array, one byte (0 or 1) per element
+     */
+    const npyv_lanetype_u32 lhs = *(npyv_lanetype_u32 *) args[0];
+    npyv_lanetype_u32 *data     = (npyv_lanetype_u32 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_u32 vlhs         = npyv_setall_u32(lhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: four u32 vectors are compared against the broadcast
+    // scalar and packed into one u8 vector of results per pass.
+    while (len >= step) {
+        npyv_u32 y0 = npyv_load_u32(data + npyv_nlanes_u32 * 0);
+        npyv_u32 y1 = npyv_load_u32(data + npyv_nlanes_u32 * 1);
+        npyv_u32 y2 = npyv_load_u32(data + npyv_nlanes_u32 * 2);
+        npyv_u32 y3 = npyv_load_u32(data + npyv_nlanes_u32 * 3);
+        npyv_b32 m0 = npyv_cmpneq_u32(vlhs, y0);
+        npyv_b32 m1 = npyv_cmpneq_u32(vlhs, y1);
+        npyv_b32 m2 = npyv_cmpneq_u32(vlhs, y2);
+        npyv_b32 m3 = npyv_cmpneq_u32(vlhs, y3);
+
+        // Narrow the 32-bit masks to bytes; keep only bit 0 of each byte.
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(out, npyv_and_u8(r, one));
+
+        len -= step;
+        data += step;
+        out += step;
+    }
+
+    // Scalar tail: whatever is left after the last full vector.
+    while (len > 0) {
+        *out++ = (lhs != *data++);
+        --len;
+    }
+}
+
+static void simd_binary_scalar2_not_equal_u32(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `x != scalar` for each of `len` consecutive u32 elements.
+     *
+     *   args[0] - u32 array supplying the left operands
+     *   args[1] - pointer to the single u32 right operand
+     *   args[2] - u8 result array, one byte (0 or 1) per element
+     */
+    npyv_lanetype_u32 *data     = (npyv_lanetype_u32 *) args[0];
+    const npyv_lanetype_u32 rhs = *(npyv_lanetype_u32 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_u32 vrhs         = npyv_setall_u32(rhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: four u32 vectors are compared against the broadcast
+    // scalar and packed into one u8 vector of results per pass.
+    while (len >= step) {
+        npyv_u32 x0 = npyv_load_u32(data + npyv_nlanes_u32 * 0);
+        npyv_u32 x1 = npyv_load_u32(data + npyv_nlanes_u32 * 1);
+        npyv_u32 x2 = npyv_load_u32(data + npyv_nlanes_u32 * 2);
+        npyv_u32 x3 = npyv_load_u32(data + npyv_nlanes_u32 * 3);
+        npyv_b32 m0 = npyv_cmpneq_u32(x0, vrhs);
+        npyv_b32 m1 = npyv_cmpneq_u32(x1, vrhs);
+        npyv_b32 m2 = npyv_cmpneq_u32(x2, vrhs);
+        npyv_b32 m3 = npyv_cmpneq_u32(x3, vrhs);
+
+        // Narrow the 32-bit masks to bytes; keep only bit 0 of each byte.
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(out, npyv_and_u8(r, one));
+
+        len -= step;
+        data += step;
+        out += step;
+    }
+
+    // Scalar tail: whatever is left after the last full vector.
+    while (len > 0) {
+        *out++ = (*data++ != rhs);
+        --len;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 0) && 0)
+static void simd_binary_less_u32(char **args, npy_intp len)
+{
+    /*
+     * Element-wise `lhs < rhs` on two u32 arrays of `len` consecutive
+     * elements; one result byte (0 or 1) is written per element.
+     *
+     *   args[0], args[1] - left and right u32 operand arrays
+     *   args[2]          - u8 result array
+     */
+    npyv_lanetype_u32 *lhs = (npyv_lanetype_u32 *) args[0];
+    npyv_lanetype_u32 *rhs = (npyv_lanetype_u32 *) args[1];
+    npyv_lanetype_u8  *out = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int step         = npyv_nlanes_u8;
+
+    // Vector body: every pass loads four u32 vectors per operand, compares
+    // them, packs the four masks into one u8 vector of results, and
+    // advances all three pointers by npyv_nlanes_u8 elements.
+    while (len >= step) {
+        npyv_u32 x0 = npyv_load_u32(lhs + npyv_nlanes_u32 * 0);
+        npyv_u32 y0 = npyv_load_u32(rhs + npyv_nlanes_u32 * 0);
+        npyv_u32 x1 = npyv_load_u32(lhs + npyv_nlanes_u32 * 1);
+        npyv_u32 y1 = npyv_load_u32(rhs + npyv_nlanes_u32 * 1);
+        npyv_u32 x2 = npyv_load_u32(lhs + npyv_nlanes_u32 * 2);
+        npyv_u32 y2 = npyv_load_u32(rhs + npyv_nlanes_u32 * 2);
+        npyv_u32 x3 = npyv_load_u32(lhs + npyv_nlanes_u32 * 3);
+        npyv_u32 y3 = npyv_load_u32(rhs + npyv_nlanes_u32 * 3);
+        npyv_b32 m0 = npyv_cmplt_u32(x0, y0);
+        npyv_b32 m1 = npyv_cmplt_u32(x1, y1);
+        npyv_b32 m2 = npyv_cmplt_u32(x2, y2);
+        npyv_b32 m3 = npyv_cmplt_u32(x3, y3);
+
+        // Narrow the 32-bit masks to bytes; keep only bit 0 of each byte.
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(out, npyv_and_u8(r, one));
+
+        len -= step;
+        lhs += step;
+        rhs += step;
+        out += step;
+    }
+
+    // Scalar tail: whatever is left after the last full vector.
+    while (len > 0) {
+        *out++ = (*lhs++ < *rhs++);
+        --len;
+    }
+}
+
+static void simd_binary_scalar1_less_u32(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `scalar < x` for each of `len` consecutive u32 elements.
+     *
+     *   args[0] - pointer to the single u32 left operand
+     *   args[1] - u32 array supplying the right operands
+     *   args[2] - u8 result array, one byte (0 or 1) per element
+     */
+    const npyv_lanetype_u32 lhs = *(npyv_lanetype_u32 *) args[0];
+    npyv_lanetype_u32 *data     = (npyv_lanetype_u32 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_u32 vlhs         = npyv_setall_u32(lhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: four u32 vectors are compared against the broadcast
+    // scalar and packed into one u8 vector of results per pass.
+    while (len >= step) {
+        npyv_u32 y0 = npyv_load_u32(data + npyv_nlanes_u32 * 0);
+        npyv_u32 y1 = npyv_load_u32(data + npyv_nlanes_u32 * 1);
+        npyv_u32 y2 = npyv_load_u32(data + npyv_nlanes_u32 * 2);
+        npyv_u32 y3 = npyv_load_u32(data + npyv_nlanes_u32 * 3);
+        npyv_b32 m0 = npyv_cmplt_u32(vlhs, y0);
+        npyv_b32 m1 = npyv_cmplt_u32(vlhs, y1);
+        npyv_b32 m2 = npyv_cmplt_u32(vlhs, y2);
+        npyv_b32 m3 = npyv_cmplt_u32(vlhs, y3);
+
+        // Narrow the 32-bit masks to bytes; keep only bit 0 of each byte.
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(out, npyv_and_u8(r, one));
+
+        len -= step;
+        data += step;
+        out += step;
+    }
+
+    // Scalar tail: whatever is left after the last full vector.
+    while (len > 0) {
+        *out++ = (lhs < *data++);
+        --len;
+    }
+}
+
+static void simd_binary_scalar2_less_u32(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `x < scalar` for each of `len` consecutive u32 elements.
+     *
+     *   args[0] - u32 array supplying the left operands
+     *   args[1] - pointer to the single u32 right operand
+     *   args[2] - u8 result array, one byte (0 or 1) per element
+     */
+    npyv_lanetype_u32 *data     = (npyv_lanetype_u32 *) args[0];
+    const npyv_lanetype_u32 rhs = *(npyv_lanetype_u32 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_u32 vrhs         = npyv_setall_u32(rhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: four u32 vectors are compared against the broadcast
+    // scalar and packed into one u8 vector of results per pass.
+    while (len >= step) {
+        npyv_u32 x0 = npyv_load_u32(data + npyv_nlanes_u32 * 0);
+        npyv_u32 x1 = npyv_load_u32(data + npyv_nlanes_u32 * 1);
+        npyv_u32 x2 = npyv_load_u32(data + npyv_nlanes_u32 * 2);
+        npyv_u32 x3 = npyv_load_u32(data + npyv_nlanes_u32 * 3);
+        npyv_b32 m0 = npyv_cmplt_u32(x0, vrhs);
+        npyv_b32 m1 = npyv_cmplt_u32(x1, vrhs);
+        npyv_b32 m2 = npyv_cmplt_u32(x2, vrhs);
+        npyv_b32 m3 = npyv_cmplt_u32(x3, vrhs);
+
+        // Narrow the 32-bit masks to bytes; keep only bit 0 of each byte.
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(out, npyv_and_u8(r, one));
+
+        len -= step;
+        data += step;
+        out += step;
+    }
+
+    // Scalar tail: whatever is left after the last full vector.
+    while (len > 0) {
+        *out++ = (*data++ < rhs);
+        --len;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 0) && 0)
+static void simd_binary_less_equal_u32(char **args, npy_intp len)
+{
+    /*
+     * Element-wise `lhs <= rhs` on two u32 arrays of `len` consecutive
+     * elements; one result byte (0 or 1) is written per element.
+     *
+     *   args[0], args[1] - left and right u32 operand arrays
+     *   args[2]          - u8 result array
+     */
+    npyv_lanetype_u32 *lhs = (npyv_lanetype_u32 *) args[0];
+    npyv_lanetype_u32 *rhs = (npyv_lanetype_u32 *) args[1];
+    npyv_lanetype_u8  *out = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int step         = npyv_nlanes_u8;
+
+    // Vector body: every pass loads four u32 vectors per operand, compares
+    // them, packs the four masks into one u8 vector of results, and
+    // advances all three pointers by npyv_nlanes_u8 elements.
+    while (len >= step) {
+        npyv_u32 x0 = npyv_load_u32(lhs + npyv_nlanes_u32 * 0);
+        npyv_u32 y0 = npyv_load_u32(rhs + npyv_nlanes_u32 * 0);
+        npyv_u32 x1 = npyv_load_u32(lhs + npyv_nlanes_u32 * 1);
+        npyv_u32 y1 = npyv_load_u32(rhs + npyv_nlanes_u32 * 1);
+        npyv_u32 x2 = npyv_load_u32(lhs + npyv_nlanes_u32 * 2);
+        npyv_u32 y2 = npyv_load_u32(rhs + npyv_nlanes_u32 * 2);
+        npyv_u32 x3 = npyv_load_u32(lhs + npyv_nlanes_u32 * 3);
+        npyv_u32 y3 = npyv_load_u32(rhs + npyv_nlanes_u32 * 3);
+        npyv_b32 m0 = npyv_cmple_u32(x0, y0);
+        npyv_b32 m1 = npyv_cmple_u32(x1, y1);
+        npyv_b32 m2 = npyv_cmple_u32(x2, y2);
+        npyv_b32 m3 = npyv_cmple_u32(x3, y3);
+
+        // Narrow the 32-bit masks to bytes; keep only bit 0 of each byte.
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(out, npyv_and_u8(r, one));
+
+        len -= step;
+        lhs += step;
+        rhs += step;
+        out += step;
+    }
+
+    // Scalar tail: whatever is left after the last full vector.
+    while (len > 0) {
+        *out++ = (*lhs++ <= *rhs++);
+        --len;
+    }
+}
+
+static void simd_binary_scalar1_less_equal_u32(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `scalar <= x` for each of `len` consecutive u32 elements.
+     *
+     *   args[0] - pointer to the single u32 left operand
+     *   args[1] - u32 array supplying the right operands
+     *   args[2] - u8 result array, one byte (0 or 1) per element
+     */
+    const npyv_lanetype_u32 lhs = *(npyv_lanetype_u32 *) args[0];
+    npyv_lanetype_u32 *data     = (npyv_lanetype_u32 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_u32 vlhs         = npyv_setall_u32(lhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: four u32 vectors are compared against the broadcast
+    // scalar and packed into one u8 vector of results per pass.
+    while (len >= step) {
+        npyv_u32 y0 = npyv_load_u32(data + npyv_nlanes_u32 * 0);
+        npyv_u32 y1 = npyv_load_u32(data + npyv_nlanes_u32 * 1);
+        npyv_u32 y2 = npyv_load_u32(data + npyv_nlanes_u32 * 2);
+        npyv_u32 y3 = npyv_load_u32(data + npyv_nlanes_u32 * 3);
+        npyv_b32 m0 = npyv_cmple_u32(vlhs, y0);
+        npyv_b32 m1 = npyv_cmple_u32(vlhs, y1);
+        npyv_b32 m2 = npyv_cmple_u32(vlhs, y2);
+        npyv_b32 m3 = npyv_cmple_u32(vlhs, y3);
+
+        // Narrow the 32-bit masks to bytes; keep only bit 0 of each byte.
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(out, npyv_and_u8(r, one));
+
+        len -= step;
+        data += step;
+        out += step;
+    }
+
+    // Scalar tail: whatever is left after the last full vector.
+    while (len > 0) {
+        *out++ = (lhs <= *data++);
+        --len;
+    }
+}
+
+static void simd_binary_scalar2_less_equal_u32(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `x <= scalar` for each of `len` consecutive u32 elements.
+     *
+     *   args[0] - u32 array supplying the left operands
+     *   args[1] - pointer to the single u32 right operand
+     *   args[2] - u8 result array, one byte (0 or 1) per element
+     */
+    npyv_lanetype_u32 *data     = (npyv_lanetype_u32 *) args[0];
+    const npyv_lanetype_u32 rhs = *(npyv_lanetype_u32 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_u32 vrhs         = npyv_setall_u32(rhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: four u32 vectors are compared against the broadcast
+    // scalar and packed into one u8 vector of results per pass.
+    while (len >= step) {
+        npyv_u32 x0 = npyv_load_u32(data + npyv_nlanes_u32 * 0);
+        npyv_u32 x1 = npyv_load_u32(data + npyv_nlanes_u32 * 1);
+        npyv_u32 x2 = npyv_load_u32(data + npyv_nlanes_u32 * 2);
+        npyv_u32 x3 = npyv_load_u32(data + npyv_nlanes_u32 * 3);
+        npyv_b32 m0 = npyv_cmple_u32(x0, vrhs);
+        npyv_b32 m1 = npyv_cmple_u32(x1, vrhs);
+        npyv_b32 m2 = npyv_cmple_u32(x2, vrhs);
+        npyv_b32 m3 = npyv_cmple_u32(x3, vrhs);
+
+        // Narrow the 32-bit masks to bytes; keep only bit 0 of each byte.
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(out, npyv_and_u8(r, one));
+
+        len -= step;
+        data += step;
+        out += step;
+    }
+
+    // Scalar tail: whatever is left after the last full vector.
+    while (len > 0) {
+        *out++ = (*data++ <= rhs);
+        --len;
+    }
+}
+#endif
+
+
+
+#line 28
+#line 35
+#if NPY_SIMD && !((1 || 0) && 1)
+static void simd_binary_equal_s32(char **args, npy_intp len)
+{
+    /*
+     * Element-wise `lhs[i] == rhs[i]` over `len` consecutive s32 values.
+     * args[0] and args[1] are the two inputs; args[2] receives one byte per
+     * element holding 0 or 1.
+     */
+    npyv_lanetype_s32 *lhs = (npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 *rhs = (npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_u8 *res  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int step         = npyv_nlanes_u8;
+
+    // Vector path: every pass emits one full u8 vector, assembled from
+    // four s32 vector comparisons.
+    while (len >= step) {
+        npyv_s32 x0 = npyv_load_s32(lhs);
+        npyv_s32 y0 = npyv_load_s32(rhs);
+        npyv_s32 x1 = npyv_load_s32(lhs + npyv_nlanes_s32);
+        npyv_s32 y1 = npyv_load_s32(rhs + npyv_nlanes_s32);
+        npyv_s32 x2 = npyv_load_s32(lhs + npyv_nlanes_s32 * 2);
+        npyv_s32 y2 = npyv_load_s32(rhs + npyv_nlanes_s32 * 2);
+        npyv_s32 x3 = npyv_load_s32(lhs + npyv_nlanes_s32 * 3);
+        npyv_s32 y3 = npyv_load_s32(rhs + npyv_nlanes_s32 * 3);
+
+        npyv_b32 m0 = npyv_cmpeq_s32(x0, y0);
+        npyv_b32 m1 = npyv_cmpeq_s32(x1, y1);
+        npyv_b32 m2 = npyv_cmpeq_s32(x2, y2);
+        npyv_b32 m3 = npyv_cmpeq_s32(x3, y3);
+
+        // Pack the masks to bytes and keep only bit 0 so that every stored
+        // byte is 0 or 1.
+        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(res, npyv_and_u8(packed, one));
+
+        len -= step;
+        lhs += step;
+        rhs += step;
+        res += step;
+    }
+
+    // Scalar tail: fewer than `step` elements remain.
+    for (npy_intp i = 0; i < len; ++i) {
+        res[i] = lhs[i] == rhs[i];
+    }
+}
+
+static void simd_binary_scalar1_equal_s32(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `fixed == x[i]` for `len` consecutive s32 values read from
+     * args[1], where `fixed` is the single s32 value stored at args[0].
+     * One byte holding 0 or 1 is written to args[2] per element.
+     */
+    npyv_lanetype_s32 fixed = *(npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 *data = (npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_u8 *res   = (npyv_lanetype_u8 *) args[2];
+    const npyv_s32 fixed_v  = npyv_setall_s32(fixed);
+    const npyv_u8 one       = npyv_setall_u8(0x1);
+    const int step          = npyv_nlanes_u8;
+
+    // Vector path: each pass compares the broadcast value against four
+    // s32 vectors and stores the packed outcome as one u8 vector.
+    while (len >= step) {
+        npyv_s32 y0 = npyv_load_s32(data);
+        npyv_s32 y1 = npyv_load_s32(data + npyv_nlanes_s32);
+        npyv_s32 y2 = npyv_load_s32(data + npyv_nlanes_s32 * 2);
+        npyv_s32 y3 = npyv_load_s32(data + npyv_nlanes_s32 * 3);
+
+        npyv_b32 m0 = npyv_cmpeq_s32(fixed_v, y0);
+        npyv_b32 m1 = npyv_cmpeq_s32(fixed_v, y1);
+        npyv_b32 m2 = npyv_cmpeq_s32(fixed_v, y2);
+        npyv_b32 m3 = npyv_cmpeq_s32(fixed_v, y3);
+
+        // Pack the masks to bytes and keep only bit 0 so that every stored
+        // byte is 0 or 1.
+        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(res, npyv_and_u8(packed, one));
+
+        len  -= step;
+        data += step;
+        res  += step;
+    }
+
+    // Scalar tail: fewer than `step` elements remain.
+    for (npy_intp i = 0; i < len; ++i) {
+        res[i] = fixed == data[i];
+    }
+}
+
+static void simd_binary_scalar2_equal_s32(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `x[i] == fixed` for `len` consecutive s32 values read from
+     * args[0], where `fixed` is the single s32 value stored at args[1].
+     * One byte holding 0 or 1 is written to args[2] per element.
+     */
+    npyv_lanetype_s32 *data = (npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 fixed = *(npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_u8 *res   = (npyv_lanetype_u8 *) args[2];
+    const npyv_s32 fixed_v  = npyv_setall_s32(fixed);
+    const npyv_u8 one       = npyv_setall_u8(0x1);
+    const int step          = npyv_nlanes_u8;
+
+    // Vector path: each pass compares four s32 vectors against the
+    // broadcast value and stores the packed outcome as one u8 vector.
+    while (len >= step) {
+        npyv_s32 x0 = npyv_load_s32(data);
+        npyv_s32 x1 = npyv_load_s32(data + npyv_nlanes_s32);
+        npyv_s32 x2 = npyv_load_s32(data + npyv_nlanes_s32 * 2);
+        npyv_s32 x3 = npyv_load_s32(data + npyv_nlanes_s32 * 3);
+
+        npyv_b32 m0 = npyv_cmpeq_s32(x0, fixed_v);
+        npyv_b32 m1 = npyv_cmpeq_s32(x1, fixed_v);
+        npyv_b32 m2 = npyv_cmpeq_s32(x2, fixed_v);
+        npyv_b32 m3 = npyv_cmpeq_s32(x3, fixed_v);
+
+        // Pack the masks to bytes and keep only bit 0 so that every stored
+        // byte is 0 or 1.
+        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(res, npyv_and_u8(packed, one));
+
+        len  -= step;
+        data += step;
+        res  += step;
+    }
+
+    // Scalar tail: fewer than `step` elements remain.
+    for (npy_intp i = 0; i < len; ++i) {
+        res[i] = data[i] == fixed;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 1) && 1)
+static void simd_binary_not_equal_s32(char **args, npy_intp len)
+{
+    /*
+     * Element-wise `lhs[i] != rhs[i]` over `len` consecutive s32 values.
+     * args[0] and args[1] are the two inputs; args[2] receives one byte per
+     * element holding 0 or 1.
+     */
+    npyv_lanetype_s32 *lhs = (npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 *rhs = (npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_u8 *res  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int step         = npyv_nlanes_u8;
+
+    // Vector path: every pass emits one full u8 vector, assembled from
+    // four s32 vector comparisons.
+    while (len >= step) {
+        npyv_s32 x0 = npyv_load_s32(lhs);
+        npyv_s32 y0 = npyv_load_s32(rhs);
+        npyv_s32 x1 = npyv_load_s32(lhs + npyv_nlanes_s32);
+        npyv_s32 y1 = npyv_load_s32(rhs + npyv_nlanes_s32);
+        npyv_s32 x2 = npyv_load_s32(lhs + npyv_nlanes_s32 * 2);
+        npyv_s32 y2 = npyv_load_s32(rhs + npyv_nlanes_s32 * 2);
+        npyv_s32 x3 = npyv_load_s32(lhs + npyv_nlanes_s32 * 3);
+        npyv_s32 y3 = npyv_load_s32(rhs + npyv_nlanes_s32 * 3);
+
+        npyv_b32 m0 = npyv_cmpneq_s32(x0, y0);
+        npyv_b32 m1 = npyv_cmpneq_s32(x1, y1);
+        npyv_b32 m2 = npyv_cmpneq_s32(x2, y2);
+        npyv_b32 m3 = npyv_cmpneq_s32(x3, y3);
+
+        // Pack the masks to bytes and keep only bit 0 so that every stored
+        // byte is 0 or 1.
+        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(res, npyv_and_u8(packed, one));
+
+        len -= step;
+        lhs += step;
+        rhs += step;
+        res += step;
+    }
+
+    // Scalar tail: fewer than `step` elements remain.
+    for (npy_intp i = 0; i < len; ++i) {
+        res[i] = lhs[i] != rhs[i];
+    }
+}
+
+static void simd_binary_scalar1_not_equal_s32(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `fixed != x[i]` for `len` consecutive s32 values read from
+     * args[1], where `fixed` is the single s32 value stored at args[0].
+     * One byte holding 0 or 1 is written to args[2] per element.
+     */
+    npyv_lanetype_s32 fixed = *(npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 *data = (npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_u8 *res   = (npyv_lanetype_u8 *) args[2];
+    const npyv_s32 fixed_v  = npyv_setall_s32(fixed);
+    const npyv_u8 one       = npyv_setall_u8(0x1);
+    const int step          = npyv_nlanes_u8;
+
+    // Vector path: each pass compares the broadcast value against four
+    // s32 vectors and stores the packed outcome as one u8 vector.
+    while (len >= step) {
+        npyv_s32 y0 = npyv_load_s32(data);
+        npyv_s32 y1 = npyv_load_s32(data + npyv_nlanes_s32);
+        npyv_s32 y2 = npyv_load_s32(data + npyv_nlanes_s32 * 2);
+        npyv_s32 y3 = npyv_load_s32(data + npyv_nlanes_s32 * 3);
+
+        npyv_b32 m0 = npyv_cmpneq_s32(fixed_v, y0);
+        npyv_b32 m1 = npyv_cmpneq_s32(fixed_v, y1);
+        npyv_b32 m2 = npyv_cmpneq_s32(fixed_v, y2);
+        npyv_b32 m3 = npyv_cmpneq_s32(fixed_v, y3);
+
+        // Pack the masks to bytes and keep only bit 0 so that every stored
+        // byte is 0 or 1.
+        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(res, npyv_and_u8(packed, one));
+
+        len  -= step;
+        data += step;
+        res  += step;
+    }
+
+    // Scalar tail: fewer than `step` elements remain.
+    for (npy_intp i = 0; i < len; ++i) {
+        res[i] = fixed != data[i];
+    }
+}
+
+static void simd_binary_scalar2_not_equal_s32(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `x[i] != fixed` for `len` consecutive s32 values read from
+     * args[0], where `fixed` is the single s32 value stored at args[1].
+     * One byte holding 0 or 1 is written to args[2] per element.
+     */
+    npyv_lanetype_s32 *data = (npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 fixed = *(npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_u8 *res   = (npyv_lanetype_u8 *) args[2];
+    const npyv_s32 fixed_v  = npyv_setall_s32(fixed);
+    const npyv_u8 one       = npyv_setall_u8(0x1);
+    const int step          = npyv_nlanes_u8;
+
+    // Vector path: each pass compares four s32 vectors against the
+    // broadcast value and stores the packed outcome as one u8 vector.
+    while (len >= step) {
+        npyv_s32 x0 = npyv_load_s32(data);
+        npyv_s32 x1 = npyv_load_s32(data + npyv_nlanes_s32);
+        npyv_s32 x2 = npyv_load_s32(data + npyv_nlanes_s32 * 2);
+        npyv_s32 x3 = npyv_load_s32(data + npyv_nlanes_s32 * 3);
+
+        npyv_b32 m0 = npyv_cmpneq_s32(x0, fixed_v);
+        npyv_b32 m1 = npyv_cmpneq_s32(x1, fixed_v);
+        npyv_b32 m2 = npyv_cmpneq_s32(x2, fixed_v);
+        npyv_b32 m3 = npyv_cmpneq_s32(x3, fixed_v);
+
+        // Pack the masks to bytes and keep only bit 0 so that every stored
+        // byte is 0 or 1.
+        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(res, npyv_and_u8(packed, one));
+
+        len  -= step;
+        data += step;
+        res  += step;
+    }
+
+    // Scalar tail: fewer than `step` elements remain.
+    for (npy_intp i = 0; i < len; ++i) {
+        res[i] = data[i] != fixed;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 0) && 1)
+static void simd_binary_less_s32(char **args, npy_intp len)
+{
+    /*
+     * Element-wise `lhs[i] < rhs[i]` over `len` consecutive s32 values.
+     * args[0] and args[1] are the two inputs; args[2] receives one byte per
+     * element holding 0 or 1.
+     */
+    npyv_lanetype_s32 *lhs = (npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 *rhs = (npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_u8 *res  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int step         = npyv_nlanes_u8;
+
+    // Vector path: every pass emits one full u8 vector, assembled from
+    // four s32 vector comparisons.
+    while (len >= step) {
+        npyv_s32 x0 = npyv_load_s32(lhs);
+        npyv_s32 y0 = npyv_load_s32(rhs);
+        npyv_s32 x1 = npyv_load_s32(lhs + npyv_nlanes_s32);
+        npyv_s32 y1 = npyv_load_s32(rhs + npyv_nlanes_s32);
+        npyv_s32 x2 = npyv_load_s32(lhs + npyv_nlanes_s32 * 2);
+        npyv_s32 y2 = npyv_load_s32(rhs + npyv_nlanes_s32 * 2);
+        npyv_s32 x3 = npyv_load_s32(lhs + npyv_nlanes_s32 * 3);
+        npyv_s32 y3 = npyv_load_s32(rhs + npyv_nlanes_s32 * 3);
+
+        npyv_b32 m0 = npyv_cmplt_s32(x0, y0);
+        npyv_b32 m1 = npyv_cmplt_s32(x1, y1);
+        npyv_b32 m2 = npyv_cmplt_s32(x2, y2);
+        npyv_b32 m3 = npyv_cmplt_s32(x3, y3);
+
+        // Pack the masks to bytes and keep only bit 0 so that every stored
+        // byte is 0 or 1.
+        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(res, npyv_and_u8(packed, one));
+
+        len -= step;
+        lhs += step;
+        rhs += step;
+        res += step;
+    }
+
+    // Scalar tail: fewer than `step` elements remain.
+    for (npy_intp i = 0; i < len; ++i) {
+        res[i] = lhs[i] < rhs[i];
+    }
+}
+
+static void simd_binary_scalar1_less_s32(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `fixed < x[i]` for `len` consecutive s32 values read from
+     * args[1], where `fixed` is the single s32 value stored at args[0].
+     * One byte holding 0 or 1 is written to args[2] per element.
+     */
+    npyv_lanetype_s32 fixed = *(npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 *data = (npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_u8 *res   = (npyv_lanetype_u8 *) args[2];
+    const npyv_s32 fixed_v  = npyv_setall_s32(fixed);
+    const npyv_u8 one       = npyv_setall_u8(0x1);
+    const int step          = npyv_nlanes_u8;
+
+    // Vector path: each pass compares the broadcast value against four
+    // s32 vectors and stores the packed outcome as one u8 vector.
+    while (len >= step) {
+        npyv_s32 y0 = npyv_load_s32(data);
+        npyv_s32 y1 = npyv_load_s32(data + npyv_nlanes_s32);
+        npyv_s32 y2 = npyv_load_s32(data + npyv_nlanes_s32 * 2);
+        npyv_s32 y3 = npyv_load_s32(data + npyv_nlanes_s32 * 3);
+
+        npyv_b32 m0 = npyv_cmplt_s32(fixed_v, y0);
+        npyv_b32 m1 = npyv_cmplt_s32(fixed_v, y1);
+        npyv_b32 m2 = npyv_cmplt_s32(fixed_v, y2);
+        npyv_b32 m3 = npyv_cmplt_s32(fixed_v, y3);
+
+        // Pack the masks to bytes and keep only bit 0 so that every stored
+        // byte is 0 or 1.
+        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(res, npyv_and_u8(packed, one));
+
+        len  -= step;
+        data += step;
+        res  += step;
+    }
+
+    // Scalar tail: fewer than `step` elements remain.
+    for (npy_intp i = 0; i < len; ++i) {
+        res[i] = fixed < data[i];
+    }
+}
+
+static void simd_binary_scalar2_less_s32(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `x[i] < fixed` for `len` consecutive s32 values read from
+     * args[0], where `fixed` is the single s32 value stored at args[1].
+     * One byte holding 0 or 1 is written to args[2] per element.
+     */
+    npyv_lanetype_s32 *data = (npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 fixed = *(npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_u8 *res   = (npyv_lanetype_u8 *) args[2];
+    const npyv_s32 fixed_v  = npyv_setall_s32(fixed);
+    const npyv_u8 one       = npyv_setall_u8(0x1);
+    const int step          = npyv_nlanes_u8;
+
+    // Vector path: each pass compares four s32 vectors against the
+    // broadcast value and stores the packed outcome as one u8 vector.
+    while (len >= step) {
+        npyv_s32 x0 = npyv_load_s32(data);
+        npyv_s32 x1 = npyv_load_s32(data + npyv_nlanes_s32);
+        npyv_s32 x2 = npyv_load_s32(data + npyv_nlanes_s32 * 2);
+        npyv_s32 x3 = npyv_load_s32(data + npyv_nlanes_s32 * 3);
+
+        npyv_b32 m0 = npyv_cmplt_s32(x0, fixed_v);
+        npyv_b32 m1 = npyv_cmplt_s32(x1, fixed_v);
+        npyv_b32 m2 = npyv_cmplt_s32(x2, fixed_v);
+        npyv_b32 m3 = npyv_cmplt_s32(x3, fixed_v);
+
+        // Pack the masks to bytes and keep only bit 0 so that every stored
+        // byte is 0 or 1.
+        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(res, npyv_and_u8(packed, one));
+
+        len  -= step;
+        data += step;
+        res  += step;
+    }
+
+    // Scalar tail: fewer than `step` elements remain.
+    for (npy_intp i = 0; i < len; ++i) {
+        res[i] = data[i] < fixed;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 0) && 1)
+static void simd_binary_less_equal_s32(char **args, npy_intp len)
+{
+    /*
+     * Element-wise `lhs[i] <= rhs[i]` over `len` consecutive s32 values.
+     * args[0] and args[1] are the two inputs; args[2] receives one byte per
+     * element holding 0 or 1.
+     */
+    npyv_lanetype_s32 *lhs = (npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 *rhs = (npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_u8 *res  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int step         = npyv_nlanes_u8;
+
+    // Vector path: every pass emits one full u8 vector, assembled from
+    // four s32 vector comparisons.
+    while (len >= step) {
+        npyv_s32 x0 = npyv_load_s32(lhs);
+        npyv_s32 y0 = npyv_load_s32(rhs);
+        npyv_s32 x1 = npyv_load_s32(lhs + npyv_nlanes_s32);
+        npyv_s32 y1 = npyv_load_s32(rhs + npyv_nlanes_s32);
+        npyv_s32 x2 = npyv_load_s32(lhs + npyv_nlanes_s32 * 2);
+        npyv_s32 y2 = npyv_load_s32(rhs + npyv_nlanes_s32 * 2);
+        npyv_s32 x3 = npyv_load_s32(lhs + npyv_nlanes_s32 * 3);
+        npyv_s32 y3 = npyv_load_s32(rhs + npyv_nlanes_s32 * 3);
+
+        npyv_b32 m0 = npyv_cmple_s32(x0, y0);
+        npyv_b32 m1 = npyv_cmple_s32(x1, y1);
+        npyv_b32 m2 = npyv_cmple_s32(x2, y2);
+        npyv_b32 m3 = npyv_cmple_s32(x3, y3);
+
+        // Pack the masks to bytes and keep only bit 0 so that every stored
+        // byte is 0 or 1.
+        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(res, npyv_and_u8(packed, one));
+
+        len -= step;
+        lhs += step;
+        rhs += step;
+        res += step;
+    }
+
+    // Scalar tail: fewer than `step` elements remain.
+    for (npy_intp i = 0; i < len; ++i) {
+        res[i] = lhs[i] <= rhs[i];
+    }
+}
+
+static void simd_binary_scalar1_less_equal_s32(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `fixed <= x[i]` for `len` consecutive s32 values read from
+     * args[1], where `fixed` is the single s32 value stored at args[0].
+     * One byte holding 0 or 1 is written to args[2] per element.
+     */
+    npyv_lanetype_s32 fixed = *(npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 *data = (npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_u8 *res   = (npyv_lanetype_u8 *) args[2];
+    const npyv_s32 fixed_v  = npyv_setall_s32(fixed);
+    const npyv_u8 one       = npyv_setall_u8(0x1);
+    const int step          = npyv_nlanes_u8;
+
+    // Vector path: each pass compares the broadcast value against four
+    // s32 vectors and stores the packed outcome as one u8 vector.
+    while (len >= step) {
+        npyv_s32 y0 = npyv_load_s32(data);
+        npyv_s32 y1 = npyv_load_s32(data + npyv_nlanes_s32);
+        npyv_s32 y2 = npyv_load_s32(data + npyv_nlanes_s32 * 2);
+        npyv_s32 y3 = npyv_load_s32(data + npyv_nlanes_s32 * 3);
+
+        npyv_b32 m0 = npyv_cmple_s32(fixed_v, y0);
+        npyv_b32 m1 = npyv_cmple_s32(fixed_v, y1);
+        npyv_b32 m2 = npyv_cmple_s32(fixed_v, y2);
+        npyv_b32 m3 = npyv_cmple_s32(fixed_v, y3);
+
+        // Pack the masks to bytes and keep only bit 0 so that every stored
+        // byte is 0 or 1.
+        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(res, npyv_and_u8(packed, one));
+
+        len  -= step;
+        data += step;
+        res  += step;
+    }
+
+    // Scalar tail: fewer than `step` elements remain.
+    for (npy_intp i = 0; i < len; ++i) {
+        res[i] = fixed <= data[i];
+    }
+}
+
+static void simd_binary_scalar2_less_equal_s32(char **args, npy_intp len)
+{
+    /*
+     * Evaluates `x[i] <= fixed` for `len` consecutive s32 values read from
+     * args[0], where `fixed` is the single s32 value stored at args[1].
+     * One byte holding 0 or 1 is written to args[2] per element.
+     */
+    npyv_lanetype_s32 *data = (npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 fixed = *(npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_u8 *res   = (npyv_lanetype_u8 *) args[2];
+    const npyv_s32 fixed_v  = npyv_setall_s32(fixed);
+    const npyv_u8 one       = npyv_setall_u8(0x1);
+    const int step          = npyv_nlanes_u8;
+
+    // Vector path: each pass compares four s32 vectors against the
+    // broadcast value and stores the packed outcome as one u8 vector.
+    while (len >= step) {
+        npyv_s32 x0 = npyv_load_s32(data);
+        npyv_s32 x1 = npyv_load_s32(data + npyv_nlanes_s32);
+        npyv_s32 x2 = npyv_load_s32(data + npyv_nlanes_s32 * 2);
+        npyv_s32 x3 = npyv_load_s32(data + npyv_nlanes_s32 * 3);
+
+        npyv_b32 m0 = npyv_cmple_s32(x0, fixed_v);
+        npyv_b32 m1 = npyv_cmple_s32(x1, fixed_v);
+        npyv_b32 m2 = npyv_cmple_s32(x2, fixed_v);
+        npyv_b32 m3 = npyv_cmple_s32(x3, fixed_v);
+
+        // Pack the masks to bytes and keep only bit 0 so that every stored
+        // byte is 0 or 1.
+        npyv_u8 packed = npyv_cvt_u8_b8(npyv_pack_b8_b32(m0, m1, m2, m3));
+        npyv_store_u8(res, npyv_and_u8(packed, one));
+
+        len  -= step;
+        data += step;
+        res  += step;
+    }
+
+    // Scalar tail: fewer than `step` elements remain.
+    for (npy_intp i = 0; i < len; ++i) {
+        res[i] = data[i] <= fixed;
+    }
+}
+#endif
+
+
+
+#line 28
+#line 35
+#if NPY_SIMD && !((1 || 0) && 0)
+static void simd_binary_equal_u64(char **args, npy_intp len)
+{
+    /*
+     * Element-wise `lhs[i] == rhs[i]` over `len` consecutive u64 values.
+     * args[0] and args[1] are the two inputs; args[2] receives one byte per
+     * element holding 0 or 1.
+     */
+    npyv_lanetype_u64 *lhs = (npyv_lanetype_u64 *) args[0];
+    npyv_lanetype_u64 *rhs = (npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u8 *res  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int step         = npyv_nlanes_u8;
+
+    // Vector path: every pass emits one full u8 vector, assembled from
+    // eight u64 vector comparisons.
+    while (len >= step) {
+        npyv_u64 x0 = npyv_load_u64(lhs);
+        npyv_u64 y0 = npyv_load_u64(rhs);
+        npyv_u64 x1 = npyv_load_u64(lhs + npyv_nlanes_u64);
+        npyv_u64 y1 = npyv_load_u64(rhs + npyv_nlanes_u64);
+        npyv_u64 x2 = npyv_load_u64(lhs + npyv_nlanes_u64 * 2);
+        npyv_u64 y2 = npyv_load_u64(rhs + npyv_nlanes_u64 * 2);
+        npyv_u64 x3 = npyv_load_u64(lhs + npyv_nlanes_u64 * 3);
+        npyv_u64 y3 = npyv_load_u64(rhs + npyv_nlanes_u64 * 3);
+        npyv_u64 x4 = npyv_load_u64(lhs + npyv_nlanes_u64 * 4);
+        npyv_u64 y4 = npyv_load_u64(rhs + npyv_nlanes_u64 * 4);
+        npyv_u64 x5 = npyv_load_u64(lhs + npyv_nlanes_u64 * 5);
+        npyv_u64 y5 = npyv_load_u64(rhs + npyv_nlanes_u64 * 5);
+        npyv_u64 x6 = npyv_load_u64(lhs + npyv_nlanes_u64 * 6);
+        npyv_u64 y6 = npyv_load_u64(rhs + npyv_nlanes_u64 * 6);
+        npyv_u64 x7 = npyv_load_u64(lhs + npyv_nlanes_u64 * 7);
+        npyv_u64 y7 = npyv_load_u64(rhs + npyv_nlanes_u64 * 7);
+
+        npyv_b64 m0 = npyv_cmpeq_u64(x0, y0);
+        npyv_b64 m1 = npyv_cmpeq_u64(x1, y1);
+        npyv_b64 m2 = npyv_cmpeq_u64(x2, y2);
+        npyv_b64 m3 = npyv_cmpeq_u64(x3, y3);
+        npyv_b64 m4 = npyv_cmpeq_u64(x4, y4);
+        npyv_b64 m5 = npyv_cmpeq_u64(x5, y5);
+        npyv_b64 m6 = npyv_cmpeq_u64(x6, y6);
+        npyv_b64 m7 = npyv_cmpeq_u64(x7, y7);
+
+        // Pack the masks to bytes and keep only bit 0 so that every stored
+        // byte is 0 or 1.
+        npyv_u8 packed = npyv_cvt_u8_b8(
+            npyv_pack_b8_b64(m0, m1, m2, m3, m4, m5, m6, m7));
+        npyv_store_u8(res, npyv_and_u8(packed, one));
+
+        len -= step;
+        lhs += step;
+        rhs += step;
+        res += step;
+    }
+
+    // Scalar tail: fewer than `step` elements remain.
+    for (npy_intp i = 0; i < len; ++i) {
+        res[i] = lhs[i] == rhs[i];
+    }
+}
+
+static void simd_binary_scalar1_equal_u64(char **args, npy_intp len)
+{
+    // Computes `scalar == rhs[i]` for `len` elements. The left operand is the
+    // single value read through args[0], the right operand is the array at
+    // args[1], and each outcome is written as a 0/1 byte through args[2].
+    const npyv_lanetype_u64 lhs = *(npyv_lanetype_u64 *) args[0];
+    npyv_lanetype_u64 *rhs      = (npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_u64 vlhs         = npyv_setall_u64(lhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: each pass loads eight vectors from the array, compares
+    // them against the broadcast scalar and packs the eight masks into one
+    // u8 vector.
+    while (len >= step) {
+        npyv_u64 y1 = npyv_load_u64(rhs + npyv_nlanes_u64 * 0);
+        npyv_u64 y2 = npyv_load_u64(rhs + npyv_nlanes_u64 * 1);
+        npyv_u64 y3 = npyv_load_u64(rhs + npyv_nlanes_u64 * 2);
+        npyv_u64 y4 = npyv_load_u64(rhs + npyv_nlanes_u64 * 3);
+        npyv_u64 y5 = npyv_load_u64(rhs + npyv_nlanes_u64 * 4);
+        npyv_u64 y6 = npyv_load_u64(rhs + npyv_nlanes_u64 * 5);
+        npyv_u64 y7 = npyv_load_u64(rhs + npyv_nlanes_u64 * 6);
+        npyv_u64 y8 = npyv_load_u64(rhs + npyv_nlanes_u64 * 7);
+
+        npyv_b64 m1 = npyv_cmpeq_u64(vlhs, y1);
+        npyv_b64 m2 = npyv_cmpeq_u64(vlhs, y2);
+        npyv_b64 m3 = npyv_cmpeq_u64(vlhs, y3);
+        npyv_b64 m4 = npyv_cmpeq_u64(vlhs, y4);
+        npyv_b64 m5 = npyv_cmpeq_u64(vlhs, y5);
+        npyv_b64 m6 = npyv_cmpeq_u64(vlhs, y6);
+        npyv_b64 m7 = npyv_cmpeq_u64(vlhs, y7);
+        npyv_b64 m8 = npyv_cmpeq_u64(vlhs, y8);
+
+        npyv_u8 packed =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Mask every byte down to 0 or 1 before storing.
+        npyv_store_u8(out, npyv_and_u8(packed, one));
+
+        len -= step;
+        rhs += step;
+        out += step;
+    }
+
+    // Scalar tail for whatever did not fill a whole vector pass.
+    while (len > 0) {
+        const npyv_lanetype_u64 y = *rhs++;
+        *out++ = lhs == y;
+        --len;
+    }
+}
+
+static void simd_binary_scalar2_equal_u64(char **args, npy_intp len)
+{
+    // Computes `lhs[i] == scalar` for `len` elements. The left operand is the
+    // array at args[0], the right operand is the single value read through
+    // args[1], and each outcome is written as a 0/1 byte through args[2].
+    npyv_lanetype_u64 *lhs      = (npyv_lanetype_u64 *) args[0];
+    const npyv_lanetype_u64 rhs = *(npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_u64 vrhs         = npyv_setall_u64(rhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: each pass loads eight vectors from the array, compares
+    // them against the broadcast scalar and packs the eight masks into one
+    // u8 vector.
+    while (len >= step) {
+        npyv_u64 x1 = npyv_load_u64(lhs + npyv_nlanes_u64 * 0);
+        npyv_u64 x2 = npyv_load_u64(lhs + npyv_nlanes_u64 * 1);
+        npyv_u64 x3 = npyv_load_u64(lhs + npyv_nlanes_u64 * 2);
+        npyv_u64 x4 = npyv_load_u64(lhs + npyv_nlanes_u64 * 3);
+        npyv_u64 x5 = npyv_load_u64(lhs + npyv_nlanes_u64 * 4);
+        npyv_u64 x6 = npyv_load_u64(lhs + npyv_nlanes_u64 * 5);
+        npyv_u64 x7 = npyv_load_u64(lhs + npyv_nlanes_u64 * 6);
+        npyv_u64 x8 = npyv_load_u64(lhs + npyv_nlanes_u64 * 7);
+
+        npyv_b64 m1 = npyv_cmpeq_u64(x1, vrhs);
+        npyv_b64 m2 = npyv_cmpeq_u64(x2, vrhs);
+        npyv_b64 m3 = npyv_cmpeq_u64(x3, vrhs);
+        npyv_b64 m4 = npyv_cmpeq_u64(x4, vrhs);
+        npyv_b64 m5 = npyv_cmpeq_u64(x5, vrhs);
+        npyv_b64 m6 = npyv_cmpeq_u64(x6, vrhs);
+        npyv_b64 m7 = npyv_cmpeq_u64(x7, vrhs);
+        npyv_b64 m8 = npyv_cmpeq_u64(x8, vrhs);
+
+        npyv_u8 packed =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Mask every byte down to 0 or 1 before storing.
+        npyv_store_u8(out, npyv_and_u8(packed, one));
+
+        len -= step;
+        lhs += step;
+        out += step;
+    }
+
+    // Scalar tail for whatever did not fill a whole vector pass.
+    while (len > 0) {
+        const npyv_lanetype_u64 x = *lhs++;
+        *out++ = x == rhs;
+        --len;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 1) && 0)
+static void simd_binary_not_equal_u64(char **args, npy_intp len)
+{
+    // Computes `lhs[i] != rhs[i]` for `len` elements read through args[0] and
+    // args[1], writing each outcome as a 0/1 byte through args[2].
+    npyv_lanetype_u64 *lhs = (npyv_lanetype_u64 *) args[0];
+    npyv_lanetype_u64 *rhs = (npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int step         = npyv_nlanes_u8;
+
+    // Vector body: each pass loads eight vectors per operand and packs the
+    // eight comparison masks into a single u8 vector of results.
+    while (len >= step) {
+        npyv_u64 x1 = npyv_load_u64(lhs + npyv_nlanes_u64 * 0);
+        npyv_u64 y1 = npyv_load_u64(rhs + npyv_nlanes_u64 * 0);
+        npyv_u64 x2 = npyv_load_u64(lhs + npyv_nlanes_u64 * 1);
+        npyv_u64 y2 = npyv_load_u64(rhs + npyv_nlanes_u64 * 1);
+        npyv_u64 x3 = npyv_load_u64(lhs + npyv_nlanes_u64 * 2);
+        npyv_u64 y3 = npyv_load_u64(rhs + npyv_nlanes_u64 * 2);
+        npyv_u64 x4 = npyv_load_u64(lhs + npyv_nlanes_u64 * 3);
+        npyv_u64 y4 = npyv_load_u64(rhs + npyv_nlanes_u64 * 3);
+        npyv_u64 x5 = npyv_load_u64(lhs + npyv_nlanes_u64 * 4);
+        npyv_u64 y5 = npyv_load_u64(rhs + npyv_nlanes_u64 * 4);
+        npyv_u64 x6 = npyv_load_u64(lhs + npyv_nlanes_u64 * 5);
+        npyv_u64 y6 = npyv_load_u64(rhs + npyv_nlanes_u64 * 5);
+        npyv_u64 x7 = npyv_load_u64(lhs + npyv_nlanes_u64 * 6);
+        npyv_u64 y7 = npyv_load_u64(rhs + npyv_nlanes_u64 * 6);
+        npyv_u64 x8 = npyv_load_u64(lhs + npyv_nlanes_u64 * 7);
+        npyv_u64 y8 = npyv_load_u64(rhs + npyv_nlanes_u64 * 7);
+
+        npyv_b64 m1 = npyv_cmpneq_u64(x1, y1);
+        npyv_b64 m2 = npyv_cmpneq_u64(x2, y2);
+        npyv_b64 m3 = npyv_cmpneq_u64(x3, y3);
+        npyv_b64 m4 = npyv_cmpneq_u64(x4, y4);
+        npyv_b64 m5 = npyv_cmpneq_u64(x5, y5);
+        npyv_b64 m6 = npyv_cmpneq_u64(x6, y6);
+        npyv_b64 m7 = npyv_cmpneq_u64(x7, y7);
+        npyv_b64 m8 = npyv_cmpneq_u64(x8, y8);
+
+        npyv_u8 packed =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Mask every byte down to 0 or 1 before storing.
+        npyv_store_u8(out, npyv_and_u8(packed, one));
+
+        len -= step;
+        lhs += step;
+        rhs += step;
+        out += step;
+    }
+
+    // Scalar tail for whatever did not fill a whole vector pass.
+    while (len > 0) {
+        const npyv_lanetype_u64 x = *lhs++;
+        const npyv_lanetype_u64 y = *rhs++;
+        *out++ = x != y;
+        --len;
+    }
+}
+
+static void simd_binary_scalar1_not_equal_u64(char **args, npy_intp len)
+{
+    // Computes `scalar != rhs[i]` for `len` elements. The left operand is the
+    // single value read through args[0], the right operand is the array at
+    // args[1], and each outcome is written as a 0/1 byte through args[2].
+    const npyv_lanetype_u64 lhs = *(npyv_lanetype_u64 *) args[0];
+    npyv_lanetype_u64 *rhs      = (npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_u64 vlhs         = npyv_setall_u64(lhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: each pass loads eight vectors from the array, compares
+    // them against the broadcast scalar and packs the eight masks into one
+    // u8 vector.
+    while (len >= step) {
+        npyv_u64 y1 = npyv_load_u64(rhs + npyv_nlanes_u64 * 0);
+        npyv_u64 y2 = npyv_load_u64(rhs + npyv_nlanes_u64 * 1);
+        npyv_u64 y3 = npyv_load_u64(rhs + npyv_nlanes_u64 * 2);
+        npyv_u64 y4 = npyv_load_u64(rhs + npyv_nlanes_u64 * 3);
+        npyv_u64 y5 = npyv_load_u64(rhs + npyv_nlanes_u64 * 4);
+        npyv_u64 y6 = npyv_load_u64(rhs + npyv_nlanes_u64 * 5);
+        npyv_u64 y7 = npyv_load_u64(rhs + npyv_nlanes_u64 * 6);
+        npyv_u64 y8 = npyv_load_u64(rhs + npyv_nlanes_u64 * 7);
+
+        npyv_b64 m1 = npyv_cmpneq_u64(vlhs, y1);
+        npyv_b64 m2 = npyv_cmpneq_u64(vlhs, y2);
+        npyv_b64 m3 = npyv_cmpneq_u64(vlhs, y3);
+        npyv_b64 m4 = npyv_cmpneq_u64(vlhs, y4);
+        npyv_b64 m5 = npyv_cmpneq_u64(vlhs, y5);
+        npyv_b64 m6 = npyv_cmpneq_u64(vlhs, y6);
+        npyv_b64 m7 = npyv_cmpneq_u64(vlhs, y7);
+        npyv_b64 m8 = npyv_cmpneq_u64(vlhs, y8);
+
+        npyv_u8 packed =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Mask every byte down to 0 or 1 before storing.
+        npyv_store_u8(out, npyv_and_u8(packed, one));
+
+        len -= step;
+        rhs += step;
+        out += step;
+    }
+
+    // Scalar tail for whatever did not fill a whole vector pass.
+    while (len > 0) {
+        const npyv_lanetype_u64 y = *rhs++;
+        *out++ = lhs != y;
+        --len;
+    }
+}
+
+static void simd_binary_scalar2_not_equal_u64(char **args, npy_intp len)
+{
+    // Computes `lhs[i] != scalar` for `len` elements. The left operand is the
+    // array at args[0], the right operand is the single value read through
+    // args[1], and each outcome is written as a 0/1 byte through args[2].
+    npyv_lanetype_u64 *lhs      = (npyv_lanetype_u64 *) args[0];
+    const npyv_lanetype_u64 rhs = *(npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_u64 vrhs         = npyv_setall_u64(rhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: each pass loads eight vectors from the array, compares
+    // them against the broadcast scalar and packs the eight masks into one
+    // u8 vector.
+    while (len >= step) {
+        npyv_u64 x1 = npyv_load_u64(lhs + npyv_nlanes_u64 * 0);
+        npyv_u64 x2 = npyv_load_u64(lhs + npyv_nlanes_u64 * 1);
+        npyv_u64 x3 = npyv_load_u64(lhs + npyv_nlanes_u64 * 2);
+        npyv_u64 x4 = npyv_load_u64(lhs + npyv_nlanes_u64 * 3);
+        npyv_u64 x5 = npyv_load_u64(lhs + npyv_nlanes_u64 * 4);
+        npyv_u64 x6 = npyv_load_u64(lhs + npyv_nlanes_u64 * 5);
+        npyv_u64 x7 = npyv_load_u64(lhs + npyv_nlanes_u64 * 6);
+        npyv_u64 x8 = npyv_load_u64(lhs + npyv_nlanes_u64 * 7);
+
+        npyv_b64 m1 = npyv_cmpneq_u64(x1, vrhs);
+        npyv_b64 m2 = npyv_cmpneq_u64(x2, vrhs);
+        npyv_b64 m3 = npyv_cmpneq_u64(x3, vrhs);
+        npyv_b64 m4 = npyv_cmpneq_u64(x4, vrhs);
+        npyv_b64 m5 = npyv_cmpneq_u64(x5, vrhs);
+        npyv_b64 m6 = npyv_cmpneq_u64(x6, vrhs);
+        npyv_b64 m7 = npyv_cmpneq_u64(x7, vrhs);
+        npyv_b64 m8 = npyv_cmpneq_u64(x8, vrhs);
+
+        npyv_u8 packed =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Mask every byte down to 0 or 1 before storing.
+        npyv_store_u8(out, npyv_and_u8(packed, one));
+
+        len -= step;
+        lhs += step;
+        out += step;
+    }
+
+    // Scalar tail for whatever did not fill a whole vector pass.
+    while (len > 0) {
+        const npyv_lanetype_u64 x = *lhs++;
+        *out++ = x != rhs;
+        --len;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 0) && 0)
+static void simd_binary_less_u64(char **args, npy_intp len)
+{
+    // Computes `lhs[i] < rhs[i]` for `len` elements read through args[0] and
+    // args[1], writing each outcome as a 0/1 byte through args[2].
+    npyv_lanetype_u64 *lhs = (npyv_lanetype_u64 *) args[0];
+    npyv_lanetype_u64 *rhs = (npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int step         = npyv_nlanes_u8;
+
+    // Vector body: each pass loads eight vectors per operand and packs the
+    // eight comparison masks into a single u8 vector of results.
+    while (len >= step) {
+        npyv_u64 x1 = npyv_load_u64(lhs + npyv_nlanes_u64 * 0);
+        npyv_u64 y1 = npyv_load_u64(rhs + npyv_nlanes_u64 * 0);
+        npyv_u64 x2 = npyv_load_u64(lhs + npyv_nlanes_u64 * 1);
+        npyv_u64 y2 = npyv_load_u64(rhs + npyv_nlanes_u64 * 1);
+        npyv_u64 x3 = npyv_load_u64(lhs + npyv_nlanes_u64 * 2);
+        npyv_u64 y3 = npyv_load_u64(rhs + npyv_nlanes_u64 * 2);
+        npyv_u64 x4 = npyv_load_u64(lhs + npyv_nlanes_u64 * 3);
+        npyv_u64 y4 = npyv_load_u64(rhs + npyv_nlanes_u64 * 3);
+        npyv_u64 x5 = npyv_load_u64(lhs + npyv_nlanes_u64 * 4);
+        npyv_u64 y5 = npyv_load_u64(rhs + npyv_nlanes_u64 * 4);
+        npyv_u64 x6 = npyv_load_u64(lhs + npyv_nlanes_u64 * 5);
+        npyv_u64 y6 = npyv_load_u64(rhs + npyv_nlanes_u64 * 5);
+        npyv_u64 x7 = npyv_load_u64(lhs + npyv_nlanes_u64 * 6);
+        npyv_u64 y7 = npyv_load_u64(rhs + npyv_nlanes_u64 * 6);
+        npyv_u64 x8 = npyv_load_u64(lhs + npyv_nlanes_u64 * 7);
+        npyv_u64 y8 = npyv_load_u64(rhs + npyv_nlanes_u64 * 7);
+
+        npyv_b64 m1 = npyv_cmplt_u64(x1, y1);
+        npyv_b64 m2 = npyv_cmplt_u64(x2, y2);
+        npyv_b64 m3 = npyv_cmplt_u64(x3, y3);
+        npyv_b64 m4 = npyv_cmplt_u64(x4, y4);
+        npyv_b64 m5 = npyv_cmplt_u64(x5, y5);
+        npyv_b64 m6 = npyv_cmplt_u64(x6, y6);
+        npyv_b64 m7 = npyv_cmplt_u64(x7, y7);
+        npyv_b64 m8 = npyv_cmplt_u64(x8, y8);
+
+        npyv_u8 packed =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Mask every byte down to 0 or 1 before storing.
+        npyv_store_u8(out, npyv_and_u8(packed, one));
+
+        len -= step;
+        lhs += step;
+        rhs += step;
+        out += step;
+    }
+
+    // Scalar tail for whatever did not fill a whole vector pass.
+    while (len > 0) {
+        const npyv_lanetype_u64 x = *lhs++;
+        const npyv_lanetype_u64 y = *rhs++;
+        *out++ = x < y;
+        --len;
+    }
+}
+
+static void simd_binary_scalar1_less_u64(char **args, npy_intp len)
+{
+    // Computes `scalar < rhs[i]` for `len` elements. The left operand is the
+    // single value read through args[0], the right operand is the array at
+    // args[1], and each outcome is written as a 0/1 byte through args[2].
+    const npyv_lanetype_u64 lhs = *(npyv_lanetype_u64 *) args[0];
+    npyv_lanetype_u64 *rhs      = (npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_u64 vlhs         = npyv_setall_u64(lhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: each pass loads eight vectors from the array, compares
+    // them against the broadcast scalar and packs the eight masks into one
+    // u8 vector.
+    while (len >= step) {
+        npyv_u64 y1 = npyv_load_u64(rhs + npyv_nlanes_u64 * 0);
+        npyv_u64 y2 = npyv_load_u64(rhs + npyv_nlanes_u64 * 1);
+        npyv_u64 y3 = npyv_load_u64(rhs + npyv_nlanes_u64 * 2);
+        npyv_u64 y4 = npyv_load_u64(rhs + npyv_nlanes_u64 * 3);
+        npyv_u64 y5 = npyv_load_u64(rhs + npyv_nlanes_u64 * 4);
+        npyv_u64 y6 = npyv_load_u64(rhs + npyv_nlanes_u64 * 5);
+        npyv_u64 y7 = npyv_load_u64(rhs + npyv_nlanes_u64 * 6);
+        npyv_u64 y8 = npyv_load_u64(rhs + npyv_nlanes_u64 * 7);
+
+        npyv_b64 m1 = npyv_cmplt_u64(vlhs, y1);
+        npyv_b64 m2 = npyv_cmplt_u64(vlhs, y2);
+        npyv_b64 m3 = npyv_cmplt_u64(vlhs, y3);
+        npyv_b64 m4 = npyv_cmplt_u64(vlhs, y4);
+        npyv_b64 m5 = npyv_cmplt_u64(vlhs, y5);
+        npyv_b64 m6 = npyv_cmplt_u64(vlhs, y6);
+        npyv_b64 m7 = npyv_cmplt_u64(vlhs, y7);
+        npyv_b64 m8 = npyv_cmplt_u64(vlhs, y8);
+
+        npyv_u8 packed =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Mask every byte down to 0 or 1 before storing.
+        npyv_store_u8(out, npyv_and_u8(packed, one));
+
+        len -= step;
+        rhs += step;
+        out += step;
+    }
+
+    // Scalar tail for whatever did not fill a whole vector pass.
+    while (len > 0) {
+        const npyv_lanetype_u64 y = *rhs++;
+        *out++ = lhs < y;
+        --len;
+    }
+}
+
+static void simd_binary_scalar2_less_u64(char **args, npy_intp len)
+{
+    // Computes `lhs[i] < scalar` for `len` elements. The left operand is the
+    // array at args[0], the right operand is the single value read through
+    // args[1], and each outcome is written as a 0/1 byte through args[2].
+    npyv_lanetype_u64 *lhs      = (npyv_lanetype_u64 *) args[0];
+    const npyv_lanetype_u64 rhs = *(npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_u64 vrhs         = npyv_setall_u64(rhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: each pass loads eight vectors from the array, compares
+    // them against the broadcast scalar and packs the eight masks into one
+    // u8 vector.
+    while (len >= step) {
+        npyv_u64 x1 = npyv_load_u64(lhs + npyv_nlanes_u64 * 0);
+        npyv_u64 x2 = npyv_load_u64(lhs + npyv_nlanes_u64 * 1);
+        npyv_u64 x3 = npyv_load_u64(lhs + npyv_nlanes_u64 * 2);
+        npyv_u64 x4 = npyv_load_u64(lhs + npyv_nlanes_u64 * 3);
+        npyv_u64 x5 = npyv_load_u64(lhs + npyv_nlanes_u64 * 4);
+        npyv_u64 x6 = npyv_load_u64(lhs + npyv_nlanes_u64 * 5);
+        npyv_u64 x7 = npyv_load_u64(lhs + npyv_nlanes_u64 * 6);
+        npyv_u64 x8 = npyv_load_u64(lhs + npyv_nlanes_u64 * 7);
+
+        npyv_b64 m1 = npyv_cmplt_u64(x1, vrhs);
+        npyv_b64 m2 = npyv_cmplt_u64(x2, vrhs);
+        npyv_b64 m3 = npyv_cmplt_u64(x3, vrhs);
+        npyv_b64 m4 = npyv_cmplt_u64(x4, vrhs);
+        npyv_b64 m5 = npyv_cmplt_u64(x5, vrhs);
+        npyv_b64 m6 = npyv_cmplt_u64(x6, vrhs);
+        npyv_b64 m7 = npyv_cmplt_u64(x7, vrhs);
+        npyv_b64 m8 = npyv_cmplt_u64(x8, vrhs);
+
+        npyv_u8 packed =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Mask every byte down to 0 or 1 before storing.
+        npyv_store_u8(out, npyv_and_u8(packed, one));
+
+        len -= step;
+        lhs += step;
+        out += step;
+    }
+
+    // Scalar tail for whatever did not fill a whole vector pass.
+    while (len > 0) {
+        const npyv_lanetype_u64 x = *lhs++;
+        *out++ = x < rhs;
+        --len;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 0) && 0)
+static void simd_binary_less_equal_u64(char **args, npy_intp len)
+{
+    // Computes `lhs[i] <= rhs[i]` for `len` elements read through args[0] and
+    // args[1], writing each outcome as a 0/1 byte through args[2].
+    npyv_lanetype_u64 *lhs = (npyv_lanetype_u64 *) args[0];
+    npyv_lanetype_u64 *rhs = (npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int step         = npyv_nlanes_u8;
+
+    // Vector body: each pass loads eight vectors per operand and packs the
+    // eight comparison masks into a single u8 vector of results.
+    while (len >= step) {
+        npyv_u64 x1 = npyv_load_u64(lhs + npyv_nlanes_u64 * 0);
+        npyv_u64 y1 = npyv_load_u64(rhs + npyv_nlanes_u64 * 0);
+        npyv_u64 x2 = npyv_load_u64(lhs + npyv_nlanes_u64 * 1);
+        npyv_u64 y2 = npyv_load_u64(rhs + npyv_nlanes_u64 * 1);
+        npyv_u64 x3 = npyv_load_u64(lhs + npyv_nlanes_u64 * 2);
+        npyv_u64 y3 = npyv_load_u64(rhs + npyv_nlanes_u64 * 2);
+        npyv_u64 x4 = npyv_load_u64(lhs + npyv_nlanes_u64 * 3);
+        npyv_u64 y4 = npyv_load_u64(rhs + npyv_nlanes_u64 * 3);
+        npyv_u64 x5 = npyv_load_u64(lhs + npyv_nlanes_u64 * 4);
+        npyv_u64 y5 = npyv_load_u64(rhs + npyv_nlanes_u64 * 4);
+        npyv_u64 x6 = npyv_load_u64(lhs + npyv_nlanes_u64 * 5);
+        npyv_u64 y6 = npyv_load_u64(rhs + npyv_nlanes_u64 * 5);
+        npyv_u64 x7 = npyv_load_u64(lhs + npyv_nlanes_u64 * 6);
+        npyv_u64 y7 = npyv_load_u64(rhs + npyv_nlanes_u64 * 6);
+        npyv_u64 x8 = npyv_load_u64(lhs + npyv_nlanes_u64 * 7);
+        npyv_u64 y8 = npyv_load_u64(rhs + npyv_nlanes_u64 * 7);
+
+        npyv_b64 m1 = npyv_cmple_u64(x1, y1);
+        npyv_b64 m2 = npyv_cmple_u64(x2, y2);
+        npyv_b64 m3 = npyv_cmple_u64(x3, y3);
+        npyv_b64 m4 = npyv_cmple_u64(x4, y4);
+        npyv_b64 m5 = npyv_cmple_u64(x5, y5);
+        npyv_b64 m6 = npyv_cmple_u64(x6, y6);
+        npyv_b64 m7 = npyv_cmple_u64(x7, y7);
+        npyv_b64 m8 = npyv_cmple_u64(x8, y8);
+
+        npyv_u8 packed =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Mask every byte down to 0 or 1 before storing.
+        npyv_store_u8(out, npyv_and_u8(packed, one));
+
+        len -= step;
+        lhs += step;
+        rhs += step;
+        out += step;
+    }
+
+    // Scalar tail for whatever did not fill a whole vector pass.
+    while (len > 0) {
+        const npyv_lanetype_u64 x = *lhs++;
+        const npyv_lanetype_u64 y = *rhs++;
+        *out++ = x <= y;
+        --len;
+    }
+}
+
+static void simd_binary_scalar1_less_equal_u64(char **args, npy_intp len)
+{
+    // Computes `scalar <= rhs[i]` for `len` elements. The left operand is the
+    // single value read through args[0], the right operand is the array at
+    // args[1], and each outcome is written as a 0/1 byte through args[2].
+    const npyv_lanetype_u64 lhs = *(npyv_lanetype_u64 *) args[0];
+    npyv_lanetype_u64 *rhs      = (npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_u64 vlhs         = npyv_setall_u64(lhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: each pass loads eight vectors from the array, compares
+    // them against the broadcast scalar and packs the eight masks into one
+    // u8 vector.
+    while (len >= step) {
+        npyv_u64 y1 = npyv_load_u64(rhs + npyv_nlanes_u64 * 0);
+        npyv_u64 y2 = npyv_load_u64(rhs + npyv_nlanes_u64 * 1);
+        npyv_u64 y3 = npyv_load_u64(rhs + npyv_nlanes_u64 * 2);
+        npyv_u64 y4 = npyv_load_u64(rhs + npyv_nlanes_u64 * 3);
+        npyv_u64 y5 = npyv_load_u64(rhs + npyv_nlanes_u64 * 4);
+        npyv_u64 y6 = npyv_load_u64(rhs + npyv_nlanes_u64 * 5);
+        npyv_u64 y7 = npyv_load_u64(rhs + npyv_nlanes_u64 * 6);
+        npyv_u64 y8 = npyv_load_u64(rhs + npyv_nlanes_u64 * 7);
+
+        npyv_b64 m1 = npyv_cmple_u64(vlhs, y1);
+        npyv_b64 m2 = npyv_cmple_u64(vlhs, y2);
+        npyv_b64 m3 = npyv_cmple_u64(vlhs, y3);
+        npyv_b64 m4 = npyv_cmple_u64(vlhs, y4);
+        npyv_b64 m5 = npyv_cmple_u64(vlhs, y5);
+        npyv_b64 m6 = npyv_cmple_u64(vlhs, y6);
+        npyv_b64 m7 = npyv_cmple_u64(vlhs, y7);
+        npyv_b64 m8 = npyv_cmple_u64(vlhs, y8);
+
+        npyv_u8 packed =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Mask every byte down to 0 or 1 before storing.
+        npyv_store_u8(out, npyv_and_u8(packed, one));
+
+        len -= step;
+        rhs += step;
+        out += step;
+    }
+
+    // Scalar tail for whatever did not fill a whole vector pass.
+    while (len > 0) {
+        const npyv_lanetype_u64 y = *rhs++;
+        *out++ = lhs <= y;
+        --len;
+    }
+}
+
+static void simd_binary_scalar2_less_equal_u64(char **args, npy_intp len)
+{
+    // Computes `lhs[i] <= scalar` for `len` elements. The left operand is the
+    // array at args[0], the right operand is the single value read through
+    // args[1], and each outcome is written as a 0/1 byte through args[2].
+    npyv_lanetype_u64 *lhs      = (npyv_lanetype_u64 *) args[0];
+    const npyv_lanetype_u64 rhs = *(npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_u64 vrhs         = npyv_setall_u64(rhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: each pass loads eight vectors from the array, compares
+    // them against the broadcast scalar and packs the eight masks into one
+    // u8 vector.
+    while (len >= step) {
+        npyv_u64 x1 = npyv_load_u64(lhs + npyv_nlanes_u64 * 0);
+        npyv_u64 x2 = npyv_load_u64(lhs + npyv_nlanes_u64 * 1);
+        npyv_u64 x3 = npyv_load_u64(lhs + npyv_nlanes_u64 * 2);
+        npyv_u64 x4 = npyv_load_u64(lhs + npyv_nlanes_u64 * 3);
+        npyv_u64 x5 = npyv_load_u64(lhs + npyv_nlanes_u64 * 4);
+        npyv_u64 x6 = npyv_load_u64(lhs + npyv_nlanes_u64 * 5);
+        npyv_u64 x7 = npyv_load_u64(lhs + npyv_nlanes_u64 * 6);
+        npyv_u64 x8 = npyv_load_u64(lhs + npyv_nlanes_u64 * 7);
+
+        npyv_b64 m1 = npyv_cmple_u64(x1, vrhs);
+        npyv_b64 m2 = npyv_cmple_u64(x2, vrhs);
+        npyv_b64 m3 = npyv_cmple_u64(x3, vrhs);
+        npyv_b64 m4 = npyv_cmple_u64(x4, vrhs);
+        npyv_b64 m5 = npyv_cmple_u64(x5, vrhs);
+        npyv_b64 m6 = npyv_cmple_u64(x6, vrhs);
+        npyv_b64 m7 = npyv_cmple_u64(x7, vrhs);
+        npyv_b64 m8 = npyv_cmple_u64(x8, vrhs);
+
+        npyv_u8 packed =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Mask every byte down to 0 or 1 before storing.
+        npyv_store_u8(out, npyv_and_u8(packed, one));
+
+        len -= step;
+        lhs += step;
+        out += step;
+    }
+
+    // Scalar tail for whatever did not fill a whole vector pass.
+    while (len > 0) {
+        const npyv_lanetype_u64 x = *lhs++;
+        *out++ = x <= rhs;
+        --len;
+    }
+}
+#endif
+
+
+
+#line 28
+#line 35
+#if NPY_SIMD && !((1 || 0) && 1)
+static void simd_binary_equal_s64(char **args, npy_intp len)
+{
+    // Computes `lhs[i] == rhs[i]` for `len` elements read through args[0] and
+    // args[1], writing each outcome as a 0/1 byte through args[2].
+    npyv_lanetype_s64 *lhs = (npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 *rhs = (npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int step         = npyv_nlanes_u8;
+
+    // Vector body: each pass loads eight vectors per operand and packs the
+    // eight comparison masks into a single u8 vector of results.
+    while (len >= step) {
+        npyv_s64 x1 = npyv_load_s64(lhs + npyv_nlanes_s64 * 0);
+        npyv_s64 y1 = npyv_load_s64(rhs + npyv_nlanes_s64 * 0);
+        npyv_s64 x2 = npyv_load_s64(lhs + npyv_nlanes_s64 * 1);
+        npyv_s64 y2 = npyv_load_s64(rhs + npyv_nlanes_s64 * 1);
+        npyv_s64 x3 = npyv_load_s64(lhs + npyv_nlanes_s64 * 2);
+        npyv_s64 y3 = npyv_load_s64(rhs + npyv_nlanes_s64 * 2);
+        npyv_s64 x4 = npyv_load_s64(lhs + npyv_nlanes_s64 * 3);
+        npyv_s64 y4 = npyv_load_s64(rhs + npyv_nlanes_s64 * 3);
+        npyv_s64 x5 = npyv_load_s64(lhs + npyv_nlanes_s64 * 4);
+        npyv_s64 y5 = npyv_load_s64(rhs + npyv_nlanes_s64 * 4);
+        npyv_s64 x6 = npyv_load_s64(lhs + npyv_nlanes_s64 * 5);
+        npyv_s64 y6 = npyv_load_s64(rhs + npyv_nlanes_s64 * 5);
+        npyv_s64 x7 = npyv_load_s64(lhs + npyv_nlanes_s64 * 6);
+        npyv_s64 y7 = npyv_load_s64(rhs + npyv_nlanes_s64 * 6);
+        npyv_s64 x8 = npyv_load_s64(lhs + npyv_nlanes_s64 * 7);
+        npyv_s64 y8 = npyv_load_s64(rhs + npyv_nlanes_s64 * 7);
+
+        npyv_b64 m1 = npyv_cmpeq_s64(x1, y1);
+        npyv_b64 m2 = npyv_cmpeq_s64(x2, y2);
+        npyv_b64 m3 = npyv_cmpeq_s64(x3, y3);
+        npyv_b64 m4 = npyv_cmpeq_s64(x4, y4);
+        npyv_b64 m5 = npyv_cmpeq_s64(x5, y5);
+        npyv_b64 m6 = npyv_cmpeq_s64(x6, y6);
+        npyv_b64 m7 = npyv_cmpeq_s64(x7, y7);
+        npyv_b64 m8 = npyv_cmpeq_s64(x8, y8);
+
+        npyv_u8 packed =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Mask every byte down to 0 or 1 before storing.
+        npyv_store_u8(out, npyv_and_u8(packed, one));
+
+        len -= step;
+        lhs += step;
+        rhs += step;
+        out += step;
+    }
+
+    // Scalar tail for whatever did not fill a whole vector pass.
+    while (len > 0) {
+        const npyv_lanetype_s64 x = *lhs++;
+        const npyv_lanetype_s64 y = *rhs++;
+        *out++ = x == y;
+        --len;
+    }
+}
+
+static void simd_binary_scalar1_equal_s64(char **args, npy_intp len)
+{
+    // Computes `scalar == rhs[i]` for `len` elements. The left operand is the
+    // single value read through args[0], the right operand is the array at
+    // args[1], and each outcome is written as a 0/1 byte through args[2].
+    const npyv_lanetype_s64 lhs = *(npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 *rhs      = (npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_s64 vlhs         = npyv_setall_s64(lhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: each pass loads eight vectors from the array, compares
+    // them against the broadcast scalar and packs the eight masks into one
+    // u8 vector.
+    while (len >= step) {
+        npyv_s64 y1 = npyv_load_s64(rhs + npyv_nlanes_s64 * 0);
+        npyv_s64 y2 = npyv_load_s64(rhs + npyv_nlanes_s64 * 1);
+        npyv_s64 y3 = npyv_load_s64(rhs + npyv_nlanes_s64 * 2);
+        npyv_s64 y4 = npyv_load_s64(rhs + npyv_nlanes_s64 * 3);
+        npyv_s64 y5 = npyv_load_s64(rhs + npyv_nlanes_s64 * 4);
+        npyv_s64 y6 = npyv_load_s64(rhs + npyv_nlanes_s64 * 5);
+        npyv_s64 y7 = npyv_load_s64(rhs + npyv_nlanes_s64 * 6);
+        npyv_s64 y8 = npyv_load_s64(rhs + npyv_nlanes_s64 * 7);
+
+        npyv_b64 m1 = npyv_cmpeq_s64(vlhs, y1);
+        npyv_b64 m2 = npyv_cmpeq_s64(vlhs, y2);
+        npyv_b64 m3 = npyv_cmpeq_s64(vlhs, y3);
+        npyv_b64 m4 = npyv_cmpeq_s64(vlhs, y4);
+        npyv_b64 m5 = npyv_cmpeq_s64(vlhs, y5);
+        npyv_b64 m6 = npyv_cmpeq_s64(vlhs, y6);
+        npyv_b64 m7 = npyv_cmpeq_s64(vlhs, y7);
+        npyv_b64 m8 = npyv_cmpeq_s64(vlhs, y8);
+
+        npyv_u8 packed =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Mask every byte down to 0 or 1 before storing.
+        npyv_store_u8(out, npyv_and_u8(packed, one));
+
+        len -= step;
+        rhs += step;
+        out += step;
+    }
+
+    // Scalar tail for whatever did not fill a whole vector pass.
+    while (len > 0) {
+        const npyv_lanetype_s64 y = *rhs++;
+        *out++ = lhs == y;
+        --len;
+    }
+}
+
+static void simd_binary_scalar2_equal_s64(char **args, npy_intp len)
+{
+    // Computes `lhs[i] == scalar` for `len` elements. The left operand is the
+    // array at args[0], the right operand is the single value read through
+    // args[1], and each outcome is written as a 0/1 byte through args[2].
+    npyv_lanetype_s64 *lhs      = (npyv_lanetype_s64 *) args[0];
+    const npyv_lanetype_s64 rhs = *(npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_u8 *out       = (npyv_lanetype_u8 *) args[2];
+    const npyv_s64 vrhs         = npyv_setall_s64(rhs);
+    const npyv_u8 one           = npyv_setall_u8(0x1);
+    const int step              = npyv_nlanes_u8;
+
+    // Vector body: each pass loads eight vectors from the array, compares
+    // them against the broadcast scalar and packs the eight masks into one
+    // u8 vector.
+    while (len >= step) {
+        npyv_s64 x1 = npyv_load_s64(lhs + npyv_nlanes_s64 * 0);
+        npyv_s64 x2 = npyv_load_s64(lhs + npyv_nlanes_s64 * 1);
+        npyv_s64 x3 = npyv_load_s64(lhs + npyv_nlanes_s64 * 2);
+        npyv_s64 x4 = npyv_load_s64(lhs + npyv_nlanes_s64 * 3);
+        npyv_s64 x5 = npyv_load_s64(lhs + npyv_nlanes_s64 * 4);
+        npyv_s64 x6 = npyv_load_s64(lhs + npyv_nlanes_s64 * 5);
+        npyv_s64 x7 = npyv_load_s64(lhs + npyv_nlanes_s64 * 6);
+        npyv_s64 x8 = npyv_load_s64(lhs + npyv_nlanes_s64 * 7);
+
+        npyv_b64 m1 = npyv_cmpeq_s64(x1, vrhs);
+        npyv_b64 m2 = npyv_cmpeq_s64(x2, vrhs);
+        npyv_b64 m3 = npyv_cmpeq_s64(x3, vrhs);
+        npyv_b64 m4 = npyv_cmpeq_s64(x4, vrhs);
+        npyv_b64 m5 = npyv_cmpeq_s64(x5, vrhs);
+        npyv_b64 m6 = npyv_cmpeq_s64(x6, vrhs);
+        npyv_b64 m7 = npyv_cmpeq_s64(x7, vrhs);
+        npyv_b64 m8 = npyv_cmpeq_s64(x8, vrhs);
+
+        npyv_u8 packed =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Mask every byte down to 0 or 1 before storing.
+        npyv_store_u8(out, npyv_and_u8(packed, one));
+
+        len -= step;
+        lhs += step;
+        out += step;
+    }
+
+    // Scalar tail for whatever did not fill a whole vector pass.
+    while (len > 0) {
+        const npyv_lanetype_s64 x = *lhs++;
+        *out++ = x == rhs;
+        --len;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 1) && 1)
+// Computes `dst[i] = (src1[i] != src2[i])` for `len` s64 elements, storing
+// one byte (0 or 1) per element.
+//
+// args[0], args[1]: the two input arrays; args[2]: the byte output array.
+//
+// Note: the enclosing guard `#if NPY_SIMD && !((0 || 1) && 1)` is always
+// false, so this definition is never compiled.
+static void simd_binary_not_equal_s64(char **args, npy_intp len)
+{
+    npyv_lanetype_s64 *lhs = (npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 *rhs = (npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int chunk        = npyv_nlanes_u8;
+
+    // Vector path: every pass reads eight s64 vectors from each input and
+    // produces a single u8 vector of results.
+    while (len >= chunk) {
+        npyv_s64 x1 = npyv_load_s64(lhs + npyv_nlanes_s64 * 0);
+        npyv_s64 y1 = npyv_load_s64(rhs + npyv_nlanes_s64 * 0);
+        npyv_b64 m1 = npyv_cmpneq_s64(x1, y1);
+        npyv_s64 x2 = npyv_load_s64(lhs + npyv_nlanes_s64 * 1);
+        npyv_s64 y2 = npyv_load_s64(rhs + npyv_nlanes_s64 * 1);
+        npyv_b64 m2 = npyv_cmpneq_s64(x2, y2);
+        npyv_s64 x3 = npyv_load_s64(lhs + npyv_nlanes_s64 * 2);
+        npyv_s64 y3 = npyv_load_s64(rhs + npyv_nlanes_s64 * 2);
+        npyv_b64 m3 = npyv_cmpneq_s64(x3, y3);
+        npyv_s64 x4 = npyv_load_s64(lhs + npyv_nlanes_s64 * 3);
+        npyv_s64 y4 = npyv_load_s64(rhs + npyv_nlanes_s64 * 3);
+        npyv_b64 m4 = npyv_cmpneq_s64(x4, y4);
+        npyv_s64 x5 = npyv_load_s64(lhs + npyv_nlanes_s64 * 4);
+        npyv_s64 y5 = npyv_load_s64(rhs + npyv_nlanes_s64 * 4);
+        npyv_b64 m5 = npyv_cmpneq_s64(x5, y5);
+        npyv_s64 x6 = npyv_load_s64(lhs + npyv_nlanes_s64 * 5);
+        npyv_s64 y6 = npyv_load_s64(rhs + npyv_nlanes_s64 * 5);
+        npyv_b64 m6 = npyv_cmpneq_s64(x6, y6);
+        npyv_s64 x7 = npyv_load_s64(lhs + npyv_nlanes_s64 * 6);
+        npyv_s64 y7 = npyv_load_s64(rhs + npyv_nlanes_s64 * 6);
+        npyv_b64 m7 = npyv_cmpneq_s64(x7, y7);
+        npyv_s64 x8 = npyv_load_s64(lhs + npyv_nlanes_s64 * 7);
+        npyv_s64 y8 = npyv_load_s64(rhs + npyv_nlanes_s64 * 7);
+        npyv_b64 m8 = npyv_cmpneq_s64(x8, y8);
+
+        // Pack the eight b64 masks into one b8 mask, convert it to u8 and
+        // AND with 0x1 so each stored byte is 0 or 1.
+        npyv_u8 bytes = npyv_cvt_u8_b8(
+            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        npyv_store_u8(out, npyv_and_u8(bytes, one));
+
+        len -= chunk;
+        lhs += chunk;
+        rhs += chunk;
+        out += chunk;
+    }
+
+    // Scalar tail for the remaining (fewer than `chunk`) elements.
+    while (len > 0) {
+        *out = (*lhs != *rhs);
+        --len;
+        ++lhs;
+        ++rhs;
+        ++out;
+    }
+}
+
+// Computes `dst[i] = (scalar != src[i])` for `len` s64 elements, where the
+// left operand is a single value read once from args[0].
+//
+// args[0]: pointer to the scalar left operand; args[1]: input array;
+// args[2]: byte output array (one 0/1 byte per element).
+//
+// Note: the enclosing guard `#if NPY_SIMD && !((0 || 1) && 1)` is always
+// false, so this definition is never compiled.
+static void simd_binary_scalar1_not_equal_s64(char **args, npy_intp len)
+{
+    npyv_lanetype_s64 lhs_val = *(npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 *in     = (npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_u8 *out     = (npyv_lanetype_u8 *) args[2];
+    const npyv_s64 vlhs       = npyv_setall_s64(lhs_val);
+    const npyv_u8 one         = npyv_setall_u8(0x1);
+    const int chunk           = npyv_nlanes_u8;
+
+    // Vector path: eight s64 vectors of input give one u8 vector of output.
+    while (len >= chunk) {
+        npyv_s64 y1 = npyv_load_s64(in + npyv_nlanes_s64 * 0);
+        npyv_s64 y2 = npyv_load_s64(in + npyv_nlanes_s64 * 1);
+        npyv_s64 y3 = npyv_load_s64(in + npyv_nlanes_s64 * 2);
+        npyv_s64 y4 = npyv_load_s64(in + npyv_nlanes_s64 * 3);
+        npyv_s64 y5 = npyv_load_s64(in + npyv_nlanes_s64 * 4);
+        npyv_s64 y6 = npyv_load_s64(in + npyv_nlanes_s64 * 5);
+        npyv_s64 y7 = npyv_load_s64(in + npyv_nlanes_s64 * 6);
+        npyv_s64 y8 = npyv_load_s64(in + npyv_nlanes_s64 * 7);
+
+        // The broadcast scalar stays on the left of every comparison.
+        npyv_b64 m1 = npyv_cmpneq_s64(vlhs, y1);
+        npyv_b64 m2 = npyv_cmpneq_s64(vlhs, y2);
+        npyv_b64 m3 = npyv_cmpneq_s64(vlhs, y3);
+        npyv_b64 m4 = npyv_cmpneq_s64(vlhs, y4);
+        npyv_b64 m5 = npyv_cmpneq_s64(vlhs, y5);
+        npyv_b64 m6 = npyv_cmpneq_s64(vlhs, y6);
+        npyv_b64 m7 = npyv_cmpneq_s64(vlhs, y7);
+        npyv_b64 m8 = npyv_cmpneq_s64(vlhs, y8);
+
+        // Pack to a byte mask, convert to u8, and reduce each byte to 0/1.
+        npyv_u8 bytes = npyv_cvt_u8_b8(
+            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        npyv_store_u8(out, npyv_and_u8(bytes, one));
+
+        len -= chunk;
+        in  += chunk;
+        out += chunk;
+    }
+
+    // Scalar tail for the remaining (fewer than `chunk`) elements.
+    while (len > 0) {
+        *out = (lhs_val != *in);
+        --len;
+        ++in;
+        ++out;
+    }
+}
+
+// Computes `dst[i] = (src[i] != scalar)` for `len` s64 elements, where the
+// right operand is a single value read once from args[1].
+//
+// args[0]: input array; args[1]: pointer to the scalar right operand;
+// args[2]: byte output array (one 0/1 byte per element).
+//
+// Note: the enclosing guard `#if NPY_SIMD && !((0 || 1) && 1)` is always
+// false, so this definition is never compiled.
+static void simd_binary_scalar2_not_equal_s64(char **args, npy_intp len)
+{
+    npyv_lanetype_s64 *in     = (npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 rhs_val = *(npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_u8 *out     = (npyv_lanetype_u8 *) args[2];
+    const npyv_s64 vrhs       = npyv_setall_s64(rhs_val);
+    const npyv_u8 one         = npyv_setall_u8(0x1);
+    const int chunk           = npyv_nlanes_u8;
+
+    // Vector path: eight s64 vectors of input give one u8 vector of output.
+    while (len >= chunk) {
+        npyv_s64 x1 = npyv_load_s64(in + npyv_nlanes_s64 * 0);
+        npyv_s64 x2 = npyv_load_s64(in + npyv_nlanes_s64 * 1);
+        npyv_s64 x3 = npyv_load_s64(in + npyv_nlanes_s64 * 2);
+        npyv_s64 x4 = npyv_load_s64(in + npyv_nlanes_s64 * 3);
+        npyv_s64 x5 = npyv_load_s64(in + npyv_nlanes_s64 * 4);
+        npyv_s64 x6 = npyv_load_s64(in + npyv_nlanes_s64 * 5);
+        npyv_s64 x7 = npyv_load_s64(in + npyv_nlanes_s64 * 6);
+        npyv_s64 x8 = npyv_load_s64(in + npyv_nlanes_s64 * 7);
+
+        // The broadcast scalar stays on the right of every comparison.
+        npyv_b64 m1 = npyv_cmpneq_s64(x1, vrhs);
+        npyv_b64 m2 = npyv_cmpneq_s64(x2, vrhs);
+        npyv_b64 m3 = npyv_cmpneq_s64(x3, vrhs);
+        npyv_b64 m4 = npyv_cmpneq_s64(x4, vrhs);
+        npyv_b64 m5 = npyv_cmpneq_s64(x5, vrhs);
+        npyv_b64 m6 = npyv_cmpneq_s64(x6, vrhs);
+        npyv_b64 m7 = npyv_cmpneq_s64(x7, vrhs);
+        npyv_b64 m8 = npyv_cmpneq_s64(x8, vrhs);
+
+        // Pack to a byte mask, convert to u8, and reduce each byte to 0/1.
+        npyv_u8 bytes = npyv_cvt_u8_b8(
+            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        npyv_store_u8(out, npyv_and_u8(bytes, one));
+
+        len -= chunk;
+        in  += chunk;
+        out += chunk;
+    }
+
+    // Scalar tail for the remaining (fewer than `chunk`) elements.
+    while (len > 0) {
+        *out = (*in != rhs_val);
+        --len;
+        ++in;
+        ++out;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 0) && 1)
+// Computes `dst[i] = (src1[i] < src2[i])` for `len` s64 elements, storing
+// one byte (0 or 1) per element.
+//
+// args[0], args[1]: the two input arrays; args[2]: the byte output array.
+static void simd_binary_less_s64(char **args, npy_intp len)
+{
+    npyv_lanetype_s64 *lhs = (npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 *rhs = (npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int chunk        = npyv_nlanes_u8;
+
+    // Vector path: every pass reads eight s64 vectors from each input and
+    // produces a single u8 vector of results.
+    while (len >= chunk) {
+        npyv_s64 x1 = npyv_load_s64(lhs + npyv_nlanes_s64 * 0);
+        npyv_s64 y1 = npyv_load_s64(rhs + npyv_nlanes_s64 * 0);
+        npyv_b64 m1 = npyv_cmplt_s64(x1, y1);
+        npyv_s64 x2 = npyv_load_s64(lhs + npyv_nlanes_s64 * 1);
+        npyv_s64 y2 = npyv_load_s64(rhs + npyv_nlanes_s64 * 1);
+        npyv_b64 m2 = npyv_cmplt_s64(x2, y2);
+        npyv_s64 x3 = npyv_load_s64(lhs + npyv_nlanes_s64 * 2);
+        npyv_s64 y3 = npyv_load_s64(rhs + npyv_nlanes_s64 * 2);
+        npyv_b64 m3 = npyv_cmplt_s64(x3, y3);
+        npyv_s64 x4 = npyv_load_s64(lhs + npyv_nlanes_s64 * 3);
+        npyv_s64 y4 = npyv_load_s64(rhs + npyv_nlanes_s64 * 3);
+        npyv_b64 m4 = npyv_cmplt_s64(x4, y4);
+        npyv_s64 x5 = npyv_load_s64(lhs + npyv_nlanes_s64 * 4);
+        npyv_s64 y5 = npyv_load_s64(rhs + npyv_nlanes_s64 * 4);
+        npyv_b64 m5 = npyv_cmplt_s64(x5, y5);
+        npyv_s64 x6 = npyv_load_s64(lhs + npyv_nlanes_s64 * 5);
+        npyv_s64 y6 = npyv_load_s64(rhs + npyv_nlanes_s64 * 5);
+        npyv_b64 m6 = npyv_cmplt_s64(x6, y6);
+        npyv_s64 x7 = npyv_load_s64(lhs + npyv_nlanes_s64 * 6);
+        npyv_s64 y7 = npyv_load_s64(rhs + npyv_nlanes_s64 * 6);
+        npyv_b64 m7 = npyv_cmplt_s64(x7, y7);
+        npyv_s64 x8 = npyv_load_s64(lhs + npyv_nlanes_s64 * 7);
+        npyv_s64 y8 = npyv_load_s64(rhs + npyv_nlanes_s64 * 7);
+        npyv_b64 m8 = npyv_cmplt_s64(x8, y8);
+
+        // Pack the eight b64 masks into one b8 mask, convert it to u8 and
+        // AND with 0x1 so each stored byte is 0 or 1.
+        npyv_u8 bytes = npyv_cvt_u8_b8(
+            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        npyv_store_u8(out, npyv_and_u8(bytes, one));
+
+        len -= chunk;
+        lhs += chunk;
+        rhs += chunk;
+        out += chunk;
+    }
+
+    // Scalar tail for the remaining (fewer than `chunk`) elements.
+    while (len > 0) {
+        *out = (*lhs < *rhs);
+        --len;
+        ++lhs;
+        ++rhs;
+        ++out;
+    }
+}
+
+// Computes `dst[i] = (scalar < src[i])` for `len` s64 elements, where the
+// left operand is a single value read once from args[0].
+//
+// args[0]: pointer to the scalar left operand; args[1]: input array;
+// args[2]: byte output array (one 0/1 byte per element).
+static void simd_binary_scalar1_less_s64(char **args, npy_intp len)
+{
+    npyv_lanetype_s64 lhs_val = *(npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 *in     = (npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_u8 *out     = (npyv_lanetype_u8 *) args[2];
+    const npyv_s64 vlhs       = npyv_setall_s64(lhs_val);
+    const npyv_u8 one         = npyv_setall_u8(0x1);
+    const int chunk           = npyv_nlanes_u8;
+
+    // Vector path: eight s64 vectors of input give one u8 vector of output.
+    while (len >= chunk) {
+        npyv_s64 y1 = npyv_load_s64(in + npyv_nlanes_s64 * 0);
+        npyv_s64 y2 = npyv_load_s64(in + npyv_nlanes_s64 * 1);
+        npyv_s64 y3 = npyv_load_s64(in + npyv_nlanes_s64 * 2);
+        npyv_s64 y4 = npyv_load_s64(in + npyv_nlanes_s64 * 3);
+        npyv_s64 y5 = npyv_load_s64(in + npyv_nlanes_s64 * 4);
+        npyv_s64 y6 = npyv_load_s64(in + npyv_nlanes_s64 * 5);
+        npyv_s64 y7 = npyv_load_s64(in + npyv_nlanes_s64 * 6);
+        npyv_s64 y8 = npyv_load_s64(in + npyv_nlanes_s64 * 7);
+
+        // Operand order matters here: the broadcast scalar is the left side.
+        npyv_b64 m1 = npyv_cmplt_s64(vlhs, y1);
+        npyv_b64 m2 = npyv_cmplt_s64(vlhs, y2);
+        npyv_b64 m3 = npyv_cmplt_s64(vlhs, y3);
+        npyv_b64 m4 = npyv_cmplt_s64(vlhs, y4);
+        npyv_b64 m5 = npyv_cmplt_s64(vlhs, y5);
+        npyv_b64 m6 = npyv_cmplt_s64(vlhs, y6);
+        npyv_b64 m7 = npyv_cmplt_s64(vlhs, y7);
+        npyv_b64 m8 = npyv_cmplt_s64(vlhs, y8);
+
+        // Pack to a byte mask, convert to u8, and reduce each byte to 0/1.
+        npyv_u8 bytes = npyv_cvt_u8_b8(
+            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        npyv_store_u8(out, npyv_and_u8(bytes, one));
+
+        len -= chunk;
+        in  += chunk;
+        out += chunk;
+    }
+
+    // Scalar tail for the remaining (fewer than `chunk`) elements.
+    while (len > 0) {
+        *out = (lhs_val < *in);
+        --len;
+        ++in;
+        ++out;
+    }
+}
+
+// Computes `dst[i] = (src[i] < scalar)` for `len` s64 elements, where the
+// right operand is a single value read once from args[1].
+//
+// args[0]: input array; args[1]: pointer to the scalar right operand;
+// args[2]: byte output array (one 0/1 byte per element).
+static void simd_binary_scalar2_less_s64(char **args, npy_intp len)
+{
+    npyv_lanetype_s64 *in     = (npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 rhs_val = *(npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_u8 *out     = (npyv_lanetype_u8 *) args[2];
+    const npyv_s64 vrhs       = npyv_setall_s64(rhs_val);
+    const npyv_u8 one         = npyv_setall_u8(0x1);
+    const int chunk           = npyv_nlanes_u8;
+
+    // Vector path: eight s64 vectors of input give one u8 vector of output.
+    while (len >= chunk) {
+        npyv_s64 x1 = npyv_load_s64(in + npyv_nlanes_s64 * 0);
+        npyv_s64 x2 = npyv_load_s64(in + npyv_nlanes_s64 * 1);
+        npyv_s64 x3 = npyv_load_s64(in + npyv_nlanes_s64 * 2);
+        npyv_s64 x4 = npyv_load_s64(in + npyv_nlanes_s64 * 3);
+        npyv_s64 x5 = npyv_load_s64(in + npyv_nlanes_s64 * 4);
+        npyv_s64 x6 = npyv_load_s64(in + npyv_nlanes_s64 * 5);
+        npyv_s64 x7 = npyv_load_s64(in + npyv_nlanes_s64 * 6);
+        npyv_s64 x8 = npyv_load_s64(in + npyv_nlanes_s64 * 7);
+
+        // Operand order matters here: the broadcast scalar is the right side.
+        npyv_b64 m1 = npyv_cmplt_s64(x1, vrhs);
+        npyv_b64 m2 = npyv_cmplt_s64(x2, vrhs);
+        npyv_b64 m3 = npyv_cmplt_s64(x3, vrhs);
+        npyv_b64 m4 = npyv_cmplt_s64(x4, vrhs);
+        npyv_b64 m5 = npyv_cmplt_s64(x5, vrhs);
+        npyv_b64 m6 = npyv_cmplt_s64(x6, vrhs);
+        npyv_b64 m7 = npyv_cmplt_s64(x7, vrhs);
+        npyv_b64 m8 = npyv_cmplt_s64(x8, vrhs);
+
+        // Pack to a byte mask, convert to u8, and reduce each byte to 0/1.
+        npyv_u8 bytes = npyv_cvt_u8_b8(
+            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        npyv_store_u8(out, npyv_and_u8(bytes, one));
+
+        len -= chunk;
+        in  += chunk;
+        out += chunk;
+    }
+
+    // Scalar tail for the remaining (fewer than `chunk`) elements.
+    while (len > 0) {
+        *out = (*in < rhs_val);
+        --len;
+        ++in;
+        ++out;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD && !((0 || 0) && 1)
+// Computes `dst[i] = (src1[i] <= src2[i])` for `len` s64 elements, storing
+// one byte (0 or 1) per element.
+//
+// args[0], args[1]: the two input arrays; args[2]: the byte output array.
+static void simd_binary_less_equal_s64(char **args, npy_intp len)
+{
+    npyv_lanetype_s64 *lhs = (npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 *rhs = (npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int chunk        = npyv_nlanes_u8;
+
+    // Vector path: every pass reads eight s64 vectors from each input and
+    // produces a single u8 vector of results.
+    while (len >= chunk) {
+        npyv_s64 x1 = npyv_load_s64(lhs + npyv_nlanes_s64 * 0);
+        npyv_s64 y1 = npyv_load_s64(rhs + npyv_nlanes_s64 * 0);
+        npyv_b64 m1 = npyv_cmple_s64(x1, y1);
+        npyv_s64 x2 = npyv_load_s64(lhs + npyv_nlanes_s64 * 1);
+        npyv_s64 y2 = npyv_load_s64(rhs + npyv_nlanes_s64 * 1);
+        npyv_b64 m2 = npyv_cmple_s64(x2, y2);
+        npyv_s64 x3 = npyv_load_s64(lhs + npyv_nlanes_s64 * 2);
+        npyv_s64 y3 = npyv_load_s64(rhs + npyv_nlanes_s64 * 2);
+        npyv_b64 m3 = npyv_cmple_s64(x3, y3);
+        npyv_s64 x4 = npyv_load_s64(lhs + npyv_nlanes_s64 * 3);
+        npyv_s64 y4 = npyv_load_s64(rhs + npyv_nlanes_s64 * 3);
+        npyv_b64 m4 = npyv_cmple_s64(x4, y4);
+        npyv_s64 x5 = npyv_load_s64(lhs + npyv_nlanes_s64 * 4);
+        npyv_s64 y5 = npyv_load_s64(rhs + npyv_nlanes_s64 * 4);
+        npyv_b64 m5 = npyv_cmple_s64(x5, y5);
+        npyv_s64 x6 = npyv_load_s64(lhs + npyv_nlanes_s64 * 5);
+        npyv_s64 y6 = npyv_load_s64(rhs + npyv_nlanes_s64 * 5);
+        npyv_b64 m6 = npyv_cmple_s64(x6, y6);
+        npyv_s64 x7 = npyv_load_s64(lhs + npyv_nlanes_s64 * 6);
+        npyv_s64 y7 = npyv_load_s64(rhs + npyv_nlanes_s64 * 6);
+        npyv_b64 m7 = npyv_cmple_s64(x7, y7);
+        npyv_s64 x8 = npyv_load_s64(lhs + npyv_nlanes_s64 * 7);
+        npyv_s64 y8 = npyv_load_s64(rhs + npyv_nlanes_s64 * 7);
+        npyv_b64 m8 = npyv_cmple_s64(x8, y8);
+
+        // Pack the eight b64 masks into one b8 mask, convert it to u8 and
+        // AND with 0x1 so each stored byte is 0 or 1.
+        npyv_u8 bytes = npyv_cvt_u8_b8(
+            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        npyv_store_u8(out, npyv_and_u8(bytes, one));
+
+        len -= chunk;
+        lhs += chunk;
+        rhs += chunk;
+        out += chunk;
+    }
+
+    // Scalar tail for the remaining (fewer than `chunk`) elements.
+    while (len > 0) {
+        *out = (*lhs <= *rhs);
+        --len;
+        ++lhs;
+        ++rhs;
+        ++out;
+    }
+}
+
+// Computes `dst[i] = (scalar <= src[i])` for `len` s64 elements, where the
+// left operand is a single value read once from args[0].
+//
+// args[0]: pointer to the scalar left operand; args[1]: input array;
+// args[2]: byte output array (one 0/1 byte per element).
+static void simd_binary_scalar1_less_equal_s64(char **args, npy_intp len)
+{
+    npyv_lanetype_s64 lhs_val = *(npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 *in     = (npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_u8 *out     = (npyv_lanetype_u8 *) args[2];
+    const npyv_s64 vlhs       = npyv_setall_s64(lhs_val);
+    const npyv_u8 one         = npyv_setall_u8(0x1);
+    const int chunk           = npyv_nlanes_u8;
+
+    // Vector path: eight s64 vectors of input give one u8 vector of output.
+    while (len >= chunk) {
+        npyv_s64 y1 = npyv_load_s64(in + npyv_nlanes_s64 * 0);
+        npyv_s64 y2 = npyv_load_s64(in + npyv_nlanes_s64 * 1);
+        npyv_s64 y3 = npyv_load_s64(in + npyv_nlanes_s64 * 2);
+        npyv_s64 y4 = npyv_load_s64(in + npyv_nlanes_s64 * 3);
+        npyv_s64 y5 = npyv_load_s64(in + npyv_nlanes_s64 * 4);
+        npyv_s64 y6 = npyv_load_s64(in + npyv_nlanes_s64 * 5);
+        npyv_s64 y7 = npyv_load_s64(in + npyv_nlanes_s64 * 6);
+        npyv_s64 y8 = npyv_load_s64(in + npyv_nlanes_s64 * 7);
+
+        // Operand order matters here: the broadcast scalar is the left side.
+        npyv_b64 m1 = npyv_cmple_s64(vlhs, y1);
+        npyv_b64 m2 = npyv_cmple_s64(vlhs, y2);
+        npyv_b64 m3 = npyv_cmple_s64(vlhs, y3);
+        npyv_b64 m4 = npyv_cmple_s64(vlhs, y4);
+        npyv_b64 m5 = npyv_cmple_s64(vlhs, y5);
+        npyv_b64 m6 = npyv_cmple_s64(vlhs, y6);
+        npyv_b64 m7 = npyv_cmple_s64(vlhs, y7);
+        npyv_b64 m8 = npyv_cmple_s64(vlhs, y8);
+
+        // Pack to a byte mask, convert to u8, and reduce each byte to 0/1.
+        npyv_u8 bytes = npyv_cvt_u8_b8(
+            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        npyv_store_u8(out, npyv_and_u8(bytes, one));
+
+        len -= chunk;
+        in  += chunk;
+        out += chunk;
+    }
+
+    // Scalar tail for the remaining (fewer than `chunk`) elements.
+    while (len > 0) {
+        *out = (lhs_val <= *in);
+        --len;
+        ++in;
+        ++out;
+    }
+}
+
+// Computes `dst[i] = (src[i] <= scalar)` for `len` s64 elements, where the
+// right operand is a single value read once from args[1].
+//
+// args[0]: input array; args[1]: pointer to the scalar right operand;
+// args[2]: byte output array (one 0/1 byte per element).
+static void simd_binary_scalar2_less_equal_s64(char **args, npy_intp len)
+{
+    npyv_lanetype_s64 *in     = (npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 rhs_val = *(npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_u8 *out     = (npyv_lanetype_u8 *) args[2];
+    const npyv_s64 vrhs       = npyv_setall_s64(rhs_val);
+    const npyv_u8 one         = npyv_setall_u8(0x1);
+    const int chunk           = npyv_nlanes_u8;
+
+    // Vector path: eight s64 vectors of input give one u8 vector of output.
+    while (len >= chunk) {
+        npyv_s64 x1 = npyv_load_s64(in + npyv_nlanes_s64 * 0);
+        npyv_s64 x2 = npyv_load_s64(in + npyv_nlanes_s64 * 1);
+        npyv_s64 x3 = npyv_load_s64(in + npyv_nlanes_s64 * 2);
+        npyv_s64 x4 = npyv_load_s64(in + npyv_nlanes_s64 * 3);
+        npyv_s64 x5 = npyv_load_s64(in + npyv_nlanes_s64 * 4);
+        npyv_s64 x6 = npyv_load_s64(in + npyv_nlanes_s64 * 5);
+        npyv_s64 x7 = npyv_load_s64(in + npyv_nlanes_s64 * 6);
+        npyv_s64 x8 = npyv_load_s64(in + npyv_nlanes_s64 * 7);
+
+        // Operand order matters here: the broadcast scalar is the right side.
+        npyv_b64 m1 = npyv_cmple_s64(x1, vrhs);
+        npyv_b64 m2 = npyv_cmple_s64(x2, vrhs);
+        npyv_b64 m3 = npyv_cmple_s64(x3, vrhs);
+        npyv_b64 m4 = npyv_cmple_s64(x4, vrhs);
+        npyv_b64 m5 = npyv_cmple_s64(x5, vrhs);
+        npyv_b64 m6 = npyv_cmple_s64(x6, vrhs);
+        npyv_b64 m7 = npyv_cmple_s64(x7, vrhs);
+        npyv_b64 m8 = npyv_cmple_s64(x8, vrhs);
+
+        // Pack to a byte mask, convert to u8, and reduce each byte to 0/1.
+        npyv_u8 bytes = npyv_cvt_u8_b8(
+            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        npyv_store_u8(out, npyv_and_u8(bytes, one));
+
+        len -= chunk;
+        in  += chunk;
+        out += chunk;
+    }
+
+    // Scalar tail for the remaining (fewer than `chunk`) elements.
+    while (len > 0) {
+        *out = (*in <= rhs_val);
+        --len;
+        ++in;
+        ++out;
+    }
+}
+#endif
+
+
+
+#line 28
+#line 35
+#if NPY_SIMD_F32 && !((1 || 0) && 0)
+// Computes `dst[i] = (src1[i] == src2[i])` for `len` f32 elements, storing
+// one byte (0 or 1) per element.
+//
+// args[0], args[1]: the two input arrays; args[2]: the byte output array.
+static void simd_binary_equal_f32(char **args, npy_intp len)
+{
+    npyv_lanetype_f32 *lhs = (npyv_lanetype_f32 *) args[0];
+    npyv_lanetype_f32 *rhs = (npyv_lanetype_f32 *) args[1];
+    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int chunk        = npyv_nlanes_u8;
+
+    // Vector path: every pass reads four f32 vectors from each input and
+    // produces a single u8 vector of results.
+    while (len >= chunk) {
+        npyv_f32 x1 = npyv_load_f32(lhs + npyv_nlanes_f32 * 0);
+        npyv_f32 y1 = npyv_load_f32(rhs + npyv_nlanes_f32 * 0);
+        npyv_b32 m1 = npyv_cmpeq_f32(x1, y1);
+        npyv_f32 x2 = npyv_load_f32(lhs + npyv_nlanes_f32 * 1);
+        npyv_f32 y2 = npyv_load_f32(rhs + npyv_nlanes_f32 * 1);
+        npyv_b32 m2 = npyv_cmpeq_f32(x2, y2);
+        npyv_f32 x3 = npyv_load_f32(lhs + npyv_nlanes_f32 * 2);
+        npyv_f32 y3 = npyv_load_f32(rhs + npyv_nlanes_f32 * 2);
+        npyv_b32 m3 = npyv_cmpeq_f32(x3, y3);
+        npyv_f32 x4 = npyv_load_f32(lhs + npyv_nlanes_f32 * 3);
+        npyv_f32 y4 = npyv_load_f32(rhs + npyv_nlanes_f32 * 3);
+        npyv_b32 m4 = npyv_cmpeq_f32(x4, y4);
+
+        // Pack the four b32 masks into one b8 mask, convert it to u8 and
+        // AND with 0x1 so each stored byte is 0 or 1.
+        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
+        npyv_store_u8(out, npyv_and_u8(bytes, one));
+
+        len -= chunk;
+        lhs += chunk;
+        rhs += chunk;
+        out += chunk;
+    }
+
+    // Scalar tail for the remaining (fewer than `chunk`) elements.
+    while (len > 0) {
+        *out = (*lhs == *rhs);
+        --len;
+        ++lhs;
+        ++rhs;
+        ++out;
+    }
+}
+
+// Computes `dst[i] = (scalar == src[i])` for `len` f32 elements, where the
+// left operand is a single value read once from args[0].
+//
+// args[0]: pointer to the scalar left operand; args[1]: input array;
+// args[2]: byte output array (one 0/1 byte per element).
+static void simd_binary_scalar1_equal_f32(char **args, npy_intp len)
+{
+    npyv_lanetype_f32 lhs_val = *(npyv_lanetype_f32 *) args[0];
+    npyv_lanetype_f32 *in     = (npyv_lanetype_f32 *) args[1];
+    npyv_lanetype_u8 *out     = (npyv_lanetype_u8 *) args[2];
+    const npyv_f32 vlhs       = npyv_setall_f32(lhs_val);
+    const npyv_u8 one         = npyv_setall_u8(0x1);
+    const int chunk           = npyv_nlanes_u8;
+
+    // Vector path: four f32 vectors of input give one u8 vector of output.
+    while (len >= chunk) {
+        npyv_f32 y1 = npyv_load_f32(in + npyv_nlanes_f32 * 0);
+        npyv_f32 y2 = npyv_load_f32(in + npyv_nlanes_f32 * 1);
+        npyv_f32 y3 = npyv_load_f32(in + npyv_nlanes_f32 * 2);
+        npyv_f32 y4 = npyv_load_f32(in + npyv_nlanes_f32 * 3);
+
+        // The broadcast scalar stays on the left of every comparison.
+        npyv_b32 m1 = npyv_cmpeq_f32(vlhs, y1);
+        npyv_b32 m2 = npyv_cmpeq_f32(vlhs, y2);
+        npyv_b32 m3 = npyv_cmpeq_f32(vlhs, y3);
+        npyv_b32 m4 = npyv_cmpeq_f32(vlhs, y4);
+
+        // Pack to a byte mask, convert to u8, and reduce each byte to 0/1.
+        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
+        npyv_store_u8(out, npyv_and_u8(bytes, one));
+
+        len -= chunk;
+        in  += chunk;
+        out += chunk;
+    }
+
+    // Scalar tail for the remaining (fewer than `chunk`) elements.
+    while (len > 0) {
+        *out = (lhs_val == *in);
+        --len;
+        ++in;
+        ++out;
+    }
+}
+
+// Computes `dst[i] = (src[i] == scalar)` for `len` f32 elements, where the
+// right operand is a single value read once from args[1].
+//
+// args[0]: input array; args[1]: pointer to the scalar right operand;
+// args[2]: byte output array (one 0/1 byte per element).
+static void simd_binary_scalar2_equal_f32(char **args, npy_intp len)
+{
+    npyv_lanetype_f32 *in     = (npyv_lanetype_f32 *) args[0];
+    npyv_lanetype_f32 rhs_val = *(npyv_lanetype_f32 *) args[1];
+    npyv_lanetype_u8 *out     = (npyv_lanetype_u8 *) args[2];
+    const npyv_f32 vrhs       = npyv_setall_f32(rhs_val);
+    const npyv_u8 one         = npyv_setall_u8(0x1);
+    const int chunk           = npyv_nlanes_u8;
+
+    // Vector path: four f32 vectors of input give one u8 vector of output.
+    while (len >= chunk) {
+        npyv_f32 x1 = npyv_load_f32(in + npyv_nlanes_f32 * 0);
+        npyv_f32 x2 = npyv_load_f32(in + npyv_nlanes_f32 * 1);
+        npyv_f32 x3 = npyv_load_f32(in + npyv_nlanes_f32 * 2);
+        npyv_f32 x4 = npyv_load_f32(in + npyv_nlanes_f32 * 3);
+
+        // The broadcast scalar stays on the right of every comparison.
+        npyv_b32 m1 = npyv_cmpeq_f32(x1, vrhs);
+        npyv_b32 m2 = npyv_cmpeq_f32(x2, vrhs);
+        npyv_b32 m3 = npyv_cmpeq_f32(x3, vrhs);
+        npyv_b32 m4 = npyv_cmpeq_f32(x4, vrhs);
+
+        // Pack to a byte mask, convert to u8, and reduce each byte to 0/1.
+        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
+        npyv_store_u8(out, npyv_and_u8(bytes, one));
+
+        len -= chunk;
+        in  += chunk;
+        out += chunk;
+    }
+
+    // Scalar tail for the remaining (fewer than `chunk`) elements.
+    while (len > 0) {
+        *out = (*in == rhs_val);
+        --len;
+        ++in;
+        ++out;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD_F32 && !((0 || 1) && 0)
+// Computes `dst[i] = (src1[i] != src2[i])` for `len` f32 elements, storing
+// one byte (0 or 1) per element.
+//
+// args[0], args[1]: the two input arrays; args[2]: the byte output array.
+static void simd_binary_not_equal_f32(char **args, npy_intp len)
+{
+    npyv_lanetype_f32 *lhs = (npyv_lanetype_f32 *) args[0];
+    npyv_lanetype_f32 *rhs = (npyv_lanetype_f32 *) args[1];
+    npyv_lanetype_u8 *out  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 one      = npyv_setall_u8(0x1);
+    const int chunk        = npyv_nlanes_u8;
+
+    // Vector path: every pass reads four f32 vectors from each input and
+    // produces a single u8 vector of results.
+    while (len >= chunk) {
+        npyv_f32 x1 = npyv_load_f32(lhs + npyv_nlanes_f32 * 0);
+        npyv_f32 y1 = npyv_load_f32(rhs + npyv_nlanes_f32 * 0);
+        npyv_b32 m1 = npyv_cmpneq_f32(x1, y1);
+        npyv_f32 x2 = npyv_load_f32(lhs + npyv_nlanes_f32 * 1);
+        npyv_f32 y2 = npyv_load_f32(rhs + npyv_nlanes_f32 * 1);
+        npyv_b32 m2 = npyv_cmpneq_f32(x2, y2);
+        npyv_f32 x3 = npyv_load_f32(lhs + npyv_nlanes_f32 * 2);
+        npyv_f32 y3 = npyv_load_f32(rhs + npyv_nlanes_f32 * 2);
+        npyv_b32 m3 = npyv_cmpneq_f32(x3, y3);
+        npyv_f32 x4 = npyv_load_f32(lhs + npyv_nlanes_f32 * 3);
+        npyv_f32 y4 = npyv_load_f32(rhs + npyv_nlanes_f32 * 3);
+        npyv_b32 m4 = npyv_cmpneq_f32(x4, y4);
+
+        // Pack the four b32 masks into one b8 mask, convert it to u8 and
+        // AND with 0x1 so each stored byte is 0 or 1.
+        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
+        npyv_store_u8(out, npyv_and_u8(bytes, one));
+
+        len -= chunk;
+        lhs += chunk;
+        rhs += chunk;
+        out += chunk;
+    }
+
+    // Scalar tail for the remaining (fewer than `chunk`) elements.
+    while (len > 0) {
+        *out = (*lhs != *rhs);
+        --len;
+        ++lhs;
+        ++rhs;
+        ++out;
+    }
+}
+
+// Computes `dst[i] = (scalar != src[i])` for `len` f32 elements, where the
+// left operand is a single value read once from args[0].
+//
+// args[0]: pointer to the scalar left operand; args[1]: input array;
+// args[2]: byte output array (one 0/1 byte per element).
+static void simd_binary_scalar1_not_equal_f32(char **args, npy_intp len)
+{
+    npyv_lanetype_f32 lhs_val = *(npyv_lanetype_f32 *) args[0];
+    npyv_lanetype_f32 *in     = (npyv_lanetype_f32 *) args[1];
+    npyv_lanetype_u8 *out     = (npyv_lanetype_u8 *) args[2];
+    const npyv_f32 vlhs       = npyv_setall_f32(lhs_val);
+    const npyv_u8 one         = npyv_setall_u8(0x1);
+    const int chunk           = npyv_nlanes_u8;
+
+    // Vector path: four f32 vectors of input give one u8 vector of output.
+    while (len >= chunk) {
+        npyv_f32 y1 = npyv_load_f32(in + npyv_nlanes_f32 * 0);
+        npyv_f32 y2 = npyv_load_f32(in + npyv_nlanes_f32 * 1);
+        npyv_f32 y3 = npyv_load_f32(in + npyv_nlanes_f32 * 2);
+        npyv_f32 y4 = npyv_load_f32(in + npyv_nlanes_f32 * 3);
+
+        // The broadcast scalar stays on the left of every comparison.
+        npyv_b32 m1 = npyv_cmpneq_f32(vlhs, y1);
+        npyv_b32 m2 = npyv_cmpneq_f32(vlhs, y2);
+        npyv_b32 m3 = npyv_cmpneq_f32(vlhs, y3);
+        npyv_b32 m4 = npyv_cmpneq_f32(vlhs, y4);
+
+        // Pack to a byte mask, convert to u8, and reduce each byte to 0/1.
+        npyv_u8 bytes = npyv_cvt_u8_b8(npyv_pack_b8_b32(m1, m2, m3, m4));
+        npyv_store_u8(out, npyv_and_u8(bytes, one));
+
+        len -= chunk;
+        in  += chunk;
+        out += chunk;
+    }
+
+    // Scalar tail for the remaining (fewer than `chunk`) elements.
+    while (len > 0) {
+        *out = (lhs_val != *in);
+        --len;
+        ++in;
+        ++out;
+    }
+}
+
+static void simd_binary_scalar2_not_equal_f32(char **args, npy_intp len)
+{ // Element-wise src[i] != scalar over len elements; args = {src, &scalar, dst}.
+    npyv_lanetype_f32 *src   = (npyv_lanetype_f32 *) args[0];
+    npyv_lanetype_f32 scalar = *(npyv_lanetype_f32 *) args[1];
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2]; // one result byte per element
+    const npyv_f32 b         = npyv_setall_f32(scalar); // scalar operand for every vector compare
+    const npyv_u8 truemask     = npyv_setall_u8(0x1); // ANDed with each packed result before the store
+    const int vstep            = npyv_nlanes_u8; // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 32 >= 8
+        npyv_f32  a1 = npyv_load_f32(src + npyv_nlanes_f32 * 0);
+        npyv_b32 c1 = npyv_cmpneq_f32(a1, b);
+#if 32 >= 16
+        npyv_f32  a2 = npyv_load_f32(src + npyv_nlanes_f32 * 1);
+        npyv_b32 c2 = npyv_cmpneq_f32(a2, b);
+#if 32 >= 32 // NOTE(review): four f32 vectors per step; presumably 4 * npyv_nlanes_f32 == npyv_nlanes_u8 — confirm
+        npyv_f32  a3 = npyv_load_f32(src + npyv_nlanes_f32 * 2);
+        npyv_f32  a4 = npyv_load_f32(src + npyv_nlanes_f32 * 3);
+        npyv_b32 c3 = npyv_cmpneq_f32(a3, b);
+        npyv_b32 c4 = npyv_cmpneq_f32(a4, b);
+#if 32 == 64 // false for f32, so the a5..a8 / c5..c8 code below is compiled out
+        npyv_f32  a5 = npyv_load_f32(src + npyv_nlanes_f32 * 4);
+        npyv_f32  a6 = npyv_load_f32(src + npyv_nlanes_f32 * 5);
+        npyv_f32  a7 = npyv_load_f32(src + npyv_nlanes_f32 * 6);
+        npyv_f32  a8 = npyv_load_f32(src + npyv_nlanes_f32 * 7);
+        npyv_b32 c5 = npyv_cmpneq_f32(a5, b);
+        npyv_b32 c6 = npyv_cmpneq_f32(a6, b);
+        npyv_b32 c7 = npyv_cmpneq_f32(a7, b);
+        npyv_b32 c8 = npyv_cmpneq_f32(a8, b);
+#endif // 32 == 64
+#endif // 32 >= 32
+#endif // 32 >= 16
+#endif // 32 >= 8
+
+#if 32 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 32 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 32 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 32 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) { // scalar tail for the remaining len < vstep elements
+        const npyv_lanetype_f32 a = *src;
+        *dst = a != scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD_F32 && !((0 || 0) && 0)
+static void simd_binary_less_f32(char **args, npy_intp len)
+{ // Element-wise src1[i] < src2[i] over len elements; args = {src1, src2, dst}.
+    npyv_lanetype_f32 *src1 = (npyv_lanetype_f32 *) args[0];
+    npyv_lanetype_f32 *src2 = (npyv_lanetype_f32 *) args[1];
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2]; // one result byte per element
+    const npyv_u8 truemask    = npyv_setall_u8(0x1); // ANDed with each packed result before the store
+    const int vstep           = npyv_nlanes_u8; // elements consumed per vector iteration
+
+    // Unrolled vector loop: each iteration stores one u8 vector covering 'vstep' elements.
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 32 >= 8
+        npyv_f32  a1 = npyv_load_f32(src1 + npyv_nlanes_f32 * 0);
+        npyv_f32  b1 = npyv_load_f32(src2 + npyv_nlanes_f32 * 0);
+        npyv_b32 c1 = npyv_cmplt_f32(a1, b1);
+#if 32 >= 16
+        npyv_f32  a2 = npyv_load_f32(src1 + npyv_nlanes_f32 * 1);
+        npyv_f32  b2 = npyv_load_f32(src2 + npyv_nlanes_f32 * 1);
+        npyv_b32 c2 = npyv_cmplt_f32(a2, b2);
+#if 32 >= 32 // NOTE(review): four f32 vectors per operand per step; presumably 4 * npyv_nlanes_f32 == npyv_nlanes_u8 — confirm
+        npyv_f32  a3 = npyv_load_f32(src1 + npyv_nlanes_f32 * 2);
+        npyv_f32  b3 = npyv_load_f32(src2 + npyv_nlanes_f32 * 2);
+        npyv_f32  a4 = npyv_load_f32(src1 + npyv_nlanes_f32 * 3);
+        npyv_f32  b4 = npyv_load_f32(src2 + npyv_nlanes_f32 * 3);
+        npyv_b32 c3 = npyv_cmplt_f32(a3, b3);
+        npyv_b32 c4 = npyv_cmplt_f32(a4, b4);
+#if 32 == 64 // false for f32, so the a5..a8 / b5..b8 / c5..c8 code below is compiled out
+        npyv_f32  a5 = npyv_load_f32(src1 + npyv_nlanes_f32 * 4);
+        npyv_f32  b5 = npyv_load_f32(src2 + npyv_nlanes_f32 * 4);
+        npyv_f32  a6 = npyv_load_f32(src1 + npyv_nlanes_f32 * 5);
+        npyv_f32  b6 = npyv_load_f32(src2 + npyv_nlanes_f32 * 5);
+        npyv_f32  a7 = npyv_load_f32(src1 + npyv_nlanes_f32 * 6);
+        npyv_f32  b7 = npyv_load_f32(src2 + npyv_nlanes_f32 * 6);
+        npyv_f32  a8 = npyv_load_f32(src1 + npyv_nlanes_f32 * 7);
+        npyv_f32  b8 = npyv_load_f32(src2 + npyv_nlanes_f32 * 7);
+        npyv_b32 c5 = npyv_cmplt_f32(a5, b5);
+        npyv_b32 c6 = npyv_cmplt_f32(a6, b6);
+        npyv_b32 c7 = npyv_cmplt_f32(a7, b7);
+        npyv_b32 c8 = npyv_cmplt_f32(a8, b8);
+#endif // 32 == 64
+#endif // 32 >= 32
+#endif // 32 >= 16
+#endif // 32 >= 8
+
+        // Pack the 'c' vectors into a single vector 'r'
+#if 32 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 32 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 32 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 32 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst) { // scalar tail for the remaining len < vstep elements
+        const npyv_lanetype_f32 a = *src1;
+        const npyv_lanetype_f32 b = *src2;
+        *dst = a < b;
+    }
+}
+
+static void simd_binary_scalar1_less_f32(char **args, npy_intp len)
+{ // Element-wise scalar < src[i] over len elements; args = {&scalar, src, dst}.
+    npyv_lanetype_f32 scalar = *(npyv_lanetype_f32 *) args[0];
+    npyv_lanetype_f32 *src   = (npyv_lanetype_f32 *) args[1];
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2]; // one result byte per element
+    const npyv_f32 a         = npyv_setall_f32(scalar); // scalar operand for every vector compare
+    const npyv_u8 truemask     = npyv_setall_u8(0x1); // ANDed with each packed result before the store
+    const int vstep            = npyv_nlanes_u8; // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 32 >= 8
+        npyv_f32  b1 = npyv_load_f32(src + npyv_nlanes_f32 * 0);
+        npyv_b32 c1 = npyv_cmplt_f32(a, b1);
+#if 32 >= 16
+        npyv_f32  b2 = npyv_load_f32(src + npyv_nlanes_f32 * 1);
+        npyv_b32 c2 = npyv_cmplt_f32(a, b2);
+#if 32 >= 32 // NOTE(review): four f32 vectors per step; presumably 4 * npyv_nlanes_f32 == npyv_nlanes_u8 — confirm
+        npyv_f32  b3 = npyv_load_f32(src + npyv_nlanes_f32 * 2);
+        npyv_f32  b4 = npyv_load_f32(src + npyv_nlanes_f32 * 3);
+        npyv_b32 c3 = npyv_cmplt_f32(a, b3);
+        npyv_b32 c4 = npyv_cmplt_f32(a, b4);
+#if 32 == 64 // false for f32, so the b5..b8 / c5..c8 code below is compiled out
+        npyv_f32  b5 = npyv_load_f32(src + npyv_nlanes_f32 * 4);
+        npyv_f32  b6 = npyv_load_f32(src + npyv_nlanes_f32 * 5);
+        npyv_f32  b7 = npyv_load_f32(src + npyv_nlanes_f32 * 6);
+        npyv_f32  b8 = npyv_load_f32(src + npyv_nlanes_f32 * 7);
+        npyv_b32 c5 = npyv_cmplt_f32(a, b5);
+        npyv_b32 c6 = npyv_cmplt_f32(a, b6);
+        npyv_b32 c7 = npyv_cmplt_f32(a, b7);
+        npyv_b32 c8 = npyv_cmplt_f32(a, b8);
+#endif // 32 == 64
+#endif // 32 >= 32
+#endif // 32 >= 16
+#endif // 32 >= 8
+
+#if 32 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 32 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 32 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 32 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) { // scalar tail for the remaining len < vstep elements
+        const npyv_lanetype_f32 b = *src;
+        *dst = scalar < b;
+    }
+}
+
+static void simd_binary_scalar2_less_f32(char **args, npy_intp len)
+{ // Element-wise src[i] < scalar over len elements; args = {src, &scalar, dst}.
+    npyv_lanetype_f32 *src   = (npyv_lanetype_f32 *) args[0];
+    npyv_lanetype_f32 scalar = *(npyv_lanetype_f32 *) args[1];
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2]; // one result byte per element
+    const npyv_f32 b         = npyv_setall_f32(scalar); // scalar operand for every vector compare
+    const npyv_u8 truemask     = npyv_setall_u8(0x1); // ANDed with each packed result before the store
+    const int vstep            = npyv_nlanes_u8; // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 32 >= 8
+        npyv_f32  a1 = npyv_load_f32(src + npyv_nlanes_f32 * 0);
+        npyv_b32 c1 = npyv_cmplt_f32(a1, b);
+#if 32 >= 16
+        npyv_f32  a2 = npyv_load_f32(src + npyv_nlanes_f32 * 1);
+        npyv_b32 c2 = npyv_cmplt_f32(a2, b);
+#if 32 >= 32 // NOTE(review): four f32 vectors per step; presumably 4 * npyv_nlanes_f32 == npyv_nlanes_u8 — confirm
+        npyv_f32  a3 = npyv_load_f32(src + npyv_nlanes_f32 * 2);
+        npyv_f32  a4 = npyv_load_f32(src + npyv_nlanes_f32 * 3);
+        npyv_b32 c3 = npyv_cmplt_f32(a3, b);
+        npyv_b32 c4 = npyv_cmplt_f32(a4, b);
+#if 32 == 64 // false for f32, so the a5..a8 / c5..c8 code below is compiled out
+        npyv_f32  a5 = npyv_load_f32(src + npyv_nlanes_f32 * 4);
+        npyv_f32  a6 = npyv_load_f32(src + npyv_nlanes_f32 * 5);
+        npyv_f32  a7 = npyv_load_f32(src + npyv_nlanes_f32 * 6);
+        npyv_f32  a8 = npyv_load_f32(src + npyv_nlanes_f32 * 7);
+        npyv_b32 c5 = npyv_cmplt_f32(a5, b);
+        npyv_b32 c6 = npyv_cmplt_f32(a6, b);
+        npyv_b32 c7 = npyv_cmplt_f32(a7, b);
+        npyv_b32 c8 = npyv_cmplt_f32(a8, b);
+#endif // 32 == 64
+#endif // 32 >= 32
+#endif // 32 >= 16
+#endif // 32 >= 8
+
+#if 32 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 32 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 32 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 32 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) { // scalar tail for the remaining len < vstep elements
+        const npyv_lanetype_f32 a = *src;
+        *dst = a < scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD_F32 && !((0 || 0) && 0)
+static void simd_binary_less_equal_f32(char **args, npy_intp len)
+{ // Element-wise src1[i] <= src2[i] over len elements; args = {src1, src2, dst}.
+    npyv_lanetype_f32 *src1 = (npyv_lanetype_f32 *) args[0];
+    npyv_lanetype_f32 *src2 = (npyv_lanetype_f32 *) args[1];
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2]; // one result byte per element
+    const npyv_u8 truemask    = npyv_setall_u8(0x1); // ANDed with each packed result before the store
+    const int vstep           = npyv_nlanes_u8; // elements consumed per vector iteration
+
+    // Unrolled vector loop: each iteration stores one u8 vector covering 'vstep' elements.
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 32 >= 8
+        npyv_f32  a1 = npyv_load_f32(src1 + npyv_nlanes_f32 * 0);
+        npyv_f32  b1 = npyv_load_f32(src2 + npyv_nlanes_f32 * 0);
+        npyv_b32 c1 = npyv_cmple_f32(a1, b1);
+#if 32 >= 16
+        npyv_f32  a2 = npyv_load_f32(src1 + npyv_nlanes_f32 * 1);
+        npyv_f32  b2 = npyv_load_f32(src2 + npyv_nlanes_f32 * 1);
+        npyv_b32 c2 = npyv_cmple_f32(a2, b2);
+#if 32 >= 32 // NOTE(review): four f32 vectors per operand per step; presumably 4 * npyv_nlanes_f32 == npyv_nlanes_u8 — confirm
+        npyv_f32  a3 = npyv_load_f32(src1 + npyv_nlanes_f32 * 2);
+        npyv_f32  b3 = npyv_load_f32(src2 + npyv_nlanes_f32 * 2);
+        npyv_f32  a4 = npyv_load_f32(src1 + npyv_nlanes_f32 * 3);
+        npyv_f32  b4 = npyv_load_f32(src2 + npyv_nlanes_f32 * 3);
+        npyv_b32 c3 = npyv_cmple_f32(a3, b3);
+        npyv_b32 c4 = npyv_cmple_f32(a4, b4);
+#if 32 == 64 // false for f32, so the a5..a8 / b5..b8 / c5..c8 code below is compiled out
+        npyv_f32  a5 = npyv_load_f32(src1 + npyv_nlanes_f32 * 4);
+        npyv_f32  b5 = npyv_load_f32(src2 + npyv_nlanes_f32 * 4);
+        npyv_f32  a6 = npyv_load_f32(src1 + npyv_nlanes_f32 * 5);
+        npyv_f32  b6 = npyv_load_f32(src2 + npyv_nlanes_f32 * 5);
+        npyv_f32  a7 = npyv_load_f32(src1 + npyv_nlanes_f32 * 6);
+        npyv_f32  b7 = npyv_load_f32(src2 + npyv_nlanes_f32 * 6);
+        npyv_f32  a8 = npyv_load_f32(src1 + npyv_nlanes_f32 * 7);
+        npyv_f32  b8 = npyv_load_f32(src2 + npyv_nlanes_f32 * 7);
+        npyv_b32 c5 = npyv_cmple_f32(a5, b5);
+        npyv_b32 c6 = npyv_cmple_f32(a6, b6);
+        npyv_b32 c7 = npyv_cmple_f32(a7, b7);
+        npyv_b32 c8 = npyv_cmple_f32(a8, b8);
+#endif // 32 == 64
+#endif // 32 >= 32
+#endif // 32 >= 16
+#endif // 32 >= 8
+
+        // Pack the 'c' vectors into a single vector 'r'
+#if 32 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 32 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 32 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 32 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst) { // scalar tail for the remaining len < vstep elements
+        const npyv_lanetype_f32 a = *src1;
+        const npyv_lanetype_f32 b = *src2;
+        *dst = a <= b;
+    }
+}
+
+static void simd_binary_scalar1_less_equal_f32(char **args, npy_intp len)
+{ // Element-wise scalar <= src[i] over len elements; args = {&scalar, src, dst}.
+    npyv_lanetype_f32 scalar = *(npyv_lanetype_f32 *) args[0];
+    npyv_lanetype_f32 *src   = (npyv_lanetype_f32 *) args[1];
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2]; // one result byte per element
+    const npyv_f32 a         = npyv_setall_f32(scalar); // scalar operand for every vector compare
+    const npyv_u8 truemask     = npyv_setall_u8(0x1); // ANDed with each packed result before the store
+    const int vstep            = npyv_nlanes_u8; // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 32 >= 8
+        npyv_f32  b1 = npyv_load_f32(src + npyv_nlanes_f32 * 0);
+        npyv_b32 c1 = npyv_cmple_f32(a, b1);
+#if 32 >= 16
+        npyv_f32  b2 = npyv_load_f32(src + npyv_nlanes_f32 * 1);
+        npyv_b32 c2 = npyv_cmple_f32(a, b2);
+#if 32 >= 32 // NOTE(review): four f32 vectors per step; presumably 4 * npyv_nlanes_f32 == npyv_nlanes_u8 — confirm
+        npyv_f32  b3 = npyv_load_f32(src + npyv_nlanes_f32 * 2);
+        npyv_f32  b4 = npyv_load_f32(src + npyv_nlanes_f32 * 3);
+        npyv_b32 c3 = npyv_cmple_f32(a, b3);
+        npyv_b32 c4 = npyv_cmple_f32(a, b4);
+#if 32 == 64 // false for f32, so the b5..b8 / c5..c8 code below is compiled out
+        npyv_f32  b5 = npyv_load_f32(src + npyv_nlanes_f32 * 4);
+        npyv_f32  b6 = npyv_load_f32(src + npyv_nlanes_f32 * 5);
+        npyv_f32  b7 = npyv_load_f32(src + npyv_nlanes_f32 * 6);
+        npyv_f32  b8 = npyv_load_f32(src + npyv_nlanes_f32 * 7);
+        npyv_b32 c5 = npyv_cmple_f32(a, b5);
+        npyv_b32 c6 = npyv_cmple_f32(a, b6);
+        npyv_b32 c7 = npyv_cmple_f32(a, b7);
+        npyv_b32 c8 = npyv_cmple_f32(a, b8);
+#endif // 32 == 64
+#endif // 32 >= 32
+#endif // 32 >= 16
+#endif // 32 >= 8
+
+#if 32 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 32 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 32 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 32 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) { // scalar tail for the remaining len < vstep elements
+        const npyv_lanetype_f32 b = *src;
+        *dst = scalar <= b;
+    }
+}
+
+static void simd_binary_scalar2_less_equal_f32(char **args, npy_intp len)
+{ // Element-wise src[i] <= scalar over len elements; args = {src, &scalar, dst}.
+    npyv_lanetype_f32 *src   = (npyv_lanetype_f32 *) args[0];
+    npyv_lanetype_f32 scalar = *(npyv_lanetype_f32 *) args[1];
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2]; // one result byte per element
+    const npyv_f32 b         = npyv_setall_f32(scalar); // scalar operand for every vector compare
+    const npyv_u8 truemask     = npyv_setall_u8(0x1); // ANDed with each packed result before the store
+    const int vstep            = npyv_nlanes_u8; // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 32 >= 8
+        npyv_f32  a1 = npyv_load_f32(src + npyv_nlanes_f32 * 0);
+        npyv_b32 c1 = npyv_cmple_f32(a1, b);
+#if 32 >= 16
+        npyv_f32  a2 = npyv_load_f32(src + npyv_nlanes_f32 * 1);
+        npyv_b32 c2 = npyv_cmple_f32(a2, b);
+#if 32 >= 32 // NOTE(review): four f32 vectors per step; presumably 4 * npyv_nlanes_f32 == npyv_nlanes_u8 — confirm
+        npyv_f32  a3 = npyv_load_f32(src + npyv_nlanes_f32 * 2);
+        npyv_f32  a4 = npyv_load_f32(src + npyv_nlanes_f32 * 3);
+        npyv_b32 c3 = npyv_cmple_f32(a3, b);
+        npyv_b32 c4 = npyv_cmple_f32(a4, b);
+#if 32 == 64 // false for f32, so the a5..a8 / c5..c8 code below is compiled out
+        npyv_f32  a5 = npyv_load_f32(src + npyv_nlanes_f32 * 4);
+        npyv_f32  a6 = npyv_load_f32(src + npyv_nlanes_f32 * 5);
+        npyv_f32  a7 = npyv_load_f32(src + npyv_nlanes_f32 * 6);
+        npyv_f32  a8 = npyv_load_f32(src + npyv_nlanes_f32 * 7);
+        npyv_b32 c5 = npyv_cmple_f32(a5, b);
+        npyv_b32 c6 = npyv_cmple_f32(a6, b);
+        npyv_b32 c7 = npyv_cmple_f32(a7, b);
+        npyv_b32 c8 = npyv_cmple_f32(a8, b);
+#endif // 32 == 64
+#endif // 32 >= 32
+#endif // 32 >= 16
+#endif // 32 >= 8
+
+#if 32 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 32 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 32 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 32 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) { // scalar tail for the remaining len < vstep elements
+        const npyv_lanetype_f32 a = *src;
+        *dst = a <= scalar;
+    }
+}
+#endif
+
+
+
+#line 28
+#line 35
+#if NPY_SIMD_F64 && !((1 || 0) && 0)
+static void simd_binary_equal_f64(char **args, npy_intp len)
+{ // Element-wise src1[i] == src2[i] over len elements; args = {src1, src2, dst}.
+    npyv_lanetype_f64 *src1 = (npyv_lanetype_f64 *) args[0];
+    npyv_lanetype_f64 *src2 = (npyv_lanetype_f64 *) args[1];
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2]; // one result byte per element
+    const npyv_u8 truemask    = npyv_setall_u8(0x1); // ANDed with each packed result before the store
+    const int vstep           = npyv_nlanes_u8; // elements consumed per vector iteration
+
+    // Unrolled vector loop: each iteration stores one u8 vector covering 'vstep' elements.
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 64 >= 8
+        npyv_f64  a1 = npyv_load_f64(src1 + npyv_nlanes_f64 * 0);
+        npyv_f64  b1 = npyv_load_f64(src2 + npyv_nlanes_f64 * 0);
+        npyv_b64 c1 = npyv_cmpeq_f64(a1, b1);
+#if 64 >= 16
+        npyv_f64  a2 = npyv_load_f64(src1 + npyv_nlanes_f64 * 1);
+        npyv_f64  b2 = npyv_load_f64(src2 + npyv_nlanes_f64 * 1);
+        npyv_b64 c2 = npyv_cmpeq_f64(a2, b2);
+#if 64 >= 32
+        npyv_f64  a3 = npyv_load_f64(src1 + npyv_nlanes_f64 * 2);
+        npyv_f64  b3 = npyv_load_f64(src2 + npyv_nlanes_f64 * 2);
+        npyv_f64  a4 = npyv_load_f64(src1 + npyv_nlanes_f64 * 3);
+        npyv_f64  b4 = npyv_load_f64(src2 + npyv_nlanes_f64 * 3);
+        npyv_b64 c3 = npyv_cmpeq_f64(a3, b3);
+        npyv_b64 c4 = npyv_cmpeq_f64(a4, b4);
+#if 64 == 64 // NOTE(review): eight f64 vectors per operand per step; presumably 8 * npyv_nlanes_f64 == npyv_nlanes_u8 — confirm
+        npyv_f64  a5 = npyv_load_f64(src1 + npyv_nlanes_f64 * 4);
+        npyv_f64  b5 = npyv_load_f64(src2 + npyv_nlanes_f64 * 4);
+        npyv_f64  a6 = npyv_load_f64(src1 + npyv_nlanes_f64 * 5);
+        npyv_f64  b6 = npyv_load_f64(src2 + npyv_nlanes_f64 * 5);
+        npyv_f64  a7 = npyv_load_f64(src1 + npyv_nlanes_f64 * 6);
+        npyv_f64  b7 = npyv_load_f64(src2 + npyv_nlanes_f64 * 6);
+        npyv_f64  a8 = npyv_load_f64(src1 + npyv_nlanes_f64 * 7);
+        npyv_f64  b8 = npyv_load_f64(src2 + npyv_nlanes_f64 * 7);
+        npyv_b64 c5 = npyv_cmpeq_f64(a5, b5);
+        npyv_b64 c6 = npyv_cmpeq_f64(a6, b6);
+        npyv_b64 c7 = npyv_cmpeq_f64(a7, b7);
+        npyv_b64 c8 = npyv_cmpeq_f64(a8, b8);
+#endif // 64 == 64
+#endif // 64 >= 32
+#endif // 64 >= 16
+#endif // 64 >= 8
+
+        // Pack the 'c' vectors into a single vector 'r'
+#if 64 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 64 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 64 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 64 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst) { // scalar tail for the remaining len < vstep elements
+        const npyv_lanetype_f64 a = *src1;
+        const npyv_lanetype_f64 b = *src2;
+        *dst = a == b;
+    }
+}
+
+static void simd_binary_scalar1_equal_f64(char **args, npy_intp len)
+{ // Element-wise scalar == src[i] over len elements; args = {&scalar, src, dst}.
+    npyv_lanetype_f64 scalar = *(npyv_lanetype_f64 *) args[0];
+    npyv_lanetype_f64 *src   = (npyv_lanetype_f64 *) args[1];
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2]; // one result byte per element
+    const npyv_f64 a         = npyv_setall_f64(scalar); // scalar operand for every vector compare
+    const npyv_u8 truemask     = npyv_setall_u8(0x1); // ANDed with each packed result before the store
+    const int vstep            = npyv_nlanes_u8; // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 64 >= 8
+        npyv_f64  b1 = npyv_load_f64(src + npyv_nlanes_f64 * 0);
+        npyv_b64 c1 = npyv_cmpeq_f64(a, b1);
+#if 64 >= 16
+        npyv_f64  b2 = npyv_load_f64(src + npyv_nlanes_f64 * 1);
+        npyv_b64 c2 = npyv_cmpeq_f64(a, b2);
+#if 64 >= 32
+        npyv_f64  b3 = npyv_load_f64(src + npyv_nlanes_f64 * 2);
+        npyv_f64  b4 = npyv_load_f64(src + npyv_nlanes_f64 * 3);
+        npyv_b64 c3 = npyv_cmpeq_f64(a, b3);
+        npyv_b64 c4 = npyv_cmpeq_f64(a, b4);
+#if 64 == 64 // NOTE(review): eight f64 vectors per step; presumably 8 * npyv_nlanes_f64 == npyv_nlanes_u8 — confirm
+        npyv_f64  b5 = npyv_load_f64(src + npyv_nlanes_f64 * 4);
+        npyv_f64  b6 = npyv_load_f64(src + npyv_nlanes_f64 * 5);
+        npyv_f64  b7 = npyv_load_f64(src + npyv_nlanes_f64 * 6);
+        npyv_f64  b8 = npyv_load_f64(src + npyv_nlanes_f64 * 7);
+        npyv_b64 c5 = npyv_cmpeq_f64(a, b5);
+        npyv_b64 c6 = npyv_cmpeq_f64(a, b6);
+        npyv_b64 c7 = npyv_cmpeq_f64(a, b7);
+        npyv_b64 c8 = npyv_cmpeq_f64(a, b8);
+#endif // 64 == 64
+#endif // 64 >= 32
+#endif // 64 >= 16
+#endif // 64 >= 8
+
+#if 64 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 64 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 64 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 64 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) { // scalar tail for the remaining len < vstep elements
+        const npyv_lanetype_f64 b = *src;
+        *dst = scalar == b;
+    }
+}
+
+static void simd_binary_scalar2_equal_f64(char **args, npy_intp len)
+{ // Element-wise src[i] == scalar over len elements; args = {src, &scalar, dst}.
+    npyv_lanetype_f64 *src   = (npyv_lanetype_f64 *) args[0];
+    npyv_lanetype_f64 scalar = *(npyv_lanetype_f64 *) args[1];
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2]; // one result byte per element
+    const npyv_f64 b         = npyv_setall_f64(scalar); // scalar operand for every vector compare
+    const npyv_u8 truemask     = npyv_setall_u8(0x1); // ANDed with each packed result before the store
+    const int vstep            = npyv_nlanes_u8; // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 64 >= 8
+        npyv_f64  a1 = npyv_load_f64(src + npyv_nlanes_f64 * 0);
+        npyv_b64 c1 = npyv_cmpeq_f64(a1, b);
+#if 64 >= 16
+        npyv_f64  a2 = npyv_load_f64(src + npyv_nlanes_f64 * 1);
+        npyv_b64 c2 = npyv_cmpeq_f64(a2, b);
+#if 64 >= 32
+        npyv_f64  a3 = npyv_load_f64(src + npyv_nlanes_f64 * 2);
+        npyv_f64  a4 = npyv_load_f64(src + npyv_nlanes_f64 * 3);
+        npyv_b64 c3 = npyv_cmpeq_f64(a3, b);
+        npyv_b64 c4 = npyv_cmpeq_f64(a4, b);
+#if 64 == 64 // NOTE(review): eight f64 vectors per step; presumably 8 * npyv_nlanes_f64 == npyv_nlanes_u8 — confirm
+        npyv_f64  a5 = npyv_load_f64(src + npyv_nlanes_f64 * 4);
+        npyv_f64  a6 = npyv_load_f64(src + npyv_nlanes_f64 * 5);
+        npyv_f64  a7 = npyv_load_f64(src + npyv_nlanes_f64 * 6);
+        npyv_f64  a8 = npyv_load_f64(src + npyv_nlanes_f64 * 7);
+        npyv_b64 c5 = npyv_cmpeq_f64(a5, b);
+        npyv_b64 c6 = npyv_cmpeq_f64(a6, b);
+        npyv_b64 c7 = npyv_cmpeq_f64(a7, b);
+        npyv_b64 c8 = npyv_cmpeq_f64(a8, b);
+#endif // 64 == 64
+#endif // 64 >= 32
+#endif // 64 >= 16
+#endif // 64 >= 8
+
+#if 64 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 64 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 64 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 64 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) { // scalar tail for the remaining len < vstep elements
+        const npyv_lanetype_f64 a = *src;
+        *dst = a == scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD_F64 && !((0 || 1) && 0)
+static void simd_binary_not_equal_f64(char **args, npy_intp len)
+{ // Element-wise src1[i] != src2[i] over len elements; args = {src1, src2, dst}.
+    npyv_lanetype_f64 *src1 = (npyv_lanetype_f64 *) args[0];
+    npyv_lanetype_f64 *src2 = (npyv_lanetype_f64 *) args[1];
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2]; // one result byte per element
+    const npyv_u8 truemask    = npyv_setall_u8(0x1); // ANDed with each packed result before the store
+    const int vstep           = npyv_nlanes_u8; // elements consumed per vector iteration
+
+    // Unrolled vector loop: each iteration stores one u8 vector covering 'vstep' elements.
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 64 >= 8
+        npyv_f64  a1 = npyv_load_f64(src1 + npyv_nlanes_f64 * 0);
+        npyv_f64  b1 = npyv_load_f64(src2 + npyv_nlanes_f64 * 0);
+        npyv_b64 c1 = npyv_cmpneq_f64(a1, b1);
+#if 64 >= 16
+        npyv_f64  a2 = npyv_load_f64(src1 + npyv_nlanes_f64 * 1);
+        npyv_f64  b2 = npyv_load_f64(src2 + npyv_nlanes_f64 * 1);
+        npyv_b64 c2 = npyv_cmpneq_f64(a2, b2);
+#if 64 >= 32
+        npyv_f64  a3 = npyv_load_f64(src1 + npyv_nlanes_f64 * 2);
+        npyv_f64  b3 = npyv_load_f64(src2 + npyv_nlanes_f64 * 2);
+        npyv_f64  a4 = npyv_load_f64(src1 + npyv_nlanes_f64 * 3);
+        npyv_f64  b4 = npyv_load_f64(src2 + npyv_nlanes_f64 * 3);
+        npyv_b64 c3 = npyv_cmpneq_f64(a3, b3);
+        npyv_b64 c4 = npyv_cmpneq_f64(a4, b4);
+#if 64 == 64 // NOTE(review): eight f64 vectors per operand per step; presumably 8 * npyv_nlanes_f64 == npyv_nlanes_u8 — confirm
+        npyv_f64  a5 = npyv_load_f64(src1 + npyv_nlanes_f64 * 4);
+        npyv_f64  b5 = npyv_load_f64(src2 + npyv_nlanes_f64 * 4);
+        npyv_f64  a6 = npyv_load_f64(src1 + npyv_nlanes_f64 * 5);
+        npyv_f64  b6 = npyv_load_f64(src2 + npyv_nlanes_f64 * 5);
+        npyv_f64  a7 = npyv_load_f64(src1 + npyv_nlanes_f64 * 6);
+        npyv_f64  b7 = npyv_load_f64(src2 + npyv_nlanes_f64 * 6);
+        npyv_f64  a8 = npyv_load_f64(src1 + npyv_nlanes_f64 * 7);
+        npyv_f64  b8 = npyv_load_f64(src2 + npyv_nlanes_f64 * 7);
+        npyv_b64 c5 = npyv_cmpneq_f64(a5, b5);
+        npyv_b64 c6 = npyv_cmpneq_f64(a6, b6);
+        npyv_b64 c7 = npyv_cmpneq_f64(a7, b7);
+        npyv_b64 c8 = npyv_cmpneq_f64(a8, b8);
+#endif // 64 == 64
+#endif // 64 >= 32
+#endif // 64 >= 16
+#endif // 64 >= 8
+
+        // Pack the 'c' vectors into a single vector 'r'
+#if 64 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 64 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 64 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 64 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst) { // scalar tail for the remaining len < vstep elements
+        const npyv_lanetype_f64 a = *src1;
+        const npyv_lanetype_f64 b = *src2;
+        *dst = a != b;
+    }
+}
+
+static void simd_binary_scalar1_not_equal_f64(char **args, npy_intp len)
+{ // Element-wise scalar != src[i] over len elements; args = {&scalar, src, dst}.
+    npyv_lanetype_f64 scalar = *(npyv_lanetype_f64 *) args[0];
+    npyv_lanetype_f64 *src   = (npyv_lanetype_f64 *) args[1];
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2]; // one result byte per element
+    const npyv_f64 a         = npyv_setall_f64(scalar); // scalar operand for every vector compare
+    const npyv_u8 truemask     = npyv_setall_u8(0x1); // ANDed with each packed result before the store
+    const int vstep            = npyv_nlanes_u8; // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 64 >= 8
+        npyv_f64  b1 = npyv_load_f64(src + npyv_nlanes_f64 * 0);
+        npyv_b64 c1 = npyv_cmpneq_f64(a, b1);
+#if 64 >= 16
+        npyv_f64  b2 = npyv_load_f64(src + npyv_nlanes_f64 * 1);
+        npyv_b64 c2 = npyv_cmpneq_f64(a, b2);
+#if 64 >= 32
+        npyv_f64  b3 = npyv_load_f64(src + npyv_nlanes_f64 * 2);
+        npyv_f64  b4 = npyv_load_f64(src + npyv_nlanes_f64 * 3);
+        npyv_b64 c3 = npyv_cmpneq_f64(a, b3);
+        npyv_b64 c4 = npyv_cmpneq_f64(a, b4);
+#if 64 == 64 // NOTE(review): eight f64 vectors per step; presumably 8 * npyv_nlanes_f64 == npyv_nlanes_u8 — confirm
+        npyv_f64  b5 = npyv_load_f64(src + npyv_nlanes_f64 * 4);
+        npyv_f64  b6 = npyv_load_f64(src + npyv_nlanes_f64 * 5);
+        npyv_f64  b7 = npyv_load_f64(src + npyv_nlanes_f64 * 6);
+        npyv_f64  b8 = npyv_load_f64(src + npyv_nlanes_f64 * 7);
+        npyv_b64 c5 = npyv_cmpneq_f64(a, b5);
+        npyv_b64 c6 = npyv_cmpneq_f64(a, b6);
+        npyv_b64 c7 = npyv_cmpneq_f64(a, b7);
+        npyv_b64 c8 = npyv_cmpneq_f64(a, b8);
+#endif // 64 == 64
+#endif // 64 >= 32
+#endif // 64 >= 16
+#endif // 64 >= 8
+
+#if 64 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 64 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 64 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 64 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) { // scalar tail for the remaining len < vstep elements
+        const npyv_lanetype_f64 b = *src;
+        *dst = scalar != b;
+    }
+}
+
+static void simd_binary_scalar2_not_equal_f64(char **args, npy_intp len)
+{ // Element-wise src[i] != scalar over len elements; args = {src, &scalar, dst}.
+    npyv_lanetype_f64 *src   = (npyv_lanetype_f64 *) args[0];
+    npyv_lanetype_f64 scalar = *(npyv_lanetype_f64 *) args[1];
+    npyv_lanetype_u8 *dst      = (npyv_lanetype_u8 *) args[2]; // one result byte per element
+    const npyv_f64 b         = npyv_setall_f64(scalar); // scalar operand for every vector compare
+    const npyv_u8 truemask     = npyv_setall_u8(0x1); // ANDed with each packed result before the store
+    const int vstep            = npyv_nlanes_u8; // elements consumed per vector iteration
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+#if 64 >= 8
+        npyv_f64  a1 = npyv_load_f64(src + npyv_nlanes_f64 * 0);
+        npyv_b64 c1 = npyv_cmpneq_f64(a1, b);
+#if 64 >= 16
+        npyv_f64  a2 = npyv_load_f64(src + npyv_nlanes_f64 * 1);
+        npyv_b64 c2 = npyv_cmpneq_f64(a2, b);
+#if 64 >= 32
+        npyv_f64  a3 = npyv_load_f64(src + npyv_nlanes_f64 * 2);
+        npyv_f64  a4 = npyv_load_f64(src + npyv_nlanes_f64 * 3);
+        npyv_b64 c3 = npyv_cmpneq_f64(a3, b);
+        npyv_b64 c4 = npyv_cmpneq_f64(a4, b);
+#if 64 == 64 // NOTE(review): eight f64 vectors per step; presumably 8 * npyv_nlanes_f64 == npyv_nlanes_u8 — confirm
+        npyv_f64  a5 = npyv_load_f64(src + npyv_nlanes_f64 * 4);
+        npyv_f64  a6 = npyv_load_f64(src + npyv_nlanes_f64 * 5);
+        npyv_f64  a7 = npyv_load_f64(src + npyv_nlanes_f64 * 6);
+        npyv_f64  a8 = npyv_load_f64(src + npyv_nlanes_f64 * 7);
+        npyv_b64 c5 = npyv_cmpneq_f64(a5, b);
+        npyv_b64 c6 = npyv_cmpneq_f64(a6, b);
+        npyv_b64 c7 = npyv_cmpneq_f64(a7, b);
+        npyv_b64 c8 = npyv_cmpneq_f64(a8, b);
+#endif // 64 == 64
+#endif // 64 >= 32
+#endif // 64 >= 16
+#endif // 64 >= 8
+
+#if 64 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 64 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 64 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 64 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src, ++dst) { // scalar tail for the remaining len < vstep elements
+        const npyv_lanetype_f64 a = *src;
+        *dst = a != scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD_F64 && !((0 || 0) && 0)
+static void simd_binary_less_f64(char **args, npy_intp len)
+{ // Element-wise src1[i] < src2[i] over len elements; args = {src1, src2, dst}.
+    npyv_lanetype_f64 *src1 = (npyv_lanetype_f64 *) args[0];
+    npyv_lanetype_f64 *src2 = (npyv_lanetype_f64 *) args[1];
+    npyv_lanetype_u8 *dst     = (npyv_lanetype_u8 *) args[2]; // one result byte per element
+    const npyv_u8 truemask    = npyv_setall_u8(0x1); // ANDed with each packed result before the store
+    const int vstep           = npyv_nlanes_u8; // elements consumed per vector iteration
+
+    // Unrolled vector loop: each iteration stores one u8 vector covering 'vstep' elements.
+    for (; len >= vstep;
+         len -= vstep, src1 += vstep, src2 += vstep, dst += vstep) {
+#if 64 >= 8
+        npyv_f64  a1 = npyv_load_f64(src1 + npyv_nlanes_f64 * 0);
+        npyv_f64  b1 = npyv_load_f64(src2 + npyv_nlanes_f64 * 0);
+        npyv_b64 c1 = npyv_cmplt_f64(a1, b1);
+#if 64 >= 16
+        npyv_f64  a2 = npyv_load_f64(src1 + npyv_nlanes_f64 * 1);
+        npyv_f64  b2 = npyv_load_f64(src2 + npyv_nlanes_f64 * 1);
+        npyv_b64 c2 = npyv_cmplt_f64(a2, b2);
+#if 64 >= 32
+        npyv_f64  a3 = npyv_load_f64(src1 + npyv_nlanes_f64 * 2);
+        npyv_f64  b3 = npyv_load_f64(src2 + npyv_nlanes_f64 * 2);
+        npyv_f64  a4 = npyv_load_f64(src1 + npyv_nlanes_f64 * 3);
+        npyv_f64  b4 = npyv_load_f64(src2 + npyv_nlanes_f64 * 3);
+        npyv_b64 c3 = npyv_cmplt_f64(a3, b3);
+        npyv_b64 c4 = npyv_cmplt_f64(a4, b4);
+#if 64 == 64 // NOTE(review): eight f64 vectors per operand per step; presumably 8 * npyv_nlanes_f64 == npyv_nlanes_u8 — confirm
+        npyv_f64  a5 = npyv_load_f64(src1 + npyv_nlanes_f64 * 4);
+        npyv_f64  b5 = npyv_load_f64(src2 + npyv_nlanes_f64 * 4);
+        npyv_f64  a6 = npyv_load_f64(src1 + npyv_nlanes_f64 * 5);
+        npyv_f64  b6 = npyv_load_f64(src2 + npyv_nlanes_f64 * 5);
+        npyv_f64  a7 = npyv_load_f64(src1 + npyv_nlanes_f64 * 6);
+        npyv_f64  b7 = npyv_load_f64(src2 + npyv_nlanes_f64 * 6);
+        npyv_f64  a8 = npyv_load_f64(src1 + npyv_nlanes_f64 * 7);
+        npyv_f64  b8 = npyv_load_f64(src2 + npyv_nlanes_f64 * 7);
+        npyv_b64 c5 = npyv_cmplt_f64(a5, b5);
+        npyv_b64 c6 = npyv_cmplt_f64(a6, b6);
+        npyv_b64 c7 = npyv_cmplt_f64(a7, b7);
+        npyv_b64 c8 = npyv_cmplt_f64(a8, b8);
+#endif // 64 == 64
+#endif // 64 >= 32
+#endif // 64 >= 16
+#endif // 64 >= 8
+
+        // Pack the 'c' vectors into a single vector 'r'
+#if 64 == 8
+        npyv_u8 r = npyv_cvt_u8_b8(c1);
+#elif 64 == 16
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b16(c1, c2));
+#elif 64 == 32
+        npyv_u8 r = npyv_cvt_u8_b8(npyv_pack_b8_b32(c1, c2, c3, c4));
+#elif 64 == 64
+        npyv_u8 r =
+            npyv_cvt_u8_b8(npyv_pack_b8_b64(c1, c2, c3, c4, c5, c6, c7, c8));
+#endif
+        npyv_store_u8(dst, npyv_and_u8(r, truemask));
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst) { // scalar tail for the remaining len < vstep elements
+        const npyv_lanetype_f64 a = *src1;
+        const npyv_lanetype_f64 b = *src2;
+        *dst = a < b;
+    }
+}
+
+/*
+ * Element-wise `scalar < src[i]` for a contiguous array of doubles.
+ *
+ * args[0] points to the scalar operand, args[1] to the input array and
+ * args[2] to the npyv_lanetype_u8 output, which receives one 0/1 byte per
+ * element; `len` is the number of elements to process.
+ */
+static void simd_binary_scalar1_less_f64(char **args, npy_intp len)
+{
+    npyv_lanetype_f64 scalar = *(npyv_lanetype_f64 *) args[0];
+    npyv_lanetype_f64 *src   = (npyv_lanetype_f64 *) args[1];
+    npyv_lanetype_u8  *dst   = (npyv_lanetype_u8 *) args[2];
+    const int vstep          = npyv_nlanes_u8;
+    const npyv_u8 truemask   = npyv_setall_u8(0x1);
+    // The scalar operand broadcast to every lane.
+    const npyv_f64 vscalar   = npyv_setall_f64(scalar);
+
+    // Every pass consumes 'vstep' elements: eight f64 comparisons are packed
+    // into a single vector of byte-sized results.
+    while (len >= vstep) {
+        npyv_f64 y1 = npyv_load_f64(src + npyv_nlanes_f64 * 0);
+        npyv_b64 m1 = npyv_cmplt_f64(vscalar, y1);
+        npyv_f64 y2 = npyv_load_f64(src + npyv_nlanes_f64 * 1);
+        npyv_b64 m2 = npyv_cmplt_f64(vscalar, y2);
+        npyv_f64 y3 = npyv_load_f64(src + npyv_nlanes_f64 * 2);
+        npyv_b64 m3 = npyv_cmplt_f64(vscalar, y3);
+        npyv_f64 y4 = npyv_load_f64(src + npyv_nlanes_f64 * 3);
+        npyv_b64 m4 = npyv_cmplt_f64(vscalar, y4);
+        npyv_f64 y5 = npyv_load_f64(src + npyv_nlanes_f64 * 4);
+        npyv_b64 m5 = npyv_cmplt_f64(vscalar, y5);
+        npyv_f64 y6 = npyv_load_f64(src + npyv_nlanes_f64 * 5);
+        npyv_b64 m6 = npyv_cmplt_f64(vscalar, y6);
+        npyv_f64 y7 = npyv_load_f64(src + npyv_nlanes_f64 * 6);
+        npyv_b64 m7 = npyv_cmplt_f64(vscalar, y7);
+        npyv_f64 y8 = npyv_load_f64(src + npyv_nlanes_f64 * 7);
+        npyv_b64 m8 = npyv_cmplt_f64(vscalar, y8);
+
+        // Narrow the eight 64-bit masks into one 8-bit mask vector.
+        npyv_u8 packed = npyv_cvt_u8_b8(
+            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(packed, truemask));
+
+        len -= vstep;
+        src += vstep;
+        dst += vstep;
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (npy_intp i = 0; i < len; ++i) {
+        dst[i] = scalar < src[i];
+    }
+}
+
+/*
+ * Element-wise `src[i] < scalar` for a contiguous array of doubles.
+ *
+ * args[0] points to the input array, args[1] to the scalar operand and
+ * args[2] to the npyv_lanetype_u8 output, which receives one 0/1 byte per
+ * element; `len` is the number of elements to process.
+ */
+static void simd_binary_scalar2_less_f64(char **args, npy_intp len)
+{
+    npyv_lanetype_f64 *src   = (npyv_lanetype_f64 *) args[0];
+    npyv_lanetype_f64 scalar = *(npyv_lanetype_f64 *) args[1];
+    npyv_lanetype_u8  *dst   = (npyv_lanetype_u8 *) args[2];
+    const int vstep          = npyv_nlanes_u8;
+    const npyv_u8 truemask   = npyv_setall_u8(0x1);
+    // The scalar operand broadcast to every lane.
+    const npyv_f64 vscalar   = npyv_setall_f64(scalar);
+
+    // Every pass consumes 'vstep' elements: eight f64 comparisons are packed
+    // into a single vector of byte-sized results.
+    while (len >= vstep) {
+        npyv_f64 x1 = npyv_load_f64(src + npyv_nlanes_f64 * 0);
+        npyv_b64 m1 = npyv_cmplt_f64(x1, vscalar);
+        npyv_f64 x2 = npyv_load_f64(src + npyv_nlanes_f64 * 1);
+        npyv_b64 m2 = npyv_cmplt_f64(x2, vscalar);
+        npyv_f64 x3 = npyv_load_f64(src + npyv_nlanes_f64 * 2);
+        npyv_b64 m3 = npyv_cmplt_f64(x3, vscalar);
+        npyv_f64 x4 = npyv_load_f64(src + npyv_nlanes_f64 * 3);
+        npyv_b64 m4 = npyv_cmplt_f64(x4, vscalar);
+        npyv_f64 x5 = npyv_load_f64(src + npyv_nlanes_f64 * 4);
+        npyv_b64 m5 = npyv_cmplt_f64(x5, vscalar);
+        npyv_f64 x6 = npyv_load_f64(src + npyv_nlanes_f64 * 5);
+        npyv_b64 m6 = npyv_cmplt_f64(x6, vscalar);
+        npyv_f64 x7 = npyv_load_f64(src + npyv_nlanes_f64 * 6);
+        npyv_b64 m7 = npyv_cmplt_f64(x7, vscalar);
+        npyv_f64 x8 = npyv_load_f64(src + npyv_nlanes_f64 * 7);
+        npyv_b64 m8 = npyv_cmplt_f64(x8, vscalar);
+
+        // Narrow the eight 64-bit masks into one 8-bit mask vector.
+        npyv_u8 packed = npyv_cvt_u8_b8(
+            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(packed, truemask));
+
+        len -= vstep;
+        src += vstep;
+        dst += vstep;
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (npy_intp i = 0; i < len; ++i) {
+        dst[i] = src[i] < scalar;
+    }
+}
+#endif
+
+
+#line 35
+#if NPY_SIMD_F64 && !((0 || 0) && 0)
+/*
+ * Element-wise `src1 <= src2` for two contiguous arrays of doubles.
+ *
+ * args[0] and args[1] are read as npyv_lanetype_f64 inputs, args[2] is
+ * written as npyv_lanetype_u8 with one 0/1 byte per element, and `len` is
+ * the number of elements to process.
+ */
+static void simd_binary_less_equal_f64(char **args, npy_intp len)
+{
+    npyv_lanetype_f64 *src1 = (npyv_lanetype_f64 *) args[0];
+    npyv_lanetype_f64 *src2 = (npyv_lanetype_f64 *) args[1];
+    npyv_lanetype_u8  *dst  = (npyv_lanetype_u8 *) args[2];
+    const int vstep         = npyv_nlanes_u8;
+    const npyv_u8 truemask  = npyv_setall_u8(0x1);
+
+    // Every pass consumes 'vstep' elements: eight f64 comparisons are packed
+    // into a single vector of byte-sized results.
+    while (len >= vstep) {
+        npyv_f64 x1 = npyv_load_f64(src1 + npyv_nlanes_f64 * 0);
+        npyv_f64 y1 = npyv_load_f64(src2 + npyv_nlanes_f64 * 0);
+        npyv_b64 m1 = npyv_cmple_f64(x1, y1);
+        npyv_f64 x2 = npyv_load_f64(src1 + npyv_nlanes_f64 * 1);
+        npyv_f64 y2 = npyv_load_f64(src2 + npyv_nlanes_f64 * 1);
+        npyv_b64 m2 = npyv_cmple_f64(x2, y2);
+        npyv_f64 x3 = npyv_load_f64(src1 + npyv_nlanes_f64 * 2);
+        npyv_f64 y3 = npyv_load_f64(src2 + npyv_nlanes_f64 * 2);
+        npyv_b64 m3 = npyv_cmple_f64(x3, y3);
+        npyv_f64 x4 = npyv_load_f64(src1 + npyv_nlanes_f64 * 3);
+        npyv_f64 y4 = npyv_load_f64(src2 + npyv_nlanes_f64 * 3);
+        npyv_b64 m4 = npyv_cmple_f64(x4, y4);
+        npyv_f64 x5 = npyv_load_f64(src1 + npyv_nlanes_f64 * 4);
+        npyv_f64 y5 = npyv_load_f64(src2 + npyv_nlanes_f64 * 4);
+        npyv_b64 m5 = npyv_cmple_f64(x5, y5);
+        npyv_f64 x6 = npyv_load_f64(src1 + npyv_nlanes_f64 * 5);
+        npyv_f64 y6 = npyv_load_f64(src2 + npyv_nlanes_f64 * 5);
+        npyv_b64 m6 = npyv_cmple_f64(x6, y6);
+        npyv_f64 x7 = npyv_load_f64(src1 + npyv_nlanes_f64 * 6);
+        npyv_f64 y7 = npyv_load_f64(src2 + npyv_nlanes_f64 * 6);
+        npyv_b64 m7 = npyv_cmple_f64(x7, y7);
+        npyv_f64 x8 = npyv_load_f64(src1 + npyv_nlanes_f64 * 7);
+        npyv_f64 y8 = npyv_load_f64(src2 + npyv_nlanes_f64 * 7);
+        npyv_b64 m8 = npyv_cmple_f64(x8, y8);
+
+        // Narrow the eight 64-bit masks into one 8-bit mask vector.
+        npyv_u8 packed = npyv_cvt_u8_b8(
+            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(packed, truemask));
+
+        len  -= vstep;
+        src1 += vstep;
+        src2 += vstep;
+        dst  += vstep;
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (npy_intp i = 0; i < len; ++i) {
+        dst[i] = src1[i] <= src2[i];
+    }
+}
+
+/*
+ * Element-wise `scalar <= src[i]` for a contiguous array of doubles.
+ *
+ * args[0] points to the scalar operand, args[1] to the input array and
+ * args[2] to the npyv_lanetype_u8 output, which receives one 0/1 byte per
+ * element; `len` is the number of elements to process.
+ */
+static void simd_binary_scalar1_less_equal_f64(char **args, npy_intp len)
+{
+    npyv_lanetype_f64 scalar = *(npyv_lanetype_f64 *) args[0];
+    npyv_lanetype_f64 *src   = (npyv_lanetype_f64 *) args[1];
+    npyv_lanetype_u8  *dst   = (npyv_lanetype_u8 *) args[2];
+    const int vstep          = npyv_nlanes_u8;
+    const npyv_u8 truemask   = npyv_setall_u8(0x1);
+    // The scalar operand broadcast to every lane.
+    const npyv_f64 vscalar   = npyv_setall_f64(scalar);
+
+    // Every pass consumes 'vstep' elements: eight f64 comparisons are packed
+    // into a single vector of byte-sized results.
+    while (len >= vstep) {
+        npyv_f64 y1 = npyv_load_f64(src + npyv_nlanes_f64 * 0);
+        npyv_b64 m1 = npyv_cmple_f64(vscalar, y1);
+        npyv_f64 y2 = npyv_load_f64(src + npyv_nlanes_f64 * 1);
+        npyv_b64 m2 = npyv_cmple_f64(vscalar, y2);
+        npyv_f64 y3 = npyv_load_f64(src + npyv_nlanes_f64 * 2);
+        npyv_b64 m3 = npyv_cmple_f64(vscalar, y3);
+        npyv_f64 y4 = npyv_load_f64(src + npyv_nlanes_f64 * 3);
+        npyv_b64 m4 = npyv_cmple_f64(vscalar, y4);
+        npyv_f64 y5 = npyv_load_f64(src + npyv_nlanes_f64 * 4);
+        npyv_b64 m5 = npyv_cmple_f64(vscalar, y5);
+        npyv_f64 y6 = npyv_load_f64(src + npyv_nlanes_f64 * 5);
+        npyv_b64 m6 = npyv_cmple_f64(vscalar, y6);
+        npyv_f64 y7 = npyv_load_f64(src + npyv_nlanes_f64 * 6);
+        npyv_b64 m7 = npyv_cmple_f64(vscalar, y7);
+        npyv_f64 y8 = npyv_load_f64(src + npyv_nlanes_f64 * 7);
+        npyv_b64 m8 = npyv_cmple_f64(vscalar, y8);
+
+        // Narrow the eight 64-bit masks into one 8-bit mask vector.
+        npyv_u8 packed = npyv_cvt_u8_b8(
+            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(packed, truemask));
+
+        len -= vstep;
+        src += vstep;
+        dst += vstep;
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (npy_intp i = 0; i < len; ++i) {
+        dst[i] = scalar <= src[i];
+    }
+}
+
+/*
+ * Element-wise `src[i] <= scalar` for a contiguous array of doubles.
+ *
+ * args[0] points to the input array, args[1] to the scalar operand and
+ * args[2] to the npyv_lanetype_u8 output, which receives one 0/1 byte per
+ * element; `len` is the number of elements to process.
+ */
+static void simd_binary_scalar2_less_equal_f64(char **args, npy_intp len)
+{
+    npyv_lanetype_f64 *src   = (npyv_lanetype_f64 *) args[0];
+    npyv_lanetype_f64 scalar = *(npyv_lanetype_f64 *) args[1];
+    npyv_lanetype_u8  *dst   = (npyv_lanetype_u8 *) args[2];
+    const int vstep          = npyv_nlanes_u8;
+    const npyv_u8 truemask   = npyv_setall_u8(0x1);
+    // The scalar operand broadcast to every lane.
+    const npyv_f64 vscalar   = npyv_setall_f64(scalar);
+
+    // Every pass consumes 'vstep' elements: eight f64 comparisons are packed
+    // into a single vector of byte-sized results.
+    while (len >= vstep) {
+        npyv_f64 x1 = npyv_load_f64(src + npyv_nlanes_f64 * 0);
+        npyv_b64 m1 = npyv_cmple_f64(x1, vscalar);
+        npyv_f64 x2 = npyv_load_f64(src + npyv_nlanes_f64 * 1);
+        npyv_b64 m2 = npyv_cmple_f64(x2, vscalar);
+        npyv_f64 x3 = npyv_load_f64(src + npyv_nlanes_f64 * 2);
+        npyv_b64 m3 = npyv_cmple_f64(x3, vscalar);
+        npyv_f64 x4 = npyv_load_f64(src + npyv_nlanes_f64 * 3);
+        npyv_b64 m4 = npyv_cmple_f64(x4, vscalar);
+        npyv_f64 x5 = npyv_load_f64(src + npyv_nlanes_f64 * 4);
+        npyv_b64 m5 = npyv_cmple_f64(x5, vscalar);
+        npyv_f64 x6 = npyv_load_f64(src + npyv_nlanes_f64 * 5);
+        npyv_b64 m6 = npyv_cmple_f64(x6, vscalar);
+        npyv_f64 x7 = npyv_load_f64(src + npyv_nlanes_f64 * 6);
+        npyv_b64 m7 = npyv_cmple_f64(x7, vscalar);
+        npyv_f64 x8 = npyv_load_f64(src + npyv_nlanes_f64 * 7);
+        npyv_b64 m8 = npyv_cmple_f64(x8, vscalar);
+
+        // Narrow the eight 64-bit masks into one 8-bit mask vector.
+        npyv_u8 packed = npyv_cvt_u8_b8(
+            npyv_pack_b8_b64(m1, m2, m3, m4, m5, m6, m7, m8));
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(packed, truemask));
+
+        len -= vstep;
+        src += vstep;
+        dst += vstep;
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (npy_intp i = 0; i < len; ++i) {
+        dst[i] = src[i] <= scalar;
+    }
+}
+#endif
+
+
+
+
+#line 220
+
+#if NPY_SIMD
+/*
+ * Boolean `src1 == src2` for two contiguous byte arrays, where any non-zero
+ * input byte counts as true.
+ *
+ * args[0] and args[1] are the inputs, args[2] is the output, which receives
+ * one 0/1 byte per element; `len` is the number of elements to process.
+ */
+static void simd_binary_equal_b8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 vzero    = npyv_setall_u8(0x0);
+    const npyv_u8 truemask = npyv_setall_u8(0x1);
+    const int vstep        = npyv_nlanes_u8;
+
+    while (len >= vstep) {
+        // Comparing against zero flags the lanes that are false, so both
+        // masks hold the logical negation of their input.
+        const npyv_b8 lhs_zero = npyv_cmpeq_u8(npyv_load_u8(src1), vzero);
+        const npyv_b8 rhs_zero = npyv_cmpeq_u8(npyv_load_u8(src2), vzero);
+        // Equality is unchanged when both operands are negated.
+        const npyv_b8 res = npyv_xnor_b8(lhs_zero, rhs_zero);
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(res), truemask));
+
+        len  -= vstep;
+        src1 += vstep;
+        src2 += vstep;
+        dst  += vstep;
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (npy_intp i = 0; i < len; ++i) {
+        const npyv_lanetype_u8 lhs = src1[i] != 0;
+        const npyv_lanetype_u8 rhs = src2[i] != 0;
+        dst[i] = lhs == rhs;
+    }
+}
+
+/*
+ * Boolean `scalar == src[i]` for a contiguous byte array, where any non-zero
+ * byte (including the scalar) counts as true.
+ *
+ * args[0] points to the scalar operand, args[1] to the input array and
+ * args[2] to the output array, which receives one 0/1 byte per element;
+ * `len` is the number of elements to process.
+ */
+static void simd_binary_scalar1_equal_b8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst   = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 vzero     = npyv_setall_u8(0x0);
+    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
+    // Comparing against zero flags false lanes: 'a' is the negated scalar.
+    const npyv_b8 a         = npyv_cmpeq_u8(vscalar, vzero);
+    const npyv_u8 truemask  = npyv_setall_u8(0x1);
+    const int vstep         = npyv_nlanes_u8;
+    // Normalise the scalar to 0/1 so the tail loop treats every non-zero
+    // value as true, exactly like the vector loop above does.
+    const npyv_lanetype_u8 scalar_bool = scalar != 0;
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+        // 'b' flags the lanes of src that are zero (negated input).
+        npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
+        // Equality is unchanged when both operands are negated.
+        npyv_b8 c = npyv_xnor_b8(a, b);
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_u8 b = *src != 0;
+        *dst = scalar_bool == b;
+    }
+}
+
+/*
+ * Boolean `src[i] == scalar` for a contiguous byte array, where any non-zero
+ * byte (including the scalar) counts as true.
+ *
+ * args[0] points to the input array, args[1] to the scalar operand and
+ * args[2] to the output array, which receives one 0/1 byte per element;
+ * `len` is the number of elements to process.
+ */
+static void simd_binary_scalar2_equal_b8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst   = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 vzero     = npyv_setall_u8(0x0);
+    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
+    // Comparing against zero flags false lanes: 'b' is the negated scalar.
+    const npyv_b8 b         = npyv_cmpeq_u8(vscalar, vzero);
+    const npyv_u8 truemask  = npyv_setall_u8(0x1);
+    const int vstep         = npyv_nlanes_u8;
+    // Normalise the scalar to 0/1 so the tail loop treats every non-zero
+    // value as true, exactly like the vector loop above does.
+    const npyv_lanetype_u8 scalar_bool = scalar != 0;
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+        // 'a' flags the lanes of src that are zero (negated input).
+        npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
+        // Equality is unchanged when both operands are negated.
+        npyv_b8 c = npyv_xnor_b8(a, b);
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_u8 a = *src != 0;
+        *dst = a == scalar_bool;
+    }
+}
+#endif
+
+#line 220
+
+#if NPY_SIMD
+/*
+ * Boolean `src1 != src2` for two contiguous byte arrays, where any non-zero
+ * input byte counts as true.
+ *
+ * args[0] and args[1] are the inputs, args[2] is the output, which receives
+ * one 0/1 byte per element; `len` is the number of elements to process.
+ */
+static void simd_binary_not_equal_b8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 vzero    = npyv_setall_u8(0x0);
+    const npyv_u8 truemask = npyv_setall_u8(0x1);
+    const int vstep        = npyv_nlanes_u8;
+
+    while (len >= vstep) {
+        // Comparing against zero flags the lanes that are false, so both
+        // masks hold the logical negation of their input.
+        const npyv_b8 lhs_zero = npyv_cmpeq_u8(npyv_load_u8(src1), vzero);
+        const npyv_b8 rhs_zero = npyv_cmpeq_u8(npyv_load_u8(src2), vzero);
+        // Inequality is unchanged when both operands are negated.
+        const npyv_b8 res = npyv_xor_b8(lhs_zero, rhs_zero);
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(res), truemask));
+
+        len  -= vstep;
+        src1 += vstep;
+        src2 += vstep;
+        dst  += vstep;
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (npy_intp i = 0; i < len; ++i) {
+        const npyv_lanetype_u8 lhs = src1[i] != 0;
+        const npyv_lanetype_u8 rhs = src2[i] != 0;
+        dst[i] = lhs != rhs;
+    }
+}
+
+/*
+ * Boolean `scalar != src[i]` for a contiguous byte array, where any non-zero
+ * byte (including the scalar) counts as true.
+ *
+ * args[0] points to the scalar operand, args[1] to the input array and
+ * args[2] to the output array, which receives one 0/1 byte per element;
+ * `len` is the number of elements to process.
+ */
+static void simd_binary_scalar1_not_equal_b8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst   = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 vzero     = npyv_setall_u8(0x0);
+    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
+    // Comparing against zero flags false lanes: 'a' is the negated scalar.
+    const npyv_b8 a         = npyv_cmpeq_u8(vscalar, vzero);
+    const npyv_u8 truemask  = npyv_setall_u8(0x1);
+    const int vstep         = npyv_nlanes_u8;
+    // Normalise the scalar to 0/1 so the tail loop treats every non-zero
+    // value as true, exactly like the vector loop above does.
+    const npyv_lanetype_u8 scalar_bool = scalar != 0;
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+        // 'b' flags the lanes of src that are zero (negated input).
+        npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
+        // Inequality is unchanged when both operands are negated.
+        npyv_b8 c = npyv_xor_b8(a, b);
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_u8 b = *src != 0;
+        *dst = scalar_bool != b;
+    }
+}
+
+/*
+ * Boolean `src[i] != scalar` for a contiguous byte array, where any non-zero
+ * byte (including the scalar) counts as true.
+ *
+ * args[0] points to the input array, args[1] to the scalar operand and
+ * args[2] to the output array, which receives one 0/1 byte per element;
+ * `len` is the number of elements to process.
+ */
+static void simd_binary_scalar2_not_equal_b8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst   = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 vzero     = npyv_setall_u8(0x0);
+    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
+    // Comparing against zero flags false lanes: 'b' is the negated scalar.
+    const npyv_b8 b         = npyv_cmpeq_u8(vscalar, vzero);
+    const npyv_u8 truemask  = npyv_setall_u8(0x1);
+    const int vstep         = npyv_nlanes_u8;
+    // Normalise the scalar to 0/1 so the tail loop treats every non-zero
+    // value as true, exactly like the vector loop above does.
+    const npyv_lanetype_u8 scalar_bool = scalar != 0;
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+        // 'a' flags the lanes of src that are zero (negated input).
+        npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
+        // Inequality is unchanged when both operands are negated.
+        npyv_b8 c = npyv_xor_b8(a, b);
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_u8 a = *src != 0;
+        *dst = a != scalar_bool;
+    }
+}
+#endif
+
+#line 220
+
+#if NPY_SIMD
+/*
+ * Boolean `src1 < src2` for two contiguous byte arrays, where any non-zero
+ * input byte counts as true.
+ *
+ * args[0] and args[1] are the inputs, args[2] is the output, which receives
+ * one 0/1 byte per element; `len` is the number of elements to process.
+ */
+static void simd_binary_less_b8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 vzero    = npyv_setall_u8(0x0);
+    const npyv_u8 truemask = npyv_setall_u8(0x1);
+    const int vstep        = npyv_nlanes_u8;
+
+    while (len >= vstep) {
+        // Comparing against zero flags the lanes that are false, so both
+        // masks hold the logical negation of their input.
+        const npyv_b8 lhs_zero = npyv_cmpeq_u8(npyv_load_u8(src1), vzero);
+        const npyv_b8 rhs_zero = npyv_cmpeq_u8(npyv_load_u8(src2), vzero);
+        // NOTE(review): presumably andc(x, y) is x & ~y, which on the negated
+        // masks yields !src1 & src2, i.e. src1 < src2 -- confirm.
+        const npyv_b8 res = npyv_andc_b8(lhs_zero, rhs_zero);
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(res), truemask));
+
+        len  -= vstep;
+        src1 += vstep;
+        src2 += vstep;
+        dst  += vstep;
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (npy_intp i = 0; i < len; ++i) {
+        const npyv_lanetype_u8 lhs = src1[i] != 0;
+        const npyv_lanetype_u8 rhs = src2[i] != 0;
+        dst[i] = lhs < rhs;
+    }
+}
+
+/*
+ * Boolean `scalar < src[i]` for a contiguous byte array, where any non-zero
+ * byte (including the scalar) counts as true.
+ *
+ * args[0] points to the scalar operand, args[1] to the input array and
+ * args[2] to the output array, which receives one 0/1 byte per element;
+ * `len` is the number of elements to process.
+ */
+static void simd_binary_scalar1_less_b8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst   = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 vzero     = npyv_setall_u8(0x0);
+    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
+    // Comparing against zero flags false lanes: 'a' is the negated scalar.
+    const npyv_b8 a         = npyv_cmpeq_u8(vscalar, vzero);
+    const npyv_u8 truemask  = npyv_setall_u8(0x1);
+    const int vstep         = npyv_nlanes_u8;
+    // Normalise the scalar to 0/1 so the tail loop treats every non-zero
+    // value as true, exactly like the vector loop above does.
+    const npyv_lanetype_u8 scalar_bool = scalar != 0;
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+        // 'b' flags the lanes of src that are zero (negated input).
+        npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
+        // NOTE(review): presumably andc(x, y) is x & ~y, which on the negated
+        // masks yields !scalar & src, i.e. scalar < src -- confirm.
+        npyv_b8 c = npyv_andc_b8(a, b);
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_u8 b = *src != 0;
+        *dst = scalar_bool < b;
+    }
+}
+
+/*
+ * Boolean `src[i] < scalar` for a contiguous byte array, where any non-zero
+ * byte (including the scalar) counts as true.
+ *
+ * args[0] points to the input array, args[1] to the scalar operand and
+ * args[2] to the output array, which receives one 0/1 byte per element;
+ * `len` is the number of elements to process.
+ */
+static void simd_binary_scalar2_less_b8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst   = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 vzero     = npyv_setall_u8(0x0);
+    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
+    // Comparing against zero flags false lanes: 'b' is the negated scalar.
+    const npyv_b8 b         = npyv_cmpeq_u8(vscalar, vzero);
+    const npyv_u8 truemask  = npyv_setall_u8(0x1);
+    const int vstep         = npyv_nlanes_u8;
+    // Normalise the scalar to 0/1 so the tail loop treats every non-zero
+    // value as true, exactly like the vector loop above does.
+    const npyv_lanetype_u8 scalar_bool = scalar != 0;
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+        // 'a' flags the lanes of src that are zero (negated input).
+        npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
+        // NOTE(review): presumably andc(x, y) is x & ~y, which on the negated
+        // masks yields !src & scalar, i.e. src < scalar -- confirm.
+        npyv_b8 c = npyv_andc_b8(a, b);
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_u8 a = *src != 0;
+        *dst = a < scalar_bool;
+    }
+}
+#endif
+
+#line 220
+
+#if NPY_SIMD
+/*
+ * Boolean `src1 <= src2` for two contiguous byte arrays, where any non-zero
+ * input byte counts as true.
+ *
+ * args[0] and args[1] are the inputs, args[2] is the output, which receives
+ * one 0/1 byte per element; `len` is the number of elements to process.
+ */
+static void simd_binary_less_equal_b8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst  = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 vzero    = npyv_setall_u8(0x0);
+    const npyv_u8 truemask = npyv_setall_u8(0x1);
+    const int vstep        = npyv_nlanes_u8;
+
+    while (len >= vstep) {
+        // Comparing against zero flags the lanes that are false, so both
+        // masks hold the logical negation of their input.
+        const npyv_b8 lhs_zero = npyv_cmpeq_u8(npyv_load_u8(src1), vzero);
+        const npyv_b8 rhs_zero = npyv_cmpeq_u8(npyv_load_u8(src2), vzero);
+        // NOTE(review): presumably orc(x, y) is x | ~y, which on the negated
+        // masks yields !src1 | src2, i.e. src1 <= src2 -- confirm.
+        const npyv_b8 res = npyv_orc_b8(lhs_zero, rhs_zero);
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(res), truemask));
+
+        len  -= vstep;
+        src1 += vstep;
+        src2 += vstep;
+        dst  += vstep;
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (npy_intp i = 0; i < len; ++i) {
+        const npyv_lanetype_u8 lhs = src1[i] != 0;
+        const npyv_lanetype_u8 rhs = src2[i] != 0;
+        dst[i] = lhs <= rhs;
+    }
+}
+
+/*
+ * Boolean `scalar <= src[i]` for a contiguous byte array, where any non-zero
+ * byte (including the scalar) counts as true.
+ *
+ * args[0] points to the scalar operand, args[1] to the input array and
+ * args[2] to the output array, which receives one 0/1 byte per element;
+ * `len` is the number of elements to process.
+ */
+static void simd_binary_scalar1_less_equal_b8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst   = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 vzero     = npyv_setall_u8(0x0);
+    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
+    // Comparing against zero flags false lanes: 'a' is the negated scalar.
+    const npyv_b8 a         = npyv_cmpeq_u8(vscalar, vzero);
+    const npyv_u8 truemask  = npyv_setall_u8(0x1);
+    const int vstep         = npyv_nlanes_u8;
+    // Normalise the scalar to 0/1 so the tail loop treats every non-zero
+    // value as true, exactly like the vector loop above does.
+    const npyv_lanetype_u8 scalar_bool = scalar != 0;
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+        // 'b' flags the lanes of src that are zero (negated input).
+        npyv_b8 b = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
+        // NOTE(review): presumably orc(x, y) is x | ~y, which on the negated
+        // masks yields !scalar | src, i.e. scalar <= src -- confirm.
+        npyv_b8 c = npyv_orc_b8(a, b);
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_u8 b = *src != 0;
+        *dst = scalar_bool <= b;
+    }
+}
+
+/*
+ * Boolean `src[i] <= scalar` for a contiguous byte array, where any non-zero
+ * byte (including the scalar) counts as true.
+ *
+ * args[0] points to the input array, args[1] to the scalar operand and
+ * args[2] to the output array, which receives one 0/1 byte per element;
+ * `len` is the number of elements to process.
+ */
+static void simd_binary_scalar2_less_equal_b8(char **args, npy_intp len)
+{
+    npyv_lanetype_u8 *src   = (npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst   = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 vzero     = npyv_setall_u8(0x0);
+    const npyv_u8 vscalar   = npyv_setall_u8(scalar);
+    // Comparing against zero flags false lanes: 'b' is the negated scalar.
+    const npyv_b8 b         = npyv_cmpeq_u8(vscalar, vzero);
+    const npyv_u8 truemask  = npyv_setall_u8(0x1);
+    const int vstep         = npyv_nlanes_u8;
+    // Normalise the scalar to 0/1 so the tail loop treats every non-zero
+    // value as true, exactly like the vector loop above does.
+    const npyv_lanetype_u8 scalar_bool = scalar != 0;
+
+    for (; len >= vstep; len -= vstep, src += vstep, dst += vstep) {
+        // 'a' flags the lanes of src that are zero (negated input).
+        npyv_b8 a = npyv_cmpeq_u8(npyv_load_u8(src), vzero);
+        // NOTE(review): presumably orc(x, y) is x | ~y, which on the negated
+        // masks yields !src | scalar, i.e. src <= scalar -- confirm.
+        npyv_b8 c = npyv_orc_b8(a, b);
+        // Keep only the lowest bit of each byte so stored values are 0 or 1.
+        npyv_store_u8(dst, npyv_and_u8(npyv_cvt_u8_b8(c), truemask));
+    }
+
+    // Scalar tail for the elements that do not fill a whole vector.
+    for (; len > 0; --len, ++src, ++dst) {
+        const npyv_lanetype_u8 a = *src != 0;
+        *dst = a <= scalar_bool;
+    }
+}
+#endif
+
+
+#line 304
+#line 310
+#if !((1 || 0) && 0)
+/*
+ * Inner loop for boolean `equal`: hands the work to one of the SIMD kernels
+ * when the layout checks pass, otherwise compares element by element.
+ */
+static inline void
+run_binary_simd_equal_b8(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    /* The SIMD kernels are only tried when neither input overlaps the output. */
+    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
+          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))
+    ) {
+        /* argument one scalar */
+        if (IS_BINARY_CONT_S1(npy_ubyte, npy_bool)) {
+            simd_binary_scalar1_equal_b8(args, dimensions[0]);
+            return;
+        }
+        /* argument two scalar */
+        if (IS_BINARY_CONT_S2(npy_ubyte, npy_bool)) {
+            simd_binary_scalar2_equal_b8(args, dimensions[0]);
+            return;
+        }
+        /* remaining contiguous case */
+        if (IS_BINARY_CONT(npy_ubyte, npy_bool)) {
+            simd_binary_equal_b8(args, dimensions[0]);
+            return;
+        }
+    }
+#endif
+
+    /* Generic fallback: any non-zero input byte is treated as true. */
+    BINARY_LOOP {
+        const npy_bool lhs = *((npy_bool *)ip1) != 0;
+        const npy_bool rhs = *((npy_bool *)ip2) != 0;
+        *((npy_bool *)op1) = lhs == rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 1) && 0)
+/*
+ * Inner loop for boolean `not_equal`: hands the work to one of the SIMD
+ * kernels when the layout checks pass, otherwise compares element by element.
+ */
+static inline void
+run_binary_simd_not_equal_b8(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    /* The SIMD kernels are only tried when neither input overlaps the output. */
+    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
+          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))
+    ) {
+        /* argument one scalar */
+        if (IS_BINARY_CONT_S1(npy_ubyte, npy_bool)) {
+            simd_binary_scalar1_not_equal_b8(args, dimensions[0]);
+            return;
+        }
+        /* argument two scalar */
+        if (IS_BINARY_CONT_S2(npy_ubyte, npy_bool)) {
+            simd_binary_scalar2_not_equal_b8(args, dimensions[0]);
+            return;
+        }
+        /* remaining contiguous case */
+        if (IS_BINARY_CONT(npy_ubyte, npy_bool)) {
+            simd_binary_not_equal_b8(args, dimensions[0]);
+            return;
+        }
+    }
+#endif
+
+    /* Generic fallback: any non-zero input byte is treated as true. */
+    BINARY_LOOP {
+        const npy_bool lhs = *((npy_bool *)ip1) != 0;
+        const npy_bool rhs = *((npy_bool *)ip2) != 0;
+        *((npy_bool *)op1) = lhs != rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 0)
+/*
+ * Inner loop for boolean `less`: hands the work to one of the SIMD kernels
+ * when the layout checks pass, otherwise compares element by element.
+ */
+static inline void
+run_binary_simd_less_b8(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    /* The SIMD kernels are only tried when neither input overlaps the output. */
+    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
+          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))
+    ) {
+        /* argument one scalar */
+        if (IS_BINARY_CONT_S1(npy_ubyte, npy_bool)) {
+            simd_binary_scalar1_less_b8(args, dimensions[0]);
+            return;
+        }
+        /* argument two scalar */
+        if (IS_BINARY_CONT_S2(npy_ubyte, npy_bool)) {
+            simd_binary_scalar2_less_b8(args, dimensions[0]);
+            return;
+        }
+        /* remaining contiguous case */
+        if (IS_BINARY_CONT(npy_ubyte, npy_bool)) {
+            simd_binary_less_b8(args, dimensions[0]);
+            return;
+        }
+    }
+#endif
+
+    /* Generic fallback: any non-zero input byte is treated as true. */
+    BINARY_LOOP {
+        const npy_bool lhs = *((npy_bool *)ip1) != 0;
+        const npy_bool rhs = *((npy_bool *)ip2) != 0;
+        *((npy_bool *)op1) = lhs < rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 0)
+/*
+ * Inner loop for boolean `less_equal`: hands the work to one of the SIMD
+ * kernels when the layout checks pass, otherwise compares element by element.
+ */
+static inline void
+run_binary_simd_less_equal_b8(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    /* The SIMD kernels are only tried when neither input overlaps the output. */
+    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
+          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))
+    ) {
+        /* argument one scalar */
+        if (IS_BINARY_CONT_S1(npy_ubyte, npy_bool)) {
+            simd_binary_scalar1_less_equal_b8(args, dimensions[0]);
+            return;
+        }
+        /* argument two scalar */
+        if (IS_BINARY_CONT_S2(npy_ubyte, npy_bool)) {
+            simd_binary_scalar2_less_equal_b8(args, dimensions[0]);
+            return;
+        }
+        /* remaining contiguous case */
+        if (IS_BINARY_CONT(npy_ubyte, npy_bool)) {
+            simd_binary_less_equal_b8(args, dimensions[0]);
+            return;
+        }
+    }
+#endif
+
+    /* Generic fallback: any non-zero input byte is treated as true. */
+    BINARY_LOOP {
+        const npy_bool lhs = *((npy_bool *)ip1) != 0;
+        const npy_bool rhs = *((npy_bool *)ip2) != 0;
+        *((npy_bool *)op1) = lhs <= rhs;
+    }
+}
+#endif
+
+
+#line 304
+#line 310
+#if !((1 || 0) && 0)
+/*
+ * Inner loop for `equal` on npy_ubyte inputs: hands the work to one of the
+ * SIMD kernels when the layout checks pass, otherwise compares element by
+ * element.
+ */
+static inline void
+run_binary_simd_equal_u8(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    /* The SIMD kernels are only tried when neither input overlaps the output. */
+    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
+          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))
+    ) {
+        /* argument one scalar */
+        if (IS_BINARY_CONT_S1(npy_ubyte, npy_bool)) {
+            simd_binary_scalar1_equal_u8(args, dimensions[0]);
+            return;
+        }
+        /* argument two scalar */
+        if (IS_BINARY_CONT_S2(npy_ubyte, npy_bool)) {
+            simd_binary_scalar2_equal_u8(args, dimensions[0]);
+            return;
+        }
+        /* remaining contiguous case */
+        if (IS_BINARY_CONT(npy_ubyte, npy_bool)) {
+            simd_binary_equal_u8(args, dimensions[0]);
+            return;
+        }
+    }
+#endif
+
+    /* Generic fallback: compare the raw byte values. */
+    BINARY_LOOP {
+        const npy_ubyte lhs = *(npy_ubyte *)ip1;
+        const npy_ubyte rhs = *(npy_ubyte *)ip2;
+        *((npy_bool *)op1) = lhs == rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 1) && 0)
+/*
+ * Inner loop for `not_equal` on npy_ubyte inputs: hands the work to one of
+ * the SIMD kernels when the layout checks pass, otherwise compares element
+ * by element.
+ */
+static inline void
+run_binary_simd_not_equal_u8(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    /* The SIMD kernels are only tried when neither input overlaps the output. */
+    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
+          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))
+    ) {
+        /* argument one scalar */
+        if (IS_BINARY_CONT_S1(npy_ubyte, npy_bool)) {
+            simd_binary_scalar1_not_equal_u8(args, dimensions[0]);
+            return;
+        }
+        /* argument two scalar */
+        if (IS_BINARY_CONT_S2(npy_ubyte, npy_bool)) {
+            simd_binary_scalar2_not_equal_u8(args, dimensions[0]);
+            return;
+        }
+        /* remaining contiguous case */
+        if (IS_BINARY_CONT(npy_ubyte, npy_bool)) {
+            simd_binary_not_equal_u8(args, dimensions[0]);
+            return;
+        }
+    }
+#endif
+
+    /* Generic fallback: compare the raw byte values. */
+    BINARY_LOOP {
+        const npy_ubyte lhs = *(npy_ubyte *)ip1;
+        const npy_ubyte rhs = *(npy_ubyte *)ip2;
+        *((npy_bool *)op1) = lhs != rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 0)
+/*
+ * Inner loop for `less` on npy_ubyte inputs: hands the work to one of the
+ * SIMD kernels when the layout checks pass, otherwise compares element by
+ * element.
+ */
+static inline void
+run_binary_simd_less_u8(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    /* The SIMD kernels are only tried when neither input overlaps the output. */
+    if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], dimensions[0]) ||
+          is_mem_overlap(args[1], steps[1], args[2], steps[2], dimensions[0]))
+    ) {
+        /* argument one scalar */
+        if (IS_BINARY_CONT_S1(npy_ubyte, npy_bool)) {
+            simd_binary_scalar1_less_u8(args, dimensions[0]);
+            return;
+        }
+        /* argument two scalar */
+        if (IS_BINARY_CONT_S2(npy_ubyte, npy_bool)) {
+            simd_binary_scalar2_less_u8(args, dimensions[0]);
+            return;
+        }
+        /* remaining contiguous case */
+        if (IS_BINARY_CONT(npy_ubyte, npy_bool)) {
+            simd_binary_less_u8(args, dimensions[0]);
+            return;
+        }
+    }
+#endif
+
+    /* Generic fallback: compare the raw byte values. */
+    BINARY_LOOP {
+        const npy_ubyte lhs = *(npy_ubyte *)ip1;
+        const npy_ubyte rhs = *(npy_ubyte *)ip2;
+        *((npy_bool *)op1) = lhs < rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 0)
+/*
+ * Element-wise `less_equal` (lhs <= rhs) for npy_ubyte inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_less_equal_u8(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_ubyte, npy_bool)) {
+                simd_binary_scalar1_less_equal_u8(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_ubyte, npy_bool)) {
+                simd_binary_scalar2_less_equal_u8(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_ubyte, npy_bool)) {
+                simd_binary_less_equal_u8(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_ubyte lhs = *(npy_ubyte *)ip1;
+        const npy_ubyte rhs = *(npy_ubyte *)ip2;
+        *((npy_bool *)op1) = lhs <= rhs;
+    }
+}
+#endif
+
+
+#line 304
+#line 310
+#if !((1 || 0) && 1)
+/*
+ * Element-wise `equal` (lhs == rhs) for npy_byte inputs, storing npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ *
+ * NOTE(review): the enclosing `#if !((1 || 0) && 1)` evaluates to 0, so this
+ * definition is currently compiled out.
+ */
+static inline void
+run_binary_simd_equal_s8(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_byte, npy_bool)) {
+                simd_binary_scalar1_equal_s8(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_byte, npy_bool)) {
+                simd_binary_scalar2_equal_s8(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_byte, npy_bool)) {
+                simd_binary_equal_s8(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_byte lhs = *(npy_byte *)ip1;
+        const npy_byte rhs = *(npy_byte *)ip2;
+        *((npy_bool *)op1) = lhs == rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 1) && 1)
+/*
+ * Element-wise `not_equal` (lhs != rhs) for npy_byte inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ *
+ * NOTE(review): the enclosing `#if !((0 || 1) && 1)` evaluates to 0, so this
+ * definition is currently compiled out.
+ */
+static inline void
+run_binary_simd_not_equal_s8(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_byte, npy_bool)) {
+                simd_binary_scalar1_not_equal_s8(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_byte, npy_bool)) {
+                simd_binary_scalar2_not_equal_s8(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_byte, npy_bool)) {
+                simd_binary_not_equal_s8(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_byte lhs = *(npy_byte *)ip1;
+        const npy_byte rhs = *(npy_byte *)ip2;
+        *((npy_bool *)op1) = lhs != rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 1)
+/*
+ * Element-wise `less` (lhs < rhs) for npy_byte inputs, storing npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_less_s8(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_byte, npy_bool)) {
+                simd_binary_scalar1_less_s8(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_byte, npy_bool)) {
+                simd_binary_scalar2_less_s8(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_byte, npy_bool)) {
+                simd_binary_less_s8(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_byte lhs = *(npy_byte *)ip1;
+        const npy_byte rhs = *(npy_byte *)ip2;
+        *((npy_bool *)op1) = lhs < rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 1)
+/*
+ * Element-wise `less_equal` (lhs <= rhs) for npy_byte inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_less_equal_s8(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_byte, npy_bool)) {
+                simd_binary_scalar1_less_equal_s8(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_byte, npy_bool)) {
+                simd_binary_scalar2_less_equal_s8(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_byte, npy_bool)) {
+                simd_binary_less_equal_s8(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_byte lhs = *(npy_byte *)ip1;
+        const npy_byte rhs = *(npy_byte *)ip2;
+        *((npy_bool *)op1) = lhs <= rhs;
+    }
+}
+#endif
+
+
+#line 304
+#line 310
+#if !((1 || 0) && 0)
+/*
+ * Element-wise `equal` (lhs == rhs) for npy_ushort inputs, storing npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_equal_u16(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_ushort, npy_bool)) {
+                simd_binary_scalar1_equal_u16(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_ushort, npy_bool)) {
+                simd_binary_scalar2_equal_u16(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_ushort, npy_bool)) {
+                simd_binary_equal_u16(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_ushort lhs = *(npy_ushort *)ip1;
+        const npy_ushort rhs = *(npy_ushort *)ip2;
+        *((npy_bool *)op1) = lhs == rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 1) && 0)
+/*
+ * Element-wise `not_equal` (lhs != rhs) for npy_ushort inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_not_equal_u16(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_ushort, npy_bool)) {
+                simd_binary_scalar1_not_equal_u16(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_ushort, npy_bool)) {
+                simd_binary_scalar2_not_equal_u16(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_ushort, npy_bool)) {
+                simd_binary_not_equal_u16(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_ushort lhs = *(npy_ushort *)ip1;
+        const npy_ushort rhs = *(npy_ushort *)ip2;
+        *((npy_bool *)op1) = lhs != rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 0)
+/*
+ * Element-wise `less` (lhs < rhs) for npy_ushort inputs, storing npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_less_u16(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_ushort, npy_bool)) {
+                simd_binary_scalar1_less_u16(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_ushort, npy_bool)) {
+                simd_binary_scalar2_less_u16(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_ushort, npy_bool)) {
+                simd_binary_less_u16(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_ushort lhs = *(npy_ushort *)ip1;
+        const npy_ushort rhs = *(npy_ushort *)ip2;
+        *((npy_bool *)op1) = lhs < rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 0)
+/*
+ * Element-wise `less_equal` (lhs <= rhs) for npy_ushort inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_less_equal_u16(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_ushort, npy_bool)) {
+                simd_binary_scalar1_less_equal_u16(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_ushort, npy_bool)) {
+                simd_binary_scalar2_less_equal_u16(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_ushort, npy_bool)) {
+                simd_binary_less_equal_u16(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_ushort lhs = *(npy_ushort *)ip1;
+        const npy_ushort rhs = *(npy_ushort *)ip2;
+        *((npy_bool *)op1) = lhs <= rhs;
+    }
+}
+#endif
+
+
+#line 304
+#line 310
+#if !((1 || 0) && 1)
+/*
+ * Element-wise `equal` (lhs == rhs) for npy_short inputs, storing npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ *
+ * NOTE(review): the enclosing `#if !((1 || 0) && 1)` evaluates to 0, so this
+ * definition is currently compiled out.
+ */
+static inline void
+run_binary_simd_equal_s16(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_short, npy_bool)) {
+                simd_binary_scalar1_equal_s16(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_short, npy_bool)) {
+                simd_binary_scalar2_equal_s16(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_short, npy_bool)) {
+                simd_binary_equal_s16(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_short lhs = *(npy_short *)ip1;
+        const npy_short rhs = *(npy_short *)ip2;
+        *((npy_bool *)op1) = lhs == rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 1) && 1)
+/*
+ * Element-wise `not_equal` (lhs != rhs) for npy_short inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ *
+ * NOTE(review): the enclosing `#if !((0 || 1) && 1)` evaluates to 0, so this
+ * definition is currently compiled out.
+ */
+static inline void
+run_binary_simd_not_equal_s16(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_short, npy_bool)) {
+                simd_binary_scalar1_not_equal_s16(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_short, npy_bool)) {
+                simd_binary_scalar2_not_equal_s16(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_short, npy_bool)) {
+                simd_binary_not_equal_s16(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_short lhs = *(npy_short *)ip1;
+        const npy_short rhs = *(npy_short *)ip2;
+        *((npy_bool *)op1) = lhs != rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 1)
+/*
+ * Element-wise `less` (lhs < rhs) for npy_short inputs, storing npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_less_s16(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_short, npy_bool)) {
+                simd_binary_scalar1_less_s16(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_short, npy_bool)) {
+                simd_binary_scalar2_less_s16(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_short, npy_bool)) {
+                simd_binary_less_s16(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_short lhs = *(npy_short *)ip1;
+        const npy_short rhs = *(npy_short *)ip2;
+        *((npy_bool *)op1) = lhs < rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 1)
+/*
+ * Element-wise `less_equal` (lhs <= rhs) for npy_short inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_less_equal_s16(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_short, npy_bool)) {
+                simd_binary_scalar1_less_equal_s16(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_short, npy_bool)) {
+                simd_binary_scalar2_less_equal_s16(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_short, npy_bool)) {
+                simd_binary_less_equal_s16(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_short lhs = *(npy_short *)ip1;
+        const npy_short rhs = *(npy_short *)ip2;
+        *((npy_bool *)op1) = lhs <= rhs;
+    }
+}
+#endif
+
+
+#line 304
+#line 310
+#if !((1 || 0) && 0)
+/*
+ * Element-wise `equal` (lhs == rhs) for npy_uint inputs, storing npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_equal_u32(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_uint, npy_bool)) {
+                simd_binary_scalar1_equal_u32(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_uint, npy_bool)) {
+                simd_binary_scalar2_equal_u32(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_uint, npy_bool)) {
+                simd_binary_equal_u32(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_uint lhs = *(npy_uint *)ip1;
+        const npy_uint rhs = *(npy_uint *)ip2;
+        *((npy_bool *)op1) = lhs == rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 1) && 0)
+/*
+ * Element-wise `not_equal` (lhs != rhs) for npy_uint inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_not_equal_u32(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_uint, npy_bool)) {
+                simd_binary_scalar1_not_equal_u32(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_uint, npy_bool)) {
+                simd_binary_scalar2_not_equal_u32(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_uint, npy_bool)) {
+                simd_binary_not_equal_u32(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_uint lhs = *(npy_uint *)ip1;
+        const npy_uint rhs = *(npy_uint *)ip2;
+        *((npy_bool *)op1) = lhs != rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 0)
+/*
+ * Element-wise `less` (lhs < rhs) for npy_uint inputs, storing npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_less_u32(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_uint, npy_bool)) {
+                simd_binary_scalar1_less_u32(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_uint, npy_bool)) {
+                simd_binary_scalar2_less_u32(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_uint, npy_bool)) {
+                simd_binary_less_u32(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_uint lhs = *(npy_uint *)ip1;
+        const npy_uint rhs = *(npy_uint *)ip2;
+        *((npy_bool *)op1) = lhs < rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 0)
+/*
+ * Element-wise `less_equal` (lhs <= rhs) for npy_uint inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_less_equal_u32(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_uint, npy_bool)) {
+                simd_binary_scalar1_less_equal_u32(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_uint, npy_bool)) {
+                simd_binary_scalar2_less_equal_u32(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_uint, npy_bool)) {
+                simd_binary_less_equal_u32(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_uint lhs = *(npy_uint *)ip1;
+        const npy_uint rhs = *(npy_uint *)ip2;
+        *((npy_bool *)op1) = lhs <= rhs;
+    }
+}
+#endif
+
+
+#line 304
+#line 310
+#if !((1 || 0) && 1)
+/*
+ * Element-wise `equal` (lhs == rhs) for npy_int inputs, storing npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ *
+ * NOTE(review): the enclosing `#if !((1 || 0) && 1)` evaluates to 0, so this
+ * definition is currently compiled out.
+ */
+static inline void
+run_binary_simd_equal_s32(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_int, npy_bool)) {
+                simd_binary_scalar1_equal_s32(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_int, npy_bool)) {
+                simd_binary_scalar2_equal_s32(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_int, npy_bool)) {
+                simd_binary_equal_s32(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_int lhs = *(npy_int *)ip1;
+        const npy_int rhs = *(npy_int *)ip2;
+        *((npy_bool *)op1) = lhs == rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 1) && 1)
+/*
+ * Element-wise `not_equal` (lhs != rhs) for npy_int inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ *
+ * NOTE(review): the enclosing `#if !((0 || 1) && 1)` evaluates to 0, so this
+ * definition is currently compiled out.
+ */
+static inline void
+run_binary_simd_not_equal_s32(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_int, npy_bool)) {
+                simd_binary_scalar1_not_equal_s32(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_int, npy_bool)) {
+                simd_binary_scalar2_not_equal_s32(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_int, npy_bool)) {
+                simd_binary_not_equal_s32(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_int lhs = *(npy_int *)ip1;
+        const npy_int rhs = *(npy_int *)ip2;
+        *((npy_bool *)op1) = lhs != rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 1)
+/*
+ * Element-wise `less` (lhs < rhs) for npy_int inputs, storing npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_less_s32(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_int, npy_bool)) {
+                simd_binary_scalar1_less_s32(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_int, npy_bool)) {
+                simd_binary_scalar2_less_s32(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_int, npy_bool)) {
+                simd_binary_less_s32(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_int lhs = *(npy_int *)ip1;
+        const npy_int rhs = *(npy_int *)ip2;
+        *((npy_bool *)op1) = lhs < rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 1)
+/*
+ * Element-wise `less_equal` (lhs <= rhs) for npy_int inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_less_equal_s32(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_int, npy_bool)) {
+                simd_binary_scalar1_less_equal_s32(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_int, npy_bool)) {
+                simd_binary_scalar2_less_equal_s32(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_int, npy_bool)) {
+                simd_binary_less_equal_s32(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_int lhs = *(npy_int *)ip1;
+        const npy_int rhs = *(npy_int *)ip2;
+        *((npy_bool *)op1) = lhs <= rhs;
+    }
+}
+#endif
+
+
+#line 304
+#line 310
+#if !((1 || 0) && 0)
+/*
+ * Element-wise `equal` (lhs == rhs) for npy_ulonglong inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_equal_u64(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_ulonglong, npy_bool)) {
+                simd_binary_scalar1_equal_u64(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_ulonglong, npy_bool)) {
+                simd_binary_scalar2_equal_u64(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_ulonglong, npy_bool)) {
+                simd_binary_equal_u64(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_ulonglong lhs = *(npy_ulonglong *)ip1;
+        const npy_ulonglong rhs = *(npy_ulonglong *)ip2;
+        *((npy_bool *)op1) = lhs == rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 1) && 0)
+/*
+ * Element-wise `not_equal` (lhs != rhs) for npy_ulonglong inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_not_equal_u64(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_ulonglong, npy_bool)) {
+                simd_binary_scalar1_not_equal_u64(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_ulonglong, npy_bool)) {
+                simd_binary_scalar2_not_equal_u64(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_ulonglong, npy_bool)) {
+                simd_binary_not_equal_u64(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_ulonglong lhs = *(npy_ulonglong *)ip1;
+        const npy_ulonglong rhs = *(npy_ulonglong *)ip2;
+        *((npy_bool *)op1) = lhs != rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 0)
+/*
+ * Element-wise `less` (lhs < rhs) for npy_ulonglong inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_less_u64(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_ulonglong, npy_bool)) {
+                simd_binary_scalar1_less_u64(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_ulonglong, npy_bool)) {
+                simd_binary_scalar2_less_u64(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_ulonglong, npy_bool)) {
+                simd_binary_less_u64(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_ulonglong lhs = *(npy_ulonglong *)ip1;
+        const npy_ulonglong rhs = *(npy_ulonglong *)ip2;
+        *((npy_bool *)op1) = lhs < rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 0)
+/*
+ * Element-wise `less_equal` (lhs <= rhs) for npy_ulonglong inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ */
+static inline void
+run_binary_simd_less_equal_u64(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_ulonglong, npy_bool)) {
+                simd_binary_scalar1_less_equal_u64(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_ulonglong, npy_bool)) {
+                simd_binary_scalar2_less_equal_u64(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_ulonglong, npy_bool)) {
+                simd_binary_less_equal_u64(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_ulonglong lhs = *(npy_ulonglong *)ip1;
+        const npy_ulonglong rhs = *(npy_ulonglong *)ip2;
+        *((npy_bool *)op1) = lhs <= rhs;
+    }
+}
+#endif
+
+
+#line 304
+#line 310
+#if !((1 || 0) && 1)
+/*
+ * Element-wise `equal` (lhs == rhs) for npy_longlong inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ *
+ * NOTE(review): the enclosing `#if !((1 || 0) && 1)` evaluates to 0, so this
+ * definition is currently compiled out.
+ */
+static inline void
+run_binary_simd_equal_s64(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_longlong, npy_bool)) {
+                simd_binary_scalar1_equal_s64(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_longlong, npy_bool)) {
+                simd_binary_scalar2_equal_s64(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_longlong, npy_bool)) {
+                simd_binary_equal_s64(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_longlong lhs = *(npy_longlong *)ip1;
+        const npy_longlong rhs = *(npy_longlong *)ip2;
+        *((npy_bool *)op1) = lhs == rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 1) && 1)
+/*
+ * Element-wise `not_equal` (lhs != rhs) for npy_longlong inputs, storing
+ * npy_bool.
+ *
+ * Under NPY_SIMD, when is_mem_overlap() reports no overlap between
+ * args[0]/args[1] and args[2], the first layout test that matches selects a
+ * SIMD kernel and the function returns early.  Every other case is handled
+ * by the BINARY_LOOP fallback at the bottom.  dimensions[0] is the length
+ * forwarded to the overlap checks and to the kernels.
+ *
+ * NOTE(review): the enclosing `#if !((0 || 1) && 1)` evaluates to 0, so this
+ * definition is currently compiled out.
+ */
+static inline void
+run_binary_simd_not_equal_s64(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    {
+        const npy_intp len = dimensions[0];
+
+        if (!(is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+              is_mem_overlap(args[1], steps[1], args[2], steps[2], len))) {
+            /* first argument is a scalar */
+            if (IS_BINARY_CONT_S1(npy_longlong, npy_bool)) {
+                simd_binary_scalar1_not_equal_s64(args, len);
+                return;
+            }
+            /* second argument is a scalar */
+            if (IS_BINARY_CONT_S2(npy_longlong, npy_bool)) {
+                simd_binary_scalar2_not_equal_s64(args, len);
+                return;
+            }
+            /* checked last; see IS_BINARY_CONT for the exact layout test */
+            if (IS_BINARY_CONT(npy_longlong, npy_bool)) {
+                simd_binary_not_equal_s64(args, len);
+                return;
+            }
+        }
+    }
+#endif
+
+    /* Fallback: one comparison per element. */
+    BINARY_LOOP {
+        const npy_longlong lhs = *(npy_longlong *)ip1;
+        const npy_longlong rhs = *(npy_longlong *)ip2;
+        *((npy_bool *)op1) = lhs != rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 1)
+/*
+ * Writes `in1 < in2` as npy_bool for every pair of npy_longlong inputs.
+ * The simd_binary_*_less_s64 kernels are tried only when is_mem_overlap()
+ * reports that the output overlaps neither input; everything else falls
+ * through to the strided scalar loop.
+ */
+static inline void
+run_binary_simd_less_s64(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    const npy_intp len = dimensions[0];
+    const int overlapping =
+        is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+        is_mem_overlap(args[1], steps[1], args[2], steps[2], len);
+
+    if (!overlapping) {
+        if (IS_BINARY_CONT_S1(npy_longlong, npy_bool)) {
+            /* first argument is the scalar one */
+            simd_binary_scalar1_less_s64(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT_S2(npy_longlong, npy_bool)) {
+            /* second argument is the scalar one */
+            simd_binary_scalar2_less_s64(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT(npy_longlong, npy_bool)) {
+            simd_binary_less_s64(args, len);
+            return;
+        }
+    }
+#endif
+
+    /* generic strided fallback */
+    BINARY_LOOP {
+        const npy_longlong lhs = *(npy_longlong *)ip1;
+        const npy_longlong rhs = *(npy_longlong *)ip2;
+        *((npy_bool *)op1) = lhs < rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 1)
+/*
+ * Writes `in1 <= in2` as npy_bool for every pair of npy_longlong inputs.
+ * The simd_binary_*_less_equal_s64 kernels are tried only when
+ * is_mem_overlap() reports that the output overlaps neither input;
+ * everything else falls through to the strided scalar loop.
+ */
+static inline void
+run_binary_simd_less_equal_s64(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    const npy_intp len = dimensions[0];
+    const int overlapping =
+        is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+        is_mem_overlap(args[1], steps[1], args[2], steps[2], len);
+
+    if (!overlapping) {
+        if (IS_BINARY_CONT_S1(npy_longlong, npy_bool)) {
+            /* first argument is the scalar one */
+            simd_binary_scalar1_less_equal_s64(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT_S2(npy_longlong, npy_bool)) {
+            /* second argument is the scalar one */
+            simd_binary_scalar2_less_equal_s64(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT(npy_longlong, npy_bool)) {
+            simd_binary_less_equal_s64(args, len);
+            return;
+        }
+    }
+#endif
+
+    /* generic strided fallback */
+    BINARY_LOOP {
+        const npy_longlong lhs = *(npy_longlong *)ip1;
+        const npy_longlong rhs = *(npy_longlong *)ip2;
+        *((npy_bool *)op1) = lhs <= rhs;
+    }
+}
+#endif
+
+
+#line 304
+#line 310
+#if !((1 || 0) && 0)
+/*
+ * Writes `in1 == in2` as npy_bool for every pair of npy_float inputs.
+ * The simd_binary_*_equal_f32 kernels are tried only when is_mem_overlap()
+ * reports that the output overlaps neither input; everything else falls
+ * through to the strided scalar loop.
+ */
+static inline void
+run_binary_simd_equal_f32(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD_F32
+    const npy_intp len = dimensions[0];
+    const int overlapping =
+        is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+        is_mem_overlap(args[1], steps[1], args[2], steps[2], len);
+
+    if (!overlapping) {
+        if (IS_BINARY_CONT_S1(npy_float, npy_bool)) {
+            /* first argument is the scalar one */
+            simd_binary_scalar1_equal_f32(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT_S2(npy_float, npy_bool)) {
+            /* second argument is the scalar one */
+            simd_binary_scalar2_equal_f32(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT(npy_float, npy_bool)) {
+            simd_binary_equal_f32(args, len);
+            return;
+        }
+    }
+#endif
+
+    /* generic strided fallback */
+    BINARY_LOOP {
+        const npy_float lhs = *(npy_float *)ip1;
+        const npy_float rhs = *(npy_float *)ip2;
+        *((npy_bool *)op1) = lhs == rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 1) && 0)
+/*
+ * Writes `in1 != in2` as npy_bool for every pair of npy_float inputs.
+ * The simd_binary_*_not_equal_f32 kernels are tried only when
+ * is_mem_overlap() reports that the output overlaps neither input;
+ * everything else falls through to the strided scalar loop.
+ */
+static inline void
+run_binary_simd_not_equal_f32(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD_F32
+    const npy_intp len = dimensions[0];
+    const int overlapping =
+        is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+        is_mem_overlap(args[1], steps[1], args[2], steps[2], len);
+
+    if (!overlapping) {
+        if (IS_BINARY_CONT_S1(npy_float, npy_bool)) {
+            /* first argument is the scalar one */
+            simd_binary_scalar1_not_equal_f32(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT_S2(npy_float, npy_bool)) {
+            /* second argument is the scalar one */
+            simd_binary_scalar2_not_equal_f32(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT(npy_float, npy_bool)) {
+            simd_binary_not_equal_f32(args, len);
+            return;
+        }
+    }
+#endif
+
+    /* generic strided fallback */
+    BINARY_LOOP {
+        const npy_float lhs = *(npy_float *)ip1;
+        const npy_float rhs = *(npy_float *)ip2;
+        *((npy_bool *)op1) = lhs != rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 0)
+/*
+ * Writes `in1 < in2` as npy_bool for every pair of npy_float inputs.
+ * The simd_binary_*_less_f32 kernels are tried only when is_mem_overlap()
+ * reports that the output overlaps neither input; everything else falls
+ * through to the strided scalar loop.
+ */
+static inline void
+run_binary_simd_less_f32(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD_F32
+    const npy_intp len = dimensions[0];
+    const int overlapping =
+        is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+        is_mem_overlap(args[1], steps[1], args[2], steps[2], len);
+
+    if (!overlapping) {
+        if (IS_BINARY_CONT_S1(npy_float, npy_bool)) {
+            /* first argument is the scalar one */
+            simd_binary_scalar1_less_f32(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT_S2(npy_float, npy_bool)) {
+            /* second argument is the scalar one */
+            simd_binary_scalar2_less_f32(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT(npy_float, npy_bool)) {
+            simd_binary_less_f32(args, len);
+            return;
+        }
+    }
+#endif
+
+    /* generic strided fallback */
+    BINARY_LOOP {
+        const npy_float lhs = *(npy_float *)ip1;
+        const npy_float rhs = *(npy_float *)ip2;
+        *((npy_bool *)op1) = lhs < rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 0)
+/*
+ * Writes `in1 <= in2` as npy_bool for every pair of npy_float inputs.
+ * The simd_binary_*_less_equal_f32 kernels are tried only when
+ * is_mem_overlap() reports that the output overlaps neither input;
+ * everything else falls through to the strided scalar loop.
+ */
+static inline void
+run_binary_simd_less_equal_f32(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD_F32
+    const npy_intp len = dimensions[0];
+    const int overlapping =
+        is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+        is_mem_overlap(args[1], steps[1], args[2], steps[2], len);
+
+    if (!overlapping) {
+        if (IS_BINARY_CONT_S1(npy_float, npy_bool)) {
+            /* first argument is the scalar one */
+            simd_binary_scalar1_less_equal_f32(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT_S2(npy_float, npy_bool)) {
+            /* second argument is the scalar one */
+            simd_binary_scalar2_less_equal_f32(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT(npy_float, npy_bool)) {
+            simd_binary_less_equal_f32(args, len);
+            return;
+        }
+    }
+#endif
+
+    /* generic strided fallback */
+    BINARY_LOOP {
+        const npy_float lhs = *(npy_float *)ip1;
+        const npy_float rhs = *(npy_float *)ip2;
+        *((npy_bool *)op1) = lhs <= rhs;
+    }
+}
+#endif
+
+
+#line 304
+#line 310
+#if !((1 || 0) && 0)
+/*
+ * Writes `in1 == in2` as npy_bool for every pair of npy_double inputs.
+ * The simd_binary_*_equal_f64 kernels are tried only when is_mem_overlap()
+ * reports that the output overlaps neither input; everything else falls
+ * through to the strided scalar loop.
+ */
+static inline void
+run_binary_simd_equal_f64(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD_F64
+    const npy_intp len = dimensions[0];
+    const int overlapping =
+        is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+        is_mem_overlap(args[1], steps[1], args[2], steps[2], len);
+
+    if (!overlapping) {
+        if (IS_BINARY_CONT_S1(npy_double, npy_bool)) {
+            /* first argument is the scalar one */
+            simd_binary_scalar1_equal_f64(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT_S2(npy_double, npy_bool)) {
+            /* second argument is the scalar one */
+            simd_binary_scalar2_equal_f64(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT(npy_double, npy_bool)) {
+            simd_binary_equal_f64(args, len);
+            return;
+        }
+    }
+#endif
+
+    /* generic strided fallback */
+    BINARY_LOOP {
+        const npy_double lhs = *(npy_double *)ip1;
+        const npy_double rhs = *(npy_double *)ip2;
+        *((npy_bool *)op1) = lhs == rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 1) && 0)
+/*
+ * Writes `in1 != in2` as npy_bool for every pair of npy_double inputs.
+ * The simd_binary_*_not_equal_f64 kernels are tried only when
+ * is_mem_overlap() reports that the output overlaps neither input;
+ * everything else falls through to the strided scalar loop.
+ */
+static inline void
+run_binary_simd_not_equal_f64(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD_F64
+    const npy_intp len = dimensions[0];
+    const int overlapping =
+        is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+        is_mem_overlap(args[1], steps[1], args[2], steps[2], len);
+
+    if (!overlapping) {
+        if (IS_BINARY_CONT_S1(npy_double, npy_bool)) {
+            /* first argument is the scalar one */
+            simd_binary_scalar1_not_equal_f64(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT_S2(npy_double, npy_bool)) {
+            /* second argument is the scalar one */
+            simd_binary_scalar2_not_equal_f64(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT(npy_double, npy_bool)) {
+            simd_binary_not_equal_f64(args, len);
+            return;
+        }
+    }
+#endif
+
+    /* generic strided fallback */
+    BINARY_LOOP {
+        const npy_double lhs = *(npy_double *)ip1;
+        const npy_double rhs = *(npy_double *)ip2;
+        *((npy_bool *)op1) = lhs != rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 0)
+/*
+ * Writes `in1 < in2` as npy_bool for every pair of npy_double inputs.
+ * The simd_binary_*_less_f64 kernels are tried only when is_mem_overlap()
+ * reports that the output overlaps neither input; everything else falls
+ * through to the strided scalar loop.
+ */
+static inline void
+run_binary_simd_less_f64(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD_F64
+    const npy_intp len = dimensions[0];
+    const int overlapping =
+        is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+        is_mem_overlap(args[1], steps[1], args[2], steps[2], len);
+
+    if (!overlapping) {
+        if (IS_BINARY_CONT_S1(npy_double, npy_bool)) {
+            /* first argument is the scalar one */
+            simd_binary_scalar1_less_f64(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT_S2(npy_double, npy_bool)) {
+            /* second argument is the scalar one */
+            simd_binary_scalar2_less_f64(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT(npy_double, npy_bool)) {
+            simd_binary_less_f64(args, len);
+            return;
+        }
+    }
+#endif
+
+    /* generic strided fallback */
+    BINARY_LOOP {
+        const npy_double lhs = *(npy_double *)ip1;
+        const npy_double rhs = *(npy_double *)ip2;
+        *((npy_bool *)op1) = lhs < rhs;
+    }
+}
+#endif
+
+#line 310
+#if !((0 || 0) && 0)
+/*
+ * Writes `in1 <= in2` as npy_bool for every pair of npy_double inputs.
+ * The simd_binary_*_less_equal_f64 kernels are tried only when
+ * is_mem_overlap() reports that the output overlaps neither input;
+ * everything else falls through to the strided scalar loop.
+ */
+static inline void
+run_binary_simd_less_equal_f64(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD_F64
+    const npy_intp len = dimensions[0];
+    const int overlapping =
+        is_mem_overlap(args[0], steps[0], args[2], steps[2], len) ||
+        is_mem_overlap(args[1], steps[1], args[2], steps[2], len);
+
+    if (!overlapping) {
+        if (IS_BINARY_CONT_S1(npy_double, npy_bool)) {
+            /* first argument is the scalar one */
+            simd_binary_scalar1_less_equal_f64(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT_S2(npy_double, npy_bool)) {
+            /* second argument is the scalar one */
+            simd_binary_scalar2_less_equal_f64(args, len);
+            return;
+        }
+        if (IS_BINARY_CONT(npy_double, npy_bool)) {
+            simd_binary_less_equal_f64(args, len);
+            return;
+        }
+    }
+#endif
+
+    /* generic strided fallback */
+    BINARY_LOOP {
+        const npy_double lhs = *(npy_double *)ip1;
+        const npy_double rhs = *(npy_double *)ip2;
+        *((npy_bool *)op1) = lhs <= rhs;
+    }
+}
+#endif
+
+
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+
+/*
+ * To reduce the size of the binary generated from this source, two rules
+ * are applied:
+ *   1) each data type implements 'greater' as a call to 'less' with the two
+ *      input arguments swapped; likewise, 'greater_equal' is implemented as
+ *      a call to 'less_equal' with the inputs swapped;
+ *   2) for integer data types of the same size (e.g. 8-bit), a single
+ *      kernel of 'equal' and 'not_equal' serves both the signed and the
+ *      unsigned type.
+ */
+
+#line 372
+#undef TO_SIMD_SFX
+#undef TO_SIMD_UTYPE
+#if 0
+#line 378
+#elif NPY_BITSOF_BYTE == 8
+    #define TO_SIMD_UTYPE(X) X##_u8
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 378
+#elif NPY_BITSOF_BYTE == 16
+    #define TO_SIMD_UTYPE(X) X##_u16
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 378
+#elif NPY_BITSOF_BYTE == 32
+    #define TO_SIMD_UTYPE(X) X##_u32
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 378
+#elif NPY_BITSOF_BYTE == 64
+    #define TO_SIMD_UTYPE(X) X##_u64
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif
+
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_greater)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* a > b is computed as b < a: exchange the inputs, keep the output. */
+    char *swapped[3];
+    npy_intp swapped_steps[3];
+
+    swapped[0] = args[1];
+    swapped[1] = args[0];
+    swapped[2] = args[2];
+    swapped_steps[0] = steps[1];
+    swapped_steps[1] = steps[0];
+    swapped_steps[2] = steps[2];
+    TO_SIMD_SFX(run_binary_simd_less)(swapped, dimensions, swapped_steps);
+}
+
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_greater_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* a >= b is computed as b <= a: exchange the inputs, keep the output. */
+    char *swapped[3];
+    npy_intp swapped_steps[3];
+
+    swapped[0] = args[1];
+    swapped[1] = args[0];
+    swapped[2] = args[2];
+    swapped_steps[0] = steps[1];
+    swapped_steps[1] = steps[0];
+    swapped_steps[2] = steps[2];
+    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped, dimensions, swapped_steps);
+}
+
+
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_less)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'less' kernel whose suffix TO_SIMD_SFX picks. */
+    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
+}
+
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_less_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'less_equal' kernel whose suffix TO_SIMD_SFX picks. */
+    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
+}
+
+
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'equal' kernel suffixed by TO_SIMD_UTYPE. */
+    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
+}
+
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_not_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'not_equal' kernel suffixed by TO_SIMD_UTYPE. */
+    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
+}
+
+
+#line 372
+#undef TO_SIMD_SFX
+#undef TO_SIMD_UTYPE
+#if 0
+#line 378
+#elif NPY_BITSOF_SHORT == 8
+    #define TO_SIMD_UTYPE(X) X##_u8
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 378
+#elif NPY_BITSOF_SHORT == 16
+    #define TO_SIMD_UTYPE(X) X##_u16
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 378
+#elif NPY_BITSOF_SHORT == 32
+    #define TO_SIMD_UTYPE(X) X##_u32
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 378
+#elif NPY_BITSOF_SHORT == 64
+    #define TO_SIMD_UTYPE(X) X##_u64
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif
+
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_greater)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* a > b is computed as b < a: exchange the inputs, keep the output. */
+    char *swapped[3];
+    npy_intp swapped_steps[3];
+
+    swapped[0] = args[1];
+    swapped[1] = args[0];
+    swapped[2] = args[2];
+    swapped_steps[0] = steps[1];
+    swapped_steps[1] = steps[0];
+    swapped_steps[2] = steps[2];
+    TO_SIMD_SFX(run_binary_simd_less)(swapped, dimensions, swapped_steps);
+}
+
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_greater_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* a >= b is computed as b <= a: exchange the inputs, keep the output. */
+    char *swapped[3];
+    npy_intp swapped_steps[3];
+
+    swapped[0] = args[1];
+    swapped[1] = args[0];
+    swapped[2] = args[2];
+    swapped_steps[0] = steps[1];
+    swapped_steps[1] = steps[0];
+    swapped_steps[2] = steps[2];
+    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped, dimensions, swapped_steps);
+}
+
+
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_less)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'less' kernel whose suffix TO_SIMD_SFX picks. */
+    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
+}
+
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_less_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'less_equal' kernel whose suffix TO_SIMD_SFX picks. */
+    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
+}
+
+
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'equal' kernel suffixed by TO_SIMD_UTYPE. */
+    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
+}
+
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_not_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'not_equal' kernel suffixed by TO_SIMD_UTYPE. */
+    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
+}
+
+
+#line 372
+#undef TO_SIMD_SFX
+#undef TO_SIMD_UTYPE
+#if 0
+#line 378
+#elif NPY_BITSOF_INT == 8
+    #define TO_SIMD_UTYPE(X) X##_u8
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 378
+#elif NPY_BITSOF_INT == 16
+    #define TO_SIMD_UTYPE(X) X##_u16
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 378
+#elif NPY_BITSOF_INT == 32
+    #define TO_SIMD_UTYPE(X) X##_u32
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 378
+#elif NPY_BITSOF_INT == 64
+    #define TO_SIMD_UTYPE(X) X##_u64
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif
+
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_greater)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* a > b is computed as b < a: exchange the inputs, keep the output. */
+    char *swapped[3];
+    npy_intp swapped_steps[3];
+
+    swapped[0] = args[1];
+    swapped[1] = args[0];
+    swapped[2] = args[2];
+    swapped_steps[0] = steps[1];
+    swapped_steps[1] = steps[0];
+    swapped_steps[2] = steps[2];
+    TO_SIMD_SFX(run_binary_simd_less)(swapped, dimensions, swapped_steps);
+}
+
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_greater_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* a >= b is computed as b <= a: exchange the inputs, keep the output. */
+    char *swapped[3];
+    npy_intp swapped_steps[3];
+
+    swapped[0] = args[1];
+    swapped[1] = args[0];
+    swapped[2] = args[2];
+    swapped_steps[0] = steps[1];
+    swapped_steps[1] = steps[0];
+    swapped_steps[2] = steps[2];
+    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped, dimensions, swapped_steps);
+}
+
+
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_less)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'less' kernel whose suffix TO_SIMD_SFX picks. */
+    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
+}
+
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_less_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'less_equal' kernel whose suffix TO_SIMD_SFX picks. */
+    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
+}
+
+
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'equal' kernel suffixed by TO_SIMD_UTYPE. */
+    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
+}
+
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_not_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'not_equal' kernel suffixed by TO_SIMD_UTYPE. */
+    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
+}
+
+
+#line 372
+#undef TO_SIMD_SFX
+#undef TO_SIMD_UTYPE
+#if 0
+#line 378
+#elif NPY_BITSOF_LONG == 8
+    #define TO_SIMD_UTYPE(X) X##_u8
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 378
+#elif NPY_BITSOF_LONG == 16
+    #define TO_SIMD_UTYPE(X) X##_u16
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 378
+#elif NPY_BITSOF_LONG == 32
+    #define TO_SIMD_UTYPE(X) X##_u32
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 378
+#elif NPY_BITSOF_LONG == 64
+    #define TO_SIMD_UTYPE(X) X##_u64
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif
+
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_greater)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* a > b is computed as b < a: exchange the inputs, keep the output. */
+    char *swapped[3];
+    npy_intp swapped_steps[3];
+
+    swapped[0] = args[1];
+    swapped[1] = args[0];
+    swapped[2] = args[2];
+    swapped_steps[0] = steps[1];
+    swapped_steps[1] = steps[0];
+    swapped_steps[2] = steps[2];
+    TO_SIMD_SFX(run_binary_simd_less)(swapped, dimensions, swapped_steps);
+}
+
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_greater_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* a >= b is computed as b <= a: exchange the inputs, keep the output. */
+    char *swapped[3];
+    npy_intp swapped_steps[3];
+
+    swapped[0] = args[1];
+    swapped[1] = args[0];
+    swapped[2] = args[2];
+    swapped_steps[0] = steps[1];
+    swapped_steps[1] = steps[0];
+    swapped_steps[2] = steps[2];
+    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped, dimensions, swapped_steps);
+}
+
+
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_less)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'less' kernel whose suffix TO_SIMD_SFX picks. */
+    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
+}
+
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_less_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'less_equal' kernel whose suffix TO_SIMD_SFX picks. */
+    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
+}
+
+
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'equal' kernel suffixed by TO_SIMD_UTYPE. */
+    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
+}
+
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_not_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'not_equal' kernel suffixed by TO_SIMD_UTYPE. */
+    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
+}
+
+
+#line 372
+#undef TO_SIMD_SFX
+#undef TO_SIMD_UTYPE
+#if 0
+#line 378
+#elif NPY_BITSOF_LONGLONG == 8
+    #define TO_SIMD_UTYPE(X) X##_u8
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 378
+#elif NPY_BITSOF_LONGLONG == 16
+    #define TO_SIMD_UTYPE(X) X##_u16
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 378
+#elif NPY_BITSOF_LONGLONG == 32
+    #define TO_SIMD_UTYPE(X) X##_u32
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 378
+#elif NPY_BITSOF_LONGLONG == 64
+    #define TO_SIMD_UTYPE(X) X##_u64
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif
+
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_greater)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* a > b is computed as b < a: exchange the inputs, keep the output. */
+    char *swapped[3];
+    npy_intp swapped_steps[3];
+
+    swapped[0] = args[1];
+    swapped[1] = args[0];
+    swapped[2] = args[2];
+    swapped_steps[0] = steps[1];
+    swapped_steps[1] = steps[0];
+    swapped_steps[2] = steps[2];
+    TO_SIMD_SFX(run_binary_simd_less)(swapped, dimensions, swapped_steps);
+}
+
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_greater_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* a >= b is computed as b <= a: exchange the inputs, keep the output. */
+    char *swapped[3];
+    npy_intp swapped_steps[3];
+
+    swapped[0] = args[1];
+    swapped[1] = args[0];
+    swapped[2] = args[2];
+    swapped_steps[0] = steps[1];
+    swapped_steps[1] = steps[0];
+    swapped_steps[2] = steps[2];
+    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped, dimensions, swapped_steps);
+}
+
+
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_less)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'less' kernel whose suffix TO_SIMD_SFX picks. */
+    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
+}
+
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_less_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'less_equal' kernel whose suffix TO_SIMD_SFX picks. */
+    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
+}
+
+
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'equal' kernel suffixed by TO_SIMD_UTYPE. */
+    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
+}
+
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_not_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'not_equal' kernel suffixed by TO_SIMD_UTYPE. */
+    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
+}
+
+
+#line 372
+#undef TO_SIMD_SFX
+#undef TO_SIMD_UTYPE
+#if 0
+#line 378
+#elif NPY_BITSOF_BYTE == 8
+    #define TO_SIMD_UTYPE(X) X##_u8
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 378
+#elif NPY_BITSOF_BYTE == 16
+    #define TO_SIMD_UTYPE(X) X##_u16
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 378
+#elif NPY_BITSOF_BYTE == 32
+    #define TO_SIMD_UTYPE(X) X##_u32
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 378
+#elif NPY_BITSOF_BYTE == 64
+    #define TO_SIMD_UTYPE(X) X##_u64
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif
+
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_greater)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* a > b is computed as b < a: exchange the inputs, keep the output. */
+    char *swapped[3];
+    npy_intp swapped_steps[3];
+
+    swapped[0] = args[1];
+    swapped[1] = args[0];
+    swapped[2] = args[2];
+    swapped_steps[0] = steps[1];
+    swapped_steps[1] = steps[0];
+    swapped_steps[2] = steps[2];
+    TO_SIMD_SFX(run_binary_simd_less)(swapped, dimensions, swapped_steps);
+}
+
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_greater_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* a >= b is computed as b <= a: exchange the inputs, keep the output. */
+    char *swapped[3];
+    npy_intp swapped_steps[3];
+
+    swapped[0] = args[1];
+    swapped[1] = args[0];
+    swapped[2] = args[2];
+    swapped_steps[0] = steps[1];
+    swapped_steps[1] = steps[0];
+    swapped_steps[2] = steps[2];
+    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped, dimensions, swapped_steps);
+}
+
+
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_less)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'less' kernel whose suffix TO_SIMD_SFX picks. */
+    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
+}
+
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_less_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'less_equal' kernel whose suffix TO_SIMD_SFX picks. */
+    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
+}
+
+
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the unsigned-suffixed 'equal' kernel (TO_SIMD_UTYPE). */
+    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
+}
+
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_not_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the unsigned-suffixed 'not_equal' kernel (TO_SIMD_UTYPE). */
+    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
+}
+
+
+#line 372
+#undef TO_SIMD_SFX
+#undef TO_SIMD_UTYPE
+#if 0
+#line 378
+#elif NPY_BITSOF_SHORT == 8
+    #define TO_SIMD_UTYPE(X) X##_u8
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 378
+#elif NPY_BITSOF_SHORT == 16
+    #define TO_SIMD_UTYPE(X) X##_u16
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 378
+#elif NPY_BITSOF_SHORT == 32
+    #define TO_SIMD_UTYPE(X) X##_u32
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 378
+#elif NPY_BITSOF_SHORT == 64
+    #define TO_SIMD_UTYPE(X) X##_u64
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif
+
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_greater)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* a > b is computed as b < a: exchange the inputs, keep the output. */
+    char *swapped[3];
+    npy_intp swapped_steps[3];
+
+    swapped[0] = args[1];
+    swapped[1] = args[0];
+    swapped[2] = args[2];
+    swapped_steps[0] = steps[1];
+    swapped_steps[1] = steps[0];
+    swapped_steps[2] = steps[2];
+    TO_SIMD_SFX(run_binary_simd_less)(swapped, dimensions, swapped_steps);
+}
+
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_greater_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* a >= b is computed as b <= a: exchange the inputs, keep the output. */
+    char *swapped[3];
+    npy_intp swapped_steps[3];
+
+    swapped[0] = args[1];
+    swapped[1] = args[0];
+    swapped[2] = args[2];
+    swapped_steps[0] = steps[1];
+    swapped_steps[1] = steps[0];
+    swapped_steps[2] = steps[2];
+    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped, dimensions, swapped_steps);
+}
+
+
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_less)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'less' kernel whose suffix TO_SIMD_SFX picks. */
+    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
+}
+
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_less_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'less_equal' kernel whose suffix TO_SIMD_SFX picks. */
+    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
+}
+
+
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the unsigned-suffixed 'equal' kernel (TO_SIMD_UTYPE). */
+    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
+}
+
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_not_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the unsigned-suffixed 'not_equal' kernel (TO_SIMD_UTYPE). */
+    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
+}
+
+
+#line 372
+#undef TO_SIMD_SFX
+#undef TO_SIMD_UTYPE
+#if 0
+#line 378
+#elif NPY_BITSOF_INT == 8
+    #define TO_SIMD_UTYPE(X) X##_u8
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 378
+#elif NPY_BITSOF_INT == 16
+    #define TO_SIMD_UTYPE(X) X##_u16
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 378
+#elif NPY_BITSOF_INT == 32
+    #define TO_SIMD_UTYPE(X) X##_u32
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 378
+#elif NPY_BITSOF_INT == 64
+    #define TO_SIMD_UTYPE(X) X##_u64
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif
+
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_greater)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* a > b is computed as b < a: exchange the inputs, keep the output. */
+    char *swapped[3];
+    npy_intp swapped_steps[3];
+
+    swapped[0] = args[1];
+    swapped[1] = args[0];
+    swapped[2] = args[2];
+    swapped_steps[0] = steps[1];
+    swapped_steps[1] = steps[0];
+    swapped_steps[2] = steps[2];
+    TO_SIMD_SFX(run_binary_simd_less)(swapped, dimensions, swapped_steps);
+}
+
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_greater_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* a >= b is computed as b <= a: exchange the inputs, keep the output. */
+    char *swapped[3];
+    npy_intp swapped_steps[3];
+
+    swapped[0] = args[1];
+    swapped[1] = args[0];
+    swapped[2] = args[2];
+    swapped_steps[0] = steps[1];
+    swapped_steps[1] = steps[0];
+    swapped_steps[2] = steps[2];
+    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped, dimensions, swapped_steps);
+}
+
+
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_less)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'less' kernel whose suffix TO_SIMD_SFX picks. */
+    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
+}
+
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_less_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* Forwards unchanged to the 'less_equal' kernel whose suffix TO_SIMD_SFX picks. */
+    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
+}
+
+/*
+ * INT_equal: forwards args, dimensions and steps unchanged to
+ * TO_SIMD_UTYPE(run_binary_simd_equal), i.e. the kernel picked by the
+ * UTYPE macro rather than the SFX one; `func` is unused.
+ */
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
+}
+/*
+ * INT_not_equal: forwards args, dimensions and steps unchanged to
+ * TO_SIMD_UTYPE(run_binary_simd_not_equal), i.e. the kernel picked by the
+ * UTYPE macro rather than the SFX one; `func` is unused.
+ */
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_not_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
+}
+
+
+#line 372
+#undef TO_SIMD_SFX
+#undef TO_SIMD_UTYPE
+#if 0
+#line 378
+#elif NPY_BITSOF_LONG == 8
+    #define TO_SIMD_UTYPE(X) X##_u8
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 378
+#elif NPY_BITSOF_LONG == 16
+    #define TO_SIMD_UTYPE(X) X##_u16
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 378
+#elif NPY_BITSOF_LONG == 32
+    #define TO_SIMD_UTYPE(X) X##_u32
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 378
+#elif NPY_BITSOF_LONG == 64
+    #define TO_SIMD_UTYPE(X) X##_u64
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif
+/*
+ * LONG_greater: a > b is evaluated as b < a. The first two entries of
+ * `args` and `steps` are exchanged (the third stays in place) and the
+ * swapped arrays are passed to TO_SIMD_SFX(run_binary_simd_less).
+ */
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_greater)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *swapped_args[3] = {args[1], args[0], args[2]};
+    npy_intp swapped_steps[3] = {steps[1], steps[0], steps[2]};
+    TO_SIMD_SFX(run_binary_simd_less)(swapped_args, dimensions, swapped_steps);
+}
+/*
+ * LONG_greater_equal: a >= b is evaluated as b <= a. The first two entries
+ * of `args` and `steps` are exchanged (the third stays in place) and the
+ * swapped arrays are passed to TO_SIMD_SFX(run_binary_simd_less_equal).
+ */
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_greater_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *swapped_args[3] = {args[1], args[0], args[2]};
+    npy_intp swapped_steps[3] = {steps[1], steps[0], steps[2]};
+    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped_args, dimensions, swapped_steps);
+}
+
+/*
+ * LONG_less: forwards args, dimensions and steps unchanged to
+ * TO_SIMD_SFX(run_binary_simd_less); `func` is unused.
+ */
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_less)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
+}
+/*
+ * LONG_less_equal: forwards args, dimensions and steps unchanged to
+ * TO_SIMD_SFX(run_binary_simd_less_equal); `func` is unused.
+ */
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_less_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
+}
+
+/*
+ * LONG_equal: forwards args, dimensions and steps unchanged to
+ * TO_SIMD_UTYPE(run_binary_simd_equal), i.e. the kernel picked by the
+ * UTYPE macro rather than the SFX one; `func` is unused.
+ */
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
+}
+/*
+ * LONG_not_equal: forwards args, dimensions and steps unchanged to
+ * TO_SIMD_UTYPE(run_binary_simd_not_equal), i.e. the kernel picked by the
+ * UTYPE macro rather than the SFX one; `func` is unused.
+ */
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_not_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
+}
+
+
+#line 372
+#undef TO_SIMD_SFX
+#undef TO_SIMD_UTYPE
+#if 0
+#line 378
+#elif NPY_BITSOF_LONGLONG == 8
+    #define TO_SIMD_UTYPE(X) X##_u8
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 378
+#elif NPY_BITSOF_LONGLONG == 16
+    #define TO_SIMD_UTYPE(X) X##_u16
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 378
+#elif NPY_BITSOF_LONGLONG == 32
+    #define TO_SIMD_UTYPE(X) X##_u32
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 378
+#elif NPY_BITSOF_LONGLONG == 64
+    #define TO_SIMD_UTYPE(X) X##_u64
+    #if 1
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif
+/*
+ * LONGLONG_greater: a > b is evaluated as b < a. The first two entries of
+ * `args` and `steps` are exchanged (the third stays in place) and the
+ * swapped arrays are passed to TO_SIMD_SFX(run_binary_simd_less).
+ */
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_greater)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *swapped_args[3] = {args[1], args[0], args[2]};
+    npy_intp swapped_steps[3] = {steps[1], steps[0], steps[2]};
+    TO_SIMD_SFX(run_binary_simd_less)(swapped_args, dimensions, swapped_steps);
+}
+/*
+ * LONGLONG_greater_equal: a >= b is evaluated as b <= a. The first two
+ * entries of `args` and `steps` are exchanged (the third stays in place)
+ * and the swapped arrays are passed to
+ * TO_SIMD_SFX(run_binary_simd_less_equal).
+ */
+#line 392
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_greater_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *swapped_args[3] = {args[1], args[0], args[2]};
+    npy_intp swapped_steps[3] = {steps[1], steps[0], steps[2]};
+    TO_SIMD_SFX(run_binary_simd_less_equal)(swapped_args, dimensions, swapped_steps);
+}
+
+/*
+ * LONGLONG_less: forwards args, dimensions and steps unchanged to
+ * TO_SIMD_SFX(run_binary_simd_less); `func` is unused.
+ */
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_less)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    TO_SIMD_SFX(run_binary_simd_less)(args, dimensions, steps);
+}
+/*
+ * LONGLONG_less_equal: forwards args, dimensions and steps unchanged to
+ * TO_SIMD_SFX(run_binary_simd_less_equal); `func` is unused.
+ */
+#line 404
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_less_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    TO_SIMD_SFX(run_binary_simd_less_equal)(args, dimensions, steps);
+}
+
+/*
+ * LONGLONG_equal: forwards args, dimensions and steps unchanged to
+ * TO_SIMD_UTYPE(run_binary_simd_equal), i.e. the kernel picked by the
+ * UTYPE macro rather than the SFX one; `func` is unused.
+ */
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    TO_SIMD_UTYPE(run_binary_simd_equal)(args, dimensions, steps);
+}
+/*
+ * LONGLONG_not_equal: forwards args, dimensions and steps unchanged to
+ * TO_SIMD_UTYPE(run_binary_simd_not_equal), i.e. the kernel picked by the
+ * UTYPE macro rather than the SFX one; `func` is unused.
+ */
+#line 414
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_not_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    TO_SIMD_UTYPE(run_binary_simd_not_equal)(args, dimensions, steps);
+}
+
+
+/*
+ * BOOL_greater: a > b is evaluated as b < a by exchanging the first two
+ * entries of `args` and `steps` (the third stays in place) before calling
+ * run_binary_simd_less_b8. The float-status clear is compiled out (#if 0)
+ * for this type.
+ */
+#line 428
+#line 432
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_greater)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *swapped_args[3] = {args[1], args[0], args[2]};
+    npy_intp swapped_steps[3] = {steps[1], steps[0], steps[2]};
+    run_binary_simd_less_b8(swapped_args, dimensions, swapped_steps);
+#if 0
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/*
+ * BOOL_greater_equal: a >= b is evaluated as b <= a by exchanging the first
+ * two entries of `args` and `steps` (the third stays in place) before
+ * calling run_binary_simd_less_equal_b8. The float-status clear is compiled
+ * out (#if 0) for this type.
+ */
+#line 432
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_greater_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *swapped_args[3] = {args[1], args[0], args[2]};
+    npy_intp swapped_steps[3] = {steps[1], steps[0], steps[2]};
+    run_binary_simd_less_equal_b8(swapped_args, dimensions, swapped_steps);
+#if 0
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+/*
+ * BOOL_equal: forwards args, dimensions and steps unchanged to
+ * run_binary_simd_equal_b8. The float-status clear is compiled out (#if 0)
+ * for this type; `func` is unused.
+ */
+#line 447
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    run_binary_simd_equal_b8(args, dimensions, steps);
+#if 0
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/*
+ * BOOL_not_equal: forwards args, dimensions and steps unchanged to
+ * run_binary_simd_not_equal_b8. The float-status clear is compiled out
+ * (#if 0) for this type; `func` is unused.
+ */
+#line 447
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_not_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    run_binary_simd_not_equal_b8(args, dimensions, steps);
+#if 0
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/*
+ * BOOL_less: forwards args, dimensions and steps unchanged to
+ * run_binary_simd_less_b8. The float-status clear is compiled out (#if 0)
+ * for this type; `func` is unused.
+ */
+#line 447
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_less)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    run_binary_simd_less_b8(args, dimensions, steps);
+#if 0
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/*
+ * BOOL_less_equal: forwards args, dimensions and steps unchanged to
+ * run_binary_simd_less_equal_b8. The float-status clear is compiled out
+ * (#if 0) for this type; `func` is unused.
+ */
+#line 447
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_less_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    run_binary_simd_less_equal_b8(args, dimensions, steps);
+#if 0
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+/*
+ * FLOAT_greater: a > b is evaluated as b < a by exchanging the first two
+ * entries of `args` and `steps` (the third stays in place) before calling
+ * run_binary_simd_less_f32. Afterwards npy_clear_floatstatus_barrier is
+ * called (enabled by #if 1 for this type).
+ */
+#line 428
+#line 432
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_greater)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *swapped_args[3] = {args[1], args[0], args[2]};
+    npy_intp swapped_steps[3] = {steps[1], steps[0], steps[2]};
+    run_binary_simd_less_f32(swapped_args, dimensions, swapped_steps);
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/*
+ * FLOAT_greater_equal: a >= b is evaluated as b <= a by exchanging the
+ * first two entries of `args` and `steps` (the third stays in place)
+ * before calling run_binary_simd_less_equal_f32. Afterwards
+ * npy_clear_floatstatus_barrier is called (enabled by #if 1 for this type).
+ */
+#line 432
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_greater_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *swapped_args[3] = {args[1], args[0], args[2]};
+    npy_intp swapped_steps[3] = {steps[1], steps[0], steps[2]};
+    run_binary_simd_less_equal_f32(swapped_args, dimensions, swapped_steps);
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+/*
+ * FLOAT_equal: forwards args, dimensions and steps unchanged to
+ * run_binary_simd_equal_f32, then calls npy_clear_floatstatus_barrier
+ * (enabled by #if 1 for this type); `func` is unused.
+ */
+#line 447
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    run_binary_simd_equal_f32(args, dimensions, steps);
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/*
+ * FLOAT_not_equal: forwards args, dimensions and steps unchanged to
+ * run_binary_simd_not_equal_f32, then calls npy_clear_floatstatus_barrier
+ * (enabled by #if 1 for this type); `func` is unused.
+ */
+#line 447
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_not_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    run_binary_simd_not_equal_f32(args, dimensions, steps);
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/*
+ * FLOAT_less: forwards args, dimensions and steps unchanged to
+ * run_binary_simd_less_f32, then calls npy_clear_floatstatus_barrier
+ * (enabled by #if 1 for this type); `func` is unused.
+ */
+#line 447
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_less)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    run_binary_simd_less_f32(args, dimensions, steps);
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/*
+ * FLOAT_less_equal: forwards args, dimensions and steps unchanged to
+ * run_binary_simd_less_equal_f32, then calls npy_clear_floatstatus_barrier
+ * (enabled by #if 1 for this type); `func` is unused.
+ */
+#line 447
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_less_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    run_binary_simd_less_equal_f32(args, dimensions, steps);
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+/*
+ * DOUBLE_greater: a > b is evaluated as b < a by exchanging the first two
+ * entries of `args` and `steps` (the third stays in place) before calling
+ * run_binary_simd_less_f64. Afterwards npy_clear_floatstatus_barrier is
+ * called (enabled by #if 1 for this type).
+ */
+#line 428
+#line 432
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_greater)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *swapped_args[3] = {args[1], args[0], args[2]};
+    npy_intp swapped_steps[3] = {steps[1], steps[0], steps[2]};
+    run_binary_simd_less_f64(swapped_args, dimensions, swapped_steps);
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/*
+ * DOUBLE_greater_equal: a >= b is evaluated as b <= a by exchanging the
+ * first two entries of `args` and `steps` (the third stays in place)
+ * before calling run_binary_simd_less_equal_f64. Afterwards
+ * npy_clear_floatstatus_barrier is called (enabled by #if 1 for this type).
+ */
+#line 432
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_greater_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *swapped_args[3] = {args[1], args[0], args[2]};
+    npy_intp swapped_steps[3] = {steps[1], steps[0], steps[2]};
+    run_binary_simd_less_equal_f64(swapped_args, dimensions, swapped_steps);
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+/*
+ * DOUBLE_equal: forwards args, dimensions and steps unchanged to
+ * run_binary_simd_equal_f64, then calls npy_clear_floatstatus_barrier
+ * (enabled by #if 1 for this type); `func` is unused.
+ */
+#line 447
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    run_binary_simd_equal_f64(args, dimensions, steps);
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/*
+ * DOUBLE_not_equal: forwards args, dimensions and steps unchanged to
+ * run_binary_simd_not_equal_f64, then calls npy_clear_floatstatus_barrier
+ * (enabled by #if 1 for this type); `func` is unused.
+ */
+#line 447
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_not_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    run_binary_simd_not_equal_f64(args, dimensions, steps);
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/*
+ * DOUBLE_less: forwards args, dimensions and steps unchanged to
+ * run_binary_simd_less_f64, then calls npy_clear_floatstatus_barrier
+ * (enabled by #if 1 for this type); `func` is unused.
+ */
+#line 447
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_less)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    run_binary_simd_less_f64(args, dimensions, steps);
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+/*
+ * DOUBLE_less_equal: forwards args, dimensions and steps unchanged to
+ * run_binary_simd_less_equal_f64, then calls npy_clear_floatstatus_barrier
+ * (enabled by #if 1 for this type); `func` is unused.
+ */
+#line 447
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_less_equal)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    run_binary_simd_less_equal_f64(args, dimensions, steps);
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+
diff --git a/numpy/core/src/_generated/loops_exponent_log.dispatch.c b/numpy/core/src/_generated/loops_exponent_log.dispatch.c
new file mode 100644
index 000000000000..8c4302dd3795
--- /dev/null
+++ b/numpy/core/src/_generated/loops_exponent_log.dispatch.c
@@ -0,0 +1,1956 @@
+#line 1 "numpy/core/src/umath/loops_exponent_log.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*@targets
+ ** $maxopt baseline
+ ** (avx2 fma3) avx512f avx512_skx
+ **/
+
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include <float.h>
+
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "npy_svml.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+#include "npy_simd_data.h"
+
+// TODO: tweak & replace raw SIMD with NPYV
+
+/********************************************************************************
+ ** bunch of helper functions used in ISA_exp/log_FLOAT
+ ********************************************************************************/
+#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512F)
+    /**
+     * AVX512F is not used with MSVC: for some reason its aggressive
+     * optimization leads to 'RuntimeWarning: overflow encountered in exp'
+     * being raised. MSVC builds fall through to the AVX2/FMA3 branch below.
+     *
+     * The issue is mainly caused by '_mm512_maskz_loadu_ps'; it should be
+     * investigated while moving to NPYV.
+     */
+    #define SIMD_AVX512F
+#elif defined(NPY_HAVE_AVX2) && defined(NPY_HAVE_FMA3)
+    #define SIMD_AVX2_FMA3
+#endif
+#if !defined(_MSC_VER) && defined(NPY_HAVE_AVX512_SKX)
+    #define SIMD_AVX512_SKX
+#endif
+#if defined(SIMD_AVX512F) && !(defined(__clang__) && (__clang_major__ < 10 || \
+                              (__clang_major__ == 10 && __clang_minor__ < 1)))
+    #define SIMD_AVX512F_NOCLANG_BUG
+#endif
+
+#ifdef SIMD_AVX2_FMA3
+/*
+ * Returns a vector whose eight float lanes are all -1.0 (sign bit set),
+ * used as the "load every lane" mask by the masked load/gather helpers.
+ */
+NPY_FINLINE __m256
+fma_get_full_load_mask_ps(void)
+{
+    return _mm256_set1_ps(-1.0);
+}
+/*
+ * Returns the bit pattern of four -1.0 doubles reinterpreted as __m256i
+ * (sign bit set in every 64-bit lane), the double-precision counterpart
+ * of the "load every lane" mask.
+ */
+NPY_FINLINE __m256i
+fma_get_full_load_mask_pd(void)
+{
+    return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
+}
+/*
+ * Builds a float load mask by reading eight consecutive values from a
+ * table of eight -1.0 entries followed by eight +1.0 entries. The read
+ * starts at index num_lanes - num_elem, so with num_lanes == 8 the first
+ * `num_elem` lanes are -1.0 (sign bit set) and the rest +1.0.
+ * The caller must keep 0 <= num_lanes - num_elem <= 8 so the read stays
+ * inside the table.
+ */
+NPY_FINLINE __m256
+fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
+{
+    float lane_signs[16] = {
+        -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0,
+         1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0,  1.0
+    };
+    const npy_int first = num_lanes - num_elem;
+    return _mm256_loadu_ps(&lane_signs[first]);
+}
+/*
+ * Double-precision counterpart of fma_get_partial_load_mask_ps: reads 256
+ * bits from a table of eight -1 entries followed by eight 1 entries,
+ * starting at index 2*num_lanes - 2*num_elem. The factor of two is there
+ * because each 64-bit lane spans two table entries (assumes npy_int is
+ * 32 bits wide -- TODO confirm).
+ */
+NPY_FINLINE __m256i
+fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
+{
+    npy_int lane_words[16] = {
+        -1, -1, -1, -1, -1, -1, -1, -1,
+         1,  1,  1,  1,  1,  1,  1,  1
+    };
+    const npy_int first = 2*num_lanes - 2*num_elem;
+    return _mm256_loadu_si256((__m256i*) &lane_words[first]);
+}
+/*
+ * Thin wrapper over _mm256_mask_i32gather_ps with a 4-byte scale: lanes
+ * selected by `mask` are gathered from `addr` at the 32-bit indices in
+ * `vindex`; the remaining lanes are taken from `src`.
+ */
+NPY_FINLINE __m256
+fma_masked_gather_ps(__m256 src,
+                     npy_float* addr,
+                     __m256i vindex,
+                     __m256 mask)
+{
+    return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
+}
+/*
+ * Thin wrapper over _mm256_mask_i32gather_pd with an 8-byte scale: lanes
+ * selected by `mask` are gathered from `addr` at the four 32-bit indices
+ * in `vindex`; the remaining lanes are taken from `src`.
+ */
+NPY_FINLINE __m256d
+fma_masked_gather_pd(__m256d src,
+                     npy_double* addr,
+                     __m128i vindex,
+                     __m256d mask)
+{
+    return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
+}
+/*
+ * Masked load of eight floats from `addr`. The float mask is turned into
+ * an integer mask with _mm256_cvtps_epi32, a value conversion (-1.0
+ * becomes -1, +1.0 becomes 1), and then handed to _mm256_maskload_ps.
+ */
+NPY_FINLINE __m256
+fma_masked_load_ps(__m256 mask, npy_float* addr)
+{
+    return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
+}
+/*
+ * Masked load of four doubles from `addr`; `mask` is passed unchanged to
+ * _mm256_maskload_pd.
+ */
+NPY_FINLINE __m256d
+fma_masked_load_pd(__m256i mask, npy_double* addr)
+{
+    return _mm256_maskload_pd(addr, mask);
+}
+/*
+ * Per float lane: returns `val` where `mask` selects the lane and `x`
+ * elsewhere (_mm256_blendv_ps, which keys on the mask's sign bit).
+ */
+NPY_FINLINE __m256
+fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
+{
+    return _mm256_blendv_ps(x, val, mask);
+}
+/*
+ * Per double lane: returns `val` where `mask` selects the lane and `x`
+ * elsewhere (_mm256_blendv_pd, which keys on the mask's sign bit).
+ */
+NPY_FINLINE __m256d
+fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
+{
+    return _mm256_blendv_pd(x, val, mask);
+}
+/*
+ * Per float lane: returns `y` where `ymask` selects the lane and `x`
+ * elsewhere. Same _mm256_blendv_ps call as fma_set_masked_lanes_ps.
+ */
+NPY_FINLINE __m256
+fma_blend(__m256 x, __m256 y, __m256 ymask)
+{
+    return _mm256_blendv_ps(x, y, ymask);
+}
+/*
+ * Returns, per lane and as a float, the exponent e of x for a mantissa
+ * normalized to [0.5, 1): the biased exponent field minus 126.
+ * The sign bit is not masked off before the shift, so lanes are expected
+ * to be non-negative (the visible caller zeroes negative lanes first).
+ */
+NPY_FINLINE __m256
+fma_get_exponent(__m256 x)
+{
+    /*
+     * Special handling of denormals:
+     * 1) Multiply denormal elements with 2**100 (0x71800000)
+     * 2) Shift out the mantissa bits and subtract 126 (0x7E) from the
+     *    biased exponent field
+     * 3) Subtract 100 from exponent of denormals
+     */
+
+    __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000));
+    __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ);
+    __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ);
+
+    /*
+     * Lanes holding normal values are zeroed before the multiply
+     * (presumably so that scaling by 2**100 cannot overflow for them).
+     *
+     * The volatile is probably unnecessary now since we compile clang with
+     * `-ftrapping-math`: https://github.com/numpy/numpy/issues/18005
+     */
+    volatile __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask);
+    __m256 temp = _mm256_mul_ps(temp1, two_power_100);
+    x = _mm256_blendv_ps(x, temp, denormal_mask);
+
+    __m256 exp = _mm256_cvtepi32_ps(
+                    _mm256_sub_epi32(
+                        _mm256_srli_epi32(
+                            _mm256_castps_si256(x), 23),_mm256_set1_epi32(0x7E)));
+
+    __m256 denorm_exp = _mm256_sub_ps(exp, _mm256_set1_ps(100.0f));
+    return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
+}
+/*
+ * Returns, per lane, the mantissa of x rescaled into [0.5, 1): the 23
+ * mantissa bits are kept, the exponent field is forced to 126 and the
+ * sign bit is cleared.
+ */
+NPY_FINLINE __m256
+fma_get_mantissa(__m256 x)
+{
+    /*
+     * Special handling of denormals:
+     * 1) Multiply denormal elements with 2**100 (0x71800000)
+     * 2) Get the 23 bits of mantissa
+     * 3) Mantissa for denormals is not affected by the multiplication
+     */
+
+    __m256 two_power_100 = _mm256_castsi256_ps(_mm256_set1_epi32(0x71800000));
+    __m256 denormal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_LT_OQ);
+    __m256 normal_mask = _mm256_cmp_ps(x, _mm256_set1_ps(FLT_MIN), _CMP_GE_OQ);
+
+    /*
+     * Lanes holding normal values are zeroed before the multiply
+     * (presumably so that scaling by 2**100 cannot overflow for them).
+     *
+     * The volatile is probably unnecessary now since we compile clang with
+     * `-ftrapping-math`: https://github.com/numpy/numpy/issues/18005
+     */
+    volatile __m256 temp1 = _mm256_blendv_ps(x, _mm256_set1_ps(0.0f), normal_mask);
+    __m256 temp = _mm256_mul_ps(temp1, two_power_100);
+    x = _mm256_blendv_ps(x, temp, denormal_mask);
+
+    __m256i mantissa_bits = _mm256_set1_epi32(0x7fffff);
+    __m256i exp_126_bits  = _mm256_set1_epi32(126 << 23);
+    return _mm256_castsi256_ps(
+                _mm256_or_si256(
+                    _mm256_and_si256(
+                        _mm256_castps_si256(x), mantissa_bits), exp_126_bits));
+}
+/*
+ * Computes poly * 2^quadrant per lane by adding quadrant (converted to an
+ * integer) to the exponent field of poly, with a separate path when any
+ * lane has quadrant <= -125.
+ */
+NPY_FINLINE __m256
+fma_scalef_ps(__m256 poly, __m256 quadrant)
+{
+    /*
+     * Handle denormals (which occur when quadrant <= -125):
+     * 1) This function computes poly*(2^quad) by adding quad to the
+     *    exponent field of poly
+     * 2) When quad <= -125, the output is a denormal and the above logic
+     *    breaks down
+     * 3) To handle such cases, we split quadrant: -125 + (quadrant + 125)
+     * 4) poly*(2^-125) is computed the usual way
+     * 5) 2^-(quad+125) is computed as an integer: 1 << abs(quad+125)
+     * 6) The final div operation by that power of two generates the
+     *    denormal
+     */
+     __m256 minquadrant = _mm256_set1_ps(-125.0f);
+     __m256 denormal_mask = _mm256_cmp_ps(quadrant, minquadrant, _CMP_LE_OQ);
+     if (_mm256_movemask_ps(denormal_mask) != 0x0000) {
+        __m256 quad_diff = _mm256_sub_ps(quadrant, minquadrant);
+        quad_diff = _mm256_sub_ps(_mm256_setzero_ps(), quad_diff);
+        quad_diff = _mm256_blendv_ps(_mm256_setzero_ps(), quad_diff, denormal_mask);
+        __m256i two_power_diff = _mm256_sllv_epi32(
+                                   _mm256_set1_epi32(1), _mm256_cvtps_epi32(quad_diff));
+        quadrant = _mm256_max_ps(quadrant, minquadrant); // keep quadrant >= -125
+        __m256i exponent = _mm256_slli_epi32(_mm256_cvtps_epi32(quadrant), 23);
+        poly = _mm256_castsi256_ps(
+                   _mm256_add_epi32(
+                       _mm256_castps_si256(poly), exponent));
+        __m256 denorm_poly = _mm256_div_ps(poly, _mm256_cvtepi32_ps(two_power_diff));
+        return _mm256_blendv_ps(poly, denorm_poly, denormal_mask);
+     }
+     else {
+        __m256i exponent = _mm256_slli_epi32(_mm256_cvtps_epi32(quadrant), 23);
+        poly = _mm256_castsi256_ps(
+                   _mm256_add_epi32(
+                       _mm256_castps_si256(poly), exponent));
+        return poly;
+     }
+}
+
+#endif // SIMD_AVX2_FMA3
+
+#ifdef SIMD_AVX512F
+/* Returns a 16-bit lane mask with every bit set (all float lanes). */
+NPY_FINLINE __mmask16
+avx512_get_full_load_mask_ps(void)
+{
+    return 0xFFFF;
+}
+/* Returns an 8-bit lane mask with every bit set (all double lanes). */
+NPY_FINLINE __mmask8
+avx512_get_full_load_mask_pd(void)
+{
+    return 0xFF;
+}
+/*
+ * Returns a lane mask with the lowest `num_elem` bits set, computed as
+ * (1 << num_elem) - 1 and converted to __mmask16. `total_elem` is unused.
+ */
+NPY_FINLINE __mmask16
+avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
+{
+    return (0x0001 << num_elem) - 0x0001;
+}
+/*
+ * Returns a lane mask with the lowest `num_elem` bits set, computed as
+ * (1 << num_elem) - 1 and converted to __mmask8. `total_elem` is unused.
+ */
+NPY_FINLINE __mmask8
+avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
+{
+    return (0x01 << num_elem) - 0x01;
+}
+/*
+ * Thin wrapper over _mm512_mask_i32gather_ps with a 4-byte scale: lanes
+ * whose bit is set in `kmask` are gathered from `addr` at the 32-bit
+ * indices in `vindex`; the remaining lanes are taken from `src`.
+ */
+NPY_FINLINE __m512
+avx512_masked_gather_ps(__m512 src,
+                        npy_float* addr,
+                        __m512i vindex,
+                        __mmask16 kmask)
+{
+    return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
+}
+/*
+ * Thin wrapper over _mm512_mask_i32gather_pd with an 8-byte scale: lanes
+ * whose bit is set in `kmask` are gathered from `addr` at the eight 32-bit
+ * indices in `vindex`; the remaining lanes are taken from `src`.
+ */
+NPY_FINLINE __m512d
+avx512_masked_gather_pd(__m512d src,
+                        npy_double* addr,
+                        __m256i vindex,
+                        __mmask8 kmask)
+{
+    return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
+}
+/*
+ * Masked unaligned load of up to 16 floats: lanes whose bit is set in
+ * `mask` are read from `addr`, all other lanes are zeroed
+ * (_mm512_maskz_loadu_ps).
+ *
+ * The address is passed straight through. The intrinsic takes a
+ * `void const *`, and casting an arbitrarily aligned float pointer to
+ * `__m512 *` would form a possibly misaligned pointer to a 64-byte-aligned
+ * type.
+ */
+NPY_FINLINE __m512
+avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
+{
+    return _mm512_maskz_loadu_ps(mask, addr);
+}
+/*
+ * Masked unaligned load of up to 8 doubles: lanes whose bit is set in
+ * `mask` are read from `addr`, all other lanes are zeroed
+ * (_mm512_maskz_loadu_pd).
+ *
+ * The address is passed straight through. The intrinsic takes a
+ * `void const *`, and casting an arbitrarily aligned double pointer to
+ * `__m512d *` would form a possibly misaligned pointer to a
+ * 64-byte-aligned type.
+ */
+NPY_FINLINE __m512d
+avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
+{
+    return _mm512_maskz_loadu_pd(mask, addr);
+}
+/*
+ * Per float lane: returns `val` where the corresponding bit of `mask` is
+ * set and `x` elsewhere (_mm512_mask_blend_ps).
+ */
+NPY_FINLINE __m512
+avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
+{
+    return _mm512_mask_blend_ps(mask, x, val);
+}
+/*
+ * Per double lane: returns `val` where the corresponding bit of `mask` is
+ * set and `x` elsewhere (_mm512_mask_blend_pd).
+ */
+NPY_FINLINE __m512d
+avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
+{
+    return _mm512_mask_blend_pd(mask, x, val);
+}
+/*
+ * Per float lane: returns `y` where the corresponding bit of `ymask` is
+ * set and `x` elsewhere (_mm512_mask_mov_ps with `x` as the pass-through).
+ */
+NPY_FINLINE __m512
+avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
+{
+    return _mm512_mask_mov_ps(x, ymask, y);
+}
+/*
+ * Returns _mm512_getexp_ps(x) + 1.0 per lane, i.e. the exponent that goes
+ * with a mantissa normalized to [0.5, 1) as produced by
+ * avx512_get_mantissa.
+ */
+NPY_FINLINE __m512
+avx512_get_exponent(__m512 x)
+{
+    return _mm512_add_ps(_mm512_getexp_ps(x), _mm512_set1_ps(1.0f));
+}
+/*
+ * Returns the mantissa of x per lane via _mm512_getmant_ps, normalized to
+ * the [0.5, 1) interval (_MM_MANT_NORM_p5_1) and keeping the sign of the
+ * source (_MM_MANT_SIGN_src).
+ */
+NPY_FINLINE __m512
+avx512_get_mantissa(__m512 x)
+{
+    return _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
+}
+/*
+ * Thin wrapper over _mm512_scalef_ps: scales each lane of `poly` by two to
+ * the power of the matching lane of `quadrant`.
+ */
+NPY_FINLINE __m512
+avx512_scalef_ps(__m512 poly, __m512 quadrant)
+{
+    return _mm512_scalef_ps(poly, quadrant);
+}
+/*
+ * Per-lane lookup into the 32 doubles held in t0..t3, driven by `index`.
+ * Two two-source permutes are done with the same index, one over (t0, t1)
+ * and one over (t2, t3); bit 4 (0x10) of each index lane then picks the
+ * (t2, t3) result when set and the (t0, t1) result otherwise.
+ */
+NPY_FINLINE __m512d
+avx512_permute_x4var_pd(__m512d t0,
+                        __m512d t1,
+                        __m512d t2,
+                        __m512d t3,
+                        __m512i index)
+{
+    const __m512i bit4 = _mm512_and_epi64(_mm512_set1_epi64(0x10ULL), index);
+    const __mmask8 use_upper = _mm512_cmp_epi64_mask(
+                                   bit4, _mm512_set1_epi64(0), _MM_CMPINT_GT);
+    const __m512d from_lower = _mm512_permutex2var_pd(t0, index, t1);
+    const __m512d from_upper = _mm512_permutex2var_pd(t2, index, t3);
+    return _mm512_mask_blend_pd(use_upper, from_lower, from_upper);
+}
+/*
+ * Per-lane lookup into the 64 doubles held in t0..t7, driven by `index`.
+ * avx512_permute_x4var_pd is applied to t0..t3 and to t4..t7 with the same
+ * index; bit 5 (0x20) of each index lane then picks the t4..t7 result when
+ * set and the t0..t3 result otherwise.
+ */
+NPY_FINLINE __m512d
+avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
+                        __m512d t4, __m512d t5, __m512d t6, __m512d t7,
+                        __m512i index)
+{
+    const __m512i bit5 = _mm512_and_epi64(_mm512_set1_epi64(0x20ULL), index);
+    const __mmask8 use_upper = _mm512_cmp_epi64_mask(
+                                   bit5, _mm512_set1_epi64(0), _MM_CMPINT_GT);
+    const __m512d from_lower = avx512_permute_x4var_pd(t0, t1, t2, t3, index);
+    const __m512d from_upper = avx512_permute_x4var_pd(t4, t5, t6, t7, index);
+    return _mm512_mask_blend_pd(use_upper, from_lower, from_upper);
+}
+
+#endif // SIMD_AVX512F
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+#line 372
+#ifdef SIMD_AVX2_FMA3
+/*
+ * Vectorized Cody-Waite range reduction.
+ * Evaluates x* = x + y*c1 + y*c2 + y*c3 as three chained fused
+ * multiply-adds. The reduction x* = x - y*C is therefore obtained when the
+ * caller passes the negated pieces of C (presumably how the
+ * NPY_CODY_WAITE_* constants are defined -- TODO confirm). Splitting C
+ * into several pieces simulates a constant of higher precision.
+ */
+NPY_FINLINE __m256
+simd_range_reduction(__m256 x, __m256 y, __m256 c1, __m256 c2, __m256 c3)
+{
+    const __m256 after_c1 = _mm256_fmadd_ps(y, c1, x);
+    const __m256 after_c2 = _mm256_fmadd_ps(y, c2, after_c1);
+    return _mm256_fmadd_ps(y, c3, after_c2);
+}
+/*
+ * Vectorized implementation of exp using AVX2 and AVX512:
+ * 1) if x >= xmax; return INF (overflow)
+ * 2) if x <= xmin; return 0.0f (underflow)
+ * 3) Range reduction (using Cody-Waite):
+ *      a) y = x - k*ln(2); k = rint(x/ln(2)); y \in [-ln(2)/2, ln(2)/2]
+ * 4) Compute exp(y) = P/Q, ratio of 2 polynomials P and Q
+ *      a) P = 5th order and Q = 2nd order polynomials obtained from Remez's
+ *      algorithm (mini-max polynomial approximation)
+ * 5) Compute exp(x) = exp(y) * 2^k
+ * 6) Max ULP error measured across all 32-bit FP's = 2.52 (x = 0xc2781e37)
+ * 7) Max relative error measured across all 32-bit FP's= 2.1264E-07 (for the
+ * same x = 0xc2781e37)
+ *
+ * Parameters:
+ *   op          output; written contiguously, num_lanes floats per iteration
+ *   ip          input; read with a stride of `steps` bytes
+ *   array_size  number of elements to process
+ *   steps       input stride in bytes (divided by sizeof(npy_float) to get
+ *               the element stride)
+ *
+ * NaN lanes produce NaN. The overflow / underflow floating-point status is
+ * set once after the loop if any lane had x >= xmax (other than +inf) or
+ * x <= xmin (other than -inf).
+ */
+static void
+simd_exp_FLOAT(npy_float * op,
+                npy_float * ip,
+                const npy_intp array_size,
+                const npy_intp steps)
+{
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
+    const npy_int num_lanes = 32/(npy_intp)sizeof(npy_float);
+    npy_float xmax = 88.72283935546875f;
+    npy_float xmin = -103.97208404541015625f;
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_OUTPUT_BLOCKABLE_UNARY
+     *
+     * Only the first eight entries are loaded into vindex below.
+     */
+    npy_int32 indexarr[16];
+    for (npy_int32 ii = 0; ii < 16; ii++) {
+        indexarr[ii] = ii*stride;
+    }
+
+    /* Load up frequently used constants */
+    __m256 codyw_c1 = _mm256_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf);
+    __m256 codyw_c2 = _mm256_set1_ps(NPY_CODY_WAITE_LOGE_2_LOWf);
+    __m256 exp_p0 = _mm256_set1_ps(NPY_COEFF_P0_EXPf);
+    __m256 exp_p1 = _mm256_set1_ps(NPY_COEFF_P1_EXPf);
+    __m256 exp_p2 = _mm256_set1_ps(NPY_COEFF_P2_EXPf);
+    __m256 exp_p3 = _mm256_set1_ps(NPY_COEFF_P3_EXPf);
+    __m256 exp_p4 = _mm256_set1_ps(NPY_COEFF_P4_EXPf);
+    __m256 exp_p5 = _mm256_set1_ps(NPY_COEFF_P5_EXPf);
+    __m256 exp_q0 = _mm256_set1_ps(NPY_COEFF_Q0_EXPf);
+    __m256 exp_q1 = _mm256_set1_ps(NPY_COEFF_Q1_EXPf);
+    __m256 exp_q2 = _mm256_set1_ps(NPY_COEFF_Q2_EXPf);
+    __m256 cvt_magic = _mm256_set1_ps(NPY_RINT_CVT_MAGICf);
+    __m256 log2e = _mm256_set1_ps(NPY_LOG2Ef);
+    __m256 inf = _mm256_set1_ps(NPY_INFINITYF);
+    __m256 ninf = _mm256_set1_ps(-1*NPY_INFINITYF);
+    __m256 zeros_f = _mm256_set1_ps(0.0f);
+    __m256 poly, num_poly, denom_poly, quadrant;
+    __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]);
+
+    __m256 xmax_mask, xmin_mask, nan_mask, inf_mask, ninf_mask;
+    /* Both accumulators start with no lane selected (num_elem == 0). */
+    __m256 overflow_mask = fma_get_partial_load_mask_ps(0, num_lanes);
+    __m256 underflow_mask = fma_get_partial_load_mask_ps(0, num_lanes);
+    __m256 load_mask = fma_get_full_load_mask_ps();
+    npy_intp num_remaining_elements = array_size;
+
+    while (num_remaining_elements > 0) {
+
+        /* Final, partial iteration: only load/store the remaining lanes. */
+        if (num_remaining_elements < num_lanes) {
+            load_mask = fma_get_partial_load_mask_ps(num_remaining_elements,
+                                                       num_lanes);
+        }
+
+        __m256 x;
+        if (stride == 1) {
+            x = fma_masked_load_ps(load_mask, ip);
+        }
+        else {
+            x = fma_masked_gather_ps(zeros_f, ip, vindex, load_mask);
+        }
+
+        /* NaN lanes are replaced by 0 for the computation and restored at the end. */
+        nan_mask = _mm256_cmp_ps(x, x, _CMP_NEQ_UQ);
+        x = fma_set_masked_lanes_ps(x, zeros_f, nan_mask);
+
+        /* The XOR removes exact +/-inf inputs from the overflow/underflow sets. */
+        xmax_mask = _mm256_cmp_ps(x, _mm256_set1_ps(xmax), _CMP_GE_OQ);
+        xmin_mask = _mm256_cmp_ps(x, _mm256_set1_ps(xmin), _CMP_LE_OQ);
+        inf_mask = _mm256_cmp_ps(x, inf, _CMP_EQ_OQ);
+        ninf_mask = _mm256_cmp_ps(x, ninf, _CMP_EQ_OQ);
+        overflow_mask = _mm256_or_ps(overflow_mask,
+                                    _mm256_xor_ps(xmax_mask, inf_mask));
+        underflow_mask = _mm256_or_ps(underflow_mask,
+                                    _mm256_xor_ps(xmin_mask, ninf_mask));
+
+        /* Out-of-range and NaN lanes are computed on 0; results are patched below. */
+        x = fma_set_masked_lanes_ps(x, zeros_f, _mm256_or_ps(
+                                    _mm256_or_ps(nan_mask, xmin_mask), xmax_mask));
+
+        quadrant = _mm256_mul_ps(x, log2e);
+
+        /* round to nearest (add, then subtract, the magic constant) */
+        quadrant = _mm256_add_ps(quadrant, cvt_magic);
+        quadrant = _mm256_sub_ps(quadrant, cvt_magic);
+
+        /* Cody-Waite's range reduction algorithm (third constant is zero here) */
+        x = simd_range_reduction(x, quadrant, codyw_c1, codyw_c2, zeros_f);
+
+        /* Horner evaluation of the numerator P and denominator Q */
+        num_poly = _mm256_fmadd_ps(exp_p5, x, exp_p4);
+        num_poly = _mm256_fmadd_ps(num_poly, x, exp_p3);
+        num_poly = _mm256_fmadd_ps(num_poly, x, exp_p2);
+        num_poly = _mm256_fmadd_ps(num_poly, x, exp_p1);
+        num_poly = _mm256_fmadd_ps(num_poly, x, exp_p0);
+        denom_poly = _mm256_fmadd_ps(exp_q2, x, exp_q1);
+        denom_poly = _mm256_fmadd_ps(denom_poly, x, exp_q0);
+        poly = _mm256_div_ps(num_poly, denom_poly);
+
+        /*
+         * compute val = poly * 2^quadrant; which is same as adding the
+         * exponent of quadrant to the exponent of poly. quadrant is an int,
+         * so extracting exponent is simply extracting 8 bits.
+         */
+        poly = fma_scalef_ps(poly, quadrant);
+
+        /*
+         * elem >= xmax; return inf
+         * elem <= xmin; return 0.0f
+         * elem = +/- nan, return nan
+         */
+        poly = fma_set_masked_lanes_ps(poly, _mm256_set1_ps(NPY_NANF), nan_mask);
+        poly = fma_set_masked_lanes_ps(poly, inf, xmax_mask);
+        poly = fma_set_masked_lanes_ps(poly, zeros_f, xmin_mask);
+
+        _mm256_maskstore_ps(op, _mm256_cvtps_epi32(load_mask), poly);
+
+        ip += num_lanes*stride;
+        op += num_lanes;
+        num_remaining_elements -= num_lanes;
+    }
+
+    if (_mm256_movemask_ps(overflow_mask)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    if (_mm256_movemask_ps(underflow_mask)) {
+        npy_set_floatstatus_underflow();
+    }
+}
+
+/*
+ * Vectorized implementation of log using AVX2 and AVX512
+ * (this copy is the 256-bit variant: it works on __m256 vectors and handles
+ * 32/sizeof(npy_float) lanes per loop iteration).
+ * 1) if x < 0.0f; return -NAN (invalid input)
+ * 2) Range reduction: y = x/2^k;
+ *      a) y = normalized mantissa, k is the exponent (0.5 <= y < 1)
+ * 3) Compute log(y) = P/Q, ratio of 2 polynomials P and Q
+ *      a) P = 5th order and Q = 5th order polynomials obtained from Remez's
+ *      algorithm (mini-max polynomial approximation)
+ * 4) Compute log(x) = log(y) + k*ln(2)
+ * 5) Max ULP error measured across all 32-bit FP's = 3.83 (x = 0x3f486945)
+ * 6) Max relative error measured across all 32-bit FP's = 2.359E-07 (for same
+ * x = 0x3f486945)
+ *
+ * Parameters:
+ *   op         - output pointer; advanced by num_lanes after every store, so
+ *                the output is written contiguously
+ *   ip         - input pointer; read with a masked load when the element
+ *                stride is 1, otherwise gathered with per-lane offsets
+ *                ii*stride
+ *   array_size - number of elements to process
+ *   steps      - input step; divided by sizeof(npy_float) to obtain the
+ *                element stride, so presumably given in bytes
+ *
+ * Side effects (after the loop):
+ *   - npy_set_floatstatus_invalid() is called when the accumulated
+ *     invalid_mask has any sign bit set (some input compared < 0)
+ *   - npy_set_floatstatus_divbyzero() is called when the accumulated
+ *     divide_by_zero_mask has any sign bit set (some loaded input compared
+ *     == 0)
+ *   Both accumulators start from fma_get_partial_load_mask_ps(0, num_lanes),
+ *   presumably a mask with no lane selected -- TODO confirm in the helper.
+ */
+
+static void
+simd_log_FLOAT(npy_float * op,
+                npy_float * ip,
+                const npy_intp array_size,
+                const npy_intp steps)
+{
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
+    const npy_int num_lanes = 32/(npy_intp)sizeof(npy_float);
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_OUTPUT_BLOCKABLE_UNARY
+     *
+     * indexarr holds the per-lane gather offsets (in elements). Sixteen
+     * entries are filled, but only the first 256 bits are loaded into
+     * vindex below.
+     */
+    npy_int32 indexarr[16];
+    for (npy_int32 ii = 0; ii < 16; ii++) {
+        indexarr[ii] = ii*stride;
+    }
+
+    /* Load up frequently used constants */
+    __m256 log_p0 = _mm256_set1_ps(NPY_COEFF_P0_LOGf);
+    __m256 log_p1 = _mm256_set1_ps(NPY_COEFF_P1_LOGf);
+    __m256 log_p2 = _mm256_set1_ps(NPY_COEFF_P2_LOGf);
+    __m256 log_p3 = _mm256_set1_ps(NPY_COEFF_P3_LOGf);
+    __m256 log_p4 = _mm256_set1_ps(NPY_COEFF_P4_LOGf);
+    __m256 log_p5 = _mm256_set1_ps(NPY_COEFF_P5_LOGf);
+    __m256 log_q0 = _mm256_set1_ps(NPY_COEFF_Q0_LOGf);
+    __m256 log_q1 = _mm256_set1_ps(NPY_COEFF_Q1_LOGf);
+    __m256 log_q2 = _mm256_set1_ps(NPY_COEFF_Q2_LOGf);
+    __m256 log_q3 = _mm256_set1_ps(NPY_COEFF_Q3_LOGf);
+    __m256 log_q4 = _mm256_set1_ps(NPY_COEFF_Q4_LOGf);
+    __m256 log_q5 = _mm256_set1_ps(NPY_COEFF_Q5_LOGf);
+    __m256 loge2 = _mm256_set1_ps(NPY_LOGE2f);
+    __m256 nan = _mm256_set1_ps(NPY_NANF);
+    __m256 neg_nan = _mm256_set1_ps(-NPY_NANF);
+    __m256 neg_inf = _mm256_set1_ps(-NPY_INFINITYF);
+    __m256 inf = _mm256_set1_ps(NPY_INFINITYF);
+    __m256 zeros_f = _mm256_set1_ps(0.0f);
+    __m256 ones_f = _mm256_set1_ps(1.0f);
+    __m256i vindex = _mm256_loadu_si256((__m256i*)indexarr);
+    __m256 poly, num_poly, denom_poly, exponent;
+
+    __m256 inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask;
+    __m256 invalid_mask = fma_get_partial_load_mask_ps(0, num_lanes);
+    __m256 divide_by_zero_mask = invalid_mask;
+    __m256 load_mask = fma_get_full_load_mask_ps();
+    npy_intp num_remaining_elements = array_size;
+
+    while (num_remaining_elements > 0) {
+
+        if (num_remaining_elements < num_lanes) {
+            load_mask = fma_get_partial_load_mask_ps(num_remaining_elements,
+                                                       num_lanes);
+        }
+
+        __m256 x_in;
+        if (stride == 1) {
+            x_in = fma_masked_load_ps(load_mask, ip);
+        }
+        else {
+            x_in  = fma_masked_gather_ps(zeros_f, ip, vindex, load_mask);
+        }
+
+        negx_mask = _mm256_cmp_ps(x_in, zeros_f, _CMP_LT_OQ);
+        zero_mask = _mm256_cmp_ps(x_in, zeros_f, _CMP_EQ_OQ);
+        inf_mask = _mm256_cmp_ps(x_in, inf, _CMP_EQ_OQ);
+        nan_mask = _mm256_cmp_ps(x_in, x_in, _CMP_NEQ_UQ);
+        divide_by_zero_mask = _mm256_or_ps(divide_by_zero_mask,
+                                        _mm256_and_ps(zero_mask, load_mask));
+        invalid_mask = _mm256_or_ps(invalid_mask, negx_mask);
+
+        __m256 x = fma_set_masked_lanes_ps(x_in, zeros_f, negx_mask);
+
+        /* set x = normalized mantissa */
+        exponent = fma_get_exponent(x);
+        x = fma_get_mantissa(x);
+
+        /*
+         * if x <= NPY_SQRT1_2f (i.e. 1/sqrt(2)) {exp = exp-1; x = 2*x}
+         * (the comparison below is _CMP_LE_OQ against NPY_SQRT1_2f)
+         */
+        sqrt2_mask = _mm256_cmp_ps(x, _mm256_set1_ps(NPY_SQRT1_2f), _CMP_LE_OQ);
+        x = fma_blend(x, _mm256_add_ps(x,x), sqrt2_mask);
+        exponent = fma_blend(exponent,
+                               _mm256_sub_ps(exponent,ones_f), sqrt2_mask);
+
+        /* x = x - 1 */
+        x = _mm256_sub_ps(x, ones_f);
+
+        /*
+         * Polynomial approximation for log(1+x): numerator and denominator
+         * are each evaluated with Horner's scheme (one FMA per coefficient),
+         * divided, and then exponent*loge2 is added to the quotient.
+         */
+        num_poly = _mm256_fmadd_ps(log_p5, x, log_p4);
+        num_poly = _mm256_fmadd_ps(num_poly, x, log_p3);
+        num_poly = _mm256_fmadd_ps(num_poly, x, log_p2);
+        num_poly = _mm256_fmadd_ps(num_poly, x, log_p1);
+        num_poly = _mm256_fmadd_ps(num_poly, x, log_p0);
+        denom_poly = _mm256_fmadd_ps(log_q5, x, log_q4);
+        denom_poly = _mm256_fmadd_ps(denom_poly, x, log_q3);
+        denom_poly = _mm256_fmadd_ps(denom_poly, x, log_q2);
+        denom_poly = _mm256_fmadd_ps(denom_poly, x, log_q1);
+        denom_poly = _mm256_fmadd_ps(denom_poly, x, log_q0);
+        poly = _mm256_div_ps(num_poly, denom_poly);
+        poly = _mm256_fmadd_ps(exponent, loge2, poly);
+
+        /*
+         * Special cases, applied in this order (a later line overrides an
+         * earlier one on the lanes its mask selects):
+         * x = +/- NAN; return NAN
+         * x < 0.0f; return -NAN
+         * x = 0.0f; return -INF
+         * x = +INF; return INF
+         */
+        poly = fma_set_masked_lanes_ps(poly, nan, nan_mask);
+        poly = fma_set_masked_lanes_ps(poly, neg_nan, negx_mask);
+        poly = fma_set_masked_lanes_ps(poly, neg_inf, zero_mask);
+        poly = fma_set_masked_lanes_ps(poly, inf, inf_mask);
+
+        _mm256_maskstore_ps(op, _mm256_cvtps_epi32(load_mask), poly);
+
+        ip += num_lanes*stride;
+        op += num_lanes;
+        num_remaining_elements -= num_lanes;
+    }
+
+    if (_mm256_movemask_ps(invalid_mask)) {
+        npy_set_floatstatus_invalid();
+    }
+    if (_mm256_movemask_ps(divide_by_zero_mask)) {
+        npy_set_floatstatus_divbyzero();
+    }
+}
+#endif // SIMD_AVX2_FMA3
+
+#line 372
+#ifdef SIMD_AVX512F
+/*
+ * Vectorized Cody-Waite range reduction technique
+ *
+ * Applies the reduction x* = x - y*C as three chained fused multiply-adds,
+ * folding in one piece of the constant per step:
+ *   1) t1 = y*c1 + x
+ *   2) t2 = y*c2 + t1
+ *   3) x* = y*c3 + t2
+ * Each step ADDS y*c, so a subtraction of y*C requires the caller to pass the
+ * pieces of C with their sign already flipped (presumably how the constants
+ * used with this helper are defined -- confirm against their definitions).
+ * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher
+ * precision.
+ *
+ * Returns the reduced value x*.
+ */
+NPY_FINLINE __m512
+simd_range_reduction(__m512 x, __m512 y, __m512 c1, __m512 c2, __m512 c3)
+{
+    const __m512 after_c1 = _mm512_fmadd_ps(y, c1, x);
+    const __m512 after_c2 = _mm512_fmadd_ps(y, c2, after_c1);
+    return _mm512_fmadd_ps(y, c3, after_c2);
+}
+/*
+ * Vectorized implementation of exp using AVX2 and AVX512
+ * (this copy is the 512-bit variant: it works on __m512 vectors and handles
+ * 64/sizeof(npy_float) lanes per loop iteration):
+ * 1) if x >= xmax; return INF (overflow)
+ * 2) if x <= xmin; return 0.0f (underflow)
+ * 3) Range reduction (using Cody-Waite):
+ *      a) y = x - k*ln(2); k = rint(x/ln(2)); y \in [0, ln(2)]
+ * 4) Compute exp(y) = P/Q, ratio of 2 polynomials P and Q
+ *      b) P = 5th order and Q = 2nd order polynomials obtained from Remez's
+ *      algorithm (mini-max polynomial approximation)
+ * 5) Compute exp(x) = exp(y) * 2^k
+ * 6) Max ULP error measured across all 32-bit FP's = 2.52 (x = 0xc2781e37)
+ * 7) Max relative error measured across all 32-bit FP's= 2.1264E-07 (for the
+ * same x = 0xc2781e37)
+ *
+ * Parameters:
+ *   op         - output pointer; advanced by num_lanes after every store, so
+ *                the output is written contiguously
+ *   ip         - input pointer; read with a masked load when the element
+ *                stride is 1, otherwise gathered with per-lane offsets
+ *                ii*stride
+ *   array_size - number of elements to process
+ *   steps      - input step; divided by sizeof(npy_float) to obtain the
+ *                element stride, so presumably given in bytes
+ *
+ * Side effects (after the loop):
+ *   - npy_set_floatstatus_overflow() is called if any lane was >= xmax
+ *     without being +INF
+ *   - npy_set_floatstatus_underflow() is called if any lane was <= xmin
+ *     without being -INF
+ *   Both accumulators start from avx512_get_partial_load_mask_ps(0,
+ *   num_lanes), presumably an empty mask -- TODO confirm in the helper.
+ */
+static void
+simd_exp_FLOAT(npy_float * op,
+                npy_float * ip,
+                const npy_intp array_size,
+                const npy_intp steps)
+{
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
+    const npy_int num_lanes = 64/(npy_intp)sizeof(npy_float);
+    npy_float xmax = 88.72283935546875f;
+    npy_float xmin = -103.97208404541015625f;
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_OUTPUT_BLOCKABLE_UNARY
+     *
+     * indexarr holds the per-lane gather offsets (in elements) that are
+     * loaded into vindex below.
+     */
+    npy_int32 indexarr[16];
+    for (npy_int32 ii = 0; ii < 16; ii++) {
+        indexarr[ii] = ii*stride;
+    }
+
+    /* Load up frequently used constants */
+    __m512 codyw_c1 = _mm512_set1_ps(NPY_CODY_WAITE_LOGE_2_HIGHf);
+    __m512 codyw_c2 = _mm512_set1_ps(NPY_CODY_WAITE_LOGE_2_LOWf);
+    __m512 exp_p0 = _mm512_set1_ps(NPY_COEFF_P0_EXPf);
+    __m512 exp_p1 = _mm512_set1_ps(NPY_COEFF_P1_EXPf);
+    __m512 exp_p2 = _mm512_set1_ps(NPY_COEFF_P2_EXPf);
+    __m512 exp_p3 = _mm512_set1_ps(NPY_COEFF_P3_EXPf);
+    __m512 exp_p4 = _mm512_set1_ps(NPY_COEFF_P4_EXPf);
+    __m512 exp_p5 = _mm512_set1_ps(NPY_COEFF_P5_EXPf);
+    __m512 exp_q0 = _mm512_set1_ps(NPY_COEFF_Q0_EXPf);
+    __m512 exp_q1 = _mm512_set1_ps(NPY_COEFF_Q1_EXPf);
+    __m512 exp_q2 = _mm512_set1_ps(NPY_COEFF_Q2_EXPf);
+    __m512 cvt_magic = _mm512_set1_ps(NPY_RINT_CVT_MAGICf);
+    __m512 log2e = _mm512_set1_ps(NPY_LOG2Ef);
+    __m512 inf = _mm512_set1_ps(NPY_INFINITYF);
+    __m512 ninf = _mm512_set1_ps(-1*NPY_INFINITYF);
+    __m512 zeros_f = _mm512_set1_ps(0.0f);
+    __m512 poly, num_poly, denom_poly, quadrant;
+    __m512i vindex = _mm512_loadu_si512((__m512i*)&indexarr[0]);
+
+    __mmask16 xmax_mask, xmin_mask, nan_mask, inf_mask, ninf_mask;
+    __mmask16 overflow_mask = avx512_get_partial_load_mask_ps(0, num_lanes);
+    __mmask16 underflow_mask = avx512_get_partial_load_mask_ps(0, num_lanes);
+    __mmask16 load_mask = avx512_get_full_load_mask_ps();
+    npy_intp num_remaining_elements = array_size;
+
+    while (num_remaining_elements > 0) {
+
+        if (num_remaining_elements < num_lanes) {
+            load_mask = avx512_get_partial_load_mask_ps(num_remaining_elements,
+                                                       num_lanes);
+        }
+
+        __m512 x;
+        if (stride == 1) {
+            x = avx512_masked_load_ps(load_mask, ip);
+        }
+        else {
+            x = avx512_masked_gather_ps(zeros_f, ip, vindex, load_mask);
+        }
+
+        nan_mask = _mm512_cmp_ps_mask(x, x, _CMP_NEQ_UQ);
+        x = avx512_set_masked_lanes_ps(x, zeros_f, nan_mask);
+
+        xmax_mask = _mm512_cmp_ps_mask(x, _mm512_set1_ps(xmax), _CMP_GE_OQ);
+        xmin_mask = _mm512_cmp_ps_mask(x, _mm512_set1_ps(xmin), _CMP_LE_OQ);
+        inf_mask = _mm512_cmp_ps_mask(x, inf, _CMP_EQ_OQ);
+        ninf_mask = _mm512_cmp_ps_mask(x, ninf, _CMP_EQ_OQ);
+        overflow_mask = _mm512_kor(overflow_mask,
+                                    _mm512_kxor(xmax_mask, inf_mask));
+        underflow_mask = _mm512_kor(underflow_mask,
+                                    _mm512_kxor(xmin_mask, ninf_mask));
+
+        x = avx512_set_masked_lanes_ps(x, zeros_f, _mm512_kor(
+                                    _mm512_kor(nan_mask, xmin_mask), xmax_mask));
+
+        quadrant = _mm512_mul_ps(x, log2e);
+
+        /* round to nearest, by adding and then subtracting cvt_magic */
+        quadrant = _mm512_add_ps(quadrant, cvt_magic);
+        quadrant = _mm512_sub_ps(quadrant, cvt_magic);
+
+        /*
+         * Cody-Waite's range reduction algorithm; the third constant is
+         * passed as zeros_f, so only codyw_c1 and codyw_c2 contribute.
+         */
+        x = simd_range_reduction(x, quadrant, codyw_c1, codyw_c2, zeros_f);
+
+        num_poly = _mm512_fmadd_ps(exp_p5, x, exp_p4);
+        num_poly = _mm512_fmadd_ps(num_poly, x, exp_p3);
+        num_poly = _mm512_fmadd_ps(num_poly, x, exp_p2);
+        num_poly = _mm512_fmadd_ps(num_poly, x, exp_p1);
+        num_poly = _mm512_fmadd_ps(num_poly, x, exp_p0);
+        denom_poly = _mm512_fmadd_ps(exp_q2, x, exp_q1);
+        denom_poly = _mm512_fmadd_ps(denom_poly, x, exp_q0);
+        poly = _mm512_div_ps(num_poly, denom_poly);
+
+        /*
+         * compute val = poly * 2^quadrant; which is same as adding the
+         * exponent of quadrant to the exponent of poly. quadrant is an int,
+         * so extracting exponent is simply extracting 8 bits.
+         */
+        poly = avx512_scalef_ps(poly, quadrant);
+
+        /*
+         * Special cases, applied in this order (a later line overrides an
+         * earlier one on the lanes its mask selects):
+         * elem = +/- nan, return nan
+         * elem >= xmax; return inf
+         * elem <= xmin; return 0.0f
+         */
+        poly = avx512_set_masked_lanes_ps(poly, _mm512_set1_ps(NPY_NANF), nan_mask);
+        poly = avx512_set_masked_lanes_ps(poly, inf, xmax_mask);
+        poly = avx512_set_masked_lanes_ps(poly, zeros_f, xmin_mask);
+
+        _mm512_mask_storeu_ps(op, (load_mask), poly);
+
+        ip += num_lanes*stride;
+        op += num_lanes;
+        num_remaining_elements -= num_lanes;
+    }
+
+    if (npyv_tobits_b32(overflow_mask)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    if (npyv_tobits_b32(underflow_mask)) {
+        npy_set_floatstatus_underflow();
+    }
+}
+
+/*
+ * Vectorized implementation of log using AVX2 and AVX512
+ * (this copy is the 512-bit variant: it works on __m512 vectors and handles
+ * 64/sizeof(npy_float) lanes per loop iteration).
+ * 1) if x < 0.0f; return -NAN (invalid input)
+ * 2) Range reduction: y = x/2^k;
+ *      a) y = normalized mantissa, k is the exponent (0.5 <= y < 1)
+ * 3) Compute log(y) = P/Q, ratio of 2 polynomials P and Q
+ *      a) P = 5th order and Q = 5th order polynomials obtained from Remez's
+ *      algorithm (mini-max polynomial approximation)
+ * 4) Compute log(x) = log(y) + k*ln(2)
+ * 5) Max ULP error measured across all 32-bit FP's = 3.83 (x = 0x3f486945)
+ * 6) Max relative error measured across all 32-bit FP's = 2.359E-07 (for same
+ * x = 0x3f486945)
+ *
+ * Parameters:
+ *   op         - output pointer; advanced by num_lanes after every store, so
+ *                the output is written contiguously
+ *   ip         - input pointer; read with a masked load when the element
+ *                stride is 1, otherwise gathered with per-lane offsets
+ *                ii*stride
+ *   array_size - number of elements to process
+ *   steps      - input step; divided by sizeof(npy_float) to obtain the
+ *                element stride, so presumably given in bytes
+ *
+ * Side effects (after the loop):
+ *   - npy_set_floatstatus_invalid() is called if any input compared < 0
+ *   - npy_set_floatstatus_divbyzero() is called if any loaded input
+ *     compared == 0
+ *   Both accumulators start from avx512_get_partial_load_mask_ps(0,
+ *   num_lanes), presumably an empty mask -- TODO confirm in the helper.
+ */
+
+static void
+simd_log_FLOAT(npy_float * op,
+                npy_float * ip,
+                const npy_intp array_size,
+                const npy_intp steps)
+{
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
+    const npy_int num_lanes = 64/(npy_intp)sizeof(npy_float);
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_OUTPUT_BLOCKABLE_UNARY
+     *
+     * indexarr holds the per-lane gather offsets (in elements) that are
+     * loaded into vindex below.
+     */
+    npy_int32 indexarr[16];
+    for (npy_int32 ii = 0; ii < 16; ii++) {
+        indexarr[ii] = ii*stride;
+    }
+
+    /* Load up frequently used constants */
+    __m512 log_p0 = _mm512_set1_ps(NPY_COEFF_P0_LOGf);
+    __m512 log_p1 = _mm512_set1_ps(NPY_COEFF_P1_LOGf);
+    __m512 log_p2 = _mm512_set1_ps(NPY_COEFF_P2_LOGf);
+    __m512 log_p3 = _mm512_set1_ps(NPY_COEFF_P3_LOGf);
+    __m512 log_p4 = _mm512_set1_ps(NPY_COEFF_P4_LOGf);
+    __m512 log_p5 = _mm512_set1_ps(NPY_COEFF_P5_LOGf);
+    __m512 log_q0 = _mm512_set1_ps(NPY_COEFF_Q0_LOGf);
+    __m512 log_q1 = _mm512_set1_ps(NPY_COEFF_Q1_LOGf);
+    __m512 log_q2 = _mm512_set1_ps(NPY_COEFF_Q2_LOGf);
+    __m512 log_q3 = _mm512_set1_ps(NPY_COEFF_Q3_LOGf);
+    __m512 log_q4 = _mm512_set1_ps(NPY_COEFF_Q4_LOGf);
+    __m512 log_q5 = _mm512_set1_ps(NPY_COEFF_Q5_LOGf);
+    __m512 loge2 = _mm512_set1_ps(NPY_LOGE2f);
+    __m512 nan = _mm512_set1_ps(NPY_NANF);
+    __m512 neg_nan = _mm512_set1_ps(-NPY_NANF);
+    __m512 neg_inf = _mm512_set1_ps(-NPY_INFINITYF);
+    __m512 inf = _mm512_set1_ps(NPY_INFINITYF);
+    __m512 zeros_f = _mm512_set1_ps(0.0f);
+    __m512 ones_f = _mm512_set1_ps(1.0f);
+    __m512i vindex = _mm512_loadu_si512((__m512i*)indexarr);
+    __m512 poly, num_poly, denom_poly, exponent;
+
+    __mmask16 inf_mask, nan_mask, sqrt2_mask, zero_mask, negx_mask;
+    __mmask16 invalid_mask = avx512_get_partial_load_mask_ps(0, num_lanes);
+    __mmask16 divide_by_zero_mask = invalid_mask;
+    __mmask16 load_mask = avx512_get_full_load_mask_ps();
+    npy_intp num_remaining_elements = array_size;
+
+    while (num_remaining_elements > 0) {
+
+        if (num_remaining_elements < num_lanes) {
+            load_mask = avx512_get_partial_load_mask_ps(num_remaining_elements,
+                                                       num_lanes);
+        }
+
+        __m512 x_in;
+        if (stride == 1) {
+            x_in = avx512_masked_load_ps(load_mask, ip);
+        }
+        else {
+            x_in  = avx512_masked_gather_ps(zeros_f, ip, vindex, load_mask);
+        }
+
+        negx_mask = _mm512_cmp_ps_mask(x_in, zeros_f, _CMP_LT_OQ);
+        zero_mask = _mm512_cmp_ps_mask(x_in, zeros_f, _CMP_EQ_OQ);
+        inf_mask = _mm512_cmp_ps_mask(x_in, inf, _CMP_EQ_OQ);
+        nan_mask = _mm512_cmp_ps_mask(x_in, x_in, _CMP_NEQ_UQ);
+        divide_by_zero_mask = _mm512_kor(divide_by_zero_mask,
+                                        _mm512_kand(zero_mask, load_mask));
+        invalid_mask = _mm512_kor(invalid_mask, negx_mask);
+
+        __m512 x = avx512_set_masked_lanes_ps(x_in, zeros_f, negx_mask);
+
+        /* set x = normalized mantissa */
+        exponent = avx512_get_exponent(x);
+        x = avx512_get_mantissa(x);
+
+        /*
+         * if x <= NPY_SQRT1_2f (i.e. 1/sqrt(2)) {exp = exp-1; x = 2*x}
+         * (the comparison below is _CMP_LE_OQ against NPY_SQRT1_2f)
+         */
+        sqrt2_mask = _mm512_cmp_ps_mask(x, _mm512_set1_ps(NPY_SQRT1_2f), _CMP_LE_OQ);
+        x = avx512_blend(x, _mm512_add_ps(x,x), sqrt2_mask);
+        exponent = avx512_blend(exponent,
+                               _mm512_sub_ps(exponent,ones_f), sqrt2_mask);
+
+        /* x = x - 1 */
+        x = _mm512_sub_ps(x, ones_f);
+
+        /*
+         * Polynomial approximation for log(1+x): numerator and denominator
+         * are each evaluated with Horner's scheme (one FMA per coefficient),
+         * divided, and then exponent*loge2 is added to the quotient.
+         */
+        num_poly = _mm512_fmadd_ps(log_p5, x, log_p4);
+        num_poly = _mm512_fmadd_ps(num_poly, x, log_p3);
+        num_poly = _mm512_fmadd_ps(num_poly, x, log_p2);
+        num_poly = _mm512_fmadd_ps(num_poly, x, log_p1);
+        num_poly = _mm512_fmadd_ps(num_poly, x, log_p0);
+        denom_poly = _mm512_fmadd_ps(log_q5, x, log_q4);
+        denom_poly = _mm512_fmadd_ps(denom_poly, x, log_q3);
+        denom_poly = _mm512_fmadd_ps(denom_poly, x, log_q2);
+        denom_poly = _mm512_fmadd_ps(denom_poly, x, log_q1);
+        denom_poly = _mm512_fmadd_ps(denom_poly, x, log_q0);
+        poly = _mm512_div_ps(num_poly, denom_poly);
+        poly = _mm512_fmadd_ps(exponent, loge2, poly);
+
+        /*
+         * Special cases, applied in this order (a later line overrides an
+         * earlier one on the lanes its mask selects):
+         * x = +/- NAN; return NAN
+         * x < 0.0f; return -NAN
+         * x = 0.0f; return -INF
+         * x = +INF; return INF
+         */
+        poly = avx512_set_masked_lanes_ps(poly, nan, nan_mask);
+        poly = avx512_set_masked_lanes_ps(poly, neg_nan, negx_mask);
+        poly = avx512_set_masked_lanes_ps(poly, neg_inf, zero_mask);
+        poly = avx512_set_masked_lanes_ps(poly, inf, inf_mask);
+
+        _mm512_mask_storeu_ps(op, (load_mask), poly);
+
+        ip += num_lanes*stride;
+        op += num_lanes;
+        num_remaining_elements -= num_lanes;
+    }
+
+    if (npyv_tobits_b32(invalid_mask)) {
+        npy_set_floatstatus_invalid();
+    }
+    if (npyv_tobits_b32(divide_by_zero_mask)) {
+        npy_set_floatstatus_divbyzero();
+    }
+}
+#endif // SIMD_AVX512F
+
+
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+#line 676
+/*
+ * Double-precision exp over a strided buffer, delegating the math to
+ * __svml_exp8 one vector (npyv_nlanes_f64 lanes) at a time.
+ *
+ *   src, ssrc - input pointer and its stride in elements
+ *   dst, sdst - output pointer and its stride in elements
+ *   len       - number of elements to process
+ *
+ * The final, possibly partial, vector is handled by the "till" load/store
+ * variants, which receive the remaining element count. The "tillz" loads
+ * presumably zero-fill the lanes beyond len -- TODO confirm against the
+ * npyv documentation.
+ */
+static void
+simd_exp_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,
+                      npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)
+{
+    const int lanes = npyv_nlanes_f64;
+
+    while (len > 0) {
+        npyv_f64 in;
+        if (ssrc != 1) {
+            in = npyv_loadn_tillz_f64(src, ssrc, len);
+        }
+        else {
+            in = npyv_load_tillz_f64(src, len);
+        }
+
+        npyv_f64 result = __svml_exp8(in);
+
+        if (sdst != 1) {
+            npyv_storen_till_f64(dst, sdst, len, result);
+        }
+        else {
+            npyv_store_till_f64(dst, len, result);
+        }
+
+        /* advance to the next vector-sized slice */
+        len -= lanes;
+        src += ssrc*lanes;
+        dst += sdst*lanes;
+    }
+    npyv_cleanup();
+}
+
+#line 676
+/*
+ * Double-precision log over a strided buffer, delegating the math to
+ * __svml_log8 one vector (npyv_nlanes_f64 lanes) at a time.
+ *
+ *   src, ssrc - input pointer and its stride in elements
+ *   dst, sdst - output pointer and its stride in elements
+ *   len       - number of elements to process
+ *
+ * The final, possibly partial, vector is handled by the "till" load/store
+ * variants, which receive the remaining element count. The loads pass 1 as
+ * their last argument, presumably the fill value for lanes beyond len so
+ * that padding lanes evaluate log(1) -- TODO confirm against the npyv
+ * documentation.
+ */
+static void
+simd_log_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,
+                      npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)
+{
+    const int lanes = npyv_nlanes_f64;
+
+    while (len > 0) {
+        npyv_f64 in;
+        if (ssrc != 1) {
+            in = npyv_loadn_till_f64(src, ssrc, len, 1);
+        }
+        else {
+            in = npyv_load_till_f64(src, len, 1);
+        }
+
+        npyv_f64 result = __svml_log8(in);
+
+        if (sdst != 1) {
+            npyv_storen_till_f64(dst, sdst, len, result);
+        }
+        else {
+            npyv_store_till_f64(dst, len, result);
+        }
+
+        /* advance to the next vector-sized slice */
+        len -= lanes;
+        src += ssrc*lanes;
+        dst += sdst*lanes;
+    }
+    npyv_cleanup();
+}
+
+
+#else
+#ifdef SIMD_AVX512F_NOCLANG_BUG
+/*
+ * Vectorized implementation of exp double using AVX512
+ * Reference: Tang, P.T.P., "Table-driven implementation of the
+ *  exponential function in IEEE floating-point
+ *  arithmetic," ACM Transactions on Mathematical
+ *  Software, vol. 15, pp. 144-157, 1989.
+ * 1) if x > mTH_max or x is INF; return INF (overflow)
+ * 2) if x < mTH_min; return 0.0f (underflow)
+ * 3) if abs(x) < mTH_nearzero; return 1.0f + x
+ * 4) if x is Nan; return Nan
+ * 5) Range reduction:
+ *    x = (32m + j)ln2 / 32 + r; r in [-ln2/64, ln2/64]
+ * 6) exp(r) - 1 is approximated by a polynomial function p(r)
+ *    exp(x) = 2^m(2^(j/32) + 2^(j/32)p(r));
+ *
+ * Parameters:
+ *   op         - output pointer; advanced by num_lanes after every store, so
+ *                the output is written contiguously
+ *   ip         - input pointer; read with a masked load when the element
+ *                stride is 1, otherwise gathered with per-lane offsets
+ *                ii*stride
+ *   array_size - number of elements to process
+ *   steps      - input step; divided by sizeof(npy_double) to obtain the
+ *                element stride, so presumably given in bytes
+ *
+ * Side effects (after the loop):
+ *   - npy_set_floatstatus_overflow() is called if any lane was > mTH_max
+ *     without being +INF
+ *   - npy_set_floatstatus_underflow() is called if any lane was < mTH_min
+ *     without being -INF
+ *   Both accumulators start from avx512_get_partial_load_mask_pd(0,
+ *   num_lanes), presumably an empty mask -- TODO confirm in the helper.
+ */
+static void
+AVX512F_exp_DOUBLE(npy_double * op,
+                npy_double * ip,
+                const npy_intp array_size,
+                const npy_intp steps)
+{
+    npy_intp num_remaining_elements = array_size;
+    const npy_intp stride = steps / (npy_intp)sizeof(npy_double);
+    const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double);
+    npy_int32 indexarr[8];
+    for (npy_int32 ii = 0; ii < 8; ii++) {
+        indexarr[ii] = ii*stride;
+    }
+
+    __m512d InvLn2N = _mm512_set1_pd(NPY_INV_LN2_MUL_32);
+    __m512d mShift = _mm512_set1_pd(NPY_RINT_CVT_MAGIC);
+    __m512d mNegL1 = _mm512_set1_pd(NPY_TANG_NEG_L1);
+    __m512d mNegL2 = _mm512_set1_pd(NPY_TANG_NEG_L2);
+    __m512i mMod = _mm512_set1_epi64(0x1f);
+    __m512d mA1 = _mm512_set1_pd(NPY_TANG_A1);
+    __m512d mA2 = _mm512_set1_pd(NPY_TANG_A2);
+    __m512d mA3 = _mm512_set1_pd(NPY_TANG_A3);
+    __m512d mA4 = _mm512_set1_pd(NPY_TANG_A4);
+    __m512d mA5 = _mm512_set1_pd(NPY_TANG_A5);
+    __m512d mTH_nearzero = _mm512_set1_pd(0x1p-54);
+    __m512d mTH_max = _mm512_set1_pd(0x1.62e42fefa39efp+9);
+    __m512d mTH_min = _mm512_set1_pd(-0x1.74910d52d3053p+9);
+    __m512d mTH_inf = _mm512_set1_pd(NPY_INFINITY);
+    __m512d mTH_ninf = _mm512_set1_pd(-NPY_INFINITY);
+    __m512d zeros_d = _mm512_set1_pd(0.0f);
+    __m512d ones_d = _mm512_set1_pd(1.0f);
+    __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]);
+
+    __m512d mTable_top_0 = _mm512_loadu_pd(&(EXP_Table_top[8*0]));
+    __m512d mTable_top_1 = _mm512_loadu_pd(&(EXP_Table_top[8*1]));
+    __m512d mTable_top_2 = _mm512_loadu_pd(&(EXP_Table_top[8*2]));
+    __m512d mTable_top_3 = _mm512_loadu_pd(&(EXP_Table_top[8*3]));
+    __m512d mTable_tail_0 = _mm512_loadu_pd(&(EXP_Table_tail[8*0]));
+    __m512d mTable_tail_1 = _mm512_loadu_pd(&(EXP_Table_tail[8*1]));
+    __m512d mTable_tail_2 = _mm512_loadu_pd(&(EXP_Table_tail[8*2]));
+    __m512d mTable_tail_3 = _mm512_loadu_pd(&(EXP_Table_tail[8*3]));
+
+    __mmask8 overflow_mask = avx512_get_partial_load_mask_pd(0, num_lanes);
+    __mmask8 underflow_mask = avx512_get_partial_load_mask_pd(0, num_lanes);
+    __mmask8 load_mask = avx512_get_full_load_mask_pd();
+    __mmask8 xmin_mask, xmax_mask, inf_mask, ninf_mask, nan_mask, nearzero_mask;
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < num_lanes) {
+            load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements,
+                                                      num_lanes);
+        }
+
+        __m512d x;
+        if (1 == stride) {
+            x = avx512_masked_load_pd(load_mask, ip);
+        }
+        else {
+            x = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask);
+        }
+
+        nan_mask = _mm512_cmp_pd_mask(x, x, _CMP_NEQ_UQ);
+        x = avx512_set_masked_lanes_pd(x, zeros_d, nan_mask);
+        xmax_mask = _mm512_cmp_pd_mask(x, mTH_max, _CMP_GT_OQ);
+        xmin_mask = _mm512_cmp_pd_mask(x, mTH_min, _CMP_LT_OQ);
+        inf_mask = _mm512_cmp_pd_mask(x, mTH_inf, _CMP_EQ_OQ);
+        ninf_mask = _mm512_cmp_pd_mask(x, mTH_ninf, _CMP_EQ_OQ);
+        __m512i x_abs = _mm512_and_epi64(_mm512_castpd_si512(x),
+                                _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF));
+        nearzero_mask = _mm512_cmp_pd_mask(_mm512_castsi512_pd(x_abs),
+                                    mTH_nearzero, _CMP_LT_OQ);
+        nearzero_mask = _mm512_kxor(nearzero_mask, nan_mask);
+        overflow_mask = _mm512_kor(overflow_mask,
+                                _mm512_kxor(xmax_mask, inf_mask));
+        underflow_mask = _mm512_kor(underflow_mask,
+                                _mm512_kxor(xmin_mask, ninf_mask));
+        x = avx512_set_masked_lanes_pd(x, zeros_d,
+                        _mm512_kor(_mm512_kor(nan_mask, xmin_mask),
+                            _mm512_kor(xmax_mask, nearzero_mask)));
+
+        /* z = x * 32/ln2 */
+        __m512d z = _mm512_mul_pd(x, InvLn2N);
+
+        /*
+         * round to nearest, by adding and then subtracting mShift. ki keeps
+         * the raw bits of the shifted value (taken before the subtraction);
+         * its low 5 bits are used further down as the table index j.
+         */
+        __m512d kd = _mm512_add_pd(z, mShift);
+        __m512i ki = _mm512_castpd_si512(kd);
+        kd = _mm512_sub_pd(kd, mShift);
+
+        /* r = (x + kd*mNegL1) + kd*mNegL2 */
+        __m512d r1 = _mm512_fmadd_pd(kd, mNegL1, x);
+        __m512d r2 = _mm512_mul_pd(kd, mNegL2);
+        __m512d r = _mm512_add_pd(r1,r2);
+
+        /*
+         * Polynomial approximation for exp(r) - 1:
+         *   q = r*(A1 + r*(A2 + r*(A3 + r*(A4 + r*A5))))
+         *   p = r1 + (r2 + r*q)
+         */
+        __m512d q = _mm512_fmadd_pd(mA5, r, mA4);
+        q = _mm512_fmadd_pd(q, r, mA3);
+        q = _mm512_fmadd_pd(q, r, mA2);
+        q = _mm512_fmadd_pd(q, r, mA1);
+        q = _mm512_mul_pd(q, r);
+        __m512d p = _mm512_fmadd_pd(r, q, r2);
+        p = _mm512_add_pd(r1, p);
+
+        /* Get 2^(j/32) from lookup table; j = ki & 0x1f */
+        __m512i j = _mm512_and_epi64(ki, mMod);
+        __m512d top = avx512_permute_x4var_pd(mTable_top_0, mTable_top_1,
+                                  mTable_top_2, mTable_top_3, j);
+        __m512d tail = avx512_permute_x4var_pd(mTable_tail_0, mTable_tail_1,
+                                  mTable_tail_2, mTable_tail_3, j);
+
+        /*
+         * s = top + tail;
+         * exp(x) = 2^m * (top + (tail + s * p));
+         * The 2^m scaling is applied with scalef, passing kd/32 as the
+         * exponent argument.
+         */
+        __m512d s = _mm512_add_pd(top, tail);
+        __m512d res = _mm512_fmadd_pd(s, p, tail);
+        res = _mm512_add_pd(res, top);
+        res= _mm512_scalef_pd(res, _mm512_div_pd(kd, _mm512_set1_pd(32)));
+
+        /*
+         * return special cases, applied in this order (a later line
+         * overrides an earlier one on the lanes its mask selects):
+         *   near zero   -> x + 1 (x was replaced by zeros_d on these lanes
+         *                  above, so this presumably yields exactly 1.0)
+         *   NaN         -> NaN
+         *   x > mTH_max -> +INF
+         *   x < mTH_min -> 0
+         */
+        res = avx512_set_masked_lanes_pd(res, _mm512_add_pd(x, ones_d),
+                                        nearzero_mask);
+        res = avx512_set_masked_lanes_pd(res, _mm512_set1_pd(NPY_NAN),
+                                        nan_mask);
+        res = avx512_set_masked_lanes_pd(res, mTH_inf, xmax_mask);
+        res = avx512_set_masked_lanes_pd(res, zeros_d, xmin_mask);
+
+        _mm512_mask_storeu_pd(op, load_mask, res);
+
+        ip += num_lanes * stride;
+        op += num_lanes;
+        num_remaining_elements -= num_lanes;
+    }
+    /*
+     * Don't count on the compiler for cast between mask and int registers.
+     * On gcc7 with flags -march>=nocona -O3 can cause FP stack overflow
+     * which may lead to putting NaN into certain HW/FP calculations.
+     *
+     * For more details, please check the comments in:
+     * - https://github.com/numpy/numpy/issues/20356
+     */
+    if (npyv_tobits_b64(overflow_mask)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    if (npyv_tobits_b64(underflow_mask)) {
+        npy_set_floatstatus_underflow();
+    }
+}
+/*
+ * Vectorized implementation of log double using AVX512
+ * Reference:
+ * [1] Tang, Ping Tak Peter. Table-lookup algorithms for elementary functions
+ *     and their error analysis. No. CONF-9106103-1. Argonne National Lab.,
+ *     IL (USA), 1991.
+ * [2] Tang, Ping-Tak Peter. "Table-driven implementation of the logarithm
+ *     function in IEEE floating-point arithmetic." ACM Transactions on
+ *     Mathematical Software (TOMS) 16.4 (1990): 378-400.
+ * [3] Muller, Jean-Michel. "Elementary functions: algorithms and
+ *     implementation." (2016).
+ * 1) if x = 0; return -INF
+ * 2) if x < 0; return NAN
+ * 3) if x is INF; return INF
+ * 4) if x is NAN; return NAN
+ * 5) if x is in (1.0 - 0x1p-4, 1.0 + 0x1.09p-4), call npy_log()
+ * 6) Range reduction:
+ *    log(x) = log(2^m * z)
+ *           = mln2 + log(z)
+ * 7) log(z) = log(z / c_k) + log(c_k);
+ *    where c_k = 1 + k/64, k = 0,1,...,64
+ *    s.t. |x - c_k| <= 1/128 when x is in [1,2].
+ * 8) r = 2(x - c_k)/(x + c_k)
+ *    log(x/c_k) = log((1 + r/2) / (1 - r/2))
+ *               = p(r)
+ *               = 2((r/2) + 1/3*(r/2)^3 + 1/5*(r/2)^5 + ...)
+ */
+
+/* LLVM has a bug where AVX-512F intrinsic `_mm512_mask_mul_pd` emits an
+ * unmasked operation with a masked store.  This can cause FP exceptions to
+ * occur for the lanes that are supposed to have been masked.
+ *
+ * See https://bugs.llvm.org/show_bug.cgi?id=51988
+ *
+ * Note, this affects LLVM based compilers like Apple Clang, Clang, and Intel's
+ * ICX.
+ *
+ * The block below defines WORKAROUND_LLVM__mm512_mask_mul_pd for the compiler
+ * versions that are treated as affected.
+ */
+#if defined(__clang__)
+    #if defined(__apple_build_version__)
+    // Apple Clang
+        #if __apple_build_version__ > 11000000
+        // Apple Clang with a build version above 11000000
+        #define WORKAROUND_LLVM__mm512_mask_mul_pd
+        #endif
+    #else
+    // Clang, not Apple Clang
+        #if __clang_major__ > 9
+        // Clang major version above 9 (i.e. v10 and later)
+        #define WORKAROUND_LLVM__mm512_mask_mul_pd
+        #endif
+    #endif
+#endif
+
+static void
+AVX512F_log_DOUBLE(npy_double * op,
+                npy_double * ip,
+                const npy_intp array_size,
+                const npy_intp steps)
+{
+    npy_intp num_remaining_elements = array_size;
+    const npy_intp stride = steps / (npy_intp)sizeof(npy_double);
+    const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double);
+    npy_int32 indexarr[8];
+    for (npy_int32 ii = 0; ii < 8; ii++) {
+        indexarr[ii] = ii*stride;
+    }
+
+    __m512d zeros_d = _mm512_set1_pd(0.0f);
+    __m512d ones_d = _mm512_set1_pd(1.0f);
+    __m512d mInf = _mm512_set1_pd(NPY_INFINITY);
+    __m512d mInv64 = _mm512_castsi512_pd(_mm512_set1_epi64(0x3f90000000000000));
+    __m512d mNeg_nan = _mm512_set1_pd(-NPY_NAN);
+    __m512d mNan = _mm512_set1_pd(NPY_NAN);
+    __m512d mNeg_inf = _mm512_set1_pd(-NPY_INFINITY);
+    __m512d mA1 = _mm512_set1_pd(NPY_TANG_LOG_A1);
+    __m512d mA2 = _mm512_set1_pd(NPY_TANG_LOG_A2);
+    __m512d mA3 = _mm512_set1_pd(NPY_TANG_LOG_A3);
+    __m512d mA4 = _mm512_set1_pd(NPY_TANG_LOG_A4);
+    __m512d mLN2HI = _mm512_set1_pd(NPY_TANG_LOG_LN2HI);
+    __m512d mLN2LO = _mm512_set1_pd(NPY_TANG_LOG_LN2LO);
+
+    __m512d mTo_glibc_min = _mm512_set1_pd(1.0 - 0x1p-4);
+    __m512d mTo_glibc_max = _mm512_set1_pd(1.0 + 0x1.09p-4);
+    __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]);
+
+    /* Load lookup table data */
+    #line 961
+
+    __m512d mLUT_TOP_0 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*0]));
+    __m512d mLUT_TAIL_0 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*0]));
+
+    
+#line 961
+
+    __m512d mLUT_TOP_1 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*1]));
+    __m512d mLUT_TAIL_1 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*1]));
+
+    
+#line 961
+
+    __m512d mLUT_TOP_2 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*2]));
+    __m512d mLUT_TAIL_2 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*2]));
+
+    
+#line 961
+
+    __m512d mLUT_TOP_3 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*3]));
+    __m512d mLUT_TAIL_3 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*3]));
+
+    
+#line 961
+
+    __m512d mLUT_TOP_4 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*4]));
+    __m512d mLUT_TAIL_4 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*4]));
+
+    
+#line 961
+
+    __m512d mLUT_TOP_5 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*5]));
+    __m512d mLUT_TAIL_5 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*5]));
+
+    
+#line 961
+
+    __m512d mLUT_TOP_6 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*6]));
+    __m512d mLUT_TAIL_6 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*6]));
+
+    
+#line 961
+
+    __m512d mLUT_TOP_7 = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*7]));
+    __m512d mLUT_TAIL_7 = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*7]));
+
+    
+
+    __mmask8 load_mask = avx512_get_full_load_mask_pd();
+    __mmask8 invalid_mask = avx512_get_partial_load_mask_pd(0, num_lanes);
+    __mmask8 divide_by_zero_mask = invalid_mask;
+
+    __mmask8 inf_mask, nan_mask, zero_mask, negx_mask, denormal_mask,
+             glibc_mask;
+
+    __m512d x_in;
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < num_lanes) {
+            load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements,
+                                                      num_lanes);
+        }
+
+        if (1 == stride) {
+            x_in = avx512_masked_load_pd(load_mask, ip);
+        }
+        else {
+            x_in = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask);
+        }
+
+        /* call glibc when x on [1.0 - 0x1p-4, 1.0 + 0x1.09p-4] */
+        __mmask8 m1 = _mm512_cmp_pd_mask(x_in, mTo_glibc_max, _CMP_LT_OQ);
+        __mmask8 m2 = _mm512_cmp_pd_mask(x_in, mTo_glibc_min, _CMP_GT_OQ);
+        glibc_mask =  m1 & m2;
+
+        if (glibc_mask != 0xFF) {
+            zero_mask = _mm512_cmp_pd_mask(x_in, zeros_d, _CMP_EQ_OQ);
+            inf_mask = _mm512_cmp_pd_mask(x_in, mInf, _CMP_EQ_OQ);
+            negx_mask = _mm512_cmp_pd_mask(x_in, zeros_d, _CMP_LT_OQ);
+            nan_mask = _mm512_cmp_pd_mask(x_in, x_in, _CMP_NEQ_UQ);
+
+            divide_by_zero_mask = divide_by_zero_mask | (zero_mask & load_mask);
+            invalid_mask = invalid_mask | negx_mask;
+
+            __m512d x = avx512_set_masked_lanes_pd(x_in, zeros_d, negx_mask);
+            __m512i ix = _mm512_castpd_si512(x);
+
+            /* Normalize x when it is denormal */
+            __m512i top12 = _mm512_and_epi64(ix,
+                                _mm512_set1_epi64(0xfff0000000000000));
+            denormal_mask = _mm512_cmp_epi64_mask(top12, _mm512_set1_epi64(0),
+                                _CMP_EQ_OQ);
+            denormal_mask = (~zero_mask) & denormal_mask;
+            __m512d masked_x = x;
+            #ifdef WORKAROUND_LLVM__mm512_mask_mul_pd
+            masked_x = avx512_set_masked_lanes_pd(masked_x, zeros_d, (~denormal_mask));
+            #endif
+            ix = _mm512_castpd_si512(_mm512_mask_mul_pd(x, denormal_mask,
+                                    masked_x, _mm512_set1_pd(0x1p52)));
+            ix = _mm512_mask_sub_epi64(ix, denormal_mask,
+                                    ix, _mm512_set1_epi64(52ULL << 52));
+
+            /*
+             * x = 2^k * z; where z in range [1,2]
+             */
+            __m512i tmp = _mm512_sub_epi64(ix,
+                              _mm512_set1_epi64(0x3ff0000000000000));
+            __m512i i = _mm512_and_epi64(_mm512_srai_epi64(tmp, 52 - 6),
+                            _mm512_set1_epi64(0x3fULL));
+            __m512i ik = _mm512_srai_epi64(tmp, 52);
+            __m512d z = _mm512_castsi512_pd(_mm512_sub_epi64(ix, _mm512_and_epi64(tmp,
+                            _mm512_set1_epi64(0xfff0000000000000))));
+            /* c = i/64 + 1 */
+            __m256i i_32 = _mm512_cvtepi64_epi32(i);
+            __m512d c = _mm512_fmadd_pd(_mm512_cvtepi32_pd(i_32), mInv64, ones_d);
+
+            /* u = 2 * (z - c) / (z + c) */
+            __m512d u = _mm512_div_pd(_mm512_sub_pd(z, c), _mm512_add_pd(z, c));
+            u = _mm512_mul_pd(_mm512_set1_pd(2.0), u);
+
+            /* v = u * u */
+            __m512d v = _mm512_mul_pd(u,u);
+
+            /* log(z/c) = u + u*v*(A1 + v*(A2 + v*(A3 + v*A4))) */
+            __m512d res = _mm512_fmadd_pd(v, mA4, mA3);
+            res = _mm512_fmadd_pd(v, res, mA2);
+            res = _mm512_fmadd_pd(v, res, mA1);
+            res = _mm512_mul_pd(v, res);
+            res = _mm512_fmadd_pd(u, res, u);
+
+            /* Load lookup table data */
+            __m512d c_hi = avx512_permute_x8var_pd(mLUT_TOP_0, mLUT_TOP_1,
+                            mLUT_TOP_2, mLUT_TOP_3, mLUT_TOP_4, mLUT_TOP_5,
+                            mLUT_TOP_6, mLUT_TOP_7, i);
+            __m512d c_lo = avx512_permute_x8var_pd(mLUT_TAIL_0, mLUT_TAIL_1,
+                              mLUT_TAIL_2, mLUT_TAIL_3, mLUT_TAIL_4, mLUT_TAIL_5,
+                              mLUT_TAIL_6, mLUT_TAIL_7, i);
+
+            /*
+             * log(x) = k * ln2_hi + c_hi +
+             *          k * ln2_lo + c_lo +
+             *          log(z/c)
+             */
+            __m256i ik_32 = _mm512_cvtepi64_epi32(ik);
+            __m512d k = _mm512_cvtepi32_pd(ik_32);
+            __m512d tt = _mm512_fmadd_pd(k, mLN2HI, c_hi);
+            __m512d tt2 = _mm512_fmadd_pd(k, mLN2LO, c_lo);
+            tt = _mm512_add_pd(tt, tt2);
+            res = _mm512_add_pd(tt, res);
+
+            /* return special cases */
+            res = avx512_set_masked_lanes_pd(res, mNan, nan_mask);
+            res = avx512_set_masked_lanes_pd(res, mNeg_nan, negx_mask);
+            res = avx512_set_masked_lanes_pd(res, mNeg_inf, zero_mask);
+            res = avx512_set_masked_lanes_pd(res, mInf, inf_mask);
+
+            _mm512_mask_storeu_pd(op, load_mask, res);
+        }
+
+        /* call glibc's log func when x around 1.0f */
+        if (glibc_mask != 0) {
+            double NPY_DECL_ALIGNED(64) ip_fback[8];
+            _mm512_store_pd(ip_fback, x_in);
+
+            for (int ii = 0; ii < 8; ++ii, glibc_mask >>= 1) {
+                if (glibc_mask & 0x01) {
+                    op[ii] = npy_log(ip_fback[ii]);
+                }
+            }
+        }
+        ip += num_lanes * stride;
+        op += num_lanes;
+        num_remaining_elements -= num_lanes;
+    }
+
+    if (npyv_tobits_b64(invalid_mask)) {
+        npy_set_floatstatus_invalid();
+    }
+    if (npyv_tobits_b64(divide_by_zero_mask)) {
+        npy_set_floatstatus_divbyzero();
+    }
+}
+
+#undef WORKAROUND_LLVM__mm512_mask_mul_pd
+
+#endif // SIMD_AVX512F_NOCLANG_BUG
+#endif // NPY_CAN_LINK_SVML
+
+#ifdef SIMD_AVX512_SKX
+#line 1125
+static inline void
+AVX512_SKX_ldexp_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{   /* AVX512 ldexp for npy_float: op = ip1 * 2^ip2 (via scalef), 16 lanes per pass. */
+    const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(npy_float);  /* steps[] -> element units */
+    const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(int);
+    const npy_intp stride_op = steps[2]/(npy_intp)sizeof(npy_float);
+    const npy_intp array_size = dimensions[0];
+    npy_intp num_remaining_elements = array_size;
+    npy_float* ip1 = (npy_float*) args[0];  /* values to be scaled */
+    int* ip2 = (int*) args[1];              /* integer exponents */
+    npy_float* op  = (npy_float*) args[2];  /* results */
+    /* Replaced by a partial mask once fewer than 16 elements remain (see loop). */
+    __mmask16 load_mask = avx512_get_full_load_mask_ps();
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
+     */
+    /* Per-lane element offsets (lane * stride) for the strided gathers/scatter. */
+    npy_int32 index_ip1[16], index_ip2[16], index_op[16];
+    for (npy_int32 ii = 0; ii < 16; ii++) {
+        index_ip1[ii] = ii*stride_ip1;
+        index_ip2[ii] = ii*stride_ip2;
+        index_op[ii] = ii*stride_op;
+    }
+    __m512i vindex_ip1 = _mm512_loadu_si512((__m512i*)&index_ip1[0]);
+    __m512i vindex_ip2 = _mm512_loadu_si512((__m512i*)&index_ip2[0]);
+    __m512i vindex_op  = _mm512_loadu_si512((__m512i*)&index_op[0]);
+    __m512 zeros_f = _mm512_setzero_ps();    /* passed to the gather helper; presumably fills masked-off lanes */
+    __m512i zeros = _mm512_setzero_epi32();  /* source value for lanes the masked int gather skips */
+    /* Main loop: one vector of up to 16 elements per iteration. */
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < 16) {  /* tail iteration */
+            load_mask = avx512_get_partial_load_mask_ps(
+                                    num_remaining_elements, 16);
+        }
+        __m512 x1;
+        __m512i x2;
+        if (stride_ip1 == 1) {  /* contiguous input: masked load */
+            x1 = avx512_masked_load_ps(load_mask, ip1);
+        }
+        else {  /* strided input: masked gather through the precomputed offsets */
+            x1 = avx512_masked_gather_ps(zeros_f, ip1, vindex_ip1, load_mask);
+        }
+        if (stride_ip2 == 1) {
+            x2 = _mm512_maskz_loadu_epi32(load_mask, ip2);
+        }
+        else {
+            x2 = _mm512_mask_i32gather_epi32(zeros, load_mask, vindex_ip2, ip2, 4);
+        }
+        /* scalef takes its exponent as float, so the int32 exponents are converted first. */
+        __m512 out = _mm512_scalef_ps(x1, _mm512_cvtepi32_ps(x2));
+        /* Write back only the lanes enabled in load_mask. */
+        if (stride_op == 1) {
+            _mm512_mask_storeu_ps(op, load_mask, out);
+        }
+        else {
+            /* strided output: masked scatter */
+            _mm512_mask_i32scatter_ps(op, load_mask, vindex_op, out, 4);
+        }
+        /* Step every pointer forward by one vector (16 elements at its own stride). */
+        ip1 += 16*stride_ip1;
+        ip2 += 16*stride_ip2;
+        op += 16*stride_op;
+        num_remaining_elements -= 16;
+    }
+}
+
+static inline void
+AVX512_SKX_frexp_FLOAT(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{   /* AVX512 frexp for npy_float: op1 = mantissa, op2 = int exponent, 16 lanes per pass. */
+    const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(npy_float);  /* steps[] -> element units */
+    const npy_intp stride_op1 = steps[1]/(npy_intp)sizeof(npy_float);
+    const npy_intp stride_op2 = steps[2]/(npy_intp)sizeof(int);
+    const npy_intp array_size = dimensions[0];
+    npy_intp num_remaining_elements = array_size;
+    npy_float* ip1 = (npy_float*) args[0];   /* input values */
+    npy_float* op1  = (npy_float*) args[1];  /* mantissa output */
+    int* op2 = (int*) args[2];               /* exponent output */
+    /* Replaced by a partial mask once fewer than 16 elements remain (see loop). */
+    __mmask16 load_mask = avx512_get_full_load_mask_ps();
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function; the dispatching
+     * caller checks IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP before calling.
+     */
+    /* Per-lane element offsets (lane * stride) for the strided gather/scatters. */
+    npy_int32 index_ip1[16], index_op1[16], index_op2[16];
+    for (npy_int32 ii = 0; ii < 16; ii++) {
+        index_ip1[ii] = ii*stride_ip1;
+        index_op1[ii] = ii*stride_op1;
+        index_op2[ii] = ii*stride_op2;
+    }
+    __m512i vindex_ip1 = _mm512_loadu_si512((__m512i*)&index_ip1[0]);
+    __m512i vindex_op1 = _mm512_loadu_si512((__m512i*)&index_op1[0]);
+    __m512i vindex_op2 = _mm512_loadu_si512((__m512i*)&index_op2[0]);
+    __m512 zeros_f = _mm512_setzero_ps();  /* passed to the gather helper; presumably fills masked-off lanes */
+    /* Main loop: one vector of up to 16 elements per iteration. */
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < 16) {  /* tail iteration */
+            load_mask = avx512_get_partial_load_mask_ps(
+                                    num_remaining_elements, 16);
+        }
+        __m512 x1;
+        if (stride_ip1 == 1) {  /* contiguous input: masked load */
+            x1 = avx512_masked_load_ps(load_mask, ip1);
+        }
+        else {  /* strided input: masked gather through the precomputed offsets */
+            x1 = avx512_masked_gather_ps(zeros_f, ip1, vindex_ip1, load_mask);
+        }
+
+        /*
+         * The x86 instructions vpgetmant and vpgetexp do not conform
+         * with NumPy's output for special floating points: NAN, +/-INF, +/-0.0
+         * We mask these values with spmask to avoid invalid exceptions.
+         */
+        __mmask16 spmask =_mm512_knot(_mm512_fpclass_ps_mask(
+                                                x1, 0b10011111));  /* set for lanes that are NOT NaN/Inf/zero */
+        __m512 out1 = _mm512_maskz_getmant_ps(
+                                spmask, x1, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);  /* |mantissa| in [0.5, 1), sign of x1 */
+        out1 = _mm512_mask_mov_ps(x1, spmask, out1);  /* special lanes return x1 unchanged */
+        __m512i out2 = _mm512_cvtps_epi32(
+                            _mm512_maskz_add_ps(spmask, _mm512_set1_ps(1.0),
+                                _mm512_maskz_getexp_ps(spmask, x1)));  /* getexp + 1; special lanes yield 0 */
+        if (stride_op1 == 1) {
+            _mm512_mask_storeu_ps(op1, load_mask, out1);
+        }
+        else {
+            _mm512_mask_i32scatter_ps(op1, load_mask, vindex_op1, out1, 4);
+        }
+        if (stride_op2 == 1) {
+            _mm512_mask_storeu_epi32(op2, load_mask, out2);
+        }
+        else {
+            _mm512_mask_i32scatter_epi32(op2, load_mask, vindex_op2, out2, 4);
+        }
+        /* Step every pointer forward by one vector (16 elements at its own stride). */
+        ip1 += 16*stride_ip1;
+        op1 += 16*stride_op1;
+        op2 += 16*stride_op2;
+        num_remaining_elements -= 16;
+    }
+}
+
+#line 1125
+static inline void
+AVX512_SKX_ldexp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{   /* AVX512 ldexp for npy_double: op = ip1 * 2^ip2 (via scalef), 8 lanes per pass. */
+    const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(npy_double);  /* steps[] -> element units */
+    const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(int);
+    const npy_intp stride_op = steps[2]/(npy_intp)sizeof(npy_double);
+    const npy_intp array_size = dimensions[0];
+    npy_intp num_remaining_elements = array_size;
+    npy_double* ip1 = (npy_double*) args[0];  /* values to be scaled */
+    int* ip2 = (int*) args[1];                /* integer exponents */
+    npy_double* op  = (npy_double*) args[2];  /* results */
+    /* Replaced by a partial mask once fewer than 8 elements remain (see loop). */
+    __mmask8 load_mask = avx512_get_full_load_mask_pd();
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP
+     */
+    /* Per-lane element offsets (lane * stride); 8 x int32 fit in a 256-bit vector. */
+    npy_int32 index_ip1[8], index_ip2[8], index_op[8];
+    for (npy_int32 ii = 0; ii < 8; ii++) {
+        index_ip1[ii] = ii*stride_ip1;
+        index_ip2[ii] = ii*stride_ip2;
+        index_op[ii] = ii*stride_op;
+    }
+    __m256i vindex_ip1 = _mm256_loadu_si256((__m256i*)&index_ip1[0]);
+    __m256i vindex_ip2 = _mm256_loadu_si256((__m256i*)&index_ip2[0]);
+    __m256i vindex_op  = _mm256_loadu_si256((__m256i*)&index_op[0]);
+    __m512d zeros_f = _mm512_setzero_pd();   /* passed to the gather helper; presumably fills masked-off lanes */
+    __m256i zeros = _mm256_setzero_si256();  /* source value for lanes the masked int gather skips */
+    /* Main loop: one vector of up to 8 elements per iteration. */
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < 8) {  /* tail iteration */
+            load_mask = avx512_get_partial_load_mask_pd(
+                                    num_remaining_elements, 8);
+        }
+        __m512d x1;
+        __m256i x2;
+        if (stride_ip1 == 1) {  /* contiguous input: masked load */
+            x1 = avx512_masked_load_pd(load_mask, ip1);
+        }
+        else {  /* strided input: masked gather through the precomputed offsets */
+            x1 = avx512_masked_gather_pd(zeros_f, ip1, vindex_ip1, load_mask);
+        }
+        if (stride_ip2 == 1) {
+            x2 = _mm256_maskz_loadu_epi32(load_mask, ip2);
+        }
+        else {
+            x2 = _mm256_mmask_i32gather_epi32(zeros, load_mask, vindex_ip2, ip2, 4);
+        }
+        /* scalef takes its exponent as double, so the 8 int32 exponents are widened first. */
+        __m512d out = _mm512_scalef_pd(x1, _mm512_cvtepi32_pd(x2));
+        /* Write back only the lanes enabled in load_mask. */
+        if (stride_op == 1) {
+            _mm512_mask_storeu_pd(op, load_mask, out);
+        }
+        else {
+            /* strided output: masked scatter */
+            _mm512_mask_i32scatter_pd(op, load_mask, vindex_op, out, 8);
+        }
+        /* Step every pointer forward by one vector (8 elements at its own stride). */
+        ip1 += 8*stride_ip1;
+        ip2 += 8*stride_ip2;
+        op += 8*stride_op;
+        num_remaining_elements -= 8;
+    }
+}
+
+static inline void
+AVX512_SKX_frexp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{   /* AVX512 frexp for npy_double: op1 = mantissa, op2 = int exponent, 8 lanes per pass. */
+    const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(npy_double);  /* steps[] -> element units */
+    const npy_intp stride_op1 = steps[1]/(npy_intp)sizeof(npy_double);
+    const npy_intp stride_op2 = steps[2]/(npy_intp)sizeof(int);
+    const npy_intp array_size = dimensions[0];
+    npy_intp num_remaining_elements = array_size;
+    npy_double* ip1 = (npy_double*) args[0];   /* input values */
+    npy_double* op1  = (npy_double*) args[1];  /* mantissa output */
+    int* op2 = (int*) args[2];                 /* exponent output */
+    /* Replaced by a partial mask once fewer than 8 elements remain (see loop). */
+    __mmask8 load_mask = avx512_get_full_load_mask_pd();
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function; the dispatching
+     * caller checks IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP before calling.
+     */
+    /* Per-lane element offsets (lane * stride); 8 x int32 fit in a 256-bit vector. */
+    npy_int32 index_ip1[8], index_op1[8], index_op2[8];
+    for (npy_int32 ii = 0; ii < 8; ii++) {
+        index_ip1[ii] = ii*stride_ip1;
+        index_op1[ii] = ii*stride_op1;
+        index_op2[ii] = ii*stride_op2;
+    }
+    __m256i vindex_ip1 = _mm256_loadu_si256((__m256i*)&index_ip1[0]);
+    __m256i vindex_op1 = _mm256_loadu_si256((__m256i*)&index_op1[0]);
+    __m256i vindex_op2 = _mm256_loadu_si256((__m256i*)&index_op2[0]);
+    __m512d zeros_f = _mm512_setzero_pd();  /* passed to the gather helper; presumably fills masked-off lanes */
+    /* Main loop: one vector of up to 8 elements per iteration. */
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < 8) {  /* tail iteration */
+            load_mask = avx512_get_partial_load_mask_pd(
+                                    num_remaining_elements, 8);
+        }
+        __m512d x1;
+        if (stride_ip1 == 1) {  /* contiguous input: masked load */
+            x1 = avx512_masked_load_pd(load_mask, ip1);
+        }
+        else {  /* strided input: masked gather through the precomputed offsets */
+            x1 = avx512_masked_gather_pd(zeros_f, ip1, vindex_ip1, load_mask);
+        }
+
+        /*
+         * The x86 instructions vpgetmant and vpgetexp do not conform
+         * with NumPy's output for special floating points: NAN, +/-INF, +/-0.0
+         * We mask these values with spmask to avoid invalid exceptions.
+         */
+        __mmask8 spmask =_mm512_knot(_mm512_fpclass_pd_mask(
+                                                x1, 0b10011111));  /* set for lanes that are NOT NaN/Inf/zero */
+        __m512d out1 = _mm512_maskz_getmant_pd(
+                                spmask, x1, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);  /* |mantissa| in [0.5, 1), sign of x1 */
+        out1 = _mm512_mask_mov_pd(x1, spmask, out1);  /* special lanes return x1 unchanged */
+        __m256i out2 = _mm512_cvtpd_epi32(
+                            _mm512_maskz_add_pd(spmask, _mm512_set1_pd(1.0),
+                                _mm512_maskz_getexp_pd(spmask, x1)));  /* getexp + 1; special lanes yield 0 */
+        if (stride_op1 == 1) {
+            _mm512_mask_storeu_pd(op1, load_mask, out1);
+        }
+        else {
+            _mm512_mask_i32scatter_pd(op1, load_mask, vindex_op1, out1, 8);
+        }
+        if (stride_op2 == 1) {
+            _mm256_mask_storeu_epi32(op2, load_mask, out2);
+        }
+        else {
+            _mm256_mask_i32scatter_epi32(op2, load_mask, vindex_op2, out2, 4);
+        }
+        /* Step every pointer forward by one vector (8 elements at its own stride). */
+        ip1 += 8*stride_ip1;
+        op1 += 8*stride_op1;
+        op2 += 8*stride_op2;
+        num_remaining_elements -= 8;
+    }
+}
+
+#endif // SIMD_AVX512_SKX
+
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+#line 1281
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_exp)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* ufunc inner loop: npy_float exp; args[0] is the input, args[1] the output. */
+#if defined(SIMD_AVX2_FMA3) || defined(SIMD_AVX512F)  /* a SIMD exp kernel is available */
+    // third arg in `IS_OUTPUT_BLOCKABLE_UNARY` is dummy
+    // TODO: get rid of this macro during the move to NPYV
+    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), sizeof(npy_float), 64)) {
+        simd_exp_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]);  /* (out, in, count, input step) */
+    }
+    else {
+        UNARY_LOOP {
+            /*
+             * We use the AVX function to compute exp/log for scalar elements as well.
+             * This is needed to ensure the output of strided and non-strided
+             * cases match. SIMD code handles strided input cases, but not
+             * strided output.
+             */
+            simd_exp_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0]);  /* one element per call */
+        }
+    }
+#else  /* no SIMD kernel: scalar libm-style fallback */
+    UNARY_LOOP {
+        const npy_float in1 = *(npy_float *)ip1;
+        *(npy_float *)op1 = npy_expf(in1);
+    }
+#endif  /* SIMD_AVX2_FMA3 || SIMD_AVX512F */
+}
+
+#line 1281
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_log)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* ufunc inner loop: npy_float log; args[0] is the input, args[1] the output. */
+#if defined(SIMD_AVX2_FMA3) || defined(SIMD_AVX512F)  /* a SIMD log kernel is available */
+    // third arg in `IS_OUTPUT_BLOCKABLE_UNARY` is dummy
+    // TODO: get rid of this macro during the move to NPYV
+    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_float), sizeof(npy_float), 64)) {
+        simd_log_FLOAT((npy_float*)args[1], (npy_float*)args[0], dimensions[0], steps[0]);  /* (out, in, count, input step) */
+    }
+    else {
+        UNARY_LOOP {
+            /*
+             * We use the AVX function to compute exp/log for scalar elements as well.
+             * This is needed to ensure the output of strided and non-strided
+             * cases match. SIMD code handles strided input cases, but not
+             * strided output.
+             */
+            simd_log_FLOAT((npy_float *)op1, (npy_float *)ip1, 1, steps[0]);  /* one element per call */
+        }
+    }
+#else  /* no SIMD kernel: scalar libm-style fallback */
+    UNARY_LOOP {
+        const npy_float in1 = *(npy_float *)ip1;
+        *(npy_float *)op1 = npy_logf(in1);
+    }
+#endif  /* SIMD_AVX2_FMA3 || SIMD_AVX512F */
+}
+
+
+#line 1314
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_exp)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* ufunc inner loop: npy_double exp; tries a SIMD path first, then a scalar loop. */
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)  /* path 1: simd_exp_f64 */
+    const npy_double *src = (npy_double*)args[0];
+    npy_double *dst = (npy_double*)args[1];
+    const int lsize = sizeof(src[0]);
+    const npy_intp ssrc = steps[0] / lsize;  /* input stride in elements */
+    const npy_intp sdst = steps[1] / lsize;  /* output stride in elements */
+    const npy_intp len = dimensions[0];
+    assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);  /* steps must be whole elements */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+            npyv_loadable_stride_f64(ssrc) &&
+            npyv_storable_stride_f64(sdst)) {
+        simd_exp_f64(src, ssrc, dst, sdst, len);
+        return;
+    }
+#else  /* path 2: the AVX512F kernel, when it was compiled in */
+#ifdef SIMD_AVX512F_NOCLANG_BUG
+    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), sizeof(npy_double), 64)) {
+        AVX512F_exp_DOUBLE((npy_double*)args[1], (npy_double*)args[0], dimensions[0], steps[0]);  /* (out, in, count, input step) */
+        return;
+    }
+#endif // SIMD_AVX512F_NOCLANG_BUG
+#endif // NPY_SIMD && NPY_HAVE_AVX512_SKX && NPY_CAN_LINK_SVML
+    UNARY_LOOP {  /* scalar fallback, reached when no SIMD path above returned */
+        const npy_double in1 = *(npy_double *)ip1;
+        *(npy_double *)op1 = npy_exp(in1);
+    }
+}
+
+
+#line 1314
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_log)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{   /* ufunc inner loop: npy_double log; tries a SIMD path first, then a scalar loop. */
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)  /* path 1: simd_log_f64 */
+    const npy_double *src = (npy_double*)args[0];
+    npy_double *dst = (npy_double*)args[1];
+    const int lsize = sizeof(src[0]);
+    const npy_intp ssrc = steps[0] / lsize;  /* input stride in elements */
+    const npy_intp sdst = steps[1] / lsize;  /* output stride in elements */
+    const npy_intp len = dimensions[0];
+    assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);  /* steps must be whole elements */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+            npyv_loadable_stride_f64(ssrc) &&
+            npyv_storable_stride_f64(sdst)) {
+        simd_log_f64(src, ssrc, dst, sdst, len);
+        return;
+    }
+#else  /* path 2: the AVX512F kernel, when it was compiled in */
+#ifdef SIMD_AVX512F_NOCLANG_BUG
+    if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), sizeof(npy_double), 64)) {
+        AVX512F_log_DOUBLE((npy_double*)args[1], (npy_double*)args[0], dimensions[0], steps[0]);  /* (out, in, count, input step) */
+        return;
+    }
+#endif // SIMD_AVX512F_NOCLANG_BUG
+#endif // NPY_SIMD && NPY_HAVE_AVX512_SKX && NPY_CAN_LINK_SVML
+    UNARY_LOOP {  /* scalar fallback, reached when no SIMD path above returned */
+        const npy_double in1 = *(npy_double *)ip1;
+        *(npy_double *)op1 = npy_log(in1);
+    }
+}
+
+
+
+#line 1354
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_frexp)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* ufunc inner loop: npy_float frexp with two outputs (mantissa, int exponent). */
+#ifdef SIMD_AVX512_SKX  /* vector path, taken only when the step/overlap check passes */
+    if (IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP) {
+        AVX512_SKX_frexp_FLOAT(args, dimensions, steps);
+        return;
+    }
+#endif  /* SIMD_AVX512_SKX */
+    UNARY_LOOP_TWO_OUT {  /* scalar fallback */
+        const npy_float in1 = *(npy_float *)ip1;
+        *((npy_float *)op1) = npy_frexpf(in1, (int *)op2);  /* mantissa -> op1, exponent -> op2 */
+    }
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_ldexp)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* ufunc inner loop: ldexp on an npy_float value and an int exponent. */
+#ifdef SIMD_AVX512_SKX  /* vector path, taken only when the step/overlap check passes */
+    if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
+        AVX512_SKX_ldexp_FLOAT(args, dimensions, steps);
+        return;
+    }
+#endif  /* SIMD_AVX512_SKX */
+    BINARY_LOOP {  /* scalar fallback */
+        const npy_float in1 = *(npy_float *)ip1;
+        const int in2 = *(int *)ip2;  /* second operand is read as a C int */
+        *((npy_float *)op1) = npy_ldexpf(in1, in2);
+    }
+}
+
+#line 1354
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_frexp)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* ufunc inner loop: npy_double frexp with two outputs (mantissa, int exponent). */
+#ifdef SIMD_AVX512_SKX  /* vector path, taken only when the step/overlap check passes */
+    if (IS_UNARY_TWO_OUT_SMALL_STEPS_AND_NOMEMOVERLAP) {
+        AVX512_SKX_frexp_DOUBLE(args, dimensions, steps);
+        return;
+    }
+#endif  /* SIMD_AVX512_SKX */
+    UNARY_LOOP_TWO_OUT {  /* scalar fallback */
+        const npy_double in1 = *(npy_double *)ip1;
+        *((npy_double *)op1) = npy_frexp(in1, (int *)op2);  /* mantissa -> op1, exponent -> op2 */
+    }
+}
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_ldexp)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* ufunc inner loop: ldexp on an npy_double value and an int exponent. */
+#ifdef SIMD_AVX512_SKX  /* vector path, taken only when the step/overlap check passes */
+    if (IS_BINARY_SMALL_STEPS_AND_NOMEMOVERLAP) {
+        AVX512_SKX_ldexp_DOUBLE(args, dimensions, steps);
+        return;
+    }
+#endif  /* SIMD_AVX512_SKX */
+    BINARY_LOOP {  /* scalar fallback */
+        const npy_double in1 = *(npy_double *)ip1;
+        const int in2 = *(int *)ip2;  /* second operand is read as a C int */
+        *((npy_double *)op1) = npy_ldexp(in1, in2);
+    }
+}
+
+
diff --git a/numpy/core/src/_generated/loops_hyperbolic.dispatch.c b/numpy/core/src/_generated/loops_hyperbolic.dispatch.c
new file mode 100644
index 000000000000..be0275968739
--- /dev/null
+++ b/numpy/core/src/_generated/loops_hyperbolic.dispatch.c
@@ -0,0 +1,769 @@
+#line 1 "numpy/core/src/umath/loops_hyperbolic.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*@targets
+ ** $maxopt baseline
+ ** (avx2 fma3) AVX512_SKX
+ ** vsx2 vsx4
+ ** neon_vfpv4
+ ** vx vxe
+ **/
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+
+#if NPY_SIMD_FMA3 // native support
+/*
+ * NOTE: The following implementations of tanh (f32, f64) have been converted
+ * from Intel SVML to universal intrinsics; the original code can be found in:
+ *
+ * - https://github.com/numpy/SVML/blob/main/linux/avx512/svml_z0_tanh_d_la.s
+ * - https://github.com/numpy/SVML/blob/main/linux/avx512/svml_z0_tanh_s_la.s
+ *
+ * ALGORITHM DESCRIPTION:
+ *
+ *   NOTE: Since the hyperbolic tangent function is odd
+ *         (tanh(x) = -tanh(-x)), below algorithm deals with the absolute
+ *         value of the argument |x|: tanh(x) = sign(x) * tanh(|x|)
+ *
+ *   We use a table lookup method to compute tanh(|x|).
+ *   The basic idea is to split the input range into a number of subintervals
+ *   and to approximate tanh(.) with a polynomial on each of them.
+ *
+ *   IEEE SPECIAL CONDITIONS:
+ *   x = [+,-]0, r = [+,-]0
+ *   x = +Inf,   r = +1
+ *   x = -Inf,   r = -1
+ *   x = QNaN,   r = QNaN
+ *   x = SNaN,   r = QNaN
+ *
+ *
+ *  ALGORITHM DETAILS
+ *
+ *  SVML handles |x| > HUGE_THRESHOLD, INF and NaNs via a scalar callout as follows:
+ *  1. check special cases
+ *  2. return `+-1` for `|x| > HUGE_THRESHOLD`, otherwise return `x`
+ *
+ *  The reason for using a callout instead of AVX512 directly for
+ *  single-precision was not clear to us.
+ *  We found it better to handle these cases with SIMD rather than follow SVML.
+ *
+ *  Main path computations are organized as follows:
+ *  Actually we split the interval [0, SATURATION_THRESHOLD)
+ *  into a number of subintervals.  On each subinterval we approximate tanh(.)
+ *   with a minimax polynomial of pre-defined degree. Polynomial coefficients
+ *  are computed beforehand and stored in table. We also use
+ *
+ *       y := |x| + B,   (simd_tanh_f64 below computes |x| - b, so its table holds b = -B)
+ *
+ *  here B depends on subinterval and is used to make argument
+ *   closer to zero.
+ *   We also add large fake interval [SATURATION_THRESHOLD, HUGE_THRESHOLD],
+ *   where 1.0 + 0.0*y + 0.0*y^2 ... coefficients are stored - just to
+ *   preserve main path computation logic but return 1.0 for all arguments.
+ *
+ *   Hence reconstruction looks as follows:
+ *   we extract proper polynomial and range reduction coefficients
+ *        (Pj and B), corresponding to subinterval, to which |x| belongs,
+ *        and return
+ *
+ *       r := sign(x) * (P0 + P1 * y + ... + Pn * y^n)
+ *
+ *   NOTE: we use multiprecision technique to multiply and sum the first
+ *         K terms of the polynomial. So Pj, j = 0..K are stored in
+ *         table each as a pair of target precision numbers (Pj and PLj) to
+ *         achieve wider than target precision.
+ *
+ */
+#if NPY_SIMD_F64
+
+    // For architectures without efficient gather / scatter instructions, it is
+    // better to use a transposed LUT where we can load all coefficients for an
+    // index linearly.  In order to keep the same vertical calculation, we
+    // transpose the coef. into lanes.  Only a 2-lane transpose is implemented,
+    // so we require `npyv_nlanes_f64` == 2.
+    //
+    // TANH_TRANSPOSED_LUT selects both the table layout and the coefficient
+    // loading code inside simd_tanh_f64; when it is not defined, the
+    // npyv_lut16_f64-based path is compiled instead.
+    #if npyv_nlanes_f64 == 2
+    #define TANH_TRANSPOSED_LUT
+    #endif // npyv_nlanes_f64 == 2
+
+/*
+ * Vectorized double-precision tanh.
+ *
+ * Parameters:
+ *   src, ssrc - input pointer and its stride, counted in doubles (the pointer
+ *               is advanced by ssrc * nlanes per iteration); ssrc == 1 takes
+ *               the contiguous load path.
+ *   dst, sdst - output pointer and its stride, counted in doubles; sdst == 1
+ *               takes the contiguous store path.
+ *   len       - number of elements still to process; it is decremented by
+ *               npyv_nlanes_f64 per iteration and passed to the *_tillz /
+ *               *_till load/store variants (presumably so the final partial
+ *               vector does not touch memory past the end - confirm against
+ *               the npyv definitions).
+ *
+ * Per vector of inputs:
+ *   1. A table index in [0, 15] is derived from the exponent and top mantissa
+ *      bit of x.
+ *   2. The 18 values b, c0..c16 for that index are fetched from the LUT.
+ *   3. With y = |x| - b, the degree-16 polynomial c0 + c1*y + ... + c16*y^16
+ *      is evaluated by a chain of npyv_muladd_f64 calls.
+ *   4. Lanes outside the table range get 1.0, the sign bit of x is OR-ed back
+ *      in, and lanes flagged as NaN by npyv_notnan_f64 are replaced with NaN.
+ */
+static void
+simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len)
+{
+#if defined(TANH_TRANSPOSED_LUT)
+    // Transposed layout: 16 entries (one per table index), each storing its
+    // 18 values (b, c0 .. c16) contiguously, so that the coefficients of one
+    // index can be read with consecutive vector loads at `index * 18 + k`.
+    static const npy_uint64 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) lut18x16[] = {
+        // 0
+        0x0ull,                0x0ull,                0x3ff0000000000000ull, 0xbbf0b3ea3fdfaa19ull, // b,   c0,  c1,  c2
+        0xbfd5555555555555ull, 0xbce6863ee44ed636ull, 0x3fc1111111112ab5ull, 0xbda1ea19ddddb3b4ull, // c3,  c4,  c5,  c6
+        0xbfaba1ba1990520bull, 0xbe351ca7f096011full, 0x3f9664f94e6ac14eull, 0xbea8c4c1fd7852feull, // c7,  c8,  c9,  c10
+        0xbf822404577aa9ddull, 0xbefdd99a221ed573ull, 0x3f6e3be689423841ull, 0xbf2a1306713a4f3aull, // c11, c12, c13, c14
+        0xbf55d7e76dc56871ull, 0x3f35e67ab76a26e7ull,                                               // c15, c16
+        // 1
+        0x3fcc000000000000ull, 0x3fcb8fd0416a7c92ull, 0x3fee842ca3f08532ull, 0xbfca48aaeb53bc21ull,
+        0xbfd183afc292ba11ull, 0x3fc04dcd0476c75eull, 0x3fb5c19efdfc08adull, 0xbfb0b8df995ce4dfull,
+        0xbf96e37bba52f6fcull, 0x3f9eaaf3320c3851ull, 0xbf94d3343bae39ddull, 0xbfccce16b1046f13ull,
+        0x403d8b07f7a82aa3ull, 0x4070593a3735bab4ull, 0xc0d263511f5baac1ull, 0xc1045e509116b066ull,
+        0x41528c38809c90c7ull, 0x41848ee0627d8206ull,
+        // 2
+        0x3fd4000000000000ull, 0x3fd35f98a0ea650eull, 0x3fed11574af58f1bull, 0xbfd19921f4329916ull,
+        0xbfcc1a4b039c9bfaull, 0x3fc43d3449a80f08ull, 0x3fa74c98dc34fbacull, 0xbfb2955cf41e8164ull,
+        0x3ecff7df18455399ull, 0x3f9cf823fe761fc1ull, 0xbf7bc748e60df843ull, 0xbf81a16f224bb7b6ull,
+        0xbf9f44ab92fbab0aull, 0xbfccab654e44835eull, 0x40169f73b15ebe5cull, 0x4041fab9250984ceull,
+        0xc076d57fb5190b02ull, 0xc0a216d618b489ecull,
+        // 3
+        0x3fdc000000000000ull, 0x3fda5729ee488037ull, 0x3fea945b9c24e4f9ull, 0xbfd5e0f09bef8011ull,
+        0xbfc16e1e6d8d0be6ull, 0x3fc5c26f3699b7e7ull, 0xbf790d6a8eff0a77ull, 0xbfaf9d05c309f7c6ull,
+        0x3f97362834d33a4eull, 0x3f9022271754ff1full, 0xbf8c89372b43ba85ull, 0xbf62cbf00406bc09ull,
+        0x3fb2eac604473d6aull, 0x3fd13ed80037dbacull, 0xc025c1dd41cd6cb5ull, 0xc0458d090ec3de95ull,
+        0x4085f09f888f8adaull, 0x40a5b89107c8af4full,
+        // 4
+        0x3fe4000000000000ull, 0x3fe1bf47eabb8f95ull, 0x3fe6284c3374f815ull, 0xbfd893b59c35c882ull, // b,   c0,  c1,  c2
+        0xbf92426c751e48a2ull, 0x3fc1a686f6ab2533ull, 0xbfac3c021789a786ull, 0xbf987d27ccff4291ull, // c3,  c4,  c5,  c6
+        0x3f9e7f8380184b45ull, 0xbf731fe77c9c60afull, 0xbf8129a092de747aull, 0x3f75b29bb02cf69bull, // c7,  c8,  c9,  c10
+        0x3f45f87d903aaac8ull, 0xbf6045b9076cc487ull, 0xbf58fd89fe05e0d1ull, 0xbf74949d60113d63ull, // c11, c12, c13, c14
+        0x3fa246332a2fcba5ull, 0x3fb69d8374520edaull,                                               // c15, c16
+        // 5
+        0x3fec000000000000ull, 0x3fe686650b8c2015ull, 0x3fe02500a09f8d6eull, 0xbfd6ba7cb7576538ull,
+        0x3fb4f152b2bad124ull, 0x3faf203c316ce730ull, 0xbfae2196b7326859ull, 0x3f8b2ca62572b098ull,
+        0x3f869543e7c420d4ull, 0xbf84a6046865ec7dull, 0x3f60c85b4d538746ull, 0x3f607df0f9f90c17ull,
+        0xbf5e104671036300ull, 0x3f2085ee7e8ac170ull, 0x3f73f7af01d5af7aull, 0x3f7c9fd6200d0adeull,
+        0xbfb29d851a896fcdull, 0xbfbded519f981716ull,
+        // 6
+        0x3ff4000000000000ull, 0x3feb2523bb6b2deeull, 0x3fd1f25131e3a8c0ull, 0xbfce7291743d7555ull,
+        0x3fbbba40cbef72beull, 0xbf89c7a02788557cull, 0xbf93a7a011ff8c2aull, 0x3f8f1cf6c7f5b00aull,
+        0xbf7326bd4914222aull, 0xbf4ca3f1f2b9192bull, 0x3f5be9392199ec18ull, 0xbf4b852a6e0758d5ull,
+        0x3f19bc98ddf0f340ull, 0x3f23524622610430ull, 0xbf1e40bdead17e6bull, 0x3f02cd40e0ad0a9full,
+        0x3ed9065ae369b212ull, 0xbef02d288b5b3371ull,
+        // 7
+        0x3ffc000000000000ull, 0x3fee1fbf97e33527ull, 0x3fbd22ca1c24a139ull, 0xbfbb6d85a01efb80ull,
+        0x3fb01ba038be6a3dull, 0xbf98157e26e0d541ull, 0x3f6e4709c7e8430eull, 0x3f60379811e43dd5ull,
+        0xbf5fc15b0a9d98faull, 0x3f4c77dee0afd227ull, 0xbf2a0c68a4489f10ull, 0xbf0078c63d1b8445ull,
+        0x3f0d4304bc9246e8ull, 0xbeff12a6626911b4ull, 0x3ee224cd6c4513e5ull, 0xbe858ab8e019f311ull,
+        0xbeb8e1ba4c98a030ull, 0x3eb290981209c1a6ull,
+        // 8
+        0x4004000000000000ull, 0x3fef9258260a71c2ull, 0x3f9b3afe1fba5c76ull, 0xbf9addae58c7141aull, // b,   c0,  c1,  c2
+        0x3f916df44871efc8ull, 0xbf807b55c1c7d278ull, 0x3f67682afa611151ull, 0xbf4793826f78537eull, // c3,  c4,  c5,  c6
+        0x3f14cffcfa69fbb6ull, 0x3f04055bce68597aull, 0xbf00462601dc2faaull, 0x3eec12eadd55be7aull, // c7,  c8,  c9,  c10
+        0xbed13c415f7b9d41ull, 0x3eab9008bca408afull, 0xbe24b645e68eeaa3ull, 0xbe792fa6323b7cf8ull, // c11, c12, c13, c14
+        0x3e6ffd0766ad4016ull, 0xbe567e924bf5ff6eull,                                               // c15, c16
+        // 9
+        0x400c000000000000ull, 0x3feff112c63a9077ull, 0x3f6dd37d19b22b21ull, 0xbf6dc59376c7aa19ull,
+        0x3f63c6869dfc8870ull, 0xbf53a18d5843190full, 0x3f3ef2ee77717cbfull, 0xbf2405695e36240full,
+        0x3f057e48e5b79d10ull, 0xbee2bf0cb4a71647ull, 0x3eb7b6a219dea9f4ull, 0xbe6fa600f593181bull,
+        0xbe722b8d9720cdb0ull, 0x3e634df71865f620ull, 0xbe4abfebfb72bc83ull, 0x3e2df04d67876402ull,
+        0xbe0c63c29f505f5bull, 0x3de3f7f7de6b0eb6ull,
+        // 10
+        0x4014000000000000ull, 0x3fefff419668df11ull, 0x3f27ccec13a9ef96ull, 0xbf27cc5e74677410ull,
+        0x3f1fb9aef915d828ull, 0xbf0fb6bbc89b1a5bull, 0x3ef95a4482f180b7ull, 0xbee0e08de39ce756ull,
+        0x3ec33b66d7d77264ull, 0xbea31eaafe73efd5ull, 0x3e80cbcc8d4c5c8aull, 0xbe5a3c935dce3f7dull,
+        0x3e322666d739bec0ull, 0xbe05bb1bcf83ca73ull, 0x3dd51c38f8695ed3ull, 0xbd95c72be95e4d2cull,
+        0xbd7fab216b9e0e49ull, 0x3d69ed18bae3ebbcull,
+        // 11
+        0x401c000000000000ull, 0x3feffffc832750f2ull, 0x3ecbe6c3f33250aeull, 0xbecbe6c0e8b4cc87ull,
+        0x3ec299d1e27c6e11ull, 0xbeb299c9c684a963ull, 0x3e9dc2c27da3b603ull, 0xbe83d709ba5f714eull,
+        0x3e66ac4e578b9b10ull, 0xbe46abb02c4368edull, 0x3e2425bb231a5e29ull, 0xbe001c6d95e3ae96ull,
+        0x3dd76a553d7e7918ull, 0xbdaf2ac143fb6762ull, 0x3d8313ac38c6832bull, 0xbd55a89c30203106ull,
+        0x3d2826b62056aa27ull, 0xbcf7534c4f3dfa71ull,
+        // 12
+        0x4024000000000000ull, 0x3feffffffdc96f35ull, 0x3e41b4865394f75full, 0xbe41b486526b0565ull, // b,   c0,  c1,  c2
+        0x3e379b5ddcca334cull, 0xbe279b5dd4fb3d01ull, 0x3e12e2afd9f7433eull, 0xbdf92e3fc5ee63e0ull, // c3,  c4,  c5,  c6
+        0x3ddcc74b8d3d5c42ull, 0xbdbcc749ca8079ddull, 0x3d9992a4beac8662ull, 0xbd74755a00ea1fd3ull, // c7,  c8,  c9,  c10
+        0x3d4de0fa59416a39ull, 0xbd23eae52a3dbf57ull, 0x3cf7787935626685ull, 0xbccad6b3bb9eff65ull, // c11, c12, c13, c14
+        0x3ca313e31762f523ull, 0xbc730b73f1eaff20ull,                                               // c15, c16
+        // 13
+        0x402c000000000000ull, 0x3fefffffffffcf58ull, 0x3d8853f01bda5f28ull, 0xbd8853f01bef63a4ull, 
+        0x3d8037f57bc62c9aull, 0xbd7037f57ae72aa6ull, 0x3d59f320348679baull, 0xbd414cc030f2110eull,
+        0x3d23c589137f92b4ull, 0xbd03c5883836b9d2ull, 0x3ce191ba5ed3fb67ull, 0xbcbc1c6c063bb7acull,
+        0x3c948716cf3681b4ull, 0xbc6b5e3e9ca0955eull, 0x3c401ffc49c6bc29ull, 0xbc12705ccd3dd884ull,
+        0x3bea37aa21895319ull, 0xbbba2cff8135d462ull,
+        // 14
+        0x4034000000000000ull, 0x3ff0000000000000ull, 0x3c73953c0197ef58ull, 0xbc73955be519be31ull,
+        0x3c6a2d4b50a2cff7ull, 0xbc5a2ca2bba78e86ull, 0x3c44b61d9bbcc940ull, 0xbc2ba022e8d82a87ull,
+        0x3c107f8e2c8707a1ull, 0xbbf07a5416264aecull, 0x3bc892450bad44c4ull, 0xbba3be9a4460fe00ull,
+        0x3b873f9f2d2fda99ull, 0xbb5eca68e2c1ba2eull, 0xbabf0b21acfa52abull, 0xba8e0a4c47ae75f5ull,
+        0x3ae5c7f1fd871496ull, 0xbab5a71b5f7d9035ull,
+        // 15
+        0x0ull,                0x3ff0000000000000ull, 0x0ull,                0x0ull,                
+        0x0ull,                0x0ull,                0x0ull,                0x0ull,               
+        0x0ull,                0x0ull,                0x0ull,                0x0ull,
+        0x0ull,                0x0ull,                0x0ull,                0x0ull,
+        0x0ull,                0x0ull,               
+    };
+#else
+    // Coefficient-major layout: 18 rows (b, c0 .. c16), each holding 16
+    // values, one per table index.  Row k starts at offset 16 * k and is
+    // looked up with npyv_lut16_f64 below.
+    static const npy_uint64 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) lut16x18[] = {
+        // 0
+        0x0ull,                0x3fcc000000000000ull, 0x3fd4000000000000ull, 0x3fdc000000000000ull,
+        0x3fe4000000000000ull, 0x3fec000000000000ull, 0x3ff4000000000000ull, 0x3ffc000000000000ull,
+        0x4004000000000000ull, 0x400c000000000000ull, 0x4014000000000000ull, 0x401c000000000000ull,
+        0x4024000000000000ull, 0x402c000000000000ull, 0x4034000000000000ull, 0x0ull,
+        // 1
+        0x0ull,                0x3fcb8fd0416a7c92ull, 0x3fd35f98a0ea650eull, 0x3fda5729ee488037ull,
+        0x3fe1bf47eabb8f95ull, 0x3fe686650b8c2015ull, 0x3feb2523bb6b2deeull, 0x3fee1fbf97e33527ull,
+        0x3fef9258260a71c2ull, 0x3feff112c63a9077ull, 0x3fefff419668df11ull, 0x3feffffc832750f2ull,
+        0x3feffffffdc96f35ull, 0x3fefffffffffcf58ull, 0x3ff0000000000000ull, 0x3ff0000000000000ull,
+        // 2
+        0x3ff0000000000000ull, 0x3fee842ca3f08532ull, 0x3fed11574af58f1bull, 0x3fea945b9c24e4f9ull,
+        0x3fe6284c3374f815ull, 0x3fe02500a09f8d6eull, 0x3fd1f25131e3a8c0ull, 0x3fbd22ca1c24a139ull,
+        0x3f9b3afe1fba5c76ull, 0x3f6dd37d19b22b21ull, 0x3f27ccec13a9ef96ull, 0x3ecbe6c3f33250aeull,
+        0x3e41b4865394f75full, 0x3d8853f01bda5f28ull, 0x3c73953c0197ef58ull, 0x0ull,
+        // 3
+        0xbbf0b3ea3fdfaa19ull, 0xbfca48aaeb53bc21ull, 0xbfd19921f4329916ull, 0xbfd5e0f09bef8011ull,
+        0xbfd893b59c35c882ull, 0xbfd6ba7cb7576538ull, 0xbfce7291743d7555ull, 0xbfbb6d85a01efb80ull,
+        0xbf9addae58c7141aull, 0xbf6dc59376c7aa19ull, 0xbf27cc5e74677410ull, 0xbecbe6c0e8b4cc87ull,
+        0xbe41b486526b0565ull, 0xbd8853f01bef63a4ull, 0xbc73955be519be31ull, 0x0ull,
+        // 4
+        0xbfd5555555555555ull, 0xbfd183afc292ba11ull, 0xbfcc1a4b039c9bfaull, 0xbfc16e1e6d8d0be6ull,
+        0xbf92426c751e48a2ull, 0x3fb4f152b2bad124ull, 0x3fbbba40cbef72beull, 0x3fb01ba038be6a3dull,
+        0x3f916df44871efc8ull, 0x3f63c6869dfc8870ull, 0x3f1fb9aef915d828ull, 0x3ec299d1e27c6e11ull,
+        0x3e379b5ddcca334cull, 0x3d8037f57bc62c9aull, 0x3c6a2d4b50a2cff7ull, 0x0ull,
+        // 5
+        0xbce6863ee44ed636ull, 0x3fc04dcd0476c75eull, 0x3fc43d3449a80f08ull, 0x3fc5c26f3699b7e7ull,
+        0x3fc1a686f6ab2533ull, 0x3faf203c316ce730ull, 0xbf89c7a02788557cull, 0xbf98157e26e0d541ull,
+        0xbf807b55c1c7d278ull, 0xbf53a18d5843190full, 0xbf0fb6bbc89b1a5bull, 0xbeb299c9c684a963ull,
+        0xbe279b5dd4fb3d01ull, 0xbd7037f57ae72aa6ull, 0xbc5a2ca2bba78e86ull, 0x0ull,
+        // 6
+        0x3fc1111111112ab5ull, 0x3fb5c19efdfc08adull, 0x3fa74c98dc34fbacull, 0xbf790d6a8eff0a77ull,
+        0xbfac3c021789a786ull, 0xbfae2196b7326859ull, 0xbf93a7a011ff8c2aull, 0x3f6e4709c7e8430eull,
+        0x3f67682afa611151ull, 0x3f3ef2ee77717cbfull, 0x3ef95a4482f180b7ull, 0x3e9dc2c27da3b603ull,
+        0x3e12e2afd9f7433eull, 0x3d59f320348679baull, 0x3c44b61d9bbcc940ull, 0x0ull,
+        // 7
+        0xbda1ea19ddddb3b4ull, 0xbfb0b8df995ce4dfull, 0xbfb2955cf41e8164ull, 0xbfaf9d05c309f7c6ull,
+        0xbf987d27ccff4291ull, 0x3f8b2ca62572b098ull, 0x3f8f1cf6c7f5b00aull, 0x3f60379811e43dd5ull,
+        0xbf4793826f78537eull, 0xbf2405695e36240full, 0xbee0e08de39ce756ull, 0xbe83d709ba5f714eull,
+        0xbdf92e3fc5ee63e0ull, 0xbd414cc030f2110eull, 0xbc2ba022e8d82a87ull, 0x0ull,
+        // 8
+        0xbfaba1ba1990520bull, 0xbf96e37bba52f6fcull, 0x3ecff7df18455399ull, 0x3f97362834d33a4eull,
+        0x3f9e7f8380184b45ull, 0x3f869543e7c420d4ull, 0xbf7326bd4914222aull, 0xbf5fc15b0a9d98faull,
+        0x3f14cffcfa69fbb6ull, 0x3f057e48e5b79d10ull, 0x3ec33b66d7d77264ull, 0x3e66ac4e578b9b10ull,
+        0x3ddcc74b8d3d5c42ull, 0x3d23c589137f92b4ull, 0x3c107f8e2c8707a1ull, 0x0ull,
+        // 9
+        0xbe351ca7f096011full, 0x3f9eaaf3320c3851ull, 0x3f9cf823fe761fc1ull, 0x3f9022271754ff1full,
+        0xbf731fe77c9c60afull, 0xbf84a6046865ec7dull, 0xbf4ca3f1f2b9192bull, 0x3f4c77dee0afd227ull,
+        0x3f04055bce68597aull, 0xbee2bf0cb4a71647ull, 0xbea31eaafe73efd5ull, 0xbe46abb02c4368edull,
+        0xbdbcc749ca8079ddull, 0xbd03c5883836b9d2ull, 0xbbf07a5416264aecull, 0x0ull,
+        // 10
+        0x3f9664f94e6ac14eull, 0xbf94d3343bae39ddull, 0xbf7bc748e60df843ull, 0xbf8c89372b43ba85ull,
+        0xbf8129a092de747aull, 0x3f60c85b4d538746ull, 0x3f5be9392199ec18ull, 0xbf2a0c68a4489f10ull,
+        0xbf00462601dc2faaull, 0x3eb7b6a219dea9f4ull, 0x3e80cbcc8d4c5c8aull, 0x3e2425bb231a5e29ull,
+        0x3d9992a4beac8662ull, 0x3ce191ba5ed3fb67ull, 0x3bc892450bad44c4ull, 0x0ull,
+        // 11
+        0xbea8c4c1fd7852feull, 0xbfccce16b1046f13ull, 0xbf81a16f224bb7b6ull, 0xbf62cbf00406bc09ull,
+        0x3f75b29bb02cf69bull, 0x3f607df0f9f90c17ull, 0xbf4b852a6e0758d5ull, 0xbf0078c63d1b8445ull,
+        0x3eec12eadd55be7aull, 0xbe6fa600f593181bull, 0xbe5a3c935dce3f7dull, 0xbe001c6d95e3ae96ull,
+        0xbd74755a00ea1fd3ull, 0xbcbc1c6c063bb7acull, 0xbba3be9a4460fe00ull, 0x0ull,
+        // 12
+        0xbf822404577aa9ddull, 0x403d8b07f7a82aa3ull, 0xbf9f44ab92fbab0aull, 0x3fb2eac604473d6aull,
+        0x3f45f87d903aaac8ull, 0xbf5e104671036300ull, 0x3f19bc98ddf0f340ull, 0x3f0d4304bc9246e8ull,
+        0xbed13c415f7b9d41ull, 0xbe722b8d9720cdb0ull, 0x3e322666d739bec0ull, 0x3dd76a553d7e7918ull,
+        0x3d4de0fa59416a39ull, 0x3c948716cf3681b4ull, 0x3b873f9f2d2fda99ull, 0x0ull,
+        // 13
+        0xbefdd99a221ed573ull, 0x4070593a3735bab4ull, 0xbfccab654e44835eull, 0x3fd13ed80037dbacull,
+        0xbf6045b9076cc487ull, 0x3f2085ee7e8ac170ull, 0x3f23524622610430ull, 0xbeff12a6626911b4ull,
+        0x3eab9008bca408afull, 0x3e634df71865f620ull, 0xbe05bb1bcf83ca73ull, 0xbdaf2ac143fb6762ull,
+        0xbd23eae52a3dbf57ull, 0xbc6b5e3e9ca0955eull, 0xbb5eca68e2c1ba2eull, 0x0ull,
+        // 14
+        0x3f6e3be689423841ull, 0xc0d263511f5baac1ull, 0x40169f73b15ebe5cull, 0xc025c1dd41cd6cb5ull,
+        0xbf58fd89fe05e0d1ull, 0x3f73f7af01d5af7aull, 0xbf1e40bdead17e6bull, 0x3ee224cd6c4513e5ull,
+        0xbe24b645e68eeaa3ull, 0xbe4abfebfb72bc83ull, 0x3dd51c38f8695ed3ull, 0x3d8313ac38c6832bull,
+        0x3cf7787935626685ull, 0x3c401ffc49c6bc29ull, 0xbabf0b21acfa52abull, 0x0ull,
+        // 15
+        0xbf2a1306713a4f3aull, 0xc1045e509116b066ull, 0x4041fab9250984ceull, 0xc0458d090ec3de95ull,
+        0xbf74949d60113d63ull, 0x3f7c9fd6200d0adeull, 0x3f02cd40e0ad0a9full, 0xbe858ab8e019f311ull,
+        0xbe792fa6323b7cf8ull, 0x3e2df04d67876402ull, 0xbd95c72be95e4d2cull, 0xbd55a89c30203106ull,
+        0xbccad6b3bb9eff65ull, 0xbc12705ccd3dd884ull, 0xba8e0a4c47ae75f5ull, 0x0ull,
+        // 16
+        0xbf55d7e76dc56871ull, 0x41528c38809c90c7ull, 0xc076d57fb5190b02ull, 0x4085f09f888f8adaull,
+        0x3fa246332a2fcba5ull, 0xbfb29d851a896fcdull, 0x3ed9065ae369b212ull, 0xbeb8e1ba4c98a030ull,
+        0x3e6ffd0766ad4016ull, 0xbe0c63c29f505f5bull, 0xbd7fab216b9e0e49ull, 0x3d2826b62056aa27ull,
+        0x3ca313e31762f523ull, 0x3bea37aa21895319ull, 0x3ae5c7f1fd871496ull, 0x0ull,
+        // 17
+        0x3f35e67ab76a26e7ull, 0x41848ee0627d8206ull, 0xc0a216d618b489ecull, 0x40a5b89107c8af4full,
+        0x3fb69d8374520edaull, 0xbfbded519f981716ull, 0xbef02d288b5b3371ull, 0x3eb290981209c1a6ull,
+        0xbe567e924bf5ff6eull, 0x3de3f7f7de6b0eb6ull, 0x3d69ed18bae3ebbcull, 0xbcf7534c4f3dfa71ull,
+        0xbc730b73f1eaff20ull, 0xbbba2cff8135d462ull, 0xbab5a71b5f7d9035ull, 0x0ull
+    };
+#endif // defined(TANH_TRANSPOSED_LUT)
+
+    const int nlanes = npyv_nlanes_f64;
+    const npyv_f64 qnan = npyv_setall_f64(NPY_NAN);
+    // One vector per iteration; src/dst advance by stride * nlanes elements.
+    for (; len > 0; len -= nlanes, src += ssrc*nlanes, dst += sdst*nlanes) {
+        npyv_f64 x;
+        if (ssrc == 1) {
+            x = npyv_load_tillz_f64(src, len);
+        } else {
+            x = npyv_loadn_tillz_f64(src, ssrc, len);
+        }
+        // Keep only the 11 exponent bits and the top mantissa bit of x
+        // (the sign bit and the lower 51 mantissa bits are masked off).
+        npyv_s64 ndnan = npyv_and_s64(npyv_reinterpret_s64_f64(x), npyv_setall_s64(0x7ff8000000000000ll));
+        // |x| > HUGE_THRESHOLD, INF and NaNs.
+        // NOTE(review): despite its name, this mask is the result of a
+        // "less than or equal" compare, i.e. it marks the lanes that are NOT
+        // special; the select further down keeps r for those lanes and
+        // substitutes 1.0 for the others.
+        npyv_b64 special_m = npyv_cmple_s64(ndnan, npyv_setall_s64(0x7fe0000000000000ll));
+        npyv_b64 nnan_m = npyv_notnan_f64(x);
+        // Rebase the masked bits so that 0x3fc0000000000000 maps to zero.
+        npyv_s64 idxs = npyv_sub_s64(ndnan, npyv_setall_s64(0x3fc0000000000000ll));
+        // no native 64-bit for max/min and it's fine to use 32-bit max/min
+        // since we're not crossing 32-bit edge
+        // (the low 32 bits of every 64-bit lane are zero here, so only the
+        // high halves are actually clamped, to the range [0, 0x780000])
+        npyv_s32 idxl = npyv_max_s32(npyv_reinterpret_s32_s64(idxs), npyv_zero_s32());
+                 idxl = npyv_min_s32(idxl, npyv_setall_s32(0x780000));
+        // Shifting the 64-bit lane right by 51 turns the clamped high half
+        // (0 .. 0x780000) into a table index in the range 0 .. 15.
+        npyv_u64 idx  = npyv_shri_u64(npyv_reinterpret_u64_s32(idxl), 51);
+
+#if defined(TANH_TRANSPOSED_LUT)
+        npyv_f64 e0e1[npyv_nlanes_f64];
+        npyv_lanetype_u64 index[npyv_nlanes_f64];
+        npyv_store_u64(index, idx);
+        // For each pair of adjacent table values (offsets 0, 2, ..., 16):
+        // load the pair for lane 0's index and the pair for lane 1's index,
+        // then recombine the low halves and the high halves so that each
+        // resulting vector holds one coefficient for both lanes.
+
+        #line 314
+        #line 317
+        e0e1[0] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[0] * 18 + 0));
+        
+#line 317
+        e0e1[1] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[1] * 18 + 0));
+        
+        npyv_f64 b = npyv_combinel_f64(e0e1[0], e0e1[1]);
+        npyv_f64 c0 = npyv_combineh_f64(e0e1[0], e0e1[1]);
+        
+#line 314
+        #line 317
+        e0e1[0] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[0] * 18 + 2));
+        
+#line 317
+        e0e1[1] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[1] * 18 + 2));
+        
+        npyv_f64 c1 = npyv_combinel_f64(e0e1[0], e0e1[1]);
+        npyv_f64 c2 = npyv_combineh_f64(e0e1[0], e0e1[1]);
+        
+#line 314
+        #line 317
+        e0e1[0] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[0] * 18 + 4));
+        
+#line 317
+        e0e1[1] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[1] * 18 + 4));
+        
+        npyv_f64 c3 = npyv_combinel_f64(e0e1[0], e0e1[1]);
+        npyv_f64 c4 = npyv_combineh_f64(e0e1[0], e0e1[1]);
+        
+#line 314
+        #line 317
+        e0e1[0] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[0] * 18 + 6));
+        
+#line 317
+        e0e1[1] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[1] * 18 + 6));
+        
+        npyv_f64 c5 = npyv_combinel_f64(e0e1[0], e0e1[1]);
+        npyv_f64 c6 = npyv_combineh_f64(e0e1[0], e0e1[1]);
+        
+#line 314
+        #line 317
+        e0e1[0] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[0] * 18 + 8));
+        
+#line 317
+        e0e1[1] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[1] * 18 + 8));
+        
+        npyv_f64 c7 = npyv_combinel_f64(e0e1[0], e0e1[1]);
+        npyv_f64 c8 = npyv_combineh_f64(e0e1[0], e0e1[1]);
+        
+#line 314
+        #line 317
+        e0e1[0] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[0] * 18 + 10));
+        
+#line 317
+        e0e1[1] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[1] * 18 + 10));
+        
+        npyv_f64 c9 = npyv_combinel_f64(e0e1[0], e0e1[1]);
+        npyv_f64 c10 = npyv_combineh_f64(e0e1[0], e0e1[1]);
+        
+#line 314
+        #line 317
+        e0e1[0] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[0] * 18 + 12));
+        
+#line 317
+        e0e1[1] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[1] * 18 + 12));
+        
+        npyv_f64 c11 = npyv_combinel_f64(e0e1[0], e0e1[1]);
+        npyv_f64 c12 = npyv_combineh_f64(e0e1[0], e0e1[1]);
+        
+#line 314
+        #line 317
+        e0e1[0] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[0] * 18 + 14));
+        
+#line 317
+        e0e1[1] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[1] * 18 + 14));
+        
+        npyv_f64 c13 = npyv_combinel_f64(e0e1[0], e0e1[1]);
+        npyv_f64 c14 = npyv_combineh_f64(e0e1[0], e0e1[1]);
+        
+#line 314
+        #line 317
+        e0e1[0] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[0] * 18 + 16));
+        
+#line 317
+        e0e1[1] = npyv_reinterpret_f64_u64(npyv_load_u64(lut18x16 + index[1] * 18 + 16));
+        
+        npyv_f64 c15 = npyv_combinel_f64(e0e1[0], e0e1[1]);
+        npyv_f64 c16 = npyv_combineh_f64(e0e1[0], e0e1[1]);
+        
+#else
+        // One 16-entry lookup per table row: row k (b, c0 .. c16) starts at
+        // offset 16 * k and is indexed by idx.
+        npyv_f64 b = npyv_lut16_f64((const double*)lut16x18 + 16*0, idx);
+        npyv_f64 c0 = npyv_lut16_f64((const double*)lut16x18 + 1*16, idx);
+        npyv_f64 c1 = npyv_lut16_f64((const double*)lut16x18 + 2*16, idx);
+        npyv_f64 c2 = npyv_lut16_f64((const double*)lut16x18 + 3*16, idx);
+        npyv_f64 c3 = npyv_lut16_f64((const double*)lut16x18 + 4*16, idx);
+        npyv_f64 c4 = npyv_lut16_f64((const double*)lut16x18 + 5*16, idx);
+        npyv_f64 c5 = npyv_lut16_f64((const double*)lut16x18 + 6*16, idx);
+        npyv_f64 c6 = npyv_lut16_f64((const double*)lut16x18 + 7*16, idx);
+        npyv_f64 c7 = npyv_lut16_f64((const double*)lut16x18 + 8*16, idx);
+        npyv_f64 c8 = npyv_lut16_f64((const double*)lut16x18 + 9*16, idx);
+        npyv_f64 c9 = npyv_lut16_f64((const double*)lut16x18 + 10*16, idx);
+        npyv_f64 c10 = npyv_lut16_f64((const double*)lut16x18 + 11*16, idx);
+        npyv_f64 c11 = npyv_lut16_f64((const double*)lut16x18 + 12*16, idx);
+        npyv_f64 c12 = npyv_lut16_f64((const double*)lut16x18 + 13*16, idx);
+        npyv_f64 c13 = npyv_lut16_f64((const double*)lut16x18 + 14*16, idx);
+        npyv_f64 c14 = npyv_lut16_f64((const double*)lut16x18 + 15*16, idx);
+        npyv_f64 c15 = npyv_lut16_f64((const double*)lut16x18 + 16*16, idx);
+        npyv_f64 c16 = npyv_lut16_f64((const double*)lut16x18 + 17*16, idx);
+#endif // defined(TANH_TRANSPOSED_LUT)
+
+        // no need to zerofy nans or avoid FP exceptions by NO_EXC like SVML does
+        // since we're clearing the FP status anyway.
+        // Isolate the sign bit of x so it can be re-applied to the result.
+        npyv_f64 sign = npyv_and_f64(x, npyv_reinterpret_f64_s64(npyv_setall_s64(0x8000000000000000ull)));
+        // Range-reduced argument: distance of |x| from the interval's b.
+        npyv_f64 y = npyv_sub_f64(npyv_abs_f64(x), b);
+        // Polynomial c0 + c1*y + ... + c16*y^16, accumulated from the highest
+        // coefficient down (assuming npyv_muladd_f64(a, b, c) yields a*b + c).
+        npyv_f64 r = npyv_muladd_f64(c16, y, c15);
+        r = npyv_muladd_f64(r, y, c14);
+        r = npyv_muladd_f64(r, y, c13);
+        r = npyv_muladd_f64(r, y, c12);
+        r = npyv_muladd_f64(r, y, c11);
+        r = npyv_muladd_f64(r, y, c10);
+        r = npyv_muladd_f64(r, y, c9);
+        r = npyv_muladd_f64(r, y, c8);
+        r = npyv_muladd_f64(r, y, c7);
+        r = npyv_muladd_f64(r, y, c6);
+        r = npyv_muladd_f64(r, y, c5);
+        r = npyv_muladd_f64(r, y, c4);
+        r = npyv_muladd_f64(r, y, c3);
+        r = npyv_muladd_f64(r, y, c2);
+        r = npyv_muladd_f64(r, y, c1);
+        r = npyv_muladd_f64(r, y, c0);
+        // 1.0 if |x| > HUGE_THRESHOLD || INF
+        r = npyv_select_f64(special_m, r, npyv_setall_f64(1.0));
+        // Restore the sign of the input: tanh(x) = sign(x) * tanh(|x|).
+        r = npyv_or_f64(r, sign);
+        // qnan if nan
+        r = npyv_select_f64(nnan_m, r, qnan);
+        if (sdst == 1) {
+            npyv_store_till_f64(dst, len, r);
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, r);
+        }
+    }
+}
+
+#undef TANH_TRANSPOSED_LUT
+
+#endif // NPY_SIMD_F64
+
+#if NPY_SIMD_F32
+
+
+    // For architectures without efficient gather / scatter instructions, it is
+    // better to use a transposed LUT where we can load all coefficients for an
+    // index linearly.  In order to keep the same vertical calculation, we
+    // transpose the coef. into lanes.  A 4x4 transpose is all that's
+    // supported so we require `npyv_nlanes_f32` == 4.
+    //
+    // TANHF_TRANSPOSED_LUT ends up defined only when there are 4 float lanes
+    // AND a lane-extract intrinsic named npyv_get_lane_u32 is available
+    // (either already defined, or aliased below).
+    #if npyv_nlanes_f32 == 4
+    #define TANHF_TRANSPOSED_LUT
+    // Define missing universal intrinsics used below
+    #if !defined(npyv_get_lane_u32)
+        #if defined(NPY_HAVE_ASIMD)
+            // UNDEF_npyv_get_lane_u32 marks the alias as created locally
+            // (presumably so it can be #undef'd after use; the matching
+            // cleanup is not visible in this part of the file).
+            #define UNDEF_npyv_get_lane_u32
+            #define npyv_get_lane_u32 vgetq_lane_u32
+        #elif defined(NPY_HAVE_SSE41)
+            #define UNDEF_npyv_get_lane_u32
+            #define npyv_get_lane_u32 _mm_extract_epi32
+        #else
+            // No lane-extract intrinsic to alias: disable the transposed
+            // LUT so the non-transposed code path is compiled instead.
+            #undef TANHF_TRANSPOSED_LUT
+        #endif
+    #endif // !defined(npyv_get_lane_u32)
+    #endif // npyv_nlanes_f32 == 4
+
+static void
+simd_tanh_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst, npy_intp len)
+{
+#if defined(TANHF_TRANSPOSED_LUT)
+    static const npy_uint32 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) lut8x32[] = {
+        // c6       c5          c4          c3          c2          c1          c0          b
+        0xbc0e2f66, 0x3e0910e9, 0xb76dd6b9, 0xbeaaaaa5, 0xb0343c7b, 0x3f800000, 0x0,        0x0,       
+        0x460bda12, 0x43761143, 0xbe1c276d, 0xbeab0612, 0xbd6ee69d, 0x3f7f1f84, 0x3d6fb9c9, 0x3d700000,
+        0x43d638ef, 0x4165ecdc, 0x3c1dcf2f, 0xbea7f01f, 0xbd8f0da7, 0x3f7ebd11, 0x3d8fc35f, 0x3d900000,
+        0xc3e11c3e, 0xc190f756, 0x3dc1a78d, 0xbea4e120, 0xbdae477d, 0x3f7e1e5f, 0x3daf9169, 0x3db00000,
+        // 4
+        0xc2baa4e9, 0xc08c097d, 0x3d96f985, 0xbea387b7, 0xbdcd2a1f, 0x3f7d609f, 0x3dcf49ab, 0x3dd00000,
+        0xc249da2d, 0xc02ba813, 0x3da2b61b, 0xbea15962, 0xbdeba80d, 0x3f7c842d, 0x3deee849, 0x3df00000,
+        0xc1859b82, 0xbf7f6bda, 0x3dc13397, 0xbe9d57f7, 0xbe0c443b, 0x3f7b00e5, 0x3e0f0ee8, 0x3e100000,
+        0x40dd5b57, 0x3f2b1dc0, 0x3dd2f670, 0xbe976b5a, 0xbe293cf3, 0x3f789580, 0x3e2e4984, 0x3e300000,
+        // 8
+        0x40494640, 0x3ece105d, 0x3df48a0a, 0xbe90230d, 0xbe44f282, 0x3f75b8ad, 0x3e4d2f8e, 0x3e500000,
+        0x40c730a8, 0x3f426a94, 0x3e06c5a8, 0xbe880dff, 0xbe5f3651, 0x3f726fd9, 0x3e6bb32e, 0x3e700000,
+        0xbf0f160e, 0xbadb0dc4, 0x3e1a3aba, 0xbe7479b3, 0xbe81c7c0, 0x3f6cc59b, 0x3e8c51cd, 0x3e900000,
+        0x3e30e76f, 0x3da43b17, 0x3e27c405, 0xbe4c3d88, 0xbe96d7ca, 0x3f63fb92, 0x3ea96163, 0x3eb00000,
+        // 12
+        0xbea81387, 0xbd51ab88, 0x3e2e78d0, 0xbe212482, 0xbea7fb8e, 0x3f59ff97, 0x3ec543f1, 0x3ed00000,
+        0xbdb26a1c, 0xbcaea23d, 0x3e2c3e44, 0xbdeb8cba, 0xbeb50e9e, 0x3f4f11d7, 0x3edfd735, 0x3ef00000,
+        0xbd351e57, 0xbd3b6d8d, 0x3e1d3097, 0xbd5e78ad, 0xbec12efe, 0x3f3d7573, 0x3f028438, 0x3f100000,
+        0xbb4c01a0, 0xbd6caaad, 0x3df4a8f4, 0x3c6b5e6e, 0xbec4be92, 0x3f24f360, 0x3f18abf0, 0x3f300000,
+        // 16
+        0x3c1d7bfb, 0xbd795bed, 0x3da38508, 0x3d839143, 0xbebce070, 0x3f0cbfe7, 0x3f2bc480, 0x3f500000,
+        0x3c722cd1, 0xbd5fddda, 0x3d31416a, 0x3dc21ee1, 0xbead510e, 0x3eec1a69, 0x3f3bec1c, 0x3f700000,
+        0x3c973f1c, 0xbd038f3b, 0x3b562657, 0x3de347af, 0xbe8ef7d6, 0x3eb0a801, 0x3f4f2e5b, 0x3f900000,
+        0x3c33a31b, 0xbc1cad63, 0xbcaeeac9, 0x3dcbec96, 0xbe4b8704, 0x3e6753a2, 0x3f613c53, 0x3fb00000,
+        // 20
+        0x3b862ef4, 0x3abb4766, 0xbcce9419, 0x3d99ef2d, 0xbe083237, 0x3e132f1a, 0x3f6ce37d, 0x3fd00000,
+        0x3a27b3d0, 0x3b95f10b, 0xbcaaeac4, 0x3d542ea1, 0xbdaf7449, 0x3db7e7d3, 0x3f743c4f, 0x3ff00000,
+        0xba3b5907, 0x3b825873, 0xbc49e7d0, 0x3cdde701, 0xbd2e1ec4, 0x3d320845, 0x3f7a5feb, 0x40100000,
+        0xba0efc22, 0x3afaea66, 0xbba71ddd, 0x3c2cca67, 0xbc83bf06, 0x3c84d3d4, 0x3f7dea85, 0x40300000,
+        // 24
+        0xb97f9f0f, 0x3a49f878, 0xbb003b0e, 0x3b81cb27, 0xbbc3e0b5, 0x3bc477b7, 0x3f7f3b3d, 0x40500000,
+        0xb8c8af50, 0x39996bf3, 0xba3f9a05, 0x3ac073a1, 0xbb10aadc, 0x3b10d3da, 0x3f7fb78c, 0x40700000,
+        0xb7bdddfb, 0x388f3e6c, 0xb92c08a7, 0x39ac3032, 0xba0157db, 0x3a01601e, 0x3f7fefd4, 0x40900000,
+        0xb64f2950, 0x371bb0e3, 0xb7ba9232, 0x383a94d9, 0xb88c18f2, 0x388c1a3b, 0x3f7ffdd0, 0x40b00000,
+        // 28
+        0xb4e085b1, 0x35a8a5e6, 0xb64a0b0f, 0x36ca081d, 0xb717b096, 0x3717b0da, 0x3f7fffb4, 0x40d00000,
+        0xb3731dfa, 0x34369b17, 0xb4dac169, 0x355abd4c, 0xb5a43bae, 0x35a43bce, 0x3f7ffff6, 0x40f00000,
+        0xb15a1f04, 0x322487b0, 0xb2ab78ac, 0x332b3cb6, 0xb383012c, 0x338306c6, 0x3f7fffff, 0x41100000,
+        0x0,        0x0,        0x0,        0x0,        0x0,        0x0,        0x3f800000, 0x0,       
+    };
+#else
+    static const npy_uint32 NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) lut32x8[] = {
+        // 0
+        0x0,        0x3d700000, 0x3d900000, 0x3db00000, 0x3dd00000, 0x3df00000, 0x3e100000, 0x3e300000,
+        0x3e500000, 0x3e700000, 0x3e900000, 0x3eb00000, 0x3ed00000, 0x3ef00000, 0x3f100000, 0x3f300000,
+        0x3f500000, 0x3f700000, 0x3f900000, 0x3fb00000, 0x3fd00000, 0x3ff00000, 0x40100000, 0x40300000,
+        0x40500000, 0x40700000, 0x40900000, 0x40b00000, 0x40d00000, 0x40f00000, 0x41100000, 0x0,
+        // 1
+        0x0,        0x3d6fb9c9, 0x3d8fc35f, 0x3daf9169, 0x3dcf49ab, 0x3deee849, 0x3e0f0ee8, 0x3e2e4984,
+        0x3e4d2f8e, 0x3e6bb32e, 0x3e8c51cd, 0x3ea96163, 0x3ec543f1, 0x3edfd735, 0x3f028438, 0x3f18abf0,
+        0x3f2bc480, 0x3f3bec1c, 0x3f4f2e5b, 0x3f613c53, 0x3f6ce37d, 0x3f743c4f, 0x3f7a5feb, 0x3f7dea85,
+        0x3f7f3b3d, 0x3f7fb78c, 0x3f7fefd4, 0x3f7ffdd0, 0x3f7fffb4, 0x3f7ffff6, 0x3f7fffff, 0x3f800000,
+        // 2
+        0x3f800000, 0x3f7f1f84, 0x3f7ebd11, 0x3f7e1e5f, 0x3f7d609f, 0x3f7c842d, 0x3f7b00e5, 0x3f789580,
+        0x3f75b8ad, 0x3f726fd9, 0x3f6cc59b, 0x3f63fb92, 0x3f59ff97, 0x3f4f11d7, 0x3f3d7573, 0x3f24f360,
+        0x3f0cbfe7, 0x3eec1a69, 0x3eb0a801, 0x3e6753a2, 0x3e132f1a, 0x3db7e7d3, 0x3d320845, 0x3c84d3d4,
+        0x3bc477b7, 0x3b10d3da, 0x3a01601e, 0x388c1a3b, 0x3717b0da, 0x35a43bce, 0x338306c6, 0x0,
+        // 3
+        0xb0343c7b, 0xbd6ee69d, 0xbd8f0da7, 0xbdae477d, 0xbdcd2a1f, 0xbdeba80d, 0xbe0c443b, 0xbe293cf3,
+        0xbe44f282, 0xbe5f3651, 0xbe81c7c0, 0xbe96d7ca, 0xbea7fb8e, 0xbeb50e9e, 0xbec12efe, 0xbec4be92,
+        0xbebce070, 0xbead510e, 0xbe8ef7d6, 0xbe4b8704, 0xbe083237, 0xbdaf7449, 0xbd2e1ec4, 0xbc83bf06,
+        0xbbc3e0b5, 0xbb10aadc, 0xba0157db, 0xb88c18f2, 0xb717b096, 0xb5a43bae, 0xb383012c, 0x0,
+        // 4
+        0xbeaaaaa5, 0xbeab0612, 0xbea7f01f, 0xbea4e120, 0xbea387b7, 0xbea15962, 0xbe9d57f7, 0xbe976b5a,
+        0xbe90230d, 0xbe880dff, 0xbe7479b3, 0xbe4c3d88, 0xbe212482, 0xbdeb8cba, 0xbd5e78ad, 0x3c6b5e6e,
+        0x3d839143, 0x3dc21ee1, 0x3de347af, 0x3dcbec96, 0x3d99ef2d, 0x3d542ea1, 0x3cdde701, 0x3c2cca67,
+        0x3b81cb27, 0x3ac073a1, 0x39ac3032, 0x383a94d9, 0x36ca081d, 0x355abd4c, 0x332b3cb6, 0x0,
+        // 5
+        0xb76dd6b9, 0xbe1c276d, 0x3c1dcf2f, 0x3dc1a78d, 0x3d96f985, 0x3da2b61b, 0x3dc13397, 0x3dd2f670,
+        0x3df48a0a, 0x3e06c5a8, 0x3e1a3aba, 0x3e27c405, 0x3e2e78d0, 0x3e2c3e44, 0x3e1d3097, 0x3df4a8f4,
+        0x3da38508, 0x3d31416a, 0x3b562657, 0xbcaeeac9, 0xbcce9419, 0xbcaaeac4, 0xbc49e7d0, 0xbba71ddd,
+        0xbb003b0e, 0xba3f9a05, 0xb92c08a7, 0xb7ba9232, 0xb64a0b0f, 0xb4dac169, 0xb2ab78ac, 0x0,
+        // 6
+        0x3e0910e9, 0x43761143, 0x4165ecdc, 0xc190f756, 0xc08c097d, 0xc02ba813, 0xbf7f6bda, 0x3f2b1dc0,
+        0x3ece105d, 0x3f426a94, 0xbadb0dc4, 0x3da43b17, 0xbd51ab88, 0xbcaea23d, 0xbd3b6d8d, 0xbd6caaad,
+        0xbd795bed, 0xbd5fddda, 0xbd038f3b, 0xbc1cad63, 0x3abb4766, 0x3b95f10b, 0x3b825873, 0x3afaea66,
+        0x3a49f878, 0x39996bf3, 0x388f3e6c, 0x371bb0e3, 0x35a8a5e6, 0x34369b17, 0x322487b0, 0x0,
+        // 7
+        0xbc0e2f66, 0x460bda12, 0x43d638ef, 0xc3e11c3e, 0xc2baa4e9, 0xc249da2d, 0xc1859b82, 0x40dd5b57,
+        0x40494640, 0x40c730a8, 0xbf0f160e, 0x3e30e76f, 0xbea81387, 0xbdb26a1c, 0xbd351e57, 0xbb4c01a0,
+        0x3c1d7bfb, 0x3c722cd1, 0x3c973f1c, 0x3c33a31b, 0x3b862ef4, 0x3a27b3d0, 0xba3b5907, 0xba0efc22,
+        0xb97f9f0f, 0xb8c8af50, 0xb7bdddfb, 0xb64f2950, 0xb4e085b1, 0xb3731dfa, 0xb15a1f04, 0x0
+    };
+#endif // defined(TANHF_TRANSPOSED_LUT)
+
+    const int nlanes = npyv_nlanes_f32;
+    const npyv_f32 qnan = npyv_setall_f32(NPY_NANF);
+    for (; len > 0; len -= nlanes, src += ssrc*nlanes, dst += sdst*nlanes) {
+        npyv_f32 x;
+        if (ssrc == 1) {
+            x = npyv_load_tillz_f32(src, len);
+        } else {
+            x = npyv_loadn_tillz_f32(src, ssrc, len);
+        }
+        npyv_s32 ndnan = npyv_and_s32(npyv_reinterpret_s32_f32(x), npyv_setall_s32(0x7fe00000));
+        // check |x| > HUGE_THRESHOLD, INF and NaNs.
+        npyv_b32 special_m = npyv_cmple_s32(ndnan, npyv_setall_s32(0x7f000000));
+        npyv_b32 nnan_m = npyv_notnan_f32(x);
+        npyv_s32 idxs = npyv_sub_s32(ndnan, npyv_setall_s32(0x3d400000));
+                 idxs = npyv_max_s32(idxs, npyv_zero_s32());
+                 idxs = npyv_min_s32(idxs, npyv_setall_s32(0x3e00000));
+        npyv_u32 idx  = npyv_shri_u32(npyv_reinterpret_u32_s32(idxs), 21);
+
+#if defined(TANHF_TRANSPOSED_LUT)
+        npyv_f32 c6543[npyv_nlanes_f32];
+        npyv_f32 c210b[npyv_nlanes_f32];
+        npyv_lanetype_u32 index[npyv_nlanes_f32];
+
+        #line 521
+        index[0] = npyv_get_lane_u32(idx, 0);
+        c6543[0] = npyv_reinterpret_f32_u32(npyv_load_u32(lut8x32 + index[0] * 8));
+        c210b[0] = npyv_reinterpret_f32_u32(npyv_load_u32(lut8x32 + index[0] * 8 + 4));
+        
+#line 521
+        index[1] = npyv_get_lane_u32(idx, 1);
+        c6543[1] = npyv_reinterpret_f32_u32(npyv_load_u32(lut8x32 + index[1] * 8));
+        c210b[1] = npyv_reinterpret_f32_u32(npyv_load_u32(lut8x32 + index[1] * 8 + 4));
+        
+#line 521
+        index[2] = npyv_get_lane_u32(idx, 2);
+        c6543[2] = npyv_reinterpret_f32_u32(npyv_load_u32(lut8x32 + index[2] * 8));
+        c210b[2] = npyv_reinterpret_f32_u32(npyv_load_u32(lut8x32 + index[2] * 8 + 4));
+        
+#line 521
+        index[3] = npyv_get_lane_u32(idx, 3);
+        c6543[3] = npyv_reinterpret_f32_u32(npyv_load_u32(lut8x32 + index[3] * 8));
+        c210b[3] = npyv_reinterpret_f32_u32(npyv_load_u32(lut8x32 + index[3] * 8 + 4));
+        
+
+        // lane0: {c6, c5, c4, c3},  {c2, c1, c0, b}
+        // lane1: {c6, c5, c4, c3},  {c2, c1, c0, b}
+        // lane2: {c6, c5, c4, c3},  {c2, c1, c0, b}
+        // lane3: {c6, c5, c4, c3},  {c2, c1, c0, b}
+        //
+        // transposed:
+        // c6: {lane0, lane1, lane2, lane3}
+        // c5: {lane0, lane1, lane2, lane3}
+        // c4: {lane0, lane1, lane2, lane3}
+        // c3: {lane0, lane1, lane2, lane3}
+        // c2: {lane0, lane1, lane2, lane3}
+        // c1: {lane0, lane1, lane2, lane3}
+        // c0: {lane0, lane1, lane2, lane3}
+        // b : {lane0, lane1, lane2, lane3}
+
+        npyv_f32x2 c6543_l01 = npyv_zip_f32(c6543[0], c6543[1]);
+        npyv_f32x2 c6543_l23 = npyv_zip_f32(c6543[2], c6543[3]);
+        npyv_f32 c6 = npyv_combinel_f32(c6543_l01.val[0], c6543_l23.val[0]);
+        npyv_f32 c5 = npyv_combineh_f32(c6543_l01.val[0], c6543_l23.val[0]);
+        npyv_f32 c4 = npyv_combinel_f32(c6543_l01.val[1], c6543_l23.val[1]);
+        npyv_f32 c3 = npyv_combineh_f32(c6543_l01.val[1], c6543_l23.val[1]);
+
+        npyv_f32x2 c210b_l01 = npyv_zip_f32(c210b[0], c210b[1]);
+        npyv_f32x2 c210b_l23 = npyv_zip_f32(c210b[2], c210b[3]);
+        npyv_f32 c2 = npyv_combinel_f32(c210b_l01.val[0], c210b_l23.val[0]);
+        npyv_f32 c1 = npyv_combineh_f32(c210b_l01.val[0], c210b_l23.val[0]);
+        npyv_f32 c0 = npyv_combinel_f32(c210b_l01.val[1], c210b_l23.val[1]);
+        npyv_f32 b  = npyv_combineh_f32(c210b_l01.val[1], c210b_l23.val[1]);
+#else
+        npyv_f32 b  = npyv_lut32_f32((const float*)lut32x8 + 32*0, idx);
+        npyv_f32 c0 = npyv_lut32_f32((const float*)lut32x8 + 32*1, idx);
+        npyv_f32 c1 = npyv_lut32_f32((const float*)lut32x8 + 32*2, idx);
+        npyv_f32 c2 = npyv_lut32_f32((const float*)lut32x8 + 32*3, idx);
+        npyv_f32 c3 = npyv_lut32_f32((const float*)lut32x8 + 32*4, idx);
+        npyv_f32 c4 = npyv_lut32_f32((const float*)lut32x8 + 32*5, idx);
+        npyv_f32 c5 = npyv_lut32_f32((const float*)lut32x8 + 32*6, idx);
+        npyv_f32 c6 = npyv_lut32_f32((const float*)lut32x8 + 32*7, idx);
+#endif // defined(TANHF_TRANSPOSED_LUT)
+
+        // no need to zerofy nans or avoid FP exceptions by NO_EXC like SVML does
+        // since we're clearing the FP status anyway.
+        npyv_f32 sign = npyv_and_f32(x, npyv_reinterpret_f32_u32(npyv_setall_u32(0x80000000)));
+        npyv_f32 y = npyv_sub_f32(npyv_abs_f32(x), b);
+        npyv_f32 r = npyv_muladd_f32(c6, y, c5);
+        r = npyv_muladd_f32(r, y, c4);
+        r = npyv_muladd_f32(r, y, c3);
+        r = npyv_muladd_f32(r, y, c2);
+        r = npyv_muladd_f32(r, y, c1);
+        r = npyv_muladd_f32(r, y, c0);
+        // 1.0 if |x| > HUGE_THRESHOLD || INF
+        r = npyv_select_f32(special_m, r, npyv_setall_f32(1.0f));
+        r = npyv_or_f32(r, sign);
+        // qnan if nan
+        r = npyv_select_f32(nnan_m, r, qnan);
+        if (sdst == 1) {
+            npyv_store_till_f32(dst, len, r);
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, r);
+        }
+    }
+}
+
+#undef TANHF_TRANSPOSED_LUT
+#if defined(UNDEF_npyv_get_lane_u32)
+#undef UNDEF_npyv_get_lane_u32
+#undef npyv_get_lane_u32
+#endif
+
+#endif // NPY_SIMD_F32
+#endif // NPY_SIMD_FMA3
+
+#line 604
+#line 608
+/*
+ * tanh loop for single precision.
+ *
+ * Reads dimensions[0] elements starting at args[0] and writes the results
+ * starting at args[1]. steps[0] / steps[1] are the byte strides of the input
+ * and output; they are converted to element strides before use.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_tanh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    const float *in  = (float*)args[0];
+    float       *out = (float*)args[1];
+    npy_intp remaining = dimensions[0];
+
+    const int elsize = sizeof(in[0]);
+    const npy_intp in_stride  = steps[0] / elsize;
+    const npy_intp out_stride = steps[1] / elsize;
+    // whenever more than one element is visited, both byte strides must be
+    // whole multiples of the element size
+    assert(remaining <= 1 || (steps[0] % elsize == 0 && steps[1] % elsize == 0));
+#if NPY_SIMD_FMA3 && NPY_SIMD_F32
+    // The strided kernel is only used when input and output do not overlap
+    // and both element strides are accepted by the SIMD load/store helpers.
+    const int whole_array_ok =
+        !is_mem_overlap(in, steps[0], out, steps[1], remaining) &&
+        npyv_loadable_stride_f32(in_stride) &&
+        npyv_storable_stride_f32(out_stride);
+
+    if (whole_array_ok) {
+        simd_tanh_f32(in, in_stride, out, out_stride, remaining);
+    } else {
+        // otherwise feed the kernel one element at a time as a unit-stride,
+        // length-1 run
+        while (remaining > 0) {
+            simd_tanh_f32(in, 1, out, 1, 1);
+            --remaining;
+            in  += in_stride;
+            out += out_stride;
+        }
+    }
+    npyv_cleanup();
+    #if 1
+        // discard whatever floating-point status flags the kernel raised
+        npy_clear_floatstatus_barrier((char*)dimensions);
+    #endif
+#else
+    // no suitable SIMD support: plain scalar loop over the C math wrapper
+    while (remaining > 0) {
+        *out = npy_tanhf(*in);
+        --remaining;
+        in  += in_stride;
+        out += out_stride;
+    }
+#endif
+}
+
+
+#line 604
+#line 608
+/*
+ * tanh loop for double precision.
+ *
+ * Reads dimensions[0] elements starting at args[0] and writes the results
+ * starting at args[1]. steps[0] / steps[1] are the byte strides of the input
+ * and output; they are converted to element strides before use.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_tanh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    const double *in  = (double*)args[0];
+    double       *out = (double*)args[1];
+    npy_intp remaining = dimensions[0];
+
+    const int elsize = sizeof(in[0]);
+    const npy_intp in_stride  = steps[0] / elsize;
+    const npy_intp out_stride = steps[1] / elsize;
+    // whenever more than one element is visited, both byte strides must be
+    // whole multiples of the element size
+    assert(remaining <= 1 || (steps[0] % elsize == 0 && steps[1] % elsize == 0));
+#if NPY_SIMD_FMA3 && NPY_SIMD_F64
+    // The strided kernel is only used when input and output do not overlap
+    // and both element strides are accepted by the SIMD load/store helpers.
+    const int whole_array_ok =
+        !is_mem_overlap(in, steps[0], out, steps[1], remaining) &&
+        npyv_loadable_stride_f64(in_stride) &&
+        npyv_storable_stride_f64(out_stride);
+
+    if (whole_array_ok) {
+        simd_tanh_f64(in, in_stride, out, out_stride, remaining);
+    } else {
+        // otherwise feed the kernel one element at a time as a unit-stride,
+        // length-1 run
+        while (remaining > 0) {
+            simd_tanh_f64(in, 1, out, 1, 1);
+            --remaining;
+            in  += in_stride;
+            out += out_stride;
+        }
+    }
+    npyv_cleanup();
+    #if 1
+        // discard whatever floating-point status flags the kernel raised
+        npy_clear_floatstatus_barrier((char*)dimensions);
+    #endif
+#else
+    // no suitable SIMD support: plain scalar loop over the C math wrapper
+    while (remaining > 0) {
+        *out = npy_tanh(*in);
+        --remaining;
+        in  += in_stride;
+        out += out_stride;
+    }
+#endif
+}
+
+
+
diff --git a/numpy/core/src/_generated/loops_logical.dispatch.c b/numpy/core/src/_generated/loops_logical.dispatch.c
new file mode 100644
index 000000000000..93ad936db9a5
--- /dev/null
+++ b/numpy/core/src/_generated/loops_logical.dispatch.c
@@ -0,0 +1,1194 @@
+#line 1 "numpy/core/src/umath/loops_logical.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*@targets
+ ** $maxopt baseline
+ ** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
+ ** vx
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*******************************************************************************
+ ** Defining the SIMD kernels
+ ******************************************************************************/
+
+#if NPY_SIMD
+/*
+ * Normalize each byte of `v` to a canonical boolean: zero stays 0, any byte
+ * with at least one bit set becomes the "true" byte (1 == 1). This keeps the
+ * vector results identical to the scalar ones even when an input byte is not
+ * a clean 0/1 value.
+ */
+NPY_FINLINE npyv_u8 byte_to_true(npyv_u8 v)
+{
+    // comparing against zero yields 0xff for zero bytes and 0x00 for the rest
+    const npyv_u8 zero_lanes = npyv_cvt_u8_b8(npyv_cmpeq_u8(v, npyv_zero_u8()));
+    // keep the "true" byte only where the comparison mask is clear,
+    // i.e. where the input byte was non-zero
+    return npyv_andc_u8(npyv_setall_u8(1 == 1), zero_lanes);
+}
+/*
+ * Turn a comparison mask (0xff / 0x00 per byte) into canonical booleans.
+ * Unlike byte_to_true() the input is already a mask, so no inversion is
+ * needed: the mask is simply ANDed with the "true" byte.
+ */
+NPY_FINLINE npyv_u8 mask_to_true(npyv_b8 v)
+{
+    const npyv_u8 mask_bits = npyv_cvt_u8_b8(v);
+    const npyv_u8 true_byte = npyv_setall_u8(1 == 1);
+    return npyv_and_u8(true_byte, mask_bits);
+}
+/*
+ * Lane-wise logical AND of two byte vectors.
+ *
+ * A bitwise AND is not enough for non-canonical inputs: a = 0x01 and
+ * b = 0x80 are both "true" yet a & b == 0. Instead each operand is tested
+ * against zero and the result is false wherever either one is zero. The
+ * output uses the same canonical true byte as byte_to_true().
+ */
+NPY_FINLINE npyv_u8 simd_logical_and_u8(npyv_u8 a, npyv_u8 b)
+{
+    const npyv_u8 zeros = npyv_zero_u8();
+    const npyv_b8 a_is_zero = npyv_cmpeq_u8(a, zeros);
+    const npyv_b8 b_is_zero = npyv_cmpeq_u8(b, zeros);
+    // 0xff in every lane where at least one operand is zero
+    const npyv_u8 either_zero = npyv_cvt_u8_b8(npyv_or_b8(a_is_zero, b_is_zero));
+    // emit the "true" byte only in lanes where neither operand was zero
+    return npyv_andc_u8(npyv_setall_u8(1 == 1), either_zero);
+}
+/*
+ * Lane-wise logical OR of two byte vectors: bitwise OR followed by
+ * normalization to canonical booleans. Exists mainly so the templated
+ * callers can treat OR the same way as simd_logical_and_u8().
+ */
+NPY_FINLINE npyv_u8 simd_logical_or_u8(npyv_u8 a, npyv_u8 b)
+{
+    return byte_to_true(npyv_or_u8(a, b));
+}
+
+
+#line 82
+/*
+ * Element-wise logical AND over `len` bytes: op[i] = ip1[i] && ip2[i].
+ * All three pointers are walked with unit stride. The work is split into
+ * three phases: blocks of UNROLL vectors, single vectors, then a scalar tail.
+ */
+static void
+simd_binary_logical_and_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp len)
+{
+    #define UNROLL 16
+    // One unrolled step: load vector number K from each input, combine, store.
+    #define BINARY_AND_STEP(K)                                              \
+        do {                                                                \
+            const npyv_u8 lhs = npyv_load_u8(ip1 + lanes * (K));            \
+            const npyv_u8 rhs = npyv_load_u8(ip2 + lanes * (K));            \
+            npyv_store_u8(op + lanes * (K), simd_logical_and_u8(lhs, rhs)); \
+        } while (0)
+
+    const int lanes = npyv_nlanes_u8;
+    const int chunk = lanes * UNROLL;
+
+    // Phase 1: UNROLL vectors per iteration
+    // (the list of steps below must stay in sync with UNROLL)
+    while (len >= chunk) {
+        BINARY_AND_STEP(0);
+        BINARY_AND_STEP(1);
+        BINARY_AND_STEP(2);
+        BINARY_AND_STEP(3);
+        BINARY_AND_STEP(4);
+        BINARY_AND_STEP(5);
+        BINARY_AND_STEP(6);
+        BINARY_AND_STEP(7);
+        BINARY_AND_STEP(8);
+        BINARY_AND_STEP(9);
+        BINARY_AND_STEP(10);
+        BINARY_AND_STEP(11);
+        BINARY_AND_STEP(12);
+        BINARY_AND_STEP(13);
+        BINARY_AND_STEP(14);
+        BINARY_AND_STEP(15);
+
+        len -= chunk;
+        ip1 += chunk;
+        ip2 += chunk;
+        op  += chunk;
+    }
+    #undef BINARY_AND_STEP
+    #undef UNROLL
+
+    // Phase 2: one vector per iteration
+    while (len >= lanes) {
+        const npyv_u8 lhs = npyv_load_u8(ip1);
+        const npyv_u8 rhs = npyv_load_u8(ip2);
+        npyv_store_u8(op, simd_logical_and_u8(lhs, rhs));
+
+        len -= lanes;
+        ip1 += lanes;
+        ip2 += lanes;
+        op  += lanes;
+    }
+
+    // Phase 3: whatever is left, one byte at a time
+    for (npy_intp i = 0; i < len; ++i) {
+        op[i] = ip1[i] && ip2[i];
+    }
+}
+
+/*
+ * AND-reduction of the `len` bytes at `ip` into `*op`.
+ *
+ * The vector phases store false into `*op` and return as soon as a block is
+ * found for which npyv_all_u8() reports 0; otherwise they leave `*op` alone.
+ * The scalar tail folds each remaining byte into the current value of `*op`
+ * and stops at the first false result.
+ */
+static void
+simd_reduce_logical_and_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
+{
+    #define UNROLL 8
+
+    const int lanes = npyv_nlanes_u8;
+    const int chunk = lanes * UNROLL;
+
+    // Phase 1: combine UNROLL vectors with a lane-wise minimum, test once
+    while (len >= chunk) {
+    #if defined(NPY_HAVE_SSE2)
+        NPY_PREFETCH(ip + chunk, 0, 3);
+    #endif
+        const npyv_u8 a0 = npyv_load_u8(ip + lanes * 0);
+        const npyv_u8 a1 = npyv_load_u8(ip + lanes * 1);
+        const npyv_u8 a2 = npyv_load_u8(ip + lanes * 2);
+        const npyv_u8 a3 = npyv_load_u8(ip + lanes * 3);
+        const npyv_u8 a4 = npyv_load_u8(ip + lanes * 4);
+        const npyv_u8 a5 = npyv_load_u8(ip + lanes * 5);
+        const npyv_u8 a6 = npyv_load_u8(ip + lanes * 6);
+        const npyv_u8 a7 = npyv_load_u8(ip + lanes * 7);
+
+        // a lane of the minimum is zero exactly when one of the eight
+        // inputs holds a zero in that lane
+        const npyv_u8 lower = npyv_min_u8(npyv_min_u8(a0, a1), npyv_min_u8(a2, a3));
+        const npyv_u8 upper = npyv_min_u8(npyv_min_u8(a4, a5), npyv_min_u8(a6, a7));
+        const npyv_u8 folded = npyv_min_u8(lower, upper);
+
+        if (!npyv_all_u8(folded)) {
+            *op = !1;
+            return;
+        }
+        len -= chunk;
+        ip  += chunk;
+    }
+
+    // Phase 2: one vector per iteration
+    while (len >= lanes) {
+        const npyv_u8 cur = npyv_load_u8(ip);
+        if (!npyv_all_u8(cur)) {
+            *op = !1;
+            return;
+        }
+        len -= lanes;
+        ip  += lanes;
+    }
+
+    // Phase 3: scalar tail, accumulating into *op
+    while (len > 0) {
+        const npy_bool acc = *op && *ip;
+        *op = acc;
+        if (!acc) {
+            return;
+        }
+        --len;
+        ++ip;
+    }
+#undef UNROLL
+}
+
+#line 82
+/*
+ * Element-wise logical OR over `len` bytes: op[i] = ip1[i] || ip2[i].
+ * All three pointers are walked with unit stride. The work is split into
+ * three phases: blocks of UNROLL vectors, single vectors, then a scalar tail.
+ */
+static void
+simd_binary_logical_or_BOOL(npy_bool * op, npy_bool * ip1, npy_bool * ip2, npy_intp len)
+{
+    #define UNROLL 16
+    // One unrolled step: load vector number K from each input, combine, store.
+    #define BINARY_OR_STEP(K)                                              \
+        do {                                                               \
+            const npyv_u8 lhs = npyv_load_u8(ip1 + lanes * (K));           \
+            const npyv_u8 rhs = npyv_load_u8(ip2 + lanes * (K));           \
+            npyv_store_u8(op + lanes * (K), simd_logical_or_u8(lhs, rhs)); \
+        } while (0)
+
+    const int lanes = npyv_nlanes_u8;
+    const int chunk = lanes * UNROLL;
+
+    // Phase 1: UNROLL vectors per iteration
+    // (the list of steps below must stay in sync with UNROLL)
+    while (len >= chunk) {
+        BINARY_OR_STEP(0);
+        BINARY_OR_STEP(1);
+        BINARY_OR_STEP(2);
+        BINARY_OR_STEP(3);
+        BINARY_OR_STEP(4);
+        BINARY_OR_STEP(5);
+        BINARY_OR_STEP(6);
+        BINARY_OR_STEP(7);
+        BINARY_OR_STEP(8);
+        BINARY_OR_STEP(9);
+        BINARY_OR_STEP(10);
+        BINARY_OR_STEP(11);
+        BINARY_OR_STEP(12);
+        BINARY_OR_STEP(13);
+        BINARY_OR_STEP(14);
+        BINARY_OR_STEP(15);
+
+        len -= chunk;
+        ip1 += chunk;
+        ip2 += chunk;
+        op  += chunk;
+    }
+    #undef BINARY_OR_STEP
+    #undef UNROLL
+
+    // Phase 2: one vector per iteration
+    while (len >= lanes) {
+        const npyv_u8 lhs = npyv_load_u8(ip1);
+        const npyv_u8 rhs = npyv_load_u8(ip2);
+        npyv_store_u8(op, simd_logical_or_u8(lhs, rhs));
+
+        len -= lanes;
+        ip1 += lanes;
+        ip2 += lanes;
+        op  += lanes;
+    }
+
+    // Phase 3: whatever is left, one byte at a time
+    for (npy_intp i = 0; i < len; ++i) {
+        op[i] = ip1[i] || ip2[i];
+    }
+}
+
+/*
+ * OR-reduction of the `len` bytes at `ip` into `*op`.
+ *
+ * The vector phases store true into `*op` and return as soon as a block is
+ * found for which npyv_any_u8() reports non-zero; otherwise they leave `*op`
+ * alone. The scalar tail folds each remaining byte into the current value of
+ * `*op` and stops at the first true result.
+ */
+static void
+simd_reduce_logical_or_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
+{
+    #define UNROLL 8
+
+    const int lanes = npyv_nlanes_u8;
+    const int chunk = lanes * UNROLL;
+
+    // Phase 1: combine UNROLL vectors with a lane-wise maximum, test once
+    while (len >= chunk) {
+    #if defined(NPY_HAVE_SSE2)
+        NPY_PREFETCH(ip + chunk, 0, 3);
+    #endif
+        const npyv_u8 a0 = npyv_load_u8(ip + lanes * 0);
+        const npyv_u8 a1 = npyv_load_u8(ip + lanes * 1);
+        const npyv_u8 a2 = npyv_load_u8(ip + lanes * 2);
+        const npyv_u8 a3 = npyv_load_u8(ip + lanes * 3);
+        const npyv_u8 a4 = npyv_load_u8(ip + lanes * 4);
+        const npyv_u8 a5 = npyv_load_u8(ip + lanes * 5);
+        const npyv_u8 a6 = npyv_load_u8(ip + lanes * 6);
+        const npyv_u8 a7 = npyv_load_u8(ip + lanes * 7);
+
+        // a lane of the maximum is non-zero exactly when one of the eight
+        // inputs holds a non-zero byte in that lane
+        const npyv_u8 lower = npyv_max_u8(npyv_max_u8(a0, a1), npyv_max_u8(a2, a3));
+        const npyv_u8 upper = npyv_max_u8(npyv_max_u8(a4, a5), npyv_max_u8(a6, a7));
+        const npyv_u8 folded = npyv_max_u8(lower, upper);
+
+        if (npyv_any_u8(folded)) {
+            *op = !0;
+            return;
+        }
+        len -= chunk;
+        ip  += chunk;
+    }
+
+    // Phase 2: one vector per iteration
+    while (len >= lanes) {
+        const npyv_u8 cur = npyv_load_u8(ip);
+        if (npyv_any_u8(cur)) {
+            *op = !0;
+            return;
+        }
+        len -= lanes;
+        ip  += lanes;
+    }
+
+    // Phase 3: scalar tail, accumulating into *op
+    while (len > 0) {
+        const npy_bool acc = *op || *ip;
+        *op = acc;
+        if (acc) {
+            return;
+        }
+        --len;
+        ++ip;
+    }
+#undef UNROLL
+}
+
+
+#line 182
+/*
+ * Element-wise logical NOT over `len` bytes: op[i] = (ip[i] == 0).
+ * Both pointers are walked with unit stride. The work is split into three
+ * phases: blocks of UNROLL vectors, single vectors, then a scalar tail.
+ */
+static void
+simd_logical_not_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
+{
+    #define UNROLL 16
+    // One unrolled step: load vector number K, compare it against zero and
+    // store the comparison mask converted to canonical booleans.
+    #define LOGICAL_NOT_STEP(K)                                          \
+        do {                                                             \
+            const npyv_u8 src = npyv_load_u8(ip + lanes * (K));          \
+            npyv_store_u8(op + lanes * (K),                              \
+                          mask_to_true(npyv_cmpeq_u8(src, zeros)));      \
+        } while (0)
+
+    const int lanes = npyv_nlanes_u8;
+    const int chunk = lanes * UNROLL;
+    const npyv_u8 zeros = npyv_zero_u8();
+
+    // Phase 1: UNROLL vectors per iteration
+    // (the list of steps below must stay in sync with UNROLL)
+    while (len >= chunk) {
+        LOGICAL_NOT_STEP(0);
+        LOGICAL_NOT_STEP(1);
+        LOGICAL_NOT_STEP(2);
+        LOGICAL_NOT_STEP(3);
+        LOGICAL_NOT_STEP(4);
+        LOGICAL_NOT_STEP(5);
+        LOGICAL_NOT_STEP(6);
+        LOGICAL_NOT_STEP(7);
+        LOGICAL_NOT_STEP(8);
+        LOGICAL_NOT_STEP(9);
+        LOGICAL_NOT_STEP(10);
+        LOGICAL_NOT_STEP(11);
+        LOGICAL_NOT_STEP(12);
+        LOGICAL_NOT_STEP(13);
+        LOGICAL_NOT_STEP(14);
+        LOGICAL_NOT_STEP(15);
+
+        len -= chunk;
+        ip  += chunk;
+        op  += chunk;
+    }
+    #undef LOGICAL_NOT_STEP
+    #undef UNROLL
+
+    // Phase 2: one vector per iteration
+    while (len >= lanes) {
+        const npyv_u8 src = npyv_load_u8(ip);
+        npyv_store_u8(op, mask_to_true(npyv_cmpeq_u8(src, zeros)));
+
+        len -= lanes;
+        ip  += lanes;
+        op  += lanes;
+    }
+
+    // Phase 3: whatever is left, one byte at a time
+    for (npy_intp i = 0; i < len; ++i) {
+        op[i] = (ip[i] == 0);
+    }
+}
+
+#line 182
+static void
+simd_absolute_BOOL(npy_bool * op, npy_bool * ip, npy_intp len)
+{
+    /*
+     * Converts `len` bytes from `ip` into `op`: first in blocks of UNROLL
+     * vectors, then one vector at a time, then byte by byte with
+     * `*op = (*ip != 0)`.
+     * NOTE(review): byte_to_true() is defined outside this function; it is
+     * presumably the vector counterpart of the scalar tail -- confirm.
+     */
+    #define UNROLL 16
+    /* One step: load vector number IDX of the block, convert it, store it. */
+    #define ABSOLUTE_BOOL_STEP(IDX)                                      \
+        do {                                                             \
+            npyv_u8 src_vec = npyv_load_u8(ip + lanes * (IDX));          \
+            npyv_u8 dst_vec = byte_to_true(src_vec);                     \
+            npyv_store_u8(op + lanes * (IDX), dst_vec);                  \
+        } while (0)
+
+    const int lanes = npyv_nlanes_u8;
+    const int block = lanes * UNROLL;
+
+    // Blocks of UNROLL (16) vectors, processed in ascending order
+    while (len >= block) {
+        ABSOLUTE_BOOL_STEP(0);
+        ABSOLUTE_BOOL_STEP(1);
+        ABSOLUTE_BOOL_STEP(2);
+        ABSOLUTE_BOOL_STEP(3);
+        ABSOLUTE_BOOL_STEP(4);
+        ABSOLUTE_BOOL_STEP(5);
+        ABSOLUTE_BOOL_STEP(6);
+        ABSOLUTE_BOOL_STEP(7);
+        ABSOLUTE_BOOL_STEP(8);
+        ABSOLUTE_BOOL_STEP(9);
+        ABSOLUTE_BOOL_STEP(10);
+        ABSOLUTE_BOOL_STEP(11);
+        ABSOLUTE_BOOL_STEP(12);
+        ABSOLUTE_BOOL_STEP(13);
+        ABSOLUTE_BOOL_STEP(14);
+        ABSOLUTE_BOOL_STEP(15);
+        len -= block;
+        ip += block;
+        op += block;
+    }
+    #undef UNROLL
+
+    // Whole vectors that did not fill a complete block
+    while (len >= lanes) {
+        ABSOLUTE_BOOL_STEP(0);
+        len -= lanes;
+        ip += lanes;
+        op += lanes;
+    }
+    #undef ABSOLUTE_BOOL_STEP
+
+    // Remaining elements, one byte at a time
+    while (len > 0) {
+        *op = (*ip != 0);
+        --len;
+        ++ip;
+        ++op;
+    }
+}
+
+
+#endif // NPY_SIMD
+
+/*******************************************************************************
+ ** Defining ufunc inner functions
+ ******************************************************************************/
+
+#line 239
+/*
+ * Tries the SIMD kernel for binary logical_or on booleans.
+ * Returns 1 when the kernel handled the call, 0 when the caller must fall
+ * back to its own loop (always 0 when NPY_SIMD is disabled).
+ */
+static NPY_INLINE int
+run_binary_simd_logical_or_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    const int usable = sizeof(npy_bool) == 1 &&
+            IS_BLOCKABLE_BINARY(sizeof(npy_bool), NPY_SIMD_WIDTH);
+    if (!usable) {
+        return 0;
+    }
+    npy_bool *out = (npy_bool*)args[2];
+    npy_bool *lhs = (npy_bool*)args[0];
+    npy_bool *rhs = (npy_bool*)args[1];
+    simd_binary_logical_or_BOOL(out, lhs, rhs, dimensions[0]);
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+
+/*
+ * Tries the SIMD kernel for the logical_or reduction on booleans.
+ * Returns 1 when the kernel handled the call, 0 when the caller must fall
+ * back to its own loop (always 0 when NPY_SIMD is disabled).
+ */
+static NPY_INLINE int
+run_reduce_simd_logical_or_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    const int usable = sizeof(npy_bool) == 1 &&
+            IS_BLOCKABLE_REDUCE(sizeof(npy_bool), NPY_SIMD_WIDTH);
+    if (!usable) {
+        return 0;
+    }
+    npy_bool *acc = (npy_bool*)args[0];
+    npy_bool *src = (npy_bool*)args[1];
+    simd_reduce_logical_or_BOOL(acc, src, dimensions[0]);
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+#line 239
+/*
+ * Tries the SIMD kernel for binary logical_and on booleans.
+ * Returns 1 when the kernel handled the call, 0 when the caller must fall
+ * back to its own loop (always 0 when NPY_SIMD is disabled).
+ */
+static NPY_INLINE int
+run_binary_simd_logical_and_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    const int usable = sizeof(npy_bool) == 1 &&
+            IS_BLOCKABLE_BINARY(sizeof(npy_bool), NPY_SIMD_WIDTH);
+    if (!usable) {
+        return 0;
+    }
+    npy_bool *out = (npy_bool*)args[2];
+    npy_bool *lhs = (npy_bool*)args[0];
+    npy_bool *rhs = (npy_bool*)args[1];
+    simd_binary_logical_and_BOOL(out, lhs, rhs, dimensions[0]);
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+
+/*
+ * Tries the SIMD kernel for the logical_and reduction on booleans.
+ * Returns 1 when the kernel handled the call, 0 when the caller must fall
+ * back to its own loop (always 0 when NPY_SIMD is disabled).
+ */
+static NPY_INLINE int
+run_reduce_simd_logical_and_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    const int usable = sizeof(npy_bool) == 1 &&
+            IS_BLOCKABLE_REDUCE(sizeof(npy_bool), NPY_SIMD_WIDTH);
+    if (!usable) {
+        return 0;
+    }
+    npy_bool *acc = (npy_bool*)args[0];
+    npy_bool *src = (npy_bool*)args[1];
+    simd_reduce_logical_and_BOOL(acc, src, dimensions[0]);
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+
+#line 272
+/*
+ * Tries the SIMD kernel for logical_not on booleans.
+ * Returns 1 when the kernel handled the call, 0 when the caller must fall
+ * back to its own loop (always 0 when NPY_SIMD is disabled).
+ */
+static NPY_INLINE int
+run_unary_simd_logical_not_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    const int usable = sizeof(npy_bool) == 1 &&
+            IS_BLOCKABLE_UNARY(sizeof(npy_bool), NPY_SIMD_WIDTH);
+    if (!usable) {
+        return 0;
+    }
+    npy_bool *out = (npy_bool*)args[1];
+    npy_bool *src = (npy_bool*)args[0];
+    simd_logical_not_BOOL(out, src, dimensions[0]);
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+#line 272
+/*
+ * Tries the SIMD kernel for absolute on booleans.
+ * Returns 1 when the kernel handled the call, 0 when the caller must fall
+ * back to its own loop (always 0 when NPY_SIMD is disabled).
+ */
+static NPY_INLINE int
+run_unary_simd_absolute_BOOL(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if NPY_SIMD
+    const int usable = sizeof(npy_bool) == 1 &&
+            IS_BLOCKABLE_UNARY(sizeof(npy_bool), NPY_SIMD_WIDTH);
+    if (!usable) {
+        return 0;
+    }
+    npy_bool *out = (npy_bool*)args[1];
+    npy_bool *src = (npy_bool*)args[0];
+    simd_absolute_BOOL(out, src, dimensions[0]);
+    return 1;
+#else
+    return 0;
+#endif
+}
+
+
+
+#line 293
+/*
+ * Inner loop for logical_and on booleans.
+ *
+ * Reduction case (IS_BINARY_REDUCE): with NPY_SIMD the SIMD reduce helper is
+ * tried first; without it, a unit-stride input (steps[1] == 1) is scanned
+ * with memchr. In both configurations the `else` that follows the
+ * preprocessor block is the scalar BINARY_REDUCE_LOOP fallback.
+ * Element-wise case: the SIMD binary helper is tried, then BINARY_LOOP.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_logical_and)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if(IS_BINARY_REDUCE) {
+#if NPY_SIMD
+        /*
+         * stick with our variant for more reliable performance, only known
+         * platform which outperforms it by ~20% is an i7 with glibc 2.17
+         */
+        if (run_reduce_simd_logical_and_BOOL(args, dimensions, steps)) {
+            return;
+        }
+#else
+        /* for now only use libc on 32-bit/non-x86 */
+        if (steps[1] == 1) {
+            npy_bool * op = (npy_bool *)args[0];
+#if 1
+            /* np.all(), search for a zero (false) */
+            /* an accumulator that is already false stays false, so skip the scan */
+            if (*op) {
+                *op = memchr(args[1], 0, dimensions[0]) == NULL;
+            }
+#else
+            /* this branch is compiled out here; it is the logical_or variant */
+            /*
+             * np.any(), search for a non-zero (true) via comparing against
+             * zero blocks, memcmp is faster than memchr on SSE4 machines
+             * with glibc >= 2.12 and memchr can only check for equal 1
+             */
+            static const npy_bool zero[4096]; /* zero by C standard */
+            npy_uintp i, n = dimensions[0];
+
+            for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
+                *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
+            }
+            if (!*op && n - i > 0) {
+                *op = memcmp(&args[1][i], zero, n - i) != 0;
+            }
+#endif
+            return;
+        }
+#endif
+        /* pairs with whichever `if` the preprocessor kept above */
+        else {
+            BINARY_REDUCE_LOOP(npy_bool) {
+                const npy_bool in2 = *(npy_bool *)ip2;
+                io1 = io1 && in2;
+                /* once the accumulator is false no later element can change it */
+                if (io1 == 0) {
+                    break;
+                }
+            }
+            *((npy_bool *)iop1) = io1;
+        }
+    }
+    else {
+        /* element-wise: prefer the SIMD kernel, otherwise the scalar loop */
+        if (run_binary_simd_logical_and_BOOL(args, dimensions, steps)) {
+            return;
+        }
+        else {
+            BINARY_LOOP {
+                const npy_bool in1 = *(npy_bool *)ip1;
+                const npy_bool in2 = *(npy_bool *)ip2;
+                *((npy_bool *)op1) = in1 && in2;
+            }
+        }
+    }
+}
+
+#line 293
+/*
+ * Inner loop for logical_or on booleans.
+ *
+ * Reduction case (IS_BINARY_REDUCE): with NPY_SIMD the SIMD reduce helper is
+ * tried first; without it, a unit-stride input (steps[1] == 1) is compared
+ * against a static block of zeros with memcmp. In both configurations the
+ * `else` that follows the preprocessor block is the scalar
+ * BINARY_REDUCE_LOOP fallback.
+ * Element-wise case: the SIMD binary helper is tried, then BINARY_LOOP.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_logical_or)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if(IS_BINARY_REDUCE) {
+#if NPY_SIMD
+        /*
+         * stick with our variant for more reliable performance, only known
+         * platform which outperforms it by ~20% is an i7 with glibc 2.17
+         */
+        if (run_reduce_simd_logical_or_BOOL(args, dimensions, steps)) {
+            return;
+        }
+#else
+        /* for now only use libc on 32-bit/non-x86 */
+        if (steps[1] == 1) {
+            npy_bool * op = (npy_bool *)args[0];
+#if 0
+            /* this branch is compiled out here; it is the logical_and variant */
+            /* np.all(), search for a zero (false) */
+            if (*op) {
+                *op = memchr(args[1], 0, dimensions[0]) == NULL;
+            }
+#else
+            /*
+             * np.any(), search for a non-zero (true) via comparing against
+             * zero blocks, memcmp is faster than memchr on SSE4 machines
+             * with glibc >= 2.12 and memchr can only check for equal 1
+             */
+            static const npy_bool zero[4096]; /* zero by C standard */
+            npy_uintp i, n = dimensions[0];
+
+            /* full sizeof(zero)-sized chunks; stops as soon as *op is true */
+            for (i = 0; !*op && i < n - (n % sizeof(zero)); i += sizeof(zero)) {
+                *op = memcmp(&args[1][i], zero, sizeof(zero)) != 0;
+            }
+            /* remaining partial chunk, if any */
+            if (!*op && n - i > 0) {
+                *op = memcmp(&args[1][i], zero, n - i) != 0;
+            }
+#endif
+            return;
+        }
+#endif
+        /* pairs with whichever `if` the preprocessor kept above */
+        else {
+            BINARY_REDUCE_LOOP(npy_bool) {
+                const npy_bool in2 = *(npy_bool *)ip2;
+                io1 = io1 || in2;
+                /* once the accumulator is true no later element can change it */
+                if (io1 != 0) {
+                    break;
+                }
+            }
+            *((npy_bool *)iop1) = io1;
+        }
+    }
+    else {
+        /* element-wise: prefer the SIMD kernel, otherwise the scalar loop */
+        if (run_binary_simd_logical_or_BOOL(args, dimensions, steps)) {
+            return;
+        }
+        else {
+            BINARY_LOOP {
+                const npy_bool in1 = *(npy_bool *)ip1;
+                const npy_bool in2 = *(npy_bool *)ip2;
+                *((npy_bool *)op1) = in1 || in2;
+            }
+        }
+    }
+}
+
+
+#line 363
+/*
+ * Inner loop for logical_not on booleans: the SIMD helper is tried first and
+ * the scalar UNARY_LOOP runs only when the helper reports it did nothing.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_logical_not)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (!run_unary_simd_logical_not_BOOL(args, dimensions, steps)) {
+        UNARY_LOOP {
+            const npy_bool value = *(npy_bool *)ip1;
+            *((npy_bool *)op1) = (value == 0);
+        }
+    }
+}
+
+#line 363
+/*
+ * Inner loop for absolute on booleans: the SIMD helper is tried first and
+ * the scalar UNARY_LOOP runs only when the helper reports it did nothing.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BOOL_absolute)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    if (!run_unary_simd_absolute_BOOL(args, dimensions, steps)) {
+        UNARY_LOOP {
+            const npy_bool value = *(npy_bool *)ip1;
+            *((npy_bool *)op1) = (value != 0);
+        }
+    }
+}
+
+
+
diff --git a/numpy/core/src/_generated/loops_minmax.dispatch.c b/numpy/core/src/_generated/loops_minmax.dispatch.c
new file mode 100644
index 000000000000..820f553445a1
--- /dev/null
+++ b/numpy/core/src/_generated/loops_minmax.dispatch.c
@@ -0,0 +1,16798 @@
+#line 1 "numpy/core/src/umath/loops_minmax.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*@targets
+ ** $maxopt baseline
+ ** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
+ ** vx vxe
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*******************************************************************************
+ ** Scalar intrinsics
+ ******************************************************************************/
+// Function-like macros below parenthesize their arguments so that compound
+// expressions (e.g. `c ? x : y`) expand correctly. Arguments are still
+// evaluated more than once: pass side-effect free expressions only.
+// signed/unsigned int
+#define scalar_max_i(A, B) (((A) > (B)) ? (A) : (B))
+#define scalar_min_i(A, B) (((A) < (B)) ? (A) : (B))
+// fp, propagates NaNs: a NaN in A is returned as is, and a NaN in B is
+// returned because the comparison with it is false
+#define scalar_max(A, B) ((((A) >= (B)) || npy_isnan(A)) ? (A) : (B))
+#define scalar_max_f scalar_max
+#define scalar_max_d scalar_max
+#define scalar_max_l scalar_max
+#define scalar_min(A, B) ((((A) <= (B)) || npy_isnan(A)) ? (A) : (B))
+#define scalar_min_f scalar_min
+#define scalar_min_d scalar_min
+#define scalar_min_l scalar_min
+// fp, ignores NaNs (C99 fmax/fmin family)
+#define scalar_maxp_f fmaxf
+#define scalar_maxp_d fmax
+#define scalar_maxp_l fmaxl
+#define scalar_minp_f fminf
+#define scalar_minp_d fmin
+#define scalar_minp_l fminl
+
+// Special optimization for the fp scalar variants that propagate NaNs,
+// since C99 provides no function with these semantics.
+#ifndef NPY_DISABLE_OPTIMIZATION
+#line 52
+#line 56
+#ifdef NPY_HAVE_SSE2
+#undef scalar_max_f
+// SSE2 single-precision maximum that returns `a` when `a` is NaN.
+NPY_FINLINE npy_float scalar_max_f(npy_float a, npy_float b) {
+    // both scalars go into the lowest lane of an SSE register
+    const __m128 lhs = _mm_set_ss(a);
+    const __m128 rhs = _mm_set_ss(b);
+    // lowest lane is all-ones when `a` is ordered with itself (not NaN)
+    const __m128 a_ordered = _mm_cmpord_ss(lhs, lhs);
+    // the x86 max yields its second operand when an input is NaN, so a NaN
+    // held in `a` must be selected back explicitly
+    const __m128 raw_max = _mm_max_ss(lhs, rhs);
+    #ifdef NPY_HAVE_SSE41
+    const __m128 picked = _mm_blendv_ps(lhs, raw_max, a_ordered);
+    #else
+    // bitwise select: lhs ^ ((lhs ^ raw_max) & mask)
+    const __m128 diff = _mm_and_ps(_mm_xor_ps(lhs, raw_max), a_ordered);
+    const __m128 picked = _mm_xor_ps(lhs, diff);
+    #endif
+    return _mm_cvtss_f32(picked);
+}
+#endif // SSE2
+#ifdef __aarch64__
+#undef scalar_max_f
+// AArch64 single-precision maximum implemented with one scalar `fmax`
+// instruction (the `%s` operand modifier names the 32-bit view of the
+// register).
+// NOTE(review): this replaces the NaN-propagating scalar_max macro, so it
+// relies on `fmax` returning NaN when either input is NaN -- confirm
+// against the Arm instruction reference.
+NPY_FINLINE npy_float scalar_max_f(npy_float a, npy_float b) {
+    npy_float result = 0;
+    __asm(
+        "fmax %s[result], %s[a], %s[b]"
+        : [result] "=w" (result)
+        : [a] "w" (a), [b] "w" (b)
+    );
+    return result;
+}
+#endif // __aarch64__
+
+#line 56
+#ifdef NPY_HAVE_SSE2
+#undef scalar_min_f
+// SSE2 single-precision minimum that returns `a` when `a` is NaN.
+NPY_FINLINE npy_float scalar_min_f(npy_float a, npy_float b) {
+    // both scalars go into the lowest lane of an SSE register
+    const __m128 lhs = _mm_set_ss(a);
+    const __m128 rhs = _mm_set_ss(b);
+    // lowest lane is all-ones when `a` is ordered with itself (not NaN)
+    const __m128 a_ordered = _mm_cmpord_ss(lhs, lhs);
+    // the x86 min yields its second operand when an input is NaN, so a NaN
+    // held in `a` must be selected back explicitly
+    const __m128 raw_min = _mm_min_ss(lhs, rhs);
+    #ifdef NPY_HAVE_SSE41
+    const __m128 picked = _mm_blendv_ps(lhs, raw_min, a_ordered);
+    #else
+    // bitwise select: lhs ^ ((lhs ^ raw_min) & mask)
+    const __m128 diff = _mm_and_ps(_mm_xor_ps(lhs, raw_min), a_ordered);
+    const __m128 picked = _mm_xor_ps(lhs, diff);
+    #endif
+    return _mm_cvtss_f32(picked);
+}
+#endif // SSE2
+#ifdef __aarch64__
+#undef scalar_min_f
+// AArch64 single-precision minimum implemented with one scalar `fmin`
+// instruction (the `%s` operand modifier names the 32-bit view of the
+// register).
+// NOTE(review): this replaces the NaN-propagating scalar_min macro, so it
+// relies on `fmin` returning NaN when either input is NaN -- confirm
+// against the Arm instruction reference.
+NPY_FINLINE npy_float scalar_min_f(npy_float a, npy_float b) {
+    npy_float result = 0;
+    __asm(
+        "fmin %s[result], %s[a], %s[b]"
+        : [result] "=w" (result)
+        : [a] "w" (a), [b] "w" (b)
+    );
+    return result;
+}
+#endif // __aarch64__
+
+
+#line 52
+#line 56
+#ifdef NPY_HAVE_SSE2
+#undef scalar_max_d
+// SSE2 double-precision maximum that returns `a` when `a` is NaN.
+NPY_FINLINE npy_double scalar_max_d(npy_double a, npy_double b) {
+    // both scalars go into the lowest lane of an SSE register
+    const __m128d lhs = _mm_set_sd(a);
+    const __m128d rhs = _mm_set_sd(b);
+    // lowest lane is all-ones when `a` is ordered with itself (not NaN)
+    const __m128d a_ordered = _mm_cmpord_sd(lhs, lhs);
+    // the x86 max yields its second operand when an input is NaN, so a NaN
+    // held in `a` must be selected back explicitly
+    const __m128d raw_max = _mm_max_sd(lhs, rhs);
+    #ifdef NPY_HAVE_SSE41
+    const __m128d picked = _mm_blendv_pd(lhs, raw_max, a_ordered);
+    #else
+    // bitwise select: lhs ^ ((lhs ^ raw_max) & mask)
+    const __m128d diff = _mm_and_pd(_mm_xor_pd(lhs, raw_max), a_ordered);
+    const __m128d picked = _mm_xor_pd(lhs, diff);
+    #endif
+    return _mm_cvtsd_f64(picked);
+}
+#endif // SSE2
+#ifdef __aarch64__
+#undef scalar_max_d
+// AArch64 double-precision maximum implemented with one scalar `fmax`
+// instruction (the `%d` operand modifier names the 64-bit view of the
+// register).
+// NOTE(review): this replaces the NaN-propagating scalar_max macro, so it
+// relies on `fmax` returning NaN when either input is NaN -- confirm
+// against the Arm instruction reference.
+NPY_FINLINE npy_double scalar_max_d(npy_double a, npy_double b) {
+    npy_double result = 0;
+    __asm(
+        "fmax %d[result], %d[a], %d[b]"
+        : [result] "=w" (result)
+        : [a] "w" (a), [b] "w" (b)
+    );
+    return result;
+}
+#endif // __aarch64__
+
+#line 56
+#ifdef NPY_HAVE_SSE2
+#undef scalar_min_d
+// SSE2 double-precision minimum that returns `a` when `a` is NaN.
+NPY_FINLINE npy_double scalar_min_d(npy_double a, npy_double b) {
+    // both scalars go into the lowest lane of an SSE register
+    const __m128d lhs = _mm_set_sd(a);
+    const __m128d rhs = _mm_set_sd(b);
+    // lowest lane is all-ones when `a` is ordered with itself (not NaN)
+    const __m128d a_ordered = _mm_cmpord_sd(lhs, lhs);
+    // the x86 min yields its second operand when an input is NaN, so a NaN
+    // held in `a` must be selected back explicitly
+    const __m128d raw_min = _mm_min_sd(lhs, rhs);
+    #ifdef NPY_HAVE_SSE41
+    const __m128d picked = _mm_blendv_pd(lhs, raw_min, a_ordered);
+    #else
+    // bitwise select: lhs ^ ((lhs ^ raw_min) & mask)
+    const __m128d diff = _mm_and_pd(_mm_xor_pd(lhs, raw_min), a_ordered);
+    const __m128d picked = _mm_xor_pd(lhs, diff);
+    #endif
+    return _mm_cvtsd_f64(picked);
+}
+#endif // SSE2
+#ifdef __aarch64__
+#undef scalar_min_d
+// AArch64 double-precision minimum implemented with one scalar `fmin`
+// instruction (the `%d` operand modifier names the 64-bit view of the
+// register).
+// NOTE(review): this replaces the NaN-propagating scalar_min macro, so it
+// relies on `fmin` returning NaN when either input is NaN -- confirm
+// against the Arm instruction reference.
+NPY_FINLINE npy_double scalar_min_d(npy_double a, npy_double b) {
+    npy_double result = 0;
+    __asm(
+        "fmin %d[result], %d[a], %d[b]"
+        : [result] "=w" (result)
+        : [a] "w" (a), [b] "w" (b)
+    );
+    return result;
+}
+#endif // __aarch64__
+
+
+#endif // NPY_DISABLE_OPTIMIZATION
+// map the long double variants to the double ones when both types have the same width
+#if NPY_BITSOF_DOUBLE == NPY_BITSOF_LONGDOUBLE
+#line 92
+    #undef scalar_max_l
+    #define scalar_max_l scalar_max_d
+
+#line 92
+    #undef scalar_min_l
+    #define scalar_min_l scalar_min_d
+
+#line 92
+    #undef scalar_maxp_l
+    #define scalar_maxp_l scalar_maxp_d
+
+#line 92
+    #undef scalar_minp_l
+    #define scalar_minp_l scalar_minp_d
+
+#endif
+
+/*******************************************************************************
+ ** Defining the SIMD kernels
+ ******************************************************************************/
+#line 106
+#line 110
+#define SCALAR_OP scalar_max_i
+#if NPY_SIMD && (!0 || (0 && 0))
+
+#if 0 && !0
+    #define V_INTRIN npyv_maxn_s8 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxn_s8
+#else
+    #define V_INTRIN npyv_max_s8
+    #define V_REDUCE_INTRIN npyv_reduce_max_s8
+#endif
+
+// contiguous input.
+static inline void
+simd_reduce_c_max_s8(const npyv_lanetype_s8 *ip, npyv_lanetype_s8 *op1, npy_intp len)
+{
+    /*
+     * Folds `len` contiguous elements of `ip` into op1[0] using V_INTRIN for
+     * vectors and SCALAR_OP for the tail. op1[0] is also the starting value;
+     * nothing is written when len < 1.
+     */
+    if (len < 1) {
+        return;
+    }
+    const int lanes = npyv_nlanes_s8;
+    const int block = lanes*8;
+    // every lane starts from the current value of the output
+    npyv_s8 vacc = npyv_setall_s8(op1[0]);
+
+    // 8 vectors per iteration, combined pairwise as a balanced tree
+    while (len >= block) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + block, 0, 3);
+    #endif
+        npyv_s8 a0 = npyv_load_s8(ip);
+        npyv_s8 a1 = npyv_load_s8(ip + lanes);
+        npyv_s8 a2 = npyv_load_s8(ip + lanes * 2);
+        npyv_s8 a3 = npyv_load_s8(ip + lanes * 3);
+        npyv_s8 a4 = npyv_load_s8(ip + lanes * 4);
+        npyv_s8 a5 = npyv_load_s8(ip + lanes * 5);
+        npyv_s8 a6 = npyv_load_s8(ip + lanes * 6);
+        npyv_s8 a7 = npyv_load_s8(ip + lanes * 7);
+
+        npyv_s8 low_half = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
+        npyv_s8 high_half = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
+        vacc = V_INTRIN(vacc, V_INTRIN(low_half, high_half));
+
+        len -= block;
+        ip += block;
+    }
+    // whole vectors that did not fill a block
+    while (len >= lanes) {
+        vacc = V_INTRIN(vacc, npyv_load_s8(ip));
+        len -= lanes;
+        ip += lanes;
+    }
+    // collapse the lanes, then fold in the leftover scalars
+    npyv_lanetype_s8 result = V_REDUCE_INTRIN(vacc);
+    while (len > 0) {
+        const npyv_lanetype_s8 item = *ip;
+        result = SCALAR_OP(result, item);
+        --len;
+        ++ip;
+    }
+    op1[0] = result;
+}
+
+// contiguous inputs and output.
+static inline void
+simd_binary_ccc_max_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2,
+                                     npyv_lanetype_s8 *op1, npy_intp len)
+{
+    /*
+     * Element-wise V_INTRIN (SCALAR_OP for the tail) of two contiguous
+     * inputs into a contiguous output. Within one unrolled iteration every
+     * load is issued before the first store.
+     */
+    // 128-bit SIMD: 6 vectors per iteration (chosen for best results on
+    // Apple M1); otherwise 2, to avoid a memory bandwidth bottleneck.
+    const int unroll = (NPY_SIMD_WIDTH == 128) ? 6 : 2;
+    const int lanes = npyv_nlanes_s8;
+    const int block = unroll * lanes;
+    npy_intp pos = 0;
+
+    while (pos + block <= len) {
+        const npyv_lanetype_s8 *lhs = ip1 + pos;
+        const npyv_lanetype_s8 *rhs = ip2 + pos;
+        npyv_lanetype_s8 *dst = op1 + pos;
+
+        npyv_s8 a0 = npyv_load_s8(lhs);
+        npyv_s8 a1 = npyv_load_s8(lhs + lanes);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s8 a2 = npyv_load_s8(lhs + 2 * lanes);
+        npyv_s8 a3 = npyv_load_s8(lhs + 3 * lanes);
+        npyv_s8 a4 = npyv_load_s8(lhs + 4 * lanes);
+        npyv_s8 a5 = npyv_load_s8(lhs + 5 * lanes);
+    #endif
+        npyv_s8 b0 = npyv_load_s8(rhs);
+        npyv_s8 b1 = npyv_load_s8(rhs + lanes);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s8 b2 = npyv_load_s8(rhs + 2 * lanes);
+        npyv_s8 b3 = npyv_load_s8(rhs + 3 * lanes);
+        npyv_s8 b4 = npyv_load_s8(rhs + 4 * lanes);
+        npyv_s8 b5 = npyv_load_s8(rhs + 5 * lanes);
+    #endif
+        npyv_s8 r0 = V_INTRIN(a0, b0);
+        npyv_s8 r1 = V_INTRIN(a1, b1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s8 r2 = V_INTRIN(a2, b2);
+        npyv_s8 r3 = V_INTRIN(a3, b3);
+        npyv_s8 r4 = V_INTRIN(a4, b4);
+        npyv_s8 r5 = V_INTRIN(a5, b5);
+    #endif
+        npyv_store_s8(dst, r0);
+        npyv_store_s8(dst + lanes, r1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_s8(dst + 2 * lanes, r2);
+        npyv_store_s8(dst + 3 * lanes, r3);
+        npyv_store_s8(dst + 4 * lanes, r4);
+        npyv_store_s8(dst + 5 * lanes, r5);
+    #endif
+        pos += block;
+    }
+    // whole vectors that did not fill a block
+    while (pos + lanes <= len) {
+        npyv_s8 a = npyv_load_s8(ip1 + pos);
+        npyv_s8 b = npyv_load_s8(ip2 + pos);
+        npyv_store_s8(op1 + pos, V_INTRIN(a, b));
+        pos += lanes;
+    }
+    // leftover elements, one at a time
+    while (pos < len) {
+        const npyv_lanetype_s8 x = ip1[pos];
+        const npyv_lanetype_s8 y = ip2[pos];
+        op1[pos] = SCALAR_OP(x, y);
+        ++pos;
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
+static inline void
+simd_binary_max_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1,
+                           const npyv_lanetype_s8 *ip2, npy_intp sip2,
+                                 npyv_lanetype_s8 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    /*
+     * Strided element-wise V_INTRIN / SCALAR_OP; strides are counted in
+     * elements. The enclosing `#if 0 && ...` compiles this variant out.
+     */
+    const int lanes = npyv_nlanes_s8;
+    // vector part: unit strides use the contiguous load/store, any other
+    // stride uses the strided (loadn/storen) form
+    while (len >= lanes) {
+        npyv_s8 lhs = (sip1 == 1) ? npyv_load_s8(ip1)
+                                  : npyv_loadn_s8(ip1, sip1);
+        npyv_s8 rhs = (sip2 == 1) ? npyv_load_s8(ip2)
+                                  : npyv_loadn_s8(ip2, sip2);
+        npyv_s8 res = V_INTRIN(lhs, rhs);
+        if (sop1 != 1) {
+            npyv_storen_s8(op1, sop1, res);
+        } else {
+            npyv_store_s8(op1, res);
+        }
+        len -= lanes;
+        ip1 += sip1*lanes;
+        ip2 += sip2*lanes;
+        op1 += sop1*lanes;
+    }
+    // scalar tail
+    while (len > 0) {
+        const npyv_lanetype_s8 x = *ip1;
+        const npyv_lanetype_s8 y = *ip2;
+        *op1 = SCALAR_OP(x, y);
+        --len;
+        ip1 += sip1;
+        ip2 += sip2;
+        op1 += sop1;
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_min_i
+#if NPY_SIMD && (!0 || (0 && 0))
+
+#if 0 && !0
+    #define V_INTRIN npyv_minn_s8 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minn_s8
+#else
+    #define V_INTRIN npyv_min_s8
+    #define V_REDUCE_INTRIN npyv_reduce_min_s8
+#endif
+
+// contiguous input.
+static inline void
+simd_reduce_c_min_s8(const npyv_lanetype_s8 *ip, npyv_lanetype_s8 *op1, npy_intp len)
+{
+    /*
+     * Folds `len` contiguous elements of `ip` into op1[0] using V_INTRIN for
+     * vectors and SCALAR_OP for the tail. op1[0] is also the starting value;
+     * nothing is written when len < 1.
+     */
+    if (len < 1) {
+        return;
+    }
+    const int lanes = npyv_nlanes_s8;
+    const int block = lanes*8;
+    // every lane starts from the current value of the output
+    npyv_s8 vacc = npyv_setall_s8(op1[0]);
+
+    // 8 vectors per iteration, combined pairwise as a balanced tree
+    while (len >= block) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + block, 0, 3);
+    #endif
+        npyv_s8 a0 = npyv_load_s8(ip);
+        npyv_s8 a1 = npyv_load_s8(ip + lanes);
+        npyv_s8 a2 = npyv_load_s8(ip + lanes * 2);
+        npyv_s8 a3 = npyv_load_s8(ip + lanes * 3);
+        npyv_s8 a4 = npyv_load_s8(ip + lanes * 4);
+        npyv_s8 a5 = npyv_load_s8(ip + lanes * 5);
+        npyv_s8 a6 = npyv_load_s8(ip + lanes * 6);
+        npyv_s8 a7 = npyv_load_s8(ip + lanes * 7);
+
+        npyv_s8 low_half = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
+        npyv_s8 high_half = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
+        vacc = V_INTRIN(vacc, V_INTRIN(low_half, high_half));
+
+        len -= block;
+        ip += block;
+    }
+    // whole vectors that did not fill a block
+    while (len >= lanes) {
+        vacc = V_INTRIN(vacc, npyv_load_s8(ip));
+        len -= lanes;
+        ip += lanes;
+    }
+    // collapse the lanes, then fold in the leftover scalars
+    npyv_lanetype_s8 result = V_REDUCE_INTRIN(vacc);
+    while (len > 0) {
+        const npyv_lanetype_s8 item = *ip;
+        result = SCALAR_OP(result, item);
+        --len;
+        ++ip;
+    }
+    op1[0] = result;
+}
+
+// contiguous inputs and output.
+static inline void
+simd_binary_ccc_min_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2,
+                                     npyv_lanetype_s8 *op1, npy_intp len)
+{
+    /*
+     * Element-wise V_INTRIN (SCALAR_OP for the tail) of two contiguous
+     * inputs into a contiguous output. Within one unrolled iteration every
+     * load is issued before the first store.
+     */
+    // 128-bit SIMD: 6 vectors per iteration (chosen for best results on
+    // Apple M1); otherwise 2, to avoid a memory bandwidth bottleneck.
+    const int unroll = (NPY_SIMD_WIDTH == 128) ? 6 : 2;
+    const int lanes = npyv_nlanes_s8;
+    const int block = unroll * lanes;
+    npy_intp pos = 0;
+
+    while (pos + block <= len) {
+        const npyv_lanetype_s8 *lhs = ip1 + pos;
+        const npyv_lanetype_s8 *rhs = ip2 + pos;
+        npyv_lanetype_s8 *dst = op1 + pos;
+
+        npyv_s8 a0 = npyv_load_s8(lhs);
+        npyv_s8 a1 = npyv_load_s8(lhs + lanes);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s8 a2 = npyv_load_s8(lhs + 2 * lanes);
+        npyv_s8 a3 = npyv_load_s8(lhs + 3 * lanes);
+        npyv_s8 a4 = npyv_load_s8(lhs + 4 * lanes);
+        npyv_s8 a5 = npyv_load_s8(lhs + 5 * lanes);
+    #endif
+        npyv_s8 b0 = npyv_load_s8(rhs);
+        npyv_s8 b1 = npyv_load_s8(rhs + lanes);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s8 b2 = npyv_load_s8(rhs + 2 * lanes);
+        npyv_s8 b3 = npyv_load_s8(rhs + 3 * lanes);
+        npyv_s8 b4 = npyv_load_s8(rhs + 4 * lanes);
+        npyv_s8 b5 = npyv_load_s8(rhs + 5 * lanes);
+    #endif
+        npyv_s8 r0 = V_INTRIN(a0, b0);
+        npyv_s8 r1 = V_INTRIN(a1, b1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s8 r2 = V_INTRIN(a2, b2);
+        npyv_s8 r3 = V_INTRIN(a3, b3);
+        npyv_s8 r4 = V_INTRIN(a4, b4);
+        npyv_s8 r5 = V_INTRIN(a5, b5);
+    #endif
+        npyv_store_s8(dst, r0);
+        npyv_store_s8(dst + lanes, r1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_s8(dst + 2 * lanes, r2);
+        npyv_store_s8(dst + 3 * lanes, r3);
+        npyv_store_s8(dst + 4 * lanes, r4);
+        npyv_store_s8(dst + 5 * lanes, r5);
+    #endif
+        pos += block;
+    }
+    // whole vectors that did not fill a block
+    while (pos + lanes <= len) {
+        npyv_s8 a = npyv_load_s8(ip1 + pos);
+        npyv_s8 b = npyv_load_s8(ip2 + pos);
+        npyv_store_s8(op1 + pos, V_INTRIN(a, b));
+        pos += lanes;
+    }
+    // leftover elements, one at a time
+    while (pos < len) {
+        const npyv_lanetype_s8 x = ip1[pos];
+        const npyv_lanetype_s8 y = ip2[pos];
+        op1[pos] = SCALAR_OP(x, y);
+        ++pos;
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
+static inline void
+simd_binary_min_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1,
+                           const npyv_lanetype_s8 *ip2, npy_intp sip2,
+                                 npyv_lanetype_s8 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    /*
+     * Strided element-wise V_INTRIN / SCALAR_OP; strides are counted in
+     * elements. The enclosing `#if 0 && ...` compiles this variant out.
+     */
+    const int lanes = npyv_nlanes_s8;
+    // vector part: unit strides use the contiguous load/store, any other
+    // stride uses the strided (loadn/storen) form
+    while (len >= lanes) {
+        npyv_s8 lhs = (sip1 == 1) ? npyv_load_s8(ip1)
+                                  : npyv_loadn_s8(ip1, sip1);
+        npyv_s8 rhs = (sip2 == 1) ? npyv_load_s8(ip2)
+                                  : npyv_loadn_s8(ip2, sip2);
+        npyv_s8 res = V_INTRIN(lhs, rhs);
+        if (sop1 != 1) {
+            npyv_storen_s8(op1, sop1, res);
+        } else {
+            npyv_store_s8(op1, res);
+        }
+        len -= lanes;
+        ip1 += sip1*lanes;
+        ip2 += sip2*lanes;
+        op1 += sop1*lanes;
+    }
+    // scalar tail
+    while (len > 0) {
+        const npyv_lanetype_s8 x = *ip1;
+        const npyv_lanetype_s8 y = *ip2;
+        *op1 = SCALAR_OP(x, y);
+        --len;
+        ip1 += sip1;
+        ip2 += sip2;
+        op1 += sop1;
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_maxp_i
+#if NPY_SIMD && (!1 || (0 && 1))
+
+#if 0 && !1
+    #define V_INTRIN npyv_maxpn_s8 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxpn_s8
+#else
+    #define V_INTRIN npyv_maxp_s8
+    #define V_REDUCE_INTRIN npyv_reduce_maxp_s8
+#endif
+
+// contiguous input.
+static inline void
+simd_reduce_c_maxp_s8(const npyv_lanetype_s8 *ip, npyv_lanetype_s8 *op1, npy_intp len)
+{
+    /*
+     * Folds `len` contiguous elements of `ip` into op1[0] using V_INTRIN for
+     * vectors and SCALAR_OP for the tail. op1[0] is also the starting value;
+     * nothing is written when len < 1. The enclosing
+     * `#if NPY_SIMD && (!1 || (0 && 1))` compiles this variant out.
+     */
+    if (len < 1) {
+        return;
+    }
+    const int lanes = npyv_nlanes_s8;
+    const int block = lanes*8;
+    // every lane starts from the current value of the output
+    npyv_s8 vacc = npyv_setall_s8(op1[0]);
+
+    // 8 vectors per iteration, combined pairwise as a balanced tree
+    while (len >= block) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + block, 0, 3);
+    #endif
+        npyv_s8 a0 = npyv_load_s8(ip);
+        npyv_s8 a1 = npyv_load_s8(ip + lanes);
+        npyv_s8 a2 = npyv_load_s8(ip + lanes * 2);
+        npyv_s8 a3 = npyv_load_s8(ip + lanes * 3);
+        npyv_s8 a4 = npyv_load_s8(ip + lanes * 4);
+        npyv_s8 a5 = npyv_load_s8(ip + lanes * 5);
+        npyv_s8 a6 = npyv_load_s8(ip + lanes * 6);
+        npyv_s8 a7 = npyv_load_s8(ip + lanes * 7);
+
+        npyv_s8 low_half = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
+        npyv_s8 high_half = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
+        vacc = V_INTRIN(vacc, V_INTRIN(low_half, high_half));
+
+        len -= block;
+        ip += block;
+    }
+    // whole vectors that did not fill a block
+    while (len >= lanes) {
+        vacc = V_INTRIN(vacc, npyv_load_s8(ip));
+        len -= lanes;
+        ip += lanes;
+    }
+    // collapse the lanes, then fold in the leftover scalars
+    npyv_lanetype_s8 result = V_REDUCE_INTRIN(vacc);
+    while (len > 0) {
+        const npyv_lanetype_s8 item = *ip;
+        result = SCALAR_OP(result, item);
+        --len;
+        ++ip;
+    }
+    op1[0] = result;
+}
+
+// Contiguous inputs and output: writes op(ip1[i], ip2[i]) to op1[i] for every i in [0, len).
+static inline void
+simd_binary_ccc_maxp_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2,
+                                     npyv_lanetype_s8 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // Note, 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // To avoid memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_s8;  // elements moved per vector load/store
+    int elemPerLoop = vectorsPerLoop * elemPerVector;  // elements handled per unrolled pass
+
+    npy_intp i = 0;  // index of the next unprocessed element
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {  // unrolled: all loads, then combines, then stores
+        npyv_s8 v0 = npyv_load_s8(&ip1[i + 0 * elemPerVector]);
+        npyv_s8 v1 = npyv_load_s8(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128  // vectors 2..5 exist only in the 6x-unrolled variant
+        npyv_s8 v2 = npyv_load_s8(&ip1[i + 2 * elemPerVector]);
+        npyv_s8 v3 = npyv_load_s8(&ip1[i + 3 * elemPerVector]);
+        npyv_s8 v4 = npyv_load_s8(&ip1[i + 4 * elemPerVector]);
+        npyv_s8 v5 = npyv_load_s8(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_s8 u0 = npyv_load_s8(&ip2[i + 0 * elemPerVector]);
+        npyv_s8 u1 = npyv_load_s8(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s8 u2 = npyv_load_s8(&ip2[i + 2 * elemPerVector]);
+        npyv_s8 u3 = npyv_load_s8(&ip2[i + 3 * elemPerVector]);
+        npyv_s8 u4 = npyv_load_s8(&ip2[i + 4 * elemPerVector]);
+        npyv_s8 u5 = npyv_load_s8(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_s8 m0 = V_INTRIN(v0, u0);
+        npyv_s8 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s8 m2 = V_INTRIN(v2, u2);
+        npyv_s8 m3 = V_INTRIN(v3, u3);
+        npyv_s8 m4 = V_INTRIN(v4, u4);
+        npyv_s8 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_s8(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_s8(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_s8(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_s8(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_s8(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_s8(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {  // leftover whole vectors, one per pass
+        npyv_s8 v0 = npyv_load_s8(ip1 + i);
+        npyv_s8 u0 = npyv_load_s8(ip2 + i);
+        npyv_s8 m0 = V_INTRIN(v0, u0);
+        npyv_store_s8(op1 + i, m0);
+    }
+    // Scalar tail: elements left after the last whole vector go through SCALAR_OP
+    for (; i < len; ++i) {
+        const npyv_lanetype_s8 in1 = ip1[i];
+        const npyv_lanetype_s8 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// Strided (non-contiguous) kernel; per the original note it targets float 32/64-bit memory access.
+#if 0 && !defined(NPY_HAVE_NEON)  // leading 0 makes this constant-false: never compiled here
+// NEON is excluded because unrolled scalars are faster than non-contiguous vector load/store on Arm
+static inline void
+simd_binary_maxp_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1,
+                           const npyv_lanetype_s8 *ip2, npy_intp sip2,
+                                 npyv_lanetype_s8 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_s8;  // elements produced per vector pass
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {  // strides are added to typed pointers, so they count elements, not bytes
+        npyv_s8 a, b;
+        if (sip1 == 1) {  // unit stride: ordinary contiguous load
+            a = npyv_load_s8(ip1);
+        } else {
+            a = npyv_loadn_s8(ip1, sip1);  // otherwise the strided-load variant
+        }
+        if (sip2 == 1) {
+            b = npyv_load_s8(ip2);
+        } else {
+            b = npyv_loadn_s8(ip2, sip2);
+        }
+        npyv_s8 r = V_INTRIN(a, b);
+        if (sop1 == 1) {  // same unit-stride split for the store
+            npyv_store_s8(op1, r);
+        } else {
+            npyv_storen_s8(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {  // scalar tail, one element per pass
+        const npyv_lanetype_s8 a = *ip1;
+        const npyv_lanetype_s8 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_minp_i  // scalar counterpart used by the tail loops of this section
+#if NPY_SIMD && (!1 || (0 && 1))  // (!1 || (0 && 1)) is 0: this whole section is compiled out
+
+#if 0 && !1  // constant-false: selects the #else definitions
+    #define V_INTRIN npyv_minpn_s8 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minpn_s8
+#else
+    #define V_INTRIN npyv_minp_s8  // element-wise vector op used by the kernels below
+    #define V_REDUCE_INTRIN npyv_reduce_minp_s8  // vector-to-scalar step of the reduce kernel
+#endif
+
+// Contiguous reduction: folds `len` elements at `ip` into op1[0], which also seeds the accumulator.
+static inline void
+simd_reduce_c_minp_s8(const npyv_lanetype_s8 *ip, npyv_lanetype_s8 *op1, npy_intp len)
+{
+    if (len < 1) {  // nothing to fold; op1[0] is left untouched
+        return;
+    }
+    const int vstep = npyv_nlanes_s8;  // elements consumed per vector load
+    const int wstep = vstep*8;  // elements consumed per 8-vector unrolled pass
+    npyv_s8 acc = npyv_setall_s8(op1[0]);  // accumulator built from the current op1[0]
+    for (; len >= wstep; len -= wstep, ip += wstep) {  // main loop: 8 vectors per pass
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);  // prefetch request aimed at the next 8-vector block
+    #endif
+        npyv_s8 v0 = npyv_load_s8(ip + vstep * 0);
+        npyv_s8 v1 = npyv_load_s8(ip + vstep * 1);
+        npyv_s8 v2 = npyv_load_s8(ip + vstep * 2);
+        npyv_s8 v3 = npyv_load_s8(ip + vstep * 3);
+
+        npyv_s8 v4 = npyv_load_s8(ip + vstep * 4);
+        npyv_s8 v5 = npyv_load_s8(ip + vstep * 5);
+        npyv_s8 v6 = npyv_load_s8(ip + vstep * 6);
+        npyv_s8 v7 = npyv_load_s8(ip + vstep * 7);
+
+        npyv_s8 r01 = V_INTRIN(v0, v1);  // first level of the pairwise combine tree
+        npyv_s8 r23 = V_INTRIN(v2, v3);
+        npyv_s8 r45 = V_INTRIN(v4, v5);
+        npyv_s8 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));  // finish the tree, merge into acc
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {  // leftover whole vectors, one per pass
+        acc = V_INTRIN(acc, npyv_load_s8(ip));
+    }
+    npyv_lanetype_s8 r = V_REDUCE_INTRIN(acc);  // collapse the vector accumulator to one scalar
+    // Scalar tail: fold in the elements that did not fill a whole vector
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_s8 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;  // write the combined result back
+}
+
+// Contiguous inputs and output: writes op(ip1[i], ip2[i]) to op1[i] for every i in [0, len).
+static inline void
+simd_binary_ccc_minp_s8(const npyv_lanetype_s8 *ip1, const npyv_lanetype_s8 *ip2,
+                                     npyv_lanetype_s8 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // Note, 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // To avoid memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_s8;  // elements moved per vector load/store
+    int elemPerLoop = vectorsPerLoop * elemPerVector;  // elements handled per unrolled pass
+
+    npy_intp i = 0;  // index of the next unprocessed element
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {  // unrolled: all loads, then combines, then stores
+        npyv_s8 v0 = npyv_load_s8(&ip1[i + 0 * elemPerVector]);
+        npyv_s8 v1 = npyv_load_s8(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128  // vectors 2..5 exist only in the 6x-unrolled variant
+        npyv_s8 v2 = npyv_load_s8(&ip1[i + 2 * elemPerVector]);
+        npyv_s8 v3 = npyv_load_s8(&ip1[i + 3 * elemPerVector]);
+        npyv_s8 v4 = npyv_load_s8(&ip1[i + 4 * elemPerVector]);
+        npyv_s8 v5 = npyv_load_s8(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_s8 u0 = npyv_load_s8(&ip2[i + 0 * elemPerVector]);
+        npyv_s8 u1 = npyv_load_s8(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s8 u2 = npyv_load_s8(&ip2[i + 2 * elemPerVector]);
+        npyv_s8 u3 = npyv_load_s8(&ip2[i + 3 * elemPerVector]);
+        npyv_s8 u4 = npyv_load_s8(&ip2[i + 4 * elemPerVector]);
+        npyv_s8 u5 = npyv_load_s8(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_s8 m0 = V_INTRIN(v0, u0);
+        npyv_s8 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s8 m2 = V_INTRIN(v2, u2);
+        npyv_s8 m3 = V_INTRIN(v3, u3);
+        npyv_s8 m4 = V_INTRIN(v4, u4);
+        npyv_s8 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_s8(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_s8(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_s8(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_s8(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_s8(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_s8(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {  // leftover whole vectors, one per pass
+        npyv_s8 v0 = npyv_load_s8(ip1 + i);
+        npyv_s8 u0 = npyv_load_s8(ip2 + i);
+        npyv_s8 m0 = V_INTRIN(v0, u0);
+        npyv_store_s8(op1 + i, m0);
+    }
+    // Scalar tail: elements left after the last whole vector go through SCALAR_OP
+    for (; i < len; ++i) {
+        const npyv_lanetype_s8 in1 = ip1[i];
+        const npyv_lanetype_s8 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// Strided (non-contiguous) kernel; per the original note it targets float 32/64-bit memory access.
+#if 0 && !defined(NPY_HAVE_NEON)  // leading 0 makes this constant-false: never compiled here
+// NEON is excluded because unrolled scalars are faster than non-contiguous vector load/store on Arm
+static inline void
+simd_binary_minp_s8(const npyv_lanetype_s8 *ip1, npy_intp sip1,
+                           const npyv_lanetype_s8 *ip2, npy_intp sip2,
+                                 npyv_lanetype_s8 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_s8;  // elements produced per vector pass
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {  // strides are added to typed pointers, so they count elements, not bytes
+        npyv_s8 a, b;
+        if (sip1 == 1) {  // unit stride: ordinary contiguous load
+            a = npyv_load_s8(ip1);
+        } else {
+            a = npyv_loadn_s8(ip1, sip1);  // otherwise the strided-load variant
+        }
+        if (sip2 == 1) {
+            b = npyv_load_s8(ip2);
+        } else {
+            b = npyv_loadn_s8(ip2, sip2);
+        }
+        npyv_s8 r = V_INTRIN(a, b);
+        if (sop1 == 1) {  // same unit-stride split for the store
+            npyv_store_s8(op1, r);
+        } else {
+            npyv_storen_s8(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {  // scalar tail, one element per pass
+        const npyv_lanetype_s8 a = *ip1;
+        const npyv_lanetype_s8 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+
+#line 106
+#line 110
+#define SCALAR_OP scalar_max_i  // scalar counterpart used by the tail loops of this section
+#if NPY_SIMD && (!0 || (0 && 0))  // (!0 || ...) is 1: section is built whenever NPY_SIMD is non-zero
+
+#if 0 && !0  // constant-false: selects the #else definitions
+    #define V_INTRIN npyv_maxn_u8 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxn_u8
+#else
+    #define V_INTRIN npyv_max_u8  // element-wise vector op used by the kernels below
+    #define V_REDUCE_INTRIN npyv_reduce_max_u8  // vector-to-scalar step of the reduce kernel
+#endif
+
+// Contiguous reduction: folds `len` elements at `ip` into op1[0], which also seeds the accumulator.
+static inline void
+simd_reduce_c_max_u8(const npyv_lanetype_u8 *ip, npyv_lanetype_u8 *op1, npy_intp len)
+{
+    if (len < 1) {  // nothing to fold; op1[0] is left untouched
+        return;
+    }
+    const int vstep = npyv_nlanes_u8;  // elements consumed per vector load
+    const int wstep = vstep*8;  // elements consumed per 8-vector unrolled pass
+    npyv_u8 acc = npyv_setall_u8(op1[0]);  // accumulator built from the current op1[0]
+    for (; len >= wstep; len -= wstep, ip += wstep) {  // main loop: 8 vectors per pass
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);  // prefetch request aimed at the next 8-vector block
+    #endif
+        npyv_u8 v0 = npyv_load_u8(ip + vstep * 0);
+        npyv_u8 v1 = npyv_load_u8(ip + vstep * 1);
+        npyv_u8 v2 = npyv_load_u8(ip + vstep * 2);
+        npyv_u8 v3 = npyv_load_u8(ip + vstep * 3);
+
+        npyv_u8 v4 = npyv_load_u8(ip + vstep * 4);
+        npyv_u8 v5 = npyv_load_u8(ip + vstep * 5);
+        npyv_u8 v6 = npyv_load_u8(ip + vstep * 6);
+        npyv_u8 v7 = npyv_load_u8(ip + vstep * 7);
+
+        npyv_u8 r01 = V_INTRIN(v0, v1);  // first level of the pairwise combine tree
+        npyv_u8 r23 = V_INTRIN(v2, v3);
+        npyv_u8 r45 = V_INTRIN(v4, v5);
+        npyv_u8 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));  // finish the tree, merge into acc
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {  // leftover whole vectors, one per pass
+        acc = V_INTRIN(acc, npyv_load_u8(ip));
+    }
+    npyv_lanetype_u8 r = V_REDUCE_INTRIN(acc);  // collapse the vector accumulator to one scalar
+    // Scalar tail: fold in the elements that did not fill a whole vector
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_u8 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;  // write the combined result back
+}
+
+// Contiguous inputs and output: writes op(ip1[i], ip2[i]) to op1[i] for every i in [0, len).
+static inline void
+simd_binary_ccc_max_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2,
+                                     npyv_lanetype_u8 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // Note, 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // To avoid memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_u8;  // elements moved per vector load/store
+    int elemPerLoop = vectorsPerLoop * elemPerVector;  // elements handled per unrolled pass
+
+    npy_intp i = 0;  // index of the next unprocessed element
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {  // unrolled: all loads, then combines, then stores
+        npyv_u8 v0 = npyv_load_u8(&ip1[i + 0 * elemPerVector]);
+        npyv_u8 v1 = npyv_load_u8(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128  // vectors 2..5 exist only in the 6x-unrolled variant
+        npyv_u8 v2 = npyv_load_u8(&ip1[i + 2 * elemPerVector]);
+        npyv_u8 v3 = npyv_load_u8(&ip1[i + 3 * elemPerVector]);
+        npyv_u8 v4 = npyv_load_u8(&ip1[i + 4 * elemPerVector]);
+        npyv_u8 v5 = npyv_load_u8(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_u8 u0 = npyv_load_u8(&ip2[i + 0 * elemPerVector]);
+        npyv_u8 u1 = npyv_load_u8(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u8 u2 = npyv_load_u8(&ip2[i + 2 * elemPerVector]);
+        npyv_u8 u3 = npyv_load_u8(&ip2[i + 3 * elemPerVector]);
+        npyv_u8 u4 = npyv_load_u8(&ip2[i + 4 * elemPerVector]);
+        npyv_u8 u5 = npyv_load_u8(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_u8 m0 = V_INTRIN(v0, u0);
+        npyv_u8 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u8 m2 = V_INTRIN(v2, u2);
+        npyv_u8 m3 = V_INTRIN(v3, u3);
+        npyv_u8 m4 = V_INTRIN(v4, u4);
+        npyv_u8 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_u8(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_u8(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_u8(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_u8(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_u8(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_u8(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {  // leftover whole vectors, one per pass
+        npyv_u8 v0 = npyv_load_u8(ip1 + i);
+        npyv_u8 u0 = npyv_load_u8(ip2 + i);
+        npyv_u8 m0 = V_INTRIN(v0, u0);
+        npyv_store_u8(op1 + i, m0);
+    }
+    // Scalar tail: elements left after the last whole vector go through SCALAR_OP
+    for (; i < len; ++i) {
+        const npyv_lanetype_u8 in1 = ip1[i];
+        const npyv_lanetype_u8 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// Strided (non-contiguous) kernel; per the original note it targets float 32/64-bit memory access.
+#if 0 && !defined(NPY_HAVE_NEON)  // leading 0 makes this constant-false: never compiled here
+// NEON is excluded because unrolled scalars are faster than non-contiguous vector load/store on Arm
+static inline void
+simd_binary_max_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1,
+                           const npyv_lanetype_u8 *ip2, npy_intp sip2,
+                                 npyv_lanetype_u8 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_u8;  // elements produced per vector pass
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {  // strides are added to typed pointers, so they count elements, not bytes
+        npyv_u8 a, b;
+        if (sip1 == 1) {  // unit stride: ordinary contiguous load
+            a = npyv_load_u8(ip1);
+        } else {
+            a = npyv_loadn_u8(ip1, sip1);  // otherwise the strided-load variant
+        }
+        if (sip2 == 1) {
+            b = npyv_load_u8(ip2);
+        } else {
+            b = npyv_loadn_u8(ip2, sip2);
+        }
+        npyv_u8 r = V_INTRIN(a, b);
+        if (sop1 == 1) {  // same unit-stride split for the store
+            npyv_store_u8(op1, r);
+        } else {
+            npyv_storen_u8(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {  // scalar tail, one element per pass
+        const npyv_lanetype_u8 a = *ip1;
+        const npyv_lanetype_u8 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_min_i  // scalar counterpart used by the tail loops of this section
+#if NPY_SIMD && (!0 || (0 && 0))  // (!0 || ...) is 1: section is built whenever NPY_SIMD is non-zero
+
+#if 0 && !0  // constant-false: selects the #else definitions
+    #define V_INTRIN npyv_minn_u8 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minn_u8
+#else
+    #define V_INTRIN npyv_min_u8  // element-wise vector op used by the kernels below
+    #define V_REDUCE_INTRIN npyv_reduce_min_u8  // vector-to-scalar step of the reduce kernel
+#endif
+
+// Contiguous reduction: folds `len` elements at `ip` into op1[0], which also seeds the accumulator.
+static inline void
+simd_reduce_c_min_u8(const npyv_lanetype_u8 *ip, npyv_lanetype_u8 *op1, npy_intp len)
+{
+    if (len < 1) {  // nothing to fold; op1[0] is left untouched
+        return;
+    }
+    const int vstep = npyv_nlanes_u8;  // elements consumed per vector load
+    const int wstep = vstep*8;  // elements consumed per 8-vector unrolled pass
+    npyv_u8 acc = npyv_setall_u8(op1[0]);  // accumulator built from the current op1[0]
+    for (; len >= wstep; len -= wstep, ip += wstep) {  // main loop: 8 vectors per pass
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);  // prefetch request aimed at the next 8-vector block
+    #endif
+        npyv_u8 v0 = npyv_load_u8(ip + vstep * 0);
+        npyv_u8 v1 = npyv_load_u8(ip + vstep * 1);
+        npyv_u8 v2 = npyv_load_u8(ip + vstep * 2);
+        npyv_u8 v3 = npyv_load_u8(ip + vstep * 3);
+
+        npyv_u8 v4 = npyv_load_u8(ip + vstep * 4);
+        npyv_u8 v5 = npyv_load_u8(ip + vstep * 5);
+        npyv_u8 v6 = npyv_load_u8(ip + vstep * 6);
+        npyv_u8 v7 = npyv_load_u8(ip + vstep * 7);
+
+        npyv_u8 r01 = V_INTRIN(v0, v1);  // first level of the pairwise combine tree
+        npyv_u8 r23 = V_INTRIN(v2, v3);
+        npyv_u8 r45 = V_INTRIN(v4, v5);
+        npyv_u8 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));  // finish the tree, merge into acc
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {  // leftover whole vectors, one per pass
+        acc = V_INTRIN(acc, npyv_load_u8(ip));
+    }
+    npyv_lanetype_u8 r = V_REDUCE_INTRIN(acc);  // collapse the vector accumulator to one scalar
+    // Scalar tail: fold in the elements that did not fill a whole vector
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_u8 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;  // write the combined result back
+}
+
+// Contiguous inputs and output: writes op(ip1[i], ip2[i]) to op1[i] for every i in [0, len).
+static inline void
+simd_binary_ccc_min_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2,
+                                     npyv_lanetype_u8 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // Note, 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // To avoid memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_u8;  // elements moved per vector load/store
+    int elemPerLoop = vectorsPerLoop * elemPerVector;  // elements handled per unrolled pass
+
+    npy_intp i = 0;  // index of the next unprocessed element
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {  // unrolled: all loads, then combines, then stores
+        npyv_u8 v0 = npyv_load_u8(&ip1[i + 0 * elemPerVector]);
+        npyv_u8 v1 = npyv_load_u8(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128  // vectors 2..5 exist only in the 6x-unrolled variant
+        npyv_u8 v2 = npyv_load_u8(&ip1[i + 2 * elemPerVector]);
+        npyv_u8 v3 = npyv_load_u8(&ip1[i + 3 * elemPerVector]);
+        npyv_u8 v4 = npyv_load_u8(&ip1[i + 4 * elemPerVector]);
+        npyv_u8 v5 = npyv_load_u8(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_u8 u0 = npyv_load_u8(&ip2[i + 0 * elemPerVector]);
+        npyv_u8 u1 = npyv_load_u8(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u8 u2 = npyv_load_u8(&ip2[i + 2 * elemPerVector]);
+        npyv_u8 u3 = npyv_load_u8(&ip2[i + 3 * elemPerVector]);
+        npyv_u8 u4 = npyv_load_u8(&ip2[i + 4 * elemPerVector]);
+        npyv_u8 u5 = npyv_load_u8(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_u8 m0 = V_INTRIN(v0, u0);
+        npyv_u8 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u8 m2 = V_INTRIN(v2, u2);
+        npyv_u8 m3 = V_INTRIN(v3, u3);
+        npyv_u8 m4 = V_INTRIN(v4, u4);
+        npyv_u8 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_u8(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_u8(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_u8(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_u8(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_u8(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_u8(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {  // leftover whole vectors, one per pass
+        npyv_u8 v0 = npyv_load_u8(ip1 + i);
+        npyv_u8 u0 = npyv_load_u8(ip2 + i);
+        npyv_u8 m0 = V_INTRIN(v0, u0);
+        npyv_store_u8(op1 + i, m0);
+    }
+    // Scalar tail: elements left after the last whole vector go through SCALAR_OP
+    for (; i < len; ++i) {
+        const npyv_lanetype_u8 in1 = ip1[i];
+        const npyv_lanetype_u8 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// Strided (non-contiguous) kernel; per the original note it targets float 32/64-bit memory access.
+#if 0 && !defined(NPY_HAVE_NEON)  // leading 0 makes this constant-false: never compiled here
+// NEON is excluded because unrolled scalars are faster than non-contiguous vector load/store on Arm
+static inline void
+simd_binary_min_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1,
+                           const npyv_lanetype_u8 *ip2, npy_intp sip2,
+                                 npyv_lanetype_u8 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_u8;  // elements produced per vector pass
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {  // strides are added to typed pointers, so they count elements, not bytes
+        npyv_u8 a, b;
+        if (sip1 == 1) {  // unit stride: ordinary contiguous load
+            a = npyv_load_u8(ip1);
+        } else {
+            a = npyv_loadn_u8(ip1, sip1);  // otherwise the strided-load variant
+        }
+        if (sip2 == 1) {
+            b = npyv_load_u8(ip2);
+        } else {
+            b = npyv_loadn_u8(ip2, sip2);
+        }
+        npyv_u8 r = V_INTRIN(a, b);
+        if (sop1 == 1) {  // same unit-stride split for the store
+            npyv_store_u8(op1, r);
+        } else {
+            npyv_storen_u8(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {  // scalar tail, one element per pass
+        const npyv_lanetype_u8 a = *ip1;
+        const npyv_lanetype_u8 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_maxp_i  // scalar counterpart used by the tail loops of this section
+#if NPY_SIMD && (!1 || (0 && 1))  // (!1 || (0 && 1)) is 0: this whole section is compiled out
+
+#if 0 && !1  // constant-false: selects the #else definitions
+    #define V_INTRIN npyv_maxpn_u8 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxpn_u8
+#else
+    #define V_INTRIN npyv_maxp_u8  // element-wise vector op used by the kernels below
+    #define V_REDUCE_INTRIN npyv_reduce_maxp_u8  // vector-to-scalar step of the reduce kernel
+#endif
+
+// Contiguous reduction: folds `len` elements at `ip` into op1[0], which also seeds the accumulator.
+static inline void
+simd_reduce_c_maxp_u8(const npyv_lanetype_u8 *ip, npyv_lanetype_u8 *op1, npy_intp len)
+{
+    if (len < 1) {  // nothing to fold; op1[0] is left untouched
+        return;
+    }
+    const int vstep = npyv_nlanes_u8;  // elements consumed per vector load
+    const int wstep = vstep*8;  // elements consumed per 8-vector unrolled pass
+    npyv_u8 acc = npyv_setall_u8(op1[0]);  // accumulator built from the current op1[0]
+    for (; len >= wstep; len -= wstep, ip += wstep) {  // main loop: 8 vectors per pass
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);  // prefetch request aimed at the next 8-vector block
+    #endif
+        npyv_u8 v0 = npyv_load_u8(ip + vstep * 0);
+        npyv_u8 v1 = npyv_load_u8(ip + vstep * 1);
+        npyv_u8 v2 = npyv_load_u8(ip + vstep * 2);
+        npyv_u8 v3 = npyv_load_u8(ip + vstep * 3);
+
+        npyv_u8 v4 = npyv_load_u8(ip + vstep * 4);
+        npyv_u8 v5 = npyv_load_u8(ip + vstep * 5);
+        npyv_u8 v6 = npyv_load_u8(ip + vstep * 6);
+        npyv_u8 v7 = npyv_load_u8(ip + vstep * 7);
+
+        npyv_u8 r01 = V_INTRIN(v0, v1);  // first level of the pairwise combine tree
+        npyv_u8 r23 = V_INTRIN(v2, v3);
+        npyv_u8 r45 = V_INTRIN(v4, v5);
+        npyv_u8 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));  // finish the tree, merge into acc
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {  // leftover whole vectors, one per pass
+        acc = V_INTRIN(acc, npyv_load_u8(ip));
+    }
+    npyv_lanetype_u8 r = V_REDUCE_INTRIN(acc);  // collapse the vector accumulator to one scalar
+    // Scalar tail: fold in the elements that did not fill a whole vector
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_u8 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;  // write the combined result back
+}
+
+// Contiguous inputs and output: writes op(ip1[i], ip2[i]) to op1[i] for every i in [0, len).
+static inline void
+simd_binary_ccc_maxp_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2,
+                                     npyv_lanetype_u8 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // Note, 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // To avoid memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_u8;  // elements moved per vector load/store
+    int elemPerLoop = vectorsPerLoop * elemPerVector;  // elements handled per unrolled pass
+
+    npy_intp i = 0;  // index of the next unprocessed element
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {  // unrolled: all loads, then combines, then stores
+        npyv_u8 v0 = npyv_load_u8(&ip1[i + 0 * elemPerVector]);
+        npyv_u8 v1 = npyv_load_u8(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128  // vectors 2..5 exist only in the 6x-unrolled variant
+        npyv_u8 v2 = npyv_load_u8(&ip1[i + 2 * elemPerVector]);
+        npyv_u8 v3 = npyv_load_u8(&ip1[i + 3 * elemPerVector]);
+        npyv_u8 v4 = npyv_load_u8(&ip1[i + 4 * elemPerVector]);
+        npyv_u8 v5 = npyv_load_u8(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_u8 u0 = npyv_load_u8(&ip2[i + 0 * elemPerVector]);
+        npyv_u8 u1 = npyv_load_u8(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u8 u2 = npyv_load_u8(&ip2[i + 2 * elemPerVector]);
+        npyv_u8 u3 = npyv_load_u8(&ip2[i + 3 * elemPerVector]);
+        npyv_u8 u4 = npyv_load_u8(&ip2[i + 4 * elemPerVector]);
+        npyv_u8 u5 = npyv_load_u8(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_u8 m0 = V_INTRIN(v0, u0);
+        npyv_u8 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u8 m2 = V_INTRIN(v2, u2);
+        npyv_u8 m3 = V_INTRIN(v3, u3);
+        npyv_u8 m4 = V_INTRIN(v4, u4);
+        npyv_u8 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_u8(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_u8(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_u8(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_u8(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_u8(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_u8(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {  // leftover whole vectors, one per pass
+        npyv_u8 v0 = npyv_load_u8(ip1 + i);
+        npyv_u8 u0 = npyv_load_u8(ip2 + i);
+        npyv_u8 m0 = V_INTRIN(v0, u0);
+        npyv_store_u8(op1 + i, m0);
+    }
+    // Scalar tail: elements left after the last whole vector go through SCALAR_OP
+    for (; i < len; ++i) {
+        const npyv_lanetype_u8 in1 = ip1[i];
+        const npyv_lanetype_u8 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// Strided (non-contiguous) kernel; per the original note it targets float 32/64-bit memory access.
+#if 0 && !defined(NPY_HAVE_NEON)  // leading 0 makes this constant-false: never compiled here
+// NEON is excluded because unrolled scalars are faster than non-contiguous vector load/store on Arm
+static inline void
+simd_binary_maxp_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1,
+                           const npyv_lanetype_u8 *ip2, npy_intp sip2,
+                                 npyv_lanetype_u8 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_u8;  // elements produced per vector pass
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {  // strides are added to typed pointers, so they count elements, not bytes
+        npyv_u8 a, b;
+        if (sip1 == 1) {  // unit stride: ordinary contiguous load
+            a = npyv_load_u8(ip1);
+        } else {
+            a = npyv_loadn_u8(ip1, sip1);  // otherwise the strided-load variant
+        }
+        if (sip2 == 1) {
+            b = npyv_load_u8(ip2);
+        } else {
+            b = npyv_loadn_u8(ip2, sip2);
+        }
+        npyv_u8 r = V_INTRIN(a, b);
+        if (sop1 == 1) {  // same unit-stride split for the store
+            npyv_store_u8(op1, r);
+        } else {
+            npyv_storen_u8(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {  // scalar tail, one element per pass
+        const npyv_lanetype_u8 a = *ip1;
+        const npyv_lanetype_u8 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_minp_i  // scalar counterpart used by the tail loops of this section
+#if NPY_SIMD && (!1 || (0 && 1))  // (!1 || (0 && 1)) is 0: this whole section is compiled out
+
+#if 0 && !1  // constant-false: selects the #else definitions
+    #define V_INTRIN npyv_minpn_u8 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minpn_u8
+#else
+    #define V_INTRIN npyv_minp_u8  // element-wise vector op used by the kernels below
+    #define V_REDUCE_INTRIN npyv_reduce_minp_u8  // vector-to-scalar step of the reduce kernel
+#endif
+
+// Contiguous reduction: folds `len` elements at `ip` into op1[0], which also seeds the accumulator.
+static inline void
+simd_reduce_c_minp_u8(const npyv_lanetype_u8 *ip, npyv_lanetype_u8 *op1, npy_intp len)
+{
+    if (len < 1) {  // nothing to fold; op1[0] is left untouched
+        return;
+    }
+    const int vstep = npyv_nlanes_u8;  // elements consumed per vector load
+    const int wstep = vstep*8;  // elements consumed per 8-vector unrolled pass
+    npyv_u8 acc = npyv_setall_u8(op1[0]);  // accumulator built from the current op1[0]
+    for (; len >= wstep; len -= wstep, ip += wstep) {  // main loop: 8 vectors per pass
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);  // prefetch request aimed at the next 8-vector block
+    #endif
+        npyv_u8 v0 = npyv_load_u8(ip + vstep * 0);
+        npyv_u8 v1 = npyv_load_u8(ip + vstep * 1);
+        npyv_u8 v2 = npyv_load_u8(ip + vstep * 2);
+        npyv_u8 v3 = npyv_load_u8(ip + vstep * 3);
+
+        npyv_u8 v4 = npyv_load_u8(ip + vstep * 4);
+        npyv_u8 v5 = npyv_load_u8(ip + vstep * 5);
+        npyv_u8 v6 = npyv_load_u8(ip + vstep * 6);
+        npyv_u8 v7 = npyv_load_u8(ip + vstep * 7);
+
+        npyv_u8 r01 = V_INTRIN(v0, v1);  // first level of the pairwise combine tree
+        npyv_u8 r23 = V_INTRIN(v2, v3);
+        npyv_u8 r45 = V_INTRIN(v4, v5);
+        npyv_u8 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));  // finish the tree, merge into acc
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {  // leftover whole vectors, one per pass
+        acc = V_INTRIN(acc, npyv_load_u8(ip));
+    }
+    npyv_lanetype_u8 r = V_REDUCE_INTRIN(acc);  // collapse the vector accumulator to one scalar
+    // Scalar tail: fold in the elements that did not fill a whole vector
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_u8 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;  // write the combined result back
+}
+
+// Contiguous inputs and output: writes op(ip1[i], ip2[i]) to op1[i] for every i in [0, len).
+static inline void
+simd_binary_ccc_minp_u8(const npyv_lanetype_u8 *ip1, const npyv_lanetype_u8 *ip2,
+                                     npyv_lanetype_u8 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // Note, 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // To avoid memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_u8;  // elements moved per vector load/store
+    int elemPerLoop = vectorsPerLoop * elemPerVector;  // elements handled per unrolled pass
+
+    npy_intp i = 0;  // index of the next unprocessed element
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {  // unrolled: all loads, then combines, then stores
+        npyv_u8 v0 = npyv_load_u8(&ip1[i + 0 * elemPerVector]);
+        npyv_u8 v1 = npyv_load_u8(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128  // vectors 2..5 exist only in the 6x-unrolled variant
+        npyv_u8 v2 = npyv_load_u8(&ip1[i + 2 * elemPerVector]);
+        npyv_u8 v3 = npyv_load_u8(&ip1[i + 3 * elemPerVector]);
+        npyv_u8 v4 = npyv_load_u8(&ip1[i + 4 * elemPerVector]);
+        npyv_u8 v5 = npyv_load_u8(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_u8 u0 = npyv_load_u8(&ip2[i + 0 * elemPerVector]);
+        npyv_u8 u1 = npyv_load_u8(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u8 u2 = npyv_load_u8(&ip2[i + 2 * elemPerVector]);
+        npyv_u8 u3 = npyv_load_u8(&ip2[i + 3 * elemPerVector]);
+        npyv_u8 u4 = npyv_load_u8(&ip2[i + 4 * elemPerVector]);
+        npyv_u8 u5 = npyv_load_u8(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_u8 m0 = V_INTRIN(v0, u0);
+        npyv_u8 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u8 m2 = V_INTRIN(v2, u2);
+        npyv_u8 m3 = V_INTRIN(v3, u3);
+        npyv_u8 m4 = V_INTRIN(v4, u4);
+        npyv_u8 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_u8(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_u8(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_u8(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_u8(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_u8(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_u8(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {  // leftover whole vectors, one per pass
+        npyv_u8 v0 = npyv_load_u8(ip1 + i);
+        npyv_u8 u0 = npyv_load_u8(ip2 + i);
+        npyv_u8 m0 = V_INTRIN(v0, u0);
+        npyv_store_u8(op1 + i, m0);
+    }
+    // Scalar tail: elements left after the last whole vector go through SCALAR_OP
+    for (; i < len; ++i) {
+        const npyv_lanetype_u8 in1 = ip1[i];
+        const npyv_lanetype_u8 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
+static inline void
+simd_binary_minp_u8(const npyv_lanetype_u8 *ip1, npy_intp sip1,
+                    const npyv_lanetype_u8 *ip2, npy_intp sip2,
+                    npyv_lanetype_u8 *op1, npy_intp sop1, npy_intp len)
+{
+    // Strided element-wise op: op1[k*sop1] = op(ip1[k*sip1], ip2[k*sip2])
+    // for k in [0, len). Strides are counted in elements, not bytes.
+    const int lanes = npyv_nlanes_u8;
+    // Vector body: one full vector per pass. A unit stride takes the plain
+    // load/store, any other stride the strided (loadn/storen) variant.
+    while (len >= lanes) {
+        const npyv_u8 lhs = (sip1 == 1) ? npyv_load_u8(ip1)
+                                        : npyv_loadn_u8(ip1, sip1);
+        const npyv_u8 rhs = (sip2 == 1) ? npyv_load_u8(ip2)
+                                        : npyv_loadn_u8(ip2, sip2);
+        const npyv_u8 res = V_INTRIN(lhs, rhs);
+        if (sop1 != 1) {
+            npyv_storen_u8(op1, sop1, res);
+        }
+        else {
+            npyv_store_u8(op1, res);
+        }
+        len -= lanes;
+        ip1 += sip1 * lanes;
+        ip2 += sip2 * lanes;
+        op1 += sop1 * lanes;
+    }
+    // Scalar tail for the elements that do not fill a vector.
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {
+        const npyv_lanetype_u8 lhs = *ip1;
+        const npyv_lanetype_u8 rhs = *ip2;
+        *op1 = SCALAR_OP(lhs, rhs);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+
+#line 106
+#line 110
+#define SCALAR_OP scalar_max_i
+#if NPY_SIMD && (!0 || (0 && 0))
+
+#if 0 && !0
+    #define V_INTRIN npyv_maxn_s16 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxn_s16
+#else
+    #define V_INTRIN npyv_max_s16
+    #define V_REDUCE_INTRIN npyv_reduce_max_s16
+#endif
+
+// contiguous input.
+static inline void
+simd_reduce_c_max_s16(const npyv_lanetype_s16 *ip, npyv_lanetype_s16 *op1, npy_intp len)
+{
+    // Fold len contiguous elements at ip into op1[0], seeded by its old value.
+    if (len < 1) {
+        return;
+    }
+    const int lanes = npyv_nlanes_s16;
+    const int block = lanes*8;
+    npyv_s16 best = npyv_setall_s16(op1[0]);
+    // Main body: eight vectors per pass, combined as a pairwise tree.
+    while (len >= block) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + block, 0, 3);
+    #endif
+        const npyv_s16 a0 = npyv_load_s16(ip + lanes * 0);
+        const npyv_s16 a1 = npyv_load_s16(ip + lanes * 1);
+        const npyv_s16 a2 = npyv_load_s16(ip + lanes * 2);
+        const npyv_s16 a3 = npyv_load_s16(ip + lanes * 3);
+        const npyv_s16 a4 = npyv_load_s16(ip + lanes * 4);
+        const npyv_s16 a5 = npyv_load_s16(ip + lanes * 5);
+        const npyv_s16 a6 = npyv_load_s16(ip + lanes * 6);
+        const npyv_s16 a7 = npyv_load_s16(ip + lanes * 7);
+        const npyv_s16 lo = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
+        const npyv_s16 hi = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
+        best = V_INTRIN(best, V_INTRIN(lo, hi));
+        len -= block;
+        ip += block;
+    }
+    // Then single vectors, a lane collapse, and finally the scalar leftovers.
+    for (; len >= lanes; len -= lanes, ip += lanes) {
+        best = V_INTRIN(best, npyv_load_s16(ip));
+    }
+    npyv_lanetype_s16 out = V_REDUCE_INTRIN(best);
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_s16 cur = *ip;
+        out = SCALAR_OP(out, cur);
+    }
+    op1[0] = out;
+}
+
+// contiguous inputs and output.
+static inline void
+simd_binary_ccc_max_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *ip2,
+                        npyv_lanetype_s16 *op1, npy_intp len)
+{
+    // Contiguous element-wise op: op1[k] = op(ip1[k], ip2[k]) for k in [0, len).
+    // Unroll factor: 6 vectors on 128-bit SIMD (chosen for best results on
+    // Apple M1), otherwise 2 to avoid a memory bandwidth bottleneck.
+#if NPY_SIMD_WIDTH == 128
+    const int unroll = 6;
+#else
+    const int unroll = 2;
+#endif
+    const int lanes = npyv_nlanes_s16;
+    const int chunk = unroll * lanes;
+    npy_intp pos = 0;
+    // Unrolled body: every load is issued before the first store.
+    while (pos + chunk <= len) {
+        const npyv_s16 x0 = npyv_load_s16(ip1 + pos + 0 * lanes);
+        const npyv_s16 x1 = npyv_load_s16(ip1 + pos + 1 * lanes);
+        const npyv_s16 y0 = npyv_load_s16(ip2 + pos + 0 * lanes);
+        const npyv_s16 y1 = npyv_load_s16(ip2 + pos + 1 * lanes);
+        const npyv_s16 r0 = V_INTRIN(x0, y0);
+        const npyv_s16 r1 = V_INTRIN(x1, y1);
+    #if NPY_SIMD_WIDTH == 128
+        const npyv_s16 x2 = npyv_load_s16(ip1 + pos + 2 * lanes);
+        const npyv_s16 x3 = npyv_load_s16(ip1 + pos + 3 * lanes);
+        const npyv_s16 x4 = npyv_load_s16(ip1 + pos + 4 * lanes);
+        const npyv_s16 x5 = npyv_load_s16(ip1 + pos + 5 * lanes);
+        const npyv_s16 y2 = npyv_load_s16(ip2 + pos + 2 * lanes);
+        const npyv_s16 y3 = npyv_load_s16(ip2 + pos + 3 * lanes);
+        const npyv_s16 y4 = npyv_load_s16(ip2 + pos + 4 * lanes);
+        const npyv_s16 y5 = npyv_load_s16(ip2 + pos + 5 * lanes);
+        const npyv_s16 r2 = V_INTRIN(x2, y2);
+        const npyv_s16 r3 = V_INTRIN(x3, y3);
+        const npyv_s16 r4 = V_INTRIN(x4, y4);
+        const npyv_s16 r5 = V_INTRIN(x5, y5);
+    #endif
+        npyv_store_s16(op1 + pos + 0 * lanes, r0);
+        npyv_store_s16(op1 + pos + 1 * lanes, r1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_s16(op1 + pos + 2 * lanes, r2);
+        npyv_store_s16(op1 + pos + 3 * lanes, r3);
+        npyv_store_s16(op1 + pos + 4 * lanes, r4);
+        npyv_store_s16(op1 + pos + 5 * lanes, r5);
+    #endif
+        pos += chunk;
+    }
+    // One vector per pass while a full vector remains.
+    while (pos + lanes <= len) {
+        const npyv_s16 x = npyv_load_s16(ip1 + pos);
+        const npyv_s16 y = npyv_load_s16(ip2 + pos);
+        const npyv_s16 r = V_INTRIN(x, y);
+        npyv_store_s16(op1 + pos, r);
+        pos += lanes;
+    }
+    // Scalar tail for whatever is left.
+    while (pos < len) {
+        const npyv_lanetype_s16 a = ip1[pos];
+        const npyv_lanetype_s16 b = ip2[pos];
+        op1[pos] = SCALAR_OP(a, b);
+        ++pos;
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
+static inline void
+simd_binary_max_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1,
+                    const npyv_lanetype_s16 *ip2, npy_intp sip2,
+                    npyv_lanetype_s16 *op1, npy_intp sop1, npy_intp len)
+{
+    // Strided element-wise op: op1[k*sop1] = op(ip1[k*sip1], ip2[k*sip2])
+    // for k in [0, len). Strides are counted in elements, not bytes.
+    const int lanes = npyv_nlanes_s16;
+    // Vector body: one full vector per pass. A unit stride takes the plain
+    // load/store, any other stride the strided (loadn/storen) variant.
+    while (len >= lanes) {
+        const npyv_s16 lhs = (sip1 == 1) ? npyv_load_s16(ip1)
+                                         : npyv_loadn_s16(ip1, sip1);
+        const npyv_s16 rhs = (sip2 == 1) ? npyv_load_s16(ip2)
+                                         : npyv_loadn_s16(ip2, sip2);
+        const npyv_s16 res = V_INTRIN(lhs, rhs);
+        if (sop1 != 1) {
+            npyv_storen_s16(op1, sop1, res);
+        }
+        else {
+            npyv_store_s16(op1, res);
+        }
+        len -= lanes;
+        ip1 += sip1 * lanes;
+        ip2 += sip2 * lanes;
+        op1 += sop1 * lanes;
+    }
+    // Scalar tail for the elements that do not fill a vector.
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {
+        const npyv_lanetype_s16 lhs = *ip1;
+        const npyv_lanetype_s16 rhs = *ip2;
+        *op1 = SCALAR_OP(lhs, rhs);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_min_i
+#if NPY_SIMD && (!0 || (0 && 0))
+
+#if 0 && !0
+    #define V_INTRIN npyv_minn_s16 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minn_s16
+#else
+    #define V_INTRIN npyv_min_s16
+    #define V_REDUCE_INTRIN npyv_reduce_min_s16
+#endif
+
+// contiguous input.
+static inline void
+simd_reduce_c_min_s16(const npyv_lanetype_s16 *ip, npyv_lanetype_s16 *op1, npy_intp len)
+{
+    // Fold len contiguous elements at ip into op1[0], seeded by its old value.
+    if (len < 1) {
+        return;
+    }
+    const int lanes = npyv_nlanes_s16;
+    const int block = lanes*8;
+    npyv_s16 best = npyv_setall_s16(op1[0]);
+    // Main body: eight vectors per pass, combined as a pairwise tree.
+    while (len >= block) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + block, 0, 3);
+    #endif
+        const npyv_s16 a0 = npyv_load_s16(ip + lanes * 0);
+        const npyv_s16 a1 = npyv_load_s16(ip + lanes * 1);
+        const npyv_s16 a2 = npyv_load_s16(ip + lanes * 2);
+        const npyv_s16 a3 = npyv_load_s16(ip + lanes * 3);
+        const npyv_s16 a4 = npyv_load_s16(ip + lanes * 4);
+        const npyv_s16 a5 = npyv_load_s16(ip + lanes * 5);
+        const npyv_s16 a6 = npyv_load_s16(ip + lanes * 6);
+        const npyv_s16 a7 = npyv_load_s16(ip + lanes * 7);
+        const npyv_s16 lo = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
+        const npyv_s16 hi = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
+        best = V_INTRIN(best, V_INTRIN(lo, hi));
+        len -= block;
+        ip += block;
+    }
+    // Then single vectors, a lane collapse, and finally the scalar leftovers.
+    for (; len >= lanes; len -= lanes, ip += lanes) {
+        best = V_INTRIN(best, npyv_load_s16(ip));
+    }
+    npyv_lanetype_s16 out = V_REDUCE_INTRIN(best);
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_s16 cur = *ip;
+        out = SCALAR_OP(out, cur);
+    }
+    op1[0] = out;
+}
+
+// contiguous inputs and output.
+static inline void
+simd_binary_ccc_min_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *ip2,
+                        npyv_lanetype_s16 *op1, npy_intp len)
+{
+    // Contiguous element-wise op: op1[k] = op(ip1[k], ip2[k]) for k in [0, len).
+    // Unroll factor: 6 vectors on 128-bit SIMD (chosen for best results on
+    // Apple M1), otherwise 2 to avoid a memory bandwidth bottleneck.
+#if NPY_SIMD_WIDTH == 128
+    const int unroll = 6;
+#else
+    const int unroll = 2;
+#endif
+    const int lanes = npyv_nlanes_s16;
+    const int chunk = unroll * lanes;
+    npy_intp pos = 0;
+    // Unrolled body: every load is issued before the first store.
+    while (pos + chunk <= len) {
+        const npyv_s16 x0 = npyv_load_s16(ip1 + pos + 0 * lanes);
+        const npyv_s16 x1 = npyv_load_s16(ip1 + pos + 1 * lanes);
+        const npyv_s16 y0 = npyv_load_s16(ip2 + pos + 0 * lanes);
+        const npyv_s16 y1 = npyv_load_s16(ip2 + pos + 1 * lanes);
+        const npyv_s16 r0 = V_INTRIN(x0, y0);
+        const npyv_s16 r1 = V_INTRIN(x1, y1);
+    #if NPY_SIMD_WIDTH == 128
+        const npyv_s16 x2 = npyv_load_s16(ip1 + pos + 2 * lanes);
+        const npyv_s16 x3 = npyv_load_s16(ip1 + pos + 3 * lanes);
+        const npyv_s16 x4 = npyv_load_s16(ip1 + pos + 4 * lanes);
+        const npyv_s16 x5 = npyv_load_s16(ip1 + pos + 5 * lanes);
+        const npyv_s16 y2 = npyv_load_s16(ip2 + pos + 2 * lanes);
+        const npyv_s16 y3 = npyv_load_s16(ip2 + pos + 3 * lanes);
+        const npyv_s16 y4 = npyv_load_s16(ip2 + pos + 4 * lanes);
+        const npyv_s16 y5 = npyv_load_s16(ip2 + pos + 5 * lanes);
+        const npyv_s16 r2 = V_INTRIN(x2, y2);
+        const npyv_s16 r3 = V_INTRIN(x3, y3);
+        const npyv_s16 r4 = V_INTRIN(x4, y4);
+        const npyv_s16 r5 = V_INTRIN(x5, y5);
+    #endif
+        npyv_store_s16(op1 + pos + 0 * lanes, r0);
+        npyv_store_s16(op1 + pos + 1 * lanes, r1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_s16(op1 + pos + 2 * lanes, r2);
+        npyv_store_s16(op1 + pos + 3 * lanes, r3);
+        npyv_store_s16(op1 + pos + 4 * lanes, r4);
+        npyv_store_s16(op1 + pos + 5 * lanes, r5);
+    #endif
+        pos += chunk;
+    }
+    // One vector per pass while a full vector remains.
+    while (pos + lanes <= len) {
+        const npyv_s16 x = npyv_load_s16(ip1 + pos);
+        const npyv_s16 y = npyv_load_s16(ip2 + pos);
+        const npyv_s16 r = V_INTRIN(x, y);
+        npyv_store_s16(op1 + pos, r);
+        pos += lanes;
+    }
+    // Scalar tail for whatever is left.
+    while (pos < len) {
+        const npyv_lanetype_s16 a = ip1[pos];
+        const npyv_lanetype_s16 b = ip2[pos];
+        op1[pos] = SCALAR_OP(a, b);
+        ++pos;
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
+static inline void
+simd_binary_min_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1,
+                    const npyv_lanetype_s16 *ip2, npy_intp sip2,
+                    npyv_lanetype_s16 *op1, npy_intp sop1, npy_intp len)
+{
+    // Strided element-wise op: op1[k*sop1] = op(ip1[k*sip1], ip2[k*sip2])
+    // for k in [0, len). Strides are counted in elements, not bytes.
+    const int lanes = npyv_nlanes_s16;
+    // Vector body: one full vector per pass. A unit stride takes the plain
+    // load/store, any other stride the strided (loadn/storen) variant.
+    while (len >= lanes) {
+        const npyv_s16 lhs = (sip1 == 1) ? npyv_load_s16(ip1)
+                                         : npyv_loadn_s16(ip1, sip1);
+        const npyv_s16 rhs = (sip2 == 1) ? npyv_load_s16(ip2)
+                                         : npyv_loadn_s16(ip2, sip2);
+        const npyv_s16 res = V_INTRIN(lhs, rhs);
+        if (sop1 != 1) {
+            npyv_storen_s16(op1, sop1, res);
+        }
+        else {
+            npyv_store_s16(op1, res);
+        }
+        len -= lanes;
+        ip1 += sip1 * lanes;
+        ip2 += sip2 * lanes;
+        op1 += sop1 * lanes;
+    }
+    // Scalar tail for the elements that do not fill a vector.
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {
+        const npyv_lanetype_s16 lhs = *ip1;
+        const npyv_lanetype_s16 rhs = *ip2;
+        *op1 = SCALAR_OP(lhs, rhs);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_maxp_i
+#if NPY_SIMD && (!1 || (0 && 1))
+
+#if 0 && !1
+    #define V_INTRIN npyv_maxpn_s16 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxpn_s16
+#else
+    #define V_INTRIN npyv_maxp_s16
+    #define V_REDUCE_INTRIN npyv_reduce_maxp_s16
+#endif
+
+// contiguous input.
+static inline void
+simd_reduce_c_maxp_s16(const npyv_lanetype_s16 *ip, npyv_lanetype_s16 *op1, npy_intp len)
+{
+    // Fold len contiguous elements at ip into op1[0], seeded by its old value.
+    if (len < 1) {
+        return;
+    }
+    const int lanes = npyv_nlanes_s16;
+    const int block = lanes*8;
+    npyv_s16 best = npyv_setall_s16(op1[0]);
+    // Main body: eight vectors per pass, combined as a pairwise tree.
+    while (len >= block) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + block, 0, 3);
+    #endif
+        const npyv_s16 a0 = npyv_load_s16(ip + lanes * 0);
+        const npyv_s16 a1 = npyv_load_s16(ip + lanes * 1);
+        const npyv_s16 a2 = npyv_load_s16(ip + lanes * 2);
+        const npyv_s16 a3 = npyv_load_s16(ip + lanes * 3);
+        const npyv_s16 a4 = npyv_load_s16(ip + lanes * 4);
+        const npyv_s16 a5 = npyv_load_s16(ip + lanes * 5);
+        const npyv_s16 a6 = npyv_load_s16(ip + lanes * 6);
+        const npyv_s16 a7 = npyv_load_s16(ip + lanes * 7);
+        const npyv_s16 lo = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
+        const npyv_s16 hi = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
+        best = V_INTRIN(best, V_INTRIN(lo, hi));
+        len -= block;
+        ip += block;
+    }
+    // Then single vectors, a lane collapse, and finally the scalar leftovers.
+    for (; len >= lanes; len -= lanes, ip += lanes) {
+        best = V_INTRIN(best, npyv_load_s16(ip));
+    }
+    npyv_lanetype_s16 out = V_REDUCE_INTRIN(best);
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_s16 cur = *ip;
+        out = SCALAR_OP(out, cur);
+    }
+    op1[0] = out;
+}
+
+// contiguous inputs and output.
+static inline void
+simd_binary_ccc_maxp_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *ip2,
+                         npyv_lanetype_s16 *op1, npy_intp len)
+{
+    // Contiguous element-wise op: op1[k] = op(ip1[k], ip2[k]) for k in [0, len).
+    // Unroll factor: 6 vectors on 128-bit SIMD (chosen for best results on
+    // Apple M1), otherwise 2 to avoid a memory bandwidth bottleneck.
+#if NPY_SIMD_WIDTH == 128
+    const int unroll = 6;
+#else
+    const int unroll = 2;
+#endif
+    const int lanes = npyv_nlanes_s16;
+    const int chunk = unroll * lanes;
+    npy_intp pos = 0;
+    // Unrolled body: every load is issued before the first store.
+    while (pos + chunk <= len) {
+        const npyv_s16 x0 = npyv_load_s16(ip1 + pos + 0 * lanes);
+        const npyv_s16 x1 = npyv_load_s16(ip1 + pos + 1 * lanes);
+        const npyv_s16 y0 = npyv_load_s16(ip2 + pos + 0 * lanes);
+        const npyv_s16 y1 = npyv_load_s16(ip2 + pos + 1 * lanes);
+        const npyv_s16 r0 = V_INTRIN(x0, y0);
+        const npyv_s16 r1 = V_INTRIN(x1, y1);
+    #if NPY_SIMD_WIDTH == 128
+        const npyv_s16 x2 = npyv_load_s16(ip1 + pos + 2 * lanes);
+        const npyv_s16 x3 = npyv_load_s16(ip1 + pos + 3 * lanes);
+        const npyv_s16 x4 = npyv_load_s16(ip1 + pos + 4 * lanes);
+        const npyv_s16 x5 = npyv_load_s16(ip1 + pos + 5 * lanes);
+        const npyv_s16 y2 = npyv_load_s16(ip2 + pos + 2 * lanes);
+        const npyv_s16 y3 = npyv_load_s16(ip2 + pos + 3 * lanes);
+        const npyv_s16 y4 = npyv_load_s16(ip2 + pos + 4 * lanes);
+        const npyv_s16 y5 = npyv_load_s16(ip2 + pos + 5 * lanes);
+        const npyv_s16 r2 = V_INTRIN(x2, y2);
+        const npyv_s16 r3 = V_INTRIN(x3, y3);
+        const npyv_s16 r4 = V_INTRIN(x4, y4);
+        const npyv_s16 r5 = V_INTRIN(x5, y5);
+    #endif
+        npyv_store_s16(op1 + pos + 0 * lanes, r0);
+        npyv_store_s16(op1 + pos + 1 * lanes, r1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_s16(op1 + pos + 2 * lanes, r2);
+        npyv_store_s16(op1 + pos + 3 * lanes, r3);
+        npyv_store_s16(op1 + pos + 4 * lanes, r4);
+        npyv_store_s16(op1 + pos + 5 * lanes, r5);
+    #endif
+        pos += chunk;
+    }
+    // One vector per pass while a full vector remains.
+    while (pos + lanes <= len) {
+        const npyv_s16 x = npyv_load_s16(ip1 + pos);
+        const npyv_s16 y = npyv_load_s16(ip2 + pos);
+        const npyv_s16 r = V_INTRIN(x, y);
+        npyv_store_s16(op1 + pos, r);
+        pos += lanes;
+    }
+    // Scalar tail for whatever is left.
+    while (pos < len) {
+        const npyv_lanetype_s16 a = ip1[pos];
+        const npyv_lanetype_s16 b = ip2[pos];
+        op1[pos] = SCALAR_OP(a, b);
+        ++pos;
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
+static inline void
+simd_binary_maxp_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1,
+                     const npyv_lanetype_s16 *ip2, npy_intp sip2,
+                     npyv_lanetype_s16 *op1, npy_intp sop1, npy_intp len)
+{
+    // Strided element-wise op: op1[k*sop1] = op(ip1[k*sip1], ip2[k*sip2])
+    // for k in [0, len). Strides are counted in elements, not bytes.
+    const int lanes = npyv_nlanes_s16;
+    // Vector body: one full vector per pass. A unit stride takes the plain
+    // load/store, any other stride the strided (loadn/storen) variant.
+    while (len >= lanes) {
+        const npyv_s16 lhs = (sip1 == 1) ? npyv_load_s16(ip1)
+                                         : npyv_loadn_s16(ip1, sip1);
+        const npyv_s16 rhs = (sip2 == 1) ? npyv_load_s16(ip2)
+                                         : npyv_loadn_s16(ip2, sip2);
+        const npyv_s16 res = V_INTRIN(lhs, rhs);
+        if (sop1 != 1) {
+            npyv_storen_s16(op1, sop1, res);
+        }
+        else {
+            npyv_store_s16(op1, res);
+        }
+        len -= lanes;
+        ip1 += sip1 * lanes;
+        ip2 += sip2 * lanes;
+        op1 += sop1 * lanes;
+    }
+    // Scalar tail for the elements that do not fill a vector.
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {
+        const npyv_lanetype_s16 lhs = *ip1;
+        const npyv_lanetype_s16 rhs = *ip2;
+        *op1 = SCALAR_OP(lhs, rhs);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_minp_i
+#if NPY_SIMD && (!1 || (0 && 1))
+
+#if 0 && !1
+    #define V_INTRIN npyv_minpn_s16 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minpn_s16
+#else
+    #define V_INTRIN npyv_minp_s16
+    #define V_REDUCE_INTRIN npyv_reduce_minp_s16
+#endif
+
+// contiguous input.
+static inline void
+simd_reduce_c_minp_s16(const npyv_lanetype_s16 *ip, npyv_lanetype_s16 *op1, npy_intp len)
+{
+    // Fold len contiguous elements at ip into op1[0], seeded by its old value.
+    if (len < 1) {
+        return;
+    }
+    const int lanes = npyv_nlanes_s16;
+    const int block = lanes*8;
+    npyv_s16 best = npyv_setall_s16(op1[0]);
+    // Main body: eight vectors per pass, combined as a pairwise tree.
+    while (len >= block) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + block, 0, 3);
+    #endif
+        const npyv_s16 a0 = npyv_load_s16(ip + lanes * 0);
+        const npyv_s16 a1 = npyv_load_s16(ip + lanes * 1);
+        const npyv_s16 a2 = npyv_load_s16(ip + lanes * 2);
+        const npyv_s16 a3 = npyv_load_s16(ip + lanes * 3);
+        const npyv_s16 a4 = npyv_load_s16(ip + lanes * 4);
+        const npyv_s16 a5 = npyv_load_s16(ip + lanes * 5);
+        const npyv_s16 a6 = npyv_load_s16(ip + lanes * 6);
+        const npyv_s16 a7 = npyv_load_s16(ip + lanes * 7);
+        const npyv_s16 lo = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
+        const npyv_s16 hi = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
+        best = V_INTRIN(best, V_INTRIN(lo, hi));
+        len -= block;
+        ip += block;
+    }
+    // Then single vectors, a lane collapse, and finally the scalar leftovers.
+    for (; len >= lanes; len -= lanes, ip += lanes) {
+        best = V_INTRIN(best, npyv_load_s16(ip));
+    }
+    npyv_lanetype_s16 out = V_REDUCE_INTRIN(best);
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_s16 cur = *ip;
+        out = SCALAR_OP(out, cur);
+    }
+    op1[0] = out;
+}
+
+// contiguous inputs and output.
+static inline void
+simd_binary_ccc_minp_s16(const npyv_lanetype_s16 *ip1, const npyv_lanetype_s16 *ip2,
+                         npyv_lanetype_s16 *op1, npy_intp len)
+{
+    // Contiguous element-wise op: op1[k] = op(ip1[k], ip2[k]) for k in [0, len).
+    // Unroll factor: 6 vectors on 128-bit SIMD (chosen for best results on
+    // Apple M1), otherwise 2 to avoid a memory bandwidth bottleneck.
+#if NPY_SIMD_WIDTH == 128
+    const int unroll = 6;
+#else
+    const int unroll = 2;
+#endif
+    const int lanes = npyv_nlanes_s16;
+    const int chunk = unroll * lanes;
+    npy_intp pos = 0;
+    // Unrolled body: every load is issued before the first store.
+    while (pos + chunk <= len) {
+        const npyv_s16 x0 = npyv_load_s16(ip1 + pos + 0 * lanes);
+        const npyv_s16 x1 = npyv_load_s16(ip1 + pos + 1 * lanes);
+        const npyv_s16 y0 = npyv_load_s16(ip2 + pos + 0 * lanes);
+        const npyv_s16 y1 = npyv_load_s16(ip2 + pos + 1 * lanes);
+        const npyv_s16 r0 = V_INTRIN(x0, y0);
+        const npyv_s16 r1 = V_INTRIN(x1, y1);
+    #if NPY_SIMD_WIDTH == 128
+        const npyv_s16 x2 = npyv_load_s16(ip1 + pos + 2 * lanes);
+        const npyv_s16 x3 = npyv_load_s16(ip1 + pos + 3 * lanes);
+        const npyv_s16 x4 = npyv_load_s16(ip1 + pos + 4 * lanes);
+        const npyv_s16 x5 = npyv_load_s16(ip1 + pos + 5 * lanes);
+        const npyv_s16 y2 = npyv_load_s16(ip2 + pos + 2 * lanes);
+        const npyv_s16 y3 = npyv_load_s16(ip2 + pos + 3 * lanes);
+        const npyv_s16 y4 = npyv_load_s16(ip2 + pos + 4 * lanes);
+        const npyv_s16 y5 = npyv_load_s16(ip2 + pos + 5 * lanes);
+        const npyv_s16 r2 = V_INTRIN(x2, y2);
+        const npyv_s16 r3 = V_INTRIN(x3, y3);
+        const npyv_s16 r4 = V_INTRIN(x4, y4);
+        const npyv_s16 r5 = V_INTRIN(x5, y5);
+    #endif
+        npyv_store_s16(op1 + pos + 0 * lanes, r0);
+        npyv_store_s16(op1 + pos + 1 * lanes, r1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_s16(op1 + pos + 2 * lanes, r2);
+        npyv_store_s16(op1 + pos + 3 * lanes, r3);
+        npyv_store_s16(op1 + pos + 4 * lanes, r4);
+        npyv_store_s16(op1 + pos + 5 * lanes, r5);
+    #endif
+        pos += chunk;
+    }
+    // One vector per pass while a full vector remains.
+    while (pos + lanes <= len) {
+        const npyv_s16 x = npyv_load_s16(ip1 + pos);
+        const npyv_s16 y = npyv_load_s16(ip2 + pos);
+        const npyv_s16 r = V_INTRIN(x, y);
+        npyv_store_s16(op1 + pos, r);
+        pos += lanes;
+    }
+    // Scalar tail for whatever is left.
+    while (pos < len) {
+        const npyv_lanetype_s16 a = ip1[pos];
+        const npyv_lanetype_s16 b = ip2[pos];
+        op1[pos] = SCALAR_OP(a, b);
+        ++pos;
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
+static inline void
+simd_binary_minp_s16(const npyv_lanetype_s16 *ip1, npy_intp sip1,
+                     const npyv_lanetype_s16 *ip2, npy_intp sip2,
+                     npyv_lanetype_s16 *op1, npy_intp sop1, npy_intp len)
+{
+    // Strided element-wise op: op1[k*sop1] = op(ip1[k*sip1], ip2[k*sip2])
+    // for k in [0, len). Strides are counted in elements, not bytes.
+    const int lanes = npyv_nlanes_s16;
+    // Vector body: one full vector per pass. A unit stride takes the plain
+    // load/store, any other stride the strided (loadn/storen) variant.
+    while (len >= lanes) {
+        const npyv_s16 lhs = (sip1 == 1) ? npyv_load_s16(ip1)
+                                         : npyv_loadn_s16(ip1, sip1);
+        const npyv_s16 rhs = (sip2 == 1) ? npyv_load_s16(ip2)
+                                         : npyv_loadn_s16(ip2, sip2);
+        const npyv_s16 res = V_INTRIN(lhs, rhs);
+        if (sop1 != 1) {
+            npyv_storen_s16(op1, sop1, res);
+        }
+        else {
+            npyv_store_s16(op1, res);
+        }
+        len -= lanes;
+        ip1 += sip1 * lanes;
+        ip2 += sip2 * lanes;
+        op1 += sop1 * lanes;
+    }
+    // Scalar tail for the elements that do not fill a vector.
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {
+        const npyv_lanetype_s16 lhs = *ip1;
+        const npyv_lanetype_s16 rhs = *ip2;
+        *op1 = SCALAR_OP(lhs, rhs);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+
+#line 106
+#line 110
+#define SCALAR_OP scalar_max_i
+#if NPY_SIMD && (!0 || (0 && 0))
+
+#if 0 && !0
+    #define V_INTRIN npyv_maxn_u16 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxn_u16
+#else
+    #define V_INTRIN npyv_max_u16
+    #define V_REDUCE_INTRIN npyv_reduce_max_u16
+#endif
+
+// contiguous input.
+static inline void
+simd_reduce_c_max_u16(const npyv_lanetype_u16 *ip, npyv_lanetype_u16 *op1, npy_intp len)
+{
+    // Fold len contiguous elements at ip into op1[0], seeded by its old value.
+    if (len < 1) {
+        return;
+    }
+    const int lanes = npyv_nlanes_u16;
+    const int block = lanes*8;
+    npyv_u16 best = npyv_setall_u16(op1[0]);
+    // Main body: eight vectors per pass, combined as a pairwise tree.
+    while (len >= block) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + block, 0, 3);
+    #endif
+        const npyv_u16 a0 = npyv_load_u16(ip + lanes * 0);
+        const npyv_u16 a1 = npyv_load_u16(ip + lanes * 1);
+        const npyv_u16 a2 = npyv_load_u16(ip + lanes * 2);
+        const npyv_u16 a3 = npyv_load_u16(ip + lanes * 3);
+        const npyv_u16 a4 = npyv_load_u16(ip + lanes * 4);
+        const npyv_u16 a5 = npyv_load_u16(ip + lanes * 5);
+        const npyv_u16 a6 = npyv_load_u16(ip + lanes * 6);
+        const npyv_u16 a7 = npyv_load_u16(ip + lanes * 7);
+        const npyv_u16 lo = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
+        const npyv_u16 hi = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
+        best = V_INTRIN(best, V_INTRIN(lo, hi));
+        len -= block;
+        ip += block;
+    }
+    // Then single vectors, a lane collapse, and finally the scalar leftovers.
+    for (; len >= lanes; len -= lanes, ip += lanes) {
+        best = V_INTRIN(best, npyv_load_u16(ip));
+    }
+    npyv_lanetype_u16 out = V_REDUCE_INTRIN(best);
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_u16 cur = *ip;
+        out = SCALAR_OP(out, cur);
+    }
+    op1[0] = out;
+}
+
+// contiguous inputs and output.
+static inline void
+simd_binary_ccc_max_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *ip2,
+                        npyv_lanetype_u16 *op1, npy_intp len)
+{
+    // Contiguous element-wise op: op1[k] = op(ip1[k], ip2[k]) for k in [0, len).
+    // Unroll factor: 6 vectors on 128-bit SIMD (chosen for best results on
+    // Apple M1), otherwise 2 to avoid a memory bandwidth bottleneck.
+#if NPY_SIMD_WIDTH == 128
+    const int unroll = 6;
+#else
+    const int unroll = 2;
+#endif
+    const int lanes = npyv_nlanes_u16;
+    const int chunk = unroll * lanes;
+    npy_intp pos = 0;
+    // Unrolled body: every load is issued before the first store.
+    while (pos + chunk <= len) {
+        const npyv_u16 x0 = npyv_load_u16(ip1 + pos + 0 * lanes);
+        const npyv_u16 x1 = npyv_load_u16(ip1 + pos + 1 * lanes);
+        const npyv_u16 y0 = npyv_load_u16(ip2 + pos + 0 * lanes);
+        const npyv_u16 y1 = npyv_load_u16(ip2 + pos + 1 * lanes);
+        const npyv_u16 r0 = V_INTRIN(x0, y0);
+        const npyv_u16 r1 = V_INTRIN(x1, y1);
+    #if NPY_SIMD_WIDTH == 128
+        const npyv_u16 x2 = npyv_load_u16(ip1 + pos + 2 * lanes);
+        const npyv_u16 x3 = npyv_load_u16(ip1 + pos + 3 * lanes);
+        const npyv_u16 x4 = npyv_load_u16(ip1 + pos + 4 * lanes);
+        const npyv_u16 x5 = npyv_load_u16(ip1 + pos + 5 * lanes);
+        const npyv_u16 y2 = npyv_load_u16(ip2 + pos + 2 * lanes);
+        const npyv_u16 y3 = npyv_load_u16(ip2 + pos + 3 * lanes);
+        const npyv_u16 y4 = npyv_load_u16(ip2 + pos + 4 * lanes);
+        const npyv_u16 y5 = npyv_load_u16(ip2 + pos + 5 * lanes);
+        const npyv_u16 r2 = V_INTRIN(x2, y2);
+        const npyv_u16 r3 = V_INTRIN(x3, y3);
+        const npyv_u16 r4 = V_INTRIN(x4, y4);
+        const npyv_u16 r5 = V_INTRIN(x5, y5);
+    #endif
+        npyv_store_u16(op1 + pos + 0 * lanes, r0);
+        npyv_store_u16(op1 + pos + 1 * lanes, r1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_u16(op1 + pos + 2 * lanes, r2);
+        npyv_store_u16(op1 + pos + 3 * lanes, r3);
+        npyv_store_u16(op1 + pos + 4 * lanes, r4);
+        npyv_store_u16(op1 + pos + 5 * lanes, r5);
+    #endif
+        pos += chunk;
+    }
+    // One vector per pass while a full vector remains.
+    while (pos + lanes <= len) {
+        const npyv_u16 x = npyv_load_u16(ip1 + pos);
+        const npyv_u16 y = npyv_load_u16(ip2 + pos);
+        const npyv_u16 r = V_INTRIN(x, y);
+        npyv_store_u16(op1 + pos, r);
+        pos += lanes;
+    }
+    // Scalar tail for whatever is left.
+    while (pos < len) {
+        const npyv_lanetype_u16 a = ip1[pos];
+        const npyv_lanetype_u16 b = ip2[pos];
+        op1[pos] = SCALAR_OP(a, b);
+        ++pos;
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// unroll scalars faster than non-contiguous vector load/store on Arm
+static inline void
+simd_binary_max_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1,
+                    const npyv_lanetype_u16 *ip2, npy_intp sip2,
+                    npyv_lanetype_u16 *op1, npy_intp sop1, npy_intp len)
+{
+    // Strided element-wise op: op1[k*sop1] = op(ip1[k*sip1], ip2[k*sip2])
+    // for k in [0, len). Strides are counted in elements, not bytes.
+    const int lanes = npyv_nlanes_u16;
+    // Vector body: one full vector per pass. A unit stride takes the plain
+    // load/store, any other stride the strided (loadn/storen) variant.
+    while (len >= lanes) {
+        const npyv_u16 lhs = (sip1 == 1) ? npyv_load_u16(ip1)
+                                         : npyv_loadn_u16(ip1, sip1);
+        const npyv_u16 rhs = (sip2 == 1) ? npyv_load_u16(ip2)
+                                         : npyv_loadn_u16(ip2, sip2);
+        const npyv_u16 res = V_INTRIN(lhs, rhs);
+        if (sop1 != 1) {
+            npyv_storen_u16(op1, sop1, res);
+        }
+        else {
+            npyv_store_u16(op1, res);
+        }
+        len -= lanes;
+        ip1 += sip1 * lanes;
+        ip2 += sip2 * lanes;
+        op1 += sop1 * lanes;
+    }
+    // Scalar tail for the elements that do not fill a vector.
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {
+        const npyv_lanetype_u16 lhs = *ip1;
+        const npyv_lanetype_u16 rhs = *ip2;
+        *op1 = SCALAR_OP(lhs, rhs);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_min_i
+#if NPY_SIMD && (!0 || (0 && 0))
+
+#if 0 && !0
+    #define V_INTRIN npyv_minn_u16 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minn_u16
+#else
+    #define V_INTRIN npyv_min_u16
+    #define V_REDUCE_INTRIN npyv_reduce_min_u16
+#endif
+
+// contiguous input.
+static inline void
+simd_reduce_c_min_u16(const npyv_lanetype_u16 *ip, npyv_lanetype_u16 *op1, npy_intp len)
+{
+    // Fold len contiguous elements at ip into op1[0], seeded by its old value.
+    if (len < 1) {
+        return;
+    }
+    const int lanes = npyv_nlanes_u16;
+    const int block = lanes*8;
+    npyv_u16 best = npyv_setall_u16(op1[0]);
+    // Main body: eight vectors per pass, combined as a pairwise tree.
+    while (len >= block) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + block, 0, 3);
+    #endif
+        const npyv_u16 a0 = npyv_load_u16(ip + lanes * 0);
+        const npyv_u16 a1 = npyv_load_u16(ip + lanes * 1);
+        const npyv_u16 a2 = npyv_load_u16(ip + lanes * 2);
+        const npyv_u16 a3 = npyv_load_u16(ip + lanes * 3);
+        const npyv_u16 a4 = npyv_load_u16(ip + lanes * 4);
+        const npyv_u16 a5 = npyv_load_u16(ip + lanes * 5);
+        const npyv_u16 a6 = npyv_load_u16(ip + lanes * 6);
+        const npyv_u16 a7 = npyv_load_u16(ip + lanes * 7);
+        const npyv_u16 lo = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
+        const npyv_u16 hi = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
+        best = V_INTRIN(best, V_INTRIN(lo, hi));
+        len -= block;
+        ip += block;
+    }
+    // Then single vectors, a lane collapse, and finally the scalar leftovers.
+    for (; len >= lanes; len -= lanes, ip += lanes) {
+        best = V_INTRIN(best, npyv_load_u16(ip));
+    }
+    npyv_lanetype_u16 out = V_REDUCE_INTRIN(best);
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_u16 cur = *ip;
+        out = SCALAR_OP(out, cur);
+    }
+    op1[0] = out;
+}
+
+// Element-wise operation on contiguous inputs and a contiguous output.
+static inline void
+simd_binary_ccc_min_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *ip2,
+                        npyv_lanetype_u16 *op1, npy_intp len)
+{
+    const int lanes = npyv_nlanes_u16;
+#if NPY_SIMD_WIDTH == 128
+    // 6x unrolling gave the best results on Apple M1.
+    const int unroll = 6;
+#else
+    // Small unroll to avoid a memory bandwidth bottleneck.
+    const int unroll = 2;
+#endif
+    const int chunk = unroll * lanes;
+    npy_intp pos = 0;
+    // Unrolled body: every load is issued before the first store.
+    while (pos + chunk <= len) {
+        const npyv_lanetype_u16 *a = ip1 + pos;
+        const npyv_lanetype_u16 *b = ip2 + pos;
+        npyv_lanetype_u16 *dst = op1 + pos;
+        npyv_u16 a0 = npyv_load_u16(a + 0 * lanes);
+        npyv_u16 a1 = npyv_load_u16(a + 1 * lanes);
+        npyv_u16 b0 = npyv_load_u16(b + 0 * lanes);
+        npyv_u16 b1 = npyv_load_u16(b + 1 * lanes);
+        npyv_u16 r0 = V_INTRIN(a0, b0);
+        npyv_u16 r1 = V_INTRIN(a1, b1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u16 a2 = npyv_load_u16(a + 2 * lanes);
+        npyv_u16 a3 = npyv_load_u16(a + 3 * lanes);
+        npyv_u16 a4 = npyv_load_u16(a + 4 * lanes);
+        npyv_u16 a5 = npyv_load_u16(a + 5 * lanes);
+        npyv_u16 b2 = npyv_load_u16(b + 2 * lanes);
+        npyv_u16 b3 = npyv_load_u16(b + 3 * lanes);
+        npyv_u16 b4 = npyv_load_u16(b + 4 * lanes);
+        npyv_u16 b5 = npyv_load_u16(b + 5 * lanes);
+        npyv_u16 r2 = V_INTRIN(a2, b2);
+        npyv_u16 r3 = V_INTRIN(a3, b3);
+        npyv_u16 r4 = V_INTRIN(a4, b4);
+        npyv_u16 r5 = V_INTRIN(a5, b5);
+    #endif
+        npyv_store_u16(dst + 0 * lanes, r0);
+        npyv_store_u16(dst + 1 * lanes, r1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_u16(dst + 2 * lanes, r2);
+        npyv_store_u16(dst + 3 * lanes, r3);
+        npyv_store_u16(dst + 4 * lanes, r4);
+        npyv_store_u16(dst + 5 * lanes, r5);
+    #endif
+        pos += chunk;
+    }
+    while (pos + lanes <= len) {
+        npyv_u16 a0 = npyv_load_u16(ip1 + pos);
+        npyv_u16 b0 = npyv_load_u16(ip2 + pos);
+        npyv_u16 r0 = V_INTRIN(a0, b0);
+        npyv_store_u16(op1 + pos, r0);
+        pos += lanes;
+    }
+    while (pos < len) {
+        const npyv_lanetype_u16 x = ip1[pos];
+        const npyv_lanetype_u16 y = ip2[pos];
+        op1[pos] = SCALAR_OP(x, y);
+        ++pos;
+    }
+}
+// Strided (non-contiguous) variant, meant for 32/64-bit float memory access.
+#if 0 && !defined(NPY_HAVE_NEON)
+// On Arm, unrolled scalars beat non-contiguous vector load/store.
+static inline void
+simd_binary_min_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1,
+                    const npyv_lanetype_u16 *ip2, npy_intp sip2,
+                    npyv_lanetype_u16 *op1, npy_intp sop1, npy_intp len)
+{
+    const int lanes = npyv_nlanes_u16;
+    for (; len >= lanes; len -= lanes) {
+        npyv_u16 lhs, rhs;
+        if (sip1 != 1) {
+            lhs = npyv_loadn_u16(ip1, sip1);
+        } else {
+            lhs = npyv_load_u16(ip1);
+        }
+        if (sip2 != 1) {
+            rhs = npyv_loadn_u16(ip2, sip2);
+        } else {
+            rhs = npyv_load_u16(ip2);
+        }
+        npyv_u16 out = V_INTRIN(lhs, rhs);
+        if (sop1 != 1) {
+            npyv_storen_u16(op1, sop1, out);
+        } else {
+            npyv_store_u16(op1, out);
+        }
+        ip1 += sip1*lanes;
+        ip2 += sip2*lanes;
+        op1 += sop1*lanes;
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {
+        const npyv_lanetype_u16 x = *ip1;
+        const npyv_lanetype_u16 y = *ip2;
+        *op1 = SCALAR_OP(x, y);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_maxp_i // scalar operation used by the tail loops below
+#if NPY_SIMD && (!1 || (0 && 1)) // constant false: this whole section is compiled out
+
+#if 0 && !1 // constant false: the #else definitions would be the ones in effect
+    #define V_INTRIN npyv_maxpn_u16 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxpn_u16
+#else
+    #define V_INTRIN npyv_maxp_u16 // element-wise vector operation
+    #define V_REDUCE_INTRIN npyv_reduce_maxp_u16 // collapses a vector to one lane value
+#endif
+
+// Reduce a contiguous input into op1[0]; the old op1[0] seeds the accumulator.
+static inline void
+simd_reduce_c_maxp_u16(const npyv_lanetype_u16 *ip, npyv_lanetype_u16 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return;
+    }
+    const int lanes = npyv_nlanes_u16;
+    const int block = lanes*8;
+    npyv_u16 vacc = npyv_setall_u16(op1[0]);
+    // Bulk: eight vectors per pass, merged pairwise before touching vacc.
+    while (len >= block) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + block, 0, 3);
+    #endif
+        npyv_u16 a0 = npyv_load_u16(ip + lanes * 0);
+        npyv_u16 a1 = npyv_load_u16(ip + lanes * 1);
+        npyv_u16 a2 = npyv_load_u16(ip + lanes * 2);
+        npyv_u16 a3 = npyv_load_u16(ip + lanes * 3);
+        npyv_u16 a4 = npyv_load_u16(ip + lanes * 4);
+        npyv_u16 a5 = npyv_load_u16(ip + lanes * 5);
+        npyv_u16 a6 = npyv_load_u16(ip + lanes * 6);
+        npyv_u16 a7 = npyv_load_u16(ip + lanes * 7);
+        npyv_u16 lo = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
+        npyv_u16 hi = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
+        vacc = V_INTRIN(vacc, V_INTRIN(lo, hi));
+        ip += block;
+        len -= block;
+    }
+    while (len >= lanes) {
+        vacc = V_INTRIN(vacc, npyv_load_u16(ip));
+        ip += lanes;
+        len -= lanes;
+    }
+    npyv_lanetype_u16 res = V_REDUCE_INTRIN(vacc);
+    for (npy_intp k = 0; k < len; ++k) {
+        const npyv_lanetype_u16 cur = ip[k];
+        res = SCALAR_OP(res, cur);
+    }
+    op1[0] = res;
+}
+
+// Element-wise operation on contiguous inputs and a contiguous output.
+static inline void
+simd_binary_ccc_maxp_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *ip2,
+                         npyv_lanetype_u16 *op1, npy_intp len)
+{
+    const int lanes = npyv_nlanes_u16;
+#if NPY_SIMD_WIDTH == 128
+    // 6x unrolling gave the best results on Apple M1.
+    const int unroll = 6;
+#else
+    // Small unroll to avoid a memory bandwidth bottleneck.
+    const int unroll = 2;
+#endif
+    const int chunk = unroll * lanes;
+    npy_intp pos = 0;
+    // Unrolled body: every load is issued before the first store.
+    while (pos + chunk <= len) {
+        const npyv_lanetype_u16 *a = ip1 + pos;
+        const npyv_lanetype_u16 *b = ip2 + pos;
+        npyv_lanetype_u16 *dst = op1 + pos;
+        npyv_u16 a0 = npyv_load_u16(a + 0 * lanes);
+        npyv_u16 a1 = npyv_load_u16(a + 1 * lanes);
+        npyv_u16 b0 = npyv_load_u16(b + 0 * lanes);
+        npyv_u16 b1 = npyv_load_u16(b + 1 * lanes);
+        npyv_u16 r0 = V_INTRIN(a0, b0);
+        npyv_u16 r1 = V_INTRIN(a1, b1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u16 a2 = npyv_load_u16(a + 2 * lanes);
+        npyv_u16 a3 = npyv_load_u16(a + 3 * lanes);
+        npyv_u16 a4 = npyv_load_u16(a + 4 * lanes);
+        npyv_u16 a5 = npyv_load_u16(a + 5 * lanes);
+        npyv_u16 b2 = npyv_load_u16(b + 2 * lanes);
+        npyv_u16 b3 = npyv_load_u16(b + 3 * lanes);
+        npyv_u16 b4 = npyv_load_u16(b + 4 * lanes);
+        npyv_u16 b5 = npyv_load_u16(b + 5 * lanes);
+        npyv_u16 r2 = V_INTRIN(a2, b2);
+        npyv_u16 r3 = V_INTRIN(a3, b3);
+        npyv_u16 r4 = V_INTRIN(a4, b4);
+        npyv_u16 r5 = V_INTRIN(a5, b5);
+    #endif
+        npyv_store_u16(dst + 0 * lanes, r0);
+        npyv_store_u16(dst + 1 * lanes, r1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_u16(dst + 2 * lanes, r2);
+        npyv_store_u16(dst + 3 * lanes, r3);
+        npyv_store_u16(dst + 4 * lanes, r4);
+        npyv_store_u16(dst + 5 * lanes, r5);
+    #endif
+        pos += chunk;
+    }
+    while (pos + lanes <= len) {
+        npyv_u16 a0 = npyv_load_u16(ip1 + pos);
+        npyv_u16 b0 = npyv_load_u16(ip2 + pos);
+        npyv_u16 r0 = V_INTRIN(a0, b0);
+        npyv_store_u16(op1 + pos, r0);
+        pos += lanes;
+    }
+    while (pos < len) {
+        const npyv_lanetype_u16 x = ip1[pos];
+        const npyv_lanetype_u16 y = ip2[pos];
+        op1[pos] = SCALAR_OP(x, y);
+        ++pos;
+    }
+}
+// Strided (non-contiguous) variant, meant for 32/64-bit float memory access.
+#if 0 && !defined(NPY_HAVE_NEON)
+// On Arm, unrolled scalars beat non-contiguous vector load/store.
+static inline void
+simd_binary_maxp_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1,
+                     const npyv_lanetype_u16 *ip2, npy_intp sip2,
+                     npyv_lanetype_u16 *op1, npy_intp sop1, npy_intp len)
+{
+    const int lanes = npyv_nlanes_u16;
+    for (; len >= lanes; len -= lanes) {
+        npyv_u16 lhs, rhs;
+        if (sip1 != 1) {
+            lhs = npyv_loadn_u16(ip1, sip1);
+        } else {
+            lhs = npyv_load_u16(ip1);
+        }
+        if (sip2 != 1) {
+            rhs = npyv_loadn_u16(ip2, sip2);
+        } else {
+            rhs = npyv_load_u16(ip2);
+        }
+        npyv_u16 out = V_INTRIN(lhs, rhs);
+        if (sop1 != 1) {
+            npyv_storen_u16(op1, sop1, out);
+        } else {
+            npyv_store_u16(op1, out);
+        }
+        ip1 += sip1*lanes;
+        ip2 += sip2*lanes;
+        op1 += sop1*lanes;
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {
+        const npyv_lanetype_u16 x = *ip1;
+        const npyv_lanetype_u16 y = *ip2;
+        *op1 = SCALAR_OP(x, y);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_minp_i // scalar operation used by the tail loops below
+#if NPY_SIMD && (!1 || (0 && 1)) // constant false: this whole section is compiled out
+
+#if 0 && !1 // constant false: the #else definitions would be the ones in effect
+    #define V_INTRIN npyv_minpn_u16 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minpn_u16
+#else
+    #define V_INTRIN npyv_minp_u16 // element-wise vector operation
+    #define V_REDUCE_INTRIN npyv_reduce_minp_u16 // collapses a vector to one lane value
+#endif
+
+// Reduce a contiguous input into op1[0]; the old op1[0] seeds the accumulator.
+static inline void
+simd_reduce_c_minp_u16(const npyv_lanetype_u16 *ip, npyv_lanetype_u16 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return;
+    }
+    const int lanes = npyv_nlanes_u16;
+    const int block = lanes*8;
+    npyv_u16 vacc = npyv_setall_u16(op1[0]);
+    // Bulk: eight vectors per pass, merged pairwise before touching vacc.
+    while (len >= block) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + block, 0, 3);
+    #endif
+        npyv_u16 a0 = npyv_load_u16(ip + lanes * 0);
+        npyv_u16 a1 = npyv_load_u16(ip + lanes * 1);
+        npyv_u16 a2 = npyv_load_u16(ip + lanes * 2);
+        npyv_u16 a3 = npyv_load_u16(ip + lanes * 3);
+        npyv_u16 a4 = npyv_load_u16(ip + lanes * 4);
+        npyv_u16 a5 = npyv_load_u16(ip + lanes * 5);
+        npyv_u16 a6 = npyv_load_u16(ip + lanes * 6);
+        npyv_u16 a7 = npyv_load_u16(ip + lanes * 7);
+        npyv_u16 lo = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
+        npyv_u16 hi = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
+        vacc = V_INTRIN(vacc, V_INTRIN(lo, hi));
+        ip += block;
+        len -= block;
+    }
+    while (len >= lanes) {
+        vacc = V_INTRIN(vacc, npyv_load_u16(ip));
+        ip += lanes;
+        len -= lanes;
+    }
+    npyv_lanetype_u16 res = V_REDUCE_INTRIN(vacc);
+    for (npy_intp k = 0; k < len; ++k) {
+        const npyv_lanetype_u16 cur = ip[k];
+        res = SCALAR_OP(res, cur);
+    }
+    op1[0] = res;
+}
+
+// Element-wise operation on contiguous inputs and a contiguous output.
+static inline void
+simd_binary_ccc_minp_u16(const npyv_lanetype_u16 *ip1, const npyv_lanetype_u16 *ip2,
+                         npyv_lanetype_u16 *op1, npy_intp len)
+{
+    const int lanes = npyv_nlanes_u16;
+#if NPY_SIMD_WIDTH == 128
+    // 6x unrolling gave the best results on Apple M1.
+    const int unroll = 6;
+#else
+    // Small unroll to avoid a memory bandwidth bottleneck.
+    const int unroll = 2;
+#endif
+    const int chunk = unroll * lanes;
+    npy_intp pos = 0;
+    // Unrolled body: every load is issued before the first store.
+    while (pos + chunk <= len) {
+        const npyv_lanetype_u16 *a = ip1 + pos;
+        const npyv_lanetype_u16 *b = ip2 + pos;
+        npyv_lanetype_u16 *dst = op1 + pos;
+        npyv_u16 a0 = npyv_load_u16(a + 0 * lanes);
+        npyv_u16 a1 = npyv_load_u16(a + 1 * lanes);
+        npyv_u16 b0 = npyv_load_u16(b + 0 * lanes);
+        npyv_u16 b1 = npyv_load_u16(b + 1 * lanes);
+        npyv_u16 r0 = V_INTRIN(a0, b0);
+        npyv_u16 r1 = V_INTRIN(a1, b1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u16 a2 = npyv_load_u16(a + 2 * lanes);
+        npyv_u16 a3 = npyv_load_u16(a + 3 * lanes);
+        npyv_u16 a4 = npyv_load_u16(a + 4 * lanes);
+        npyv_u16 a5 = npyv_load_u16(a + 5 * lanes);
+        npyv_u16 b2 = npyv_load_u16(b + 2 * lanes);
+        npyv_u16 b3 = npyv_load_u16(b + 3 * lanes);
+        npyv_u16 b4 = npyv_load_u16(b + 4 * lanes);
+        npyv_u16 b5 = npyv_load_u16(b + 5 * lanes);
+        npyv_u16 r2 = V_INTRIN(a2, b2);
+        npyv_u16 r3 = V_INTRIN(a3, b3);
+        npyv_u16 r4 = V_INTRIN(a4, b4);
+        npyv_u16 r5 = V_INTRIN(a5, b5);
+    #endif
+        npyv_store_u16(dst + 0 * lanes, r0);
+        npyv_store_u16(dst + 1 * lanes, r1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_u16(dst + 2 * lanes, r2);
+        npyv_store_u16(dst + 3 * lanes, r3);
+        npyv_store_u16(dst + 4 * lanes, r4);
+        npyv_store_u16(dst + 5 * lanes, r5);
+    #endif
+        pos += chunk;
+    }
+    while (pos + lanes <= len) {
+        npyv_u16 a0 = npyv_load_u16(ip1 + pos);
+        npyv_u16 b0 = npyv_load_u16(ip2 + pos);
+        npyv_u16 r0 = V_INTRIN(a0, b0);
+        npyv_store_u16(op1 + pos, r0);
+        pos += lanes;
+    }
+    while (pos < len) {
+        const npyv_lanetype_u16 x = ip1[pos];
+        const npyv_lanetype_u16 y = ip2[pos];
+        op1[pos] = SCALAR_OP(x, y);
+        ++pos;
+    }
+}
+// Strided (non-contiguous) variant, meant for 32/64-bit float memory access.
+#if 0 && !defined(NPY_HAVE_NEON)
+// On Arm, unrolled scalars beat non-contiguous vector load/store.
+static inline void
+simd_binary_minp_u16(const npyv_lanetype_u16 *ip1, npy_intp sip1,
+                     const npyv_lanetype_u16 *ip2, npy_intp sip2,
+                     npyv_lanetype_u16 *op1, npy_intp sop1, npy_intp len)
+{
+    const int lanes = npyv_nlanes_u16;
+    for (; len >= lanes; len -= lanes) {
+        npyv_u16 lhs, rhs;
+        if (sip1 != 1) {
+            lhs = npyv_loadn_u16(ip1, sip1);
+        } else {
+            lhs = npyv_load_u16(ip1);
+        }
+        if (sip2 != 1) {
+            rhs = npyv_loadn_u16(ip2, sip2);
+        } else {
+            rhs = npyv_load_u16(ip2);
+        }
+        npyv_u16 out = V_INTRIN(lhs, rhs);
+        if (sop1 != 1) {
+            npyv_storen_u16(op1, sop1, out);
+        } else {
+            npyv_store_u16(op1, out);
+        }
+        ip1 += sip1*lanes;
+        ip2 += sip2*lanes;
+        op1 += sop1*lanes;
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {
+        const npyv_lanetype_u16 x = *ip1;
+        const npyv_lanetype_u16 y = *ip2;
+        *op1 = SCALAR_OP(x, y);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+
+#line 106
+#line 110
+#define SCALAR_OP scalar_max_i // scalar operation used by the tail loops below
+#if NPY_SIMD && (!0 || (0 && 0)) // reduces to NPY_SIMD alone
+
+#if 0 && !0 // constant false: the #else definitions are the ones in effect
+    #define V_INTRIN npyv_maxn_s32 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxn_s32
+#else
+    #define V_INTRIN npyv_max_s32 // element-wise vector operation
+    #define V_REDUCE_INTRIN npyv_reduce_max_s32 // collapses a vector to one lane value
+#endif
+
+// Reduce a contiguous input into op1[0]; the old op1[0] seeds the accumulator.
+static inline void
+simd_reduce_c_max_s32(const npyv_lanetype_s32 *ip, npyv_lanetype_s32 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return;
+    }
+    const int lanes = npyv_nlanes_s32;
+    const int block = lanes*8;
+    npyv_s32 vacc = npyv_setall_s32(op1[0]);
+    // Bulk: eight vectors per pass, merged pairwise before touching vacc.
+    while (len >= block) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + block, 0, 3);
+    #endif
+        npyv_s32 a0 = npyv_load_s32(ip + lanes * 0);
+        npyv_s32 a1 = npyv_load_s32(ip + lanes * 1);
+        npyv_s32 a2 = npyv_load_s32(ip + lanes * 2);
+        npyv_s32 a3 = npyv_load_s32(ip + lanes * 3);
+        npyv_s32 a4 = npyv_load_s32(ip + lanes * 4);
+        npyv_s32 a5 = npyv_load_s32(ip + lanes * 5);
+        npyv_s32 a6 = npyv_load_s32(ip + lanes * 6);
+        npyv_s32 a7 = npyv_load_s32(ip + lanes * 7);
+        npyv_s32 lo = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
+        npyv_s32 hi = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
+        vacc = V_INTRIN(vacc, V_INTRIN(lo, hi));
+        ip += block;
+        len -= block;
+    }
+    while (len >= lanes) {
+        vacc = V_INTRIN(vacc, npyv_load_s32(ip));
+        ip += lanes;
+        len -= lanes;
+    }
+    npyv_lanetype_s32 res = V_REDUCE_INTRIN(vacc);
+    for (npy_intp k = 0; k < len; ++k) {
+        const npyv_lanetype_s32 cur = ip[k];
+        res = SCALAR_OP(res, cur);
+    }
+    op1[0] = res;
+}
+
+// Element-wise operation on contiguous inputs and a contiguous output.
+static inline void
+simd_binary_ccc_max_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *ip2,
+                        npyv_lanetype_s32 *op1, npy_intp len)
+{
+    const int lanes = npyv_nlanes_s32;
+#if NPY_SIMD_WIDTH == 128
+    // 6x unrolling gave the best results on Apple M1.
+    const int unroll = 6;
+#else
+    // Small unroll to avoid a memory bandwidth bottleneck.
+    const int unroll = 2;
+#endif
+    const int chunk = unroll * lanes;
+    npy_intp pos = 0;
+    // Unrolled body: every load is issued before the first store.
+    while (pos + chunk <= len) {
+        const npyv_lanetype_s32 *a = ip1 + pos;
+        const npyv_lanetype_s32 *b = ip2 + pos;
+        npyv_lanetype_s32 *dst = op1 + pos;
+        npyv_s32 a0 = npyv_load_s32(a + 0 * lanes);
+        npyv_s32 a1 = npyv_load_s32(a + 1 * lanes);
+        npyv_s32 b0 = npyv_load_s32(b + 0 * lanes);
+        npyv_s32 b1 = npyv_load_s32(b + 1 * lanes);
+        npyv_s32 r0 = V_INTRIN(a0, b0);
+        npyv_s32 r1 = V_INTRIN(a1, b1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s32 a2 = npyv_load_s32(a + 2 * lanes);
+        npyv_s32 a3 = npyv_load_s32(a + 3 * lanes);
+        npyv_s32 a4 = npyv_load_s32(a + 4 * lanes);
+        npyv_s32 a5 = npyv_load_s32(a + 5 * lanes);
+        npyv_s32 b2 = npyv_load_s32(b + 2 * lanes);
+        npyv_s32 b3 = npyv_load_s32(b + 3 * lanes);
+        npyv_s32 b4 = npyv_load_s32(b + 4 * lanes);
+        npyv_s32 b5 = npyv_load_s32(b + 5 * lanes);
+        npyv_s32 r2 = V_INTRIN(a2, b2);
+        npyv_s32 r3 = V_INTRIN(a3, b3);
+        npyv_s32 r4 = V_INTRIN(a4, b4);
+        npyv_s32 r5 = V_INTRIN(a5, b5);
+    #endif
+        npyv_store_s32(dst + 0 * lanes, r0);
+        npyv_store_s32(dst + 1 * lanes, r1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_s32(dst + 2 * lanes, r2);
+        npyv_store_s32(dst + 3 * lanes, r3);
+        npyv_store_s32(dst + 4 * lanes, r4);
+        npyv_store_s32(dst + 5 * lanes, r5);
+    #endif
+        pos += chunk;
+    }
+    while (pos + lanes <= len) {
+        npyv_s32 a0 = npyv_load_s32(ip1 + pos);
+        npyv_s32 b0 = npyv_load_s32(ip2 + pos);
+        npyv_s32 r0 = V_INTRIN(a0, b0);
+        npyv_store_s32(op1 + pos, r0);
+        pos += lanes;
+    }
+    while (pos < len) {
+        const npyv_lanetype_s32 x = ip1[pos];
+        const npyv_lanetype_s32 y = ip2[pos];
+        op1[pos] = SCALAR_OP(x, y);
+        ++pos;
+    }
+}
+// Strided (non-contiguous) variant, meant for 32/64-bit float memory access.
+#if 0 && !defined(NPY_HAVE_NEON)
+// On Arm, unrolled scalars beat non-contiguous vector load/store.
+static inline void
+simd_binary_max_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1,
+                    const npyv_lanetype_s32 *ip2, npy_intp sip2,
+                    npyv_lanetype_s32 *op1, npy_intp sop1, npy_intp len)
+{
+    const int lanes = npyv_nlanes_s32;
+    for (; len >= lanes; len -= lanes) {
+        npyv_s32 lhs, rhs;
+        if (sip1 != 1) {
+            lhs = npyv_loadn_s32(ip1, sip1);
+        } else {
+            lhs = npyv_load_s32(ip1);
+        }
+        if (sip2 != 1) {
+            rhs = npyv_loadn_s32(ip2, sip2);
+        } else {
+            rhs = npyv_load_s32(ip2);
+        }
+        npyv_s32 out = V_INTRIN(lhs, rhs);
+        if (sop1 != 1) {
+            npyv_storen_s32(op1, sop1, out);
+        } else {
+            npyv_store_s32(op1, out);
+        }
+        ip1 += sip1*lanes;
+        ip2 += sip2*lanes;
+        op1 += sop1*lanes;
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {
+        const npyv_lanetype_s32 x = *ip1;
+        const npyv_lanetype_s32 y = *ip2;
+        *op1 = SCALAR_OP(x, y);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_min_i // scalar operation used by the tail loops below
+#if NPY_SIMD && (!0 || (0 && 0)) // reduces to NPY_SIMD alone
+
+#if 0 && !0 // constant false: the #else definitions are the ones in effect
+    #define V_INTRIN npyv_minn_s32 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minn_s32
+#else
+    #define V_INTRIN npyv_min_s32 // element-wise vector operation
+    #define V_REDUCE_INTRIN npyv_reduce_min_s32 // collapses a vector to one lane value
+#endif
+
+// Reduce a contiguous input into op1[0]; the old op1[0] seeds the accumulator.
+static inline void
+simd_reduce_c_min_s32(const npyv_lanetype_s32 *ip, npyv_lanetype_s32 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return;
+    }
+    const int lanes = npyv_nlanes_s32;
+    const int block = lanes*8;
+    npyv_s32 vacc = npyv_setall_s32(op1[0]);
+    // Bulk: eight vectors per pass, merged pairwise before touching vacc.
+    while (len >= block) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + block, 0, 3);
+    #endif
+        npyv_s32 a0 = npyv_load_s32(ip + lanes * 0);
+        npyv_s32 a1 = npyv_load_s32(ip + lanes * 1);
+        npyv_s32 a2 = npyv_load_s32(ip + lanes * 2);
+        npyv_s32 a3 = npyv_load_s32(ip + lanes * 3);
+        npyv_s32 a4 = npyv_load_s32(ip + lanes * 4);
+        npyv_s32 a5 = npyv_load_s32(ip + lanes * 5);
+        npyv_s32 a6 = npyv_load_s32(ip + lanes * 6);
+        npyv_s32 a7 = npyv_load_s32(ip + lanes * 7);
+        npyv_s32 lo = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
+        npyv_s32 hi = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
+        vacc = V_INTRIN(vacc, V_INTRIN(lo, hi));
+        ip += block;
+        len -= block;
+    }
+    while (len >= lanes) {
+        vacc = V_INTRIN(vacc, npyv_load_s32(ip));
+        ip += lanes;
+        len -= lanes;
+    }
+    npyv_lanetype_s32 res = V_REDUCE_INTRIN(vacc);
+    for (npy_intp k = 0; k < len; ++k) {
+        const npyv_lanetype_s32 cur = ip[k];
+        res = SCALAR_OP(res, cur);
+    }
+    op1[0] = res;
+}
+
+// Element-wise operation on contiguous inputs and a contiguous output.
+static inline void
+simd_binary_ccc_min_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *ip2,
+                        npyv_lanetype_s32 *op1, npy_intp len)
+{
+    const int lanes = npyv_nlanes_s32;
+#if NPY_SIMD_WIDTH == 128
+    // 6x unrolling gave the best results on Apple M1.
+    const int unroll = 6;
+#else
+    // Small unroll to avoid a memory bandwidth bottleneck.
+    const int unroll = 2;
+#endif
+    const int chunk = unroll * lanes;
+    npy_intp pos = 0;
+    // Unrolled body: every load is issued before the first store.
+    while (pos + chunk <= len) {
+        const npyv_lanetype_s32 *a = ip1 + pos;
+        const npyv_lanetype_s32 *b = ip2 + pos;
+        npyv_lanetype_s32 *dst = op1 + pos;
+        npyv_s32 a0 = npyv_load_s32(a + 0 * lanes);
+        npyv_s32 a1 = npyv_load_s32(a + 1 * lanes);
+        npyv_s32 b0 = npyv_load_s32(b + 0 * lanes);
+        npyv_s32 b1 = npyv_load_s32(b + 1 * lanes);
+        npyv_s32 r0 = V_INTRIN(a0, b0);
+        npyv_s32 r1 = V_INTRIN(a1, b1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s32 a2 = npyv_load_s32(a + 2 * lanes);
+        npyv_s32 a3 = npyv_load_s32(a + 3 * lanes);
+        npyv_s32 a4 = npyv_load_s32(a + 4 * lanes);
+        npyv_s32 a5 = npyv_load_s32(a + 5 * lanes);
+        npyv_s32 b2 = npyv_load_s32(b + 2 * lanes);
+        npyv_s32 b3 = npyv_load_s32(b + 3 * lanes);
+        npyv_s32 b4 = npyv_load_s32(b + 4 * lanes);
+        npyv_s32 b5 = npyv_load_s32(b + 5 * lanes);
+        npyv_s32 r2 = V_INTRIN(a2, b2);
+        npyv_s32 r3 = V_INTRIN(a3, b3);
+        npyv_s32 r4 = V_INTRIN(a4, b4);
+        npyv_s32 r5 = V_INTRIN(a5, b5);
+    #endif
+        npyv_store_s32(dst + 0 * lanes, r0);
+        npyv_store_s32(dst + 1 * lanes, r1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_s32(dst + 2 * lanes, r2);
+        npyv_store_s32(dst + 3 * lanes, r3);
+        npyv_store_s32(dst + 4 * lanes, r4);
+        npyv_store_s32(dst + 5 * lanes, r5);
+    #endif
+        pos += chunk;
+    }
+    while (pos + lanes <= len) {
+        npyv_s32 a0 = npyv_load_s32(ip1 + pos);
+        npyv_s32 b0 = npyv_load_s32(ip2 + pos);
+        npyv_s32 r0 = V_INTRIN(a0, b0);
+        npyv_store_s32(op1 + pos, r0);
+        pos += lanes;
+    }
+    while (pos < len) {
+        const npyv_lanetype_s32 x = ip1[pos];
+        const npyv_lanetype_s32 y = ip2[pos];
+        op1[pos] = SCALAR_OP(x, y);
+        ++pos;
+    }
+}
+// Strided (non-contiguous) variant, meant for 32/64-bit float memory access.
+#if 0 && !defined(NPY_HAVE_NEON)
+// On Arm, unrolled scalars beat non-contiguous vector load/store.
+static inline void
+simd_binary_min_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1,
+                    const npyv_lanetype_s32 *ip2, npy_intp sip2,
+                    npyv_lanetype_s32 *op1, npy_intp sop1, npy_intp len)
+{
+    const int lanes = npyv_nlanes_s32;
+    for (; len >= lanes; len -= lanes) {
+        npyv_s32 lhs, rhs;
+        if (sip1 != 1) {
+            lhs = npyv_loadn_s32(ip1, sip1);
+        } else {
+            lhs = npyv_load_s32(ip1);
+        }
+        if (sip2 != 1) {
+            rhs = npyv_loadn_s32(ip2, sip2);
+        } else {
+            rhs = npyv_load_s32(ip2);
+        }
+        npyv_s32 out = V_INTRIN(lhs, rhs);
+        if (sop1 != 1) {
+            npyv_storen_s32(op1, sop1, out);
+        } else {
+            npyv_store_s32(op1, out);
+        }
+        ip1 += sip1*lanes;
+        ip2 += sip2*lanes;
+        op1 += sop1*lanes;
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {
+        const npyv_lanetype_s32 x = *ip1;
+        const npyv_lanetype_s32 y = *ip2;
+        *op1 = SCALAR_OP(x, y);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_maxp_i // scalar operation used by the tail loops below
+#if NPY_SIMD && (!1 || (0 && 1)) // constant false: this whole section is compiled out
+
+#if 0 && !1 // constant false: the #else definitions would be the ones in effect
+    #define V_INTRIN npyv_maxpn_s32 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxpn_s32
+#else
+    #define V_INTRIN npyv_maxp_s32 // element-wise vector operation
+    #define V_REDUCE_INTRIN npyv_reduce_maxp_s32 // collapses a vector to one lane value
+#endif
+
+// Reduce a contiguous input into op1[0]; the old op1[0] seeds the accumulator.
+static inline void
+simd_reduce_c_maxp_s32(const npyv_lanetype_s32 *ip, npyv_lanetype_s32 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return;
+    }
+    const int lanes = npyv_nlanes_s32;
+    const int block = lanes*8;
+    npyv_s32 vacc = npyv_setall_s32(op1[0]);
+    // Bulk: eight vectors per pass, merged pairwise before touching vacc.
+    while (len >= block) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + block, 0, 3);
+    #endif
+        npyv_s32 a0 = npyv_load_s32(ip + lanes * 0);
+        npyv_s32 a1 = npyv_load_s32(ip + lanes * 1);
+        npyv_s32 a2 = npyv_load_s32(ip + lanes * 2);
+        npyv_s32 a3 = npyv_load_s32(ip + lanes * 3);
+        npyv_s32 a4 = npyv_load_s32(ip + lanes * 4);
+        npyv_s32 a5 = npyv_load_s32(ip + lanes * 5);
+        npyv_s32 a6 = npyv_load_s32(ip + lanes * 6);
+        npyv_s32 a7 = npyv_load_s32(ip + lanes * 7);
+        npyv_s32 lo = V_INTRIN(V_INTRIN(a0, a1), V_INTRIN(a2, a3));
+        npyv_s32 hi = V_INTRIN(V_INTRIN(a4, a5), V_INTRIN(a6, a7));
+        vacc = V_INTRIN(vacc, V_INTRIN(lo, hi));
+        ip += block;
+        len -= block;
+    }
+    while (len >= lanes) {
+        vacc = V_INTRIN(vacc, npyv_load_s32(ip));
+        ip += lanes;
+        len -= lanes;
+    }
+    npyv_lanetype_s32 res = V_REDUCE_INTRIN(vacc);
+    for (npy_intp k = 0; k < len; ++k) {
+        const npyv_lanetype_s32 cur = ip[k];
+        res = SCALAR_OP(res, cur);
+    }
+    op1[0] = res;
+}
+
+// Element-wise operation on contiguous inputs and a contiguous output.
+static inline void
+simd_binary_ccc_maxp_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *ip2,
+                         npyv_lanetype_s32 *op1, npy_intp len)
+{
+    const int lanes = npyv_nlanes_s32;
+#if NPY_SIMD_WIDTH == 128
+    // 6x unrolling gave the best results on Apple M1.
+    const int unroll = 6;
+#else
+    // Small unroll to avoid a memory bandwidth bottleneck.
+    const int unroll = 2;
+#endif
+    const int chunk = unroll * lanes;
+    npy_intp pos = 0;
+    // Unrolled body: every load is issued before the first store.
+    while (pos + chunk <= len) {
+        const npyv_lanetype_s32 *a = ip1 + pos;
+        const npyv_lanetype_s32 *b = ip2 + pos;
+        npyv_lanetype_s32 *dst = op1 + pos;
+        npyv_s32 a0 = npyv_load_s32(a + 0 * lanes);
+        npyv_s32 a1 = npyv_load_s32(a + 1 * lanes);
+        npyv_s32 b0 = npyv_load_s32(b + 0 * lanes);
+        npyv_s32 b1 = npyv_load_s32(b + 1 * lanes);
+        npyv_s32 r0 = V_INTRIN(a0, b0);
+        npyv_s32 r1 = V_INTRIN(a1, b1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s32 a2 = npyv_load_s32(a + 2 * lanes);
+        npyv_s32 a3 = npyv_load_s32(a + 3 * lanes);
+        npyv_s32 a4 = npyv_load_s32(a + 4 * lanes);
+        npyv_s32 a5 = npyv_load_s32(a + 5 * lanes);
+        npyv_s32 b2 = npyv_load_s32(b + 2 * lanes);
+        npyv_s32 b3 = npyv_load_s32(b + 3 * lanes);
+        npyv_s32 b4 = npyv_load_s32(b + 4 * lanes);
+        npyv_s32 b5 = npyv_load_s32(b + 5 * lanes);
+        npyv_s32 r2 = V_INTRIN(a2, b2);
+        npyv_s32 r3 = V_INTRIN(a3, b3);
+        npyv_s32 r4 = V_INTRIN(a4, b4);
+        npyv_s32 r5 = V_INTRIN(a5, b5);
+    #endif
+        npyv_store_s32(dst + 0 * lanes, r0);
+        npyv_store_s32(dst + 1 * lanes, r1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_s32(dst + 2 * lanes, r2);
+        npyv_store_s32(dst + 3 * lanes, r3);
+        npyv_store_s32(dst + 4 * lanes, r4);
+        npyv_store_s32(dst + 5 * lanes, r5);
+    #endif
+        pos += chunk;
+    }
+    while (pos + lanes <= len) {
+        npyv_s32 a0 = npyv_load_s32(ip1 + pos);
+        npyv_s32 b0 = npyv_load_s32(ip2 + pos);
+        npyv_s32 r0 = V_INTRIN(a0, b0);
+        npyv_store_s32(op1 + pos, r0);
+        pos += lanes;
+    }
+    while (pos < len) {
+        const npyv_lanetype_s32 x = ip1[pos];
+        const npyv_lanetype_s32 y = ip2[pos];
+        op1[pos] = SCALAR_OP(x, y);
+        ++pos;
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// Strided variant (strides counted in elements); compiled out here by the enclosing `#if 0 && ...`, and skipped on NEON where unrolled scalars beat non-contiguous vector load/store.
+static inline void
+simd_binary_maxp_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1,
+                           const npyv_lanetype_s32 *ip2, npy_intp sip2,
+                                 npyv_lanetype_s32 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_s32;  // results produced per vector pass
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,  // vector loop: advance each pointer by vstep strided elements
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_s32 a, b;
+        if (sip1 == 1) {  // unit stride: contiguous load
+            a = npyv_load_s32(ip1);
+        } else {
+            a = npyv_loadn_s32(ip1, sip1);  // non-contiguous load with stride sip1
+        }
+        if (sip2 == 1) {  // unit stride: contiguous load
+            b = npyv_load_s32(ip2);
+        } else {
+            b = npyv_loadn_s32(ip2, sip2);  // non-contiguous load with stride sip2
+        }
+        npyv_s32 r = V_INTRIN(a, b);
+        if (sop1 == 1) {  // unit stride: contiguous store
+            npyv_store_s32(op1, r);
+        } else {
+            npyv_storen_s32(op1, sop1, r);  // non-contiguous store with stride sop1
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s32 a = *ip1;
+        const npyv_lanetype_s32 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_minp_i
+#if NPY_SIMD && (!1 || (0 && 1))
+
+#if 0 && !1
+    #define V_INTRIN npyv_minpn_s32 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minpn_s32
+#else
+    #define V_INTRIN npyv_minp_s32
+    #define V_REDUCE_INTRIN npyv_reduce_minp_s32
+#endif
+
+// contiguous input: folds ip[0..len) into the value already stored in op1[0], using V_INTRIN on vectors and SCALAR_OP on the tail.
+static inline void
+simd_reduce_c_minp_s32(const npyv_lanetype_s32 *ip, npyv_lanetype_s32 *op1, npy_intp len)
+{
+    if (len < 1) {  // nothing to fold; op1[0] is left untouched
+        return;
+    }
+    const int vstep = npyv_nlanes_s32;  // elements consumed per vector load
+    const int wstep = vstep*8;  // elements consumed per 8-vector unrolled iteration
+    npyv_s32 acc = npyv_setall_s32(op1[0]);  // accumulator is seeded from the current output value
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);  // hint for the block the next iteration starts at
+    #endif
+        npyv_s32 v0 = npyv_load_s32(ip + vstep * 0);
+        npyv_s32 v1 = npyv_load_s32(ip + vstep * 1);
+        npyv_s32 v2 = npyv_load_s32(ip + vstep * 2);
+        npyv_s32 v3 = npyv_load_s32(ip + vstep * 3);
+
+        npyv_s32 v4 = npyv_load_s32(ip + vstep * 4);
+        npyv_s32 v5 = npyv_load_s32(ip + vstep * 5);
+        npyv_s32 v6 = npyv_load_s32(ip + vstep * 6);
+        npyv_s32 v7 = npyv_load_s32(ip + vstep * 7);
+
+        npyv_s32 r01 = V_INTRIN(v0, v1);  // combine the 8 vectors pairwise ...
+        npyv_s32 r23 = V_INTRIN(v2, v3);
+        npyv_s32 r45 = V_INTRIN(v4, v5);
+        npyv_s32 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));  // ... then merge the tree result into acc
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {  // leftover whole vectors, one per pass
+        acc = V_INTRIN(acc, npyv_load_s32(ip));
+    }
+    npyv_lanetype_s32 r = V_REDUCE_INTRIN(acc);  // collapse the vector accumulator to a single scalar
+    // Scalar tail: fold the final len (< vstep) elements
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_s32 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// contiguous inputs and output: op1[i] = op(ip1[i], ip2[i]) for i in [0, len), with V_INTRIN on vectors and SCALAR_OP on the tail.
+static inline void
+simd_binary_ccc_minp_s32(const npyv_lanetype_s32 *ip1, const npyv_lanetype_s32 *ip2,
+                                     npyv_lanetype_s32 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // Note, 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // Other SIMD widths: 2x unroll, to avoid a memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_s32;
+    int elemPerLoop = vectorsPerLoop * elemPerVector;  // elements handled by one unrolled iteration
+
+    npy_intp i = 0;  // next unprocessed element; shared by all three loops below
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {  // main loop: only complete unrolled groups
+        npyv_s32 v0 = npyv_load_s32(&ip1[i + 0 * elemPerVector]);
+        npyv_s32 v1 = npyv_load_s32(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s32 v2 = npyv_load_s32(&ip1[i + 2 * elemPerVector]);  // vectors 2..5 exist only in the 6x configuration
+        npyv_s32 v3 = npyv_load_s32(&ip1[i + 3 * elemPerVector]);
+        npyv_s32 v4 = npyv_load_s32(&ip1[i + 4 * elemPerVector]);
+        npyv_s32 v5 = npyv_load_s32(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_s32 u0 = npyv_load_s32(&ip2[i + 0 * elemPerVector]);
+        npyv_s32 u1 = npyv_load_s32(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s32 u2 = npyv_load_s32(&ip2[i + 2 * elemPerVector]);
+        npyv_s32 u3 = npyv_load_s32(&ip2[i + 3 * elemPerVector]);
+        npyv_s32 u4 = npyv_load_s32(&ip2[i + 4 * elemPerVector]);
+        npyv_s32 u5 = npyv_load_s32(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_s32 m0 = V_INTRIN(v0, u0);  // all loads are issued before any combine or store
+        npyv_s32 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s32 m2 = V_INTRIN(v2, u2);
+        npyv_s32 m3 = V_INTRIN(v3, u3);
+        npyv_s32 m4 = V_INTRIN(v4, u4);
+        npyv_s32 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_s32(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_s32(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_s32(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_s32(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_s32(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_s32(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {  // leftover whole vectors, one per pass
+        npyv_s32 v0 = npyv_load_s32(ip1 + i);
+        npyv_s32 u0 = npyv_load_s32(ip2 + i);
+        npyv_s32 m0 = V_INTRIN(v0, u0);
+        npyv_store_s32(op1 + i, m0);
+    }
+    // Scalar tail: the remaining len - i (< elemPerVector) elements
+    for (; i < len; ++i) {
+        const npyv_lanetype_s32 in1 = ip1[i];
+        const npyv_lanetype_s32 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// Strided variant (strides counted in elements); compiled out here by the enclosing `#if 0 && ...`, and skipped on NEON where unrolled scalars beat non-contiguous vector load/store.
+static inline void
+simd_binary_minp_s32(const npyv_lanetype_s32 *ip1, npy_intp sip1,
+                           const npyv_lanetype_s32 *ip2, npy_intp sip2,
+                                 npyv_lanetype_s32 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_s32;  // results produced per vector pass
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,  // vector loop: advance each pointer by vstep strided elements
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_s32 a, b;
+        if (sip1 == 1) {  // unit stride: contiguous load
+            a = npyv_load_s32(ip1);
+        } else {
+            a = npyv_loadn_s32(ip1, sip1);  // non-contiguous load with stride sip1
+        }
+        if (sip2 == 1) {  // unit stride: contiguous load
+            b = npyv_load_s32(ip2);
+        } else {
+            b = npyv_loadn_s32(ip2, sip2);  // non-contiguous load with stride sip2
+        }
+        npyv_s32 r = V_INTRIN(a, b);
+        if (sop1 == 1) {  // unit stride: contiguous store
+            npyv_store_s32(op1, r);
+        } else {
+            npyv_storen_s32(op1, sop1, r);  // non-contiguous store with stride sop1
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s32 a = *ip1;
+        const npyv_lanetype_s32 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+
+#line 106
+#line 110
+#define SCALAR_OP scalar_max_i
+#if NPY_SIMD && (!0 || (0 && 0))
+
+#if 0 && !0
+    #define V_INTRIN npyv_maxn_u32 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxn_u32
+#else
+    #define V_INTRIN npyv_max_u32
+    #define V_REDUCE_INTRIN npyv_reduce_max_u32
+#endif
+
+// contiguous input: folds ip[0..len) into the value already stored in op1[0], using V_INTRIN on vectors and SCALAR_OP on the tail.
+static inline void
+simd_reduce_c_max_u32(const npyv_lanetype_u32 *ip, npyv_lanetype_u32 *op1, npy_intp len)
+{
+    if (len < 1) {  // nothing to fold; op1[0] is left untouched
+        return;
+    }
+    const int vstep = npyv_nlanes_u32;  // elements consumed per vector load
+    const int wstep = vstep*8;  // elements consumed per 8-vector unrolled iteration
+    npyv_u32 acc = npyv_setall_u32(op1[0]);  // accumulator is seeded from the current output value
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);  // hint for the block the next iteration starts at
+    #endif
+        npyv_u32 v0 = npyv_load_u32(ip + vstep * 0);
+        npyv_u32 v1 = npyv_load_u32(ip + vstep * 1);
+        npyv_u32 v2 = npyv_load_u32(ip + vstep * 2);
+        npyv_u32 v3 = npyv_load_u32(ip + vstep * 3);
+
+        npyv_u32 v4 = npyv_load_u32(ip + vstep * 4);
+        npyv_u32 v5 = npyv_load_u32(ip + vstep * 5);
+        npyv_u32 v6 = npyv_load_u32(ip + vstep * 6);
+        npyv_u32 v7 = npyv_load_u32(ip + vstep * 7);
+
+        npyv_u32 r01 = V_INTRIN(v0, v1);  // combine the 8 vectors pairwise ...
+        npyv_u32 r23 = V_INTRIN(v2, v3);
+        npyv_u32 r45 = V_INTRIN(v4, v5);
+        npyv_u32 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));  // ... then merge the tree result into acc
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {  // leftover whole vectors, one per pass
+        acc = V_INTRIN(acc, npyv_load_u32(ip));
+    }
+    npyv_lanetype_u32 r = V_REDUCE_INTRIN(acc);  // collapse the vector accumulator to a single scalar
+    // Scalar tail: fold the final len (< vstep) elements
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_u32 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// contiguous inputs and output: op1[i] = op(ip1[i], ip2[i]) for i in [0, len), with V_INTRIN on vectors and SCALAR_OP on the tail.
+static inline void
+simd_binary_ccc_max_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *ip2,
+                                     npyv_lanetype_u32 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // Note, 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // Other SIMD widths: 2x unroll, to avoid a memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_u32;
+    int elemPerLoop = vectorsPerLoop * elemPerVector;  // elements handled by one unrolled iteration
+
+    npy_intp i = 0;  // next unprocessed element; shared by all three loops below
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {  // main loop: only complete unrolled groups
+        npyv_u32 v0 = npyv_load_u32(&ip1[i + 0 * elemPerVector]);
+        npyv_u32 v1 = npyv_load_u32(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u32 v2 = npyv_load_u32(&ip1[i + 2 * elemPerVector]);  // vectors 2..5 exist only in the 6x configuration
+        npyv_u32 v3 = npyv_load_u32(&ip1[i + 3 * elemPerVector]);
+        npyv_u32 v4 = npyv_load_u32(&ip1[i + 4 * elemPerVector]);
+        npyv_u32 v5 = npyv_load_u32(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_u32 u0 = npyv_load_u32(&ip2[i + 0 * elemPerVector]);
+        npyv_u32 u1 = npyv_load_u32(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u32 u2 = npyv_load_u32(&ip2[i + 2 * elemPerVector]);
+        npyv_u32 u3 = npyv_load_u32(&ip2[i + 3 * elemPerVector]);
+        npyv_u32 u4 = npyv_load_u32(&ip2[i + 4 * elemPerVector]);
+        npyv_u32 u5 = npyv_load_u32(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_u32 m0 = V_INTRIN(v0, u0);  // all loads are issued before any combine or store
+        npyv_u32 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u32 m2 = V_INTRIN(v2, u2);
+        npyv_u32 m3 = V_INTRIN(v3, u3);
+        npyv_u32 m4 = V_INTRIN(v4, u4);
+        npyv_u32 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_u32(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_u32(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_u32(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_u32(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_u32(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_u32(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {  // leftover whole vectors, one per pass
+        npyv_u32 v0 = npyv_load_u32(ip1 + i);
+        npyv_u32 u0 = npyv_load_u32(ip2 + i);
+        npyv_u32 m0 = V_INTRIN(v0, u0);
+        npyv_store_u32(op1 + i, m0);
+    }
+    // Scalar tail: the remaining len - i (< elemPerVector) elements
+    for (; i < len; ++i) {
+        const npyv_lanetype_u32 in1 = ip1[i];
+        const npyv_lanetype_u32 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// Strided variant (strides counted in elements); compiled out here by the enclosing `#if 0 && ...`, and skipped on NEON where unrolled scalars beat non-contiguous vector load/store.
+static inline void
+simd_binary_max_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1,
+                           const npyv_lanetype_u32 *ip2, npy_intp sip2,
+                                 npyv_lanetype_u32 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_u32;  // results produced per vector pass
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,  // vector loop: advance each pointer by vstep strided elements
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_u32 a, b;
+        if (sip1 == 1) {  // unit stride: contiguous load
+            a = npyv_load_u32(ip1);
+        } else {
+            a = npyv_loadn_u32(ip1, sip1);  // non-contiguous load with stride sip1
+        }
+        if (sip2 == 1) {  // unit stride: contiguous load
+            b = npyv_load_u32(ip2);
+        } else {
+            b = npyv_loadn_u32(ip2, sip2);  // non-contiguous load with stride sip2
+        }
+        npyv_u32 r = V_INTRIN(a, b);
+        if (sop1 == 1) {  // unit stride: contiguous store
+            npyv_store_u32(op1, r);
+        } else {
+            npyv_storen_u32(op1, sop1, r);  // non-contiguous store with stride sop1
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u32 a = *ip1;
+        const npyv_lanetype_u32 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_min_i
+#if NPY_SIMD && (!0 || (0 && 0))
+
+#if 0 && !0
+    #define V_INTRIN npyv_minn_u32 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minn_u32
+#else
+    #define V_INTRIN npyv_min_u32
+    #define V_REDUCE_INTRIN npyv_reduce_min_u32
+#endif
+
+// contiguous input: folds ip[0..len) into the value already stored in op1[0], using V_INTRIN on vectors and SCALAR_OP on the tail.
+static inline void
+simd_reduce_c_min_u32(const npyv_lanetype_u32 *ip, npyv_lanetype_u32 *op1, npy_intp len)
+{
+    if (len < 1) {  // nothing to fold; op1[0] is left untouched
+        return;
+    }
+    const int vstep = npyv_nlanes_u32;  // elements consumed per vector load
+    const int wstep = vstep*8;  // elements consumed per 8-vector unrolled iteration
+    npyv_u32 acc = npyv_setall_u32(op1[0]);  // accumulator is seeded from the current output value
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);  // hint for the block the next iteration starts at
+    #endif
+        npyv_u32 v0 = npyv_load_u32(ip + vstep * 0);
+        npyv_u32 v1 = npyv_load_u32(ip + vstep * 1);
+        npyv_u32 v2 = npyv_load_u32(ip + vstep * 2);
+        npyv_u32 v3 = npyv_load_u32(ip + vstep * 3);
+
+        npyv_u32 v4 = npyv_load_u32(ip + vstep * 4);
+        npyv_u32 v5 = npyv_load_u32(ip + vstep * 5);
+        npyv_u32 v6 = npyv_load_u32(ip + vstep * 6);
+        npyv_u32 v7 = npyv_load_u32(ip + vstep * 7);
+
+        npyv_u32 r01 = V_INTRIN(v0, v1);  // combine the 8 vectors pairwise ...
+        npyv_u32 r23 = V_INTRIN(v2, v3);
+        npyv_u32 r45 = V_INTRIN(v4, v5);
+        npyv_u32 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));  // ... then merge the tree result into acc
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {  // leftover whole vectors, one per pass
+        acc = V_INTRIN(acc, npyv_load_u32(ip));
+    }
+    npyv_lanetype_u32 r = V_REDUCE_INTRIN(acc);  // collapse the vector accumulator to a single scalar
+    // Scalar tail: fold the final len (< vstep) elements
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_u32 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// contiguous inputs and output: op1[i] = op(ip1[i], ip2[i]) for i in [0, len), with V_INTRIN on vectors and SCALAR_OP on the tail.
+static inline void
+simd_binary_ccc_min_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *ip2,
+                                     npyv_lanetype_u32 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // Note, 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // Other SIMD widths: 2x unroll, to avoid a memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_u32;
+    int elemPerLoop = vectorsPerLoop * elemPerVector;  // elements handled by one unrolled iteration
+
+    npy_intp i = 0;  // next unprocessed element; shared by all three loops below
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {  // main loop: only complete unrolled groups
+        npyv_u32 v0 = npyv_load_u32(&ip1[i + 0 * elemPerVector]);
+        npyv_u32 v1 = npyv_load_u32(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u32 v2 = npyv_load_u32(&ip1[i + 2 * elemPerVector]);  // vectors 2..5 exist only in the 6x configuration
+        npyv_u32 v3 = npyv_load_u32(&ip1[i + 3 * elemPerVector]);
+        npyv_u32 v4 = npyv_load_u32(&ip1[i + 4 * elemPerVector]);
+        npyv_u32 v5 = npyv_load_u32(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_u32 u0 = npyv_load_u32(&ip2[i + 0 * elemPerVector]);
+        npyv_u32 u1 = npyv_load_u32(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u32 u2 = npyv_load_u32(&ip2[i + 2 * elemPerVector]);
+        npyv_u32 u3 = npyv_load_u32(&ip2[i + 3 * elemPerVector]);
+        npyv_u32 u4 = npyv_load_u32(&ip2[i + 4 * elemPerVector]);
+        npyv_u32 u5 = npyv_load_u32(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_u32 m0 = V_INTRIN(v0, u0);  // all loads are issued before any combine or store
+        npyv_u32 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u32 m2 = V_INTRIN(v2, u2);
+        npyv_u32 m3 = V_INTRIN(v3, u3);
+        npyv_u32 m4 = V_INTRIN(v4, u4);
+        npyv_u32 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_u32(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_u32(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_u32(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_u32(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_u32(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_u32(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {  // leftover whole vectors, one per pass
+        npyv_u32 v0 = npyv_load_u32(ip1 + i);
+        npyv_u32 u0 = npyv_load_u32(ip2 + i);
+        npyv_u32 m0 = V_INTRIN(v0, u0);
+        npyv_store_u32(op1 + i, m0);
+    }
+    // Scalar tail: the remaining len - i (< elemPerVector) elements
+    for (; i < len; ++i) {
+        const npyv_lanetype_u32 in1 = ip1[i];
+        const npyv_lanetype_u32 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// Strided variant (strides counted in elements); compiled out here by the enclosing `#if 0 && ...`, and skipped on NEON where unrolled scalars beat non-contiguous vector load/store.
+static inline void
+simd_binary_min_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1,
+                           const npyv_lanetype_u32 *ip2, npy_intp sip2,
+                                 npyv_lanetype_u32 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_u32;  // results produced per vector pass
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,  // vector loop: advance each pointer by vstep strided elements
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_u32 a, b;
+        if (sip1 == 1) {  // unit stride: contiguous load
+            a = npyv_load_u32(ip1);
+        } else {
+            a = npyv_loadn_u32(ip1, sip1);  // non-contiguous load with stride sip1
+        }
+        if (sip2 == 1) {  // unit stride: contiguous load
+            b = npyv_load_u32(ip2);
+        } else {
+            b = npyv_loadn_u32(ip2, sip2);  // non-contiguous load with stride sip2
+        }
+        npyv_u32 r = V_INTRIN(a, b);
+        if (sop1 == 1) {  // unit stride: contiguous store
+            npyv_store_u32(op1, r);
+        } else {
+            npyv_storen_u32(op1, sop1, r);  // non-contiguous store with stride sop1
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u32 a = *ip1;
+        const npyv_lanetype_u32 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_maxp_i
+#if NPY_SIMD && (!1 || (0 && 1))
+
+#if 0 && !1
+    #define V_INTRIN npyv_maxpn_u32 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxpn_u32
+#else
+    #define V_INTRIN npyv_maxp_u32
+    #define V_REDUCE_INTRIN npyv_reduce_maxp_u32
+#endif
+
+// contiguous input: folds ip[0..len) into the value already stored in op1[0], using V_INTRIN on vectors and SCALAR_OP on the tail.
+static inline void
+simd_reduce_c_maxp_u32(const npyv_lanetype_u32 *ip, npyv_lanetype_u32 *op1, npy_intp len)
+{
+    if (len < 1) {  // nothing to fold; op1[0] is left untouched
+        return;
+    }
+    const int vstep = npyv_nlanes_u32;  // elements consumed per vector load
+    const int wstep = vstep*8;  // elements consumed per 8-vector unrolled iteration
+    npyv_u32 acc = npyv_setall_u32(op1[0]);  // accumulator is seeded from the current output value
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);  // hint for the block the next iteration starts at
+    #endif
+        npyv_u32 v0 = npyv_load_u32(ip + vstep * 0);
+        npyv_u32 v1 = npyv_load_u32(ip + vstep * 1);
+        npyv_u32 v2 = npyv_load_u32(ip + vstep * 2);
+        npyv_u32 v3 = npyv_load_u32(ip + vstep * 3);
+
+        npyv_u32 v4 = npyv_load_u32(ip + vstep * 4);
+        npyv_u32 v5 = npyv_load_u32(ip + vstep * 5);
+        npyv_u32 v6 = npyv_load_u32(ip + vstep * 6);
+        npyv_u32 v7 = npyv_load_u32(ip + vstep * 7);
+
+        npyv_u32 r01 = V_INTRIN(v0, v1);  // combine the 8 vectors pairwise ...
+        npyv_u32 r23 = V_INTRIN(v2, v3);
+        npyv_u32 r45 = V_INTRIN(v4, v5);
+        npyv_u32 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));  // ... then merge the tree result into acc
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {  // leftover whole vectors, one per pass
+        acc = V_INTRIN(acc, npyv_load_u32(ip));
+    }
+    npyv_lanetype_u32 r = V_REDUCE_INTRIN(acc);  // collapse the vector accumulator to a single scalar
+    // Scalar tail: fold the final len (< vstep) elements
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_u32 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// contiguous inputs and output: op1[i] = op(ip1[i], ip2[i]) for i in [0, len), with V_INTRIN on vectors and SCALAR_OP on the tail.
+static inline void
+simd_binary_ccc_maxp_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *ip2,
+                                     npyv_lanetype_u32 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // Note, 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // Other SIMD widths: 2x unroll, to avoid a memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_u32;
+    int elemPerLoop = vectorsPerLoop * elemPerVector;  // elements handled by one unrolled iteration
+
+    npy_intp i = 0;  // next unprocessed element; shared by all three loops below
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {  // main loop: only complete unrolled groups
+        npyv_u32 v0 = npyv_load_u32(&ip1[i + 0 * elemPerVector]);
+        npyv_u32 v1 = npyv_load_u32(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u32 v2 = npyv_load_u32(&ip1[i + 2 * elemPerVector]);  // vectors 2..5 exist only in the 6x configuration
+        npyv_u32 v3 = npyv_load_u32(&ip1[i + 3 * elemPerVector]);
+        npyv_u32 v4 = npyv_load_u32(&ip1[i + 4 * elemPerVector]);
+        npyv_u32 v5 = npyv_load_u32(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_u32 u0 = npyv_load_u32(&ip2[i + 0 * elemPerVector]);
+        npyv_u32 u1 = npyv_load_u32(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u32 u2 = npyv_load_u32(&ip2[i + 2 * elemPerVector]);
+        npyv_u32 u3 = npyv_load_u32(&ip2[i + 3 * elemPerVector]);
+        npyv_u32 u4 = npyv_load_u32(&ip2[i + 4 * elemPerVector]);
+        npyv_u32 u5 = npyv_load_u32(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_u32 m0 = V_INTRIN(v0, u0);  // all loads are issued before any combine or store
+        npyv_u32 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u32 m2 = V_INTRIN(v2, u2);
+        npyv_u32 m3 = V_INTRIN(v3, u3);
+        npyv_u32 m4 = V_INTRIN(v4, u4);
+        npyv_u32 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_u32(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_u32(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_u32(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_u32(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_u32(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_u32(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {  // leftover whole vectors, one per pass
+        npyv_u32 v0 = npyv_load_u32(ip1 + i);
+        npyv_u32 u0 = npyv_load_u32(ip2 + i);
+        npyv_u32 m0 = V_INTRIN(v0, u0);
+        npyv_store_u32(op1 + i, m0);
+    }
+    // Scalar tail: the remaining len - i (< elemPerVector) elements
+    for (; i < len; ++i) {
+        const npyv_lanetype_u32 in1 = ip1[i];
+        const npyv_lanetype_u32 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// Strided variant (strides counted in elements); compiled out here by the enclosing `#if 0 && ...`, and skipped on NEON where unrolled scalars beat non-contiguous vector load/store.
+static inline void
+simd_binary_maxp_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1,
+                           const npyv_lanetype_u32 *ip2, npy_intp sip2,
+                                 npyv_lanetype_u32 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_u32;  // results produced per vector pass
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,  // vector loop: advance each pointer by vstep strided elements
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_u32 a, b;
+        if (sip1 == 1) {  // unit stride: contiguous load
+            a = npyv_load_u32(ip1);
+        } else {
+            a = npyv_loadn_u32(ip1, sip1);  // non-contiguous load with stride sip1
+        }
+        if (sip2 == 1) {  // unit stride: contiguous load
+            b = npyv_load_u32(ip2);
+        } else {
+            b = npyv_loadn_u32(ip2, sip2);  // non-contiguous load with stride sip2
+        }
+        npyv_u32 r = V_INTRIN(a, b);
+        if (sop1 == 1) {  // unit stride: contiguous store
+            npyv_store_u32(op1, r);
+        } else {
+            npyv_storen_u32(op1, sop1, r);  // non-contiguous store with stride sop1
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u32 a = *ip1;
+        const npyv_lanetype_u32 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_minp_i
+#if NPY_SIMD && (!1 || (0 && 1))
+
+#if 0 && !1
+    #define V_INTRIN npyv_minpn_u32 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minpn_u32
+#else
+    #define V_INTRIN npyv_minp_u32
+    #define V_REDUCE_INTRIN npyv_reduce_minp_u32
+#endif
+
+// contiguous input: folds ip[0..len) into the value already stored in op1[0], using V_INTRIN on vectors and SCALAR_OP on the tail.
+static inline void
+simd_reduce_c_minp_u32(const npyv_lanetype_u32 *ip, npyv_lanetype_u32 *op1, npy_intp len)
+{
+    if (len < 1) {  // nothing to fold; op1[0] is left untouched
+        return;
+    }
+    const int vstep = npyv_nlanes_u32;  // elements consumed per vector load
+    const int wstep = vstep*8;  // elements consumed per 8-vector unrolled iteration
+    npyv_u32 acc = npyv_setall_u32(op1[0]);  // accumulator is seeded from the current output value
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);  // hint for the block the next iteration starts at
+    #endif
+        npyv_u32 v0 = npyv_load_u32(ip + vstep * 0);
+        npyv_u32 v1 = npyv_load_u32(ip + vstep * 1);
+        npyv_u32 v2 = npyv_load_u32(ip + vstep * 2);
+        npyv_u32 v3 = npyv_load_u32(ip + vstep * 3);
+
+        npyv_u32 v4 = npyv_load_u32(ip + vstep * 4);
+        npyv_u32 v5 = npyv_load_u32(ip + vstep * 5);
+        npyv_u32 v6 = npyv_load_u32(ip + vstep * 6);
+        npyv_u32 v7 = npyv_load_u32(ip + vstep * 7);
+
+        npyv_u32 r01 = V_INTRIN(v0, v1);  // combine the 8 vectors pairwise ...
+        npyv_u32 r23 = V_INTRIN(v2, v3);
+        npyv_u32 r45 = V_INTRIN(v4, v5);
+        npyv_u32 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));  // ... then merge the tree result into acc
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {  // leftover whole vectors, one per pass
+        acc = V_INTRIN(acc, npyv_load_u32(ip));
+    }
+    npyv_lanetype_u32 r = V_REDUCE_INTRIN(acc);  // collapse the vector accumulator to a single scalar
+    // Scalar tail: fold the final len (< vstep) elements
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_u32 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// contiguous inputs and output: op1[i] = op(ip1[i], ip2[i]) for i in [0, len), with V_INTRIN on vectors and SCALAR_OP on the tail.
+static inline void
+simd_binary_ccc_minp_u32(const npyv_lanetype_u32 *ip1, const npyv_lanetype_u32 *ip2,
+                                     npyv_lanetype_u32 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // Note, 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // Other SIMD widths: 2x unroll, to avoid a memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_u32;
+    int elemPerLoop = vectorsPerLoop * elemPerVector;  // elements handled by one unrolled iteration
+
+    npy_intp i = 0;  // next unprocessed element; shared by all three loops below
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {  // main loop: only complete unrolled groups
+        npyv_u32 v0 = npyv_load_u32(&ip1[i + 0 * elemPerVector]);
+        npyv_u32 v1 = npyv_load_u32(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u32 v2 = npyv_load_u32(&ip1[i + 2 * elemPerVector]);  // vectors 2..5 exist only in the 6x configuration
+        npyv_u32 v3 = npyv_load_u32(&ip1[i + 3 * elemPerVector]);
+        npyv_u32 v4 = npyv_load_u32(&ip1[i + 4 * elemPerVector]);
+        npyv_u32 v5 = npyv_load_u32(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_u32 u0 = npyv_load_u32(&ip2[i + 0 * elemPerVector]);
+        npyv_u32 u1 = npyv_load_u32(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u32 u2 = npyv_load_u32(&ip2[i + 2 * elemPerVector]);
+        npyv_u32 u3 = npyv_load_u32(&ip2[i + 3 * elemPerVector]);
+        npyv_u32 u4 = npyv_load_u32(&ip2[i + 4 * elemPerVector]);
+        npyv_u32 u5 = npyv_load_u32(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_u32 m0 = V_INTRIN(v0, u0);  // all loads are issued before any combine or store
+        npyv_u32 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u32 m2 = V_INTRIN(v2, u2);
+        npyv_u32 m3 = V_INTRIN(v3, u3);
+        npyv_u32 m4 = V_INTRIN(v4, u4);
+        npyv_u32 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_u32(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_u32(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_u32(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_u32(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_u32(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_u32(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {  // leftover whole vectors, one per pass
+        npyv_u32 v0 = npyv_load_u32(ip1 + i);
+        npyv_u32 u0 = npyv_load_u32(ip2 + i);
+        npyv_u32 m0 = V_INTRIN(v0, u0);
+        npyv_store_u32(op1 + i, m0);
+    }
+    // Scalar tail: the remaining len - i (< elemPerVector) elements
+    for (; i < len; ++i) {
+        const npyv_lanetype_u32 in1 = ip1[i];
+        const npyv_lanetype_u32 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// Strided variant (strides counted in elements); compiled out here by the enclosing `#if 0 && ...`, and skipped on NEON where unrolled scalars beat non-contiguous vector load/store.
+static inline void
+simd_binary_minp_u32(const npyv_lanetype_u32 *ip1, npy_intp sip1,
+                           const npyv_lanetype_u32 *ip2, npy_intp sip2,
+                                 npyv_lanetype_u32 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_u32;  // results produced per vector pass
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,  // vector loop: advance each pointer by vstep strided elements
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_u32 a, b;
+        if (sip1 == 1) {  // unit stride: contiguous load
+            a = npyv_load_u32(ip1);
+        } else {
+            a = npyv_loadn_u32(ip1, sip1);  // non-contiguous load with stride sip1
+        }
+        if (sip2 == 1) {  // unit stride: contiguous load
+            b = npyv_load_u32(ip2);
+        } else {
+            b = npyv_loadn_u32(ip2, sip2);  // non-contiguous load with stride sip2
+        }
+        npyv_u32 r = V_INTRIN(a, b);
+        if (sop1 == 1) {  // unit stride: contiguous store
+            npyv_store_u32(op1, r);
+        } else {
+            npyv_storen_u32(op1, sop1, r);  // non-contiguous store with stride sop1
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u32 a = *ip1;
+        const npyv_lanetype_u32 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+
+#line 106
+#line 110
+#define SCALAR_OP scalar_max_i
+#if NPY_SIMD && (!0 || (0 && 0))
+
+#if 0 && !0
+    #define V_INTRIN npyv_maxn_s64 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxn_s64
+#else
+    #define V_INTRIN npyv_max_s64
+    #define V_REDUCE_INTRIN npyv_reduce_max_s64
+#endif
+
+// contiguous input: folds ip[0..len) into the value already stored in op1[0], using V_INTRIN on vectors and SCALAR_OP on the tail.
+static inline void
+simd_reduce_c_max_s64(const npyv_lanetype_s64 *ip, npyv_lanetype_s64 *op1, npy_intp len)
+{
+    if (len < 1) {  // nothing to fold; op1[0] is left untouched
+        return;
+    }
+    const int vstep = npyv_nlanes_s64;  // elements consumed per vector load
+    const int wstep = vstep*8;  // elements consumed per 8-vector unrolled iteration
+    npyv_s64 acc = npyv_setall_s64(op1[0]);  // accumulator is seeded from the current output value
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);  // hint for the block the next iteration starts at
+    #endif
+        npyv_s64 v0 = npyv_load_s64(ip + vstep * 0);
+        npyv_s64 v1 = npyv_load_s64(ip + vstep * 1);
+        npyv_s64 v2 = npyv_load_s64(ip + vstep * 2);
+        npyv_s64 v3 = npyv_load_s64(ip + vstep * 3);
+
+        npyv_s64 v4 = npyv_load_s64(ip + vstep * 4);
+        npyv_s64 v5 = npyv_load_s64(ip + vstep * 5);
+        npyv_s64 v6 = npyv_load_s64(ip + vstep * 6);
+        npyv_s64 v7 = npyv_load_s64(ip + vstep * 7);
+
+        npyv_s64 r01 = V_INTRIN(v0, v1);  // combine the 8 vectors pairwise ...
+        npyv_s64 r23 = V_INTRIN(v2, v3);
+        npyv_s64 r45 = V_INTRIN(v4, v5);
+        npyv_s64 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));  // ... then merge the tree result into acc
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {  // leftover whole vectors, one per pass
+        acc = V_INTRIN(acc, npyv_load_s64(ip));
+    }
+    npyv_lanetype_s64 r = V_REDUCE_INTRIN(acc);  // collapse the vector accumulator to a single scalar
+    // Scalar tail: fold the final len (< vstep) elements
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_s64 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// Element-wise max over contiguous inputs and output: op1[i] is computed from ip1[i] and ip2[i] for i in [0, len).
+static inline void
+simd_binary_ccc_max_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *ip2,
+                                     npyv_lanetype_s64 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // 128-bit vectors: 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // Other vector widths: 2x unroll, to avoid memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_s64;
+    int elemPerLoop = vectorsPerLoop * elemPerVector; // elements consumed per unrolled iteration
+
+    npy_intp i = 0; // index of the next unprocessed element
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) { // unrolled main loop
+        npyv_s64 v0 = npyv_load_s64(&ip1[i + 0 * elemPerVector]);
+        npyv_s64 v1 = npyv_load_s64(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s64 v2 = npyv_load_s64(&ip1[i + 2 * elemPerVector]); // vectors 2..5 exist only in the 6x unroll
+        npyv_s64 v3 = npyv_load_s64(&ip1[i + 3 * elemPerVector]);
+        npyv_s64 v4 = npyv_load_s64(&ip1[i + 4 * elemPerVector]);
+        npyv_s64 v5 = npyv_load_s64(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_s64 u0 = npyv_load_s64(&ip2[i + 0 * elemPerVector]);
+        npyv_s64 u1 = npyv_load_s64(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s64 u2 = npyv_load_s64(&ip2[i + 2 * elemPerVector]);
+        npyv_s64 u3 = npyv_load_s64(&ip2[i + 3 * elemPerVector]);
+        npyv_s64 u4 = npyv_load_s64(&ip2[i + 4 * elemPerVector]);
+        npyv_s64 u5 = npyv_load_s64(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_s64 m0 = V_INTRIN(v0, u0); // all loads are issued before any compute or store
+        npyv_s64 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s64 m2 = V_INTRIN(v2, u2);
+        npyv_s64 m3 = V_INTRIN(v3, u3);
+        npyv_s64 m4 = V_INTRIN(v4, u4);
+        npyv_s64 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_s64(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_s64(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_s64(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_s64(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_s64(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_s64(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) { // leftover whole vectors, one at a time
+        npyv_s64 v0 = npyv_load_s64(ip1 + i);
+        npyv_s64 u0 = npyv_load_s64(ip2 + i);
+        npyv_s64 m0 = V_INTRIN(v0, u0);
+        npyv_store_s64(op1 + i, m0);
+    }
+    // Scalar tail: fewer than elemPerVector elements remain
+    for (; i < len; ++i) {
+        const npyv_lanetype_s64 in1 = ip1[i];
+        const npyv_lanetype_s64 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// Strided element-wise max (strides counted in elements); skipped on NEON because unrolled scalars are faster than non-contiguous vector load/store on Arm.
+static inline void
+simd_binary_max_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1,
+                           const npyv_lanetype_s64 *ip2, npy_intp sip2,
+                                 npyv_lanetype_s64 *op1, npy_intp sop1,
+                                 npy_intp len)
+{ // NOTE: the guard above is `#if 0 && ...`, so this s64 variant is compiled out
+    const int vstep = npyv_nlanes_s64; // elements handled per vector iteration
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_s64 a, b;
+        if (sip1 == 1) { // unit stride: plain vector load
+            a = npyv_load_s64(ip1);
+        } else { // any other stride: strided load
+            a = npyv_loadn_s64(ip1, sip1);
+        }
+        if (sip2 == 1) { // same choice for the second operand
+            b = npyv_load_s64(ip2);
+        } else {
+            b = npyv_loadn_s64(ip2, sip2);
+        }
+        npyv_s64 r = V_INTRIN(a, b);
+        if (sop1 == 1) { // unit stride: plain vector store
+            npyv_store_s64(op1, r);
+        } else { // any other stride: strided store
+            npyv_storen_s64(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) { // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s64 a = *ip1;
+        const npyv_lanetype_s64 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_min_i
+#if NPY_SIMD && (!0 || (0 && 0))
+
+#if 0 && !0
+    #define V_INTRIN npyv_minn_s64 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minn_s64
+#else
+    #define V_INTRIN npyv_min_s64
+    #define V_REDUCE_INTRIN npyv_reduce_min_s64
+#endif
+
+// Contiguous min-reduction: folds the `len` elements at `ip` into op1[0]; the incoming op1[0] is the starting value.
+static inline void
+simd_reduce_c_min_s64(const npyv_lanetype_s64 *ip, npyv_lanetype_s64 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return; // nothing to fold in; op1[0] is left as-is
+    }
+    const int vstep = npyv_nlanes_s64; // elements per vector
+    const int wstep = vstep*8; // elements per 8x-unrolled iteration
+    npyv_s64 acc = npyv_setall_s64(op1[0]); // seed every lane with the current result
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3); // address is the start of the next unrolled chunk
+    #endif
+        npyv_s64 v0 = npyv_load_s64(ip + vstep * 0);
+        npyv_s64 v1 = npyv_load_s64(ip + vstep * 1);
+        npyv_s64 v2 = npyv_load_s64(ip + vstep * 2);
+        npyv_s64 v3 = npyv_load_s64(ip + vstep * 3);
+
+        npyv_s64 v4 = npyv_load_s64(ip + vstep * 4);
+        npyv_s64 v5 = npyv_load_s64(ip + vstep * 5);
+        npyv_s64 v6 = npyv_load_s64(ip + vstep * 6);
+        npyv_s64 v7 = npyv_load_s64(ip + vstep * 7);
+
+        npyv_s64 r01 = V_INTRIN(v0, v1); // pairwise tree: 8 vectors -> 4
+        npyv_s64 r23 = V_INTRIN(v2, v3);
+        npyv_s64 r45 = V_INTRIN(v4, v5);
+        npyv_s64 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67))); // 4 -> 1, then into acc
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) { // leftover whole vectors, one at a time
+        acc = V_INTRIN(acc, npyv_load_s64(ip));
+    }
+    npyv_lanetype_s64 r = V_REDUCE_INTRIN(acc); // combine the lanes of acc into one scalar
+    // Scalar tail: fewer than vstep elements remain
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_s64 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// Element-wise min over contiguous inputs and output: op1[i] is computed from ip1[i] and ip2[i] for i in [0, len).
+static inline void
+simd_binary_ccc_min_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *ip2,
+                                     npyv_lanetype_s64 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // 128-bit vectors: 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // Other vector widths: 2x unroll, to avoid memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_s64;
+    int elemPerLoop = vectorsPerLoop * elemPerVector; // elements consumed per unrolled iteration
+
+    npy_intp i = 0; // index of the next unprocessed element
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) { // unrolled main loop
+        npyv_s64 v0 = npyv_load_s64(&ip1[i + 0 * elemPerVector]);
+        npyv_s64 v1 = npyv_load_s64(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s64 v2 = npyv_load_s64(&ip1[i + 2 * elemPerVector]); // vectors 2..5 exist only in the 6x unroll
+        npyv_s64 v3 = npyv_load_s64(&ip1[i + 3 * elemPerVector]);
+        npyv_s64 v4 = npyv_load_s64(&ip1[i + 4 * elemPerVector]);
+        npyv_s64 v5 = npyv_load_s64(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_s64 u0 = npyv_load_s64(&ip2[i + 0 * elemPerVector]);
+        npyv_s64 u1 = npyv_load_s64(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s64 u2 = npyv_load_s64(&ip2[i + 2 * elemPerVector]);
+        npyv_s64 u3 = npyv_load_s64(&ip2[i + 3 * elemPerVector]);
+        npyv_s64 u4 = npyv_load_s64(&ip2[i + 4 * elemPerVector]);
+        npyv_s64 u5 = npyv_load_s64(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_s64 m0 = V_INTRIN(v0, u0); // all loads are issued before any compute or store
+        npyv_s64 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s64 m2 = V_INTRIN(v2, u2);
+        npyv_s64 m3 = V_INTRIN(v3, u3);
+        npyv_s64 m4 = V_INTRIN(v4, u4);
+        npyv_s64 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_s64(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_s64(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_s64(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_s64(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_s64(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_s64(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) { // leftover whole vectors, one at a time
+        npyv_s64 v0 = npyv_load_s64(ip1 + i);
+        npyv_s64 u0 = npyv_load_s64(ip2 + i);
+        npyv_s64 m0 = V_INTRIN(v0, u0);
+        npyv_store_s64(op1 + i, m0);
+    }
+    // Scalar tail: fewer than elemPerVector elements remain
+    for (; i < len; ++i) {
+        const npyv_lanetype_s64 in1 = ip1[i];
+        const npyv_lanetype_s64 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// Strided element-wise min (strides counted in elements); skipped on NEON because unrolled scalars are faster than non-contiguous vector load/store on Arm.
+static inline void
+simd_binary_min_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1,
+                           const npyv_lanetype_s64 *ip2, npy_intp sip2,
+                                 npyv_lanetype_s64 *op1, npy_intp sop1,
+                                 npy_intp len)
+{ // NOTE: the guard above is `#if 0 && ...`, so this s64 variant is compiled out
+    const int vstep = npyv_nlanes_s64; // elements handled per vector iteration
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_s64 a, b;
+        if (sip1 == 1) { // unit stride: plain vector load
+            a = npyv_load_s64(ip1);
+        } else { // any other stride: strided load
+            a = npyv_loadn_s64(ip1, sip1);
+        }
+        if (sip2 == 1) { // same choice for the second operand
+            b = npyv_load_s64(ip2);
+        } else {
+            b = npyv_loadn_s64(ip2, sip2);
+        }
+        npyv_s64 r = V_INTRIN(a, b);
+        if (sop1 == 1) { // unit stride: plain vector store
+            npyv_store_s64(op1, r);
+        } else { // any other stride: strided store
+            npyv_storen_s64(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) { // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s64 a = *ip1;
+        const npyv_lanetype_s64 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_maxp_i
+#if NPY_SIMD && (!1 || (0 && 1))
+
+#if 0 && !1
+    #define V_INTRIN npyv_maxpn_s64 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxpn_s64
+#else
+    #define V_INTRIN npyv_maxp_s64
+    #define V_REDUCE_INTRIN npyv_reduce_maxp_s64
+#endif
+
+// Contiguous maxp-reduction: folds the `len` elements at `ip` into op1[0]; the incoming op1[0] is the starting value.
+static inline void
+simd_reduce_c_maxp_s64(const npyv_lanetype_s64 *ip, npyv_lanetype_s64 *op1, npy_intp len)
+{ // NOTE: the enclosing `#if NPY_SIMD && (!1 || (0 && 1))` is constant-false, so this s64 variant is compiled out
+    if (len < 1) {
+        return; // nothing to fold in; op1[0] is left as-is
+    }
+    const int vstep = npyv_nlanes_s64; // elements per vector
+    const int wstep = vstep*8; // elements per 8x-unrolled iteration
+    npyv_s64 acc = npyv_setall_s64(op1[0]); // seed every lane with the current result
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3); // address is the start of the next unrolled chunk
+    #endif
+        npyv_s64 v0 = npyv_load_s64(ip + vstep * 0);
+        npyv_s64 v1 = npyv_load_s64(ip + vstep * 1);
+        npyv_s64 v2 = npyv_load_s64(ip + vstep * 2);
+        npyv_s64 v3 = npyv_load_s64(ip + vstep * 3);
+
+        npyv_s64 v4 = npyv_load_s64(ip + vstep * 4);
+        npyv_s64 v5 = npyv_load_s64(ip + vstep * 5);
+        npyv_s64 v6 = npyv_load_s64(ip + vstep * 6);
+        npyv_s64 v7 = npyv_load_s64(ip + vstep * 7);
+
+        npyv_s64 r01 = V_INTRIN(v0, v1); // pairwise tree: 8 vectors -> 4
+        npyv_s64 r23 = V_INTRIN(v2, v3);
+        npyv_s64 r45 = V_INTRIN(v4, v5);
+        npyv_s64 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67))); // 4 -> 1, then into acc
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) { // leftover whole vectors, one at a time
+        acc = V_INTRIN(acc, npyv_load_s64(ip));
+    }
+    npyv_lanetype_s64 r = V_REDUCE_INTRIN(acc); // combine the lanes of acc into one scalar
+    // Scalar tail: fewer than vstep elements remain
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_s64 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// Element-wise maxp over contiguous inputs and output: op1[i] is computed from ip1[i] and ip2[i] for i in [0, len).
+static inline void
+simd_binary_ccc_maxp_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *ip2,
+                                     npyv_lanetype_s64 *op1, npy_intp len)
+{ // NOTE: the enclosing `#if NPY_SIMD && (!1 || (0 && 1))` is constant-false, so this s64 variant is compiled out
+#if NPY_SIMD_WIDTH == 128
+    // 128-bit vectors: 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // Other vector widths: 2x unroll, to avoid memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_s64;
+    int elemPerLoop = vectorsPerLoop * elemPerVector; // elements consumed per unrolled iteration
+
+    npy_intp i = 0; // index of the next unprocessed element
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) { // unrolled main loop
+        npyv_s64 v0 = npyv_load_s64(&ip1[i + 0 * elemPerVector]);
+        npyv_s64 v1 = npyv_load_s64(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s64 v2 = npyv_load_s64(&ip1[i + 2 * elemPerVector]); // vectors 2..5 exist only in the 6x unroll
+        npyv_s64 v3 = npyv_load_s64(&ip1[i + 3 * elemPerVector]);
+        npyv_s64 v4 = npyv_load_s64(&ip1[i + 4 * elemPerVector]);
+        npyv_s64 v5 = npyv_load_s64(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_s64 u0 = npyv_load_s64(&ip2[i + 0 * elemPerVector]);
+        npyv_s64 u1 = npyv_load_s64(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s64 u2 = npyv_load_s64(&ip2[i + 2 * elemPerVector]);
+        npyv_s64 u3 = npyv_load_s64(&ip2[i + 3 * elemPerVector]);
+        npyv_s64 u4 = npyv_load_s64(&ip2[i + 4 * elemPerVector]);
+        npyv_s64 u5 = npyv_load_s64(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_s64 m0 = V_INTRIN(v0, u0); // all loads are issued before any compute or store
+        npyv_s64 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s64 m2 = V_INTRIN(v2, u2);
+        npyv_s64 m3 = V_INTRIN(v3, u3);
+        npyv_s64 m4 = V_INTRIN(v4, u4);
+        npyv_s64 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_s64(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_s64(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_s64(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_s64(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_s64(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_s64(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) { // leftover whole vectors, one at a time
+        npyv_s64 v0 = npyv_load_s64(ip1 + i);
+        npyv_s64 u0 = npyv_load_s64(ip2 + i);
+        npyv_s64 m0 = V_INTRIN(v0, u0);
+        npyv_store_s64(op1 + i, m0);
+    }
+    // Scalar tail: fewer than elemPerVector elements remain
+    for (; i < len; ++i) {
+        const npyv_lanetype_s64 in1 = ip1[i];
+        const npyv_lanetype_s64 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// Strided element-wise maxp (strides counted in elements); skipped on NEON because unrolled scalars are faster than non-contiguous vector load/store on Arm.
+static inline void
+simd_binary_maxp_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1,
+                           const npyv_lanetype_s64 *ip2, npy_intp sip2,
+                                 npyv_lanetype_s64 *op1, npy_intp sop1,
+                                 npy_intp len)
+{ // NOTE: the guard above is `#if 0 && ...`, so this s64 variant is compiled out
+    const int vstep = npyv_nlanes_s64; // elements handled per vector iteration
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_s64 a, b;
+        if (sip1 == 1) { // unit stride: plain vector load
+            a = npyv_load_s64(ip1);
+        } else { // any other stride: strided load
+            a = npyv_loadn_s64(ip1, sip1);
+        }
+        if (sip2 == 1) { // same choice for the second operand
+            b = npyv_load_s64(ip2);
+        } else {
+            b = npyv_loadn_s64(ip2, sip2);
+        }
+        npyv_s64 r = V_INTRIN(a, b);
+        if (sop1 == 1) { // unit stride: plain vector store
+            npyv_store_s64(op1, r);
+        } else { // any other stride: strided store
+            npyv_storen_s64(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) { // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s64 a = *ip1;
+        const npyv_lanetype_s64 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_minp_i
+#if NPY_SIMD && (!1 || (0 && 1))
+
+#if 0 && !1
+    #define V_INTRIN npyv_minpn_s64 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minpn_s64
+#else
+    #define V_INTRIN npyv_minp_s64
+    #define V_REDUCE_INTRIN npyv_reduce_minp_s64
+#endif
+
+// Contiguous minp-reduction: folds the `len` elements at `ip` into op1[0]; the incoming op1[0] is the starting value.
+static inline void
+simd_reduce_c_minp_s64(const npyv_lanetype_s64 *ip, npyv_lanetype_s64 *op1, npy_intp len)
+{ // NOTE: the enclosing `#if NPY_SIMD && (!1 || (0 && 1))` is constant-false, so this s64 variant is compiled out
+    if (len < 1) {
+        return; // nothing to fold in; op1[0] is left as-is
+    }
+    const int vstep = npyv_nlanes_s64; // elements per vector
+    const int wstep = vstep*8; // elements per 8x-unrolled iteration
+    npyv_s64 acc = npyv_setall_s64(op1[0]); // seed every lane with the current result
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3); // address is the start of the next unrolled chunk
+    #endif
+        npyv_s64 v0 = npyv_load_s64(ip + vstep * 0);
+        npyv_s64 v1 = npyv_load_s64(ip + vstep * 1);
+        npyv_s64 v2 = npyv_load_s64(ip + vstep * 2);
+        npyv_s64 v3 = npyv_load_s64(ip + vstep * 3);
+
+        npyv_s64 v4 = npyv_load_s64(ip + vstep * 4);
+        npyv_s64 v5 = npyv_load_s64(ip + vstep * 5);
+        npyv_s64 v6 = npyv_load_s64(ip + vstep * 6);
+        npyv_s64 v7 = npyv_load_s64(ip + vstep * 7);
+
+        npyv_s64 r01 = V_INTRIN(v0, v1); // pairwise tree: 8 vectors -> 4
+        npyv_s64 r23 = V_INTRIN(v2, v3);
+        npyv_s64 r45 = V_INTRIN(v4, v5);
+        npyv_s64 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67))); // 4 -> 1, then into acc
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) { // leftover whole vectors, one at a time
+        acc = V_INTRIN(acc, npyv_load_s64(ip));
+    }
+    npyv_lanetype_s64 r = V_REDUCE_INTRIN(acc); // combine the lanes of acc into one scalar
+    // Scalar tail: fewer than vstep elements remain
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_s64 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// Element-wise minp over contiguous inputs and output: op1[i] is computed from ip1[i] and ip2[i] for i in [0, len).
+static inline void
+simd_binary_ccc_minp_s64(const npyv_lanetype_s64 *ip1, const npyv_lanetype_s64 *ip2,
+                                     npyv_lanetype_s64 *op1, npy_intp len)
+{ // NOTE: the enclosing `#if NPY_SIMD && (!1 || (0 && 1))` is constant-false, so this s64 variant is compiled out
+#if NPY_SIMD_WIDTH == 128
+    // 128-bit vectors: 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // Other vector widths: 2x unroll, to avoid memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_s64;
+    int elemPerLoop = vectorsPerLoop * elemPerVector; // elements consumed per unrolled iteration
+
+    npy_intp i = 0; // index of the next unprocessed element
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) { // unrolled main loop
+        npyv_s64 v0 = npyv_load_s64(&ip1[i + 0 * elemPerVector]);
+        npyv_s64 v1 = npyv_load_s64(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s64 v2 = npyv_load_s64(&ip1[i + 2 * elemPerVector]); // vectors 2..5 exist only in the 6x unroll
+        npyv_s64 v3 = npyv_load_s64(&ip1[i + 3 * elemPerVector]);
+        npyv_s64 v4 = npyv_load_s64(&ip1[i + 4 * elemPerVector]);
+        npyv_s64 v5 = npyv_load_s64(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_s64 u0 = npyv_load_s64(&ip2[i + 0 * elemPerVector]);
+        npyv_s64 u1 = npyv_load_s64(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s64 u2 = npyv_load_s64(&ip2[i + 2 * elemPerVector]);
+        npyv_s64 u3 = npyv_load_s64(&ip2[i + 3 * elemPerVector]);
+        npyv_s64 u4 = npyv_load_s64(&ip2[i + 4 * elemPerVector]);
+        npyv_s64 u5 = npyv_load_s64(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_s64 m0 = V_INTRIN(v0, u0); // all loads are issued before any compute or store
+        npyv_s64 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_s64 m2 = V_INTRIN(v2, u2);
+        npyv_s64 m3 = V_INTRIN(v3, u3);
+        npyv_s64 m4 = V_INTRIN(v4, u4);
+        npyv_s64 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_s64(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_s64(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_s64(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_s64(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_s64(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_s64(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) { // leftover whole vectors, one at a time
+        npyv_s64 v0 = npyv_load_s64(ip1 + i);
+        npyv_s64 u0 = npyv_load_s64(ip2 + i);
+        npyv_s64 m0 = V_INTRIN(v0, u0);
+        npyv_store_s64(op1 + i, m0);
+    }
+    // Scalar tail: fewer than elemPerVector elements remain
+    for (; i < len; ++i) {
+        const npyv_lanetype_s64 in1 = ip1[i];
+        const npyv_lanetype_s64 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// Strided element-wise minp (strides counted in elements); skipped on NEON because unrolled scalars are faster than non-contiguous vector load/store on Arm.
+static inline void
+simd_binary_minp_s64(const npyv_lanetype_s64 *ip1, npy_intp sip1,
+                           const npyv_lanetype_s64 *ip2, npy_intp sip2,
+                                 npyv_lanetype_s64 *op1, npy_intp sop1,
+                                 npy_intp len)
+{ // NOTE: the guard above is `#if 0 && ...`, so this s64 variant is compiled out
+    const int vstep = npyv_nlanes_s64; // elements handled per vector iteration
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_s64 a, b;
+        if (sip1 == 1) { // unit stride: plain vector load
+            a = npyv_load_s64(ip1);
+        } else { // any other stride: strided load
+            a = npyv_loadn_s64(ip1, sip1);
+        }
+        if (sip2 == 1) { // same choice for the second operand
+            b = npyv_load_s64(ip2);
+        } else {
+            b = npyv_loadn_s64(ip2, sip2);
+        }
+        npyv_s64 r = V_INTRIN(a, b);
+        if (sop1 == 1) { // unit stride: plain vector store
+            npyv_store_s64(op1, r);
+        } else { // any other stride: strided store
+            npyv_storen_s64(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) { // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_s64 a = *ip1;
+        const npyv_lanetype_s64 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+
+#line 106
+#line 110
+#define SCALAR_OP scalar_max_i
+#if NPY_SIMD && (!0 || (0 && 0))
+
+#if 0 && !0
+    #define V_INTRIN npyv_maxn_u64 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxn_u64
+#else
+    #define V_INTRIN npyv_max_u64
+    #define V_REDUCE_INTRIN npyv_reduce_max_u64
+#endif
+
+// Contiguous max-reduction: folds the `len` elements at `ip` into op1[0]; the incoming op1[0] is the starting value.
+static inline void
+simd_reduce_c_max_u64(const npyv_lanetype_u64 *ip, npyv_lanetype_u64 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return; // nothing to fold in; op1[0] is left as-is
+    }
+    const int vstep = npyv_nlanes_u64; // elements per vector
+    const int wstep = vstep*8; // elements per 8x-unrolled iteration
+    npyv_u64 acc = npyv_setall_u64(op1[0]); // seed every lane with the current result
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3); // address is the start of the next unrolled chunk
+    #endif
+        npyv_u64 v0 = npyv_load_u64(ip + vstep * 0);
+        npyv_u64 v1 = npyv_load_u64(ip + vstep * 1);
+        npyv_u64 v2 = npyv_load_u64(ip + vstep * 2);
+        npyv_u64 v3 = npyv_load_u64(ip + vstep * 3);
+
+        npyv_u64 v4 = npyv_load_u64(ip + vstep * 4);
+        npyv_u64 v5 = npyv_load_u64(ip + vstep * 5);
+        npyv_u64 v6 = npyv_load_u64(ip + vstep * 6);
+        npyv_u64 v7 = npyv_load_u64(ip + vstep * 7);
+
+        npyv_u64 r01 = V_INTRIN(v0, v1); // pairwise tree: 8 vectors -> 4
+        npyv_u64 r23 = V_INTRIN(v2, v3);
+        npyv_u64 r45 = V_INTRIN(v4, v5);
+        npyv_u64 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67))); // 4 -> 1, then into acc
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) { // leftover whole vectors, one at a time
+        acc = V_INTRIN(acc, npyv_load_u64(ip));
+    }
+    npyv_lanetype_u64 r = V_REDUCE_INTRIN(acc); // combine the lanes of acc into one scalar
+    // Scalar tail: fewer than vstep elements remain
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_u64 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// Element-wise max over contiguous inputs and output: op1[i] is computed from ip1[i] and ip2[i] for i in [0, len).
+static inline void
+simd_binary_ccc_max_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *ip2,
+                                     npyv_lanetype_u64 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // 128-bit vectors: 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // Other vector widths: 2x unroll, to avoid memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_u64;
+    int elemPerLoop = vectorsPerLoop * elemPerVector; // elements consumed per unrolled iteration
+
+    npy_intp i = 0; // index of the next unprocessed element
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) { // unrolled main loop
+        npyv_u64 v0 = npyv_load_u64(&ip1[i + 0 * elemPerVector]);
+        npyv_u64 v1 = npyv_load_u64(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u64 v2 = npyv_load_u64(&ip1[i + 2 * elemPerVector]); // vectors 2..5 exist only in the 6x unroll
+        npyv_u64 v3 = npyv_load_u64(&ip1[i + 3 * elemPerVector]);
+        npyv_u64 v4 = npyv_load_u64(&ip1[i + 4 * elemPerVector]);
+        npyv_u64 v5 = npyv_load_u64(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_u64 u0 = npyv_load_u64(&ip2[i + 0 * elemPerVector]);
+        npyv_u64 u1 = npyv_load_u64(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u64 u2 = npyv_load_u64(&ip2[i + 2 * elemPerVector]);
+        npyv_u64 u3 = npyv_load_u64(&ip2[i + 3 * elemPerVector]);
+        npyv_u64 u4 = npyv_load_u64(&ip2[i + 4 * elemPerVector]);
+        npyv_u64 u5 = npyv_load_u64(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_u64 m0 = V_INTRIN(v0, u0); // all loads are issued before any compute or store
+        npyv_u64 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u64 m2 = V_INTRIN(v2, u2);
+        npyv_u64 m3 = V_INTRIN(v3, u3);
+        npyv_u64 m4 = V_INTRIN(v4, u4);
+        npyv_u64 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_u64(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_u64(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_u64(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_u64(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_u64(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_u64(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) { // leftover whole vectors, one at a time
+        npyv_u64 v0 = npyv_load_u64(ip1 + i);
+        npyv_u64 u0 = npyv_load_u64(ip2 + i);
+        npyv_u64 m0 = V_INTRIN(v0, u0);
+        npyv_store_u64(op1 + i, m0);
+    }
+    // Scalar tail: fewer than elemPerVector elements remain
+    for (; i < len; ++i) {
+        const npyv_lanetype_u64 in1 = ip1[i];
+        const npyv_lanetype_u64 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// Strided element-wise max (strides counted in elements); skipped on NEON because unrolled scalars are faster than non-contiguous vector load/store on Arm.
+static inline void
+simd_binary_max_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1,
+                           const npyv_lanetype_u64 *ip2, npy_intp sip2,
+                                 npyv_lanetype_u64 *op1, npy_intp sop1,
+                                 npy_intp len)
+{ // NOTE: the guard above is `#if 0 && ...`, so this u64 variant is compiled out
+    const int vstep = npyv_nlanes_u64; // elements handled per vector iteration
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_u64 a, b;
+        if (sip1 == 1) { // unit stride: plain vector load
+            a = npyv_load_u64(ip1);
+        } else { // any other stride: strided load
+            a = npyv_loadn_u64(ip1, sip1);
+        }
+        if (sip2 == 1) { // same choice for the second operand
+            b = npyv_load_u64(ip2);
+        } else {
+            b = npyv_loadn_u64(ip2, sip2);
+        }
+        npyv_u64 r = V_INTRIN(a, b);
+        if (sop1 == 1) { // unit stride: plain vector store
+            npyv_store_u64(op1, r);
+        } else { // any other stride: strided store
+            npyv_storen_u64(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) { // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u64 a = *ip1;
+        const npyv_lanetype_u64 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_min_i
+#if NPY_SIMD && (!0 || (0 && 0))
+
+#if 0 && !0
+    #define V_INTRIN npyv_minn_u64 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minn_u64
+#else
+    #define V_INTRIN npyv_min_u64
+    #define V_REDUCE_INTRIN npyv_reduce_min_u64
+#endif
+
+// Contiguous min-reduction: folds the `len` elements at `ip` into op1[0]; the incoming op1[0] is the starting value.
+static inline void
+simd_reduce_c_min_u64(const npyv_lanetype_u64 *ip, npyv_lanetype_u64 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return; // nothing to fold in; op1[0] is left as-is
+    }
+    const int vstep = npyv_nlanes_u64; // elements per vector
+    const int wstep = vstep*8; // elements per 8x-unrolled iteration
+    npyv_u64 acc = npyv_setall_u64(op1[0]); // seed every lane with the current result
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3); // address is the start of the next unrolled chunk
+    #endif
+        npyv_u64 v0 = npyv_load_u64(ip + vstep * 0);
+        npyv_u64 v1 = npyv_load_u64(ip + vstep * 1);
+        npyv_u64 v2 = npyv_load_u64(ip + vstep * 2);
+        npyv_u64 v3 = npyv_load_u64(ip + vstep * 3);
+
+        npyv_u64 v4 = npyv_load_u64(ip + vstep * 4);
+        npyv_u64 v5 = npyv_load_u64(ip + vstep * 5);
+        npyv_u64 v6 = npyv_load_u64(ip + vstep * 6);
+        npyv_u64 v7 = npyv_load_u64(ip + vstep * 7);
+
+        npyv_u64 r01 = V_INTRIN(v0, v1); // pairwise tree: 8 vectors -> 4
+        npyv_u64 r23 = V_INTRIN(v2, v3);
+        npyv_u64 r45 = V_INTRIN(v4, v5);
+        npyv_u64 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67))); // 4 -> 1, then into acc
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) { // leftover whole vectors, one at a time
+        acc = V_INTRIN(acc, npyv_load_u64(ip));
+    }
+    npyv_lanetype_u64 r = V_REDUCE_INTRIN(acc); // combine the lanes of acc into one scalar
+    // Scalar tail: fewer than vstep elements remain
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_u64 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// Element-wise min over contiguous inputs and output: op1[i] is computed from ip1[i] and ip2[i] for i in [0, len).
+static inline void
+simd_binary_ccc_min_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *ip2,
+                                     npyv_lanetype_u64 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // 128-bit vectors: 6x unroll was chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // Other vector widths: 2x unroll, to avoid memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_u64;
+    int elemPerLoop = vectorsPerLoop * elemPerVector; // elements consumed per unrolled iteration
+
+    npy_intp i = 0; // index of the next unprocessed element
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) { // unrolled main loop
+        npyv_u64 v0 = npyv_load_u64(&ip1[i + 0 * elemPerVector]);
+        npyv_u64 v1 = npyv_load_u64(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u64 v2 = npyv_load_u64(&ip1[i + 2 * elemPerVector]); // vectors 2..5 exist only in the 6x unroll
+        npyv_u64 v3 = npyv_load_u64(&ip1[i + 3 * elemPerVector]);
+        npyv_u64 v4 = npyv_load_u64(&ip1[i + 4 * elemPerVector]);
+        npyv_u64 v5 = npyv_load_u64(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_u64 u0 = npyv_load_u64(&ip2[i + 0 * elemPerVector]);
+        npyv_u64 u1 = npyv_load_u64(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u64 u2 = npyv_load_u64(&ip2[i + 2 * elemPerVector]);
+        npyv_u64 u3 = npyv_load_u64(&ip2[i + 3 * elemPerVector]);
+        npyv_u64 u4 = npyv_load_u64(&ip2[i + 4 * elemPerVector]);
+        npyv_u64 u5 = npyv_load_u64(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_u64 m0 = V_INTRIN(v0, u0); // all loads are issued before any compute or store
+        npyv_u64 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u64 m2 = V_INTRIN(v2, u2);
+        npyv_u64 m3 = V_INTRIN(v3, u3);
+        npyv_u64 m4 = V_INTRIN(v4, u4);
+        npyv_u64 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_u64(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_u64(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_u64(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_u64(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_u64(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_u64(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) { // leftover whole vectors, one at a time
+        npyv_u64 v0 = npyv_load_u64(ip1 + i);
+        npyv_u64 u0 = npyv_load_u64(ip2 + i);
+        npyv_u64 m0 = V_INTRIN(v0, u0);
+        npyv_store_u64(op1 + i, m0);
+    }
+    // Scalar tail: fewer than elemPerVector elements remain
+    for (; i < len; ++i) {
+        const npyv_lanetype_u64 in1 = ip1[i];
+        const npyv_lanetype_u64 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// Strided kernel: *op1 = op(*ip1, *ip2) for `len` elements. Disabled for this type by the `#if 0` guard above; the NEON exclusion exists because unrolled scalars are faster than non-contiguous vector load/store on Arm.
+static inline void
+simd_binary_min_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1,
+                           const npyv_lanetype_u64 *ip2, npy_intp sip2,
+                                 npyv_lanetype_u64 *op1, npy_intp sop1,
+                                 npy_intp len) // sip1/sip2/sop1 are strides counted in elements
+{
+    const int vstep = npyv_nlanes_u64; // elements per vector
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_u64 a, b;
+        if (sip1 == 1) { // unit stride: plain contiguous load
+            a = npyv_load_u64(ip1);
+        } else {
+            a = npyv_loadn_u64(ip1, sip1); // load with element stride sip1
+        }
+        if (sip2 == 1) {
+            b = npyv_load_u64(ip2);
+        } else {
+            b = npyv_loadn_u64(ip2, sip2);
+        }
+        npyv_u64 r = V_INTRIN(a, b);
+        if (sop1 == 1) { // same unit-stride shortcut for the store
+            npyv_store_u64(op1, r);
+        } else {
+            npyv_storen_u64(op1, sop1, r); // store with element stride sop1
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) { // scalar tail, one element per step
+        const npyv_lanetype_u64 a = *ip1;
+        const npyv_lanetype_u64 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_maxp_i // scalar fallback used by the loop tails in this section
+#if NPY_SIMD && (!1 || (0 && 1)) // expanded simd_chk && (!fp_only || (is_fp && fp_only)); folds to 0, so this u64 maxp section is never compiled
+
+#if 0 && !1 // always false here: the #else pair is the one selected
+    #define V_INTRIN npyv_maxpn_u64 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxpn_u64
+#else
+    #define V_INTRIN npyv_maxp_u64 // vector op used by the kernels below
+    #define V_REDUCE_INTRIN npyv_reduce_maxp_u64 // vector-to-scalar reduction used by the reduce kernel
+#endif
+
+// Reduction over contiguous input: folds ip[0..len) into op1[0] using V_INTRIN (vectors) and SCALAR_OP (scalar tail).
+static inline void
+simd_reduce_c_maxp_u64(const npyv_lanetype_u64 *ip, npyv_lanetype_u64 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return; // empty input: op1[0] is left as it was
+    }
+    const int vstep = npyv_nlanes_u64; // elements per vector
+    const int wstep = vstep*8; // elements per unrolled iteration (8 vectors)
+    npyv_u64 acc = npyv_setall_u64(op1[0]); // op1[0] is an input too: it seeds every accumulator lane
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3); // prefetch hint aimed one chunk ahead (see NPY_PREFETCH for the 0, 3 arguments)
+    #endif
+        npyv_u64 v0 = npyv_load_u64(ip + vstep * 0);
+        npyv_u64 v1 = npyv_load_u64(ip + vstep * 1);
+        npyv_u64 v2 = npyv_load_u64(ip + vstep * 2);
+        npyv_u64 v3 = npyv_load_u64(ip + vstep * 3);
+
+        npyv_u64 v4 = npyv_load_u64(ip + vstep * 4);
+        npyv_u64 v5 = npyv_load_u64(ip + vstep * 5);
+        npyv_u64 v6 = npyv_load_u64(ip + vstep * 6);
+        npyv_u64 v7 = npyv_load_u64(ip + vstep * 7);
+
+        npyv_u64 r01 = V_INTRIN(v0, v1); // combine the 8 loads pairwise so only the last op depends on acc
+        npyv_u64 r23 = V_INTRIN(v2, v3);
+        npyv_u64 r45 = V_INTRIN(v4, v5);
+        npyv_u64 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) { // leftover whole vectors, one per iteration
+        acc = V_INTRIN(acc, npyv_load_u64(ip));
+    }
+    npyv_lanetype_u64 r = V_REDUCE_INTRIN(acc); // reduce the vector accumulator to a single scalar
+    // Scalar tail: fold in the fewer-than-vstep elements that remain
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_u64 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r; // single write-back of the final result
+}
+
+// Element-wise kernel for contiguous inputs and output: op1[i] = op(ip1[i], ip2[i]) for i in [0, len).
+static inline void
+simd_binary_ccc_maxp_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *ip2,
+                                     npyv_lanetype_u64 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // 6x unroll for 128-bit vectors; chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // Other vector widths: 2x unroll, to avoid a memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_u64; // elements per vector
+    int elemPerLoop = vectorsPerLoop * elemPerVector; // elements per unrolled iteration
+
+    npy_intp i = 0; // next unprocessed element; carried across all three loops
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) { // unrolled: all loads, then all ops, then all stores
+        npyv_u64 v0 = npyv_load_u64(&ip1[i + 0 * elemPerVector]);
+        npyv_u64 v1 = npyv_load_u64(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128 // vectors 2..5 exist only in the 6x configuration; keep in sync with vectorsPerLoop
+        npyv_u64 v2 = npyv_load_u64(&ip1[i + 2 * elemPerVector]);
+        npyv_u64 v3 = npyv_load_u64(&ip1[i + 3 * elemPerVector]);
+        npyv_u64 v4 = npyv_load_u64(&ip1[i + 4 * elemPerVector]);
+        npyv_u64 v5 = npyv_load_u64(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_u64 u0 = npyv_load_u64(&ip2[i + 0 * elemPerVector]);
+        npyv_u64 u1 = npyv_load_u64(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u64 u2 = npyv_load_u64(&ip2[i + 2 * elemPerVector]);
+        npyv_u64 u3 = npyv_load_u64(&ip2[i + 3 * elemPerVector]);
+        npyv_u64 u4 = npyv_load_u64(&ip2[i + 4 * elemPerVector]);
+        npyv_u64 u5 = npyv_load_u64(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_u64 m0 = V_INTRIN(v0, u0);
+        npyv_u64 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u64 m2 = V_INTRIN(v2, u2);
+        npyv_u64 m3 = V_INTRIN(v3, u3);
+        npyv_u64 m4 = V_INTRIN(v4, u4);
+        npyv_u64 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_u64(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_u64(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_u64(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_u64(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_u64(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_u64(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) { // leftover whole vectors, one per iteration
+        npyv_u64 v0 = npyv_load_u64(ip1 + i);
+        npyv_u64 u0 = npyv_load_u64(ip2 + i);
+        npyv_u64 m0 = V_INTRIN(v0, u0);
+        npyv_store_u64(op1 + i, m0);
+    }
+    // Scalar tail: handle the fewer-than-elemPerVector elements that remain
+    for (; i < len; ++i) {
+        const npyv_lanetype_u64 in1 = ip1[i];
+        const npyv_lanetype_u64 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// Strided kernel: *op1 = op(*ip1, *ip2) for `len` elements. Disabled for this type by the `#if 0` guard above; the NEON exclusion exists because unrolled scalars are faster than non-contiguous vector load/store on Arm.
+static inline void
+simd_binary_maxp_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1,
+                           const npyv_lanetype_u64 *ip2, npy_intp sip2,
+                                 npyv_lanetype_u64 *op1, npy_intp sop1,
+                                 npy_intp len) // sip1/sip2/sop1 are strides counted in elements
+{
+    const int vstep = npyv_nlanes_u64; // elements per vector
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_u64 a, b;
+        if (sip1 == 1) { // unit stride: plain contiguous load
+            a = npyv_load_u64(ip1);
+        } else {
+            a = npyv_loadn_u64(ip1, sip1); // load with element stride sip1
+        }
+        if (sip2 == 1) {
+            b = npyv_load_u64(ip2);
+        } else {
+            b = npyv_loadn_u64(ip2, sip2);
+        }
+        npyv_u64 r = V_INTRIN(a, b);
+        if (sop1 == 1) { // same unit-stride shortcut for the store
+            npyv_store_u64(op1, r);
+        } else {
+            npyv_storen_u64(op1, sop1, r); // store with element stride sop1
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) { // scalar tail, one element per step
+        const npyv_lanetype_u64 a = *ip1;
+        const npyv_lanetype_u64 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_minp_i // scalar fallback used by the loop tails in this section
+#if NPY_SIMD && (!1 || (0 && 1)) // expanded simd_chk && (!fp_only || (is_fp && fp_only)); folds to 0, so this u64 minp section is never compiled
+
+#if 0 && !1 // always false here: the #else pair is the one selected
+    #define V_INTRIN npyv_minpn_u64 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minpn_u64
+#else
+    #define V_INTRIN npyv_minp_u64 // vector op used by the kernels below
+    #define V_REDUCE_INTRIN npyv_reduce_minp_u64 // vector-to-scalar reduction used by the reduce kernel
+#endif
+
+// Reduction over contiguous input: folds ip[0..len) into op1[0] using V_INTRIN (vectors) and SCALAR_OP (scalar tail).
+static inline void
+simd_reduce_c_minp_u64(const npyv_lanetype_u64 *ip, npyv_lanetype_u64 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return; // empty input: op1[0] is left as it was
+    }
+    const int vstep = npyv_nlanes_u64; // elements per vector
+    const int wstep = vstep*8; // elements per unrolled iteration (8 vectors)
+    npyv_u64 acc = npyv_setall_u64(op1[0]); // op1[0] is an input too: it seeds every accumulator lane
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3); // prefetch hint aimed one chunk ahead (see NPY_PREFETCH for the 0, 3 arguments)
+    #endif
+        npyv_u64 v0 = npyv_load_u64(ip + vstep * 0);
+        npyv_u64 v1 = npyv_load_u64(ip + vstep * 1);
+        npyv_u64 v2 = npyv_load_u64(ip + vstep * 2);
+        npyv_u64 v3 = npyv_load_u64(ip + vstep * 3);
+
+        npyv_u64 v4 = npyv_load_u64(ip + vstep * 4);
+        npyv_u64 v5 = npyv_load_u64(ip + vstep * 5);
+        npyv_u64 v6 = npyv_load_u64(ip + vstep * 6);
+        npyv_u64 v7 = npyv_load_u64(ip + vstep * 7);
+
+        npyv_u64 r01 = V_INTRIN(v0, v1); // combine the 8 loads pairwise so only the last op depends on acc
+        npyv_u64 r23 = V_INTRIN(v2, v3);
+        npyv_u64 r45 = V_INTRIN(v4, v5);
+        npyv_u64 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) { // leftover whole vectors, one per iteration
+        acc = V_INTRIN(acc, npyv_load_u64(ip));
+    }
+    npyv_lanetype_u64 r = V_REDUCE_INTRIN(acc); // reduce the vector accumulator to a single scalar
+    // Scalar tail: fold in the fewer-than-vstep elements that remain
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_u64 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r; // single write-back of the final result
+}
+
+// Element-wise kernel for contiguous inputs and output: op1[i] = op(ip1[i], ip2[i]) for i in [0, len).
+static inline void
+simd_binary_ccc_minp_u64(const npyv_lanetype_u64 *ip1, const npyv_lanetype_u64 *ip2,
+                                     npyv_lanetype_u64 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // 6x unroll for 128-bit vectors; chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // Other vector widths: 2x unroll, to avoid a memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_u64; // elements per vector
+    int elemPerLoop = vectorsPerLoop * elemPerVector; // elements per unrolled iteration
+
+    npy_intp i = 0; // next unprocessed element; carried across all three loops
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) { // unrolled: all loads, then all ops, then all stores
+        npyv_u64 v0 = npyv_load_u64(&ip1[i + 0 * elemPerVector]);
+        npyv_u64 v1 = npyv_load_u64(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128 // vectors 2..5 exist only in the 6x configuration; keep in sync with vectorsPerLoop
+        npyv_u64 v2 = npyv_load_u64(&ip1[i + 2 * elemPerVector]);
+        npyv_u64 v3 = npyv_load_u64(&ip1[i + 3 * elemPerVector]);
+        npyv_u64 v4 = npyv_load_u64(&ip1[i + 4 * elemPerVector]);
+        npyv_u64 v5 = npyv_load_u64(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_u64 u0 = npyv_load_u64(&ip2[i + 0 * elemPerVector]);
+        npyv_u64 u1 = npyv_load_u64(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u64 u2 = npyv_load_u64(&ip2[i + 2 * elemPerVector]);
+        npyv_u64 u3 = npyv_load_u64(&ip2[i + 3 * elemPerVector]);
+        npyv_u64 u4 = npyv_load_u64(&ip2[i + 4 * elemPerVector]);
+        npyv_u64 u5 = npyv_load_u64(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_u64 m0 = V_INTRIN(v0, u0);
+        npyv_u64 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_u64 m2 = V_INTRIN(v2, u2);
+        npyv_u64 m3 = V_INTRIN(v3, u3);
+        npyv_u64 m4 = V_INTRIN(v4, u4);
+        npyv_u64 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_u64(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_u64(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_u64(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_u64(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_u64(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_u64(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) { // leftover whole vectors, one per iteration
+        npyv_u64 v0 = npyv_load_u64(ip1 + i);
+        npyv_u64 u0 = npyv_load_u64(ip2 + i);
+        npyv_u64 m0 = V_INTRIN(v0, u0);
+        npyv_store_u64(op1 + i, m0);
+    }
+    // Scalar tail: handle the fewer-than-elemPerVector elements that remain
+    for (; i < len; ++i) {
+        const npyv_lanetype_u64 in1 = ip1[i];
+        const npyv_lanetype_u64 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 0 && !defined(NPY_HAVE_NEON)
+// Strided kernel: *op1 = op(*ip1, *ip2) for `len` elements. Disabled for this type by the `#if 0` guard above; the NEON exclusion exists because unrolled scalars are faster than non-contiguous vector load/store on Arm.
+static inline void
+simd_binary_minp_u64(const npyv_lanetype_u64 *ip1, npy_intp sip1,
+                           const npyv_lanetype_u64 *ip2, npy_intp sip2,
+                                 npyv_lanetype_u64 *op1, npy_intp sop1,
+                                 npy_intp len) // sip1/sip2/sop1 are strides counted in elements
+{
+    const int vstep = npyv_nlanes_u64; // elements per vector
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_u64 a, b;
+        if (sip1 == 1) { // unit stride: plain contiguous load
+            a = npyv_load_u64(ip1);
+        } else {
+            a = npyv_loadn_u64(ip1, sip1); // load with element stride sip1
+        }
+        if (sip2 == 1) {
+            b = npyv_load_u64(ip2);
+        } else {
+            b = npyv_loadn_u64(ip2, sip2);
+        }
+        npyv_u64 r = V_INTRIN(a, b);
+        if (sop1 == 1) { // same unit-stride shortcut for the store
+            npyv_store_u64(op1, r);
+        } else {
+            npyv_storen_u64(op1, sop1, r); // store with element stride sop1
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) { // scalar tail, one element per step
+        const npyv_lanetype_u64 a = *ip1;
+        const npyv_lanetype_u64 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+
+#line 106
+#line 110
+#define SCALAR_OP scalar_max_f // scalar fallback used by the loop tails in this section
+#if NPY_SIMD_F32 && (!0 || (1 && 0)) // expanded simd_chk && (!fp_only || (is_fp && fp_only)); literal part is 1, so only NPY_SIMD_F32 decides
+
+#if 1 && !0 // always true here: the NaN-propagating pair is selected
+    #define V_INTRIN npyv_maxn_f32 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxn_f32 // vector-to-scalar reduction used by the reduce kernel
+#else
+    #define V_INTRIN npyv_max_f32
+    #define V_REDUCE_INTRIN npyv_reduce_max_f32
+#endif
+
+// Reduction over contiguous input: folds ip[0..len) into op1[0] using V_INTRIN (vectors) and SCALAR_OP (scalar tail).
+static inline void
+simd_reduce_c_max_f32(const npyv_lanetype_f32 *ip, npyv_lanetype_f32 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return; // empty input: op1[0] is left as it was
+    }
+    const int vstep = npyv_nlanes_f32; // elements per vector
+    const int wstep = vstep*8; // elements per unrolled iteration (8 vectors)
+    npyv_f32 acc = npyv_setall_f32(op1[0]); // op1[0] is an input too: it seeds every accumulator lane
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3); // prefetch hint aimed one chunk ahead (see NPY_PREFETCH for the 0, 3 arguments)
+    #endif
+        npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+        npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+        npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+        npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+
+        npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+        npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+        npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+        npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+
+        npyv_f32 r01 = V_INTRIN(v0, v1); // combine the 8 loads pairwise so only the last op depends on acc
+        npyv_f32 r23 = V_INTRIN(v2, v3);
+        npyv_f32 r45 = V_INTRIN(v4, v5);
+        npyv_f32 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) { // leftover whole vectors, one per iteration
+        acc = V_INTRIN(acc, npyv_load_f32(ip));
+    }
+    npyv_lanetype_f32 r = V_REDUCE_INTRIN(acc); // reduce the vector accumulator to a single scalar
+    // Scalar tail: fold in the fewer-than-vstep elements that remain
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_f32 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r; // single write-back of the final result
+}
+
+// Element-wise kernel for contiguous inputs and output: op1[i] = op(ip1[i], ip2[i]) for i in [0, len).
+static inline void
+simd_binary_ccc_max_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *ip2,
+                                     npyv_lanetype_f32 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // 6x unroll for 128-bit vectors; chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // Other vector widths: 2x unroll, to avoid a memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_f32; // elements per vector
+    int elemPerLoop = vectorsPerLoop * elemPerVector; // elements per unrolled iteration
+
+    npy_intp i = 0; // next unprocessed element; carried across all three loops
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) { // unrolled: all loads, then all ops, then all stores
+        npyv_f32 v0 = npyv_load_f32(&ip1[i + 0 * elemPerVector]);
+        npyv_f32 v1 = npyv_load_f32(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128 // vectors 2..5 exist only in the 6x configuration; keep in sync with vectorsPerLoop
+        npyv_f32 v2 = npyv_load_f32(&ip1[i + 2 * elemPerVector]);
+        npyv_f32 v3 = npyv_load_f32(&ip1[i + 3 * elemPerVector]);
+        npyv_f32 v4 = npyv_load_f32(&ip1[i + 4 * elemPerVector]);
+        npyv_f32 v5 = npyv_load_f32(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_f32 u0 = npyv_load_f32(&ip2[i + 0 * elemPerVector]);
+        npyv_f32 u1 = npyv_load_f32(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_f32 u2 = npyv_load_f32(&ip2[i + 2 * elemPerVector]);
+        npyv_f32 u3 = npyv_load_f32(&ip2[i + 3 * elemPerVector]);
+        npyv_f32 u4 = npyv_load_f32(&ip2[i + 4 * elemPerVector]);
+        npyv_f32 u5 = npyv_load_f32(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_f32 m0 = V_INTRIN(v0, u0);
+        npyv_f32 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_f32 m2 = V_INTRIN(v2, u2);
+        npyv_f32 m3 = V_INTRIN(v3, u3);
+        npyv_f32 m4 = V_INTRIN(v4, u4);
+        npyv_f32 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_f32(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_f32(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_f32(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_f32(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_f32(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_f32(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) { // leftover whole vectors, one per iteration
+        npyv_f32 v0 = npyv_load_f32(ip1 + i);
+        npyv_f32 u0 = npyv_load_f32(ip2 + i);
+        npyv_f32 m0 = V_INTRIN(v0, u0);
+        npyv_store_f32(op1 + i, m0);
+    }
+    // Scalar tail: handle the fewer-than-elemPerVector elements that remain
+    for (; i < len; ++i) {
+        const npyv_lanetype_f32 in1 = ip1[i];
+        const npyv_lanetype_f32 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 1 && !defined(NPY_HAVE_NEON)
+// Strided kernel: *op1 = op(*ip1, *ip2) for `len` elements. Excluded when NPY_HAVE_NEON is defined, because unrolled scalars are faster than non-contiguous vector load/store on Arm.
+static inline void
+simd_binary_max_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1,
+                           const npyv_lanetype_f32 *ip2, npy_intp sip2,
+                                 npyv_lanetype_f32 *op1, npy_intp sop1,
+                                 npy_intp len) // sip1/sip2/sop1 are strides counted in elements
+{
+    const int vstep = npyv_nlanes_f32; // elements per vector
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_f32 a, b;
+        if (sip1 == 1) { // unit stride: plain contiguous load
+            a = npyv_load_f32(ip1);
+        } else {
+            a = npyv_loadn_f32(ip1, sip1); // load with element stride sip1
+        }
+        if (sip2 == 1) {
+            b = npyv_load_f32(ip2);
+        } else {
+            b = npyv_loadn_f32(ip2, sip2);
+        }
+        npyv_f32 r = V_INTRIN(a, b);
+        if (sop1 == 1) { // same unit-stride shortcut for the store
+            npyv_store_f32(op1, r);
+        } else {
+            npyv_storen_f32(op1, sop1, r); // store with element stride sop1
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) { // scalar tail, one element per step
+        const npyv_lanetype_f32 a = *ip1;
+        const npyv_lanetype_f32 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_min_f // scalar fallback used by the loop tails in this section
+#if NPY_SIMD_F32 && (!0 || (1 && 0)) // expanded simd_chk && (!fp_only || (is_fp && fp_only)); literal part is 1, so only NPY_SIMD_F32 decides
+
+#if 1 && !0 // always true here: the NaN-propagating pair is selected
+    #define V_INTRIN npyv_minn_f32 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minn_f32 // vector-to-scalar reduction used by the reduce kernel
+#else
+    #define V_INTRIN npyv_min_f32
+    #define V_REDUCE_INTRIN npyv_reduce_min_f32
+#endif
+
+// Reduction over contiguous input: folds ip[0..len) into op1[0] using V_INTRIN (vectors) and SCALAR_OP (scalar tail).
+static inline void
+simd_reduce_c_min_f32(const npyv_lanetype_f32 *ip, npyv_lanetype_f32 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return; // empty input: op1[0] is left as it was
+    }
+    const int vstep = npyv_nlanes_f32; // elements per vector
+    const int wstep = vstep*8; // elements per unrolled iteration (8 vectors)
+    npyv_f32 acc = npyv_setall_f32(op1[0]); // op1[0] is an input too: it seeds every accumulator lane
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3); // prefetch hint aimed one chunk ahead (see NPY_PREFETCH for the 0, 3 arguments)
+    #endif
+        npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+        npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+        npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+        npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+
+        npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+        npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+        npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+        npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+
+        npyv_f32 r01 = V_INTRIN(v0, v1); // combine the 8 loads pairwise so only the last op depends on acc
+        npyv_f32 r23 = V_INTRIN(v2, v3);
+        npyv_f32 r45 = V_INTRIN(v4, v5);
+        npyv_f32 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) { // leftover whole vectors, one per iteration
+        acc = V_INTRIN(acc, npyv_load_f32(ip));
+    }
+    npyv_lanetype_f32 r = V_REDUCE_INTRIN(acc); // reduce the vector accumulator to a single scalar
+    // Scalar tail: fold in the fewer-than-vstep elements that remain
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_f32 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r; // single write-back of the final result
+}
+
+// Element-wise kernel for contiguous inputs and output: op1[i] = op(ip1[i], ip2[i]) for i in [0, len).
+static inline void
+simd_binary_ccc_min_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *ip2,
+                                     npyv_lanetype_f32 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // 6x unroll for 128-bit vectors; chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // Other vector widths: 2x unroll, to avoid a memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_f32; // elements per vector
+    int elemPerLoop = vectorsPerLoop * elemPerVector; // elements per unrolled iteration
+
+    npy_intp i = 0; // next unprocessed element; carried across all three loops
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) { // unrolled: all loads, then all ops, then all stores
+        npyv_f32 v0 = npyv_load_f32(&ip1[i + 0 * elemPerVector]);
+        npyv_f32 v1 = npyv_load_f32(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128 // vectors 2..5 exist only in the 6x configuration; keep in sync with vectorsPerLoop
+        npyv_f32 v2 = npyv_load_f32(&ip1[i + 2 * elemPerVector]);
+        npyv_f32 v3 = npyv_load_f32(&ip1[i + 3 * elemPerVector]);
+        npyv_f32 v4 = npyv_load_f32(&ip1[i + 4 * elemPerVector]);
+        npyv_f32 v5 = npyv_load_f32(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_f32 u0 = npyv_load_f32(&ip2[i + 0 * elemPerVector]);
+        npyv_f32 u1 = npyv_load_f32(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_f32 u2 = npyv_load_f32(&ip2[i + 2 * elemPerVector]);
+        npyv_f32 u3 = npyv_load_f32(&ip2[i + 3 * elemPerVector]);
+        npyv_f32 u4 = npyv_load_f32(&ip2[i + 4 * elemPerVector]);
+        npyv_f32 u5 = npyv_load_f32(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_f32 m0 = V_INTRIN(v0, u0);
+        npyv_f32 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_f32 m2 = V_INTRIN(v2, u2);
+        npyv_f32 m3 = V_INTRIN(v3, u3);
+        npyv_f32 m4 = V_INTRIN(v4, u4);
+        npyv_f32 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_f32(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_f32(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_f32(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_f32(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_f32(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_f32(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) { // leftover whole vectors, one per iteration
+        npyv_f32 v0 = npyv_load_f32(ip1 + i);
+        npyv_f32 u0 = npyv_load_f32(ip2 + i);
+        npyv_f32 m0 = V_INTRIN(v0, u0);
+        npyv_store_f32(op1 + i, m0);
+    }
+    // Scalar tail: handle the fewer-than-elemPerVector elements that remain
+    for (; i < len; ++i) {
+        const npyv_lanetype_f32 in1 = ip1[i];
+        const npyv_lanetype_f32 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 1 && !defined(NPY_HAVE_NEON)
+// Strided kernel: *op1 = op(*ip1, *ip2) for `len` elements. Excluded when NPY_HAVE_NEON is defined, because unrolled scalars are faster than non-contiguous vector load/store on Arm.
+static inline void
+simd_binary_min_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1,
+                           const npyv_lanetype_f32 *ip2, npy_intp sip2,
+                                 npyv_lanetype_f32 *op1, npy_intp sop1,
+                                 npy_intp len) // sip1/sip2/sop1 are strides counted in elements
+{
+    const int vstep = npyv_nlanes_f32; // elements per vector
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_f32 a, b;
+        if (sip1 == 1) { // unit stride: plain contiguous load
+            a = npyv_load_f32(ip1);
+        } else {
+            a = npyv_loadn_f32(ip1, sip1); // load with element stride sip1
+        }
+        if (sip2 == 1) {
+            b = npyv_load_f32(ip2);
+        } else {
+            b = npyv_loadn_f32(ip2, sip2);
+        }
+        npyv_f32 r = V_INTRIN(a, b);
+        if (sop1 == 1) { // same unit-stride shortcut for the store
+            npyv_store_f32(op1, r);
+        } else {
+            npyv_storen_f32(op1, sop1, r); // store with element stride sop1
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) { // scalar tail, one element per step
+        const npyv_lanetype_f32 a = *ip1;
+        const npyv_lanetype_f32 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_maxp_f // scalar fallback used by the loop tails in this section
+#if NPY_SIMD_F32 && (!1 || (1 && 1)) // expanded simd_chk && (!fp_only || (is_fp && fp_only)); literal part is 1, so only NPY_SIMD_F32 decides
+
+#if 1 && !1 // always false here: the #else pair is the one selected
+    #define V_INTRIN npyv_maxpn_f32 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxpn_f32
+#else
+    #define V_INTRIN npyv_maxp_f32 // vector op used by the kernels below
+    #define V_REDUCE_INTRIN npyv_reduce_maxp_f32 // vector-to-scalar reduction used by the reduce kernel
+#endif
+
+// Reduction over contiguous input: folds ip[0..len) into op1[0] using V_INTRIN (vectors) and SCALAR_OP (scalar tail).
+static inline void
+simd_reduce_c_maxp_f32(const npyv_lanetype_f32 *ip, npyv_lanetype_f32 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return; // empty input: op1[0] is left as it was
+    }
+    const int vstep = npyv_nlanes_f32; // elements per vector
+    const int wstep = vstep*8; // elements per unrolled iteration (8 vectors)
+    npyv_f32 acc = npyv_setall_f32(op1[0]); // op1[0] is an input too: it seeds every accumulator lane
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3); // prefetch hint aimed one chunk ahead (see NPY_PREFETCH for the 0, 3 arguments)
+    #endif
+        npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+        npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+        npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+        npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+
+        npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+        npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+        npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+        npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+
+        npyv_f32 r01 = V_INTRIN(v0, v1); // combine the 8 loads pairwise so only the last op depends on acc
+        npyv_f32 r23 = V_INTRIN(v2, v3);
+        npyv_f32 r45 = V_INTRIN(v4, v5);
+        npyv_f32 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) { // leftover whole vectors, one per iteration
+        acc = V_INTRIN(acc, npyv_load_f32(ip));
+    }
+    npyv_lanetype_f32 r = V_REDUCE_INTRIN(acc); // reduce the vector accumulator to a single scalar
+    // Scalar tail: fold in the fewer-than-vstep elements that remain
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_f32 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r; // single write-back of the final result
+}
+
+// Element-wise kernel for contiguous inputs and output: op1[i] = op(ip1[i], ip2[i]) for i in [0, len).
+static inline void
+simd_binary_ccc_maxp_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *ip2,
+                                     npyv_lanetype_f32 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // 6x unroll for 128-bit vectors; chosen for best results on Apple M1
+    const int vectorsPerLoop = 6;
+#else
+    // Other vector widths: 2x unroll, to avoid a memory bandwidth bottleneck
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_f32; // elements per vector
+    int elemPerLoop = vectorsPerLoop * elemPerVector; // elements per unrolled iteration
+
+    npy_intp i = 0; // next unprocessed element; carried across all three loops
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) { // unrolled: all loads, then all ops, then all stores
+        npyv_f32 v0 = npyv_load_f32(&ip1[i + 0 * elemPerVector]);
+        npyv_f32 v1 = npyv_load_f32(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128 // vectors 2..5 exist only in the 6x configuration; keep in sync with vectorsPerLoop
+        npyv_f32 v2 = npyv_load_f32(&ip1[i + 2 * elemPerVector]);
+        npyv_f32 v3 = npyv_load_f32(&ip1[i + 3 * elemPerVector]);
+        npyv_f32 v4 = npyv_load_f32(&ip1[i + 4 * elemPerVector]);
+        npyv_f32 v5 = npyv_load_f32(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_f32 u0 = npyv_load_f32(&ip2[i + 0 * elemPerVector]);
+        npyv_f32 u1 = npyv_load_f32(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_f32 u2 = npyv_load_f32(&ip2[i + 2 * elemPerVector]);
+        npyv_f32 u3 = npyv_load_f32(&ip2[i + 3 * elemPerVector]);
+        npyv_f32 u4 = npyv_load_f32(&ip2[i + 4 * elemPerVector]);
+        npyv_f32 u5 = npyv_load_f32(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_f32 m0 = V_INTRIN(v0, u0);
+        npyv_f32 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_f32 m2 = V_INTRIN(v2, u2);
+        npyv_f32 m3 = V_INTRIN(v3, u3);
+        npyv_f32 m4 = V_INTRIN(v4, u4);
+        npyv_f32 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_f32(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_f32(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_f32(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_f32(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_f32(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_f32(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) { // leftover whole vectors, one per iteration
+        npyv_f32 v0 = npyv_load_f32(ip1 + i);
+        npyv_f32 u0 = npyv_load_f32(ip2 + i);
+        npyv_f32 m0 = V_INTRIN(v0, u0);
+        npyv_store_f32(op1 + i, m0);
+    }
+    // Scalar tail: handle the fewer-than-elemPerVector elements that remain
+    for (; i < len; ++i) {
+        const npyv_lanetype_f32 in1 = ip1[i];
+        const npyv_lanetype_f32 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// non-contiguous for float 32/64-bit memory access
+#if 1 && !defined(NPY_HAVE_NEON)
+// Strided kernel: *op1 = op(*ip1, *ip2) for `len` elements. Excluded when NPY_HAVE_NEON is defined, because unrolled scalars are faster than non-contiguous vector load/store on Arm.
+static inline void
+simd_binary_maxp_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1,
+                           const npyv_lanetype_f32 *ip2, npy_intp sip2,
+                                 npyv_lanetype_f32 *op1, npy_intp sop1,
+                                 npy_intp len) // sip1/sip2/sop1 are strides counted in elements
+{
+    const int vstep = npyv_nlanes_f32; // elements per vector
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_f32 a, b;
+        if (sip1 == 1) { // unit stride: plain contiguous load
+            a = npyv_load_f32(ip1);
+        } else {
+            a = npyv_loadn_f32(ip1, sip1); // load with element stride sip1
+        }
+        if (sip2 == 1) {
+            b = npyv_load_f32(ip2);
+        } else {
+            b = npyv_loadn_f32(ip2, sip2);
+        }
+        npyv_f32 r = V_INTRIN(a, b);
+        if (sop1 == 1) { // same unit-stride shortcut for the store
+            npyv_store_f32(op1, r);
+        } else {
+            npyv_storen_f32(op1, sop1, r); // store with element stride sop1
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) { // scalar tail, one element per step
+        const npyv_lanetype_f32 a = *ip1;
+        const npyv_lanetype_f32 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_minp_f // scalar fallback used by the loop tails in this section
+#if NPY_SIMD_F32 && (!1 || (1 && 1)) // expanded simd_chk && (!fp_only || (is_fp && fp_only)); literal part is 1, so only NPY_SIMD_F32 decides
+
+#if 1 && !1 // always false here: the #else pair is the one selected
+    #define V_INTRIN npyv_minpn_f32 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minpn_f32
+#else
+    #define V_INTRIN npyv_minp_f32 // vector op used by the kernels below
+    #define V_REDUCE_INTRIN npyv_reduce_minp_f32 // vector-to-scalar reduction used by the reduce kernel
+#endif
+
+// Reduce `len` contiguous f32 elements at `ip` into `op1[0]` using the `minp` operation; `op1[0]` also supplies the initial value.
+static inline void
+simd_reduce_c_minp_f32(const npyv_lanetype_f32 *ip, npyv_lanetype_f32 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return;  // nothing to reduce; op1[0] is left as is
+    }
+    const int vstep = npyv_nlanes_f32;  // elements per vector
+    const int wstep = vstep*8;  // elements per unrolled iteration (8 vectors)
+    npyv_f32 acc = npyv_setall_f32(op1[0]);  // every lane starts from the current output value
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);  // hint at the chunk the next iteration will load
+    #endif
+        npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+        npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+        npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+        npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+
+        npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+        npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+        npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+        npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+
+        npyv_f32 r01 = V_INTRIN(v0, v1);  // pairwise tree: 8 vectors -> 4 -> 2 -> 1, then into acc
+        npyv_f32 r23 = V_INTRIN(v2, v3);
+        npyv_f32 r45 = V_INTRIN(v4, v5);
+        npyv_f32 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {  // leftover whole vectors, one per iteration
+        acc = V_INTRIN(acc, npyv_load_f32(ip));
+    }
+    npyv_lanetype_f32 r = V_REDUCE_INTRIN(acc);  // collapse the accumulator lanes into one scalar
+    // Scalar tail: fold in the elements that do not fill a whole vector.
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_f32 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// Element-wise `minp` over `len` f32 elements with contiguous inputs and output: op1[i] = op(ip1[i], ip2[i]).
+static inline void
+simd_binary_ccc_minp_f32(const npyv_lanetype_f32 *ip1, const npyv_lanetype_f32 *ip2,
+                                     npyv_lanetype_f32 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // 128-bit vectors: 6x unroll, chosen for best results on Apple M1.
+    const int vectorsPerLoop = 6;
+#else
+    // Other vector widths: 2x unroll only, to avoid a memory bandwidth bottleneck.
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_f32;
+    int elemPerLoop = vectorsPerLoop * elemPerVector;  // elements consumed per unrolled iteration
+
+    npy_intp i = 0;
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {  // unrolled main loop: load all operands, combine, then store
+        npyv_f32 v0 = npyv_load_f32(&ip1[i + 0 * elemPerVector]);
+        npyv_f32 v1 = npyv_load_f32(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128  // vectors 2..5 exist only in the 6x-unrolled (128-bit) build
+        npyv_f32 v2 = npyv_load_f32(&ip1[i + 2 * elemPerVector]);
+        npyv_f32 v3 = npyv_load_f32(&ip1[i + 3 * elemPerVector]);
+        npyv_f32 v4 = npyv_load_f32(&ip1[i + 4 * elemPerVector]);
+        npyv_f32 v5 = npyv_load_f32(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_f32 u0 = npyv_load_f32(&ip2[i + 0 * elemPerVector]);
+        npyv_f32 u1 = npyv_load_f32(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_f32 u2 = npyv_load_f32(&ip2[i + 2 * elemPerVector]);
+        npyv_f32 u3 = npyv_load_f32(&ip2[i + 3 * elemPerVector]);
+        npyv_f32 u4 = npyv_load_f32(&ip2[i + 4 * elemPerVector]);
+        npyv_f32 u5 = npyv_load_f32(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_f32 m0 = V_INTRIN(v0, u0);
+        npyv_f32 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_f32 m2 = V_INTRIN(v2, u2);
+        npyv_f32 m3 = V_INTRIN(v3, u3);
+        npyv_f32 m4 = V_INTRIN(v4, u4);
+        npyv_f32 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_f32(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_f32(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_f32(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_f32(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_f32(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_f32(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {  // leftover whole vectors, one per iteration
+        npyv_f32 v0 = npyv_load_f32(ip1 + i);
+        npyv_f32 u0 = npyv_load_f32(ip2 + i);
+        npyv_f32 m0 = V_INTRIN(v0, u0);
+        npyv_store_f32(op1 + i, m0);
+    }
+    // Scalar tail: handle the final elements that do not fill a whole vector.
+    for (; i < len; ++i) {
+        const npyv_lanetype_f32 in1 = ip1[i];
+        const npyv_lanetype_f32 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// Strided element-wise `minp` for f32: op1[k*sop1] = op(ip1[k*sip1], ip2[k*sip2]) for k in [0, len); strides count elements, not bytes.
+#if 1 && !defined(NPY_HAVE_NEON)
+// Compiled out on NEON: unrolled scalars are faster than non-contiguous vector load/store on Arm.
+static inline void
+simd_binary_minp_f32(const npyv_lanetype_f32 *ip1, npy_intp sip1,
+                           const npyv_lanetype_f32 *ip2, npy_intp sip2,
+                                 npyv_lanetype_f32 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_f32;  // elements handled per vector iteration
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_f32 a, b;
+        if (sip1 == 1) {  // unit stride: plain contiguous load
+            a = npyv_load_f32(ip1);
+        } else {  // any other stride: strided load via npyv_loadn
+            a = npyv_loadn_f32(ip1, sip1);
+        }
+        if (sip2 == 1) {  // same choice for the second operand
+            b = npyv_load_f32(ip2);
+        } else {
+            b = npyv_loadn_f32(ip2, sip2);
+        }
+        npyv_f32 r = V_INTRIN(a, b);
+        if (sop1 == 1) {  // and for the output: contiguous vs. strided store
+            npyv_store_f32(op1, r);
+        } else {
+            npyv_storen_f32(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_f32 a = *ip1;
+        const npyv_lanetype_f32 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+
+#line 106
+#line 110
+#define SCALAR_OP scalar_max_d
+#if NPY_SIMD_F64 && (!0 || (1 && 0))
+
+#if 1 && !0  // always true in this instantiation: the NaN-propagating intrinsics are selected
+    #define V_INTRIN npyv_maxn_f64 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxn_f64  // used to collapse an accumulator vector to a scalar
+#else  // dead branch in this instantiation
+    #define V_INTRIN npyv_max_f64
+    #define V_REDUCE_INTRIN npyv_reduce_max_f64
+#endif
+
+// Reduce `len` contiguous f64 elements at `ip` into `op1[0]` using the `max` operation; `op1[0]` also supplies the initial value.
+static inline void
+simd_reduce_c_max_f64(const npyv_lanetype_f64 *ip, npyv_lanetype_f64 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return;  // nothing to reduce; op1[0] is left as is
+    }
+    const int vstep = npyv_nlanes_f64;  // elements per vector
+    const int wstep = vstep*8;  // elements per unrolled iteration (8 vectors)
+    npyv_f64 acc = npyv_setall_f64(op1[0]);  // every lane starts from the current output value
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);  // hint at the chunk the next iteration will load
+    #endif
+        npyv_f64 v0 = npyv_load_f64(ip + vstep * 0);
+        npyv_f64 v1 = npyv_load_f64(ip + vstep * 1);
+        npyv_f64 v2 = npyv_load_f64(ip + vstep * 2);
+        npyv_f64 v3 = npyv_load_f64(ip + vstep * 3);
+
+        npyv_f64 v4 = npyv_load_f64(ip + vstep * 4);
+        npyv_f64 v5 = npyv_load_f64(ip + vstep * 5);
+        npyv_f64 v6 = npyv_load_f64(ip + vstep * 6);
+        npyv_f64 v7 = npyv_load_f64(ip + vstep * 7);
+
+        npyv_f64 r01 = V_INTRIN(v0, v1);  // pairwise tree: 8 vectors -> 4 -> 2 -> 1, then into acc
+        npyv_f64 r23 = V_INTRIN(v2, v3);
+        npyv_f64 r45 = V_INTRIN(v4, v5);
+        npyv_f64 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {  // leftover whole vectors, one per iteration
+        acc = V_INTRIN(acc, npyv_load_f64(ip));
+    }
+    npyv_lanetype_f64 r = V_REDUCE_INTRIN(acc);  // collapse the accumulator lanes into one scalar
+    // Scalar tail: fold in the elements that do not fill a whole vector.
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_f64 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// Element-wise `max` over `len` f64 elements with contiguous inputs and output: op1[i] = op(ip1[i], ip2[i]).
+static inline void
+simd_binary_ccc_max_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *ip2,
+                                     npyv_lanetype_f64 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // 128-bit vectors: 6x unroll, chosen for best results on Apple M1.
+    const int vectorsPerLoop = 6;
+#else
+    // Other vector widths: 2x unroll only, to avoid a memory bandwidth bottleneck.
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_f64;
+    int elemPerLoop = vectorsPerLoop * elemPerVector;  // elements consumed per unrolled iteration
+
+    npy_intp i = 0;
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {  // unrolled main loop: load all operands, combine, then store
+        npyv_f64 v0 = npyv_load_f64(&ip1[i + 0 * elemPerVector]);
+        npyv_f64 v1 = npyv_load_f64(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128  // vectors 2..5 exist only in the 6x-unrolled (128-bit) build
+        npyv_f64 v2 = npyv_load_f64(&ip1[i + 2 * elemPerVector]);
+        npyv_f64 v3 = npyv_load_f64(&ip1[i + 3 * elemPerVector]);
+        npyv_f64 v4 = npyv_load_f64(&ip1[i + 4 * elemPerVector]);
+        npyv_f64 v5 = npyv_load_f64(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_f64 u0 = npyv_load_f64(&ip2[i + 0 * elemPerVector]);
+        npyv_f64 u1 = npyv_load_f64(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_f64 u2 = npyv_load_f64(&ip2[i + 2 * elemPerVector]);
+        npyv_f64 u3 = npyv_load_f64(&ip2[i + 3 * elemPerVector]);
+        npyv_f64 u4 = npyv_load_f64(&ip2[i + 4 * elemPerVector]);
+        npyv_f64 u5 = npyv_load_f64(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_f64 m0 = V_INTRIN(v0, u0);
+        npyv_f64 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_f64 m2 = V_INTRIN(v2, u2);
+        npyv_f64 m3 = V_INTRIN(v3, u3);
+        npyv_f64 m4 = V_INTRIN(v4, u4);
+        npyv_f64 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_f64(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_f64(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_f64(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_f64(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_f64(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_f64(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {  // leftover whole vectors, one per iteration
+        npyv_f64 v0 = npyv_load_f64(ip1 + i);
+        npyv_f64 u0 = npyv_load_f64(ip2 + i);
+        npyv_f64 m0 = V_INTRIN(v0, u0);
+        npyv_store_f64(op1 + i, m0);
+    }
+    // Scalar tail: handle the final elements that do not fill a whole vector.
+    for (; i < len; ++i) {
+        const npyv_lanetype_f64 in1 = ip1[i];
+        const npyv_lanetype_f64 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// Strided element-wise `max` for f64: op1[k*sop1] = op(ip1[k*sip1], ip2[k*sip2]) for k in [0, len); strides count elements, not bytes.
+#if 1 && !defined(NPY_HAVE_NEON)
+// Compiled out on NEON: unrolled scalars are faster than non-contiguous vector load/store on Arm.
+static inline void
+simd_binary_max_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
+                           const npyv_lanetype_f64 *ip2, npy_intp sip2,
+                                 npyv_lanetype_f64 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_f64;  // elements handled per vector iteration
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_f64 a, b;
+        if (sip1 == 1) {  // unit stride: plain contiguous load
+            a = npyv_load_f64(ip1);
+        } else {  // any other stride: strided load via npyv_loadn
+            a = npyv_loadn_f64(ip1, sip1);
+        }
+        if (sip2 == 1) {  // same choice for the second operand
+            b = npyv_load_f64(ip2);
+        } else {
+            b = npyv_loadn_f64(ip2, sip2);
+        }
+        npyv_f64 r = V_INTRIN(a, b);
+        if (sop1 == 1) {  // and for the output: contiguous vs. strided store
+            npyv_store_f64(op1, r);
+        } else {
+            npyv_storen_f64(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_f64 a = *ip1;
+        const npyv_lanetype_f64 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_min_d
+#if NPY_SIMD_F64 && (!0 || (1 && 0))
+
+#if 1 && !0  // always true in this instantiation: the NaN-propagating intrinsics are selected
+    #define V_INTRIN npyv_minn_f64 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minn_f64  // used to collapse an accumulator vector to a scalar
+#else  // dead branch in this instantiation
+    #define V_INTRIN npyv_min_f64
+    #define V_REDUCE_INTRIN npyv_reduce_min_f64
+#endif
+
+// Reduce `len` contiguous f64 elements at `ip` into `op1[0]` using the `min` operation; `op1[0]` also supplies the initial value.
+static inline void
+simd_reduce_c_min_f64(const npyv_lanetype_f64 *ip, npyv_lanetype_f64 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return;  // nothing to reduce; op1[0] is left as is
+    }
+    const int vstep = npyv_nlanes_f64;  // elements per vector
+    const int wstep = vstep*8;  // elements per unrolled iteration (8 vectors)
+    npyv_f64 acc = npyv_setall_f64(op1[0]);  // every lane starts from the current output value
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);  // hint at the chunk the next iteration will load
+    #endif
+        npyv_f64 v0 = npyv_load_f64(ip + vstep * 0);
+        npyv_f64 v1 = npyv_load_f64(ip + vstep * 1);
+        npyv_f64 v2 = npyv_load_f64(ip + vstep * 2);
+        npyv_f64 v3 = npyv_load_f64(ip + vstep * 3);
+
+        npyv_f64 v4 = npyv_load_f64(ip + vstep * 4);
+        npyv_f64 v5 = npyv_load_f64(ip + vstep * 5);
+        npyv_f64 v6 = npyv_load_f64(ip + vstep * 6);
+        npyv_f64 v7 = npyv_load_f64(ip + vstep * 7);
+
+        npyv_f64 r01 = V_INTRIN(v0, v1);  // pairwise tree: 8 vectors -> 4 -> 2 -> 1, then into acc
+        npyv_f64 r23 = V_INTRIN(v2, v3);
+        npyv_f64 r45 = V_INTRIN(v4, v5);
+        npyv_f64 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {  // leftover whole vectors, one per iteration
+        acc = V_INTRIN(acc, npyv_load_f64(ip));
+    }
+    npyv_lanetype_f64 r = V_REDUCE_INTRIN(acc);  // collapse the accumulator lanes into one scalar
+    // Scalar tail: fold in the elements that do not fill a whole vector.
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_f64 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// Element-wise `min` over `len` f64 elements with contiguous inputs and output: op1[i] = op(ip1[i], ip2[i]).
+static inline void
+simd_binary_ccc_min_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *ip2,
+                                     npyv_lanetype_f64 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // 128-bit vectors: 6x unroll, chosen for best results on Apple M1.
+    const int vectorsPerLoop = 6;
+#else
+    // Other vector widths: 2x unroll only, to avoid a memory bandwidth bottleneck.
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_f64;
+    int elemPerLoop = vectorsPerLoop * elemPerVector;  // elements consumed per unrolled iteration
+
+    npy_intp i = 0;
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {  // unrolled main loop: load all operands, combine, then store
+        npyv_f64 v0 = npyv_load_f64(&ip1[i + 0 * elemPerVector]);
+        npyv_f64 v1 = npyv_load_f64(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128  // vectors 2..5 exist only in the 6x-unrolled (128-bit) build
+        npyv_f64 v2 = npyv_load_f64(&ip1[i + 2 * elemPerVector]);
+        npyv_f64 v3 = npyv_load_f64(&ip1[i + 3 * elemPerVector]);
+        npyv_f64 v4 = npyv_load_f64(&ip1[i + 4 * elemPerVector]);
+        npyv_f64 v5 = npyv_load_f64(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_f64 u0 = npyv_load_f64(&ip2[i + 0 * elemPerVector]);
+        npyv_f64 u1 = npyv_load_f64(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_f64 u2 = npyv_load_f64(&ip2[i + 2 * elemPerVector]);
+        npyv_f64 u3 = npyv_load_f64(&ip2[i + 3 * elemPerVector]);
+        npyv_f64 u4 = npyv_load_f64(&ip2[i + 4 * elemPerVector]);
+        npyv_f64 u5 = npyv_load_f64(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_f64 m0 = V_INTRIN(v0, u0);
+        npyv_f64 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_f64 m2 = V_INTRIN(v2, u2);
+        npyv_f64 m3 = V_INTRIN(v3, u3);
+        npyv_f64 m4 = V_INTRIN(v4, u4);
+        npyv_f64 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_f64(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_f64(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_f64(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_f64(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_f64(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_f64(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {  // leftover whole vectors, one per iteration
+        npyv_f64 v0 = npyv_load_f64(ip1 + i);
+        npyv_f64 u0 = npyv_load_f64(ip2 + i);
+        npyv_f64 m0 = V_INTRIN(v0, u0);
+        npyv_store_f64(op1 + i, m0);
+    }
+    // Scalar tail: handle the final elements that do not fill a whole vector.
+    for (; i < len; ++i) {
+        const npyv_lanetype_f64 in1 = ip1[i];
+        const npyv_lanetype_f64 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// Strided element-wise `min` for f64: op1[k*sop1] = op(ip1[k*sip1], ip2[k*sip2]) for k in [0, len); strides count elements, not bytes.
+#if 1 && !defined(NPY_HAVE_NEON)
+// Compiled out on NEON: unrolled scalars are faster than non-contiguous vector load/store on Arm.
+static inline void
+simd_binary_min_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
+                           const npyv_lanetype_f64 *ip2, npy_intp sip2,
+                                 npyv_lanetype_f64 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_f64;  // elements handled per vector iteration
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_f64 a, b;
+        if (sip1 == 1) {  // unit stride: plain contiguous load
+            a = npyv_load_f64(ip1);
+        } else {  // any other stride: strided load via npyv_loadn
+            a = npyv_loadn_f64(ip1, sip1);
+        }
+        if (sip2 == 1) {  // same choice for the second operand
+            b = npyv_load_f64(ip2);
+        } else {
+            b = npyv_loadn_f64(ip2, sip2);
+        }
+        npyv_f64 r = V_INTRIN(a, b);
+        if (sop1 == 1) {  // and for the output: contiguous vs. strided store
+            npyv_store_f64(op1, r);
+        } else {
+            npyv_storen_f64(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_f64 a = *ip1;
+        const npyv_lanetype_f64 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_maxp_d
+#if NPY_SIMD_F64 && (!1 || (1 && 1))
+
+#if 1 && !1  // always false in this instantiation, so the #else branch is the one compiled
+    #define V_INTRIN npyv_maxpn_f64 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_maxpn_f64
+#else  // selected here: the variants without the trailing "n" (presumably not NaN-propagating; TODO confirm)
+    #define V_INTRIN npyv_maxp_f64
+    #define V_REDUCE_INTRIN npyv_reduce_maxp_f64  // used to collapse an accumulator vector to a scalar
+#endif
+
+// Reduce `len` contiguous f64 elements at `ip` into `op1[0]` using the `maxp` operation; `op1[0]` also supplies the initial value.
+static inline void
+simd_reduce_c_maxp_f64(const npyv_lanetype_f64 *ip, npyv_lanetype_f64 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return;  // nothing to reduce; op1[0] is left as is
+    }
+    const int vstep = npyv_nlanes_f64;  // elements per vector
+    const int wstep = vstep*8;  // elements per unrolled iteration (8 vectors)
+    npyv_f64 acc = npyv_setall_f64(op1[0]);  // every lane starts from the current output value
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);  // hint at the chunk the next iteration will load
+    #endif
+        npyv_f64 v0 = npyv_load_f64(ip + vstep * 0);
+        npyv_f64 v1 = npyv_load_f64(ip + vstep * 1);
+        npyv_f64 v2 = npyv_load_f64(ip + vstep * 2);
+        npyv_f64 v3 = npyv_load_f64(ip + vstep * 3);
+
+        npyv_f64 v4 = npyv_load_f64(ip + vstep * 4);
+        npyv_f64 v5 = npyv_load_f64(ip + vstep * 5);
+        npyv_f64 v6 = npyv_load_f64(ip + vstep * 6);
+        npyv_f64 v7 = npyv_load_f64(ip + vstep * 7);
+
+        npyv_f64 r01 = V_INTRIN(v0, v1);  // pairwise tree: 8 vectors -> 4 -> 2 -> 1, then into acc
+        npyv_f64 r23 = V_INTRIN(v2, v3);
+        npyv_f64 r45 = V_INTRIN(v4, v5);
+        npyv_f64 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {  // leftover whole vectors, one per iteration
+        acc = V_INTRIN(acc, npyv_load_f64(ip));
+    }
+    npyv_lanetype_f64 r = V_REDUCE_INTRIN(acc);  // collapse the accumulator lanes into one scalar
+    // Scalar tail: fold in the elements that do not fill a whole vector.
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_f64 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// Element-wise `maxp` over `len` f64 elements with contiguous inputs and output: op1[i] = op(ip1[i], ip2[i]).
+static inline void
+simd_binary_ccc_maxp_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *ip2,
+                                     npyv_lanetype_f64 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // 128-bit vectors: 6x unroll, chosen for best results on Apple M1.
+    const int vectorsPerLoop = 6;
+#else
+    // Other vector widths: 2x unroll only, to avoid a memory bandwidth bottleneck.
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_f64;
+    int elemPerLoop = vectorsPerLoop * elemPerVector;  // elements consumed per unrolled iteration
+
+    npy_intp i = 0;
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {  // unrolled main loop: load all operands, combine, then store
+        npyv_f64 v0 = npyv_load_f64(&ip1[i + 0 * elemPerVector]);
+        npyv_f64 v1 = npyv_load_f64(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128  // vectors 2..5 exist only in the 6x-unrolled (128-bit) build
+        npyv_f64 v2 = npyv_load_f64(&ip1[i + 2 * elemPerVector]);
+        npyv_f64 v3 = npyv_load_f64(&ip1[i + 3 * elemPerVector]);
+        npyv_f64 v4 = npyv_load_f64(&ip1[i + 4 * elemPerVector]);
+        npyv_f64 v5 = npyv_load_f64(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_f64 u0 = npyv_load_f64(&ip2[i + 0 * elemPerVector]);
+        npyv_f64 u1 = npyv_load_f64(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_f64 u2 = npyv_load_f64(&ip2[i + 2 * elemPerVector]);
+        npyv_f64 u3 = npyv_load_f64(&ip2[i + 3 * elemPerVector]);
+        npyv_f64 u4 = npyv_load_f64(&ip2[i + 4 * elemPerVector]);
+        npyv_f64 u5 = npyv_load_f64(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_f64 m0 = V_INTRIN(v0, u0);
+        npyv_f64 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_f64 m2 = V_INTRIN(v2, u2);
+        npyv_f64 m3 = V_INTRIN(v3, u3);
+        npyv_f64 m4 = V_INTRIN(v4, u4);
+        npyv_f64 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_f64(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_f64(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_f64(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_f64(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_f64(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_f64(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {  // leftover whole vectors, one per iteration
+        npyv_f64 v0 = npyv_load_f64(ip1 + i);
+        npyv_f64 u0 = npyv_load_f64(ip2 + i);
+        npyv_f64 m0 = V_INTRIN(v0, u0);
+        npyv_store_f64(op1 + i, m0);
+    }
+    // Scalar tail: handle the final elements that do not fill a whole vector.
+    for (; i < len; ++i) {
+        const npyv_lanetype_f64 in1 = ip1[i];
+        const npyv_lanetype_f64 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// Strided element-wise `maxp` for f64: op1[k*sop1] = op(ip1[k*sip1], ip2[k*sip2]) for k in [0, len); strides count elements, not bytes.
+#if 1 && !defined(NPY_HAVE_NEON)
+// Compiled out on NEON: unrolled scalars are faster than non-contiguous vector load/store on Arm.
+static inline void
+simd_binary_maxp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
+                           const npyv_lanetype_f64 *ip2, npy_intp sip2,
+                                 npyv_lanetype_f64 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_f64;  // elements handled per vector iteration
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_f64 a, b;
+        if (sip1 == 1) {  // unit stride: plain contiguous load
+            a = npyv_load_f64(ip1);
+        } else {  // any other stride: strided load via npyv_loadn
+            a = npyv_loadn_f64(ip1, sip1);
+        }
+        if (sip2 == 1) {  // same choice for the second operand
+            b = npyv_load_f64(ip2);
+        } else {
+            b = npyv_loadn_f64(ip2, sip2);
+        }
+        npyv_f64 r = V_INTRIN(a, b);
+        if (sop1 == 1) {  // and for the output: contiguous vs. strided store
+            npyv_store_f64(op1, r);
+        } else {
+            npyv_storen_f64(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_f64 a = *ip1;
+        const npyv_lanetype_f64 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+#line 110
+#define SCALAR_OP scalar_minp_d
+#if NPY_SIMD_F64 && (!1 || (1 && 1))
+
+#if 1 && !1  // always false in this instantiation, so the #else branch is the one compiled
+    #define V_INTRIN npyv_minpn_f64 // propagates NaNs
+    #define V_REDUCE_INTRIN npyv_reduce_minpn_f64
+#else  // selected here: the variants without the trailing "n" (presumably not NaN-propagating; TODO confirm)
+    #define V_INTRIN npyv_minp_f64
+    #define V_REDUCE_INTRIN npyv_reduce_minp_f64  // used to collapse an accumulator vector to a scalar
+#endif
+
+// Reduce `len` contiguous f64 elements at `ip` into `op1[0]` using the `minp` operation; `op1[0]` also supplies the initial value.
+static inline void
+simd_reduce_c_minp_f64(const npyv_lanetype_f64 *ip, npyv_lanetype_f64 *op1, npy_intp len)
+{
+    if (len < 1) {
+        return;  // nothing to reduce; op1[0] is left as is
+    }
+    const int vstep = npyv_nlanes_f64;  // elements per vector
+    const int wstep = vstep*8;  // elements per unrolled iteration (8 vectors)
+    npyv_f64 acc = npyv_setall_f64(op1[0]);  // every lane starts from the current output value
+    for (; len >= wstep; len -= wstep, ip += wstep) {
+    #ifdef NPY_HAVE_SSE2
+        NPY_PREFETCH(ip + wstep, 0, 3);  // hint at the chunk the next iteration will load
+    #endif
+        npyv_f64 v0 = npyv_load_f64(ip + vstep * 0);
+        npyv_f64 v1 = npyv_load_f64(ip + vstep * 1);
+        npyv_f64 v2 = npyv_load_f64(ip + vstep * 2);
+        npyv_f64 v3 = npyv_load_f64(ip + vstep * 3);
+
+        npyv_f64 v4 = npyv_load_f64(ip + vstep * 4);
+        npyv_f64 v5 = npyv_load_f64(ip + vstep * 5);
+        npyv_f64 v6 = npyv_load_f64(ip + vstep * 6);
+        npyv_f64 v7 = npyv_load_f64(ip + vstep * 7);
+
+        npyv_f64 r01 = V_INTRIN(v0, v1);  // pairwise tree: 8 vectors -> 4 -> 2 -> 1, then into acc
+        npyv_f64 r23 = V_INTRIN(v2, v3);
+        npyv_f64 r45 = V_INTRIN(v4, v5);
+        npyv_f64 r67 = V_INTRIN(v6, v7);
+        acc = V_INTRIN(acc, V_INTRIN(V_INTRIN(r01, r23), V_INTRIN(r45, r67)));
+    }
+    for (; len >= vstep; len -= vstep, ip += vstep) {  // leftover whole vectors, one per iteration
+        acc = V_INTRIN(acc, npyv_load_f64(ip));
+    }
+    npyv_lanetype_f64 r = V_REDUCE_INTRIN(acc);  // collapse the accumulator lanes into one scalar
+    // Scalar tail: fold in the elements that do not fill a whole vector.
+    for (; len > 0; --len, ++ip) {
+        const npyv_lanetype_f64 in2 = *ip;
+        r = SCALAR_OP(r, in2);
+    }
+    op1[0] = r;
+}
+
+// Element-wise `minp` over `len` f64 elements with contiguous inputs and output: op1[i] = op(ip1[i], ip2[i]).
+static inline void
+simd_binary_ccc_minp_f64(const npyv_lanetype_f64 *ip1, const npyv_lanetype_f64 *ip2,
+                                     npyv_lanetype_f64 *op1, npy_intp len)
+{
+#if NPY_SIMD_WIDTH == 128
+    // 128-bit vectors: 6x unroll, chosen for best results on Apple M1.
+    const int vectorsPerLoop = 6;
+#else
+    // Other vector widths: 2x unroll only, to avoid a memory bandwidth bottleneck.
+    const int vectorsPerLoop = 2;
+#endif
+    const int elemPerVector = npyv_nlanes_f64;
+    int elemPerLoop = vectorsPerLoop * elemPerVector;  // elements consumed per unrolled iteration
+
+    npy_intp i = 0;
+
+    for (; (i+elemPerLoop) <= len; i += elemPerLoop) {  // unrolled main loop: load all operands, combine, then store
+        npyv_f64 v0 = npyv_load_f64(&ip1[i + 0 * elemPerVector]);
+        npyv_f64 v1 = npyv_load_f64(&ip1[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128  // vectors 2..5 exist only in the 6x-unrolled (128-bit) build
+        npyv_f64 v2 = npyv_load_f64(&ip1[i + 2 * elemPerVector]);
+        npyv_f64 v3 = npyv_load_f64(&ip1[i + 3 * elemPerVector]);
+        npyv_f64 v4 = npyv_load_f64(&ip1[i + 4 * elemPerVector]);
+        npyv_f64 v5 = npyv_load_f64(&ip1[i + 5 * elemPerVector]);
+    #endif
+        npyv_f64 u0 = npyv_load_f64(&ip2[i + 0 * elemPerVector]);
+        npyv_f64 u1 = npyv_load_f64(&ip2[i + 1 * elemPerVector]);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_f64 u2 = npyv_load_f64(&ip2[i + 2 * elemPerVector]);
+        npyv_f64 u3 = npyv_load_f64(&ip2[i + 3 * elemPerVector]);
+        npyv_f64 u4 = npyv_load_f64(&ip2[i + 4 * elemPerVector]);
+        npyv_f64 u5 = npyv_load_f64(&ip2[i + 5 * elemPerVector]);
+    #endif
+        npyv_f64 m0 = V_INTRIN(v0, u0);
+        npyv_f64 m1 = V_INTRIN(v1, u1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_f64 m2 = V_INTRIN(v2, u2);
+        npyv_f64 m3 = V_INTRIN(v3, u3);
+        npyv_f64 m4 = V_INTRIN(v4, u4);
+        npyv_f64 m5 = V_INTRIN(v5, u5);
+    #endif
+        npyv_store_f64(&op1[i + 0 * elemPerVector], m0);
+        npyv_store_f64(&op1[i + 1 * elemPerVector], m1);
+    #if NPY_SIMD_WIDTH == 128
+        npyv_store_f64(&op1[i + 2 * elemPerVector], m2);
+        npyv_store_f64(&op1[i + 3 * elemPerVector], m3);
+        npyv_store_f64(&op1[i + 4 * elemPerVector], m4);
+        npyv_store_f64(&op1[i + 5 * elemPerVector], m5);
+    #endif
+    }
+    for (; (i+elemPerVector) <= len; i += elemPerVector) {  // leftover whole vectors, one per iteration
+        npyv_f64 v0 = npyv_load_f64(ip1 + i);
+        npyv_f64 u0 = npyv_load_f64(ip2 + i);
+        npyv_f64 m0 = V_INTRIN(v0, u0);
+        npyv_store_f64(op1 + i, m0);
+    }
+    // Scalar tail: handle the final elements that do not fill a whole vector.
+    for (; i < len; ++i) {
+        const npyv_lanetype_f64 in1 = ip1[i];
+        const npyv_lanetype_f64 in2 = ip2[i];
+        op1[i] = SCALAR_OP(in1, in2);
+    }
+}
+// Strided element-wise `minp` for f64: op1[k*sop1] = op(ip1[k*sip1], ip2[k*sip2]) for k in [0, len); strides count elements, not bytes.
+#if 1 && !defined(NPY_HAVE_NEON)
+// Compiled out on NEON: unrolled scalars are faster than non-contiguous vector load/store on Arm.
+static inline void
+simd_binary_minp_f64(const npyv_lanetype_f64 *ip1, npy_intp sip1,
+                           const npyv_lanetype_f64 *ip2, npy_intp sip2,
+                                 npyv_lanetype_f64 *op1, npy_intp sop1,
+                                 npy_intp len)
+{
+    const int vstep = npyv_nlanes_f64;  // elements handled per vector iteration
+    for (; len >= vstep; len -= vstep, ip1 += sip1*vstep,
+                         ip2 += sip2*vstep, op1 += sop1*vstep
+    ) {
+        npyv_f64 a, b;
+        if (sip1 == 1) {  // unit stride: plain contiguous load
+            a = npyv_load_f64(ip1);
+        } else {  // any other stride: strided load via npyv_loadn
+            a = npyv_loadn_f64(ip1, sip1);
+        }
+        if (sip2 == 1) {  // same choice for the second operand
+            b = npyv_load_f64(ip2);
+        } else {
+            b = npyv_loadn_f64(ip2, sip2);
+        }
+        npyv_f64 r = V_INTRIN(a, b);
+        if (sop1 == 1) {  // and for the output: contiguous vs. strided store
+            npyv_store_f64(op1, r);
+        } else {
+            npyv_storen_f64(op1, sop1, r);
+        }
+    }
+    for (; len > 0; --len, ip1 += sip1, ip2 += sip2, op1 += sop1) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_f64 a = *ip1;
+        const npyv_lanetype_f64 b = *ip2;
+        *op1 = SCALAR_OP(a, b);
+    }
+}
+#endif
+
+#undef V_INTRIN
+#undef V_REDUCE_INTRIN
+
+#endif // simd_chk && (!fp_only || (is_fp && fp_only))
+
+#undef SCALAR_OP
+
+
+
+/*******************************************************************************
+ ** Defining ufunc inner functions
+ ******************************************************************************/
+#line 294
+#undef TO_SIMD_SFX  // start undefined; later code tests `#ifdef TO_SIMD_SFX` before using the SIMD helpers
+#if 0  // never-taken head so that each width case below is an #elif
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 8  // 8-bit case
+    #if 0  // float suffix: disabled in this instantiation, so the F32/F64 guards inside are dead too
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // taken: unsigned suffix
+        #define TO_SIMD_SFX(X) X##_u8
+    #else  // unreachable after `#elif 1`: signed suffix
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 16  // 16-bit case, same structure as above
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // taken: unsigned suffix
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 32  // 32-bit case, same structure as above
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // taken: unsigned suffix
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 64  // 64-bit case, same structure as above
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // taken: unsigned suffix
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif  // if no case matched (e.g. NPY_SIMD is 0), TO_SIMD_SFX stays undefined
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_max_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_maximum)  // strided inner loop: stores SCALAR_OP(in1, in2) per npy_ubyte element
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // SCALAR_OP is #defined as scalar_max_i immediately before this function
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // byte cursors: first input, second input, output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // byte strides paired with ip1, ip2, op1
+             len = dimensions[0];  // number of elements to process
+    npy_intp i = 0;  // elements already consumed by the unrolled scalar paths
+#ifdef TO_SIMD_SFX  // SIMD fast paths exist only when a lane suffix macro was selected
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)  // lane type name carrying the selected suffix
+    if (IS_BINARY_REDUCE) {
+        // reduction whose second input is densely packed (stride == element size)
+        if (is2 == sizeof(npy_ubyte)) {
+            TO_SIMD_SFX(simd_reduce_c_max)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished: bypass every scalar path below
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // output overlaps neither input; use the SIMD kernel when IS_BINARY_CONT holds
+        if (IS_BINARY_CONT(npy_ubyte, npy_ubyte)) {
+            TO_SIMD_SFX(simd_binary_ccc_max)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished: bypass every scalar path below
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the trailing "&& 0" keeps this strided SIMD branch compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_max)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION  // unrolled scalar paths, omitted when NPY_DISABLE_OPTIMIZATION is defined
+    // scalar unrolls: reached when no SIMD path above jumped to clear_fp
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least one full group of 8 is needed to seed m0..m7
+            npy_ubyte m0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));  // eight independent accumulators, seeded from ip2
+            npy_ubyte m1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
+            npy_ubyte m2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
+            npy_ubyte m3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
+            npy_ubyte m4 = *((npy_ubyte *)(ip2 + (i + 4) * is2));
+            npy_ubyte m5 = *((npy_ubyte *)(ip2 + (i + 5) * is2));
+            npy_ubyte m6 = *((npy_ubyte *)(ip2 + (i + 6) * is2));
+            npy_ubyte m7 = *((npy_ubyte *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  // the seed group has been consumed
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold each further full group of 8, lane by lane
+                npy_ubyte v0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
+                npy_ubyte v1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
+                npy_ubyte v2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
+                npy_ubyte v3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
+                npy_ubyte v4 = *((npy_ubyte *)(ip2 + (i + 4) * is2));
+                npy_ubyte v5 = *((npy_ubyte *)(ip2 + (i + 5) * is2));
+                npy_ubyte v6 = *((npy_ubyte *)(ip2 + (i + 6) * is2));
+                npy_ubyte v7 = *((npy_ubyte *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise tree: 8 accumulators -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_ubyte *)op1) = SCALAR_OP(*((npy_ubyte *)op1), m0);  // merge with the value already stored at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // full groups of 4; leftovers go to the tail loop
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1));
+            npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
+            *((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);  // stored before element 1 is loaded
+            
+#line 431
+            npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1));
+            npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
+            *((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1));
+            npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
+            *((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1));
+            npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
+            *((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // move all three cursors past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // scalar tail: whatever the unrolled code left over
+        const npy_ubyte in1 = *(npy_ubyte *)ip1;
+        const npy_ubyte in2 = *(npy_ubyte *)ip2;
+        *((npy_ubyte *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX  // same guard as the SIMD section, which holds the only jumps to this label
+clear_fp:
+    npyv_cleanup();
+#endif
+#if 0  // constant-false: the float-status clear is not compiled for this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_maximum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For every (index, value) pair: base[index] = SCALAR_OP(base[index], value). Always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp extent = steps[3];  /* added to a negative index to wrap it */
+    npy_intp remaining = dimensions[0];
+    while (remaining-- > 0) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        const npy_intp pos = (raw < 0) ? raw + extent : raw;
+        npy_ubyte *const slot = (npy_ubyte *)(base + base_stride * pos);
+        const npy_ubyte operand = *(const npy_ubyte *)val_ptr;
+        *slot = SCALAR_OP(*slot, operand);
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_min_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_minimum)  // strided inner loop: stores SCALAR_OP(in1, in2) per npy_ubyte element
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // SCALAR_OP is #defined as scalar_min_i immediately before this function
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // byte cursors: first input, second input, output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // byte strides paired with ip1, ip2, op1
+             len = dimensions[0];  // number of elements to process
+    npy_intp i = 0;  // elements already consumed by the unrolled scalar paths
+#ifdef TO_SIMD_SFX  // SIMD fast paths exist only when a lane suffix macro was selected
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)  // lane type name carrying the selected suffix
+    if (IS_BINARY_REDUCE) {
+        // reduction whose second input is densely packed (stride == element size)
+        if (is2 == sizeof(npy_ubyte)) {
+            TO_SIMD_SFX(simd_reduce_c_min)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished: bypass every scalar path below
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // output overlaps neither input; use the SIMD kernel when IS_BINARY_CONT holds
+        if (IS_BINARY_CONT(npy_ubyte, npy_ubyte)) {
+            TO_SIMD_SFX(simd_binary_ccc_min)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished: bypass every scalar path below
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the trailing "&& 0" keeps this strided SIMD branch compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_min)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION  // unrolled scalar paths, omitted when NPY_DISABLE_OPTIMIZATION is defined
+    // scalar unrolls: reached when no SIMD path above jumped to clear_fp
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least one full group of 8 is needed to seed m0..m7
+            npy_ubyte m0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));  // eight independent accumulators, seeded from ip2
+            npy_ubyte m1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
+            npy_ubyte m2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
+            npy_ubyte m3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
+            npy_ubyte m4 = *((npy_ubyte *)(ip2 + (i + 4) * is2));
+            npy_ubyte m5 = *((npy_ubyte *)(ip2 + (i + 5) * is2));
+            npy_ubyte m6 = *((npy_ubyte *)(ip2 + (i + 6) * is2));
+            npy_ubyte m7 = *((npy_ubyte *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  // the seed group has been consumed
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold each further full group of 8, lane by lane
+                npy_ubyte v0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
+                npy_ubyte v1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
+                npy_ubyte v2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
+                npy_ubyte v3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
+                npy_ubyte v4 = *((npy_ubyte *)(ip2 + (i + 4) * is2));
+                npy_ubyte v5 = *((npy_ubyte *)(ip2 + (i + 5) * is2));
+                npy_ubyte v6 = *((npy_ubyte *)(ip2 + (i + 6) * is2));
+                npy_ubyte v7 = *((npy_ubyte *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise tree: 8 accumulators -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_ubyte *)op1) = SCALAR_OP(*((npy_ubyte *)op1), m0);  // merge with the value already stored at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // full groups of 4; leftovers go to the tail loop
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1));
+            npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
+            *((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);  // stored before element 1 is loaded
+            
+#line 431
+            npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1));
+            npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
+            *((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1));
+            npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
+            *((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1));
+            npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
+            *((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // move all three cursors past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // scalar tail: whatever the unrolled code left over
+        const npy_ubyte in1 = *(npy_ubyte *)ip1;
+        const npy_ubyte in2 = *(npy_ubyte *)ip2;
+        *((npy_ubyte *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX  // same guard as the SIMD section, which holds the only jumps to this label
+clear_fp:
+    npyv_cleanup();
+#endif
+#if 0  // constant-false: the float-status clear is not compiled for this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_minimum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For every (index, value) pair: base[index] = SCALAR_OP(base[index], value). Always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp extent = steps[3];  /* added to a negative index to wrap it */
+    npy_intp remaining = dimensions[0];
+    while (remaining-- > 0) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        const npy_intp pos = (raw < 0) ? raw + extent : raw;
+        npy_ubyte *const slot = (npy_ubyte *)(base + base_stride * pos);
+        const npy_ubyte operand = *(const npy_ubyte *)val_ptr;
+        *slot = SCALAR_OP(*slot, operand);
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_maxp_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_fmax)  // strided inner loop: stores SCALAR_OP(in1, in2) per npy_ubyte element
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // SCALAR_OP is scalar_maxp_i here; the enclosing "#if !1 || (0 && 1)" is constant-false, so this body is never compiled
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // byte cursors: first input, second input, output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // byte strides paired with ip1, ip2, op1
+             len = dimensions[0];  // number of elements to process
+    npy_intp i = 0;  // elements already consumed by the unrolled scalar paths
+#ifdef TO_SIMD_SFX  // SIMD fast paths exist only when a lane suffix macro was selected
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)  // lane type name carrying the selected suffix
+    if (IS_BINARY_REDUCE) {
+        // reduction whose second input is densely packed (stride == element size)
+        if (is2 == sizeof(npy_ubyte)) {
+            TO_SIMD_SFX(simd_reduce_c_maxp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished: bypass every scalar path below
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // output overlaps neither input; use the SIMD kernel when IS_BINARY_CONT holds
+        if (IS_BINARY_CONT(npy_ubyte, npy_ubyte)) {
+            TO_SIMD_SFX(simd_binary_ccc_maxp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished: bypass every scalar path below
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the trailing "&& 0" keeps this strided SIMD branch compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_maxp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION  // unrolled scalar paths, omitted when NPY_DISABLE_OPTIMIZATION is defined
+    // scalar unrolls: reached when no SIMD path above jumped to clear_fp
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least one full group of 8 is needed to seed m0..m7
+            npy_ubyte m0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));  // eight independent accumulators, seeded from ip2
+            npy_ubyte m1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
+            npy_ubyte m2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
+            npy_ubyte m3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
+            npy_ubyte m4 = *((npy_ubyte *)(ip2 + (i + 4) * is2));
+            npy_ubyte m5 = *((npy_ubyte *)(ip2 + (i + 5) * is2));
+            npy_ubyte m6 = *((npy_ubyte *)(ip2 + (i + 6) * is2));
+            npy_ubyte m7 = *((npy_ubyte *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  // the seed group has been consumed
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold each further full group of 8, lane by lane
+                npy_ubyte v0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
+                npy_ubyte v1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
+                npy_ubyte v2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
+                npy_ubyte v3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
+                npy_ubyte v4 = *((npy_ubyte *)(ip2 + (i + 4) * is2));
+                npy_ubyte v5 = *((npy_ubyte *)(ip2 + (i + 5) * is2));
+                npy_ubyte v6 = *((npy_ubyte *)(ip2 + (i + 6) * is2));
+                npy_ubyte v7 = *((npy_ubyte *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise tree: 8 accumulators -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_ubyte *)op1) = SCALAR_OP(*((npy_ubyte *)op1), m0);  // merge with the value already stored at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // full groups of 4; leftovers go to the tail loop
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1));
+            npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
+            *((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);  // stored before element 1 is loaded
+            
+#line 431
+            npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1));
+            npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
+            *((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1));
+            npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
+            *((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1));
+            npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
+            *((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // move all three cursors past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // scalar tail: whatever the unrolled code left over
+        const npy_ubyte in1 = *(npy_ubyte *)ip1;
+        const npy_ubyte in2 = *(npy_ubyte *)ip2;
+        *((npy_ubyte *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX  // same guard as the SIMD section, which holds the only jumps to this label
+clear_fp:
+    npyv_cleanup();
+#endif
+#if 0  // constant-false: the float-status clear is not compiled for this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmax_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For every (index, value) pair: base[index] = SCALAR_OP(base[index], value). Always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp extent = steps[3];  /* added to a negative index to wrap it */
+    npy_intp remaining = dimensions[0];
+    while (remaining-- > 0) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        const npy_intp pos = (raw < 0) ? raw + extent : raw;
+        npy_ubyte *const slot = (npy_ubyte *)(base + base_stride * pos);
+        const npy_ubyte operand = *(const npy_ubyte *)val_ptr;
+        *slot = SCALAR_OP(*slot, operand);
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_minp_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_fmin)  // strided inner loop: stores SCALAR_OP(in1, in2) per npy_ubyte element
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // SCALAR_OP is scalar_minp_i here; the enclosing "#if !1 || (0 && 1)" is constant-false, so this body is never compiled
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // byte cursors: first input, second input, output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // byte strides paired with ip1, ip2, op1
+             len = dimensions[0];  // number of elements to process
+    npy_intp i = 0;  // elements already consumed by the unrolled scalar paths
+#ifdef TO_SIMD_SFX  // SIMD fast paths exist only when a lane suffix macro was selected
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)  // lane type name carrying the selected suffix
+    if (IS_BINARY_REDUCE) {
+        // reduction whose second input is densely packed (stride == element size)
+        if (is2 == sizeof(npy_ubyte)) {
+            TO_SIMD_SFX(simd_reduce_c_minp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished: bypass every scalar path below
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // output overlaps neither input; use the SIMD kernel when IS_BINARY_CONT holds
+        if (IS_BINARY_CONT(npy_ubyte, npy_ubyte)) {
+            TO_SIMD_SFX(simd_binary_ccc_minp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished: bypass every scalar path below
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the trailing "&& 0" keeps this strided SIMD branch compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_minp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION  // unrolled scalar paths, omitted when NPY_DISABLE_OPTIMIZATION is defined
+    // scalar unrolls: reached when no SIMD path above jumped to clear_fp
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least one full group of 8 is needed to seed m0..m7
+            npy_ubyte m0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));  // eight independent accumulators, seeded from ip2
+            npy_ubyte m1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
+            npy_ubyte m2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
+            npy_ubyte m3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
+            npy_ubyte m4 = *((npy_ubyte *)(ip2 + (i + 4) * is2));
+            npy_ubyte m5 = *((npy_ubyte *)(ip2 + (i + 5) * is2));
+            npy_ubyte m6 = *((npy_ubyte *)(ip2 + (i + 6) * is2));
+            npy_ubyte m7 = *((npy_ubyte *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  // the seed group has been consumed
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold each further full group of 8, lane by lane
+                npy_ubyte v0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
+                npy_ubyte v1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
+                npy_ubyte v2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
+                npy_ubyte v3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
+                npy_ubyte v4 = *((npy_ubyte *)(ip2 + (i + 4) * is2));
+                npy_ubyte v5 = *((npy_ubyte *)(ip2 + (i + 5) * is2));
+                npy_ubyte v6 = *((npy_ubyte *)(ip2 + (i + 6) * is2));
+                npy_ubyte v7 = *((npy_ubyte *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise tree: 8 accumulators -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_ubyte *)op1) = SCALAR_OP(*((npy_ubyte *)op1), m0);  // merge with the value already stored at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // full groups of 4; leftovers go to the tail loop
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_ubyte v0 = *((npy_ubyte *)(ip1 + (i + 0) * is1));
+            npy_ubyte u0 = *((npy_ubyte *)(ip2 + (i + 0) * is2));
+            *((npy_ubyte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);  // stored before element 1 is loaded
+            
+#line 431
+            npy_ubyte v1 = *((npy_ubyte *)(ip1 + (i + 1) * is1));
+            npy_ubyte u1 = *((npy_ubyte *)(ip2 + (i + 1) * is2));
+            *((npy_ubyte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_ubyte v2 = *((npy_ubyte *)(ip1 + (i + 2) * is1));
+            npy_ubyte u2 = *((npy_ubyte *)(ip2 + (i + 2) * is2));
+            *((npy_ubyte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_ubyte v3 = *((npy_ubyte *)(ip1 + (i + 3) * is1));
+            npy_ubyte u3 = *((npy_ubyte *)(ip2 + (i + 3) * is2));
+            *((npy_ubyte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // move all three cursors past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // scalar tail: whatever the unrolled code left over
+        const npy_ubyte in1 = *(npy_ubyte *)ip1;
+        const npy_ubyte in2 = *(npy_ubyte *)ip2;
+        *((npy_ubyte *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX  // same guard as the SIMD section, which holds the only jumps to this label
+clear_fp:
+    npyv_cleanup();
+#endif
+#if 0  // constant-false: the float-status clear is not compiled for this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UBYTE_fmin_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For every (index, value) pair: base[index] = SCALAR_OP(base[index], value). Always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp extent = steps[3];  /* added to a negative index to wrap it */
+    npy_intp remaining = dimensions[0];
+    while (remaining-- > 0) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        const npy_intp pos = (raw < 0) ? raw + extent : raw;
+        npy_ubyte *const slot = (npy_ubyte *)(base + base_stride * pos);
+        const npy_ubyte operand = *(const npy_ubyte *)val_ptr;
+        *slot = SCALAR_OP(*slot, operand);
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+
+#line 294
+#undef TO_SIMD_SFX  // discard any earlier definition before selecting a suffix keyed on NPY_BITSOF_SHORT
+#if 0
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 8  // one arm per candidate lane width: 8, 16, 32, 64 bits
+    #if 0  // float-suffix arm: constant-false in this instantiation
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // always taken: unsigned suffix
+        #define TO_SIMD_SFX(X) X##_u8
+    #else  // unreachable after "#elif 1": signed suffix
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 16
+    #if 0  // float-suffix arm: constant-false in this instantiation
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // always taken: unsigned suffix
+        #define TO_SIMD_SFX(X) X##_u16
+    #else  // unreachable after "#elif 1": signed suffix
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 32
+    #if 0  // float-suffix arm: constant-false in this instantiation
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // always taken: unsigned suffix
+        #define TO_SIMD_SFX(X) X##_u32
+    #else  // unreachable after "#elif 1": signed suffix
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 64
+    #if 0  // float-suffix arm: constant-false in this instantiation
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // always taken: unsigned suffix
+        #define TO_SIMD_SFX(X) X##_u64
+    #else  // unreachable after "#elif 1": signed suffix
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif  // if no arm matched, TO_SIMD_SFX stays undefined and the "#ifdef TO_SIMD_SFX" sections below are skipped
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_max_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_maximum)  // strided inner loop: stores SCALAR_OP(in1, in2) per npy_ushort element
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // SCALAR_OP is #defined as scalar_max_i immediately before this function
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // byte cursors: first input, second input, output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // byte strides paired with ip1, ip2, op1
+             len = dimensions[0];  // number of elements to process
+    npy_intp i = 0;  // elements already consumed by the unrolled scalar paths
+#ifdef TO_SIMD_SFX  // SIMD fast paths exist only when a lane suffix macro was selected
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)  // lane type name carrying the selected suffix
+    if (IS_BINARY_REDUCE) {
+        // reduction whose second input is densely packed (stride == element size)
+        if (is2 == sizeof(npy_ushort)) {
+            TO_SIMD_SFX(simd_reduce_c_max)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished: bypass every scalar path below
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // output overlaps neither input; use the SIMD kernel when IS_BINARY_CONT holds
+        if (IS_BINARY_CONT(npy_ushort, npy_ushort)) {
+            TO_SIMD_SFX(simd_binary_ccc_max)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished: bypass every scalar path below
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the trailing "&& 0" keeps this strided SIMD branch compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_max)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION  // unrolled scalar paths, omitted when NPY_DISABLE_OPTIMIZATION is defined
+    // scalar unrolls: reached when no SIMD path above jumped to clear_fp
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least one full group of 8 is needed to seed m0..m7
+            npy_ushort m0 = *((npy_ushort *)(ip2 + (i + 0) * is2));  // eight independent accumulators, seeded from ip2
+            npy_ushort m1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
+            npy_ushort m2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
+            npy_ushort m3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
+            npy_ushort m4 = *((npy_ushort *)(ip2 + (i + 4) * is2));
+            npy_ushort m5 = *((npy_ushort *)(ip2 + (i + 5) * is2));
+            npy_ushort m6 = *((npy_ushort *)(ip2 + (i + 6) * is2));
+            npy_ushort m7 = *((npy_ushort *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  // the seed group has been consumed
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold each further full group of 8, lane by lane
+                npy_ushort v0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
+                npy_ushort v1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
+                npy_ushort v2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
+                npy_ushort v3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
+                npy_ushort v4 = *((npy_ushort *)(ip2 + (i + 4) * is2));
+                npy_ushort v5 = *((npy_ushort *)(ip2 + (i + 5) * is2));
+                npy_ushort v6 = *((npy_ushort *)(ip2 + (i + 6) * is2));
+                npy_ushort v7 = *((npy_ushort *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise tree: 8 accumulators -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_ushort *)op1) = SCALAR_OP(*((npy_ushort *)op1), m0);  // merge with the value already stored at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // full groups of 4; leftovers go to the tail loop
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1));
+            npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
+            *((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);  // stored before element 1 is loaded
+            
+#line 431
+            npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1));
+            npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
+            *((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1));
+            npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
+            *((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1));
+            npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
+            *((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // move all three cursors past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // scalar tail: whatever the unrolled code left over
+        const npy_ushort in1 = *(npy_ushort *)ip1;
+        const npy_ushort in2 = *(npy_ushort *)ip2;
+        *((npy_ushort *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX  // same guard as the SIMD section, which holds the only jumps to this label
+clear_fp:
+    npyv_cleanup();
+#endif
+#if 0  // constant-false: the float-status clear is not compiled for this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+/*
+ * Indexed in-place update for npy_ushort using SCALAR_OP.
+ *
+ * For each of the dimensions[0] (index, value) pairs read from args[1] and
+ * args[2], the element of args[0] selected by the index is replaced with
+ * SCALAR_OP(element, value).  steps[0..2] are byte strides for the three
+ * buffers; steps[3] is added once to a negative index.  No bounds checking
+ * is done here.  Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_maximum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+
+    for (k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        /* a negative index is wrapped exactly once */
+        pos = (pos < 0) ? (pos + wrap) : pos;
+
+        npy_ushort *const slot = (npy_ushort *)(base + elem_stride * pos);
+        const npy_ushort current = *slot;
+        const npy_ushort incoming = *(const npy_ushort *)val_ptr;
+        *slot = SCALAR_OP(current, incoming);
+
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_min_i
+/*
+ * Element-wise minimum loop for npy_ushort operands.
+ *
+ * args[0] and args[1] are the input buffers and args[2] is the output
+ * buffer; steps[0..2] are the matching strides in bytes and dimensions[0]
+ * is the element count.  `func` is unused.
+ *
+ * Paths, tried in order:
+ *   1. Only when TO_SIMD_SFX is defined: a contiguous reduction
+ *      (simd_reduce_c_min) or a contiguous, non-overlapping binary call
+ *      (simd_binary_ccc_min).  Both jump straight to clear_fp.  The strided
+ *      SIMD variant is compiled out by the literal 0 in its guard.
+ *   2. Unless NPY_DISABLE_OPTIMIZATION is defined: scalar unrolling, 8 wide
+ *      with independent accumulators for reductions, 4 wide otherwise.
+ *   3. A plain scalar loop over whatever elements remain.
+ *
+ * NOTE(review): IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are
+ * defined outside this block.  The reduction branches read only ip2 and
+ * treat the value at op1 as the running result -- confirm against the
+ * macro definitions.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_minimum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    npy_intp i = 0;  /* number of elements already handled by the unrolled code */
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_ushort)) {
+            TO_SIMD_SFX(simd_reduce_c_min)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_ushort, npy_ushort)) {
+            TO_SIMD_SFX(simd_binary_ccc_min)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  /* the literal 0 compiles this strided SIMD path out */
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_min)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){
+            npy_ushort m0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
+            npy_ushort m1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
+            npy_ushort m2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
+            npy_ushort m3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
+            npy_ushort m4 = *((npy_ushort *)(ip2 + (i + 4) * is2));
+            npy_ushort m5 = *((npy_ushort *)(ip2 + (i + 5) * is2));
+            npy_ushort m6 = *((npy_ushort *)(ip2 + (i + 6) * is2));
+            npy_ushort m7 = *((npy_ushort *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  /* the first 8 elements seeded m0..m7 above */
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_ushort v0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
+                npy_ushort v1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
+                npy_ushort v2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
+                npy_ushort v3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
+                npy_ushort v4 = *((npy_ushort *)(ip2 + (i + 4) * is2));
+                npy_ushort v5 = *((npy_ushort *)(ip2 + (i + 5) * is2));
+                npy_ushort v6 = *((npy_ushort *)(ip2 + (i + 6) * is2));
+                npy_ushort v7 = *((npy_ushort *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  /* combine the 8 accumulators pairwise: 8 -> 4 */
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  /* 4 -> 2 */
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  /* 2 -> 1 */
+            /* fold the combined result into the value already stored at op1 */
+             *((npy_ushort *)op1) = SCALAR_OP(*((npy_ushort *)op1), m0);
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1));
+            npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
+            *((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1));
+            npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
+            *((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1));
+            npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
+            *((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1));
+            npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
+            *((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  /* move all three pointers past the i elements handled above */
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
+        const npy_ushort in1 = *(npy_ushort *)ip1;
+        const npy_ushort in2 = *(npy_ushort *)ip2;
+        *((npy_ushort *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  /* the SIMD paths jump here, bypassing all scalar code */
+    npyv_cleanup();
+#endif
+#if 0  /* compiled out in this instantiation */
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+/*
+ * Indexed in-place update for npy_ushort using SCALAR_OP.
+ *
+ * For each of the dimensions[0] (index, value) pairs read from args[1] and
+ * args[2], the element of args[0] selected by the index is replaced with
+ * SCALAR_OP(element, value).  steps[0..2] are byte strides for the three
+ * buffers; steps[3] is added once to a negative index.  No bounds checking
+ * is done here.  Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_minimum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+
+    for (k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        /* a negative index is wrapped exactly once */
+        pos = (pos < 0) ? (pos + wrap) : pos;
+
+        npy_ushort *const slot = (npy_ushort *)(base + elem_stride * pos);
+        const npy_ushort current = *slot;
+        const npy_ushort incoming = *(const npy_ushort *)val_ptr;
+        *slot = SCALAR_OP(current, incoming);
+
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_maxp_i
+/*
+ * Element-wise fmax loop for npy_ushort operands (SCALAR_OP is
+ * scalar_maxp_i here).  The guard that encloses this definition,
+ * `#if !1 || (0 && 1)`, evaluates to 0, so this instantiation is not
+ * compiled.
+ *
+ * args[0] and args[1] are the input buffers and args[2] is the output
+ * buffer; steps[0..2] are the matching strides in bytes and dimensions[0]
+ * is the element count.  `func` is unused.
+ *
+ * Paths, tried in order:
+ *   1. Only when TO_SIMD_SFX is defined: a contiguous reduction
+ *      (simd_reduce_c_maxp) or a contiguous, non-overlapping binary call
+ *      (simd_binary_ccc_maxp).  Both jump straight to clear_fp.  The
+ *      strided SIMD variant is compiled out by the literal 0 in its guard.
+ *   2. Unless NPY_DISABLE_OPTIMIZATION is defined: scalar unrolling, 8 wide
+ *      with independent accumulators for reductions, 4 wide otherwise.
+ *   3. A plain scalar loop over whatever elements remain.
+ *
+ * NOTE(review): IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are
+ * defined outside this block.  The reduction branches read only ip2 and
+ * treat the value at op1 as the running result -- confirm against the
+ * macro definitions.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_fmax)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    npy_intp i = 0;  /* number of elements already handled by the unrolled code */
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_ushort)) {
+            TO_SIMD_SFX(simd_reduce_c_maxp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_ushort, npy_ushort)) {
+            TO_SIMD_SFX(simd_binary_ccc_maxp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  /* the literal 0 compiles this strided SIMD path out */
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_maxp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){
+            npy_ushort m0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
+            npy_ushort m1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
+            npy_ushort m2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
+            npy_ushort m3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
+            npy_ushort m4 = *((npy_ushort *)(ip2 + (i + 4) * is2));
+            npy_ushort m5 = *((npy_ushort *)(ip2 + (i + 5) * is2));
+            npy_ushort m6 = *((npy_ushort *)(ip2 + (i + 6) * is2));
+            npy_ushort m7 = *((npy_ushort *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  /* the first 8 elements seeded m0..m7 above */
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_ushort v0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
+                npy_ushort v1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
+                npy_ushort v2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
+                npy_ushort v3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
+                npy_ushort v4 = *((npy_ushort *)(ip2 + (i + 4) * is2));
+                npy_ushort v5 = *((npy_ushort *)(ip2 + (i + 5) * is2));
+                npy_ushort v6 = *((npy_ushort *)(ip2 + (i + 6) * is2));
+                npy_ushort v7 = *((npy_ushort *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  /* combine the 8 accumulators pairwise: 8 -> 4 */
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  /* 4 -> 2 */
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  /* 2 -> 1 */
+            /* fold the combined result into the value already stored at op1 */
+             *((npy_ushort *)op1) = SCALAR_OP(*((npy_ushort *)op1), m0);
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1));
+            npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
+            *((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1));
+            npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
+            *((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1));
+            npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
+            *((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1));
+            npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
+            *((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  /* move all three pointers past the i elements handled above */
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
+        const npy_ushort in1 = *(npy_ushort *)ip1;
+        const npy_ushort in2 = *(npy_ushort *)ip2;
+        *((npy_ushort *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  /* the SIMD paths jump here, bypassing all scalar code */
+    npyv_cleanup();
+#endif
+#if 0  /* compiled out in this instantiation */
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+/*
+ * Indexed in-place update for npy_ushort using SCALAR_OP.
+ *
+ * For each of the dimensions[0] (index, value) pairs read from args[1] and
+ * args[2], the element of args[0] selected by the index is replaced with
+ * SCALAR_OP(element, value).  steps[0..2] are byte strides for the three
+ * buffers; steps[3] is added once to a negative index.  No bounds checking
+ * is done here.  Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmax_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+
+    for (k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        /* a negative index is wrapped exactly once */
+        pos = (pos < 0) ? (pos + wrap) : pos;
+
+        npy_ushort *const slot = (npy_ushort *)(base + elem_stride * pos);
+        const npy_ushort current = *slot;
+        const npy_ushort incoming = *(const npy_ushort *)val_ptr;
+        *slot = SCALAR_OP(current, incoming);
+
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_minp_i
+/*
+ * Element-wise fmin loop for npy_ushort operands (SCALAR_OP is
+ * scalar_minp_i here).  The guard that encloses this definition,
+ * `#if !1 || (0 && 1)`, evaluates to 0, so this instantiation is not
+ * compiled.
+ *
+ * args[0] and args[1] are the input buffers and args[2] is the output
+ * buffer; steps[0..2] are the matching strides in bytes and dimensions[0]
+ * is the element count.  `func` is unused.
+ *
+ * Paths, tried in order:
+ *   1. Only when TO_SIMD_SFX is defined: a contiguous reduction
+ *      (simd_reduce_c_minp) or a contiguous, non-overlapping binary call
+ *      (simd_binary_ccc_minp).  Both jump straight to clear_fp.  The
+ *      strided SIMD variant is compiled out by the literal 0 in its guard.
+ *   2. Unless NPY_DISABLE_OPTIMIZATION is defined: scalar unrolling, 8 wide
+ *      with independent accumulators for reductions, 4 wide otherwise.
+ *   3. A plain scalar loop over whatever elements remain.
+ *
+ * NOTE(review): IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are
+ * defined outside this block.  The reduction branches read only ip2 and
+ * treat the value at op1 as the running result -- confirm against the
+ * macro definitions.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_fmin)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    npy_intp i = 0;  /* number of elements already handled by the unrolled code */
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_ushort)) {
+            TO_SIMD_SFX(simd_reduce_c_minp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_ushort, npy_ushort)) {
+            TO_SIMD_SFX(simd_binary_ccc_minp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  /* the literal 0 compiles this strided SIMD path out */
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_minp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){
+            npy_ushort m0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
+            npy_ushort m1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
+            npy_ushort m2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
+            npy_ushort m3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
+            npy_ushort m4 = *((npy_ushort *)(ip2 + (i + 4) * is2));
+            npy_ushort m5 = *((npy_ushort *)(ip2 + (i + 5) * is2));
+            npy_ushort m6 = *((npy_ushort *)(ip2 + (i + 6) * is2));
+            npy_ushort m7 = *((npy_ushort *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  /* the first 8 elements seeded m0..m7 above */
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_ushort v0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
+                npy_ushort v1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
+                npy_ushort v2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
+                npy_ushort v3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
+                npy_ushort v4 = *((npy_ushort *)(ip2 + (i + 4) * is2));
+                npy_ushort v5 = *((npy_ushort *)(ip2 + (i + 5) * is2));
+                npy_ushort v6 = *((npy_ushort *)(ip2 + (i + 6) * is2));
+                npy_ushort v7 = *((npy_ushort *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  /* combine the 8 accumulators pairwise: 8 -> 4 */
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  /* 4 -> 2 */
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  /* 2 -> 1 */
+            /* fold the combined result into the value already stored at op1 */
+             *((npy_ushort *)op1) = SCALAR_OP(*((npy_ushort *)op1), m0);
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_ushort v0 = *((npy_ushort *)(ip1 + (i + 0) * is1));
+            npy_ushort u0 = *((npy_ushort *)(ip2 + (i + 0) * is2));
+            *((npy_ushort *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_ushort v1 = *((npy_ushort *)(ip1 + (i + 1) * is1));
+            npy_ushort u1 = *((npy_ushort *)(ip2 + (i + 1) * is2));
+            *((npy_ushort *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_ushort v2 = *((npy_ushort *)(ip1 + (i + 2) * is1));
+            npy_ushort u2 = *((npy_ushort *)(ip2 + (i + 2) * is2));
+            *((npy_ushort *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_ushort v3 = *((npy_ushort *)(ip1 + (i + 3) * is1));
+            npy_ushort u3 = *((npy_ushort *)(ip2 + (i + 3) * is2));
+            *((npy_ushort *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  /* move all three pointers past the i elements handled above */
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
+        const npy_ushort in1 = *(npy_ushort *)ip1;
+        const npy_ushort in2 = *(npy_ushort *)ip2;
+        *((npy_ushort *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  /* the SIMD paths jump here, bypassing all scalar code */
+    npyv_cleanup();
+#endif
+#if 0  /* compiled out in this instantiation */
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+/*
+ * Indexed in-place update for npy_ushort using SCALAR_OP.
+ *
+ * For each of the dimensions[0] (index, value) pairs read from args[1] and
+ * args[2], the element of args[0] selected by the index is replaced with
+ * SCALAR_OP(element, value).  steps[0..2] are byte strides for the three
+ * buffers; steps[3] is added once to a negative index.  No bounds checking
+ * is done here.  Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(USHORT_fmin_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+
+    for (k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        /* a negative index is wrapped exactly once */
+        pos = (pos < 0) ? (pos + wrap) : pos;
+
+        npy_ushort *const slot = (npy_ushort *)(base + elem_stride * pos);
+        const npy_ushort current = *slot;
+        const npy_ushort incoming = *(const npy_ushort *)val_ptr;
+        *slot = SCALAR_OP(current, incoming);
+
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+/*
+ * Re-select TO_SIMD_SFX for the UINT loops that follow.  The macro is first
+ * undefined, then each `#elif` arm tests one lane width (8/16/32/64) against
+ * NPY_BITSOF_INT.  Inside an arm the float choice is disabled (`#if 0`) and
+ * the `#elif 1` branch picks the unsigned suffix (_u8/_u16/_u32/_u64).
+ * If NPY_SIMD is 0 or no width matches, TO_SIMD_SFX stays undefined and the
+ * `#ifdef TO_SIMD_SFX` sections in the following functions are skipped.
+ */
+#line 294
+#undef TO_SIMD_SFX
+#if 0
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_INT == 8
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1
+        #define TO_SIMD_SFX(X) X##_u8
+    #else
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_INT == 16
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_INT == 32
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_INT == 64
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_max_i
+/*
+ * Element-wise maximum loop for npy_uint operands.
+ *
+ * args[0] and args[1] are the input buffers and args[2] is the output
+ * buffer; steps[0..2] are the matching strides in bytes and dimensions[0]
+ * is the element count.  `func` is unused.
+ *
+ * Paths, tried in order:
+ *   1. Only when TO_SIMD_SFX is defined: a contiguous reduction
+ *      (simd_reduce_c_max) or a contiguous, non-overlapping binary call
+ *      (simd_binary_ccc_max).  Both jump straight to clear_fp.  The strided
+ *      SIMD variant is compiled out by the literal 0 in its guard.
+ *   2. Unless NPY_DISABLE_OPTIMIZATION is defined: scalar unrolling, 8 wide
+ *      with independent accumulators for reductions, 4 wide otherwise.
+ *   3. A plain scalar loop over whatever elements remain.
+ *
+ * NOTE(review): IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are
+ * defined outside this block.  The reduction branches read only ip2 and
+ * treat the value at op1 as the running result -- confirm against the
+ * macro definitions.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_maximum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    npy_intp i = 0;  /* number of elements already handled by the unrolled code */
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_uint)) {
+            TO_SIMD_SFX(simd_reduce_c_max)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_uint, npy_uint)) {
+            TO_SIMD_SFX(simd_binary_ccc_max)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  /* the literal 0 compiles this strided SIMD path out */
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_max)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){
+            npy_uint m0 = *((npy_uint *)(ip2 + (i + 0) * is2));
+            npy_uint m1 = *((npy_uint *)(ip2 + (i + 1) * is2));
+            npy_uint m2 = *((npy_uint *)(ip2 + (i + 2) * is2));
+            npy_uint m3 = *((npy_uint *)(ip2 + (i + 3) * is2));
+            npy_uint m4 = *((npy_uint *)(ip2 + (i + 4) * is2));
+            npy_uint m5 = *((npy_uint *)(ip2 + (i + 5) * is2));
+            npy_uint m6 = *((npy_uint *)(ip2 + (i + 6) * is2));
+            npy_uint m7 = *((npy_uint *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  /* the first 8 elements seeded m0..m7 above */
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_uint v0 = *((npy_uint *)(ip2 + (i + 0) * is2));
+                npy_uint v1 = *((npy_uint *)(ip2 + (i + 1) * is2));
+                npy_uint v2 = *((npy_uint *)(ip2 + (i + 2) * is2));
+                npy_uint v3 = *((npy_uint *)(ip2 + (i + 3) * is2));
+                npy_uint v4 = *((npy_uint *)(ip2 + (i + 4) * is2));
+                npy_uint v5 = *((npy_uint *)(ip2 + (i + 5) * is2));
+                npy_uint v6 = *((npy_uint *)(ip2 + (i + 6) * is2));
+                npy_uint v7 = *((npy_uint *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  /* combine the 8 accumulators pairwise: 8 -> 4 */
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  /* 4 -> 2 */
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  /* 2 -> 1 */
+            /* fold the combined result into the value already stored at op1 */
+             *((npy_uint *)op1) = SCALAR_OP(*((npy_uint *)op1), m0);
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1));
+            npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2));
+            *((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1));
+            npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2));
+            *((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1));
+            npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2));
+            *((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1));
+            npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2));
+            *((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  /* move all three pointers past the i elements handled above */
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
+        const npy_uint in1 = *(npy_uint *)ip1;
+        const npy_uint in2 = *(npy_uint *)ip2;
+        *((npy_uint *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  /* the SIMD paths jump here, bypassing all scalar code */
+    npyv_cleanup();
+#endif
+#if 0  /* compiled out in this instantiation */
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+/*
+ * Indexed in-place update for npy_uint using SCALAR_OP.
+ *
+ * For each of the dimensions[0] (index, value) pairs read from args[1] and
+ * args[2], the element of args[0] selected by the index is replaced with
+ * SCALAR_OP(element, value).  steps[0..2] are byte strides for the three
+ * buffers; steps[3] is added once to a negative index.  No bounds checking
+ * is done here.  Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_maximum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp elem_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+
+    for (k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        /* a negative index is wrapped exactly once */
+        pos = (pos < 0) ? (pos + wrap) : pos;
+
+        npy_uint *const slot = (npy_uint *)(base + elem_stride * pos);
+        const npy_uint current = *slot;
+        const npy_uint incoming = *(const npy_uint *)val_ptr;
+        *slot = SCALAR_OP(current, incoming);
+
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_min_i
+
+/*
+ * Inner loop for `minimum` on npy_uint data; SCALAR_OP is scalar_min_i here.
+ *
+ * args[0] and args[1] are the input pointers, args[2] the output pointer;
+ * steps[0..2] are the matching strides in bytes and dimensions[0] is the
+ * number of elements.
+ *
+ * Paths are tried in this order:
+ *   1. If TO_SIMD_SFX is defined: a SIMD kernel for a contiguous reduction,
+ *      or for contiguous binary operands that do not overlap the output.
+ *      Both jump straight to clear_fp.
+ *   2. Unless NPY_DISABLE_OPTIMIZATION is defined: scalar loops unrolled 8x
+ *      (reduction) or 4x (binary).
+ *   3. A plain scalar loop that handles every element not consumed above.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_minimum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    // number of elements already handled; advanced by the unrolled loops
+    npy_intp i = 0;
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_uint)) {
+            TO_SIMD_SFX(simd_reduce_c_min)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_uint, npy_uint)) {
+            TO_SIMD_SFX(simd_binary_ccc_min)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    // the trailing "&& 0" disables this strided SIMD branch in this instantiation
+    #if !defined(NPY_HAVE_NEON) && 0
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_min)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){
+            // seed eight independent accumulators from the first eight elements
+            npy_uint m0 = *((npy_uint *)(ip2 + (i + 0) * is2));
+            npy_uint m1 = *((npy_uint *)(ip2 + (i + 1) * is2));
+            npy_uint m2 = *((npy_uint *)(ip2 + (i + 2) * is2));
+            npy_uint m3 = *((npy_uint *)(ip2 + (i + 3) * is2));
+            npy_uint m4 = *((npy_uint *)(ip2 + (i + 4) * is2));
+            npy_uint m5 = *((npy_uint *)(ip2 + (i + 5) * is2));
+            npy_uint m6 = *((npy_uint *)(ip2 + (i + 6) * is2));
+            npy_uint m7 = *((npy_uint *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_uint v0 = *((npy_uint *)(ip2 + (i + 0) * is2));
+                npy_uint v1 = *((npy_uint *)(ip2 + (i + 1) * is2));
+                npy_uint v2 = *((npy_uint *)(ip2 + (i + 2) * is2));
+                npy_uint v3 = *((npy_uint *)(ip2 + (i + 3) * is2));
+                npy_uint v4 = *((npy_uint *)(ip2 + (i + 4) * is2));
+                npy_uint v5 = *((npy_uint *)(ip2 + (i + 5) * is2));
+                npy_uint v6 = *((npy_uint *)(ip2 + (i + 6) * is2));
+                npy_uint v7 = *((npy_uint *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            // combine the eight accumulators pairwise down to m0
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+            // fold the partial result into the value already stored at op1
+             *((npy_uint *)op1) = SCALAR_OP(*((npy_uint *)op1), m0);
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1));
+            npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2));
+            *((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1));
+            npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2));
+            *((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1));
+            npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2));
+            *((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1));
+            npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2));
+            *((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    // the unrolled loops index from the base pointers, so move the pointers
+    // past the i elements they consumed before running the tail loop
+    ip1 += is1 * i;
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
+        const npy_uint in1 = *(npy_uint *)ip1;
+        const npy_uint in2 = *(npy_uint *)ip2;
+        *((npy_uint *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:
+    npyv_cleanup();
+#endif
+// float-status clearing is compiled out (#if 0) in this instantiation
+#if 0
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+/*
+ * Indexed variant of the npy_uint `minimum` loop.
+ *
+ * For each of the dimensions[0] entries, reads an npy_intp index from args[1]
+ * and an npy_uint value from args[2], then stores
+ * SCALAR_OP(target, value) back into the element of args[0] selected by that
+ * index. steps[0..2] are the byte strides of the three operands; a negative
+ * index is shifted by steps[3] before use. Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_minimum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        /* negative indices are shifted by `wrap` before addressing */
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + wrap : pos;
+
+        npy_uint *slot = (npy_uint *)(base + base_stride * pos);
+        const npy_uint rhs = *(const npy_uint *)val_ptr;
+        *slot = SCALAR_OP(*slot, rhs);
+
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_maxp_i
+
+/*
+ * Inner loop for `fmax` on npy_uint data; SCALAR_OP is scalar_maxp_i here.
+ *
+ * NOTE: the enclosing `#if !1 || (0 && 1)` evaluates to false, so this
+ * definition is compiled out in this instantiation.
+ *
+ * args[0] and args[1] are the input pointers, args[2] the output pointer;
+ * steps[0..2] are the matching strides in bytes and dimensions[0] is the
+ * number of elements.
+ *
+ * Paths are tried in this order:
+ *   1. If TO_SIMD_SFX is defined: a SIMD kernel for a contiguous reduction,
+ *      or for contiguous binary operands that do not overlap the output.
+ *      Both jump straight to clear_fp.
+ *   2. Unless NPY_DISABLE_OPTIMIZATION is defined: scalar loops unrolled 8x
+ *      (reduction) or 4x (binary).
+ *   3. A plain scalar loop that handles every element not consumed above.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_fmax)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    // number of elements already handled; advanced by the unrolled loops
+    npy_intp i = 0;
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_uint)) {
+            TO_SIMD_SFX(simd_reduce_c_maxp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_uint, npy_uint)) {
+            TO_SIMD_SFX(simd_binary_ccc_maxp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    // the trailing "&& 0" disables this strided SIMD branch in this instantiation
+    #if !defined(NPY_HAVE_NEON) && 0
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_maxp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){
+            // seed eight independent accumulators from the first eight elements
+            npy_uint m0 = *((npy_uint *)(ip2 + (i + 0) * is2));
+            npy_uint m1 = *((npy_uint *)(ip2 + (i + 1) * is2));
+            npy_uint m2 = *((npy_uint *)(ip2 + (i + 2) * is2));
+            npy_uint m3 = *((npy_uint *)(ip2 + (i + 3) * is2));
+            npy_uint m4 = *((npy_uint *)(ip2 + (i + 4) * is2));
+            npy_uint m5 = *((npy_uint *)(ip2 + (i + 5) * is2));
+            npy_uint m6 = *((npy_uint *)(ip2 + (i + 6) * is2));
+            npy_uint m7 = *((npy_uint *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_uint v0 = *((npy_uint *)(ip2 + (i + 0) * is2));
+                npy_uint v1 = *((npy_uint *)(ip2 + (i + 1) * is2));
+                npy_uint v2 = *((npy_uint *)(ip2 + (i + 2) * is2));
+                npy_uint v3 = *((npy_uint *)(ip2 + (i + 3) * is2));
+                npy_uint v4 = *((npy_uint *)(ip2 + (i + 4) * is2));
+                npy_uint v5 = *((npy_uint *)(ip2 + (i + 5) * is2));
+                npy_uint v6 = *((npy_uint *)(ip2 + (i + 6) * is2));
+                npy_uint v7 = *((npy_uint *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            // combine the eight accumulators pairwise down to m0
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+            // fold the partial result into the value already stored at op1
+             *((npy_uint *)op1) = SCALAR_OP(*((npy_uint *)op1), m0);
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1));
+            npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2));
+            *((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1));
+            npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2));
+            *((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1));
+            npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2));
+            *((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1));
+            npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2));
+            *((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    // the unrolled loops index from the base pointers, so move the pointers
+    // past the i elements they consumed before running the tail loop
+    ip1 += is1 * i;
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
+        const npy_uint in1 = *(npy_uint *)ip1;
+        const npy_uint in2 = *(npy_uint *)ip2;
+        *((npy_uint *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:
+    npyv_cleanup();
+#endif
+// float-status clearing is compiled out (#if 0) in this instantiation
+#if 0
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+/*
+ * Indexed variant of the npy_uint `fmax` loop (compiled out here: the
+ * enclosing `#if !1 || (0 && 1)` is false).
+ *
+ * For each of the dimensions[0] entries, reads an npy_intp index from args[1]
+ * and an npy_uint value from args[2], then stores
+ * SCALAR_OP(target, value) back into the element of args[0] selected by that
+ * index. steps[0..2] are the byte strides of the three operands; a negative
+ * index is shifted by steps[3] before use. Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmax_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        /* negative indices are shifted by `wrap` before addressing */
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + wrap : pos;
+
+        npy_uint *slot = (npy_uint *)(base + base_stride * pos);
+        const npy_uint rhs = *(const npy_uint *)val_ptr;
+        *slot = SCALAR_OP(*slot, rhs);
+
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_minp_i
+
+/*
+ * Inner loop for `fmin` on npy_uint data; SCALAR_OP is scalar_minp_i here.
+ *
+ * NOTE: the enclosing `#if !1 || (0 && 1)` evaluates to false, so this
+ * definition is compiled out in this instantiation.
+ *
+ * args[0] and args[1] are the input pointers, args[2] the output pointer;
+ * steps[0..2] are the matching strides in bytes and dimensions[0] is the
+ * number of elements.
+ *
+ * Paths are tried in this order:
+ *   1. If TO_SIMD_SFX is defined: a SIMD kernel for a contiguous reduction,
+ *      or for contiguous binary operands that do not overlap the output.
+ *      Both jump straight to clear_fp.
+ *   2. Unless NPY_DISABLE_OPTIMIZATION is defined: scalar loops unrolled 8x
+ *      (reduction) or 4x (binary).
+ *   3. A plain scalar loop that handles every element not consumed above.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_fmin)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    // number of elements already handled; advanced by the unrolled loops
+    npy_intp i = 0;
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_uint)) {
+            TO_SIMD_SFX(simd_reduce_c_minp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_uint, npy_uint)) {
+            TO_SIMD_SFX(simd_binary_ccc_minp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    // the trailing "&& 0" disables this strided SIMD branch in this instantiation
+    #if !defined(NPY_HAVE_NEON) && 0
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_minp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){
+            // seed eight independent accumulators from the first eight elements
+            npy_uint m0 = *((npy_uint *)(ip2 + (i + 0) * is2));
+            npy_uint m1 = *((npy_uint *)(ip2 + (i + 1) * is2));
+            npy_uint m2 = *((npy_uint *)(ip2 + (i + 2) * is2));
+            npy_uint m3 = *((npy_uint *)(ip2 + (i + 3) * is2));
+            npy_uint m4 = *((npy_uint *)(ip2 + (i + 4) * is2));
+            npy_uint m5 = *((npy_uint *)(ip2 + (i + 5) * is2));
+            npy_uint m6 = *((npy_uint *)(ip2 + (i + 6) * is2));
+            npy_uint m7 = *((npy_uint *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_uint v0 = *((npy_uint *)(ip2 + (i + 0) * is2));
+                npy_uint v1 = *((npy_uint *)(ip2 + (i + 1) * is2));
+                npy_uint v2 = *((npy_uint *)(ip2 + (i + 2) * is2));
+                npy_uint v3 = *((npy_uint *)(ip2 + (i + 3) * is2));
+                npy_uint v4 = *((npy_uint *)(ip2 + (i + 4) * is2));
+                npy_uint v5 = *((npy_uint *)(ip2 + (i + 5) * is2));
+                npy_uint v6 = *((npy_uint *)(ip2 + (i + 6) * is2));
+                npy_uint v7 = *((npy_uint *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            // combine the eight accumulators pairwise down to m0
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+            // fold the partial result into the value already stored at op1
+             *((npy_uint *)op1) = SCALAR_OP(*((npy_uint *)op1), m0);
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_uint v0 = *((npy_uint *)(ip1 + (i + 0) * is1));
+            npy_uint u0 = *((npy_uint *)(ip2 + (i + 0) * is2));
+            *((npy_uint *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_uint v1 = *((npy_uint *)(ip1 + (i + 1) * is1));
+            npy_uint u1 = *((npy_uint *)(ip2 + (i + 1) * is2));
+            *((npy_uint *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_uint v2 = *((npy_uint *)(ip1 + (i + 2) * is1));
+            npy_uint u2 = *((npy_uint *)(ip2 + (i + 2) * is2));
+            *((npy_uint *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_uint v3 = *((npy_uint *)(ip1 + (i + 3) * is1));
+            npy_uint u3 = *((npy_uint *)(ip2 + (i + 3) * is2));
+            *((npy_uint *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    // the unrolled loops index from the base pointers, so move the pointers
+    // past the i elements they consumed before running the tail loop
+    ip1 += is1 * i;
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
+        const npy_uint in1 = *(npy_uint *)ip1;
+        const npy_uint in2 = *(npy_uint *)ip2;
+        *((npy_uint *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:
+    npyv_cleanup();
+#endif
+// float-status clearing is compiled out (#if 0) in this instantiation
+#if 0
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+/*
+ * Indexed variant of the npy_uint `fmin` loop (compiled out here: the
+ * enclosing `#if !1 || (0 && 1)` is false).
+ *
+ * For each of the dimensions[0] entries, reads an npy_intp index from args[1]
+ * and an npy_uint value from args[2], then stores
+ * SCALAR_OP(target, value) back into the element of args[0] selected by that
+ * index. steps[0..2] are the byte strides of the three operands; a negative
+ * index is shifted by steps[3] before use. Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(UINT_fmin_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        /* negative indices are shifted by `wrap` before addressing */
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + wrap : pos;
+
+        npy_uint *slot = (npy_uint *)(base + base_stride * pos);
+        const npy_uint rhs = *(const npy_uint *)val_ptr;
+        *slot = SCALAR_OP(*slot, rhs);
+
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+
+/*
+ * Re-select TO_SIMD_SFX for the loops that follow. The macro is first
+ * undefined, then redefined only when NPY_SIMD is non-zero, with the suffix
+ * chosen by the bit width in NPY_BITSOF_LONG (8/16/32/64). If no arm matches,
+ * TO_SIMD_SFX stays undefined and the following loops skip their
+ * `#ifdef TO_SIMD_SFX` sections.
+ * In this instantiation the inner `#if 0` floating-point arm is dead and the
+ * `#elif 1` arm always picks the unsigned `_uN` suffix, so the `_sN` arm is
+ * never reached.
+ */
+#line 294
+#undef TO_SIMD_SFX
+#if 0
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONG == 8
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1
+        #define TO_SIMD_SFX(X) X##_u8
+    #else
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONG == 16
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONG == 32
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONG == 64
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_max_i
+
+/*
+ * Inner loop for `maximum` on npy_ulong data; SCALAR_OP is scalar_max_i here.
+ *
+ * args[0] and args[1] are the input pointers, args[2] the output pointer;
+ * steps[0..2] are the matching strides in bytes and dimensions[0] is the
+ * number of elements.
+ *
+ * Paths are tried in this order:
+ *   1. If TO_SIMD_SFX is defined: a SIMD kernel for a contiguous reduction,
+ *      or for contiguous binary operands that do not overlap the output.
+ *      Both jump straight to clear_fp.
+ *   2. Unless NPY_DISABLE_OPTIMIZATION is defined: scalar loops unrolled 8x
+ *      (reduction) or 4x (binary).
+ *   3. A plain scalar loop that handles every element not consumed above.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_maximum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    // number of elements already handled; advanced by the unrolled loops
+    npy_intp i = 0;
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_ulong)) {
+            TO_SIMD_SFX(simd_reduce_c_max)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_ulong, npy_ulong)) {
+            TO_SIMD_SFX(simd_binary_ccc_max)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    // the trailing "&& 0" disables this strided SIMD branch in this instantiation
+    #if !defined(NPY_HAVE_NEON) && 0
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_max)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){
+            // seed eight independent accumulators from the first eight elements
+            npy_ulong m0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
+            npy_ulong m1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
+            npy_ulong m2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
+            npy_ulong m3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
+            npy_ulong m4 = *((npy_ulong *)(ip2 + (i + 4) * is2));
+            npy_ulong m5 = *((npy_ulong *)(ip2 + (i + 5) * is2));
+            npy_ulong m6 = *((npy_ulong *)(ip2 + (i + 6) * is2));
+            npy_ulong m7 = *((npy_ulong *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_ulong v0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
+                npy_ulong v1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
+                npy_ulong v2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
+                npy_ulong v3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
+                npy_ulong v4 = *((npy_ulong *)(ip2 + (i + 4) * is2));
+                npy_ulong v5 = *((npy_ulong *)(ip2 + (i + 5) * is2));
+                npy_ulong v6 = *((npy_ulong *)(ip2 + (i + 6) * is2));
+                npy_ulong v7 = *((npy_ulong *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            // combine the eight accumulators pairwise down to m0
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+            // fold the partial result into the value already stored at op1
+             *((npy_ulong *)op1) = SCALAR_OP(*((npy_ulong *)op1), m0);
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1));
+            npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
+            *((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1));
+            npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
+            *((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1));
+            npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
+            *((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1));
+            npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
+            *((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    // the unrolled loops index from the base pointers, so move the pointers
+    // past the i elements they consumed before running the tail loop
+    ip1 += is1 * i;
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
+        const npy_ulong in1 = *(npy_ulong *)ip1;
+        const npy_ulong in2 = *(npy_ulong *)ip2;
+        *((npy_ulong *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:
+    npyv_cleanup();
+#endif
+// float-status clearing is compiled out (#if 0) in this instantiation
+#if 0
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+/*
+ * Indexed variant of the npy_ulong `maximum` loop.
+ *
+ * For each of the dimensions[0] entries, reads an npy_intp index from args[1]
+ * and an npy_ulong value from args[2], then stores
+ * SCALAR_OP(target, value) back into the element of args[0] selected by that
+ * index. steps[0..2] are the byte strides of the three operands; a negative
+ * index is shifted by steps[3] before use. Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_maximum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        /* negative indices are shifted by `wrap` before addressing */
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        pos = (pos < 0) ? pos + wrap : pos;
+
+        npy_ulong *slot = (npy_ulong *)(base + base_stride * pos);
+        const npy_ulong rhs = *(const npy_ulong *)val_ptr;
+        *slot = SCALAR_OP(*slot, rhs);
+
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_min_i
+
+/*
+ * Inner loop for `minimum` on npy_ulong data; SCALAR_OP is scalar_min_i here.
+ *
+ * args[0] and args[1] are the input pointers, args[2] the output pointer;
+ * steps[0..2] are the matching strides in bytes and dimensions[0] is the
+ * number of elements.
+ *
+ * Paths are tried in this order:
+ *   1. If TO_SIMD_SFX is defined: a SIMD kernel for a contiguous reduction,
+ *      or for contiguous binary operands that do not overlap the output.
+ *      Both jump straight to clear_fp.
+ *   2. Unless NPY_DISABLE_OPTIMIZATION is defined: scalar loops unrolled 8x
+ *      (reduction) or 4x (binary).
+ *   3. A plain scalar loop that handles every element not consumed above.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_minimum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    // number of elements already handled; advanced by the unrolled loops
+    npy_intp i = 0;
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_ulong)) {
+            TO_SIMD_SFX(simd_reduce_c_min)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_ulong, npy_ulong)) {
+            TO_SIMD_SFX(simd_binary_ccc_min)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    // the trailing "&& 0" disables this strided SIMD branch in this instantiation
+    #if !defined(NPY_HAVE_NEON) && 0
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_min)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){
+            // seed eight independent accumulators from the first eight elements
+            npy_ulong m0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
+            npy_ulong m1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
+            npy_ulong m2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
+            npy_ulong m3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
+            npy_ulong m4 = *((npy_ulong *)(ip2 + (i + 4) * is2));
+            npy_ulong m5 = *((npy_ulong *)(ip2 + (i + 5) * is2));
+            npy_ulong m6 = *((npy_ulong *)(ip2 + (i + 6) * is2));
+            npy_ulong m7 = *((npy_ulong *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_ulong v0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
+                npy_ulong v1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
+                npy_ulong v2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
+                npy_ulong v3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
+                npy_ulong v4 = *((npy_ulong *)(ip2 + (i + 4) * is2));
+                npy_ulong v5 = *((npy_ulong *)(ip2 + (i + 5) * is2));
+                npy_ulong v6 = *((npy_ulong *)(ip2 + (i + 6) * is2));
+                npy_ulong v7 = *((npy_ulong *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            // combine the eight accumulators pairwise down to m0
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+            // fold the partial result into the value already stored at op1
+             *((npy_ulong *)op1) = SCALAR_OP(*((npy_ulong *)op1), m0);
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1));
+            npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
+            *((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1));
+            npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
+            *((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1));
+            npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
+            *((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1));
+            npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
+            *((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    // the unrolled loops index from the base pointers, so move the pointers
+    // past the i elements they consumed before running the tail loop
+    ip1 += is1 * i;
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
+        const npy_ulong in1 = *(npy_ulong *)ip1;
+        const npy_ulong in2 = *(npy_ulong *)ip2;
+        *((npy_ulong *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:
+    npyv_cleanup();
+#endif
+// float-status clearing is compiled out (#if 0) in this instantiation
+#if 0
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_minimum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* Indexed in-place update: for each (index, value) pair, combine the value
+     * into the addressed npy_ulong element of args[0] via SCALAR_OP. Returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp elem_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining = dimensions[0];
+
+    for (; remaining > 0; --remaining, idx_ptr += idx_stride, val_ptr += val_stride) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_ulong *const slot = (npy_ulong *)(base + elem_stride * pos);
+
+        *slot = SCALAR_OP(*slot, *(const npy_ulong *)val_ptr);
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_maxp_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_fmax)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // npy_ulong loop applying SCALAR_OP (scalar_maxp_i here) element-wise or as a fold into *op1; the enclosing "#if !1 || (0 && 1)" is 0, so this body is compiled out
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // first input, second input, output (byte pointers)
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // per-element strides in bytes
+             len = dimensions[0];  // number of elements to process
+    npy_intp i = 0;  // elements already handled by the unrolled scalar paths
+#ifdef TO_SIMD_SFX  // vector paths exist only when a SIMD suffix was selected for this type
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous: ip2 has unit element stride, so the whole run goes to the SIMD reducer
+        if (is2 == sizeof(npy_ulong)) {
+            TO_SIMD_SFX(simd_reduce_c_maxp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished; jump past the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_ulong, npy_ulong)) {
+            TO_SIMD_SFX(simd_binary_ccc_maxp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished; jump past the scalar paths
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the literal "&& 0" keeps this strided SIMD path compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_maxp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // only when at least one full group of 8 exists
+            npy_ulong m0 = *((npy_ulong *)(ip2 + (i + 0) * is2));  // seed eight independent accumulators from the first group
+            npy_ulong m1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
+            npy_ulong m2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
+            npy_ulong m3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
+            npy_ulong m4 = *((npy_ulong *)(ip2 + (i + 4) * is2));
+            npy_ulong m5 = *((npy_ulong *)(ip2 + (i + 5) * is2));
+            npy_ulong m6 = *((npy_ulong *)(ip2 + (i + 6) * is2));
+            npy_ulong m7 = *((npy_ulong *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  // the seed group is consumed
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold each further full group of 8: v_k into m_k
+                npy_ulong v0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
+                npy_ulong v1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
+                npy_ulong v2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
+                npy_ulong v3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
+                npy_ulong v4 = *((npy_ulong *)(ip2 + (i + 4) * is2));
+                npy_ulong v5 = *((npy_ulong *)(ip2 + (i + 5) * is2));
+                npy_ulong v6 = *((npy_ulong *)(ip2 + (i + 6) * is2));
+                npy_ulong v7 = *((npy_ulong *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise combine: 8 partials -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_ulong *)op1) = SCALAR_OP(*((npy_ulong *)op1), m0);  // merge with the value already stored at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // four elements per pass, each stored before the next is loaded
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1));
+            npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
+            *((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1));
+            npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
+            *((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1));
+            npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
+            *((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1));
+            npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
+            *((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // skip what the unrolled loops already processed
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // one element at a time for whatever remains
+        const npy_ulong in1 = *(npy_ulong *)ip1;
+        const npy_ulong in2 = *(npy_ulong *)ip2;
+        *((npy_ulong *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:
+    npyv_cleanup();
+#endif
+#if 0  // floating-point status clear is switched off in this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmax_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* Indexed in-place update: for each (index, value) pair, combine the value
+     * into the addressed npy_ulong element of args[0] via SCALAR_OP. Returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp elem_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining = dimensions[0];
+
+    for (; remaining > 0; --remaining, idx_ptr += idx_stride, val_ptr += val_stride) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_ulong *const slot = (npy_ulong *)(base + elem_stride * pos);
+
+        *slot = SCALAR_OP(*slot, *(const npy_ulong *)val_ptr);
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_minp_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_fmin)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // npy_ulong loop applying SCALAR_OP (scalar_minp_i here) element-wise or as a fold into *op1; the enclosing "#if !1 || (0 && 1)" is 0, so this body is compiled out
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // first input, second input, output (byte pointers)
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // per-element strides in bytes
+             len = dimensions[0];  // number of elements to process
+    npy_intp i = 0;  // elements already handled by the unrolled scalar paths
+#ifdef TO_SIMD_SFX  // vector paths exist only when a SIMD suffix was selected for this type
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous: ip2 has unit element stride, so the whole run goes to the SIMD reducer
+        if (is2 == sizeof(npy_ulong)) {
+            TO_SIMD_SFX(simd_reduce_c_minp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished; jump past the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_ulong, npy_ulong)) {
+            TO_SIMD_SFX(simd_binary_ccc_minp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished; jump past the scalar paths
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the literal "&& 0" keeps this strided SIMD path compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_minp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // only when at least one full group of 8 exists
+            npy_ulong m0 = *((npy_ulong *)(ip2 + (i + 0) * is2));  // seed eight independent accumulators from the first group
+            npy_ulong m1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
+            npy_ulong m2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
+            npy_ulong m3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
+            npy_ulong m4 = *((npy_ulong *)(ip2 + (i + 4) * is2));
+            npy_ulong m5 = *((npy_ulong *)(ip2 + (i + 5) * is2));
+            npy_ulong m6 = *((npy_ulong *)(ip2 + (i + 6) * is2));
+            npy_ulong m7 = *((npy_ulong *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  // the seed group is consumed
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold each further full group of 8: v_k into m_k
+                npy_ulong v0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
+                npy_ulong v1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
+                npy_ulong v2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
+                npy_ulong v3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
+                npy_ulong v4 = *((npy_ulong *)(ip2 + (i + 4) * is2));
+                npy_ulong v5 = *((npy_ulong *)(ip2 + (i + 5) * is2));
+                npy_ulong v6 = *((npy_ulong *)(ip2 + (i + 6) * is2));
+                npy_ulong v7 = *((npy_ulong *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise combine: 8 partials -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_ulong *)op1) = SCALAR_OP(*((npy_ulong *)op1), m0);  // merge with the value already stored at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // four elements per pass, each stored before the next is loaded
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_ulong v0 = *((npy_ulong *)(ip1 + (i + 0) * is1));
+            npy_ulong u0 = *((npy_ulong *)(ip2 + (i + 0) * is2));
+            *((npy_ulong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_ulong v1 = *((npy_ulong *)(ip1 + (i + 1) * is1));
+            npy_ulong u1 = *((npy_ulong *)(ip2 + (i + 1) * is2));
+            *((npy_ulong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_ulong v2 = *((npy_ulong *)(ip1 + (i + 2) * is1));
+            npy_ulong u2 = *((npy_ulong *)(ip2 + (i + 2) * is2));
+            *((npy_ulong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_ulong v3 = *((npy_ulong *)(ip1 + (i + 3) * is1));
+            npy_ulong u3 = *((npy_ulong *)(ip2 + (i + 3) * is2));
+            *((npy_ulong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // skip what the unrolled loops already processed
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // one element at a time for whatever remains
+        const npy_ulong in1 = *(npy_ulong *)ip1;
+        const npy_ulong in2 = *(npy_ulong *)ip2;
+        *((npy_ulong *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:
+    npyv_cleanup();
+#endif
+#if 0  // floating-point status clear is switched off in this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONG_fmin_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* Indexed in-place update: for each (index, value) pair, combine the value
+     * into the addressed npy_ulong element of args[0] via SCALAR_OP. Returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp elem_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining = dimensions[0];
+
+    for (; remaining > 0; --remaining, idx_ptr += idx_stride, val_ptr += val_stride) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_ulong *const slot = (npy_ulong *)(base + elem_stride * pos);
+
+        *slot = SCALAR_OP(*slot, *(const npy_ulong *)val_ptr);
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+
+#line 294
+#undef TO_SIMD_SFX  // start clean; stays undefined unless one of the width branches below matches
+#if 0  // always-false head; the width cases follow as #elif branches
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8
+    #if 0  // "_f*" suffix branch is switched off here
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // active branch: "_u*" suffix
+        #define TO_SIMD_SFX(X) X##_u8
+    #else
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16
+    #if 0  // "_f*" suffix branch is switched off here
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // active branch: "_u*" suffix
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32
+    #if 0  // "_f*" suffix branch is switched off here
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // active branch: "_u*" suffix
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64
+    #if 0  // "_f*" suffix branch is switched off here
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // active branch: "_u*" suffix
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif  // TO_SIMD_SFX is left undefined when NPY_SIMD is 0 or no width above matched
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_max_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_maximum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // npy_ulonglong loop applying SCALAR_OP (scalar_max_i here) element-wise, or as a fold of the ip2 stream into *op1 when IS_BINARY_REDUCE holds
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // first input, second input, output (byte pointers)
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // per-element strides in bytes
+             len = dimensions[0];  // number of elements to process
+    npy_intp i = 0;  // elements already handled by the unrolled scalar paths
+#ifdef TO_SIMD_SFX  // vector paths exist only when a SIMD suffix was selected for this type
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous: ip2 has unit element stride, so the whole run goes to the SIMD reducer
+        if (is2 == sizeof(npy_ulonglong)) {
+            TO_SIMD_SFX(simd_reduce_c_max)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished; jump past the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_ulonglong, npy_ulonglong)) {
+            TO_SIMD_SFX(simd_binary_ccc_max)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished; jump past the scalar paths
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the literal "&& 0" keeps this strided SIMD path compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_max)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // only when at least one full group of 8 exists
+            npy_ulonglong m0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));  // seed eight independent accumulators from the first group
+            npy_ulonglong m1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
+            npy_ulonglong m2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
+            npy_ulonglong m3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
+            npy_ulonglong m4 = *((npy_ulonglong *)(ip2 + (i + 4) * is2));
+            npy_ulonglong m5 = *((npy_ulonglong *)(ip2 + (i + 5) * is2));
+            npy_ulonglong m6 = *((npy_ulonglong *)(ip2 + (i + 6) * is2));
+            npy_ulonglong m7 = *((npy_ulonglong *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  // the seed group is consumed
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold each further full group of 8: v_k into m_k
+                npy_ulonglong v0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
+                npy_ulonglong v1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
+                npy_ulonglong v2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
+                npy_ulonglong v3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
+                npy_ulonglong v4 = *((npy_ulonglong *)(ip2 + (i + 4) * is2));
+                npy_ulonglong v5 = *((npy_ulonglong *)(ip2 + (i + 5) * is2));
+                npy_ulonglong v6 = *((npy_ulonglong *)(ip2 + (i + 6) * is2));
+                npy_ulonglong v7 = *((npy_ulonglong *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise combine: 8 partials -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_ulonglong *)op1) = SCALAR_OP(*((npy_ulonglong *)op1), m0);  // merge with the value already stored at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // four elements per pass, each stored before the next is loaded
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1));
+            npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
+            *((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1));
+            npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
+            *((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1));
+            npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
+            *((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1));
+            npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
+            *((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // skip what the unrolled loops already processed
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // one element at a time for whatever remains
+        const npy_ulonglong in1 = *(npy_ulonglong *)ip1;
+        const npy_ulonglong in2 = *(npy_ulonglong *)ip2;
+        *((npy_ulonglong *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:
+    npyv_cleanup();
+#endif
+#if 0  // floating-point status clear is switched off in this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_maximum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* Indexed in-place update: for each (index, value) pair, combine the value
+     * into the addressed npy_ulonglong element of args[0] via SCALAR_OP. Returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp elem_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining = dimensions[0];
+
+    for (; remaining > 0; --remaining, idx_ptr += idx_stride, val_ptr += val_stride) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_ulonglong *const slot = (npy_ulonglong *)(base + elem_stride * pos);
+
+        *slot = SCALAR_OP(*slot, *(const npy_ulonglong *)val_ptr);
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_min_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_minimum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // npy_ulonglong loop applying SCALAR_OP (scalar_min_i here) element-wise, or as a fold of the ip2 stream into *op1 when IS_BINARY_REDUCE holds
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // first input, second input, output (byte pointers)
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // per-element strides in bytes
+             len = dimensions[0];  // number of elements to process
+    npy_intp i = 0;  // elements already handled by the unrolled scalar paths
+#ifdef TO_SIMD_SFX  // vector paths exist only when a SIMD suffix was selected for this type
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous: ip2 has unit element stride, so the whole run goes to the SIMD reducer
+        if (is2 == sizeof(npy_ulonglong)) {
+            TO_SIMD_SFX(simd_reduce_c_min)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished; jump past the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_ulonglong, npy_ulonglong)) {
+            TO_SIMD_SFX(simd_binary_ccc_min)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // finished; jump past the scalar paths
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the literal "&& 0" keeps this strided SIMD path compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_min)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // only when at least one full group of 8 exists
+            npy_ulonglong m0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));  // seed eight independent accumulators from the first group
+            npy_ulonglong m1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
+            npy_ulonglong m2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
+            npy_ulonglong m3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
+            npy_ulonglong m4 = *((npy_ulonglong *)(ip2 + (i + 4) * is2));
+            npy_ulonglong m5 = *((npy_ulonglong *)(ip2 + (i + 5) * is2));
+            npy_ulonglong m6 = *((npy_ulonglong *)(ip2 + (i + 6) * is2));
+            npy_ulonglong m7 = *((npy_ulonglong *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  // the seed group is consumed
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold each further full group of 8: v_k into m_k
+                npy_ulonglong v0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
+                npy_ulonglong v1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
+                npy_ulonglong v2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
+                npy_ulonglong v3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
+                npy_ulonglong v4 = *((npy_ulonglong *)(ip2 + (i + 4) * is2));
+                npy_ulonglong v5 = *((npy_ulonglong *)(ip2 + (i + 5) * is2));
+                npy_ulonglong v6 = *((npy_ulonglong *)(ip2 + (i + 6) * is2));
+                npy_ulonglong v7 = *((npy_ulonglong *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise combine: 8 partials -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_ulonglong *)op1) = SCALAR_OP(*((npy_ulonglong *)op1), m0);  // merge with the value already stored at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // four elements per pass, each stored before the next is loaded
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1));
+            npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
+            *((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1));
+            npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
+            *((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1));
+            npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
+            *((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1));
+            npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
+            *((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // skip what the unrolled loops already processed
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // one element at a time for whatever remains
+        const npy_ulonglong in1 = *(npy_ulonglong *)ip1;
+        const npy_ulonglong in2 = *(npy_ulonglong *)ip2;
+        *((npy_ulonglong *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:
+    npyv_cleanup();
+#endif
+#if 0  // floating-point status clear is switched off in this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_minimum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* Indexed in-place update: for each (index, value) pair, combine the value
+     * into the addressed npy_ulonglong element of args[0] via SCALAR_OP. Returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp elem_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices */
+    npy_intp remaining = dimensions[0];
+
+    for (; remaining > 0; --remaining, idx_ptr += idx_stride, val_ptr += val_stride) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_ulonglong *const slot = (npy_ulonglong *)(base + elem_stride * pos);
+
+        *slot = SCALAR_OP(*slot, *(const npy_ulonglong *)val_ptr);
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_maxp_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmax)  // binary inner loop: out[k] = SCALAR_OP(in1[k], in2[k]) on npy_ulonglong, plus a reduction fast path
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))  // args = {in1, in2, out}; dimensions[0] = element count; steps[0..2] = byte strides
+{  // NOTE: this definition sits under "#if !1 || (0 && 1)" (constant-false), so the preprocessor discards it; SCALAR_OP is scalar_maxp_i here
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    npy_intp i = 0;  // next unprocessed element; shared by the unrolled code and the tail loop
+#ifdef TO_SIMD_SFX  // vector paths are built only when a SIMD suffix macro is defined
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)  // lane type the operand pointers are cast to
+    if (IS_BINARY_REDUCE) {  // macro defined outside this view; this branch reads ip2 and accumulates into op1
+        // reduce and contiguous
+        if (is2 == sizeof(npy_ulonglong)) {
+            TO_SIMD_SFX(simd_reduce_c_maxp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // the SIMD helper was given all len elements; skip the scalar code
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_ulonglong, npy_ulonglong)) {
+            TO_SIMD_SFX(simd_binary_ccc_maxp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // the SIMD helper was given all len elements; skip the scalar code
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the trailing "&& 0" makes this constant-false: the strided SIMD path is compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_maxp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least 8 elements are needed to seed the accumulators
+            npy_ulonglong m0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));  // m0..m7: eight independent running results
+            npy_ulonglong m1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
+            npy_ulonglong m2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
+            npy_ulonglong m3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
+            npy_ulonglong m4 = *((npy_ulonglong *)(ip2 + (i + 4) * is2));
+            npy_ulonglong m5 = *((npy_ulonglong *)(ip2 + (i + 5) * is2));
+            npy_ulonglong m6 = *((npy_ulonglong *)(ip2 + (i + 6) * is2));
+            npy_ulonglong m7 = *((npy_ulonglong *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold each further full group of 8 into the accumulators
+                npy_ulonglong v0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
+                npy_ulonglong v1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
+                npy_ulonglong v2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
+                npy_ulonglong v3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
+                npy_ulonglong v4 = *((npy_ulonglong *)(ip2 + (i + 4) * is2));
+                npy_ulonglong v5 = *((npy_ulonglong *)(ip2 + (i + 5) * is2));
+                npy_ulonglong v6 = *((npy_ulonglong *)(ip2 + (i + 6) * is2));
+                npy_ulonglong v7 = *((npy_ulonglong *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise combine: 8 partial results -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_ulonglong *)op1) = SCALAR_OP(*((npy_ulonglong *)op1), m0);  // merge with the value already held in the output
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1));
+            npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
+            *((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);  // stored before the next element is loaded
+            
+#line 431
+            npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1));
+            npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
+            *((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1));
+            npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
+            *((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1));
+            npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
+            *((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // skip the elements the unrolled code already handled (i is 0 if none ran)
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // scalar tail; NOTE(review): for reductions this presumably relies on ip1 aliasing op1 with zero stride - confirm against IS_BINARY_REDUCE
+        const npy_ulonglong in1 = *(npy_ulonglong *)ip1;
+        const npy_ulonglong in2 = *(npy_ulonglong *)ip2;
+        *((npy_ulonglong *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // target of the SIMD early exits above
+    npyv_cleanup();
+#endif
+#if 0  // constant-false: the float-status clear below is not compiled
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmax_indexed)  // indexed in-place update: base[idx] = SCALAR_OP(base[idx], value) for each (idx, value) pair
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))  // args = {base, indices, values}; steps[0..2] = their byte strides
+{  // NOTE: still inside the constant-false "#if !1 || (0 && 1)" guard opened above, so this definition is not compiled
+    char *ip1 = args[0];
+    char *indxp = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];  // number of (index, value) pairs to apply
+    npy_intp shape = steps[3];  // fourth steps entry is used as the length for wrapping negative indices
+    npy_intp i;
+    npy_ulonglong *indexed;
+    for(i = 0; i < n; i++, indxp += isindex, value += isb) {
+        npy_intp indx = *(npy_intp *)indxp;
+        if (indx < 0) {
+            indx += shape;  // a negative index counts back from the end
+        }
+        indexed = (npy_ulonglong *)(ip1 + is1 * indx);  // no bounds check here; NOTE(review): presumably the caller validates indices - confirm
+        *indexed = SCALAR_OP(*indexed, *(npy_ulonglong *)value);  // pairs are applied sequentially, so repeated indices build on earlier results
+    }
+    return 0;  // unconditional; this function has no error path
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_minp_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin)  // binary inner loop: out[k] = SCALAR_OP(in1[k], in2[k]) on npy_ulonglong, plus a reduction fast path
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))  // args = {in1, in2, out}; dimensions[0] = element count; steps[0..2] = byte strides
+{  // NOTE: this definition sits under "#if !1 || (0 && 1)" (constant-false), so the preprocessor discards it; SCALAR_OP is scalar_minp_i here
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    npy_intp i = 0;  // next unprocessed element; shared by the unrolled code and the tail loop
+#ifdef TO_SIMD_SFX  // vector paths are built only when a SIMD suffix macro is defined
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)  // lane type the operand pointers are cast to
+    if (IS_BINARY_REDUCE) {  // macro defined outside this view; this branch reads ip2 and accumulates into op1
+        // reduce and contiguous
+        if (is2 == sizeof(npy_ulonglong)) {
+            TO_SIMD_SFX(simd_reduce_c_minp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // the SIMD helper was given all len elements; skip the scalar code
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_ulonglong, npy_ulonglong)) {
+            TO_SIMD_SFX(simd_binary_ccc_minp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // the SIMD helper was given all len elements; skip the scalar code
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the trailing "&& 0" makes this constant-false: the strided SIMD path is compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_minp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least 8 elements are needed to seed the accumulators
+            npy_ulonglong m0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));  // m0..m7: eight independent running results
+            npy_ulonglong m1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
+            npy_ulonglong m2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
+            npy_ulonglong m3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
+            npy_ulonglong m4 = *((npy_ulonglong *)(ip2 + (i + 4) * is2));
+            npy_ulonglong m5 = *((npy_ulonglong *)(ip2 + (i + 5) * is2));
+            npy_ulonglong m6 = *((npy_ulonglong *)(ip2 + (i + 6) * is2));
+            npy_ulonglong m7 = *((npy_ulonglong *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold each further full group of 8 into the accumulators
+                npy_ulonglong v0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
+                npy_ulonglong v1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
+                npy_ulonglong v2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
+                npy_ulonglong v3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
+                npy_ulonglong v4 = *((npy_ulonglong *)(ip2 + (i + 4) * is2));
+                npy_ulonglong v5 = *((npy_ulonglong *)(ip2 + (i + 5) * is2));
+                npy_ulonglong v6 = *((npy_ulonglong *)(ip2 + (i + 6) * is2));
+                npy_ulonglong v7 = *((npy_ulonglong *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise combine: 8 partial results -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_ulonglong *)op1) = SCALAR_OP(*((npy_ulonglong *)op1), m0);  // merge with the value already held in the output
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_ulonglong v0 = *((npy_ulonglong *)(ip1 + (i + 0) * is1));
+            npy_ulonglong u0 = *((npy_ulonglong *)(ip2 + (i + 0) * is2));
+            *((npy_ulonglong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);  // stored before the next element is loaded
+            
+#line 431
+            npy_ulonglong v1 = *((npy_ulonglong *)(ip1 + (i + 1) * is1));
+            npy_ulonglong u1 = *((npy_ulonglong *)(ip2 + (i + 1) * is2));
+            *((npy_ulonglong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_ulonglong v2 = *((npy_ulonglong *)(ip1 + (i + 2) * is1));
+            npy_ulonglong u2 = *((npy_ulonglong *)(ip2 + (i + 2) * is2));
+            *((npy_ulonglong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_ulonglong v3 = *((npy_ulonglong *)(ip1 + (i + 3) * is1));
+            npy_ulonglong u3 = *((npy_ulonglong *)(ip2 + (i + 3) * is2));
+            *((npy_ulonglong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // skip the elements the unrolled code already handled (i is 0 if none ran)
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // scalar tail; NOTE(review): for reductions this presumably relies on ip1 aliasing op1 with zero stride - confirm against IS_BINARY_REDUCE
+        const npy_ulonglong in1 = *(npy_ulonglong *)ip1;
+        const npy_ulonglong in2 = *(npy_ulonglong *)ip2;
+        *((npy_ulonglong *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // target of the SIMD early exits above
+    npyv_cleanup();
+#endif
+#if 0  // constant-false: the float-status clear below is not compiled
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmin_indexed)  // indexed in-place update: base[idx] = SCALAR_OP(base[idx], value) for each (idx, value) pair
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))  // args = {base, indices, values}; steps[0..2] = their byte strides
+{  // NOTE: still inside the constant-false "#if !1 || (0 && 1)" guard opened above, so this definition is not compiled
+    char *ip1 = args[0];
+    char *indxp = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];  // number of (index, value) pairs to apply
+    npy_intp shape = steps[3];  // fourth steps entry is used as the length for wrapping negative indices
+    npy_intp i;
+    npy_ulonglong *indexed;
+    for(i = 0; i < n; i++, indxp += isindex, value += isb) {
+        npy_intp indx = *(npy_intp *)indxp;
+        if (indx < 0) {
+            indx += shape;  // a negative index counts back from the end
+        }
+        indexed = (npy_ulonglong *)(ip1 + is1 * indx);  // no bounds check here; NOTE(review): presumably the caller validates indices - confirm
+        *indexed = SCALAR_OP(*indexed, *(npy_ulonglong *)value);  // pairs are applied sequentially, so repeated indices build on earlier results
+    }
+    return 0;  // unconditional; this function has no error path
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+
+#line 294
+#undef TO_SIMD_SFX  // drop any earlier definition before selecting the suffix keyed on NPY_BITSOF_BYTE
+#if 0  // never taken; NOTE(review): presumably present only so every generated width case below can be an #elif
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 8
+    #if 0  // constant-false: the _f8 (float) suffix branch is disabled for this type
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0  // constant-false: the _u8 suffix branch is disabled as well
+        #define TO_SIMD_SFX(X) X##_u8
+    #else
+        #define TO_SIMD_SFX(X) X##_s8  // effective choice for this width: paste _s8 onto X
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 16
+    #if 0  // constant-false, as in the 8-bit case
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16  // effective choice for this width: paste _s16 onto X
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 32
+    #if 0  // constant-false, as in the 8-bit case
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32  // effective choice for this width: paste _s32 onto X
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 64
+    #if 0  // constant-false, as in the 8-bit case
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64  // effective choice for this width: paste _s64 onto X
+    #endif
+
+#endif  // if no branch matched, TO_SIMD_SFX stays undefined and the loops below skip their "#ifdef TO_SIMD_SFX" sections
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_max_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_maximum)  // binary inner loop: out[k] = SCALAR_OP(in1[k], in2[k]) on npy_byte, plus a reduction fast path
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))  // args = {in1, in2, out}; dimensions[0] = element count; steps[0..2] = byte strides
+{  // the enclosing guard "#if !0 || (0 && 0)" is constant-true, so this definition is compiled; SCALAR_OP is scalar_max_i here
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    npy_intp i = 0;  // next unprocessed element; shared by the unrolled code and the tail loop
+#ifdef TO_SIMD_SFX  // vector paths are built only when a SIMD suffix macro is defined
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)  // lane type the operand pointers are cast to
+    if (IS_BINARY_REDUCE) {  // macro defined outside this view; this branch reads ip2 and accumulates into op1
+        // reduce and contiguous
+        if (is2 == sizeof(npy_byte)) {
+            TO_SIMD_SFX(simd_reduce_c_max)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // the SIMD helper was given all len elements; skip the scalar code
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_byte, npy_byte)) {
+            TO_SIMD_SFX(simd_binary_ccc_max)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // the SIMD helper was given all len elements; skip the scalar code
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the trailing "&& 0" makes this constant-false: the strided SIMD path is compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_max)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least 8 elements are needed to seed the accumulators
+            npy_byte m0 = *((npy_byte *)(ip2 + (i + 0) * is2));  // m0..m7: eight independent running results
+            npy_byte m1 = *((npy_byte *)(ip2 + (i + 1) * is2));
+            npy_byte m2 = *((npy_byte *)(ip2 + (i + 2) * is2));
+            npy_byte m3 = *((npy_byte *)(ip2 + (i + 3) * is2));
+            npy_byte m4 = *((npy_byte *)(ip2 + (i + 4) * is2));
+            npy_byte m5 = *((npy_byte *)(ip2 + (i + 5) * is2));
+            npy_byte m6 = *((npy_byte *)(ip2 + (i + 6) * is2));
+            npy_byte m7 = *((npy_byte *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold each further full group of 8 into the accumulators
+                npy_byte v0 = *((npy_byte *)(ip2 + (i + 0) * is2));
+                npy_byte v1 = *((npy_byte *)(ip2 + (i + 1) * is2));
+                npy_byte v2 = *((npy_byte *)(ip2 + (i + 2) * is2));
+                npy_byte v3 = *((npy_byte *)(ip2 + (i + 3) * is2));
+                npy_byte v4 = *((npy_byte *)(ip2 + (i + 4) * is2));
+                npy_byte v5 = *((npy_byte *)(ip2 + (i + 5) * is2));
+                npy_byte v6 = *((npy_byte *)(ip2 + (i + 6) * is2));
+                npy_byte v7 = *((npy_byte *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise combine: 8 partial results -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_byte *)op1) = SCALAR_OP(*((npy_byte *)op1), m0);  // merge with the value already held in the output
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1));
+            npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2));
+            *((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);  // stored before the next element is loaded
+            
+#line 431
+            npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1));
+            npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2));
+            *((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1));
+            npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2));
+            *((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1));
+            npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2));
+            *((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // skip the elements the unrolled code already handled (i is 0 if none ran)
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // scalar tail; NOTE(review): for reductions this presumably relies on ip1 aliasing op1 with zero stride - confirm against IS_BINARY_REDUCE
+        const npy_byte in1 = *(npy_byte *)ip1;
+        const npy_byte in2 = *(npy_byte *)ip2;
+        *((npy_byte *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // target of the SIMD early exits above
+    npyv_cleanup();
+#endif
+#if 0  // constant-false: the float-status clear below is not compiled
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_maximum_indexed)  // indexed in-place update: base[idx] = SCALAR_OP(base[idx], value) for each (idx, value) pair
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))  // args = {base, indices, values}; steps[0..2] = their byte strides
+{  // SCALAR_OP is still scalar_max_i here (it is #undef'd only after this function)
+    char *ip1 = args[0];
+    char *indxp = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];  // number of (index, value) pairs to apply
+    npy_intp shape = steps[3];  // fourth steps entry is used as the length for wrapping negative indices
+    npy_intp i;
+    npy_byte *indexed;
+    for(i = 0; i < n; i++, indxp += isindex, value += isb) {
+        npy_intp indx = *(npy_intp *)indxp;
+        if (indx < 0) {
+            indx += shape;  // a negative index counts back from the end
+        }
+        indexed = (npy_byte *)(ip1 + is1 * indx);  // no bounds check here; NOTE(review): presumably the caller validates indices - confirm
+        *indexed = SCALAR_OP(*indexed, *(npy_byte *)value);  // pairs are applied sequentially, so repeated indices build on earlier results
+    }
+    return 0;  // unconditional; this function has no error path
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_min_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_minimum)  // binary inner loop: out[k] = SCALAR_OP(in1[k], in2[k]) on npy_byte, plus a reduction fast path
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))  // args = {in1, in2, out}; dimensions[0] = element count; steps[0..2] = byte strides
+{  // the enclosing guard "#if !0 || (0 && 0)" is constant-true, so this definition is compiled; SCALAR_OP is scalar_min_i here
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    npy_intp i = 0;  // next unprocessed element; shared by the unrolled code and the tail loop
+#ifdef TO_SIMD_SFX  // vector paths are built only when a SIMD suffix macro is defined
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)  // lane type the operand pointers are cast to
+    if (IS_BINARY_REDUCE) {  // macro defined outside this view; this branch reads ip2 and accumulates into op1
+        // reduce and contiguous
+        if (is2 == sizeof(npy_byte)) {
+            TO_SIMD_SFX(simd_reduce_c_min)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // the SIMD helper was given all len elements; skip the scalar code
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_byte, npy_byte)) {
+            TO_SIMD_SFX(simd_binary_ccc_min)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // the SIMD helper was given all len elements; skip the scalar code
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the trailing "&& 0" makes this constant-false: the strided SIMD path is compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_min)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least 8 elements are needed to seed the accumulators
+            npy_byte m0 = *((npy_byte *)(ip2 + (i + 0) * is2));  // m0..m7: eight independent running results
+            npy_byte m1 = *((npy_byte *)(ip2 + (i + 1) * is2));
+            npy_byte m2 = *((npy_byte *)(ip2 + (i + 2) * is2));
+            npy_byte m3 = *((npy_byte *)(ip2 + (i + 3) * is2));
+            npy_byte m4 = *((npy_byte *)(ip2 + (i + 4) * is2));
+            npy_byte m5 = *((npy_byte *)(ip2 + (i + 5) * is2));
+            npy_byte m6 = *((npy_byte *)(ip2 + (i + 6) * is2));
+            npy_byte m7 = *((npy_byte *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold each further full group of 8 into the accumulators
+                npy_byte v0 = *((npy_byte *)(ip2 + (i + 0) * is2));
+                npy_byte v1 = *((npy_byte *)(ip2 + (i + 1) * is2));
+                npy_byte v2 = *((npy_byte *)(ip2 + (i + 2) * is2));
+                npy_byte v3 = *((npy_byte *)(ip2 + (i + 3) * is2));
+                npy_byte v4 = *((npy_byte *)(ip2 + (i + 4) * is2));
+                npy_byte v5 = *((npy_byte *)(ip2 + (i + 5) * is2));
+                npy_byte v6 = *((npy_byte *)(ip2 + (i + 6) * is2));
+                npy_byte v7 = *((npy_byte *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise combine: 8 partial results -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_byte *)op1) = SCALAR_OP(*((npy_byte *)op1), m0);  // merge with the value already held in the output
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1));
+            npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2));
+            *((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);  // stored before the next element is loaded
+            
+#line 431
+            npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1));
+            npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2));
+            *((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1));
+            npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2));
+            *((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1));
+            npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2));
+            *((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // skip the elements the unrolled code already handled (i is 0 if none ran)
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // scalar tail; NOTE(review): for reductions this presumably relies on ip1 aliasing op1 with zero stride - confirm against IS_BINARY_REDUCE
+        const npy_byte in1 = *(npy_byte *)ip1;
+        const npy_byte in2 = *(npy_byte *)ip2;
+        *((npy_byte *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // target of the SIMD early exits above
+    npyv_cleanup();
+#endif
+#if 0  // constant-false: the float-status clear below is not compiled
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_minimum_indexed)  // indexed in-place update: base[idx] = SCALAR_OP(base[idx], value) for each (idx, value) pair
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))  // args = {base, indices, values}; steps[0..2] = their byte strides
+{  // SCALAR_OP is still scalar_min_i here (it is #undef'd only after this function)
+    char *ip1 = args[0];
+    char *indxp = args[1];
+    char *value = args[2];
+    npy_intp is1 = steps[0], isindex = steps[1], isb = steps[2];
+    npy_intp n = dimensions[0];  // number of (index, value) pairs to apply
+    npy_intp shape = steps[3];  // fourth steps entry is used as the length for wrapping negative indices
+    npy_intp i;
+    npy_byte *indexed;
+    for(i = 0; i < n; i++, indxp += isindex, value += isb) {
+        npy_intp indx = *(npy_intp *)indxp;
+        if (indx < 0) {
+            indx += shape;  // a negative index counts back from the end
+        }
+        indexed = (npy_byte *)(ip1 + is1 * indx);  // no bounds check here; NOTE(review): presumably the caller validates indices - confirm
+        *indexed = SCALAR_OP(*indexed, *(npy_byte *)value);  // pairs are applied sequentially, so repeated indices build on earlier results
+    }
+    return 0;  // unconditional; this function has no error path
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_maxp_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_fmax)  // binary inner loop: out[k] = SCALAR_OP(in1[k], in2[k]) on npy_byte, plus a reduction fast path
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))  // args = {in1, in2, out}; dimensions[0] = element count; steps[0..2] = byte strides
+{  // NOTE: this definition sits under "#if !1 || (0 && 1)" (constant-false), so the preprocessor discards it; SCALAR_OP is scalar_maxp_i here
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    npy_intp i = 0;  // next unprocessed element; shared by the unrolled code and the tail loop
+#ifdef TO_SIMD_SFX  // vector paths are built only when a SIMD suffix macro is defined
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)  // lane type the operand pointers are cast to
+    if (IS_BINARY_REDUCE) {  // macro defined outside this view; this branch reads ip2 and accumulates into op1
+        // reduce and contiguous
+        if (is2 == sizeof(npy_byte)) {
+            TO_SIMD_SFX(simd_reduce_c_maxp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // the SIMD helper was given all len elements; skip the scalar code
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_byte, npy_byte)) {
+            TO_SIMD_SFX(simd_binary_ccc_maxp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // the SIMD helper was given all len elements; skip the scalar code
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the trailing "&& 0" makes this constant-false: the strided SIMD path is compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_maxp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least 8 elements are needed to seed the accumulators
+            npy_byte m0 = *((npy_byte *)(ip2 + (i + 0) * is2));  // m0..m7: eight independent running results
+            npy_byte m1 = *((npy_byte *)(ip2 + (i + 1) * is2));
+            npy_byte m2 = *((npy_byte *)(ip2 + (i + 2) * is2));
+            npy_byte m3 = *((npy_byte *)(ip2 + (i + 3) * is2));
+            npy_byte m4 = *((npy_byte *)(ip2 + (i + 4) * is2));
+            npy_byte m5 = *((npy_byte *)(ip2 + (i + 5) * is2));
+            npy_byte m6 = *((npy_byte *)(ip2 + (i + 6) * is2));
+            npy_byte m7 = *((npy_byte *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold each further full group of 8 into the accumulators
+                npy_byte v0 = *((npy_byte *)(ip2 + (i + 0) * is2));
+                npy_byte v1 = *((npy_byte *)(ip2 + (i + 1) * is2));
+                npy_byte v2 = *((npy_byte *)(ip2 + (i + 2) * is2));
+                npy_byte v3 = *((npy_byte *)(ip2 + (i + 3) * is2));
+                npy_byte v4 = *((npy_byte *)(ip2 + (i + 4) * is2));
+                npy_byte v5 = *((npy_byte *)(ip2 + (i + 5) * is2));
+                npy_byte v6 = *((npy_byte *)(ip2 + (i + 6) * is2));
+                npy_byte v7 = *((npy_byte *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise combine: 8 partial results -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_byte *)op1) = SCALAR_OP(*((npy_byte *)op1), m0);  // merge with the value already held in the output
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1));
+            npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2));
+            *((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);  // stored before the next element is loaded
+            
+#line 431
+            npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1));
+            npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2));
+            *((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1));
+            npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2));
+            *((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1));
+            npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2));
+            *((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // skip the elements the unrolled code already handled (i is 0 if none ran)
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // scalar tail; NOTE(review): for reductions this presumably relies on ip1 aliasing op1 with zero stride - confirm against IS_BINARY_REDUCE
+        const npy_byte in1 = *(npy_byte *)ip1;
+        const npy_byte in2 = *(npy_byte *)ip2;
+        *((npy_byte *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // target of the SIMD early exits above
+    npyv_cleanup();
+#endif
+#if 0  // constant-false: the float-status clear below is not compiled
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+/*
+ * Indexed variant of the BYTE fmax loop.  For each of the dimensions[0]
+ * entries it reads one npy_intp index and one npy_byte operand, then replaces
+ * the addressed element of the first array with SCALAR_OP(element, operand).
+ *
+ *   args[0]   base pointer of the array that is updated in place
+ *   args[1]   index stream (npy_intp), advanced by steps[1] bytes per entry
+ *   args[2]   operand stream (npy_byte), advanced by steps[2] bytes per entry
+ *   steps[0]  byte stride between consecutive elements of args[0]
+ *   steps[3]  amount added to negative indices (presumably the length of the
+ *             indexed axis -- confirm against the caller)
+ *
+ * The index is not range-checked here.  Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmax_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const target_base = args[0];
+    char *index_cursor = args[1];
+    char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0];
+    const npy_intp index_stride = steps[1];
+    const npy_intp operand_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(npy_intp *)index_cursor;
+        if (pos < 0) {
+            // negative index: shift it by steps[3]
+            pos += wrap;
+        }
+
+        npy_byte *slot = (npy_byte *)(target_base + target_stride * pos);
+        const npy_byte operand = *(npy_byte *)operand_cursor;
+        *slot = SCALAR_OP(*slot, operand);
+
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+/*
+ * BYTE fmin section.  The guard below is the expanded form of
+ * `!fp_only || (is_fp && fp_only)` (see the comment on the matching #endif).
+ * `!1 || (0 && 1)` evaluates to 0, so everything up to that #endif is
+ * removed by the preprocessor.
+ */
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_minp_i
+
+/*
+ * Binary element-wise loop over npy_byte operands; every result is produced
+ * by SCALAR_OP (scalar_minp_i in this section).
+ *
+ *   args[0], args[1]  base pointers of the two inputs (ip1, ip2)
+ *   args[2]           base pointer of the output (op1)
+ *   steps[0..2]       strides of ip1, ip2 and op1, in bytes
+ *   dimensions[0]     number of elements to process (len)
+ *
+ * Three tiers are tried in order: SIMD kernels (only when TO_SIMD_SFX is
+ * defined), unrolled scalar loops (unless NPY_DISABLE_OPTIMIZATION is
+ * defined), and finally a plain scalar loop for whatever is left.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_fmin)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    // i counts the elements already consumed by the unrolled scalar loops
+    npy_intp i = 0;
+#ifdef TO_SIMD_SFX
+    // STYPE expands to npyv_lanetype with the current SIMD suffix appended
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_byte)) {
+            // the reduce kernel is handed ip2, op1 and len only
+            TO_SIMD_SFX(simd_reduce_c_minp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            // finished: skip the scalar loops and go to the SIMD cleanup
+            goto clear_fp;
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_byte, npy_byte)) {
+            TO_SIMD_SFX(simd_binary_ccc_minp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    // The trailing `&& 0` makes this condition always false, so the strided
+    // SIMD path below is never compiled for this type.
+    #if !defined(NPY_HAVE_NEON) && 0
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_minp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){
+            // seed eight independent accumulators with the first eight
+            // elements of ip2
+            npy_byte m0 = *((npy_byte *)(ip2 + (i + 0) * is2));
+            npy_byte m1 = *((npy_byte *)(ip2 + (i + 1) * is2));
+            npy_byte m2 = *((npy_byte *)(ip2 + (i + 2) * is2));
+            npy_byte m3 = *((npy_byte *)(ip2 + (i + 3) * is2));
+            npy_byte m4 = *((npy_byte *)(ip2 + (i + 4) * is2));
+            npy_byte m5 = *((npy_byte *)(ip2 + (i + 5) * is2));
+            npy_byte m6 = *((npy_byte *)(ip2 + (i + 6) * is2));
+            npy_byte m7 = *((npy_byte *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            // fold each further group of eight elements into the matching
+            // accumulator
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_byte v0 = *((npy_byte *)(ip2 + (i + 0) * is2));
+                npy_byte v1 = *((npy_byte *)(ip2 + (i + 1) * is2));
+                npy_byte v2 = *((npy_byte *)(ip2 + (i + 2) * is2));
+                npy_byte v3 = *((npy_byte *)(ip2 + (i + 3) * is2));
+                npy_byte v4 = *((npy_byte *)(ip2 + (i + 4) * is2));
+                npy_byte v5 = *((npy_byte *)(ip2 + (i + 5) * is2));
+                npy_byte v6 = *((npy_byte *)(ip2 + (i + 6) * is2));
+                npy_byte v7 = *((npy_byte *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            // pairwise combine: 8 -> 4 -> 2 -> 1 partial results
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+            // merge the combined partial result with the value already at op1
+             *((npy_byte *)op1) = SCALAR_OP(*((npy_byte *)op1), m0);
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_byte v0 = *((npy_byte *)(ip1 + (i + 0) * is1));
+            npy_byte u0 = *((npy_byte *)(ip2 + (i + 0) * is2));
+            *((npy_byte *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_byte v1 = *((npy_byte *)(ip1 + (i + 1) * is1));
+            npy_byte u1 = *((npy_byte *)(ip2 + (i + 1) * is2));
+            *((npy_byte *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_byte v2 = *((npy_byte *)(ip1 + (i + 2) * is1));
+            npy_byte u2 = *((npy_byte *)(ip2 + (i + 2) * is2));
+            *((npy_byte *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_byte v3 = *((npy_byte *)(ip1 + (i + 3) * is1));
+            npy_byte u3 = *((npy_byte *)(ip2 + (i + 3) * is2));
+            *((npy_byte *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    // advance all three pointers past the i elements consumed above
+    ip1 += is1 * i;
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    // one element per iteration for the remainder (or for every element when
+    // no unrolled loop ran).
+    // NOTE(review): in the reduce case this presumably keeps folding into the
+    // same output element; that depends on how IS_BINARY_REDUCE constrains
+    // ip1/op1 and their strides, and the macro is defined outside this view.
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
+        const npy_byte in1 = *(npy_byte *)ip1;
+        const npy_byte in2 = *(npy_byte *)ip2;
+        *((npy_byte *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+// target of the `goto clear_fp` statements in the SIMD branches above
+clear_fp:
+    npyv_cleanup();
+#endif
+// the floating-point status barrier below is compiled out for this loop
+#if 0
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+/*
+ * Indexed variant of the BYTE fmin loop.  For each of the dimensions[0]
+ * entries it reads one npy_intp index and one npy_byte operand, then replaces
+ * the addressed element of the first array with SCALAR_OP(element, operand).
+ *
+ *   args[0]   base pointer of the array that is updated in place
+ *   args[1]   index stream (npy_intp), advanced by steps[1] bytes per entry
+ *   args[2]   operand stream (npy_byte), advanced by steps[2] bytes per entry
+ *   steps[0]  byte stride between consecutive elements of args[0]
+ *   steps[3]  amount added to negative indices (presumably the length of the
+ *             indexed axis -- confirm against the caller)
+ *
+ * The index is not range-checked here.  Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(BYTE_fmin_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const target_base = args[0];
+    char *index_cursor = args[1];
+    char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0];
+    const npy_intp index_stride = steps[1];
+    const npy_intp operand_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(npy_intp *)index_cursor;
+        if (pos < 0) {
+            // negative index: shift it by steps[3]
+            pos += wrap;
+        }
+
+        npy_byte *slot = (npy_byte *)(target_base + target_stride * pos);
+        const npy_byte operand = *(npy_byte *)operand_cursor;
+        *slot = SCALAR_OP(*slot, operand);
+
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+
+/*
+ * Select the SIMD name suffix used by the SHORT loops that follow.
+ *
+ * TO_SIMD_SFX(X) pastes a lane-type suffix onto X.  The chain below compares
+ * NPY_BITSOF_SHORT against 8/16/32/64 and, when NPY_SIMD is non-zero, defines
+ * the macro with the signed suffix of that width (_s8, _s16, _s32 or _s64).
+ * Inside every arm the float (`#if 0`) and unsigned (`#elif 0`) alternatives
+ * are disabled, so only the final `#else` can take effect.
+ *
+ * If no arm matches, TO_SIMD_SFX stays undefined after the #undef below and
+ * any code guarded by `#ifdef TO_SIMD_SFX` is skipped.
+ */
+#line 294
+#undef TO_SIMD_SFX
+#if 0
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 8
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u8
+    #else
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 16
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 32
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 64
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif
+
+/*
+ * SHORT maximum section.  The guard below is the expanded form of
+ * `!fp_only || (is_fp && fp_only)` (see the comment on the matching #endif).
+ * `!0 || (0 && 0)` evaluates to 1, so this section is compiled.
+ */
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_max_i
+
+/*
+ * Binary element-wise loop over npy_short operands; every result is produced
+ * by SCALAR_OP (scalar_max_i in this section).
+ *
+ *   args[0], args[1]  base pointers of the two inputs (ip1, ip2)
+ *   args[2]           base pointer of the output (op1)
+ *   steps[0..2]       strides of ip1, ip2 and op1, in bytes
+ *   dimensions[0]     number of elements to process (len)
+ *
+ * Three tiers are tried in order: SIMD kernels (only when TO_SIMD_SFX is
+ * defined), unrolled scalar loops (unless NPY_DISABLE_OPTIMIZATION is
+ * defined), and finally a plain scalar loop for whatever is left.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_maximum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    // i counts the elements already consumed by the unrolled scalar loops
+    npy_intp i = 0;
+#ifdef TO_SIMD_SFX
+    // STYPE expands to npyv_lanetype with the current SIMD suffix appended
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_short)) {
+            // the reduce kernel is handed ip2, op1 and len only
+            TO_SIMD_SFX(simd_reduce_c_max)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            // finished: skip the scalar loops and go to the SIMD cleanup
+            goto clear_fp;
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_short, npy_short)) {
+            TO_SIMD_SFX(simd_binary_ccc_max)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    // The trailing `&& 0` makes this condition always false, so the strided
+    // SIMD path below is never compiled for this type.
+    #if !defined(NPY_HAVE_NEON) && 0
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_max)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){
+            // seed eight independent accumulators with the first eight
+            // elements of ip2
+            npy_short m0 = *((npy_short *)(ip2 + (i + 0) * is2));
+            npy_short m1 = *((npy_short *)(ip2 + (i + 1) * is2));
+            npy_short m2 = *((npy_short *)(ip2 + (i + 2) * is2));
+            npy_short m3 = *((npy_short *)(ip2 + (i + 3) * is2));
+            npy_short m4 = *((npy_short *)(ip2 + (i + 4) * is2));
+            npy_short m5 = *((npy_short *)(ip2 + (i + 5) * is2));
+            npy_short m6 = *((npy_short *)(ip2 + (i + 6) * is2));
+            npy_short m7 = *((npy_short *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            // fold each further group of eight elements into the matching
+            // accumulator
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_short v0 = *((npy_short *)(ip2 + (i + 0) * is2));
+                npy_short v1 = *((npy_short *)(ip2 + (i + 1) * is2));
+                npy_short v2 = *((npy_short *)(ip2 + (i + 2) * is2));
+                npy_short v3 = *((npy_short *)(ip2 + (i + 3) * is2));
+                npy_short v4 = *((npy_short *)(ip2 + (i + 4) * is2));
+                npy_short v5 = *((npy_short *)(ip2 + (i + 5) * is2));
+                npy_short v6 = *((npy_short *)(ip2 + (i + 6) * is2));
+                npy_short v7 = *((npy_short *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            // pairwise combine: 8 -> 4 -> 2 -> 1 partial results
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+            // merge the combined partial result with the value already at op1
+             *((npy_short *)op1) = SCALAR_OP(*((npy_short *)op1), m0);
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1));
+            npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2));
+            *((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1));
+            npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2));
+            *((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1));
+            npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2));
+            *((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1));
+            npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2));
+            *((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    // advance all three pointers past the i elements consumed above
+    ip1 += is1 * i;
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    // one element per iteration for the remainder (or for every element when
+    // no unrolled loop ran).
+    // NOTE(review): in the reduce case this presumably keeps folding into the
+    // same output element; that depends on how IS_BINARY_REDUCE constrains
+    // ip1/op1 and their strides, and the macro is defined outside this view.
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
+        const npy_short in1 = *(npy_short *)ip1;
+        const npy_short in2 = *(npy_short *)ip2;
+        *((npy_short *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+// target of the `goto clear_fp` statements in the SIMD branches above
+clear_fp:
+    npyv_cleanup();
+#endif
+// the floating-point status barrier below is compiled out for this loop
+#if 0
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+/*
+ * Indexed variant of the SHORT maximum loop.  For each of the dimensions[0]
+ * entries it reads one npy_intp index and one npy_short operand, then
+ * replaces the addressed element of the first array with
+ * SCALAR_OP(element, operand).
+ *
+ *   args[0]   base pointer of the array that is updated in place
+ *   args[1]   index stream (npy_intp), advanced by steps[1] bytes per entry
+ *   args[2]   operand stream (npy_short), advanced by steps[2] bytes per entry
+ *   steps[0]  byte stride between consecutive elements of args[0]
+ *   steps[3]  amount added to negative indices (presumably the length of the
+ *             indexed axis -- confirm against the caller)
+ *
+ * The index is not range-checked here.  Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_maximum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const target_base = args[0];
+    char *index_cursor = args[1];
+    char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0];
+    const npy_intp index_stride = steps[1];
+    const npy_intp operand_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(npy_intp *)index_cursor;
+        if (pos < 0) {
+            // negative index: shift it by steps[3]
+            pos += wrap;
+        }
+
+        npy_short *slot = (npy_short *)(target_base + target_stride * pos);
+        const npy_short operand = *(npy_short *)operand_cursor;
+        *slot = SCALAR_OP(*slot, operand);
+
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+/*
+ * SHORT minimum section.  The guard below is the expanded form of
+ * `!fp_only || (is_fp && fp_only)` (see the comment on the matching #endif).
+ * `!0 || (0 && 0)` evaluates to 1, so this section is compiled.
+ */
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_min_i
+
+/*
+ * Binary element-wise loop over npy_short operands; every result is produced
+ * by SCALAR_OP (scalar_min_i in this section).
+ *
+ *   args[0], args[1]  base pointers of the two inputs (ip1, ip2)
+ *   args[2]           base pointer of the output (op1)
+ *   steps[0..2]       strides of ip1, ip2 and op1, in bytes
+ *   dimensions[0]     number of elements to process (len)
+ *
+ * Three tiers are tried in order: SIMD kernels (only when TO_SIMD_SFX is
+ * defined), unrolled scalar loops (unless NPY_DISABLE_OPTIMIZATION is
+ * defined), and finally a plain scalar loop for whatever is left.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_minimum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    // i counts the elements already consumed by the unrolled scalar loops
+    npy_intp i = 0;
+#ifdef TO_SIMD_SFX
+    // STYPE expands to npyv_lanetype with the current SIMD suffix appended
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_short)) {
+            // the reduce kernel is handed ip2, op1 and len only
+            TO_SIMD_SFX(simd_reduce_c_min)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            // finished: skip the scalar loops and go to the SIMD cleanup
+            goto clear_fp;
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_short, npy_short)) {
+            TO_SIMD_SFX(simd_binary_ccc_min)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    // The trailing `&& 0` makes this condition always false, so the strided
+    // SIMD path below is never compiled for this type.
+    #if !defined(NPY_HAVE_NEON) && 0
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_min)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){
+            // seed eight independent accumulators with the first eight
+            // elements of ip2
+            npy_short m0 = *((npy_short *)(ip2 + (i + 0) * is2));
+            npy_short m1 = *((npy_short *)(ip2 + (i + 1) * is2));
+            npy_short m2 = *((npy_short *)(ip2 + (i + 2) * is2));
+            npy_short m3 = *((npy_short *)(ip2 + (i + 3) * is2));
+            npy_short m4 = *((npy_short *)(ip2 + (i + 4) * is2));
+            npy_short m5 = *((npy_short *)(ip2 + (i + 5) * is2));
+            npy_short m6 = *((npy_short *)(ip2 + (i + 6) * is2));
+            npy_short m7 = *((npy_short *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            // fold each further group of eight elements into the matching
+            // accumulator
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_short v0 = *((npy_short *)(ip2 + (i + 0) * is2));
+                npy_short v1 = *((npy_short *)(ip2 + (i + 1) * is2));
+                npy_short v2 = *((npy_short *)(ip2 + (i + 2) * is2));
+                npy_short v3 = *((npy_short *)(ip2 + (i + 3) * is2));
+                npy_short v4 = *((npy_short *)(ip2 + (i + 4) * is2));
+                npy_short v5 = *((npy_short *)(ip2 + (i + 5) * is2));
+                npy_short v6 = *((npy_short *)(ip2 + (i + 6) * is2));
+                npy_short v7 = *((npy_short *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            // pairwise combine: 8 -> 4 -> 2 -> 1 partial results
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+            // merge the combined partial result with the value already at op1
+             *((npy_short *)op1) = SCALAR_OP(*((npy_short *)op1), m0);
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1));
+            npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2));
+            *((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1));
+            npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2));
+            *((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1));
+            npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2));
+            *((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1));
+            npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2));
+            *((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    // advance all three pointers past the i elements consumed above
+    ip1 += is1 * i;
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    // one element per iteration for the remainder (or for every element when
+    // no unrolled loop ran).
+    // NOTE(review): in the reduce case this presumably keeps folding into the
+    // same output element; that depends on how IS_BINARY_REDUCE constrains
+    // ip1/op1 and their strides, and the macro is defined outside this view.
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
+        const npy_short in1 = *(npy_short *)ip1;
+        const npy_short in2 = *(npy_short *)ip2;
+        *((npy_short *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+// target of the `goto clear_fp` statements in the SIMD branches above
+clear_fp:
+    npyv_cleanup();
+#endif
+// the floating-point status barrier below is compiled out for this loop
+#if 0
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+/*
+ * Indexed variant of the SHORT minimum loop.  For each of the dimensions[0]
+ * entries it reads one npy_intp index and one npy_short operand, then
+ * replaces the addressed element of the first array with
+ * SCALAR_OP(element, operand).
+ *
+ *   args[0]   base pointer of the array that is updated in place
+ *   args[1]   index stream (npy_intp), advanced by steps[1] bytes per entry
+ *   args[2]   operand stream (npy_short), advanced by steps[2] bytes per entry
+ *   steps[0]  byte stride between consecutive elements of args[0]
+ *   steps[3]  amount added to negative indices (presumably the length of the
+ *             indexed axis -- confirm against the caller)
+ *
+ * The index is not range-checked here.  Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_minimum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const target_base = args[0];
+    char *index_cursor = args[1];
+    char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0];
+    const npy_intp index_stride = steps[1];
+    const npy_intp operand_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(npy_intp *)index_cursor;
+        if (pos < 0) {
+            // negative index: shift it by steps[3]
+            pos += wrap;
+        }
+
+        npy_short *slot = (npy_short *)(target_base + target_stride * pos);
+        const npy_short operand = *(npy_short *)operand_cursor;
+        *slot = SCALAR_OP(*slot, operand);
+
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+/*
+ * SHORT fmax section.  The guard below is the expanded form of
+ * `!fp_only || (is_fp && fp_only)` (see the comment on the matching #endif).
+ * `!1 || (0 && 1)` evaluates to 0, so everything up to that #endif is
+ * removed by the preprocessor.
+ */
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_maxp_i
+
+/*
+ * Binary element-wise loop over npy_short operands; every result is produced
+ * by SCALAR_OP (scalar_maxp_i in this section).
+ *
+ *   args[0], args[1]  base pointers of the two inputs (ip1, ip2)
+ *   args[2]           base pointer of the output (op1)
+ *   steps[0..2]       strides of ip1, ip2 and op1, in bytes
+ *   dimensions[0]     number of elements to process (len)
+ *
+ * Three tiers are tried in order: SIMD kernels (only when TO_SIMD_SFX is
+ * defined), unrolled scalar loops (unless NPY_DISABLE_OPTIMIZATION is
+ * defined), and finally a plain scalar loop for whatever is left.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_fmax)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    // i counts the elements already consumed by the unrolled scalar loops
+    npy_intp i = 0;
+#ifdef TO_SIMD_SFX
+    // STYPE expands to npyv_lanetype with the current SIMD suffix appended
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_short)) {
+            // the reduce kernel is handed ip2, op1 and len only
+            TO_SIMD_SFX(simd_reduce_c_maxp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            // finished: skip the scalar loops and go to the SIMD cleanup
+            goto clear_fp;
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_short, npy_short)) {
+            TO_SIMD_SFX(simd_binary_ccc_maxp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    // The trailing `&& 0` makes this condition always false, so the strided
+    // SIMD path below is never compiled for this type.
+    #if !defined(NPY_HAVE_NEON) && 0
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_maxp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){
+            // seed eight independent accumulators with the first eight
+            // elements of ip2
+            npy_short m0 = *((npy_short *)(ip2 + (i + 0) * is2));
+            npy_short m1 = *((npy_short *)(ip2 + (i + 1) * is2));
+            npy_short m2 = *((npy_short *)(ip2 + (i + 2) * is2));
+            npy_short m3 = *((npy_short *)(ip2 + (i + 3) * is2));
+            npy_short m4 = *((npy_short *)(ip2 + (i + 4) * is2));
+            npy_short m5 = *((npy_short *)(ip2 + (i + 5) * is2));
+            npy_short m6 = *((npy_short *)(ip2 + (i + 6) * is2));
+            npy_short m7 = *((npy_short *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            // fold each further group of eight elements into the matching
+            // accumulator
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_short v0 = *((npy_short *)(ip2 + (i + 0) * is2));
+                npy_short v1 = *((npy_short *)(ip2 + (i + 1) * is2));
+                npy_short v2 = *((npy_short *)(ip2 + (i + 2) * is2));
+                npy_short v3 = *((npy_short *)(ip2 + (i + 3) * is2));
+                npy_short v4 = *((npy_short *)(ip2 + (i + 4) * is2));
+                npy_short v5 = *((npy_short *)(ip2 + (i + 5) * is2));
+                npy_short v6 = *((npy_short *)(ip2 + (i + 6) * is2));
+                npy_short v7 = *((npy_short *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            // pairwise combine: 8 -> 4 -> 2 -> 1 partial results
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+            // merge the combined partial result with the value already at op1
+             *((npy_short *)op1) = SCALAR_OP(*((npy_short *)op1), m0);
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1));
+            npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2));
+            *((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1));
+            npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2));
+            *((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1));
+            npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2));
+            *((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1));
+            npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2));
+            *((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    // advance all three pointers past the i elements consumed above
+    ip1 += is1 * i;
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    // one element per iteration for the remainder (or for every element when
+    // no unrolled loop ran).
+    // NOTE(review): in the reduce case this presumably keeps folding into the
+    // same output element; that depends on how IS_BINARY_REDUCE constrains
+    // ip1/op1 and their strides, and the macro is defined outside this view.
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {
+        const npy_short in1 = *(npy_short *)ip1;
+        const npy_short in2 = *(npy_short *)ip2;
+        *((npy_short *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+// target of the `goto clear_fp` statements in the SIMD branches above
+clear_fp:
+    npyv_cleanup();
+#endif
+// the floating-point status barrier below is compiled out for this loop
+#if 0
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+/*
+ * Indexed variant of the SHORT fmax loop.  For each of the dimensions[0]
+ * entries it reads one npy_intp index and one npy_short operand, then
+ * replaces the addressed element of the first array with
+ * SCALAR_OP(element, operand).
+ *
+ *   args[0]   base pointer of the array that is updated in place
+ *   args[1]   index stream (npy_intp), advanced by steps[1] bytes per entry
+ *   args[2]   operand stream (npy_short), advanced by steps[2] bytes per entry
+ *   steps[0]  byte stride between consecutive elements of args[0]
+ *   steps[3]  amount added to negative indices (presumably the length of the
+ *             indexed axis -- confirm against the caller)
+ *
+ * The index is not range-checked here.  Always returns 0.
+ */
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmax_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    char *const target_base = args[0];
+    char *index_cursor = args[1];
+    char *operand_cursor = args[2];
+    const npy_intp target_stride = steps[0];
+    const npy_intp index_stride = steps[1];
+    const npy_intp operand_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(npy_intp *)index_cursor;
+        if (pos < 0) {
+            // negative index: shift it by steps[3]
+            pos += wrap;
+        }
+
+        npy_short *slot = (npy_short *)(target_base + target_stride * pos);
+        const npy_short operand = *(npy_short *)operand_cursor;
+        *slot = SCALAR_OP(*slot, operand);
+
+        index_cursor += index_stride;
+        operand_cursor += operand_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_minp_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_fmin)  // ufunc inner loop: op1[k] = SCALAR_OP(ip1[k], ip2[k]) on npy_short; SCALAR_OP is #defined before this function
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))  // args/steps: [0]=ip1, [1]=ip2, [2]=op1; dimensions[0]=element count
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // byte pointers: two inputs, one output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // per-element strides, in bytes (added to char pointers)
+             len = dimensions[0];  // number of elements to process
+    npy_intp i = 0;  // how many elements the unrolled scalar code below has consumed
+#ifdef TO_SIMD_SFX  // SIMD fast paths exist only when a lane suffix was selected for this type
+    #undef STYPE  // STYPE is redefined per type: the lane element type for the selected suffix
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {  // reduction call (macro defined elsewhere): this path uses ip2 and op1 only
+        // reduce and contiguous: ip2's byte stride equals the element size
+        if (is2 == sizeof(npy_short)) {
+            TO_SIMD_SFX(simd_reduce_c_minp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // all len elements handled; skip the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous: hand all len elements to the SIMD kernel
+        if (IS_BINARY_CONT(npy_short, npy_short)) {
+            TO_SIMD_SFX(simd_binary_ccc_minp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // all len elements handled; skip the scalar paths
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the literal "&& 0" disables this strided-SIMD branch on every target
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_minp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls: reached whenever no SIMD path above jumped to clear_fp
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // need one full group of 8 to seed the accumulators
+            npy_short m0 = *((npy_short *)(ip2 + (i + 0) * is2));  // m0..m7: eight independent running results
+            npy_short m1 = *((npy_short *)(ip2 + (i + 1) * is2));
+            npy_short m2 = *((npy_short *)(ip2 + (i + 2) * is2));
+            npy_short m3 = *((npy_short *)(ip2 + (i + 3) * is2));
+            npy_short m4 = *((npy_short *)(ip2 + (i + 4) * is2));
+            npy_short m5 = *((npy_short *)(ip2 + (i + 5) * is2));
+            npy_short m6 = *((npy_short *)(ip2 + (i + 6) * is2));
+            npy_short m7 = *((npy_short *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold every further full group of 8 into m0..m7
+                npy_short v0 = *((npy_short *)(ip2 + (i + 0) * is2));
+                npy_short v1 = *((npy_short *)(ip2 + (i + 1) * is2));
+                npy_short v2 = *((npy_short *)(ip2 + (i + 2) * is2));
+                npy_short v3 = *((npy_short *)(ip2 + (i + 3) * is2));
+                npy_short v4 = *((npy_short *)(ip2 + (i + 4) * is2));
+                npy_short v5 = *((npy_short *)(ip2 + (i + 5) * is2));
+                npy_short v6 = *((npy_short *)(ip2 + (i + 6) * is2));
+                npy_short v7 = *((npy_short *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise combine: 8 partial results -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_short *)op1) = SCALAR_OP(*((npy_short *)op1), m0);  // merge with the value already in the output
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_short v0 = *((npy_short *)(ip1 + (i + 0) * is1));
+            npy_short u0 = *((npy_short *)(ip2 + (i + 0) * is2));
+            *((npy_short *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);  // stored before element i+1 is loaded (see note above)
+            
+#line 431
+            npy_short v1 = *((npy_short *)(ip1 + (i + 1) * is1));
+            npy_short u1 = *((npy_short *)(ip2 + (i + 1) * is2));
+            *((npy_short *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_short v2 = *((npy_short *)(ip1 + (i + 2) * is1));
+            npy_short u2 = *((npy_short *)(ip2 + (i + 2) * is2));
+            *((npy_short *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_short v3 = *((npy_short *)(ip1 + (i + 3) * is1));
+            npy_short u3 = *((npy_short *)(ip2 + (i + 3) * is2));
+            *((npy_short *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // advance all three pointers past the i elements consumed above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // scalar tail: whatever is left, one element at a time
+        const npy_short in1 = *(npy_short *)ip1;
+        const npy_short in2 = *(npy_short *)ip2;
+        *((npy_short *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // landing point for the SIMD fast-path exits; the scalar paths fall through to here
+    npyv_cleanup();
+#endif
+#if 0  // float-status clearing is compiled out in this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(SHORT_fmin_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For each of dimensions[0] (index, operand) pairs, update in place:
+     * base[index] = SCALAR_OP(base[index], operand).  Always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *operand_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1], operand_stride = steps[2];
+    const npy_intp count = dimensions[0], wrap = steps[3];
+    npy_intp k = 0;
+    while (k < count) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        /* a negative index is shifted by steps[3] before it is used */
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_short *const slot = (npy_short *)(base + base_stride * pos);
+        *slot = SCALAR_OP(*slot, *(const npy_short *)operand_ptr);
+        ++k, idx_ptr += idx_stride, operand_ptr += operand_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+
+#line 294
+#undef TO_SIMD_SFX  // drop any earlier suffix mapping before choosing one keyed on NPY_BITSOF_INT
+#if 0  // placeholder so that every width case below can be written as an #elif
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_INT == 8  // SIMD enabled and the type is 8 bits wide
+    #if 0  // constant false in this instantiation: the _f8 branch is never taken
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0  // constant false in this instantiation: the _u8 branch is never taken
+        #define TO_SIMD_SFX(X) X##_u8
+    #else  // branch actually taken: TO_SIMD_SFX(X) pastes the _s8 suffix onto X
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_INT == 16  // SIMD enabled and the type is 16 bits wide
+    #if 0  // constant false in this instantiation: the _f16 branch is never taken
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0  // constant false in this instantiation: the _u16 branch is never taken
+        #define TO_SIMD_SFX(X) X##_u16
+    #else  // branch actually taken: TO_SIMD_SFX(X) pastes the _s16 suffix onto X
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_INT == 32  // SIMD enabled and the type is 32 bits wide
+    #if 0  // constant false in this instantiation: the _f32 branch is never taken
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0  // constant false in this instantiation: the _u32 branch is never taken
+        #define TO_SIMD_SFX(X) X##_u32
+    #else  // branch actually taken: TO_SIMD_SFX(X) pastes the _s32 suffix onto X
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_INT == 64  // SIMD enabled and the type is 64 bits wide
+    #if 0  // constant false in this instantiation: the _f64 branch is never taken
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0  // constant false in this instantiation: the _u64 branch is never taken
+        #define TO_SIMD_SFX(X) X##_u64
+    #else  // branch actually taken: TO_SIMD_SFX(X) pastes the _s64 suffix onto X
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif  // if no case matched, TO_SIMD_SFX stays undefined and the "#ifdef TO_SIMD_SFX" sections of the loops are skipped
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_max_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_maximum)  // ufunc inner loop: op1[k] = SCALAR_OP(ip1[k], ip2[k]) on npy_int; SCALAR_OP is scalar_max_i here
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))  // args/steps: [0]=ip1, [1]=ip2, [2]=op1; dimensions[0]=element count
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // byte pointers: two inputs, one output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // per-element strides, in bytes (added to char pointers)
+             len = dimensions[0];  // number of elements to process
+    npy_intp i = 0;  // how many elements the unrolled scalar code below has consumed
+#ifdef TO_SIMD_SFX  // SIMD fast paths exist only when a lane suffix was selected for this type
+    #undef STYPE  // STYPE is redefined per type: the lane element type for the selected suffix
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {  // reduction call (macro defined elsewhere): this path uses ip2 and op1 only
+        // reduce and contiguous: ip2's byte stride equals the element size
+        if (is2 == sizeof(npy_int)) {
+            TO_SIMD_SFX(simd_reduce_c_max)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // all len elements handled; skip the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous: hand all len elements to the SIMD kernel
+        if (IS_BINARY_CONT(npy_int, npy_int)) {
+            TO_SIMD_SFX(simd_binary_ccc_max)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // all len elements handled; skip the scalar paths
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the literal "&& 0" disables this strided-SIMD branch on every target
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_max)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls: reached whenever no SIMD path above jumped to clear_fp
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // need one full group of 8 to seed the accumulators
+            npy_int m0 = *((npy_int *)(ip2 + (i + 0) * is2));  // m0..m7: eight independent running results
+            npy_int m1 = *((npy_int *)(ip2 + (i + 1) * is2));
+            npy_int m2 = *((npy_int *)(ip2 + (i + 2) * is2));
+            npy_int m3 = *((npy_int *)(ip2 + (i + 3) * is2));
+            npy_int m4 = *((npy_int *)(ip2 + (i + 4) * is2));
+            npy_int m5 = *((npy_int *)(ip2 + (i + 5) * is2));
+            npy_int m6 = *((npy_int *)(ip2 + (i + 6) * is2));
+            npy_int m7 = *((npy_int *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold every further full group of 8 into m0..m7
+                npy_int v0 = *((npy_int *)(ip2 + (i + 0) * is2));
+                npy_int v1 = *((npy_int *)(ip2 + (i + 1) * is2));
+                npy_int v2 = *((npy_int *)(ip2 + (i + 2) * is2));
+                npy_int v3 = *((npy_int *)(ip2 + (i + 3) * is2));
+                npy_int v4 = *((npy_int *)(ip2 + (i + 4) * is2));
+                npy_int v5 = *((npy_int *)(ip2 + (i + 5) * is2));
+                npy_int v6 = *((npy_int *)(ip2 + (i + 6) * is2));
+                npy_int v7 = *((npy_int *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise combine: 8 partial results -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_int *)op1) = SCALAR_OP(*((npy_int *)op1), m0);  // merge with the value already in the output
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1));
+            npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2));
+            *((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);  // stored before element i+1 is loaded (see note above)
+            
+#line 431
+            npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1));
+            npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2));
+            *((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1));
+            npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2));
+            *((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1));
+            npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2));
+            *((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // advance all three pointers past the i elements consumed above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // scalar tail: whatever is left, one element at a time
+        const npy_int in1 = *(npy_int *)ip1;
+        const npy_int in2 = *(npy_int *)ip2;
+        *((npy_int *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // landing point for the SIMD fast-path exits; the scalar paths fall through to here
+    npyv_cleanup();
+#endif
+#if 0  // float-status clearing is compiled out in this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_maximum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For each of dimensions[0] (index, operand) pairs, update in place:
+     * base[index] = SCALAR_OP(base[index], operand).  Always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *operand_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1], operand_stride = steps[2];
+    const npy_intp count = dimensions[0], wrap = steps[3];
+    npy_intp k = 0;
+    while (k < count) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        /* a negative index is shifted by steps[3] before it is used */
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_int *const slot = (npy_int *)(base + base_stride * pos);
+        *slot = SCALAR_OP(*slot, *(const npy_int *)operand_ptr);
+        ++k, idx_ptr += idx_stride, operand_ptr += operand_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_min_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_minimum)  // ufunc inner loop: op1[k] = SCALAR_OP(ip1[k], ip2[k]) on npy_int; SCALAR_OP is scalar_min_i here
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))  // args/steps: [0]=ip1, [1]=ip2, [2]=op1; dimensions[0]=element count
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // byte pointers: two inputs, one output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // per-element strides, in bytes (added to char pointers)
+             len = dimensions[0];  // number of elements to process
+    npy_intp i = 0;  // how many elements the unrolled scalar code below has consumed
+#ifdef TO_SIMD_SFX  // SIMD fast paths exist only when a lane suffix was selected for this type
+    #undef STYPE  // STYPE is redefined per type: the lane element type for the selected suffix
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {  // reduction call (macro defined elsewhere): this path uses ip2 and op1 only
+        // reduce and contiguous: ip2's byte stride equals the element size
+        if (is2 == sizeof(npy_int)) {
+            TO_SIMD_SFX(simd_reduce_c_min)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // all len elements handled; skip the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous: hand all len elements to the SIMD kernel
+        if (IS_BINARY_CONT(npy_int, npy_int)) {
+            TO_SIMD_SFX(simd_binary_ccc_min)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // all len elements handled; skip the scalar paths
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the literal "&& 0" disables this strided-SIMD branch on every target
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_min)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls: reached whenever no SIMD path above jumped to clear_fp
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // need one full group of 8 to seed the accumulators
+            npy_int m0 = *((npy_int *)(ip2 + (i + 0) * is2));  // m0..m7: eight independent running results
+            npy_int m1 = *((npy_int *)(ip2 + (i + 1) * is2));
+            npy_int m2 = *((npy_int *)(ip2 + (i + 2) * is2));
+            npy_int m3 = *((npy_int *)(ip2 + (i + 3) * is2));
+            npy_int m4 = *((npy_int *)(ip2 + (i + 4) * is2));
+            npy_int m5 = *((npy_int *)(ip2 + (i + 5) * is2));
+            npy_int m6 = *((npy_int *)(ip2 + (i + 6) * is2));
+            npy_int m7 = *((npy_int *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold every further full group of 8 into m0..m7
+                npy_int v0 = *((npy_int *)(ip2 + (i + 0) * is2));
+                npy_int v1 = *((npy_int *)(ip2 + (i + 1) * is2));
+                npy_int v2 = *((npy_int *)(ip2 + (i + 2) * is2));
+                npy_int v3 = *((npy_int *)(ip2 + (i + 3) * is2));
+                npy_int v4 = *((npy_int *)(ip2 + (i + 4) * is2));
+                npy_int v5 = *((npy_int *)(ip2 + (i + 5) * is2));
+                npy_int v6 = *((npy_int *)(ip2 + (i + 6) * is2));
+                npy_int v7 = *((npy_int *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise combine: 8 partial results -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_int *)op1) = SCALAR_OP(*((npy_int *)op1), m0);  // merge with the value already in the output
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1));
+            npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2));
+            *((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);  // stored before element i+1 is loaded (see note above)
+            
+#line 431
+            npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1));
+            npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2));
+            *((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1));
+            npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2));
+            *((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1));
+            npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2));
+            *((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // advance all three pointers past the i elements consumed above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // scalar tail: whatever is left, one element at a time
+        const npy_int in1 = *(npy_int *)ip1;
+        const npy_int in2 = *(npy_int *)ip2;
+        *((npy_int *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // landing point for the SIMD fast-path exits; the scalar paths fall through to here
+    npyv_cleanup();
+#endif
+#if 0  // float-status clearing is compiled out in this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_minimum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For each of dimensions[0] (index, operand) pairs, update in place:
+     * base[index] = SCALAR_OP(base[index], operand).  Always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *operand_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1], operand_stride = steps[2];
+    const npy_intp count = dimensions[0], wrap = steps[3];
+    npy_intp k = 0;
+    while (k < count) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        /* a negative index is shifted by steps[3] before it is used */
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_int *const slot = (npy_int *)(base + base_stride * pos);
+        *slot = SCALAR_OP(*slot, *(const npy_int *)operand_ptr);
+        ++k, idx_ptr += idx_stride, operand_ptr += operand_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_maxp_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_fmax)  // ufunc inner loop: op1[k] = SCALAR_OP(ip1[k], ip2[k]) on npy_int; SCALAR_OP is scalar_maxp_i here
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))  // args/steps: [0]=ip1, [1]=ip2, [2]=op1; dimensions[0]=element count
+{  // NOTE: the enclosing "#if !1 || (0 && 1)" guard is false, so this function is preprocessed out of the build
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // byte pointers: two inputs, one output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // per-element strides, in bytes (added to char pointers)
+             len = dimensions[0];  // number of elements to process
+    npy_intp i = 0;  // how many elements the unrolled scalar code below has consumed
+#ifdef TO_SIMD_SFX  // SIMD fast paths exist only when a lane suffix was selected for this type
+    #undef STYPE  // STYPE is redefined per type: the lane element type for the selected suffix
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {  // reduction call (macro defined elsewhere): this path uses ip2 and op1 only
+        // reduce and contiguous: ip2's byte stride equals the element size
+        if (is2 == sizeof(npy_int)) {
+            TO_SIMD_SFX(simd_reduce_c_maxp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // all len elements handled; skip the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous: hand all len elements to the SIMD kernel
+        if (IS_BINARY_CONT(npy_int, npy_int)) {
+            TO_SIMD_SFX(simd_binary_ccc_maxp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // all len elements handled; skip the scalar paths
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the literal "&& 0" disables this strided-SIMD branch on every target
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_maxp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls: reached whenever no SIMD path above jumped to clear_fp
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // need one full group of 8 to seed the accumulators
+            npy_int m0 = *((npy_int *)(ip2 + (i + 0) * is2));  // m0..m7: eight independent running results
+            npy_int m1 = *((npy_int *)(ip2 + (i + 1) * is2));
+            npy_int m2 = *((npy_int *)(ip2 + (i + 2) * is2));
+            npy_int m3 = *((npy_int *)(ip2 + (i + 3) * is2));
+            npy_int m4 = *((npy_int *)(ip2 + (i + 4) * is2));
+            npy_int m5 = *((npy_int *)(ip2 + (i + 5) * is2));
+            npy_int m6 = *((npy_int *)(ip2 + (i + 6) * is2));
+            npy_int m7 = *((npy_int *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold every further full group of 8 into m0..m7
+                npy_int v0 = *((npy_int *)(ip2 + (i + 0) * is2));
+                npy_int v1 = *((npy_int *)(ip2 + (i + 1) * is2));
+                npy_int v2 = *((npy_int *)(ip2 + (i + 2) * is2));
+                npy_int v3 = *((npy_int *)(ip2 + (i + 3) * is2));
+                npy_int v4 = *((npy_int *)(ip2 + (i + 4) * is2));
+                npy_int v5 = *((npy_int *)(ip2 + (i + 5) * is2));
+                npy_int v6 = *((npy_int *)(ip2 + (i + 6) * is2));
+                npy_int v7 = *((npy_int *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise combine: 8 partial results -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_int *)op1) = SCALAR_OP(*((npy_int *)op1), m0);  // merge with the value already in the output
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1));
+            npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2));
+            *((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);  // stored before element i+1 is loaded (see note above)
+            
+#line 431
+            npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1));
+            npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2));
+            *((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1));
+            npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2));
+            *((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1));
+            npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2));
+            *((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // advance all three pointers past the i elements consumed above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // scalar tail: whatever is left, one element at a time
+        const npy_int in1 = *(npy_int *)ip1;
+        const npy_int in2 = *(npy_int *)ip2;
+        *((npy_int *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // landing point for the SIMD fast-path exits; the scalar paths fall through to here
+    npyv_cleanup();
+#endif
+#if 0  // float-status clearing is compiled out in this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmax_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For each of dimensions[0] (index, operand) pairs, update in place:
+     * base[index] = SCALAR_OP(base[index], operand).  Always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *operand_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1], operand_stride = steps[2];
+    const npy_intp count = dimensions[0], wrap = steps[3];
+    npy_intp k = 0;
+    while (k < count) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        /* a negative index is shifted by steps[3] before it is used */
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_int *const slot = (npy_int *)(base + base_stride * pos);
+        *slot = SCALAR_OP(*slot, *(const npy_int *)operand_ptr);
+        ++k, idx_ptr += idx_stride, operand_ptr += operand_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_minp_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_fmin)  // ufunc inner loop: op1[k] = SCALAR_OP(ip1[k], ip2[k]) on npy_int; SCALAR_OP is scalar_minp_i here
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))  // args/steps: [0]=ip1, [1]=ip2, [2]=op1; dimensions[0]=element count
+{  // NOTE: the enclosing "#if !1 || (0 && 1)" guard is false, so this function is preprocessed out of the build
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // byte pointers: two inputs, one output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // per-element strides, in bytes (added to char pointers)
+             len = dimensions[0];  // number of elements to process
+    npy_intp i = 0;  // how many elements the unrolled scalar code below has consumed
+#ifdef TO_SIMD_SFX  // SIMD fast paths exist only when a lane suffix was selected for this type
+    #undef STYPE  // STYPE is redefined per type: the lane element type for the selected suffix
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {  // reduction call (macro defined elsewhere): this path uses ip2 and op1 only
+        // reduce and contiguous: ip2's byte stride equals the element size
+        if (is2 == sizeof(npy_int)) {
+            TO_SIMD_SFX(simd_reduce_c_minp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // all len elements handled; skip the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous: hand all len elements to the SIMD kernel
+        if (IS_BINARY_CONT(npy_int, npy_int)) {
+            TO_SIMD_SFX(simd_binary_ccc_minp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // all len elements handled; skip the scalar paths
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // the literal "&& 0" disables this strided-SIMD branch on every target
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_minp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls: reached whenever no SIMD path above jumped to clear_fp
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // need one full group of 8 to seed the accumulators
+            npy_int m0 = *((npy_int *)(ip2 + (i + 0) * is2));  // m0..m7: eight independent running results
+            npy_int m1 = *((npy_int *)(ip2 + (i + 1) * is2));
+            npy_int m2 = *((npy_int *)(ip2 + (i + 2) * is2));
+            npy_int m3 = *((npy_int *)(ip2 + (i + 3) * is2));
+            npy_int m4 = *((npy_int *)(ip2 + (i + 4) * is2));
+            npy_int m5 = *((npy_int *)(ip2 + (i + 5) * is2));
+            npy_int m6 = *((npy_int *)(ip2 + (i + 6) * is2));
+            npy_int m7 = *((npy_int *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){  // fold every further full group of 8 into m0..m7
+                npy_int v0 = *((npy_int *)(ip2 + (i + 0) * is2));
+                npy_int v1 = *((npy_int *)(ip2 + (i + 1) * is2));
+                npy_int v2 = *((npy_int *)(ip2 + (i + 2) * is2));
+                npy_int v3 = *((npy_int *)(ip2 + (i + 3) * is2));
+                npy_int v4 = *((npy_int *)(ip2 + (i + 4) * is2));
+                npy_int v5 = *((npy_int *)(ip2 + (i + 5) * is2));
+                npy_int v6 = *((npy_int *)(ip2 + (i + 6) * is2));
+                npy_int v7 = *((npy_int *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1);  // pairwise combine: 8 partial results -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);  // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);  // 2 -> 1
+
+             *((npy_int *)op1) = SCALAR_OP(*((npy_int *)op1), m0);  // merge with the value already in the output
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_int v0 = *((npy_int *)(ip1 + (i + 0) * is1));
+            npy_int u0 = *((npy_int *)(ip2 + (i + 0) * is2));
+            *((npy_int *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);  // stored before element i+1 is loaded (see note above)
+            
+#line 431
+            npy_int v1 = *((npy_int *)(ip1 + (i + 1) * is1));
+            npy_int u1 = *((npy_int *)(ip2 + (i + 1) * is2));
+            *((npy_int *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_int v2 = *((npy_int *)(ip1 + (i + 2) * is1));
+            npy_int u2 = *((npy_int *)(ip2 + (i + 2) * is2));
+            *((npy_int *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_int v3 = *((npy_int *)(ip1 + (i + 3) * is1));
+            npy_int u3 = *((npy_int *)(ip2 + (i + 3) * is2));
+            *((npy_int *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // advance all three pointers past the i elements consumed above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // scalar tail: whatever is left, one element at a time
+        const npy_int in1 = *(npy_int *)ip1;
+        const npy_int in2 = *(npy_int *)ip2;
+        *((npy_int *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // landing point for the SIMD fast-path exits; the scalar paths fall through to here
+    npyv_cleanup();
+#endif
+#if 0  // float-status clearing is compiled out in this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(INT_fmin_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For each of dimensions[0] (index, operand) pairs, update in place:
+     * base[index] = SCALAR_OP(base[index], operand).  Always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *operand_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1], operand_stride = steps[2];
+    const npy_intp count = dimensions[0], wrap = steps[3];
+    npy_intp k = 0;
+    while (k < count) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        /* a negative index is shifted by steps[3] before it is used */
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_int *const slot = (npy_int *)(base + base_stride * pos);
+        *slot = SCALAR_OP(*slot, *(const npy_int *)operand_ptr);
+        ++k, idx_ptr += idx_stride, operand_ptr += operand_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+
+#line 294
+#undef TO_SIMD_SFX /* drop any earlier suffix macro before choosing one keyed on NPY_BITSOF_LONG */
+#if 0
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONG == 8
+    #if 0 /* constant 0: the _f8 branch below is never selected */
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0 /* constant 0: the _u8 branch below is never selected */
+        #define TO_SIMD_SFX(X) X##_u8
+    #else /* the only live branch: append the signed suffix */
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONG == 16 /* same pattern as above, yielding the _s16 suffix */
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONG == 32 /* same pattern as above, yielding the _s32 suffix */
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONG == 64 /* same pattern as above, yielding the _s64 suffix */
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif /* if no case matched, TO_SIMD_SFX stays undefined and the #ifdef TO_SIMD_SFX sections below are skipped */
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_max_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_maximum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{ /*
+   * Loop applying SCALAR_OP (scalar_max_i, see the #define just above) to
+   * npy_long elements.  args[0]/args[1] are the inputs and args[2] the
+   * output; steps[0..2] are the matching byte strides and dimensions[0] is
+   * the element count.  Order of attempts: SIMD kernels (only when
+   * TO_SIMD_SFX is defined), unrolled scalar loops (unless
+   * NPY_DISABLE_OPTIMIZATION is defined), then a one-element-at-a-time
+   * loop for whatever is left.
+   * IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are defined outside
+   * this chunk -- NOTE(review): presumably the reduce case means
+   * args[0] == args[2] with zero strides; confirm against their definitions.
+   */
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    npy_intp i = 0; // number of elements already processed
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_long)) {
+            TO_SIMD_SFX(simd_reduce_c_max)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // whole input handled; skip the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_long, npy_long)) {
+            TO_SIMD_SFX(simd_binary_ccc_max)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0 // the trailing "&& 0" makes this strided SIMD branch always compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_max)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){ // only taken when at least 8 elements are available
+            npy_long m0 = *((npy_long *)(ip2 + (i + 0) * is2));
+            npy_long m1 = *((npy_long *)(ip2 + (i + 1) * is2));
+            npy_long m2 = *((npy_long *)(ip2 + (i + 2) * is2));
+            npy_long m3 = *((npy_long *)(ip2 + (i + 3) * is2));
+            npy_long m4 = *((npy_long *)(ip2 + (i + 4) * is2));
+            npy_long m5 = *((npy_long *)(ip2 + (i + 5) * is2));
+            npy_long m6 = *((npy_long *)(ip2 + (i + 6) * is2));
+            npy_long m7 = *((npy_long *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop; // the first 8 elements now live in m0..m7
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_long v0 = *((npy_long *)(ip2 + (i + 0) * is2));
+                npy_long v1 = *((npy_long *)(ip2 + (i + 1) * is2));
+                npy_long v2 = *((npy_long *)(ip2 + (i + 2) * is2));
+                npy_long v3 = *((npy_long *)(ip2 + (i + 3) * is2));
+                npy_long v4 = *((npy_long *)(ip2 + (i + 4) * is2));
+                npy_long v5 = *((npy_long *)(ip2 + (i + 5) * is2));
+                npy_long v6 = *((npy_long *)(ip2 + (i + 6) * is2));
+                npy_long v7 = *((npy_long *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1); // pairwise-combine the 8 partial results down to m0
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+             *((npy_long *)op1) = SCALAR_OP(*((npy_long *)op1), m0); // fold into the value already stored at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1));
+            npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2));
+            *((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1));
+            npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2));
+            *((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1));
+            npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2));
+            *((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1));
+            npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2));
+            *((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i; // advance all three pointers past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) { // scalar tail: remaining elements one at a time
+        const npy_long in1 = *(npy_long *)ip1;
+        const npy_long in2 = *(npy_long *)ip2;
+        *((npy_long *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp: // target of the SIMD fast-path gotos above
+    npyv_cleanup();
+#endif
+#if 0 // constant 0: the floating-point status clear below is compiled out
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_maximum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /*
+     * Indexed in-place update on npy_long data.  For each of the
+     * dimensions[0] (index, value) pairs, the element of args[0] selected
+     * by the index is replaced with SCALAR_OP(element, value).  Indices are
+     * read as npy_intp from args[1]; a negative index has steps[3] added to
+     * it.  Values are read from args[2].  Always returns 0.
+     */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        if (pos < 0) {
+            pos += wrap;
+        }
+        /* Read both operands first, then overwrite the selected element. */
+        npy_long *const slot = (npy_long *)(base + base_stride * pos);
+        const npy_long current = *slot;
+        const npy_long operand = *(const npy_long *)val_ptr;
+        *slot = SCALAR_OP(current, operand);
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_min_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_minimum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{ /*
+   * Loop applying SCALAR_OP (scalar_min_i, see the #define just above) to
+   * npy_long elements.  args[0]/args[1] are the inputs and args[2] the
+   * output; steps[0..2] are the matching byte strides and dimensions[0] is
+   * the element count.  Order of attempts: SIMD kernels (only when
+   * TO_SIMD_SFX is defined), unrolled scalar loops (unless
+   * NPY_DISABLE_OPTIMIZATION is defined), then a one-element-at-a-time
+   * loop for whatever is left.
+   * IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are defined outside
+   * this chunk -- NOTE(review): presumably the reduce case means
+   * args[0] == args[2] with zero strides; confirm against their definitions.
+   */
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    npy_intp i = 0; // number of elements already processed
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_long)) {
+            TO_SIMD_SFX(simd_reduce_c_min)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // whole input handled; skip the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_long, npy_long)) {
+            TO_SIMD_SFX(simd_binary_ccc_min)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0 // the trailing "&& 0" makes this strided SIMD branch always compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_min)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){ // only taken when at least 8 elements are available
+            npy_long m0 = *((npy_long *)(ip2 + (i + 0) * is2));
+            npy_long m1 = *((npy_long *)(ip2 + (i + 1) * is2));
+            npy_long m2 = *((npy_long *)(ip2 + (i + 2) * is2));
+            npy_long m3 = *((npy_long *)(ip2 + (i + 3) * is2));
+            npy_long m4 = *((npy_long *)(ip2 + (i + 4) * is2));
+            npy_long m5 = *((npy_long *)(ip2 + (i + 5) * is2));
+            npy_long m6 = *((npy_long *)(ip2 + (i + 6) * is2));
+            npy_long m7 = *((npy_long *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop; // the first 8 elements now live in m0..m7
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_long v0 = *((npy_long *)(ip2 + (i + 0) * is2));
+                npy_long v1 = *((npy_long *)(ip2 + (i + 1) * is2));
+                npy_long v2 = *((npy_long *)(ip2 + (i + 2) * is2));
+                npy_long v3 = *((npy_long *)(ip2 + (i + 3) * is2));
+                npy_long v4 = *((npy_long *)(ip2 + (i + 4) * is2));
+                npy_long v5 = *((npy_long *)(ip2 + (i + 5) * is2));
+                npy_long v6 = *((npy_long *)(ip2 + (i + 6) * is2));
+                npy_long v7 = *((npy_long *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1); // pairwise-combine the 8 partial results down to m0
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+             *((npy_long *)op1) = SCALAR_OP(*((npy_long *)op1), m0); // fold into the value already stored at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1));
+            npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2));
+            *((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1));
+            npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2));
+            *((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1));
+            npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2));
+            *((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1));
+            npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2));
+            *((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i; // advance all three pointers past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) { // scalar tail: remaining elements one at a time
+        const npy_long in1 = *(npy_long *)ip1;
+        const npy_long in2 = *(npy_long *)ip2;
+        *((npy_long *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp: // target of the SIMD fast-path gotos above
+    npyv_cleanup();
+#endif
+#if 0 // constant 0: the floating-point status clear below is compiled out
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_minimum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /*
+     * Indexed in-place update on npy_long data.  For each of the
+     * dimensions[0] (index, value) pairs, the element of args[0] selected
+     * by the index is replaced with SCALAR_OP(element, value).  Indices are
+     * read as npy_intp from args[1]; a negative index has steps[3] added to
+     * it.  Values are read from args[2].  Always returns 0.
+     */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        if (pos < 0) {
+            pos += wrap;
+        }
+        /* Read both operands first, then overwrite the selected element. */
+        npy_long *const slot = (npy_long *)(base + base_stride * pos);
+        const npy_long current = *slot;
+        const npy_long operand = *(const npy_long *)val_ptr;
+        *slot = SCALAR_OP(current, operand);
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_maxp_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_fmax)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{ /*
+   * Loop applying SCALAR_OP (scalar_maxp_i, see the #define just above) to
+   * npy_long elements.  NOTE: the enclosing `#if !1 || (0 && 1)` guard is
+   * constant false, so this definition is compiled out.
+   * args[0]/args[1] are the inputs and args[2] the output; steps[0..2] are
+   * the matching byte strides and dimensions[0] is the element count.
+   * Order of attempts: SIMD kernels (only when TO_SIMD_SFX is defined),
+   * unrolled scalar loops (unless NPY_DISABLE_OPTIMIZATION is defined),
+   * then a one-element-at-a-time loop for whatever is left.
+   * IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are defined outside
+   * this chunk -- NOTE(review): presumably the reduce case means
+   * args[0] == args[2] with zero strides; confirm against their definitions.
+   */
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    npy_intp i = 0; // number of elements already processed
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_long)) {
+            TO_SIMD_SFX(simd_reduce_c_maxp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // whole input handled; skip the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_long, npy_long)) {
+            TO_SIMD_SFX(simd_binary_ccc_maxp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0 // the trailing "&& 0" makes this strided SIMD branch always compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_maxp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){ // only taken when at least 8 elements are available
+            npy_long m0 = *((npy_long *)(ip2 + (i + 0) * is2));
+            npy_long m1 = *((npy_long *)(ip2 + (i + 1) * is2));
+            npy_long m2 = *((npy_long *)(ip2 + (i + 2) * is2));
+            npy_long m3 = *((npy_long *)(ip2 + (i + 3) * is2));
+            npy_long m4 = *((npy_long *)(ip2 + (i + 4) * is2));
+            npy_long m5 = *((npy_long *)(ip2 + (i + 5) * is2));
+            npy_long m6 = *((npy_long *)(ip2 + (i + 6) * is2));
+            npy_long m7 = *((npy_long *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop; // the first 8 elements now live in m0..m7
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_long v0 = *((npy_long *)(ip2 + (i + 0) * is2));
+                npy_long v1 = *((npy_long *)(ip2 + (i + 1) * is2));
+                npy_long v2 = *((npy_long *)(ip2 + (i + 2) * is2));
+                npy_long v3 = *((npy_long *)(ip2 + (i + 3) * is2));
+                npy_long v4 = *((npy_long *)(ip2 + (i + 4) * is2));
+                npy_long v5 = *((npy_long *)(ip2 + (i + 5) * is2));
+                npy_long v6 = *((npy_long *)(ip2 + (i + 6) * is2));
+                npy_long v7 = *((npy_long *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1); // pairwise-combine the 8 partial results down to m0
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+             *((npy_long *)op1) = SCALAR_OP(*((npy_long *)op1), m0); // fold into the value already stored at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1));
+            npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2));
+            *((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1));
+            npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2));
+            *((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1));
+            npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2));
+            *((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1));
+            npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2));
+            *((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i; // advance all three pointers past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) { // scalar tail: remaining elements one at a time
+        const npy_long in1 = *(npy_long *)ip1;
+        const npy_long in2 = *(npy_long *)ip2;
+        *((npy_long *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp: // target of the SIMD fast-path gotos above
+    npyv_cleanup();
+#endif
+#if 0 // constant 0: the floating-point status clear below is compiled out
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmax_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /*
+     * Indexed in-place update on npy_long data.  For each of the
+     * dimensions[0] (index, value) pairs, the element of args[0] selected
+     * by the index is replaced with SCALAR_OP(element, value).  Indices are
+     * read as npy_intp from args[1]; a negative index has steps[3] added to
+     * it.  Values are read from args[2].  Always returns 0.
+     */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        if (pos < 0) {
+            pos += wrap;
+        }
+        /* Read both operands first, then overwrite the selected element. */
+        npy_long *const slot = (npy_long *)(base + base_stride * pos);
+        const npy_long current = *slot;
+        const npy_long operand = *(const npy_long *)val_ptr;
+        *slot = SCALAR_OP(current, operand);
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)
+#define SCALAR_OP scalar_minp_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_fmin)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{ /*
+   * Loop applying SCALAR_OP (scalar_minp_i, see the #define just above) to
+   * npy_long elements.  NOTE: the enclosing `#if !1 || (0 && 1)` guard is
+   * constant false, so this definition is compiled out.
+   * args[0]/args[1] are the inputs and args[2] the output; steps[0..2] are
+   * the matching byte strides and dimensions[0] is the element count.
+   * Order of attempts: SIMD kernels (only when TO_SIMD_SFX is defined),
+   * unrolled scalar loops (unless NPY_DISABLE_OPTIMIZATION is defined),
+   * then a one-element-at-a-time loop for whatever is left.
+   * IS_BINARY_REDUCE, IS_BINARY_CONT and is_mem_overlap are defined outside
+   * this chunk -- NOTE(review): presumably the reduce case means
+   * args[0] == args[2] with zero strides; confirm against their definitions.
+   */
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],
+             len = dimensions[0];
+    npy_intp i = 0; // number of elements already processed
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_long)) {
+            TO_SIMD_SFX(simd_reduce_c_minp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // whole input handled; skip the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_long, npy_long)) {
+            TO_SIMD_SFX(simd_binary_ccc_minp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0 // the trailing "&& 0" makes this strided SIMD branch always compiled out
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_minp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){ // only taken when at least 8 elements are available
+            npy_long m0 = *((npy_long *)(ip2 + (i + 0) * is2));
+            npy_long m1 = *((npy_long *)(ip2 + (i + 1) * is2));
+            npy_long m2 = *((npy_long *)(ip2 + (i + 2) * is2));
+            npy_long m3 = *((npy_long *)(ip2 + (i + 3) * is2));
+            npy_long m4 = *((npy_long *)(ip2 + (i + 4) * is2));
+            npy_long m5 = *((npy_long *)(ip2 + (i + 5) * is2));
+            npy_long m6 = *((npy_long *)(ip2 + (i + 6) * is2));
+            npy_long m7 = *((npy_long *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop; // the first 8 elements now live in m0..m7
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_long v0 = *((npy_long *)(ip2 + (i + 0) * is2));
+                npy_long v1 = *((npy_long *)(ip2 + (i + 1) * is2));
+                npy_long v2 = *((npy_long *)(ip2 + (i + 2) * is2));
+                npy_long v3 = *((npy_long *)(ip2 + (i + 3) * is2));
+                npy_long v4 = *((npy_long *)(ip2 + (i + 4) * is2));
+                npy_long v5 = *((npy_long *)(ip2 + (i + 5) * is2));
+                npy_long v6 = *((npy_long *)(ip2 + (i + 6) * is2));
+                npy_long v7 = *((npy_long *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1); // pairwise-combine the 8 partial results down to m0
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+             *((npy_long *)op1) = SCALAR_OP(*((npy_long *)op1), m0); // fold into the value already stored at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_long v0 = *((npy_long *)(ip1 + (i + 0) * is1));
+            npy_long u0 = *((npy_long *)(ip2 + (i + 0) * is2));
+            *((npy_long *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_long v1 = *((npy_long *)(ip1 + (i + 1) * is1));
+            npy_long u1 = *((npy_long *)(ip2 + (i + 1) * is2));
+            *((npy_long *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_long v2 = *((npy_long *)(ip1 + (i + 2) * is1));
+            npy_long u2 = *((npy_long *)(ip2 + (i + 2) * is2));
+            *((npy_long *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_long v3 = *((npy_long *)(ip1 + (i + 3) * is1));
+            npy_long u3 = *((npy_long *)(ip2 + (i + 3) * is2));
+            *((npy_long *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i; // advance all three pointers past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) { // scalar tail: remaining elements one at a time
+        const npy_long in1 = *(npy_long *)ip1;
+        const npy_long in2 = *(npy_long *)ip2;
+        *((npy_long *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp: // target of the SIMD fast-path gotos above
+    npyv_cleanup();
+#endif
+#if 0 // constant 0: the floating-point status clear below is compiled out
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONG_fmin_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /*
+     * Indexed in-place update on npy_long data.  For each of the
+     * dimensions[0] (index, value) pairs, the element of args[0] selected
+     * by the index is replaced with SCALAR_OP(element, value).  Indices are
+     * read as npy_intp from args[1]; a negative index has steps[3] added to
+     * it.  Values are read from args[2].  Always returns 0.
+     */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    const npy_intp wrap = steps[3];
+    const npy_intp count = dimensions[0];
+
+    for (npy_intp k = 0; k < count; ++k) {
+        npy_intp pos = *(const npy_intp *)idx_ptr;
+        if (pos < 0) {
+            pos += wrap;
+        }
+        /* Read both operands first, then overwrite the selected element. */
+        npy_long *const slot = (npy_long *)(base + base_stride * pos);
+        const npy_long current = *slot;
+        const npy_long operand = *(const npy_long *)val_ptr;
+        *slot = SCALAR_OP(current, operand);
+        idx_ptr += idx_stride;
+        val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+
+#line 294
+#undef TO_SIMD_SFX /* drop any earlier suffix macro before choosing one keyed on NPY_BITSOF_LONGLONG */
+#if 0
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8
+    #if 0 /* constant 0: the _f8 branch below is never selected */
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0 /* constant 0: the _u8 branch below is never selected */
+        #define TO_SIMD_SFX(X) X##_u8
+    #else /* the only live branch: append the signed suffix */
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16 /* same pattern as above, yielding the _s16 suffix */
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32 /* same pattern as above, yielding the _s32 suffix */
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64 /* same pattern as above, yielding the _s64 suffix */
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif /* if no case matched, TO_SIMD_SFX stays undefined and the #ifdef TO_SIMD_SFX sections below are skipped */
+
+#line 321
+#if !0 || (0 && 0)
+#define SCALAR_OP scalar_max_i
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_maximum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // npy_longlong loop applying SCALAR_OP: SIMD kernels first, then unrolled scalars, then a per-element tail.
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // byte cursors: input 1, input 2, output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // byte strides matching ip1, ip2, op1
+             len = dimensions[0];
+    npy_intp i = 0;  // count of elements already handled by the unrolled scalar code
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous: the SIMD reducer is given only ip2 and op1
+        if (is2 == sizeof(npy_longlong)) {
+            TO_SIMD_SFX(simd_reduce_c_max)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // done; bypass the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_longlong, npy_longlong)) {
+            TO_SIMD_SFX(simd_binary_ccc_max)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // trailing constant 0: this strided SIMD branch is never compiled here
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_max)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least one full group of 8 is needed to seed m0..m7
+            npy_longlong m0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
+            npy_longlong m1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
+            npy_longlong m2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
+            npy_longlong m3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
+            npy_longlong m4 = *((npy_longlong *)(ip2 + (i + 4) * is2));
+            npy_longlong m5 = *((npy_longlong *)(ip2 + (i + 5) * is2));
+            npy_longlong m6 = *((npy_longlong *)(ip2 + (i + 6) * is2));
+            npy_longlong m7 = *((npy_longlong *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  // the seed group is consumed
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_longlong v0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
+                npy_longlong v1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
+                npy_longlong v2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
+                npy_longlong v3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
+                npy_longlong v4 = *((npy_longlong *)(ip2 + (i + 4) * is2));
+                npy_longlong v5 = *((npy_longlong *)(ip2 + (i + 5) * is2));
+                npy_longlong v6 = *((npy_longlong *)(ip2 + (i + 6) * is2));
+                npy_longlong v7 = *((npy_longlong *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+            // combine the eight partial results pairwise until only m0 remains
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+             *((npy_longlong *)op1) = SCALAR_OP(*((npy_longlong *)op1), m0);  // fold into the value already at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1));
+            npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
+            *((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1));
+            npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
+            *((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1));
+            npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
+            *((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1));
+            npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
+            *((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // move the three cursors past everything the unrolled code handled
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // leftover elements, one per iteration
+        const npy_longlong in1 = *(npy_longlong *)ip1;
+        const npy_longlong in2 = *(npy_longlong *)ip2;
+        *((npy_longlong *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // landing point for the SIMD fast-path gotos above
+    npyv_cleanup();
+#endif
+#if 0  // constant 0: the float-status clear below is compiled out in this expansion
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_maximum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair, combine the value into base[index] with
+     * SCALAR_OP in place; a negative index is shifted by steps[3]. Returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1];
+    const npy_intp val_stride = steps[2], wrap = steps[3];
+    npy_intp remaining = dimensions[0];
+    while (remaining-- > 0) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_longlong *const slot = (npy_longlong *)(base + base_stride * pos);
+        const npy_longlong rhs = *(const npy_longlong *)val_ptr;
+        *slot = SCALAR_OP(*slot, rhs);
+        idx_ptr += idx_stride, val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP  // release the maximum binding before the next expansion rebinds it
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !0 || (0 && 0)  // same guard form as the #endif note above; evaluates to 1
+#define SCALAR_OP scalar_min_i  // operation applied by the LONGLONG_minimum loops that follow
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_minimum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // npy_longlong loop applying SCALAR_OP: SIMD kernels first, then unrolled scalars, then a per-element tail.
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // byte cursors: input 1, input 2, output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // byte strides matching ip1, ip2, op1
+             len = dimensions[0];
+    npy_intp i = 0;  // count of elements already handled by the unrolled scalar code
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous: the SIMD reducer is given only ip2 and op1
+        if (is2 == sizeof(npy_longlong)) {
+            TO_SIMD_SFX(simd_reduce_c_min)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // done; bypass the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_longlong, npy_longlong)) {
+            TO_SIMD_SFX(simd_binary_ccc_min)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // trailing constant 0: this strided SIMD branch is never compiled here
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_min)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least one full group of 8 is needed to seed m0..m7
+            npy_longlong m0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
+            npy_longlong m1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
+            npy_longlong m2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
+            npy_longlong m3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
+            npy_longlong m4 = *((npy_longlong *)(ip2 + (i + 4) * is2));
+            npy_longlong m5 = *((npy_longlong *)(ip2 + (i + 5) * is2));
+            npy_longlong m6 = *((npy_longlong *)(ip2 + (i + 6) * is2));
+            npy_longlong m7 = *((npy_longlong *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  // the seed group is consumed
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_longlong v0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
+                npy_longlong v1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
+                npy_longlong v2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
+                npy_longlong v3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
+                npy_longlong v4 = *((npy_longlong *)(ip2 + (i + 4) * is2));
+                npy_longlong v5 = *((npy_longlong *)(ip2 + (i + 5) * is2));
+                npy_longlong v6 = *((npy_longlong *)(ip2 + (i + 6) * is2));
+                npy_longlong v7 = *((npy_longlong *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+            // combine the eight partial results pairwise until only m0 remains
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+             *((npy_longlong *)op1) = SCALAR_OP(*((npy_longlong *)op1), m0);  // fold into the value already at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1));
+            npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
+            *((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1));
+            npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
+            *((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1));
+            npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
+            *((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1));
+            npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
+            *((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // move the three cursors past everything the unrolled code handled
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // leftover elements, one per iteration
+        const npy_longlong in1 = *(npy_longlong *)ip1;
+        const npy_longlong in2 = *(npy_longlong *)ip2;
+        *((npy_longlong *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // landing point for the SIMD fast-path gotos above
+    npyv_cleanup();
+#endif
+#if 0  // constant 0: the float-status clear below is compiled out in this expansion
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_minimum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair, combine the value into base[index] with
+     * SCALAR_OP in place; a negative index is shifted by steps[3]. Returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1];
+    const npy_intp val_stride = steps[2], wrap = steps[3];
+    npy_intp remaining = dimensions[0];
+    while (remaining-- > 0) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_longlong *const slot = (npy_longlong *)(base + base_stride * pos);
+        const npy_longlong rhs = *(const npy_longlong *)val_ptr;
+        *slot = SCALAR_OP(*slot, rhs);
+        idx_ptr += idx_stride, val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP  // release the minimum binding before the next expansion rebinds it
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)  // evaluates to 0: everything up to the matching #endif (the LONGLONG fmax loops) is compiled out
+#define SCALAR_OP scalar_maxp_i  // binding that the skipped fmax section would use
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_fmax)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // NOTE: the enclosing "#if !1 || (0 && 1)" is 0, so this loop is not compiled. Shape: SIMD kernels, unrolled scalars, tail.
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // byte cursors: input 1, input 2, output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // byte strides matching ip1, ip2, op1
+             len = dimensions[0];
+    npy_intp i = 0;  // count of elements already handled by the unrolled scalar code
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous: the SIMD reducer is given only ip2 and op1
+        if (is2 == sizeof(npy_longlong)) {
+            TO_SIMD_SFX(simd_reduce_c_maxp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // done; bypass the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_longlong, npy_longlong)) {
+            TO_SIMD_SFX(simd_binary_ccc_maxp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // trailing constant 0: this strided SIMD branch is never compiled here
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_maxp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least one full group of 8 is needed to seed m0..m7
+            npy_longlong m0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
+            npy_longlong m1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
+            npy_longlong m2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
+            npy_longlong m3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
+            npy_longlong m4 = *((npy_longlong *)(ip2 + (i + 4) * is2));
+            npy_longlong m5 = *((npy_longlong *)(ip2 + (i + 5) * is2));
+            npy_longlong m6 = *((npy_longlong *)(ip2 + (i + 6) * is2));
+            npy_longlong m7 = *((npy_longlong *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  // the seed group is consumed
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_longlong v0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
+                npy_longlong v1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
+                npy_longlong v2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
+                npy_longlong v3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
+                npy_longlong v4 = *((npy_longlong *)(ip2 + (i + 4) * is2));
+                npy_longlong v5 = *((npy_longlong *)(ip2 + (i + 5) * is2));
+                npy_longlong v6 = *((npy_longlong *)(ip2 + (i + 6) * is2));
+                npy_longlong v7 = *((npy_longlong *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+            // combine the eight partial results pairwise until only m0 remains
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+             *((npy_longlong *)op1) = SCALAR_OP(*((npy_longlong *)op1), m0);  // fold into the value already at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1));
+            npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
+            *((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1));
+            npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
+            *((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1));
+            npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
+            *((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1));
+            npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
+            *((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // move the three cursors past everything the unrolled code handled
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // leftover elements, one per iteration
+        const npy_longlong in1 = *(npy_longlong *)ip1;
+        const npy_longlong in2 = *(npy_longlong *)ip2;
+        *((npy_longlong *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // landing point for the SIMD fast-path gotos above
+    npyv_cleanup();
+#endif
+#if 0  // constant 0: the float-status clear below is compiled out in this expansion
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmax_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair, combine the value into base[index] with
+     * SCALAR_OP in place; a negative index is shifted by steps[3]. Returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1];
+    const npy_intp val_stride = steps[2], wrap = steps[3];
+    npy_intp remaining = dimensions[0];
+    while (remaining-- > 0) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_longlong *const slot = (npy_longlong *)(base + base_stride * pos);
+        const npy_longlong rhs = *(const npy_longlong *)val_ptr;
+        *slot = SCALAR_OP(*slot, rhs);
+        idx_ptr += idx_stride, val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP  // inside the skipped fmax section, so this line is not processed either
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (0 && 1)  // evaluates to 0: everything up to the matching #endif (the LONGLONG fmin loops) is compiled out
+#define SCALAR_OP scalar_minp_i  // binding that the skipped fmin section would use
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // NOTE: the enclosing "#if !1 || (0 && 1)" is 0, so this loop is not compiled. Shape: SIMD kernels, unrolled scalars, tail.
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // byte cursors: input 1, input 2, output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // byte strides matching ip1, ip2, op1
+             len = dimensions[0];
+    npy_intp i = 0;  // count of elements already handled by the unrolled scalar code
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous: the SIMD reducer is given only ip2 and op1
+        if (is2 == sizeof(npy_longlong)) {
+            TO_SIMD_SFX(simd_reduce_c_minp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // done; bypass the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_longlong, npy_longlong)) {
+            TO_SIMD_SFX(simd_binary_ccc_minp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 0  // trailing constant 0: this strided SIMD branch is never compiled here
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_minp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least one full group of 8 is needed to seed m0..m7
+            npy_longlong m0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
+            npy_longlong m1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
+            npy_longlong m2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
+            npy_longlong m3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
+            npy_longlong m4 = *((npy_longlong *)(ip2 + (i + 4) * is2));
+            npy_longlong m5 = *((npy_longlong *)(ip2 + (i + 5) * is2));
+            npy_longlong m6 = *((npy_longlong *)(ip2 + (i + 6) * is2));
+            npy_longlong m7 = *((npy_longlong *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  // the seed group is consumed
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_longlong v0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
+                npy_longlong v1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
+                npy_longlong v2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
+                npy_longlong v3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
+                npy_longlong v4 = *((npy_longlong *)(ip2 + (i + 4) * is2));
+                npy_longlong v5 = *((npy_longlong *)(ip2 + (i + 5) * is2));
+                npy_longlong v6 = *((npy_longlong *)(ip2 + (i + 6) * is2));
+                npy_longlong v7 = *((npy_longlong *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+            // combine the eight partial results pairwise until only m0 remains
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+             *((npy_longlong *)op1) = SCALAR_OP(*((npy_longlong *)op1), m0);  // fold into the value already at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_longlong v0 = *((npy_longlong *)(ip1 + (i + 0) * is1));
+            npy_longlong u0 = *((npy_longlong *)(ip2 + (i + 0) * is2));
+            *((npy_longlong *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_longlong v1 = *((npy_longlong *)(ip1 + (i + 1) * is1));
+            npy_longlong u1 = *((npy_longlong *)(ip2 + (i + 1) * is2));
+            *((npy_longlong *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_longlong v2 = *((npy_longlong *)(ip1 + (i + 2) * is1));
+            npy_longlong u2 = *((npy_longlong *)(ip2 + (i + 2) * is2));
+            *((npy_longlong *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_longlong v3 = *((npy_longlong *)(ip1 + (i + 3) * is1));
+            npy_longlong u3 = *((npy_longlong *)(ip2 + (i + 3) * is2));
+            *((npy_longlong *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // move the three cursors past everything the unrolled code handled
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // leftover elements, one per iteration
+        const npy_longlong in1 = *(npy_longlong *)ip1;
+        const npy_longlong in2 = *(npy_longlong *)ip2;
+        *((npy_longlong *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // landing point for the SIMD fast-path gotos above
+    npyv_cleanup();
+#endif
+#if 0  // constant 0: the float-status clear below is compiled out in this expansion
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGLONG_fmin_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For each (index, value) pair, combine the value into base[index] with
+     * SCALAR_OP in place; a negative index is shifted by steps[3]. Returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1];
+    const npy_intp val_stride = steps[2], wrap = steps[3];
+    npy_intp remaining = dimensions[0];
+    while (remaining-- > 0) {
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_longlong *const slot = (npy_longlong *)(base + base_stride * pos);
+        const npy_longlong rhs = *(const npy_longlong *)val_ptr;
+        *slot = SCALAR_OP(*slot, rhs);
+        idx_ptr += idx_stride, val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP  // inside the skipped fmin section, so this line is not processed either
+
+#endif // !fp_only || (is_fp && fp_only)
+
+
+#line 294
+#undef TO_SIMD_SFX  // discard any earlier suffix macro before choosing one for the FLOAT loops
+#if 0  // always false; the real candidates are the #elif arms keyed on NPY_BITSOF_FLOAT
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_FLOAT == 8
+    #if 1  // constant 1: the _f suffix arm is used; the "#elif 0" and "#else" arms below are never taken
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_FLOAT == 32 && !NPY_SIMD_F32  // 32-bit float without f32 SIMD: withdraw the macro
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_FLOAT == 64 && !NPY_SIMD_F64  // 64-bit float without f64 SIMD: withdraw the macro
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u8
+    #else
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_FLOAT == 16
+    #if 1  // constant 1: the _f suffix arm is used; the "#elif 0" and "#else" arms below are never taken
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_FLOAT == 32 && !NPY_SIMD_F32  // 32-bit float without f32 SIMD: withdraw the macro
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_FLOAT == 64 && !NPY_SIMD_F64  // 64-bit float without f64 SIMD: withdraw the macro
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_FLOAT == 32
+    #if 1  // constant 1: the _f suffix arm is used; the "#elif 0" and "#else" arms below are never taken
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_FLOAT == 32 && !NPY_SIMD_F32  // 32-bit float without f32 SIMD: withdraw the macro
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_FLOAT == 64 && !NPY_SIMD_F64  // 64-bit float without f64 SIMD: withdraw the macro
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_FLOAT == 64
+    #if 1  // constant 1: the _f suffix arm is used; the "#elif 0" and "#else" arms below are never taken
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_FLOAT == 32 && !NPY_SIMD_F32  // 32-bit float without f32 SIMD: withdraw the macro
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_FLOAT == 64 && !NPY_SIMD_F64  // 64-bit float without f64 SIMD: withdraw the macro
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif  // if no arm matched or the macro was withdrawn, TO_SIMD_SFX stays undefined and "#ifdef TO_SIMD_SFX" code is skipped
+
+#line 321
+#if !0 || (1 && 0)  // instance of "!fp_only || (is_fp && fp_only)" as named on the other #endif notes; evaluates to 1
+#define SCALAR_OP scalar_max_f  // operation applied by the FLOAT_maximum loops that follow
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_maximum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // npy_float loop applying SCALAR_OP: SIMD kernels first, then unrolled scalars, then a per-element tail.
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // byte cursors: input 1, input 2, output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // byte strides matching ip1, ip2, op1
+             len = dimensions[0];
+    npy_intp i = 0;  // count of elements already handled by the unrolled scalar code
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous: the SIMD reducer is given only ip2 and op1
+        if (is2 == sizeof(npy_float)) {
+            TO_SIMD_SFX(simd_reduce_c_max)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // done; bypass the scalar paths
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_float, npy_float)) {
+            TO_SIMD_SFX(simd_binary_ccc_max)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 1  // trailing constant 1: strided SIMD branch is compiled unless NPY_HAVE_NEON is defined
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_max)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least one full group of 8 is needed to seed m0..m7
+            npy_float m0 = *((npy_float *)(ip2 + (i + 0) * is2));
+            npy_float m1 = *((npy_float *)(ip2 + (i + 1) * is2));
+            npy_float m2 = *((npy_float *)(ip2 + (i + 2) * is2));
+            npy_float m3 = *((npy_float *)(ip2 + (i + 3) * is2));
+            npy_float m4 = *((npy_float *)(ip2 + (i + 4) * is2));
+            npy_float m5 = *((npy_float *)(ip2 + (i + 5) * is2));
+            npy_float m6 = *((npy_float *)(ip2 + (i + 6) * is2));
+            npy_float m7 = *((npy_float *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;  // the seed group is consumed
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_float v0 = *((npy_float *)(ip2 + (i + 0) * is2));
+                npy_float v1 = *((npy_float *)(ip2 + (i + 1) * is2));
+                npy_float v2 = *((npy_float *)(ip2 + (i + 2) * is2));
+                npy_float v3 = *((npy_float *)(ip2 + (i + 3) * is2));
+                npy_float v4 = *((npy_float *)(ip2 + (i + 4) * is2));
+                npy_float v5 = *((npy_float *)(ip2 + (i + 5) * is2));
+                npy_float v6 = *((npy_float *)(ip2 + (i + 6) * is2));
+                npy_float v7 = *((npy_float *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+            // combine the eight partial results pairwise until only m0 remains
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+             *((npy_float *)op1) = SCALAR_OP(*((npy_float *)op1), m0);  // fold into the value already at op1
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1));
+            npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2));
+            *((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1));
+            npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2));
+            *((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1));
+            npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2));
+            *((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1));
+            npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2));
+            *((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // move the three cursors past everything the unrolled code handled
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // leftover elements, one per iteration
+        const npy_float in1 = *(npy_float *)ip1;
+        const npy_float in2 = *(npy_float *)ip2;
+        *((npy_float *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // landing point for the SIMD fast-path gotos above
+    npyv_cleanup();
+#endif
+#if 1  // constant 1: the float-status clear runs on every exit, including after a jump to clear_fp
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_maximum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For each k in [0, dimensions[0]): dst[idx[k]] = SCALAR_OP(dst[idx[k]], val[k]); returns 0. */
+    char *const dst = args[0];
+    const char *idx_cursor = args[1];
+    const char *val_cursor = args[2];
+    const npy_intp dst_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    /* steps[3] is added to a negative index before the index is used. */
+    const npy_intp wrap = steps[3];
+    for (npy_intp left = dimensions[0]; left > 0; --left) {
+        const npy_intp raw = *(const npy_intp *)idx_cursor;
+        npy_float *const slot = (npy_float *)(dst + dst_stride * (raw < 0 ? raw + wrap : raw));
+        *slot = SCALAR_OP(*slot, *(const npy_float *)val_cursor);
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !0 || (1 && 0)
+#define SCALAR_OP scalar_min_f
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_minimum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{ // Applies SCALAR_OP (scalar_min_f in this section) to npy_float data: args = {in1, in2, out}, steps = their strides, dimensions[0] = element count.
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], // strides are in bytes: they are added to char* pointers below
+             len = dimensions[0];
+    npy_intp i = 0; // number of elements already handled by an unrolled scalar path
+#ifdef TO_SIMD_SFX // vector paths are compiled only when a SIMD suffix macro is defined for this type
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype) // lane type used to cast the raw char* operands
+    if (IS_BINARY_REDUCE) { // NOTE(review): macro defined elsewhere; presumably true when out acts as a single accumulator for in2 -- confirm
+        // reduce and contiguous
+        if (is2 == sizeof(npy_float)) {
+            TO_SIMD_SFX(simd_reduce_c_min)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // done: skip every scalar path below
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) { // element-wise vector paths are tried only when neither input overlaps the output
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_float, npy_float)) {
+            TO_SIMD_SFX(simd_binary_ccc_min)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 1
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) && // byte strides are converted to element strides here
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_min)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){ // at least 8 elements are needed to seed the accumulators; otherwise i stays 0
+            npy_float m0 = *((npy_float *)(ip2 + (i + 0) * is2)); // m0..m7: eight independent partial results seeded from the first 8 elements of ip2
+            npy_float m1 = *((npy_float *)(ip2 + (i + 1) * is2));
+            npy_float m2 = *((npy_float *)(ip2 + (i + 2) * is2));
+            npy_float m3 = *((npy_float *)(ip2 + (i + 3) * is2));
+            npy_float m4 = *((npy_float *)(ip2 + (i + 4) * is2));
+            npy_float m5 = *((npy_float *)(ip2 + (i + 5) * is2));
+            npy_float m6 = *((npy_float *)(ip2 + (i + 6) * is2));
+            npy_float m7 = *((npy_float *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ // fold every further full group of 8 into the accumulators
+                npy_float v0 = *((npy_float *)(ip2 + (i + 0) * is2));
+                npy_float v1 = *((npy_float *)(ip2 + (i + 1) * is2));
+                npy_float v2 = *((npy_float *)(ip2 + (i + 2) * is2));
+                npy_float v3 = *((npy_float *)(ip2 + (i + 3) * is2));
+                npy_float v4 = *((npy_float *)(ip2 + (i + 4) * is2));
+                npy_float v5 = *((npy_float *)(ip2 + (i + 5) * is2));
+                npy_float v6 = *((npy_float *)(ip2 + (i + 6) * is2));
+                npy_float v7 = *((npy_float *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1); // pairwise combine: 8 partial results -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2); // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4); // 2 -> 1
+
+             *((npy_float *)op1) = SCALAR_OP(*((npy_float *)op1), m0); // merge with the value already stored in the output
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1));
+            npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2));
+            *((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); // stored before element 1 is loaded
+            
+#line 431
+            npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1));
+            npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2));
+            *((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1));
+            npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2));
+            *((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1));
+            npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2));
+            *((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i; // advance all three pointers past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) { // plain scalar loop for the remainder (everything when i == 0)
+        const npy_float in1 = *(npy_float *)ip1;
+        const npy_float in2 = *(npy_float *)ip2;
+        *((npy_float *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp: // target of the gotos taken after a SIMD path; the scalar paths fall through to here
+    npyv_cleanup();
+#endif
+#if 1 // NOTE(review): constant condition, looks template-generated; always enabled in this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions); // presumably discards FP status flags raised by the loop -- confirm against its definition
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_minimum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For each k in [0, dimensions[0]): dst[idx[k]] = SCALAR_OP(dst[idx[k]], val[k]); returns 0. */
+    char *const dst = args[0];
+    const char *idx_cursor = args[1];
+    const char *val_cursor = args[2];
+    const npy_intp dst_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    /* steps[3] is added to a negative index before the index is used. */
+    const npy_intp wrap = steps[3];
+    for (npy_intp left = dimensions[0]; left > 0; --left) {
+        const npy_intp raw = *(const npy_intp *)idx_cursor;
+        npy_float *const slot = (npy_float *)(dst + dst_stride * (raw < 0 ? raw + wrap : raw));
+        *slot = SCALAR_OP(*slot, *(const npy_float *)val_cursor);
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (1 && 1)
+#define SCALAR_OP scalar_maxp_f
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_fmax)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{ // Applies SCALAR_OP (scalar_maxp_f in this section) to npy_float data: args = {in1, in2, out}, steps = their strides, dimensions[0] = element count.
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], // strides are in bytes: they are added to char* pointers below
+             len = dimensions[0];
+    npy_intp i = 0; // number of elements already handled by an unrolled scalar path
+#ifdef TO_SIMD_SFX // vector paths are compiled only when a SIMD suffix macro is defined for this type
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype) // lane type used to cast the raw char* operands
+    if (IS_BINARY_REDUCE) { // NOTE(review): macro defined elsewhere; presumably true when out acts as a single accumulator for in2 -- confirm
+        // reduce and contiguous
+        if (is2 == sizeof(npy_float)) {
+            TO_SIMD_SFX(simd_reduce_c_maxp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // done: skip every scalar path below
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) { // element-wise vector paths are tried only when neither input overlaps the output
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_float, npy_float)) {
+            TO_SIMD_SFX(simd_binary_ccc_maxp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 1
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) && // byte strides are converted to element strides here
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_maxp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){ // at least 8 elements are needed to seed the accumulators; otherwise i stays 0
+            npy_float m0 = *((npy_float *)(ip2 + (i + 0) * is2)); // m0..m7: eight independent partial results seeded from the first 8 elements of ip2
+            npy_float m1 = *((npy_float *)(ip2 + (i + 1) * is2));
+            npy_float m2 = *((npy_float *)(ip2 + (i + 2) * is2));
+            npy_float m3 = *((npy_float *)(ip2 + (i + 3) * is2));
+            npy_float m4 = *((npy_float *)(ip2 + (i + 4) * is2));
+            npy_float m5 = *((npy_float *)(ip2 + (i + 5) * is2));
+            npy_float m6 = *((npy_float *)(ip2 + (i + 6) * is2));
+            npy_float m7 = *((npy_float *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ // fold every further full group of 8 into the accumulators
+                npy_float v0 = *((npy_float *)(ip2 + (i + 0) * is2));
+                npy_float v1 = *((npy_float *)(ip2 + (i + 1) * is2));
+                npy_float v2 = *((npy_float *)(ip2 + (i + 2) * is2));
+                npy_float v3 = *((npy_float *)(ip2 + (i + 3) * is2));
+                npy_float v4 = *((npy_float *)(ip2 + (i + 4) * is2));
+                npy_float v5 = *((npy_float *)(ip2 + (i + 5) * is2));
+                npy_float v6 = *((npy_float *)(ip2 + (i + 6) * is2));
+                npy_float v7 = *((npy_float *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1); // pairwise combine: 8 partial results -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2); // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4); // 2 -> 1
+
+             *((npy_float *)op1) = SCALAR_OP(*((npy_float *)op1), m0); // merge with the value already stored in the output
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1));
+            npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2));
+            *((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); // stored before element 1 is loaded
+            
+#line 431
+            npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1));
+            npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2));
+            *((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1));
+            npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2));
+            *((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1));
+            npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2));
+            *((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i; // advance all three pointers past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) { // plain scalar loop for the remainder (everything when i == 0)
+        const npy_float in1 = *(npy_float *)ip1;
+        const npy_float in2 = *(npy_float *)ip2;
+        *((npy_float *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp: // target of the gotos taken after a SIMD path; the scalar paths fall through to here
+    npyv_cleanup();
+#endif
+#if 1 // NOTE(review): constant condition, looks template-generated; always enabled in this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions); // presumably discards FP status flags raised by the loop -- confirm against its definition
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmax_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For each k in [0, dimensions[0]): dst[idx[k]] = SCALAR_OP(dst[idx[k]], val[k]); returns 0. */
+    char *const dst = args[0];
+    const char *idx_cursor = args[1];
+    const char *val_cursor = args[2];
+    const npy_intp dst_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    /* steps[3] is added to a negative index before the index is used. */
+    const npy_intp wrap = steps[3];
+    for (npy_intp left = dimensions[0]; left > 0; --left) {
+        const npy_intp raw = *(const npy_intp *)idx_cursor;
+        npy_float *const slot = (npy_float *)(dst + dst_stride * (raw < 0 ? raw + wrap : raw));
+        *slot = SCALAR_OP(*slot, *(const npy_float *)val_cursor);
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (1 && 1)
+#define SCALAR_OP scalar_minp_f
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_fmin)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{ // Applies SCALAR_OP (scalar_minp_f in this section) to npy_float data: args = {in1, in2, out}, steps = their strides, dimensions[0] = element count.
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], // strides are in bytes: they are added to char* pointers below
+             len = dimensions[0];
+    npy_intp i = 0; // number of elements already handled by an unrolled scalar path
+#ifdef TO_SIMD_SFX // vector paths are compiled only when a SIMD suffix macro is defined for this type
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype) // lane type used to cast the raw char* operands
+    if (IS_BINARY_REDUCE) { // NOTE(review): macro defined elsewhere; presumably true when out acts as a single accumulator for in2 -- confirm
+        // reduce and contiguous
+        if (is2 == sizeof(npy_float)) {
+            TO_SIMD_SFX(simd_reduce_c_minp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // done: skip every scalar path below
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) { // element-wise vector paths are tried only when neither input overlaps the output
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_float, npy_float)) {
+            TO_SIMD_SFX(simd_binary_ccc_minp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 1
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) && // byte strides are converted to element strides here
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_minp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){ // at least 8 elements are needed to seed the accumulators; otherwise i stays 0
+            npy_float m0 = *((npy_float *)(ip2 + (i + 0) * is2)); // m0..m7: eight independent partial results seeded from the first 8 elements of ip2
+            npy_float m1 = *((npy_float *)(ip2 + (i + 1) * is2));
+            npy_float m2 = *((npy_float *)(ip2 + (i + 2) * is2));
+            npy_float m3 = *((npy_float *)(ip2 + (i + 3) * is2));
+            npy_float m4 = *((npy_float *)(ip2 + (i + 4) * is2));
+            npy_float m5 = *((npy_float *)(ip2 + (i + 5) * is2));
+            npy_float m6 = *((npy_float *)(ip2 + (i + 6) * is2));
+            npy_float m7 = *((npy_float *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ // fold every further full group of 8 into the accumulators
+                npy_float v0 = *((npy_float *)(ip2 + (i + 0) * is2));
+                npy_float v1 = *((npy_float *)(ip2 + (i + 1) * is2));
+                npy_float v2 = *((npy_float *)(ip2 + (i + 2) * is2));
+                npy_float v3 = *((npy_float *)(ip2 + (i + 3) * is2));
+                npy_float v4 = *((npy_float *)(ip2 + (i + 4) * is2));
+                npy_float v5 = *((npy_float *)(ip2 + (i + 5) * is2));
+                npy_float v6 = *((npy_float *)(ip2 + (i + 6) * is2));
+                npy_float v7 = *((npy_float *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1); // pairwise combine: 8 partial results -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2); // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4); // 2 -> 1
+
+             *((npy_float *)op1) = SCALAR_OP(*((npy_float *)op1), m0); // merge with the value already stored in the output
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_float v0 = *((npy_float *)(ip1 + (i + 0) * is1));
+            npy_float u0 = *((npy_float *)(ip2 + (i + 0) * is2));
+            *((npy_float *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); // stored before element 1 is loaded
+            
+#line 431
+            npy_float v1 = *((npy_float *)(ip1 + (i + 1) * is1));
+            npy_float u1 = *((npy_float *)(ip2 + (i + 1) * is2));
+            *((npy_float *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_float v2 = *((npy_float *)(ip1 + (i + 2) * is1));
+            npy_float u2 = *((npy_float *)(ip2 + (i + 2) * is2));
+            *((npy_float *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_float v3 = *((npy_float *)(ip1 + (i + 3) * is1));
+            npy_float u3 = *((npy_float *)(ip2 + (i + 3) * is2));
+            *((npy_float *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i; // advance all three pointers past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) { // plain scalar loop for the remainder (everything when i == 0)
+        const npy_float in1 = *(npy_float *)ip1;
+        const npy_float in2 = *(npy_float *)ip2;
+        *((npy_float *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp: // target of the gotos taken after a SIMD path; the scalar paths fall through to here
+    npyv_cleanup();
+#endif
+#if 1 // NOTE(review): constant condition, looks template-generated; always enabled in this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions); // presumably discards FP status flags raised by the loop -- confirm against its definition
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(FLOAT_fmin_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For each k in [0, dimensions[0]): dst[idx[k]] = SCALAR_OP(dst[idx[k]], val[k]); returns 0. */
+    char *const dst = args[0];
+    const char *idx_cursor = args[1];
+    const char *val_cursor = args[2];
+    const npy_intp dst_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    /* steps[3] is added to a negative index before the index is used. */
+    const npy_intp wrap = steps[3];
+    for (npy_intp left = dimensions[0]; left > 0; --left) {
+        const npy_intp raw = *(const npy_intp *)idx_cursor;
+        npy_float *const slot = (npy_float *)(dst + dst_stride * (raw < 0 ? raw + wrap : raw));
+        *slot = SCALAR_OP(*slot, *(const npy_float *)val_cursor);
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+
+#line 294
+#undef TO_SIMD_SFX // drop any suffix macro left defined by the preceding section before choosing one for DOUBLE
+#if 0 // never-true opener so that every real case below can be written as an #elif
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 8 // one case per candidate lane width: 8, 16, 32, 64 bits
+    #if 1 // NOTE(review): constant condition, looks template-generated; selects the floating-point suffix branch
+        #define TO_SIMD_SFX(X) X##_f8 // pastes the lane suffix onto X
+        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32 // no f32 SIMD support: leave TO_SIMD_SFX undefined
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64 // no f64 SIMD support: leave TO_SIMD_SFX undefined
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0 // unsigned-integer suffix branch; unreachable because the #if above is 1
+        #define TO_SIMD_SFX(X) X##_u8
+    #else // signed-integer suffix branch; unreachable because the #if above is 1
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 16
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 32
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 64
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f64 // e.g. TO_SIMD_SFX(simd_binary_max) becomes simd_binary_max_f64
+        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64 // when undefined, the `#ifdef TO_SIMD_SFX` vector paths in the loops below are compiled out
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif
+
+#line 321
+#if !0 || (1 && 0)
+#define SCALAR_OP scalar_max_d
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_maximum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{ // Applies SCALAR_OP (scalar_max_d in this section) to npy_double data: args = {in1, in2, out}, steps = their strides, dimensions[0] = element count.
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], // strides are in bytes: they are added to char* pointers below
+             len = dimensions[0];
+    npy_intp i = 0; // number of elements already handled by an unrolled scalar path
+#ifdef TO_SIMD_SFX // vector paths are compiled only when a SIMD suffix macro is defined for this type
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype) // lane type used to cast the raw char* operands
+    if (IS_BINARY_REDUCE) { // NOTE(review): macro defined elsewhere; presumably true when out acts as a single accumulator for in2 -- confirm
+        // reduce and contiguous
+        if (is2 == sizeof(npy_double)) {
+            TO_SIMD_SFX(simd_reduce_c_max)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // done: skip every scalar path below
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) { // element-wise vector paths are tried only when neither input overlaps the output
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_double, npy_double)) {
+            TO_SIMD_SFX(simd_binary_ccc_max)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 1
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) && // byte strides are converted to element strides here
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_max)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){ // at least 8 elements are needed to seed the accumulators; otherwise i stays 0
+            npy_double m0 = *((npy_double *)(ip2 + (i + 0) * is2)); // m0..m7: eight independent partial results seeded from the first 8 elements of ip2
+            npy_double m1 = *((npy_double *)(ip2 + (i + 1) * is2));
+            npy_double m2 = *((npy_double *)(ip2 + (i + 2) * is2));
+            npy_double m3 = *((npy_double *)(ip2 + (i + 3) * is2));
+            npy_double m4 = *((npy_double *)(ip2 + (i + 4) * is2));
+            npy_double m5 = *((npy_double *)(ip2 + (i + 5) * is2));
+            npy_double m6 = *((npy_double *)(ip2 + (i + 6) * is2));
+            npy_double m7 = *((npy_double *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){ // fold every further full group of 8 into the accumulators
+                npy_double v0 = *((npy_double *)(ip2 + (i + 0) * is2));
+                npy_double v1 = *((npy_double *)(ip2 + (i + 1) * is2));
+                npy_double v2 = *((npy_double *)(ip2 + (i + 2) * is2));
+                npy_double v3 = *((npy_double *)(ip2 + (i + 3) * is2));
+                npy_double v4 = *((npy_double *)(ip2 + (i + 4) * is2));
+                npy_double v5 = *((npy_double *)(ip2 + (i + 5) * is2));
+                npy_double v6 = *((npy_double *)(ip2 + (i + 6) * is2));
+                npy_double v7 = *((npy_double *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+
+            m0 = SCALAR_OP(m0, m1); // pairwise combine: 8 partial results -> 4
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2); // 4 -> 2
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4); // 2 -> 1
+
+             *((npy_double *)op1) = SCALAR_OP(*((npy_double *)op1), m0); // merge with the value already stored in the output
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1));
+            npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2));
+            *((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0); // stored before element 1 is loaded
+            
+#line 431
+            npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1));
+            npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2));
+            *((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1));
+            npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2));
+            *((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1));
+            npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2));
+            *((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i; // advance all three pointers past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) { // plain scalar loop for the remainder (everything when i == 0)
+        const npy_double in1 = *(npy_double *)ip1;
+        const npy_double in2 = *(npy_double *)ip2;
+        *((npy_double *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp: // target of the gotos taken after a SIMD path; the scalar paths fall through to here
+    npyv_cleanup();
+#endif
+#if 1 // NOTE(review): constant condition, looks template-generated; always enabled in this instantiation
+    npy_clear_floatstatus_barrier((char*)dimensions); // presumably discards FP status flags raised by the loop -- confirm against its definition
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_maximum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* For each k in [0, dimensions[0]): dst[idx[k]] = SCALAR_OP(dst[idx[k]], val[k]); returns 0. */
+    char *const dst = args[0];
+    const char *idx_cursor = args[1];
+    const char *val_cursor = args[2];
+    const npy_intp dst_stride = steps[0];
+    const npy_intp idx_stride = steps[1];
+    const npy_intp val_stride = steps[2];
+    /* steps[3] is added to a negative index before the index is used. */
+    const npy_intp wrap = steps[3];
+    for (npy_intp left = dimensions[0]; left > 0; --left) {
+        const npy_intp raw = *(const npy_intp *)idx_cursor;
+        npy_double *const slot = (npy_double *)(dst + dst_stride * (raw < 0 ? raw + wrap : raw));
+        *slot = SCALAR_OP(*slot, *(const npy_double *)val_cursor);
+        idx_cursor += idx_stride;
+        val_cursor += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !0 || (1 && 0)
+#define SCALAR_OP scalar_min_d
+/* DOUBLE_minimum: inner loop combining ip1/ip2 element pairs with SCALAR_OP (scalar_min_d here) into op1. */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_minimum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; // two inputs, one output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], // byte strides: applied to the char pointers
+             len = dimensions[0]; // element count
+    npy_intp i = 0; // elements consumed so far by the scalar unrolls
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_double)) {
+            TO_SIMD_SFX(simd_reduce_c_min)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // skip every scalar path below
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_double, npy_double)) {
+            TO_SIMD_SFX(simd_binary_ccc_min)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // skip every scalar path below
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 1
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_min)(
+                (STYPE*)ip1, is1/sizeof(STYPE), // byte strides converted to lane counts
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp; // skip every scalar path below
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){ // need at least 8 elements to seed m0..m7
+            npy_double m0 = *((npy_double *)(ip2 + (i + 0) * is2));
+            npy_double m1 = *((npy_double *)(ip2 + (i + 1) * is2));
+            npy_double m2 = *((npy_double *)(ip2 + (i + 2) * is2));
+            npy_double m3 = *((npy_double *)(ip2 + (i + 3) * is2));
+            npy_double m4 = *((npy_double *)(ip2 + (i + 4) * is2));
+            npy_double m5 = *((npy_double *)(ip2 + (i + 5) * is2));
+            npy_double m6 = *((npy_double *)(ip2 + (i + 6) * is2));
+            npy_double m7 = *((npy_double *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop; // the first 8 elements now live in m0..m7
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_double v0 = *((npy_double *)(ip2 + (i + 0) * is2));
+                npy_double v1 = *((npy_double *)(ip2 + (i + 1) * is2));
+                npy_double v2 = *((npy_double *)(ip2 + (i + 2) * is2));
+                npy_double v3 = *((npy_double *)(ip2 + (i + 3) * is2));
+                npy_double v4 = *((npy_double *)(ip2 + (i + 4) * is2));
+                npy_double v5 = *((npy_double *)(ip2 + (i + 5) * is2));
+                npy_double v6 = *((npy_double *)(ip2 + (i + 6) * is2));
+                npy_double v7 = *((npy_double *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+            /* combine the eight partial results pairwise down to m0 */
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+             *((npy_double *)op1) = SCALAR_OP(*((npy_double *)op1), m0); // fold into the value already in *op1
+        }
+    } else{ // element-wise (non-reduce) case
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1));
+            npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2));
+            *((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1));
+            npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2));
+            *((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1));
+            npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2));
+            *((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1));
+            npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2));
+            *((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i; // step past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) { // remaining elements, one at a time
+        const npy_double in1 = *(npy_double *)ip1;
+        const npy_double in2 = *(npy_double *)ip2;
+        *((npy_double *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp: // landing point for the SIMD early exits
+    npyv_cleanup();
+#endif
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions); // NOTE(review): presumably clears FP status flags set above - confirm
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_minimum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* In-place indexed update: target[index[k]] = SCALAR_OP(target[index[k]], value[k]); always returns 0. */
+    char *const target = args[0];
+    const char *index_ptr = args[1];
+    const char *value_ptr = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], value_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices; presumably the indexed length - confirm */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; ++k) {
+        const npy_intp raw = *(const npy_intp *)index_ptr;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_double *const slot = (npy_double *)(target + target_stride * pos);
+        *slot = SCALAR_OP(*slot, *(const npy_double *)value_ptr);
+        index_ptr += index_stride;
+        value_ptr += value_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (1 && 1)
+#define SCALAR_OP scalar_maxp_d
+/* DOUBLE_fmax: inner loop combining ip1/ip2 element pairs with SCALAR_OP (scalar_maxp_d here) into op1. */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_fmax)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; // two inputs, one output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], // byte strides: applied to the char pointers
+             len = dimensions[0]; // element count
+    npy_intp i = 0; // elements consumed so far by the scalar unrolls
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_double)) {
+            TO_SIMD_SFX(simd_reduce_c_maxp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // skip every scalar path below
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_double, npy_double)) {
+            TO_SIMD_SFX(simd_binary_ccc_maxp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // skip every scalar path below
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 1
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_maxp)(
+                (STYPE*)ip1, is1/sizeof(STYPE), // byte strides converted to lane counts
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp; // skip every scalar path below
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){ // need at least 8 elements to seed m0..m7
+            npy_double m0 = *((npy_double *)(ip2 + (i + 0) * is2));
+            npy_double m1 = *((npy_double *)(ip2 + (i + 1) * is2));
+            npy_double m2 = *((npy_double *)(ip2 + (i + 2) * is2));
+            npy_double m3 = *((npy_double *)(ip2 + (i + 3) * is2));
+            npy_double m4 = *((npy_double *)(ip2 + (i + 4) * is2));
+            npy_double m5 = *((npy_double *)(ip2 + (i + 5) * is2));
+            npy_double m6 = *((npy_double *)(ip2 + (i + 6) * is2));
+            npy_double m7 = *((npy_double *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop; // the first 8 elements now live in m0..m7
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_double v0 = *((npy_double *)(ip2 + (i + 0) * is2));
+                npy_double v1 = *((npy_double *)(ip2 + (i + 1) * is2));
+                npy_double v2 = *((npy_double *)(ip2 + (i + 2) * is2));
+                npy_double v3 = *((npy_double *)(ip2 + (i + 3) * is2));
+                npy_double v4 = *((npy_double *)(ip2 + (i + 4) * is2));
+                npy_double v5 = *((npy_double *)(ip2 + (i + 5) * is2));
+                npy_double v6 = *((npy_double *)(ip2 + (i + 6) * is2));
+                npy_double v7 = *((npy_double *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+            /* combine the eight partial results pairwise down to m0 */
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+             *((npy_double *)op1) = SCALAR_OP(*((npy_double *)op1), m0); // fold into the value already in *op1
+        }
+    } else{ // element-wise (non-reduce) case
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1));
+            npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2));
+            *((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1));
+            npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2));
+            *((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1));
+            npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2));
+            *((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1));
+            npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2));
+            *((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i; // step past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) { // remaining elements, one at a time
+        const npy_double in1 = *(npy_double *)ip1;
+        const npy_double in2 = *(npy_double *)ip2;
+        *((npy_double *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp: // landing point for the SIMD early exits
+    npyv_cleanup();
+#endif
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions); // NOTE(review): presumably clears FP status flags set above - confirm
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmax_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* In-place indexed update: target[index[k]] = SCALAR_OP(target[index[k]], value[k]); always returns 0. */
+    char *const target = args[0];
+    const char *index_ptr = args[1];
+    const char *value_ptr = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], value_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices; presumably the indexed length - confirm */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; ++k) {
+        const npy_intp raw = *(const npy_intp *)index_ptr;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_double *const slot = (npy_double *)(target + target_stride * pos);
+        *slot = SCALAR_OP(*slot, *(const npy_double *)value_ptr);
+        index_ptr += index_stride;
+        value_ptr += value_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (1 && 1)
+#define SCALAR_OP scalar_minp_d
+/* DOUBLE_fmin: inner loop combining ip1/ip2 element pairs with SCALAR_OP (scalar_minp_d here) into op1. */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; // two inputs, one output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], // byte strides: applied to the char pointers
+             len = dimensions[0]; // element count
+    npy_intp i = 0; // elements consumed so far by the scalar unrolls
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_double)) {
+            TO_SIMD_SFX(simd_reduce_c_minp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // skip every scalar path below
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_double, npy_double)) {
+            TO_SIMD_SFX(simd_binary_ccc_minp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // skip every scalar path below
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 1
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_minp)(
+                (STYPE*)ip1, is1/sizeof(STYPE), // byte strides converted to lane counts
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp; // skip every scalar path below
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){ // need at least 8 elements to seed m0..m7
+            npy_double m0 = *((npy_double *)(ip2 + (i + 0) * is2));
+            npy_double m1 = *((npy_double *)(ip2 + (i + 1) * is2));
+            npy_double m2 = *((npy_double *)(ip2 + (i + 2) * is2));
+            npy_double m3 = *((npy_double *)(ip2 + (i + 3) * is2));
+            npy_double m4 = *((npy_double *)(ip2 + (i + 4) * is2));
+            npy_double m5 = *((npy_double *)(ip2 + (i + 5) * is2));
+            npy_double m6 = *((npy_double *)(ip2 + (i + 6) * is2));
+            npy_double m7 = *((npy_double *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop; // the first 8 elements now live in m0..m7
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_double v0 = *((npy_double *)(ip2 + (i + 0) * is2));
+                npy_double v1 = *((npy_double *)(ip2 + (i + 1) * is2));
+                npy_double v2 = *((npy_double *)(ip2 + (i + 2) * is2));
+                npy_double v3 = *((npy_double *)(ip2 + (i + 3) * is2));
+                npy_double v4 = *((npy_double *)(ip2 + (i + 4) * is2));
+                npy_double v5 = *((npy_double *)(ip2 + (i + 5) * is2));
+                npy_double v6 = *((npy_double *)(ip2 + (i + 6) * is2));
+                npy_double v7 = *((npy_double *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+            /* combine the eight partial results pairwise down to m0 */
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+             *((npy_double *)op1) = SCALAR_OP(*((npy_double *)op1), m0); // fold into the value already in *op1
+        }
+    } else{ // element-wise (non-reduce) case
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_double v0 = *((npy_double *)(ip1 + (i + 0) * is1));
+            npy_double u0 = *((npy_double *)(ip2 + (i + 0) * is2));
+            *((npy_double *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_double v1 = *((npy_double *)(ip1 + (i + 1) * is1));
+            npy_double u1 = *((npy_double *)(ip2 + (i + 1) * is2));
+            *((npy_double *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_double v2 = *((npy_double *)(ip1 + (i + 2) * is1));
+            npy_double u2 = *((npy_double *)(ip2 + (i + 2) * is2));
+            *((npy_double *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_double v3 = *((npy_double *)(ip1 + (i + 3) * is1));
+            npy_double u3 = *((npy_double *)(ip2 + (i + 3) * is2));
+            *((npy_double *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i; // step past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) { // remaining elements, one at a time
+        const npy_double in1 = *(npy_double *)ip1;
+        const npy_double in2 = *(npy_double *)ip2;
+        *((npy_double *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp: // landing point for the SIMD early exits
+    npyv_cleanup();
+#endif
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions); // NOTE(review): presumably clears FP status flags set above - confirm
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(DOUBLE_fmin_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* In-place indexed update: target[index[k]] = SCALAR_OP(target[index[k]], value[k]); always returns 0. */
+    char *const target = args[0];
+    const char *index_ptr = args[1];
+    const char *value_ptr = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], value_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices; presumably the indexed length - confirm */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; ++k) {
+        const npy_intp raw = *(const npy_intp *)index_ptr;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_double *const slot = (npy_double *)(target + target_stride * pos);
+        *slot = SCALAR_OP(*slot, *(const npy_double *)value_ptr);
+        index_ptr += index_stride;
+        value_ptr += value_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+/* Pick the SIMD suffix macro for npy_longdouble by its bit width; TO_SIMD_SFX stays undefined if no branch applies. */
+#line 294
+#undef TO_SIMD_SFX
+#if 0
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 8
+    #if 1 // always taken, so the #elif 0 / #else integer suffixes below are never selected
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_LONGDOUBLE == 32 && !NPY_SIMD_F32 // cannot hold here: this branch requires width 8
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGDOUBLE == 64 && !NPY_SIMD_F64 // cannot hold here: this branch requires width 8
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u8
+    #else
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 16
+    #if 1 // always taken
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_LONGDOUBLE == 32 && !NPY_SIMD_F32 // cannot hold here: this branch requires width 16
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGDOUBLE == 64 && !NPY_SIMD_F64 // cannot hold here: this branch requires width 16
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 32
+    #if 1 // always taken
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_LONGDOUBLE == 32 && !NPY_SIMD_F32 // withdraw the suffix when NPY_SIMD_F32 is 0
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGDOUBLE == 64 && !NPY_SIMD_F64 // cannot hold here: this branch requires width 32
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 299
+#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 64
+    #if 1 // always taken
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_LONGDOUBLE == 32 && !NPY_SIMD_F32 // cannot hold here: this branch requires width 64
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGDOUBLE == 64 && !NPY_SIMD_F64 // withdraw the suffix when NPY_SIMD_F64 is 0
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif
+
+#line 321
+#if !0 || (1 && 0)
+#define SCALAR_OP scalar_max_l
+/* LONGDOUBLE_maximum: inner loop combining ip1/ip2 element pairs with SCALAR_OP (scalar_max_l here) into op1. */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_maximum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; // two inputs, one output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], // byte strides: applied to the char pointers
+             len = dimensions[0]; // element count
+    npy_intp i = 0; // elements consumed so far by the scalar unrolls
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_longdouble)) {
+            TO_SIMD_SFX(simd_reduce_c_max)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // skip every scalar path below
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_longdouble, npy_longdouble)) {
+            TO_SIMD_SFX(simd_binary_ccc_max)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // skip every scalar path below
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 1
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_max)(
+                (STYPE*)ip1, is1/sizeof(STYPE), // byte strides converted to lane counts
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp; // skip every scalar path below
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){ // need at least 8 elements to seed m0..m7
+            npy_longdouble m0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
+            npy_longdouble m1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
+            npy_longdouble m2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
+            npy_longdouble m3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
+            npy_longdouble m4 = *((npy_longdouble *)(ip2 + (i + 4) * is2));
+            npy_longdouble m5 = *((npy_longdouble *)(ip2 + (i + 5) * is2));
+            npy_longdouble m6 = *((npy_longdouble *)(ip2 + (i + 6) * is2));
+            npy_longdouble m7 = *((npy_longdouble *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop; // the first 8 elements now live in m0..m7
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_longdouble v0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
+                npy_longdouble v1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
+                npy_longdouble v2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
+                npy_longdouble v3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
+                npy_longdouble v4 = *((npy_longdouble *)(ip2 + (i + 4) * is2));
+                npy_longdouble v5 = *((npy_longdouble *)(ip2 + (i + 5) * is2));
+                npy_longdouble v6 = *((npy_longdouble *)(ip2 + (i + 6) * is2));
+                npy_longdouble v7 = *((npy_longdouble *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+            /* combine the eight partial results pairwise down to m0 */
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+             *((npy_longdouble *)op1) = SCALAR_OP(*((npy_longdouble *)op1), m0); // fold into the value already in *op1
+        }
+    } else{ // element-wise (non-reduce) case
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1));
+            npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
+            *((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1));
+            npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
+            *((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1));
+            npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
+            *((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1));
+            npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
+            *((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i; // step past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) { // remaining elements, one at a time
+        const npy_longdouble in1 = *(npy_longdouble *)ip1;
+        const npy_longdouble in2 = *(npy_longdouble *)ip2;
+        *((npy_longdouble *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp: // landing point for the SIMD early exits
+    npyv_cleanup();
+#endif
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions); // NOTE(review): presumably clears FP status flags set above - confirm
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_maximum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* In-place indexed update: target[index[k]] = SCALAR_OP(target[index[k]], value[k]); always returns 0. */
+    char *const target = args[0];
+    const char *index_ptr = args[1];
+    const char *value_ptr = args[2];
+    const npy_intp target_stride = steps[0], index_stride = steps[1], value_stride = steps[2];
+    const npy_intp wrap = steps[3];  /* added to negative indices; presumably the indexed length - confirm */
+    const npy_intp count = dimensions[0];
+    npy_intp k;
+    for (k = 0; k < count; ++k) {
+        const npy_intp raw = *(const npy_intp *)index_ptr;
+        const npy_intp pos = (raw < 0) ? raw + wrap : raw;
+        npy_longdouble *const slot = (npy_longdouble *)(target + target_stride * pos);
+        *slot = SCALAR_OP(*slot, *(const npy_longdouble *)value_ptr);
+        index_ptr += index_stride;
+        value_ptr += value_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !0 || (1 && 0)
+#define SCALAR_OP scalar_min_l
+/* LONGDOUBLE_minimum: inner loop combining ip1/ip2 element pairs with SCALAR_OP (scalar_min_l here) into op1. */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_minimum)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2]; // two inputs, one output
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], // byte strides: applied to the char pointers
+             len = dimensions[0]; // element count
+    npy_intp i = 0; // elements consumed so far by the scalar unrolls
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduce and contiguous
+        if (is2 == sizeof(npy_longdouble)) {
+            TO_SIMD_SFX(simd_reduce_c_min)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // skip every scalar path below
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // no overlap and operands are binary contiguous
+        if (IS_BINARY_CONT(npy_longdouble, npy_longdouble)) {
+            TO_SIMD_SFX(simd_binary_ccc_min)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp; // skip every scalar path below
+        }
+    // unroll scalars faster than non-contiguous vector load/store on Arm
+    #if !defined(NPY_HAVE_NEON) && 1
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_min)(
+                (STYPE*)ip1, is1/sizeof(STYPE), // byte strides converted to lane counts
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp; // skip every scalar path below
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolls
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){ // need at least 8 elements to seed m0..m7
+            npy_longdouble m0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
+            npy_longdouble m1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
+            npy_longdouble m2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
+            npy_longdouble m3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
+            npy_longdouble m4 = *((npy_longdouble *)(ip2 + (i + 4) * is2));
+            npy_longdouble m5 = *((npy_longdouble *)(ip2 + (i + 5) * is2));
+            npy_longdouble m6 = *((npy_longdouble *)(ip2 + (i + 6) * is2));
+            npy_longdouble m7 = *((npy_longdouble *)(ip2 + (i + 7) * is2));
+
+            i += elemPerLoop; // the first 8 elements now live in m0..m7
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_longdouble v0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
+                npy_longdouble v1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
+                npy_longdouble v2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
+                npy_longdouble v3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
+                npy_longdouble v4 = *((npy_longdouble *)(ip2 + (i + 4) * is2));
+                npy_longdouble v5 = *((npy_longdouble *)(ip2 + (i + 5) * is2));
+                npy_longdouble v6 = *((npy_longdouble *)(ip2 + (i + 6) * is2));
+                npy_longdouble v7 = *((npy_longdouble *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+            /* combine the eight partial results pairwise down to m0 */
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+
+             *((npy_longdouble *)op1) = SCALAR_OP(*((npy_longdouble *)op1), m0); // fold into the value already in *op1
+        }
+    } else{ // element-wise (non-reduce) case
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note, we can't just load all, do all ops, then store all here.
+             * Sometimes ufuncs are called with `accumulate`, which makes the
+             * assumption that previous iterations have finished before next
+             * iteration.  For example, the output of iteration 2 depends on the
+             * result of iteration 1.
+             */
+
+            #line 431
+            npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1));
+            npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
+            *((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1));
+            npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
+            *((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1));
+            npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
+            *((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1));
+            npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
+            *((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i; // step past the i elements handled above
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) { // remaining elements, one at a time
+        const npy_longdouble in1 = *(npy_longdouble *)ip1;
+        const npy_longdouble in2 = *(npy_longdouble *)ip2;
+        *((npy_longdouble *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp: // landing point for the SIMD early exits
+    npyv_cleanup();
+#endif
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions); // NOTE(review): presumably clears FP status flags set above - confirm
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_minimum_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* Indexed in-place update: base[idx[k]] = SCALAR_OP(base[idx[k]], val[k]); always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp count = dimensions[0];
+    const npy_intp extent = steps[3];  /* added to negative indices before use */
+    npy_intp k = 0;
+    while (k < count) {
+        /* Negative indices are shifted by `extent`; no other range check happens here. */
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        const npy_intp pos = (raw < 0) ? raw + extent : raw;
+        npy_longdouble *slot = (npy_longdouble *)(base + base_stride * pos);
+        *slot = SCALAR_OP(*slot, *(const npy_longdouble *)val_ptr);
+        ++k, idx_ptr += idx_stride, val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (1 && 1)
+#define SCALAR_OP scalar_maxp_l
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmax)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // Strided long-double loop: op1[k] = SCALAR_OP(ip1[k], ip2[k]); SCALAR_OP is scalar_maxp_l here.
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // input 1, input 2, output (byte pointers)
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // per-element byte strides
+             len = dimensions[0];  // element count
+    npy_intp i = 0;  // elements already handled; the tail loop resumes from here
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduction whose ip2 run is contiguous: hand the whole run to the SIMD reducer
+        if (is2 == sizeof(npy_longdouble)) {
+            TO_SIMD_SFX(simd_reduce_c_maxp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // everything is done; skip the scalar loops
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // output overlaps neither input; if IS_BINARY_CONT holds, use the contiguous SIMD kernel
+        if (IS_BINARY_CONT(npy_longdouble, npy_longdouble)) {
+            TO_SIMD_SFX(simd_binary_ccc_maxp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // strided SIMD path; compiled out on NEON, where the scalar unrolls below are faster
+    #if !defined(NPY_HAVE_NEON) && 1
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_maxp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolled paths (skipped entirely when NPY_DISABLE_OPTIMIZATION is defined)
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least one full group of 8 is needed to seed m0..m7
+            npy_longdouble m0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
+            npy_longdouble m1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
+            npy_longdouble m2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
+            npy_longdouble m3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
+            npy_longdouble m4 = *((npy_longdouble *)(ip2 + (i + 4) * is2));
+            npy_longdouble m5 = *((npy_longdouble *)(ip2 + (i + 5) * is2));
+            npy_longdouble m6 = *((npy_longdouble *)(ip2 + (i + 6) * is2));
+            npy_longdouble m7 = *((npy_longdouble *)(ip2 + (i + 7) * is2));
+            // the seed group is consumed; fold every further full group of 8 into m0..m7
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_longdouble v0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
+                npy_longdouble v1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
+                npy_longdouble v2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
+                npy_longdouble v3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
+                npy_longdouble v4 = *((npy_longdouble *)(ip2 + (i + 4) * is2));
+                npy_longdouble v5 = *((npy_longdouble *)(ip2 + (i + 5) * is2));
+                npy_longdouble v6 = *((npy_longdouble *)(ip2 + (i + 6) * is2));
+                npy_longdouble v7 = *((npy_longdouble *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+            // combine the eight partial results pairwise (8 -> 4 -> 2 -> 1) into m0
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+            // merge with the value already stored at op1
+             *((npy_longdouble *)op1) = SCALAR_OP(*((npy_longdouble *)op1), m0);
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note: each element is loaded, combined and stored before the next
+             * one is touched; batching all loads, ops and stores is not allowed.
+             * Ufuncs may be called with `accumulate`, which relies on earlier
+             * iterations having finished before the next one starts.  For
+             * example, the output of iteration 2 depends on the result of iteration 1.
+             */
+
+            #line 431
+            npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1));
+            npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
+            *((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1));
+            npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
+            *((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1));
+            npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
+            *((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1));
+            npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
+            *((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // skip the elements the unrolled paths above already handled
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // one-at-a-time tail / fallback
+        const npy_longdouble in1 = *(npy_longdouble *)ip1;
+        const npy_longdouble in2 = *(npy_longdouble *)ip2;
+        *((npy_longdouble *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // the SIMD paths jump here, bypassing the scalar loops
+    npyv_cleanup();
+#endif
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions);  // NOTE(review): presumably discards FP status flags raised by the comparisons — confirm
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmax_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* Indexed in-place update: base[idx[k]] = SCALAR_OP(base[idx[k]], val[k]); always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp count = dimensions[0];
+    const npy_intp extent = steps[3];  /* added to negative indices before use */
+    npy_intp k = 0;
+    while (k < count) {
+        /* Negative indices are shifted by `extent`; no other range check happens here. */
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        const npy_intp pos = (raw < 0) ? raw + extent : raw;
+        npy_longdouble *slot = (npy_longdouble *)(base + base_stride * pos);
+        *slot = SCALAR_OP(*slot, *(const npy_longdouble *)val_ptr);
+        ++k, idx_ptr += idx_stride, val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+#line 321
+#if !1 || (1 && 1)
+#define SCALAR_OP scalar_minp_l
+
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmin)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // Strided long-double loop: op1[k] = SCALAR_OP(ip1[k], ip2[k]); SCALAR_OP is scalar_minp_l here.
+    char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];  // input 1, input 2, output (byte pointers)
+    npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2],  // per-element byte strides
+             len = dimensions[0];  // element count
+    npy_intp i = 0;  // elements already handled; the tail loop resumes from here
+#ifdef TO_SIMD_SFX
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (IS_BINARY_REDUCE) {
+        // reduction whose ip2 run is contiguous: hand the whole run to the SIMD reducer
+        if (is2 == sizeof(npy_longdouble)) {
+            TO_SIMD_SFX(simd_reduce_c_minp)(
+                (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;  // everything is done; skip the scalar loops
+        }
+    }
+    else if (!is_mem_overlap(ip1, is1, op1, os1, len) &&
+        !is_mem_overlap(ip2, is2, op1, os1, len)
+    ) {
+        // output overlaps neither input; if IS_BINARY_CONT holds, use the contiguous SIMD kernel
+        if (IS_BINARY_CONT(npy_longdouble, npy_longdouble)) {
+            TO_SIMD_SFX(simd_binary_ccc_minp)(
+                (STYPE*)ip1, (STYPE*)ip2, (STYPE*)op1, len
+            );
+            goto clear_fp;
+        }
+    // strided SIMD path; compiled out on NEON, where the scalar unrolls below are faster
+    #if !defined(NPY_HAVE_NEON) && 1
+        if (TO_SIMD_SFX(npyv_loadable_stride)(is1/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_loadable_stride)(is2/sizeof(STYPE)) &&
+            TO_SIMD_SFX(npyv_storable_stride)(os1/sizeof(STYPE))
+        ) {
+            TO_SIMD_SFX(simd_binary_minp)(
+                (STYPE*)ip1, is1/sizeof(STYPE),
+                (STYPE*)ip2, is2/sizeof(STYPE),
+                (STYPE*)op1, os1/sizeof(STYPE), len
+            );
+            goto clear_fp;
+        }
+    #endif
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    // scalar unrolled paths (skipped entirely when NPY_DISABLE_OPTIMIZATION is defined)
+    if (IS_BINARY_REDUCE) {
+        // Note, 8x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 8;
+        if((i+elemPerLoop) <= len){  // at least one full group of 8 is needed to seed m0..m7
+            npy_longdouble m0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
+            npy_longdouble m1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
+            npy_longdouble m2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
+            npy_longdouble m3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
+            npy_longdouble m4 = *((npy_longdouble *)(ip2 + (i + 4) * is2));
+            npy_longdouble m5 = *((npy_longdouble *)(ip2 + (i + 5) * is2));
+            npy_longdouble m6 = *((npy_longdouble *)(ip2 + (i + 6) * is2));
+            npy_longdouble m7 = *((npy_longdouble *)(ip2 + (i + 7) * is2));
+            // the seed group is consumed; fold every further full group of 8 into m0..m7
+            i += elemPerLoop;
+            for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+                npy_longdouble v0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
+                npy_longdouble v1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
+                npy_longdouble v2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
+                npy_longdouble v3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
+                npy_longdouble v4 = *((npy_longdouble *)(ip2 + (i + 4) * is2));
+                npy_longdouble v5 = *((npy_longdouble *)(ip2 + (i + 5) * is2));
+                npy_longdouble v6 = *((npy_longdouble *)(ip2 + (i + 6) * is2));
+                npy_longdouble v7 = *((npy_longdouble *)(ip2 + (i + 7) * is2));
+
+                m0 = SCALAR_OP(m0, v0);
+                m1 = SCALAR_OP(m1, v1);
+                m2 = SCALAR_OP(m2, v2);
+                m3 = SCALAR_OP(m3, v3);
+                m4 = SCALAR_OP(m4, v4);
+                m5 = SCALAR_OP(m5, v5);
+                m6 = SCALAR_OP(m6, v6);
+                m7 = SCALAR_OP(m7, v7);
+            }
+            // combine the eight partial results pairwise (8 -> 4 -> 2 -> 1) into m0
+            m0 = SCALAR_OP(m0, m1);
+            m2 = SCALAR_OP(m2, m3);
+            m4 = SCALAR_OP(m4, m5);
+            m6 = SCALAR_OP(m6, m7);
+
+            m0 = SCALAR_OP(m0, m2);
+            m4 = SCALAR_OP(m4, m6);
+
+            m0 = SCALAR_OP(m0, m4);
+            // merge with the value already stored at op1
+             *((npy_longdouble *)op1) = SCALAR_OP(*((npy_longdouble *)op1), m0);
+        }
+    } else{
+        // Note, 4x unroll was chosen for best results on Apple M1
+        npy_intp elemPerLoop = 4;
+        for(; (i+elemPerLoop)<=len; i+=elemPerLoop){
+            /* Note: each element is loaded, combined and stored before the next
+             * one is touched; batching all loads, ops and stores is not allowed.
+             * Ufuncs may be called with `accumulate`, which relies on earlier
+             * iterations having finished before the next one starts.  For
+             * example, the output of iteration 2 depends on the result of iteration 1.
+             */
+
+            #line 431
+            npy_longdouble v0 = *((npy_longdouble *)(ip1 + (i + 0) * is1));
+            npy_longdouble u0 = *((npy_longdouble *)(ip2 + (i + 0) * is2));
+            *((npy_longdouble *)(op1 + (i + 0) * os1)) = SCALAR_OP(v0, u0);
+            
+#line 431
+            npy_longdouble v1 = *((npy_longdouble *)(ip1 + (i + 1) * is1));
+            npy_longdouble u1 = *((npy_longdouble *)(ip2 + (i + 1) * is2));
+            *((npy_longdouble *)(op1 + (i + 1) * os1)) = SCALAR_OP(v1, u1);
+            
+#line 431
+            npy_longdouble v2 = *((npy_longdouble *)(ip1 + (i + 2) * is1));
+            npy_longdouble u2 = *((npy_longdouble *)(ip2 + (i + 2) * is2));
+            *((npy_longdouble *)(op1 + (i + 2) * os1)) = SCALAR_OP(v2, u2);
+            
+#line 431
+            npy_longdouble v3 = *((npy_longdouble *)(ip1 + (i + 3) * is1));
+            npy_longdouble u3 = *((npy_longdouble *)(ip2 + (i + 3) * is2));
+            *((npy_longdouble *)(op1 + (i + 3) * os1)) = SCALAR_OP(v3, u3);
+            
+        }
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    ip1 += is1 * i;  // skip the elements the unrolled paths above already handled
+    ip2 += is2 * i;
+    op1 += os1 * i;
+    for (; i < len; ++i, ip1 += is1, ip2 += is2, op1 += os1) {  // one-at-a-time tail / fallback
+        const npy_longdouble in1 = *(npy_longdouble *)ip1;
+        const npy_longdouble in2 = *(npy_longdouble *)ip2;
+        *((npy_longdouble *)op1) = SCALAR_OP(in1, in2);
+    }
+#ifdef TO_SIMD_SFX
+clear_fp:  // the SIMD paths jump here, bypassing the scalar loops
+    npyv_cleanup();
+#endif
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions);  // NOTE(review): presumably discards FP status flags raised by the comparisons — confirm
+#endif
+}
+
+
+NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_fmin_indexed)
+(PyArrayMethod_Context *NPY_UNUSED(context), char *const *args, npy_intp const *dimensions, npy_intp const *steps, NpyAuxData *NPY_UNUSED(func))
+{
+    /* Indexed in-place update: base[idx[k]] = SCALAR_OP(base[idx[k]], val[k]); always returns 0. */
+    char *const base = args[0];
+    const char *idx_ptr = args[1];
+    const char *val_ptr = args[2];
+    const npy_intp base_stride = steps[0], idx_stride = steps[1], val_stride = steps[2];
+    const npy_intp count = dimensions[0];
+    const npy_intp extent = steps[3];  /* added to negative indices before use */
+    npy_intp k = 0;
+    while (k < count) {
+        /* Negative indices are shifted by `extent`; no other range check happens here. */
+        const npy_intp raw = *(const npy_intp *)idx_ptr;
+        const npy_intp pos = (raw < 0) ? raw + extent : raw;
+        npy_longdouble *slot = (npy_longdouble *)(base + base_stride * pos);
+        *slot = SCALAR_OP(*slot, *(const npy_longdouble *)val_ptr);
+        ++k, idx_ptr += idx_stride, val_ptr += val_stride;
+    }
+    return 0;
+}
+
+#undef SCALAR_OP
+
+#endif // !fp_only || (is_fp && fp_only)
+
+
+
+
diff --git a/numpy/core/src/_generated/loops_modulo.dispatch.c b/numpy/core/src/_generated/loops_modulo.dispatch.c
new file mode 100644
index 000000000000..2c8af2265b0d
--- /dev/null
+++ b/numpy/core/src/_generated/loops_modulo.dispatch.c
@@ -0,0 +1,6059 @@
+#line 1 "numpy/core/src/umath/loops_modulo.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*@targets
+ ** baseline vsx4
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+// True when y == 0 or, if `signed` is nonzero, when x == min_val and y == -1 (quotient overflow).
+#define DIVIDEBYZERO_OVERFLOW_CHECK(x, y, min_val, signed)    \
+    (NPY_UNLIKELY(                                            \
+        (signed)                                            ? \
+        (((y) == 0) || (((x) == (min_val)) && ((y) == -1))) : \
+        ((y) == 0))                                           \
+    )
+// Calls npy_set_floatstatus_divbyzero() when x compares equal to 0; x is parenthesized so expressions are safe.
+#define FLAG_IF_DIVIDEBYZERO(x) do {     \
+    if (NPY_UNLIKELY((x) == 0)) {        \
+        npy_set_floatstatus_divbyzero(); \
+    }                                    \
+} while (0)
+
+
+#if NPY_SIMD && defined(NPY_HAVE_VSX4)
+typedef struct {     // four u32 vectors, held as two pairs
+    npyv_u32x2 hi;   // pair widened from the first u16 half (see vsx4_expand_u32_u8)
+    npyv_u32x2 lo;   // pair widened from the second u16 half
+} vsx4_u32x4;
+
+typedef struct {     // four s32 vectors, held as two pairs
+    npyv_s32x2 hi;   // pair widened from the first s16 half (see vsx4_expand_s32_s8)
+    npyv_s32x2 lo;   // pair widened from the second s16 half
+} vsx4_s32x4;
+
+// Widens one signed 8-bit vector into a pair of signed 16-bit vectors.
+NPY_FINLINE npyv_s16x2
+vsx4_expand_s16_s8(npyv_s8 data)
+{
+    npyv_s16x2 widened;
+    widened.val[1] = vec_unpackl(data);  // "low" unpack -> second vector of the pair
+    widened.val[0] = vec_unpackh(data);  // "high" unpack -> first vector of the pair
+    return widened;
+}
+
+// Widens one signed 16-bit vector into a pair of signed 32-bit vectors.
+NPY_FINLINE npyv_s32x2
+vsx4_expand_s32_s16(npyv_s16 data)
+{
+    npyv_s32x2 widened;
+    widened.val[1] = vec_unpackl(data);  // "low" unpack -> second vector of the pair
+    widened.val[0] = vec_unpackh(data);  // "high" unpack -> first vector of the pair
+    return widened;
+}
+
+#line 65
+// Widens one unsigned 8-bit vector to four 32-bit vectors (u8 -> u16 -> u32).
+NPY_FINLINE vsx4_u32x4
+vsx4_expand_u32_u8(npyv_u8 data)
+{
+    const npyv_u16x2 halves = npyv_expand_u16_u8(data);
+    vsx4_u32x4 quad;
+    quad.lo = npyv_expand_u32_u16(halves.val[1]);
+    quad.hi = npyv_expand_u32_u16(halves.val[0]);
+    return quad;
+}
+
+#line 79
+/*
+ * Lane-wise quotient a / b of two unsigned 8-bit vectors.
+ *
+ * Power10 only provides integer vector division for lanes of 32 bits or
+ * wider, so both operands are widened to 4x npyv_u32, divided with vec_div,
+ * and the four quotient vectors are packed back down to a single npyv_u8.
+ * Zero divisors are not special-cased here.
+ */
+NPY_FINLINE npyv_u8
+vsx4_div_u8(npyv_u8 a, npyv_u8 b)
+{
+    const vsx4_u32x4 num = vsx4_expand_u32_u8(a);
+    const vsx4_u32x4 den = vsx4_expand_u32_u8(b);
+    const npyv_u32 q0 = vec_div(num.hi.val[0], den.hi.val[0]);
+    const npyv_u32 q1 = vec_div(num.hi.val[1], den.hi.val[1]);
+    const npyv_u32 q2 = vec_div(num.lo.val[0], den.lo.val[0]);
+    const npyv_u32 q3 = vec_div(num.lo.val[1], den.lo.val[1]);
+    const npyv_u16 upper = vec_pack(q0, q1);
+    const npyv_u16 lower = vec_pack(q2, q3);
+    return vec_pack(upper, lower);
+}
+
+NPY_FINLINE npyv_u8
+vsx4_div_scalar_u8(npyv_u8 a, const vsx4_u32x4 b_expand)
+{
+    // b_expand is the divisor already widened to u32 lanes; only a is widened here
+    const vsx4_u32x4 num = vsx4_expand_u32_u8(a);
+    const npyv_u32 q0 = vec_div(num.hi.val[0], b_expand.hi.val[0]);
+    const npyv_u32 q1 = vec_div(num.hi.val[1], b_expand.hi.val[1]);
+    const npyv_u32 q2 = vec_div(num.lo.val[0], b_expand.lo.val[0]);
+    const npyv_u32 q3 = vec_div(num.lo.val[1], b_expand.lo.val[1]);
+    // narrow the four quotient vectors u32 -> u16 -> u8
+    return vec_pack(vec_pack(q0, q1), vec_pack(q2, q3));
+}
+
+NPY_FINLINE npyv_u16
+vsx4_div_u16(npyv_u16 a, npyv_u16 b)
+{
+    // u16 lanes: widen both operands to u32, divide each half, pack back to u16
+    const npyv_u32x2 num = npyv_expand_u32_u16(a);
+    const npyv_u32x2 den = npyv_expand_u32_u16(b);
+    return vec_pack(vec_div(num.val[0], den.val[0]),
+                    vec_div(num.val[1], den.val[1]));
+}
+
+NPY_FINLINE npyv_u16
+vsx4_div_scalar_u16(npyv_u16 a, const npyv_u32x2 b_expand)
+{
+    // b_expand is the divisor already widened to u32 lanes; only a is widened here
+    const npyv_u32x2 num = npyv_expand_u32_u16(a);
+    return vec_pack(vec_div(num.val[0], b_expand.val[0]),
+                    vec_div(num.val[1], b_expand.val[1]));
+}
+
+#define vsx4_div_u32 vec_div         // u32/u64 lanes need no widening: plain vec_div
+#define vsx4_div_u64 vec_div
+#define vsx4_div_scalar_u32 vec_div  // the *_scalar_* forms are likewise plain vec_div
+#define vsx4_div_scalar_u64 vec_div
+
+#line 79
+/*
+ * Lane-wise remainder a % b of two unsigned 8-bit vectors.
+ *
+ * Power10 only provides integer vector modulo for lanes of 32 bits or
+ * wider, so both operands are widened to 4x npyv_u32, reduced with vec_mod,
+ * and the four remainder vectors are packed back down to a single npyv_u8.
+ * Zero divisors are not special-cased here.
+ */
+NPY_FINLINE npyv_u8
+vsx4_mod_u8(npyv_u8 a, npyv_u8 b)
+{
+    const vsx4_u32x4 num = vsx4_expand_u32_u8(a);
+    const vsx4_u32x4 den = vsx4_expand_u32_u8(b);
+    const npyv_u32 r0 = vec_mod(num.hi.val[0], den.hi.val[0]);
+    const npyv_u32 r1 = vec_mod(num.hi.val[1], den.hi.val[1]);
+    const npyv_u32 r2 = vec_mod(num.lo.val[0], den.lo.val[0]);
+    const npyv_u32 r3 = vec_mod(num.lo.val[1], den.lo.val[1]);
+    const npyv_u16 upper = vec_pack(r0, r1);
+    const npyv_u16 lower = vec_pack(r2, r3);
+    return vec_pack(upper, lower);
+}
+
+NPY_FINLINE npyv_u8
+vsx4_mod_scalar_u8(npyv_u8 a, const vsx4_u32x4 b_expand)
+{
+    // b_expand is the divisor already widened to u32 lanes; only a is widened here
+    const vsx4_u32x4 num = vsx4_expand_u32_u8(a);
+    const npyv_u32 r0 = vec_mod(num.hi.val[0], b_expand.hi.val[0]);
+    const npyv_u32 r1 = vec_mod(num.hi.val[1], b_expand.hi.val[1]);
+    const npyv_u32 r2 = vec_mod(num.lo.val[0], b_expand.lo.val[0]);
+    const npyv_u32 r3 = vec_mod(num.lo.val[1], b_expand.lo.val[1]);
+    // narrow the four remainder vectors u32 -> u16 -> u8
+    return vec_pack(vec_pack(r0, r1), vec_pack(r2, r3));
+}
+
+NPY_FINLINE npyv_u16
+vsx4_mod_u16(npyv_u16 a, npyv_u16 b)
+{
+    // u16 lanes: widen both operands to u32, take vec_mod per half, pack back to u16
+    const npyv_u32x2 num = npyv_expand_u32_u16(a);
+    const npyv_u32x2 den = npyv_expand_u32_u16(b);
+    return vec_pack(vec_mod(num.val[0], den.val[0]),
+                    vec_mod(num.val[1], den.val[1]));
+}
+
+NPY_FINLINE npyv_u16
+vsx4_mod_scalar_u16(npyv_u16 a, const npyv_u32x2 b_expand)
+{
+    // b_expand is the divisor already widened to u32 lanes; only a is widened here
+    const npyv_u32x2 num = npyv_expand_u32_u16(a);
+    return vec_pack(vec_mod(num.val[0], b_expand.val[0]),
+                    vec_mod(num.val[1], b_expand.val[1]));
+}
+
+#define vsx4_mod_u32 vec_mod         // u32/u64 lanes need no widening: plain vec_mod
+#define vsx4_mod_u64 vec_mod
+#define vsx4_mod_scalar_u32 vec_mod  // the *_scalar_* forms are likewise plain vec_mod
+#define vsx4_mod_scalar_u64 vec_mod
+
+
+#line 65
+// Widens one signed 8-bit vector to four 32-bit vectors (s8 -> s16 -> s32).
+NPY_FINLINE vsx4_s32x4
+vsx4_expand_s32_s8(npyv_s8 data)
+{
+    const npyv_s16x2 halves = vsx4_expand_s16_s8(data);
+    vsx4_s32x4 quad;
+    quad.lo = vsx4_expand_s32_s16(halves.val[1]);
+    quad.hi = vsx4_expand_s32_s16(halves.val[0]);
+    return quad;
+}
+
+#line 79
+/*
+ * Lane-wise quotient a / b of two signed 8-bit vectors.
+ *
+ * Power10 only provides integer vector division for lanes of 32 bits or
+ * wider, so both operands are widened to 4x npyv_s32, divided with vec_div,
+ * and the four quotient vectors are packed back down to a single npyv_s8.
+ * Zero divisors are not special-cased here.
+ */
+NPY_FINLINE npyv_s8
+vsx4_div_s8(npyv_s8 a, npyv_s8 b)
+{
+    const vsx4_s32x4 num = vsx4_expand_s32_s8(a);
+    const vsx4_s32x4 den = vsx4_expand_s32_s8(b);
+    const npyv_s32 q0 = vec_div(num.hi.val[0], den.hi.val[0]);
+    const npyv_s32 q1 = vec_div(num.hi.val[1], den.hi.val[1]);
+    const npyv_s32 q2 = vec_div(num.lo.val[0], den.lo.val[0]);
+    const npyv_s32 q3 = vec_div(num.lo.val[1], den.lo.val[1]);
+    const npyv_s16 upper = vec_pack(q0, q1);
+    const npyv_s16 lower = vec_pack(q2, q3);
+    return vec_pack(upper, lower);
+}
+
+NPY_FINLINE npyv_s8
+vsx4_div_scalar_s8(npyv_s8 a, const vsx4_s32x4 b_expand)
+{
+    // b_expand is the divisor already widened to s32 lanes; only a is widened here
+    const vsx4_s32x4 num = vsx4_expand_s32_s8(a);
+    const npyv_s32 q0 = vec_div(num.hi.val[0], b_expand.hi.val[0]);
+    const npyv_s32 q1 = vec_div(num.hi.val[1], b_expand.hi.val[1]);
+    const npyv_s32 q2 = vec_div(num.lo.val[0], b_expand.lo.val[0]);
+    const npyv_s32 q3 = vec_div(num.lo.val[1], b_expand.lo.val[1]);
+    // narrow the four quotient vectors s32 -> s16 -> s8
+    return vec_pack(vec_pack(q0, q1), vec_pack(q2, q3));
+}
+
+NPY_FINLINE npyv_s16
+vsx4_div_s16(npyv_s16 a, npyv_s16 b)
+{
+    // s16 lanes: widen both operands to s32, divide each half, pack back to s16
+    const npyv_s32x2 num = vsx4_expand_s32_s16(a);
+    const npyv_s32x2 den = vsx4_expand_s32_s16(b);
+    return vec_pack(vec_div(num.val[0], den.val[0]),
+                    vec_div(num.val[1], den.val[1]));
+}
+
+NPY_FINLINE npyv_s16
+vsx4_div_scalar_s16(npyv_s16 a, const npyv_s32x2 b_expand)
+{
+    // b_expand is the divisor already widened to s32 lanes; only a is widened here
+    const npyv_s32x2 num = vsx4_expand_s32_s16(a);
+    return vec_pack(vec_div(num.val[0], b_expand.val[0]),
+                    vec_div(num.val[1], b_expand.val[1]));
+}
+
+#define vsx4_div_s32 vec_div         // s32/s64 lanes need no widening: plain vec_div
+#define vsx4_div_s64 vec_div
+#define vsx4_div_scalar_s32 vec_div  // the *_scalar_* forms are likewise plain vec_div
+#define vsx4_div_scalar_s64 vec_div
+
+#line 79
+/*
+ * Lane-wise remainder a % b of two signed 8-bit vectors.
+ *
+ * Power10 only provides integer vector modulo for lanes of 32 bits or
+ * wider, so both operands are widened to 4x npyv_s32, reduced with vec_mod,
+ * and the four remainder vectors are packed back down to a single npyv_s8.
+ * Zero divisors are not special-cased here.
+ */
+NPY_FINLINE npyv_s8
+vsx4_mod_s8(npyv_s8 a, npyv_s8 b)
+{
+    const vsx4_s32x4 num = vsx4_expand_s32_s8(a);
+    const vsx4_s32x4 den = vsx4_expand_s32_s8(b);
+    const npyv_s32 r0 = vec_mod(num.hi.val[0], den.hi.val[0]);
+    const npyv_s32 r1 = vec_mod(num.hi.val[1], den.hi.val[1]);
+    const npyv_s32 r2 = vec_mod(num.lo.val[0], den.lo.val[0]);
+    const npyv_s32 r3 = vec_mod(num.lo.val[1], den.lo.val[1]);
+    const npyv_s16 upper = vec_pack(r0, r1);
+    const npyv_s16 lower = vec_pack(r2, r3);
+    return vec_pack(upper, lower);
+}
+
+NPY_FINLINE npyv_s8
+vsx4_mod_scalar_s8(npyv_s8 a, const vsx4_s32x4 b_expand)
+{
+    // b_expand is the divisor already widened to s32 lanes; only a is widened here
+    const vsx4_s32x4 num = vsx4_expand_s32_s8(a);
+    const npyv_s32 r0 = vec_mod(num.hi.val[0], b_expand.hi.val[0]);
+    const npyv_s32 r1 = vec_mod(num.hi.val[1], b_expand.hi.val[1]);
+    const npyv_s32 r2 = vec_mod(num.lo.val[0], b_expand.lo.val[0]);
+    const npyv_s32 r3 = vec_mod(num.lo.val[1], b_expand.lo.val[1]);
+    // narrow the four remainder vectors s32 -> s16 -> s8
+    return vec_pack(vec_pack(r0, r1), vec_pack(r2, r3));
+}
+
+NPY_FINLINE npyv_s16
+vsx4_mod_s16(npyv_s16 a, npyv_s16 b)
+{
+    // s16 lanes: widen both operands to s32, take vec_mod per half, pack back to s16
+    const npyv_s32x2 num = vsx4_expand_s32_s16(a);
+    const npyv_s32x2 den = vsx4_expand_s32_s16(b);
+    return vec_pack(vec_mod(num.val[0], den.val[0]),
+                    vec_mod(num.val[1], den.val[1]));
+}
+
+NPY_FINLINE npyv_s16
+vsx4_mod_scalar_s16(npyv_s16 a, const npyv_s32x2 b_expand)
+{
+    // b_expand is the divisor already widened to s32 lanes; only a is widened here
+    const npyv_s32x2 num = vsx4_expand_s32_s16(a);
+    return vec_pack(vec_mod(num.val[0], b_expand.val[0]),
+                    vec_mod(num.val[1], b_expand.val[1]));
+}
+
+#define vsx4_mod_s32 vec_mod         // s32/s64 lanes need no widening: plain vec_mod
+#define vsx4_mod_s64 vec_mod
+#define vsx4_mod_scalar_s32 vec_mod  // the *_scalar_* forms are likewise plain vec_mod
+#define vsx4_mod_scalar_s64 vec_mod
+
+
+
+#line 146
+// Widens a u8 divisor vector to 4x u32 once, in the form vsx4_div_scalar_u8 / vsx4_mod_scalar_u8 take
+NPY_FINLINE vsx4_u32x4
+vsx4_divisor_u8(const npyv_u8 vscalar)
+{
+    return vsx4_expand_u32_u8(vscalar);  // same widening those helpers apply to the dividend
+}
+
+#line 146
+// Widens a u16 divisor vector to 2x u32 once, in the form vsx4_div_scalar_u16 / vsx4_mod_scalar_u16 take
+NPY_FINLINE npyv_u32x2
+vsx4_divisor_u16(const npyv_u16 vscalar)
+{
+    return npyv_expand_u32_u16(vscalar);  // same widening those helpers apply to the dividend
+}
+
+#line 146
+// Widens an s8 divisor vector to 4x s32 once, in the form vsx4_div_scalar_s8 / vsx4_mod_scalar_s8 take
+NPY_FINLINE vsx4_s32x4
+vsx4_divisor_s8(const npyv_s8 vscalar)
+{
+    return vsx4_expand_s32_s8(vscalar);  // same widening those helpers apply to the dividend
+}
+
+#line 146
+// Widens an s16 divisor vector to 2x s32 once, in the form vsx4_div_scalar_s16 / vsx4_mod_scalar_s16 take
+NPY_FINLINE npyv_s32x2
+vsx4_divisor_s16(const npyv_s16 vscalar)
+{
+    return vsx4_expand_s32_s16(vscalar);  // same widening those helpers apply to the dividend
+}
+
+
+#line 157
+NPY_FINLINE npyv_u32
+vsx4_divisor_u32(const npyv_u32 vscalar)
+{
+    return vscalar;  // no widening step for u32 lanes: the divisor vector is returned unchanged
+}
+
+#line 157
+NPY_FINLINE npyv_u64
+vsx4_divisor_u64(const npyv_u64 vscalar)
+{
+    return vscalar;  // no widening step for u64 lanes: the divisor vector is returned unchanged
+}
+
+#line 157
+NPY_FINLINE npyv_s32
+vsx4_divisor_s32(const npyv_s32 vscalar)
+{
+    return vscalar;  // no widening step for s32 lanes: the divisor vector is returned unchanged
+}
+
+#line 157
+NPY_FINLINE npyv_s64
+vsx4_divisor_s64(const npyv_s64 vscalar)
+{
+    return vscalar;  // no widening step for s64 lanes: the divisor vector is returned unchanged
+}
+
+
+#line 170
+#line 174
+static inline void
+vsx4_simd_fmod_contig_u8(char **args, npy_intp len)
+{   /* dst1[k] = src1[k] % src2[k] over len contiguous u8 elements; a zero divisor yields 0 and sets the divide-by-zero flag */
+    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1];
+    npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2];
+    const npyv_u8 vzero    = npyv_zero_u8();
+    const int vstep           = npyv_nlanes_u8;
+#if 0 == 2 /* divmod */
+    npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3];
+    npyv_b8 warn          = npyv_cvt_b8_u8(npyv_zero_u8());
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+        npyv_u8 a        = npyv_load_u8(src1);
+        npyv_u8 b        = npyv_load_u8(src2);
+        npyv_u8 quo      = vsx4_div_u8(a, b);
+        npyv_u8 rem      = npyv_sub_u8(a, vec_mul(b, quo));
+        npyv_b8 bzero   = npyv_cmpeq_u8(b, vzero);
+        // when b is 0, forces the remainder to be 0 too
+                        rem = npyv_select_u8(bzero, vzero, rem);
+                       warn = npyv_or_u8(bzero, warn);
+        npyv_store_u8(dst1, quo);
+        npyv_store_u8(dst2, rem);
+    }
+
+    if (!vec_all_eq(warn, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
+        const npyv_lanetype_u8 a = *src1;
+        const npyv_lanetype_u8 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+            *dst2 = 0;
+        } else{
+            *dst1 = a / b;
+            *dst2 = a % b;
+        }
+    }
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep, dst1 += vstep) {
+        npyv_u8 a = npyv_load_u8(src1);
+        npyv_u8 b = npyv_load_u8(src2);
+        npyv_b8 bzero = npyv_cmpeq_u8(b, vzero);  // lanes whose divisor is 0
+        npyv_u8 c = npyv_select_u8(bzero, vzero, vsx4_mod_u8(a, b));  // those lanes store 0, as the scalar tail does
+        npyv_store_u8(dst1, c);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
+            npy_set_floatstatus_divbyzero();
+        }
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u8 a = *src1;
+        const npyv_lanetype_u8 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+        } else{
+            *dst1 = a % b;
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+static inline void
+vsx4_simd_fmod_by_scalar_contig_u8(char **args, npy_intp len)
+{   // dst1[k] = src1[k] % scalar over len contiguous u8 elements; scalar is the single value read from args[1]
+    npyv_lanetype_u8 *src1  = (npyv_lanetype_u8 *) args[0];
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1];  // NOTE(review): never compared with 0 here — presumably the caller guarantees a nonzero divisor; confirm
+    npyv_lanetype_u8 *dst1  = (npyv_lanetype_u8 *) args[2];
+    const int vstep            = npyv_nlanes_u8;  // elements consumed per vector iteration
+    const npyv_u8 vscalar   = npyv_setall_u8(scalar);  // divisor as a vector
+    const vsx4_u32x4 divisor    = vsx4_divisor_u8(vscalar);  // widened once, reused by every iteration
+#if 0 == 2 /* divmod (the condition is false, so this branch is not compiled here) */
+    npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3];
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+        npyv_u8 a   = npyv_load_u8(src1);
+        npyv_u8 quo = vsx4_div_scalar_u8(a, divisor);
+        npyv_u8 rem = npyv_sub_u8(a, vec_mul(vscalar, quo));  // remainder as a - scalar * quotient
+        npyv_store_u8(dst1, quo);
+        npyv_store_u8(dst2, rem);
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
+        const npyv_lanetype_u8 a = *src1;
+        *dst1 = a / scalar;
+        *dst2 = a % scalar;
+    }
+#else /* fmod and remainder (the branch compiled here) */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {  // full vectors
+        npyv_u8 a = npyv_load_u8(src1);
+        npyv_u8 c = vsx4_mod_scalar_u8(a, divisor);
+        npyv_store_u8(dst1, c);
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1) {  // scalar tail: fewer than vstep elements remain
+        const npyv_lanetype_u8 a = *src1;
+        *dst1 = a % scalar;
+    }
+#endif
+    npyv_cleanup();
+}
+
+#line 174
+static inline void
+vsx4_simd_remainder_contig_u8(char **args, npy_intp len)
+{ /* Stores src1[i] % src2[i] in dst1[i] for len contiguous npyv_lanetype_u8 elements; a zero divisor triggers npy_set_floatstatus_divbyzero(). */
+    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; /* dividends */
+    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1]; /* divisors */
+    npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2]; /* output */
+    const npyv_u8 vzero    = npyv_zero_u8(); /* zero vector the divisors are compared against */
+    const int vstep           = npyv_nlanes_u8; /* elements advanced per vector iteration */
+#if 1 == 2 /* divmod: constant-false here, so this branch is compiled out */
+    npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3];
+    npyv_b8 warn          = npyv_cvt_b8_u8(npyv_zero_u8());
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+        npyv_u8 a        = npyv_load_u8(src1);
+        npyv_u8 b        = npyv_load_u8(src2);
+        npyv_u8 quo      = vsx4_div_u8(a, b);
+        npyv_u8 rem      = npyv_sub_u8(a, vec_mul(b, quo));
+        npyv_b8 bzero   = npyv_cmpeq_u8(b, vzero);
+        // when b is 0, forces the remainder to be 0 too
+                        rem = npyv_select_u8(bzero, vzero, rem);
+                       warn = npyv_or_u8(bzero, warn);
+        npyv_store_u8(dst1, quo);
+        npyv_store_u8(dst2, rem);
+    }
+
+    if (!vec_all_eq(warn, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
+        const npyv_lanetype_u8 a = *src1;
+        const npyv_lanetype_u8 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+            *dst2 = 0;
+        } else{
+            *dst1 = a / b;
+            *dst2 = a % b;
+        }
+    }
+#else /* fmod and remainder: the branch compiled for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) { /* vector loop: runs while at least vstep elements remain */
+        npyv_u8 a = npyv_load_u8(src1);
+        npyv_u8 b = npyv_load_u8(src2);
+        npyv_u8 c = vsx4_mod_u8(a, b); /* NOTE(review): lanes with b == 0 are stored exactly as vsx4_mod_u8 returns them, while the scalar tail stores 0 - confirm the two agree */
+        npyv_store_u8(dst1, c);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { /* at least one divisor in this vector is zero */
+            npy_set_floatstatus_divbyzero();
+        }
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
+        const npyv_lanetype_u8 a = *src1;
+        const npyv_lanetype_u8 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) { /* zero divisor: raise the status and store 0 */
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+        } else{
+            *dst1 = a % b;
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+static inline void
+vsx4_simd_remainder_by_scalar_contig_u8(char **args, npy_intp len)
+{ /* Stores src1[i] % scalar in dst1[i] for len contiguous npyv_lanetype_u8 elements. */
+    npyv_lanetype_u8 *src1  = (npyv_lanetype_u8 *) args[0]; /* dividends */
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1]; /* divisor, read once and reused for every element */
+    npyv_lanetype_u8 *dst1  = (npyv_lanetype_u8 *) args[2]; /* output */
+    const int vstep            = npyv_nlanes_u8; /* elements advanced per vector iteration */
+    const npyv_u8 vscalar   = npyv_setall_u8(scalar); /* vector built from the scalar divisor */
+    const vsx4_u32x4 divisor    = vsx4_divisor_u8(vscalar); /* operand consumed by vsx4_mod_scalar_u8 below */
+#if 1 == 2 /* divmod: constant-false here, so this branch is compiled out */
+    npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3];
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+        npyv_u8 a   = npyv_load_u8(src1);
+        npyv_u8 quo = vsx4_div_scalar_u8(a, divisor);
+        npyv_u8 rem = npyv_sub_u8(a, vec_mul(vscalar, quo));
+        npyv_store_u8(dst1, quo);
+        npyv_store_u8(dst2, rem);
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
+        const npyv_lanetype_u8 a = *src1;
+        *dst1 = a / scalar;
+        *dst2 = a % scalar;
+    }
+#else /* fmod and remainder: the branch compiled for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { /* vector loop: runs while at least vstep elements remain */
+        npyv_u8 a = npyv_load_u8(src1);
+        npyv_u8 c = vsx4_mod_scalar_u8(a, divisor);
+        npyv_store_u8(dst1, c);
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++dst1) {
+        const npyv_lanetype_u8 a = *src1;
+        *dst1 = a % scalar; /* NOTE(review): scalar is never checked for zero in this function - presumably the caller rules it out; confirm */
+    }
+#endif
+    npyv_cleanup();
+}
+
+#line 174
+static inline void
+vsx4_simd_divmod_contig_u8(char **args, npy_intp len)
+{ /* Stores src1[i] / src2[i] in dst1[i] and src1[i] % src2[i] in dst2[i] for len contiguous npyv_lanetype_u8 elements; a zero divisor triggers npy_set_floatstatus_divbyzero(). */
+    npyv_lanetype_u8 *src1 = (npyv_lanetype_u8 *) args[0]; /* dividends */
+    npyv_lanetype_u8 *src2 = (npyv_lanetype_u8 *) args[1]; /* divisors */
+    npyv_lanetype_u8 *dst1 = (npyv_lanetype_u8 *) args[2]; /* first output: quotients */
+    const npyv_u8 vzero    = npyv_zero_u8(); /* zero vector the divisors are compared against */
+    const int vstep           = npyv_nlanes_u8; /* elements advanced per vector iteration */
+#if 2 == 2 /* divmod: constant-true here, so this is the compiled branch */
+    npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3]; /* second output: remainders */
+    npyv_b8 warn          = npyv_cvt_b8_u8(npyv_zero_u8()); /* built from a zero vector; ORed with bzero on every iteration */
+    /* vector loop: runs while at least vstep elements remain */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+        npyv_u8 a        = npyv_load_u8(src1);
+        npyv_u8 b        = npyv_load_u8(src2);
+        npyv_u8 quo      = vsx4_div_u8(a, b); /* NOTE(review): quo is stored unmasked, while the scalar tail stores 0 when b == 0 - confirm vsx4_div_u8 agrees */
+        npyv_u8 rem      = npyv_sub_u8(a, vec_mul(b, quo)); /* remainder derived as a - b * quo */
+        npyv_b8 bzero   = npyv_cmpeq_u8(b, vzero); /* compares each divisor with zero */
+        // when b is 0, forces the remainder to be 0 too
+                        rem = npyv_select_u8(bzero, vzero, rem);
+                       warn = npyv_or_u8(bzero, warn);
+        npyv_store_u8(dst1, quo);
+        npyv_store_u8(dst2, rem);
+    }
+    /* the status call is made once here, after the vector loop, not per iteration */
+    if (!vec_all_eq(warn, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
+        const npyv_lanetype_u8 a = *src1;
+        const npyv_lanetype_u8 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) { /* zero divisor: raise the status and store 0 in both outputs */
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+            *dst2 = 0;
+        } else{
+            *dst1 = a / b;
+            *dst2 = a % b;
+        }
+    }
+#else /* fmod and remainder: compiled out for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) {
+        npyv_u8 a = npyv_load_u8(src1);
+        npyv_u8 b = npyv_load_u8(src2);
+        npyv_u8 c = vsx4_mod_u8(a, b);
+        npyv_store_u8(dst1, c);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
+            npy_set_floatstatus_divbyzero();
+        }
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
+        const npyv_lanetype_u8 a = *src1;
+        const npyv_lanetype_u8 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+        } else{
+            *dst1 = a % b;
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+static inline void
+vsx4_simd_divmod_by_scalar_contig_u8(char **args, npy_intp len)
+{ /* Stores src1[i] / scalar in dst1[i] and src1[i] % scalar in dst2[i] for len contiguous npyv_lanetype_u8 elements. */
+    npyv_lanetype_u8 *src1  = (npyv_lanetype_u8 *) args[0]; /* dividends */
+    npyv_lanetype_u8 scalar = *(npyv_lanetype_u8 *) args[1]; /* divisor, read once and reused for every element */
+    npyv_lanetype_u8 *dst1  = (npyv_lanetype_u8 *) args[2]; /* first output: quotients */
+    const int vstep            = npyv_nlanes_u8; /* elements advanced per vector iteration */
+    const npyv_u8 vscalar   = npyv_setall_u8(scalar); /* vector built from the scalar divisor */
+    const vsx4_u32x4 divisor    = vsx4_divisor_u8(vscalar); /* operand consumed by vsx4_div_scalar_u8 below */
+#if 2 == 2 /* divmod: constant-true here, so this is the compiled branch */
+    npyv_lanetype_u8 *dst2 = (npyv_lanetype_u8 *) args[3]; /* second output: remainders */
+    /* vector loop: runs while at least vstep elements remain */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+        npyv_u8 a   = npyv_load_u8(src1);
+        npyv_u8 quo = vsx4_div_scalar_u8(a, divisor);
+        npyv_u8 rem = npyv_sub_u8(a, vec_mul(vscalar, quo)); /* remainder derived as a - scalar * quo */
+        npyv_store_u8(dst1, quo);
+        npyv_store_u8(dst2, rem);
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
+        const npyv_lanetype_u8 a = *src1;
+        *dst1 = a / scalar; /* NOTE(review): scalar is never checked for zero in this function - presumably the caller rules it out; confirm */
+        *dst2 = a % scalar;
+    }
+#else /* fmod and remainder: compiled out for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
+        npyv_u8 a = npyv_load_u8(src1);
+        npyv_u8 c = vsx4_mod_scalar_u8(a, divisor);
+        npyv_store_u8(dst1, c);
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1) {
+        const npyv_lanetype_u8 a = *src1;
+        *dst1 = a % scalar;
+    }
+#endif
+    npyv_cleanup();
+}
+
+
+#line 170
+#line 174
+static inline void
+vsx4_simd_fmod_contig_u16(char **args, npy_intp len)
+{ /* Stores src1[i] % src2[i] in dst1[i] for len contiguous npyv_lanetype_u16 elements; a zero divisor triggers npy_set_floatstatus_divbyzero(). */
+    npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0]; /* dividends */
+    npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1]; /* divisors */
+    npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2]; /* output */
+    const npyv_u16 vzero    = npyv_zero_u16(); /* zero vector the divisors are compared against */
+    const int vstep           = npyv_nlanes_u16; /* elements advanced per vector iteration */
+#if 0 == 2 /* divmod: constant-false here, so this branch is compiled out */
+    npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3];
+    npyv_b16 warn          = npyv_cvt_b16_u16(npyv_zero_u16());
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+        npyv_u16 a        = npyv_load_u16(src1);
+        npyv_u16 b        = npyv_load_u16(src2);
+        npyv_u16 quo      = vsx4_div_u16(a, b);
+        npyv_u16 rem      = npyv_sub_u16(a, vec_mul(b, quo));
+        npyv_b16 bzero   = npyv_cmpeq_u16(b, vzero);
+        // when b is 0, forces the remainder to be 0 too
+                        rem = npyv_select_u16(bzero, vzero, rem);
+                       warn = npyv_or_u16(bzero, warn);
+        npyv_store_u16(dst1, quo);
+        npyv_store_u16(dst2, rem);
+    }
+
+    if (!vec_all_eq(warn, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
+        const npyv_lanetype_u16 a = *src1;
+        const npyv_lanetype_u16 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+            *dst2 = 0;
+        } else{
+            *dst1 = a / b;
+            *dst2 = a % b;
+        }
+    }
+#else /* fmod and remainder: the branch compiled for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) { /* vector loop: runs while at least vstep elements remain */
+        npyv_u16 a = npyv_load_u16(src1);
+        npyv_u16 b = npyv_load_u16(src2);
+        npyv_u16 c = vsx4_mod_u16(a, b); /* NOTE(review): lanes with b == 0 are stored exactly as vsx4_mod_u16 returns them, while the scalar tail stores 0 - confirm the two agree */
+        npyv_store_u16(dst1, c);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { /* at least one divisor in this vector is zero */
+            npy_set_floatstatus_divbyzero();
+        }
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
+        const npyv_lanetype_u16 a = *src1;
+        const npyv_lanetype_u16 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) { /* zero divisor: raise the status and store 0 */
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+        } else{
+            *dst1 = a % b;
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+static inline void
+vsx4_simd_fmod_by_scalar_contig_u16(char **args, npy_intp len)
+{ /* Stores src1[i] % scalar in dst1[i] for len contiguous npyv_lanetype_u16 elements. */
+    npyv_lanetype_u16 *src1  = (npyv_lanetype_u16 *) args[0]; /* dividends */
+    npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[1]; /* divisor, read once and reused for every element */
+    npyv_lanetype_u16 *dst1  = (npyv_lanetype_u16 *) args[2]; /* output */
+    const int vstep            = npyv_nlanes_u16; /* elements advanced per vector iteration */
+    const npyv_u16 vscalar   = npyv_setall_u16(scalar); /* vector built from the scalar divisor */
+    const npyv_u32x2 divisor    = vsx4_divisor_u16(vscalar); /* operand consumed by vsx4_mod_scalar_u16 below */
+#if 0 == 2 /* divmod: constant-false here, so this branch is compiled out */
+    npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3];
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+        npyv_u16 a   = npyv_load_u16(src1);
+        npyv_u16 quo = vsx4_div_scalar_u16(a, divisor);
+        npyv_u16 rem = npyv_sub_u16(a, vec_mul(vscalar, quo));
+        npyv_store_u16(dst1, quo);
+        npyv_store_u16(dst2, rem);
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
+        const npyv_lanetype_u16 a = *src1;
+        *dst1 = a / scalar;
+        *dst2 = a % scalar;
+    }
+#else /* fmod and remainder: the branch compiled for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { /* vector loop: runs while at least vstep elements remain */
+        npyv_u16 a = npyv_load_u16(src1);
+        npyv_u16 c = vsx4_mod_scalar_u16(a, divisor);
+        npyv_store_u16(dst1, c);
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++dst1) {
+        const npyv_lanetype_u16 a = *src1;
+        *dst1 = a % scalar; /* NOTE(review): scalar is never checked for zero in this function - presumably the caller rules it out; confirm */
+    }
+#endif
+    npyv_cleanup();
+}
+
+#line 174
+static inline void
+vsx4_simd_remainder_contig_u16(char **args, npy_intp len)
+{ /* Stores src1[i] % src2[i] in dst1[i] for len contiguous npyv_lanetype_u16 elements; a zero divisor triggers npy_set_floatstatus_divbyzero(). */
+    npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0]; /* dividends */
+    npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1]; /* divisors */
+    npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2]; /* output */
+    const npyv_u16 vzero    = npyv_zero_u16(); /* zero vector the divisors are compared against */
+    const int vstep           = npyv_nlanes_u16; /* elements advanced per vector iteration */
+#if 1 == 2 /* divmod: constant-false here, so this branch is compiled out */
+    npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3];
+    npyv_b16 warn          = npyv_cvt_b16_u16(npyv_zero_u16());
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+        npyv_u16 a        = npyv_load_u16(src1);
+        npyv_u16 b        = npyv_load_u16(src2);
+        npyv_u16 quo      = vsx4_div_u16(a, b);
+        npyv_u16 rem      = npyv_sub_u16(a, vec_mul(b, quo));
+        npyv_b16 bzero   = npyv_cmpeq_u16(b, vzero);
+        // when b is 0, forces the remainder to be 0 too
+                        rem = npyv_select_u16(bzero, vzero, rem);
+                       warn = npyv_or_u16(bzero, warn);
+        npyv_store_u16(dst1, quo);
+        npyv_store_u16(dst2, rem);
+    }
+
+    if (!vec_all_eq(warn, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
+        const npyv_lanetype_u16 a = *src1;
+        const npyv_lanetype_u16 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+            *dst2 = 0;
+        } else{
+            *dst1 = a / b;
+            *dst2 = a % b;
+        }
+    }
+#else /* fmod and remainder: the branch compiled for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) { /* vector loop: runs while at least vstep elements remain */
+        npyv_u16 a = npyv_load_u16(src1);
+        npyv_u16 b = npyv_load_u16(src2);
+        npyv_u16 c = vsx4_mod_u16(a, b); /* NOTE(review): lanes with b == 0 are stored exactly as vsx4_mod_u16 returns them, while the scalar tail stores 0 - confirm the two agree */
+        npyv_store_u16(dst1, c);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { /* at least one divisor in this vector is zero */
+            npy_set_floatstatus_divbyzero();
+        }
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
+        const npyv_lanetype_u16 a = *src1;
+        const npyv_lanetype_u16 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) { /* zero divisor: raise the status and store 0 */
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+        } else{
+            *dst1 = a % b;
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+static inline void
+vsx4_simd_remainder_by_scalar_contig_u16(char **args, npy_intp len)
+{ /* Stores src1[i] % scalar in dst1[i] for len contiguous npyv_lanetype_u16 elements. */
+    npyv_lanetype_u16 *src1  = (npyv_lanetype_u16 *) args[0]; /* dividends */
+    npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[1]; /* divisor, read once and reused for every element */
+    npyv_lanetype_u16 *dst1  = (npyv_lanetype_u16 *) args[2]; /* output */
+    const int vstep            = npyv_nlanes_u16; /* elements advanced per vector iteration */
+    const npyv_u16 vscalar   = npyv_setall_u16(scalar); /* vector built from the scalar divisor */
+    const npyv_u32x2 divisor    = vsx4_divisor_u16(vscalar); /* operand consumed by vsx4_mod_scalar_u16 below */
+#if 1 == 2 /* divmod: constant-false here, so this branch is compiled out */
+    npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3];
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+        npyv_u16 a   = npyv_load_u16(src1);
+        npyv_u16 quo = vsx4_div_scalar_u16(a, divisor);
+        npyv_u16 rem = npyv_sub_u16(a, vec_mul(vscalar, quo));
+        npyv_store_u16(dst1, quo);
+        npyv_store_u16(dst2, rem);
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
+        const npyv_lanetype_u16 a = *src1;
+        *dst1 = a / scalar;
+        *dst2 = a % scalar;
+    }
+#else /* fmod and remainder: the branch compiled for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { /* vector loop: runs while at least vstep elements remain */
+        npyv_u16 a = npyv_load_u16(src1);
+        npyv_u16 c = vsx4_mod_scalar_u16(a, divisor);
+        npyv_store_u16(dst1, c);
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++dst1) {
+        const npyv_lanetype_u16 a = *src1;
+        *dst1 = a % scalar; /* NOTE(review): scalar is never checked for zero in this function - presumably the caller rules it out; confirm */
+    }
+#endif
+    npyv_cleanup();
+}
+
+#line 174
+static inline void
+vsx4_simd_divmod_contig_u16(char **args, npy_intp len)
+{ /* Stores src1[i] / src2[i] in dst1[i] and src1[i] % src2[i] in dst2[i] for len contiguous npyv_lanetype_u16 elements; a zero divisor triggers npy_set_floatstatus_divbyzero(). */
+    npyv_lanetype_u16 *src1 = (npyv_lanetype_u16 *) args[0]; /* dividends */
+    npyv_lanetype_u16 *src2 = (npyv_lanetype_u16 *) args[1]; /* divisors */
+    npyv_lanetype_u16 *dst1 = (npyv_lanetype_u16 *) args[2]; /* first output: quotients */
+    const npyv_u16 vzero    = npyv_zero_u16(); /* zero vector the divisors are compared against */
+    const int vstep           = npyv_nlanes_u16; /* elements advanced per vector iteration */
+#if 2 == 2 /* divmod: constant-true here, so this is the compiled branch */
+    npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3]; /* second output: remainders */
+    npyv_b16 warn          = npyv_cvt_b16_u16(npyv_zero_u16()); /* built from a zero vector; ORed with bzero on every iteration */
+    /* vector loop: runs while at least vstep elements remain */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+        npyv_u16 a        = npyv_load_u16(src1);
+        npyv_u16 b        = npyv_load_u16(src2);
+        npyv_u16 quo      = vsx4_div_u16(a, b); /* NOTE(review): quo is stored unmasked, while the scalar tail stores 0 when b == 0 - confirm vsx4_div_u16 agrees */
+        npyv_u16 rem      = npyv_sub_u16(a, vec_mul(b, quo)); /* remainder derived as a - b * quo */
+        npyv_b16 bzero   = npyv_cmpeq_u16(b, vzero); /* compares each divisor with zero */
+        // when b is 0, forces the remainder to be 0 too
+                        rem = npyv_select_u16(bzero, vzero, rem);
+                       warn = npyv_or_u16(bzero, warn);
+        npyv_store_u16(dst1, quo);
+        npyv_store_u16(dst2, rem);
+    }
+    /* the status call is made once here, after the vector loop, not per iteration */
+    if (!vec_all_eq(warn, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
+        const npyv_lanetype_u16 a = *src1;
+        const npyv_lanetype_u16 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) { /* zero divisor: raise the status and store 0 in both outputs */
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+            *dst2 = 0;
+        } else{
+            *dst1 = a / b;
+            *dst2 = a % b;
+        }
+    }
+#else /* fmod and remainder: compiled out for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) {
+        npyv_u16 a = npyv_load_u16(src1);
+        npyv_u16 b = npyv_load_u16(src2);
+        npyv_u16 c = vsx4_mod_u16(a, b);
+        npyv_store_u16(dst1, c);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
+            npy_set_floatstatus_divbyzero();
+        }
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
+        const npyv_lanetype_u16 a = *src1;
+        const npyv_lanetype_u16 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+        } else{
+            *dst1 = a % b;
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+static inline void
+vsx4_simd_divmod_by_scalar_contig_u16(char **args, npy_intp len)
+{ /* Stores src1[i] / scalar in dst1[i] and src1[i] % scalar in dst2[i] for len contiguous npyv_lanetype_u16 elements. */
+    npyv_lanetype_u16 *src1  = (npyv_lanetype_u16 *) args[0]; /* dividends */
+    npyv_lanetype_u16 scalar = *(npyv_lanetype_u16 *) args[1]; /* divisor, read once and reused for every element */
+    npyv_lanetype_u16 *dst1  = (npyv_lanetype_u16 *) args[2]; /* first output: quotients */
+    const int vstep            = npyv_nlanes_u16; /* elements advanced per vector iteration */
+    const npyv_u16 vscalar   = npyv_setall_u16(scalar); /* vector built from the scalar divisor */
+    const npyv_u32x2 divisor    = vsx4_divisor_u16(vscalar); /* operand consumed by vsx4_div_scalar_u16 below */
+#if 2 == 2 /* divmod: constant-true here, so this is the compiled branch */
+    npyv_lanetype_u16 *dst2 = (npyv_lanetype_u16 *) args[3]; /* second output: remainders */
+    /* vector loop: runs while at least vstep elements remain */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+        npyv_u16 a   = npyv_load_u16(src1);
+        npyv_u16 quo = vsx4_div_scalar_u16(a, divisor);
+        npyv_u16 rem = npyv_sub_u16(a, vec_mul(vscalar, quo)); /* remainder derived as a - scalar * quo */
+        npyv_store_u16(dst1, quo);
+        npyv_store_u16(dst2, rem);
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
+        const npyv_lanetype_u16 a = *src1;
+        *dst1 = a / scalar; /* NOTE(review): scalar is never checked for zero in this function - presumably the caller rules it out; confirm */
+        *dst2 = a % scalar;
+    }
+#else /* fmod and remainder: compiled out for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
+        npyv_u16 a = npyv_load_u16(src1);
+        npyv_u16 c = vsx4_mod_scalar_u16(a, divisor);
+        npyv_store_u16(dst1, c);
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1) {
+        const npyv_lanetype_u16 a = *src1;
+        *dst1 = a % scalar;
+    }
+#endif
+    npyv_cleanup();
+}
+
+
+#line 170
+#line 174
+static inline void
+vsx4_simd_fmod_contig_u32(char **args, npy_intp len)
+{ /* Stores src1[i] % src2[i] in dst1[i] for len contiguous npyv_lanetype_u32 elements; a zero divisor triggers npy_set_floatstatus_divbyzero(). */
+    npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0]; /* dividends */
+    npyv_lanetype_u32 *src2 = (npyv_lanetype_u32 *) args[1]; /* divisors */
+    npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2]; /* output */
+    const npyv_u32 vzero    = npyv_zero_u32(); /* zero vector the divisors are compared against */
+    const int vstep           = npyv_nlanes_u32; /* elements advanced per vector iteration */
+#if 0 == 2 /* divmod: constant-false here, so this branch is compiled out */
+    npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3];
+    npyv_b32 warn          = npyv_cvt_b32_u32(npyv_zero_u32());
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+        npyv_u32 a        = npyv_load_u32(src1);
+        npyv_u32 b        = npyv_load_u32(src2);
+        npyv_u32 quo      = vsx4_div_u32(a, b);
+        npyv_u32 rem      = npyv_sub_u32(a, vec_mul(b, quo));
+        npyv_b32 bzero   = npyv_cmpeq_u32(b, vzero);
+        // when b is 0, forces the remainder to be 0 too
+                        rem = npyv_select_u32(bzero, vzero, rem);
+                       warn = npyv_or_u32(bzero, warn);
+        npyv_store_u32(dst1, quo);
+        npyv_store_u32(dst2, rem);
+    }
+
+    if (!vec_all_eq(warn, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
+        const npyv_lanetype_u32 a = *src1;
+        const npyv_lanetype_u32 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+            *dst2 = 0;
+        } else{
+            *dst1 = a / b;
+            *dst2 = a % b;
+        }
+    }
+#else /* fmod and remainder: the branch compiled for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) { /* vector loop: runs while at least vstep elements remain */
+        npyv_u32 a = npyv_load_u32(src1);
+        npyv_u32 b = npyv_load_u32(src2);
+        npyv_u32 c = vsx4_mod_u32(a, b); /* NOTE(review): lanes with b == 0 are stored exactly as vsx4_mod_u32 returns them, while the scalar tail stores 0 - confirm the two agree */
+        npyv_store_u32(dst1, c);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { /* at least one divisor in this vector is zero */
+            npy_set_floatstatus_divbyzero();
+        }
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
+        const npyv_lanetype_u32 a = *src1;
+        const npyv_lanetype_u32 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) { /* zero divisor: raise the status and store 0 */
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+        } else{
+            *dst1 = a % b;
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+static inline void
+vsx4_simd_fmod_by_scalar_contig_u32(char **args, npy_intp len)
+{ /* Stores src1[i] % scalar in dst1[i] for len contiguous npyv_lanetype_u32 elements. */
+    npyv_lanetype_u32 *src1  = (npyv_lanetype_u32 *) args[0]; /* dividends */
+    npyv_lanetype_u32 scalar = *(npyv_lanetype_u32 *) args[1]; /* divisor, read once and reused for every element */
+    npyv_lanetype_u32 *dst1  = (npyv_lanetype_u32 *) args[2]; /* output */
+    const int vstep            = npyv_nlanes_u32; /* elements advanced per vector iteration */
+    const npyv_u32 vscalar   = npyv_setall_u32(scalar); /* vector built from the scalar divisor */
+    const npyv_u32 divisor    = vsx4_divisor_u32(vscalar); /* operand consumed by vsx4_mod_scalar_u32 below */
+#if 0 == 2 /* divmod: constant-false here, so this branch is compiled out */
+    npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3];
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+        npyv_u32 a   = npyv_load_u32(src1);
+        npyv_u32 quo = vsx4_div_scalar_u32(a, divisor);
+        npyv_u32 rem = npyv_sub_u32(a, vec_mul(vscalar, quo));
+        npyv_store_u32(dst1, quo);
+        npyv_store_u32(dst2, rem);
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
+        const npyv_lanetype_u32 a = *src1;
+        *dst1 = a / scalar;
+        *dst2 = a % scalar;
+    }
+#else /* fmod and remainder: the branch compiled for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { /* vector loop: runs while at least vstep elements remain */
+        npyv_u32 a = npyv_load_u32(src1);
+        npyv_u32 c = vsx4_mod_scalar_u32(a, divisor);
+        npyv_store_u32(dst1, c);
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++dst1) {
+        const npyv_lanetype_u32 a = *src1;
+        *dst1 = a % scalar; /* NOTE(review): scalar is never checked for zero in this function - presumably the caller rules it out; confirm */
+    }
+#endif
+    npyv_cleanup();
+}
+
+#line 174
+static inline void
+vsx4_simd_remainder_contig_u32(char **args, npy_intp len)
+{ /* Stores src1[i] % src2[i] in dst1[i] for len contiguous npyv_lanetype_u32 elements; a zero divisor triggers npy_set_floatstatus_divbyzero(). */
+    npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0]; /* dividends */
+    npyv_lanetype_u32 *src2 = (npyv_lanetype_u32 *) args[1]; /* divisors */
+    npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2]; /* output */
+    const npyv_u32 vzero    = npyv_zero_u32(); /* zero vector the divisors are compared against */
+    const int vstep           = npyv_nlanes_u32; /* elements advanced per vector iteration */
+#if 1 == 2 /* divmod: constant-false here, so this branch is compiled out */
+    npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3];
+    npyv_b32 warn          = npyv_cvt_b32_u32(npyv_zero_u32());
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+        npyv_u32 a        = npyv_load_u32(src1);
+        npyv_u32 b        = npyv_load_u32(src2);
+        npyv_u32 quo      = vsx4_div_u32(a, b);
+        npyv_u32 rem      = npyv_sub_u32(a, vec_mul(b, quo));
+        npyv_b32 bzero   = npyv_cmpeq_u32(b, vzero);
+        // when b is 0, forces the remainder to be 0 too
+                        rem = npyv_select_u32(bzero, vzero, rem);
+                       warn = npyv_or_u32(bzero, warn);
+        npyv_store_u32(dst1, quo);
+        npyv_store_u32(dst2, rem);
+    }
+
+    if (!vec_all_eq(warn, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
+        const npyv_lanetype_u32 a = *src1;
+        const npyv_lanetype_u32 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+            *dst2 = 0;
+        } else{
+            *dst1 = a / b;
+            *dst2 = a % b;
+        }
+    }
+#else /* fmod and remainder: the branch compiled for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) { /* vector loop: runs while at least vstep elements remain */
+        npyv_u32 a = npyv_load_u32(src1);
+        npyv_u32 b = npyv_load_u32(src2);
+        npyv_u32 c = vsx4_mod_u32(a, b); /* NOTE(review): lanes with b == 0 are stored exactly as vsx4_mod_u32 returns them, while the scalar tail stores 0 - confirm the two agree */
+        npyv_store_u32(dst1, c);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { /* at least one divisor in this vector is zero */
+            npy_set_floatstatus_divbyzero();
+        }
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
+        const npyv_lanetype_u32 a = *src1;
+        const npyv_lanetype_u32 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) { /* zero divisor: raise the status and store 0 */
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+        } else{
+            *dst1 = a % b;
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+static inline void
+vsx4_simd_remainder_by_scalar_contig_u32(char **args, npy_intp len)
+{ /* Stores src1[i] % scalar in dst1[i] for len contiguous npyv_lanetype_u32 elements. */
+    npyv_lanetype_u32 *src1  = (npyv_lanetype_u32 *) args[0]; /* dividends */
+    npyv_lanetype_u32 scalar = *(npyv_lanetype_u32 *) args[1]; /* divisor, read once and reused for every element */
+    npyv_lanetype_u32 *dst1  = (npyv_lanetype_u32 *) args[2]; /* output */
+    const int vstep            = npyv_nlanes_u32; /* elements advanced per vector iteration */
+    const npyv_u32 vscalar   = npyv_setall_u32(scalar); /* vector built from the scalar divisor */
+    const npyv_u32 divisor    = vsx4_divisor_u32(vscalar); /* operand consumed by vsx4_mod_scalar_u32 below */
+#if 1 == 2 /* divmod: constant-false here, so this branch is compiled out */
+    npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3];
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+        npyv_u32 a   = npyv_load_u32(src1);
+        npyv_u32 quo = vsx4_div_scalar_u32(a, divisor);
+        npyv_u32 rem = npyv_sub_u32(a, vec_mul(vscalar, quo));
+        npyv_store_u32(dst1, quo);
+        npyv_store_u32(dst2, rem);
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
+        const npyv_lanetype_u32 a = *src1;
+        *dst1 = a / scalar;
+        *dst2 = a % scalar;
+    }
+#else /* fmod and remainder: the branch compiled for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) { /* vector loop: runs while at least vstep elements remain */
+        npyv_u32 a = npyv_load_u32(src1);
+        npyv_u32 c = vsx4_mod_scalar_u32(a, divisor);
+        npyv_store_u32(dst1, c);
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++dst1) {
+        const npyv_lanetype_u32 a = *src1;
+        *dst1 = a % scalar; /* NOTE(review): scalar is never checked for zero in this function - presumably the caller rules it out; confirm */
+    }
+#endif
+    npyv_cleanup();
+}
+
+#line 174
+static inline void
+vsx4_simd_divmod_contig_u32(char **args, npy_intp len)
+{ /* Stores src1[i] / src2[i] in dst1[i] and src1[i] % src2[i] in dst2[i] for len contiguous npyv_lanetype_u32 elements; a zero divisor triggers npy_set_floatstatus_divbyzero(). */
+    npyv_lanetype_u32 *src1 = (npyv_lanetype_u32 *) args[0]; /* dividends */
+    npyv_lanetype_u32 *src2 = (npyv_lanetype_u32 *) args[1]; /* divisors */
+    npyv_lanetype_u32 *dst1 = (npyv_lanetype_u32 *) args[2]; /* first output: quotients */
+    const npyv_u32 vzero    = npyv_zero_u32(); /* zero vector the divisors are compared against */
+    const int vstep           = npyv_nlanes_u32; /* elements advanced per vector iteration */
+#if 2 == 2 /* divmod: constant-true here, so this is the compiled branch */
+    npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3]; /* second output: remainders */
+    npyv_b32 warn          = npyv_cvt_b32_u32(npyv_zero_u32()); /* built from a zero vector; ORed with bzero on every iteration */
+    /* vector loop: runs while at least vstep elements remain */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+        npyv_u32 a        = npyv_load_u32(src1);
+        npyv_u32 b        = npyv_load_u32(src2);
+        npyv_u32 quo      = vsx4_div_u32(a, b); /* NOTE(review): quo is stored unmasked, while the scalar tail stores 0 when b == 0 - confirm vsx4_div_u32 agrees */
+        npyv_u32 rem      = npyv_sub_u32(a, vec_mul(b, quo)); /* remainder derived as a - b * quo */
+        npyv_b32 bzero   = npyv_cmpeq_u32(b, vzero); /* compares each divisor with zero */
+        // when b is 0, forces the remainder to be 0 too
+                        rem = npyv_select_u32(bzero, vzero, rem);
+                       warn = npyv_or_u32(bzero, warn);
+        npyv_store_u32(dst1, quo);
+        npyv_store_u32(dst2, rem);
+    }
+    /* the status call is made once here, after the vector loop, not per iteration */
+    if (!vec_all_eq(warn, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
+        const npyv_lanetype_u32 a = *src1;
+        const npyv_lanetype_u32 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) { /* zero divisor: raise the status and store 0 in both outputs */
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+            *dst2 = 0;
+        } else{
+            *dst1 = a / b;
+            *dst2 = a % b;
+        }
+    }
+#else /* fmod and remainder: compiled out for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) {
+        npyv_u32 a = npyv_load_u32(src1);
+        npyv_u32 b = npyv_load_u32(src2);
+        npyv_u32 c = vsx4_mod_u32(a, b);
+        npyv_store_u32(dst1, c);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
+            npy_set_floatstatus_divbyzero();
+        }
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
+        const npyv_lanetype_u32 a = *src1;
+        const npyv_lanetype_u32 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+        } else{
+            *dst1 = a % b;
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+static inline void
+vsx4_simd_divmod_by_scalar_contig_u32(char **args, npy_intp len)
+{ /* Stores src1[i] / scalar in dst1[i] and src1[i] % scalar in dst2[i] for len contiguous npyv_lanetype_u32 elements. */
+    npyv_lanetype_u32 *src1  = (npyv_lanetype_u32 *) args[0]; /* dividends */
+    npyv_lanetype_u32 scalar = *(npyv_lanetype_u32 *) args[1]; /* divisor, read once and reused for every element */
+    npyv_lanetype_u32 *dst1  = (npyv_lanetype_u32 *) args[2]; /* first output: quotients */
+    const int vstep            = npyv_nlanes_u32; /* elements advanced per vector iteration */
+    const npyv_u32 vscalar   = npyv_setall_u32(scalar); /* vector built from the scalar divisor */
+    const npyv_u32 divisor    = vsx4_divisor_u32(vscalar); /* operand consumed by vsx4_div_scalar_u32 below */
+#if 2 == 2 /* divmod: constant-true here, so this is the compiled branch */
+    npyv_lanetype_u32 *dst2 = (npyv_lanetype_u32 *) args[3]; /* second output: remainders */
+    /* vector loop: runs while at least vstep elements remain */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+        npyv_u32 a   = npyv_load_u32(src1);
+        npyv_u32 quo = vsx4_div_scalar_u32(a, divisor);
+        npyv_u32 rem = npyv_sub_u32(a, vec_mul(vscalar, quo)); /* remainder derived as a - scalar * quo */
+        npyv_store_u32(dst1, quo);
+        npyv_store_u32(dst2, rem);
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
+        const npyv_lanetype_u32 a = *src1;
+        *dst1 = a / scalar; /* NOTE(review): scalar is never checked for zero in this function - presumably the caller rules it out; confirm */
+        *dst2 = a % scalar;
+    }
+#else /* fmod and remainder: compiled out for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
+        npyv_u32 a = npyv_load_u32(src1);
+        npyv_u32 c = vsx4_mod_scalar_u32(a, divisor);
+        npyv_store_u32(dst1, c);
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1) {
+        const npyv_lanetype_u32 a = *src1;
+        *dst1 = a % scalar;
+    }
+#endif
+    npyv_cleanup();
+}
+
+
+#line 170
+#line 174
+static inline void
+vsx4_simd_fmod_contig_u64(char **args, npy_intp len)
+{ /* Stores src1[i] % src2[i] in dst1[i] for len contiguous npyv_lanetype_u64 elements; a zero divisor triggers npy_set_floatstatus_divbyzero(). */
+    npyv_lanetype_u64 *src1 = (npyv_lanetype_u64 *) args[0]; /* dividends */
+    npyv_lanetype_u64 *src2 = (npyv_lanetype_u64 *) args[1]; /* divisors */
+    npyv_lanetype_u64 *dst1 = (npyv_lanetype_u64 *) args[2]; /* output */
+    const npyv_u64 vzero    = npyv_zero_u64(); /* zero vector the divisors are compared against */
+    const int vstep           = npyv_nlanes_u64; /* elements advanced per vector iteration */
+#if 0 == 2 /* divmod: constant-false here, so this branch is compiled out */
+    npyv_lanetype_u64 *dst2 = (npyv_lanetype_u64 *) args[3];
+    npyv_b64 warn          = npyv_cvt_b64_u64(npyv_zero_u64());
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+        npyv_u64 a        = npyv_load_u64(src1);
+        npyv_u64 b        = npyv_load_u64(src2);
+        npyv_u64 quo      = vsx4_div_u64(a, b);
+        npyv_u64 rem      = npyv_sub_u64(a, vec_mul(b, quo));
+        npyv_b64 bzero   = npyv_cmpeq_u64(b, vzero);
+        // when b is 0, forces the remainder to be 0 too
+                        rem = npyv_select_u64(bzero, vzero, rem);
+                       warn = npyv_or_u64(bzero, warn);
+        npyv_store_u64(dst1, quo);
+        npyv_store_u64(dst2, rem);
+    }
+
+    if (!vec_all_eq(warn, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
+        const npyv_lanetype_u64 a = *src1;
+        const npyv_lanetype_u64 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+            *dst2 = 0;
+        } else{
+            *dst1 = a / b;
+            *dst2 = a % b;
+        }
+    }
+#else /* fmod and remainder: the branch compiled for this function */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) { /* vector loop: runs while at least vstep elements remain */
+        npyv_u64 a = npyv_load_u64(src1);
+        npyv_u64 b = npyv_load_u64(src2);
+        npyv_u64 c = vsx4_mod_u64(a, b); /* NOTE(review): lanes with b == 0 are stored exactly as vsx4_mod_u64 returns them, while the scalar tail stores 0 - confirm the two agree */
+        npyv_store_u64(dst1, c);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) { /* at least one divisor in this vector is zero */
+            npy_set_floatstatus_divbyzero();
+        }
+    }
+    /* scalar tail for the fewer-than-vstep elements left over */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
+        const npyv_lanetype_u64 a = *src1;
+        const npyv_lanetype_u64 b = *src2;
+        if (NPY_UNLIKELY(b == 0)) { /* zero divisor: raise the status and store 0 */
+            npy_set_floatstatus_divbyzero();
+            *dst1 = 0;
+        } else{
+            *dst1 = a % b;
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+/*
+ * fmod of a contiguous u64 buffer by one scalar divisor:
+ * out[i] = in[i] % denom for i in [0, len).
+ *
+ * args[0]: input array; args[1]: pointer to the divisor (read once on
+ * entry); args[2]: output array.
+ * NOTE(review): the divisor is not checked against 0 here -- presumably
+ * the caller only dispatches here for a nonzero divisor; confirm.
+ */
+static inline void
+vsx4_simd_fmod_by_scalar_contig_u64(char **args, npy_intp len)
+{
+    npyv_lanetype_u64 *in_ptr     = (npyv_lanetype_u64 *) args[0];
+    const npyv_lanetype_u64 denom = *(npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u64 *out_ptr    = (npyv_lanetype_u64 *) args[2];
+    const int vstep               = npyv_nlanes_u64;
+    // Broadcast the divisor and let vsx4_divisor_u64 build the operand that
+    // vsx4_mod_scalar_u64 expects.
+    const npyv_u64 vdenom         = npyv_setall_u64(denom);
+    const npyv_u64 divisor        = vsx4_divisor_u64(vdenom);
+
+    // Vector loop: consumes vstep elements per iteration.
+    while (len >= vstep) {
+        npyv_u64 a   = npyv_load_u64(in_ptr);
+        npyv_u64 rem = vsx4_mod_scalar_u64(a, divisor);
+        npyv_store_u64(out_ptr, rem);
+        len     -= vstep;
+        in_ptr  += vstep;
+        out_ptr += vstep;
+    }
+
+    // Scalar tail: the remaining len < vstep elements.
+    while (len > 0) {
+        const npyv_lanetype_u64 a = *in_ptr;
+        *out_ptr = a % denom;
+        --len;
+        ++in_ptr;
+        ++out_ptr;
+    }
+    npyv_cleanup();
+}
+
+#line 174
+/*
+ * remainder over contiguous u64 buffers: out[i] = lhs[i] % rhs[i],
+ * i in [0, len).
+ *
+ * args[0], args[1]: the two input arrays; args[2]: the output array. All
+ * three are accessed as npyv_lanetype_u64 with unit stride.
+ * For every zero divisor npy_set_floatstatus_divbyzero() is called and the
+ * matching output element is 0, in the vector loop and the scalar tail.
+ */
+static inline void
+vsx4_simd_remainder_contig_u64(char **args, npy_intp len)
+{
+    npyv_lanetype_u64 *lhs = (npyv_lanetype_u64 *) args[0];
+    npyv_lanetype_u64 *rhs = (npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u64 *out = (npyv_lanetype_u64 *) args[2];
+    const npyv_u64 vzero   = npyv_zero_u64();
+    const int vstep        = npyv_nlanes_u64;
+
+    // Vector loop: consumes vstep elements per iteration.
+    while (len >= vstep) {
+        npyv_u64 a     = npyv_load_u64(lhs);
+        npyv_u64 b     = npyv_load_u64(rhs);
+        npyv_b64 bzero = npyv_cmpeq_u64(b, vzero);
+        npyv_u64 rem   = vsx4_mod_u64(a, b);
+        // The scalar tail writes 0 for a zero divisor. Force the same value
+        // here instead of relying on whatever vsx4_mod_u64 (defined
+        // elsewhere) leaves in those lanes.
+        rem = npyv_select_u64(bzero, vzero, rem);
+        npyv_store_u64(out, rem);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
+            npy_set_floatstatus_divbyzero();
+        }
+        len -= vstep;
+        lhs += vstep;
+        rhs += vstep;
+        out += vstep;
+    }
+
+    // Scalar tail: the remaining len < vstep elements.
+    for (; len > 0; --len, ++lhs, ++rhs, ++out) {
+        const npyv_lanetype_u64 a = *lhs;
+        const npyv_lanetype_u64 b = *rhs;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *out = 0;
+        }
+        else {
+            *out = a % b;
+        }
+    }
+    npyv_cleanup();
+}
+
+/*
+ * remainder of a contiguous u64 buffer by one scalar divisor:
+ * out[i] = in[i] % denom for i in [0, len).
+ *
+ * args[0]: input array; args[1]: pointer to the divisor (read once on
+ * entry); args[2]: output array.
+ * NOTE(review): the divisor is not checked against 0 here -- presumably
+ * the caller only dispatches here for a nonzero divisor; confirm.
+ */
+static inline void
+vsx4_simd_remainder_by_scalar_contig_u64(char **args, npy_intp len)
+{
+    npyv_lanetype_u64 *in_ptr     = (npyv_lanetype_u64 *) args[0];
+    const npyv_lanetype_u64 denom = *(npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u64 *out_ptr    = (npyv_lanetype_u64 *) args[2];
+    const int vstep               = npyv_nlanes_u64;
+    // Broadcast the divisor and let vsx4_divisor_u64 build the operand that
+    // vsx4_mod_scalar_u64 expects.
+    const npyv_u64 vdenom         = npyv_setall_u64(denom);
+    const npyv_u64 divisor        = vsx4_divisor_u64(vdenom);
+
+    // Vector loop: consumes vstep elements per iteration.
+    while (len >= vstep) {
+        npyv_u64 a   = npyv_load_u64(in_ptr);
+        npyv_u64 rem = vsx4_mod_scalar_u64(a, divisor);
+        npyv_store_u64(out_ptr, rem);
+        len     -= vstep;
+        in_ptr  += vstep;
+        out_ptr += vstep;
+    }
+
+    // Scalar tail: the remaining len < vstep elements.
+    while (len > 0) {
+        const npyv_lanetype_u64 a = *in_ptr;
+        *out_ptr = a % denom;
+        --len;
+        ++in_ptr;
+        ++out_ptr;
+    }
+    npyv_cleanup();
+}
+
+#line 174
+/*
+ * divmod over contiguous u64 buffers, for i in [0, len):
+ *   out_quo[i] = lhs[i] / rhs[i],  out_rem[i] = lhs[i] % rhs[i].
+ *
+ * args[0], args[1]: the two input arrays; args[2]: quotient output;
+ * args[3]: remainder output. All are accessed as npyv_lanetype_u64 with
+ * unit stride.
+ * For a zero divisor both outputs are 0 and
+ * npy_set_floatstatus_divbyzero() is called. The vector loop collects the
+ * zero-divisor lanes in a mask and raises the flag once after the loop.
+ */
+static inline void
+vsx4_simd_divmod_contig_u64(char **args, npy_intp len)
+{
+    npyv_lanetype_u64 *lhs     = (npyv_lanetype_u64 *) args[0];
+    npyv_lanetype_u64 *rhs     = (npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u64 *out_quo = (npyv_lanetype_u64 *) args[2];
+    npyv_lanetype_u64 *out_rem = (npyv_lanetype_u64 *) args[3];
+    const npyv_u64 vzero       = npyv_zero_u64();
+    const int vstep            = npyv_nlanes_u64;
+    // Accumulates every lane in which a zero divisor was seen.
+    npyv_b64 warn              = npyv_cvt_b64_u64(npyv_zero_u64());
+
+    // Vector loop: consumes vstep elements per iteration.
+    while (len >= vstep) {
+        npyv_u64 a     = npyv_load_u64(lhs);
+        npyv_u64 b     = npyv_load_u64(rhs);
+        npyv_u64 quo   = vsx4_div_u64(a, b);
+        npyv_u64 rem   = npyv_sub_u64(a, vec_mul(b, quo));
+        npyv_b64 bzero = npyv_cmpeq_u64(b, vzero);
+        // Zero divisor: the scalar tail writes 0 to both outputs. Force
+        // the quotient as well as the remainder here, instead of storing
+        // whatever vsx4_div_u64 (defined elsewhere) produced for the lane.
+        quo  = npyv_select_u64(bzero, vzero, quo);
+        rem  = npyv_select_u64(bzero, vzero, rem);
+        warn = npyv_or_u64(bzero, warn);
+        npyv_store_u64(out_quo, quo);
+        npyv_store_u64(out_rem, rem);
+        len     -= vstep;
+        lhs     += vstep;
+        rhs     += vstep;
+        out_quo += vstep;
+        out_rem += vstep;
+    }
+
+    if (!vec_all_eq(warn, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+
+    // Scalar tail: the remaining len < vstep elements.
+    for (; len > 0; --len, ++lhs, ++rhs, ++out_quo, ++out_rem) {
+        const npyv_lanetype_u64 a = *lhs;
+        const npyv_lanetype_u64 b = *rhs;
+        if (NPY_UNLIKELY(b == 0)) {
+            npy_set_floatstatus_divbyzero();
+            *out_quo = 0;
+            *out_rem = 0;
+        }
+        else {
+            *out_quo = a / b;
+            *out_rem = a % b;
+        }
+    }
+    npyv_cleanup();
+}
+
+/*
+ * divmod of a contiguous u64 buffer by one scalar divisor, for i in
+ * [0, len):
+ *   out_quo[i] = in[i] / denom,  out_rem[i] = in[i] % denom.
+ *
+ * args[0]: input array; args[1]: pointer to the divisor (read once on
+ * entry); args[2]: quotient output; args[3]: remainder output.
+ * NOTE(review): the divisor is not checked against 0 here -- presumably
+ * the caller only dispatches here for a nonzero divisor; confirm.
+ */
+static inline void
+vsx4_simd_divmod_by_scalar_contig_u64(char **args, npy_intp len)
+{
+    npyv_lanetype_u64 *in_ptr     = (npyv_lanetype_u64 *) args[0];
+    const npyv_lanetype_u64 denom = *(npyv_lanetype_u64 *) args[1];
+    npyv_lanetype_u64 *out_quo    = (npyv_lanetype_u64 *) args[2];
+    npyv_lanetype_u64 *out_rem    = (npyv_lanetype_u64 *) args[3];
+    const int vstep               = npyv_nlanes_u64;
+    // Broadcast the divisor and let vsx4_divisor_u64 build the operand that
+    // vsx4_div_scalar_u64 expects.
+    const npyv_u64 vdenom         = npyv_setall_u64(denom);
+    const npyv_u64 divisor        = vsx4_divisor_u64(vdenom);
+
+    // Vector loop: consumes vstep elements per iteration.
+    while (len >= vstep) {
+        npyv_u64 a   = npyv_load_u64(in_ptr);
+        npyv_u64 quo = vsx4_div_scalar_u64(a, divisor);
+        // The remainder is rebuilt from the quotient: a - denom * quo.
+        npyv_u64 rem = npyv_sub_u64(a, vec_mul(vdenom, quo));
+        npyv_store_u64(out_quo, quo);
+        npyv_store_u64(out_rem, rem);
+        len     -= vstep;
+        in_ptr  += vstep;
+        out_quo += vstep;
+        out_rem += vstep;
+    }
+
+    // Scalar tail: the remaining len < vstep elements.
+    while (len > 0) {
+        const npyv_lanetype_u64 a = *in_ptr;
+        *out_quo = a / denom;
+        *out_rem = a % denom;
+        --len;
+        ++in_ptr;
+        ++out_quo;
+        ++out_rem;
+    }
+    npyv_cleanup();
+}
+
+
+
+#line 291
+#line 295
+/*
+ * fmod over contiguous s8 buffers: out[i] = lhs[i] % rhs[i], i in [0, len).
+ * No sign adjustment is applied; the scalar tail stores C's `%` result.
+ *
+ * args[0], args[1]: the two input arrays; args[2]: the output array. All
+ * three are accessed as npyv_lanetype_s8 with unit stride.
+ * For every zero divisor the divide-by-zero status flag is raised and the
+ * matching output element is 0, in the vector loop and the scalar tail.
+ */
+static inline void
+vsx4_simd_fmod_contig_s8(char **args, npy_intp len)
+{
+    npyv_lanetype_s8 *lhs = (npyv_lanetype_s8 *) args[0];
+    npyv_lanetype_s8 *rhs = (npyv_lanetype_s8 *) args[1];
+    npyv_lanetype_s8 *out = (npyv_lanetype_s8 *) args[2];
+    const npyv_s8 vzero   = npyv_zero_s8();
+    const int vstep       = npyv_nlanes_s8;
+
+    // Vector loop: consumes vstep elements per iteration.
+    while (len >= vstep) {
+        npyv_s8 a     = npyv_load_s8(lhs);
+        npyv_s8 b     = npyv_load_s8(rhs);
+        npyv_b8 bzero = npyv_cmpeq_s8(b, vzero);
+        npyv_s8 rem   = vsx4_mod_s8(a, b);
+        // The scalar tail writes 0 for a zero divisor. Force the same value
+        // here instead of relying on whatever vsx4_mod_s8 (defined
+        // elsewhere) leaves in those lanes.
+        // NOTE(review): the a == NPY_MIN_INT8, b == -1 lane still takes the
+        // value vsx4_mod_s8 returns -- confirm it is 0 like the tail.
+        rem = npyv_select_s8(bzero, vzero, rem);
+        npyv_store_s8(out, rem);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
+            npy_set_floatstatus_divbyzero();
+        }
+        len -= vstep;
+        lhs += vstep;
+        rhs += vstep;
+        out += vstep;
+    }
+
+    // Scalar tail: the remaining len < vstep elements.
+    for (; len > 0; --len, ++lhs, ++rhs, ++out) {
+        const npyv_lanetype_s8 a = *lhs;
+        const npyv_lanetype_s8 b = *rhs;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT8, NPY_TRUE)) {
+            FLAG_IF_DIVIDEBYZERO(b);
+            *out = 0;
+        }
+        else {
+            *out = a % b;
+        }
+    }
+    npyv_cleanup();
+}
+
+/*
+ * fmod of a contiguous s8 buffer by one scalar divisor:
+ * out[i] = in[i] % denom for i in [0, len), with no sign adjustment.
+ *
+ * args[0]: input array; args[1]: pointer to the divisor (read once on
+ * entry); args[2]: output array.
+ * NOTE(review): the divisor is not checked against 0 here -- presumably
+ * the caller only dispatches here for a nonzero divisor; confirm.
+ */
+static inline void
+vsx4_simd_fmod_by_scalar_contig_s8(char **args, npy_intp len)
+{
+    npyv_lanetype_s8 *in_ptr     = (npyv_lanetype_s8 *) args[0];
+    const npyv_lanetype_s8 denom = *(npyv_lanetype_s8 *) args[1];
+    npyv_lanetype_s8 *out_ptr    = (npyv_lanetype_s8 *) args[2];
+    const int vstep              = npyv_nlanes_s8;
+    // Broadcast the divisor and let vsx4_divisor_s8 build the operand that
+    // vsx4_mod_scalar_s8 expects.
+    const npyv_s8 vdenom         = npyv_setall_s8(denom);
+    const vsx4_s32x4 divisor     = vsx4_divisor_s8(vdenom);
+
+    // Vector loop: consumes vstep elements per iteration.
+    while (len >= vstep) {
+        npyv_s8 a   = npyv_load_s8(in_ptr);
+        npyv_s8 rem = vsx4_mod_scalar_s8(a, divisor);
+        npyv_store_s8(out_ptr, rem);
+        len     -= vstep;
+        in_ptr  += vstep;
+        out_ptr += vstep;
+    }
+
+    // Scalar tail: the remaining len < vstep elements.
+    while (len > 0) {
+        const npyv_lanetype_s8 a = *in_ptr;
+        *out_ptr = a % denom;
+        --len;
+        ++in_ptr;
+        ++out_ptr;
+    }
+    npyv_cleanup();
+}
+
+#line 295
+/*
+ * Python-style remainder over contiguous s8 buffers, i in [0, len):
+ * starting from r = lhs[i] % rhs[i], the divisor is added to r unless
+ * ((lhs[i] > 0) == (rhs[i] > 0) || r == 0).
+ *
+ * args[0], args[1]: the two input arrays; args[2]: the output array. All
+ * three are accessed as npyv_lanetype_s8 with unit stride.
+ * For every zero divisor the divide-by-zero status flag is raised and the
+ * matching output element is 0, in the vector loop and the scalar tail.
+ */
+static inline void
+vsx4_simd_remainder_contig_s8(char **args, npy_intp len)
+{
+    npyv_lanetype_s8 *lhs = (npyv_lanetype_s8 *) args[0];
+    npyv_lanetype_s8 *rhs = (npyv_lanetype_s8 *) args[1];
+    npyv_lanetype_s8 *out = (npyv_lanetype_s8 *) args[2];
+    const npyv_s8 vzero   = npyv_zero_s8();
+    const int vstep       = npyv_nlanes_s8;
+
+    // Vector loop: consumes vstep elements per iteration.
+    while (len >= vstep) {
+        npyv_s8 a     = npyv_load_s8(lhs);
+        npyv_s8 b     = npyv_load_s8(rhs);
+        npyv_b8 bzero = npyv_cmpeq_s8(b, vzero);
+        npyv_s8 rem   = vsx4_mod_s8(a, b);
+        // handle mixed case the way Python does
+        // keep = ((a > 0) == (b > 0) || rem == 0)
+        // (named `keep`, not `or`: `or` is a macro in <iso646.h>)
+        npyv_b8 a_pos    = npyv_cmpgt_s8(a, vzero);
+        npyv_b8 b_pos    = npyv_cmpgt_s8(b, vzero);
+        npyv_b8 same     = npyv_cmpeq_s8(a_pos, b_pos);
+        npyv_b8 rem_zero = npyv_cmpeq_s8(rem, vzero);
+        npyv_b8 keep     = npyv_or_s8(same, rem_zero);
+        npyv_s8 to_add   = npyv_select_s8(keep, vzero, b);
+        rem = npyv_add_s8(rem, to_add);
+        // The scalar tail writes 0 for a zero divisor. Force the same value
+        // here instead of relying on whatever vsx4_mod_s8 (defined
+        // elsewhere) leaves in those lanes.
+        // NOTE(review): the a == NPY_MIN_INT8, b == -1 lane still takes the
+        // value computed above -- confirm it is 0 like the tail.
+        rem = npyv_select_s8(bzero, vzero, rem);
+        npyv_store_s8(out, rem);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
+            npy_set_floatstatus_divbyzero();
+        }
+        len -= vstep;
+        lhs += vstep;
+        rhs += vstep;
+        out += vstep;
+    }
+
+    // Scalar tail: the remaining len < vstep elements.
+    for (; len > 0; --len, ++lhs, ++rhs, ++out) {
+        const npyv_lanetype_s8 a = *lhs;
+        const npyv_lanetype_s8 b = *rhs;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT8, NPY_TRUE)) {
+            FLAG_IF_DIVIDEBYZERO(b);
+            *out = 0;
+        }
+        else {
+            *out = a % b;
+            if (!((a > 0) == (b > 0) || *out == 0)) {
+                *out += b;
+            }
+        }
+    }
+    npyv_cleanup();
+}
+
+/*
+ * Python-style remainder of a contiguous s8 buffer by one scalar divisor,
+ * i in [0, len): starting from r = in[i] % denom, the divisor is added to
+ * r unless ((in[i] > 0) == (denom > 0) || r == 0).
+ *
+ * args[0]: input array; args[1]: pointer to the divisor (read once on
+ * entry); args[2]: output array.
+ * NOTE(review): the divisor is not checked against 0 here -- presumably
+ * the caller only dispatches here for a nonzero divisor; confirm.
+ */
+static inline void
+vsx4_simd_remainder_by_scalar_contig_s8(char **args, npy_intp len)
+{
+    npyv_lanetype_s8 *in_ptr     = (npyv_lanetype_s8 *) args[0];
+    const npyv_lanetype_s8 denom = *(npyv_lanetype_s8 *) args[1];
+    npyv_lanetype_s8 *out_ptr    = (npyv_lanetype_s8 *) args[2];
+    const int vstep              = npyv_nlanes_s8;
+    const npyv_s8 vzero          = npyv_zero_s8();
+    // Broadcast the divisor and let vsx4_divisor_s8 build the operand that
+    // vsx4_mod_scalar_s8 expects.
+    const npyv_s8 vdenom         = npyv_setall_s8(denom);
+    const vsx4_s32x4 divisor     = vsx4_divisor_s8(vdenom);
+    // (denom > 0) is the same for every element, so compute it once.
+    npyv_b8 denom_pos            = npyv_cmpgt_s8(vdenom, vzero);
+
+    // Vector loop: consumes vstep elements per iteration.
+    while (len >= vstep) {
+        npyv_s8 a   = npyv_load_s8(in_ptr);
+        npyv_s8 rem = vsx4_mod_scalar_s8(a, divisor);
+        // handle mixed case the way Python does
+        // keep = ((a > 0) == (denom > 0) || rem == 0)
+        npyv_b8 a_pos    = npyv_cmpgt_s8(a, vzero);
+        npyv_b8 same     = npyv_cmpeq_s8(a_pos, denom_pos);
+        npyv_b8 rem_zero = npyv_cmpeq_s8(rem, vzero);
+        npyv_b8 keep     = npyv_or_s8(same, rem_zero);
+        npyv_s8 fixup    = npyv_select_s8(keep, vzero, vdenom);
+        rem = npyv_add_s8(rem, fixup);
+        npyv_store_s8(out_ptr, rem);
+        len     -= vstep;
+        in_ptr  += vstep;
+        out_ptr += vstep;
+    }
+
+    // Scalar tail: the remaining len < vstep elements.
+    while (len > 0) {
+        const npyv_lanetype_s8 a = *in_ptr;
+        *out_ptr = a % denom;
+        if ((a > 0) != (denom > 0) && *out_ptr != 0) {
+            *out_ptr += denom;
+        }
+        --len;
+        ++in_ptr;
+        ++out_ptr;
+    }
+    npyv_cleanup();
+}
+
+#line 295
+/*
+ * Python-style divmod over contiguous s8 buffers, for i in [0, len).
+ *
+ * args[0], args[1]: dividend and divisor arrays; args[2]: quotient
+ * output; args[3]: remainder output. All are accessed as npyv_lanetype_s8
+ * with unit stride.
+ *
+ * Starting from q = a / b and r = a - b * q, the pair becomes
+ * (q - 1, r + b) unless ((a > 0) == (b > 0) || r == 0).
+ * Special cases in the vector loop:
+ *   - b == 0: both outputs are 0 and npy_set_floatstatus_divbyzero() is
+ *     called;
+ *   - a == NPY_MIN_INT8 and b == -1: the quotient is NPY_MIN_INT8, the
+ *     remainder is 0 and npy_set_floatstatus_overflow() is called.
+ * The vector loop collects both conditions in masks and raises the flags
+ * once after the loop. The scalar tail detects them through
+ * DIVIDEBYZERO_OVERFLOW_CHECK (defined elsewhere) and writes the same
+ * values.
+ */
+static inline void
+vsx4_simd_divmod_contig_s8(char **args, npy_intp len)
+{
+    npyv_lanetype_s8 *lhs     = (npyv_lanetype_s8 *) args[0];
+    npyv_lanetype_s8 *rhs     = (npyv_lanetype_s8 *) args[1];
+    npyv_lanetype_s8 *out_quo = (npyv_lanetype_s8 *) args[2];
+    npyv_lanetype_s8 *out_rem = (npyv_lanetype_s8 *) args[3];
+    const npyv_s8 vzero       = npyv_zero_s8();
+    const int vstep           = npyv_nlanes_s8;
+    const npyv_s8 vneg_one    = npyv_setall_s8(-1);
+    const npyv_s8 vmin        = npyv_setall_s8(NPY_MIN_INT8);
+    // Accumulate the lanes that hit a zero divisor / an overflow.
+    npyv_b8 seen_zero         = npyv_cvt_b8_s8(npyv_zero_s8());
+    npyv_b8 seen_overflow     = npyv_cvt_b8_s8(npyv_zero_s8());
+
+    // Vector loop: consumes vstep elements per iteration.
+    while (len >= vstep) {
+        npyv_s8 a   = npyv_load_s8(lhs);
+        npyv_s8 b   = npyv_load_s8(rhs);
+        npyv_s8 quo = vsx4_div_s8(a, b);
+        npyv_s8 rem = npyv_sub_s8(a, vec_mul(b, quo));
+
+        // (b == 0 || (a == NPY_MIN_INT8 && b == -1))
+        npyv_b8 bzero    = npyv_cmpeq_s8(b, vzero);
+        npyv_b8 a_is_min = npyv_cmpeq_s8(a, vmin);
+        npyv_b8 b_is_m1  = npyv_cmpeq_s8(b, vneg_one);
+        npyv_b8 overflow = npyv_and_s8(b_is_m1, a_is_min);
+        seen_zero     = npyv_or_s8(bzero, seen_zero);
+        seen_overflow = npyv_or_s8(overflow, seen_overflow);
+
+        // handle mixed case the way Python does
+        // keep = ((a > 0) == (b > 0) || rem == 0)
+        npyv_b8 a_pos    = npyv_cmpgt_s8(a, vzero);
+        npyv_b8 b_pos    = npyv_cmpgt_s8(b, vzero);
+        npyv_b8 same     = npyv_cmpeq_s8(a_pos, b_pos);
+        npyv_b8 rem_zero = npyv_cmpeq_s8(rem, vzero);
+        npyv_b8 keep     = npyv_or_s8(same, rem_zero);
+        npyv_s8 rem_fix  = npyv_select_s8(keep, vzero, b);
+        npyv_s8 quo_fix  = npyv_select_s8(keep, vzero, vneg_one);
+        rem = npyv_add_s8(rem, rem_fix);
+        quo = npyv_add_s8(quo, quo_fix);
+
+        // Divide by zero
+        quo = npyv_select_s8(bzero, vzero, quo);
+        rem = npyv_select_s8(bzero, vzero, rem);
+        // Overflow
+        quo = npyv_select_s8(overflow, vmin, quo);
+        rem = npyv_select_s8(overflow, vzero, rem);
+
+        npyv_store_s8(out_quo, quo);
+        npyv_store_s8(out_rem, rem);
+        len     -= vstep;
+        lhs     += vstep;
+        rhs     += vstep;
+        out_quo += vstep;
+        out_rem += vstep;
+    }
+
+    if (!vec_all_eq(seen_zero, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+    if (!vec_all_eq(seen_overflow, vzero)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    // Scalar tail: the remaining len < vstep elements.
+    while (len > 0) {
+        const npyv_lanetype_s8 a = *lhs;
+        const npyv_lanetype_s8 b = *rhs;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT8, NPY_TRUE)) {
+            if (b == 0) {
+                npy_set_floatstatus_divbyzero();
+                *out_quo = 0;
+                *out_rem = 0;
+            }
+            else {
+                npy_set_floatstatus_overflow();
+                *out_quo = NPY_MIN_INT8;
+                *out_rem = 0;
+            }
+        }
+        else {
+            *out_quo = a / b;
+            *out_rem = a % b;
+            if ((a > 0) != (b > 0) && *out_rem != 0) {
+                *out_quo -= 1;
+                *out_rem += b;
+            }
+        }
+        --len;
+        ++lhs;
+        ++rhs;
+        ++out_quo;
+        ++out_rem;
+    }
+    npyv_cleanup();
+}
+
+static inline void  /* floor divmod of a contiguous s8 array by one scalar divisor */
+vsx4_simd_divmod_by_scalar_contig_s8(char **args, npy_intp len)  /* len: element count */
+{
+    npyv_lanetype_s8 *src1  = (npyv_lanetype_s8 *) args[0];  /* dividends */
+    npyv_lanetype_s8 scalar = *(npyv_lanetype_s8 *) args[1];  /* divisor, read once */
+    npyv_lanetype_s8 *dst1  = (npyv_lanetype_s8 *) args[2];  /* quotients */
+    const npyv_s8 vscalar   = npyv_setall_s8(scalar);
+    const vsx4_s32x4 divisor    = vsx4_divisor_s8(vscalar);  /* operand for vsx4_div_scalar_s8 below */
+    const int vstep            = npyv_nlanes_s8;  /* elements per vector iteration */
+#if 2 >= 1 /* remainder and divmod */
+    const npyv_s8 vzero     = npyv_zero_s8();
+    npyv_b8 b_gt_zero      = npyv_cmpgt_s8(vscalar, vzero);
+#endif
+#if 2 == 2 /* divmod */
+    npyv_b8 warn          = npyv_cvt_b8_s8(npyv_zero_s8());  /* ORs together the overflow lanes of every iteration */
+    const npyv_s8 vmin     = npyv_setall_s8(NPY_MIN_INT8);
+    const npyv_s8 vneg_one = npyv_setall_s8(-1);
+    npyv_b8 bneg_one      = npyv_cmpeq_s8(vscalar, vneg_one);
+    npyv_lanetype_s8 *dst2 = (npyv_lanetype_s8 *) args[3];  /* remainders */
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
+#endif
+        npyv_s8 a = npyv_load_s8(src1);  // vector body: vstep elements per pass
+#if 2 <= 1 /* fmod and remainder */
+        npyv_s8 rem       = vsx4_mod_scalar_s8(a, divisor);
+#else /* divmod */
+        npyv_s8 quo       = vsx4_div_scalar_s8(a, divisor);
+        npyv_s8 rem       = npyv_sub_s8(a, vec_mul(vscalar, quo));  // rem = a - scalar * quo
+        // overflow lanes: (a == NPY_MIN_INT8 && b == -1)
+        npyv_b8 amin     = npyv_cmpeq_s8(a, vmin);
+        npyv_b8 overflow = npyv_and_s8(bneg_one, amin);
+                        warn = npyv_or_s8(overflow, warn);
+#endif
+#if 2 >= 1 /* remainder and divmod */
+        // Python-style result: adjust when the operand signs differ and rem != 0
+        // no adjustment when ((a > 0) == (b > 0) || rem == 0)
+        npyv_b8 a_gt_zero  = npyv_cmpgt_s8(a, vzero);
+        npyv_b8 ab_eq_cond = npyv_cmpeq_s8(a_gt_zero, b_gt_zero);
+        npyv_b8 rem_zero   = npyv_cmpeq_s8(rem, vzero);
+        npyv_b8 or         = npyv_or_s8(ab_eq_cond, rem_zero);
+        npyv_s8 to_add      = npyv_select_s8(or, vzero, vscalar);
+                           rem = npyv_add_s8(rem, to_add);  // vector form of `*dst2 += scalar` in the tail
+#endif
+#if 2 == 2 /* divmod */
+        npyv_s8 to_sub = npyv_select_s8(or, vzero, vneg_one);
+        quo               = npyv_add_s8(quo, to_sub);  // vector form of `*dst1 -= 1` in the tail
+        // Overflow: set quo to minimum and rem to 0
+        quo               = npyv_select_s8(overflow, vmin, quo);
+        rem               = npyv_select_s8(overflow, vzero, rem);
+        npyv_store_s8(dst1, quo);
+        npyv_store_s8(dst2, rem);
+#else /* fmod and remainder */
+        npyv_store_s8(dst1, rem);
+#endif
+    }
+
+#if 2 == 2 /* divmod */
+    if (!vec_all_eq(warn, vzero)) {  // at least one vector lane overflowed
+        npy_set_floatstatus_overflow();
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {  // scalar tail for the remaining (< vstep) elements
+        const npyv_lanetype_s8 a = *src1;
+        if (NPY_UNLIKELY(a == NPY_MIN_INT8 && scalar == -1)) {
+            npy_set_floatstatus_overflow();
+            *dst1 = NPY_MIN_INT8;
+            *dst2 = 0;
+        }
+        else {
+            *dst1 = a / scalar;  // NOTE(review): no zero check; presumably the caller guarantees scalar != 0 -- confirm
+            *dst2 = a % scalar;
+            if (!((a > 0) == (scalar > 0) || *dst2 == 0)) {  // signs differ and remainder is non-zero
+                *dst1 -= 1;
+                *dst2 += scalar;
+            }
+        }
+    }
+#else /* fmod and remainder */
+    for (; len > 0; --len, ++src1, ++dst1) {
+        const npyv_lanetype_s8 a = *src1;
+        *dst1 = a % scalar;
+#if 2 == 1 /* remainder */
+        if (!((a > 0) == (scalar > 0) || *dst1 == 0)) {
+            *dst1 += scalar;
+        }
+#endif
+    }
+#endif
+    npyv_cleanup();
+}
+
+
+#line 291
+#line 295
+static inline void  /* element-wise fmod (C `%`, no sign adjustment) of two contiguous s16 arrays */
+vsx4_simd_fmod_contig_s16(char **args, npy_intp len)  /* len: element count */
+{
+    npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0];  /* dividends */
+    npyv_lanetype_s16 *src2 = (npyv_lanetype_s16 *) args[1];  /* divisors */
+    npyv_lanetype_s16 *dst1 = (npyv_lanetype_s16 *) args[2];  /* remainders */
+    const npyv_s16 vzero    = npyv_zero_s16();
+    const int vstep           = npyv_nlanes_s16;  /* elements per vector iteration */
+#if 0 == 2 /* divmod */
+    npyv_lanetype_s16 *dst2 = (npyv_lanetype_s16 *) args[3];
+    const npyv_s16 vneg_one = npyv_setall_s16(-1);
+    const npyv_s16 vmin     = npyv_setall_s16(NPY_MIN_INT16);
+    npyv_b16 warn_zero     = npyv_cvt_b16_s16(npyv_zero_s16());
+    npyv_b16 warn_overflow = npyv_cvt_b16_s16(npyv_zero_s16());
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) {
+#endif
+        npyv_s16 a = npyv_load_s16(src1);  // vector body: vstep elements per pass
+        npyv_s16 b = npyv_load_s16(src2);
+#if 0 <= 1 /* fmod and remainder */
+        npyv_s16 rem       = vsx4_mod_s16(a, b);
+#else /* divmod */
+        npyv_s16 quo       = vsx4_div_s16(a, b);
+        npyv_s16 rem       = npyv_sub_s16(a, vec_mul(b, quo));
+        // special lanes: (b == 0 || (a == NPY_MIN_INT16 && b == -1))
+        npyv_b16 bzero    = npyv_cmpeq_s16(b, vzero);
+        npyv_b16 amin     = npyv_cmpeq_s16(a, vmin);
+        npyv_b16 bneg_one = npyv_cmpeq_s16(b, vneg_one);
+        npyv_b16 overflow = npyv_and_s16(bneg_one, amin);
+                warn_zero = npyv_or_s16(bzero, warn_zero);
+               warn_overflow = npyv_or_s16(overflow, warn_overflow);
+#endif
+#if 0 >= 1 /* remainder and divmod */
+        // Python-style result: adjust when the operand signs differ and rem != 0
+        // no adjustment when ((a > 0) == (b > 0) || rem == 0)
+        npyv_b16 a_gt_zero  = npyv_cmpgt_s16(a, vzero);
+        npyv_b16 b_gt_zero  = npyv_cmpgt_s16(b, vzero);
+        npyv_b16 ab_eq_cond = npyv_cmpeq_s16(a_gt_zero, b_gt_zero);
+        npyv_b16 rem_zero   = npyv_cmpeq_s16(rem, vzero);
+        npyv_b16 or         = npyv_or_s16(ab_eq_cond, rem_zero);
+        npyv_s16 to_add      = npyv_select_s16(or, vzero, b);
+                           rem = npyv_add_s16(rem, to_add);
+#endif
+#if 0 == 2 /* divmod */
+        npyv_s16 to_sub = npyv_select_s16(or, vzero, vneg_one);
+                      quo = npyv_add_s16(quo, to_sub);
+                      // Divide by zero: quo and rem both become 0
+                      quo = npyv_select_s16(bzero, vzero, quo);
+                      rem = npyv_select_s16(bzero, vzero, rem);
+                      // Overflow: quo becomes NPY_MIN_INT16 and rem becomes 0
+                      quo = npyv_select_s16(overflow, vmin, quo);
+                      rem = npyv_select_s16(overflow, vzero, rem);
+        npyv_store_s16(dst1, quo);
+        npyv_store_s16(dst2, rem);
+#else /* fmod and remainder */
+        npyv_store_s16(dst1, rem);  // NOTE(review): b == 0 lanes are stored as vsx4_mod_s16 returned them; the scalar tail writes 0 -- confirm
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {  // any zero divisor in this vector
+            npy_set_floatstatus_divbyzero();
+        }
+#endif
+    }
+
+#if 0 == 2 /* divmod */
+    if (!vec_all_eq(warn_zero, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+    if (!vec_all_eq(warn_overflow, vzero)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
+        const npyv_lanetype_s16 a = *src1;
+        const npyv_lanetype_s16 b = *src2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT16, NPY_TRUE)) {
+            if (b == 0) {
+                npy_set_floatstatus_divbyzero();
+                *dst1 = 0;
+                *dst2 = 0;
+            }
+            else {
+                npy_set_floatstatus_overflow();
+                *dst1 = NPY_MIN_INT16;
+                *dst2 = 0;
+            }
+        }
+        else {
+            *dst1 = a / b;
+            *dst2 = a % b;
+            if (!((a > 0) == (b > 0) || *dst2 == 0)) {
+                *dst1 -= 1;
+                *dst2 += b;
+            }
+        }
+    }
+#else /* fmod and remainder */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {  // scalar tail for the remaining (< vstep) elements
+        const npyv_lanetype_s16 a = *src1;
+        const npyv_lanetype_s16 b = *src2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT16, NPY_TRUE)) {  // macro defined elsewhere; presumably b == 0 or (a == MIN && b == -1)
+            FLAG_IF_DIVIDEBYZERO(b);
+            *dst1 = 0;  // result is 0 whenever the check fires
+        } else{
+            *dst1 = a % b;
+#if 0 == 1 /* remainder */
+            if (!((a > 0) == (b > 0) || *dst1 == 0)) {
+                *dst1 += b;
+            }
+#endif
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+static inline void  /* fmod (C `%`, no sign adjustment) of a contiguous s16 array by one scalar divisor */
+vsx4_simd_fmod_by_scalar_contig_s16(char **args, npy_intp len)  /* len: element count */
+{
+    npyv_lanetype_s16 *src1  = (npyv_lanetype_s16 *) args[0];  /* dividends */
+    npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[1];  /* divisor, read once */
+    npyv_lanetype_s16 *dst1  = (npyv_lanetype_s16 *) args[2];  /* remainders */
+    const npyv_s16 vscalar   = npyv_setall_s16(scalar);
+    const npyv_s32x2 divisor    = vsx4_divisor_s16(vscalar);  /* operand for vsx4_mod_scalar_s16 below */
+    const int vstep            = npyv_nlanes_s16;  /* elements per vector iteration */
+#if 0 >= 1 /* remainder and divmod */
+    const npyv_s16 vzero     = npyv_zero_s16();
+    npyv_b16 b_gt_zero      = npyv_cmpgt_s16(vscalar, vzero);
+#endif
+#if 0 == 2 /* divmod */
+    npyv_b16 warn          = npyv_cvt_b16_s16(npyv_zero_s16());
+    const npyv_s16 vmin     = npyv_setall_s16(NPY_MIN_INT16);
+    const npyv_s16 vneg_one = npyv_setall_s16(-1);
+    npyv_b16 bneg_one      = npyv_cmpeq_s16(vscalar, vneg_one);
+    npyv_lanetype_s16 *dst2 = (npyv_lanetype_s16 *) args[3];
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
+#endif
+        npyv_s16 a = npyv_load_s16(src1);  // vector body: vstep elements per pass
+#if 0 <= 1 /* fmod and remainder */
+        npyv_s16 rem       = vsx4_mod_scalar_s16(a, divisor);
+#else /* divmod */
+        npyv_s16 quo       = vsx4_div_scalar_s16(a, divisor);
+        npyv_s16 rem       = npyv_sub_s16(a, vec_mul(vscalar, quo));
+        // overflow lanes: (a == NPY_MIN_INT16 && b == -1)
+        npyv_b16 amin     = npyv_cmpeq_s16(a, vmin);
+        npyv_b16 overflow = npyv_and_s16(bneg_one, amin);
+                        warn = npyv_or_s16(overflow, warn);
+#endif
+#if 0 >= 1 /* remainder and divmod */
+        // Python-style result: adjust when the operand signs differ and rem != 0
+        // no adjustment when ((a > 0) == (b > 0) || rem == 0)
+        npyv_b16 a_gt_zero  = npyv_cmpgt_s16(a, vzero);
+        npyv_b16 ab_eq_cond = npyv_cmpeq_s16(a_gt_zero, b_gt_zero);
+        npyv_b16 rem_zero   = npyv_cmpeq_s16(rem, vzero);
+        npyv_b16 or         = npyv_or_s16(ab_eq_cond, rem_zero);
+        npyv_s16 to_add      = npyv_select_s16(or, vzero, vscalar);
+                           rem = npyv_add_s16(rem, to_add);
+#endif
+#if 0 == 2 /* divmod */
+        npyv_s16 to_sub = npyv_select_s16(or, vzero, vneg_one);
+        quo               = npyv_add_s16(quo, to_sub);
+        // Overflow: set quo to minimum and rem to 0
+        quo               = npyv_select_s16(overflow, vmin, quo);
+        rem               = npyv_select_s16(overflow, vzero, rem);
+        npyv_store_s16(dst1, quo);
+        npyv_store_s16(dst2, rem);
+#else /* fmod and remainder */
+        npyv_store_s16(dst1, rem);
+#endif
+    }
+
+#if 0 == 2 /* divmod */
+    if (!vec_all_eq(warn, vzero)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
+        const npyv_lanetype_s16 a = *src1;
+        if (NPY_UNLIKELY(a == NPY_MIN_INT16 && scalar == -1)) {
+            npy_set_floatstatus_overflow();
+            *dst1 = NPY_MIN_INT16;
+            *dst2 = 0;
+        }
+        else {
+            *dst1 = a / scalar;
+            *dst2 = a % scalar;
+            if (!((a > 0) == (scalar > 0) || *dst2 == 0)) {
+                *dst1 -= 1;
+                *dst2 += scalar;
+            }
+        }
+    }
+#else /* fmod and remainder */
+    for (; len > 0; --len, ++src1, ++dst1) {  // scalar tail for the remaining (< vstep) elements
+        const npyv_lanetype_s16 a = *src1;
+        *dst1 = a % scalar;  // NOTE(review): no zero check; presumably the caller guarantees scalar != 0 -- confirm
+#if 0 == 1 /* remainder */
+        if (!((a > 0) == (scalar > 0) || *dst1 == 0)) {
+            *dst1 += scalar;
+        }
+#endif
+    }
+#endif
+    npyv_cleanup();
+}
+
+#line 295
+static inline void  /* element-wise Python-style remainder of two contiguous s16 arrays */
+vsx4_simd_remainder_contig_s16(char **args, npy_intp len)  /* len: element count */
+{
+    npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0];  /* dividends */
+    npyv_lanetype_s16 *src2 = (npyv_lanetype_s16 *) args[1];  /* divisors */
+    npyv_lanetype_s16 *dst1 = (npyv_lanetype_s16 *) args[2];  /* remainders */
+    const npyv_s16 vzero    = npyv_zero_s16();
+    const int vstep           = npyv_nlanes_s16;  /* elements per vector iteration */
+#if 1 == 2 /* divmod */
+    npyv_lanetype_s16 *dst2 = (npyv_lanetype_s16 *) args[3];
+    const npyv_s16 vneg_one = npyv_setall_s16(-1);
+    const npyv_s16 vmin     = npyv_setall_s16(NPY_MIN_INT16);
+    npyv_b16 warn_zero     = npyv_cvt_b16_s16(npyv_zero_s16());
+    npyv_b16 warn_overflow = npyv_cvt_b16_s16(npyv_zero_s16());
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) {
+#endif
+        npyv_s16 a = npyv_load_s16(src1);  // vector body: vstep elements per pass
+        npyv_s16 b = npyv_load_s16(src2);
+#if 1 <= 1 /* fmod and remainder */
+        npyv_s16 rem       = vsx4_mod_s16(a, b);
+#else /* divmod */
+        npyv_s16 quo       = vsx4_div_s16(a, b);
+        npyv_s16 rem       = npyv_sub_s16(a, vec_mul(b, quo));
+        // special lanes: (b == 0 || (a == NPY_MIN_INT16 && b == -1))
+        npyv_b16 bzero    = npyv_cmpeq_s16(b, vzero);
+        npyv_b16 amin     = npyv_cmpeq_s16(a, vmin);
+        npyv_b16 bneg_one = npyv_cmpeq_s16(b, vneg_one);
+        npyv_b16 overflow = npyv_and_s16(bneg_one, amin);
+                warn_zero = npyv_or_s16(bzero, warn_zero);
+               warn_overflow = npyv_or_s16(overflow, warn_overflow);
+#endif
+#if 1 >= 1 /* remainder and divmod */
+        // Python-style result: adjust when the operand signs differ and rem != 0
+        // no adjustment when ((a > 0) == (b > 0) || rem == 0)
+        npyv_b16 a_gt_zero  = npyv_cmpgt_s16(a, vzero);
+        npyv_b16 b_gt_zero  = npyv_cmpgt_s16(b, vzero);
+        npyv_b16 ab_eq_cond = npyv_cmpeq_s16(a_gt_zero, b_gt_zero);
+        npyv_b16 rem_zero   = npyv_cmpeq_s16(rem, vzero);
+        npyv_b16 or         = npyv_or_s16(ab_eq_cond, rem_zero);
+        npyv_s16 to_add      = npyv_select_s16(or, vzero, b);
+                           rem = npyv_add_s16(rem, to_add);  // vector form of `*dst1 += b` in the tail
+#endif
+#if 1 == 2 /* divmod */
+        npyv_s16 to_sub = npyv_select_s16(or, vzero, vneg_one);
+                      quo = npyv_add_s16(quo, to_sub);
+                      // Divide by zero: quo and rem both become 0
+                      quo = npyv_select_s16(bzero, vzero, quo);
+                      rem = npyv_select_s16(bzero, vzero, rem);
+                      // Overflow: quo becomes NPY_MIN_INT16 and rem becomes 0
+                      quo = npyv_select_s16(overflow, vmin, quo);
+                      rem = npyv_select_s16(overflow, vzero, rem);
+        npyv_store_s16(dst1, quo);
+        npyv_store_s16(dst2, rem);
+#else /* fmod and remainder */
+        npyv_store_s16(dst1, rem);  // NOTE(review): b == 0 lanes are not forced to 0 here; the scalar tail writes 0 -- confirm
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {  // any zero divisor in this vector
+            npy_set_floatstatus_divbyzero();
+        }
+#endif
+    }
+
+#if 1 == 2 /* divmod */
+    if (!vec_all_eq(warn_zero, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+    if (!vec_all_eq(warn_overflow, vzero)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
+        const npyv_lanetype_s16 a = *src1;
+        const npyv_lanetype_s16 b = *src2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT16, NPY_TRUE)) {
+            if (b == 0) {
+                npy_set_floatstatus_divbyzero();
+                *dst1 = 0;
+                *dst2 = 0;
+            }
+            else {
+                npy_set_floatstatus_overflow();
+                *dst1 = NPY_MIN_INT16;
+                *dst2 = 0;
+            }
+        }
+        else {
+            *dst1 = a / b;
+            *dst2 = a % b;
+            if (!((a > 0) == (b > 0) || *dst2 == 0)) {
+                *dst1 -= 1;
+                *dst2 += b;
+            }
+        }
+    }
+#else /* fmod and remainder */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {  // scalar tail for the remaining (< vstep) elements
+        const npyv_lanetype_s16 a = *src1;
+        const npyv_lanetype_s16 b = *src2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT16, NPY_TRUE)) {  // macro defined elsewhere; presumably b == 0 or (a == MIN && b == -1)
+            FLAG_IF_DIVIDEBYZERO(b);
+            *dst1 = 0;  // result is 0 whenever the check fires
+        } else{
+            *dst1 = a % b;
+#if 1 == 1 /* remainder */
+            if (!((a > 0) == (b > 0) || *dst1 == 0)) {  // signs differ and remainder is non-zero
+                *dst1 += b;
+            }
+#endif
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+static inline void  /* Python-style remainder of a contiguous s16 array by one scalar divisor */
+vsx4_simd_remainder_by_scalar_contig_s16(char **args, npy_intp len)  /* len: element count */
+{
+    npyv_lanetype_s16 *src1  = (npyv_lanetype_s16 *) args[0];  /* dividends */
+    npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[1];  /* divisor, read once */
+    npyv_lanetype_s16 *dst1  = (npyv_lanetype_s16 *) args[2];  /* remainders */
+    const npyv_s16 vscalar   = npyv_setall_s16(scalar);
+    const npyv_s32x2 divisor    = vsx4_divisor_s16(vscalar);  /* operand for vsx4_mod_scalar_s16 below */
+    const int vstep            = npyv_nlanes_s16;  /* elements per vector iteration */
+#if 1 >= 1 /* remainder and divmod */
+    const npyv_s16 vzero     = npyv_zero_s16();
+    npyv_b16 b_gt_zero      = npyv_cmpgt_s16(vscalar, vzero);  /* loop-invariant: sign test of the divisor */
+#endif
+#if 1 == 2 /* divmod */
+    npyv_b16 warn          = npyv_cvt_b16_s16(npyv_zero_s16());
+    const npyv_s16 vmin     = npyv_setall_s16(NPY_MIN_INT16);
+    const npyv_s16 vneg_one = npyv_setall_s16(-1);
+    npyv_b16 bneg_one      = npyv_cmpeq_s16(vscalar, vneg_one);
+    npyv_lanetype_s16 *dst2 = (npyv_lanetype_s16 *) args[3];
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
+#endif
+        npyv_s16 a = npyv_load_s16(src1);  // vector body: vstep elements per pass
+#if 1 <= 1 /* fmod and remainder */
+        npyv_s16 rem       = vsx4_mod_scalar_s16(a, divisor);
+#else /* divmod */
+        npyv_s16 quo       = vsx4_div_scalar_s16(a, divisor);
+        npyv_s16 rem       = npyv_sub_s16(a, vec_mul(vscalar, quo));
+        // overflow lanes: (a == NPY_MIN_INT16 && b == -1)
+        npyv_b16 amin     = npyv_cmpeq_s16(a, vmin);
+        npyv_b16 overflow = npyv_and_s16(bneg_one, amin);
+                        warn = npyv_or_s16(overflow, warn);
+#endif
+#if 1 >= 1 /* remainder and divmod */
+        // Python-style result: adjust when the operand signs differ and rem != 0
+        // no adjustment when ((a > 0) == (b > 0) || rem == 0)
+        npyv_b16 a_gt_zero  = npyv_cmpgt_s16(a, vzero);
+        npyv_b16 ab_eq_cond = npyv_cmpeq_s16(a_gt_zero, b_gt_zero);
+        npyv_b16 rem_zero   = npyv_cmpeq_s16(rem, vzero);
+        npyv_b16 or         = npyv_or_s16(ab_eq_cond, rem_zero);
+        npyv_s16 to_add      = npyv_select_s16(or, vzero, vscalar);
+                           rem = npyv_add_s16(rem, to_add);  // vector form of `*dst1 += scalar` in the tail
+#endif
+#if 1 == 2 /* divmod */
+        npyv_s16 to_sub = npyv_select_s16(or, vzero, vneg_one);
+        quo               = npyv_add_s16(quo, to_sub);
+        // Overflow: set quo to minimum and rem to 0
+        quo               = npyv_select_s16(overflow, vmin, quo);
+        rem               = npyv_select_s16(overflow, vzero, rem);
+        npyv_store_s16(dst1, quo);
+        npyv_store_s16(dst2, rem);
+#else /* fmod and remainder */
+        npyv_store_s16(dst1, rem);
+#endif
+    }
+
+#if 1 == 2 /* divmod */
+    if (!vec_all_eq(warn, vzero)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
+        const npyv_lanetype_s16 a = *src1;
+        if (NPY_UNLIKELY(a == NPY_MIN_INT16 && scalar == -1)) {
+            npy_set_floatstatus_overflow();
+            *dst1 = NPY_MIN_INT16;
+            *dst2 = 0;
+        }
+        else {
+            *dst1 = a / scalar;
+            *dst2 = a % scalar;
+            if (!((a > 0) == (scalar > 0) || *dst2 == 0)) {
+                *dst1 -= 1;
+                *dst2 += scalar;
+            }
+        }
+    }
+#else /* fmod and remainder */
+    for (; len > 0; --len, ++src1, ++dst1) {  // scalar tail for the remaining (< vstep) elements
+        const npyv_lanetype_s16 a = *src1;
+        *dst1 = a % scalar;  // NOTE(review): no zero check; presumably the caller guarantees scalar != 0 -- confirm
+#if 1 == 1 /* remainder */
+        if (!((a > 0) == (scalar > 0) || *dst1 == 0)) {  // signs differ and remainder is non-zero
+            *dst1 += scalar;
+        }
+#endif
+    }
+#endif
+    npyv_cleanup();
+}
+
+#line 295
+static inline void  /* element-wise floor divmod of two contiguous s16 arrays */
+vsx4_simd_divmod_contig_s16(char **args, npy_intp len)  /* len: element count */
+{
+    npyv_lanetype_s16 *src1 = (npyv_lanetype_s16 *) args[0];  /* dividends */
+    npyv_lanetype_s16 *src2 = (npyv_lanetype_s16 *) args[1];  /* divisors */
+    npyv_lanetype_s16 *dst1 = (npyv_lanetype_s16 *) args[2];  /* quotients */
+    const npyv_s16 vzero    = npyv_zero_s16();
+    const int vstep           = npyv_nlanes_s16;  /* elements per vector iteration */
+#if 2 == 2 /* divmod */
+    npyv_lanetype_s16 *dst2 = (npyv_lanetype_s16 *) args[3];  /* remainders */
+    const npyv_s16 vneg_one = npyv_setall_s16(-1);
+    const npyv_s16 vmin     = npyv_setall_s16(NPY_MIN_INT16);
+    npyv_b16 warn_zero     = npyv_cvt_b16_s16(npyv_zero_s16());  /* ORs together the b == 0 lanes of every iteration */
+    npyv_b16 warn_overflow = npyv_cvt_b16_s16(npyv_zero_s16());  /* ORs together the overflow lanes of every iteration */
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) {
+#endif
+        npyv_s16 a = npyv_load_s16(src1);  // vector body: vstep elements per pass
+        npyv_s16 b = npyv_load_s16(src2);
+#if 2 <= 1 /* fmod and remainder */
+        npyv_s16 rem       = vsx4_mod_s16(a, b);
+#else /* divmod */
+        npyv_s16 quo       = vsx4_div_s16(a, b);
+        npyv_s16 rem       = npyv_sub_s16(a, vec_mul(b, quo));  // rem = a - b * quo
+        // special lanes: (b == 0 || (a == NPY_MIN_INT16 && b == -1))
+        npyv_b16 bzero    = npyv_cmpeq_s16(b, vzero);
+        npyv_b16 amin     = npyv_cmpeq_s16(a, vmin);
+        npyv_b16 bneg_one = npyv_cmpeq_s16(b, vneg_one);
+        npyv_b16 overflow = npyv_and_s16(bneg_one, amin);
+                warn_zero = npyv_or_s16(bzero, warn_zero);
+               warn_overflow = npyv_or_s16(overflow, warn_overflow);
+#endif
+#if 2 >= 1 /* remainder and divmod */
+        // Python-style result: adjust when the operand signs differ and rem != 0
+        // no adjustment when ((a > 0) == (b > 0) || rem == 0)
+        npyv_b16 a_gt_zero  = npyv_cmpgt_s16(a, vzero);
+        npyv_b16 b_gt_zero  = npyv_cmpgt_s16(b, vzero);
+        npyv_b16 ab_eq_cond = npyv_cmpeq_s16(a_gt_zero, b_gt_zero);
+        npyv_b16 rem_zero   = npyv_cmpeq_s16(rem, vzero);
+        npyv_b16 or         = npyv_or_s16(ab_eq_cond, rem_zero);
+        npyv_s16 to_add      = npyv_select_s16(or, vzero, b);
+                           rem = npyv_add_s16(rem, to_add);  // vector form of `*dst2 += b` in the tail
+#endif
+#if 2 == 2 /* divmod */
+        npyv_s16 to_sub = npyv_select_s16(or, vzero, vneg_one);
+                      quo = npyv_add_s16(quo, to_sub);  // vector form of `*dst1 -= 1` in the tail
+                      // Divide by zero: quo and rem both become 0
+                      quo = npyv_select_s16(bzero, vzero, quo);
+                      rem = npyv_select_s16(bzero, vzero, rem);
+                      // Overflow: quo becomes NPY_MIN_INT16 and rem becomes 0
+                      quo = npyv_select_s16(overflow, vmin, quo);
+                      rem = npyv_select_s16(overflow, vzero, rem);
+        npyv_store_s16(dst1, quo);
+        npyv_store_s16(dst2, rem);
+#else /* fmod and remainder */
+        npyv_store_s16(dst1, rem);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
+            npy_set_floatstatus_divbyzero();
+        }
+#endif
+    }
+
+#if 2 == 2 /* divmod */
+    if (!vec_all_eq(warn_zero, vzero)) {  // at least one vector lane had b == 0
+        npy_set_floatstatus_divbyzero();
+    }
+    if (!vec_all_eq(warn_overflow, vzero)) {  // at least one vector lane overflowed
+        npy_set_floatstatus_overflow();
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {  // scalar tail for the remaining (< vstep) elements
+        const npyv_lanetype_s16 a = *src1;
+        const npyv_lanetype_s16 b = *src2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT16, NPY_TRUE)) {  // macro defined elsewhere; the branches below split b == 0 from overflow
+            if (b == 0) {
+                npy_set_floatstatus_divbyzero();
+                *dst1 = 0;
+                *dst2 = 0;
+            }
+            else {
+                npy_set_floatstatus_overflow();
+                *dst1 = NPY_MIN_INT16;
+                *dst2 = 0;
+            }
+        }
+        else {
+            *dst1 = a / b;
+            *dst2 = a % b;
+            if (!((a > 0) == (b > 0) || *dst2 == 0)) {  // signs differ and remainder is non-zero
+                *dst1 -= 1;
+                *dst2 += b;
+            }
+        }
+    }
+#else /* fmod and remainder */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
+        const npyv_lanetype_s16 a = *src1;
+        const npyv_lanetype_s16 b = *src2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT16, NPY_TRUE)) {
+            FLAG_IF_DIVIDEBYZERO(b);
+            *dst1 = 0;
+        } else{
+            *dst1 = a % b;
+#if 2 == 1 /* remainder */
+            if (!((a > 0) == (b > 0) || *dst1 == 0)) {
+                *dst1 += b;
+            }
+#endif
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+static inline void  /* floor divmod of a contiguous s16 array by one scalar divisor */
+vsx4_simd_divmod_by_scalar_contig_s16(char **args, npy_intp len)  /* len: element count */
+{
+    npyv_lanetype_s16 *src1  = (npyv_lanetype_s16 *) args[0];  /* dividends */
+    npyv_lanetype_s16 scalar = *(npyv_lanetype_s16 *) args[1];  /* divisor, read once */
+    npyv_lanetype_s16 *dst1  = (npyv_lanetype_s16 *) args[2];  /* quotients */
+    const npyv_s16 vscalar   = npyv_setall_s16(scalar);
+    const npyv_s32x2 divisor    = vsx4_divisor_s16(vscalar);  /* operand for vsx4_div_scalar_s16 below */
+    const int vstep            = npyv_nlanes_s16;  /* elements per vector iteration */
+#if 2 >= 1 /* remainder and divmod */
+    const npyv_s16 vzero     = npyv_zero_s16();
+    npyv_b16 b_gt_zero      = npyv_cmpgt_s16(vscalar, vzero);  /* loop-invariant: sign test of the divisor */
+#endif
+#if 2 == 2 /* divmod */
+    npyv_b16 warn          = npyv_cvt_b16_s16(npyv_zero_s16());  /* ORs together the overflow lanes of every iteration */
+    const npyv_s16 vmin     = npyv_setall_s16(NPY_MIN_INT16);
+    const npyv_s16 vneg_one = npyv_setall_s16(-1);
+    npyv_b16 bneg_one      = npyv_cmpeq_s16(vscalar, vneg_one);
+    npyv_lanetype_s16 *dst2 = (npyv_lanetype_s16 *) args[3];  /* remainders */
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
+#endif
+        npyv_s16 a = npyv_load_s16(src1);  // vector body: vstep elements per pass
+#if 2 <= 1 /* fmod and remainder */
+        npyv_s16 rem       = vsx4_mod_scalar_s16(a, divisor);
+#else /* divmod */
+        npyv_s16 quo       = vsx4_div_scalar_s16(a, divisor);
+        npyv_s16 rem       = npyv_sub_s16(a, vec_mul(vscalar, quo));  // rem = a - scalar * quo
+        // overflow lanes: (a == NPY_MIN_INT16 && b == -1)
+        npyv_b16 amin     = npyv_cmpeq_s16(a, vmin);
+        npyv_b16 overflow = npyv_and_s16(bneg_one, amin);
+                        warn = npyv_or_s16(overflow, warn);
+#endif
+#if 2 >= 1 /* remainder and divmod */
+        // Python-style result: adjust when the operand signs differ and rem != 0
+        // no adjustment when ((a > 0) == (b > 0) || rem == 0)
+        npyv_b16 a_gt_zero  = npyv_cmpgt_s16(a, vzero);
+        npyv_b16 ab_eq_cond = npyv_cmpeq_s16(a_gt_zero, b_gt_zero);
+        npyv_b16 rem_zero   = npyv_cmpeq_s16(rem, vzero);
+        npyv_b16 or         = npyv_or_s16(ab_eq_cond, rem_zero);
+        npyv_s16 to_add      = npyv_select_s16(or, vzero, vscalar);
+                           rem = npyv_add_s16(rem, to_add);  // vector form of `*dst2 += scalar` in the tail
+#endif
+#if 2 == 2 /* divmod */
+        npyv_s16 to_sub = npyv_select_s16(or, vzero, vneg_one);
+        quo               = npyv_add_s16(quo, to_sub);  // vector form of `*dst1 -= 1` in the tail
+        // Overflow: set quo to minimum and rem to 0
+        quo               = npyv_select_s16(overflow, vmin, quo);
+        rem               = npyv_select_s16(overflow, vzero, rem);
+        npyv_store_s16(dst1, quo);
+        npyv_store_s16(dst2, rem);
+#else /* fmod and remainder */
+        npyv_store_s16(dst1, rem);
+#endif
+    }
+
+#if 2 == 2 /* divmod */
+    if (!vec_all_eq(warn, vzero)) {  // at least one vector lane overflowed
+        npy_set_floatstatus_overflow();
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {  // scalar tail for the remaining (< vstep) elements
+        const npyv_lanetype_s16 a = *src1;
+        if (NPY_UNLIKELY(a == NPY_MIN_INT16 && scalar == -1)) {
+            npy_set_floatstatus_overflow();
+            *dst1 = NPY_MIN_INT16;
+            *dst2 = 0;
+        }
+        else {
+            *dst1 = a / scalar;  // NOTE(review): no zero check; presumably the caller guarantees scalar != 0 -- confirm
+            *dst2 = a % scalar;
+            if (!((a > 0) == (scalar > 0) || *dst2 == 0)) {  // signs differ and remainder is non-zero
+                *dst1 -= 1;
+                *dst2 += scalar;
+            }
+        }
+    }
+#else /* fmod and remainder */
+    for (; len > 0; --len, ++src1, ++dst1) {
+        const npyv_lanetype_s16 a = *src1;
+        *dst1 = a % scalar;
+#if 2 == 1 /* remainder */
+        if (!((a > 0) == (scalar > 0) || *dst1 == 0)) {
+            *dst1 += scalar;
+        }
+#endif
+    }
+#endif
+    npyv_cleanup();
+}
+
+
+#line 291
+#line 295
+static inline void  /* element-wise fmod (C `%`, no sign adjustment) of two contiguous s32 arrays */
+vsx4_simd_fmod_contig_s32(char **args, npy_intp len)  /* len: element count */
+{
+    npyv_lanetype_s32 *src1 = (npyv_lanetype_s32 *) args[0];  /* dividends */
+    npyv_lanetype_s32 *src2 = (npyv_lanetype_s32 *) args[1];  /* divisors */
+    npyv_lanetype_s32 *dst1 = (npyv_lanetype_s32 *) args[2];  /* remainders */
+    const npyv_s32 vzero    = npyv_zero_s32();
+    const int vstep           = npyv_nlanes_s32;  /* elements per vector iteration */
+#if 0 == 2 /* divmod */
+    npyv_lanetype_s32 *dst2 = (npyv_lanetype_s32 *) args[3];
+    const npyv_s32 vneg_one = npyv_setall_s32(-1);
+    const npyv_s32 vmin     = npyv_setall_s32(NPY_MIN_INT32);
+    npyv_b32 warn_zero     = npyv_cvt_b32_s32(npyv_zero_s32());
+    npyv_b32 warn_overflow = npyv_cvt_b32_s32(npyv_zero_s32());
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) {
+#endif
+        npyv_s32 a = npyv_load_s32(src1);  // vector body: vstep elements per pass
+        npyv_s32 b = npyv_load_s32(src2);
+#if 0 <= 1 /* fmod and remainder */
+        npyv_s32 rem       = vsx4_mod_s32(a, b);
+#else /* divmod */
+        npyv_s32 quo       = vsx4_div_s32(a, b);
+        npyv_s32 rem       = npyv_sub_s32(a, vec_mul(b, quo));
+        // special lanes: (b == 0 || (a == NPY_MIN_INT32 && b == -1))
+        npyv_b32 bzero    = npyv_cmpeq_s32(b, vzero);
+        npyv_b32 amin     = npyv_cmpeq_s32(a, vmin);
+        npyv_b32 bneg_one = npyv_cmpeq_s32(b, vneg_one);
+        npyv_b32 overflow = npyv_and_s32(bneg_one, amin);
+                warn_zero = npyv_or_s32(bzero, warn_zero);
+               warn_overflow = npyv_or_s32(overflow, warn_overflow);
+#endif
+#if 0 >= 1 /* remainder and divmod */
+        // Python-style result: adjust when the operand signs differ and rem != 0
+        // no adjustment when ((a > 0) == (b > 0) || rem == 0)
+        npyv_b32 a_gt_zero  = npyv_cmpgt_s32(a, vzero);
+        npyv_b32 b_gt_zero  = npyv_cmpgt_s32(b, vzero);
+        npyv_b32 ab_eq_cond = npyv_cmpeq_s32(a_gt_zero, b_gt_zero);
+        npyv_b32 rem_zero   = npyv_cmpeq_s32(rem, vzero);
+        npyv_b32 or         = npyv_or_s32(ab_eq_cond, rem_zero);
+        npyv_s32 to_add      = npyv_select_s32(or, vzero, b);
+                           rem = npyv_add_s32(rem, to_add);
+#endif
+#if 0 == 2 /* divmod */
+        npyv_s32 to_sub = npyv_select_s32(or, vzero, vneg_one);
+                      quo = npyv_add_s32(quo, to_sub);
+                      // Divide by zero: quo and rem both become 0
+                      quo = npyv_select_s32(bzero, vzero, quo);
+                      rem = npyv_select_s32(bzero, vzero, rem);
+                      // Overflow: quo becomes NPY_MIN_INT32 and rem becomes 0
+                      quo = npyv_select_s32(overflow, vmin, quo);
+                      rem = npyv_select_s32(overflow, vzero, rem);
+        npyv_store_s32(dst1, quo);
+        npyv_store_s32(dst2, rem);
+#else /* fmod and remainder */
+        npyv_store_s32(dst1, rem);  // NOTE(review): b == 0 lanes are stored as vsx4_mod_s32 returned them; the scalar tail writes 0 -- confirm
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {  // any zero divisor in this vector
+            npy_set_floatstatus_divbyzero();
+        }
+#endif
+    }
+
+#if 0 == 2 /* divmod */
+    if (!vec_all_eq(warn_zero, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+    if (!vec_all_eq(warn_overflow, vzero)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
+        const npyv_lanetype_s32 a = *src1;
+        const npyv_lanetype_s32 b = *src2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT32, NPY_TRUE)) {
+            if (b == 0) {
+                npy_set_floatstatus_divbyzero();
+                *dst1 = 0;
+                *dst2 = 0;
+            }
+            else {
+                npy_set_floatstatus_overflow();
+                *dst1 = NPY_MIN_INT32;
+                *dst2 = 0;
+            }
+        }
+        else {
+            *dst1 = a / b;
+            *dst2 = a % b;
+            if (!((a > 0) == (b > 0) || *dst2 == 0)) {
+                *dst1 -= 1;
+                *dst2 += b;
+            }
+        }
+    }
+#else /* fmod and remainder */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {  // scalar tail for the remaining (< vstep) elements
+        const npyv_lanetype_s32 a = *src1;
+        const npyv_lanetype_s32 b = *src2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT32, NPY_TRUE)) {  // macro defined elsewhere; presumably b == 0 or (a == MIN && b == -1)
+            FLAG_IF_DIVIDEBYZERO(b);
+            *dst1 = 0;  // result is 0 whenever the check fires
+        } else{
+            *dst1 = a % b;
+#if 0 == 1 /* remainder */
+            if (!((a > 0) == (b > 0) || *dst1 == 0)) {
+                *dst1 += b;
+            }
+#endif
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+static inline void
+vsx4_simd_fmod_by_scalar_contig_s32(char **args, npy_intp len)
+{   /*
+     * fmod instantiation (template id 0) for int32 lanes, array-by-scalar:
+     * stores src1[i] % scalar into dst1[i] for `len` consecutive elements.
+     *
+     * args[0] - input elements (read as npyv_lanetype_s32)
+     * args[1] - pointer to the single divisor; it is dereferenced once
+     * args[2] - output elements
+     * len     - number of elements still to process
+     *
+     * Because the id is 0, every `#if 0 >= 1` and `#if 0 == 2` region below
+     * is compiled out: the stored value keeps the sign produced by `%` (no
+     * Python-style fix-up), there is no second output, and this function
+     * itself makes no npy_set_floatstatus_* call.
+     *
+     * NOTE(review): `scalar` is never tested against 0, and the pair
+     * (a == NPY_MIN_INT32, scalar == -1) is not special-cased in this
+     * variant - presumably the caller filters scalar == 0 and
+     * vsx4_mod_scalar_s32 copes with the overflow pair; confirm both.
+     */
+    npyv_lanetype_s32 *src1  = (npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 scalar = *(npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_s32 *dst1  = (npyv_lanetype_s32 *) args[2];
+    const npyv_s32 vscalar   = npyv_setall_s32(scalar);  /* vector form of the divisor */
+    const npyv_s32 divisor    = vsx4_divisor_s32(vscalar);  /* operand handed to the vsx4_*_scalar helper */
+    const int vstep            = npyv_nlanes_s32;  /* elements consumed per vector iteration */
+#if 0 >= 1 /* remainder and divmod */
+    const npyv_s32 vzero     = npyv_zero_s32();
+    npyv_b32 b_gt_zero      = npyv_cmpgt_s32(vscalar, vzero);
+#endif
+#if 0 == 2 /* divmod */
+    npyv_b32 warn          = npyv_cvt_b32_s32(npyv_zero_s32());
+    const npyv_s32 vmin     = npyv_setall_s32(NPY_MIN_INT32);
+    const npyv_s32 vneg_one = npyv_setall_s32(-1);
+    npyv_b32 bneg_one      = npyv_cmpeq_s32(vscalar, vneg_one);
+    npyv_lanetype_s32 *dst2 = (npyv_lanetype_s32 *) args[3];
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
+#endif
+        npyv_s32 a = npyv_load_s32(src1);  /* next vstep dividends */
+#if 0 <= 1 /* fmod and remainder */
+        npyv_s32 rem       = vsx4_mod_scalar_s32(a, divisor);
+#else /* divmod */
+        npyv_s32 quo       = vsx4_div_scalar_s32(a, divisor);
+        npyv_s32 rem       = npyv_sub_s32(a, vec_mul(vscalar, quo));
+        // (a == NPY_MIN_INT32 && b == -1)
+        npyv_b32 amin     = npyv_cmpeq_s32(a, vmin);
+        npyv_b32 overflow = npyv_and_s32(bneg_one, amin);
+                        warn = npyv_or_s32(overflow, warn);
+#endif
+#if 0 >= 1 /* remainder and divmod */
+        // handle mixed case the way Python does
+        // ((a > 0) == (b > 0) || rem == 0)
+        npyv_b32 a_gt_zero  = npyv_cmpgt_s32(a, vzero);
+        npyv_b32 ab_eq_cond = npyv_cmpeq_s32(a_gt_zero, b_gt_zero);
+        npyv_b32 rem_zero   = npyv_cmpeq_s32(rem, vzero);
+        npyv_b32 or         = npyv_or_s32(ab_eq_cond, rem_zero);
+        npyv_s32 to_add      = npyv_select_s32(or, vzero, vscalar);
+                           rem = npyv_add_s32(rem, to_add);
+#endif
+#if 0 == 2 /* divmod */
+        npyv_s32 to_sub = npyv_select_s32(or, vzero, vneg_one);
+        quo               = npyv_add_s32(quo, to_sub);
+        // Overflow: set quo to minimum and rem to 0
+        quo               = npyv_select_s32(overflow, vmin, quo);
+        rem               = npyv_select_s32(overflow, vzero, rem);
+        npyv_store_s32(dst1, quo);
+        npyv_store_s32(dst2, rem);
+#else /* fmod and remainder */
+        npyv_store_s32(dst1, rem);
+#endif
+    }
+
+#if 0 == 2 /* divmod */
+    if (!vec_all_eq(warn, vzero)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
+        const npyv_lanetype_s32 a = *src1;
+        if (NPY_UNLIKELY(a == NPY_MIN_INT32 && scalar == -1)) {
+            npy_set_floatstatus_overflow();
+            *dst1 = NPY_MIN_INT32;
+            *dst2 = 0;
+        }
+        else {
+            *dst1 = a / scalar;
+            *dst2 = a % scalar;
+            if (!((a > 0) == (scalar > 0) || *dst2 == 0)) {
+                *dst1 -= 1;
+                *dst2 += scalar;
+            }
+        }
+    }
+#else /* fmod and remainder */
+    for (; len > 0; --len, ++src1, ++dst1) {  /* leftover elements (fewer than vstep), one at a time */
+        const npyv_lanetype_s32 a = *src1;
+        *dst1 = a % scalar;
+#if 0 == 1 /* remainder */
+        if (!((a > 0) == (scalar > 0) || *dst1 == 0)) {
+            *dst1 += scalar;
+        }
+#endif
+    }
+#endif
+    npyv_cleanup();
+}
+
+#line 295
+static inline void
+vsx4_simd_remainder_contig_s32(char **args, npy_intp len)
+{   /*
+     * remainder instantiation (template id 1) for int32 lanes,
+     * array-by-array: dst1[i] receives src1[i] % src2[i], then the divisor
+     * is added back when that remainder is non-zero and (a > 0) differs
+     * from (b > 0).
+     *
+     * args[0] - dividends
+     * args[1] - divisors
+     * args[2] - output
+     * len     - number of elements; all three pointers advance one element
+     *           per processed element
+     *
+     * With id 1 the `#if 1 == 2` (divmod) regions are compiled out, so
+     * there is no quotient output and no npy_set_floatstatus_overflow call.
+     * The vector loop raises the divide-by-zero status whenever any lane of
+     * `b` equals zero.
+     *
+     * NOTE(review): the scalar tail stores 0 when
+     * DIVIDEBYZERO_OVERFLOW_CHECK fires, whereas the vector loop stores
+     * whatever vsx4_mod_s32 plus the sign fix-up produced for such lanes -
+     * confirm the two paths agree for b == 0 and for (NPY_MIN_INT32, -1).
+     */
+    npyv_lanetype_s32 *src1 = (npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 *src2 = (npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_s32 *dst1 = (npyv_lanetype_s32 *) args[2];
+    const npyv_s32 vzero    = npyv_zero_s32();
+    const int vstep           = npyv_nlanes_s32;  /* elements consumed per vector iteration */
+#if 1 == 2 /* divmod */
+    npyv_lanetype_s32 *dst2 = (npyv_lanetype_s32 *) args[3];
+    const npyv_s32 vneg_one = npyv_setall_s32(-1);
+    const npyv_s32 vmin     = npyv_setall_s32(NPY_MIN_INT32);
+    npyv_b32 warn_zero     = npyv_cvt_b32_s32(npyv_zero_s32());
+    npyv_b32 warn_overflow = npyv_cvt_b32_s32(npyv_zero_s32());
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) {
+#endif
+        npyv_s32 a = npyv_load_s32(src1);
+        npyv_s32 b = npyv_load_s32(src2);
+#if 1 <= 1 /* fmod and remainder */
+        npyv_s32 rem       = vsx4_mod_s32(a, b);
+#else /* divmod */
+        npyv_s32 quo       = vsx4_div_s32(a, b);
+        npyv_s32 rem       = npyv_sub_s32(a, vec_mul(b, quo));
+        // (b == 0 || (a == NPY_MIN_INT32 && b == -1))
+        npyv_b32 bzero    = npyv_cmpeq_s32(b, vzero);
+        npyv_b32 amin     = npyv_cmpeq_s32(a, vmin);
+        npyv_b32 bneg_one = npyv_cmpeq_s32(b, vneg_one);
+        npyv_b32 overflow = npyv_and_s32(bneg_one, amin);
+                warn_zero = npyv_or_s32(bzero, warn_zero);
+               warn_overflow = npyv_or_s32(overflow, warn_overflow);
+#endif
+#if 1 >= 1 /* remainder and divmod */
+        // handle mixed case the way Python does
+        // ((a > 0) == (b > 0) || rem == 0)
+        npyv_b32 a_gt_zero  = npyv_cmpgt_s32(a, vzero);
+        npyv_b32 b_gt_zero  = npyv_cmpgt_s32(b, vzero);
+        npyv_b32 ab_eq_cond = npyv_cmpeq_s32(a_gt_zero, b_gt_zero);
+        npyv_b32 rem_zero   = npyv_cmpeq_s32(rem, vzero);
+        npyv_b32 or         = npyv_or_s32(ab_eq_cond, rem_zero);
+        npyv_s32 to_add      = npyv_select_s32(or, vzero, b);  /* 0 where no fix-up is needed, else b */
+                           rem = npyv_add_s32(rem, to_add);
+#endif
+#if 1 == 2 /* divmod */
+        npyv_s32 to_sub = npyv_select_s32(or, vzero, vneg_one);
+                      quo = npyv_add_s32(quo, to_sub);
+                      // Divide by zero
+                      quo = npyv_select_s32(bzero, vzero, quo);
+                      rem = npyv_select_s32(bzero, vzero, rem);
+                      // Overflow
+                      quo = npyv_select_s32(overflow, vmin, quo);
+                      rem = npyv_select_s32(overflow, vzero, rem);
+        npyv_store_s32(dst1, quo);
+        npyv_store_s32(dst2, rem);
+#else /* fmod and remainder */
+        npyv_store_s32(dst1, rem);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {  /* at least one divisor lane is zero */
+            npy_set_floatstatus_divbyzero();
+        }
+#endif
+    }
+
+#if 1 == 2 /* divmod */
+    if (!vec_all_eq(warn_zero, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+    if (!vec_all_eq(warn_overflow, vzero)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
+        const npyv_lanetype_s32 a = *src1;
+        const npyv_lanetype_s32 b = *src2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT32, NPY_TRUE)) {
+            if (b == 0) {
+                npy_set_floatstatus_divbyzero();
+                *dst1 = 0;
+                *dst2 = 0;
+            }
+            else {
+                npy_set_floatstatus_overflow();
+                *dst1 = NPY_MIN_INT32;
+                *dst2 = 0;
+            }
+        }
+        else {
+            *dst1 = a / b;
+            *dst2 = a % b;
+            if (!((a > 0) == (b > 0) || *dst2 == 0)) {
+                *dst1 -= 1;
+                *dst2 += b;
+            }
+        }
+    }
+#else /* fmod and remainder */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {  /* leftover elements (fewer than vstep), one at a time */
+        const npyv_lanetype_s32 a = *src1;
+        const npyv_lanetype_s32 b = *src2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT32, NPY_TRUE)) {
+            FLAG_IF_DIVIDEBYZERO(b);
+            *dst1 = 0;
+        } else{
+            *dst1 = a % b;
+#if 1 == 1 /* remainder */
+            if (!((a > 0) == (b > 0) || *dst1 == 0)) {
+                *dst1 += b;
+            }
+#endif
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+static inline void
+vsx4_simd_remainder_by_scalar_contig_s32(char **args, npy_intp len)
+{   /*
+     * remainder instantiation (template id 1) for int32 lanes,
+     * array-by-scalar: dst1[i] receives src1[i] % scalar, then `scalar` is
+     * added back when that remainder is non-zero and (a > 0) differs from
+     * (scalar > 0).
+     *
+     * args[0] - input elements (read as npyv_lanetype_s32)
+     * args[1] - pointer to the single divisor; it is dereferenced once
+     * args[2] - output elements
+     * len     - number of elements still to process
+     *
+     * With id 1 the `#if 1 == 2` (divmod) regions are compiled out: there
+     * is no quotient output and this function itself makes no
+     * npy_set_floatstatus_* call. The (scalar > 0) mask is computed once,
+     * before the vector loop, and reused in every iteration.
+     *
+     * NOTE(review): `scalar` is never tested against 0, and the pair
+     * (a == NPY_MIN_INT32, scalar == -1) is not special-cased in this
+     * variant - presumably the caller filters scalar == 0 and
+     * vsx4_mod_scalar_s32 copes with the overflow pair; confirm both.
+     */
+    npyv_lanetype_s32 *src1  = (npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 scalar = *(npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_s32 *dst1  = (npyv_lanetype_s32 *) args[2];
+    const npyv_s32 vscalar   = npyv_setall_s32(scalar);  /* vector form of the divisor */
+    const npyv_s32 divisor    = vsx4_divisor_s32(vscalar);  /* operand handed to the vsx4_*_scalar helper */
+    const int vstep            = npyv_nlanes_s32;  /* elements consumed per vector iteration */
+#if 1 >= 1 /* remainder and divmod */
+    const npyv_s32 vzero     = npyv_zero_s32();
+    npyv_b32 b_gt_zero      = npyv_cmpgt_s32(vscalar, vzero);  /* loop-invariant (scalar > 0) mask */
+#endif
+#if 1 == 2 /* divmod */
+    npyv_b32 warn          = npyv_cvt_b32_s32(npyv_zero_s32());
+    const npyv_s32 vmin     = npyv_setall_s32(NPY_MIN_INT32);
+    const npyv_s32 vneg_one = npyv_setall_s32(-1);
+    npyv_b32 bneg_one      = npyv_cmpeq_s32(vscalar, vneg_one);
+    npyv_lanetype_s32 *dst2 = (npyv_lanetype_s32 *) args[3];
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
+#endif
+        npyv_s32 a = npyv_load_s32(src1);  /* next vstep dividends */
+#if 1 <= 1 /* fmod and remainder */
+        npyv_s32 rem       = vsx4_mod_scalar_s32(a, divisor);
+#else /* divmod */
+        npyv_s32 quo       = vsx4_div_scalar_s32(a, divisor);
+        npyv_s32 rem       = npyv_sub_s32(a, vec_mul(vscalar, quo));
+        // (a == NPY_MIN_INT32 && b == -1)
+        npyv_b32 amin     = npyv_cmpeq_s32(a, vmin);
+        npyv_b32 overflow = npyv_and_s32(bneg_one, amin);
+                        warn = npyv_or_s32(overflow, warn);
+#endif
+#if 1 >= 1 /* remainder and divmod */
+        // handle mixed case the way Python does
+        // ((a > 0) == (b > 0) || rem == 0)
+        npyv_b32 a_gt_zero  = npyv_cmpgt_s32(a, vzero);
+        npyv_b32 ab_eq_cond = npyv_cmpeq_s32(a_gt_zero, b_gt_zero);
+        npyv_b32 rem_zero   = npyv_cmpeq_s32(rem, vzero);
+        npyv_b32 or         = npyv_or_s32(ab_eq_cond, rem_zero);
+        npyv_s32 to_add      = npyv_select_s32(or, vzero, vscalar);  /* 0 where no fix-up is needed, else scalar */
+                           rem = npyv_add_s32(rem, to_add);
+#endif
+#if 1 == 2 /* divmod */
+        npyv_s32 to_sub = npyv_select_s32(or, vzero, vneg_one);
+        quo               = npyv_add_s32(quo, to_sub);
+        // Overflow: set quo to minimum and rem to 0
+        quo               = npyv_select_s32(overflow, vmin, quo);
+        rem               = npyv_select_s32(overflow, vzero, rem);
+        npyv_store_s32(dst1, quo);
+        npyv_store_s32(dst2, rem);
+#else /* fmod and remainder */
+        npyv_store_s32(dst1, rem);
+#endif
+    }
+
+#if 1 == 2 /* divmod */
+    if (!vec_all_eq(warn, vzero)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
+        const npyv_lanetype_s32 a = *src1;
+        if (NPY_UNLIKELY(a == NPY_MIN_INT32 && scalar == -1)) {
+            npy_set_floatstatus_overflow();
+            *dst1 = NPY_MIN_INT32;
+            *dst2 = 0;
+        }
+        else {
+            *dst1 = a / scalar;
+            *dst2 = a % scalar;
+            if (!((a > 0) == (scalar > 0) || *dst2 == 0)) {
+                *dst1 -= 1;
+                *dst2 += scalar;
+            }
+        }
+    }
+#else /* fmod and remainder */
+    for (; len > 0; --len, ++src1, ++dst1) {  /* leftover elements (fewer than vstep), one at a time */
+        const npyv_lanetype_s32 a = *src1;
+        *dst1 = a % scalar;
+#if 1 == 1 /* remainder */
+        if (!((a > 0) == (scalar > 0) || *dst1 == 0)) {
+            *dst1 += scalar;
+        }
+#endif
+    }
+#endif
+    npyv_cleanup();
+}
+
+#line 295
+static inline void
+vsx4_simd_divmod_contig_s32(char **args, npy_intp len)
+{   /*
+     * divmod instantiation (template id 2) for int32 lanes, array-by-array:
+     * dst1[i] gets the quotient and dst2[i] the remainder of
+     * src1[i] / src2[i]. When the truncated remainder is non-zero and
+     * (a > 0) differs from (b > 0), the quotient is decremented by one and
+     * the divisor is added to the remainder.
+     *
+     * args[0] - dividends
+     * args[1] - divisors
+     * args[2] - quotient output
+     * args[3] - remainder output
+     * len     - number of elements; all four pointers advance one element
+     *           per processed element
+     *
+     * Special cases stored by both loops: b == 0 gives quotient 0 and
+     * remainder 0; a == NPY_MIN_INT32 with b == -1 gives quotient
+     * NPY_MIN_INT32 and remainder 0. The vector loop only accumulates the
+     * offending lanes in warn_zero / warn_overflow and the status flags are
+     * raised once after it; the scalar tail raises them per element.
+     */
+    npyv_lanetype_s32 *src1 = (npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 *src2 = (npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_s32 *dst1 = (npyv_lanetype_s32 *) args[2];
+    const npyv_s32 vzero    = npyv_zero_s32();
+    const int vstep           = npyv_nlanes_s32;  /* elements consumed per vector iteration */
+#if 2 == 2 /* divmod */
+    npyv_lanetype_s32 *dst2 = (npyv_lanetype_s32 *) args[3];
+    const npyv_s32 vneg_one = npyv_setall_s32(-1);
+    const npyv_s32 vmin     = npyv_setall_s32(NPY_MIN_INT32);
+    npyv_b32 warn_zero     = npyv_cvt_b32_s32(npyv_zero_s32());  /* accumulates b == 0 lanes */
+    npyv_b32 warn_overflow = npyv_cvt_b32_s32(npyv_zero_s32());  /* accumulates (MIN, -1) lanes */
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) {
+#endif
+        npyv_s32 a = npyv_load_s32(src1);
+        npyv_s32 b = npyv_load_s32(src2);
+#if 2 <= 1 /* fmod and remainder */
+        npyv_s32 rem       = vsx4_mod_s32(a, b);
+#else /* divmod */
+        npyv_s32 quo       = vsx4_div_s32(a, b);
+        npyv_s32 rem       = npyv_sub_s32(a, vec_mul(b, quo));  /* rem = a - b * quo */
+        // (b == 0 || (a == NPY_MIN_INT32 && b == -1))
+        npyv_b32 bzero    = npyv_cmpeq_s32(b, vzero);
+        npyv_b32 amin     = npyv_cmpeq_s32(a, vmin);
+        npyv_b32 bneg_one = npyv_cmpeq_s32(b, vneg_one);
+        npyv_b32 overflow = npyv_and_s32(bneg_one, amin);
+                warn_zero = npyv_or_s32(bzero, warn_zero);
+               warn_overflow = npyv_or_s32(overflow, warn_overflow);
+#endif
+#if 2 >= 1 /* remainder and divmod */
+        // handle mixed case the way Python does
+        // ((a > 0) == (b > 0) || rem == 0)
+        npyv_b32 a_gt_zero  = npyv_cmpgt_s32(a, vzero);
+        npyv_b32 b_gt_zero  = npyv_cmpgt_s32(b, vzero);
+        npyv_b32 ab_eq_cond = npyv_cmpeq_s32(a_gt_zero, b_gt_zero);
+        npyv_b32 rem_zero   = npyv_cmpeq_s32(rem, vzero);
+        npyv_b32 or         = npyv_or_s32(ab_eq_cond, rem_zero);
+        npyv_s32 to_add      = npyv_select_s32(or, vzero, b);  /* 0 where no fix-up is needed, else b */
+                           rem = npyv_add_s32(rem, to_add);
+#endif
+#if 2 == 2 /* divmod */
+        npyv_s32 to_sub = npyv_select_s32(or, vzero, vneg_one);  /* 0 where no fix-up is needed, else -1 */
+                      quo = npyv_add_s32(quo, to_sub);
+                      // Divide by zero
+                      quo = npyv_select_s32(bzero, vzero, quo);
+                      rem = npyv_select_s32(bzero, vzero, rem);
+                      // Overflow
+                      quo = npyv_select_s32(overflow, vmin, quo);
+                      rem = npyv_select_s32(overflow, vzero, rem);
+        npyv_store_s32(dst1, quo);
+        npyv_store_s32(dst2, rem);
+#else /* fmod and remainder */
+        npyv_store_s32(dst1, rem);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {
+            npy_set_floatstatus_divbyzero();
+        }
+#endif
+    }
+
+#if 2 == 2 /* divmod */
+    if (!vec_all_eq(warn_zero, vzero)) {  /* some vector lane had b == 0 */
+        npy_set_floatstatus_divbyzero();
+    }
+    if (!vec_all_eq(warn_overflow, vzero)) {  /* some vector lane hit (MIN, -1) */
+        npy_set_floatstatus_overflow();
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {  /* leftover elements (fewer than vstep), one at a time */
+        const npyv_lanetype_s32 a = *src1;
+        const npyv_lanetype_s32 b = *src2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT32, NPY_TRUE)) {
+            if (b == 0) {
+                npy_set_floatstatus_divbyzero();
+                *dst1 = 0;
+                *dst2 = 0;
+            }
+            else {
+                npy_set_floatstatus_overflow();
+                *dst1 = NPY_MIN_INT32;
+                *dst2 = 0;
+            }
+        }
+        else {
+            *dst1 = a / b;
+            *dst2 = a % b;
+            if (!((a > 0) == (b > 0) || *dst2 == 0)) {
+                *dst1 -= 1;
+                *dst2 += b;
+            }
+        }
+    }
+#else /* fmod and remainder */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {
+        const npyv_lanetype_s32 a = *src1;
+        const npyv_lanetype_s32 b = *src2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT32, NPY_TRUE)) {
+            FLAG_IF_DIVIDEBYZERO(b);
+            *dst1 = 0;
+        } else{
+            *dst1 = a % b;
+#if 2 == 1 /* remainder */
+            if (!((a > 0) == (b > 0) || *dst1 == 0)) {
+                *dst1 += b;
+            }
+#endif
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+static inline void
+vsx4_simd_divmod_by_scalar_contig_s32(char **args, npy_intp len)
+{   /*
+     * divmod instantiation (template id 2) for int32 lanes,
+     * array-by-scalar: dst1[i] gets the quotient and dst2[i] the remainder
+     * of src1[i] / scalar. When the truncated remainder is non-zero and
+     * (a > 0) differs from (scalar > 0), the quotient is decremented by one
+     * and `scalar` is added to the remainder.
+     *
+     * args[0] - dividends
+     * args[1] - pointer to the single divisor; it is dereferenced once
+     * args[2] - quotient output
+     * args[3] - remainder output
+     * len     - number of elements still to process
+     *
+     * Overflow (a == NPY_MIN_INT32 with scalar == -1) stores quotient
+     * NPY_MIN_INT32 and remainder 0. The vector loop accumulates such lanes
+     * in `warn` and the overflow status is raised once after it; the scalar
+     * tail raises it per element.
+     *
+     * NOTE(review): neither loop handles scalar == 0 - presumably the
+     * caller never dispatches here with a zero divisor; confirm.
+     */
+    npyv_lanetype_s32 *src1  = (npyv_lanetype_s32 *) args[0];
+    npyv_lanetype_s32 scalar = *(npyv_lanetype_s32 *) args[1];
+    npyv_lanetype_s32 *dst1  = (npyv_lanetype_s32 *) args[2];
+    const npyv_s32 vscalar   = npyv_setall_s32(scalar);  /* vector form of the divisor */
+    const npyv_s32 divisor    = vsx4_divisor_s32(vscalar);  /* operand handed to the vsx4_*_scalar helper */
+    const int vstep            = npyv_nlanes_s32;  /* elements consumed per vector iteration */
+#if 2 >= 1 /* remainder and divmod */
+    const npyv_s32 vzero     = npyv_zero_s32();
+    npyv_b32 b_gt_zero      = npyv_cmpgt_s32(vscalar, vzero);  /* loop-invariant (scalar > 0) mask */
+#endif
+#if 2 == 2 /* divmod */
+    npyv_b32 warn          = npyv_cvt_b32_s32(npyv_zero_s32());  /* accumulates overflow lanes */
+    const npyv_s32 vmin     = npyv_setall_s32(NPY_MIN_INT32);
+    const npyv_s32 vneg_one = npyv_setall_s32(-1);
+    npyv_b32 bneg_one      = npyv_cmpeq_s32(vscalar, vneg_one);  /* loop-invariant (scalar == -1) mask */
+    npyv_lanetype_s32 *dst2 = (npyv_lanetype_s32 *) args[3];
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
+#endif
+        npyv_s32 a = npyv_load_s32(src1);  /* next vstep dividends */
+#if 2 <= 1 /* fmod and remainder */
+        npyv_s32 rem       = vsx4_mod_scalar_s32(a, divisor);
+#else /* divmod */
+        npyv_s32 quo       = vsx4_div_scalar_s32(a, divisor);
+        npyv_s32 rem       = npyv_sub_s32(a, vec_mul(vscalar, quo));  /* rem = a - scalar * quo */
+        // (a == NPY_MIN_INT32 && b == -1)
+        npyv_b32 amin     = npyv_cmpeq_s32(a, vmin);
+        npyv_b32 overflow = npyv_and_s32(bneg_one, amin);
+                        warn = npyv_or_s32(overflow, warn);
+#endif
+#if 2 >= 1 /* remainder and divmod */
+        // handle mixed case the way Python does
+        // ((a > 0) == (b > 0) || rem == 0)
+        npyv_b32 a_gt_zero  = npyv_cmpgt_s32(a, vzero);
+        npyv_b32 ab_eq_cond = npyv_cmpeq_s32(a_gt_zero, b_gt_zero);
+        npyv_b32 rem_zero   = npyv_cmpeq_s32(rem, vzero);
+        npyv_b32 or         = npyv_or_s32(ab_eq_cond, rem_zero);
+        npyv_s32 to_add      = npyv_select_s32(or, vzero, vscalar);  /* 0 where no fix-up is needed, else scalar */
+                           rem = npyv_add_s32(rem, to_add);
+#endif
+#if 2 == 2 /* divmod */
+        npyv_s32 to_sub = npyv_select_s32(or, vzero, vneg_one);  /* 0 where no fix-up is needed, else -1 */
+        quo               = npyv_add_s32(quo, to_sub);
+        // Overflow: set quo to minimum and rem to 0
+        quo               = npyv_select_s32(overflow, vmin, quo);
+        rem               = npyv_select_s32(overflow, vzero, rem);
+        npyv_store_s32(dst1, quo);
+        npyv_store_s32(dst2, rem);
+#else /* fmod and remainder */
+        npyv_store_s32(dst1, rem);
+#endif
+    }
+
+#if 2 == 2 /* divmod */
+    if (!vec_all_eq(warn, vzero)) {  /* some vector lane hit (MIN, -1) */
+        npy_set_floatstatus_overflow();
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {  /* leftover elements (fewer than vstep), one at a time */
+        const npyv_lanetype_s32 a = *src1;
+        if (NPY_UNLIKELY(a == NPY_MIN_INT32 && scalar == -1)) {
+            npy_set_floatstatus_overflow();
+            *dst1 = NPY_MIN_INT32;
+            *dst2 = 0;
+        }
+        else {
+            *dst1 = a / scalar;
+            *dst2 = a % scalar;
+            if (!((a > 0) == (scalar > 0) || *dst2 == 0)) {
+                *dst1 -= 1;
+                *dst2 += scalar;
+            }
+        }
+    }
+#else /* fmod and remainder */
+    for (; len > 0; --len, ++src1, ++dst1) {
+        const npyv_lanetype_s32 a = *src1;
+        *dst1 = a % scalar;
+#if 2 == 1 /* remainder */
+        if (!((a > 0) == (scalar > 0) || *dst1 == 0)) {
+            *dst1 += scalar;
+        }
+#endif
+    }
+#endif
+    npyv_cleanup();
+}
+
+
+#line 291
+#line 295
+static inline void
+vsx4_simd_fmod_contig_s64(char **args, npy_intp len)
+{   /*
+     * fmod instantiation (template id 0) for int64 lanes, array-by-array:
+     * stores src1[i] % src2[i] into dst1[i] for `len` consecutive elements.
+     *
+     * args[0] - dividends
+     * args[1] - divisors
+     * args[2] - output
+     * len     - number of elements; all three pointers advance one element
+     *           per processed element
+     *
+     * Because the id is 0, every `#if 0 >= 1` and `#if 0 == 2` region below
+     * is compiled out: the stored value keeps the sign produced by `%` (no
+     * Python-style fix-up), there is no quotient output and no
+     * npy_set_floatstatus_overflow call. The vector loop raises the
+     * divide-by-zero status whenever any lane of `b` equals zero.
+     *
+     * NOTE(review): the scalar tail stores 0 when
+     * DIVIDEBYZERO_OVERFLOW_CHECK fires, whereas the vector loop stores
+     * whatever vsx4_mod_s64 produced for such lanes - confirm the two paths
+     * agree for b == 0 and for (NPY_MIN_INT64, -1).
+     */
+    npyv_lanetype_s64 *src1 = (npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 *src2 = (npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_s64 *dst1 = (npyv_lanetype_s64 *) args[2];
+    const npyv_s64 vzero    = npyv_zero_s64();
+    const int vstep           = npyv_nlanes_s64;  /* elements consumed per vector iteration */
+#if 0 == 2 /* divmod */
+    npyv_lanetype_s64 *dst2 = (npyv_lanetype_s64 *) args[3];
+    const npyv_s64 vneg_one = npyv_setall_s64(-1);
+    const npyv_s64 vmin     = npyv_setall_s64(NPY_MIN_INT64);
+    npyv_b64 warn_zero     = npyv_cvt_b64_s64(npyv_zero_s64());
+    npyv_b64 warn_overflow = npyv_cvt_b64_s64(npyv_zero_s64());
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) {
+#endif
+        npyv_s64 a = npyv_load_s64(src1);
+        npyv_s64 b = npyv_load_s64(src2);
+#if 0 <= 1 /* fmod and remainder */
+        npyv_s64 rem       = vsx4_mod_s64(a, b);
+#else /* divmod */
+        npyv_s64 quo       = vsx4_div_s64(a, b);
+        npyv_s64 rem       = npyv_sub_s64(a, vec_mul(b, quo));
+        // (b == 0 || (a == NPY_MIN_INT64 && b == -1))
+        npyv_b64 bzero    = npyv_cmpeq_s64(b, vzero);
+        npyv_b64 amin     = npyv_cmpeq_s64(a, vmin);
+        npyv_b64 bneg_one = npyv_cmpeq_s64(b, vneg_one);
+        npyv_b64 overflow = npyv_and_s64(bneg_one, amin);
+                warn_zero = npyv_or_s64(bzero, warn_zero);
+               warn_overflow = npyv_or_s64(overflow, warn_overflow);
+#endif
+#if 0 >= 1 /* remainder and divmod */
+        // handle mixed case the way Python does
+        // ((a > 0) == (b > 0) || rem == 0)
+        npyv_b64 a_gt_zero  = npyv_cmpgt_s64(a, vzero);
+        npyv_b64 b_gt_zero  = npyv_cmpgt_s64(b, vzero);
+        npyv_b64 ab_eq_cond = npyv_cmpeq_s64(a_gt_zero, b_gt_zero);
+        npyv_b64 rem_zero   = npyv_cmpeq_s64(rem, vzero);
+        npyv_b64 or         = npyv_or_s64(ab_eq_cond, rem_zero);
+        npyv_s64 to_add      = npyv_select_s64(or, vzero, b);
+                           rem = npyv_add_s64(rem, to_add);
+#endif
+#if 0 == 2 /* divmod */
+        npyv_s64 to_sub = npyv_select_s64(or, vzero, vneg_one);
+                      quo = npyv_add_s64(quo, to_sub);
+                      // Divide by zero
+                      quo = npyv_select_s64(bzero, vzero, quo);
+                      rem = npyv_select_s64(bzero, vzero, rem);
+                      // Overflow
+                      quo = npyv_select_s64(overflow, vmin, quo);
+                      rem = npyv_select_s64(overflow, vzero, rem);
+        npyv_store_s64(dst1, quo);
+        npyv_store_s64(dst2, rem);
+#else /* fmod and remainder */
+        npyv_store_s64(dst1, rem);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {  /* at least one divisor lane is zero */
+            npy_set_floatstatus_divbyzero();
+        }
+#endif
+    }
+
+#if 0 == 2 /* divmod */
+    if (!vec_all_eq(warn_zero, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+    if (!vec_all_eq(warn_overflow, vzero)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
+        const npyv_lanetype_s64 a = *src1;
+        const npyv_lanetype_s64 b = *src2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT64, NPY_TRUE)) {
+            if (b == 0) {
+                npy_set_floatstatus_divbyzero();
+                *dst1 = 0;
+                *dst2 = 0;
+            }
+            else {
+                npy_set_floatstatus_overflow();
+                *dst1 = NPY_MIN_INT64;
+                *dst2 = 0;
+            }
+        }
+        else {
+            *dst1 = a / b;
+            *dst2 = a % b;
+            if (!((a > 0) == (b > 0) || *dst2 == 0)) {
+                *dst1 -= 1;
+                *dst2 += b;
+            }
+        }
+    }
+#else /* fmod and remainder */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {  /* leftover elements (fewer than vstep), one at a time */
+        const npyv_lanetype_s64 a = *src1;
+        const npyv_lanetype_s64 b = *src2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT64, NPY_TRUE)) {
+            FLAG_IF_DIVIDEBYZERO(b);
+            *dst1 = 0;
+        } else{
+            *dst1 = a % b;
+#if 0 == 1 /* remainder */
+            if (!((a > 0) == (b > 0) || *dst1 == 0)) {
+                *dst1 += b;
+            }
+#endif
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+static inline void
+vsx4_simd_fmod_by_scalar_contig_s64(char **args, npy_intp len)
+{   /*
+     * fmod instantiation (template id 0) for int64 lanes, array-by-scalar:
+     * stores src1[i] % scalar into dst1[i] for `len` consecutive elements.
+     *
+     * args[0] - input elements (read as npyv_lanetype_s64)
+     * args[1] - pointer to the single divisor; it is dereferenced once
+     * args[2] - output elements
+     * len     - number of elements still to process
+     *
+     * Because the id is 0, every `#if 0 >= 1` and `#if 0 == 2` region below
+     * is compiled out: the stored value keeps the sign produced by `%` (no
+     * Python-style fix-up), there is no second output, and this function
+     * itself makes no npy_set_floatstatus_* call.
+     *
+     * NOTE(review): `scalar` is never tested against 0, and the pair
+     * (a == NPY_MIN_INT64, scalar == -1) is not special-cased in this
+     * variant - presumably the caller filters scalar == 0 and
+     * vsx4_mod_scalar_s64 copes with the overflow pair; confirm both.
+     */
+    npyv_lanetype_s64 *src1  = (npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 scalar = *(npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_s64 *dst1  = (npyv_lanetype_s64 *) args[2];
+    const npyv_s64 vscalar   = npyv_setall_s64(scalar);  /* vector form of the divisor */
+    const npyv_s64 divisor    = vsx4_divisor_s64(vscalar);  /* operand handed to the vsx4_*_scalar helper */
+    const int vstep            = npyv_nlanes_s64;  /* elements consumed per vector iteration */
+#if 0 >= 1 /* remainder and divmod */
+    const npyv_s64 vzero     = npyv_zero_s64();
+    npyv_b64 b_gt_zero      = npyv_cmpgt_s64(vscalar, vzero);
+#endif
+#if 0 == 2 /* divmod */
+    npyv_b64 warn          = npyv_cvt_b64_s64(npyv_zero_s64());
+    const npyv_s64 vmin     = npyv_setall_s64(NPY_MIN_INT64);
+    const npyv_s64 vneg_one = npyv_setall_s64(-1);
+    npyv_b64 bneg_one      = npyv_cmpeq_s64(vscalar, vneg_one);
+    npyv_lanetype_s64 *dst2 = (npyv_lanetype_s64 *) args[3];
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep,
+         dst2 += vstep) {
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
+#endif
+        npyv_s64 a = npyv_load_s64(src1);  /* next vstep dividends */
+#if 0 <= 1 /* fmod and remainder */
+        npyv_s64 rem       = vsx4_mod_scalar_s64(a, divisor);
+#else /* divmod */
+        npyv_s64 quo       = vsx4_div_scalar_s64(a, divisor);
+        npyv_s64 rem       = npyv_sub_s64(a, vec_mul(vscalar, quo));
+        // (a == NPY_MIN_INT64 && b == -1)
+        npyv_b64 amin     = npyv_cmpeq_s64(a, vmin);
+        npyv_b64 overflow = npyv_and_s64(bneg_one, amin);
+                        warn = npyv_or_s64(overflow, warn);
+#endif
+#if 0 >= 1 /* remainder and divmod */
+        // handle mixed case the way Python does
+        // ((a > 0) == (b > 0) || rem == 0)
+        npyv_b64 a_gt_zero  = npyv_cmpgt_s64(a, vzero);
+        npyv_b64 ab_eq_cond = npyv_cmpeq_s64(a_gt_zero, b_gt_zero);
+        npyv_b64 rem_zero   = npyv_cmpeq_s64(rem, vzero);
+        npyv_b64 or         = npyv_or_s64(ab_eq_cond, rem_zero);
+        npyv_s64 to_add      = npyv_select_s64(or, vzero, vscalar);
+                           rem = npyv_add_s64(rem, to_add);
+#endif
+#if 0 == 2 /* divmod */
+        npyv_s64 to_sub = npyv_select_s64(or, vzero, vneg_one);
+        quo               = npyv_add_s64(quo, to_sub);
+        // Overflow: set quo to minimum and rem to 0
+        quo               = npyv_select_s64(overflow, vmin, quo);
+        rem               = npyv_select_s64(overflow, vzero, rem);
+        npyv_store_s64(dst1, quo);
+        npyv_store_s64(dst2, rem);
+#else /* fmod and remainder */
+        npyv_store_s64(dst1, rem);
+#endif
+    }
+
+#if 0 == 2 /* divmod */
+    if (!vec_all_eq(warn, vzero)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    for (; len > 0; --len, ++src1, ++dst1, ++dst2) {
+        const npyv_lanetype_s64 a = *src1;
+        if (NPY_UNLIKELY(a == NPY_MIN_INT64 && scalar == -1)) {
+            npy_set_floatstatus_overflow();
+            *dst1 = NPY_MIN_INT64;
+            *dst2 = 0;
+        }
+        else {
+            *dst1 = a / scalar;
+            *dst2 = a % scalar;
+            if (!((a > 0) == (scalar > 0) || *dst2 == 0)) {
+                *dst1 -= 1;
+                *dst2 += scalar;
+            }
+        }
+    }
+#else /* fmod and remainder */
+    for (; len > 0; --len, ++src1, ++dst1) {  /* leftover elements (fewer than vstep), one at a time */
+        const npyv_lanetype_s64 a = *src1;
+        *dst1 = a % scalar;
+#if 0 == 1 /* remainder */
+        if (!((a > 0) == (scalar > 0) || *dst1 == 0)) {
+            *dst1 += scalar;
+        }
+#endif
+    }
+#endif
+    npyv_cleanup();
+}
+
+#line 295
+static inline void
+vsx4_simd_remainder_contig_s64(char **args, npy_intp len)
+{   /*
+     * remainder instantiation (template id 1) for int64 lanes,
+     * array-by-array: dst1[i] receives src1[i] % src2[i], then the divisor
+     * is added back when that remainder is non-zero and (a > 0) differs
+     * from (b > 0).
+     *
+     * args[0] - dividends
+     * args[1] - divisors
+     * args[2] - output
+     * len     - number of elements; all three pointers advance one element
+     *           per processed element
+     *
+     * With id 1 the `#if 1 == 2` (divmod) regions are compiled out, so
+     * there is no quotient output and no npy_set_floatstatus_overflow call.
+     * The vector loop raises the divide-by-zero status whenever any lane of
+     * `b` equals zero.
+     *
+     * NOTE(review): the scalar tail stores 0 when
+     * DIVIDEBYZERO_OVERFLOW_CHECK fires, whereas the vector loop stores
+     * whatever vsx4_mod_s64 plus the sign fix-up produced for such lanes -
+     * confirm the two paths agree for b == 0 and for (NPY_MIN_INT64, -1).
+     */
+    npyv_lanetype_s64 *src1 = (npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 *src2 = (npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_s64 *dst1 = (npyv_lanetype_s64 *) args[2];
+    const npyv_s64 vzero    = npyv_zero_s64();
+    const int vstep           = npyv_nlanes_s64;  /* elements consumed per vector iteration */
+#if 1 == 2 /* divmod */
+    npyv_lanetype_s64 *dst2 = (npyv_lanetype_s64 *) args[3];
+    const npyv_s64 vneg_one = npyv_setall_s64(-1);
+    const npyv_s64 vmin     = npyv_setall_s64(NPY_MIN_INT64);
+    npyv_b64 warn_zero     = npyv_cvt_b64_s64(npyv_zero_s64());
+    npyv_b64 warn_overflow = npyv_cvt_b64_s64(npyv_zero_s64());
+
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep, dst2 += vstep) {
+#else /* fmod and remainder */
+    for (; len >= vstep; len -= vstep, src1 += vstep, src2 += vstep,
+         dst1 += vstep) {
+#endif
+        npyv_s64 a = npyv_load_s64(src1);
+        npyv_s64 b = npyv_load_s64(src2);
+#if 1 <= 1 /* fmod and remainder */
+        npyv_s64 rem       = vsx4_mod_s64(a, b);
+#else /* divmod */
+        npyv_s64 quo       = vsx4_div_s64(a, b);
+        npyv_s64 rem       = npyv_sub_s64(a, vec_mul(b, quo));
+        // (b == 0 || (a == NPY_MIN_INT64 && b == -1))
+        npyv_b64 bzero    = npyv_cmpeq_s64(b, vzero);
+        npyv_b64 amin     = npyv_cmpeq_s64(a, vmin);
+        npyv_b64 bneg_one = npyv_cmpeq_s64(b, vneg_one);
+        npyv_b64 overflow = npyv_and_s64(bneg_one, amin);
+                warn_zero = npyv_or_s64(bzero, warn_zero);
+               warn_overflow = npyv_or_s64(overflow, warn_overflow);
+#endif
+#if 1 >= 1 /* remainder and divmod */
+        // handle mixed case the way Python does
+        // ((a > 0) == (b > 0) || rem == 0)
+        npyv_b64 a_gt_zero  = npyv_cmpgt_s64(a, vzero);
+        npyv_b64 b_gt_zero  = npyv_cmpgt_s64(b, vzero);
+        npyv_b64 ab_eq_cond = npyv_cmpeq_s64(a_gt_zero, b_gt_zero);
+        npyv_b64 rem_zero   = npyv_cmpeq_s64(rem, vzero);
+        npyv_b64 or         = npyv_or_s64(ab_eq_cond, rem_zero);
+        npyv_s64 to_add      = npyv_select_s64(or, vzero, b);  /* 0 where no fix-up is needed, else b */
+                           rem = npyv_add_s64(rem, to_add);
+#endif
+#if 1 == 2 /* divmod */
+        npyv_s64 to_sub = npyv_select_s64(or, vzero, vneg_one);
+                      quo = npyv_add_s64(quo, to_sub);
+                      // Divide by zero
+                      quo = npyv_select_s64(bzero, vzero, quo);
+                      rem = npyv_select_s64(bzero, vzero, rem);
+                      // Overflow
+                      quo = npyv_select_s64(overflow, vmin, quo);
+                      rem = npyv_select_s64(overflow, vzero, rem);
+        npyv_store_s64(dst1, quo);
+        npyv_store_s64(dst2, rem);
+#else /* fmod and remainder */
+        npyv_store_s64(dst1, rem);
+        if (NPY_UNLIKELY(vec_any_eq(b, vzero))) {  /* at least one divisor lane is zero */
+            npy_set_floatstatus_divbyzero();
+        }
+#endif
+    }
+
+#if 1 == 2 /* divmod */
+    if (!vec_all_eq(warn_zero, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+    if (!vec_all_eq(warn_overflow, vzero)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    for (; len > 0; --len, ++src1, ++src2, ++dst1, ++dst2) {
+        const npyv_lanetype_s64 a = *src1;
+        const npyv_lanetype_s64 b = *src2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT64, NPY_TRUE)) {
+            if (b == 0) {
+                npy_set_floatstatus_divbyzero();
+                *dst1 = 0;
+                *dst2 = 0;
+            }
+            else {
+                npy_set_floatstatus_overflow();
+                *dst1 = NPY_MIN_INT64;
+                *dst2 = 0;
+            }
+        }
+        else {
+            *dst1 = a / b;
+            *dst2 = a % b;
+            if (!((a > 0) == (b > 0) || *dst2 == 0)) {
+                *dst1 -= 1;
+                *dst2 += b;
+            }
+        }
+    }
+#else /* fmod and remainder */
+    for (; len > 0; --len, ++src1, ++src2, ++dst1) {  /* leftover elements (fewer than vstep), one at a time */
+        const npyv_lanetype_s64 a = *src1;
+        const npyv_lanetype_s64 b = *src2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT64, NPY_TRUE)) {
+            FLAG_IF_DIVIDEBYZERO(b);
+            *dst1 = 0;
+        } else{
+            *dst1 = a % b;
+#if 1 == 1 /* remainder */
+            if (!((a > 0) == (b > 0) || *dst1 == 0)) {
+                *dst1 += b;
+            }
+#endif
+        }
+    }
+#endif
+    npyv_cleanup();
+}
+
+/*
+ * Python-style remainder of a contiguous s64 array by one scalar divisor.
+ *
+ *   args[0] - `len` contiguous dividends
+ *   args[1] - the divisor, read once as a scalar
+ *   args[2] - `len` contiguous output remainders
+ *
+ * After the truncated remainder is computed, elements for which
+ * ((a > 0) == (b > 0) || rem == 0) is false get the divisor added, which is
+ * the "mixed case" handling Python uses.
+ *
+ * Nothing in this function tests for a zero divisor; the dispatching loops
+ * visible in this file only take the by-scalar path when the divisor is
+ * non-zero.
+ */
+static inline void
+vsx4_simd_remainder_by_scalar_contig_s64(char **args, npy_intp len)
+{
+    npyv_lanetype_s64 *src1  = (npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 scalar = *(npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_s64 *dst1  = (npyv_lanetype_s64 *) args[2];
+
+    /*
+     * Every value modulo -1 is 0, but NPY_MIN_INT64 % -1 is undefined
+     * behaviour in C because the matching quotient is not representable.
+     * Store the mathematically correct 0 for the whole array up front so
+     * neither the vector body nor the scalar tail ever evaluates it.  No
+     * status flag is raised, matching the array-by-array remainder tail loop
+     * in this file, which also stores 0 and only flags a zero divisor.
+     */
+    if (NPY_UNLIKELY(scalar == -1)) {
+        for (; len > 0; --len, ++dst1) {
+            *dst1 = 0;
+        }
+        npyv_cleanup();
+        return;
+    }
+
+    const npyv_s64 vscalar = npyv_setall_s64(scalar);
+    const npyv_s64 divisor = vsx4_divisor_s64(vscalar);
+    const int vstep        = npyv_nlanes_s64;
+    const npyv_s64 vzero   = npyv_zero_s64();
+    npyv_b64 b_gt_zero     = npyv_cmpgt_s64(vscalar, vzero);
+
+    // vector body: vstep elements per iteration
+    for (; len >= vstep; len -= vstep, src1 += vstep, dst1 += vstep) {
+        npyv_s64 a   = npyv_load_s64(src1);
+        npyv_s64 rem = vsx4_mod_scalar_s64(a, divisor);
+        // handle mixed case the way Python does:
+        // rem is kept when ((a > 0) == (b > 0) || rem == 0), else b is added
+        npyv_b64 a_gt_zero  = npyv_cmpgt_s64(a, vzero);
+        npyv_b64 ab_eq_cond = npyv_cmpeq_s64(a_gt_zero, b_gt_zero);
+        npyv_b64 rem_zero   = npyv_cmpeq_s64(rem, vzero);
+        npyv_b64 keep_rem   = npyv_or_s64(ab_eq_cond, rem_zero);
+        npyv_s64 to_add     = npyv_select_s64(keep_rem, vzero, vscalar);
+        rem = npyv_add_s64(rem, to_add);
+        npyv_store_s64(dst1, rem);
+    }
+
+    // scalar tail: fewer than vstep elements remain
+    for (; len > 0; --len, ++src1, ++dst1) {
+        const npyv_lanetype_s64 a = *src1;
+        *dst1 = a % scalar;
+        if (!((a > 0) == (scalar > 0) || *dst1 == 0)) {
+            *dst1 += scalar;
+        }
+    }
+    npyv_cleanup();
+}
+
+#line 295
+/*
+ * divmod over two contiguous s64 arrays, producing a quotient array and a
+ * remainder array, both adjusted for operands of mixed sign "the way Python
+ * does".
+ *
+ *   args[0] / args[1] - dividends / divisors
+ *   args[2] / args[3] - quotients / remainders
+ *
+ * Elements whose divisor is 0 store quotient 0 and remainder 0 and raise the
+ * divide-by-zero status.  NPY_MIN_INT64 divided by -1 stores NPY_MIN_INT64
+ * with remainder 0 and raises the overflow status.
+ */
+static inline void
+vsx4_simd_divmod_contig_s64(char **args, npy_intp len)
+{
+    npyv_lanetype_s64 *lhs     = (npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 *rhs     = (npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_s64 *quo_out = (npyv_lanetype_s64 *) args[2];
+    const npyv_s64 vzero       = npyv_zero_s64();
+    const int vstep            = npyv_nlanes_s64;
+    npyv_lanetype_s64 *rem_out = (npyv_lanetype_s64 *) args[3];
+    const npyv_s64 vneg_one    = npyv_setall_s64(-1);
+    const npyv_s64 vmin        = npyv_setall_s64(NPY_MIN_INT64);
+    // masks accumulated over all vector iterations; checked once afterwards
+    npyv_b64 saw_zero_div      = npyv_cvt_b64_s64(npyv_zero_s64());
+    npyv_b64 saw_overflow      = npyv_cvt_b64_s64(npyv_zero_s64());
+
+    for (; len >= vstep; len -= vstep, lhs += vstep, rhs += vstep,
+         quo_out += vstep, rem_out += vstep) {
+        npyv_s64 va   = npyv_load_s64(lhs);
+        npyv_s64 vb   = npyv_load_s64(rhs);
+        npyv_s64 vquo = vsx4_div_s64(va, vb);
+        npyv_s64 vrem = npyv_sub_s64(va, vec_mul(vb, vquo));
+        // (b == 0 || (a == NPY_MIN_INT64 && b == -1))
+        npyv_b64 is_zero_div = npyv_cmpeq_s64(vb, vzero);
+        npyv_b64 a_is_min    = npyv_cmpeq_s64(va, vmin);
+        npyv_b64 b_is_neg1   = npyv_cmpeq_s64(vb, vneg_one);
+        npyv_b64 is_overflow = npyv_and_s64(b_is_neg1, a_is_min);
+        saw_zero_div = npyv_or_s64(is_zero_div, saw_zero_div);
+        saw_overflow = npyv_or_s64(is_overflow, saw_overflow);
+
+        // handle mixed case the way Python does; no adjustment when
+        // ((a > 0) == (b > 0) || rem == 0)
+        npyv_b64 a_pos       = npyv_cmpgt_s64(va, vzero);
+        npyv_b64 b_pos       = npyv_cmpgt_s64(vb, vzero);
+        npyv_b64 same_side   = npyv_cmpeq_s64(a_pos, b_pos);
+        npyv_b64 rem_is_zero = npyv_cmpeq_s64(vrem, vzero);
+        npyv_b64 no_fixup    = npyv_or_s64(same_side, rem_is_zero);
+        // adjusted lanes: rem += b, quo += -1
+        vrem = npyv_add_s64(vrem, npyv_select_s64(no_fixup, vzero, vb));
+        vquo = npyv_add_s64(vquo, npyv_select_s64(no_fixup, vzero, vneg_one));
+
+        // Divide by zero
+        vquo = npyv_select_s64(is_zero_div, vzero, vquo);
+        vrem = npyv_select_s64(is_zero_div, vzero, vrem);
+        // Overflow
+        vquo = npyv_select_s64(is_overflow, vmin, vquo);
+        vrem = npyv_select_s64(is_overflow, vzero, vrem);
+
+        npyv_store_s64(quo_out, vquo);
+        npyv_store_s64(rem_out, vrem);
+    }
+
+    if (!vec_all_eq(saw_zero_div, vzero)) {
+        npy_set_floatstatus_divbyzero();
+    }
+    if (!vec_all_eq(saw_overflow, vzero)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    // scalar tail: fewer than vstep elements remain
+    for (; len > 0; --len, ++lhs, ++rhs, ++quo_out, ++rem_out) {
+        const npyv_lanetype_s64 a = *lhs;
+        const npyv_lanetype_s64 b = *rhs;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(a, b, NPY_MIN_INT64, NPY_TRUE)) {
+            if (b == 0) {
+                npy_set_floatstatus_divbyzero();
+                *quo_out = 0;
+                *rem_out = 0;
+            }
+            else {
+                npy_set_floatstatus_overflow();
+                *quo_out = NPY_MIN_INT64;
+                *rem_out = 0;
+            }
+        }
+        else {
+            *quo_out = a / b;
+            *rem_out = a % b;
+            // same test as !((a > 0) == (b > 0) || rem == 0)
+            if ((a > 0) != (b > 0) && *rem_out != 0) {
+                *quo_out -= 1;
+                *rem_out += b;
+            }
+        }
+    }
+    npyv_cleanup();
+}
+
+/*
+ * divmod of a contiguous s64 array by one scalar divisor, producing a
+ * quotient array and a remainder array, both adjusted for operands of mixed
+ * sign "the way Python does".
+ *
+ *   args[0] - dividends
+ *   args[1] - the divisor, read once as a scalar
+ *   args[2] / args[3] - quotients / remainders
+ *
+ * NPY_MIN_INT64 divided by -1 stores NPY_MIN_INT64 with remainder 0 and
+ * raises the overflow status.  Nothing here tests for a zero divisor.
+ */
+static inline void
+vsx4_simd_divmod_by_scalar_contig_s64(char **args, npy_intp len)
+{
+    npyv_lanetype_s64 *lhs     = (npyv_lanetype_s64 *) args[0];
+    npyv_lanetype_s64 scalar   = *(npyv_lanetype_s64 *) args[1];
+    npyv_lanetype_s64 *quo_out = (npyv_lanetype_s64 *) args[2];
+    const npyv_s64 vscalar     = npyv_setall_s64(scalar);
+    const npyv_s64 divisor     = vsx4_divisor_s64(vscalar);
+    const int vstep            = npyv_nlanes_s64;
+    const npyv_s64 vzero       = npyv_zero_s64();
+    npyv_b64 b_pos             = npyv_cmpgt_s64(vscalar, vzero);
+    // mask accumulated over all vector iterations; checked once afterwards
+    npyv_b64 saw_overflow      = npyv_cvt_b64_s64(npyv_zero_s64());
+    const npyv_s64 vmin        = npyv_setall_s64(NPY_MIN_INT64);
+    const npyv_s64 vneg_one    = npyv_setall_s64(-1);
+    npyv_b64 b_is_neg1         = npyv_cmpeq_s64(vscalar, vneg_one);
+    npyv_lanetype_s64 *rem_out = (npyv_lanetype_s64 *) args[3];
+
+    for (; len >= vstep; len -= vstep, lhs += vstep, quo_out += vstep,
+         rem_out += vstep) {
+        npyv_s64 va   = npyv_load_s64(lhs);
+        npyv_s64 vquo = vsx4_div_scalar_s64(va, divisor);
+        npyv_s64 vrem = npyv_sub_s64(va, vec_mul(vscalar, vquo));
+        // (a == NPY_MIN_INT64 && b == -1)
+        npyv_b64 a_is_min    = npyv_cmpeq_s64(va, vmin);
+        npyv_b64 is_overflow = npyv_and_s64(b_is_neg1, a_is_min);
+        saw_overflow = npyv_or_s64(is_overflow, saw_overflow);
+
+        // handle mixed case the way Python does; no adjustment when
+        // ((a > 0) == (b > 0) || rem == 0)
+        npyv_b64 a_pos       = npyv_cmpgt_s64(va, vzero);
+        npyv_b64 same_side   = npyv_cmpeq_s64(a_pos, b_pos);
+        npyv_b64 rem_is_zero = npyv_cmpeq_s64(vrem, vzero);
+        npyv_b64 no_fixup    = npyv_or_s64(same_side, rem_is_zero);
+        // adjusted lanes: rem += b, quo += -1
+        vrem = npyv_add_s64(vrem, npyv_select_s64(no_fixup, vzero, vscalar));
+        vquo = npyv_add_s64(vquo, npyv_select_s64(no_fixup, vzero, vneg_one));
+
+        // Overflow: set quo to minimum and rem to 0
+        vquo = npyv_select_s64(is_overflow, vmin, vquo);
+        vrem = npyv_select_s64(is_overflow, vzero, vrem);
+
+        npyv_store_s64(quo_out, vquo);
+        npyv_store_s64(rem_out, vrem);
+    }
+
+    if (!vec_all_eq(saw_overflow, vzero)) {
+        npy_set_floatstatus_overflow();
+    }
+
+    // scalar tail: fewer than vstep elements remain
+    for (; len > 0; --len, ++lhs, ++quo_out, ++rem_out) {
+        const npyv_lanetype_s64 a = *lhs;
+        if (NPY_UNLIKELY(a == NPY_MIN_INT64 && scalar == -1)) {
+            npy_set_floatstatus_overflow();
+            *quo_out = NPY_MIN_INT64;
+            *rem_out = 0;
+        }
+        else {
+            *quo_out = a / scalar;
+            *rem_out = a % scalar;
+            // same test as !((a > 0) == (scalar > 0) || rem == 0)
+            if ((a > 0) != (scalar > 0) && *rem_out != 0) {
+                *quo_out -= 1;
+                *rem_out += scalar;
+            }
+        }
+    }
+    npyv_cleanup();
+}
+
+
+#endif // NPY_SIMD && defined(NPY_HAVE_VSX4)
+
+/*****************************************************************************
+ ** Defining ufunc inner functions
+ *****************************************************************************/
+
+#line 524
+/*
+ * Pick the intrinsic-name suffix that TO_SIMD_SFX(X) appends for the UBYTE
+ * loops that follow, keyed on NPY_BITSOF_BYTE.  The inner `#if 0` in every
+ * width branch is false, so the unsigned `_uN` form is the one defined.
+ * When NPY_BITSOF_BYTE is none of 8/16/32/64 the macro stays undefined and
+ * the `defined(TO_SIMD_SFX)` guards in the loops compile the SIMD paths out.
+ */
+#undef TO_SIMD_SFX
+#if 0
+#line 529
+#elif NPY_BITSOF_BYTE == 8
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 529
+#elif NPY_BITSOF_BYTE == 16
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 529
+#elif NPY_BITSOF_BYTE == 32
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 529
+#elif NPY_BITSOF_BYTE == 64
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif
+
+/*
+ * UBYTE_fmod inner loop (unsigned instantiation).
+ *
+ * With the VSX4 SIMD kernels compiled in, blockable inputs are handed to
+ * them: array-by-array first, then array-by-scalar provided the scalar
+ * divisor is non-zero.  Everything else runs the element loop, which stores
+ * 0 (after FLAG_IF_DIVIDEBYZERO) when the divisor check fires and the C `%`
+ * result otherwise.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_fmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_ubyte), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* contiguous block of memory, divisor is a scalar and not 0 */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ubyte), NPY_SIMD_WIDTH) &&
+        (*(npy_ubyte *)args[1]) != 0) {
+        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    BINARY_LOOP {
+        const npy_ubyte lhs = *(npy_ubyte *)ip1;
+        const npy_ubyte rhs = *(npy_ubyte *)ip2;
+        npy_ubyte *out = (npy_ubyte *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(lhs, rhs, 0, NPY_FALSE)) {
+            FLAG_IF_DIVIDEBYZERO(rhs);
+            *out = 0;
+        }
+        else {
+            *out = lhs % rhs;
+        }
+    }
+}
+
+/*
+ * UBYTE_remainder inner loop (unsigned instantiation).
+ *
+ * With the VSX4 SIMD kernels compiled in, blockable inputs are handed to
+ * them: array-by-array first, then array-by-scalar provided the scalar
+ * divisor is non-zero.  Everything else runs the element loop, which stores
+ * 0 (after FLAG_IF_DIVIDEBYZERO) when the divisor check fires and the C `%`
+ * result otherwise; no sign adjustment is compiled in for this type.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_remainder)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_ubyte), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
+        return;
+    }
+    /* contiguous block of memory, divisor is a scalar and not 0 */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ubyte), NPY_SIMD_WIDTH) &&
+        (*(npy_ubyte *)args[1]) != 0) {
+        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    BINARY_LOOP {
+        const npy_ubyte lhs = *(npy_ubyte *)ip1;
+        const npy_ubyte rhs = *(npy_ubyte *)ip2;
+        npy_ubyte *out = (npy_ubyte *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(lhs, rhs, 0, NPY_FALSE)) {
+            FLAG_IF_DIVIDEBYZERO(rhs);
+            *out = 0;
+        }
+        else {
+            *out = lhs % rhs;
+        }
+    }
+}
+
+/*
+ * UBYTE_divmod inner loop (unsigned instantiation), two outputs.
+ *
+ * With the VSX4 SIMD kernels compiled in, blockable inputs are handed to
+ * them: array-by-array first, then array-by-scalar provided the scalar
+ * divisor is non-zero.  Everything else runs the element loop, which raises
+ * the divide-by-zero status and stores 0 in both outputs when the divisor
+ * check fires, and stores the C `/` and `%` results otherwise.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_divmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_ubyte), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* contiguous block of memory, divisor is a scalar and not 0 */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ubyte), NPY_SIMD_WIDTH) &&
+        (*(npy_ubyte *)args[1]) != 0) {
+        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    BINARY_LOOP_TWO_OUT {
+        const npy_ubyte lhs = *(npy_ubyte *)ip1;
+        const npy_ubyte rhs = *(npy_ubyte *)ip2;
+        npy_ubyte *quo_out = (npy_ubyte *)op1;
+        npy_ubyte *rem_out = (npy_ubyte *)op2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(lhs, rhs, 0, NPY_FALSE)) {
+            npy_set_floatstatus_divbyzero();
+            *quo_out = 0;
+            *rem_out = 0;
+        }
+        else {
+            *quo_out = lhs / rhs;
+            *rem_out = lhs % rhs;
+        }
+    }
+}
+
+#line 524
+/*
+ * Pick the intrinsic-name suffix that TO_SIMD_SFX(X) appends for the USHORT
+ * loops that follow, keyed on NPY_BITSOF_SHORT.  The inner `#if 0` in every
+ * width branch is false, so the unsigned `_uN` form is the one defined.
+ * When NPY_BITSOF_SHORT is none of 8/16/32/64 the macro stays undefined and
+ * the `defined(TO_SIMD_SFX)` guards in the loops compile the SIMD paths out.
+ */
+#undef TO_SIMD_SFX
+#if 0
+#line 529
+#elif NPY_BITSOF_SHORT == 8
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 529
+#elif NPY_BITSOF_SHORT == 16
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 529
+#elif NPY_BITSOF_SHORT == 32
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 529
+#elif NPY_BITSOF_SHORT == 64
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif
+
+/*
+ * USHORT_fmod inner loop (unsigned instantiation).
+ *
+ * With the VSX4 SIMD kernels compiled in, blockable inputs are handed to
+ * them: array-by-array first, then array-by-scalar provided the scalar
+ * divisor is non-zero.  Everything else runs the element loop, which stores
+ * 0 (after FLAG_IF_DIVIDEBYZERO) when the divisor check fires and the C `%`
+ * result otherwise.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_fmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_ushort), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* contiguous block of memory, divisor is a scalar and not 0 */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ushort), NPY_SIMD_WIDTH) &&
+        (*(npy_ushort *)args[1]) != 0) {
+        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    BINARY_LOOP {
+        const npy_ushort lhs = *(npy_ushort *)ip1;
+        const npy_ushort rhs = *(npy_ushort *)ip2;
+        npy_ushort *out = (npy_ushort *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(lhs, rhs, 0, NPY_FALSE)) {
+            FLAG_IF_DIVIDEBYZERO(rhs);
+            *out = 0;
+        }
+        else {
+            *out = lhs % rhs;
+        }
+    }
+}
+
+/*
+ * USHORT_remainder inner loop (unsigned instantiation).
+ *
+ * With the VSX4 SIMD kernels compiled in, blockable inputs are handed to
+ * them: array-by-array first, then array-by-scalar provided the scalar
+ * divisor is non-zero.  Everything else runs the element loop, which stores
+ * 0 (after FLAG_IF_DIVIDEBYZERO) when the divisor check fires and the C `%`
+ * result otherwise; no sign adjustment is compiled in for this type.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_remainder)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_ushort), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
+        return;
+    }
+    /* contiguous block of memory, divisor is a scalar and not 0 */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ushort), NPY_SIMD_WIDTH) &&
+        (*(npy_ushort *)args[1]) != 0) {
+        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    BINARY_LOOP {
+        const npy_ushort lhs = *(npy_ushort *)ip1;
+        const npy_ushort rhs = *(npy_ushort *)ip2;
+        npy_ushort *out = (npy_ushort *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(lhs, rhs, 0, NPY_FALSE)) {
+            FLAG_IF_DIVIDEBYZERO(rhs);
+            *out = 0;
+        }
+        else {
+            *out = lhs % rhs;
+        }
+    }
+}
+
+/*
+ * USHORT_divmod inner loop (unsigned instantiation), two outputs.
+ *
+ * With the VSX4 SIMD kernels compiled in, blockable inputs are handed to
+ * them: array-by-array first, then array-by-scalar provided the scalar
+ * divisor is non-zero.  Everything else runs the element loop, which raises
+ * the divide-by-zero status and stores 0 in both outputs when the divisor
+ * check fires, and stores the C `/` and `%` results otherwise.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_divmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_ushort), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* contiguous block of memory, divisor is a scalar and not 0 */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ushort), NPY_SIMD_WIDTH) &&
+        (*(npy_ushort *)args[1]) != 0) {
+        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    BINARY_LOOP_TWO_OUT {
+        const npy_ushort lhs = *(npy_ushort *)ip1;
+        const npy_ushort rhs = *(npy_ushort *)ip2;
+        npy_ushort *quo_out = (npy_ushort *)op1;
+        npy_ushort *rem_out = (npy_ushort *)op2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(lhs, rhs, 0, NPY_FALSE)) {
+            npy_set_floatstatus_divbyzero();
+            *quo_out = 0;
+            *rem_out = 0;
+        }
+        else {
+            *quo_out = lhs / rhs;
+            *rem_out = lhs % rhs;
+        }
+    }
+}
+
+#line 524
+/*
+ * Pick the intrinsic-name suffix that TO_SIMD_SFX(X) appends for the UINT
+ * loops that follow, keyed on NPY_BITSOF_INT.  The inner `#if 0` in every
+ * width branch is false, so the unsigned `_uN` form is the one defined.
+ * When NPY_BITSOF_INT is none of 8/16/32/64 the macro stays undefined and
+ * the `defined(TO_SIMD_SFX)` guards in the loops compile the SIMD paths out.
+ */
+#undef TO_SIMD_SFX
+#if 0
+#line 529
+#elif NPY_BITSOF_INT == 8
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 529
+#elif NPY_BITSOF_INT == 16
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 529
+#elif NPY_BITSOF_INT == 32
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 529
+#elif NPY_BITSOF_INT == 64
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif
+
+/*
+ * UINT_fmod inner loop (unsigned instantiation).
+ *
+ * With the VSX4 SIMD kernels compiled in, blockable inputs are handed to
+ * them: array-by-array first, then array-by-scalar provided the scalar
+ * divisor is non-zero.  Everything else runs the element loop, which stores
+ * 0 (after FLAG_IF_DIVIDEBYZERO) when the divisor check fires and the C `%`
+ * result otherwise.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_fmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_uint), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* contiguous block of memory, divisor is a scalar and not 0 */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_uint), NPY_SIMD_WIDTH) &&
+        (*(npy_uint *)args[1]) != 0) {
+        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    BINARY_LOOP {
+        const npy_uint lhs = *(npy_uint *)ip1;
+        const npy_uint rhs = *(npy_uint *)ip2;
+        npy_uint *out = (npy_uint *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(lhs, rhs, 0, NPY_FALSE)) {
+            FLAG_IF_DIVIDEBYZERO(rhs);
+            *out = 0;
+        }
+        else {
+            *out = lhs % rhs;
+        }
+    }
+}
+
+/*
+ * UINT_remainder inner loop (unsigned instantiation).
+ *
+ * With the VSX4 SIMD kernels compiled in, blockable inputs are handed to
+ * them: array-by-array first, then array-by-scalar provided the scalar
+ * divisor is non-zero.  Everything else runs the element loop, which stores
+ * 0 (after FLAG_IF_DIVIDEBYZERO) when the divisor check fires and the C `%`
+ * result otherwise; no sign adjustment is compiled in for this type.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_remainder)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_uint), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
+        return;
+    }
+    /* contiguous block of memory, divisor is a scalar and not 0 */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_uint), NPY_SIMD_WIDTH) &&
+        (*(npy_uint *)args[1]) != 0) {
+        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    BINARY_LOOP {
+        const npy_uint lhs = *(npy_uint *)ip1;
+        const npy_uint rhs = *(npy_uint *)ip2;
+        npy_uint *out = (npy_uint *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(lhs, rhs, 0, NPY_FALSE)) {
+            FLAG_IF_DIVIDEBYZERO(rhs);
+            *out = 0;
+        }
+        else {
+            *out = lhs % rhs;
+        }
+    }
+}
+
+/*
+ * UINT_divmod inner loop (unsigned instantiation), two outputs.
+ *
+ * With the VSX4 SIMD kernels compiled in, blockable inputs are handed to
+ * them: array-by-array first, then array-by-scalar provided the scalar
+ * divisor is non-zero.  Everything else runs the element loop, which raises
+ * the divide-by-zero status and stores 0 in both outputs when the divisor
+ * check fires, and stores the C `/` and `%` results otherwise.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_divmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_uint), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* contiguous block of memory, divisor is a scalar and not 0 */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_uint), NPY_SIMD_WIDTH) &&
+        (*(npy_uint *)args[1]) != 0) {
+        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    BINARY_LOOP_TWO_OUT {
+        const npy_uint lhs = *(npy_uint *)ip1;
+        const npy_uint rhs = *(npy_uint *)ip2;
+        npy_uint *quo_out = (npy_uint *)op1;
+        npy_uint *rem_out = (npy_uint *)op2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(lhs, rhs, 0, NPY_FALSE)) {
+            npy_set_floatstatus_divbyzero();
+            *quo_out = 0;
+            *rem_out = 0;
+        }
+        else {
+            *quo_out = lhs / rhs;
+            *rem_out = lhs % rhs;
+        }
+    }
+}
+
+#line 524
+/*
+ * Pick the intrinsic-name suffix that TO_SIMD_SFX(X) appends for the ULONG
+ * loops that follow, keyed on NPY_BITSOF_LONG.  The inner `#if 0` in every
+ * width branch is false, so the unsigned `_uN` form is the one defined.
+ * When NPY_BITSOF_LONG is none of 8/16/32/64 the macro stays undefined and
+ * the `defined(TO_SIMD_SFX)` guards in the loops compile the SIMD paths out.
+ */
+#undef TO_SIMD_SFX
+#if 0
+#line 529
+#elif NPY_BITSOF_LONG == 8
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 529
+#elif NPY_BITSOF_LONG == 16
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 529
+#elif NPY_BITSOF_LONG == 32
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 529
+#elif NPY_BITSOF_LONG == 64
+    #if 0
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif
+
+/*
+ * ULONG_fmod inner loop (unsigned instantiation).
+ *
+ * With the VSX4 SIMD kernels compiled in, blockable inputs are handed to
+ * them: array-by-array first, then array-by-scalar provided the scalar
+ * divisor is non-zero.  Everything else runs the element loop, which stores
+ * 0 (after FLAG_IF_DIVIDEBYZERO) when the divisor check fires and the C `%`
+ * result otherwise.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_fmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_ulong), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* contiguous block of memory, divisor is a scalar and not 0 */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulong), NPY_SIMD_WIDTH) &&
+        (*(npy_ulong *)args[1]) != 0) {
+        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    BINARY_LOOP {
+        const npy_ulong lhs = *(npy_ulong *)ip1;
+        const npy_ulong rhs = *(npy_ulong *)ip2;
+        npy_ulong *out = (npy_ulong *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(lhs, rhs, 0, NPY_FALSE)) {
+            FLAG_IF_DIVIDEBYZERO(rhs);
+            *out = 0;
+        }
+        else {
+            *out = lhs % rhs;
+        }
+    }
+}
+
+/*
+ * ULONG_remainder inner loop (unsigned instantiation).
+ *
+ * With the VSX4 SIMD kernels compiled in, blockable inputs are handed to
+ * them: array-by-array first, then array-by-scalar provided the scalar
+ * divisor is non-zero.  Everything else runs the element loop, which stores
+ * 0 (after FLAG_IF_DIVIDEBYZERO) when the divisor check fires and the C `%`
+ * result otherwise; no sign adjustment is compiled in for this type.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_remainder)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_ulong), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
+        return;
+    }
+    /* contiguous block of memory, divisor is a scalar and not 0 */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulong), NPY_SIMD_WIDTH) &&
+        (*(npy_ulong *)args[1]) != 0) {
+        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    BINARY_LOOP {
+        const npy_ulong lhs = *(npy_ulong *)ip1;
+        const npy_ulong rhs = *(npy_ulong *)ip2;
+        npy_ulong *out = (npy_ulong *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(lhs, rhs, 0, NPY_FALSE)) {
+            FLAG_IF_DIVIDEBYZERO(rhs);
+            *out = 0;
+        }
+        else {
+            *out = lhs % rhs;
+        }
+    }
+}
+
+/*
+ * ULONG_divmod: inner loop writing `dividend / divisor` to op1 and
+ * `dividend % divisor` to op2 for npy_ulong operands.  When
+ * DIVIDEBYZERO_OVERFLOW_CHECK fires, the divide-by-zero status is raised and
+ * both outputs are set to 0.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_divmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* SIMD path 1: both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_ulong), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* SIMD path 2: contiguous memory, divisor is a non-zero scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulong), NPY_SIMD_WIDTH) &&
+            *(npy_ulong *)args[1] != 0) {
+        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    /* generic element-by-element loop */
+    BINARY_LOOP_TWO_OUT {
+        const npy_ulong dividend = *(npy_ulong *)ip1;
+        const npy_ulong divisor = *(npy_ulong *)ip2;
+        npy_ulong *const quo_out = (npy_ulong *)op1;
+        npy_ulong *const rem_out = (npy_ulong *)op2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
+            npy_set_floatstatus_divbyzero();
+            *quo_out = 0;
+            *rem_out = 0;
+        }
+        else {
+            *quo_out = dividend / divisor;
+            *rem_out = dividend % divisor;
+        }
+    }
+}
+
+#line 524
+#undef TO_SIMD_SFX  /* remove any earlier TO_SIMD_SFX definition */
+#if 0  /* never taken; only opens the #elif ladder on NPY_BITSOF_LONGLONG */
+#line 529
+#elif NPY_BITSOF_LONGLONG == 8
+    #if 0  /* constant 0: the #else (_u8) definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 529
+#elif NPY_BITSOF_LONGLONG == 16
+    #if 0  /* constant 0: the #else (_u16) definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 529
+#elif NPY_BITSOF_LONGLONG == 32
+    #if 0  /* constant 0: the #else (_u32) definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 529
+#elif NPY_BITSOF_LONGLONG == 64
+    #if 0  /* constant 0: the #else (_u64) definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif  /* if no width matched, TO_SIMD_SFX stays undefined */
+
+/*
+ * ULONGLONG_fmod: inner loop storing `dividend % divisor` for npy_ulonglong
+ * operands.  When DIVIDEBYZERO_OVERFLOW_CHECK fires, the output is 0 and the
+ * divisor is handed to FLAG_IF_DIVIDEBYZERO.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_fmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* SIMD path 1: both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* SIMD path 2: contiguous memory, divisor is a non-zero scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulonglong), NPY_SIMD_WIDTH) &&
+            *(npy_ulonglong *)args[1] != 0) {
+        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    /* generic element-by-element loop */
+    BINARY_LOOP {
+        const npy_ulonglong dividend = *(npy_ulonglong *)ip1;
+        const npy_ulonglong divisor = *(npy_ulonglong *)ip2;
+        npy_ulonglong *const out = (npy_ulonglong *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
+            FLAG_IF_DIVIDEBYZERO(divisor);
+            *out = 0;
+        }
+        else {
+            *out = dividend % divisor;
+        }
+    }
+}
+
+/*
+ * ULONGLONG_remainder: inner loop storing `dividend % divisor` for
+ * npy_ulonglong operands (no sign adjustment is compiled in for this
+ * unsigned type).  When DIVIDEBYZERO_OVERFLOW_CHECK fires, the output is 0
+ * and the divisor is handed to FLAG_IF_DIVIDEBYZERO.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_remainder)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* SIMD path 1: both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
+        return;
+    }
+    /* SIMD path 2: contiguous memory, divisor is a non-zero scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulonglong), NPY_SIMD_WIDTH) &&
+            *(npy_ulonglong *)args[1] != 0) {
+        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    /* generic element-by-element loop */
+    BINARY_LOOP {
+        const npy_ulonglong dividend = *(npy_ulonglong *)ip1;
+        const npy_ulonglong divisor = *(npy_ulonglong *)ip2;
+        npy_ulonglong *const out = (npy_ulonglong *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
+            FLAG_IF_DIVIDEBYZERO(divisor);
+            *out = 0;
+        }
+        else {
+            *out = dividend % divisor;
+        }
+    }
+}
+
+/*
+ * ULONGLONG_divmod: inner loop writing `dividend / divisor` to op1 and
+ * `dividend % divisor` to op2 for npy_ulonglong operands.  When
+ * DIVIDEBYZERO_OVERFLOW_CHECK fires, the divide-by-zero status is raised and
+ * both outputs are set to 0.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_divmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* SIMD path 1: both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_ulonglong), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* SIMD path 2: contiguous memory, divisor is a non-zero scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_ulonglong), NPY_SIMD_WIDTH) &&
+            *(npy_ulonglong *)args[1] != 0) {
+        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    /* generic element-by-element loop */
+    BINARY_LOOP_TWO_OUT {
+        const npy_ulonglong dividend = *(npy_ulonglong *)ip1;
+        const npy_ulonglong divisor = *(npy_ulonglong *)ip2;
+        npy_ulonglong *const quo_out = (npy_ulonglong *)op1;
+        npy_ulonglong *const rem_out = (npy_ulonglong *)op2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, 0, NPY_FALSE)) {
+            npy_set_floatstatus_divbyzero();
+            *quo_out = 0;
+            *rem_out = 0;
+        }
+        else {
+            *quo_out = dividend / divisor;
+            *rem_out = dividend % divisor;
+        }
+    }
+}
+
+#line 524
+#undef TO_SIMD_SFX  /* remove any earlier TO_SIMD_SFX definition */
+#if 0  /* never taken; only opens the #elif ladder on NPY_BITSOF_BYTE */
+#line 529
+#elif NPY_BITSOF_BYTE == 8
+    #if 1  /* constant 1: the _s8 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 529
+#elif NPY_BITSOF_BYTE == 16
+    #if 1  /* constant 1: the _s16 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 529
+#elif NPY_BITSOF_BYTE == 32
+    #if 1  /* constant 1: the _s32 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 529
+#elif NPY_BITSOF_BYTE == 64
+    #if 1  /* constant 1: the _s64 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif  /* if no width matched, TO_SIMD_SFX stays undefined */
+
+/*
+ * BYTE_fmod: inner loop storing `dividend % divisor` (plain C `%`, no sign
+ * adjustment) for npy_byte operands.  When DIVIDEBYZERO_OVERFLOW_CHECK
+ * (invoked with NPY_MIN_BYTE, NPY_TRUE) fires, the output is 0 and the
+ * divisor is handed to FLAG_IF_DIVIDEBYZERO.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_fmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* SIMD path 1: both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_byte), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* SIMD path 2: contiguous memory, divisor is a non-zero scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_byte), NPY_SIMD_WIDTH) &&
+            *(npy_byte *)args[1] != 0) {
+        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    /* generic element-by-element loop */
+    BINARY_LOOP {
+        const npy_byte dividend = *(npy_byte *)ip1;
+        const npy_byte divisor = *(npy_byte *)ip2;
+        npy_byte *const out = (npy_byte *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_BYTE, NPY_TRUE)) {
+            FLAG_IF_DIVIDEBYZERO(divisor);
+            *out = 0;
+        }
+        else {
+            *out = dividend % divisor;
+        }
+    }
+}
+
+/*
+ * BYTE_remainder: inner loop computing the remainder of npy_byte operands,
+ * with mixed-sign cases handled the way Python does.  When
+ * DIVIDEBYZERO_OVERFLOW_CHECK (invoked with NPY_MIN_BYTE, NPY_TRUE) fires,
+ * the output is 0 and the divisor is handed to FLAG_IF_DIVIDEBYZERO.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_remainder)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* SIMD path 1: both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_byte), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
+        return;
+    }
+    /* SIMD path 2: contiguous memory, divisor is a non-zero scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_byte), NPY_SIMD_WIDTH) &&
+            *(npy_byte *)args[1] != 0) {
+        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    /* generic element-by-element loop */
+    BINARY_LOOP {
+        const npy_byte dividend = *(npy_byte *)ip1;
+        const npy_byte divisor = *(npy_byte *)ip2;
+        npy_byte *const out = (npy_byte *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_BYTE, NPY_TRUE)) {
+            FLAG_IF_DIVIDEBYZERO(divisor);
+            *out = 0;
+        }
+        else {
+            npy_byte rem = dividend % divisor;
+            /* mixed signs with a non-zero remainder: adjust as Python does */
+            if (rem != 0 && (dividend > 0) != (divisor > 0)) {
+                rem += divisor;
+            }
+            *out = rem;
+        }
+    }
+}
+
+/*
+ * BYTE_divmod: inner loop writing the quotient to op1 and the remainder to
+ * op2 for npy_byte operands, with mixed-sign cases handled the way Python
+ * does.  When DIVIDEBYZERO_OVERFLOW_CHECK fires: a zero divisor raises the
+ * divide-by-zero status and stores (0, 0); otherwise the overflow status is
+ * raised and (NPY_MIN_BYTE, 0) is stored.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_divmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* SIMD path 1: both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_byte), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* SIMD path 2: contiguous memory, divisor is a non-zero scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_byte), NPY_SIMD_WIDTH) &&
+            *(npy_byte *)args[1] != 0) {
+        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    /* generic element-by-element loop */
+    BINARY_LOOP_TWO_OUT {
+        const npy_byte dividend = *(npy_byte *)ip1;
+        const npy_byte divisor = *(npy_byte *)ip2;
+        npy_byte *const quo_out = (npy_byte *)op1;
+        npy_byte *const rem_out = (npy_byte *)op2;
+        /* see FIXME note for divide above */
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_BYTE, NPY_TRUE)) {
+            if (divisor != 0) {
+                /* check fired with a non-zero divisor: overflow case */
+                npy_set_floatstatus_overflow();
+                *quo_out = NPY_MIN_BYTE;
+            }
+            else {
+                npy_set_floatstatus_divbyzero();
+                *quo_out = 0;
+            }
+            /* the remainder is 0 in both special cases */
+            *rem_out = 0;
+        }
+        else {
+            npy_byte quo = dividend / divisor;
+            npy_byte rem = dividend % divisor;
+            /* mixed signs with a non-zero remainder: adjust as Python does */
+            if (rem != 0 && (dividend > 0) != (divisor > 0)) {
+                quo -= 1;
+                rem += divisor;
+            }
+            *quo_out = quo;
+            *rem_out = rem;
+        }
+    }
+}
+
+#line 524
+#undef TO_SIMD_SFX  /* remove any earlier TO_SIMD_SFX definition */
+#if 0  /* never taken; only opens the #elif ladder on NPY_BITSOF_SHORT */
+#line 529
+#elif NPY_BITSOF_SHORT == 8
+    #if 1  /* constant 1: the _s8 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 529
+#elif NPY_BITSOF_SHORT == 16
+    #if 1  /* constant 1: the _s16 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 529
+#elif NPY_BITSOF_SHORT == 32
+    #if 1  /* constant 1: the _s32 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 529
+#elif NPY_BITSOF_SHORT == 64
+    #if 1  /* constant 1: the _s64 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif  /* if no width matched, TO_SIMD_SFX stays undefined */
+
+/*
+ * SHORT_fmod: inner loop storing `dividend % divisor` (plain C `%`, no sign
+ * adjustment) for npy_short operands.  When DIVIDEBYZERO_OVERFLOW_CHECK
+ * (invoked with NPY_MIN_SHORT, NPY_TRUE) fires, the output is 0 and the
+ * divisor is handed to FLAG_IF_DIVIDEBYZERO.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_fmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* SIMD path 1: both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_short), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* SIMD path 2: contiguous memory, divisor is a non-zero scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_short), NPY_SIMD_WIDTH) &&
+            *(npy_short *)args[1] != 0) {
+        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    /* generic element-by-element loop */
+    BINARY_LOOP {
+        const npy_short dividend = *(npy_short *)ip1;
+        const npy_short divisor = *(npy_short *)ip2;
+        npy_short *const out = (npy_short *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_SHORT, NPY_TRUE)) {
+            FLAG_IF_DIVIDEBYZERO(divisor);
+            *out = 0;
+        }
+        else {
+            *out = dividend % divisor;
+        }
+    }
+}
+
+/*
+ * SHORT_remainder: inner loop computing the remainder of npy_short operands,
+ * with mixed-sign cases handled the way Python does.  When
+ * DIVIDEBYZERO_OVERFLOW_CHECK (invoked with NPY_MIN_SHORT, NPY_TRUE) fires,
+ * the output is 0 and the divisor is handed to FLAG_IF_DIVIDEBYZERO.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_remainder)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* SIMD path 1: both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_short), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
+        return;
+    }
+    /* SIMD path 2: contiguous memory, divisor is a non-zero scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_short), NPY_SIMD_WIDTH) &&
+            *(npy_short *)args[1] != 0) {
+        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    /* generic element-by-element loop */
+    BINARY_LOOP {
+        const npy_short dividend = *(npy_short *)ip1;
+        const npy_short divisor = *(npy_short *)ip2;
+        npy_short *const out = (npy_short *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_SHORT, NPY_TRUE)) {
+            FLAG_IF_DIVIDEBYZERO(divisor);
+            *out = 0;
+        }
+        else {
+            npy_short rem = dividend % divisor;
+            /* mixed signs with a non-zero remainder: adjust as Python does */
+            if (rem != 0 && (dividend > 0) != (divisor > 0)) {
+                rem += divisor;
+            }
+            *out = rem;
+        }
+    }
+}
+
+/*
+ * SHORT_divmod: inner loop writing the quotient to op1 and the remainder to
+ * op2 for npy_short operands, with mixed-sign cases handled the way Python
+ * does.  When DIVIDEBYZERO_OVERFLOW_CHECK fires: a zero divisor raises the
+ * divide-by-zero status and stores (0, 0); otherwise the overflow status is
+ * raised and (NPY_MIN_SHORT, 0) is stored.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_divmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* SIMD path 1: both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_short), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* SIMD path 2: contiguous memory, divisor is a non-zero scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_short), NPY_SIMD_WIDTH) &&
+            *(npy_short *)args[1] != 0) {
+        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    /* generic element-by-element loop */
+    BINARY_LOOP_TWO_OUT {
+        const npy_short dividend = *(npy_short *)ip1;
+        const npy_short divisor = *(npy_short *)ip2;
+        npy_short *const quo_out = (npy_short *)op1;
+        npy_short *const rem_out = (npy_short *)op2;
+        /* see FIXME note for divide above */
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_SHORT, NPY_TRUE)) {
+            if (divisor != 0) {
+                /* check fired with a non-zero divisor: overflow case */
+                npy_set_floatstatus_overflow();
+                *quo_out = NPY_MIN_SHORT;
+            }
+            else {
+                npy_set_floatstatus_divbyzero();
+                *quo_out = 0;
+            }
+            /* the remainder is 0 in both special cases */
+            *rem_out = 0;
+        }
+        else {
+            npy_short quo = dividend / divisor;
+            npy_short rem = dividend % divisor;
+            /* mixed signs with a non-zero remainder: adjust as Python does */
+            if (rem != 0 && (dividend > 0) != (divisor > 0)) {
+                quo -= 1;
+                rem += divisor;
+            }
+            *quo_out = quo;
+            *rem_out = rem;
+        }
+    }
+}
+
+#line 524
+#undef TO_SIMD_SFX  /* remove any earlier TO_SIMD_SFX definition */
+#if 0  /* never taken; only opens the #elif ladder on NPY_BITSOF_INT */
+#line 529
+#elif NPY_BITSOF_INT == 8
+    #if 1  /* constant 1: the _s8 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 529
+#elif NPY_BITSOF_INT == 16
+    #if 1  /* constant 1: the _s16 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 529
+#elif NPY_BITSOF_INT == 32
+    #if 1  /* constant 1: the _s32 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 529
+#elif NPY_BITSOF_INT == 64
+    #if 1  /* constant 1: the _s64 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif  /* if no width matched, TO_SIMD_SFX stays undefined */
+
+/*
+ * INT_fmod: inner loop storing `dividend % divisor` (plain C `%`, no sign
+ * adjustment) for npy_int operands.  When DIVIDEBYZERO_OVERFLOW_CHECK
+ * (invoked with NPY_MIN_INT, NPY_TRUE) fires, the output is 0 and the
+ * divisor is handed to FLAG_IF_DIVIDEBYZERO.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_fmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* SIMD path 1: both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_int), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* SIMD path 2: contiguous memory, divisor is a non-zero scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_int), NPY_SIMD_WIDTH) &&
+            *(npy_int *)args[1] != 0) {
+        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    /* generic element-by-element loop */
+    BINARY_LOOP {
+        const npy_int dividend = *(npy_int *)ip1;
+        const npy_int divisor = *(npy_int *)ip2;
+        npy_int *const out = (npy_int *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_INT, NPY_TRUE)) {
+            FLAG_IF_DIVIDEBYZERO(divisor);
+            *out = 0;
+        }
+        else {
+            *out = dividend % divisor;
+        }
+    }
+}
+
+/*
+ * INT_remainder: inner loop computing the remainder of npy_int operands,
+ * with mixed-sign cases handled the way Python does.  When
+ * DIVIDEBYZERO_OVERFLOW_CHECK (invoked with NPY_MIN_INT, NPY_TRUE) fires,
+ * the output is 0 and the divisor is handed to FLAG_IF_DIVIDEBYZERO.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_remainder)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* SIMD path 1: both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_int), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
+        return;
+    }
+    /* SIMD path 2: contiguous memory, divisor is a non-zero scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_int), NPY_SIMD_WIDTH) &&
+            *(npy_int *)args[1] != 0) {
+        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    /* generic element-by-element loop */
+    BINARY_LOOP {
+        const npy_int dividend = *(npy_int *)ip1;
+        const npy_int divisor = *(npy_int *)ip2;
+        npy_int *const out = (npy_int *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_INT, NPY_TRUE)) {
+            FLAG_IF_DIVIDEBYZERO(divisor);
+            *out = 0;
+        }
+        else {
+            npy_int rem = dividend % divisor;
+            /* mixed signs with a non-zero remainder: adjust as Python does */
+            if (rem != 0 && (dividend > 0) != (divisor > 0)) {
+                rem += divisor;
+            }
+            *out = rem;
+        }
+    }
+}
+
+/*
+ * INT_divmod: inner loop writing the quotient to op1 and the remainder to
+ * op2 for npy_int operands, with mixed-sign cases handled the way Python
+ * does.  When DIVIDEBYZERO_OVERFLOW_CHECK fires: a zero divisor raises the
+ * divide-by-zero status and stores (0, 0); otherwise the overflow status is
+ * raised and (NPY_MIN_INT, 0) is stored.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_divmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* SIMD path 1: both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_int), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* SIMD path 2: contiguous memory, divisor is a non-zero scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_int), NPY_SIMD_WIDTH) &&
+            *(npy_int *)args[1] != 0) {
+        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    /* generic element-by-element loop */
+    BINARY_LOOP_TWO_OUT {
+        const npy_int dividend = *(npy_int *)ip1;
+        const npy_int divisor = *(npy_int *)ip2;
+        npy_int *const quo_out = (npy_int *)op1;
+        npy_int *const rem_out = (npy_int *)op2;
+        /* see FIXME note for divide above */
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_INT, NPY_TRUE)) {
+            if (divisor != 0) {
+                /* check fired with a non-zero divisor: overflow case */
+                npy_set_floatstatus_overflow();
+                *quo_out = NPY_MIN_INT;
+            }
+            else {
+                npy_set_floatstatus_divbyzero();
+                *quo_out = 0;
+            }
+            /* the remainder is 0 in both special cases */
+            *rem_out = 0;
+        }
+        else {
+            npy_int quo = dividend / divisor;
+            npy_int rem = dividend % divisor;
+            /* mixed signs with a non-zero remainder: adjust as Python does */
+            if (rem != 0 && (dividend > 0) != (divisor > 0)) {
+                quo -= 1;
+                rem += divisor;
+            }
+            *quo_out = quo;
+            *rem_out = rem;
+        }
+    }
+}
+
+#line 524
+#undef TO_SIMD_SFX  /* remove any earlier TO_SIMD_SFX definition */
+#if 0  /* never taken; only opens the #elif ladder on NPY_BITSOF_LONG */
+#line 529
+#elif NPY_BITSOF_LONG == 8
+    #if 1  /* constant 1: the _s8 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 529
+#elif NPY_BITSOF_LONG == 16
+    #if 1  /* constant 1: the _s16 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 529
+#elif NPY_BITSOF_LONG == 32
+    #if 1  /* constant 1: the _s32 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 529
+#elif NPY_BITSOF_LONG == 64
+    #if 1  /* constant 1: the _s64 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif  /* if no width matched, TO_SIMD_SFX stays undefined */
+
+/*
+ * LONG_fmod: inner loop storing `dividend % divisor` (plain C `%`, no sign
+ * adjustment) for npy_long operands.  When DIVIDEBYZERO_OVERFLOW_CHECK
+ * (invoked with NPY_MIN_LONG, NPY_TRUE) fires, the output is 0 and the
+ * divisor is handed to FLAG_IF_DIVIDEBYZERO.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_fmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* SIMD path 1: both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_long), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* SIMD path 2: contiguous memory, divisor is a non-zero scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_long), NPY_SIMD_WIDTH) &&
+            *(npy_long *)args[1] != 0) {
+        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    /* generic element-by-element loop */
+    BINARY_LOOP {
+        const npy_long dividend = *(npy_long *)ip1;
+        const npy_long divisor = *(npy_long *)ip2;
+        npy_long *const out = (npy_long *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_LONG, NPY_TRUE)) {
+            FLAG_IF_DIVIDEBYZERO(divisor);
+            *out = 0;
+        }
+        else {
+            *out = dividend % divisor;
+        }
+    }
+}
+
+/*
+ * LONG_remainder: inner loop computing the remainder of npy_long operands,
+ * with mixed-sign cases handled the way Python does.  When
+ * DIVIDEBYZERO_OVERFLOW_CHECK (invoked with NPY_MIN_LONG, NPY_TRUE) fires,
+ * the output is 0 and the divisor is handed to FLAG_IF_DIVIDEBYZERO.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_remainder)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* SIMD path 1: both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_long), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
+        return;
+    }
+    /* SIMD path 2: contiguous memory, divisor is a non-zero scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_long), NPY_SIMD_WIDTH) &&
+            *(npy_long *)args[1] != 0) {
+        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    /* generic element-by-element loop */
+    BINARY_LOOP {
+        const npy_long dividend = *(npy_long *)ip1;
+        const npy_long divisor = *(npy_long *)ip2;
+        npy_long *const out = (npy_long *)op1;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_LONG, NPY_TRUE)) {
+            FLAG_IF_DIVIDEBYZERO(divisor);
+            *out = 0;
+        }
+        else {
+            npy_long rem = dividend % divisor;
+            /* mixed signs with a non-zero remainder: adjust as Python does */
+            if (rem != 0 && (dividend > 0) != (divisor > 0)) {
+                rem += divisor;
+            }
+            *out = rem;
+        }
+    }
+}
+
+/*
+ * LONG_divmod: inner loop writing the quotient to op1 and the remainder to
+ * op2 for npy_long operands, with mixed-sign cases handled the way Python
+ * does.  When DIVIDEBYZERO_OVERFLOW_CHECK fires: a zero divisor raises the
+ * divide-by-zero status and stores (0, 0); otherwise the overflow status is
+ * raised and (NPY_MIN_LONG, 0) is stored.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_divmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* SIMD path 1: both operands are arrays of the same size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_long), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* SIMD path 2: contiguous memory, divisor is a non-zero scalar */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_long), NPY_SIMD_WIDTH) &&
+            *(npy_long *)args[1] != 0) {
+        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+    /* generic element-by-element loop */
+    BINARY_LOOP_TWO_OUT {
+        const npy_long dividend = *(npy_long *)ip1;
+        const npy_long divisor = *(npy_long *)ip2;
+        npy_long *const quo_out = (npy_long *)op1;
+        npy_long *const rem_out = (npy_long *)op2;
+        /* see FIXME note for divide above */
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(dividend, divisor, NPY_MIN_LONG, NPY_TRUE)) {
+            if (divisor != 0) {
+                /* check fired with a non-zero divisor: overflow case */
+                npy_set_floatstatus_overflow();
+                *quo_out = NPY_MIN_LONG;
+            }
+            else {
+                npy_set_floatstatus_divbyzero();
+                *quo_out = 0;
+            }
+            /* the remainder is 0 in both special cases */
+            *rem_out = 0;
+        }
+        else {
+            npy_long quo = dividend / divisor;
+            npy_long rem = dividend % divisor;
+            /* mixed signs with a non-zero remainder: adjust as Python does */
+            if (rem != 0 && (dividend > 0) != (divisor > 0)) {
+                quo -= 1;
+                rem += divisor;
+            }
+            *quo_out = quo;
+            *rem_out = rem;
+        }
+    }
+}
+
+#line 524
+#undef TO_SIMD_SFX  /* remove any earlier TO_SIMD_SFX definition */
+#if 0  /* never taken; only opens the #elif ladder on NPY_BITSOF_LONGLONG */
+#line 529
+#elif NPY_BITSOF_LONGLONG == 8
+    #if 1  /* constant 1: the _s8 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s8
+    #else
+        #define TO_SIMD_SFX(X) X##_u8
+    #endif
+
+#line 529
+#elif NPY_BITSOF_LONGLONG == 16
+    #if 1  /* constant 1: the _s16 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s16
+    #else
+        #define TO_SIMD_SFX(X) X##_u16
+    #endif
+
+#line 529
+#elif NPY_BITSOF_LONGLONG == 32
+    #if 1  /* constant 1: the _s32 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s32
+    #else
+        #define TO_SIMD_SFX(X) X##_u32
+    #endif
+
+#line 529
+#elif NPY_BITSOF_LONGLONG == 64
+    #if 1  /* constant 1: the _s64 definition is the one used */
+        #define TO_SIMD_SFX(X) X##_s64
+    #else
+        #define TO_SIMD_SFX(X) X##_u64
+    #endif
+
+#endif  /* if no width matched, TO_SIMD_SFX stays undefined */
+
+/*
+ * Inner loop computing the C remainder (`in1 % in2`) for npy_longlong
+ * operands, storing one npy_longlong result per element.
+ *
+ * On VSX4 builds with SIMD enabled, contiguous inputs are handed to the
+ * vsx4_simd_fmod_* kernels with `args` and the element count dimensions[0];
+ * everything else runs the scalar BINARY_LOOP below. `func` is unused.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_fmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    // both arguments are arrays of the same size
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_longlong), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_fmod_contig)(args, dimensions[0]);
+        return;
+    }
+    // for contiguous block of memory, divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_longlong), NPY_SIMD_WIDTH) &&
+             (*(npy_longlong *)args[1]) != 0) {
+        TO_SIMD_SFX(vsx4_simd_fmod_by_scalar_contig)(args, dimensions[0]);
+        return ;
+    }
+#endif
+    BINARY_LOOP {
+        const npy_longlong in1 = *(npy_longlong *)ip1;
+        const npy_longlong in2 = *(npy_longlong *)ip2;
+        /*
+         * The preprocessor picks one of two `if` headers; this expansion
+         * uses the first, which passes NPY_MIN_LONGLONG and NPY_TRUE to the
+         * check macro (presumably the signed variant -- the macro itself is
+         * defined outside this view).
+         */
+#if 1
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(in1, in2, NPY_MIN_LONGLONG, NPY_TRUE)) {
+#else
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(in1, in2, 0, NPY_FALSE)) {
+#endif
+            /* flagged operands: never evaluate `%`, store 0 instead */
+            FLAG_IF_DIVIDEBYZERO(in2);
+            *((npy_longlong *)op1) = 0;
+        } else{
+            *((npy_longlong *)op1)= in1 % in2;
+        }
+    }
+}
+
+/*
+ * Inner loop computing the Python-style remainder for npy_longlong operands:
+ * a non-zero result is adjusted so it takes the sign of the divisor.
+ *
+ * On VSX4 builds with SIMD enabled, contiguous inputs are handed to the
+ * vsx4_simd_remainder_* kernels with `args` and the element count
+ * dimensions[0]; everything else runs the scalar BINARY_LOOP below.
+ * `func` is unused.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_remainder)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    // both arguments are arrays of the same size
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_longlong), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_remainder_contig)(args, dimensions[0]);
+        return;
+    }
+    // for contiguous block of memory, divisor is a scalar and not 0
+    else if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_longlong), NPY_SIMD_WIDTH) &&
+             (*(npy_longlong *)args[1]) != 0) {
+        TO_SIMD_SFX(vsx4_simd_remainder_by_scalar_contig)(args, dimensions[0]);
+        return ;
+    }
+#endif
+    BINARY_LOOP {
+        const npy_longlong in1 = *(npy_longlong *)ip1;
+        const npy_longlong in2 = *(npy_longlong *)ip2;
+        /* the preprocessor picks one of two `if` headers; the first is active */
+#if 1
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(in1, in2, NPY_MIN_LONGLONG, NPY_TRUE)) {
+#else
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(in1, in2, 0, NPY_FALSE)) {
+#endif
+            /* flagged operands: never evaluate `%`, store 0 instead */
+            FLAG_IF_DIVIDEBYZERO(in2);
+            *((npy_longlong *)op1) = 0;
+        } else{
+#if 1
+            /* handle mixed case the way Python does */
+            const npy_longlong rem = in1 % in2;
+            /*
+             * C's `%` gives a remainder with the sign of the dividend. When
+             * the `> 0` tests of the operands disagree and rem is non-zero,
+             * adding the divisor moves it to the divisor's sign.
+             */
+            if ((in1 > 0) == (in2 > 0) || rem == 0) {
+                *((npy_longlong *)op1) = rem;
+            }
+            else {
+                *((npy_longlong *)op1) = rem + in2;
+            }
+#else
+            *((npy_longlong *)op1)= in1 % in2;
+#endif
+        }
+    }
+}
+
+/*
+ * Inner loop producing two npy_longlong outputs per element: the quotient
+ * (first output) and the remainder (second output), both adjusted to the
+ * Python convention when the operands' `> 0` tests disagree.
+ *
+ * On VSX4 builds with SIMD enabled, contiguous inputs are handed to the
+ * vsx4_simd_divmod_* kernels; everything else runs the scalar loop.
+ * `func` is unused.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_divmod)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+#if defined(NPY_HAVE_VSX4) && NPY_SIMD && defined(TO_SIMD_SFX)
+    /* two arrays of identical size */
+    if (IS_BLOCKABLE_BINARY(sizeof(npy_longlong), NPY_SIMD_WIDTH)) {
+        TO_SIMD_SFX(vsx4_simd_divmod_contig)(args, dimensions[0]);
+        return;
+    }
+    /* contiguous dividend with a scalar, non-zero divisor */
+    if (IS_BLOCKABLE_BINARY_SCALAR2(sizeof(npy_longlong), NPY_SIMD_WIDTH) &&
+             (*(npy_longlong *)args[1]) != 0) {
+        TO_SIMD_SFX(vsx4_simd_divmod_by_scalar_contig)(args, dimensions[0]);
+        return;
+    }
+#endif
+#if 1
+    BINARY_LOOP_TWO_OUT {
+        const npy_longlong num = *(npy_longlong *)ip1;
+        const npy_longlong den = *(npy_longlong *)ip2;
+        npy_longlong *const quo_out = (npy_longlong *)op1;
+        npy_longlong *const rem_out = (npy_longlong *)op2;
+        /* see FIXME note for divide above */
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(num, den, NPY_MIN_LONGLONG, NPY_TRUE)) {
+            if (den != 0) {
+                /* flagged with a non-zero divisor: reported as overflow */
+                npy_set_floatstatus_overflow();
+                *quo_out = NPY_MIN_LONGLONG;
+            }
+            else {
+                npy_set_floatstatus_divbyzero();
+                *quo_out = 0;
+            }
+            *rem_out = 0;
+        }
+        else {
+            /* C truncates toward zero; shift to the Python result if needed */
+            const npy_longlong q = num / den;
+            const npy_longlong r = num % den;
+            const int needs_fixup = ((num > 0) != (den > 0)) && r != 0;
+            *quo_out = needs_fixup ? q - 1 : q;
+            *rem_out = needs_fixup ? r + den : r;
+        }
+    }
+#else
+    BINARY_LOOP_TWO_OUT {
+        const npy_longlong num = *(npy_longlong *)ip1;
+        const npy_longlong den = *(npy_longlong *)ip2;
+        npy_longlong *const quo_out = (npy_longlong *)op1;
+        npy_longlong *const rem_out = (npy_longlong *)op2;
+        if (DIVIDEBYZERO_OVERFLOW_CHECK(num, den, 0, NPY_FALSE)) {
+            npy_set_floatstatus_divbyzero();
+            *quo_out = 0;
+            *rem_out = 0;
+        }
+        else {
+            *quo_out = num / den;
+            *rem_out = num % den;
+        }
+    }
+#endif
+}
+
+
diff --git a/numpy/core/src/_generated/loops_trigonometric.dispatch.c b/numpy/core/src/_generated/loops_trigonometric.dispatch.c
new file mode 100644
index 000000000000..a2f7c91142d4
--- /dev/null
+++ b/numpy/core/src/_generated/loops_trigonometric.dispatch.c
@@ -0,0 +1,607 @@
+#line 1 "numpy/core/src/umath/loops_trigonometric.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*@targets
+ ** $maxopt baseline
+ ** (avx2 fma3) avx512f
+ ** vsx2 vsx3 vsx4
+ ** neon_vfpv4
+ ** vxe vxe2
+ **/
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "fast_loop_macros.h"
+/*
+ * TODO:
+ * - use vectorized version of Payne-Hanek style reduction for large elements or
+ *   when there's no native FUSED support instead of fallback to libc
+ */
+#if NPY_SIMD_FMA3  // native support
+#line 24
+#if NPY_SIMD_F64 && 0
+/*
+ * Vectorized Cody-Waite range reduction technique
+ * Performs the reduction step x* = x - y*C in three steps:
+ * 1) x* = x - y*c1
+ * 2) x* = x - y*c2
+ * 3) x* = x - y*c3
+ * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
+ */
+NPY_FINLINE npyv_f64
+simd_range_reduction_f64(npyv_f64 x, npyv_f64 y, npyv_f64 c1, npyv_f64 c2, npyv_f64 c3)
+{
+    /* three chained multiply-adds: each one folds y*ck into the running value */
+    const npyv_f64 after_c1 = npyv_muladd_f64(y, c1, x);
+    const npyv_f64 after_c2 = npyv_muladd_f64(y, c2, after_c1);
+    return npyv_muladd_f64(y, c3, after_c2);
+}
+#endif
+
+#line 24
+#if NPY_SIMD_F32 && 1
+/*
+ * Vectorized Cody-Waite range reduction technique
+ * Performs the reduction step x* = x - y*C in three steps:
+ * 1) x* = x - y*c1
+ * 2) x* = x - y*c2
+ * 3) x* = x - y*c3
+ * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
+ */
+NPY_FINLINE npyv_f32
+simd_range_reduction_f32(npyv_f32 x, npyv_f32 y, npyv_f32 c1, npyv_f32 c2, npyv_f32 c3)
+{
+    /* three chained multiply-adds: each one folds y*ck into the running value */
+    const npyv_f32 after_c1 = npyv_muladd_f32(y, c1, x);
+    const npyv_f32 after_c2 = npyv_muladd_f32(y, c2, after_c1);
+    return npyv_muladd_f32(y, c3, after_c2);
+}
+#endif
+
+/* Disable SIMD code and revert to libm: see
+ * https://mail.python.org/archives/list/numpy-discussion@python.org/thread/C6EYZZSR4EWGVKHAZXLE7IBILRMNVK7L/
+ * for detailed discussion on this*/
+#if 0 // NPY_SIMD_F64
+#line 50
+/*
+ * Scalar fallback for selected lanes: every lane whose bit is set in
+ * cmp_bits (bit i <-> lane i) is replaced by npy_cos() of its current value;
+ * the other lanes are returned unchanged.
+ */
+#if defined(NPY_OS_WIN32) || defined(NPY_OS_CYGWIN)
+NPY_FINLINE npyv_f64
+#else
+NPY_NOINLINE npyv_f64
+#endif
+simd_cos_scalar_f64(npyv_f64 out, npy_uint64 cmp_bits)
+{
+    // MSVC doesn't compile with direct vector access, so we copy it here
+    // as we have no npyv_get_lane/npyv_set_lane intrinsics
+    npy_double NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) out_copy[npyv_nlanes_f64];
+    npyv_storea_f64(out_copy, out);
+
+    for (unsigned i = 0; i < npyv_nlanes_f64; ++i) {
+        // shift the 64-bit mask itself instead of an `int` literal, so the
+        // test stays in the mask's own width
+        if ((cmp_bits >> i) & 1) {
+            out_copy[i] = npy_cos(out_copy[i]);
+        }
+    }
+
+    return npyv_loada_f64(out_copy);
+}
+
+#line 50
+/*
+ * Scalar fallback for selected lanes: every lane whose bit is set in
+ * cmp_bits (bit i <-> lane i) is replaced by npy_sin() of its current value;
+ * the other lanes are returned unchanged.
+ */
+#if defined(NPY_OS_WIN32) || defined(NPY_OS_CYGWIN)
+NPY_FINLINE npyv_f64
+#else
+NPY_NOINLINE npyv_f64
+#endif
+simd_sin_scalar_f64(npyv_f64 out, npy_uint64 cmp_bits)
+{
+    // MSVC doesn't compile with direct vector access, so we copy it here
+    // as we have no npyv_get_lane/npyv_set_lane intrinsics
+    npy_double NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) out_copy[npyv_nlanes_f64];
+    npyv_storea_f64(out_copy, out);
+
+    for (unsigned i = 0; i < npyv_nlanes_f64; ++i) {
+        // shift the 64-bit mask itself instead of an `int` literal, so the
+        // test stays in the mask's own width
+        if ((cmp_bits >> i) & 1) {
+            out_copy[i] = npy_sin(out_copy[i]);
+        }
+    }
+
+    return npyv_loada_f64(out_copy);
+}
+
+
+/*
+ * Approximate sine algorithm for x \in [-pi/2, pi/2]
+ * worst-case error is 3.5 ulp.
+ * abs error: 0x1.be222a58p-53 in [-pi/2, pi/2].
+ */
+NPY_FINLINE npyv_f64
+simd_approx_sine_poly_f64(npyv_f64 r)
+{
+    const npyv_f64 rr = npyv_mul_f64(r, r);
+
+    /* Horner scheme in r*r, starting from the two smallest coefficients */
+    npyv_f64 acc = npyv_muladd_f64(npyv_setall_f64(-0x1.9f4a9c8b21dc9p-41), rr,
+                                   npyv_setall_f64(0x1.60e88a10163f2p-33));
+    acc = npyv_muladd_f64(acc, rr, npyv_setall_f64(-0x1.ae6361b7254e7p-26));
+    acc = npyv_muladd_f64(acc, rr, npyv_setall_f64(0x1.71de382e8d62bp-19));
+    acc = npyv_muladd_f64(acc, rr, npyv_setall_f64(-0x1.a01a019aeb4ffp-13));
+    acc = npyv_muladd_f64(acc, rr, npyv_setall_f64(0x1.111111110b25ep-7));
+    acc = npyv_muladd_f64(acc, rr, npyv_setall_f64(-0x1.55555555554c3p-3));
+
+    /* final step: (acc * r^2) * r + r */
+    return npyv_muladd_f64(npyv_mul_f64(acc, rr), r, r);
+}
+
+/* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
+NPY_FINLINE npyv_f64
+simd_range_reduction_pi2(npyv_f64 r, npyv_f64 n) {
+    /* the three negated constants are passed as c1, c2, c3, largest first */
+    return simd_range_reduction_f64(
+        r, n,
+        npyv_setall_f64(-0x1.921fb54442d18p+1),
+        npyv_setall_f64(-0x1.1a62633145c06p-53),
+        npyv_setall_f64(-0x1.c1cd129024e09p-106)
+    );
+}
+
+NPY_FINLINE npyv_b64 simd_sin_range_check_f64(npyv_u64 ir) {
+    /* top12 (asuint64 (0x1p-509)).  */
+    const npyv_u64 lower = npyv_setall_u64(0x202);
+    /* top12 (asuint64 (RangeVal)) - SIMD_TINY_BOUND.  */
+    const npyv_u64 width = npyv_setall_u64(0x214);
+
+    /* keep the bits above bit 51, rebase them on `lower`, compare to `width` */
+    const npyv_u64 top_bits = npyv_shri_u64(ir, 52);
+    const npyv_u64 rebased = npyv_sub_u64(top_bits, lower);
+    return npyv_cmpge_u64(rebased, width);
+}
+
+NPY_FINLINE npyv_b64 simd_cos_range_check_f64(npyv_u64 ir) {
+    /* bit pattern of 0x1p23; lanes comparing >= it (as unsigned) are flagged */
+    const npyv_u64 limit_bits = npyv_reinterpret_u64_f64(npyv_setall_f64(0x1p23));
+    return npyv_cmpge_u64(ir, limit_bits);
+}
+
+/*
+ * Polynomial cosine of r. The `ir` and `sign` parameters are accepted but
+ * not used by this function.
+ */
+NPY_FINLINE npyv_f64
+simd_cos_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign)
+{
+    const npyv_f64 magic = npyv_setall_f64(0x1.8p52);
+
+    /* n = rint((|x|+pi/2)/pi) - 0.5.  */
+    const npyv_f64 shifted_in = npyv_add_f64(r, npyv_setall_f64(0x1.921fb54442d18p+0));
+    const npyv_f64 biased = npyv_muladd_f64(
+        npyv_setall_f64(0x1.45f306dc9c883p-2), shifted_in, magic
+    );
+    /* lowest bit of the biased value, moved into the sign-bit position */
+    const npyv_u64 flip = npyv_shli_u64(npyv_reinterpret_u64_f64(biased), 63);
+    const npyv_f64 n = npyv_sub_f64(npyv_sub_f64(biased, magic), npyv_setall_f64(0.5));
+
+    /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
+    const npyv_f64 reduced = simd_range_reduction_pi2(r, n);
+
+    /* sin(r) poly approx.  */
+    const npyv_f64 approx = simd_approx_sine_poly_f64(reduced);
+
+    /* sign.  */
+    const npyv_u64 approx_bits = npyv_reinterpret_u64_f64(approx);
+    return npyv_reinterpret_f64_u64(npyv_xor_u64(approx_bits, flip));
+}
+
+/*
+ * Polynomial sine of r, with `sign` xor-ed into the result bits. The `ir`
+ * parameter is accepted but not used by this function.
+ */
+NPY_FINLINE npyv_f64
+simd_sin_poly_f64(npyv_f64 r, npyv_u64 ir, npyv_u64 sign)
+{
+    const npyv_f64 magic = npyv_setall_f64(0x1.8p52);
+
+    /* n = rint(|x|/pi).  */
+    const npyv_f64 biased = npyv_muladd_f64(
+        npyv_setall_f64(0x1.45f306dc9c883p-2), r, magic
+    );
+    /* lowest bit of the biased value, moved into the sign-bit position */
+    const npyv_u64 flip = npyv_shli_u64(npyv_reinterpret_u64_f64(biased), 63);
+    const npyv_f64 n = npyv_sub_f64(biased, magic);
+
+    /* r = |x| - n*pi  (range reduction into -pi/2 .. pi/2).  */
+    const npyv_f64 reduced = simd_range_reduction_pi2(r, n);
+
+    /* sin(r) poly approx.  */
+    const npyv_f64 approx = simd_approx_sine_poly_f64(reduced);
+
+    /* sign.  */
+    const npyv_u64 signed_bits = npyv_xor_u64(npyv_reinterpret_u64_f64(approx), sign);
+    return npyv_reinterpret_f64_u64(npyv_xor_u64(signed_bits, flip));
+}
+
+#line 170
+/*
+ * Vector cosine over `len` doubles read from src (element stride ssrc) and
+ * written to dst (element stride sdst), npyv_nlanes_f64 lanes per iteration.
+ *
+ * Lanes flagged by simd_cos_range_check_f64() are recomputed from the
+ * original input with npy_cos() through simd_cos_scalar_f64(); the other
+ * lanes keep the polynomial result of simd_cos_poly_f64().
+ */
+NPY_FINLINE void
+simd_cos_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_u64 abs_mask = npyv_setall_u64(0x7fffffffffffffff);
+    const int vstep = npyv_nlanes_f64;
+
+    npyv_f64 out = npyv_zero_f64();
+    npyv_f64 x_in;
+
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+        // contiguous vs. strided partial load, limited to the remaining `len`
+        if (ssrc == 1) {
+            x_in = npyv_load_tillz_f64(src, len);
+        } else {
+            x_in = npyv_loadn_tillz_f64(src, ssrc, len);
+        }
+
+        // ir: input bits with the sign bit cleared; r: those bits as doubles
+        npyv_u64 ir = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), abs_mask);
+        npyv_f64 r = npyv_reinterpret_f64_u64(ir);
+        // sign: only the sign bit of each input lane
+        npyv_u64 sign = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), npyv_not_u64(abs_mask));
+
+        // cmp marks the lanes that must go through the scalar fallback
+        npyv_b64 cmp = simd_cos_range_check_f64(ir);
+        /* If fenv exceptions are to be triggered correctly, set any special lanes
+        to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+        scalar loop later.  */
+        r = npyv_select_f64(cmp, npyv_setall_f64(1.0), r);
+
+        // Some in range, at least one calculation is useful
+        if (!npyv_all_b64(cmp)) {
+            out = simd_cos_poly_f64(r, ir, sign);
+        }
+
+        if (npyv_any_b64(cmp)) {
+            // put the untouched inputs back into the flagged lanes, then
+            // let the scalar helper overwrite exactly those lanes
+            out = npyv_select_f64(cmp, x_in, out);
+            out = simd_cos_scalar_f64(out, npyv_tobits_b64(cmp));
+        }
+
+        // contiguous vs. strided partial store, limited to the remaining `len`
+        if (sdst == 1) {
+            npyv_store_till_f64(dst, len, out);
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();
+}
+
+#line 170
+/*
+ * Vector sine over `len` doubles read from src (element stride ssrc) and
+ * written to dst (element stride sdst), npyv_nlanes_f64 lanes per iteration.
+ *
+ * Lanes flagged by simd_sin_range_check_f64() are recomputed from the
+ * original input with npy_sin() through simd_sin_scalar_f64(); the other
+ * lanes keep the polynomial result of simd_sin_poly_f64().
+ */
+NPY_FINLINE void
+simd_sin_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_u64 abs_mask = npyv_setall_u64(0x7fffffffffffffff);
+    const int vstep = npyv_nlanes_f64;
+
+    npyv_f64 out = npyv_zero_f64();
+    npyv_f64 x_in;
+
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+        // contiguous vs. strided partial load, limited to the remaining `len`
+        if (ssrc == 1) {
+            x_in = npyv_load_tillz_f64(src, len);
+        } else {
+            x_in = npyv_loadn_tillz_f64(src, ssrc, len);
+        }
+
+        // ir: input bits with the sign bit cleared; r: those bits as doubles
+        npyv_u64 ir = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), abs_mask);
+        npyv_f64 r = npyv_reinterpret_f64_u64(ir);
+        // sign: only the sign bit of each input lane
+        npyv_u64 sign = npyv_and_u64(npyv_reinterpret_u64_f64(x_in), npyv_not_u64(abs_mask));
+
+        // cmp marks the lanes that must go through the scalar fallback
+        npyv_b64 cmp = simd_sin_range_check_f64(ir);
+        /* If fenv exceptions are to be triggered correctly, set any special lanes
+        to 1 (which is neutral w.r.t. fenv). These lanes will be fixed by
+        scalar loop later.  */
+        r = npyv_select_f64(cmp, npyv_setall_f64(1.0), r);
+
+        // Some in range, at least one calculation is useful
+        if (!npyv_all_b64(cmp)) {
+            out = simd_sin_poly_f64(r, ir, sign);
+        }
+
+        if (npyv_any_b64(cmp)) {
+            // put the untouched inputs back into the flagged lanes, then
+            // let the scalar helper overwrite exactly those lanes
+            out = npyv_select_f64(cmp, x_in, out);
+            out = simd_sin_scalar_f64(out, npyv_tobits_b64(cmp));
+        }
+
+        // contiguous vs. strided partial store, limited to the remaining `len`
+        if (sdst == 1) {
+            npyv_store_till_f64(dst, len, out);
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();
+}
+
+#endif // NPY_SIMD_F64
+
+#if NPY_SIMD_F32
+/*
+ * Approximate cosine algorithm for x \in [-PI/4, PI/4]
+ * Maximum ULP across all 32-bit floats = 0.875
+ */
+NPY_FINLINE npyv_f32
+simd_cosine_poly_f32(npyv_f32 x2)
+{
+    /* Horner scheme in x2, from the x2^4 coefficient down to the constant 1 */
+    npyv_f32 acc = npyv_muladd_f32(npyv_setall_f32(0x1.98e616p-16f), x2,
+                                   npyv_setall_f32(-0x1.6c06dcp-10f));
+    acc = npyv_muladd_f32(acc, x2, npyv_setall_f32(0x1.55553cp-05f));
+    acc = npyv_muladd_f32(acc, x2, npyv_setall_f32(-0x1.000000p-01f));
+    return npyv_muladd_f32(acc, x2, npyv_setall_f32(0x1.000000p+00f));
+}
+/*
+ * Approximate sine algorithm for x \in [-PI/4, PI/4]
+ * Maximum ULP across all 32-bit floats = 0.647
+ * Polynomial approximation based on unpublished work by T. Myklebust
+ */
+NPY_FINLINE npyv_f32
+simd_sine_poly_f32(npyv_f32 x, npyv_f32 x2)
+{
+    /* Horner scheme in x2 over the four odd-power coefficients */
+    npyv_f32 acc = npyv_muladd_f32(npyv_setall_f32(0x1.7d3bbcp-19f), x2,
+                                   npyv_setall_f32(-0x1.a06bbap-13f));
+    acc = npyv_muladd_f32(acc, x2, npyv_setall_f32(0x1.11119ap-07f));
+    acc = npyv_muladd_f32(acc, x2, npyv_setall_f32(-0x1.555556p-03f));
+    /* one more multiply by x2 (added to zero), then acc*x + x */
+    acc = npyv_muladd_f32(acc, x2, npyv_zero_f32());
+    return npyv_muladd_f32(acc, x, x);
+}
+/*
+ * Vectorized approximate sine/cosine algorithms: The following code is a
+ * vectorized version of the algorithm presented here:
+ * https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
+ * (1) Load data in registers and generate mask for elements that are
+ * within range [-71476.0625f, 71476.0625f] for cosine and [-117435.992f,
+ * 117435.992f] for sine.
+ * (2) For elements within range, perform range reduction using Cody-Waite's
+ * method: x* = x - y*PI/2, where y = rint(x*2/PI). x* \in [-PI/4, PI/4].
+ * (3) Map cos(x) to (+/-)sine or (+/-)cosine of x* based on the quadrant k =
+ * int(y).
+ * (4) For elements outside that range, Cody-Waite reduction performs poorly
+ * leading to catastrophic cancellation. We compute cosine by calling glibc in
+ * a scalar fashion.
+ * (5) Vectorized implementation has a max ULP of 1.49 and performs at least
+ * 5-7x(x86) - 2.5-3x(Power) - 1-2x(Arm) faster than scalar implementations
+ * when magnitude of all elements in the array < 71476.0625f (117435.992f for sine).
+ * Worst case performance is when all the elements are large leading to about 1-2% reduction in
+ * performance.
+ */
+/* Selects which function simd_sincos_f32() evaluates. */
+typedef enum
+{
+    SIMD_COMPUTE_SIN,  /* compute sine */
+    SIMD_COMPUTE_COS   /* compute cosine */
+} SIMD_TRIG_OP;
+
+/*
+ * Compute sine or cosine (chosen by trig_op) of `len` floats read from src
+ * with element stride ssrc, writing the results to dst with element stride
+ * sdst, npyv_nlanes_f32 lanes per iteration.
+ *
+ * Per vector, lanes with |x| <= max_cody go through Cody-Waite reduction and
+ * the polynomial kernels; every other lane is recomputed one element at a
+ * time with npy_sinf()/npy_cosf().
+ */
+static void SIMD_MSVC_NOINLINE
+simd_sincos_f32(const float *src, npy_intp ssrc, float *dst, npy_intp sdst,
+                npy_intp len, SIMD_TRIG_OP trig_op)
+{
+    // Load up frequently used constants
+    const npyv_f32 zerosf = npyv_zero_f32();
+    const npyv_s32 ones  = npyv_setall_s32(1);
+    const npyv_s32 twos  = npyv_setall_s32(2);
+    const npyv_f32 two_over_pi = npyv_setall_f32(0x1.45f306p-1f);
+    const npyv_f32 codyw_pio2_highf = npyv_setall_f32(-0x1.921fb0p+00f);
+    const npyv_f32 codyw_pio2_medf = npyv_setall_f32(-0x1.5110b4p-22f);
+    const npyv_f32 codyw_pio2_lowf = npyv_setall_f32(-0x1.846988p-48f);
+    const npyv_f32 rint_cvt_magic = npyv_setall_f32(0x1.800000p+23f);
+    // Cody-Waite's range
+    // (sine limit by default, replaced by the smaller cosine limit below)
+    float max_codi = 117435.992f;
+    if (trig_op == SIMD_COMPUTE_COS) {
+        max_codi = 71476.0625f;
+    }
+    const npyv_f32 max_cody = npyv_setall_f32(max_codi);
+    const int vstep = npyv_nlanes_f32;
+
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+        npyv_f32 x_in;
+        // contiguous vs. strided partial load, limited to the remaining `len`
+        if (ssrc == 1) {
+            x_in = npyv_load_tillz_f32(src, len);
+        } else {
+            x_in = npyv_loadn_tillz_f32(src, ssrc, len);
+        }
+        npyv_b32 nnan_mask = npyv_notnan_f32(x_in);
+    #if NPY_SIMD_CMPSIGNAL
+        // Eliminate NaN to avoid FP invalid exception
+        x_in = npyv_and_f32(x_in, npyv_reinterpret_f32_u32(npyv_cvt_u32_b32(nnan_mask)));
+    #endif
+        // simd_mask / simd_maski: lanes whose magnitude is within max_cody
+        npyv_b32 simd_mask = npyv_cmple_f32(npyv_abs_f32(x_in), max_cody);
+        npy_uint64 simd_maski = npyv_tobits_b32(simd_mask);
+        /*
+         * For elements outside of this range, Cody-Waite's range reduction
+         * becomes inaccurate and we will call libc to compute cosine for
+         * these numbers
+         */
+        if (simd_maski != 0) {
+            // NaN and out-of-range lanes are computed on 0 instead of their input
+            npyv_f32 x = npyv_select_f32(npyv_and_b32(nnan_mask, simd_mask), x_in, zerosf);
+
+            npyv_f32 quadrant = npyv_mul_f32(x, two_over_pi);
+            // round to nearest, -0.0f -> +0.0f, and |a| must be <= 0x1.0p+22
+            quadrant = npyv_add_f32(quadrant, rint_cvt_magic);
+            quadrant = npyv_sub_f32(quadrant, rint_cvt_magic);
+
+            // Cody-Waite's range reduction algorithm
+            npyv_f32 reduced_x = simd_range_reduction_f32(
+                x, quadrant, codyw_pio2_highf, codyw_pio2_medf, codyw_pio2_lowf
+            );
+            npyv_f32 reduced_x2 = npyv_square_f32(reduced_x);
+
+            // compute cosine and sine
+            npyv_f32 cos = simd_cosine_poly_f32(reduced_x2);
+            npyv_f32 sin = simd_sine_poly_f32(reduced_x, reduced_x2);
+
+            npyv_s32 iquadrant = npyv_round_s32_f32(quadrant);
+            // cosine reuses the selection logic below with the quadrant advanced by one
+            if (trig_op == SIMD_COMPUTE_COS) {
+                iquadrant = npyv_add_s32(iquadrant, ones);
+            }
+            // blend sin and cos based on the quadrant
+            // (from here on `cos` holds the blended result, whichever op was requested)
+            npyv_b32 sine_mask = npyv_cmpeq_s32(npyv_and_s32(iquadrant, ones), npyv_zero_s32());
+            cos = npyv_select_f32(sine_mask, sin, cos);
+
+            // multiply by -1 for appropriate elements
+            npyv_b32 negate_mask = npyv_cmpeq_s32(npyv_and_s32(iquadrant, twos), twos);
+            cos = npyv_ifsub_f32(negate_mask, zerosf, cos, cos);
+            // lanes that held NaN on input produce NaN
+            cos = npyv_select_f32(nnan_mask, cos, npyv_setall_f32(NPY_NANF));
+
+            if (sdst == 1) {
+                npyv_store_till_f32(dst, len, cos);
+            } else {
+                npyv_storen_till_f32(dst, sdst, len, cos);
+            }
+        }
+        // (1 << vstep) - 1 has one bit set per lane; any other mask value
+        // means at least one lane still needs the scalar fallback
+        if (simd_maski != (npy_uint64)((1 << vstep) - 1)) {
+            float NPY_DECL_ALIGNED(NPY_SIMD_WIDTH) ip_fback[npyv_nlanes_f32];
+            npyv_storea_f32(ip_fback, x_in);
+
+            // process elements using libc for large elements
+            if (trig_op == SIMD_COMPUTE_COS) {
+                for (unsigned i = 0; i < npyv_nlanes_f32; ++i) {
+                    // lanes with their mask bit set were handled by the vector path
+                    if ((simd_maski >> i) & 1) {
+                        continue;
+                    }
+                    dst[sdst*i] = npy_cosf(ip_fback[i]);
+                }
+            }
+            else {
+                for (unsigned i = 0; i < npyv_nlanes_f32; ++i) {
+                    // lanes with their mask bit set were handled by the vector path
+                    if ((simd_maski >> i) & 1) {
+                        continue;
+                    }
+                    dst[sdst*i] = npy_sinf(ip_fback[i]);
+                }
+            }
+        }
+    }
+    npyv_cleanup();
+}
+#endif // NPY_SIMD_F32
+#endif // NPY_SIMD_FMA3
+
+#line 391
+/*
+ * Unary inner loop: double-precision cosine. The vector path is compiled
+ * out (`#if 0`), so each element goes through npy_cos(). `data` is unused.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_cos)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    /* SIMD code is disabled in favour of libm; the discussion is at
+     * https://mail.python.org/archives/list/numpy-discussion@python.org/thread/C6EYZZSR4EWGVKHAZXLE7IBILRMNVK7L/
+     */
+//#if NPY_SIMD_F64 && NPY_SIMD_FMA3
+#if 0
+    const double *in = (double*)args[0];
+    double *res = (double*)args[1];
+    const int elsize = sizeof(in[0]);
+    /* byte strides converted to element strides */
+    const npy_intp in_stride = steps[0] / elsize;
+    const npy_intp res_stride = steps[1] / elsize;
+    npy_intp count = dimensions[0];
+    assert(count <= 1 || (steps[0] % elsize == 0 && steps[1] % elsize == 0));
+
+    if (!is_mem_overlap(in, steps[0], res, steps[1], count) &&
+        npyv_loadable_stride_f64(in_stride) && npyv_storable_stride_f64(res_stride)
+    ) {
+        simd_cos_f64(in, in_stride, res, res_stride, count);
+    }
+    else {
+        /* overlap, or a stride rejected by the npyv stride checks:
+         * feed the kernel one element per call */
+        while (count > 0) {
+            simd_cos_f64(in, 1, res, 1, 1);
+            --count;
+            in += in_stride;
+            res += res_stride;
+        }
+    }
+#else
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_cos(*(npy_double *)ip1);
+    }
+#endif
+}
+
+#line 391
+/*
+ * Unary inner loop: double-precision sine. The vector path is compiled
+ * out (`#if 0`), so each element goes through npy_sin(). `data` is unused.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_sin)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+    /* SIMD code is disabled in favour of libm; the discussion is at
+     * https://mail.python.org/archives/list/numpy-discussion@python.org/thread/C6EYZZSR4EWGVKHAZXLE7IBILRMNVK7L/
+     */
+//#if NPY_SIMD_F64 && NPY_SIMD_FMA3
+#if 0
+    const double *in = (double*)args[0];
+    double *res = (double*)args[1];
+    const int elsize = sizeof(in[0]);
+    /* byte strides converted to element strides */
+    const npy_intp in_stride = steps[0] / elsize;
+    const npy_intp res_stride = steps[1] / elsize;
+    npy_intp count = dimensions[0];
+    assert(count <= 1 || (steps[0] % elsize == 0 && steps[1] % elsize == 0));
+
+    if (!is_mem_overlap(in, steps[0], res, steps[1], count) &&
+        npyv_loadable_stride_f64(in_stride) && npyv_storable_stride_f64(res_stride)
+    ) {
+        simd_sin_f64(in, in_stride, res, res_stride, count);
+    }
+    else {
+        /* overlap, or a stride rejected by the npyv stride checks:
+         * feed the kernel one element per call */
+        while (count > 0) {
+            simd_sin_f64(in, 1, res, 1, 1);
+            --count;
+            in += in_stride;
+            res += res_stride;
+        }
+    }
+#else
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_sin(*(npy_double *)ip1);
+    }
+#endif
+}
+
+
+#line 429
+/*
+ * Unary inner loop: single-precision sine. Uses simd_sincos_f32() when
+ * NPY_SIMD_F32 and NPY_SIMD_FMA3 are both enabled, npy_sinf() otherwise.
+ * `data` is unused.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_sin)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD_F32 && NPY_SIMD_FMA3
+    const npy_float *in = (npy_float*)args[0];
+    npy_float *res = (npy_float*)args[1];
+    const int elsize = sizeof(in[0]);
+    /* byte strides converted to element strides */
+    const npy_intp in_stride = steps[0] / elsize;
+    const npy_intp res_stride = steps[1] / elsize;
+    npy_intp count = dimensions[0];
+    assert(count <= 1 || (steps[0] % elsize == 0 && steps[1] % elsize == 0));
+
+    if (!is_mem_overlap(in, steps[0], res, steps[1], count) &&
+        npyv_loadable_stride_f32(in_stride) && npyv_storable_stride_f32(res_stride)
+    ) {
+        simd_sincos_f32(in, in_stride, res, res_stride, count, SIMD_COMPUTE_SIN);
+    }
+    else {
+        /* overlap, or a stride rejected by the npyv stride checks:
+         * feed the kernel one element per call */
+        while (count > 0) {
+            simd_sincos_f32(in, 1, res, 1, 1, SIMD_COMPUTE_SIN);
+            --count;
+            in += in_stride;
+            res += res_stride;
+        }
+    }
+#else
+    UNARY_LOOP {
+        *(npy_float *)op1 = npy_sinf(*(npy_float *)ip1);
+    }
+#endif
+}
+
+#line 429
+/*
+ * Unary inner loop: single-precision cosine. Uses simd_sincos_f32() when
+ * NPY_SIMD_F32 and NPY_SIMD_FMA3 are both enabled, npy_cosf() otherwise.
+ * `data` is unused.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_cos)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD_F32 && NPY_SIMD_FMA3
+    const npy_float *in = (npy_float*)args[0];
+    npy_float *res = (npy_float*)args[1];
+    const int elsize = sizeof(in[0]);
+    /* byte strides converted to element strides */
+    const npy_intp in_stride = steps[0] / elsize;
+    const npy_intp res_stride = steps[1] / elsize;
+    npy_intp count = dimensions[0];
+    assert(count <= 1 || (steps[0] % elsize == 0 && steps[1] % elsize == 0));
+
+    if (!is_mem_overlap(in, steps[0], res, steps[1], count) &&
+        npyv_loadable_stride_f32(in_stride) && npyv_storable_stride_f32(res_stride)
+    ) {
+        simd_sincos_f32(in, in_stride, res, res_stride, count, SIMD_COMPUTE_COS);
+    }
+    else {
+        /* overlap, or a stride rejected by the npyv stride checks:
+         * feed the kernel one element per call */
+        while (count > 0) {
+            simd_sincos_f32(in, 1, res, 1, 1, SIMD_COMPUTE_COS);
+            --count;
+            in += in_stride;
+            res += res_stride;
+        }
+    }
+#else
+    UNARY_LOOP {
+        *(npy_float *)op1 = npy_cosf(*(npy_float *)ip1);
+    }
+#endif
+}
+
+
diff --git a/numpy/core/src/_generated/loops_umath_fp.dispatch.c b/numpy/core/src/_generated/loops_umath_fp.dispatch.c
new file mode 100644
index 000000000000..be87d3398bc4
--- /dev/null
+++ b/numpy/core/src/_generated/loops_umath_fp.dispatch.c
@@ -0,0 +1,3223 @@
+#line 1 "numpy/core/src/umath/loops_umath_fp.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*@targets
+ ** $maxopt baseline avx512_skx
+ */
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "npy_svml.h"
+#include "fast_loop_macros.h"
+
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+#line 17
+#line 22
+static void  // f32 kernel: pushes len strided elements of src through __svml_exp2f16 into dst
+simd_exp2_f32(const npyv_lanetype_f32 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f32;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f32 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f32(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f32(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f32(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f32(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 32  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b32 nnan_mask = npyv_notnan_f32(x);
+        npyv_f32 x_exnan = npyv_select_f32(nnan_mask, x, npyv_setall_f32(0));
+        npyv_f32 out = __svml_exp2f16(x_exnan);
+        out = npyv_select_f32(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f32 out = __svml_exp2f16(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f32(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f32 kernel: pushes len strided elements of src through __svml_log2f16 into dst
+simd_log2_f32(const npyv_lanetype_f32 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f32;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f32 x;
+        #if 1  // active: loads bounded by len that take an explicit third argument (1)
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_till_f32(src, len, 1);  // NOTE(review): the 1 is presumably the fill value for lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_till_f32(src, ssrc, len, 1);
+            }
+        #else  // compiled out: "tillz" load variant
+            if (ssrc == 1) {
+                x = npyv_load_tillz_f32(src, len);
+            } else {
+                x = npyv_loadn_tillz_f32(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 32  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b32 nnan_mask = npyv_notnan_f32(x);
+        npyv_f32 x_exnan = npyv_select_f32(nnan_mask, x, npyv_setall_f32(1));
+        npyv_f32 out = __svml_log2f16(x_exnan);
+        out = npyv_select_f32(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f32 out = __svml_log2f16(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f32(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f32 kernel: pushes len strided elements of src through __svml_log10f16 into dst
+simd_log10_f32(const npyv_lanetype_f32 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f32;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f32 x;
+        #if 1  // active: loads bounded by len that take an explicit third argument (1)
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_till_f32(src, len, 1);  // NOTE(review): the 1 is presumably the fill value for lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_till_f32(src, ssrc, len, 1);
+            }
+        #else  // compiled out: "tillz" load variant
+            if (ssrc == 1) {
+                x = npyv_load_tillz_f32(src, len);
+            } else {
+                x = npyv_loadn_tillz_f32(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 32  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b32 nnan_mask = npyv_notnan_f32(x);
+        npyv_f32 x_exnan = npyv_select_f32(nnan_mask, x, npyv_setall_f32(1));
+        npyv_f32 out = __svml_log10f16(x_exnan);
+        out = npyv_select_f32(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f32 out = __svml_log10f16(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f32(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f32 kernel: pushes len strided elements of src through __svml_expm1f16 into dst
+simd_expm1_f32(const npyv_lanetype_f32 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f32;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f32 x;
+        #if 1  // active: loads bounded by len that take an explicit third argument (1)
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_till_f32(src, len, 1);  // NOTE(review): the 1 is presumably the fill value for lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_till_f32(src, ssrc, len, 1);
+            }
+        #else  // compiled out: "tillz" load variant
+            if (ssrc == 1) {
+                x = npyv_load_tillz_f32(src, len);
+            } else {
+                x = npyv_loadn_tillz_f32(src, ssrc, len);
+            }
+        #endif
+    #if 64 == 32  // false (64 != 32), so the NaN workaround below is compiled out for f32
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b32 nnan_mask = npyv_notnan_f32(x);
+        npyv_f32 x_exnan = npyv_select_f32(nnan_mask, x, npyv_setall_f32(1));
+        npyv_f32 out = __svml_expm1f16(x_exnan);
+        out = npyv_select_f32(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f32 out = __svml_expm1f16(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f32(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f32 kernel: pushes len strided elements of src through __svml_log1pf16 into dst
+simd_log1p_f32(const npyv_lanetype_f32 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f32;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f32 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f32(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f32(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f32(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f32(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 32  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b32 nnan_mask = npyv_notnan_f32(x);
+        npyv_f32 x_exnan = npyv_select_f32(nnan_mask, x, npyv_setall_f32(0));
+        npyv_f32 out = __svml_log1pf16(x_exnan);
+        out = npyv_select_f32(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f32 out = __svml_log1pf16(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f32(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f32 kernel: pushes len strided elements of src through __svml_cbrtf16 into dst
+simd_cbrt_f32(const npyv_lanetype_f32 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f32;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f32 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f32(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f32(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f32(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f32(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 32  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b32 nnan_mask = npyv_notnan_f32(x);
+        npyv_f32 x_exnan = npyv_select_f32(nnan_mask, x, npyv_setall_f32(0));
+        npyv_f32 out = __svml_cbrtf16(x_exnan);
+        out = npyv_select_f32(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f32 out = __svml_cbrtf16(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f32(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f32 kernel: pushes len strided elements of src through __svml_tanf16 into dst
+simd_tan_f32(const npyv_lanetype_f32 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f32;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f32 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f32(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f32(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f32(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f32(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 32  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b32 nnan_mask = npyv_notnan_f32(x);
+        npyv_f32 x_exnan = npyv_select_f32(nnan_mask, x, npyv_setall_f32(0));
+        npyv_f32 out = __svml_tanf16(x_exnan);
+        out = npyv_select_f32(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f32 out = __svml_tanf16(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f32(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f32 kernel: pushes len strided elements of src through __svml_asinf16 into dst
+simd_asin_f32(const npyv_lanetype_f32 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f32;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f32 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f32(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f32(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f32(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f32(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 32  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b32 nnan_mask = npyv_notnan_f32(x);
+        npyv_f32 x_exnan = npyv_select_f32(nnan_mask, x, npyv_setall_f32(0));
+        npyv_f32 out = __svml_asinf16(x_exnan);
+        out = npyv_select_f32(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f32 out = __svml_asinf16(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f32(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f32 kernel: pushes len strided elements of src through __svml_acosf16 into dst
+simd_acos_f32(const npyv_lanetype_f32 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f32;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f32 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f32(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f32(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f32(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f32(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 32  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b32 nnan_mask = npyv_notnan_f32(x);
+        npyv_f32 x_exnan = npyv_select_f32(nnan_mask, x, npyv_setall_f32(0));
+        npyv_f32 out = __svml_acosf16(x_exnan);
+        out = npyv_select_f32(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f32 out = __svml_acosf16(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f32(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f32 kernel: pushes len strided elements of src through __svml_atanf16 into dst
+simd_atan_f32(const npyv_lanetype_f32 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f32;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f32 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f32(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f32(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f32(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f32(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 32  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b32 nnan_mask = npyv_notnan_f32(x);
+        npyv_f32 x_exnan = npyv_select_f32(nnan_mask, x, npyv_setall_f32(0));
+        npyv_f32 out = __svml_atanf16(x_exnan);
+        out = npyv_select_f32(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f32 out = __svml_atanf16(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f32(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f32 kernel: pushes len strided elements of src through __svml_sinhf16 into dst
+simd_sinh_f32(const npyv_lanetype_f32 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f32;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f32 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f32(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f32(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f32(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f32(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 32  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b32 nnan_mask = npyv_notnan_f32(x);
+        npyv_f32 x_exnan = npyv_select_f32(nnan_mask, x, npyv_setall_f32(0));
+        npyv_f32 out = __svml_sinhf16(x_exnan);
+        out = npyv_select_f32(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f32 out = __svml_sinhf16(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f32(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f32 kernel: pushes len strided elements of src through __svml_coshf16 into dst
+simd_cosh_f32(const npyv_lanetype_f32 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f32;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f32 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f32(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f32(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f32(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f32(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 32  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b32 nnan_mask = npyv_notnan_f32(x);
+        npyv_f32 x_exnan = npyv_select_f32(nnan_mask, x, npyv_setall_f32(0));
+        npyv_f32 out = __svml_coshf16(x_exnan);
+        out = npyv_select_f32(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f32 out = __svml_coshf16(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f32(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f32 kernel: pushes len strided elements of src through __svml_asinhf16 into dst
+simd_asinh_f32(const npyv_lanetype_f32 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f32;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f32 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f32(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f32(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f32(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f32(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 32  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b32 nnan_mask = npyv_notnan_f32(x);
+        npyv_f32 x_exnan = npyv_select_f32(nnan_mask, x, npyv_setall_f32(0));
+        npyv_f32 out = __svml_asinhf16(x_exnan);
+        out = npyv_select_f32(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f32 out = __svml_asinhf16(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f32(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f32 kernel: pushes len strided elements of src through __svml_acoshf16 into dst
+simd_acosh_f32(const npyv_lanetype_f32 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f32;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f32 x;
+        #if 1  // active: loads bounded by len that take an explicit third argument (1)
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_till_f32(src, len, 1);  // NOTE(review): the 1 is presumably the fill value for lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_till_f32(src, ssrc, len, 1);
+            }
+        #else  // compiled out: "tillz" load variant
+            if (ssrc == 1) {
+                x = npyv_load_tillz_f32(src, len);
+            } else {
+                x = npyv_loadn_tillz_f32(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 32  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b32 nnan_mask = npyv_notnan_f32(x);
+        npyv_f32 x_exnan = npyv_select_f32(nnan_mask, x, npyv_setall_f32(1));
+        npyv_f32 out = __svml_acoshf16(x_exnan);
+        out = npyv_select_f32(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f32 out = __svml_acoshf16(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f32(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f32 kernel: pushes len strided elements of src through __svml_atanhf16 into dst
+simd_atanh_f32(const npyv_lanetype_f32 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f32;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f32 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f32(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f32(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f32(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f32(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 32  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b32 nnan_mask = npyv_notnan_f32(x);
+        npyv_f32 x_exnan = npyv_select_f32(nnan_mask, x, npyv_setall_f32(0));
+        npyv_f32 out = __svml_atanhf16(x_exnan);
+        out = npyv_select_f32(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f32 out = __svml_atanhf16(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f32(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+
+#line 17
+#line 22
+static void  // f64 kernel: pushes len strided elements of src through __svml_exp28 into dst
+simd_exp2_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f64;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f64 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f64(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f64(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f64(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f64(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 64  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b64 nnan_mask = npyv_notnan_f64(x);
+        npyv_f64 x_exnan = npyv_select_f64(nnan_mask, x, npyv_setall_f64(0));
+        npyv_f64 out = __svml_exp28(x_exnan);
+        out = npyv_select_f64(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f64 out = __svml_exp28(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f64(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f64 kernel: pushes len strided elements of src through __svml_log28 into dst
+simd_log2_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f64;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f64 x;
+        #if 1  // active: loads bounded by len that take an explicit third argument (1)
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_till_f64(src, len, 1);  // NOTE(review): the 1 is presumably the fill value for lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_till_f64(src, ssrc, len, 1);
+            }
+        #else  // compiled out: "tillz" load variant
+            if (ssrc == 1) {
+                x = npyv_load_tillz_f64(src, len);
+            } else {
+                x = npyv_loadn_tillz_f64(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 64  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b64 nnan_mask = npyv_notnan_f64(x);
+        npyv_f64 x_exnan = npyv_select_f64(nnan_mask, x, npyv_setall_f64(1));
+        npyv_f64 out = __svml_log28(x_exnan);
+        out = npyv_select_f64(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f64 out = __svml_log28(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f64(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f64 kernel: pushes len strided elements of src through __svml_log108 into dst
+simd_log10_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f64;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f64 x;
+        #if 1  // active: loads bounded by len that take an explicit third argument (1)
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_till_f64(src, len, 1);  // NOTE(review): the 1 is presumably the fill value for lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_till_f64(src, ssrc, len, 1);
+            }
+        #else  // compiled out: "tillz" load variant
+            if (ssrc == 1) {
+                x = npyv_load_tillz_f64(src, len);
+            } else {
+                x = npyv_loadn_tillz_f64(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 64  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b64 nnan_mask = npyv_notnan_f64(x);
+        npyv_f64 x_exnan = npyv_select_f64(nnan_mask, x, npyv_setall_f64(1));
+        npyv_f64 out = __svml_log108(x_exnan);
+        out = npyv_select_f64(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f64 out = __svml_log108(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f64(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f64 kernel: pushes len strided elements of src through __svml_expm18 into dst, with the NaN workaround enabled
+simd_expm1_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f64;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f64 x;
+        #if 1  // active: loads bounded by len that take an explicit third argument (1)
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_till_f64(src, len, 1);  // NOTE(review): the 1 is presumably the fill value for lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_till_f64(src, ssrc, len, 1);
+            }
+        #else  // compiled out: "tillz" load variant
+            if (ssrc == 1) {
+                x = npyv_load_tillz_f64(src, len);
+            } else {
+                x = npyv_loadn_tillz_f64(src, ssrc, len);
+            }
+        #endif
+    #if 64 == 64  // true: the NaN workaround below is the active path for this function
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b64 nnan_mask = npyv_notnan_f64(x);  // NOTE(review): presumably flags the lanes of x that are not NaN - confirm
+        npyv_f64 x_exnan = npyv_select_f64(nnan_mask, x, npyv_setall_f64(1));  // SVML input; presumably x with NaN lanes swapped for 1 - confirm select semantics
+        npyv_f64 out = __svml_expm18(x_exnan);  // SVML receives x_exnan rather than x
+        out = npyv_select_f64(nnan_mask, out, x);  // presumably restores the original NaN lanes of x in the result - confirm
+    #else  // compiled out: direct SVML call on x
+        npyv_f64 out = __svml_expm18(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f64(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f64 kernel: pushes len strided elements of src through __svml_log1p8 into dst
+simd_log1p_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f64;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f64 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f64(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f64(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f64(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f64(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 64  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b64 nnan_mask = npyv_notnan_f64(x);
+        npyv_f64 x_exnan = npyv_select_f64(nnan_mask, x, npyv_setall_f64(0));
+        npyv_f64 out = __svml_log1p8(x_exnan);
+        out = npyv_select_f64(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f64 out = __svml_log1p8(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f64(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f64 kernel: pushes len strided elements of src through __svml_cbrt8 into dst
+simd_cbrt_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f64;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f64 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f64(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f64(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f64(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f64(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 64  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b64 nnan_mask = npyv_notnan_f64(x);
+        npyv_f64 x_exnan = npyv_select_f64(nnan_mask, x, npyv_setall_f64(0));
+        npyv_f64 out = __svml_cbrt8(x_exnan);
+        out = npyv_select_f64(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f64 out = __svml_cbrt8(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f64(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f64 kernel: pushes len strided elements of src through __svml_tan8 into dst
+simd_tan_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f64;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f64 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f64(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f64(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f64(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f64(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 64  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b64 nnan_mask = npyv_notnan_f64(x);
+        npyv_f64 x_exnan = npyv_select_f64(nnan_mask, x, npyv_setall_f64(0));
+        npyv_f64 out = __svml_tan8(x_exnan);
+        out = npyv_select_f64(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f64 out = __svml_tan8(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f64(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f64 kernel: pushes len strided elements of src through __svml_asin8 into dst
+simd_asin_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f64;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f64 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f64(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f64(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f64(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f64(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 64  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b64 nnan_mask = npyv_notnan_f64(x);
+        npyv_f64 x_exnan = npyv_select_f64(nnan_mask, x, npyv_setall_f64(0));
+        npyv_f64 out = __svml_asin8(x_exnan);
+        out = npyv_select_f64(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f64 out = __svml_asin8(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f64(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f64 kernel: pushes len strided elements of src through __svml_acos8 into dst
+simd_acos_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f64;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f64 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f64(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f64(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f64(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f64(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 64  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b64 nnan_mask = npyv_notnan_f64(x);
+        npyv_f64 x_exnan = npyv_select_f64(nnan_mask, x, npyv_setall_f64(0));
+        npyv_f64 out = __svml_acos8(x_exnan);
+        out = npyv_select_f64(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f64 out = __svml_acos8(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f64(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+static void  // f64 kernel: pushes len strided elements of src through __svml_atan8 into dst
+simd_atan_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,  // ssrc: input stride, counted in elements
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)  // sdst: output stride in elements; len: element count
+{
+    const int vstep = npyv_nlanes_f64;  // elements consumed per iteration
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {  // len shrinks each pass, so the final pass may have len < vstep
+        npyv_f64 x;
+        #if 0  // compiled out: load variant taking an explicit third argument
+            if (ssrc == 1) {
+                x = npyv_load_till_f64(src, len, 0);
+            } else {
+                x = npyv_loadn_till_f64(src, ssrc, len, 0);
+            }
+        #else  // active: "tillz" loads bounded by len
+            if (ssrc == 1) {  // unit stride: use the load that takes no stride argument
+                x = npyv_load_tillz_f64(src, len);  // NOTE(review): presumably zero-fills lanes past len - confirm in simd headers
+            } else {
+                x = npyv_loadn_tillz_f64(src, ssrc, len);
+            }
+        #endif
+    #if 0 == 64  // false, so the NaN workaround below is compiled out
+        // Workaround, invalid value encountered when x is set to nan
+        npyv_b64 nnan_mask = npyv_notnan_f64(x);
+        npyv_f64 x_exnan = npyv_select_f64(nnan_mask, x, npyv_setall_f64(0));
+        npyv_f64 out = __svml_atan8(x_exnan);
+        out = npyv_select_f64(nnan_mask, out, x);
+    #else  // active: SVML is called on x directly
+        npyv_f64 out = __svml_atan8(x);
+    #endif
+        if (sdst == 1) {  // unit stride: use the store that takes no stride argument
+            npyv_store_till_f64(dst, len, out);  // NOTE(review): presumably writes at most len lanes - confirm in simd headers
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();  // NOTE(review): presumably post-loop SIMD state cleanup - confirm in simd headers
+}
+
+#line 22
+/*
+ * Evaluate __svml_sinh8 over `len` doubles, reading `src` with an element
+ * stride of `ssrc` and writing `dst` with an element stride of `sdst`.
+ * Each pass advances by npyv_nlanes_f64 elements; the partial load/store
+ * intrinsics are handed the remaining `len` on every pass.
+ */
+static void
+simd_sinh_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)
+{
+    const int lanes = npyv_nlanes_f64;
+    while (len > 0) {
+        npyv_f64 vin;
+        // Non-unit strides go through the strided (loadn/storen) intrinsics.
+        if (ssrc != 1) {
+            vin = npyv_loadn_tillz_f64(src, ssrc, len);
+        } else {
+            vin = npyv_load_tillz_f64(src, len);
+        }
+        const npyv_f64 vout = __svml_sinh8(vin);
+        if (sdst != 1) {
+            npyv_storen_till_f64(dst, sdst, len, vout);
+        } else {
+            npyv_store_till_f64(dst, len, vout);
+        }
+        len -= lanes;
+        src += ssrc*lanes;
+        dst += sdst*lanes;
+    }
+    npyv_cleanup();
+}
+
+#line 22
+/*
+ * Evaluate __svml_cosh8 over `len` doubles, reading `src` with an element
+ * stride of `ssrc` and writing `dst` with an element stride of `sdst`.
+ * Each pass advances by npyv_nlanes_f64 elements; the partial load/store
+ * intrinsics are handed the remaining `len` on every pass.
+ */
+static void
+simd_cosh_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)
+{
+    const int lanes = npyv_nlanes_f64;
+    while (len > 0) {
+        npyv_f64 vin;
+        // Non-unit strides go through the strided (loadn/storen) intrinsics.
+        if (ssrc != 1) {
+            vin = npyv_loadn_tillz_f64(src, ssrc, len);
+        } else {
+            vin = npyv_load_tillz_f64(src, len);
+        }
+        const npyv_f64 vout = __svml_cosh8(vin);
+        if (sdst != 1) {
+            npyv_storen_till_f64(dst, sdst, len, vout);
+        } else {
+            npyv_store_till_f64(dst, len, vout);
+        }
+        len -= lanes;
+        src += ssrc*lanes;
+        dst += sdst*lanes;
+    }
+    npyv_cleanup();
+}
+
+#line 22
+/*
+ * Evaluate __svml_asinh8 over `len` doubles, reading `src` with an element
+ * stride of `ssrc` and writing `dst` with an element stride of `sdst`.
+ * Each pass advances by npyv_nlanes_f64 elements; the partial load/store
+ * intrinsics are handed the remaining `len` on every pass.
+ */
+static void
+simd_asinh_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)
+{
+    const int lanes = npyv_nlanes_f64;
+    while (len > 0) {
+        npyv_f64 vin;
+        // Non-unit strides go through the strided (loadn/storen) intrinsics.
+        if (ssrc != 1) {
+            vin = npyv_loadn_tillz_f64(src, ssrc, len);
+        } else {
+            vin = npyv_load_tillz_f64(src, len);
+        }
+        const npyv_f64 vout = __svml_asinh8(vin);
+        if (sdst != 1) {
+            npyv_storen_till_f64(dst, sdst, len, vout);
+        } else {
+            npyv_store_till_f64(dst, len, vout);
+        }
+        len -= lanes;
+        src += ssrc*lanes;
+        dst += sdst*lanes;
+    }
+    npyv_cleanup();
+}
+
+#line 22
+/*
+ * Evaluate __svml_acosh8 over `len` doubles, reading `src` with an element
+ * stride of `ssrc` and writing `dst` with an element stride of `sdst`.
+ * Each pass advances by npyv_nlanes_f64 elements; the partial load/store
+ * intrinsics are handed the remaining `len` on every pass.
+ */
+static void
+simd_acosh_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)
+{
+    const int lanes = npyv_nlanes_f64;
+    while (len > 0) {
+        npyv_f64 vin;
+        // Unlike the sibling kernels, the partial load is given an explicit
+        // fill argument of 1 rather than using the *_tillz variant.
+        if (ssrc != 1) {
+            vin = npyv_loadn_till_f64(src, ssrc, len, 1);
+        } else {
+            vin = npyv_load_till_f64(src, len, 1);
+        }
+        const npyv_f64 vout = __svml_acosh8(vin);
+        if (sdst != 1) {
+            npyv_storen_till_f64(dst, sdst, len, vout);
+        } else {
+            npyv_store_till_f64(dst, len, vout);
+        }
+        len -= lanes;
+        src += ssrc*lanes;
+        dst += sdst*lanes;
+    }
+    npyv_cleanup();
+}
+
+#line 22
+/*
+ * Evaluate __svml_atanh8 over `len` doubles, reading `src` with an element
+ * stride of `ssrc` and writing `dst` with an element stride of `sdst`.
+ * Each pass advances by npyv_nlanes_f64 elements; the partial load/store
+ * intrinsics are handed the remaining `len` on every pass.
+ */
+static void
+simd_atanh_f64(const npyv_lanetype_f64 *src, npy_intp ssrc,
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)
+{
+    const int lanes = npyv_nlanes_f64;
+    while (len > 0) {
+        npyv_f64 vin;
+        // Non-unit strides go through the strided (loadn/storen) intrinsics.
+        if (ssrc != 1) {
+            vin = npyv_loadn_tillz_f64(src, ssrc, len);
+        } else {
+            vin = npyv_load_tillz_f64(src, len);
+        }
+        const npyv_f64 vout = __svml_atanh8(vin);
+        if (sdst != 1) {
+            npyv_storen_till_f64(dst, sdst, len, vout);
+        } else {
+            npyv_store_till_f64(dst, len, vout);
+        }
+        len -= lanes;
+        src += ssrc*lanes;
+        dst += sdst*lanes;
+    }
+    npyv_cleanup();
+}
+
+
+
+#line 66
+#line 69
+
+/*
+ * Evaluate __svml_powf16(x1, x2) over `len` floats. `src1`/`src2` are read
+ * with element strides `ssrc1`/`ssrc2`; results go to `dst` with element
+ * stride `sdst`. Each pass advances by npyv_nlanes_f32 elements.
+ */
+static void
+simd_pow_f32(const npyv_lanetype_f32 *src1, npy_intp ssrc1,
+                  const npyv_lanetype_f32 *src2, npy_intp ssrc2,
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)
+{
+    const int vstep = npyv_nlanes_f32;
+    for (; len > 0; len -= vstep, src1 += ssrc1*vstep, src2 += ssrc2*vstep, dst += sdst*vstep) {
+        // Both operands use partial loads with a fill argument of 1.
+        npyv_f32 x1;
+        if (ssrc1 == 1) {
+            x1 = npyv_load_till_f32(src1, len, 1);
+        } else {
+            x1 = npyv_loadn_till_f32(src1, ssrc1, len, 1);
+        }
+
+        npyv_f32 x2;
+        if (ssrc2 == 1) {
+            x2 = npyv_load_till_f32(src2, len, 1);
+        } else {
+            x2 = npyv_loadn_till_f32(src2, ssrc2, len, 1);
+        }
+
+        npyv_f32 out = __svml_powf16(x1, x2);
+        if (sdst == 1) {
+            npyv_store_till_f32(dst, len, out);
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    // Finish the same way as the single-input SIMD kernels in this file.
+    npyv_cleanup();
+}
+
+#line 69
+
+/*
+ * Evaluate __svml_atan2f16(x1, x2) over `len` floats. `src1`/`src2` are read
+ * with element strides `ssrc1`/`ssrc2`; results go to `dst` with element
+ * stride `sdst`. Each pass advances by npyv_nlanes_f32 elements.
+ */
+static void
+simd_atan2_f32(const npyv_lanetype_f32 *src1, npy_intp ssrc1,
+                  const npyv_lanetype_f32 *src2, npy_intp ssrc2,
+                        npyv_lanetype_f32 *dst, npy_intp sdst, npy_intp len)
+{
+    const int vstep = npyv_nlanes_f32;
+    for (; len > 0; len -= vstep, src1 += ssrc1*vstep, src2 += ssrc2*vstep, dst += sdst*vstep) {
+        // Both operands use partial loads with a fill argument of 1.
+        npyv_f32 x1;
+        if (ssrc1 == 1) {
+            x1 = npyv_load_till_f32(src1, len, 1);
+        } else {
+            x1 = npyv_loadn_till_f32(src1, ssrc1, len, 1);
+        }
+
+        npyv_f32 x2;
+        if (ssrc2 == 1) {
+            x2 = npyv_load_till_f32(src2, len, 1);
+        } else {
+            x2 = npyv_loadn_till_f32(src2, ssrc2, len, 1);
+        }
+
+        npyv_f32 out = __svml_atan2f16(x1, x2);
+        if (sdst == 1) {
+            npyv_store_till_f32(dst, len, out);
+        } else {
+            npyv_storen_till_f32(dst, sdst, len, out);
+        }
+    }
+    // Finish the same way as the single-input SIMD kernels in this file.
+    npyv_cleanup();
+}
+
+
+#line 66
+#line 69
+
+/*
+ * Evaluate __svml_pow8(x1, x2) over `len` doubles. `src1`/`src2` are read
+ * with element strides `ssrc1`/`ssrc2`; results go to `dst` with element
+ * stride `sdst`. Each pass advances by npyv_nlanes_f64 elements.
+ */
+static void
+simd_pow_f64(const npyv_lanetype_f64 *src1, npy_intp ssrc1,
+                  const npyv_lanetype_f64 *src2, npy_intp ssrc2,
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)
+{
+    const int vstep = npyv_nlanes_f64;
+    for (; len > 0; len -= vstep, src1 += ssrc1*vstep, src2 += ssrc2*vstep, dst += sdst*vstep) {
+        // Both operands use partial loads with a fill argument of 1.
+        npyv_f64 x1;
+        if (ssrc1 == 1) {
+            x1 = npyv_load_till_f64(src1, len, 1);
+        } else {
+            x1 = npyv_loadn_till_f64(src1, ssrc1, len, 1);
+        }
+
+        npyv_f64 x2;
+        if (ssrc2 == 1) {
+            x2 = npyv_load_till_f64(src2, len, 1);
+        } else {
+            x2 = npyv_loadn_till_f64(src2, ssrc2, len, 1);
+        }
+
+        npyv_f64 out = __svml_pow8(x1, x2);
+        if (sdst == 1) {
+            npyv_store_till_f64(dst, len, out);
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    // Finish the same way as the single-input SIMD kernels in this file.
+    npyv_cleanup();
+}
+
+#line 69
+
+/*
+ * Evaluate __svml_atan28(x1, x2) over `len` doubles. `src1`/`src2` are read
+ * with element strides `ssrc1`/`ssrc2`; results go to `dst` with element
+ * stride `sdst`. Each pass advances by npyv_nlanes_f64 elements.
+ */
+static void
+simd_atan2_f64(const npyv_lanetype_f64 *src1, npy_intp ssrc1,
+                  const npyv_lanetype_f64 *src2, npy_intp ssrc2,
+                        npyv_lanetype_f64 *dst, npy_intp sdst, npy_intp len)
+{
+    const int vstep = npyv_nlanes_f64;
+    for (; len > 0; len -= vstep, src1 += ssrc1*vstep, src2 += ssrc2*vstep, dst += sdst*vstep) {
+        // Both operands use partial loads with a fill argument of 1.
+        npyv_f64 x1;
+        if (ssrc1 == 1) {
+            x1 = npyv_load_till_f64(src1, len, 1);
+        } else {
+            x1 = npyv_loadn_till_f64(src1, ssrc1, len, 1);
+        }
+
+        npyv_f64 x2;
+        if (ssrc2 == 1) {
+            x2 = npyv_load_till_f64(src2, len, 1);
+        } else {
+            x2 = npyv_loadn_till_f64(src2, ssrc2, len, 1);
+        }
+
+        npyv_f64 out = __svml_atan28(x1, x2);
+        if (sdst == 1) {
+            npyv_store_till_f64(dst, len, out);
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    // Finish the same way as the single-input SIMD kernels in this file.
+    npyv_cleanup();
+}
+
+
+
+/*
+ * Half-precision helpers: half values are carried in a 256-bit integer
+ * vector and converted to/from a 512-bit f32 vector for the computation.
+ */
+typedef __m256i npyvh_f16;
+#define npyv_cvt_f16_f32 _mm512_cvtph_ps
+#define npyv_cvt_f32_f16 _mm512_cvtps_ph
+#define npyvh_load_f16(PTR) _mm256_loadu_si256((const __m256i*)(PTR))
+// PTR is parenthesized so an expression argument is cast as a whole,
+// matching npyvh_load_f16 above.
+#define npyvh_store_f16(PTR, data) _mm256_storeu_si256((__m256i*)(PTR), data)
+/*
+ * Masked load of half values: the low `nlane` lanes are read from `ptr`,
+ * the remaining lanes of the result take the bit pattern `fill`.
+ * `nlane` must be non-zero; values above 15 select all 16 lanes.
+ */
+NPY_FINLINE npyvh_f16 npyvh_load_till_f16(const npy_half *ptr, npy_uintp nlane, npy_half fill)
+{
+    assert(nlane > 0);
+    const __m256i vfill = _mm256_set1_epi16(fill);
+    // Saturate instead of shifting an int by an arbitrarily large count,
+    // which is undefined once nlane reaches the width of int.
+    const __mmask16 mask = nlane > 15 ? (__mmask16)0xFFFF
+                                      : (__mmask16)((0x0001 << nlane) - 0x0001);
+    return _mm256_mask_loadu_epi16(vfill, mask, ptr);
+}
+/*
+ * Masked store of half values: only the low `nlane` lanes of `data` are
+ * written to `ptr`. `nlane` must be non-zero; values above 15 select all
+ * 16 lanes.
+ */
+NPY_FINLINE void npyvh_store_till_f16(npy_half *ptr, npy_uintp nlane, npyvh_f16 data)
+{
+    assert(nlane > 0);
+    // Saturate instead of shifting an int by an arbitrarily large count,
+    // which is undefined once nlane reaches the width of int.
+    const __mmask16 mask = nlane > 15 ? (__mmask16)0xFFFF
+                                      : (__mmask16)((0x0001 << nlane) - 0x0001);
+    _mm256_mask_storeu_epi16(ptr, mask, data);
+}
+
+#line 125
+/*
+ * Apply __svml_sinf16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0.
+ */
+static void
+avx512_sin_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_sinf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_sinf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_cosf16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0.
+ */
+static void
+avx512_cos_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_cosf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_cosf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_tanf16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0.
+ */
+static void
+avx512_tan_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_tanf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_tanf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_expf16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0.
+ */
+static void
+avx512_exp_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_expf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_expf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_exp2f16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0.
+ */
+static void
+avx512_exp2_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_exp2f16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_exp2f16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_expm1f16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0x3c00 (1.0 in IEEE half precision).
+ */
+static void
+avx512_expm1_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_expm1f16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0x3c00);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_expm1f16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_logf16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0x3c00 (1.0 in IEEE half precision).
+ */
+static void
+avx512_log_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_logf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0x3c00);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_logf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_log2f16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0x3c00 (1.0 in IEEE half precision).
+ */
+static void
+avx512_log2_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_log2f16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0x3c00);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_log2f16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_log10f16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0x3c00 (1.0 in IEEE half precision).
+ */
+static void
+avx512_log10_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_log10f16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0x3c00);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_log10f16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_log1pf16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0.
+ */
+static void
+avx512_log1p_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_log1pf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_log1pf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_cbrtf16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0.
+ */
+static void
+avx512_cbrt_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_cbrtf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_cbrtf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_asinf16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0.
+ */
+static void
+avx512_asin_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_asinf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_asinf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_acosf16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0.
+ */
+static void
+avx512_acos_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_acosf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_acosf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_atanf16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0.
+ */
+static void
+avx512_atan_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_atanf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_atanf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_sinhf16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0.
+ */
+static void
+avx512_sinh_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_sinhf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_sinhf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_coshf16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0.
+ */
+static void
+avx512_cosh_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_coshf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_coshf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_tanhf16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0.
+ */
+static void
+avx512_tanh_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_tanhf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_tanhf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_asinhf16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0.
+ */
+static void
+avx512_asinh_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_asinhf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_asinhf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_acoshf16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0x3c00 (1.0 in IEEE half precision).
+ */
+static void
+avx512_acosh_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_acoshf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0x3c00);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_acoshf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#line 125
+/*
+ * Apply __svml_atanhf16 to `len` contiguous half values from `src`, writing
+ * to `dst`. Values are widened to f32 for the call and narrowed back
+ * afterwards; a final short group uses the masked load/store helpers
+ * with a fill of 0.
+ */
+static void
+avx512_atanh_f16(const npy_half *src, npy_half *dst, npy_intp len)
+{
+    const int nlanes = npyv_nlanes_f32;
+    // Whole vectors.
+    while (len >= nlanes) {
+        const npyvh_f16 packed = npyvh_load_f16(src);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_atanhf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_f16(dst, narrowed);
+        src += nlanes;
+        dst += nlanes;
+        len -= nlanes;
+    }
+    // Remainder shorter than one vector.
+    if (len > 0) {
+        const npyvh_f16 packed = npyvh_load_till_f16(src, len, 0);
+        const npyv_f32 wide = npyv_cvt_f16_f32(packed);
+        const npyv_f32 res = __svml_atanhf16(wide);
+        const npyvh_f16 narrowed = npyv_cvt_f32_f16(res, 0);
+        npyvh_store_till_f16(dst, len, narrowed);
+    }
+    npyv_cleanup();
+}
+
+#endif
+
+#line 156
+/*
+ * sin inner loop for npy_half data. When the AVX512_SKX + SVML path is
+ * compiled in, is_mem_overlap reports no overlap, and both element strides
+ * equal 1, the work is handed to avx512_sin_f16. Otherwise each element is
+ * converted to float, passed through npy_sinf, and converted back.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_sin)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *half_in = (npy_half*)args[0];
+          npy_half *half_out = (npy_half*)args[1];
+    const npy_intp count = dimensions[0];
+    // steps[] is divided by the item size to obtain element strides.
+    const int itemsize = sizeof(half_in[0]);
+    const npy_intp in_stride = steps[0] / itemsize;
+    const npy_intp out_stride = steps[1] / itemsize;
+    if (!is_mem_overlap(half_in, steps[0], half_out, steps[1], count)) {
+        if (in_stride == 1 && out_stride == 1) {
+            avx512_sin_f16(half_in, half_out, count);
+            return;
+        }
+    }
+#endif
+    UNARY_LOOP {
+        const npy_half raw = *(npy_half *)ip1;
+        const npy_float widened = npy_half_to_float(raw);
+        *((npy_half *)op1) = npy_float_to_half(npy_sinf(widened));
+    }
+}
+
+#line 156
+/*
+ * cos inner loop for npy_half data. When the AVX512_SKX + SVML path is
+ * compiled in, is_mem_overlap reports no overlap, and both element strides
+ * equal 1, the work is handed to avx512_cos_f16. Otherwise each element is
+ * converted to float, passed through npy_cosf, and converted back.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_cos)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *half_in = (npy_half*)args[0];
+          npy_half *half_out = (npy_half*)args[1];
+    const npy_intp count = dimensions[0];
+    // steps[] is divided by the item size to obtain element strides.
+    const int itemsize = sizeof(half_in[0]);
+    const npy_intp in_stride = steps[0] / itemsize;
+    const npy_intp out_stride = steps[1] / itemsize;
+    if (!is_mem_overlap(half_in, steps[0], half_out, steps[1], count)) {
+        if (in_stride == 1 && out_stride == 1) {
+            avx512_cos_f16(half_in, half_out, count);
+            return;
+        }
+    }
+#endif
+    UNARY_LOOP {
+        const npy_half raw = *(npy_half *)ip1;
+        const npy_float widened = npy_half_to_float(raw);
+        *((npy_half *)op1) = npy_float_to_half(npy_cosf(widened));
+    }
+}
+
+#line 156
+/*
+ * tan inner loop for npy_half data. When the AVX512_SKX + SVML path is
+ * compiled in, is_mem_overlap reports no overlap, and both element strides
+ * equal 1, the work is handed to avx512_tan_f16. Otherwise each element is
+ * converted to float, passed through npy_tanf, and converted back.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_tan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *half_in = (npy_half*)args[0];
+          npy_half *half_out = (npy_half*)args[1];
+    const npy_intp count = dimensions[0];
+    // steps[] is divided by the item size to obtain element strides.
+    const int itemsize = sizeof(half_in[0]);
+    const npy_intp in_stride = steps[0] / itemsize;
+    const npy_intp out_stride = steps[1] / itemsize;
+    if (!is_mem_overlap(half_in, steps[0], half_out, steps[1], count)) {
+        if (in_stride == 1 && out_stride == 1) {
+            avx512_tan_f16(half_in, half_out, count);
+            return;
+        }
+    }
+#endif
+    UNARY_LOOP {
+        const npy_half raw = *(npy_half *)ip1;
+        const npy_float widened = npy_half_to_float(raw);
+        *((npy_half *)op1) = npy_float_to_half(npy_tanf(widened));
+    }
+}
+
+#line 156
+/*
+ * exp inner loop for npy_half data. When the AVX512_SKX + SVML path is
+ * compiled in, is_mem_overlap reports no overlap, and both element strides
+ * equal 1, the work is handed to avx512_exp_f16. Otherwise each element is
+ * converted to float, passed through npy_expf, and converted back.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_exp)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *half_in = (npy_half*)args[0];
+          npy_half *half_out = (npy_half*)args[1];
+    const npy_intp count = dimensions[0];
+    // steps[] is divided by the item size to obtain element strides.
+    const int itemsize = sizeof(half_in[0]);
+    const npy_intp in_stride = steps[0] / itemsize;
+    const npy_intp out_stride = steps[1] / itemsize;
+    if (!is_mem_overlap(half_in, steps[0], half_out, steps[1], count)) {
+        if (in_stride == 1 && out_stride == 1) {
+            avx512_exp_f16(half_in, half_out, count);
+            return;
+        }
+    }
+#endif
+    UNARY_LOOP {
+        const npy_half raw = *(npy_half *)ip1;
+        const npy_float widened = npy_half_to_float(raw);
+        *((npy_half *)op1) = npy_float_to_half(npy_expf(widened));
+    }
+}
+
+#line 156
+/*
+ * HALF_exp2: unary inner loop over npy_half elements, reading from args[0]
+ * and writing to args[1]. dimensions[0] is used as the element count and
+ * steps[0]/steps[1] as the input/output byte strides.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_exp2)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *src = (npy_half*)args[0];
+          npy_half *dst = (npy_half*)args[1];
+    const npy_intp elsize = (npy_intp)sizeof(npy_half);
+    const npy_intp len = dimensions[0];
+    /*
+     * Compare the byte strides themselves. Dividing by the element size
+     * first truncates, so a stride such as 3 bytes would be mistaken for a
+     * contiguous (1-element) stride.
+     */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        (steps[0] == elsize) &&
+        (steps[1] == elsize)) {
+        avx512_exp2_f16(src, dst, len);
+        return;
+    }
+#endif
+    /* Fallback: widen each half to float, apply npy_exp2f, narrow back. */
+    UNARY_LOOP {
+        const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_exp2f(in1));
+    }
+}
+
+#line 156
+/*
+ * HALF_expm1: unary inner loop over npy_half elements, reading from args[0]
+ * and writing to args[1]. dimensions[0] is used as the element count and
+ * steps[0]/steps[1] as the input/output byte strides.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_expm1)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *src = (npy_half*)args[0];
+          npy_half *dst = (npy_half*)args[1];
+    const npy_intp elsize = (npy_intp)sizeof(npy_half);
+    const npy_intp len = dimensions[0];
+    /*
+     * Compare the byte strides themselves. Dividing by the element size
+     * first truncates, so a stride such as 3 bytes would be mistaken for a
+     * contiguous (1-element) stride.
+     */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        (steps[0] == elsize) &&
+        (steps[1] == elsize)) {
+        avx512_expm1_f16(src, dst, len);
+        return;
+    }
+#endif
+    /* Fallback: widen each half to float, apply npy_expm1f, narrow back. */
+    UNARY_LOOP {
+        const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_expm1f(in1));
+    }
+}
+
+#line 156
+/*
+ * HALF_log: unary inner loop over npy_half elements, reading from args[0]
+ * and writing to args[1]. dimensions[0] is used as the element count and
+ * steps[0]/steps[1] as the input/output byte strides.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_log)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *src = (npy_half*)args[0];
+          npy_half *dst = (npy_half*)args[1];
+    const npy_intp elsize = (npy_intp)sizeof(npy_half);
+    const npy_intp len = dimensions[0];
+    /*
+     * Compare the byte strides themselves. Dividing by the element size
+     * first truncates, so a stride such as 3 bytes would be mistaken for a
+     * contiguous (1-element) stride.
+     */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        (steps[0] == elsize) &&
+        (steps[1] == elsize)) {
+        avx512_log_f16(src, dst, len);
+        return;
+    }
+#endif
+    /* Fallback: widen each half to float, apply npy_logf, narrow back. */
+    UNARY_LOOP {
+        const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_logf(in1));
+    }
+}
+
+#line 156
+/*
+ * HALF_log2: unary inner loop over npy_half elements, reading from args[0]
+ * and writing to args[1]. dimensions[0] is used as the element count and
+ * steps[0]/steps[1] as the input/output byte strides.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_log2)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *src = (npy_half*)args[0];
+          npy_half *dst = (npy_half*)args[1];
+    const npy_intp elsize = (npy_intp)sizeof(npy_half);
+    const npy_intp len = dimensions[0];
+    /*
+     * Compare the byte strides themselves. Dividing by the element size
+     * first truncates, so a stride such as 3 bytes would be mistaken for a
+     * contiguous (1-element) stride.
+     */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        (steps[0] == elsize) &&
+        (steps[1] == elsize)) {
+        avx512_log2_f16(src, dst, len);
+        return;
+    }
+#endif
+    /* Fallback: widen each half to float, apply npy_log2f, narrow back. */
+    UNARY_LOOP {
+        const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_log2f(in1));
+    }
+}
+
+#line 156
+/*
+ * HALF_log10: unary inner loop over npy_half elements, reading from args[0]
+ * and writing to args[1]. dimensions[0] is used as the element count and
+ * steps[0]/steps[1] as the input/output byte strides.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_log10)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *src = (npy_half*)args[0];
+          npy_half *dst = (npy_half*)args[1];
+    const npy_intp elsize = (npy_intp)sizeof(npy_half);
+    const npy_intp len = dimensions[0];
+    /*
+     * Compare the byte strides themselves. Dividing by the element size
+     * first truncates, so a stride such as 3 bytes would be mistaken for a
+     * contiguous (1-element) stride.
+     */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        (steps[0] == elsize) &&
+        (steps[1] == elsize)) {
+        avx512_log10_f16(src, dst, len);
+        return;
+    }
+#endif
+    /* Fallback: widen each half to float, apply npy_log10f, narrow back. */
+    UNARY_LOOP {
+        const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_log10f(in1));
+    }
+}
+
+#line 156
+/*
+ * HALF_log1p: unary inner loop over npy_half elements, reading from args[0]
+ * and writing to args[1]. dimensions[0] is used as the element count and
+ * steps[0]/steps[1] as the input/output byte strides.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_log1p)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *src = (npy_half*)args[0];
+          npy_half *dst = (npy_half*)args[1];
+    const npy_intp elsize = (npy_intp)sizeof(npy_half);
+    const npy_intp len = dimensions[0];
+    /*
+     * Compare the byte strides themselves. Dividing by the element size
+     * first truncates, so a stride such as 3 bytes would be mistaken for a
+     * contiguous (1-element) stride.
+     */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        (steps[0] == elsize) &&
+        (steps[1] == elsize)) {
+        avx512_log1p_f16(src, dst, len);
+        return;
+    }
+#endif
+    /* Fallback: widen each half to float, apply npy_log1pf, narrow back. */
+    UNARY_LOOP {
+        const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_log1pf(in1));
+    }
+}
+
+#line 156
+/*
+ * HALF_cbrt: unary inner loop over npy_half elements, reading from args[0]
+ * and writing to args[1]. dimensions[0] is used as the element count and
+ * steps[0]/steps[1] as the input/output byte strides.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_cbrt)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *src = (npy_half*)args[0];
+          npy_half *dst = (npy_half*)args[1];
+    const npy_intp elsize = (npy_intp)sizeof(npy_half);
+    const npy_intp len = dimensions[0];
+    /*
+     * Compare the byte strides themselves. Dividing by the element size
+     * first truncates, so a stride such as 3 bytes would be mistaken for a
+     * contiguous (1-element) stride.
+     */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        (steps[0] == elsize) &&
+        (steps[1] == elsize)) {
+        avx512_cbrt_f16(src, dst, len);
+        return;
+    }
+#endif
+    /* Fallback: widen each half to float, apply npy_cbrtf, narrow back. */
+    UNARY_LOOP {
+        const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_cbrtf(in1));
+    }
+}
+
+#line 156
+/*
+ * HALF_arcsin: unary inner loop over npy_half elements, reading from args[0]
+ * and writing to args[1]. dimensions[0] is used as the element count and
+ * steps[0]/steps[1] as the input/output byte strides.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_arcsin)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *src = (npy_half*)args[0];
+          npy_half *dst = (npy_half*)args[1];
+    const npy_intp elsize = (npy_intp)sizeof(npy_half);
+    const npy_intp len = dimensions[0];
+    /*
+     * Compare the byte strides themselves. Dividing by the element size
+     * first truncates, so a stride such as 3 bytes would be mistaken for a
+     * contiguous (1-element) stride.
+     */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        (steps[0] == elsize) &&
+        (steps[1] == elsize)) {
+        avx512_asin_f16(src, dst, len);
+        return;
+    }
+#endif
+    /* Fallback: widen each half to float, apply npy_asinf, narrow back. */
+    UNARY_LOOP {
+        const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_asinf(in1));
+    }
+}
+
+#line 156
+/*
+ * HALF_arccos: unary inner loop over npy_half elements, reading from args[0]
+ * and writing to args[1]. dimensions[0] is used as the element count and
+ * steps[0]/steps[1] as the input/output byte strides.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_arccos)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *src = (npy_half*)args[0];
+          npy_half *dst = (npy_half*)args[1];
+    const npy_intp elsize = (npy_intp)sizeof(npy_half);
+    const npy_intp len = dimensions[0];
+    /*
+     * Compare the byte strides themselves. Dividing by the element size
+     * first truncates, so a stride such as 3 bytes would be mistaken for a
+     * contiguous (1-element) stride.
+     */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        (steps[0] == elsize) &&
+        (steps[1] == elsize)) {
+        avx512_acos_f16(src, dst, len);
+        return;
+    }
+#endif
+    /* Fallback: widen each half to float, apply npy_acosf, narrow back. */
+    UNARY_LOOP {
+        const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_acosf(in1));
+    }
+}
+
+#line 156
+/*
+ * HALF_arctan: unary inner loop over npy_half elements, reading from args[0]
+ * and writing to args[1]. dimensions[0] is used as the element count and
+ * steps[0]/steps[1] as the input/output byte strides.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_arctan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *src = (npy_half*)args[0];
+          npy_half *dst = (npy_half*)args[1];
+    const npy_intp elsize = (npy_intp)sizeof(npy_half);
+    const npy_intp len = dimensions[0];
+    /*
+     * Compare the byte strides themselves. Dividing by the element size
+     * first truncates, so a stride such as 3 bytes would be mistaken for a
+     * contiguous (1-element) stride.
+     */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        (steps[0] == elsize) &&
+        (steps[1] == elsize)) {
+        avx512_atan_f16(src, dst, len);
+        return;
+    }
+#endif
+    /* Fallback: widen each half to float, apply npy_atanf, narrow back. */
+    UNARY_LOOP {
+        const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_atanf(in1));
+    }
+}
+
+#line 156
+/*
+ * HALF_sinh: unary inner loop over npy_half elements, reading from args[0]
+ * and writing to args[1]. dimensions[0] is used as the element count and
+ * steps[0]/steps[1] as the input/output byte strides.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_sinh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *src = (npy_half*)args[0];
+          npy_half *dst = (npy_half*)args[1];
+    const npy_intp elsize = (npy_intp)sizeof(npy_half);
+    const npy_intp len = dimensions[0];
+    /*
+     * Compare the byte strides themselves. Dividing by the element size
+     * first truncates, so a stride such as 3 bytes would be mistaken for a
+     * contiguous (1-element) stride.
+     */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        (steps[0] == elsize) &&
+        (steps[1] == elsize)) {
+        avx512_sinh_f16(src, dst, len);
+        return;
+    }
+#endif
+    /* Fallback: widen each half to float, apply npy_sinhf, narrow back. */
+    UNARY_LOOP {
+        const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_sinhf(in1));
+    }
+}
+
+#line 156
+/*
+ * HALF_cosh: unary inner loop over npy_half elements, reading from args[0]
+ * and writing to args[1]. dimensions[0] is used as the element count and
+ * steps[0]/steps[1] as the input/output byte strides.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_cosh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *src = (npy_half*)args[0];
+          npy_half *dst = (npy_half*)args[1];
+    const npy_intp elsize = (npy_intp)sizeof(npy_half);
+    const npy_intp len = dimensions[0];
+    /*
+     * Compare the byte strides themselves. Dividing by the element size
+     * first truncates, so a stride such as 3 bytes would be mistaken for a
+     * contiguous (1-element) stride.
+     */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        (steps[0] == elsize) &&
+        (steps[1] == elsize)) {
+        avx512_cosh_f16(src, dst, len);
+        return;
+    }
+#endif
+    /* Fallback: widen each half to float, apply npy_coshf, narrow back. */
+    UNARY_LOOP {
+        const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_coshf(in1));
+    }
+}
+
+#line 156
+/*
+ * HALF_tanh: unary inner loop over npy_half elements, reading from args[0]
+ * and writing to args[1]. dimensions[0] is used as the element count and
+ * steps[0]/steps[1] as the input/output byte strides.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_tanh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *src = (npy_half*)args[0];
+          npy_half *dst = (npy_half*)args[1];
+    const npy_intp elsize = (npy_intp)sizeof(npy_half);
+    const npy_intp len = dimensions[0];
+    /*
+     * Compare the byte strides themselves. Dividing by the element size
+     * first truncates, so a stride such as 3 bytes would be mistaken for a
+     * contiguous (1-element) stride.
+     */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        (steps[0] == elsize) &&
+        (steps[1] == elsize)) {
+        avx512_tanh_f16(src, dst, len);
+        return;
+    }
+#endif
+    /* Fallback: widen each half to float, apply npy_tanhf, narrow back. */
+    UNARY_LOOP {
+        const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_tanhf(in1));
+    }
+}
+
+#line 156
+/*
+ * HALF_arcsinh: unary inner loop over npy_half elements, reading from
+ * args[0] and writing to args[1]. dimensions[0] is used as the element count
+ * and steps[0]/steps[1] as the input/output byte strides.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_arcsinh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *src = (npy_half*)args[0];
+          npy_half *dst = (npy_half*)args[1];
+    const npy_intp elsize = (npy_intp)sizeof(npy_half);
+    const npy_intp len = dimensions[0];
+    /*
+     * Compare the byte strides themselves. Dividing by the element size
+     * first truncates, so a stride such as 3 bytes would be mistaken for a
+     * contiguous (1-element) stride.
+     */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        (steps[0] == elsize) &&
+        (steps[1] == elsize)) {
+        avx512_asinh_f16(src, dst, len);
+        return;
+    }
+#endif
+    /* Fallback: widen each half to float, apply npy_asinhf, narrow back. */
+    UNARY_LOOP {
+        const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_asinhf(in1));
+    }
+}
+
+#line 156
+/*
+ * HALF_arccosh: unary inner loop over npy_half elements, reading from
+ * args[0] and writing to args[1]. dimensions[0] is used as the element count
+ * and steps[0]/steps[1] as the input/output byte strides.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_arccosh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *src = (npy_half*)args[0];
+          npy_half *dst = (npy_half*)args[1];
+    const npy_intp elsize = (npy_intp)sizeof(npy_half);
+    const npy_intp len = dimensions[0];
+    /*
+     * Compare the byte strides themselves. Dividing by the element size
+     * first truncates, so a stride such as 3 bytes would be mistaken for a
+     * contiguous (1-element) stride.
+     */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        (steps[0] == elsize) &&
+        (steps[1] == elsize)) {
+        avx512_acosh_f16(src, dst, len);
+        return;
+    }
+#endif
+    /* Fallback: widen each half to float, apply npy_acoshf, narrow back. */
+    UNARY_LOOP {
+        const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_acoshf(in1));
+    }
+}
+
+#line 156
+/*
+ * HALF_arctanh: unary inner loop over npy_half elements, reading from
+ * args[0] and writing to args[1]. dimensions[0] is used as the element count
+ * and steps[0]/steps[1] as the input/output byte strides.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(HALF_arctanh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_half *src = (npy_half*)args[0];
+          npy_half *dst = (npy_half*)args[1];
+    const npy_intp elsize = (npy_intp)sizeof(npy_half);
+    const npy_intp len = dimensions[0];
+    /*
+     * Compare the byte strides themselves. Dividing by the element size
+     * first truncates, so a stride such as 3 bytes would be mistaken for a
+     * contiguous (1-element) stride.
+     */
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        (steps[0] == elsize) &&
+        (steps[1] == elsize)) {
+        avx512_atanh_f16(src, dst, len);
+        return;
+    }
+#endif
+    /* Fallback: widen each half to float, apply npy_atanhf, narrow back. */
+    UNARY_LOOP {
+        const npy_float in1 = npy_half_to_float(*(npy_half *)ip1);
+        *((npy_half *)op1) = npy_float_to_half(npy_atanhf(in1));
+    }
+}
+
+
+#line 186
+#line 190
+/*
+ * DOUBLE_exp2: unary inner loop that stores npy_exp2 of every npy_double
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_exp2)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_double *in_ptr = (npy_double*)args[0];
+    npy_double *out_ptr = (npy_double*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f64(in_step) &&
+        npyv_storable_stride_f64(out_step);
+    if (vectorizable) {
+        simd_exp2_f64(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_exp2(*(npy_double *)ip1);
+    }
+}
+
+#line 190
+/*
+ * DOUBLE_log2: unary inner loop that stores npy_log2 of every npy_double
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_log2)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_double *in_ptr = (npy_double*)args[0];
+    npy_double *out_ptr = (npy_double*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f64(in_step) &&
+        npyv_storable_stride_f64(out_step);
+    if (vectorizable) {
+        simd_log2_f64(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_log2(*(npy_double *)ip1);
+    }
+}
+
+#line 190
+/*
+ * DOUBLE_log10: unary inner loop that stores npy_log10 of every npy_double
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_log10)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_double *in_ptr = (npy_double*)args[0];
+    npy_double *out_ptr = (npy_double*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f64(in_step) &&
+        npyv_storable_stride_f64(out_step);
+    if (vectorizable) {
+        simd_log10_f64(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_log10(*(npy_double *)ip1);
+    }
+}
+
+#line 190
+/*
+ * DOUBLE_expm1: unary inner loop that stores npy_expm1 of every npy_double
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_expm1)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_double *in_ptr = (npy_double*)args[0];
+    npy_double *out_ptr = (npy_double*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f64(in_step) &&
+        npyv_storable_stride_f64(out_step);
+    if (vectorizable) {
+        simd_expm1_f64(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_expm1(*(npy_double *)ip1);
+    }
+}
+
+#line 190
+/*
+ * DOUBLE_log1p: unary inner loop that stores npy_log1p of every npy_double
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_log1p)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_double *in_ptr = (npy_double*)args[0];
+    npy_double *out_ptr = (npy_double*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f64(in_step) &&
+        npyv_storable_stride_f64(out_step);
+    if (vectorizable) {
+        simd_log1p_f64(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_log1p(*(npy_double *)ip1);
+    }
+}
+
+#line 190
+/*
+ * DOUBLE_cbrt: unary inner loop that stores npy_cbrt of every npy_double
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_cbrt)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_double *in_ptr = (npy_double*)args[0];
+    npy_double *out_ptr = (npy_double*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f64(in_step) &&
+        npyv_storable_stride_f64(out_step);
+    if (vectorizable) {
+        simd_cbrt_f64(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_cbrt(*(npy_double *)ip1);
+    }
+}
+
+#line 190
+/*
+ * DOUBLE_tan: unary inner loop that stores npy_tan of every npy_double
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_tan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_double *in_ptr = (npy_double*)args[0];
+    npy_double *out_ptr = (npy_double*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f64(in_step) &&
+        npyv_storable_stride_f64(out_step);
+    if (vectorizable) {
+        simd_tan_f64(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_tan(*(npy_double *)ip1);
+    }
+}
+
+#line 190
+/*
+ * DOUBLE_arcsin: unary inner loop that stores npy_asin of every npy_double
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_arcsin)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_double *in_ptr = (npy_double*)args[0];
+    npy_double *out_ptr = (npy_double*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f64(in_step) &&
+        npyv_storable_stride_f64(out_step);
+    if (vectorizable) {
+        simd_asin_f64(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_asin(*(npy_double *)ip1);
+    }
+}
+
+#line 190
+/*
+ * DOUBLE_arccos: unary inner loop that stores npy_acos of every npy_double
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_arccos)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_double *in_ptr = (npy_double*)args[0];
+    npy_double *out_ptr = (npy_double*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f64(in_step) &&
+        npyv_storable_stride_f64(out_step);
+    if (vectorizable) {
+        simd_acos_f64(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_acos(*(npy_double *)ip1);
+    }
+}
+
+#line 190
+/*
+ * DOUBLE_arctan: unary inner loop that stores npy_atan of every npy_double
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_arctan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_double *in_ptr = (npy_double*)args[0];
+    npy_double *out_ptr = (npy_double*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f64(in_step) &&
+        npyv_storable_stride_f64(out_step);
+    if (vectorizable) {
+        simd_atan_f64(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_atan(*(npy_double *)ip1);
+    }
+}
+
+#line 190
+/*
+ * DOUBLE_sinh: unary inner loop that stores npy_sinh of every npy_double
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_sinh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_double *in_ptr = (npy_double*)args[0];
+    npy_double *out_ptr = (npy_double*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f64(in_step) &&
+        npyv_storable_stride_f64(out_step);
+    if (vectorizable) {
+        simd_sinh_f64(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_sinh(*(npy_double *)ip1);
+    }
+}
+
+#line 190
+/*
+ * DOUBLE_cosh: unary inner loop that stores npy_cosh of every npy_double
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_cosh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_double *in_ptr = (npy_double*)args[0];
+    npy_double *out_ptr = (npy_double*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f64(in_step) &&
+        npyv_storable_stride_f64(out_step);
+    if (vectorizable) {
+        simd_cosh_f64(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_cosh(*(npy_double *)ip1);
+    }
+}
+
+#line 190
+/*
+ * DOUBLE_arcsinh: unary inner loop that stores npy_asinh of every npy_double
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_arcsinh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_double *in_ptr = (npy_double*)args[0];
+    npy_double *out_ptr = (npy_double*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f64(in_step) &&
+        npyv_storable_stride_f64(out_step);
+    if (vectorizable) {
+        simd_asinh_f64(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_asinh(*(npy_double *)ip1);
+    }
+}
+
+#line 190
+/*
+ * DOUBLE_arccosh: unary inner loop that stores npy_acosh of every npy_double
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_arccosh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_double *in_ptr = (npy_double*)args[0];
+    npy_double *out_ptr = (npy_double*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f64(in_step) &&
+        npyv_storable_stride_f64(out_step);
+    if (vectorizable) {
+        simd_acosh_f64(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_acosh(*(npy_double *)ip1);
+    }
+}
+
+#line 190
+/*
+ * DOUBLE_arctanh: unary inner loop that stores npy_atanh of every npy_double
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_arctanh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_double *in_ptr = (npy_double*)args[0];
+    npy_double *out_ptr = (npy_double*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f64(in_step) &&
+        npyv_storable_stride_f64(out_step);
+    if (vectorizable) {
+        simd_atanh_f64(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_double *)op1 = npy_atanh(*(npy_double *)ip1);
+    }
+}
+
+
+#line 186
+#line 190
+/*
+ * FLOAT_exp2: unary inner loop that stores npy_exp2f of every npy_float
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_exp2)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_float *in_ptr = (npy_float*)args[0];
+    npy_float *out_ptr = (npy_float*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f32(in_step) &&
+        npyv_storable_stride_f32(out_step);
+    if (vectorizable) {
+        simd_exp2_f32(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_float *)op1 = npy_exp2f(*(npy_float *)ip1);
+    }
+}
+
+#line 190
+/*
+ * FLOAT_log2: unary inner loop that stores npy_log2f of every npy_float
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_log2)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_float *in_ptr = (npy_float*)args[0];
+    npy_float *out_ptr = (npy_float*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f32(in_step) &&
+        npyv_storable_stride_f32(out_step);
+    if (vectorizable) {
+        simd_log2_f32(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_float *)op1 = npy_log2f(*(npy_float *)ip1);
+    }
+}
+
+#line 190
+/*
+ * FLOAT_log10: unary inner loop that stores npy_log10f of every npy_float
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_log10)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_float *in_ptr = (npy_float*)args[0];
+    npy_float *out_ptr = (npy_float*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f32(in_step) &&
+        npyv_storable_stride_f32(out_step);
+    if (vectorizable) {
+        simd_log10_f32(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_float *)op1 = npy_log10f(*(npy_float *)ip1);
+    }
+}
+
+#line 190
+/*
+ * FLOAT_expm1: unary inner loop that stores npy_expm1f of every npy_float
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_expm1)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_float *in_ptr = (npy_float*)args[0];
+    npy_float *out_ptr = (npy_float*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f32(in_step) &&
+        npyv_storable_stride_f32(out_step);
+    if (vectorizable) {
+        simd_expm1_f32(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_float *)op1 = npy_expm1f(*(npy_float *)ip1);
+    }
+}
+
+#line 190
+/*
+ * FLOAT_log1p: unary inner loop that stores npy_log1pf of every npy_float
+ * read from args[0] into args[1]; dimensions[0] is the element count.
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_log1p)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const npy_intp len = dimensions[0];
+    const npy_float *in_ptr = (npy_float*)args[0];
+    npy_float *out_ptr = (npy_float*)args[1];
+    const int lsize = sizeof(in_ptr[0]);
+    /* Strides converted from bytes to element units. */
+    const npy_intp in_step = steps[0] / lsize;
+    const npy_intp out_step = steps[1] / lsize;
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0));
+    /* Vector path requires no overlap and strides accepted by npyv. */
+    const int vectorizable =
+        !is_mem_overlap(in_ptr, steps[0], out_ptr, steps[1], len) &&
+        npyv_loadable_stride_f32(in_step) &&
+        npyv_storable_stride_f32(out_step);
+    if (vectorizable) {
+        simd_log1p_f32(in_ptr, in_step, out_ptr, out_step, len);
+        return;
+    }
+#endif
+    /* Scalar fallback, one element per iteration. */
+    UNARY_LOOP {
+        *(npy_float *)op1 = npy_log1pf(*(npy_float *)ip1);
+    }
+}
+
+#line 190
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_cbrt)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    /* Vector path: steps[] are divided by the element size to get the strides handed to the kernel. */
+    const int elsize = sizeof(npy_float);
+    const npy_intp count = dimensions[0];
+    const npy_intp istride = steps[0] / elsize, ostride = steps[1] / elsize;
+    const npy_float *input = (npy_float*)args[0];
+    npy_float *output = (npy_float*)args[1];
+    assert(count <= 1 || (steps[0] % elsize == 0 && steps[1] % elsize == 0));
+    if (!is_mem_overlap(input, steps[0], output, steps[1], count) &&
+        npyv_loadable_stride_f32(istride) && npyv_storable_stride_f32(ostride)) {
+        simd_cbrt_f32(input, istride, output, ostride, count);
+        return;
+    }
+#endif
+    /* Scalar path (the only one when the block above is compiled out): npy_cbrtf per element. */
+    UNARY_LOOP {
+        const npy_float x = *(npy_float *)ip1;
+        *(npy_float *)op1 = npy_cbrtf(x);
+    }
+}
+
+#line 190
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_tan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    /* Vector path: steps[] are divided by the element size to get the strides handed to the kernel. */
+    const int elsize = sizeof(npy_float);
+    const npy_intp count = dimensions[0];
+    const npy_intp istride = steps[0] / elsize, ostride = steps[1] / elsize;
+    const npy_float *input = (npy_float*)args[0];
+    npy_float *output = (npy_float*)args[1];
+    assert(count <= 1 || (steps[0] % elsize == 0 && steps[1] % elsize == 0));
+    if (!is_mem_overlap(input, steps[0], output, steps[1], count) &&
+        npyv_loadable_stride_f32(istride) && npyv_storable_stride_f32(ostride)) {
+        simd_tan_f32(input, istride, output, ostride, count);
+        return;
+    }
+#endif
+    /* Scalar path (the only one when the block above is compiled out): npy_tanf per element. */
+    UNARY_LOOP {
+        const npy_float x = *(npy_float *)ip1;
+        *(npy_float *)op1 = npy_tanf(x);
+    }
+}
+
+#line 190
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_arcsin)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    /* Vector path: steps[] are divided by the element size to get the strides handed to the kernel. */
+    const int elsize = sizeof(npy_float);
+    const npy_intp count = dimensions[0];
+    const npy_intp istride = steps[0] / elsize, ostride = steps[1] / elsize;
+    const npy_float *input = (npy_float*)args[0];
+    npy_float *output = (npy_float*)args[1];
+    assert(count <= 1 || (steps[0] % elsize == 0 && steps[1] % elsize == 0));
+    if (!is_mem_overlap(input, steps[0], output, steps[1], count) &&
+        npyv_loadable_stride_f32(istride) && npyv_storable_stride_f32(ostride)) {
+        simd_asin_f32(input, istride, output, ostride, count);
+        return;
+    }
+#endif
+    /* Scalar path (the only one when the block above is compiled out): npy_asinf per element. */
+    UNARY_LOOP {
+        const npy_float x = *(npy_float *)ip1;
+        *(npy_float *)op1 = npy_asinf(x);
+    }
+}
+
+#line 190
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_arccos)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    /* Vector path: steps[] are divided by the element size to get the strides handed to the kernel. */
+    const int elsize = sizeof(npy_float);
+    const npy_intp count = dimensions[0];
+    const npy_intp istride = steps[0] / elsize, ostride = steps[1] / elsize;
+    const npy_float *input = (npy_float*)args[0];
+    npy_float *output = (npy_float*)args[1];
+    assert(count <= 1 || (steps[0] % elsize == 0 && steps[1] % elsize == 0));
+    if (!is_mem_overlap(input, steps[0], output, steps[1], count) &&
+        npyv_loadable_stride_f32(istride) && npyv_storable_stride_f32(ostride)) {
+        simd_acos_f32(input, istride, output, ostride, count);
+        return;
+    }
+#endif
+    /* Scalar path (the only one when the block above is compiled out): npy_acosf per element. */
+    UNARY_LOOP {
+        const npy_float x = *(npy_float *)ip1;
+        *(npy_float *)op1 = npy_acosf(x);
+    }
+}
+
+#line 190
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_arctan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    /* Vector path: steps[] are divided by the element size to get the strides handed to the kernel. */
+    const int elsize = sizeof(npy_float);
+    const npy_intp count = dimensions[0];
+    const npy_intp istride = steps[0] / elsize, ostride = steps[1] / elsize;
+    const npy_float *input = (npy_float*)args[0];
+    npy_float *output = (npy_float*)args[1];
+    assert(count <= 1 || (steps[0] % elsize == 0 && steps[1] % elsize == 0));
+    if (!is_mem_overlap(input, steps[0], output, steps[1], count) &&
+        npyv_loadable_stride_f32(istride) && npyv_storable_stride_f32(ostride)) {
+        simd_atan_f32(input, istride, output, ostride, count);
+        return;
+    }
+#endif
+    /* Scalar path (the only one when the block above is compiled out): npy_atanf per element. */
+    UNARY_LOOP {
+        const npy_float x = *(npy_float *)ip1;
+        *(npy_float *)op1 = npy_atanf(x);
+    }
+}
+
+#line 190
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_sinh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    /* Vector path: steps[] are divided by the element size to get the strides handed to the kernel. */
+    const int elsize = sizeof(npy_float);
+    const npy_intp count = dimensions[0];
+    const npy_intp istride = steps[0] / elsize, ostride = steps[1] / elsize;
+    const npy_float *input = (npy_float*)args[0];
+    npy_float *output = (npy_float*)args[1];
+    assert(count <= 1 || (steps[0] % elsize == 0 && steps[1] % elsize == 0));
+    if (!is_mem_overlap(input, steps[0], output, steps[1], count) &&
+        npyv_loadable_stride_f32(istride) && npyv_storable_stride_f32(ostride)) {
+        simd_sinh_f32(input, istride, output, ostride, count);
+        return;
+    }
+#endif
+    /* Scalar path (the only one when the block above is compiled out): npy_sinhf per element. */
+    UNARY_LOOP {
+        const npy_float x = *(npy_float *)ip1;
+        *(npy_float *)op1 = npy_sinhf(x);
+    }
+}
+
+#line 190
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_cosh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    /* Vector path: steps[] are divided by the element size to get the strides handed to the kernel. */
+    const int elsize = sizeof(npy_float);
+    const npy_intp count = dimensions[0];
+    const npy_intp istride = steps[0] / elsize, ostride = steps[1] / elsize;
+    const npy_float *input = (npy_float*)args[0];
+    npy_float *output = (npy_float*)args[1];
+    assert(count <= 1 || (steps[0] % elsize == 0 && steps[1] % elsize == 0));
+    if (!is_mem_overlap(input, steps[0], output, steps[1], count) &&
+        npyv_loadable_stride_f32(istride) && npyv_storable_stride_f32(ostride)) {
+        simd_cosh_f32(input, istride, output, ostride, count);
+        return;
+    }
+#endif
+    /* Scalar path (the only one when the block above is compiled out): npy_coshf per element. */
+    UNARY_LOOP {
+        const npy_float x = *(npy_float *)ip1;
+        *(npy_float *)op1 = npy_coshf(x);
+    }
+}
+
+#line 190
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_arcsinh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    /* Vector path: steps[] are divided by the element size to get the strides handed to the kernel. */
+    const int elsize = sizeof(npy_float);
+    const npy_intp count = dimensions[0];
+    const npy_intp istride = steps[0] / elsize, ostride = steps[1] / elsize;
+    const npy_float *input = (npy_float*)args[0];
+    npy_float *output = (npy_float*)args[1];
+    assert(count <= 1 || (steps[0] % elsize == 0 && steps[1] % elsize == 0));
+    if (!is_mem_overlap(input, steps[0], output, steps[1], count) &&
+        npyv_loadable_stride_f32(istride) && npyv_storable_stride_f32(ostride)) {
+        simd_asinh_f32(input, istride, output, ostride, count);
+        return;
+    }
+#endif
+    /* Scalar path (the only one when the block above is compiled out): npy_asinhf per element. */
+    UNARY_LOOP {
+        const npy_float x = *(npy_float *)ip1;
+        *(npy_float *)op1 = npy_asinhf(x);
+    }
+}
+
+#line 190
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_arccosh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    /* Vector path: steps[] are divided by the element size to get the strides handed to the kernel. */
+    const int elsize = sizeof(npy_float);
+    const npy_intp count = dimensions[0];
+    const npy_intp istride = steps[0] / elsize, ostride = steps[1] / elsize;
+    const npy_float *input = (npy_float*)args[0];
+    npy_float *output = (npy_float*)args[1];
+    assert(count <= 1 || (steps[0] % elsize == 0 && steps[1] % elsize == 0));
+    if (!is_mem_overlap(input, steps[0], output, steps[1], count) &&
+        npyv_loadable_stride_f32(istride) && npyv_storable_stride_f32(ostride)) {
+        simd_acosh_f32(input, istride, output, ostride, count);
+        return;
+    }
+#endif
+    /* Scalar path (the only one when the block above is compiled out): npy_acoshf per element. */
+    UNARY_LOOP {
+        const npy_float x = *(npy_float *)ip1;
+        *(npy_float *)op1 = npy_acoshf(x);
+    }
+}
+
+#line 190
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_arctanh)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    /* Vector path: steps[] are divided by the element size to get the strides handed to the kernel. */
+    const int elsize = sizeof(npy_float);
+    const npy_intp count = dimensions[0];
+    const npy_intp istride = steps[0] / elsize, ostride = steps[1] / elsize;
+    const npy_float *input = (npy_float*)args[0];
+    npy_float *output = (npy_float*)args[1];
+    assert(count <= 1 || (steps[0] % elsize == 0 && steps[1] % elsize == 0));
+    if (!is_mem_overlap(input, steps[0], output, steps[1], count) &&
+        npyv_loadable_stride_f32(istride) && npyv_storable_stride_f32(ostride)) {
+        simd_atanh_f32(input, istride, output, ostride, count);
+        return;
+    }
+#endif
+    /* Scalar path (the only one when the block above is compiled out): npy_atanhf per element. */
+    UNARY_LOOP {
+        const npy_float x = *(npy_float *)ip1;
+        *(npy_float *)op1 = npy_atanhf(x);
+    }
+}
+
+
+
+#line 222
+#line 226
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_power)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    /* Vector path: steps[] are divided by the element size to get the strides handed to the kernel. */
+    const npy_double *src1 = (npy_double*)args[0];
+    const npy_double *src2 = (npy_double*)args[1];
+          npy_double *dst  = (npy_double*)args[2];
+    const int lsize = sizeof(src1[0]);
+    const npy_intp ssrc1 = steps[0] / lsize, ssrc2 = steps[1] / lsize, sdst = steps[2] / lsize;
+    const npy_intp len = dimensions[0];
+    /* Every divided step must be an exact multiple of lsize, including the output step steps[2]. */
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0 && steps[2] % lsize == 0));
+    if (!is_mem_overlap(src1, steps[0], dst, steps[2], len) && !is_mem_overlap(src2, steps[1], dst, steps[2], len) &&
+        npyv_loadable_stride_f64(ssrc1) && npyv_loadable_stride_f64(ssrc2) && npyv_storable_stride_f64(sdst)) {
+        simd_pow_f64(src1, ssrc1, src2, ssrc2, dst, sdst, len);
+        return;
+    }
+#endif
+    /* Scalar path (the only one when the block above is compiled out): npy_pow per element pair. */
+    BINARY_LOOP {
+        const npy_double in1 = *(npy_double *)ip1;
+        const npy_double in2 = *(npy_double *)ip2;
+        *(npy_double *)op1 = npy_pow(in1, in2);
+    }
+}
+
+#line 226
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_arctan2)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    /* Vector path: steps[] are divided by the element size to get the strides handed to the kernel. */
+    const npy_double *src1 = (npy_double*)args[0];
+    const npy_double *src2 = (npy_double*)args[1];
+          npy_double *dst  = (npy_double*)args[2];
+    const int lsize = sizeof(src1[0]);
+    const npy_intp ssrc1 = steps[0] / lsize, ssrc2 = steps[1] / lsize, sdst = steps[2] / lsize;
+    const npy_intp len = dimensions[0];
+    /* Every divided step must be an exact multiple of lsize, including the output step steps[2]. */
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0 && steps[2] % lsize == 0));
+    if (!is_mem_overlap(src1, steps[0], dst, steps[2], len) && !is_mem_overlap(src2, steps[1], dst, steps[2], len) &&
+        npyv_loadable_stride_f64(ssrc1) && npyv_loadable_stride_f64(ssrc2) && npyv_storable_stride_f64(sdst)) {
+        simd_atan2_f64(src1, ssrc1, src2, ssrc2, dst, sdst, len);
+        return;
+    }
+#endif
+    /* Scalar path (the only one when the block above is compiled out): npy_atan2 per element pair. */
+    BINARY_LOOP {
+        const npy_double in1 = *(npy_double *)ip1;
+        const npy_double in2 = *(npy_double *)ip2;
+        *(npy_double *)op1 = npy_atan2(in1, in2);
+    }
+}
+
+
+#line 222
+#line 226
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_power)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    /* Vector path: steps[] are divided by the element size to get the strides handed to the kernel. */
+    const npy_float *src1 = (npy_float*)args[0];
+    const npy_float *src2 = (npy_float*)args[1];
+          npy_float *dst  = (npy_float*)args[2];
+    const int lsize = sizeof(src1[0]);
+    const npy_intp ssrc1 = steps[0] / lsize, ssrc2 = steps[1] / lsize, sdst = steps[2] / lsize;
+    const npy_intp len = dimensions[0];
+    /* Every divided step must be an exact multiple of lsize, including the output step steps[2]. */
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0 && steps[2] % lsize == 0));
+    if (!is_mem_overlap(src1, steps[0], dst, steps[2], len) && !is_mem_overlap(src2, steps[1], dst, steps[2], len) &&
+        npyv_loadable_stride_f32(ssrc1) && npyv_loadable_stride_f32(ssrc2) && npyv_storable_stride_f32(sdst)) {
+        simd_pow_f32(src1, ssrc1, src2, ssrc2, dst, sdst, len);
+        return;
+    }
+#endif
+    /* Scalar path (the only one when the block above is compiled out): npy_powf per element pair. */
+    BINARY_LOOP {
+        const npy_float in1 = *(npy_float *)ip1;
+        const npy_float in2 = *(npy_float *)ip2;
+        *(npy_float *)op1 = npy_powf(in1, in2);
+    }
+}
+
+#line 226
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_arctan2)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    /* Vector path: steps[] are divided by the element size to get the strides handed to the kernel. */
+    const npy_float *src1 = (npy_float*)args[0];
+    const npy_float *src2 = (npy_float*)args[1];
+          npy_float *dst  = (npy_float*)args[2];
+    const int lsize = sizeof(src1[0]);
+    const npy_intp ssrc1 = steps[0] / lsize, ssrc2 = steps[1] / lsize, sdst = steps[2] / lsize;
+    const npy_intp len = dimensions[0];
+    /* Every divided step must be an exact multiple of lsize, including the output step steps[2]. */
+    assert(len <= 1 || (steps[0] % lsize == 0 && steps[1] % lsize == 0 && steps[2] % lsize == 0));
+    if (!is_mem_overlap(src1, steps[0], dst, steps[2], len) && !is_mem_overlap(src2, steps[1], dst, steps[2], len) &&
+        npyv_loadable_stride_f32(ssrc1) && npyv_loadable_stride_f32(ssrc2) && npyv_storable_stride_f32(sdst)) {
+        simd_atan2_f32(src1, ssrc1, src2, ssrc2, dst, sdst, len);
+        return;
+    }
+#endif
+    /* Scalar path (the only one when the block above is compiled out): npy_atan2f per element pair. */
+    BINARY_LOOP {
+        const npy_float in1 = *(npy_float *)ip1;
+        const npy_float in2 = *(npy_float *)ip2;
+        *(npy_float *)op1 = npy_atan2f(in1, in2);
+    }
+}
+
+
+
diff --git a/numpy/core/src/_generated/loops_unary.dispatch.c b/numpy/core/src/_generated/loops_unary.dispatch.c
new file mode 100644
index 000000000000..ede51e155d93
--- /dev/null
+++ b/numpy/core/src/_generated/loops_unary.dispatch.c
@@ -0,0 +1,8982 @@
+#line 1 "numpy/core/src/umath/loops_unary.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*@targets
+ ** $maxopt baseline
+ ** neon asimd
+ ** sse2 avx2 avx512_skx
+ ** vsx2
+ ** vx vxe
+ **/
+
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/*******************************************************************************
+ ** Scalar ops
+ ******************************************************************************/
+#define scalar_negative(X) (-(X))  // argument parenthesized so compound expressions such as a + b negate as a whole
+
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+
+#line 36
+static NPY_INLINE npyv_s8
+npyv_negative_s8(npyv_s8 v)  // lane-wise two's complement negation of v
+{
+#if !(defined(NPY_HAVE_NEON) && (defined(__aarch64__) || 8 < 64))
+    // generic path: (v ^ -1) - (-1) == ~v + 1 == -v
+    const npyv_s8 all_ones = npyv_setall_s8((npyv_lanetype_s8)-1);
+    return npyv_sub_s8(npyv_xor_s8(v, all_ones), all_ones);
+#else  // NEON: negate through the signed view of the lanes
+    return npyv_reinterpret_s8_s8(vnegq_s8(npyv_reinterpret_s8_s8(v)));
+#endif
+}
+
+#line 36
+static NPY_INLINE npyv_u8
+npyv_negative_u8(npyv_u8 v)  // lane-wise two's complement negation of v
+{
+#if !(defined(NPY_HAVE_NEON) && (defined(__aarch64__) || 8 < 64))
+    // generic path: (v ^ -1) - (-1) == ~v + 1 == -v
+    const npyv_u8 all_ones = npyv_setall_u8((npyv_lanetype_u8)-1);
+    return npyv_sub_u8(npyv_xor_u8(v, all_ones), all_ones);
+#else  // NEON: negate through the signed view of the lanes
+    return npyv_reinterpret_u8_s8(vnegq_s8(npyv_reinterpret_s8_u8(v)));
+#endif
+}
+
+#line 36
+static NPY_INLINE npyv_s16
+npyv_negative_s16(npyv_s16 v)  // lane-wise two's complement negation of v
+{
+#if !(defined(NPY_HAVE_NEON) && (defined(__aarch64__) || 16 < 64))
+    // generic path: (v ^ -1) - (-1) == ~v + 1 == -v
+    const npyv_s16 all_ones = npyv_setall_s16((npyv_lanetype_s16)-1);
+    return npyv_sub_s16(npyv_xor_s16(v, all_ones), all_ones);
+#else  // NEON: negate through the signed view of the lanes
+    return npyv_reinterpret_s16_s16(vnegq_s16(npyv_reinterpret_s16_s16(v)));
+#endif
+}
+
+#line 36
+static NPY_INLINE npyv_u16
+npyv_negative_u16(npyv_u16 v)  // lane-wise two's complement negation of v
+{
+#if !(defined(NPY_HAVE_NEON) && (defined(__aarch64__) || 16 < 64))
+    // generic path: (v ^ -1) - (-1) == ~v + 1 == -v
+    const npyv_u16 all_ones = npyv_setall_u16((npyv_lanetype_u16)-1);
+    return npyv_sub_u16(npyv_xor_u16(v, all_ones), all_ones);
+#else  // NEON: negate through the signed view of the lanes
+    return npyv_reinterpret_u16_s16(vnegq_s16(npyv_reinterpret_s16_u16(v)));
+#endif
+}
+
+#line 36
+static NPY_INLINE npyv_s32
+npyv_negative_s32(npyv_s32 v)  // lane-wise two's complement negation of v
+{
+#if !(defined(NPY_HAVE_NEON) && (defined(__aarch64__) || 32 < 64))
+    // generic path: (v ^ -1) - (-1) == ~v + 1 == -v
+    const npyv_s32 all_ones = npyv_setall_s32((npyv_lanetype_s32)-1);
+    return npyv_sub_s32(npyv_xor_s32(v, all_ones), all_ones);
+#else  // NEON: negate through the signed view of the lanes
+    return npyv_reinterpret_s32_s32(vnegq_s32(npyv_reinterpret_s32_s32(v)));
+#endif
+}
+
+#line 36
+static NPY_INLINE npyv_u32
+npyv_negative_u32(npyv_u32 v)  // lane-wise two's complement negation of v
+{
+#if !(defined(NPY_HAVE_NEON) && (defined(__aarch64__) || 32 < 64))
+    // generic path: (v ^ -1) - (-1) == ~v + 1 == -v
+    const npyv_u32 all_ones = npyv_setall_u32((npyv_lanetype_u32)-1);
+    return npyv_sub_u32(npyv_xor_u32(v, all_ones), all_ones);
+#else  // NEON: negate through the signed view of the lanes
+    return npyv_reinterpret_u32_s32(vnegq_s32(npyv_reinterpret_s32_u32(v)));
+#endif
+}
+
+#line 36
+static NPY_INLINE npyv_s64
+npyv_negative_s64(npyv_s64 v)  // lane-wise two's complement negation of v
+{
+#if !(defined(NPY_HAVE_NEON) && (defined(__aarch64__) || 64 < 64))
+    // generic path: (v ^ -1) - (-1) == ~v + 1 == -v
+    const npyv_s64 all_ones = npyv_setall_s64((npyv_lanetype_s64)-1);
+    return npyv_sub_s64(npyv_xor_s64(v, all_ones), all_ones);
+#else  // NEON: negate through the signed view of the lanes
+    return npyv_reinterpret_s64_s64(vnegq_s64(npyv_reinterpret_s64_s64(v)));
+#endif
+}
+
+#line 36
+static NPY_INLINE npyv_u64
+npyv_negative_u64(npyv_u64 v)  // lane-wise two's complement negation of v
+{
+#if !(defined(NPY_HAVE_NEON) && (defined(__aarch64__) || 64 < 64))
+    // generic path: (v ^ -1) - (-1) == ~v + 1 == -v
+    const npyv_u64 all_ones = npyv_setall_u64((npyv_lanetype_u64)-1);
+    return npyv_sub_u64(npyv_xor_u64(v, all_ones), all_ones);
+#else  // NEON: negate through the signed view of the lanes
+    return npyv_reinterpret_u64_s64(vnegq_s64(npyv_reinterpret_s64_u64(v)));
+#endif
+}
+
+
+#line 54
+#if NPY_SIMD_F32
+static NPY_INLINE npyv_f32
+npyv_negative_f32(npyv_f32 v)  // lane-wise sign flip of v
+{
+#ifndef NPY_HAVE_NEON
+    // generic path: XOR every lane with a vector filled with -0.f
+    const npyv_f32 sign_bit = npyv_setall_f32(-0.f);
+    return npyv_xor_f32(v, sign_bit);
+#else  // NEON: call the vnegq_f32 intrinsic directly
+    return vnegq_f32(v);
+#endif
+}
+#endif // NPY_SIMD_F32
+
+#line 54
+#if NPY_SIMD_F64
+static NPY_INLINE npyv_f64
+npyv_negative_f64(npyv_f64 v)  // lane-wise sign flip of v
+{
+#ifndef NPY_HAVE_NEON
+    // generic path: XOR every lane with a vector filled with -0.
+    const npyv_f64 sign_bit = npyv_setall_f64(-0.);
+    return npyv_xor_f64(v, sign_bit);
+#else  // NEON: call the vnegq_f64 intrinsic directly
+    return vnegq_f64(v);
+#endif
+}
+#endif // NPY_SIMD_F64
+
+
+#endif // NPY_SIMD
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+#line 80
+#line 85
+#if NPY_SIMD
+#if 4 < 1  // NOTE(review): the literal 4 is presumably the unroll factor substituted by the template - confirm in the .src
+#error "Unroll must be at least 1"
+#elif NPY_SIMD != 128 && 4 > 2
+// Avoid memory bandwidth bottleneck for larger SIMD
+#define UNROLL 2  // taken whenever NPY_SIMD is not 128
+#else
+#define UNROLL 4  // reached only when NPY_SIMD == 128
+#endif
+// Contiguous input and output: negate `len` s8 elements from `ip` into `op`.
+static NPY_INLINE void
+simd_unary_cc_negative_s8(const npyv_lanetype_s8 *ip,
+                             npyv_lanetype_s8 *op,
+                             npy_intp len)
+{
+    const int lanes = npyv_nlanes_s8;
+    const int chunk = lanes * UNROLL;
+
+    // bulk: UNROLL vectors per pass
+    while (len >= chunk) {
+    #line 108
+    #if UNROLL > 0
+        npyv_s8 src_0 = npyv_load_s8(ip + 0 * lanes);
+        npyv_s8 neg_0 = npyv_negative_s8(src_0);
+        npyv_store_s8(op + 0 * lanes, neg_0);
+    #endif
+
+#line 108
+    #if UNROLL > 1
+        npyv_s8 src_1 = npyv_load_s8(ip + 1 * lanes);
+        npyv_s8 neg_1 = npyv_negative_s8(src_1);
+        npyv_store_s8(op + 1 * lanes, neg_1);
+    #endif
+
+#line 108
+    #if UNROLL > 2
+        npyv_s8 src_2 = npyv_load_s8(ip + 2 * lanes);
+        npyv_s8 neg_2 = npyv_negative_s8(src_2);
+        npyv_store_s8(op + 2 * lanes, neg_2);
+    #endif
+
+#line 108
+    #if UNROLL > 3
+        npyv_s8 src_3 = npyv_load_s8(ip + 3 * lanes);
+        npyv_s8 neg_3 = npyv_negative_s8(src_3);
+        npyv_store_s8(op + 3 * lanes, neg_3);
+    #endif
+
+#line 108
+    #if UNROLL > 4
+        npyv_s8 src_4 = npyv_load_s8(ip + 4 * lanes);
+        npyv_s8 neg_4 = npyv_negative_s8(src_4);
+        npyv_store_s8(op + 4 * lanes, neg_4);
+    #endif
+
+#line 108
+    #if UNROLL > 5
+        npyv_s8 src_5 = npyv_load_s8(ip + 5 * lanes);
+        npyv_s8 neg_5 = npyv_negative_s8(src_5);
+        npyv_store_s8(op + 5 * lanes, neg_5);
+    #endif
+
+#line 108
+    #if UNROLL > 6
+        npyv_s8 src_6 = npyv_load_s8(ip + 6 * lanes);
+        npyv_s8 neg_6 = npyv_negative_s8(src_6);
+        npyv_store_s8(op + 6 * lanes, neg_6);
+    #endif
+
+#line 108
+    #if UNROLL > 7
+        npyv_s8 src_7 = npyv_load_s8(ip + 7 * lanes);
+        npyv_s8 neg_7 = npyv_negative_s8(src_7);
+        npyv_store_s8(op + 7 * lanes, neg_7);
+    #endif
+
+#line 108
+    #if UNROLL > 8
+        npyv_s8 src_8 = npyv_load_s8(ip + 8 * lanes);
+        npyv_s8 neg_8 = npyv_negative_s8(src_8);
+        npyv_store_s8(op + 8 * lanes, neg_8);
+    #endif
+
+#line 108
+    #if UNROLL > 9
+        npyv_s8 src_9 = npyv_load_s8(ip + 9 * lanes);
+        npyv_s8 neg_9 = npyv_negative_s8(src_9);
+        npyv_store_s8(op + 9 * lanes, neg_9);
+    #endif
+
+#line 108
+    #if UNROLL > 10
+        npyv_s8 src_10 = npyv_load_s8(ip + 10 * lanes);
+        npyv_s8 neg_10 = npyv_negative_s8(src_10);
+        npyv_store_s8(op + 10 * lanes, neg_10);
+    #endif
+
+#line 108
+    #if UNROLL > 11
+        npyv_s8 src_11 = npyv_load_s8(ip + 11 * lanes);
+        npyv_s8 neg_11 = npyv_negative_s8(src_11);
+        npyv_store_s8(op + 11 * lanes, neg_11);
+    #endif
+
+#line 108
+    #if UNROLL > 12
+        npyv_s8 src_12 = npyv_load_s8(ip + 12 * lanes);
+        npyv_s8 neg_12 = npyv_negative_s8(src_12);
+        npyv_store_s8(op + 12 * lanes, neg_12);
+    #endif
+
+#line 108
+    #if UNROLL > 13
+        npyv_s8 src_13 = npyv_load_s8(ip + 13 * lanes);
+        npyv_s8 neg_13 = npyv_negative_s8(src_13);
+        npyv_store_s8(op + 13 * lanes, neg_13);
+    #endif
+
+#line 108
+    #if UNROLL > 14
+        npyv_s8 src_14 = npyv_load_s8(ip + 14 * lanes);
+        npyv_s8 neg_14 = npyv_negative_s8(src_14);
+        npyv_store_s8(op + 14 * lanes, neg_14);
+    #endif
+
+#line 108
+    #if UNROLL > 15
+        npyv_s8 src_15 = npyv_load_s8(ip + 15 * lanes);
+        npyv_s8 neg_15 = npyv_negative_s8(src_15);
+        npyv_store_s8(op + 15 * lanes, neg_15);
+    #endif
+        len -= chunk; ip += chunk; op += chunk;
+    }
+    // remaining whole vectors, one per pass
+    for (; len >= lanes; len -= lanes, ip += lanes, op += lanes) {
+        npyv_s8 src = npyv_load_s8(ip);
+        npyv_s8 neg = npyv_negative_s8(src);
+        npyv_store_s8(op, neg);
+    }
+    // tail shorter than one vector: plain scalar negation
+    for (npy_intp i = 0; i < len; ++i) {
+        op[i] = scalar_negative(ip[i]);
+    }
+}
+
+#if 0
+// Contiguous input, output strided by `ostride` elements (currently compiled out by the enclosing #if 0).
+static NPY_INLINE void
+simd_unary_cn_negative_s8(const npyv_lanetype_s8 *ip,
+                             npyv_lanetype_s8 *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int lanes = npyv_nlanes_s8;
+    const int chunk = lanes * UNROLL;
+
+    // bulk: UNROLL vectors per pass
+    while (len >= chunk) {
+    #line 142
+    #if UNROLL > 0
+        npyv_s8 src_0 = npyv_load_s8(ip + 0 * lanes);
+        npyv_s8 neg_0 = npyv_negative_s8(src_0);
+        npyv_storen_s8(op + 0 * lanes * ostride, ostride, neg_0);
+    #endif
+
+#line 142
+    #if UNROLL > 1
+        npyv_s8 src_1 = npyv_load_s8(ip + 1 * lanes);
+        npyv_s8 neg_1 = npyv_negative_s8(src_1);
+        npyv_storen_s8(op + 1 * lanes * ostride, ostride, neg_1);
+    #endif
+
+#line 142
+    #if UNROLL > 2
+        npyv_s8 src_2 = npyv_load_s8(ip + 2 * lanes);
+        npyv_s8 neg_2 = npyv_negative_s8(src_2);
+        npyv_storen_s8(op + 2 * lanes * ostride, ostride, neg_2);
+    #endif
+
+#line 142
+    #if UNROLL > 3
+        npyv_s8 src_3 = npyv_load_s8(ip + 3 * lanes);
+        npyv_s8 neg_3 = npyv_negative_s8(src_3);
+        npyv_storen_s8(op + 3 * lanes * ostride, ostride, neg_3);
+    #endif
+
+#line 142
+    #if UNROLL > 4
+        npyv_s8 src_4 = npyv_load_s8(ip + 4 * lanes);
+        npyv_s8 neg_4 = npyv_negative_s8(src_4);
+        npyv_storen_s8(op + 4 * lanes * ostride, ostride, neg_4);
+    #endif
+
+#line 142
+    #if UNROLL > 5
+        npyv_s8 src_5 = npyv_load_s8(ip + 5 * lanes);
+        npyv_s8 neg_5 = npyv_negative_s8(src_5);
+        npyv_storen_s8(op + 5 * lanes * ostride, ostride, neg_5);
+    #endif
+
+#line 142
+    #if UNROLL > 6
+        npyv_s8 src_6 = npyv_load_s8(ip + 6 * lanes);
+        npyv_s8 neg_6 = npyv_negative_s8(src_6);
+        npyv_storen_s8(op + 6 * lanes * ostride, ostride, neg_6);
+    #endif
+
+#line 142
+    #if UNROLL > 7
+        npyv_s8 src_7 = npyv_load_s8(ip + 7 * lanes);
+        npyv_s8 neg_7 = npyv_negative_s8(src_7);
+        npyv_storen_s8(op + 7 * lanes * ostride, ostride, neg_7);
+    #endif
+
+#line 142
+    #if UNROLL > 8
+        npyv_s8 src_8 = npyv_load_s8(ip + 8 * lanes);
+        npyv_s8 neg_8 = npyv_negative_s8(src_8);
+        npyv_storen_s8(op + 8 * lanes * ostride, ostride, neg_8);
+    #endif
+
+#line 142
+    #if UNROLL > 9
+        npyv_s8 src_9 = npyv_load_s8(ip + 9 * lanes);
+        npyv_s8 neg_9 = npyv_negative_s8(src_9);
+        npyv_storen_s8(op + 9 * lanes * ostride, ostride, neg_9);
+    #endif
+
+#line 142
+    #if UNROLL > 10
+        npyv_s8 src_10 = npyv_load_s8(ip + 10 * lanes);
+        npyv_s8 neg_10 = npyv_negative_s8(src_10);
+        npyv_storen_s8(op + 10 * lanes * ostride, ostride, neg_10);
+    #endif
+
+#line 142
+    #if UNROLL > 11
+        npyv_s8 src_11 = npyv_load_s8(ip + 11 * lanes);
+        npyv_s8 neg_11 = npyv_negative_s8(src_11);
+        npyv_storen_s8(op + 11 * lanes * ostride, ostride, neg_11);
+    #endif
+
+#line 142
+    #if UNROLL > 12
+        npyv_s8 src_12 = npyv_load_s8(ip + 12 * lanes);
+        npyv_s8 neg_12 = npyv_negative_s8(src_12);
+        npyv_storen_s8(op + 12 * lanes * ostride, ostride, neg_12);
+    #endif
+
+#line 142
+    #if UNROLL > 13
+        npyv_s8 src_13 = npyv_load_s8(ip + 13 * lanes);
+        npyv_s8 neg_13 = npyv_negative_s8(src_13);
+        npyv_storen_s8(op + 13 * lanes * ostride, ostride, neg_13);
+    #endif
+
+#line 142
+    #if UNROLL > 14
+        npyv_s8 src_14 = npyv_load_s8(ip + 14 * lanes);
+        npyv_s8 neg_14 = npyv_negative_s8(src_14);
+        npyv_storen_s8(op + 14 * lanes * ostride, ostride, neg_14);
+    #endif
+
+#line 142
+    #if UNROLL > 15
+        npyv_s8 src_15 = npyv_load_s8(ip + 15 * lanes);
+        npyv_s8 neg_15 = npyv_negative_s8(src_15);
+        npyv_storen_s8(op + 15 * lanes * ostride, ostride, neg_15);
+    #endif
+        len -= chunk; ip += chunk; op += ostride * chunk;
+    }
+    // remaining whole vectors, one per pass
+    for (; len >= lanes; len -= lanes, ip += lanes, op += ostride * lanes) {
+        npyv_s8 src = npyv_load_s8(ip);
+        npyv_s8 neg = npyv_negative_s8(src);
+        npyv_storen_s8(op, ostride, neg);
+    }
+    // tail shorter than one vector: plain scalar negation
+    for (npy_intp i = 0; i < len; ++i) {
+        op[i * ostride] = scalar_negative(ip[i]);
+    }
+}
+// Input strided by `istride` elements, contiguous output (currently compiled out by the enclosing #if 0).
+static NPY_INLINE void
+simd_unary_nc_negative_s8(const npyv_lanetype_s8 *ip, npy_intp istride,
+                             npyv_lanetype_s8 *op,
+                             npy_intp len)
+{
+    const int lanes = npyv_nlanes_s8;
+    const int chunk = lanes * UNROLL;
+
+    // bulk: UNROLL vectors per pass
+    while (len >= chunk) {
+    #line 174
+    #if UNROLL > 0
+        npyv_s8 src_0 = npyv_loadn_s8(ip + 0 * lanes * istride, istride);
+        npyv_s8 neg_0 = npyv_negative_s8(src_0);
+        npyv_store_s8(op + 0 * lanes, neg_0);
+    #endif
+
+#line 174
+    #if UNROLL > 1
+        npyv_s8 src_1 = npyv_loadn_s8(ip + 1 * lanes * istride, istride);
+        npyv_s8 neg_1 = npyv_negative_s8(src_1);
+        npyv_store_s8(op + 1 * lanes, neg_1);
+    #endif
+
+#line 174
+    #if UNROLL > 2
+        npyv_s8 src_2 = npyv_loadn_s8(ip + 2 * lanes * istride, istride);
+        npyv_s8 neg_2 = npyv_negative_s8(src_2);
+        npyv_store_s8(op + 2 * lanes, neg_2);
+    #endif
+
+#line 174
+    #if UNROLL > 3
+        npyv_s8 src_3 = npyv_loadn_s8(ip + 3 * lanes * istride, istride);
+        npyv_s8 neg_3 = npyv_negative_s8(src_3);
+        npyv_store_s8(op + 3 * lanes, neg_3);
+    #endif
+
+#line 174
+    #if UNROLL > 4
+        npyv_s8 src_4 = npyv_loadn_s8(ip + 4 * lanes * istride, istride);
+        npyv_s8 neg_4 = npyv_negative_s8(src_4);
+        npyv_store_s8(op + 4 * lanes, neg_4);
+    #endif
+
+#line 174
+    #if UNROLL > 5
+        npyv_s8 src_5 = npyv_loadn_s8(ip + 5 * lanes * istride, istride);
+        npyv_s8 neg_5 = npyv_negative_s8(src_5);
+        npyv_store_s8(op + 5 * lanes, neg_5);
+    #endif
+
+#line 174
+    #if UNROLL > 6
+        npyv_s8 src_6 = npyv_loadn_s8(ip + 6 * lanes * istride, istride);
+        npyv_s8 neg_6 = npyv_negative_s8(src_6);
+        npyv_store_s8(op + 6 * lanes, neg_6);
+    #endif
+
+#line 174
+    #if UNROLL > 7
+        npyv_s8 src_7 = npyv_loadn_s8(ip + 7 * lanes * istride, istride);
+        npyv_s8 neg_7 = npyv_negative_s8(src_7);
+        npyv_store_s8(op + 7 * lanes, neg_7);
+    #endif
+
+#line 174
+    #if UNROLL > 8
+        npyv_s8 src_8 = npyv_loadn_s8(ip + 8 * lanes * istride, istride);
+        npyv_s8 neg_8 = npyv_negative_s8(src_8);
+        npyv_store_s8(op + 8 * lanes, neg_8);
+    #endif
+
+#line 174
+    #if UNROLL > 9
+        npyv_s8 src_9 = npyv_loadn_s8(ip + 9 * lanes * istride, istride);
+        npyv_s8 neg_9 = npyv_negative_s8(src_9);
+        npyv_store_s8(op + 9 * lanes, neg_9);
+    #endif
+
+#line 174
+    #if UNROLL > 10
+        npyv_s8 src_10 = npyv_loadn_s8(ip + 10 * lanes * istride, istride);
+        npyv_s8 neg_10 = npyv_negative_s8(src_10);
+        npyv_store_s8(op + 10 * lanes, neg_10);
+    #endif
+
+#line 174
+    #if UNROLL > 11
+        npyv_s8 src_11 = npyv_loadn_s8(ip + 11 * lanes * istride, istride);
+        npyv_s8 neg_11 = npyv_negative_s8(src_11);
+        npyv_store_s8(op + 11 * lanes, neg_11);
+    #endif
+
+#line 174
+    #if UNROLL > 12
+        npyv_s8 src_12 = npyv_loadn_s8(ip + 12 * lanes * istride, istride);
+        npyv_s8 neg_12 = npyv_negative_s8(src_12);
+        npyv_store_s8(op + 12 * lanes, neg_12);
+    #endif
+
+#line 174
+    #if UNROLL > 13
+        npyv_s8 src_13 = npyv_loadn_s8(ip + 13 * lanes * istride, istride);
+        npyv_s8 neg_13 = npyv_negative_s8(src_13);
+        npyv_store_s8(op + 13 * lanes, neg_13);
+    #endif
+
+#line 174
+    #if UNROLL > 14
+        npyv_s8 src_14 = npyv_loadn_s8(ip + 14 * lanes * istride, istride);
+        npyv_s8 neg_14 = npyv_negative_s8(src_14);
+        npyv_store_s8(op + 14 * lanes, neg_14);
+    #endif
+
+#line 174
+    #if UNROLL > 15
+        npyv_s8 src_15 = npyv_loadn_s8(ip + 15 * lanes * istride, istride);
+        npyv_s8 neg_15 = npyv_negative_s8(src_15);
+        npyv_store_s8(op + 15 * lanes, neg_15);
+    #endif
+        len -= chunk; ip += istride * chunk; op += chunk;
+    }
+    // remaining whole vectors, one per pass
+    for (; len >= lanes; len -= lanes, ip += istride * lanes, op += lanes) {
+        npyv_s8 src = npyv_loadn_s8(ip, istride);
+        npyv_s8 neg = npyv_negative_s8(src);
+        npyv_store_s8(op, neg);
+    }
+    // tail shorter than one vector: plain scalar negation
+    for (npy_intp i = 0; i < len; ++i) {
+        op[i] = scalar_negative(ip[i * istride]);
+    }
+}
+// non-contiguous input and output: element i is read from ip[i*istride] and its result is written to op[i*ostride]
+// limit unroll to 2x (after this clamp only the "UNROLL > 0" and "UNROLL > 1" sections below can be compiled)
+#if UNROLL > 2
+#undef UNROLL
+#define UNROLL 2
+#endif
+// X86 does better with unrolled scalar for heavy non-contiguous, so this kernel is only compiled when NPY_HAVE_SSE2 is not defined
+#ifndef NPY_HAVE_SSE2
+static NPY_INLINE void
+simd_unary_nn_negative_s8(const npyv_lanetype_s8 *ip, npy_intp istride,  // input pointer and stride; the stride counts elements, not bytes
+                             npyv_lanetype_s8 *op, npy_intp ostride,  // output pointer and stride; the stride counts elements, not bytes
+                             npy_intp len)  // number of elements to process
+{
+    const int vstep = npyv_nlanes_s8;  // elements handled by one vector operation
+    const int wstep = vstep * UNROLL;  // elements handled by one pass of the unrolled loop
+
+    // unrolled vector loop: one pass covers UNROLL vectors (wstep elements); sections whose "#if UNROLL > k" test fails are dropped by the preprocessor
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+    #line 213
+    #if UNROLL > 0
+        npyv_s8 v_0 = npyv_loadn_s8(ip + 0 * vstep * istride, istride);  // strided load of vector k of this pass (k = 0 here)
+        npyv_s8 r_0 = npyv_negative_s8(v_0);  // vector form of the negative operation
+        npyv_storen_s8(op + 0 * vstep * ostride, ostride, r_0);  // strided store to the matching output slots
+    #endif
+    
+#line 213
+    #if UNROLL > 1
+        npyv_s8 v_1 = npyv_loadn_s8(ip + 1 * vstep * istride, istride);
+        npyv_s8 r_1 = npyv_negative_s8(v_1);
+        npyv_storen_s8(op + 1 * vstep * ostride, ostride, r_1);
+    #endif
+    
+#line 213
+    #if UNROLL > 2
+        npyv_s8 v_2 = npyv_loadn_s8(ip + 2 * vstep * istride, istride);
+        npyv_s8 r_2 = npyv_negative_s8(v_2);
+        npyv_storen_s8(op + 2 * vstep * ostride, ostride, r_2);
+    #endif
+    
+#line 213
+    #if UNROLL > 3
+        npyv_s8 v_3 = npyv_loadn_s8(ip + 3 * vstep * istride, istride);
+        npyv_s8 r_3 = npyv_negative_s8(v_3);
+        npyv_storen_s8(op + 3 * vstep * ostride, ostride, r_3);
+    #endif
+    
+#line 213
+    #if UNROLL > 4
+        npyv_s8 v_4 = npyv_loadn_s8(ip + 4 * vstep * istride, istride);
+        npyv_s8 r_4 = npyv_negative_s8(v_4);
+        npyv_storen_s8(op + 4 * vstep * ostride, ostride, r_4);
+    #endif
+    
+#line 213
+    #if UNROLL > 5
+        npyv_s8 v_5 = npyv_loadn_s8(ip + 5 * vstep * istride, istride);
+        npyv_s8 r_5 = npyv_negative_s8(v_5);
+        npyv_storen_s8(op + 5 * vstep * ostride, ostride, r_5);
+    #endif
+    
+#line 213
+    #if UNROLL > 6
+        npyv_s8 v_6 = npyv_loadn_s8(ip + 6 * vstep * istride, istride);
+        npyv_s8 r_6 = npyv_negative_s8(v_6);
+        npyv_storen_s8(op + 6 * vstep * ostride, ostride, r_6);
+    #endif
+    
+#line 213
+    #if UNROLL > 7
+        npyv_s8 v_7 = npyv_loadn_s8(ip + 7 * vstep * istride, istride);
+        npyv_s8 r_7 = npyv_negative_s8(v_7);
+        npyv_storen_s8(op + 7 * vstep * ostride, ostride, r_7);
+    #endif
+    
+#line 213
+    #if UNROLL > 8
+        npyv_s8 v_8 = npyv_loadn_s8(ip + 8 * vstep * istride, istride);
+        npyv_s8 r_8 = npyv_negative_s8(v_8);
+        npyv_storen_s8(op + 8 * vstep * ostride, ostride, r_8);
+    #endif
+    
+#line 213
+    #if UNROLL > 9
+        npyv_s8 v_9 = npyv_loadn_s8(ip + 9 * vstep * istride, istride);
+        npyv_s8 r_9 = npyv_negative_s8(v_9);
+        npyv_storen_s8(op + 9 * vstep * ostride, ostride, r_9);
+    #endif
+    
+#line 213
+    #if UNROLL > 10
+        npyv_s8 v_10 = npyv_loadn_s8(ip + 10 * vstep * istride, istride);
+        npyv_s8 r_10 = npyv_negative_s8(v_10);
+        npyv_storen_s8(op + 10 * vstep * ostride, ostride, r_10);
+    #endif
+    
+#line 213
+    #if UNROLL > 11
+        npyv_s8 v_11 = npyv_loadn_s8(ip + 11 * vstep * istride, istride);
+        npyv_s8 r_11 = npyv_negative_s8(v_11);
+        npyv_storen_s8(op + 11 * vstep * ostride, ostride, r_11);
+    #endif
+    
+#line 213
+    #if UNROLL > 12
+        npyv_s8 v_12 = npyv_loadn_s8(ip + 12 * vstep * istride, istride);
+        npyv_s8 r_12 = npyv_negative_s8(v_12);
+        npyv_storen_s8(op + 12 * vstep * ostride, ostride, r_12);
+    #endif
+    
+#line 213
+    #if UNROLL > 13
+        npyv_s8 v_13 = npyv_loadn_s8(ip + 13 * vstep * istride, istride);
+        npyv_s8 r_13 = npyv_negative_s8(v_13);
+        npyv_storen_s8(op + 13 * vstep * ostride, ostride, r_13);
+    #endif
+    
+#line 213
+    #if UNROLL > 14
+        npyv_s8 v_14 = npyv_loadn_s8(ip + 14 * vstep * istride, istride);
+        npyv_s8 r_14 = npyv_negative_s8(v_14);
+        npyv_storen_s8(op + 14 * vstep * ostride, ostride, r_14);
+    #endif
+    
+#line 213
+    #if UNROLL > 15
+        npyv_s8 v_15 = npyv_loadn_s8(ip + 15 * vstep * istride, istride);
+        npyv_s8 r_15 = npyv_negative_s8(v_15);
+        npyv_storen_s8(op + 15 * vstep * ostride, ostride, r_15);
+    #endif
+    
+    }
+    // single vector loop: consumes the remaining whole vectors, vstep elements at a time
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+        npyv_s8 v = npyv_loadn_s8(ip, istride);
+        npyv_s8 r = npyv_negative_s8(v);
+        npyv_storen_s8(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements are left at this point)
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = scalar_negative(*ip);
+    }
+}
+#endif // NPY_HAVE_SSE2
+#endif // 0
+#undef UNROLL
+#endif // NPY_SIMD
+/*end repeat1**/
+
+#line 80
+#line 85
+#if NPY_SIMD
+#if 4 < 1
+#error "Unroll must be at least 1"
+#elif NPY_SIMD != 128 && 4 > 2
+// Avoid memory bandwidth bottleneck for larger SIMD: any NPY_SIMD value other than 128 gets 2x unrolling
+#define UNROLL 2
+#else // reached when NPY_SIMD == 128: keep the 4x unroll factor
+#define UNROLL 4
+#endif
+// contiguous inputs and output: for each of the len elements, op[i] receives the result of the negative operation on ip[i].
+static NPY_INLINE void
+simd_unary_cc_negative_u8(const npyv_lanetype_u8 *ip,  // input elements, read consecutively
+                             npyv_lanetype_u8 *op,  // output elements, written consecutively
+                             npy_intp len)  // number of elements to process
+{
+    const int vstep = npyv_nlanes_u8;  // elements handled by one vector operation
+    const int wstep = vstep * UNROLL;  // elements handled by one pass of the unrolled loop
+
+    // unrolled vector loop: one pass covers UNROLL vectors (wstep elements); sections whose "#if UNROLL > k" test fails are dropped by the preprocessor
+    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+    #line 108
+    #if UNROLL > 0
+        npyv_u8 v_0 = npyv_load_u8(ip + 0 * vstep);  // load vector k of this pass (k = 0 here)
+        npyv_u8 r_0 = npyv_negative_u8(v_0);  // vector form of the negative operation
+        npyv_store_u8(op + 0 * vstep, r_0);  // store to the matching output position
+    #endif
+    
+#line 108
+    #if UNROLL > 1
+        npyv_u8 v_1 = npyv_load_u8(ip + 1 * vstep);
+        npyv_u8 r_1 = npyv_negative_u8(v_1);
+        npyv_store_u8(op + 1 * vstep, r_1);
+    #endif
+    
+#line 108
+    #if UNROLL > 2
+        npyv_u8 v_2 = npyv_load_u8(ip + 2 * vstep);
+        npyv_u8 r_2 = npyv_negative_u8(v_2);
+        npyv_store_u8(op + 2 * vstep, r_2);
+    #endif
+    
+#line 108
+    #if UNROLL > 3
+        npyv_u8 v_3 = npyv_load_u8(ip + 3 * vstep);
+        npyv_u8 r_3 = npyv_negative_u8(v_3);
+        npyv_store_u8(op + 3 * vstep, r_3);
+    #endif
+    
+#line 108
+    #if UNROLL > 4
+        npyv_u8 v_4 = npyv_load_u8(ip + 4 * vstep);
+        npyv_u8 r_4 = npyv_negative_u8(v_4);
+        npyv_store_u8(op + 4 * vstep, r_4);
+    #endif
+    
+#line 108
+    #if UNROLL > 5
+        npyv_u8 v_5 = npyv_load_u8(ip + 5 * vstep);
+        npyv_u8 r_5 = npyv_negative_u8(v_5);
+        npyv_store_u8(op + 5 * vstep, r_5);
+    #endif
+    
+#line 108
+    #if UNROLL > 6
+        npyv_u8 v_6 = npyv_load_u8(ip + 6 * vstep);
+        npyv_u8 r_6 = npyv_negative_u8(v_6);
+        npyv_store_u8(op + 6 * vstep, r_6);
+    #endif
+    
+#line 108
+    #if UNROLL > 7
+        npyv_u8 v_7 = npyv_load_u8(ip + 7 * vstep);
+        npyv_u8 r_7 = npyv_negative_u8(v_7);
+        npyv_store_u8(op + 7 * vstep, r_7);
+    #endif
+    
+#line 108
+    #if UNROLL > 8
+        npyv_u8 v_8 = npyv_load_u8(ip + 8 * vstep);
+        npyv_u8 r_8 = npyv_negative_u8(v_8);
+        npyv_store_u8(op + 8 * vstep, r_8);
+    #endif
+    
+#line 108
+    #if UNROLL > 9
+        npyv_u8 v_9 = npyv_load_u8(ip + 9 * vstep);
+        npyv_u8 r_9 = npyv_negative_u8(v_9);
+        npyv_store_u8(op + 9 * vstep, r_9);
+    #endif
+    
+#line 108
+    #if UNROLL > 10
+        npyv_u8 v_10 = npyv_load_u8(ip + 10 * vstep);
+        npyv_u8 r_10 = npyv_negative_u8(v_10);
+        npyv_store_u8(op + 10 * vstep, r_10);
+    #endif
+    
+#line 108
+    #if UNROLL > 11
+        npyv_u8 v_11 = npyv_load_u8(ip + 11 * vstep);
+        npyv_u8 r_11 = npyv_negative_u8(v_11);
+        npyv_store_u8(op + 11 * vstep, r_11);
+    #endif
+    
+#line 108
+    #if UNROLL > 12
+        npyv_u8 v_12 = npyv_load_u8(ip + 12 * vstep);
+        npyv_u8 r_12 = npyv_negative_u8(v_12);
+        npyv_store_u8(op + 12 * vstep, r_12);
+    #endif
+    
+#line 108
+    #if UNROLL > 13
+        npyv_u8 v_13 = npyv_load_u8(ip + 13 * vstep);
+        npyv_u8 r_13 = npyv_negative_u8(v_13);
+        npyv_store_u8(op + 13 * vstep, r_13);
+    #endif
+    
+#line 108
+    #if UNROLL > 14
+        npyv_u8 v_14 = npyv_load_u8(ip + 14 * vstep);
+        npyv_u8 r_14 = npyv_negative_u8(v_14);
+        npyv_store_u8(op + 14 * vstep, r_14);
+    #endif
+    
+#line 108
+    #if UNROLL > 15
+        npyv_u8 v_15 = npyv_load_u8(ip + 15 * vstep);
+        npyv_u8 r_15 = npyv_negative_u8(v_15);
+        npyv_store_u8(op + 15 * vstep, r_15);
+    #endif
+    
+    }
+    // single vector loop: consumes the remaining whole vectors, vstep elements at a time
+    for (; len >= vstep; len -= vstep, ip += vstep, op +=vstep) {
+        npyv_u8 v = npyv_load_u8(ip);
+        npyv_u8 r = npyv_negative_u8(v);
+        npyv_store_u8(op, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements are left at this point)
+    for (; len > 0; --len, ++ip, ++op) {
+        *op = scalar_negative(*ip);
+    }
+}
+
+#if 0
+// contiguous input, non-contiguous output: ip[i] is read and its result written to op[i*ostride] (currently compiled out by the "#if 0" directly above)
+static NPY_INLINE void
+simd_unary_cn_negative_u8(const npyv_lanetype_u8 *ip,  // input elements, read consecutively
+                             npyv_lanetype_u8 *op, npy_intp ostride,  // output pointer and stride; the stride counts elements, not bytes
+                             npy_intp len)  // number of elements to process
+{
+    const int vstep = npyv_nlanes_u8;  // elements handled by one vector operation
+    const int wstep = vstep * UNROLL;  // elements handled by one pass of the unrolled loop
+
+    // unrolled vector loop: one pass covers UNROLL vectors (wstep elements); sections whose "#if UNROLL > k" test fails are dropped by the preprocessor
+    for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) {
+    #line 142
+    #if UNROLL > 0
+        npyv_u8 v_0 = npyv_load_u8(ip + 0 * vstep);  // load vector k of this pass (k = 0 here)
+        npyv_u8 r_0 = npyv_negative_u8(v_0);  // vector form of the negative operation
+        npyv_storen_u8(op + 0 * vstep * ostride, ostride, r_0);  // strided store to the matching output slots
+    #endif
+    
+#line 142
+    #if UNROLL > 1
+        npyv_u8 v_1 = npyv_load_u8(ip + 1 * vstep);
+        npyv_u8 r_1 = npyv_negative_u8(v_1);
+        npyv_storen_u8(op + 1 * vstep * ostride, ostride, r_1);
+    #endif
+    
+#line 142
+    #if UNROLL > 2
+        npyv_u8 v_2 = npyv_load_u8(ip + 2 * vstep);
+        npyv_u8 r_2 = npyv_negative_u8(v_2);
+        npyv_storen_u8(op + 2 * vstep * ostride, ostride, r_2);
+    #endif
+    
+#line 142
+    #if UNROLL > 3
+        npyv_u8 v_3 = npyv_load_u8(ip + 3 * vstep);
+        npyv_u8 r_3 = npyv_negative_u8(v_3);
+        npyv_storen_u8(op + 3 * vstep * ostride, ostride, r_3);
+    #endif
+    
+#line 142
+    #if UNROLL > 4
+        npyv_u8 v_4 = npyv_load_u8(ip + 4 * vstep);
+        npyv_u8 r_4 = npyv_negative_u8(v_4);
+        npyv_storen_u8(op + 4 * vstep * ostride, ostride, r_4);
+    #endif
+    
+#line 142
+    #if UNROLL > 5
+        npyv_u8 v_5 = npyv_load_u8(ip + 5 * vstep);
+        npyv_u8 r_5 = npyv_negative_u8(v_5);
+        npyv_storen_u8(op + 5 * vstep * ostride, ostride, r_5);
+    #endif
+    
+#line 142
+    #if UNROLL > 6
+        npyv_u8 v_6 = npyv_load_u8(ip + 6 * vstep);
+        npyv_u8 r_6 = npyv_negative_u8(v_6);
+        npyv_storen_u8(op + 6 * vstep * ostride, ostride, r_6);
+    #endif
+    
+#line 142
+    #if UNROLL > 7
+        npyv_u8 v_7 = npyv_load_u8(ip + 7 * vstep);
+        npyv_u8 r_7 = npyv_negative_u8(v_7);
+        npyv_storen_u8(op + 7 * vstep * ostride, ostride, r_7);
+    #endif
+    
+#line 142
+    #if UNROLL > 8
+        npyv_u8 v_8 = npyv_load_u8(ip + 8 * vstep);
+        npyv_u8 r_8 = npyv_negative_u8(v_8);
+        npyv_storen_u8(op + 8 * vstep * ostride, ostride, r_8);
+    #endif
+    
+#line 142
+    #if UNROLL > 9
+        npyv_u8 v_9 = npyv_load_u8(ip + 9 * vstep);
+        npyv_u8 r_9 = npyv_negative_u8(v_9);
+        npyv_storen_u8(op + 9 * vstep * ostride, ostride, r_9);
+    #endif
+    
+#line 142
+    #if UNROLL > 10
+        npyv_u8 v_10 = npyv_load_u8(ip + 10 * vstep);
+        npyv_u8 r_10 = npyv_negative_u8(v_10);
+        npyv_storen_u8(op + 10 * vstep * ostride, ostride, r_10);
+    #endif
+    
+#line 142
+    #if UNROLL > 11
+        npyv_u8 v_11 = npyv_load_u8(ip + 11 * vstep);
+        npyv_u8 r_11 = npyv_negative_u8(v_11);
+        npyv_storen_u8(op + 11 * vstep * ostride, ostride, r_11);
+    #endif
+    
+#line 142
+    #if UNROLL > 12
+        npyv_u8 v_12 = npyv_load_u8(ip + 12 * vstep);
+        npyv_u8 r_12 = npyv_negative_u8(v_12);
+        npyv_storen_u8(op + 12 * vstep * ostride, ostride, r_12);
+    #endif
+    
+#line 142
+    #if UNROLL > 13
+        npyv_u8 v_13 = npyv_load_u8(ip + 13 * vstep);
+        npyv_u8 r_13 = npyv_negative_u8(v_13);
+        npyv_storen_u8(op + 13 * vstep * ostride, ostride, r_13);
+    #endif
+    
+#line 142
+    #if UNROLL > 14
+        npyv_u8 v_14 = npyv_load_u8(ip + 14 * vstep);
+        npyv_u8 r_14 = npyv_negative_u8(v_14);
+        npyv_storen_u8(op + 14 * vstep * ostride, ostride, r_14);
+    #endif
+    
+#line 142
+    #if UNROLL > 15
+        npyv_u8 v_15 = npyv_load_u8(ip + 15 * vstep);
+        npyv_u8 r_15 = npyv_negative_u8(v_15);
+        npyv_storen_u8(op + 15 * vstep * ostride, ostride, r_15);
+    #endif
+    
+    }
+    // single vector loop: consumes the remaining whole vectors, vstep elements at a time
+    for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) {
+        npyv_u8 v = npyv_load_u8(ip);
+        npyv_u8 r = npyv_negative_u8(v);
+        npyv_storen_u8(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements are left at this point)
+    for (; len > 0; --len, ++ip, op += ostride) {
+        *op = scalar_negative(*ip);
+    }
+}
+// non-contiguous input, contiguous output: ip[i*istride] is read and its result written to op[i] (inside the "#if 0" ... "#endif // 0" region, so currently compiled out)
+static NPY_INLINE void
+simd_unary_nc_negative_u8(const npyv_lanetype_u8 *ip, npy_intp istride,  // input pointer and stride; the stride counts elements, not bytes
+                             npyv_lanetype_u8 *op,  // output elements, written consecutively
+                             npy_intp len)  // number of elements to process
+{
+    const int vstep = npyv_nlanes_u8;  // elements handled by one vector operation
+    const int wstep = vstep * UNROLL;  // elements handled by one pass of the unrolled loop
+
+    // unrolled vector loop: one pass covers UNROLL vectors (wstep elements); sections whose "#if UNROLL > k" test fails are dropped by the preprocessor
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) {
+    #line 174
+    #if UNROLL > 0
+        npyv_u8 v_0 = npyv_loadn_u8(ip + 0 * vstep * istride, istride);  // strided load of vector k of this pass (k = 0 here)
+        npyv_u8 r_0 = npyv_negative_u8(v_0);  // vector form of the negative operation
+        npyv_store_u8(op + 0 * vstep, r_0);  // store to the matching output position
+    #endif
+    
+#line 174
+    #if UNROLL > 1
+        npyv_u8 v_1 = npyv_loadn_u8(ip + 1 * vstep * istride, istride);
+        npyv_u8 r_1 = npyv_negative_u8(v_1);
+        npyv_store_u8(op + 1 * vstep, r_1);
+    #endif
+    
+#line 174
+    #if UNROLL > 2
+        npyv_u8 v_2 = npyv_loadn_u8(ip + 2 * vstep * istride, istride);
+        npyv_u8 r_2 = npyv_negative_u8(v_2);
+        npyv_store_u8(op + 2 * vstep, r_2);
+    #endif
+    
+#line 174
+    #if UNROLL > 3
+        npyv_u8 v_3 = npyv_loadn_u8(ip + 3 * vstep * istride, istride);
+        npyv_u8 r_3 = npyv_negative_u8(v_3);
+        npyv_store_u8(op + 3 * vstep, r_3);
+    #endif
+    
+#line 174
+    #if UNROLL > 4
+        npyv_u8 v_4 = npyv_loadn_u8(ip + 4 * vstep * istride, istride);
+        npyv_u8 r_4 = npyv_negative_u8(v_4);
+        npyv_store_u8(op + 4 * vstep, r_4);
+    #endif
+    
+#line 174
+    #if UNROLL > 5
+        npyv_u8 v_5 = npyv_loadn_u8(ip + 5 * vstep * istride, istride);
+        npyv_u8 r_5 = npyv_negative_u8(v_5);
+        npyv_store_u8(op + 5 * vstep, r_5);
+    #endif
+    
+#line 174
+    #if UNROLL > 6
+        npyv_u8 v_6 = npyv_loadn_u8(ip + 6 * vstep * istride, istride);
+        npyv_u8 r_6 = npyv_negative_u8(v_6);
+        npyv_store_u8(op + 6 * vstep, r_6);
+    #endif
+    
+#line 174
+    #if UNROLL > 7
+        npyv_u8 v_7 = npyv_loadn_u8(ip + 7 * vstep * istride, istride);
+        npyv_u8 r_7 = npyv_negative_u8(v_7);
+        npyv_store_u8(op + 7 * vstep, r_7);
+    #endif
+    
+#line 174
+    #if UNROLL > 8
+        npyv_u8 v_8 = npyv_loadn_u8(ip + 8 * vstep * istride, istride);
+        npyv_u8 r_8 = npyv_negative_u8(v_8);
+        npyv_store_u8(op + 8 * vstep, r_8);
+    #endif
+    
+#line 174
+    #if UNROLL > 9
+        npyv_u8 v_9 = npyv_loadn_u8(ip + 9 * vstep * istride, istride);
+        npyv_u8 r_9 = npyv_negative_u8(v_9);
+        npyv_store_u8(op + 9 * vstep, r_9);
+    #endif
+    
+#line 174
+    #if UNROLL > 10
+        npyv_u8 v_10 = npyv_loadn_u8(ip + 10 * vstep * istride, istride);
+        npyv_u8 r_10 = npyv_negative_u8(v_10);
+        npyv_store_u8(op + 10 * vstep, r_10);
+    #endif
+    
+#line 174
+    #if UNROLL > 11
+        npyv_u8 v_11 = npyv_loadn_u8(ip + 11 * vstep * istride, istride);
+        npyv_u8 r_11 = npyv_negative_u8(v_11);
+        npyv_store_u8(op + 11 * vstep, r_11);
+    #endif
+    
+#line 174
+    #if UNROLL > 12
+        npyv_u8 v_12 = npyv_loadn_u8(ip + 12 * vstep * istride, istride);
+        npyv_u8 r_12 = npyv_negative_u8(v_12);
+        npyv_store_u8(op + 12 * vstep, r_12);
+    #endif
+    
+#line 174
+    #if UNROLL > 13
+        npyv_u8 v_13 = npyv_loadn_u8(ip + 13 * vstep * istride, istride);
+        npyv_u8 r_13 = npyv_negative_u8(v_13);
+        npyv_store_u8(op + 13 * vstep, r_13);
+    #endif
+    
+#line 174
+    #if UNROLL > 14
+        npyv_u8 v_14 = npyv_loadn_u8(ip + 14 * vstep * istride, istride);
+        npyv_u8 r_14 = npyv_negative_u8(v_14);
+        npyv_store_u8(op + 14 * vstep, r_14);
+    #endif
+    
+#line 174
+    #if UNROLL > 15
+        npyv_u8 v_15 = npyv_loadn_u8(ip + 15 * vstep * istride, istride);
+        npyv_u8 r_15 = npyv_negative_u8(v_15);
+        npyv_store_u8(op + 15 * vstep, r_15);
+    #endif
+    
+    }
+    // single vector loop: consumes the remaining whole vectors, vstep elements at a time
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) {
+        npyv_u8 v = npyv_loadn_u8(ip, istride);
+        npyv_u8 r = npyv_negative_u8(v);
+        npyv_store_u8(op, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements are left at this point)
+    for (; len > 0; --len, ip += istride, ++op) {
+        *op = scalar_negative(*ip);
+    }
+}
+// non-contiguous input and output: element i is read from ip[i*istride] and its result is written to op[i*ostride]
+// limit unroll to 2x (after this clamp only the "UNROLL > 0" and "UNROLL > 1" sections below can be compiled)
+#if UNROLL > 2
+#undef UNROLL
+#define UNROLL 2
+#endif
+// X86 does better with unrolled scalar for heavy non-contiguous, so this kernel is only compiled when NPY_HAVE_SSE2 is not defined
+#ifndef NPY_HAVE_SSE2
+static NPY_INLINE void
+simd_unary_nn_negative_u8(const npyv_lanetype_u8 *ip, npy_intp istride,  // input pointer and stride; the stride counts elements, not bytes
+                             npyv_lanetype_u8 *op, npy_intp ostride,  // output pointer and stride; the stride counts elements, not bytes
+                             npy_intp len)  // number of elements to process
+{
+    const int vstep = npyv_nlanes_u8;  // elements handled by one vector operation
+    const int wstep = vstep * UNROLL;  // elements handled by one pass of the unrolled loop
+
+    // unrolled vector loop: one pass covers UNROLL vectors (wstep elements); sections whose "#if UNROLL > k" test fails are dropped by the preprocessor
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+    #line 213
+    #if UNROLL > 0
+        npyv_u8 v_0 = npyv_loadn_u8(ip + 0 * vstep * istride, istride);  // strided load of vector k of this pass (k = 0 here)
+        npyv_u8 r_0 = npyv_negative_u8(v_0);  // vector form of the negative operation
+        npyv_storen_u8(op + 0 * vstep * ostride, ostride, r_0);  // strided store to the matching output slots
+    #endif
+    
+#line 213
+    #if UNROLL > 1
+        npyv_u8 v_1 = npyv_loadn_u8(ip + 1 * vstep * istride, istride);
+        npyv_u8 r_1 = npyv_negative_u8(v_1);
+        npyv_storen_u8(op + 1 * vstep * ostride, ostride, r_1);
+    #endif
+    
+#line 213
+    #if UNROLL > 2
+        npyv_u8 v_2 = npyv_loadn_u8(ip + 2 * vstep * istride, istride);
+        npyv_u8 r_2 = npyv_negative_u8(v_2);
+        npyv_storen_u8(op + 2 * vstep * ostride, ostride, r_2);
+    #endif
+    
+#line 213
+    #if UNROLL > 3
+        npyv_u8 v_3 = npyv_loadn_u8(ip + 3 * vstep * istride, istride);
+        npyv_u8 r_3 = npyv_negative_u8(v_3);
+        npyv_storen_u8(op + 3 * vstep * ostride, ostride, r_3);
+    #endif
+    
+#line 213
+    #if UNROLL > 4
+        npyv_u8 v_4 = npyv_loadn_u8(ip + 4 * vstep * istride, istride);
+        npyv_u8 r_4 = npyv_negative_u8(v_4);
+        npyv_storen_u8(op + 4 * vstep * ostride, ostride, r_4);
+    #endif
+    
+#line 213
+    #if UNROLL > 5
+        npyv_u8 v_5 = npyv_loadn_u8(ip + 5 * vstep * istride, istride);
+        npyv_u8 r_5 = npyv_negative_u8(v_5);
+        npyv_storen_u8(op + 5 * vstep * ostride, ostride, r_5);
+    #endif
+    
+#line 213
+    #if UNROLL > 6
+        npyv_u8 v_6 = npyv_loadn_u8(ip + 6 * vstep * istride, istride);
+        npyv_u8 r_6 = npyv_negative_u8(v_6);
+        npyv_storen_u8(op + 6 * vstep * ostride, ostride, r_6);
+    #endif
+    
+#line 213
+    #if UNROLL > 7
+        npyv_u8 v_7 = npyv_loadn_u8(ip + 7 * vstep * istride, istride);
+        npyv_u8 r_7 = npyv_negative_u8(v_7);
+        npyv_storen_u8(op + 7 * vstep * ostride, ostride, r_7);
+    #endif
+    
+#line 213
+    #if UNROLL > 8
+        npyv_u8 v_8 = npyv_loadn_u8(ip + 8 * vstep * istride, istride);
+        npyv_u8 r_8 = npyv_negative_u8(v_8);
+        npyv_storen_u8(op + 8 * vstep * ostride, ostride, r_8);
+    #endif
+    
+#line 213
+    #if UNROLL > 9
+        npyv_u8 v_9 = npyv_loadn_u8(ip + 9 * vstep * istride, istride);
+        npyv_u8 r_9 = npyv_negative_u8(v_9);
+        npyv_storen_u8(op + 9 * vstep * ostride, ostride, r_9);
+    #endif
+    
+#line 213
+    #if UNROLL > 10
+        npyv_u8 v_10 = npyv_loadn_u8(ip + 10 * vstep * istride, istride);
+        npyv_u8 r_10 = npyv_negative_u8(v_10);
+        npyv_storen_u8(op + 10 * vstep * ostride, ostride, r_10);
+    #endif
+    
+#line 213
+    #if UNROLL > 11
+        npyv_u8 v_11 = npyv_loadn_u8(ip + 11 * vstep * istride, istride);
+        npyv_u8 r_11 = npyv_negative_u8(v_11);
+        npyv_storen_u8(op + 11 * vstep * ostride, ostride, r_11);
+    #endif
+    
+#line 213
+    #if UNROLL > 12
+        npyv_u8 v_12 = npyv_loadn_u8(ip + 12 * vstep * istride, istride);
+        npyv_u8 r_12 = npyv_negative_u8(v_12);
+        npyv_storen_u8(op + 12 * vstep * ostride, ostride, r_12);
+    #endif
+    
+#line 213
+    #if UNROLL > 13
+        npyv_u8 v_13 = npyv_loadn_u8(ip + 13 * vstep * istride, istride);
+        npyv_u8 r_13 = npyv_negative_u8(v_13);
+        npyv_storen_u8(op + 13 * vstep * ostride, ostride, r_13);
+    #endif
+    
+#line 213
+    #if UNROLL > 14
+        npyv_u8 v_14 = npyv_loadn_u8(ip + 14 * vstep * istride, istride);
+        npyv_u8 r_14 = npyv_negative_u8(v_14);
+        npyv_storen_u8(op + 14 * vstep * ostride, ostride, r_14);
+    #endif
+    
+#line 213
+    #if UNROLL > 15
+        npyv_u8 v_15 = npyv_loadn_u8(ip + 15 * vstep * istride, istride);
+        npyv_u8 r_15 = npyv_negative_u8(v_15);
+        npyv_storen_u8(op + 15 * vstep * ostride, ostride, r_15);
+    #endif
+    
+    }
+    // single vector loop: consumes the remaining whole vectors, vstep elements at a time
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+        npyv_u8 v = npyv_loadn_u8(ip, istride);
+        npyv_u8 r = npyv_negative_u8(v);
+        npyv_storen_u8(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements are left at this point)
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = scalar_negative(*ip);
+    }
+}
+#endif // NPY_HAVE_SSE2
+#endif // 0
+#undef UNROLL
+#endif // NPY_SIMD
+/*end repeat1**/
+
+#line 80
+#line 85
+#if NPY_SIMD
+#if 4 < 1
+#error "Unroll must be at least 1"
+#elif NPY_SIMD != 128 && 4 > 2
+// Avoid memory bandwidth bottleneck for larger SIMD: any NPY_SIMD value other than 128 gets 2x unrolling
+#define UNROLL 2
+#else // reached when NPY_SIMD == 128: keep the 4x unroll factor
+#define UNROLL 4
+#endif
+// contiguous inputs and output: for each of the len elements, op[i] receives the result of the negative operation on ip[i].
+static NPY_INLINE void
+simd_unary_cc_negative_s16(const npyv_lanetype_s16 *ip,  // input elements, read consecutively
+                             npyv_lanetype_s16 *op,  // output elements, written consecutively
+                             npy_intp len)  // number of elements to process
+{
+    const int vstep = npyv_nlanes_s16;  // elements handled by one vector operation
+    const int wstep = vstep * UNROLL;  // elements handled by one pass of the unrolled loop
+
+    // unrolled vector loop: one pass covers UNROLL vectors (wstep elements); sections whose "#if UNROLL > k" test fails are dropped by the preprocessor
+    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+    #line 108
+    #if UNROLL > 0
+        npyv_s16 v_0 = npyv_load_s16(ip + 0 * vstep);  // load vector k of this pass (k = 0 here)
+        npyv_s16 r_0 = npyv_negative_s16(v_0);  // vector form of the negative operation
+        npyv_store_s16(op + 0 * vstep, r_0);  // store to the matching output position
+    #endif
+    
+#line 108
+    #if UNROLL > 1
+        npyv_s16 v_1 = npyv_load_s16(ip + 1 * vstep);
+        npyv_s16 r_1 = npyv_negative_s16(v_1);
+        npyv_store_s16(op + 1 * vstep, r_1);
+    #endif
+    
+#line 108
+    #if UNROLL > 2
+        npyv_s16 v_2 = npyv_load_s16(ip + 2 * vstep);
+        npyv_s16 r_2 = npyv_negative_s16(v_2);
+        npyv_store_s16(op + 2 * vstep, r_2);
+    #endif
+    
+#line 108
+    #if UNROLL > 3
+        npyv_s16 v_3 = npyv_load_s16(ip + 3 * vstep);
+        npyv_s16 r_3 = npyv_negative_s16(v_3);
+        npyv_store_s16(op + 3 * vstep, r_3);
+    #endif
+    
+#line 108
+    #if UNROLL > 4
+        npyv_s16 v_4 = npyv_load_s16(ip + 4 * vstep);
+        npyv_s16 r_4 = npyv_negative_s16(v_4);
+        npyv_store_s16(op + 4 * vstep, r_4);
+    #endif
+    
+#line 108
+    #if UNROLL > 5
+        npyv_s16 v_5 = npyv_load_s16(ip + 5 * vstep);
+        npyv_s16 r_5 = npyv_negative_s16(v_5);
+        npyv_store_s16(op + 5 * vstep, r_5);
+    #endif
+    
+#line 108
+    #if UNROLL > 6
+        npyv_s16 v_6 = npyv_load_s16(ip + 6 * vstep);
+        npyv_s16 r_6 = npyv_negative_s16(v_6);
+        npyv_store_s16(op + 6 * vstep, r_6);
+    #endif
+    
+#line 108
+    #if UNROLL > 7
+        npyv_s16 v_7 = npyv_load_s16(ip + 7 * vstep);
+        npyv_s16 r_7 = npyv_negative_s16(v_7);
+        npyv_store_s16(op + 7 * vstep, r_7);
+    #endif
+    
+#line 108
+    #if UNROLL > 8
+        npyv_s16 v_8 = npyv_load_s16(ip + 8 * vstep);
+        npyv_s16 r_8 = npyv_negative_s16(v_8);
+        npyv_store_s16(op + 8 * vstep, r_8);
+    #endif
+    
+#line 108
+    #if UNROLL > 9
+        npyv_s16 v_9 = npyv_load_s16(ip + 9 * vstep);
+        npyv_s16 r_9 = npyv_negative_s16(v_9);
+        npyv_store_s16(op + 9 * vstep, r_9);
+    #endif
+    
+#line 108
+    #if UNROLL > 10
+        npyv_s16 v_10 = npyv_load_s16(ip + 10 * vstep);
+        npyv_s16 r_10 = npyv_negative_s16(v_10);
+        npyv_store_s16(op + 10 * vstep, r_10);
+    #endif
+    
+#line 108
+    #if UNROLL > 11
+        npyv_s16 v_11 = npyv_load_s16(ip + 11 * vstep);
+        npyv_s16 r_11 = npyv_negative_s16(v_11);
+        npyv_store_s16(op + 11 * vstep, r_11);
+    #endif
+    
+#line 108
+    #if UNROLL > 12
+        npyv_s16 v_12 = npyv_load_s16(ip + 12 * vstep);
+        npyv_s16 r_12 = npyv_negative_s16(v_12);
+        npyv_store_s16(op + 12 * vstep, r_12);
+    #endif
+    
+#line 108
+    #if UNROLL > 13
+        npyv_s16 v_13 = npyv_load_s16(ip + 13 * vstep);
+        npyv_s16 r_13 = npyv_negative_s16(v_13);
+        npyv_store_s16(op + 13 * vstep, r_13);
+    #endif
+    
+#line 108
+    #if UNROLL > 14
+        npyv_s16 v_14 = npyv_load_s16(ip + 14 * vstep);
+        npyv_s16 r_14 = npyv_negative_s16(v_14);
+        npyv_store_s16(op + 14 * vstep, r_14);
+    #endif
+    
+#line 108
+    #if UNROLL > 15
+        npyv_s16 v_15 = npyv_load_s16(ip + 15 * vstep);
+        npyv_s16 r_15 = npyv_negative_s16(v_15);
+        npyv_store_s16(op + 15 * vstep, r_15);
+    #endif
+    
+    }
+    // single vector loop: consumes the remaining whole vectors, vstep elements at a time
+    for (; len >= vstep; len -= vstep, ip += vstep, op +=vstep) {
+        npyv_s16 v = npyv_load_s16(ip);
+        npyv_s16 r = npyv_negative_s16(v);
+        npyv_store_s16(op, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements are left at this point)
+    for (; len > 0; --len, ++ip, ++op) {
+        *op = scalar_negative(*ip);
+    }
+}
+
+#if 0
+// contiguous input, non-contiguous output: ip[i] is read and its result written to op[i*ostride] (currently compiled out by the "#if 0" directly above)
+static NPY_INLINE void
+simd_unary_cn_negative_s16(const npyv_lanetype_s16 *ip,  // input elements, read consecutively
+                             npyv_lanetype_s16 *op, npy_intp ostride,  // output pointer and stride; the stride counts elements, not bytes
+                             npy_intp len)  // number of elements to process
+{
+    const int vstep = npyv_nlanes_s16;  // elements handled by one vector operation
+    const int wstep = vstep * UNROLL;  // elements handled by one pass of the unrolled loop
+
+    // unrolled vector loop: one pass covers UNROLL vectors (wstep elements); sections whose "#if UNROLL > k" test fails are dropped by the preprocessor
+    for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) {
+    #line 142
+    #if UNROLL > 0
+        npyv_s16 v_0 = npyv_load_s16(ip + 0 * vstep);  // load vector k of this pass (k = 0 here)
+        npyv_s16 r_0 = npyv_negative_s16(v_0);  // vector form of the negative operation
+        npyv_storen_s16(op + 0 * vstep * ostride, ostride, r_0);  // strided store to the matching output slots
+    #endif
+    
+#line 142
+    #if UNROLL > 1
+        npyv_s16 v_1 = npyv_load_s16(ip + 1 * vstep);
+        npyv_s16 r_1 = npyv_negative_s16(v_1);
+        npyv_storen_s16(op + 1 * vstep * ostride, ostride, r_1);
+    #endif
+    
+#line 142
+    #if UNROLL > 2
+        npyv_s16 v_2 = npyv_load_s16(ip + 2 * vstep);
+        npyv_s16 r_2 = npyv_negative_s16(v_2);
+        npyv_storen_s16(op + 2 * vstep * ostride, ostride, r_2);
+    #endif
+    
+#line 142
+    #if UNROLL > 3
+        npyv_s16 v_3 = npyv_load_s16(ip + 3 * vstep);
+        npyv_s16 r_3 = npyv_negative_s16(v_3);
+        npyv_storen_s16(op + 3 * vstep * ostride, ostride, r_3);
+    #endif
+    
+#line 142
+    #if UNROLL > 4
+        npyv_s16 v_4 = npyv_load_s16(ip + 4 * vstep);
+        npyv_s16 r_4 = npyv_negative_s16(v_4);
+        npyv_storen_s16(op + 4 * vstep * ostride, ostride, r_4);
+    #endif
+    
+#line 142
+    #if UNROLL > 5
+        npyv_s16 v_5 = npyv_load_s16(ip + 5 * vstep);
+        npyv_s16 r_5 = npyv_negative_s16(v_5);
+        npyv_storen_s16(op + 5 * vstep * ostride, ostride, r_5);
+    #endif
+    
+#line 142
+    #if UNROLL > 6
+        npyv_s16 v_6 = npyv_load_s16(ip + 6 * vstep);
+        npyv_s16 r_6 = npyv_negative_s16(v_6);
+        npyv_storen_s16(op + 6 * vstep * ostride, ostride, r_6);
+    #endif
+    
+#line 142
+    #if UNROLL > 7
+        npyv_s16 v_7 = npyv_load_s16(ip + 7 * vstep);
+        npyv_s16 r_7 = npyv_negative_s16(v_7);
+        npyv_storen_s16(op + 7 * vstep * ostride, ostride, r_7);
+    #endif
+    
+#line 142
+    #if UNROLL > 8
+        npyv_s16 v_8 = npyv_load_s16(ip + 8 * vstep);
+        npyv_s16 r_8 = npyv_negative_s16(v_8);
+        npyv_storen_s16(op + 8 * vstep * ostride, ostride, r_8);
+    #endif
+    
+#line 142
+    #if UNROLL > 9
+        npyv_s16 v_9 = npyv_load_s16(ip + 9 * vstep);
+        npyv_s16 r_9 = npyv_negative_s16(v_9);
+        npyv_storen_s16(op + 9 * vstep * ostride, ostride, r_9);
+    #endif
+    
+#line 142
+    #if UNROLL > 10
+        npyv_s16 v_10 = npyv_load_s16(ip + 10 * vstep);
+        npyv_s16 r_10 = npyv_negative_s16(v_10);
+        npyv_storen_s16(op + 10 * vstep * ostride, ostride, r_10);
+    #endif
+    
+#line 142
+    #if UNROLL > 11
+        npyv_s16 v_11 = npyv_load_s16(ip + 11 * vstep);
+        npyv_s16 r_11 = npyv_negative_s16(v_11);
+        npyv_storen_s16(op + 11 * vstep * ostride, ostride, r_11);
+    #endif
+    
+#line 142
+    #if UNROLL > 12
+        npyv_s16 v_12 = npyv_load_s16(ip + 12 * vstep);
+        npyv_s16 r_12 = npyv_negative_s16(v_12);
+        npyv_storen_s16(op + 12 * vstep * ostride, ostride, r_12);
+    #endif
+    
+#line 142
+    #if UNROLL > 13
+        npyv_s16 v_13 = npyv_load_s16(ip + 13 * vstep);
+        npyv_s16 r_13 = npyv_negative_s16(v_13);
+        npyv_storen_s16(op + 13 * vstep * ostride, ostride, r_13);
+    #endif
+    
+#line 142
+    #if UNROLL > 14
+        npyv_s16 v_14 = npyv_load_s16(ip + 14 * vstep);
+        npyv_s16 r_14 = npyv_negative_s16(v_14);
+        npyv_storen_s16(op + 14 * vstep * ostride, ostride, r_14);
+    #endif
+    
+#line 142
+    #if UNROLL > 15
+        npyv_s16 v_15 = npyv_load_s16(ip + 15 * vstep);
+        npyv_s16 r_15 = npyv_negative_s16(v_15);
+        npyv_storen_s16(op + 15 * vstep * ostride, ostride, r_15);
+    #endif
+    
+    }
+    // single vector loop: consumes the remaining whole vectors, vstep elements at a time
+    for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) {
+        npyv_s16 v = npyv_load_s16(ip);
+        npyv_s16 r = npyv_negative_s16(v);
+        npyv_storen_s16(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements are left at this point)
+    for (; len > 0; --len, ++ip, op += ostride) {
+        *op = scalar_negative(*ip);
+    }
+}
+// Strided input, contiguous output: negate `len` s16 elements read from `ip`
+// (stepping `istride` elements between reads) and write them back-to-back
+// into `op`.
+static NPY_INLINE void
+simd_unary_nc_negative_s16(const npyv_lanetype_s16 *ip, npy_intp istride,
+                             npyv_lanetype_s16 *op,
+                             npy_intp len)
+{
+    const int lanes = npyv_nlanes_s16;
+    const int chunk = lanes * UNROLL;
+
+    // Main pass: UNROLL vectors per iteration, handled in ascending order.
+    // UNROLL is a compile-time constant, so the compiler is free to unroll
+    // the inner loop.
+    while (len >= chunk) {
+        for (int k = 0; k < UNROLL; ++k) {
+            npyv_s16 src = npyv_loadn_s16(ip + k * lanes * istride, istride);
+            npyv_s16 neg = npyv_negative_s16(src);
+            npyv_store_s16(op + k * lanes, neg);
+        }
+        len -= chunk;
+        ip += istride * chunk;
+        op += chunk;
+    }
+    // Vector pass: one full vector at a time.
+    while (len >= lanes) {
+        npyv_s16 src = npyv_loadn_s16(ip, istride);
+        npyv_s16 neg = npyv_negative_s16(src);
+        npyv_store_s16(op, neg);
+        len -= lanes;
+        ip += istride * lanes;
+        op += lanes;
+    }
+    // Scalar pass: leftover elements that do not fill a vector.
+    while (len > 0) {
+        *op = scalar_negative(*ip);
+        --len;
+        ip += istride;
+        ++op;
+    }
+}
+// Strided input and strided output.
+// Cap the unroll factor at 2 for this variant.
+#if UNROLL > 2
+#undef UNROLL
+#define UNROLL 2
+#endif
+// Not built when SSE2 is available: X86 does better with unrolled scalar
+// for heavy non-contiguous access.
+#ifndef NPY_HAVE_SSE2
+// Negate `len` s16 elements, reading from `ip` every `istride` elements and
+// writing to `op` every `ostride` elements.
+static NPY_INLINE void
+simd_unary_nn_negative_s16(const npyv_lanetype_s16 *ip, npy_intp istride,
+                             npyv_lanetype_s16 *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int lanes = npyv_nlanes_s16;
+    const int chunk = lanes * UNROLL;
+
+    // Main pass: UNROLL vectors per iteration, handled in ascending order.
+    // UNROLL is a compile-time constant, so the compiler is free to unroll
+    // the inner loop.
+    while (len >= chunk) {
+        for (int k = 0; k < UNROLL; ++k) {
+            npyv_s16 src = npyv_loadn_s16(ip + k * lanes * istride, istride);
+            npyv_s16 neg = npyv_negative_s16(src);
+            npyv_storen_s16(op + k * lanes * ostride, ostride, neg);
+        }
+        len -= chunk;
+        ip += istride * chunk;
+        op += ostride * chunk;
+    }
+    // Vector pass: one full vector at a time.
+    while (len >= lanes) {
+        npyv_s16 src = npyv_loadn_s16(ip, istride);
+        npyv_s16 neg = npyv_negative_s16(src);
+        npyv_storen_s16(op, ostride, neg);
+        len -= lanes;
+        ip += istride * lanes;
+        op += ostride * lanes;
+    }
+    // Scalar pass: leftover elements that do not fill a vector.
+    while (len > 0) {
+        *op = scalar_negative(*ip);
+        --len;
+        ip += istride;
+        op += ostride;
+    }
+}
+#endif // NPY_HAVE_SSE2
+#endif // 0
+#undef UNROLL
+#endif // NPY_SIMD
+/*end repeat1**/
+
+#line 80
+#line 85
+#if NPY_SIMD
+// Choose the unroll factor (UNROLL) used by the u16 kernels that follow.
+// The literal 4 is the requested factor (presumably substituted by the
+// source template -- confirm against the generator).
+#if 4 < 1
+#error "Unroll must be at least 1"
+#elif NPY_SIMD != 128 && 4 > 2
+// SIMD width other than 128 bits: cap the factor at 2.
+// Avoid memory bandwidth bottleneck for larger SIMD
+#define UNROLL 2
+#else
+// NPY_SIMD == 128: keep the requested factor.
+#define UNROLL 4
+#endif
+// Contiguous input and output: negate `len` u16 elements from `ip` into `op`.
+static NPY_INLINE void
+simd_unary_cc_negative_u16(const npyv_lanetype_u16 *ip,
+                             npyv_lanetype_u16 *op,
+                             npy_intp len)
+{
+    const int lanes = npyv_nlanes_u16;
+    const int chunk = lanes * UNROLL;
+
+    // Main pass: UNROLL vectors per iteration, handled in ascending order.
+    // UNROLL is a compile-time constant, so the compiler is free to unroll
+    // the inner loop.
+    while (len >= chunk) {
+        for (int k = 0; k < UNROLL; ++k) {
+            npyv_u16 src = npyv_load_u16(ip + k * lanes);
+            npyv_u16 neg = npyv_negative_u16(src);
+            npyv_store_u16(op + k * lanes, neg);
+        }
+        len -= chunk;
+        ip += chunk;
+        op += chunk;
+    }
+    // Vector pass: one full vector at a time.
+    while (len >= lanes) {
+        npyv_u16 src = npyv_load_u16(ip);
+        npyv_u16 neg = npyv_negative_u16(src);
+        npyv_store_u16(op, neg);
+        len -= lanes;
+        ip += lanes;
+        op += lanes;
+    }
+    // Scalar pass: leftover elements that do not fill a vector.
+    while (len > 0) {
+        *op = scalar_negative(*ip);
+        --len;
+        ++ip;
+        ++op;
+    }
+}
+
+#if 0
+// Contiguous input, strided output: negate `len` u16 elements read
+// back-to-back from `ip` and write them to `op` every `ostride` elements.
+static NPY_INLINE void
+simd_unary_cn_negative_u16(const npyv_lanetype_u16 *ip,
+                             npyv_lanetype_u16 *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int lanes = npyv_nlanes_u16;
+    const int chunk = lanes * UNROLL;
+
+    // Main pass: UNROLL vectors per iteration, handled in ascending order.
+    // UNROLL is a compile-time constant, so the compiler is free to unroll
+    // the inner loop.
+    while (len >= chunk) {
+        for (int k = 0; k < UNROLL; ++k) {
+            npyv_u16 src = npyv_load_u16(ip + k * lanes);
+            npyv_u16 neg = npyv_negative_u16(src);
+            npyv_storen_u16(op + k * lanes * ostride, ostride, neg);
+        }
+        len -= chunk;
+        ip += chunk;
+        op += ostride * chunk;
+    }
+    // Vector pass: one full vector at a time.
+    while (len >= lanes) {
+        npyv_u16 src = npyv_load_u16(ip);
+        npyv_u16 neg = npyv_negative_u16(src);
+        npyv_storen_u16(op, ostride, neg);
+        len -= lanes;
+        ip += lanes;
+        op += ostride * lanes;
+    }
+    // Scalar pass: leftover elements that do not fill a vector.
+    while (len > 0) {
+        *op = scalar_negative(*ip);
+        --len;
+        ++ip;
+        op += ostride;
+    }
+}
+// Strided input, contiguous output: negate `len` u16 elements read from `ip`
+// (stepping `istride` elements between reads) and write them back-to-back
+// into `op`.
+static NPY_INLINE void
+simd_unary_nc_negative_u16(const npyv_lanetype_u16 *ip, npy_intp istride,
+                             npyv_lanetype_u16 *op,
+                             npy_intp len)
+{
+    const int lanes = npyv_nlanes_u16;
+    const int chunk = lanes * UNROLL;
+
+    // Main pass: UNROLL vectors per iteration, handled in ascending order.
+    // UNROLL is a compile-time constant, so the compiler is free to unroll
+    // the inner loop.
+    while (len >= chunk) {
+        for (int k = 0; k < UNROLL; ++k) {
+            npyv_u16 src = npyv_loadn_u16(ip + k * lanes * istride, istride);
+            npyv_u16 neg = npyv_negative_u16(src);
+            npyv_store_u16(op + k * lanes, neg);
+        }
+        len -= chunk;
+        ip += istride * chunk;
+        op += chunk;
+    }
+    // Vector pass: one full vector at a time.
+    while (len >= lanes) {
+        npyv_u16 src = npyv_loadn_u16(ip, istride);
+        npyv_u16 neg = npyv_negative_u16(src);
+        npyv_store_u16(op, neg);
+        len -= lanes;
+        ip += istride * lanes;
+        op += lanes;
+    }
+    // Scalar pass: leftover elements that do not fill a vector.
+    while (len > 0) {
+        *op = scalar_negative(*ip);
+        --len;
+        ip += istride;
+        ++op;
+    }
+}
+// Strided input and strided output.
+// Cap the unroll factor at 2 for this variant.
+#if UNROLL > 2
+#undef UNROLL
+#define UNROLL 2
+#endif
+// Not built when SSE2 is available: X86 does better with unrolled scalar
+// for heavy non-contiguous access.
+#ifndef NPY_HAVE_SSE2
+// Negate `len` u16 elements, reading from `ip` every `istride` elements and
+// writing to `op` every `ostride` elements.
+static NPY_INLINE void
+simd_unary_nn_negative_u16(const npyv_lanetype_u16 *ip, npy_intp istride,
+                             npyv_lanetype_u16 *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int lanes = npyv_nlanes_u16;
+    const int chunk = lanes * UNROLL;
+
+    // Main pass: UNROLL vectors per iteration, handled in ascending order.
+    // UNROLL is a compile-time constant, so the compiler is free to unroll
+    // the inner loop.
+    while (len >= chunk) {
+        for (int k = 0; k < UNROLL; ++k) {
+            npyv_u16 src = npyv_loadn_u16(ip + k * lanes * istride, istride);
+            npyv_u16 neg = npyv_negative_u16(src);
+            npyv_storen_u16(op + k * lanes * ostride, ostride, neg);
+        }
+        len -= chunk;
+        ip += istride * chunk;
+        op += ostride * chunk;
+    }
+    // Vector pass: one full vector at a time.
+    while (len >= lanes) {
+        npyv_u16 src = npyv_loadn_u16(ip, istride);
+        npyv_u16 neg = npyv_negative_u16(src);
+        npyv_storen_u16(op, ostride, neg);
+        len -= lanes;
+        ip += istride * lanes;
+        op += ostride * lanes;
+    }
+    // Scalar pass: leftover elements that do not fill a vector.
+    while (len > 0) {
+        *op = scalar_negative(*ip);
+        --len;
+        ip += istride;
+        op += ostride;
+    }
+}
+#endif // NPY_HAVE_SSE2
+#endif // 0
+#undef UNROLL
+#endif // NPY_SIMD
+/*end repeat1**/
+
+#line 80
+#line 85
+#if NPY_SIMD
+// Choose the unroll factor (UNROLL) used by the s32 kernels that follow.
+// The literal 4 is the requested factor (presumably substituted by the
+// source template -- confirm against the generator).
+#if 4 < 1
+#error "Unroll must be at least 1"
+#elif NPY_SIMD != 128 && 4 > 2
+// SIMD width other than 128 bits: cap the factor at 2.
+// Avoid memory bandwidth bottleneck for larger SIMD
+#define UNROLL 2
+#else
+// NPY_SIMD == 128: keep the requested factor.
+#define UNROLL 4
+#endif
+// Contiguous input and output: negate `len` s32 elements from `ip` into `op`.
+static NPY_INLINE void
+simd_unary_cc_negative_s32(const npyv_lanetype_s32 *ip,
+                             npyv_lanetype_s32 *op,
+                             npy_intp len)
+{
+    const int lanes = npyv_nlanes_s32;
+    const int chunk = lanes * UNROLL;
+
+    // Main pass: UNROLL vectors per iteration, handled in ascending order.
+    // UNROLL is a compile-time constant, so the compiler is free to unroll
+    // the inner loop.
+    while (len >= chunk) {
+        for (int k = 0; k < UNROLL; ++k) {
+            npyv_s32 src = npyv_load_s32(ip + k * lanes);
+            npyv_s32 neg = npyv_negative_s32(src);
+            npyv_store_s32(op + k * lanes, neg);
+        }
+        len -= chunk;
+        ip += chunk;
+        op += chunk;
+    }
+    // Vector pass: one full vector at a time.
+    while (len >= lanes) {
+        npyv_s32 src = npyv_load_s32(ip);
+        npyv_s32 neg = npyv_negative_s32(src);
+        npyv_store_s32(op, neg);
+        len -= lanes;
+        ip += lanes;
+        op += lanes;
+    }
+    // Scalar pass: leftover elements that do not fill a vector.
+    while (len > 0) {
+        *op = scalar_negative(*ip);
+        --len;
+        ++ip;
+        ++op;
+    }
+}
+
+#if 1
+// contiguous input, non-contiguous output: negates len elements, advancing ip by 1 and op by ostride elements per element
+static NPY_INLINE void
+simd_unary_cn_negative_s32(const npyv_lanetype_s32 *ip, // ip: input, read with unit stride
+                             npyv_lanetype_s32 *op, npy_intp ostride, // op: output; ostride is in elements (added to the typed pointer), not bytes
+                             npy_intp len) // len: number of elements still to process
+{
+    const int vstep = npyv_nlanes_s32; // elements handled by one vector load/store
+    const int wstep = vstep * UNROLL; // elements handled by one pass of the unrolled loop
+
+    // unrolled vector loop: runs while at least wstep elements remain; step k is compiled only when UNROLL > k
+    for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) {
+    #line 142
+    #if UNROLL > 0
+        npyv_s32 v_0 = npyv_load_s32(ip + 0 * vstep); // step k loads the vector that starts at ip + k * vstep
+        npyv_s32 r_0 = npyv_negative_s32(v_0);
+        npyv_storen_s32(op + 0 * vstep * ostride, ostride, r_0); // stored from op + k * vstep * ostride with stride ostride (presumably lane j lands at [j * ostride]; confirm in npyv)
+    #endif
+    
+#line 142
+    #if UNROLL > 1
+        npyv_s32 v_1 = npyv_load_s32(ip + 1 * vstep);
+        npyv_s32 r_1 = npyv_negative_s32(v_1);
+        npyv_storen_s32(op + 1 * vstep * ostride, ostride, r_1);
+    #endif
+    
+#line 142
+    #if UNROLL > 2
+        npyv_s32 v_2 = npyv_load_s32(ip + 2 * vstep);
+        npyv_s32 r_2 = npyv_negative_s32(v_2);
+        npyv_storen_s32(op + 2 * vstep * ostride, ostride, r_2);
+    #endif
+    
+#line 142
+    #if UNROLL > 3
+        npyv_s32 v_3 = npyv_load_s32(ip + 3 * vstep);
+        npyv_s32 r_3 = npyv_negative_s32(v_3);
+        npyv_storen_s32(op + 3 * vstep * ostride, ostride, r_3);
+    #endif
+    
+#line 142
+    #if UNROLL > 4
+        npyv_s32 v_4 = npyv_load_s32(ip + 4 * vstep);
+        npyv_s32 r_4 = npyv_negative_s32(v_4);
+        npyv_storen_s32(op + 4 * vstep * ostride, ostride, r_4);
+    #endif
+    
+#line 142
+    #if UNROLL > 5
+        npyv_s32 v_5 = npyv_load_s32(ip + 5 * vstep);
+        npyv_s32 r_5 = npyv_negative_s32(v_5);
+        npyv_storen_s32(op + 5 * vstep * ostride, ostride, r_5);
+    #endif
+    
+#line 142
+    #if UNROLL > 6
+        npyv_s32 v_6 = npyv_load_s32(ip + 6 * vstep);
+        npyv_s32 r_6 = npyv_negative_s32(v_6);
+        npyv_storen_s32(op + 6 * vstep * ostride, ostride, r_6);
+    #endif
+    
+#line 142
+    #if UNROLL > 7
+        npyv_s32 v_7 = npyv_load_s32(ip + 7 * vstep);
+        npyv_s32 r_7 = npyv_negative_s32(v_7);
+        npyv_storen_s32(op + 7 * vstep * ostride, ostride, r_7);
+    #endif
+    
+#line 142
+    #if UNROLL > 8
+        npyv_s32 v_8 = npyv_load_s32(ip + 8 * vstep);
+        npyv_s32 r_8 = npyv_negative_s32(v_8);
+        npyv_storen_s32(op + 8 * vstep * ostride, ostride, r_8);
+    #endif
+    
+#line 142
+    #if UNROLL > 9
+        npyv_s32 v_9 = npyv_load_s32(ip + 9 * vstep);
+        npyv_s32 r_9 = npyv_negative_s32(v_9);
+        npyv_storen_s32(op + 9 * vstep * ostride, ostride, r_9);
+    #endif
+    
+#line 142
+    #if UNROLL > 10
+        npyv_s32 v_10 = npyv_load_s32(ip + 10 * vstep);
+        npyv_s32 r_10 = npyv_negative_s32(v_10);
+        npyv_storen_s32(op + 10 * vstep * ostride, ostride, r_10);
+    #endif
+    
+#line 142
+    #if UNROLL > 11
+        npyv_s32 v_11 = npyv_load_s32(ip + 11 * vstep);
+        npyv_s32 r_11 = npyv_negative_s32(v_11);
+        npyv_storen_s32(op + 11 * vstep * ostride, ostride, r_11);
+    #endif
+    
+#line 142
+    #if UNROLL > 12
+        npyv_s32 v_12 = npyv_load_s32(ip + 12 * vstep);
+        npyv_s32 r_12 = npyv_negative_s32(v_12);
+        npyv_storen_s32(op + 12 * vstep * ostride, ostride, r_12);
+    #endif
+    
+#line 142
+    #if UNROLL > 13
+        npyv_s32 v_13 = npyv_load_s32(ip + 13 * vstep);
+        npyv_s32 r_13 = npyv_negative_s32(v_13);
+        npyv_storen_s32(op + 13 * vstep * ostride, ostride, r_13);
+    #endif
+    
+#line 142
+    #if UNROLL > 14
+        npyv_s32 v_14 = npyv_load_s32(ip + 14 * vstep);
+        npyv_s32 r_14 = npyv_negative_s32(v_14);
+        npyv_storen_s32(op + 14 * vstep * ostride, ostride, r_14);
+    #endif
+    
+#line 142
+    #if UNROLL > 15
+        npyv_s32 v_15 = npyv_load_s32(ip + 15 * vstep);
+        npyv_s32 r_15 = npyv_negative_s32(v_15);
+        npyv_storen_s32(op + 15 * vstep * ostride, ostride, r_15);
+    #endif
+    
+    }
+    // single vector loop: one vector per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) {
+        npyv_s32 v = npyv_load_s32(ip);
+        npyv_s32 r = npyv_negative_s32(v);
+        npyv_storen_s32(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements are left at this point)
+    for (; len > 0; --len, ++ip, op += ostride) {
+        *op = scalar_negative(*ip); // scalar_negative is defined outside this function
+    }
+}
+// non-contiguous input, contiguous output: negates len elements, advancing ip by istride and op by 1 element per element
+static NPY_INLINE void
+simd_unary_nc_negative_s32(const npyv_lanetype_s32 *ip, npy_intp istride, // ip: input; istride is in elements (added to the typed pointer), not bytes
+                             npyv_lanetype_s32 *op, // op: output, written with unit stride
+                             npy_intp len) // len: number of elements still to process
+{
+    const int vstep = npyv_nlanes_s32; // elements handled by one vector load/store
+    const int wstep = vstep * UNROLL; // elements handled by one pass of the unrolled loop
+
+    // unrolled vector loop: runs while at least wstep elements remain; step k is compiled only when UNROLL > k
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) {
+    #line 174
+    #if UNROLL > 0
+        npyv_s32 v_0 = npyv_loadn_s32(ip + 0 * vstep * istride, istride); // step k loads from ip + k * vstep * istride with stride istride (presumably lane j comes from [j * istride]; confirm in npyv)
+        npyv_s32 r_0 = npyv_negative_s32(v_0);
+        npyv_store_s32(op + 0 * vstep, r_0); // result goes to the vector slot that starts at op + k * vstep
+    #endif
+    
+#line 174
+    #if UNROLL > 1
+        npyv_s32 v_1 = npyv_loadn_s32(ip + 1 * vstep * istride, istride);
+        npyv_s32 r_1 = npyv_negative_s32(v_1);
+        npyv_store_s32(op + 1 * vstep, r_1);
+    #endif
+    
+#line 174
+    #if UNROLL > 2
+        npyv_s32 v_2 = npyv_loadn_s32(ip + 2 * vstep * istride, istride);
+        npyv_s32 r_2 = npyv_negative_s32(v_2);
+        npyv_store_s32(op + 2 * vstep, r_2);
+    #endif
+    
+#line 174
+    #if UNROLL > 3
+        npyv_s32 v_3 = npyv_loadn_s32(ip + 3 * vstep * istride, istride);
+        npyv_s32 r_3 = npyv_negative_s32(v_3);
+        npyv_store_s32(op + 3 * vstep, r_3);
+    #endif
+    
+#line 174
+    #if UNROLL > 4
+        npyv_s32 v_4 = npyv_loadn_s32(ip + 4 * vstep * istride, istride);
+        npyv_s32 r_4 = npyv_negative_s32(v_4);
+        npyv_store_s32(op + 4 * vstep, r_4);
+    #endif
+    
+#line 174
+    #if UNROLL > 5
+        npyv_s32 v_5 = npyv_loadn_s32(ip + 5 * vstep * istride, istride);
+        npyv_s32 r_5 = npyv_negative_s32(v_5);
+        npyv_store_s32(op + 5 * vstep, r_5);
+    #endif
+    
+#line 174
+    #if UNROLL > 6
+        npyv_s32 v_6 = npyv_loadn_s32(ip + 6 * vstep * istride, istride);
+        npyv_s32 r_6 = npyv_negative_s32(v_6);
+        npyv_store_s32(op + 6 * vstep, r_6);
+    #endif
+    
+#line 174
+    #if UNROLL > 7
+        npyv_s32 v_7 = npyv_loadn_s32(ip + 7 * vstep * istride, istride);
+        npyv_s32 r_7 = npyv_negative_s32(v_7);
+        npyv_store_s32(op + 7 * vstep, r_7);
+    #endif
+    
+#line 174
+    #if UNROLL > 8
+        npyv_s32 v_8 = npyv_loadn_s32(ip + 8 * vstep * istride, istride);
+        npyv_s32 r_8 = npyv_negative_s32(v_8);
+        npyv_store_s32(op + 8 * vstep, r_8);
+    #endif
+    
+#line 174
+    #if UNROLL > 9
+        npyv_s32 v_9 = npyv_loadn_s32(ip + 9 * vstep * istride, istride);
+        npyv_s32 r_9 = npyv_negative_s32(v_9);
+        npyv_store_s32(op + 9 * vstep, r_9);
+    #endif
+    
+#line 174
+    #if UNROLL > 10
+        npyv_s32 v_10 = npyv_loadn_s32(ip + 10 * vstep * istride, istride);
+        npyv_s32 r_10 = npyv_negative_s32(v_10);
+        npyv_store_s32(op + 10 * vstep, r_10);
+    #endif
+    
+#line 174
+    #if UNROLL > 11
+        npyv_s32 v_11 = npyv_loadn_s32(ip + 11 * vstep * istride, istride);
+        npyv_s32 r_11 = npyv_negative_s32(v_11);
+        npyv_store_s32(op + 11 * vstep, r_11);
+    #endif
+    
+#line 174
+    #if UNROLL > 12
+        npyv_s32 v_12 = npyv_loadn_s32(ip + 12 * vstep * istride, istride);
+        npyv_s32 r_12 = npyv_negative_s32(v_12);
+        npyv_store_s32(op + 12 * vstep, r_12);
+    #endif
+    
+#line 174
+    #if UNROLL > 13
+        npyv_s32 v_13 = npyv_loadn_s32(ip + 13 * vstep * istride, istride);
+        npyv_s32 r_13 = npyv_negative_s32(v_13);
+        npyv_store_s32(op + 13 * vstep, r_13);
+    #endif
+    
+#line 174
+    #if UNROLL > 14
+        npyv_s32 v_14 = npyv_loadn_s32(ip + 14 * vstep * istride, istride);
+        npyv_s32 r_14 = npyv_negative_s32(v_14);
+        npyv_store_s32(op + 14 * vstep, r_14);
+    #endif
+    
+#line 174
+    #if UNROLL > 15
+        npyv_s32 v_15 = npyv_loadn_s32(ip + 15 * vstep * istride, istride);
+        npyv_s32 r_15 = npyv_negative_s32(v_15);
+        npyv_store_s32(op + 15 * vstep, r_15);
+    #endif
+    
+    }
+    // single vector loop: one vector per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) {
+        npyv_s32 v = npyv_loadn_s32(ip, istride);
+        npyv_s32 r = npyv_negative_s32(v);
+        npyv_store_s32(op, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements are left at this point)
+    for (; len > 0; --len, ip += istride, ++op) {
+        *op = scalar_negative(*ip); // scalar_negative is defined outside this function
+    }
+}
+// non-contiguous input and output: both ip and op are walked with caller-supplied element strides
+// limit unroll to 2x (UNROLL is redefined here, so the cap stays in force until UNROLL is undefined)
+#if UNROLL > 2
+#undef UNROLL
+#define UNROLL 2
+#endif
+// X86 does better with unrolled scalar for heavy non-contiguous, so this variant is not compiled when NPY_HAVE_SSE2 is defined
+#ifndef NPY_HAVE_SSE2
+static NPY_INLINE void
+simd_unary_nn_negative_s32(const npyv_lanetype_s32 *ip, npy_intp istride, // ip: input; istride is in elements (added to the typed pointer), not bytes
+                             npyv_lanetype_s32 *op, npy_intp ostride, // op: output; ostride is likewise in elements
+                             npy_intp len) // len: number of elements still to process
+{
+    const int vstep = npyv_nlanes_s32; // elements handled by one vector load/store
+    const int wstep = vstep * UNROLL; // elements handled by one pass of the unrolled loop
+
+    // unrolled vector loop: runs while at least wstep elements remain; step k is compiled only when UNROLL > k
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+    #line 213
+    #if UNROLL > 0
+        npyv_s32 v_0 = npyv_loadn_s32(ip + 0 * vstep * istride, istride); // step k loads from ip + k * vstep * istride with stride istride (presumably a strided gather; confirm in npyv)
+        npyv_s32 r_0 = npyv_negative_s32(v_0);
+        npyv_storen_s32(op + 0 * vstep * ostride, ostride, r_0); // stored from op + k * vstep * ostride with stride ostride (presumably a strided scatter; confirm in npyv)
+    #endif
+    
+#line 213
+    #if UNROLL > 1
+        npyv_s32 v_1 = npyv_loadn_s32(ip + 1 * vstep * istride, istride);
+        npyv_s32 r_1 = npyv_negative_s32(v_1);
+        npyv_storen_s32(op + 1 * vstep * ostride, ostride, r_1);
+    #endif
+    
+#line 213
+    #if UNROLL > 2 // always false here: UNROLL was capped at 2 above, so steps 2..15 are compiled out
+        npyv_s32 v_2 = npyv_loadn_s32(ip + 2 * vstep * istride, istride);
+        npyv_s32 r_2 = npyv_negative_s32(v_2);
+        npyv_storen_s32(op + 2 * vstep * ostride, ostride, r_2);
+    #endif
+    
+#line 213
+    #if UNROLL > 3
+        npyv_s32 v_3 = npyv_loadn_s32(ip + 3 * vstep * istride, istride);
+        npyv_s32 r_3 = npyv_negative_s32(v_3);
+        npyv_storen_s32(op + 3 * vstep * ostride, ostride, r_3);
+    #endif
+    
+#line 213
+    #if UNROLL > 4
+        npyv_s32 v_4 = npyv_loadn_s32(ip + 4 * vstep * istride, istride);
+        npyv_s32 r_4 = npyv_negative_s32(v_4);
+        npyv_storen_s32(op + 4 * vstep * ostride, ostride, r_4);
+    #endif
+    
+#line 213
+    #if UNROLL > 5
+        npyv_s32 v_5 = npyv_loadn_s32(ip + 5 * vstep * istride, istride);
+        npyv_s32 r_5 = npyv_negative_s32(v_5);
+        npyv_storen_s32(op + 5 * vstep * ostride, ostride, r_5);
+    #endif
+    
+#line 213
+    #if UNROLL > 6
+        npyv_s32 v_6 = npyv_loadn_s32(ip + 6 * vstep * istride, istride);
+        npyv_s32 r_6 = npyv_negative_s32(v_6);
+        npyv_storen_s32(op + 6 * vstep * ostride, ostride, r_6);
+    #endif
+    
+#line 213
+    #if UNROLL > 7
+        npyv_s32 v_7 = npyv_loadn_s32(ip + 7 * vstep * istride, istride);
+        npyv_s32 r_7 = npyv_negative_s32(v_7);
+        npyv_storen_s32(op + 7 * vstep * ostride, ostride, r_7);
+    #endif
+    
+#line 213
+    #if UNROLL > 8
+        npyv_s32 v_8 = npyv_loadn_s32(ip + 8 * vstep * istride, istride);
+        npyv_s32 r_8 = npyv_negative_s32(v_8);
+        npyv_storen_s32(op + 8 * vstep * ostride, ostride, r_8);
+    #endif
+    
+#line 213
+    #if UNROLL > 9
+        npyv_s32 v_9 = npyv_loadn_s32(ip + 9 * vstep * istride, istride);
+        npyv_s32 r_9 = npyv_negative_s32(v_9);
+        npyv_storen_s32(op + 9 * vstep * ostride, ostride, r_9);
+    #endif
+    
+#line 213
+    #if UNROLL > 10
+        npyv_s32 v_10 = npyv_loadn_s32(ip + 10 * vstep * istride, istride);
+        npyv_s32 r_10 = npyv_negative_s32(v_10);
+        npyv_storen_s32(op + 10 * vstep * ostride, ostride, r_10);
+    #endif
+    
+#line 213
+    #if UNROLL > 11
+        npyv_s32 v_11 = npyv_loadn_s32(ip + 11 * vstep * istride, istride);
+        npyv_s32 r_11 = npyv_negative_s32(v_11);
+        npyv_storen_s32(op + 11 * vstep * ostride, ostride, r_11);
+    #endif
+    
+#line 213
+    #if UNROLL > 12
+        npyv_s32 v_12 = npyv_loadn_s32(ip + 12 * vstep * istride, istride);
+        npyv_s32 r_12 = npyv_negative_s32(v_12);
+        npyv_storen_s32(op + 12 * vstep * ostride, ostride, r_12);
+    #endif
+    
+#line 213
+    #if UNROLL > 13
+        npyv_s32 v_13 = npyv_loadn_s32(ip + 13 * vstep * istride, istride);
+        npyv_s32 r_13 = npyv_negative_s32(v_13);
+        npyv_storen_s32(op + 13 * vstep * ostride, ostride, r_13);
+    #endif
+    
+#line 213
+    #if UNROLL > 14
+        npyv_s32 v_14 = npyv_loadn_s32(ip + 14 * vstep * istride, istride);
+        npyv_s32 r_14 = npyv_negative_s32(v_14);
+        npyv_storen_s32(op + 14 * vstep * ostride, ostride, r_14);
+    #endif
+    
+#line 213
+    #if UNROLL > 15
+        npyv_s32 v_15 = npyv_loadn_s32(ip + 15 * vstep * istride, istride);
+        npyv_s32 r_15 = npyv_negative_s32(v_15);
+        npyv_storen_s32(op + 15 * vstep * ostride, ostride, r_15);
+    #endif
+    
+    }
+    // single vector loop: one vector per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+        npyv_s32 v = npyv_loadn_s32(ip, istride);
+        npyv_s32 r = npyv_negative_s32(v);
+        npyv_storen_s32(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements are left at this point)
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = scalar_negative(*ip); // scalar_negative is defined outside this function
+    }
+}
+#endif // NPY_HAVE_SSE2
+#endif // 1
+#undef UNROLL
+#endif // NPY_SIMD
+/*end repeat1**/
+
+#line 80
+#line 85
+#if NPY_SIMD
+#if 4 < 1 // 4 is presumably the unroll factor substituted by the code generator; for this value the guard never fires
+#error "Unroll must be at least 1"
+#elif NPY_SIMD != 128 && 4 > 2
+// Avoid memory bandwidth bottleneck for larger SIMD: any NPY_SIMD value other than 128 gets an unroll factor of 2
+#define UNROLL 2
+#else // NPY_SIMD == 128: keep the full factor
+#define UNROLL 4
+#endif
+// contiguous inputs and output: negates len elements, advancing both ip and op by 1 element per element.
+static NPY_INLINE void
+simd_unary_cc_negative_u32(const npyv_lanetype_u32 *ip, // ip: input, read with unit stride
+                             npyv_lanetype_u32 *op, // op: output, written with unit stride
+                             npy_intp len) // len: number of elements still to process
+{
+    const int vstep = npyv_nlanes_u32; // elements handled by one vector load/store
+    const int wstep = vstep * UNROLL; // elements handled by one pass of the unrolled loop
+
+    // unrolled vector loop: runs while at least wstep elements remain; step k is compiled only when UNROLL > k
+    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+    #line 108
+    #if UNROLL > 0
+        npyv_u32 v_0 = npyv_load_u32(ip + 0 * vstep); // step k loads the vector that starts at ip + k * vstep
+        npyv_u32 r_0 = npyv_negative_u32(v_0);
+        npyv_store_u32(op + 0 * vstep, r_0); // result goes to the matching slot at op + k * vstep
+    #endif
+    
+#line 108
+    #if UNROLL > 1
+        npyv_u32 v_1 = npyv_load_u32(ip + 1 * vstep);
+        npyv_u32 r_1 = npyv_negative_u32(v_1);
+        npyv_store_u32(op + 1 * vstep, r_1);
+    #endif
+    
+#line 108
+    #if UNROLL > 2
+        npyv_u32 v_2 = npyv_load_u32(ip + 2 * vstep);
+        npyv_u32 r_2 = npyv_negative_u32(v_2);
+        npyv_store_u32(op + 2 * vstep, r_2);
+    #endif
+    
+#line 108
+    #if UNROLL > 3
+        npyv_u32 v_3 = npyv_load_u32(ip + 3 * vstep);
+        npyv_u32 r_3 = npyv_negative_u32(v_3);
+        npyv_store_u32(op + 3 * vstep, r_3);
+    #endif
+    
+#line 108
+    #if UNROLL > 4
+        npyv_u32 v_4 = npyv_load_u32(ip + 4 * vstep);
+        npyv_u32 r_4 = npyv_negative_u32(v_4);
+        npyv_store_u32(op + 4 * vstep, r_4);
+    #endif
+    
+#line 108
+    #if UNROLL > 5
+        npyv_u32 v_5 = npyv_load_u32(ip + 5 * vstep);
+        npyv_u32 r_5 = npyv_negative_u32(v_5);
+        npyv_store_u32(op + 5 * vstep, r_5);
+    #endif
+    
+#line 108
+    #if UNROLL > 6
+        npyv_u32 v_6 = npyv_load_u32(ip + 6 * vstep);
+        npyv_u32 r_6 = npyv_negative_u32(v_6);
+        npyv_store_u32(op + 6 * vstep, r_6);
+    #endif
+    
+#line 108
+    #if UNROLL > 7
+        npyv_u32 v_7 = npyv_load_u32(ip + 7 * vstep);
+        npyv_u32 r_7 = npyv_negative_u32(v_7);
+        npyv_store_u32(op + 7 * vstep, r_7);
+    #endif
+    
+#line 108
+    #if UNROLL > 8
+        npyv_u32 v_8 = npyv_load_u32(ip + 8 * vstep);
+        npyv_u32 r_8 = npyv_negative_u32(v_8);
+        npyv_store_u32(op + 8 * vstep, r_8);
+    #endif
+    
+#line 108
+    #if UNROLL > 9
+        npyv_u32 v_9 = npyv_load_u32(ip + 9 * vstep);
+        npyv_u32 r_9 = npyv_negative_u32(v_9);
+        npyv_store_u32(op + 9 * vstep, r_9);
+    #endif
+    
+#line 108
+    #if UNROLL > 10
+        npyv_u32 v_10 = npyv_load_u32(ip + 10 * vstep);
+        npyv_u32 r_10 = npyv_negative_u32(v_10);
+        npyv_store_u32(op + 10 * vstep, r_10);
+    #endif
+    
+#line 108
+    #if UNROLL > 11
+        npyv_u32 v_11 = npyv_load_u32(ip + 11 * vstep);
+        npyv_u32 r_11 = npyv_negative_u32(v_11);
+        npyv_store_u32(op + 11 * vstep, r_11);
+    #endif
+    
+#line 108
+    #if UNROLL > 12
+        npyv_u32 v_12 = npyv_load_u32(ip + 12 * vstep);
+        npyv_u32 r_12 = npyv_negative_u32(v_12);
+        npyv_store_u32(op + 12 * vstep, r_12);
+    #endif
+    
+#line 108
+    #if UNROLL > 13
+        npyv_u32 v_13 = npyv_load_u32(ip + 13 * vstep);
+        npyv_u32 r_13 = npyv_negative_u32(v_13);
+        npyv_store_u32(op + 13 * vstep, r_13);
+    #endif
+    
+#line 108
+    #if UNROLL > 14
+        npyv_u32 v_14 = npyv_load_u32(ip + 14 * vstep);
+        npyv_u32 r_14 = npyv_negative_u32(v_14);
+        npyv_store_u32(op + 14 * vstep, r_14);
+    #endif
+    
+#line 108
+    #if UNROLL > 15
+        npyv_u32 v_15 = npyv_load_u32(ip + 15 * vstep);
+        npyv_u32 r_15 = npyv_negative_u32(v_15);
+        npyv_store_u32(op + 15 * vstep, r_15);
+    #endif
+    
+    }
+    // single vector loop: one vector per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, ip += vstep, op +=vstep) {
+        npyv_u32 v = npyv_load_u32(ip);
+        npyv_u32 r = npyv_negative_u32(v);
+        npyv_store_u32(op, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements are left at this point)
+    for (; len > 0; --len, ++ip, ++op) {
+        *op = scalar_negative(*ip); // scalar_negative is defined outside this function
+    }
+}
+
+#if 1
+// contiguous input, non-contiguous output: negates len elements, advancing ip by 1 and op by ostride elements per element
+static NPY_INLINE void
+simd_unary_cn_negative_u32(const npyv_lanetype_u32 *ip, // ip: input, read with unit stride
+                             npyv_lanetype_u32 *op, npy_intp ostride, // op: output; ostride is in elements (added to the typed pointer), not bytes
+                             npy_intp len) // len: number of elements still to process
+{
+    const int vstep = npyv_nlanes_u32; // elements handled by one vector load/store
+    const int wstep = vstep * UNROLL; // elements handled by one pass of the unrolled loop
+
+    // unrolled vector loop: runs while at least wstep elements remain; step k is compiled only when UNROLL > k
+    for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) {
+    #line 142
+    #if UNROLL > 0
+        npyv_u32 v_0 = npyv_load_u32(ip + 0 * vstep); // step k loads the vector that starts at ip + k * vstep
+        npyv_u32 r_0 = npyv_negative_u32(v_0);
+        npyv_storen_u32(op + 0 * vstep * ostride, ostride, r_0); // stored from op + k * vstep * ostride with stride ostride (presumably lane j lands at [j * ostride]; confirm in npyv)
+    #endif
+    
+#line 142
+    #if UNROLL > 1
+        npyv_u32 v_1 = npyv_load_u32(ip + 1 * vstep);
+        npyv_u32 r_1 = npyv_negative_u32(v_1);
+        npyv_storen_u32(op + 1 * vstep * ostride, ostride, r_1);
+    #endif
+    
+#line 142
+    #if UNROLL > 2
+        npyv_u32 v_2 = npyv_load_u32(ip + 2 * vstep);
+        npyv_u32 r_2 = npyv_negative_u32(v_2);
+        npyv_storen_u32(op + 2 * vstep * ostride, ostride, r_2);
+    #endif
+    
+#line 142
+    #if UNROLL > 3
+        npyv_u32 v_3 = npyv_load_u32(ip + 3 * vstep);
+        npyv_u32 r_3 = npyv_negative_u32(v_3);
+        npyv_storen_u32(op + 3 * vstep * ostride, ostride, r_3);
+    #endif
+    
+#line 142
+    #if UNROLL > 4
+        npyv_u32 v_4 = npyv_load_u32(ip + 4 * vstep);
+        npyv_u32 r_4 = npyv_negative_u32(v_4);
+        npyv_storen_u32(op + 4 * vstep * ostride, ostride, r_4);
+    #endif
+    
+#line 142
+    #if UNROLL > 5
+        npyv_u32 v_5 = npyv_load_u32(ip + 5 * vstep);
+        npyv_u32 r_5 = npyv_negative_u32(v_5);
+        npyv_storen_u32(op + 5 * vstep * ostride, ostride, r_5);
+    #endif
+    
+#line 142
+    #if UNROLL > 6
+        npyv_u32 v_6 = npyv_load_u32(ip + 6 * vstep);
+        npyv_u32 r_6 = npyv_negative_u32(v_6);
+        npyv_storen_u32(op + 6 * vstep * ostride, ostride, r_6);
+    #endif
+    
+#line 142
+    #if UNROLL > 7
+        npyv_u32 v_7 = npyv_load_u32(ip + 7 * vstep);
+        npyv_u32 r_7 = npyv_negative_u32(v_7);
+        npyv_storen_u32(op + 7 * vstep * ostride, ostride, r_7);
+    #endif
+    
+#line 142
+    #if UNROLL > 8
+        npyv_u32 v_8 = npyv_load_u32(ip + 8 * vstep);
+        npyv_u32 r_8 = npyv_negative_u32(v_8);
+        npyv_storen_u32(op + 8 * vstep * ostride, ostride, r_8);
+    #endif
+    
+#line 142
+    #if UNROLL > 9
+        npyv_u32 v_9 = npyv_load_u32(ip + 9 * vstep);
+        npyv_u32 r_9 = npyv_negative_u32(v_9);
+        npyv_storen_u32(op + 9 * vstep * ostride, ostride, r_9);
+    #endif
+    
+#line 142
+    #if UNROLL > 10
+        npyv_u32 v_10 = npyv_load_u32(ip + 10 * vstep);
+        npyv_u32 r_10 = npyv_negative_u32(v_10);
+        npyv_storen_u32(op + 10 * vstep * ostride, ostride, r_10);
+    #endif
+    
+#line 142
+    #if UNROLL > 11
+        npyv_u32 v_11 = npyv_load_u32(ip + 11 * vstep);
+        npyv_u32 r_11 = npyv_negative_u32(v_11);
+        npyv_storen_u32(op + 11 * vstep * ostride, ostride, r_11);
+    #endif
+    
+#line 142
+    #if UNROLL > 12
+        npyv_u32 v_12 = npyv_load_u32(ip + 12 * vstep);
+        npyv_u32 r_12 = npyv_negative_u32(v_12);
+        npyv_storen_u32(op + 12 * vstep * ostride, ostride, r_12);
+    #endif
+    
+#line 142
+    #if UNROLL > 13
+        npyv_u32 v_13 = npyv_load_u32(ip + 13 * vstep);
+        npyv_u32 r_13 = npyv_negative_u32(v_13);
+        npyv_storen_u32(op + 13 * vstep * ostride, ostride, r_13);
+    #endif
+    
+#line 142
+    #if UNROLL > 14
+        npyv_u32 v_14 = npyv_load_u32(ip + 14 * vstep);
+        npyv_u32 r_14 = npyv_negative_u32(v_14);
+        npyv_storen_u32(op + 14 * vstep * ostride, ostride, r_14);
+    #endif
+    
+#line 142
+    #if UNROLL > 15
+        npyv_u32 v_15 = npyv_load_u32(ip + 15 * vstep);
+        npyv_u32 r_15 = npyv_negative_u32(v_15);
+        npyv_storen_u32(op + 15 * vstep * ostride, ostride, r_15);
+    #endif
+    
+    }
+    // single vector loop: one vector per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) {
+        npyv_u32 v = npyv_load_u32(ip);
+        npyv_u32 r = npyv_negative_u32(v);
+        npyv_storen_u32(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements are left at this point)
+    for (; len > 0; --len, ++ip, op += ostride) {
+        *op = scalar_negative(*ip); // scalar_negative is defined outside this function
+    }
+}
+// non-contiguous input, contiguous output: negates len elements, advancing ip by istride and op by 1 element per element
+static NPY_INLINE void
+simd_unary_nc_negative_u32(const npyv_lanetype_u32 *ip, npy_intp istride, // ip: input; istride is in elements (added to the typed pointer), not bytes
+                             npyv_lanetype_u32 *op, // op: output, written with unit stride
+                             npy_intp len) // len: number of elements still to process
+{
+    const int vstep = npyv_nlanes_u32; // elements handled by one vector load/store
+    const int wstep = vstep * UNROLL; // elements handled by one pass of the unrolled loop
+
+    // unrolled vector loop: runs while at least wstep elements remain; step k is compiled only when UNROLL > k
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) {
+    #line 174
+    #if UNROLL > 0
+        npyv_u32 v_0 = npyv_loadn_u32(ip + 0 * vstep * istride, istride); // step k loads from ip + k * vstep * istride with stride istride (presumably lane j comes from [j * istride]; confirm in npyv)
+        npyv_u32 r_0 = npyv_negative_u32(v_0);
+        npyv_store_u32(op + 0 * vstep, r_0); // result goes to the vector slot that starts at op + k * vstep
+    #endif
+    
+#line 174
+    #if UNROLL > 1
+        npyv_u32 v_1 = npyv_loadn_u32(ip + 1 * vstep * istride, istride);
+        npyv_u32 r_1 = npyv_negative_u32(v_1);
+        npyv_store_u32(op + 1 * vstep, r_1);
+    #endif
+    
+#line 174
+    #if UNROLL > 2
+        npyv_u32 v_2 = npyv_loadn_u32(ip + 2 * vstep * istride, istride);
+        npyv_u32 r_2 = npyv_negative_u32(v_2);
+        npyv_store_u32(op + 2 * vstep, r_2);
+    #endif
+    
+#line 174
+    #if UNROLL > 3
+        npyv_u32 v_3 = npyv_loadn_u32(ip + 3 * vstep * istride, istride);
+        npyv_u32 r_3 = npyv_negative_u32(v_3);
+        npyv_store_u32(op + 3 * vstep, r_3);
+    #endif
+    
+#line 174
+    #if UNROLL > 4
+        npyv_u32 v_4 = npyv_loadn_u32(ip + 4 * vstep * istride, istride);
+        npyv_u32 r_4 = npyv_negative_u32(v_4);
+        npyv_store_u32(op + 4 * vstep, r_4);
+    #endif
+    
+#line 174
+    #if UNROLL > 5
+        npyv_u32 v_5 = npyv_loadn_u32(ip + 5 * vstep * istride, istride);
+        npyv_u32 r_5 = npyv_negative_u32(v_5);
+        npyv_store_u32(op + 5 * vstep, r_5);
+    #endif
+    
+#line 174
+    #if UNROLL > 6
+        npyv_u32 v_6 = npyv_loadn_u32(ip + 6 * vstep * istride, istride);
+        npyv_u32 r_6 = npyv_negative_u32(v_6);
+        npyv_store_u32(op + 6 * vstep, r_6);
+    #endif
+    
+#line 174
+    #if UNROLL > 7
+        npyv_u32 v_7 = npyv_loadn_u32(ip + 7 * vstep * istride, istride);
+        npyv_u32 r_7 = npyv_negative_u32(v_7);
+        npyv_store_u32(op + 7 * vstep, r_7);
+    #endif
+    
+#line 174
+    #if UNROLL > 8
+        npyv_u32 v_8 = npyv_loadn_u32(ip + 8 * vstep * istride, istride);
+        npyv_u32 r_8 = npyv_negative_u32(v_8);
+        npyv_store_u32(op + 8 * vstep, r_8);
+    #endif
+    
+#line 174
+    #if UNROLL > 9
+        npyv_u32 v_9 = npyv_loadn_u32(ip + 9 * vstep * istride, istride);
+        npyv_u32 r_9 = npyv_negative_u32(v_9);
+        npyv_store_u32(op + 9 * vstep, r_9);
+    #endif
+    
+#line 174
+    #if UNROLL > 10
+        npyv_u32 v_10 = npyv_loadn_u32(ip + 10 * vstep * istride, istride);
+        npyv_u32 r_10 = npyv_negative_u32(v_10);
+        npyv_store_u32(op + 10 * vstep, r_10);
+    #endif
+    
+#line 174
+    #if UNROLL > 11
+        npyv_u32 v_11 = npyv_loadn_u32(ip + 11 * vstep * istride, istride);
+        npyv_u32 r_11 = npyv_negative_u32(v_11);
+        npyv_store_u32(op + 11 * vstep, r_11);
+    #endif
+    
+#line 174
+    #if UNROLL > 12
+        npyv_u32 v_12 = npyv_loadn_u32(ip + 12 * vstep * istride, istride);
+        npyv_u32 r_12 = npyv_negative_u32(v_12);
+        npyv_store_u32(op + 12 * vstep, r_12);
+    #endif
+    
+#line 174
+    #if UNROLL > 13
+        npyv_u32 v_13 = npyv_loadn_u32(ip + 13 * vstep * istride, istride);
+        npyv_u32 r_13 = npyv_negative_u32(v_13);
+        npyv_store_u32(op + 13 * vstep, r_13);
+    #endif
+    
+#line 174
+    #if UNROLL > 14
+        npyv_u32 v_14 = npyv_loadn_u32(ip + 14 * vstep * istride, istride);
+        npyv_u32 r_14 = npyv_negative_u32(v_14);
+        npyv_store_u32(op + 14 * vstep, r_14);
+    #endif
+    
+#line 174
+    #if UNROLL > 15
+        npyv_u32 v_15 = npyv_loadn_u32(ip + 15 * vstep * istride, istride);
+        npyv_u32 r_15 = npyv_negative_u32(v_15);
+        npyv_store_u32(op + 15 * vstep, r_15);
+    #endif
+    
+    }
+    // single vector loop: one vector per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) {
+        npyv_u32 v = npyv_loadn_u32(ip, istride);
+        npyv_u32 r = npyv_negative_u32(v);
+        npyv_store_u32(op, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements are left at this point)
+    for (; len > 0; --len, ip += istride, ++op) {
+        *op = scalar_negative(*ip); // scalar_negative is defined outside this function
+    }
+}
+// non-contiguous input and output: both ip and op are walked with caller-supplied element strides
+// limit unroll to 2x (UNROLL is redefined here, so the cap stays in force until UNROLL is undefined)
+#if UNROLL > 2
+#undef UNROLL
+#define UNROLL 2
+#endif
+// X86 does better with unrolled scalar for heavy non-contiguous, so this variant is not compiled when NPY_HAVE_SSE2 is defined
+#ifndef NPY_HAVE_SSE2
+static NPY_INLINE void
+simd_unary_nn_negative_u32(const npyv_lanetype_u32 *ip, npy_intp istride, // ip: input; istride is in elements (added to the typed pointer), not bytes
+                             npyv_lanetype_u32 *op, npy_intp ostride, // op: output; ostride is likewise in elements
+                             npy_intp len) // len: number of elements still to process
+{
+    const int vstep = npyv_nlanes_u32; // elements handled by one vector load/store
+    const int wstep = vstep * UNROLL; // elements handled by one pass of the unrolled loop
+
+    // unrolled vector loop: runs while at least wstep elements remain; step k is compiled only when UNROLL > k
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+    #line 213
+    #if UNROLL > 0
+        npyv_u32 v_0 = npyv_loadn_u32(ip + 0 * vstep * istride, istride); // step k loads from ip + k * vstep * istride with stride istride (presumably a strided gather; confirm in npyv)
+        npyv_u32 r_0 = npyv_negative_u32(v_0);
+        npyv_storen_u32(op + 0 * vstep * ostride, ostride, r_0); // stored from op + k * vstep * ostride with stride ostride (presumably a strided scatter; confirm in npyv)
+    #endif
+    
+#line 213
+    #if UNROLL > 1
+        npyv_u32 v_1 = npyv_loadn_u32(ip + 1 * vstep * istride, istride);
+        npyv_u32 r_1 = npyv_negative_u32(v_1);
+        npyv_storen_u32(op + 1 * vstep * ostride, ostride, r_1);
+    #endif
+    
+#line 213
+    #if UNROLL > 2 // always false here: UNROLL was capped at 2 above, so steps 2..15 are compiled out
+        npyv_u32 v_2 = npyv_loadn_u32(ip + 2 * vstep * istride, istride);
+        npyv_u32 r_2 = npyv_negative_u32(v_2);
+        npyv_storen_u32(op + 2 * vstep * ostride, ostride, r_2);
+    #endif
+    
+#line 213
+    #if UNROLL > 3
+        npyv_u32 v_3 = npyv_loadn_u32(ip + 3 * vstep * istride, istride);
+        npyv_u32 r_3 = npyv_negative_u32(v_3);
+        npyv_storen_u32(op + 3 * vstep * ostride, ostride, r_3);
+    #endif
+    
+#line 213
+    #if UNROLL > 4
+        npyv_u32 v_4 = npyv_loadn_u32(ip + 4 * vstep * istride, istride);
+        npyv_u32 r_4 = npyv_negative_u32(v_4);
+        npyv_storen_u32(op + 4 * vstep * ostride, ostride, r_4);
+    #endif
+    
+#line 213
+    #if UNROLL > 5
+        npyv_u32 v_5 = npyv_loadn_u32(ip + 5 * vstep * istride, istride);
+        npyv_u32 r_5 = npyv_negative_u32(v_5);
+        npyv_storen_u32(op + 5 * vstep * ostride, ostride, r_5);
+    #endif
+    
+#line 213
+    #if UNROLL > 6
+        npyv_u32 v_6 = npyv_loadn_u32(ip + 6 * vstep * istride, istride);
+        npyv_u32 r_6 = npyv_negative_u32(v_6);
+        npyv_storen_u32(op + 6 * vstep * ostride, ostride, r_6);
+    #endif
+    
+#line 213
+    #if UNROLL > 7
+        npyv_u32 v_7 = npyv_loadn_u32(ip + 7 * vstep * istride, istride);
+        npyv_u32 r_7 = npyv_negative_u32(v_7);
+        npyv_storen_u32(op + 7 * vstep * ostride, ostride, r_7);
+    #endif
+    
+#line 213
+    #if UNROLL > 8
+        npyv_u32 v_8 = npyv_loadn_u32(ip + 8 * vstep * istride, istride);
+        npyv_u32 r_8 = npyv_negative_u32(v_8);
+        npyv_storen_u32(op + 8 * vstep * ostride, ostride, r_8);
+    #endif
+    
+#line 213
+    #if UNROLL > 9
+        npyv_u32 v_9 = npyv_loadn_u32(ip + 9 * vstep * istride, istride);
+        npyv_u32 r_9 = npyv_negative_u32(v_9);
+        npyv_storen_u32(op + 9 * vstep * ostride, ostride, r_9);
+    #endif
+    
+#line 213
+    #if UNROLL > 10
+        npyv_u32 v_10 = npyv_loadn_u32(ip + 10 * vstep * istride, istride);
+        npyv_u32 r_10 = npyv_negative_u32(v_10);
+        npyv_storen_u32(op + 10 * vstep * ostride, ostride, r_10);
+    #endif
+    
+#line 213
+    #if UNROLL > 11
+        npyv_u32 v_11 = npyv_loadn_u32(ip + 11 * vstep * istride, istride);
+        npyv_u32 r_11 = npyv_negative_u32(v_11);
+        npyv_storen_u32(op + 11 * vstep * ostride, ostride, r_11);
+    #endif
+    
+#line 213
+    #if UNROLL > 12
+        npyv_u32 v_12 = npyv_loadn_u32(ip + 12 * vstep * istride, istride);
+        npyv_u32 r_12 = npyv_negative_u32(v_12);
+        npyv_storen_u32(op + 12 * vstep * ostride, ostride, r_12);
+    #endif
+    
+#line 213
+    #if UNROLL > 13
+        npyv_u32 v_13 = npyv_loadn_u32(ip + 13 * vstep * istride, istride);
+        npyv_u32 r_13 = npyv_negative_u32(v_13);
+        npyv_storen_u32(op + 13 * vstep * ostride, ostride, r_13);
+    #endif
+    
+#line 213
+    #if UNROLL > 14
+        npyv_u32 v_14 = npyv_loadn_u32(ip + 14 * vstep * istride, istride);
+        npyv_u32 r_14 = npyv_negative_u32(v_14);
+        npyv_storen_u32(op + 14 * vstep * ostride, ostride, r_14);
+    #endif
+    
+#line 213
+    #if UNROLL > 15
+        npyv_u32 v_15 = npyv_loadn_u32(ip + 15 * vstep * istride, istride);
+        npyv_u32 r_15 = npyv_negative_u32(v_15);
+        npyv_storen_u32(op + 15 * vstep * ostride, ostride, r_15);
+    #endif
+    
+    }
+    // single vector loop: one vector per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+        npyv_u32 v = npyv_loadn_u32(ip, istride);
+        npyv_u32 r = npyv_negative_u32(v);
+        npyv_storen_u32(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements are left at this point)
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = scalar_negative(*ip); // scalar_negative is defined outside this function
+    }
+}
+#endif // NPY_HAVE_SSE2
+#endif // 1
+#undef UNROLL
+#endif // NPY_SIMD
+/*end repeat1**/
+
+#line 80
+#line 85
+#if NPY_SIMD
+#if 4 < 1 // 4 is presumably the unroll factor substituted by the code generator; for this value the guard never fires
+#error "Unroll must be at least 1"
+#elif NPY_SIMD != 128 && 4 > 2
+// Avoid memory bandwidth bottleneck for larger SIMD: any NPY_SIMD value other than 128 gets an unroll factor of 2
+#define UNROLL 2
+#else // NPY_SIMD == 128: keep the full factor
+#define UNROLL 4
+#endif
+// contiguous input and output: applies negative to len elements; ip and op both advance one element at a time.
+static NPY_INLINE void
+simd_unary_cc_negative_s64(const npyv_lanetype_s64 *ip,
+                             npyv_lanetype_s64 *op,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_s64;
+    const int wstep = vstep * UNROLL;
+    // vstep: elements per vector op; wstep: elements per unrolled-loop iteration (UNROLL vectors).
+    // unrolled vector loop: runs while at least wstep elements remain
+    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+    #line 108
+    #if UNROLL > 0
+        npyv_s64 v_0 = npyv_load_s64(ip + 0 * vstep);
+        npyv_s64 r_0 = npyv_negative_s64(v_0);
+        npyv_store_s64(op + 0 * vstep, r_0);
+    #endif
+    // sections numbered >= UNROLL are dropped by their #if guard
+#line 108
+    #if UNROLL > 1
+        npyv_s64 v_1 = npyv_load_s64(ip + 1 * vstep);
+        npyv_s64 r_1 = npyv_negative_s64(v_1);
+        npyv_store_s64(op + 1 * vstep, r_1);
+    #endif
+    
+#line 108
+    #if UNROLL > 2
+        npyv_s64 v_2 = npyv_load_s64(ip + 2 * vstep);
+        npyv_s64 r_2 = npyv_negative_s64(v_2);
+        npyv_store_s64(op + 2 * vstep, r_2);
+    #endif
+    
+#line 108
+    #if UNROLL > 3
+        npyv_s64 v_3 = npyv_load_s64(ip + 3 * vstep);
+        npyv_s64 r_3 = npyv_negative_s64(v_3);
+        npyv_store_s64(op + 3 * vstep, r_3);
+    #endif
+    
+#line 108
+    #if UNROLL > 4
+        npyv_s64 v_4 = npyv_load_s64(ip + 4 * vstep);
+        npyv_s64 r_4 = npyv_negative_s64(v_4);
+        npyv_store_s64(op + 4 * vstep, r_4);
+    #endif
+    
+#line 108
+    #if UNROLL > 5
+        npyv_s64 v_5 = npyv_load_s64(ip + 5 * vstep);
+        npyv_s64 r_5 = npyv_negative_s64(v_5);
+        npyv_store_s64(op + 5 * vstep, r_5);
+    #endif
+    
+#line 108
+    #if UNROLL > 6
+        npyv_s64 v_6 = npyv_load_s64(ip + 6 * vstep);
+        npyv_s64 r_6 = npyv_negative_s64(v_6);
+        npyv_store_s64(op + 6 * vstep, r_6);
+    #endif
+    
+#line 108
+    #if UNROLL > 7
+        npyv_s64 v_7 = npyv_load_s64(ip + 7 * vstep);
+        npyv_s64 r_7 = npyv_negative_s64(v_7);
+        npyv_store_s64(op + 7 * vstep, r_7);
+    #endif
+    
+#line 108
+    #if UNROLL > 8
+        npyv_s64 v_8 = npyv_load_s64(ip + 8 * vstep);
+        npyv_s64 r_8 = npyv_negative_s64(v_8);
+        npyv_store_s64(op + 8 * vstep, r_8);
+    #endif
+    
+#line 108
+    #if UNROLL > 9
+        npyv_s64 v_9 = npyv_load_s64(ip + 9 * vstep);
+        npyv_s64 r_9 = npyv_negative_s64(v_9);
+        npyv_store_s64(op + 9 * vstep, r_9);
+    #endif
+    
+#line 108
+    #if UNROLL > 10
+        npyv_s64 v_10 = npyv_load_s64(ip + 10 * vstep);
+        npyv_s64 r_10 = npyv_negative_s64(v_10);
+        npyv_store_s64(op + 10 * vstep, r_10);
+    #endif
+    
+#line 108
+    #if UNROLL > 11
+        npyv_s64 v_11 = npyv_load_s64(ip + 11 * vstep);
+        npyv_s64 r_11 = npyv_negative_s64(v_11);
+        npyv_store_s64(op + 11 * vstep, r_11);
+    #endif
+    
+#line 108
+    #if UNROLL > 12
+        npyv_s64 v_12 = npyv_load_s64(ip + 12 * vstep);
+        npyv_s64 r_12 = npyv_negative_s64(v_12);
+        npyv_store_s64(op + 12 * vstep, r_12);
+    #endif
+    
+#line 108
+    #if UNROLL > 13
+        npyv_s64 v_13 = npyv_load_s64(ip + 13 * vstep);
+        npyv_s64 r_13 = npyv_negative_s64(v_13);
+        npyv_store_s64(op + 13 * vstep, r_13);
+    #endif
+    
+#line 108
+    #if UNROLL > 14
+        npyv_s64 v_14 = npyv_load_s64(ip + 14 * vstep);
+        npyv_s64 r_14 = npyv_negative_s64(v_14);
+        npyv_store_s64(op + 14 * vstep, r_14);
+    #endif
+    
+#line 108
+    #if UNROLL > 15
+        npyv_s64 v_15 = npyv_load_s64(ip + 15 * vstep);
+        npyv_s64 r_15 = npyv_negative_s64(v_15);
+        npyv_store_s64(op + 15 * vstep, r_15);
+    #endif
+    
+    }
+    // single vector loop: handles leftover full vectors, vstep elements per iteration
+    for (; len >= vstep; len -= vstep, ip += vstep, op +=vstep) {
+        npyv_s64 v = npyv_load_s64(ip);
+        npyv_s64 r = npyv_negative_s64(v);
+        npyv_store_s64(op, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements) via scalar_negative
+    for (; len > 0; --len, ++ip, ++op) {
+        *op = scalar_negative(*ip);
+    }
+}
+
+#if 1
+// contiguous input, non-contiguous output: ip advances one element at a time, op advances ostride elements (not bytes) per result.
+static NPY_INLINE void
+simd_unary_cn_negative_s64(const npyv_lanetype_s64 *ip,
+                             npyv_lanetype_s64 *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_s64;
+    const int wstep = vstep * UNROLL;
+    // vstep: elements per vector op; wstep: elements per unrolled-loop iteration (UNROLL vectors).
+    // unrolled vector loop: runs while at least wstep elements remain
+    for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) {
+    #line 142
+    #if UNROLL > 0
+        npyv_s64 v_0 = npyv_load_s64(ip + 0 * vstep);
+        npyv_s64 r_0 = npyv_negative_s64(v_0);
+        npyv_storen_s64(op + 0 * vstep * ostride, ostride, r_0);
+    #endif
+    // sections numbered >= UNROLL are dropped by their #if guard
+#line 142
+    #if UNROLL > 1
+        npyv_s64 v_1 = npyv_load_s64(ip + 1 * vstep);
+        npyv_s64 r_1 = npyv_negative_s64(v_1);
+        npyv_storen_s64(op + 1 * vstep * ostride, ostride, r_1);
+    #endif
+    
+#line 142
+    #if UNROLL > 2
+        npyv_s64 v_2 = npyv_load_s64(ip + 2 * vstep);
+        npyv_s64 r_2 = npyv_negative_s64(v_2);
+        npyv_storen_s64(op + 2 * vstep * ostride, ostride, r_2);
+    #endif
+    
+#line 142
+    #if UNROLL > 3
+        npyv_s64 v_3 = npyv_load_s64(ip + 3 * vstep);
+        npyv_s64 r_3 = npyv_negative_s64(v_3);
+        npyv_storen_s64(op + 3 * vstep * ostride, ostride, r_3);
+    #endif
+    
+#line 142
+    #if UNROLL > 4
+        npyv_s64 v_4 = npyv_load_s64(ip + 4 * vstep);
+        npyv_s64 r_4 = npyv_negative_s64(v_4);
+        npyv_storen_s64(op + 4 * vstep * ostride, ostride, r_4);
+    #endif
+    
+#line 142
+    #if UNROLL > 5
+        npyv_s64 v_5 = npyv_load_s64(ip + 5 * vstep);
+        npyv_s64 r_5 = npyv_negative_s64(v_5);
+        npyv_storen_s64(op + 5 * vstep * ostride, ostride, r_5);
+    #endif
+    
+#line 142
+    #if UNROLL > 6
+        npyv_s64 v_6 = npyv_load_s64(ip + 6 * vstep);
+        npyv_s64 r_6 = npyv_negative_s64(v_6);
+        npyv_storen_s64(op + 6 * vstep * ostride, ostride, r_6);
+    #endif
+    
+#line 142
+    #if UNROLL > 7
+        npyv_s64 v_7 = npyv_load_s64(ip + 7 * vstep);
+        npyv_s64 r_7 = npyv_negative_s64(v_7);
+        npyv_storen_s64(op + 7 * vstep * ostride, ostride, r_7);
+    #endif
+    
+#line 142
+    #if UNROLL > 8
+        npyv_s64 v_8 = npyv_load_s64(ip + 8 * vstep);
+        npyv_s64 r_8 = npyv_negative_s64(v_8);
+        npyv_storen_s64(op + 8 * vstep * ostride, ostride, r_8);
+    #endif
+    
+#line 142
+    #if UNROLL > 9
+        npyv_s64 v_9 = npyv_load_s64(ip + 9 * vstep);
+        npyv_s64 r_9 = npyv_negative_s64(v_9);
+        npyv_storen_s64(op + 9 * vstep * ostride, ostride, r_9);
+    #endif
+    
+#line 142
+    #if UNROLL > 10
+        npyv_s64 v_10 = npyv_load_s64(ip + 10 * vstep);
+        npyv_s64 r_10 = npyv_negative_s64(v_10);
+        npyv_storen_s64(op + 10 * vstep * ostride, ostride, r_10);
+    #endif
+    
+#line 142
+    #if UNROLL > 11
+        npyv_s64 v_11 = npyv_load_s64(ip + 11 * vstep);
+        npyv_s64 r_11 = npyv_negative_s64(v_11);
+        npyv_storen_s64(op + 11 * vstep * ostride, ostride, r_11);
+    #endif
+    
+#line 142
+    #if UNROLL > 12
+        npyv_s64 v_12 = npyv_load_s64(ip + 12 * vstep);
+        npyv_s64 r_12 = npyv_negative_s64(v_12);
+        npyv_storen_s64(op + 12 * vstep * ostride, ostride, r_12);
+    #endif
+    
+#line 142
+    #if UNROLL > 13
+        npyv_s64 v_13 = npyv_load_s64(ip + 13 * vstep);
+        npyv_s64 r_13 = npyv_negative_s64(v_13);
+        npyv_storen_s64(op + 13 * vstep * ostride, ostride, r_13);
+    #endif
+    
+#line 142
+    #if UNROLL > 14
+        npyv_s64 v_14 = npyv_load_s64(ip + 14 * vstep);
+        npyv_s64 r_14 = npyv_negative_s64(v_14);
+        npyv_storen_s64(op + 14 * vstep * ostride, ostride, r_14);
+    #endif
+    
+#line 142
+    #if UNROLL > 15
+        npyv_s64 v_15 = npyv_load_s64(ip + 15 * vstep);
+        npyv_s64 r_15 = npyv_negative_s64(v_15);
+        npyv_storen_s64(op + 15 * vstep * ostride, ostride, r_15);
+    #endif
+    
+    }
+    // single vector loop: handles leftover full vectors, vstep elements per iteration
+    for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) {
+        npyv_s64 v = npyv_load_s64(ip);
+        npyv_s64 r = npyv_negative_s64(v);
+        npyv_storen_s64(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements) via scalar_negative
+    for (; len > 0; --len, ++ip, op += ostride) {
+        *op = scalar_negative(*ip);
+    }
+}
+// non-contiguous input, contiguous output: ip advances istride elements (not bytes) per input, op advances one element at a time.
+static NPY_INLINE void
+simd_unary_nc_negative_s64(const npyv_lanetype_s64 *ip, npy_intp istride,
+                             npyv_lanetype_s64 *op,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_s64;
+    const int wstep = vstep * UNROLL;
+    // vstep: elements per vector op; wstep: elements per unrolled-loop iteration (UNROLL vectors).
+    // unrolled vector loop: runs while at least wstep elements remain
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) {
+    #line 174
+    #if UNROLL > 0
+        npyv_s64 v_0 = npyv_loadn_s64(ip + 0 * vstep * istride, istride);
+        npyv_s64 r_0 = npyv_negative_s64(v_0);
+        npyv_store_s64(op + 0 * vstep, r_0);
+    #endif
+    // sections numbered >= UNROLL are dropped by their #if guard
+#line 174
+    #if UNROLL > 1
+        npyv_s64 v_1 = npyv_loadn_s64(ip + 1 * vstep * istride, istride);
+        npyv_s64 r_1 = npyv_negative_s64(v_1);
+        npyv_store_s64(op + 1 * vstep, r_1);
+    #endif
+    
+#line 174
+    #if UNROLL > 2
+        npyv_s64 v_2 = npyv_loadn_s64(ip + 2 * vstep * istride, istride);
+        npyv_s64 r_2 = npyv_negative_s64(v_2);
+        npyv_store_s64(op + 2 * vstep, r_2);
+    #endif
+    
+#line 174
+    #if UNROLL > 3
+        npyv_s64 v_3 = npyv_loadn_s64(ip + 3 * vstep * istride, istride);
+        npyv_s64 r_3 = npyv_negative_s64(v_3);
+        npyv_store_s64(op + 3 * vstep, r_3);
+    #endif
+    
+#line 174
+    #if UNROLL > 4
+        npyv_s64 v_4 = npyv_loadn_s64(ip + 4 * vstep * istride, istride);
+        npyv_s64 r_4 = npyv_negative_s64(v_4);
+        npyv_store_s64(op + 4 * vstep, r_4);
+    #endif
+    
+#line 174
+    #if UNROLL > 5
+        npyv_s64 v_5 = npyv_loadn_s64(ip + 5 * vstep * istride, istride);
+        npyv_s64 r_5 = npyv_negative_s64(v_5);
+        npyv_store_s64(op + 5 * vstep, r_5);
+    #endif
+    
+#line 174
+    #if UNROLL > 6
+        npyv_s64 v_6 = npyv_loadn_s64(ip + 6 * vstep * istride, istride);
+        npyv_s64 r_6 = npyv_negative_s64(v_6);
+        npyv_store_s64(op + 6 * vstep, r_6);
+    #endif
+    
+#line 174
+    #if UNROLL > 7
+        npyv_s64 v_7 = npyv_loadn_s64(ip + 7 * vstep * istride, istride);
+        npyv_s64 r_7 = npyv_negative_s64(v_7);
+        npyv_store_s64(op + 7 * vstep, r_7);
+    #endif
+    
+#line 174
+    #if UNROLL > 8
+        npyv_s64 v_8 = npyv_loadn_s64(ip + 8 * vstep * istride, istride);
+        npyv_s64 r_8 = npyv_negative_s64(v_8);
+        npyv_store_s64(op + 8 * vstep, r_8);
+    #endif
+    
+#line 174
+    #if UNROLL > 9
+        npyv_s64 v_9 = npyv_loadn_s64(ip + 9 * vstep * istride, istride);
+        npyv_s64 r_9 = npyv_negative_s64(v_9);
+        npyv_store_s64(op + 9 * vstep, r_9);
+    #endif
+    
+#line 174
+    #if UNROLL > 10
+        npyv_s64 v_10 = npyv_loadn_s64(ip + 10 * vstep * istride, istride);
+        npyv_s64 r_10 = npyv_negative_s64(v_10);
+        npyv_store_s64(op + 10 * vstep, r_10);
+    #endif
+    
+#line 174
+    #if UNROLL > 11
+        npyv_s64 v_11 = npyv_loadn_s64(ip + 11 * vstep * istride, istride);
+        npyv_s64 r_11 = npyv_negative_s64(v_11);
+        npyv_store_s64(op + 11 * vstep, r_11);
+    #endif
+    
+#line 174
+    #if UNROLL > 12
+        npyv_s64 v_12 = npyv_loadn_s64(ip + 12 * vstep * istride, istride);
+        npyv_s64 r_12 = npyv_negative_s64(v_12);
+        npyv_store_s64(op + 12 * vstep, r_12);
+    #endif
+    
+#line 174
+    #if UNROLL > 13
+        npyv_s64 v_13 = npyv_loadn_s64(ip + 13 * vstep * istride, istride);
+        npyv_s64 r_13 = npyv_negative_s64(v_13);
+        npyv_store_s64(op + 13 * vstep, r_13);
+    #endif
+    
+#line 174
+    #if UNROLL > 14
+        npyv_s64 v_14 = npyv_loadn_s64(ip + 14 * vstep * istride, istride);
+        npyv_s64 r_14 = npyv_negative_s64(v_14);
+        npyv_store_s64(op + 14 * vstep, r_14);
+    #endif
+    
+#line 174
+    #if UNROLL > 15
+        npyv_s64 v_15 = npyv_loadn_s64(ip + 15 * vstep * istride, istride);
+        npyv_s64 r_15 = npyv_negative_s64(v_15);
+        npyv_store_s64(op + 15 * vstep, r_15);
+    #endif
+    
+    }
+    // single vector loop: handles leftover full vectors, vstep elements per iteration
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) {
+        npyv_s64 v = npyv_loadn_s64(ip, istride);
+        npyv_s64 r = npyv_negative_s64(v);
+        npyv_store_s64(op, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements) via scalar_negative
+    for (; len > 0; --len, ip += istride, ++op) {
+        *op = scalar_negative(*ip);
+    }
+}
+// non-contiguous input and output: ip advances istride elements and op advances ostride elements (not bytes) per element.
+// limit unroll to 2x: cap UNROLL at 2 for the kernel below
+#if UNROLL > 2
+#undef UNROLL
+#define UNROLL 2
+#endif
+// X86 does better with unrolled scalar for heavy non-contiguous, so this kernel is compiled out when NPY_HAVE_SSE2 is defined
+#ifndef NPY_HAVE_SSE2
+static NPY_INLINE void
+simd_unary_nn_negative_s64(const npyv_lanetype_s64 *ip, npy_intp istride,
+                             npyv_lanetype_s64 *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_s64;
+    const int wstep = vstep * UNROLL;
+    // vstep: elements per vector op; wstep: elements per unrolled-loop iteration (UNROLL vectors, at most 2 here).
+    // unrolled vector loop: runs while at least wstep elements remain
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+    #line 213
+    #if UNROLL > 0
+        npyv_s64 v_0 = npyv_loadn_s64(ip + 0 * vstep * istride, istride);
+        npyv_s64 r_0 = npyv_negative_s64(v_0);
+        npyv_storen_s64(op + 0 * vstep * ostride, ostride, r_0);
+    #endif
+    // sections numbered >= UNROLL are dropped by their #if guard
+#line 213
+    #if UNROLL > 1
+        npyv_s64 v_1 = npyv_loadn_s64(ip + 1 * vstep * istride, istride);
+        npyv_s64 r_1 = npyv_negative_s64(v_1);
+        npyv_storen_s64(op + 1 * vstep * ostride, ostride, r_1);
+    #endif
+    
+#line 213
+    #if UNROLL > 2
+        npyv_s64 v_2 = npyv_loadn_s64(ip + 2 * vstep * istride, istride);
+        npyv_s64 r_2 = npyv_negative_s64(v_2);
+        npyv_storen_s64(op + 2 * vstep * ostride, ostride, r_2);
+    #endif
+    
+#line 213
+    #if UNROLL > 3
+        npyv_s64 v_3 = npyv_loadn_s64(ip + 3 * vstep * istride, istride);
+        npyv_s64 r_3 = npyv_negative_s64(v_3);
+        npyv_storen_s64(op + 3 * vstep * ostride, ostride, r_3);
+    #endif
+    
+#line 213
+    #if UNROLL > 4
+        npyv_s64 v_4 = npyv_loadn_s64(ip + 4 * vstep * istride, istride);
+        npyv_s64 r_4 = npyv_negative_s64(v_4);
+        npyv_storen_s64(op + 4 * vstep * ostride, ostride, r_4);
+    #endif
+    
+#line 213
+    #if UNROLL > 5
+        npyv_s64 v_5 = npyv_loadn_s64(ip + 5 * vstep * istride, istride);
+        npyv_s64 r_5 = npyv_negative_s64(v_5);
+        npyv_storen_s64(op + 5 * vstep * ostride, ostride, r_5);
+    #endif
+    
+#line 213
+    #if UNROLL > 6
+        npyv_s64 v_6 = npyv_loadn_s64(ip + 6 * vstep * istride, istride);
+        npyv_s64 r_6 = npyv_negative_s64(v_6);
+        npyv_storen_s64(op + 6 * vstep * ostride, ostride, r_6);
+    #endif
+    
+#line 213
+    #if UNROLL > 7
+        npyv_s64 v_7 = npyv_loadn_s64(ip + 7 * vstep * istride, istride);
+        npyv_s64 r_7 = npyv_negative_s64(v_7);
+        npyv_storen_s64(op + 7 * vstep * ostride, ostride, r_7);
+    #endif
+    
+#line 213
+    #if UNROLL > 8
+        npyv_s64 v_8 = npyv_loadn_s64(ip + 8 * vstep * istride, istride);
+        npyv_s64 r_8 = npyv_negative_s64(v_8);
+        npyv_storen_s64(op + 8 * vstep * ostride, ostride, r_8);
+    #endif
+    
+#line 213
+    #if UNROLL > 9
+        npyv_s64 v_9 = npyv_loadn_s64(ip + 9 * vstep * istride, istride);
+        npyv_s64 r_9 = npyv_negative_s64(v_9);
+        npyv_storen_s64(op + 9 * vstep * ostride, ostride, r_9);
+    #endif
+    
+#line 213
+    #if UNROLL > 10
+        npyv_s64 v_10 = npyv_loadn_s64(ip + 10 * vstep * istride, istride);
+        npyv_s64 r_10 = npyv_negative_s64(v_10);
+        npyv_storen_s64(op + 10 * vstep * ostride, ostride, r_10);
+    #endif
+    
+#line 213
+    #if UNROLL > 11
+        npyv_s64 v_11 = npyv_loadn_s64(ip + 11 * vstep * istride, istride);
+        npyv_s64 r_11 = npyv_negative_s64(v_11);
+        npyv_storen_s64(op + 11 * vstep * ostride, ostride, r_11);
+    #endif
+    
+#line 213
+    #if UNROLL > 12
+        npyv_s64 v_12 = npyv_loadn_s64(ip + 12 * vstep * istride, istride);
+        npyv_s64 r_12 = npyv_negative_s64(v_12);
+        npyv_storen_s64(op + 12 * vstep * ostride, ostride, r_12);
+    #endif
+    
+#line 213
+    #if UNROLL > 13
+        npyv_s64 v_13 = npyv_loadn_s64(ip + 13 * vstep * istride, istride);
+        npyv_s64 r_13 = npyv_negative_s64(v_13);
+        npyv_storen_s64(op + 13 * vstep * ostride, ostride, r_13);
+    #endif
+    
+#line 213
+    #if UNROLL > 14
+        npyv_s64 v_14 = npyv_loadn_s64(ip + 14 * vstep * istride, istride);
+        npyv_s64 r_14 = npyv_negative_s64(v_14);
+        npyv_storen_s64(op + 14 * vstep * ostride, ostride, r_14);
+    #endif
+    
+#line 213
+    #if UNROLL > 15
+        npyv_s64 v_15 = npyv_loadn_s64(ip + 15 * vstep * istride, istride);
+        npyv_s64 r_15 = npyv_negative_s64(v_15);
+        npyv_storen_s64(op + 15 * vstep * ostride, ostride, r_15);
+    #endif
+    
+    }
+    // single vector loop: handles leftover full vectors, vstep elements per iteration
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+        npyv_s64 v = npyv_loadn_s64(ip, istride);
+        npyv_s64 r = npyv_negative_s64(v);
+        npyv_storen_s64(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements) via scalar_negative
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = scalar_negative(*ip);
+    }
+}
+#endif // NPY_HAVE_SSE2
+#endif // 1
+#undef UNROLL
+#endif // NPY_SIMD
+/*end repeat1**/
+
+#line 80
+#line 85
+#if NPY_SIMD
+#if 4 < 1
+#error "Unroll must be at least 1"
+#elif NPY_SIMD != 128 && 4 > 2
+// NPY_SIMD other than 128 (larger SIMD): unroll 2x instead of 4x to avoid a memory bandwidth bottleneck
+#define UNROLL 2
+#else
+#define UNROLL 4
+#endif
+// contiguous input and output: applies negative to len elements; ip and op both advance one element at a time.
+static NPY_INLINE void
+simd_unary_cc_negative_u64(const npyv_lanetype_u64 *ip,
+                             npyv_lanetype_u64 *op,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_u64;
+    const int wstep = vstep * UNROLL;
+    // vstep: elements per vector op; wstep: elements per unrolled-loop iteration (UNROLL vectors).
+    // unrolled vector loop: runs while at least wstep elements remain
+    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+    #line 108
+    #if UNROLL > 0
+        npyv_u64 v_0 = npyv_load_u64(ip + 0 * vstep);
+        npyv_u64 r_0 = npyv_negative_u64(v_0);
+        npyv_store_u64(op + 0 * vstep, r_0);
+    #endif
+    // sections numbered >= UNROLL are dropped by their #if guard
+#line 108
+    #if UNROLL > 1
+        npyv_u64 v_1 = npyv_load_u64(ip + 1 * vstep);
+        npyv_u64 r_1 = npyv_negative_u64(v_1);
+        npyv_store_u64(op + 1 * vstep, r_1);
+    #endif
+    
+#line 108
+    #if UNROLL > 2
+        npyv_u64 v_2 = npyv_load_u64(ip + 2 * vstep);
+        npyv_u64 r_2 = npyv_negative_u64(v_2);
+        npyv_store_u64(op + 2 * vstep, r_2);
+    #endif
+    
+#line 108
+    #if UNROLL > 3
+        npyv_u64 v_3 = npyv_load_u64(ip + 3 * vstep);
+        npyv_u64 r_3 = npyv_negative_u64(v_3);
+        npyv_store_u64(op + 3 * vstep, r_3);
+    #endif
+    
+#line 108
+    #if UNROLL > 4
+        npyv_u64 v_4 = npyv_load_u64(ip + 4 * vstep);
+        npyv_u64 r_4 = npyv_negative_u64(v_4);
+        npyv_store_u64(op + 4 * vstep, r_4);
+    #endif
+    
+#line 108
+    #if UNROLL > 5
+        npyv_u64 v_5 = npyv_load_u64(ip + 5 * vstep);
+        npyv_u64 r_5 = npyv_negative_u64(v_5);
+        npyv_store_u64(op + 5 * vstep, r_5);
+    #endif
+    
+#line 108
+    #if UNROLL > 6
+        npyv_u64 v_6 = npyv_load_u64(ip + 6 * vstep);
+        npyv_u64 r_6 = npyv_negative_u64(v_6);
+        npyv_store_u64(op + 6 * vstep, r_6);
+    #endif
+    
+#line 108
+    #if UNROLL > 7
+        npyv_u64 v_7 = npyv_load_u64(ip + 7 * vstep);
+        npyv_u64 r_7 = npyv_negative_u64(v_7);
+        npyv_store_u64(op + 7 * vstep, r_7);
+    #endif
+    
+#line 108
+    #if UNROLL > 8
+        npyv_u64 v_8 = npyv_load_u64(ip + 8 * vstep);
+        npyv_u64 r_8 = npyv_negative_u64(v_8);
+        npyv_store_u64(op + 8 * vstep, r_8);
+    #endif
+    
+#line 108
+    #if UNROLL > 9
+        npyv_u64 v_9 = npyv_load_u64(ip + 9 * vstep);
+        npyv_u64 r_9 = npyv_negative_u64(v_9);
+        npyv_store_u64(op + 9 * vstep, r_9);
+    #endif
+    
+#line 108
+    #if UNROLL > 10
+        npyv_u64 v_10 = npyv_load_u64(ip + 10 * vstep);
+        npyv_u64 r_10 = npyv_negative_u64(v_10);
+        npyv_store_u64(op + 10 * vstep, r_10);
+    #endif
+    
+#line 108
+    #if UNROLL > 11
+        npyv_u64 v_11 = npyv_load_u64(ip + 11 * vstep);
+        npyv_u64 r_11 = npyv_negative_u64(v_11);
+        npyv_store_u64(op + 11 * vstep, r_11);
+    #endif
+    
+#line 108
+    #if UNROLL > 12
+        npyv_u64 v_12 = npyv_load_u64(ip + 12 * vstep);
+        npyv_u64 r_12 = npyv_negative_u64(v_12);
+        npyv_store_u64(op + 12 * vstep, r_12);
+    #endif
+    
+#line 108
+    #if UNROLL > 13
+        npyv_u64 v_13 = npyv_load_u64(ip + 13 * vstep);
+        npyv_u64 r_13 = npyv_negative_u64(v_13);
+        npyv_store_u64(op + 13 * vstep, r_13);
+    #endif
+    
+#line 108
+    #if UNROLL > 14
+        npyv_u64 v_14 = npyv_load_u64(ip + 14 * vstep);
+        npyv_u64 r_14 = npyv_negative_u64(v_14);
+        npyv_store_u64(op + 14 * vstep, r_14);
+    #endif
+    
+#line 108
+    #if UNROLL > 15
+        npyv_u64 v_15 = npyv_load_u64(ip + 15 * vstep);
+        npyv_u64 r_15 = npyv_negative_u64(v_15);
+        npyv_store_u64(op + 15 * vstep, r_15);
+    #endif
+    
+    }
+    // single vector loop: handles leftover full vectors, vstep elements per iteration
+    for (; len >= vstep; len -= vstep, ip += vstep, op +=vstep) {
+        npyv_u64 v = npyv_load_u64(ip);
+        npyv_u64 r = npyv_negative_u64(v);
+        npyv_store_u64(op, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements) via scalar_negative
+    for (; len > 0; --len, ++ip, ++op) {
+        *op = scalar_negative(*ip);
+    }
+}
+
+#if 1
+// contiguous input, non-contiguous output: ip advances one element at a time, op advances ostride elements (not bytes) per result.
+static NPY_INLINE void
+simd_unary_cn_negative_u64(const npyv_lanetype_u64 *ip,
+                             npyv_lanetype_u64 *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_u64;
+    const int wstep = vstep * UNROLL;
+    // vstep: elements per vector op; wstep: elements per unrolled-loop iteration (UNROLL vectors).
+    // unrolled vector loop: runs while at least wstep elements remain
+    for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) {
+    #line 142
+    #if UNROLL > 0
+        npyv_u64 v_0 = npyv_load_u64(ip + 0 * vstep);
+        npyv_u64 r_0 = npyv_negative_u64(v_0);
+        npyv_storen_u64(op + 0 * vstep * ostride, ostride, r_0);
+    #endif
+    // sections numbered >= UNROLL are dropped by their #if guard
+#line 142
+    #if UNROLL > 1
+        npyv_u64 v_1 = npyv_load_u64(ip + 1 * vstep);
+        npyv_u64 r_1 = npyv_negative_u64(v_1);
+        npyv_storen_u64(op + 1 * vstep * ostride, ostride, r_1);
+    #endif
+    
+#line 142
+    #if UNROLL > 2
+        npyv_u64 v_2 = npyv_load_u64(ip + 2 * vstep);
+        npyv_u64 r_2 = npyv_negative_u64(v_2);
+        npyv_storen_u64(op + 2 * vstep * ostride, ostride, r_2);
+    #endif
+    
+#line 142
+    #if UNROLL > 3
+        npyv_u64 v_3 = npyv_load_u64(ip + 3 * vstep);
+        npyv_u64 r_3 = npyv_negative_u64(v_3);
+        npyv_storen_u64(op + 3 * vstep * ostride, ostride, r_3);
+    #endif
+    
+#line 142
+    #if UNROLL > 4
+        npyv_u64 v_4 = npyv_load_u64(ip + 4 * vstep);
+        npyv_u64 r_4 = npyv_negative_u64(v_4);
+        npyv_storen_u64(op + 4 * vstep * ostride, ostride, r_4);
+    #endif
+    
+#line 142
+    #if UNROLL > 5
+        npyv_u64 v_5 = npyv_load_u64(ip + 5 * vstep);
+        npyv_u64 r_5 = npyv_negative_u64(v_5);
+        npyv_storen_u64(op + 5 * vstep * ostride, ostride, r_5);
+    #endif
+    
+#line 142
+    #if UNROLL > 6
+        npyv_u64 v_6 = npyv_load_u64(ip + 6 * vstep);
+        npyv_u64 r_6 = npyv_negative_u64(v_6);
+        npyv_storen_u64(op + 6 * vstep * ostride, ostride, r_6);
+    #endif
+    
+#line 142
+    #if UNROLL > 7
+        npyv_u64 v_7 = npyv_load_u64(ip + 7 * vstep);
+        npyv_u64 r_7 = npyv_negative_u64(v_7);
+        npyv_storen_u64(op + 7 * vstep * ostride, ostride, r_7);
+    #endif
+    
+#line 142
+    #if UNROLL > 8
+        npyv_u64 v_8 = npyv_load_u64(ip + 8 * vstep);
+        npyv_u64 r_8 = npyv_negative_u64(v_8);
+        npyv_storen_u64(op + 8 * vstep * ostride, ostride, r_8);
+    #endif
+    
+#line 142
+    #if UNROLL > 9
+        npyv_u64 v_9 = npyv_load_u64(ip + 9 * vstep);
+        npyv_u64 r_9 = npyv_negative_u64(v_9);
+        npyv_storen_u64(op + 9 * vstep * ostride, ostride, r_9);
+    #endif
+    
+#line 142
+    #if UNROLL > 10
+        npyv_u64 v_10 = npyv_load_u64(ip + 10 * vstep);
+        npyv_u64 r_10 = npyv_negative_u64(v_10);
+        npyv_storen_u64(op + 10 * vstep * ostride, ostride, r_10);
+    #endif
+    
+#line 142
+    #if UNROLL > 11
+        npyv_u64 v_11 = npyv_load_u64(ip + 11 * vstep);
+        npyv_u64 r_11 = npyv_negative_u64(v_11);
+        npyv_storen_u64(op + 11 * vstep * ostride, ostride, r_11);
+    #endif
+    
+#line 142
+    #if UNROLL > 12
+        npyv_u64 v_12 = npyv_load_u64(ip + 12 * vstep);
+        npyv_u64 r_12 = npyv_negative_u64(v_12);
+        npyv_storen_u64(op + 12 * vstep * ostride, ostride, r_12);
+    #endif
+    
+#line 142
+    #if UNROLL > 13
+        npyv_u64 v_13 = npyv_load_u64(ip + 13 * vstep);
+        npyv_u64 r_13 = npyv_negative_u64(v_13);
+        npyv_storen_u64(op + 13 * vstep * ostride, ostride, r_13);
+    #endif
+    
+#line 142
+    #if UNROLL > 14
+        npyv_u64 v_14 = npyv_load_u64(ip + 14 * vstep);
+        npyv_u64 r_14 = npyv_negative_u64(v_14);
+        npyv_storen_u64(op + 14 * vstep * ostride, ostride, r_14);
+    #endif
+    
+#line 142
+    #if UNROLL > 15
+        npyv_u64 v_15 = npyv_load_u64(ip + 15 * vstep);
+        npyv_u64 r_15 = npyv_negative_u64(v_15);
+        npyv_storen_u64(op + 15 * vstep * ostride, ostride, r_15);
+    #endif
+    
+    }
+    // single vector loop: handles leftover full vectors, vstep elements per iteration
+    for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) {
+        npyv_u64 v = npyv_load_u64(ip);
+        npyv_u64 r = npyv_negative_u64(v);
+        npyv_storen_u64(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements) via scalar_negative
+    for (; len > 0; --len, ++ip, op += ostride) {
+        *op = scalar_negative(*ip);
+    }
+}
+// non-contiguous input, contiguous output: ip advances istride elements (not bytes) per input, op advances one element at a time.
+static NPY_INLINE void
+simd_unary_nc_negative_u64(const npyv_lanetype_u64 *ip, npy_intp istride,
+                             npyv_lanetype_u64 *op,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_u64;
+    const int wstep = vstep * UNROLL;
+    // vstep: elements per vector op; wstep: elements per unrolled-loop iteration (UNROLL vectors).
+    // unrolled vector loop: runs while at least wstep elements remain
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) {
+    #line 174
+    #if UNROLL > 0
+        npyv_u64 v_0 = npyv_loadn_u64(ip + 0 * vstep * istride, istride);
+        npyv_u64 r_0 = npyv_negative_u64(v_0);
+        npyv_store_u64(op + 0 * vstep, r_0);
+    #endif
+    // sections numbered >= UNROLL are dropped by their #if guard
+#line 174
+    #if UNROLL > 1
+        npyv_u64 v_1 = npyv_loadn_u64(ip + 1 * vstep * istride, istride);
+        npyv_u64 r_1 = npyv_negative_u64(v_1);
+        npyv_store_u64(op + 1 * vstep, r_1);
+    #endif
+    
+#line 174
+    #if UNROLL > 2
+        npyv_u64 v_2 = npyv_loadn_u64(ip + 2 * vstep * istride, istride);
+        npyv_u64 r_2 = npyv_negative_u64(v_2);
+        npyv_store_u64(op + 2 * vstep, r_2);
+    #endif
+    
+#line 174
+    #if UNROLL > 3
+        npyv_u64 v_3 = npyv_loadn_u64(ip + 3 * vstep * istride, istride);
+        npyv_u64 r_3 = npyv_negative_u64(v_3);
+        npyv_store_u64(op + 3 * vstep, r_3);
+    #endif
+    
+#line 174
+    #if UNROLL > 4
+        npyv_u64 v_4 = npyv_loadn_u64(ip + 4 * vstep * istride, istride);
+        npyv_u64 r_4 = npyv_negative_u64(v_4);
+        npyv_store_u64(op + 4 * vstep, r_4);
+    #endif
+    
+#line 174
+    #if UNROLL > 5
+        npyv_u64 v_5 = npyv_loadn_u64(ip + 5 * vstep * istride, istride);
+        npyv_u64 r_5 = npyv_negative_u64(v_5);
+        npyv_store_u64(op + 5 * vstep, r_5);
+    #endif
+    
+#line 174
+    #if UNROLL > 6
+        npyv_u64 v_6 = npyv_loadn_u64(ip + 6 * vstep * istride, istride);
+        npyv_u64 r_6 = npyv_negative_u64(v_6);
+        npyv_store_u64(op + 6 * vstep, r_6);
+    #endif
+    
+#line 174
+    #if UNROLL > 7
+        npyv_u64 v_7 = npyv_loadn_u64(ip + 7 * vstep * istride, istride);
+        npyv_u64 r_7 = npyv_negative_u64(v_7);
+        npyv_store_u64(op + 7 * vstep, r_7);
+    #endif
+    
+#line 174
+    #if UNROLL > 8
+        npyv_u64 v_8 = npyv_loadn_u64(ip + 8 * vstep * istride, istride);
+        npyv_u64 r_8 = npyv_negative_u64(v_8);
+        npyv_store_u64(op + 8 * vstep, r_8);
+    #endif
+    
+#line 174
+    #if UNROLL > 9
+        npyv_u64 v_9 = npyv_loadn_u64(ip + 9 * vstep * istride, istride);
+        npyv_u64 r_9 = npyv_negative_u64(v_9);
+        npyv_store_u64(op + 9 * vstep, r_9);
+    #endif
+    
+#line 174
+    #if UNROLL > 10
+        npyv_u64 v_10 = npyv_loadn_u64(ip + 10 * vstep * istride, istride);
+        npyv_u64 r_10 = npyv_negative_u64(v_10);
+        npyv_store_u64(op + 10 * vstep, r_10);
+    #endif
+    
+#line 174
+    #if UNROLL > 11
+        npyv_u64 v_11 = npyv_loadn_u64(ip + 11 * vstep * istride, istride);
+        npyv_u64 r_11 = npyv_negative_u64(v_11);
+        npyv_store_u64(op + 11 * vstep, r_11);
+    #endif
+    
+#line 174
+    #if UNROLL > 12
+        npyv_u64 v_12 = npyv_loadn_u64(ip + 12 * vstep * istride, istride);
+        npyv_u64 r_12 = npyv_negative_u64(v_12);
+        npyv_store_u64(op + 12 * vstep, r_12);
+    #endif
+    
+#line 174
+    #if UNROLL > 13
+        npyv_u64 v_13 = npyv_loadn_u64(ip + 13 * vstep * istride, istride);
+        npyv_u64 r_13 = npyv_negative_u64(v_13);
+        npyv_store_u64(op + 13 * vstep, r_13);
+    #endif
+    
+#line 174
+    #if UNROLL > 14
+        npyv_u64 v_14 = npyv_loadn_u64(ip + 14 * vstep * istride, istride);
+        npyv_u64 r_14 = npyv_negative_u64(v_14);
+        npyv_store_u64(op + 14 * vstep, r_14);
+    #endif
+    
+#line 174
+    #if UNROLL > 15
+        npyv_u64 v_15 = npyv_loadn_u64(ip + 15 * vstep * istride, istride);
+        npyv_u64 r_15 = npyv_negative_u64(v_15);
+        npyv_store_u64(op + 15 * vstep, r_15);
+    #endif
+    
+    }
+    // single vector loop: handles leftover full vectors, vstep elements per iteration
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) {
+        npyv_u64 v = npyv_loadn_u64(ip, istride);
+        npyv_u64 r = npyv_negative_u64(v);
+        npyv_store_u64(op, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements) via scalar_negative
+    for (; len > 0; --len, ip += istride, ++op) {
+        *op = scalar_negative(*ip);
+    }
+}
+// non-contiguous input and output
+// limit unroll to 2x
+#if UNROLL > 2
+#undef UNROLL
+#define UNROLL 2
+#endif
+// X86 does better with unrolled scalar for heavy non-contiguous
+#ifndef NPY_HAVE_SSE2
+static NPY_INLINE void
+simd_unary_nn_negative_u64(const npyv_lanetype_u64 *ip, npy_intp istride,
+                             npyv_lanetype_u64 *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_u64;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+    #line 213
+    #if UNROLL > 0
+        npyv_u64 v_0 = npyv_loadn_u64(ip + 0 * vstep * istride, istride);
+        npyv_u64 r_0 = npyv_negative_u64(v_0);
+        npyv_storen_u64(op + 0 * vstep * ostride, ostride, r_0);
+    #endif
+    
+#line 213
+    #if UNROLL > 1
+        npyv_u64 v_1 = npyv_loadn_u64(ip + 1 * vstep * istride, istride);
+        npyv_u64 r_1 = npyv_negative_u64(v_1);
+        npyv_storen_u64(op + 1 * vstep * ostride, ostride, r_1);
+    #endif
+    
+#line 213
+    #if UNROLL > 2
+        npyv_u64 v_2 = npyv_loadn_u64(ip + 2 * vstep * istride, istride);
+        npyv_u64 r_2 = npyv_negative_u64(v_2);
+        npyv_storen_u64(op + 2 * vstep * ostride, ostride, r_2);
+    #endif
+    
+#line 213
+    #if UNROLL > 3
+        npyv_u64 v_3 = npyv_loadn_u64(ip + 3 * vstep * istride, istride);
+        npyv_u64 r_3 = npyv_negative_u64(v_3);
+        npyv_storen_u64(op + 3 * vstep * ostride, ostride, r_3);
+    #endif
+    
+#line 213
+    #if UNROLL > 4
+        npyv_u64 v_4 = npyv_loadn_u64(ip + 4 * vstep * istride, istride);
+        npyv_u64 r_4 = npyv_negative_u64(v_4);
+        npyv_storen_u64(op + 4 * vstep * ostride, ostride, r_4);
+    #endif
+    
+#line 213
+    #if UNROLL > 5
+        npyv_u64 v_5 = npyv_loadn_u64(ip + 5 * vstep * istride, istride);
+        npyv_u64 r_5 = npyv_negative_u64(v_5);
+        npyv_storen_u64(op + 5 * vstep * ostride, ostride, r_5);
+    #endif
+    
+#line 213
+    #if UNROLL > 6
+        npyv_u64 v_6 = npyv_loadn_u64(ip + 6 * vstep * istride, istride);
+        npyv_u64 r_6 = npyv_negative_u64(v_6);
+        npyv_storen_u64(op + 6 * vstep * ostride, ostride, r_6);
+    #endif
+    
+#line 213
+    #if UNROLL > 7
+        npyv_u64 v_7 = npyv_loadn_u64(ip + 7 * vstep * istride, istride);
+        npyv_u64 r_7 = npyv_negative_u64(v_7);
+        npyv_storen_u64(op + 7 * vstep * ostride, ostride, r_7);
+    #endif
+    
+#line 213
+    #if UNROLL > 8
+        npyv_u64 v_8 = npyv_loadn_u64(ip + 8 * vstep * istride, istride);
+        npyv_u64 r_8 = npyv_negative_u64(v_8);
+        npyv_storen_u64(op + 8 * vstep * ostride, ostride, r_8);
+    #endif
+    
+#line 213
+    #if UNROLL > 9
+        npyv_u64 v_9 = npyv_loadn_u64(ip + 9 * vstep * istride, istride);
+        npyv_u64 r_9 = npyv_negative_u64(v_9);
+        npyv_storen_u64(op + 9 * vstep * ostride, ostride, r_9);
+    #endif
+    
+#line 213
+    #if UNROLL > 10
+        npyv_u64 v_10 = npyv_loadn_u64(ip + 10 * vstep * istride, istride);
+        npyv_u64 r_10 = npyv_negative_u64(v_10);
+        npyv_storen_u64(op + 10 * vstep * ostride, ostride, r_10);
+    #endif
+    
+#line 213
+    #if UNROLL > 11
+        npyv_u64 v_11 = npyv_loadn_u64(ip + 11 * vstep * istride, istride);
+        npyv_u64 r_11 = npyv_negative_u64(v_11);
+        npyv_storen_u64(op + 11 * vstep * ostride, ostride, r_11);
+    #endif
+    
+#line 213
+    #if UNROLL > 12
+        npyv_u64 v_12 = npyv_loadn_u64(ip + 12 * vstep * istride, istride);
+        npyv_u64 r_12 = npyv_negative_u64(v_12);
+        npyv_storen_u64(op + 12 * vstep * ostride, ostride, r_12);
+    #endif
+    
+#line 213
+    #if UNROLL > 13
+        npyv_u64 v_13 = npyv_loadn_u64(ip + 13 * vstep * istride, istride);
+        npyv_u64 r_13 = npyv_negative_u64(v_13);
+        npyv_storen_u64(op + 13 * vstep * ostride, ostride, r_13);
+    #endif
+    
+#line 213
+    #if UNROLL > 14
+        npyv_u64 v_14 = npyv_loadn_u64(ip + 14 * vstep * istride, istride);
+        npyv_u64 r_14 = npyv_negative_u64(v_14);
+        npyv_storen_u64(op + 14 * vstep * ostride, ostride, r_14);
+    #endif
+    
+#line 213
+    #if UNROLL > 15
+        npyv_u64 v_15 = npyv_loadn_u64(ip + 15 * vstep * istride, istride);
+        npyv_u64 r_15 = npyv_negative_u64(v_15);
+        npyv_storen_u64(op + 15 * vstep * ostride, ostride, r_15);
+    #endif
+    
+    }
+    // single vector loop
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+        npyv_u64 v = npyv_loadn_u64(ip, istride);
+        npyv_u64 r = npyv_negative_u64(v);
+        npyv_storen_u64(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = scalar_negative(*ip);
+    }
+}
+#endif // NPY_HAVE_SSE2
+#endif // 1
+#undef UNROLL
+#endif // NPY_SIMD
+/*end repeat1**/
+
+#line 80
+#line 85
+#if NPY_SIMD_F32
+#if 4 < 1
+#error "Unroll must be at least 1"
+#elif NPY_SIMD != 128 && 4 > 2
+// Avoid memory bandwidth bottleneck for larger SIMD: whenever NPY_SIMD is not 128, unroll 2x instead of 4x
+#define UNROLL 2
+#else
+#define UNROLL 4
+#endif
+// contiguous inputs and output: applies npyv_negative_f32 / scalar_negative to `len` elements read from `ip`, writing to `op` (both unit stride).
+static NPY_INLINE void
+simd_unary_cc_negative_f32(const npyv_lanetype_f32 *ip,
+                             npyv_lanetype_f32 *op,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_f32;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop: each pass consumes wstep (= vstep * UNROLL) elements; blocks with index >= UNROLL are compiled out
+    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+    #line 108
+    #if UNROLL > 0
+        npyv_f32 v_0 = npyv_load_f32(ip + 0 * vstep);
+        npyv_f32 r_0 = npyv_negative_f32(v_0);
+        npyv_store_f32(op + 0 * vstep, r_0);
+    #endif
+    
+#line 108
+    #if UNROLL > 1
+        npyv_f32 v_1 = npyv_load_f32(ip + 1 * vstep);
+        npyv_f32 r_1 = npyv_negative_f32(v_1);
+        npyv_store_f32(op + 1 * vstep, r_1);
+    #endif
+    
+#line 108
+    #if UNROLL > 2
+        npyv_f32 v_2 = npyv_load_f32(ip + 2 * vstep);
+        npyv_f32 r_2 = npyv_negative_f32(v_2);
+        npyv_store_f32(op + 2 * vstep, r_2);
+    #endif
+    
+#line 108
+    #if UNROLL > 3
+        npyv_f32 v_3 = npyv_load_f32(ip + 3 * vstep);
+        npyv_f32 r_3 = npyv_negative_f32(v_3);
+        npyv_store_f32(op + 3 * vstep, r_3);
+    #endif
+    
+#line 108
+    #if UNROLL > 4
+        npyv_f32 v_4 = npyv_load_f32(ip + 4 * vstep);
+        npyv_f32 r_4 = npyv_negative_f32(v_4);
+        npyv_store_f32(op + 4 * vstep, r_4);
+    #endif
+    
+#line 108
+    #if UNROLL > 5
+        npyv_f32 v_5 = npyv_load_f32(ip + 5 * vstep);
+        npyv_f32 r_5 = npyv_negative_f32(v_5);
+        npyv_store_f32(op + 5 * vstep, r_5);
+    #endif
+    
+#line 108
+    #if UNROLL > 6
+        npyv_f32 v_6 = npyv_load_f32(ip + 6 * vstep);
+        npyv_f32 r_6 = npyv_negative_f32(v_6);
+        npyv_store_f32(op + 6 * vstep, r_6);
+    #endif
+    
+#line 108
+    #if UNROLL > 7
+        npyv_f32 v_7 = npyv_load_f32(ip + 7 * vstep);
+        npyv_f32 r_7 = npyv_negative_f32(v_7);
+        npyv_store_f32(op + 7 * vstep, r_7);
+    #endif
+    
+#line 108
+    #if UNROLL > 8
+        npyv_f32 v_8 = npyv_load_f32(ip + 8 * vstep);
+        npyv_f32 r_8 = npyv_negative_f32(v_8);
+        npyv_store_f32(op + 8 * vstep, r_8);
+    #endif
+    
+#line 108
+    #if UNROLL > 9
+        npyv_f32 v_9 = npyv_load_f32(ip + 9 * vstep);
+        npyv_f32 r_9 = npyv_negative_f32(v_9);
+        npyv_store_f32(op + 9 * vstep, r_9);
+    #endif
+    
+#line 108
+    #if UNROLL > 10
+        npyv_f32 v_10 = npyv_load_f32(ip + 10 * vstep);
+        npyv_f32 r_10 = npyv_negative_f32(v_10);
+        npyv_store_f32(op + 10 * vstep, r_10);
+    #endif
+    
+#line 108
+    #if UNROLL > 11
+        npyv_f32 v_11 = npyv_load_f32(ip + 11 * vstep);
+        npyv_f32 r_11 = npyv_negative_f32(v_11);
+        npyv_store_f32(op + 11 * vstep, r_11);
+    #endif
+    
+#line 108
+    #if UNROLL > 12
+        npyv_f32 v_12 = npyv_load_f32(ip + 12 * vstep);
+        npyv_f32 r_12 = npyv_negative_f32(v_12);
+        npyv_store_f32(op + 12 * vstep, r_12);
+    #endif
+    
+#line 108
+    #if UNROLL > 13
+        npyv_f32 v_13 = npyv_load_f32(ip + 13 * vstep);
+        npyv_f32 r_13 = npyv_negative_f32(v_13);
+        npyv_store_f32(op + 13 * vstep, r_13);
+    #endif
+    
+#line 108
+    #if UNROLL > 14
+        npyv_f32 v_14 = npyv_load_f32(ip + 14 * vstep);
+        npyv_f32 r_14 = npyv_negative_f32(v_14);
+        npyv_store_f32(op + 14 * vstep, r_14);
+    #endif
+    
+#line 108
+    #if UNROLL > 15
+        npyv_f32 v_15 = npyv_load_f32(ip + 15 * vstep);
+        npyv_f32 r_15 = npyv_negative_f32(v_15);
+        npyv_store_f32(op + 15 * vstep, r_15);
+    #endif
+    
+    }
+    // single vector loop: one vector of vstep elements per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, ip += vstep, op +=vstep) {
+        npyv_f32 v = npyv_load_f32(ip);
+        npyv_f32 r = npyv_negative_f32(v);
+        npyv_store_f32(op, r);
+    }
+    // scalar tail: finish the remaining (fewer than vstep) elements one at a time
+    for (; len > 0; --len, ++ip, ++op) {
+        *op = scalar_negative(*ip);
+    }
+}
+
+#if 1
+// contiguous input, non-contiguous output: reads `ip` with unit stride and writes `op` with a stride of `ostride` elements (not bytes)
+static NPY_INLINE void
+simd_unary_cn_negative_f32(const npyv_lanetype_f32 *ip,
+                             npyv_lanetype_f32 *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_f32;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop: each pass consumes wstep (= vstep * UNROLL) elements; blocks with index >= UNROLL are compiled out
+    for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) {
+    #line 142
+    #if UNROLL > 0
+        npyv_f32 v_0 = npyv_load_f32(ip + 0 * vstep);
+        npyv_f32 r_0 = npyv_negative_f32(v_0);
+        npyv_storen_f32(op + 0 * vstep * ostride, ostride, r_0);
+    #endif
+    
+#line 142
+    #if UNROLL > 1
+        npyv_f32 v_1 = npyv_load_f32(ip + 1 * vstep);
+        npyv_f32 r_1 = npyv_negative_f32(v_1);
+        npyv_storen_f32(op + 1 * vstep * ostride, ostride, r_1);
+    #endif
+    
+#line 142
+    #if UNROLL > 2
+        npyv_f32 v_2 = npyv_load_f32(ip + 2 * vstep);
+        npyv_f32 r_2 = npyv_negative_f32(v_2);
+        npyv_storen_f32(op + 2 * vstep * ostride, ostride, r_2);
+    #endif
+    
+#line 142
+    #if UNROLL > 3
+        npyv_f32 v_3 = npyv_load_f32(ip + 3 * vstep);
+        npyv_f32 r_3 = npyv_negative_f32(v_3);
+        npyv_storen_f32(op + 3 * vstep * ostride, ostride, r_3);
+    #endif
+    
+#line 142
+    #if UNROLL > 4
+        npyv_f32 v_4 = npyv_load_f32(ip + 4 * vstep);
+        npyv_f32 r_4 = npyv_negative_f32(v_4);
+        npyv_storen_f32(op + 4 * vstep * ostride, ostride, r_4);
+    #endif
+    
+#line 142
+    #if UNROLL > 5
+        npyv_f32 v_5 = npyv_load_f32(ip + 5 * vstep);
+        npyv_f32 r_5 = npyv_negative_f32(v_5);
+        npyv_storen_f32(op + 5 * vstep * ostride, ostride, r_5);
+    #endif
+    
+#line 142
+    #if UNROLL > 6
+        npyv_f32 v_6 = npyv_load_f32(ip + 6 * vstep);
+        npyv_f32 r_6 = npyv_negative_f32(v_6);
+        npyv_storen_f32(op + 6 * vstep * ostride, ostride, r_6);
+    #endif
+    
+#line 142
+    #if UNROLL > 7
+        npyv_f32 v_7 = npyv_load_f32(ip + 7 * vstep);
+        npyv_f32 r_7 = npyv_negative_f32(v_7);
+        npyv_storen_f32(op + 7 * vstep * ostride, ostride, r_7);
+    #endif
+    
+#line 142
+    #if UNROLL > 8
+        npyv_f32 v_8 = npyv_load_f32(ip + 8 * vstep);
+        npyv_f32 r_8 = npyv_negative_f32(v_8);
+        npyv_storen_f32(op + 8 * vstep * ostride, ostride, r_8);
+    #endif
+    
+#line 142
+    #if UNROLL > 9
+        npyv_f32 v_9 = npyv_load_f32(ip + 9 * vstep);
+        npyv_f32 r_9 = npyv_negative_f32(v_9);
+        npyv_storen_f32(op + 9 * vstep * ostride, ostride, r_9);
+    #endif
+    
+#line 142
+    #if UNROLL > 10
+        npyv_f32 v_10 = npyv_load_f32(ip + 10 * vstep);
+        npyv_f32 r_10 = npyv_negative_f32(v_10);
+        npyv_storen_f32(op + 10 * vstep * ostride, ostride, r_10);
+    #endif
+    
+#line 142
+    #if UNROLL > 11
+        npyv_f32 v_11 = npyv_load_f32(ip + 11 * vstep);
+        npyv_f32 r_11 = npyv_negative_f32(v_11);
+        npyv_storen_f32(op + 11 * vstep * ostride, ostride, r_11);
+    #endif
+    
+#line 142
+    #if UNROLL > 12
+        npyv_f32 v_12 = npyv_load_f32(ip + 12 * vstep);
+        npyv_f32 r_12 = npyv_negative_f32(v_12);
+        npyv_storen_f32(op + 12 * vstep * ostride, ostride, r_12);
+    #endif
+    
+#line 142
+    #if UNROLL > 13
+        npyv_f32 v_13 = npyv_load_f32(ip + 13 * vstep);
+        npyv_f32 r_13 = npyv_negative_f32(v_13);
+        npyv_storen_f32(op + 13 * vstep * ostride, ostride, r_13);
+    #endif
+    
+#line 142
+    #if UNROLL > 14
+        npyv_f32 v_14 = npyv_load_f32(ip + 14 * vstep);
+        npyv_f32 r_14 = npyv_negative_f32(v_14);
+        npyv_storen_f32(op + 14 * vstep * ostride, ostride, r_14);
+    #endif
+    
+#line 142
+    #if UNROLL > 15
+        npyv_f32 v_15 = npyv_load_f32(ip + 15 * vstep);
+        npyv_f32 r_15 = npyv_negative_f32(v_15);
+        npyv_storen_f32(op + 15 * vstep * ostride, ostride, r_15);
+    #endif
+    
+    }
+    // single vector loop: one vector of vstep elements per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) {
+        npyv_f32 v = npyv_load_f32(ip);
+        npyv_f32 r = npyv_negative_f32(v);
+        npyv_storen_f32(op, ostride, r);
+    }
+    // scalar tail: finish the remaining (fewer than vstep) elements one at a time
+    for (; len > 0; --len, ++ip, op += ostride) {
+        *op = scalar_negative(*ip);
+    }
+}
+// non-contiguous input, contiguous output: reads `ip` with a stride of `istride` elements (not bytes) and writes `op` with unit stride
+static NPY_INLINE void
+simd_unary_nc_negative_f32(const npyv_lanetype_f32 *ip, npy_intp istride,
+                             npyv_lanetype_f32 *op,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_f32;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop: each pass consumes wstep (= vstep * UNROLL) elements; blocks with index >= UNROLL are compiled out
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) {
+    #line 174
+    #if UNROLL > 0
+        npyv_f32 v_0 = npyv_loadn_f32(ip + 0 * vstep * istride, istride);
+        npyv_f32 r_0 = npyv_negative_f32(v_0);
+        npyv_store_f32(op + 0 * vstep, r_0);
+    #endif
+    
+#line 174
+    #if UNROLL > 1
+        npyv_f32 v_1 = npyv_loadn_f32(ip + 1 * vstep * istride, istride);
+        npyv_f32 r_1 = npyv_negative_f32(v_1);
+        npyv_store_f32(op + 1 * vstep, r_1);
+    #endif
+    
+#line 174
+    #if UNROLL > 2
+        npyv_f32 v_2 = npyv_loadn_f32(ip + 2 * vstep * istride, istride);
+        npyv_f32 r_2 = npyv_negative_f32(v_2);
+        npyv_store_f32(op + 2 * vstep, r_2);
+    #endif
+    
+#line 174
+    #if UNROLL > 3
+        npyv_f32 v_3 = npyv_loadn_f32(ip + 3 * vstep * istride, istride);
+        npyv_f32 r_3 = npyv_negative_f32(v_3);
+        npyv_store_f32(op + 3 * vstep, r_3);
+    #endif
+    
+#line 174
+    #if UNROLL > 4
+        npyv_f32 v_4 = npyv_loadn_f32(ip + 4 * vstep * istride, istride);
+        npyv_f32 r_4 = npyv_negative_f32(v_4);
+        npyv_store_f32(op + 4 * vstep, r_4);
+    #endif
+    
+#line 174
+    #if UNROLL > 5
+        npyv_f32 v_5 = npyv_loadn_f32(ip + 5 * vstep * istride, istride);
+        npyv_f32 r_5 = npyv_negative_f32(v_5);
+        npyv_store_f32(op + 5 * vstep, r_5);
+    #endif
+    
+#line 174
+    #if UNROLL > 6
+        npyv_f32 v_6 = npyv_loadn_f32(ip + 6 * vstep * istride, istride);
+        npyv_f32 r_6 = npyv_negative_f32(v_6);
+        npyv_store_f32(op + 6 * vstep, r_6);
+    #endif
+    
+#line 174
+    #if UNROLL > 7
+        npyv_f32 v_7 = npyv_loadn_f32(ip + 7 * vstep * istride, istride);
+        npyv_f32 r_7 = npyv_negative_f32(v_7);
+        npyv_store_f32(op + 7 * vstep, r_7);
+    #endif
+    
+#line 174
+    #if UNROLL > 8
+        npyv_f32 v_8 = npyv_loadn_f32(ip + 8 * vstep * istride, istride);
+        npyv_f32 r_8 = npyv_negative_f32(v_8);
+        npyv_store_f32(op + 8 * vstep, r_8);
+    #endif
+    
+#line 174
+    #if UNROLL > 9
+        npyv_f32 v_9 = npyv_loadn_f32(ip + 9 * vstep * istride, istride);
+        npyv_f32 r_9 = npyv_negative_f32(v_9);
+        npyv_store_f32(op + 9 * vstep, r_9);
+    #endif
+    
+#line 174
+    #if UNROLL > 10
+        npyv_f32 v_10 = npyv_loadn_f32(ip + 10 * vstep * istride, istride);
+        npyv_f32 r_10 = npyv_negative_f32(v_10);
+        npyv_store_f32(op + 10 * vstep, r_10);
+    #endif
+    
+#line 174
+    #if UNROLL > 11
+        npyv_f32 v_11 = npyv_loadn_f32(ip + 11 * vstep * istride, istride);
+        npyv_f32 r_11 = npyv_negative_f32(v_11);
+        npyv_store_f32(op + 11 * vstep, r_11);
+    #endif
+    
+#line 174
+    #if UNROLL > 12
+        npyv_f32 v_12 = npyv_loadn_f32(ip + 12 * vstep * istride, istride);
+        npyv_f32 r_12 = npyv_negative_f32(v_12);
+        npyv_store_f32(op + 12 * vstep, r_12);
+    #endif
+    
+#line 174
+    #if UNROLL > 13
+        npyv_f32 v_13 = npyv_loadn_f32(ip + 13 * vstep * istride, istride);
+        npyv_f32 r_13 = npyv_negative_f32(v_13);
+        npyv_store_f32(op + 13 * vstep, r_13);
+    #endif
+    
+#line 174
+    #if UNROLL > 14
+        npyv_f32 v_14 = npyv_loadn_f32(ip + 14 * vstep * istride, istride);
+        npyv_f32 r_14 = npyv_negative_f32(v_14);
+        npyv_store_f32(op + 14 * vstep, r_14);
+    #endif
+    
+#line 174
+    #if UNROLL > 15
+        npyv_f32 v_15 = npyv_loadn_f32(ip + 15 * vstep * istride, istride);
+        npyv_f32 r_15 = npyv_negative_f32(v_15);
+        npyv_store_f32(op + 15 * vstep, r_15);
+    #endif
+    
+    }
+    // single vector loop: one vector of vstep elements per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) {
+        npyv_f32 v = npyv_loadn_f32(ip, istride);
+        npyv_f32 r = npyv_negative_f32(v);
+        npyv_store_f32(op, r);
+    }
+    // scalar tail: finish the remaining (fewer than vstep) elements one at a time
+    for (; len > 0; --len, ip += istride, ++op) {
+        *op = scalar_negative(*ip);
+    }
+}
+// non-contiguous input and output: `ip` and `op` advance by the caller-supplied element strides `istride` and `ostride`
+// limit unroll to 2x: a larger UNROLL is redefined to 2 for the code that follows
+#if UNROLL > 2
+#undef UNROLL
+#define UNROLL 2
+#endif
+// X86 does better with unrolled scalar for heavy non-contiguous, so this kernel is compiled only when NPY_HAVE_SSE2 is not defined
+#ifndef NPY_HAVE_SSE2
+static NPY_INLINE void
+simd_unary_nn_negative_f32(const npyv_lanetype_f32 *ip, npy_intp istride,
+                             npyv_lanetype_f32 *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_f32;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop: each pass consumes wstep (= vstep * UNROLL) elements; blocks with index >= UNROLL are compiled out
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+    #line 213
+    #if UNROLL > 0
+        npyv_f32 v_0 = npyv_loadn_f32(ip + 0 * vstep * istride, istride);
+        npyv_f32 r_0 = npyv_negative_f32(v_0);
+        npyv_storen_f32(op + 0 * vstep * ostride, ostride, r_0);
+    #endif
+    
+#line 213
+    #if UNROLL > 1
+        npyv_f32 v_1 = npyv_loadn_f32(ip + 1 * vstep * istride, istride);
+        npyv_f32 r_1 = npyv_negative_f32(v_1);
+        npyv_storen_f32(op + 1 * vstep * ostride, ostride, r_1);
+    #endif
+    
+#line 213
+    #if UNROLL > 2
+        npyv_f32 v_2 = npyv_loadn_f32(ip + 2 * vstep * istride, istride);
+        npyv_f32 r_2 = npyv_negative_f32(v_2);
+        npyv_storen_f32(op + 2 * vstep * ostride, ostride, r_2);
+    #endif
+    
+#line 213
+    #if UNROLL > 3
+        npyv_f32 v_3 = npyv_loadn_f32(ip + 3 * vstep * istride, istride);
+        npyv_f32 r_3 = npyv_negative_f32(v_3);
+        npyv_storen_f32(op + 3 * vstep * ostride, ostride, r_3);
+    #endif
+    
+#line 213
+    #if UNROLL > 4
+        npyv_f32 v_4 = npyv_loadn_f32(ip + 4 * vstep * istride, istride);
+        npyv_f32 r_4 = npyv_negative_f32(v_4);
+        npyv_storen_f32(op + 4 * vstep * ostride, ostride, r_4);
+    #endif
+    
+#line 213
+    #if UNROLL > 5
+        npyv_f32 v_5 = npyv_loadn_f32(ip + 5 * vstep * istride, istride);
+        npyv_f32 r_5 = npyv_negative_f32(v_5);
+        npyv_storen_f32(op + 5 * vstep * ostride, ostride, r_5);
+    #endif
+    
+#line 213
+    #if UNROLL > 6
+        npyv_f32 v_6 = npyv_loadn_f32(ip + 6 * vstep * istride, istride);
+        npyv_f32 r_6 = npyv_negative_f32(v_6);
+        npyv_storen_f32(op + 6 * vstep * ostride, ostride, r_6);
+    #endif
+    
+#line 213
+    #if UNROLL > 7
+        npyv_f32 v_7 = npyv_loadn_f32(ip + 7 * vstep * istride, istride);
+        npyv_f32 r_7 = npyv_negative_f32(v_7);
+        npyv_storen_f32(op + 7 * vstep * ostride, ostride, r_7);
+    #endif
+    
+#line 213
+    #if UNROLL > 8
+        npyv_f32 v_8 = npyv_loadn_f32(ip + 8 * vstep * istride, istride);
+        npyv_f32 r_8 = npyv_negative_f32(v_8);
+        npyv_storen_f32(op + 8 * vstep * ostride, ostride, r_8);
+    #endif
+    
+#line 213
+    #if UNROLL > 9
+        npyv_f32 v_9 = npyv_loadn_f32(ip + 9 * vstep * istride, istride);
+        npyv_f32 r_9 = npyv_negative_f32(v_9);
+        npyv_storen_f32(op + 9 * vstep * ostride, ostride, r_9);
+    #endif
+    
+#line 213
+    #if UNROLL > 10
+        npyv_f32 v_10 = npyv_loadn_f32(ip + 10 * vstep * istride, istride);
+        npyv_f32 r_10 = npyv_negative_f32(v_10);
+        npyv_storen_f32(op + 10 * vstep * ostride, ostride, r_10);
+    #endif
+    
+#line 213
+    #if UNROLL > 11
+        npyv_f32 v_11 = npyv_loadn_f32(ip + 11 * vstep * istride, istride);
+        npyv_f32 r_11 = npyv_negative_f32(v_11);
+        npyv_storen_f32(op + 11 * vstep * ostride, ostride, r_11);
+    #endif
+    
+#line 213
+    #if UNROLL > 12
+        npyv_f32 v_12 = npyv_loadn_f32(ip + 12 * vstep * istride, istride);
+        npyv_f32 r_12 = npyv_negative_f32(v_12);
+        npyv_storen_f32(op + 12 * vstep * ostride, ostride, r_12);
+    #endif
+    
+#line 213
+    #if UNROLL > 13
+        npyv_f32 v_13 = npyv_loadn_f32(ip + 13 * vstep * istride, istride);
+        npyv_f32 r_13 = npyv_negative_f32(v_13);
+        npyv_storen_f32(op + 13 * vstep * ostride, ostride, r_13);
+    #endif
+    
+#line 213
+    #if UNROLL > 14
+        npyv_f32 v_14 = npyv_loadn_f32(ip + 14 * vstep * istride, istride);
+        npyv_f32 r_14 = npyv_negative_f32(v_14);
+        npyv_storen_f32(op + 14 * vstep * ostride, ostride, r_14);
+    #endif
+    
+#line 213
+    #if UNROLL > 15
+        npyv_f32 v_15 = npyv_loadn_f32(ip + 15 * vstep * istride, istride);
+        npyv_f32 r_15 = npyv_negative_f32(v_15);
+        npyv_storen_f32(op + 15 * vstep * ostride, ostride, r_15);
+    #endif
+    
+    }
+    // single vector loop: one vector of vstep elements per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+        npyv_f32 v = npyv_loadn_f32(ip, istride);
+        npyv_f32 r = npyv_negative_f32(v);
+        npyv_storen_f32(op, ostride, r);
+    }
+    // scalar tail: finish the remaining (fewer than vstep) elements one at a time
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = scalar_negative(*ip);
+    }
+}
+#endif // NPY_HAVE_SSE2
+#endif // 1
+#undef UNROLL
+#endif // NPY_SIMD_F32
+/*end repeat1**/
+
+#line 80
+#line 85
+#if NPY_SIMD_F64
+#if 4 < 1
+#error "Unroll must be at least 1"
+#elif NPY_SIMD != 128 && 4 > 2
+// Avoid memory bandwidth bottleneck for larger SIMD: whenever NPY_SIMD is not 128, unroll 2x instead of 4x
+#define UNROLL 2
+#else
+#define UNROLL 4
+#endif
+// contiguous inputs and output: applies npyv_negative_f64 / scalar_negative to `len` elements read from `ip`, writing to `op` (both unit stride).
+static NPY_INLINE void
+simd_unary_cc_negative_f64(const npyv_lanetype_f64 *ip,
+                             npyv_lanetype_f64 *op,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop: each pass consumes wstep (= vstep * UNROLL) elements; blocks with index >= UNROLL are compiled out
+    for (; len >= wstep; len -= wstep, ip += wstep, op += wstep) {
+    #line 108
+    #if UNROLL > 0
+        npyv_f64 v_0 = npyv_load_f64(ip + 0 * vstep);
+        npyv_f64 r_0 = npyv_negative_f64(v_0);
+        npyv_store_f64(op + 0 * vstep, r_0);
+    #endif
+    
+#line 108
+    #if UNROLL > 1
+        npyv_f64 v_1 = npyv_load_f64(ip + 1 * vstep);
+        npyv_f64 r_1 = npyv_negative_f64(v_1);
+        npyv_store_f64(op + 1 * vstep, r_1);
+    #endif
+    
+#line 108
+    #if UNROLL > 2
+        npyv_f64 v_2 = npyv_load_f64(ip + 2 * vstep);
+        npyv_f64 r_2 = npyv_negative_f64(v_2);
+        npyv_store_f64(op + 2 * vstep, r_2);
+    #endif
+    
+#line 108
+    #if UNROLL > 3
+        npyv_f64 v_3 = npyv_load_f64(ip + 3 * vstep);
+        npyv_f64 r_3 = npyv_negative_f64(v_3);
+        npyv_store_f64(op + 3 * vstep, r_3);
+    #endif
+    
+#line 108
+    #if UNROLL > 4
+        npyv_f64 v_4 = npyv_load_f64(ip + 4 * vstep);
+        npyv_f64 r_4 = npyv_negative_f64(v_4);
+        npyv_store_f64(op + 4 * vstep, r_4);
+    #endif
+    
+#line 108
+    #if UNROLL > 5
+        npyv_f64 v_5 = npyv_load_f64(ip + 5 * vstep);
+        npyv_f64 r_5 = npyv_negative_f64(v_5);
+        npyv_store_f64(op + 5 * vstep, r_5);
+    #endif
+    
+#line 108
+    #if UNROLL > 6
+        npyv_f64 v_6 = npyv_load_f64(ip + 6 * vstep);
+        npyv_f64 r_6 = npyv_negative_f64(v_6);
+        npyv_store_f64(op + 6 * vstep, r_6);
+    #endif
+    
+#line 108
+    #if UNROLL > 7
+        npyv_f64 v_7 = npyv_load_f64(ip + 7 * vstep);
+        npyv_f64 r_7 = npyv_negative_f64(v_7);
+        npyv_store_f64(op + 7 * vstep, r_7);
+    #endif
+    
+#line 108
+    #if UNROLL > 8
+        npyv_f64 v_8 = npyv_load_f64(ip + 8 * vstep);
+        npyv_f64 r_8 = npyv_negative_f64(v_8);
+        npyv_store_f64(op + 8 * vstep, r_8);
+    #endif
+    
+#line 108
+    #if UNROLL > 9
+        npyv_f64 v_9 = npyv_load_f64(ip + 9 * vstep);
+        npyv_f64 r_9 = npyv_negative_f64(v_9);
+        npyv_store_f64(op + 9 * vstep, r_9);
+    #endif
+    
+#line 108
+    #if UNROLL > 10
+        npyv_f64 v_10 = npyv_load_f64(ip + 10 * vstep);
+        npyv_f64 r_10 = npyv_negative_f64(v_10);
+        npyv_store_f64(op + 10 * vstep, r_10);
+    #endif
+    
+#line 108
+    #if UNROLL > 11
+        npyv_f64 v_11 = npyv_load_f64(ip + 11 * vstep);
+        npyv_f64 r_11 = npyv_negative_f64(v_11);
+        npyv_store_f64(op + 11 * vstep, r_11);
+    #endif
+    
+#line 108
+    #if UNROLL > 12
+        npyv_f64 v_12 = npyv_load_f64(ip + 12 * vstep);
+        npyv_f64 r_12 = npyv_negative_f64(v_12);
+        npyv_store_f64(op + 12 * vstep, r_12);
+    #endif
+    
+#line 108
+    #if UNROLL > 13
+        npyv_f64 v_13 = npyv_load_f64(ip + 13 * vstep);
+        npyv_f64 r_13 = npyv_negative_f64(v_13);
+        npyv_store_f64(op + 13 * vstep, r_13);
+    #endif
+    
+#line 108
+    #if UNROLL > 14
+        npyv_f64 v_14 = npyv_load_f64(ip + 14 * vstep);
+        npyv_f64 r_14 = npyv_negative_f64(v_14);
+        npyv_store_f64(op + 14 * vstep, r_14);
+    #endif
+    
+#line 108
+    #if UNROLL > 15
+        npyv_f64 v_15 = npyv_load_f64(ip + 15 * vstep);
+        npyv_f64 r_15 = npyv_negative_f64(v_15);
+        npyv_store_f64(op + 15 * vstep, r_15);
+    #endif
+    
+    }
+    // single vector loop: one vector of vstep elements per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, ip += vstep, op +=vstep) {
+        npyv_f64 v = npyv_load_f64(ip);
+        npyv_f64 r = npyv_negative_f64(v);
+        npyv_store_f64(op, r);
+    }
+    // scalar tail: finish the remaining (fewer than vstep) elements one at a time
+    for (; len > 0; --len, ++ip, ++op) {
+        *op = scalar_negative(*ip);
+    }
+}
+
+#if 1
+// contiguous input, non-contiguous output: reads `ip` with unit stride and writes `op` with a stride of `ostride` elements (not bytes)
+static NPY_INLINE void
+simd_unary_cn_negative_f64(const npyv_lanetype_f64 *ip,
+                             npyv_lanetype_f64 *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop: each pass consumes wstep (= vstep * UNROLL) elements; blocks with index >= UNROLL are compiled out
+    for (; len >= wstep; len -= wstep, ip += wstep, op += ostride*wstep) {
+    #line 142
+    #if UNROLL > 0
+        npyv_f64 v_0 = npyv_load_f64(ip + 0 * vstep);
+        npyv_f64 r_0 = npyv_negative_f64(v_0);
+        npyv_storen_f64(op + 0 * vstep * ostride, ostride, r_0);
+    #endif
+    
+#line 142
+    #if UNROLL > 1
+        npyv_f64 v_1 = npyv_load_f64(ip + 1 * vstep);
+        npyv_f64 r_1 = npyv_negative_f64(v_1);
+        npyv_storen_f64(op + 1 * vstep * ostride, ostride, r_1);
+    #endif
+    
+#line 142
+    #if UNROLL > 2
+        npyv_f64 v_2 = npyv_load_f64(ip + 2 * vstep);
+        npyv_f64 r_2 = npyv_negative_f64(v_2);
+        npyv_storen_f64(op + 2 * vstep * ostride, ostride, r_2);
+    #endif
+    
+#line 142
+    #if UNROLL > 3
+        npyv_f64 v_3 = npyv_load_f64(ip + 3 * vstep);
+        npyv_f64 r_3 = npyv_negative_f64(v_3);
+        npyv_storen_f64(op + 3 * vstep * ostride, ostride, r_3);
+    #endif
+    
+#line 142
+    #if UNROLL > 4
+        npyv_f64 v_4 = npyv_load_f64(ip + 4 * vstep);
+        npyv_f64 r_4 = npyv_negative_f64(v_4);
+        npyv_storen_f64(op + 4 * vstep * ostride, ostride, r_4);
+    #endif
+    
+#line 142
+    #if UNROLL > 5
+        npyv_f64 v_5 = npyv_load_f64(ip + 5 * vstep);
+        npyv_f64 r_5 = npyv_negative_f64(v_5);
+        npyv_storen_f64(op + 5 * vstep * ostride, ostride, r_5);
+    #endif
+    
+#line 142
+    #if UNROLL > 6
+        npyv_f64 v_6 = npyv_load_f64(ip + 6 * vstep);
+        npyv_f64 r_6 = npyv_negative_f64(v_6);
+        npyv_storen_f64(op + 6 * vstep * ostride, ostride, r_6);
+    #endif
+    
+#line 142
+    #if UNROLL > 7
+        npyv_f64 v_7 = npyv_load_f64(ip + 7 * vstep);
+        npyv_f64 r_7 = npyv_negative_f64(v_7);
+        npyv_storen_f64(op + 7 * vstep * ostride, ostride, r_7);
+    #endif
+    
+#line 142
+    #if UNROLL > 8
+        npyv_f64 v_8 = npyv_load_f64(ip + 8 * vstep);
+        npyv_f64 r_8 = npyv_negative_f64(v_8);
+        npyv_storen_f64(op + 8 * vstep * ostride, ostride, r_8);
+    #endif
+    
+#line 142
+    #if UNROLL > 9
+        npyv_f64 v_9 = npyv_load_f64(ip + 9 * vstep);
+        npyv_f64 r_9 = npyv_negative_f64(v_9);
+        npyv_storen_f64(op + 9 * vstep * ostride, ostride, r_9);
+    #endif
+    
+#line 142
+    #if UNROLL > 10
+        npyv_f64 v_10 = npyv_load_f64(ip + 10 * vstep);
+        npyv_f64 r_10 = npyv_negative_f64(v_10);
+        npyv_storen_f64(op + 10 * vstep * ostride, ostride, r_10);
+    #endif
+    
+#line 142
+    #if UNROLL > 11
+        npyv_f64 v_11 = npyv_load_f64(ip + 11 * vstep);
+        npyv_f64 r_11 = npyv_negative_f64(v_11);
+        npyv_storen_f64(op + 11 * vstep * ostride, ostride, r_11);
+    #endif
+    
+#line 142
+    #if UNROLL > 12
+        npyv_f64 v_12 = npyv_load_f64(ip + 12 * vstep);
+        npyv_f64 r_12 = npyv_negative_f64(v_12);
+        npyv_storen_f64(op + 12 * vstep * ostride, ostride, r_12);
+    #endif
+    
+#line 142
+    #if UNROLL > 13
+        npyv_f64 v_13 = npyv_load_f64(ip + 13 * vstep);
+        npyv_f64 r_13 = npyv_negative_f64(v_13);
+        npyv_storen_f64(op + 13 * vstep * ostride, ostride, r_13);
+    #endif
+    
+#line 142
+    #if UNROLL > 14
+        npyv_f64 v_14 = npyv_load_f64(ip + 14 * vstep);
+        npyv_f64 r_14 = npyv_negative_f64(v_14);
+        npyv_storen_f64(op + 14 * vstep * ostride, ostride, r_14);
+    #endif
+    
+#line 142
+    #if UNROLL > 15
+        npyv_f64 v_15 = npyv_load_f64(ip + 15 * vstep);
+        npyv_f64 r_15 = npyv_negative_f64(v_15);
+        npyv_storen_f64(op + 15 * vstep * ostride, ostride, r_15);
+    #endif
+    
+    }
+    // single vector loop: one vector of vstep elements per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, ip += vstep, op += ostride*vstep) {
+        npyv_f64 v = npyv_load_f64(ip);
+        npyv_f64 r = npyv_negative_f64(v);
+        npyv_storen_f64(op, ostride, r);
+    }
+    // scalar tail: finish the remaining (fewer than vstep) elements one at a time
+    for (; len > 0; --len, ++ip, op += ostride) {
+        *op = scalar_negative(*ip);
+    }
+}
+// non-contiguous input, contiguous output: reads `ip` with a stride of `istride` elements (not bytes) and writes `op` with unit stride
+static NPY_INLINE void
+simd_unary_nc_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
+                             npyv_lanetype_f64 *op,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * UNROLL;
+
+    // unrolled vector loop: each pass consumes wstep (= vstep * UNROLL) elements; blocks with index >= UNROLL are compiled out
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += wstep) {
+    #line 174
+    #if UNROLL > 0
+        npyv_f64 v_0 = npyv_loadn_f64(ip + 0 * vstep * istride, istride);
+        npyv_f64 r_0 = npyv_negative_f64(v_0);
+        npyv_store_f64(op + 0 * vstep, r_0);
+    #endif
+    
+#line 174
+    #if UNROLL > 1
+        npyv_f64 v_1 = npyv_loadn_f64(ip + 1 * vstep * istride, istride);
+        npyv_f64 r_1 = npyv_negative_f64(v_1);
+        npyv_store_f64(op + 1 * vstep, r_1);
+    #endif
+    
+#line 174
+    #if UNROLL > 2
+        npyv_f64 v_2 = npyv_loadn_f64(ip + 2 * vstep * istride, istride);
+        npyv_f64 r_2 = npyv_negative_f64(v_2);
+        npyv_store_f64(op + 2 * vstep, r_2);
+    #endif
+    
+#line 174
+    #if UNROLL > 3
+        npyv_f64 v_3 = npyv_loadn_f64(ip + 3 * vstep * istride, istride);
+        npyv_f64 r_3 = npyv_negative_f64(v_3);
+        npyv_store_f64(op + 3 * vstep, r_3);
+    #endif
+    
+#line 174
+    #if UNROLL > 4
+        npyv_f64 v_4 = npyv_loadn_f64(ip + 4 * vstep * istride, istride);
+        npyv_f64 r_4 = npyv_negative_f64(v_4);
+        npyv_store_f64(op + 4 * vstep, r_4);
+    #endif
+    
+#line 174
+    #if UNROLL > 5
+        npyv_f64 v_5 = npyv_loadn_f64(ip + 5 * vstep * istride, istride);
+        npyv_f64 r_5 = npyv_negative_f64(v_5);
+        npyv_store_f64(op + 5 * vstep, r_5);
+    #endif
+    
+#line 174
+    #if UNROLL > 6
+        npyv_f64 v_6 = npyv_loadn_f64(ip + 6 * vstep * istride, istride);
+        npyv_f64 r_6 = npyv_negative_f64(v_6);
+        npyv_store_f64(op + 6 * vstep, r_6);
+    #endif
+    
+#line 174
+    #if UNROLL > 7
+        npyv_f64 v_7 = npyv_loadn_f64(ip + 7 * vstep * istride, istride);
+        npyv_f64 r_7 = npyv_negative_f64(v_7);
+        npyv_store_f64(op + 7 * vstep, r_7);
+    #endif
+    
+#line 174
+    #if UNROLL > 8
+        npyv_f64 v_8 = npyv_loadn_f64(ip + 8 * vstep * istride, istride);
+        npyv_f64 r_8 = npyv_negative_f64(v_8);
+        npyv_store_f64(op + 8 * vstep, r_8);
+    #endif
+    
+#line 174
+    #if UNROLL > 9
+        npyv_f64 v_9 = npyv_loadn_f64(ip + 9 * vstep * istride, istride);
+        npyv_f64 r_9 = npyv_negative_f64(v_9);
+        npyv_store_f64(op + 9 * vstep, r_9);
+    #endif
+    
+#line 174
+    #if UNROLL > 10
+        npyv_f64 v_10 = npyv_loadn_f64(ip + 10 * vstep * istride, istride);
+        npyv_f64 r_10 = npyv_negative_f64(v_10);
+        npyv_store_f64(op + 10 * vstep, r_10);
+    #endif
+    
+#line 174
+    #if UNROLL > 11
+        npyv_f64 v_11 = npyv_loadn_f64(ip + 11 * vstep * istride, istride);
+        npyv_f64 r_11 = npyv_negative_f64(v_11);
+        npyv_store_f64(op + 11 * vstep, r_11);
+    #endif
+    
+#line 174
+    #if UNROLL > 12
+        npyv_f64 v_12 = npyv_loadn_f64(ip + 12 * vstep * istride, istride);
+        npyv_f64 r_12 = npyv_negative_f64(v_12);
+        npyv_store_f64(op + 12 * vstep, r_12);
+    #endif
+    
+#line 174
+    #if UNROLL > 13
+        npyv_f64 v_13 = npyv_loadn_f64(ip + 13 * vstep * istride, istride);
+        npyv_f64 r_13 = npyv_negative_f64(v_13);
+        npyv_store_f64(op + 13 * vstep, r_13);
+    #endif
+    
+#line 174
+    #if UNROLL > 14
+        npyv_f64 v_14 = npyv_loadn_f64(ip + 14 * vstep * istride, istride);
+        npyv_f64 r_14 = npyv_negative_f64(v_14);
+        npyv_store_f64(op + 14 * vstep, r_14);
+    #endif
+    
+#line 174
+    #if UNROLL > 15
+        npyv_f64 v_15 = npyv_loadn_f64(ip + 15 * vstep * istride, istride);
+        npyv_f64 r_15 = npyv_negative_f64(v_15);
+        npyv_store_f64(op + 15 * vstep, r_15);
+    #endif
+    
+    }
+    // single vector loop: one vector of vstep elements per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += vstep) {
+        npyv_f64 v = npyv_loadn_f64(ip, istride);
+        npyv_f64 r = npyv_negative_f64(v);
+        npyv_store_f64(op, r);
+    }
+    // scalar tail: finish the remaining (fewer than vstep) elements one at a time
+    for (; len > 0; --len, ip += istride, ++op) {
+        *op = scalar_negative(*ip);
+    }
+}
+// f64 negate kernel for non-contiguous input and output; both strides are counted in elements, not bytes
+// limit unroll to 2x: clamp whatever UNROLL is in effect here so the loop below compiles at most slots 0 and 1
+#if UNROLL > 2
+#undef UNROLL
+#define UNROLL 2
+#endif
+// X86 does better with unrolled scalar for heavy non-contiguous, so this kernel is omitted when NPY_HAVE_SSE2 is defined
+#ifndef NPY_HAVE_SSE2
+static NPY_INLINE void
+simd_unary_nn_negative_f64(const npyv_lanetype_f64 *ip, npy_intp istride,
+                             npyv_lanetype_f64 *op, npy_intp ostride,
+                             npy_intp len)
+{
+    const int vstep = npyv_nlanes_f64; // elements consumed by one vector load/store below
+    const int wstep = vstep * UNROLL; // elements consumed by one pass of the unrolled loop
+
+    // unrolled vector loop: UNROLL vectors per pass while at least wstep elements remain
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+    #line 213
+    #if UNROLL > 0 // slot k: load with istride at ip + k*vstep*istride, negate, store with ostride at op + k*vstep*ostride
+        npyv_f64 v_0 = npyv_loadn_f64(ip + 0 * vstep * istride, istride);
+        npyv_f64 r_0 = npyv_negative_f64(v_0);
+        npyv_storen_f64(op + 0 * vstep * ostride, ostride, r_0);
+    #endif
+    
+#line 213
+    #if UNROLL > 1
+        npyv_f64 v_1 = npyv_loadn_f64(ip + 1 * vstep * istride, istride);
+        npyv_f64 r_1 = npyv_negative_f64(v_1);
+        npyv_storen_f64(op + 1 * vstep * ostride, ostride, r_1);
+    #endif
+    
+#line 213
+    #if UNROLL > 2 // always false after the clamp to 2 above: slots 2..15 never compile
+        npyv_f64 v_2 = npyv_loadn_f64(ip + 2 * vstep * istride, istride);
+        npyv_f64 r_2 = npyv_negative_f64(v_2);
+        npyv_storen_f64(op + 2 * vstep * ostride, ostride, r_2);
+    #endif
+    
+#line 213
+    #if UNROLL > 3
+        npyv_f64 v_3 = npyv_loadn_f64(ip + 3 * vstep * istride, istride);
+        npyv_f64 r_3 = npyv_negative_f64(v_3);
+        npyv_storen_f64(op + 3 * vstep * ostride, ostride, r_3);
+    #endif
+    
+#line 213
+    #if UNROLL > 4
+        npyv_f64 v_4 = npyv_loadn_f64(ip + 4 * vstep * istride, istride);
+        npyv_f64 r_4 = npyv_negative_f64(v_4);
+        npyv_storen_f64(op + 4 * vstep * ostride, ostride, r_4);
+    #endif
+    
+#line 213
+    #if UNROLL > 5
+        npyv_f64 v_5 = npyv_loadn_f64(ip + 5 * vstep * istride, istride);
+        npyv_f64 r_5 = npyv_negative_f64(v_5);
+        npyv_storen_f64(op + 5 * vstep * ostride, ostride, r_5);
+    #endif
+    
+#line 213
+    #if UNROLL > 6
+        npyv_f64 v_6 = npyv_loadn_f64(ip + 6 * vstep * istride, istride);
+        npyv_f64 r_6 = npyv_negative_f64(v_6);
+        npyv_storen_f64(op + 6 * vstep * ostride, ostride, r_6);
+    #endif
+    
+#line 213
+    #if UNROLL > 7
+        npyv_f64 v_7 = npyv_loadn_f64(ip + 7 * vstep * istride, istride);
+        npyv_f64 r_7 = npyv_negative_f64(v_7);
+        npyv_storen_f64(op + 7 * vstep * ostride, ostride, r_7);
+    #endif
+    
+#line 213
+    #if UNROLL > 8
+        npyv_f64 v_8 = npyv_loadn_f64(ip + 8 * vstep * istride, istride);
+        npyv_f64 r_8 = npyv_negative_f64(v_8);
+        npyv_storen_f64(op + 8 * vstep * ostride, ostride, r_8);
+    #endif
+    
+#line 213
+    #if UNROLL > 9
+        npyv_f64 v_9 = npyv_loadn_f64(ip + 9 * vstep * istride, istride);
+        npyv_f64 r_9 = npyv_negative_f64(v_9);
+        npyv_storen_f64(op + 9 * vstep * ostride, ostride, r_9);
+    #endif
+    
+#line 213
+    #if UNROLL > 10
+        npyv_f64 v_10 = npyv_loadn_f64(ip + 10 * vstep * istride, istride);
+        npyv_f64 r_10 = npyv_negative_f64(v_10);
+        npyv_storen_f64(op + 10 * vstep * ostride, ostride, r_10);
+    #endif
+    
+#line 213
+    #if UNROLL > 11
+        npyv_f64 v_11 = npyv_loadn_f64(ip + 11 * vstep * istride, istride);
+        npyv_f64 r_11 = npyv_negative_f64(v_11);
+        npyv_storen_f64(op + 11 * vstep * ostride, ostride, r_11);
+    #endif
+    
+#line 213
+    #if UNROLL > 12
+        npyv_f64 v_12 = npyv_loadn_f64(ip + 12 * vstep * istride, istride);
+        npyv_f64 r_12 = npyv_negative_f64(v_12);
+        npyv_storen_f64(op + 12 * vstep * ostride, ostride, r_12);
+    #endif
+    
+#line 213
+    #if UNROLL > 13
+        npyv_f64 v_13 = npyv_loadn_f64(ip + 13 * vstep * istride, istride);
+        npyv_f64 r_13 = npyv_negative_f64(v_13);
+        npyv_storen_f64(op + 13 * vstep * ostride, ostride, r_13);
+    #endif
+    
+#line 213
+    #if UNROLL > 14
+        npyv_f64 v_14 = npyv_loadn_f64(ip + 14 * vstep * istride, istride);
+        npyv_f64 r_14 = npyv_negative_f64(v_14);
+        npyv_storen_f64(op + 14 * vstep * ostride, ostride, r_14);
+    #endif
+    
+#line 213
+    #if UNROLL > 15
+        npyv_f64 v_15 = npyv_loadn_f64(ip + 15 * vstep * istride, istride);
+        npyv_f64 r_15 = npyv_negative_f64(v_15);
+        npyv_storen_f64(op + 15 * vstep * ostride, ostride, r_15);
+    #endif
+    
+    }
+    // single vector loop: one vector per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+        npyv_f64 v = npyv_loadn_f64(ip, istride);
+        npyv_f64 r = npyv_negative_f64(v);
+        npyv_storen_f64(op, ostride, r);
+    }
+    // scalar finish up any remaining iterations (fewer than vstep elements are left here)
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = scalar_negative(*ip);
+    }
+}
+#endif // !NPY_HAVE_SSE2
+#endif // 1
+#undef UNROLL
+#endif // NPY_SIMD_F64
+/*end repeat1**/
+
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+#line 257
+#undef TO_SIMD_SFX // start clean; it stays undefined unless one of the width cases below matches
+#if 0 // always false; the real cases are the #elif branches that follow
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 8 // NPY_SIMD nonzero and the type is 8 bits wide
+    #if 0 // float suffix branch: compiled out here
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1 // always taken: TO_SIMD_SFX(X) pastes the unsigned suffix _u8 onto X
+        #define TO_SIMD_SFX(X) X##_u8
+    #else // signed suffix branch: unreachable after the #elif 1
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 16 // same pattern, 16-bit case
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1 // always taken: suffix _u16
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 32 // same pattern, 32-bit case
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1 // always taken: suffix _u32
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 64 // same pattern, 64-bit case
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1 // always taken: suffix _u64
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif // end of width selection; TO_SIMD_SFX is undefined if NPY_SIMD is 0 or no width matched
+
+#line 283
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UBYTE_negative)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{ // negative inner loop for npy_ubyte: dimensions[0] elements, input args[0], output args[1], steps[] are byte strides
+    char *ip = args[0], *op = args[1]; // input / output cursors; char* so all stepping below is in bytes
+    npy_intp istep = steps[0], ostep = steps[1],
+             len = dimensions[0];
+#ifdef TO_SIMD_SFX // SIMD dispatch: present only if the preprocessor chain before this function defined a lane suffix
+    #undef STYPE // STYPE is redefined on the next line as npyv_lanetype with the selected suffix
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (!is_mem_overlap(ip, istep, op, ostep, len)) { // SIMD kernels are tried only when is_mem_overlap() returns false
+        if (IS_UNARY_CONT(npy_ubyte, npy_ubyte)) {
+            // no overlap and operands are contiguous: hand the whole buffer to the contiguous kernel
+            TO_SIMD_SFX(simd_unary_cc_negative)(
+                (STYPE*)ip, (STYPE*)op, len
+            );
+            goto clear; // fully handled; skip the scalar loops
+        }
+    #if 0 // strided SIMD kernels (cn / nc / nn) are compiled out for this type
+        const npy_intp istride = istep / sizeof(STYPE); // byte step -> element stride
+        const npy_intp ostride = ostep / sizeof(STYPE);
+        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+            TO_SIMD_SFX(npyv_storable_stride)(ostride))
+        {
+            if (istride == 1 && ostride != 1) {
+                // contiguous input, non-contiguous output
+                TO_SIMD_SFX(simd_unary_cn_negative)(
+                    (STYPE*)ip, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            else if (istride != 1 && ostride == 1) {
+                // non-contiguous input, contiguous output
+                TO_SIMD_SFX(simd_unary_nc_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, len
+                );
+                goto clear;
+            }
+        // X86 does better with unrolled scalar for heavy non-contiguous
+        #ifndef NPY_HAVE_SSE2
+            else if (istride != 1 && ostride != 1) {
+                // non-contiguous input and output
+                TO_SIMD_SFX(simd_unary_nn_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+        #endif
+        }
+    #endif // 0
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * scalar unrolls: fallback for whatever did not take a SIMD branch above
+     * 8x unroll performed best on
+     *  - Apple M1 Native / arm64
+     *  - Apple M1 Rosetta / SSE42
+     *  - iMacPro / AVX512
+     */
+    #define UNROLL 8
+    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
+    #line 347
+    #if UNROLL > 0 // slot k: read the element at ip + k*istep, write scalar_negative() of it at op + k*ostep
+        const npy_ubyte in_0 = *((const npy_ubyte *)(ip + 0 * istep));
+        *((npy_ubyte *)(op + 0 * ostep)) = scalar_negative(in_0);
+    #endif
+    
+#line 347
+    #if UNROLL > 1
+        const npy_ubyte in_1 = *((const npy_ubyte *)(ip + 1 * istep));
+        *((npy_ubyte *)(op + 1 * ostep)) = scalar_negative(in_1);
+    #endif
+    
+#line 347
+    #if UNROLL > 2
+        const npy_ubyte in_2 = *((const npy_ubyte *)(ip + 2 * istep));
+        *((npy_ubyte *)(op + 2 * ostep)) = scalar_negative(in_2);
+    #endif
+    
+#line 347
+    #if UNROLL > 3
+        const npy_ubyte in_3 = *((const npy_ubyte *)(ip + 3 * istep));
+        *((npy_ubyte *)(op + 3 * ostep)) = scalar_negative(in_3);
+    #endif
+    
+#line 347
+    #if UNROLL > 4
+        const npy_ubyte in_4 = *((const npy_ubyte *)(ip + 4 * istep));
+        *((npy_ubyte *)(op + 4 * ostep)) = scalar_negative(in_4);
+    #endif
+    
+#line 347
+    #if UNROLL > 5
+        const npy_ubyte in_5 = *((const npy_ubyte *)(ip + 5 * istep));
+        *((npy_ubyte *)(op + 5 * ostep)) = scalar_negative(in_5);
+    #endif
+    
+#line 347
+    #if UNROLL > 6
+        const npy_ubyte in_6 = *((const npy_ubyte *)(ip + 6 * istep));
+        *((npy_ubyte *)(op + 6 * ostep)) = scalar_negative(in_6);
+    #endif
+    
+#line 347
+    #if UNROLL > 7
+        const npy_ubyte in_7 = *((const npy_ubyte *)(ip + 7 * istep));
+        *((npy_ubyte *)(op + 7 * ostep)) = scalar_negative(in_7);
+    #endif
+    
+#line 347
+    #if UNROLL > 8 // false for UNROLL == 8: slots 8..15 never compile
+        const npy_ubyte in_8 = *((const npy_ubyte *)(ip + 8 * istep));
+        *((npy_ubyte *)(op + 8 * ostep)) = scalar_negative(in_8);
+    #endif
+    
+#line 347
+    #if UNROLL > 9
+        const npy_ubyte in_9 = *((const npy_ubyte *)(ip + 9 * istep));
+        *((npy_ubyte *)(op + 9 * ostep)) = scalar_negative(in_9);
+    #endif
+    
+#line 347
+    #if UNROLL > 10
+        const npy_ubyte in_10 = *((const npy_ubyte *)(ip + 10 * istep));
+        *((npy_ubyte *)(op + 10 * ostep)) = scalar_negative(in_10);
+    #endif
+    
+#line 347
+    #if UNROLL > 11
+        const npy_ubyte in_11 = *((const npy_ubyte *)(ip + 11 * istep));
+        *((npy_ubyte *)(op + 11 * ostep)) = scalar_negative(in_11);
+    #endif
+    
+#line 347
+    #if UNROLL > 12
+        const npy_ubyte in_12 = *((const npy_ubyte *)(ip + 12 * istep));
+        *((npy_ubyte *)(op + 12 * ostep)) = scalar_negative(in_12);
+    #endif
+    
+#line 347
+    #if UNROLL > 13
+        const npy_ubyte in_13 = *((const npy_ubyte *)(ip + 13 * istep));
+        *((npy_ubyte *)(op + 13 * ostep)) = scalar_negative(in_13);
+    #endif
+    
+#line 347
+    #if UNROLL > 14
+        const npy_ubyte in_14 = *((const npy_ubyte *)(ip + 14 * istep));
+        *((npy_ubyte *)(op + 14 * ostep)) = scalar_negative(in_14);
+    #endif
+    
+#line 347
+    #if UNROLL > 15
+        const npy_ubyte in_15 = *((const npy_ubyte *)(ip + 15 * istep));
+        *((npy_ubyte *)(op + 15 * ostep)) = scalar_negative(in_15);
+    #endif
+    
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; len > 0; --len, ip += istep, op += ostep) { // leftover elements, or all of them when the unrolled loop is compiled out
+        *((npy_ubyte *)op) = scalar_negative(*(const npy_ubyte *)ip);
+    }
+#ifdef TO_SIMD_SFX // the label exists only in builds that also contain the goto statements targeting it
+clear:
+    npyv_cleanup();
+#endif
+#if 0 // npy_clear_floatstatus_barrier call is compiled out for this type
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 257
+#undef TO_SIMD_SFX // start clean; it stays undefined unless one of the width cases below matches
+#if 0 // always false; the real cases are the #elif branches that follow
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 8 // NPY_SIMD nonzero and the type is 8 bits wide
+    #if 0 // float suffix branch: compiled out here
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1 // always taken: TO_SIMD_SFX(X) pastes the unsigned suffix _u8 onto X
+        #define TO_SIMD_SFX(X) X##_u8
+    #else // signed suffix branch: unreachable after the #elif 1
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 16 // same pattern, 16-bit case
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1 // always taken: suffix _u16
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 32 // same pattern, 32-bit case
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1 // always taken: suffix _u32
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 64 // same pattern, 64-bit case
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1 // always taken: suffix _u64
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif // end of width selection; TO_SIMD_SFX is undefined if NPY_SIMD is 0 or no width matched
+
+#line 283
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(USHORT_negative)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{ // negative inner loop for npy_ushort: dimensions[0] elements, input args[0], output args[1], steps[] are byte strides
+    char *ip = args[0], *op = args[1]; // input / output cursors; char* so all stepping below is in bytes
+    npy_intp istep = steps[0], ostep = steps[1],
+             len = dimensions[0];
+#ifdef TO_SIMD_SFX // SIMD dispatch: present only if the preprocessor chain before this function defined a lane suffix
+    #undef STYPE // STYPE is redefined on the next line as npyv_lanetype with the selected suffix
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (!is_mem_overlap(ip, istep, op, ostep, len)) { // SIMD kernels are tried only when is_mem_overlap() returns false
+        if (IS_UNARY_CONT(npy_ushort, npy_ushort)) {
+            // no overlap and operands are contiguous: hand the whole buffer to the contiguous kernel
+            TO_SIMD_SFX(simd_unary_cc_negative)(
+                (STYPE*)ip, (STYPE*)op, len
+            );
+            goto clear; // fully handled; skip the scalar loops
+        }
+    #if 0 // strided SIMD kernels (cn / nc / nn) are compiled out for this type
+        const npy_intp istride = istep / sizeof(STYPE); // byte step -> element stride
+        const npy_intp ostride = ostep / sizeof(STYPE);
+        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+            TO_SIMD_SFX(npyv_storable_stride)(ostride))
+        {
+            if (istride == 1 && ostride != 1) {
+                // contiguous input, non-contiguous output
+                TO_SIMD_SFX(simd_unary_cn_negative)(
+                    (STYPE*)ip, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            else if (istride != 1 && ostride == 1) {
+                // non-contiguous input, contiguous output
+                TO_SIMD_SFX(simd_unary_nc_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, len
+                );
+                goto clear;
+            }
+        // X86 does better with unrolled scalar for heavy non-contiguous
+        #ifndef NPY_HAVE_SSE2
+            else if (istride != 1 && ostride != 1) {
+                // non-contiguous input and output
+                TO_SIMD_SFX(simd_unary_nn_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+        #endif
+        }
+    #endif // 0
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * scalar unrolls: fallback for whatever did not take a SIMD branch above
+     * 8x unroll performed best on
+     *  - Apple M1 Native / arm64
+     *  - Apple M1 Rosetta / SSE42
+     *  - iMacPro / AVX512
+     */
+    #define UNROLL 8
+    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
+    #line 347
+    #if UNROLL > 0 // slot k: read the element at ip + k*istep, write scalar_negative() of it at op + k*ostep
+        const npy_ushort in_0 = *((const npy_ushort *)(ip + 0 * istep));
+        *((npy_ushort *)(op + 0 * ostep)) = scalar_negative(in_0);
+    #endif
+    
+#line 347
+    #if UNROLL > 1
+        const npy_ushort in_1 = *((const npy_ushort *)(ip + 1 * istep));
+        *((npy_ushort *)(op + 1 * ostep)) = scalar_negative(in_1);
+    #endif
+    
+#line 347
+    #if UNROLL > 2
+        const npy_ushort in_2 = *((const npy_ushort *)(ip + 2 * istep));
+        *((npy_ushort *)(op + 2 * ostep)) = scalar_negative(in_2);
+    #endif
+    
+#line 347
+    #if UNROLL > 3
+        const npy_ushort in_3 = *((const npy_ushort *)(ip + 3 * istep));
+        *((npy_ushort *)(op + 3 * ostep)) = scalar_negative(in_3);
+    #endif
+    
+#line 347
+    #if UNROLL > 4
+        const npy_ushort in_4 = *((const npy_ushort *)(ip + 4 * istep));
+        *((npy_ushort *)(op + 4 * ostep)) = scalar_negative(in_4);
+    #endif
+    
+#line 347
+    #if UNROLL > 5
+        const npy_ushort in_5 = *((const npy_ushort *)(ip + 5 * istep));
+        *((npy_ushort *)(op + 5 * ostep)) = scalar_negative(in_5);
+    #endif
+    
+#line 347
+    #if UNROLL > 6
+        const npy_ushort in_6 = *((const npy_ushort *)(ip + 6 * istep));
+        *((npy_ushort *)(op + 6 * ostep)) = scalar_negative(in_6);
+    #endif
+    
+#line 347
+    #if UNROLL > 7
+        const npy_ushort in_7 = *((const npy_ushort *)(ip + 7 * istep));
+        *((npy_ushort *)(op + 7 * ostep)) = scalar_negative(in_7);
+    #endif
+    
+#line 347
+    #if UNROLL > 8 // false for UNROLL == 8: slots 8..15 never compile
+        const npy_ushort in_8 = *((const npy_ushort *)(ip + 8 * istep));
+        *((npy_ushort *)(op + 8 * ostep)) = scalar_negative(in_8);
+    #endif
+    
+#line 347
+    #if UNROLL > 9
+        const npy_ushort in_9 = *((const npy_ushort *)(ip + 9 * istep));
+        *((npy_ushort *)(op + 9 * ostep)) = scalar_negative(in_9);
+    #endif
+    
+#line 347
+    #if UNROLL > 10
+        const npy_ushort in_10 = *((const npy_ushort *)(ip + 10 * istep));
+        *((npy_ushort *)(op + 10 * ostep)) = scalar_negative(in_10);
+    #endif
+    
+#line 347
+    #if UNROLL > 11
+        const npy_ushort in_11 = *((const npy_ushort *)(ip + 11 * istep));
+        *((npy_ushort *)(op + 11 * ostep)) = scalar_negative(in_11);
+    #endif
+    
+#line 347
+    #if UNROLL > 12
+        const npy_ushort in_12 = *((const npy_ushort *)(ip + 12 * istep));
+        *((npy_ushort *)(op + 12 * ostep)) = scalar_negative(in_12);
+    #endif
+    
+#line 347
+    #if UNROLL > 13
+        const npy_ushort in_13 = *((const npy_ushort *)(ip + 13 * istep));
+        *((npy_ushort *)(op + 13 * ostep)) = scalar_negative(in_13);
+    #endif
+    
+#line 347
+    #if UNROLL > 14
+        const npy_ushort in_14 = *((const npy_ushort *)(ip + 14 * istep));
+        *((npy_ushort *)(op + 14 * ostep)) = scalar_negative(in_14);
+    #endif
+    
+#line 347
+    #if UNROLL > 15
+        const npy_ushort in_15 = *((const npy_ushort *)(ip + 15 * istep));
+        *((npy_ushort *)(op + 15 * ostep)) = scalar_negative(in_15);
+    #endif
+    
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; len > 0; --len, ip += istep, op += ostep) { // leftover elements, or all of them when the unrolled loop is compiled out
+        *((npy_ushort *)op) = scalar_negative(*(const npy_ushort *)ip);
+    }
+#ifdef TO_SIMD_SFX // the label exists only in builds that also contain the goto statements targeting it
+clear:
+    npyv_cleanup();
+#endif
+#if 0 // npy_clear_floatstatus_barrier call is compiled out for this type
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 257
+#undef TO_SIMD_SFX // start clean; it stays undefined unless one of the width cases below matches
+#if 0 // always false; the real cases are the #elif branches that follow
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_INT == 8 // NPY_SIMD nonzero and the type is 8 bits wide
+    #if 0 // float suffix branch: compiled out here
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1 // always taken: TO_SIMD_SFX(X) pastes the unsigned suffix _u8 onto X
+        #define TO_SIMD_SFX(X) X##_u8
+    #else // signed suffix branch: unreachable after the #elif 1
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_INT == 16 // same pattern, 16-bit case
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1 // always taken: suffix _u16
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_INT == 32 // same pattern, 32-bit case
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1 // always taken: suffix _u32
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_INT == 64 // same pattern, 64-bit case
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1 // always taken: suffix _u64
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif // end of width selection; TO_SIMD_SFX is undefined if NPY_SIMD is 0 or no width matched
+
+#line 283
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(UINT_negative)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{ // negative inner loop for npy_uint: dimensions[0] elements, input args[0], output args[1], steps[] are byte strides
+    char *ip = args[0], *op = args[1]; // input / output cursors; char* so all stepping below is in bytes
+    npy_intp istep = steps[0], ostep = steps[1],
+             len = dimensions[0];
+#ifdef TO_SIMD_SFX // SIMD dispatch: present only if the preprocessor chain before this function defined a lane suffix
+    #undef STYPE // STYPE is redefined on the next line as npyv_lanetype with the selected suffix
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (!is_mem_overlap(ip, istep, op, ostep, len)) { // SIMD kernels are tried only when is_mem_overlap() returns false
+        if (IS_UNARY_CONT(npy_uint, npy_uint)) {
+            // no overlap and operands are contiguous: hand the whole buffer to the contiguous kernel
+            TO_SIMD_SFX(simd_unary_cc_negative)(
+                (STYPE*)ip, (STYPE*)op, len
+            );
+            goto clear; // fully handled; skip the scalar loops
+        }
+    #if 1 // strided SIMD kernels (cn / nc / nn) are compiled in for this type
+        const npy_intp istride = istep / sizeof(STYPE); // byte step -> element stride; NOTE(review): truncates unless istep is a multiple of sizeof(STYPE) - confirm callers guarantee that
+        const npy_intp ostride = ostep / sizeof(STYPE); // same conversion (and same assumption) for the output step
+        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+            TO_SIMD_SFX(npyv_storable_stride)(ostride))
+        {
+            if (istride == 1 && ostride != 1) {
+                // contiguous input, non-contiguous output
+                TO_SIMD_SFX(simd_unary_cn_negative)(
+                    (STYPE*)ip, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            else if (istride != 1 && ostride == 1) {
+                // non-contiguous input, contiguous output
+                TO_SIMD_SFX(simd_unary_nc_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, len
+                );
+                goto clear;
+            }
+        // X86 does better with unrolled scalar for heavy non-contiguous
+        #ifndef NPY_HAVE_SSE2
+            else if (istride != 1 && ostride != 1) {
+                // non-contiguous input and output
+                TO_SIMD_SFX(simd_unary_nn_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+        #endif
+        } // strides rejected, or no branch matched: fall through to the scalar loops
+    #endif // 1
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * scalar unrolls: fallback for whatever did not take a SIMD branch above
+     * 8x unroll performed best on
+     *  - Apple M1 Native / arm64
+     *  - Apple M1 Rosetta / SSE42
+     *  - iMacPro / AVX512
+     */
+    #define UNROLL 8
+    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
+    #line 347
+    #if UNROLL > 0 // slot k: read the element at ip + k*istep, write scalar_negative() of it at op + k*ostep
+        const npy_uint in_0 = *((const npy_uint *)(ip + 0 * istep));
+        *((npy_uint *)(op + 0 * ostep)) = scalar_negative(in_0);
+    #endif
+    
+#line 347
+    #if UNROLL > 1
+        const npy_uint in_1 = *((const npy_uint *)(ip + 1 * istep));
+        *((npy_uint *)(op + 1 * ostep)) = scalar_negative(in_1);
+    #endif
+    
+#line 347
+    #if UNROLL > 2
+        const npy_uint in_2 = *((const npy_uint *)(ip + 2 * istep));
+        *((npy_uint *)(op + 2 * ostep)) = scalar_negative(in_2);
+    #endif
+    
+#line 347
+    #if UNROLL > 3
+        const npy_uint in_3 = *((const npy_uint *)(ip + 3 * istep));
+        *((npy_uint *)(op + 3 * ostep)) = scalar_negative(in_3);
+    #endif
+    
+#line 347
+    #if UNROLL > 4
+        const npy_uint in_4 = *((const npy_uint *)(ip + 4 * istep));
+        *((npy_uint *)(op + 4 * ostep)) = scalar_negative(in_4);
+    #endif
+    
+#line 347
+    #if UNROLL > 5
+        const npy_uint in_5 = *((const npy_uint *)(ip + 5 * istep));
+        *((npy_uint *)(op + 5 * ostep)) = scalar_negative(in_5);
+    #endif
+    
+#line 347
+    #if UNROLL > 6
+        const npy_uint in_6 = *((const npy_uint *)(ip + 6 * istep));
+        *((npy_uint *)(op + 6 * ostep)) = scalar_negative(in_6);
+    #endif
+    
+#line 347
+    #if UNROLL > 7
+        const npy_uint in_7 = *((const npy_uint *)(ip + 7 * istep));
+        *((npy_uint *)(op + 7 * ostep)) = scalar_negative(in_7);
+    #endif
+    
+#line 347
+    #if UNROLL > 8 // false for UNROLL == 8: slots 8..15 never compile
+        const npy_uint in_8 = *((const npy_uint *)(ip + 8 * istep));
+        *((npy_uint *)(op + 8 * ostep)) = scalar_negative(in_8);
+    #endif
+    
+#line 347
+    #if UNROLL > 9
+        const npy_uint in_9 = *((const npy_uint *)(ip + 9 * istep));
+        *((npy_uint *)(op + 9 * ostep)) = scalar_negative(in_9);
+    #endif
+    
+#line 347
+    #if UNROLL > 10
+        const npy_uint in_10 = *((const npy_uint *)(ip + 10 * istep));
+        *((npy_uint *)(op + 10 * ostep)) = scalar_negative(in_10);
+    #endif
+    
+#line 347
+    #if UNROLL > 11
+        const npy_uint in_11 = *((const npy_uint *)(ip + 11 * istep));
+        *((npy_uint *)(op + 11 * ostep)) = scalar_negative(in_11);
+    #endif
+    
+#line 347
+    #if UNROLL > 12
+        const npy_uint in_12 = *((const npy_uint *)(ip + 12 * istep));
+        *((npy_uint *)(op + 12 * ostep)) = scalar_negative(in_12);
+    #endif
+    
+#line 347
+    #if UNROLL > 13
+        const npy_uint in_13 = *((const npy_uint *)(ip + 13 * istep));
+        *((npy_uint *)(op + 13 * ostep)) = scalar_negative(in_13);
+    #endif
+    
+#line 347
+    #if UNROLL > 14
+        const npy_uint in_14 = *((const npy_uint *)(ip + 14 * istep));
+        *((npy_uint *)(op + 14 * ostep)) = scalar_negative(in_14);
+    #endif
+    
+#line 347
+    #if UNROLL > 15
+        const npy_uint in_15 = *((const npy_uint *)(ip + 15 * istep));
+        *((npy_uint *)(op + 15 * ostep)) = scalar_negative(in_15);
+    #endif
+    
+    }
+#endif // !NPY_DISABLE_OPTIMIZATION
+    for (; len > 0; --len, ip += istep, op += ostep) { // leftover elements, or all of them when the unrolled loop is compiled out
+        *((npy_uint *)op) = scalar_negative(*(const npy_uint *)ip);
+    }
+#ifdef TO_SIMD_SFX // the label exists only in builds that also contain the goto statements targeting it
+clear:
+    npyv_cleanup();
+#endif
+#if 0 // npy_clear_floatstatus_barrier call is compiled out for this type
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 257
+#undef TO_SIMD_SFX // start clean; it stays undefined unless one of the width cases below matches
+#if 0 // always false; the real cases are the #elif branches that follow
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONG == 8 // NPY_SIMD nonzero and the type is 8 bits wide
+    #if 0 // float suffix branch: compiled out here
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1 // always taken: TO_SIMD_SFX(X) pastes the unsigned suffix _u8 onto X
+        #define TO_SIMD_SFX(X) X##_u8
+    #else // signed suffix branch: unreachable after the #elif 1
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONG == 16 // same pattern, 16-bit case
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1 // always taken: suffix _u16
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONG == 32 // same pattern, 32-bit case
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1 // always taken: suffix _u32
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONG == 64 // same pattern, 64-bit case
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1 // always taken: suffix _u64
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif // end of width selection; TO_SIMD_SFX is undefined if NPY_SIMD is 0 or no width matched
+
+#line 283
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONG_negative)  // inner loop applying the negative operation to npy_ulong elements
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip = args[0], *op = args[1];  // args[0] is the input base pointer, args[1] the output base pointer
+    npy_intp istep = steps[0], ostep = steps[1],  // distance between consecutive elements in bytes (added to the char pointers)
+             len = dimensions[0];  // number of elements to process
+#ifdef TO_SIMD_SFX  // SIMD section, present only when a lane suffix macro is defined
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)  // lane type name formed by pasting the selected suffix
+    if (!is_mem_overlap(ip, istep, op, ostep, len)) {  // SIMD kernels are tried only when is_mem_overlap() reports false
+        if (IS_UNARY_CONT(npy_ulong, npy_ulong)) {
+            // no overlap and operands are contiguous
+            TO_SIMD_SFX(simd_unary_cc_negative)(
+                (STYPE*)ip, (STYPE*)op, len
+            );
+            goto clear;  // done: skip the scalar loops below
+        }
+    #if 1  // generated constant: the strided SIMD paths below are compiled in for this type
+        const npy_intp istride = istep / sizeof(STYPE);  // byte step converted to a step counted in lanes
+        const npy_intp ostride = ostep / sizeof(STYPE);  // NOTE(review): sizeof yields size_t, so a negative step may be divided as unsigned; confirm intended
+        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+            TO_SIMD_SFX(npyv_storable_stride)(ostride))
+        {
+            if (istride == 1 && ostride != 1) {
+                // contiguous input, non-contiguous output
+                TO_SIMD_SFX(simd_unary_cn_negative)(
+                    (STYPE*)ip, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            else if (istride != 1 && ostride == 1) {
+                // non-contiguous input, contiguous output
+                TO_SIMD_SFX(simd_unary_nc_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, len
+                );
+                goto clear;
+            }
+        // X86 does better with unrolled scalar for heavy non-contiguous
+        #ifndef NPY_HAVE_SSE2  // hence this both-strided SIMD path is omitted whenever NPY_HAVE_SSE2 is defined
+            else if (istride != 1 && ostride != 1) {
+                // non-contiguous input and output
+                TO_SIMD_SFX(simd_unary_nn_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+        #endif
+        }
+    #endif // 1
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * scalar unrolls (reached whenever no SIMD path above jumped to clear)
+     * 8x unroll performed best on
+     *  - Apple M1 Native / arm64
+     *  - Apple M1 Rosetta / SSE42
+     *  - iMacPro / AVX512
+     */
+    #define UNROLL 8  // elements handled per iteration of the unrolled loop
+    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
+    #line 347
+    #if UNROLL > 0
+        const npy_ulong in_0 = *((const npy_ulong *)(ip + 0 * istep));  // load element k of the group at byte offset k * istep
+        *((npy_ulong *)(op + 0 * ostep)) = scalar_negative(in_0);  // store scalar_negative() of it at byte offset k * ostep
+    #endif
+    
+#line 347
+    #if UNROLL > 1
+        const npy_ulong in_1 = *((const npy_ulong *)(ip + 1 * istep));
+        *((npy_ulong *)(op + 1 * ostep)) = scalar_negative(in_1);
+    #endif
+    
+#line 347
+    #if UNROLL > 2
+        const npy_ulong in_2 = *((const npy_ulong *)(ip + 2 * istep));
+        *((npy_ulong *)(op + 2 * ostep)) = scalar_negative(in_2);
+    #endif
+    
+#line 347
+    #if UNROLL > 3
+        const npy_ulong in_3 = *((const npy_ulong *)(ip + 3 * istep));
+        *((npy_ulong *)(op + 3 * ostep)) = scalar_negative(in_3);
+    #endif
+    
+#line 347
+    #if UNROLL > 4
+        const npy_ulong in_4 = *((const npy_ulong *)(ip + 4 * istep));
+        *((npy_ulong *)(op + 4 * ostep)) = scalar_negative(in_4);
+    #endif
+    
+#line 347
+    #if UNROLL > 5
+        const npy_ulong in_5 = *((const npy_ulong *)(ip + 5 * istep));
+        *((npy_ulong *)(op + 5 * ostep)) = scalar_negative(in_5);
+    #endif
+    
+#line 347
+    #if UNROLL > 6
+        const npy_ulong in_6 = *((const npy_ulong *)(ip + 6 * istep));
+        *((npy_ulong *)(op + 6 * ostep)) = scalar_negative(in_6);
+    #endif
+    
+#line 347
+    #if UNROLL > 7
+        const npy_ulong in_7 = *((const npy_ulong *)(ip + 7 * istep));
+        *((npy_ulong *)(op + 7 * ostep)) = scalar_negative(in_7);
+    #endif
+    
+#line 347
+    #if UNROLL > 8  // this guard and the following ones are false while UNROLL is 8
+        const npy_ulong in_8 = *((const npy_ulong *)(ip + 8 * istep));
+        *((npy_ulong *)(op + 8 * ostep)) = scalar_negative(in_8);
+    #endif
+    
+#line 347
+    #if UNROLL > 9
+        const npy_ulong in_9 = *((const npy_ulong *)(ip + 9 * istep));
+        *((npy_ulong *)(op + 9 * ostep)) = scalar_negative(in_9);
+    #endif
+    
+#line 347
+    #if UNROLL > 10
+        const npy_ulong in_10 = *((const npy_ulong *)(ip + 10 * istep));
+        *((npy_ulong *)(op + 10 * ostep)) = scalar_negative(in_10);
+    #endif
+    
+#line 347
+    #if UNROLL > 11
+        const npy_ulong in_11 = *((const npy_ulong *)(ip + 11 * istep));
+        *((npy_ulong *)(op + 11 * ostep)) = scalar_negative(in_11);
+    #endif
+    
+#line 347
+    #if UNROLL > 12
+        const npy_ulong in_12 = *((const npy_ulong *)(ip + 12 * istep));
+        *((npy_ulong *)(op + 12 * ostep)) = scalar_negative(in_12);
+    #endif
+    
+#line 347
+    #if UNROLL > 13
+        const npy_ulong in_13 = *((const npy_ulong *)(ip + 13 * istep));
+        *((npy_ulong *)(op + 13 * ostep)) = scalar_negative(in_13);
+    #endif
+    
+#line 347
+    #if UNROLL > 14
+        const npy_ulong in_14 = *((const npy_ulong *)(ip + 14 * istep));
+        *((npy_ulong *)(op + 14 * ostep)) = scalar_negative(in_14);
+    #endif
+    
+#line 347
+    #if UNROLL > 15
+        const npy_ulong in_15 = *((const npy_ulong *)(ip + 15 * istep));
+        *((npy_ulong *)(op + 15 * ostep)) = scalar_negative(in_15);
+    #endif
+    
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    for (; len > 0; --len, ip += istep, op += ostep) {  // remaining elements (all of them when the unrolled loop is compiled out)
+        *((npy_ulong *)op) = scalar_negative(*(const npy_ulong *)ip);
+    }
+#ifdef TO_SIMD_SFX
+clear:  // SIMD paths jump here; the scalar path falls through to it
+    npyv_cleanup();
+#endif
+#if 0  // generated constant: the float-status call below is compiled out for this type
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 257
+#undef TO_SIMD_SFX  // discard any suffix left from the previous expansion before choosing one from NPY_BITSOF_LONGLONG
+#if 0  // always-false head so that every width test below can be written as an #elif
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8  // each #elif maps one bit width to a lane suffix; at most one is taken
+    #if 0  // generated constant: the floating-point suffix branch is never taken here
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1  // generated constant: the unsigned suffix is the one selected
+        #define TO_SIMD_SFX(X) X##_u8
+    #else
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16  // same pattern with the 16-bit suffixes
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32  // same pattern with the 32-bit suffixes
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64  // same pattern with the 64-bit suffixes
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 1
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif  // when no branch matched, TO_SIMD_SFX stays undefined and the #ifdef TO_SIMD_SFX sections that follow are skipped
+
+#line 283
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(ULONGLONG_negative)  // inner loop applying the negative operation to npy_ulonglong elements
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip = args[0], *op = args[1];  // args[0] is the input base pointer, args[1] the output base pointer
+    npy_intp istep = steps[0], ostep = steps[1],  // distance between consecutive elements in bytes (added to the char pointers)
+             len = dimensions[0];  // number of elements to process
+#ifdef TO_SIMD_SFX  // SIMD section, present only when a lane suffix macro is defined
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)  // lane type name formed by pasting the selected suffix
+    if (!is_mem_overlap(ip, istep, op, ostep, len)) {  // SIMD kernels are tried only when is_mem_overlap() reports false
+        if (IS_UNARY_CONT(npy_ulonglong, npy_ulonglong)) {
+            // no overlap and operands are contiguous
+            TO_SIMD_SFX(simd_unary_cc_negative)(
+                (STYPE*)ip, (STYPE*)op, len
+            );
+            goto clear;  // done: skip the scalar loops below
+        }
+    #if 1  // generated constant: the strided SIMD paths below are compiled in for this type
+        const npy_intp istride = istep / sizeof(STYPE);  // byte step converted to a step counted in lanes
+        const npy_intp ostride = ostep / sizeof(STYPE);  // NOTE(review): sizeof yields size_t, so a negative step may be divided as unsigned; confirm intended
+        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+            TO_SIMD_SFX(npyv_storable_stride)(ostride))
+        {
+            if (istride == 1 && ostride != 1) {
+                // contiguous input, non-contiguous output
+                TO_SIMD_SFX(simd_unary_cn_negative)(
+                    (STYPE*)ip, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            else if (istride != 1 && ostride == 1) {
+                // non-contiguous input, contiguous output
+                TO_SIMD_SFX(simd_unary_nc_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, len
+                );
+                goto clear;
+            }
+        // X86 does better with unrolled scalar for heavy non-contiguous
+        #ifndef NPY_HAVE_SSE2  // hence this both-strided SIMD path is omitted whenever NPY_HAVE_SSE2 is defined
+            else if (istride != 1 && ostride != 1) {
+                // non-contiguous input and output
+                TO_SIMD_SFX(simd_unary_nn_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+        #endif
+        }
+    #endif // 1
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * scalar unrolls (reached whenever no SIMD path above jumped to clear)
+     * 8x unroll performed best on
+     *  - Apple M1 Native / arm64
+     *  - Apple M1 Rosetta / SSE42
+     *  - iMacPro / AVX512
+     */
+    #define UNROLL 8  // elements handled per iteration of the unrolled loop
+    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
+    #line 347
+    #if UNROLL > 0
+        const npy_ulonglong in_0 = *((const npy_ulonglong *)(ip + 0 * istep));  // load element k of the group at byte offset k * istep
+        *((npy_ulonglong *)(op + 0 * ostep)) = scalar_negative(in_0);  // store scalar_negative() of it at byte offset k * ostep
+    #endif
+    
+#line 347
+    #if UNROLL > 1
+        const npy_ulonglong in_1 = *((const npy_ulonglong *)(ip + 1 * istep));
+        *((npy_ulonglong *)(op + 1 * ostep)) = scalar_negative(in_1);
+    #endif
+    
+#line 347
+    #if UNROLL > 2
+        const npy_ulonglong in_2 = *((const npy_ulonglong *)(ip + 2 * istep));
+        *((npy_ulonglong *)(op + 2 * ostep)) = scalar_negative(in_2);
+    #endif
+    
+#line 347
+    #if UNROLL > 3
+        const npy_ulonglong in_3 = *((const npy_ulonglong *)(ip + 3 * istep));
+        *((npy_ulonglong *)(op + 3 * ostep)) = scalar_negative(in_3);
+    #endif
+    
+#line 347
+    #if UNROLL > 4
+        const npy_ulonglong in_4 = *((const npy_ulonglong *)(ip + 4 * istep));
+        *((npy_ulonglong *)(op + 4 * ostep)) = scalar_negative(in_4);
+    #endif
+    
+#line 347
+    #if UNROLL > 5
+        const npy_ulonglong in_5 = *((const npy_ulonglong *)(ip + 5 * istep));
+        *((npy_ulonglong *)(op + 5 * ostep)) = scalar_negative(in_5);
+    #endif
+    
+#line 347
+    #if UNROLL > 6
+        const npy_ulonglong in_6 = *((const npy_ulonglong *)(ip + 6 * istep));
+        *((npy_ulonglong *)(op + 6 * ostep)) = scalar_negative(in_6);
+    #endif
+    
+#line 347
+    #if UNROLL > 7
+        const npy_ulonglong in_7 = *((const npy_ulonglong *)(ip + 7 * istep));
+        *((npy_ulonglong *)(op + 7 * ostep)) = scalar_negative(in_7);
+    #endif
+    
+#line 347
+    #if UNROLL > 8  // this guard and the following ones are false while UNROLL is 8
+        const npy_ulonglong in_8 = *((const npy_ulonglong *)(ip + 8 * istep));
+        *((npy_ulonglong *)(op + 8 * ostep)) = scalar_negative(in_8);
+    #endif
+    
+#line 347
+    #if UNROLL > 9
+        const npy_ulonglong in_9 = *((const npy_ulonglong *)(ip + 9 * istep));
+        *((npy_ulonglong *)(op + 9 * ostep)) = scalar_negative(in_9);
+    #endif
+    
+#line 347
+    #if UNROLL > 10
+        const npy_ulonglong in_10 = *((const npy_ulonglong *)(ip + 10 * istep));
+        *((npy_ulonglong *)(op + 10 * ostep)) = scalar_negative(in_10);
+    #endif
+    
+#line 347
+    #if UNROLL > 11
+        const npy_ulonglong in_11 = *((const npy_ulonglong *)(ip + 11 * istep));
+        *((npy_ulonglong *)(op + 11 * ostep)) = scalar_negative(in_11);
+    #endif
+    
+#line 347
+    #if UNROLL > 12
+        const npy_ulonglong in_12 = *((const npy_ulonglong *)(ip + 12 * istep));
+        *((npy_ulonglong *)(op + 12 * ostep)) = scalar_negative(in_12);
+    #endif
+    
+#line 347
+    #if UNROLL > 13
+        const npy_ulonglong in_13 = *((const npy_ulonglong *)(ip + 13 * istep));
+        *((npy_ulonglong *)(op + 13 * ostep)) = scalar_negative(in_13);
+    #endif
+    
+#line 347
+    #if UNROLL > 14
+        const npy_ulonglong in_14 = *((const npy_ulonglong *)(ip + 14 * istep));
+        *((npy_ulonglong *)(op + 14 * ostep)) = scalar_negative(in_14);
+    #endif
+    
+#line 347
+    #if UNROLL > 15
+        const npy_ulonglong in_15 = *((const npy_ulonglong *)(ip + 15 * istep));
+        *((npy_ulonglong *)(op + 15 * ostep)) = scalar_negative(in_15);
+    #endif
+    
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    for (; len > 0; --len, ip += istep, op += ostep) {  // remaining elements (all of them when the unrolled loop is compiled out)
+        *((npy_ulonglong *)op) = scalar_negative(*(const npy_ulonglong *)ip);
+    }
+#ifdef TO_SIMD_SFX
+clear:  // SIMD paths jump here; the scalar path falls through to it
+    npyv_cleanup();
+#endif
+#if 0  // generated constant: the float-status call below is compiled out for this type
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 257
+#undef TO_SIMD_SFX  // discard any suffix left from the previous expansion before choosing one from NPY_BITSOF_BYTE
+#if 0  // always-false head so that every width test below can be written as an #elif
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 8  // each #elif maps one bit width to a lane suffix; at most one is taken
+    #if 0  // generated constant: the floating-point suffix branch is never taken here
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0  // generated constant: the unsigned suffix is not used here
+        #define TO_SIMD_SFX(X) X##_u8
+    #else  // so the signed suffix is the one selected
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 16  // same pattern with the 16-bit suffixes
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 32  // same pattern with the 32-bit suffixes
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_BYTE == 64  // same pattern with the 64-bit suffixes
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_BYTE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_BYTE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif  // when no branch matched, TO_SIMD_SFX stays undefined and the #ifdef TO_SIMD_SFX sections that follow are skipped
+
+#line 283
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(BYTE_negative)  // inner loop applying the negative operation to npy_byte elements
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip = args[0], *op = args[1];  // args[0] is the input base pointer, args[1] the output base pointer
+    npy_intp istep = steps[0], ostep = steps[1],  // distance between consecutive elements in bytes (added to the char pointers)
+             len = dimensions[0];  // number of elements to process
+#ifdef TO_SIMD_SFX  // SIMD section, present only when a lane suffix macro is defined
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)  // lane type name formed by pasting the selected suffix
+    if (!is_mem_overlap(ip, istep, op, ostep, len)) {  // SIMD kernels are tried only when is_mem_overlap() reports false
+        if (IS_UNARY_CONT(npy_byte, npy_byte)) {
+            // no overlap and operands are contiguous
+            TO_SIMD_SFX(simd_unary_cc_negative)(
+                (STYPE*)ip, (STYPE*)op, len
+            );
+            goto clear;  // done: skip the scalar loops below
+        }
+    #if 0  // generated constant: the strided SIMD paths below are compiled out for this type
+        const npy_intp istride = istep / sizeof(STYPE);  // byte step converted to a step counted in lanes
+        const npy_intp ostride = ostep / sizeof(STYPE);  // NOTE(review): sizeof yields size_t, so a negative step may be divided as unsigned; confirm intended
+        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+            TO_SIMD_SFX(npyv_storable_stride)(ostride))
+        {
+            if (istride == 1 && ostride != 1) {
+                // contiguous input, non-contiguous output
+                TO_SIMD_SFX(simd_unary_cn_negative)(
+                    (STYPE*)ip, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            else if (istride != 1 && ostride == 1) {
+                // non-contiguous input, contiguous output
+                TO_SIMD_SFX(simd_unary_nc_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, len
+                );
+                goto clear;
+            }
+        // X86 does better with unrolled scalar for heavy non-contiguous
+        #ifndef NPY_HAVE_SSE2  // hence this both-strided SIMD path is omitted whenever NPY_HAVE_SSE2 is defined
+            else if (istride != 1 && ostride != 1) {
+                // non-contiguous input and output
+                TO_SIMD_SFX(simd_unary_nn_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+        #endif
+        }
+    #endif // 0
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * scalar unrolls (reached whenever no SIMD path above jumped to clear)
+     * 8x unroll performed best on
+     *  - Apple M1 Native / arm64
+     *  - Apple M1 Rosetta / SSE42
+     *  - iMacPro / AVX512
+     */
+    #define UNROLL 8  // elements handled per iteration of the unrolled loop
+    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
+    #line 347
+    #if UNROLL > 0
+        const npy_byte in_0 = *((const npy_byte *)(ip + 0 * istep));  // load element k of the group at byte offset k * istep
+        *((npy_byte *)(op + 0 * ostep)) = scalar_negative(in_0);  // store scalar_negative() of it at byte offset k * ostep
+    #endif
+    
+#line 347
+    #if UNROLL > 1
+        const npy_byte in_1 = *((const npy_byte *)(ip + 1 * istep));
+        *((npy_byte *)(op + 1 * ostep)) = scalar_negative(in_1);
+    #endif
+    
+#line 347
+    #if UNROLL > 2
+        const npy_byte in_2 = *((const npy_byte *)(ip + 2 * istep));
+        *((npy_byte *)(op + 2 * ostep)) = scalar_negative(in_2);
+    #endif
+    
+#line 347
+    #if UNROLL > 3
+        const npy_byte in_3 = *((const npy_byte *)(ip + 3 * istep));
+        *((npy_byte *)(op + 3 * ostep)) = scalar_negative(in_3);
+    #endif
+    
+#line 347
+    #if UNROLL > 4
+        const npy_byte in_4 = *((const npy_byte *)(ip + 4 * istep));
+        *((npy_byte *)(op + 4 * ostep)) = scalar_negative(in_4);
+    #endif
+    
+#line 347
+    #if UNROLL > 5
+        const npy_byte in_5 = *((const npy_byte *)(ip + 5 * istep));
+        *((npy_byte *)(op + 5 * ostep)) = scalar_negative(in_5);
+    #endif
+    
+#line 347
+    #if UNROLL > 6
+        const npy_byte in_6 = *((const npy_byte *)(ip + 6 * istep));
+        *((npy_byte *)(op + 6 * ostep)) = scalar_negative(in_6);
+    #endif
+    
+#line 347
+    #if UNROLL > 7
+        const npy_byte in_7 = *((const npy_byte *)(ip + 7 * istep));
+        *((npy_byte *)(op + 7 * ostep)) = scalar_negative(in_7);
+    #endif
+    
+#line 347
+    #if UNROLL > 8  // this guard and the following ones are false while UNROLL is 8
+        const npy_byte in_8 = *((const npy_byte *)(ip + 8 * istep));
+        *((npy_byte *)(op + 8 * ostep)) = scalar_negative(in_8);
+    #endif
+    
+#line 347
+    #if UNROLL > 9
+        const npy_byte in_9 = *((const npy_byte *)(ip + 9 * istep));
+        *((npy_byte *)(op + 9 * ostep)) = scalar_negative(in_9);
+    #endif
+    
+#line 347
+    #if UNROLL > 10
+        const npy_byte in_10 = *((const npy_byte *)(ip + 10 * istep));
+        *((npy_byte *)(op + 10 * ostep)) = scalar_negative(in_10);
+    #endif
+    
+#line 347
+    #if UNROLL > 11
+        const npy_byte in_11 = *((const npy_byte *)(ip + 11 * istep));
+        *((npy_byte *)(op + 11 * ostep)) = scalar_negative(in_11);
+    #endif
+    
+#line 347
+    #if UNROLL > 12
+        const npy_byte in_12 = *((const npy_byte *)(ip + 12 * istep));
+        *((npy_byte *)(op + 12 * ostep)) = scalar_negative(in_12);
+    #endif
+    
+#line 347
+    #if UNROLL > 13
+        const npy_byte in_13 = *((const npy_byte *)(ip + 13 * istep));
+        *((npy_byte *)(op + 13 * ostep)) = scalar_negative(in_13);
+    #endif
+    
+#line 347
+    #if UNROLL > 14
+        const npy_byte in_14 = *((const npy_byte *)(ip + 14 * istep));
+        *((npy_byte *)(op + 14 * ostep)) = scalar_negative(in_14);
+    #endif
+    
+#line 347
+    #if UNROLL > 15
+        const npy_byte in_15 = *((const npy_byte *)(ip + 15 * istep));
+        *((npy_byte *)(op + 15 * ostep)) = scalar_negative(in_15);
+    #endif
+    
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    for (; len > 0; --len, ip += istep, op += ostep) {  // remaining elements (all of them when the unrolled loop is compiled out)
+        *((npy_byte *)op) = scalar_negative(*(const npy_byte *)ip);
+    }
+#ifdef TO_SIMD_SFX
+clear:  // SIMD paths jump here; the scalar path falls through to it
+    npyv_cleanup();
+#endif
+#if 0  // generated constant: the float-status call below is compiled out for this type
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 257
+#undef TO_SIMD_SFX  // discard any suffix left from the previous expansion before choosing one from NPY_BITSOF_SHORT
+#if 0  // always-false head so that every width test below can be written as an #elif
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 8  // each #elif maps one bit width to a lane suffix; at most one is taken
+    #if 0  // generated constant: the floating-point suffix branch is never taken here
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0  // generated constant: the unsigned suffix is not used here
+        #define TO_SIMD_SFX(X) X##_u8
+    #else  // so the signed suffix is the one selected
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 16  // same pattern with the 16-bit suffixes
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 32  // same pattern with the 32-bit suffixes
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_SHORT == 64  // same pattern with the 64-bit suffixes
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_SHORT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_SHORT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif  // when no branch matched, TO_SIMD_SFX stays undefined and the #ifdef TO_SIMD_SFX sections that follow are skipped
+
+#line 283
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(SHORT_negative)  // inner loop applying the negative operation to npy_short elements
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip = args[0], *op = args[1];  // args[0] is the input base pointer, args[1] the output base pointer
+    npy_intp istep = steps[0], ostep = steps[1],  // distance between consecutive elements in bytes (added to the char pointers)
+             len = dimensions[0];  // number of elements to process
+#ifdef TO_SIMD_SFX  // SIMD section, present only when a lane suffix macro is defined
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)  // lane type name formed by pasting the selected suffix
+    if (!is_mem_overlap(ip, istep, op, ostep, len)) {  // SIMD kernels are tried only when is_mem_overlap() reports false
+        if (IS_UNARY_CONT(npy_short, npy_short)) {
+            // no overlap and operands are contiguous
+            TO_SIMD_SFX(simd_unary_cc_negative)(
+                (STYPE*)ip, (STYPE*)op, len
+            );
+            goto clear;  // done: skip the scalar loops below
+        }
+    #if 0  // generated constant: the strided SIMD paths below are compiled out for this type
+        const npy_intp istride = istep / sizeof(STYPE);  // byte step converted to a step counted in lanes
+        const npy_intp ostride = ostep / sizeof(STYPE);  // NOTE(review): sizeof yields size_t, so a negative step may be divided as unsigned; confirm intended
+        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+            TO_SIMD_SFX(npyv_storable_stride)(ostride))
+        {
+            if (istride == 1 && ostride != 1) {
+                // contiguous input, non-contiguous output
+                TO_SIMD_SFX(simd_unary_cn_negative)(
+                    (STYPE*)ip, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            else if (istride != 1 && ostride == 1) {
+                // non-contiguous input, contiguous output
+                TO_SIMD_SFX(simd_unary_nc_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, len
+                );
+                goto clear;
+            }
+        // X86 does better with unrolled scalar for heavy non-contiguous
+        #ifndef NPY_HAVE_SSE2  // hence this both-strided SIMD path is omitted whenever NPY_HAVE_SSE2 is defined
+            else if (istride != 1 && ostride != 1) {
+                // non-contiguous input and output
+                TO_SIMD_SFX(simd_unary_nn_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+        #endif
+        }
+    #endif // 0
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * scalar unrolls (reached whenever no SIMD path above jumped to clear)
+     * 8x unroll performed best on
+     *  - Apple M1 Native / arm64
+     *  - Apple M1 Rosetta / SSE42
+     *  - iMacPro / AVX512
+     */
+    #define UNROLL 8  // elements handled per iteration of the unrolled loop
+    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
+    #line 347
+    #if UNROLL > 0
+        const npy_short in_0 = *((const npy_short *)(ip + 0 * istep));  // load element k of the group at byte offset k * istep
+        *((npy_short *)(op + 0 * ostep)) = scalar_negative(in_0);  // store scalar_negative() of it at byte offset k * ostep
+    #endif
+    
+#line 347
+    #if UNROLL > 1
+        const npy_short in_1 = *((const npy_short *)(ip + 1 * istep));
+        *((npy_short *)(op + 1 * ostep)) = scalar_negative(in_1);
+    #endif
+    
+#line 347
+    #if UNROLL > 2
+        const npy_short in_2 = *((const npy_short *)(ip + 2 * istep));
+        *((npy_short *)(op + 2 * ostep)) = scalar_negative(in_2);
+    #endif
+    
+#line 347
+    #if UNROLL > 3
+        const npy_short in_3 = *((const npy_short *)(ip + 3 * istep));
+        *((npy_short *)(op + 3 * ostep)) = scalar_negative(in_3);
+    #endif
+    
+#line 347
+    #if UNROLL > 4
+        const npy_short in_4 = *((const npy_short *)(ip + 4 * istep));
+        *((npy_short *)(op + 4 * ostep)) = scalar_negative(in_4);
+    #endif
+    
+#line 347
+    #if UNROLL > 5
+        const npy_short in_5 = *((const npy_short *)(ip + 5 * istep));
+        *((npy_short *)(op + 5 * ostep)) = scalar_negative(in_5);
+    #endif
+    
+#line 347
+    #if UNROLL > 6
+        const npy_short in_6 = *((const npy_short *)(ip + 6 * istep));
+        *((npy_short *)(op + 6 * ostep)) = scalar_negative(in_6);
+    #endif
+    
+#line 347
+    #if UNROLL > 7
+        const npy_short in_7 = *((const npy_short *)(ip + 7 * istep));
+        *((npy_short *)(op + 7 * ostep)) = scalar_negative(in_7);
+    #endif
+    
+#line 347
+    #if UNROLL > 8  // this guard and the following ones are false while UNROLL is 8
+        const npy_short in_8 = *((const npy_short *)(ip + 8 * istep));
+        *((npy_short *)(op + 8 * ostep)) = scalar_negative(in_8);
+    #endif
+    
+#line 347
+    #if UNROLL > 9
+        const npy_short in_9 = *((const npy_short *)(ip + 9 * istep));
+        *((npy_short *)(op + 9 * ostep)) = scalar_negative(in_9);
+    #endif
+    
+#line 347
+    #if UNROLL > 10
+        const npy_short in_10 = *((const npy_short *)(ip + 10 * istep));
+        *((npy_short *)(op + 10 * ostep)) = scalar_negative(in_10);
+    #endif
+    
+#line 347
+    #if UNROLL > 11
+        const npy_short in_11 = *((const npy_short *)(ip + 11 * istep));
+        *((npy_short *)(op + 11 * ostep)) = scalar_negative(in_11);
+    #endif
+    
+#line 347
+    #if UNROLL > 12
+        const npy_short in_12 = *((const npy_short *)(ip + 12 * istep));
+        *((npy_short *)(op + 12 * ostep)) = scalar_negative(in_12);
+    #endif
+    
+#line 347
+    #if UNROLL > 13
+        const npy_short in_13 = *((const npy_short *)(ip + 13 * istep));
+        *((npy_short *)(op + 13 * ostep)) = scalar_negative(in_13);
+    #endif
+    
+#line 347
+    #if UNROLL > 14
+        const npy_short in_14 = *((const npy_short *)(ip + 14 * istep));
+        *((npy_short *)(op + 14 * ostep)) = scalar_negative(in_14);
+    #endif
+    
+#line 347
+    #if UNROLL > 15
+        const npy_short in_15 = *((const npy_short *)(ip + 15 * istep));
+        *((npy_short *)(op + 15 * ostep)) = scalar_negative(in_15);
+    #endif
+    
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    for (; len > 0; --len, ip += istep, op += ostep) {  // remaining elements (all of them when the unrolled loop is compiled out)
+        *((npy_short *)op) = scalar_negative(*(const npy_short *)ip);
+    }
+#ifdef TO_SIMD_SFX
+clear:  // SIMD paths jump here; the scalar path falls through to it
+    npyv_cleanup();
+#endif
+#if 0  // generated constant: the float-status call below is compiled out for this type
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 257
+#undef TO_SIMD_SFX  // discard any suffix left from the previous expansion before choosing one from NPY_BITSOF_INT
+#if 0  // always-false head so that every width test below can be written as an #elif
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_INT == 8  // each #elif maps one bit width to a lane suffix; at most one is taken
+    #if 0  // generated constant: the floating-point suffix branch is never taken here
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0  // generated constant: the unsigned suffix is not used here
+        #define TO_SIMD_SFX(X) X##_u8
+    #else  // so the signed suffix is the one selected
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_INT == 16  // same pattern with the 16-bit suffixes
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_INT == 32  // same pattern with the 32-bit suffixes
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_INT == 64  // same pattern with the 64-bit suffixes
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_INT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_INT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif  // when no branch matched, TO_SIMD_SFX stays undefined and the #ifdef TO_SIMD_SFX sections that follow are skipped
+
+#line 283
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(INT_negative)  // inner loop applying the negative operation to npy_int elements
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    char *ip = args[0], *op = args[1];  // args[0] is the input base pointer, args[1] the output base pointer
+    npy_intp istep = steps[0], ostep = steps[1],  // distance between consecutive elements in bytes (added to the char pointers)
+             len = dimensions[0];  // number of elements to process
+#ifdef TO_SIMD_SFX  // SIMD section, present only when a lane suffix macro is defined
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)  // lane type name formed by pasting the selected suffix
+    if (!is_mem_overlap(ip, istep, op, ostep, len)) {  // SIMD kernels are tried only when is_mem_overlap() reports false
+        if (IS_UNARY_CONT(npy_int, npy_int)) {
+            // no overlap and operands are contiguous
+            TO_SIMD_SFX(simd_unary_cc_negative)(
+                (STYPE*)ip, (STYPE*)op, len
+            );
+            goto clear;  // done: skip the scalar loops below
+        }
+    #if 1  // generated constant: the strided SIMD paths below are compiled in for this type
+        const npy_intp istride = istep / sizeof(STYPE);  // byte step converted to a step counted in lanes
+        const npy_intp ostride = ostep / sizeof(STYPE);  // NOTE(review): sizeof yields size_t, so a negative step may be divided as unsigned; confirm intended
+        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+            TO_SIMD_SFX(npyv_storable_stride)(ostride))
+        {
+            if (istride == 1 && ostride != 1) {
+                // contiguous input, non-contiguous output
+                TO_SIMD_SFX(simd_unary_cn_negative)(
+                    (STYPE*)ip, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            else if (istride != 1 && ostride == 1) {
+                // non-contiguous input, contiguous output
+                TO_SIMD_SFX(simd_unary_nc_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, len
+                );
+                goto clear;
+            }
+        // X86 does better with unrolled scalar for heavy non-contiguous
+        #ifndef NPY_HAVE_SSE2  // hence this both-strided SIMD path is omitted whenever NPY_HAVE_SSE2 is defined
+            else if (istride != 1 && ostride != 1) {
+                // non-contiguous input and output
+                TO_SIMD_SFX(simd_unary_nn_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+        #endif
+        }
+    #endif // 1
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * scalar unrolls (reached whenever no SIMD path above jumped to clear)
+     * 8x unroll performed best on
+     *  - Apple M1 Native / arm64
+     *  - Apple M1 Rosetta / SSE42
+     *  - iMacPro / AVX512
+     */
+    #define UNROLL 8  // elements handled per iteration of the unrolled loop
+    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) {
+    #line 347
+    #if UNROLL > 0
+        const npy_int in_0 = *((const npy_int *)(ip + 0 * istep));  // load element k of the group at byte offset k * istep
+        *((npy_int *)(op + 0 * ostep)) = scalar_negative(in_0);  // store scalar_negative() of it at byte offset k * ostep
+    #endif
+    
+#line 347
+    #if UNROLL > 1
+        const npy_int in_1 = *((const npy_int *)(ip + 1 * istep));
+        *((npy_int *)(op + 1 * ostep)) = scalar_negative(in_1);
+    #endif
+    
+#line 347
+    #if UNROLL > 2
+        const npy_int in_2 = *((const npy_int *)(ip + 2 * istep));
+        *((npy_int *)(op + 2 * ostep)) = scalar_negative(in_2);
+    #endif
+    
+#line 347
+    #if UNROLL > 3
+        const npy_int in_3 = *((const npy_int *)(ip + 3 * istep));
+        *((npy_int *)(op + 3 * ostep)) = scalar_negative(in_3);
+    #endif
+    
+#line 347
+    #if UNROLL > 4
+        const npy_int in_4 = *((const npy_int *)(ip + 4 * istep));
+        *((npy_int *)(op + 4 * ostep)) = scalar_negative(in_4);
+    #endif
+    
+#line 347
+    #if UNROLL > 5
+        const npy_int in_5 = *((const npy_int *)(ip + 5 * istep));
+        *((npy_int *)(op + 5 * ostep)) = scalar_negative(in_5);
+    #endif
+    
+#line 347
+    #if UNROLL > 6
+        const npy_int in_6 = *((const npy_int *)(ip + 6 * istep));
+        *((npy_int *)(op + 6 * ostep)) = scalar_negative(in_6);
+    #endif
+    
+#line 347
+    #if UNROLL > 7
+        const npy_int in_7 = *((const npy_int *)(ip + 7 * istep));
+        *((npy_int *)(op + 7 * ostep)) = scalar_negative(in_7);
+    #endif
+    
+#line 347
+    #if UNROLL > 8  // this guard and the following ones are false while UNROLL is 8
+        const npy_int in_8 = *((const npy_int *)(ip + 8 * istep));
+        *((npy_int *)(op + 8 * ostep)) = scalar_negative(in_8);
+    #endif
+    
+#line 347
+    #if UNROLL > 9
+        const npy_int in_9 = *((const npy_int *)(ip + 9 * istep));
+        *((npy_int *)(op + 9 * ostep)) = scalar_negative(in_9);
+    #endif
+    
+#line 347
+    #if UNROLL > 10
+        const npy_int in_10 = *((const npy_int *)(ip + 10 * istep));
+        *((npy_int *)(op + 10 * ostep)) = scalar_negative(in_10);
+    #endif
+    
+#line 347
+    #if UNROLL > 11
+        const npy_int in_11 = *((const npy_int *)(ip + 11 * istep));
+        *((npy_int *)(op + 11 * ostep)) = scalar_negative(in_11);
+    #endif
+    
+#line 347
+    #if UNROLL > 12
+        const npy_int in_12 = *((const npy_int *)(ip + 12 * istep));
+        *((npy_int *)(op + 12 * ostep)) = scalar_negative(in_12);
+    #endif
+    
+#line 347
+    #if UNROLL > 13
+        const npy_int in_13 = *((const npy_int *)(ip + 13 * istep));
+        *((npy_int *)(op + 13 * ostep)) = scalar_negative(in_13);
+    #endif
+    
+#line 347
+    #if UNROLL > 14
+        const npy_int in_14 = *((const npy_int *)(ip + 14 * istep));
+        *((npy_int *)(op + 14 * ostep)) = scalar_negative(in_14);
+    #endif
+    
+#line 347
+    #if UNROLL > 15
+        const npy_int in_15 = *((const npy_int *)(ip + 15 * istep));
+        *((npy_int *)(op + 15 * ostep)) = scalar_negative(in_15);
+    #endif
+    
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    for (; len > 0; --len, ip += istep, op += ostep) {  // remaining elements (all of them when the unrolled loop is compiled out)
+        *((npy_int *)op) = scalar_negative(*(const npy_int *)ip);
+    }
+#ifdef TO_SIMD_SFX
+clear:  // SIMD paths jump here; the scalar path falls through to it
+    npyv_cleanup();
+#endif
+#if 0  // generated constant: the float-status call below is compiled out for this type
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 257
+#undef TO_SIMD_SFX // forget any suffix macro left over from earlier code in this file
+#if 0 // always-false head so that every lane-width case below is a uniform #elif
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONG == 8 // case: SIMD enabled and npy_long is 8 bits wide
+    #if 0 // floating-point suffix branch, compiled out for this type
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32 // no f32 SIMD: leave the macro undefined
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64 // no f64 SIMD: leave the macro undefined
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0 // unsigned suffix branch, compiled out for this type
+        #define TO_SIMD_SFX(X) X##_u8
+    #else // signed suffix: the branch that is actually taken for this type
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONG == 16 // case: 16-bit npy_long, same three-way suffix choice
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONG == 32 // case: 32-bit npy_long
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONG == 64 // case: 64-bit npy_long
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_LONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif // if no case matched, TO_SIMD_SFX stays undefined and code guarded by #ifdef TO_SIMD_SFX is skipped
+
+#line 283
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONG_negative) // inner loop: stores scalar_negative(x) for every npy_long x read from args[0] into args[1]
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // dimensions[0] = element count, steps[] = byte strides, func is unused
+{
+    char *ip = args[0], *op = args[1]; // byte cursors over the input and the output
+    npy_intp istep = steps[0], ostep = steps[1], // bytes between consecutive input / output elements
+             len = dimensions[0]; // elements still to be processed
+#ifdef TO_SIMD_SFX // vector paths exist only when a lane suffix was selected for this type
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (!is_mem_overlap(ip, istep, op, ostep, len)) { // vector paths are tried only when no overlap is reported
+        if (IS_UNARY_CONT(npy_long, npy_long)) {
+            // fast path: no overlap and both operands are contiguous
+            TO_SIMD_SFX(simd_unary_cc_negative)(
+                (STYPE*)ip, (STYPE*)op, len
+            );
+            goto clear;
+        }
+    #if 1
+        const npy_intp istride = istep / (npy_intp)sizeof(STYPE); // lane stride; the cast keeps the division signed so a negative byte step is not converted to size_t first
+        const npy_intp ostride = ostep / (npy_intp)sizeof(STYPE); // same signed conversion for the output step
+        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+            TO_SIMD_SFX(npyv_storable_stride)(ostride))
+        {
+            if (istride == 1 && ostride != 1) {
+                // contiguous input, strided output
+                TO_SIMD_SFX(simd_unary_cn_negative)(
+                    (STYPE*)ip, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            else if (istride != 1 && ostride == 1) {
+                // strided input, contiguous output
+                TO_SIMD_SFX(simd_unary_nc_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, len
+                );
+                goto clear;
+            }
+        // when NPY_HAVE_SSE2 is defined (x86) the both-strided case is left to the unrolled scalar loop below, which does better there
+        #ifndef NPY_HAVE_SSE2
+            else if (istride != 1 && ostride != 1) {
+                // strided input and strided output
+                TO_SIMD_SFX(simd_unary_nn_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+        #endif
+        }
+    #endif // 1
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * scalar unrolls (with UNROLL == 8 only the in_0..in_7 sections are compiled)
+     * 8x unroll performed best on
+     *  - Apple M1 Native / arm64
+     *  - Apple M1 Rosetta / SSE42
+     *  - iMacPro / AVX512
+     */
+    #define UNROLL 8
+    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { // UNROLL elements per iteration, addressed by byte offsets
+    #line 347
+    #if UNROLL > 0
+        const npy_long in_0 = *((const npy_long *)(ip + 0 * istep));
+        *((npy_long *)(op + 0 * ostep)) = scalar_negative(in_0);
+    #endif
+    
+#line 347
+    #if UNROLL > 1
+        const npy_long in_1 = *((const npy_long *)(ip + 1 * istep));
+        *((npy_long *)(op + 1 * ostep)) = scalar_negative(in_1);
+    #endif
+    
+#line 347
+    #if UNROLL > 2
+        const npy_long in_2 = *((const npy_long *)(ip + 2 * istep));
+        *((npy_long *)(op + 2 * ostep)) = scalar_negative(in_2);
+    #endif
+    
+#line 347
+    #if UNROLL > 3
+        const npy_long in_3 = *((const npy_long *)(ip + 3 * istep));
+        *((npy_long *)(op + 3 * ostep)) = scalar_negative(in_3);
+    #endif
+    
+#line 347
+    #if UNROLL > 4
+        const npy_long in_4 = *((const npy_long *)(ip + 4 * istep));
+        *((npy_long *)(op + 4 * ostep)) = scalar_negative(in_4);
+    #endif
+    
+#line 347
+    #if UNROLL > 5
+        const npy_long in_5 = *((const npy_long *)(ip + 5 * istep));
+        *((npy_long *)(op + 5 * ostep)) = scalar_negative(in_5);
+    #endif
+    
+#line 347
+    #if UNROLL > 6
+        const npy_long in_6 = *((const npy_long *)(ip + 6 * istep));
+        *((npy_long *)(op + 6 * ostep)) = scalar_negative(in_6);
+    #endif
+    
+#line 347
+    #if UNROLL > 7
+        const npy_long in_7 = *((const npy_long *)(ip + 7 * istep));
+        *((npy_long *)(op + 7 * ostep)) = scalar_negative(in_7);
+    #endif
+    
+#line 347
+    #if UNROLL > 8 // this and the following sections are compiled out while UNROLL is 8
+        const npy_long in_8 = *((const npy_long *)(ip + 8 * istep));
+        *((npy_long *)(op + 8 * ostep)) = scalar_negative(in_8);
+    #endif
+    
+#line 347
+    #if UNROLL > 9
+        const npy_long in_9 = *((const npy_long *)(ip + 9 * istep));
+        *((npy_long *)(op + 9 * ostep)) = scalar_negative(in_9);
+    #endif
+    
+#line 347
+    #if UNROLL > 10
+        const npy_long in_10 = *((const npy_long *)(ip + 10 * istep));
+        *((npy_long *)(op + 10 * ostep)) = scalar_negative(in_10);
+    #endif
+    
+#line 347
+    #if UNROLL > 11
+        const npy_long in_11 = *((const npy_long *)(ip + 11 * istep));
+        *((npy_long *)(op + 11 * ostep)) = scalar_negative(in_11);
+    #endif
+    
+#line 347
+    #if UNROLL > 12
+        const npy_long in_12 = *((const npy_long *)(ip + 12 * istep));
+        *((npy_long *)(op + 12 * ostep)) = scalar_negative(in_12);
+    #endif
+    
+#line 347
+    #if UNROLL > 13
+        const npy_long in_13 = *((const npy_long *)(ip + 13 * istep));
+        *((npy_long *)(op + 13 * ostep)) = scalar_negative(in_13);
+    #endif
+    
+#line 347
+    #if UNROLL > 14
+        const npy_long in_14 = *((const npy_long *)(ip + 14 * istep));
+        *((npy_long *)(op + 14 * ostep)) = scalar_negative(in_14);
+    #endif
+    
+#line 347
+    #if UNROLL > 15
+        const npy_long in_15 = *((const npy_long *)(ip + 15 * istep));
+        *((npy_long *)(op + 15 * ostep)) = scalar_negative(in_15);
+    #endif
+    
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    for (; len > 0; --len, ip += istep, op += ostep) { // remainder elements, or every element when NPY_DISABLE_OPTIMIZATION is defined
+        *((npy_long *)op) = scalar_negative(*(const npy_long *)ip);
+    }
+#ifdef TO_SIMD_SFX
+clear: // target of the goto in each vector path; the scalar loops also fall through to here
+    npyv_cleanup();
+#endif
+#if 0 // the float-status clearing call is compiled out for this integer loop
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 257
+#undef TO_SIMD_SFX // forget the suffix macro selected for the preceding loop
+#if 0 // always-false head so that every lane-width case below is a uniform #elif
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 8 // case: SIMD enabled and npy_longlong is 8 bits wide
+    #if 0 // floating-point suffix branch, compiled out for this type
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32 // no f32 SIMD: leave the macro undefined
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64 // no f64 SIMD: leave the macro undefined
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0 // unsigned suffix branch, compiled out for this type
+        #define TO_SIMD_SFX(X) X##_u8
+    #else // signed suffix: the branch that is actually taken for this type
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 16 // case: 16-bit npy_longlong, same three-way suffix choice
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 32 // case: 32-bit npy_longlong
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONGLONG == 64 // case: 64-bit npy_longlong
+    #if 0
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_LONGLONG == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGLONG == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif // if no case matched, TO_SIMD_SFX stays undefined and code guarded by #ifdef TO_SIMD_SFX is skipped
+
+#line 283
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGLONG_negative) // inner loop: stores scalar_negative(x) for every npy_longlong x read from args[0] into args[1]
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // dimensions[0] = element count, steps[] = byte strides, func is unused
+{
+    char *ip = args[0], *op = args[1]; // byte cursors over the input and the output
+    npy_intp istep = steps[0], ostep = steps[1], // bytes between consecutive input / output elements
+             len = dimensions[0]; // elements still to be processed
+#ifdef TO_SIMD_SFX // vector paths exist only when a lane suffix was selected for this type
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (!is_mem_overlap(ip, istep, op, ostep, len)) { // vector paths are tried only when no overlap is reported
+        if (IS_UNARY_CONT(npy_longlong, npy_longlong)) {
+            // fast path: no overlap and both operands are contiguous
+            TO_SIMD_SFX(simd_unary_cc_negative)(
+                (STYPE*)ip, (STYPE*)op, len
+            );
+            goto clear;
+        }
+    #if 1
+        const npy_intp istride = istep / (npy_intp)sizeof(STYPE); // lane stride; the cast keeps the division signed so a negative byte step is not converted to size_t first
+        const npy_intp ostride = ostep / (npy_intp)sizeof(STYPE); // same signed conversion for the output step
+        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+            TO_SIMD_SFX(npyv_storable_stride)(ostride))
+        {
+            if (istride == 1 && ostride != 1) {
+                // contiguous input, strided output
+                TO_SIMD_SFX(simd_unary_cn_negative)(
+                    (STYPE*)ip, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            else if (istride != 1 && ostride == 1) {
+                // strided input, contiguous output
+                TO_SIMD_SFX(simd_unary_nc_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, len
+                );
+                goto clear;
+            }
+        // when NPY_HAVE_SSE2 is defined (x86) the both-strided case is left to the unrolled scalar loop below, which does better there
+        #ifndef NPY_HAVE_SSE2
+            else if (istride != 1 && ostride != 1) {
+                // strided input and strided output
+                TO_SIMD_SFX(simd_unary_nn_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+        #endif
+        }
+    #endif // 1
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * scalar unrolls (with UNROLL == 8 only the in_0..in_7 sections are compiled)
+     * 8x unroll performed best on
+     *  - Apple M1 Native / arm64
+     *  - Apple M1 Rosetta / SSE42
+     *  - iMacPro / AVX512
+     */
+    #define UNROLL 8
+    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { // UNROLL elements per iteration, addressed by byte offsets
+    #line 347
+    #if UNROLL > 0
+        const npy_longlong in_0 = *((const npy_longlong *)(ip + 0 * istep));
+        *((npy_longlong *)(op + 0 * ostep)) = scalar_negative(in_0);
+    #endif
+    
+#line 347
+    #if UNROLL > 1
+        const npy_longlong in_1 = *((const npy_longlong *)(ip + 1 * istep));
+        *((npy_longlong *)(op + 1 * ostep)) = scalar_negative(in_1);
+    #endif
+    
+#line 347
+    #if UNROLL > 2
+        const npy_longlong in_2 = *((const npy_longlong *)(ip + 2 * istep));
+        *((npy_longlong *)(op + 2 * ostep)) = scalar_negative(in_2);
+    #endif
+    
+#line 347
+    #if UNROLL > 3
+        const npy_longlong in_3 = *((const npy_longlong *)(ip + 3 * istep));
+        *((npy_longlong *)(op + 3 * ostep)) = scalar_negative(in_3);
+    #endif
+    
+#line 347
+    #if UNROLL > 4
+        const npy_longlong in_4 = *((const npy_longlong *)(ip + 4 * istep));
+        *((npy_longlong *)(op + 4 * ostep)) = scalar_negative(in_4);
+    #endif
+    
+#line 347
+    #if UNROLL > 5
+        const npy_longlong in_5 = *((const npy_longlong *)(ip + 5 * istep));
+        *((npy_longlong *)(op + 5 * ostep)) = scalar_negative(in_5);
+    #endif
+    
+#line 347
+    #if UNROLL > 6
+        const npy_longlong in_6 = *((const npy_longlong *)(ip + 6 * istep));
+        *((npy_longlong *)(op + 6 * ostep)) = scalar_negative(in_6);
+    #endif
+    
+#line 347
+    #if UNROLL > 7
+        const npy_longlong in_7 = *((const npy_longlong *)(ip + 7 * istep));
+        *((npy_longlong *)(op + 7 * ostep)) = scalar_negative(in_7);
+    #endif
+    
+#line 347
+    #if UNROLL > 8 // this and the following sections are compiled out while UNROLL is 8
+        const npy_longlong in_8 = *((const npy_longlong *)(ip + 8 * istep));
+        *((npy_longlong *)(op + 8 * ostep)) = scalar_negative(in_8);
+    #endif
+    
+#line 347
+    #if UNROLL > 9
+        const npy_longlong in_9 = *((const npy_longlong *)(ip + 9 * istep));
+        *((npy_longlong *)(op + 9 * ostep)) = scalar_negative(in_9);
+    #endif
+    
+#line 347
+    #if UNROLL > 10
+        const npy_longlong in_10 = *((const npy_longlong *)(ip + 10 * istep));
+        *((npy_longlong *)(op + 10 * ostep)) = scalar_negative(in_10);
+    #endif
+    
+#line 347
+    #if UNROLL > 11
+        const npy_longlong in_11 = *((const npy_longlong *)(ip + 11 * istep));
+        *((npy_longlong *)(op + 11 * ostep)) = scalar_negative(in_11);
+    #endif
+    
+#line 347
+    #if UNROLL > 12
+        const npy_longlong in_12 = *((const npy_longlong *)(ip + 12 * istep));
+        *((npy_longlong *)(op + 12 * ostep)) = scalar_negative(in_12);
+    #endif
+    
+#line 347
+    #if UNROLL > 13
+        const npy_longlong in_13 = *((const npy_longlong *)(ip + 13 * istep));
+        *((npy_longlong *)(op + 13 * ostep)) = scalar_negative(in_13);
+    #endif
+    
+#line 347
+    #if UNROLL > 14
+        const npy_longlong in_14 = *((const npy_longlong *)(ip + 14 * istep));
+        *((npy_longlong *)(op + 14 * ostep)) = scalar_negative(in_14);
+    #endif
+    
+#line 347
+    #if UNROLL > 15
+        const npy_longlong in_15 = *((const npy_longlong *)(ip + 15 * istep));
+        *((npy_longlong *)(op + 15 * ostep)) = scalar_negative(in_15);
+    #endif
+    
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    for (; len > 0; --len, ip += istep, op += ostep) { // remainder elements, or every element when NPY_DISABLE_OPTIMIZATION is defined
+        *((npy_longlong *)op) = scalar_negative(*(const npy_longlong *)ip);
+    }
+#ifdef TO_SIMD_SFX
+clear: // target of the goto in each vector path; the scalar loops also fall through to here
+    npyv_cleanup();
+#endif
+#if 0 // the float-status clearing call is compiled out for this integer loop
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 257
+#undef TO_SIMD_SFX // forget the suffix macro selected for the preceding loop
+#if 0 // always-false head so that every lane-width case below is a uniform #elif
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_FLOAT == 8 // case: SIMD enabled and npy_float is 8 bits wide
+    #if 1 // floating-point suffix branch: the one that is taken for this type
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_FLOAT == 32 && !NPY_SIMD_F32 // no f32 SIMD: leave the macro undefined
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_FLOAT == 64 && !NPY_SIMD_F64 // no f64 SIMD: leave the macro undefined
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0 // unsigned suffix branch, never reached for this type
+        #define TO_SIMD_SFX(X) X##_u8
+    #else // signed suffix branch, never reached for this type
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_FLOAT == 16 // case: 16-bit npy_float, same structure
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_FLOAT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_FLOAT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_FLOAT == 32 // case: 32-bit npy_float; suffix kept only when NPY_SIMD_F32 is non-zero
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_FLOAT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_FLOAT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_FLOAT == 64 // case: 64-bit npy_float; suffix kept only when NPY_SIMD_F64 is non-zero
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_FLOAT == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_FLOAT == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif // if no case matched, TO_SIMD_SFX stays undefined and code guarded by #ifdef TO_SIMD_SFX is skipped
+
+#line 283
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_negative) // inner loop: stores scalar_negative(x) for every npy_float x read from args[0] into args[1]
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // dimensions[0] = element count, steps[] = byte strides, func is unused
+{
+    char *ip = args[0], *op = args[1]; // byte cursors over the input and the output
+    npy_intp istep = steps[0], ostep = steps[1], // bytes between consecutive input / output elements
+             len = dimensions[0]; // elements still to be processed
+#ifdef TO_SIMD_SFX // vector paths exist only when a lane suffix was selected for this type
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (!is_mem_overlap(ip, istep, op, ostep, len)) { // vector paths are tried only when no overlap is reported
+        if (IS_UNARY_CONT(npy_float, npy_float)) {
+            // fast path: no overlap and both operands are contiguous
+            TO_SIMD_SFX(simd_unary_cc_negative)(
+                (STYPE*)ip, (STYPE*)op, len
+            );
+            goto clear;
+        }
+    #if 1
+        const npy_intp istride = istep / (npy_intp)sizeof(STYPE); // lane stride; the cast keeps the division signed so a negative byte step is not converted to size_t first
+        const npy_intp ostride = ostep / (npy_intp)sizeof(STYPE); // same signed conversion for the output step
+        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+            TO_SIMD_SFX(npyv_storable_stride)(ostride))
+        {
+            if (istride == 1 && ostride != 1) {
+                // contiguous input, strided output
+                TO_SIMD_SFX(simd_unary_cn_negative)(
+                    (STYPE*)ip, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            else if (istride != 1 && ostride == 1) {
+                // strided input, contiguous output
+                TO_SIMD_SFX(simd_unary_nc_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, len
+                );
+                goto clear;
+            }
+        // when NPY_HAVE_SSE2 is defined (x86) the both-strided case is left to the unrolled scalar loop below, which does better there
+        #ifndef NPY_HAVE_SSE2
+            else if (istride != 1 && ostride != 1) {
+                // strided input and strided output
+                TO_SIMD_SFX(simd_unary_nn_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+        #endif
+        }
+    #endif // 1
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * scalar unrolls (with UNROLL == 8 only the in_0..in_7 sections are compiled)
+     * 8x unroll performed best on
+     *  - Apple M1 Native / arm64
+     *  - Apple M1 Rosetta / SSE42
+     *  - iMacPro / AVX512
+     */
+    #define UNROLL 8
+    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { // UNROLL elements per iteration, addressed by byte offsets
+    #line 347
+    #if UNROLL > 0
+        const npy_float in_0 = *((const npy_float *)(ip + 0 * istep));
+        *((npy_float *)(op + 0 * ostep)) = scalar_negative(in_0);
+    #endif
+    
+#line 347
+    #if UNROLL > 1
+        const npy_float in_1 = *((const npy_float *)(ip + 1 * istep));
+        *((npy_float *)(op + 1 * ostep)) = scalar_negative(in_1);
+    #endif
+    
+#line 347
+    #if UNROLL > 2
+        const npy_float in_2 = *((const npy_float *)(ip + 2 * istep));
+        *((npy_float *)(op + 2 * ostep)) = scalar_negative(in_2);
+    #endif
+    
+#line 347
+    #if UNROLL > 3
+        const npy_float in_3 = *((const npy_float *)(ip + 3 * istep));
+        *((npy_float *)(op + 3 * ostep)) = scalar_negative(in_3);
+    #endif
+    
+#line 347
+    #if UNROLL > 4
+        const npy_float in_4 = *((const npy_float *)(ip + 4 * istep));
+        *((npy_float *)(op + 4 * ostep)) = scalar_negative(in_4);
+    #endif
+    
+#line 347
+    #if UNROLL > 5
+        const npy_float in_5 = *((const npy_float *)(ip + 5 * istep));
+        *((npy_float *)(op + 5 * ostep)) = scalar_negative(in_5);
+    #endif
+    
+#line 347
+    #if UNROLL > 6
+        const npy_float in_6 = *((const npy_float *)(ip + 6 * istep));
+        *((npy_float *)(op + 6 * ostep)) = scalar_negative(in_6);
+    #endif
+    
+#line 347
+    #if UNROLL > 7
+        const npy_float in_7 = *((const npy_float *)(ip + 7 * istep));
+        *((npy_float *)(op + 7 * ostep)) = scalar_negative(in_7);
+    #endif
+    
+#line 347
+    #if UNROLL > 8 // this and the following sections are compiled out while UNROLL is 8
+        const npy_float in_8 = *((const npy_float *)(ip + 8 * istep));
+        *((npy_float *)(op + 8 * ostep)) = scalar_negative(in_8);
+    #endif
+    
+#line 347
+    #if UNROLL > 9
+        const npy_float in_9 = *((const npy_float *)(ip + 9 * istep));
+        *((npy_float *)(op + 9 * ostep)) = scalar_negative(in_9);
+    #endif
+    
+#line 347
+    #if UNROLL > 10
+        const npy_float in_10 = *((const npy_float *)(ip + 10 * istep));
+        *((npy_float *)(op + 10 * ostep)) = scalar_negative(in_10);
+    #endif
+    
+#line 347
+    #if UNROLL > 11
+        const npy_float in_11 = *((const npy_float *)(ip + 11 * istep));
+        *((npy_float *)(op + 11 * ostep)) = scalar_negative(in_11);
+    #endif
+    
+#line 347
+    #if UNROLL > 12
+        const npy_float in_12 = *((const npy_float *)(ip + 12 * istep));
+        *((npy_float *)(op + 12 * ostep)) = scalar_negative(in_12);
+    #endif
+    
+#line 347
+    #if UNROLL > 13
+        const npy_float in_13 = *((const npy_float *)(ip + 13 * istep));
+        *((npy_float *)(op + 13 * ostep)) = scalar_negative(in_13);
+    #endif
+    
+#line 347
+    #if UNROLL > 14
+        const npy_float in_14 = *((const npy_float *)(ip + 14 * istep));
+        *((npy_float *)(op + 14 * ostep)) = scalar_negative(in_14);
+    #endif
+    
+#line 347
+    #if UNROLL > 15
+        const npy_float in_15 = *((const npy_float *)(ip + 15 * istep));
+        *((npy_float *)(op + 15 * ostep)) = scalar_negative(in_15);
+    #endif
+    
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    for (; len > 0; --len, ip += istep, op += ostep) { // remainder elements, or every element when NPY_DISABLE_OPTIMIZATION is defined
+        *((npy_float *)op) = scalar_negative(*(const npy_float *)ip);
+    }
+#ifdef TO_SIMD_SFX
+clear: // target of the goto in each vector path; the scalar loops also fall through to here
+    npyv_cleanup();
+#endif
+#if 1 // floating-point loop: the float-status clearing helper is called on every exit path
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 257
+#undef TO_SIMD_SFX // forget the suffix macro selected for the preceding loop
+#if 0 // always-false head so that every lane-width case below is a uniform #elif
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 8 // case: SIMD enabled and npy_double is 8 bits wide
+    #if 1 // floating-point suffix branch: the one that is taken for this type
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32 // no f32 SIMD: leave the macro undefined
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64 // no f64 SIMD: leave the macro undefined
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0 // unsigned suffix branch, never reached for this type
+        #define TO_SIMD_SFX(X) X##_u8
+    #else // signed suffix branch, never reached for this type
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 16 // case: 16-bit npy_double, same structure
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 32 // case: 32-bit npy_double; suffix kept only when NPY_SIMD_F32 is non-zero
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_DOUBLE == 64 // case: 64-bit npy_double; suffix kept only when NPY_SIMD_F64 is non-zero
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_DOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_DOUBLE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif // if no case matched, TO_SIMD_SFX stays undefined and code guarded by #ifdef TO_SIMD_SFX is skipped
+
+#line 283
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_negative) // inner loop: stores scalar_negative(x) for every npy_double x read from args[0] into args[1]
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // dimensions[0] = element count, steps[] = byte strides, func is unused
+{
+    char *ip = args[0], *op = args[1]; // byte cursors over the input and the output
+    npy_intp istep = steps[0], ostep = steps[1], // bytes between consecutive input / output elements
+             len = dimensions[0]; // elements still to be processed
+#ifdef TO_SIMD_SFX // vector paths exist only when a lane suffix was selected for this type
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (!is_mem_overlap(ip, istep, op, ostep, len)) { // vector paths are tried only when no overlap is reported
+        if (IS_UNARY_CONT(npy_double, npy_double)) {
+            // fast path: no overlap and both operands are contiguous
+            TO_SIMD_SFX(simd_unary_cc_negative)(
+                (STYPE*)ip, (STYPE*)op, len
+            );
+            goto clear;
+        }
+    #if 1
+        const npy_intp istride = istep / (npy_intp)sizeof(STYPE); // lane stride; the cast keeps the division signed so a negative byte step is not converted to size_t first
+        const npy_intp ostride = ostep / (npy_intp)sizeof(STYPE); // same signed conversion for the output step
+        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+            TO_SIMD_SFX(npyv_storable_stride)(ostride))
+        {
+            if (istride == 1 && ostride != 1) {
+                // contiguous input, strided output
+                TO_SIMD_SFX(simd_unary_cn_negative)(
+                    (STYPE*)ip, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            else if (istride != 1 && ostride == 1) {
+                // strided input, contiguous output
+                TO_SIMD_SFX(simd_unary_nc_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, len
+                );
+                goto clear;
+            }
+        // when NPY_HAVE_SSE2 is defined (x86) the both-strided case is left to the unrolled scalar loop below, which does better there
+        #ifndef NPY_HAVE_SSE2
+            else if (istride != 1 && ostride != 1) {
+                // strided input and strided output
+                TO_SIMD_SFX(simd_unary_nn_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+        #endif
+        }
+    #endif // 1
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * scalar unrolls (with UNROLL == 8 only the in_0..in_7 sections are compiled)
+     * 8x unroll performed best on
+     *  - Apple M1 Native / arm64
+     *  - Apple M1 Rosetta / SSE42
+     *  - iMacPro / AVX512
+     */
+    #define UNROLL 8
+    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { // UNROLL elements per iteration, addressed by byte offsets
+    #line 347
+    #if UNROLL > 0
+        const npy_double in_0 = *((const npy_double *)(ip + 0 * istep));
+        *((npy_double *)(op + 0 * ostep)) = scalar_negative(in_0);
+    #endif
+    
+#line 347
+    #if UNROLL > 1
+        const npy_double in_1 = *((const npy_double *)(ip + 1 * istep));
+        *((npy_double *)(op + 1 * ostep)) = scalar_negative(in_1);
+    #endif
+    
+#line 347
+    #if UNROLL > 2
+        const npy_double in_2 = *((const npy_double *)(ip + 2 * istep));
+        *((npy_double *)(op + 2 * ostep)) = scalar_negative(in_2);
+    #endif
+    
+#line 347
+    #if UNROLL > 3
+        const npy_double in_3 = *((const npy_double *)(ip + 3 * istep));
+        *((npy_double *)(op + 3 * ostep)) = scalar_negative(in_3);
+    #endif
+    
+#line 347
+    #if UNROLL > 4
+        const npy_double in_4 = *((const npy_double *)(ip + 4 * istep));
+        *((npy_double *)(op + 4 * ostep)) = scalar_negative(in_4);
+    #endif
+    
+#line 347
+    #if UNROLL > 5
+        const npy_double in_5 = *((const npy_double *)(ip + 5 * istep));
+        *((npy_double *)(op + 5 * ostep)) = scalar_negative(in_5);
+    #endif
+    
+#line 347
+    #if UNROLL > 6
+        const npy_double in_6 = *((const npy_double *)(ip + 6 * istep));
+        *((npy_double *)(op + 6 * ostep)) = scalar_negative(in_6);
+    #endif
+    
+#line 347
+    #if UNROLL > 7
+        const npy_double in_7 = *((const npy_double *)(ip + 7 * istep));
+        *((npy_double *)(op + 7 * ostep)) = scalar_negative(in_7);
+    #endif
+    
+#line 347
+    #if UNROLL > 8 // this and the following sections are compiled out while UNROLL is 8
+        const npy_double in_8 = *((const npy_double *)(ip + 8 * istep));
+        *((npy_double *)(op + 8 * ostep)) = scalar_negative(in_8);
+    #endif
+    
+#line 347
+    #if UNROLL > 9
+        const npy_double in_9 = *((const npy_double *)(ip + 9 * istep));
+        *((npy_double *)(op + 9 * ostep)) = scalar_negative(in_9);
+    #endif
+    
+#line 347
+    #if UNROLL > 10
+        const npy_double in_10 = *((const npy_double *)(ip + 10 * istep));
+        *((npy_double *)(op + 10 * ostep)) = scalar_negative(in_10);
+    #endif
+    
+#line 347
+    #if UNROLL > 11
+        const npy_double in_11 = *((const npy_double *)(ip + 11 * istep));
+        *((npy_double *)(op + 11 * ostep)) = scalar_negative(in_11);
+    #endif
+    
+#line 347
+    #if UNROLL > 12
+        const npy_double in_12 = *((const npy_double *)(ip + 12 * istep));
+        *((npy_double *)(op + 12 * ostep)) = scalar_negative(in_12);
+    #endif
+    
+#line 347
+    #if UNROLL > 13
+        const npy_double in_13 = *((const npy_double *)(ip + 13 * istep));
+        *((npy_double *)(op + 13 * ostep)) = scalar_negative(in_13);
+    #endif
+    
+#line 347
+    #if UNROLL > 14
+        const npy_double in_14 = *((const npy_double *)(ip + 14 * istep));
+        *((npy_double *)(op + 14 * ostep)) = scalar_negative(in_14);
+    #endif
+    
+#line 347
+    #if UNROLL > 15
+        const npy_double in_15 = *((const npy_double *)(ip + 15 * istep));
+        *((npy_double *)(op + 15 * ostep)) = scalar_negative(in_15);
+    #endif
+    
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    for (; len > 0; --len, ip += istep, op += ostep) { // remainder elements, or every element when NPY_DISABLE_OPTIMIZATION is defined
+        *((npy_double *)op) = scalar_negative(*(const npy_double *)ip);
+    }
+#ifdef TO_SIMD_SFX
+clear: // target of the goto in each vector path; the scalar loops also fall through to here
+    npyv_cleanup();
+#endif
+#if 1 // floating-point loop: the float-status clearing helper is called on every exit path
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 257
+#undef TO_SIMD_SFX // forget the suffix macro selected for the preceding loop
+#if 0 // always-false head so that every lane-width case below is a uniform #elif
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 8 // case: SIMD enabled and npy_longdouble is 8 bits wide
+    #if 1 // floating-point suffix branch: the one that is taken for this type
+        #define TO_SIMD_SFX(X) X##_f8
+        #if NPY_BITSOF_LONGDOUBLE == 32 && !NPY_SIMD_F32 // no f32 SIMD: leave the macro undefined
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGDOUBLE == 64 && !NPY_SIMD_F64 // no f64 SIMD: leave the macro undefined
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0 // unsigned suffix branch, never reached for this type
+        #define TO_SIMD_SFX(X) X##_u8
+    #else // signed suffix branch, never reached for this type
+        #define TO_SIMD_SFX(X) X##_s8
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 16 // case: 16-bit npy_longdouble, same structure
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f16
+        #if NPY_BITSOF_LONGDOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGDOUBLE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u16
+    #else
+        #define TO_SIMD_SFX(X) X##_s16
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 32 // case: 32-bit npy_longdouble; suffix kept only when NPY_SIMD_F32 is non-zero
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f32
+        #if NPY_BITSOF_LONGDOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGDOUBLE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u32
+    #else
+        #define TO_SIMD_SFX(X) X##_s32
+    #endif
+
+#line 262
+#elif NPY_SIMD && NPY_BITSOF_LONGDOUBLE == 64 // case: 64-bit npy_longdouble; suffix kept only when NPY_SIMD_F64 is non-zero
+    #if 1
+        #define TO_SIMD_SFX(X) X##_f64
+        #if NPY_BITSOF_LONGDOUBLE == 32 && !NPY_SIMD_F32
+            #undef TO_SIMD_SFX
+        #endif
+        #if NPY_BITSOF_LONGDOUBLE == 64 && !NPY_SIMD_F64
+            #undef TO_SIMD_SFX
+        #endif
+    #elif 0
+        #define TO_SIMD_SFX(X) X##_u64
+    #else
+        #define TO_SIMD_SFX(X) X##_s64
+    #endif
+
+#endif // if no width case matched (any other NPY_BITSOF_LONGDOUBLE value), TO_SIMD_SFX stays undefined and only scalar code is built
+
+#line 283
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(LONGDOUBLE_negative) // ufunc inner loop: op[i] = scalar_negative(ip[i]) over npy_longdouble elements
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // args[0]/args[1]: input/output base; dimensions[0]: element count; steps[0]/steps[1]: byte strides
+{
+    char *ip = args[0], *op = args[1];
+    npy_intp istep = steps[0], ostep = steps[1],
+             len = dimensions[0];
+#ifdef TO_SIMD_SFX // vector paths exist only when the preceding preprocessor block picked a lane suffix
+    #undef STYPE
+    #define STYPE TO_SIMD_SFX(npyv_lanetype)
+    if (!is_mem_overlap(ip, istep, op, ostep, len)) { // SIMD kernels are tried only when is_mem_overlap reports no overlap
+        if (IS_UNARY_CONT(npy_longdouble, npy_longdouble)) {
+            // no overlap and operands are contiguous
+            TO_SIMD_SFX(simd_unary_cc_negative)(
+                (STYPE*)ip, (STYPE*)op, len
+            );
+            goto clear; // kernel handled every element; skip the scalar loops below
+        }
+    #if 1
+        const npy_intp istride = istep / sizeof(STYPE); // byte strides converted to lane-type units
+        const npy_intp ostride = ostep / sizeof(STYPE);
+        if (TO_SIMD_SFX(npyv_loadable_stride)(istride) &&
+            TO_SIMD_SFX(npyv_storable_stride)(ostride))
+        {
+            if (istride == 1 && ostride != 1) {
+                // contiguous input, non-contiguous output
+                TO_SIMD_SFX(simd_unary_cn_negative)(
+                    (STYPE*)ip, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+            else if (istride != 1 && ostride == 1) {
+                // non-contiguous input, contiguous output
+                TO_SIMD_SFX(simd_unary_nc_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, len
+                );
+                goto clear;
+            }
+        // X86 does better with unrolled scalar for heavy non-contiguous
+        #ifndef NPY_HAVE_SSE2
+            else if (istride != 1 && ostride != 1) {
+                // non-contiguous input and output
+                TO_SIMD_SFX(simd_unary_nn_negative)(
+                    (STYPE*)ip, istride, (STYPE*)op, ostride, len
+                );
+                goto clear;
+            }
+        #endif
+        }
+    #endif // 1
+    }
+#endif // TO_SIMD_SFX
+#ifndef NPY_DISABLE_OPTIMIZATION
+    /*
+     * scalar unrolls
+     * 8x unroll performed best on
+     *  - Apple M1 Native / arm64
+     *  - Apple M1 Rosetta / SSE42
+     *  - iMacPro / AVX512
+     */
+    #define UNROLL 8
+    for (; len >= UNROLL; len -= UNROLL, ip += istep*UNROLL, op += ostep*UNROLL) { // each pass handles UNROLL elements; slot k reads ip + k*istep and writes op + k*ostep
+    #line 347
+    #if UNROLL > 0
+        const npy_longdouble in_0 = *((const npy_longdouble *)(ip + 0 * istep));
+        *((npy_longdouble *)(op + 0 * ostep)) = scalar_negative(in_0);
+    #endif
+    
+#line 347
+    #if UNROLL > 1
+        const npy_longdouble in_1 = *((const npy_longdouble *)(ip + 1 * istep));
+        *((npy_longdouble *)(op + 1 * ostep)) = scalar_negative(in_1);
+    #endif
+    
+#line 347
+    #if UNROLL > 2
+        const npy_longdouble in_2 = *((const npy_longdouble *)(ip + 2 * istep));
+        *((npy_longdouble *)(op + 2 * ostep)) = scalar_negative(in_2);
+    #endif
+    
+#line 347
+    #if UNROLL > 3
+        const npy_longdouble in_3 = *((const npy_longdouble *)(ip + 3 * istep));
+        *((npy_longdouble *)(op + 3 * ostep)) = scalar_negative(in_3);
+    #endif
+    
+#line 347
+    #if UNROLL > 4
+        const npy_longdouble in_4 = *((const npy_longdouble *)(ip + 4 * istep));
+        *((npy_longdouble *)(op + 4 * ostep)) = scalar_negative(in_4);
+    #endif
+    
+#line 347
+    #if UNROLL > 5
+        const npy_longdouble in_5 = *((const npy_longdouble *)(ip + 5 * istep));
+        *((npy_longdouble *)(op + 5 * ostep)) = scalar_negative(in_5);
+    #endif
+    
+#line 347
+    #if UNROLL > 6
+        const npy_longdouble in_6 = *((const npy_longdouble *)(ip + 6 * istep));
+        *((npy_longdouble *)(op + 6 * ostep)) = scalar_negative(in_6);
+    #endif
+    
+#line 347
+    #if UNROLL > 7
+        const npy_longdouble in_7 = *((const npy_longdouble *)(ip + 7 * istep));
+        *((npy_longdouble *)(op + 7 * ostep)) = scalar_negative(in_7);
+    #endif
+    
+#line 347
+    #if UNROLL > 8 // with UNROLL == 8 this slot and all later ones (9..15) are compiled out
+        const npy_longdouble in_8 = *((const npy_longdouble *)(ip + 8 * istep));
+        *((npy_longdouble *)(op + 8 * ostep)) = scalar_negative(in_8);
+    #endif
+    
+#line 347
+    #if UNROLL > 9
+        const npy_longdouble in_9 = *((const npy_longdouble *)(ip + 9 * istep));
+        *((npy_longdouble *)(op + 9 * ostep)) = scalar_negative(in_9);
+    #endif
+    
+#line 347
+    #if UNROLL > 10
+        const npy_longdouble in_10 = *((const npy_longdouble *)(ip + 10 * istep));
+        *((npy_longdouble *)(op + 10 * ostep)) = scalar_negative(in_10);
+    #endif
+    
+#line 347
+    #if UNROLL > 11
+        const npy_longdouble in_11 = *((const npy_longdouble *)(ip + 11 * istep));
+        *((npy_longdouble *)(op + 11 * ostep)) = scalar_negative(in_11);
+    #endif
+    
+#line 347
+    #if UNROLL > 12
+        const npy_longdouble in_12 = *((const npy_longdouble *)(ip + 12 * istep));
+        *((npy_longdouble *)(op + 12 * ostep)) = scalar_negative(in_12);
+    #endif
+    
+#line 347
+    #if UNROLL > 13
+        const npy_longdouble in_13 = *((const npy_longdouble *)(ip + 13 * istep));
+        *((npy_longdouble *)(op + 13 * ostep)) = scalar_negative(in_13);
+    #endif
+    
+#line 347
+    #if UNROLL > 14
+        const npy_longdouble in_14 = *((const npy_longdouble *)(ip + 14 * istep));
+        *((npy_longdouble *)(op + 14 * ostep)) = scalar_negative(in_14);
+    #endif
+    
+#line 347
+    #if UNROLL > 15
+        const npy_longdouble in_15 = *((const npy_longdouble *)(ip + 15 * istep));
+        *((npy_longdouble *)(op + 15 * ostep)) = scalar_negative(in_15);
+    #endif
+    
+    }
+#endif // NPY_DISABLE_OPTIMIZATION
+    for (; len > 0; --len, ip += istep, op += ostep) { // one element at a time: the leftovers of the unrolled loop, or everything when it is disabled
+        *((npy_longdouble *)op) = scalar_negative(*(const npy_longdouble *)ip);
+    }
+#ifdef TO_SIMD_SFX
+clear: // target of the "goto clear" exits taken after a SIMD kernel ran; the label exists only in SIMD builds
+    npyv_cleanup();
+#endif
+#if 1
+    npy_clear_floatstatus_barrier((char*)dimensions); // NOTE(review): presumably discards FP status flags raised while negating - confirm against the helper's definition
+#endif
+}
+
+
+#undef NEGATIVE_CONTIG_ONLY
+
diff --git a/numpy/core/src/_generated/loops_unary_complex.dispatch.c b/numpy/core/src/_generated/loops_unary_complex.dispatch.c
new file mode 100644
index 000000000000..438863ce9fd8
--- /dev/null
+++ b/numpy/core/src/_generated/loops_unary_complex.dispatch.c
@@ -0,0 +1,231 @@
+#line 1 "numpy/core/src/umath/loops_unary_complex.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*@targets
+ ** $maxopt baseline
+ ** sse2 (avx2 fma3) avx512f
+ ** neon asimd
+ ** vsx2 vsx3
+ ** vx vxe
+ **/
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+#line 30
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_f32 // returns, per lane, the magnitude of the complex value whose real part is in `re` and imaginary part in `im`
+simd_cabsolute_f32(npyv_f32 re, npyv_f32 im)
+{
+    const npyv_f32 inf = npyv_setall_f32(NPY_INFINITYF);
+    const npyv_f32 nan = npyv_setall_f32(NPY_NANF);
+    // only the magnitudes of the two parts are used from here on
+    re = npyv_abs_f32(re);
+    im = npyv_abs_f32(im);
+    /*
+     * If real or imag = INF, then convert it to inf + j*inf
+     * Handles: inf + j*nan, nan + j*inf
+     */
+    npyv_b32 re_infmask = npyv_cmpeq_f32(re, inf);
+    npyv_b32 im_infmask = npyv_cmpeq_f32(im, inf);
+    im = npyv_select_f32(re_infmask, inf, im);
+    re = npyv_select_f32(im_infmask, inf, re);
+    /*
+     * If real or imag = NAN, then convert it to nan + j*nan
+     * Handles: x + j*nan, nan + j*x
+     */
+    npyv_b32 re_nnanmask = npyv_notnan_f32(re);
+    npyv_b32 im_nnanmask = npyv_notnan_f32(im);
+    im = npyv_select_f32(re_nnanmask, im, nan);
+    re = npyv_select_f32(im_nnanmask, re, nan);
+    // split the parts into the larger and smaller magnitude so the quotient below is smaller/larger
+    npyv_f32 larger  = npyv_max_f32(re, im);
+    npyv_f32 smaller = npyv_min_f32(im, re);
+    /*
+     * Calculate div_mask to prevent 0./0. and inf/inf operations in div
+     */
+    npyv_b32 zeromask = npyv_cmpeq_f32(larger, npyv_zero_f32());
+    npyv_b32 infmask = npyv_cmpeq_f32(smaller, inf);
+    npyv_b32 div_mask = npyv_not_b32(npyv_or_b32(zeromask, infmask));
+    // result = larger * sqrt(ratio*ratio + 1), assuming muladd(a, b, c) is a*b + c; npyv_ifdivz presumably yields 0 in lanes excluded by div_mask - confirm in the npyv headers
+    npyv_f32 ratio = npyv_ifdivz_f32(div_mask, smaller, larger);
+    npyv_f32 hypot = npyv_sqrt_f32(
+        npyv_muladd_f32(ratio, ratio, npyv_setall_f32(1.0f)
+    ));
+    return npyv_mul_f32(hypot, larger);
+}
+#endif // NPY_SIMD_F32
+
+#line 30
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_f64 // returns, per lane, the magnitude of the complex value whose real part is in `re` and imaginary part in `im`
+simd_cabsolute_f64(npyv_f64 re, npyv_f64 im)
+{
+    const npyv_f64 inf = npyv_setall_f64(NPY_INFINITY);
+    const npyv_f64 nan = npyv_setall_f64(NPY_NAN);
+    // only the magnitudes of the two parts are used from here on
+    re = npyv_abs_f64(re);
+    im = npyv_abs_f64(im);
+    /*
+     * If real or imag = INF, then convert it to inf + j*inf
+     * Handles: inf + j*nan, nan + j*inf
+     */
+    npyv_b64 re_infmask = npyv_cmpeq_f64(re, inf);
+    npyv_b64 im_infmask = npyv_cmpeq_f64(im, inf);
+    im = npyv_select_f64(re_infmask, inf, im);
+    re = npyv_select_f64(im_infmask, inf, re);
+    /*
+     * If real or imag = NAN, then convert it to nan + j*nan
+     * Handles: x + j*nan, nan + j*x
+     */
+    npyv_b64 re_nnanmask = npyv_notnan_f64(re);
+    npyv_b64 im_nnanmask = npyv_notnan_f64(im);
+    im = npyv_select_f64(re_nnanmask, im, nan);
+    re = npyv_select_f64(im_nnanmask, re, nan);
+    // split the parts into the larger and smaller magnitude so the quotient below is smaller/larger
+    npyv_f64 larger  = npyv_max_f64(re, im);
+    npyv_f64 smaller = npyv_min_f64(im, re);
+    /*
+     * Calculate div_mask to prevent 0./0. and inf/inf operations in div
+     */
+    npyv_b64 zeromask = npyv_cmpeq_f64(larger, npyv_zero_f64());
+    npyv_b64 infmask = npyv_cmpeq_f64(smaller, inf);
+    npyv_b64 div_mask = npyv_not_b64(npyv_or_b64(zeromask, infmask));
+    // result = larger * sqrt(ratio*ratio + 1), assuming muladd(a, b, c) is a*b + c; npyv_ifdivz presumably yields 0 in lanes excluded by div_mask - confirm in the npyv headers
+    npyv_f64 ratio = npyv_ifdivz_f64(div_mask, smaller, larger);
+    npyv_f64 hypot = npyv_sqrt_f64(
+        npyv_muladd_f64(ratio, ratio, npyv_setall_f64(1.0)
+    ));
+    return npyv_mul_f64(hypot, larger);
+}
+#endif // NPY_SIMD_F64
+
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+#line 86
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CFLOAT_absolute) // ufunc inner loop: stores one npy_float magnitude per input element made of two npy_floats (re, im)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // args[0]/args[1]: input/output base; dimensions[0]: element count; steps[0]/steps[1]: byte strides
+{
+#if NPY_SIMD_F32
+    npy_intp len = dimensions[0];
+    npy_intp ssrc = steps[0] / sizeof(npy_float); // input stride in npy_float units
+    npy_intp sdst = steps[1] / sizeof(npy_float); // output stride in npy_float units
+    // vector path needs: no overlap reported, strides accepted by the load/store checks, and byte strides that are whole multiples of sizeof(npy_float)
+    if (!is_mem_overlap(args[0], steps[0], args[1], steps[1], len) &&
+        npyv_loadable_stride_f32(ssrc) && npyv_storable_stride_f32(sdst)
+        && steps[0] % sizeof(npy_float) == 0
+        && steps[1] % sizeof(npy_float) == 0
+    ) {
+        const npy_float *src = (npy_float*)args[0];
+              npy_float *dst = (npy_float*)args[1];
+        // vstep: results per vector; wstep: floats consumed per vector of results (re + im); hstep: results covered by one npyv_loadn2 load
+        const int vstep = npyv_nlanes_f32;
+        const int wstep = vstep * 2;
+        const int hstep = vstep / 2;
+        // ssrc == 2: (re, im) pairs are back to back; sdst == 1: results are back to back
+        if (ssrc == 2 && sdst == 1) {
+            for (; len >= vstep; len -= vstep, src += wstep, dst += vstep) {
+                npyv_f32x2 ab = npyv_load_f32x2(src); // val[0]/val[1] are passed on as the real/imaginary lanes
+                npyv_f32 r = simd_cabsolute_f32(ab.val[0], ab.val[1]);
+                npyv_store_f32(dst, r);
+            }
+        }
+        else {
+            for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+                npyv_f32 re_im0 = npyv_loadn2_f32(src, ssrc); // first half of the elements, still interleaved as re, im
+                npyv_f32 re_im1 = npyv_loadn2_f32(src + ssrc*hstep, ssrc); // second half, starting hstep elements later
+                npyv_f32x2 ab = npyv_unzip_f32(re_im0, re_im1); // separate into real lanes (val[0]) and imaginary lanes (val[1])
+                npyv_f32 r = simd_cabsolute_f32(ab.val[0], ab.val[1]);
+                npyv_storen_f32(dst, sdst, r);
+            }
+        }
+        for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { // tail: fewer than vstep elements remain, so this body runs at most once
+            npyv_f32 rl = npyv_loadn_tillz_f32(src, ssrc, len);
+            npyv_f32 im = npyv_loadn_tillz_f32(src + 1, ssrc, len); // imaginary part sits one float after the real part
+            npyv_f32 r = simd_cabsolute_f32(rl, im);
+            npyv_storen_till_f32(dst, sdst, len, r); // writes only the first len results
+        }
+        npyv_cleanup();
+        npy_clear_floatstatus_barrier((char*)&len); // NOTE(review): presumably discards FP status flags raised by the vector math - confirm
+        return;
+    }
+#endif
+    UNARY_LOOP { // scalar fallback: used when f32 SIMD is unavailable or the checks above fail
+        const npy_float re = ((npy_float *)ip1)[0];
+        const npy_float im = ((npy_float *)ip1)[1];
+        *((npy_float *)op1) = npy_hypotf(re, im);
+    }
+}
+
+#line 86
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(CDOUBLE_absolute) // ufunc inner loop: stores one npy_double magnitude per input element made of two npy_doubles (re, im)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // args[0]/args[1]: input/output base; dimensions[0]: element count; steps[0]/steps[1]: byte strides
+{
+#if NPY_SIMD_F64
+    npy_intp len = dimensions[0];
+    npy_intp ssrc = steps[0] / sizeof(npy_double); // input stride in npy_double units
+    npy_intp sdst = steps[1] / sizeof(npy_double); // output stride in npy_double units
+    // vector path needs: no overlap reported, strides accepted by the load/store checks, and byte strides that are whole multiples of sizeof(npy_double)
+    if (!is_mem_overlap(args[0], steps[0], args[1], steps[1], len) &&
+        npyv_loadable_stride_f64(ssrc) && npyv_storable_stride_f64(sdst)
+        && steps[0] % sizeof(npy_double) == 0
+        && steps[1] % sizeof(npy_double) == 0
+    ) {
+        const npy_double *src = (npy_double*)args[0];
+              npy_double *dst = (npy_double*)args[1];
+        // vstep: results per vector; wstep: doubles consumed per vector of results (re + im); hstep: results covered by one npyv_loadn2 load
+        const int vstep = npyv_nlanes_f64;
+        const int wstep = vstep * 2;
+        const int hstep = vstep / 2;
+        // ssrc == 2: (re, im) pairs are back to back; sdst == 1: results are back to back
+        if (ssrc == 2 && sdst == 1) {
+            for (; len >= vstep; len -= vstep, src += wstep, dst += vstep) {
+                npyv_f64x2 ab = npyv_load_f64x2(src); // val[0]/val[1] are passed on as the real/imaginary lanes
+                npyv_f64 r = simd_cabsolute_f64(ab.val[0], ab.val[1]);
+                npyv_store_f64(dst, r);
+            }
+        }
+        else {
+            for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+                npyv_f64 re_im0 = npyv_loadn2_f64(src, ssrc); // first half of the elements, still interleaved as re, im
+                npyv_f64 re_im1 = npyv_loadn2_f64(src + ssrc*hstep, ssrc); // second half, starting hstep elements later
+                npyv_f64x2 ab = npyv_unzip_f64(re_im0, re_im1); // separate into real lanes (val[0]) and imaginary lanes (val[1])
+                npyv_f64 r = simd_cabsolute_f64(ab.val[0], ab.val[1]);
+                npyv_storen_f64(dst, sdst, r);
+            }
+        }
+        for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) { // tail: fewer than vstep elements remain, so this body runs at most once
+            npyv_f64 rl = npyv_loadn_tillz_f64(src, ssrc, len);
+            npyv_f64 im = npyv_loadn_tillz_f64(src + 1, ssrc, len); // imaginary part sits one double after the real part
+            npyv_f64 r = simd_cabsolute_f64(rl, im);
+            npyv_storen_till_f64(dst, sdst, len, r); // writes only the first len results
+        }
+        npyv_cleanup();
+        npy_clear_floatstatus_barrier((char*)&len); // NOTE(review): presumably discards FP status flags raised by the vector math - confirm
+        return;
+    }
+#endif
+    UNARY_LOOP { // scalar fallback: used when f64 SIMD is unavailable or the checks above fail
+        const npy_double re = ((npy_double *)ip1)[0];
+        const npy_double im = ((npy_double *)ip1)[1];
+        *((npy_double *)op1) = npy_hypot(re, im);
+    }
+}
+
+
diff --git a/numpy/core/src/_generated/loops_unary_fp.dispatch.c b/numpy/core/src/_generated/loops_unary_fp.dispatch.c
new file mode 100644
index 000000000000..c8425918c44b
--- /dev/null
+++ b/numpy/core/src/_generated/loops_unary_fp.dispatch.c
@@ -0,0 +1,9338 @@
+#line 1 "numpy/core/src/umath/loops_unary_fp.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*@targets
+ ** $maxopt baseline
+ ** sse2 sse41
+ ** vsx2
+ ** neon asimd
+ ** vx vxe
+ **/
+/**
+ * Force the use of SSE only on x86, even if AVX2 or AVX512F are enabled
+ * through the baseline, since scatter (AVX512F) and gather are very costly
+ * for handling non-contiguous memory access compared with SSE for
+ * the small operations that this file covers.
+*/
+#define NPY_SIMD_FORCE_128
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+/**********************************************************
+ ** Scalars
+ **********************************************************/
+#if !NPY_SIMD_F32 // plain-C float helpers, compiled only when f32 SIMD is unavailable
+NPY_FINLINE float c_recip_f32(float a) // reciprocal 1/a
+{ return 1.0f / a; }
+NPY_FINLINE float c_abs_f32(float a) // absolute value of a
+{
+    const float tmp = a > 0 ? a : -a; // any a that is not > 0 (negatives, both zeros, NaN) is negated
+    /* add 0 to clear -0.0: a +0.0 input was negated to -0.0 above, and -0.0 + 0 is +0.0 under the default rounding mode */
+    return tmp + 0;
+}
+NPY_FINLINE float c_square_f32(float a) // a squared
+{ return a * a; }
+#endif // !NPY_SIMD_F32
+
+#if !NPY_SIMD_F64 // plain-C double helpers, compiled only when f64 SIMD is unavailable
+NPY_FINLINE double c_recip_f64(double a) // reciprocal 1/a
+{ return 1.0 / a; }
+NPY_FINLINE double c_abs_f64(double a) // absolute value of a
+{
+    const double tmp = a > 0 ? a : -a; // any a that is not > 0 (negatives, both zeros, NaN) is negated
+    /* add 0 to clear -0.0: a +0.0 input was negated to -0.0 above, and -0.0 + 0 is +0.0 under the default rounding mode */
+    return tmp + 0;
+}
+NPY_FINLINE double c_square_f64(double a) // a squared
+{ return a * a; }
+#endif // !NPY_SIMD_F64
+/**
+ * MSVC(32-bit mode) requires a clarified contiguous loop
+ * in order to use SSE, otherwise it uses a soft version of square root
+ * that doesn't raise a domain error.
+ */
+#if defined(_MSC_VER) && defined(_M_IX86) && !NPY_SIMD // 32-bit x86 MSVC build without NPY_SIMD: call the SSE square-root instructions directly
+    #include <emmintrin.h>
+    NPY_FINLINE float c_sqrt_f32(float _a) // square root of _a via SSE scalar instructions
+    {
+        __m128 a = _mm_load_ss(&_a); // _a goes into the lowest lane
+        __m128 lower = _mm_sqrt_ss(a); // square root of the lowest lane only
+        return _mm_cvtss_f32(lower); // hand back the lowest lane
+    }
+    NPY_FINLINE double c_sqrt_f64(double _a) // square root of _a via SSE2 instructions
+    {
+        __m128d a = _mm_load_sd(&_a); // _a goes into the low lane; the load zeroes the high lane
+        __m128d lower = _mm_sqrt_pd(a); // packed form: takes the root of both lanes, only the low one is used
+        return _mm_cvtsd_f64(lower); // hand back the low lane
+    }
+#else // everywhere else the npy_math square roots are used
+    #define c_sqrt_f32 npy_sqrtf
+    #define c_sqrt_f64 npy_sqrt
+#endif
+
+#define c_ceil_f32 npy_ceilf // c_<op>_<sfx> names for the rounding operations are plain aliases of the npy_math functions
+#define c_ceil_f64 npy_ceil
+
+#define c_trunc_f32 npy_truncf
+#define c_trunc_f64 npy_trunc
+
+#define c_floor_f32 npy_floorf
+#define c_floor_f64 npy_floor
+
+#define c_rint_f32 npy_rintf
+#define c_rint_f64 npy_rint
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/** Notes:
+ * - avoid the use of libmath to unify fp/domain errors
+ *   for both scalars and vectors among all compilers/architectures.
+ * - use the intrinsic npyv_load_till_* instead of npyv_load_tillz_*
+ *   to fill the remaining lanes with 1.0, which avoids a divide-by-zero
+ *   fp exception in reciprocal.
+ */
+#define CONTIG  0 // layout tag: the kernels below test "#if <tag> == CONTIG" to pick the plain npyv_load/npyv_store intrinsics
+#define NCONTIG 1 // layout tag: any tag other than CONTIG picks the strided npyv_loadn/npyv_storen intrinsics
+
+#line 101
+#if NPY_SIMD_F32
+#line 107
+#line 112
+static void simd_FLOAT_rint_CONTIG_CONTIG // applies npyv_rint_f32 to len floats using plain vector loads and plain vector stores (both layout tags are CONTIG)
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len) // ssrc/sdst: strides in elements, used here only to advance the pointers; len: element count
+{
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+
+    const int vstep = npyv_nlanes_f32; // elements per vector
+    const int wstep = vstep * 4; // elements per unrolled iteration (4 vectors)
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) { // NOTE(review): loads index by vstep*k but the advance multiplies by ssrc/sdst, so unit strides are presumably expected - confirm at the call site
+        #line 126
+        #if 4 > 0
+            #if CONTIG == CONTIG // source tag is CONTIG: the plain load is compiled, the #else branch is dead
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_rint_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_rint_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_rint_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_rint_f32(v_src3);
+        #endif
+        
+        #line 138
+        #if 4 > 0 // all four results are computed above before any of them is stored
+            #if CONTIG == CONTIG // destination tag is CONTIG: the plain store is compiled
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_rint_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0 // the fill-with-1 load is switched off for rint; the tillz load below is used
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_rint_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0); // stores only the first len lanes
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_FLOAT_rint_NCONTIG_CONTIG // applies npyv_rint_f32 to len floats using strided loads (source tag NCONTIG) and plain vector stores (destination tag CONTIG)
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len) // ssrc/sdst: strides in elements; len: element count
+{
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+
+    const int vstep = npyv_nlanes_f32; // elements per vector
+    const int wstep = vstep * 4; // elements per unrolled iteration (4 vectors)
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) { // NOTE(review): stores index by vstep*k but dst advances by sdst*wstep, so sdst == 1 is presumably expected - confirm at the call site
+        #line 126
+        #if 4 > 0
+            #if NCONTIG == CONTIG // source tag is NCONTIG: this test is false, so the strided load in the #else branch is compiled
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_rint_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_rint_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_rint_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_rint_f32(v_src3);
+        #endif
+        
+        #line 138
+        #if 4 > 0 // all four results are computed above before any of them is stored
+            #if CONTIG == CONTIG // destination tag is CONTIG: the plain store is compiled
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_rint_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len);
+        #endif
+    #else
+        #if 0 // the fill-with-1 load is switched off for rint; the strided tillz load below is used
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_rint_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0); // stores only the first len lanes
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_FLOAT_rint_CONTIG_NCONTIG // applies npyv_rint_f32 to len floats using plain vector loads (source tag CONTIG) and strided stores (destination tag NCONTIG)
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len) // ssrc/sdst: strides in elements; len: element count
+{
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+
+    const int vstep = npyv_nlanes_f32; // elements per vector
+    const int wstep = vstep * 2; // elements per unrolled iteration (2 vectors in this variant)
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) { // NOTE(review): loads index by vstep*k but src advances by ssrc*wstep, so ssrc == 1 is presumably expected - confirm at the call site
+        #line 126
+        #if 2 > 0
+            #if CONTIG == CONTIG // source tag is CONTIG: the plain load is compiled, the #else branch is dead
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_rint_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if CONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_rint_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2 // false with an unroll of 2: slots 2 and 3 are compiled out
+            #if CONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_rint_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3
+            #if CONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_rint_f32(v_src3);
+        #endif
+        
+        #line 138
+        #if 2 > 0 // both results are computed above before either is stored
+            #if NCONTIG == CONTIG // destination tag is NCONTIG: this test is false, so the strided store in the #else branch is compiled
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2 // false with an unroll of 2: slots 2 and 3 are compiled out
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_rint_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0 // the fill-with-1 load is switched off for rint; the tillz load below is used
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_rint_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0); // strided store of only the first len lanes
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_FLOAT_rint_NCONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{ // Applies npyv_rint_f32 element-wise: len elements from _src, results to _dst.
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst offset these typed pointers, so they count elements, not bytes.
+    const int vstep = npyv_nlanes_f32; // elements covered by one vector step
+    const int wstep = vstep * 2; // elements covered by one unrolled pass
+    // NOTE(review): CONTIG/NCONTIG are defined elsewhere; "NCONTIG == CONTIG" is presumably false - confirm.
+    // unrolled iterations: 2 vectors per pass while at least wstep elements remain
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0 // true: slot 0 is compiled
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_rint_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1 // true: slot 1 is compiled
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_rint_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2 // false: slot 2 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_rint_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3 // false: slot 3 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_rint_f32(v_src3);
+        #endif
+        // all loads/ops of the pass come before its stores
+        #line 138
+        #if 2 > 0 // true: slot 0 is compiled
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1 // true: slot 1 is compiled
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2 // false: slot 2 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3 // false: slot 3 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector per pass for what the unrolled loop left over
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_rint_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: fewer than vstep elements remain here
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0 // never compiled; the #else line is used instead
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len); // presumably zero-fills lanes past len - confirm
+        #endif
+    #else
+        #if 0 // never compiled; the #else line is used instead
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len); // presumably zero-fills lanes past len - confirm
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_rint_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0); // presumably writes only the first len lanes - confirm
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0); // presumably writes only the first len lanes - confirm
+    #endif
+    }
+
+    npyv_cleanup(); // NOTE(review): opaque helper, effect not visible here
+}
+
+
+#line 107
+#line 112
+static void simd_FLOAT_floor_CONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{ // Applies npyv_floor_f32 element-wise: len elements from _src, results to _dst.
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst only scale the pointer advance here; callers presumably pass 1 - confirm.
+    const int vstep = npyv_nlanes_f32; // elements covered by one vector step
+    const int wstep = vstep * 4; // elements covered by one unrolled pass
+    // "CONTIG == CONTIG" is trivially true: only the first branch of each such test compiles.
+    // unrolled iterations: 4 vectors per pass while at least wstep elements remain
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0 // true: slot 0 is compiled
+            #if CONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_floor_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1 // true: slot 1 is compiled
+            #if CONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_floor_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2 // true: slot 2 is compiled
+            #if CONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_floor_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3 // true: slot 3 is compiled
+            #if CONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_floor_f32(v_src3);
+        #endif
+        // all loads/ops of the pass come before its stores
+        #line 138
+        #if 4 > 0 // true: slot 0 is compiled
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1 // true: slot 1 is compiled
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2 // true: slot 2 is compiled
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3 // true: slot 3 is compiled
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector per pass for what the unrolled loop left over
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_floor_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: fewer than vstep elements remain here
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0 // never compiled; the #else line is used instead
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len); // presumably zero-fills lanes past len - confirm
+        #endif
+    #else
+        #if 0 // never compiled; the #else line is used instead
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len); // presumably zero-fills lanes past len - confirm
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_floor_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0); // presumably writes only the first len lanes - confirm
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0); // presumably writes only the first len lanes - confirm
+    #endif
+    }
+
+    npyv_cleanup(); // NOTE(review): opaque helper, effect not visible here
+}
+
+#line 112
+static void simd_FLOAT_floor_NCONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{ // Applies npyv_floor_f32 element-wise: len elements from _src, results to _dst.
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst offset these typed pointers, so they count elements, not bytes.
+    const int vstep = npyv_nlanes_f32; // elements covered by one vector step
+    const int wstep = vstep * 4; // elements covered by one unrolled pass
+    // NOTE(review): CONTIG/NCONTIG are defined elsewhere; "NCONTIG == CONTIG" is presumably false - confirm.
+    // unrolled iterations: 4 vectors per pass while at least wstep elements remain
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0 // true: slot 0 is compiled
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_floor_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1 // true: slot 1 is compiled
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_floor_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2 // true: slot 2 is compiled
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_floor_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3 // true: slot 3 is compiled
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_floor_f32(v_src3);
+        #endif
+        // all loads/ops of the pass come before its stores
+        #line 138
+        #if 4 > 0 // true: slot 0 is compiled
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1 // true: slot 1 is compiled
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2 // true: slot 2 is compiled
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3 // true: slot 3 is compiled
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector per pass for what the unrolled loop left over
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_floor_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: fewer than vstep elements remain here
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0 // never compiled; the #else line is used instead
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len); // presumably zero-fills lanes past len - confirm
+        #endif
+    #else
+        #if 0 // never compiled; the #else line is used instead
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len); // presumably zero-fills lanes past len - confirm
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_floor_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0); // presumably writes only the first len lanes - confirm
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0); // presumably writes only the first len lanes - confirm
+    #endif
+    }
+
+    npyv_cleanup(); // NOTE(review): opaque helper, effect not visible here
+}
+
+#line 112
+static void simd_FLOAT_floor_CONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{ // Applies npyv_floor_f32 element-wise: len elements from _src, results to _dst.
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst offset these typed pointers, so they count elements, not bytes.
+    const int vstep = npyv_nlanes_f32; // elements covered by one vector step
+    const int wstep = vstep * 2; // elements covered by one unrolled pass
+    // NOTE(review): CONTIG/NCONTIG are defined elsewhere; "NCONTIG == CONTIG" is presumably false - confirm.
+    // unrolled iterations: 2 vectors per pass while at least wstep elements remain
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0 // true: slot 0 is compiled
+            #if CONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_floor_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1 // true: slot 1 is compiled
+            #if CONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_floor_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2 // false: slot 2 is compiled out
+            #if CONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_floor_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3 // false: slot 3 is compiled out
+            #if CONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_floor_f32(v_src3);
+        #endif
+        // all loads/ops of the pass come before its stores
+        #line 138
+        #if 2 > 0 // true: slot 0 is compiled
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1 // true: slot 1 is compiled
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2 // false: slot 2 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3 // false: slot 3 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector per pass for what the unrolled loop left over
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_floor_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: fewer than vstep elements remain here
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0 // never compiled; the #else line is used instead
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len); // presumably zero-fills lanes past len - confirm
+        #endif
+    #else
+        #if 0 // never compiled; the #else line is used instead
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len); // presumably zero-fills lanes past len - confirm
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_floor_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0); // presumably writes only the first len lanes - confirm
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0); // presumably writes only the first len lanes - confirm
+    #endif
+    }
+
+    npyv_cleanup(); // NOTE(review): opaque helper, effect not visible here
+}
+
+#line 112
+static void simd_FLOAT_floor_NCONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{ // Applies npyv_floor_f32 element-wise: len elements from _src, results to _dst.
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst offset these typed pointers, so they count elements, not bytes.
+    const int vstep = npyv_nlanes_f32; // elements covered by one vector step
+    const int wstep = vstep * 2; // elements covered by one unrolled pass
+    // NOTE(review): CONTIG/NCONTIG are defined elsewhere; "NCONTIG == CONTIG" is presumably false - confirm.
+    // unrolled iterations: 2 vectors per pass while at least wstep elements remain
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0 // true: slot 0 is compiled
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_floor_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1 // true: slot 1 is compiled
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_floor_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2 // false: slot 2 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_floor_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3 // false: slot 3 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_floor_f32(v_src3);
+        #endif
+        // all loads/ops of the pass come before its stores
+        #line 138
+        #if 2 > 0 // true: slot 0 is compiled
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1 // true: slot 1 is compiled
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2 // false: slot 2 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3 // false: slot 3 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector per pass for what the unrolled loop left over
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_floor_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: fewer than vstep elements remain here
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0 // never compiled; the #else line is used instead
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len); // presumably zero-fills lanes past len - confirm
+        #endif
+    #else
+        #if 0 // never compiled; the #else line is used instead
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len); // presumably zero-fills lanes past len - confirm
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_floor_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0); // presumably writes only the first len lanes - confirm
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0); // presumably writes only the first len lanes - confirm
+    #endif
+    }
+
+    npyv_cleanup(); // NOTE(review): opaque helper, effect not visible here
+}
+
+
+#line 107
+#line 112
+static void simd_FLOAT_ceil_CONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{ // Applies npyv_ceil_f32 element-wise: len elements from _src, results to _dst.
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst only scale the pointer advance here; callers presumably pass 1 - confirm.
+    const int vstep = npyv_nlanes_f32; // elements covered by one vector step
+    const int wstep = vstep * 4; // elements covered by one unrolled pass
+    // "CONTIG == CONTIG" is trivially true: only the first branch of each such test compiles.
+    // unrolled iterations: 4 vectors per pass while at least wstep elements remain
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0 // true: slot 0 is compiled
+            #if CONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_ceil_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1 // true: slot 1 is compiled
+            #if CONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_ceil_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2 // true: slot 2 is compiled
+            #if CONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_ceil_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3 // true: slot 3 is compiled
+            #if CONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_ceil_f32(v_src3);
+        #endif
+        // all loads/ops of the pass come before its stores
+        #line 138
+        #if 4 > 0 // true: slot 0 is compiled
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1 // true: slot 1 is compiled
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2 // true: slot 2 is compiled
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3 // true: slot 3 is compiled
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector per pass for what the unrolled loop left over
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_ceil_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: fewer than vstep elements remain here
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0 // never compiled; the #else line is used instead
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len); // presumably zero-fills lanes past len - confirm
+        #endif
+    #else
+        #if 0 // never compiled; the #else line is used instead
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len); // presumably zero-fills lanes past len - confirm
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_ceil_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0); // presumably writes only the first len lanes - confirm
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0); // presumably writes only the first len lanes - confirm
+    #endif
+    }
+
+    npyv_cleanup(); // NOTE(review): opaque helper, effect not visible here
+}
+
+#line 112
+static void simd_FLOAT_ceil_NCONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{ // Applies npyv_ceil_f32 element-wise: len elements from _src, results to _dst.
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst offset these typed pointers, so they count elements, not bytes.
+    const int vstep = npyv_nlanes_f32; // elements covered by one vector step
+    const int wstep = vstep * 4; // elements covered by one unrolled pass
+    // NOTE(review): CONTIG/NCONTIG are defined elsewhere; "NCONTIG == CONTIG" is presumably false - confirm.
+    // unrolled iterations: 4 vectors per pass while at least wstep elements remain
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0 // true: slot 0 is compiled
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_ceil_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1 // true: slot 1 is compiled
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_ceil_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2 // true: slot 2 is compiled
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_ceil_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3 // true: slot 3 is compiled
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_ceil_f32(v_src3);
+        #endif
+        // all loads/ops of the pass come before its stores
+        #line 138
+        #if 4 > 0 // true: slot 0 is compiled
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1 // true: slot 1 is compiled
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2 // true: slot 2 is compiled
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3 // true: slot 3 is compiled
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector per pass for what the unrolled loop left over
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_ceil_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: fewer than vstep elements remain here
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0 // never compiled; the #else line is used instead
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len); // presumably zero-fills lanes past len - confirm
+        #endif
+    #else
+        #if 0 // never compiled; the #else line is used instead
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len); // presumably zero-fills lanes past len - confirm
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_ceil_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0); // presumably writes only the first len lanes - confirm
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0); // presumably writes only the first len lanes - confirm
+    #endif
+    }
+
+    npyv_cleanup(); // NOTE(review): opaque helper, effect not visible here
+}
+
+#line 112
+static void simd_FLOAT_ceil_CONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{ // Applies npyv_ceil_f32 element-wise: len elements from _src, results to _dst.
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst offset these typed pointers, so they count elements, not bytes.
+    const int vstep = npyv_nlanes_f32; // elements covered by one vector step
+    const int wstep = vstep * 2; // elements covered by one unrolled pass
+    // NOTE(review): CONTIG/NCONTIG are defined elsewhere; "NCONTIG == CONTIG" is presumably false - confirm.
+    // unrolled iterations: 2 vectors per pass while at least wstep elements remain
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0 // true: slot 0 is compiled
+            #if CONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_ceil_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1 // true: slot 1 is compiled
+            #if CONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_ceil_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2 // false: slot 2 is compiled out
+            #if CONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_ceil_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3 // false: slot 3 is compiled out
+            #if CONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_ceil_f32(v_src3);
+        #endif
+        // all loads/ops of the pass come before its stores
+        #line 138
+        #if 2 > 0 // true: slot 0 is compiled
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1 // true: slot 1 is compiled
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2 // false: slot 2 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3 // false: slot 3 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector per pass for what the unrolled loop left over
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_ceil_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: fewer than vstep elements remain here
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0 // never compiled; the #else line is used instead
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len); // presumably zero-fills lanes past len - confirm
+        #endif
+    #else
+        #if 0 // never compiled; the #else line is used instead
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len); // presumably zero-fills lanes past len - confirm
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_ceil_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0); // presumably writes only the first len lanes - confirm
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0); // presumably writes only the first len lanes - confirm
+    #endif
+    }
+
+    npyv_cleanup(); // NOTE(review): opaque helper, effect not visible here
+}
+// simd_FLOAT_ceil_NCONTIG_NCONTIG: applies npyv_ceil_f32 to len f32 elements from _src and stores them to _dst.
+#line 112
+static void simd_FLOAT_ceil_NCONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst scale every pointer step below, in units of npyv_lanetype_f32 elements
+    const int vstep = npyv_nlanes_f32;
+    const int wstep = vstep * 2;
+    // presumably NCONTIG != CONTIG, so the strided npyv_loadn*/npyv_storen* branches compile - verify macros
+    // unrolled iterations: wstep elements per pass; only the "#if 2 > k" groups for k = 0, 1 compile
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_ceil_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_ceil_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_ceil_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_ceil_f32(v_src3);
+        #endif
+        // store phase: each v_unaryN computed above is written out only after all loads were issued
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one full vector (vstep elements) per pass for what the unrolled loop left
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_ceil_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: 0 < len < vstep inside; the constant "#if 0" picks the tillz loads
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_ceil_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+// simd_FLOAT_trunc_CONTIG_CONTIG: applies npyv_trunc_f32 to len f32 elements from _src and stores them to _dst.
+#line 107
+#line 112
+static void simd_FLOAT_trunc_CONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst scale every pointer step below, in units of npyv_lanetype_f32 elements
+    const int vstep = npyv_nlanes_f32;
+    const int wstep = vstep * 4;
+    // "CONTIG == CONTIG" always holds: plain npyv_load*/npyv_store* compile (presumably ssrc == sdst == 1)
+    // unrolled iterations: wstep elements per pass; all four "#if 4 > k" groups (k = 0..3) compile
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_trunc_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_trunc_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_trunc_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_trunc_f32(v_src3);
+        #endif
+        // store phase: each v_unaryN computed above is written out only after all loads were issued
+        #line 138
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one full vector (vstep elements) per pass for what the unrolled loop left
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_trunc_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: 0 < len < vstep inside; the constant "#if 0" picks the tillz loads
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_trunc_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+// simd_FLOAT_trunc_NCONTIG_CONTIG: applies npyv_trunc_f32 to len f32 elements from _src and stores them to _dst.
+#line 112
+static void simd_FLOAT_trunc_NCONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst scale every pointer step below, in units of npyv_lanetype_f32 elements
+    const int vstep = npyv_nlanes_f32;
+    const int wstep = vstep * 4;
+    // stores: npyv_store* ("CONTIG == CONTIG", presumably sdst == 1); loads: presumably npyv_loadn* - verify
+    // unrolled iterations: wstep elements per pass; all four "#if 4 > k" groups (k = 0..3) compile
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_trunc_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_trunc_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_trunc_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_trunc_f32(v_src3);
+        #endif
+        // store phase: each v_unaryN computed above is written out only after all loads were issued
+        #line 138
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one full vector (vstep elements) per pass for what the unrolled loop left
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_trunc_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: 0 < len < vstep inside; the constant "#if 0" picks the tillz loads
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_trunc_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+// simd_FLOAT_trunc_CONTIG_NCONTIG: applies npyv_trunc_f32 to len f32 elements from _src and stores them to _dst.
+#line 112
+static void simd_FLOAT_trunc_CONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst scale every pointer step below, in units of npyv_lanetype_f32 elements
+    const int vstep = npyv_nlanes_f32;
+    const int wstep = vstep * 2;
+    // loads: npyv_load* ("CONTIG == CONTIG", presumably ssrc == 1); stores: presumably npyv_storen* - verify
+    // unrolled iterations: wstep elements per pass; only the "#if 2 > k" groups for k = 0, 1 compile
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0
+            #if CONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_trunc_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if CONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_trunc_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2
+            #if CONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_trunc_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3
+            #if CONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_trunc_f32(v_src3);
+        #endif
+        // store phase: each v_unaryN computed above is written out only after all loads were issued
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one full vector (vstep elements) per pass for what the unrolled loop left
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_trunc_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: 0 < len < vstep inside; the constant "#if 0" picks the tillz loads
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_trunc_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+// simd_FLOAT_trunc_NCONTIG_NCONTIG: applies npyv_trunc_f32 to len f32 elements from _src and stores them to _dst.
+#line 112
+static void simd_FLOAT_trunc_NCONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst scale every pointer step below, in units of npyv_lanetype_f32 elements
+    const int vstep = npyv_nlanes_f32;
+    const int wstep = vstep * 2;
+    // presumably NCONTIG != CONTIG, so the strided npyv_loadn*/npyv_storen* branches compile - verify macros
+    // unrolled iterations: wstep elements per pass; only the "#if 2 > k" groups for k = 0, 1 compile
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_trunc_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_trunc_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_trunc_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_trunc_f32(v_src3);
+        #endif
+        // store phase: each v_unaryN computed above is written out only after all loads were issued
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one full vector (vstep elements) per pass for what the unrolled loop left
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_trunc_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: 0 < len < vstep inside; the constant "#if 0" picks the tillz loads
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_trunc_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+// simd_FLOAT_sqrt_CONTIG_CONTIG: applies npyv_sqrt_f32 to len f32 elements from _src and stores them to _dst.
+#line 107
+#line 112
+static void simd_FLOAT_sqrt_CONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst scale every pointer step below, in units of npyv_lanetype_f32 elements
+    const int vstep = npyv_nlanes_f32;
+    const int wstep = vstep * 4;
+    // "CONTIG == CONTIG" always holds: plain npyv_load*/npyv_store* compile (presumably ssrc == sdst == 1)
+    // unrolled iterations: wstep elements per pass; all four "#if 4 > k" groups (k = 0..3) compile
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_sqrt_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_sqrt_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_sqrt_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_sqrt_f32(v_src3);
+        #endif
+        // store phase: each v_unaryN computed above is written out only after all loads were issued
+        #line 138
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one full vector (vstep elements) per pass for what the unrolled loop left
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_sqrt_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: 0 < len < vstep inside; the constant "#if 0" picks the tillz loads
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_sqrt_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+// simd_FLOAT_sqrt_NCONTIG_CONTIG: applies npyv_sqrt_f32 to len f32 elements from _src and stores them to _dst.
+#line 112
+static void simd_FLOAT_sqrt_NCONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst scale every pointer step below, in units of npyv_lanetype_f32 elements
+    const int vstep = npyv_nlanes_f32;
+    const int wstep = vstep * 4;
+    // stores: npyv_store* ("CONTIG == CONTIG", presumably sdst == 1); loads: presumably npyv_loadn* - verify
+    // unrolled iterations: wstep elements per pass; all four "#if 4 > k" groups (k = 0..3) compile
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_sqrt_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_sqrt_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_sqrt_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_sqrt_f32(v_src3);
+        #endif
+        // store phase: each v_unaryN computed above is written out only after all loads were issued
+        #line 138
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one full vector (vstep elements) per pass for what the unrolled loop left
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_sqrt_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: 0 < len < vstep inside; the constant "#if 0" picks the tillz loads
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_sqrt_f32(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+// simd_FLOAT_sqrt_CONTIG_NCONTIG: applies npyv_sqrt_f32 to len f32 elements from _src and stores them to _dst.
+#line 112
+static void simd_FLOAT_sqrt_CONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst scale every pointer step below, in units of npyv_lanetype_f32 elements
+    const int vstep = npyv_nlanes_f32;
+    const int wstep = vstep * 2;
+    // loads: npyv_load* ("CONTIG == CONTIG", presumably ssrc == 1); stores: presumably npyv_storen* - verify
+    // unrolled iterations: wstep elements per pass; only the "#if 2 > k" groups for k = 0, 1 compile
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0
+            #if CONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_sqrt_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if CONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_sqrt_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2
+            #if CONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_sqrt_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3
+            #if CONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_sqrt_f32(v_src3);
+        #endif
+        // store phase: each v_unaryN computed above is written out only after all loads were issued
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one full vector (vstep elements) per pass for what the unrolled loop left
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_sqrt_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: 0 < len < vstep inside; the constant "#if 0" picks the tillz loads
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_sqrt_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+// simd_FLOAT_sqrt_NCONTIG_NCONTIG: applies npyv_sqrt_f32 to len f32 elements from _src and stores them to _dst.
+#line 112
+static void simd_FLOAT_sqrt_NCONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *src = _src;
+          npyv_lanetype_f32 *dst = _dst;
+    // ssrc/sdst scale every pointer step below, in units of npyv_lanetype_f32 elements
+    const int vstep = npyv_nlanes_f32;
+    const int wstep = vstep * 2;
+    // presumably NCONTIG != CONTIG, so the strided npyv_loadn*/npyv_storen* branches compile - verify macros
+    // unrolled iterations: wstep elements per pass; only the "#if 2 > k" groups for k = 0, 1 compile
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src0 = npyv_load_f32(src + vstep*0);
+            #else
+                npyv_f32 v_src0 = npyv_loadn_f32(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f32 v_unary0 = npyv_sqrt_f32(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src1 = npyv_load_f32(src + vstep*1);
+            #else
+                npyv_f32 v_src1 = npyv_loadn_f32(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f32 v_unary1 = npyv_sqrt_f32(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src2 = npyv_load_f32(src + vstep*2);
+            #else
+                npyv_f32 v_src2 = npyv_loadn_f32(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f32 v_unary2 = npyv_sqrt_f32(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_f32 v_src3 = npyv_load_f32(src + vstep*3);
+            #else
+                npyv_f32 v_src3 = npyv_loadn_f32(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f32 v_unary3 = npyv_sqrt_f32(v_src3);
+        #endif
+        // store phase: each v_unaryN computed above is written out only after all loads were issued
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_store_f32(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f32(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one full vector (vstep elements) per pass for what the unrolled loop left
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f32 v_src0 = npyv_load_f32(src);
+    #else
+        npyv_f32 v_src0 = npyv_loadn_f32(src, ssrc);
+    #endif
+        npyv_f32 v_unary0 = npyv_sqrt_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f32(dst, v_unary0);
+    #else
+        npyv_storen_f32(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: 0 < len < vstep inside; the constant "#if 0" picks the tillz loads
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0
+            npyv_f32 v_src0 = npyv_load_till_f32(src, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_load_tillz_f32(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f32 v_src0 = npyv_loadn_till_f32(src, ssrc, len, 1);
+        #else
+            npyv_f32 v_src0 = npyv_loadn_tillz_f32(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f32 v_unary0 = npyv_sqrt_f32(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f32(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f32(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+
+#line 107
+#line 112
+/*
+ * Applies npyv_abs_f32 to `len` f32 elements read from `_src` and writes the
+ * results to `_dst`.
+ *
+ * Both sides use the unit-stride intrinsics (npyv_load_f32 / npyv_store_f32),
+ * while the pointers still advance by `ssrc` / `sdst` elements per lane --
+ * presumably callers pass 1 for both; confirm at the call site.
+ *
+ * Work is split into three phases: four vectors per pass, one vector per
+ * pass, then a partial vector for whatever is left of `len`.
+ */
+static void simd_FLOAT_absolute_CONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *in = _src;
+    npyv_lanetype_f32 *out = _dst;
+
+    const int lanes = npyv_nlanes_f32;
+    const int block = lanes * 4;
+
+    // main loop: four full vectors per pass (all loads precede all stores)
+    while (len >= block) {
+        npyv_f32 a0 = npyv_load_f32(in);
+        npyv_f32 a1 = npyv_load_f32(in + lanes);
+        npyv_f32 a2 = npyv_load_f32(in + lanes*2);
+        npyv_f32 a3 = npyv_load_f32(in + lanes*3);
+
+        npyv_f32 r0 = npyv_abs_f32(a0);
+        npyv_f32 r1 = npyv_abs_f32(a1);
+        npyv_f32 r2 = npyv_abs_f32(a2);
+        npyv_f32 r3 = npyv_abs_f32(a3);
+
+        npyv_store_f32(out, r0);
+        npyv_store_f32(out + lanes, r1);
+        npyv_store_f32(out + lanes*2, r2);
+        npyv_store_f32(out + lanes*3, r3);
+
+        len -= block;
+        in  += ssrc*block;
+        out += sdst*block;
+    }
+
+    // then one full vector per pass
+    while (len >= lanes) {
+        npyv_f32 a0 = npyv_load_f32(in);
+        npyv_f32 r0 = npyv_abs_f32(a0);
+        npyv_store_f32(out, r0);
+
+        len -= lanes;
+        in  += ssrc*lanes;
+        out += sdst*lanes;
+    }
+
+    // remainder: partial load/store limited to the `len` elements still pending
+    if (len > 0) {
+        npyv_f32 a0 = npyv_load_tillz_f32(in, len);
+        npyv_f32 r0 = npyv_abs_f32(a0);
+        npyv_store_till_f32(out, len, r0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+/*
+ * Applies npyv_abs_f32 to `len` f32 elements read from `_src` and writes the
+ * results to `_dst`.
+ *
+ * The load flavour (unit-stride npyv_load_f32 vs. strided npyv_loadn_f32 with
+ * stride `ssrc`) is picked at preprocessing time by `NCONTIG == CONTIG`; both
+ * macros are defined outside this function. Stores always go through the
+ * unit-stride npyv_store_f32, while `out` still advances by `sdst` elements
+ * per lane -- presumably callers pass sdst == 1; confirm at the call site.
+ *
+ * Work is split into three phases: four vectors per pass, one vector per
+ * pass, then a partial vector for whatever is left of `len`.
+ */
+static void simd_FLOAT_absolute_NCONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *in = _src;
+    npyv_lanetype_f32 *out = _dst;
+
+    const int lanes = npyv_nlanes_f32;
+    const int block = lanes * 4;
+
+    // main loop: four full vectors per pass (all loads precede all stores)
+    while (len >= block) {
+    #if NCONTIG == CONTIG
+        npyv_f32 a0 = npyv_load_f32(in);
+        npyv_f32 a1 = npyv_load_f32(in + lanes);
+        npyv_f32 a2 = npyv_load_f32(in + lanes*2);
+        npyv_f32 a3 = npyv_load_f32(in + lanes*3);
+    #else
+        npyv_f32 a0 = npyv_loadn_f32(in, ssrc);
+        npyv_f32 a1 = npyv_loadn_f32(in + ssrc*lanes, ssrc);
+        npyv_f32 a2 = npyv_loadn_f32(in + ssrc*lanes*2, ssrc);
+        npyv_f32 a3 = npyv_loadn_f32(in + ssrc*lanes*3, ssrc);
+    #endif
+
+        npyv_f32 r0 = npyv_abs_f32(a0);
+        npyv_f32 r1 = npyv_abs_f32(a1);
+        npyv_f32 r2 = npyv_abs_f32(a2);
+        npyv_f32 r3 = npyv_abs_f32(a3);
+
+        npyv_store_f32(out, r0);
+        npyv_store_f32(out + lanes, r1);
+        npyv_store_f32(out + lanes*2, r2);
+        npyv_store_f32(out + lanes*3, r3);
+
+        len -= block;
+        in  += ssrc*block;
+        out += sdst*block;
+    }
+
+    // then one full vector per pass
+    while (len >= lanes) {
+    #if NCONTIG == CONTIG
+        npyv_f32 a0 = npyv_load_f32(in);
+    #else
+        npyv_f32 a0 = npyv_loadn_f32(in, ssrc);
+    #endif
+        npyv_f32 r0 = npyv_abs_f32(a0);
+        npyv_store_f32(out, r0);
+
+        len -= lanes;
+        in  += ssrc*lanes;
+        out += sdst*lanes;
+    }
+
+    // remainder: partial load/store limited to the `len` elements still pending
+    if (len > 0) {
+    #if NCONTIG == CONTIG
+        npyv_f32 a0 = npyv_load_tillz_f32(in, len);
+    #else
+        npyv_f32 a0 = npyv_loadn_tillz_f32(in, ssrc, len);
+    #endif
+        npyv_f32 r0 = npyv_abs_f32(a0);
+        npyv_store_till_f32(out, len, r0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+/*
+ * Applies npyv_abs_f32 to `len` f32 elements read from `_src` and writes the
+ * results to `_dst`.
+ *
+ * Loads always go through the unit-stride npyv_load_f32, while `in` still
+ * advances by `ssrc` elements per lane -- presumably callers pass ssrc == 1;
+ * confirm at the call site. The store flavour (unit-stride npyv_store_f32 vs.
+ * strided npyv_storen_f32 with stride `sdst`) is picked at preprocessing time
+ * by `NCONTIG == CONTIG`; both macros are defined outside this function.
+ *
+ * Work is split into three phases: two vectors per pass, one vector per
+ * pass, then a partial vector for whatever is left of `len`.
+ */
+static void simd_FLOAT_absolute_CONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *in = _src;
+    npyv_lanetype_f32 *out = _dst;
+
+    const int lanes = npyv_nlanes_f32;
+    const int block = lanes * 2;
+
+    // main loop: two full vectors per pass (all loads precede all stores)
+    while (len >= block) {
+        npyv_f32 a0 = npyv_load_f32(in);
+        npyv_f32 a1 = npyv_load_f32(in + lanes);
+
+        npyv_f32 r0 = npyv_abs_f32(a0);
+        npyv_f32 r1 = npyv_abs_f32(a1);
+
+    #if NCONTIG == CONTIG
+        npyv_store_f32(out, r0);
+        npyv_store_f32(out + lanes, r1);
+    #else
+        npyv_storen_f32(out, sdst, r0);
+        npyv_storen_f32(out + sdst*lanes, sdst, r1);
+    #endif
+
+        len -= block;
+        in  += ssrc*block;
+        out += sdst*block;
+    }
+
+    // then one full vector per pass
+    while (len >= lanes) {
+        npyv_f32 a0 = npyv_load_f32(in);
+        npyv_f32 r0 = npyv_abs_f32(a0);
+    #if NCONTIG == CONTIG
+        npyv_store_f32(out, r0);
+    #else
+        npyv_storen_f32(out, sdst, r0);
+    #endif
+
+        len -= lanes;
+        in  += ssrc*lanes;
+        out += sdst*lanes;
+    }
+
+    // remainder: partial load/store limited to the `len` elements still pending
+    if (len > 0) {
+        npyv_f32 a0 = npyv_load_tillz_f32(in, len);
+        npyv_f32 r0 = npyv_abs_f32(a0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f32(out, len, r0);
+    #else
+        npyv_storen_till_f32(out, sdst, len, r0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+/*
+ * Applies npyv_abs_f32 to `len` f32 elements read from `_src` and writes the
+ * results to `_dst`.
+ *
+ * Both the load flavour (npyv_load_f32 vs. strided npyv_loadn_f32 with stride
+ * `ssrc`) and the store flavour (npyv_store_f32 vs. strided npyv_storen_f32
+ * with stride `sdst`) are picked at preprocessing time by
+ * `NCONTIG == CONTIG`; both macros are defined outside this function.
+ *
+ * Work is split into three phases: two vectors per pass, one vector per
+ * pass, then a partial vector for whatever is left of `len`.
+ */
+static void simd_FLOAT_absolute_NCONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *in = _src;
+    npyv_lanetype_f32 *out = _dst;
+
+    const int lanes = npyv_nlanes_f32;
+    const int block = lanes * 2;
+
+    // main loop: two full vectors per pass (all loads precede all stores)
+    while (len >= block) {
+    #if NCONTIG == CONTIG
+        npyv_f32 a0 = npyv_load_f32(in);
+        npyv_f32 a1 = npyv_load_f32(in + lanes);
+    #else
+        npyv_f32 a0 = npyv_loadn_f32(in, ssrc);
+        npyv_f32 a1 = npyv_loadn_f32(in + ssrc*lanes, ssrc);
+    #endif
+
+        npyv_f32 r0 = npyv_abs_f32(a0);
+        npyv_f32 r1 = npyv_abs_f32(a1);
+
+    #if NCONTIG == CONTIG
+        npyv_store_f32(out, r0);
+        npyv_store_f32(out + lanes, r1);
+    #else
+        npyv_storen_f32(out, sdst, r0);
+        npyv_storen_f32(out + sdst*lanes, sdst, r1);
+    #endif
+
+        len -= block;
+        in  += ssrc*block;
+        out += sdst*block;
+    }
+
+    // then one full vector per pass
+    while (len >= lanes) {
+    #if NCONTIG == CONTIG
+        npyv_f32 a0 = npyv_load_f32(in);
+    #else
+        npyv_f32 a0 = npyv_loadn_f32(in, ssrc);
+    #endif
+        npyv_f32 r0 = npyv_abs_f32(a0);
+    #if NCONTIG == CONTIG
+        npyv_store_f32(out, r0);
+    #else
+        npyv_storen_f32(out, sdst, r0);
+    #endif
+
+        len -= lanes;
+        in  += ssrc*lanes;
+        out += sdst*lanes;
+    }
+
+    // remainder: partial load/store limited to the `len` elements still pending
+    if (len > 0) {
+    #if NCONTIG == CONTIG
+        npyv_f32 a0 = npyv_load_tillz_f32(in, len);
+    #else
+        npyv_f32 a0 = npyv_loadn_tillz_f32(in, ssrc, len);
+    #endif
+        npyv_f32 r0 = npyv_abs_f32(a0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f32(out, len, r0);
+    #else
+        npyv_storen_till_f32(out, sdst, len, r0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+
+#line 107
+#line 112
+/*
+ * Applies npyv_square_f32 to `len` f32 elements read from `_src` and writes
+ * the results to `_dst`.
+ *
+ * Both sides use the unit-stride intrinsics (npyv_load_f32 / npyv_store_f32),
+ * while the pointers still advance by `ssrc` / `sdst` elements per lane --
+ * presumably callers pass 1 for both; confirm at the call site.
+ *
+ * Work is split into three phases: four vectors per pass, one vector per
+ * pass, then a partial vector for whatever is left of `len`.
+ */
+static void simd_FLOAT_square_CONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *in = _src;
+    npyv_lanetype_f32 *out = _dst;
+
+    const int lanes = npyv_nlanes_f32;
+    const int block = lanes * 4;
+
+    // main loop: four full vectors per pass (all loads precede all stores)
+    while (len >= block) {
+        npyv_f32 a0 = npyv_load_f32(in);
+        npyv_f32 a1 = npyv_load_f32(in + lanes);
+        npyv_f32 a2 = npyv_load_f32(in + lanes*2);
+        npyv_f32 a3 = npyv_load_f32(in + lanes*3);
+
+        npyv_f32 r0 = npyv_square_f32(a0);
+        npyv_f32 r1 = npyv_square_f32(a1);
+        npyv_f32 r2 = npyv_square_f32(a2);
+        npyv_f32 r3 = npyv_square_f32(a3);
+
+        npyv_store_f32(out, r0);
+        npyv_store_f32(out + lanes, r1);
+        npyv_store_f32(out + lanes*2, r2);
+        npyv_store_f32(out + lanes*3, r3);
+
+        len -= block;
+        in  += ssrc*block;
+        out += sdst*block;
+    }
+
+    // then one full vector per pass
+    while (len >= lanes) {
+        npyv_f32 a0 = npyv_load_f32(in);
+        npyv_f32 r0 = npyv_square_f32(a0);
+        npyv_store_f32(out, r0);
+
+        len -= lanes;
+        in  += ssrc*lanes;
+        out += sdst*lanes;
+    }
+
+    // remainder: partial load/store limited to the `len` elements still pending
+    if (len > 0) {
+        npyv_f32 a0 = npyv_load_tillz_f32(in, len);
+        npyv_f32 r0 = npyv_square_f32(a0);
+        npyv_store_till_f32(out, len, r0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+/*
+ * Applies npyv_square_f32 to `len` f32 elements read from `_src` and writes
+ * the results to `_dst`.
+ *
+ * The load flavour (unit-stride npyv_load_f32 vs. strided npyv_loadn_f32 with
+ * stride `ssrc`) is picked at preprocessing time by `NCONTIG == CONTIG`; both
+ * macros are defined outside this function. Stores always go through the
+ * unit-stride npyv_store_f32, while `out` still advances by `sdst` elements
+ * per lane -- presumably callers pass sdst == 1; confirm at the call site.
+ *
+ * Work is split into three phases: four vectors per pass, one vector per
+ * pass, then a partial vector for whatever is left of `len`.
+ */
+static void simd_FLOAT_square_NCONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *in = _src;
+    npyv_lanetype_f32 *out = _dst;
+
+    const int lanes = npyv_nlanes_f32;
+    const int block = lanes * 4;
+
+    // main loop: four full vectors per pass (all loads precede all stores)
+    while (len >= block) {
+    #if NCONTIG == CONTIG
+        npyv_f32 a0 = npyv_load_f32(in);
+        npyv_f32 a1 = npyv_load_f32(in + lanes);
+        npyv_f32 a2 = npyv_load_f32(in + lanes*2);
+        npyv_f32 a3 = npyv_load_f32(in + lanes*3);
+    #else
+        npyv_f32 a0 = npyv_loadn_f32(in, ssrc);
+        npyv_f32 a1 = npyv_loadn_f32(in + ssrc*lanes, ssrc);
+        npyv_f32 a2 = npyv_loadn_f32(in + ssrc*lanes*2, ssrc);
+        npyv_f32 a3 = npyv_loadn_f32(in + ssrc*lanes*3, ssrc);
+    #endif
+
+        npyv_f32 r0 = npyv_square_f32(a0);
+        npyv_f32 r1 = npyv_square_f32(a1);
+        npyv_f32 r2 = npyv_square_f32(a2);
+        npyv_f32 r3 = npyv_square_f32(a3);
+
+        npyv_store_f32(out, r0);
+        npyv_store_f32(out + lanes, r1);
+        npyv_store_f32(out + lanes*2, r2);
+        npyv_store_f32(out + lanes*3, r3);
+
+        len -= block;
+        in  += ssrc*block;
+        out += sdst*block;
+    }
+
+    // then one full vector per pass
+    while (len >= lanes) {
+    #if NCONTIG == CONTIG
+        npyv_f32 a0 = npyv_load_f32(in);
+    #else
+        npyv_f32 a0 = npyv_loadn_f32(in, ssrc);
+    #endif
+        npyv_f32 r0 = npyv_square_f32(a0);
+        npyv_store_f32(out, r0);
+
+        len -= lanes;
+        in  += ssrc*lanes;
+        out += sdst*lanes;
+    }
+
+    // remainder: partial load/store limited to the `len` elements still pending
+    if (len > 0) {
+    #if NCONTIG == CONTIG
+        npyv_f32 a0 = npyv_load_tillz_f32(in, len);
+    #else
+        npyv_f32 a0 = npyv_loadn_tillz_f32(in, ssrc, len);
+    #endif
+        npyv_f32 r0 = npyv_square_f32(a0);
+        npyv_store_till_f32(out, len, r0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+/*
+ * Applies npyv_square_f32 to `len` f32 elements read from `_src` and writes
+ * the results to `_dst`.
+ *
+ * Loads always go through the unit-stride npyv_load_f32, while `in` still
+ * advances by `ssrc` elements per lane -- presumably callers pass ssrc == 1;
+ * confirm at the call site. The store flavour (unit-stride npyv_store_f32 vs.
+ * strided npyv_storen_f32 with stride `sdst`) is picked at preprocessing time
+ * by `NCONTIG == CONTIG`; both macros are defined outside this function.
+ *
+ * Work is split into three phases: two vectors per pass, one vector per
+ * pass, then a partial vector for whatever is left of `len`.
+ */
+static void simd_FLOAT_square_CONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *in = _src;
+    npyv_lanetype_f32 *out = _dst;
+
+    const int lanes = npyv_nlanes_f32;
+    const int block = lanes * 2;
+
+    // main loop: two full vectors per pass (all loads precede all stores)
+    while (len >= block) {
+        npyv_f32 a0 = npyv_load_f32(in);
+        npyv_f32 a1 = npyv_load_f32(in + lanes);
+
+        npyv_f32 r0 = npyv_square_f32(a0);
+        npyv_f32 r1 = npyv_square_f32(a1);
+
+    #if NCONTIG == CONTIG
+        npyv_store_f32(out, r0);
+        npyv_store_f32(out + lanes, r1);
+    #else
+        npyv_storen_f32(out, sdst, r0);
+        npyv_storen_f32(out + sdst*lanes, sdst, r1);
+    #endif
+
+        len -= block;
+        in  += ssrc*block;
+        out += sdst*block;
+    }
+
+    // then one full vector per pass
+    while (len >= lanes) {
+        npyv_f32 a0 = npyv_load_f32(in);
+        npyv_f32 r0 = npyv_square_f32(a0);
+    #if NCONTIG == CONTIG
+        npyv_store_f32(out, r0);
+    #else
+        npyv_storen_f32(out, sdst, r0);
+    #endif
+
+        len -= lanes;
+        in  += ssrc*lanes;
+        out += sdst*lanes;
+    }
+
+    // remainder: partial load/store limited to the `len` elements still pending
+    if (len > 0) {
+        npyv_f32 a0 = npyv_load_tillz_f32(in, len);
+        npyv_f32 r0 = npyv_square_f32(a0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f32(out, len, r0);
+    #else
+        npyv_storen_till_f32(out, sdst, len, r0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+/*
+ * Applies npyv_square_f32 to `len` f32 elements read from `_src` and writes
+ * the results to `_dst`.
+ *
+ * Both the load flavour (npyv_load_f32 vs. strided npyv_loadn_f32 with stride
+ * `ssrc`) and the store flavour (npyv_store_f32 vs. strided npyv_storen_f32
+ * with stride `sdst`) are picked at preprocessing time by
+ * `NCONTIG == CONTIG`; both macros are defined outside this function.
+ *
+ * Work is split into three phases: two vectors per pass, one vector per
+ * pass, then a partial vector for whatever is left of `len`.
+ */
+static void simd_FLOAT_square_NCONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f32 *in = _src;
+    npyv_lanetype_f32 *out = _dst;
+
+    const int lanes = npyv_nlanes_f32;
+    const int block = lanes * 2;
+
+    // main loop: two full vectors per pass (all loads precede all stores)
+    while (len >= block) {
+    #if NCONTIG == CONTIG
+        npyv_f32 a0 = npyv_load_f32(in);
+        npyv_f32 a1 = npyv_load_f32(in + lanes);
+    #else
+        npyv_f32 a0 = npyv_loadn_f32(in, ssrc);
+        npyv_f32 a1 = npyv_loadn_f32(in + ssrc*lanes, ssrc);
+    #endif
+
+        npyv_f32 r0 = npyv_square_f32(a0);
+        npyv_f32 r1 = npyv_square_f32(a1);
+
+    #if NCONTIG == CONTIG
+        npyv_store_f32(out, r0);
+        npyv_store_f32(out + lanes, r1);
+    #else
+        npyv_storen_f32(out, sdst, r0);
+        npyv_storen_f32(out + sdst*lanes, sdst, r1);
+    #endif
+
+        len -= block;
+        in  += ssrc*block;
+        out += sdst*block;
+    }
+
+    // then one full vector per pass
+    while (len >= lanes) {
+    #if NCONTIG == CONTIG
+        npyv_f32 a0 = npyv_load_f32(in);
+    #else
+        npyv_f32 a0 = npyv_loadn_f32(in, ssrc);
+    #endif
+        npyv_f32 r0 = npyv_square_f32(a0);
+    #if NCONTIG == CONTIG
+        npyv_store_f32(out, r0);
+    #else
+        npyv_storen_f32(out, sdst, r0);
+    #endif
+
+        len -= lanes;
+        in  += ssrc*lanes;
+        out += sdst*lanes;
+    }
+
+    // remainder: partial load/store limited to the `len` elements still pending
+    if (len > 0) {
+    #if NCONTIG == CONTIG
+        npyv_f32 a0 = npyv_load_tillz_f32(in, len);
+    #else
+        npyv_f32 a0 = npyv_loadn_tillz_f32(in, ssrc, len);
+    #endif
+        npyv_f32 r0 = npyv_square_f32(a0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f32(out, len, r0);
+    #else
+        npyv_storen_till_f32(out, sdst, len, r0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+
+#line 107
+#line 112
+static void simd_FLOAT_reciprocal_CONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    // Applies npyv_recip_f32 to `len` f32 elements read from `_src` and
+    // writes the results to `_dst`. Loads and stores both go through the
+    // contiguous intrinsics; `ssrc` / `sdst` are only used to advance the
+    // pointers between passes.
+    // NOTE(review): presumably callers pass ssrc == sdst == 1 - confirm.
+    const npyv_lanetype_f32 *in  = _src;
+          npyv_lanetype_f32 *out = _dst;
+    const int lanes = npyv_nlanes_f32;  // elements consumed per vector
+    const int block = lanes * 4;        // elements consumed per unrolled pass
+
+    // unrolled passes: four vectors each, every load issued before any store
+    while (len >= block) {
+        npyv_f32 x0 = npyv_load_f32(in + lanes*0);
+        npyv_f32 x1 = npyv_load_f32(in + lanes*1);
+        npyv_f32 x2 = npyv_load_f32(in + lanes*2);
+        npyv_f32 x3 = npyv_load_f32(in + lanes*3);
+
+        npyv_f32 r0 = npyv_recip_f32(x0);
+        npyv_f32 r1 = npyv_recip_f32(x1);
+        npyv_f32 r2 = npyv_recip_f32(x2);
+        npyv_f32 r3 = npyv_recip_f32(x3);
+
+        npyv_store_f32(out + lanes*0, r0);
+        npyv_store_f32(out + lanes*1, r1);
+        npyv_store_f32(out + lanes*2, r2);
+        npyv_store_f32(out + lanes*3, r3);
+
+        len -= block;
+        in  += ssrc*block;
+        out += sdst*block;
+    }
+
+    // single-vector passes
+    while (len >= lanes) {
+        npyv_f32 x = npyv_load_f32(in);
+        npyv_f32 r = npyv_recip_f32(x);
+        npyv_store_f32(out, r);
+
+        len -= lanes;
+        in  += ssrc*lanes;
+        out += sdst*lanes;
+    }
+
+    // tail: fewer than `lanes` elements are left
+    if (len > 0) {
+        // The trailing 1 is presumably the fill value for lanes past `len`,
+        // keeping the reciprocal well-defined there - confirm in the npyv API.
+        npyv_f32 x = npyv_load_till_f32(in, len, 1);
+        npyv_f32 r = npyv_recip_f32(x);
+        npyv_store_till_f32(out, len, r);
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_FLOAT_reciprocal_NCONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    // Applies npyv_recip_f32 to `len` f32 elements read from `_src` and
+    // writes the results to `_dst`. Stores always use the contiguous
+    // intrinsics; loads use the strided (`loadn`) intrinsics with stride
+    // `ssrc` unless NCONTIG == CONTIG. Both macros are defined outside this
+    // function.
+    const npyv_lanetype_f32 *in  = _src;
+          npyv_lanetype_f32 *out = _dst;
+    const int lanes = npyv_nlanes_f32;  // elements consumed per vector
+    const int block = lanes * 4;        // elements consumed per unrolled pass
+
+    // unrolled passes: four vectors each, every load issued before any store
+    while (len >= block) {
+    #if NCONTIG != CONTIG
+        npyv_f32 x0 = npyv_loadn_f32(in + ssrc*lanes*0, ssrc);
+        npyv_f32 x1 = npyv_loadn_f32(in + ssrc*lanes*1, ssrc);
+        npyv_f32 x2 = npyv_loadn_f32(in + ssrc*lanes*2, ssrc);
+        npyv_f32 x3 = npyv_loadn_f32(in + ssrc*lanes*3, ssrc);
+    #else
+        npyv_f32 x0 = npyv_load_f32(in + lanes*0);
+        npyv_f32 x1 = npyv_load_f32(in + lanes*1);
+        npyv_f32 x2 = npyv_load_f32(in + lanes*2);
+        npyv_f32 x3 = npyv_load_f32(in + lanes*3);
+    #endif
+
+        npyv_f32 r0 = npyv_recip_f32(x0);
+        npyv_f32 r1 = npyv_recip_f32(x1);
+        npyv_f32 r2 = npyv_recip_f32(x2);
+        npyv_f32 r3 = npyv_recip_f32(x3);
+
+        npyv_store_f32(out + lanes*0, r0);
+        npyv_store_f32(out + lanes*1, r1);
+        npyv_store_f32(out + lanes*2, r2);
+        npyv_store_f32(out + lanes*3, r3);
+
+        len -= block;
+        in  += ssrc*block;
+        out += sdst*block;
+    }
+
+    // single-vector passes
+    while (len >= lanes) {
+    #if NCONTIG != CONTIG
+        npyv_f32 x = npyv_loadn_f32(in, ssrc);
+    #else
+        npyv_f32 x = npyv_load_f32(in);
+    #endif
+        npyv_f32 r = npyv_recip_f32(x);
+        npyv_store_f32(out, r);
+
+        len -= lanes;
+        in  += ssrc*lanes;
+        out += sdst*lanes;
+    }
+
+    // tail: fewer than `lanes` elements are left
+    if (len > 0) {
+        // The trailing 1 is presumably the fill value for lanes past `len`,
+        // keeping the reciprocal well-defined there - confirm in the npyv API.
+    #if NCONTIG != CONTIG
+        npyv_f32 x = npyv_loadn_till_f32(in, ssrc, len, 1);
+    #else
+        npyv_f32 x = npyv_load_till_f32(in, len, 1);
+    #endif
+        npyv_f32 r = npyv_recip_f32(x);
+        npyv_store_till_f32(out, len, r);
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_FLOAT_reciprocal_CONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    // Applies npyv_recip_f32 to `len` f32 elements read from `_src` and
+    // writes the results to `_dst`. Loads always use the contiguous
+    // intrinsics; stores use the strided (`storen`) intrinsics with stride
+    // `sdst` unless NCONTIG == CONTIG. Both macros are defined outside this
+    // function.
+    const npyv_lanetype_f32 *in  = _src;
+          npyv_lanetype_f32 *out = _dst;
+    const int lanes = npyv_nlanes_f32;  // elements consumed per vector
+    const int block = lanes * 2;        // elements consumed per unrolled pass
+
+    // unrolled passes: two vectors each, every load issued before any store
+    while (len >= block) {
+        npyv_f32 x0 = npyv_load_f32(in + lanes*0);
+        npyv_f32 x1 = npyv_load_f32(in + lanes*1);
+
+        npyv_f32 r0 = npyv_recip_f32(x0);
+        npyv_f32 r1 = npyv_recip_f32(x1);
+
+    #if NCONTIG != CONTIG
+        npyv_storen_f32(out + sdst*lanes*0, sdst, r0);
+        npyv_storen_f32(out + sdst*lanes*1, sdst, r1);
+    #else
+        npyv_store_f32(out + lanes*0, r0);
+        npyv_store_f32(out + lanes*1, r1);
+    #endif
+
+        len -= block;
+        in  += ssrc*block;
+        out += sdst*block;
+    }
+
+    // single-vector passes
+    while (len >= lanes) {
+        npyv_f32 x = npyv_load_f32(in);
+        npyv_f32 r = npyv_recip_f32(x);
+    #if NCONTIG != CONTIG
+        npyv_storen_f32(out, sdst, r);
+    #else
+        npyv_store_f32(out, r);
+    #endif
+
+        len -= lanes;
+        in  += ssrc*lanes;
+        out += sdst*lanes;
+    }
+
+    // tail: fewer than `lanes` elements are left
+    if (len > 0) {
+        // The trailing 1 is presumably the fill value for lanes past `len`,
+        // keeping the reciprocal well-defined there - confirm in the npyv API.
+        npyv_f32 x = npyv_load_till_f32(in, len, 1);
+        npyv_f32 r = npyv_recip_f32(x);
+    #if NCONTIG != CONTIG
+        npyv_storen_till_f32(out, sdst, len, r);
+    #else
+        npyv_store_till_f32(out, len, r);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_FLOAT_reciprocal_NCONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    // Applies npyv_recip_f32 to `len` f32 elements read from `_src` and
+    // writes the results to `_dst`. Unless NCONTIG == CONTIG, loads use the
+    // strided (`loadn`) intrinsics with stride `ssrc` and stores use the
+    // strided (`storen`) intrinsics with stride `sdst`. Both macros are
+    // defined outside this function.
+    const npyv_lanetype_f32 *in  = _src;
+          npyv_lanetype_f32 *out = _dst;
+    const int lanes = npyv_nlanes_f32;  // elements consumed per vector
+    const int block = lanes * 2;        // elements consumed per unrolled pass
+
+    // unrolled passes: two vectors each, every load issued before any store
+    while (len >= block) {
+    #if NCONTIG != CONTIG
+        npyv_f32 x0 = npyv_loadn_f32(in + ssrc*lanes*0, ssrc);
+        npyv_f32 x1 = npyv_loadn_f32(in + ssrc*lanes*1, ssrc);
+    #else
+        npyv_f32 x0 = npyv_load_f32(in + lanes*0);
+        npyv_f32 x1 = npyv_load_f32(in + lanes*1);
+    #endif
+
+        npyv_f32 r0 = npyv_recip_f32(x0);
+        npyv_f32 r1 = npyv_recip_f32(x1);
+
+    #if NCONTIG != CONTIG
+        npyv_storen_f32(out + sdst*lanes*0, sdst, r0);
+        npyv_storen_f32(out + sdst*lanes*1, sdst, r1);
+    #else
+        npyv_store_f32(out + lanes*0, r0);
+        npyv_store_f32(out + lanes*1, r1);
+    #endif
+
+        len -= block;
+        in  += ssrc*block;
+        out += sdst*block;
+    }
+
+    // single-vector passes
+    while (len >= lanes) {
+    #if NCONTIG != CONTIG
+        npyv_f32 x = npyv_loadn_f32(in, ssrc);
+    #else
+        npyv_f32 x = npyv_load_f32(in);
+    #endif
+        npyv_f32 r = npyv_recip_f32(x);
+    #if NCONTIG != CONTIG
+        npyv_storen_f32(out, sdst, r);
+    #else
+        npyv_store_f32(out, r);
+    #endif
+
+        len -= lanes;
+        in  += ssrc*lanes;
+        out += sdst*lanes;
+    }
+
+    // tail: fewer than `lanes` elements are left
+    if (len > 0) {
+        // The trailing 1 is presumably the fill value for lanes past `len`,
+        // keeping the reciprocal well-defined there - confirm in the npyv API.
+    #if NCONTIG != CONTIG
+        npyv_f32 x = npyv_loadn_till_f32(in, ssrc, len, 1);
+    #else
+        npyv_f32 x = npyv_load_till_f32(in, len, 1);
+    #endif
+        npyv_f32 r = npyv_recip_f32(x);
+    #if NCONTIG != CONTIG
+        npyv_storen_till_f32(out, sdst, len, r);
+    #else
+        npyv_store_till_f32(out, len, r);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+
+#endif // NPY_SIMD_F32
+
+#line 101
+#if NPY_SIMD_F64
+#line 107
+#line 112
+static void simd_DOUBLE_rint_CONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    // Applies npyv_rint_f64 to `len` f64 elements read from `_src` and
+    // writes the results to `_dst`. Loads and stores both go through the
+    // contiguous intrinsics; `ssrc` / `sdst` are only used to advance the
+    // pointers between passes.
+    // NOTE(review): presumably callers pass ssrc == sdst == 1 - confirm.
+    const npyv_lanetype_f64 *in  = _src;
+          npyv_lanetype_f64 *out = _dst;
+    const int lanes = npyv_nlanes_f64;  // elements consumed per vector
+    const int block = lanes * 4;        // elements consumed per unrolled pass
+
+    // unrolled passes: four vectors each, every load issued before any store
+    while (len >= block) {
+        npyv_f64 x0 = npyv_load_f64(in + lanes*0);
+        npyv_f64 x1 = npyv_load_f64(in + lanes*1);
+        npyv_f64 x2 = npyv_load_f64(in + lanes*2);
+        npyv_f64 x3 = npyv_load_f64(in + lanes*3);
+
+        npyv_f64 r0 = npyv_rint_f64(x0);
+        npyv_f64 r1 = npyv_rint_f64(x1);
+        npyv_f64 r2 = npyv_rint_f64(x2);
+        npyv_f64 r3 = npyv_rint_f64(x3);
+
+        npyv_store_f64(out + lanes*0, r0);
+        npyv_store_f64(out + lanes*1, r1);
+        npyv_store_f64(out + lanes*2, r2);
+        npyv_store_f64(out + lanes*3, r3);
+
+        len -= block;
+        in  += ssrc*block;
+        out += sdst*block;
+    }
+
+    // single-vector passes
+    while (len >= lanes) {
+        npyv_f64 x = npyv_load_f64(in);
+        npyv_f64 r = npyv_rint_f64(x);
+        npyv_store_f64(out, r);
+
+        len -= lanes;
+        in  += ssrc*lanes;
+        out += sdst*lanes;
+    }
+
+    // tail: fewer than `lanes` elements are left
+    if (len > 0) {
+        // `tillz` loader: presumably zero-fills the lanes past `len` -
+        // confirm in the npyv API.
+        npyv_f64 x = npyv_load_tillz_f64(in, len);
+        npyv_f64 r = npyv_rint_f64(x);
+        npyv_store_till_f64(out, len, r);
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_rint_NCONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    // Applies npyv_rint_f64 to `len` f64 elements read from `_src` and
+    // writes the results to `_dst`. Stores always use the contiguous
+    // intrinsics; loads use the strided (`loadn`) intrinsics with stride
+    // `ssrc` unless NCONTIG == CONTIG. Both macros are defined outside this
+    // function.
+    const npyv_lanetype_f64 *in  = _src;
+          npyv_lanetype_f64 *out = _dst;
+    const int lanes = npyv_nlanes_f64;  // elements consumed per vector
+    const int block = lanes * 4;        // elements consumed per unrolled pass
+
+    // unrolled passes: four vectors each, every load issued before any store
+    while (len >= block) {
+    #if NCONTIG != CONTIG
+        npyv_f64 x0 = npyv_loadn_f64(in + ssrc*lanes*0, ssrc);
+        npyv_f64 x1 = npyv_loadn_f64(in + ssrc*lanes*1, ssrc);
+        npyv_f64 x2 = npyv_loadn_f64(in + ssrc*lanes*2, ssrc);
+        npyv_f64 x3 = npyv_loadn_f64(in + ssrc*lanes*3, ssrc);
+    #else
+        npyv_f64 x0 = npyv_load_f64(in + lanes*0);
+        npyv_f64 x1 = npyv_load_f64(in + lanes*1);
+        npyv_f64 x2 = npyv_load_f64(in + lanes*2);
+        npyv_f64 x3 = npyv_load_f64(in + lanes*3);
+    #endif
+
+        npyv_f64 r0 = npyv_rint_f64(x0);
+        npyv_f64 r1 = npyv_rint_f64(x1);
+        npyv_f64 r2 = npyv_rint_f64(x2);
+        npyv_f64 r3 = npyv_rint_f64(x3);
+
+        npyv_store_f64(out + lanes*0, r0);
+        npyv_store_f64(out + lanes*1, r1);
+        npyv_store_f64(out + lanes*2, r2);
+        npyv_store_f64(out + lanes*3, r3);
+
+        len -= block;
+        in  += ssrc*block;
+        out += sdst*block;
+    }
+
+    // single-vector passes
+    while (len >= lanes) {
+    #if NCONTIG != CONTIG
+        npyv_f64 x = npyv_loadn_f64(in, ssrc);
+    #else
+        npyv_f64 x = npyv_load_f64(in);
+    #endif
+        npyv_f64 r = npyv_rint_f64(x);
+        npyv_store_f64(out, r);
+
+        len -= lanes;
+        in  += ssrc*lanes;
+        out += sdst*lanes;
+    }
+
+    // tail: fewer than `lanes` elements are left
+    if (len > 0) {
+        // `tillz` loaders: presumably zero-fill the lanes past `len` -
+        // confirm in the npyv API.
+    #if NCONTIG != CONTIG
+        npyv_f64 x = npyv_loadn_tillz_f64(in, ssrc, len);
+    #else
+        npyv_f64 x = npyv_load_tillz_f64(in, len);
+    #endif
+        npyv_f64 r = npyv_rint_f64(x);
+        npyv_store_till_f64(out, len, r);
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_rint_CONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    // Applies npyv_rint_f64 to `len` f64 elements read from `_src` and
+    // writes the results to `_dst`. Loads always use the contiguous
+    // intrinsics; stores use the strided (`storen`) intrinsics with stride
+    // `sdst` unless NCONTIG == CONTIG. Both macros are defined outside this
+    // function.
+    const npyv_lanetype_f64 *in  = _src;
+          npyv_lanetype_f64 *out = _dst;
+    const int lanes = npyv_nlanes_f64;  // elements consumed per vector
+    const int block = lanes * 2;        // elements consumed per unrolled pass
+
+    // unrolled passes: two vectors each, every load issued before any store
+    while (len >= block) {
+        npyv_f64 x0 = npyv_load_f64(in + lanes*0);
+        npyv_f64 x1 = npyv_load_f64(in + lanes*1);
+
+        npyv_f64 r0 = npyv_rint_f64(x0);
+        npyv_f64 r1 = npyv_rint_f64(x1);
+
+    #if NCONTIG != CONTIG
+        npyv_storen_f64(out + sdst*lanes*0, sdst, r0);
+        npyv_storen_f64(out + sdst*lanes*1, sdst, r1);
+    #else
+        npyv_store_f64(out + lanes*0, r0);
+        npyv_store_f64(out + lanes*1, r1);
+    #endif
+
+        len -= block;
+        in  += ssrc*block;
+        out += sdst*block;
+    }
+
+    // single-vector passes
+    while (len >= lanes) {
+        npyv_f64 x = npyv_load_f64(in);
+        npyv_f64 r = npyv_rint_f64(x);
+    #if NCONTIG != CONTIG
+        npyv_storen_f64(out, sdst, r);
+    #else
+        npyv_store_f64(out, r);
+    #endif
+
+        len -= lanes;
+        in  += ssrc*lanes;
+        out += sdst*lanes;
+    }
+
+    // tail: fewer than `lanes` elements are left
+    if (len > 0) {
+        // `tillz` loader: presumably zero-fills the lanes past `len` -
+        // confirm in the npyv API.
+        npyv_f64 x = npyv_load_tillz_f64(in, len);
+        npyv_f64 r = npyv_rint_f64(x);
+    #if NCONTIG != CONTIG
+        npyv_storen_till_f64(out, sdst, len, r);
+    #else
+        npyv_store_till_f64(out, len, r);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_rint_NCONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    // Applies npyv_rint_f64 to `len` f64 elements read from `_src` and
+    // writes the results to `_dst`. Unless NCONTIG == CONTIG, loads use the
+    // strided (`loadn`) intrinsics with stride `ssrc` and stores use the
+    // strided (`storen`) intrinsics with stride `sdst`. Both macros are
+    // defined outside this function.
+    const npyv_lanetype_f64 *in  = _src;
+          npyv_lanetype_f64 *out = _dst;
+    const int lanes = npyv_nlanes_f64;  // elements consumed per vector
+    const int block = lanes * 2;        // elements consumed per unrolled pass
+
+    // unrolled passes: two vectors each, every load issued before any store
+    while (len >= block) {
+    #if NCONTIG != CONTIG
+        npyv_f64 x0 = npyv_loadn_f64(in + ssrc*lanes*0, ssrc);
+        npyv_f64 x1 = npyv_loadn_f64(in + ssrc*lanes*1, ssrc);
+    #else
+        npyv_f64 x0 = npyv_load_f64(in + lanes*0);
+        npyv_f64 x1 = npyv_load_f64(in + lanes*1);
+    #endif
+
+        npyv_f64 r0 = npyv_rint_f64(x0);
+        npyv_f64 r1 = npyv_rint_f64(x1);
+
+    #if NCONTIG != CONTIG
+        npyv_storen_f64(out + sdst*lanes*0, sdst, r0);
+        npyv_storen_f64(out + sdst*lanes*1, sdst, r1);
+    #else
+        npyv_store_f64(out + lanes*0, r0);
+        npyv_store_f64(out + lanes*1, r1);
+    #endif
+
+        len -= block;
+        in  += ssrc*block;
+        out += sdst*block;
+    }
+
+    // single-vector passes
+    while (len >= lanes) {
+    #if NCONTIG != CONTIG
+        npyv_f64 x = npyv_loadn_f64(in, ssrc);
+    #else
+        npyv_f64 x = npyv_load_f64(in);
+    #endif
+        npyv_f64 r = npyv_rint_f64(x);
+    #if NCONTIG != CONTIG
+        npyv_storen_f64(out, sdst, r);
+    #else
+        npyv_store_f64(out, r);
+    #endif
+
+        len -= lanes;
+        in  += ssrc*lanes;
+        out += sdst*lanes;
+    }
+
+    // tail: fewer than `lanes` elements are left
+    if (len > 0) {
+        // `tillz` loaders: presumably zero-fill the lanes past `len` -
+        // confirm in the npyv API.
+    #if NCONTIG != CONTIG
+        npyv_f64 x = npyv_loadn_tillz_f64(in, ssrc, len);
+    #else
+        npyv_f64 x = npyv_load_tillz_f64(in, len);
+    #endif
+        npyv_f64 r = npyv_rint_f64(x);
+    #if NCONTIG != CONTIG
+        npyv_storen_till_f64(out, sdst, len, r);
+    #else
+        npyv_store_till_f64(out, len, r);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+
+#line 107
+#line 112
+static void simd_DOUBLE_floor_CONTIG_CONTIG  // npyv_floor_f64 over len f64 elements: npyv_load_* reads, npyv_store_* writes (no stride argument on either side)
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)  // ssrc/sdst: steps in f64 elements (they scale typed-pointer arithmetic below); len: number of elements
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+    // vstep: element distance between consecutive vectors (npyv_nlanes_f64); wstep: elements handled per unrolled pass (4 vectors)
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 4;
+
+    // unrolled iterations: 4 vectors per pass; every load+floor is issued before the first store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);  // address does not use ssrc although the loop header steps src by ssrc*wstep; NOTE(review): presumably callers pass ssrc == 1 here - confirm
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);  // never compiled here: the guard compares CONTIG with itself
+            #endif
+            npyv_f64 v_unary0 = npyv_floor_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_floor_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_floor_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_floor_f64(v_src3);
+        #endif
+        // store phase: one store per vector computed above, in the same order
+        #line 138
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*0, v_unary0);  // address does not use sdst although the loop header steps dst by sdst*wstep; NOTE(review): presumably callers pass sdst == 1 here - confirm
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);  // never compiled here: the guard compares CONTIG with itself
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector at a time for whatever the unrolled loop left (len < wstep)
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_floor_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: at this point fewer than vstep elements remain
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);  // compiled out by "#if 0"
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);  // active branch; presumably reads len lanes and zero-fills the rest - TODO confirm
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_floor_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);  // len is passed to the store; presumably only the first len lanes are written - TODO confirm
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();  // called on every exit path; its effect is defined outside this view
+}
+
+#line 112
+static void simd_DOUBLE_floor_NCONTIG_CONTIG  // npyv_floor_f64 over len f64 elements: intended to read via npyv_loadn_* (stride ssrc) and write via npyv_store_*
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)  // ssrc/sdst: steps in f64 elements (they scale typed-pointer arithmetic below); len: number of elements
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+    // vstep: element distance between consecutive vectors (npyv_nlanes_f64); wstep: elements handled per unrolled pass (4 vectors)
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 4;
+
+    // unrolled iterations: 4 vectors per pass; every load+floor is issued before the first store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);  // expected path for this variant; NOTE(review): requires NCONTIG and CONTIG to expand to different values (defined outside this view) - confirm
+            #endif
+            npyv_f64 v_unary0 = npyv_floor_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_floor_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_floor_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_floor_f64(v_src3);
+        #endif
+        // store phase: one store per vector computed above, in the same order
+        #line 138
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*0, v_unary0);  // address does not use sdst although the loop header steps dst by sdst*wstep; NOTE(review): presumably callers pass sdst == 1 here - confirm
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);  // never compiled here: the guard compares CONTIG with itself
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector at a time for whatever the unrolled loop left (len < wstep)
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_floor_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: at this point fewer than vstep elements remain
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);  // compiled out by "#if 0"
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);  // expected active branch; presumably reads len strided lanes and zero-fills the rest - TODO confirm
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_floor_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);  // len is passed to the store; presumably only the first len lanes are written - TODO confirm
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();  // called on every exit path; its effect is defined outside this view
+}
+
+#line 112
+static void simd_DOUBLE_floor_CONTIG_NCONTIG  // npyv_floor_f64 over len f64 elements: reads via npyv_load_*, intended to write via npyv_storen_* (stride sdst)
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)  // ssrc/sdst: steps in f64 elements (they scale typed-pointer arithmetic below); len: number of elements
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+    // vstep: element distance between consecutive vectors (npyv_nlanes_f64); wstep: elements handled per unrolled pass (2 vectors)
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 2;
+
+    // unrolled iterations: 2 vectors per pass; every load+floor is issued before the first store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0
+            #if CONTIG == CONTIG
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);  // address does not use ssrc although the loop header steps src by ssrc*wstep; NOTE(review): presumably callers pass ssrc == 1 here - confirm
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);  // never compiled here: the guard compares CONTIG with itself
+            #endif
+            npyv_f64 v_unary0 = npyv_floor_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if CONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_floor_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2
+            #if CONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_floor_f64(v_src2);  // vectors 2 and 3 are compiled out in this variant: the guards "2 > 2" and "2 > 3" are false
+        #endif
+        
+#line 126
+        #if 2 > 3
+            #if CONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_floor_f64(v_src3);
+        #endif
+        // store phase: one store per vector computed above, in the same order
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);  // expected path for this variant; NOTE(review): requires NCONTIG and CONTIG to expand to different values (defined outside this view) - confirm
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector at a time for whatever the unrolled loop left (len < wstep)
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_floor_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: at this point fewer than vstep elements remain
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);  // compiled out by "#if 0"
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);  // active branch; presumably reads len lanes and zero-fills the rest - TODO confirm
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_floor_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);  // len is passed to the store; presumably only the first len lanes are written, sdst apart - TODO confirm
+    #endif
+    }
+
+    npyv_cleanup();  // called on every exit path; its effect is defined outside this view
+}
+
+#line 112
+static void simd_DOUBLE_floor_NCONTIG_NCONTIG  // npyv_floor_f64 over len f64 elements: intended to read via npyv_loadn_* (stride ssrc) and write via npyv_storen_* (stride sdst)
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)  // ssrc/sdst: steps in f64 elements (they scale typed-pointer arithmetic below); len: number of elements
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+    // vstep: elements covered by one vector op (npyv_nlanes_f64); wstep: elements handled per unrolled pass (2 vectors)
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 2;
+
+    // unrolled iterations: 2 vectors per pass; every load+floor is issued before the first store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);  // expected path for this variant; NOTE(review): requires NCONTIG and CONTIG to expand to different values (defined outside this view) - confirm
+            #endif
+            npyv_f64 v_unary0 = npyv_floor_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_floor_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_floor_f64(v_src2);  // vectors 2 and 3 are compiled out in this variant: the guards "2 > 2" and "2 > 3" are false
+        #endif
+        
+#line 126
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_floor_f64(v_src3);
+        #endif
+        // store phase: one store per vector computed above, in the same order
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);  // expected path for this variant, under the same NCONTIG != CONTIG assumption as the loads
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector at a time for whatever the unrolled loop left (len < wstep)
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_floor_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: at this point fewer than vstep elements remain
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);  // compiled out by "#if 0"
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);  // expected active branch; presumably reads len strided lanes and zero-fills the rest - TODO confirm
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_floor_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);  // len is passed to the store; presumably only the first len lanes are written, sdst apart - TODO confirm
+    #endif
+    }
+
+    npyv_cleanup();  // called on every exit path; its effect is defined outside this view
+}
+
+
+#line 107
+#line 112
+static void simd_DOUBLE_ceil_CONTIG_CONTIG  // npyv_ceil_f64 over len f64 elements: npyv_load_* reads, npyv_store_* writes (no stride argument on either side)
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)  // ssrc/sdst: steps in f64 elements (they scale typed-pointer arithmetic below); len: number of elements
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+    // vstep: element distance between consecutive vectors (npyv_nlanes_f64); wstep: elements handled per unrolled pass (4 vectors)
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 4;
+
+    // unrolled iterations: 4 vectors per pass; every load+ceil is issued before the first store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);  // address does not use ssrc although the loop header steps src by ssrc*wstep; NOTE(review): presumably callers pass ssrc == 1 here - confirm
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);  // never compiled here: the guard compares CONTIG with itself
+            #endif
+            npyv_f64 v_unary0 = npyv_ceil_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_ceil_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_ceil_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_ceil_f64(v_src3);
+        #endif
+        // store phase: one store per vector computed above, in the same order
+        #line 138
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*0, v_unary0);  // address does not use sdst although the loop header steps dst by sdst*wstep; NOTE(review): presumably callers pass sdst == 1 here - confirm
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);  // never compiled here: the guard compares CONTIG with itself
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector at a time for whatever the unrolled loop left (len < wstep)
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_ceil_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: at this point fewer than vstep elements remain
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);  // compiled out by "#if 0"
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);  // active branch; presumably reads len lanes and zero-fills the rest - TODO confirm
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_ceil_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);  // len is passed to the store; presumably only the first len lanes are written - TODO confirm
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();  // called on every exit path; its effect is defined outside this view
+}
+
+#line 112
+static void simd_DOUBLE_ceil_NCONTIG_CONTIG  // npyv_ceil_f64 over len f64 elements: intended to read via npyv_loadn_* (stride ssrc) and write via npyv_store_*
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)  // ssrc/sdst: steps in f64 elements (they scale typed-pointer arithmetic below); len: number of elements
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+    // vstep: element distance between consecutive vectors (npyv_nlanes_f64); wstep: elements handled per unrolled pass (4 vectors)
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 4;
+
+    // unrolled iterations: 4 vectors per pass; every load+ceil is issued before the first store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);  // expected path for this variant; NOTE(review): requires NCONTIG and CONTIG to expand to different values (defined outside this view) - confirm
+            #endif
+            npyv_f64 v_unary0 = npyv_ceil_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_ceil_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_ceil_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_ceil_f64(v_src3);
+        #endif
+        // store phase: one store per vector computed above, in the same order
+        #line 138
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*0, v_unary0);  // address does not use sdst although the loop header steps dst by sdst*wstep; NOTE(review): presumably callers pass sdst == 1 here - confirm
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);  // never compiled here: the guard compares CONTIG with itself
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector at a time for whatever the unrolled loop left (len < wstep)
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_ceil_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: at this point fewer than vstep elements remain
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);  // compiled out by "#if 0"
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);  // expected active branch; presumably reads len strided lanes and zero-fills the rest - TODO confirm
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_ceil_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);  // len is passed to the store; presumably only the first len lanes are written - TODO confirm
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();  // called on every exit path; its effect is defined outside this view
+}
+
+#line 112
+static void simd_DOUBLE_ceil_CONTIG_NCONTIG  // npyv_ceil_f64 over len f64 elements: reads via npyv_load_*, intended to write via npyv_storen_* (stride sdst)
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)  // ssrc/sdst: steps in f64 elements (they scale typed-pointer arithmetic below); len: number of elements
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+    // vstep: element distance between consecutive vectors (npyv_nlanes_f64); wstep: elements handled per unrolled pass (2 vectors)
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 2;
+
+    // unrolled iterations: 2 vectors per pass; every load+ceil is issued before the first store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0
+            #if CONTIG == CONTIG
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);  // address does not use ssrc although the loop header steps src by ssrc*wstep; NOTE(review): presumably callers pass ssrc == 1 here - confirm
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);  // never compiled here: the guard compares CONTIG with itself
+            #endif
+            npyv_f64 v_unary0 = npyv_ceil_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if CONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_ceil_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2
+            #if CONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_ceil_f64(v_src2);  // vectors 2 and 3 are compiled out in this variant: the guards "2 > 2" and "2 > 3" are false
+        #endif
+        
+#line 126
+        #if 2 > 3
+            #if CONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_ceil_f64(v_src3);
+        #endif
+        // store phase: one store per vector computed above, in the same order
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);  // expected path for this variant; NOTE(review): requires NCONTIG and CONTIG to expand to different values (defined outside this view) - confirm
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector at a time for whatever the unrolled loop left (len < wstep)
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_ceil_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: at this point fewer than vstep elements remain
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);  // compiled out by "#if 0"
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);  // active branch; presumably reads len lanes and zero-fills the rest - TODO confirm
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_ceil_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);  // len is passed to the store; presumably only the first len lanes are written, sdst apart - TODO confirm
+    #endif
+    }
+
+    npyv_cleanup();  // called on every exit path; its effect is defined outside this view
+}
+
+#line 112
+static void simd_DOUBLE_ceil_NCONTIG_NCONTIG  // npyv_ceil_f64 over len f64 elements: intended to read via npyv_loadn_* (stride ssrc) and write via npyv_storen_* (stride sdst)
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)  // ssrc/sdst: steps in f64 elements (they scale typed-pointer arithmetic below); len: number of elements
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+    // vstep: elements covered by one vector op (npyv_nlanes_f64); wstep: elements handled per unrolled pass (2 vectors)
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 2;
+
+    // unrolled iterations: 2 vectors per pass; every load+ceil is issued before the first store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);  // expected path for this variant; NOTE(review): requires NCONTIG and CONTIG to expand to different values (defined outside this view) - confirm
+            #endif
+            npyv_f64 v_unary0 = npyv_ceil_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_ceil_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_ceil_f64(v_src2);  // vectors 2 and 3 are compiled out in this variant: the guards "2 > 2" and "2 > 3" are false
+        #endif
+        
+#line 126
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_ceil_f64(v_src3);
+        #endif
+        // store phase: one store per vector computed above, in the same order
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);  // expected path for this variant, under the same NCONTIG != CONTIG assumption as the loads
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector at a time for whatever the unrolled loop left (len < wstep)
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_ceil_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: at this point fewer than vstep elements remain
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);  // compiled out by "#if 0"
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);  // expected active branch; presumably reads len strided lanes and zero-fills the rest - TODO confirm
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_ceil_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);  // len is passed to the store; presumably only the first len lanes are written, sdst apart - TODO confirm
+    #endif
+    }
+
+    npyv_cleanup();  // called on every exit path; its effect is defined outside this view
+}
+
+
+#line 107
+#line 112
+static void simd_DOUBLE_trunc_CONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+    // Applies npyv_trunc_f64 to len elements from src into dst; ssrc/sdst scale typed-pointer steps, so they count elements, not bytes.
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 4;
+    // Loads are guarded by "CONTIG == CONTIG" (an identifier compared with itself), so the npyv_load* calls compile; they ignore ssrc - presumably 1 here, confirm.
+    // unrolled iterations: 4 vectors (wstep elements) per pass; all loads and computes precede the stores in source order
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_trunc_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_trunc_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_trunc_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_trunc_f64(v_src3);
+        #endif
+        // Stores are guarded by "CONTIG == CONTIG" (an identifier compared with itself), so the npyv_store* calls compile; they ignore sdst - presumably 1 here, confirm.
+        #line 138
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_trunc_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed (0 < len < vstep); "#if 0" compiles out the *_till_* loads, leaving the *_tillz_* ones
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_trunc_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+    // NOTE(review): npyv_cleanup is declared outside this chunk; its effect cannot be determined from here.
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_trunc_NCONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+    // Applies npyv_trunc_f64 to len elements from src into dst; ssrc/sdst scale typed-pointer steps, so they count elements, not bytes.
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 4;
+    // NOTE(review): loads are guarded by "NCONTIG == CONTIG"; both macros are defined outside this chunk - presumably unequal, selecting the npyv_loadn* (#else) calls. Confirm.
+    // unrolled iterations: 4 vectors (wstep elements) per pass; all loads and computes precede the stores in source order
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_trunc_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_trunc_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_trunc_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_trunc_f64(v_src3);
+        #endif
+        // Stores are guarded by "CONTIG == CONTIG" (an identifier compared with itself), so the npyv_store* calls compile; they ignore sdst - presumably 1 here, confirm.
+        #line 138
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_trunc_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed (0 < len < vstep); "#if 0" compiles out the *_till_* loads, leaving the *_tillz_* ones
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_trunc_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+    // NOTE(review): npyv_cleanup is declared outside this chunk; its effect cannot be determined from here.
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_trunc_CONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+    // Applies npyv_trunc_f64 to len elements from src into dst; ssrc/sdst scale typed-pointer steps, so they count elements, not bytes.
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 2;
+    // Loads are guarded by "CONTIG == CONTIG" (an identifier compared with itself), so the npyv_load* calls compile; they ignore ssrc - presumably 1 here, confirm.
+    // unrolled iterations: 2 vectors (wstep elements) per pass; all loads and computes precede the stores in source order
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0
+            #if CONTIG == CONTIG
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_trunc_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if CONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_trunc_f64(v_src1);
+        #endif
+        // "#if 2 > 2" and "#if 2 > 3" are false, so the next two load groups are compiled out
+#line 126
+        #if 2 > 2
+            #if CONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_trunc_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3
+            #if CONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_trunc_f64(v_src3);
+        #endif
+        // NOTE(review): stores are guarded by "NCONTIG == CONTIG"; both macros are defined outside this chunk - presumably unequal, selecting the npyv_storen* (#else) calls. Confirm.
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        // "#if 2 > 2" and "#if 2 > 3" are false, so the next two store groups are compiled out
+#line 138
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_trunc_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed (0 < len < vstep); "#if 0" compiles out the *_till_* loads, leaving the *_tillz_* ones
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_trunc_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+    // NOTE(review): npyv_cleanup is declared outside this chunk; its effect cannot be determined from here.
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_trunc_NCONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+    // Applies npyv_trunc_f64 to len elements from src into dst; ssrc/sdst scale typed-pointer steps, so they count elements, not bytes.
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 2;
+    // NOTE(review): loads are guarded by "NCONTIG == CONTIG"; both macros are defined outside this chunk - presumably unequal, selecting the npyv_loadn* (#else) calls. Confirm.
+    // unrolled iterations: 2 vectors (wstep elements) per pass; all loads and computes precede the stores in source order
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_trunc_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_trunc_f64(v_src1);
+        #endif
+        // "#if 2 > 2" and "#if 2 > 3" are false, so the next two load groups are compiled out
+#line 126
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_trunc_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_trunc_f64(v_src3);
+        #endif
+        // NOTE(review): stores are guarded by "NCONTIG == CONTIG"; both macros are defined outside this chunk - presumably unequal, selecting the npyv_storen* (#else) calls. Confirm.
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        // "#if 2 > 2" and "#if 2 > 3" are false, so the next two store groups are compiled out
+#line 138
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_trunc_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed (0 < len < vstep); "#if 0" compiles out the *_till_* loads, leaving the *_tillz_* ones
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_trunc_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+    // NOTE(review): npyv_cleanup is declared outside this chunk; its effect cannot be determined from here.
+    npyv_cleanup();
+}
+
+
+#line 107
+#line 112
+static void simd_DOUBLE_sqrt_CONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+    // Applies npyv_sqrt_f64 to len elements from src into dst; ssrc/sdst scale typed-pointer steps, so they count elements, not bytes.
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 4;
+    // Loads are guarded by "CONTIG == CONTIG" (an identifier compared with itself), so the npyv_load* calls compile; they ignore ssrc - presumably 1 here, confirm.
+    // unrolled iterations: 4 vectors (wstep elements) per pass; all loads and computes precede the stores in source order
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_sqrt_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_sqrt_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_sqrt_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_sqrt_f64(v_src3);
+        #endif
+        // Stores are guarded by "CONTIG == CONTIG" (an identifier compared with itself), so the npyv_store* calls compile; they ignore sdst - presumably 1 here, confirm.
+        #line 138
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_sqrt_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed (0 < len < vstep); "#if 0" compiles out the *_till_* loads, leaving the *_tillz_* ones
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_sqrt_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+    // NOTE(review): npyv_cleanup is declared outside this chunk; its effect cannot be determined from here.
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_sqrt_NCONTIG_CONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+    // Applies npyv_sqrt_f64 to len elements from src into dst; ssrc/sdst scale typed-pointer steps, so they count elements, not bytes.
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 4;
+    // NOTE(review): loads are guarded by "NCONTIG == CONTIG"; both macros are defined outside this chunk - presumably unequal, selecting the npyv_loadn* (#else) calls. Confirm.
+    // unrolled iterations: 4 vectors (wstep elements) per pass; all loads and computes precede the stores in source order
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_sqrt_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_sqrt_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_sqrt_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_sqrt_f64(v_src3);
+        #endif
+        // Stores are guarded by "CONTIG == CONTIG" (an identifier compared with itself), so the npyv_store* calls compile; they ignore sdst - presumably 1 here, confirm.
+        #line 138
+        #if 4 > 0
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_sqrt_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed (0 < len < vstep); "#if 0" compiles out the *_till_* loads, leaving the *_tillz_* ones
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_sqrt_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+    // NOTE(review): npyv_cleanup is declared outside this chunk; its effect cannot be determined from here.
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_sqrt_CONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+    // Applies npyv_sqrt_f64 to len elements from src into dst; ssrc/sdst scale typed-pointer steps, so they count elements, not bytes.
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 2;
+    // Loads are guarded by "CONTIG == CONTIG" (an identifier compared with itself), so the npyv_load* calls compile; they ignore ssrc - presumably 1 here, confirm.
+    // unrolled iterations: 2 vectors (wstep elements) per pass; all loads and computes precede the stores in source order
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0
+            #if CONTIG == CONTIG
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_sqrt_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if CONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_sqrt_f64(v_src1);
+        #endif
+        // "#if 2 > 2" and "#if 2 > 3" are false, so the next two load groups are compiled out
+#line 126
+        #if 2 > 2
+            #if CONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_sqrt_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3
+            #if CONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_sqrt_f64(v_src3);
+        #endif
+        // NOTE(review): stores are guarded by "NCONTIG == CONTIG"; both macros are defined outside this chunk - presumably unequal, selecting the npyv_storen* (#else) calls. Confirm.
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        // "#if 2 > 2" and "#if 2 > 3" are false, so the next two store groups are compiled out
+#line 138
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_sqrt_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed (0 < len < vstep); "#if 0" compiles out the *_till_* loads, leaving the *_tillz_* ones
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_sqrt_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+    // NOTE(review): npyv_cleanup is declared outside this chunk; its effect cannot be determined from here.
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_sqrt_NCONTIG_NCONTIG
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len)
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+    // Applies npyv_sqrt_f64 to len elements from src into dst; ssrc/sdst scale typed-pointer steps, so they count elements, not bytes.
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * 2;
+    // NOTE(review): loads are guarded by "NCONTIG == CONTIG"; both macros are defined outside this chunk - presumably unequal, selecting the npyv_loadn* (#else) calls. Confirm.
+    // unrolled iterations: 2 vectors (wstep elements) per pass; all loads and computes precede the stores in source order
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_sqrt_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_sqrt_f64(v_src1);
+        #endif
+        // "#if 2 > 2" and "#if 2 > 3" are false, so the next two load groups are compiled out
+#line 126
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_sqrt_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_sqrt_f64(v_src3);
+        #endif
+        // NOTE(review): stores are guarded by "NCONTIG == CONTIG"; both macros are defined outside this chunk - presumably unequal, selecting the npyv_storen* (#else) calls. Confirm.
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        // "#if 2 > 2" and "#if 2 > 3" are false, so the next two store groups are compiled out
+#line 138
+        #if 2 > 2
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_sqrt_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed (0 < len < vstep); "#if 0" compiles out the *_till_* loads, leaving the *_tillz_* ones
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_sqrt_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+    // NOTE(review): npyv_cleanup is declared outside this chunk; its effect cannot be determined from here.
+    npyv_cleanup();
+}
+
+
+#line 107
+#line 112
+static void simd_DOUBLE_absolute_CONTIG_CONTIG // applies npyv_abs_f64 to len f64 elements read from _src and stores the results to _dst
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len) // ssrc/sdst scale offsets on f64 pointers, so they are element counts, not bytes
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+
+    const int vstep = npyv_nlanes_f64; // elements covered by one vector load/store
+    const int wstep = vstep * 4; // elements covered by one unrolled pass (4 vectors)
+
+    // unrolled iterations: 4 vectors per pass; every load and npyv_abs_f64 call is written before the first store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) { // NOTE(review): the contiguous loads/stores match this pointer step only when ssrc == 1 and sdst == 1 - presumably ensured by the caller
+        #line 126
+        #if 4 > 0
+            #if CONTIG == CONTIG // source side: same macro on both sides, so the contiguous load branch is compiled (this test repeats below)
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_abs_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_abs_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_abs_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_abs_f64(v_src3);
+        #endif
+        
+        #line 138
+        #if 4 > 0
+            #if CONTIG == CONTIG // destination side: same macro on both sides, so the contiguous store branch is compiled (this test repeats below)
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector (vstep elements) per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_abs_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: at this point len < vstep, and len is passed to the till-variant load/store
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0 // constant 0: the till form with the extra argument 1 is never compiled; the tillz form is used (same in the strided branch)
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len); // presumably fills lanes beyond len with zero - confirm against the npyv API
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_abs_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_absolute_NCONTIG_CONTIG // applies npyv_abs_f64 to len f64 elements read from _src and stores the results to _dst
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len) // ssrc/sdst scale offsets on f64 pointers, so they are element counts, not bytes
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+
+    const int vstep = npyv_nlanes_f64; // elements covered by one vector load/store
+    const int wstep = vstep * 4; // elements covered by one unrolled pass (4 vectors)
+
+    // unrolled iterations: 4 vectors per pass; every load and npyv_abs_f64 call is written before the first store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) { // NOTE(review): the contiguous stores match this pointer step only when sdst == 1 - presumably ensured by the caller
+        #line 126
+        #if 4 > 0
+            #if NCONTIG == CONTIG // source side: outcome depends on CONTIG/NCONTIG values defined outside this view; presumably false, keeping the strided loadn branch - confirm (this test repeats below)
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_abs_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_abs_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_abs_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_abs_f64(v_src3);
+        #endif
+        
+        #line 138
+        #if 4 > 0
+            #if CONTIG == CONTIG // destination side: same macro on both sides, so the contiguous store branch is compiled (this test repeats below)
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector (vstep elements) per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_abs_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: at this point len < vstep, and len is passed to the till-variant load/store
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0 // constant 0: the till form with the extra argument 1 is never compiled; the tillz form is used (same in the strided branch)
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len); // presumably fills lanes beyond len with zero - confirm against the npyv API
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_abs_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_absolute_CONTIG_NCONTIG // applies npyv_abs_f64 to len f64 elements read from _src and stores the results to _dst
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len) // ssrc/sdst scale offsets on f64 pointers, so they are element counts, not bytes
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+
+    const int vstep = npyv_nlanes_f64; // elements covered by one vector load/store
+    const int wstep = vstep * 2; // elements covered by one unrolled pass (2 vectors)
+
+    // unrolled iterations: 2 vectors per pass; every load and npyv_abs_f64 call is written before the first store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) { // NOTE(review): the contiguous loads match this pointer step only when ssrc == 1 - presumably ensured by the caller
+        #line 126
+        #if 2 > 0
+            #if CONTIG == CONTIG // source side: same macro on both sides, so the contiguous load branch is compiled (this test repeats below)
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_abs_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if CONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_abs_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2 // constant false: this unroll slot is not compiled
+            #if CONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_abs_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3 // constant false: this unroll slot is not compiled
+            #if CONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_abs_f64(v_src3);
+        #endif
+        
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG // destination side: outcome depends on CONTIG/NCONTIG values defined outside this view; presumably false, keeping the strided storen branch - confirm (this test repeats below)
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2 // constant false: this unroll slot is not compiled
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3 // constant false: this unroll slot is not compiled
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector (vstep elements) per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_abs_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: at this point len < vstep, and len is passed to the till-variant load/store
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0 // constant 0: the till form with the extra argument 1 is never compiled; the tillz form is used (same in the strided branch)
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len); // presumably fills lanes beyond len with zero - confirm against the npyv API
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_abs_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_absolute_NCONTIG_NCONTIG // applies npyv_abs_f64 to len f64 elements read from _src and stores the results to _dst
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len) // ssrc/sdst scale offsets on f64 pointers, so they are element counts, not bytes
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+
+    const int vstep = npyv_nlanes_f64; // elements covered by one vector load/store
+    const int wstep = vstep * 2; // elements covered by one unrolled pass (2 vectors)
+
+    // unrolled iterations: 2 vectors per pass; every load and npyv_abs_f64 call is written before the first store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0
+            #if NCONTIG == CONTIG // source side: outcome depends on CONTIG/NCONTIG values defined outside this view; presumably false, keeping the strided loadn branch - confirm (this test repeats below)
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_abs_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_abs_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2 // constant false: this unroll slot is not compiled
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_abs_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3 // constant false: this unroll slot is not compiled
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_abs_f64(v_src3);
+        #endif
+        
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG // destination side: outcome depends on CONTIG/NCONTIG values defined outside this view; presumably false, keeping the strided storen branch - confirm (this test repeats below)
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2 // constant false: this unroll slot is not compiled
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3 // constant false: this unroll slot is not compiled
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector (vstep elements) per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_abs_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: at this point len < vstep, and len is passed to the till-variant load/store
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0 // constant 0: the till form with the extra argument 1 is never compiled; the tillz form is used (same in the strided branch)
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len); // presumably fills lanes beyond len with zero - confirm against the npyv API
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_abs_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+
+#line 107
+#line 112
+static void simd_DOUBLE_square_CONTIG_CONTIG // applies npyv_square_f64 to len f64 elements read from _src and stores the results to _dst
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len) // ssrc/sdst scale offsets on f64 pointers, so they are element counts, not bytes
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+
+    const int vstep = npyv_nlanes_f64; // elements covered by one vector load/store
+    const int wstep = vstep * 4; // elements covered by one unrolled pass (4 vectors)
+
+    // unrolled iterations: 4 vectors per pass; every load and npyv_square_f64 call is written before the first store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) { // NOTE(review): the contiguous loads/stores match this pointer step only when ssrc == 1 and sdst == 1 - presumably ensured by the caller
+        #line 126
+        #if 4 > 0
+            #if CONTIG == CONTIG // source side: same macro on both sides, so the contiguous load branch is compiled (this test repeats below)
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_square_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_square_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_square_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_square_f64(v_src3);
+        #endif
+        
+        #line 138
+        #if 4 > 0
+            #if CONTIG == CONTIG // destination side: same macro on both sides, so the contiguous store branch is compiled (this test repeats below)
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector (vstep elements) per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_square_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: at this point len < vstep, and len is passed to the till-variant load/store
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0 // constant 0: the till form with the extra argument 1 is never compiled; the tillz form is used (same in the strided branch)
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len); // presumably fills lanes beyond len with zero - confirm against the npyv API
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_square_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_square_NCONTIG_CONTIG // applies npyv_square_f64 to len f64 elements read from _src and stores the results to _dst
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len) // ssrc/sdst scale offsets on f64 pointers, so they are element counts, not bytes
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+
+    const int vstep = npyv_nlanes_f64; // elements covered by one vector load/store
+    const int wstep = vstep * 4; // elements covered by one unrolled pass (4 vectors)
+
+    // unrolled iterations: 4 vectors per pass; every load and npyv_square_f64 call is written before the first store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) { // NOTE(review): the contiguous stores match this pointer step only when sdst == 1 - presumably ensured by the caller
+        #line 126
+        #if 4 > 0
+            #if NCONTIG == CONTIG // source side: outcome depends on CONTIG/NCONTIG values defined outside this view; presumably false, keeping the strided loadn branch - confirm (this test repeats below)
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_square_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_square_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_square_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_square_f64(v_src3);
+        #endif
+        
+        #line 138
+        #if 4 > 0
+            #if CONTIG == CONTIG // destination side: same macro on both sides, so the contiguous store branch is compiled (this test repeats below)
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector (vstep elements) per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_square_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: at this point len < vstep, and len is passed to the till-variant load/store
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0 // constant 0: the till form with the extra argument 1 is never compiled; the tillz form is used (same in the strided branch)
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len); // presumably fills lanes beyond len with zero - confirm against the npyv API
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_square_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_square_CONTIG_NCONTIG // applies npyv_square_f64 to len f64 elements read from _src and stores the results to _dst
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len) // ssrc/sdst scale offsets on f64 pointers, so they are element counts, not bytes
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+
+    const int vstep = npyv_nlanes_f64; // elements covered by one vector load/store
+    const int wstep = vstep * 2; // elements covered by one unrolled pass (2 vectors)
+
+    // unrolled iterations: 2 vectors per pass; every load and npyv_square_f64 call is written before the first store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) { // NOTE(review): the contiguous loads match this pointer step only when ssrc == 1 - presumably ensured by the caller
+        #line 126
+        #if 2 > 0
+            #if CONTIG == CONTIG // source side: same macro on both sides, so the contiguous load branch is compiled (this test repeats below)
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_square_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if CONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_square_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2 // constant false: this unroll slot is not compiled
+            #if CONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_square_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3 // constant false: this unroll slot is not compiled
+            #if CONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_square_f64(v_src3);
+        #endif
+        
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG // destination side: outcome depends on CONTIG/NCONTIG values defined outside this view; presumably false, keeping the strided storen branch - confirm (this test repeats below)
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2 // constant false: this unroll slot is not compiled
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3 // constant false: this unroll slot is not compiled
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector (vstep elements) per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_square_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: at this point len < vstep, and len is passed to the till-variant load/store
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 0 // constant 0: the till form with the extra argument 1 is never compiled; the tillz form is used (same in the strided branch)
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len); // presumably fills lanes beyond len with zero - confirm against the npyv API
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_square_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_square_NCONTIG_NCONTIG // applies npyv_square_f64 to len f64 elements read from _src and stores the results to _dst
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len) // ssrc/sdst scale offsets on f64 pointers, so they are element counts, not bytes
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+
+    const int vstep = npyv_nlanes_f64; // elements covered by one vector load/store
+    const int wstep = vstep * 2; // elements covered by one unrolled pass (2 vectors)
+
+    // unrolled iterations: 2 vectors per pass; every load and npyv_square_f64 call is written before the first store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0
+            #if NCONTIG == CONTIG // source side: outcome depends on CONTIG/NCONTIG values defined outside this view; presumably false, keeping the strided loadn branch - confirm (this test repeats below)
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_square_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_square_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2 // constant false: this unroll slot is not compiled
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_square_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3 // constant false: this unroll slot is not compiled
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_square_f64(v_src3);
+        #endif
+        
+        #line 138
+        #if 2 > 0
+            #if NCONTIG == CONTIG // destination side: outcome depends on CONTIG/NCONTIG values defined outside this view; presumably false, keeping the strided storen branch - confirm (this test repeats below)
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2 // constant false: this unroll slot is not compiled
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3 // constant false: this unroll slot is not compiled
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one vector (vstep elements) per pass while at least vstep elements remain
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_square_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed: at this point len < vstep, and len is passed to the till-variant load/store
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 0 // constant 0: the till form with the extra argument 1 is never compiled; the tillz form is used (same in the strided branch)
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 0
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len); // presumably fills lanes beyond len with zero - confirm against the npyv API
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_square_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+
+#line 107
+#line 112
+static void simd_DOUBLE_reciprocal_CONTIG_CONTIG // f64 kernel: stores npyv_recip_f64 of each source element into dst, len elements in total
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len) // ssrc/sdst are strides counted in f64 elements (applied to typed pointers), not bytes
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+
+    const int vstep = npyv_nlanes_f64; // elements consumed per vector
+    const int wstep = vstep * 4; // elements consumed per unrolled pass (4 vectors)
+
+    // unrolled iterations: every load + reciprocal is issued first, then every store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0 // unroll slot 0; all four slots are active because the unroll factor is 4
+            #if CONTIG == CONTIG // tautology: the contiguous load is the branch that gets compiled
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_recip_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_recip_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_recip_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_recip_f64(v_src3);
+        #endif
+        
+        #line 138
+        #if 4 > 0 // store phase: same slots as the load phase above
+            #if CONTIG == CONTIG // tautology: the contiguous store is the branch that gets compiled
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one full vector per pass for what the unrolled loop left over
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_recip_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed (fewer than vstep elements remain)
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 1 // the variant taking an explicit fill value (1) is the one compiled for reciprocal
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1); // NOTE(review): presumably reads only len lanes and sets the rest to 1, keeping padding lanes away from 1/0 - confirm
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 1
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_recip_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0); // NOTE(review): presumably writes only the first len lanes - confirm
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_reciprocal_NCONTIG_CONTIG // f64 kernel: stores npyv_recip_f64 of each source element into dst, len elements in total
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len) // ssrc/sdst are strides counted in f64 elements (applied to typed pointers), not bytes
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+
+    const int vstep = npyv_nlanes_f64; // elements consumed per vector
+    const int wstep = vstep * 4; // elements consumed per unrolled pass (4 vectors)
+
+    // unrolled iterations: every load + reciprocal is issued first, then every store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 4 > 0 // unroll slot 0; all four slots are active because the unroll factor is 4
+            #if NCONTIG == CONTIG // NOTE(review): presumably false (NCONTIG and CONTIG assumed to be distinct macros defined earlier), selecting the strided load - confirm
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_recip_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 4 > 1
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_recip_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 4 > 2
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_recip_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 4 > 3
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_recip_f64(v_src3);
+        #endif
+        
+        #line 138
+        #if 4 > 0 // store phase: same slots as the load phase above
+            #if CONTIG == CONTIG // tautology: the contiguous store is the branch that gets compiled
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 1
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 2
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 4 > 3
+            #if CONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one full vector per pass for what the unrolled loop left over
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_recip_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed (fewer than vstep elements remain)
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 1
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 1 // the variant taking an explicit fill value (1) is the one compiled for reciprocal
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1); // NOTE(review): presumably reads only len lanes and sets the rest to 1, keeping padding lanes away from 1/0 - confirm
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_recip_f64(v_src0);
+    #if CONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0); // NOTE(review): presumably writes only the first len lanes - confirm
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0);
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_reciprocal_CONTIG_NCONTIG // f64 kernel: stores npyv_recip_f64 of each source element into dst, len elements in total
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len) // ssrc/sdst are strides counted in f64 elements (applied to typed pointers), not bytes
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+
+    const int vstep = npyv_nlanes_f64; // elements consumed per vector
+    const int wstep = vstep * 2; // elements consumed per unrolled pass (2 vectors in this variant)
+
+    // unrolled iterations: every load + reciprocal is issued first, then every store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0 // unroll slot 0; only slots 0 and 1 are active because the unroll factor is 2
+            #if CONTIG == CONTIG // tautology: the contiguous load is the branch that gets compiled
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_recip_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if CONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_recip_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2 // false: slot 2 is compiled out
+            #if CONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_recip_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3 // false: slot 3 is compiled out
+            #if CONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_recip_f64(v_src3);
+        #endif
+        
+        #line 138
+        #if 2 > 0 // store phase: same slots as the load phase above
+            #if NCONTIG == CONTIG // NOTE(review): presumably false (NCONTIG and CONTIG assumed to be distinct macros defined earlier), selecting the strided store - confirm
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2 // false: slot 2 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3 // false: slot 3 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one full vector per pass for what the unrolled loop left over
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_recip_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed (fewer than vstep elements remain)
+    if(len > 0){
+    #if CONTIG == CONTIG
+        #if 1 // the variant taking an explicit fill value (1) is the one compiled for reciprocal
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1); // NOTE(review): presumably reads only len lanes and sets the rest to 1, keeping padding lanes away from 1/0 - confirm
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 1
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_recip_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0); // NOTE(review): presumably writes only the first len lanes, sdst elements apart - confirm
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+#line 112
+static void simd_DOUBLE_reciprocal_NCONTIG_NCONTIG // f64 kernel: stores npyv_recip_f64 of each source element into dst, len elements in total
+(const void *_src, npy_intp ssrc, void *_dst, npy_intp sdst, npy_intp len) // ssrc/sdst are strides counted in f64 elements (applied to typed pointers), not bytes
+{
+    const npyv_lanetype_f64 *src = _src;
+          npyv_lanetype_f64 *dst = _dst;
+
+    const int vstep = npyv_nlanes_f64; // elements consumed per vector
+    const int wstep = vstep * 2; // elements consumed per unrolled pass (2 vectors in this variant)
+
+    // unrolled iterations: every load + reciprocal is issued first, then every store
+    for (; len >= wstep; len -= wstep, src += ssrc*wstep, dst += sdst*wstep) {
+        #line 126
+        #if 2 > 0 // unroll slot 0; only slots 0 and 1 are active because the unroll factor is 2
+            #if NCONTIG == CONTIG // NOTE(review): presumably false (NCONTIG and CONTIG assumed to be distinct macros defined earlier), selecting the strided load - confirm
+                npyv_f64 v_src0 = npyv_load_f64(src + vstep*0);
+            #else
+                npyv_f64 v_src0 = npyv_loadn_f64(src + ssrc*vstep*0, ssrc);
+            #endif
+            npyv_f64 v_unary0 = npyv_recip_f64(v_src0);
+        #endif
+        
+#line 126
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src1 = npyv_load_f64(src + vstep*1);
+            #else
+                npyv_f64 v_src1 = npyv_loadn_f64(src + ssrc*vstep*1, ssrc);
+            #endif
+            npyv_f64 v_unary1 = npyv_recip_f64(v_src1);
+        #endif
+        
+#line 126
+        #if 2 > 2 // false: slot 2 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src2 = npyv_load_f64(src + vstep*2);
+            #else
+                npyv_f64 v_src2 = npyv_loadn_f64(src + ssrc*vstep*2, ssrc);
+            #endif
+            npyv_f64 v_unary2 = npyv_recip_f64(v_src2);
+        #endif
+        
+#line 126
+        #if 2 > 3 // false: slot 3 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_f64 v_src3 = npyv_load_f64(src + vstep*3);
+            #else
+                npyv_f64 v_src3 = npyv_loadn_f64(src + ssrc*vstep*3, ssrc);
+            #endif
+            npyv_f64 v_unary3 = npyv_recip_f64(v_src3);
+        #endif
+        
+        #line 138
+        #if 2 > 0 // store phase: same slots as the load phase above
+            #if NCONTIG == CONTIG // same assumption as for the loads: the strided store is presumably the compiled branch
+                npyv_store_f64(dst + vstep*0, v_unary0);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*0, sdst, v_unary0);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 1
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*1, v_unary1);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*1, sdst, v_unary1);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 2 // false: slot 2 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*2, v_unary2);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*2, sdst, v_unary2);
+            #endif
+        #endif
+        
+#line 138
+        #if 2 > 3 // false: slot 3 is compiled out
+            #if NCONTIG == CONTIG
+                npyv_store_f64(dst + vstep*3, v_unary3);
+            #else
+                npyv_storen_f64(dst + sdst*vstep*3, sdst, v_unary3);
+            #endif
+        #endif
+        
+    }
+
+    // vector-sized iterations: one full vector per pass for what the unrolled loop left over
+    for (; len >= vstep; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v_src0 = npyv_load_f64(src);
+    #else
+        npyv_f64 v_src0 = npyv_loadn_f64(src, ssrc);
+    #endif
+        npyv_f64 v_unary0 = npyv_recip_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_f64(dst, v_unary0);
+    #else
+        npyv_storen_f64(dst, sdst, v_unary0);
+    #endif
+    }
+
+    // last partial iteration, if needed (fewer than vstep elements remain)
+    if(len > 0){
+    #if NCONTIG == CONTIG
+        #if 1
+            npyv_f64 v_src0 = npyv_load_till_f64(src, len, 1);
+        #else
+            npyv_f64 v_src0 = npyv_load_tillz_f64(src, len);
+        #endif
+    #else
+        #if 1 // the variant taking an explicit fill value (1) is the one compiled for reciprocal
+            npyv_f64 v_src0 = npyv_loadn_till_f64(src, ssrc, len, 1); // NOTE(review): presumably reads only len lanes and sets the rest to 1, keeping padding lanes away from 1/0 - confirm
+        #else
+            npyv_f64 v_src0 = npyv_loadn_tillz_f64(src, ssrc, len);
+        #endif
+    #endif
+        npyv_f64 v_unary0 = npyv_recip_f64(v_src0);
+    #if NCONTIG == CONTIG
+        npyv_store_till_f64(dst, len, v_unary0);
+    #else
+        npyv_storen_till_f64(dst, sdst, len, v_unary0); // NOTE(review): presumably writes only the first len lanes, sdst elements apart - confirm
+    #endif
+    }
+
+    npyv_cleanup();
+}
+
+
+#endif // NPY_SIMD_F64
+
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+#line 201
+#line 206
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_rint) // inner loop: routes dimensions[0] elements to the simd_FLOAT_rint_* kernels, or to c_rint_f32 without SIMD
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // args[0]/steps[0] describe the input, args[1]/steps[1] the output
+{
+    const char *src = args[0]; char *dst = args[1];
+    const npy_intp src_step = steps[0]; // input stride in bytes (added to a char pointer)
+    const npy_intp dst_step = steps[1]; // output stride in bytes
+    npy_intp len = dimensions[0]; // number of elements to process
+#if NPY_SIMD_F32
+    const int lsize = sizeof(npyv_lanetype_f32);
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0)); // byte strides must be whole multiples of the element size
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
+        goto no_unroll; // overlap reported: fall back to the one-element-at-a-time loop
+    }
+    const npy_intp ssrc = src_step / lsize; // strides converted from bytes to elements
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)) {
+        goto no_unroll; // stride rejected by the npyv stride checks: use the per-element loop
+    }
+    if (ssrc == 1 && sdst == 1) { // choose the kernel specialised for this stride combination
+        simd_FLOAT_rint_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) {
+        simd_FLOAT_rint_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) {
+        simd_FLOAT_rint_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else {
+        simd_FLOAT_rint_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear; // the kernel covered all elements; skip the per-element loop
+no_unroll:
+#endif // NPY_SIMD_F32
+    for (; len > 0; --len, src += src_step, dst += dst_step) { // per-element path: reached via no_unroll, or always when SIMD is unavailable
+    #if NPY_SIMD_F32
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_FLOAT_rint_CONTIG_CONTIG(src, 0, dst, 0, 1); // len == 1: the kernel handles exactly this one element
+    #else
+        const npyv_lanetype_f32 src0 = *(npyv_lanetype_f32*)src;
+        *(npyv_lanetype_f32*)dst = c_rint_f32(src0);
+    #endif
+    }
+#if NPY_SIMD_F32
+clear:;
+#endif
+#if 0 // compiled out: npy_clear_floatstatus_barrier is not called for rint
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 206
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_floor) // inner loop: routes dimensions[0] elements to the simd_FLOAT_floor_* kernels, or to c_floor_f32 without SIMD
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // args[0]/steps[0] describe the input, args[1]/steps[1] the output
+{
+    const char *src = args[0]; char *dst = args[1];
+    const npy_intp src_step = steps[0]; // input stride in bytes (added to a char pointer)
+    const npy_intp dst_step = steps[1]; // output stride in bytes
+    npy_intp len = dimensions[0]; // number of elements to process
+#if NPY_SIMD_F32
+    const int lsize = sizeof(npyv_lanetype_f32);
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0)); // byte strides must be whole multiples of the element size
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
+        goto no_unroll; // overlap reported: fall back to the one-element-at-a-time loop
+    }
+    const npy_intp ssrc = src_step / lsize; // strides converted from bytes to elements
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)) {
+        goto no_unroll; // stride rejected by the npyv stride checks: use the per-element loop
+    }
+    if (ssrc == 1 && sdst == 1) { // choose the kernel specialised for this stride combination
+        simd_FLOAT_floor_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) {
+        simd_FLOAT_floor_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) {
+        simd_FLOAT_floor_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else {
+        simd_FLOAT_floor_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear; // the kernel covered all elements; skip the per-element loop
+no_unroll:
+#endif // NPY_SIMD_F32
+    for (; len > 0; --len, src += src_step, dst += dst_step) { // per-element path: reached via no_unroll, or always when SIMD is unavailable
+    #if NPY_SIMD_F32
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_FLOAT_floor_CONTIG_CONTIG(src, 0, dst, 0, 1); // len == 1: the kernel handles exactly this one element
+    #else
+        const npyv_lanetype_f32 src0 = *(npyv_lanetype_f32*)src;
+        *(npyv_lanetype_f32*)dst = c_floor_f32(src0);
+    #endif
+    }
+#if NPY_SIMD_F32
+clear:;
+#endif
+#if 0 // compiled out: npy_clear_floatstatus_barrier is not called for floor
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 206
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_ceil) // inner loop: routes dimensions[0] elements to the simd_FLOAT_ceil_* kernels, or to c_ceil_f32 without SIMD
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // args[0]/steps[0] describe the input, args[1]/steps[1] the output
+{
+    const char *src = args[0]; char *dst = args[1];
+    const npy_intp src_step = steps[0]; // input stride in bytes (added to a char pointer)
+    const npy_intp dst_step = steps[1]; // output stride in bytes
+    npy_intp len = dimensions[0]; // number of elements to process
+#if NPY_SIMD_F32
+    const int lsize = sizeof(npyv_lanetype_f32);
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0)); // byte strides must be whole multiples of the element size
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
+        goto no_unroll; // overlap reported: fall back to the one-element-at-a-time loop
+    }
+    const npy_intp ssrc = src_step / lsize; // strides converted from bytes to elements
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)) {
+        goto no_unroll; // stride rejected by the npyv stride checks: use the per-element loop
+    }
+    if (ssrc == 1 && sdst == 1) { // choose the kernel specialised for this stride combination
+        simd_FLOAT_ceil_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) {
+        simd_FLOAT_ceil_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) {
+        simd_FLOAT_ceil_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else {
+        simd_FLOAT_ceil_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear; // the kernel covered all elements; skip the per-element loop
+no_unroll:
+#endif // NPY_SIMD_F32
+    for (; len > 0; --len, src += src_step, dst += dst_step) { // per-element path: reached via no_unroll, or always when SIMD is unavailable
+    #if NPY_SIMD_F32
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_FLOAT_ceil_CONTIG_CONTIG(src, 0, dst, 0, 1); // len == 1: the kernel handles exactly this one element
+    #else
+        const npyv_lanetype_f32 src0 = *(npyv_lanetype_f32*)src;
+        *(npyv_lanetype_f32*)dst = c_ceil_f32(src0);
+    #endif
+    }
+#if NPY_SIMD_F32
+clear:;
+#endif
+#if 0 // compiled out: npy_clear_floatstatus_barrier is not called for ceil
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 206
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_trunc) // inner loop: routes dimensions[0] elements to the simd_FLOAT_trunc_* kernels, or to c_trunc_f32 without SIMD
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // args[0]/steps[0] describe the input, args[1]/steps[1] the output
+{
+    const char *src = args[0]; char *dst = args[1];
+    const npy_intp src_step = steps[0]; // input stride in bytes (added to a char pointer)
+    const npy_intp dst_step = steps[1]; // output stride in bytes
+    npy_intp len = dimensions[0]; // number of elements to process
+#if NPY_SIMD_F32
+    const int lsize = sizeof(npyv_lanetype_f32);
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0)); // byte strides must be whole multiples of the element size
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
+        goto no_unroll; // overlap reported: fall back to the one-element-at-a-time loop
+    }
+    const npy_intp ssrc = src_step / lsize; // strides converted from bytes to elements
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)) {
+        goto no_unroll; // stride rejected by the npyv stride checks: use the per-element loop
+    }
+    if (ssrc == 1 && sdst == 1) { // choose the kernel specialised for this stride combination
+        simd_FLOAT_trunc_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) {
+        simd_FLOAT_trunc_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) {
+        simd_FLOAT_trunc_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else {
+        simd_FLOAT_trunc_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear; // the kernel covered all elements; skip the per-element loop
+no_unroll:
+#endif // NPY_SIMD_F32
+    for (; len > 0; --len, src += src_step, dst += dst_step) { // per-element path: reached via no_unroll, or always when SIMD is unavailable
+    #if NPY_SIMD_F32
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_FLOAT_trunc_CONTIG_CONTIG(src, 0, dst, 0, 1); // len == 1: the kernel handles exactly this one element
+    #else
+        const npyv_lanetype_f32 src0 = *(npyv_lanetype_f32*)src;
+        *(npyv_lanetype_f32*)dst = c_trunc_f32(src0);
+    #endif
+    }
+#if NPY_SIMD_F32
+clear:;
+#endif
+#if 0 // compiled out: npy_clear_floatstatus_barrier is not called for trunc
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 206
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_sqrt) // inner loop: routes dimensions[0] elements to the simd_FLOAT_sqrt_* kernels, or to c_sqrt_f32 without SIMD
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // args[0]/steps[0] describe the input, args[1]/steps[1] the output
+{
+    const char *src = args[0]; char *dst = args[1];
+    const npy_intp src_step = steps[0]; // input stride in bytes (added to a char pointer)
+    const npy_intp dst_step = steps[1]; // output stride in bytes
+    npy_intp len = dimensions[0]; // number of elements to process
+#if NPY_SIMD_F32
+    const int lsize = sizeof(npyv_lanetype_f32);
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0)); // byte strides must be whole multiples of the element size
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
+        goto no_unroll; // overlap reported: fall back to the one-element-at-a-time loop
+    }
+    const npy_intp ssrc = src_step / lsize; // strides converted from bytes to elements
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)) {
+        goto no_unroll; // stride rejected by the npyv stride checks: use the per-element loop
+    }
+    if (ssrc == 1 && sdst == 1) { // choose the kernel specialised for this stride combination
+        simd_FLOAT_sqrt_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) {
+        simd_FLOAT_sqrt_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) {
+        simd_FLOAT_sqrt_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else {
+        simd_FLOAT_sqrt_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear; // the kernel covered all elements; skip the per-element loop
+no_unroll:
+#endif // NPY_SIMD_F32
+    for (; len > 0; --len, src += src_step, dst += dst_step) { // per-element path: reached via no_unroll, or always when SIMD is unavailable
+    #if NPY_SIMD_F32
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_FLOAT_sqrt_CONTIG_CONTIG(src, 0, dst, 0, 1); // len == 1: the kernel handles exactly this one element
+    #else
+        const npyv_lanetype_f32 src0 = *(npyv_lanetype_f32*)src;
+        *(npyv_lanetype_f32*)dst = c_sqrt_f32(src0);
+    #endif
+    }
+#if NPY_SIMD_F32
+clear:;
+#endif
+#if 0 // compiled out: npy_clear_floatstatus_barrier is not called for sqrt
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 206
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_absolute) // inner loop: routes dimensions[0] elements to the simd_FLOAT_absolute_* kernels, or to c_abs_f32 without SIMD
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // args[0]/steps[0] describe the input, args[1]/steps[1] the output
+{
+    const char *src = args[0]; char *dst = args[1];
+    const npy_intp src_step = steps[0]; // input stride in bytes (added to a char pointer)
+    const npy_intp dst_step = steps[1]; // output stride in bytes
+    npy_intp len = dimensions[0]; // number of elements to process
+#if NPY_SIMD_F32
+    const int lsize = sizeof(npyv_lanetype_f32);
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0)); // byte strides must be whole multiples of the element size
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
+        goto no_unroll; // overlap reported: fall back to the one-element-at-a-time loop
+    }
+    const npy_intp ssrc = src_step / lsize; // strides converted from bytes to elements
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)) {
+        goto no_unroll; // stride rejected by the npyv stride checks: use the per-element loop
+    }
+    if (ssrc == 1 && sdst == 1) { // choose the kernel specialised for this stride combination
+        simd_FLOAT_absolute_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) {
+        simd_FLOAT_absolute_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) {
+        simd_FLOAT_absolute_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else {
+        simd_FLOAT_absolute_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear; // the kernel covered all elements; skip the per-element loop
+no_unroll:
+#endif // NPY_SIMD_F32
+    for (; len > 0; --len, src += src_step, dst += dst_step) { // per-element path: reached via no_unroll, or always when SIMD is unavailable
+    #if NPY_SIMD_F32
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_FLOAT_absolute_CONTIG_CONTIG(src, 0, dst, 0, 1); // len == 1: the kernel handles exactly this one element
+    #else
+        const npyv_lanetype_f32 src0 = *(npyv_lanetype_f32*)src;
+        *(npyv_lanetype_f32*)dst = c_abs_f32(src0);
+    #endif
+    }
+#if NPY_SIMD_F32
+clear:;
+#endif
+#if 1 // compiled in: absolute calls npy_clear_floatstatus_barrier on every exit path
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 206
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_square) // inner loop: routes dimensions[0] elements to the simd_FLOAT_square_* kernels, or to c_square_f32 without SIMD
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // args[0]/steps[0] describe the input, args[1]/steps[1] the output
+{
+    const char *src = args[0]; char *dst = args[1];
+    const npy_intp src_step = steps[0]; // input stride in bytes (added to a char pointer)
+    const npy_intp dst_step = steps[1]; // output stride in bytes
+    npy_intp len = dimensions[0]; // number of elements to process
+#if NPY_SIMD_F32
+    const int lsize = sizeof(npyv_lanetype_f32);
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0)); // byte strides must be whole multiples of the element size
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
+        goto no_unroll; // overlap reported: fall back to the one-element-at-a-time loop
+    }
+    const npy_intp ssrc = src_step / lsize; // strides converted from bytes to elements
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)) {
+        goto no_unroll; // stride rejected by the npyv stride checks: use the per-element loop
+    }
+    if (ssrc == 1 && sdst == 1) { // choose the kernel specialised for this stride combination
+        simd_FLOAT_square_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) {
+        simd_FLOAT_square_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) {
+        simd_FLOAT_square_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else {
+        simd_FLOAT_square_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear; // the kernel covered all elements; skip the per-element loop
+no_unroll:
+#endif // NPY_SIMD_F32
+    for (; len > 0; --len, src += src_step, dst += dst_step) { // per-element path: reached via no_unroll, or always when SIMD is unavailable
+    #if NPY_SIMD_F32
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_FLOAT_square_CONTIG_CONTIG(src, 0, dst, 0, 1); // len == 1: the kernel handles exactly this one element
+    #else
+        const npyv_lanetype_f32 src0 = *(npyv_lanetype_f32*)src;
+        *(npyv_lanetype_f32*)dst = c_square_f32(src0);
+    #endif
+    }
+#if NPY_SIMD_F32
+clear:;
+#endif
+#if 0 // compiled out: npy_clear_floatstatus_barrier is not called for square
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 206
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_reciprocal) // inner loop: routes dimensions[0] elements to the simd_FLOAT_reciprocal_* kernels, or to c_recip_f32 without SIMD
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // args[0]/steps[0] describe the input, args[1]/steps[1] the output
+{
+    const char *src = args[0]; char *dst = args[1];
+    const npy_intp src_step = steps[0]; // input stride in bytes (added to a char pointer)
+    const npy_intp dst_step = steps[1]; // output stride in bytes
+    npy_intp len = dimensions[0]; // number of elements to process
+#if NPY_SIMD_F32
+    const int lsize = sizeof(npyv_lanetype_f32);
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0)); // byte strides must be whole multiples of the element size
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
+        goto no_unroll; // overlap reported: fall back to the one-element-at-a-time loop
+    }
+    const npy_intp ssrc = src_step / lsize; // strides converted from bytes to elements
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_f32(ssrc) || !npyv_storable_stride_f32(sdst)) {
+        goto no_unroll; // stride rejected by the npyv stride checks: use the per-element loop
+    }
+    if (ssrc == 1 && sdst == 1) { // choose the kernel specialised for this stride combination
+        simd_FLOAT_reciprocal_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) {
+        simd_FLOAT_reciprocal_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) {
+        simd_FLOAT_reciprocal_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else {
+        simd_FLOAT_reciprocal_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear; // the kernel covered all elements; skip the per-element loop
+no_unroll:
+#endif // NPY_SIMD_F32
+    for (; len > 0; --len, src += src_step, dst += dst_step) { // per-element path: reached via no_unroll, or always when SIMD is unavailable
+    #if NPY_SIMD_F32
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_FLOAT_reciprocal_CONTIG_CONTIG(src, 0, dst, 0, 1); // len == 1: the kernel handles exactly this one element
+    #else
+        const npyv_lanetype_f32 src0 = *(npyv_lanetype_f32*)src;
+        *(npyv_lanetype_f32*)dst = c_recip_f32(src0);
+    #endif
+    }
+#if NPY_SIMD_F32
+clear:;
+#endif
+#if 0 // compiled out: npy_clear_floatstatus_barrier is not called for reciprocal
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+#line 201
+#line 206
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_rint) // inner loop: routes dimensions[0] elements to the simd_DOUBLE_rint_* kernels, or to c_rint_f64 without SIMD
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // args[0]/steps[0] describe the input, args[1]/steps[1] the output
+{
+    const char *src = args[0]; char *dst = args[1];
+    const npy_intp src_step = steps[0]; // input stride in bytes (added to a char pointer)
+    const npy_intp dst_step = steps[1]; // output stride in bytes
+    npy_intp len = dimensions[0]; // number of elements to process
+#if NPY_SIMD_F64
+    const int lsize = sizeof(npyv_lanetype_f64);
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0)); // byte strides must be whole multiples of the element size
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
+        goto no_unroll; // overlap reported: fall back to the one-element-at-a-time loop
+    }
+    const npy_intp ssrc = src_step / lsize; // strides converted from bytes to elements
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_f64(ssrc) || !npyv_storable_stride_f64(sdst)) {
+        goto no_unroll; // stride rejected by the npyv stride checks: use the per-element loop
+    }
+    if (ssrc == 1 && sdst == 1) { // choose the kernel specialised for this stride combination
+        simd_DOUBLE_rint_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) {
+        simd_DOUBLE_rint_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) {
+        simd_DOUBLE_rint_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else {
+        simd_DOUBLE_rint_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear; // the kernel covered all elements; skip the per-element loop
+no_unroll:
+#endif // NPY_SIMD_F64
+    for (; len > 0; --len, src += src_step, dst += dst_step) { // per-element path: reached via no_unroll, or always when SIMD is unavailable
+    #if NPY_SIMD_F64
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_DOUBLE_rint_CONTIG_CONTIG(src, 0, dst, 0, 1); // len == 1: the kernel handles exactly this one element
+    #else
+        const npyv_lanetype_f64 src0 = *(npyv_lanetype_f64*)src;
+        *(npyv_lanetype_f64*)dst = c_rint_f64(src0);
+    #endif
+    }
+#if NPY_SIMD_F64
+clear:;
+#endif
+#if 0 // compiled out: npy_clear_floatstatus_barrier is not called for rint
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 206
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_floor) // inner loop: routes dimensions[0] elements to the simd_DOUBLE_floor_* kernels, or to c_floor_f64 without SIMD
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) // args[0]/steps[0] describe the input, args[1]/steps[1] the output
+{
+    const char *src = args[0]; char *dst = args[1];
+    const npy_intp src_step = steps[0]; // input stride in bytes (added to a char pointer)
+    const npy_intp dst_step = steps[1]; // output stride in bytes
+    npy_intp len = dimensions[0]; // number of elements to process
+#if NPY_SIMD_F64
+    const int lsize = sizeof(npyv_lanetype_f64);
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0)); // byte strides must be whole multiples of the element size
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) {
+        goto no_unroll; // overlap reported: fall back to the one-element-at-a-time loop
+    }
+    const npy_intp ssrc = src_step / lsize; // strides converted from bytes to elements
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_f64(ssrc) || !npyv_storable_stride_f64(sdst)) {
+        goto no_unroll; // stride rejected by the npyv stride checks: use the per-element loop
+    }
+    if (ssrc == 1 && sdst == 1) { // choose the kernel specialised for this stride combination
+        simd_DOUBLE_floor_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) {
+        simd_DOUBLE_floor_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) {
+        simd_DOUBLE_floor_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else {
+        simd_DOUBLE_floor_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear; // the kernel covered all elements; skip the per-element loop
+no_unroll:
+#endif // NPY_SIMD_F64
+    for (; len > 0; --len, src += src_step, dst += dst_step) { // per-element path: reached via no_unroll, or always when SIMD is unavailable
+    #if NPY_SIMD_F64
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_DOUBLE_floor_CONTIG_CONTIG(src, 0, dst, 0, 1); // len == 1: the kernel handles exactly this one element
+    #else
+        const npyv_lanetype_f64 src0 = *(npyv_lanetype_f64*)src;
+        *(npyv_lanetype_f64*)dst = c_floor_f64(src0);
+    #endif
+    }
+#if NPY_SIMD_F64
+clear:;
+#endif
+#if 0 // compiled out: npy_clear_floatstatus_barrier is not called for floor
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 206
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_ceil) // DOUBLE_ceil loop: processes dimensions[0] elements from args[0] into args[1] via the simd_DOUBLE_ceil_* kernels
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    const char *src = args[0]; char *dst = args[1]; // input and output byte pointers
+    const npy_intp src_step = steps[0]; // input stride in bytes (added to the char pointer in the loop below)
+    const npy_intp dst_step = steps[1]; // output stride in bytes
+    npy_intp len = dimensions[0]; // element count
+#if NPY_SIMD_F64
+    const int lsize = sizeof(npyv_lanetype_f64); // size of one lane in bytes
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0)); // byte strides must be whole multiples of the lane size
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) { // overlap reported: use the one-element-at-a-time loop
+        goto no_unroll;
+    }
+    const npy_intp ssrc = src_step / lsize; // strides re-expressed in lanes rather than bytes
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_f64(ssrc) || !npyv_storable_stride_f64(sdst)) { // stride not accepted for strided vector load/store
+        goto no_unroll;
+    }
+    if (ssrc == 1 && sdst == 1) { // unit stride on both sides
+        simd_DOUBLE_ceil_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) { // strided input, unit-stride output
+        simd_DOUBLE_ceil_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) { // unit-stride input, strided output
+        simd_DOUBLE_ceil_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else { // both sides strided
+        simd_DOUBLE_ceil_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear; // whole range handled by one kernel call; skip the per-element loop
+no_unroll:
+#endif // NPY_SIMD_F64
+    for (; len > 0; --len, src += src_step, dst += dst_step) { // per-element fallback; the only path when NPY_SIMD_F64 is 0
+    #if NPY_SIMD_F64
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_DOUBLE_ceil_CONTIG_CONTIG(src, 0, dst, 0, 1); // same kernel invoked on a single element (len 1, strides 0)
+    #else
+        const npyv_lanetype_f64 src0 = *(npyv_lanetype_f64*)src;
+        *(npyv_lanetype_f64*)dst = c_ceil_f64(src0); // scalar helper defined outside this chunk
+    #endif
+    }
+#if NPY_SIMD_F64
+clear:;
+#endif
+#if 0 // generated as 0 for this operation: the floating-point status clear below is compiled out
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 206
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_trunc) // DOUBLE_trunc loop: processes dimensions[0] elements from args[0] into args[1] via the simd_DOUBLE_trunc_* kernels
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    const char *src = args[0]; char *dst = args[1]; // input and output byte pointers
+    const npy_intp src_step = steps[0]; // input stride in bytes (added to the char pointer in the loop below)
+    const npy_intp dst_step = steps[1]; // output stride in bytes
+    npy_intp len = dimensions[0]; // element count
+#if NPY_SIMD_F64
+    const int lsize = sizeof(npyv_lanetype_f64); // size of one lane in bytes
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0)); // byte strides must be whole multiples of the lane size
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) { // overlap reported: use the one-element-at-a-time loop
+        goto no_unroll;
+    }
+    const npy_intp ssrc = src_step / lsize; // strides re-expressed in lanes rather than bytes
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_f64(ssrc) || !npyv_storable_stride_f64(sdst)) { // stride not accepted for strided vector load/store
+        goto no_unroll;
+    }
+    if (ssrc == 1 && sdst == 1) { // unit stride on both sides
+        simd_DOUBLE_trunc_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) { // strided input, unit-stride output
+        simd_DOUBLE_trunc_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) { // unit-stride input, strided output
+        simd_DOUBLE_trunc_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else { // both sides strided
+        simd_DOUBLE_trunc_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear; // whole range handled by one kernel call; skip the per-element loop
+no_unroll:
+#endif // NPY_SIMD_F64
+    for (; len > 0; --len, src += src_step, dst += dst_step) { // per-element fallback; the only path when NPY_SIMD_F64 is 0
+    #if NPY_SIMD_F64
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_DOUBLE_trunc_CONTIG_CONTIG(src, 0, dst, 0, 1); // same kernel invoked on a single element (len 1, strides 0)
+    #else
+        const npyv_lanetype_f64 src0 = *(npyv_lanetype_f64*)src;
+        *(npyv_lanetype_f64*)dst = c_trunc_f64(src0); // scalar helper defined outside this chunk
+    #endif
+    }
+#if NPY_SIMD_F64
+clear:;
+#endif
+#if 0 // generated as 0 for this operation: the floating-point status clear below is compiled out
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 206
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_sqrt) // DOUBLE_sqrt loop: processes dimensions[0] elements from args[0] into args[1] via the simd_DOUBLE_sqrt_* kernels
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    const char *src = args[0]; char *dst = args[1]; // input and output byte pointers
+    const npy_intp src_step = steps[0]; // input stride in bytes (added to the char pointer in the loop below)
+    const npy_intp dst_step = steps[1]; // output stride in bytes
+    npy_intp len = dimensions[0]; // element count
+#if NPY_SIMD_F64
+    const int lsize = sizeof(npyv_lanetype_f64); // size of one lane in bytes
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0)); // byte strides must be whole multiples of the lane size
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) { // overlap reported: use the one-element-at-a-time loop
+        goto no_unroll;
+    }
+    const npy_intp ssrc = src_step / lsize; // strides re-expressed in lanes rather than bytes
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_f64(ssrc) || !npyv_storable_stride_f64(sdst)) { // stride not accepted for strided vector load/store
+        goto no_unroll;
+    }
+    if (ssrc == 1 && sdst == 1) { // unit stride on both sides
+        simd_DOUBLE_sqrt_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) { // strided input, unit-stride output
+        simd_DOUBLE_sqrt_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) { // unit-stride input, strided output
+        simd_DOUBLE_sqrt_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else { // both sides strided
+        simd_DOUBLE_sqrt_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear; // whole range handled by one kernel call; skip the per-element loop
+no_unroll:
+#endif // NPY_SIMD_F64
+    for (; len > 0; --len, src += src_step, dst += dst_step) { // per-element fallback; the only path when NPY_SIMD_F64 is 0
+    #if NPY_SIMD_F64
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_DOUBLE_sqrt_CONTIG_CONTIG(src, 0, dst, 0, 1); // same kernel invoked on a single element (len 1, strides 0)
+    #else
+        const npyv_lanetype_f64 src0 = *(npyv_lanetype_f64*)src;
+        *(npyv_lanetype_f64*)dst = c_sqrt_f64(src0); // scalar helper defined outside this chunk
+    #endif
+    }
+#if NPY_SIMD_F64
+clear:;
+#endif
+#if 0 // generated as 0 for this operation: the floating-point status clear below is compiled out
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 206
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_absolute) // DOUBLE_absolute loop: processes dimensions[0] elements from args[0] into args[1] via the simd_DOUBLE_absolute_* kernels
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    const char *src = args[0]; char *dst = args[1]; // input and output byte pointers
+    const npy_intp src_step = steps[0]; // input stride in bytes (added to the char pointer in the loop below)
+    const npy_intp dst_step = steps[1]; // output stride in bytes
+    npy_intp len = dimensions[0]; // element count
+#if NPY_SIMD_F64
+    const int lsize = sizeof(npyv_lanetype_f64); // size of one lane in bytes
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0)); // byte strides must be whole multiples of the lane size
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) { // overlap reported: use the one-element-at-a-time loop
+        goto no_unroll;
+    }
+    const npy_intp ssrc = src_step / lsize; // strides re-expressed in lanes rather than bytes
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_f64(ssrc) || !npyv_storable_stride_f64(sdst)) { // stride not accepted for strided vector load/store
+        goto no_unroll;
+    }
+    if (ssrc == 1 && sdst == 1) { // unit stride on both sides
+        simd_DOUBLE_absolute_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) { // strided input, unit-stride output
+        simd_DOUBLE_absolute_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) { // unit-stride input, strided output
+        simd_DOUBLE_absolute_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else { // both sides strided
+        simd_DOUBLE_absolute_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear; // whole range handled by one kernel call; skip the per-element loop
+no_unroll:
+#endif // NPY_SIMD_F64
+    for (; len > 0; --len, src += src_step, dst += dst_step) { // per-element fallback; the only path when NPY_SIMD_F64 is 0
+    #if NPY_SIMD_F64
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_DOUBLE_absolute_CONTIG_CONTIG(src, 0, dst, 0, 1); // same kernel invoked on a single element (len 1, strides 0)
+    #else
+        const npyv_lanetype_f64 src0 = *(npyv_lanetype_f64*)src;
+        *(npyv_lanetype_f64*)dst = c_abs_f64(src0); // scalar helper defined outside this chunk
+    #endif
+    }
+#if NPY_SIMD_F64
+clear:;
+#endif
+#if 1 // generated as 1 for this operation: the floating-point status clear below is compiled in
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 206
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_square) // DOUBLE_square loop: processes dimensions[0] elements from args[0] into args[1] via the simd_DOUBLE_square_* kernels
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    const char *src = args[0]; char *dst = args[1]; // input and output byte pointers
+    const npy_intp src_step = steps[0]; // input stride in bytes (added to the char pointer in the loop below)
+    const npy_intp dst_step = steps[1]; // output stride in bytes
+    npy_intp len = dimensions[0]; // element count
+#if NPY_SIMD_F64
+    const int lsize = sizeof(npyv_lanetype_f64); // size of one lane in bytes
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0)); // byte strides must be whole multiples of the lane size
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) { // overlap reported: use the one-element-at-a-time loop
+        goto no_unroll;
+    }
+    const npy_intp ssrc = src_step / lsize; // strides re-expressed in lanes rather than bytes
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_f64(ssrc) || !npyv_storable_stride_f64(sdst)) { // stride not accepted for strided vector load/store
+        goto no_unroll;
+    }
+    if (ssrc == 1 && sdst == 1) { // unit stride on both sides
+        simd_DOUBLE_square_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) { // strided input, unit-stride output
+        simd_DOUBLE_square_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) { // unit-stride input, strided output
+        simd_DOUBLE_square_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else { // both sides strided
+        simd_DOUBLE_square_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear; // whole range handled by one kernel call; skip the per-element loop
+no_unroll:
+#endif // NPY_SIMD_F64
+    for (; len > 0; --len, src += src_step, dst += dst_step) { // per-element fallback; the only path when NPY_SIMD_F64 is 0
+    #if NPY_SIMD_F64
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_DOUBLE_square_CONTIG_CONTIG(src, 0, dst, 0, 1); // same kernel invoked on a single element (len 1, strides 0)
+    #else
+        const npyv_lanetype_f64 src0 = *(npyv_lanetype_f64*)src;
+        *(npyv_lanetype_f64*)dst = c_square_f64(src0); // scalar helper defined outside this chunk
+    #endif
+    }
+#if NPY_SIMD_F64
+clear:;
+#endif
+#if 0 // generated as 0 for this operation: the floating-point status clear below is compiled out
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+#line 206
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_reciprocal) // DOUBLE_reciprocal loop: processes dimensions[0] elements from args[0] into args[1] via the simd_DOUBLE_reciprocal_* kernels
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    const char *src = args[0]; char *dst = args[1]; // input and output byte pointers
+    const npy_intp src_step = steps[0]; // input stride in bytes (added to the char pointer in the loop below)
+    const npy_intp dst_step = steps[1]; // output stride in bytes
+    npy_intp len = dimensions[0]; // element count
+#if NPY_SIMD_F64
+    const int lsize = sizeof(npyv_lanetype_f64); // size of one lane in bytes
+    assert(len <= 1 || (src_step % lsize == 0 && dst_step % lsize == 0)); // byte strides must be whole multiples of the lane size
+    if (is_mem_overlap(src, src_step, dst, dst_step, len)) { // overlap reported: use the one-element-at-a-time loop
+        goto no_unroll;
+    }
+    const npy_intp ssrc = src_step / lsize; // strides re-expressed in lanes rather than bytes
+    const npy_intp sdst = dst_step / lsize;
+    if (!npyv_loadable_stride_f64(ssrc) || !npyv_storable_stride_f64(sdst)) { // stride not accepted for strided vector load/store
+        goto no_unroll;
+    }
+    if (ssrc == 1 && sdst == 1) { // unit stride on both sides
+        simd_DOUBLE_reciprocal_CONTIG_CONTIG(src, 1, dst, 1, len);
+    }
+    else if (sdst == 1) { // strided input, unit-stride output
+        simd_DOUBLE_reciprocal_NCONTIG_CONTIG(src, ssrc, dst, 1, len);
+    }
+    else if (ssrc == 1) { // unit-stride input, strided output
+        simd_DOUBLE_reciprocal_CONTIG_NCONTIG(src, 1, dst, sdst, len);
+    } else { // both sides strided
+        simd_DOUBLE_reciprocal_NCONTIG_NCONTIG(src, ssrc, dst, sdst, len);
+    }
+    goto clear; // whole range handled by one kernel call; skip the per-element loop
+no_unroll:
+#endif // NPY_SIMD_F64
+    for (; len > 0; --len, src += src_step, dst += dst_step) { // per-element fallback; the only path when NPY_SIMD_F64 is 0
+    #if NPY_SIMD_F64
+        // to guarantee the same precision and fp/domain errors for both scalars and vectors
+        simd_DOUBLE_reciprocal_CONTIG_CONTIG(src, 0, dst, 0, 1); // same kernel invoked on a single element (len 1, strides 0)
+    #else
+        const npyv_lanetype_f64 src0 = *(npyv_lanetype_f64*)src;
+        *(npyv_lanetype_f64*)dst = c_recip_f64(src0); // scalar helper defined outside this chunk
+    #endif
+    }
+#if NPY_SIMD_F64
+clear:;
+#endif
+#if 0 // generated as 0 for this operation: the floating-point status clear below is compiled out
+    npy_clear_floatstatus_barrier((char*)dimensions);
+#endif
+}
+
+
+
diff --git a/numpy/core/src/_generated/loops_unary_fp_le.dispatch.c b/numpy/core/src/_generated/loops_unary_fp_le.dispatch.c
new file mode 100644
index 000000000000..6729a1a90e50
--- /dev/null
+++ b/numpy/core/src/_generated/loops_unary_fp_le.dispatch.c
@@ -0,0 +1,3742 @@
+#line 1 "numpy/core/src/umath/loops_unary_fp_le.dispatch.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*@targets
+ ** $maxopt baseline
+ ** sse2 sse41
+ ** vsx2
+ ** neon asimd
+ **/
+
+/**
+ * Force the use of SSE only on x86, even if AVX2 or AVX512F are enabled
+ * through the baseline, since scatter (AVX512F) and gather are very costly
+ * for handling non-contiguous memory access compared with SSE for
+ * the small operations that this file covers.
+ */
+#define NPY_SIMD_FORCE_128
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#include <float.h>
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "lowlevel_strided_loops.h"
+// Provides the various *_LOOP macros
+#include "fast_loop_macros.h"
+
+/**
+ * This code should really be merged into loops_unary_fp.dispatch.c.src
+ * However there is an issue with enabling the code here for VX and VXE
+ * as the shifts don't behave as expected.
+ * See the code below that references NPY__CPU_TARGET_VX and
+ * NPY_BIG_ENDIAN. Suspect that this is a big endian vector issue.
+ *
+ * Splitting the files out allows us to keep loops_unary_fp.dispatch.c.src
+ * building for VX and VXE so we don't regress performance while adding this
+ * code for other platforms.
+ */
+// TODO(@seiko2plus): add support for big-endian
+#if NPY_SIMD_BIGENDIAN
+    #undef NPY_SIMD
+    #undef NPY_SIMD_F32
+    #undef NPY_SIMD_F64
+    #define NPY_SIMD 0
+    #define NPY_SIMD_F32 0
+    #define NPY_SIMD_F64 0
+#endif
+
+/*******************************************************************************
+ ** extra SIMD intrinsics
+ ******************************************************************************/
+
+#if NPY_SIMD
+
+/**
+ * We define intrinsics for isnan, isinf, isfinite, and signbit below.  There's
+ * a few flavors of each.  We'll use f32 as an example although f64 versions
+ * are also defined.
+ * 
+ * npyv_u32 npyv_KIND_f32(npyv_f32 v)
+ *   These are mainly used for the single vector loops.  As such, result should
+ *   be bool true / false, ready to write back.
+ * 
+ * npyv_b32 _npyv_KIND_f32(npyv_f32 v)
+ *   These are used by the general intrinsics above as well as the multi-vector
+ *   packing intrinsics.  The multi-vector packing intrinsics are the ones
+ *   utilized in the unrolled vector loops.  Results should be vector masks
+ *   of 0x00/0xff.
+ * 
+ * npyv_u8 npyv_pack_KIND_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+ *   These are the multi-vector packing intrinsics utilized by unrolled vector
+ *   loops.  They perform the operation on all input vectors and pack the
+ *   results to a single npyv_u8.  Assuming NPY_SIMD == 128, that means we
+ *   can pack results from 4x npyv_f32 or 8x npyv_f64 in a single npyv_u8.
+ *   Result should be bool true / false, ready to write back.
+ */
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_u32 // per-lane isnan for one f32 vector, built by inverting npyv_notnan_f32
+npyv_isnan_f32(npyv_f32 v)
+{
+    const npyv_u8 truemask = npyv_reinterpret_u8_u32(npyv_setall_u32(1==1)); // every u32 lane holds 1 (1==1 evaluates to 1 in C)
+    npyv_u8 notnan = npyv_reinterpret_u8_u32(npyv_cvt_u32_b32(npyv_notnan_f32(v))); // not-NaN mask viewed as bytes
+    return npyv_reinterpret_u32_u8(npyv_andc_u8(truemask, notnan)); // presumably truemask & ~notnan, leaving 1 only in NaN lanes - TODO confirm andc operand order
+}
+NPY_FINLINE npyv_u8 // isnan over four f32 vectors, results packed into a single u8 vector
+npyv_pack_isnan_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+    const npyv_u8 truemask = npyv_setall_u8(1==1); // every byte lane holds 1
+    npyv_b32 b0 = npyv_notnan_f32(v0);
+    npyv_b32 b1 = npyv_notnan_f32(v1);
+    npyv_b32 b2 = npyv_notnan_f32(v2);
+    npyv_b32 b3 = npyv_notnan_f32(v3);
+    npyv_b8 notnan = npyv_pack_b8_b32(b0, b1, b2, b3); // four 32-bit masks combined into one 8-bit mask vector
+    return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notnan)); // presumably truemask & ~notnan, i.e. 1 where the input was NaN - TODO confirm andc operand order
+}
+#endif
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_u64 // per-lane isnan for one f64 vector, built by inverting npyv_notnan_f64
+npyv_isnan_f64(npyv_f64 v)
+{
+    const npyv_u8 truemask = npyv_reinterpret_u8_u64(npyv_setall_u64(1==1)); // every u64 lane holds 1 (1==1 evaluates to 1 in C)
+    npyv_u8 notnan = npyv_reinterpret_u8_u64(npyv_cvt_u64_b64(npyv_notnan_f64(v))); // not-NaN mask viewed as bytes
+    return npyv_reinterpret_u64_u8(npyv_andc_u8(truemask, notnan)); // presumably truemask & ~notnan, leaving 1 only in NaN lanes - TODO confirm andc operand order
+}
+NPY_FINLINE npyv_u8 // isnan over eight f64 vectors, results packed into a single u8 vector
+npyv_pack_isnan_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+                    npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+    const npyv_u8 truemask = npyv_setall_u8(1==1); // every byte lane holds 1
+    npyv_b64 b0 = npyv_notnan_f64(v0);
+    npyv_b64 b1 = npyv_notnan_f64(v1);
+    npyv_b64 b2 = npyv_notnan_f64(v2);
+    npyv_b64 b3 = npyv_notnan_f64(v3);
+    npyv_b64 b4 = npyv_notnan_f64(v4);
+    npyv_b64 b5 = npyv_notnan_f64(v5);
+    npyv_b64 b6 = npyv_notnan_f64(v6);
+    npyv_b64 b7 = npyv_notnan_f64(v7);
+    npyv_b8 notnan = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7); // eight 64-bit masks combined into one 8-bit mask vector
+    return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notnan)); // presumably truemask & ~notnan, i.e. 1 where the input was NaN - TODO confirm andc operand order
+}
+#endif
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_b32 // mask-returning isinf helper for one f32 vector (shared by npyv_isinf_f32 and npyv_pack_isinf_f32)
+_npyv_isinf_f32(npyv_f32 v)
+{
+#if defined(NPY_HAVE_NEON)
+    // abs(v) > FLT_MAX
+    const npyv_f32 fltmax = npyv_setall_f32(FLT_MAX);
+    return vcagtq_f32(v, fltmax); // NEON absolute-value compare-greater-than
+#else
+    // shift out the sign bit, then check that all exponent bits are set and the mantissa is zero.
+    const npyv_u32 exp_mask = npyv_setall_u32(0xff000000); // 0x7f800000 (the +inf bit pattern) shifted left by one
+    npyv_u32 bits = npyv_shli_u32(npyv_reinterpret_u32_f32(v), 1); // left shift by one discards the sign bit
+    return npyv_cmpeq_u32(bits, exp_mask); // exact match, so any nonzero mantissa bit (NaN) fails the test
+#endif
+}
+NPY_FINLINE npyv_u32 // per-lane isinf for one f32 vector, reduced to 0/1 per lane
+npyv_isinf_f32(npyv_f32 v)
+{
+    const npyv_u32 truemask = npyv_setall_u32(1==1); // every u32 lane holds 1
+    return npyv_and_u32(truemask, npyv_cvt_u32_b32(_npyv_isinf_f32(v))); // AND with 1 keeps only the lowest bit of each mask lane
+}
+NPY_FINLINE npyv_u8 // isinf over four f32 vectors, results packed into a single u8 vector
+npyv_pack_isinf_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+    const npyv_u8 truemask = npyv_setall_u8(1==1); // every byte lane holds 1
+    npyv_b32 b0 = _npyv_isinf_f32(v0);
+    npyv_b32 b1 = _npyv_isinf_f32(v1);
+    npyv_b32 b2 = _npyv_isinf_f32(v2);
+    npyv_b32 b3 = _npyv_isinf_f32(v3);
+    npyv_b8 isinf = npyv_pack_b8_b32(b0, b1, b2, b3); // four 32-bit masks combined into one 8-bit mask vector
+    return npyv_and_u8(truemask, npyv_cvt_u8_b8(isinf)); // AND with 1 keeps only the lowest bit of each mask byte
+}
+#endif // NPY_SIMD_F32
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_b64 // mask-returning isinf helper for one f64 vector (shared by npyv_isinf_f64 and npyv_pack_isinf_f64)
+_npyv_isinf_f64(npyv_f64 v)
+{
+#if defined(NPY_HAVE_NEON)
+    // abs(v) > DBL_MAX
+    const npyv_f64 fltmax = npyv_setall_f64(DBL_MAX); // despite the name, this holds DBL_MAX
+    return vcagtq_f64(v, fltmax); // NEON absolute-value compare-greater-than
+#else
+    // shift out the sign bit, then check that all exponent bits are set and the mantissa is zero.
+    const npyv_u64 exp_mask = npyv_setall_u64(0xffe0000000000000); // 0x7ff0000000000000 (the +inf bit pattern) shifted left by one
+    npyv_u64 bits = npyv_shli_u64(npyv_reinterpret_u64_f64(v), 1); // left shift by one discards the sign bit
+    return npyv_cmpeq_u64(bits, exp_mask); // exact match, so any nonzero mantissa bit (NaN) fails the test
+#endif
+}
+NPY_FINLINE npyv_u64 // per-lane isinf for one f64 vector, reduced to 0/1 per lane
+npyv_isinf_f64(npyv_f64 v)
+{
+    const npyv_u64 truemask = npyv_setall_u64(1==1); // every u64 lane holds 1
+    return npyv_and_u64(truemask, npyv_cvt_u64_b64(_npyv_isinf_f64(v))); // AND with 1 keeps only the lowest bit of each mask lane
+}
+NPY_FINLINE npyv_u8 // isinf over eight f64 vectors, results packed into a single u8 vector
+npyv_pack_isinf_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+                    npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+    const npyv_u8 truemask = npyv_setall_u8(1==1); // every byte lane holds 1
+    npyv_b64 b0 = _npyv_isinf_f64(v0);
+    npyv_b64 b1 = _npyv_isinf_f64(v1);
+    npyv_b64 b2 = _npyv_isinf_f64(v2);
+    npyv_b64 b3 = _npyv_isinf_f64(v3);
+    npyv_b64 b4 = _npyv_isinf_f64(v4);
+    npyv_b64 b5 = _npyv_isinf_f64(v5);
+    npyv_b64 b6 = _npyv_isinf_f64(v6);
+    npyv_b64 b7 = _npyv_isinf_f64(v7);
+    npyv_b8 isinf = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7); // eight 64-bit masks combined into one 8-bit mask vector
+    return npyv_and_u8(truemask, npyv_cvt_u8_b8(isinf)); // AND with 1 keeps only the lowest bit of each mask byte
+}
+#endif // NPY_SIMD_F64
+
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_b32 // mask of f32 lanes whose exponent bits are all set (i.e. inf or NaN)
+npyv_notfinite_f32(npyv_f32 v)
+{
+    // mask off the sign and mantissa, then check whether all exponent bits
+    // are set, regardless of what the mantissa is.
+    const npyv_u32 exp_mask = npyv_setall_u32(0x7f800000); // the 8 exponent bits of a 32-bit float
+    npyv_u32 bits = npyv_reinterpret_u32_f32(v);
+    bits = npyv_and_u32(bits, exp_mask); // keep only the exponent field
+    return npyv_cmpeq_u32(bits, exp_mask);
+}
+NPY_FINLINE npyv_u32 // per-lane isfinite for one f32 vector, built by inverting npyv_notfinite_f32
+npyv_isfinite_f32(npyv_f32 v)
+{
+    const npyv_u8 truemask = npyv_reinterpret_u8_u32(npyv_setall_u32(1==1)); // every u32 lane holds 1
+    npyv_u8 notfinite = npyv_reinterpret_u8_u32(npyv_cvt_u32_b32(npyv_notfinite_f32(v))); // not-finite mask viewed as bytes
+    return npyv_reinterpret_u32_u8(npyv_andc_u8(truemask, notfinite)); // presumably truemask & ~notfinite, leaving 1 only in finite lanes - TODO confirm andc operand order
+}
+NPY_FINLINE npyv_u8 // isfinite over four f32 vectors, results packed into a single u8 vector
+npyv_pack_isfinite_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+    // F32 exponent is 8-bits, which means we can pack multiple into
+    // a single vector.  We shift out sign bit so that we're left
+    // with only exponent in high byte.  If not all bits are set,
+    // then we've got a finite number.
+    uint8x16x4_t tbl; // the four shifted inputs viewed as one 64-byte lookup table
+    tbl.val[0] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v0), 1));
+    tbl.val[1] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v1), 1));
+    tbl.val[2] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v2), 1));
+    tbl.val[3] = npyv_reinterpret_u8_u32(npyv_shli_u32(npyv_reinterpret_u32_f32(v3), 1));
+
+    const npyv_u8 permute = {3,7,11,15,  19,23,27,31,  35,39,43,47,  51,55,59,63}; // byte index 3 of every 4-byte lane across the table
+    npyv_u8 r = vqtbl4q_u8(tbl, permute); // gather the selected (exponent) bytes into one vector
+
+    const npyv_u8 expmask = npyv_setall_u8(0xff);
+    r = npyv_cmpneq_u8(r, expmask); // byte differs from 0xff: exponent not all ones, so the value is finite
+    r = vshrq_n_u8(r, 7); // shift each byte right by 7 so only its top bit remains, as 0/1
+    return r;
+#else
+    const npyv_u8 truemask = npyv_setall_u8(1==1); // every byte lane holds 1
+    npyv_b32 b0 = npyv_notfinite_f32(v0);
+    npyv_b32 b1 = npyv_notfinite_f32(v1);
+    npyv_b32 b2 = npyv_notfinite_f32(v2);
+    npyv_b32 b3 = npyv_notfinite_f32(v3);
+    npyv_b8 notfinite = npyv_pack_b8_b32(b0, b1, b2, b3); // four 32-bit masks combined into one 8-bit mask vector
+    return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notfinite)); // presumably truemask & ~notfinite - TODO confirm andc operand order
+#endif
+}
+#endif // NPY_SIMD_F32
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_b64 // mask of f64 lanes whose exponent bits are all set (i.e. inf or NaN)
+npyv_notfinite_f64(npyv_f64 v)
+{
+    // mask off the sign and mantissa, then check whether all exponent bits
+    // are set, regardless of what the mantissa is.
+    const npyv_u64 exp_mask = npyv_setall_u64(0x7ff0000000000000); // the 11 exponent bits of a 64-bit float
+    npyv_u64 bits = npyv_reinterpret_u64_f64(v);
+    bits = npyv_and_u64(bits, exp_mask); // keep only the exponent field
+    return npyv_cmpeq_u64(bits, exp_mask);
+}
+NPY_FINLINE npyv_u64 // per-lane isfinite for one f64 vector, built by inverting npyv_notfinite_f64
+npyv_isfinite_f64(npyv_f64 v)
+{
+    const npyv_u8 truemask = npyv_reinterpret_u8_u64(npyv_setall_u64(1==1)); // every u64 lane holds 1
+    npyv_u8 notfinite = npyv_reinterpret_u8_u64(npyv_cvt_u64_b64(npyv_notfinite_f64(v))); // not-finite mask viewed as bytes
+    return npyv_reinterpret_u64_u8(npyv_andc_u8(truemask, notfinite)); // presumably truemask & ~notfinite, leaving 1 only in finite lanes - TODO confirm andc operand order
+}
+NPY_FINLINE npyv_u8 // isfinite over eight f64 vectors, results packed into a single u8 vector
+npyv_pack_isfinite_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+                       npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+    // F64 exponent is 11-bits, which means we can pack multiple into
+    // a single vector.  We'll need to use u16 to fit all exponent
+    // bits.  If not all bits are set, then we've got a finite number.
+    uint8x16x4_t t0123, t4567; // two 64-byte lookup tables, four input vectors each
+    t0123.val[0] = npyv_reinterpret_u8_f64(v0);
+    t0123.val[1] = npyv_reinterpret_u8_f64(v1);
+    t0123.val[2] = npyv_reinterpret_u8_f64(v2);
+    t0123.val[3] = npyv_reinterpret_u8_f64(v3);
+    t4567.val[0] = npyv_reinterpret_u8_f64(v4);
+    t4567.val[1] = npyv_reinterpret_u8_f64(v5);
+    t4567.val[2] = npyv_reinterpret_u8_f64(v6);
+    t4567.val[3] = npyv_reinterpret_u8_f64(v7);
+
+    const npyv_u8 permute = {6,7,14,15,  22,23,30,31,  38,39,46,47,  54,55,62,63}; // byte indices 6 and 7 of every 8-byte lane across the table
+    npyv_u16 r0 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t0123, permute)); // one u16 per double from v0..v3
+    npyv_u16 r1 = npyv_reinterpret_u16_u8(vqtbl4q_u8(t4567, permute)); // one u16 per double from v4..v7
+
+    const npyv_u16 expmask = npyv_setall_u16(0x7ff0); // exponent field within the selected 16 bits
+    r0 = npyv_and_u16(r0, expmask);
+    r0 = npyv_cmpneq_u16(r0, expmask); // exponent not all ones, so the value is finite
+    r0 = npyv_shri_u16(r0, 15); // shift right by 15 so only the top bit remains, as 0/1
+    r1 = npyv_and_u16(r1, expmask);
+    r1 = npyv_cmpneq_u16(r1, expmask);
+    r1 = npyv_shri_u16(r1, 15);
+
+    npyv_u8 r = npyv_pack_b8_b16(r0, r1); // combine the two u16 vectors into one u8 vector
+    return r;
+#else
+    const npyv_u8 truemask = npyv_setall_u8(1==1); // every byte lane holds 1
+    npyv_b64 b0 = npyv_notfinite_f64(v0);
+    npyv_b64 b1 = npyv_notfinite_f64(v1);
+    npyv_b64 b2 = npyv_notfinite_f64(v2);
+    npyv_b64 b3 = npyv_notfinite_f64(v3);
+    npyv_b64 b4 = npyv_notfinite_f64(v4);
+    npyv_b64 b5 = npyv_notfinite_f64(v5);
+    npyv_b64 b6 = npyv_notfinite_f64(v6);
+    npyv_b64 b7 = npyv_notfinite_f64(v7);
+    npyv_b8 notfinite = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7); // eight 64-bit masks combined into one 8-bit mask vector
+    return npyv_andc_u8(truemask, npyv_cvt_u8_b8(notfinite)); // presumably truemask & ~notfinite - TODO confirm andc operand order
+#endif
+}
+#endif // NPY_SIMD_F64
+
+#if NPY_SIMD_F32
+NPY_FINLINE npyv_u32 // per-lane signbit for one f32 vector, as 0/1 in each u32 lane
+npyv_signbit_f32(npyv_f32 v)
+{
+    return npyv_shri_u32(npyv_reinterpret_u32_f32(v), (sizeof(npyv_lanetype_f32)*8)-1); // shift right by (bit width - 1) so the top (sign) bit lands in bit 0
+}
+NPY_FINLINE npyv_u8 // signbit over four f32 vectors, results packed into a single u8 vector
+npyv_pack_signbit_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+    // We only need high byte for signbit, which means we can pack
+    // multiple inputs into a single vector.
+    uint8x16x4_t tbl; // the four inputs viewed as one 64-byte lookup table
+    tbl.val[0] = npyv_reinterpret_u8_f32(v0);
+    tbl.val[1] = npyv_reinterpret_u8_f32(v1);
+    tbl.val[2] = npyv_reinterpret_u8_f32(v2);
+    tbl.val[3] = npyv_reinterpret_u8_f32(v3);
+
+    const npyv_u8 permute = {3,7,11,15,  19,23,27,31,  35,39,43,47,  51,55,59,63}; // byte index 3 of every 4-byte lane across the table
+    npyv_u8 r = vqtbl4q_u8(tbl, permute); // gather the selected bytes into one vector
+            r = vshrq_n_u8(r, 7); // shift each byte right by 7 so only its top bit remains, as 0/1
+    return r;
+#else
+    npyv_b32 b0 = npyv_cvt_b32_u32(npyv_signbit_f32(v0)); // 0/1 lanes reinterpreted as a b32 mask type for packing
+    npyv_b32 b1 = npyv_cvt_b32_u32(npyv_signbit_f32(v1));
+    npyv_b32 b2 = npyv_cvt_b32_u32(npyv_signbit_f32(v2));
+    npyv_b32 b3 = npyv_cvt_b32_u32(npyv_signbit_f32(v3));
+    npyv_b8 signbit = npyv_pack_b8_b32(b0, b1, b2, b3); // NOTE(review): assumes the pack keeps the 0/1 lane values intact - confirm
+    return npyv_cvt_u8_b8(signbit);
+#endif
+}
+#endif // NPY_SIMD_F32
+#if NPY_SIMD_F64
+NPY_FINLINE npyv_u64 // per-lane signbit for one f64 vector, as 0/1 in each u64 lane
+npyv_signbit_f64(npyv_f64 v)
+{
+    return npyv_shri_u64(npyv_reinterpret_u64_f64(v), (sizeof(npyv_lanetype_f64)*8)-1); // shift right by (bit width - 1) so the top (sign) bit lands in bit 0
+}
+NPY_FINLINE npyv_u8 // signbit over eight f64 vectors, results packed into a single u8 vector
+npyv_pack_signbit_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3,
+                      npyv_f64 v4, npyv_f64 v5, npyv_f64 v6, npyv_f64 v7)
+{
+#if defined(NPY_HAVE_NEON) && defined(__aarch64__)
+    // We only need high byte for signbit, which means we can pack
+    // multiple inputs into a single vector.
+
+    // vuzp2 faster than vtbl for f64
+    npyv_u32 v01 = vuzp2q_u32(npyv_reinterpret_u32_f64(v0), npyv_reinterpret_u32_f64(v1)); // NEON unzip: keeps the odd-indexed 32-bit elements of both inputs
+    npyv_u32 v23 = vuzp2q_u32(npyv_reinterpret_u32_f64(v2), npyv_reinterpret_u32_f64(v3));
+    npyv_u32 v45 = vuzp2q_u32(npyv_reinterpret_u32_f64(v4), npyv_reinterpret_u32_f64(v5));
+    npyv_u32 v67 = vuzp2q_u32(npyv_reinterpret_u32_f64(v6), npyv_reinterpret_u32_f64(v7));
+
+    npyv_u16 v0123 = vuzp2q_u16(npyv_reinterpret_u16_u32(v01), npyv_reinterpret_u16_u32(v23)); // narrow again: odd-indexed 16-bit elements
+    npyv_u16 v4567 = vuzp2q_u16(npyv_reinterpret_u16_u32(v45), npyv_reinterpret_u16_u32(v67));
+
+    npyv_u8 r = vuzp2q_u8(npyv_reinterpret_u8_u16(v0123), npyv_reinterpret_u8_u16(v4567)); // and once more: odd-indexed bytes
+            r = vshrq_n_u8(r, 7); // shift each byte right by 7 so only its top bit remains, as 0/1
+    return r;
+#else
+    npyv_b64 b0 = npyv_cvt_b64_u64(npyv_signbit_f64(v0)); // 0/1 lanes reinterpreted as a b64 mask type for packing
+    npyv_b64 b1 = npyv_cvt_b64_u64(npyv_signbit_f64(v1));
+    npyv_b64 b2 = npyv_cvt_b64_u64(npyv_signbit_f64(v2));
+    npyv_b64 b3 = npyv_cvt_b64_u64(npyv_signbit_f64(v3));
+    npyv_b64 b4 = npyv_cvt_b64_u64(npyv_signbit_f64(v4));
+    npyv_b64 b5 = npyv_cvt_b64_u64(npyv_signbit_f64(v5));
+    npyv_b64 b6 = npyv_cvt_b64_u64(npyv_signbit_f64(v6));
+    npyv_b64 b7 = npyv_cvt_b64_u64(npyv_signbit_f64(v7));
+    npyv_b8 signbit = npyv_pack_b8_b64(b0, b1, b2, b3, b4, b5, b6, b7); // NOTE(review): assumes the pack keeps the 0/1 lane values intact - confirm
+    return npyv_cvt_u8_b8(signbit);
+#endif
+}
+#endif // NPY_SIMD_F64
+
+#endif // NPY_SIMD
+
+/********************************************************************************
+ ** Defining the SIMD kernels
+ ********************************************************************************/
+/** Notes:
+ * - avoid the use of libmath to unify fp/domain errors
+ *   for both scalars and vectors among all compilers/architectures.
+ * - use the intrinsic npyv_load_till_* instead of npyv_load_tillz_*
+ *   to fill the remaining lanes with 1.0 to avoid a divide-by-zero fp
+ *   exception in reciprocal.
+ */
+#define CONTIG  0
+#define NCONTIG 1
+
+#line 406
+#if NPY_SIMD_F32
+#line 410
+#line 414
+static void simd_unary_isnan_FLOAT_CONTIG_CONTIG // isnan kernel: reads len f32 values from src, writes one npy_bool per value to dst
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_f32 *ip = src; // typed input pointer, so istride counts f32 elements rather than bytes
+    npy_bool *op = dst; // typed output pointer, so ostride counts npy_bool elements
+
+    // How many vectors can be packed into a u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f32)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_f32; // elements consumed per single vector
+    const int wstep = vstep * PACK_FACTOR; // elements consumed per unrolled iteration
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if CONTIG == CONTIG // input-contiguity test baked in by the generator; true for this variant
+            // contiguous input
+            npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+            npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+            npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+            npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+            npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+            npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+            npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input (compiled out in this variant)
+            npyv_f32 v0 = npyv_loadn_f32(ip + istride * vstep * 0, istride);
+            npyv_f32 v1 = npyv_loadn_f32(ip + istride * vstep * 1, istride);
+            npyv_f32 v2 = npyv_loadn_f32(ip + istride * vstep * 2, istride);
+            npyv_f32 v3 = npyv_loadn_f32(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_loadn_f32(ip + istride * vstep * 4, istride);
+            npyv_f32 v5 = npyv_loadn_f32(ip + istride * vstep * 5, istride);
+            npyv_f32 v6 = npyv_loadn_f32(ip + istride * vstep * 6, istride);
+            npyv_f32 v7 = npyv_loadn_f32(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isnan_f32(v0, v1, v2, v3); // one 0/1 byte per input element
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isnan_f32(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if CONTIG == CONTIG // output-contiguity test baked in by the generator; true for this variant
+            npyv_store_u8(op, r); // packed results written with a single vector store
+        #else // CONTIG == CONTIG
+            // Results are packed, so we can just loop over them
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln=0; (ln * sizeof(npyv_lanetype_f32)) < npyv_nlanes_u8; ++ln){
+                op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_f32)];
+            }
+        #endif // CONTIG == CONTIG
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if CONTIG == CONTIG // true for this variant: plain contiguous load
+        npyv_f32 v = npyv_load_f32(ip);
+    #else
+        npyv_f32 v = npyv_loadn_f32(ip, istride);
+    #endif
+
+        npyv_u32 r = npyv_isnan_f32(v); // one 0/1 value per 32-bit lane
+
+        npy_uint8 lane[npyv_nlanes_u8]; // scratch buffer holding the result vector as bytes
+        npyv_store_u8(lane, npyv_reinterpret_u8_u32(r));
+
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f32)]; // first byte of result lane 0
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f32)]; // first byte of result lane 1
+        #if npyv_nlanes_f32 == 4
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f32)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f32)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isnan(*ip) != 0); // normalise the npy_isnan result to exactly 0 or 1
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+static void simd_unary_isnan_FLOAT_NCONTIG_CONTIG  // SIMD isnan: tests `len` f32 values from src, writes npy_bool results to dst
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)  // strides count elements, not bytes
+{
+    const npyv_lanetype_f32 *ip = src;
+    npy_bool *op = dst;
+
+    // How many f32 vectors can be packed into one u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f32)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);  // the unrolled loop only has code for these two factors
+
+    const int vstep = npyv_nlanes_f32;  // elements per f32 vector
+    const int wstep = vstep * PACK_FACTOR;  // elements per unrolled pass
+
+    // unrolled iterations: PACK_FACTOR vectors (wstep elements) per pass
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if NCONTIG == CONTIG  // true only if NCONTIG and CONTIG (defined outside this function) are equal
+            // contiguous input
+            npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+            npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+            npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+            npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+            npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+            npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+            npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f32 v0 = npyv_loadn_f32(ip + istride * vstep * 0, istride);
+            npyv_f32 v1 = npyv_loadn_f32(ip + istride * vstep * 1, istride);
+            npyv_f32 v2 = npyv_loadn_f32(ip + istride * vstep * 2, istride);
+            npyv_f32 v3 = npyv_loadn_f32(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_loadn_f32(ip + istride * vstep * 4, istride);
+            npyv_f32 v5 = npyv_loadn_f32(ip + istride * vstep * 5, istride);
+            npyv_f32 v6 = npyv_loadn_f32(ip + istride * vstep * 6, istride);
+            npyv_f32 v7 = npyv_loadn_f32(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isnan_f32(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isnan_f32(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if CONTIG == CONTIG  // always true: CONTIG is compared with itself, so output is stored contiguously
+            npyv_store_u8(op, r);
+        #else // CONTIG == CONTIG
+            // Never compiled in this variant. NOTE(review): r looks byte-packed, yet lane[] is read in sizeof(f32) steps - confirm
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln=0; (ln * sizeof(npyv_lanetype_f32)) < npyv_nlanes_u8; ++ln){
+                op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_f32)];
+            }
+        #endif // CONTIG == CONTIG
+    }
+
+    // vector-sized iterations: one f32 vector (vstep elements) per pass
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if NCONTIG == CONTIG  // same input-layout switch as in the unrolled loop
+        npyv_f32 v = npyv_load_f32(ip);
+    #else
+        npyv_f32 v = npyv_loadn_f32(ip, istride);
+    #endif
+
+        npyv_u32 r = npyv_isnan_f32(v);  // one 32-bit result lane per input element
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u32(r));  // spill the result vector to a byte array
+
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f32)];  // take the byte at the start of each f32-sized lane
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f32)];
+        #if npyv_nlanes_f32 == 4
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f32)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f32)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off the remaining (fewer than vstep) elements
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isnan(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+static void simd_unary_isnan_FLOAT_CONTIG_NCONTIG  // SIMD isnan: tests `len` f32 values from src, writes npy_bool results to dst
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)  // strides count elements, not bytes
+{
+    const npyv_lanetype_f32 *ip = src;
+    npy_bool *op = dst;
+
+    // How many f32 vectors can be packed into one u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f32)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);  // the unrolled loop only has code for these two factors
+
+    const int vstep = npyv_nlanes_f32;  // elements per f32 vector
+    const int wstep = vstep * PACK_FACTOR;  // elements per unrolled pass
+
+    // unrolled iterations: PACK_FACTOR vectors (wstep elements) per pass
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if CONTIG == CONTIG  // always true: CONTIG is compared with itself, so input is loaded contiguously
+            // contiguous input
+            npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+            npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+            npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+            npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+            npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+            npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+            npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f32 v0 = npyv_loadn_f32(ip + istride * vstep * 0, istride);
+            npyv_f32 v1 = npyv_loadn_f32(ip + istride * vstep * 1, istride);
+            npyv_f32 v2 = npyv_loadn_f32(ip + istride * vstep * 2, istride);
+            npyv_f32 v3 = npyv_loadn_f32(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_loadn_f32(ip + istride * vstep * 4, istride);
+            npyv_f32 v5 = npyv_loadn_f32(ip + istride * vstep * 5, istride);
+            npyv_f32 v6 = npyv_loadn_f32(ip + istride * vstep * 6, istride);
+            npyv_f32 v7 = npyv_loadn_f32(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isnan_f32(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isnan_f32(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if NCONTIG == CONTIG  // true only if NCONTIG and CONTIG (defined outside this function) are equal
+            npyv_store_u8(op, r);
+        #else // NCONTIG == CONTIG
+            // r holds one result byte per element (the contiguous branch stores it as-is), so scatter every u8 lane
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln=0; ln < npyv_nlanes_u8; ++ln){
+                op[ln * ostride] = lane[ln];
+            }
+        #endif // NCONTIG == CONTIG
+    }
+
+    // vector-sized iterations: one f32 vector (vstep elements) per pass
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if CONTIG == CONTIG  // same input-layout switch as in the unrolled loop
+        npyv_f32 v = npyv_load_f32(ip);
+    #else
+        npyv_f32 v = npyv_loadn_f32(ip, istride);
+    #endif
+
+        npyv_u32 r = npyv_isnan_f32(v);  // one 32-bit result lane per input element (not byte-packed)
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u32(r));  // spill the result vector to a byte array
+
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f32)];  // take the byte at the start of each f32-sized lane
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f32)];
+        #if npyv_nlanes_f32 == 4
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f32)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f32)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off the remaining (fewer than vstep) elements
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isnan(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+static void simd_unary_isnan_FLOAT_NCONTIG_NCONTIG  // SIMD isnan: tests `len` f32 values from src, writes npy_bool results to dst
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)  // strides count elements, not bytes
+{
+    const npyv_lanetype_f32 *ip = src;
+    npy_bool *op = dst;
+
+    // How many f32 vectors can be packed into one u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f32)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);  // the unrolled loop only has code for these two factors
+
+    const int vstep = npyv_nlanes_f32;  // elements per f32 vector
+    const int wstep = vstep * PACK_FACTOR;  // elements per unrolled pass
+
+    // unrolled iterations: PACK_FACTOR vectors (wstep elements) per pass
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if NCONTIG == CONTIG  // true only if NCONTIG and CONTIG (defined outside this function) are equal
+            // contiguous input
+            npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+            npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+            npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+            npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+            npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+            npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+            npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f32 v0 = npyv_loadn_f32(ip + istride * vstep * 0, istride);
+            npyv_f32 v1 = npyv_loadn_f32(ip + istride * vstep * 1, istride);
+            npyv_f32 v2 = npyv_loadn_f32(ip + istride * vstep * 2, istride);
+            npyv_f32 v3 = npyv_loadn_f32(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_loadn_f32(ip + istride * vstep * 4, istride);
+            npyv_f32 v5 = npyv_loadn_f32(ip + istride * vstep * 5, istride);
+            npyv_f32 v6 = npyv_loadn_f32(ip + istride * vstep * 6, istride);
+            npyv_f32 v7 = npyv_loadn_f32(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isnan_f32(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isnan_f32(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if NCONTIG == CONTIG  // true only if NCONTIG and CONTIG (defined outside this function) are equal
+            npyv_store_u8(op, r);
+        #else // NCONTIG == CONTIG
+            // r holds one result byte per element (the contiguous branch stores it as-is), so scatter every u8 lane
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln=0; ln < npyv_nlanes_u8; ++ln){
+                op[ln * ostride] = lane[ln];
+            }
+        #endif // NCONTIG == CONTIG
+    }
+
+    // vector-sized iterations: one f32 vector (vstep elements) per pass
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if NCONTIG == CONTIG  // same input-layout switch as in the unrolled loop
+        npyv_f32 v = npyv_load_f32(ip);
+    #else
+        npyv_f32 v = npyv_loadn_f32(ip, istride);
+    #endif
+
+        npyv_u32 r = npyv_isnan_f32(v);  // one 32-bit result lane per input element (not byte-packed)
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u32(r));  // spill the result vector to a byte array
+
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f32)];  // take the byte at the start of each f32-sized lane
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f32)];
+        #if npyv_nlanes_f32 == 4
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f32)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f32)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off the remaining (fewer than vstep) elements
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isnan(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+
+#line 410
+#line 414
+static void simd_unary_isinf_FLOAT_CONTIG_CONTIG  // SIMD isinf: tests `len` f32 values from src, writes npy_bool results to dst
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)  // strides count elements, not bytes
+{
+    const npyv_lanetype_f32 *ip = src;
+    npy_bool *op = dst;
+
+    // How many f32 vectors can be packed into one u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f32)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);  // the unrolled loop only has code for these two factors
+
+    const int vstep = npyv_nlanes_f32;  // elements per f32 vector
+    const int wstep = vstep * PACK_FACTOR;  // elements per unrolled pass
+
+    // unrolled iterations: PACK_FACTOR vectors (wstep elements) per pass
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if CONTIG == CONTIG  // always true: CONTIG is compared with itself, so input is loaded contiguously
+            // contiguous input
+            npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+            npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+            npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+            npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+            npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+            npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+            npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f32 v0 = npyv_loadn_f32(ip + istride * vstep * 0, istride);
+            npyv_f32 v1 = npyv_loadn_f32(ip + istride * vstep * 1, istride);
+            npyv_f32 v2 = npyv_loadn_f32(ip + istride * vstep * 2, istride);
+            npyv_f32 v3 = npyv_loadn_f32(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_loadn_f32(ip + istride * vstep * 4, istride);
+            npyv_f32 v5 = npyv_loadn_f32(ip + istride * vstep * 5, istride);
+            npyv_f32 v6 = npyv_loadn_f32(ip + istride * vstep * 6, istride);
+            npyv_f32 v7 = npyv_loadn_f32(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isinf_f32(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isinf_f32(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if CONTIG == CONTIG  // always true: CONTIG is compared with itself, so output is stored contiguously
+            npyv_store_u8(op, r);
+        #else // CONTIG == CONTIG
+            // Never compiled in this variant. NOTE(review): r looks byte-packed, yet lane[] is read in sizeof(f32) steps - confirm
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln=0; (ln * sizeof(npyv_lanetype_f32)) < npyv_nlanes_u8; ++ln){
+                op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_f32)];
+            }
+        #endif // CONTIG == CONTIG
+    }
+
+    // vector-sized iterations: one f32 vector (vstep elements) per pass
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if CONTIG == CONTIG  // same input-layout switch as in the unrolled loop
+        npyv_f32 v = npyv_load_f32(ip);
+    #else
+        npyv_f32 v = npyv_loadn_f32(ip, istride);
+    #endif
+
+        npyv_u32 r = npyv_isinf_f32(v);  // one 32-bit result lane per input element
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u32(r));  // spill the result vector to a byte array
+
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f32)];  // take the byte at the start of each f32-sized lane
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f32)];
+        #if npyv_nlanes_f32 == 4
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f32)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f32)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off the remaining (fewer than vstep) elements
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isinf(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+static void simd_unary_isinf_FLOAT_NCONTIG_CONTIG  // SIMD isinf: tests `len` f32 values from src, writes npy_bool results to dst
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)  // strides count elements, not bytes
+{
+    const npyv_lanetype_f32 *ip = src;
+    npy_bool *op = dst;
+
+    // How many f32 vectors can be packed into one u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f32)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);  // the unrolled loop only has code for these two factors
+
+    const int vstep = npyv_nlanes_f32;  // elements per f32 vector
+    const int wstep = vstep * PACK_FACTOR;  // elements per unrolled pass
+
+    // unrolled iterations: PACK_FACTOR vectors (wstep elements) per pass
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if NCONTIG == CONTIG  // true only if NCONTIG and CONTIG (defined outside this function) are equal
+            // contiguous input
+            npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+            npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+            npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+            npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+            npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+            npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+            npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f32 v0 = npyv_loadn_f32(ip + istride * vstep * 0, istride);
+            npyv_f32 v1 = npyv_loadn_f32(ip + istride * vstep * 1, istride);
+            npyv_f32 v2 = npyv_loadn_f32(ip + istride * vstep * 2, istride);
+            npyv_f32 v3 = npyv_loadn_f32(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_loadn_f32(ip + istride * vstep * 4, istride);
+            npyv_f32 v5 = npyv_loadn_f32(ip + istride * vstep * 5, istride);
+            npyv_f32 v6 = npyv_loadn_f32(ip + istride * vstep * 6, istride);
+            npyv_f32 v7 = npyv_loadn_f32(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isinf_f32(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isinf_f32(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if CONTIG == CONTIG  // always true: CONTIG is compared with itself, so output is stored contiguously
+            npyv_store_u8(op, r);
+        #else // CONTIG == CONTIG
+            // Never compiled in this variant. NOTE(review): r looks byte-packed, yet lane[] is read in sizeof(f32) steps - confirm
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln=0; (ln * sizeof(npyv_lanetype_f32)) < npyv_nlanes_u8; ++ln){
+                op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_f32)];
+            }
+        #endif // CONTIG == CONTIG
+    }
+
+    // vector-sized iterations: one f32 vector (vstep elements) per pass
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if NCONTIG == CONTIG  // same input-layout switch as in the unrolled loop
+        npyv_f32 v = npyv_load_f32(ip);
+    #else
+        npyv_f32 v = npyv_loadn_f32(ip, istride);
+    #endif
+
+        npyv_u32 r = npyv_isinf_f32(v);  // one 32-bit result lane per input element
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u32(r));  // spill the result vector to a byte array
+
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f32)];  // take the byte at the start of each f32-sized lane
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f32)];
+        #if npyv_nlanes_f32 == 4
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f32)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f32)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off the remaining (fewer than vstep) elements
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isinf(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+static void simd_unary_isinf_FLOAT_CONTIG_NCONTIG  // SIMD isinf: tests `len` f32 values from src, writes npy_bool results to dst
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)  // strides count elements, not bytes
+{
+    const npyv_lanetype_f32 *ip = src;
+    npy_bool *op = dst;
+
+    // How many f32 vectors can be packed into one u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f32)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);  // the unrolled loop only has code for these two factors
+
+    const int vstep = npyv_nlanes_f32;  // elements per f32 vector
+    const int wstep = vstep * PACK_FACTOR;  // elements per unrolled pass
+
+    // unrolled iterations: PACK_FACTOR vectors (wstep elements) per pass
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if CONTIG == CONTIG  // always true: CONTIG is compared with itself, so input is loaded contiguously
+            // contiguous input
+            npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+            npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+            npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+            npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+            npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+            npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+            npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f32 v0 = npyv_loadn_f32(ip + istride * vstep * 0, istride);
+            npyv_f32 v1 = npyv_loadn_f32(ip + istride * vstep * 1, istride);
+            npyv_f32 v2 = npyv_loadn_f32(ip + istride * vstep * 2, istride);
+            npyv_f32 v3 = npyv_loadn_f32(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_loadn_f32(ip + istride * vstep * 4, istride);
+            npyv_f32 v5 = npyv_loadn_f32(ip + istride * vstep * 5, istride);
+            npyv_f32 v6 = npyv_loadn_f32(ip + istride * vstep * 6, istride);
+            npyv_f32 v7 = npyv_loadn_f32(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isinf_f32(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isinf_f32(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if NCONTIG == CONTIG  // true only if NCONTIG and CONTIG (defined outside this function) are equal
+            npyv_store_u8(op, r);
+        #else // NCONTIG == CONTIG
+            // r holds one result byte per element (the contiguous branch stores it as-is), so scatter every u8 lane
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln=0; ln < npyv_nlanes_u8; ++ln){
+                op[ln * ostride] = lane[ln];
+            }
+        #endif // NCONTIG == CONTIG
+    }
+
+    // vector-sized iterations: one f32 vector (vstep elements) per pass
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if CONTIG == CONTIG  // same input-layout switch as in the unrolled loop
+        npyv_f32 v = npyv_load_f32(ip);
+    #else
+        npyv_f32 v = npyv_loadn_f32(ip, istride);
+    #endif
+
+        npyv_u32 r = npyv_isinf_f32(v);  // one 32-bit result lane per input element (not byte-packed)
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u32(r));  // spill the result vector to a byte array
+
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f32)];  // take the byte at the start of each f32-sized lane
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f32)];
+        #if npyv_nlanes_f32 == 4
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f32)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f32)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off the remaining (fewer than vstep) elements
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isinf(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+static void simd_unary_isinf_FLOAT_NCONTIG_NCONTIG  // SIMD isinf: tests `len` f32 values from src, writes npy_bool results to dst
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)  // strides count elements, not bytes
+{
+    const npyv_lanetype_f32 *ip = src;
+    npy_bool *op = dst;
+
+    // How many f32 vectors can be packed into one u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f32)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);  // the unrolled loop only has code for these two factors
+
+    const int vstep = npyv_nlanes_f32;  // elements per f32 vector
+    const int wstep = vstep * PACK_FACTOR;  // elements per unrolled pass
+
+    // unrolled iterations: PACK_FACTOR vectors (wstep elements) per pass
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if NCONTIG == CONTIG  // true only if NCONTIG and CONTIG (defined outside this function) are equal
+            // contiguous input
+            npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+            npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+            npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+            npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+            npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+            npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+            npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f32 v0 = npyv_loadn_f32(ip + istride * vstep * 0, istride);
+            npyv_f32 v1 = npyv_loadn_f32(ip + istride * vstep * 1, istride);
+            npyv_f32 v2 = npyv_loadn_f32(ip + istride * vstep * 2, istride);
+            npyv_f32 v3 = npyv_loadn_f32(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_loadn_f32(ip + istride * vstep * 4, istride);
+            npyv_f32 v5 = npyv_loadn_f32(ip + istride * vstep * 5, istride);
+            npyv_f32 v6 = npyv_loadn_f32(ip + istride * vstep * 6, istride);
+            npyv_f32 v7 = npyv_loadn_f32(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isinf_f32(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isinf_f32(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if NCONTIG == CONTIG  // true only if NCONTIG and CONTIG (defined outside this function) are equal
+            npyv_store_u8(op, r);
+        #else // NCONTIG == CONTIG
+            // r holds one result byte per element (the contiguous branch stores it as-is), so scatter every u8 lane
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln=0; ln < npyv_nlanes_u8; ++ln){
+                op[ln * ostride] = lane[ln];
+            }
+        #endif // NCONTIG == CONTIG
+    }
+
+    // vector-sized iterations: one f32 vector (vstep elements) per pass
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if NCONTIG == CONTIG  // same input-layout switch as in the unrolled loop
+        npyv_f32 v = npyv_load_f32(ip);
+    #else
+        npyv_f32 v = npyv_loadn_f32(ip, istride);
+    #endif
+
+        npyv_u32 r = npyv_isinf_f32(v);  // one 32-bit result lane per input element (not byte-packed)
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u32(r));  // spill the result vector to a byte array
+
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f32)];  // take the byte at the start of each f32-sized lane
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f32)];
+        #if npyv_nlanes_f32 == 4
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f32)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f32)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off the remaining (fewer than vstep) elements
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isinf(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+
+#line 410
+#line 414
+static void simd_unary_isfinite_FLOAT_CONTIG_CONTIG  // SIMD isfinite: tests `len` f32 values from src, writes npy_bool results to dst
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)  // strides count elements, not bytes
+{
+    const npyv_lanetype_f32 *ip = src;
+    npy_bool *op = dst;
+
+    // How many f32 vectors can be packed into one u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f32)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);  // the unrolled loop only has code for these two factors
+
+    const int vstep = npyv_nlanes_f32;  // elements per f32 vector
+    const int wstep = vstep * PACK_FACTOR;  // elements per unrolled pass
+
+    // unrolled iterations: PACK_FACTOR vectors (wstep elements) per pass
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if CONTIG == CONTIG  // always true: CONTIG is compared with itself, so input is loaded contiguously
+            // contiguous input
+            npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+            npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+            npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+            npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+            npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+            npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+            npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f32 v0 = npyv_loadn_f32(ip + istride * vstep * 0, istride);
+            npyv_f32 v1 = npyv_loadn_f32(ip + istride * vstep * 1, istride);
+            npyv_f32 v2 = npyv_loadn_f32(ip + istride * vstep * 2, istride);
+            npyv_f32 v3 = npyv_loadn_f32(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_loadn_f32(ip + istride * vstep * 4, istride);
+            npyv_f32 v5 = npyv_loadn_f32(ip + istride * vstep * 5, istride);
+            npyv_f32 v6 = npyv_loadn_f32(ip + istride * vstep * 6, istride);
+            npyv_f32 v7 = npyv_loadn_f32(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isfinite_f32(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isfinite_f32(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if CONTIG == CONTIG  // always true: CONTIG is compared with itself, so output is stored contiguously
+            npyv_store_u8(op, r);
+        #else // CONTIG == CONTIG
+            // Never compiled in this variant. NOTE(review): r looks byte-packed, yet lane[] is read in sizeof(f32) steps - confirm
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln=0; (ln * sizeof(npyv_lanetype_f32)) < npyv_nlanes_u8; ++ln){
+                op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_f32)];
+            }
+        #endif // CONTIG == CONTIG
+    }
+
+    // vector-sized iterations: one f32 vector (vstep elements) per pass
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if CONTIG == CONTIG  // same input-layout switch as in the unrolled loop
+        npyv_f32 v = npyv_load_f32(ip);
+    #else
+        npyv_f32 v = npyv_loadn_f32(ip, istride);
+    #endif
+
+        npyv_u32 r = npyv_isfinite_f32(v);  // one 32-bit result lane per input element
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u32(r));  // spill the result vector to a byte array
+
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f32)];  // take the byte at the start of each f32-sized lane
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f32)];
+        #if npyv_nlanes_f32 == 4
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f32)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f32)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off the remaining (fewer than vstep) elements
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isfinite(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+static void simd_unary_isfinite_FLOAT_NCONTIG_CONTIG  // SIMD isfinite: tests `len` f32 values from src, writes npy_bool results to dst
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)  // strides count elements, not bytes
+{
+    const npyv_lanetype_f32 *ip = src;
+    npy_bool *op = dst;
+
+    // How many f32 vectors can be packed into one u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f32)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);  // the unrolled loop only has code for these two factors
+
+    const int vstep = npyv_nlanes_f32;  // elements per f32 vector
+    const int wstep = vstep * PACK_FACTOR;  // elements per unrolled pass
+
+    // unrolled iterations: PACK_FACTOR vectors (wstep elements) per pass
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if NCONTIG == CONTIG  // true only if NCONTIG and CONTIG (defined outside this function) are equal
+            // contiguous input
+            npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+            npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+            npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+            npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+            npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+            npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+            npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f32 v0 = npyv_loadn_f32(ip + istride * vstep * 0, istride);
+            npyv_f32 v1 = npyv_loadn_f32(ip + istride * vstep * 1, istride);
+            npyv_f32 v2 = npyv_loadn_f32(ip + istride * vstep * 2, istride);
+            npyv_f32 v3 = npyv_loadn_f32(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_loadn_f32(ip + istride * vstep * 4, istride);
+            npyv_f32 v5 = npyv_loadn_f32(ip + istride * vstep * 5, istride);
+            npyv_f32 v6 = npyv_loadn_f32(ip + istride * vstep * 6, istride);
+            npyv_f32 v7 = npyv_loadn_f32(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isfinite_f32(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isfinite_f32(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if CONTIG == CONTIG  // always true: CONTIG is compared with itself, so output is stored contiguously
+            npyv_store_u8(op, r);
+        #else // CONTIG == CONTIG
+            // Never compiled in this variant. NOTE(review): r looks byte-packed, yet lane[] is read in sizeof(f32) steps - confirm
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln=0; (ln * sizeof(npyv_lanetype_f32)) < npyv_nlanes_u8; ++ln){
+                op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_f32)];
+            }
+        #endif // CONTIG == CONTIG
+    }
+
+    // vector-sized iterations: one f32 vector (vstep elements) per pass
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if NCONTIG == CONTIG  // same input-layout switch as in the unrolled loop
+        npyv_f32 v = npyv_load_f32(ip);
+    #else
+        npyv_f32 v = npyv_loadn_f32(ip, istride);
+    #endif
+
+        npyv_u32 r = npyv_isfinite_f32(v);  // one 32-bit result lane per input element
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u32(r));  // spill the result vector to a byte array
+
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f32)];  // take the byte at the start of each f32-sized lane
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f32)];
+        #if npyv_nlanes_f32 == 4
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f32)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f32)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off the remaining (fewer than vstep) elements
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isfinite(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+/*
+ * Element-wise isfinite over float32 input, written as npy_bool.
+ * Variant: contiguous vector loads, strided (non-contiguous) output stores.
+ *
+ * src, istride: input buffer and the step between inputs, counted in
+ *               npyv_lanetype_f32 elements (the typed pointer is advanced by it).
+ * dst, ostride: output npy_bool buffer and the step between outputs, in elements.
+ * len:          number of elements to process.
+ *
+ * Three phases: groups of PACK_FACTOR vectors whose results are packed into
+ * one u8 vector, then one vector at a time, then a scalar tail.
+ * NOTE(review): assumes CONTIG and NCONTIG are defined to distinct values
+ * elsewhere in this file -- confirm.
+ */
+static void simd_unary_isfinite_FLOAT_CONTIG_NCONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_f32 *ip = src;
+    npy_bool *op = dst;
+
+    // How many vectors can be packed into a u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f32)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_f32;      // elements per vector
+    const int wstep = vstep * PACK_FACTOR;  // elements per packed group
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if CONTIG == CONTIG
+            // contiguous input
+            npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+            npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+            npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+            npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+            npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+            npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+            npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f32 v0 = npyv_loadn_f32(ip + istride * vstep * 0, istride);
+            npyv_f32 v1 = npyv_loadn_f32(ip + istride * vstep * 1, istride);
+            npyv_f32 v2 = npyv_loadn_f32(ip + istride * vstep * 2, istride);
+            npyv_f32 v3 = npyv_loadn_f32(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_loadn_f32(ip + istride * vstep * 4, istride);
+            npyv_f32 v5 = npyv_loadn_f32(ip + istride * vstep * 5, istride);
+            npyv_f32 v6 = npyv_loadn_f32(ip + istride * vstep * 6, istride);
+            npyv_f32 v7 = npyv_loadn_f32(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isfinite_f32(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isfinite_f32(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if NCONTIG == CONTIG
+            npyv_store_u8(op, r);
+        #else // NCONTIG == CONTIG
+            // Results are packed: byte k of r is the result for element k of
+            // this group (the contiguous branch stores r verbatim), so every
+            // byte must be scattered -- not every sizeof(lanetype)-th one.
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln = 0; ln < npyv_nlanes_u8; ++ln) {
+                op[ln * ostride] = lane[ln];
+            }
+        #endif // NCONTIG == CONTIG
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f32 v = npyv_load_f32(ip);
+    #else
+        npyv_f32 v = npyv_loadn_f32(ip, istride);
+    #endif
+
+        npyv_u32 r = npyv_isfinite_f32(v);
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u32(r));
+
+        // Unpacked result: one f32-wide lane per element. Take the first byte
+        // of each lane as laid out in memory, for every lane of the vector.
+        for (int ln = 0; ln < vstep; ++ln) {
+            op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_f32)];
+        }
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isfinite(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+/*
+ * Element-wise isfinite over float32 input, written as npy_bool.
+ * Variant: strided input loads and strided output stores (both selected by
+ * the `NCONTIG == CONTIG` conditionals below).
+ *
+ * src, istride: input buffer and the step between inputs, counted in
+ *               npyv_lanetype_f32 elements (the typed pointer is advanced by it).
+ * dst, ostride: output npy_bool buffer and the step between outputs, in elements.
+ * len:          number of elements to process.
+ *
+ * Three phases: groups of PACK_FACTOR vectors whose results are packed into
+ * one u8 vector, then one vector at a time, then a scalar tail.
+ * NOTE(review): assumes CONTIG and NCONTIG are defined to distinct values
+ * elsewhere in this file -- confirm.
+ */
+static void simd_unary_isfinite_FLOAT_NCONTIG_NCONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_f32 *ip = src;
+    npy_bool *op = dst;
+
+    // How many vectors can be packed into a u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f32)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_f32;      // elements per vector
+    const int wstep = vstep * PACK_FACTOR;  // elements per packed group
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if NCONTIG == CONTIG
+            // contiguous input
+            npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+            npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+            npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+            npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+            npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+            npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+            npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f32 v0 = npyv_loadn_f32(ip + istride * vstep * 0, istride);
+            npyv_f32 v1 = npyv_loadn_f32(ip + istride * vstep * 1, istride);
+            npyv_f32 v2 = npyv_loadn_f32(ip + istride * vstep * 2, istride);
+            npyv_f32 v3 = npyv_loadn_f32(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_loadn_f32(ip + istride * vstep * 4, istride);
+            npyv_f32 v5 = npyv_loadn_f32(ip + istride * vstep * 5, istride);
+            npyv_f32 v6 = npyv_loadn_f32(ip + istride * vstep * 6, istride);
+            npyv_f32 v7 = npyv_loadn_f32(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isfinite_f32(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isfinite_f32(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if NCONTIG == CONTIG
+            npyv_store_u8(op, r);
+        #else // NCONTIG == CONTIG
+            // Results are packed: byte k of r is the result for element k of
+            // this group (the contiguous branch stores r verbatim), so every
+            // byte must be scattered -- not every sizeof(lanetype)-th one.
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln = 0; ln < npyv_nlanes_u8; ++ln) {
+                op[ln * ostride] = lane[ln];
+            }
+        #endif // NCONTIG == CONTIG
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f32 v = npyv_load_f32(ip);
+    #else
+        npyv_f32 v = npyv_loadn_f32(ip, istride);
+    #endif
+
+        npyv_u32 r = npyv_isfinite_f32(v);
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u32(r));
+
+        // Unpacked result: one f32-wide lane per element. Take the first byte
+        // of each lane as laid out in memory, for every lane of the vector.
+        for (int ln = 0; ln < vstep; ++ln) {
+            op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_f32)];
+        }
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isfinite(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+
+#line 410
+#line 414
+/*
+ * Element-wise signbit over float32 input, written as npy_bool.
+ * Variant: contiguous vector loads and a whole-vector contiguous store for
+ * packed results.
+ *
+ * src, istride: input buffer and per-element step (in f32 elements) used to
+ *               advance the input pointer.
+ * dst, ostride: output npy_bool buffer and per-element step used to advance
+ *               the output pointer and to place single-vector results.
+ * len:          number of elements to process.
+ */
+static void simd_unary_signbit_FLOAT_CONTIG_CONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    // Number of f32 vectors whose results fill one u8 / bool vector.
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f32)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const npyv_lanetype_f32 *in = src;
+    npy_bool *out = dst;
+    const int lanes = npyv_nlanes_f32;
+    const int group = lanes * PACK_FACTOR;
+
+    // Phase 1: PACK_FACTOR vectors per pass, results packed into one u8 vector.
+    while (len >= group) {
+        npyv_f32 a0 = npyv_load_f32(in);
+        npyv_f32 a1 = npyv_load_f32(in + lanes);
+        npyv_f32 a2 = npyv_load_f32(in + lanes * 2);
+        npyv_f32 a3 = npyv_load_f32(in + lanes * 3);
+        #if PACK_FACTOR == 4
+        npyv_u8 packed = npyv_pack_signbit_f32(a0, a1, a2, a3);
+        #elif PACK_FACTOR == 8
+        npyv_f32 a4 = npyv_load_f32(in + lanes * 4);
+        npyv_f32 a5 = npyv_load_f32(in + lanes * 5);
+        npyv_f32 a6 = npyv_load_f32(in + lanes * 6);
+        npyv_f32 a7 = npyv_load_f32(in + lanes * 7);
+        npyv_u8 packed = npyv_pack_signbit_f32(a0, a1, a2, a3, a4, a5, a6, a7);
+        #endif
+
+        // One result byte per element: the vector is stored as-is.
+        npyv_store_u8(out, packed);
+
+        len -= group;
+        in += istride * group;
+        out += ostride * group;
+    }
+    #undef PACK_FACTOR
+
+    // Phase 2: one vector per pass. Each result occupies an f32-wide lane,
+    // so the first byte of every lane is copied out individually.
+    while (len >= lanes) {
+        npyv_f32 vec = npyv_load_f32(in);
+        npyv_u32 res = npyv_signbit_f32(vec);
+
+        npy_uint8 bytes[npyv_nlanes_u8];
+        npyv_store_u8(bytes, npyv_reinterpret_u8_u32(res));
+
+        out[0] = bytes[0];
+        out[ostride] = bytes[sizeof(npyv_lanetype_f32)];
+        #if npyv_nlanes_f32 == 4
+        out[ostride * 2] = bytes[sizeof(npyv_lanetype_f32) * 2];
+        out[ostride * 3] = bytes[sizeof(npyv_lanetype_f32) * 3];
+        #endif
+
+        len -= lanes;
+        in += istride * lanes;
+        out += ostride * lanes;
+    }
+
+    // Phase 3: scalar tail for whatever is left.
+    while (len > 0) {
+        *out = (npy_signbit(*in) != 0);
+        --len;
+        in += istride;
+        out += ostride;
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+/*
+ * Element-wise signbit over float32 input, written as npy_bool.
+ * Variant: input load path chosen by `NCONTIG == CONTIG` (strided loads when
+ * that test is false), whole-vector contiguous store for packed results.
+ *
+ * src, istride: input buffer and the step between inputs, in f32 elements.
+ * dst, ostride: output npy_bool buffer and per-element step used to advance
+ *               the output pointer and to place single-vector results.
+ * len:          number of elements to process.
+ * NOTE(review): assumes CONTIG and NCONTIG are defined to distinct values
+ * elsewhere in this file -- confirm.
+ */
+static void simd_unary_signbit_FLOAT_NCONTIG_CONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    // Number of f32 vectors whose results fill one u8 / bool vector.
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f32)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const npyv_lanetype_f32 *in = src;
+    npy_bool *out = dst;
+    const int lanes = npyv_nlanes_f32;
+    const int group = lanes * PACK_FACTOR;
+
+    // Phase 1: PACK_FACTOR vectors per pass, results packed into one u8 vector.
+    while (len >= group) {
+        #if NCONTIG == CONTIG
+            // unit-stride loads
+            npyv_f32 a0 = npyv_load_f32(in);
+            npyv_f32 a1 = npyv_load_f32(in + lanes);
+            npyv_f32 a2 = npyv_load_f32(in + lanes * 2);
+            npyv_f32 a3 = npyv_load_f32(in + lanes * 3);
+            #if PACK_FACTOR == 8
+            npyv_f32 a4 = npyv_load_f32(in + lanes * 4);
+            npyv_f32 a5 = npyv_load_f32(in + lanes * 5);
+            npyv_f32 a6 = npyv_load_f32(in + lanes * 6);
+            npyv_f32 a7 = npyv_load_f32(in + lanes * 7);
+            #endif
+        #else
+            // strided loads; `hop` is the distance between vector starts
+            const npy_intp hop = istride * lanes;
+            npyv_f32 a0 = npyv_loadn_f32(in, istride);
+            npyv_f32 a1 = npyv_loadn_f32(in + hop, istride);
+            npyv_f32 a2 = npyv_loadn_f32(in + hop * 2, istride);
+            npyv_f32 a3 = npyv_loadn_f32(in + hop * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f32 a4 = npyv_loadn_f32(in + hop * 4, istride);
+            npyv_f32 a5 = npyv_loadn_f32(in + hop * 5, istride);
+            npyv_f32 a6 = npyv_loadn_f32(in + hop * 6, istride);
+            npyv_f32 a7 = npyv_loadn_f32(in + hop * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 packed = npyv_pack_signbit_f32(a0, a1, a2, a3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 packed = npyv_pack_signbit_f32(a0, a1, a2, a3, a4, a5, a6, a7);
+        #endif
+
+        // One result byte per element: the vector is stored as-is.
+        npyv_store_u8(out, packed);
+
+        len -= group;
+        in += istride * group;
+        out += ostride * group;
+    }
+    #undef PACK_FACTOR
+
+    // Phase 2: one vector per pass. Each result occupies an f32-wide lane,
+    // so the first byte of every lane is copied out individually.
+    while (len >= lanes) {
+    #if NCONTIG == CONTIG
+        npyv_f32 vec = npyv_load_f32(in);
+    #else
+        npyv_f32 vec = npyv_loadn_f32(in, istride);
+    #endif
+        npyv_u32 res = npyv_signbit_f32(vec);
+
+        npy_uint8 bytes[npyv_nlanes_u8];
+        npyv_store_u8(bytes, npyv_reinterpret_u8_u32(res));
+
+        out[0] = bytes[0];
+        out[ostride] = bytes[sizeof(npyv_lanetype_f32)];
+        #if npyv_nlanes_f32 == 4
+        out[ostride * 2] = bytes[sizeof(npyv_lanetype_f32) * 2];
+        out[ostride * 3] = bytes[sizeof(npyv_lanetype_f32) * 3];
+        #endif
+
+        len -= lanes;
+        in += istride * lanes;
+        out += ostride * lanes;
+    }
+
+    // Phase 3: scalar tail for whatever is left.
+    while (len > 0) {
+        *out = (npy_signbit(*in) != 0);
+        --len;
+        in += istride;
+        out += ostride;
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+/*
+ * Element-wise signbit over float32 input, written as npy_bool.
+ * Variant: contiguous vector loads, strided (non-contiguous) output stores.
+ *
+ * src, istride: input buffer and the step between inputs, counted in
+ *               npyv_lanetype_f32 elements (the typed pointer is advanced by it).
+ * dst, ostride: output npy_bool buffer and the step between outputs, in elements.
+ * len:          number of elements to process.
+ *
+ * Three phases: groups of PACK_FACTOR vectors whose results are packed into
+ * one u8 vector, then one vector at a time, then a scalar tail.
+ * NOTE(review): assumes CONTIG and NCONTIG are defined to distinct values
+ * elsewhere in this file -- confirm.
+ */
+static void simd_unary_signbit_FLOAT_CONTIG_NCONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_f32 *ip = src;
+    npy_bool *op = dst;
+
+    // How many vectors can be packed into a u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f32)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_f32;      // elements per vector
+    const int wstep = vstep * PACK_FACTOR;  // elements per packed group
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if CONTIG == CONTIG
+            // contiguous input
+            npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+            npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+            npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+            npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+            npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+            npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+            npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f32 v0 = npyv_loadn_f32(ip + istride * vstep * 0, istride);
+            npyv_f32 v1 = npyv_loadn_f32(ip + istride * vstep * 1, istride);
+            npyv_f32 v2 = npyv_loadn_f32(ip + istride * vstep * 2, istride);
+            npyv_f32 v3 = npyv_loadn_f32(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_loadn_f32(ip + istride * vstep * 4, istride);
+            npyv_f32 v5 = npyv_loadn_f32(ip + istride * vstep * 5, istride);
+            npyv_f32 v6 = npyv_loadn_f32(ip + istride * vstep * 6, istride);
+            npyv_f32 v7 = npyv_loadn_f32(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_signbit_f32(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_signbit_f32(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if NCONTIG == CONTIG
+            npyv_store_u8(op, r);
+        #else // NCONTIG == CONTIG
+            // Results are packed: byte k of r is the result for element k of
+            // this group (the contiguous branch stores r verbatim), so every
+            // byte must be scattered -- not every sizeof(lanetype)-th one.
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln = 0; ln < npyv_nlanes_u8; ++ln) {
+                op[ln * ostride] = lane[ln];
+            }
+        #endif // NCONTIG == CONTIG
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f32 v = npyv_load_f32(ip);
+    #else
+        npyv_f32 v = npyv_loadn_f32(ip, istride);
+    #endif
+
+        npyv_u32 r = npyv_signbit_f32(v);
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u32(r));
+
+        // Unpacked result: one f32-wide lane per element. Take the first byte
+        // of each lane as laid out in memory, for every lane of the vector.
+        for (int ln = 0; ln < vstep; ++ln) {
+            op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_f32)];
+        }
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_signbit(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+/*
+ * Element-wise signbit over float32 input, written as npy_bool.
+ * Variant: strided input loads and strided output stores (both selected by
+ * the `NCONTIG == CONTIG` conditionals below).
+ *
+ * src, istride: input buffer and the step between inputs, counted in
+ *               npyv_lanetype_f32 elements (the typed pointer is advanced by it).
+ * dst, ostride: output npy_bool buffer and the step between outputs, in elements.
+ * len:          number of elements to process.
+ *
+ * Three phases: groups of PACK_FACTOR vectors whose results are packed into
+ * one u8 vector, then one vector at a time, then a scalar tail.
+ * NOTE(review): assumes CONTIG and NCONTIG are defined to distinct values
+ * elsewhere in this file -- confirm.
+ */
+static void simd_unary_signbit_FLOAT_NCONTIG_NCONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_f32 *ip = src;
+    npy_bool *op = dst;
+
+    // How many vectors can be packed into a u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f32)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_f32;      // elements per vector
+    const int wstep = vstep * PACK_FACTOR;  // elements per packed group
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if NCONTIG == CONTIG
+            // contiguous input
+            npyv_f32 v0 = npyv_load_f32(ip + vstep * 0);
+            npyv_f32 v1 = npyv_load_f32(ip + vstep * 1);
+            npyv_f32 v2 = npyv_load_f32(ip + vstep * 2);
+            npyv_f32 v3 = npyv_load_f32(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_load_f32(ip + vstep * 4);
+            npyv_f32 v5 = npyv_load_f32(ip + vstep * 5);
+            npyv_f32 v6 = npyv_load_f32(ip + vstep * 6);
+            npyv_f32 v7 = npyv_load_f32(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f32 v0 = npyv_loadn_f32(ip + istride * vstep * 0, istride);
+            npyv_f32 v1 = npyv_loadn_f32(ip + istride * vstep * 1, istride);
+            npyv_f32 v2 = npyv_loadn_f32(ip + istride * vstep * 2, istride);
+            npyv_f32 v3 = npyv_loadn_f32(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f32 v4 = npyv_loadn_f32(ip + istride * vstep * 4, istride);
+            npyv_f32 v5 = npyv_loadn_f32(ip + istride * vstep * 5, istride);
+            npyv_f32 v6 = npyv_loadn_f32(ip + istride * vstep * 6, istride);
+            npyv_f32 v7 = npyv_loadn_f32(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_signbit_f32(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_signbit_f32(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if NCONTIG == CONTIG
+            npyv_store_u8(op, r);
+        #else // NCONTIG == CONTIG
+            // Results are packed: byte k of r is the result for element k of
+            // this group (the contiguous branch stores r verbatim), so every
+            // byte must be scattered -- not every sizeof(lanetype)-th one.
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln = 0; ln < npyv_nlanes_u8; ++ln) {
+                op[ln * ostride] = lane[ln];
+            }
+        #endif // NCONTIG == CONTIG
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f32 v = npyv_load_f32(ip);
+    #else
+        npyv_f32 v = npyv_loadn_f32(ip, istride);
+    #endif
+
+        npyv_u32 r = npyv_signbit_f32(v);
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u32(r));
+
+        // Unpacked result: one f32-wide lane per element. Take the first byte
+        // of each lane as laid out in memory, for every lane of the vector.
+        for (int ln = 0; ln < vstep; ++ln) {
+            op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_f32)];
+        }
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_signbit(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+
+
+#endif // NPY_SIMD_F32
+
+#line 406
+#if NPY_SIMD_F64
+#line 410
+#line 414
+/*
+ * Element-wise isnan over float64 input, written as npy_bool.
+ * Variant: contiguous vector loads and a whole-vector contiguous store for
+ * packed results.
+ *
+ * src, istride: input buffer and per-element step (in f64 elements) used to
+ *               advance the input pointer.
+ * dst, ostride: output npy_bool buffer and per-element step used to advance
+ *               the output pointer and to place single-vector results.
+ * len:          number of elements to process.
+ */
+static void simd_unary_isnan_DOUBLE_CONTIG_CONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    // Number of f64 vectors whose results fill one u8 / bool vector.
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f64)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const npyv_lanetype_f64 *in = src;
+    npy_bool *out = dst;
+    const int lanes = npyv_nlanes_f64;
+    const int group = lanes * PACK_FACTOR;
+
+    // Phase 1: PACK_FACTOR vectors per pass, results packed into one u8 vector.
+    while (len >= group) {
+        npyv_f64 a0 = npyv_load_f64(in);
+        npyv_f64 a1 = npyv_load_f64(in + lanes);
+        npyv_f64 a2 = npyv_load_f64(in + lanes * 2);
+        npyv_f64 a3 = npyv_load_f64(in + lanes * 3);
+        #if PACK_FACTOR == 4
+        npyv_u8 packed = npyv_pack_isnan_f64(a0, a1, a2, a3);
+        #elif PACK_FACTOR == 8
+        npyv_f64 a4 = npyv_load_f64(in + lanes * 4);
+        npyv_f64 a5 = npyv_load_f64(in + lanes * 5);
+        npyv_f64 a6 = npyv_load_f64(in + lanes * 6);
+        npyv_f64 a7 = npyv_load_f64(in + lanes * 7);
+        npyv_u8 packed = npyv_pack_isnan_f64(a0, a1, a2, a3, a4, a5, a6, a7);
+        #endif
+
+        // One result byte per element: the vector is stored as-is.
+        npyv_store_u8(out, packed);
+
+        len -= group;
+        in += istride * group;
+        out += ostride * group;
+    }
+    #undef PACK_FACTOR
+
+    // Phase 2: one vector per pass. Each result occupies an f64-wide lane,
+    // so the first byte of every lane is copied out individually.
+    while (len >= lanes) {
+        npyv_f64 vec = npyv_load_f64(in);
+        npyv_u64 res = npyv_isnan_f64(vec);
+
+        npy_uint8 bytes[npyv_nlanes_u8];
+        npyv_store_u8(bytes, npyv_reinterpret_u8_u64(res));
+
+        out[0] = bytes[0];
+        out[ostride] = bytes[sizeof(npyv_lanetype_f64)];
+        #if npyv_nlanes_f64 == 4
+        out[ostride * 2] = bytes[sizeof(npyv_lanetype_f64) * 2];
+        out[ostride * 3] = bytes[sizeof(npyv_lanetype_f64) * 3];
+        #endif
+
+        len -= lanes;
+        in += istride * lanes;
+        out += ostride * lanes;
+    }
+
+    // Phase 3: scalar tail for whatever is left.
+    while (len > 0) {
+        *out = (npy_isnan(*in) != 0);
+        --len;
+        in += istride;
+        out += ostride;
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+/*
+ * Element-wise isnan over float64 input, written as npy_bool.
+ * Variant: input load path chosen by `NCONTIG == CONTIG` (strided loads when
+ * that test is false), whole-vector contiguous store for packed results.
+ *
+ * src, istride: input buffer and the step between inputs, in f64 elements.
+ * dst, ostride: output npy_bool buffer and per-element step used to advance
+ *               the output pointer and to place single-vector results.
+ * len:          number of elements to process.
+ * NOTE(review): assumes CONTIG and NCONTIG are defined to distinct values
+ * elsewhere in this file -- confirm.
+ */
+static void simd_unary_isnan_DOUBLE_NCONTIG_CONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    // Number of f64 vectors whose results fill one u8 / bool vector.
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f64)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const npyv_lanetype_f64 *in = src;
+    npy_bool *out = dst;
+    const int lanes = npyv_nlanes_f64;
+    const int group = lanes * PACK_FACTOR;
+
+    // Phase 1: PACK_FACTOR vectors per pass, results packed into one u8 vector.
+    while (len >= group) {
+        #if NCONTIG == CONTIG
+            // unit-stride loads
+            npyv_f64 a0 = npyv_load_f64(in);
+            npyv_f64 a1 = npyv_load_f64(in + lanes);
+            npyv_f64 a2 = npyv_load_f64(in + lanes * 2);
+            npyv_f64 a3 = npyv_load_f64(in + lanes * 3);
+            #if PACK_FACTOR == 8
+            npyv_f64 a4 = npyv_load_f64(in + lanes * 4);
+            npyv_f64 a5 = npyv_load_f64(in + lanes * 5);
+            npyv_f64 a6 = npyv_load_f64(in + lanes * 6);
+            npyv_f64 a7 = npyv_load_f64(in + lanes * 7);
+            #endif
+        #else
+            // strided loads; `hop` is the distance between vector starts
+            const npy_intp hop = istride * lanes;
+            npyv_f64 a0 = npyv_loadn_f64(in, istride);
+            npyv_f64 a1 = npyv_loadn_f64(in + hop, istride);
+            npyv_f64 a2 = npyv_loadn_f64(in + hop * 2, istride);
+            npyv_f64 a3 = npyv_loadn_f64(in + hop * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f64 a4 = npyv_loadn_f64(in + hop * 4, istride);
+            npyv_f64 a5 = npyv_loadn_f64(in + hop * 5, istride);
+            npyv_f64 a6 = npyv_loadn_f64(in + hop * 6, istride);
+            npyv_f64 a7 = npyv_loadn_f64(in + hop * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 packed = npyv_pack_isnan_f64(a0, a1, a2, a3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 packed = npyv_pack_isnan_f64(a0, a1, a2, a3, a4, a5, a6, a7);
+        #endif
+
+        // One result byte per element: the vector is stored as-is.
+        npyv_store_u8(out, packed);
+
+        len -= group;
+        in += istride * group;
+        out += ostride * group;
+    }
+    #undef PACK_FACTOR
+
+    // Phase 2: one vector per pass. Each result occupies an f64-wide lane,
+    // so the first byte of every lane is copied out individually.
+    while (len >= lanes) {
+    #if NCONTIG == CONTIG
+        npyv_f64 vec = npyv_load_f64(in);
+    #else
+        npyv_f64 vec = npyv_loadn_f64(in, istride);
+    #endif
+        npyv_u64 res = npyv_isnan_f64(vec);
+
+        npy_uint8 bytes[npyv_nlanes_u8];
+        npyv_store_u8(bytes, npyv_reinterpret_u8_u64(res));
+
+        out[0] = bytes[0];
+        out[ostride] = bytes[sizeof(npyv_lanetype_f64)];
+        #if npyv_nlanes_f64 == 4
+        out[ostride * 2] = bytes[sizeof(npyv_lanetype_f64) * 2];
+        out[ostride * 3] = bytes[sizeof(npyv_lanetype_f64) * 3];
+        #endif
+
+        len -= lanes;
+        in += istride * lanes;
+        out += ostride * lanes;
+    }
+
+    // Phase 3: scalar tail for whatever is left.
+    while (len > 0) {
+        *out = (npy_isnan(*in) != 0);
+        --len;
+        in += istride;
+        out += ostride;
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+/*
+ * Element-wise isnan over float64 input, written as npy_bool.
+ * Variant: contiguous vector loads, strided (non-contiguous) output stores.
+ *
+ * src, istride: input buffer and the step between inputs, counted in
+ *               npyv_lanetype_f64 elements (the typed pointer is advanced by it).
+ * dst, ostride: output npy_bool buffer and the step between outputs, in elements.
+ * len:          number of elements to process.
+ *
+ * Three phases: groups of PACK_FACTOR vectors whose results are packed into
+ * one u8 vector, then one vector at a time, then a scalar tail.
+ * NOTE(review): assumes CONTIG and NCONTIG are defined to distinct values
+ * elsewhere in this file -- confirm.
+ */
+static void simd_unary_isnan_DOUBLE_CONTIG_NCONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_f64 *ip = src;
+    npy_bool *op = dst;
+
+    // How many vectors can be packed into a u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f64)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_f64;      // elements per vector
+    const int wstep = vstep * PACK_FACTOR;  // elements per packed group
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if CONTIG == CONTIG
+            // contiguous input
+            npyv_f64 v0 = npyv_load_f64(ip + vstep * 0);
+            npyv_f64 v1 = npyv_load_f64(ip + vstep * 1);
+            npyv_f64 v2 = npyv_load_f64(ip + vstep * 2);
+            npyv_f64 v3 = npyv_load_f64(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_load_f64(ip + vstep * 4);
+            npyv_f64 v5 = npyv_load_f64(ip + vstep * 5);
+            npyv_f64 v6 = npyv_load_f64(ip + vstep * 6);
+            npyv_f64 v7 = npyv_load_f64(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f64 v0 = npyv_loadn_f64(ip + istride * vstep * 0, istride);
+            npyv_f64 v1 = npyv_loadn_f64(ip + istride * vstep * 1, istride);
+            npyv_f64 v2 = npyv_loadn_f64(ip + istride * vstep * 2, istride);
+            npyv_f64 v3 = npyv_loadn_f64(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_loadn_f64(ip + istride * vstep * 4, istride);
+            npyv_f64 v5 = npyv_loadn_f64(ip + istride * vstep * 5, istride);
+            npyv_f64 v6 = npyv_loadn_f64(ip + istride * vstep * 6, istride);
+            npyv_f64 v7 = npyv_loadn_f64(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isnan_f64(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isnan_f64(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if NCONTIG == CONTIG
+            npyv_store_u8(op, r);
+        #else // NCONTIG == CONTIG
+            // Results are packed: byte k of r is the result for element k of
+            // this group (the contiguous branch stores r verbatim), so every
+            // byte must be scattered -- not every sizeof(lanetype)-th one.
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln = 0; ln < npyv_nlanes_u8; ++ln) {
+                op[ln * ostride] = lane[ln];
+            }
+        #endif // NCONTIG == CONTIG
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v = npyv_load_f64(ip);
+    #else
+        npyv_f64 v = npyv_loadn_f64(ip, istride);
+    #endif
+
+        npyv_u64 r = npyv_isnan_f64(v);
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u64(r));
+
+        // Unpacked result: one f64-wide lane per element. Take the first byte
+        // of each lane as laid out in memory, for every lane of the vector.
+        for (int ln = 0; ln < vstep; ++ln) {
+            op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_f64)];
+        }
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isnan(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+// isnan over doubles, NCONTIG input / NCONTIG output instantiation.
+//
+// src, istride : input doubles; istride is counted in doubles.
+// dst, ostride : output npy_bool values; ostride is counted in npy_bool.
+// len          : number of elements to process.
+//
+// Three passes: PACK_FACTOR vectors per trip, then one vector per trip,
+// then a scalar loop for whatever is left.
+static void simd_unary_isnan_DOUBLE_NCONTIG_NCONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_f64 *ip = src;
+    npy_bool *op = dst;
+
+    // How many vectors can be packed into a u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f64)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * PACK_FACTOR;
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if NCONTIG == CONTIG
+            // contiguous input
+            npyv_f64 v0 = npyv_load_f64(ip + vstep * 0);
+            npyv_f64 v1 = npyv_load_f64(ip + vstep * 1);
+            npyv_f64 v2 = npyv_load_f64(ip + vstep * 2);
+            npyv_f64 v3 = npyv_load_f64(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_load_f64(ip + vstep * 4);
+            npyv_f64 v5 = npyv_load_f64(ip + vstep * 5);
+            npyv_f64 v6 = npyv_load_f64(ip + vstep * 6);
+            npyv_f64 v7 = npyv_load_f64(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f64 v0 = npyv_loadn_f64(ip + istride * vstep * 0, istride);
+            npyv_f64 v1 = npyv_loadn_f64(ip + istride * vstep * 1, istride);
+            npyv_f64 v2 = npyv_loadn_f64(ip + istride * vstep * 2, istride);
+            npyv_f64 v3 = npyv_loadn_f64(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_loadn_f64(ip + istride * vstep * 4, istride);
+            npyv_f64 v5 = npyv_loadn_f64(ip + istride * vstep * 5, istride);
+            npyv_f64 v6 = npyv_loadn_f64(ip + istride * vstep * 6, istride);
+            npyv_f64 v7 = npyv_loadn_f64(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isnan_f64(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isnan_f64(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if NCONTIG == CONTIG
+            npyv_store_u8(op, r);
+        #else // NCONTIG == CONTIG
+            // The packed vector carries one result byte per input element
+            // (the contiguous arm stores it verbatim), so every byte has
+            // to be scattered to its own strided output slot.  Stepping the
+            // lane index by sizeof(double) here would write only an eighth
+            // of the outputs and take them from the wrong bytes.
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln = 0; ln < npyv_nlanes_u8; ++ln) {
+                op[ln * ostride] = lane[ln];
+            }
+        #endif // NCONTIG == CONTIG
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v = npyv_load_f64(ip);
+    #else
+        npyv_f64 v = npyv_loadn_f64(ip, istride);
+    #endif
+
+        npyv_u64 r = npyv_isnan_f64(v);
+
+        // Unlike the unrolled loop, this result is NOT packed: it is a
+        // vector of 64-bit lanes viewed as bytes, so result k sits at byte
+        // offset k * sizeof(npyv_lanetype_f64).
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u64(r));
+
+        // NOTE(review): only 2- and 4-lane f64 vectors are scattered here;
+        // a wider vector would leave lanes unwritten - confirm build targets.
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f64)];
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f64)];
+        #if npyv_nlanes_f64 == 4
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f64)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f64)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isnan(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+
+#line 410
+#line 414
+// isinf over doubles, CONTIG input / CONTIG output instantiation.
+//
+// src, dst : input doubles and output npy_bool buffers.
+// istride  : step between inputs, counted in doubles.  The vector loads
+//            read consecutive elements; istride only advances the pointer.
+// ostride  : step between outputs, counted in npy_bool elements.
+// len      : number of elements to process.
+static void simd_unary_isinf_DOUBLE_CONTIG_CONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_f64 *in_ptr = src;
+    npy_bool *out_ptr = dst;
+
+    // Count of f64 vectors whose results fill one u8 vector.
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f64)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int lanes = npyv_nlanes_f64;
+    const int batch = lanes * PACK_FACTOR;
+
+    // Pass 1: PACK_FACTOR vectors per trip, written back as one u8 vector.
+    while (len >= batch) {
+        npyv_f64 a0 = npyv_load_f64(in_ptr + lanes * 0);
+        npyv_f64 a1 = npyv_load_f64(in_ptr + lanes * 1);
+        npyv_f64 a2 = npyv_load_f64(in_ptr + lanes * 2);
+        npyv_f64 a3 = npyv_load_f64(in_ptr + lanes * 3);
+        #if PACK_FACTOR == 8
+        npyv_f64 a4 = npyv_load_f64(in_ptr + lanes * 4);
+        npyv_f64 a5 = npyv_load_f64(in_ptr + lanes * 5);
+        npyv_f64 a6 = npyv_load_f64(in_ptr + lanes * 6);
+        npyv_f64 a7 = npyv_load_f64(in_ptr + lanes * 7);
+        npyv_u8 packed = npyv_pack_isinf_f64(a0, a1, a2, a3, a4, a5, a6, a7);
+        #elif PACK_FACTOR == 4
+        npyv_u8 packed = npyv_pack_isinf_f64(a0, a1, a2, a3);
+        #endif
+
+        npyv_store_u8(out_ptr, packed);
+
+        len     -= batch;
+        in_ptr  += istride * batch;
+        out_ptr += ostride * batch;
+    }
+
+    // Pass 2: one vector per trip.
+    while (len >= lanes) {
+        npyv_f64 a = npyv_load_f64(in_ptr);
+        npyv_u64 res = npyv_isinf_f64(a);
+
+        // View the 64-bit result lanes as bytes; result k is taken from
+        // byte offset k * sizeof(npyv_lanetype_f64).
+        npy_uint8 bytes[npyv_nlanes_u8];
+        npyv_store_u8(bytes, npyv_reinterpret_u8_u64(res));
+
+        out_ptr[0 * ostride] = bytes[0 * sizeof(npyv_lanetype_f64)];
+        out_ptr[1 * ostride] = bytes[1 * sizeof(npyv_lanetype_f64)];
+        #if npyv_nlanes_f64 == 4
+        out_ptr[2 * ostride] = bytes[2 * sizeof(npyv_lanetype_f64)];
+        out_ptr[3 * ostride] = bytes[3 * sizeof(npyv_lanetype_f64)];
+        #endif
+
+        len     -= lanes;
+        in_ptr  += istride * lanes;
+        out_ptr += ostride * lanes;
+    }
+
+    #undef PACK_FACTOR
+
+    // Pass 3: scalar remainder.
+    while (len > 0) {
+        *out_ptr = (npy_isinf(*in_ptr) != 0);
+        --len;
+        in_ptr  += istride;
+        out_ptr += ostride;
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+// isinf over doubles, NCONTIG input / CONTIG output instantiation.
+//
+// src, istride : input doubles; istride is counted in doubles.
+// dst, ostride : output npy_bool values; ostride is counted in npy_bool.
+//                Pass 1 stores a whole u8 vector at dst in one go.
+// len          : number of elements to process.
+static void simd_unary_isinf_DOUBLE_NCONTIG_CONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_f64 *in_ptr = src;
+    npy_bool *out_ptr = dst;
+
+    // Count of f64 vectors whose results fill one u8 vector.
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f64)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int lanes = npyv_nlanes_f64;
+    const int batch = lanes * PACK_FACTOR;
+
+    // Pass 1: PACK_FACTOR vectors per trip, written back as one u8 vector.
+    while (len >= batch) {
+        #if NCONTIG != CONTIG
+            // strided gather
+            npyv_f64 a0 = npyv_loadn_f64(in_ptr + istride * lanes * 0, istride);
+            npyv_f64 a1 = npyv_loadn_f64(in_ptr + istride * lanes * 1, istride);
+            npyv_f64 a2 = npyv_loadn_f64(in_ptr + istride * lanes * 2, istride);
+            npyv_f64 a3 = npyv_loadn_f64(in_ptr + istride * lanes * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f64 a4 = npyv_loadn_f64(in_ptr + istride * lanes * 4, istride);
+            npyv_f64 a5 = npyv_loadn_f64(in_ptr + istride * lanes * 5, istride);
+            npyv_f64 a6 = npyv_loadn_f64(in_ptr + istride * lanes * 6, istride);
+            npyv_f64 a7 = npyv_loadn_f64(in_ptr + istride * lanes * 7, istride);
+            #endif
+        #else
+            // consecutive loads
+            npyv_f64 a0 = npyv_load_f64(in_ptr + lanes * 0);
+            npyv_f64 a1 = npyv_load_f64(in_ptr + lanes * 1);
+            npyv_f64 a2 = npyv_load_f64(in_ptr + lanes * 2);
+            npyv_f64 a3 = npyv_load_f64(in_ptr + lanes * 3);
+            #if PACK_FACTOR == 8
+            npyv_f64 a4 = npyv_load_f64(in_ptr + lanes * 4);
+            npyv_f64 a5 = npyv_load_f64(in_ptr + lanes * 5);
+            npyv_f64 a6 = npyv_load_f64(in_ptr + lanes * 6);
+            npyv_f64 a7 = npyv_load_f64(in_ptr + lanes * 7);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 8
+        npyv_u8 packed = npyv_pack_isinf_f64(a0, a1, a2, a3, a4, a5, a6, a7);
+        #elif PACK_FACTOR == 4
+        npyv_u8 packed = npyv_pack_isinf_f64(a0, a1, a2, a3);
+        #endif
+
+        npyv_store_u8(out_ptr, packed);
+
+        len     -= batch;
+        in_ptr  += istride * batch;
+        out_ptr += ostride * batch;
+    }
+
+    // Pass 2: one vector per trip.
+    while (len >= lanes) {
+    #if NCONTIG != CONTIG
+        npyv_f64 a = npyv_loadn_f64(in_ptr, istride);
+    #else
+        npyv_f64 a = npyv_load_f64(in_ptr);
+    #endif
+
+        npyv_u64 res = npyv_isinf_f64(a);
+
+        // View the 64-bit result lanes as bytes; result k is taken from
+        // byte offset k * sizeof(npyv_lanetype_f64).
+        npy_uint8 bytes[npyv_nlanes_u8];
+        npyv_store_u8(bytes, npyv_reinterpret_u8_u64(res));
+
+        out_ptr[0 * ostride] = bytes[0 * sizeof(npyv_lanetype_f64)];
+        out_ptr[1 * ostride] = bytes[1 * sizeof(npyv_lanetype_f64)];
+        #if npyv_nlanes_f64 == 4
+        out_ptr[2 * ostride] = bytes[2 * sizeof(npyv_lanetype_f64)];
+        out_ptr[3 * ostride] = bytes[3 * sizeof(npyv_lanetype_f64)];
+        #endif
+
+        len     -= lanes;
+        in_ptr  += istride * lanes;
+        out_ptr += ostride * lanes;
+    }
+
+    #undef PACK_FACTOR
+
+    // Pass 3: scalar remainder.
+    while (len > 0) {
+        *out_ptr = (npy_isinf(*in_ptr) != 0);
+        --len;
+        in_ptr  += istride;
+        out_ptr += ostride;
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+// isinf over doubles, CONTIG input / NCONTIG output instantiation.
+//
+// src, istride : input doubles; istride is counted in doubles.
+// dst, ostride : output npy_bool values; ostride is counted in npy_bool.
+// len          : number of elements to process.
+//
+// Three passes: PACK_FACTOR vectors per trip, then one vector per trip,
+// then a scalar loop for whatever is left.
+static void simd_unary_isinf_DOUBLE_CONTIG_NCONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_f64 *ip = src;
+    npy_bool *op = dst;
+
+    // How many vectors can be packed into a u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f64)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * PACK_FACTOR;
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if CONTIG == CONTIG
+            // contiguous input
+            npyv_f64 v0 = npyv_load_f64(ip + vstep * 0);
+            npyv_f64 v1 = npyv_load_f64(ip + vstep * 1);
+            npyv_f64 v2 = npyv_load_f64(ip + vstep * 2);
+            npyv_f64 v3 = npyv_load_f64(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_load_f64(ip + vstep * 4);
+            npyv_f64 v5 = npyv_load_f64(ip + vstep * 5);
+            npyv_f64 v6 = npyv_load_f64(ip + vstep * 6);
+            npyv_f64 v7 = npyv_load_f64(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f64 v0 = npyv_loadn_f64(ip + istride * vstep * 0, istride);
+            npyv_f64 v1 = npyv_loadn_f64(ip + istride * vstep * 1, istride);
+            npyv_f64 v2 = npyv_loadn_f64(ip + istride * vstep * 2, istride);
+            npyv_f64 v3 = npyv_loadn_f64(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_loadn_f64(ip + istride * vstep * 4, istride);
+            npyv_f64 v5 = npyv_loadn_f64(ip + istride * vstep * 5, istride);
+            npyv_f64 v6 = npyv_loadn_f64(ip + istride * vstep * 6, istride);
+            npyv_f64 v7 = npyv_loadn_f64(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isinf_f64(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isinf_f64(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if NCONTIG == CONTIG
+            npyv_store_u8(op, r);
+        #else // NCONTIG == CONTIG
+            // The packed vector carries one result byte per input element
+            // (the contiguous arm stores it verbatim), so every byte has
+            // to be scattered to its own strided output slot.  Stepping the
+            // lane index by sizeof(double) here would write only an eighth
+            // of the outputs and take them from the wrong bytes.
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln = 0; ln < npyv_nlanes_u8; ++ln) {
+                op[ln * ostride] = lane[ln];
+            }
+        #endif // NCONTIG == CONTIG
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v = npyv_load_f64(ip);
+    #else
+        npyv_f64 v = npyv_loadn_f64(ip, istride);
+    #endif
+
+        npyv_u64 r = npyv_isinf_f64(v);
+
+        // Unlike the unrolled loop, this result is NOT packed: it is a
+        // vector of 64-bit lanes viewed as bytes, so result k sits at byte
+        // offset k * sizeof(npyv_lanetype_f64).
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u64(r));
+
+        // NOTE(review): only 2- and 4-lane f64 vectors are scattered here;
+        // a wider vector would leave lanes unwritten - confirm build targets.
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f64)];
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f64)];
+        #if npyv_nlanes_f64 == 4
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f64)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f64)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isinf(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+// isinf over doubles, NCONTIG input / NCONTIG output instantiation.
+//
+// src, istride : input doubles; istride is counted in doubles.
+// dst, ostride : output npy_bool values; ostride is counted in npy_bool.
+// len          : number of elements to process.
+//
+// Three passes: PACK_FACTOR vectors per trip, then one vector per trip,
+// then a scalar loop for whatever is left.
+static void simd_unary_isinf_DOUBLE_NCONTIG_NCONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_f64 *ip = src;
+    npy_bool *op = dst;
+
+    // How many vectors can be packed into a u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f64)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * PACK_FACTOR;
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if NCONTIG == CONTIG
+            // contiguous input
+            npyv_f64 v0 = npyv_load_f64(ip + vstep * 0);
+            npyv_f64 v1 = npyv_load_f64(ip + vstep * 1);
+            npyv_f64 v2 = npyv_load_f64(ip + vstep * 2);
+            npyv_f64 v3 = npyv_load_f64(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_load_f64(ip + vstep * 4);
+            npyv_f64 v5 = npyv_load_f64(ip + vstep * 5);
+            npyv_f64 v6 = npyv_load_f64(ip + vstep * 6);
+            npyv_f64 v7 = npyv_load_f64(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f64 v0 = npyv_loadn_f64(ip + istride * vstep * 0, istride);
+            npyv_f64 v1 = npyv_loadn_f64(ip + istride * vstep * 1, istride);
+            npyv_f64 v2 = npyv_loadn_f64(ip + istride * vstep * 2, istride);
+            npyv_f64 v3 = npyv_loadn_f64(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_loadn_f64(ip + istride * vstep * 4, istride);
+            npyv_f64 v5 = npyv_loadn_f64(ip + istride * vstep * 5, istride);
+            npyv_f64 v6 = npyv_loadn_f64(ip + istride * vstep * 6, istride);
+            npyv_f64 v7 = npyv_loadn_f64(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isinf_f64(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isinf_f64(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if NCONTIG == CONTIG
+            npyv_store_u8(op, r);
+        #else // NCONTIG == CONTIG
+            // The packed vector carries one result byte per input element
+            // (the contiguous arm stores it verbatim), so every byte has
+            // to be scattered to its own strided output slot.  Stepping the
+            // lane index by sizeof(double) here would write only an eighth
+            // of the outputs and take them from the wrong bytes.
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln = 0; ln < npyv_nlanes_u8; ++ln) {
+                op[ln * ostride] = lane[ln];
+            }
+        #endif // NCONTIG == CONTIG
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v = npyv_load_f64(ip);
+    #else
+        npyv_f64 v = npyv_loadn_f64(ip, istride);
+    #endif
+
+        npyv_u64 r = npyv_isinf_f64(v);
+
+        // Unlike the unrolled loop, this result is NOT packed: it is a
+        // vector of 64-bit lanes viewed as bytes, so result k sits at byte
+        // offset k * sizeof(npyv_lanetype_f64).
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u64(r));
+
+        // NOTE(review): only 2- and 4-lane f64 vectors are scattered here;
+        // a wider vector would leave lanes unwritten - confirm build targets.
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f64)];
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f64)];
+        #if npyv_nlanes_f64 == 4
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f64)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f64)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isinf(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+
+#line 410
+#line 414
+// isfinite over doubles, CONTIG input / CONTIG output instantiation.
+//
+// src, dst : input doubles and output npy_bool buffers.
+// istride  : step between inputs, counted in doubles.  The vector loads
+//            read consecutive elements; istride only advances the pointer.
+// ostride  : step between outputs, counted in npy_bool elements.
+// len      : number of elements to process.
+static void simd_unary_isfinite_DOUBLE_CONTIG_CONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_f64 *in_ptr = src;
+    npy_bool *out_ptr = dst;
+
+    // Count of f64 vectors whose results fill one u8 vector.
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f64)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int lanes = npyv_nlanes_f64;
+    const int batch = lanes * PACK_FACTOR;
+
+    // Pass 1: PACK_FACTOR vectors per trip, written back as one u8 vector.
+    while (len >= batch) {
+        npyv_f64 a0 = npyv_load_f64(in_ptr + lanes * 0);
+        npyv_f64 a1 = npyv_load_f64(in_ptr + lanes * 1);
+        npyv_f64 a2 = npyv_load_f64(in_ptr + lanes * 2);
+        npyv_f64 a3 = npyv_load_f64(in_ptr + lanes * 3);
+        #if PACK_FACTOR == 8
+        npyv_f64 a4 = npyv_load_f64(in_ptr + lanes * 4);
+        npyv_f64 a5 = npyv_load_f64(in_ptr + lanes * 5);
+        npyv_f64 a6 = npyv_load_f64(in_ptr + lanes * 6);
+        npyv_f64 a7 = npyv_load_f64(in_ptr + lanes * 7);
+        npyv_u8 packed = npyv_pack_isfinite_f64(a0, a1, a2, a3, a4, a5, a6, a7);
+        #elif PACK_FACTOR == 4
+        npyv_u8 packed = npyv_pack_isfinite_f64(a0, a1, a2, a3);
+        #endif
+
+        npyv_store_u8(out_ptr, packed);
+
+        len     -= batch;
+        in_ptr  += istride * batch;
+        out_ptr += ostride * batch;
+    }
+
+    // Pass 2: one vector per trip.
+    while (len >= lanes) {
+        npyv_f64 a = npyv_load_f64(in_ptr);
+        npyv_u64 res = npyv_isfinite_f64(a);
+
+        // View the 64-bit result lanes as bytes; result k is taken from
+        // byte offset k * sizeof(npyv_lanetype_f64).
+        npy_uint8 bytes[npyv_nlanes_u8];
+        npyv_store_u8(bytes, npyv_reinterpret_u8_u64(res));
+
+        out_ptr[0 * ostride] = bytes[0 * sizeof(npyv_lanetype_f64)];
+        out_ptr[1 * ostride] = bytes[1 * sizeof(npyv_lanetype_f64)];
+        #if npyv_nlanes_f64 == 4
+        out_ptr[2 * ostride] = bytes[2 * sizeof(npyv_lanetype_f64)];
+        out_ptr[3 * ostride] = bytes[3 * sizeof(npyv_lanetype_f64)];
+        #endif
+
+        len     -= lanes;
+        in_ptr  += istride * lanes;
+        out_ptr += ostride * lanes;
+    }
+
+    #undef PACK_FACTOR
+
+    // Pass 3: scalar remainder.
+    while (len > 0) {
+        *out_ptr = (npy_isfinite(*in_ptr) != 0);
+        --len;
+        in_ptr  += istride;
+        out_ptr += ostride;
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+// isfinite over doubles, NCONTIG input / CONTIG output instantiation.
+//
+// src, istride : input doubles; istride is counted in doubles.
+// dst, ostride : output npy_bool values; ostride is counted in npy_bool.
+//                Pass 1 stores a whole u8 vector at dst in one go.
+// len          : number of elements to process.
+static void simd_unary_isfinite_DOUBLE_NCONTIG_CONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_f64 *in_ptr = src;
+    npy_bool *out_ptr = dst;
+
+    // Count of f64 vectors whose results fill one u8 vector.
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f64)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int lanes = npyv_nlanes_f64;
+    const int batch = lanes * PACK_FACTOR;
+
+    // Pass 1: PACK_FACTOR vectors per trip, written back as one u8 vector.
+    while (len >= batch) {
+        #if NCONTIG != CONTIG
+            // strided gather
+            npyv_f64 a0 = npyv_loadn_f64(in_ptr + istride * lanes * 0, istride);
+            npyv_f64 a1 = npyv_loadn_f64(in_ptr + istride * lanes * 1, istride);
+            npyv_f64 a2 = npyv_loadn_f64(in_ptr + istride * lanes * 2, istride);
+            npyv_f64 a3 = npyv_loadn_f64(in_ptr + istride * lanes * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f64 a4 = npyv_loadn_f64(in_ptr + istride * lanes * 4, istride);
+            npyv_f64 a5 = npyv_loadn_f64(in_ptr + istride * lanes * 5, istride);
+            npyv_f64 a6 = npyv_loadn_f64(in_ptr + istride * lanes * 6, istride);
+            npyv_f64 a7 = npyv_loadn_f64(in_ptr + istride * lanes * 7, istride);
+            #endif
+        #else
+            // consecutive loads
+            npyv_f64 a0 = npyv_load_f64(in_ptr + lanes * 0);
+            npyv_f64 a1 = npyv_load_f64(in_ptr + lanes * 1);
+            npyv_f64 a2 = npyv_load_f64(in_ptr + lanes * 2);
+            npyv_f64 a3 = npyv_load_f64(in_ptr + lanes * 3);
+            #if PACK_FACTOR == 8
+            npyv_f64 a4 = npyv_load_f64(in_ptr + lanes * 4);
+            npyv_f64 a5 = npyv_load_f64(in_ptr + lanes * 5);
+            npyv_f64 a6 = npyv_load_f64(in_ptr + lanes * 6);
+            npyv_f64 a7 = npyv_load_f64(in_ptr + lanes * 7);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 8
+        npyv_u8 packed = npyv_pack_isfinite_f64(a0, a1, a2, a3, a4, a5, a6, a7);
+        #elif PACK_FACTOR == 4
+        npyv_u8 packed = npyv_pack_isfinite_f64(a0, a1, a2, a3);
+        #endif
+
+        npyv_store_u8(out_ptr, packed);
+
+        len     -= batch;
+        in_ptr  += istride * batch;
+        out_ptr += ostride * batch;
+    }
+
+    // Pass 2: one vector per trip.
+    while (len >= lanes) {
+    #if NCONTIG != CONTIG
+        npyv_f64 a = npyv_loadn_f64(in_ptr, istride);
+    #else
+        npyv_f64 a = npyv_load_f64(in_ptr);
+    #endif
+
+        npyv_u64 res = npyv_isfinite_f64(a);
+
+        // View the 64-bit result lanes as bytes; result k is taken from
+        // byte offset k * sizeof(npyv_lanetype_f64).
+        npy_uint8 bytes[npyv_nlanes_u8];
+        npyv_store_u8(bytes, npyv_reinterpret_u8_u64(res));
+
+        out_ptr[0 * ostride] = bytes[0 * sizeof(npyv_lanetype_f64)];
+        out_ptr[1 * ostride] = bytes[1 * sizeof(npyv_lanetype_f64)];
+        #if npyv_nlanes_f64 == 4
+        out_ptr[2 * ostride] = bytes[2 * sizeof(npyv_lanetype_f64)];
+        out_ptr[3 * ostride] = bytes[3 * sizeof(npyv_lanetype_f64)];
+        #endif
+
+        len     -= lanes;
+        in_ptr  += istride * lanes;
+        out_ptr += ostride * lanes;
+    }
+
+    #undef PACK_FACTOR
+
+    // Pass 3: scalar remainder.
+    while (len > 0) {
+        *out_ptr = (npy_isfinite(*in_ptr) != 0);
+        --len;
+        in_ptr  += istride;
+        out_ptr += ostride;
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+// isfinite over doubles, CONTIG input / NCONTIG output instantiation.
+//
+// src, istride : input doubles; istride is counted in doubles.
+// dst, ostride : output npy_bool values; ostride is counted in npy_bool.
+// len          : number of elements to process.
+//
+// Three passes: PACK_FACTOR vectors per trip, then one vector per trip,
+// then a scalar loop for whatever is left.
+static void simd_unary_isfinite_DOUBLE_CONTIG_NCONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_f64 *ip = src;
+    npy_bool *op = dst;
+
+    // How many vectors can be packed into a u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f64)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * PACK_FACTOR;
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if CONTIG == CONTIG
+            // contiguous input
+            npyv_f64 v0 = npyv_load_f64(ip + vstep * 0);
+            npyv_f64 v1 = npyv_load_f64(ip + vstep * 1);
+            npyv_f64 v2 = npyv_load_f64(ip + vstep * 2);
+            npyv_f64 v3 = npyv_load_f64(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_load_f64(ip + vstep * 4);
+            npyv_f64 v5 = npyv_load_f64(ip + vstep * 5);
+            npyv_f64 v6 = npyv_load_f64(ip + vstep * 6);
+            npyv_f64 v7 = npyv_load_f64(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f64 v0 = npyv_loadn_f64(ip + istride * vstep * 0, istride);
+            npyv_f64 v1 = npyv_loadn_f64(ip + istride * vstep * 1, istride);
+            npyv_f64 v2 = npyv_loadn_f64(ip + istride * vstep * 2, istride);
+            npyv_f64 v3 = npyv_loadn_f64(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_loadn_f64(ip + istride * vstep * 4, istride);
+            npyv_f64 v5 = npyv_loadn_f64(ip + istride * vstep * 5, istride);
+            npyv_f64 v6 = npyv_loadn_f64(ip + istride * vstep * 6, istride);
+            npyv_f64 v7 = npyv_loadn_f64(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isfinite_f64(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isfinite_f64(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if NCONTIG == CONTIG
+            npyv_store_u8(op, r);
+        #else // NCONTIG == CONTIG
+            // The packed vector carries one result byte per input element
+            // (the contiguous arm stores it verbatim), so every byte has
+            // to be scattered to its own strided output slot.  Stepping the
+            // lane index by sizeof(double) here would write only an eighth
+            // of the outputs and take them from the wrong bytes.
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln = 0; ln < npyv_nlanes_u8; ++ln) {
+                op[ln * ostride] = lane[ln];
+            }
+        #endif // NCONTIG == CONTIG
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v = npyv_load_f64(ip);
+    #else
+        npyv_f64 v = npyv_loadn_f64(ip, istride);
+    #endif
+
+        npyv_u64 r = npyv_isfinite_f64(v);
+
+        // Unlike the unrolled loop, this result is NOT packed: it is a
+        // vector of 64-bit lanes viewed as bytes, so result k sits at byte
+        // offset k * sizeof(npyv_lanetype_f64).
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u64(r));
+
+        // NOTE(review): only 2- and 4-lane f64 vectors are scattered here;
+        // a wider vector would leave lanes unwritten - confirm build targets.
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f64)];
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f64)];
+        #if npyv_nlanes_f64 == 4
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f64)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f64)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isfinite(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+// isfinite over doubles, NCONTIG input / NCONTIG output instantiation.
+//
+// src, istride : input doubles; istride is counted in doubles.
+// dst, ostride : output npy_bool values; ostride is counted in npy_bool.
+// len          : number of elements to process.
+//
+// Three passes: PACK_FACTOR vectors per trip, then one vector per trip,
+// then a scalar loop for whatever is left.
+static void simd_unary_isfinite_DOUBLE_NCONTIG_NCONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_f64 *ip = src;
+    npy_bool *op = dst;
+
+    // How many vectors can be packed into a u8 / bool vector?
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f64)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_f64;
+    const int wstep = vstep * PACK_FACTOR;
+
+    // unrolled iterations
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if NCONTIG == CONTIG
+            // contiguous input
+            npyv_f64 v0 = npyv_load_f64(ip + vstep * 0);
+            npyv_f64 v1 = npyv_load_f64(ip + vstep * 1);
+            npyv_f64 v2 = npyv_load_f64(ip + vstep * 2);
+            npyv_f64 v3 = npyv_load_f64(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_load_f64(ip + vstep * 4);
+            npyv_f64 v5 = npyv_load_f64(ip + vstep * 5);
+            npyv_f64 v6 = npyv_load_f64(ip + vstep * 6);
+            npyv_f64 v7 = npyv_load_f64(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input
+            npyv_f64 v0 = npyv_loadn_f64(ip + istride * vstep * 0, istride);
+            npyv_f64 v1 = npyv_loadn_f64(ip + istride * vstep * 1, istride);
+            npyv_f64 v2 = npyv_loadn_f64(ip + istride * vstep * 2, istride);
+            npyv_f64 v3 = npyv_loadn_f64(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_loadn_f64(ip + istride * vstep * 4, istride);
+            npyv_f64 v5 = npyv_loadn_f64(ip + istride * vstep * 5, istride);
+            npyv_f64 v6 = npyv_loadn_f64(ip + istride * vstep * 6, istride);
+            npyv_f64 v7 = npyv_loadn_f64(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_isfinite_f64(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_isfinite_f64(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if NCONTIG == CONTIG
+            npyv_store_u8(op, r);
+        #else // NCONTIG == CONTIG
+            // The packed vector carries one result byte per input element
+            // (the contiguous arm stores it verbatim), so every byte has
+            // to be scattered to its own strided output slot.  Stepping the
+            // lane index by sizeof(double) here would write only an eighth
+            // of the outputs and take them from the wrong bytes.
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln = 0; ln < npyv_nlanes_u8; ++ln) {
+                op[ln * ostride] = lane[ln];
+            }
+        #endif // NCONTIG == CONTIG
+    }
+
+    // vector-sized iterations
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v = npyv_load_f64(ip);
+    #else
+        npyv_f64 v = npyv_loadn_f64(ip, istride);
+    #endif
+
+        npyv_u64 r = npyv_isfinite_f64(v);
+
+        // Unlike the unrolled loop, this result is NOT packed: it is a
+        // vector of 64-bit lanes viewed as bytes, so result k sits at byte
+        // offset k * sizeof(npyv_lanetype_f64).
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u64(r));
+
+        // NOTE(review): only 2- and 4-lane f64 vectors are scattered here;
+        // a wider vector would leave lanes unwritten - confirm build targets.
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f64)];
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f64)];
+        #if npyv_nlanes_f64 == 4
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f64)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f64)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop to finish off
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_isfinite(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+
+#line 410
+#line 414
+// signbit over doubles, CONTIG input / CONTIG output instantiation.
+//
+// src, dst : input doubles and output npy_bool buffers.
+// istride  : step between inputs, counted in doubles.  The vector loads
+//            read consecutive elements; istride only advances the pointer.
+// ostride  : step between outputs, counted in npy_bool elements.
+// len      : number of elements to process.
+static void simd_unary_signbit_DOUBLE_CONTIG_CONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{
+    const npyv_lanetype_f64 *in_ptr = src;
+    npy_bool *out_ptr = dst;
+
+    // Count of f64 vectors whose results fill one u8 vector.
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f64)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int lanes = npyv_nlanes_f64;
+    const int batch = lanes * PACK_FACTOR;
+
+    // Pass 1: PACK_FACTOR vectors per trip, written back as one u8 vector.
+    while (len >= batch) {
+        npyv_f64 a0 = npyv_load_f64(in_ptr + lanes * 0);
+        npyv_f64 a1 = npyv_load_f64(in_ptr + lanes * 1);
+        npyv_f64 a2 = npyv_load_f64(in_ptr + lanes * 2);
+        npyv_f64 a3 = npyv_load_f64(in_ptr + lanes * 3);
+        #if PACK_FACTOR == 8
+        npyv_f64 a4 = npyv_load_f64(in_ptr + lanes * 4);
+        npyv_f64 a5 = npyv_load_f64(in_ptr + lanes * 5);
+        npyv_f64 a6 = npyv_load_f64(in_ptr + lanes * 6);
+        npyv_f64 a7 = npyv_load_f64(in_ptr + lanes * 7);
+        npyv_u8 packed = npyv_pack_signbit_f64(a0, a1, a2, a3, a4, a5, a6, a7);
+        #elif PACK_FACTOR == 4
+        npyv_u8 packed = npyv_pack_signbit_f64(a0, a1, a2, a3);
+        #endif
+
+        npyv_store_u8(out_ptr, packed);
+
+        len     -= batch;
+        in_ptr  += istride * batch;
+        out_ptr += ostride * batch;
+    }
+
+    // Pass 2: one vector per trip.
+    while (len >= lanes) {
+        npyv_f64 a = npyv_load_f64(in_ptr);
+        npyv_u64 res = npyv_signbit_f64(a);
+
+        // View the 64-bit result lanes as bytes; result k is taken from
+        // byte offset k * sizeof(npyv_lanetype_f64).
+        npy_uint8 bytes[npyv_nlanes_u8];
+        npyv_store_u8(bytes, npyv_reinterpret_u8_u64(res));
+
+        out_ptr[0 * ostride] = bytes[0 * sizeof(npyv_lanetype_f64)];
+        out_ptr[1 * ostride] = bytes[1 * sizeof(npyv_lanetype_f64)];
+        #if npyv_nlanes_f64 == 4
+        out_ptr[2 * ostride] = bytes[2 * sizeof(npyv_lanetype_f64)];
+        out_ptr[3 * ostride] = bytes[3 * sizeof(npyv_lanetype_f64)];
+        #endif
+
+        len     -= lanes;
+        in_ptr  += istride * lanes;
+        out_ptr += ostride * lanes;
+    }
+
+    #undef PACK_FACTOR
+
+    // Pass 3: scalar remainder.
+    while (len > 0) {
+        *out_ptr = (npy_signbit(*in_ptr) != 0);
+        --len;
+        in_ptr  += istride;
+        out_ptr += ostride;
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+static void simd_unary_signbit_DOUBLE_NCONTIG_CONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{   // Stores (signbit != 0) of each f64 read from src as an npy_bool in dst; both strides count elements.
+    const npyv_lanetype_f64 *ip = src;
+    npy_bool *op = dst;
+
+    // Number of f64 vectors whose per-element results are packed into one u8 / bool vector.
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f64)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_f64;      // elements per f64 vector
+    const int wstep = vstep * PACK_FACTOR;  // elements consumed by one unrolled iteration
+
+    // Unrolled iterations: PACK_FACTOR input vectors produce one packed u8 vector of results.
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if NCONTIG == CONTIG  // NOTE(review): both macros are defined outside this view; presumably distinct, selecting #else
+            // contiguous input
+            npyv_f64 v0 = npyv_load_f64(ip + vstep * 0);
+            npyv_f64 v1 = npyv_load_f64(ip + vstep * 1);
+            npyv_f64 v2 = npyv_load_f64(ip + vstep * 2);
+            npyv_f64 v3 = npyv_load_f64(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_load_f64(ip + vstep * 4);
+            npyv_f64 v5 = npyv_load_f64(ip + vstep * 5);
+            npyv_f64 v6 = npyv_load_f64(ip + vstep * 6);
+            npyv_f64 v7 = npyv_load_f64(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input: each vector gathers vstep elements spaced istride apart
+            npyv_f64 v0 = npyv_loadn_f64(ip + istride * vstep * 0, istride);
+            npyv_f64 v1 = npyv_loadn_f64(ip + istride * vstep * 1, istride);
+            npyv_f64 v2 = npyv_loadn_f64(ip + istride * vstep * 2, istride);
+            npyv_f64 v3 = npyv_loadn_f64(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_loadn_f64(ip + istride * vstep * 4, istride);
+            npyv_f64 v5 = npyv_loadn_f64(ip + istride * vstep * 5, istride);
+            npyv_f64 v6 = npyv_loadn_f64(ip + istride * vstep * 6, istride);
+            npyv_f64 v7 = npyv_loadn_f64(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_signbit_f64(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_signbit_f64(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if CONTIG == CONTIG  // identical operands, so the contiguous store is always the one compiled
+            npyv_store_u8(op, r);  // every byte of r is written out as one consecutive output element
+        #else // CONTIG == CONTIG
+            // Strided-store fallback; never compiled here because the #if above always holds.
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln=0; (ln * sizeof(npyv_lanetype_f64)) < npyv_nlanes_u8; ++ln){
+                op[ln * ostride] = lane[ln * sizeof(npyv_lanetype_f64)];
+            }
+        #endif // CONTIG == CONTIG
+    }
+
+    // Vector-sized iterations: one f64 vector at a time for what the unrolled loop left over.
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v = npyv_load_f64(ip);
+    #else
+        npyv_f64 v = npyv_loadn_f64(ip, istride);
+    #endif
+
+        npyv_u64 r = npyv_signbit_f64(v);  // results stay unpacked: one 64-bit lane per element
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u64(r));  // spill to memory so single bytes can be picked out
+
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f64)];  // first stored byte of each 64-bit lane becomes the bool
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f64)];
+        #if npyv_nlanes_f64 == 4  // only 2-lane and 4-lane vectors are written out here
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f64)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f64)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop for the remaining (fewer than vstep) elements
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_signbit(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+static void simd_unary_signbit_DOUBLE_CONTIG_NCONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{   // Stores (signbit != 0) of each f64 read from src as an npy_bool in dst; both strides count elements.
+    const npyv_lanetype_f64 *ip = src;
+    npy_bool *op = dst;
+
+    // Number of f64 vectors whose per-element results are packed into one u8 / bool vector.
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f64)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_f64;      // elements per f64 vector
+    const int wstep = vstep * PACK_FACTOR;  // elements consumed by one unrolled iteration
+
+    // Unrolled iterations: PACK_FACTOR input vectors produce one packed u8 vector of results.
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if CONTIG == CONTIG  // identical operands, so the contiguous load is always the one compiled
+            // contiguous input
+            npyv_f64 v0 = npyv_load_f64(ip + vstep * 0);
+            npyv_f64 v1 = npyv_load_f64(ip + vstep * 1);
+            npyv_f64 v2 = npyv_load_f64(ip + vstep * 2);
+            npyv_f64 v3 = npyv_load_f64(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_load_f64(ip + vstep * 4);
+            npyv_f64 v5 = npyv_load_f64(ip + vstep * 5);
+            npyv_f64 v6 = npyv_load_f64(ip + vstep * 6);
+            npyv_f64 v7 = npyv_load_f64(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input (not compiled in this variant)
+            npyv_f64 v0 = npyv_loadn_f64(ip + istride * vstep * 0, istride);
+            npyv_f64 v1 = npyv_loadn_f64(ip + istride * vstep * 1, istride);
+            npyv_f64 v2 = npyv_loadn_f64(ip + istride * vstep * 2, istride);
+            npyv_f64 v3 = npyv_loadn_f64(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_loadn_f64(ip + istride * vstep * 4, istride);
+            npyv_f64 v5 = npyv_loadn_f64(ip + istride * vstep * 5, istride);
+            npyv_f64 v6 = npyv_loadn_f64(ip + istride * vstep * 6, istride);
+            npyv_f64 v7 = npyv_loadn_f64(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_signbit_f64(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_signbit_f64(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if NCONTIG == CONTIG  // NOTE(review): both macros are defined outside this view; presumably distinct, selecting #else
+            npyv_store_u8(op, r);
+        #else // NCONTIG == CONTIG
+            // r is packed (one result byte per element), so scatter every byte: wstep outputs, matching the op advance.
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln = 0; ln < npyv_nlanes_u8; ++ln) {
+                op[ln * ostride] = lane[ln];
+            }
+        #endif // NCONTIG == CONTIG
+    }
+
+    // Vector-sized iterations: one f64 vector at a time for what the unrolled loop left over.
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if CONTIG == CONTIG
+        npyv_f64 v = npyv_load_f64(ip);
+    #else
+        npyv_f64 v = npyv_loadn_f64(ip, istride);
+    #endif
+
+        npyv_u64 r = npyv_signbit_f64(v);  // results stay unpacked here: one 64-bit lane per element
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u64(r));  // spill to memory so single bytes can be picked out
+
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f64)];  // unpacked: step by the lane size to reach each result byte
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f64)];
+        #if npyv_nlanes_f64 == 4  // only 2-lane and 4-lane vectors are written out here
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f64)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f64)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop for the remaining (fewer than vstep) elements
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_signbit(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+#line 414
+static void simd_unary_signbit_DOUBLE_NCONTIG_NCONTIG
+(const void *src, npy_intp istride, void *dst, npy_intp ostride, npy_intp len)
+{   // Stores (signbit != 0) of each f64 read from src as an npy_bool in dst; both strides count elements.
+    const npyv_lanetype_f64 *ip = src;
+    npy_bool *op = dst;
+
+    // Number of f64 vectors whose per-element results are packed into one u8 / bool vector.
+    #define PACK_FACTOR (NPY_SIMD_WIDTH / npyv_nlanes_f64)
+    assert(PACK_FACTOR == 4 || PACK_FACTOR == 8);
+
+    const int vstep = npyv_nlanes_f64;      // elements per f64 vector
+    const int wstep = vstep * PACK_FACTOR;  // elements consumed by one unrolled iteration
+
+    // Unrolled iterations: PACK_FACTOR input vectors produce one packed u8 vector of results.
+    for (; len >= wstep; len -= wstep, ip += istride*wstep, op += ostride*wstep) {
+        // Load vectors
+        #if NCONTIG == CONTIG  // NOTE(review): both macros are defined outside this view; presumably distinct, selecting #else
+            // contiguous input
+            npyv_f64 v0 = npyv_load_f64(ip + vstep * 0);
+            npyv_f64 v1 = npyv_load_f64(ip + vstep * 1);
+            npyv_f64 v2 = npyv_load_f64(ip + vstep * 2);
+            npyv_f64 v3 = npyv_load_f64(ip + vstep * 3);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_load_f64(ip + vstep * 4);
+            npyv_f64 v5 = npyv_load_f64(ip + vstep * 5);
+            npyv_f64 v6 = npyv_load_f64(ip + vstep * 6);
+            npyv_f64 v7 = npyv_load_f64(ip + vstep * 7);
+            #endif
+        #else
+            // non-contiguous input: each vector gathers vstep elements spaced istride apart
+            npyv_f64 v0 = npyv_loadn_f64(ip + istride * vstep * 0, istride);
+            npyv_f64 v1 = npyv_loadn_f64(ip + istride * vstep * 1, istride);
+            npyv_f64 v2 = npyv_loadn_f64(ip + istride * vstep * 2, istride);
+            npyv_f64 v3 = npyv_loadn_f64(ip + istride * vstep * 3, istride);
+            #if PACK_FACTOR == 8
+            npyv_f64 v4 = npyv_loadn_f64(ip + istride * vstep * 4, istride);
+            npyv_f64 v5 = npyv_loadn_f64(ip + istride * vstep * 5, istride);
+            npyv_f64 v6 = npyv_loadn_f64(ip + istride * vstep * 6, istride);
+            npyv_f64 v7 = npyv_loadn_f64(ip + istride * vstep * 7, istride);
+            #endif
+        #endif
+
+        #if PACK_FACTOR == 4
+        npyv_u8 r = npyv_pack_signbit_f64(v0, v1, v2, v3);
+        #elif PACK_FACTOR == 8
+        npyv_u8 r = npyv_pack_signbit_f64(v0, v1, v2, v3, v4, v5, v6, v7);
+        #endif
+
+        #if NCONTIG == CONTIG
+            npyv_store_u8(op, r);
+        #else // NCONTIG == CONTIG
+            // r is packed (one result byte per element), so scatter every byte: wstep outputs, matching the op advance.
+            npy_uint8 lane[npyv_nlanes_u8];
+            npyv_store_u8(lane, r);
+            for (int ln = 0; ln < npyv_nlanes_u8; ++ln) {
+                op[ln * ostride] = lane[ln];
+            }
+        #endif // NCONTIG == CONTIG
+    }
+
+    // Vector-sized iterations: one f64 vector at a time for what the unrolled loop left over.
+    for (; len >= vstep; len -= vstep, ip += istride*vstep, op += ostride*vstep) {
+    #if NCONTIG == CONTIG
+        npyv_f64 v = npyv_load_f64(ip);
+    #else
+        npyv_f64 v = npyv_loadn_f64(ip, istride);
+    #endif
+
+        npyv_u64 r = npyv_signbit_f64(v);  // results stay unpacked here: one 64-bit lane per element
+
+        npy_uint8 lane[npyv_nlanes_u8];
+        npyv_store_u8(lane, npyv_reinterpret_u8_u64(r));  // spill to memory so single bytes can be picked out
+
+        op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_f64)];  // unpacked: step by the lane size to reach each result byte
+        op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_f64)];
+        #if npyv_nlanes_f64 == 4  // only 2-lane and 4-lane vectors are written out here
+        op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_f64)];
+        op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_f64)];
+        #endif
+    }
+
+    #undef PACK_FACTOR
+
+    // Scalar loop for the remaining (fewer than vstep) elements
+    for (; len > 0; --len, ip += istride, op += ostride) {
+        *op = (npy_signbit(*ip) != 0);
+    }
+
+    npyv_cleanup();
+}
+
+
+
+#endif // NPY_SIMD_F64
+
+
+/********************************************************************************
+ ** Defining ufunc inner functions
+ ********************************************************************************/
+#line 518
+
+#line 522
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_isnan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // Inner loop for isnan on f32: reads args[0] and writes one npy_bool per element to args[1].
+#if NPY_SIMD_F32
+    const char *ip = args[0];
+    char *op = args[1];
+    const npy_intp istep = steps[0];  // input step in bytes (converted to elements below)
+    const npy_intp ostep = steps[1];  // output step in bytes
+    npy_intp len = dimensions[0];     // number of elements
+    const int ilsize = sizeof(npyv_lanetype_f32);
+    const int olsize = sizeof(npy_bool);
+    const npy_intp istride = istep / ilsize;  // input stride in elements
+    const npy_intp ostride = ostep / olsize;  // output stride in elements
+    assert(len <= 1 || ostep % olsize == 0);
+    // SIMD path only for whole-element input steps, non-overlapping buffers and strides the npyv checks accept.
+    if ((istep % ilsize == 0) &&
+        !is_mem_overlap(ip, istep, op, ostep, len) &&
+        npyv_loadable_stride_f32(istride) &&
+        npyv_storable_stride_f32(ostride))
+    {
+        if (istride == 1 && ostride == 1) {  // pick the kernel specialised for this stride pattern
+            simd_unary_isnan_FLOAT_CONTIG_CONTIG(ip, 1, op, 1, len);
+        }
+        else if (ostride == 1) {
+            simd_unary_isnan_FLOAT_NCONTIG_CONTIG(ip, istride, op, 1, len);
+        }
+        else if (istride == 1) {
+            simd_unary_isnan_FLOAT_CONTIG_NCONTIG(ip, 1, op, ostride, len);
+        } else {
+            simd_unary_isnan_FLOAT_NCONTIG_NCONTIG(ip, istride, op, ostride, len);
+        }
+    } else
+#endif // NPY_SIMD_F32
+    {   // scalar fallback; the only path when NPY_SIMD_F32 is 0
+    UNARY_LOOP {  // macro defined elsewhere; presumably walks args/steps and provides ip1/op1
+        const npyv_lanetype_f32 in = *(npyv_lanetype_f32 *)ip1;
+        *((npy_bool *)op1) = (npy_isnan(in) != 0);
+    }
+    }
+
+    npy_clear_floatstatus_barrier((char*)dimensions);  // NOTE(review): presumably resets FP status flags; helper not shown here
+}
+
+#line 522
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // Inner loop for isinf on f32: reads args[0] and writes one npy_bool per element to args[1].
+#if NPY_SIMD_F32
+    const char *ip = args[0];
+    char *op = args[1];
+    const npy_intp istep = steps[0];  // input step in bytes (converted to elements below)
+    const npy_intp ostep = steps[1];  // output step in bytes
+    npy_intp len = dimensions[0];     // number of elements
+    const int ilsize = sizeof(npyv_lanetype_f32);
+    const int olsize = sizeof(npy_bool);
+    const npy_intp istride = istep / ilsize;  // input stride in elements
+    const npy_intp ostride = ostep / olsize;  // output stride in elements
+    assert(len <= 1 || ostep % olsize == 0);
+    // SIMD path only for whole-element input steps, non-overlapping buffers and strides the npyv checks accept.
+    if ((istep % ilsize == 0) &&
+        !is_mem_overlap(ip, istep, op, ostep, len) &&
+        npyv_loadable_stride_f32(istride) &&
+        npyv_storable_stride_f32(ostride))
+    {
+        if (istride == 1 && ostride == 1) {  // pick the kernel specialised for this stride pattern
+            simd_unary_isinf_FLOAT_CONTIG_CONTIG(ip, 1, op, 1, len);
+        }
+        else if (ostride == 1) {
+            simd_unary_isinf_FLOAT_NCONTIG_CONTIG(ip, istride, op, 1, len);
+        }
+        else if (istride == 1) {
+            simd_unary_isinf_FLOAT_CONTIG_NCONTIG(ip, 1, op, ostride, len);
+        } else {
+            simd_unary_isinf_FLOAT_NCONTIG_NCONTIG(ip, istride, op, ostride, len);
+        }
+    } else
+#endif // NPY_SIMD_F32
+    {   // scalar fallback; the only path when NPY_SIMD_F32 is 0
+    UNARY_LOOP {  // macro defined elsewhere; presumably walks args/steps and provides ip1/op1
+        const npyv_lanetype_f32 in = *(npyv_lanetype_f32 *)ip1;
+        *((npy_bool *)op1) = (npy_isinf(in) != 0);
+    }
+    }
+
+    npy_clear_floatstatus_barrier((char*)dimensions);  // NOTE(review): presumably resets FP status flags; helper not shown here
+}
+
+#line 522
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_isfinite)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // Inner loop for isfinite on f32: reads args[0] and writes one npy_bool per element to args[1].
+#if NPY_SIMD_F32
+    const char *ip = args[0];
+    char *op = args[1];
+    const npy_intp istep = steps[0];  // input step in bytes (converted to elements below)
+    const npy_intp ostep = steps[1];  // output step in bytes
+    npy_intp len = dimensions[0];     // number of elements
+    const int ilsize = sizeof(npyv_lanetype_f32);
+    const int olsize = sizeof(npy_bool);
+    const npy_intp istride = istep / ilsize;  // input stride in elements
+    const npy_intp ostride = ostep / olsize;  // output stride in elements
+    assert(len <= 1 || ostep % olsize == 0);
+    // SIMD path only for whole-element input steps, non-overlapping buffers and strides the npyv checks accept.
+    if ((istep % ilsize == 0) &&
+        !is_mem_overlap(ip, istep, op, ostep, len) &&
+        npyv_loadable_stride_f32(istride) &&
+        npyv_storable_stride_f32(ostride))
+    {
+        if (istride == 1 && ostride == 1) {  // pick the kernel specialised for this stride pattern
+            simd_unary_isfinite_FLOAT_CONTIG_CONTIG(ip, 1, op, 1, len);
+        }
+        else if (ostride == 1) {
+            simd_unary_isfinite_FLOAT_NCONTIG_CONTIG(ip, istride, op, 1, len);
+        }
+        else if (istride == 1) {
+            simd_unary_isfinite_FLOAT_CONTIG_NCONTIG(ip, 1, op, ostride, len);
+        } else {
+            simd_unary_isfinite_FLOAT_NCONTIG_NCONTIG(ip, istride, op, ostride, len);
+        }
+    } else
+#endif // NPY_SIMD_F32
+    {   // scalar fallback; the only path when NPY_SIMD_F32 is 0
+    UNARY_LOOP {  // macro defined elsewhere; presumably walks args/steps and provides ip1/op1
+        const npyv_lanetype_f32 in = *(npyv_lanetype_f32 *)ip1;
+        *((npy_bool *)op1) = (npy_isfinite(in) != 0);
+    }
+    }
+
+    npy_clear_floatstatus_barrier((char*)dimensions);  // NOTE(review): presumably resets FP status flags; helper not shown here
+}
+
+#line 522
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(FLOAT_signbit)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // Inner loop for signbit on f32: reads args[0] and writes one npy_bool per element to args[1].
+#if NPY_SIMD_F32
+    const char *ip = args[0];
+    char *op = args[1];
+    const npy_intp istep = steps[0];  // input step in bytes (converted to elements below)
+    const npy_intp ostep = steps[1];  // output step in bytes
+    npy_intp len = dimensions[0];     // number of elements
+    const int ilsize = sizeof(npyv_lanetype_f32);
+    const int olsize = sizeof(npy_bool);
+    const npy_intp istride = istep / ilsize;  // input stride in elements
+    const npy_intp ostride = ostep / olsize;  // output stride in elements
+    assert(len <= 1 || ostep % olsize == 0);
+    // SIMD path only for whole-element input steps, non-overlapping buffers and strides the npyv checks accept.
+    if ((istep % ilsize == 0) &&
+        !is_mem_overlap(ip, istep, op, ostep, len) &&
+        npyv_loadable_stride_f32(istride) &&
+        npyv_storable_stride_f32(ostride))
+    {
+        if (istride == 1 && ostride == 1) {  // pick the kernel specialised for this stride pattern
+            simd_unary_signbit_FLOAT_CONTIG_CONTIG(ip, 1, op, 1, len);
+        }
+        else if (ostride == 1) {
+            simd_unary_signbit_FLOAT_NCONTIG_CONTIG(ip, istride, op, 1, len);
+        }
+        else if (istride == 1) {
+            simd_unary_signbit_FLOAT_CONTIG_NCONTIG(ip, 1, op, ostride, len);
+        } else {
+            simd_unary_signbit_FLOAT_NCONTIG_NCONTIG(ip, istride, op, ostride, len);
+        }
+    } else
+#endif // NPY_SIMD_F32
+    {   // scalar fallback; the only path when NPY_SIMD_F32 is 0
+    UNARY_LOOP {  // macro defined elsewhere; presumably walks args/steps and provides ip1/op1
+        const npyv_lanetype_f32 in = *(npyv_lanetype_f32 *)ip1;
+        *((npy_bool *)op1) = (npy_signbit(in) != 0);
+    }
+    }
+
+    npy_clear_floatstatus_barrier((char*)dimensions);  // NOTE(review): presumably resets FP status flags; helper not shown here
+}
+
+
+#line 518
+
+#line 522
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_isnan)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // Inner loop for isnan on f64: reads args[0] and writes one npy_bool per element to args[1].
+#if NPY_SIMD_F64
+    const char *ip = args[0];
+    char *op = args[1];
+    const npy_intp istep = steps[0];  // input step in bytes (converted to elements below)
+    const npy_intp ostep = steps[1];  // output step in bytes
+    npy_intp len = dimensions[0];     // number of elements
+    const int ilsize = sizeof(npyv_lanetype_f64);
+    const int olsize = sizeof(npy_bool);
+    const npy_intp istride = istep / ilsize;  // input stride in elements
+    const npy_intp ostride = ostep / olsize;  // output stride in elements
+    assert(len <= 1 || ostep % olsize == 0);
+    // SIMD path only for whole-element input steps, non-overlapping buffers and strides the npyv checks accept.
+    if ((istep % ilsize == 0) &&
+        !is_mem_overlap(ip, istep, op, ostep, len) &&
+        npyv_loadable_stride_f64(istride) &&
+        npyv_storable_stride_f64(ostride))
+    {
+        if (istride == 1 && ostride == 1) {  // pick the kernel specialised for this stride pattern
+            simd_unary_isnan_DOUBLE_CONTIG_CONTIG(ip, 1, op, 1, len);
+        }
+        else if (ostride == 1) {
+            simd_unary_isnan_DOUBLE_NCONTIG_CONTIG(ip, istride, op, 1, len);
+        }
+        else if (istride == 1) {
+            simd_unary_isnan_DOUBLE_CONTIG_NCONTIG(ip, 1, op, ostride, len);
+        } else {
+            simd_unary_isnan_DOUBLE_NCONTIG_NCONTIG(ip, istride, op, ostride, len);
+        }
+    } else
+#endif // NPY_SIMD_F64
+    {   // scalar fallback; the only path when NPY_SIMD_F64 is 0
+    UNARY_LOOP {  // macro defined elsewhere; presumably walks args/steps and provides ip1/op1
+        const npyv_lanetype_f64 in = *(npyv_lanetype_f64 *)ip1;
+        *((npy_bool *)op1) = (npy_isnan(in) != 0);
+    }
+    }
+
+    npy_clear_floatstatus_barrier((char*)dimensions);  // NOTE(review): presumably resets FP status flags; helper not shown here
+}
+
+#line 522
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_isinf)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // Inner loop for isinf on f64: reads args[0] and writes one npy_bool per element to args[1].
+#if NPY_SIMD_F64
+    const char *ip = args[0];
+    char *op = args[1];
+    const npy_intp istep = steps[0];  // input step in bytes (converted to elements below)
+    const npy_intp ostep = steps[1];  // output step in bytes
+    npy_intp len = dimensions[0];     // number of elements
+    const int ilsize = sizeof(npyv_lanetype_f64);
+    const int olsize = sizeof(npy_bool);
+    const npy_intp istride = istep / ilsize;  // input stride in elements
+    const npy_intp ostride = ostep / olsize;  // output stride in elements
+    assert(len <= 1 || ostep % olsize == 0);
+    // SIMD path only for whole-element input steps, non-overlapping buffers and strides the npyv checks accept.
+    if ((istep % ilsize == 0) &&
+        !is_mem_overlap(ip, istep, op, ostep, len) &&
+        npyv_loadable_stride_f64(istride) &&
+        npyv_storable_stride_f64(ostride))
+    {
+        if (istride == 1 && ostride == 1) {  // pick the kernel specialised for this stride pattern
+            simd_unary_isinf_DOUBLE_CONTIG_CONTIG(ip, 1, op, 1, len);
+        }
+        else if (ostride == 1) {
+            simd_unary_isinf_DOUBLE_NCONTIG_CONTIG(ip, istride, op, 1, len);
+        }
+        else if (istride == 1) {
+            simd_unary_isinf_DOUBLE_CONTIG_NCONTIG(ip, 1, op, ostride, len);
+        } else {
+            simd_unary_isinf_DOUBLE_NCONTIG_NCONTIG(ip, istride, op, ostride, len);
+        }
+    } else
+#endif // NPY_SIMD_F64
+    {   // scalar fallback; the only path when NPY_SIMD_F64 is 0
+    UNARY_LOOP {  // macro defined elsewhere; presumably walks args/steps and provides ip1/op1
+        const npyv_lanetype_f64 in = *(npyv_lanetype_f64 *)ip1;
+        *((npy_bool *)op1) = (npy_isinf(in) != 0);
+    }
+    }
+
+    npy_clear_floatstatus_barrier((char*)dimensions);  // NOTE(review): presumably resets FP status flags; helper not shown here
+}
+
+#line 522
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_isfinite)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // Inner loop for isfinite on f64: reads args[0] and writes one npy_bool per element to args[1].
+#if NPY_SIMD_F64
+    const char *ip = args[0];
+    char *op = args[1];
+    const npy_intp istep = steps[0];  // input step in bytes (converted to elements below)
+    const npy_intp ostep = steps[1];  // output step in bytes
+    npy_intp len = dimensions[0];     // number of elements
+    const int ilsize = sizeof(npyv_lanetype_f64);
+    const int olsize = sizeof(npy_bool);
+    const npy_intp istride = istep / ilsize;  // input stride in elements
+    const npy_intp ostride = ostep / olsize;  // output stride in elements
+    assert(len <= 1 || ostep % olsize == 0);
+    // SIMD path only for whole-element input steps, non-overlapping buffers and strides the npyv checks accept.
+    if ((istep % ilsize == 0) &&
+        !is_mem_overlap(ip, istep, op, ostep, len) &&
+        npyv_loadable_stride_f64(istride) &&
+        npyv_storable_stride_f64(ostride))
+    {
+        if (istride == 1 && ostride == 1) {  // pick the kernel specialised for this stride pattern
+            simd_unary_isfinite_DOUBLE_CONTIG_CONTIG(ip, 1, op, 1, len);
+        }
+        else if (ostride == 1) {
+            simd_unary_isfinite_DOUBLE_NCONTIG_CONTIG(ip, istride, op, 1, len);
+        }
+        else if (istride == 1) {
+            simd_unary_isfinite_DOUBLE_CONTIG_NCONTIG(ip, 1, op, ostride, len);
+        } else {
+            simd_unary_isfinite_DOUBLE_NCONTIG_NCONTIG(ip, istride, op, ostride, len);
+        }
+    } else
+#endif // NPY_SIMD_F64
+    {   // scalar fallback; the only path when NPY_SIMD_F64 is 0
+    UNARY_LOOP {  // macro defined elsewhere; presumably walks args/steps and provides ip1/op1
+        const npyv_lanetype_f64 in = *(npyv_lanetype_f64 *)ip1;
+        *((npy_bool *)op1) = (npy_isfinite(in) != 0);
+    }
+    }
+
+    npy_clear_floatstatus_barrier((char*)dimensions);  // NOTE(review): presumably resets FP status flags; helper not shown here
+}
+
+#line 522
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_signbit)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   // Inner loop for signbit on f64: reads args[0] and writes one npy_bool per element to args[1].
+#if NPY_SIMD_F64
+    const char *ip = args[0];
+    char *op = args[1];
+    const npy_intp istep = steps[0];  // input step in bytes (converted to elements below)
+    const npy_intp ostep = steps[1];  // output step in bytes
+    npy_intp len = dimensions[0];     // number of elements
+    const int ilsize = sizeof(npyv_lanetype_f64);
+    const int olsize = sizeof(npy_bool);
+    const npy_intp istride = istep / ilsize;  // input stride in elements
+    const npy_intp ostride = ostep / olsize;  // output stride in elements
+    assert(len <= 1 || ostep % olsize == 0);
+    // SIMD path only for whole-element input steps, non-overlapping buffers and strides the npyv checks accept.
+    if ((istep % ilsize == 0) &&
+        !is_mem_overlap(ip, istep, op, ostep, len) &&
+        npyv_loadable_stride_f64(istride) &&
+        npyv_storable_stride_f64(ostride))
+    {
+        if (istride == 1 && ostride == 1) {  // pick the kernel specialised for this stride pattern
+            simd_unary_signbit_DOUBLE_CONTIG_CONTIG(ip, 1, op, 1, len);
+        }
+        else if (ostride == 1) {
+            simd_unary_signbit_DOUBLE_NCONTIG_CONTIG(ip, istride, op, 1, len);
+        }
+        else if (istride == 1) {
+            simd_unary_signbit_DOUBLE_CONTIG_NCONTIG(ip, 1, op, ostride, len);
+        } else {
+            simd_unary_signbit_DOUBLE_NCONTIG_NCONTIG(ip, istride, op, ostride, len);
+        }
+    } else
+#endif // NPY_SIMD_F64
+    {   // scalar fallback; the only path when NPY_SIMD_F64 is 0
+    UNARY_LOOP {  // macro defined elsewhere; presumably walks args/steps and provides ip1/op1
+        const npyv_lanetype_f64 in = *(npyv_lanetype_f64 *)ip1;
+        *((npy_bool *)op1) = (npy_signbit(in) != 0);
+    }
+    }
+
+    npy_clear_floatstatus_barrier((char*)dimensions);  // NOTE(review): presumably resets FP status flags; helper not shown here
+}
+
+
+
diff --git a/numpy/core/src/_generated/loops_utils.h b/numpy/core/src/_generated/loops_utils.h
new file mode 100644
index 000000000000..acda6a76704a
--- /dev/null
+++ b/numpy/core/src/_generated/loops_utils.h
@@ -0,0 +1,597 @@
+#line 1 "numpy/core/src/umath/loops_utils.h.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+#ifndef _NPY_UMATH_LOOPS_UTILS_H_
+#define _NPY_UMATH_LOOPS_UTILS_H_
+
+#include "numpy/npy_common.h" // NPY_FINLINE
+#include "numpy/halffloat.h" // npy_half_to_float
+
+/**
+ * Old versions of MSVC cause ambiguous link errors when dealing with large SIMD kernels,
+ * which breaks the build; this is probably related to the following bug:
+ * https://developercommunity.visualstudio.com/content/problem/415095/internal-compiler-error-with-perfectly-forwarded-r.html
+ */
+#if defined(_MSC_VER) && _MSC_VER < 1916
+    #define SIMD_MSVC_NOINLINE __declspec(noinline)
+#else /* any other compiler or MSVC version: the macro expands to nothing */
+    #define SIMD_MSVC_NOINLINE
+#endif
+/*
+ * nomemoverlap - returns true when the address ranges spanned by two strided
+ * arrays are exactly identical or lie strictly apart, and false otherwise.
+ * ip_size/op_size are signed extents; a negative one means a negative step.
+ */
+NPY_FINLINE npy_bool
+nomemoverlap(char *ip, npy_intp ip_size, char *op, npy_intp op_size)
+{
+    /* order each (base, signed extent) pair into a low/high address pair */
+    char *const in_lo = (ip_size < 0) ? ip + ip_size : ip;
+    char *const in_hi = (ip_size < 0) ? ip : ip + ip_size;
+    char *const out_lo = (op_size < 0) ? op + op_size : op;
+    char *const out_hi = (op_size < 0) ? op : op + op_size;
+
+    /* both operands cover exactly the same range */
+    if (in_lo == out_lo && out_hi == in_hi) {
+        return 1;
+    }
+    /* the input range starts above the end of the output range */
+    if (in_lo > out_hi) {
+        return 1;
+    }
+    /* the output range starts above the end of the input range */
+    if (out_lo > in_hi) {
+        return 1;
+    }
+    return 0;
+}
+
+// negation of `nomemoverlap()`; takes per-element steps and a length instead of signed extents
+NPY_FINLINE npy_bool
+is_mem_overlap(const void *src, npy_intp src_step, const void *dst, npy_intp dst_step, npy_intp len)
+{
+    const npy_intp src_extent = src_step*len, dst_extent = dst_step*len;
+    return nomemoverlap((char*)src, src_extent, (char*)dst, dst_extent) == 0;
+}
+
+/*
+ * Cutoff block size for pairwise summation.
+ * Decreasing it decreases errors slightly, as more pairs are summed, but
+ * also lowers performance. Because the inner loop is unrolled eight times,
+ * the effective block size is 16.
+ */
+#define PW_BLOCKSIZE    128
+
+#line 71
+
+/*
+ * Pairwise summation of the n npy_float values read at a + i * stride (a is a byte pointer).
+ * Rounding error is O(lg n) instead of O(n); the recursion depth is O(lg n) as well.
+ * When updating, also update the similar complex floats summation.
+ */
+static inline npy_float
+FLOAT_pairwise_sum(char *a, npy_intp n, npy_intp stride)
+{
+    if (n < 8) { /* fewer values than accumulators: plain sequential sum */
+        npy_intp i;
+        /*
+         * Start with -0 to preserve -0 values.  The reason is that summing
+         * only -0 should return -0, but `0 + -0 == 0` while `-0 + -0 == -0`.
+         */
+        npy_float res = -0.0;
+
+        for (i = 0; i < n; i++) {
+            res += (*((npy_float*)(a + i * stride)));
+        }
+        return res;
+    }
+    else if (n <= PW_BLOCKSIZE) { /* a single block: 8-way unrolled sum */
+        npy_intp i;
+        npy_float r[8], res;
+
+        /*
+         * Sum a block with 8 accumulators.
+         * Unrolling 8 times reduces the block size to 16 and allows vectorization
+         * with AVX without changing the summation ordering.
+         */
+        r[0] = (*((npy_float *)(a + 0 * stride)));
+        r[1] = (*((npy_float *)(a + 1 * stride)));
+        r[2] = (*((npy_float *)(a + 2 * stride)));
+        r[3] = (*((npy_float *)(a + 3 * stride)));
+        r[4] = (*((npy_float *)(a + 4 * stride)));
+        r[5] = (*((npy_float *)(a + 5 * stride)));
+        r[6] = (*((npy_float *)(a + 6 * stride)));
+        r[7] = (*((npy_float *)(a + 7 * stride)));
+
+        for (i = 8; i < n - (n % 8); i += 8) { /* remaining full groups of 8 */
+            /* small block sizes seem to interfere with hardware prefetch */
+            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(npy_float))*stride, 0, 3);
+            r[0] += (*((npy_float *)(a + (i + 0) * stride)));
+            r[1] += (*((npy_float *)(a + (i + 1) * stride)));
+            r[2] += (*((npy_float *)(a + (i + 2) * stride)));
+            r[3] += (*((npy_float *)(a + (i + 3) * stride)));
+            r[4] += (*((npy_float *)(a + (i + 4) * stride)));
+            r[5] += (*((npy_float *)(a + (i + 5) * stride)));
+            r[6] += (*((npy_float *)(a + (i + 6) * stride)));
+            r[7] += (*((npy_float *)(a + (i + 7) * stride)));
+        }
+
+        /* accumulate now to avoid stack spills for single peel loop */
+        res = ((r[0] + r[1]) + (r[2] + r[3])) +
+              ((r[4] + r[5]) + (r[6] + r[7]));
+
+        /* add the remaining n % 8 values that did not fill a group of 8 */
+        for (; i < n; i++) {
+            res += (*((npy_float *)(a + i * stride)));
+        }
+        return res;
+    }
+    else {
+        /* divide by two but keep the first half a multiple of the unroll factor */
+        npy_intp n2 = n / 2;
+
+        n2 -= n2 % 8;
+        return FLOAT_pairwise_sum(a, n2, stride) +
+               FLOAT_pairwise_sum(a + n2 * stride, n - n2, stride);
+    }
+}
+
+
+#line 71
+
+/*
+ * Pairwise summation of the n npy_double values read at a + i * stride (a is a byte pointer).
+ * Rounding error is O(lg n) instead of O(n); the recursion depth is O(lg n) as well.
+ * When updating, also update the similar complex floats summation.
+ */
+static inline npy_double
+DOUBLE_pairwise_sum(char *a, npy_intp n, npy_intp stride)
+{
+    if (n < 8) { /* fewer values than accumulators: plain sequential sum */
+        npy_intp i;
+        /*
+         * Start with -0 to preserve -0 values.  The reason is that summing
+         * only -0 should return -0, but `0 + -0 == 0` while `-0 + -0 == -0`.
+         */
+        npy_double res = -0.0;
+
+        for (i = 0; i < n; i++) {
+            res += (*((npy_double*)(a + i * stride)));
+        }
+        return res;
+    }
+    else if (n <= PW_BLOCKSIZE) { /* a single block: 8-way unrolled sum */
+        npy_intp i;
+        npy_double r[8], res;
+
+        /*
+         * Sum a block with 8 accumulators.
+         * Unrolling 8 times reduces the block size to 16 and allows vectorization
+         * with AVX without changing the summation ordering.
+         */
+        r[0] = (*((npy_double *)(a + 0 * stride)));
+        r[1] = (*((npy_double *)(a + 1 * stride)));
+        r[2] = (*((npy_double *)(a + 2 * stride)));
+        r[3] = (*((npy_double *)(a + 3 * stride)));
+        r[4] = (*((npy_double *)(a + 4 * stride)));
+        r[5] = (*((npy_double *)(a + 5 * stride)));
+        r[6] = (*((npy_double *)(a + 6 * stride)));
+        r[7] = (*((npy_double *)(a + 7 * stride)));
+
+        for (i = 8; i < n - (n % 8); i += 8) { /* remaining full groups of 8 */
+            /* small block sizes seem to interfere with hardware prefetch */
+            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(npy_double))*stride, 0, 3);
+            r[0] += (*((npy_double *)(a + (i + 0) * stride)));
+            r[1] += (*((npy_double *)(a + (i + 1) * stride)));
+            r[2] += (*((npy_double *)(a + (i + 2) * stride)));
+            r[3] += (*((npy_double *)(a + (i + 3) * stride)));
+            r[4] += (*((npy_double *)(a + (i + 4) * stride)));
+            r[5] += (*((npy_double *)(a + (i + 5) * stride)));
+            r[6] += (*((npy_double *)(a + (i + 6) * stride)));
+            r[7] += (*((npy_double *)(a + (i + 7) * stride)));
+        }
+
+        /* accumulate now to avoid stack spills for single peel loop */
+        res = ((r[0] + r[1]) + (r[2] + r[3])) +
+              ((r[4] + r[5]) + (r[6] + r[7]));
+
+        /* add the remaining n % 8 values that did not fill a group of 8 */
+        for (; i < n; i++) {
+            res += (*((npy_double *)(a + i * stride)));
+        }
+        return res;
+    }
+    else {
+        /* divide by two but keep the first half a multiple of the unroll factor */
+        npy_intp n2 = n / 2;
+
+        n2 -= n2 % 8;
+        return DOUBLE_pairwise_sum(a, n2, stride) +
+               DOUBLE_pairwise_sum(a + n2 * stride, n - n2, stride);
+    }
+}
+
+
+#line 71
+
+/*
+ * Pairwise summation of the n npy_longdouble values read at a + i * stride (a is a byte pointer).
+ * Rounding error is O(lg n) instead of O(n); the recursion depth is O(lg n) as well.
+ * When updating, also update the similar complex floats summation.
+ */
+static inline npy_longdouble
+LONGDOUBLE_pairwise_sum(char *a, npy_intp n, npy_intp stride)
+{
+    if (n < 8) { /* fewer values than accumulators: plain sequential sum */
+        npy_intp i;
+        /*
+         * Start with -0 to preserve -0 values.  The reason is that summing
+         * only -0 should return -0, but `0 + -0 == 0` while `-0 + -0 == -0`.
+         */
+        npy_longdouble res = -0.0;
+
+        for (i = 0; i < n; i++) {
+            res += (*((npy_longdouble*)(a + i * stride)));
+        }
+        return res;
+    }
+    else if (n <= PW_BLOCKSIZE) { /* a single block: 8-way unrolled sum */
+        npy_intp i;
+        npy_longdouble r[8], res;
+
+        /*
+         * Sum a block with 8 accumulators.
+         * Unrolling 8 times reduces the block size to 16 and allows vectorization
+         * with AVX without changing the summation ordering.
+         */
+        r[0] = (*((npy_longdouble *)(a + 0 * stride)));
+        r[1] = (*((npy_longdouble *)(a + 1 * stride)));
+        r[2] = (*((npy_longdouble *)(a + 2 * stride)));
+        r[3] = (*((npy_longdouble *)(a + 3 * stride)));
+        r[4] = (*((npy_longdouble *)(a + 4 * stride)));
+        r[5] = (*((npy_longdouble *)(a + 5 * stride)));
+        r[6] = (*((npy_longdouble *)(a + 6 * stride)));
+        r[7] = (*((npy_longdouble *)(a + 7 * stride)));
+
+        for (i = 8; i < n - (n % 8); i += 8) { /* remaining full groups of 8 */
+            /* small block sizes seem to interfere with hardware prefetch */
+            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(npy_longdouble))*stride, 0, 3);
+            r[0] += (*((npy_longdouble *)(a + (i + 0) * stride)));
+            r[1] += (*((npy_longdouble *)(a + (i + 1) * stride)));
+            r[2] += (*((npy_longdouble *)(a + (i + 2) * stride)));
+            r[3] += (*((npy_longdouble *)(a + (i + 3) * stride)));
+            r[4] += (*((npy_longdouble *)(a + (i + 4) * stride)));
+            r[5] += (*((npy_longdouble *)(a + (i + 5) * stride)));
+            r[6] += (*((npy_longdouble *)(a + (i + 6) * stride)));
+            r[7] += (*((npy_longdouble *)(a + (i + 7) * stride)));
+        }
+
+        /* accumulate now to avoid stack spills for single peel loop */
+        res = ((r[0] + r[1]) + (r[2] + r[3])) +
+              ((r[4] + r[5]) + (r[6] + r[7]));
+
+        /* add the remaining n % 8 values that did not fill a group of 8 */
+        for (; i < n; i++) {
+            res += (*((npy_longdouble *)(a + i * stride)));
+        }
+        return res;
+    }
+    else {
+        /* divide by two but keep the first half a multiple of the unroll factor */
+        npy_intp n2 = n / 2;
+
+        n2 -= n2 % 8;
+        return LONGDOUBLE_pairwise_sum(a, n2, stride) +
+               LONGDOUBLE_pairwise_sum(a + n2 * stride, n - n2, stride);
+    }
+}
+
+
+#line 71
+
+/*
+ * Pairwise summation of the n npy_half values read at a + i * stride (a is a byte pointer); each
+ * value is converted with npy_half_to_float and accumulated (and returned) as npy_float.
+ * Rounding error and recursion depth are O(lg n). When updating, also update the complex summation.
+ */
+static inline npy_float
+HALF_pairwise_sum(char *a, npy_intp n, npy_intp stride)
+{
+    if (n < 8) { /* fewer values than accumulators: plain sequential sum */
+        npy_intp i;
+        /*
+         * Start with -0 to preserve -0 values.  The reason is that summing
+         * only -0 should return -0, but `0 + -0 == 0` while `-0 + -0 == -0`.
+         */
+        npy_float res = -0.0;
+
+        for (i = 0; i < n; i++) {
+            res += npy_half_to_float(*((npy_half*)(a + i * stride)));
+        }
+        return res;
+    }
+    else if (n <= PW_BLOCKSIZE) { /* a single block: 8-way unrolled sum */
+        npy_intp i;
+        npy_float r[8], res;
+
+        /*
+         * Sum a block with 8 accumulators.
+         * Unrolling 8 times reduces the block size to 16 and allows vectorization
+         * with AVX without changing the summation ordering.
+         */
+        r[0] = npy_half_to_float(*((npy_half *)(a + 0 * stride)));
+        r[1] = npy_half_to_float(*((npy_half *)(a + 1 * stride)));
+        r[2] = npy_half_to_float(*((npy_half *)(a + 2 * stride)));
+        r[3] = npy_half_to_float(*((npy_half *)(a + 3 * stride)));
+        r[4] = npy_half_to_float(*((npy_half *)(a + 4 * stride)));
+        r[5] = npy_half_to_float(*((npy_half *)(a + 5 * stride)));
+        r[6] = npy_half_to_float(*((npy_half *)(a + 6 * stride)));
+        r[7] = npy_half_to_float(*((npy_half *)(a + 7 * stride)));
+
+        for (i = 8; i < n - (n % 8); i += 8) { /* remaining full groups of 8 */
+            /* small block sizes seem to interfere with hardware prefetch */
+            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(npy_half))*stride, 0, 3);
+            r[0] += npy_half_to_float(*((npy_half *)(a + (i + 0) * stride)));
+            r[1] += npy_half_to_float(*((npy_half *)(a + (i + 1) * stride)));
+            r[2] += npy_half_to_float(*((npy_half *)(a + (i + 2) * stride)));
+            r[3] += npy_half_to_float(*((npy_half *)(a + (i + 3) * stride)));
+            r[4] += npy_half_to_float(*((npy_half *)(a + (i + 4) * stride)));
+            r[5] += npy_half_to_float(*((npy_half *)(a + (i + 5) * stride)));
+            r[6] += npy_half_to_float(*((npy_half *)(a + (i + 6) * stride)));
+            r[7] += npy_half_to_float(*((npy_half *)(a + (i + 7) * stride)));
+        }
+
+        /* accumulate now to avoid stack spills for single peel loop */
+        res = ((r[0] + r[1]) + (r[2] + r[3])) +
+              ((r[4] + r[5]) + (r[6] + r[7]));
+
+        /* add the remaining n % 8 values that did not fill a group of 8 */
+        for (; i < n; i++) {
+            res += npy_half_to_float(*((npy_half *)(a + i * stride)));
+        }
+        return res;
+    }
+    else {
+        /* divide by two but keep the first half a multiple of the unroll factor */
+        npy_intp n2 = n / 2;
+
+        n2 -= n2 % 8;
+        return HALF_pairwise_sum(a, n2, stride) +
+               HALF_pairwise_sum(a + n2 * stride, n - n2, stride);
+    }
+}
+
+
+
+#line 154
+/* Complex pairwise sum, similar to the real one: real parts go to *rr, imaginary parts to *ri. */
+static inline void
+CFLOAT_pairwise_sum(npy_float *rr, npy_float * ri, char * a, npy_intp n,
+                    npy_intp stride)
+{
+    assert(n % 2 == 0); /* every loop below advances i by 2 per (real, imag) pair */
+    if (n < 8) {
+        npy_intp i;
+        /* start from -0 so that summing only -0 values yields -0 */
+        *rr = -0.0;
+        *ri = -0.0;
+        for (i = 0; i < n; i += 2) {
+            *rr += *((npy_float *)(a + i * stride + 0));
+            *ri += *((npy_float *)(a + i * stride + sizeof(npy_float)));
+        }
+        return;
+    }
+    else if (n <= PW_BLOCKSIZE) {
+        npy_intp i;
+        npy_float r[8]; /* even slots hold real partial sums, odd slots imaginary ones */
+
+        /*
+         * Sum a block with 8 accumulators.
+         * Unrolling 8 times reduces the block size to 16 and allows vectorization
+         * with AVX without changing the summation ordering.
+         */
+        r[0] = *((npy_float *)(a + 0 * stride));
+        r[1] = *((npy_float *)(a + 0 * stride + sizeof(npy_float)));
+        r[2] = *((npy_float *)(a + 2 * stride));
+        r[3] = *((npy_float *)(a + 2 * stride + sizeof(npy_float)));
+        r[4] = *((npy_float *)(a + 4 * stride));
+        r[5] = *((npy_float *)(a + 4 * stride + sizeof(npy_float)));
+        r[6] = *((npy_float *)(a + 6 * stride));
+        r[7] = *((npy_float *)(a + 6 * stride + sizeof(npy_float)));
+
+        for (i = 8; i < n - (n % 8); i += 8) {
+            /* small block sizes seem to interfere with hardware prefetch */
+            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(npy_float))*stride, 0, 3);
+            r[0] += *((npy_float *)(a + (i + 0) * stride));
+            r[1] += *((npy_float *)(a + (i + 0) * stride + sizeof(npy_float)));
+            r[2] += *((npy_float *)(a + (i + 2) * stride));
+            r[3] += *((npy_float *)(a + (i + 2) * stride + sizeof(npy_float)));
+            r[4] += *((npy_float *)(a + (i + 4) * stride));
+            r[5] += *((npy_float *)(a + (i + 4) * stride + sizeof(npy_float)));
+            r[6] += *((npy_float *)(a + (i + 6) * stride));
+            r[7] += *((npy_float *)(a + (i + 6) * stride + sizeof(npy_float)));
+        }
+
+        /* accumulate now to avoid stack spills for single peel loop */
+        *rr = ((r[0] + r[2]) + (r[4] + r[6]));
+        *ri = ((r[1] + r[3]) + (r[5] + r[7]));
+
+        /* add the remaining pairs that did not fill a group of 8 */
+        for (; i < n; i+=2) {
+            *rr += *((npy_float *)(a + i * stride + 0));
+            *ri += *((npy_float *)(a + i * stride + sizeof(npy_float)));
+        }
+        return;
+    }
+    else {
+        /* divide by two but keep the first half a multiple of the unroll factor */
+        npy_float rr1, ri1, rr2, ri2;
+        npy_intp n2 = n / 2;
+
+        n2 -= n2 % 8;
+        CFLOAT_pairwise_sum(&rr1, &ri1, a, n2, stride);
+        CFLOAT_pairwise_sum(&rr2, &ri2, a + n2 * stride, n - n2, stride);
+        *rr = rr1 + rr2;
+        *ri = ri1 + ri2;
+        return;
+    }
+}
+
+#line 154
+/* Complex pairwise sum, similar to the real one: real parts go to *rr, imaginary parts to *ri. */
+static inline void
+CDOUBLE_pairwise_sum(npy_double *rr, npy_double * ri, char * a, npy_intp n,
+                    npy_intp stride)
+{
+    assert(n % 2 == 0); /* every loop below advances i by 2 per (real, imag) pair */
+    if (n < 8) {
+        npy_intp i;
+        /* start from -0 so that summing only -0 values yields -0 */
+        *rr = -0.0;
+        *ri = -0.0;
+        for (i = 0; i < n; i += 2) {
+            *rr += *((npy_double *)(a + i * stride + 0));
+            *ri += *((npy_double *)(a + i * stride + sizeof(npy_double)));
+        }
+        return;
+    }
+    else if (n <= PW_BLOCKSIZE) {
+        npy_intp i;
+        npy_double r[8]; /* even slots hold real partial sums, odd slots imaginary ones */
+
+        /*
+         * Sum a block with 8 accumulators.
+         * Unrolling 8 times reduces the block size to 16 and allows vectorization
+         * with AVX without changing the summation ordering.
+         */
+        r[0] = *((npy_double *)(a + 0 * stride));
+        r[1] = *((npy_double *)(a + 0 * stride + sizeof(npy_double)));
+        r[2] = *((npy_double *)(a + 2 * stride));
+        r[3] = *((npy_double *)(a + 2 * stride + sizeof(npy_double)));
+        r[4] = *((npy_double *)(a + 4 * stride));
+        r[5] = *((npy_double *)(a + 4 * stride + sizeof(npy_double)));
+        r[6] = *((npy_double *)(a + 6 * stride));
+        r[7] = *((npy_double *)(a + 6 * stride + sizeof(npy_double)));
+
+        for (i = 8; i < n - (n % 8); i += 8) {
+            /* small block sizes seem to interfere with hardware prefetch */
+            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(npy_double))*stride, 0, 3);
+            r[0] += *((npy_double *)(a + (i + 0) * stride));
+            r[1] += *((npy_double *)(a + (i + 0) * stride + sizeof(npy_double)));
+            r[2] += *((npy_double *)(a + (i + 2) * stride));
+            r[3] += *((npy_double *)(a + (i + 2) * stride + sizeof(npy_double)));
+            r[4] += *((npy_double *)(a + (i + 4) * stride));
+            r[5] += *((npy_double *)(a + (i + 4) * stride + sizeof(npy_double)));
+            r[6] += *((npy_double *)(a + (i + 6) * stride));
+            r[7] += *((npy_double *)(a + (i + 6) * stride + sizeof(npy_double)));
+        }
+
+        /* accumulate now to avoid stack spills for single peel loop */
+        *rr = ((r[0] + r[2]) + (r[4] + r[6]));
+        *ri = ((r[1] + r[3]) + (r[5] + r[7]));
+
+        /* add the remaining pairs that did not fill a group of 8 */
+        for (; i < n; i+=2) {
+            *rr += *((npy_double *)(a + i * stride + 0));
+            *ri += *((npy_double *)(a + i * stride + sizeof(npy_double)));
+        }
+        return;
+    }
+    else {
+        /* divide by two but keep the first half a multiple of the unroll factor */
+        npy_double rr1, ri1, rr2, ri2;
+        npy_intp n2 = n / 2;
+
+        n2 -= n2 % 8;
+        CDOUBLE_pairwise_sum(&rr1, &ri1, a, n2, stride);
+        CDOUBLE_pairwise_sum(&rr2, &ri2, a + n2 * stride, n - n2, stride);
+        *rr = rr1 + rr2;
+        *ri = ri1 + ri2;
+        return;
+    }
+}
+
+#line 154
+/* Complex pairwise sum, similar to the real one: real parts go to *rr, imaginary parts to *ri. */
+static inline void
+CLONGDOUBLE_pairwise_sum(npy_longdouble *rr, npy_longdouble * ri, char * a, npy_intp n,
+                    npy_intp stride)
+{
+    assert(n % 2 == 0); /* every loop below advances i by 2 per (real, imag) pair */
+    if (n < 8) {
+        npy_intp i;
+        /* start from -0 so that summing only -0 values yields -0 */
+        *rr = -0.0;
+        *ri = -0.0;
+        for (i = 0; i < n; i += 2) {
+            *rr += *((npy_longdouble *)(a + i * stride + 0));
+            *ri += *((npy_longdouble *)(a + i * stride + sizeof(npy_longdouble)));
+        }
+        return;
+    }
+    else if (n <= PW_BLOCKSIZE) {
+        npy_intp i;
+        npy_longdouble r[8]; /* even slots hold real partial sums, odd slots imaginary ones */
+
+        /*
+         * Sum a block with 8 accumulators.
+         * Unrolling 8 times reduces the block size to 16 and allows vectorization
+         * with AVX without changing the summation ordering.
+         */
+        r[0] = *((npy_longdouble *)(a + 0 * stride));
+        r[1] = *((npy_longdouble *)(a + 0 * stride + sizeof(npy_longdouble)));
+        r[2] = *((npy_longdouble *)(a + 2 * stride));
+        r[3] = *((npy_longdouble *)(a + 2 * stride + sizeof(npy_longdouble)));
+        r[4] = *((npy_longdouble *)(a + 4 * stride));
+        r[5] = *((npy_longdouble *)(a + 4 * stride + sizeof(npy_longdouble)));
+        r[6] = *((npy_longdouble *)(a + 6 * stride));
+        r[7] = *((npy_longdouble *)(a + 6 * stride + sizeof(npy_longdouble)));
+
+        for (i = 8; i < n - (n % 8); i += 8) {
+            /* small block sizes seem to interfere with hardware prefetch */
+            NPY_PREFETCH(a + (i + 512/(npy_intp)sizeof(npy_longdouble))*stride, 0, 3);
+            r[0] += *((npy_longdouble *)(a + (i + 0) * stride));
+            r[1] += *((npy_longdouble *)(a + (i + 0) * stride + sizeof(npy_longdouble)));
+            r[2] += *((npy_longdouble *)(a + (i + 2) * stride));
+            r[3] += *((npy_longdouble *)(a + (i + 2) * stride + sizeof(npy_longdouble)));
+            r[4] += *((npy_longdouble *)(a + (i + 4) * stride));
+            r[5] += *((npy_longdouble *)(a + (i + 4) * stride + sizeof(npy_longdouble)));
+            r[6] += *((npy_longdouble *)(a + (i + 6) * stride));
+            r[7] += *((npy_longdouble *)(a + (i + 6) * stride + sizeof(npy_longdouble)));
+        }
+
+        /* accumulate now to avoid stack spills for single peel loop */
+        *rr = ((r[0] + r[2]) + (r[4] + r[6]));
+        *ri = ((r[1] + r[3]) + (r[5] + r[7]));
+
+        /* add the remaining pairs that did not fill a group of 8 */
+        for (; i < n; i+=2) {
+            *rr += *((npy_longdouble *)(a + i * stride + 0));
+            *ri += *((npy_longdouble *)(a + i * stride + sizeof(npy_longdouble)));
+        }
+        return;
+    }
+    else {
+        /* divide by two but keep the first half a multiple of the unroll factor */
+        npy_longdouble rr1, ri1, rr2, ri2;
+        npy_intp n2 = n / 2;
+
+        n2 -= n2 % 8;
+        CLONGDOUBLE_pairwise_sum(&rr1, &ri1, a, n2, stride);
+        CLONGDOUBLE_pairwise_sum(&rr2, &ri2, a + n2 * stride, n - n2, stride);
+        *rr = rr1 + rr2;
+        *ri = ri1 + ri2;
+        return;
+    }
+}
+
+
+#endif // _NPY_UMATH_LOOPS_UTILS_H_
+
diff --git a/numpy/core/src/_generated/lowlevel_strided_loops.c b/numpy/core/src/_generated/lowlevel_strided_loops.c
new file mode 100644
index 000000000000..4dd3622ee1f3
--- /dev/null
+++ b/numpy/core/src/_generated/lowlevel_strided_loops.c
@@ -0,0 +1,254977 @@
+#line 1 "numpy/core/src/multiarray/lowlevel_strided_loops.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*
+ * This file contains low-level loops for copying and byte-swapping
+ * strided data.
+ *
+ * Copyright (c) 2010 by Mark Wiebe (mwwiebe@gmail.com)
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#define _MULTIARRAYMODULE
+#define _UMATHMODULE
+#include <numpy/arrayobject.h>
+#include <numpy/npy_cpu.h>
+#include <numpy/halffloat.h>
+
+#include "lowlevel_strided_loops.h"
+#include "array_assign.h"
+#include "array_method.h"
+#include "usertypes.h"
+
+#include "umathmodule.h"
+
+/*
+ * x86 platforms work with unaligned access, but the C standard allows the
+ * compiler to assume all data is aligned to its size. This means it can
+ * vectorize instructions peeling only by the size of the type; if the data is
+ * not aligned to this size, one ends up with data not correctly aligned for SSE
+ * instructions (16 byte).
+ * So this flag can only be enabled if autovectorization is disabled.
+ */
+#if NPY_ALIGNMENT_REQUIRED
+#  define NPY_USE_UNALIGNED_ACCESS 0
+#else
+#  define NPY_USE_UNALIGNED_ACCESS 0 /* NOTE(review): same value as the branch above, so the flag is always 0 */
+#endif
+
+#define _NPY_NOP1(x) (x) /* _NPY_NOP<N>: identity, the argument is passed through unchanged */
+#define _NPY_NOP2(x) (x)
+#define _NPY_NOP4(x) (x)
+#define _NPY_NOP8(x) (x)
+
+#define _NPY_SWAP2(x) npy_bswap2(x) /* forwards to npy_bswap2, which is defined elsewhere */
+
+#define _NPY_SWAP4(x) npy_bswap4(x) /* forwards to npy_bswap4, which is defined elsewhere */
+/* Swap the two bytes inside each 16-bit half of a 32-bit value; x is evaluated four times. */
+#define _NPY_SWAP_PAIR4(x) (((((npy_uint32)(x))&0xffu) << 8) | \
+                       ((((npy_uint32)(x))&0xff00u) >> 8) | \
+                       ((((npy_uint32)(x))&0xff0000u) << 8) | \
+                       ((((npy_uint32)(x))&0xff000000u) >> 8))
+
+#define _NPY_SWAP8(x) npy_bswap8(x)
+/* Reverse the four bytes inside each 32-bit half of a 64-bit value; x is evaluated eight times. */
+#define _NPY_SWAP_PAIR8(x) (((((npy_uint64)(x))&0xffULL) << 24) | \
+                       ((((npy_uint64)(x))&0xff00ULL) << 8) | \
+                       ((((npy_uint64)(x))&0xff0000ULL) >> 8) | \
+                       ((((npy_uint64)(x))&0xff000000ULL) >> 24) | \
+                       ((((npy_uint64)(x))&0xff00000000ULL) << 24) | \
+                       ((((npy_uint64)(x))&0xff0000000000ULL) << 8) | \
+                       ((((npy_uint64)(x))&0xff000000000000ULL) >> 8) | \
+                       ((((npy_uint64)(x))&0xff00000000000000ULL) >> 24))
+
+#define _NPY_SWAP_INPLACE2(x) npy_bswap2_unaligned(x) /* forwards to a helper defined elsewhere */
+
+#define _NPY_SWAP_INPLACE4(x) npy_bswap4_unaligned(x) /* forwards to a helper defined elsewhere */
+
+#define _NPY_SWAP_INPLACE8(x) npy_bswap8_unaligned(x) /* forwards to a helper defined elsewhere */
+/* Reverse 16 bytes in place by exchanging (x)[k] with (x)[15-k]; expands to a bare { } block. */
+#define _NPY_SWAP_INPLACE16(x) { \
+        char a = (x)[0]; (x)[0] = (x)[15]; (x)[15] = a; \
+        a = (x)[1]; (x)[1] = (x)[14]; (x)[14] = a; \
+        a = (x)[2]; (x)[2] = (x)[13]; (x)[13] = a; \
+        a = (x)[3]; (x)[3] = (x)[12]; (x)[12] = a; \
+        a = (x)[4]; (x)[4] = (x)[11]; (x)[11] = a; \
+        a = (x)[5]; (x)[5] = (x)[10]; (x)[10] = a; \
+        a = (x)[6]; (x)[6] = (x)[9]; (x)[9] = a; \
+        a = (x)[7]; (x)[7] = (x)[8]; (x)[8] = a; \
+        }
+
+/************* STRIDED COPYING/SWAPPING SPECIALIZED FUNCTIONS *************/
+
+#line 91
+#line 97
+#line 105
+
+#if (1 >= 1) && \
+    (1 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 0 == 0 || 0 == 0
+/*
+ * Unaligned 1-byte strided copy; the enclosing `(1 > 1 || 0)` guard is false, so it is compiled out.
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction; if not it
+ * can decrease performance (tested on intel xeon 5x/7x, core2duo, amd phenom x4).
+ */
+static int
+#if 0 && 0 == 0 && 1 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_strided_to_strided_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];             /* number of elements to copy */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0];
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+    /*printf("fn _strided_to_strided_size1\n");*/
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap */
+#  if 1 != 16
+        (*((npy_uint8 *)dst)) = _NPY_NOP1(*((npy_uint8 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap: this is the branch selected by the `#if 0` above */
+        memmove(dst, src, 1);
+#  if 0 == 1
+        _NPY_NOP1(dst);
+#  elif 0 == 2
+        _NPY_NOP0(dst);
+        _NPY_NOP0(dst + 0);
+#  endif
+
+#endif
+
+#if 0
+        dst += 1;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 1;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;   /* the only return statement: always 0 */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the single source element is
+ * read once and stored dimensions[0] times. Interestingly, unrolling here as done above is
+ * only marginally profitable for small types and detrimental for >= 8 byte moves on x86,
+ * but the loop profits from the vectorization enabled with -O3.
+ */
+#if (0 == 0) && 0 /* evaluates to false: this unaligned variant is not compiled */
+static NPY_GCC_OPT_3 int
+_strided_to_strided_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];             /* number of elements to store */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1 != 16
+#  if !(1 == 1 && 0)
+    npy_uint8 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 1 != 16
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+#if 1 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 1 != 16
+    temp = _NPY_NOP1(*((npy_uint8 *)src));  /* read the source element once */
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 1 != 16
+        *((npy_uint8 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 1;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 1 >= 1 */
+
+
+#line 105
+
+#if (1 >= 1) && \
+    (1 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 0 == 0 || 0 == 0
+/*
+ * Aligned 1-byte strided copy: moves dimensions[0] elements from args[0] to args[1]; returns 0.
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction; if not it
+ * can decrease performance (tested on intel xeon 5x/7x, core2duo, amd phenom x4).
+ */
+static int
+#if 1 && 0 == 0 && 1 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_strided_to_strided_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];             /* number of elements to copy */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0];
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+    /*printf("fn _aligned_strided_to_strided_size1\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap: for this size the swap is _NPY_NOP1, a plain byte copy */
+#  if 1 != 16
+        (*((npy_uint8 *)dst)) = _NPY_NOP1(*((npy_uint8 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+#  if 0 == 1
+        _NPY_NOP1(dst);
+#  elif 0 == 2
+        _NPY_NOP0(dst);
+        _NPY_NOP0(dst + 0);
+#  endif
+
+#endif
+
+#if 0
+        dst += 1;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 1;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;   /* the only return statement: always 0 */
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here (like above) is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ */
+#if (0 == 0) && 1
+/*
+ * Source-stride-0 variant: loads one byte from args[0] a single time and
+ * stores it into dimensions[0] destination slots that are strides[1] bytes
+ * apart.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_strided_to_strided_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp dst_step = strides[1];
+    npy_intp N = dimensions[0];
+    npy_uint8 value;
+
+    /* no elements: leave before src is dereferenced */
+    if (N == 0) {
+        return 0;
+    }
+
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+
+    /* read the repeated element once, outside the store loop */
+    value = _NPY_NOP1(*((npy_uint8 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint8 *)dst) = value;
+        dst += dst_step;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 1 >= 1 */
+
+
+#line 105
+
+#if (1 >= 2) && \
+    (1 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 0 == 0 || 0 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+/*
+ * Unaligned variant: for each of dimensions[0] elements, memmove one byte
+ * from src to dst and then apply _NPY_SWAP_INPLACE1 to the destination.
+ * src advances by strides[0] and dst by strides[1] bytes.  Always returns 0.
+ *
+ * The enclosing `#if (1 >= 2) && ...` guard is constant-false, so this
+ * definition is never compiled.
+ */
+static int
+_swap_strided_to_strided_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    npy_intp N = dimensions[0];
+
+    for (; N > 0; --N) {
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+        _NPY_SWAP_INPLACE1(dst);
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here (like above) is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ */
+#if (0 == 0) && 0
+/*
+ * Source-stride-0 variant: passes the byte at args[0] through
+ * _NPY_SWAP_INPLACE1 once and stores the result into dimensions[0]
+ * destination slots strides[1] bytes apart.  Always returns 0.
+ *
+ * The surrounding `#if (0 == 0) && 0` guard is constant-false, so this
+ * definition is never compiled.
+ */
+static NPY_GCC_OPT_3 int
+_swap_strided_to_strided_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp dst_step = strides[1];
+    npy_intp N = dimensions[0];
+    npy_uint8 value;
+
+    if (N == 0) {
+        return 0;
+    }
+
+    value = _NPY_SWAP_INPLACE1(*((npy_uint8 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint8 *)dst) = value;
+        dst += dst_step;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 1 >= 2 */
+
+
+#line 105
+
+#if (1 >= 2) && \
+    (1 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 0 == 0 || 0 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+/*
+ * Aligned variant: for each of dimensions[0] elements, stores
+ * _NPY_SWAP1(byte at src) into dst.  src advances by strides[0] and dst by
+ * strides[1] bytes.  Always returns 0.
+ *
+ * The enclosing `#if (1 >= 2) && ...` guard is constant-false, so this
+ * definition is never compiled.
+ */
+static int
+_aligned_swap_strided_to_strided_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    npy_intp N = dimensions[0];
+
+    /* alignment is only required when there is at least one element */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+
+    for (; N > 0; --N) {
+        /* aligned copy and swap */
+        (*((npy_uint8 *)dst)) = _NPY_SWAP1(*((npy_uint8 *)src));
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here (like above) is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ */
+#if (0 == 0) && 1
+/*
+ * Source-stride-0 variant: computes _NPY_SWAP1 of the byte at args[0] once
+ * and stores it into dimensions[0] destination slots strides[1] bytes apart.
+ * Always returns 0.
+ *
+ * The enclosing `#if (1 >= 2) && ...` guard is constant-false, so this
+ * definition is never compiled.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_swap_strided_to_strided_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp dst_step = strides[1];
+    npy_intp N = dimensions[0];
+    npy_uint8 value;
+
+    /* no elements: leave before src is dereferenced */
+    if (N == 0) {
+        return 0;
+    }
+
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+
+    value = _NPY_SWAP1(*((npy_uint8 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint8 *)dst) = value;
+        dst += dst_step;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 1 >= 2 */
+
+
+#line 105
+
+#if (1 >= 4) && \
+    (1 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 0 == 0 || 0 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+/*
+ * Unaligned pair-swap variant: for each of dimensions[0] elements, memmove
+ * one byte from src to dst and then apply _NPY_SWAP_INPLACE0 to dst and to
+ * dst + 0.  src advances by strides[0] and dst by strides[1] bytes.
+ * Always returns 0.
+ *
+ * The enclosing `#if (1 >= 4) && ...` guard is constant-false, so this
+ * definition is never compiled.
+ */
+static int
+_swap_pair_strided_to_strided_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    npy_intp N = dimensions[0];
+
+    for (; N > 0; --N) {
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+        _NPY_SWAP_INPLACE0(dst);
+        _NPY_SWAP_INPLACE0(dst + 0);
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here (like above) is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ */
+#if (0 == 0) && 0
+/*
+ * Source-stride-0 variant: passes the byte at args[0] through
+ * _NPY_SWAP_INPLACE1 once and stores the result into dimensions[0]
+ * destination slots strides[1] bytes apart.  Always returns 0.
+ *
+ * The surrounding `#if (0 == 0) && 0` guard is constant-false, so this
+ * definition is never compiled.
+ */
+static NPY_GCC_OPT_3 int
+_swap_pair_strided_to_strided_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp dst_step = strides[1];
+    npy_intp N = dimensions[0];
+    npy_uint8 value;
+
+    if (N == 0) {
+        return 0;
+    }
+
+    value = _NPY_SWAP_INPLACE1(*((npy_uint8 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint8 *)dst) = value;
+        dst += dst_step;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 1 >= 4 */
+
+
+#line 105
+
+#if (1 >= 4) && \
+    (1 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 0 == 0 || 0 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+/*
+ * Aligned pair-swap variant: for each of dimensions[0] elements, stores
+ * _NPY_SWAP_PAIR1(byte at src) into dst.  src advances by strides[0] and
+ * dst by strides[1] bytes.  Always returns 0.
+ *
+ * The enclosing `#if (1 >= 4) && ...` guard is constant-false, so this
+ * definition is never compiled.
+ */
+static int
+_aligned_swap_pair_strided_to_strided_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    npy_intp N = dimensions[0];
+
+    /* alignment is only required when there is at least one element */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+
+    for (; N > 0; --N) {
+        /* aligned copy and swap */
+        (*((npy_uint8 *)dst)) = _NPY_SWAP_PAIR1(*((npy_uint8 *)src));
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here (like above) is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ */
+#if (0 == 0) && 1
+/*
+ * Source-stride-0 variant: computes _NPY_SWAP_PAIR1 of the byte at args[0]
+ * once and stores it into dimensions[0] destination slots strides[1] bytes
+ * apart.  Always returns 0.
+ *
+ * The enclosing `#if (1 >= 4) && ...` guard is constant-false, so this
+ * definition is never compiled.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_strided_to_strided_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp dst_step = strides[1];
+    npy_intp N = dimensions[0];
+    npy_uint8 value;
+
+    /* no elements: leave before src is dereferenced */
+    if (N == 0) {
+        return 0;
+    }
+
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+
+    value = _NPY_SWAP_PAIR1(*((npy_uint8 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint8 *)dst) = value;
+        dst += dst_step;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 1 >= 4 */
+
+
+
+#line 97
+#line 105
+
+#if (1 >= 1) && \
+    (1 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 0 == 0 || 1 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+/*
+ * Unaligned strided-to-contiguous copy: memmoves dimensions[0] single bytes
+ * from src to dst.  src advances by strides[0] bytes and dst by exactly one
+ * byte per element; strides[1] is not read.  Always returns 0.
+ *
+ * The enclosing guard contains the constant-false term `(1 > 1 || 0)`, so
+ * this definition is never compiled.
+ */
+static int
+_strided_to_contig_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    npy_intp N = dimensions[0];
+
+    for (; N > 0; --N) {
+        /* unaligned copy, no swap */
+        memmove(dst, src, 1);
+        dst += 1;
+        src += src_step;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here (like above) is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ */
+#if (0 == 0) && 0
+/*
+ * Source-stride-0, contiguous-destination variant: fills dimensions[0]
+ * bytes starting at args[1] with the byte at args[0] using memset.
+ * strides is not read.  Always returns 0.
+ *
+ * The surrounding `#if (0 == 0) && 0` guard is constant-false, so this
+ * definition is never compiled.
+ */
+static NPY_GCC_OPT_3 int
+_strided_to_contig_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* no elements: leave before src is dereferenced */
+    if (N == 0) {
+        return 0;
+    }
+
+    memset(dst, *src, N);
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 1 >= 1 */
+
+
+#line 105
+
+#if (1 >= 1) && \
+    (1 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 0 == 0 || 1 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+/*
+ * Aligned strided-to-contiguous copy of dimensions[0] one-byte elements
+ * from args[0] to args[1].  src advances by strides[0] bytes and dst by
+ * exactly one byte per element; strides[1] is not read.  Always returns 0.
+ */
+static int
+#if 1 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_strided_to_contig_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    npy_intp N = dimensions[0];
+
+    /* alignment is only required when there is at least one element */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+
+    for (; N > 0; --N) {
+        /* aligned single-byte load/store */
+        (*((npy_uint8 *)dst)) = _NPY_NOP1(*((npy_uint8 *)src));
+        dst += 1;
+        src += src_step;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here (like above) is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ */
+#if (0 == 0) && 1
+/*
+ * Source-stride-0, contiguous-destination variant: fills dimensions[0]
+ * bytes starting at args[1] with the byte at args[0] using memset.
+ * strides is not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_strided_to_contig_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* no elements: leave before src is dereferenced */
+    if (N == 0) {
+        return 0;
+    }
+
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+
+    memset(dst, *src, N);
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 1 >= 1 */
+
+
+#line 105
+
+#if (1 >= 2) && \
+    (1 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 0 == 0 || 1 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+/*
+ * Unaligned strided-to-contiguous variant: for each of dimensions[0]
+ * elements, memmove one byte from src to dst and then apply
+ * _NPY_SWAP_INPLACE1 to the destination.  src advances by strides[0] bytes
+ * and dst by exactly one byte; strides[1] is not read.  Always returns 0.
+ *
+ * The enclosing `#if (1 >= 2) && ...` guard is constant-false, so this
+ * definition is never compiled.
+ */
+static int
+_swap_strided_to_contig_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    npy_intp N = dimensions[0];
+
+    for (; N > 0; --N) {
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+        _NPY_SWAP_INPLACE1(dst);
+        dst += 1;
+        src += src_step;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here (like above) is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ */
+#if (0 == 0) && 0
+/*
+ * Source-stride-0, contiguous-destination variant: fills dimensions[0]
+ * bytes starting at args[1] with the byte at args[0] using memset.
+ * strides is not read.  Always returns 0.
+ *
+ * The surrounding `#if (0 == 0) && 0` guard is constant-false, so this
+ * definition is never compiled.
+ */
+static NPY_GCC_OPT_3 int
+_swap_strided_to_contig_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* no elements: leave before src is dereferenced */
+    if (N == 0) {
+        return 0;
+    }
+
+    memset(dst, *src, N);
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 1 >= 2 */
+
+
+#line 105
+
+#if (1 >= 2) && \
+    (1 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 0 == 0 || 1 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+/*
+ * Aligned strided-to-contiguous variant: for each of dimensions[0]
+ * elements, stores _NPY_SWAP1(byte at src) into dst.  src advances by
+ * strides[0] bytes and dst by exactly one byte; strides[1] is not read.
+ * Always returns 0.
+ *
+ * The enclosing `#if (1 >= 2) && ...` guard is constant-false, so this
+ * definition is never compiled.
+ */
+static int
+_aligned_swap_strided_to_contig_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    npy_intp N = dimensions[0];
+
+    /* alignment is only required when there is at least one element */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+
+    for (; N > 0; --N) {
+        /* aligned copy and swap */
+        (*((npy_uint8 *)dst)) = _NPY_SWAP1(*((npy_uint8 *)src));
+        dst += 1;
+        src += src_step;
+    }
+    return 0;
+}
+#endif
+
+
+/* Source-stride-0 fill: the one item at args[0] is written to dimensions[0]
+ * destination slots (specialized copy and swap for source stride 0).
+ * Interestingly, unrolling here is, like above, only marginally profitable for
+ * small types and detrimental for >= 8 byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ * NOTE: the enclosing "#if (1 >= 2) && ..." is false, so this is not compiled. */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_strided_to_contig_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of destination items */
+    char *src = args[0], *dst = args[1];  /* the single input item, write cursor */
+#if !1  /* false: dst_stride is not needed by the selected path */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1 != 16
+#  if !(1 == 1 && 1)  /* false: the memset path needs no temporary */
+    npy_uint8 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* nothing to write */
+        return 0;
+    }
+#if 1 && 1 != 16  /* true: alignment asserts are selected */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+#if 1 == 1 && 1  /* true: fill N bytes with *src via memset */
+    memset(dst, *src, N);
+#else
+
+#  if 1 != 16
+    temp = _NPY_SWAP1(*((npy_uint8 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 1 != 16
+        *((npy_uint8 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 1;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 1 >= 2 */
+
+
+#line 105
+
+#if (1 >= 4) && \
+    (1 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 0 == 0 || 1 == 0
+/* Loop over dimensions[0] one-byte items, reading args[0] and writing args[1].
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ * NOTE: the enclosing "#if (1 >= 4) && ..." is false, so this is not compiled. */
+static int
+#if 0 && 2 == 0 && 1 <= NPY_SIZEOF_INTP  /* false: NPY_GCC_UNROLL_LOOPS is not applied */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_pair_strided_to_contig_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* items left to process */
+    char *src = args[0], *dst = args[1];  /* read cursor, write cursor */
+#if !0  /* true: src advances by strides[0] */
+    npy_intp src_stride = strides[0];
+#endif
+#if !1  /* false: dst advances by a constant 1 instead */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0  /* false: alignment asserts are skipped */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+    /*printf("fn _swap_pair_strided_to_contig_size1\n");*/
+    while (N > 0) {
+#if 0  /* false: the #else (memmove) branch is selected */
+
+        /* aligned copy and swap */
+#  if 1 != 16
+        (*((npy_uint8 *)dst)) = _NPY_SWAP_INPLACE1(*((npy_uint8 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+#  if 2 == 1
+        _NPY_SWAP_INPLACE1(dst);
+#  elif 2 == 2
+        _NPY_SWAP_INPLACE0(dst);
+        _NPY_SWAP_INPLACE0(dst + 0);
+#  endif
+
+#endif
+
+#if 1  /* true: constant 1-byte dst step */
+        dst += 1;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0  /* false: src steps by src_stride */
+        src += 1;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* only exit path */
+}
+#endif
+
+
+/* Source-stride-0 fill: the one item at args[0] is written to dimensions[0]
+ * destination slots (specialized copy and swap for source stride 0).
+ * Interestingly, unrolling here is, like above, only marginally profitable for
+ * small types and detrimental for >= 8 byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ * NOTE: the "#if (0 == 0) && 0" guard below is false, so this is not compiled. */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_pair_strided_to_contig_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of destination items */
+    char *src = args[0], *dst = args[1];  /* the single input item, write cursor */
+#if !1  /* false: dst_stride is not needed by the selected path */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1 != 16
+#  if !(1 == 1 && 1)  /* false: the memset path needs no temporary */
+    npy_uint8 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* nothing to write */
+        return 0;
+    }
+#if 0 && 1 != 16  /* false: alignment asserts are skipped */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+#if 1 == 1 && 1  /* true: fill N bytes with *src via memset */
+    memset(dst, *src, N);
+#else
+
+#  if 1 != 16
+    temp = _NPY_SWAP_INPLACE1(*((npy_uint8 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 1 != 16
+        *((npy_uint8 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 1;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 1 >= 4 */
+
+
+#line 105
+
+#if (1 >= 4) && \
+    (1 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 0 == 0 || 1 == 0
+/* Loop over dimensions[0] one-byte items, reading args[0] and writing args[1].
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ * NOTE: the enclosing "#if (1 >= 4) && ..." is false, so this is not compiled. */
+static int
+#if 1 && 2 == 0 && 1 <= NPY_SIZEOF_INTP  /* false: NPY_GCC_UNROLL_LOOPS is not applied */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_pair_strided_to_contig_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* items left to process */
+    char *src = args[0], *dst = args[1];  /* read cursor, write cursor */
+#if !0  /* true: src advances by strides[0] */
+    npy_intp src_stride = strides[0];
+#endif
+#if !1  /* false: dst advances by a constant 1 instead */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1  /* true: alignment asserts are selected */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+    /*printf("fn _aligned_swap_pair_strided_to_contig_size1\n");*/
+    while (N > 0) {
+#if 1  /* true: the aligned branch is selected */
+
+        /* aligned copy and swap */
+#  if 1 != 16
+        (*((npy_uint8 *)dst)) = _NPY_SWAP_PAIR1(*((npy_uint8 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+#  if 2 == 1
+        _NPY_SWAP_PAIR1(dst);
+#  elif 2 == 2
+        _NPY_SWAP_PAIR0(dst);
+        _NPY_SWAP_PAIR0(dst + 0);
+#  endif
+
+#endif
+
+#if 1  /* true: constant 1-byte dst step */
+        dst += 1;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0  /* false: src steps by src_stride */
+        src += 1;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* only exit path */
+}
+#endif
+
+
+/* Source-stride-0 fill: the one item at args[0] is written to dimensions[0]
+ * destination slots (specialized copy and swap for source stride 0).
+ * Interestingly, unrolling here is, like above, only marginally profitable for
+ * small types and detrimental for >= 8 byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ * NOTE: the enclosing "#if (1 >= 4) && ..." is false, so this is not compiled. */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_strided_to_contig_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of destination items */
+    char *src = args[0], *dst = args[1];  /* the single input item, write cursor */
+#if !1  /* false: dst_stride is not needed by the selected path */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1 != 16
+#  if !(1 == 1 && 1)  /* false: the memset path needs no temporary */
+    npy_uint8 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* nothing to write */
+        return 0;
+    }
+#if 1 && 1 != 16  /* true: alignment asserts are selected */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+#if 1 == 1 && 1  /* true: fill N bytes with *src via memset */
+    memset(dst, *src, N);
+#else
+
+#  if 1 != 16
+    temp = _NPY_SWAP_PAIR1(*((npy_uint8 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 1 != 16
+        *((npy_uint8 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 1;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 1 >= 4 */
+
+
+
+#line 97
+#line 105
+
+#if (1 >= 1) && \
+    (1 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 1 == 0 || 0 == 0
+/* Loop over dimensions[0] one-byte items, reading args[0] and writing args[1].
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ * NOTE: the enclosing "#if (1 >= 1) && (1 > 1 || 0) && ..." is false: not compiled. */
+static int
+#if 0 && 0 == 0 && 1 <= NPY_SIZEOF_INTP  /* false: NPY_GCC_UNROLL_LOOPS is not applied */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_contig_to_strided_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* items left to process */
+    char *src = args[0], *dst = args[1];  /* read cursor, write cursor */
+#if !1  /* false: src advances by a constant 1 instead */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0  /* true: dst advances by strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0  /* false: alignment asserts are skipped */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+    /*printf("fn _contig_to_strided_size1\n");*/
+    while (N > 0) {
+#if 0  /* false: the #else (memmove) branch is selected */
+
+        /* aligned copy and swap */
+#  if 1 != 16
+        (*((npy_uint8 *)dst)) = _NPY_NOP1(*((npy_uint8 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+#  if 0 == 1
+        _NPY_NOP1(dst);
+#  elif 0 == 2
+        _NPY_NOP0(dst);
+        _NPY_NOP0(dst + 0);
+#  endif
+
+#endif
+
+#if 0  /* false: dst steps by dst_stride */
+        dst += 1;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1  /* true: constant 1-byte src step */
+        src += 1;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* only exit path */
+}
+#endif
+
+
+/* Source-stride-0 fill: the one item at args[0] is written to dimensions[0]
+ * destination slots (specialized copy and swap for source stride 0).
+ * Interestingly, unrolling here is, like above, only marginally profitable for
+ * small types and detrimental for >= 8 byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ * NOTE: the "#if (1 == 0) && 0" guard below is false, so this is not compiled. */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int
+_contig_to_strided_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of destination items */
+    char *src = args[0], *dst = args[1];  /* the single input item, write cursor */
+#if !0  /* true: dst advances by strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1 != 16
+#  if !(1 == 1 && 0)  /* true: temp caches the item for the store loop */
+    npy_uint8 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* nothing to write */
+        return 0;
+    }
+#if 0 && 1 != 16  /* false: alignment asserts are skipped */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+#if 1 == 1 && 0  /* false: the #else store loop is selected */
+    memset(dst, *src, N);
+#else
+
+#  if 1 != 16
+    temp = _NPY_NOP1(*((npy_uint8 *)src));  /* source item is read once */
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 1 != 16
+        *((npy_uint8 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0  /* false: dst steps by dst_stride */
+        dst += 1;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 1 >= 1 */
+
+
+#line 105
+
+#if (1 >= 1) && \
+    (1 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 1 == 0 || 0 == 0
+/* Loop over dimensions[0] one-byte items, reading args[0] and writing args[1].
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ * NOTE: the enclosing "#if (1 >= 1) && (1 > 1 || 1) && ..." holds: compiled. */
+static int
+#if 1 && 0 == 0 && 1 <= NPY_SIZEOF_INTP  /* true whenever NPY_SIZEOF_INTP >= 1 (defined outside this view) */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_contig_to_strided_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* items left to process */
+    char *src = args[0], *dst = args[1];  /* read cursor, write cursor */
+#if !1  /* false: src advances by a constant 1 instead */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0  /* true: dst advances by strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1  /* true: alignment asserts are compiled */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+    /*printf("fn _aligned_contig_to_strided_size1\n");*/
+    while (N > 0) {
+#if 1  /* true: the aligned branch is compiled */
+
+        /* aligned copy and swap */
+#  if 1 != 16  /* true: one npy_uint8 load/store through _NPY_NOP1 */
+        (*((npy_uint8 *)dst)) = _NPY_NOP1(*((npy_uint8 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+#  if 0 == 1
+        _NPY_NOP1(dst);
+#  elif 0 == 2
+        _NPY_NOP0(dst);
+        _NPY_NOP0(dst + 0);
+#  endif
+
+#endif
+
+#if 0  /* false: dst steps by dst_stride */
+        dst += 1;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1  /* true: constant 1-byte src step */
+        src += 1;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* only exit path */
+}
+#endif
+
+
+/* Source-stride-0 fill: the one item at args[0] is written to dimensions[0]
+ * destination slots (specialized copy and swap for source stride 0).
+ * Interestingly, unrolling here is, like above, only marginally profitable for
+ * small types and detrimental for >= 8 byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ * NOTE: the "#if (1 == 0) && 1" guard below is false, so this is not compiled. */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_contig_to_strided_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of destination items */
+    char *src = args[0], *dst = args[1];  /* the single input item, write cursor */
+#if !0  /* true: dst advances by strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1 != 16
+#  if !(1 == 1 && 0)  /* true: temp caches the item for the store loop */
+    npy_uint8 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* nothing to write */
+        return 0;
+    }
+#if 1 && 1 != 16  /* true: alignment asserts are selected */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+#if 1 == 1 && 0  /* false: the #else store loop is selected */
+    memset(dst, *src, N);
+#else
+
+#  if 1 != 16
+    temp = _NPY_NOP1(*((npy_uint8 *)src));  /* source item is read once */
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 1 != 16
+        *((npy_uint8 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0  /* false: dst steps by dst_stride */
+        dst += 1;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 1 >= 1 */
+
+
+#line 105
+
+#if (1 >= 2) && \
+    (1 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 1 == 0 || 0 == 0
+/* Loop over dimensions[0] one-byte items, reading args[0] and writing args[1].
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ * NOTE: the enclosing "#if (1 >= 2) && ..." is false, so this is not compiled. */
+static int
+#if 0 && 1 == 0 && 1 <= NPY_SIZEOF_INTP  /* false: NPY_GCC_UNROLL_LOOPS is not applied */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_contig_to_strided_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* items left to process */
+    char *src = args[0], *dst = args[1];  /* read cursor, write cursor */
+#if !1  /* false: src advances by a constant 1 instead */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0  /* true: dst advances by strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0  /* false: alignment asserts are skipped */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+    /*printf("fn _swap_contig_to_strided_size1\n");*/
+    while (N > 0) {
+#if 0  /* false: the #else (memmove) branch is selected */
+
+        /* aligned copy and swap */
+#  if 1 != 16
+        (*((npy_uint8 *)dst)) = _NPY_SWAP_INPLACE1(*((npy_uint8 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+#  if 1 == 1
+        _NPY_SWAP_INPLACE1(dst);
+#  elif 1 == 2
+        _NPY_SWAP_INPLACE0(dst);
+        _NPY_SWAP_INPLACE0(dst + 0);
+#  endif
+
+#endif
+
+#if 0  /* false: dst steps by dst_stride */
+        dst += 1;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1  /* true: constant 1-byte src step */
+        src += 1;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* only exit path */
+}
+#endif
+
+
+/* Source-stride-0 fill: the one item at args[0] is written to dimensions[0]
+ * destination slots (specialized copy and swap for source stride 0).
+ * Interestingly, unrolling here is, like above, only marginally profitable for
+ * small types and detrimental for >= 8 byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ * NOTE: the "#if (1 == 0) && 0" guard below is false, so this is not compiled. */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_contig_to_strided_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of destination items */
+    char *src = args[0], *dst = args[1];  /* the single input item, write cursor */
+#if !0  /* true: dst advances by strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1 != 16
+#  if !(1 == 1 && 0)  /* true: temp caches the item for the store loop */
+    npy_uint8 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* nothing to write */
+        return 0;
+    }
+#if 0 && 1 != 16  /* false: alignment asserts are skipped */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+#if 1 == 1 && 0  /* false: the #else store loop is selected */
+    memset(dst, *src, N);
+#else
+
+#  if 1 != 16
+    temp = _NPY_SWAP_INPLACE1(*((npy_uint8 *)src));  /* source item is read once */
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 1 != 16
+        *((npy_uint8 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0  /* false: dst steps by dst_stride */
+        dst += 1;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 1 >= 2 */
+
+
+#line 105
+
+#if (1 >= 2) && \
+    (1 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 1 == 0 || 0 == 0
+/* Loop over dimensions[0] one-byte items, reading args[0] and writing args[1].
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ * NOTE: the enclosing "#if (1 >= 2) && ..." is false, so this is not compiled. */
+static int
+#if 1 && 1 == 0 && 1 <= NPY_SIZEOF_INTP  /* false: NPY_GCC_UNROLL_LOOPS is not applied */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_contig_to_strided_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* items left to process */
+    char *src = args[0], *dst = args[1];  /* read cursor, write cursor */
+#if !1  /* false: src advances by a constant 1 instead */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0  /* true: dst advances by strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1  /* true: alignment asserts are selected */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+    /*printf("fn _aligned_swap_contig_to_strided_size1\n");*/
+    while (N > 0) {
+#if 1  /* true: the aligned branch is selected */
+
+        /* aligned copy and swap */
+#  if 1 != 16
+        (*((npy_uint8 *)dst)) = _NPY_SWAP1(*((npy_uint8 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+#  if 1 == 1
+        _NPY_SWAP1(dst);
+#  elif 1 == 2
+        _NPY_SWAP0(dst);
+        _NPY_SWAP0(dst + 0);
+#  endif
+
+#endif
+
+#if 0  /* false: dst steps by dst_stride */
+        dst += 1;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1  /* true: constant 1-byte src step */
+        src += 1;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* only exit path */
+}
+#endif
+
+
+/* Source-stride-0 fill: the one item at args[0] is written to dimensions[0]
+ * destination slots (specialized copy and swap for source stride 0).
+ * Interestingly, unrolling here is, like above, only marginally profitable for
+ * small types and detrimental for >= 8 byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ * NOTE: the "#if (1 == 0) && 1" guard below is false, so this is not compiled. */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_contig_to_strided_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of destination items */
+    char *src = args[0], *dst = args[1];  /* the single input item, write cursor */
+#if !0  /* true: dst advances by strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1 != 16
+#  if !(1 == 1 && 0)  /* true: temp caches the item for the store loop */
+    npy_uint8 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* nothing to write */
+        return 0;
+    }
+#if 1 && 1 != 16  /* true: alignment asserts are selected */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+#if 1 == 1 && 0  /* false: the #else store loop is selected */
+    memset(dst, *src, N);
+#else
+
+#  if 1 != 16
+    temp = _NPY_SWAP1(*((npy_uint8 *)src));  /* source item is read once */
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 1 != 16
+        *((npy_uint8 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0  /* false: dst steps by dst_stride */
+        dst += 1;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 1 >= 2 */
+
+
+#line 105
+
+#if (1 >= 4) && \
+    (1 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 1 == 0 || 0 == 0
+/* Loop over dimensions[0] one-byte items, reading args[0] and writing args[1].
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ * NOTE: the enclosing "#if (1 >= 4) && ..." is false, so this is not compiled. */
+static int
+#if 0 && 2 == 0 && 1 <= NPY_SIZEOF_INTP  /* false: NPY_GCC_UNROLL_LOOPS is not applied */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_pair_contig_to_strided_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* items left to process */
+    char *src = args[0], *dst = args[1];  /* read cursor, write cursor */
+#if !1  /* false: src advances by a constant 1 instead */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0  /* true: dst advances by strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0  /* false: alignment asserts are skipped */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+    /*printf("fn _swap_pair_contig_to_strided_size1\n");*/
+    while (N > 0) {
+#if 0  /* false: the #else (memmove) branch is selected */
+
+        /* aligned copy and swap */
+#  if 1 != 16
+        (*((npy_uint8 *)dst)) = _NPY_SWAP_INPLACE1(*((npy_uint8 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+#  if 2 == 1
+        _NPY_SWAP_INPLACE1(dst);
+#  elif 2 == 2
+        _NPY_SWAP_INPLACE0(dst);
+        _NPY_SWAP_INPLACE0(dst + 0);
+#  endif
+
+#endif
+
+#if 0  /* false: dst steps by dst_stride */
+        dst += 1;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1  /* true: constant 1-byte src step */
+        src += 1;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* only exit path */
+}
+#endif
+
+
+/* Source-stride-0 fill: the one item at args[0] is written to dimensions[0]
+ * destination slots (specialized copy and swap for source stride 0).
+ * Interestingly, unrolling here is, like above, only marginally profitable for
+ * small types and detrimental for >= 8 byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ * NOTE: the "#if (1 == 0) && 0" guard below is false, so this is not compiled. */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_pair_contig_to_strided_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of destination items */
+    char *src = args[0], *dst = args[1];  /* the single input item, write cursor */
+#if !0  /* true: dst advances by strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1 != 16
+#  if !(1 == 1 && 0)  /* true: temp caches the item for the store loop */
+    npy_uint8 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* nothing to write */
+        return 0;
+    }
+#if 0 && 1 != 16  /* false: alignment asserts are skipped */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+#if 1 == 1 && 0  /* false: the #else store loop is selected */
+    memset(dst, *src, N);
+#else
+
+#  if 1 != 16
+    temp = _NPY_SWAP_INPLACE1(*((npy_uint8 *)src));  /* source item is read once */
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 1 != 16
+        *((npy_uint8 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0  /* false: dst steps by dst_stride */
+        dst += 1;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 1 >= 4 */
+
+
+#line 105
+
+#if (1 >= 4) && \
+    (1 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 1 == 0 || 0 == 0
+/* Loop over dimensions[0] one-byte items, reading args[0] and writing args[1].
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ * NOTE: the enclosing "#if (1 >= 4) && ..." is false, so this is not compiled. */
+static int
+#if 1 && 2 == 0 && 1 <= NPY_SIZEOF_INTP  /* false: NPY_GCC_UNROLL_LOOPS is not applied */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_pair_contig_to_strided_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* items left to process */
+    char *src = args[0], *dst = args[1];  /* read cursor, write cursor */
+#if !1  /* false: src advances by a constant 1 instead */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0  /* true: dst advances by strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1  /* true: alignment asserts are selected */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+    /*printf("fn _aligned_swap_pair_contig_to_strided_size1\n");*/
+    while (N > 0) {
+#if 1  /* true: the aligned branch is selected */
+
+        /* aligned copy and swap */
+#  if 1 != 16
+        (*((npy_uint8 *)dst)) = _NPY_SWAP_PAIR1(*((npy_uint8 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+#  if 2 == 1
+        _NPY_SWAP_PAIR1(dst);
+#  elif 2 == 2
+        _NPY_SWAP_PAIR0(dst);
+        _NPY_SWAP_PAIR0(dst + 0);
+#  endif
+
+#endif
+
+#if 0  /* false: dst steps by dst_stride */
+        dst += 1;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1  /* true: constant 1-byte src step */
+        src += 1;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* only exit path */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the single source item is
+ * read once (through _NPY_SWAP_PAIR1) and the result is stored to every
+ * destination slot, with dst advancing by strides[1].  Returns 0.
+ * The guard below is constant-false, so this function is not compiled in
+ * this expansion.
+ *
+ * Interestingly, unrolling here, like above, is only marginally profitable
+ * for small types and detrimental for >= 8-byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (1 == 0) && 1  /* constant-false */
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_contig_to_strided_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of destination items to fill */
+    char *src = args[0], *dst = args[1];
+#if !0  /* true: dst steps by the caller-supplied stride */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1 != 16  /* true */
+#  if !(1 == 1 && 0)  /* true: the memset shortcut is off, so temp is needed */
+    npy_uint8 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 1 && 1 != 16  /* true: the alignment asserts are compiled */
+    /* sanity check (N is already known to be non-zero at this point) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+#if 1 == 1 && 0  /* false: memset shortcut not selected */
+    memset(dst, *src, N);
+#else
+
+#  if 1 != 16  /* true: load the source item once */
+    temp = _NPY_SWAP_PAIR1(*((npy_uint8 *)src));
+#  else  /* dead in this expansion */
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 1 != 16  /* true: store the cached item */
+        *((npy_uint8 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0  /* false: dst uses dst_stride */
+        dst += 1;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 1 == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 1 >= 4 */
+
+
+
+#line 97
+#line 105
+
+#if (1 >= 1) && \
+    (1 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 1 == 0 || 1 == 0  /* constant-false: this function is not compiled in this expansion */
+/*
+ * Transfer loop for 1-byte items with both src and dst advancing by 1 per
+ * item; each item is copied with memmove(dst, src, 1) and no swap macro is
+ * applied.  N comes from dimensions[0], src/dst from args[0]/args[1];
+ * context and strides are not referenced in the body.  Always returns 0.
+ *
+ * Unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction; if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core 2 Duo and
+ * AMD Phenom X4.
+ */
+static int
+#if 0 && 0 == 0 && 1 <= NPY_SIZEOF_INTP  /* false (leading 0): no unroll attribute */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_contig_to_contig_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* items left to process */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: src_stride is not declared */
+    npy_intp src_stride = strides[0];
+#endif
+#if !1  /* false: dst_stride is not declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0  /* false: the alignment asserts are not compiled */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+    /*printf("fn _contig_to_contig_size1\n");*/
+    while (N > 0) {
+#if 0  /* false: the direct load/store path is dead, see #else */
+
+        /* aligned copy and swap */
+#  if 1 != 16
+        (*((npy_uint8 *)dst)) = _NPY_NOP1(*((npy_uint8 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else  /* selected path */
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+#  if 0 == 1  /* false */
+        _NPY_NOP1(dst);
+#  elif 0 == 2  /* false: no post-copy macro is applied */
+        _NPY_NOP0(dst);
+        _NPY_NOP0(dst + 0);
+#  endif
+
+#endif
+
+#if 1  /* true: dst steps by the fixed item size 1 */
+        dst += 1;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1  /* true: src steps by the fixed item size 1 */
+        src += 1;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: with the branches selected
+ * here, the single source byte is replicated over N destination bytes with
+ * memset.  Returns 0.
+ * The guard below is constant-false, so this function is not compiled in
+ * this expansion.
+ *
+ * Interestingly, unrolling here, like above, is only marginally profitable
+ * for small types and detrimental for >= 8-byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (1 == 0) && 0  /* constant-false */
+static NPY_GCC_OPT_3 int
+_contig_to_contig_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of destination items to fill */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: dst_stride is not declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1 != 16  /* true */
+#  if !(1 == 1 && 1)  /* false: the memset shortcut is used, so no temp */
+    npy_uint8 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 1 != 16  /* false: the alignment asserts are not compiled */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+#if 1 == 1 && 1  /* true: fill N bytes of dst with the byte at src */
+    memset(dst, *src, N);
+#else  /* dead in this expansion: load-once / store-loop variant */
+
+#  if 1 != 16
+    temp = _NPY_NOP1(*((npy_uint8 *)src));
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 1 != 16
+        *((npy_uint8 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 1;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 1 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 1 >= 1 */
+
+
+#line 105
+
+#if (1 >= 1) && \
+    (1 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 1 == 0 || 1 == 0  /* constant-false: this function is not compiled in this expansion */
+/*
+ * Transfer loop for 1-byte items with both src and dst advancing by 1 per
+ * item; each item is loaded as npy_uint8, passed through _NPY_NOP1 and
+ * stored.  N comes from dimensions[0], src/dst from args[0]/args[1];
+ * context and strides are not referenced in the body.  Always returns 0.
+ *
+ * Unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction; if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core 2 Duo and
+ * AMD Phenom X4.
+ */
+static int
+#if 1 && 0 == 0 && 1 <= NPY_SIZEOF_INTP  /* depends on NPY_SIZEOF_INTP, defined elsewhere */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_contig_to_contig_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* items left to process */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: src_stride is not declared */
+    npy_intp src_stride = strides[0];
+#endif
+#if !1  /* false: dst_stride is not declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1  /* true: the alignment asserts are compiled */
+    /* sanity check: unless N == 0, dst and src must pass npy_is_aligned for npy_uint8 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+    /*printf("fn _aligned_contig_to_contig_size1\n");*/
+    while (N > 0) {
+#if 1  /* true: the direct load/store path is the one selected */
+
+        /* aligned copy and swap */
+#  if 1 != 16  /* true: single npy_uint8 load, macro, store */
+        (*((npy_uint8 *)dst)) = _NPY_NOP1(*((npy_uint8 *)src));
+#  else  /* dead in this expansion: two-npy_uint64 variants */
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else  /* dead in this expansion: memmove-based path */
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+#  if 0 == 1
+        _NPY_NOP1(dst);
+#  elif 0 == 2
+        _NPY_NOP0(dst);
+        _NPY_NOP0(dst + 0);
+#  endif
+
+#endif
+
+#if 1  /* true: dst steps by the fixed item size 1 */
+        dst += 1;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1  /* true: src steps by the fixed item size 1 */
+        src += 1;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: with the branches selected
+ * here, the single source byte is replicated over N destination bytes with
+ * memset, after asserting the alignment of dst and src.  Returns 0.
+ * The guard below is constant-false, so this function is not compiled in
+ * this expansion.
+ *
+ * Interestingly, unrolling here, like above, is only marginally profitable
+ * for small types and detrimental for >= 8-byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (1 == 0) && 1  /* constant-false */
+static NPY_GCC_OPT_3 int
+_aligned_contig_to_contig_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of destination items to fill */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: dst_stride is not declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1 != 16  /* true */
+#  if !(1 == 1 && 1)  /* false: the memset shortcut is used, so no temp */
+    npy_uint8 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 1 && 1 != 16  /* true: the alignment asserts are compiled */
+    /* sanity check (N is already known to be non-zero at this point) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+#if 1 == 1 && 1  /* true: fill N bytes of dst with the byte at src */
+    memset(dst, *src, N);
+#else  /* dead in this expansion: load-once / store-loop variant */
+
+#  if 1 != 16
+    temp = _NPY_NOP1(*((npy_uint8 *)src));
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 1 != 16
+        *((npy_uint8 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 1;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 1 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 1 >= 1 */
+
+
+#line 105
+
+#if (1 >= 2) && \
+    (1 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 1 == 0 || 1 == 0  /* constant-true guard */
+/*
+ * Transfer loop for 1-byte items with both src and dst advancing by 1 per
+ * item; each item is copied with memmove(dst, src, 1) and then
+ * _NPY_SWAP_INPLACE1 is applied to dst.  N comes from dimensions[0],
+ * src/dst from args[0]/args[1]; context and strides are not referenced in
+ * the body.  Always returns 0.
+ * Note: the enclosing region is guarded by `(1 >= 2) && ...`, which is
+ * constant-false, so this function is not compiled in this expansion.
+ *
+ * Unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction; if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core 2 Duo and
+ * AMD Phenom X4.
+ */
+static int
+#if 0 && 1 == 0 && 1 <= NPY_SIZEOF_INTP  /* false (leading 0): no unroll attribute */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_contig_to_contig_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* items left to process */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: src_stride is not declared */
+    npy_intp src_stride = strides[0];
+#endif
+#if !1  /* false: dst_stride is not declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0  /* false: the alignment asserts are not compiled */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+    /*printf("fn _swap_contig_to_contig_size1\n");*/
+    while (N > 0) {
+#if 0  /* false: the direct load/store path is dead, see #else */
+
+        /* aligned copy and swap */
+#  if 1 != 16
+        (*((npy_uint8 *)dst)) = _NPY_SWAP_INPLACE1(*((npy_uint8 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else  /* selected path */
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+#  if 1 == 1  /* true: apply the in-place macro to the copied item */
+        _NPY_SWAP_INPLACE1(dst);
+#  elif 1 == 2
+        _NPY_SWAP_INPLACE0(dst);
+        _NPY_SWAP_INPLACE0(dst + 0);
+#  endif
+
+#endif
+
+#if 1  /* true: dst steps by the fixed item size 1 */
+        dst += 1;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1  /* true: src steps by the fixed item size 1 */
+        src += 1;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: with the branches selected
+ * here, the single source byte is replicated over N destination bytes with
+ * memset.  Returns 0.
+ * The guard below is constant-false, so this function is not compiled in
+ * this expansion.
+ *
+ * Interestingly, unrolling here, like above, is only marginally profitable
+ * for small types and detrimental for >= 8-byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (1 == 0) && 0  /* constant-false */
+static NPY_GCC_OPT_3 int
+_swap_contig_to_contig_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of destination items to fill */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: dst_stride is not declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1 != 16  /* true */
+#  if !(1 == 1 && 1)  /* false: the memset shortcut is used, so no temp */
+    npy_uint8 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 1 != 16  /* false: the alignment asserts are not compiled */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+#if 1 == 1 && 1  /* true: fill N bytes of dst with the byte at src */
+    memset(dst, *src, N);
+#else  /* dead in this expansion: load-once / store-loop variant */
+
+#  if 1 != 16
+    temp = _NPY_SWAP_INPLACE1(*((npy_uint8 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 1 != 16
+        *((npy_uint8 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 1;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 1 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 1 >= 2 */
+
+
+#line 105
+
+#if (1 >= 2) && \
+    (1 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 1 == 0 || 1 == 0  /* constant-true guard */
+/*
+ * Transfer loop for 1-byte items with both src and dst advancing by 1 per
+ * item; each item is loaded as npy_uint8, passed through _NPY_SWAP1 and
+ * stored.  N comes from dimensions[0], src/dst from args[0]/args[1];
+ * context and strides are not referenced in the body.  Always returns 0.
+ * Note: the enclosing region is guarded by `(1 >= 2) && ...`, which is
+ * constant-false, so this function is not compiled in this expansion.
+ *
+ * Unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction; if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core 2 Duo and
+ * AMD Phenom X4.
+ */
+static int
+#if 1 && 1 == 0 && 1 <= NPY_SIZEOF_INTP  /* false because of `1 == 0`: no unroll attribute */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_contig_to_contig_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* items left to process */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: src_stride is not declared */
+    npy_intp src_stride = strides[0];
+#endif
+#if !1  /* false: dst_stride is not declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1  /* true: the alignment asserts are compiled */
+    /* sanity check: unless N == 0, dst and src must pass npy_is_aligned for npy_uint8 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+    /*printf("fn _aligned_swap_contig_to_contig_size1\n");*/
+    while (N > 0) {
+#if 1  /* true: the direct load/store path is the one selected */
+
+        /* aligned copy and swap */
+#  if 1 != 16  /* true: single npy_uint8 load, macro, store */
+        (*((npy_uint8 *)dst)) = _NPY_SWAP1(*((npy_uint8 *)src));
+#  else  /* dead in this expansion: two-npy_uint64 variants */
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else  /* dead in this expansion: memmove-based path */
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+#  if 1 == 1
+        _NPY_SWAP1(dst);
+#  elif 1 == 2
+        _NPY_SWAP0(dst);
+        _NPY_SWAP0(dst + 0);
+#  endif
+
+#endif
+
+#if 1  /* true: dst steps by the fixed item size 1 */
+        dst += 1;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1  /* true: src steps by the fixed item size 1 */
+        src += 1;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: with the branches selected
+ * here, the single source byte is replicated over N destination bytes with
+ * memset, after asserting the alignment of dst and src.  Returns 0.
+ * The guard below is constant-false, so this function is not compiled in
+ * this expansion.
+ *
+ * Interestingly, unrolling here, like above, is only marginally profitable
+ * for small types and detrimental for >= 8-byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (1 == 0) && 1  /* constant-false */
+static NPY_GCC_OPT_3 int
+_aligned_swap_contig_to_contig_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of destination items to fill */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: dst_stride is not declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1 != 16  /* true */
+#  if !(1 == 1 && 1)  /* false: the memset shortcut is used, so no temp */
+    npy_uint8 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 1 && 1 != 16  /* true: the alignment asserts are compiled */
+    /* sanity check (N is already known to be non-zero at this point) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+#if 1 == 1 && 1  /* true: fill N bytes of dst with the byte at src */
+    memset(dst, *src, N);
+#else  /* dead in this expansion: load-once / store-loop variant */
+
+#  if 1 != 16
+    temp = _NPY_SWAP1(*((npy_uint8 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 1 != 16
+        *((npy_uint8 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 1;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 1 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 1 >= 2 */
+
+
+#line 105
+
+#if (1 >= 4) && \
+    (1 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 1 == 0 || 1 == 0  /* constant-true guard */
+/*
+ * Transfer loop for 1-byte items with both src and dst advancing by 1 per
+ * item; each item is copied with memmove(dst, src, 1) and then
+ * _NPY_SWAP_INPLACE0 is applied to dst and to dst + 0.  N comes from
+ * dimensions[0], src/dst from args[0]/args[1]; context and strides are not
+ * referenced in the body.  Always returns 0.
+ * Note: the enclosing region is guarded by `(1 >= 4) && ...`, which is
+ * constant-false, so this function is not compiled in this expansion.
+ *
+ * Unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction; if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core 2 Duo and
+ * AMD Phenom X4.
+ */
+static int
+#if 0 && 2 == 0 && 1 <= NPY_SIZEOF_INTP  /* false (leading 0): no unroll attribute */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_pair_contig_to_contig_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* items left to process */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: src_stride is not declared */
+    npy_intp src_stride = strides[0];
+#endif
+#if !1  /* false: dst_stride is not declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0  /* false: the alignment asserts are not compiled */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+    /*printf("fn _swap_pair_contig_to_contig_size1\n");*/
+    while (N > 0) {
+#if 0  /* false: the direct load/store path is dead, see #else */
+
+        /* aligned copy and swap */
+#  if 1 != 16
+        (*((npy_uint8 *)dst)) = _NPY_SWAP_INPLACE1(*((npy_uint8 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else  /* selected path */
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+#  if 2 == 1  /* false */
+        _NPY_SWAP_INPLACE1(dst);
+#  elif 2 == 2  /* true: apply the half-size in-place macro twice */
+        _NPY_SWAP_INPLACE0(dst);
+        _NPY_SWAP_INPLACE0(dst + 0);
+#  endif
+
+#endif
+
+#if 1  /* true: dst steps by the fixed item size 1 */
+        dst += 1;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1  /* true: src steps by the fixed item size 1 */
+        src += 1;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: with the branches selected
+ * here, the single source byte is replicated over N destination bytes with
+ * memset.  Returns 0.
+ * The guard below is constant-false, so this function is not compiled in
+ * this expansion.
+ *
+ * Interestingly, unrolling here, like above, is only marginally profitable
+ * for small types and detrimental for >= 8-byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (1 == 0) && 0  /* constant-false */
+static NPY_GCC_OPT_3 int
+_swap_pair_contig_to_contig_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of destination items to fill */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: dst_stride is not declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1 != 16  /* true */
+#  if !(1 == 1 && 1)  /* false: the memset shortcut is used, so no temp */
+    npy_uint8 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 1 != 16  /* false: the alignment asserts are not compiled */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+#if 1 == 1 && 1  /* true: fill N bytes of dst with the byte at src */
+    memset(dst, *src, N);
+#else  /* dead in this expansion: load-once / store-loop variant */
+
+#  if 1 != 16
+    temp = _NPY_SWAP_INPLACE1(*((npy_uint8 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 1 != 16
+        *((npy_uint8 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 1;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 1 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 1 >= 4 */
+
+
+#line 105
+
+#if (1 >= 4) && \
+    (1 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 1 == 0 || 1 == 0  /* constant-true guard */
+/*
+ * Transfer loop for 1-byte items with both src and dst advancing by 1 per
+ * item; each item is loaded as npy_uint8, passed through _NPY_SWAP_PAIR1
+ * and stored.  N comes from dimensions[0], src/dst from args[0]/args[1];
+ * context and strides are not referenced in the body.  Always returns 0.
+ * Note: the enclosing region is guarded by `(1 >= 4) && ...`, which is
+ * constant-false, so this function is not compiled in this expansion.
+ *
+ * Unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction; if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core 2 Duo and
+ * AMD Phenom X4.
+ */
+static int
+#if 1 && 2 == 0 && 1 <= NPY_SIZEOF_INTP  /* false because of `2 == 0`: no unroll attribute */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_pair_contig_to_contig_size1(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* items left to process */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: src_stride is not declared */
+    npy_intp src_stride = strides[0];
+#endif
+#if !1  /* false: dst_stride is not declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1  /* true: the alignment asserts are compiled */
+    /* sanity check: unless N == 0, dst and src must pass npy_is_aligned for npy_uint8 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+    /*printf("fn _aligned_swap_pair_contig_to_contig_size1\n");*/
+    while (N > 0) {
+#if 1  /* true: the direct load/store path is the one selected */
+
+        /* aligned copy and swap */
+#  if 1 != 16  /* true: single npy_uint8 load, macro, store */
+        (*((npy_uint8 *)dst)) = _NPY_SWAP_PAIR1(*((npy_uint8 *)src));
+#  else  /* dead in this expansion: two-npy_uint64 variants */
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else  /* dead in this expansion: memmove-based path */
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 1);
+#  if 2 == 1
+        _NPY_SWAP_PAIR1(dst);
+#  elif 2 == 2
+        _NPY_SWAP_PAIR0(dst);
+        _NPY_SWAP_PAIR0(dst + 0);
+#  endif
+
+#endif
+
+#if 1  /* true: dst steps by the fixed item size 1 */
+        dst += 1;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1  /* true: src steps by the fixed item size 1 */
+        src += 1;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: with the branches selected
+ * here, the single source byte is replicated over N destination bytes with
+ * memset, after asserting the alignment of dst and src.  Returns 0.
+ * The guard below is constant-false, so this function is not compiled in
+ * this expansion.
+ *
+ * Interestingly, unrolling here, like above, is only marginally profitable
+ * for small types and detrimental for >= 8-byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (1 == 0) && 1  /* constant-false */
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_contig_to_contig_size1_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of destination items to fill */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: dst_stride is not declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1 != 16  /* true */
+#  if !(1 == 1 && 1)  /* false: the memset shortcut is used, so no temp */
+    npy_uint8 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 1 && 1 != 16  /* true: the alignment asserts are compiled */
+    /* sanity check (N is already known to be non-zero at this point) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint8)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint8)));
+#endif
+#if 1 == 1 && 1  /* true: fill N bytes of dst with the byte at src */
+    memset(dst, *src, N);
+#else  /* dead in this expansion: load-once / store-loop variant */
+
+#  if 1 != 16
+    temp = _NPY_SWAP_PAIR1(*((npy_uint8 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 1 != 16
+        *((npy_uint8 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 1;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 1 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 1 >= 4 */
+
+
+
+
+#line 91
+#line 97
+#line 105
+
+#if (2 >= 1) && \
+    (2 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 0 == 0 || 0 == 0  /* constant-true guard */
+/*
+ * Transfer loop for 2-byte items: src advances by strides[0] and dst by
+ * strides[1]; each item is copied with memmove(dst, src, 2) and no swap
+ * macro is applied.  N comes from dimensions[0], src/dst from
+ * args[0]/args[1]; context is not referenced in the body.  Always returns 0.
+ * Note: whether this is compiled also depends on the enclosing region's
+ * guard, which tests NPY_USE_UNALIGNED_ACCESS (defined elsewhere).
+ *
+ * Unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction; if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core 2 Duo and
+ * AMD Phenom X4.
+ */
+static int
+#if 0 && 0 == 0 && 2 <= NPY_SIZEOF_INTP  /* false (leading 0): no unroll attribute */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_strided_to_strided_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* items left to process */
+    char *src = args[0], *dst = args[1];
+#if !0  /* true: src steps by the caller-supplied stride */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0  /* true: dst steps by the caller-supplied stride */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0  /* false: the alignment asserts are not compiled */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+    /*printf("fn _strided_to_strided_size2\n");*/
+    while (N > 0) {
+#if 0  /* false: the direct load/store path is dead, see #else */
+
+        /* aligned copy and swap */
+#  if 2 != 16
+        (*((npy_uint16 *)dst)) = _NPY_NOP2(*((npy_uint16 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else  /* selected path */
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 2);
+#  if 0 == 1  /* false */
+        _NPY_NOP2(dst);
+#  elif 0 == 2  /* false: no post-copy macro is applied */
+        _NPY_NOP1(dst);
+        _NPY_NOP1(dst + 1);
+#  endif
+
+#endif
+
+#if 0  /* false: dst uses dst_stride */
+        dst += 2;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0  /* false: src uses src_stride */
+        src += 2;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the single 2-byte source
+ * item is read once (through _NPY_NOP2) and the result is stored to every
+ * destination slot, with dst advancing by strides[1].  Returns 0.
+ * The guard below is constant-false, so this function is not compiled in
+ * this expansion.
+ *
+ * Interestingly, unrolling here, like above, is only marginally profitable
+ * for small types and detrimental for >= 8-byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (0 == 0) && 0  /* constant-false */
+static NPY_GCC_OPT_3 int
+_strided_to_strided_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of destination items to fill */
+    char *src = args[0], *dst = args[1];
+#if !0  /* true: dst steps by the caller-supplied stride */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 2 != 16  /* true */
+#  if !(2 == 1 && 0)  /* true: the memset shortcut is off, so temp is needed */
+    npy_uint16 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 2 != 16  /* false: the alignment asserts are not compiled */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+#if 2 == 1 && 0  /* false: memset shortcut not selected */
+    memset(dst, *src, N);
+#else
+
+#  if 2 != 16  /* true: load the source item once */
+    temp = _NPY_NOP2(*((npy_uint16 *)src));
+#  else  /* dead in this expansion */
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 2 != 16  /* true: store the cached item */
+        *((npy_uint16 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0  /* false: dst uses dst_stride */
+        dst += 2;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 2 == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 2 >= 1 */
+
+
+#line 105
+
+#if (2 >= 1) && \
+    (2 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 0 == 0 || 0 == 0  /* constant-true guard */
+/*
+ * Transfer loop for 2-byte items: src advances by strides[0] and dst by
+ * strides[1]; each item is loaded as npy_uint16, passed through _NPY_NOP2
+ * and stored.  Alignment of both pointers is asserted up front.
+ * N comes from dimensions[0], src/dst from args[0]/args[1]; context is not
+ * referenced in the body.  Always returns 0.
+ *
+ * Unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction; if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core 2 Duo and
+ * AMD Phenom X4.
+ */
+static int
+#if 1 && 0 == 0 && 2 <= NPY_SIZEOF_INTP  /* depends on NPY_SIZEOF_INTP, defined elsewhere */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_strided_to_strided_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* items left to process */
+    char *src = args[0], *dst = args[1];
+#if !0  /* true: src steps by the caller-supplied stride */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0  /* true: dst steps by the caller-supplied stride */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1  /* true: the alignment asserts are compiled */
+    /* sanity check: unless N == 0, dst and src must pass npy_is_aligned for npy_uint16 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+    /*printf("fn _aligned_strided_to_strided_size2\n");*/
+    while (N > 0) {
+#if 1  /* true: the direct load/store path is the one compiled */
+
+        /* aligned copy and swap */
+#  if 2 != 16  /* true: single npy_uint16 load, macro, store */
+        (*((npy_uint16 *)dst)) = _NPY_NOP2(*((npy_uint16 *)src));
+#  else  /* dead in this expansion: two-npy_uint64 variants */
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else  /* dead in this expansion: memmove-based path */
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 2);
+#  if 0 == 1
+        _NPY_NOP2(dst);
+#  elif 0 == 2
+        _NPY_NOP1(dst);
+        _NPY_NOP1(dst + 1);
+#  endif
+
+#endif
+
+#if 0  /* false: dst uses dst_stride */
+        dst += 2;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0  /* false: src uses src_stride */
+        src += 2;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialization of the aligned strided -> strided 2-byte copy for a source
+ * stride of 0: the single source element is read once and then stored to
+ * every destination slot (strides[0] is never read).
+ *
+ * Template note: unrolling, unlike in the general loop, is only marginally
+ * profitable here for small types and detrimental for >= 8 byte moves on
+ * x86, but the loop profits from the vectorization enabled with -O3.
+ *
+ * Stores dimensions[0] copies, advancing the destination by strides[1]
+ * bytes each time.  Always returns 0; `context` is not referenced.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_strided_to_strided_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp dst_stride = strides[1];
+    npy_uint16 temp;
+
+    /* nothing to copy: leave before touching src */
+    if (N == 0) {
+        return 0;
+    }
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+
+    /* load (and transform) the one source element a single time */
+    temp = _NPY_NOP2(*(npy_uint16 *)src);
+
+    for (; N > 0; --N) {
+        *(npy_uint16 *)dst = temp;
+        dst += dst_stride;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 2 >= 1 */
+
+
+#line 105
+
+#if (2 >= 2) && \
+    (2 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 0 == 0 || 0 == 0
+/*
+ * Unaligned strided -> strided copy of 2-byte elements with a swap step.
+ *
+ * For each of the dimensions[0] elements, 2 bytes are moved from args[0]
+ * to args[1] with memmove and _NPY_SWAP_INPLACE2 is then applied to the
+ * destination bytes (macro defined outside this view; presumably an
+ * in-place 2-byte byte swap).  The source advances by strides[0] bytes and
+ * the destination by strides[1] bytes per element.
+ * Always returns 0; `context` is not referenced.
+ *
+ * No alignment is asserted and no unroll attribute is applied in this
+ * instantiation; the constant template switches are folded.
+ */
+static int
+_swap_strided_to_strided_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_stride = strides[0];
+    const npy_intp dst_stride = strides[1];
+
+    for (; N > 0; --N) {
+        /* unaligned copy, then swap in the destination */
+        memmove(dst, src, 2);
+        _NPY_SWAP_INPLACE2(dst);
+
+        dst += dst_stride;
+        src += src_stride;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 specialization of the unaligned swapping strided ->
+ * strided 2-byte copy.
+ *
+ * The guard below is constant-false, so this instantiation is never
+ * compiled; it is kept only to mirror the template expansion.
+ * NOTE(review): _NPY_SWAP_INPLACE2 is handed a value here, whereas the
+ * general unaligned loop hands it a pointer -- presumably this would not
+ * build if the guard were enabled; confirm against the macro definition.
+ *
+ * Template note: unrolling, unlike in the general loop, is only marginally
+ * profitable here for small types and detrimental for >= 8 byte moves on
+ * x86, but the loop profits from the vectorization enabled with -O3.
+ */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_strided_to_strided_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp dst_stride = strides[1];
+    npy_uint16 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+
+    /* read the single source element once */
+    temp = _NPY_SWAP_INPLACE2(*(npy_uint16 *)src);
+
+    for (; N > 0; --N) {
+        *(npy_uint16 *)dst = temp;
+        dst += dst_stride;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 2 >= 2 */
+
+
+#line 105
+
+#if (2 >= 2) && \
+    (2 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 0 == 0 || 0 == 0
+/*
+ * Aligned strided -> strided copy of 2-byte elements with a swap step.
+ *
+ * Each of the dimensions[0] elements is loaded from args[0] as one
+ * npy_uint16, passed through _NPY_SWAP2 (defined outside this view;
+ * presumably a 2-byte byte swap) and stored to args[1].  The source
+ * advances by strides[0] bytes and the destination by strides[1] bytes per
+ * element.  Always returns 0; `context` is not referenced.
+ *
+ * No unroll attribute is applied in this instantiation; the constant
+ * template switches are folded.
+ */
+static int
+_aligned_swap_strided_to_strided_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_stride = strides[0];
+    const npy_intp dst_stride = strides[1];
+
+    /* alignment is only required when at least one element is copied */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+
+    for (; N > 0; --N) {
+        *(npy_uint16 *)dst = _NPY_SWAP2(*(npy_uint16 *)src);
+        dst += dst_stride;
+        src += src_stride;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialization of the aligned swapping strided -> strided 2-byte copy
+ * for a source stride of 0: the single source element is read and passed
+ * through _NPY_SWAP2 once, then stored to every destination slot
+ * (strides[0] is never read).
+ *
+ * Template note: unrolling, unlike in the general loop, is only marginally
+ * profitable here for small types and detrimental for >= 8 byte moves on
+ * x86, but the loop profits from the vectorization enabled with -O3.
+ *
+ * Stores dimensions[0] copies, advancing the destination by strides[1]
+ * bytes each time.  Always returns 0; `context` is not referenced.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_strided_to_strided_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp dst_stride = strides[1];
+    npy_uint16 temp;
+
+    /* nothing to copy: leave before touching src */
+    if (N == 0) {
+        return 0;
+    }
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+
+    /* transform the one source element a single time */
+    temp = _NPY_SWAP2(*(npy_uint16 *)src);
+
+    for (; N > 0; --N) {
+        *(npy_uint16 *)dst = temp;
+        dst += dst_stride;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 2 >= 2 */
+
+
+#line 105
+
+#if (2 >= 4) && \
+    (2 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 0 == 0 || 0 == 0
+/*
+ * Unaligned strided -> strided copy of 2-byte elements with a pair swap.
+ *
+ * The enclosing `(2 >= 4)` region guard is constant-false, so this
+ * instantiation is never compiled; it mirrors the template expansion.
+ *
+ * As written: for each of the dimensions[0] elements, 2 bytes are moved
+ * from args[0] to args[1] with memmove and _NPY_SWAP_INPLACE1 is applied to
+ * each of the two destination bytes (macro defined outside this view).
+ * The source advances by strides[0] bytes and the destination by
+ * strides[1] bytes per element.  Always returns 0.
+ */
+static int
+_swap_pair_strided_to_strided_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_stride = strides[0];
+    const npy_intp dst_stride = strides[1];
+
+    for (; N > 0; --N) {
+        /* unaligned copy, then swap each half in the destination */
+        memmove(dst, src, 2);
+        _NPY_SWAP_INPLACE1(dst);
+        _NPY_SWAP_INPLACE1(dst + 1);
+
+        dst += dst_stride;
+        src += src_stride;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 specialization of the unaligned pair-swapping strided ->
+ * strided 2-byte copy.
+ *
+ * Both the guard below and the enclosing `(2 >= 4)` region guard are
+ * constant-false, so this instantiation is never compiled; it is kept only
+ * to mirror the template expansion.
+ * NOTE(review): _NPY_SWAP_INPLACE2 is handed a value here rather than a
+ * pointer -- presumably this would not build if enabled; confirm against
+ * the macro definition.
+ *
+ * Template note: unrolling, unlike in the general loop, is only marginally
+ * profitable here for small types and detrimental for >= 8 byte moves on
+ * x86, but the loop profits from the vectorization enabled with -O3.
+ */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_pair_strided_to_strided_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp dst_stride = strides[1];
+    npy_uint16 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+
+    /* read the single source element once */
+    temp = _NPY_SWAP_INPLACE2(*(npy_uint16 *)src);
+
+    for (; N > 0; --N) {
+        *(npy_uint16 *)dst = temp;
+        dst += dst_stride;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 2 >= 4 */
+
+
+#line 105
+
+#if (2 >= 4) && \
+    (2 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 0 == 0 || 0 == 0
+/*
+ * Aligned strided -> strided copy of 2-byte elements with a pair swap.
+ *
+ * The enclosing `(2 >= 4)` region guard is constant-false, so this
+ * instantiation is never compiled; it mirrors the template expansion.
+ *
+ * As written: each of the dimensions[0] elements is loaded from args[0] as
+ * one npy_uint16, passed through _NPY_SWAP_PAIR2 (defined outside this
+ * view) and stored to args[1].  The source advances by strides[0] bytes
+ * and the destination by strides[1] bytes per element.  Always returns 0.
+ */
+static int
+_aligned_swap_pair_strided_to_strided_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_stride = strides[0];
+    const npy_intp dst_stride = strides[1];
+
+    /* alignment is only required when at least one element is copied */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+
+    for (; N > 0; --N) {
+        *(npy_uint16 *)dst = _NPY_SWAP_PAIR2(*(npy_uint16 *)src);
+        dst += dst_stride;
+        src += src_stride;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 specialization of the aligned pair-swapping strided ->
+ * strided 2-byte copy: the single source element is read and passed
+ * through _NPY_SWAP_PAIR2 once, then stored to every destination slot
+ * (strides[0] is never read).
+ *
+ * The enclosing `(2 >= 4)` region guard is constant-false, so this
+ * instantiation is never compiled; it mirrors the template expansion.
+ *
+ * Template note: unrolling, unlike in the general loop, is only marginally
+ * profitable here for small types and detrimental for >= 8 byte moves on
+ * x86, but the loop profits from the vectorization enabled with -O3.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_strided_to_strided_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp dst_stride = strides[1];
+    npy_uint16 temp;
+
+    /* nothing to copy: leave before touching src */
+    if (N == 0) {
+        return 0;
+    }
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+
+    /* transform the one source element a single time */
+    temp = _NPY_SWAP_PAIR2(*(npy_uint16 *)src);
+
+    for (; N > 0; --N) {
+        *(npy_uint16 *)dst = temp;
+        dst += dst_stride;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 2 >= 4 */
+
+
+
+#line 97
+#line 105
+
+#if (2 >= 1) && \
+    (2 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 0 == 0 || 1 == 0
+/*
+ * Unaligned strided -> contiguous copy of 2-byte elements.
+ *
+ * For each of the dimensions[0] elements, 2 bytes are moved from args[0]
+ * to args[1] with memmove; no further transformation is applied.  The
+ * source advances by strides[0] bytes per element while the destination
+ * advances by a fixed 2 bytes (strides[1] is never read).
+ * Always returns 0; `context` is not referenced.
+ *
+ * No alignment is asserted and no unroll attribute is applied in this
+ * instantiation; the constant template switches are folded.
+ */
+static int
+_strided_to_contig_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_stride = strides[0];
+
+    for (; N > 0; --N) {
+        /* unaligned copy */
+        memmove(dst, src, 2);
+
+        dst += 2;
+        src += src_stride;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 specialization of the unaligned strided -> contiguous
+ * 2-byte copy.
+ *
+ * The guard below is constant-false, so this instantiation is never
+ * compiled; it is kept only to mirror the template expansion.
+ *
+ * As written: the single source element is loaded once as an npy_uint16
+ * (via _NPY_NOP2) and stored to dimensions[0] consecutive 2-byte slots of
+ * the destination; no stride is read.
+ *
+ * Template note: unrolling, unlike in the general loop, is only marginally
+ * profitable here for small types and detrimental for >= 8 byte moves on
+ * x86, but the loop profits from the vectorization enabled with -O3.
+ */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int
+_strided_to_contig_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_uint16 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+
+    /* read the single source element once */
+    temp = _NPY_NOP2(*(npy_uint16 *)src);
+
+    for (; N > 0; --N) {
+        *(npy_uint16 *)dst = temp;
+        dst += 2;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 2 >= 1 */
+
+
+#line 105
+
+#if (2 >= 1) && \
+    (2 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 0 == 0 || 1 == 0
+/*
+ * Aligned strided -> contiguous copy of 2-byte elements.
+ *
+ * Each of the dimensions[0] elements is loaded from args[0] as one
+ * npy_uint16, passed through _NPY_NOP2 (defined outside this view;
+ * presumably an identity) and stored to args[1].  The source advances by
+ * strides[0] bytes per element while the destination advances by a fixed
+ * 2 bytes (strides[1] is never read).
+ * Always returns 0; `context` is not referenced.
+ *
+ * The constant template switches of this instantiation are folded.
+ *
+ * Unrolling note from the template: it gains about 20-50% if the copy can
+ * be done in one mov instruction and can decrease performance otherwise
+ * (tested on intel xeon 5x/7x, core2duo, amd phenom x4).
+ */
+static int
+#if 2 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_strided_to_contig_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_stride = strides[0];
+
+    /* alignment is only required when at least one element is copied */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+
+    for (; N > 0; --N) {
+        *(npy_uint16 *)dst = _NPY_NOP2(*(npy_uint16 *)src);
+        dst += 2;
+        src += src_stride;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialization of the aligned strided -> contiguous 2-byte copy for a
+ * source stride of 0: the single source element is read once (via
+ * _NPY_NOP2) and then stored to dimensions[0] consecutive 2-byte slots of
+ * the destination.  No stride is read.
+ *
+ * Template note: unrolling, unlike in the general loop, is only marginally
+ * profitable here for small types and detrimental for >= 8 byte moves on
+ * x86, but the loop profits from the vectorization enabled with -O3.
+ *
+ * Always returns 0; `context` is not referenced.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_strided_to_contig_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_uint16 temp;
+
+    /* nothing to copy: leave before touching src */
+    if (N == 0) {
+        return 0;
+    }
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+
+    /* load the one source element a single time */
+    temp = _NPY_NOP2(*(npy_uint16 *)src);
+
+    for (; N > 0; --N) {
+        *(npy_uint16 *)dst = temp;
+        dst += 2;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 2 >= 1 */
+
+
+#line 105
+
+#if (2 >= 2) && \
+    (2 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 0 == 0 || 1 == 0
+/*
+ * Unaligned strided -> contiguous copy of 2-byte elements with a swap step.
+ *
+ * For each of the dimensions[0] elements, 2 bytes are moved from args[0]
+ * to args[1] with memmove and _NPY_SWAP_INPLACE2 is then applied to the
+ * destination bytes (macro defined outside this view; presumably an
+ * in-place 2-byte byte swap).  The source advances by strides[0] bytes per
+ * element while the destination advances by a fixed 2 bytes (strides[1]
+ * is never read).  Always returns 0; `context` is not referenced.
+ *
+ * No alignment is asserted and no unroll attribute is applied in this
+ * instantiation; the constant template switches are folded.
+ */
+static int
+_swap_strided_to_contig_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_stride = strides[0];
+
+    for (; N > 0; --N) {
+        /* unaligned copy, then swap in the destination */
+        memmove(dst, src, 2);
+        _NPY_SWAP_INPLACE2(dst);
+
+        dst += 2;
+        src += src_stride;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 specialization of the unaligned swapping strided ->
+ * contiguous 2-byte copy.
+ *
+ * The guard below is constant-false, so this instantiation is never
+ * compiled; it is kept only to mirror the template expansion.
+ * NOTE(review): _NPY_SWAP_INPLACE2 is handed a value here, whereas the
+ * general unaligned loop hands it a pointer -- presumably this would not
+ * build if the guard were enabled; confirm against the macro definition.
+ *
+ * Template note: unrolling, unlike in the general loop, is only marginally
+ * profitable here for small types and detrimental for >= 8 byte moves on
+ * x86, but the loop profits from the vectorization enabled with -O3.
+ */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_strided_to_contig_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_uint16 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+
+    /* read the single source element once */
+    temp = _NPY_SWAP_INPLACE2(*(npy_uint16 *)src);
+
+    for (; N > 0; --N) {
+        *(npy_uint16 *)dst = temp;
+        dst += 2;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 2 >= 2 */
+
+
+#line 105
+
+#if (2 >= 2) && \
+    (2 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 0 == 0 || 1 == 0
+/*
+ * Aligned strided -> contiguous copy of 2-byte elements with a swap step.
+ *
+ * Each of the dimensions[0] elements is loaded from args[0] as one
+ * npy_uint16, passed through _NPY_SWAP2 (defined outside this view;
+ * presumably a 2-byte byte swap) and stored to args[1].  The source
+ * advances by strides[0] bytes per element while the destination advances
+ * by a fixed 2 bytes (strides[1] is never read).
+ * Always returns 0; `context` is not referenced.
+ *
+ * No unroll attribute is applied in this instantiation; the constant
+ * template switches are folded.
+ */
+static int
+_aligned_swap_strided_to_contig_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_stride = strides[0];
+
+    /* alignment is only required when at least one element is copied */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+
+    for (; N > 0; --N) {
+        *(npy_uint16 *)dst = _NPY_SWAP2(*(npy_uint16 *)src);
+        dst += 2;
+        src += src_stride;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the one source item is read
+ * once and stored N times. Unrolling here, as above, is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86,
+ * but it profits from vectorization enabled with -O3
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_strided_to_contig_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items to store */
+    char *src = args[0], *dst = args[1];
+#if !1  /* dst is contiguous: no dst_stride is declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 2 != 16
+#  if !(2 == 1 && 1)
+    npy_uint16 temp;  /* holds the source item after _NPY_SWAP2 */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;  /* nothing to store; src is never dereferenced */
+    }
+#if 1 && 2 != 16
+    /* sanity check (N != 0 here, so the alignment test is always evaluated) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+#if 2 == 1 && 1  /* memset shortcut is for 1-byte items only: not taken */
+    memset(dst, *src, N);
+#else
+
+#  if 2 != 16  /* taken: read and swap the source item once, before the loop */
+    temp = _NPY_SWAP2(*((npy_uint16 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 2 != 16
+        *((npy_uint16 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1  /* contiguous dst: advance by the 2-byte item size */
+        dst += 2;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 2 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 2 >= 2 */
+
+
+#line 105
+
+#if (2 >= 4) && \
+    (2 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 0 == 0 || 1 == 0  /* moot: the enclosing #if (2 >= 4) ... is false */
+/* Copy N 2-byte items, strided src -> contiguous dst (memmove + _NPY_SWAP_INPLACE1).
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 0 && 2 == 0 && 2 <= NPY_SIZEOF_INTP  /* false: no unroll hint here */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_pair_strided_to_contig_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items to copy */
+    char *src = args[0], *dst = args[1];
+#if !0  /* src is strided: its byte step comes from strides[0] */
+    npy_intp src_stride = strides[0];
+#endif
+#if !1  /* dst is contiguous: no dst_stride is declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0  /* alignment asserts are disabled in this (unaligned) variant */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+    /* disabled debug trace: printf("fn _swap_pair_strided_to_contig_size2\n"); */
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap (not selected for this function) */
+#  if 2 != 16
+        (*((npy_uint16 *)dst)) = _NPY_SWAP_INPLACE2(*((npy_uint16 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (the branch selected for this function) */
+        memmove(dst, src, 2);
+#  if 2 == 1
+        _NPY_SWAP_INPLACE2(dst);
+#  elif 2 == 2  /* selected: _NPY_SWAP_INPLACE1 applied at dst and dst + 1 */
+        _NPY_SWAP_INPLACE1(dst);
+        _NPY_SWAP_INPLACE1(dst + 1);
+#  endif
+
+#endif
+
+#if 1  /* contiguous dst: advance by the 2-byte item size */
+        dst += 2;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 2;
+#else  /* strided src: advance by src_stride bytes */
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* the only return value of this function */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the one source item is read
+ * once and stored N times. Unrolling here, as above, is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86,
+ * but it profits from vectorization enabled with -O3
+ */
+#if (0 == 0) && 0  /* always false: this function is compiled out */
+static NPY_GCC_OPT_3 int
+_swap_pair_strided_to_contig_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items to store */
+    char *src = args[0], *dst = args[1];
+#if !1  /* dst is contiguous: no dst_stride is declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 2 != 16
+#  if !(2 == 1 && 1)
+    npy_uint16 temp;  /* holds the source item read before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;  /* nothing to store; src is never dereferenced */
+    }
+#if 0 && 2 != 16  /* alignment asserts are disabled in this variant */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+#if 2 == 1 && 1  /* memset shortcut is for 1-byte items only */
+    memset(dst, *src, N);
+#else
+
+#  if 2 != 16  /* the source item is read once, before the loop */
+    temp = _NPY_SWAP_INPLACE2(*((npy_uint16 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 2 != 16
+        *((npy_uint16 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1  /* contiguous dst: advance by the 2-byte item size */
+        dst += 2;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 2 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 2 >= 4 */
+
+
+#line 105
+
+#if (2 >= 4) && \
+    (2 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 0 == 0 || 1 == 0  /* moot: the enclosing #if (2 >= 4) ... is false */
+/* Copy N 2-byte items, strided src -> contiguous dst, each via _NPY_SWAP_PAIR2.
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 1 && 2 == 0 && 2 <= NPY_SIZEOF_INTP  /* false (2 == 0): no unroll hint here */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_pair_strided_to_contig_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items to copy */
+    char *src = args[0], *dst = args[1];
+#if !0  /* src is strided: its byte step comes from strides[0] */
+    npy_intp src_stride = strides[0];
+#endif
+#if !1  /* dst is contiguous: no dst_stride is declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: the initial pointers must be aligned for npy_uint16 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+    /* disabled debug trace: printf("fn _aligned_swap_pair_strided_to_contig_size2\n"); */
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap (the branch selected for this function) */
+#  if 2 != 16  /* selected: one npy_uint16 load, _NPY_SWAP_PAIR2, one store */
+        (*((npy_uint16 *)dst)) = _NPY_SWAP_PAIR2(*((npy_uint16 *)src));
+#  else  /* 16-byte variant: not selected for 2-byte items */
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (not selected for this function) */
+        memmove(dst, src, 2);
+#  if 2 == 1
+        _NPY_SWAP_PAIR2(dst);
+#  elif 2 == 2
+        _NPY_SWAP_PAIR1(dst);
+        _NPY_SWAP_PAIR1(dst + 1);
+#  endif
+
+#endif
+
+#if 1  /* contiguous dst: advance by the 2-byte item size */
+        dst += 2;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 2;
+#else  /* strided src: advance by src_stride bytes */
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* the only return value of this function */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the one source item is read
+ * once and stored N times. Unrolling here, as above, is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86,
+ * but it profits from vectorization enabled with -O3
+ */
+#if (0 == 0) && 1  /* true, but moot: the enclosing #if (2 >= 4) ... is false */
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_strided_to_contig_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items to store */
+    char *src = args[0], *dst = args[1];
+#if !1  /* dst is contiguous: no dst_stride is declared */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 2 != 16
+#  if !(2 == 1 && 1)
+    npy_uint16 temp;  /* holds the source item after _NPY_SWAP_PAIR2 */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;  /* nothing to store; src is never dereferenced */
+    }
+#if 1 && 2 != 16
+    /* sanity check (N != 0 here, so the alignment test is always evaluated) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+#if 2 == 1 && 1  /* memset shortcut is for 1-byte items only */
+    memset(dst, *src, N);
+#else
+
+#  if 2 != 16  /* the source item is read and swapped once, before the loop */
+    temp = _NPY_SWAP_PAIR2(*((npy_uint16 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 2 != 16
+        *((npy_uint16 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1  /* contiguous dst: advance by the 2-byte item size */
+        dst += 2;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 2 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 2 >= 4 */
+
+
+
+#line 97
+#line 105
+
+#if (2 >= 1) && \
+    (2 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 1 == 0 || 0 == 0  /* true; the enclosing #if needs !NPY_USE_UNALIGNED_ACCESS */
+/* Copy N 2-byte items, contiguous src -> strided dst, using memmove (no swap).
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 0 && 0 == 0 && 2 <= NPY_SIZEOF_INTP  /* false (leading 0): no unroll hint here */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_contig_to_strided_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items to copy */
+    char *src = args[0], *dst = args[1];
+#if !1  /* src is contiguous: no src_stride is declared */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0  /* dst is strided: its byte step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0  /* alignment asserts are disabled in this (unaligned) variant */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+    /* disabled debug trace: printf("fn _contig_to_strided_size2\n"); */
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap (not compiled for this function) */
+#  if 2 != 16
+        (*((npy_uint16 *)dst)) = _NPY_NOP2(*((npy_uint16 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (the branch compiled for this function) */
+        memmove(dst, src, 2);
+#  if 0 == 1  /* neither branch below is taken: the memmove is the whole copy */
+        _NPY_NOP2(dst);
+#  elif 0 == 2
+        _NPY_NOP1(dst);
+        _NPY_NOP1(dst + 1);
+#  endif
+
+#endif
+
+#if 0
+        dst += 2;
+#else  /* strided dst: advance by dst_stride bytes */
+        dst += dst_stride;
+#endif
+
+#if 1  /* contiguous src: advance by the 2-byte item size */
+        src += 2;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* the only return value of this function */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the one source item is read
+ * once and stored N times. Unrolling here, as above, is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86,
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 0  /* always false: this function is compiled out */
+static NPY_GCC_OPT_3 int
+_contig_to_strided_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items to store */
+    char *src = args[0], *dst = args[1];
+#if !0  /* dst is strided: its byte step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 2 != 16
+#  if !(2 == 1 && 0)
+    npy_uint16 temp;  /* holds the source item read before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;  /* nothing to store; src is never dereferenced */
+    }
+#if 0 && 2 != 16  /* alignment asserts are disabled in this variant */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+#if 2 == 1 && 0  /* memset shortcut is for 1-byte contiguous output only */
+    memset(dst, *src, N);
+#else
+
+#  if 2 != 16  /* the source item is read once, before the loop */
+    temp = _NPY_NOP2(*((npy_uint16 *)src));
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 2 != 16
+        *((npy_uint16 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 2;
+#  else  /* strided dst: advance by dst_stride bytes */
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 2 == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 2 >= 1 */
+
+
+#line 105
+
+#if (2 >= 1) && \
+    (2 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 1 == 0 || 0 == 0
+/* Copy N 2-byte items, contiguous src -> strided dst, each via _NPY_NOP2.
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 1 && 0 == 0 && 2 <= NPY_SIZEOF_INTP  /* unroll hint applies when NPY_SIZEOF_INTP >= 2 */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_contig_to_strided_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items to copy */
+    char *src = args[0], *dst = args[1];
+#if !1  /* src is contiguous: no src_stride is declared */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0  /* dst is strided: its byte step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: the initial pointers must be aligned for npy_uint16 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+    /* disabled debug trace: printf("fn _aligned_contig_to_strided_size2\n"); */
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap (the branch compiled for this function) */
+#  if 2 != 16  /* taken: one npy_uint16 load, _NPY_NOP2, one store */
+        (*((npy_uint16 *)dst)) = _NPY_NOP2(*((npy_uint16 *)src));
+#  else  /* 16-byte variant: not compiled for 2-byte items */
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (not compiled for this function) */
+        memmove(dst, src, 2);
+#  if 0 == 1
+        _NPY_NOP2(dst);
+#  elif 0 == 2
+        _NPY_NOP1(dst);
+        _NPY_NOP1(dst + 1);
+#  endif
+
+#endif
+
+#if 0
+        dst += 2;
+#else  /* strided dst: advance by dst_stride bytes */
+        dst += dst_stride;
+#endif
+
+#if 1  /* contiguous src: advance by the 2-byte item size */
+        src += 2;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* the only return value of this function */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the one source item is read
+ * once and stored N times. Unrolling here, as above, is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86,
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 1  /* always false (1 == 0): this function is compiled out */
+static NPY_GCC_OPT_3 int
+_aligned_contig_to_strided_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items to store */
+    char *src = args[0], *dst = args[1];
+#if !0  /* dst is strided: its byte step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 2 != 16
+#  if !(2 == 1 && 0)
+    npy_uint16 temp;  /* holds the source item read before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;  /* nothing to store; src is never dereferenced */
+    }
+#if 1 && 2 != 16
+    /* sanity check (N != 0 here, so the alignment test is always evaluated) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+#if 2 == 1 && 0  /* memset shortcut is for 1-byte contiguous output only */
+    memset(dst, *src, N);
+#else
+
+#  if 2 != 16  /* the source item is read once, before the loop */
+    temp = _NPY_NOP2(*((npy_uint16 *)src));
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 2 != 16
+        *((npy_uint16 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 2;
+#  else  /* strided dst: advance by dst_stride bytes */
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 2 == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 2 >= 1 */
+
+
+#line 105
+
+#if (2 >= 2) && \
+    (2 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 1 == 0 || 0 == 0  /* true; the enclosing #if needs !NPY_USE_UNALIGNED_ACCESS */
+/* Copy N 2-byte items, contiguous src -> strided dst (memmove + _NPY_SWAP_INPLACE2).
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 0 && 1 == 0 && 2 <= NPY_SIZEOF_INTP  /* false: no unroll hint here */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_contig_to_strided_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items to copy */
+    char *src = args[0], *dst = args[1];
+#if !1  /* src is contiguous: no src_stride is declared */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0  /* dst is strided: its byte step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0  /* alignment asserts are disabled in this (unaligned) variant */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+    /* disabled debug trace: printf("fn _swap_contig_to_strided_size2\n"); */
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap (not compiled for this function) */
+#  if 2 != 16
+        (*((npy_uint16 *)dst)) = _NPY_SWAP_INPLACE2(*((npy_uint16 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (the branch compiled for this function) */
+        memmove(dst, src, 2);
+#  if 1 == 1  /* taken: _NPY_SWAP_INPLACE2 applied to the copied item at dst */
+        _NPY_SWAP_INPLACE2(dst);
+#  elif 1 == 2
+        _NPY_SWAP_INPLACE1(dst);
+        _NPY_SWAP_INPLACE1(dst + 1);
+#  endif
+
+#endif
+
+#if 0
+        dst += 2;
+#else  /* strided dst: advance by dst_stride bytes */
+        dst += dst_stride;
+#endif
+
+#if 1  /* contiguous src: advance by the 2-byte item size */
+        src += 2;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* the only return value of this function */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the one source item is read
+ * once and stored N times. Unrolling here, as above, is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86,
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 0  /* always false: this function is compiled out */
+static NPY_GCC_OPT_3 int
+_swap_contig_to_strided_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items to store */
+    char *src = args[0], *dst = args[1];
+#if !0  /* dst is strided: its byte step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 2 != 16
+#  if !(2 == 1 && 0)
+    npy_uint16 temp;  /* holds the source item read before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;  /* nothing to store; src is never dereferenced */
+    }
+#if 0 && 2 != 16  /* alignment asserts are disabled in this variant */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+#if 2 == 1 && 0  /* memset shortcut is for 1-byte contiguous output only */
+    memset(dst, *src, N);
+#else
+
+#  if 2 != 16  /* the source item is read once, before the loop */
+    temp = _NPY_SWAP_INPLACE2(*((npy_uint16 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 2 != 16
+        *((npy_uint16 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 2;
+#  else  /* strided dst: advance by dst_stride bytes */
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 2 == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 2 >= 2 */
+
+
+#line 105
+
+#if (2 >= 2) && \
+    (2 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 1 == 0 || 0 == 0
+/* Copy N 2-byte items, contiguous src -> strided dst, each via _NPY_SWAP2.
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 1 && 1 == 0 && 2 <= NPY_SIZEOF_INTP  /* false (1 == 0): no unroll hint here */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_contig_to_strided_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items to copy */
+    char *src = args[0], *dst = args[1];
+#if !1  /* src is contiguous: no src_stride is declared */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0  /* dst is strided: its byte step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: the initial pointers must be aligned for npy_uint16 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+    /* disabled debug trace: printf("fn _aligned_swap_contig_to_strided_size2\n"); */
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap (the branch compiled for this function) */
+#  if 2 != 16  /* taken: one npy_uint16 load, _NPY_SWAP2, one store */
+        (*((npy_uint16 *)dst)) = _NPY_SWAP2(*((npy_uint16 *)src));
+#  else  /* 16-byte variant: not compiled for 2-byte items */
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (not compiled for this function) */
+        memmove(dst, src, 2);
+#  if 1 == 1
+        _NPY_SWAP2(dst);
+#  elif 1 == 2
+        _NPY_SWAP1(dst);
+        _NPY_SWAP1(dst + 1);
+#  endif
+
+#endif
+
+#if 0
+        dst += 2;
+#else  /* strided dst: advance by dst_stride bytes */
+        dst += dst_stride;
+#endif
+
+#if 1  /* contiguous src: advance by the 2-byte item size */
+        src += 2;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* the only return value of this function */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the one source item is read
+ * once and stored N times. Unrolling here, as above, is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86,
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 1  /* always false (1 == 0): this function is compiled out */
+static NPY_GCC_OPT_3 int
+_aligned_swap_contig_to_strided_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items to store */
+    char *src = args[0], *dst = args[1];
+#if !0  /* dst is strided: its byte step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 2 != 16
+#  if !(2 == 1 && 0)
+    npy_uint16 temp;  /* holds the source item after _NPY_SWAP2 */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;  /* nothing to store; src is never dereferenced */
+    }
+#if 1 && 2 != 16
+    /* sanity check (N != 0 here, so the alignment test is always evaluated) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+#if 2 == 1 && 0  /* memset shortcut is for 1-byte contiguous output only */
+    memset(dst, *src, N);
+#else
+
+#  if 2 != 16  /* the source item is read and swapped once, before the loop */
+    temp = _NPY_SWAP2(*((npy_uint16 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 2 != 16
+        *((npy_uint16 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 2;
+#  else  /* strided dst: advance by dst_stride bytes */
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 2 == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 2 >= 2 */
+
+
+#line 105
+
+#if (2 >= 4) && \
+    (2 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 1 == 0 || 0 == 0  /* moot: the enclosing #if (2 >= 4) ... is false */
+/* Copy N 2-byte items, contiguous src -> strided dst (memmove + _NPY_SWAP_INPLACE1).
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 0 && 2 == 0 && 2 <= NPY_SIZEOF_INTP  /* false: no unroll hint here */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_pair_contig_to_strided_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items to copy */
+    char *src = args[0], *dst = args[1];
+#if !1  /* src is contiguous: no src_stride is declared */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0  /* dst is strided: its byte step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0  /* alignment asserts are disabled in this (unaligned) variant */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+    /* disabled debug trace: printf("fn _swap_pair_contig_to_strided_size2\n"); */
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap (not selected for this function) */
+#  if 2 != 16
+        (*((npy_uint16 *)dst)) = _NPY_SWAP_INPLACE2(*((npy_uint16 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (the branch selected for this function) */
+        memmove(dst, src, 2);
+#  if 2 == 1
+        _NPY_SWAP_INPLACE2(dst);
+#  elif 2 == 2  /* selected: _NPY_SWAP_INPLACE1 applied at dst and dst + 1 */
+        _NPY_SWAP_INPLACE1(dst);
+        _NPY_SWAP_INPLACE1(dst + 1);
+#  endif
+
+#endif
+
+#if 0
+        dst += 2;
+#else  /* strided dst: advance by dst_stride bytes */
+        dst += dst_stride;
+#endif
+
+#if 1  /* contiguous src: advance by the 2-byte item size */
+        src += 2;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* the only return value of this function */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the one source item is read
+ * once and stored N times. Unrolling here, as above, is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86,
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 0  /* always false: this function is compiled out */
+static NPY_GCC_OPT_3 int
+_swap_pair_contig_to_strided_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items to store */
+    char *src = args[0], *dst = args[1];
+#if !0  /* dst is strided: its byte step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 2 != 16
+#  if !(2 == 1 && 0)
+    npy_uint16 temp;  /* holds the source item read before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;  /* nothing to store; src is never dereferenced */
+    }
+#if 0 && 2 != 16  /* alignment asserts are disabled in this variant */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+#if 2 == 1 && 0  /* memset shortcut is for 1-byte contiguous output only */
+    memset(dst, *src, N);
+#else
+
+#  if 2 != 16  /* the source item is read once, before the loop */
+    temp = _NPY_SWAP_INPLACE2(*((npy_uint16 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 2 != 16
+        *((npy_uint16 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 2;
+#  else  /* strided dst: advance by dst_stride bytes */
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 2 == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 2 >= 4 */
+
+
+#line 105
+
+#if (2 >= 4) && \
+    (2 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 1 == 0 || 0 == 0  /* moot: the enclosing #if (2 >= 4) ... is false */
+/* Copy N 2-byte items, contiguous src -> strided dst, each via _NPY_SWAP_PAIR2.
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction;
+ * if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 1 && 2 == 0 && 2 <= NPY_SIZEOF_INTP  /* false (2 == 0): no unroll hint here */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_pair_contig_to_strided_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items to copy */
+    char *src = args[0], *dst = args[1];
+#if !1  /* src is contiguous: no src_stride is declared */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0  /* dst is strided: its byte step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: the initial pointers must be aligned for npy_uint16 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+#endif
+    /* disabled debug trace: printf("fn _aligned_swap_pair_contig_to_strided_size2\n"); */
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap (the branch selected for this function) */
+#  if 2 != 16  /* selected: one npy_uint16 load, _NPY_SWAP_PAIR2, one store */
+        (*((npy_uint16 *)dst)) = _NPY_SWAP_PAIR2(*((npy_uint16 *)src));
+#  else  /* 16-byte variant: not selected for 2-byte items */
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (not selected for this function) */
+        memmove(dst, src, 2);
+#  if 2 == 1
+        _NPY_SWAP_PAIR2(dst);
+#  elif 2 == 2
+        _NPY_SWAP_PAIR1(dst);
+        _NPY_SWAP_PAIR1(dst + 1);
+#  endif
+
+#endif
+
+#if 0
+        dst += 2;
+#else  /* strided dst: advance by dst_stride bytes */
+        dst += dst_stride;
+#endif
+
+#if 1  /* contiguous src: advance by the 2-byte item size */
+        src += 2;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* the only return value of this function */
+}
+#endif
+
+
+/*
+ * Source-stride-0 variant of the aligned, pair-swapping copy of 2-byte
+ * items into a strided destination.
+ *
+ * The single item at args[0] is loaded and passed through _NPY_SWAP_PAIR2
+ * once; the result is then stored dimensions[0] times while the
+ * destination pointer advances by strides[1].  Always returns 0.
+ *
+ * Note from the generic loop: unrolling here, as for the regular copy
+ * loops, is only marginally profitable for small types and detrimental for
+ * >= 8 byte moves on x86, but the loop profits from the vectorization
+ * enabled with -O3.
+ *
+ * The guard below is constant-false, so this function is compiled out.
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_contig_to_strided_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp dst_stride = strides[1];
+    npy_uint16 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+
+    /* the source never moves, so the value is converted a single time */
+    temp = _NPY_SWAP_PAIR2(*((npy_uint16 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint16 *)dst) = temp;
+        dst += dst_stride;
+    }
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 2 >= 4 */
+
+
+
+#line 97
+#line 105
+
+#if (2 >= 1) && \
+    (2 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 1 == 0 || 1 == 0
+/*
+ * Plain copy (no swap step) of 2-byte items between two contiguous
+ * buffers, with no alignment requirement on either pointer.
+ *
+ * Copies dimensions[0] items from args[0] to args[1] using one 2-byte
+ * memmove per item; both pointers advance by the item size.  strides is
+ * not read.  Always returns 0.
+ *
+ * Performance note from the generic loop: unrolling gains about 20-50% if
+ * the copy can be done in one mov instruction, otherwise it can decrease
+ * performance (tested on intel xeon 5x/7x, core2duo, amd phenom x4).
+ * NPY_GCC_UNROLL_LOOPS is compiled out for this specialization.
+ *
+ * The guard above is constant-false, so this function is compiled out.
+ */
+static int
+_contig_to_contig_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    for (; N > 0; --N) {
+        /* unaligned copy: go through memmove instead of a typed store */
+        memmove(dst, src, 2);
+        dst += 2;
+        src += 2;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 variant of the plain 2-byte copy into a contiguous
+ * destination.
+ *
+ * The single item at args[0] is read once through _NPY_NOP2 and then
+ * stored dimensions[0] times, the destination pointer advancing by the
+ * item size (2 bytes).  strides is not read.  Always returns 0.
+ *
+ * Note from the generic loop: unrolling here, as for the regular copy
+ * loops, is only marginally profitable for small types and detrimental for
+ * >= 8 byte moves on x86, but the loop profits from the vectorization
+ * enabled with -O3.
+ *
+ * The guard below is constant-false, so this function is compiled out.
+ */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int
+_contig_to_contig_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_uint16 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+
+    /* the source never moves, so the value is fetched a single time */
+    temp = _NPY_NOP2(*((npy_uint16 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint16 *)dst) = temp;
+        dst += 2;
+    }
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 2 >= 1 */
+
+
+#line 105
+
+#if (2 >= 1) && \
+    (2 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 1 == 0 || 1 == 0
+/*
+ * Aligned plain copy (no swap step) of 2-byte items between two
+ * contiguous buffers.
+ *
+ * Loads dimensions[0] items from args[0], passes each through _NPY_NOP2
+ * and stores it at args[1]; both pointers advance by the item size
+ * (2 bytes).  strides is not read.  Always returns 0.
+ *
+ * Performance note from the generic loop: unrolling gains about 20-50% if
+ * the copy can be done in one mov instruction, otherwise it can decrease
+ * performance (tested on intel xeon 5x/7x, core2duo, amd phenom x4).
+ * NPY_GCC_UNROLL_LOOPS is therefore only requested when the item size
+ * does not exceed NPY_SIZEOF_INTP.
+ *
+ * The guard above is constant-false, so this function is compiled out.
+ */
+static int
+#if 2 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_contig_to_contig_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check: both ends must be aligned unless there is no work */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+
+    for (; N > 0; --N) {
+        /* one aligned 16-bit load and store per item */
+        *((npy_uint16 *)dst) = _NPY_NOP2(*((npy_uint16 *)src));
+        dst += 2;
+        src += 2;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 variant of the aligned plain 2-byte copy into a
+ * contiguous destination.
+ *
+ * The single item at args[0] is read once through _NPY_NOP2 and then
+ * stored dimensions[0] times, the destination pointer advancing by the
+ * item size (2 bytes).  strides is not read.  Always returns 0.
+ *
+ * Note from the generic loop: unrolling here, as for the regular copy
+ * loops, is only marginally profitable for small types and detrimental for
+ * >= 8 byte moves on x86, but the loop profits from the vectorization
+ * enabled with -O3.
+ *
+ * The guard below is constant-false, so this function is compiled out.
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_contig_to_contig_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_uint16 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+
+    /* the source never moves, so the value is fetched a single time */
+    temp = _NPY_NOP2(*((npy_uint16 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint16 *)dst) = temp;
+        dst += 2;
+    }
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 2 >= 1 */
+
+
+#line 105
+
+#if (2 >= 2) && \
+    (2 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 1 == 0 || 1 == 0
+/*
+ * Swapping copy of 2-byte items between two contiguous buffers, with no
+ * alignment requirement on either pointer.
+ *
+ * For each of the dimensions[0] items, 2 bytes are memmove'd from args[0]
+ * to args[1] and _NPY_SWAP_INPLACE2 is then applied to the destination
+ * item.  Both pointers advance by the item size.  strides is not read.
+ * Always returns 0.
+ *
+ * Performance note from the generic loop: unrolling gains about 20-50% if
+ * the copy can be done in one mov instruction, otherwise it can decrease
+ * performance (tested on intel xeon 5x/7x, core2duo, amd phenom x4).
+ * NPY_GCC_UNROLL_LOOPS is compiled out for this specialization.
+ */
+static int
+_swap_contig_to_contig_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    for (; N > 0; --N) {
+        /* unaligned copy first, then swap the copied item in place */
+        memmove(dst, src, 2);
+        _NPY_SWAP_INPLACE2(dst);
+        dst += 2;
+        src += 2;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 variant of the swapping 2-byte copy into a contiguous
+ * destination.
+ *
+ * The single item at args[0] is converted once with _NPY_SWAP_INPLACE2
+ * and the result is stored dimensions[0] times, the destination pointer
+ * advancing by the item size (2 bytes).  strides is not read.  Always
+ * returns 0.
+ *
+ * Note from the generic loop: unrolling here, as for the regular copy
+ * loops, is only marginally profitable for small types and detrimental for
+ * >= 8 byte moves on x86, but the loop profits from the vectorization
+ * enabled with -O3.
+ *
+ * The guard below is constant-false, so this function is compiled out.
+ * NOTE(review): _NPY_SWAP_INPLACE2 is handed a dereferenced value and its
+ * result is assigned here; verify this against the macro definition
+ * before ever enabling the guard.
+ */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_contig_to_contig_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_uint16 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+
+    /* the source never moves, so the value is converted a single time */
+    temp = _NPY_SWAP_INPLACE2(*((npy_uint16 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint16 *)dst) = temp;
+        dst += 2;
+    }
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 2 >= 2 */
+
+
+#line 105
+
+#if (2 >= 2) && \
+    (2 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 1 == 0 || 1 == 0
+/*
+ * Aligned swapping copy of 2-byte items between two contiguous buffers.
+ *
+ * Loads dimensions[0] items from args[0], passes each through _NPY_SWAP2
+ * and stores the result at args[1]; both pointers advance by the item
+ * size (2 bytes).  strides is not read.  Always returns 0.
+ *
+ * Performance note from the generic loop: unrolling gains about 20-50% if
+ * the copy can be done in one mov instruction, otherwise it can decrease
+ * performance (tested on intel xeon 5x/7x, core2duo, amd phenom x4).
+ * NPY_GCC_UNROLL_LOOPS is compiled out for this specialization.
+ */
+static int
+_aligned_swap_contig_to_contig_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check: both ends must be aligned unless there is no work */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+
+    for (; N > 0; --N) {
+        /* one aligned 16-bit load, swap and store per item */
+        *((npy_uint16 *)dst) = _NPY_SWAP2(*((npy_uint16 *)src));
+        dst += 2;
+        src += 2;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 variant of the aligned swapping 2-byte copy into a
+ * contiguous destination.
+ *
+ * The single item at args[0] is loaded and passed through _NPY_SWAP2
+ * once; the result is then stored dimensions[0] times, the destination
+ * pointer advancing by the item size (2 bytes).  strides is not read.
+ * Always returns 0.
+ *
+ * Note from the generic loop: unrolling here, as for the regular copy
+ * loops, is only marginally profitable for small types and detrimental for
+ * >= 8 byte moves on x86, but the loop profits from the vectorization
+ * enabled with -O3.
+ *
+ * The guard below is constant-false, so this function is compiled out.
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_contig_to_contig_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_uint16 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+
+    /* the source never moves, so the value is converted a single time */
+    temp = _NPY_SWAP2(*((npy_uint16 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint16 *)dst) = temp;
+        dst += 2;
+    }
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 2 >= 2 */
+
+
+#line 105
+
+#if (2 >= 4) && \
+    (2 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 1 == 0 || 1 == 0
+/*
+ * Pair-swapping copy of 2-byte items between two contiguous buffers, with
+ * no alignment requirement on either pointer.
+ *
+ * For each of the dimensions[0] items, 2 bytes are memmove'd from args[0]
+ * to args[1]; _NPY_SWAP_INPLACE1 is then applied to each 1-byte half of
+ * the destination item (at dst and dst + 1).  Both pointers advance by
+ * the item size.  strides is not read.  Always returns 0.
+ *
+ * Performance note from the generic loop: unrolling gains about 20-50% if
+ * the copy can be done in one mov instruction, otherwise it can decrease
+ * performance (tested on intel xeon 5x/7x, core2duo, amd phenom x4).
+ * NPY_GCC_UNROLL_LOOPS is compiled out for this specialization.
+ */
+static int
+_swap_pair_contig_to_contig_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    for (; N > 0; --N) {
+        /* unaligned copy first, then swap each half in place */
+        memmove(dst, src, 2);
+        _NPY_SWAP_INPLACE1(dst);
+        _NPY_SWAP_INPLACE1(dst + 1);
+        dst += 2;
+        src += 2;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 variant of the pair-swapping 2-byte copy into a
+ * contiguous destination.
+ *
+ * The single item at args[0] is converted once with _NPY_SWAP_INPLACE2
+ * and the result is stored dimensions[0] times, the destination pointer
+ * advancing by the item size (2 bytes).  strides is not read.  Always
+ * returns 0.
+ *
+ * Note from the generic loop: unrolling here, as for the regular copy
+ * loops, is only marginally profitable for small types and detrimental for
+ * >= 8 byte moves on x86, but the loop profits from the vectorization
+ * enabled with -O3.
+ *
+ * The guard below is constant-false, so this function is compiled out.
+ * NOTE(review): _NPY_SWAP_INPLACE2 is handed a dereferenced value and its
+ * result is assigned here; verify this against the macro definition
+ * before ever enabling the guard.
+ */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_pair_contig_to_contig_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_uint16 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+
+    /* the source never moves, so the value is converted a single time */
+    temp = _NPY_SWAP_INPLACE2(*((npy_uint16 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint16 *)dst) = temp;
+        dst += 2;
+    }
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 2 >= 4 */
+
+
+#line 105
+
+#if (2 >= 4) && \
+    (2 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 1 == 0 || 1 == 0
+/*
+ * Aligned, pair-swapping copy of 2-byte items between two contiguous
+ * buffers.
+ *
+ * Loads dimensions[0] items from args[0], passes each through
+ * _NPY_SWAP_PAIR2 and stores the result at args[1]; both pointers advance
+ * by the item size (2 bytes).  strides is not read.  Always returns 0.
+ *
+ * Performance note from the generic loop: unrolling gains about 20-50% if
+ * the copy can be done in one mov instruction, otherwise it can decrease
+ * performance (tested on intel xeon 5x/7x, core2duo, amd phenom x4).
+ * NPY_GCC_UNROLL_LOOPS is compiled out for this specialization.
+ */
+static int
+_aligned_swap_pair_contig_to_contig_size2(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check: both ends must be aligned unless there is no work */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+
+    for (; N > 0; --N) {
+        /* one aligned 16-bit load, swap and store per item */
+        *((npy_uint16 *)dst) = _NPY_SWAP_PAIR2(*((npy_uint16 *)src));
+        dst += 2;
+        src += 2;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 variant of the aligned, pair-swapping 2-byte copy into
+ * a contiguous destination.
+ *
+ * The single item at args[0] is loaded and passed through _NPY_SWAP_PAIR2
+ * once; the result is then stored dimensions[0] times, the destination
+ * pointer advancing by the item size (2 bytes).  strides is not read.
+ * Always returns 0.
+ *
+ * Note from the generic loop: unrolling here, as for the regular copy
+ * loops, is only marginally profitable for small types and detrimental for
+ * >= 8 byte moves on x86, but the loop profits from the vectorization
+ * enabled with -O3.
+ *
+ * The guard below is constant-false, so this function is compiled out.
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_contig_to_contig_size2_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_uint16 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint16)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint16)));
+
+    /* the source never moves, so the value is converted a single time */
+    temp = _NPY_SWAP_PAIR2(*((npy_uint16 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint16 *)dst) = temp;
+        dst += 2;
+    }
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 2 >= 4 */
+
+
+
+
+#line 91
+#line 97
+#line 105
+
+#if (4 >= 1) && \
+    (4 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 0 == 0 || 0 == 0
+/*
+ * Plain copy (no swap step) of 4-byte items from a strided source to a
+ * strided destination, with no alignment requirement on either pointer.
+ *
+ * Copies dimensions[0] items from args[0] to args[1] using one 4-byte
+ * memmove per item.  The source pointer advances by strides[0] and the
+ * destination pointer by strides[1].  Always returns 0.
+ *
+ * Performance note from the generic loop: unrolling gains about 20-50% if
+ * the copy can be done in one mov instruction, otherwise it can decrease
+ * performance (tested on intel xeon 5x/7x, core2duo, amd phenom x4).
+ * NPY_GCC_UNROLL_LOOPS is compiled out for this specialization.
+ */
+static int
+_strided_to_strided_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_stride = strides[0];
+    const npy_intp dst_stride = strides[1];
+
+    for (; N > 0; --N) {
+        /* unaligned copy: go through memmove instead of a typed store */
+        memmove(dst, src, 4);
+        dst += dst_stride;
+        src += src_stride;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 variant of the plain 4-byte copy into a strided
+ * destination.
+ *
+ * The single item at args[0] is read once through _NPY_NOP4 and then
+ * stored dimensions[0] times, the destination pointer advancing by
+ * strides[1].  Always returns 0.
+ *
+ * Note from the generic loop: unrolling here, as for the regular copy
+ * loops, is only marginally profitable for small types and detrimental for
+ * >= 8 byte moves on x86, but the loop profits from the vectorization
+ * enabled with -O3.
+ *
+ * The guard below is constant-false, so this function is compiled out.
+ */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int
+_strided_to_strided_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp dst_stride = strides[1];
+    npy_uint32 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+
+    /* the source never moves, so the value is fetched a single time */
+    temp = _NPY_NOP4(*((npy_uint32 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint32 *)dst) = temp;
+        dst += dst_stride;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 4 >= 1 */
+
+
+#line 105
+
+#if (4 >= 1) && \
+    (4 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 0 == 0 || 0 == 0
+/*
+ * Aligned plain copy (no swap step) of 4-byte items from a strided source
+ * to a strided destination.
+ *
+ * Loads dimensions[0] items from args[0], passes each through _NPY_NOP4
+ * and stores it at args[1].  The source pointer advances by strides[0]
+ * and the destination pointer by strides[1].  Always returns 0.
+ *
+ * Performance note from the generic loop: unrolling gains about 20-50% if
+ * the copy can be done in one mov instruction, otherwise it can decrease
+ * performance (tested on intel xeon 5x/7x, core2duo, amd phenom x4).
+ * NPY_GCC_UNROLL_LOOPS is therefore only requested when the item size
+ * does not exceed NPY_SIZEOF_INTP.
+ */
+static int
+#if 4 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_strided_to_strided_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_stride = strides[0];
+    const npy_intp dst_stride = strides[1];
+
+    /* sanity check: both ends must be aligned unless there is no work */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    for (; N > 0; --N) {
+        /* one aligned 32-bit load and store per item */
+        *((npy_uint32 *)dst) = _NPY_NOP4(*((npy_uint32 *)src));
+        dst += dst_stride;
+        src += src_stride;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Aligned plain copy of one 4-byte item to many strided destinations,
+ * specialized for a source stride of 0.
+ *
+ * The item at args[0] is read once and stored dimensions[0] times starting
+ * at args[1], advancing the destination by strides[1] bytes per store;
+ * strides[0] is not read. Always returns 0.
+ *
+ * Interestingly, unrolling here (as is done for the general loop) is only
+ * marginally profitable for small types and detrimental for >= 8byte moves
+ * on x86, but the loop profits from vectorization enabled with -O3.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_strided_to_strided_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp count = dimensions[0];
+    const npy_intp out_step = strides[1];
+    npy_uint32 value;
+
+    /* return before the source is dereferenced when nothing is copied */
+    if (count == 0) {
+        return 0;
+    }
+
+    assert(count == 0 || npy_is_aligned(out, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(count == 0 || npy_is_aligned(in, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    /* the source item is loaded exactly once */
+    value = _NPY_NOP4(*((npy_uint32 *)in));
+
+    for (; count > 0; --count) {
+        *((npy_uint32 *)out) = value;
+        out += out_step;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 4 >= 1 */
+
+
+#line 105
+
+#if (4 >= 2) && \
+    (4 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 0 == 0 || 0 == 0
+/*
+ * Copy-and-swap of 4-byte items between two strided buffers without any
+ * alignment requirement.
+ *
+ * Each of the dimensions[0] items is moved from args[0] to args[1] with
+ * memmove() and then passed to _NPY_SWAP_INPLACE4 on the destination. The
+ * source advances by strides[0] bytes and the destination by strides[1]
+ * bytes per item. Always returns 0.
+ */
+static int
+_swap_strided_to_strided_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp count = dimensions[0];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+
+    for (; count > 0; --count) {
+        /* byte-wise move, then swap inside the destination item */
+        memmove(out, in, 4);
+        _NPY_SWAP_INPLACE4(out);
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly unrolling here (as is done above) is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ *
+ * NOTE: the guard directly below, (0 == 0) && 0, evaluates to false, so
+ * _swap_strided_to_strided_size4_srcstride0 is never compiled; the text is
+ * only kept as the expansion of the shared template.
+ */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_strided_to_strided_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 4 != 16
+#  if !(4 == 1 && 0)
+    npy_uint32 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 4 != 16
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+#if 4 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 4 != 16
+    temp = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 4 != 16
+        *((npy_uint32 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 4;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 4 >= 2 */
+
+
+#line 105
+
+#if (4 >= 2) && \
+    (4 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 0 == 0 || 0 == 0
+/*
+ * Aligned copy-and-swap of 4-byte items between two strided buffers.
+ *
+ * For each of the dimensions[0] items, a 32-bit value is loaded from the
+ * source, passed through _NPY_SWAP4 and stored to the destination. The
+ * source (args[0]) advances by strides[0] bytes and the destination
+ * (args[1]) by strides[1] bytes per item. Both pointers are asserted to
+ * satisfy the npy_uint32 alignment whenever there is at least one item.
+ * Always returns 0.
+ */
+static int
+_aligned_swap_strided_to_strided_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp count = dimensions[0];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+
+    /* alignment is only required when there is something to copy */
+    assert(count == 0 || npy_is_aligned(out, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(count == 0 || npy_is_aligned(in, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    for (; count > 0; --count) {
+        /* load, swap and store as a single 32-bit value */
+        (*((npy_uint32 *)out)) = _NPY_SWAP4(*((npy_uint32 *)in));
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Aligned copy-and-swap of one 4-byte item to many strided destinations,
+ * specialized for a source stride of 0.
+ *
+ * The item at args[0] is read and passed through _NPY_SWAP4 once; the
+ * result is stored dimensions[0] times starting at args[1], advancing the
+ * destination by strides[1] bytes per store. strides[0] is not read.
+ * Always returns 0.
+ *
+ * Interestingly, unrolling here (as is done for the general loop) is only
+ * marginally profitable for small types and detrimental for >= 8byte moves
+ * on x86, but the loop profits from vectorization enabled with -O3.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_strided_to_strided_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp count = dimensions[0];
+    const npy_intp out_step = strides[1];
+    npy_uint32 value;
+
+    /* return before the source is dereferenced when nothing is copied */
+    if (count == 0) {
+        return 0;
+    }
+
+    assert(count == 0 || npy_is_aligned(out, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(count == 0 || npy_is_aligned(in, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    /* swap once up front instead of once per stored item */
+    value = _NPY_SWAP4(*((npy_uint32 *)in));
+
+    for (; count > 0; --count) {
+        *((npy_uint32 *)out) = value;
+        out += out_step;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 4 >= 2 */
+
+
+#line 105
+
+#if (4 >= 4) && \
+    (4 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 0 == 0 || 0 == 0
+/*
+ * Copy-and-pair-swap of 4-byte items between two strided buffers without
+ * any alignment requirement.
+ *
+ * Each of the dimensions[0] items is moved from args[0] to args[1] with
+ * memmove(); _NPY_SWAP_INPLACE2 is then applied separately to the first and
+ * to the second 2-byte half of the destination item. The source advances by
+ * strides[0] bytes and the destination by strides[1] bytes per item.
+ * Always returns 0.
+ */
+static int
+_swap_pair_strided_to_strided_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp count = dimensions[0];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+
+    for (; count > 0; --count) {
+        /* byte-wise move, then swap each 2-byte half on its own */
+        memmove(out, in, 4);
+        _NPY_SWAP_INPLACE2(out);
+        _NPY_SWAP_INPLACE2(out + 2);
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly unrolling here (as is done above) is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ *
+ * NOTE: the guard directly below, (0 == 0) && 0, evaluates to false, so
+ * _swap_pair_strided_to_strided_size4_srcstride0 is never compiled; the
+ * text is only kept as the expansion of the shared template.
+ */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_pair_strided_to_strided_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 4 != 16
+#  if !(4 == 1 && 0)
+    npy_uint32 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 4 != 16
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+#if 4 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 4 != 16
+    temp = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 4 != 16
+        *((npy_uint32 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 4;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 4 >= 4 */
+
+
+#line 105
+
+#if (4 >= 4) && \
+    (4 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 0 == 0 || 0 == 0
+/*
+ * Aligned copy-and-pair-swap of 4-byte items between two strided buffers.
+ *
+ * For each of the dimensions[0] items, a 32-bit value is loaded from the
+ * source, passed through _NPY_SWAP_PAIR4 (defined elsewhere; presumably it
+ * swaps within each 2-byte half) and stored to the destination. The source
+ * (args[0]) advances by strides[0] bytes and the destination (args[1]) by
+ * strides[1] bytes per item. Both pointers are asserted to satisfy the
+ * npy_uint32 alignment whenever there is at least one item.
+ * Always returns 0.
+ */
+static int
+_aligned_swap_pair_strided_to_strided_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp count = dimensions[0];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+
+    /* alignment is only required when there is something to copy */
+    assert(count == 0 || npy_is_aligned(out, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(count == 0 || npy_is_aligned(in, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    for (; count > 0; --count) {
+        /* load, pair-swap and store as a single 32-bit value */
+        (*((npy_uint32 *)out)) = _NPY_SWAP_PAIR4(*((npy_uint32 *)in));
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Aligned copy-and-pair-swap of one 4-byte item to many strided
+ * destinations, specialized for a source stride of 0.
+ *
+ * The item at args[0] is read and passed through _NPY_SWAP_PAIR4 once; the
+ * result is stored dimensions[0] times starting at args[1], advancing the
+ * destination by strides[1] bytes per store. strides[0] is not read.
+ * Always returns 0.
+ *
+ * Interestingly, unrolling here (as is done for the general loop) is only
+ * marginally profitable for small types and detrimental for >= 8byte moves
+ * on x86, but the loop profits from vectorization enabled with -O3.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_strided_to_strided_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp count = dimensions[0];
+    const npy_intp out_step = strides[1];
+    npy_uint32 value;
+
+    /* return before the source is dereferenced when nothing is copied */
+    if (count == 0) {
+        return 0;
+    }
+
+    assert(count == 0 || npy_is_aligned(out, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(count == 0 || npy_is_aligned(in, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    /* pair-swap once up front instead of once per stored item */
+    value = _NPY_SWAP_PAIR4(*((npy_uint32 *)in));
+
+    for (; count > 0; --count) {
+        *((npy_uint32 *)out) = value;
+        out += out_step;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 4 >= 4 */
+
+
+
+#line 97
+#line 105
+
+#if (4 >= 1) && \
+    (4 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 0 == 0 || 1 == 0
+/*
+ * Plain copy of 4-byte items from a strided source into a contiguous
+ * destination, without any alignment requirement.
+ *
+ * Each of the dimensions[0] items is moved from args[0] to args[1] with
+ * memmove(). The source advances by strides[0] bytes per item; the
+ * destination advances by the fixed item size of 4 bytes, so strides[1] is
+ * not read. Always returns 0.
+ */
+static int
+_strided_to_contig_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp count = dimensions[0];
+    const npy_intp in_step = strides[0];
+
+    for (; count > 0; --count) {
+        /* byte-wise move; no swap is applied in this variant */
+        memmove(out, in, 4);
+        out += 4;
+        in += in_step;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly unrolling here (as is done above) is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ *
+ * NOTE: the guard directly below, (0 == 0) && 0, evaluates to false, so
+ * _strided_to_contig_size4_srcstride0 is never compiled; the text is only
+ * kept as the expansion of the shared template.
+ */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int
+_strided_to_contig_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 4 != 16
+#  if !(4 == 1 && 1)
+    npy_uint32 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 4 != 16
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+#if 4 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 4 != 16
+    temp = _NPY_NOP4(*((npy_uint32 *)src));
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 4 != 16
+        *((npy_uint32 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 4;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 4 >= 1 */
+
+
+#line 105
+
+#if (4 >= 1) && \
+    (4 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 0 == 0 || 1 == 0
+/*
+ * Aligned plain copy of 4-byte items from a strided source into a
+ * contiguous destination.
+ *
+ * Transfers dimensions[0] items from args[0] to args[1]. The source advances
+ * by strides[0] bytes per item; the destination advances by the fixed item
+ * size of 4 bytes, so strides[1] is not read. Both pointers are asserted to
+ * satisfy the npy_uint32 alignment whenever there is at least one item.
+ * Always returns 0.
+ *
+ * NPY_GCC_UNROLL_LOOPS is applied when npy_intp is at least 4 bytes wide:
+ * unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction, if not it can decrease performance (tested to improve
+ * performance on intel xeon 5x/7x, core2duo, amd phenom x4).
+ */
+static int
+#if 4 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_strided_to_contig_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp count = dimensions[0];
+    const npy_intp in_step = strides[0];
+
+    /* alignment is only required when there is something to copy */
+    assert(count == 0 || npy_is_aligned(out, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(count == 0 || npy_is_aligned(in, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    for (; count > 0; --count) {
+        /* one 32-bit load and store per item */
+        (*((npy_uint32 *)out)) = _NPY_NOP4(*((npy_uint32 *)in));
+        out += 4;
+        in += in_step;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Aligned plain copy of one 4-byte item into a contiguous destination,
+ * specialized for a source stride of 0.
+ *
+ * The item at args[0] is read once and stored dimensions[0] times starting
+ * at args[1], advancing the destination by the fixed item size of 4 bytes
+ * per store. Neither entry of strides is read. Always returns 0.
+ *
+ * Interestingly, unrolling here (as is done for the general loop) is only
+ * marginally profitable for small types and detrimental for >= 8byte moves
+ * on x86, but the loop profits from vectorization enabled with -O3.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_strided_to_contig_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp count = dimensions[0];
+    npy_uint32 value;
+
+    /* return before the source is dereferenced when nothing is copied */
+    if (count == 0) {
+        return 0;
+    }
+
+    assert(count == 0 || npy_is_aligned(out, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(count == 0 || npy_is_aligned(in, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    /* the source item is loaded exactly once */
+    value = _NPY_NOP4(*((npy_uint32 *)in));
+
+    for (; count > 0; --count) {
+        *((npy_uint32 *)out) = value;
+        out += 4;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 4 >= 1 */
+
+
+#line 105
+
+#if (4 >= 2) && \
+    (4 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 0 == 0 || 1 == 0
+/*
+ * Copy-and-swap of 4-byte items from a strided source into a contiguous
+ * destination, without any alignment requirement.
+ *
+ * Each of the dimensions[0] items is moved from args[0] to args[1] with
+ * memmove() and then passed to _NPY_SWAP_INPLACE4 on the destination. The
+ * source advances by strides[0] bytes per item; the destination advances by
+ * the fixed item size of 4 bytes, so strides[1] is not read.
+ * Always returns 0.
+ */
+static int
+_swap_strided_to_contig_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp count = dimensions[0];
+    const npy_intp in_step = strides[0];
+
+    for (; count > 0; --count) {
+        /* byte-wise move, then swap inside the destination item */
+        memmove(out, in, 4);
+        _NPY_SWAP_INPLACE4(out);
+        out += 4;
+        in += in_step;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly unrolling here (as is done above) is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ *
+ * NOTE: the guard directly below, (0 == 0) && 0, evaluates to false, so
+ * _swap_strided_to_contig_size4_srcstride0 is never compiled; the text is
+ * only kept as the expansion of the shared template.
+ */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_strided_to_contig_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 4 != 16
+#  if !(4 == 1 && 1)
+    npy_uint32 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 4 != 16
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+#if 4 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 4 != 16
+    temp = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 4 != 16
+        *((npy_uint32 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 4;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 4 >= 2 */
+
+
+#line 105
+
+#if (4 >= 2) && \
+    (4 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 0 == 0 || 1 == 0
+/*
+ * Aligned copy-and-swap of 4-byte items from a strided source into a
+ * contiguous destination.
+ *
+ * For each of the dimensions[0] items, a 32-bit value is loaded from the
+ * source, passed through _NPY_SWAP4 and stored to the destination. The
+ * source (args[0]) advances by strides[0] bytes per item; the destination
+ * (args[1]) advances by the fixed item size of 4 bytes, so strides[1] is
+ * not read. Both pointers are asserted to satisfy the npy_uint32 alignment
+ * whenever there is at least one item. Always returns 0.
+ */
+static int
+_aligned_swap_strided_to_contig_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp count = dimensions[0];
+    const npy_intp in_step = strides[0];
+
+    /* alignment is only required when there is something to copy */
+    assert(count == 0 || npy_is_aligned(out, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(count == 0 || npy_is_aligned(in, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    for (; count > 0; --count) {
+        /* load, swap and store as a single 32-bit value */
+        (*((npy_uint32 *)out)) = _NPY_SWAP4(*((npy_uint32 *)in));
+        out += 4;
+        in += in_step;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 specialization of the aligned 4-byte swapping copy
+ * (strided -> contiguous).
+ *
+ * The single item at args[0] is passed through _NPY_SWAP4() once and the
+ * result is stored dimensions[0] times at args[1], 4 bytes apart.  Returns
+ * 0, immediately (before touching the source) when there are no items.  In
+ * assert-enabled builds both pointers are checked for npy_uint32 alignment.
+ *
+ * Upstream performance note: unrolling here, as in the general loop, is
+ * only marginally profitable for small types and detrimental for
+ * >= 8byte moves on x86, but the loop profits from the vectorization
+ * enabled with -O3.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_strided_to_contig_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_uint32 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    /* the source pointer never advances, so transform the item only once */
+    temp = _NPY_SWAP4(*((npy_uint32 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint32 *)dst) = temp;
+        dst += 4;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 4 >= 2 */
+
+
+#line 105
+
+#if (4 >= 4) && \
+    (4 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 0 == 0 || 1 == 0
+/*
+ * Unaligned pair-swapping copy of 4-byte items from a strided source to a
+ * contiguous destination.
+ *
+ * dimensions[0] items are read starting at args[0], strides[0] bytes apart.
+ * Each one is moved to the destination with memmove() and then
+ * _NPY_SWAP_INPLACE2() is applied to each 2-byte half of the stored item.
+ * Consecutive stores at args[1] are 4 bytes apart.  No alignment is
+ * asserted.  Always returns 0.
+ *
+ * Upstream performance note: unrolling gains about 20-50% if the copy can
+ * be done in one mov instruction; if not, it can decrease performance
+ * (tested to improve performance on intel xeon 5x/7x, core2duo,
+ * amd phenom x4).  NPY_GCC_UNROLL_LOOPS is not applied to this variant.
+ */
+static int
+_swap_pair_strided_to_contig_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp src_stride = strides[0];
+
+    for (; N > 0; --N) {
+        /* unaligned copy and swap */
+        memmove(dst, src, 4);
+        _NPY_SWAP_INPLACE2(dst);
+        _NPY_SWAP_INPLACE2(dst + 2);
+        dst += 4;
+        src += src_stride;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 specialization of the unaligned 4-byte pair-swapping copy
+ * (strided -> contiguous).  Compiled out: the guard below is constant-false
+ * for this instantiation.
+ *
+ * As written, the item at args[0] is passed through _NPY_SWAP_INPLACE4()
+ * once and the result is stored dimensions[0] times at args[1], 4 bytes
+ * apart; the function returns 0, immediately when there are no items.
+ *
+ * Upstream performance note: unrolling here, as in the general loop, is
+ * only marginally profitable for small types and detrimental for
+ * >= 8byte moves on x86, but the loop profits from the vectorization
+ * enabled with -O3.
+ */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_pair_strided_to_contig_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_uint32 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+
+    /* the source pointer never advances, so transform the item only once */
+    temp = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint32 *)dst) = temp;
+        dst += 4;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 4 >= 4 */
+
+
+#line 105
+
+#if (4 >= 4) && \
+    (4 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 0 == 0 || 1 == 0
+/*
+ * Aligned pair-swapping copy of 4-byte items from a strided source to a
+ * contiguous destination.
+ *
+ * dimensions[0] items are read starting at args[0], strides[0] bytes apart;
+ * each one is passed through _NPY_SWAP_PAIR4() and stored at args[1], with
+ * consecutive stores 4 bytes apart.  In assert-enabled builds both pointers
+ * are checked for npy_uint32 alignment unless there are no items.  Always
+ * returns 0.
+ *
+ * Upstream performance note: unrolling gains about 20-50% if the copy can
+ * be done in one mov instruction; if not, it can decrease performance
+ * (tested to improve performance on intel xeon 5x/7x, core2duo,
+ * amd phenom x4).  NPY_GCC_UNROLL_LOOPS is not applied to this variant.
+ */
+static int
+_aligned_swap_pair_strided_to_contig_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp src_stride = strides[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    for (; N > 0; --N) {
+        /* aligned copy and swap */
+        (*((npy_uint32 *)dst)) = _NPY_SWAP_PAIR4(*((npy_uint32 *)src));
+        dst += 4;
+        src += src_stride;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 specialization of the aligned 4-byte pair-swapping copy
+ * (strided -> contiguous).
+ *
+ * The single item at args[0] is passed through _NPY_SWAP_PAIR4() once and
+ * the result is stored dimensions[0] times at args[1], 4 bytes apart.
+ * Returns 0, immediately (before touching the source) when there are no
+ * items.  In assert-enabled builds both pointers are checked for npy_uint32
+ * alignment.
+ *
+ * Upstream performance note: unrolling here, as in the general loop, is
+ * only marginally profitable for small types and detrimental for
+ * >= 8byte moves on x86, but the loop profits from the vectorization
+ * enabled with -O3.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_strided_to_contig_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_uint32 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    /* the source pointer never advances, so transform the item only once */
+    temp = _NPY_SWAP_PAIR4(*((npy_uint32 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint32 *)dst) = temp;
+        dst += 4;
+    }
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 4 >= 4 */
+
+
+
+#line 97
+#line 105
+
+#if (4 >= 1) && \
+    (4 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 1 == 0 || 0 == 0
+/*
+ * Unaligned plain copy of 4-byte items from a contiguous source to a
+ * strided destination.
+ *
+ * dimensions[0] items are read starting at args[0], 4 bytes apart, and each
+ * one is moved with memmove() to args[1], where consecutive stores are
+ * strides[1] bytes apart.  No swap macro is applied and no alignment is
+ * asserted.  Always returns 0.
+ *
+ * Upstream performance note: unrolling gains about 20-50% if the copy can
+ * be done in one mov instruction; if not, it can decrease performance
+ * (tested to improve performance on intel xeon 5x/7x, core2duo,
+ * amd phenom x4).  NPY_GCC_UNROLL_LOOPS is not applied to this variant.
+ */
+static int
+_contig_to_strided_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp dst_stride = strides[1];
+
+    for (; N > 0; --N) {
+        /* unaligned copy, nothing to swap */
+        memmove(dst, src, 4);
+        dst += dst_stride;
+        src += 4;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 specialization of the unaligned 4-byte plain copy
+ * (contiguous -> strided).  Compiled out: the guard below is constant-false
+ * for this instantiation.
+ *
+ * As written, the item at args[0] is passed through _NPY_NOP4() once and
+ * the result is stored dimensions[0] times at args[1], strides[1] bytes
+ * apart; the function returns 0, immediately when there are no items.
+ *
+ * Upstream performance note: unrolling here, as in the general loop, is
+ * only marginally profitable for small types and detrimental for
+ * >= 8byte moves on x86, but the loop profits from the vectorization
+ * enabled with -O3.
+ */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int
+_contig_to_strided_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp dst_stride = strides[1];
+    npy_uint32 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+
+    /* the source pointer never advances, so load the item only once */
+    temp = _NPY_NOP4(*((npy_uint32 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint32 *)dst) = temp;
+        dst += dst_stride;
+    }
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 4 >= 1 */
+
+
+#line 105
+
+#if (4 >= 1) && \
+    (4 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 1 == 0 || 0 == 0
+/*
+ * Aligned plain copy of 4-byte items from a contiguous source to a strided
+ * destination.
+ *
+ * dimensions[0] items are read starting at args[0], 4 bytes apart; each one
+ * is passed through _NPY_NOP4() and stored at args[1], with consecutive
+ * stores strides[1] bytes apart.  In assert-enabled builds both pointers
+ * are checked for npy_uint32 alignment unless there are no items.  Always
+ * returns 0.
+ *
+ * Upstream performance note: unrolling gains about 20-50% if the copy can
+ * be done in one mov instruction; if not, it can decrease performance
+ * (tested to improve performance on intel xeon 5x/7x, core2duo,
+ * amd phenom x4).  This variant is tagged with NPY_GCC_UNROLL_LOOPS
+ * whenever 4 <= NPY_SIZEOF_INTP.
+ */
+static int
+#if 4 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_contig_to_strided_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp dst_stride = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    for (; N > 0; --N) {
+        /* aligned copy, nothing to swap */
+        (*((npy_uint32 *)dst)) = _NPY_NOP4(*((npy_uint32 *)src));
+        dst += dst_stride;
+        src += 4;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 specialization of the aligned 4-byte plain copy
+ * (contiguous -> strided).  Compiled out: the guard below is constant-false
+ * for this instantiation.
+ *
+ * As written, the item at args[0] is passed through _NPY_NOP4() once and
+ * the result is stored dimensions[0] times at args[1], strides[1] bytes
+ * apart; the function returns 0, immediately when there are no items, and
+ * asserts npy_uint32 alignment of both pointers otherwise.
+ *
+ * Upstream performance note: unrolling here, as in the general loop, is
+ * only marginally profitable for small types and detrimental for
+ * >= 8byte moves on x86, but the loop profits from the vectorization
+ * enabled with -O3.
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_contig_to_strided_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp dst_stride = strides[1];
+    npy_uint32 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    /* the source pointer never advances, so load the item only once */
+    temp = _NPY_NOP4(*((npy_uint32 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint32 *)dst) = temp;
+        dst += dst_stride;
+    }
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 4 >= 1 */
+
+
+#line 105
+
+#if (4 >= 2) && \
+    (4 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 1 == 0 || 0 == 0
+/*
+ * Unaligned swapping copy of 4-byte items from a contiguous source to a
+ * strided destination.
+ *
+ * dimensions[0] items are read starting at args[0], 4 bytes apart.  Each
+ * one is moved to the destination with memmove() and then
+ * _NPY_SWAP_INPLACE4() is applied to the stored item.  Consecutive stores
+ * at args[1] are strides[1] bytes apart.  No alignment is asserted.
+ * Always returns 0.
+ *
+ * Upstream performance note: unrolling gains about 20-50% if the copy can
+ * be done in one mov instruction; if not, it can decrease performance
+ * (tested to improve performance on intel xeon 5x/7x, core2duo,
+ * amd phenom x4).  NPY_GCC_UNROLL_LOOPS is not applied to this variant.
+ */
+static int
+_swap_contig_to_strided_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp dst_stride = strides[1];
+
+    for (; N > 0; --N) {
+        /* unaligned copy and swap */
+        memmove(dst, src, 4);
+        _NPY_SWAP_INPLACE4(dst);
+        dst += dst_stride;
+        src += 4;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 specialization of the unaligned 4-byte swapping copy
+ * (contiguous -> strided).  Compiled out: the guard below is constant-false
+ * for this instantiation.
+ *
+ * As written, the item at args[0] is passed through _NPY_SWAP_INPLACE4()
+ * once and the result is stored dimensions[0] times at args[1], strides[1]
+ * bytes apart; the function returns 0, immediately when there are no items.
+ *
+ * Upstream performance note: unrolling here, as in the general loop, is
+ * only marginally profitable for small types and detrimental for
+ * >= 8byte moves on x86, but the loop profits from the vectorization
+ * enabled with -O3.
+ */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_contig_to_strided_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp dst_stride = strides[1];
+    npy_uint32 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+
+    /* the source pointer never advances, so transform the item only once */
+    temp = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint32 *)dst) = temp;
+        dst += dst_stride;
+    }
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 4 >= 2 */
+
+
+#line 105
+
+#if (4 >= 2) && \
+    (4 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 1 == 0 || 0 == 0
+/*
+ * Aligned swapping copy of 4-byte items from a contiguous source to a
+ * strided destination.
+ *
+ * dimensions[0] items are read starting at args[0], 4 bytes apart; each one
+ * is passed through _NPY_SWAP4() and stored at args[1], with consecutive
+ * stores strides[1] bytes apart.  In assert-enabled builds both pointers
+ * are checked for npy_uint32 alignment unless there are no items.  Always
+ * returns 0.
+ *
+ * Upstream performance note: unrolling gains about 20-50% if the copy can
+ * be done in one mov instruction; if not, it can decrease performance
+ * (tested to improve performance on intel xeon 5x/7x, core2duo,
+ * amd phenom x4).  NPY_GCC_UNROLL_LOOPS is not applied to this variant.
+ */
+static int
+_aligned_swap_contig_to_strided_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp dst_stride = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    for (; N > 0; --N) {
+        /* aligned copy and swap */
+        (*((npy_uint32 *)dst)) = _NPY_SWAP4(*((npy_uint32 *)src));
+        dst += dst_stride;
+        src += 4;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 specialization of the aligned 4-byte swapping copy
+ * (contiguous -> strided).  Compiled out: the guard below is constant-false
+ * for this instantiation.
+ *
+ * As written, the item at args[0] is passed through _NPY_SWAP4() once and
+ * the result is stored dimensions[0] times at args[1], strides[1] bytes
+ * apart; the function returns 0, immediately when there are no items, and
+ * asserts npy_uint32 alignment of both pointers otherwise.
+ *
+ * Upstream performance note: unrolling here, as in the general loop, is
+ * only marginally profitable for small types and detrimental for
+ * >= 8byte moves on x86, but the loop profits from the vectorization
+ * enabled with -O3.
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_contig_to_strided_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp dst_stride = strides[1];
+    npy_uint32 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    /* the source pointer never advances, so transform the item only once */
+    temp = _NPY_SWAP4(*((npy_uint32 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint32 *)dst) = temp;
+        dst += dst_stride;
+    }
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 4 >= 2 */
+
+
+#line 105
+
+#if (4 >= 4) && \
+    (4 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 1 == 0 || 0 == 0
+/*
+ * Unaligned pair-swapping copy of 4-byte items from a contiguous source to
+ * a strided destination.
+ *
+ * dimensions[0] items are read starting at args[0], 4 bytes apart.  Each
+ * one is moved to the destination with memmove() and then
+ * _NPY_SWAP_INPLACE2() is applied to each 2-byte half of the stored item.
+ * Consecutive stores at args[1] are strides[1] bytes apart.  No alignment
+ * is asserted.  Always returns 0.
+ *
+ * Upstream performance note: unrolling gains about 20-50% if the copy can
+ * be done in one mov instruction; if not, it can decrease performance
+ * (tested to improve performance on intel xeon 5x/7x, core2duo,
+ * amd phenom x4).  NPY_GCC_UNROLL_LOOPS is not applied to this variant.
+ */
+static int
+_swap_pair_contig_to_strided_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp dst_stride = strides[1];
+
+    for (; N > 0; --N) {
+        /* unaligned copy and swap */
+        memmove(dst, src, 4);
+        _NPY_SWAP_INPLACE2(dst);
+        _NPY_SWAP_INPLACE2(dst + 2);
+        dst += dst_stride;
+        src += 4;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Source-stride-0 specialization of the unaligned 4-byte pair-swapping copy
+ * (contiguous -> strided).  Compiled out: the guard below is constant-false
+ * for this instantiation.
+ *
+ * As written, the item at args[0] is passed through _NPY_SWAP_INPLACE4()
+ * once and the result is stored dimensions[0] times at args[1], strides[1]
+ * bytes apart; the function returns 0, immediately when there are no items.
+ *
+ * Upstream performance note: unrolling here, as in the general loop, is
+ * only marginally profitable for small types and detrimental for
+ * >= 8byte moves on x86, but the loop profits from the vectorization
+ * enabled with -O3.
+ */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_pair_contig_to_strided_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp dst_stride = strides[1];
+    npy_uint32 temp;
+
+    if (N == 0) {
+        return 0;
+    }
+
+    /* the source pointer never advances, so transform the item only once */
+    temp = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));
+    for (; N > 0; --N) {
+        *((npy_uint32 *)dst) = temp;
+        dst += dst_stride;
+    }
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 4 >= 4 */
+
+
+#line 105
+
+#if (4 >= 4) && \
+    (4 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 1 == 0 || 0 == 0
+/*
+ * Aligned pair-swapping copy of 4-byte items from a contiguous source to a
+ * strided destination.
+ *
+ * dimensions[0] items are read starting at args[0], 4 bytes apart; each one
+ * is passed through _NPY_SWAP_PAIR4() and stored at args[1], with
+ * consecutive stores strides[1] bytes apart.  In assert-enabled builds both
+ * pointers are checked for npy_uint32 alignment unless there are no items.
+ * Always returns 0.
+ *
+ * Upstream performance note: unrolling gains about 20-50% if the copy can
+ * be done in one mov instruction; if not, it can decrease performance
+ * (tested to improve performance on intel xeon 5x/7x, core2duo,
+ * amd phenom x4).  NPY_GCC_UNROLL_LOOPS is not applied to this variant.
+ */
+static int
+_aligned_swap_pair_contig_to_strided_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp dst_stride = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+
+    for (; N > 0; --N) {
+        /* aligned copy and swap */
+        (*((npy_uint32 *)dst)) = _NPY_SWAP_PAIR4(*((npy_uint32 *)src));
+        dst += dst_stride;
+        src += 4;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: reads the item at args[0] once (through
+ * _NPY_SWAP_PAIR4, defined outside this view) and stores it dimensions[0] times into args[1],
+ * stepping by strides[1]; returns 0. Unrolling here is only marginally profitable for small types
+ * and detrimental for >= 8 byte moves on x86, but the loop profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_contig_to_strided_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 4 != 16
+#  if !(4 == 1 && 0)
+    npy_uint32 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 1 && 4 != 16
+    /* sanity check: N != 0 here (early return above), so both alignment tests are evaluated */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+#if 4 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 4 != 16
+    temp = _NPY_SWAP_PAIR4(*((npy_uint32 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 4 != 16
+        *((npy_uint32 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 4;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 4 == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 -- constant-false: this function is compiled out */
+
+#endif/* 4 >= 4 */
+
+
+
+#line 97
+#line 105
+
+#if (4 >= 1) && \
+    (4 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 1 == 0 || 1 == 0
+/*
+ * Copies dimensions[0] 4-byte items from contiguous args[0] to contiguous args[1] via memmove; returns 0.
+ * The guard above is constant-false, so this variant is compiled out. Unrolling gains about 20-50% if the copy
+ * is one mov instruction, else it can decrease performance; tested on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int
+#if 0 && 0 == 0 && 4 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif/* leading 0 makes this false: the unroll attribute is never applied here */
+_contig_to_contig_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0
+    /* sanity check (compiled out by the "#if 0" above) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+    /*printf("fn _contig_to_contig_size4\n");*/
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap (compiled out by the "#if 0" above) */
+#  if 4 != 16
+        (*((npy_uint32 *)dst)) = _NPY_NOP4(*((npy_uint32 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (active branch): plain memmove, no swap step is selected */
+        memmove(dst, src, 4);
+#  if 0 == 1
+        _NPY_NOP4(dst);
+#  elif 0 == 2
+        _NPY_NOP2(dst);
+        _NPY_NOP2(dst + 2);
+#  endif
+
+#endif
+
+#if 1
+        dst += 4;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 4;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif/* 0 || 1 == 0 || 1 == 0 -- constant-false: this function is compiled out */
+
+
+/*
+ * Specialized copy and swap for source stride 0: reads the item at args[0] once (through
+ * _NPY_NOP4, defined outside this view) and stores it dimensions[0] times into args[1],
+ * stepping by 4 bytes; returns 0. Unrolling here is only marginally profitable for small types
+ * and detrimental for >= 8 byte moves on x86, but the loop profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int
+_contig_to_contig_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 4 != 16
+#  if !(4 == 1 && 1)
+    npy_uint32 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 4 != 16
+    /* sanity check (compiled out: the condition above is constant-false) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+#if 4 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 4 != 16
+    temp = _NPY_NOP4(*((npy_uint32 *)src));
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 4 != 16
+        *((npy_uint32 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 4;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 4 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 -- constant-false: this function is compiled out */
+
+#endif/* 4 >= 1 */
+
+
+#line 105
+
+#if (4 >= 1) && \
+    (4 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 1 == 0 || 1 == 0
+/*
+ * Copies dimensions[0] 4-byte items from contiguous args[0] to contiguous args[1], one npy_uint32 at a time
+ * through _NPY_NOP4 (defined outside this view); returns 0. The guard above is constant-false, so this variant
+ * is compiled out. Unrolling gains about 20-50% if the copy is one mov instruction, else it can decrease performance
+ */
+static int
+#if 1 && 0 == 0 && 4 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif/* unroll attribute applied only when 4 <= NPY_SIZEOF_INTP */
+_aligned_contig_to_contig_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: unless N == 0, dst and src must pass npy_is_aligned for npy_uint32 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+    /*printf("fn _aligned_contig_to_contig_size4\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap (active branch): one npy_uint32 load/store per item */
+#  if 4 != 16
+        (*((npy_uint32 *)dst)) = _NPY_NOP4(*((npy_uint32 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (compiled out by the "#if 1" above) */
+        memmove(dst, src, 4);
+#  if 0 == 1
+        _NPY_NOP4(dst);
+#  elif 0 == 2
+        _NPY_NOP2(dst);
+        _NPY_NOP2(dst + 2);
+#  endif
+
+#endif
+
+#if 1
+        dst += 4;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 4;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif/* 0 || 1 == 0 || 1 == 0 -- constant-false: this function is compiled out */
+
+
+/*
+ * Specialized copy and swap for source stride 0: reads the item at args[0] once (through
+ * _NPY_NOP4, defined outside this view) and stores it dimensions[0] times into args[1],
+ * stepping by 4 bytes; returns 0. Unrolling here is only marginally profitable for small types
+ * and detrimental for >= 8 byte moves on x86, but the loop profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_contig_to_contig_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 4 != 16
+#  if !(4 == 1 && 1)
+    npy_uint32 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 1 && 4 != 16
+    /* sanity check: N != 0 here (early return above), so both alignment tests are evaluated */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+#if 4 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 4 != 16
+    temp = _NPY_NOP4(*((npy_uint32 *)src));
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 4 != 16
+        *((npy_uint32 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 4;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 4 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 -- constant-false: this function is compiled out */
+
+#endif/* 4 >= 1 */
+
+
+#line 105
+
+#if (4 >= 2) && \
+    (4 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 1 == 0 || 1 == 0
+/*
+ * Copies dimensions[0] 4-byte items from contiguous args[0] to contiguous args[1]: memmove, then
+ * _NPY_SWAP_INPLACE4 on the destination (macro defined outside this view); returns 0. Unrolling gains about 20-50%
+ * if the copy is one mov instruction, else it can decrease performance; tested on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int
+#if 0 && 1 == 0 && 4 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif/* leading 0 makes this false: the unroll attribute is never applied here */
+_swap_contig_to_contig_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0
+    /* sanity check (compiled out by the "#if 0" above) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+    /*printf("fn _swap_contig_to_contig_size4\n");*/
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap (compiled out by the "#if 0" above) */
+#  if 4 != 16
+        (*((npy_uint32 *)dst)) = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (active branch): memmove, then swap the 4 destination bytes in place */
+        memmove(dst, src, 4);
+#  if 1 == 1
+        _NPY_SWAP_INPLACE4(dst);
+#  elif 1 == 2
+        _NPY_SWAP_INPLACE2(dst);
+        _NPY_SWAP_INPLACE2(dst + 2);
+#  endif
+
+#endif
+
+#if 1
+        dst += 4;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 4;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif/* 1 || 1 == 0 || 1 == 0 */
+
+
+/*
+ * Specialized copy and swap for source stride 0: reads the item at args[0] once (through
+ * _NPY_SWAP_INPLACE4, defined outside this view) and stores it dimensions[0] times into args[1],
+ * stepping by 4 bytes; returns 0. Unrolling here is only marginally profitable for small types
+ * and detrimental for >= 8 byte moves on x86, but the loop profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_contig_to_contig_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 4 != 16
+#  if !(4 == 1 && 1)
+    npy_uint32 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 4 != 16
+    /* sanity check (compiled out: the condition above is constant-false) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+#if 4 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 4 != 16
+    temp = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 4 != 16
+        *((npy_uint32 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 4;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 4 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 -- constant-false: this function is compiled out */
+
+#endif/* 4 >= 2 */
+
+
+#line 105
+
+#if (4 >= 2) && \
+    (4 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 1 == 0 || 1 == 0
+/*
+ * Copies dimensions[0] 4-byte items from contiguous args[0] to contiguous args[1], passing each npy_uint32
+ * through _NPY_SWAP4 (defined outside this view); returns 0. Unrolling gains about 20-50% if the copy
+ * is one mov instruction, else it can decrease performance; tested on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int
+#if 1 && 1 == 0 && 4 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif/* 1 == 0 is false, so the unroll attribute is never applied here */
+_aligned_swap_contig_to_contig_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: unless N == 0, dst and src must pass npy_is_aligned for npy_uint32 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+    /*printf("fn _aligned_swap_contig_to_contig_size4\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap (active branch): one npy_uint32 load/store per item */
+#  if 4 != 16
+        (*((npy_uint32 *)dst)) = _NPY_SWAP4(*((npy_uint32 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (compiled out by the "#if 1" above) */
+        memmove(dst, src, 4);
+#  if 1 == 1
+        _NPY_SWAP4(dst);
+#  elif 1 == 2
+        _NPY_SWAP2(dst);
+        _NPY_SWAP2(dst + 2);
+#  endif
+
+#endif
+
+#if 1
+        dst += 4;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 4;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif/* 1 || 1 == 0 || 1 == 0 */
+
+
+/*
+ * Specialized copy and swap for source stride 0: reads the item at args[0] once (through
+ * _NPY_SWAP4, defined outside this view) and stores it dimensions[0] times into args[1],
+ * stepping by 4 bytes; returns 0. Unrolling here is only marginally profitable for small types
+ * and detrimental for >= 8 byte moves on x86, but the loop profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_contig_to_contig_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 4 != 16
+#  if !(4 == 1 && 1)
+    npy_uint32 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 1 && 4 != 16
+    /* sanity check: N != 0 here (early return above), so both alignment tests are evaluated */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+#if 4 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 4 != 16
+    temp = _NPY_SWAP4(*((npy_uint32 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 4 != 16
+        *((npy_uint32 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 4;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 4 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 -- constant-false: this function is compiled out */
+
+#endif/* 4 >= 2 */
+
+
+#line 105
+
+#if (4 >= 4) && \
+    (4 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 1 == 0 || 1 == 0
+/*
+ * Copies dimensions[0] 4-byte items from contiguous args[0] to contiguous args[1]: memmove, then
+ * _NPY_SWAP_INPLACE2 on each 2-byte half of the destination (macro defined outside this view); returns 0. Unrolling gains
+ * about 20-50% if the copy is one mov instruction, else it can decrease performance; tested on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int
+#if 0 && 2 == 0 && 4 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif/* leading 0 makes this false: the unroll attribute is never applied here */
+_swap_pair_contig_to_contig_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0
+    /* sanity check (compiled out by the "#if 0" above) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+    /*printf("fn _swap_pair_contig_to_contig_size4\n");*/
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap (compiled out by the "#if 0" above) */
+#  if 4 != 16
+        (*((npy_uint32 *)dst)) = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (active branch): memmove, then swap each 2-byte half in place */
+        memmove(dst, src, 4);
+#  if 2 == 1
+        _NPY_SWAP_INPLACE4(dst);
+#  elif 2 == 2
+        _NPY_SWAP_INPLACE2(dst);
+        _NPY_SWAP_INPLACE2(dst + 2);
+#  endif
+
+#endif
+
+#if 1
+        dst += 4;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 4;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif/* 2 || 1 == 0 || 1 == 0 */
+
+
+/*
+ * Specialized copy and swap for source stride 0: reads the item at args[0] once (through
+ * _NPY_SWAP_INPLACE4, defined outside this view) and stores it dimensions[0] times into args[1],
+ * stepping by 4 bytes; returns 0. Unrolling here is only marginally profitable for small types
+ * and detrimental for >= 8 byte moves on x86, but the loop profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_pair_contig_to_contig_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 4 != 16
+#  if !(4 == 1 && 1)
+    npy_uint32 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 4 != 16
+    /* sanity check (compiled out: the condition above is constant-false) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+#if 4 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 4 != 16
+    temp = _NPY_SWAP_INPLACE4(*((npy_uint32 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 4 != 16
+        *((npy_uint32 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 4;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 4 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 -- constant-false: this function is compiled out */
+
+#endif/* 4 >= 4 */
+
+
+#line 105
+
+#if (4 >= 4) && \
+    (4 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 1 == 0 || 1 == 0
+/*
+ * Copies dimensions[0] 4-byte items from contiguous args[0] to contiguous args[1], passing each npy_uint32
+ * through _NPY_SWAP_PAIR4 (defined outside this view); returns 0. Unrolling gains about 20-50% if the copy
+ * is one mov instruction, else it can decrease performance; tested on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int
+#if 1 && 2 == 0 && 4 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif/* 2 == 0 is false, so the unroll attribute is never applied here */
+_aligned_swap_pair_contig_to_contig_size4(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: unless N == 0, dst and src must pass npy_is_aligned for npy_uint32 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+    /*printf("fn _aligned_swap_pair_contig_to_contig_size4\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap (active branch): one npy_uint32 load/store per item */
+#  if 4 != 16
+        (*((npy_uint32 *)dst)) = _NPY_SWAP_PAIR4(*((npy_uint32 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (compiled out by the "#if 1" above) */
+        memmove(dst, src, 4);
+#  if 2 == 1
+        _NPY_SWAP_PAIR4(dst);
+#  elif 2 == 2
+        _NPY_SWAP_PAIR2(dst);
+        _NPY_SWAP_PAIR2(dst + 2);
+#  endif
+
+#endif
+
+#if 1
+        dst += 4;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 4;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif/* 2 || 1 == 0 || 1 == 0 */
+
+
+/*
+ * Specialized copy and swap for source stride 0: reads the item at args[0] once (through
+ * _NPY_SWAP_PAIR4, defined outside this view) and stores it dimensions[0] times into args[1],
+ * stepping by 4 bytes; returns 0. Unrolling here is only marginally profitable for small types
+ * and detrimental for >= 8 byte moves on x86, but the loop profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_contig_to_contig_size4_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 4 != 16
+#  if !(4 == 1 && 1)
+    npy_uint32 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 1 && 4 != 16
+    /* sanity check: N != 0 here (early return above), so both alignment tests are evaluated */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint32)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint32)));
+#endif
+#if 4 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 4 != 16
+    temp = _NPY_SWAP_PAIR4(*((npy_uint32 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 4 != 16
+        *((npy_uint32 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 4;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 4 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 -- constant-false: this function is compiled out */
+
+#endif/* 4 >= 4 */
+
+
+
+
+#line 91
+#line 97
+#line 105
+
+#if (8 >= 1) && \
+    (8 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 0 == 0 || 0 == 0
+/*
+ * Copies dimensions[0] 8-byte items from args[0] (step strides[0]) to args[1] (step strides[1]) with one
+ * memmove per item and no swap step; returns 0. Unrolling gains about 20-50% if the copy is one mov
+ * instruction, else it can decrease performance; tested on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int
+#if 0 && 0 == 0 && 8 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif/* leading 0 makes this false: the unroll attribute is never applied here */
+_strided_to_strided_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0];
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0
+    /* sanity check (compiled out by the "#if 0" above) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _strided_to_strided_size8\n");*/
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap (compiled out by the "#if 0" above) */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_NOP8(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (active branch): plain memmove, no swap step is selected */
+        memmove(dst, src, 8);
+#  if 0 == 1
+        _NPY_NOP8(dst);
+#  elif 0 == 2
+        _NPY_NOP4(dst);
+        _NPY_NOP4(dst + 4);
+#  endif
+
+#endif
+
+#if 0
+        dst += 8;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 8;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif/* 0 || 0 == 0 || 0 == 0 */
+
+
+/*
+ * Specialized copy and swap for source stride 0: reads the item at args[0] once (through
+ * _NPY_NOP8, defined outside this view) and stores it dimensions[0] times into args[1],
+ * stepping by strides[1]; returns 0. Unrolling here is only marginally profitable for small types
+ * and detrimental for >= 8 byte moves on x86, but the loop profits from vectorization enabled with -O3
+ */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int
+_strided_to_strided_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 0)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 8 != 16
+    /* sanity check (compiled out: the condition above is constant-false) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_NOP8(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 8;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 8 == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 -- constant-false: this function is compiled out */
+
+#endif/* 8 >= 1 */
+
+
+#line 105
+
+#if (8 >= 1) && \
+    (8 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 0 == 0 || 0 == 0
+/*
+ * Copies dimensions[0] 8-byte items from args[0] (step strides[0]) to args[1] (step strides[1]), one
+ * npy_uint64 at a time through _NPY_NOP8 (defined outside this view); returns 0. Unrolling gains about 20-50% if the
+ * copy is one mov instruction, else it can decrease performance; tested on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int
+#if 1 && 0 == 0 && 8 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif/* unroll attribute applied only when 8 <= NPY_SIZEOF_INTP */
+_aligned_strided_to_strided_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0];
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: unless N == 0, dst and src must pass npy_is_aligned for npy_uint64 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_strided_to_strided_size8\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap (active branch): one npy_uint64 load/store per item */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_NOP8(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (compiled out by the "#if 1" above) */
+        memmove(dst, src, 8);
+#  if 0 == 1
+        _NPY_NOP8(dst);
+#  elif 0 == 2
+        _NPY_NOP4(dst);
+        _NPY_NOP4(dst + 4);
+#  endif
+
+#endif
+
+#if 0
+        dst += 8;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 8;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif/* 0 || 0 == 0 || 0 == 0 */
+
+
+/*
+ * Specialized copy and swap for source stride 0: the one source element at args[0] is loaded once and
+ * stored to dimensions[0] destination slots. Interestingly, unrolling here as is done above is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_strided_to_strided_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* destination slots still to fill */
+    char *src = args[0], *dst = args[1];  /* src is never advanced; dst is the write cursor */
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step applied to dst after each store */
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 0)
+    npy_uint64 temp;  /* cached source element, written to every destination slot */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* empty request: return before src is dereferenced */
+        return 0;
+    }
+#if 1 && 8 != 16
+    /* sanity check (N is non-zero here, so the alignment tests are what is actually asserted) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 0
+    memset(dst, *src, N);
+#else  /* always taken here: the memset shortcut above is compiled out */
+
+#  if 8 != 16
+    temp = _NPY_NOP8(*((npy_uint64 *)src));  /* read the source once; _NPY_NOP8 presumably a pass-through -- confirm */
+#  else  /* 16-byte element handling: never compiled in this size-8 instantiation */
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;  /* store the cached element */
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 8;
+#  else
+        dst += dst_stride;  /* strided output: advance by the caller-supplied byte stride */
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;  /* always 0; no failure path */
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 8 >= 1 */
+
+
+#line 105
+
+#if (8 >= 2) && \
+    (8 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 0 == 0 || 0 == 0
+/*
+ * Copies dimensions[0] 8-byte elements from args[0] to args[1] (strides[0] / strides[1] bytes apart), then swaps each in place at dst.
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction; if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 0 && 1 == 0 && 8 <= NPY_SIZEOF_INTP  /* constant-false: this variant never gets NPY_GCC_UNROLL_LOOPS */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_strided_to_strided_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* elements still to copy */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0];  /* byte step applied to src after each element */
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step applied to dst after each element */
+#endif
+
+#if 0  /* alignment asserts are compiled out for this unaligned variant */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _swap_strided_to_strided_size8\n");*/
+    while (N > 0) {
+#if 0  /* the aligned path is compiled out; the #else branch below is the live code */
+
+        /* aligned copy and swap */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (the branch compiled for this variant) */
+        memmove(dst, src, 8);  /* byte-wise copy, so no alignment is required of src or dst */
+#  if 1 == 1
+        _NPY_SWAP_INPLACE8(dst);  /* presumably reverses the 8 bytes at dst in place -- confirm at the macro definition */
+#  elif 1 == 2
+        _NPY_SWAP_INPLACE4(dst);
+        _NPY_SWAP_INPLACE4(dst + 4);
+#  endif
+
+#endif
+
+#if 0
+        dst += 8;
+#else
+        dst += dst_stride;  /* strided output: advance by the caller-supplied byte stride */
+#endif
+
+#if 0
+        src += 8;
+#else
+        src += src_stride;  /* strided input: advance by the caller-supplied byte stride */
+#endif
+
+        --N;
+    }
+    return 0;  /* always 0; the loop has no failure path */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the one source element at args[0] is loaded once and
+ * stored to dimensions[0] destination slots. Interestingly, unrolling here as is done above is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (0 == 0) && 0  /* constant-false: this unaligned srcstride0 variant is never compiled */
+static NPY_GCC_OPT_3 int
+_swap_strided_to_strided_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* destination slots still to fill */
+    char *src = args[0], *dst = args[1];  /* src is never advanced; dst is the write cursor */
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step applied to dst after each store */
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 0)
+    npy_uint64 temp;  /* cached source element, written to every destination slot */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* empty request: return before src is dereferenced */
+        return 0;
+    }
+#if 0 && 8 != 16
+    /* sanity check (disabled by the constant 0 above) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));  /* NOTE(review): dead code; confirm _NPY_SWAP_INPLACE8 is usable as a value expression before enabling */
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;  /* store the cached element */
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 8;
+#  else
+        dst += dst_stride;  /* strided output: advance by the caller-supplied byte stride */
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 8 >= 2 */
+
+
+#line 105
+
+#if (8 >= 2) && \
+    (8 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 0 == 0 || 0 == 0
+/*
+ * Copies dimensions[0] aligned 8-byte elements from args[0] to args[1] (strides[0] / strides[1] bytes apart) through _NPY_SWAP8.
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction; if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 1 && 1 == 0 && 8 <= NPY_SIZEOF_INTP  /* constant-false (1 == 0): this variant never gets NPY_GCC_UNROLL_LOOPS */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_strided_to_strided_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* elements still to copy */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0];  /* byte step applied to src after each element */
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step applied to dst after each element */
+#endif
+
+#if 1
+    /* sanity check: unless N is 0, both pointers must pass the npy_uint64 alignment test */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_swap_strided_to_strided_size8\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap (the branch compiled for this variant) */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));  /* _NPY_SWAP8 presumably reverses the byte order of the value -- confirm at its definition */
+#  else  /* 16-byte element handling: never compiled in this size-8 instantiation */
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else  /* never compiled here: the aligned branch above is selected */
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 8);
+#  if 1 == 1
+        _NPY_SWAP8(dst);
+#  elif 1 == 2
+        _NPY_SWAP4(dst);
+        _NPY_SWAP4(dst + 4);
+#  endif
+
+#endif
+
+#if 0
+        dst += 8;
+#else
+        dst += dst_stride;  /* strided output: advance by the caller-supplied byte stride */
+#endif
+
+#if 0
+        src += 8;
+#else
+        src += src_stride;  /* strided input: advance by the caller-supplied byte stride */
+#endif
+
+        --N;
+    }
+    return 0;  /* always 0; the loop has no failure path */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the one source element at args[0] is loaded and swapped once,
+ * then stored to dimensions[0] destination slots. Interestingly, unrolling here as is done above is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_strided_to_strided_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* destination slots still to fill */
+    char *src = args[0], *dst = args[1];  /* src is never advanced; dst is the write cursor */
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step applied to dst after each store */
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 0)
+    npy_uint64 temp;  /* cached, already-swapped source element */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* empty request: return before src is dereferenced */
+        return 0;
+    }
+#if 1 && 8 != 16
+    /* sanity check (N is non-zero here, so the alignment tests are what is actually asserted) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 0
+    memset(dst, *src, N);
+#else  /* always taken here: the memset shortcut above is compiled out */
+
+#  if 8 != 16
+    temp = _NPY_SWAP8(*((npy_uint64 *)src));  /* swap once outside the loop; _NPY_SWAP8 presumably reverses byte order -- confirm */
+#  else  /* 16-byte element handling: never compiled in this size-8 instantiation */
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;  /* store the cached element */
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 8;
+#  else
+        dst += dst_stride;  /* strided output: advance by the caller-supplied byte stride */
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;  /* always 0; no failure path */
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 8 >= 2 */
+
+
+#line 105
+
+#if (8 >= 4) && \
+    (8 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 0 == 0 || 0 == 0
+/*
+ * Copies dimensions[0] 8-byte elements from args[0] to args[1] (strides[0] / strides[1] bytes apart), then swaps both 4-byte halves at dst.
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction; if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 0 && 2 == 0 && 8 <= NPY_SIZEOF_INTP  /* constant-false: this variant never gets NPY_GCC_UNROLL_LOOPS */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_pair_strided_to_strided_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* elements still to copy */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0];  /* byte step applied to src after each element */
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step applied to dst after each element */
+#endif
+
+#if 0  /* alignment asserts are compiled out for this unaligned variant */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _swap_pair_strided_to_strided_size8\n");*/
+    while (N > 0) {
+#if 0  /* the aligned path is compiled out; the #else branch below is the live code */
+
+        /* aligned copy and swap */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (the branch compiled for this variant) */
+        memmove(dst, src, 8);  /* byte-wise copy, so no alignment is required of src or dst */
+#  if 2 == 1
+        _NPY_SWAP_INPLACE8(dst);
+#  elif 2 == 2  /* live branch: each 4-byte half is handled separately */
+        _NPY_SWAP_INPLACE4(dst);  /* presumably reverses bytes 0..3 at dst in place -- confirm at the macro definition */
+        _NPY_SWAP_INPLACE4(dst + 4);  /* same for bytes 4..7 */
+#  endif
+
+#endif
+
+#if 0
+        dst += 8;
+#else
+        dst += dst_stride;  /* strided output: advance by the caller-supplied byte stride */
+#endif
+
+#if 0
+        src += 8;
+#else
+        src += src_stride;  /* strided input: advance by the caller-supplied byte stride */
+#endif
+
+        --N;
+    }
+    return 0;  /* always 0; the loop has no failure path */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the one source element at args[0] is loaded once and
+ * stored to dimensions[0] destination slots. Interestingly, unrolling here as is done above is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (0 == 0) && 0  /* constant-false: this unaligned srcstride0 variant is never compiled */
+static NPY_GCC_OPT_3 int
+_swap_pair_strided_to_strided_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* destination slots still to fill */
+    char *src = args[0], *dst = args[1];  /* src is never advanced; dst is the write cursor */
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step applied to dst after each store */
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 0)
+    npy_uint64 temp;  /* cached source element, written to every destination slot */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* empty request: return before src is dereferenced */
+        return 0;
+    }
+#if 0 && 8 != 16
+    /* sanity check (disabled by the constant 0 above) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));  /* NOTE(review): dead code; confirm _NPY_SWAP_INPLACE8 is usable as a value expression before enabling */
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;  /* store the cached element */
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 8;
+#  else
+        dst += dst_stride;  /* strided output: advance by the caller-supplied byte stride */
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 8 >= 4 */
+
+
+#line 105
+
+#if (8 >= 4) && \
+    (8 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 0 == 0 || 0 == 0
+/*
+ * Copies dimensions[0] aligned 8-byte elements from args[0] to args[1] (strides[0] / strides[1] bytes apart) through _NPY_SWAP_PAIR8.
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction; if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 1 && 2 == 0 && 8 <= NPY_SIZEOF_INTP  /* constant-false (2 == 0): this variant never gets NPY_GCC_UNROLL_LOOPS */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_pair_strided_to_strided_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* elements still to copy */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0];  /* byte step applied to src after each element */
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step applied to dst after each element */
+#endif
+
+#if 1
+    /* sanity check: unless N is 0, both pointers must pass the npy_uint64 alignment test */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_swap_pair_strided_to_strided_size8\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap (the branch compiled for this variant) */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_PAIR8(*((npy_uint64 *)src));  /* _NPY_SWAP_PAIR8 presumably byte-swaps each 4-byte half separately -- confirm at its definition */
+#  else  /* 16-byte element handling: never compiled in this size-8 instantiation */
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else  /* never compiled here: the aligned branch above is selected */
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 8);
+#  if 2 == 1
+        _NPY_SWAP_PAIR8(dst);
+#  elif 2 == 2
+        _NPY_SWAP_PAIR4(dst);
+        _NPY_SWAP_PAIR4(dst + 4);
+#  endif
+
+#endif
+
+#if 0
+        dst += 8;
+#else
+        dst += dst_stride;  /* strided output: advance by the caller-supplied byte stride */
+#endif
+
+#if 0
+        src += 8;
+#else
+        src += src_stride;  /* strided input: advance by the caller-supplied byte stride */
+#endif
+
+        --N;
+    }
+    return 0;  /* always 0; the loop has no failure path */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the one source element at args[0] is loaded and pair-swapped once,
+ * then stored to dimensions[0] destination slots. Interestingly, unrolling here as is done above is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_strided_to_strided_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* destination slots still to fill */
+    char *src = args[0], *dst = args[1];  /* src is never advanced; dst is the write cursor */
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step applied to dst after each store */
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 0)
+    npy_uint64 temp;  /* cached, already-transformed source element */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* empty request: return before src is dereferenced */
+        return 0;
+    }
+#if 1 && 8 != 16
+    /* sanity check (N is non-zero here, so the alignment tests are what is actually asserted) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 0
+    memset(dst, *src, N);
+#else  /* always taken here: the memset shortcut above is compiled out */
+
+#  if 8 != 16
+    temp = _NPY_SWAP_PAIR8(*((npy_uint64 *)src));  /* transform once outside the loop; macro presumably swaps each 4-byte half -- confirm */
+#  else  /* 16-byte element handling: never compiled in this size-8 instantiation */
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;  /* store the cached element */
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 8;
+#  else
+        dst += dst_stride;  /* strided output: advance by the caller-supplied byte stride */
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;  /* always 0; no failure path */
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 8 >= 4 */
+
+
+
+#line 97
+#line 105
+
+#if (8 >= 1) && \
+    (8 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 0 == 0 || 1 == 0
+/*
+ * Copies dimensions[0] 8-byte elements from args[0] (strides[0] bytes apart) into a packed run at args[1]; no alignment is asserted.
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction; if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 0 && 0 == 0 && 8 <= NPY_SIZEOF_INTP  /* constant-false: this variant never gets NPY_GCC_UNROLL_LOOPS */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_strided_to_contig_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* elements still to copy */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0];  /* byte step applied to src after each element */
+#endif
+#if !1  /* constant-false: dst advances by the element size, so strides[1] is not read */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0  /* alignment asserts are compiled out for this unaligned variant */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _strided_to_contig_size8\n");*/
+    while (N > 0) {
+#if 0  /* the aligned path is compiled out; the #else branch below is the live code */
+
+        /* aligned copy and swap */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_NOP8(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (the branch compiled for this variant) */
+        memmove(dst, src, 8);  /* byte-wise copy, so no alignment is required of src or dst */
+#  if 0 == 1  /* neither post-copy step below is compiled: the bytes are left as copied */
+        _NPY_NOP8(dst);
+#  elif 0 == 2
+        _NPY_NOP4(dst);
+        _NPY_NOP4(dst + 4);
+#  endif
+
+#endif
+
+#if 1
+        dst += 8;  /* contiguous output: advance by the 8-byte element size */
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 8;
+#else
+        src += src_stride;  /* strided input: advance by the caller-supplied byte stride */
+#endif
+
+        --N;
+    }
+    return 0;  /* always 0; the loop has no failure path */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the one source element at args[0] is loaded once and
+ * stored to dimensions[0] packed destination slots. Interestingly, unrolling here as is done above is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (0 == 0) && 0  /* constant-false: this unaligned srcstride0 variant is never compiled */
+static NPY_GCC_OPT_3 int
+_strided_to_contig_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* destination slots still to fill */
+    char *src = args[0], *dst = args[1];  /* src is never advanced; dst is the write cursor */
+#if !1  /* constant-false: dst advances by the element size, so strides[1] is not read */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 1)
+    npy_uint64 temp;  /* cached source element, written to every destination slot */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* empty request: return before src is dereferenced */
+        return 0;
+    }
+#if 0 && 8 != 16
+    /* sanity check (disabled by the constant 0 above) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_NOP8(*((npy_uint64 *)src));  /* read the source once; _NPY_NOP8 presumably a pass-through -- confirm */
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;  /* store the cached element */
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 8;  /* contiguous output: advance by the 8-byte element size */
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 8 >= 1 */
+
+
+#line 105
+
+#if (8 >= 1) && \
+    (8 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 0 == 0 || 1 == 0
+/*
+ * Copies dimensions[0] aligned 8-byte elements from args[0] (strides[0] bytes apart) into a packed run at args[1].
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction; if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 1 && 0 == 0 && 8 <= NPY_SIZEOF_INTP  /* request unrolling only if the 8-byte element is no wider than NPY_SIZEOF_INTP */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_strided_to_contig_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* elements still to copy */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0];  /* byte step applied to src after each element */
+#endif
+#if !1  /* constant-false: dst advances by the element size, so strides[1] is not read */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: unless N is 0, both pointers must pass the npy_uint64 alignment test */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_strided_to_contig_size8\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap (the branch compiled for this variant) */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_NOP8(*((npy_uint64 *)src));  /* one npy_uint64 load/store; _NPY_NOP8 presumably a pass-through -- confirm */
+#  else  /* 16-byte element handling: never compiled in this size-8 instantiation */
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else  /* never compiled here: the aligned branch above is selected */
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 8);
+#  if 0 == 1
+        _NPY_NOP8(dst);
+#  elif 0 == 2
+        _NPY_NOP4(dst);
+        _NPY_NOP4(dst + 4);
+#  endif
+
+#endif
+
+#if 1
+        dst += 8;  /* contiguous output: advance by the 8-byte element size */
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 8;
+#else
+        src += src_stride;  /* strided input: advance by the caller-supplied byte stride */
+#endif
+
+        --N;
+    }
+    return 0;  /* always 0; the loop has no failure path */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the one source element at args[0] is loaded once and
+ * stored to dimensions[0] packed destination slots. Interestingly, unrolling here as is done above is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_strided_to_contig_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* destination slots still to fill */
+    char *src = args[0], *dst = args[1];  /* src is never advanced; dst is the write cursor */
+#if !1  /* constant-false: dst advances by the element size, so strides[1] is not read */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 1)
+    npy_uint64 temp;  /* cached source element, written to every destination slot */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* empty request: return before src is dereferenced */
+        return 0;
+    }
+#if 1 && 8 != 16
+    /* sanity check (N is non-zero here, so the alignment tests are what is actually asserted) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 1
+    memset(dst, *src, N);
+#else  /* always taken here: 8 == 1 is false, so the memset shortcut is compiled out */
+
+#  if 8 != 16
+    temp = _NPY_NOP8(*((npy_uint64 *)src));  /* read the source once; _NPY_NOP8 presumably a pass-through -- confirm */
+#  else  /* 16-byte element handling: never compiled in this size-8 instantiation */
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;  /* store the cached element */
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 8;  /* contiguous output: advance by the 8-byte element size */
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;  /* always 0; no failure path */
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 8 >= 1 */
+
+
+#line 105
+
+#if (8 >= 2) && \
+    (8 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 0 == 0 || 1 == 0
+/*
+ * Copies dimensions[0] 8-byte elements from args[0] (strides[0] bytes apart) into a packed run at args[1], then swaps each in place at dst.
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction; if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 0 && 1 == 0 && 8 <= NPY_SIZEOF_INTP  /* constant-false: this variant never gets NPY_GCC_UNROLL_LOOPS */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_strided_to_contig_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* elements still to copy */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0];  /* byte step applied to src after each element */
+#endif
+#if !1  /* constant-false: dst advances by the element size, so strides[1] is not read */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0  /* alignment asserts are compiled out for this unaligned variant */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _swap_strided_to_contig_size8\n");*/
+    while (N > 0) {
+#if 0  /* the aligned path is compiled out; the #else branch below is the live code */
+
+        /* aligned copy and swap */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (the branch compiled for this variant) */
+        memmove(dst, src, 8);  /* byte-wise copy, so no alignment is required of src or dst */
+#  if 1 == 1
+        _NPY_SWAP_INPLACE8(dst);  /* presumably reverses the 8 bytes at dst in place -- confirm at the macro definition */
+#  elif 1 == 2
+        _NPY_SWAP_INPLACE4(dst);
+        _NPY_SWAP_INPLACE4(dst + 4);
+#  endif
+
+#endif
+
+#if 1
+        dst += 8;  /* contiguous output: advance by the 8-byte element size */
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 8;
+#else
+        src += src_stride;  /* strided input: advance by the caller-supplied byte stride */
+#endif
+
+        --N;
+    }
+    return 0;  /* always 0; the loop has no failure path */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the one source element at args[0] is loaded once and
+ * stored to dimensions[0] packed destination slots. Interestingly, unrolling here as is done above is only marginally
+ * profitable for small types and detrimental for >= 8byte moves on x86,
+ * but it profits from vectorization enabled with -O3.
+ */
+#if (0 == 0) && 0  /* constant-false: this unaligned srcstride0 variant is never compiled */
+static NPY_GCC_OPT_3 int
+_swap_strided_to_contig_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* destination slots still to fill */
+    char *src = args[0], *dst = args[1];  /* src is never advanced; dst is the write cursor */
+#if !1  /* constant-false: dst advances by the element size, so strides[1] is not read */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 1)
+    npy_uint64 temp;  /* cached source element, written to every destination slot */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* empty request: return before src is dereferenced */
+        return 0;
+    }
+#if 0 && 8 != 16
+    /* sanity check (disabled by the constant 0 above) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));  /* NOTE(review): dead code; confirm _NPY_SWAP_INPLACE8 is usable as a value expression before enabling */
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;  /* store the cached element */
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 8;  /* contiguous output: advance by the 8-byte element size */
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 8 >= 2 */
+
+
+#line 105
+
+#if (8 >= 2) && \
+    (8 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 0 == 0 || 1 == 0
+/*
+ * Copies dimensions[0] aligned 8-byte elements from args[0] (strides[0] bytes apart) into a packed run at args[1] through _NPY_SWAP8.
+ * Unrolling gains about 20-50% if the copy can be done in one mov instruction; if not, it can decrease performance.
+ * Tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 1 && 1 == 0 && 8 <= NPY_SIZEOF_INTP  /* constant-false (1 == 0): this variant never gets NPY_GCC_UNROLL_LOOPS */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_strided_to_contig_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* elements still to copy */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0];  /* byte step applied to src after each element */
+#endif
+#if !1  /* constant-false: dst advances by the element size, so strides[1] is not read */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: unless N is 0, both pointers must pass the npy_uint64 alignment test */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_swap_strided_to_contig_size8\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap (the branch compiled for this variant) */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));  /* _NPY_SWAP8 presumably reverses the byte order of the value -- confirm at its definition */
+#  else  /* 16-byte element handling: never compiled in this size-8 instantiation */
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else  /* never compiled here: the aligned branch above is selected */
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 8);
+#  if 1 == 1
+        _NPY_SWAP8(dst);
+#  elif 1 == 2
+        _NPY_SWAP4(dst);
+        _NPY_SWAP4(dst + 4);
+#  endif
+
+#endif
+
+#if 1
+        dst += 8;  /* contiguous output: advance by the 8-byte element size */
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 8;
+#else
+        src += src_stride;  /* strided input: advance by the caller-supplied byte stride */
+#endif
+
+        --N;
+    }
+    return 0;  /* always 0; the loop has no failure path */
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the source item is read and
+ * converted once, then stored N times. Interestingly, unrolling here (as done
+ * above) is only marginally profitable for small types and detrimental for
+ * >= 8 byte moves on x86, but it profits from vectorization enabled with -O3.
+ */
+#if (0 == 0) && 1 /* constant 1: this function is compiled */
+static NPY_GCC_OPT_3 int
+_aligned_swap_strided_to_contig_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0]; /* number of items left to store */
+    char *src = args[0], *dst = args[1];
+#if !1 /* compiled out: dst steps by a fixed 8 bytes */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 1)
+    npy_uint64 temp; /* converted source item, computed once before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) { /* nothing to store; src is never dereferenced */
+        return 0;
+    }
+#if 1 && 8 != 16
+    /* sanity check: both pointers must pass npy_is_aligned (N != 0 here) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16 /* true: the item is a single npy_uint64 */
+    temp = _NPY_SWAP8(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) { /* store the same value into every destination slot */
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1 /* contiguous dst: advance by the item size */
+        dst += 8;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 8 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 8 >= 2 */
+
+
+#line 105
+
+#if (8 >= 4) && \
+    (8 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 0 == 0 || 1 == 0
+/*
+ * Unaligned pair-swap copy loop: strided src -> contiguous dst, 8-byte items.
+ * Unrolling gains about 20-50% when the copy is one mov instruction and can
+ * hurt otherwise; it helped on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 0 && 2 == 0 && 8 <= NPY_SIZEOF_INTP /* constant 0: no unroll attribute */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_pair_strided_to_contig_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0]; /* number of items left to copy */
+    char *src = args[0], *dst = args[1];
+#if !0 /* src is strided: step comes from strides[0] */
+    npy_intp src_stride = strides[0];
+#endif
+#if !1 /* compiled out: dst steps by a fixed 8 bytes */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0 /* unaligned variant: alignment asserts compiled out */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _swap_pair_strided_to_contig_size8\n");*/
+    while (N > 0) {
+#if 0 /* aligned path, compiled out here */
+
+        /* aligned copy and swap */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else /* unaligned variant: memmove-based path */
+
+        /* unaligned copy and swap: copy 8 bytes, then swap them inside dst */
+        memmove(dst, src, 8);
+#  if 2 == 1
+        _NPY_SWAP_INPLACE8(dst);
+#  elif 2 == 2 /* true: swap macro applied to each 4-byte half of dst */
+        _NPY_SWAP_INPLACE4(dst);
+        _NPY_SWAP_INPLACE4(dst + 4);
+#  endif
+
+#endif
+
+#if 1 /* contiguous dst: advance by the item size */
+        dst += 8;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 8;
+#else /* strided src: advance by the caller-provided stride */
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0; /* the loop has no failure path */
+}
+#endif /* 2 || 0 == 0 || 1 == 0 */
+
+
+/*
+ * Specialized copy and swap for source stride 0 (compiled out for this
+ * variant: the guard below is constant 0). Interestingly, unrolling here (as
+ * done above) is only marginally profitable for small types and detrimental
+ * for >= 8 byte moves on x86, but it profits from vectorization with -O3.
+ */
+#if (0 == 0) && 0 /* constant 0: this function is never compiled */
+static NPY_GCC_OPT_3 int
+_swap_pair_strided_to_contig_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0]; /* number of items left to store */
+    char *src = args[0], *dst = args[1];
+#if !1 /* compiled out: dst steps by a fixed 8 bytes */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 1)
+    npy_uint64 temp; /* converted source item, computed once before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) { /* nothing to store; src is never dereferenced */
+        return 0;
+    }
+#if 0 && 8 != 16
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) { /* store the same value into every destination slot */
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 8;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 8 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 8 >= 4 */
+
+
+#line 105
+
+#if (8 >= 4) && \
+    (8 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 0 == 0 || 1 == 0
+/*
+ * Aligned pair-swap copy loop (_NPY_SWAP_PAIR8): strided src -> contiguous dst.
+ * Unrolling gains about 20-50% when the copy is one mov instruction and can
+ * hurt otherwise; it helped on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 1 && 2 == 0 && 8 <= NPY_SIZEOF_INTP /* constant 0: no unroll attribute */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_pair_strided_to_contig_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0]; /* number of items left to copy */
+    char *src = args[0], *dst = args[1];
+#if !0 /* src is strided: step comes from strides[0] */
+    npy_intp src_stride = strides[0];
+#endif
+#if !1 /* compiled out: dst steps by a fixed 8 bytes */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: both pointers must pass npy_is_aligned unless N == 0 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_swap_pair_strided_to_contig_size8\n");*/
+    while (N > 0) {
+#if 1 /* aligned variant: typed 8-byte load/store path */
+
+        /* aligned copy and swap: one npy_uint64 read, one npy_uint64 write */
+#  if 8 != 16 /* true: the item is a single npy_uint64 */
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_PAIR8(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else /* unaligned path, compiled out here */
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 8);
+#  if 2 == 1
+        _NPY_SWAP_PAIR8(dst);
+#  elif 2 == 2
+        _NPY_SWAP_PAIR4(dst);
+        _NPY_SWAP_PAIR4(dst + 4);
+#  endif
+
+#endif
+
+#if 1 /* contiguous dst: advance by the item size */
+        dst += 8;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 8;
+#else /* strided src: advance by the caller-provided stride */
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0; /* the loop has no failure path */
+}
+#endif /* 2 || 0 == 0 || 1 == 0 */
+
+
+/*
+ * Specialized copy and swap for source stride 0: the source item is read and
+ * converted once, then stored N times. Interestingly, unrolling here (as done
+ * above) is only marginally profitable for small types and detrimental for
+ * >= 8 byte moves on x86, but it profits from vectorization enabled with -O3.
+ */
+#if (0 == 0) && 1 /* constant 1: this function is compiled */
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_strided_to_contig_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0]; /* number of items left to store */
+    char *src = args[0], *dst = args[1];
+#if !1 /* compiled out: dst steps by a fixed 8 bytes */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 1)
+    npy_uint64 temp; /* converted source item, computed once before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) { /* nothing to store; src is never dereferenced */
+        return 0;
+    }
+#if 1 && 8 != 16
+    /* sanity check: both pointers must pass npy_is_aligned (N != 0 here) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16 /* true: the item is a single npy_uint64 */
+    temp = _NPY_SWAP_PAIR8(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) { /* store the same value into every destination slot */
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1 /* contiguous dst: advance by the item size */
+        dst += 8;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 8 == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 8 >= 4 */
+
+
+
+#line 97
+#line 105
+
+#if (8 >= 1) && \
+    (8 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 1 == 0 || 0 == 0
+/*
+ * Unaligned plain copy loop: contiguous src -> strided dst, 8 bytes per item.
+ * Unrolling gains about 20-50% when the copy is one mov instruction and can
+ * hurt otherwise; it helped on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 0 && 0 == 0 && 8 <= NPY_SIZEOF_INTP /* constant 0: no unroll attribute */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_contig_to_strided_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0]; /* number of items left to copy */
+    char *src = args[0], *dst = args[1];
+#if !1 /* compiled out: src steps by a fixed 8 bytes */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0 /* dst is strided: step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0 /* unaligned variant: alignment asserts compiled out */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _contig_to_strided_size8\n");*/
+    while (N > 0) {
+#if 0 /* aligned path, compiled out here */
+
+        /* aligned copy and swap */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_NOP8(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else /* unaligned variant: memmove-based path */
+
+        /* unaligned copy; both swap branches below are false, so no swap */
+        memmove(dst, src, 8);
+#  if 0 == 1
+        _NPY_NOP8(dst);
+#  elif 0 == 2
+        _NPY_NOP4(dst);
+        _NPY_NOP4(dst + 4);
+#  endif
+
+#endif
+
+#if 0
+        dst += 8;
+#else /* strided dst: advance by the caller-provided stride */
+        dst += dst_stride;
+#endif
+
+#if 1 /* contiguous src: advance by the item size */
+        src += 8;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0; /* the loop has no failure path */
+}
+#endif /* 0 || 1 == 0 || 0 == 0 */
+
+
+/*
+ * Specialized copy and swap for source stride 0 (compiled out for this
+ * variant: the guard below is constant 0). Interestingly, unrolling here (as
+ * done above) is only marginally profitable for small types and detrimental
+ * for >= 8 byte moves on x86, but it profits from vectorization with -O3.
+ */
+#if (1 == 0) && 0 /* constant 0: this function is never compiled */
+static NPY_GCC_OPT_3 int
+_contig_to_strided_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0]; /* number of items left to store */
+    char *src = args[0], *dst = args[1];
+#if !0 /* dst is strided: step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 0)
+    npy_uint64 temp; /* converted source item, computed once before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) { /* nothing to store; src is never dereferenced */
+        return 0;
+    }
+#if 0 && 8 != 16
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_NOP8(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) { /* store the same value into every destination slot */
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 8;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 8 == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 8 >= 1 */
+
+
+#line 105
+
+#if (8 >= 1) && \
+    (8 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 1 == 0 || 0 == 0
+/*
+ * Aligned copy loop (_NPY_NOP8): contiguous src -> strided dst, 8-byte items.
+ * Unrolling gains about 20-50% when the copy is one mov instruction and can
+ * hurt otherwise; it helped on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 1 && 0 == 0 && 8 <= NPY_SIZEOF_INTP /* unroll iff 8 <= NPY_SIZEOF_INTP */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_contig_to_strided_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0]; /* number of items left to copy */
+    char *src = args[0], *dst = args[1];
+#if !1 /* compiled out: src steps by a fixed 8 bytes */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0 /* dst is strided: step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: both pointers must pass npy_is_aligned unless N == 0 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_contig_to_strided_size8\n");*/
+    while (N > 0) {
+#if 1 /* aligned variant: typed 8-byte load/store path */
+
+        /* aligned copy: one npy_uint64 read, one npy_uint64 write */
+#  if 8 != 16 /* true: the item is a single npy_uint64 */
+        (*((npy_uint64 *)dst)) = _NPY_NOP8(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else /* unaligned path, compiled out here */
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 8);
+#  if 0 == 1
+        _NPY_NOP8(dst);
+#  elif 0 == 2
+        _NPY_NOP4(dst);
+        _NPY_NOP4(dst + 4);
+#  endif
+
+#endif
+
+#if 0
+        dst += 8;
+#else /* strided dst: advance by the caller-provided stride */
+        dst += dst_stride;
+#endif
+
+#if 1 /* contiguous src: advance by the item size */
+        src += 8;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0; /* the loop has no failure path */
+}
+#endif /* 0 || 1 == 0 || 0 == 0 */
+
+
+/*
+ * Specialized copy and swap for source stride 0 (compiled out for this
+ * variant: the guard below is constant 0). Interestingly, unrolling here (as
+ * done above) is only marginally profitable for small types and detrimental
+ * for >= 8 byte moves on x86, but it profits from vectorization with -O3.
+ */
+#if (1 == 0) && 1 /* constant 0: this function is never compiled */
+static NPY_GCC_OPT_3 int
+_aligned_contig_to_strided_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0]; /* number of items left to store */
+    char *src = args[0], *dst = args[1];
+#if !0 /* dst is strided: step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 0)
+    npy_uint64 temp; /* converted source item, computed once before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) { /* nothing to store; src is never dereferenced */
+        return 0;
+    }
+#if 1 && 8 != 16
+    /* sanity check: both pointers must pass npy_is_aligned (N != 0 here) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_NOP8(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) { /* store the same value into every destination slot */
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 8;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 8 == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 8 >= 1 */
+
+
+#line 105
+
+#if (8 >= 2) && \
+    (8 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 1 == 0 || 0 == 0
+/*
+ * Unaligned copy-and-swap loop: contiguous src -> strided dst, 8-byte items.
+ * Unrolling gains about 20-50% when the copy is one mov instruction and can
+ * hurt otherwise; it helped on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 0 && 1 == 0 && 8 <= NPY_SIZEOF_INTP /* constant 0: no unroll attribute */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_contig_to_strided_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0]; /* number of items left to copy */
+    char *src = args[0], *dst = args[1];
+#if !1 /* compiled out: src steps by a fixed 8 bytes */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0 /* dst is strided: step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0 /* unaligned variant: alignment asserts compiled out */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _swap_contig_to_strided_size8\n");*/
+    while (N > 0) {
+#if 0 /* aligned path, compiled out here */
+
+        /* aligned copy and swap */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else /* unaligned variant: memmove-based path */
+
+        /* unaligned copy and swap: copy 8 bytes, then swap them inside dst */
+        memmove(dst, src, 8);
+#  if 1 == 1 /* true: swap macro applied to the whole 8-byte copy */
+        _NPY_SWAP_INPLACE8(dst);
+#  elif 1 == 2
+        _NPY_SWAP_INPLACE4(dst);
+        _NPY_SWAP_INPLACE4(dst + 4);
+#  endif
+
+#endif
+
+#if 0
+        dst += 8;
+#else /* strided dst: advance by the caller-provided stride */
+        dst += dst_stride;
+#endif
+
+#if 1 /* contiguous src: advance by the item size */
+        src += 8;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0; /* the loop has no failure path */
+}
+#endif /* 1 || 1 == 0 || 0 == 0 */
+
+
+/*
+ * Specialized copy and swap for source stride 0 (compiled out for this
+ * variant: the guard below is constant 0). Interestingly, unrolling here (as
+ * done above) is only marginally profitable for small types and detrimental
+ * for >= 8 byte moves on x86, but it profits from vectorization with -O3.
+ */
+#if (1 == 0) && 0 /* constant 0: this function is never compiled */
+static NPY_GCC_OPT_3 int
+_swap_contig_to_strided_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0]; /* number of items left to store */
+    char *src = args[0], *dst = args[1];
+#if !0 /* dst is strided: step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 0)
+    npy_uint64 temp; /* converted source item, computed once before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) { /* nothing to store; src is never dereferenced */
+        return 0;
+    }
+#if 0 && 8 != 16
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) { /* store the same value into every destination slot */
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 8;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 8 == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 8 >= 2 */
+
+
+#line 105
+
+#if (8 >= 2) && \
+    (8 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 1 == 0 || 0 == 0
+/*
+ * Aligned copy-and-swap loop (_NPY_SWAP8): contiguous src -> strided dst.
+ * Unrolling gains about 20-50% when the copy is one mov instruction and can
+ * hurt otherwise; it helped on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 1 && 1 == 0 && 8 <= NPY_SIZEOF_INTP /* constant 0: no unroll attribute */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_contig_to_strided_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0]; /* number of items left to copy */
+    char *src = args[0], *dst = args[1];
+#if !1 /* compiled out: src steps by a fixed 8 bytes */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0 /* dst is strided: step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: both pointers must pass npy_is_aligned unless N == 0 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_swap_contig_to_strided_size8\n");*/
+    while (N > 0) {
+#if 1 /* aligned variant: typed 8-byte load/store path */
+
+        /* aligned copy and swap: one npy_uint64 read, one npy_uint64 write */
+#  if 8 != 16 /* true: the item is a single npy_uint64 */
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else /* unaligned path, compiled out here */
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 8);
+#  if 1 == 1
+        _NPY_SWAP8(dst);
+#  elif 1 == 2
+        _NPY_SWAP4(dst);
+        _NPY_SWAP4(dst + 4);
+#  endif
+
+#endif
+
+#if 0
+        dst += 8;
+#else /* strided dst: advance by the caller-provided stride */
+        dst += dst_stride;
+#endif
+
+#if 1 /* contiguous src: advance by the item size */
+        src += 8;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0; /* the loop has no failure path */
+}
+#endif /* 1 || 1 == 0 || 0 == 0 */
+
+
+/*
+ * Specialized copy and swap for source stride 0 (compiled out for this
+ * variant: the guard below is constant 0). Interestingly, unrolling here (as
+ * done above) is only marginally profitable for small types and detrimental
+ * for >= 8 byte moves on x86, but it profits from vectorization with -O3.
+ */
+#if (1 == 0) && 1 /* constant 0: this function is never compiled */
+static NPY_GCC_OPT_3 int
+_aligned_swap_contig_to_strided_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0]; /* number of items left to store */
+    char *src = args[0], *dst = args[1];
+#if !0 /* dst is strided: step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 0)
+    npy_uint64 temp; /* converted source item, computed once before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) { /* nothing to store; src is never dereferenced */
+        return 0;
+    }
+#if 1 && 8 != 16
+    /* sanity check: both pointers must pass npy_is_aligned (N != 0 here) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_SWAP8(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) { /* store the same value into every destination slot */
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 8;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 8 == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 8 >= 2 */
+
+
+#line 105
+
+#if (8 >= 4) && \
+    (8 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 1 == 0 || 0 == 0
+/*
+ * Unaligned pair-swap copy loop: contiguous src -> strided dst, 8-byte items.
+ * Unrolling gains about 20-50% when the copy is one mov instruction and can
+ * hurt otherwise; it helped on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 0 && 2 == 0 && 8 <= NPY_SIZEOF_INTP /* constant 0: no unroll attribute */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_pair_contig_to_strided_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0]; /* number of items left to copy */
+    char *src = args[0], *dst = args[1];
+#if !1 /* compiled out: src steps by a fixed 8 bytes */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0 /* dst is strided: step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0 /* unaligned variant: alignment asserts compiled out */
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _swap_pair_contig_to_strided_size8\n");*/
+    while (N > 0) {
+#if 0 /* aligned path, compiled out here */
+
+        /* aligned copy and swap */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else /* unaligned variant: memmove-based path */
+
+        /* unaligned copy and swap: copy 8 bytes, then swap them inside dst */
+        memmove(dst, src, 8);
+#  if 2 == 1
+        _NPY_SWAP_INPLACE8(dst);
+#  elif 2 == 2 /* true: swap macro applied to each 4-byte half of dst */
+        _NPY_SWAP_INPLACE4(dst);
+        _NPY_SWAP_INPLACE4(dst + 4);
+#  endif
+
+#endif
+
+#if 0
+        dst += 8;
+#else /* strided dst: advance by the caller-provided stride */
+        dst += dst_stride;
+#endif
+
+#if 1 /* contiguous src: advance by the item size */
+        src += 8;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0; /* the loop has no failure path */
+}
+#endif /* 2 || 1 == 0 || 0 == 0 */
+
+
+/*
+ * Specialized copy and swap for source stride 0 (compiled out for this
+ * variant: the guard below is constant 0). Interestingly, unrolling here (as
+ * done above) is only marginally profitable for small types and detrimental
+ * for >= 8 byte moves on x86, but it profits from vectorization with -O3.
+ */
+#if (1 == 0) && 0 /* constant 0: this function is never compiled */
+static NPY_GCC_OPT_3 int
+_swap_pair_contig_to_strided_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0]; /* number of items left to store */
+    char *src = args[0], *dst = args[1];
+#if !0 /* dst is strided: step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 0)
+    npy_uint64 temp; /* converted source item, computed once before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) { /* nothing to store; src is never dereferenced */
+        return 0;
+    }
+#if 0 && 8 != 16
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) { /* store the same value into every destination slot */
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 8;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 8 == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 8 >= 4 */
+
+
+#line 105
+
+#if (8 >= 4) && \
+    (8 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 1 == 0 || 0 == 0
+/*
+ * Aligned pair-swap copy loop (_NPY_SWAP_PAIR8): contiguous src -> strided dst.
+ * Unrolling gains about 20-50% when the copy is one mov instruction and can
+ * hurt otherwise; it helped on intel xeon 5x/7x, core2duo, amd phenom x4.
+ */
+static int
+#if 1 && 2 == 0 && 8 <= NPY_SIZEOF_INTP /* constant 0: no unroll attribute */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_pair_contig_to_strided_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0]; /* number of items left to copy */
+    char *src = args[0], *dst = args[1];
+#if !1 /* compiled out: src steps by a fixed 8 bytes */
+    npy_intp src_stride = strides[0];
+#endif
+#if !0 /* dst is strided: step comes from strides[1] */
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: both pointers must pass npy_is_aligned unless N == 0 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_swap_pair_contig_to_strided_size8\n");*/
+    while (N > 0) {
+#if 1 /* aligned variant: typed 8-byte load/store path */
+
+        /* aligned copy and swap: one npy_uint64 read, one npy_uint64 write */
+#  if 8 != 16 /* true: the item is a single npy_uint64 */
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_PAIR8(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else /* unaligned path, compiled out here */
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 8);
+#  if 2 == 1
+        _NPY_SWAP_PAIR8(dst);
+#  elif 2 == 2
+        _NPY_SWAP_PAIR4(dst);
+        _NPY_SWAP_PAIR4(dst + 4);
+#  endif
+
+#endif
+
+#if 0
+        dst += 8;
+#else /* strided dst: advance by the caller-provided stride */
+        dst += dst_stride;
+#endif
+
+#if 1 /* contiguous src: advance by the item size */
+        src += 8;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0; /* the loop has no failure path */
+}
+#endif /* 2 || 1 == 0 || 0 == 0 */
+
+
+/*
+ * specialized copy and swap for source stride 0;
+ * interestingly, unrolling here, like above, is only marginally profitable
+ * for small types and detrimental for >= 8-byte moves on x86,
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int  /* store one _NPY_SWAP_PAIR8-converted 8-byte source item into N strided dst slots; returns 0 */
+_aligned_swap_pair_contig_to_strided_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{  /* NOTE: the enclosing guard "(1 == 0) && 1" is constant-false, so this function is never compiled */
+    npy_intp N = dimensions[0];  /* number of slots left to fill */
+    char *src = args[0], *dst = args[1];  /* input item and output byte cursor */
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step between consecutive outputs */
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 0)
+    npy_uint64 temp;  /* holds the source value, converted once before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* nothing to write; src is not read */
+        return 0;
+    }
+#if 1 && 8 != 16
+    /* sanity check: both pointers must satisfy npy_uint64 alignment (N != 0 here) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_SWAP_PAIR8(*((npy_uint64 *)src));  /* single read of src */
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 8;
+#  else
+        dst += dst_stride;  /* strided output */
+#  endif
+        --N;
+    }
+#endif/* 8 == 1 && 0 -- else */
+    return 0;  /* always 0 */
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 8 >= 4 */
+
+
+
+#line 97
+#line 105
+
+#if (8 >= 1) && \
+    (8 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 1 == 0 || 1 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int  /* copy N 8-byte items, contiguous src -> contiguous dst, via memmove; returns 0 */
+#if 0 && 0 == 0 && 8 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS  /* never emitted: the guard above starts with 0 */
+#endif
+_contig_to_contig_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{  /* NOTE: the enclosing guard "0 || 1 == 0 || 1 == 0" is constant-false, so this function is never compiled */
+    npy_intp N = dimensions[0];  /* number of items left to copy */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0
+    /* sanity check (compiled out by the #if 0 above) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _contig_to_contig_size8\n");*/
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap (compiled out by the #if 0 above) */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_NOP8(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (selected branch): memmove 8 bytes; no swap since 0 matches neither 1 nor 2 */
+        memmove(dst, src, 8);
+#  if 0 == 1
+        _NPY_NOP8(dst);
+#  elif 0 == 2
+        _NPY_NOP4(dst);
+        _NPY_NOP4(dst + 4);
+#  endif
+
+#endif
+
+#if 1
+        dst += 8;  /* contiguous output: fixed 8-byte step */
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 8;  /* contiguous input: fixed 8-byte step */
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* always 0 */
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0;
+ * interestingly, unrolling here, like above, is only marginally profitable
+ * for small types and detrimental for >= 8-byte moves on x86,
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int  /* store one _NPY_NOP8-converted 8-byte source item into N contiguous dst slots; returns 0 */
+_contig_to_contig_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{  /* NOTE: the enclosing guard "(1 == 0) && 0" is constant-false, so this function is never compiled */
+    npy_intp N = dimensions[0];  /* number of slots left to fill */
+    char *src = args[0], *dst = args[1];  /* input item and output byte cursor */
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 1)
+    npy_uint64 temp;  /* holds the source value, converted once before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* nothing to write; src is not read */
+        return 0;
+    }
+#if 0 && 8 != 16
+    /* sanity check (compiled out: the guard above starts with 0) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_NOP8(*((npy_uint64 *)src));  /* single read of src */
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 8;  /* contiguous output: fixed 8-byte step */
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 8 == 1 && 1 -- else */
+    return 0;  /* always 0 */
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 8 >= 1 */
+
+
+#line 105
+
+#if (8 >= 1) && \
+    (8 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 1 == 0 || 1 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int  /* copy N 8-byte items, contiguous src -> contiguous dst, applying _NPY_NOP8; returns 0 */
+#if 1 && 0 == 0 && 8 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS  /* emitted only when 8 <= NPY_SIZEOF_INTP */
+#endif
+_aligned_contig_to_contig_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{  /* NOTE: the enclosing guard "0 || 1 == 0 || 1 == 0" is constant-false, so this function is never compiled */
+    npy_intp N = dimensions[0];  /* number of items left to copy */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: both pointers must satisfy npy_uint64 alignment unless N == 0 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_contig_to_contig_size8\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap (selected branch): one npy_uint64 load, _NPY_NOP8, store */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_NOP8(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (dead: this is the #else of the #if 1 above) */
+        memmove(dst, src, 8);
+#  if 0 == 1
+        _NPY_NOP8(dst);
+#  elif 0 == 2
+        _NPY_NOP4(dst);
+        _NPY_NOP4(dst + 4);
+#  endif
+
+#endif
+
+#if 1
+        dst += 8;  /* contiguous output: fixed 8-byte step */
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 8;  /* contiguous input: fixed 8-byte step */
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* always 0 */
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0;
+ * interestingly, unrolling here, like above, is only marginally profitable
+ * for small types and detrimental for >= 8-byte moves on x86,
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int  /* store one _NPY_NOP8-converted 8-byte source item into N contiguous dst slots; returns 0 */
+_aligned_contig_to_contig_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{  /* NOTE: the enclosing guard "(1 == 0) && 1" is constant-false, so this function is never compiled */
+    npy_intp N = dimensions[0];  /* number of slots left to fill */
+    char *src = args[0], *dst = args[1];  /* input item and output byte cursor */
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 1)
+    npy_uint64 temp;  /* holds the source value, converted once before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* nothing to write; src is not read */
+        return 0;
+    }
+#if 1 && 8 != 16
+    /* sanity check: both pointers must satisfy npy_uint64 alignment (N != 0 here) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_NOP8(*((npy_uint64 *)src));  /* single read of src */
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 8;  /* contiguous output: fixed 8-byte step */
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 8 == 1 && 1 -- else */
+    return 0;  /* always 0 */
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 8 >= 1 */
+
+
+#line 105
+
+#if (8 >= 2) && \
+    (8 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 1 == 0 || 1 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int  /* copy N 8-byte items, contiguous -> contiguous, via memmove then _NPY_SWAP_INPLACE8; returns 0 */
+#if 0 && 1 == 0 && 8 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS  /* never emitted: the guard above starts with 0 */
+#endif
+_swap_contig_to_contig_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items left to copy */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0
+    /* sanity check (compiled out by the #if 0 above) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _swap_contig_to_contig_size8\n");*/
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap (compiled out by the #if 0 above) */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (active branch): memmove 8 bytes, then _NPY_SWAP_INPLACE8 on dst */
+        memmove(dst, src, 8);
+#  if 1 == 1
+        _NPY_SWAP_INPLACE8(dst);
+#  elif 1 == 2
+        _NPY_SWAP_INPLACE4(dst);
+        _NPY_SWAP_INPLACE4(dst + 4);
+#  endif
+
+#endif
+
+#if 1
+        dst += 8;  /* contiguous output: fixed 8-byte step */
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 8;  /* contiguous input: fixed 8-byte step */
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* always 0 */
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0;
+ * interestingly, unrolling here, like above, is only marginally profitable
+ * for small types and detrimental for >= 8-byte moves on x86,
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int  /* store one _NPY_SWAP_INPLACE8-converted 8-byte source item into N contiguous dst slots; returns 0 */
+_swap_contig_to_contig_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{  /* NOTE: the enclosing guard "(1 == 0) && 0" is constant-false, so this function is never compiled */
+    npy_intp N = dimensions[0];  /* number of slots left to fill */
+    char *src = args[0], *dst = args[1];  /* input item and output byte cursor */
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 1)
+    npy_uint64 temp;  /* holds the source value, converted once before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* nothing to write; src is not read */
+        return 0;
+    }
+#if 0 && 8 != 16
+    /* sanity check (compiled out: the guard above starts with 0) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));  /* single read of src */
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 8;  /* contiguous output: fixed 8-byte step */
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 8 == 1 && 1 -- else */
+    return 0;  /* always 0 */
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 8 >= 2 */
+
+
+#line 105
+
+#if (8 >= 2) && \
+    (8 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 1 == 0 || 1 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int  /* copy N 8-byte items, contiguous src -> contiguous dst, applying _NPY_SWAP8; returns 0 */
+#if 1 && 1 == 0 && 8 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS  /* never emitted: the guard above contains 1 == 0 */
+#endif
+_aligned_swap_contig_to_contig_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items left to copy */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: both pointers must satisfy npy_uint64 alignment unless N == 0 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_swap_contig_to_contig_size8\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap (active branch): one npy_uint64 load, _NPY_SWAP8, store */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (dead: this is the #else of the #if 1 above) */
+        memmove(dst, src, 8);
+#  if 1 == 1
+        _NPY_SWAP8(dst);
+#  elif 1 == 2
+        _NPY_SWAP4(dst);
+        _NPY_SWAP4(dst + 4);
+#  endif
+
+#endif
+
+#if 1
+        dst += 8;  /* contiguous output: fixed 8-byte step */
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 8;  /* contiguous input: fixed 8-byte step */
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* always 0 */
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0;
+ * interestingly, unrolling here, like above, is only marginally profitable
+ * for small types and detrimental for >= 8-byte moves on x86,
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int  /* store one _NPY_SWAP8-converted 8-byte source item into N contiguous dst slots; returns 0 */
+_aligned_swap_contig_to_contig_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{  /* NOTE: the enclosing guard "(1 == 0) && 1" is constant-false, so this function is never compiled */
+    npy_intp N = dimensions[0];  /* number of slots left to fill */
+    char *src = args[0], *dst = args[1];  /* input item and output byte cursor */
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 1)
+    npy_uint64 temp;  /* holds the source value, converted once before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* nothing to write; src is not read */
+        return 0;
+    }
+#if 1 && 8 != 16
+    /* sanity check: both pointers must satisfy npy_uint64 alignment (N != 0 here) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_SWAP8(*((npy_uint64 *)src));  /* single read of src */
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 8;  /* contiguous output: fixed 8-byte step */
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 8 == 1 && 1 -- else */
+    return 0;  /* always 0 */
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 8 >= 2 */
+
+
+#line 105
+
+#if (8 >= 4) && \
+    (8 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 1 == 0 || 1 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int  /* copy N 8-byte items, contiguous -> contiguous, via memmove then _NPY_SWAP_INPLACE4 on each half; returns 0 */
+#if 0 && 2 == 0 && 8 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS  /* never emitted: the guard above starts with 0 */
+#endif
+_swap_pair_contig_to_contig_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items left to copy */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0
+    /* sanity check (compiled out by the #if 0 above) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _swap_pair_contig_to_contig_size8\n");*/
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap (compiled out by the #if 0 above) */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (active branch): memmove 8 bytes, then _NPY_SWAP_INPLACE4 at dst and dst + 4 */
+        memmove(dst, src, 8);
+#  if 2 == 1
+        _NPY_SWAP_INPLACE8(dst);
+#  elif 2 == 2
+        _NPY_SWAP_INPLACE4(dst);
+        _NPY_SWAP_INPLACE4(dst + 4);
+#  endif
+
+#endif
+
+#if 1
+        dst += 8;  /* contiguous output: fixed 8-byte step */
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 8;  /* contiguous input: fixed 8-byte step */
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* always 0 */
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0;
+ * interestingly, unrolling here, like above, is only marginally profitable
+ * for small types and detrimental for >= 8-byte moves on x86,
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int  /* store one _NPY_SWAP_INPLACE8-converted 8-byte source item into N contiguous dst slots; returns 0 */
+_swap_pair_contig_to_contig_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{  /* NOTE: the enclosing guard "(1 == 0) && 0" is constant-false, so this function is never compiled */
+    npy_intp N = dimensions[0];  /* number of slots left to fill */
+    char *src = args[0], *dst = args[1];  /* input item and output byte cursor */
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 1)
+    npy_uint64 temp;  /* holds the source value, converted once before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* nothing to write; src is not read */
+        return 0;
+    }
+#if 0 && 8 != 16
+    /* sanity check (compiled out: the guard above starts with 0) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_SWAP_INPLACE8(*((npy_uint64 *)src));  /* single read of src */
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 8;  /* contiguous output: fixed 8-byte step */
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 8 == 1 && 1 -- else */
+    return 0;  /* always 0 */
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 8 >= 4 */
+
+
+#line 105
+
+#if (8 >= 4) && \
+    (8 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 1 == 0 || 1 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int  /* copy N 8-byte items, contiguous src -> contiguous dst, applying _NPY_SWAP_PAIR8; returns 0 */
+#if 1 && 2 == 0 && 8 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS  /* never emitted: the guard above contains 2 == 0 */
+#endif
+_aligned_swap_pair_contig_to_contig_size8(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items left to copy */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: both pointers must satisfy npy_uint64 alignment unless N == 0 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_swap_pair_contig_to_contig_size8\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap (active branch): one npy_uint64 load, _NPY_SWAP_PAIR8, store */
+#  if 8 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_PAIR8(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (dead: this is the #else of the #if 1 above) */
+        memmove(dst, src, 8);
+#  if 2 == 1
+        _NPY_SWAP_PAIR8(dst);
+#  elif 2 == 2
+        _NPY_SWAP_PAIR4(dst);
+        _NPY_SWAP_PAIR4(dst + 4);
+#  endif
+
+#endif
+
+#if 1
+        dst += 8;  /* contiguous output: fixed 8-byte step */
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 8;  /* contiguous input: fixed 8-byte step */
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;  /* always 0 */
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0;
+ * interestingly, unrolling here, like above, is only marginally profitable
+ * for small types and detrimental for >= 8-byte moves on x86,
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int  /* store one _NPY_SWAP_PAIR8-converted 8-byte source item into N contiguous dst slots; returns 0 */
+_aligned_swap_pair_contig_to_contig_size8_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{  /* NOTE: the enclosing guard "(1 == 0) && 1" is constant-false, so this function is never compiled */
+    npy_intp N = dimensions[0];  /* number of slots left to fill */
+    char *src = args[0], *dst = args[1];  /* input item and output byte cursor */
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 8 != 16
+#  if !(8 == 1 && 1)
+    npy_uint64 temp;  /* holds the source value, converted once before the loop */
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {  /* nothing to write; src is not read */
+        return 0;
+    }
+#if 1 && 8 != 16
+    /* sanity check: both pointers must satisfy npy_uint64 alignment (N != 0 here) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 8 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 8 != 16
+    temp = _NPY_SWAP_PAIR8(*((npy_uint64 *)src));  /* single read of src */
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 8 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 8;  /* contiguous output: fixed 8-byte step */
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* 8 == 1 && 1 -- else */
+    return 0;  /* always 0 */
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 8 >= 4 */
+
+
+
+
+#line 91
+#line 97
+#line 105
+
+#if (16 >= 1) && \
+    (16 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 0 == 0 || 0 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int  /* copy N 16-byte items, strided src -> strided dst, via memmove; returns 0 */
+#if 0 && 0 == 0 && 16 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS  /* never emitted: the guard above starts with 0 */
+#endif
+_strided_to_strided_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items left to copy */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !0
+    npy_intp src_stride = strides[0];  /* byte step between consecutive inputs */
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step between consecutive outputs */
+#endif
+
+#if 0
+    /* sanity check (compiled out by the #if 0 above) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _strided_to_strided_size16\n");*/
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap (compiled out by the #if 0 above) */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_NOP16(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (active branch): memmove 16 bytes; no swap since 0 matches neither 1 nor 2 */
+        memmove(dst, src, 16);
+#  if 0 == 1
+        _NPY_NOP16(dst);
+#  elif 0 == 2
+        _NPY_NOP8(dst);
+        _NPY_NOP8(dst + 8);
+#  endif
+
+#endif
+
+#if 0
+        dst += 16;
+#else
+        dst += dst_stride;  /* strided output */
+#endif
+
+#if 0
+        src += 16;
+#else
+        src += src_stride;  /* strided input */
+#endif
+
+        --N;
+    }
+    return 0;  /* always 0 */
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0;
+ * interestingly, unrolling here, like above, is only marginally profitable
+ * for small types and detrimental for >= 8-byte moves on x86,
+ * but it profits from vectorization enabled with -O3
+ */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int  /* store one 16-byte source item (two npy_uint64 words) into N strided dst slots; returns 0 */
+_strided_to_strided_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{  /* NOTE: the enclosing guard "(0 == 0) && 0" is constant-false, so this function is never compiled */
+    npy_intp N = dimensions[0];  /* number of slots left to fill */
+    char *src = args[0], *dst = args[1];  /* input item and output byte cursor */
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step between consecutive outputs */
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 0)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;  /* the two 8-byte words of the source item, read once before the loop */
+#endif
+    if (N == 0) {  /* nothing to write; src is not read */
+        return 0;
+    }
+#if 0 && 16 != 16
+    /* sanity check (compiled out: the guard above starts with 0) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_NOP16(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));  /* selected branch: plain word copies, no swap */
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 16;
+#  else
+        dst += dst_stride;  /* strided output */
+#  endif
+        --N;
+    }
+#endif/* 16 == 1 && 0 -- else */
+    return 0;  /* always 0 */
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 16 >= 1 */
+
+
+#line 105
+
+#if (16 >= 1) && \
+    (16 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 0 == 0 || 0 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int  /* copy N 16-byte items, strided src -> strided dst, as two npy_uint64 words each; returns 0 */
+#if 1 && 0 == 0 && 16 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS  /* emitted only when 16 <= NPY_SIZEOF_INTP */
+#endif
+_aligned_strided_to_strided_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of items left to copy */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !0
+    npy_intp src_stride = strides[0];  /* byte step between consecutive inputs */
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step between consecutive outputs */
+#endif
+
+#if 1
+    /* sanity check: both pointers must satisfy npy_uint64 alignment unless N == 0 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_strided_to_strided_size16\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap (active branch): 16 != 16 is false, so two plain npy_uint64 copies */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_NOP16(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (dead: this is the #else of the #if 1 above) */
+        memmove(dst, src, 16);
+#  if 0 == 1
+        _NPY_NOP16(dst);
+#  elif 0 == 2
+        _NPY_NOP8(dst);
+        _NPY_NOP8(dst + 8);
+#  endif
+
+#endif
+
+#if 0
+        dst += 16;
+#else
+        dst += dst_stride;  /* strided output */
+#endif
+
+#if 0
+        src += 16;
+#else
+        src += src_stride;  /* strided input */
+#endif
+
+        --N;
+    }
+    return 0;  /* always 0 */
+}
+#endif
+
+
+/*
+ * _aligned_strided_to_strided_size16_srcstride0: specialized copy for a
+ * source stride of 0.  The single 16-byte source element at args[0] is
+ * loaded once into two npy_uint64 temporaries (no byte reordering in this
+ * expansion) and then stored to N = dimensions[0] destination elements
+ * starting at args[1], advancing the destination by strides[1] bytes each
+ * time.  strides[0] and `context` are not read.  Returns 0 (immediately
+ * when N == 0).
+ *
+ * Interestingly, unrolling here (as done above) is only marginally
+ * profitable for small types and detrimental for >= 8 byte moves on x86,
+ * but the loop profits from the vectorization enabled with -O3.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_strided_to_strided_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 0)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 1 && 16 != 16
+    /* sanity check (compiled out here: "16 != 16" is false) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_NOP16(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 16 >= 1 */
+
+
+#line 105
+
+#if (16 >= 2) && \
+    (16 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 0 == 0 || 0 == 0
+/*
+ * _swap_strided_to_strided_size16: unaligned copy-and-swap of
+ * N = dimensions[0] elements of 16 bytes from args[0] to args[1].  After
+ * each element the source advances by strides[0] bytes and the destination
+ * by strides[1] bytes.  Each element is memmove'd into place and then
+ * passed to _NPY_SWAP_INPLACE16(dst) (macro defined elsewhere; presumably
+ * reverses the 16 bytes in place -- confirm).  `context` is not referenced;
+ * always returns 0.
+ *
+ * Unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction; if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core2Duo, AMD Phenom X4.
+ * NPY_GCC_UNROLL_LOOPS is never attached in this expansion: the guard
+ * below is constant false.
+ */
+static int
+#if 0 && 1 == 0 && 16 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_strided_to_strided_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0];
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0
+    /* sanity check (compiled out by the "#if 0" in this unaligned expansion) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _swap_strided_to_strided_size16\n");*/
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap (not compiled: "#if 0" selects the #else) */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_INPLACE16(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap: byte copy first, then the "1 == 1"
+         * branch reorders the 16 destination bytes in place */
+        memmove(dst, src, 16);
+#  if 1 == 1
+        _NPY_SWAP_INPLACE16(dst);
+#  elif 1 == 2
+        _NPY_SWAP_INPLACE8(dst);
+        _NPY_SWAP_INPLACE8(dst + 8);
+#  endif
+
+#endif
+
+#if 0
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * _swap_strided_to_strided_size16_srcstride0: specialized copy and swap
+ * for a source stride of 0.  NOTE: the guard below, "(0 == 0) && 0", is
+ * constant false, so this unaligned expansion is never compiled.
+ * As written, it would load the single 16-byte source element once (word 1
+ * through _NPY_SWAP8 into temp0, word 0 through _NPY_SWAP8 into temp1) and
+ * store that pair to N = dimensions[0] destinations spaced strides[1]
+ * bytes apart, returning 0.
+ *
+ * Interestingly, unrolling here (as done above) is only marginally
+ * profitable for small types and detrimental for >= 8 byte moves on x86,
+ * but the loop profits from the vectorization enabled with -O3.
+ */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_strided_to_strided_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 0)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 16 != 16
+    /* sanity check (compiled out here: the guard above is constant false) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_SWAP_INPLACE16(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 16 >= 2 */
+
+
+#line 105
+
+#if (16 >= 2) && \
+    (16 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 0 == 0 || 0 == 0
+/*
+ * _aligned_swap_strided_to_strided_size16: aligned copy-and-swap of
+ * N = dimensions[0] elements of 16 bytes from args[0] to args[1].  After
+ * each element the source advances by strides[0] bytes and the destination
+ * by strides[1] bytes.  In this expansion destination word 0 receives
+ * _NPY_SWAP8 of source word 1 and destination word 1 receives _NPY_SWAP8 of
+ * source word 0 (npy_uint64 words).  _NPY_SWAP8 is defined elsewhere;
+ * presumably it byte-reverses one 8-byte word, which would make this a full
+ * 16-byte reversal -- confirm.  `context` is not referenced; returns 0.
+ *
+ * Unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction; if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core2Duo, AMD Phenom X4.
+ * NPY_GCC_UNROLL_LOOPS is never attached in this expansion: "1 == 0" makes
+ * the guard below constant false.
+ */
+static int
+#if 1 && 1 == 0 && 16 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_strided_to_strided_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0];
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: the loop below dereferences both pointers as npy_uint64 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_swap_strided_to_strided_size16\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap: "1 == 1" selects the crossed-word variant */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP16(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (not compiled: the "#if 1" above is taken) */
+        memmove(dst, src, 16);
+#  if 1 == 1
+        _NPY_SWAP16(dst);
+#  elif 1 == 2
+        _NPY_SWAP8(dst);
+        _NPY_SWAP8(dst + 8);
+#  endif
+
+#endif
+
+#if 0
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * _aligned_swap_strided_to_strided_size16_srcstride0: specialized copy and
+ * swap for a source stride of 0.  The single 16-byte source element at
+ * args[0] is loaded once: temp0 = _NPY_SWAP8(source word 1) and
+ * temp1 = _NPY_SWAP8(source word 0) (npy_uint64 words; _NPY_SWAP8 is
+ * defined elsewhere, presumably an 8-byte byte reversal -- confirm).  That
+ * pair is then stored to N = dimensions[0] destination elements starting
+ * at args[1], advancing by strides[1] bytes each time.  strides[0] and
+ * `context` are not read.  Returns 0 (immediately when N == 0).
+ *
+ * Interestingly, unrolling here (as done above) is only marginally
+ * profitable for small types and detrimental for >= 8 byte moves on x86,
+ * but the loop profits from the vectorization enabled with -O3.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_strided_to_strided_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 0)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 1 && 16 != 16
+    /* sanity check (compiled out here: "16 != 16" is false) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_SWAP16(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 16 >= 2 */
+
+
+#line 105
+
+#if (16 >= 4) && \
+    (16 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 0 == 0 || 0 == 0
+/*
+ * _swap_pair_strided_to_strided_size16: unaligned copy with pair swap of
+ * N = dimensions[0] elements of 16 bytes from args[0] to args[1].  After
+ * each element the source advances by strides[0] bytes and the destination
+ * by strides[1] bytes.  Each element is memmove'd into place and then
+ * _NPY_SWAP_INPLACE8 is applied separately to dst and to dst + 8, i.e. to
+ * each 8-byte half (macro defined elsewhere; presumably an in-place byte
+ * reversal of 8 bytes -- confirm).  `context` is not referenced; returns 0.
+ *
+ * Unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction; if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core2Duo, AMD Phenom X4.
+ * NPY_GCC_UNROLL_LOOPS is never attached in this expansion: the guard
+ * below is constant false.
+ */
+static int
+#if 0 && 2 == 0 && 16 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_pair_strided_to_strided_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0];
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0
+    /* sanity check (compiled out by the "#if 0" in this unaligned expansion) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _swap_pair_strided_to_strided_size16\n");*/
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap (not compiled: "#if 0" selects the #else) */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_INPLACE16(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap: byte copy first, then the "2 == 2"
+         * branch processes each 8-byte half of the destination in place */
+        memmove(dst, src, 16);
+#  if 2 == 1
+        _NPY_SWAP_INPLACE16(dst);
+#  elif 2 == 2
+        _NPY_SWAP_INPLACE8(dst);
+        _NPY_SWAP_INPLACE8(dst + 8);
+#  endif
+
+#endif
+
+#if 0
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * _swap_pair_strided_to_strided_size16_srcstride0: specialized copy and
+ * pair swap for a source stride of 0.  NOTE: the guard below,
+ * "(0 == 0) && 0", is constant false, so this unaligned expansion is never
+ * compiled.  As written, it would load the single 16-byte source element
+ * once (each npy_uint64 word through _NPY_SWAP8, keeping its word index)
+ * and store that pair to N = dimensions[0] destinations spaced strides[1]
+ * bytes apart, returning 0.
+ *
+ * Interestingly, unrolling here (as done above) is only marginally
+ * profitable for small types and detrimental for >= 8 byte moves on x86,
+ * but the loop profits from the vectorization enabled with -O3.
+ */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_pair_strided_to_strided_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 0)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 16 != 16
+    /* sanity check (compiled out here: the guard above is constant false) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_SWAP_INPLACE16(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 16 >= 4 */
+
+
+#line 105
+
+#if (16 >= 4) && \
+    (16 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 0 == 0 || 0 == 0
+/*
+ * _aligned_swap_pair_strided_to_strided_size16: aligned copy with pair
+ * swap of N = dimensions[0] elements of 16 bytes from args[0] to args[1].
+ * After each element the source advances by strides[0] bytes and the
+ * destination by strides[1] bytes.  In this expansion each of the two
+ * npy_uint64 words is passed through _NPY_SWAP8 and stored at the same word
+ * index, so the two 8-byte halves are transformed independently.
+ * _NPY_SWAP8 is defined elsewhere; presumably it byte-reverses one 8-byte
+ * word -- confirm.  `context` is not referenced; always returns 0.
+ *
+ * Unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction; if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core2Duo, AMD Phenom X4.
+ * NPY_GCC_UNROLL_LOOPS is never attached in this expansion: "2 == 0" makes
+ * the guard below constant false.
+ */
+static int
+#if 1 && 2 == 0 && 16 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_pair_strided_to_strided_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0];
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: the loop below dereferences both pointers as npy_uint64 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_swap_pair_strided_to_strided_size16\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap: "2 == 2" selects the per-word variant */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_PAIR16(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (not compiled: the "#if 1" above is taken) */
+        memmove(dst, src, 16);
+#  if 2 == 1
+        _NPY_SWAP_PAIR16(dst);
+#  elif 2 == 2
+        _NPY_SWAP_PAIR8(dst);
+        _NPY_SWAP_PAIR8(dst + 8);
+#  endif
+
+#endif
+
+#if 0
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * _aligned_swap_pair_strided_to_strided_size16_srcstride0: specialized
+ * copy and pair swap for a source stride of 0.  The single 16-byte source
+ * element at args[0] is loaded once, each npy_uint64 word going through
+ * _NPY_SWAP8 and keeping its word index (_NPY_SWAP8 is defined elsewhere,
+ * presumably an 8-byte byte reversal -- confirm).  That pair is then
+ * stored to N = dimensions[0] destination elements starting at args[1],
+ * advancing by strides[1] bytes each time.  strides[0] and `context` are
+ * not read.  Returns 0 (immediately when N == 0).
+ *
+ * Interestingly, unrolling here (as done above) is only marginally
+ * profitable for small types and detrimental for >= 8 byte moves on x86,
+ * but the loop profits from the vectorization enabled with -O3.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_strided_to_strided_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 0)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 1 && 16 != 16
+    /* sanity check (compiled out here: "16 != 16" is false) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_SWAP_PAIR16(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 16 >= 4 */
+
+
+
+#line 97
+#line 105
+
+#if (16 >= 1) && \
+    (16 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 0 == 0 || 1 == 0
+/*
+ * _strided_to_contig_size16: unaligned copy of N = dimensions[0] elements
+ * of 16 bytes from args[0] to args[1].  After each element the source
+ * advances by strides[0] bytes while the destination advances by a fixed
+ * 16 bytes (strides[1] is not read), so output elements are packed back to
+ * back.  Each element is moved with memmove and no byte reordering follows
+ * in this expansion.  `context` is not referenced; always returns 0.
+ *
+ * Unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction; if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core2Duo, AMD Phenom X4.
+ * NPY_GCC_UNROLL_LOOPS is never attached in this expansion: the guard
+ * below is constant false.
+ */
+static int
+#if 0 && 0 == 0 && 16 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_strided_to_contig_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0
+    /* sanity check (compiled out by the "#if 0" in this unaligned expansion) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _strided_to_contig_size16\n");*/
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap (not compiled: "#if 0" selects the #else) */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_NOP16(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap: only the memmove is compiled, because
+         * neither "0 == 1" nor "0 == 2" below holds */
+        memmove(dst, src, 16);
+#  if 0 == 1
+        _NPY_NOP16(dst);
+#  elif 0 == 2
+        _NPY_NOP8(dst);
+        _NPY_NOP8(dst + 8);
+#  endif
+
+#endif
+
+#if 1
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * _strided_to_contig_size16_srcstride0: specialized copy for a source
+ * stride of 0.  NOTE: the guard below, "(0 == 0) && 0", is constant false,
+ * so this unaligned expansion is never compiled.  As written, it would
+ * load the single 16-byte source element once as two npy_uint64 words (no
+ * byte reordering) and store that pair to N = dimensions[0] destinations
+ * spaced a fixed 16 bytes apart, returning 0.
+ *
+ * Interestingly, unrolling here (as done above) is only marginally
+ * profitable for small types and detrimental for >= 8 byte moves on x86,
+ * but the loop profits from the vectorization enabled with -O3.
+ */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int
+_strided_to_contig_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 1)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 16 != 16
+    /* sanity check (compiled out here: the guard above is constant false) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_NOP16(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 16 >= 1 */
+
+
+#line 105
+
+#if (16 >= 1) && \
+    (16 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 0 == 0 || 1 == 0
+/*
+ * _aligned_strided_to_contig_size16: aligned copy of N = dimensions[0]
+ * elements of 16 bytes from args[0] to args[1].  After each element the
+ * source advances by strides[0] bytes while the destination advances by a
+ * fixed 16 bytes (strides[1] is not read), so output elements are packed
+ * back to back.  In this expansion every element is moved as two
+ * npy_uint64 words without any byte reordering.  `context` is not
+ * referenced; always returns 0.
+ *
+ * Unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction; if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core2Duo, AMD Phenom X4.
+ * NPY_GCC_UNROLL_LOOPS is only attached below when 16 <= NPY_SIZEOF_INTP.
+ */
+static int
+#if 1 && 0 == 0 && 16 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_strided_to_contig_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: the loop below dereferences both pointers as npy_uint64 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_strided_to_contig_size16\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap: "0 == 0" selects the plain two-word copy */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_NOP16(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (not compiled: the "#if 1" above is taken) */
+        memmove(dst, src, 16);
+#  if 0 == 1
+        _NPY_NOP16(dst);
+#  elif 0 == 2
+        _NPY_NOP8(dst);
+        _NPY_NOP8(dst + 8);
+#  endif
+
+#endif
+
+#if 1
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * _aligned_strided_to_contig_size16_srcstride0: specialized copy for a
+ * source stride of 0.  The single 16-byte source element at args[0] is
+ * loaded once into two npy_uint64 temporaries (no byte reordering in this
+ * expansion) and then stored to N = dimensions[0] destination elements
+ * starting at args[1], advancing the destination by a fixed 16 bytes each
+ * time.  `strides` and `context` are not read.  Returns 0 (immediately
+ * when N == 0).
+ *
+ * Interestingly, unrolling here (as done above) is only marginally
+ * profitable for small types and detrimental for >= 8 byte moves on x86,
+ * but the loop profits from the vectorization enabled with -O3.
+ */
+#if (0 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_strided_to_contig_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 1)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 1 && 16 != 16
+    /* sanity check (compiled out here: "16 != 16" is false) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_NOP16(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 16 >= 1 */
+
+
+#line 105
+
+#if (16 >= 2) && \
+    (16 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 0 == 0 || 1 == 0
+/*
+ * _swap_strided_to_contig_size16: unaligned copy-and-swap of
+ * N = dimensions[0] elements of 16 bytes from args[0] to args[1].  After
+ * each element the source advances by strides[0] bytes while the
+ * destination advances by a fixed 16 bytes (strides[1] is not read).
+ * Each element is memmove'd into place and then passed to
+ * _NPY_SWAP_INPLACE16(dst) (macro defined elsewhere; presumably reverses
+ * the 16 bytes in place -- confirm).  `context` is not referenced; always
+ * returns 0.
+ *
+ * Unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction; if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core2Duo, AMD Phenom X4.
+ * NPY_GCC_UNROLL_LOOPS is never attached in this expansion: the guard
+ * below is constant false.
+ */
+static int
+#if 0 && 1 == 0 && 16 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_strided_to_contig_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0
+    /* sanity check (compiled out by the "#if 0" in this unaligned expansion) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _swap_strided_to_contig_size16\n");*/
+    while (N > 0) {
+#if 0
+
+        /* aligned copy and swap (not compiled: "#if 0" selects the #else) */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_INPLACE16(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap: byte copy first, then the "1 == 1"
+         * branch reorders the 16 destination bytes in place */
+        memmove(dst, src, 16);
+#  if 1 == 1
+        _NPY_SWAP_INPLACE16(dst);
+#  elif 1 == 2
+        _NPY_SWAP_INPLACE8(dst);
+        _NPY_SWAP_INPLACE8(dst + 8);
+#  endif
+
+#endif
+
+#if 1
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * _swap_strided_to_contig_size16_srcstride0: specialized copy and swap for
+ * a source stride of 0.  NOTE: the guard below, "(0 == 0) && 0", is
+ * constant false, so this unaligned expansion is never compiled.
+ * As written, it would load the single 16-byte source element once (word 1
+ * through _NPY_SWAP8 into temp0, word 0 through _NPY_SWAP8 into temp1) and
+ * store that pair to N = dimensions[0] destinations spaced a fixed 16
+ * bytes apart, returning 0.
+ *
+ * Interestingly, unrolling here (as done above) is only marginally
+ * profitable for small types and detrimental for >= 8 byte moves on x86,
+ * but the loop profits from the vectorization enabled with -O3.
+ */
+#if (0 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_strided_to_contig_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 1)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 16 != 16
+    /* sanity check (compiled out here: the guard above is constant false) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_SWAP_INPLACE16(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 16 >= 2 */
+
+
+#line 105
+
+#if (16 >= 2) && \
+    (16 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 0 == 0 || 1 == 0
+/*
+ * _aligned_swap_strided_to_contig_size16: aligned copy-and-swap of
+ * N = dimensions[0] elements of 16 bytes from args[0] to args[1].  After
+ * each element the source advances by strides[0] bytes while the
+ * destination advances by a fixed 16 bytes (strides[1] is not read).
+ * In this expansion destination word 0 receives _NPY_SWAP8 of source word 1
+ * and destination word 1 receives _NPY_SWAP8 of source word 0 (npy_uint64
+ * words).  _NPY_SWAP8 is defined elsewhere; presumably it byte-reverses one
+ * 8-byte word, which would make this a full 16-byte reversal -- confirm.
+ * `context` is not referenced; always returns 0.
+ *
+ * Unrolling gains about 20-50% if the copy can be done in one mov
+ * instruction; if not, it can decrease performance.
+ * Tested to improve performance on Intel Xeon 5x/7x, Core2Duo, AMD Phenom X4.
+ * NPY_GCC_UNROLL_LOOPS is never attached in this expansion: "1 == 0" makes
+ * the guard below constant false.
+ */
+static int
+#if 1 && 1 == 0 && 16 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_strided_to_contig_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: the loop below dereferences both pointers as npy_uint64 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_swap_strided_to_contig_size16\n");*/
+    while (N > 0) {
+#if 1
+
+        /* aligned copy and swap: "1 == 1" selects the crossed-word variant */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP16(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap (not compiled: the "#if 1" above is taken) */
+        memmove(dst, src, 16);
+#  if 1 == 1
+        _NPY_SWAP16(dst);
+#  elif 1 == 2
+        _NPY_SWAP8(dst);
+        _NPY_SWAP8(dst + 8);
+#  endif
+
+#endif
+
+#if 1
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the single 16-byte source
+ * element is loaded once (halves exchanged, each through _NPY_SWAP8) and then
+ * stored N times to the contiguous dst.  Interestingly unrolling here (like
+ * above) is only marginally profitable for small types and detrimental for
+ * >= 8byte moves on x86, but it profits from vectorization enabled with -O3 */
+#if (0 == 0) && 1  /* constant true: this variant is compiled */
+static NPY_GCC_OPT_3 int
+_aligned_swap_strided_to_contig_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of copies to write */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 1)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;  /* the two 8-byte words written to every dst element */
+#endif
+    if (N == 0) {  /* nothing to write; src is never dereferenced */
+        return 0;
+    }
+#if 1 && 16 != 16
+    /* sanity check (not compiled here: the guard requires 16 != 16) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_SWAP16(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1  /* active: halves exchanged, each through _NPY_SWAP8 */
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 16 >= 2 */
+
+
+#line 105
+
+#if (16 >= 4) && \
+    (16 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 0 == 0 || 1 == 0
+/* Copy N 16-byte elements from a strided src (args[0]) to a contiguous dst
+ * (args[1]) with no alignment check: memmove each element, then apply
+ * _NPY_SWAP_INPLACE8 to each 8-byte half of dst.  Always returns 0.  Unrolling gains
+ * about 20-50% if the copy can be done in one mov instruction, if not it can decrease
+ * performance; tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4 */
+static int
+#if 0 && 2 == 0 && 16 <= NPY_SIZEOF_INTP  /* constant false: attribute not applied */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_pair_strided_to_contig_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of elements to copy */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0];  /* byte step between source elements */
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 0
+    /* sanity check (compiled out in this unaligned variant) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _swap_pair_strided_to_contig_size16\n");*/
+    while (N > 0) {
+#if 0  /* unaligned variant: this branch is skipped, the memmove branch is compiled */
+
+        /* aligned copy and swap */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_INPLACE16(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 16);
+#  if 2 == 1
+        _NPY_SWAP_INPLACE16(dst);
+#  elif 2 == 2  /* active: _NPY_SWAP_INPLACE8 on each 8-byte half of the copy */
+        _NPY_SWAP_INPLACE8(dst);
+        _NPY_SWAP_INPLACE8(dst + 8);
+#  endif
+
+#endif
+
+#if 1
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the single 16-byte source
+ * element is loaded once (each 8-byte half through _NPY_SWAP8, order kept) and
+ * then stored N times to the contiguous dst.  Interestingly unrolling here (like
+ * above) is only marginally profitable for small types and detrimental for
+ * >= 8byte moves on x86, but it profits from vectorization enabled with -O3 */
+#if (0 == 0) && 0  /* constant false: this variant is never compiled */
+static NPY_GCC_OPT_3 int
+_swap_pair_strided_to_contig_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of copies to write */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 1)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;  /* the two 8-byte words written to every dst element */
+#endif
+    if (N == 0) {  /* nothing to write; src is never dereferenced */
+        return 0;
+    }
+#if 0 && 16 != 16
+    /* sanity check (not compiled: the guard is constant false) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_SWAP_INPLACE16(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2  /* selected branch: each half through _NPY_SWAP8, order kept */
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 0 */
+
+#endif/* 16 >= 4 */
+
+
+#line 105
+
+#if (16 >= 4) && \
+    (16 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 0 == 0 || 1 == 0
+/* Copy N 16-byte elements from a strided src (args[0]) to a contiguous dst
+ * (args[1]) via aligned npy_uint64 accesses: each 8-byte half passes through
+ * _NPY_SWAP8 and keeps its position.  Always returns 0.  Unrolling gains about
+ * 20-50% if the copy can be done in one mov instruction, if not it can decrease
+ * performance; tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4 */
+static int
+#if 1 && 2 == 0 && 16 <= NPY_SIZEOF_INTP  /* constant false (2 == 0): attribute not applied */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_pair_strided_to_contig_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of elements to copy */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0];  /* byte step between source elements */
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 1
+    /* sanity check: dst and src must pass the npy_uint64 alignment test unless N == 0 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_swap_pair_strided_to_contig_size16\n");*/
+    while (N > 0) {
+#if 1  /* aligned variant: this branch is compiled, the memmove branch is not */
+
+        /* aligned copy and swap */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_PAIR16(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2  /* active: each half through _NPY_SWAP8, order kept */
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 16);
+#  if 2 == 1
+        _NPY_SWAP_PAIR16(dst);
+#  elif 2 == 2
+        _NPY_SWAP_PAIR8(dst);
+        _NPY_SWAP_PAIR8(dst + 8);
+#  endif
+
+#endif
+
+#if 1
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+
+#if 0
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the single 16-byte source
+ * element is loaded once (each 8-byte half through _NPY_SWAP8, order kept) and
+ * then stored N times to the contiguous dst.  Interestingly unrolling here (like
+ * above) is only marginally profitable for small types and detrimental for
+ * >= 8byte moves on x86, but it profits from vectorization enabled with -O3 */
+#if (0 == 0) && 1  /* constant true: this variant is compiled */
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_strided_to_contig_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of copies to write */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 1)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;  /* the two 8-byte words written to every dst element */
+#endif
+    if (N == 0) {  /* nothing to write; src is never dereferenced */
+        return 0;
+    }
+#if 1 && 16 != 16
+    /* sanity check (not compiled here: the guard requires 16 != 16) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 1
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_SWAP_PAIR16(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2  /* active: each half through _NPY_SWAP8, order kept */
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (0 == 0) && 1 */
+
+#endif/* 16 >= 4 */
+
+
+
+#line 97
+#line 105
+
+#if (16 >= 1) && \
+    (16 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 1 == 0 || 0 == 0
+/* Copy N 16-byte elements from a contiguous src (args[0]) to a strided dst
+ * (args[1]) with one memmove per element and no alignment check; no swap step
+ * is compiled into this variant.  Always returns 0.  Unrolling gains about
+ * 20-50% if the copy can be done in one mov instruction, if not it can decrease
+ * performance; tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4 */
+static int
+#if 0 && 0 == 0 && 16 <= NPY_SIZEOF_INTP  /* constant false (leading 0): attribute not applied */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_contig_to_strided_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of elements to copy */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step between destination elements */
+#endif
+
+#if 0
+    /* sanity check (compiled out in this unaligned variant) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _contig_to_strided_size16\n");*/
+    while (N > 0) {
+#if 0  /* unaligned variant: this branch is skipped, the memmove branch is compiled */
+
+        /* aligned copy and swap */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_NOP16(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 16);
+#  if 0 == 1  /* neither branch below is active: the memmove is the whole copy */
+        _NPY_NOP16(dst);
+#  elif 0 == 2
+        _NPY_NOP8(dst);
+        _NPY_NOP8(dst + 8);
+#  endif
+
+#endif
+
+#if 0
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the single 16-byte source
+ * element is loaded once (no swap in this variant) and then stored N times,
+ * stepping dst by strides[1] bytes.  Interestingly unrolling here (like
+ * above) is only marginally profitable for small types and detrimental for
+ * >= 8byte moves on x86, but it profits from vectorization enabled with -O3 */
+#if (1 == 0) && 0  /* constant false: this variant is never compiled */
+static NPY_GCC_OPT_3 int
+_contig_to_strided_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of copies to write */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step between destination elements */
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 0)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;  /* the two 8-byte words written to every dst element */
+#endif
+    if (N == 0) {  /* nothing to write; src is never dereferenced */
+        return 0;
+    }
+#if 0 && 16 != 16
+    /* sanity check (not compiled: the guard is constant false) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_NOP16(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0  /* selected branch: plain copy of both halves */
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 16 >= 1 */
+
+
+#line 105
+
+#if (16 >= 1) && \
+    (16 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 1 == 0 || 0 == 0
+/* Copy N 16-byte elements from a contiguous src (args[0]) to a strided dst
+ * (args[1]) via two aligned npy_uint64 load/stores per element; no swap is
+ * applied in this variant.  Always returns 0.  Unrolling gains about
+ * 20-50% if the copy can be done in one mov instruction, if not it can decrease
+ * performance; tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4 */
+static int
+#if 1 && 0 == 0 && 16 <= NPY_SIZEOF_INTP  /* reduces to 16 <= NPY_SIZEOF_INTP */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_contig_to_strided_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of elements to copy */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step between destination elements */
+#endif
+
+#if 1
+    /* sanity check: dst and src must pass the npy_uint64 alignment test unless N == 0 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_contig_to_strided_size16\n");*/
+    while (N > 0) {
+#if 1  /* aligned variant: this branch is compiled, the memmove branch is not */
+
+        /* aligned copy and swap */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_NOP16(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0  /* active: plain copy of both 8-byte halves */
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 16);
+#  if 0 == 1
+        _NPY_NOP16(dst);
+#  elif 0 == 2
+        _NPY_NOP8(dst);
+        _NPY_NOP8(dst + 8);
+#  endif
+
+#endif
+
+#if 0
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the single 16-byte source
+ * element is loaded once (no swap in this variant) and then stored N times,
+ * stepping dst by strides[1] bytes.  Interestingly unrolling here (like
+ * above) is only marginally profitable for small types and detrimental for
+ * >= 8byte moves on x86, but it profits from vectorization enabled with -O3 */
+#if (1 == 0) && 1  /* constant false (1 == 0): this variant is never compiled */
+static NPY_GCC_OPT_3 int
+_aligned_contig_to_strided_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of copies to write */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step between destination elements */
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 0)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;  /* the two 8-byte words written to every dst element */
+#endif
+    if (N == 0) {  /* nothing to write; src is never dereferenced */
+        return 0;
+    }
+#if 1 && 16 != 16
+    /* sanity check (not compiled here: the guard requires 16 != 16) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_NOP16(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0  /* selected branch: plain copy of both halves */
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 16 >= 1 */
+
+
+#line 105
+
+#if (16 >= 2) && \
+    (16 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 1 == 0 || 0 == 0
+/* Copy N 16-byte elements from a contiguous src (args[0]) to a strided dst
+ * (args[1]) with no alignment check: memmove each element, then apply
+ * _NPY_SWAP_INPLACE16 to the copy in dst.  Always returns 0.  Unrolling gains about
+ * 20-50% if the copy can be done in one mov instruction, if not it can decrease
+ * performance; tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4 */
+static int
+#if 0 && 1 == 0 && 16 <= NPY_SIZEOF_INTP  /* constant false: attribute not applied */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_contig_to_strided_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of elements to copy */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step between destination elements */
+#endif
+
+#if 0
+    /* sanity check (compiled out in this unaligned variant) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _swap_contig_to_strided_size16\n");*/
+    while (N > 0) {
+#if 0  /* unaligned variant: this branch is skipped, the memmove branch is compiled */
+
+        /* aligned copy and swap */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_INPLACE16(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 16);
+#  if 1 == 1  /* active: _NPY_SWAP_INPLACE16 on the element just copied */
+        _NPY_SWAP_INPLACE16(dst);
+#  elif 1 == 2
+        _NPY_SWAP_INPLACE8(dst);
+        _NPY_SWAP_INPLACE8(dst + 8);
+#  endif
+
+#endif
+
+#if 0
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the single 16-byte source
+ * element is loaded once (halves exchanged, each through _NPY_SWAP8) and then
+ * stored N times, stepping dst by strides[1] bytes.  Interestingly unrolling here
+ * (like above) is only marginally profitable for small types and detrimental for
+ * >= 8byte moves on x86, but it profits from vectorization enabled with -O3 */
+#if (1 == 0) && 0  /* constant false: this variant is never compiled */
+static NPY_GCC_OPT_3 int
+_swap_contig_to_strided_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of copies to write */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step between destination elements */
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 0)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;  /* the two 8-byte words written to every dst element */
+#endif
+    if (N == 0) {  /* nothing to write; src is never dereferenced */
+        return 0;
+    }
+#if 0 && 16 != 16
+    /* sanity check (not compiled: the guard is constant false) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_SWAP_INPLACE16(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1  /* selected branch: halves exchanged, each through _NPY_SWAP8 */
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 16 >= 2 */
+
+
+#line 105
+
+#if (16 >= 2) && \
+    (16 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 1 == 0 || 0 == 0
+/* Copy N 16-byte elements from a contiguous src (args[0]) to a strided dst
+ * (args[1]) via aligned npy_uint64 accesses: the two 8-byte halves trade places
+ * and each passes through _NPY_SWAP8.  Always returns 0.  Unrolling gains about
+ * 20-50% if the copy can be done in one mov instruction, if not it can decrease
+ * performance; tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4 */
+static int
+#if 1 && 1 == 0 && 16 <= NPY_SIZEOF_INTP  /* constant false (1 == 0): attribute not applied */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_contig_to_strided_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of elements to copy */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step between destination elements */
+#endif
+
+#if 1
+    /* sanity check: dst and src must pass the npy_uint64 alignment test unless N == 0 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_swap_contig_to_strided_size16\n");*/
+    while (N > 0) {
+#if 1  /* aligned variant: this branch is compiled, the memmove branch is not */
+
+        /* aligned copy and swap */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP16(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1  /* active: halves exchanged, each through _NPY_SWAP8 */
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 16);
+#  if 1 == 1
+        _NPY_SWAP16(dst);
+#  elif 1 == 2
+        _NPY_SWAP8(dst);
+        _NPY_SWAP8(dst + 8);
+#  endif
+
+#endif
+
+#if 0
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the single 16-byte source
+ * element is loaded once (halves exchanged, each through _NPY_SWAP8) and then
+ * stored N times, stepping dst by strides[1] bytes.  Interestingly unrolling here
+ * (like above) is only marginally profitable for small types and detrimental for
+ * >= 8byte moves on x86, but it profits from vectorization enabled with -O3 */
+#if (1 == 0) && 1  /* constant false (1 == 0): this variant is never compiled */
+static NPY_GCC_OPT_3 int
+_aligned_swap_contig_to_strided_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of copies to write */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step between destination elements */
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 0)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;  /* the two 8-byte words written to every dst element */
+#endif
+    if (N == 0) {  /* nothing to write; src is never dereferenced */
+        return 0;
+    }
+#if 1 && 16 != 16
+    /* sanity check (not compiled here: the guard requires 16 != 16) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_SWAP16(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1  /* selected branch: halves exchanged, each through _NPY_SWAP8 */
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 16 >= 2 */
+
+
+#line 105
+
+#if (16 >= 4) && \
+    (16 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 1 == 0 || 0 == 0
+/* Copy N 16-byte elements from a contiguous src (args[0]) to a strided dst
+ * (args[1]) with no alignment check: memmove each element, then apply
+ * _NPY_SWAP_INPLACE8 to each 8-byte half of dst.  Always returns 0.  Unrolling gains
+ * about 20-50% if the copy can be done in one mov instruction, if not it can decrease
+ * performance; tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4 */
+static int
+#if 0 && 2 == 0 && 16 <= NPY_SIZEOF_INTP  /* constant false: attribute not applied */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_pair_contig_to_strided_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of elements to copy */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step between destination elements */
+#endif
+
+#if 0
+    /* sanity check (compiled out in this unaligned variant) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _swap_pair_contig_to_strided_size16\n");*/
+    while (N > 0) {
+#if 0  /* unaligned variant: this branch is skipped, the memmove branch is compiled */
+
+        /* aligned copy and swap */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_INPLACE16(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 16);
+#  if 2 == 1
+        _NPY_SWAP_INPLACE16(dst);
+#  elif 2 == 2  /* active: _NPY_SWAP_INPLACE8 on each 8-byte half of the copy */
+        _NPY_SWAP_INPLACE8(dst);
+        _NPY_SWAP_INPLACE8(dst + 8);
+#  endif
+
+#endif
+
+#if 0
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * Specialized copy and swap for source stride 0: the single 16-byte source
+ * element is loaded once (each 8-byte half through _NPY_SWAP8, order kept) and
+ * then stored N times, stepping dst by strides[1] bytes.  Interestingly unrolling
+ * here (like above) is only marginally profitable for small types and detrimental
+ * for >= 8byte moves on x86, but it profits from vectorization enabled with -O3 */
+#if (1 == 0) && 0  /* constant false: this variant is never compiled */
+static NPY_GCC_OPT_3 int
+_swap_pair_contig_to_strided_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of copies to write */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step between destination elements */
+#endif
+
+#if 16 != 16
+#  if !(16 == 1 && 0)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;  /* the two 8-byte words written to every dst element */
+#endif
+    if (N == 0) {  /* nothing to write; src is never dereferenced */
+        return 0;
+    }
+#if 0 && 16 != 16
+    /* sanity check (not compiled: the guard is constant false) */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 0
+    memset(dst, *src, N);
+#else
+
+#  if 16 != 16
+    temp = _NPY_SWAP_INPLACE16(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2  /* selected branch: each half through _NPY_SWAP8, order kept */
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 16 >= 4 */
+
+
+#line 105
+
+#if (16 >= 4) && \
+    (16 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 1 == 0 || 0 == 0
+/* Copy N 16-byte elements from a contiguous src (args[0]) to a strided dst
+ * (args[1]) via aligned npy_uint64 accesses: each 8-byte half passes through
+ * _NPY_SWAP8 and keeps its position.  Always returns 0.  Unrolling gains about
+ * 20-50% if the copy can be done in one mov instruction, if not it can decrease
+ * performance; tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4 */
+static int
+#if 1 && 2 == 0 && 16 <= NPY_SIZEOF_INTP  /* constant false (2 == 0): attribute not applied */
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_pair_contig_to_strided_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];  /* number of elements to copy */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !0
+    npy_intp dst_stride = strides[1];  /* byte step between destination elements */
+#endif
+
+#if 1
+    /* sanity check: dst and src must pass the npy_uint64 alignment test unless N == 0 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_swap_pair_contig_to_strided_size16\n");*/
+    while (N > 0) {
+#if 1  /* aligned variant: this branch is compiled, the memmove branch is not */
+
+        /* aligned copy and swap */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_PAIR16(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2  /* active: each half through _NPY_SWAP8, order kept */
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+
+        /* unaligned copy and swap */
+        memmove(dst, src, 16);
+#  if 2 == 1
+        _NPY_SWAP_PAIR16(dst);
+#  elif 2 == 2
+        _NPY_SWAP_PAIR8(dst);
+        _NPY_SWAP_PAIR8(dst + 8);
+#  endif
+
+#endif
+
+#if 0
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+
+#if 1
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here as above is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_contig_to_strided_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp dst_stride = strides[1];
+#endif
+    /* NOTE: the guard "(1 == 0) && 1" above is always false, so this function is never compiled. */
+#if 16 != 16
+#  if !(16 == 1 && 0)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 1 && 16 != 16
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 0
+    memset(dst, *src, N);
+#else
+    /* Load the single source item once; selected arm is "2 == 2" (_NPY_SWAP8 on each half, halves stay in place). */
+#  if 16 != 16
+    temp = _NPY_SWAP_PAIR16(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+    /* Store the two cached halves into each of the N destination items, stepping dst by dst_stride. */
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 0
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 0 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 16 >= 4 */
+
+
+
+#line 97
+#line 105
+
+#if (16 >= 1) && \
+    (16 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 0 || 1 == 0 || 1 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int
+#if 0 && 0 == 0 && 16 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_contig_to_contig_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+    /* NOTE: the guard "#if 0 || 1 == 0 || 1 == 0" above is false, so this plain 16-byte copy loop is never compiled. */
+#if 0
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _contig_to_contig_size16\n");*/
+    while (N > 0) {
+#if 0
+        /* Dead arm ("#if 0"): this unaligned specialization selects the memmove arm after #else. */
+        /* aligned copy and swap */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_NOP16(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+        /* Selected arm: a bare 16-byte memmove; the "0 == 1" and "0 == 2" swap arms are both false. */
+        /* unaligned copy and swap */
+        memmove(dst, src, 16);
+#  if 0 == 1
+        _NPY_NOP16(dst);
+#  elif 0 == 2
+        _NPY_NOP8(dst);
+        _NPY_NOP8(dst + 8);
+#  endif
+
+#endif
+        /* dst is contiguous: advance by the 16-byte item size. */
+#if 1
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+        /* src is contiguous: advance by the 16-byte item size. */
+#if 1
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here as above is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int
+_contig_to_contig_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+    /* NOTE: the guard "(1 == 0) && 0" above is always false, so this function is never compiled. */
+#if 16 != 16
+#  if !(16 == 1 && 1)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 16 != 16
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 1
+    memset(dst, *src, N);
+#else
+    /* Load the single source item once; selected arm is "0 == 0" (both halves taken unchanged). */
+#  if 16 != 16
+    temp = _NPY_NOP16(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+    /* Store the two cached halves into each of the N destination items, stepping dst by 16 bytes. */
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 16 >= 1 */
+
+
+#line 105
+
+#if (16 >= 1) && \
+    (16 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 0 || 1 == 0 || 1 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int
+#if 1 && 0 == 0 && 16 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_contig_to_contig_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+    /* NOTE: the guard "#if 0 || 1 == 0 || 1 == 0" above is false, so this aligned 16-byte copy loop is never compiled. */
+#if 1
+    /* sanity check: src and dst must meet the npy_uint64 alignment unless N == 0 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_contig_to_contig_size16\n");*/
+    while (N > 0) {
+#if 1
+        /* Selected arm below is "0 == 0": both 8-byte halves are copied unchanged. */
+        /* aligned copy and swap */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_NOP16(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+        /* Dead arm in this specialization: the enclosing "#if 1" selects the aligned copy above. */
+        /* unaligned copy and swap */
+        memmove(dst, src, 16);
+#  if 0 == 1
+        _NPY_NOP16(dst);
+#  elif 0 == 2
+        _NPY_NOP8(dst);
+        _NPY_NOP8(dst + 8);
+#  endif
+
+#endif
+        /* dst is contiguous: advance by the 16-byte item size. */
+#if 1
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+        /* src is contiguous: advance by the 16-byte item size. */
+#if 1
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here as above is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_contig_to_contig_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+    /* NOTE: the guard "(1 == 0) && 1" above is always false, so this function is never compiled. */
+#if 16 != 16
+#  if !(16 == 1 && 1)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 1 && 16 != 16
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 1
+    memset(dst, *src, N);
+#else
+    /* Load the single source item once; selected arm is "0 == 0" (both halves taken unchanged). */
+#  if 16 != 16
+    temp = _NPY_NOP16(*((npy_uint64 *)src));
+#  else
+#    if 0 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 0 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 0 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+    /* Store the two cached halves into each of the N destination items, stepping dst by 16 bytes. */
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 16 >= 1 */
+
+
+#line 105
+
+#if (16 >= 2) && \
+    (16 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 1 || 1 == 0 || 1 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int
+#if 0 && 1 == 0 && 16 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_contig_to_contig_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+    /* Copies N = dimensions[0] items of 16 bytes between contiguous buffers, swapping each in dst; returns 0. */
+#if 0
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _swap_contig_to_contig_size16\n");*/
+    while (N > 0) {
+#if 0
+        /* Dead arm ("#if 0"): this unaligned specialization selects the memmove arm after #else. */
+        /* aligned copy and swap */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_INPLACE16(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+        /* Selected arm is "1 == 1": memmove the item, then apply _NPY_SWAP_INPLACE16 to the copied bytes. */
+        /* unaligned copy and swap */
+        memmove(dst, src, 16);
+#  if 1 == 1
+        _NPY_SWAP_INPLACE16(dst);
+#  elif 1 == 2
+        _NPY_SWAP_INPLACE8(dst);
+        _NPY_SWAP_INPLACE8(dst + 8);
+#  endif
+
+#endif
+        /* dst is contiguous: advance by the 16-byte item size. */
+#if 1
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+        /* src is contiguous: advance by the 16-byte item size. */
+#if 1
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here as above is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_contig_to_contig_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+    /* NOTE: the guard "(1 == 0) && 0" above is always false, so this function is never compiled. */
+#if 16 != 16
+#  if !(16 == 1 && 1)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 16 != 16
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 1
+    memset(dst, *src, N);
+#else
+    /* Load the single source item once; selected arm is "1 == 1" (halves exchanged, each through _NPY_SWAP8). */
+#  if 16 != 16
+    temp = _NPY_SWAP_INPLACE16(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+    /* Store the two cached halves into each of the N destination items, stepping dst by 16 bytes. */
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 16 >= 2 */
+
+
+#line 105
+
+#if (16 >= 2) && \
+    (16 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 1 || 1 == 0 || 1 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int
+#if 1 && 1 == 0 && 16 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_contig_to_contig_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+    /* Copies N = dimensions[0] items of 16 bytes between contiguous buffers, swapping each item; returns 0. */
+#if 1
+    /* sanity check: src and dst must meet the npy_uint64 alignment unless N == 0 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_swap_contig_to_contig_size16\n");*/
+    while (N > 0) {
+#if 1
+        /* Selected arm below is "1 == 1": the two 8-byte halves trade places and each goes through _NPY_SWAP8. */
+        /* aligned copy and swap */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP16(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+        /* NOTE(review): dst half 0 is stored before src half 0 is loaded; presumably src and dst never alias -- confirm. */
+#else
+        /* Dead arm in this specialization: the enclosing "#if 1" selects the aligned copy above. */
+        /* unaligned copy and swap */
+        memmove(dst, src, 16);
+#  if 1 == 1
+        _NPY_SWAP16(dst);
+#  elif 1 == 2
+        _NPY_SWAP8(dst);
+        _NPY_SWAP8(dst + 8);
+#  endif
+
+#endif
+        /* dst is contiguous: advance by the 16-byte item size. */
+#if 1
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+        /* src is contiguous: advance by the 16-byte item size. */
+#if 1
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here as above is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_contig_to_contig_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+    /* NOTE: the guard "(1 == 0) && 1" above is always false, so this function is never compiled. */
+#if 16 != 16
+#  if !(16 == 1 && 1)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 1 && 16 != 16
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 1
+    memset(dst, *src, N);
+#else
+    /* Load the single source item once; selected arm is "1 == 1" (halves exchanged, each through _NPY_SWAP8). */
+#  if 16 != 16
+    temp = _NPY_SWAP16(*((npy_uint64 *)src));
+#  else
+#    if 1 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 1 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 1 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+    /* Store the two cached halves into each of the N destination items, stepping dst by 16 bytes. */
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 16 >= 2 */
+
+
+#line 105
+
+#if (16 >= 4) && \
+    (16 > 1 || 0) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 0)
+
+
+#if 2 || 1 == 0 || 1 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int
+#if 0 && 2 == 0 && 16 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_swap_pair_contig_to_contig_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+    /* Copies N = dimensions[0] items of 16 bytes between contiguous buffers, swapping each 8-byte half in dst; returns 0. */
+#if 0
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _swap_pair_contig_to_contig_size16\n");*/
+    while (N > 0) {
+#if 0
+        /* Dead arm ("#if 0"): this unaligned specialization selects the memmove arm after #else. */
+        /* aligned copy and swap */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_INPLACE16(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+        /* Selected arm is "2 == 2": memmove the item, then _NPY_SWAP_INPLACE8 on bytes 0-7 and on bytes 8-15. */
+        /* unaligned copy and swap */
+        memmove(dst, src, 16);
+#  if 2 == 1
+        _NPY_SWAP_INPLACE16(dst);
+#  elif 2 == 2
+        _NPY_SWAP_INPLACE8(dst);
+        _NPY_SWAP_INPLACE8(dst + 8);
+#  endif
+
+#endif
+        /* dst is contiguous: advance by the 16-byte item size. */
+#if 1
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+        /* src is contiguous: advance by the 16-byte item size. */
+#if 1
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here as above is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 0
+static NPY_GCC_OPT_3 int
+_swap_pair_contig_to_contig_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+    /* NOTE: the guard "(1 == 0) && 0" above is always false, so this function is never compiled. */
+#if 16 != 16
+#  if !(16 == 1 && 1)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 0 && 16 != 16
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 1
+    memset(dst, *src, N);
+#else
+    /* Load the single source item once; selected arm is "2 == 2" (_NPY_SWAP8 on each half, halves stay in place). */
+#  if 16 != 16
+    temp = _NPY_SWAP_INPLACE16(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+    /* Store the two cached halves into each of the N destination items, stepping dst by 16 bytes. */
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 0 */
+
+#endif/* 16 >= 4 */
+
+
+#line 105
+
+#if (16 >= 4) && \
+    (16 > 1 || 1) && \
+    (!NPY_USE_UNALIGNED_ACCESS || 1)
+
+
+#if 2 || 1 == 0 || 1 == 0
+/*
+ * unrolling gains about 20-50% if the copy can be done in one mov instruction
+ * if not it can decrease performance
+ * tested to improve performance on intel xeon 5x/7x, core2duo, amd phenom x4
+ */
+static int
+#if 1 && 2 == 0 && 16 <= NPY_SIZEOF_INTP
+    NPY_GCC_UNROLL_LOOPS
+#endif
+_aligned_swap_pair_contig_to_contig_size16(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0];
+#endif
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+    /* Copies N = dimensions[0] items of 16 bytes between contiguous buffers, swapping each 8-byte half; returns 0. */
+#if 1
+    /* sanity check: src and dst must meet the npy_uint64 alignment unless N == 0 */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+    /*printf("fn _aligned_swap_pair_contig_to_contig_size16\n");*/
+    while (N > 0) {
+#if 1
+        /* Selected arm below is "2 == 2": each 8-byte half goes through _NPY_SWAP8 and stays in place. */
+        /* aligned copy and swap */
+#  if 16 != 16
+        (*((npy_uint64 *)dst)) = _NPY_SWAP_PAIR16(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        (*((npy_uint64 *)dst)) = (*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        (*((npy_uint64 *)dst)) = _NPY_SWAP8(*((npy_uint64 *)src));
+        (*((npy_uint64 *)dst + 1)) = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+
+#else
+        /* Dead arm in this specialization: the enclosing "#if 1" selects the aligned copy above. */
+        /* unaligned copy and swap */
+        memmove(dst, src, 16);
+#  if 2 == 1
+        _NPY_SWAP_PAIR16(dst);
+#  elif 2 == 2
+        _NPY_SWAP_PAIR8(dst);
+        _NPY_SWAP_PAIR8(dst + 8);
+#  endif
+
+#endif
+        /* dst is contiguous: advance by the 16-byte item size. */
+#if 1
+        dst += 16;
+#else
+        dst += dst_stride;
+#endif
+        /* src is contiguous: advance by the 16-byte item size. */
+#if 1
+        src += 16;
+#else
+        src += src_stride;
+#endif
+
+        --N;
+    }
+    return 0;
+}
+#endif
+
+
+/*
+ * specialized copy and swap for source stride 0,
+ * interestingly, unrolling here as above is only marginally profitable for
+ * small types and detrimental for >= 8byte moves on x86
+ * but it profits from vectorization enabled with -O3
+ */
+#if (1 == 0) && 1
+static NPY_GCC_OPT_3 int
+_aligned_swap_pair_contig_to_contig_size16_srcstride0(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(auxdata))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp dst_stride = strides[1];
+#endif
+    /* NOTE: the guard "(1 == 0) && 1" above is always false, so this function is never compiled. */
+#if 16 != 16
+#  if !(16 == 1 && 1)
+    npy_uint64 temp;
+#  endif
+#else
+    npy_uint64 temp0, temp1;
+#endif
+    if (N == 0) {
+        return 0;
+    }
+#if 1 && 16 != 16
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF_UINT(npy_uint64)));
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF_UINT(npy_uint64)));
+#endif
+#if 16 == 1 && 1
+    memset(dst, *src, N);
+#else
+    /* Load the single source item once; selected arm is "2 == 2" (_NPY_SWAP8 on each half, halves stay in place). */
+#  if 16 != 16
+    temp = _NPY_SWAP_PAIR16(*((npy_uint64 *)src));
+#  else
+#    if 2 == 0
+        temp0 = (*((npy_uint64 *)src));
+        temp1 = (*((npy_uint64 *)src + 1));
+#    elif 2 == 1
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src));
+#    elif 2 == 2
+        temp0 = _NPY_SWAP8(*((npy_uint64 *)src));
+        temp1 = _NPY_SWAP8(*((npy_uint64 *)src + 1));
+#    endif
+#  endif
+    /* Store the two cached halves into each of the N destination items, stepping dst by 16 bytes. */
+    while (N > 0) {
+#  if 16 != 16
+        *((npy_uint64 *)dst) = temp;
+#  else
+        *((npy_uint64 *)dst) = temp0;
+        *((npy_uint64 *)dst + 1) = temp1;
+#  endif
+#  if 1
+        dst += 16;
+#  else
+        dst += dst_stride;
+#  endif
+        --N;
+    }
+#endif/* @elsize == 1 && 1 -- else */
+    return 0;
+}
+#endif/* (1 == 0) && 1 */
+
+#endif/* 16 >= 4 */
+
+
+
+
+
+/* Generic strided copy: moves dimensions[0] items of the source elsize. */
+static int
+_strided_to_strided(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    const npy_intp nbytes = context->descriptors[0]->elsize;
+    const npy_intp in_step = strides[0], out_step = strides[1];
+    char *in = args[0], *out = args[1];
+    npy_intp left;
+    /* one memmove per item; both pointers then advance by their stride */
+    for (left = dimensions[0]; left > 0; --left) {
+        memmove(out, in, nbytes);
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+
+/*
+ * Generic strided copy that also reverses the byte order of every item.
+ * NOTE: This function is currently unused. It would currently be used for
+ *       builtin dtypes that have an elsize other than 2, 4, 8, or 16 bytes.
+ *       Since unicode and complex swap differently, no such dtype exists.
+ */
+static int
+_swap_strided_to_strided(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    const npy_intp nbytes = context->descriptors[0]->elsize;
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp left;
+
+    for (left = dimensions[0]; left > 0; --left) {
+        char *lo = out;
+        char *hi = out + nbytes - 1;
+
+        memmove(out, in, nbytes);
+        /* mirror the copied bytes around the middle of the item */
+        for (; lo < hi; ++lo, --hi) {
+            const char held = *lo;
+            *lo = *hi;
+            *hi = held;
+        }
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+
+/*
+ * Generic strided copy for items made of two equally sized halves:
+ * every item is copied and then each half is byte-reversed on its own.
+ * When the item size is odd the trailing byte is copied without moving.
+ * Always returns 0.
+ */
+static int
+_swap_pair_strided_to_strided(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    const npy_intp nbytes = context->descriptors[0]->elsize;
+    const npy_intp half = nbytes / 2;
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp left;
+
+    for (left = dimensions[0]; left > 0; --left) {
+        int part;
+
+        memmove(out, in, nbytes);
+        /* part 0 reverses bytes [0, half), part 1 reverses [half, 2*half) */
+        for (part = 0; part < 2; ++part) {
+            char *lo = out + part*half;
+            char *hi = lo + half - 1;
+
+            while (lo < hi) {
+                const char held = *lo;
+                *lo++ = *hi;
+                *hi-- = held;
+            }
+        }
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+
+/* Contiguous copy: one memmove covering all dimensions[0] items at once. */
+static int
+_contig_to_contig(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *NPY_UNUSED(strides),
+        NpyAuxData *NPY_UNUSED(data))
+{
+    const npy_intp total = context->descriptors[0]->elsize * dimensions[0];
+
+    /* args[0] is the source and args[1] the destination */
+    memmove(args[1], args[0], total);
+    return 0;
+}
+
+
+NPY_NO_EXPORT PyArrayMethod_StridedLoop *
+PyArray_GetStridedCopyFn(int aligned, npy_intp src_stride,
+                         npy_intp dst_stride, npy_intp itemsize)
+{
+/*
+ * Skip the "unaligned" versions on CPUs which support unaligned
+ * memory accesses.
+ */
+#if !NPY_USE_UNALIGNED_ACCESS
+    if (aligned) {
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+
+        /* contiguous dst */
+        if (itemsize != 0 && dst_stride == itemsize) {
+            /* constant src */
+            if (src_stride == 0) {
+                switch (itemsize) {
+#line 402
+                    case 1:
+                        return
+                          &_aligned_strided_to_contig_size1_srcstride0;
+
+#line 402
+                    case 2:
+                        return
+                          &_aligned_strided_to_contig_size2_srcstride0;
+
+#line 402
+                    case 4:
+                        return
+                          &_aligned_strided_to_contig_size4_srcstride0;
+
+#line 402
+                    case 8:
+                        return
+                          &_aligned_strided_to_contig_size8_srcstride0;
+
+#line 402
+                    case 16:
+                        return
+                          &_aligned_strided_to_contig_size16_srcstride0;
+
+                }
+            }
+            /* contiguous src */
+            else if (src_stride == itemsize) {
+                return &_contig_to_contig;
+            }
+            /* general src */
+            else {
+                switch (itemsize) {
+#line 418
+                    case 1:
+                        return &_aligned_strided_to_contig_size1;
+
+#line 418
+                    case 2:
+                        return &_aligned_strided_to_contig_size2;
+
+#line 418
+                    case 4:
+                        return &_aligned_strided_to_contig_size4;
+
+#line 418
+                    case 8:
+                        return &_aligned_strided_to_contig_size8;
+
+#line 418
+                    case 16:
+                        return &_aligned_strided_to_contig_size16;
+
+                }
+            }
+
+            return &_strided_to_strided;
+        }
+        /* general dst */
+        else {
+            /* constant src */
+            if (src_stride == 0) {
+                switch (itemsize) {
+#line 434
+                    case 1:
+                        return
+                          &_aligned_strided_to_strided_size1_srcstride0;
+
+#line 434
+                    case 2:
+                        return
+                          &_aligned_strided_to_strided_size2_srcstride0;
+
+#line 434
+                    case 4:
+                        return
+                          &_aligned_strided_to_strided_size4_srcstride0;
+
+#line 434
+                    case 8:
+                        return
+                          &_aligned_strided_to_strided_size8_srcstride0;
+
+#line 434
+                    case 16:
+                        return
+                          &_aligned_strided_to_strided_size16_srcstride0;
+
+                }
+            }
+            /* contiguous src */
+            else if (src_stride == itemsize) {
+                switch (itemsize) {
+#line 446
+                    case 1:
+                        return &_aligned_contig_to_strided_size1;
+
+#line 446
+                    case 2:
+                        return &_aligned_contig_to_strided_size2;
+
+#line 446
+                    case 4:
+                        return &_aligned_contig_to_strided_size4;
+
+#line 446
+                    case 8:
+                        return &_aligned_contig_to_strided_size8;
+
+#line 446
+                    case 16:
+                        return &_aligned_contig_to_strided_size16;
+
+                }
+
+                return &_strided_to_strided;
+            }
+            else {
+                switch (itemsize) {
+#line 458
+                    case 1:
+                        return &_aligned_strided_to_strided_size1;
+
+#line 458
+                    case 2:
+                        return &_aligned_strided_to_strided_size2;
+
+#line 458
+                    case 4:
+                        return &_aligned_strided_to_strided_size4;
+
+#line 458
+                    case 8:
+                        return &_aligned_strided_to_strided_size8;
+
+#line 458
+                    case 16:
+                        return &_aligned_strided_to_strided_size16;
+
+                }
+            }
+        }
+
+#if !NPY_USE_UNALIGNED_ACCESS
+    }
+    else {
+        if (itemsize != 0) {
+            if (dst_stride == itemsize) {
+                /* contiguous dst */
+                if (src_stride == itemsize) {
+                    /* contiguous src, dst */
+                    return &_contig_to_contig;
+                }
+                else {
+                    /* general src */
+                    switch (itemsize) {
+                        case 1:
+                            return &_aligned_strided_to_contig_size1;
+#line 483
+                        case 2:
+                            return &_strided_to_contig_size2;
+
+#line 483
+                        case 4:
+                            return &_strided_to_contig_size4;
+
+#line 483
+                        case 8:
+                            return &_strided_to_contig_size8;
+
+#line 483
+                        case 16:
+                            return &_strided_to_contig_size16;
+
+                    }
+                }
+
+                return &_strided_to_strided;
+            }
+            else if (src_stride == itemsize) {
+                /* contiguous src, general dst */
+                switch (itemsize) {
+                    case 1:
+                        return &_aligned_contig_to_strided_size1;
+#line 499
+                    case 2:
+                        return &_contig_to_strided_size2;
+
+#line 499
+                    case 4:
+                        return &_contig_to_strided_size4;
+
+#line 499
+                    case 8:
+                        return &_contig_to_strided_size8;
+
+#line 499
+                    case 16:
+                        return &_contig_to_strided_size16;
+
+                }
+
+                return &_strided_to_strided;
+            }
+        }
+        else {
+            /* general src, dst */
+            switch (itemsize) {
+                case 1:
+                    return &_aligned_strided_to_strided_size1;
+#line 515
+                case 2:
+                    return &_strided_to_strided_size2;
+
+#line 515
+                case 4:
+                    return &_strided_to_strided_size4;
+
+#line 515
+                case 8:
+                    return &_strided_to_strided_size8;
+
+#line 515
+                case 16:
+                    return &_strided_to_strided_size16;
+
+            }
+        }
+    }
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+
+    return &_strided_to_strided;
+}
+
+/*
+ * PyArray_GetStridedCopySwapFn and PyArray_GetStridedCopySwapPairFn are
+ * nearly identical, so both are produced from a single repeated template.
+ */
+#line 535
+
+NPY_NO_EXPORT PyArrayMethod_StridedLoop *
+PyArray_GetStridedCopySwapFn(int aligned, npy_intp src_stride,
+                             npy_intp dst_stride, npy_intp itemsize)
+{
+/*
+ * Picks a `_swap_*` loop from alignment, strides and itemsize. The
+ * "unaligned" half is skipped on CPUs which support unaligned accesses.
+ */
+#if !NPY_USE_UNALIGNED_ACCESS
+    if (aligned) {
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+
+        /* contiguous dst: dst_stride == itemsize, itemsize non-zero */
+        if (itemsize != 0 && dst_stride == itemsize) {
+            /* constant src (src_stride == 0) */
+            if (src_stride == 0) {
+                switch (itemsize) {
+#line 556
+#if 1 || 2 > 2 /* always true in this function */
+                case 2:
+                    return
+                 &_aligned_swap_strided_to_contig_size2_srcstride0;
+#endif
+
+#line 556
+#if 1 || 4 > 2
+                case 4:
+                    return
+                 &_aligned_swap_strided_to_contig_size4_srcstride0;
+#endif
+
+#line 556
+#if 1 || 8 > 2
+                case 8:
+                    return
+                 &_aligned_swap_strided_to_contig_size8_srcstride0;
+#endif
+
+#line 556
+#if 1 || 16 > 2
+                case 16:
+                    return
+                 &_aligned_swap_strided_to_contig_size16_srcstride0;
+#endif
+
+                } /* no case matched: ends at the final return */
+            }
+            /* contiguous src */
+            else if (src_stride == itemsize) {
+                switch (itemsize) {
+#line 570
+#if 1 || 2 > 2
+                case 2:
+                    return &_aligned_swap_contig_to_contig_size2;
+#endif
+
+#line 570
+#if 1 || 4 > 2
+                case 4:
+                    return &_aligned_swap_contig_to_contig_size4;
+#endif
+
+#line 570
+#if 1 || 8 > 2
+                case 8:
+                    return &_aligned_swap_contig_to_contig_size8;
+#endif
+
+#line 570
+#if 1 || 16 > 2
+                case 16:
+                    return &_aligned_swap_contig_to_contig_size16;
+#endif
+
+                }
+            }
+            /* general src */
+            else {
+                switch (itemsize) {
+#line 583
+#if 1 || 2 > 2
+                case 2:
+                    return &_aligned_swap_strided_to_contig_size2;
+#endif
+
+#line 583
+#if 1 || 4 > 2
+                case 4:
+                    return &_aligned_swap_strided_to_contig_size4;
+#endif
+
+#line 583
+#if 1 || 8 > 2
+                case 8:
+                    return &_aligned_swap_strided_to_contig_size8;
+#endif
+
+#line 583
+#if 1 || 16 > 2
+                case 16:
+                    return &_aligned_swap_strided_to_contig_size16;
+#endif
+
+                }
+            }
+        }
+        /* general dst (any other dst_stride, or itemsize == 0) */
+        else {
+            /* constant src (src_stride == 0) */
+            if (src_stride == 0) {
+                switch (itemsize) {
+#line 599
+#if 1 || 2 > 2
+                case 2:
+                    return
+                &_aligned_swap_strided_to_strided_size2_srcstride0;
+#endif
+
+#line 599
+#if 1 || 4 > 2
+                case 4:
+                    return
+                &_aligned_swap_strided_to_strided_size4_srcstride0;
+#endif
+
+#line 599
+#if 1 || 8 > 2
+                case 8:
+                    return
+                &_aligned_swap_strided_to_strided_size8_srcstride0;
+#endif
+
+#line 599
+#if 1 || 16 > 2
+                case 16:
+                    return
+                &_aligned_swap_strided_to_strided_size16_srcstride0;
+#endif
+
+                }
+            }
+            /* contiguous src */
+            else if (src_stride == itemsize) {
+                switch (itemsize) {
+#line 613
+#if 1 || 2 > 2
+                case 2:
+                    return &_aligned_swap_contig_to_strided_size2;
+#endif
+
+#line 613
+#if 1 || 4 > 2
+                case 4:
+                    return &_aligned_swap_contig_to_strided_size4;
+#endif
+
+#line 613
+#if 1 || 8 > 2
+                case 8:
+                    return &_aligned_swap_contig_to_strided_size8;
+#endif
+
+#line 613
+#if 1 || 16 > 2
+                case 16:
+                    return &_aligned_swap_contig_to_strided_size16;
+#endif
+
+                }
+
+                return  &_swap_strided_to_strided;
+            }
+            else { /* general src */
+                switch (itemsize) {
+#line 627
+#if 1 || 2 > 2
+                case 2:
+                    return &_aligned_swap_strided_to_strided_size2;
+#endif
+
+#line 627
+#if 1 || 4 > 2
+                case 4:
+                    return &_aligned_swap_strided_to_strided_size4;
+#endif
+
+#line 627
+#if 1 || 8 > 2
+                case 8:
+                    return &_aligned_swap_strided_to_strided_size8;
+#endif
+
+#line 627
+#if 1 || 16 > 2
+                case 16:
+                    return &_aligned_swap_strided_to_strided_size16;
+#endif
+
+                }
+            }
+        }
+
+#if !NPY_USE_UNALIGNED_ACCESS
+    }
+    else { /* !aligned: loops without the _aligned prefix */
+        /* contiguous dst */
+        if (itemsize != 0 && dst_stride == itemsize) {
+            /* contiguous src (no srcstride0 special case in this half) */
+            if (src_stride == itemsize) {
+                switch (itemsize) {
+#line 647
+#if 1 || 2 > 2
+                case 2:
+                    return &_swap_contig_to_contig_size2;
+#endif
+
+#line 647
+#if 1 || 4 > 2
+                case 4:
+                    return &_swap_contig_to_contig_size4;
+#endif
+
+#line 647
+#if 1 || 8 > 2
+                case 8:
+                    return &_swap_contig_to_contig_size8;
+#endif
+
+#line 647
+#if 1 || 16 > 2
+                case 16:
+                    return &_swap_contig_to_contig_size16;
+#endif
+
+                }
+            }
+            /* general src */
+            else {
+                switch (itemsize) {
+#line 660
+#if 1 || 2 > 2
+                    case 2:
+                        return &_swap_strided_to_contig_size2;
+#endif
+
+#line 660
+#if 1 || 4 > 2
+                    case 4:
+                        return &_swap_strided_to_contig_size4;
+#endif
+
+#line 660
+#if 1 || 8 > 2
+                    case 8:
+                        return &_swap_strided_to_contig_size8;
+#endif
+
+#line 660
+#if 1 || 16 > 2
+                    case 16:
+                        return &_swap_strided_to_contig_size16;
+#endif
+
+                }
+            }
+
+            return  &_swap_strided_to_strided;
+        }
+        /* general dst */
+        else {
+            /* contiguous src */
+            if (itemsize != 0 && src_stride == itemsize) {
+                switch (itemsize) {
+#line 678
+#if 1 || 2 > 2
+                case 2:
+                    return &_swap_contig_to_strided_size2;
+#endif
+
+#line 678
+#if 1 || 4 > 2
+                case 4:
+                    return &_swap_contig_to_strided_size4;
+#endif
+
+#line 678
+#if 1 || 8 > 2
+                case 8:
+                    return &_swap_contig_to_strided_size8;
+#endif
+
+#line 678
+#if 1 || 16 > 2
+                case 16:
+                    return &_swap_contig_to_strided_size16;
+#endif
+
+                }
+
+                return  &_swap_strided_to_strided;
+            }
+            /* general src */
+            else {
+                switch (itemsize) {
+#line 693
+#if 1 || 2 > 2
+                case 2:
+                    return &_swap_strided_to_strided_size2;
+#endif
+
+#line 693
+#if 1 || 4 > 2
+                case 4:
+                    return &_swap_strided_to_strided_size4;
+#endif
+
+#line 693
+#if 1 || 8 > 2
+                case 8:
+                    return &_swap_strided_to_strided_size8;
+#endif
+
+#line 693
+#if 1 || 16 > 2
+                case 16:
+                    return &_swap_strided_to_strided_size16;
+#endif
+
+                }
+            }
+        }
+    }
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+    /* fallback when nothing above returned a specialized loop */
+    return &_swap_strided_to_strided;
+}
+
+
+#line 535
+
+NPY_NO_EXPORT PyArrayMethod_StridedLoop *
+PyArray_GetStridedCopySwapPairFn(int aligned, npy_intp src_stride,
+                             npy_intp dst_stride, npy_intp itemsize)
+{
+/*
+ * Picks a `_swap_pair_*` loop from alignment, strides and itemsize. The
+ * "unaligned" half is skipped on CPUs which support unaligned accesses.
+ */
+#if !NPY_USE_UNALIGNED_ACCESS
+    if (aligned) {
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+
+        /* contiguous dst: dst_stride == itemsize, itemsize non-zero */
+        if (itemsize != 0 && dst_stride == itemsize) {
+            /* constant src (src_stride == 0) */
+            if (src_stride == 0) {
+                switch (itemsize) {
+#line 556
+#if 0 || 2 > 2 /* false: the size-2 case is compiled out for pair swaps */
+                case 2:
+                    return
+                 &_aligned_swap_pair_strided_to_contig_size2_srcstride0;
+#endif
+
+#line 556
+#if 0 || 4 > 2
+                case 4:
+                    return
+                 &_aligned_swap_pair_strided_to_contig_size4_srcstride0;
+#endif
+
+#line 556
+#if 0 || 8 > 2
+                case 8:
+                    return
+                 &_aligned_swap_pair_strided_to_contig_size8_srcstride0;
+#endif
+
+#line 556
+#if 0 || 16 > 2
+                case 16:
+                    return
+                 &_aligned_swap_pair_strided_to_contig_size16_srcstride0;
+#endif
+
+                } /* no case matched: ends at the final return */
+            }
+            /* contiguous src */
+            else if (src_stride == itemsize) {
+                switch (itemsize) {
+#line 570
+#if 0 || 2 > 2
+                case 2:
+                    return &_aligned_swap_pair_contig_to_contig_size2;
+#endif
+
+#line 570
+#if 0 || 4 > 2
+                case 4:
+                    return &_aligned_swap_pair_contig_to_contig_size4;
+#endif
+
+#line 570
+#if 0 || 8 > 2
+                case 8:
+                    return &_aligned_swap_pair_contig_to_contig_size8;
+#endif
+
+#line 570
+#if 0 || 16 > 2
+                case 16:
+                    return &_aligned_swap_pair_contig_to_contig_size16;
+#endif
+
+                }
+            }
+            /* general src */
+            else {
+                switch (itemsize) {
+#line 583
+#if 0 || 2 > 2
+                case 2:
+                    return &_aligned_swap_pair_strided_to_contig_size2;
+#endif
+
+#line 583
+#if 0 || 4 > 2
+                case 4:
+                    return &_aligned_swap_pair_strided_to_contig_size4;
+#endif
+
+#line 583
+#if 0 || 8 > 2
+                case 8:
+                    return &_aligned_swap_pair_strided_to_contig_size8;
+#endif
+
+#line 583
+#if 0 || 16 > 2
+                case 16:
+                    return &_aligned_swap_pair_strided_to_contig_size16;
+#endif
+
+                }
+            }
+        }
+        /* general dst (any other dst_stride, or itemsize == 0) */
+        else {
+            /* constant src (src_stride == 0) */
+            if (src_stride == 0) {
+                switch (itemsize) {
+#line 599
+#if 0 || 2 > 2
+                case 2:
+                    return
+                &_aligned_swap_pair_strided_to_strided_size2_srcstride0;
+#endif
+
+#line 599
+#if 0 || 4 > 2
+                case 4:
+                    return
+                &_aligned_swap_pair_strided_to_strided_size4_srcstride0;
+#endif
+
+#line 599
+#if 0 || 8 > 2
+                case 8:
+                    return
+                &_aligned_swap_pair_strided_to_strided_size8_srcstride0;
+#endif
+
+#line 599
+#if 0 || 16 > 2
+                case 16:
+                    return
+                &_aligned_swap_pair_strided_to_strided_size16_srcstride0;
+#endif
+
+                }
+            }
+            /* contiguous src */
+            else if (src_stride == itemsize) {
+                switch (itemsize) {
+#line 613
+#if 0 || 2 > 2
+                case 2:
+                    return &_aligned_swap_pair_contig_to_strided_size2;
+#endif
+
+#line 613
+#if 0 || 4 > 2
+                case 4:
+                    return &_aligned_swap_pair_contig_to_strided_size4;
+#endif
+
+#line 613
+#if 0 || 8 > 2
+                case 8:
+                    return &_aligned_swap_pair_contig_to_strided_size8;
+#endif
+
+#line 613
+#if 0 || 16 > 2
+                case 16:
+                    return &_aligned_swap_pair_contig_to_strided_size16;
+#endif
+
+                }
+
+                return  &_swap_pair_strided_to_strided;
+            }
+            else { /* general src */
+                switch (itemsize) {
+#line 627
+#if 0 || 2 > 2
+                case 2:
+                    return &_aligned_swap_pair_strided_to_strided_size2;
+#endif
+
+#line 627
+#if 0 || 4 > 2
+                case 4:
+                    return &_aligned_swap_pair_strided_to_strided_size4;
+#endif
+
+#line 627
+#if 0 || 8 > 2
+                case 8:
+                    return &_aligned_swap_pair_strided_to_strided_size8;
+#endif
+
+#line 627
+#if 0 || 16 > 2
+                case 16:
+                    return &_aligned_swap_pair_strided_to_strided_size16;
+#endif
+
+                }
+            }
+        }
+
+#if !NPY_USE_UNALIGNED_ACCESS
+    }
+    else { /* !aligned: loops without the _aligned prefix */
+        /* contiguous dst */
+        if (itemsize != 0 && dst_stride == itemsize) {
+            /* contiguous src (no srcstride0 special case in this half) */
+            if (src_stride == itemsize) {
+                switch (itemsize) {
+#line 647
+#if 0 || 2 > 2
+                case 2:
+                    return &_swap_pair_contig_to_contig_size2;
+#endif
+
+#line 647
+#if 0 || 4 > 2
+                case 4:
+                    return &_swap_pair_contig_to_contig_size4;
+#endif
+
+#line 647
+#if 0 || 8 > 2
+                case 8:
+                    return &_swap_pair_contig_to_contig_size8;
+#endif
+
+#line 647
+#if 0 || 16 > 2
+                case 16:
+                    return &_swap_pair_contig_to_contig_size16;
+#endif
+
+                }
+            }
+            /* general src */
+            else {
+                switch (itemsize) {
+#line 660
+#if 0 || 2 > 2
+                    case 2:
+                        return &_swap_pair_strided_to_contig_size2;
+#endif
+
+#line 660
+#if 0 || 4 > 2
+                    case 4:
+                        return &_swap_pair_strided_to_contig_size4;
+#endif
+
+#line 660
+#if 0 || 8 > 2
+                    case 8:
+                        return &_swap_pair_strided_to_contig_size8;
+#endif
+
+#line 660
+#if 0 || 16 > 2
+                    case 16:
+                        return &_swap_pair_strided_to_contig_size16;
+#endif
+
+                }
+            }
+
+            return  &_swap_pair_strided_to_strided;
+        }
+        /* general dst */
+        else {
+            /* contiguous src */
+            if (itemsize != 0 && src_stride == itemsize) {
+                switch (itemsize) {
+#line 678
+#if 0 || 2 > 2
+                case 2:
+                    return &_swap_pair_contig_to_strided_size2;
+#endif
+
+#line 678
+#if 0 || 4 > 2
+                case 4:
+                    return &_swap_pair_contig_to_strided_size4;
+#endif
+
+#line 678
+#if 0 || 8 > 2
+                case 8:
+                    return &_swap_pair_contig_to_strided_size8;
+#endif
+
+#line 678
+#if 0 || 16 > 2
+                case 16:
+                    return &_swap_pair_contig_to_strided_size16;
+#endif
+
+                }
+
+                return  &_swap_pair_strided_to_strided;
+            }
+            /* general src */
+            else {
+                switch (itemsize) {
+#line 693
+#if 0 || 2 > 2
+                case 2:
+                    return &_swap_pair_strided_to_strided_size2;
+#endif
+
+#line 693
+#if 0 || 4 > 2
+                case 4:
+                    return &_swap_pair_strided_to_strided_size4;
+#endif
+
+#line 693
+#if 0 || 8 > 2
+                case 8:
+                    return &_swap_pair_strided_to_strided_size8;
+#endif
+
+#line 693
+#if 0 || 16 > 2
+                case 16:
+                    return &_swap_pair_strided_to_strided_size16;
+#endif
+
+                }
+            }
+        }
+    }
+#endif/*!NPY_USE_UNALIGNED_ACCESS*/
+    /* fallback; also the only result for itemsize 2 (its cases are off) */
+    return &_swap_pair_strided_to_strided;
+}
+
+
+
+/************* STRIDED CASTING SPECIALIZED FUNCTIONS *************/
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* aligned variant: always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false here, so the plain npy_bool definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 1 /* active branch: the result is normalized to 0 or 1 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, strided bool -> bool cast of dimensions[0] elements. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: both strides are read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: the aligned variant needs no staging temporaries */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: both pointers must be aligned unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_bool_to_bool\n");*/
+
+    while (N--) {
+#if 1 /* aligned: nothing is loaded here, src is dereferenced below */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: read, convert and store directly through the pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned: the value was already stored above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_bool);
+#else /* active: advance by the strides read from `strides` */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* 0 is the only value this loop returns */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* omitted if unaligned access is OK */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false here, so the plain npy_bool definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 1 /* active branch: the result is normalized to 0 or 1 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided bool -> bool cast of dimensions[0] elements, via memmove. */
+static NPY_GCC_OPT_3 int
+_cast_bool_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: both strides are read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: each element is staged in a local */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_bool_to_bool\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: copy the element in without a typed dereference of src */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert between the two staged locals */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: copy the converted value out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_bool);
+#else /* active: advance by the strides read from `strides` */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* 0 is the only value this loop returns */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* aligned variant: always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false here, so the plain npy_bool definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 1 /* active branch: the result is normalized to 0 or 1 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, contiguous bool -> bool cast of dimensions[0] elements. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* false: the contiguous variant never reads `strides` */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: the aligned variant needs no staging temporaries */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: both pointers must be aligned unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_bool_to_bool\n");*/
+
+    while (N--) {
+#if 1 /* aligned: nothing is loaded here, src is dereferenced below */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: read, convert and store directly through the pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned: the value was already stored above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: contiguous, so step by the element size */
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* 0 is the only value this loop returns */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: this non-"_aligned" variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+/* Active branch: source (_TYPE1) and destination (_TYPE2) element types of this loop. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two (npy_half based) groups are disabled here, so the final #else group defines _CONVERT_FN */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active definition: maps any nonzero x to 1 and zero to 0. */
+#  if 1 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] consecutive npy_bool items at args[0] into npy_bool items at args[1], copying each value via memmove; strides is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries for the memmove-based load/store; the [2] forms are disabled. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N is 0 (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_bool_to_bool\n");*/
+    /* One element per iteration; N counts down to 0. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer "#if 0" group uses a two-element src_value and is disabled; only the "#  elif !0" statement of the #else group is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write the converted temporary to dst via memmove. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous: step both pointers by one element size. */
+#if 1
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-specialization macros so a later expansion can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard: !1 is 0, so this "_aligned" variant is compiled regardless of NPY_USE_UNALIGNED_ACCESS. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+/* Active branch: source (_TYPE1) and destination (_TYPE2) element types of this loop. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two (npy_half based) groups are disabled here, so the final #else group defines _CONVERT_FN */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active definition: maps any nonzero x to 1 and zero to 0. */
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_bool items at args[0] into npy_ubyte items at args[1] using direct typed loads/stores, stepping by strides[0]/strides[1] bytes. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries are declared here: both alternatives of each group below are disabled. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_bool_to_ubyte\n");*/
+    /* One element per iteration; N counts down to 0. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer "#if 0" group uses a two-element src_value and is disabled; only the final "#  else" statement of the #else group is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Nothing is emitted by the group below: the cast above already stored directly into dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided: step both pointers by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-specialization macros so a later expansion can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: this non-"_aligned" variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+/* Active branch: source (_TYPE1) and destination (_TYPE2) element types of this loop. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two (npy_half based) groups are disabled here, so the final #else group defines _CONVERT_FN */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active definition: maps any nonzero x to 1 and zero to 0. */
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_bool items at args[0] into npy_ubyte items at args[1], copying each value via memmove and stepping by strides[0]/strides[1] bytes. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_bool_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries for the memmove-based load/store; the [2] forms are disabled. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N is 0 (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_bool_to_ubyte\n");*/
+    /* One element per iteration; N counts down to 0. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer "#if 0" group uses a two-element src_value and is disabled; only the "#  elif !0" statement of the #else group is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write the converted temporary to dst via memmove. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided: step both pointers by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-specialization macros so a later expansion can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard: !1 is 0, so this "_aligned" variant is compiled regardless of NPY_USE_UNALIGNED_ACCESS. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+/* Active branch: source (_TYPE1) and destination (_TYPE2) element types of this loop. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two (npy_half based) groups are disabled here, so the final #else group defines _CONVERT_FN */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active definition: maps any nonzero x to 1 and zero to 0. */
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] consecutive npy_bool items at args[0] into npy_ubyte items at args[1] using direct typed loads/stores; strides is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries are declared here: both alternatives of each group below are disabled. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_bool_to_ubyte\n");*/
+    /* One element per iteration; N counts down to 0. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer "#if 0" group uses a two-element src_value and is disabled; only the final "#  else" statement of the #else group is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Nothing is emitted by the group below: the cast above already stored directly into dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous: step both pointers by one element size. */
+#if 1
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-specialization macros so a later expansion can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: this non-"_aligned" variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+/* Active branch: source (_TYPE1) and destination (_TYPE2) element types of this loop. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two (npy_half based) groups are disabled here, so the final #else group defines _CONVERT_FN */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active definition: maps any nonzero x to 1 and zero to 0. */
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] consecutive npy_bool items at args[0] into npy_ubyte items at args[1], copying each value via memmove; strides is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries for the memmove-based load/store; the [2] forms are disabled. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N is 0 (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_bool_to_ubyte\n");*/
+    /* One element per iteration; N counts down to 0. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer "#if 0" group uses a two-element src_value and is disabled; only the "#  elif !0" statement of the #else group is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write the converted temporary to dst via memmove. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous: step both pointers by one element size. */
+#if 1
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-specialization macros so a later expansion can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard: !1 is 0, so this "_aligned" variant is compiled regardless of NPY_USE_UNALIGNED_ACCESS. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+/* Active branch: source (_TYPE1) and destination (_TYPE2) element types of this loop. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two (npy_half based) groups are disabled here, so the final #else group defines _CONVERT_FN */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active definition: maps any nonzero x to 1 and zero to 0. */
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_bool items at args[0] into npy_ushort items at args[1] using direct typed loads/stores, stepping by strides[0]/strides[1] bytes. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries are declared here: both alternatives of each group below are disabled. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_bool_to_ushort\n");*/
+    /* One element per iteration; N counts down to 0. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer "#if 0" group uses a two-element src_value and is disabled; only the final "#  else" statement of the #else group is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Nothing is emitted by the group below: the cast above already stored directly into dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided: step both pointers by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-specialization macros so a later expansion can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: this non-"_aligned" variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+/* Active branch: source (_TYPE1) and destination (_TYPE2) element types of this loop. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two (npy_half based) groups are disabled here, so the final #else group defines _CONVERT_FN */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active definition: maps any nonzero x to 1 and zero to 0. */
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_bool items at args[0] into npy_ushort items at args[1], copying each value via memmove and stepping by strides[0]/strides[1] bytes. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_bool_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries for the memmove-based load/store; the [2] forms are disabled. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N is 0 (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_bool_to_ushort\n");*/
+    /* One element per iteration; N counts down to 0. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer "#if 0" group uses a two-element src_value and is disabled; only the "#  elif !0" statement of the #else group is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write the converted temporary to dst via memmove. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided: step both pointers by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-specialization macros so a later expansion can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard: !1 is 0, so this "_aligned" variant is compiled regardless of NPY_USE_UNALIGNED_ACCESS. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+/* Active branch: source (_TYPE1) and destination (_TYPE2) element types of this loop. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two (npy_half based) groups are disabled here, so the final #else group defines _CONVERT_FN */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active definition: maps any nonzero x to 1 and zero to 0. */
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] consecutive npy_bool items at args[0] into npy_ushort items at args[1] using direct typed loads/stores; strides is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries are declared here: both alternatives of each group below are disabled. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_bool_to_ushort\n");*/
+    /* One element per iteration; N counts down to 0. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer "#if 0" group uses a two-element src_value and is disabled; only the final "#  else" statement of the #else group is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Nothing is emitted by the group below: the cast above already stored directly into dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous: step both pointers by one element size. */
+#if 1
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-specialization macros so a later expansion can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: this non-"_aligned" variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+/* Active branch: source (_TYPE1) and destination (_TYPE2) element types of this loop. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two (npy_half based) groups are disabled here, so the final #else group defines _CONVERT_FN */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active definition: maps any nonzero x to 1 and zero to 0. */
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] consecutive npy_bool items at args[0] into npy_ushort items at args[1], copying each value via memmove; strides is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries for the memmove-based load/store; the [2] forms are disabled. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N is 0 (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_bool_to_ushort\n");*/
+    /* One element per iteration; N counts down to 0. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer "#if 0" group uses a two-element src_value and is disabled; only the "#  elif !0" statement of the #else group is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write the converted temporary to dst via memmove. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous: step both pointers by one element size. */
+#if 1
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-specialization macros so a later expansion can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Instantiation: aligned, strided npy_bool -> npy_uint cast loop. "!1" is 0, so the guard above is always true. */
+/* For half types, don't use actual double/float types in conversion (inactive here: "0 || 0" selects the #else arm) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else arm is live: ((npy_bool)(x != 0)) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], stepping by strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Both "#elif !1" tests below are false, so no src_value/dst_value temporaries exist in this variant */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: a non-empty run must start at addresses aligned for _TYPE1 / _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_bool_to_uint\n");*/
+    /* Per element: convert directly from *src to *dst, then advance src and dst */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the last one, which reads *src as _TYPE1 and writes the result through dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing is emitted here; the cast above already wrote to dst and the inner "#if 0" is dead */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the caller-supplied byte strides (the sizeof arm is dead) */
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Instantiation: generic (no alignment assumed), strided npy_bool -> npy_uint cast loop. "!0" is 1, so it is built only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion (inactive here: "0 || 0" selects the #else arm) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else arm is live: ((npy_bool)(x != 0)) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], stepping by strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_bool_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries (the "#elif !0" arms): elements are staged through these via memmove */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the "#if 0" above: this variant makes no alignment assumption) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_bool_to_uint\n");*/
+    /* Per element: load into src_value, convert, store from dst_value, then advance src and dst */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is "dst_value = _CONVERT_FN(src_value)", operating on the local copies */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: copy the converted value out bytewise, so dst needs no particular alignment */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the caller-supplied byte strides (the sizeof arm is dead) */
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Instantiation: aligned, contiguous npy_bool -> npy_uint cast loop. "!1" is 0, so the guard above is always true. */
+/* For half types, don't use actual double/float types in conversion (inactive here: "0 || 0" selects the #else arm) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else arm is live: ((npy_bool)(x != 0)) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], stepping by the element sizes (strides is not read); always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Both "#elif !1" tests below are false, so no src_value/dst_value temporaries exist in this variant */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: a non-empty run must start at addresses aligned for _TYPE1 / _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_bool_to_uint\n");*/
+    /* Per element: convert directly from *src to *dst, then advance src and dst */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the last one, which reads *src as _TYPE1 and writes the result through dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing is emitted here; the cast above already wrote to dst and the inner "#if 0" is dead */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by one element each (the stride arm is dead and the stride locals are not declared) */
+#if 1
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Instantiation: generic (no alignment assumed), contiguous npy_bool -> npy_uint cast loop. "!0" is 1, so it is built only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion (inactive here: "0 || 0" selects the #else arm) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else arm is live: ((npy_bool)(x != 0)) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], stepping by the element sizes (strides is not read); always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries (the "#elif !0" arms): elements are staged through these via memmove */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the "#if 0" above: this variant makes no alignment assumption) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_bool_to_uint\n");*/
+    /* Per element: load into src_value, convert, store from dst_value, then advance src and dst */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is "dst_value = _CONVERT_FN(src_value)", operating on the local copies */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: copy the converted value out bytewise, so dst needs no particular alignment */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by one element each (the stride arm is dead and the stride locals are not declared) */
+#if 1
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Instantiation: aligned, strided npy_bool -> npy_ulong cast loop. "!1" is 0, so the guard above is always true. */
+/* For half types, don't use actual double/float types in conversion (inactive here: "0 || 0" selects the #else arm) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else arm is live: ((npy_bool)(x != 0)) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], stepping by strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Both "#elif !1" tests below are false, so no src_value/dst_value temporaries exist in this variant */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: a non-empty run must start at addresses aligned for _TYPE1 / _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_bool_to_ulong\n");*/
+    /* Per element: convert directly from *src to *dst, then advance src and dst */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the last one, which reads *src as _TYPE1 and writes the result through dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing is emitted here; the cast above already wrote to dst and the inner "#if 0" is dead */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the caller-supplied byte strides (the sizeof arm is dead) */
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Instantiation: generic (no alignment assumed), strided npy_bool -> npy_ulong cast loop. "!0" is 1, so it is built only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion (inactive here: "0 || 0" selects the #else arm) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else arm is live: ((npy_bool)(x != 0)) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], stepping by strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_bool_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries (the "#elif !0" arms): elements are staged through these via memmove */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the "#if 0" above: this variant makes no alignment assumption) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_bool_to_ulong\n");*/
+    /* Per element: load into src_value, convert, store from dst_value, then advance src and dst */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is "dst_value = _CONVERT_FN(src_value)", operating on the local copies */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: copy the converted value out bytewise, so dst needs no particular alignment */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the caller-supplied byte strides (the sizeof arm is dead) */
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Instantiation: aligned, contiguous npy_bool -> npy_ulong cast loop. "!1" is 0, so the guard above is always true. */
+/* For half types, don't use actual double/float types in conversion (inactive here: "0 || 0" selects the #else arm) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else arm is live: ((npy_bool)(x != 0)) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], stepping by the element sizes (strides is not read); always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Both "#elif !1" tests below are false, so no src_value/dst_value temporaries exist in this variant */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: a non-empty run must start at addresses aligned for _TYPE1 / _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_bool_to_ulong\n");*/
+    /* Per element: convert directly from *src to *dst, then advance src and dst */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the last one, which reads *src as _TYPE1 and writes the result through dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing is emitted here; the cast above already wrote to dst and the inner "#if 0" is dead */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by one element each (the stride arm is dead and the stride locals are not declared) */
+#if 1
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Instantiation: generic (no alignment assumed), contiguous npy_bool -> npy_ulong cast loop. "!0" is 1, so it is built only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion (inactive here: "0 || 0" selects the #else arm) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else arm is live: ((npy_bool)(x != 0)) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], stepping by the element sizes (strides is not read); always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries (the "#elif !0" arms): elements are staged through these via memmove */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the "#if 0" above: this variant makes no alignment assumption) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_bool_to_ulong\n");*/
+    /* Per element: load into src_value, convert, store from dst_value, then advance src and dst */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is "dst_value = _CONVERT_FN(src_value)", operating on the local copies */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: copy the converted value out bytewise, so dst needs no particular alignment */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by one element each (the stride arm is dead and the stride locals are not declared) */
+#if 1
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Instantiation: aligned, strided npy_bool -> npy_ulonglong cast loop. "!1" is 0, so the guard above is always true. */
+/* For half types, don't use actual double/float types in conversion (inactive here: "0 || 0" selects the #else arm) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else arm is live: ((npy_bool)(x != 0)) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], stepping by strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Both "#elif !1" tests below are false, so no src_value/dst_value temporaries exist in this variant */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: a non-empty run must start at addresses aligned for _TYPE1 / _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_bool_to_ulonglong\n");*/
+    /* Per element: convert directly from *src to *dst, then advance src and dst */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the last one, which reads *src as _TYPE1 and writes the result through dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing is emitted here; the cast above already wrote to dst and the inner "#if 0" is dead */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the caller-supplied byte strides (the sizeof arm is dead) */
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Instantiation: generic (no alignment assumed), strided npy_bool -> npy_ulonglong cast loop. "!0" is 1, so it is built only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion (inactive here: "0 || 0" selects the #else arm) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else arm is live: ((npy_bool)(x != 0)) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 1
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], stepping by strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_bool_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries (the "#elif !0" arms): elements are staged through these via memmove */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the "#if 0" above: this variant makes no alignment assumption) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_bool_to_ulonglong\n");*/
+    /* Per element: load into src_value, convert, store from dst_value, then advance src and dst */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is "dst_value = _CONVERT_FN(src_value)", operating on the local copies */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: copy the converted value out bytewise, so dst needs no particular alignment */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the caller-supplied byte strides (the sizeof arm is dead) */
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `&& !1` makes the inner condition false, so this guard is always true. */
+
+/* Element types of this loop: npy_bool source, npy_ulonglong destination. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_ulonglong
+
+/* Collapse any nonzero source value to 1 and keep zero as 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Contiguous bool -> ulonglong cast using direct typed loads and stores.
+ *
+ * Converts dimensions[0] elements from args[0] to args[1]; the pointers
+ * advance by sizeof(npy_bool) and sizeof(npy_ulonglong) bytes per element.
+ * The assert()s check that both pointers are suitably aligned unless the
+ * count is zero.  `context` and `strides` are not referenced.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 in_val = *(_TYPE1 *)src;
+
+        *(_TYPE2 *)dst = _CONVERT_FN(in_val);
+
+        src += sizeof(_TYPE1);
+        dst += sizeof(_TYPE2);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The guard above is true only when NPY_USE_UNALIGNED_ACCESS is zero. */
+
+/* Element types of this loop: npy_bool source, npy_ulonglong destination. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_ulonglong
+
+/* Collapse any nonzero source value to 1 and keep zero as 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Contiguous bool -> ulonglong cast that makes no alignment assumption.
+ *
+ * Converts dimensions[0] elements from args[0] to args[1]; the pointers
+ * advance by sizeof(npy_bool) and sizeof(npy_ulonglong) bytes per element.
+ * Each element is staged through a local with memmove() rather than
+ * dereferenced through a typed pointer.  `context` and `strides` are not
+ * referenced.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining;
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (remaining = dimensions[0]; remaining != 0; --remaining) {
+        /* load, convert, store: all through byte copies */
+        memmove(&in_val, in_ptr, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(out_ptr, &out_val, sizeof(out_val));
+
+        in_ptr += sizeof(_TYPE1);
+        out_ptr += sizeof(_TYPE2);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `&& !1` makes the inner condition false, so this guard is always true. */
+
+/* Element types of this loop: npy_bool source, npy_byte destination. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_byte
+
+/* Collapse any nonzero source value to 1 and keep zero as 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Strided bool -> byte cast using direct typed loads and stores.
+ *
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1]; the pointers advance by strides[0] and strides[1] bytes per
+ * element.  The assert()s check that both starting pointers are suitably
+ * aligned unless the count is zero.  `context` is not referenced.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 in_val = *(_TYPE1 *)src;
+
+        *(_TYPE2 *)dst = _CONVERT_FN(in_val);
+
+        src += in_step;
+        dst += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The guard above is true only when NPY_USE_UNALIGNED_ACCESS is zero. */
+
+/* Element types of this loop: npy_bool source, npy_byte destination. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_byte
+
+/* Collapse any nonzero source value to 1 and keep zero as 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Strided bool -> byte cast that makes no alignment assumption.
+ *
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1]; the pointers advance by strides[0] and strides[1] bytes per
+ * element.  Each element is staged through a local with memmove() rather
+ * than dereferenced through a typed pointer.  `context` is not referenced.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_bool_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining;
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (remaining = dimensions[0]; remaining != 0; --remaining) {
+        /* load, convert, store: all through byte copies */
+        memmove(&in_val, in_ptr, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(out_ptr, &out_val, sizeof(out_val));
+
+        in_ptr += in_step;
+        out_ptr += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `&& !1` makes the inner condition false, so this guard is always true. */
+
+/* Element types of this loop: npy_bool source, npy_byte destination. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_byte
+
+/* Collapse any nonzero source value to 1 and keep zero as 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Contiguous bool -> byte cast using direct typed loads and stores.
+ *
+ * Converts dimensions[0] elements from args[0] to args[1]; the pointers
+ * advance by sizeof(npy_bool) and sizeof(npy_byte) bytes per element.
+ * The assert()s check that both pointers are suitably aligned unless the
+ * count is zero.  `context` and `strides` are not referenced.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 in_val = *(_TYPE1 *)src;
+
+        *(_TYPE2 *)dst = _CONVERT_FN(in_val);
+
+        src += sizeof(_TYPE1);
+        dst += sizeof(_TYPE2);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The guard above is true only when NPY_USE_UNALIGNED_ACCESS is zero. */
+
+/* Element types of this loop: npy_bool source, npy_byte destination. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_byte
+
+/* Collapse any nonzero source value to 1 and keep zero as 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Contiguous bool -> byte cast that makes no alignment assumption.
+ *
+ * Converts dimensions[0] elements from args[0] to args[1]; the pointers
+ * advance by sizeof(npy_bool) and sizeof(npy_byte) bytes per element.
+ * Each element is staged through a local with memmove() rather than
+ * dereferenced through a typed pointer.  `context` and `strides` are not
+ * referenced.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining;
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (remaining = dimensions[0]; remaining != 0; --remaining) {
+        /* load, convert, store: all through byte copies */
+        memmove(&in_val, in_ptr, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(out_ptr, &out_val, sizeof(out_val));
+
+        in_ptr += sizeof(_TYPE1);
+        out_ptr += sizeof(_TYPE2);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `&& !1` makes the inner condition false, so this guard is always true. */
+
+/* Element types of this loop: npy_bool source, npy_short destination. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_short
+
+/* Collapse any nonzero source value to 1 and keep zero as 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Strided bool -> short cast using direct typed loads and stores.
+ *
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1]; the pointers advance by strides[0] and strides[1] bytes per
+ * element.  The assert()s check that both starting pointers are suitably
+ * aligned unless the count is zero.  `context` is not referenced.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 in_val = *(_TYPE1 *)src;
+
+        *(_TYPE2 *)dst = _CONVERT_FN(in_val);
+
+        src += in_step;
+        dst += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The guard above is true only when NPY_USE_UNALIGNED_ACCESS is zero. */
+
+/* Element types of this loop: npy_bool source, npy_short destination. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_short
+
+/* Collapse any nonzero source value to 1 and keep zero as 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Strided bool -> short cast that makes no alignment assumption.
+ *
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1]; the pointers advance by strides[0] and strides[1] bytes per
+ * element.  Each element is staged through a local with memmove() rather
+ * than dereferenced through a typed pointer.  `context` is not referenced.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_bool_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining;
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (remaining = dimensions[0]; remaining != 0; --remaining) {
+        /* load, convert, store: all through byte copies */
+        memmove(&in_val, in_ptr, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(out_ptr, &out_val, sizeof(out_val));
+
+        in_ptr += in_step;
+        out_ptr += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `&& !1` makes the inner condition false, so this guard is always true. */
+
+/* Element types of this loop: npy_bool source, npy_short destination. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_short
+
+/* Collapse any nonzero source value to 1 and keep zero as 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Contiguous bool -> short cast using direct typed loads and stores.
+ *
+ * Converts dimensions[0] elements from args[0] to args[1]; the pointers
+ * advance by sizeof(npy_bool) and sizeof(npy_short) bytes per element.
+ * The assert()s check that both pointers are suitably aligned unless the
+ * count is zero.  `context` and `strides` are not referenced.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 in_val = *(_TYPE1 *)src;
+
+        *(_TYPE2 *)dst = _CONVERT_FN(in_val);
+
+        src += sizeof(_TYPE1);
+        dst += sizeof(_TYPE2);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* !0 is 1, so the unaligned variant below is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this expansion, so the #else branch below supplies _TYPE1 and _TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool /* element type read from src */
+#define _TYPE2 npy_short /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: selects the _CONVERT_FN actually used by the function below */
+
+#  if 0 || 1 /* true, so the 0/1-normalizing definition on the next line is the active one */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0)) /* yields 1 for any nonzero x and 0 otherwise; note x is not parenthesized */
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] contiguous npy_bool elements at args[0] to npy_short at args[1], moving each element through memmove temporaries. */
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_short(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] is the element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* false: stride locals are not declared in this contiguous variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* not taken: the two-element temporary is disabled */
+    _TYPE1 src_value[2];
+#elif !0 /* true: src is copied into this temporary before conversion */
+    _TYPE1 src_value;
+#endif
+#if 0 /* not taken */
+    _TYPE2 dst_value[2];
+#elif !0 /* true: the result is staged here and then copied out to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* false: no alignment is asserted for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_bool_to_short\n");*/
+
+    while (N--) { /* one iteration per element */
+#if 0 /* false: the memmove in the #else branch is compiled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* byte-wise copy; presumably chosen because src may be misaligned here */
+#endif
+
+/* Do the cast */
+#if 0 /* not taken; the #else branch further down is the live one */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* taken */
+#  if 0 /* not taken */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* true: convert via the temporaries */
+    dst_value = _CONVERT_FN(src_value); /* the one live conversion statement: normalize the copied bool to 0/1 and hold it as _TYPE2 */
+#  else /* not taken */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0 /* false: the memmove in the #else branch is compiled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* byte-wise copy of the staged result into dst */
+#endif
+
+#if 1 /* true: contiguous layout, step each pointer by its element size */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the loop has no error exit */
+}
+
+#undef _CONVERT_FN /* drop the per-function macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above this function */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* !1 is 0, so this condition is always true and the aligned variant below is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this expansion, so the #else branch below supplies _TYPE1 and _TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool /* element type read from src */
+#define _TYPE2 npy_int /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: selects the _CONVERT_FN actually used by the function below */
+
+#  if 0 || 1 /* true, so the 0/1-normalizing definition on the next line is the active one */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0)) /* yields 1 for any nonzero x and 0 otherwise; note x is not parenthesized */
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] strided npy_bool elements at args[0] to npy_int at args[1], using direct typed loads and stores. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_int(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] is the element count; strides[0] and strides[1] are the byte steps for src and dst */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* true: stride locals are declared and used to advance the pointers */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* not taken: the two-element temporary is disabled */
+    _TYPE1 src_value[2];
+#elif !1 /* false: no source temporary; src is dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0 /* not taken */
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no destination temporary; the result is stored directly through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* true: assert (when asserts are enabled) that both pointers meet their type alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_bool_to_int\n");*/
+
+    while (N--) { /* one iteration per element */
+#if 1 /* true, but the inner #if 0 removes the loads, so nothing is emitted here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* not taken; the #else branch further down is the live one */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* taken */
+#  if 0 /* not taken */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1 /* false */
+    dst_value = _CONVERT_FN(src_value);
+#  else /* taken */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src); /* the one live conversion statement: read a bool, normalize it to 0/1, store it as _TYPE2 */
+#  endif
+#endif
+
+#if 1 /* true, but the inner #if 0 removes the stores, so nothing is emitted here */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0 /* false: step by the caller-supplied strides in the #else branch */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the loop has no error exit */
+}
+
+#undef _CONVERT_FN /* drop the per-function macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above this function */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* !0 is 1, so the unaligned variant below is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this expansion, so the #else branch below supplies _TYPE1 and _TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool /* element type read from src */
+#define _TYPE2 npy_int /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: selects the _CONVERT_FN actually used by the function below */
+
+#  if 0 || 1 /* true, so the 0/1-normalizing definition on the next line is the active one */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0)) /* yields 1 for any nonzero x and 0 otherwise; note x is not parenthesized */
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] strided npy_bool elements at args[0] to npy_int at args[1], moving each element through memmove temporaries. */
+static NPY_GCC_OPT_3 int
+_cast_bool_to_int(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] is the element count; strides[0] and strides[1] are the byte steps for src and dst */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* true: stride locals are declared and used to advance the pointers */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* not taken: the two-element temporary is disabled */
+    _TYPE1 src_value[2];
+#elif !0 /* true: src is copied into this temporary before conversion */
+    _TYPE1 src_value;
+#endif
+#if 0 /* not taken */
+    _TYPE2 dst_value[2];
+#elif !0 /* true: the result is staged here and then copied out to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* false: no alignment is asserted for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_bool_to_int\n");*/
+
+    while (N--) { /* one iteration per element */
+#if 0 /* false: the memmove in the #else branch is compiled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* byte-wise copy; presumably chosen because src may be misaligned here */
+#endif
+
+/* Do the cast */
+#if 0 /* not taken; the #else branch further down is the live one */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* taken */
+#  if 0 /* not taken */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* true: convert via the temporaries */
+    dst_value = _CONVERT_FN(src_value); /* the one live conversion statement: normalize the copied bool to 0/1 and hold it as _TYPE2 */
+#  else /* not taken */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0 /* false: the memmove in the #else branch is compiled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* byte-wise copy of the staged result into dst */
+#endif
+
+#if 0 /* false: step by the caller-supplied strides in the #else branch */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the loop has no error exit */
+}
+
+#undef _CONVERT_FN /* drop the per-function macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above this function */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* !1 is 0, so this condition is always true and the aligned variant below is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this expansion, so the #else branch below supplies _TYPE1 and _TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool /* element type read from src */
+#define _TYPE2 npy_int /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: selects the _CONVERT_FN actually used by the function below */
+
+#  if 0 || 1 /* true, so the 0/1-normalizing definition on the next line is the active one */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0)) /* yields 1 for any nonzero x and 0 otherwise; note x is not parenthesized */
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] contiguous npy_bool elements at args[0] to npy_int at args[1], using direct typed loads and stores. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_int(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] is the element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* false: stride locals are not declared in this contiguous variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* not taken: the two-element temporary is disabled */
+    _TYPE1 src_value[2];
+#elif !1 /* false: no source temporary; src is dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0 /* not taken */
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no destination temporary; the result is stored directly through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* true: assert (when asserts are enabled) that both pointers meet their type alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_bool_to_int\n");*/
+
+    while (N--) { /* one iteration per element */
+#if 1 /* true, but the inner #if 0 removes the loads, so nothing is emitted here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* not taken; the #else branch further down is the live one */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* taken */
+#  if 0 /* not taken */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1 /* false */
+    dst_value = _CONVERT_FN(src_value);
+#  else /* taken */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src); /* the one live conversion statement: read a bool, normalize it to 0/1, store it as _TYPE2 */
+#  endif
+#endif
+
+#if 1 /* true, but the inner #if 0 removes the stores, so nothing is emitted here */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* true: contiguous layout, step each pointer by its element size */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the loop has no error exit */
+}
+
+#undef _CONVERT_FN /* drop the per-function macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above this function */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* !0 is 1, so the unaligned variant below is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this expansion, so the #else branch below supplies _TYPE1 and _TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool /* element type read from src */
+#define _TYPE2 npy_int /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: selects the _CONVERT_FN actually used by the function below */
+
+#  if 0 || 1 /* true, so the 0/1-normalizing definition on the next line is the active one */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0)) /* yields 1 for any nonzero x and 0 otherwise; note x is not parenthesized */
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] contiguous npy_bool elements at args[0] to npy_int at args[1], moving each element through memmove temporaries. */
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_int(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] is the element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* false: stride locals are not declared in this contiguous variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* not taken: the two-element temporary is disabled */
+    _TYPE1 src_value[2];
+#elif !0 /* true: src is copied into this temporary before conversion */
+    _TYPE1 src_value;
+#endif
+#if 0 /* not taken */
+    _TYPE2 dst_value[2];
+#elif !0 /* true: the result is staged here and then copied out to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* false: no alignment is asserted for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_bool_to_int\n");*/
+
+    while (N--) { /* one iteration per element */
+#if 0 /* false: the memmove in the #else branch is compiled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* byte-wise copy; presumably chosen because src may be misaligned here */
+#endif
+
+/* Do the cast */
+#if 0 /* not taken; the #else branch further down is the live one */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* taken */
+#  if 0 /* not taken */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* true: convert via the temporaries */
+    dst_value = _CONVERT_FN(src_value); /* the one live conversion statement: normalize the copied bool to 0/1 and hold it as _TYPE2 */
+#  else /* not taken */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0 /* false: the memmove in the #else branch is compiled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* byte-wise copy of the staged result into dst */
+#endif
+
+#if 1 /* true: contiguous layout, step each pointer by its element size */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the loop has no error exit */
+}
+
+#undef _CONVERT_FN /* drop the per-function macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above this function */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* !1 is 0, so this condition is always true and the aligned variant below is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this expansion, so the #else branch below supplies _TYPE1 and _TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool /* element type read from src */
+#define _TYPE2 npy_long /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: selects the _CONVERT_FN actually used by the function below */
+
+#  if 0 || 1 /* true, so the 0/1-normalizing definition on the next line is the active one */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0)) /* yields 1 for any nonzero x and 0 otherwise; note x is not parenthesized */
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] strided npy_bool elements at args[0] to npy_long at args[1], using direct typed loads and stores. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_long(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] is the element count; strides[0] and strides[1] are the byte steps for src and dst */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* true: stride locals are declared and used to advance the pointers */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* not taken: the two-element temporary is disabled */
+    _TYPE1 src_value[2];
+#elif !1 /* false: no source temporary; src is dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0 /* not taken */
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no destination temporary; the result is stored directly through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* true: assert (when asserts are enabled) that both pointers meet their type alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_bool_to_long\n");*/
+
+    while (N--) { /* one iteration per element */
+#if 1 /* true, but the inner #if 0 removes the loads, so nothing is emitted here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* not taken; the #else branch further down is the live one */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* taken */
+#  if 0 /* not taken */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1 /* false */
+    dst_value = _CONVERT_FN(src_value);
+#  else /* taken */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src); /* the one live conversion statement: read a bool, normalize it to 0/1, store it as _TYPE2 */
+#  endif
+#endif
+
+#if 1 /* true, but the inner #if 0 removes the stores, so nothing is emitted here */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0 /* false: step by the caller-supplied strides in the #else branch */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the loop has no error exit */
+}
+
+#undef _CONVERT_FN /* drop the per-function macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above this function */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* !0 is 1, so the unaligned variant below is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this expansion, so the #else branch below supplies _TYPE1 and _TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool /* element type read from src */
+#define _TYPE2 npy_long /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: selects the _CONVERT_FN actually used by the function below */
+
+#  if 0 || 1 /* true, so the 0/1-normalizing definition on the next line is the active one */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0)) /* yields 1 for any nonzero x and 0 otherwise; note x is not parenthesized */
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] strided npy_bool elements at args[0] to npy_long at args[1], moving each element through memmove temporaries. */
+static NPY_GCC_OPT_3 int
+_cast_bool_to_long(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] is the element count; strides[0] and strides[1] are the byte steps for src and dst */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* true: stride locals are declared and used to advance the pointers */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* not taken: the two-element temporary is disabled */
+    _TYPE1 src_value[2];
+#elif !0 /* true: src is copied into this temporary before conversion */
+    _TYPE1 src_value;
+#endif
+#if 0 /* not taken */
+    _TYPE2 dst_value[2];
+#elif !0 /* true: the result is staged here and then copied out to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* false: no alignment is asserted for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_bool_to_long\n");*/
+
+    while (N--) { /* one iteration per element */
+#if 0 /* false: the memmove in the #else branch is compiled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* byte-wise copy; presumably chosen because src may be misaligned here */
+#endif
+
+/* Do the cast */
+#if 0 /* not taken; the #else branch further down is the live one */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* taken */
+#  if 0 /* not taken */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* true: convert via the temporaries */
+    dst_value = _CONVERT_FN(src_value); /* the one live conversion statement: normalize the copied bool to 0/1 and hold it as _TYPE2 */
+#  else /* not taken */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0 /* false: the memmove in the #else branch is compiled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* byte-wise copy of the staged result into dst */
+#endif
+
+#if 0 /* false: step by the caller-supplied strides in the #else branch */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the loop has no error exit */
+}
+
+#undef _CONVERT_FN /* drop the per-function macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above this function */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* !1 is 0, so this condition is always true and the aligned variant below is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this expansion, so the #else branch below supplies _TYPE1 and _TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool /* element type read from src */
+#define _TYPE2 npy_long /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: selects the _CONVERT_FN actually used by the function below */
+
+#  if 0 || 1 /* true, so the 0/1-normalizing definition on the next line is the active one */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0)) /* yields 1 for any nonzero x and 0 otherwise; note x is not parenthesized */
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] contiguous npy_bool elements at args[0] to npy_long at args[1], using direct typed loads and stores. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_long(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] is the element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* false: stride locals are not declared in this contiguous variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* not taken: the two-element temporary is disabled */
+    _TYPE1 src_value[2];
+#elif !1 /* false: no source temporary; src is dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0 /* not taken */
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no destination temporary; the result is stored directly through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* true: assert (when asserts are enabled) that both pointers meet their type alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_bool_to_long\n");*/
+
+    while (N--) { /* one iteration per element */
+#if 1 /* true, but the inner #if 0 removes the loads, so nothing is emitted here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* not taken; the #else branch further down is the live one */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* taken */
+#  if 0 /* not taken */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1 /* false */
+    dst_value = _CONVERT_FN(src_value);
+#  else /* taken */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src); /* the one live conversion statement: read a bool, normalize it to 0/1, store it as _TYPE2 */
+#  endif
+#endif
+
+#if 1 /* true, but the inner #if 0 removes the stores, so nothing is emitted here */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* true: contiguous layout, step each pointer by its element size */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the loop has no error exit */
+}
+
+#undef _CONVERT_FN /* drop the per-function macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above this function */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* !0 is 1, so the unaligned variant below is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this expansion, so the #else branch below supplies _TYPE1 and _TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool /* element type read from src */
+#define _TYPE2 npy_long /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: selects the _CONVERT_FN actually used by the function below */
+
+#  if 0 || 1 /* true, so the 0/1-normalizing definition on the next line is the active one */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0)) /* yields 1 for any nonzero x and 0 otherwise; note x is not parenthesized */
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] contiguous npy_bool elements at args[0] to npy_long at args[1], moving each element through memmove temporaries. */
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_long(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] is the element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* false: stride locals are not declared in this contiguous variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* not taken: the two-element temporary is disabled */
+    _TYPE1 src_value[2];
+#elif !0 /* true: src is copied into this temporary before conversion */
+    _TYPE1 src_value;
+#endif
+#if 0 /* not taken */
+    _TYPE2 dst_value[2];
+#elif !0 /* true: the result is staged here and then copied out to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* false: no alignment is asserted for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_bool_to_long\n");*/
+
+    while (N--) { /* one iteration per element */
+#if 0 /* false: the memmove in the #else branch is compiled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* byte-wise copy; presumably chosen because src may be misaligned here */
+#endif
+
+/* Do the cast */
+#if 0 /* not taken; the #else branch further down is the live one */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* taken */
+#  if 0 /* not taken */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* true: convert via the temporaries */
+    dst_value = _CONVERT_FN(src_value); /* the one live conversion statement: normalize the copied bool to 0/1 and hold it as _TYPE2 */
+#  else /* not taken */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0 /* false: the memmove in the #else branch is compiled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* byte-wise copy of the staged result into dst */
+#endif
+
+#if 1 /* true: contiguous layout, step each pointer by its element size */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the loop has no error exit */
+}
+
+#undef _CONVERT_FN /* drop the per-function macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above this function */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Neither side is a half type, so the loop works on the plain C types. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_longlong
+
+/* Any nonzero source byte becomes 1, zero stays 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    /*
+     * Strided bool -> longlong cast for aligned operands: dimensions[0]
+     * elements are read from args[0] and stored to args[1], stepping by
+     * strides[0] and strides[1] bytes respectively.  Elements are accessed
+     * directly through typed pointers.
+     */
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining = dimensions[0];
+
+    /* sanity check: both pointers must be aligned unless there is no work */
+    assert(remaining == 0 || npy_is_aligned(in_ptr, NPY_ALIGNOF(_TYPE1)));
+    assert(remaining == 0 || npy_is_aligned(out_ptr, NPY_ALIGNOF(_TYPE2)));
+
+    for (; remaining != 0; --remaining) {
+        *(_TYPE2 *)out_ptr = _CONVERT_FN(*(_TYPE1 *)in_ptr);
+
+        in_ptr += in_step;
+        out_ptr += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Neither side is a half type, so the loop works on the plain C types. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_longlong
+
+/* Any nonzero source byte becomes 1, zero stays 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+static NPY_GCC_OPT_3 int
+_cast_bool_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    /*
+     * Strided bool -> longlong cast that makes no alignment assumption:
+     * dimensions[0] elements are read from args[0] and written to args[1],
+     * stepping by strides[0] and strides[1] bytes respectively.  Each
+     * element is staged through a local with memmove() rather than being
+     * dereferenced in place.
+     */
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; remaining != 0; --remaining) {
+        memmove(&in_val, in_ptr, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(out_ptr, &out_val, sizeof(out_val));
+
+        in_ptr += in_step;
+        out_ptr += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Neither side is a half type, so the loop works on the plain C types. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_longlong
+
+/* Any nonzero source byte becomes 1, zero stays 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    /*
+     * Contiguous bool -> longlong cast for aligned operands: dimensions[0]
+     * elements are read from args[0] and stored to args[1].  Both operands
+     * are walked with fixed element-size steps, so `strides` is never
+     * consulted.  Elements are accessed directly through typed pointers.
+     */
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining = dimensions[0];
+
+    /* sanity check: both pointers must be aligned unless there is no work */
+    assert(remaining == 0 || npy_is_aligned(in_ptr, NPY_ALIGNOF(_TYPE1)));
+    assert(remaining == 0 || npy_is_aligned(out_ptr, NPY_ALIGNOF(_TYPE2)));
+
+    for (; remaining != 0; --remaining) {
+        *(_TYPE2 *)out_ptr = _CONVERT_FN(*(_TYPE1 *)in_ptr);
+
+        in_ptr += sizeof(npy_bool);
+        out_ptr += sizeof(npy_longlong);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Neither side is a half type, so the loop works on the plain C types. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_longlong
+
+/* Any nonzero source byte becomes 1, zero stays 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    /*
+     * Contiguous bool -> longlong cast that makes no alignment assumption:
+     * dimensions[0] elements are read from args[0] and written to args[1].
+     * Both operands are walked with fixed element-size steps, so `strides`
+     * is never consulted.  Each element is staged through a local with
+     * memmove() rather than being dereferenced in place.
+     */
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; remaining != 0; --remaining) {
+        memmove(&in_val, in_ptr, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(out_ptr, &out_val, sizeof(out_val));
+
+        in_ptr += sizeof(npy_bool);
+        out_ptr += sizeof(npy_longlong);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* The destination is a half; it is stored as npy_half. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_half
+
+/* Nonzero maps to the half encoding of 1.0f, zero to that of 0.0f. */
+#define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    /*
+     * Strided bool -> half cast for aligned operands: dimensions[0]
+     * elements are read from args[0] and stored to args[1], stepping by
+     * strides[0] and strides[1] bytes respectively.  Elements are accessed
+     * directly through typed pointers.
+     */
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining = dimensions[0];
+
+    /* sanity check: both pointers must be aligned unless there is no work */
+    assert(remaining == 0 || npy_is_aligned(in_ptr, NPY_ALIGNOF(_TYPE1)));
+    assert(remaining == 0 || npy_is_aligned(out_ptr, NPY_ALIGNOF(_TYPE2)));
+
+    for (; remaining != 0; --remaining) {
+        *(_TYPE2 *)out_ptr = _CONVERT_FN(*(_TYPE1 *)in_ptr);
+
+        in_ptr += in_step;
+        out_ptr += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* The destination is a half; it is stored as npy_half. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_half
+
+/* Nonzero maps to the half encoding of 1.0f, zero to that of 0.0f. */
+#define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+
+static NPY_GCC_OPT_3 int
+_cast_bool_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    /*
+     * Strided bool -> half cast that makes no alignment assumption:
+     * dimensions[0] elements are read from args[0] and written to args[1],
+     * stepping by strides[0] and strides[1] bytes respectively.  Each
+     * element is staged through a local with memmove() rather than being
+     * dereferenced in place.
+     */
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; remaining != 0; --remaining) {
+        memmove(&in_val, in_ptr, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(out_ptr, &out_val, sizeof(out_val));
+
+        in_ptr += in_step;
+        out_ptr += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* The destination is a half; it is stored as npy_half. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_half
+
+/* Nonzero maps to the half encoding of 1.0f, zero to that of 0.0f. */
+#define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    /*
+     * Contiguous bool -> half cast for aligned operands: dimensions[0]
+     * elements are read from args[0] and stored to args[1].  Both operands
+     * are walked with fixed element-size steps, so `strides` is never
+     * consulted.  Elements are accessed directly through typed pointers.
+     */
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining = dimensions[0];
+
+    /* sanity check: both pointers must be aligned unless there is no work */
+    assert(remaining == 0 || npy_is_aligned(in_ptr, NPY_ALIGNOF(_TYPE1)));
+    assert(remaining == 0 || npy_is_aligned(out_ptr, NPY_ALIGNOF(_TYPE2)));
+
+    for (; remaining != 0; --remaining) {
+        *(_TYPE2 *)out_ptr = _CONVERT_FN(*(_TYPE1 *)in_ptr);
+
+        in_ptr += sizeof(npy_bool);
+        out_ptr += sizeof(npy_half);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* The destination is a half; it is stored as npy_half. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_half
+
+/* Nonzero maps to the half encoding of 1.0f, zero to that of 0.0f. */
+#define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    /*
+     * Contiguous bool -> half cast that makes no alignment assumption:
+     * dimensions[0] elements are read from args[0] and written to args[1].
+     * Both operands are walked with fixed element-size steps, so `strides`
+     * is never consulted.  Each element is staged through a local with
+     * memmove() rather than being dereferenced in place.
+     */
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; remaining != 0; --remaining) {
+        memmove(&in_val, in_ptr, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(out_ptr, &out_val, sizeof(out_val));
+
+        in_ptr += sizeof(npy_bool);
+        out_ptr += sizeof(npy_half);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Neither side is a half type, so the loop works on the plain C types. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_float
+
+/* Any nonzero source byte becomes 1, zero stays 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    /*
+     * Strided bool -> float cast for aligned operands: dimensions[0]
+     * elements are read from args[0] and stored to args[1], stepping by
+     * strides[0] and strides[1] bytes respectively.  Elements are accessed
+     * directly through typed pointers.
+     */
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining = dimensions[0];
+
+    /* sanity check: both pointers must be aligned unless there is no work */
+    assert(remaining == 0 || npy_is_aligned(in_ptr, NPY_ALIGNOF(_TYPE1)));
+    assert(remaining == 0 || npy_is_aligned(out_ptr, NPY_ALIGNOF(_TYPE2)));
+
+    for (; remaining != 0; --remaining) {
+        *(_TYPE2 *)out_ptr = _CONVERT_FN(*(_TYPE1 *)in_ptr);
+
+        in_ptr += in_step;
+        out_ptr += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* No half type on either side, so the real C element types are used. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_float
+
+/* Source is a bool: any nonzero value becomes 1, zero stays 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Strided bool -> float cast loop that makes no alignment assumption.
+ *
+ * args[0] / args[1] are the source / destination byte pointers,
+ * dimensions[0] is the number of elements to convert, and strides[0] /
+ * strides[1] are the byte steps applied to source / destination after each
+ * element.  `context` is not read.  Always returns 0.
+ *
+ * This variant is only compiled when NPY_USE_UNALIGNED_ACCESS is false.
+ */
+static NPY_GCC_OPT_3 int
+_cast_bool_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_intp src_stride = strides[0];
+    npy_intp dst_stride = strides[1];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; N != 0; --N) {
+        /*
+         * Elements travel through typed locals via memmove, so src and dst
+         * are never dereferenced as _TYPE1 / _TYPE2 pointers.
+         */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* No half type on either side, so the real C element types are used. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_float
+
+/* Source is a bool: any nonzero value becomes 1, zero stays 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Bool -> float cast loop for aligned, tightly packed buffers.
+ *
+ * args[0] / args[1] are the source / destination byte pointers and
+ * dimensions[0] is the number of elements to convert.  `strides` and
+ * `context` are not read: the pointers advance by the element sizes
+ * instead.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* Both pointers must honour their element alignment unless N is 0. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Typed load, convert, typed store in a single statement. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+        src += sizeof(npy_bool);
+        dst += sizeof(npy_float);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* No half type on either side, so the real C element types are used. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_float
+
+/* Source is a bool: any nonzero value becomes 1, zero stays 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Bool -> float cast loop for tightly packed buffers, with no alignment
+ * assumption.
+ *
+ * args[0] / args[1] are the source / destination byte pointers and
+ * dimensions[0] is the number of elements to convert.  `strides` and
+ * `context` are not read: the pointers advance by the element sizes
+ * instead.  Always returns 0.
+ *
+ * This variant is only compiled when NPY_USE_UNALIGNED_ACCESS is false.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; N != 0; --N) {
+        /*
+         * Elements travel through typed locals via memmove, so src and dst
+         * are never dereferenced as _TYPE1 / _TYPE2 pointers.
+         */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        src += sizeof(npy_bool);
+        dst += sizeof(npy_float);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* No half type on either side, so the real C element types are used. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_double
+
+/* Source is a bool: any nonzero value becomes 1, zero stays 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Strided bool -> double cast loop, aligned variant.
+ *
+ * args[0] / args[1] are the source / destination byte pointers,
+ * dimensions[0] is the number of elements to convert, and strides[0] /
+ * strides[1] are the byte steps applied to source / destination after each
+ * element.  `context` is not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_intp src_stride = strides[0];
+    npy_intp dst_stride = strides[1];
+
+    /* Both pointers must honour their element alignment unless N is 0. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Typed load, convert, typed store in a single statement. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+        src += src_stride;
+        dst += dst_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* No half type on either side, so the real C element types are used. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_double
+
+/* Source is a bool: any nonzero value becomes 1, zero stays 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Strided bool -> double cast loop that makes no alignment assumption.
+ *
+ * args[0] / args[1] are the source / destination byte pointers,
+ * dimensions[0] is the number of elements to convert, and strides[0] /
+ * strides[1] are the byte steps applied to source / destination after each
+ * element.  `context` is not read.  Always returns 0.
+ *
+ * This variant is only compiled when NPY_USE_UNALIGNED_ACCESS is false.
+ */
+static NPY_GCC_OPT_3 int
+_cast_bool_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_intp src_stride = strides[0];
+    npy_intp dst_stride = strides[1];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; N != 0; --N) {
+        /*
+         * Elements travel through typed locals via memmove, so src and dst
+         * are never dereferenced as _TYPE1 / _TYPE2 pointers.
+         */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* No half type on either side, so the real C element types are used. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_double
+
+/* Source is a bool: any nonzero value becomes 1, zero stays 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Bool -> double cast loop for aligned, tightly packed buffers.
+ *
+ * args[0] / args[1] are the source / destination byte pointers and
+ * dimensions[0] is the number of elements to convert.  `strides` and
+ * `context` are not read: the pointers advance by the element sizes
+ * instead.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* Both pointers must honour their element alignment unless N is 0. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Typed load, convert, typed store in a single statement. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+        src += sizeof(npy_bool);
+        dst += sizeof(npy_double);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* No half type on either side, so the real C element types are used. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_double
+
+/* Source is a bool: any nonzero value becomes 1, zero stays 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Bool -> double cast loop for tightly packed buffers, with no alignment
+ * assumption.
+ *
+ * args[0] / args[1] are the source / destination byte pointers and
+ * dimensions[0] is the number of elements to convert.  `strides` and
+ * `context` are not read: the pointers advance by the element sizes
+ * instead.  Always returns 0.
+ *
+ * This variant is only compiled when NPY_USE_UNALIGNED_ACCESS is false.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; N != 0; --N) {
+        /*
+         * Elements travel through typed locals via memmove, so src and dst
+         * are never dereferenced as _TYPE1 / _TYPE2 pointers.
+         */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        src += sizeof(npy_bool);
+        dst += sizeof(npy_double);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* No half type on either side, so the real C element types are used. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_longdouble
+
+/* Source is a bool: any nonzero value becomes 1, zero stays 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Strided bool -> long double cast loop, aligned variant.
+ *
+ * args[0] / args[1] are the source / destination byte pointers,
+ * dimensions[0] is the number of elements to convert, and strides[0] /
+ * strides[1] are the byte steps applied to source / destination after each
+ * element.  `context` is not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_intp src_stride = strides[0];
+    npy_intp dst_stride = strides[1];
+
+    /* Both pointers must honour their element alignment unless N is 0. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Typed load, convert, typed store in a single statement. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+        src += src_stride;
+        dst += dst_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* No half type on either side, so the real C element types are used. */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_longdouble
+
+/* Source is a bool: any nonzero value becomes 1, zero stays 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Strided bool -> long double cast loop that makes no alignment assumption.
+ *
+ * args[0] / args[1] are the source / destination byte pointers,
+ * dimensions[0] is the number of elements to convert, and strides[0] /
+ * strides[1] are the byte steps applied to source / destination after each
+ * element.  `context` is not read.  Always returns 0.
+ *
+ * This variant is only compiled when NPY_USE_UNALIGNED_ACCESS is false.
+ */
+static NPY_GCC_OPT_3 int
+_cast_bool_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_intp src_stride = strides[0];
+    npy_intp dst_stride = strides[1];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; N != 0; --N) {
+        /*
+         * Elements travel through typed locals via memmove, so src and dst
+         * are never dereferenced as _TYPE1 / _TYPE2 pointers.
+         */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Element types read from the input buffer and written to the output buffer */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_longdouble
+
+/* Convert by comparing against zero, which yields an npy_bool of 0 or 1 */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Aligned, contiguous cast loop.
+ *
+ * dimensions[0] gives the number of elements and args[0]/args[1] the input
+ * and output buffers; strides is not consulted because both buffers are
+ * walked in steps of their element size.  Elements are read and written
+ * directly through typed pointers, so both buffers must be suitably
+ * aligned (asserted for non-empty input).  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_bool);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Element types read from the input buffer and written to the output buffer */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_longdouble
+
+/* Convert by comparing against zero, which yields an npy_bool of 0 or 1 */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Contiguous cast loop that makes no alignment assumption.
+ *
+ * dimensions[0] gives the number of elements and args[0]/args[1] the input
+ * and output buffers; strides is not consulted because both buffers are
+ * walked in steps of their element size.  Each element is moved through
+ * local temporaries with memmove around the conversion.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 in_elem;
+    _TYPE2 out_elem;
+
+    for (; N != 0; --N) {
+        /* go through temporaries so unaligned buffers are never dereferenced */
+        memmove(&in_elem, src, sizeof(in_elem));
+        out_elem = _CONVERT_FN(in_elem);
+        memmove(dst, &out_elem, sizeof(out_elem));
+
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_bool);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Input element type, and the component type of each two-part output element */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_float
+
+/* Convert by comparing against zero, which yields an npy_bool of 0 or 1 */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Aligned, strided cast loop producing two-component output elements.
+ *
+ * dimensions[0] gives the number of elements, args[0]/args[1] the input and
+ * output buffers, and strides[0]/strides[1] the byte steps between
+ * consecutive input/output elements.  For every input element the first
+ * output component receives the converted value and the second is set to
+ * 0.  Both buffers are accessed through typed pointers and must be suitably
+ * aligned (asserted for non-empty input).  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE2 out_pair[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        out_pair[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        out_pair[1] = 0;
+
+        ((_TYPE2 *)dst)[0] = out_pair[0];
+        ((_TYPE2 *)dst)[1] = out_pair[1];
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Input element type, and the component type of each two-part output element */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_float
+
+/* Convert by comparing against zero, which yields an npy_bool of 0 or 1 */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Strided cast loop producing two-component output elements, with no
+ * alignment assumption.
+ *
+ * dimensions[0] gives the number of elements, args[0]/args[1] the input and
+ * output buffers, and strides[0]/strides[1] the byte steps between
+ * consecutive input/output elements.  Each input element is copied into a
+ * local temporary with memmove; the output pair (converted value, then 0)
+ * is assembled locally and copied out with a single memmove.  Always
+ * returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_bool_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 in_elem;
+    _TYPE2 out_pair[2];
+
+    for (; N != 0; --N) {
+        /* go through temporaries so unaligned buffers are never dereferenced */
+        memmove(&in_elem, src, sizeof(in_elem));
+
+        out_pair[0] = _CONVERT_FN(in_elem);
+        out_pair[1] = 0;
+
+        memmove(dst, out_pair, sizeof(out_pair));
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Input element type, and the component type of each two-part output element */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_float
+
+/* Convert by comparing against zero, which yields an npy_bool of 0 or 1 */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Aligned, contiguous cast loop producing two-component output elements.
+ *
+ * dimensions[0] gives the number of elements and args[0]/args[1] the input
+ * and output buffers; strides is not consulted because the input advances
+ * by sizeof(npy_bool) and the output by sizeof(npy_cfloat).  For every
+ * input element the first output component receives the converted value
+ * and the second is set to 0.  Both buffers are accessed through typed
+ * pointers and must be suitably aligned (asserted for non-empty input).
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE2 out_pair[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        out_pair[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        out_pair[1] = 0;
+
+        ((_TYPE2 *)dst)[0] = out_pair[0];
+        ((_TYPE2 *)dst)[1] = out_pair[1];
+
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_bool);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Input element type, and the component type of each two-part output element */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_float
+
+/* Convert by comparing against zero, which yields an npy_bool of 0 or 1 */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Contiguous cast loop producing two-component output elements, with no
+ * alignment assumption.
+ *
+ * dimensions[0] gives the number of elements and args[0]/args[1] the input
+ * and output buffers; strides is not consulted because the input advances
+ * by sizeof(npy_bool) and the output by sizeof(npy_cfloat).  Each input
+ * element is copied into a local temporary with memmove; the output pair
+ * (converted value, then 0) is assembled locally and copied out with a
+ * single memmove.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 in_elem;
+    _TYPE2 out_pair[2];
+
+    for (; N != 0; --N) {
+        /* go through temporaries so unaligned buffers are never dereferenced */
+        memmove(&in_elem, src, sizeof(in_elem));
+
+        out_pair[0] = _CONVERT_FN(in_elem);
+        out_pair[1] = 0;
+
+        memmove(dst, out_pair, sizeof(out_pair));
+
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_bool);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Input element type, and the component type of each two-part output element */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_double
+
+/* Convert by comparing against zero, which yields an npy_bool of 0 or 1 */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Aligned, strided cast loop producing two-component output elements.
+ *
+ * dimensions[0] gives the number of elements, args[0]/args[1] the input and
+ * output buffers, and strides[0]/strides[1] the byte steps between
+ * consecutive input/output elements.  For every input element the first
+ * output component receives the converted value and the second is set to
+ * 0.  Both buffers are accessed through typed pointers and must be suitably
+ * aligned (asserted for non-empty input).  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE2 out_pair[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        out_pair[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        out_pair[1] = 0;
+
+        ((_TYPE2 *)dst)[0] = out_pair[0];
+        ((_TYPE2 *)dst)[1] = out_pair[1];
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Input element type, and the component type of each two-part output element */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_double
+
+/* Convert by comparing against zero, which yields an npy_bool of 0 or 1 */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Strided cast loop producing two-component output elements, with no
+ * alignment assumption.
+ *
+ * dimensions[0] gives the number of elements, args[0]/args[1] the input and
+ * output buffers, and strides[0]/strides[1] the byte steps between
+ * consecutive input/output elements.  Each input element is copied into a
+ * local temporary with memmove; the output pair (converted value, then 0)
+ * is assembled locally and copied out with a single memmove.  Always
+ * returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_bool_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 in_elem;
+    _TYPE2 out_pair[2];
+
+    for (; N != 0; --N) {
+        /* go through temporaries so unaligned buffers are never dereferenced */
+        memmove(&in_elem, src, sizeof(in_elem));
+
+        out_pair[0] = _CONVERT_FN(in_elem);
+        out_pair[1] = 0;
+
+        memmove(dst, out_pair, sizeof(out_pair));
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Input element type, and the component type of each two-part output element */
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_double
+
+/* Convert by comparing against zero, which yields an npy_bool of 0 or 1 */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Aligned, contiguous cast loop producing two-component output elements.
+ *
+ * dimensions[0] gives the number of elements and args[0]/args[1] the input
+ * and output buffers; strides is not consulted because the input advances
+ * by sizeof(npy_bool) and the output by sizeof(npy_cdouble).  For every
+ * input element the first output component receives the converted value
+ * and the second is set to 0.  Both buffers are accessed through typed
+ * pointers and must be suitably aligned (asserted for non-empty input).
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE2 out_pair[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        out_pair[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        out_pair[1] = 0;
+
+        ((_TYPE2 *)dst)[0] = out_pair[0];
+        ((_TYPE2 *)dst)[1] = out_pair[1];
+
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_bool);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* both conditions above are 0, so this arm is compiled */
+
+#  if 0 || 1  /* true: result is 1 for nonzero x, else 0 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cast, no alignment assumed: per element, stores (src != 0) and 0 as two npy_double. */
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* read and write cursors */
+#if !1  /* false: this variant never reads strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: scalar staging copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 1  /* true: staging pair for the two output components */
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: no alignment asserts in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_bool_to_cdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled: byte-wise load instead of a typed dereference */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer condition is 0, so only the final #else arm is compiled */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* first component: 0 or 1 */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled: byte-wise store of both components */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* true: advance by fixed element sizes */
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* unconditional: the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, since !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* both conditions above are 0, so this arm is compiled */
+
+#  if 0 || 1  /* true: result is 1 for nonzero x, else 0 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided cast: per element, stores (src != 0) and 0 as two npy_longdouble values. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_bool_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* read and write cursors */
+#if !0  /* true: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no staging copy; src is dereferenced directly below */
+    _TYPE1 src_value;
+#endif
+#if 1  /* true: staging pair for the two output components */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: typed access below needs both buffers aligned (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_bool_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer condition is 0, so only the final #else arm is compiled */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* first component: 0 or 1 */
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* true: typed stores of both components */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_bool);
+#else  /* compiled: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* unconditional: the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* both conditions above are 0, so this arm is compiled */
+
+#  if 0 || 1  /* true: result is 1 for nonzero x, else 0 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cast, no alignment assumed: per element, stores (src != 0) and 0 as two npy_longdouble. */
+static NPY_GCC_OPT_3 int
+_cast_bool_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* read and write cursors */
+#if !0  /* true: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: scalar staging copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 1  /* true: staging pair for the two output components */
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: no alignment asserts in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_bool_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled: byte-wise load instead of a typed dereference */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer condition is 0, so only the final #else arm is compiled */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* first component: 0 or 1 */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled: byte-wise store of both components */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_bool);
+#else  /* compiled: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* unconditional: the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, since !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* both conditions above are 0, so this arm is compiled */
+
+#  if 0 || 1  /* true: result is 1 for nonzero x, else 0 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous cast: per element, stores (src != 0) and 0 as two npy_longdouble values. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_bool_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* read and write cursors */
+#if !1  /* false: this variant never reads strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no staging copy; src is dereferenced directly below */
+    _TYPE1 src_value;
+#endif
+#if 1  /* true: staging pair for the two output components */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: typed access below needs both buffers aligned (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_bool_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer condition is 0, so only the final #else arm is compiled */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* first component: 0 or 1 */
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* true: typed stores of both components */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* true: advance by fixed element sizes */
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* unconditional: the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_bool
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_bool
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* both conditions above are 0, so this arm is compiled */
+
+#  if 0 || 1  /* true: result is 1 for nonzero x, else 0 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cast, no alignment assumed: per element, stores (src != 0) and 0 as two npy_longdouble. */
+static NPY_GCC_OPT_3 int
+_contig_cast_bool_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* read and write cursors */
+#if !1  /* false: this variant never reads strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: scalar staging copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 1  /* true: staging pair for the two output components */
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: no alignment asserts in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_bool_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled: byte-wise load instead of a typed dereference */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer condition is 0, so only the final #else arm is compiled */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* first component: 0 or 1 */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled: byte-wise store of both components */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* true: advance by fixed element sizes */
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_bool);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* unconditional: the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, since !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* both conditions above are 0, so this arm is compiled */
+
+#  if 1 || 0  /* true: result is 1 for nonzero x, else 0 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided cast: stores (npy_bool)(x != 0) for each npy_ubyte x read from the source. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* read and write cursors */
+#if !0  /* true: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no staging copy of the source */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no staging copy of the result */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: typed access below needs both buffers aligned (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ubyte_to_bool\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer condition is 0, so only the final #else arm is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load, convert, typed store */
+#  endif
+#endif
+
+#if 1  /* true, but the inner condition is 0: nothing is emitted here */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_ubyte);
+#else  /* compiled: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* unconditional: the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* both conditions above are 0, so this arm is compiled */
+
+#  if 1 || 0  /* true: result is 1 for nonzero x, else 0 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cast, no alignment assumed: stores (npy_bool)(x != 0) for each npy_ubyte x. */
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* read and write cursors */
+#if !0  /* true: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: scalar staging copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: scalar staging copy of the result */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: no alignment asserts in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ubyte_to_bool\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled: byte-wise load instead of a typed dereference */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer condition is 0, so only the final #else arm is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* 0 or 1 */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled: byte-wise store of the result */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_ubyte);
+#else  /* compiled: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* unconditional: the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, since !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* both conditions above are 0, so this arm is compiled */
+
+#  if 1 || 0  /* true: result is 1 for nonzero x, else 0 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous cast: stores (npy_bool)(x != 0) for each npy_ubyte x read from the source. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* read and write cursors */
+#if !1  /* false: this variant never reads strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no staging copy of the source */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no staging copy of the result */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: typed access below needs both buffers aligned (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ubyte_to_bool\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer condition is 0, so only the final #else arm is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load, convert, typed store */
+#  endif
+#endif
+
+#if 1  /* true, but the inner condition is 0: nothing is emitted here */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* true: advance by fixed element sizes */
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* unconditional: the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* both conditions above are 0, so this arm is compiled */
+
+#  if 1 || 0  /* true: result is 1 for nonzero x, else 0 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cast, no alignment assumed: stores (npy_bool)(x != 0) for each npy_ubyte x. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* read and write cursors */
+#if !1  /* false: this variant never reads strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: scalar staging copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: scalar staging copy of the result */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: no alignment asserts in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ubyte_to_bool\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled: byte-wise load instead of a typed dereference */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer condition is 0, so only the final #else arm is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* 0 or 1 */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled: byte-wise store of the result */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* true: advance by fixed element sizes */
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* unconditional: the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true, because !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else /* taken: the condition above is the constant 0 */
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: both conditions above are the constant 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* taken: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_ubyte -> npy_ubyte cast loop, aligned pointers; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* input and output cursors */
+#if !0 /* true: this variant steps by the caller's byte strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: no temporary, src is read through a typed pointer */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no temporary, dst is written through a typed pointer */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: both pointers must be aligned unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ubyte_to_ubyte\n");*/
+
+    while (N--) {
+#if 1 /* true, but the nested block is disabled: nothing is loaded here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* taken: load, convert and store in a single statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* true, but the nested block is disabled: already stored above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_ubyte);
+#else /* taken: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* only if NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else /* taken: the condition above is the constant 0 */
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: both conditions above are the constant 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* taken: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_ubyte -> npy_ubyte cast loop using memmove copies; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* input and output cursors */
+#if !0 /* true: this variant steps by the caller's byte strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: temporary that memmove fills from src */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: temporary that memmove copies out to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* disabled: this variant asserts nothing about alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ubyte_to_ubyte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* taken: copy the element bytes from src into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* taken: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* taken: copy the converted bytes from dst_value to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_ubyte);
+#else /* taken: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true, because !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else /* taken: the condition above is the constant 0 */
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: both conditions above are the constant 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* taken: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_ubyte -> npy_ubyte cast loop, aligned pointers; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* input and output cursors */
+#if !1 /* false: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: no temporary, src is read through a typed pointer */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no temporary, dst is written through a typed pointer */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: both pointers must be aligned unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ubyte_to_ubyte\n");*/
+
+    while (N--) {
+#if 1 /* true, but the nested block is disabled: nothing is loaded here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* taken: load, convert and store in a single statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* true, but the nested block is disabled: already stored above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* true: step each cursor by one element of its type */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* only if NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else /* taken: the condition above is the constant 0 */
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: both conditions above are the constant 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* taken: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_ubyte -> npy_ubyte cast loop using memmove; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* input and output cursors */
+#if !1 /* false: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: temporary that memmove fills from src */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: temporary that memmove copies out to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* disabled: this variant asserts nothing about alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ubyte_to_ubyte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* taken: copy the element bytes from src into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* taken: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* taken: copy the converted bytes from dst_value to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* true: step each cursor by one element of its type */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true, because !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else /* taken: the condition above is the constant 0 */
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: both conditions above are the constant 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* taken: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_ubyte -> npy_ushort cast loop, aligned pointers; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* input and output cursors */
+#if !0 /* true: this variant steps by the caller's byte strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: no temporary, src is read through a typed pointer */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no temporary, dst is written through a typed pointer */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: both pointers must be aligned unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ubyte_to_ushort\n");*/
+
+    while (N--) {
+#if 1 /* true, but the nested block is disabled: nothing is loaded here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* taken: load, convert and store in a single statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* true, but the nested block is disabled: already stored above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_ubyte);
+#else /* taken: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* only if NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else /* taken: the condition above is the constant 0 */
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: both conditions above are the constant 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* taken: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_ubyte -> npy_ushort cast loop using memmove copies; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* input and output cursors */
+#if !0 /* true: this variant steps by the caller's byte strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: temporary that memmove fills from src */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: temporary that memmove copies out to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* disabled: this variant asserts nothing about alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ubyte_to_ushort\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* taken: copy the element bytes from src into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* taken: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* taken: copy the converted bytes from dst_value to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_ubyte);
+#else /* taken: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true, because !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else /* taken: the condition above is the constant 0 */
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: both conditions above are the constant 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* taken: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_ubyte -> npy_ushort cast loop, aligned pointers; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* input and output cursors */
+#if !1 /* false: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: no temporary, src is read through a typed pointer */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no temporary, dst is written through a typed pointer */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: both pointers must be aligned unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ubyte_to_ushort\n");*/
+
+    while (N--) {
+#if 1 /* true, but the nested block is disabled: nothing is loaded here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* taken: load, convert and store in a single statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* true, but the nested block is disabled: already stored above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* true: step each cursor by one element of its type */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* only if NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else /* taken: the condition above is the constant 0 */
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: both conditions above are the constant 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* taken: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_ubyte -> npy_ushort cast loop using memmove; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* input and output cursors */
+#if !1 /* false: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: temporary that memmove fills from src */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: temporary that memmove copies out to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* disabled: this variant asserts nothing about alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ubyte_to_ushort\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* taken: copy the element bytes from src into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* taken: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* taken: copy the converted bytes from dst_value to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* true: step each cursor by one element of its type */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true, because !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* taken: the condition above is the constant 0 */
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* taken: both conditions above are the constant 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* taken: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_ubyte -> npy_uint cast loop, aligned pointers; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* input and output cursors */
+#if !0 /* true: this variant steps by the caller's byte strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: no temporary, src is read through a typed pointer */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no temporary, dst is written through a typed pointer */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: both pointers must be aligned unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ubyte_to_uint\n");*/
+
+    while (N--) {
+#if 1 /* true, but the nested block is disabled: nothing is loaded here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* taken: load, convert and store in a single statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* true, but the nested block is disabled: already stored above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_ubyte);
+#else /* taken: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_uint(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element steps in bytes */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* scratch copy of the current input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* scratch holder for the converted element */
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant places no alignment requirement on src/dst) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Strided loop, no alignment assumed: cast N npy_ubyte values at src to npy_uint at dst via memmove'd scratch values. */
+    /* Disabled debug trace: printf("_cast_ubyte_to_uint\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy in: no typed read through src */
+#endif
+
+/* Do the cast: with these constants only `dst_value = _CONVERT_FN(src_value);` is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active: _CONVERT_FN is a plain (_TYPE2) cast here */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy out: no typed write through dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;  /* step both pointers by the caller-supplied byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* sole exit; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_uint(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* compiled out: strides[] is never read here */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;  /* compiled out: this variant uses no scratch values */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;  /* compiled out: this variant uses no scratch values */
+#endif
+
+#if 1
+   /* sanity check: assert src/dst are aligned for the typed dereferences below; N == 0 passes trivially */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Aligned loop over packed elements: cast N npy_ubyte values at src to npy_uint at dst with direct typed access. */
+    /* Disabled debug trace: printf("_aligned_contig_cast_ubyte_to_uint\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these constants only the last #else statement (direct typed store) is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active: _CONVERT_FN is a plain (_TYPE2) cast here */
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_uint);  /* packed layout: step each pointer by its own element size */
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* sole exit; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_uint(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* compiled out: strides[] is never read here */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* scratch copy of the current input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* scratch holder for the converted element */
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant places no alignment requirement on src/dst) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Loop over packed elements, no alignment assumed: cast N npy_ubyte values at src to npy_uint at dst via memmove'd scratch values. */
+    /* Disabled debug trace: printf("_contig_cast_ubyte_to_uint\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy in: no typed read through src */
+#endif
+
+/* Do the cast: with these constants only `dst_value = _CONVERT_FN(src_value);` is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active: _CONVERT_FN is a plain (_TYPE2) cast here */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy out: no typed write through dst */
+#endif
+
+#if 1
+        dst += sizeof(npy_uint);  /* packed layout: step each pointer by its own element size */
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* sole exit; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element steps in bytes */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;  /* compiled out: this variant uses no scratch values */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;  /* compiled out: this variant uses no scratch values */
+#endif
+
+#if 1
+   /* sanity check: assert src/dst are aligned for the typed dereferences below; N == 0 passes trivially */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Aligned strided loop: cast N npy_ubyte values at src to npy_ulong at dst with direct typed access. */
+    /* Disabled debug trace: printf("_aligned_cast_ubyte_to_ulong\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these constants only the last #else statement (direct typed store) is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active: _CONVERT_FN is a plain (_TYPE2) cast here */
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;  /* step both pointers by the caller-supplied byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* sole exit; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element steps in bytes */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* scratch copy of the current input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* scratch holder for the converted element */
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant places no alignment requirement on src/dst) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Strided loop, no alignment assumed: cast N npy_ubyte values at src to npy_ulong at dst via memmove'd scratch values. */
+    /* Disabled debug trace: printf("_cast_ubyte_to_ulong\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy in: no typed read through src */
+#endif
+
+/* Do the cast: with these constants only `dst_value = _CONVERT_FN(src_value);` is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active: _CONVERT_FN is a plain (_TYPE2) cast here */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy out: no typed write through dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;  /* step both pointers by the caller-supplied byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* sole exit; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* compiled out: strides[] is never read here */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;  /* compiled out: this variant uses no scratch values */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;  /* compiled out: this variant uses no scratch values */
+#endif
+
+#if 1
+   /* sanity check: assert src/dst are aligned for the typed dereferences below; N == 0 passes trivially */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Aligned loop over packed elements: cast N npy_ubyte values at src to npy_ulong at dst with direct typed access. */
+    /* Disabled debug trace: printf("_aligned_contig_cast_ubyte_to_ulong\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these constants only the last #else statement (direct typed store) is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active: _CONVERT_FN is a plain (_TYPE2) cast here */
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ulong);  /* packed layout: step each pointer by its own element size */
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* sole exit; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* compiled out: strides[] is never read here */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* scratch copy of the current input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* scratch holder for the converted element */
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant places no alignment requirement on src/dst) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Loop over packed elements, no alignment assumed: cast N npy_ubyte values at src to npy_ulong at dst via memmove'd scratch values. */
+    /* Disabled debug trace: printf("_contig_cast_ubyte_to_ulong\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy in: no typed read through src */
+#endif
+
+/* Do the cast: with these constants only `dst_value = _CONVERT_FN(src_value);` is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active: _CONVERT_FN is a plain (_TYPE2) cast here */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy out: no typed write through dst */
+#endif
+
+#if 1
+        dst += sizeof(npy_ulong);  /* packed layout: step each pointer by its own element size */
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* sole exit; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element steps in bytes */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;  /* compiled out: this variant uses no scratch values */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;  /* compiled out: this variant uses no scratch values */
+#endif
+
+#if 1
+   /* sanity check: assert src/dst are aligned for the typed dereferences below; N == 0 passes trivially */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Aligned strided loop: cast N npy_ubyte values at src to npy_ulonglong at dst with direct typed access. */
+    /* Disabled debug trace: printf("_aligned_cast_ubyte_to_ulonglong\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these constants only the last #else statement (direct typed store) is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active: _CONVERT_FN is a plain (_TYPE2) cast here */
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;  /* step both pointers by the caller-supplied byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* sole exit; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element steps in bytes */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* scratch copy of the current input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* scratch holder for the converted element */
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant places no alignment requirement on src/dst) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Strided loop, no alignment assumed: cast N npy_ubyte values at src to npy_ulonglong at dst via memmove'd scratch values. */
+    /* Disabled debug trace: printf("_cast_ubyte_to_ulonglong\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy in: no typed read through src */
+#endif
+
+/* Do the cast: with these constants only `dst_value = _CONVERT_FN(src_value);` is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active: _CONVERT_FN is a plain (_TYPE2) cast here */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy out: no typed write through dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;  /* step both pointers by the caller-supplied byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* sole exit; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned, contiguous ubyte -> ulonglong cast loop; the guard above is always true (&& !1). */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+/* Live branch (the half-type test above is 0 || 0): plain element types. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), leaving a plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] items from args[0] (npy_ubyte) to args[1] (npy_ulonglong); always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+    /* sanity check: src and dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ubyte_to_ulonglong\n");*/
+    /* Per item: cast straight through typed pointers; no temporaries exist here. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the last #else (direct typed load and store) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous: step by the element sizes; strides[] is never read. */
+#if 1
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-instantiation macros; the next generated loop redefines them. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* memmove-based contiguous ubyte -> ulonglong loop; skipped if NPY_USE_UNALIGNED_ACCESS is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+/* Live branch (the half-type test above is 0 || 0): plain element types. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), leaving a plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] items from args[0] (npy_ubyte) to args[1] (npy_ulonglong); always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+    /* sanity check: compiled out in this instantiation (#if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ubyte_to_ulonglong\n");*/
+    /* Per item: memmove into src_value, cast into dst_value, memmove out to dst. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous: step by the element sizes; strides[] is never read. */
+#if 1
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-instantiation macros; the next generated loop redefines them. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned, strided ubyte -> byte cast loop; the guard above is always true (&& !1). */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Live branch (the half-type test above is 0 || 0): plain element types. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), leaving a plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] items from args[0] (npy_ubyte) to args[1] (npy_byte); always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+    /* sanity check: src and dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ubyte_to_byte\n");*/
+    /* Per item: cast straight through typed pointers; no temporaries exist here. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the last #else (direct typed load and store) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided: step both char pointers by the strides read at the top. */
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-instantiation macros; the next generated loop redefines them. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* memmove-based strided ubyte -> byte loop; skipped if NPY_USE_UNALIGNED_ACCESS is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Live branch (the half-type test above is 0 || 0): plain element types. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), leaving a plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] items from args[0] (npy_ubyte) to args[1] (npy_byte); always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+    /* sanity check: compiled out in this instantiation (#if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ubyte_to_byte\n");*/
+    /* Per item: memmove into src_value, cast into dst_value, memmove out to dst. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided: step both char pointers by the strides read at the top. */
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-instantiation macros; the next generated loop redefines them. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned, contiguous ubyte -> byte cast loop; the guard above is always true (&& !1). */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Live branch (the half-type test above is 0 || 0): plain element types. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), leaving a plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] items from args[0] (npy_ubyte) to args[1] (npy_byte); always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+    /* sanity check: src and dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ubyte_to_byte\n");*/
+    /* Per item: cast straight through typed pointers; no temporaries exist here. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the last #else (direct typed load and store) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous: step by the element sizes; strides[] is never read. */
+#if 1
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-instantiation macros; the next generated loop redefines them. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* memmove-based contiguous ubyte -> byte loop; skipped if NPY_USE_UNALIGNED_ACCESS is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Live branch (the half-type test above is 0 || 0): plain element types. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), leaving a plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] items from args[0] (npy_ubyte) to args[1] (npy_byte); always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+    /* sanity check: compiled out in this instantiation (#if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ubyte_to_byte\n");*/
+    /* Per item: memmove into src_value, cast into dst_value, memmove out to dst. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous: step by the element sizes; strides[] is never read. */
+#if 1
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-instantiation macros; the next generated loop redefines them. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned, strided ubyte -> short cast loop; the guard above is always true (&& !1). */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+/* Live branch (the half-type test above is 0 || 0): plain element types. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), leaving a plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] items from args[0] (npy_ubyte) to args[1] (npy_short); always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+    /* sanity check: src and dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ubyte_to_short\n");*/
+    /* Per item: cast straight through typed pointers; no temporaries exist here. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the last #else (direct typed load and store) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided: step both char pointers by the strides read at the top. */
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-instantiation macros; the next generated loop redefines them. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* memmove-based strided ubyte -> short loop; skipped if NPY_USE_UNALIGNED_ACCESS is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+/* Live branch (the half-type test above is 0 || 0): plain element types. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), leaving a plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] items from args[0] (npy_ubyte) to args[1] (npy_short); always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+    /* sanity check: compiled out in this instantiation (#if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ubyte_to_short\n");*/
+    /* Per item: memmove into src_value, cast into dst_value, memmove out to dst. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided: step both char pointers by the strides read at the top. */
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-instantiation macros; the next generated loop redefines them. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned, contiguous ubyte -> short cast loop; the guard above is always true (&& !1). */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+/* Live branch (the half-type test above is 0 || 0): plain element types. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), leaving a plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] items from args[0] (npy_ubyte) to args[1] (npy_short); always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+    /* sanity check: src and dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ubyte_to_short\n");*/
+    /* Per item: cast straight through typed pointers; no temporaries exist here. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the last #else (direct typed load and store) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous: step by the element sizes; strides[] is never read. */
+#if 1
+        dst += sizeof(npy_short);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-instantiation macros; the next generated loop redefines them. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* non-aligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (branch disabled below by "#if 0 || 0") */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+/* Live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else is live: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] back-to-back npy_ubyte values at args[0] to npy_short values at args[1], staging each element through memmove; strides is not read; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* strides are not read in this contiguous variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the enclosing "#if 0") */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ubyte_to_short\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load one element without dereferencing src as _TYPE1 */
+#endif
+
+/* Do the cast (live statement: dst_value = _CONVERT_FN(src_value)) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store the converted element the same way */
+#endif
+
+#if 1  /* contiguous: advance by one element of each type */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: the condition is always true, so this is always compiled */
+
+/* For half types, don't use actual double/float types in conversion (branch disabled below by "#if 0 || 0") */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else is live: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_ubyte values at args[0] to npy_int values at args[1] by direct pointer access, stepping src/dst by strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* per-element byte steps for src and dst */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ubyte_to_int\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (live statement reads src through a _TYPE1 pointer and stores through a _TYPE2 pointer) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* fixed element-size stepping is disabled; the strided #else below is live */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* non-aligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (branch disabled below by "#if 0 || 0") */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else is live: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_ubyte values at args[0] to npy_int values at args[1], staging each element through memmove and stepping src/dst by strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* per-element byte steps for src and dst */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the enclosing "#if 0") */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ubyte_to_int\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load one element without dereferencing src as _TYPE1 */
+#endif
+
+/* Do the cast (live statement: dst_value = _CONVERT_FN(src_value)) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store the converted element the same way */
+#endif
+
+#if 0  /* fixed element-size stepping is disabled; the strided #else below is live */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: the condition is always true, so this is always compiled */
+
+/* For half types, don't use actual double/float types in conversion (branch disabled below by "#if 0 || 0") */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else is live: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] back-to-back npy_ubyte values at args[0] to npy_int values at args[1] by direct pointer access; strides is not read; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* strides are not read in this contiguous variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ubyte_to_int\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (live statement reads src through a _TYPE1 pointer and stores through a _TYPE2 pointer) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous: advance by one element of each type */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* non-aligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (branch disabled below by "#if 0 || 0") */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else is live: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] back-to-back npy_ubyte values at args[0] to npy_int values at args[1], staging each element through memmove; strides is not read; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* strides are not read in this contiguous variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the enclosing "#if 0") */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ubyte_to_int\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load one element without dereferencing src as _TYPE1 */
+#endif
+
+/* Do the cast (live statement: dst_value = _CONVERT_FN(src_value)) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store the converted element the same way */
+#endif
+
+#if 1  /* contiguous: advance by one element of each type */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: the condition is always true, so this is always compiled */
+
+/* For half types, don't use actual double/float types in conversion (branch disabled below by "#if 0 || 0") */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+/* Live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else is live: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_ubyte values at args[0] to npy_long values at args[1] by direct pointer access, stepping src/dst by strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* per-element byte steps for src and dst */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ubyte_to_long\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (live statement reads src through a _TYPE1 pointer and stores through a _TYPE2 pointer) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* fixed element-size stepping is disabled; the strided #else below is live */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* non-aligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (branch disabled below by "#if 0 || 0") */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+/* Live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else is live: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_ubyte values at args[0] to npy_long values at args[1], staging each element through memmove and stepping src/dst by strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* per-element byte steps for src and dst */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the enclosing "#if 0") */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ubyte_to_long\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load one element without dereferencing src as _TYPE1 */
+#endif
+
+/* Do the cast (live statement: dst_value = _CONVERT_FN(src_value)) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store the converted element the same way */
+#endif
+
+#if 0  /* fixed element-size stepping is disabled; the strided #else below is live */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: the condition is always true, so this is always compiled */
+
+/* For half types, don't use actual double/float types in conversion (branch disabled below by "#if 0 || 0") */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+/* Live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else is live: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] back-to-back npy_ubyte values at args[0] to npy_long values at args[1] by direct pointer access; strides is not read; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* strides are not read in this contiguous variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ubyte_to_long\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (live statement reads src through a _TYPE1 pointer and stores through a _TYPE2 pointer) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous: advance by one element of each type */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* non-aligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (branch disabled below by "#if 0 || 0") */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+/* Live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else is live: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] back-to-back npy_ubyte values at args[0] to npy_long values at args[1], staging each element through memmove; strides is not read; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* strides are not read in this contiguous variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the enclosing "#if 0") */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ubyte_to_long\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load one element without dereferencing src as _TYPE1 */
+#endif
+
+/* Do the cast (live statement: dst_value = _CONVERT_FN(src_value)) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store the converted element the same way */
+#endif
+
+#if 1  /* contiguous: advance by one element of each type */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: the "&& !1" term is 0, so this loop is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here: the simple definitions in the #else below are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte  /* live: source element type */
+#define _TYPE2 npy_longlong  /* live: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the half-source conversions below are not selected */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the to-half conversions below are not selected */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* live: plain C cast to the destination type */
+#  endif
+
+#endif
+/* Cast loop: converts dimensions[0] npy_ubyte items read at args[0] into npy_longlong items written at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and the output */
+#if !0  /* true: this strided variant reads the caller's strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps between consecutive items */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no source temporary; src is dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no destination temporary; dst is written directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when N != 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ubyte_to_longlong\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 1  /* true, but the nested "#if 0" leaves this branch empty: nothing is preloaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the direct typed load/store in the last "#  else" below */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* live statement */
+#  endif
+#endif
+
+#if 1  /* true, but the nested "#if 0" leaves this branch empty: dst was already written */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;  /* live: step both cursors by the caller's byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined at the top of this #if block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* true only when NPY_USE_UNALIGNED_ACCESS is 0; otherwise this memmove-based loop is compiled out */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here: the simple definitions in the #else below are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte  /* live: source element type */
+#define _TYPE2 npy_longlong  /* live: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the half-source conversions below are not selected */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the to-half conversions below are not selected */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* live: plain C cast to the destination type */
+#  endif
+
+#endif
+/* Cast loop: converts dimensions[0] npy_ubyte items at args[0] into npy_longlong items at args[1] using memmove and local temporaries; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and the output */
+#if !0  /* true: this strided variant reads the caller's strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps between consecutive items */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: temporary filled by memmove inside the loop */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: temporary copied out by memmove inside the loop */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: the alignment asserts below are compiled out */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ubyte_to_longlong\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* live: fetch one item without a typed load from src */
+#endif
+
+/* Do the cast: the live statement is "dst_value = _CONVERT_FN(src_value);" below */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* live statement */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* live: store the result without a typed write to dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;  /* live: step both cursors by the caller's byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined at the top of this #if block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: the "&& !1" term is 0, so this loop is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here: the simple definitions in the #else below are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte  /* live: source element type */
+#define _TYPE2 npy_longlong  /* live: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the half-source conversions below are not selected */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the to-half conversions below are not selected */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* live: plain C cast to the destination type */
+#  endif
+
+#endif
+/* Cast loop over packed data: converts dimensions[0] npy_ubyte items at args[0] into npy_longlong items at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and the output */
+#if !1  /* false: this contiguous variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no source temporary; src is dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no destination temporary; dst is written directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when N != 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ubyte_to_longlong\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 1  /* true, but the nested "#if 0" leaves this branch empty: nothing is preloaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the direct typed load/store in the last "#  else" below */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* live statement */
+#  endif
+#endif
+
+#if 1  /* true, but the nested "#if 0" leaves this branch empty: dst was already written */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* true: items are packed, so step each cursor by its element size */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined at the top of this #if block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* true only when NPY_USE_UNALIGNED_ACCESS is 0; otherwise this memmove-based loop is compiled out */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here: the simple definitions in the #else below are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte  /* live: source element type */
+#define _TYPE2 npy_longlong  /* live: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the half-source conversions below are not selected */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the to-half conversions below are not selected */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* live: plain C cast to the destination type */
+#  endif
+
+#endif
+/* Cast loop over packed data: converts dimensions[0] npy_ubyte items at args[0] into npy_longlong items at args[1] using memmove and local temporaries; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and the output */
+#if !1  /* false: this contiguous variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: temporary filled by memmove inside the loop */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: temporary copied out by memmove inside the loop */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: the alignment asserts below are compiled out */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ubyte_to_longlong\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* live: fetch one item without a typed load from src */
+#endif
+
+/* Do the cast: the live statement is "dst_value = _CONVERT_FN(src_value);" below */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* live statement */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* live: store the result without a typed write to dst */
+#endif
+
+#if 1  /* true: items are packed, so step each cursor by its element size */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined at the top of this #if block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: the "&& !1" term is 0, so this loop is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1  /* true here: the nested selections below are used, not the #else */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte  /* live: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half  /* live: destination element type */
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the half-source conversions below are not selected */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1  /* true: a to-half conversion is picked from the chain below */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)  /* live: widen to float, then call npy_float_to_half */
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast loop: converts dimensions[0] npy_ubyte items read at args[0] into npy_half items written at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and the output */
+#if !0  /* true: this strided variant reads the caller's strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps between consecutive items */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no source temporary; src is dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no destination temporary; dst is written directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when N != 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ubyte_to_half\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 1  /* true, but the nested "#if 0" leaves this branch empty: nothing is preloaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the direct typed load/store in the last "#  else" below */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* live statement */
+#  endif
+#endif
+
+#if 1  /* true, but the nested "#if 0" leaves this branch empty: dst was already written */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;  /* live: step both cursors by the caller's byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined at the top of this #if block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* true only when NPY_USE_UNALIGNED_ACCESS is 0; otherwise this memmove-based loop is compiled out */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1  /* true here: the nested selections below are used, not the #else */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte  /* live: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half  /* live: destination element type */
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the half-source conversions below are not selected */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1  /* true: a to-half conversion is picked from the chain below */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)  /* live: widen to float, then call npy_float_to_half */
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast loop: converts dimensions[0] npy_ubyte items at args[0] into npy_half items at args[1] using memmove and local temporaries; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and the output */
+#if !0  /* true: this strided variant reads the caller's strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps between consecutive items */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: temporary filled by memmove inside the loop */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: temporary copied out by memmove inside the loop */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: the alignment asserts below are compiled out */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ubyte_to_half\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* live: fetch one item without a typed load from src */
+#endif
+
+/* Do the cast: the live statement is "dst_value = _CONVERT_FN(src_value);" below */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* live statement */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* live: store the result without a typed write to dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;  /* live: step both cursors by the caller's byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined at the top of this #if block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: the "&& !1" term is 0, so this loop is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1  /* true here: the nested selections below are used, not the #else */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte  /* live: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half  /* live: destination element type */
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the half-source conversions below are not selected */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1  /* true: a to-half conversion is picked from the chain below */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)  /* live: widen to float, then call npy_float_to_half */
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast loop over packed data: converts dimensions[0] npy_ubyte items at args[0] into npy_half items at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and the output */
+#if !1  /* false: this contiguous variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no source temporary; src is dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no destination temporary; dst is written directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when N != 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ubyte_to_half\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 1  /* true, but the nested "#if 0" leaves this branch empty: nothing is preloaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the direct typed load/store in the last "#  else" below */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* live statement */
+#  endif
+#endif
+
+#if 1  /* true, but the nested "#if 0" leaves this branch empty: dst was already written */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* true: items are packed, so step each cursor by its element size */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined at the top of this #if block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* true only when NPY_USE_UNALIGNED_ACCESS is 0; otherwise this memmove-based loop is compiled out */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1  /* true here: the nested selections below are used, not the #else */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte  /* live: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half  /* live: destination element type */
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the half-source conversions below are not selected */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1  /* true: a to-half conversion is picked from the chain below */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)  /* live: widen to float, then call npy_float_to_half */
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast loop over packed data: converts dimensions[0] npy_ubyte items at args[0] into npy_half items at args[1] using memmove and local temporaries; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and the output */
+#if !1  /* false: this contiguous variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: temporary filled by memmove inside the loop */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: temporary copied out by memmove inside the loop */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: the alignment asserts below are compiled out */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ubyte_to_half\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* live: fetch one item without a typed load from src */
+#endif
+
+/* Do the cast: the live statement is "dst_value = _CONVERT_FN(src_value);" below */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* live statement */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* live: store the result without a typed write to dst */
+#endif
+
+#if 1  /* true: items are packed, so step each cursor by its element size */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined at the top of this #if block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: the "&& !1" term is 0, so this loop is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here: the simple definitions in the #else below are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte  /* live: source element type */
+#define _TYPE2 npy_float  /* live: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the half-source conversions below are not selected */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the to-half conversions below are not selected */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* live: plain C cast to the destination type */
+#  endif
+
+#endif
+/* Cast loop: converts dimensions[0] npy_ubyte items read at args[0] into npy_float items written at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and the output */
+#if !0  /* true: this strided variant reads the caller's strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps between consecutive items */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no source temporary; src is dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no destination temporary; dst is written directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when N != 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ubyte_to_float\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 1  /* true, but the nested "#if 0" leaves this branch empty: nothing is preloaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the direct typed load/store in the last "#  else" below */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* live statement */
+#  endif
+#endif
+
+#if 1  /* true, but the nested "#if 0" leaves this branch empty: dst was already written */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;  /* live: step both cursors by the caller's byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined at the top of this #if block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * One instance of the repeated cast-loop pattern: npy_ubyte -> npy_float,
+ * moving every element with memmove. The guard above removes this
+ * instance from the build when NPY_USE_UNALIGNED_ACCESS is nonzero.
+ *
+ * For half types, don't use actual double/float types in conversion.
+ * The test below is 0 in this instance, so the #else branch defines
+ * _TYPE1 (source element) and _TYPE2 (destination element).
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_float
+
+#endif
+
+/*
+ * Determine an appropriate casting conversion function.
+ * Both half-related tests are 0 here, so the last #else applies and
+ * _CONVERT_FN(x) expands to ((_TYPE2)x).
+ */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Cast dimensions[0] elements from npy_ubyte (args[0]) to npy_float
+ * (args[1]), advancing the pointers by strides[0] and strides[1] bytes.
+ * Each element is copied into a local with memmove, converted, and
+ * copied out with memmove, so src and dst are never dereferenced through
+ * a typed pointer. `context` is not referenced in the body.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ubyte_to_float\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. Only `dst_value = _CONVERT_FN(src_value);` survives
+ * preprocessing in this instance.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * One instance of the repeated cast-loop pattern: npy_ubyte -> npy_float,
+ * accessing elements through typed pointers. The guard above contains
+ * the literal `!1`, so it is true whatever NPY_USE_UNALIGNED_ACCESS is.
+ *
+ * For half types, don't use actual double/float types in conversion.
+ * The test below is 0 in this instance, so the #else branch defines
+ * _TYPE1 (source element) and _TYPE2 (destination element).
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_float
+
+#endif
+
+/*
+ * Determine an appropriate casting conversion function.
+ * Both half-related tests are 0 here, so the last #else applies and
+ * _CONVERT_FN(x) expands to ((_TYPE2)x).
+ */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Cast dimensions[0] elements from npy_ubyte (args[0]) to npy_float
+ * (args[1]). Elements are read and written directly through typed
+ * pointers, and both pointers advance by the element size, so `strides`
+ * is never read. Pointer alignment is checked only by assert().
+ * `context` is not referenced in the body. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the typed loads/stores below rely on aligned pointers */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ubyte_to_float\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. Only `*(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);`
+ * survives preprocessing in this instance.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_float);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * One instance of the repeated cast-loop pattern: npy_ubyte -> npy_float,
+ * moving every element with memmove. The guard above removes this
+ * instance from the build when NPY_USE_UNALIGNED_ACCESS is nonzero.
+ *
+ * For half types, don't use actual double/float types in conversion.
+ * The test below is 0 in this instance, so the #else branch defines
+ * _TYPE1 (source element) and _TYPE2 (destination element).
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_float
+
+#endif
+
+/*
+ * Determine an appropriate casting conversion function.
+ * Both half-related tests are 0 here, so the last #else applies and
+ * _CONVERT_FN(x) expands to ((_TYPE2)x).
+ */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Cast dimensions[0] elements from npy_ubyte (args[0]) to npy_float
+ * (args[1]). Each element is copied into a local with memmove,
+ * converted, and copied out with memmove. Both pointers advance by the
+ * element size, so `strides` is never read. `context` is not referenced
+ * in the body. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ubyte_to_float\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. Only `dst_value = _CONVERT_FN(src_value);` survives
+ * preprocessing in this instance.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_float);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * One instance of the repeated cast-loop pattern: npy_ubyte -> npy_double,
+ * accessing elements through typed pointers. The guard above contains
+ * the literal `!1`, so it is true whatever NPY_USE_UNALIGNED_ACCESS is.
+ *
+ * For half types, don't use actual double/float types in conversion.
+ * The test below is 0 in this instance, so the #else branch defines
+ * _TYPE1 (source element) and _TYPE2 (destination element).
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_double
+
+#endif
+
+/*
+ * Determine an appropriate casting conversion function.
+ * Both half-related tests are 0 here, so the last #else applies and
+ * _CONVERT_FN(x) expands to ((_TYPE2)x).
+ */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Cast dimensions[0] elements from npy_ubyte (args[0]) to npy_double
+ * (args[1]), advancing the pointers by strides[0] and strides[1] bytes.
+ * Elements are read and written directly through typed pointers;
+ * pointer alignment is checked only by assert(). `context` is not
+ * referenced in the body. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the typed loads/stores below rely on aligned pointers */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ubyte_to_double\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. Only `*(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);`
+ * survives preprocessing in this instance.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * One instance of the repeated cast-loop pattern: npy_ubyte -> npy_double,
+ * moving every element with memmove. The guard above removes this
+ * instance from the build when NPY_USE_UNALIGNED_ACCESS is nonzero.
+ *
+ * For half types, don't use actual double/float types in conversion.
+ * The test below is 0 in this instance, so the #else branch defines
+ * _TYPE1 (source element) and _TYPE2 (destination element).
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_double
+
+#endif
+
+/*
+ * Determine an appropriate casting conversion function.
+ * Both half-related tests are 0 here, so the last #else applies and
+ * _CONVERT_FN(x) expands to ((_TYPE2)x).
+ */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Cast dimensions[0] elements from npy_ubyte (args[0]) to npy_double
+ * (args[1]), advancing the pointers by strides[0] and strides[1] bytes.
+ * Each element is copied into a local with memmove, converted, and
+ * copied out with memmove, so src and dst are never dereferenced through
+ * a typed pointer. `context` is not referenced in the body.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ubyte_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. Only `dst_value = _CONVERT_FN(src_value);` survives
+ * preprocessing in this instance.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * One instance of the repeated cast-loop pattern: npy_ubyte -> npy_double,
+ * accessing elements through typed pointers. The guard above contains
+ * the literal `!1`, so it is true whatever NPY_USE_UNALIGNED_ACCESS is.
+ *
+ * For half types, don't use actual double/float types in conversion.
+ * The test below is 0 in this instance, so the #else branch defines
+ * _TYPE1 (source element) and _TYPE2 (destination element).
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_double
+
+#endif
+
+/*
+ * Determine an appropriate casting conversion function.
+ * Both half-related tests are 0 here, so the last #else applies and
+ * _CONVERT_FN(x) expands to ((_TYPE2)x).
+ */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Cast dimensions[0] elements from npy_ubyte (args[0]) to npy_double
+ * (args[1]). Elements are read and written directly through typed
+ * pointers, and both pointers advance by the element size, so `strides`
+ * is never read. Pointer alignment is checked only by assert().
+ * `context` is not referenced in the body. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the typed loads/stores below rely on aligned pointers */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ubyte_to_double\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. Only `*(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);`
+ * survives preprocessing in this instance.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_double);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * One instance of the repeated cast-loop pattern: npy_ubyte -> npy_double,
+ * moving every element with memmove. The guard above removes this
+ * instance from the build when NPY_USE_UNALIGNED_ACCESS is nonzero.
+ *
+ * For half types, don't use actual double/float types in conversion.
+ * The test below is 0 in this instance, so the #else branch defines
+ * _TYPE1 (source element) and _TYPE2 (destination element).
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_double
+
+#endif
+
+/*
+ * Determine an appropriate casting conversion function.
+ * Both half-related tests are 0 here, so the last #else applies and
+ * _CONVERT_FN(x) expands to ((_TYPE2)x).
+ */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Cast dimensions[0] elements from npy_ubyte (args[0]) to npy_double
+ * (args[1]). Each element is copied into a local with memmove,
+ * converted, and copied out with memmove. Both pointers advance by the
+ * element size, so `strides` is never read. `context` is not referenced
+ * in the body. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ubyte_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. Only `dst_value = _CONVERT_FN(src_value);` survives
+ * preprocessing in this instance.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_double);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * One instance of the repeated cast-loop pattern: npy_ubyte ->
+ * npy_longdouble, accessing elements through typed pointers. The guard
+ * above contains the literal `!1`, so it is true whatever
+ * NPY_USE_UNALIGNED_ACCESS is.
+ *
+ * For half types, don't use actual double/float types in conversion.
+ * The test below is 0 in this instance, so the #else branch defines
+ * _TYPE1 (source element) and _TYPE2 (destination element).
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/*
+ * Determine an appropriate casting conversion function.
+ * Both half-related tests are 0 here, so the last #else applies and
+ * _CONVERT_FN(x) expands to ((_TYPE2)x).
+ */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Cast dimensions[0] elements from npy_ubyte (args[0]) to npy_longdouble
+ * (args[1]), advancing the pointers by strides[0] and strides[1] bytes.
+ * Elements are read and written directly through typed pointers;
+ * pointer alignment is checked only by assert(). `context` is not
+ * referenced in the body. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the typed loads/stores below rely on aligned pointers */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ubyte_to_longdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. Only `*(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);`
+ * survives preprocessing in this instance.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * One instance of the repeated cast-loop pattern: npy_ubyte ->
+ * npy_longdouble, moving every element with memmove. The guard above
+ * removes this instance from the build when NPY_USE_UNALIGNED_ACCESS is
+ * nonzero.
+ *
+ * For half types, don't use actual double/float types in conversion.
+ * The test below is 0 in this instance, so the #else branch defines
+ * _TYPE1 (source element) and _TYPE2 (destination element).
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/*
+ * Determine an appropriate casting conversion function.
+ * Both half-related tests are 0 here, so the last #else applies and
+ * _CONVERT_FN(x) expands to ((_TYPE2)x).
+ */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Cast dimensions[0] elements from npy_ubyte (args[0]) to npy_longdouble
+ * (args[1]), advancing the pointers by strides[0] and strides[1] bytes.
+ * Each element is copied into a local with memmove, converted, and
+ * copied out with memmove, so src and dst are never dereferenced through
+ * a typed pointer. `context` is not referenced in the body.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ubyte_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. Only `dst_value = _CONVERT_FN(src_value);` survives
+ * preprocessing in this instance.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_ubyte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Element types: unsigned byte in, long double out. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_longdouble
+
+/* Neither side is half or bool, so the conversion is a plain C cast. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_ubyte elements read from args[0] into
+ * npy_longdouble elements written to args[1].
+ *
+ * Both buffers advance by exactly one element size per iteration; the
+ * strides argument is not consulted.  Elements are accessed through typed
+ * pointers, and the asserts check that both buffers are suitably aligned.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* direct typed load, cast and store */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_ubyte);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+/* Compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to zero. */
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Element types: unsigned byte in, long double out. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_longdouble
+
+/* Neither side is half or bool, so the conversion is a plain C cast. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_ubyte elements read from args[0] into
+ * npy_longdouble elements written to args[1].
+ *
+ * Both buffers advance by exactly one element size per iteration; the
+ * strides argument is not consulted.  Every element is copied in and out
+ * through memmove, so no typed pointer is ever dereferenced on the buffers.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    for (; remaining != 0; --remaining) {
+        memmove(&loaded, in, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out, &converted, sizeof(converted));
+
+        out += sizeof(npy_longdouble);
+        in += sizeof(npy_ubyte);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Element types: unsigned byte in, float components out. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_float
+
+/* Neither side is half or bool, so the conversion is a plain C cast. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_ubyte elements read from args[0] into pairs of
+ * npy_float written to args[1].
+ *
+ * The source advances by strides[0] bytes and the destination by
+ * strides[1] bytes per element.  Elements are accessed through typed
+ * pointers, and the asserts check that both buffers are suitably aligned.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp src_stride = strides[0];
+    npy_intp dst_stride = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        _TYPE2 *out = (_TYPE2 *)dst;
+
+        /*
+         * First float of the pair receives the converted value, the second
+         * is zeroed (presumably the real and imaginary parts of npy_cfloat).
+         */
+        out[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        out[1] = 0;
+
+        dst += dst_stride;
+        src += src_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+/* Compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to zero. */
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Element types: unsigned byte in, float components out. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_float
+
+/* Neither side is half or bool, so the conversion is a plain C cast. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_ubyte elements read from args[0] into pairs of
+ * npy_float written to args[1].
+ *
+ * The source advances by strides[0] bytes and the destination by
+ * strides[1] bytes per element.  Every element is copied in and out
+ * through memmove, so no typed pointer is ever dereferenced on the buffers.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp in_step = strides[0];
+    npy_intp out_step = strides[1];
+    _TYPE1 loaded;
+    _TYPE2 pair[2];
+
+    for (; remaining != 0; --remaining) {
+        memmove(&loaded, in, sizeof(loaded));
+
+        /*
+         * First float of the pair receives the converted value, the second
+         * is zeroed (presumably the real and imaginary parts of npy_cfloat).
+         */
+        pair[0] = _CONVERT_FN(loaded);
+        pair[1] = 0;
+        memmove(out, pair, sizeof(pair));
+
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Element types: unsigned byte in, float components out. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_float
+
+/* Neither side is half or bool, so the conversion is a plain C cast. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_ubyte elements read from args[0] into pairs of
+ * npy_float written to args[1].
+ *
+ * The source advances by sizeof(npy_ubyte) and the destination by
+ * sizeof(npy_cfloat) per element; the strides argument is not consulted.
+ * Elements are accessed through typed pointers, and the asserts check that
+ * both buffers are suitably aligned.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        _TYPE2 *out = (_TYPE2 *)dst;
+
+        /*
+         * First float of the pair receives the converted value, the second
+         * is zeroed (presumably the real and imaginary parts of npy_cfloat).
+         */
+        out[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        out[1] = 0;
+
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_ubyte);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+/* Compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to zero. */
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Element types: unsigned byte in, float components out. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_float
+
+/* Neither side is half or bool, so the conversion is a plain C cast. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_ubyte elements read from args[0] into pairs of
+ * npy_float written to args[1].
+ *
+ * The source advances by sizeof(npy_ubyte) and the destination by
+ * sizeof(npy_cfloat) per element; the strides argument is not consulted.
+ * Every element is copied in and out through memmove, so no typed pointer
+ * is ever dereferenced on the buffers.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    _TYPE1 loaded;
+    _TYPE2 pair[2];
+
+    for (; remaining != 0; --remaining) {
+        memmove(&loaded, in, sizeof(loaded));
+
+        /*
+         * First float of the pair receives the converted value, the second
+         * is zeroed (presumably the real and imaginary parts of npy_cfloat).
+         */
+        pair[0] = _CONVERT_FN(loaded);
+        pair[1] = 0;
+        memmove(out, pair, sizeof(pair));
+
+        out += sizeof(npy_cfloat);
+        in += sizeof(npy_ubyte);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Element types: unsigned byte in, double components out. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_double
+
+/* Neither side is half or bool, so the conversion is a plain C cast. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_ubyte elements read from args[0] into pairs of
+ * npy_double written to args[1].
+ *
+ * The source advances by strides[0] bytes and the destination by
+ * strides[1] bytes per element.  Elements are accessed through typed
+ * pointers, and the asserts check that both buffers are suitably aligned.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp src_stride = strides[0];
+    npy_intp dst_stride = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        _TYPE2 *out = (_TYPE2 *)dst;
+
+        /*
+         * First double of the pair receives the converted value, the second
+         * is zeroed (presumably the real and imaginary parts of npy_cdouble).
+         */
+        out[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        out[1] = 0;
+
+        dst += dst_stride;
+        src += src_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+/* Compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to zero. */
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Element types: unsigned byte in, double components out. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_double
+
+/* Neither side is half or bool, so the conversion is a plain C cast. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_ubyte elements read from args[0] into pairs of
+ * npy_double written to args[1].
+ *
+ * The source advances by strides[0] bytes and the destination by
+ * strides[1] bytes per element.  Every element is copied in and out
+ * through memmove, so no typed pointer is ever dereferenced on the buffers.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp in_step = strides[0];
+    npy_intp out_step = strides[1];
+    _TYPE1 loaded;
+    _TYPE2 pair[2];
+
+    for (; remaining != 0; --remaining) {
+        memmove(&loaded, in, sizeof(loaded));
+
+        /*
+         * First double of the pair receives the converted value, the second
+         * is zeroed (presumably the real and imaginary parts of npy_cdouble).
+         */
+        pair[0] = _CONVERT_FN(loaded);
+        pair[1] = 0;
+        memmove(out, pair, sizeof(pair));
+
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Element types: unsigned byte in, double components out. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_double
+
+/* Neither side is half or bool, so the conversion is a plain C cast. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_ubyte elements read from args[0] into pairs of
+ * npy_double written to args[1].
+ *
+ * The source advances by sizeof(npy_ubyte) and the destination by
+ * sizeof(npy_cdouble) per element; the strides argument is not consulted.
+ * Elements are accessed through typed pointers, and the asserts check that
+ * both buffers are suitably aligned.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        _TYPE2 *out = (_TYPE2 *)dst;
+
+        /*
+         * First double of the pair receives the converted value, the second
+         * is zeroed (presumably the real and imaginary parts of npy_cdouble).
+         */
+        out[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        out[1] = 0;
+
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_ubyte);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+/* Compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to zero. */
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Element types: unsigned byte in, double components out. */
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_double
+
+/* Neither side is half or bool, so the conversion is a plain C cast. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_ubyte elements read from args[0] into pairs of
+ * npy_double written to args[1].
+ *
+ * The source advances by sizeof(npy_ubyte) and the destination by
+ * sizeof(npy_cdouble) per element; the strides argument is not consulted.
+ * Every element is copied in and out through memmove, so no typed pointer
+ * is ever dereferenced on the buffers.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    _TYPE1 loaded;
+    _TYPE2 pair[2];
+
+    for (; remaining != 0; --remaining) {
+        memmove(&loaded, in, sizeof(loaded));
+
+        /*
+         * First double of the pair receives the converted value, the second
+         * is zeroed (presumably the real and imaginary parts of npy_cdouble).
+         */
+        pair[0] = _CONVERT_FN(loaded);
+        pair[1] = 0;
+        memmove(out, pair, sizeof(pair));
+
+        out += sizeof(npy_cdouble);
+        in += sizeof(npy_ubyte);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Strided cast loop for aligned buffers: reads _TYPE1 (npy_ubyte) source
+ * elements and writes two-component _TYPE2 (npy_longdouble) destination
+ * elements.
+ *
+ * dimensions[0] is the element count, args[0]/args[1] the source and
+ * destination buffers, strides[0]/strides[1] the byte steps applied to
+ * them after every element.  `context` is not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ubyte_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    _TYPE2 pair[2];
+
+    /* Direct typed loads/stores below rely on both pointers being aligned. */
+    assert(remaining == 0 || npy_is_aligned(in, NPY_ALIGNOF(_TYPE1)));
+    assert(remaining == 0 || npy_is_aligned(out, NPY_ALIGNOF(_TYPE2)));
+
+    for (; remaining--; in += in_step, out += out_step) {
+        /* Converted value in the first slot, zero in the second. */
+        pair[0] = _CONVERT_FN(*(_TYPE1 *)in);
+        pair[1] = 0;
+
+        ((_TYPE2 *)out)[0] = pair[0];
+        ((_TYPE2 *)out)[1] = pair[1];
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Strided cast loop that makes no alignment assumption: reads _TYPE1
+ * (npy_ubyte) source elements and writes two-component _TYPE2
+ * (npy_longdouble) destination elements.
+ *
+ * dimensions[0] is the element count, args[0]/args[1] the source and
+ * destination buffers, strides[0]/strides[1] the byte steps applied to
+ * them after every element.  `context` is not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ubyte_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    _TYPE1 elem;
+    _TYPE2 pair[2];
+
+    for (; remaining--; in += in_step, out += out_step) {
+        /* Go through a local copy so `in` need not be aligned. */
+        memmove(&elem, in, sizeof(elem));
+
+        /* Converted value in the first slot, zero in the second. */
+        pair[0] = _CONVERT_FN(elem);
+        pair[1] = 0;
+
+        /* Likewise, `out` is written bytewise from the local pair. */
+        memmove(out, &pair, sizeof(pair));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Contiguous cast loop for aligned buffers: reads _TYPE1 (npy_ubyte)
+ * source elements and writes two-component _TYPE2 (npy_longdouble)
+ * destination elements of size sizeof(npy_clongdouble).
+ *
+ * dimensions[0] is the element count, args[0] the source buffer and
+ * args[1] the destination buffer.  `context` and `strides` are not read;
+ * both pointers advance by the fixed element sizes.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ubyte_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    _TYPE2 pair[2];
+
+    /* Direct typed loads/stores below rely on both pointers being aligned. */
+    assert(remaining == 0 || npy_is_aligned(in, NPY_ALIGNOF(_TYPE1)));
+    assert(remaining == 0 || npy_is_aligned(out, NPY_ALIGNOF(_TYPE2)));
+
+    for (; remaining--;
+            in += sizeof(npy_ubyte), out += sizeof(npy_clongdouble)) {
+        /* Converted value in the first slot, zero in the second. */
+        pair[0] = _CONVERT_FN(*(_TYPE1 *)in);
+        pair[1] = 0;
+
+        ((_TYPE2 *)out)[0] = pair[0];
+        ((_TYPE2 *)out)[1] = pair[1];
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ubyte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ubyte
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Contiguous cast loop that makes no alignment assumption: reads _TYPE1
+ * (npy_ubyte) source elements and writes two-component _TYPE2
+ * (npy_longdouble) destination elements of size sizeof(npy_clongdouble).
+ *
+ * dimensions[0] is the element count, args[0] the source buffer and
+ * args[1] the destination buffer.  `context` and `strides` are not read;
+ * both pointers advance by the fixed element sizes.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ubyte_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    _TYPE1 elem;
+    _TYPE2 pair[2];
+
+    for (; remaining--;
+            in += sizeof(npy_ubyte), out += sizeof(npy_clongdouble)) {
+        /* Go through a local copy so `in` need not be aligned. */
+        memmove(&elem, in, sizeof(elem));
+
+        /* Converted value in the first slot, zero in the second. */
+        pair[0] = _CONVERT_FN(elem);
+        pair[1] = 0;
+
+        /* Likewise, `out` is written bytewise from the local pair. */
+        memmove(out, &pair, sizeof(pair));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Strided cast loop for aligned buffers: for every _TYPE1 (npy_ushort)
+ * source element, stores a _TYPE2 (npy_bool) that says whether the source
+ * value is nonzero (that is what _CONVERT_FN expands to here).
+ *
+ * dimensions[0] is the element count, args[0]/args[1] the source and
+ * destination buffers, strides[0]/strides[1] the byte steps applied to
+ * them after every element.  `context` is not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ushort_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+
+    /* Direct typed loads/stores below rely on both pointers being aligned. */
+    assert(remaining == 0 || npy_is_aligned(in, NPY_ALIGNOF(_TYPE1)));
+    assert(remaining == 0 || npy_is_aligned(out, NPY_ALIGNOF(_TYPE2)));
+
+    for (; remaining--; in += in_step, out += out_step) {
+        *(_TYPE2 *)out = _CONVERT_FN(*(_TYPE1 *)in);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Strided cast loop that makes no alignment assumption: for every _TYPE1
+ * (npy_ushort) source element, stores a _TYPE2 (npy_bool) that says
+ * whether the source value is nonzero (that is what _CONVERT_FN expands
+ * to here).
+ *
+ * dimensions[0] is the element count, args[0]/args[1] the source and
+ * destination buffers, strides[0]/strides[1] the byte steps applied to
+ * them after every element.  `context` is not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ushort_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    _TYPE1 elem;
+    _TYPE2 flag;
+
+    for (; remaining--; in += in_step, out += out_step) {
+        /* Go through local copies so neither pointer need be aligned. */
+        memmove(&elem, in, sizeof(elem));
+        flag = _CONVERT_FN(elem);
+        memmove(out, &flag, sizeof(flag));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Contiguous cast loop for aligned buffers: for every _TYPE1 (npy_ushort)
+ * source element, stores a _TYPE2 (npy_bool) that says whether the source
+ * value is nonzero (that is what _CONVERT_FN expands to here).
+ *
+ * dimensions[0] is the element count, args[0] the source buffer and
+ * args[1] the destination buffer.  `context` and `strides` are not read;
+ * both pointers advance by the fixed element sizes.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ushort_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+
+    /* Direct typed loads/stores below rely on both pointers being aligned. */
+    assert(remaining == 0 || npy_is_aligned(in, NPY_ALIGNOF(_TYPE1)));
+    assert(remaining == 0 || npy_is_aligned(out, NPY_ALIGNOF(_TYPE2)));
+
+    for (; remaining--;
+            in += sizeof(npy_ushort), out += sizeof(npy_bool)) {
+        *(_TYPE2 *)out = _CONVERT_FN(*(_TYPE1 *)in);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Contiguous cast loop that makes no alignment assumption: for every
+ * _TYPE1 (npy_ushort) source element, stores a _TYPE2 (npy_bool) that
+ * says whether the source value is nonzero (that is what _CONVERT_FN
+ * expands to here).
+ *
+ * dimensions[0] is the element count, args[0] the source buffer and
+ * args[1] the destination buffer.  `context` and `strides` are not read;
+ * both pointers advance by the fixed element sizes.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ushort_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    _TYPE1 elem;
+    _TYPE2 flag;
+
+    for (; remaining--;
+            in += sizeof(npy_ushort), out += sizeof(npy_bool)) {
+        /* Go through local copies so neither pointer need be aligned. */
+        memmove(&elem, in, sizeof(elem));
+        flag = _CONVERT_FN(elem);
+        memmove(out, &flag, sizeof(flag));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Strided cast loop for aligned buffers: converts each _TYPE1
+ * (npy_ushort) source element to _TYPE2 (npy_ubyte) with a plain C cast
+ * (that is what _CONVERT_FN expands to here).
+ *
+ * dimensions[0] is the element count, args[0]/args[1] the source and
+ * destination buffers, strides[0]/strides[1] the byte steps applied to
+ * them after every element.  `context` is not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ushort_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+
+    /* Direct typed loads/stores below rely on both pointers being aligned. */
+    assert(remaining == 0 || npy_is_aligned(in, NPY_ALIGNOF(_TYPE1)));
+    assert(remaining == 0 || npy_is_aligned(out, NPY_ALIGNOF(_TYPE2)));
+
+    for (; remaining--; in += in_step, out += out_step) {
+        *(_TYPE2 *)out = _CONVERT_FN(*(_TYPE1 *)in);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* !0 is 1, so this loop is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair, so the plain definitions under #else are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort  /* item type read from src */
+#define _TYPE2 npy_ubyte  /* item type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function (the final #else branch is the one compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+/* Strided loop without alignment asserts: casts dimensions[0] npy_ushort items at args[0] to npy_ubyte items at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ushort_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] is src, args[1] is dst */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the item count; strides[0] and strides[1] are the src and dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* number of items still to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* always true: the strided variant keeps the per-item byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: this variant stages each source item in a local */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: this variant stages each converted item in a local */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: this variant does not assert alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ushort_to_ubyte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active: copy the source item into the local instead of a typed load */
+#endif
+
+/* Do the cast (here only the #elif !0 statement under the outer #else is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active: convert the staged source item */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active: copy the converted item out to dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;  /* active: advance both char pointers by their byte steps */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* drop the per-loop macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so this guard is always true and the aligned loop is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair, so the plain definitions under #else are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort  /* item type read from src */
+#define _TYPE2 npy_ubyte  /* item type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function (the final #else branch is the one compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+/* Aligned, contiguous loop: casts dimensions[0] npy_ushort items at args[0] to npy_ubyte items at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ushort_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] is src, args[1] is dst */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the item count; strides is not read by this contiguous variant */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* number of items still to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: the contiguous variant does not read strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: the aligned variant declares no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: the aligned variant declares no destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ushort_to_ubyte\n");*/
+
+    while (N--) {
+#if 1  /* aligned path: nothing is loaded here because the inner #if 0 is disabled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (here only the innermost #else statement is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active: typed load from src, convert, typed store to dst */
+#  endif
+#endif
+
+#if 1  /* aligned path: no write-back is needed, the cast above already stored into dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ubyte);  /* active: contiguous layout, so step by the item sizes */
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* drop the per-loop macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* !0 is 1, so this loop is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair, so the plain definitions under #else are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort  /* item type read from src */
+#define _TYPE2 npy_ubyte  /* item type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function (the final #else branch is the one compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+/* Contiguous loop without alignment asserts: casts dimensions[0] npy_ushort items at args[0] to npy_ubyte items at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ushort_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] is src, args[1] is dst */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the item count; strides is not read by this contiguous variant */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* number of items still to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: the contiguous variant does not read strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: this variant stages each source item in a local */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: this variant stages each converted item in a local */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: this variant does not assert alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ushort_to_ubyte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active: copy the source item into the local instead of a typed load */
+#endif
+
+/* Do the cast (here only the #elif !0 statement under the outer #else is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active: convert the staged source item */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active: copy the converted item out to dst */
+#endif
+
+#if 1
+        dst += sizeof(npy_ubyte);  /* active: contiguous layout, so step by the item sizes */
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* drop the per-loop macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so this guard is always true and the aligned loop is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair, so the plain definitions under #else are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort  /* item type read from src */
+#define _TYPE2 npy_ushort  /* item type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function (the final #else branch is the one compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+/* Aligned, strided loop: casts dimensions[0] npy_ushort items at args[0] to npy_ushort items at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ushort_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] is src, args[1] is dst */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the item count; strides[0] and strides[1] are the src and dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* number of items still to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* always true: the strided variant keeps the per-item byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: the aligned variant declares no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: the aligned variant declares no destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ushort_to_ushort\n");*/
+
+    while (N--) {
+#if 1  /* aligned path: nothing is loaded here because the inner #if 0 is disabled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (here only the innermost #else statement is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active: typed load from src, convert, typed store to dst */
+#  endif
+#endif
+
+#if 1  /* aligned path: no write-back is needed, the cast above already stored into dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;  /* active: advance both char pointers by their byte steps */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* drop the per-loop macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* !0 is 1, so this loop is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair, so the plain definitions under #else are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort  /* item type read from src */
+#define _TYPE2 npy_ushort  /* item type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function (the final #else branch is the one compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+/* Strided loop without alignment asserts: casts dimensions[0] npy_ushort items at args[0] to npy_ushort items at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ushort_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] is src, args[1] is dst */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the item count; strides[0] and strides[1] are the src and dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* number of items still to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* always true: the strided variant keeps the per-item byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: this variant stages each source item in a local */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: this variant stages each converted item in a local */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: this variant does not assert alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ushort_to_ushort\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active: copy the source item into the local instead of a typed load */
+#endif
+
+/* Do the cast (here only the #elif !0 statement under the outer #else is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active: convert the staged source item */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active: copy the converted item out to dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;  /* active: advance both char pointers by their byte steps */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* drop the per-loop macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so this guard is always true and the aligned loop is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair, so the plain definitions under #else are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort  /* item type read from src */
+#define _TYPE2 npy_ushort  /* item type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function (the final #else branch is the one compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+/* Aligned, contiguous loop: casts dimensions[0] npy_ushort items at args[0] to npy_ushort items at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ushort_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] is src, args[1] is dst */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the item count; strides is not read by this contiguous variant */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* number of items still to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: the contiguous variant does not read strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: the aligned variant declares no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: the aligned variant declares no destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ushort_to_ushort\n");*/
+
+    while (N--) {
+#if 1  /* aligned path: nothing is loaded here because the inner #if 0 is disabled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (here only the innermost #else statement is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active: typed load from src, convert, typed store to dst */
+#  endif
+#endif
+
+#if 1  /* aligned path: no write-back is needed, the cast above already stored into dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ushort);  /* active: contiguous layout, so step by the item sizes */
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* drop the per-loop macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* !0 is 1, so this loop is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair, so the plain definitions under #else are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort  /* item type read from src */
+#define _TYPE2 npy_ushort  /* item type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function (the final #else branch is the one compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+/* Contiguous loop without alignment asserts: casts dimensions[0] npy_ushort items at args[0] to npy_ushort items at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ushort_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] is src, args[1] is dst */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the item count; strides is not read by this contiguous variant */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* number of items still to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: the contiguous variant does not read strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: this variant stages each source item in a local */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: this variant stages each converted item in a local */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: this variant does not assert alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ushort_to_ushort\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active: copy the source item into the local instead of a typed load */
+#endif
+
+/* Do the cast (here only the #elif !0 statement under the outer #else is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active: convert the staged source item */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active: copy the converted item out to dst */
+#endif
+
+#if 1
+        dst += sizeof(npy_ushort);  /* active: contiguous layout, so step by the item sizes */
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* drop the per-loop macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so this guard is always true and the aligned loop is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair, so the plain definitions under #else are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort  /* item type read from src */
+#define _TYPE2 npy_uint  /* item type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function (the final #else branch is the one compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+/* Aligned, strided loop: casts dimensions[0] npy_ushort items at args[0] to npy_uint items at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ushort_to_uint(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] is src, args[1] is dst */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the item count; strides[0] and strides[1] are the src and dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* number of items still to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* always true: the strided variant keeps the per-item byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: the aligned variant declares no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: the aligned variant declares no destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ushort_to_uint\n");*/
+
+    while (N--) {
+#if 1  /* aligned path: nothing is loaded here because the inner #if 0 is disabled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (here only the innermost #else statement is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active: typed load from src, convert, typed store to dst */
+#  endif
+#endif
+
+#if 1  /* aligned path: no write-back is needed, the cast above already stored into dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;  /* active: advance both char pointers by their byte steps */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* drop the per-loop macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* !0 is 1, so this loop is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair, so the plain definitions under #else are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort  /* item type read from src */
+#define _TYPE2 npy_uint  /* item type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function (the final #else branch is the one compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+/* Strided loop without alignment asserts: casts dimensions[0] npy_ushort items at args[0] to npy_uint items at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ushort_to_uint(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] is src, args[1] is dst */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the item count; strides[0] and strides[1] are the src and dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* number of items still to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* always true: the strided variant keeps the per-item byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: this variant stages each source item in a local */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: this variant stages each converted item in a local */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: this variant does not assert alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ushort_to_uint\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active: copy the source item into the local instead of a typed load */
+#endif
+
+/* Do the cast (here only the #elif !0 statement under the outer #else is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active: convert the staged source item */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active: copy the converted item out to dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;  /* active: advance both char pointers by their byte steps */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* drop the per-loop macros so the next expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ushort_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced as typed pointers below, so both must be aligned unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Casts dimensions[0] items npy_ushort -> npy_uint, stepping by sizeof() of each type (strides[] is not read); returns 0. Disabled trace: printf("_aligned_contig_cast_ushort_to_uint\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the branches below only the direct typed load from src and store to dst is compiled in this copy */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_ushort_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out (#if 0); this variant only reaches src/dst through memmove */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Casts dimensions[0] items npy_ushort -> npy_uint, stepping by sizeof() of each type (strides[] is not read); returns 0. Disabled trace: printf("_contig_cast_ushort_to_uint\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the branches below only dst_value = _CONVERT_FN(src_value) is compiled in this copy */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_ushort_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced as typed pointers below, so both must be aligned unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Casts dimensions[0] items npy_ushort -> npy_ulong, stepping by the byte strides in strides[]; returns 0. Disabled trace: printf("_aligned_cast_ushort_to_ulong\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the branches below only the direct typed load from src and store to dst is compiled in this copy */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_ushort_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out (#if 0); this variant only reaches src/dst through memmove */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Casts dimensions[0] items npy_ushort -> npy_ulong, stepping by the byte strides in strides[]; returns 0. Disabled trace: printf("_cast_ushort_to_ulong\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the branches below only dst_value = _CONVERT_FN(src_value) is compiled in this copy */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ushort_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced as typed pointers below, so both must be aligned unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Casts dimensions[0] items npy_ushort -> npy_ulong, stepping by sizeof() of each type (strides[] is not read); returns 0. Disabled trace: printf("_aligned_contig_cast_ushort_to_ulong\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the branches below only the direct typed load from src and store to dst is compiled in this copy */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_ushort_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out (#if 0); this variant only reaches src/dst through memmove */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Casts dimensions[0] items npy_ushort -> npy_ulong, stepping by sizeof() of each type (strides[] is not read); returns 0. Disabled trace: printf("_contig_cast_ushort_to_ulong\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the branches below only dst_value = _CONVERT_FN(src_value) is compiled in this copy */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_ushort_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced as typed pointers below, so both must be aligned unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Casts dimensions[0] items npy_ushort -> npy_ulonglong, stepping by the byte strides in strides[]; returns 0. Disabled trace: printf("_aligned_cast_ushort_to_ulonglong\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the branches below only the direct typed load from src and store to dst is compiled in this copy */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_ushort_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out (#if 0); this variant only reaches src/dst through memmove */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Casts dimensions[0] items npy_ushort -> npy_ulonglong, stepping by the byte strides in strides[]; returns 0. Disabled trace: printf("_cast_ushort_to_ulonglong\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the branches below only dst_value = _CONVERT_FN(src_value) is compiled in this copy */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ushort_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced as typed pointers below, so both must be aligned unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Casts dimensions[0] items npy_ushort -> npy_ulonglong, stepping by sizeof() of each type (strides[] is not read); returns 0. Disabled trace: printf("_aligned_contig_cast_ushort_to_ulonglong\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the branches below only the direct typed load from src and store to dst is compiled in this copy */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] tightly packed npy_ushort items at args[0] into npy_ulonglong items at args[1], staging each item through local temporaries with memmove; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ushort_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides is not read here */
+        NpyAuxData *NPY_UNUSED(data))  /* data is not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* remaining item count, decremented by the loop */
+    char *src = args[0], *dst = args[1];  /* byte pointers to the current input and output item */
+#if !1  /* compiled out: this variant steps by sizeof() and never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* compiled in: scalar staging copy of the source item */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* compiled in: scalar staging copy of the converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: this variant makes no alignment checks */
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ushort_to_ulonglong\n");*/
+
+    while (N--) {  /* one item per pass until N reaches zero */
+#if 0  /* compiled out: typed two-element load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled in: byte-copy the item into src_value (presumably src may be unaligned - confirm) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the alternatives below, only the scalar dst_value assignment is compiled in */
+#if 0  /* compiled out: alternatives that read src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* compiled in: alternatives for a single-element source */
+#  if 0  /* compiled out: two-element destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* compiled in: convert the staged value */
+    dst_value = _CONVERT_FN(src_value);  /* _CONVERT_FN is #defined above as a cast to _TYPE2 */
+#  else  /* compiled out: direct typed load and store */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* compiled out: typed two-element store */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled in: byte-copy dst_value to the output item */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* compiled in: step each pointer by its element size */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_ushort);
+#else  /* compiled out: stride-based stepping */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_ushort items at args[0] into npy_byte items at args[1], stepping by strides[0]/strides[1] bytes per item; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ushort_to_byte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides[0]/[1] = src/dst step */
+        NpyAuxData *NPY_UNUSED(data))  /* data is not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* remaining item count, decremented by the loop */
+    char *src = args[0], *dst = args[1];  /* byte pointers to the current input and output item */
+#if !0  /* compiled in: per-item byte steps supplied by the caller */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* compiled out: src is dereferenced directly, no temporary */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* compiled out: the result is stored through dst directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* compiled in */
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ushort_to_byte\n");*/
+
+    while (N--) {  /* one item per pass until N reaches zero */
+#if 1  /* branch taken, but its only content is compiled out */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled out: staged load through memmove */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the alternatives below, only the final direct store through dst is compiled in */
+#if 0  /* compiled out: alternatives that read src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* compiled in: alternatives for a single-element source */
+#  if 0  /* compiled out: two-element destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* compiled out: staged conversion through dst_value */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* compiled in: read, convert and store in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* _CONVERT_FN is #defined above as a cast to _TYPE2 */
+#  endif
+#endif
+
+#if 1  /* branch taken, but its only content is compiled out */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled out: staged store through memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* compiled out: fixed element-size stepping */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_ushort);
+#else  /* compiled in: step each pointer by the caller's stride */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_ushort items at args[0] into npy_byte items at args[1], stepping by strides[0]/strides[1] bytes and staging each item through local temporaries with memmove; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ushort_to_byte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides[0]/[1] = src/dst step */
+        NpyAuxData *NPY_UNUSED(data))  /* data is not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* remaining item count, decremented by the loop */
+    char *src = args[0], *dst = args[1];  /* byte pointers to the current input and output item */
+#if !0  /* compiled in: per-item byte steps supplied by the caller */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* compiled in: scalar staging copy of the source item */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* compiled in: scalar staging copy of the converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: this variant makes no alignment checks */
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ushort_to_byte\n");*/
+
+    while (N--) {  /* one item per pass until N reaches zero */
+#if 0  /* compiled out: typed two-element load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled in: byte-copy the item into src_value (presumably src may be unaligned - confirm) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the alternatives below, only the scalar dst_value assignment is compiled in */
+#if 0  /* compiled out: alternatives that read src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* compiled in: alternatives for a single-element source */
+#  if 0  /* compiled out: two-element destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* compiled in: convert the staged value */
+    dst_value = _CONVERT_FN(src_value);  /* _CONVERT_FN is #defined above as a cast to _TYPE2 */
+#  else  /* compiled out: direct typed load and store */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* compiled out: typed two-element store */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled in: byte-copy dst_value to the output item */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* compiled out: fixed element-size stepping */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_ushort);
+#else  /* compiled in: step each pointer by the caller's stride */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] tightly packed npy_ushort items at args[0] into npy_byte items at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ushort_to_byte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides is not read here */
+        NpyAuxData *NPY_UNUSED(data))  /* data is not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* remaining item count, decremented by the loop */
+    char *src = args[0], *dst = args[1];  /* byte pointers to the current input and output item */
+#if !1  /* compiled out: this variant steps by sizeof() and never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* compiled out: src is dereferenced directly, no temporary */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* compiled out: the result is stored through dst directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* compiled in */
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ushort_to_byte\n");*/
+
+    while (N--) {  /* one item per pass until N reaches zero */
+#if 1  /* branch taken, but its only content is compiled out */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled out: staged load through memmove */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the alternatives below, only the final direct store through dst is compiled in */
+#if 0  /* compiled out: alternatives that read src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* compiled in: alternatives for a single-element source */
+#  if 0  /* compiled out: two-element destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* compiled out: staged conversion through dst_value */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* compiled in: read, convert and store in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* _CONVERT_FN is #defined above as a cast to _TYPE2 */
+#  endif
+#endif
+
+#if 1  /* branch taken, but its only content is compiled out */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled out: staged store through memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* compiled in: step each pointer by its element size */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_ushort);
+#else  /* compiled out: stride-based stepping */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] tightly packed npy_ushort items at args[0] into npy_byte items at args[1], staging each item through local temporaries with memmove; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ushort_to_byte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides is not read here */
+        NpyAuxData *NPY_UNUSED(data))  /* data is not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* remaining item count, decremented by the loop */
+    char *src = args[0], *dst = args[1];  /* byte pointers to the current input and output item */
+#if !1  /* compiled out: this variant steps by sizeof() and never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* compiled in: scalar staging copy of the source item */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* compiled in: scalar staging copy of the converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: this variant makes no alignment checks */
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ushort_to_byte\n");*/
+
+    while (N--) {  /* one item per pass until N reaches zero */
+#if 0  /* compiled out: typed two-element load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled in: byte-copy the item into src_value (presumably src may be unaligned - confirm) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the alternatives below, only the scalar dst_value assignment is compiled in */
+#if 0  /* compiled out: alternatives that read src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* compiled in: alternatives for a single-element source */
+#  if 0  /* compiled out: two-element destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* compiled in: convert the staged value */
+    dst_value = _CONVERT_FN(src_value);  /* _CONVERT_FN is #defined above as a cast to _TYPE2 */
+#  else  /* compiled out: direct typed load and store */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* compiled out: typed two-element store */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled in: byte-copy dst_value to the output item */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* compiled in: step each pointer by its element size */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_ushort);
+#else  /* compiled out: stride-based stepping */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_ushort items at args[0] into npy_short items at args[1], stepping by strides[0]/strides[1] bytes per item; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ushort_to_short(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides[0]/[1] = src/dst step */
+        NpyAuxData *NPY_UNUSED(data))  /* data is not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* remaining item count, decremented by the loop */
+    char *src = args[0], *dst = args[1];  /* byte pointers to the current input and output item */
+#if !0  /* compiled in: per-item byte steps supplied by the caller */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* compiled out: src is dereferenced directly, no temporary */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* compiled out: the result is stored through dst directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* compiled in */
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ushort_to_short\n");*/
+
+    while (N--) {  /* one item per pass until N reaches zero */
+#if 1  /* branch taken, but its only content is compiled out */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled out: staged load through memmove */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the alternatives below, only the final direct store through dst is compiled in */
+#if 0  /* compiled out: alternatives that read src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* compiled in: alternatives for a single-element source */
+#  if 0  /* compiled out: two-element destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* compiled out: staged conversion through dst_value */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* compiled in: read, convert and store in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* _CONVERT_FN is #defined above as a cast to _TYPE2 */
+#  endif
+#endif
+
+#if 1  /* branch taken, but its only content is compiled out */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled out: staged store through memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* compiled out: fixed element-size stepping */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_ushort);
+#else  /* compiled in: step each pointer by the caller's stride */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_ushort items at args[0] into npy_short items at args[1], stepping by strides[0]/strides[1] bytes and staging each item through local temporaries with memmove; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ushort_to_short(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides[0]/[1] = src/dst step */
+        NpyAuxData *NPY_UNUSED(data))  /* data is not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* remaining item count, decremented by the loop */
+    char *src = args[0], *dst = args[1];  /* byte pointers to the current input and output item */
+#if !0  /* compiled in: per-item byte steps supplied by the caller */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* compiled in: scalar staging copy of the source item */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* compiled in: scalar staging copy of the converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: this variant makes no alignment checks */
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ushort_to_short\n");*/
+
+    while (N--) {  /* one item per pass until N reaches zero */
+#if 0  /* compiled out: typed two-element load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled in: byte-copy the item into src_value (presumably src may be unaligned - confirm) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the alternatives below, only the scalar dst_value assignment is compiled in */
+#if 0  /* compiled out: alternatives that read src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* compiled in: alternatives for a single-element source */
+#  if 0  /* compiled out: two-element destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* compiled in: convert the staged value */
+    dst_value = _CONVERT_FN(src_value);  /* _CONVERT_FN is #defined above as a cast to _TYPE2 */
+#  else  /* compiled out: direct typed load and store */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* compiled out: typed two-element store */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled in: byte-copy dst_value to the output item */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* compiled out: fixed element-size stepping */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_ushort);
+#else  /* compiled in: step each pointer by the caller's stride */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] tightly packed npy_ushort items at args[0] into npy_short items at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ushort_to_short(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides is not read here */
+        NpyAuxData *NPY_UNUSED(data))  /* data is not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* remaining item count, decremented by the loop */
+    char *src = args[0], *dst = args[1];  /* byte pointers to the current input and output item */
+#if !1  /* compiled out: this variant steps by sizeof() and never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* compiled out: src is dereferenced directly, no temporary */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* compiled out: the result is stored through dst directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* compiled in */
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ushort_to_short\n");*/
+
+    while (N--) {  /* one item per pass until N reaches zero */
+#if 1  /* branch taken, but its only content is compiled out */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled out: staged load through memmove */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the alternatives below, only the final direct store through dst is compiled in */
+#if 0  /* compiled out: alternatives that read src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* compiled in: alternatives for a single-element source */
+#  if 0  /* compiled out: two-element destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* compiled out: staged conversion through dst_value */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* compiled in: read, convert and store in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* _CONVERT_FN is #defined above as a cast to _TYPE2 */
+#  endif
+#endif
+
+#if 1  /* branch taken, but its only content is compiled out */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled out: staged store through memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* compiled in: step each pointer by its element size */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_ushort);
+#else  /* compiled out: stride-based stepping */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] tightly packed npy_ushort items at args[0] into npy_short items at args[1], staging each item through local temporaries with memmove; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ushort_to_short(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides is not read here */
+        NpyAuxData *NPY_UNUSED(data))  /* data is not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* remaining item count, decremented by the loop */
+    char *src = args[0], *dst = args[1];  /* byte pointers to the current input and output item */
+#if !1  /* compiled out: this variant steps by sizeof() and never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* compiled in: scalar staging copy of the source item */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* compiled in: scalar staging copy of the converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: this variant makes no alignment checks */
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ushort_to_short\n");*/
+
+    while (N--) {  /* one item per pass until N reaches zero */
+#if 0  /* compiled out: typed two-element load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled in: byte-copy the item into src_value (presumably src may be unaligned - confirm) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the alternatives below, only the scalar dst_value assignment is compiled in */
+#if 0  /* compiled out: alternatives that read src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* compiled in: alternatives for a single-element source */
+#  if 0  /* compiled out: two-element destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* compiled in: convert the staged value */
+    dst_value = _CONVERT_FN(src_value);  /* _CONVERT_FN is #defined above as a cast to _TYPE2 */
+#  else  /* compiled out: direct typed load and store */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* compiled out: typed two-element store */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled in: byte-copy dst_value to the output item */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* compiled in: step each pointer by its element size */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_ushort);
+#else  /* compiled out: stride-based stepping */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* The "&& !1" term is always 0, so this guard is always true and the block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Active branch: the element types this instantiation converts between. */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Both branches above are disabled; the bool test below is "0 || 0", so the plain C cast is chosen. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_ushort -> npy_int cast loop for aligned data: typed load/store per element; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ushort_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: source, destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element steps in bytes */
+#endif
+    /* No src_value/dst_value temporaries are declared: every branch below is "#if 0" or "#elif !1". */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Enabled: assert() that src and dst satisfy the element types' alignment whenever N is nonzero. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* The context argument is not referenced anywhere in this body. */
+    /*printf("_aligned_cast_ushort_to_int\n");*/
+    /* One loop iteration per element; N counts down to zero. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing was loaded above: the enabled "#if 1" only wraps a dead "#  if 0". */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store: the cast above already wrote through the typed dst pointer. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided variant: advance both byte cursors by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the helper macros so a following instantiation can define its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* "&& !0" reduces this guard to !NPY_USE_UNALIGNED_ACCESS: the block is skipped when that is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Active branch: the element types this instantiation converts between. */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Both branches above are disabled; the bool test below is "0 || 0", so the plain C cast is chosen. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_ushort -> npy_int cast loop without alignment assumptions (memmove staging); returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ushort_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: source, destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element steps in bytes */
+#endif
+    /* Scalar temporaries ("#elif !0" branches) that stage one element for the memmove-based access. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts below are compiled out ("#if 0") in this variant. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* The context argument is not referenced anywhere in this body. */
+    /*printf("_cast_ushort_to_int\n");*/
+    /* One loop iteration per element; N counts down to zero. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* src_value now holds one source element, copied bytewise so src may be unaligned. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write the converted value out bytewise so dst may be unaligned. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided variant: advance both byte cursors by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the helper macros so a following instantiation can define its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* The "&& !1" term is always 0, so this guard is always true and the block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Active branch: the element types this instantiation converts between. */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Both branches above are disabled; the bool test below is "0 || 0", so the plain C cast is chosen. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_ushort -> npy_int cast loop for aligned data: typed load/store per element; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ushort_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: source, destination */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No src_value/dst_value temporaries are declared: every branch below is "#if 0" or "#elif !1". */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Enabled: assert() that src and dst satisfy the element types' alignment whenever N is nonzero. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Neither context nor strides is referenced by the code that is compiled here. */
+    /*printf("_aligned_contig_cast_ushort_to_int\n");*/
+    /* One loop iteration per element; N counts down to zero. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing was loaded above: the enabled "#if 1" only wraps a dead "#  if 0". */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store: the cast above already wrote through the typed dst pointer. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous variant: advance each byte cursor by its element size. */
+#if 1
+        dst += sizeof(npy_int);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the helper macros so a following instantiation can define its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* "&& !0" reduces this guard to !NPY_USE_UNALIGNED_ACCESS: the block is skipped when that is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Active branch: the element types this instantiation converts between. */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Both branches above are disabled; the bool test below is "0 || 0", so the plain C cast is chosen. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_ushort -> npy_int cast loop without alignment assumptions (memmove staging); returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ushort_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: source, destination */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries ("#elif !0" branches) that stage one element for the memmove-based access. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts below are compiled out ("#if 0") in this variant. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Neither context nor strides is referenced by the code that is compiled here. */
+    /*printf("_contig_cast_ushort_to_int\n");*/
+    /* One loop iteration per element; N counts down to zero. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* src_value now holds one source element, copied bytewise so src may be unaligned. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write the converted value out bytewise so dst may be unaligned. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous variant: advance each byte cursor by its element size. */
+#if 1
+        dst += sizeof(npy_int);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the helper macros so a following instantiation can define its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* The "&& !1" term is always 0, so this guard is always true and the block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+/* Active branch: the element types this instantiation converts between. */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Both branches above are disabled; the bool test below is "0 || 0", so the plain C cast is chosen. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_ushort -> npy_long cast loop for aligned data: typed load/store per element; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ushort_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: source, destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element steps in bytes */
+#endif
+    /* No src_value/dst_value temporaries are declared: every branch below is "#if 0" or "#elif !1". */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Enabled: assert() that src and dst satisfy the element types' alignment whenever N is nonzero. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* The context argument is not referenced anywhere in this body. */
+    /*printf("_aligned_cast_ushort_to_long\n");*/
+    /* One loop iteration per element; N counts down to zero. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing was loaded above: the enabled "#if 1" only wraps a dead "#  if 0". */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store: the cast above already wrote through the typed dst pointer. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided variant: advance both byte cursors by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the helper macros so a following instantiation can define its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* "&& !0" reduces this guard to !NPY_USE_UNALIGNED_ACCESS: the block is skipped when that is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+/* Active branch: the element types this instantiation converts between. */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Both branches above are disabled; the bool test below is "0 || 0", so the plain C cast is chosen. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_ushort -> npy_long cast loop without alignment assumptions (memmove staging); returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ushort_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: source, destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element steps in bytes */
+#endif
+    /* Scalar temporaries ("#elif !0" branches) that stage one element for the memmove-based access. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts below are compiled out ("#if 0") in this variant. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* The context argument is not referenced anywhere in this body. */
+    /*printf("_cast_ushort_to_long\n");*/
+    /* One loop iteration per element; N counts down to zero. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* src_value now holds one source element, copied bytewise so src may be unaligned. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write the converted value out bytewise so dst may be unaligned. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided variant: advance both byte cursors by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the helper macros so a following instantiation can define its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* The "&& !1" term is always 0, so this guard is always true and the block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+/* Active branch: the element types this instantiation converts between. */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Both branches above are disabled; the bool test below is "0 || 0", so the plain C cast is chosen. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_ushort -> npy_long cast loop for aligned data: typed load/store per element; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ushort_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: source, destination */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No src_value/dst_value temporaries are declared: every branch below is "#if 0" or "#elif !1". */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Enabled: assert() that src and dst satisfy the element types' alignment whenever N is nonzero. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Neither context nor strides is referenced by the code that is compiled here. */
+    /*printf("_aligned_contig_cast_ushort_to_long\n");*/
+    /* One loop iteration per element; N counts down to zero. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing was loaded above: the enabled "#if 1" only wraps a dead "#  if 0". */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store: the cast above already wrote through the typed dst pointer. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous variant: advance each byte cursor by its element size. */
+#if 1
+        dst += sizeof(npy_long);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the helper macros so a following instantiation can define its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* "&& !0" reduces this guard to !NPY_USE_UNALIGNED_ACCESS: the block is skipped when that is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+/* Active branch: the element types this instantiation converts between. */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Both branches above are disabled; the bool test below is "0 || 0", so the plain C cast is chosen. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_ushort -> npy_long cast loop without alignment assumptions (memmove staging); returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ushort_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: source, destination */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries ("#elif !0" branches) that stage one element for the memmove-based access. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts below are compiled out ("#if 0") in this variant. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Neither context nor strides is referenced by the code that is compiled here. */
+    /*printf("_contig_cast_ushort_to_long\n");*/
+    /* One loop iteration per element; N counts down to zero. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* src_value now holds one source element, copied bytewise so src may be unaligned. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write the converted value out bytewise so dst may be unaligned. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous variant: advance each byte cursor by its element size. */
+#if 1
+        dst += sizeof(npy_long);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the helper macros so a following instantiation can define its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* The "&& !1" term is always 0, so this guard is always true and the block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+/* Active branch: the element types this instantiation converts between. */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Both branches above are disabled; the bool test below is "0 || 0", so the plain C cast is chosen. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_ushort -> npy_longlong cast loop for aligned data: typed load/store per element; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ushort_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: source, destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element steps in bytes */
+#endif
+    /* No src_value/dst_value temporaries are declared: every branch below is "#if 0" or "#elif !1". */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Enabled: assert() that src and dst satisfy the element types' alignment whenever N is nonzero. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* The context argument is not referenced anywhere in this body. */
+    /*printf("_aligned_cast_ushort_to_longlong\n");*/
+    /* One loop iteration per element; N counts down to zero. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing was loaded above: the enabled "#if 1" only wraps a dead "#  if 0". */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store: the cast above already wrote through the typed dst pointer. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided variant: advance both byte cursors by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the helper macros so a following instantiation can define its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_ushort_to_longlong(  /* strided loop, no alignment assumed: converts N elements of _TYPE1 to _TYPE2 */
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !0  /* strided variant: byte steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* unaligned variant: typed temporaries filled/drained with memmove */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ushort_to_longlong\n");*/
+
+    while (N--) {
+#if 0  /* unaligned: the #else branch copies the source bytes into src_value */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* unaligned: the #else branch copies dst_value out to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* strided: the #else branch advances by the caller-supplied byte strides */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_contig_cast_ushort_to_longlong(  /* aligned, contiguous loop: converts N elements of _TYPE1 to _TYPE2 */
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !1  /* contiguous variant: strides[] is not read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* aligned variant: no staging temporaries are declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ushort_to_longlong\n");*/
+
+    while (N--) {
+#if 1  /* aligned: no staging copy; the source is read in place by the cast */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch stores _CONVERT_FN(*(_TYPE1 *)src) into *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* aligned: the cast already wrote dst, nothing to copy back */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous: advance each pointer by one element of its type */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_contig_cast_ushort_to_longlong(  /* contiguous loop, no alignment assumed: converts N elements of _TYPE1 to _TYPE2 */
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !1  /* contiguous variant: strides[] is not read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* unaligned variant: typed temporaries filled/drained with memmove */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ushort_to_longlong\n");*/
+
+    while (N--) {
+#if 0  /* unaligned: the #else branch copies the source bytes into src_value */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* unaligned: the #else branch copies dst_value out to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous: advance each pointer by one element of its type */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_cast_ushort_to_half(  /* aligned, strided loop: converts N elements of _TYPE1 to _TYPE2 */
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !0  /* strided variant: byte steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* aligned variant: no staging temporaries are declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ushort_to_half\n");*/
+
+    while (N--) {
+#if 1  /* aligned: no staging copy; the source is read in place by the cast */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch stores _CONVERT_FN(*(_TYPE1 *)src) into *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* aligned: the cast already wrote dst, nothing to copy back */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* strided: the #else branch advances by the caller-supplied byte strides */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_ushort_to_half(  /* strided loop, no alignment assumed: converts N elements of _TYPE1 to _TYPE2 */
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !0  /* strided variant: byte steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* unaligned variant: typed temporaries filled/drained with memmove */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ushort_to_half\n");*/
+
+    while (N--) {
+#if 0  /* unaligned: the #else branch copies the source bytes into src_value */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* unaligned: the #else branch copies dst_value out to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* strided: the #else branch advances by the caller-supplied byte strides */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_contig_cast_ushort_to_half(  /* aligned, contiguous loop: converts N elements of _TYPE1 to _TYPE2 */
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !1  /* contiguous variant: strides[] is not read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* aligned variant: no staging temporaries are declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ushort_to_half\n");*/
+
+    while (N--) {
+#if 1  /* aligned: no staging copy; the source is read in place by the cast */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch stores _CONVERT_FN(*(_TYPE1 *)src) into *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* aligned: the cast already wrote dst, nothing to copy back */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous: advance each pointer by one element of its type */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_contig_cast_ushort_to_half(  /* contiguous loop, no alignment assumed: converts N elements of _TYPE1 to _TYPE2 */
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !1  /* contiguous variant: strides[] is not read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* unaligned variant: typed temporaries filled/drained with memmove */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ushort_to_half\n");*/
+
+    while (N--) {
+#if 0  /* unaligned: the #else branch copies the source bytes into src_value */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* unaligned: the #else branch copies dst_value out to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous: advance each pointer by one element of its type */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_cast_ushort_to_float(  /* aligned, strided loop: converts N elements of _TYPE1 to _TYPE2 */
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !0  /* strided variant: byte steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* aligned variant: no staging temporaries are declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ushort_to_float\n");*/
+
+    while (N--) {
+#if 1  /* aligned: no staging copy; the source is read in place by the cast */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch stores _CONVERT_FN(*(_TYPE1 *)src) into *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* aligned: the cast already wrote dst, nothing to copy back */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* strided: the #else branch advances by the caller-supplied byte strides */
+        dst += sizeof(npy_float);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_ushort_to_float(  /* strided loop, no alignment assumed: converts N elements of _TYPE1 to _TYPE2 */
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !0  /* strided variant: byte steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* unaligned variant: typed temporaries filled/drained with memmove */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ushort_to_float\n");*/
+
+    while (N--) {
+#if 0  /* unaligned: the #else branch copies the source bytes into src_value */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* unaligned: the #else branch copies dst_value out to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* strided: the #else branch advances by the caller-supplied byte strides */
+        dst += sizeof(npy_float);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Note: `!1` is 0, so the guard above is true for any value of
+ * NPY_USE_UNALIGNED_ACCESS and this section is always compiled.
+ *
+ * Type and conversion macros for this instantiation; they are removed
+ * again by the #undef lines at the end of the section.
+ */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast a packed run of npy_ushort values to npy_float.
+ *
+ * args[0] / args[1] are the source / destination byte pointers and
+ * dimensions[0] is the element count.  `strides` is not read: the pointers
+ * advance by sizeof(npy_ushort) and sizeof(npy_float) respectively.
+ * `context` and `data` are not read either.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ushort_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check: a non-empty run must start on aligned addresses */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Direct typed load/store; relies on the alignment asserted above. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        dst += sizeof(npy_float);
+        src += sizeof(npy_ushort);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Note: `!0` is 1, so this section is compiled only when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to 0.
+ *
+ * Type and conversion macros for this instantiation; they are removed
+ * again by the #undef lines at the end of the section.
+ */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast a packed run of npy_ushort values to npy_float.
+ *
+ * args[0] / args[1] are the source / destination byte pointers and
+ * dimensions[0] is the element count.  `strides` is not read: the pointers
+ * advance by sizeof(npy_ushort) and sizeof(npy_float) respectively.
+ * `context` and `data` are not read either.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ushort_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining;
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    /* Count down to zero; a zero element count performs no work. */
+    for (remaining = dimensions[0]; remaining != 0; --remaining) {
+        /*
+         * Go through local temporaries with memmove instead of typed
+         * pointer dereferences (presumably because this variant makes no
+         * alignment promise about the buffers -- confirm with callers).
+         */
+        memmove(&loaded, in_ptr, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        out_ptr += sizeof(npy_float);
+        in_ptr += sizeof(npy_ushort);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Note: `!1` is 0, so the guard above is true for any value of
+ * NPY_USE_UNALIGNED_ACCESS and this section is always compiled.
+ *
+ * Type and conversion macros for this instantiation; they are removed
+ * again by the #undef lines at the end of the section.
+ */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast of npy_ushort values to npy_double.
+ *
+ * args[0] / args[1] are the source / destination byte pointers,
+ * dimensions[0] is the element count, and strides[0] / strides[1] are the
+ * byte steps applied to source / destination after every element.
+ * `context` and `data` are not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ushort_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check: a non-empty run must start on aligned addresses */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Direct typed load/store; relies on the alignment asserted above. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Note: `!0` is 1, so this section is compiled only when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to 0.
+ *
+ * Type and conversion macros for this instantiation; they are removed
+ * again by the #undef lines at the end of the section.
+ */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast of npy_ushort values to npy_double.
+ *
+ * args[0] / args[1] are the source / destination byte pointers,
+ * dimensions[0] is the element count, and strides[0] / strides[1] are the
+ * byte steps applied to source / destination after every element.
+ * `context` and `data` are not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ushort_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining;
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    /* Count down to zero; a zero element count performs no work. */
+    for (remaining = dimensions[0]; remaining != 0; --remaining) {
+        /*
+         * Go through local temporaries with memmove instead of typed
+         * pointer dereferences (presumably because this variant makes no
+         * alignment promise about the buffers -- confirm with callers).
+         */
+        memmove(&loaded, in_ptr, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        out_ptr += out_step;
+        in_ptr += in_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Note: `!1` is 0, so the guard above is true for any value of
+ * NPY_USE_UNALIGNED_ACCESS and this section is always compiled.
+ *
+ * Type and conversion macros for this instantiation; they are removed
+ * again by the #undef lines at the end of the section.
+ */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast a packed run of npy_ushort values to npy_double.
+ *
+ * args[0] / args[1] are the source / destination byte pointers and
+ * dimensions[0] is the element count.  `strides` is not read: the pointers
+ * advance by sizeof(npy_ushort) and sizeof(npy_double) respectively.
+ * `context` and `data` are not read either.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ushort_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check: a non-empty run must start on aligned addresses */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Direct typed load/store; relies on the alignment asserted above. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        dst += sizeof(npy_double);
+        src += sizeof(npy_ushort);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Note: `!0` is 1, so this section is compiled only when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to 0.
+ *
+ * Type and conversion macros for this instantiation; they are removed
+ * again by the #undef lines at the end of the section.
+ */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast a packed run of npy_ushort values to npy_double.
+ *
+ * args[0] / args[1] are the source / destination byte pointers and
+ * dimensions[0] is the element count.  `strides` is not read: the pointers
+ * advance by sizeof(npy_ushort) and sizeof(npy_double) respectively.
+ * `context` and `data` are not read either.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ushort_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining;
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    /* Count down to zero; a zero element count performs no work. */
+    for (remaining = dimensions[0]; remaining != 0; --remaining) {
+        /*
+         * Go through local temporaries with memmove instead of typed
+         * pointer dereferences (presumably because this variant makes no
+         * alignment promise about the buffers -- confirm with callers).
+         */
+        memmove(&loaded, in_ptr, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        out_ptr += sizeof(npy_double);
+        in_ptr += sizeof(npy_ushort);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Note: `!1` is 0, so the guard above is true for any value of
+ * NPY_USE_UNALIGNED_ACCESS and this section is always compiled.
+ *
+ * Type and conversion macros for this instantiation; they are removed
+ * again by the #undef lines at the end of the section.
+ */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast of npy_ushort values to npy_longdouble.
+ *
+ * args[0] / args[1] are the source / destination byte pointers,
+ * dimensions[0] is the element count, and strides[0] / strides[1] are the
+ * byte steps applied to source / destination after every element.
+ * `context` and `data` are not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ushort_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check: a non-empty run must start on aligned addresses */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Direct typed load/store; relies on the alignment asserted above. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Note: `!0` is 1, so this section is compiled only when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to 0.
+ *
+ * Type and conversion macros for this instantiation; they are removed
+ * again by the #undef lines at the end of the section.
+ */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast of npy_ushort values to npy_longdouble.
+ *
+ * args[0] / args[1] are the source / destination byte pointers,
+ * dimensions[0] is the element count, and strides[0] / strides[1] are the
+ * byte steps applied to source / destination after every element.
+ * `context` and `data` are not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ushort_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining;
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    /* Count down to zero; a zero element count performs no work. */
+    for (remaining = dimensions[0]; remaining != 0; --remaining) {
+        /*
+         * Go through local temporaries with memmove instead of typed
+         * pointer dereferences (presumably because this variant makes no
+         * alignment promise about the buffers -- confirm with callers).
+         */
+        memmove(&loaded, in_ptr, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        out_ptr += out_step;
+        in_ptr += in_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Note: `!1` is 0, so the guard above is true for any value of
+ * NPY_USE_UNALIGNED_ACCESS and this section is always compiled.
+ *
+ * Type and conversion macros for this instantiation; they are removed
+ * again by the #undef lines at the end of the section.
+ */
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast a packed run of npy_ushort values to npy_longdouble.
+ *
+ * args[0] / args[1] are the source / destination byte pointers and
+ * dimensions[0] is the element count.  `strides` is not read: the pointers
+ * advance by sizeof(npy_ushort) and sizeof(npy_longdouble) respectively.
+ * `context` and `data` are not read either.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ushort_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check: a non-empty run must start on aligned addresses */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Direct typed load/store; relies on the alignment asserted above. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_ushort);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_ushort_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Cast dimensions[0] npy_ushort items at args[0] to npy_longdouble at args[1]; the only return is 0. */
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No stride locals here: this variant steps by sizeof() below; `context` is not referenced. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* scratch copy of the current input item */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* scratch copy of the current output item */
+#endif
+
+#if 0
+    /* sanity check (compiled out by the #if 0 above in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ushort_to_longdouble\n");*/
+    /* Each pass: memmove the input bytes into src_value, convert, memmove dst_value out. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only `dst_value = _CONVERT_FN(src_value);` is compiled; _CONVERT_FN is the ((_TYPE2)x) cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* step both cursors by one element */
+#if 1
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_ushort_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Cast dimensions[0] npy_ushort items at args[0] into two-slot npy_float outputs at args[1]; the only return is 0. */
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* added to the char cursors once per pass */
+#endif
+    /* `context` is not referenced in this function. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] = converted value, [1] = 0 */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+    /* sanity check: unless N == 0, src/dst must be aligned for _TYPE1/_TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ushort_to_cfloat\n");*/
+    /* Each pass: read *(_TYPE1 *)src directly, fill dst_value, write both slots through (_TYPE2 *)dst. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src) and dst_value[1] = 0 pair is compiled; _CONVERT_FN is the ((_TYPE2)x) cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second slot is always zero (presumably the imaginary part - confirm) */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* step both cursors by the caller-supplied strides */
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_ushort_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Cast dimensions[0] npy_ushort items at args[0] into two-slot npy_float outputs at args[1]; the only return is 0. */
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* added to the char cursors once per pass */
+#endif
+    /* `context` is not referenced in this function. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* scratch copy of the current input item */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] = converted value, [1] = 0 */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+    /* sanity check (compiled out by the #if 0 above in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ushort_to_cfloat\n");*/
+    /* Each pass: memmove the input bytes into src_value, fill dst_value, memmove both slots out. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the dst_value[0] = _CONVERT_FN(src_value) and dst_value[1] = 0 pair is compiled; _CONVERT_FN is the ((_TYPE2)x) cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second slot is always zero (presumably the imaginary part - confirm) */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* step both cursors by the caller-supplied strides */
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ushort_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Cast dimensions[0] npy_ushort items at args[0] into two-slot npy_float outputs at args[1]; the only return is 0. */
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No stride locals here: this variant steps by sizeof() below; `context` is not referenced. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] = converted value, [1] = 0 */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+    /* sanity check: unless N == 0, src/dst must be aligned for _TYPE1/_TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ushort_to_cfloat\n");*/
+    /* Each pass: read *(_TYPE1 *)src directly, fill dst_value, write both slots through (_TYPE2 *)dst. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src) and dst_value[1] = 0 pair is compiled; _CONVERT_FN is the ((_TYPE2)x) cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second slot is always zero (presumably the imaginary part - confirm) */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* step both cursors by one element */
+#if 1
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_ushort_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Cast dimensions[0] npy_ushort items at args[0] into two-slot npy_float outputs at args[1]; the only return is 0. */
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No stride locals here: this variant steps by sizeof() below; `context` is not referenced. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* scratch copy of the current input item */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] = converted value, [1] = 0 */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+    /* sanity check (compiled out by the #if 0 above in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ushort_to_cfloat\n");*/
+    /* Each pass: memmove the input bytes into src_value, fill dst_value, memmove both slots out. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the dst_value[0] = _CONVERT_FN(src_value) and dst_value[1] = 0 pair is compiled; _CONVERT_FN is the ((_TYPE2)x) cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second slot is always zero (presumably the imaginary part - confirm) */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* step both cursors by one element */
+#if 1
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_ushort_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Cast dimensions[0] npy_ushort items at args[0] into two-slot npy_double outputs at args[1]; the only return is 0. */
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* added to the char cursors once per pass */
+#endif
+    /* `context` is not referenced in this function. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] = converted value, [1] = 0 */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+    /* sanity check: unless N == 0, src/dst must be aligned for _TYPE1/_TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ushort_to_cdouble\n");*/
+    /* Each pass: read *(_TYPE1 *)src directly, fill dst_value, write both slots through (_TYPE2 *)dst. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src) and dst_value[1] = 0 pair is compiled; _CONVERT_FN is the ((_TYPE2)x) cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second slot is always zero (presumably the imaginary part - confirm) */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* step both cursors by the caller-supplied strides */
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_ushort_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Cast dimensions[0] npy_ushort items at args[0] into two-slot npy_double outputs at args[1]; the only return is 0. */
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* added to the char cursors once per pass */
+#endif
+    /* `context` is not referenced in this function. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* scratch copy of the current input item */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] = converted value, [1] = 0 */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+    /* sanity check (compiled out by the #if 0 above in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ushort_to_cdouble\n");*/
+    /* Each pass: memmove the input bytes into src_value, fill dst_value, memmove both slots out. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the dst_value[0] = _CONVERT_FN(src_value) and dst_value[1] = 0 pair is compiled; _CONVERT_FN is the ((_TYPE2)x) cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second slot is always zero (presumably the imaginary part - confirm) */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* step both cursors by the caller-supplied strides */
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ushort_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Cast dimensions[0] npy_ushort items at args[0] into two-slot npy_double outputs at args[1]; the only return is 0. */
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No stride locals here: this variant steps by sizeof() below; `context` is not referenced. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] = converted value, [1] = 0 */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+    /* sanity check: unless N == 0, src/dst must be aligned for _TYPE1/_TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ushort_to_cdouble\n");*/
+    /* Each pass: read *(_TYPE1 *)src directly, fill dst_value, write both slots through (_TYPE2 *)dst. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src) and dst_value[1] = 0 pair is compiled; _CONVERT_FN is the ((_TYPE2)x) cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second slot is always zero (presumably the imaginary part - confirm) */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* step both cursors by one element */
+#if 1
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_ushort_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Cast dimensions[0] npy_ushort items at args[0] into two-slot npy_double outputs at args[1]; the only return is 0. */
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No stride locals here: this variant steps by sizeof() below; `context` is not referenced. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* scratch copy of the current input item */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] = converted value, [1] = 0 */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+    /* sanity check (compiled out by the #if 0 above in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ushort_to_cdouble\n");*/
+    /* Each pass: memmove the input bytes into src_value, fill dst_value, memmove both slots out. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the dst_value[0] = _CONVERT_FN(src_value) and dst_value[1] = 0 pair is compiled; _CONVERT_FN is the ((_TYPE2)x) cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second slot is always zero (presumably the imaginary part - confirm) */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* step both cursors by one element */
+#if 1
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* the && !1 term is constant false, so this block is always compiled */
+
+/* For half types, don't use actual double/float types in conversion; that branch is disabled (0 || 0) here, so the #else definitions apply */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function; both earlier branches are disabled here, so the last #else picks a plain cast to _TYPE2 */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop over dimensions[0] items: read a npy_ushort from args[0] through a typed pointer (alignment is asserted) and write the npy_longdouble pair (value, 0) to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ushort_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be suitably aligned whenever N is nonzero */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ushort_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch sets dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src) and dst_value[1] = 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_ushort);
+#else  /* strided variant: step by the strides[] values (src/dst are char pointers) */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion; that branch is disabled (0 || 0) here, so the #else definitions apply */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function; both earlier branches are disabled here, so the last #else picks a plain cast to _TYPE2 */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop over dimensions[0] items: memmove a npy_ushort out of args[0], convert it, and memmove the npy_longdouble pair (value, 0) into args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ushort_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the enclosing #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ushort_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load through memmove instead of a typed pointer read */
+#endif
+
+/* Do the cast: the active branch sets dst_value[0] = _CONVERT_FN(src_value) and dst_value[1] = 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store through memmove instead of a typed pointer write */
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_ushort);
+#else  /* strided variant: step by the strides[] values (src/dst are char pointers) */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* the && !1 term is constant false, so this block is always compiled */
+
+/* For half types, don't use actual double/float types in conversion; that branch is disabled (0 || 0) here, so the #else definitions apply */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function; both earlier branches are disabled here, so the last #else picks a plain cast to _TYPE2 */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop over dimensions[0] items: read a npy_ushort from args[0] through a typed pointer (alignment is asserted) and write the npy_longdouble pair (value, 0) to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ushort_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be suitably aligned whenever N is nonzero */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ushort_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch sets dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src) and dst_value[1] = 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous variant: step by element sizes */
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion; that branch is disabled (0 || 0) here, so the #else definitions apply */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ushort
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ushort
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function; both earlier branches are disabled here, so the last #else picks a plain cast to _TYPE2 */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop over dimensions[0] items: memmove a npy_ushort out of args[0], convert it, and memmove the npy_longdouble pair (value, 0) into args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ushort_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the enclosing #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ushort_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load through memmove instead of a typed pointer read */
+#endif
+
+/* Do the cast: the active branch sets dst_value[0] = _CONVERT_FN(src_value) and dst_value[1] = 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store through memmove instead of a typed pointer write */
+#endif
+
+#if 1  /* contiguous variant: step by element sizes */
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_ushort);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* the && !1 term is constant false, so this block is always compiled */
+
+/* For half types, don't use actual double/float types in conversion; that branch is disabled (0 || 0) here, so the #else definitions apply */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function; both earlier branches are disabled here, so the last #else picks the (x != 0) form narrowed to npy_bool */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop over dimensions[0] items: read a npy_uint from args[0] through a typed pointer (alignment is asserted) and write (value != 0) as npy_bool to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_uint_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be suitably aligned whenever N is nonzero */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_uint_to_bool\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts *(_TYPE1 *)src and stores it straight into *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_uint);
+#else  /* strided variant: step by the strides[] values (src/dst are char pointers) */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion; that branch is disabled (0 || 0) here, so the #else definitions apply */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function; both earlier branches are disabled here, so the last #else picks the (x != 0) form narrowed to npy_bool */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop over dimensions[0] items: memmove a npy_uint out of args[0], compute (value != 0) as npy_bool, and memmove it into args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_uint_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the enclosing #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_uint_to_bool\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load through memmove instead of a typed pointer read */
+#endif
+
+/* Do the cast: the active branch sets dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store through memmove instead of a typed pointer write */
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_uint);
+#else  /* strided variant: step by the strides[] values (src/dst are char pointers) */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* the && !1 term is constant false, so this block is always compiled */
+
+/* For half types, don't use actual double/float types in conversion; that branch is disabled (0 || 0) here, so the #else definitions apply */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function; both earlier branches are disabled here, so the last #else picks the (x != 0) form narrowed to npy_bool */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop over dimensions[0] items: read a npy_uint from args[0] through a typed pointer (alignment is asserted) and write (value != 0) as npy_bool to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_uint_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be suitably aligned whenever N is nonzero */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_uint_to_bool\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts *(_TYPE1 *)src and stores it straight into *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous variant: step by element sizes */
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion; that branch is disabled (0 || 0) here, so the #else definitions apply */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function; both earlier branches are disabled here, so the last #else picks the (x != 0) form narrowed to npy_bool */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop over dimensions[0] items: memmove a npy_uint out of args[0], compute (value != 0) as npy_bool, and memmove it into args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_uint_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the enclosing #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_uint_to_bool\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load through memmove instead of a typed pointer read */
+#endif
+
+/* Do the cast: the active branch sets dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store through memmove instead of a typed pointer write */
+#endif
+
+#if 1  /* contiguous variant: step by element sizes */
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* the && !1 term is constant false, so this block is always compiled */
+
+/* For half types, don't use actual double/float types in conversion; that branch is disabled (0 || 0) here, so the #else definitions apply */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function; both earlier branches are disabled here, so the last #else picks a plain cast to _TYPE2 */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop over dimensions[0] items: read a npy_uint from args[0] through a typed pointer (alignment is asserted) and write it cast to npy_ubyte to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_uint_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be suitably aligned whenever N is nonzero */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_uint_to_ubyte\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts *(_TYPE1 *)src and stores it straight into *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_uint);
+#else  /* strided variant: step by the strides[] values (src/dst are char pointers) */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Dead in this expansion: the condition above is constant-false, so nothing before the matching #else is defined. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+/* Live branch: _TYPE1 is the source element type and _TYPE2 the destination element type of the cast function defined after these macros. */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: "0 || 0" is false, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x). NOTE(review): x is not parenthesized in the expansion. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_uint_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Strided cast loop: converts N = dimensions[0] elements from args[0] (_TYPE1) to args[1] (_TYPE2), stepping by strides[0]/strides[1] bytes; returns 0. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The scalar temporaries above are compiled in (#elif !0); the alignment asserts below are compiled out (#if 0). context is never referenced. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* The guard wrapping this function, #if !(NPY_USE_UNALIGNED_ACCESS && !0), keeps it only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+    /*printf("_cast_uint_to_ubyte\n");*/
+    /* Each pass: memmove one element from src into src_value (the #else load path is the live one), convert, store, advance. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* After preprocessing the cast below is just: dst_value = _CONVERT_FN(src_value); every other branch is constant-false. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: the outer "#if 0" is false, so the memmove in the #else branch copies dst_value out to dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance: the sizeof() steps are under "#if 0", so both pointers move by their caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Dead in this expansion: the condition above is constant-false, so nothing before the matching #else is defined. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+/* Live branch: _TYPE1 is the source element type and _TYPE2 the destination element type of the cast function defined after these macros. */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: "0 || 0" is false, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x). NOTE(review): x is not parenthesized in the expansion. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_uint_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Contiguous cast loop: converts N = dimensions[0] elements from args[0] (_TYPE1) to args[1] (_TYPE2); returns 0. strides is not read (#if !1 above). */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No src_value/dst_value temporaries here (#elif !1 is false). The asserts below check only the starting src/dst pointers and pass trivially when N == 0. context is never referenced. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* The guard wrapping this function, #if !(NPY_USE_UNALIGNED_ACCESS && !1), is always true, so it is compiled for any NPY_USE_UNALIGNED_ACCESS setting. */
+    /*printf("_aligned_contig_cast_uint_to_ubyte\n");*/
+    /* Each pass: no separate load step (the block right below preprocesses to nothing); convert directly from src to dst, then advance. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* After preprocessing the cast below is just the last #else statement: read src as _TYPE1, convert, write through dst as _TYPE2. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing is emitted here ("#if 1" selects an empty "#if 0" block); the cast above already wrote to dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance: "#if 1" selects fixed steps of sizeof(npy_ubyte) for dst and sizeof(npy_uint) for src; the stride branch is compiled out. */
+#if 1
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Dead in this expansion: the condition above is constant-false, so nothing before the matching #else is defined. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+/* Live branch: _TYPE1 is the source element type and _TYPE2 the destination element type of the cast function defined after these macros. */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: "0 || 0" is false, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x). NOTE(review): x is not parenthesized in the expansion. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_uint_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Contiguous cast loop: converts N = dimensions[0] elements from args[0] (_TYPE1) to args[1] (_TYPE2); returns 0. strides is not read (#if !1 above). */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The scalar temporaries above are compiled in (#elif !0); the alignment asserts below are compiled out (#if 0). context is never referenced. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* The guard wrapping this function, #if !(NPY_USE_UNALIGNED_ACCESS && !0), keeps it only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+    /*printf("_contig_cast_uint_to_ubyte\n");*/
+    /* Each pass: memmove one element from src into src_value (the #else load path is the live one), convert, store, advance. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* After preprocessing the cast below is just: dst_value = _CONVERT_FN(src_value); every other branch is constant-false. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: the outer "#if 0" is false, so the memmove in the #else branch copies dst_value out to dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance: "#if 1" selects fixed steps of sizeof(npy_ubyte) for dst and sizeof(npy_uint) for src; the stride branch is compiled out. */
+#if 1
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Dead in this expansion: the condition above is constant-false, so nothing before the matching #else is defined. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+/* Live branch: _TYPE1 is the source element type and _TYPE2 the destination element type of the cast function defined after these macros. */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: "0 || 0" is false, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x). NOTE(review): x is not parenthesized in the expansion. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_uint_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Strided cast loop: converts N = dimensions[0] elements from args[0] (_TYPE1) to args[1] (_TYPE2), stepping by strides[0]/strides[1] bytes; returns 0. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No src_value/dst_value temporaries here (#elif !1 is false). The asserts below check only the starting src/dst pointers and pass trivially when N == 0. context is never referenced. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* The guard wrapping this function, #if !(NPY_USE_UNALIGNED_ACCESS && !1), is always true, so it is compiled for any NPY_USE_UNALIGNED_ACCESS setting. */
+    /*printf("_aligned_cast_uint_to_ushort\n");*/
+    /* Each pass: no separate load step (the block right below preprocesses to nothing); convert directly from src to dst, then advance. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* After preprocessing the cast below is just the last #else statement: read src as _TYPE1, convert, write through dst as _TYPE2. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing is emitted here ("#if 1" selects an empty "#if 0" block); the cast above already wrote to dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance: the sizeof() steps are under "#if 0", so both pointers move by their caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Dead in this expansion: the condition above is constant-false, so nothing before the matching #else is defined. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+/* Live branch: _TYPE1 is the source element type and _TYPE2 the destination element type of the cast function defined after these macros. */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: "0 || 0" is false, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x). NOTE(review): x is not parenthesized in the expansion. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_uint_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Strided cast loop: converts N = dimensions[0] elements from args[0] (_TYPE1) to args[1] (_TYPE2), stepping by strides[0]/strides[1] bytes; returns 0. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The scalar temporaries above are compiled in (#elif !0); the alignment asserts below are compiled out (#if 0). context is never referenced. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* The guard wrapping this function, #if !(NPY_USE_UNALIGNED_ACCESS && !0), keeps it only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+    /*printf("_cast_uint_to_ushort\n");*/
+    /* Each pass: memmove one element from src into src_value (the #else load path is the live one), convert, store, advance. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* After preprocessing the cast below is just: dst_value = _CONVERT_FN(src_value); every other branch is constant-false. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: the outer "#if 0" is false, so the memmove in the #else branch copies dst_value out to dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance: the sizeof() steps are under "#if 0", so both pointers move by their caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Dead in this expansion: the condition above is constant-false, so nothing before the matching #else is defined. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+/* Live branch: _TYPE1 is the source element type and _TYPE2 the destination element type of the cast function defined after these macros. */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: "0 || 0" is false, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x). NOTE(review): x is not parenthesized in the expansion. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_uint_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Contiguous cast loop: converts N = dimensions[0] elements from args[0] (_TYPE1) to args[1] (_TYPE2); returns 0. strides is not read (#if !1 above). */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No src_value/dst_value temporaries here (#elif !1 is false). The asserts below check only the starting src/dst pointers and pass trivially when N == 0. context is never referenced. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* The guard wrapping this function, #if !(NPY_USE_UNALIGNED_ACCESS && !1), is always true, so it is compiled for any NPY_USE_UNALIGNED_ACCESS setting. */
+    /*printf("_aligned_contig_cast_uint_to_ushort\n");*/
+    /* Each pass: no separate load step (the block right below preprocesses to nothing); convert directly from src to dst, then advance. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* After preprocessing the cast below is just the last #else statement: read src as _TYPE1, convert, write through dst as _TYPE2. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing is emitted here ("#if 1" selects an empty "#if 0" block); the cast above already wrote to dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance: "#if 1" selects fixed steps of sizeof(npy_ushort) for dst and sizeof(npy_uint) for src; the stride branch is compiled out. */
+#if 1
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Dead in this expansion: the condition above is constant-false, so nothing before the matching #else is defined. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+/* Live branch: _TYPE1 is the source element type and _TYPE2 the destination element type of the cast function defined after these macros. */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: "0 || 0" is false, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x). NOTE(review): x is not parenthesized in the expansion. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_uint_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Contiguous cast loop: converts N = dimensions[0] elements from args[0] (_TYPE1) to args[1] (_TYPE2); returns 0. strides is not read (#if !1 above). */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The scalar temporaries above are compiled in (#elif !0); the alignment asserts below are compiled out (#if 0). context is never referenced. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* The guard wrapping this function, #if !(NPY_USE_UNALIGNED_ACCESS && !0), keeps it only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+    /*printf("_contig_cast_uint_to_ushort\n");*/
+    /* Each pass: memmove one element from src into src_value (the #else load path is the live one), convert, store, advance. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* After preprocessing the cast below is just: dst_value = _CONVERT_FN(src_value); every other branch is constant-false. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: the outer "#if 0" is false, so the memmove in the #else branch copies dst_value out to dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance: "#if 1" selects fixed steps of sizeof(npy_ushort) for dst and sizeof(npy_uint) for src; the stride branch is compiled out. */
+#if 1
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Dead in this expansion: the condition above is constant-false, so nothing before the matching #else is defined. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+/* Live branch: _TYPE1 (source element type) and _TYPE2 (destination element type) of the cast function after these macros are both npy_uint here. */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: "0 || 0" is false, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x). NOTE(review): x is not parenthesized in the expansion. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_uint_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Strided cast loop: converts N = dimensions[0] elements from args[0] (_TYPE1) to args[1] (_TYPE2), stepping by strides[0]/strides[1] bytes; returns 0. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No src_value/dst_value temporaries here (#elif !1 is false). The asserts below check only the starting src/dst pointers and pass trivially when N == 0. context is never referenced. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* The guard wrapping this function, #if !(NPY_USE_UNALIGNED_ACCESS && !1), is always true, so it is compiled for any NPY_USE_UNALIGNED_ACCESS setting. */
+    /*printf("_aligned_cast_uint_to_uint\n");*/
+    /* Each pass: no separate load step (the block right below preprocesses to nothing); convert directly from src to dst, then advance. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* After preprocessing the cast below is just the last #else statement: read src as _TYPE1, convert, write through dst as _TYPE2. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing is emitted here ("#if 1" selects an empty "#if 0" block); the cast above already wrote to dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance: the sizeof() steps are under "#if 0", so both pointers move by their caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Dead in this expansion: the condition above is constant-false, so nothing before the matching #else is defined. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+/* Live branch: _TYPE1 (source element type) and _TYPE2 (destination element type) of the cast function after these macros are both npy_uint here. */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: "0 || 0" is false, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x). NOTE(review): x is not parenthesized in the expansion. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_uint_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Strided cast loop: converts N = dimensions[0] elements from args[0] (_TYPE1) to args[1] (_TYPE2), stepping by strides[0]/strides[1] bytes; returns 0. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The scalar temporaries above are compiled in (#elif !0); the alignment asserts below are compiled out (#if 0). context is never referenced. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* The guard wrapping this function, #if !(NPY_USE_UNALIGNED_ACCESS && !0), keeps it only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+    /*printf("_cast_uint_to_uint\n");*/
+    /* Each pass: memmove one element from src into src_value (the #else load path is the live one), convert, store, advance. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* After preprocessing the cast below is just: dst_value = _CONVERT_FN(src_value); every other branch is constant-false. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: the outer "#if 0" is false, so the memmove in the #else branch copies dst_value out to dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance: the sizeof() steps are under "#if 0", so both pointers move by their caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Dead in this expansion: the condition above is constant-false, so nothing before the matching #else is defined. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+/* Live branch: _TYPE1 (source element type) and _TYPE2 (destination element type) of the cast function after these macros are both npy_uint here. */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Dead in this expansion (constant-false): none of the _CONVERT_FN alternatives in this group is defined. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: "0 || 0" is false, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x). NOTE(review): x is not parenthesized in the expansion. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_uint_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Converts dimensions[0] npy_uint values read from args[0] into npy_uint values written to args[1]; returns 0. */
+    npy_intp N = dimensions[0];  /* element count, consumed by the while loop below */
+    char *src = args[0], *dst = args[1];  /* byte cursors: src is read, dst is written */
+#if !1  /* false in this variant: strides[] is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false in this variant: no src_value scratch copy is declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false in this variant: no dst_value scratch copy is declared */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true in this variant: the alignment asserts are compiled */
+   /* sanity check: src and dst must be aligned for their element types (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_uint_to_uint\n");*/
+
+    while (N--) {
+#if 1  /* true in this variant and the nested block is #if 0: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* false: the whole first group is skipped, see the #else group below */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* the branch compiled in this variant */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load from src, convert, typed store to dst */
+#  endif
+#endif
+
+#if 1  /* true in this variant and the nested block is #if 0: the cast above already wrote dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* true in this variant: step both cursors by the fixed element sizes */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_uint_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Converts dimensions[0] npy_uint values read from args[0] into npy_uint values written to args[1]; returns 0. */
+    npy_intp N = dimensions[0];  /* element count, consumed by the while loop below */
+    char *src = args[0], *dst = args[1];  /* byte cursors: src is read, dst is written */
+#if !1  /* false in this variant: strides[] is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true in this variant: scratch copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true in this variant: scratch copy of one converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false in this variant: the alignment asserts are compiled out */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_uint_to_uint\n");*/
+
+    while (N--) {
+#if 0  /* false in this variant: the memmove in the #else branch is used */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy into the scratch variable; src is not dereferenced as _TYPE1 */
+#endif
+
+/* Do the cast */
+#if 0  /* false: the whole first group is skipped, see the #else group below */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* the branch compiled in this variant */
+    dst_value = _CONVERT_FN(src_value);  /* convert from one scratch variable into the other */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* false in this variant: the memmove in the #else branch is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy of the converted value out to dst */
+#endif
+
+#if 1  /* true in this variant: step both cursors by the fixed element sizes */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_uint_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Converts dimensions[0] npy_uint values read from args[0] into npy_ulong values written to args[1]; returns 0. */
+    npy_intp N = dimensions[0];  /* element count, consumed by the while loop below */
+    char *src = args[0], *dst = args[1];  /* byte cursors: src is read, dst is written */
+#if !0  /* true in this variant: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false in this variant: no src_value scratch copy is declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false in this variant: no dst_value scratch copy is declared */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true in this variant: the alignment asserts are compiled */
+   /* sanity check: src and dst must be aligned for their element types (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_uint_to_ulong\n");*/
+
+    while (N--) {
+#if 1  /* true in this variant and the nested block is #if 0: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* false: the whole first group is skipped, see the #else group below */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* the branch compiled in this variant */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load from src, convert, typed store to dst */
+#  endif
+#endif
+
+#if 1  /* true in this variant and the nested block is #if 0: the cast above already wrote dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* false in this variant: the stride-based #else branch is used */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;  /* byte step taken from strides[1] */
+        src += src_stride;  /* byte step taken from strides[0] */
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_uint_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Converts dimensions[0] npy_uint values read from args[0] into npy_ulong values written to args[1]; returns 0. */
+    npy_intp N = dimensions[0];  /* element count, consumed by the while loop below */
+    char *src = args[0], *dst = args[1];  /* byte cursors: src is read, dst is written */
+#if !0  /* true in this variant: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true in this variant: scratch copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true in this variant: scratch copy of one converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false in this variant: the alignment asserts are compiled out */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_uint_to_ulong\n");*/
+
+    while (N--) {
+#if 0  /* false in this variant: the memmove in the #else branch is used */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy into the scratch variable; src is not dereferenced as _TYPE1 */
+#endif
+
+/* Do the cast */
+#if 0  /* false: the whole first group is skipped, see the #else group below */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* the branch compiled in this variant */
+    dst_value = _CONVERT_FN(src_value);  /* convert from one scratch variable into the other */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* false in this variant: the memmove in the #else branch is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy of the converted value out to dst */
+#endif
+
+#if 0  /* false in this variant: the stride-based #else branch is used */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;  /* byte step taken from strides[1] */
+        src += src_stride;  /* byte step taken from strides[0] */
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_uint_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Converts dimensions[0] npy_uint values read from args[0] into npy_ulong values written to args[1]; returns 0. */
+    npy_intp N = dimensions[0];  /* element count, consumed by the while loop below */
+    char *src = args[0], *dst = args[1];  /* byte cursors: src is read, dst is written */
+#if !1  /* false in this variant: strides[] is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false in this variant: no src_value scratch copy is declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false in this variant: no dst_value scratch copy is declared */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true in this variant: the alignment asserts are compiled */
+   /* sanity check: src and dst must be aligned for their element types (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_uint_to_ulong\n");*/
+
+    while (N--) {
+#if 1  /* true in this variant and the nested block is #if 0: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* false: the whole first group is skipped, see the #else group below */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* the branch compiled in this variant */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load from src, convert, typed store to dst */
+#  endif
+#endif
+
+#if 1  /* true in this variant and the nested block is #if 0: the cast above already wrote dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* true in this variant: step both cursors by the fixed element sizes */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_uint_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Converts dimensions[0] npy_uint values read from args[0] into npy_ulong values written to args[1]; returns 0. */
+    npy_intp N = dimensions[0];  /* element count, consumed by the while loop below */
+    char *src = args[0], *dst = args[1];  /* byte cursors: src is read, dst is written */
+#if !1  /* false in this variant: strides[] is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true in this variant: scratch copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true in this variant: scratch copy of one converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false in this variant: the alignment asserts are compiled out */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_uint_to_ulong\n");*/
+
+    while (N--) {
+#if 0  /* false in this variant: the memmove in the #else branch is used */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy into the scratch variable; src is not dereferenced as _TYPE1 */
+#endif
+
+/* Do the cast */
+#if 0  /* false: the whole first group is skipped, see the #else group below */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* the branch compiled in this variant */
+    dst_value = _CONVERT_FN(src_value);  /* convert from one scratch variable into the other */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* false in this variant: the memmove in the #else branch is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy of the converted value out to dst */
+#endif
+
+#if 1  /* true in this variant: step both cursors by the fixed element sizes */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_uint_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Converts dimensions[0] npy_uint values read from args[0] into npy_ulonglong values written to args[1]; returns 0. */
+    npy_intp N = dimensions[0];  /* element count, consumed by the while loop below */
+    char *src = args[0], *dst = args[1];  /* byte cursors: src is read, dst is written */
+#if !0  /* true in this variant: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false in this variant: no src_value scratch copy is declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false in this variant: no dst_value scratch copy is declared */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true in this variant: the alignment asserts are compiled */
+   /* sanity check: src and dst must be aligned for their element types (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_uint_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1  /* true in this variant and the nested block is #if 0: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* false: the whole first group is skipped, see the #else group below */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* the branch compiled in this variant */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load from src, convert, typed store to dst */
+#  endif
+#endif
+
+#if 1  /* true in this variant and the nested block is #if 0: the cast above already wrote dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* false in this variant: the stride-based #else branch is used */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;  /* byte step taken from strides[1] */
+        src += src_stride;  /* byte step taken from strides[0] */
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_uint_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Converts dimensions[0] npy_uint values read from args[0] into npy_ulonglong values written to args[1]; returns 0. */
+    npy_intp N = dimensions[0];  /* element count, consumed by the while loop below */
+    char *src = args[0], *dst = args[1];  /* byte cursors: src is read, dst is written */
+#if !0  /* true in this variant: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true in this variant: scratch copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true in this variant: scratch copy of one converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false in this variant: the alignment asserts are compiled out */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_uint_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0  /* false in this variant: the memmove in the #else branch is used */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy into the scratch variable; src is not dereferenced as _TYPE1 */
+#endif
+
+/* Do the cast */
+#if 0  /* false: the whole first group is skipped, see the #else group below */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* the branch compiled in this variant */
+    dst_value = _CONVERT_FN(src_value);  /* convert from one scratch variable into the other */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* false in this variant: the memmove in the #else branch is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy of the converted value out to dst */
+#endif
+
+#if 0  /* false in this variant: the stride-based #else branch is used */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;  /* byte step taken from strides[1] */
+        src += src_stride;  /* byte step taken from strides[0] */
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_uint_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Converts dimensions[0] npy_uint values read from args[0] into npy_ulonglong values written to args[1]; returns 0. */
+    npy_intp N = dimensions[0];  /* element count, consumed by the while loop below */
+    char *src = args[0], *dst = args[1];  /* byte cursors: src is read, dst is written */
+#if !1  /* false in this variant: strides[] is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false in this variant: no src_value scratch copy is declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false in this variant: no dst_value scratch copy is declared */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true in this variant: the alignment asserts are compiled */
+   /* sanity check: src and dst must be aligned for their element types (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_uint_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1  /* true in this variant and the nested block is #if 0: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* false: the whole first group is skipped, see the #else group below */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* the branch compiled in this variant */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load from src, convert, typed store to dst */
+#  endif
+#endif
+
+#if 1  /* true in this variant and the nested block is #if 0: the cast above already wrote dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* true in this variant: step both cursors by the fixed element sizes */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* i.e. !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant false here: the plain definitions in the #else below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* type the source bytes are read as */
+#define _TYPE2 npy_ulonglong  /* type written to the destination */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Converts dimensions[0] back-to-back npy_uint items at args[0] into npy_ulonglong items at args[1], moving each item through memmove. */
+static NPY_GCC_OPT_3 int
+_contig_cast_uint_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides is not referenced */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* char cursors, so += below moves by bytes */
+#if !1  /* inactive: the loop advances by sizeof() instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active: local the source item is copied into */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local holding the converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* inactive: no alignment asserts here */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_uint_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0  /* inactive */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy of the source item into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* inactive: forms reading src_value[0] / src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert the staged copy */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* inactive */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* inactive */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise copy of dst_value to the output */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: step each cursor by its element size */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_uint);
+#else  /* inactive */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return: no error path in this loop */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are local to this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "&& !1" makes the inner test 0, so this block is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant false here: the plain definitions in the #else below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* type the source bytes are read as */
+#define _TYPE2 npy_byte  /* type written to the destination */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_uint items at args[0] into npy_byte items at args[1] via direct typed loads/stores, stepping by strides[0]/strides[1]. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_uint_to_byte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides[0]/strides[1] = input/output step */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* char cursors, so += below moves by bytes */
+#if !0  /* active: steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* inactive: no staging local, *src is read in place */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* inactive: the result is stored straight to *dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active: both pointers are alignment-checked unless N == 0 */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_uint_to_byte\n");*/
+
+    while (N--) {
+#if 1  /* active but empty: its only content sits under "#  if 0" */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* inactive */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* inactive: forms reading src_value[0] / src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* inactive */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: load as _TYPE1, convert, store as _TYPE2 */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* active but empty: the cast statement already wrote *dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* inactive */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* inactive */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_uint);
+#else  /* active: step each cursor by its stride (in bytes, as src/dst are char *) */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return: no error path in this loop */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are local to this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* i.e. !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant false here: the plain definitions in the #else below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* type the source bytes are read as */
+#define _TYPE2 npy_byte  /* type written to the destination */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_uint items at args[0] into npy_byte items at args[1], moving each item through memmove and stepping by strides[0]/strides[1]. */
+static NPY_GCC_OPT_3 int
+_cast_uint_to_byte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides[0]/strides[1] = input/output step */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* char cursors, so += below moves by bytes */
+#if !0  /* active: steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active: local the source item is copied into */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local holding the converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* inactive: no alignment asserts here */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_uint_to_byte\n");*/
+
+    while (N--) {
+#if 0  /* inactive */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy of the source item into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* inactive: forms reading src_value[0] / src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert the staged copy */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* inactive */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* inactive */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise copy of dst_value to the output */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* inactive */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_uint);
+#else  /* active: step each cursor by its stride (in bytes, as src/dst are char *) */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return: no error path in this loop */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are local to this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "&& !1" makes the inner test 0, so this block is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant false here: the plain definitions in the #else below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* type the source bytes are read as */
+#define _TYPE2 npy_byte  /* type written to the destination */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Converts dimensions[0] back-to-back npy_uint items at args[0] into npy_byte items at args[1] via direct typed loads/stores. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_uint_to_byte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides is not referenced */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* char cursors, so += below moves by bytes */
+#if !1  /* inactive: the loop advances by sizeof() instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* inactive: no staging local, *src is read in place */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* inactive: the result is stored straight to *dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active: both pointers are alignment-checked unless N == 0 */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_uint_to_byte\n");*/
+
+    while (N--) {
+#if 1  /* active but empty: its only content sits under "#  if 0" */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* inactive */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* inactive: forms reading src_value[0] / src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* inactive */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: load as _TYPE1, convert, store as _TYPE2 */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* active but empty: the cast statement already wrote *dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* inactive */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: step each cursor by its element size */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_uint);
+#else  /* inactive */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return: no error path in this loop */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are local to this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* i.e. !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant false here: the plain definitions in the #else below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* type the source bytes are read as */
+#define _TYPE2 npy_byte  /* type written to the destination */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Converts dimensions[0] back-to-back npy_uint items at args[0] into npy_byte items at args[1], moving each item through memmove. */
+static NPY_GCC_OPT_3 int
+_contig_cast_uint_to_byte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides is not referenced */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* char cursors, so += below moves by bytes */
+#if !1  /* inactive: the loop advances by sizeof() instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active: local the source item is copied into */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local holding the converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* inactive: no alignment asserts here */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_uint_to_byte\n");*/
+
+    while (N--) {
+#if 0  /* inactive */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy of the source item into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* inactive: forms reading src_value[0] / src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert the staged copy */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* inactive */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* inactive */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise copy of dst_value to the output */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: step each cursor by its element size */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_uint);
+#else  /* inactive */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return: no error path in this loop */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are local to this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "&& !1" makes the inner test 0, so this block is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant false here: the plain definitions in the #else below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* type the source bytes are read as */
+#define _TYPE2 npy_short  /* type written to the destination */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_uint items at args[0] into npy_short items at args[1] via direct typed loads/stores, stepping by strides[0]/strides[1]. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_uint_to_short(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides[0]/strides[1] = input/output step */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* char cursors, so += below moves by bytes */
+#if !0  /* active: steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* inactive: no staging local, *src is read in place */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* inactive: the result is stored straight to *dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active: both pointers are alignment-checked unless N == 0 */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_uint_to_short\n");*/
+
+    while (N--) {
+#if 1  /* active but empty: its only content sits under "#  if 0" */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* inactive */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* inactive: forms reading src_value[0] / src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* inactive */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: load as _TYPE1, convert, store as _TYPE2 */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* active but empty: the cast statement already wrote *dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* inactive */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* inactive */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_uint);
+#else  /* active: step each cursor by its stride (in bytes, as src/dst are char *) */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return: no error path in this loop */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are local to this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* i.e. !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant false here: the plain definitions in the #else below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* type the source bytes are read as */
+#define _TYPE2 npy_short  /* type written to the destination */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_uint items at args[0] into npy_short items at args[1], moving each item through memmove and stepping by strides[0]/strides[1]. */
+static NPY_GCC_OPT_3 int
+_cast_uint_to_short(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides[0]/strides[1] = input/output step */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* char cursors, so += below moves by bytes */
+#if !0  /* active: steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active: local the source item is copied into */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local holding the converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* inactive: no alignment asserts here */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_uint_to_short\n");*/
+
+    while (N--) {
+#if 0  /* inactive */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy of the source item into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* inactive: forms reading src_value[0] / src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert the staged copy */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* inactive */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* inactive */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise copy of dst_value to the output */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* inactive */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_uint);
+#else  /* active: step each cursor by its stride (in bytes, as src/dst are char *) */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return: no error path in this loop */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are local to this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "&& !1" makes the inner test 0, so this block is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant false here: the plain definitions in the #else below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* type the source bytes are read as */
+#define _TYPE2 npy_short  /* type written to the destination */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Converts dimensions[0] back-to-back npy_uint items at args[0] into npy_short items at args[1] via direct typed loads/stores. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_uint_to_short(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides is not referenced */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* char cursors, so += below moves by bytes */
+#if !1  /* inactive: the loop advances by sizeof() instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* inactive: no staging local, *src is read in place */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* inactive: the result is stored straight to *dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active: both pointers are alignment-checked unless N == 0 */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_uint_to_short\n");*/
+
+    while (N--) {
+#if 1  /* active but empty: its only content sits under "#  if 0" */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* inactive */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* inactive: forms reading src_value[0] / src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* inactive */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: load as _TYPE1, convert, store as _TYPE2 */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* active but empty: the cast statement already wrote *dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* inactive */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: step each cursor by its element size */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_uint);
+#else  /* inactive */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return: no error path in this loop */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are local to this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* i.e. !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant false here: the plain definitions in the #else below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* type the source bytes are read as */
+#define _TYPE2 npy_short  /* type written to the destination */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Converts dimensions[0] back-to-back npy_uint items at args[0] into npy_short items at args[1], moving each item through memmove. */
+static NPY_GCC_OPT_3 int
+_contig_cast_uint_to_short(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides is not referenced */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* items still to convert */
+    char *src = args[0], *dst = args[1];  /* char cursors, so += below moves by bytes */
+#if !1  /* inactive: the loop advances by sizeof() instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active: local the source item is copied into */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local holding the converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* inactive: no alignment asserts here */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_uint_to_short\n");*/
+
+    while (N--) {
+#if 0  /* inactive */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy of the source item into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* inactive: forms reading src_value[0] / src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert the staged copy */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* inactive */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* inactive */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise copy of dst_value to the output */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: step each cursor by its element size */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_uint);
+#else  /* inactive */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return: no error path in this loop */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are local to this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so the condition is always true and this block is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here; constant conditions appear to be generator output -- NOTE(review): confirm against the template */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* source element type */
+#define _TYPE2 npy_int  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half*-source converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the *-to-half converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to the destination type */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Loop casting N npy_uint elements to npy_int via direct typed loads/stores, stepping by byte strides; always returns 0 */
+_aligned_cast_uint_to_int(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides[0]/strides[1] are the src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data, marked unused */
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !0  /* true: this variant reads the caller's strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no src_value temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no dst_value temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: alignment asserts are compiled in */
+   /* sanity check: the loop dereferences src/dst as typed pointers, so both must be aligned (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_uint_to_int\n");*/
+
+    while (N--) {
+#if 1  /* true, but the nested #if 0 leaves this load section empty */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one of the statements below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: load, convert and store through typed pointers in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* true, but the nested #if 0 leaves this store section empty */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_uint);
+#else  /* active: advance both cursors by the caller's byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined above for this function */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* `!0` is 1, so this block is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here; constant conditions appear to be generator output -- NOTE(review): confirm against the template */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* source element type */
+#define _TYPE2 npy_int  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half*-source converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the *-to-half converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to the destination type */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Loop casting N npy_uint elements to npy_int via memmove through local temporaries, stepping by byte strides; always returns 0 */
+_cast_uint_to_int(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides[0]/strides[1] are the src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data, marked unused */
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !0  /* true: this variant reads the caller's strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: scalar temporary filled by memmove inside the loop */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: scalar temporary holding the converted value */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: alignment asserts are compiled out in this variant */
+   /* sanity check (not compiled here; this variant never dereferences src/dst as typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_uint_to_int\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: copy the source bytes into src_value instead of dereferencing a typed pointer */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one of the statements below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert the local copy into the local result */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: copy the converted value's bytes out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_uint);
+#else  /* active: advance both cursors by the caller's byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined above for this function */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so the condition is always true and this block is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here; constant conditions appear to be generator output -- NOTE(review): confirm against the template */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* source element type */
+#define _TYPE2 npy_int  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half*-source converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the *-to-half converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to the destination type */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Loop casting N npy_uint elements to npy_int via direct typed loads/stores, stepping by element size; always returns 0 */
+_aligned_contig_cast_uint_to_int(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data, marked unused */
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !1  /* false: stride locals are not declared in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no src_value temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no dst_value temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: alignment asserts are compiled in */
+   /* sanity check: the loop dereferences src/dst as typed pointers, so both must be aligned (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_uint_to_int\n");*/
+
+    while (N--) {
+#if 1  /* true, but the nested #if 0 leaves this load section empty */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one of the statements below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: load, convert and store through typed pointers in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* true, but the nested #if 0 leaves this store section empty */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance both cursors by exactly one element */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined above for this function */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* `!0` is 1, so this block is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here; constant conditions appear to be generator output -- NOTE(review): confirm against the template */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* source element type */
+#define _TYPE2 npy_int  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half*-source converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the *-to-half converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to the destination type */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Loop casting N npy_uint elements to npy_int via memmove through local temporaries, stepping by element size; always returns 0 */
+_contig_cast_uint_to_int(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data, marked unused */
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !1  /* false: stride locals are not declared in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: scalar temporary filled by memmove inside the loop */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: scalar temporary holding the converted value */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: alignment asserts are compiled out in this variant */
+   /* sanity check (not compiled here; this variant never dereferences src/dst as typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_uint_to_int\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: copy the source bytes into src_value instead of dereferencing a typed pointer */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one of the statements below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert the local copy into the local result */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: copy the converted value's bytes out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance both cursors by exactly one element */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined above for this function */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so the condition is always true and this block is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here; constant conditions appear to be generator output -- NOTE(review): confirm against the template */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* source element type */
+#define _TYPE2 npy_long  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half*-source converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the *-to-half converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to the destination type */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Loop casting N npy_uint elements to npy_long via direct typed loads/stores, stepping by byte strides; always returns 0 */
+_aligned_cast_uint_to_long(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides[0]/strides[1] are the src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data, marked unused */
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !0  /* true: this variant reads the caller's strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no src_value temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no dst_value temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: alignment asserts are compiled in */
+   /* sanity check: the loop dereferences src/dst as typed pointers, so both must be aligned (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_uint_to_long\n");*/
+
+    while (N--) {
+#if 1  /* true, but the nested #if 0 leaves this load section empty */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one of the statements below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: load, convert and store through typed pointers in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* true, but the nested #if 0 leaves this store section empty */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_uint);
+#else  /* active: advance both cursors by the caller's byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined above for this function */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* `!0` is 1, so this block is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here; constant conditions appear to be generator output -- NOTE(review): confirm against the template */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* source element type */
+#define _TYPE2 npy_long  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half*-source converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the *-to-half converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to the destination type */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Loop casting N npy_uint elements to npy_long via memmove through local temporaries, stepping by byte strides; always returns 0 */
+_cast_uint_to_long(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides[0]/strides[1] are the src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data, marked unused */
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !0  /* true: this variant reads the caller's strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: scalar temporary filled by memmove inside the loop */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: scalar temporary holding the converted value */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: alignment asserts are compiled out in this variant */
+   /* sanity check (not compiled here; this variant never dereferences src/dst as typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_uint_to_long\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: copy the source bytes into src_value instead of dereferencing a typed pointer */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one of the statements below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert the local copy into the local result */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: copy the converted value's bytes out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_uint);
+#else  /* active: advance both cursors by the caller's byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined above for this function */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so the condition is always true and this block is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here; constant conditions appear to be generator output -- NOTE(review): confirm against the template */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* source element type */
+#define _TYPE2 npy_long  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half*-source converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the *-to-half converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to the destination type */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Loop casting N npy_uint elements to npy_long via direct typed loads/stores, stepping by element size; always returns 0 */
+_aligned_contig_cast_uint_to_long(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data, marked unused */
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !1  /* false: stride locals are not declared in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no src_value temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no dst_value temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: alignment asserts are compiled in */
+   /* sanity check: the loop dereferences src/dst as typed pointers, so both must be aligned (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_uint_to_long\n");*/
+
+    while (N--) {
+#if 1  /* true, but the nested #if 0 leaves this load section empty */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one of the statements below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: load, convert and store through typed pointers in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* true, but the nested #if 0 leaves this store section empty */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance both cursors by exactly one element */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined above for this function */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* `!0` is 1, so this block is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here; constant conditions appear to be generator output -- NOTE(review): confirm against the template */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* source element type */
+#define _TYPE2 npy_long  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half*-source converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the *-to-half converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to the destination type */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Loop casting N npy_uint elements to npy_long via memmove through local temporaries, stepping by element size; always returns 0 */
+_contig_cast_uint_to_long(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data, marked unused */
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !1  /* false: stride locals are not declared in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: scalar temporary filled by memmove inside the loop */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: scalar temporary holding the converted value */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: alignment asserts are compiled out in this variant */
+   /* sanity check (not compiled here; this variant never dereferences src/dst as typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_uint_to_long\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: copy the source bytes into src_value instead of dereferencing a typed pointer */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one of the statements below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert the local copy into the local result */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: copy the converted value's bytes out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance both cursors by exactly one element */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined above for this function */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so the condition is always true and this block is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here; constant conditions appear to be generator output -- NOTE(review): confirm against the template */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* active branch */
+
+#define _TYPE1 npy_uint  /* source element type */
+#define _TYPE2 npy_longlong  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half*-source converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the *-to-half converters below are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to the destination type */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Loop casting N npy_uint elements to npy_longlong via direct typed loads/stores, stepping by byte strides; always returns 0 */
+_aligned_cast_uint_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] is the source buffer, args[1] the destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides[0]/strides[1] are the src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data, marked unused */
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !0  /* true: this variant reads the caller's strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no src_value temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no dst_value temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: alignment asserts are compiled in */
+   /* sanity check: the loop dereferences src/dst as typed pointers, so both must be aligned (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_uint_to_longlong\n");*/
+
+    while (N--) {
+#if 1  /* true, but the nested #if 0 leaves this load section empty */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one of the statements below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: load, convert and store through typed pointers in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* true, but the nested #if 0 leaves this store section empty */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_uint);
+#else  /* active: advance both cursors by the caller's byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros defined above for this function */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: built only when that macro is 0 or undefined */
+
+/* For half types, don't use actual double/float types in conversion. Here the guard below is 0, so the #else branch applies: _TYPE1 = npy_uint, _TYPE2 = npy_longlong. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function. Here both outer tests are 0, so the last #else group applies: _CONVERT_FN(x) = ((_TYPE2)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] elements from npy_uint to npy_longlong, stepping by byte strides; each element goes through memmove and local temporaries. */
+static NPY_GCC_OPT_3 int
+_cast_uint_to_longlong(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0], strides[1] = source, destination byte steps */
+        NpyAuxData *NPY_UNUSED(data)) /* data: unused, wrapped in NPY_UNUSED */
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out here by the enclosing #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_uint_to_longlong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* byte-wise load into a local, so src needs no particular alignment */
+#endif
+
+/* Do the cast: the only statement that survives preprocessing here is the dst_value = _CONVERT_FN(src_value) assignment */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* byte-wise store of the converted value, so dst needs no particular alignment */
+#endif
+
+#if 0 /* fixed-size stepping is off here; the #else branch advances by the byte strides */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return: this loop has no error path */
+}
+
+#undef _CONVERT_FN /* drop the per-loop helper macros so a later expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above the type macros */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" makes this test always true, so the loop is always built */
+
+/* For half types, don't use actual double/float types in conversion. Here the guard below is 0, so the #else branch applies: _TYPE1 = npy_uint, _TYPE2 = npy_longlong. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function. Here both outer tests are 0, so the last #else group applies: _CONVERT_FN(x) = ((_TYPE2)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] elements from npy_uint to npy_longlong, stepping by the element sizes; src and dst are dereferenced directly as typed pointers. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_uint_to_longlong(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not referenced (steps are fixed element sizes) */
+        NpyAuxData *NPY_UNUSED(data)) /* data: unused, wrapped in NPY_UNUSED */
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_uint_to_longlong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement that survives preprocessing here is the direct typed read of src and write to dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous step: advance by the element sizes */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return: this loop has no error path */
+}
+
+#undef _CONVERT_FN /* drop the per-loop helper macros so a later expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above the type macros */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: built only when that macro is 0 or undefined */
+
+/* For half types, don't use actual double/float types in conversion. Here the guard below is 0, so the #else branch applies: _TYPE1 = npy_uint, _TYPE2 = npy_longlong. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function. Here both outer tests are 0, so the last #else group applies: _CONVERT_FN(x) = ((_TYPE2)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] elements from npy_uint to npy_longlong, stepping by the element sizes; each element goes through memmove and local temporaries. */
+static NPY_GCC_OPT_3 int
+_contig_cast_uint_to_longlong(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not referenced (steps are fixed element sizes) */
+        NpyAuxData *NPY_UNUSED(data)) /* data: unused, wrapped in NPY_UNUSED */
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out here by the enclosing #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_uint_to_longlong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* byte-wise load into a local, so src needs no particular alignment */
+#endif
+
+/* Do the cast: the only statement that survives preprocessing here is the dst_value = _CONVERT_FN(src_value) assignment */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* byte-wise store of the converted value, so dst needs no particular alignment */
+#endif
+
+#if 1 /* contiguous step: advance by the element sizes */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return: this loop has no error path */
+}
+
+#undef _CONVERT_FN /* drop the per-loop helper macros so a later expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above the type macros */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" makes this test always true, so the loop is always built */
+
+/* For half types, don't use actual double/float types in conversion. Here the guard below is 1 and both inner chains fall to their #else: _TYPE1 = npy_uint, _TYPE2 = npy_half. */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function. Here the "#elif 1" group is selected and falls to its #else: _CONVERT_FN(x) = npy_float_to_half((float)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] elements from npy_uint to npy_half, stepping by byte strides; src and dst are dereferenced directly as typed pointers. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_uint_to_half(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0], strides[1] = source, destination byte steps */
+        NpyAuxData *NPY_UNUSED(data)) /* data: unused, wrapped in NPY_UNUSED */
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_uint_to_half\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement that survives preprocessing here is the direct typed read of src and write to dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0 /* fixed-size stepping is off here; the #else branch advances by the byte strides */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return: this loop has no error path */
+}
+
+#undef _CONVERT_FN /* drop the per-loop helper macros so a later expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above the type macros */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: built only when that macro is 0 or undefined */
+
+/* For half types, don't use actual double/float types in conversion. Here the guard below is 1 and both inner chains fall to their #else: _TYPE1 = npy_uint, _TYPE2 = npy_half. */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function. Here the "#elif 1" group is selected and falls to its #else: _CONVERT_FN(x) = npy_float_to_half((float)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] elements from npy_uint to npy_half, stepping by byte strides; each element goes through memmove and local temporaries. */
+static NPY_GCC_OPT_3 int
+_cast_uint_to_half(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0], strides[1] = source, destination byte steps */
+        NpyAuxData *NPY_UNUSED(data)) /* data: unused, wrapped in NPY_UNUSED */
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out here by the enclosing #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_uint_to_half\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* byte-wise load into a local, so src needs no particular alignment */
+#endif
+
+/* Do the cast: the only statement that survives preprocessing here is the dst_value = _CONVERT_FN(src_value) assignment */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* byte-wise store of the converted value, so dst needs no particular alignment */
+#endif
+
+#if 0 /* fixed-size stepping is off here; the #else branch advances by the byte strides */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return: this loop has no error path */
+}
+
+#undef _CONVERT_FN /* drop the per-loop helper macros so a later expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above the type macros */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" makes this test always true, so the loop is always built */
+
+/* For half types, don't use actual double/float types in conversion. Here the guard below is 1 and both inner chains fall to their #else: _TYPE1 = npy_uint, _TYPE2 = npy_half. */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function. Here the "#elif 1" group is selected and falls to its #else: _CONVERT_FN(x) = npy_float_to_half((float)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] elements from npy_uint to npy_half, stepping by the element sizes; src and dst are dereferenced directly as typed pointers. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_uint_to_half(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not referenced (steps are fixed element sizes) */
+        NpyAuxData *NPY_UNUSED(data)) /* data: unused, wrapped in NPY_UNUSED */
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_uint_to_half\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement that survives preprocessing here is the direct typed read of src and write to dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous step: advance by the element sizes */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return: this loop has no error path */
+}
+
+#undef _CONVERT_FN /* drop the per-loop helper macros so a later expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above the type macros */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: built only when that macro is 0 or undefined */
+
+/* For half types, don't use actual double/float types in conversion. Here the guard below is 1 and both inner chains fall to their #else: _TYPE1 = npy_uint, _TYPE2 = npy_half. */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function. Here the "#elif 1" group is selected and falls to its #else: _CONVERT_FN(x) = npy_float_to_half((float)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] elements from npy_uint to npy_half, stepping by the element sizes; each element goes through memmove and local temporaries. */
+static NPY_GCC_OPT_3 int
+_contig_cast_uint_to_half(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not referenced (steps are fixed element sizes) */
+        NpyAuxData *NPY_UNUSED(data)) /* data: unused, wrapped in NPY_UNUSED */
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out here by the enclosing #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_uint_to_half\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* byte-wise load into a local, so src needs no particular alignment */
+#endif
+
+/* Do the cast: the only statement that survives preprocessing here is the dst_value = _CONVERT_FN(src_value) assignment */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* byte-wise store of the converted value, so dst needs no particular alignment */
+#endif
+
+#if 1 /* contiguous step: advance by the element sizes */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return: this loop has no error path */
+}
+
+#undef _CONVERT_FN /* drop the per-loop helper macros so a later expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above the type macros */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" makes this test always true, so the loop is always built */
+
+/* For half types, don't use actual double/float types in conversion. Here the guard below is 0, so the #else branch applies: _TYPE1 = npy_uint, _TYPE2 = npy_float. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function. Here both outer tests are 0, so the last #else group applies: _CONVERT_FN(x) = ((_TYPE2)x). */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] elements from npy_uint to npy_float, stepping by byte strides; src and dst are dereferenced directly as typed pointers. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_uint_to_float(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0], strides[1] = source, destination byte steps */
+        NpyAuxData *NPY_UNUSED(data)) /* data: unused, wrapped in NPY_UNUSED */
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_uint_to_float\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement that survives preprocessing here is the direct typed read of src and write to dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0 /* fixed-size stepping is off here; the #else branch advances by the byte strides */
+        dst += sizeof(npy_float);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return: this loop has no error path */
+}
+
+#undef _CONVERT_FN /* drop the per-loop helper macros so a later expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above the type macros */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: built only when that macro is 0 or undefined */
+
+/* For half types, don't use actual double/float types in conversion. Here the guard below is 0, so the #else branch applies: _TYPE1 = npy_uint, _TYPE2 = npy_float. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function. Here both outer tests are 0, so the last #else group applies: _CONVERT_FN(x) = ((_TYPE2)x). */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] elements from npy_uint to npy_float, stepping by byte strides; each element goes through memmove and local temporaries. */
+static NPY_GCC_OPT_3 int
+_cast_uint_to_float(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0], strides[1] = source, destination byte steps */
+        NpyAuxData *NPY_UNUSED(data)) /* data: unused, wrapped in NPY_UNUSED */
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out here by the enclosing #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_uint_to_float\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* byte-wise load into a local, so src needs no particular alignment */
+#endif
+
+/* Do the cast: the only statement that survives preprocessing here is the dst_value = _CONVERT_FN(src_value) assignment */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* byte-wise store of the converted value, so dst needs no particular alignment */
+#endif
+
+#if 0 /* fixed-size stepping is off here; the #else branch advances by the byte strides */
+        dst += sizeof(npy_float);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return: this loop has no error path */
+}
+
+#undef _CONVERT_FN /* drop the per-loop helper macros so a later expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above the type macros */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" makes this test always true, so the loop is always built */
+
+/* For half types, don't use actual double/float types in conversion. Here the guard below is 0, so the #else branch applies: _TYPE1 = npy_uint, _TYPE2 = npy_float. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function. Here both outer tests are 0, so the last #else group applies: _CONVERT_FN(x) = ((_TYPE2)x). */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] elements from npy_uint to npy_float, stepping by the element sizes; src and dst are dereferenced directly as typed pointers. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_uint_to_float(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not referenced (steps are fixed element sizes) */
+        NpyAuxData *NPY_UNUSED(data)) /* data: unused, wrapped in NPY_UNUSED */
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_uint_to_float\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement that survives preprocessing here is the direct typed read of src and write to dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous step: advance by the element sizes */
+        dst += sizeof(npy_float);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return: this loop has no error path */
+}
+
+#undef _CONVERT_FN /* drop the per-loop helper macros so a later expansion can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard opened above the type macros */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this instance: the #else definitions are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint  /* active: element type read from src */
+#define _TYPE2 npy_float  /* active: element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled in this instance */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast */
+#  endif
+
+#endif
+/* Casts dimensions[0] packed npy_uint elements at args[0] to npy_float elements at args[1]. */
+static NPY_GCC_OPT_3 int
+_contig_cast_uint_to_float(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read here */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: no stride locals, the loop steps by element size */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: values are staged in locals and moved with memmove */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment is not asserted in this instance */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_uint_to_float\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load via memmove: makes no alignment assumption about src */
+#endif
+
+/* Do the cast */
+#if 0  /* disabled in this instance; the #else branch applies */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active statement */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store via memmove: makes no alignment assumption about dst */
+#endif
+
+#if 1  /* active: advance by one element size */
+        dst += sizeof(npy_float);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this guard always holds */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this instance: the #else definitions are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint  /* active: element type read from src */
+#define _TYPE2 npy_double  /* active: element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast */
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_uint elements at args[0] to npy_double at args[1], stepping by strides[0]/strides[1]. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_uint_to_double(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides[0], strides[1]: src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];
+#if !0  /* true: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no temporaries, src/dst are dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: a non-empty input must be aligned for both element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_uint_to_double\n");*/
+
+    while (N--) {
+#if 1  /* nested branch is disabled: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* disabled in this instance; the #else branch applies */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active statement */
+#  endif
+#endif
+
+#if 1  /* nested branch is disabled: the cast above already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_uint);
+#else  /* active: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this instance: the #else definitions are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint  /* active: element type read from src */
+#define _TYPE2 npy_double  /* active: element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast */
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_uint elements at args[0] to npy_double at args[1], stepping by strides[0]/strides[1]. */
+static NPY_GCC_OPT_3 int
+_cast_uint_to_double(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides[0], strides[1]: src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];
+#if !0  /* true: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: values are staged in locals and moved with memmove */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment is not asserted in this instance */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_uint_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load via memmove: makes no alignment assumption about src */
+#endif
+
+/* Do the cast */
+#if 0  /* disabled in this instance; the #else branch applies */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active statement */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store via memmove: makes no alignment assumption about dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_uint);
+#else  /* active: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this guard always holds */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this instance: the #else definitions are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint  /* active: element type read from src */
+#define _TYPE2 npy_double  /* active: element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast */
+#  endif
+
+#endif
+/* Casts dimensions[0] packed npy_uint elements at args[0] to npy_double elements at args[1]. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_uint_to_double(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read here */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: no stride locals, the loop steps by element size */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no temporaries, src/dst are dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: a non-empty input must be aligned for both element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_uint_to_double\n");*/
+
+    while (N--) {
+#if 1  /* nested branch is disabled: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* disabled in this instance; the #else branch applies */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active statement */
+#  endif
+#endif
+
+#if 1  /* nested branch is disabled: the cast above already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance by one element size */
+        dst += sizeof(npy_double);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this instance: the #else definitions are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint  /* active: element type read from src */
+#define _TYPE2 npy_double  /* active: element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast */
+#  endif
+
+#endif
+/* Casts dimensions[0] packed npy_uint elements at args[0] to npy_double elements at args[1]. */
+static NPY_GCC_OPT_3 int
+_contig_cast_uint_to_double(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read here */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: no stride locals, the loop steps by element size */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: values are staged in locals and moved with memmove */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment is not asserted in this instance */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_uint_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load via memmove: makes no alignment assumption about src */
+#endif
+
+/* Do the cast */
+#if 0  /* disabled in this instance; the #else branch applies */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active statement */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store via memmove: makes no alignment assumption about dst */
+#endif
+
+#if 1  /* active: advance by one element size */
+        dst += sizeof(npy_double);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this guard always holds */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this instance: the #else definitions are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint  /* active: element type read from src */
+#define _TYPE2 npy_longdouble  /* active: element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast */
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_uint elements at args[0] to npy_longdouble at args[1], stepping by strides[0]/strides[1]. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_uint_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides[0], strides[1]: src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];
+#if !0  /* true: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no temporaries, src/dst are dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: a non-empty input must be aligned for both element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_uint_to_longdouble\n");*/
+
+    while (N--) {
+#if 1  /* nested branch is disabled: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* disabled in this instance; the #else branch applies */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active statement */
+#  endif
+#endif
+
+#if 1  /* nested branch is disabled: the cast above already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_uint);
+#else  /* active: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this instance: the #else definitions are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint  /* active: element type read from src */
+#define _TYPE2 npy_longdouble  /* active: element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast */
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_uint elements at args[0] to npy_longdouble at args[1], stepping by strides[0]/strides[1]. */
+static NPY_GCC_OPT_3 int
+_cast_uint_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides[0], strides[1]: src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];
+#if !0  /* true: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: values are staged in locals and moved with memmove */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment is not asserted in this instance */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_uint_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load via memmove: makes no alignment assumption about src */
+#endif
+
+/* Do the cast */
+#if 0  /* disabled in this instance; the #else branch applies */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active statement */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store via memmove: makes no alignment assumption about dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_uint);
+#else  /* active: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this guard always holds */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this instance: the #else definitions are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint  /* active: element type read from src */
+#define _TYPE2 npy_longdouble  /* active: element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast */
+#  endif
+
+#endif
+/* Casts dimensions[0] packed npy_uint elements at args[0] to npy_longdouble elements at args[1]. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_uint_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read here */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: no stride locals, the loop steps by element size */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no temporaries, src/dst are dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: a non-empty input must be aligned for both element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_uint_to_longdouble\n");*/
+
+    while (N--) {
+#if 1  /* nested branch is disabled: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* disabled in this instance; the #else branch applies */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active statement */
+#  endif
+#endif
+
+#if 1  /* nested branch is disabled: the cast above already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance by one element size */
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this instance: the #else definitions are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_uint
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_uint  /* active: element type read from src */
+#define _TYPE2 npy_longdouble  /* active: element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled in this instance */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast */
+#  endif
+
+#endif
+/* Casts dimensions[0] packed npy_uint elements at args[0] to npy_longdouble elements at args[1]. */
+static NPY_GCC_OPT_3 int
+_contig_cast_uint_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read here */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: no stride locals, the loop steps by element size */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: values are staged in locals and moved with memmove */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment is not asserted in this instance */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_uint_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load via memmove: makes no alignment assumption about src */
+#endif
+
+/* Do the cast */
+#if 0  /* disabled in this instance; the #else branch applies */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active statement */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store via memmove: makes no alignment assumption about dst */
+#endif
+
+#if 1  /* active: advance by one element size */
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_uint);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * uint -> cfloat, strided, aligned variant.
+ * The guard above evaluates to true for any integer value of
+ * NPY_USE_UNALIGNED_ACCESS, so this variant is always compiled.
+ * The destination is handled as two npy_float components.
+ */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_uint items read from args[0] into pairs of
+ * npy_float written to args[1]: the first component receives the cast
+ * value and the second (the imaginary part, going by the cfloat name) is
+ * set to 0.
+ *
+ * The source advances by strides[0] bytes and the destination by
+ * strides[1] bytes per item. Both pointers are dereferenced through typed
+ * pointers, and their alignment is asserted for non-empty input.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_uint_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp src_step = strides[0];
+    npy_intp dst_step = strides[1];
+    _TYPE2 parts[2];
+
+    /* alignment precondition; not checked when there is nothing to do */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    while (N--) {
+        parts[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        parts[1] = 0;
+
+        ((_TYPE2 *)dst)[0] = parts[0];
+        ((_TYPE2 *)dst)[1] = parts[1];
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * uint -> cfloat, strided, unaligned-safe variant.
+ * The guard above removes this variant when NPY_USE_UNALIGNED_ACCESS is
+ * nonzero. The destination is handled as two npy_float components.
+ */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_uint items read from args[0] into pairs of
+ * npy_float written to args[1]: the first component receives the cast
+ * value and the second (the imaginary part, going by the cfloat name) is
+ * set to 0.
+ *
+ * The source advances by strides[0] bytes and the destination by
+ * strides[1] bytes per item. Items are copied in and out with memmove
+ * rather than dereferenced through typed pointers.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_uint_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp in_step = strides[0];
+    npy_intp out_step = strides[1];
+    _TYPE1 loaded;
+    _TYPE2 parts[2];
+
+    while (remaining--) {
+        memmove(&loaded, in, sizeof(loaded));
+
+        parts[0] = _CONVERT_FN(loaded);
+        parts[1] = 0;
+
+        /* sizeof(parts) covers both components */
+        memmove(out, &parts, sizeof(parts));
+
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * uint -> cfloat, contiguous, aligned variant.
+ * The guard above evaluates to true for any integer value of
+ * NPY_USE_UNALIGNED_ACCESS, so this variant is always compiled.
+ * The destination is handled as two npy_float components.
+ */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_uint items read from args[0] into pairs of
+ * npy_float written to args[1]: the first component receives the cast
+ * value and the second (the imaginary part, going by the cfloat name) is
+ * set to 0.
+ *
+ * The source advances by sizeof(npy_uint) and the destination by
+ * sizeof(npy_cfloat) per item; `context` and `strides` are not consulted.
+ * Both pointers are dereferenced through typed pointers, and their
+ * alignment is asserted for non-empty input.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_uint_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE2 parts[2];
+
+    /* alignment precondition; not checked when there is nothing to do */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    while (N--) {
+        parts[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        parts[1] = 0;
+
+        ((_TYPE2 *)dst)[0] = parts[0];
+        ((_TYPE2 *)dst)[1] = parts[1];
+
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_uint);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * uint -> cfloat, contiguous, unaligned-safe variant.
+ * The guard above removes this variant when NPY_USE_UNALIGNED_ACCESS is
+ * nonzero. The destination is handled as two npy_float components.
+ */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_uint items read from args[0] into pairs of
+ * npy_float written to args[1]: the first component receives the cast
+ * value and the second (the imaginary part, going by the cfloat name) is
+ * set to 0.
+ *
+ * The source advances by sizeof(npy_uint) and the destination by
+ * sizeof(npy_cfloat) per item; `context` and `strides` are not consulted.
+ * Items are copied in and out with memmove rather than dereferenced
+ * through typed pointers.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_uint_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    _TYPE1 loaded;
+    _TYPE2 parts[2];
+
+    while (remaining--) {
+        memmove(&loaded, in, sizeof(loaded));
+
+        parts[0] = _CONVERT_FN(loaded);
+        parts[1] = 0;
+
+        /* sizeof(parts) covers both components */
+        memmove(out, &parts, sizeof(parts));
+
+        out += sizeof(npy_cfloat);
+        in += sizeof(npy_uint);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * uint -> cdouble, strided, aligned variant.
+ * The guard above evaluates to true for any integer value of
+ * NPY_USE_UNALIGNED_ACCESS, so this variant is always compiled.
+ * The destination is handled as two npy_double components.
+ */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_uint items read from args[0] into pairs of
+ * npy_double written to args[1]: the first component receives the cast
+ * value and the second (the imaginary part, going by the cdouble name) is
+ * set to 0.
+ *
+ * The source advances by strides[0] bytes and the destination by
+ * strides[1] bytes per item. Both pointers are dereferenced through typed
+ * pointers, and their alignment is asserted for non-empty input.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_uint_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp src_step = strides[0];
+    npy_intp dst_step = strides[1];
+    _TYPE2 parts[2];
+
+    /* alignment precondition; not checked when there is nothing to do */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    while (N--) {
+        parts[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        parts[1] = 0;
+
+        ((_TYPE2 *)dst)[0] = parts[0];
+        ((_TYPE2 *)dst)[1] = parts[1];
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * uint -> cdouble, strided, unaligned-safe variant.
+ * The guard above removes this variant when NPY_USE_UNALIGNED_ACCESS is
+ * nonzero. The destination is handled as two npy_double components.
+ */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_uint items read from args[0] into pairs of
+ * npy_double written to args[1]: the first component receives the cast
+ * value and the second (the imaginary part, going by the cdouble name) is
+ * set to 0.
+ *
+ * The source advances by strides[0] bytes and the destination by
+ * strides[1] bytes per item. Items are copied in and out with memmove
+ * rather than dereferenced through typed pointers.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_uint_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp in_step = strides[0];
+    npy_intp out_step = strides[1];
+    _TYPE1 loaded;
+    _TYPE2 parts[2];
+
+    while (remaining--) {
+        memmove(&loaded, in, sizeof(loaded));
+
+        parts[0] = _CONVERT_FN(loaded);
+        parts[1] = 0;
+
+        /* sizeof(parts) covers both components */
+        memmove(out, &parts, sizeof(parts));
+
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * uint -> cdouble, contiguous, aligned variant.
+ * The guard above evaluates to true for any integer value of
+ * NPY_USE_UNALIGNED_ACCESS, so this variant is always compiled.
+ * The destination is handled as two npy_double components.
+ */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_uint items read from args[0] into pairs of
+ * npy_double written to args[1]: the first component receives the cast
+ * value and the second (the imaginary part, going by the cdouble name) is
+ * set to 0.
+ *
+ * The source advances by sizeof(npy_uint) and the destination by
+ * sizeof(npy_cdouble) per item; `context` and `strides` are not consulted.
+ * Both pointers are dereferenced through typed pointers, and their
+ * alignment is asserted for non-empty input.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_uint_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE2 parts[2];
+
+    /* alignment precondition; not checked when there is nothing to do */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    while (N--) {
+        parts[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        parts[1] = 0;
+
+        ((_TYPE2 *)dst)[0] = parts[0];
+        ((_TYPE2 *)dst)[1] = parts[1];
+
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_uint);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * uint -> cdouble, contiguous, unaligned-safe variant.
+ * The guard above removes this variant when NPY_USE_UNALIGNED_ACCESS is
+ * nonzero. The destination is handled as two npy_double components.
+ */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_uint items read from args[0] into pairs of
+ * npy_double written to args[1]: the first component receives the cast
+ * value and the second (the imaginary part, going by the cdouble name) is
+ * set to 0.
+ *
+ * The source advances by sizeof(npy_uint) and the destination by
+ * sizeof(npy_cdouble) per item; `context` and `strides` are not consulted.
+ * Items are copied in and out with memmove rather than dereferenced
+ * through typed pointers.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_uint_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    _TYPE1 loaded;
+    _TYPE2 parts[2];
+
+    while (remaining--) {
+        memmove(&loaded, in, sizeof(loaded));
+
+        parts[0] = _CONVERT_FN(loaded);
+        parts[1] = 0;
+
+        /* sizeof(parts) covers both components */
+        memmove(out, &parts, sizeof(parts));
+
+        out += sizeof(npy_cdouble);
+        in += sizeof(npy_uint);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * uint -> clongdouble, strided, aligned variant.
+ * The guard above evaluates to true for any integer value of
+ * NPY_USE_UNALIGNED_ACCESS, so this variant is always compiled.
+ * The destination is handled as two npy_longdouble components.
+ */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Convert dimensions[0] npy_uint items read from args[0] into pairs of
+ * npy_longdouble written to args[1]: the first component receives the
+ * cast value and the second (the imaginary part, going by the clongdouble
+ * name) is set to 0.
+ *
+ * The source advances by strides[0] bytes and the destination by
+ * strides[1] bytes per item. Both pointers are dereferenced through typed
+ * pointers, and their alignment is asserted for non-empty input.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_uint_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp src_step = strides[0];
+    npy_intp dst_step = strides[1];
+    _TYPE2 parts[2];
+
+    /* alignment precondition; not checked when there is nothing to do */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    while (N--) {
+        parts[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        parts[1] = 0;
+
+        ((_TYPE2 *)dst)[0] = parts[0];
+        ((_TYPE2 *)dst)[1] = parts[1];
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Unaligned, strided cast loop: npy_uint -> npy_clongdouble.
+ * Only compiled when NPY_USE_UNALIGNED_ACCESS evaluates to false.
+ *
+ * No half-float type is involved, so the element types are used as-is and
+ * the conversion is a plain C cast to the destination component type.
+ */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements.  Every element is copied through local
+ * temporaries with memmove, so no typed access is made through args[0] or
+ * args[1].  The destination receives two consecutive _TYPE2 components:
+ * the converted source value followed by 0.  Pointers advance by
+ * strides[0] / strides[1] bytes.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_uint_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp count = dimensions[0];
+    const npy_intp step_in = strides[0];
+    const npy_intp step_out = strides[1];
+    _TYPE1 loaded;
+    _TYPE2 pair[2];
+
+    for (; count != 0; --count) {
+        memmove(&loaded, in_ptr, sizeof(loaded));
+
+        /* first component from the source, second component zeroed */
+        pair[0] = _CONVERT_FN(loaded);
+        pair[1] = 0;
+
+        memmove(out_ptr, &pair, sizeof(pair));
+
+        in_ptr += step_in;
+        out_ptr += step_out;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, contiguous cast loop: npy_uint -> npy_clongdouble.
+ *
+ * No half-float type is involved, so the element types are used as-is and
+ * the conversion is a plain C cast to the destination component type.
+ */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements.  Each source element is read as _TYPE1
+ * from args[0]; the destination at args[1] receives two consecutive _TYPE2
+ * components: the converted source value followed by 0.  The strides
+ * argument is not read: pointers advance by the element sizes
+ * (sizeof(npy_uint) and sizeof(npy_clongdouble)).  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_uint_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    _TYPE2 pair[2];
+
+    /* both pointers must be suitably aligned for direct typed access */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* first component from the source, second component zeroed */
+        pair[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        pair[1] = 0;
+
+        ((_TYPE2 *)dst)[0] = pair[0];
+        ((_TYPE2 *)dst)[1] = pair[1];
+
+        src += sizeof(npy_uint);
+        dst += sizeof(npy_clongdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Unaligned, contiguous cast loop: npy_uint -> npy_clongdouble.
+ * Only compiled when NPY_USE_UNALIGNED_ACCESS evaluates to false.
+ *
+ * No half-float type is involved, so the element types are used as-is and
+ * the conversion is a plain C cast to the destination component type.
+ */
+#define _TYPE1 npy_uint
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements.  Every element is copied through local
+ * temporaries with memmove, so no typed access is made through args[0] or
+ * args[1].  The destination receives two consecutive _TYPE2 components:
+ * the converted source value followed by 0.  The strides argument is not
+ * read: pointers advance by sizeof(npy_uint) and sizeof(npy_clongdouble).
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_uint_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp count = dimensions[0];
+    _TYPE1 loaded;
+    _TYPE2 pair[2];
+
+    for (; count != 0; --count) {
+        memmove(&loaded, in_ptr, sizeof(loaded));
+
+        /* first component from the source, second component zeroed */
+        pair[0] = _CONVERT_FN(loaded);
+        pair[1] = 0;
+
+        memmove(out_ptr, &pair, sizeof(pair));
+
+        in_ptr += sizeof(npy_uint);
+        out_ptr += sizeof(npy_clongdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, strided cast loop: npy_ulong -> npy_bool.
+ *
+ * No half-float type is involved; the conversion to bool is a comparison
+ * against zero, so the stored value is always 0 or 1.
+ */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_bool
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Converts dimensions[0] elements.  Each source element is read as _TYPE1
+ * from args[0] and the result of (value != 0) is written as _TYPE2 to
+ * args[1].  Pointers advance by strides[0] / strides[1] bytes.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulong_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp step_in = strides[0];
+    const npy_intp step_out = strides[1];
+
+    /* both pointers must be suitably aligned for direct typed access */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += step_in;
+        dst += step_out;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Unaligned, strided cast loop: npy_ulong -> npy_bool.
+ * Only compiled when NPY_USE_UNALIGNED_ACCESS evaluates to false.
+ *
+ * No half-float type is involved; the conversion to bool is a comparison
+ * against zero, so the stored value is always 0 or 1.
+ */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_bool
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Converts dimensions[0] elements.  Every element is copied through local
+ * temporaries with memmove, so no typed access is made through args[0] or
+ * args[1].  Pointers advance by strides[0] / strides[1] bytes.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ulong_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp count = dimensions[0];
+    const npy_intp step_in = strides[0];
+    const npy_intp step_out = strides[1];
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    for (; count != 0; --count) {
+        memmove(&loaded, in_ptr, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        in_ptr += step_in;
+        out_ptr += step_out;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, contiguous cast loop: npy_ulong -> npy_bool.
+ *
+ * No half-float type is involved; the conversion to bool is a comparison
+ * against zero, so the stored value is always 0 or 1.
+ */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_bool
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Converts dimensions[0] elements.  Each source element is read as _TYPE1
+ * from args[0] and the result of (value != 0) is written as _TYPE2 to
+ * args[1].  The strides argument is not read: pointers advance by
+ * sizeof(npy_ulong) and sizeof(npy_bool).  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulong_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* both pointers must be suitably aligned for direct typed access */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += sizeof(npy_ulong);
+        dst += sizeof(npy_bool);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Unaligned, contiguous cast loop: npy_ulong -> npy_bool.
+ * Only compiled when NPY_USE_UNALIGNED_ACCESS evaluates to false.
+ *
+ * No half-float type is involved; the conversion to bool is a comparison
+ * against zero, so the stored value is always 0 or 1.
+ */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_bool
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Converts dimensions[0] elements.  Every element is copied through local
+ * temporaries with memmove, so no typed access is made through args[0] or
+ * args[1].  The strides argument is not read: pointers advance by
+ * sizeof(npy_ulong) and sizeof(npy_bool).  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulong_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp count = dimensions[0];
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    for (; count != 0; --count) {
+        memmove(&loaded, in_ptr, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        in_ptr += sizeof(npy_ulong);
+        out_ptr += sizeof(npy_bool);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, strided cast loop: npy_ulong -> npy_ubyte.
+ *
+ * No half-float or bool type is involved, so the conversion is a plain C
+ * cast to the destination type.
+ */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_ubyte
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements.  Each source element is read as _TYPE1
+ * from args[0], cast to _TYPE2 and written to args[1].  Pointers advance
+ * by strides[0] / strides[1] bytes.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulong_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp step_in = strides[0];
+    const npy_intp step_out = strides[1];
+
+    /* both pointers must be suitably aligned for direct typed access */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += step_in;
+        dst += step_out;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Unaligned, strided cast loop: npy_ulong -> npy_ubyte.
+ * Only compiled when NPY_USE_UNALIGNED_ACCESS evaluates to false.
+ *
+ * No half-float or bool type is involved, so the conversion is a plain C
+ * cast to the destination type.
+ */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_ubyte
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements.  Every element is copied through local
+ * temporaries with memmove, so no typed access is made through args[0] or
+ * args[1].  Pointers advance by strides[0] / strides[1] bytes.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ulong_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp count = dimensions[0];
+    const npy_intp step_in = strides[0];
+    const npy_intp step_out = strides[1];
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    for (; count != 0; --count) {
+        memmove(&loaded, in_ptr, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        in_ptr += step_in;
+        out_ptr += step_out;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulong_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /*
+    * Sanity check, enabled in this variant: unless N is 0, assert that src
+    * is aligned to NPY_ALIGNOF(_TYPE1) and dst to NPY_ALIGNOF(_TYPE2). The
+    * typed-pointer accesses in the loop below rely on this.
+    */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Disabled debug trace: printf("_aligned_contig_cast_ulong_to_ubyte\n");
+     *
+     * Element loop. N starts at dimensions[0]; each iteration converts one
+     * element by reading src through a _TYPE1 pointer and writing dst
+     * through a _TYPE2 pointer. No temporaries are declared and no memmove
+     * is compiled in this variant. The pointers then advance by
+     * sizeof(npy_ubyte) and sizeof(npy_ulong); strides is never read.
+     * The function always returns 0.
+     */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. With the constant conditions below, the only statement that
+ * is compiled is `*(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);`.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_ulong_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /*
+    * Sanity check on pointer alignment. Compiled out in this variant: the
+    * guard is the constant `#if 0`, so neither assert is part of the build.
+    */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Disabled debug trace: printf("_contig_cast_ulong_to_ubyte\n");
+     *
+     * Element loop. N starts at dimensions[0]; each iteration converts one
+     * element. In the live branches the source bytes are copied into the
+     * local src_value with memmove, converted into dst_value, and copied
+     * out to dst with memmove, so no _TYPE1/_TYPE2 pointer is dereferenced.
+     * The pointers then advance by sizeof(npy_ubyte) and sizeof(npy_ulong);
+     * strides is never read. The function always returns 0.
+     */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. With the constant conditions below, the only statement that
+ * is compiled is `dst_value = _CONVERT_FN(src_value);`.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulong_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /*
+    * Sanity check, enabled in this variant: unless N is 0, assert that src
+    * is aligned to NPY_ALIGNOF(_TYPE1) and dst to NPY_ALIGNOF(_TYPE2). The
+    * typed-pointer accesses in the loop below rely on this.
+    */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Disabled debug trace: printf("_aligned_cast_ulong_to_ushort\n");
+     *
+     * Element loop. N starts at dimensions[0]; each iteration converts one
+     * element by reading src through a _TYPE1 pointer and writing dst
+     * through a _TYPE2 pointer. No temporaries are declared and no memmove
+     * is compiled in this variant. src and dst (char pointers) then advance
+     * by strides[0] and strides[1] bytes. The function always returns 0.
+     */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. With the constant conditions below, the only statement that
+ * is compiled is `*(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);`.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_ulong_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /*
+    * Sanity check on pointer alignment. Compiled out in this variant: the
+    * guard is the constant `#if 0`, so neither assert is part of the build.
+    */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Disabled debug trace: printf("_cast_ulong_to_ushort\n");
+     *
+     * Element loop. N starts at dimensions[0]; each iteration converts one
+     * element. In the live branches the source bytes are copied into the
+     * local src_value with memmove, converted into dst_value, and copied
+     * out to dst with memmove, so no _TYPE1/_TYPE2 pointer is dereferenced.
+     * src and dst (char pointers) then advance by strides[0] and strides[1]
+     * bytes. The function always returns 0.
+     */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. With the constant conditions below, the only statement that
+ * is compiled is `dst_value = _CONVERT_FN(src_value);`.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulong_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /*
+    * Sanity check, enabled in this variant: unless N is 0, assert that src
+    * is aligned to NPY_ALIGNOF(_TYPE1) and dst to NPY_ALIGNOF(_TYPE2). The
+    * typed-pointer accesses in the loop below rely on this.
+    */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Disabled debug trace: printf("_aligned_contig_cast_ulong_to_ushort\n");
+     *
+     * Element loop. N starts at dimensions[0]; each iteration converts one
+     * element by reading src through a _TYPE1 pointer and writing dst
+     * through a _TYPE2 pointer. No temporaries are declared and no memmove
+     * is compiled in this variant. The pointers then advance by
+     * sizeof(npy_ushort) and sizeof(npy_ulong); strides is never read.
+     * The function always returns 0.
+     */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. With the constant conditions below, the only statement that
+ * is compiled is `*(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);`.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_ulong_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /*
+    * Sanity check on pointer alignment. Compiled out in this variant: the
+    * guard is the constant `#if 0`, so neither assert is part of the build.
+    */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Disabled debug trace: printf("_contig_cast_ulong_to_ushort\n");
+     *
+     * Element loop. N starts at dimensions[0]; each iteration converts one
+     * element. In the live branches the source bytes are copied into the
+     * local src_value with memmove, converted into dst_value, and copied
+     * out to dst with memmove, so no _TYPE1/_TYPE2 pointer is dereferenced.
+     * The pointers then advance by sizeof(npy_ushort) and
+     * sizeof(npy_ulong); strides is never read. The function always
+     * returns 0.
+     */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. With the constant conditions below, the only statement that
+ * is compiled is `dst_value = _CONVERT_FN(src_value);`.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulong_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /*
+    * Sanity check, enabled in this variant: unless N is 0, assert that src
+    * is aligned to NPY_ALIGNOF(_TYPE1) and dst to NPY_ALIGNOF(_TYPE2). The
+    * typed-pointer accesses in the loop below rely on this.
+    */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Disabled debug trace: printf("_aligned_cast_ulong_to_uint\n");
+     *
+     * Element loop. N starts at dimensions[0]; each iteration converts one
+     * element by reading src through a _TYPE1 pointer and writing dst
+     * through a _TYPE2 pointer. No temporaries are declared and no memmove
+     * is compiled in this variant. src and dst (char pointers) then advance
+     * by strides[0] and strides[1] bytes. The function always returns 0.
+     */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. With the constant conditions below, the only statement that
+ * is compiled is `*(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);`.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_ulong_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /*
+    * Sanity check on pointer alignment. Compiled out in this variant: the
+    * guard is the constant `#if 0`, so neither assert is part of the build.
+    */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Disabled debug trace: printf("_cast_ulong_to_uint\n");
+     *
+     * Element loop. N starts at dimensions[0]; each iteration converts one
+     * element. In the live branches the source bytes are copied into the
+     * local src_value with memmove, converted into dst_value, and copied
+     * out to dst with memmove, so no _TYPE1/_TYPE2 pointer is dereferenced.
+     * src and dst (char pointers) then advance by strides[0] and strides[1]
+     * bytes. The function always returns 0.
+     */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. With the constant conditions below, the only statement that
+ * is compiled is `dst_value = _CONVERT_FN(src_value);`.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulong_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /*
+    * Sanity check, enabled in this variant: unless N is 0, assert that src
+    * is aligned to NPY_ALIGNOF(_TYPE1) and dst to NPY_ALIGNOF(_TYPE2). The
+    * typed-pointer accesses in the loop below rely on this.
+    */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Disabled debug trace: printf("_aligned_contig_cast_ulong_to_uint\n");
+     *
+     * Element loop. N starts at dimensions[0]; each iteration converts one
+     * element by reading src through a _TYPE1 pointer and writing dst
+     * through a _TYPE2 pointer. No temporaries are declared and no memmove
+     * is compiled in this variant. The pointers then advance by
+     * sizeof(npy_uint) and sizeof(npy_ulong); strides is never read.
+     * The function always returns 0.
+     */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. With the constant conditions below, the only statement that
+ * is compiled is `*(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);`.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N items from args[0] into args[1]; always returns 0 */
+_contig_cast_ulong_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* read / write cursors, advanced once per item */
+#if !1 /* compiled out: this variant never reads strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: local that holds one input item */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: local that holds one converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* compiled out: this variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulong_to_uint\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy in; src is never dereferenced as _TYPE1 */
+#endif
+
+/* Do the cast: with these constants only "dst_value = _CONVERT_FN(src_value)" is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* _CONVERT_FN(x) is ((_TYPE2)x) here, i.e. a cast to npy_uint */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy out; dst is never dereferenced as _TYPE2 */
+#endif
+
+#if 1 /* step each pointer by its fixed element size */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N items from args[0] into args[1]; always returns 0 */
+_aligned_cast_ulong_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* read / write cursors, advanced once per item */
+#if !0 /* active: per-item byte steps taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* compiled out: no staging local, src is dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* compiled out: no staging local, dst is written directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, both pointers must meet their element type's alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulong_to_ulong\n");*/
+
+    while (N--) {
+#if 1 /* nothing to load up front: the inner "#if 0" removes the two-element copy */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these constants only the last branch (direct typed store) is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* _TYPE1 and _TYPE2 are both npy_ulong here, so the cast keeps the value as is */
+#  endif
+#endif
+
+#if 1 /* nothing to copy back: the result was already written through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0 /* compiled out: fixed element-size stepping is not used by this variant */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;  /* advance by the byte strides read from strides[] */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N items from args[0] into args[1]; always returns 0 */
+_cast_ulong_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* read / write cursors, advanced once per item */
+#if !0 /* active: per-item byte steps taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: local that holds one input item */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: local that holds one converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* compiled out: this variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulong_to_ulong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy in; src is never dereferenced as _TYPE1 */
+#endif
+
+/* Do the cast: with these constants only "dst_value = _CONVERT_FN(src_value)" is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* _TYPE1 and _TYPE2 are both npy_ulong here, so the cast keeps the value as is */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy out; dst is never dereferenced as _TYPE2 */
+#endif
+
+#if 0 /* compiled out: fixed element-size stepping is not used by this variant */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;  /* advance by the byte strides read from strides[] */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N items from args[0] into args[1]; always returns 0 */
+_aligned_contig_cast_ulong_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* read / write cursors, advanced once per item */
+#if !1 /* compiled out: this variant never reads strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* compiled out: no staging local, src is dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* compiled out: no staging local, dst is written directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, both pointers must meet their element type's alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulong_to_ulong\n");*/
+
+    while (N--) {
+#if 1 /* nothing to load up front: the inner "#if 0" removes the two-element copy */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these constants only the last branch (direct typed store) is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* _TYPE1 and _TYPE2 are both npy_ulong here, so the cast keeps the value as is */
+#  endif
+#endif
+
+#if 1 /* nothing to copy back: the result was already written through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* step each pointer by its fixed element size */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N items from args[0] into args[1]; always returns 0 */
+_contig_cast_ulong_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* read / write cursors, advanced once per item */
+#if !1 /* compiled out: this variant never reads strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: local that holds one input item */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: local that holds one converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* compiled out: this variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulong_to_ulong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy in; src is never dereferenced as _TYPE1 */
+#endif
+
+/* Do the cast: with these constants only "dst_value = _CONVERT_FN(src_value)" is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* _TYPE1 and _TYPE2 are both npy_ulong here, so the cast keeps the value as is */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy out; dst is never dereferenced as _TYPE2 */
+#endif
+
+#if 1 /* step each pointer by its fixed element size */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N items from args[0] into args[1]; always returns 0 */
+_aligned_cast_ulong_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* read / write cursors, advanced once per item */
+#if !0 /* active: per-item byte steps taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* compiled out: no staging local, src is dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* compiled out: no staging local, dst is written directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, both pointers must meet their element type's alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulong_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1 /* nothing to load up front: the inner "#if 0" removes the two-element copy */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these constants only the last branch (direct typed store) is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* _CONVERT_FN(x) is ((_TYPE2)x) here, i.e. a cast to npy_ulonglong */
+#  endif
+#endif
+
+#if 1 /* nothing to copy back: the result was already written through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0 /* compiled out: fixed element-size stepping is not used by this variant */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;  /* advance by the byte strides read from strides[] */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N items from args[0] into args[1]; always returns 0 */
+_cast_ulong_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* read / write cursors, advanced once per item */
+#if !0 /* active: per-item byte steps taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: local that holds one input item */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: local that holds one converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* compiled out: this variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulong_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy in; src is never dereferenced as _TYPE1 */
+#endif
+
+/* Do the cast: with these constants only "dst_value = _CONVERT_FN(src_value)" is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* _CONVERT_FN(x) is ((_TYPE2)x) here, i.e. a cast to npy_ulonglong */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy out; dst is never dereferenced as _TYPE2 */
+#endif
+
+#if 0 /* compiled out: fixed element-size stepping is not used by this variant */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;  /* advance by the byte strides read from strides[] */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N items from args[0] into args[1]; always returns 0 */
+_aligned_contig_cast_ulong_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* read / write cursors, advanced once per item */
+#if !1 /* compiled out: this variant never reads strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* compiled out: no staging local, src is dereferenced directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* compiled out: no staging local, dst is written directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, both pointers must meet their element type's alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulong_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1 /* nothing to load up front: the inner "#if 0" removes the two-element copy */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these constants only the last branch (direct typed store) is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* _CONVERT_FN(x) is ((_TYPE2)x) here, i.e. a cast to npy_ulonglong */
+#  endif
+#endif
+
+#if 1 /* nothing to copy back: the result was already written through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* step each pointer by its fixed element size */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N items from args[0] into args[1]; always returns 0 */
+_contig_cast_ulong_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* read / write cursors, advanced once per item */
+#if !1 /* compiled out: this variant never reads strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: local that holds one input item */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: local that holds one converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* compiled out: this variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulong_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy in; src is never dereferenced as _TYPE1 */
+#endif
+
+/* Do the cast: with these constants only "dst_value = _CONVERT_FN(src_value)" is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* _CONVERT_FN(x) is ((_TYPE2)x) here, i.e. a cast to npy_ulonglong */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy out; dst is never dereferenced as _TYPE2 */
+#endif
+
+#if 1 /* step each pointer by its fixed element size */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulong items at args[0] to npy_byte at args[1]; aligned, strided loop; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulong_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No staging temporaries in this variant: both "#elif !1" declarations below are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulong_to_byte\n");*/
+    /* One element per iteration; context is never referenced in this function. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the final "#else" line is compiled; it reads src and writes dst in place with a plain (_TYPE2) C cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store step: the "#if 0" nested in the "#if 1" below leaves it empty. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the byte strides taken from strides[0] and strides[1]. */
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulong items at args[0] to npy_byte at args[1]; strided loop using memmove; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ulong_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Staging temporaries: each element is copied in and out with memmove, never dereferenced in place. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (disabled by the "#if 0" above, so alignment is not asserted here) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulong_to_byte\n");*/
+    /* One element per iteration; context is never referenced in this function. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only "dst_value = _CONVERT_FN(src_value)" in the "#elif !0" branch is compiled; a plain (_TYPE2) C cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write the converted value back through memmove (the "#else" branch below). */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the byte strides taken from strides[0] and strides[1]. */
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulong items at args[0] to npy_byte at args[1]; aligned, contiguous loop; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulong_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* The stride locals above and the staging temporaries below are all compiled out in this variant. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulong_to_byte\n");*/
+    /* One element per iteration; context is never referenced in this function. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the final "#else" line is compiled; it reads src and writes dst in place with a plain (_TYPE2) C cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store step: the "#if 0" nested in the "#if 1" below leaves it empty. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step each pointer by its own element size. */
+#if 1
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulong items at args[0] to npy_byte at args[1]; contiguous loop using memmove; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulong_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* The stride locals above are compiled out; src_value and dst_value stage each element for memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (disabled by the "#if 0" above, so alignment is not asserted here) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulong_to_byte\n");*/
+    /* One element per iteration; context is never referenced in this function. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only "dst_value = _CONVERT_FN(src_value)" in the "#elif !0" branch is compiled; a plain (_TYPE2) C cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write the converted value back through memmove (the "#else" branch below). */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step each pointer by its own element size. */
+#if 1
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulong items at args[0] to npy_short at args[1]; aligned, strided loop; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulong_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No staging temporaries in this variant: both "#elif !1" declarations below are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulong_to_short\n");*/
+    /* One element per iteration; context is never referenced in this function. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the final "#else" line is compiled; it reads src and writes dst in place with a plain (_TYPE2) C cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store step: the "#if 0" nested in the "#if 1" below leaves it empty. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the byte strides taken from strides[0] and strides[1]. */
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulong items at args[0] to npy_short at args[1]; strided loop using memmove; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ulong_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Staging temporaries: each element is copied in and out with memmove, never dereferenced in place. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (disabled by the "#if 0" above, so alignment is not asserted here) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulong_to_short\n");*/
+    /* One element per iteration; context is never referenced in this function. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only "dst_value = _CONVERT_FN(src_value)" in the "#elif !0" branch is compiled; a plain (_TYPE2) C cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write the converted value back through memmove (the "#else" branch below). */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the byte strides taken from strides[0] and strides[1]. */
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulong items at args[0] to npy_short at args[1]; aligned, contiguous loop; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulong_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* The stride locals above and the staging temporaries below are all compiled out in this variant. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulong_to_short\n");*/
+    /* One element per iteration; context is never referenced in this function. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the final "#else" line is compiled; it reads src and writes dst in place with a plain (_TYPE2) C cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store step: the "#if 0" nested in the "#if 1" below leaves it empty. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step each pointer by its own element size. */
+#if 1
+        dst += sizeof(npy_short);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulong items at args[0] to npy_short at args[1]; contiguous loop using memmove; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulong_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* The stride locals above are compiled out; src_value and dst_value stage each element for memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (disabled by the "#if 0" above, so alignment is not asserted here) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulong_to_short\n");*/
+    /* One element per iteration; context is never referenced in this function. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only "dst_value = _CONVERT_FN(src_value)" in the "#elif !0" branch is compiled; a plain (_TYPE2) C cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write the converted value back through memmove (the "#else" branch below). */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step each pointer by its own element size. */
+#if 1
+        dst += sizeof(npy_short);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulong items at args[0] to npy_int at args[1]; aligned, strided loop; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulong_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No staging temporaries in this variant: both "#elif !1" declarations below are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulong_to_int\n");*/
+    /* One element per iteration; context is never referenced in this function. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the final "#else" line is compiled; it reads src and writes dst in place with a plain (_TYPE2) C cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store step: the "#if 0" nested in the "#if 1" below leaves it empty. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the byte strides taken from strides[0] and strides[1]. */
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* !0 is 1, so this reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the alternative type selection below is compiled out */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_ulong  /* element type read from src */
+#define _TYPE2 npy_int  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half source conversions are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the conversions to half are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* false: the != 0 truth-value conversion is compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken */
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* plain C cast to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_ulong_to_int(  /* strided npy_ulong -> npy_int cast loop that copies through temporaries with memmove */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0], strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0  /* taken: this variant steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-component temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* taken: scalar temporary filled by memmove */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-component temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* taken: scalar temporary copied out by memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: this variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulong_to_int\n");*/
+
+    while (N--) {
+#if 0  /* compiled out: direct typed load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* taken */
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load into the temporary */
+#endif
+
+/* Do the cast via _CONVERT_FN; the loop itself performs no range check */
+#if 0  /* compiled out: two-component source handling */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-component source */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* taken: convert between the two temporaries */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* compiled out: direct typed store */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* taken */
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store from the temporary */
+#endif
+
+#if 0  /* compiled out: fixed element-size step */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_ulong);
+#else  /* taken: step by the byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* release the per-variant helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the alternative type selection below is compiled out */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_ulong  /* element type read from src */
+#define _TYPE2 npy_int  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half source conversions are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the conversions to half are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* false: the != 0 truth-value conversion is compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken */
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* plain C cast to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_contig_cast_ulong_to_int(  /* npy_ulong -> npy_int cast loop that steps by element size and accesses elements in place */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not referenced in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* compiled out: strides are not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-component temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* compiled out: no temporary, elements are accessed in place */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-component temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* compiled out: no temporary, elements are accessed in place */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* taken */
+   /* sanity check: assert npy_is_aligned() for both pointers (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulong_to_int\n");*/
+
+    while (N--) {
+#if 1  /* taken, but its only content is compiled out, so no separate load is emitted */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast via _CONVERT_FN; the loop itself performs no range check */
+#if 0  /* compiled out: two-component source handling */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-component source */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* taken: read, convert and write through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but its only content is compiled out, so no separate store is emitted */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* taken: step each cursor by one element of its type */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_ulong);
+#else  /* compiled out: stride-based step */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* release the per-variant helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* !0 is 1, so this reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the alternative type selection below is compiled out */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_ulong  /* element type read from src */
+#define _TYPE2 npy_int  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half source conversions are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the conversions to half are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* false: the != 0 truth-value conversion is compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken */
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* plain C cast to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_contig_cast_ulong_to_int(  /* npy_ulong -> npy_int cast loop that steps by element size and copies through temporaries with memmove */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not referenced in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* compiled out: strides are not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-component temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* taken: scalar temporary filled by memmove */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-component temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* taken: scalar temporary copied out by memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: this variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulong_to_int\n");*/
+
+    while (N--) {
+#if 0  /* compiled out: direct typed load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* taken */
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load into the temporary */
+#endif
+
+/* Do the cast via _CONVERT_FN; the loop itself performs no range check */
+#if 0  /* compiled out: two-component source handling */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-component source */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* taken: convert between the two temporaries */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* compiled out: direct typed store */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* taken */
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store from the temporary */
+#endif
+
+#if 1  /* taken: step each cursor by one element of its type */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_ulong);
+#else  /* compiled out: stride-based step */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* release the per-variant helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the alternative type selection below is compiled out */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_ulong  /* element type read from src */
+#define _TYPE2 npy_long  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half source conversions are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the conversions to half are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* false: the != 0 truth-value conversion is compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken */
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* plain C cast to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_cast_ulong_to_long(  /* strided npy_ulong -> npy_long cast loop that accesses elements in place */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0], strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0  /* taken: this variant steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-component temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* compiled out: no temporary, elements are accessed in place */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-component temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* compiled out: no temporary, elements are accessed in place */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* taken */
+   /* sanity check: assert npy_is_aligned() for both pointers (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulong_to_long\n");*/
+
+    while (N--) {
+#if 1  /* taken, but its only content is compiled out, so no separate load is emitted */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast via _CONVERT_FN; the loop itself performs no range check */
+#if 0  /* compiled out: two-component source handling */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-component source */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* taken: read, convert and write through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but its only content is compiled out, so no separate store is emitted */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* compiled out: fixed element-size step */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_ulong);
+#else  /* taken: step by the byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* release the per-variant helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* !0 is 1, so this reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the alternative type selection below is compiled out */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_ulong  /* element type read from src */
+#define _TYPE2 npy_long  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half source conversions are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the conversions to half are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* false: the != 0 truth-value conversion is compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken */
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* plain C cast to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_ulong_to_long(  /* strided npy_ulong -> npy_long cast loop that copies through temporaries with memmove */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0], strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0  /* taken: this variant steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-component temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* taken: scalar temporary filled by memmove */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-component temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* taken: scalar temporary copied out by memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: this variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulong_to_long\n");*/
+
+    while (N--) {
+#if 0  /* compiled out: direct typed load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* taken */
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load into the temporary */
+#endif
+
+/* Do the cast via _CONVERT_FN; the loop itself performs no range check */
+#if 0  /* compiled out: two-component source handling */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-component source */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* taken: convert between the two temporaries */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* compiled out: direct typed store */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* taken */
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store from the temporary */
+#endif
+
+#if 0  /* compiled out: fixed element-size step */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_ulong);
+#else  /* taken: step by the byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* release the per-variant helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the alternative type selection below is compiled out */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_ulong  /* element type read from src */
+#define _TYPE2 npy_long  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half source conversions are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the conversions to half are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* false: the != 0 truth-value conversion is compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken */
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* plain C cast to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_contig_cast_ulong_to_long(  /* npy_ulong -> npy_long cast loop that steps by element size and accesses elements in place */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not referenced in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* compiled out: strides are not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-component temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* compiled out: no temporary, elements are accessed in place */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-component temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* compiled out: no temporary, elements are accessed in place */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* taken */
+   /* sanity check: assert npy_is_aligned() for both pointers (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulong_to_long\n");*/
+
+    while (N--) {
+#if 1  /* taken, but its only content is compiled out, so no separate load is emitted */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast via _CONVERT_FN; the loop itself performs no range check */
+#if 0  /* compiled out: two-component source handling */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-component source */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* taken: read, convert and write through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but its only content is compiled out, so no separate store is emitted */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* taken: step each cursor by one element of its type */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_ulong);
+#else  /* compiled out: stride-based step */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* release the per-variant helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* !0 is 1, so this reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the alternative type selection below is compiled out */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_ulong  /* element type read from src */
+#define _TYPE2 npy_long  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half source conversions are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the conversions to half are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* false: the != 0 truth-value conversion is compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken */
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* plain C cast to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_contig_cast_ulong_to_long(  /* npy_ulong -> npy_long cast loop that steps by element size and copies through temporaries with memmove */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not referenced in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* compiled out: strides are not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-component temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* taken: scalar temporary filled by memmove */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-component temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* taken: scalar temporary copied out by memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: this variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulong_to_long\n");*/
+
+    while (N--) {
+#if 0  /* compiled out: direct typed load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* taken */
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load into the temporary */
+#endif
+
+/* Do the cast via _CONVERT_FN; the loop itself performs no range check */
+#if 0  /* compiled out: two-component source handling */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-component source */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* taken: convert between the two temporaries */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* compiled out: direct typed store */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* taken */
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store from the temporary */
+#endif
+
+#if 1  /* taken: step each cursor by one element of its type */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_ulong);
+#else  /* compiled out: stride-based step */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* release the per-variant helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the alternative type selection below is compiled out */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_ulong  /* element type read from src */
+#define _TYPE2 npy_longlong  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half source conversions are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the conversions to half are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* false: the != 0 truth-value conversion is compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken */
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* plain C cast to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_cast_ulong_to_longlong(  /* strided npy_ulong -> npy_longlong cast loop that accesses elements in place */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0], strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0  /* taken: this variant steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-component temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* compiled out: no temporary, elements are accessed in place */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-component temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* compiled out: no temporary, elements are accessed in place */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* taken */
+   /* sanity check: assert npy_is_aligned() for both pointers (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulong_to_longlong\n");*/
+
+    while (N--) {
+#if 1  /* taken, but its only content is compiled out, so no separate load is emitted */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast via _CONVERT_FN; the loop itself performs no range check */
+#if 0  /* compiled out: two-component source handling */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-component source */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* taken: read, convert and write through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but its only content is compiled out, so no separate store is emitted */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* compiled out: fixed element-size step */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_ulong);
+#else  /* taken: step by the byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* release the per-variant helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* !0 is 1, so this reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the alternative type selection below is compiled out */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_ulong  /* element type read from src */
+#define _TYPE2 npy_longlong  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: the npy_half source conversions are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: the conversions to half are compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* false: the != 0 truth-value conversion is compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken */
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* plain C cast to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_ulong_to_longlong(  /* strided npy_ulong -> npy_longlong cast loop that copies through temporaries with memmove */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0], strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0  /* taken: this variant steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* compiled out: two-component temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* taken: scalar temporary filled by memmove */
+    _TYPE1 src_value;
+#endif
+#if 0  /* compiled out: two-component temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* taken: scalar temporary copied out by memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: this variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulong_to_longlong\n");*/
+
+    while (N--) {
+#if 0  /* compiled out: direct typed load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* taken */
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load into the temporary */
+#endif
+
+/* Do the cast via _CONVERT_FN; the loop itself performs no range check */
+#if 0  /* compiled out: two-component source handling */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-component source */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* taken: convert between the two temporaries */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* compiled out: direct typed store */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* taken */
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store from the temporary */
+#endif
+
+#if 0  /* compiled out: fixed element-size step */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_ulong);
+#else  /* taken: step by the byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* release the per-variant helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Element types and conversion used by the ulong -> longlong cast below. */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_longlong
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Aligned, contiguous cast loop from npy_ulong to npy_longlong.
+ *
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  Both pointers are dereferenced directly as their element
+ * types (their alignment is checked by the asserts) and are advanced by
+ * the element sizes; `strides` and `context` are not read.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulong_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* convert one element in place, then step to the next pair */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_ulong);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Element types and conversion used by the ulong -> longlong cast below. */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_longlong
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Contiguous cast loop from npy_ulong to npy_longlong that never
+ * dereferences the buffers as typed pointers.
+ *
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  Every element is copied into a local with memmove, converted,
+ * and copied back out with memmove.  The pointers are advanced by the
+ * element sizes; `strides` and `context` are not read.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulong_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+    _TYPE1 in_elem;
+    _TYPE2 out_elem;
+
+    for (; N != 0; --N) {
+        /* load -> convert -> store through local temporaries */
+        memmove(&in_elem, src, sizeof(in_elem));
+        out_elem = _CONVERT_FN(in_elem);
+        memmove(dst, &out_elem, sizeof(out_elem));
+
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_ulong);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Element types and conversion used by the ulong -> half cast below.
+ * The conversion casts the value to float and hands it to
+ * npy_float_to_half.
+ */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_half
+#define _CONVERT_FN(x) npy_float_to_half((float)x)
+
+/*
+ * Aligned, strided cast loop from npy_ulong to npy_half.
+ *
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  Both pointers are dereferenced directly as their element
+ * types (their alignment is checked by the asserts).  After each element
+ * the source advances by strides[0] bytes and the destination by
+ * strides[1] bytes; `context` is not read.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulong_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* convert one element in place, then step to the next pair */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Element types and conversion used by the ulong -> half cast below.
+ * The conversion casts the value to float and hands it to
+ * npy_float_to_half.
+ */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_half
+#define _CONVERT_FN(x) npy_float_to_half((float)x)
+
+/*
+ * Strided cast loop from npy_ulong to npy_half that never dereferences
+ * the buffers as typed pointers.
+ *
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  Every element is copied into a local with memmove, converted,
+ * and copied back out with memmove.  After each element the source
+ * advances by strides[0] bytes and the destination by strides[1] bytes;
+ * `context` is not read.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ulong_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 in_elem;
+    _TYPE2 out_elem;
+
+    for (; N != 0; --N) {
+        /* load -> convert -> store through local temporaries */
+        memmove(&in_elem, src, sizeof(in_elem));
+        out_elem = _CONVERT_FN(in_elem);
+        memmove(dst, &out_elem, sizeof(out_elem));
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Element types and conversion used by the ulong -> half cast below.
+ * The conversion casts the value to float and hands it to
+ * npy_float_to_half.
+ */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_half
+#define _CONVERT_FN(x) npy_float_to_half((float)x)
+
+/*
+ * Aligned, contiguous cast loop from npy_ulong to npy_half.
+ *
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  Both pointers are dereferenced directly as their element
+ * types (their alignment is checked by the asserts) and are advanced by
+ * the element sizes; `strides` and `context` are not read.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulong_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* convert one element in place, then step to the next pair */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+        dst += sizeof(npy_half);
+        src += sizeof(npy_ulong);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Element types and conversion used by the ulong -> half cast below.
+ * The conversion casts the value to float and hands it to
+ * npy_float_to_half.
+ */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_half
+#define _CONVERT_FN(x) npy_float_to_half((float)x)
+
+/*
+ * Contiguous cast loop from npy_ulong to npy_half that never
+ * dereferences the buffers as typed pointers.
+ *
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  Every element is copied into a local with memmove, converted,
+ * and copied back out with memmove.  The pointers are advanced by the
+ * element sizes; `strides` and `context` are not read.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulong_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+    _TYPE1 in_elem;
+    _TYPE2 out_elem;
+
+    for (; N != 0; --N) {
+        /* load -> convert -> store through local temporaries */
+        memmove(&in_elem, src, sizeof(in_elem));
+        out_elem = _CONVERT_FN(in_elem);
+        memmove(dst, &out_elem, sizeof(out_elem));
+
+        dst += sizeof(npy_half);
+        src += sizeof(npy_ulong);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Element types and conversion used by the ulong -> float cast below. */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Aligned, strided cast loop from npy_ulong to npy_float.
+ *
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  Both pointers are dereferenced directly as their element
+ * types (their alignment is checked by the asserts).  After each element
+ * the source advances by strides[0] bytes and the destination by
+ * strides[1] bytes; `context` is not read.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulong_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* convert one element in place, then step to the next pair */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Element types and conversion used by the ulong -> float cast below. */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast loop from npy_ulong to npy_float that never dereferences
+ * the buffers as typed pointers.
+ *
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  Every element is copied into a local with memmove, converted,
+ * and copied back out with memmove.  After each element the source
+ * advances by strides[0] bytes and the destination by strides[1] bytes;
+ * `context` is not read.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ulong_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 in_elem;
+    _TYPE2 out_elem;
+
+    for (; N != 0; --N) {
+        /* load -> convert -> store through local temporaries */
+        memmove(&in_elem, src, sizeof(in_elem));
+        out_elem = _CONVERT_FN(in_elem);
+        memmove(dst, &out_elem, sizeof(out_elem));
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Element types and conversion used by the ulong -> float cast below. */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Aligned, contiguous cast loop from npy_ulong to npy_float.
+ *
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  Both pointers are dereferenced directly as their element
+ * types (their alignment is checked by the asserts) and are advanced by
+ * the element sizes; `strides` and `context` are not read.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulong_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* convert one element in place, then step to the next pair */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+        dst += sizeof(npy_float);
+        src += sizeof(npy_ulong);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* !0 is 1, so the guard above is just !NPY_USE_UNALIGNED_ACCESS: this variant is left out when that macro is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+/* Live branch (the test above is 0 || 0): the element types read from src and written to dst. */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), so _CONVERT_FN is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous memmove-based loop: casts dimensions[0] npy_ulong items at args[0] to npy_float items at args[1]; context and strides are unused; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulong_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries; the two-element array forms are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulong_to_float\n");*/
+    /* Each pass: memmove one item into src_value, convert it, memmove dst_value out, then advance. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* copy the converted value into dst */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* contiguous layout: advance both pointers by one item size */
+#if 1
+        dst += sizeof(npy_float);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* !1 is 0, so the guard above is always true: this variant is built whatever NPY_USE_UNALIGNED_ACCESS is. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+/* Live branch (the test above is 0 || 0): the element types read from src and written to dst. */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), so _CONVERT_FN is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop with alignment asserts: casts dimensions[0] npy_ulong items at args[0] to npy_double items at args[1], stepping by strides[0] and strides[1]; context is unused; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulong_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries: both the two-element and the scalar declarations below are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulong_to_double\n");*/
+    /* Each pass: convert one item directly through typed pointers, then advance. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement assigns _CONVERT_FN(*(_TYPE1 *)src) to *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* nothing is compiled from the next group: the cast above already wrote to dst */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* strided layout: advance both pointers by their strides (src and dst are char *, so in bytes) */
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* !0 is 1, so the guard above is just !NPY_USE_UNALIGNED_ACCESS: this variant is left out when that macro is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+/* Live branch (the test above is 0 || 0): the element types read from src and written to dst. */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), so _CONVERT_FN is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided memmove-based loop: casts dimensions[0] npy_ulong items at args[0] to npy_double items at args[1], stepping by strides[0] and strides[1]; context is unused; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ulong_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries; the two-element array forms are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulong_to_double\n");*/
+    /* Each pass: memmove one item into src_value, convert it, memmove dst_value out, then advance. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* copy the converted value into dst */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* strided layout: advance both pointers by their strides (src and dst are char *, so in bytes) */
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* !1 is 0, so the guard above is always true: this variant is built whatever NPY_USE_UNALIGNED_ACCESS is. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+/* Live branch (the test above is 0 || 0): the element types read from src and written to dst. */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), so _CONVERT_FN is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop with alignment asserts: casts dimensions[0] npy_ulong items at args[0] to npy_double items at args[1]; context and strides are unused; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulong_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries: both the two-element and the scalar declarations below are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulong_to_double\n");*/
+    /* Each pass: convert one item directly through typed pointers, then advance. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement assigns _CONVERT_FN(*(_TYPE1 *)src) to *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* nothing is compiled from the next group: the cast above already wrote to dst */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* contiguous layout: advance both pointers by one item size */
+#if 1
+        dst += sizeof(npy_double);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* !0 is 1, so the guard above is just !NPY_USE_UNALIGNED_ACCESS: this variant is left out when that macro is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+/* Live branch (the test above is 0 || 0): the element types read from src and written to dst. */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), so _CONVERT_FN is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous memmove-based loop: casts dimensions[0] npy_ulong items at args[0] to npy_double items at args[1]; context and strides are unused; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulong_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries; the two-element array forms are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulong_to_double\n");*/
+    /* Each pass: memmove one item into src_value, convert it, memmove dst_value out, then advance. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* copy the converted value into dst */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* contiguous layout: advance both pointers by one item size */
+#if 1
+        dst += sizeof(npy_double);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* !1 is 0, so the guard above is always true: this variant is built whatever NPY_USE_UNALIGNED_ACCESS is. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* Live branch (the test above is 0 || 0): the element types read from src and written to dst. */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), so _CONVERT_FN is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop with alignment asserts: casts dimensions[0] npy_ulong items at args[0] to npy_longdouble items at args[1], stepping by strides[0] and strides[1]; context is unused; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulong_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries: both the two-element and the scalar declarations below are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulong_to_longdouble\n");*/
+    /* Each pass: convert one item directly through typed pointers, then advance. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement assigns _CONVERT_FN(*(_TYPE1 *)src) to *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* nothing is compiled from the next group: the cast above already wrote to dst */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* strided layout: advance both pointers by their strides (src and dst are char *, so in bytes) */
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* !0 is 1, so the guard above is just !NPY_USE_UNALIGNED_ACCESS: this variant is left out when that macro is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* Live branch (the test above is 0 || 0): the element types read from src and written to dst. */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), so _CONVERT_FN is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided memmove-based loop: casts dimensions[0] npy_ulong items at args[0] to npy_longdouble items at args[1], stepping by strides[0] and strides[1]; context is unused; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ulong_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries; the two-element array forms are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulong_to_longdouble\n");*/
+    /* Each pass: memmove one item into src_value, convert it, memmove dst_value out, then advance. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* copy the converted value into dst */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* strided layout: advance both pointers by their strides (src and dst are char *, so in bytes) */
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* !1 is 0, so the guard above is always true: this variant is built whatever NPY_USE_UNALIGNED_ACCESS is. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* Live branch (the test above is 0 || 0): the element types read from src and written to dst. */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), so _CONVERT_FN is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop with alignment asserts: casts dimensions[0] npy_ulong items at args[0] to npy_longdouble items at args[1]; context and strides are unused; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulong_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries: both the two-element and the scalar declarations below are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulong_to_longdouble\n");*/
+    /* Each pass: convert one item directly through typed pointers, then advance. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement assigns _CONVERT_FN(*(_TYPE1 *)src) to *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* nothing is compiled from the next group: the cast above already wrote to dst */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* contiguous layout: advance both pointers by one item size */
+#if 1
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* !0 is 1, so the guard above is just !NPY_USE_UNALIGNED_ACCESS: this variant is left out when that macro is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* Live branch (the test above is 0 || 0): the element types read from src and written to dst. */
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the npy_bool form is disabled (0 || 0), so _CONVERT_FN is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous memmove-based loop: casts dimensions[0] npy_ulong items at args[0] to npy_longdouble items at args[1]; context and strides are unused; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulong_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries; the two-element array forms are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulong_to_longdouble\n");*/
+    /* Each pass: memmove one item into src_value, convert it, memmove dst_value out, then advance. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* copy the converted value into dst */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* contiguous layout: advance both pointers by one item size */
+#if 1
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion (not the case here: "0 || 0" below selects the #else types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half branches are disabled here, leaving the plain (_TYPE2) cast) */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* casts N npy_ulong values into (value, 0) pairs of _TYPE2 via typed loads/stores; always returns 0 */
+_aligned_cast_ulong_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination step */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* iterations remaining */
+    char *src = args[0], *dst = args[1];  /* char cursors, so every step below is in bytes */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* holds both output components until they are stored */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced through typed pointers below, so both must be suitably aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulong_to_cfloat\n");*/
+
+    while (N--) {  /* one source value per pass */
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer "#if 0" is false, so only the #else half applies) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load straight from src, then cast to _TYPE2 */
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* typed stores straight into dst */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;  /* step by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (not the case here: "0 || 0" below selects the #else types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half branches are disabled here, leaving the plain (_TYPE2) cast) */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* casts N npy_ulong values into (value, 0) pairs of _TYPE2 via memmove; always returns 0 */
+_cast_ulong_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination step */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* iterations remaining */
+    char *src = args[0], *dst = args[1];  /* char cursors, so every step below is in bytes */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current source value */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* holds both output components until they are stored */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the enclosing "#if 0"; this variant moves data with memmove rather than typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulong_to_cfloat\n");*/
+
+    while (N--) {  /* one source value per pass */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy, so src is never dereferenced as _TYPE1 */
+#endif
+
+/* Do the cast (the outer "#if 0" is false, so only the #else half applies) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* cast the local copy to _TYPE2 */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise copy of both components into dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;  /* step by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion (not the case here: "0 || 0" below selects the #else types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half branches are disabled here, leaving the plain (_TYPE2) cast) */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* casts N back-to-back npy_ulong values into (value, 0) pairs of _TYPE2 via typed loads/stores; always returns 0 */
+_aligned_contig_cast_ulong_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* iterations remaining */
+    char *src = args[0], *dst = args[1];  /* char cursors, so every step below is in bytes */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* holds both output components until they are stored */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced through typed pointers below, so both must be suitably aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulong_to_cfloat\n");*/
+
+    while (N--) {  /* one source value per pass */
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer "#if 0" is false, so only the #else half applies) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load straight from src, then cast to _TYPE2 */
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* typed stores straight into dst */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cfloat);  /* fixed element-size steps; strides are ignored */
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (not the case here: "0 || 0" below selects the #else types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half branches are disabled here, leaving the plain (_TYPE2) cast) */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* casts N back-to-back npy_ulong values into (value, 0) pairs of _TYPE2 via memmove; always returns 0 */
+_contig_cast_ulong_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* iterations remaining */
+    char *src = args[0], *dst = args[1];  /* char cursors, so every step below is in bytes */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current source value */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* holds both output components until they are stored */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the enclosing "#if 0"; this variant moves data with memmove rather than typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulong_to_cfloat\n");*/
+
+    while (N--) {  /* one source value per pass */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy, so src is never dereferenced as _TYPE1 */
+#endif
+
+/* Do the cast (the outer "#if 0" is false, so only the #else half applies) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* cast the local copy to _TYPE2 */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise copy of both components into dst */
+#endif
+
+#if 1
+        dst += sizeof(npy_cfloat);  /* fixed element-size steps; strides are ignored */
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion (not the case here: "0 || 0" below selects the #else types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half branches are disabled here, leaving the plain (_TYPE2) cast) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* casts N npy_ulong values into (value, 0) pairs of _TYPE2 via typed loads/stores; always returns 0 */
+_aligned_cast_ulong_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination step */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* iterations remaining */
+    char *src = args[0], *dst = args[1];  /* char cursors, so every step below is in bytes */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* holds both output components until they are stored */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced through typed pointers below, so both must be suitably aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulong_to_cdouble\n");*/
+
+    while (N--) {  /* one source value per pass */
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer "#if 0" is false, so only the #else half applies) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load straight from src, then cast to _TYPE2 */
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* typed stores straight into dst */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;  /* step by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (not the case here: "0 || 0" below selects the #else types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half branches are disabled here, leaving the plain (_TYPE2) cast) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* casts N npy_ulong values into (value, 0) pairs of _TYPE2 via memmove; always returns 0 */
+_cast_ulong_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination step */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* iterations remaining */
+    char *src = args[0], *dst = args[1];  /* char cursors, so every step below is in bytes */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current source value */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* holds both output components until they are stored */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the enclosing "#if 0"; this variant moves data with memmove rather than typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulong_to_cdouble\n");*/
+
+    while (N--) {  /* one source value per pass */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy, so src is never dereferenced as _TYPE1 */
+#endif
+
+/* Do the cast (the outer "#if 0" is false, so only the #else half applies) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* cast the local copy to _TYPE2 */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise copy of both components into dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;  /* step by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion (not the case here: "0 || 0" below selects the #else types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half branches are disabled here, leaving the plain (_TYPE2) cast) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* casts N back-to-back npy_ulong values into (value, 0) pairs of _TYPE2 via typed loads/stores; always returns 0 */
+_aligned_contig_cast_ulong_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* iterations remaining */
+    char *src = args[0], *dst = args[1];  /* char cursors, so every step below is in bytes */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* holds both output components until they are stored */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced through typed pointers below, so both must be suitably aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulong_to_cdouble\n");*/
+
+    while (N--) {  /* one source value per pass */
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer "#if 0" is false, so only the #else half applies) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load straight from src, then cast to _TYPE2 */
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* typed stores straight into dst */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cdouble);  /* fixed element-size steps; strides are ignored */
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (not the case here: "0 || 0" below selects the #else types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half branches are disabled here, leaving the plain (_TYPE2) cast) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* casts N back-to-back npy_ulong values into (value, 0) pairs of _TYPE2 via memmove; always returns 0 */
+_contig_cast_ulong_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* iterations remaining */
+    char *src = args[0], *dst = args[1];  /* char cursors, so every step below is in bytes */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current source value */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* holds both output components until they are stored */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the enclosing "#if 0"; this variant moves data with memmove rather than typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulong_to_cdouble\n");*/
+
+    while (N--) {  /* one source value per pass */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy, so src is never dereferenced as _TYPE1 */
+#endif
+
+/* Do the cast (the outer "#if 0" is false, so only the #else half applies) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* cast the local copy to _TYPE2 */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise copy of both components into dst */
+#endif
+
+#if 1
+        dst += sizeof(npy_cdouble);  /* fixed element-size steps; strides are ignored */
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion (not the case here: "0 || 0" below selects the #else types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half branches are disabled here, leaving the plain (_TYPE2) cast) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* casts N npy_ulong values into (value, 0) pairs of _TYPE2 via typed loads/stores; always returns 0 */
+_aligned_cast_ulong_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination step */
+        NpyAuxData *NPY_UNUSED(data))  /* auxiliary data is not used */
+{
+    npy_intp N = dimensions[0];  /* iterations remaining */
+    char *src = args[0], *dst = args[1];  /* char cursors, so every step below is in bytes */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* holds both output components until they are stored */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced through typed pointers below, so both must be suitably aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulong_to_clongdouble\n");*/
+
+    while (N--) {  /* one source value per pass */
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer "#if 0" is false, so only the #else half applies) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load straight from src, then cast to _TYPE2 */
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* typed stores straight into dst */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;  /* step by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+/* Guard reduces to !NPY_USE_UNALIGNED_ACCESS: this variant is built only when that macro evaluates to 0. */
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion (branch disabled here: "0 || 0"). */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the last #else applies here: ((_TYPE2)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_ulong -> npy_clongdouble cast loop; loads and stores go through memmove temporaries. Returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ulong_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors (byte pointers) */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Disabled debug trace: printf("_cast_ulong_to_clongdouble\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch fills dst_value[0] from src_value and sets dst_value[1] to 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: memmove copies both dst_value slots to dst. */
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+/* Guard is always true ("&& !1" is 0), so this variant is built regardless of NPY_USE_UNALIGNED_ACCESS. */
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion (branch disabled here: "0 || 0"). */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the last #else applies here: ((_TYPE2)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_ulong -> npy_clongdouble cast loop using typed pointer access; strides is not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulong_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors (byte pointers) */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 respectively unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Disabled debug trace: printf("_aligned_contig_cast_ulong_to_clongdouble\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch fills dst_value[0] straight from *src and sets dst_value[1] to 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: both dst_value slots are written through a _TYPE2 pointer. */
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by one element size each. */
+#if 1
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+/* Guard reduces to !NPY_USE_UNALIGNED_ACCESS: this variant is built only when that macro evaluates to 0. */
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion (branch disabled here: "0 || 0"). */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulong
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the last #else applies here: ((_TYPE2)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_ulong -> npy_clongdouble cast loop via memmove temporaries; strides is not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulong_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors (byte pointers) */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Disabled debug trace: printf("_contig_cast_ulong_to_clongdouble\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch fills dst_value[0] from src_value and sets dst_value[1] to 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: memmove copies both dst_value slots to dst. */
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by one element size each. */
+#if 1
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_ulong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+/* Guard is always true ("&& !1" is 0), so this variant is built regardless of NPY_USE_UNALIGNED_ACCESS. */
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion (branch disabled here: "0 || 0"). */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the last #else applies here: ((npy_bool)(x != 0)). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_ulonglong -> npy_bool cast loop using typed pointer access (writes x != 0). Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulonglong_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors (byte pointers) */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 respectively unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Disabled debug trace: printf("_aligned_cast_ulonglong_to_bool\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts *src and writes the result directly to *dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store step is compiled here: the cast above already wrote to dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+/* Guard reduces to !NPY_USE_UNALIGNED_ACCESS: this variant is built only when that macro evaluates to 0. */
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion (branch disabled here: "0 || 0"). */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the last #else applies here: ((npy_bool)(x != 0)). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_ulonglong -> npy_bool cast loop via memmove temporaries (writes x != 0). Returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ulonglong_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors (byte pointers) */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Disabled debug trace: printf("_cast_ulonglong_to_bool\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: memmove copies dst_value to dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+/* Guard is always true ("&& !1" is 0), so this variant is built regardless of NPY_USE_UNALIGNED_ACCESS. */
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion (branch disabled here: "0 || 0"). */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the last #else applies here: ((npy_bool)(x != 0)). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_ulonglong -> npy_bool cast loop using typed pointer access; strides is not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulonglong_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors (byte pointers) */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 respectively unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Disabled debug trace: printf("_aligned_contig_cast_ulonglong_to_bool\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts *src and writes the result directly to *dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store step is compiled here: the cast above already wrote to dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by one element size each. */
+#if 1
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+/* Guard reduces to !NPY_USE_UNALIGNED_ACCESS: this variant is built only when that macro evaluates to 0. */
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion (branch disabled here: "0 || 0"). */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the last #else applies here: ((npy_bool)(x != 0)). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_ulonglong -> npy_bool cast loop via memmove temporaries; strides is not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulonglong_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors (byte pointers) */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Disabled debug trace: printf("_contig_cast_ulonglong_to_bool\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: memmove copies dst_value to dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by one element size each. */
+#if 1
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+/* Guard is always true ("&& !1" is 0), so this variant is built regardless of NPY_USE_UNALIGNED_ACCESS. */
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion (branch disabled here: "0 || 0"). */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the last #else applies here: ((_TYPE2)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_ulonglong -> npy_ubyte cast loop using typed pointer access (plain C cast). Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulonglong_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors (byte pointers) */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 respectively unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Disabled debug trace: printf("_aligned_cast_ulonglong_to_ubyte\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts *src and writes the result directly to *dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store step is compiled here: the cast above already wrote to dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+/* Guard reduces to !NPY_USE_UNALIGNED_ACCESS: this variant is built only when that macro evaluates to 0. */
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion (branch disabled here: "0 || 0"). */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the last #else applies here: ((_TYPE2)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_ulonglong -> npy_ubyte cast loop via memmove temporaries (plain C cast). Returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ulonglong_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors (byte pointers) */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* Disabled debug trace: printf("_cast_ulonglong_to_ubyte\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: memmove copies dst_value to dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this condition is always true and the instance is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair: the #else definitions below are the active ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* element type read from src */
+#define _TYPE2 npy_ubyte  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false here: the npy_half-based conversions in this branch are not used */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false here as well */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch for this type pair */
+
+#  if 0 || 0  /* false: the `!= 0` conversion to npy_bool is not selected */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N npy_ulonglong elements at args[0] into npy_ubyte elements at args[1]; always returns 0 */
+_aligned_contig_cast_ulonglong_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this instance */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the source and destination buffers */
+#if !1  /* false: strides are not loaded; the loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no src_value temporary is declared in this instance */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no dst_value temporary is declared in this instance */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: this instance asserts pointer alignment */
+   /* sanity check: unless N is 0, src and dst must satisfy the alignment of their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulonglong_to_ubyte\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* true, and the nested block is disabled: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else group below holds the active statement */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active statement: read through a typed pointer, convert, store directly into dst */
+#  endif
+#endif
+
+#if 1  /* true, and the nested block is disabled: the result was already stored above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* true: advance both cursors by one element of their respective types */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN  /* remove the per-instance macros defined above */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* `!0` is 1, so this instance is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair: the #else definitions below are the active ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* element type read from src */
+#define _TYPE2 npy_ubyte  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false here: the npy_half-based conversions in this branch are not used */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false here as well */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch for this type pair */
+
+#  if 0 || 0  /* false: the `!= 0` conversion to npy_bool is not selected */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N npy_ulonglong elements at args[0] into npy_ubyte elements at args[1]; always returns 0 */
+_contig_cast_ulonglong_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this instance */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the source and destination buffers */
+#if !1  /* false: strides are not loaded; the loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: a scalar temporary receives each source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: a scalar temporary holds each converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: no alignment assertions in this instance */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulonglong_to_ubyte\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0  /* false: the memmove in the #else branch is used */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active: byte-copy one source element into the temporary instead of dereferencing a typed pointer */
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else group below holds the active statement */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active statement: convert the temporary into dst_value */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* false: the memmove in the #else branch is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active: byte-copy the converted value into dst */
+#endif
+
+#if 1  /* true: advance both cursors by one element of their respective types */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN  /* remove the per-instance macros defined above */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this condition is always true and the instance is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair: the #else definitions below are the active ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* element type read from src */
+#define _TYPE2 npy_ushort  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false here: the npy_half-based conversions in this branch are not used */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false here as well */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch for this type pair */
+
+#  if 0 || 0  /* false: the `!= 0` conversion to npy_bool is not selected */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N npy_ulonglong elements at args[0] into npy_ushort elements at args[1]; always returns 0 */
+_aligned_cast_ulonglong_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0] / strides[1] = source / destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the source and destination buffers */
+#if !0  /* true: this instance steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps applied to src and dst after each element */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no src_value temporary is declared in this instance */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no dst_value temporary is declared in this instance */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: this instance asserts pointer alignment */
+   /* sanity check: unless N is 0, src and dst must satisfy the alignment of their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulonglong_to_ushort\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* true, and the nested block is disabled: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else group below holds the active statement */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active statement: read through a typed pointer, convert, store directly into dst */
+#  endif
+#endif
+
+#if 1  /* true, and the nested block is disabled: the result was already stored above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* false: the cursors advance by the strides in the #else branch */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN  /* remove the per-instance macros defined above */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* `!0` is 1, so this instance is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair: the #else definitions below are the active ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* element type read from src */
+#define _TYPE2 npy_ushort  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false here: the npy_half-based conversions in this branch are not used */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false here as well */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch for this type pair */
+
+#  if 0 || 0  /* false: the `!= 0` conversion to npy_bool is not selected */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N npy_ulonglong elements at args[0] into npy_ushort elements at args[1]; always returns 0 */
+_cast_ulonglong_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0] / strides[1] = source / destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the source and destination buffers */
+#if !0  /* true: this instance steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps applied to src and dst after each element */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: a scalar temporary receives each source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: a scalar temporary holds each converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: no alignment assertions in this instance */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulonglong_to_ushort\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0  /* false: the memmove in the #else branch is used */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active: byte-copy one source element into the temporary instead of dereferencing a typed pointer */
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else group below holds the active statement */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active statement: convert the temporary into dst_value */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* false: the memmove in the #else branch is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active: byte-copy the converted value into dst */
+#endif
+
+#if 0  /* false: the cursors advance by the strides in the #else branch */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN  /* remove the per-instance macros defined above */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this condition is always true and the instance is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair: the #else definitions below are the active ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* element type read from src */
+#define _TYPE2 npy_ushort  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false here: the npy_half-based conversions in this branch are not used */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false here as well */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch for this type pair */
+
+#  if 0 || 0  /* false: the `!= 0` conversion to npy_bool is not selected */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N npy_ulonglong elements at args[0] into npy_ushort elements at args[1]; always returns 0 */
+_aligned_contig_cast_ulonglong_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this instance */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the source and destination buffers */
+#if !1  /* false: strides are not loaded; the loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no src_value temporary is declared in this instance */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no dst_value temporary is declared in this instance */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: this instance asserts pointer alignment */
+   /* sanity check: unless N is 0, src and dst must satisfy the alignment of their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulonglong_to_ushort\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* true, and the nested block is disabled: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else group below holds the active statement */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active statement: read through a typed pointer, convert, store directly into dst */
+#  endif
+#endif
+
+#if 1  /* true, and the nested block is disabled: the result was already stored above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* true: advance both cursors by one element of their respective types */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN  /* remove the per-instance macros defined above */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* `!0` is 1, so this instance is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair: the #else definitions below are the active ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* element type read from src */
+#define _TYPE2 npy_ushort  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false here: the npy_half-based conversions in this branch are not used */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false here as well */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch for this type pair */
+
+#  if 0 || 0  /* false: the `!= 0` conversion to npy_bool is not selected */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N npy_ulonglong elements at args[0] into npy_ushort elements at args[1]; always returns 0 */
+_contig_cast_ulonglong_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this instance */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the source and destination buffers */
+#if !1  /* false: strides are not loaded; the loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: a scalar temporary receives each source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: a scalar temporary holds each converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: no alignment assertions in this instance */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulonglong_to_ushort\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0  /* false: the memmove in the #else branch is used */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active: byte-copy one source element into the temporary instead of dereferencing a typed pointer */
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else group below holds the active statement */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active statement: convert the temporary into dst_value */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* false: the memmove in the #else branch is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active: byte-copy the converted value into dst */
+#endif
+
+#if 1  /* true: advance both cursors by one element of their respective types */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN  /* remove the per-instance macros defined above */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this condition is always true and the instance is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair: the #else definitions below are the active ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* element type read from src */
+#define _TYPE2 npy_uint  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false here: the npy_half-based conversions in this branch are not used */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false here as well */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch for this type pair */
+
+#  if 0 || 0  /* false: the `!= 0` conversion to npy_bool is not selected */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N npy_ulonglong elements at args[0] into npy_uint elements at args[1]; always returns 0 */
+_aligned_cast_ulonglong_to_uint(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0] / strides[1] = source / destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the source and destination buffers */
+#if !0  /* true: this instance steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps applied to src and dst after each element */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no src_value temporary is declared in this instance */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no dst_value temporary is declared in this instance */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: this instance asserts pointer alignment */
+   /* sanity check: unless N is 0, src and dst must satisfy the alignment of their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulonglong_to_uint\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* true, and the nested block is disabled: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else group below holds the active statement */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active statement: read through a typed pointer, convert, store directly into dst */
+#  endif
+#endif
+
+#if 1  /* true, and the nested block is disabled: the result was already stored above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* false: the cursors advance by the strides in the #else branch */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN  /* remove the per-instance macros defined above */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* `!0` is 1, so this instance is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair: the #else definitions below are the active ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* element type read from src */
+#define _TYPE2 npy_uint  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false here: the npy_half-based conversions in this branch are not used */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false here as well */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch for this type pair */
+
+#  if 0 || 0  /* false: the `!= 0` conversion to npy_bool is not selected */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N npy_ulonglong elements at args[0] into npy_uint elements at args[1]; always returns 0 */
+_cast_ulonglong_to_uint(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0] / strides[1] = source / destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the source and destination buffers */
+#if !0  /* true: this instance steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps applied to src and dst after each element */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: a scalar temporary receives each source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: a scalar temporary holds each converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: no alignment assertions in this instance */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulonglong_to_uint\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0  /* false: the memmove in the #else branch is used */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active: byte-copy one source element into the temporary instead of dereferencing a typed pointer */
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else group below holds the active statement */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* active statement: convert the temporary into dst_value */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* false: the memmove in the #else branch is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active: byte-copy the converted value into dst */
+#endif
+
+#if 0  /* false: the cursors advance by the strides in the #else branch */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN  /* remove the per-instance macros defined above */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this condition is always true and the instance is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair: the #else definitions below are the active ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* element type read from src */
+#define _TYPE2 npy_uint  /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false here: the npy_half-based conversions in this branch are not used */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false here as well */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch for this type pair */
+
+#  if 0 || 0  /* false: the `!= 0` conversion to npy_bool is not selected */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast of x to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* converts N npy_ulonglong elements at args[0] into npy_uint elements at args[1]; always returns 0 */
+_aligned_contig_cast_ulonglong_to_uint(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this instance */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the source and destination buffers */
+#if !1  /* false: strides are not loaded; the loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no src_value temporary is declared in this instance */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no dst_value temporary is declared in this instance */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: this instance asserts pointer alignment */
+   /* sanity check: unless N is 0, src and dst must satisfy the alignment of their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulonglong_to_uint\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* true, and the nested block is disabled: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else group below holds the active statement */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* active statement: read through a typed pointer, convert, store directly into dst */
+#  endif
+#endif
+
+#if 1  /* true, and the nested block is disabled: the result was already stored above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* true: advance both cursors by one element of their respective types */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN  /* remove the per-instance macros defined above */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Source and destination element types of this specialization. */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_uint
+
+/* The conversion is a plain C cast to the destination element type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Contiguous npy_ulonglong -> npy_uint cast loop that makes no alignment
+ * assumption: each element is moved through a local with memmove().
+ * The enclosing guard compiles this variant only when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to false.
+ *
+ * args[0] / args[1] are the source / destination buffers and
+ * dimensions[0] is the number of elements.  `context`, `strides` and
+ * `data` are not read; both pointers advance by their element size.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulonglong_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining != 0; --remaining) {
+        _TYPE1 loaded;
+        _TYPE2 converted;
+
+        /* byte-wise load, cast, byte-wise store */
+        memmove(&loaded, in_ptr, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        in_ptr += sizeof(npy_ulonglong);
+        out_ptr += sizeof(npy_uint);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Source and destination element types of this specialization. */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_ulong
+
+/* The conversion is a plain C cast to the destination element type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided npy_ulonglong -> npy_ulong cast loop for aligned buffers.
+ * The enclosing guard is always true for this variant (`&& !1`).
+ *
+ * args[0] / args[1] are the source / destination buffers,
+ * dimensions[0] is the number of elements and strides[0] / strides[1]
+ * are the byte steps applied to source / destination after each
+ * element.  Elements are read and written through typed pointers, so
+ * alignment is asserted up front (skipped when N is 0).
+ * `context` and `data` are not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulonglong_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_stride = strides[0];
+    const npy_intp dst_stride = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 value = *(_TYPE1 *)src;
+
+        *(_TYPE2 *)dst = _CONVERT_FN(value);
+        src += src_stride;
+        dst += dst_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Source and destination element types of this specialization. */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_ulong
+
+/* The conversion is a plain C cast to the destination element type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided npy_ulonglong -> npy_ulong cast loop that makes no alignment
+ * assumption: each element is moved through a local with memmove().
+ * The enclosing guard compiles this variant only when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to false.
+ *
+ * args[0] / args[1] are the source / destination buffers,
+ * dimensions[0] is the number of elements and strides[0] / strides[1]
+ * are the byte steps applied to source / destination after each
+ * element.  `context` and `data` are not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ulonglong_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining != 0; --remaining) {
+        _TYPE1 loaded;
+        _TYPE2 converted;
+
+        /* byte-wise load, cast, byte-wise store */
+        memmove(&loaded, in_ptr, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        in_ptr += in_step;
+        out_ptr += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Source and destination element types of this specialization. */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_ulong
+
+/* The conversion is a plain C cast to the destination element type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Contiguous npy_ulonglong -> npy_ulong cast loop for aligned buffers.
+ * The enclosing guard is always true for this variant (`&& !1`).
+ *
+ * args[0] / args[1] are the source / destination buffers and
+ * dimensions[0] is the number of elements.  Elements are read and
+ * written through typed pointers, so alignment is asserted up front
+ * (skipped when N is 0); both pointers advance by their element size.
+ * `context`, `strides` and `data` are not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulonglong_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 value = *(_TYPE1 *)src;
+
+        *(_TYPE2 *)dst = _CONVERT_FN(value);
+        src += sizeof(npy_ulonglong);
+        dst += sizeof(npy_ulong);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Source and destination element types of this specialization. */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_ulong
+
+/* The conversion is a plain C cast to the destination element type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Contiguous npy_ulonglong -> npy_ulong cast loop that makes no
+ * alignment assumption: each element is moved through a local with
+ * memmove().  The enclosing guard compiles this variant only when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to false.
+ *
+ * args[0] / args[1] are the source / destination buffers and
+ * dimensions[0] is the number of elements.  `context`, `strides` and
+ * `data` are not read; both pointers advance by their element size.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulonglong_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining != 0; --remaining) {
+        _TYPE1 loaded;
+        _TYPE2 converted;
+
+        /* byte-wise load, cast, byte-wise store */
+        memmove(&loaded, in_ptr, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        in_ptr += sizeof(npy_ulonglong);
+        out_ptr += sizeof(npy_ulong);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Source and destination element types (identical here). */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_ulonglong
+
+/* The conversion is a plain C cast to the destination element type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided npy_ulonglong -> npy_ulonglong loop for aligned buffers.
+ * The enclosing guard is always true for this variant (`&& !1`).
+ *
+ * args[0] / args[1] are the source / destination buffers,
+ * dimensions[0] is the number of elements and strides[0] / strides[1]
+ * are the byte steps applied to source / destination after each
+ * element.  Elements are read and written through typed pointers, so
+ * alignment is asserted up front (skipped when N is 0).
+ * `context` and `data` are not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulonglong_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_stride = strides[0];
+    const npy_intp dst_stride = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 value = *(_TYPE1 *)src;
+
+        *(_TYPE2 *)dst = _CONVERT_FN(value);
+        src += src_stride;
+        dst += dst_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Source and destination element types (identical here). */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_ulonglong
+
+/* The conversion is a plain C cast to the destination element type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided npy_ulonglong -> npy_ulonglong loop that makes no alignment
+ * assumption: each element is moved through a local with memmove().
+ * The enclosing guard compiles this variant only when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to false.
+ *
+ * args[0] / args[1] are the source / destination buffers,
+ * dimensions[0] is the number of elements and strides[0] / strides[1]
+ * are the byte steps applied to source / destination after each
+ * element.  `context` and `data` are not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ulonglong_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining != 0; --remaining) {
+        _TYPE1 loaded;
+        _TYPE2 converted;
+
+        /* byte-wise load, cast, byte-wise store */
+        memmove(&loaded, in_ptr, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        in_ptr += in_step;
+        out_ptr += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Source and destination element types (identical here). */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_ulonglong
+
+/* The conversion is a plain C cast to the destination element type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Contiguous npy_ulonglong -> npy_ulonglong loop for aligned buffers.
+ * The enclosing guard is always true for this variant (`&& !1`).
+ *
+ * args[0] / args[1] are the source / destination buffers and
+ * dimensions[0] is the number of elements.  Elements are read and
+ * written through typed pointers, so alignment is asserted up front
+ * (skipped when N is 0); both pointers advance by their element size.
+ * `context`, `strides` and `data` are not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulonglong_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 value = *(_TYPE1 *)src;
+
+        *(_TYPE2 *)dst = _CONVERT_FN(value);
+        src += sizeof(npy_ulonglong);
+        dst += sizeof(npy_ulonglong);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Source and destination element types (identical here). */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_ulonglong
+
+/* The conversion is a plain C cast to the destination element type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Contiguous npy_ulonglong -> npy_ulonglong loop that makes no
+ * alignment assumption: each element is moved through a local with
+ * memmove().  The enclosing guard compiles this variant only when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to false.
+ *
+ * args[0] / args[1] are the source / destination buffers and
+ * dimensions[0] is the number of elements.  `context`, `strides` and
+ * `data` are not read; both pointers advance by their element size.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulonglong_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining != 0; --remaining) {
+        _TYPE1 loaded;
+        _TYPE2 converted;
+
+        /* byte-wise load, cast, byte-wise store */
+        memmove(&loaded, in_ptr, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        in_ptr += sizeof(npy_ulonglong);
+        out_ptr += sizeof(npy_ulonglong);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* The `&& !1` term is 0, so this guard is always true and the aligned loop is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source item type and _TYPE2 the destination item type. */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: the `0 || 0` test below is false, so _CONVERT_FN is a plain C cast to _TYPE2 (no range check). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: cast dimensions[0] items, npy_ulonglong at args[0] -> npy_byte at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulonglong_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the starting src/dst pointers must be aligned for _TYPE1/_TYPE2 (skipped if N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulonglong_to_byte\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the last one (direct typed load from src, store to dst); the rest are compiled out */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by their byte strides; the sizeof arm is compiled out. */
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The `&& !0` term is 1, so this loop is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source item type and _TYPE2 the destination item type. */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: the `0 || 0` test below is false, so _CONVERT_FN is a plain C cast to _TYPE2 (no range check). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop with memmove staging: cast dimensions[0] items, npy_ulonglong at args[0] -> npy_byte at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ulonglong_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the #if 0 above: this variant stages items through memmove) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulonglong_to_byte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is dst_value = _CONVERT_FN(src_value); all other arms are compiled out */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by their byte strides; the sizeof arm is compiled out. */
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* The `&& !1` term is 0, so this guard is always true and the aligned loop is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source item type and _TYPE2 the destination item type. */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: the `0 || 0` test below is false, so _CONVERT_FN is a plain C cast to _TYPE2 (no range check). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: cast dimensions[0] items, npy_ulonglong at args[0] -> npy_byte at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulonglong_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the starting src/dst pointers must be aligned for _TYPE1/_TYPE2 (skipped if N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulonglong_to_byte\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the last one (direct typed load from src, store to dst); the rest are compiled out */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by one packed item each; the stride arm is compiled out. */
+#if 1
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The `&& !0` term is 1, so this loop is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source item type and _TYPE2 the destination item type. */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: the `0 || 0` test below is false, so _CONVERT_FN is a plain C cast to _TYPE2 (no range check). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop with memmove staging: cast dimensions[0] items, npy_ulonglong at args[0] -> npy_byte at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulonglong_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the #if 0 above: this variant stages items through memmove) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulonglong_to_byte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is dst_value = _CONVERT_FN(src_value); all other arms are compiled out */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by one packed item each; the stride arm is compiled out. */
+#if 1
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* The `&& !1` term is 0, so this guard is always true and the aligned loop is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source item type and _TYPE2 the destination item type. */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: the `0 || 0` test below is false, so _CONVERT_FN is a plain C cast to _TYPE2 (no range check). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: cast dimensions[0] items, npy_ulonglong at args[0] -> npy_short at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulonglong_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the starting src/dst pointers must be aligned for _TYPE1/_TYPE2 (skipped if N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulonglong_to_short\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the last one (direct typed load from src, store to dst); the rest are compiled out */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by their byte strides; the sizeof arm is compiled out. */
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The `&& !0` term is 1, so this loop is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source item type and _TYPE2 the destination item type. */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: the `0 || 0` test below is false, so _CONVERT_FN is a plain C cast to _TYPE2 (no range check). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop with memmove staging: cast dimensions[0] items, npy_ulonglong at args[0] -> npy_short at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ulonglong_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the #if 0 above: this variant stages items through memmove) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulonglong_to_short\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is dst_value = _CONVERT_FN(src_value); all other arms are compiled out */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by their byte strides; the sizeof arm is compiled out. */
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* The `&& !1` term is 0, so this guard is always true and the aligned loop is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source item type and _TYPE2 the destination item type. */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: the `0 || 0` test below is false, so _CONVERT_FN is a plain C cast to _TYPE2 (no range check). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: cast dimensions[0] items, npy_ulonglong at args[0] -> npy_short at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulonglong_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the starting src/dst pointers must be aligned for _TYPE1/_TYPE2 (skipped if N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulonglong_to_short\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the last one (direct typed load from src, store to dst); the rest are compiled out */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by one packed item each; the stride arm is compiled out. */
+#if 1
+        dst += sizeof(npy_short);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The `&& !0` term is 1, so this loop is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source item type and _TYPE2 the destination item type. */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: the `0 || 0` test below is false, so _CONVERT_FN is a plain C cast to _TYPE2 (no range check). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop with memmove staging: cast dimensions[0] items, npy_ulonglong at args[0] -> npy_short at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulonglong_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the #if 0 above: this variant stages items through memmove) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulonglong_to_short\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is dst_value = _CONVERT_FN(src_value); all other arms are compiled out */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by one packed item each; the stride arm is compiled out. */
+#if 1
+        dst += sizeof(npy_short);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* The `&& !1` term is 0, so this guard is always true and the aligned loop is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source item type and _TYPE2 the destination item type. */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: the `0 || 0` test below is false, so _CONVERT_FN is a plain C cast to _TYPE2 (no range check). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: cast dimensions[0] items, npy_ulonglong at args[0] -> npy_int at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulonglong_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the starting src/dst pointers must be aligned for _TYPE1/_TYPE2 (skipped if N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulonglong_to_int\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the last one (direct typed load from src, store to dst); the rest are compiled out */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by their byte strides; the sizeof arm is compiled out. */
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Source/destination element types and the conversion expression for the
+ * ulonglong -> int instantiation.  No half-float handling applies here, so
+ * the conversion is a plain C cast to the destination type.
+ */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_int
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast of dimensions[0] elements from npy_ulonglong (args[0]) to
+ * npy_int (args[1]) that makes no alignment assumption: every element is
+ * copied into a local with memmove, converted, and copied back out with
+ * memmove.  The source pointer advances by strides[0] bytes and the
+ * destination pointer by strides[1] bytes per element.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ulonglong_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; N != 0; --N) {
+        /* Stage through locals so unaligned pointers are never dereferenced */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Source/destination element types and the conversion expression for the
+ * ulonglong -> int instantiation.  No half-float handling applies here, so
+ * the conversion is a plain C cast to the destination type.
+ */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_int
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast of dimensions[0] elements from npy_ulonglong (args[0]) to npy_int
+ * (args[1]) for aligned, contiguous data: elements are read and written
+ * directly through typed pointers and both pointers advance by the size of
+ * their element type.  The strides argument is not read.  When assertions
+ * are enabled, both pointers are checked for alignment unless the count is
+ * zero.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulonglong_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        dst += sizeof(_TYPE2);
+        src += sizeof(_TYPE1);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Source/destination element types and the conversion expression for the
+ * ulonglong -> int instantiation.  No half-float handling applies here, so
+ * the conversion is a plain C cast to the destination type.
+ */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_int
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast of dimensions[0] elements from npy_ulonglong (args[0]) to npy_int
+ * (args[1]) for contiguous data with no alignment assumption: every element
+ * is copied into a local with memmove, converted, and copied back out with
+ * memmove.  Both pointers advance by the size of their element type; the
+ * strides argument is not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulonglong_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; N != 0; --N) {
+        /* Stage through locals so unaligned pointers are never dereferenced */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        dst += sizeof(_TYPE2);
+        src += sizeof(_TYPE1);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Source/destination element types and the conversion expression for the
+ * ulonglong -> long instantiation.  No half-float handling applies here, so
+ * the conversion is a plain C cast to the destination type.
+ */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_long
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast of dimensions[0] elements from npy_ulonglong (args[0]) to
+ * npy_long (args[1]) for aligned data: elements are read and written
+ * directly through typed pointers.  The source pointer advances by
+ * strides[0] bytes and the destination pointer by strides[1] bytes per
+ * element.  When assertions are enabled, both pointers are checked for
+ * alignment unless the count is zero.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulonglong_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Source/destination element types and the conversion expression for the
+ * ulonglong -> long instantiation.  No half-float handling applies here, so
+ * the conversion is a plain C cast to the destination type.
+ */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_long
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast of dimensions[0] elements from npy_ulonglong (args[0]) to
+ * npy_long (args[1]) that makes no alignment assumption: every element is
+ * copied into a local with memmove, converted, and copied back out with
+ * memmove.  The source pointer advances by strides[0] bytes and the
+ * destination pointer by strides[1] bytes per element.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ulonglong_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; N != 0; --N) {
+        /* Stage through locals so unaligned pointers are never dereferenced */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Source/destination element types and the conversion expression for the
+ * ulonglong -> long instantiation.  No half-float handling applies here, so
+ * the conversion is a plain C cast to the destination type.
+ */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_long
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast of dimensions[0] elements from npy_ulonglong (args[0]) to npy_long
+ * (args[1]) for aligned, contiguous data: elements are read and written
+ * directly through typed pointers and both pointers advance by the size of
+ * their element type.  The strides argument is not read.  When assertions
+ * are enabled, both pointers are checked for alignment unless the count is
+ * zero.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulonglong_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        dst += sizeof(_TYPE2);
+        src += sizeof(_TYPE1);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Source/destination element types and the conversion expression for the
+ * ulonglong -> long instantiation.  No half-float handling applies here, so
+ * the conversion is a plain C cast to the destination type.
+ */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_long
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast of dimensions[0] elements from npy_ulonglong (args[0]) to npy_long
+ * (args[1]) for contiguous data with no alignment assumption: every element
+ * is copied into a local with memmove, converted, and copied back out with
+ * memmove.  Both pointers advance by the size of their element type; the
+ * strides argument is not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulonglong_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; N != 0; --N) {
+        /* Stage through locals so unaligned pointers are never dereferenced */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        dst += sizeof(_TYPE2);
+        src += sizeof(_TYPE1);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Source/destination element types and the conversion expression for the
+ * ulonglong -> longlong instantiation.  No half-float handling applies
+ * here, so the conversion is a plain C cast to the destination type.
+ */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_longlong
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast of dimensions[0] elements from npy_ulonglong (args[0]) to
+ * npy_longlong (args[1]) for aligned data: elements are read and written
+ * directly through typed pointers.  The source pointer advances by
+ * strides[0] bytes and the destination pointer by strides[1] bytes per
+ * element.  When assertions are enabled, both pointers are checked for
+ * alignment unless the count is zero.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulonglong_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Source/destination element types and the conversion expression for the
+ * ulonglong -> longlong instantiation.  No half-float handling applies
+ * here, so the conversion is a plain C cast to the destination type.
+ */
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_longlong
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast of dimensions[0] elements from npy_ulonglong (args[0]) to
+ * npy_longlong (args[1]) that makes no alignment assumption: every element
+ * is copied into a local with memmove, converted, and copied back out with
+ * memmove.  The source pointer advances by strides[0] bytes and the
+ * destination pointer by strides[1] bytes per element.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ulonglong_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; N != 0; --N) {
+        /* Stage through locals so unaligned pointers are never dereferenced */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "!1" is 0, so this guard is always true */
+/* Cast loop npy_ulonglong -> npy_longlong, direct typed access, item-size steps. NOTE(review): looks like template output (see #line) - confirm */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false: the plain typedefs in the #else branch are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else /* active branch */
+
+#define _TYPE1 npy_ulonglong /* source item type */
+#define _TYPE2 npy_longlong /* destination item type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive branch */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* inactive branch */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] items read at args[0] and writes them at args[1]; always returns 0. context and strides are not used. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulonglong_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of items to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !1 /* false: no stride locals; the cursors advance by item size below */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: no src_value temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no dst_value temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* active: the alignment assertions are compiled in */
+   /* sanity check: unless N is 0, src and dst must be aligned for their item types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulonglong_to_longlong\n");*/
+
+    while (N--) { /* one iteration per item */
+#if 1 /* active, but the nested "#if 0" leaves this load section empty */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* inactive branch */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: read, convert and store directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* active, but empty: the cast above already wrote the result to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: step both cursors by one item size */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* "!0" is 1: compiled only if NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+/* Cast loop npy_ulonglong -> npy_longlong via memmove temporaries, item-size steps. NOTE(review): looks like template output (see #line) - confirm */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false: the plain typedefs in the #else branch are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else /* active branch */
+
+#define _TYPE1 npy_ulonglong /* source item type */
+#define _TYPE2 npy_longlong /* destination item type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive branch */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* inactive branch */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] items read at args[0] and writes them at args[1]; always returns 0. context and strides are not used. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulonglong_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of items to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !1 /* false: no stride locals; the cursors advance by item size below */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: scalar temporary that receives the source bytes */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: scalar temporary that holds the converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* inactive: no alignment assertions in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulonglong_to_longlong\n");*/
+
+    while (N--) { /* one iteration per item */
+#if 0 /* inactive: no typed loads from src */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-copy the item into src_value (presumably because src may be unaligned - confirm) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* inactive branch */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the temporary into dst_value */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0 /* inactive: no typed stores to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-copy the converted item out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: step both cursors by one item size */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "!1" is 0, so this guard is always true */
+/* Cast loop npy_ulonglong -> npy_half, direct typed access, stride steps. NOTE(review): looks like template output (see #line) - confirm */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1 /* true: this branch is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else /* active: source item type */
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else /* active: destination item type */
+#    define _TYPE2 npy_half
+#  endif
+
+#else /* inactive branch */
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive branch: nothing below is defined, including the "#elif 1" entry */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1 /* active branch */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else /* active: the item is cast to float and passed to npy_float_to_half */
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* inactive branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] items read at args[0] and writes them at args[1], stepping by strides[0]/strides[1]; always returns 0. context is not used. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulonglong_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of items to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !0 /* true: the cursors are advanced by these values at the end of each iteration */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: no src_value temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no dst_value temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* active: the alignment assertions are compiled in */
+   /* sanity check: unless N is 0, src and dst must be aligned for their item types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulonglong_to_half\n");*/
+
+    while (N--) { /* one iteration per item */
+#if 1 /* active, but the nested "#if 0" leaves this load section empty */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* inactive branch */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: read, convert and store directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* active, but empty: the cast above already wrote the result to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0 /* inactive: item-size stepping is not used here */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_ulonglong);
+#else /* active: step both cursors by their strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* "!0" is 1: compiled only if NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+/* Cast loop npy_ulonglong -> npy_half via memmove temporaries, stride steps. NOTE(review): looks like template output (see #line) - confirm */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1 /* true: this branch is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else /* active: source item type */
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else /* active: destination item type */
+#    define _TYPE2 npy_half
+#  endif
+
+#else /* inactive branch */
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive branch: nothing below is defined, including the "#elif 1" entry */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1 /* active branch */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else /* active: the item is cast to float and passed to npy_float_to_half */
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* inactive branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] items read at args[0] and writes them at args[1], stepping by strides[0]/strides[1]; always returns 0. context is not used. */
+static NPY_GCC_OPT_3 int
+_cast_ulonglong_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of items to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !0 /* true: the cursors are advanced by these values at the end of each iteration */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: scalar temporary that receives the source bytes */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: scalar temporary that holds the converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* inactive: no alignment assertions in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulonglong_to_half\n");*/
+
+    while (N--) { /* one iteration per item */
+#if 0 /* inactive: no typed loads from src */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-copy the item into src_value (presumably because src may be unaligned - confirm) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* inactive branch */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the temporary into dst_value */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0 /* inactive: no typed stores to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-copy the converted item out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0 /* inactive: item-size stepping is not used here */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_ulonglong);
+#else /* active: step both cursors by their strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "!1" is 0, so this guard is always true */
+/* Cast loop npy_ulonglong -> npy_half, direct typed access, item-size steps. NOTE(review): looks like template output (see #line) - confirm */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1 /* true: this branch is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else /* active: source item type */
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else /* active: destination item type */
+#    define _TYPE2 npy_half
+#  endif
+
+#else /* inactive branch */
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive branch: nothing below is defined, including the "#elif 1" entry */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1 /* active branch */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else /* active: the item is cast to float and passed to npy_float_to_half */
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* inactive branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] items read at args[0] and writes them at args[1]; always returns 0. context and strides are not used. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulonglong_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of items to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !1 /* false: no stride locals; the cursors advance by item size below */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: no src_value temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no dst_value temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* active: the alignment assertions are compiled in */
+   /* sanity check: unless N is 0, src and dst must be aligned for their item types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulonglong_to_half\n");*/
+
+    while (N--) { /* one iteration per item */
+#if 1 /* active, but the nested "#if 0" leaves this load section empty */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* inactive branch */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: read, convert and store directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* active, but empty: the cast above already wrote the result to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: step both cursors by one item size */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* "!0" is 1: compiled only if NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+/* Cast loop npy_ulonglong -> npy_half via memmove temporaries, item-size steps. NOTE(review): looks like template output (see #line) - confirm */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1 /* true: this branch is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else /* active: source item type */
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else /* active: destination item type */
+#    define _TYPE2 npy_half
+#  endif
+
+#else /* inactive branch */
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive branch: nothing below is defined, including the "#elif 1" entry */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1 /* active branch */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else /* active: the item is cast to float and passed to npy_float_to_half */
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* inactive branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] items read at args[0] and writes them at args[1]; always returns 0. context and strides are not used. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulonglong_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of items to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !1 /* false: no stride locals; the cursors advance by item size below */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: scalar temporary that receives the source bytes */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: scalar temporary that holds the converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* inactive: no alignment assertions in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulonglong_to_half\n");*/
+
+    while (N--) { /* one iteration per item */
+#if 0 /* inactive: no typed loads from src */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-copy the item into src_value (presumably because src may be unaligned - confirm) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* inactive branch */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the temporary into dst_value */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0 /* inactive: no typed stores to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-copy the converted item out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: step both cursors by one item size */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "!1" is 0, so this guard is always true */
+/* Cast loop npy_ulonglong -> npy_float, direct typed access, stride steps. NOTE(review): looks like template output (see #line) - confirm */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false: the plain typedefs in the #else branch are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 1 /* not reached: the enclosing #if is false, so _TYPE2 is not npy_uint32 */
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else /* active branch */
+
+#define _TYPE1 npy_ulonglong /* source item type */
+#define _TYPE2 npy_float /* destination item type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive branch */
+
+#  if 1 /* not reached: the enclosing #if is false */
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* inactive branch */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] items read at args[0] and writes them at args[1], stepping by strides[0]/strides[1]; always returns 0. context is not used. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulonglong_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of items to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !0 /* true: the cursors are advanced by these values at the end of each iteration */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: no src_value temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no dst_value temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* active: the alignment assertions are compiled in */
+   /* sanity check: unless N is 0, src and dst must be aligned for their item types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulonglong_to_float\n");*/
+
+    while (N--) { /* one iteration per item */
+#if 1 /* active, but the nested "#if 0" leaves this load section empty */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* inactive branch */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: read, convert and store directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* active, but empty: the cast above already wrote the result to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0 /* inactive: item-size stepping is not used here */
+        dst += sizeof(npy_float);
+        src += sizeof(npy_ulonglong);
+#else /* active: step both cursors by their strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* "!0" is 1: compiled only if NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+/* Cast loop npy_ulonglong -> npy_float via memmove temporaries, stride steps. NOTE(review): looks like template output (see #line) - confirm */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false: the plain typedefs in the #else branch are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 1 /* not reached: the enclosing #if is false, so _TYPE2 is not npy_uint32 */
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else /* active branch */
+
+#define _TYPE1 npy_ulonglong /* source item type */
+#define _TYPE2 npy_float /* destination item type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive branch */
+
+#  if 1 /* not reached: the enclosing #if is false */
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* inactive branch */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] items read at args[0] and writes them at args[1], stepping by strides[0]/strides[1]; always returns 0. context is not used. */
+static NPY_GCC_OPT_3 int
+_cast_ulonglong_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of items to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !0 /* true: the cursors are advanced by these values at the end of each iteration */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: scalar temporary that receives the source bytes */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: scalar temporary that holds the converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* inactive: no alignment assertions in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulonglong_to_float\n");*/
+
+    while (N--) { /* one iteration per item */
+#if 0 /* inactive: no typed loads from src */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-copy the item into src_value (presumably because src may be unaligned - confirm) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* inactive branch */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the temporary into dst_value */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0 /* inactive: no typed stores to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-copy the converted item out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0 /* inactive: item-size stepping is not used here */
+        dst += sizeof(npy_float);
+        src += sizeof(npy_ulonglong);
+#else /* active: step both cursors by their strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "!1" is 0, so this guard is always true */
+/* Cast loop npy_ulonglong -> npy_float, direct typed access, item-size steps. NOTE(review): looks like template output (see #line) - confirm */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false: the plain typedefs in the #else branch are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 1 /* not reached: the enclosing #if is false, so _TYPE2 is not npy_uint32 */
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else /* active branch */
+
+#define _TYPE1 npy_ulonglong /* source item type */
+#define _TYPE2 npy_float /* destination item type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive branch */
+
+#  if 1 /* not reached: the enclosing #if is false */
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* inactive branch */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] items read at args[0] and writes them at args[1]; always returns 0. context and strides are not used. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulonglong_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of items to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !1 /* false: no stride locals; the cursors advance by item size below */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: no src_value temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no dst_value temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* active: the alignment assertions are compiled in */
+   /* sanity check: unless N is 0, src and dst must be aligned for their item types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulonglong_to_float\n");*/
+
+    while (N--) { /* one iteration per item */
+#if 1 /* active, but the nested "#if 0" leaves this load section empty */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* inactive branch */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: read, convert and store directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* active, but empty: the cast above already wrote the result to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: step both cursors by one item size */
+        dst += sizeof(npy_float);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* Element types: the half-only branch (integer bit-pattern types) is off, so the #else definitions are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_float
+
+#endif
+
+/* Conversion: both npy_half branches are off, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulonglong elements at args[0] to npy_float at args[1]; both packed back-to-back, accessed via memmove; returns 0 */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulonglong_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: strides is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulonglong_to_float\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: stage the source element in a local via memmove */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement compiled below is "dst_value = _CONVERT_FN(src_value);" */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: write the converted value to dst via memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance each pointer by its own element size */
+        dst += sizeof(npy_float);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, so this variant is always compiled */
+
+/* Element types: the half-only branch (integer bit-pattern types) is off, so the #else definitions are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_double
+
+#endif
+
+/* Conversion: both npy_half branches are off, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulonglong elements at args[0] to npy_double at args[1]; byte steps strides[0]/strides[1], accessed through typed pointers; returns 0 */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulonglong_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* true: byte step per element for source and destination */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must satisfy npy_is_aligned() for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulonglong_to_double\n");*/
+
+    while (N--) {
+#if 1  /* active, but its body is empty (inner "#if 0"): src is read directly by the cast below */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement compiled below is the last one, a typed load from src stored straight into dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* active, but its body is empty (inner "#if 0"): the cast already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_ulonglong);
+#else  /* active: advance both pointers by the byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* Element types: the half-only branch (integer bit-pattern types) is off, so the #else definitions are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_double
+
+#endif
+
+/* Conversion: both npy_half branches are off, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulonglong elements at args[0] to npy_double at args[1]; byte steps strides[0]/strides[1], accessed via memmove; returns 0 */
+static NPY_GCC_OPT_3 int
+_cast_ulonglong_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* true: byte step per element for source and destination */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulonglong_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: stage the source element in a local via memmove */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement compiled below is "dst_value = _CONVERT_FN(src_value);" */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: write the converted value to dst via memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_ulonglong);
+#else  /* active: advance both pointers by the byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, so this variant is always compiled */
+
+/* Element types: the half-only branch (integer bit-pattern types) is off, so the #else definitions are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_double
+
+#endif
+
+/* Conversion: both npy_half branches are off, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulonglong elements at args[0] to npy_double at args[1]; both packed back-to-back, accessed through typed pointers; returns 0 */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulonglong_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: strides is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must satisfy npy_is_aligned() for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulonglong_to_double\n");*/
+
+    while (N--) {
+#if 1  /* active, but its body is empty (inner "#if 0"): src is read directly by the cast below */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement compiled below is the last one, a typed load from src stored straight into dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* active, but its body is empty (inner "#if 0"): the cast already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance each pointer by its own element size */
+        dst += sizeof(npy_double);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* Element types: the half-only branch (integer bit-pattern types) is off, so the #else definitions are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_double
+
+#endif
+
+/* Conversion: both npy_half branches are off, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulonglong elements at args[0] to npy_double at args[1]; both packed back-to-back, accessed via memmove; returns 0 */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulonglong_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: strides is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulonglong_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: stage the source element in a local via memmove */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement compiled below is "dst_value = _CONVERT_FN(src_value);" */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: write the converted value to dst via memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance each pointer by its own element size */
+        dst += sizeof(npy_double);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, so this variant is always compiled */
+
+/* Element types: the half-only branch (integer bit-pattern types) is off, so the #else definitions are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Conversion: both npy_half branches are off, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulonglong elements at args[0] to npy_longdouble at args[1]; byte steps strides[0]/strides[1], accessed through typed pointers; returns 0 */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulonglong_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* true: byte step per element for source and destination */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must satisfy npy_is_aligned() for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulonglong_to_longdouble\n");*/
+
+    while (N--) {
+#if 1  /* active, but its body is empty (inner "#if 0"): src is read directly by the cast below */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement compiled below is the last one, a typed load from src stored straight into dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* active, but its body is empty (inner "#if 0"): the cast already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_ulonglong);
+#else  /* active: advance both pointers by the byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* Element types: the half-only branch (integer bit-pattern types) is off, so the #else definitions are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Conversion: both npy_half branches are off, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulonglong elements at args[0] to npy_longdouble at args[1]; byte steps strides[0]/strides[1], accessed via memmove; returns 0 */
+static NPY_GCC_OPT_3 int
+_cast_ulonglong_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* true: byte step per element for source and destination */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulonglong_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: stage the source element in a local via memmove */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement compiled below is "dst_value = _CONVERT_FN(src_value);" */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: write the converted value to dst via memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_ulonglong);
+#else  /* active: advance both pointers by the byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, so this variant is always compiled */
+
+/* Element types: the half-only branch (integer bit-pattern types) is off, so the #else definitions are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Conversion: both npy_half branches are off, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulonglong elements at args[0] to npy_longdouble at args[1]; both packed back-to-back, accessed through typed pointers; returns 0 */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulonglong_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: strides is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must satisfy npy_is_aligned() for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulonglong_to_longdouble\n");*/
+
+    while (N--) {
+#if 1  /* active, but its body is empty (inner "#if 0"): src is read directly by the cast below */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement compiled below is the last one, a typed load from src stored straight into dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* active, but its body is empty (inner "#if 0"): the cast already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance each pointer by its own element size */
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* Element types: the half-only branch (integer bit-pattern types) is off, so the #else definitions are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Conversion: both npy_half branches are off, so _CONVERT_FN(x) is the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_ulonglong elements at args[0] to npy_longdouble at args[1]; both packed back-to-back, accessed via memmove; returns 0 */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulonglong_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: strides is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulonglong_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: stage the source element in a local via memmove */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement compiled below is "dst_value = _CONVERT_FN(src_value);" */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: write the converted value to dst via memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance each pointer by its own element size */
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/*
+ * Cast loop npy_ulonglong -> npy_cfloat: `_aligned`, strided variant.
+ *
+ * `!1` is 0, so the guard above is true whatever NPY_USE_UNALIGNED_ACCESS
+ * evaluates to and this variant is always compiled. Every `#if 0` /
+ * `#if 1` selector below is a literal constant, so only the arms they
+ * select are compiled; the other arms are dead text.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else arm further down supplies _TYPE1/_TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* selected: type read from the source */
+#define _TYPE2 npy_float  /* selected: type of each destination component */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/*
+ * Converts dimensions[0] npy_ulonglong values starting at args[0] into
+ * pairs of npy_float written starting at args[1].
+ *
+ * context    - not referenced by this function.
+ * args       - args[0]: source pointer; args[1]: destination pointer.
+ * dimensions - dimensions[0]: number of elements to convert.
+ * strides    - strides[0] / strides[1]: added to the source / destination
+ *              char pointers after every element (byte steps).
+ * data       - unused (wrapped in NPY_UNUSED).
+ *
+ * Per element: component 0 gets the value cast to npy_float and component
+ * 1 gets 0 -- presumably the real and imaginary parts of npy_cfloat
+ * (TODO confirm against the npy_cfloat layout). Source and destination
+ * are accessed through typed pointers; alignment is only asserted.
+ *
+ * Returns 0 unconditionally.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulonglong_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* added to the char cursors, so in bytes */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* the two components of one output element */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: asserts npy_is_aligned() for src/_TYPE1 and dst/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulonglong_to_cfloat\n");*/
+
+    while (N--) {
+#if 1  /* selected, but its only content sits under `#if 0`: nothing is compiled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* disabled: arm for a two-element src_value */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1  /* selected: two-component destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* selected: typed read straight from src */
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* selected: typed stores into dst */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;  /* selected: advance by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/*
+ * Cast loop npy_ulonglong -> npy_cfloat: strided variant without the
+ * `_aligned` prefix.
+ *
+ * `!0` is 1, so the guard above reduces to !NPY_USE_UNALIGNED_ACCESS: this
+ * variant is compiled only when that macro evaluates to 0. Every `#if 0` /
+ * `#if 1` selector below is a literal constant, so only the arms they
+ * select are compiled; the other arms are dead text.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else arm further down supplies _TYPE1/_TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* selected: type read from the source */
+#define _TYPE2 npy_float  /* selected: type of each destination component */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/*
+ * Converts dimensions[0] npy_ulonglong values starting at args[0] into
+ * pairs of npy_float written starting at args[1].
+ *
+ * context    - not referenced by this function.
+ * args       - args[0]: source pointer; args[1]: destination pointer.
+ * dimensions - dimensions[0]: number of elements to convert.
+ * strides    - strides[0] / strides[1]: added to the source / destination
+ *              char pointers after every element (byte steps).
+ * data       - unused (wrapped in NPY_UNUSED).
+ *
+ * Per element: component 0 gets the value cast to npy_float and component
+ * 1 gets 0 -- presumably the real and imaginary parts of npy_cfloat
+ * (TODO confirm against the npy_cfloat layout). Each element is moved in
+ * and out through local copies with memmove, not through typed pointer
+ * dereferences, and no alignment is asserted.
+ *
+ * Returns 0 unconditionally.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ulonglong_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* added to the char cursors, so in bytes */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* selected: local copy of one source element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* the two components of one output element */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* disabled: this variant compiles no alignment asserts */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulonglong_to_cfloat\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* selected: byte copy of one source element */
+#endif
+
+/* Do the cast */
+#if 0  /* disabled: arm for a two-element src_value */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1  /* selected: two-component destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* selected: convert the local copy */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* selected: byte copy of both components to dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;  /* selected: advance by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/*
+ * Cast loop npy_ulonglong -> npy_cfloat: `_aligned_contig` variant.
+ *
+ * `!1` is 0, so the guard above is true whatever NPY_USE_UNALIGNED_ACCESS
+ * evaluates to and this variant is always compiled. Every `#if 0` /
+ * `#if 1` selector below is a literal constant, so only the arms they
+ * select are compiled; the other arms are dead text.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else arm further down supplies _TYPE1/_TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* selected: type read from the source */
+#define _TYPE2 npy_float  /* selected: type of each destination component */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/*
+ * Converts dimensions[0] npy_ulonglong values starting at args[0] into
+ * pairs of npy_float written starting at args[1].
+ *
+ * context    - not referenced by this function.
+ * args       - args[0]: source pointer; args[1]: destination pointer.
+ * dimensions - dimensions[0]: number of elements to convert.
+ * strides    - not read by this variant; the cursors advance by
+ *              sizeof(npy_ulonglong) and sizeof(npy_cfloat) instead.
+ * data       - unused (wrapped in NPY_UNUSED).
+ *
+ * Per element: component 0 gets the value cast to npy_float and component
+ * 1 gets 0 -- presumably the real and imaginary parts of npy_cfloat
+ * (TODO confirm against the npy_cfloat layout). Source and destination
+ * are accessed through typed pointers; alignment is only asserted.
+ *
+ * Returns 0 unconditionally.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulonglong_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1  /* false: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* the two components of one output element */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: asserts npy_is_aligned() for src/_TYPE1 and dst/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulonglong_to_cfloat\n");*/
+
+    while (N--) {
+#if 1  /* selected, but its only content sits under `#if 0`: nothing is compiled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* disabled: arm for a two-element src_value */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1  /* selected: two-component destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* selected: typed read straight from src */
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* selected: typed stores into dst */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cfloat);  /* selected: fixed step of one npy_cfloat */
+        src += sizeof(npy_ulonglong);  /* fixed step of one npy_ulonglong */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/*
+ * Cast loop npy_ulonglong -> npy_cfloat: `_contig` variant without the
+ * `_aligned` prefix.
+ *
+ * `!0` is 1, so the guard above reduces to !NPY_USE_UNALIGNED_ACCESS: this
+ * variant is compiled only when that macro evaluates to 0. Every `#if 0` /
+ * `#if 1` selector below is a literal constant, so only the arms they
+ * select are compiled; the other arms are dead text.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else arm further down supplies _TYPE1/_TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* selected: type read from the source */
+#define _TYPE2 npy_float  /* selected: type of each destination component */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/*
+ * Converts dimensions[0] npy_ulonglong values starting at args[0] into
+ * pairs of npy_float written starting at args[1].
+ *
+ * context    - not referenced by this function.
+ * args       - args[0]: source pointer; args[1]: destination pointer.
+ * dimensions - dimensions[0]: number of elements to convert.
+ * strides    - not read by this variant; the cursors advance by
+ *              sizeof(npy_ulonglong) and sizeof(npy_cfloat) instead.
+ * data       - unused (wrapped in NPY_UNUSED).
+ *
+ * Per element: component 0 gets the value cast to npy_float and component
+ * 1 gets 0 -- presumably the real and imaginary parts of npy_cfloat
+ * (TODO confirm against the npy_cfloat layout). Each element is moved in
+ * and out through local copies with memmove, not through typed pointer
+ * dereferences, and no alignment is asserted.
+ *
+ * Returns 0 unconditionally.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulonglong_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1  /* false: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* selected: local copy of one source element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* the two components of one output element */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* disabled: this variant compiles no alignment asserts */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulonglong_to_cfloat\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* selected: byte copy of one source element */
+#endif
+
+/* Do the cast */
+#if 0  /* disabled: arm for a two-element src_value */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1  /* selected: two-component destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* selected: convert the local copy */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* selected: byte copy of both components to dst */
+#endif
+
+#if 1
+        dst += sizeof(npy_cfloat);  /* selected: fixed step of one npy_cfloat */
+        src += sizeof(npy_ulonglong);  /* fixed step of one npy_ulonglong */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/*
+ * Cast loop npy_ulonglong -> npy_cdouble: `_aligned`, strided variant.
+ *
+ * `!1` is 0, so the guard above is true whatever NPY_USE_UNALIGNED_ACCESS
+ * evaluates to and this variant is always compiled. Every `#if 0` /
+ * `#if 1` selector below is a literal constant, so only the arms they
+ * select are compiled; the other arms are dead text.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else arm further down supplies _TYPE1/_TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* selected: type read from the source */
+#define _TYPE2 npy_double  /* selected: type of each destination component */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/*
+ * Converts dimensions[0] npy_ulonglong values starting at args[0] into
+ * pairs of npy_double written starting at args[1].
+ *
+ * context    - not referenced by this function.
+ * args       - args[0]: source pointer; args[1]: destination pointer.
+ * dimensions - dimensions[0]: number of elements to convert.
+ * strides    - strides[0] / strides[1]: added to the source / destination
+ *              char pointers after every element (byte steps).
+ * data       - unused (wrapped in NPY_UNUSED).
+ *
+ * Per element: component 0 gets the value cast to npy_double and
+ * component 1 gets 0 -- presumably the real and imaginary parts of
+ * npy_cdouble (TODO confirm against the npy_cdouble layout). Source and
+ * destination are accessed through typed pointers; alignment is only
+ * asserted.
+ *
+ * Returns 0 unconditionally.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulonglong_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* added to the char cursors, so in bytes */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* the two components of one output element */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: asserts npy_is_aligned() for src/_TYPE1 and dst/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulonglong_to_cdouble\n");*/
+
+    while (N--) {
+#if 1  /* selected, but its only content sits under `#if 0`: nothing is compiled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* disabled: arm for a two-element src_value */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1  /* selected: two-component destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* selected: typed read straight from src */
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* selected: typed stores into dst */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;  /* selected: advance by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/*
+ * Cast loop npy_ulonglong -> npy_cdouble: strided variant without the
+ * `_aligned` prefix.
+ *
+ * `!0` is 1, so the guard above reduces to !NPY_USE_UNALIGNED_ACCESS: this
+ * variant is compiled only when that macro evaluates to 0. Every `#if 0` /
+ * `#if 1` selector below is a literal constant, so only the arms they
+ * select are compiled; the other arms are dead text.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else arm further down supplies _TYPE1/_TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* selected: type read from the source */
+#define _TYPE2 npy_double  /* selected: type of each destination component */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/*
+ * Converts dimensions[0] npy_ulonglong values starting at args[0] into
+ * pairs of npy_double written starting at args[1].
+ *
+ * context    - not referenced by this function.
+ * args       - args[0]: source pointer; args[1]: destination pointer.
+ * dimensions - dimensions[0]: number of elements to convert.
+ * strides    - strides[0] / strides[1]: added to the source / destination
+ *              char pointers after every element (byte steps).
+ * data       - unused (wrapped in NPY_UNUSED).
+ *
+ * Per element: component 0 gets the value cast to npy_double and
+ * component 1 gets 0 -- presumably the real and imaginary parts of
+ * npy_cdouble (TODO confirm against the npy_cdouble layout). Each element
+ * is moved in and out through local copies with memmove, not through
+ * typed pointer dereferences, and no alignment is asserted.
+ *
+ * Returns 0 unconditionally.
+ */
+static NPY_GCC_OPT_3 int
+_cast_ulonglong_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* added to the char cursors, so in bytes */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* selected: local copy of one source element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* the two components of one output element */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* disabled: this variant compiles no alignment asserts */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_ulonglong_to_cdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* selected: byte copy of one source element */
+#endif
+
+/* Do the cast */
+#if 0  /* disabled: arm for a two-element src_value */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1  /* selected: two-component destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* selected: convert the local copy */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* selected: byte copy of both components to dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;  /* selected: advance by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/*
+ * Cast loop npy_ulonglong -> npy_cdouble: `_aligned_contig` variant.
+ *
+ * `!1` is 0, so the guard above is true whatever NPY_USE_UNALIGNED_ACCESS
+ * evaluates to and this variant is always compiled. Every `#if 0` /
+ * `#if 1` selector below is a literal constant, so only the arms they
+ * select are compiled; the other arms are dead text.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else arm further down supplies _TYPE1/_TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* selected: type read from the source */
+#define _TYPE2 npy_double  /* selected: type of each destination component */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/*
+ * Converts dimensions[0] npy_ulonglong values starting at args[0] into
+ * pairs of npy_double written starting at args[1].
+ *
+ * context    - not referenced by this function.
+ * args       - args[0]: source pointer; args[1]: destination pointer.
+ * dimensions - dimensions[0]: number of elements to convert.
+ * strides    - not read by this variant; the cursors advance by
+ *              sizeof(npy_ulonglong) and sizeof(npy_cdouble) instead.
+ * data       - unused (wrapped in NPY_UNUSED).
+ *
+ * Per element: component 0 gets the value cast to npy_double and
+ * component 1 gets 0 -- presumably the real and imaginary parts of
+ * npy_cdouble (TODO confirm against the npy_cdouble layout). Source and
+ * destination are accessed through typed pointers; alignment is only
+ * asserted.
+ *
+ * Returns 0 unconditionally.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulonglong_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1  /* false: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* the two components of one output element */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: asserts npy_is_aligned() for src/_TYPE1 and dst/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_ulonglong_to_cdouble\n");*/
+
+    while (N--) {
+#if 1  /* selected, but its only content sits under `#if 0`: nothing is compiled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* disabled: arm for a two-element src_value */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1  /* selected: two-component destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* selected: typed read straight from src */
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* selected: typed stores into dst */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cdouble);  /* selected: fixed step of one npy_cdouble */
+        src += sizeof(npy_ulonglong);  /* fixed step of one npy_ulonglong */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/*
+ * Cast loop npy_ulonglong -> npy_cdouble: `_contig` variant without the
+ * `_aligned` prefix.
+ *
+ * `!0` is 1, so the guard above reduces to !NPY_USE_UNALIGNED_ACCESS: this
+ * variant is compiled only when that macro evaluates to 0. Every `#if 0` /
+ * `#if 1` selector below is a literal constant, so only the arms they
+ * select are compiled; the other arms are dead text.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else arm further down supplies _TYPE1/_TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* selected: type read from the source */
+#define _TYPE2 npy_double  /* selected: type of each destination component */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/*
+ * Converts dimensions[0] npy_ulonglong values starting at args[0] into
+ * pairs of npy_double written starting at args[1].
+ *
+ * context    - not referenced by this function.
+ * args       - args[0]: source pointer; args[1]: destination pointer.
+ * dimensions - dimensions[0]: number of elements to convert.
+ * strides    - not read by this variant; the cursors advance by
+ *              sizeof(npy_ulonglong) and sizeof(npy_cdouble) instead.
+ * data       - unused (wrapped in NPY_UNUSED).
+ *
+ * Per element: component 0 gets the value cast to npy_double and
+ * component 1 gets 0 -- presumably the real and imaginary parts of
+ * npy_cdouble (TODO confirm against the npy_cdouble layout). Each element
+ * is moved in and out through local copies with memmove, not through
+ * typed pointer dereferences, and no alignment is asserted.
+ *
+ * Returns 0 unconditionally.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulonglong_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1  /* false: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* selected: local copy of one source element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* the two components of one output element */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* disabled: this variant compiles no alignment asserts */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_ulonglong_to_cdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* selected: byte copy of one source element */
+#endif
+
+/* Do the cast */
+#if 0  /* disabled: arm for a two-element src_value */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1  /* selected: two-component destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* selected: convert the local copy */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* selected: byte copy of both components to dst */
+#endif
+
+#if 1
+        dst += sizeof(npy_cdouble);  /* selected: fixed step of one npy_cdouble */
+        src += sizeof(npy_ulonglong);  /* fixed step of one npy_ulonglong */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/*
+ * Cast loop npy_ulonglong -> npy_clongdouble: `_aligned`, strided variant.
+ *
+ * `!1` is 0, so the guard above is true whatever NPY_USE_UNALIGNED_ACCESS
+ * evaluates to and this variant is always compiled. Every `#if 0` /
+ * `#if 1` selector below is a literal constant, so only the arms they
+ * select are compiled; the other arms are dead text.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else arm further down supplies _TYPE1/_TYPE2 */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong  /* selected: type read from the source */
+#define _TYPE2 npy_longdouble  /* selected: type of each destination component */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/*
+ * Converts dimensions[0] npy_ulonglong values starting at args[0] into
+ * pairs of npy_longdouble written starting at args[1].
+ *
+ * context    - not referenced by this function.
+ * args       - args[0]: source pointer; args[1]: destination pointer.
+ * dimensions - dimensions[0]: number of elements to convert.
+ * strides    - strides[0] / strides[1]: added to the source / destination
+ *              char pointers after every element (byte steps).
+ * data       - unused (wrapped in NPY_UNUSED).
+ *
+ * Per element: component 0 gets the value cast to npy_longdouble and
+ * component 1 gets 0 -- presumably the real and imaginary parts of
+ * npy_clongdouble (TODO confirm against the npy_clongdouble layout).
+ * Source and destination are accessed through typed pointers; alignment
+ * is only asserted.
+ *
+ * Returns 0 unconditionally.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_ulonglong_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* added to the char cursors, so in bytes */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* the two components of one output element */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: asserts npy_is_aligned() for src/_TYPE1 and dst/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_ulonglong_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1  /* selected, but its only content sits under `#if 0`: nothing is compiled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* disabled: arm for a two-element src_value */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1  /* selected: two-component destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* selected: typed read straight from src */
+#    endif
+    dst_value[1] = 0;  /* second component is always zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* selected: typed stores into dst */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;  /* selected: advance by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: since !0 is 1, this unit is compiled only when NPY_USE_UNALIGNED_ACCESS is zero (or undefined). */
+/* Element types. The half-precision branch (raw npy_uint32/npy_uint64 instead of real float/double types) is disabled by "#if 0 || 0". */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Choose _CONVERT_FN. Both half-precision chains are dead (#if 0 and #elif 0); the final #else yields a plain C cast to _TYPE2. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop that moves every element through memmove: converts dimensions[0] npy_ulonglong values at args[0] into npy_longdouble pairs at args[1] (second member 0; presumably real/imag - confirm). Returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_ulonglong_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the "#if 0" above) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_cast_ulonglong_to_clongdouble\n"); */
+    /* Each pass: load one source element into src_value, convert, store the pair, then step both pointers. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer "#if 0" block (two-element source) is dead; the live #else fills dst_value[0] from src_value and zeroes dst_value[1]. */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store: this variant writes the pair back with memmove (the direct-assignment branch is disabled). */
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance the char pointers by the caller-supplied strides (the fixed sizeof steps are disabled). */
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard: since !1 is 0, the condition is always true and this unit is always compiled. */
+/* Element types. The half-precision branch (raw npy_uint32/npy_uint64 instead of real float/double types) is disabled by "#if 0 || 0". */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Choose _CONVERT_FN. Both half-precision chains are dead (#if 0 and #elif 0); the final #else yields a plain C cast to _TYPE2. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Fixed-step loop over aligned data: converts dimensions[0] npy_ulonglong values at args[0] into npy_longdouble pairs at args[1] (second member 0; presumably real/imag - confirm). strides is not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_ulonglong_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when N != 0, src and dst must satisfy the alignment of their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_contig_cast_ulonglong_to_clongdouble\n"); */
+    /* Each pass: convert straight from *src (no src_value temporary exists here), store the pair, then step both pointers. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer "#if 0" block (two-element source) is dead; the live #else reads *(_TYPE1 *)src directly and zeroes dst_value[1]. */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store: this variant assigns both members through a _TYPE2 pointer (the memmove branch is disabled). */
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance the char pointers by fixed element sizes (the stride-based steps are disabled). */
+#if 1
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: since !0 is 1, this unit is compiled only when NPY_USE_UNALIGNED_ACCESS is zero (or undefined). */
+/* Element types. The half-precision branch (raw npy_uint32/npy_uint64 instead of real float/double types) is disabled by "#if 0 || 0". */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_ulonglong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_ulonglong
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Choose _CONVERT_FN. Both half-precision chains are dead (#if 0 and #elif 0); the final #else yields a plain C cast to _TYPE2. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Fixed-step loop that moves every element through memmove: converts dimensions[0] npy_ulonglong values at args[0] into npy_longdouble pairs at args[1] (second member 0; presumably real/imag - confirm). strides is not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_ulonglong_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the "#if 0" above) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_contig_cast_ulonglong_to_clongdouble\n"); */
+    /* Each pass: load one source element into src_value, convert, store the pair, then step both pointers. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer "#if 0" block (two-element source) is dead; the live #else fills dst_value[0] from src_value and zeroes dst_value[1]. */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store: this variant writes the pair back with memmove (the direct-assignment branch is disabled). */
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance the char pointers by fixed element sizes (the stride-based steps are disabled). */
+#if 1
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_ulonglong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard: since !1 is 0, the condition is always true and this unit is always compiled. */
+/* Element types. The half-precision branch (raw npy_uint32/npy_uint64 instead of real float/double types) is disabled by "#if 0 || 0". */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Choose _CONVERT_FN. Both half-precision chains are dead (#if 0 and #elif 0); "#if 1 || 0" in the final #else selects the "x != 0" test cast to npy_bool. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop over aligned data: for each of dimensions[0] npy_byte values at args[0], writes the npy_bool result of "!= 0" at args[1]. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_byte_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when N != 0, src and dst must satisfy the alignment of their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_cast_byte_to_bool\n"); */
+    /* Each pass: convert directly from *src into *dst (no temporaries are declared in this variant), then step both pointers. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer "#if 0" block (two-element source) is dead; the live branch is the last #else, a direct read-convert-write through the pointers. */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store: nothing is emitted here (the inner "#if 0" is dead); the cast above already wrote *dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance the char pointers by the caller-supplied strides (the fixed sizeof steps are disabled). */
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: since !0 is 1, this unit is compiled only when NPY_USE_UNALIGNED_ACCESS is zero (or undefined). */
+/* Element types. The half-precision branch (raw npy_uint32/npy_uint64 instead of real float/double types) is disabled by "#if 0 || 0". */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Choose _CONVERT_FN. Both half-precision chains are dead (#if 0 and #elif 0); "#if 1 || 0" in the final #else selects the "x != 0" test cast to npy_bool. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop that moves every element through memmove: for each of dimensions[0] npy_byte values at args[0], writes the npy_bool result of "!= 0" at args[1]. Returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_byte_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the "#if 0" above) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_cast_byte_to_bool\n"); */
+    /* Each pass: load one source element into src_value, convert into dst_value, store it, then step both pointers. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer "#if 0" block (two-element source) is dead; the live "#elif !0" branch converts src_value into dst_value. */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store: this variant writes dst_value back with memmove (the direct-assignment branch is disabled). */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance the char pointers by the caller-supplied strides (the fixed sizeof steps are disabled). */
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard: since !1 is 0, the condition is always true and this unit is always compiled. */
+/* Element types. The half-precision branch (raw npy_uint32/npy_uint64 instead of real float/double types) is disabled by "#if 0 || 0". */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Choose _CONVERT_FN. Both half-precision chains are dead (#if 0 and #elif 0); "#if 1 || 0" in the final #else selects the "x != 0" test cast to npy_bool. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Fixed-step loop over aligned data: for each of dimensions[0] npy_byte values at args[0], writes the npy_bool result of "!= 0" at args[1]. strides is not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_byte_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when N != 0, src and dst must satisfy the alignment of their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_contig_cast_byte_to_bool\n"); */
+    /* Each pass: convert directly from *src into *dst (no temporaries are declared in this variant), then step both pointers. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer "#if 0" block (two-element source) is dead; the live branch is the last #else, a direct read-convert-write through the pointers. */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store: nothing is emitted here (the inner "#if 0" is dead); the cast above already wrote *dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance the char pointers by fixed element sizes (the stride-based steps are disabled). */
+#if 1
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: since !0 is 1, this unit is compiled only when NPY_USE_UNALIGNED_ACCESS is zero (or undefined). */
+/* Element types. The half-precision branch (raw npy_uint32/npy_uint64 instead of real float/double types) is disabled by "#if 0 || 0". */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Choose _CONVERT_FN. Both half-precision chains are dead (#if 0 and #elif 0); "#if 1 || 0" in the final #else selects the "x != 0" test cast to npy_bool. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Fixed-step loop that moves every element through memmove: for each of dimensions[0] npy_byte values at args[0], writes the npy_bool result of "!= 0" at args[1]. strides is not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_byte_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the "#if 0" above) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_contig_cast_byte_to_bool\n"); */
+    /* Each pass: load one source element into src_value, convert into dst_value, store it, then step both pointers. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer "#if 0" block (two-element source) is dead; the live "#elif !0" branch converts src_value into dst_value. */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store: this variant writes dst_value back with memmove (the direct-assignment branch is disabled). */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance the char pointers by fixed element sizes (the stride-based steps are disabled). */
+#if 1
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard: since !1 is 0, the condition is always true and this unit is always compiled. */
+/* Element types. The half-precision branch (raw npy_uint32/npy_uint64 instead of real float/double types) is disabled by "#if 0 || 0". */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Choose _CONVERT_FN. Both half-precision chains are dead (#if 0 and #elif 0); the final #else yields a plain C cast to _TYPE2. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop over aligned data: stores each of dimensions[0] npy_byte values at args[0] as an npy_ubyte at args[1] using a plain C cast. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_byte_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when N != 0, src and dst must satisfy the alignment of their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_cast_byte_to_ubyte\n"); */
+    /* Each pass: convert directly from *src into *dst (no temporaries are declared in this variant), then step both pointers. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer "#if 0" block (two-element source) is dead; the live branch is the last #else, a direct read-convert-write through the pointers. */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store: nothing is emitted here (the inner "#if 0" is dead); the cast above already wrote *dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance the char pointers by the caller-supplied strides (the fixed sizeof steps are disabled). */
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: since !0 is 1, this unit is compiled only when NPY_USE_UNALIGNED_ACCESS is zero (or undefined). */
+/* Element types. The half-precision branch (raw npy_uint32/npy_uint64 instead of real float/double types) is disabled by "#if 0 || 0". */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Choose _CONVERT_FN. Both half-precision chains are dead (#if 0 and #elif 0); the final #else yields a plain C cast to _TYPE2. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop that moves every element through memmove: stores each of dimensions[0] npy_byte values at args[0] as an npy_ubyte at args[1] using a plain C cast. Returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_byte_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the "#if 0" above) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_cast_byte_to_ubyte\n"); */
+    /* Each pass: load one source element into src_value, convert into dst_value, store it, then step both pointers. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer "#if 0" block (two-element source) is dead; the live "#elif !0" branch converts src_value into dst_value. */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store: this variant writes dst_value back with memmove (the direct-assignment branch is disabled). */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance the char pointers by the caller-supplied strides (the fixed sizeof steps are disabled). */
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* `!1` is 0, so this aligned variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation; the #else branch below is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else /* active: source and destination element types are used directly */
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: neither of the two preceding groups applies */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, contiguous cast loop: converts dimensions[0] npy_byte elements at args[0] into npy_ubyte at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_byte_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* off: strides[] is not read by this contiguous variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* off: no temporary, the source is read through a typed pointer */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* off: no temporary, the destination is written through a typed pointer */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when there is work to do, both pointers must satisfy their element type's alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_byte_to_ubyte\n");*/
+
+    while (N--) {
+#if 1 /* aligned path: the nested block is off, so there is no separate load step */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active outer branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: convert and store in one step through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned path: the store already happened above; the nested block is off */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: contiguous data, advance each pointer by one element size */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation; the #else branch below is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else /* active: source and destination element types are used directly */
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: neither of the two preceding groups applies */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Unaligned, contiguous cast loop: converts dimensions[0] npy_byte elements at args[0] into npy_ubyte at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_byte_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* off: strides[] is not read by this contiguous variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: scalar temporary that receives each source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: scalar temporary that holds each converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* off: this variant does not assert pointer alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_byte_to_ubyte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: copy the element into the temporary with memmove */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active outer branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: copy the converted value out with memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: contiguous data, advance each pointer by one element size */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* `!1` is 0, so this aligned variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation; the #else branch below is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else /* active: source and destination element types are used directly */
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: neither of the two preceding groups applies */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, strided cast loop: converts dimensions[0] npy_byte elements at args[0] into npy_ushort at args[1], stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_byte_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* active: strided variant, per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* off: no temporary, the source is read through a typed pointer */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* off: no temporary, the destination is written through a typed pointer */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when there is work to do, both pointers must satisfy their element type's alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_byte_to_ushort\n");*/
+
+    while (N--) {
+#if 1 /* aligned path: the nested block is off, so there is no separate load step */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active outer branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: convert and store in one step through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned path: the store already happened above; the nested block is off */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_byte);
+#else /* active: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation; the #else branch below is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else /* active: source and destination element types are used directly */
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: neither of the two preceding groups applies */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Unaligned, strided cast loop: converts dimensions[0] npy_byte elements at args[0] into npy_ushort at args[1], stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_byte_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* active: strided variant, per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: scalar temporary that receives each source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: scalar temporary that holds each converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* off: this variant does not assert pointer alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_byte_to_ushort\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: copy the element into the temporary with memmove */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active outer branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: copy the converted value out with memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_byte);
+#else /* active: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* `!1` is 0, so this aligned variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation; the #else branch below is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else /* active: source and destination element types are used directly */
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: neither of the two preceding groups applies */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, contiguous cast loop: converts dimensions[0] npy_byte elements at args[0] into npy_ushort at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_byte_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* off: strides[] is not read by this contiguous variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* off: no temporary, the source is read through a typed pointer */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* off: no temporary, the destination is written through a typed pointer */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when there is work to do, both pointers must satisfy their element type's alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_byte_to_ushort\n");*/
+
+    while (N--) {
+#if 1 /* aligned path: the nested block is off, so there is no separate load step */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active outer branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: convert and store in one step through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned path: the store already happened above; the nested block is off */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: contiguous data, advance each pointer by one element size */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation; the #else branch below is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else /* active: source and destination element types are used directly */
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: neither of the two preceding groups applies */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Unaligned, contiguous cast loop: converts dimensions[0] npy_byte elements at args[0] into npy_ushort at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_byte_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* off: strides[] is not read by this contiguous variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: scalar temporary that receives each source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: scalar temporary that holds each converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* off: this variant does not assert pointer alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_byte_to_ushort\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: copy the element into the temporary with memmove */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active outer branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: copy the converted value out with memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: contiguous data, advance each pointer by one element size */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* `!1` is 0, so this aligned variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation; the #else branch below is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* active: source and destination element types are used directly */
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: neither of the two preceding groups applies */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, strided cast loop: converts dimensions[0] npy_byte elements at args[0] into npy_uint at args[1], stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_byte_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* active: strided variant, per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* off: no temporary, the source is read through a typed pointer */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* off: no temporary, the destination is written through a typed pointer */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when there is work to do, both pointers must satisfy their element type's alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_byte_to_uint\n");*/
+
+    while (N--) {
+#if 1 /* aligned path: the nested block is off, so there is no separate load step */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active outer branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: convert and store in one step through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned path: the store already happened above; the nested block is off */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_byte);
+#else /* active: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation; the #else branch below is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* active: source and destination element types are used directly */
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: neither of the two preceding groups applies */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Unaligned, strided cast loop: converts dimensions[0] npy_byte elements at args[0] into npy_uint at args[1], stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_byte_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* active: strided variant, per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: scalar temporary that receives each source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: scalar temporary that holds each converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* off: this variant does not assert pointer alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_byte_to_uint\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: copy the element into the temporary with memmove */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active outer branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: copy the converted value out with memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_byte);
+#else /* active: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* `!1` is 0, so this aligned variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation; the #else branch below is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* active: source and destination element types are used directly */
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: neither of the two preceding groups applies */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, contiguous cast loop: converts dimensions[0] npy_byte elements at args[0] into npy_uint at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_byte_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* off: strides[] is not read by this contiguous variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* off: no temporary, the source is read through a typed pointer */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* off: no temporary, the destination is written through a typed pointer */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when there is work to do, both pointers must satisfy their element type's alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_byte_to_uint\n");*/
+
+    while (N--) {
+#if 1 /* aligned path: the nested block is off, so there is no separate load step */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active outer branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: convert and store in one step through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned path: the store already happened above; the nested block is off */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: contiguous data, advance each pointer by one element size */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation; the #else branch below is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* active: source and destination element types are used directly */
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: neither of the two preceding groups applies */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Unaligned, contiguous cast loop: converts dimensions[0] npy_byte elements at args[0] into npy_uint at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_byte_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* off: strides[] is not read by this contiguous variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: scalar temporary that receives each source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: scalar temporary that holds each converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* off: this variant does not assert pointer alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_byte_to_uint\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: copy the element into the temporary with memmove */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active outer branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: copy the converted value out with memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: contiguous data, advance each pointer by one element size */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* _aligned_cast_byte_to_ulong: npy_byte -> npy_ulong cast loop. The guard above is always true ("!1" is 0). */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+/* Active branch (the test above is constant 0): element types of the source and destination. */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: both groups above are compiled out, leaving the plain C cast below. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] and writes them to args[1]; always returns 0. context is not used. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_byte_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Byte strides for the source (strides[0]) and destination (strides[1]). */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No src_value/dst_value locals are declared here: both "#elif !1" tests are false. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_byte_to_ulong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing is compiled above: the "#if 1" arm holds only a disabled "#if 0" block. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Of the branches above, only "*(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);" is compiled. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* No separate store is compiled above; both pointers now advance by their strides. */
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the per-function macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* _cast_byte_to_ulong: npy_byte -> npy_ulong cast loop. The guard above reduces to !NPY_USE_UNALIGNED_ACCESS. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+/* Active branch (the test above is constant 0): element types of the source and destination. */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: both groups above are compiled out, leaving the plain C cast below. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] and writes them to args[1]; always returns 0. context is not used. */
+static NPY_GCC_OPT_3 int
+_cast_byte_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Byte strides for the source (strides[0]) and destination (strides[1]). */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* src_value and dst_value are local copies accessed through memmove in the loop below. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_byte_to_ulong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* The element was copied into src_value byte-wise, without a typed read of src. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Of the branches above, only "dst_value = _CONVERT_FN(src_value);" is compiled. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* dst_value was copied out above; both pointers now advance by their strides. */
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the per-function macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* _aligned_contig_cast_byte_to_ulong: npy_byte -> npy_ulong cast loop. The guard above is always true ("!1" is 0). */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+/* Active branch (the test above is constant 0): element types of the source and destination. */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: both groups above are compiled out, leaving the plain C cast below. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] and writes them to args[1]; always returns 0. context is not used. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_byte_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* The stride locals above are compiled out: this variant steps by element size. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No src_value/dst_value locals are declared here: both "#elif !1" tests are false. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_byte_to_ulong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing is compiled above: the "#if 1" arm holds only a disabled "#if 0" block. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Of the branches above, only "*(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);" is compiled. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* No separate store is compiled above; both pointers now advance by one element size. */
+#if 1
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the per-function macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* _contig_cast_byte_to_ulong: npy_byte -> npy_ulong cast loop. The guard above reduces to !NPY_USE_UNALIGNED_ACCESS. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+/* Active branch (the test above is constant 0): element types of the source and destination. */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: both groups above are compiled out, leaving the plain C cast below. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] and writes them to args[1]; always returns 0. context is not used. */
+static NPY_GCC_OPT_3 int
+_contig_cast_byte_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* The stride locals above are compiled out: this variant steps by element size. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* src_value and dst_value are local copies accessed through memmove in the loop below. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_byte_to_ulong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* The element was copied into src_value byte-wise, without a typed read of src. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Of the branches above, only "dst_value = _CONVERT_FN(src_value);" is compiled. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* dst_value was copied out above; both pointers now advance by one element size. */
+#if 1
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the per-function macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* _aligned_cast_byte_to_ulonglong: npy_byte -> npy_ulonglong cast loop. The guard above is always true ("!1" is 0). */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+/* Active branch (the test above is constant 0): element types of the source and destination. */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: both groups above are compiled out, leaving the plain C cast below. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] and writes them to args[1]; always returns 0. context is not used. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_byte_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Byte strides for the source (strides[0]) and destination (strides[1]). */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No src_value/dst_value locals are declared here: both "#elif !1" tests are false. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_byte_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing is compiled above: the "#if 1" arm holds only a disabled "#if 0" block. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Of the branches above, only "*(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);" is compiled. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* No separate store is compiled above; both pointers now advance by their strides. */
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the per-function macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* _cast_byte_to_ulonglong: npy_byte -> npy_ulonglong cast loop. The guard above reduces to !NPY_USE_UNALIGNED_ACCESS. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+/* Active branch (the test above is constant 0): element types of the source and destination. */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: both groups above are compiled out, leaving the plain C cast below. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] and writes them to args[1]; always returns 0. context is not used. */
+static NPY_GCC_OPT_3 int
+_cast_byte_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Byte strides for the source (strides[0]) and destination (strides[1]). */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* src_value and dst_value are local copies accessed through memmove in the loop below. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_byte_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* The element was copied into src_value byte-wise, without a typed read of src. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Of the branches above, only "dst_value = _CONVERT_FN(src_value);" is compiled. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* dst_value was copied out above; both pointers now advance by their strides. */
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the per-function macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* _aligned_contig_cast_byte_to_ulonglong: npy_byte -> npy_ulonglong cast loop. The guard above is always true ("!1" is 0). */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+/* Active branch (the test above is constant 0): element types of the source and destination. */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: both groups above are compiled out, leaving the plain C cast below. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] and writes them to args[1]; always returns 0. context is not used. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_byte_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* The stride locals above are compiled out: this variant steps by element size. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No src_value/dst_value locals are declared here: both "#elif !1" tests are false. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_byte_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing is compiled above: the "#if 1" arm holds only a disabled "#if 0" block. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Of the branches above, only "*(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);" is compiled. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* No separate store is compiled above; both pointers now advance by one element size. */
+#if 1
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the per-function macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* _contig_cast_byte_to_ulonglong: npy_byte -> npy_ulonglong cast loop. The guard above reduces to !NPY_USE_UNALIGNED_ACCESS. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+/* Active branch (the test above is constant 0): element types of the source and destination. */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: both groups above are compiled out, leaving the plain C cast below. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] and writes them to args[1]; always returns 0. context is not used. */
+static NPY_GCC_OPT_3 int
+_contig_cast_byte_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* The stride locals above are compiled out: this variant steps by element size. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* src_value and dst_value are local copies accessed through memmove in the loop below. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_byte_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* The element was copied into src_value byte-wise, without a typed read of src. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Of the branches above, only "dst_value = _CONVERT_FN(src_value);" is compiled. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* dst_value was copied out above; both pointers now advance by one element size. */
+#if 1
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the per-function macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* _aligned_cast_byte_to_byte: npy_byte -> npy_byte cast loop. The guard above is always true ("!1" is 0). */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Active branch (the test above is constant 0): element types of the source and destination. */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: both groups above are compiled out, leaving the plain C cast below. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] and writes them to args[1]; always returns 0. context is not used. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_byte_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Byte strides for the source (strides[0]) and destination (strides[1]). */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No src_value/dst_value locals are declared here: both "#elif !1" tests are false. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_byte_to_byte\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing is compiled above: the "#if 1" arm holds only a disabled "#if 0" block. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Of the branches above, only "*(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);" is compiled. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* No separate store is compiled above; both pointers now advance by their strides. */
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove the per-function macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS: skipped when that macro is true */
+
+/* Element types for _cast_byte_to_byte (npy_byte -> npy_byte). For half types, don't use actual double/float types in conversion; that branch is disabled here, so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function: every npy_half and npy_bool branch is disabled here, leaving the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_byte_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps applied to src and dst */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* converted value, copied out after the cast */
+#endif
+
+#if 0
+   /* sanity check: compiled out in this variant, so no alignment is asserted */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* debug trace, left disabled: printf("_cast_byte_to_byte\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy in place of a typed load */
+#endif
+
+/* Do the cast: the only active statement is "dst_value = _CONVERT_FN(src_value);", operating on the local copies */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy in place of a typed store */
+#endif
+
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "&& !1" is always false, so this aligned loop is always compiled */
+
+/* Element types for _aligned_contig_cast_byte_to_byte (npy_byte -> npy_byte). For half types, don't use actual double/float types in conversion; that branch is disabled here, so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function: every npy_half and npy_bool branch is disabled here, leaving the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_byte_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors */
+#if !1  /* disabled: this variant steps by element size instead of strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 respectively */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* debug trace, left disabled: printf("_aligned_contig_cast_byte_to_byte\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only active statement is under the final "#  else", a typed load from src converted and stored straight to dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_byte);  /* fixed step: one output element */
+        src += sizeof(npy_byte);  /* fixed step: one input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS: skipped when that macro is true */
+
+/* Element types for _contig_cast_byte_to_byte (npy_byte -> npy_byte). For half types, don't use actual double/float types in conversion; that branch is disabled here, so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function: every npy_half and npy_bool branch is disabled here, leaving the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_byte_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors */
+#if !1  /* disabled: this variant steps by element size instead of strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* converted value, copied out after the cast */
+#endif
+
+#if 0
+   /* sanity check: compiled out in this variant, so no alignment is asserted */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* debug trace, left disabled: printf("_contig_cast_byte_to_byte\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy in place of a typed load */
+#endif
+
+/* Do the cast: the only active statement is "dst_value = _CONVERT_FN(src_value);", operating on the local copies */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy in place of a typed store */
+#endif
+
+#if 1
+        dst += sizeof(npy_byte);  /* fixed step: one output element */
+        src += sizeof(npy_byte);  /* fixed step: one input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "&& !1" is always false, so this aligned loop is always compiled */
+
+/* Element types for _aligned_cast_byte_to_short (npy_byte -> npy_short). For half types, don't use actual double/float types in conversion; that branch is disabled here, so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function: every npy_half and npy_bool branch is disabled here, leaving the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_byte_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps applied to src and dst */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 respectively */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* debug trace, left disabled: printf("_aligned_cast_byte_to_short\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only active statement is under the final "#  else", a typed load from src converted and stored straight to dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS: skipped when that macro is true */
+
+/* Element types for _cast_byte_to_short (npy_byte -> npy_short). For half types, don't use actual double/float types in conversion; that branch is disabled here, so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function: every npy_half and npy_bool branch is disabled here, leaving the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_byte_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps applied to src and dst */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* converted value, copied out after the cast */
+#endif
+
+#if 0
+   /* sanity check: compiled out in this variant, so no alignment is asserted */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* debug trace, left disabled: printf("_cast_byte_to_short\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy in place of a typed load */
+#endif
+
+/* Do the cast: the only active statement is "dst_value = _CONVERT_FN(src_value);", operating on the local copies */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy in place of a typed store */
+#endif
+
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "&& !1" is always false, so this aligned loop is always compiled */
+
+/* Element types for _aligned_contig_cast_byte_to_short (npy_byte -> npy_short). For half types, don't use actual double/float types in conversion; that branch is disabled here, so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function: every npy_half and npy_bool branch is disabled here, leaving the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_byte_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors */
+#if !1  /* disabled: this variant steps by element size instead of strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 respectively */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* debug trace, left disabled: printf("_aligned_contig_cast_byte_to_short\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only active statement is under the final "#  else", a typed load from src converted and stored straight to dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_short);  /* fixed step: one output element */
+        src += sizeof(npy_byte);  /* fixed step: one input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS: skipped when that macro is true */
+
+/* Element types for _contig_cast_byte_to_short (npy_byte -> npy_short). For half types, don't use actual double/float types in conversion; that branch is disabled here, so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function: every npy_half and npy_bool branch is disabled here, leaving the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_byte_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors */
+#if !1  /* disabled: this variant steps by element size instead of strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* converted value, copied out after the cast */
+#endif
+
+#if 0
+   /* sanity check: compiled out in this variant, so no alignment is asserted */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* debug trace, left disabled: printf("_contig_cast_byte_to_short\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy in place of a typed load */
+#endif
+
+/* Do the cast: the only active statement is "dst_value = _CONVERT_FN(src_value);", operating on the local copies */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy in place of a typed store */
+#endif
+
+#if 1
+        dst += sizeof(npy_short);  /* fixed step: one output element */
+        src += sizeof(npy_byte);  /* fixed step: one input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "&& !1" is always false, so this aligned loop is always compiled */
+
+/* Element types for _aligned_cast_byte_to_int (npy_byte -> npy_int). For half types, don't use actual double/float types in conversion; that branch is disabled here, so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function: every npy_half and npy_bool branch is disabled here, leaving the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_byte_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps applied to src and dst */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 respectively */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* debug trace, left disabled: printf("_aligned_cast_byte_to_int\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only active statement is under the final "#  else", a typed load from src converted and stored straight to dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS: skipped when that macro is true */
+
+/* Element types for _cast_byte_to_int (npy_byte -> npy_int). For half types, don't use actual double/float types in conversion; that branch is disabled here, so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function: every npy_half and npy_bool branch is disabled here, leaving the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_byte_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps applied to src and dst */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* converted value, copied out after the cast */
+#endif
+
+#if 0
+   /* sanity check: compiled out in this variant, so no alignment is asserted */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* debug trace, left disabled: printf("_cast_byte_to_int\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy in place of a typed load */
+#endif
+
+/* Do the cast: the only active statement is "dst_value = _CONVERT_FN(src_value);", operating on the local copies */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy in place of a typed store */
+#endif
+
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so the guard is always true: this variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* always false here: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte  /* source element type */
+#define _TYPE2 npy_int  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch in effect */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* definition in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast loop npy_byte -> npy_int: asserts alignment, accesses the buffers directly, steps by element size. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_byte_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors, advanced in bytes */
+#if !1  /* always false here: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* always false here: no source temporary is declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* always false here: no destination temporary is declared */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* always true here: alignment of both buffers is asserted */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_byte_to_int\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* always true here and the nested #if 0 emits nothing: no separate load step */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* branch in effect */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* branch in effect: typed load from src, convert, typed store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* always true here and the nested #if 0 emits nothing: no separate store step */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* always true here: step each cursor by one element of its type */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return statement; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* !0 is 1, so this variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* always false here: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte  /* source element type */
+#define _TYPE2 npy_int  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch in effect */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* definition in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast loop npy_byte -> npy_int: moves values through temporaries with memmove, steps by element size. */
+static NPY_GCC_OPT_3 int
+_contig_cast_byte_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors, advanced in bytes */
+#if !1  /* always false here: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* always true here: scalar source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* always true here: scalar destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* always false here: no alignment is asserted in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_byte_to_int\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* branch in effect: byte-wise load of one source element into the temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* branch in effect */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* branch in effect: convert the temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* branch in effect: byte-wise store of the converted temporary */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* always true here: step each cursor by one element of its type */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return statement; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so the guard is always true: this variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* always false here: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte  /* source element type */
+#define _TYPE2 npy_long  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch in effect */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* definition in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast loop npy_byte -> npy_long: asserts alignment, accesses the buffers directly, steps by strides[]. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_byte_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors, advanced in bytes */
+#if !0  /* always true here: this variant steps by the strides passed in */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* always false here: no source temporary is declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* always false here: no destination temporary is declared */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* always true here: alignment of both buffers is asserted */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_byte_to_long\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* always true here and the nested #if 0 emits nothing: no separate load step */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* branch in effect */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* branch in effect: typed load from src, convert, typed store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* always true here and the nested #if 0 emits nothing: no separate store step */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_byte);
+#else  /* branch in effect: step each cursor by its stride (a byte count) */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return statement; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* !0 is 1, so this variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* always false here: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte  /* source element type */
+#define _TYPE2 npy_long  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch in effect */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* definition in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast loop npy_byte -> npy_long: moves values through temporaries with memmove, steps by strides[]. */
+static NPY_GCC_OPT_3 int
+_cast_byte_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors, advanced in bytes */
+#if !0  /* always true here: this variant steps by the strides passed in */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* always true here: scalar source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* always true here: scalar destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* always false here: no alignment is asserted in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_byte_to_long\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* branch in effect: byte-wise load of one source element into the temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* branch in effect */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* branch in effect: convert the temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* branch in effect: byte-wise store of the converted temporary */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_byte);
+#else  /* branch in effect: step each cursor by its stride (a byte count) */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return statement; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so the guard is always true: this variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* always false here: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte  /* source element type */
+#define _TYPE2 npy_long  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch in effect */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* definition in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast loop npy_byte -> npy_long: asserts alignment, accesses the buffers directly, steps by element size. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_byte_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors, advanced in bytes */
+#if !1  /* always false here: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* always false here: no source temporary is declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* always false here: no destination temporary is declared */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* always true here: alignment of both buffers is asserted */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_byte_to_long\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* always true here and the nested #if 0 emits nothing: no separate load step */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* branch in effect */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* branch in effect: typed load from src, convert, typed store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* always true here and the nested #if 0 emits nothing: no separate store step */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* always true here: step each cursor by one element of its type */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return statement; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* !0 is 1, so this variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* always false here: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte  /* source element type */
+#define _TYPE2 npy_long  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch in effect */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* definition in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast loop npy_byte -> npy_long: moves values through temporaries with memmove, steps by element size. */
+static NPY_GCC_OPT_3 int
+_contig_cast_byte_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors, advanced in bytes */
+#if !1  /* always false here: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* always true here: scalar source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* always true here: scalar destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* always false here: no alignment is asserted in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_byte_to_long\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* branch in effect: byte-wise load of one source element into the temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* branch in effect */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* branch in effect: convert the temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* branch in effect: byte-wise store of the converted temporary */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* always true here: step each cursor by one element of its type */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return statement; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so the guard is always true: this variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* always false here: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte  /* source element type */
+#define _TYPE2 npy_longlong  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch in effect */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* definition in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast loop npy_byte -> npy_longlong: asserts alignment, accesses the buffers directly, steps by strides[]. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_byte_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors, advanced in bytes */
+#if !0  /* always true here: this variant steps by the strides passed in */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* always false here: no source temporary is declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* always false here: no destination temporary is declared */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* always true here: alignment of both buffers is asserted */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_byte_to_longlong\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* always true here and the nested #if 0 emits nothing: no separate load step */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* branch in effect */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* branch in effect: typed load from src, convert, typed store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* always true here and the nested #if 0 emits nothing: no separate store step */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_byte);
+#else  /* branch in effect: step each cursor by its stride (a byte count) */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return statement; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* !0 is 1, so this variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* always false here: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte  /* source element type */
+#define _TYPE2 npy_longlong  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch in effect */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* definition in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast loop npy_byte -> npy_longlong: moves values through temporaries with memmove, steps by strides[]. */
+static NPY_GCC_OPT_3 int
+_cast_byte_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors, advanced in bytes */
+#if !0  /* always true here: this variant steps by the strides passed in */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* always true here: scalar source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* always true here: scalar destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* always false here: no alignment is asserted in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_byte_to_longlong\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* branch in effect: byte-wise load of one source element into the temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* branch in effect */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* branch in effect: convert the temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* branch in effect: byte-wise store of the converted temporary */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_byte);
+#else  /* branch in effect: step each cursor by its stride (a byte count) */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return statement; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so the guard is always true: this variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* always false here: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte  /* source element type */
+#define _TYPE2 npy_longlong  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* always false here */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch in effect */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* definition in effect: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast loop npy_byte -> npy_longlong: asserts alignment, accesses the buffers directly, steps by element size. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_byte_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output cursors, advanced in bytes */
+#if !1  /* always false here: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* always false here: no source temporary is declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* always false here: no destination temporary is declared */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* always true here: alignment of both buffers is asserted */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_byte_to_longlong\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* always true here and the nested #if 0 emits nothing: no separate load step */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* branch in effect */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* branch in effect: typed load from src, convert, typed store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* always true here and the nested #if 0 emits nothing: no separate store step */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* always true here: step each cursor by one element of its type */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only return statement; this loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous byte -> longlong cast with no alignment guarantee.
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant
+ * is only built when that macro evaluates to 0.
+ */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_longlong
+
+/* The conversion is an ordinary C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+static NPY_GCC_OPT_3 int
+_contig_cast_byte_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+    _TYPE1 elem_in;
+    _TYPE2 elem_out;
+
+    for (; N != 0; N--) {
+        /* Work on local copies so src/dst are only accessed as bytes. */
+        memmove(&elem_in, src, sizeof(elem_in));
+        elem_out = _CONVERT_FN(elem_in);
+        memmove(dst, &elem_out, sizeof(elem_out));
+
+        /* Contiguous layout: step by the item sizes; strides is not read. */
+        src += sizeof(npy_byte);
+        dst += sizeof(npy_longlong);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Strided byte -> half cast for aligned operands.
+ * `!1` is 0, so the guard above holds for any value of
+ * NPY_USE_UNALIGNED_ACCESS.
+ */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_half
+
+/* The source value is cast to float and handed to npy_float_to_half. */
+#define _CONVERT_FN(x) npy_float_to_half((float)x)
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_byte_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_intp in_step = strides[0];
+    npy_intp out_step = strides[1];
+
+    /* sanity check: alignment only matters when there is data to touch */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        /* Typed load, convert, typed store. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        /* Advance both cursors by the caller-supplied byte strides. */
+        src += in_step;
+        dst += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided byte -> half cast with no alignment guarantee.
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant
+ * is only built when that macro evaluates to 0.
+ */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_half
+
+/* The source value is cast to float and handed to npy_float_to_half. */
+#define _CONVERT_FN(x) npy_float_to_half((float)x)
+
+static NPY_GCC_OPT_3 int
+_cast_byte_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_intp in_step = strides[0];
+    npy_intp out_step = strides[1];
+    _TYPE1 elem_in;
+    _TYPE2 elem_out;
+
+    for (; N != 0; N--) {
+        /* Work on local copies so src/dst are only accessed as bytes. */
+        memmove(&elem_in, src, sizeof(elem_in));
+        elem_out = _CONVERT_FN(elem_in);
+        memmove(dst, &elem_out, sizeof(elem_out));
+
+        /* Advance both cursors by the caller-supplied byte strides. */
+        src += in_step;
+        dst += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Contiguous byte -> half cast for aligned operands.
+ * `!1` is 0, so the guard above holds for any value of
+ * NPY_USE_UNALIGNED_ACCESS.
+ */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_half
+
+/* The source value is cast to float and handed to npy_float_to_half. */
+#define _CONVERT_FN(x) npy_float_to_half((float)x)
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_byte_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check: alignment only matters when there is data to touch */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        /* Typed load, convert, typed store. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        /* Contiguous layout: step by the item sizes; strides is not read. */
+        src += sizeof(npy_byte);
+        dst += sizeof(npy_half);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous byte -> half cast with no alignment guarantee.
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant
+ * is only built when that macro evaluates to 0.
+ */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_half
+
+/* The source value is cast to float and handed to npy_float_to_half. */
+#define _CONVERT_FN(x) npy_float_to_half((float)x)
+
+static NPY_GCC_OPT_3 int
+_contig_cast_byte_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+    _TYPE1 elem_in;
+    _TYPE2 elem_out;
+
+    for (; N != 0; N--) {
+        /* Work on local copies so src/dst are only accessed as bytes. */
+        memmove(&elem_in, src, sizeof(elem_in));
+        elem_out = _CONVERT_FN(elem_in);
+        memmove(dst, &elem_out, sizeof(elem_out));
+
+        /* Contiguous layout: step by the item sizes; strides is not read. */
+        src += sizeof(npy_byte);
+        dst += sizeof(npy_half);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Strided byte -> float cast for aligned operands.
+ * `!1` is 0, so the guard above holds for any value of
+ * NPY_USE_UNALIGNED_ACCESS.
+ */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_float
+
+/* The conversion is an ordinary C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_byte_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_intp in_step = strides[0];
+    npy_intp out_step = strides[1];
+
+    /* sanity check: alignment only matters when there is data to touch */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        /* Typed load, convert, typed store. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        /* Advance both cursors by the caller-supplied byte strides. */
+        src += in_step;
+        dst += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided byte -> float cast with no alignment guarantee.
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant
+ * is only built when that macro evaluates to 0.
+ */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_float
+
+/* The conversion is an ordinary C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+static NPY_GCC_OPT_3 int
+_cast_byte_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+    npy_intp in_step = strides[0];
+    npy_intp out_step = strides[1];
+    _TYPE1 elem_in;
+    _TYPE2 elem_out;
+
+    for (; N != 0; N--) {
+        /* Work on local copies so src/dst are only accessed as bytes. */
+        memmove(&elem_in, src, sizeof(elem_in));
+        elem_out = _CONVERT_FN(elem_in);
+        memmove(dst, &elem_out, sizeof(elem_out));
+
+        /* Advance both cursors by the caller-supplied byte strides. */
+        src += in_step;
+        dst += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Contiguous byte -> float cast for aligned operands.
+ * `!1` is 0, so the guard above holds for any value of
+ * NPY_USE_UNALIGNED_ACCESS.
+ */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_float
+
+/* The conversion is an ordinary C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_byte_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check: alignment only matters when there is data to touch */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        /* Typed load, convert, typed store. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        /* Contiguous layout: step by the item sizes; strides is not read. */
+        src += sizeof(npy_byte);
+        dst += sizeof(npy_float);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous byte -> float cast with no alignment guarantee.
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant
+ * is only built when that macro evaluates to 0.
+ */
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_float
+
+/* The conversion is an ordinary C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+static NPY_GCC_OPT_3 int
+_contig_cast_byte_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0], *dst = args[1];
+    npy_intp N = dimensions[0];
+    _TYPE1 elem_in;
+    _TYPE2 elem_out;
+
+    for (; N != 0; N--) {
+        /* Work on local copies so src/dst are only accessed as bytes. */
+        memmove(&elem_in, src, sizeof(elem_in));
+        elem_out = _CONVERT_FN(elem_in);
+        memmove(dst, &elem_out, sizeof(elem_out));
+
+        /* Contiguous layout: step by the item sizes; strides is not read. */
+        src += sizeof(npy_byte);
+        dst += sizeof(npy_float);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cast loop npy_byte -> npy_double over dimensions[0] elements (args[0] -> args[1]); asserts pointer alignment; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_byte_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* on: per-element byte steps come from strides[0]/strides[1] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* off: no staging copy; src is read in place */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* off: no staging copy; dst is written in place */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* on: assert both pointers are aligned unless there is nothing to do */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_byte_to_double\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* on, but its only content (the two-component load) is compiled out */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only one assignment below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* on: single-component source */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* on: convert straight from *src into *dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* on, but nothing is emitted: the cast above already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_byte);
+#else  /* on: advance both pointers by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cast loop npy_byte -> npy_double over dimensions[0] elements (args[0] -> args[1]) using bytewise copies; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_byte_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* on: per-element byte steps come from strides[0]/strides[1] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* on: scalar staging copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* on: scalar staging copy of one result element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* off: no alignment assertions in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_byte_to_double\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* on: copy the source element bytewise into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only one assignment below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* on: single-component source */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* on: convert through the staging variables */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* on: copy the converted value bytewise to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_byte);
+#else  /* on: advance both pointers by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast loop npy_byte -> npy_double over dimensions[0] elements (args[0] -> args[1]), stepping by element size; asserts alignment; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_byte_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* off: strides are never read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* off: no staging copy; src is read in place */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* off: no staging copy; dst is written in place */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* on: assert both pointers are aligned unless there is nothing to do */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_byte_to_double\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* on, but its only content (the two-component load) is compiled out */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only one assignment below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* on: single-component source */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* on: convert straight from *src into *dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* on, but nothing is emitted: the cast above already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* on: advance both pointers by their element size */
+        dst += sizeof(npy_double);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast loop npy_byte -> npy_double over dimensions[0] elements (args[0] -> args[1]), stepping by element size with bytewise copies; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_byte_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* off: strides are never read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* on: scalar staging copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* on: scalar staging copy of one result element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* off: no alignment assertions in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_byte_to_double\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* on: copy the source element bytewise into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only one assignment below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* on: single-component source */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* on: convert through the staging variables */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* on: copy the converted value bytewise to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* on: advance both pointers by their element size */
+        dst += sizeof(npy_double);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cast loop npy_byte -> npy_longdouble over dimensions[0] elements (args[0] -> args[1]); asserts pointer alignment; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_byte_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* on: per-element byte steps come from strides[0]/strides[1] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* off: no staging copy; src is read in place */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* off: no staging copy; dst is written in place */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* on: assert both pointers are aligned unless there is nothing to do */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_byte_to_longdouble\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* on, but its only content (the two-component load) is compiled out */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only one assignment below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* on: single-component source */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* on: convert straight from *src into *dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* on, but nothing is emitted: the cast above already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_byte);
+#else  /* on: advance both pointers by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cast loop npy_byte -> npy_longdouble over dimensions[0] elements (args[0] -> args[1]) using bytewise copies; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_byte_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* on: per-element byte steps come from strides[0]/strides[1] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* on: scalar staging copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* on: scalar staging copy of one result element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* off: no alignment assertions in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_byte_to_longdouble\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* on: copy the source element bytewise into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only one assignment below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* on: single-component source */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* on: convert through the staging variables */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* on: copy the converted value bytewise to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_byte);
+#else  /* on: advance both pointers by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast loop npy_byte -> npy_longdouble over dimensions[0] elements (args[0] -> args[1]), stepping by element size; asserts alignment; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_byte_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* off: strides are never read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* off: no staging copy; src is read in place */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* off: no staging copy; dst is written in place */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* on: assert both pointers are aligned unless there is nothing to do */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_byte_to_longdouble\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* on, but its only content (the two-component load) is compiled out */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only one assignment below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* on: single-component source */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* on: convert straight from *src into *dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* on, but nothing is emitted: the cast above already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* on: advance both pointers by their element size */
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast loop npy_byte -> npy_longdouble over dimensions[0] elements (args[0] -> args[1]), stepping by element size with bytewise copies; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_byte_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* off: strides are never read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* on: scalar staging copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* on: scalar staging copy of one result element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* off: no alignment assertions in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_byte_to_longdouble\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* on: copy the source element bytewise into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only one assignment below survives preprocessing */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* on: single-component source */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* on: convert through the staging variables */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* on: copy the converted value bytewise to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* on: advance both pointers by their element size */
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cast loop npy_byte -> two npy_float components per element, over dimensions[0] elements (args[0] -> args[1]); asserts alignment; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_byte_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements still to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* on: per-element byte steps come from strides[0]/strides[1] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* off: no staging copy; src is read in place */
+    _TYPE1 src_value;
+#endif
+#if 1  /* on: two-component staging buffer for the result */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* on: assert both pointers are aligned unless there is nothing to do */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_byte_to_cfloat\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* on, but its only content (the two-component load) is compiled out */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only one group of assignments below survives preprocessing */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* on: single-component source */
+#  if 1  /* on: two-component destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else  /* on: convert straight from *src into the first component */
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component (presumably the imaginary part) is zeroed */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1  /* on: write both components to dst with typed stores */
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_byte);
+#else  /* on: advance both pointers by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_byte_to_cfloat(  /* strided cast loop that moves elements with memmove instead of typed pointer access */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element advance, in bytes */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current source element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the converted value, [1] is set to 0 in the loop */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_byte_to_cfloat\n");*/  /* disabled debug trace */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* copy the element out byte-wise; the alignment asserts are compiled out in this variant */
+#endif
+
+/* Do the cast: the outer "#if 0" group is compiled out, so only the "#else" group below is active */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* convert the local copy of the source element */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is zero (presumably the imaginary part, going by the function name) */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* write both components back byte-wise */
+#endif
+
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;  /* step both cursors by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_contig_cast_byte_to_cfloat(  /* fixed-step cast loop that reads and writes elements through typed pointers */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this instantiation */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the converted value, [1] is set to 0 in the loop */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when there is at least one element, both pointers must meet the alignment of their element type */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_byte_to_cfloat\n");*/  /* disabled debug trace */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer "#if 0" group is compiled out, so only the "#else" group below is active */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* read the source element in place and convert it */
+#    endif
+    dst_value[1] = 0;  /* second component is zero (presumably the imaginary part, going by the function name) */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* store both components through a typed pointer */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cfloat);  /* fixed step: one destination element */
+        src += sizeof(npy_byte);  /* fixed step: one source element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_contig_cast_byte_to_cfloat(  /* fixed-step cast loop that moves elements with memmove instead of typed pointer access */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this instantiation */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current source element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the converted value, [1] is set to 0 in the loop */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_byte_to_cfloat\n");*/  /* disabled debug trace */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* copy the element out byte-wise; the alignment asserts are compiled out in this variant */
+#endif
+
+/* Do the cast: the outer "#if 0" group is compiled out, so only the "#else" group below is active */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* convert the local copy of the source element */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is zero (presumably the imaginary part, going by the function name) */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* write both components back byte-wise */
+#endif
+
+#if 1
+        dst += sizeof(npy_cfloat);  /* fixed step: one destination element */
+        src += sizeof(npy_byte);  /* fixed step: one source element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_cast_byte_to_cdouble(  /* strided cast loop that reads and writes elements through typed pointers */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element advance, in bytes */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the converted value, [1] is set to 0 in the loop */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when there is at least one element, both pointers must meet the alignment of their element type */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_byte_to_cdouble\n");*/  /* disabled debug trace */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer "#if 0" group is compiled out, so only the "#else" group below is active */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* read the source element in place and convert it */
+#    endif
+    dst_value[1] = 0;  /* second component is zero (presumably the imaginary part, going by the function name) */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* store both components through a typed pointer */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;  /* step both cursors by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_byte_to_cdouble(  /* strided cast loop that moves elements with memmove instead of typed pointer access */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element advance, in bytes */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current source element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the converted value, [1] is set to 0 in the loop */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_byte_to_cdouble\n");*/  /* disabled debug trace */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* copy the element out byte-wise; the alignment asserts are compiled out in this variant */
+#endif
+
+/* Do the cast: the outer "#if 0" group is compiled out, so only the "#else" group below is active */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* convert the local copy of the source element */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is zero (presumably the imaginary part, going by the function name) */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* write both components back byte-wise */
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;  /* step both cursors by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_contig_cast_byte_to_cdouble(  /* fixed-step cast loop that reads and writes elements through typed pointers */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this instantiation */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the converted value, [1] is set to 0 in the loop */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when there is at least one element, both pointers must meet the alignment of their element type */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_byte_to_cdouble\n");*/  /* disabled debug trace */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer "#if 0" group is compiled out, so only the "#else" group below is active */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* read the source element in place and convert it */
+#    endif
+    dst_value[1] = 0;  /* second component is zero (presumably the imaginary part, going by the function name) */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* store both components through a typed pointer */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cdouble);  /* fixed step: one destination element */
+        src += sizeof(npy_byte);  /* fixed step: one source element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_contig_cast_byte_to_cdouble(  /* fixed-step cast loop that moves elements with memmove instead of typed pointer access */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this instantiation */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current source element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the converted value, [1] is set to 0 in the loop */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_byte_to_cdouble\n");*/  /* disabled debug trace */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* copy the element out byte-wise; the alignment asserts are compiled out in this variant */
+#endif
+
+/* Do the cast: the outer "#if 0" group is compiled out, so only the "#else" group below is active */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* convert the local copy of the source element */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is zero (presumably the imaginary part, going by the function name) */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* write both components back byte-wise */
+#endif
+
+#if 1
+        dst += sizeof(npy_cdouble);  /* fixed step: one destination element */
+        src += sizeof(npy_byte);  /* fixed step: one source element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_cast_byte_to_clongdouble(  /* strided cast loop that reads and writes elements through typed pointers */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element advance, in bytes */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the converted value, [1] is set to 0 in the loop */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when there is at least one element, both pointers must meet the alignment of their element type */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_byte_to_clongdouble\n");*/  /* disabled debug trace */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the outer "#if 0" group is compiled out, so only the "#else" group below is active */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* read the source element in place and convert it */
+#    endif
+    dst_value[1] = 0;  /* second component is zero (presumably the imaginary part, going by the function name) */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* store both components through a typed pointer */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;  /* step both cursors by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_byte_to_clongdouble(  /* strided cast loop that moves elements with memmove instead of typed pointer access */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* remaining elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element advance, in bytes */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current source element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the converted value, [1] is set to 0 in the loop */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_byte_to_clongdouble\n");*/  /* disabled debug trace */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* copy the element out byte-wise; the alignment asserts are compiled out in this variant */
+#endif
+
+/* Do the cast: the outer "#if 0" group is compiled out, so only the "#else" group below is active */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* convert the local copy of the source element */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is zero (presumably the imaginary part, going by the function name) */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* write both components back byte-wise */
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;  /* step both cursors by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* "!1" is 0, so this guard is always true: the aligned loop below is always compiled. */
+/* For half types, don't use actual double/float types in conversion (half branch is off here) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function: no half branch applies, a plain C cast to _TYPE2 is used */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* NOTE(review): x is expanded unparenthesized; call sites in this block pass only unary/postfix expressions. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: converts dimensions[0] npy_byte items at args[0] into npy_longdouble pairs (value, 0) at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_byte_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No src_value temporary in this variant; dst_value holds the two components stored per item. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_byte_to_clongdouble\n");*/
+    /* Per item: optional load, convert, store, advance both pointers. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: nothing was loaded above; the active branch converts *(_TYPE1 *)src and zeroes dst_value[1] */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: direct writes of both dst_value components through a _TYPE2 pointer. */
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the element sizes (strides are not read in this variant). */
+#if 1
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* "!0" is 1, so this guard is !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (half branch is off here) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_byte
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_byte
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function: no half branch applies, a plain C cast to _TYPE2 is used */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* NOTE(review): x is expanded unparenthesized; call sites in this block pass only unary/postfix expressions. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop using memmove for load/store: dimensions[0] npy_byte items at args[0] -> npy_longdouble pairs (value, 0) at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_byte_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar src_value temporary plus a two-component dst_value; both are moved with memmove below. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts below are compiled out (#if 0) for this variant. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_byte_to_clongdouble\n");*/
+    /* Per item: load via memmove, convert, store via memmove, advance both pointers. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: src_value was copied in with memmove above; the active branch converts it and zeroes dst_value[1] */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: memmove both components of dst_value to dst. */
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the element sizes (strides are not read in this variant). */
+#if 1
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_byte);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* "!1" is 0, so this guard is always true: the aligned loop below is always compiled. */
+/* For half types, don't use actual double/float types in conversion (half branch is off here) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function: no half branch applies, the "!= 0" test is used */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* NOTE(review): x is expanded unparenthesized; call sites in this block pass only unary/postfix expressions. */
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: for each of dimensions[0] npy_short items at args[0], writes (npy_bool)(item != 0) at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Neither src_value nor dst_value is declared in this variant: the cast goes pointer to pointer. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_short_to_bool\n");*/
+    /* Per item: convert directly from src to dst, then advance both pointers. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: nothing was loaded above; the active branch converts *(_TYPE1 *)src straight into *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing to do here, the cast above already wrote to dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the byte strides read from strides[0] and strides[1]. */
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* "!0" is 1, so this guard is !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (half branch is off here) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function: no half branch applies, the "!= 0" test is used */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* NOTE(review): x is expanded unparenthesized; call sites in this block pass only unary/postfix expressions. */
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop using memmove for load/store: dimensions[0] npy_short items at args[0] -> (npy_bool)(item != 0) at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_short_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar src_value/dst_value temporaries, filled and flushed with memmove below. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts below are compiled out (#if 0) for this variant. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_short_to_bool\n");*/
+    /* Per item: load via memmove, convert, store via memmove, advance both pointers. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: src_value was copied in with memmove above; the active branch assigns its conversion to dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: memmove dst_value to dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the byte strides read from strides[0] and strides[1]. */
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* "!1" is 0, so this guard is always true: the aligned loop below is always compiled. */
+/* For half types, don't use actual double/float types in conversion (half branch is off here) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function: no half branch applies, the "!= 0" test is used */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* NOTE(review): x is expanded unparenthesized; call sites in this block pass only unary/postfix expressions. */
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: for each of dimensions[0] npy_short items at args[0], writes (npy_bool)(item != 0) at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Neither src_value nor dst_value is declared in this variant: the cast goes pointer to pointer. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_short_to_bool\n");*/
+    /* Per item: convert directly from src to dst, then advance both pointers. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: nothing was loaded above; the active branch converts *(_TYPE1 *)src straight into *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing to do here, the cast above already wrote to dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the element sizes (strides are not read in this variant). */
+#if 1
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* "!0" is 1, so this guard is !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (half branch is off here) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function: no half branch applies, the "!= 0" test is used */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* NOTE(review): x is expanded unparenthesized; call sites in this block pass only unary/postfix expressions. */
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop using memmove for load/store: dimensions[0] npy_short items at args[0] -> (npy_bool)(item != 0) at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar src_value/dst_value temporaries, filled and flushed with memmove below. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts below are compiled out (#if 0) for this variant. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_short_to_bool\n");*/
+    /* Per item: load via memmove, convert, store via memmove, advance both pointers. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: src_value was copied in with memmove above; the active branch assigns its conversion to dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: memmove dst_value to dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the element sizes (strides are not read in this variant). */
+#if 1
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* "!1" is 0, so this guard is always true: the aligned loop below is always compiled. */
+/* For half types, don't use actual double/float types in conversion (half branch is off here) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function: no half branch applies, a plain C cast to _TYPE2 is used */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* NOTE(review): x is expanded unparenthesized; call sites in this block pass only unary/postfix expressions. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: C-casts each of dimensions[0] npy_short items at args[0] to npy_ubyte at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Neither src_value nor dst_value is declared in this variant: the cast goes pointer to pointer. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_short_to_ubyte\n");*/
+    /* Per item: convert directly from src to dst, then advance both pointers. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: nothing was loaded above; the active branch converts *(_TYPE1 *)src straight into *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing to do here, the cast above already wrote to dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the byte strides read from strides[0] and strides[1]. */
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* "!0" is 1, so this guard is !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (half branch is off here) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function: no half branch applies, a plain C cast to _TYPE2 is used */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* NOTE(review): x is expanded unparenthesized; call sites in this block pass only unary/postfix expressions. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop using memmove for load/store: C-casts dimensions[0] npy_short items at args[0] to npy_ubyte at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_short_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar src_value/dst_value temporaries, filled and flushed with memmove below. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts below are compiled out (#if 0) for this variant. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_short_to_ubyte\n");*/
+    /* Per item: load via memmove, convert, store via memmove, advance both pointers. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: src_value was copied in with memmove above; the active branch assigns its conversion to dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: memmove dst_value to dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the byte strides read from strides[0] and strides[1]. */
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* "!1" is 0, so this guard is always true: the aligned loop below is always compiled. */
+/* For half types, don't use actual double/float types in conversion (half branch is off here) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function: no half branch applies, a plain C cast to _TYPE2 is used */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* NOTE(review): x is expanded unparenthesized; call sites in this block pass only unary/postfix expressions. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: C-casts each of dimensions[0] npy_short items at args[0] to npy_ubyte at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Neither src_value nor dst_value is declared in this variant: the cast goes pointer to pointer. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_short_to_ubyte\n");*/
+    /* Per item: convert directly from src to dst, then advance both pointers. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: nothing was loaded above; the active branch converts *(_TYPE1 *)src straight into *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing to do here, the cast above already wrote to dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the element sizes (strides are not read in this variant). */
+#if 1
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: omitted when NPY_USE_UNALIGNED_ACCESS is nonzero */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this expansion: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_ubyte  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast */
+#  endif
+
+#endif
+/* Unaligned contiguous loop: converts dimensions[0] npy_short elements at args[0] into npy_ubyte elements at args[1], staging each value through locals with memmove. context, strides and data are not read; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* contiguous loop: the strides are not needed */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: staging temporaries for the memmove-based access */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are disabled for the unaligned variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_short_to_ubyte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load without assuming src is aligned */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* the one cast statement selected by the conditionals */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store without assuming dst is aligned */
+#endif
+
+#if 1  /* contiguous: advance each pointer by one element */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the unaligned-variant guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: the && !1 term makes this guard always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this expansion: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_ushort  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast */
+#  endif
+
+#endif
+/* Aligned strided loop: converts dimensions[0] npy_short elements starting at args[0] into npy_ushort elements at args[1], dereferencing both pointers directly and advancing them by strides[0] / strides[1] bytes. context and data are not read; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* strided loop: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: the aligned variant declares no staging temporaries */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: both pointers must pass npy_is_aligned for their element type unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_short_to_ushort\n");*/
+
+    while (N--) {
+#if 1  /* aligned: no separate load step is emitted */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* the one cast statement selected: read, cast and write in place */
+#  endif
+#endif
+
+#if 1  /* aligned: no separate store step is emitted */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_short);
+#else  /* strided: advance each pointer by its byte stride */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the aligned-variant guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: omitted when NPY_USE_UNALIGNED_ACCESS is nonzero */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this expansion: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_ushort  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast */
+#  endif
+
+#endif
+/* Unaligned strided loop: converts dimensions[0] npy_short elements starting at args[0] into npy_ushort elements at args[1], staging each value through locals with memmove and advancing the pointers by strides[0] / strides[1] bytes. context and data are not read; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_short_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* strided loop: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: staging temporaries for the memmove-based access */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are disabled for the unaligned variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_short_to_ushort\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load without assuming src is aligned */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* the one cast statement selected by the conditionals */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store without assuming dst is aligned */
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_short);
+#else  /* strided: advance each pointer by its byte stride */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the unaligned-variant guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: the && !1 term makes this guard always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this expansion: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_ushort  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast */
+#  endif
+
+#endif
+/* Aligned contiguous loop: converts dimensions[0] npy_short elements at args[0] into npy_ushort elements at args[1], dereferencing both pointers directly and advancing them by the element sizes. context, strides and data are not read; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* contiguous loop: the strides are not needed */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: the aligned variant declares no staging temporaries */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: both pointers must pass npy_is_aligned for their element type unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_short_to_ushort\n");*/
+
+    while (N--) {
+#if 1  /* aligned: no separate load step is emitted */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* the one cast statement selected: read, cast and write in place */
+#  endif
+#endif
+
+#if 1  /* aligned: no separate store step is emitted */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous: advance each pointer by one element */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the aligned-variant guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: omitted when NPY_USE_UNALIGNED_ACCESS is nonzero */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this expansion: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_ushort  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast */
+#  endif
+
+#endif
+/* Unaligned contiguous loop: converts dimensions[0] npy_short elements at args[0] into npy_ushort elements at args[1], staging each value through locals with memmove. context, strides and data are not read; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* contiguous loop: the strides are not needed */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: staging temporaries for the memmove-based access */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are disabled for the unaligned variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_short_to_ushort\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load without assuming src is aligned */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* the one cast statement selected by the conditionals */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store without assuming dst is aligned */
+#endif
+
+#if 1  /* contiguous: advance each pointer by one element */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the unaligned-variant guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: the && !1 term makes this guard always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this expansion: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_uint  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast */
+#  endif
+
+#endif
+/* Aligned strided loop: converts dimensions[0] npy_short elements starting at args[0] into npy_uint elements at args[1], dereferencing both pointers directly and advancing them by strides[0] / strides[1] bytes. context and data are not read; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* strided loop: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: the aligned variant declares no staging temporaries */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: both pointers must pass npy_is_aligned for their element type unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_short_to_uint\n");*/
+
+    while (N--) {
+#if 1  /* aligned: no separate load step is emitted */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* the one cast statement selected: read, cast and write in place */
+#  endif
+#endif
+
+#if 1  /* aligned: no separate store step is emitted */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_short);
+#else  /* strided: advance each pointer by its byte stride */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the aligned-variant guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: omitted when NPY_USE_UNALIGNED_ACCESS is nonzero */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this expansion: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_uint  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast */
+#  endif
+
+#endif
+/* Unaligned strided loop: converts dimensions[0] npy_short elements starting at args[0] into npy_uint elements at args[1], staging each value through locals with memmove and advancing the pointers by strides[0] / strides[1] bytes. context and data are not read; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_short_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* strided loop: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: staging temporaries for the memmove-based access */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are disabled for the unaligned variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_short_to_uint\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load without assuming src is aligned */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* the one cast statement selected by the conditionals */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store without assuming dst is aligned */
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_short);
+#else  /* strided: advance each pointer by its byte stride */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the unaligned-variant guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: the && !1 term makes this guard always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this expansion: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_uint  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast */
+#  endif
+
+#endif
+/* Aligned contiguous loop: converts dimensions[0] npy_short elements at args[0] into npy_uint elements at args[1], dereferencing both pointers directly and advancing them by the element sizes. context, strides and data are not read; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* contiguous loop: the strides are not needed */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: the aligned variant declares no staging temporaries */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: both pointers must pass npy_is_aligned for their element type unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_short_to_uint\n");*/
+
+    while (N--) {
+#if 1  /* aligned: no separate load step is emitted */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* the one cast statement selected: read, cast and write in place */
+#  endif
+#endif
+
+#if 1  /* aligned: no separate store step is emitted */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous: advance each pointer by one element */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the aligned-variant guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: omitted when NPY_USE_UNALIGNED_ACCESS is nonzero */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this expansion: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_uint  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast */
+#  endif
+
+#endif
+/* Unaligned contiguous loop: converts dimensions[0] npy_short elements at args[0] into npy_uint elements at args[1], staging each value through locals with memmove. context, strides and data are not read; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* contiguous loop: the strides are not needed */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: staging temporaries for the memmove-based access */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are disabled for the unaligned variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_short_to_uint\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* load without assuming src is aligned */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* the one cast statement selected by the conditionals */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* store without assuming dst is aligned */
+#endif
+
+#if 1  /* contiguous: advance each pointer by one element */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* end of the unaligned-variant guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* "!1" is 0, so this guard is always true and the aligned variant is compiled unconditionally. */
+/* For half types, don't use actual double/float types in conversion (branch disabled for this cast) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else is active: a plain C cast to _TYPE2 */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided, aligned npy_short -> npy_ulong cast loop over dimensions[0] elements; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must satisfy their type alignment unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_short_to_ulong\n");*/
+    /* One element per iteration; the aligned path uses no temporaries, so the load/store stages are empty. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch reads src and writes dst directly through typed pointers */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the byte strides taken from strides[0] (src) and strides[1] (dst). */
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* "!0" is 1, so this unaligned variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (branch disabled for this cast) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else is active: a plain C cast to _TYPE2 */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_short -> npy_ulong cast loop with no alignment requirement asserted; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_short_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_short_to_ulong\n");*/
+    /* One element per iteration, staged through memmove temporaries (presumably to tolerate unaligned src/dst). */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts the src_value temporary into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the byte strides taken from strides[0] (src) and strides[1] (dst). */
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* "!1" is 0, so this guard is always true and the aligned variant is compiled unconditionally. */
+/* For half types, don't use actual double/float types in conversion (branch disabled for this cast) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else is active: a plain C cast to _TYPE2 */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous, aligned npy_short -> npy_ulong cast loop over dimensions[0] elements; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must satisfy their type alignment unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_short_to_ulong\n");*/
+    /* One element per iteration; the aligned path uses no temporaries, so the load/store stages are empty. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch reads src and writes dst directly through typed pointers */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: advance by the element sizes; strides[] is never read in this variant. */
+#if 1
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* "!0" is 1, so this unaligned variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (branch disabled for this cast) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else is active: a plain C cast to _TYPE2 */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_short -> npy_ulong cast loop with no alignment requirement asserted; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_short_to_ulong\n");*/
+    /* One element per iteration, staged through memmove temporaries (presumably to tolerate unaligned src/dst). */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts the src_value temporary into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: advance by the element sizes; strides[] is never read in this variant. */
+#if 1
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* "!1" is 0, so this guard is always true and the aligned variant is compiled unconditionally. */
+/* For half types, don't use actual double/float types in conversion (branch disabled for this cast) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else is active: a plain C cast to _TYPE2 */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided, aligned npy_short -> npy_ulonglong cast loop over dimensions[0] elements; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must satisfy their type alignment unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_short_to_ulonglong\n");*/
+    /* One element per iteration; the aligned path uses no temporaries, so the load/store stages are empty. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch reads src and writes dst directly through typed pointers */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the byte strides taken from strides[0] (src) and strides[1] (dst). */
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* "!0" is 1, so this unaligned variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (branch disabled for this cast) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else is active: a plain C cast to _TYPE2 */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_short -> npy_ulonglong cast loop with no alignment requirement asserted; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_short_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_short_to_ulonglong\n");*/
+    /* One element per iteration, staged through memmove temporaries (presumably to tolerate unaligned src/dst). */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts the src_value temporary into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the byte strides taken from strides[0] (src) and strides[1] (dst). */
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* "!1" is 0, so this guard is always true and the aligned variant is compiled unconditionally. */
+/* For half types, don't use actual double/float types in conversion (branch disabled for this cast) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else is active: a plain C cast to _TYPE2 */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous, aligned npy_short -> npy_ulonglong cast loop over dimensions[0] elements; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must satisfy their type alignment unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_short_to_ulonglong\n");*/
+    /* One element per iteration; the aligned path uses no temporaries, so the load/store stages are empty. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch reads src and writes dst directly through typed pointers */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: advance by the element sizes; strides[] is never read in this variant. */
+#if 1
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* "!0" is 1, so this unaligned variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (branch disabled for this cast) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else is active: a plain C cast to _TYPE2 */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_short -> npy_ulonglong cast loop with no alignment requirement asserted; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_short_to_ulonglong\n");*/
+    /* One element per iteration, staged through memmove temporaries (presumably to tolerate unaligned src/dst). */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts the src_value temporary into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: advance by the element sizes; strides[] is never read in this variant. */
+#if 1
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* "!1" is 0, so this guard is always true and the aligned variant is compiled unconditionally. */
+/* For half types, don't use actual double/float types in conversion (branch disabled for this cast) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else is active: a plain C cast to _TYPE2 */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided, aligned npy_short -> npy_byte cast loop over dimensions[0] elements; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must satisfy their type alignment unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_short_to_byte\n");*/
+    /* One element per iteration; the aligned path uses no temporaries, so the load/store stages are empty. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch reads src and writes dst directly through typed pointers */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by the byte strides taken from strides[0] (src) and strides[1] (dst). */
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided cast loop: npy_short -> npy_byte.
+ *
+ * Each element is staged through a local with memmove() instead of being
+ * dereferenced in place through a typed pointer.  The guard above reduces
+ * to !NPY_USE_UNALIGNED_ACCESS, so this variant is compiled out when that
+ * macro is true.
+ *
+ *   args[0], strides[0]  - source buffer and its step in bytes
+ *   args[1], strides[1]  - destination buffer and its step in bytes
+ *   dimensions[0]        - number of elements to convert
+ *
+ * `context` and `data` are not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_short_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in = args[0];
+    char *out = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining != 0; remaining--) {
+        npy_short loaded;
+        npy_byte converted;
+
+        /* byte-wise load, plain C conversion, byte-wise store */
+        memmove(&loaded, in, sizeof(loaded));
+        converted = (npy_byte)loaded;
+        memmove(out, &converted, sizeof(converted));
+
+        in += in_step;
+        out += out_step;
+    }
+    return 0;
+}
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * _TYPE1/_TYPE2 stay as macros so the assert() expressions below keep
+ * their exact text.
+ */
+#define _TYPE1 npy_short
+#define _TYPE2 npy_byte
+
+/*
+ * Packed cast loop for aligned buffers: npy_short -> npy_byte.
+ *
+ * Elements are read and written directly through typed pointers; both
+ * pointers are asserted to be aligned whenever the count is non-zero.
+ * The pointers advance by the element sizes, so `strides` is never read.
+ * Because of the constant `!1` term, the guard above is true whatever
+ * value NPY_USE_UNALIGNED_ACCESS has.
+ *
+ *   args[0]        - source buffer
+ *   args[1]        - destination buffer
+ *   dimensions[0]  - number of elements to convert
+ *
+ * `context` and `data` are not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        const _TYPE1 value = *(_TYPE1 *)src;
+
+        *(_TYPE2 *)dst = (_TYPE2)value;
+        src += sizeof(npy_short);
+        dst += sizeof(npy_byte);
+    }
+    return 0;
+}
+
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Packed cast loop: npy_short -> npy_byte.
+ *
+ * Each element is staged through a local with memmove() instead of being
+ * dereferenced in place through a typed pointer.  The pointers advance by
+ * the element sizes, so `strides` is never read.  The guard above reduces
+ * to !NPY_USE_UNALIGNED_ACCESS, so this variant is compiled out when that
+ * macro is true.
+ *
+ *   args[0]        - source buffer
+ *   args[1]        - destination buffer
+ *   dimensions[0]  - number of elements to convert
+ *
+ * `context` and `data` are not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining != 0; remaining--) {
+        npy_short loaded;
+        npy_byte converted;
+
+        /* byte-wise load, plain C conversion, byte-wise store */
+        memmove(&loaded, in, sizeof(loaded));
+        converted = (npy_byte)loaded;
+        memmove(out, &converted, sizeof(converted));
+
+        in += sizeof(npy_short);
+        out += sizeof(npy_byte);
+    }
+    return 0;
+}
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * _TYPE1/_TYPE2 stay as macros so the assert() expressions below keep
+ * their exact text.
+ */
+#define _TYPE1 npy_short
+#define _TYPE2 npy_short
+
+/*
+ * Strided cast loop for aligned buffers: npy_short -> npy_short.
+ *
+ * Elements are read and written directly through typed pointers; both
+ * pointers are asserted to be aligned whenever the count is non-zero.
+ * Because of the constant `!1` term, the guard above is true whatever
+ * value NPY_USE_UNALIGNED_ACCESS has.
+ *
+ *   args[0], strides[0]  - source buffer and its step in bytes
+ *   args[1], strides[1]  - destination buffer and its step in bytes
+ *   dimensions[0]        - number of elements to convert
+ *
+ * `context` and `data` are not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        const _TYPE1 value = *(_TYPE1 *)src;
+
+        *(_TYPE2 *)dst = (_TYPE2)value;
+        src += src_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided cast loop: npy_short -> npy_short.
+ *
+ * Each element is staged through a local with memmove() instead of being
+ * dereferenced in place through a typed pointer.  The guard above reduces
+ * to !NPY_USE_UNALIGNED_ACCESS, so this variant is compiled out when that
+ * macro is true.
+ *
+ *   args[0], strides[0]  - source buffer and its step in bytes
+ *   args[1], strides[1]  - destination buffer and its step in bytes
+ *   dimensions[0]        - number of elements to convert
+ *
+ * `context` and `data` are not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_short_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in = args[0];
+    char *out = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining != 0; remaining--) {
+        npy_short loaded;
+        npy_short converted;
+
+        /* byte-wise load, plain C conversion, byte-wise store */
+        memmove(&loaded, in, sizeof(loaded));
+        converted = (npy_short)loaded;
+        memmove(out, &converted, sizeof(converted));
+
+        in += in_step;
+        out += out_step;
+    }
+    return 0;
+}
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * _TYPE1/_TYPE2 stay as macros so the assert() expressions below keep
+ * their exact text.
+ */
+#define _TYPE1 npy_short
+#define _TYPE2 npy_short
+
+/*
+ * Packed cast loop for aligned buffers: npy_short -> npy_short.
+ *
+ * Elements are read and written directly through typed pointers; both
+ * pointers are asserted to be aligned whenever the count is non-zero.
+ * The pointers advance by the element sizes, so `strides` is never read.
+ * Because of the constant `!1` term, the guard above is true whatever
+ * value NPY_USE_UNALIGNED_ACCESS has.
+ *
+ *   args[0]        - source buffer
+ *   args[1]        - destination buffer
+ *   dimensions[0]  - number of elements to convert
+ *
+ * `context` and `data` are not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        const _TYPE1 value = *(_TYPE1 *)src;
+
+        *(_TYPE2 *)dst = (_TYPE2)value;
+        src += sizeof(npy_short);
+        dst += sizeof(npy_short);
+    }
+    return 0;
+}
+
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Packed cast loop: npy_short -> npy_short.
+ *
+ * Each element is staged through a local with memmove() instead of being
+ * dereferenced in place through a typed pointer.  The pointers advance by
+ * the element sizes, so `strides` is never read.  The guard above reduces
+ * to !NPY_USE_UNALIGNED_ACCESS, so this variant is compiled out when that
+ * macro is true.
+ *
+ *   args[0]        - source buffer
+ *   args[1]        - destination buffer
+ *   dimensions[0]  - number of elements to convert
+ *
+ * `context` and `data` are not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining != 0; remaining--) {
+        npy_short loaded;
+        npy_short converted;
+
+        /* byte-wise load, plain C conversion, byte-wise store */
+        memmove(&loaded, in, sizeof(loaded));
+        converted = (npy_short)loaded;
+        memmove(out, &converted, sizeof(converted));
+
+        in += sizeof(npy_short);
+        out += sizeof(npy_short);
+    }
+    return 0;
+}
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * _TYPE1/_TYPE2 stay as macros so the assert() expressions below keep
+ * their exact text.
+ */
+#define _TYPE1 npy_short
+#define _TYPE2 npy_int
+
+/*
+ * Strided cast loop for aligned buffers: npy_short -> npy_int.
+ *
+ * Elements are read and written directly through typed pointers; both
+ * pointers are asserted to be aligned whenever the count is non-zero.
+ * Because of the constant `!1` term, the guard above is true whatever
+ * value NPY_USE_UNALIGNED_ACCESS has.
+ *
+ *   args[0], strides[0]  - source buffer and its step in bytes
+ *   args[1], strides[1]  - destination buffer and its step in bytes
+ *   dimensions[0]        - number of elements to convert
+ *
+ * `context` and `data` are not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        const _TYPE1 value = *(_TYPE1 *)src;
+
+        *(_TYPE2 *)dst = (_TYPE2)value;
+        src += src_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided cast loop: npy_short -> npy_int.
+ *
+ * Each element is staged through a local with memmove() instead of being
+ * dereferenced in place through a typed pointer.  The guard above reduces
+ * to !NPY_USE_UNALIGNED_ACCESS, so this variant is compiled out when that
+ * macro is true.
+ *
+ *   args[0], strides[0]  - source buffer and its step in bytes
+ *   args[1], strides[1]  - destination buffer and its step in bytes
+ *   dimensions[0]        - number of elements to convert
+ *
+ * `context` and `data` are not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_short_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in = args[0];
+    char *out = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining;
+
+    for (remaining = dimensions[0]; remaining != 0; remaining--) {
+        npy_short loaded;
+        npy_int converted;
+
+        /* byte-wise load, plain C conversion, byte-wise store */
+        memmove(&loaded, in, sizeof(loaded));
+        converted = (npy_int)loaded;
+        memmove(out, &converted, sizeof(converted));
+
+        in += in_step;
+        out += out_step;
+    }
+    return 0;
+}
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Always compiled: the !1 term is 0, so this guard is true regardless of NPY_USE_UNALIGNED_ACCESS */
+/* For half types, don't use actual double/float types in conversion; that branch is disabled here, so _TYPE1 is npy_short (source) and _TYPE2 is npy_int (destination) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function: the half-float and npy_bool branches are all disabled here, leaving the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts N = dimensions[0] contiguous npy_short items at args[0] to npy_int at args[1] via direct typed loads/stores; context and strides are not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: for N != 0, src and dst must satisfy the alignment of _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_short_to_int\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the final "#else" arm is active, a typed load from src, _CONVERT_FN, and a typed store straight into dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: nothing is emitted here, the cast above already stored into dst */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by their fixed element sizes (contiguous layout) */
+#if 1
+        dst += sizeof(npy_int);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The !0 term is 1, so this unaligned variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+/* For half types, don't use actual double/float types in conversion; that branch is disabled here, so _TYPE1 is npy_short (source) and _TYPE2 is npy_int (destination) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function: the half-float and npy_bool branches are all disabled here, leaving the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts N = dimensions[0] contiguous npy_short items at args[0] to npy_int at args[1], copying each item through temporaries with memmove; context and strides are not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (disabled by the enclosing "#if 0": this variant does not require aligned pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_short_to_int\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the "#elif !0" arm of the "#else" half is active, converting the src_value temporary into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: copy the converted temporary into dst bytewise with memmove */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by their fixed element sizes (contiguous layout) */
+#if 1
+        dst += sizeof(npy_int);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Always compiled: the !1 term is 0, so this guard is true regardless of NPY_USE_UNALIGNED_ACCESS */
+/* For half types, don't use actual double/float types in conversion; that branch is disabled here, so _TYPE1 is npy_short (source) and _TYPE2 is npy_long (destination) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function: the half-float and npy_bool branches are all disabled here, leaving the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts N = dimensions[0] npy_short items at args[0] to npy_long at args[1] via direct typed loads/stores, stepping by strides[0] and strides[1] bytes; context is not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: for N != 0, src and dst must satisfy the alignment of _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_short_to_long\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the final "#else" arm is active, a typed load from src, _CONVERT_FN, and a typed store straight into dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: nothing is emitted here, the cast above already stored into dst */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied byte strides */
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The !0 term is 1, so this unaligned variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+/* For half types, don't use actual double/float types in conversion; that branch is disabled here, so _TYPE1 is npy_short (source) and _TYPE2 is npy_long (destination) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function: the half-float and npy_bool branches are all disabled here, leaving the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts N = dimensions[0] npy_short items at args[0] to npy_long at args[1], copying each item through temporaries with memmove and stepping by strides[0] and strides[1] bytes; context is not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_short_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (disabled by the enclosing "#if 0": this variant does not require aligned pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_short_to_long\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the "#elif !0" arm of the "#else" half is active, converting the src_value temporary into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: copy the converted temporary into dst bytewise with memmove */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied byte strides */
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Always compiled: the !1 term is 0, so this guard is true regardless of NPY_USE_UNALIGNED_ACCESS */
+/* For half types, don't use actual double/float types in conversion; that branch is disabled here, so _TYPE1 is npy_short (source) and _TYPE2 is npy_long (destination) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function: the half-float and npy_bool branches are all disabled here, leaving the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts N = dimensions[0] contiguous npy_short items at args[0] to npy_long at args[1] via direct typed loads/stores; context and strides are not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: for N != 0, src and dst must satisfy the alignment of _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_short_to_long\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the final "#else" arm is active, a typed load from src, _CONVERT_FN, and a typed store straight into dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: nothing is emitted here, the cast above already stored into dst */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by their fixed element sizes (contiguous layout) */
+#if 1
+        dst += sizeof(npy_long);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The !0 term is 1, so this unaligned variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+/* For half types, don't use actual double/float types in conversion; that branch is disabled here, so _TYPE1 is npy_short (source) and _TYPE2 is npy_long (destination) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function: the half-float and npy_bool branches are all disabled here, leaving the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts N = dimensions[0] contiguous npy_short items at args[0] to npy_long at args[1], copying each item through temporaries with memmove; context and strides are not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (disabled by the enclosing "#if 0": this variant does not require aligned pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_short_to_long\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the "#elif !0" arm of the "#else" half is active, converting the src_value temporary into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: copy the converted temporary into dst bytewise with memmove */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by their fixed element sizes (contiguous layout) */
+#if 1
+        dst += sizeof(npy_long);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Always compiled: the !1 term is 0, so this guard is true regardless of NPY_USE_UNALIGNED_ACCESS */
+/* For half types, don't use actual double/float types in conversion; that branch is disabled here, so _TYPE1 is npy_short (source) and _TYPE2 is npy_longlong (destination) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function: the half-float and npy_bool branches are all disabled here, leaving the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts N = dimensions[0] npy_short items at args[0] to npy_longlong at args[1] via direct typed loads/stores, stepping by strides[0] and strides[1] bytes; context is not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: for N != 0, src and dst must satisfy the alignment of _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_short_to_longlong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the final "#else" arm is active, a typed load from src, _CONVERT_FN, and a typed store straight into dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: nothing is emitted here, the cast above already stored into dst */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied byte strides */
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The !0 term is 1, so this unaligned variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+/* For half types, don't use actual double/float types in conversion; that branch is disabled here, so _TYPE1 is npy_short (source) and _TYPE2 is npy_longlong (destination) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function: the half-float and npy_bool branches are all disabled here, leaving the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts N = dimensions[0] npy_short items at args[0] to npy_longlong at args[1], copying each item through temporaries with memmove and stepping by strides[0] and strides[1] bytes; context is not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_short_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (disabled by the enclosing "#if 0": this variant does not require aligned pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_short_to_longlong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the "#elif !0" arm of the "#else" half is active, converting the src_value temporary into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: copy the converted temporary into dst bytewise with memmove */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied byte strides */
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Always compiled: the !1 term is 0, so this guard is true regardless of NPY_USE_UNALIGNED_ACCESS */
+/* For half types, don't use actual double/float types in conversion; that branch is disabled here, so _TYPE1 is npy_short (source) and _TYPE2 is npy_longlong (destination) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function: the half-float and npy_bool branches are all disabled here, leaving the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts N = dimensions[0] contiguous npy_short items at args[0] to npy_longlong at args[1] via direct typed loads/stores; context and strides are not read. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: for N != 0, src and dst must satisfy the alignment of _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_short_to_longlong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the final "#else" arm is active, a typed load from src, _CONVERT_FN, and a typed store straight into dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: nothing is emitted here, the cast above already stored into dst */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by their fixed element sizes (contiguous layout) */
+#if 1
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Contiguous npy_short -> npy_longlong cast loop that copies through memmove; compiled only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion; this expansion takes the #else branch: _TYPE1 = npy_short, _TYPE2 = npy_longlong. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function; the live definition here is the plain C cast ((_TYPE2)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] contiguous elements from args[0] into args[1]; context, strides and data are not used. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_short_to_longlong\n");*/
+    /* Process one element per iteration until N reaches zero. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch converts the local copy src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Strided npy_short -> npy_half cast loop that asserts src/dst alignment; the guard is always true since it ends in "&& !1". */
+/* For half types, don't use actual double/float types in conversion; this expansion takes the first branch, which resolves to _TYPE1 = npy_short, _TYPE2 = npy_half. */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function; the live definition here is npy_float_to_half((float)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements from args[0] into args[1], stepping by strides[0] and strides[1] bytes; context and data are not used. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_short_to_half\n");*/
+    /* Process one element per iteration until N reaches zero. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch reads *src and stores to *dst directly, without temporaries */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Strided npy_short -> npy_half cast loop that copies through memmove; compiled only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion; this expansion takes the first branch, which resolves to _TYPE1 = npy_short, _TYPE2 = npy_half. */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function; the live definition here is npy_float_to_half((float)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements from args[0] into args[1], stepping by strides[0] and strides[1] bytes; context and data are not used. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_short_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_short_to_half\n");*/
+    /* Process one element per iteration until N reaches zero. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch converts the local copy src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Contiguous npy_short -> npy_half cast loop that asserts src/dst alignment; the guard is always true since it ends in "&& !1". */
+/* For half types, don't use actual double/float types in conversion; this expansion takes the first branch, which resolves to _TYPE1 = npy_short, _TYPE2 = npy_half. */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function; the live definition here is npy_float_to_half((float)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] contiguous elements from args[0] into args[1]; context, strides and data are not used. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_short_to_half\n");*/
+    /* Process one element per iteration until N reaches zero. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch reads *src and stores to *dst directly, without temporaries */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_half);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Contiguous npy_short -> npy_half cast loop that copies through memmove; compiled only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion; this expansion takes the first branch, which resolves to _TYPE1 = npy_short, _TYPE2 = npy_half. */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function; the live definition here is npy_float_to_half((float)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] contiguous elements from args[0] into args[1]; context, strides and data are not used. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_short_to_half\n");*/
+    /* Process one element per iteration until N reaches zero. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch converts the local copy src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_half);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Strided npy_short -> npy_float cast loop that asserts src/dst alignment; the guard is always true since it ends in "&& !1". */
+/* For half types, don't use actual double/float types in conversion; this expansion takes the #else branch: _TYPE1 = npy_short, _TYPE2 = npy_float. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function; the live definition here is the plain C cast ((_TYPE2)x). */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements from args[0] into args[1], stepping by strides[0] and strides[1] bytes; context and data are not used. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_short_to_float\n");*/
+    /* Process one element per iteration until N reaches zero. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch reads *src and stores to *dst directly, without temporaries */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Strided npy_short -> npy_float cast loop that copies through memmove; compiled only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion; this expansion takes the #else branch: _TYPE1 = npy_short, _TYPE2 = npy_float. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function; the live definition here is the plain C cast ((_TYPE2)x). */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements from args[0] into args[1], stepping by strides[0] and strides[1] bytes; context and data are not used. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_short_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_short_to_float\n");*/
+    /* Process one element per iteration until N reaches zero. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch converts the local copy src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Contiguous npy_short -> npy_float cast loop that asserts src/dst alignment; the guard is always true since it ends in "&& !1". */
+/* For half types, don't use actual double/float types in conversion; this expansion takes the #else branch: _TYPE1 = npy_short, _TYPE2 = npy_float. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function; the live definition here is the plain C cast ((_TYPE2)x). */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] contiguous elements from args[0] into args[1]; context, strides and data are not used. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_short_to_float\n");*/
+    /* Process one element per iteration until N reaches zero. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch reads *src and stores to *dst directly, without temporaries */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_float);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Contiguous npy_short -> npy_float cast loop that copies through memmove; compiled only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion; this expansion takes the #else branch: _TYPE1 = npy_short, _TYPE2 = npy_float. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function; the live definition here is the plain C cast ((_TYPE2)x). */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] contiguous elements from args[0] into args[1]; context, strides and data are not used. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_short_to_float\n");*/
+    /* Process one element per iteration until N reaches zero. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch converts the local copy src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_float);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Strided npy_short -> npy_double cast loop that asserts src/dst alignment; the guard is always true since it ends in "&& !1". */
+/* For half types, don't use actual double/float types in conversion; this expansion takes the #else branch: _TYPE1 = npy_short, _TYPE2 = npy_double. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_short
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function; the live definition here is the plain C cast ((_TYPE2)x). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements from args[0] into args[1], stepping by strides[0] and strides[1] bytes; context and data are not used. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_short_to_double\n");*/
+    /* Process one element per iteration until N reaches zero. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch reads *src and stores to *dst directly, without temporaries */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided cast loop without alignment checks: npy_short -> npy_double.
+ * This variant is compiled only when NPY_USE_UNALIGNED_ACCESS is false.
+ * The element conversion is a plain C value cast.
+ */
+#define _TYPE1 npy_short
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1], advancing each pointer by its byte stride (strides[0] for the
+ * source, strides[1] for the destination).  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_short_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    for (; N--; src += src_step, dst += dst_step) {
+        /* Go through temporaries rather than dereferencing src/dst. */
+        memmove(&loaded, src, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(dst, &converted, sizeof(converted));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, contiguous cast loop: npy_short -> npy_double.
+ * The element conversion is a plain C value cast.
+ */
+#define _TYPE1 npy_short
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  The strides argument is not consulted: each pointer advances
+ * by the size of its own element type.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* Both pointers are dereferenced directly below; check alignment. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N--; src += sizeof(npy_short), dst += sizeof(npy_double)) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous cast loop without alignment checks: npy_short -> npy_double.
+ * This variant is compiled only when NPY_USE_UNALIGNED_ACCESS is false.
+ * The element conversion is a plain C value cast.
+ */
+#define _TYPE1 npy_short
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  The strides argument is not consulted: each pointer advances
+ * by the size of its own element type.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    for (; N--; src += sizeof(npy_short), dst += sizeof(npy_double)) {
+        /* Go through temporaries rather than dereferencing src/dst. */
+        memmove(&loaded, src, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(dst, &converted, sizeof(converted));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, strided cast loop: npy_short -> npy_longdouble.
+ * The element conversion is a plain C value cast.
+ */
+#define _TYPE1 npy_short
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1], advancing each pointer by its byte stride (strides[0] for the
+ * source, strides[1] for the destination).  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* Both pointers are dereferenced directly below; check alignment. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N--; src += src_step, dst += dst_step) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided cast loop without alignment checks: npy_short -> npy_longdouble.
+ * This variant is compiled only when NPY_USE_UNALIGNED_ACCESS is false.
+ * The element conversion is a plain C value cast.
+ */
+#define _TYPE1 npy_short
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1], advancing each pointer by its byte stride (strides[0] for the
+ * source, strides[1] for the destination).  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_short_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    for (; N--; src += src_step, dst += dst_step) {
+        /* Go through temporaries rather than dereferencing src/dst. */
+        memmove(&loaded, src, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(dst, &converted, sizeof(converted));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, contiguous cast loop: npy_short -> npy_longdouble.
+ * The element conversion is a plain C value cast.
+ */
+#define _TYPE1 npy_short
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  The strides argument is not consulted: each pointer advances
+ * by the size of its own element type.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* Both pointers are dereferenced directly below; check alignment. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N--; src += sizeof(npy_short), dst += sizeof(npy_longdouble)) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous cast loop without alignment checks: npy_short -> npy_longdouble.
+ * This variant is compiled only when NPY_USE_UNALIGNED_ACCESS is false.
+ * The element conversion is a plain C value cast.
+ */
+#define _TYPE1 npy_short
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  The strides argument is not consulted: each pointer advances
+ * by the size of its own element type.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    for (; N--; src += sizeof(npy_short), dst += sizeof(npy_longdouble)) {
+        /* Go through temporaries rather than dereferencing src/dst. */
+        memmove(&loaded, src, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(dst, &converted, sizeof(converted));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, strided cast loop: npy_short -> cfloat.
+ * The destination is handled as a pair of npy_float values; the element
+ * conversion is a plain C value cast.
+ */
+#define _TYPE1 npy_short
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1], advancing each pointer by its byte stride (strides[0] for the
+ * source, strides[1] for the destination).  Each output element receives
+ * the converted value followed by a zero.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE2 parts[2];
+
+    /* Both pointers are dereferenced directly below; check alignment. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N--; src += src_step, dst += dst_step) {
+        /* First component is the converted input, second is zero. */
+        parts[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        parts[1] = 0;
+        ((_TYPE2 *)dst)[0] = parts[0];
+        ((_TYPE2 *)dst)[1] = parts[1];
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided cast loop without alignment checks: npy_short -> cfloat.
+ * This variant is compiled only when NPY_USE_UNALIGNED_ACCESS is false.
+ * The destination is handled as a pair of npy_float values; the element
+ * conversion is a plain C value cast.
+ */
+#define _TYPE1 npy_short
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Converts dimensions[0] elements, reading from args[0] and writing to
+ * args[1], advancing each pointer by its byte stride (strides[0] for the
+ * source, strides[1] for the destination).  Each output element receives
+ * the converted value followed by a zero.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_short_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 loaded;
+    _TYPE2 parts[2];
+
+    for (; N--; src += src_step, dst += dst_step) {
+        /* Go through temporaries rather than dereferencing src/dst. */
+        memmove(&loaded, src, sizeof(loaded));
+        parts[0] = _CONVERT_FN(loaded);
+        parts[1] = 0;
+        /* Copies both components (the whole two-element array) at once. */
+        memmove(dst, &parts, sizeof(parts));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, because !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below selects the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_float  /* type of each of the two values written per output element */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_short elements at args[0] into (value, 0) pairs of npy_float at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no local source copy is declared */
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the cast value, [1] is set to 0 */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: the npy_is_aligned asserts are compiled in */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_short_to_cfloat\n");*/
+
+    while (N--) {
+#if 1  /* true, but the nested #if is 0, so no load is emitted here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else branch below is compiled */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1  /* false: the #else line casts *src directly */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second output value is always 0 */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* true: store both values through a _TYPE2 pointer */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* true: advance by one element size each */
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* same as !NPY_USE_UNALIGNED_ACCESS, because !0 is 1 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below selects the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_float  /* type of each of the two values written per output element */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_short elements at args[0] into (value, 0) pairs of npy_float at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: one source element is staged in a local */
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the cast value, [1] is set to 0 */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: the npy_is_aligned asserts are compiled out */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_short_to_cfloat\n");*/
+
+    while (N--) {
+#if 0  /* false: the memmove in the #else branch does the load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of one source element */
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else branch below is compiled */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0  /* true: cast the staged copy */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second output value is always 0 */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* false: the memmove in the #else branch does the store */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* copies both values, sizeof(dst_value) bytes */
+#endif
+
+#if 1  /* true: advance by one element size each */
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, because !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below selects the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_double  /* type of each of the two values written per output element */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_short elements at args[0] into (value, 0) pairs of npy_double at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* true: byte strides are read for the pointer advance below */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no local source copy is declared */
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the cast value, [1] is set to 0 */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: the npy_is_aligned asserts are compiled in */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_short_to_cdouble\n");*/
+
+    while (N--) {
+#if 1  /* true, but the nested #if is 0, so no load is emitted here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else branch below is compiled */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1  /* false: the #else line casts *src directly */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second output value is always 0 */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* true: store both values through a _TYPE2 pointer */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* false: advance by the strides read above */
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* same as !NPY_USE_UNALIGNED_ACCESS, because !0 is 1 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below selects the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_double  /* type of each of the two values written per output element */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_short elements at args[0] into (value, 0) pairs of npy_double at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_short_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* true: byte strides are read for the pointer advance below */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: one source element is staged in a local */
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the cast value, [1] is set to 0 */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: the npy_is_aligned asserts are compiled out */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_short_to_cdouble\n");*/
+
+    while (N--) {
+#if 0  /* false: the memmove in the #else branch does the load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of one source element */
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else branch below is compiled */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0  /* true: cast the staged copy */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second output value is always 0 */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* false: the memmove in the #else branch does the store */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* copies both values, sizeof(dst_value) bytes */
+#endif
+
+#if 0  /* false: advance by the strides read above */
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, because !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below selects the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_double  /* type of each of the two values written per output element */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_short elements at args[0] into (value, 0) pairs of npy_double at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no local source copy is declared */
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the cast value, [1] is set to 0 */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: the npy_is_aligned asserts are compiled in */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_short_to_cdouble\n");*/
+
+    while (N--) {
+#if 1  /* true, but the nested #if is 0, so no load is emitted here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else branch below is compiled */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1  /* false: the #else line casts *src directly */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second output value is always 0 */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* true: store both values through a _TYPE2 pointer */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* true: advance by one element size each */
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* same as !NPY_USE_UNALIGNED_ACCESS, because !0 is 1 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below selects the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_double  /* type of each of the two values written per output element */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_short elements at args[0] into (value, 0) pairs of npy_double at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: one source element is staged in a local */
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the cast value, [1] is set to 0 */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: the npy_is_aligned asserts are compiled out */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_short_to_cdouble\n");*/
+
+    while (N--) {
+#if 0  /* false: the memmove in the #else branch does the load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of one source element */
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else branch below is compiled */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0  /* true: cast the staged copy */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second output value is always 0 */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* false: the memmove in the #else branch does the store */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* copies both values, sizeof(dst_value) bytes */
+#endif
+
+#if 1  /* true: advance by one element size each */
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, because !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below selects the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_longdouble  /* type of each of the two values written per output element */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_short elements at args[0] into (value, 0) pairs of npy_longdouble at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_short_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* true: byte strides are read for the pointer advance below */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no local source copy is declared */
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the cast value, [1] is set to 0 */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: the npy_is_aligned asserts are compiled in */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_short_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1  /* true, but the nested #if is 0, so no load is emitted here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else branch below is compiled */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1  /* false: the #else line casts *src directly */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second output value is always 0 */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* true: store both values through a _TYPE2 pointer */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* false: advance by the strides read above */
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* same as !NPY_USE_UNALIGNED_ACCESS, because !0 is 1 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below selects the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_longdouble  /* type of each of the two values written per output element */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_short elements at args[0] into (value, 0) pairs of npy_longdouble at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_short_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* true: byte strides are read for the pointer advance below */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* true: one source element is staged in a local */
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the cast value, [1] is set to 0 */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* false: the npy_is_aligned asserts are compiled out */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_short_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0  /* false: the memmove in the #else branch does the load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of one source element */
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else branch below is compiled */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0  /* true: cast the staged copy */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second output value is always 0 */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* false: the memmove in the #else branch does the store */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* copies both values, sizeof(dst_value) bytes */
+#endif
+
+#if 0  /* false: advance by the strides read above */
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, because !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below selects the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_short  /* source element type */
+#define _TYPE2 npy_longdouble  /* type of each of the two values written per output element */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* the definition in effect: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_short elements at args[0] into (value, 0) pairs of npy_longdouble at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_short_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no local source copy is declared */
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* [0] receives the cast value, [1] is set to 0 */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: the npy_is_aligned asserts are compiled in */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_short_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1  /* true, but the nested #if is 0, so no load is emitted here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* false: the #else branch below is compiled */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1  /* false: the #else line casts *src directly */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second output value is always 0 */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* true: store both values through a _TYPE2 pointer */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* true: advance by one element size each */
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/*
+ * `!0` is 1, so the guard above reduces to !NPY_USE_UNALIGNED_ACCESS:
+ * this variant is compiled only when that macro evaluates to 0.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_short
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* Live branch: the `0 || 0` condition above is false. */
+#define _TYPE1 npy_short
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: both conditions above are the literal 0. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Contiguous cast loop from npy_short to npy_clongdouble that makes no
+ * alignment assumption: every element is moved through local temporaries
+ * with memmove instead of being dereferenced in place.
+ *
+ *   context    - not referenced in the body
+ *   args       - args[0] is the source pointer, args[1] the destination
+ *   dimensions - dimensions[0] is the number of elements to convert
+ *   strides    - not read; the pointers step by the element sizes
+ *   data       - unused (NPY_UNUSED)
+ *
+ * Each destination element is written as two npy_longdouble slots: the
+ * source value cast with ((_TYPE2)x), followed by 0 (presumably the real
+ * and imaginary parts -- confirm against the npy_clongdouble layout).
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_short_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Staging temporaries: a scalar source and a two-slot destination. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_short_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. The outer `#if 0` group is dead; inside the outer `#else`
+ * the first branch is compiled: slot 0 gets the converted src_value and
+ * slot 1 is set to 0.
+ */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Contiguous variant: step by element size; the strides are not read. */
+#if 1
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_short);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/*
+ * `!1` is 0, so the guard above is true for any value of
+ * NPY_USE_UNALIGNED_ACCESS: this variant is always compiled.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+/* Live branch: the `0 || 0` condition above is false. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: both conditions above are the literal 0. */
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Strided cast loop from npy_int to npy_bool for buffers that are aligned
+ * for their element types (checked with assert before the loop).
+ *
+ *   context    - not referenced in the body
+ *   args       - args[0] is the source pointer, args[1] the destination
+ *   dimensions - dimensions[0] is the number of elements to convert
+ *   strides    - strides[0] / strides[1] are the byte steps applied to
+ *                the source / destination pointer after each element
+ *   data       - unused (NPY_UNUSED)
+ *
+ * Each element is read and written directly through typed pointers; the
+ * stored value is ((npy_bool)(x != 0)), i.e. 1 for any nonzero input and
+ * 0 otherwise. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Both declaration groups below are dead here: no temporaries exist. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the loop below dereferences src and dst as typed pointers */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_int_to_bool\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. The outer `#if 0` group is dead; inside the outer `#else`
+ * only the last branch is compiled: read src, convert, write dst in one
+ * statement.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Strided variant: step both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/*
+ * `!0` is 1, so the guard above reduces to !NPY_USE_UNALIGNED_ACCESS:
+ * this variant is compiled only when that macro evaluates to 0.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+/* Live branch: the `0 || 0` condition above is false. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: both conditions above are the literal 0. */
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Strided cast loop from npy_int to npy_bool that makes no alignment
+ * assumption: every element is moved through local temporaries with
+ * memmove instead of being dereferenced in place.
+ *
+ *   context    - not referenced in the body
+ *   args       - args[0] is the source pointer, args[1] the destination
+ *   dimensions - dimensions[0] is the number of elements to convert
+ *   strides    - strides[0] / strides[1] are the byte steps applied to
+ *                the source / destination pointer after each element
+ *   data       - unused (NPY_UNUSED)
+ *
+ * The stored value is ((npy_bool)(x != 0)), i.e. 1 for any nonzero input
+ * and 0 otherwise. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_int_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* One-element staging temporaries filled and drained with memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_int_to_bool\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. The outer `#if 0` group is dead; inside the outer `#else`
+ * only `dst_value = _CONVERT_FN(src_value);` is compiled.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Strided variant: step both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/*
+ * `!1` is 0, so the guard above is true for any value of
+ * NPY_USE_UNALIGNED_ACCESS: this variant is always compiled.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+/* Live branch: the `0 || 0` condition above is false. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: both conditions above are the literal 0. */
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Contiguous cast loop from npy_int to npy_bool for buffers that are
+ * aligned for their element types (checked with assert before the loop).
+ *
+ *   context    - not referenced in the body
+ *   args       - args[0] is the source pointer, args[1] the destination
+ *   dimensions - dimensions[0] is the number of elements to convert
+ *   strides    - not read; the pointers step by the element sizes
+ *   data       - unused (NPY_UNUSED)
+ *
+ * Each element is read and written directly through typed pointers; the
+ * stored value is ((npy_bool)(x != 0)), i.e. 1 for any nonzero input and
+ * 0 otherwise. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Both declaration groups below are dead here: no temporaries exist. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the loop below dereferences src and dst as typed pointers */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_int_to_bool\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. The outer `#if 0` group is dead; inside the outer `#else`
+ * only the last branch is compiled: read src, convert, write dst in one
+ * statement.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Contiguous variant: step by element size; the strides are not read. */
+#if 1
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/*
+ * `!0` is 1, so the guard above reduces to !NPY_USE_UNALIGNED_ACCESS:
+ * this variant is compiled only when that macro evaluates to 0.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+/* Live branch: the `0 || 0` condition above is false. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: both conditions above are the literal 0. */
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Contiguous cast loop from npy_int to npy_bool that makes no alignment
+ * assumption: every element is moved through local temporaries with
+ * memmove instead of being dereferenced in place.
+ *
+ *   context    - not referenced in the body
+ *   args       - args[0] is the source pointer, args[1] the destination
+ *   dimensions - dimensions[0] is the number of elements to convert
+ *   strides    - not read; the pointers step by the element sizes
+ *   data       - unused (NPY_UNUSED)
+ *
+ * The stored value is ((npy_bool)(x != 0)), i.e. 1 for any nonzero input
+ * and 0 otherwise. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* One-element staging temporaries filled and drained with memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_int_to_bool\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. The outer `#if 0` group is dead; inside the outer `#else`
+ * only `dst_value = _CONVERT_FN(src_value);` is compiled.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Contiguous variant: step by element size; the strides are not read. */
+#if 1
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/*
+ * `!1` is 0, so the guard above is true for any value of
+ * NPY_USE_UNALIGNED_ACCESS: this variant is always compiled.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+/* Live branch: the `0 || 0` condition above is false. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: both conditions above are the literal 0. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Strided cast loop from npy_int to npy_ubyte for buffers that are
+ * aligned for their element types (checked with assert before the loop).
+ *
+ *   context    - not referenced in the body
+ *   args       - args[0] is the source pointer, args[1] the destination
+ *   dimensions - dimensions[0] is the number of elements to convert
+ *   strides    - strides[0] / strides[1] are the byte steps applied to
+ *                the source / destination pointer after each element
+ *   data       - unused (NPY_UNUSED)
+ *
+ * Each element is read and written directly through typed pointers; the
+ * stored value is a plain C cast of the source value to npy_ubyte.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Both declaration groups below are dead here: no temporaries exist. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the loop below dereferences src and dst as typed pointers */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_int_to_ubyte\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. The outer `#if 0` group is dead; inside the outer `#else`
+ * only the last branch is compiled: read src, convert, write dst in one
+ * statement.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Strided variant: step both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/*
+ * `!0` is 1, so the guard above reduces to !NPY_USE_UNALIGNED_ACCESS:
+ * this variant is compiled only when that macro evaluates to 0.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+/* Live branch: the `0 || 0` condition above is false. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: both conditions above are the literal 0. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Strided cast loop from npy_int to npy_ubyte that makes no alignment
+ * assumption: every element is moved through local temporaries with
+ * memmove instead of being dereferenced in place.
+ *
+ *   context    - not referenced in the body
+ *   args       - args[0] is the source pointer, args[1] the destination
+ *   dimensions - dimensions[0] is the number of elements to convert
+ *   strides    - strides[0] / strides[1] are the byte steps applied to
+ *                the source / destination pointer after each element
+ *   data       - unused (NPY_UNUSED)
+ *
+ * The stored value is a plain C cast of the source value to npy_ubyte.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_int_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* One-element staging temporaries filled and drained with memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_int_to_ubyte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. The outer `#if 0` group is dead; inside the outer `#else`
+ * only `dst_value = _CONVERT_FN(src_value);` is compiled.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Strided variant: step both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/*
+ * `!1` is 0, so the guard above is true for any value of
+ * NPY_USE_UNALIGNED_ACCESS: this variant is always compiled.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+/* Live branch: the `0 || 0` condition above is false. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: both conditions above are the literal 0. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Contiguous cast loop from npy_int to npy_ubyte for buffers that are
+ * aligned for their element types (checked with assert before the loop).
+ *
+ *   context    - not referenced in the body
+ *   args       - args[0] is the source pointer, args[1] the destination
+ *   dimensions - dimensions[0] is the number of elements to convert
+ *   strides    - not read; the pointers step by the element sizes
+ *   data       - unused (NPY_UNUSED)
+ *
+ * Each element is read and written directly through typed pointers; the
+ * stored value is a plain C cast of the source value to npy_ubyte.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Both declaration groups below are dead here: no temporaries exist. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the loop below dereferences src and dst as typed pointers */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_int_to_ubyte\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. The outer `#if 0` group is dead; inside the outer `#else`
+ * only the last branch is compiled: read src, convert, write dst in one
+ * statement.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Contiguous variant: step by element size; the strides are not read. */
+#if 1
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/*
+ * `!0` is 1, so the guard above reduces to !NPY_USE_UNALIGNED_ACCESS:
+ * this variant is compiled only when that macro evaluates to 0.
+ */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+/* Live branch: the `0 || 0` condition above is false. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: both conditions above are the literal 0. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Contiguous cast loop from npy_int to npy_ubyte that makes no alignment
+ * assumption: every element is moved through local temporaries with
+ * memmove instead of being dereferenced in place.
+ *
+ *   context    - not referenced in the body
+ *   args       - args[0] is the source pointer, args[1] the destination
+ *   dimensions - dimensions[0] is the number of elements to convert
+ *   strides    - not read; the pointers step by the element sizes
+ *   data       - unused (NPY_UNUSED)
+ *
+ * The stored value is a plain C cast of the source value to npy_ubyte.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* One-element staging temporaries filled and drained with memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_int_to_ubyte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/*
+ * Do the cast. The outer `#if 0` group is dead; inside the outer `#else`
+ * only `dst_value = _CONVERT_FN(src_value);` is compiled.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Contiguous variant: step by element size; the strides are not read. */
+#if 1
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: !1 is 0, so the guard is always true and this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+/* Active branch (the test above is 0): the plain C element types are used. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: every test here is 0, so _CONVERT_FN(x) ends up as the plain cast ((_TYPE2)x). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: cast dimensions[0] npy_int (args[0]) to npy_ushort (args[1]); returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* No scratch values: both #elif !1 tests are false, so src_value/dst_value do not exist here. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the typed pointer accesses in the loop rely on this alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_int_to_ushort\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the constant tests below compile only the direct typed load/store (last #else). */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Nothing is compiled from this block: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Strided: advance by the byte strides taken from strides[0] and strides[1]. */
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-expansion helper macros so that later code can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned variant: !0 is 1, so this block is compiled only if NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+/* Active branch (the test above is 0): the plain C element types are used. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: every test here is 0, so _CONVERT_FN(x) ends up as the plain cast ((_TYPE2)x). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Unaligned-safe strided loop: cast dimensions[0] npy_int (args[0]) to npy_ushort (args[1]); returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_int_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Scratch values: items are copied in and out with memmove() instead of typed pointer access. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_int_to_ushort\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the constant tests below compile only dst_value = _CONVERT_FN(src_value). */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store the result: the memmove() in the #else branch is what gets compiled. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Strided: advance by the byte strides taken from strides[0] and strides[1]. */
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-expansion helper macros so that later code can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: !1 is 0, so the guard is always true and this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+/* Active branch (the test above is 0): the plain C element types are used. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: every test here is 0, so _CONVERT_FN(x) ends up as the plain cast ((_TYPE2)x). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: cast dimensions[0] npy_int (args[0]) to npy_ushort (args[1]); returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* No scratch values: both #elif !1 tests are false, so src_value/dst_value do not exist here. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the typed pointer accesses in the loop rely on this alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_int_to_ushort\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the constant tests below compile only the direct typed load/store (last #else). */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Nothing is compiled from this block: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Contiguous: advance by the element sizes; strides[] is never read in this variant. */
+#if 1
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-expansion helper macros so that later code can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned variant: !0 is 1, so this block is compiled only if NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+/* Active branch (the test above is 0): the plain C element types are used. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: every test here is 0, so _CONVERT_FN(x) ends up as the plain cast ((_TYPE2)x). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Unaligned-safe contiguous loop: cast dimensions[0] npy_int (args[0]) to npy_ushort (args[1]); returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Scratch values: items are copied in and out with memmove() instead of typed pointer access. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_int_to_ushort\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the constant tests below compile only dst_value = _CONVERT_FN(src_value). */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store the result: the memmove() in the #else branch is what gets compiled. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Contiguous: advance by the element sizes; strides[] is never read in this variant. */
+#if 1
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-expansion helper macros so that later code can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: !1 is 0, so the guard is always true and this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+/* Active branch (the test above is 0): the plain C element types are used. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: every test here is 0, so _CONVERT_FN(x) ends up as the plain cast ((_TYPE2)x). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: cast dimensions[0] npy_int (args[0]) to npy_uint (args[1]); returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* No scratch values: both #elif !1 tests are false, so src_value/dst_value do not exist here. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the typed pointer accesses in the loop rely on this alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_int_to_uint\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the constant tests below compile only the direct typed load/store (last #else). */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Nothing is compiled from this block: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Strided: advance by the byte strides taken from strides[0] and strides[1]. */
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-expansion helper macros so that later code can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned variant: !0 is 1, so this block is compiled only if NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+/* Active branch (the test above is 0): the plain C element types are used. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: every test here is 0, so _CONVERT_FN(x) ends up as the plain cast ((_TYPE2)x). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Unaligned-safe strided loop: cast dimensions[0] npy_int (args[0]) to npy_uint (args[1]); returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_int_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Scratch values: items are copied in and out with memmove() instead of typed pointer access. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_int_to_uint\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the constant tests below compile only dst_value = _CONVERT_FN(src_value). */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store the result: the memmove() in the #else branch is what gets compiled. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Strided: advance by the byte strides taken from strides[0] and strides[1]. */
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-expansion helper macros so that later code can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: !1 is 0, so the guard is always true and this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+/* Active branch (the test above is 0): the plain C element types are used. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: every test here is 0, so _CONVERT_FN(x) ends up as the plain cast ((_TYPE2)x). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: cast dimensions[0] npy_int (args[0]) to npy_uint (args[1]); returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* No scratch values: both #elif !1 tests are false, so src_value/dst_value do not exist here. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the typed pointer accesses in the loop rely on this alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_int_to_uint\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the constant tests below compile only the direct typed load/store (last #else). */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Nothing is compiled from this block: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Contiguous: advance by the element sizes; strides[] is never read in this variant. */
+#if 1
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-expansion helper macros so that later code can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned variant: !0 is 1, so this block is compiled only if NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+/* Active branch (the test above is 0): the plain C element types are used. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: every test here is 0, so _CONVERT_FN(x) ends up as the plain cast ((_TYPE2)x). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Unaligned-safe contiguous loop: cast dimensions[0] npy_int (args[0]) to npy_uint (args[1]); returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Scratch values: items are copied in and out with memmove() instead of typed pointer access. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_int_to_uint\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the constant tests below compile only dst_value = _CONVERT_FN(src_value). */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store the result: the memmove() in the #else branch is what gets compiled. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Contiguous: advance by the element sizes; strides[] is never read in this variant. */
+#if 1
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-expansion helper macros so that later code can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: !1 is 0, so the guard is always true and this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+/* Active branch (the test above is 0): the plain C element types are used. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch: every test here is 0, so _CONVERT_FN(x) ends up as the plain cast ((_TYPE2)x). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: cast dimensions[0] npy_int (args[0]) to npy_ulong (args[1]); returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* No scratch values: both #elif !1 tests are false, so src_value/dst_value do not exist here. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the typed pointer accesses in the loop rely on this alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_int_to_ulong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the constant tests below compile only the direct typed load/store (last #else). */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Nothing is compiled from this block: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Strided: advance by the byte strides taken from strides[0] and strides[1]. */
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the per-expansion helper macros so that later code can define them again. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: this unaligned instance is dropped when that macro is true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instance: the substitute-typedef ladder below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else /* active branch: the element types are used directly */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* branch of npy_half*_to_* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* branch of npy_*_to_half* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0 /* nonzero-test form yielding npy_bool: compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop without alignment assumptions: cast dimensions[0] npy_int elements at args[0] to npy_ulong at args[1], stepping strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_int_to_ulong(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and the output */
+#if !0 /* strided instance: the per-element steps come from the caller */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* [2]-array scratch: compiled out */
+    _TYPE1 src_value[2];
+#elif !0 /* scalar scratch: compiled in, filled and drained with memmove below */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts: compiled out for this instance */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_int_to_ulong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* load by byte copy instead of dereferencing src as _TYPE1 */
+#endif
+
+/* Do the cast: exactly one statement in the ladder below is compiled in */
+#if 0 /* forms that read src_value[0]/[1]: compiled out */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value); /* active: cast between the two scratch values */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* store by byte copy instead of writing through a _TYPE2 pointer */
+#endif
+
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride; /* char pointers, so the strides are byte offsets */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* the three helper macros are scoped to this instance */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" is always false, so this aligned instance is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instance: the substitute-typedef ladder below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else /* active branch: the element types are used directly */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* branch of npy_half*_to_* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* branch of npy_*_to_half* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0 /* nonzero-test form yielding npy_bool: compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous aligned loop: cast dimensions[0] npy_int elements at args[0] to npy_ulong at args[1], stepping one element of each type; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_ulong(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides, /* strides is not read by this contiguous instance */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and the output */
+#if !1 /* contiguous instance: no stride variables are declared */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* [2]-array scratch: compiled out */
+    _TYPE1 src_value[2];
+#elif !1 /* scalar scratch: compiled out too, the cast goes straight through the pointers */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* alignment asserts: compiled in for this instance */
+   /* sanity check: only the starting pointers are tested, and only when N is non-zero */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_int_to_ulong\n");*/
+
+    while (N--) {
+#if 1 /* no separate load step: the only statements here are disabled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one statement in the ladder below is compiled in */
+#if 0 /* forms that read src_value[0]/[1]: compiled out */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src); /* active: typed load, cast, typed store */
+#  endif
+#endif
+
+#if 1 /* no separate store step: the only statements here are disabled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous: advance by the size of one element of each type */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* the three helper macros are scoped to this instance */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: this unaligned instance is dropped when that macro is true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instance: the substitute-typedef ladder below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else /* active branch: the element types are used directly */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* branch of npy_half*_to_* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* branch of npy_*_to_half* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0 /* nonzero-test form yielding npy_bool: compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop without alignment assumptions: cast dimensions[0] npy_int elements at args[0] to npy_ulong at args[1], stepping one element of each type; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_ulong(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides, /* strides is not read by this contiguous instance */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and the output */
+#if !1 /* contiguous instance: no stride variables are declared */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* [2]-array scratch: compiled out */
+    _TYPE1 src_value[2];
+#elif !0 /* scalar scratch: compiled in, filled and drained with memmove below */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts: compiled out for this instance */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_int_to_ulong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* load by byte copy instead of dereferencing src as _TYPE1 */
+#endif
+
+/* Do the cast: exactly one statement in the ladder below is compiled in */
+#if 0 /* forms that read src_value[0]/[1]: compiled out */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value); /* active: cast between the two scratch values */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* store by byte copy instead of writing through a _TYPE2 pointer */
+#endif
+
+#if 1 /* contiguous: advance by the size of one element of each type */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* the three helper macros are scoped to this instance */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" is always false, so this aligned instance is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instance: the substitute-typedef ladder below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else /* active branch: the element types are used directly */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* branch of npy_half*_to_* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* branch of npy_*_to_half* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0 /* nonzero-test form yielding npy_bool: compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided aligned loop: cast dimensions[0] npy_int elements at args[0] to npy_ulonglong at args[1], stepping strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and the output */
+#if !0 /* strided instance: the per-element steps come from the caller */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* [2]-array scratch: compiled out */
+    _TYPE1 src_value[2];
+#elif !1 /* scalar scratch: compiled out too, the cast goes straight through the pointers */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* alignment asserts: compiled in for this instance */
+   /* sanity check: only the starting pointers are tested, and only when N is non-zero */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_int_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1 /* no separate load step: the only statements here are disabled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one statement in the ladder below is compiled in */
+#if 0 /* forms that read src_value[0]/[1]: compiled out */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src); /* active: typed load, cast, typed store */
+#  endif
+#endif
+
+#if 1 /* no separate store step: the only statements here are disabled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride; /* char pointers, so the strides are byte offsets */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* the three helper macros are scoped to this instance */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: this unaligned instance is dropped when that macro is true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instance: the substitute-typedef ladder below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else /* active branch: the element types are used directly */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* branch of npy_half*_to_* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* branch of npy_*_to_half* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0 /* nonzero-test form yielding npy_bool: compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop without alignment assumptions: cast dimensions[0] npy_int elements at args[0] to npy_ulonglong at args[1], stepping strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_int_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and the output */
+#if !0 /* strided instance: the per-element steps come from the caller */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* [2]-array scratch: compiled out */
+    _TYPE1 src_value[2];
+#elif !0 /* scalar scratch: compiled in, filled and drained with memmove below */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts: compiled out for this instance */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_int_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* load by byte copy instead of dereferencing src as _TYPE1 */
+#endif
+
+/* Do the cast: exactly one statement in the ladder below is compiled in */
+#if 0 /* forms that read src_value[0]/[1]: compiled out */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value); /* active: cast between the two scratch values */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* store by byte copy instead of writing through a _TYPE2 pointer */
+#endif
+
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride; /* char pointers, so the strides are byte offsets */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* the three helper macros are scoped to this instance */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" is always false, so this aligned instance is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instance: the substitute-typedef ladder below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else /* active branch: the element types are used directly */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* branch of npy_half*_to_* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* branch of npy_*_to_half* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0 /* nonzero-test form yielding npy_bool: compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous aligned loop: cast dimensions[0] npy_int elements at args[0] to npy_ulonglong at args[1], stepping one element of each type; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides, /* strides is not read by this contiguous instance */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and the output */
+#if !1 /* contiguous instance: no stride variables are declared */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* [2]-array scratch: compiled out */
+    _TYPE1 src_value[2];
+#elif !1 /* scalar scratch: compiled out too, the cast goes straight through the pointers */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* alignment asserts: compiled in for this instance */
+   /* sanity check: only the starting pointers are tested, and only when N is non-zero */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_int_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1 /* no separate load step: the only statements here are disabled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one statement in the ladder below is compiled in */
+#if 0 /* forms that read src_value[0]/[1]: compiled out */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src); /* active: typed load, cast, typed store */
+#  endif
+#endif
+
+#if 1 /* no separate store step: the only statements here are disabled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous: advance by the size of one element of each type */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* the three helper macros are scoped to this instance */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: this unaligned instance is dropped when that macro is true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instance: the substitute-typedef ladder below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else /* active branch: the element types are used directly */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* branch of npy_half*_to_* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* branch of npy_*_to_half* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0 /* nonzero-test form yielding npy_bool: compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop without alignment assumptions: cast dimensions[0] npy_int elements at args[0] to npy_ulonglong at args[1], stepping one element of each type; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides, /* strides is not read by this contiguous instance */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and the output */
+#if !1 /* contiguous instance: no stride variables are declared */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* [2]-array scratch: compiled out */
+    _TYPE1 src_value[2];
+#elif !0 /* scalar scratch: compiled in, filled and drained with memmove below */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts: compiled out for this instance */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_int_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* load by byte copy instead of dereferencing src as _TYPE1 */
+#endif
+
+/* Do the cast: exactly one statement in the ladder below is compiled in */
+#if 0 /* forms that read src_value[0]/[1]: compiled out */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value); /* active: cast between the two scratch values */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* store by byte copy instead of writing through a _TYPE2 pointer */
+#endif
+
+#if 1 /* contiguous: advance by the size of one element of each type */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* the three helper macros are scoped to this instance */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" is always false, so this aligned instance is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instance: the substitute-typedef ladder below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else /* active branch: the element types are used directly */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* branch of npy_half*_to_* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* branch of npy_*_to_half* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0 /* nonzero-test form yielding npy_bool: compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided aligned loop: cast dimensions[0] npy_int elements at args[0] to npy_byte at args[1], stepping strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_byte(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and the output */
+#if !0 /* strided instance: the per-element steps come from the caller */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* [2]-array scratch: compiled out */
+    _TYPE1 src_value[2];
+#elif !1 /* scalar scratch: compiled out too, the cast goes straight through the pointers */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* alignment asserts: compiled in for this instance */
+   /* sanity check: only the starting pointers are tested, and only when N is non-zero */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_int_to_byte\n");*/
+
+    while (N--) {
+#if 1 /* no separate load step: the only statements here are disabled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one statement in the ladder below is compiled in */
+#if 0 /* forms that read src_value[0]/[1]: compiled out */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src); /* active: typed load, cast, typed store */
+#  endif
+#endif
+
+#if 1 /* no separate store step: the only statements here are disabled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride; /* char pointers, so the strides are byte offsets */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* the three helper macros are scoped to this instance */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: this unaligned instance is dropped when that macro is true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instance: the substitute-typedef ladder below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else /* active branch: the element types are used directly */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* branch of npy_half*_to_* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* branch of npy_*_to_half* converters: compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0 /* nonzero-test form yielding npy_bool: compiled out */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop without alignment assumptions: cast dimensions[0] npy_int elements at args[0] to npy_byte at args[1], stepping strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_int_to_byte(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and the output */
+#if !0 /* strided instance: the per-element steps come from the caller */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0 /* [2]-array scratch: compiled out */
+    _TYPE1 src_value[2];
+#elif !0 /* scalar scratch: compiled in, filled and drained with memmove below */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts: compiled out for this instance */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_int_to_byte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* load by byte copy instead of dereferencing src as _TYPE1 */
+#endif
+
+/* Do the cast: exactly one statement in the ladder below is compiled in */
+#if 0 /* forms that read src_value[0]/[1]: compiled out */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value); /* active: cast between the two scratch values */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* store by byte copy instead of writing through a _TYPE2 pointer */
+#endif
+
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride; /* char pointers, so the strides are byte offsets */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* the three helper macros are scoped to this instance */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `!1` is 0, so the guard above is always true and this variant is always built. */
+
+/* Source and destination element types of this specialization. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_byte
+
+/* The conversion is a plain C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Aligned, contiguous cast loop: npy_int -> npy_byte.
+ *
+ * args[0]/args[1] are the source/destination buffers and dimensions[0] is
+ * the element count.  strides is not read: the pointers advance by
+ * sizeof(npy_int) and sizeof(npy_byte) respectively.  Both pointers are
+ * asserted to be aligned for their element type (unless the count is 0) and
+ * are then dereferenced directly.  context and data are not read.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        /* typed load, convert, typed store in one step */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_int);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* `!0` is 1, so this variant is built only when NPY_USE_UNALIGNED_ACCESS is 0. */
+
+/* Source and destination element types of this specialization. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_byte
+
+/* The conversion is a plain C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Contiguous cast loop without an alignment assumption: npy_int -> npy_byte.
+ *
+ * args[0]/args[1] are the source/destination buffers and dimensions[0] is
+ * the element count.  strides is not read: the pointers advance by
+ * sizeof(npy_int) and sizeof(npy_byte) respectively.  Every element travels
+ * through a local temporary via memmove, so neither buffer is dereferenced
+ * as a typed pointer.  context and data are not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    for (; N != 0; N--) {
+        /* byte-wise load, so no alignment is required of the source */
+        memmove(&loaded, src, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        /* byte-wise store, so no alignment is required of the destination */
+        memmove(dst, &converted, sizeof(converted));
+
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_int);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `!1` is 0, so the guard above is always true and this variant is always built. */
+
+/* Source and destination element types of this specialization. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_short
+
+/* The conversion is a plain C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Aligned, strided cast loop: npy_int -> npy_short.
+ *
+ * args[0]/args[1] are the source/destination buffers, dimensions[0] is the
+ * element count, and strides[0]/strides[1] are the byte steps added to the
+ * source/destination pointers after each element.  Both pointers are
+ * asserted to be aligned for their element type (unless the count is 0) and
+ * are then dereferenced directly.  context and data are not read.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp step_in = strides[0];
+    const npy_intp step_out = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        /* typed load, convert, typed store in one step */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        dst += step_out;
+        src += step_in;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* `!0` is 1, so this variant is built only when NPY_USE_UNALIGNED_ACCESS is 0. */
+
+/* Source and destination element types of this specialization. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_short
+
+/* The conversion is a plain C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast loop without an alignment assumption: npy_int -> npy_short.
+ *
+ * args[0]/args[1] are the source/destination buffers, dimensions[0] is the
+ * element count, and strides[0]/strides[1] are the byte steps added to the
+ * source/destination pointers after each element.  Every element travels
+ * through a local temporary via memmove, so neither buffer is dereferenced
+ * as a typed pointer.  context and data are not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_int_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp step_in = strides[0];
+    const npy_intp step_out = strides[1];
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    for (; N != 0; N--) {
+        /* byte-wise load, so no alignment is required of the source */
+        memmove(&loaded, src, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        /* byte-wise store, so no alignment is required of the destination */
+        memmove(dst, &converted, sizeof(converted));
+
+        dst += step_out;
+        src += step_in;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `!1` is 0, so the guard above is always true and this variant is always built. */
+
+/* Source and destination element types of this specialization. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_short
+
+/* The conversion is a plain C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Aligned, contiguous cast loop: npy_int -> npy_short.
+ *
+ * args[0]/args[1] are the source/destination buffers and dimensions[0] is
+ * the element count.  strides is not read: the pointers advance by
+ * sizeof(npy_int) and sizeof(npy_short) respectively.  Both pointers are
+ * asserted to be aligned for their element type (unless the count is 0) and
+ * are then dereferenced directly.  context and data are not read.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        /* typed load, convert, typed store in one step */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        dst += sizeof(npy_short);
+        src += sizeof(npy_int);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* `!0` is 1, so this variant is built only when NPY_USE_UNALIGNED_ACCESS is 0. */
+
+/* Source and destination element types of this specialization. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_short
+
+/* The conversion is a plain C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Contiguous cast loop without an alignment assumption: npy_int -> npy_short.
+ *
+ * args[0]/args[1] are the source/destination buffers and dimensions[0] is
+ * the element count.  strides is not read: the pointers advance by
+ * sizeof(npy_int) and sizeof(npy_short) respectively.  Every element travels
+ * through a local temporary via memmove, so neither buffer is dereferenced
+ * as a typed pointer.  context and data are not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    for (; N != 0; N--) {
+        /* byte-wise load, so no alignment is required of the source */
+        memmove(&loaded, src, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        /* byte-wise store, so no alignment is required of the destination */
+        memmove(dst, &converted, sizeof(converted));
+
+        dst += sizeof(npy_short);
+        src += sizeof(npy_int);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `!1` is 0, so the guard above is always true and this variant is always built. */
+
+/* Source and destination element types of this specialization (identical). */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_int
+
+/* The conversion is a plain C cast; with equal types it keeps the value as is. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Aligned, strided cast loop: npy_int -> npy_int.
+ *
+ * args[0]/args[1] are the source/destination buffers, dimensions[0] is the
+ * element count, and strides[0]/strides[1] are the byte steps added to the
+ * source/destination pointers after each element.  Both pointers are
+ * asserted to be aligned for their element type (unless the count is 0) and
+ * are then dereferenced directly.  context and data are not read.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp step_in = strides[0];
+    const npy_intp step_out = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        /* typed load, convert, typed store in one step */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        dst += step_out;
+        src += step_in;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* `!0` is 1, so this variant is built only when NPY_USE_UNALIGNED_ACCESS is 0. */
+
+/* Source and destination element types of this specialization (identical). */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_int
+
+/* The conversion is a plain C cast; with equal types it keeps the value as is. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast loop without an alignment assumption: npy_int -> npy_int.
+ *
+ * args[0]/args[1] are the source/destination buffers, dimensions[0] is the
+ * element count, and strides[0]/strides[1] are the byte steps added to the
+ * source/destination pointers after each element.  Every element travels
+ * through a local temporary via memmove, so neither buffer is dereferenced
+ * as a typed pointer.  context and data are not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_int_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp step_in = strides[0];
+    const npy_intp step_out = strides[1];
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    for (; N != 0; N--) {
+        /* byte-wise load, so no alignment is required of the source */
+        memmove(&loaded, src, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        /* byte-wise store, so no alignment is required of the destination */
+        memmove(dst, &converted, sizeof(converted));
+
+        dst += step_out;
+        src += step_in;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `!1` is 0, so the guard above is always true and this variant is always built. */
+
+/* Source and destination element types of this specialization (identical). */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_int
+
+/* The conversion is a plain C cast; with equal types it keeps the value as is. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Aligned, contiguous cast loop: npy_int -> npy_int.
+ *
+ * args[0]/args[1] are the source/destination buffers and dimensions[0] is
+ * the element count.  strides is not read: both pointers advance by
+ * sizeof(npy_int).  Both pointers are asserted to be aligned for their
+ * element type (unless the count is 0) and are then dereferenced directly.
+ * context and data are not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        /* typed load, convert, typed store in one step */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        dst += sizeof(npy_int);
+        src += sizeof(npy_int);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* not taken for this type pair */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else  /* taken: plain C types for source and destination */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_int
+
+#endif  /* _TYPE1 and _TYPE2 selection */
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken: branch using the npy_halfbits_to_XXX and npy_half_to_float helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken: branch using the npy_XXX_to_halfbits and npy_float_to_half helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* not taken: (x != 0) truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif  /* _CONVERT_FN selection */
+/* Contiguous npy_int -> npy_int cast loop with no alignment requirement: converts dimensions[0] items, returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_int(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides is not read */
+        NpyAuxData *NPY_UNUSED(data))  /* data is unused */
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !1  /* not compiled: the contiguous variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* active: local copy of one source item */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local holding one converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not compiled: this variant makes no alignment assumption */
+   /* sanity check: pointers must be aligned for their element types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_int_to_int\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 0  /* not taken */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte copy into the local, valid for any src alignment */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not compiled: two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-element source */
+#  if 0  /* not compiled: two-element destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert the staged value */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* not compiled */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* not taken */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte copy of the result into dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: contiguous, step by element size */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_int);
+#else  /* not compiled */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* not taken for this type pair */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* taken: plain C types for source and destination */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_long
+
+#endif  /* _TYPE1 and _TYPE2 selection */
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken: branch using the npy_halfbits_to_XXX and npy_half_to_float helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken: branch using the npy_XXX_to_halfbits and npy_float_to_half helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* not taken: (x != 0) truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif  /* _CONVERT_FN selection */
+/* Strided npy_int -> npy_long cast loop for aligned data: converts dimensions[0] items, returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_long(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides[0] and strides[1] = byte steps for src and dst */
+        NpyAuxData *NPY_UNUSED(data))  /* data is unused */
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !0  /* active: strided variant reads the per-item byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* not compiled: the aligned variant uses no temporary */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* not compiled: the aligned variant uses no temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active: alignment checks via assert */
+   /* sanity check: pointers must be aligned for their element types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_int_to_long\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 1  /* taken, but empty: the inner two-element load is off */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* not compiled */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not compiled: two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-element source */
+#  if 0  /* not compiled: two-element destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* not compiled */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: typed load, convert, typed store */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but empty: the result was already stored above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* not compiled */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* not compiled */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_int);
+#else  /* active: step by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* not taken for this type pair */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* taken: plain C types for source and destination */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_long
+
+#endif  /* _TYPE1 and _TYPE2 selection */
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken: branch using the npy_halfbits_to_XXX and npy_half_to_float helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken: branch using the npy_XXX_to_halfbits and npy_float_to_half helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* not taken: (x != 0) truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif  /* _CONVERT_FN selection */
+/* Strided npy_int -> npy_long cast loop with no alignment requirement: converts dimensions[0] items, returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_int_to_long(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides[0] and strides[1] = byte steps for src and dst */
+        NpyAuxData *NPY_UNUSED(data))  /* data is unused */
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !0  /* active: strided variant reads the per-item byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* active: local copy of one source item */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local holding one converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not compiled: this variant makes no alignment assumption */
+   /* sanity check: pointers must be aligned for their element types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_int_to_long\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 0  /* not taken */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte copy into the local, valid for any src alignment */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not compiled: two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-element source */
+#  if 0  /* not compiled: two-element destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert the staged value */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* not compiled */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* not taken */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte copy of the result into dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* not compiled */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_int);
+#else  /* active: step by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* not taken for this type pair */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* taken: plain C types for source and destination */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_long
+
+#endif  /* _TYPE1 and _TYPE2 selection */
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken: branch using the npy_halfbits_to_XXX and npy_half_to_float helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken: branch using the npy_XXX_to_halfbits and npy_float_to_half helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* not taken: (x != 0) truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif  /* _CONVERT_FN selection */
+/* Contiguous npy_int -> npy_long cast loop for aligned data: converts dimensions[0] items, returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_long(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides is not read */
+        NpyAuxData *NPY_UNUSED(data))  /* data is unused */
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !1  /* not compiled: the contiguous variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* not compiled: the aligned variant uses no temporary */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* not compiled: the aligned variant uses no temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active: alignment checks via assert */
+   /* sanity check: pointers must be aligned for their element types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_int_to_long\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 1  /* taken, but empty: the inner two-element load is off */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* not compiled */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not compiled: two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-element source */
+#  if 0  /* not compiled: two-element destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* not compiled */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: typed load, convert, typed store */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but empty: the result was already stored above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* not compiled */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: contiguous, step by element size */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_int);
+#else  /* not compiled */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* not taken for this type pair */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* taken: plain C types for source and destination */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_long
+
+#endif  /* _TYPE1 and _TYPE2 selection */
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken: branch using the npy_halfbits_to_XXX and npy_half_to_float helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken: branch using the npy_XXX_to_halfbits and npy_float_to_half helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* not taken: (x != 0) truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif  /* _CONVERT_FN selection */
+/* Contiguous npy_int -> npy_long cast loop with no alignment requirement: converts dimensions[0] items, returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_long(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides is not read */
+        NpyAuxData *NPY_UNUSED(data))  /* data is unused */
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !1  /* not compiled: the contiguous variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* active: local copy of one source item */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local holding one converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not compiled: this variant makes no alignment assumption */
+   /* sanity check: pointers must be aligned for their element types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_int_to_long\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 0  /* not taken */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte copy into the local, valid for any src alignment */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not compiled: two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-element source */
+#  if 0  /* not compiled: two-element destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert the staged value */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* not compiled */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* not taken */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte copy of the result into dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: contiguous, step by element size */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_int);
+#else  /* not compiled */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* not taken for this type pair */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* taken: plain C types for source and destination */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_longlong
+
+#endif  /* _TYPE1 and _TYPE2 selection */
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken: branch using the npy_halfbits_to_XXX and npy_half_to_float helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken: branch using the npy_XXX_to_halfbits and npy_float_to_half helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* not taken: (x != 0) truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif  /* _CONVERT_FN selection */
+/* Strided npy_int -> npy_longlong cast loop for aligned data: converts dimensions[0] items, returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides[0] and strides[1] = byte steps for src and dst */
+        NpyAuxData *NPY_UNUSED(data))  /* data is unused */
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !0  /* active: strided variant reads the per-item byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* not compiled: the aligned variant uses no temporary */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* not compiled: the aligned variant uses no temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active: alignment checks via assert */
+   /* sanity check: pointers must be aligned for their element types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_int_to_longlong\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 1  /* taken, but empty: the inner two-element load is off */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* not compiled */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not compiled: two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-element source */
+#  if 0  /* not compiled: two-element destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* not compiled */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: typed load, convert, typed store */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but empty: the result was already stored above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* not compiled */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* not compiled */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_int);
+#else  /* active: step by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* not taken for this type pair */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* taken: plain C types for source and destination */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_longlong
+
+#endif  /* _TYPE1 and _TYPE2 selection */
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken: branch using the npy_halfbits_to_XXX and npy_half_to_float helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken: branch using the npy_XXX_to_halfbits and npy_float_to_half helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* not taken: (x != 0) truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif  /* _CONVERT_FN selection */
+/* Strided npy_int -> npy_longlong cast loop with no alignment requirement: converts dimensions[0] items, returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_int_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides[0] and strides[1] = byte steps for src and dst */
+        NpyAuxData *NPY_UNUSED(data))  /* data is unused */
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !0  /* active: strided variant reads the per-item byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* active: local copy of one source item */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local holding one converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not compiled: this variant makes no alignment assumption */
+   /* sanity check: pointers must be aligned for their element types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_int_to_longlong\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 0  /* not taken */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte copy into the local, valid for any src alignment */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not compiled: two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-element source */
+#  if 0  /* not compiled: two-element destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert the staged value */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* not compiled */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* not taken */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte copy of the result into dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* not compiled */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_int);
+#else  /* active: step by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* not taken for this type pair */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* taken: plain C types for source and destination */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_longlong
+
+#endif  /* _TYPE1 and _TYPE2 selection */
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken: branch using the npy_halfbits_to_XXX and npy_half_to_float helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken: branch using the npy_XXX_to_halfbits and npy_float_to_half helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* not taken: (x != 0) truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif  /* _CONVERT_FN selection */
+/* Contiguous npy_int -> npy_longlong cast loop for aligned data: converts dimensions[0] items, returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides is not read */
+        NpyAuxData *NPY_UNUSED(data))  /* data is unused */
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !1  /* not compiled: the contiguous variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* not compiled: the aligned variant uses no temporary */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* not compiled: the aligned variant uses no temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active: alignment checks via assert */
+   /* sanity check: pointers must be aligned for their element types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_int_to_longlong\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 1  /* taken, but empty: the inner two-element load is off */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* not compiled */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not compiled: two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-element source */
+#  if 0  /* not compiled: two-element destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* not compiled */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: typed load, convert, typed store */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but empty: the result was already stored above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* not compiled */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: contiguous, step by element size */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_int);
+#else  /* not compiled */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* not taken for this type pair */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* taken: plain C types for source and destination */
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_longlong
+
+#endif  /* _TYPE1 and _TYPE2 selection */
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken: branch using the npy_halfbits_to_XXX and npy_half_to_float helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken: branch using the npy_XXX_to_halfbits and npy_float_to_half helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0  /* not taken: (x != 0) truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif  /* _CONVERT_FN selection */
+/* Contiguous npy_int -> npy_longlong cast loop with no alignment requirement: converts dimensions[0] items, returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = item count; strides is not read */
+        NpyAuxData *NPY_UNUSED(data))  /* data is unused */
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !1  /* not compiled: the contiguous variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* active: local copy of one source item */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local holding one converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not compiled: this variant makes no alignment assumption */
+   /* sanity check: pointers must be aligned for their element types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_int_to_longlong\n");*/
+
+    while (N--) {  /* one item per iteration */
+#if 0  /* not taken */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte copy into the local, valid for any src alignment */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not compiled: two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken: single-element source */
+#  if 0  /* not compiled: two-element destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert the staged value */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* not compiled */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* not taken */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte copy of the result into dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: contiguous, step by element size */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_int);
+#else  /* not compiled */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard is always true for this variant: "!1" is 0, so NPY_USE_UNALIGNED_ACCESS does not matter. */
+/* For half types, don't use actual double/float types in conversion; here _TYPE1 is npy_int and _TYPE2 is npy_half */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function; selected here: npy_float_to_half((float)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: casts dimensions[0] npy_int items at args[0] to npy_half at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No src_value/dst_value temporaries are declared in this variant. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must meet the alignment of _TYPE1/_TYPE2 (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_int_to_half\n");*/
+    /* Element loop; the load step is empty in this variant because the cast reads src directly. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts *(_TYPE1 *)src straight into *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store step is empty here: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the caller-supplied strides; src and dst are char *, so these are byte offsets. */
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: "!0" is 1, so this variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion; here _TYPE1 is npy_int and _TYPE2 is npy_half */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function; selected here: npy_float_to_half((float)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop without alignment asserts: casts dimensions[0] npy_int items at args[0] to npy_half at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_int_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries: each element is staged through these via memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the enclosing #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_int_to_half\n");*/
+    /* Element loop; each item is first copied into src_value with memmove. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value out to dst with memmove. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the caller-supplied strides; src and dst are char *, so these are byte offsets. */
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard is always true for this variant: "!1" is 0, so NPY_USE_UNALIGNED_ACCESS does not matter. */
+/* For half types, don't use actual double/float types in conversion; here _TYPE1 is npy_int and _TYPE2 is npy_half */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function; selected here: npy_float_to_half((float)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: casts dimensions[0] npy_int items at args[0] to npy_half at args[1]; strides is not read; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No stride locals and no src_value/dst_value temporaries are declared in this variant. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must meet the alignment of _TYPE1/_TYPE2 (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_int_to_half\n");*/
+    /* Element loop; the load step is empty in this variant because the cast reads src directly. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts *(_TYPE1 *)src straight into *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store step is empty here: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous variant: step by the element sizes instead of strides. */
+#if 1
+        dst += sizeof(npy_half);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: "!0" is 1, so this variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion; here _TYPE1 is npy_int and _TYPE2 is npy_half */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function; selected here: npy_float_to_half((float)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop without alignment asserts: casts dimensions[0] npy_int items at args[0] to npy_half at args[1]; strides is not read; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries: each element is staged through these via memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the enclosing #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_int_to_half\n");*/
+    /* Element loop; each item is first copied into src_value with memmove. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value out to dst with memmove. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous variant: step by the element sizes instead of strides. */
+#if 1
+        dst += sizeof(npy_half);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard is always true for this variant: "!1" is 0, so NPY_USE_UNALIGNED_ACCESS does not matter. */
+/* For half types, don't use actual double/float types in conversion; the condition is false here, so the #else defines apply: npy_int and npy_float */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function; selected here: the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: casts dimensions[0] npy_int items at args[0] to npy_float at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No src_value/dst_value temporaries are declared in this variant. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must meet the alignment of _TYPE1/_TYPE2 (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_int_to_float\n");*/
+    /* Element loop; the load step is empty in this variant because the cast reads src directly. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts *(_TYPE1 *)src straight into *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store step is empty here: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the caller-supplied strides; src and dst are char *, so these are byte offsets. */
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: "!0" is 1, so this variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion; the condition is false here, so the #else defines apply: npy_int and npy_float */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function; selected here: the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop without alignment asserts: casts dimensions[0] npy_int items at args[0] to npy_float at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_int_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries: each element is staged through these via memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the enclosing #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_int_to_float\n");*/
+    /* Element loop; each item is first copied into src_value with memmove. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value out to dst with memmove. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the caller-supplied strides; src and dst are char *, so these are byte offsets. */
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard is always true for this variant: "!1" is 0, so NPY_USE_UNALIGNED_ACCESS does not matter. */
+/* For half types, don't use actual double/float types in conversion; the condition is false here, so the #else defines apply: npy_int and npy_float */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function; selected here: the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: casts dimensions[0] npy_int items at args[0] to npy_float at args[1]; strides is not read; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No stride locals and no src_value/dst_value temporaries are declared in this variant. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must meet the alignment of _TYPE1/_TYPE2 (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_int_to_float\n");*/
+    /* Element loop; the load step is empty in this variant because the cast reads src directly. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts *(_TYPE1 *)src straight into *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store step is empty here: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous variant: step by the element sizes instead of strides. */
+#if 1
+        dst += sizeof(npy_float);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: "!0" is 1, so this variant is compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion; the condition is false here, so the #else defines apply: npy_int and npy_float */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function; selected here: the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop without alignment asserts: casts dimensions[0] npy_int items at args[0] to npy_float at args[1]; strides is not read; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries: each element is staged through these via memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant by the enclosing #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_int_to_float\n");*/
+    /* Element loop; each item is first copied into src_value with memmove. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value out to dst with memmove. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous variant: step by the element sizes instead of strides. */
+#if 1
+        dst += sizeof(npy_float);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard is always true for this variant: "!1" is 0, so NPY_USE_UNALIGNED_ACCESS does not matter. */
+/* For half types, don't use actual double/float types in conversion; the condition is false here, so the #else defines apply: npy_int and npy_double */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function; selected here: the plain C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: casts dimensions[0] npy_int items at args[0] to npy_double at args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No src_value/dst_value temporaries are declared in this variant. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must meet the alignment of _TYPE1/_TYPE2 (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_int_to_double\n");*/
+    /* Element loop; the load step is empty in this variant because the cast reads src directly. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts *(_TYPE1 *)src straight into *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store step is empty here: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the caller-supplied strides; src and dst are char *, so these are byte offsets. */
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The guard reduces to !NPY_USE_UNALIGNED_ACCESS: this loop is compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the test below is constant false) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else is taken: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts npy_int elements to npy_double, staging each element through locals with memmove and stepping by the caller's byte strides. */
+static NPY_GCC_OPT_3 int
+_cast_int_to_double(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0]/strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_int_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the branch compiled here converts src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* only return path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* The guard reduces to !(NPY_USE_UNALIGNED_ACCESS && 0), i.e. it is always true. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the test below is constant false) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else is taken: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts npy_int elements to npy_double with direct typed loads and stores, stepping by sizeof of each element type. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_double(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when there are elements to process, both pointers must be aligned for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_int_to_double\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the branch compiled here reads *src and writes *dst directly */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_double);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* only return path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The guard reduces to !NPY_USE_UNALIGNED_ACCESS: this loop is compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the test below is constant false) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else is taken: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts npy_int elements to npy_double, staging each element through locals with memmove and stepping by sizeof of each element type. */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_double(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_int_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the branch compiled here converts src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_double);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* only return path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* The guard reduces to !(NPY_USE_UNALIGNED_ACCESS && 0), i.e. it is always true. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the test below is constant false) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else is taken: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts npy_int elements to npy_longdouble with direct typed loads and stores, stepping by the caller's byte strides. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0]/strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when there are elements to process, both pointers must be aligned for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_int_to_longdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the branch compiled here reads *src and writes *dst directly */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* only return path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The guard reduces to !NPY_USE_UNALIGNED_ACCESS: this loop is compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the test below is constant false) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else is taken: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts npy_int elements to npy_longdouble, staging each element through locals with memmove and stepping by the caller's byte strides. */
+static NPY_GCC_OPT_3 int
+_cast_int_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0]/strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_int_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the branch compiled here converts src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* only return path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* The guard reduces to !(NPY_USE_UNALIGNED_ACCESS && 0), i.e. it is always true. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the test below is constant false) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else is taken: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts npy_int elements to npy_longdouble with direct typed loads and stores, stepping by sizeof of each element type. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when there are elements to process, both pointers must be aligned for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_int_to_longdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the branch compiled here reads *src and writes *dst directly */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* only return path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The guard reduces to !NPY_USE_UNALIGNED_ACCESS: this loop is compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the test below is constant false) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else is taken: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts npy_int elements to npy_longdouble, staging each element through locals with memmove and stepping by sizeof of each element type. */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_int_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the branch compiled here converts src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* only return path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* The guard reduces to !(NPY_USE_UNALIGNED_ACCESS && 0), i.e. it is always true. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the test below is constant false) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else is taken: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts each npy_int element into two npy_float values (the converted value, then 0) using a typed load and typed stores, stepping by the caller's byte strides. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0]/strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when there are elements to process, both pointers must be aligned for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_int_to_cfloat\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the branch compiled here converts *src into dst_value[0] and zeroes dst_value[1] */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0; /* second component is always zero (presumably the imaginary part -- confirm) */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* only return path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* The guard reduces to !NPY_USE_UNALIGNED_ACCESS: this loop is compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the test below is constant false) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else is taken: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts each npy_int element into two npy_float values (the converted value, then 0), staging data through locals with memmove and stepping by the caller's byte strides. */
+static NPY_GCC_OPT_3 int
+_cast_int_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0]/strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_int_to_cfloat\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the branch compiled here converts src_value into dst_value[0] and zeroes dst_value[1] */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0; /* second component is always zero (presumably the imaginary part -- confirm) */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* only return path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_int items to (real, 0) npy_float pairs; aligned + contiguous; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst are dereferenced as _TYPE1/_TYPE2 below, so both must be aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_int_to_cfloat\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: live arm sets dst_value[0] = (_TYPE2)source and dst_value[1] = 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_int items to (real, 0) npy_float pairs; memmove-based, contiguous; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out (#if 0) here; this variant moves data with memmove instead */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_int_to_cfloat\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: live arm sets dst_value[0] = (_TYPE2)src_value and dst_value[1] = 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_int items to (real, 0) npy_double pairs; aligned, strided; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst are dereferenced as _TYPE1/_TYPE2 below, so both must be aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_int_to_cdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: live arm sets dst_value[0] = (_TYPE2)source and dst_value[1] = 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_int items to (real, 0) npy_double pairs; memmove-based, strided; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_int_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out (#if 0) here; this variant moves data with memmove instead */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_int_to_cdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: live arm sets dst_value[0] = (_TYPE2)src_value and dst_value[1] = 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_int items to (real, 0) npy_double pairs; aligned + contiguous; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst are dereferenced as _TYPE1/_TYPE2 below, so both must be aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_int_to_cdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: live arm sets dst_value[0] = (_TYPE2)source and dst_value[1] = 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_int items to (real, 0) npy_double pairs; memmove-based, contiguous; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out (#if 0) here; this variant moves data with memmove instead */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_int_to_cdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: live arm sets dst_value[0] = (_TYPE2)src_value and dst_value[1] = 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_int items to (real, 0) npy_longdouble pairs; aligned, strided; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_int_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst are dereferenced as _TYPE1/_TYPE2 below, so both must be aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_int_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: live arm sets dst_value[0] = (_TYPE2)source and dst_value[1] = 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_int items to (real, 0) npy_longdouble pairs; memmove-based, strided; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_int_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out (#if 0) here; this variant moves data with memmove instead */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_int_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: live arm sets dst_value[0] = (_TYPE2)src_value and dst_value[1] = 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_int
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_int items to (real, 0) npy_longdouble pairs; aligned + contiguous; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_int_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst are dereferenced as _TYPE1/_TYPE2 below, so both must be aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_int_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: live arm sets dst_value[0] = (_TYPE2)source and dst_value[1] = 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* `!0` is 1, so this variant is compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_int
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* Active branch (the half-type branch above is disabled): _TYPE1 is read from src; two _TYPE2 values are written per output element. */
+#define _TYPE1 npy_int
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else group is compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active group: the conversion is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cast loop (no alignment assumed): converts dimensions[0] npy_int items at args[0] into npy_clongdouble items at args[1], writing 0 as the second (presumably imaginary) component; strides is not read; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_int_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scratch copies: one source value and a two-value destination buffer, filled and emptied with memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_int_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts src_value into dst_value[0] and sets dst_value[1] to 0 */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store step: copy both converted values to dst bytewise (memmove needs no alignment). */
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by exactly one element of their respective types. */
+#if 1
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_int);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `!1` is 0, so the guard above is always true and this aligned variant is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+/* Active branch (the half-type branch above is disabled): _TYPE1 is read from src, _TYPE2 is written to dst. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else group is compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active group: the conversion yields 1 for a nonzero input and 0 otherwise. */
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cast loop for aligned data: converts dimensions[0] npy_long items at args[0] into npy_bool items at args[1], stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_long_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No scratch variables are declared in this variant: elements are accessed through typed pointers. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must satisfy the alignment of their element types (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_long_to_bool\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch reads src and writes dst directly through typed pointers */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store step is compiled here: the cast above already wrote to dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* `!0` is 1, so this variant is compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+/* Active branch (the half-type branch above is disabled): _TYPE1 is read from src, _TYPE2 is written to dst. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else group is compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active group: the conversion yields 1 for a nonzero input and 0 otherwise. */
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cast loop (no alignment assumed): converts dimensions[0] npy_long items at args[0] into npy_bool items at args[1], stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_long_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scratch copies: each element is moved through these locals with memmove instead of typed pointer access. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_long_to_bool\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts the scratch copy src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store step: copy the converted value to dst bytewise (memmove needs no alignment). */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `!1` is 0, so the guard above is always true and this aligned variant is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+/* Active branch (the half-type branch above is disabled): _TYPE1 is read from src, _TYPE2 is written to dst. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else group is compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active group: the conversion yields 1 for a nonzero input and 0 otherwise. */
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cast loop for aligned data: converts dimensions[0] npy_long items at args[0] into npy_bool items at args[1]; strides is not read; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_long_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No scratch variables are declared in this variant: elements are accessed through typed pointers. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must satisfy the alignment of their element types (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_long_to_bool\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch reads src and writes dst directly through typed pointers */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store step is compiled here: the cast above already wrote to dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by exactly one element of their respective types. */
+#if 1
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* `!0` is 1, so this variant is compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+/* Active branch (the half-type branch above is disabled): _TYPE1 is read from src, _TYPE2 is written to dst. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else group is compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active group: the conversion yields 1 for a nonzero input and 0 otherwise. */
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cast loop (no alignment assumed): converts dimensions[0] npy_long items at args[0] into npy_bool items at args[1]; strides is not read; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_long_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scratch copies: each element is moved through these locals with memmove instead of typed pointer access. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_long_to_bool\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts the scratch copy src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store step: copy the converted value to dst bytewise (memmove needs no alignment). */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by exactly one element of their respective types. */
+#if 1
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `!1` is 0, so the guard above is always true and this aligned variant is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+/* Active branch (the half-type branch above is disabled): _TYPE1 is read from src, _TYPE2 is written to dst. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else group is compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active group: the conversion is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cast loop for aligned data: converts dimensions[0] npy_long items at args[0] into npy_ubyte items at args[1], stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_long_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No scratch variables are declared in this variant: elements are accessed through typed pointers. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must satisfy the alignment of their element types (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_long_to_ubyte\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch reads src and writes dst directly through typed pointers */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store step is compiled here: the cast above already wrote to dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* `!0` is 1, so this variant is compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+/* Active branch (the half-type branch above is disabled): _TYPE1 is read from src, _TYPE2 is written to dst. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else group is compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active group: the conversion is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cast loop (no alignment assumed): converts dimensions[0] npy_long items at args[0] into npy_ubyte items at args[1], stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_long_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scratch copies: each element is moved through these locals with memmove instead of typed pointer access. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_long_to_ubyte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts the scratch copy src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store step: copy the converted value to dst bytewise (memmove needs no alignment). */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `!1` is 0, so the guard above is always true and this aligned variant is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+/* Active branch (the half-type branch above is disabled): _TYPE1 is read from src, _TYPE2 is written to dst. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else group is compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active group: the conversion is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cast loop for aligned data: converts dimensions[0] npy_long items at args[0] into npy_ubyte items at args[1]; strides is not read; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_long_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No scratch variables are declared in this variant: elements are accessed through typed pointers. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must satisfy the alignment of their element types (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_long_to_ubyte\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch reads src and writes dst directly through typed pointers */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store step is compiled here: the cast above already wrote to dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by exactly one element of their respective types. */
+#if 1
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* `!0` is 1, so this variant is compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+/* Active branch (the half-type branch above is disabled): _TYPE1 is read from src, _TYPE2 is written to dst. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else group is compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active group: the conversion is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cast loop (no alignment assumed): converts dimensions[0] npy_long items at args[0] into npy_ubyte items at args[1]; strides is not read; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_long_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scratch copies: each element is moved through these locals with memmove instead of typed pointer access. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_long_to_ubyte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts the scratch copy src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store step: copy the converted value to dst bytewise (memmove needs no alignment). */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by exactly one element of their respective types. */
+#if 1
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `!1` is 0, so the guard above is always true and this aligned variant is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+/* Active branch (the half-type branch above is disabled): _TYPE1 is read from src, _TYPE2 is written to dst. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function (only the final #else group is compiled here) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active group: the conversion is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cast loop for aligned data: converts dimensions[0] npy_long items at args[0] into npy_ushort items at args[1], stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_long_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No scratch variables are declared in this variant: elements are accessed through typed pointers. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must satisfy the alignment of their element types (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_long_to_ushort\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch reads src and writes dst directly through typed pointers */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No separate store step is compiled here: the cast above already wrote to dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* !0 is 1: built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false: the fixed-width type selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else /* active: _TYPE1 is the source element type, _TYPE2 the destination */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 (usual C integer conversion rules apply) */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* Strided npy_long -> npy_ushort cast loop that moves each element through memmove temporaries; always returns 0. */
+_cast_long_to_ushort(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced here; args[0]/args[1]: input/output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: element count; strides[0]/[1]: byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* true: the strides are read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: source temporary filled by the memmove load below */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: destination temporary drained by the memmove store below */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* compiled out: this variant asserts nothing about alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_long_to_ushort\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy of one element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* compiled out through the matching #else */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* selected: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy of dst_value to the output */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_long);
+#else /* active: advance both cursors by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* discard the macros defined at the top of this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* !1 is 0, so the condition is always true: this variant is always built */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false: the fixed-width type selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else /* active: _TYPE1 is the source element type, _TYPE2 the destination */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 (usual C integer conversion rules apply) */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* npy_long -> npy_ushort cast loop with fixed element-size steps and direct typed access; always returns 0. */
+_aligned_contig_cast_long_to_ushort(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced here; args[0]/args[1]: input/output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: element count; strides: not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* false: the stride locals are not declared */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: no source temporary is declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no destination temporary is declared */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* enabled in this variant */
+   /* sanity check: for non-empty input, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_long_to_ushort\n");*/
+
+    while (N--) {
+#if 1 /* true, but the nested group is #if 0: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* compiled out through the matching #else */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* selected: read *src as _TYPE1, cast, write *dst as _TYPE2 */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* true, but the nested group is #if 0: nothing is stored from a temporary */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: advance both cursors by the element sizes instead of strides */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* discard the macros defined at the top of this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* !0 is 1: built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false: the fixed-width type selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else /* active: _TYPE1 is the source element type, _TYPE2 the destination */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 (usual C integer conversion rules apply) */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* npy_long -> npy_ushort cast loop with fixed element-size steps, moving each element through memmove temporaries; always returns 0. */
+_contig_cast_long_to_ushort(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced here; args[0]/args[1]: input/output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: element count; strides: not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* false: the stride locals are not declared */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: source temporary filled by the memmove load below */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: destination temporary drained by the memmove store below */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* compiled out: this variant asserts nothing about alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_long_to_ushort\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy of one element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* compiled out through the matching #else */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* selected: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy of dst_value to the output */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: advance both cursors by the element sizes instead of strides */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* discard the macros defined at the top of this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* !1 is 0, so the condition is always true: this variant is always built */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false: the fixed-width type selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* active: _TYPE1 is the source element type, _TYPE2 the destination */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 (usual C integer conversion rules apply) */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* Strided npy_long -> npy_uint cast loop using direct typed access; always returns 0. */
+_aligned_cast_long_to_uint(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced here; args[0]/args[1]: input/output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: element count; strides[0]/[1]: byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* true: the strides are read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: no source temporary is declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no destination temporary is declared */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* enabled in this variant */
+   /* sanity check: for non-empty input, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_long_to_uint\n");*/
+
+    while (N--) {
+#if 1 /* true, but the nested group is #if 0: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* compiled out through the matching #else */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* selected: read *src as _TYPE1, cast, write *dst as _TYPE2 */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* true, but the nested group is #if 0: nothing is stored from a temporary */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_long);
+#else /* active: advance both cursors by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* discard the macros defined at the top of this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* !0 is 1: built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false: the fixed-width type selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* active: _TYPE1 is the source element type, _TYPE2 the destination */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 (usual C integer conversion rules apply) */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* Strided npy_long -> npy_uint cast loop that moves each element through memmove temporaries; always returns 0. */
+_cast_long_to_uint(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced here; args[0]/args[1]: input/output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: element count; strides[0]/[1]: byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* true: the strides are read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: source temporary filled by the memmove load below */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: destination temporary drained by the memmove store below */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* compiled out: this variant asserts nothing about alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_long_to_uint\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy of one element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* compiled out through the matching #else */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* selected: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy of dst_value to the output */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_long);
+#else /* active: advance both cursors by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* discard the macros defined at the top of this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* !1 is 0, so the condition is always true: this variant is always built */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false: the fixed-width type selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* active: _TYPE1 is the source element type, _TYPE2 the destination */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 (usual C integer conversion rules apply) */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* npy_long -> npy_uint cast loop with fixed element-size steps and direct typed access; always returns 0. */
+_aligned_contig_cast_long_to_uint(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced here; args[0]/args[1]: input/output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: element count; strides: not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* false: the stride locals are not declared */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: no source temporary is declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no destination temporary is declared */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* enabled in this variant */
+   /* sanity check: for non-empty input, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_long_to_uint\n");*/
+
+    while (N--) {
+#if 1 /* true, but the nested group is #if 0: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* compiled out through the matching #else */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* selected: read *src as _TYPE1, cast, write *dst as _TYPE2 */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* true, but the nested group is #if 0: nothing is stored from a temporary */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: advance both cursors by the element sizes instead of strides */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* discard the macros defined at the top of this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* !0 is 1: built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false: the fixed-width type selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* active: _TYPE1 is the source element type, _TYPE2 the destination */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 (usual C integer conversion rules apply) */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* npy_long -> npy_uint cast loop with fixed element-size steps, moving each element through memmove temporaries; always returns 0. */
+_contig_cast_long_to_uint(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced here; args[0]/args[1]: input/output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: element count; strides: not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* false: the stride locals are not declared */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: source temporary filled by the memmove load below */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: destination temporary drained by the memmove store below */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* compiled out: this variant asserts nothing about alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_long_to_uint\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy of one element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* compiled out through the matching #else */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* selected: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy of dst_value to the output */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: advance both cursors by the element sizes instead of strides */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* discard the macros defined at the top of this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* !1 is 0, so the condition is always true: this variant is always built */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false: the fixed-width type selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else /* active: _TYPE1 is the source element type, _TYPE2 the destination */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 (usual C integer conversion rules apply) */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* Strided npy_long -> npy_ulong cast loop using direct typed access; always returns 0. */
+_aligned_cast_long_to_ulong(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced here; args[0]/args[1]: input/output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: element count; strides[0]/[1]: byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* true: the strides are read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: no source temporary is declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false: no destination temporary is declared */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* enabled in this variant */
+   /* sanity check: for non-empty input, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_long_to_ulong\n");*/
+
+    while (N--) {
+#if 1 /* true, but the nested group is #if 0: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* compiled out through the matching #else */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* selected: read *src as _TYPE1, cast, write *dst as _TYPE2 */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* true, but the nested group is #if 0: nothing is stored from a temporary */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_long);
+#else /* active: advance both cursors by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* discard the macros defined at the top of this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* !0 is 1: built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false: the fixed-width type selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else /* active: _TYPE1 is the source element type, _TYPE2 the destination */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 (usual C integer conversion rules apply) */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* Strided npy_long -> npy_ulong cast loop that moves each element through memmove temporaries; always returns 0. */
+_cast_long_to_ulong(
+        PyArrayMethod_Context *context, char *const *args, /* context: not referenced here; args[0]/args[1]: input/output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: element count; strides[0]/[1]: byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* true: the strides are read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: source temporary filled by the memmove load below */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: destination temporary drained by the memmove store below */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* compiled out: this variant asserts nothing about alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_long_to_ulong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy of one element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0 /* compiled out through the matching #else */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* selected: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy of dst_value to the output */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_long);
+#else /* active: advance both cursors by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN /* discard the macros defined at the top of this block */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_long -> npy_ulong cast of dimensions[0] items; asserts src/dst alignment. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_long_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* item count; the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !1  /* always false here: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* disabled: no source temporary is declared in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* disabled: no destination temporary is declared in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active: alignment is verified through assert() only */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_long_to_ulong\n");*/
+
+    while (N--) {
+#if 1  /* selected branch, but its inner #if 0 leaves it empty: nothing is loaded here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* not compiled: memmove load */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the direct typed-pointer store in the final #else below is compiled */
+#if 0  /* disabled: branches that index src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0  /* disabled: two-element destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* disabled: temporary-based conversion */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: read, convert and store in one statement through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* selected branch, but its inner #if 0 leaves it empty: the store already happened above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* not compiled: memmove store */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: step by the item sizes instead of reading strides */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_long);
+#else  /* not compiled: stride-based stepping */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path; 0 is the only value returned */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_long -> npy_ulong cast of dimensions[0] items; makes no alignment assumption. */
+static NPY_GCC_OPT_3 int
+_contig_cast_long_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* item count; the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !1  /* always false here: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* active: scalar temporary, so src is never dereferenced as _TYPE1 */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* active: scalar temporary, so dst is never dereferenced as _TYPE2 */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* disabled: this variant does not assert alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_long_to_ulong\n");*/
+
+    while (N--) {
+#if 0  /* disabled: typed-pointer read path */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy of one item into the temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the `dst_value = _CONVERT_FN(src_value)` branch below is compiled */
+#if 0  /* disabled: branches that index src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0  /* disabled: two-element destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert from the source temporary into the destination temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* not compiled: direct typed-pointer store */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* disabled: typed-pointer write path */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise copy of the converted item out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: step by the item sizes instead of reading strides */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_long);
+#else  /* not compiled: stride-based stepping */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path; 0 is the only value returned */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_long -> npy_ulonglong cast of dimensions[0] items; asserts src/dst alignment. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_long_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* item count; the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !0  /* always true here: this variant advances by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps (added to char pointers) */
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* disabled: no source temporary is declared in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* disabled: no destination temporary is declared in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active: alignment is verified through assert() only */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_long_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1  /* selected branch, but its inner #if 0 leaves it empty: nothing is loaded here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* not compiled: memmove load */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the direct typed-pointer store in the final #else below is compiled */
+#if 0  /* disabled: branches that index src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0  /* disabled: two-element destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* disabled: temporary-based conversion */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: read, convert and store in one statement through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* selected branch, but its inner #if 0 leaves it empty: the store already happened above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* not compiled: memmove store */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* disabled: fixed item-size stepping */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_long);
+#else  /* active: step both pointers by their strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path; 0 is the only value returned */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_long -> npy_ulonglong cast of dimensions[0] items; makes no alignment assumption. */
+static NPY_GCC_OPT_3 int
+_cast_long_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* item count; the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !0  /* always true here: this variant advances by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps (added to char pointers) */
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* active: scalar temporary, so src is never dereferenced as _TYPE1 */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* active: scalar temporary, so dst is never dereferenced as _TYPE2 */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* disabled: this variant does not assert alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_long_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0  /* disabled: typed-pointer read path */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy of one item into the temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the `dst_value = _CONVERT_FN(src_value)` branch below is compiled */
+#if 0  /* disabled: branches that index src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0  /* disabled: two-element destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert from the source temporary into the destination temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* not compiled: direct typed-pointer store */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* disabled: typed-pointer write path */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise copy of the converted item out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* disabled: fixed item-size stepping */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_long);
+#else  /* active: step both pointers by their strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path; 0 is the only value returned */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_long -> npy_ulonglong cast of dimensions[0] items; asserts src/dst alignment. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_long_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* item count; the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !1  /* always false here: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* disabled: no source temporary is declared in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* disabled: no destination temporary is declared in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active: alignment is verified through assert() only */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_long_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1  /* selected branch, but its inner #if 0 leaves it empty: nothing is loaded here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* not compiled: memmove load */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the direct typed-pointer store in the final #else below is compiled */
+#if 0  /* disabled: branches that index src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0  /* disabled: two-element destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* disabled: temporary-based conversion */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: read, convert and store in one statement through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* selected branch, but its inner #if 0 leaves it empty: the store already happened above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* not compiled: memmove store */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: step by the item sizes instead of reading strides */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_long);
+#else  /* not compiled: stride-based stepping */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path; 0 is the only value returned */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_long -> npy_ulonglong cast of dimensions[0] items; makes no alignment assumption. */
+static NPY_GCC_OPT_3 int
+_contig_cast_long_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* item count; the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !1  /* always false here: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* active: scalar temporary, so src is never dereferenced as _TYPE1 */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* active: scalar temporary, so dst is never dereferenced as _TYPE2 */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* disabled: this variant does not assert alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_long_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0  /* disabled: typed-pointer read path */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy of one item into the temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the `dst_value = _CONVERT_FN(src_value)` branch below is compiled */
+#if 0  /* disabled: branches that index src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0  /* disabled: two-element destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert from the source temporary into the destination temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* not compiled: direct typed-pointer store */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* disabled: typed-pointer write path */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise copy of the converted item out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: step by the item sizes instead of reading strides */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_long);
+#else  /* not compiled: stride-based stepping */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path; 0 is the only value returned */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_long -> npy_byte cast of dimensions[0] items; asserts src/dst alignment. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_long_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* item count; the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !0  /* always true here: this variant advances by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps (added to char pointers) */
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* disabled: no source temporary is declared in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* disabled: no destination temporary is declared in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active: alignment is verified through assert() only */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_long_to_byte\n");*/
+
+    while (N--) {
+#if 1  /* selected branch, but its inner #if 0 leaves it empty: nothing is loaded here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* not compiled: memmove load */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the direct typed-pointer store in the final #else below is compiled */
+#if 0  /* disabled: branches that index src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0  /* disabled: two-element destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* disabled: temporary-based conversion */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: read, convert and store in one statement through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* selected branch, but its inner #if 0 leaves it empty: the store already happened above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* not compiled: memmove store */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* disabled: fixed item-size stepping */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_long);
+#else  /* active: step both pointers by their strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path; 0 is the only value returned */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_long -> npy_byte cast of dimensions[0] items; makes no alignment assumption. */
+static NPY_GCC_OPT_3 int
+_cast_long_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* item count; the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !0  /* always true here: this variant advances by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps (added to char pointers) */
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* active: scalar temporary, so src is never dereferenced as _TYPE1 */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* active: scalar temporary, so dst is never dereferenced as _TYPE2 */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* disabled: this variant does not assert alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_long_to_byte\n");*/
+
+    while (N--) {
+#if 0  /* disabled: typed-pointer read path */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy of one item into the temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the `dst_value = _CONVERT_FN(src_value)` branch below is compiled */
+#if 0  /* disabled: branches that index src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0  /* disabled: two-element destination */
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert from the source temporary into the destination temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* not compiled: direct typed-pointer store */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* disabled: typed-pointer write path */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise copy of the converted item out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* disabled: fixed item-size stepping */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_long);
+#else  /* active: step both pointers by their strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path; 0 is the only value returned */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_long -> npy_byte cast of dimensions[0] items; asserts src/dst alignment. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_long_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* item count; the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* args[0] is read from, args[1] is written to */
+#if !1  /* always false here: strides[] is not read by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* disabled: no source temporary is declared in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* disabled: no destination temporary is declared in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active: alignment is verified through assert() only */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_long_to_byte\n");*/
+
+    while (N--) {
+#if 1  /* selected branch, but its inner #if 0 leaves it empty: nothing is loaded here */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* not compiled: memmove load */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the direct typed-pointer store in the final #else below is compiled */
+#if 0  /* disabled: branches that index src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0  /* disabled: two-element destination */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1  /* disabled: temporary-based conversion */
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: read, convert and store in one statement through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* selected branch, but its inner #if 0 leaves it empty: the store already happened above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* not compiled: memmove store */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: step by the item sizes instead of reading strides */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_long);
+#else  /* not compiled: stride-based stepping */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path; 0 is the only value returned */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned variant: the guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this block is dropped when that macro is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Live branch (the guard above is 0 || 0): plain C element types. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live group (both guards above are 0); the 0 || 0 test below then selects the plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] contiguous npy_long items (args[0]) to npy_byte (args[1]) through memmove'd temporaries; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_long_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Neither strides nor context is read in this contiguous variant. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant, which copies through memmove rather than typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_long_to_byte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the "#elif !0" arm of the #else group below, converting src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* contiguous: step each pointer by one element of its own type */
+#if 1
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: the guard above is !(... && !1), which is always true, so this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+/* Live branch (the guard above is 0 || 0): plain C element types. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live group (both guards above are 0); the 0 || 0 test below then selects the plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_long items (args[0]) to npy_short (args[1]) with direct typed loads/stores, stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_long_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* context is not read by this loop. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst are dereferenced as _TYPE1/_TYPE2 pointers below, so each must be suitably aligned (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_long_to_short\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the last #else below, a typed load from src converted and stored to dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* strided: step each pointer by its caller-supplied stride */
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned variant: the guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this block is dropped when that macro is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+/* Live branch (the guard above is 0 || 0): plain C element types. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live group (both guards above are 0); the 0 || 0 test below then selects the plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_long items (args[0]) to npy_short (args[1]) through memmove'd temporaries, stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_long_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* context is not read by this loop. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant, which copies through memmove rather than typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_long_to_short\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the "#elif !0" arm of the #else group below, converting src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* strided: step each pointer by its caller-supplied stride */
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: the guard above is !(... && !1), which is always true, so this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+/* Live branch (the guard above is 0 || 0): plain C element types. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live group (both guards above are 0); the 0 || 0 test below then selects the plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] contiguous npy_long items (args[0]) to npy_short (args[1]) with direct typed loads/stores; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_long_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Neither strides nor context is read in this contiguous variant. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst are dereferenced as _TYPE1/_TYPE2 pointers below, so each must be suitably aligned (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_long_to_short\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the last #else below, a typed load from src converted and stored to dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* contiguous: step each pointer by one element of its own type */
+#if 1
+        dst += sizeof(npy_short);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned variant: the guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this block is dropped when that macro is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+/* Live branch (the guard above is 0 || 0): plain C element types. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live group (both guards above are 0); the 0 || 0 test below then selects the plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] contiguous npy_long items (args[0]) to npy_short (args[1]) through memmove'd temporaries; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_long_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Neither strides nor context is read in this contiguous variant. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant, which copies through memmove rather than typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_long_to_short\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the "#elif !0" arm of the #else group below, converting src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* contiguous: step each pointer by one element of its own type */
+#if 1
+        dst += sizeof(npy_short);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: the guard above is !(... && !1), which is always true, so this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Live branch (the guard above is 0 || 0): plain C element types. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live group (both guards above are 0); the 0 || 0 test below then selects the plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_long items (args[0]) to npy_int (args[1]) with direct typed loads/stores, stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_long_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* context is not read by this loop. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst are dereferenced as _TYPE1/_TYPE2 pointers below, so each must be suitably aligned (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_long_to_int\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the last #else below, a typed load from src converted and stored to dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* strided: step each pointer by its caller-supplied stride */
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned variant: the guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this block is dropped when that macro is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Live branch (the guard above is 0 || 0): plain C element types. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live group (both guards above are 0); the 0 || 0 test below then selects the plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_long items (args[0]) to npy_int (args[1]) through memmove'd temporaries, stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_long_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* context is not read by this loop. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant, which copies through memmove rather than typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_long_to_int\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the "#elif !0" arm of the #else group below, converting src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* strided: step each pointer by its caller-supplied stride */
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: the guard above is !(... && !1), which is always true, so this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Live branch (the guard above is 0 || 0): plain C element types. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live group (both guards above are 0); the 0 || 0 test below then selects the plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] contiguous npy_long items (args[0]) to npy_int (args[1]) with direct typed loads/stores; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_long_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Neither strides nor context is read in this contiguous variant. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst are dereferenced as _TYPE1/_TYPE2 pointers below, so each must be suitably aligned (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_long_to_int\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the last #else below, a typed load from src converted and stored to dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* contiguous: step each pointer by one element of its own type */
+#if 1
+        dst += sizeof(npy_int);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned variant: the guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this block is dropped when that macro is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Live branch (the guard above is 0 || 0): plain C element types. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live group (both guards above are 0); the 0 || 0 test below then selects the plain C cast. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] contiguous npy_long items (args[0]) to npy_int (args[1]) through memmove'd temporaries; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_long_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Neither strides nor context is read in this contiguous variant. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant, which copies through memmove rather than typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_long_to_int\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the "#elif !0" arm of the #else group below, converting src_value into dst_value */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* contiguous: step each pointer by one element of its own type */
+#if 1
+        dst += sizeof(npy_int);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard is always true: !1 is 0, so the && is 0 whatever NPY_USE_UNALIGNED_ACCESS is. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+/* Live branch (0 || 0 is false): element types used by the loop below. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the inner #else selects a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, strided loop: casts dimensions[0] npy_long elements from args[0] to npy_long in args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_long_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries are declared in this variant (both conditions below are 0); elements are accessed in place. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_long_to_long\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the in-place store *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the caller-supplied byte strides (the #else branch is live). */
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* !0 is 1, so this guard reduces to !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+/* Live branch (0 || 0 is false): element types used by the loop below. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the inner #else selects a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop without alignment asserts: casts dimensions[0] npy_long elements from args[0] to npy_long in args[1], staging through memmove; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_long_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries (the !0 branches are live): each element is staged through them with memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_long_to_long\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the caller-supplied byte strides (the #else branch is live). */
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard is always true: !1 is 0, so the && is 0 whatever NPY_USE_UNALIGNED_ACCESS is. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+/* Live branch (0 || 0 is false): element types used by the loop below. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the inner #else selects a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, contiguous loop: casts dimensions[0] npy_long elements from args[0] to npy_long in args[1], stepping by element size; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_long_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries are declared in this variant (both conditions below are 0); elements are accessed in place. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_long_to_long\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the in-place store *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous case: step both pointers by their fixed element sizes. */
+#if 1
+        dst += sizeof(npy_long);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* !0 is 1, so this guard reduces to !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+/* Live branch (0 || 0 is false): element types used by the loop below. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the inner #else selects a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop without alignment asserts: casts dimensions[0] npy_long elements from args[0] to npy_long in args[1], staging through memmove; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_long_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries (the !0 branches are live): each element is staged through them with memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_long_to_long\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous case: step both pointers by their fixed element sizes. */
+#if 1
+        dst += sizeof(npy_long);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard is always true: !1 is 0, so the && is 0 whatever NPY_USE_UNALIGNED_ACCESS is. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+/* Live branch (0 || 0 is false): element types used by the loop below. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the inner #else selects a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, strided loop: casts dimensions[0] npy_long elements from args[0] to npy_longlong in args[1]; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_long_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries are declared in this variant (both conditions below are 0); elements are accessed in place. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_long_to_longlong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the in-place store *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the caller-supplied byte strides (the #else branch is live). */
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* !0 is 1, so this guard reduces to !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+/* Live branch (0 || 0 is false): element types used by the loop below. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the inner #else selects a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop without alignment asserts: casts dimensions[0] npy_long elements from args[0] to npy_longlong in args[1], staging through memmove; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_long_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries (the !0 branches are live): each element is staged through them with memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_long_to_longlong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the caller-supplied byte strides (the #else branch is live). */
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard is always true: !1 is 0, so the && is 0 whatever NPY_USE_UNALIGNED_ACCESS is. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+/* Live branch (0 || 0 is false): element types used by the loop below. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the inner #else selects a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, contiguous loop: casts dimensions[0] npy_long elements from args[0] to npy_longlong in args[1], stepping by element size; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_long_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries are declared in this variant (both conditions below are 0); elements are accessed in place. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_long_to_longlong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the in-place store *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous case: step both pointers by their fixed element sizes. */
+#if 1
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* !0 is 1, so this guard reduces to !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+/* Live branch (0 || 0 is false): element types used by the loop below. */
+#define _TYPE1 npy_long
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Live branch: the inner #else selects a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop without alignment asserts: casts dimensions[0] npy_long elements from args[0] to npy_longlong in args[1], staging through memmove; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_long_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries (the !0 branches are live): each element is staged through them with memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_long_to_longlong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous case: step both pointers by their fixed element sizes. */
+#if 1
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard is always true: !1 is 0, so the && is 0 whatever NPY_USE_UNALIGNED_ACCESS is. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1
+/* Live branch (0 || 1 is true): both inner chains fall through to #else, giving npy_long and npy_half. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+/* Live branch: the inner chain falls through to #else, i.e. npy_float_to_half((float)x). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, strided loop: converts dimensions[0] npy_long elements from args[0] to npy_half in args[1] via npy_float_to_half((float)x); always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_long_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries are declared in this variant (both conditions below are 0); elements are accessed in place. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_long_to_half\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is the in-place store *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the caller-supplied byte strides (the #else branch is live). */
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_long_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out in this variant, which copies through
+    * memmove() rather than dereferencing src/dst as typed pointers */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Strided cast loop that does not dereference src/dst as typed pointers.
+     * Each of the N = dimensions[0] iterations memmove()s one _TYPE1 from
+     * src (args[0]) into src_value, converts it with _CONVERT_FN into
+     * dst_value, memmove()s that to dst (args[1]), then advances both char
+     * pointers by strides[0] / strides[1] bytes.
+     * In this expansion _TYPE1 is npy_long, _TYPE2 is npy_half and
+     * _CONVERT_FN(x) is npy_float_to_half((float)x).
+     * The enclosing #if compiles this function only when
+     * NPY_USE_UNALIGNED_ACCESS is 0.
+     * Always returns 0; `context` is not referenced in the body.
+     *
+     * Disabled debug trace: printf("_cast_long_to_half\n");
+     */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with this expansion's #if constants, only the
+ * "dst_value = _CONVERT_FN(src_value);" statement below is compiled. */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_long_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced as _TYPE1 and _TYPE2
+    * pointers below, so assert their alignment (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Aligned, contiguous cast loop.  Each of the N = dimensions[0]
+     * iterations reads one _TYPE1 at src (args[0]), converts it with
+     * _CONVERT_FN, stores one _TYPE2 at dst (args[1]), then advances dst by
+     * sizeof(npy_half) and src by sizeof(npy_long).
+     * In this expansion _TYPE1 is npy_long, _TYPE2 is npy_half and
+     * _CONVERT_FN(x) is npy_float_to_half((float)x).
+     * Always returns 0; `context` and `strides` are not referenced in the
+     * compiled body.
+     *
+     * Disabled debug trace: printf("_aligned_contig_cast_long_to_half\n");
+     */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with this expansion's #if constants, only the final
+ * direct load/convert/store statement of the chain below is compiled. */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_half);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_long_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out in this variant, which copies through
+    * memmove() rather than dereferencing src/dst as typed pointers */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Contiguous cast loop that does not dereference src/dst as typed
+     * pointers.  Each of the N = dimensions[0] iterations memmove()s one
+     * _TYPE1 from src (args[0]) into src_value, converts it with _CONVERT_FN
+     * into dst_value, memmove()s that to dst (args[1]), then advances dst by
+     * sizeof(npy_half) and src by sizeof(npy_long).
+     * In this expansion _TYPE1 is npy_long, _TYPE2 is npy_half and
+     * _CONVERT_FN(x) is npy_float_to_half((float)x).
+     * The enclosing #if compiles this function only when
+     * NPY_USE_UNALIGNED_ACCESS is 0.
+     * Always returns 0; `context` and `strides` are not referenced in the
+     * compiled body.
+     *
+     * Disabled debug trace: printf("_contig_cast_long_to_half\n");
+     */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with this expansion's #if constants, only the
+ * "dst_value = _CONVERT_FN(src_value);" statement below is compiled. */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_half);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_long_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced as _TYPE1 and _TYPE2
+    * pointers below, so assert their alignment (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Aligned, strided cast loop.  Each of the N = dimensions[0] iterations
+     * reads one _TYPE1 at src (args[0]), converts it with _CONVERT_FN,
+     * stores one _TYPE2 at dst (args[1]), then advances both char pointers
+     * by strides[0] / strides[1] bytes.
+     * In this expansion _TYPE1 is npy_long, _TYPE2 is npy_float and
+     * _CONVERT_FN(x) is the plain C cast ((_TYPE2)x).
+     * Always returns 0; `context` is not referenced in the body.
+     *
+     * Disabled debug trace: printf("_aligned_cast_long_to_float\n");
+     */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with this expansion's #if constants, only the final
+ * direct load/convert/store statement of the chain below is compiled. */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_long_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out in this variant, which copies through
+    * memmove() rather than dereferencing src/dst as typed pointers */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Strided cast loop that does not dereference src/dst as typed pointers.
+     * Each of the N = dimensions[0] iterations memmove()s one _TYPE1 from
+     * src (args[0]) into src_value, converts it with _CONVERT_FN into
+     * dst_value, memmove()s that to dst (args[1]), then advances both char
+     * pointers by strides[0] / strides[1] bytes.
+     * In this expansion _TYPE1 is npy_long, _TYPE2 is npy_float and
+     * _CONVERT_FN(x) is the plain C cast ((_TYPE2)x).
+     * The enclosing #if compiles this function only when
+     * NPY_USE_UNALIGNED_ACCESS is 0.
+     * Always returns 0; `context` is not referenced in the body.
+     *
+     * Disabled debug trace: printf("_cast_long_to_float\n");
+     */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with this expansion's #if constants, only the
+ * "dst_value = _CONVERT_FN(src_value);" statement below is compiled. */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_long_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced as _TYPE1 and _TYPE2
+    * pointers below, so assert their alignment (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Aligned, contiguous cast loop.  Each of the N = dimensions[0]
+     * iterations reads one _TYPE1 at src (args[0]), converts it with
+     * _CONVERT_FN, stores one _TYPE2 at dst (args[1]), then advances dst by
+     * sizeof(npy_float) and src by sizeof(npy_long).
+     * In this expansion _TYPE1 is npy_long, _TYPE2 is npy_float and
+     * _CONVERT_FN(x) is the plain C cast ((_TYPE2)x).
+     * Always returns 0; `context` and `strides` are not referenced in the
+     * compiled body.
+     *
+     * Disabled debug trace: printf("_aligned_contig_cast_long_to_float\n");
+     */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with this expansion's #if constants, only the final
+ * direct load/convert/store statement of the chain below is compiled. */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_float);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_long_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out in this variant, which copies through
+    * memmove() rather than dereferencing src/dst as typed pointers */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Contiguous cast loop that does not dereference src/dst as typed
+     * pointers.  Each of the N = dimensions[0] iterations memmove()s one
+     * _TYPE1 from src (args[0]) into src_value, converts it with _CONVERT_FN
+     * into dst_value, memmove()s that to dst (args[1]), then advances dst by
+     * sizeof(npy_float) and src by sizeof(npy_long).
+     * In this expansion _TYPE1 is npy_long, _TYPE2 is npy_float and
+     * _CONVERT_FN(x) is the plain C cast ((_TYPE2)x).
+     * The enclosing #if compiles this function only when
+     * NPY_USE_UNALIGNED_ACCESS is 0.
+     * Always returns 0; `context` and `strides` are not referenced in the
+     * compiled body.
+     *
+     * Disabled debug trace: printf("_contig_cast_long_to_float\n");
+     */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with this expansion's #if constants, only the
+ * "dst_value = _CONVERT_FN(src_value);" statement below is compiled. */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_float);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_long_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced as _TYPE1 and _TYPE2
+    * pointers below, so assert their alignment (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Aligned, strided cast loop.  Each of the N = dimensions[0] iterations
+     * reads one _TYPE1 at src (args[0]), converts it with _CONVERT_FN,
+     * stores one _TYPE2 at dst (args[1]), then advances both char pointers
+     * by strides[0] / strides[1] bytes.
+     * In this expansion _TYPE1 is npy_long, _TYPE2 is npy_double and
+     * _CONVERT_FN(x) is the plain C cast ((_TYPE2)x).
+     * Always returns 0; `context` is not referenced in the body.
+     *
+     * Disabled debug trace: printf("_aligned_cast_long_to_double\n");
+     */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with this expansion's #if constants, only the final
+ * direct load/convert/store statement of the chain below is compiled. */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_long_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out in this variant, which copies through
+    * memmove() rather than dereferencing src/dst as typed pointers */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*
+     * Strided cast loop that does not dereference src/dst as typed pointers.
+     * Each of the N = dimensions[0] iterations memmove()s one _TYPE1 from
+     * src (args[0]) into src_value, converts it with _CONVERT_FN into
+     * dst_value, memmove()s that to dst (args[1]), then advances both char
+     * pointers by strides[0] / strides[1] bytes.
+     * In this expansion _TYPE1 is npy_long, _TYPE2 is npy_double and
+     * _CONVERT_FN(x) is the plain C cast ((_TYPE2)x).
+     * The enclosing #if compiles this function only when
+     * NPY_USE_UNALIGNED_ACCESS is 0.
+     * Always returns 0; `context` is not referenced in the body.
+     *
+     * Disabled debug trace: printf("_cast_long_to_double\n");
+     */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with this expansion's #if constants, only the
+ * "dst_value = _CONVERT_FN(src_value);" statement below is compiled. */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard is always true and the section is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant-false: the bit-pattern integer typedef selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else  /* active: plain source and destination element types */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* constant-false: converters that read half values are skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* constant-false: converters that produce half values are skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active group */
+
+#  if 0 || 0  /* constant-false: no "!= 0" truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2; note x is not parenthesised in the expansion */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_long -> npy_double cast loop for aligned data: typed load, _CONVERT_FN, typed store; no temporaries. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_long_to_double(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* constant-false: contiguous variant, no stride locals */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* constant-false: no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* constant-false: no destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* constant-true: alignment of both pointers is asserted */
+   /* sanity check: typed access below requires aligned pointers (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_long_to_double\n");*/
+
+    while (N--) {
+#if 1  /* constant-true, inner block compiled out: nothing is preloaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* constant-false: the two-component source forms below are compiled out */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: typed load, convert and typed store in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* constant-true, inner block compiled out: the store already happened above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: contiguous step of one element size per pointer */
+        dst += sizeof(npy_double);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* always returns 0; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this section is compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant-false: the bit-pattern integer typedef selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else  /* active: plain source and destination element types */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* constant-false: converters that read half values are skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* constant-false: converters that produce half values are skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active group */
+
+#  if 0 || 0  /* constant-false: no "!= 0" truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2; note x is not parenthesised in the expansion */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_long -> npy_double cast loop for possibly unaligned data: memmove in, _CONVERT_FN, memmove out. */
+static NPY_GCC_OPT_3 int
+_contig_cast_long_to_double(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* constant-false: contiguous variant, no stride locals */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active: scalar temporary the input element is copied into */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: scalar temporary holding the converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* constant-false: this variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_long_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy, so src is never dereferenced as _TYPE1 */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* constant-false: the two-component source forms below are compiled out */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert temporary to temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise store of the converted temporary */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: contiguous step of one element size per pointer */
+        dst += sizeof(npy_double);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* always returns 0; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard is always true and the section is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant-false: the bit-pattern integer typedef selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else  /* active: plain source and destination element types */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* constant-false: converters that read half values are skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* constant-false: converters that produce half values are skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active group */
+
+#  if 0 || 0  /* constant-false: no "!= 0" truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2; note x is not parenthesised in the expansion */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_long -> npy_longdouble cast loop for aligned data: typed load, _CONVERT_FN, typed store; no temporaries. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_long_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0], strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* constant-true: this variant advances by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* constant-false: no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* constant-false: no destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* constant-true: alignment of both pointers is asserted */
+   /* sanity check: typed access below requires aligned pointers (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_long_to_longdouble\n");*/
+
+    while (N--) {
+#if 1  /* constant-true, inner block compiled out: nothing is preloaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* constant-false: the two-component source forms below are compiled out */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: typed load, convert and typed store in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* constant-true, inner block compiled out: the store already happened above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_long);
+#else  /* active: step both pointers by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* always returns 0; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this section is compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant-false: the bit-pattern integer typedef selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else  /* active: plain source and destination element types */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* constant-false: converters that read half values are skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* constant-false: converters that produce half values are skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active group */
+
+#  if 0 || 0  /* constant-false: no "!= 0" truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2; note x is not parenthesised in the expansion */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_long -> npy_longdouble cast loop for possibly unaligned data: memmove in, _CONVERT_FN, memmove out. */
+static NPY_GCC_OPT_3 int
+_cast_long_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0], strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* constant-true: this variant advances by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active: scalar temporary the input element is copied into */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: scalar temporary holding the converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* constant-false: this variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_long_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy, so src is never dereferenced as _TYPE1 */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* constant-false: the two-component source forms below are compiled out */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert temporary to temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise store of the converted temporary */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_long);
+#else  /* active: step both pointers by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* always returns 0; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard is always true and the section is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant-false: the bit-pattern integer typedef selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else  /* active: plain source and destination element types */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* constant-false: converters that read half values are skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* constant-false: converters that produce half values are skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active group */
+
+#  if 0 || 0  /* constant-false: no "!= 0" truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2; note x is not parenthesised in the expansion */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_long -> npy_longdouble cast loop for aligned data: typed load, _CONVERT_FN, typed store; no temporaries. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_long_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* constant-false: contiguous variant, no stride locals */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* constant-false: no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* constant-false: no destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* constant-true: alignment of both pointers is asserted */
+   /* sanity check: typed access below requires aligned pointers (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_long_to_longdouble\n");*/
+
+    while (N--) {
+#if 1  /* constant-true, inner block compiled out: nothing is preloaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* constant-false: the two-component source forms below are compiled out */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: typed load, convert and typed store in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* constant-true, inner block compiled out: the store already happened above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: contiguous step of one element size per pointer */
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* always returns 0; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this section is compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant-false: the bit-pattern integer typedef selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else  /* active: plain source and destination element types */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* constant-false: converters that read half values are skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* constant-false: converters that produce half values are skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active group */
+
+#  if 0 || 0  /* constant-false: no "!= 0" truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2; note x is not parenthesised in the expansion */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_long -> npy_longdouble cast loop for possibly unaligned data: memmove in, _CONVERT_FN, memmove out. */
+static NPY_GCC_OPT_3 int
+_contig_cast_long_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* constant-false: contiguous variant, no stride locals */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active: scalar temporary the input element is copied into */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: scalar temporary holding the converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* constant-false: this variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_long_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy, so src is never dereferenced as _TYPE1 */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* constant-false: the two-component source forms below are compiled out */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert temporary to temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise store of the converted temporary */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: contiguous step of one element size per pointer */
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* always returns 0; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard is always true and the section is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant-false: the bit-pattern integer typedef selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else  /* active: plain source type and per-component destination type */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* constant-false: converters that read half values are skipped */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* constant-false: converters that produce half values are skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active group */
+
+#  if 0 || 0  /* constant-false: no "!= 0" truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2; note x is not parenthesised in the expansion */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_long -> cfloat cast loop for aligned data: writes two _TYPE2 components per element, the second always 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_long_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0], strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* constant-true: this variant advances by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* constant-false: no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 1  /* active: two-component temporary for the destination element */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* constant-true: alignment of both pointers is asserted */
+   /* sanity check: typed access below requires aligned pointers (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_long_to_cfloat\n");*/
+
+    while (N--) {
+#if 1  /* constant-true, inner block compiled out: nothing is preloaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* constant-false: the two-component source forms below are compiled out */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1  /* active: destination has two components */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else  /* active: typed load straight from src */
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component (presumably the imaginary part) is zeroed */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1  /* active: write both components through a typed pointer */
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_long);
+#else  /* active: step both pointers by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* always returns 0; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this section is compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant-false: the bit-pattern integer typedef selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else  /* active: plain source type and per-component destination type */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* constant-false: converters that read half values are skipped */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* constant-false: converters that produce half values are skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active group */
+
+#  if 0 || 0  /* constant-false: no "!= 0" truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2; note x is not parenthesised in the expansion */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided npy_long -> cfloat cast loop for possibly unaligned data: memmove in, convert, memmove out two _TYPE2 components. */
+static NPY_GCC_OPT_3 int
+_cast_long_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0], strides[1] = byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* constant-true: this variant advances by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active: scalar temporary the input element is copied into */
+    _TYPE1 src_value;
+#endif
+#if 1  /* active: two-component temporary for the destination element */
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* constant-false: this variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_long_to_cfloat\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy, so src is never dereferenced as _TYPE1 */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* constant-false: the two-component source forms below are compiled out */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1  /* active: destination has two components */
+#    if !0  /* active: convert the copied-in temporary */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component (presumably the imaginary part) is zeroed */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise store of both components (sizeof covers the whole array) */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_long);
+#else  /* active: step both pointers by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* always returns 0; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard is always true and the section is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* constant-false: the bit-pattern integer typedef selection below is skipped */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else  /* active: plain source type and per-component destination type */
+
+#define _TYPE1 npy_long
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* constant-false: converters that read half values are skipped */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* constant-false: converters that produce half values are skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active group */
+
+#  if 0 || 0  /* constant-false: no "!= 0" truth-value conversion */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2; note x is not parenthesised in the expansion */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous npy_long -> cfloat cast loop for aligned data: writes two _TYPE2 components per element, the second always 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_long_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* constant-false: contiguous variant, no stride locals */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* constant-false: no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 1  /* active: two-component temporary for the destination element */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* constant-true: alignment of both pointers is asserted */
+   /* sanity check: typed access below requires aligned pointers (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_long_to_cfloat\n");*/
+
+    while (N--) {
+#if 1  /* constant-true, inner block compiled out: nothing is preloaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* constant-false: the two-component source forms below are compiled out */
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1  /* active: destination has two components */
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else  /* active: typed load straight from src */
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component (presumably the imaginary part) is zeroed */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1  /* active: write both components through a typed pointer */
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: contiguous step of one element size per pointer */
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* always returns 0; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below are the live ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_long  /* type of one source element */
+#define _TYPE2 npy_float  /* type of each of the two destination components */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* contiguous cast loop, no alignment assumed; always returns 0 */
+_contig_cast_long_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1  /* false: strides[] is not read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* taken: scratch copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 1  /* taken: two _TYPE2 components are produced per output element */
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_long_to_cfloat\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* taken: byte-wise load, valid for any src alignment */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 1
+#    if !0  /* taken: convert the scratch copy */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component (presumably the imaginary part) is zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* taken: byte-wise store of both components */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* taken: step by the fixed element sizes */
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: (x && 0) is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below are the live ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_long  /* type of one source element */
+#define _TYPE2 npy_double  /* type of each of the two destination components */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* aligned, strided cast loop; always returns 0 */
+_aligned_cast_long_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0  /* taken: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no scratch source copy, src is dereferenced in place below */
+    _TYPE1 src_value;
+#endif
+#if 1  /* taken: two _TYPE2 components are produced per output element */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* taken: alignment is asserted (debug builds) */
+   /* sanity check: unless N == 0, src/dst must be aligned for _TYPE1/_TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_long_to_cdouble\n");*/
+
+    while (N--) {
+#if 1  /* taken, but the inner branch is disabled: nothing is preloaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else  /* taken: convert the source element read through a typed pointer */
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component (presumably the imaginary part) is zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken: store both components through a typed pointer */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_long);
+#else  /* taken: step by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below are the live ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_long  /* type of one source element */
+#define _TYPE2 npy_double  /* type of each of the two destination components */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* strided cast loop, no alignment assumed; always returns 0 */
+_cast_long_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0  /* taken: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* taken: scratch copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 1  /* taken: two _TYPE2 components are produced per output element */
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_long_to_cdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* taken: byte-wise load, valid for any src alignment */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 1
+#    if !0  /* taken: convert the scratch copy */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component (presumably the imaginary part) is zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* taken: byte-wise store of both components */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_long);
+#else  /* taken: step by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: (x && 0) is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below are the live ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_long  /* type of one source element */
+#define _TYPE2 npy_double  /* type of each of the two destination components */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* aligned + contiguous cast loop; always returns 0 */
+_aligned_contig_cast_long_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1  /* false: strides[] is not read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no scratch source copy, src is dereferenced in place below */
+    _TYPE1 src_value;
+#endif
+#if 1  /* taken: two _TYPE2 components are produced per output element */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* taken: alignment is asserted (debug builds) */
+   /* sanity check: unless N == 0, src/dst must be aligned for _TYPE1/_TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_long_to_cdouble\n");*/
+
+    while (N--) {
+#if 1  /* taken, but the inner branch is disabled: nothing is preloaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else  /* taken: convert the source element read through a typed pointer */
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component (presumably the imaginary part) is zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken: store both components through a typed pointer */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* taken: step by the fixed element sizes */
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below are the live ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_long  /* type of one source element */
+#define _TYPE2 npy_double  /* type of each of the two destination components */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* contiguous cast loop, no alignment assumed; always returns 0 */
+_contig_cast_long_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1  /* false: strides[] is not read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* taken: scratch copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 1  /* taken: two _TYPE2 components are produced per output element */
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_long_to_cdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* taken: byte-wise load, valid for any src alignment */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 1
+#    if !0  /* taken: convert the scratch copy */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component (presumably the imaginary part) is zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* taken: byte-wise store of both components */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* taken: step by the fixed element sizes */
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: (x && 0) is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below are the live ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_long  /* type of one source element */
+#define _TYPE2 npy_longdouble  /* type of each of the two destination components */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* aligned, strided cast loop; always returns 0 */
+_aligned_cast_long_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0  /* taken: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no scratch source copy, src is dereferenced in place below */
+    _TYPE1 src_value;
+#endif
+#if 1  /* taken: two _TYPE2 components are produced per output element */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* taken: alignment is asserted (debug builds) */
+   /* sanity check: unless N == 0, src/dst must be aligned for _TYPE1/_TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_long_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1  /* taken, but the inner branch is disabled: nothing is preloaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else  /* taken: convert the source element read through a typed pointer */
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component (presumably the imaginary part) is zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken: store both components through a typed pointer */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_long);
+#else  /* taken: step by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below are the live ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_long  /* type of one source element */
+#define _TYPE2 npy_longdouble  /* type of each of the two destination components */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* strided cast loop, no alignment assumed; always returns 0 */
+_cast_long_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0  /* taken: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* taken: scratch copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 1  /* taken: two _TYPE2 components are produced per output element */
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_long_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* taken: byte-wise load, valid for any src alignment */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 1
+#    if !0  /* taken: convert the scratch copy */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component (presumably the imaginary part) is zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* taken: byte-wise store of both components */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_long);
+#else  /* taken: step by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: (x && 0) is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below are the live ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_long  /* type of one source element */
+#define _TYPE2 npy_longdouble  /* type of each of the two destination components */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* aligned + contiguous cast loop; always returns 0 */
+_aligned_contig_cast_long_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1  /* false: strides[] is not read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no scratch source copy, src is dereferenced in place below */
+    _TYPE1 src_value;
+#endif
+#if 1  /* taken: two _TYPE2 components are produced per output element */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* taken: alignment is asserted (debug builds) */
+   /* sanity check: unless N == 0, src/dst must be aligned for _TYPE1/_TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_long_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1  /* taken, but the inner branch is disabled: nothing is preloaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else  /* taken: convert the source element read through a typed pointer */
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component (presumably the imaginary part) is zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken: store both components through a typed pointer */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* taken: step by the fixed element sizes */
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else definitions below are the live ones */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_long
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else  /* taken */
+
+#define _TYPE1 npy_long  /* type of one source element */
+#define _TYPE2 npy_longdouble  /* type of each of the two destination components */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* taken */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* taken: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* contiguous cast loop, no alignment assumed; always returns 0 */
+_contig_cast_long_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1  /* false: strides[] is not read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* taken: scratch copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 1  /* taken: two _TYPE2 components are produced per output element */
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_long_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* taken: byte-wise load, valid for any src alignment */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 1
+#    if !0  /* taken: convert the scratch copy */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component (presumably the imaginary part) is zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* taken: byte-wise store of both components */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* taken: step by the fixed element sizes */
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_long);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard is always true */
+/* Cast loop npy_longlong -> npy_bool: strided variant whose body asserts aligned pointers. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else branch below defines the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else  /* live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* live branch */
+
+#  if 1 || 0  /* true here: the conversion is a nonzero test cast to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] into args[1] using _CONVERT_FN; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longlong_to_bool(
+        PyArrayMethod_Context *context, char *const *args,  /* context is never read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination step in bytes */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* live: the per-element byte steps come from the caller */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Neither guard below is true, so no src_value/dst_value temporaries are declared. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* live: alignment asserts are compiled in for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longlong_to_bool\n");*/
+
+    while (N--) {
+#if 1  /* taken, but the inner "#if 0" is dead: no staging copy of the source element is made */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken: this branch reads src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* live: the source is handled as a single value */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* live: typed load from src, convert, typed store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but the inner "#if 0" is dead: the result was already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_longlong);
+#else  /* live: advance both pointers by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this loop is compiled only when NPY_USE_UNALIGNED_ACCESS is false */
+/* Cast loop npy_longlong -> npy_bool: strided variant that stages each element through memmove. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else branch below defines the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else  /* live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* live branch */
+
+#  if 1 || 0  /* true here: the conversion is a nonzero test cast to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] into args[1] using _CONVERT_FN; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_longlong_to_bool(
+        PyArrayMethod_Context *context, char *const *args,  /* context is never read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination step in bytes */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* live: the per-element byte steps come from the caller */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Both "#elif !0" guards below are true, so scalar temporaries src_value and dst_value are declared. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longlong_to_bool\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* live: copy the source element into src_value with memmove (presumably to avoid a misaligned typed load) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken: this branch reads src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* live: the source is handled as a single value */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* live: convert the staged value into dst_value */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* live: copy dst_value out to the destination with memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_longlong);
+#else  /* live: advance both pointers by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard is always true */
+/* Cast loop npy_longlong -> npy_bool: element-size-stepping variant whose body asserts aligned pointers. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else branch below defines the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else  /* live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* live branch */
+
+#  if 1 || 0  /* true here: the conversion is a nonzero test cast to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] into args[1] using _CONVERT_FN; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longlong_to_bool(
+        PyArrayMethod_Context *context, char *const *args,  /* context is never read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: no stride locals, the loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Neither guard below is true, so no src_value/dst_value temporaries are declared. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* live: alignment asserts are compiled in for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longlong_to_bool\n");*/
+
+    while (N--) {
+#if 1  /* taken, but the inner "#if 0" is dead: no staging copy of the source element is made */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken: this branch reads src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* live: the source is handled as a single value */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* live: typed load from src, convert, typed store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but the inner "#if 0" is dead: the result was already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* live: step each pointer by its element size, i.e. elements are taken to be packed back to back */
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this loop is compiled only when NPY_USE_UNALIGNED_ACCESS is false */
+/* Cast loop npy_longlong -> npy_bool: element-size-stepping variant that stages each element through memmove. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else branch below defines the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else  /* live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* live branch */
+
+#  if 1 || 0  /* true here: the conversion is a nonzero test cast to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] into args[1] using _CONVERT_FN; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_longlong_to_bool(
+        PyArrayMethod_Context *context, char *const *args,  /* context is never read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: no stride locals, the loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Both "#elif !0" guards below are true, so scalar temporaries src_value and dst_value are declared. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longlong_to_bool\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* live: copy the source element into src_value with memmove (presumably to avoid a misaligned typed load) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken: this branch reads src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* live: the source is handled as a single value */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* live: convert the staged value into dst_value */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* live: copy dst_value out to the destination with memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* live: step each pointer by its element size, i.e. elements are taken to be packed back to back */
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard is always true */
+/* Cast loop npy_longlong -> npy_ubyte: strided variant whose body asserts aligned pointers. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else branch below defines the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else  /* live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* live branch */
+
+#  if 0 || 0  /* false here: the #else below selects a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] into args[1] using _CONVERT_FN; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longlong_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is never read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination step in bytes */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* live: the per-element byte steps come from the caller */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Neither guard below is true, so no src_value/dst_value temporaries are declared. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* live: alignment asserts are compiled in for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longlong_to_ubyte\n");*/
+
+    while (N--) {
+#if 1  /* taken, but the inner "#if 0" is dead: no staging copy of the source element is made */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken: this branch reads src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* live: the source is handled as a single value */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* live: typed load from src, convert, typed store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but the inner "#if 0" is dead: the result was already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_longlong);
+#else  /* live: advance both pointers by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this loop is compiled only when NPY_USE_UNALIGNED_ACCESS is false */
+/* Cast loop npy_longlong -> npy_ubyte: strided variant that stages each element through memmove. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else branch below defines the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else  /* live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* live branch */
+
+#  if 0 || 0  /* false here: the #else below selects a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] into args[1] using _CONVERT_FN; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_longlong_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is never read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination step in bytes */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* live: the per-element byte steps come from the caller */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Both "#elif !0" guards below are true, so scalar temporaries src_value and dst_value are declared. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longlong_to_ubyte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* live: copy the source element into src_value with memmove (presumably to avoid a misaligned typed load) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken: this branch reads src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* live: the source is handled as a single value */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* live: convert the staged value into dst_value */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* live: copy dst_value out to the destination with memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_longlong);
+#else  /* live: advance both pointers by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard is always true */
+/* Cast loop npy_longlong -> npy_ubyte: element-size-stepping variant whose body asserts aligned pointers. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else branch below defines the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else  /* live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* live branch */
+
+#  if 0 || 0  /* false here: the #else below selects a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] into args[1] using _CONVERT_FN; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longlong_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is never read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: no stride locals, the loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Neither guard below is true, so no src_value/dst_value temporaries are declared. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* live: alignment asserts are compiled in for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longlong_to_ubyte\n");*/
+
+    while (N--) {
+#if 1  /* taken, but the inner "#if 0" is dead: no staging copy of the source element is made */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken: this branch reads src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* live: the source is handled as a single value */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* live: typed load from src, convert, typed store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but the inner "#if 0" is dead: the result was already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* live: step each pointer by its element size, i.e. elements are taken to be packed back to back */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this loop is compiled only when NPY_USE_UNALIGNED_ACCESS is false */
+/* Cast loop npy_longlong -> npy_ubyte: element-size-stepping variant that stages each element through memmove. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else branch below defines the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else  /* live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* live branch */
+
+#  if 0 || 0  /* false here: the #else below selects a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] into args[1] using _CONVERT_FN; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_longlong_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,  /* context is never read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* false: no stride locals, the loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Both "#elif !0" guards below are true, so scalar temporaries src_value and dst_value are declared. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longlong_to_ubyte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* live: copy the source element into src_value with memmove (presumably to avoid a misaligned typed load) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken: this branch reads src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* live: the source is handled as a single value */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* live: convert the staged value into dst_value */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* live: copy dst_value out to the destination with memmove */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* live: step each pointer by its element size, i.e. elements are taken to be packed back to back */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard is always true */
+/* Cast loop npy_longlong -> npy_ushort: strided variant whose body asserts aligned pointers. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else branch below defines the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else  /* live branch: _TYPE1 is the source element type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken in this instantiation */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* live branch */
+
+#  if 0 || 0  /* false here: the #else below selects a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] elements read from args[0] into args[1] using _CONVERT_FN; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longlong_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,  /* context is never read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination step in bytes */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* live: the per-element byte steps come from the caller */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Neither guard below is true, so no src_value/dst_value temporaries are declared. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* live: alignment asserts are compiled in for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longlong_to_ushort\n");*/
+
+    while (N--) {
+#if 1  /* taken, but the inner "#if 0" is dead: no staging copy of the source element is made */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken: this branch reads src_value[0] and src_value[1] */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* live: the source is handled as a single value */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* live: typed load from src, convert, typed store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but the inner "#if 0" is dead: the result was already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_longlong);
+#else  /* live: advance both pointers by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_ushort
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast of npy_longlong elements to npy_ushort.
+ *
+ * args[0] / args[1] are the source / destination byte pointers,
+ * dimensions[0] is the element count, and strides[0] / strides[1] are
+ * the byte steps applied to the source / destination after each element.
+ * Every element is copied through a local with memmove, so neither
+ * pointer is ever dereferenced as its element type.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_longlong_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    npy_intp N;
+
+    for (N = dimensions[0]; N != 0; N--) {
+        _TYPE1 in_elem;
+        _TYPE2 out_elem;
+
+        /* Stage the element through locals instead of typed loads/stores. */
+        memmove(&in_elem, src, sizeof(in_elem));
+        out_elem = _CONVERT_FN(in_elem);
+        memmove(dst, &out_elem, sizeof(out_elem));
+
+        src += src_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_ushort
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Contiguous cast of npy_longlong elements to npy_ushort using direct
+ * typed loads and stores.
+ *
+ * args[0] / args[1] are the source / destination byte pointers and
+ * dimensions[0] is the element count. The strides argument is not read:
+ * the pointers advance by the element sizes. When assertions are enabled
+ * and the count is non-zero, both pointers are checked for the alignment
+ * of their element type.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longlong_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += sizeof(npy_longlong);
+        dst += sizeof(npy_ushort);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_ushort
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Contiguous cast of npy_longlong elements to npy_ushort.
+ *
+ * args[0] / args[1] are the source / destination byte pointers and
+ * dimensions[0] is the element count. The strides argument is not read:
+ * the pointers advance by the element sizes. Every element is copied
+ * through a local with memmove, so neither pointer is ever dereferenced
+ * as its element type.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_longlong_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N;
+
+    for (N = dimensions[0]; N != 0; N--) {
+        _TYPE1 in_elem;
+        _TYPE2 out_elem;
+
+        /* Stage the element through locals instead of typed loads/stores. */
+        memmove(&in_elem, src, sizeof(in_elem));
+        out_elem = _CONVERT_FN(in_elem);
+        memmove(dst, &out_elem, sizeof(out_elem));
+
+        src += sizeof(npy_longlong);
+        dst += sizeof(npy_ushort);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_uint
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast of npy_longlong elements to npy_uint using direct typed
+ * loads and stores.
+ *
+ * args[0] / args[1] are the source / destination byte pointers,
+ * dimensions[0] is the element count, and strides[0] / strides[1] are
+ * the byte steps applied to the source / destination after each element.
+ * When assertions are enabled and the count is non-zero, both pointers
+ * are checked for the alignment of their element type.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longlong_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += src_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_uint
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast of npy_longlong elements to npy_uint.
+ *
+ * args[0] / args[1] are the source / destination byte pointers,
+ * dimensions[0] is the element count, and strides[0] / strides[1] are
+ * the byte steps applied to the source / destination after each element.
+ * Every element is copied through a local with memmove, so neither
+ * pointer is ever dereferenced as its element type.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_longlong_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    npy_intp N;
+
+    for (N = dimensions[0]; N != 0; N--) {
+        _TYPE1 in_elem;
+        _TYPE2 out_elem;
+
+        /* Stage the element through locals instead of typed loads/stores. */
+        memmove(&in_elem, src, sizeof(in_elem));
+        out_elem = _CONVERT_FN(in_elem);
+        memmove(dst, &out_elem, sizeof(out_elem));
+
+        src += src_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_uint
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Contiguous cast of npy_longlong elements to npy_uint using direct
+ * typed loads and stores.
+ *
+ * args[0] / args[1] are the source / destination byte pointers and
+ * dimensions[0] is the element count. The strides argument is not read:
+ * the pointers advance by the element sizes. When assertions are enabled
+ * and the count is non-zero, both pointers are checked for the alignment
+ * of their element type.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longlong_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += sizeof(npy_longlong);
+        dst += sizeof(npy_uint);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_uint
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Contiguous cast of npy_longlong elements to npy_uint.
+ *
+ * args[0] / args[1] are the source / destination byte pointers and
+ * dimensions[0] is the element count. The strides argument is not read:
+ * the pointers advance by the element sizes. Every element is copied
+ * through a local with memmove, so neither pointer is ever dereferenced
+ * as its element type.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_longlong_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N;
+
+    for (N = dimensions[0]; N != 0; N--) {
+        _TYPE1 in_elem;
+        _TYPE2 out_elem;
+
+        /* Stage the element through locals instead of typed loads/stores. */
+        memmove(&in_elem, src, sizeof(in_elem));
+        out_elem = _CONVERT_FN(in_elem);
+        memmove(dst, &out_elem, sizeof(out_elem));
+
+        src += sizeof(npy_longlong);
+        dst += sizeof(npy_uint);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_ulong
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast of npy_longlong elements to npy_ulong using direct typed
+ * loads and stores.
+ *
+ * args[0] / args[1] are the source / destination byte pointers,
+ * dimensions[0] is the element count, and strides[0] / strides[1] are
+ * the byte steps applied to the source / destination after each element.
+ * When assertions are enabled and the count is non-zero, both pointers
+ * are checked for the alignment of their element type.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longlong_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += src_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_ulong
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast of npy_longlong elements to npy_ulong.
+ *
+ * args[0] / args[1] are the source / destination byte pointers,
+ * dimensions[0] is the element count, and strides[0] / strides[1] are
+ * the byte steps applied to the source / destination after each element.
+ * Every element is copied through a local with memmove, so neither
+ * pointer is ever dereferenced as its element type.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_longlong_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    npy_intp N;
+
+    for (N = dimensions[0]; N != 0; N--) {
+        _TYPE1 in_elem;
+        _TYPE2 out_elem;
+
+        /* Stage the element through locals instead of typed loads/stores. */
+        memmove(&in_elem, src, sizeof(in_elem));
+        out_elem = _CONVERT_FN(in_elem);
+        memmove(dst, &out_elem, sizeof(out_elem));
+
+        src += src_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longlong_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Aligned contiguous cast of N npy_longlong items to npy_ulong; returns 0. */
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: non-empty src/dst must be aligned for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longlong_to_ulong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the last #else branch below is compiled in this variant */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load, convert, typed store */
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ulong);  /* contiguous: step by one output element */
+        src += sizeof(npy_longlong);  /* contiguous: step by one input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_longlong_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Contiguous cast of N npy_longlong items to npy_ulong via memmove temporaries; returns 0. */
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* converted element before it is copied out */
+#endif
+
+#if 0
+   /* sanity check (compiled out by the #if 0 above) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longlong_to_ulong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load of one element */
+#endif
+
+/* Do the cast: only the "#  elif !0" branch of the outer #else is compiled here */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* convert the local copy */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the result */
+#endif
+
+#if 1
+        dst += sizeof(npy_ulong);  /* contiguous: step by one output element */
+        src += sizeof(npy_longlong);  /* contiguous: step by one input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_longlong_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Aligned strided cast of N npy_longlong items to npy_ulonglong; returns 0. */
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element cursor steps */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: non-empty src/dst must be aligned for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longlong_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the last #else branch below is compiled in this variant */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load, convert, typed store */
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;  /* byte step: dst is a char pointer */
+        src += src_stride;  /* byte step: src is a char pointer */
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_longlong_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Strided cast of N npy_longlong items to npy_ulonglong via memmove temporaries; returns 0. */
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element cursor steps */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* converted element before it is copied out */
+#endif
+
+#if 0
+   /* sanity check (compiled out by the #if 0 above) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longlong_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load of one element */
+#endif
+
+/* Do the cast: only the "#  elif !0" branch of the outer #else is compiled here */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* convert the local copy */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the result */
+#endif
+
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;  /* byte step: dst is a char pointer */
+        src += src_stride;  /* byte step: src is a char pointer */
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longlong_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Aligned contiguous cast of N npy_longlong items to npy_ulonglong; returns 0. */
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: non-empty src/dst must be aligned for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longlong_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the last #else branch below is compiled in this variant */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load, convert, typed store */
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ulonglong);  /* contiguous: step by one output element */
+        src += sizeof(npy_longlong);  /* contiguous: step by one input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_longlong_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Contiguous cast of N npy_longlong items to npy_ulonglong via memmove temporaries; returns 0. */
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* converted element before it is copied out */
+#endif
+
+#if 0
+   /* sanity check (compiled out by the #if 0 above) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longlong_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load of one element */
+#endif
+
+/* Do the cast: only the "#  elif !0" branch of the outer #else is compiled here */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* convert the local copy */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the result */
+#endif
+
+#if 1
+        dst += sizeof(npy_ulonglong);  /* contiguous: step by one output element */
+        src += sizeof(npy_longlong);  /* contiguous: step by one input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_longlong_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Aligned strided cast of N npy_longlong items to npy_byte; returns 0. */
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element cursor steps */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: non-empty src/dst must be aligned for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longlong_to_byte\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the last #else branch below is compiled in this variant */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load, convert, typed store */
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;  /* byte step: dst is a char pointer */
+        src += src_stride;  /* byte step: src is a char pointer */
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_longlong_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Strided cast of N npy_longlong items to npy_byte via memmove temporaries; returns 0. */
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element cursor steps */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of the current input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* converted element before it is copied out */
+#endif
+
+#if 0
+   /* sanity check (compiled out by the #if 0 above) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longlong_to_byte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load of one element */
+#endif
+
+/* Do the cast: only the "#  elif !0" branch of the outer #else is compiled here */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* convert the local copy */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the result */
+#endif
+
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;  /* byte step: dst is a char pointer */
+        src += src_stride;  /* byte step: src is a char pointer */
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longlong_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{   /* Aligned contiguous cast of N npy_longlong items to npy_byte; returns 0. */
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: non-empty src/dst must be aligned for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longlong_to_byte\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the last #else branch below is compiled in this variant */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* typed load, convert, typed store */
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_byte);  /* contiguous: step by one output element */
+        src += sizeof(npy_longlong);  /* contiguous: step by one input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: built only when that flag is false */
+
+/* For half types, don't use actual double/float types in conversion (switched off here, so the #else branch below is taken) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else /* active branch: source and destination element types used by the loop below */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half-float groups are disabled, so the last #else applies) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch: conversion is expressed with ordinary C casts */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: plain cast of the value to the destination type, no range check */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* casts dimensions[0] packed npy_longlong items at args[0] to npy_byte items at args[1], staging each through memmove */
+_contig_cast_longlong_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and output buffers */
+#if !1 /* off: this contiguous loop never reads strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* on: scalar holding one source element copied out of src */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* on: scalar holding one converted element before it is copied to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant does not assert alignment) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_contig_cast_longlong_to_byte\n"); */
+
+    while (N--) { /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy of the current element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these switches only the line assigning dst_value from src_value is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the staged scalar */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy of the converted value out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: elements are packed, so advance by the element sizes */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* the !1 operand makes this condition always true: built unconditionally */
+
+/* For half types, don't use actual double/float types in conversion (switched off here, so the #else branch below is taken) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else /* active branch: source and destination element types used by the loop below */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half-float groups are disabled, so the last #else applies) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch: conversion is expressed with ordinary C casts */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: plain cast of the value to the destination type, no range check */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* casts dimensions[0] strided npy_longlong items at args[0] to npy_short items at args[1]; both pointers are asserted aligned */
+_aligned_cast_longlong_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and output buffers */
+#if !0 /* on: byte distance between consecutive source and destination elements */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* off: no staging scalar, the cast reads src directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* off: no staging scalar, the cast writes dst directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, src and dst must satisfy the alignment of _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_cast_longlong_to_short\n"); */
+
+    while (N--) { /* one element per iteration */
+#if 1 /* nothing is compiled here: the inner block is off and the memmove fallback is skipped */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these switches only the final line, converting straight from src to dst, is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: read src, convert, store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* nothing is compiled here: the cast already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_longlong);
+#else /* active: advance by the strides passed in by the caller */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: built only when that flag is false */
+
+/* For half types, don't use actual double/float types in conversion (switched off here, so the #else branch below is taken) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else /* active branch: source and destination element types used by the loop below */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half-float groups are disabled, so the last #else applies) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch: conversion is expressed with ordinary C casts */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: plain cast of the value to the destination type, no range check */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* casts dimensions[0] strided npy_longlong items at args[0] to npy_short items at args[1], staging each through memmove */
+_cast_longlong_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and output buffers */
+#if !0 /* on: byte distance between consecutive source and destination elements */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* on: scalar holding one source element copied out of src */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* on: scalar holding one converted element before it is copied to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant does not assert alignment) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_cast_longlong_to_short\n"); */
+
+    while (N--) { /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy of the current element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these switches only the line assigning dst_value from src_value is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the staged scalar */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy of the converted value out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_longlong);
+#else /* active: advance by the strides passed in by the caller */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* the !1 operand makes this condition always true: built unconditionally */
+
+/* For half types, don't use actual double/float types in conversion (switched off here, so the #else branch below is taken) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else /* active branch: source and destination element types used by the loop below */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half-float groups are disabled, so the last #else applies) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch: conversion is expressed with ordinary C casts */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: plain cast of the value to the destination type, no range check */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* casts dimensions[0] packed npy_longlong items at args[0] to npy_short items at args[1]; both pointers are asserted aligned */
+_aligned_contig_cast_longlong_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and output buffers */
+#if !1 /* off: this contiguous loop never reads strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* off: no staging scalar, the cast reads src directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* off: no staging scalar, the cast writes dst directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, src and dst must satisfy the alignment of _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_contig_cast_longlong_to_short\n"); */
+
+    while (N--) { /* one element per iteration */
+#if 1 /* nothing is compiled here: the inner block is off and the memmove fallback is skipped */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these switches only the final line, converting straight from src to dst, is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: read src, convert, store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* nothing is compiled here: the cast already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: elements are packed, so advance by the element sizes */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: built only when that flag is false */
+
+/* For half types, don't use actual double/float types in conversion (switched off here, so the #else branch below is taken) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else /* active branch: source and destination element types used by the loop below */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half-float groups are disabled, so the last #else applies) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch: conversion is expressed with ordinary C casts */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: plain cast of the value to the destination type, no range check */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* casts dimensions[0] packed npy_longlong items at args[0] to npy_short items at args[1], staging each through memmove */
+_contig_cast_longlong_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and output buffers */
+#if !1 /* off: this contiguous loop never reads strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* on: scalar holding one source element copied out of src */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* on: scalar holding one converted element before it is copied to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant does not assert alignment) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_contig_cast_longlong_to_short\n"); */
+
+    while (N--) { /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy of the current element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these switches only the line assigning dst_value from src_value is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the staged scalar */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy of the converted value out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: elements are packed, so advance by the element sizes */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* the !1 operand makes this condition always true: built unconditionally */
+
+/* For half types, don't use actual double/float types in conversion (switched off here, so the #else branch below is taken) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else /* active branch: source and destination element types used by the loop below */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half-float groups are disabled, so the last #else applies) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch: conversion is expressed with ordinary C casts */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: plain cast of the value to the destination type, no range check */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* casts dimensions[0] strided npy_longlong items at args[0] to npy_int items at args[1]; both pointers are asserted aligned */
+_aligned_cast_longlong_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and output buffers */
+#if !0 /* on: byte distance between consecutive source and destination elements */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* off: no staging scalar, the cast reads src directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* off: no staging scalar, the cast writes dst directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, src and dst must satisfy the alignment of _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_cast_longlong_to_int\n"); */
+
+    while (N--) { /* one element per iteration */
+#if 1 /* nothing is compiled here: the inner block is off and the memmove fallback is skipped */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these switches only the final line, converting straight from src to dst, is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: read src, convert, store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* nothing is compiled here: the cast already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_longlong);
+#else /* active: advance by the strides passed in by the caller */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: built only when that flag is false */
+
+/* For half types, don't use actual double/float types in conversion (switched off here, so the #else branch below is taken) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else /* active branch: source and destination element types used by the loop below */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half-float groups are disabled, so the last #else applies) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch: conversion is expressed with ordinary C casts */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: plain cast of the value to the destination type, no range check */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* casts dimensions[0] strided npy_longlong items at args[0] to npy_int items at args[1], staging each through memmove */
+_cast_longlong_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and output buffers */
+#if !0 /* on: byte distance between consecutive source and destination elements */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* on: scalar holding one source element copied out of src */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* on: scalar holding one converted element before it is copied to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant does not assert alignment) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_cast_longlong_to_int\n"); */
+
+    while (N--) { /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy of the current element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these switches only the line assigning dst_value from src_value is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the staged scalar */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy of the converted value out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_longlong);
+#else /* active: advance by the strides passed in by the caller */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* the !1 operand makes this condition always true: built unconditionally */
+
+/* For half types, don't use actual double/float types in conversion (switched off here, so the #else branch below is taken) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else /* active branch: source and destination element types used by the loop below */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half-float groups are disabled, so the last #else applies) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch: conversion is expressed with ordinary C casts */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: plain cast of the value to the destination type, no range check */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* casts dimensions[0] packed npy_longlong items at args[0] to npy_int items at args[1]; both pointers are asserted aligned */
+_aligned_contig_cast_longlong_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and output buffers */
+#if !1 /* off: this contiguous loop never reads strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* off: no staging scalar, the cast reads src directly */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* off: no staging scalar, the cast writes dst directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, src and dst must satisfy the alignment of _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_contig_cast_longlong_to_int\n"); */
+
+    while (N--) { /* one element per iteration */
+#if 1 /* nothing is compiled here: the inner block is off and the memmove fallback is skipped */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these switches only the final line, converting straight from src to dst, is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: read src, convert, store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* nothing is compiled here: the cast already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: elements are packed, so advance by the element sizes */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: built only when that flag is false */
+
+/* For half types, don't use actual double/float types in conversion (switched off here, so the #else branch below is taken) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else /* active branch: source and destination element types used by the loop below */
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function (both half-float groups are disabled, so the last #else applies) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch: conversion is expressed with ordinary C casts */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: plain cast of the value to the destination type, no range check */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* casts dimensions[0] packed npy_longlong items at args[0] to npy_int items at args[1], staging each through memmove */
+_contig_cast_longlong_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors over the input and output buffers */
+#if !1 /* off: this contiguous loop never reads strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* on: scalar holding one source element copied out of src */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* on: scalar holding one converted element before it is copied to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant does not assert alignment) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_contig_cast_longlong_to_int\n"); */
+
+    while (N--) { /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy of the current element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: with these switches only the line assigning dst_value from src_value is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the staged scalar */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy of the converted value out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: elements are packed, so advance by the element sizes */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the loop has no error path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" is constant false, so this variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false here: the plain definitions in the #else branch below are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong /* element type read from src */
+#define _TYPE2 npy_long /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* disabled: conversions that take half values as input */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* disabled: conversions that produce half values */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] (read as _TYPE1) to args[1] (written as _TYPE2) via direct typed access, stepping by strides[0]/strides[1] bytes; always returns 0. context is not referenced. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longlong_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !0 /* enabled: this variant steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* disabled: no local source copy is declared in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* disabled: no local destination copy is declared in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, both pointers must meet their element type's alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longlong_to_long\n");*/
+
+    while (N--) {
+#if 1 /* active, but the nested "#if 0" is off, so nothing is loaded into a local */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: convert straight from *src into *dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* active, but the nested "#if 0" is off, so there is no separate store step */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_longlong);
+#else /* active: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false here: the plain definitions in the #else branch below are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong /* element type read from src */
+#define _TYPE2 npy_long /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* disabled: conversions that take half values as input */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* disabled: conversions that produce half values */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] (_TYPE1) to args[1] (_TYPE2), moving each element through locals with memmove and stepping by strides[0]/strides[1] bytes; always returns 0. context is not referenced. */
+static NPY_GCC_OPT_3 int
+_cast_longlong_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !0 /* enabled: this variant steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: local copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: local holding the converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* disabled: this variant does not assert alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longlong_to_long\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: copy one element's bytes into the local instead of dereferencing src as _TYPE1 (presumably because src may be unaligned here) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the local copy into the local result */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: copy the converted element's bytes out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_longlong);
+#else /* active: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" is constant false, so this variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false here: the plain definitions in the #else branch below are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong /* element type read from src */
+#define _TYPE2 npy_long /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* disabled: conversions that take half values as input */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* disabled: conversions that produce half values */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] (read as _TYPE1) to args[1] (written as _TYPE2) via direct typed access, stepping by the element sizes; always returns 0. context and strides are not referenced. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longlong_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !1 /* disabled: strides are not read; the loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* disabled: no local source copy is declared in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* disabled: no local destination copy is declared in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, both pointers must meet their element type's alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longlong_to_long\n");*/
+
+    while (N--) {
+#if 1 /* active, but the nested "#if 0" is off, so nothing is loaded into a local */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: convert straight from *src into *dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* active, but the nested "#if 0" is off, so there is no separate store step */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: advance each cursor by one element of its own type */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false here: the plain definitions in the #else branch below are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong /* element type read from src */
+#define _TYPE2 npy_long /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* disabled: conversions that take half values as input */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* disabled: conversions that produce half values */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] (_TYPE1) to args[1] (_TYPE2), moving each element through locals with memmove and stepping by the element sizes; always returns 0. context and strides are not referenced. */
+static NPY_GCC_OPT_3 int
+_contig_cast_longlong_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !1 /* disabled: strides are not read; the loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: local copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: local holding the converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* disabled: this variant does not assert alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longlong_to_long\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: copy one element's bytes into the local instead of dereferencing src as _TYPE1 (presumably because src may be unaligned here) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the local copy into the local result */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: copy the converted element's bytes out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: advance each cursor by one element of its own type */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" is constant false, so this variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false here: the plain definitions in the #else branch below are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong /* element type read from src */
+#define _TYPE2 npy_longlong /* element type written to dst (same as the source type) */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* disabled: conversions that take half values as input */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* disabled: conversions that produce half values */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] (read as _TYPE1) to args[1] (written as _TYPE2) via direct typed access, stepping by strides[0]/strides[1] bytes; always returns 0. context is not referenced. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longlong_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !0 /* enabled: this variant steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* disabled: no local source copy is declared in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* disabled: no local destination copy is declared in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, both pointers must meet their element type's alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longlong_to_longlong\n");*/
+
+    while (N--) {
+#if 1 /* active, but the nested "#if 0" is off, so nothing is loaded into a local */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: convert straight from *src into *dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* active, but the nested "#if 0" is off, so there is no separate store step */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_longlong);
+#else /* active: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false here: the plain definitions in the #else branch below are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong /* element type read from src */
+#define _TYPE2 npy_longlong /* element type written to dst (same as the source type) */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* disabled: conversions that take half values as input */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* disabled: conversions that produce half values */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] (_TYPE1) to args[1] (_TYPE2), moving each element through locals with memmove and stepping by strides[0]/strides[1] bytes; always returns 0. context is not referenced. */
+static NPY_GCC_OPT_3 int
+_cast_longlong_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !0 /* enabled: this variant steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: local copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: local holding the converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* disabled: this variant does not assert alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longlong_to_longlong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: copy one element's bytes into the local instead of dereferencing src as _TYPE1 (presumably because src may be unaligned here) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the local copy into the local result */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: copy the converted element's bytes out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_longlong);
+#else /* active: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" is constant false, so this variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false here: the plain definitions in the #else branch below are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong /* element type read from src */
+#define _TYPE2 npy_longlong /* element type written to dst (same as the source type) */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* disabled: conversions that take half values as input */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* disabled: conversions that produce half values */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] (read as _TYPE1) to args[1] (written as _TYPE2) via direct typed access, stepping by the element sizes; always returns 0. context and strides are not referenced. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longlong_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !1 /* disabled: strides are not read; the loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* disabled: no local source copy is declared in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* disabled: no local destination copy is declared in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, both pointers must meet their element type's alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longlong_to_longlong\n");*/
+
+    while (N--) {
+#if 1 /* active, but the nested "#if 0" is off, so nothing is loaded into a local */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: convert straight from *src into *dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* active, but the nested "#if 0" is off, so there is no separate store step */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: advance each cursor by one element of its own type */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false here: the plain definitions in the #else branch below are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong /* element type read from src */
+#define _TYPE2 npy_longlong /* element type written to dst (same as the source type) */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* disabled: conversions that take half values as input */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* disabled: conversions that produce half values */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] (_TYPE1) to args[1] (_TYPE2), moving each element through locals with memmove and stepping by the element sizes; always returns 0. context and strides are not referenced. */
+static NPY_GCC_OPT_3 int
+_contig_cast_longlong_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !1 /* disabled: strides are not read; the loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: local copy of one source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: local holding the converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* disabled: this variant does not assert alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longlong_to_longlong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: copy one element's bytes into the local instead of dereferencing src as _TYPE1 (presumably because src may be unaligned here) */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the local copy into the local result */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: copy the converted element's bytes out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: advance each cursor by one element of its own type */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_longlong);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" is constant false, so this variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1 /* true here: the nested type selection directly below is used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longlong /* active: element type read from src */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half /* active: element type written to dst */
+#  endif
+
+#else
+
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* disabled: conversions that take half values as input */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1 /* active branch: conversions that produce half values */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x) /* active: cast to float, then hand to npy_float_to_half */
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] (read as _TYPE1) to args[1] (written as _TYPE2) via direct typed access, stepping by strides[0]/strides[1] bytes; always returns 0. context is not referenced. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longlong_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* number of elements to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !0 /* enabled: this variant steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* disabled: no local source copy is declared in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* disabled: no local destination copy is declared in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, both pointers must meet their element type's alignment */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longlong_to_half\n");*/
+
+    while (N--) {
+#if 1 /* active, but the nested "#if 0" is off, so nothing is loaded into a local */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: convert straight from *src into *dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* active, but the nested "#if 0" is off, so there is no separate store step */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_longlong);
+#else /* active: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return path; the result is always 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * npy_longlong -> npy_half cast loop, strided variant that does not
+ * assume aligned pointers.  The guard above compiles this variant out
+ * when NPY_USE_UNALIGNED_ACCESS is nonzero.
+ *
+ * _TYPE1/_TYPE2 are the element types read and written; _CONVERT_FN is
+ * the per-element conversion.  All three are local to this block and
+ * are undefined again before the closing #endif.
+ */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_half
+#define _CONVERT_FN(x) npy_float_to_half((float)x)
+
+/*
+ * args[0] / args[1]       : source / destination buffers
+ * dimensions[0]           : number of elements to convert
+ * strides[0] / strides[1] : byte step applied to source / destination
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_longlong_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; N != 0; --N) {
+        /* Copy through locals: no alignment is asserted for src/dst here. */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        src += src_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * npy_longlong -> npy_half cast loop, aligned + contiguous variant.
+ * The `!1` operand in the guard above is 0, so the guard evaluates true
+ * and this variant is compiled whatever NPY_USE_UNALIGNED_ACCESS is.
+ *
+ * _TYPE1/_TYPE2 are the element types read and written; _CONVERT_FN is
+ * the per-element conversion.  All three are local to this block and
+ * are undefined again before the closing #endif.
+ */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_half
+#define _CONVERT_FN(x) npy_float_to_half((float)x)
+
+/*
+ * args[0] / args[1] : source / destination buffers
+ * dimensions[0]     : number of elements to convert
+ * strides           : not read; both pointers advance by element size
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longlong_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Read and write in place through typed pointers. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += sizeof(npy_longlong);
+        dst += sizeof(npy_half);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * npy_longlong -> npy_half cast loop, contiguous variant that does not
+ * assume aligned pointers.  The guard above compiles this variant out
+ * when NPY_USE_UNALIGNED_ACCESS is nonzero.
+ *
+ * _TYPE1/_TYPE2 are the element types read and written; _CONVERT_FN is
+ * the per-element conversion.  All three are local to this block and
+ * are undefined again before the closing #endif.
+ */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_half
+#define _CONVERT_FN(x) npy_float_to_half((float)x)
+
+/*
+ * args[0] / args[1] : source / destination buffers
+ * dimensions[0]     : number of elements to convert
+ * strides           : not read; both pointers advance by element size
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_longlong_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; N != 0; --N) {
+        /* Copy through locals: no alignment is asserted for src/dst here. */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        src += sizeof(npy_longlong);
+        dst += sizeof(npy_half);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * npy_longlong -> npy_float cast loop, aligned + strided variant.
+ * The `!1` operand in the guard above is 0, so the guard evaluates true
+ * and this variant is compiled whatever NPY_USE_UNALIGNED_ACCESS is.
+ *
+ * _TYPE1/_TYPE2 are the element types read and written; _CONVERT_FN is
+ * a plain C cast to the destination type.  All three are local to this
+ * block and are undefined again before the closing #endif.
+ */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * args[0] / args[1]       : source / destination buffers
+ * dimensions[0]           : number of elements to convert
+ * strides[0] / strides[1] : byte step applied to source / destination
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longlong_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Read and write in place through typed pointers. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += src_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * npy_longlong -> npy_float cast loop, strided variant that does not
+ * assume aligned pointers.  The guard above compiles this variant out
+ * when NPY_USE_UNALIGNED_ACCESS is nonzero.
+ *
+ * _TYPE1/_TYPE2 are the element types read and written; _CONVERT_FN is
+ * a plain C cast to the destination type.  All three are local to this
+ * block and are undefined again before the closing #endif.
+ */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * args[0] / args[1]       : source / destination buffers
+ * dimensions[0]           : number of elements to convert
+ * strides[0] / strides[1] : byte step applied to source / destination
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_longlong_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; N != 0; --N) {
+        /* Copy through locals: no alignment is asserted for src/dst here. */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        src += src_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * npy_longlong -> npy_float cast loop, aligned + contiguous variant.
+ * The `!1` operand in the guard above is 0, so the guard evaluates true
+ * and this variant is compiled whatever NPY_USE_UNALIGNED_ACCESS is.
+ *
+ * _TYPE1/_TYPE2 are the element types read and written; _CONVERT_FN is
+ * a plain C cast to the destination type.  All three are local to this
+ * block and are undefined again before the closing #endif.
+ */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * args[0] / args[1] : source / destination buffers
+ * dimensions[0]     : number of elements to convert
+ * strides           : not read; both pointers advance by element size
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longlong_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Read and write in place through typed pointers. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += sizeof(npy_longlong);
+        dst += sizeof(npy_float);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * npy_longlong -> npy_float cast loop, contiguous variant that does not
+ * assume aligned pointers.  The guard above compiles this variant out
+ * when NPY_USE_UNALIGNED_ACCESS is nonzero.
+ *
+ * _TYPE1/_TYPE2 are the element types read and written; _CONVERT_FN is
+ * a plain C cast to the destination type.  All three are local to this
+ * block and are undefined again before the closing #endif.
+ */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * args[0] / args[1] : source / destination buffers
+ * dimensions[0]     : number of elements to convert
+ * strides           : not read; both pointers advance by element size
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_longlong_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; N != 0; --N) {
+        /* Copy through locals: no alignment is asserted for src/dst here. */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        src += sizeof(npy_longlong);
+        dst += sizeof(npy_float);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * npy_longlong -> npy_double cast loop, aligned + strided variant.
+ * The `!1` operand in the guard above is 0, so the guard evaluates true
+ * and this variant is compiled whatever NPY_USE_UNALIGNED_ACCESS is.
+ *
+ * _TYPE1/_TYPE2 are the element types read and written; _CONVERT_FN is
+ * a plain C cast to the destination type.  All three are local to this
+ * block and are undefined again before the closing #endif.
+ */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * args[0] / args[1]       : source / destination buffers
+ * dimensions[0]           : number of elements to convert
+ * strides[0] / strides[1] : byte step applied to source / destination
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longlong_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Read and write in place through typed pointers. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += src_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * npy_longlong -> npy_double cast loop, strided variant that does not
+ * assume aligned pointers.  The guard above compiles this variant out
+ * when NPY_USE_UNALIGNED_ACCESS is nonzero.
+ *
+ * _TYPE1/_TYPE2 are the element types read and written; _CONVERT_FN is
+ * a plain C cast to the destination type.  All three are local to this
+ * block and are undefined again before the closing #endif.
+ */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * args[0] / args[1]       : source / destination buffers
+ * dimensions[0]           : number of elements to convert
+ * strides[0] / strides[1] : byte step applied to source / destination
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_longlong_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; N != 0; --N) {
+        /* Copy through locals: no alignment is asserted for src/dst here. */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        src += src_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Source/destination element types and the conversion for this loop. */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast loop npy_longlong -> npy_double (aligned, contiguous variant).
+ *
+ * Converts dimensions[0] elements read from args[0] and stores them to
+ * args[1].  Both pointers advance by their element size, so `strides`
+ * is not consulted; `context` is not used either.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longlong_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* Alignment is only verified when assertions are enabled. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        /* Convert directly through typed pointers, no temporaries. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += sizeof(npy_longlong);
+        dst += sizeof(npy_double);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Source/destination element types and the conversion for this loop. */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast loop npy_longlong -> npy_double (contiguous variant that does not
+ * dereference src/dst as typed pointers).
+ *
+ * Converts dimensions[0] elements read from args[0] and stores them to
+ * args[1].  Each element is copied through a local with memmove, and the
+ * pointers advance by their element size, so `strides` is not consulted;
+ * `context` is not used either.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_longlong_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 in_value;
+    _TYPE2 out_value;
+
+    for (; N != 0; N--) {
+        /* Load via memmove, convert, then store via memmove. */
+        memmove(&in_value, src, sizeof(in_value));
+        out_value = _CONVERT_FN(in_value);
+        memmove(dst, &out_value, sizeof(out_value));
+
+        src += sizeof(npy_longlong);
+        dst += sizeof(npy_double);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Source/destination element types and the conversion for this loop. */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast loop npy_longlong -> npy_longdouble (aligned, strided variant).
+ *
+ * Converts dimensions[0] elements read from args[0] and stores them to
+ * args[1], stepping the source by strides[0] bytes and the destination
+ * by strides[1] bytes per element.  `context` is not used.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longlong_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp src_stride = strides[0];
+    npy_intp dst_stride = strides[1];
+
+    /* Alignment is only verified when assertions are enabled. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        /* Convert directly through typed pointers, no temporaries. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Source/destination element types and the conversion for this loop. */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast loop npy_longlong -> npy_longdouble (strided variant that does
+ * not dereference src/dst as typed pointers).
+ *
+ * Converts dimensions[0] elements read from args[0] and stores them to
+ * args[1].  Each element is copied through a local with memmove; the
+ * source steps by strides[0] bytes and the destination by strides[1]
+ * bytes per element.  `context` is not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_longlong_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp src_stride = strides[0];
+    npy_intp dst_stride = strides[1];
+    _TYPE1 in_value;
+    _TYPE2 out_value;
+
+    for (; N != 0; N--) {
+        /* Load via memmove, convert, then store via memmove. */
+        memmove(&in_value, src, sizeof(in_value));
+        out_value = _CONVERT_FN(in_value);
+        memmove(dst, &out_value, sizeof(out_value));
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Source/destination element types and the conversion for this loop. */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast loop npy_longlong -> npy_longdouble (aligned, contiguous variant).
+ *
+ * Converts dimensions[0] elements read from args[0] and stores them to
+ * args[1].  Both pointers advance by their element size, so `strides`
+ * is not consulted; `context` is not used either.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longlong_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* Alignment is only verified when assertions are enabled. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        /* Convert directly through typed pointers, no temporaries. */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += sizeof(npy_longlong);
+        dst += sizeof(npy_longdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Source/destination element types and the conversion for this loop. */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast loop npy_longlong -> npy_longdouble (contiguous variant that does
+ * not dereference src/dst as typed pointers).
+ *
+ * Converts dimensions[0] elements read from args[0] and stores them to
+ * args[1].  Each element is copied through a local with memmove, and the
+ * pointers advance by their element size, so `strides` is not consulted;
+ * `context` is not used either.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_longlong_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 in_value;
+    _TYPE2 out_value;
+
+    for (; N != 0; N--) {
+        /* Load via memmove, convert, then store via memmove. */
+        memmove(&in_value, src, sizeof(in_value));
+        out_value = _CONVERT_FN(in_value);
+        memmove(dst, &out_value, sizeof(out_value));
+
+        src += sizeof(npy_longlong);
+        dst += sizeof(npy_longdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Source/destination component types and the conversion for this loop. */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast loop npy_longlong -> npy_cfloat (aligned, strided variant).
+ *
+ * For each of the dimensions[0] elements, the source integer is converted
+ * to npy_float and written to the first of two consecutive npy_float
+ * slots at the destination; the second slot is set to 0 (presumably the
+ * real and imaginary parts -- confirm against the npy_cfloat layout).
+ * The source steps by strides[0] bytes and the destination by strides[1]
+ * bytes per element.  `context` is not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longlong_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp src_stride = strides[0];
+    npy_intp dst_stride = strides[1];
+    _TYPE2 pair[2];
+
+    /* Alignment is only verified when assertions are enabled. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        /* Build both output components before touching dst. */
+        pair[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        pair[1] = 0;
+
+        ((_TYPE2 *)dst)[0] = pair[0];
+        ((_TYPE2 *)dst)[1] = pair[1];
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* Source/destination component types and the conversion for this loop. */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast loop npy_longlong -> npy_cfloat (strided variant that does not
+ * dereference src/dst as typed pointers).
+ *
+ * For each of the dimensions[0] elements, the source integer is loaded
+ * with memmove, converted to npy_float and placed in the first entry of
+ * a two-element npy_float buffer whose second entry is set to 0
+ * (presumably real and imaginary parts -- confirm against the npy_cfloat
+ * layout).  The whole buffer is then memmove'd to the destination.
+ * The source steps by strides[0] bytes and the destination by strides[1]
+ * bytes per element.  `context` is not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_longlong_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp src_stride = strides[0];
+    npy_intp dst_stride = strides[1];
+    _TYPE1 in_value;
+    _TYPE2 pair[2];
+
+    for (; N != 0; N--) {
+        memmove(&in_value, src, sizeof(in_value));
+
+        pair[0] = _CONVERT_FN(in_value);
+        pair[1] = 0;
+
+        /* sizeof(pair) covers both components. */
+        memmove(dst, &pair, sizeof(pair));
+
+        src += src_stride;
+        dst += dst_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Source/destination component types and the conversion for this loop. */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast loop npy_longlong -> npy_cfloat (aligned, contiguous variant).
+ *
+ * For each of the dimensions[0] elements, the source integer is converted
+ * to npy_float and written to the first of two consecutive npy_float
+ * slots at the destination; the second slot is set to 0 (presumably the
+ * real and imaginary parts -- confirm against the npy_cfloat layout).
+ * Both pointers advance by their element size, so `strides` is not
+ * consulted; `context` is not used either.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longlong_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE2 pair[2];
+
+    /* Alignment is only verified when assertions are enabled. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; N--) {
+        /* Build both output components before touching dst. */
+        pair[0] = _CONVERT_FN(*(_TYPE1 *)src);
+        pair[1] = 0;
+
+        ((_TYPE2 *)dst)[0] = pair[0];
+        ((_TYPE2 *)dst)[1] = pair[1];
+
+        src += sizeof(npy_longlong);
+        dst += sizeof(npy_cfloat);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous cast loop from npy_longlong to npy_cfloat that makes no
+ * alignment assumption: every element is moved through a local with
+ * memmove instead of being accessed through a typed pointer.
+ *
+ *   args[0], args[1]  source and destination buffers
+ *   dimensions[0]     number of elements to convert
+ *   context, strides  not read; the pointers advance by the element sizes
+ *
+ * The converted value is stored in the first npy_float of each output
+ * pair and the second npy_float is set to 0.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_longlong_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp remaining;
+
+    /*printf("_contig_cast_longlong_to_cfloat\n");*/
+
+    for (remaining = dimensions[0]; remaining != 0; remaining--) {
+        npy_longlong value;
+        npy_float parts[2];
+
+        /* Load the (possibly unaligned) source element. */
+        memmove(&value, in, sizeof(value));
+
+        /* Do the cast: first component is the value, second is zero. */
+        parts[0] = (npy_float)value;
+        parts[1] = 0;
+
+        /* Store both components to the (possibly unaligned) output. */
+        memmove(out, &parts, sizeof(parts));
+
+        out += sizeof(npy_cfloat);
+        in += sizeof(npy_longlong);
+    }
+    return 0;
+}
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Element types named by the alignment assertions and casts below. */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_double
+
+/*
+ * Strided cast loop from npy_longlong to a pair of npy_double values
+ * (the function name indicates an npy_cdouble destination) for operands
+ * that are accessed directly through typed pointers.
+ *
+ *   args[0], args[1]        source and destination buffers
+ *   dimensions[0]           number of elements to convert
+ *   strides[0], strides[1]  byte steps for source and destination
+ *   context                 not read
+ *
+ * The converted value is stored in the first component of each output
+ * pair and the second component is set to 0.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longlong_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    npy_intp N = dimensions[0];
+
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    /*printf("_aligned_cast_longlong_to_cdouble\n");*/
+
+    for (; N != 0; N--) {
+        /* Read and convert the source before touching the output. */
+        const _TYPE2 real_part = (_TYPE2)*(_TYPE1 *)src;
+        _TYPE2 *const out = (_TYPE2 *)dst;
+
+        out[0] = real_part;
+        out[1] = 0;
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided cast loop from npy_longlong to a pair of npy_double values
+ * (the function name indicates an npy_cdouble destination) that makes
+ * no alignment assumption: every element is moved through a local with
+ * memmove instead of being accessed through a typed pointer.
+ *
+ *   args[0], args[1]        source and destination buffers
+ *   dimensions[0]           number of elements to convert
+ *   strides[0], strides[1]  byte steps for source and destination
+ *   context                 not read
+ *
+ * The converted value is stored in the first component of each output
+ * pair and the second component is set to 0.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_longlong_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in = args[0];
+    char *out = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining;
+
+    /*printf("_cast_longlong_to_cdouble\n");*/
+
+    for (remaining = dimensions[0]; remaining != 0; remaining--) {
+        npy_longlong value;
+        npy_double parts[2];
+
+        /* Load the (possibly unaligned) source element. */
+        memmove(&value, in, sizeof(value));
+
+        /* Do the cast: first component is the value, second is zero. */
+        parts[0] = (npy_double)value;
+        parts[1] = 0;
+
+        /* Store both components to the (possibly unaligned) output. */
+        memmove(out, &parts, sizeof(parts));
+
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Element types named by the alignment assertions and casts below. */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_double
+
+/*
+ * Contiguous cast loop from npy_longlong to npy_cdouble for operands
+ * that are accessed directly through typed pointers.
+ *
+ *   args[0], args[1]  source and destination buffers
+ *   dimensions[0]     number of elements to convert
+ *   context, strides  not read; the pointers advance by the element sizes
+ *
+ * The converted value is stored in the first npy_double of each output
+ * pair and the second npy_double is set to 0.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longlong_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    /*printf("_aligned_contig_cast_longlong_to_cdouble\n");*/
+
+    for (; N != 0; N--) {
+        /* Read and convert the source before touching the output. */
+        const _TYPE2 real_part = (_TYPE2)*(_TYPE1 *)src;
+        _TYPE2 *const out = (_TYPE2 *)dst;
+
+        out[0] = real_part;
+        out[1] = 0;
+
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_longlong);
+    }
+    return 0;
+}
+
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous cast loop from npy_longlong to npy_cdouble that makes no
+ * alignment assumption: every element is moved through a local with
+ * memmove instead of being accessed through a typed pointer.
+ *
+ *   args[0], args[1]  source and destination buffers
+ *   dimensions[0]     number of elements to convert
+ *   context, strides  not read; the pointers advance by the element sizes
+ *
+ * The converted value is stored in the first npy_double of each output
+ * pair and the second npy_double is set to 0.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_longlong_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp remaining;
+
+    /*printf("_contig_cast_longlong_to_cdouble\n");*/
+
+    for (remaining = dimensions[0]; remaining != 0; remaining--) {
+        npy_longlong value;
+        npy_double parts[2];
+
+        /* Load the (possibly unaligned) source element. */
+        memmove(&value, in, sizeof(value));
+
+        /* Do the cast: first component is the value, second is zero. */
+        parts[0] = (npy_double)value;
+        parts[1] = 0;
+
+        /* Store both components to the (possibly unaligned) output. */
+        memmove(out, &parts, sizeof(parts));
+
+        out += sizeof(npy_cdouble);
+        in += sizeof(npy_longlong);
+    }
+    return 0;
+}
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Element types named by the alignment assertions and casts below. */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_longdouble
+
+/*
+ * Strided cast loop from npy_longlong to a pair of npy_longdouble values
+ * (the function name indicates an npy_clongdouble destination) for
+ * operands that are accessed directly through typed pointers.
+ *
+ *   args[0], args[1]        source and destination buffers
+ *   dimensions[0]           number of elements to convert
+ *   strides[0], strides[1]  byte steps for source and destination
+ *   context                 not read
+ *
+ * The converted value is stored in the first component of each output
+ * pair and the second component is set to 0.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longlong_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    npy_intp N = dimensions[0];
+
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    /*printf("_aligned_cast_longlong_to_clongdouble\n");*/
+
+    for (; N != 0; N--) {
+        /* Read and convert the source before touching the output. */
+        const _TYPE2 real_part = (_TYPE2)*(_TYPE1 *)src;
+        _TYPE2 *const out = (_TYPE2 *)dst;
+
+        out[0] = real_part;
+        out[1] = 0;
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided cast loop from npy_longlong to a pair of npy_longdouble values
+ * (the function name indicates an npy_clongdouble destination) that
+ * makes no alignment assumption: every element is moved through a local
+ * with memmove instead of being accessed through a typed pointer.
+ *
+ *   args[0], args[1]        source and destination buffers
+ *   dimensions[0]           number of elements to convert
+ *   strides[0], strides[1]  byte steps for source and destination
+ *   context                 not read
+ *
+ * The converted value is stored in the first component of each output
+ * pair and the second component is set to 0.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_longlong_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in = args[0];
+    char *out = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining;
+
+    /*printf("_cast_longlong_to_clongdouble\n");*/
+
+    for (remaining = dimensions[0]; remaining != 0; remaining--) {
+        npy_longlong value;
+        npy_longdouble parts[2];
+
+        /* Load the (possibly unaligned) source element. */
+        memmove(&value, in, sizeof(value));
+
+        /* Do the cast: first component is the value, second is zero. */
+        parts[0] = (npy_longdouble)value;
+        parts[1] = 0;
+
+        /* Store both components to the (possibly unaligned) output. */
+        memmove(out, &parts, sizeof(parts));
+
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Element types named by the alignment assertions and casts below. */
+#define _TYPE1 npy_longlong
+#define _TYPE2 npy_longdouble
+
+/*
+ * Contiguous cast loop from npy_longlong to npy_clongdouble for operands
+ * that are accessed directly through typed pointers.
+ *
+ *   args[0], args[1]  source and destination buffers
+ *   dimensions[0]     number of elements to convert
+ *   context, strides  not read; the pointers advance by the element sizes
+ *
+ * The converted value is stored in the first npy_longdouble of each
+ * output pair and the second npy_longdouble is set to 0.  Always
+ * returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longlong_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    /*printf("_aligned_contig_cast_longlong_to_clongdouble\n");*/
+
+    for (; N != 0; N--) {
+        /* Read and convert the source before touching the output. */
+        const _TYPE2 real_part = (_TYPE2)*(_TYPE1 *)src;
+        _TYPE2 *const out = (_TYPE2 *)dst;
+
+        out[0] = real_part;
+        out[1] = 0;
+
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_longlong);
+    }
+    return 0;
+}
+
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous cast loop from npy_longlong to npy_clongdouble that makes
+ * no alignment assumption: every element is moved through a local with
+ * memmove instead of being accessed through a typed pointer.
+ *
+ *   args[0], args[1]  source and destination buffers
+ *   dimensions[0]     number of elements to convert
+ *   context, strides  not read; the pointers advance by the element sizes
+ *
+ * The converted value is stored in the first npy_longdouble of each
+ * output pair and the second npy_longdouble is set to 0.  Always
+ * returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_longlong_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp remaining;
+
+    /*printf("_contig_cast_longlong_to_clongdouble\n");*/
+
+    for (remaining = dimensions[0]; remaining != 0; remaining--) {
+        npy_longlong value;
+        npy_longdouble parts[2];
+
+        /* Load the (possibly unaligned) source element. */
+        memmove(&value, in, sizeof(value));
+
+        /* Do the cast: first component is the value, second is zero. */
+        parts[0] = (npy_longdouble)value;
+        parts[1] = 0;
+
+        /* Store both components to the (possibly unaligned) output. */
+        memmove(out, &parts, sizeof(parts));
+
+        out += sizeof(npy_clongdouble);
+        in += sizeof(npy_longlong);
+    }
+    return 0;
+}
+
+#endif
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: the "&& !1" term is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the _TYPE1/_TYPE2 choices in this arm apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool  /* selected: destination element type */
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* this arm is compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1  /* selected: negated npy_half_iszero() result cast to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_cast_half_to_bool(  /* converts dimensions[0] npy_half elements at args[0] into npy_bool elements at args[1] */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !0  /* compiled: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* not compiled: src is dereferenced directly, no temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* not compiled: dst is written directly, no temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: assert npy_is_aligned() for src/_TYPE1 and dst/_TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_half_to_bool\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* nothing is emitted here: the nested "#if 0" removes the staged load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the arms below only the final one (typed load from src, typed store to dst) is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing is emitted here: the result was already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_half);
+#else  /* compiled: advance both cursors by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the per-section macros so a later section can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* section is dropped when NPY_USE_UNALIGNED_ACCESS is nonzero */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the _TYPE1/_TYPE2 choices in this arm apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool  /* selected: destination element type */
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* this arm is compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1  /* selected: negated npy_half_iszero() result cast to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_half_to_bool(  /* converts dimensions[0] npy_half elements at args[0] into npy_bool elements at args[1] */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !0  /* compiled: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* compiled: scalar temporary that memmove fills from src */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* compiled: scalar temporary that memmove copies out to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not compiled: this variant has no alignment asserts */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_half_to_bool\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled: byte-copy one element from src into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the arms below only "dst_value = _CONVERT_FN(src_value);" is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled: byte-copy dst_value to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_half);
+#else  /* compiled: advance both cursors by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the per-section macros so a later section can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: the "&& !1" term is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the _TYPE1/_TYPE2 choices in this arm apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool  /* selected: destination element type */
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* this arm is compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1  /* selected: negated npy_half_iszero() result cast to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_contig_cast_half_to_bool(  /* converts dimensions[0] npy_half elements at args[0] into npy_bool elements at args[1] */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !1  /* not compiled: the loop advances by element size, not by strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* not compiled: src is dereferenced directly, no temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* not compiled: dst is written directly, no temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: assert npy_is_aligned() for src/_TYPE1 and dst/_TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_half_to_bool\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* nothing is emitted here: the nested "#if 0" removes the staged load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the arms below only the final one (typed load from src, typed store to dst) is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing is emitted here: the result was already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* compiled: advance each cursor by the size of one element */
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the per-section macros so a later section can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* section is dropped when NPY_USE_UNALIGNED_ACCESS is nonzero */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the _TYPE1/_TYPE2 choices in this arm apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool  /* selected: destination element type */
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* this arm is compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1  /* selected: negated npy_half_iszero() result cast to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_contig_cast_half_to_bool(  /* converts dimensions[0] npy_half elements at args[0] into npy_bool elements at args[1] */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !1  /* not compiled: the loop advances by element size, not by strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* compiled: scalar temporary that memmove fills from src */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* compiled: scalar temporary that memmove copies out to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not compiled: this variant has no alignment asserts */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_half_to_bool\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled: byte-copy one element from src into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the arms below only "dst_value = _CONVERT_FN(src_value);" is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled: byte-copy dst_value to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* compiled: advance each cursor by the size of one element */
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the per-section macros so a later section can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: the "&& !1" term is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the _TYPE1/_TYPE2 choices in this arm apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte  /* selected: destination element type */
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* this arm is compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else  /* selected: npy_half_to_float() result cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_cast_half_to_ubyte(  /* converts dimensions[0] npy_half elements at args[0] into npy_ubyte elements at args[1] */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !0  /* compiled: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* not compiled: src is dereferenced directly, no temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* not compiled: dst is written directly, no temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: assert npy_is_aligned() for src/_TYPE1 and dst/_TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_half_to_ubyte\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* nothing is emitted here: the nested "#if 0" removes the staged load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the arms below only the final one (typed load from src, typed store to dst) is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing is emitted here: the result was already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_half);
+#else  /* compiled: advance both cursors by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the per-section macros so a later section can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* section is dropped when NPY_USE_UNALIGNED_ACCESS is nonzero */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the _TYPE1/_TYPE2 choices in this arm apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte  /* selected: destination element type */
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* this arm is compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else  /* selected: npy_half_to_float() result cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_half_to_ubyte(  /* converts dimensions[0] npy_half elements at args[0] into npy_ubyte elements at args[1] */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !0  /* compiled: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* compiled: scalar temporary that memmove fills from src */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* compiled: scalar temporary that memmove copies out to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not compiled: this variant has no alignment asserts */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_half_to_ubyte\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled: byte-copy one element from src into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the arms below only "dst_value = _CONVERT_FN(src_value);" is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled: byte-copy dst_value to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_half);
+#else  /* compiled: advance both cursors by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the per-section macros so a later section can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: the "&& !1" term is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the _TYPE1/_TYPE2 choices in this arm apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte  /* selected: destination element type */
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* this arm is compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else  /* selected: npy_half_to_float() result cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_contig_cast_half_to_ubyte(  /* converts dimensions[0] npy_half elements at args[0] into npy_ubyte elements at args[1] */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !1  /* not compiled: the loop advances by element size, not by strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* not compiled: src is dereferenced directly, no temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* not compiled: dst is written directly, no temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: assert npy_is_aligned() for src/_TYPE1 and dst/_TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_half_to_ubyte\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* nothing is emitted here: the nested "#if 0" removes the staged load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the arms below only the final one (typed load from src, typed store to dst) is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing is emitted here: the result was already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* compiled: advance each cursor by the size of one element */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the per-section macros so a later section can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* section is dropped when NPY_USE_UNALIGNED_ACCESS is nonzero */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the _TYPE1/_TYPE2 choices in this arm apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte  /* selected: destination element type */
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* this arm is compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else  /* selected: npy_half_to_float() result cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_contig_cast_half_to_ubyte(  /* converts dimensions[0] npy_half elements at args[0] into npy_ubyte elements at args[1] */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !1  /* not compiled: the loop advances by element size, not by strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* compiled: scalar temporary that memmove fills from src */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* compiled: scalar temporary that memmove copies out to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not compiled: this variant has no alignment asserts */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_half_to_ubyte\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* compiled: byte-copy one element from src into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the arms below only "dst_value = _CONVERT_FN(src_value);" is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* compiled: byte-copy dst_value to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* compiled: advance each cursor by the size of one element */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the per-section macros so a later section can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: the "&& !1" term is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the _TYPE1/_TYPE2 choices in this arm apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort  /* selected: destination element type */
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* this arm is compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else  /* selected: npy_half_to_float() result cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_cast_half_to_ushort(  /* converts dimensions[0] npy_half elements at args[0] into npy_ushort elements at args[1] */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over input and output */
+#if !0  /* compiled: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* not compiled: src is dereferenced directly, no temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* not compiled: dst is written directly, no temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: assert npy_is_aligned() for src/_TYPE1 and dst/_TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_half_to_ushort\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* nothing is emitted here: the nested "#if 0" removes the staged load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: of the arms below only the final one (typed load from src, typed store to dst) is compiled */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing is emitted here: the result was already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_half);
+#else  /* compiled: advance both cursors by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the per-section macros so a later section can redefine them */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* `!0` is 1, so this block is dropped whenever NPY_USE_UNALIGNED_ACCESS is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Live branch: _TYPE1 is npy_half and _TYPE2 is npy_ushort. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else /* not compiled: the condition above is the constant 1 */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Live choice is the final #else: npy_half_to_float, then a C cast to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements at args[0] into npy_ushort at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_half_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* The strides are added to char pointers below, so they are byte offsets. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+/* Scalar temporaries: elements are copied with memmove, so no alignment is assumed. */
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_half_to_ushort\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+/* The load above copies one _TYPE1 element out of src with memmove. */
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store: memmove writes dst_value to dst without assuming alignment. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the helper macros defined at the top of this block. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `!1` is 0, so the condition above is always true and this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Live branch: _TYPE1 is npy_half and _TYPE2 is npy_ushort. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else /* not compiled: the condition above is the constant 1 */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Live choice is the final #else: npy_half_to_float, then a C cast to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements at args[0] into npy_ushort at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_half_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* strides[] is not read here: the pointers advance by element size instead. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+/* No temporaries are declared: the cast reads and writes through typed pointers. */
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_half_to_ushort\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+/* The load step above is empty: the cast below dereferences src directly. */
+/* Do the cast: the live statement reads src and writes dst through typed pointers */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* The store step below is empty: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both pointers by exactly one element. */
+#if 1
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the helper macros defined at the top of this block. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* `!0` is 1, so this block is dropped whenever NPY_USE_UNALIGNED_ACCESS is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Live branch: _TYPE1 is npy_half and _TYPE2 is npy_ushort. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else /* not compiled: the condition above is the constant 1 */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Live choice is the final #else: npy_half_to_float, then a C cast to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements at args[0] into npy_ushort at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_half_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* strides[] is not read here: the pointers advance by element size instead. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+/* Scalar temporaries: elements are copied with memmove, so no alignment is assumed. */
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_half_to_ushort\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+/* The load above copies one _TYPE1 element out of src with memmove. */
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store: memmove writes dst_value to dst without assuming alignment. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both pointers by exactly one element. */
+#if 1
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the helper macros defined at the top of this block. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `!1` is 0, so the condition above is always true and this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Live branch: _TYPE1 is npy_half and _TYPE2 is npy_uint. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* not compiled: the condition above is the constant 1 */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Live choice is the final #else: npy_half_to_float, then a C cast to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements at args[0] into npy_uint at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_half_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* The strides are added to char pointers below, so they are byte offsets. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+/* No temporaries are declared: the cast reads and writes through typed pointers. */
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_half_to_uint\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+/* The load step above is empty: the cast below dereferences src directly. */
+/* Do the cast: the live statement reads src and writes dst through typed pointers */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* The store step below is empty: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the helper macros defined at the top of this block. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* `!0` is 1, so this block is dropped whenever NPY_USE_UNALIGNED_ACCESS is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Live branch: _TYPE1 is npy_half and _TYPE2 is npy_uint. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* not compiled: the condition above is the constant 1 */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Live choice is the final #else: npy_half_to_float, then a C cast to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements at args[0] into npy_uint at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_half_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* The strides are added to char pointers below, so they are byte offsets. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+/* Scalar temporaries: elements are copied with memmove, so no alignment is assumed. */
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_half_to_uint\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+/* The load above copies one _TYPE1 element out of src with memmove. */
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store: memmove writes dst_value to dst without assuming alignment. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the helper macros defined at the top of this block. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `!1` is 0, so the condition above is always true and this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Live branch: _TYPE1 is npy_half and _TYPE2 is npy_uint. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* not compiled: the condition above is the constant 1 */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Live choice is the final #else: npy_half_to_float, then a C cast to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements at args[0] into npy_uint at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_half_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* strides[] is not read here: the pointers advance by element size instead. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+/* No temporaries are declared: the cast reads and writes through typed pointers. */
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_half_to_uint\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+/* The load step above is empty: the cast below dereferences src directly. */
+/* Do the cast: the live statement reads src and writes dst through typed pointers */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* The store step below is empty: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both pointers by exactly one element. */
+#if 1
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the helper macros defined at the top of this block. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* `!0` is 1, so this block is dropped whenever NPY_USE_UNALIGNED_ACCESS is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Live branch: _TYPE1 is npy_half and _TYPE2 is npy_uint. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* not compiled: the condition above is the constant 1 */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Live choice is the final #else: npy_half_to_float, then a C cast to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements at args[0] into npy_uint at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_half_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* strides[] is not read here: the pointers advance by element size instead. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+/* Scalar temporaries: elements are copied with memmove, so no alignment is assumed. */
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_half_to_uint\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+/* The load above copies one _TYPE1 element out of src with memmove. */
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store: memmove writes dst_value to dst without assuming alignment. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both pointers by exactly one element. */
+#if 1
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the helper macros defined at the top of this block. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `!1` is 0, so the condition above is always true and this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Live branch: _TYPE1 is npy_half and _TYPE2 is npy_ulong. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else /* not compiled: the condition above is the constant 1 */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Live choice is the final #else: npy_half_to_float, then a C cast to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements at args[0] into npy_ulong at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_half_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* The strides are added to char pointers below, so they are byte offsets. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+/* No temporaries are declared: the cast reads and writes through typed pointers. */
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_half_to_ulong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+/* The load step above is empty: the cast below dereferences src directly. */
+/* Do the cast: the live statement reads src and writes dst through typed pointers */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* The store step below is empty: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the helper macros defined at the top of this block. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* `!0` is 1, so this block is dropped whenever NPY_USE_UNALIGNED_ACCESS is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Live branch: _TYPE1 is npy_half and _TYPE2 is npy_ulong. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else /* not compiled: the condition above is the constant 1 */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Live choice is the final #else: npy_half_to_float, then a C cast to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements at args[0] into npy_ulong at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_half_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* The strides are added to char pointers below, so they are byte offsets. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+/* Scalar temporaries: elements are copied with memmove, so no alignment is assumed. */
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_half_to_ulong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+/* The load above copies one _TYPE1 element out of src with memmove. */
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Store: memmove writes dst_value to dst without assuming alignment. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the helper macros defined at the top of this block. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* `!1` is 0, so the condition above is always true and this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Live branch: _TYPE1 is npy_half and _TYPE2 is npy_ulong. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else /* not compiled: the condition above is the constant 1 */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Live choice is the final #else: npy_half_to_float, then a C cast to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements at args[0] into npy_ulong at args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_half_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* strides[] is not read here: the pointers advance by element size instead. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+/* No temporaries are declared: the cast reads and writes through typed pointers. */
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_half_to_ulong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+/* The load step above is empty: the cast below dereferences src directly. */
+/* Do the cast: the live statement reads src and writes dst through typed pointers */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* The store step below is empty: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both pointers by exactly one element. */
+#if 1
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the helper macros defined at the top of this block. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned contiguous npy_half -> npy_ulong cast loop; built only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Active arms: _TYPE1 (source element) is npy_half, _TYPE2 (destination element) is npy_ulong. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+/* Dead arm: the "#if 1 || 0" above always selects the first branch. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active arm: _CONVERT_FN(x) calls npy_half_to_float(x) and casts the result to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+/* NOTE(review): if npy_ulong is a plain C integer type, casting NaN/inf/out-of-range floats to it is undefined -- confirm. */
+#elif 0
+/* Dead arm (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Dead arm: unreachable because the leading "#if 1" is taken. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements read from args[0] into npy_ulong elements written to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_half_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides[] is never read here: the stride locals above are compiled out ("#if !1"). */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* src_value/dst_value are staging copies filled and drained with memmove() in the loop. */
+#if 0
+   /* sanity check (compiled out by the "#if 0" above) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_half_to_ulong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the one live statement is "dst_value = _CONVERT_FN(src_value);" ("#  elif !0" arm) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write dst_value out to dst with memmove(). */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous case: advance each pointer by one element. */
+#if 1
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-instantiation macros so they can be defined again for the next cast loop. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned strided npy_half -> npy_ulonglong cast loop; "!1" is 0, so the guard above is always true. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Active arms: _TYPE1 (source element) is npy_half, _TYPE2 (destination element) is npy_ulonglong. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+/* Dead arm: the "#if 1 || 0" above always selects the first branch. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active arm: _CONVERT_FN(x) calls npy_half_to_float(x) and casts the result to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+/* NOTE(review): if npy_ulonglong is a plain C integer type, casting NaN/inf/out-of-range floats to it is undefined -- confirm. */
+#elif 0
+/* Dead arm (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Dead arm: unreachable because the leading "#if 1" is taken. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements read from args[0] into npy_ulonglong elements written to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_half_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* src/dst are char pointers, so the strides above are applied as byte offsets. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Both "#elif !1" arms are off: no scalar temporaries, elements are accessed through typed pointers. */
+#if 1
+   /* sanity check: src and dst must be aligned for their element types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_half_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the one live statement is the typed load/convert/store in the final "#  else" arm */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No write-back step: the live branch below is an empty "#  if 0", the store was done by the cast. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided case: advance each pointer by its stride. */
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-instantiation macros so they can be defined again for the next cast loop. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned strided npy_half -> npy_ulonglong cast loop; built only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Active arms: _TYPE1 (source element) is npy_half, _TYPE2 (destination element) is npy_ulonglong. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+/* Dead arm: the "#if 1 || 0" above always selects the first branch. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active arm: _CONVERT_FN(x) calls npy_half_to_float(x) and casts the result to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+/* NOTE(review): if npy_ulonglong is a plain C integer type, casting NaN/inf/out-of-range floats to it is undefined -- confirm. */
+#elif 0
+/* Dead arm (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Dead arm: unreachable because the leading "#if 1" is taken. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements read from args[0] into npy_ulonglong elements written to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_half_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* src/dst are char pointers, so the strides above are applied as byte offsets. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* src_value/dst_value are staging copies filled and drained with memmove() in the loop. */
+#if 0
+   /* sanity check (compiled out by the "#if 0" above) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_half_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the one live statement is "dst_value = _CONVERT_FN(src_value);" ("#  elif !0" arm) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write dst_value out to dst with memmove(). */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided case: advance each pointer by its stride. */
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-instantiation macros so they can be defined again for the next cast loop. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned contiguous npy_half -> npy_ulonglong cast loop; "!1" is 0, so the guard above is always true. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Active arms: _TYPE1 (source element) is npy_half, _TYPE2 (destination element) is npy_ulonglong. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+/* Dead arm: the "#if 1 || 0" above always selects the first branch. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active arm: _CONVERT_FN(x) calls npy_half_to_float(x) and casts the result to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+/* NOTE(review): if npy_ulonglong is a plain C integer type, casting NaN/inf/out-of-range floats to it is undefined -- confirm. */
+#elif 0
+/* Dead arm (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Dead arm: unreachable because the leading "#if 1" is taken. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements read from args[0] into npy_ulonglong elements written to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_half_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides[] is never read here: the stride locals above are compiled out ("#if !1"). */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Both "#elif !1" arms are off: no scalar temporaries, elements are accessed through typed pointers. */
+#if 1
+   /* sanity check: src and dst must be aligned for their element types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_half_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the one live statement is the typed load/convert/store in the final "#  else" arm */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No write-back step: the live branch below is an empty "#  if 0", the store was done by the cast. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous case: advance each pointer by one element. */
+#if 1
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-instantiation macros so they can be defined again for the next cast loop. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned contiguous npy_half -> npy_ulonglong cast loop; built only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Active arms: _TYPE1 (source element) is npy_half, _TYPE2 (destination element) is npy_ulonglong. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+/* Dead arm: the "#if 1 || 0" above always selects the first branch. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active arm: _CONVERT_FN(x) calls npy_half_to_float(x) and casts the result to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+/* NOTE(review): if npy_ulonglong is a plain C integer type, casting NaN/inf/out-of-range floats to it is undefined -- confirm. */
+#elif 0
+/* Dead arm (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Dead arm: unreachable because the leading "#if 1" is taken. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements read from args[0] into npy_ulonglong elements written to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_half_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides[] is never read here: the stride locals above are compiled out ("#if !1"). */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* src_value/dst_value are staging copies filled and drained with memmove() in the loop. */
+#if 0
+   /* sanity check (compiled out by the "#if 0" above) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_half_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the one live statement is "dst_value = _CONVERT_FN(src_value);" ("#  elif !0" arm) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write dst_value out to dst with memmove(). */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous case: advance each pointer by one element. */
+#if 1
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-instantiation macros so they can be defined again for the next cast loop. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned strided npy_half -> npy_byte cast loop; "!1" is 0, so the guard above is always true. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Active arms: _TYPE1 (source element) is npy_half, _TYPE2 (destination element) is npy_byte. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Dead arm: the "#if 1 || 0" above always selects the first branch. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active arm: _CONVERT_FN(x) calls npy_half_to_float(x) and casts the result to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+/* NOTE(review): if npy_byte is a plain C integer type, casting NaN/inf/out-of-range floats to it is undefined -- confirm. */
+#elif 0
+/* Dead arm (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Dead arm: unreachable because the leading "#if 1" is taken. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements read from args[0] into npy_byte elements written to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_half_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* src/dst are char pointers, so the strides above are applied as byte offsets. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Both "#elif !1" arms are off: no scalar temporaries, elements are accessed through typed pointers. */
+#if 1
+   /* sanity check: src and dst must be aligned for their element types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_half_to_byte\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the one live statement is the typed load/convert/store in the final "#  else" arm */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No write-back step: the live branch below is an empty "#  if 0", the store was done by the cast. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided case: advance each pointer by its stride. */
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-instantiation macros so they can be defined again for the next cast loop. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned strided npy_half -> npy_byte cast loop; built only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Active arms: _TYPE1 (source element) is npy_half, _TYPE2 (destination element) is npy_byte. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Dead arm: the "#if 1 || 0" above always selects the first branch. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active arm: _CONVERT_FN(x) calls npy_half_to_float(x) and casts the result to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+/* NOTE(review): if npy_byte is a plain C integer type, casting NaN/inf/out-of-range floats to it is undefined -- confirm. */
+#elif 0
+/* Dead arm (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Dead arm: unreachable because the leading "#if 1" is taken. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements read from args[0] into npy_byte elements written to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_half_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* src/dst are char pointers, so the strides above are applied as byte offsets. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* src_value/dst_value are staging copies filled and drained with memmove() in the loop. */
+#if 0
+   /* sanity check (compiled out by the "#if 0" above) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_half_to_byte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the one live statement is "dst_value = _CONVERT_FN(src_value);" ("#  elif !0" arm) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write dst_value out to dst with memmove(). */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided case: advance each pointer by its stride. */
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-instantiation macros so they can be defined again for the next cast loop. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned contiguous npy_half -> npy_byte cast loop; "!1" is 0, so the guard above is always true. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Active arms: _TYPE1 (source element) is npy_half, _TYPE2 (destination element) is npy_byte. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Dead arm: the "#if 1 || 0" above always selects the first branch. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active arm: _CONVERT_FN(x) calls npy_half_to_float(x) and casts the result to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+/* NOTE(review): if npy_byte is a plain C integer type, casting NaN/inf/out-of-range floats to it is undefined -- confirm. */
+#elif 0
+/* Dead arm (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Dead arm: unreachable because the leading "#if 1" is taken. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements read from args[0] into npy_byte elements written to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_half_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides[] is never read here: the stride locals above are compiled out ("#if !1"). */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Both "#elif !1" arms are off: no scalar temporaries, elements are accessed through typed pointers. */
+#if 1
+   /* sanity check: src and dst must be aligned for their element types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_half_to_byte\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the one live statement is the typed load/convert/store in the final "#  else" arm */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No write-back step: the live branch below is an empty "#  if 0", the store was done by the cast. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous case: advance each pointer by one element. */
+#if 1
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-instantiation macros so they can be defined again for the next cast loop. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned contiguous npy_half -> npy_byte cast loop; built only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Active arms: _TYPE1 (source element) is npy_half, _TYPE2 (destination element) is npy_byte. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Dead arm: the "#if 1 || 0" above always selects the first branch. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active arm: _CONVERT_FN(x) calls npy_half_to_float(x) and casts the result to _TYPE2. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+/* NOTE(review): if npy_byte is a plain C integer type, casting NaN/inf/out-of-range floats to it is undefined -- confirm. */
+#elif 0
+/* Dead arm (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Dead arm: unreachable because the leading "#if 1" is taken. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_half elements read from args[0] into npy_byte elements written to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_half_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides[] is never read here: the stride locals above are compiled out ("#if !1"). */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* src_value/dst_value are staging copies filled and drained with memmove() in the loop. */
+#if 0
+   /* sanity check (compiled out by the "#if 0" above) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_half_to_byte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the one live statement is "dst_value = _CONVERT_FN(src_value);" ("#  elif !0" arm) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write dst_value out to dst with memmove(). */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous case: advance each pointer by one element. */
+#if 1
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Drop the per-instantiation macros so they can be defined again for the next cast loop. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Strided npy_half -> npy_short cast loop for aligned buffers.  The
+ * guard above is always true (`!1` is 0), so this variant is built in
+ * every configuration.
+ *
+ * args[0] is the source buffer, args[1] the destination buffer,
+ * dimensions[0] the number of elements, and strides[0] / strides[1] the
+ * byte steps applied to the source / destination after each element.
+ * `context` is not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_half_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp count = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp in_step = strides[0];
+    npy_intp out_step = strides[1];
+
+    /* Both buffers must be aligned unless there is nothing to convert. */
+    assert(count == 0 || npy_is_aligned(in, NPY_ALIGNOF(npy_half)));
+    assert(count == 0 || npy_is_aligned(out, NPY_ALIGNOF(npy_short)));
+
+    for (; count != 0; --count) {
+        /* Read and write in place through typed pointers. */
+        *(npy_short *)out = (npy_short)npy_half_to_float(*(npy_half *)in);
+
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided npy_half -> npy_short cast loop that makes no alignment
+ * assumptions.  The guard above is true exactly when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to zero, so this variant is only
+ * built in that configuration.
+ *
+ * args[0] is the source buffer, args[1] the destination buffer,
+ * dimensions[0] the number of elements, and strides[0] / strides[1] the
+ * byte steps applied to the source / destination after each element.
+ * `context` is not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_half_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp count = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp in_step = strides[0];
+    npy_intp out_step = strides[1];
+    npy_half half_bits;
+    npy_short result;
+
+    for (; count != 0; --count) {
+        /* Go through local copies rather than typed pointer accesses. */
+        memmove(&half_bits, in, sizeof(half_bits));
+        result = (npy_short)npy_half_to_float(half_bits);
+        memmove(out, &result, sizeof(result));
+
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Contiguous npy_half -> npy_short cast loop for aligned buffers.  The
+ * guard above is always true (`!1` is 0), so this variant is built in
+ * every configuration.
+ *
+ * args[0] is the source buffer, args[1] the destination buffer and
+ * dimensions[0] the number of elements to convert.  `context` and
+ * `strides` are not read: both pointers advance by the element size.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_half_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp count = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+
+    /* Both buffers must be aligned unless there is nothing to convert. */
+    assert(count == 0 || npy_is_aligned(in, NPY_ALIGNOF(npy_half)));
+    assert(count == 0 || npy_is_aligned(out, NPY_ALIGNOF(npy_short)));
+
+    for (; count != 0; --count) {
+        /* Read and write in place through typed pointers. */
+        *(npy_short *)out = (npy_short)npy_half_to_float(*(npy_half *)in);
+
+        out += sizeof(npy_short);
+        in += sizeof(npy_half);
+    }
+    return 0;
+}
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous npy_half -> npy_short cast loop that makes no alignment
+ * assumptions.  The guard above is true exactly when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to zero, so this variant is only
+ * built in that configuration.
+ *
+ * args[0] is the source buffer, args[1] the destination buffer and
+ * dimensions[0] the number of elements to convert.  `context` and
+ * `strides` are not read: both pointers advance by the element size.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_half_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp count = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    npy_half half_bits;
+    npy_short result;
+
+    for (; count != 0; --count) {
+        /* Go through local copies rather than typed pointer accesses. */
+        memmove(&half_bits, in, sizeof(half_bits));
+        result = (npy_short)npy_half_to_float(half_bits);
+        memmove(out, &result, sizeof(result));
+
+        out += sizeof(npy_short);
+        in += sizeof(npy_half);
+    }
+    return 0;
+}
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Strided npy_half -> npy_int cast loop for aligned buffers.  The guard
+ * above is always true (`!1` is 0), so this variant is built in every
+ * configuration.
+ *
+ * args[0] is the source buffer, args[1] the destination buffer,
+ * dimensions[0] the number of elements, and strides[0] / strides[1] the
+ * byte steps applied to the source / destination after each element.
+ * `context` is not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_half_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp count = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp in_step = strides[0];
+    npy_intp out_step = strides[1];
+
+    /* Both buffers must be aligned unless there is nothing to convert. */
+    assert(count == 0 || npy_is_aligned(in, NPY_ALIGNOF(npy_half)));
+    assert(count == 0 || npy_is_aligned(out, NPY_ALIGNOF(npy_int)));
+
+    for (; count != 0; --count) {
+        /* Read and write in place through typed pointers. */
+        *(npy_int *)out = (npy_int)npy_half_to_float(*(npy_half *)in);
+
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided npy_half -> npy_int cast loop that makes no alignment
+ * assumptions.  The guard above is true exactly when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to zero, so this variant is only
+ * built in that configuration.
+ *
+ * args[0] is the source buffer, args[1] the destination buffer,
+ * dimensions[0] the number of elements, and strides[0] / strides[1] the
+ * byte steps applied to the source / destination after each element.
+ * `context` is not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_half_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp count = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp in_step = strides[0];
+    npy_intp out_step = strides[1];
+    npy_half half_bits;
+    npy_int result;
+
+    for (; count != 0; --count) {
+        /* Go through local copies rather than typed pointer accesses. */
+        memmove(&half_bits, in, sizeof(half_bits));
+        result = (npy_int)npy_half_to_float(half_bits);
+        memmove(out, &result, sizeof(result));
+
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Contiguous npy_half -> npy_int cast loop for aligned buffers.  The
+ * guard above is always true (`!1` is 0), so this variant is built in
+ * every configuration.
+ *
+ * args[0] is the source buffer, args[1] the destination buffer and
+ * dimensions[0] the number of elements to convert.  `context` and
+ * `strides` are not read: both pointers advance by the element size.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_half_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp count = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+
+    /* Both buffers must be aligned unless there is nothing to convert. */
+    assert(count == 0 || npy_is_aligned(in, NPY_ALIGNOF(npy_half)));
+    assert(count == 0 || npy_is_aligned(out, NPY_ALIGNOF(npy_int)));
+
+    for (; count != 0; --count) {
+        /* Read and write in place through typed pointers. */
+        *(npy_int *)out = (npy_int)npy_half_to_float(*(npy_half *)in);
+
+        out += sizeof(npy_int);
+        in += sizeof(npy_half);
+    }
+    return 0;
+}
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous npy_half -> npy_int cast loop that makes no alignment
+ * assumptions.  The guard above is true exactly when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to zero, so this variant is only
+ * built in that configuration.
+ *
+ * args[0] is the source buffer, args[1] the destination buffer and
+ * dimensions[0] the number of elements to convert.  `context` and
+ * `strides` are not read: both pointers advance by the element size.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_half_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp count = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    npy_half half_bits;
+    npy_int result;
+
+    for (; count != 0; --count) {
+        /* Go through local copies rather than typed pointer accesses. */
+        memmove(&half_bits, in, sizeof(half_bits));
+        result = (npy_int)npy_half_to_float(half_bits);
+        memmove(out, &result, sizeof(result));
+
+        out += sizeof(npy_int);
+        in += sizeof(npy_half);
+    }
+    return 0;
+}
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Strided npy_half -> npy_long cast loop for aligned buffers.  The
+ * guard above is always true (`!1` is 0), so this variant is built in
+ * every configuration.
+ *
+ * args[0] is the source buffer, args[1] the destination buffer,
+ * dimensions[0] the number of elements, and strides[0] / strides[1] the
+ * byte steps applied to the source / destination after each element.
+ * `context` is not read.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_half_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp count = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp in_step = strides[0];
+    npy_intp out_step = strides[1];
+
+    /* Both buffers must be aligned unless there is nothing to convert. */
+    assert(count == 0 || npy_is_aligned(in, NPY_ALIGNOF(npy_half)));
+    assert(count == 0 || npy_is_aligned(out, NPY_ALIGNOF(npy_long)));
+
+    for (; count != 0; --count) {
+        /* Read and write in place through typed pointers. */
+        *(npy_long *)out = (npy_long)npy_half_to_float(*(npy_half *)in);
+
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* same as !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* taken in this expansion */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else  /* taken: the source element type is npy_half */
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* taken: the destination element type is npy_long */
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else  /* taken: call npy_half_to_float, then cast the result to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_half_to_long(  /* strided npy_half -> npy_long cast loop that moves each element through memmove temporaries */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* not used */
+{
+    npy_intp N = dimensions[0];  /* elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors for reading and writing */
+#if !0  /* taken: strides are read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* taken: scalar temporary filled by memmove */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* taken: scalar temporary copied out by memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not compiled: this variant has no alignment asserts */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_half_to_long\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0  /* not compiled: typed-pointer load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* taken: byte-wise load, which does not require src to be aligned */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only the scalar temporary assignment in the #else group survives preprocessing) */
+#if 0  /* not compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* taken: convert temporary to temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* not compiled: typed-pointer store */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* taken: byte-wise store, which does not require dst to be aligned */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* not compiled: fixed element-size advance */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_half);
+#else  /* taken: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros so a later expansion can define them again */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* the inner test is always 0 because of !1, so this unit is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* taken in this expansion */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else  /* taken: the source element type is npy_half */
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* taken: the destination element type is npy_long */
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else  /* taken: call npy_half_to_float, then cast the result to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_contig_cast_half_to_long(  /* npy_half -> npy_long cast loop that steps by element size and uses typed-pointer access */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))  /* not used */
+{
+    npy_intp N = dimensions[0];  /* elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors for reading and writing */
+#if !1  /* not compiled: strides are never read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* not compiled: no source temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* not compiled: no destination temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* taken: alignment asserts are part of this variant */
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_half_to_long\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* taken, but the nested load is disabled: the load step emits nothing */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* not compiled: memmove load into a temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only the last assignment of the #else group survives preprocessing) */
+#if 0  /* not compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* taken: read src and write dst directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but the nested store is disabled: the store step emits nothing */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* not compiled: memmove store from a temporary */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* taken: step both cursors by their element sizes */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_half);
+#else  /* not compiled: stride-based advance */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros so a later expansion can define them again */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* same as !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* taken in this expansion */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else  /* taken: the source element type is npy_half */
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* taken: the destination element type is npy_long */
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else  /* taken: call npy_half_to_float, then cast the result to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_contig_cast_half_to_long(  /* npy_half -> npy_long cast loop that steps by element size and moves data through memmove temporaries */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))  /* not used */
+{
+    npy_intp N = dimensions[0];  /* elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors for reading and writing */
+#if !1  /* not compiled: strides are never read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* taken: scalar temporary filled by memmove */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* taken: scalar temporary copied out by memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not compiled: this variant has no alignment asserts */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_half_to_long\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0  /* not compiled: typed-pointer load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* taken: byte-wise load, which does not require src to be aligned */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only the scalar temporary assignment in the #else group survives preprocessing) */
+#if 0  /* not compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* taken: convert temporary to temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* not compiled: typed-pointer store */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* taken: byte-wise store, which does not require dst to be aligned */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* taken: step both cursors by their element sizes */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_half);
+#else  /* not compiled: stride-based advance */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros so a later expansion can define them again */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* the inner test is always 0 because of !1, so this unit is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* taken in this expansion */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else  /* taken: the source element type is npy_half */
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* taken: the destination element type is npy_longlong */
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else  /* taken: call npy_half_to_float, then cast the result to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_cast_half_to_longlong(  /* strided npy_half -> npy_longlong cast loop using direct typed-pointer access */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* not used */
+{
+    npy_intp N = dimensions[0];  /* elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors for reading and writing */
+#if !0  /* taken: strides are read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* not compiled: no source temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* not compiled: no destination temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* taken: alignment asserts are part of this variant */
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_half_to_longlong\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* taken, but the nested load is disabled: the load step emits nothing */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* not compiled: memmove load into a temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only the last assignment of the #else group survives preprocessing) */
+#if 0  /* not compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* taken: read src and write dst directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but the nested store is disabled: the store step emits nothing */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* not compiled: memmove store from a temporary */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* not compiled: fixed element-size advance */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_half);
+#else  /* taken: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros so a later expansion can define them again */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* same as !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* taken in this expansion */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else  /* taken: the source element type is npy_half */
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* taken: the destination element type is npy_longlong */
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else  /* taken: call npy_half_to_float, then cast the result to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_half_to_longlong(  /* strided npy_half -> npy_longlong cast loop that moves each element through memmove temporaries */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* not used */
+{
+    npy_intp N = dimensions[0];  /* elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors for reading and writing */
+#if !0  /* taken: strides are read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* taken: scalar temporary filled by memmove */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* taken: scalar temporary copied out by memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not compiled: this variant has no alignment asserts */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_half_to_longlong\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0  /* not compiled: typed-pointer load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* taken: byte-wise load, which does not require src to be aligned */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only the scalar temporary assignment in the #else group survives preprocessing) */
+#if 0  /* not compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* taken: convert temporary to temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* not compiled: typed-pointer store */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* taken: byte-wise store, which does not require dst to be aligned */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* not compiled: fixed element-size advance */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_half);
+#else  /* taken: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros so a later expansion can define them again */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* the inner test is always 0 because of !1, so this unit is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* taken in this expansion */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else  /* taken: the source element type is npy_half */
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* taken: the destination element type is npy_longlong */
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else  /* taken: call npy_half_to_float, then cast the result to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_contig_cast_half_to_longlong(  /* npy_half -> npy_longlong cast loop that steps by element size and uses typed-pointer access */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))  /* not used */
+{
+    npy_intp N = dimensions[0];  /* elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors for reading and writing */
+#if !1  /* not compiled: strides are never read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* not compiled: no source temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* not compiled: no destination temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* taken: alignment asserts are part of this variant */
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_half_to_longlong\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* taken, but the nested load is disabled: the load step emits nothing */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* not compiled: memmove load into a temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only the last assignment of the #else group survives preprocessing) */
+#if 0  /* not compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* taken: read src and write dst directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but the nested store is disabled: the store step emits nothing */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* not compiled: memmove store from a temporary */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* taken: step both cursors by their element sizes */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_half);
+#else  /* not compiled: stride-based advance */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros so a later expansion can define them again */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* same as !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* taken in this expansion */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else  /* taken: the source element type is npy_half */
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* taken: the destination element type is npy_longlong */
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else  /* taken: call npy_half_to_float, then cast the result to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not compiled */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_contig_cast_half_to_longlong(  /* npy_half -> npy_longlong cast loop that steps by element size and moves data through memmove temporaries */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))  /* not used */
+{
+    npy_intp N = dimensions[0];  /* elements still to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors for reading and writing */
+#if !1  /* not compiled: strides are never read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* taken: scalar temporary filled by memmove */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* taken: scalar temporary copied out by memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not compiled: this variant has no alignment asserts */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_half_to_longlong\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0  /* not compiled: typed-pointer load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* taken: byte-wise load, which does not require src to be aligned */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only the scalar temporary assignment in the #else group survives preprocessing) */
+#if 0  /* not compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* taken: convert temporary to temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* not compiled: typed-pointer store */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* taken: byte-wise store, which does not require dst to be aligned */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* taken: step both cursors by their element sizes */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_half);
+#else  /* not compiled: stride-based advance */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros so a later expansion can define them again */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* the inner test is always 0 because of !1, so this unit is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 1  /* taken in this expansion */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else  /* taken: the source element type is npy_half */
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* taken: the destination element type is npy_half */
+#    define _TYPE2 npy_half
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1  /* taken: identity, the npy_half value is passed through unchanged */
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1  /* not reached: the #if 1 group above was already taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_cast_half_to_half(  /* strided npy_half -> npy_half loop; the conversion macro is the identity, so values are copied unchanged */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* not used */
+{
+    npy_intp N = dimensions[0];  /* elements still to copy */
+    char *src = args[0], *dst = args[1];  /* byte cursors for reading and writing */
+#if !0  /* taken: strides are read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* not compiled: no source temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* not compiled: no destination temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* taken: alignment asserts are part of this variant */
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_half_to_half\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 1  /* taken, but the nested load is disabled: the load step emits nothing */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* not compiled: memmove load into a temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only the last assignment of the #else group survives preprocessing) */
+#if 0  /* not compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* taken: read src and write dst directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but the nested store is disabled: the store step emits nothing */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* not compiled: memmove store from a temporary */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* not compiled: fixed element-size advance */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_half);
+#else  /* taken: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros so a later expansion can define them again */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* same as !NPY_USE_UNALIGNED_ACCESS: compiled only when that macro evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 1  /* taken in this expansion */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else  /* taken: the source element type is npy_half */
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* taken: the destination element type is npy_half */
+#    define _TYPE2 npy_half
+#  endif
+
+#else  /* not compiled */
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* taken in this expansion */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1  /* taken: identity, the npy_half value is passed through unchanged */
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1  /* not reached: the #if 1 group above was already taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not compiled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_half_to_half(  /* strided npy_half -> npy_half loop through memmove temporaries; the conversion macro is the identity, so values are copied unchanged */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0]/args[1] = source/destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* not used */
+{
+    npy_intp N = dimensions[0];  /* elements still to copy */
+    char *src = args[0], *dst = args[1];  /* byte cursors for reading and writing */
+#if !0  /* taken: strides are read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* not compiled: two-element temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* taken: scalar temporary filled by memmove */
+    _TYPE1 src_value;
+#endif
+#if 0  /* not compiled: two-element temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* taken: scalar temporary copied out by memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not compiled: this variant has no alignment asserts */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_half_to_half\n");*/
+
+    while (N--) {  /* one element per iteration */
+#if 0  /* not compiled: typed-pointer load */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* taken: byte-wise load, which does not require src to be aligned */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only the scalar temporary assignment in the #else group survives preprocessing) */
+#if 0  /* not compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* taken */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* taken: assign temporary to temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* not compiled: typed-pointer store */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* taken: byte-wise store, which does not require dst to be aligned */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* not compiled: fixed element-size advance */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_half);
+#else  /* taken: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop the helper macros so a later expansion can define them again */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_half_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Contiguous loop over aligned data: no staging locals, elements go through typed pointers. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Alignment of src/dst is a precondition; it is verified only where assert() is active. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_half_to_half\n");*/
+    /* Every #if below tests a literal constant, so at most one branch of each is compiled. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* _CONVERT_FN(x) expands to (x) and _TYPE1/_TYPE2 are both npy_half here: a plain copy. */
+/* Do the cast: the branch compiled here is *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing is compiled here; the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by fixed element sizes; strides[] is not read in this variant. */
+#if 1
+        dst += sizeof(npy_half);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_half_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Contiguous loop with no alignment assumption: each element is staged in src_value/dst_value. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts below are compiled out in this variant (#if 0). */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_half_to_half\n");*/
+    /* Every #if below tests a literal constant, so at most one branch of each is compiled. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* bytewise load of one element */
+#endif
+        /* _CONVERT_FN(x) expands to (x) and _TYPE1/_TYPE2 are both npy_half here: a plain copy. */
+/* Do the cast: the branch compiled here is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: the staged result is copied out bytewise. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by fixed element sizes; strides[] is not read in this variant. */
+#if 1
+        dst += sizeof(npy_half);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_half_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Strided loop over aligned data: no staging locals, elements go through typed pointers. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Alignment of src/dst is a precondition; it is verified only where assert() is active. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_half_to_float\n");*/
+    /* Every #if below tests a literal constant, so at most one branch of each is compiled. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* _CONVERT_FN(x) expands to npy_halfbits_to_floatbits(x); _TYPE2 is npy_uint32 here. */
+/* Do the cast: the branch compiled here is *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing is compiled here; the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both cursors by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_half_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Strided loop with no alignment assumption: each element is staged in src_value/dst_value. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts below are compiled out in this variant (#if 0). */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_half_to_float\n");*/
+    /* Every #if below tests a literal constant, so at most one branch of each is compiled. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* bytewise load of one element */
+#endif
+        /* _CONVERT_FN(x) expands to npy_halfbits_to_floatbits(x); _TYPE2 is npy_uint32 here. */
+/* Do the cast: the branch compiled here is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: the staged result is copied out bytewise. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both cursors by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_half_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Contiguous loop over aligned data: no staging locals, elements go through typed pointers. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Alignment of src/dst is a precondition; it is verified only where assert() is active. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_half_to_float\n");*/
+    /* Every #if below tests a literal constant, so at most one branch of each is compiled. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* _CONVERT_FN(x) expands to npy_halfbits_to_floatbits(x); _TYPE2 is npy_uint32 here. */
+/* Do the cast: the branch compiled here is *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing is compiled here; the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by fixed element sizes; strides[] is not read in this variant. */
+#if 1
+        dst += sizeof(npy_float);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_half_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Contiguous loop with no alignment assumption: each element is staged in src_value/dst_value. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts below are compiled out in this variant (#if 0). */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_half_to_float\n");*/
+    /* Every #if below tests a literal constant, so at most one branch of each is compiled. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* bytewise load of one element */
+#endif
+        /* _CONVERT_FN(x) expands to npy_halfbits_to_floatbits(x); _TYPE2 is npy_uint32 here. */
+/* Do the cast: the branch compiled here is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: the staged result is copied out bytewise. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by fixed element sizes; strides[] is not read in this variant. */
+#if 1
+        dst += sizeof(npy_float);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_half_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Strided loop over aligned data: no staging locals, elements go through typed pointers. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Alignment of src/dst is a precondition; it is verified only where assert() is active. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_half_to_double\n");*/
+    /* Every #if below tests a literal constant, so at most one branch of each is compiled. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* _CONVERT_FN(x) expands to npy_halfbits_to_doublebits(x); _TYPE2 is npy_uint64 here. */
+/* Do the cast: the branch compiled here is *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing is compiled here; the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both cursors by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_half_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Strided loop with no alignment assumption: each element is staged in src_value/dst_value. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts below are compiled out in this variant (#if 0). */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_half_to_double\n");*/
+    /* Every #if below tests a literal constant, so at most one branch of each is compiled. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* bytewise load of one element */
+#endif
+        /* _CONVERT_FN(x) expands to npy_halfbits_to_doublebits(x); _TYPE2 is npy_uint64 here. */
+/* Do the cast: the branch compiled here is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: the staged result is copied out bytewise. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both cursors by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_half_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Contiguous loop over aligned data: no staging locals, elements go through typed pointers. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Alignment of src/dst is a precondition; it is verified only where assert() is active. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_half_to_double\n");*/
+    /* Every #if below tests a literal constant, so at most one branch of each is compiled. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* _CONVERT_FN(x) expands to npy_halfbits_to_doublebits(x); _TYPE2 is npy_uint64 here. */
+/* Do the cast: the branch compiled here is *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store: nothing is compiled here; the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by fixed element sizes; strides[] is not read in this variant. */
+#if 1
+        dst += sizeof(npy_double);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Source storage type: both integer branches are off, so npy_half. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+/* Destination storage type: the npy_uint64 branch is selected. */
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+/* Unreachable here: the enclosing condition is 1 || 0. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active group; selects npy_halfbits_to_doublebits. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Disabled group (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Disabled fallback group (an earlier branch was taken). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] items from args[0] (half) to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_half_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Unaligned variant: scalar temporaries stage each element. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts are compiled out (#if 0) in this variant. */
+#if 0
+   /* sanity check: src/dst aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Debug trace, left disabled: */
+    /*printf("_contig_cast_half_to_double\n");*/
+    /* One element per iteration: load, convert, store, advance. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* src_value now holds a bytewise copy of the source element. */
+/* Do the cast (the outer #if 0 group is never compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value to dst bytewise. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous: step by the fixed element sizes. */
+#if 1
+        dst += sizeof(npy_double);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Release the helper macros defined for this variant. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: the guard above is always true (the && !1 term is 0). */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Source storage type: both integer branches are off, so npy_half. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+/* Destination storage type: integer branches off, so npy_longdouble. */
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* Unreachable here: the enclosing condition is 1 || 0. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active group; falls to the #else: npy_half_to_float, then a cast. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Disabled group (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Disabled fallback group (an earlier branch was taken). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] items from args[0] (half) to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_half_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Aligned variant: neither temporary below is compiled in. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Alignment is asserted below; the N == 0 case is exempt. */
+#if 1
+   /* sanity check: src/dst aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Debug trace, left disabled: */
+    /*printf("_aligned_cast_half_to_longdouble\n");*/
+    /* One element per iteration: convert in place, then advance. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing was staged above: src is dereferenced in the cast. */
+/* Do the cast (the outer #if 0 group is never compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No store step: the cast wrote straight through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided: step by the caller-provided byte strides. */
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Release the helper macros defined for this variant. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Source storage type: both integer branches are off, so npy_half. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+/* Destination storage type: integer branches off, so npy_longdouble. */
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* Unreachable here: the enclosing condition is 1 || 0. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active group; falls to the #else: npy_half_to_float, then a cast. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Disabled group (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Disabled fallback group (an earlier branch was taken). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] items from args[0] (half) to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_half_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Unaligned variant: scalar temporaries stage each element. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts are compiled out (#if 0) in this variant. */
+#if 0
+   /* sanity check: src/dst aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Debug trace, left disabled: */
+    /*printf("_cast_half_to_longdouble\n");*/
+    /* One element per iteration: load, convert, store, advance. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* src_value now holds a bytewise copy of the source element. */
+/* Do the cast (the outer #if 0 group is never compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value to dst bytewise. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided: step by the caller-provided byte strides. */
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Release the helper macros defined for this variant. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: the guard above is always true (the && !1 term is 0). */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Source storage type: both integer branches are off, so npy_half. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+/* Destination storage type: integer branches off, so npy_longdouble. */
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* Unreachable here: the enclosing condition is 1 || 0. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active group; falls to the #else: npy_half_to_float, then a cast. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Disabled group (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Disabled fallback group (an earlier branch was taken). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] items from args[0] (half) to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_half_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Aligned variant: neither temporary below is compiled in. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Alignment is asserted below; the N == 0 case is exempt. */
+#if 1
+   /* sanity check: src/dst aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Debug trace, left disabled: */
+    /*printf("_aligned_contig_cast_half_to_longdouble\n");*/
+    /* One element per iteration: convert in place, then advance. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing was staged above: src is dereferenced in the cast. */
+/* Do the cast (the outer #if 0 group is never compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No store step: the cast wrote straight through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous: step by the fixed element sizes. */
+#if 1
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Release the helper macros defined for this variant. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Source storage type: both integer branches are off, so npy_half. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+/* Destination storage type: integer branches off, so npy_longdouble. */
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* Unreachable here: the enclosing condition is 1 || 0. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active group; falls to the #else: npy_half_to_float, then a cast. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Disabled group (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Disabled fallback group (an earlier branch was taken). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] items from args[0] (half) to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_half_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Unaligned variant: scalar temporaries stage each element. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts are compiled out (#if 0) in this variant. */
+#if 0
+   /* sanity check: src/dst aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Debug trace, left disabled: */
+    /*printf("_contig_cast_half_to_longdouble\n");*/
+    /* One element per iteration: load, convert, store, advance. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* src_value now holds a bytewise copy of the source element. */
+/* Do the cast (the outer #if 0 group is never compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value to dst bytewise. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous: step by the fixed element sizes. */
+#if 1
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Release the helper macros defined for this variant. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: the guard above is always true (the && !1 term is 0). */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Source storage type: both integer branches are off, so npy_half. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+/* Destination storage type: the npy_uint32 branch is selected. */
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+/* Unreachable here: the enclosing condition is 1 || 0. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active group; selects npy_halfbits_to_floatbits. */
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Disabled group (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Disabled fallback group (an earlier branch was taken). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] items from args[0] (half) to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_half_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Aligned variant: only the two-slot dst_value buffer is declared. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Alignment is asserted below; the N == 0 case is exempt. */
+#if 1
+   /* sanity check: src/dst aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Debug trace, left disabled: */
+    /*printf("_aligned_cast_half_to_cfloat\n");*/
+    /* One element per iteration: convert, store both slots, advance. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing was staged above: src is dereferenced in the cast. */
+/* Do the cast (the outer #if 0 group is never compiled) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Slot 1 was zeroed above; write both slots with direct stores. */
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided: step by the caller-provided byte strides. */
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Release the helper macros defined for this variant. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Source storage type: both integer branches are off, so npy_half. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+/* Destination storage type: the npy_uint32 branch is selected. */
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+/* Unreachable here: the enclosing condition is 1 || 0. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active group; selects npy_halfbits_to_floatbits. */
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Disabled group (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Disabled fallback group (an earlier branch was taken). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] items from args[0] (half) to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_half_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Unaligned variant: scalar src_value plus a two-slot dst_value. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts are compiled out (#if 0) in this variant. */
+#if 0
+   /* sanity check: src/dst aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Debug trace, left disabled: */
+    /*printf("_cast_half_to_cfloat\n");*/
+    /* One element per iteration: load, convert, store, advance. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* src_value now holds a bytewise copy of the source element. */
+/* Do the cast (the outer #if 0 group is never compiled) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Slot 1 was zeroed above; copy both slots to dst bytewise. */
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Strided: step by the caller-provided byte strides. */
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Release the helper macros defined for this variant. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: the guard above is always true (the && !1 term is 0). */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Source storage type: both integer branches are off, so npy_half. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+/* Destination storage type: the npy_uint32 branch is selected. */
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+/* Unreachable here: the enclosing condition is 1 || 0. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active group; selects npy_halfbits_to_floatbits. */
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Disabled group (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Disabled fallback group (an earlier branch was taken). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] items from args[0] (half) to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_half_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Aligned variant: only the two-slot dst_value buffer is declared. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Alignment is asserted below; the N == 0 case is exempt. */
+#if 1
+   /* sanity check: src/dst aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Debug trace, left disabled: */
+    /*printf("_aligned_contig_cast_half_to_cfloat\n");*/
+    /* One element per iteration: convert, store both slots, advance. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing was staged above: src is dereferenced in the cast. */
+/* Do the cast (the outer #if 0 group is never compiled) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Slot 1 was zeroed above; write both slots with direct stores. */
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous: step by the fixed element sizes. */
+#if 1
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Release the helper macros defined for this variant. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0
+/* Source storage type: both integer branches are off, so npy_half. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half
+#  endif
+/* Destination storage type: the npy_uint32 branch is selected. */
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+/* Unreachable here: the enclosing condition is 1 || 0. */
+#define _TYPE1 npy_half
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1
+/* Active group; selects npy_halfbits_to_floatbits. */
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Disabled group (#elif 0). */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Disabled fallback group (an earlier branch was taken). */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] items from args[0] (half) to args[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_half_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Unaligned variant: scalar src_value plus a two-slot dst_value. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts are compiled out (#if 0) in this variant. */
+#if 0
+   /* sanity check: src/dst aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+    /* Debug trace, left disabled: */
+    /*printf("_contig_cast_half_to_cfloat\n");*/
+    /* One element per iteration: load, convert, store, advance. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* src_value now holds a bytewise copy of the source element. */
+/* Do the cast (the outer #if 0 group is never compiled) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Slot 1 was zeroed above; copy both slots to dst bytewise. */
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous: step by the fixed element sizes. */
+#if 1
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Release the helper macros defined for this variant. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this guard is always true */
+/* Aligned, strided loop: each npy_half input yields two 64-bit slots {converted bits, 0}. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the bit-pattern type selection below is in effect */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64  /* selected: destination slots hold raw 64-bit patterns */
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* true: the first group of definitions applies */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)  /* selected; helper is defined elsewhere */
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Processes dimensions[0] elements from args[0] into args[1]; `context` is unused; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_half_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* live: per-element byte steps */
+#endif
+/* No src_value local exists in this variant: both declarations below are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* live: two output slots per element */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_half_to_cdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+/* Nothing is emitted by the load section above; the source is dereferenced directly below. */
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* live: typed read straight from src */
+#    endif
+    dst_value[1] = 0;  /* second slot is always zero (presumably the imaginary part) */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* live: typed stores of both slots */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;  /* live: advance by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always reports 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* true only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+/* Strided loop using memmove for loads/stores: each npy_half input yields two 64-bit slots {converted bits, 0}. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the bit-pattern type selection below is in effect */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64  /* selected: destination slots hold raw 64-bit patterns */
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* true: the first group of definitions applies */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)  /* selected; helper is defined elsewhere */
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Processes dimensions[0] elements from args[0] into args[1]; `context` is unused; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_half_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* live: per-element byte steps */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* live: local copy of one source element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* live: two output slots per element */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+/* The alignment asserts above are compiled out in this variant. */
+    /*printf("_cast_half_to_cdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* live: byte copy of one source element into the local */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* live: convert the copied source value */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second slot is always zero (presumably the imaginary part) */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* live: byte copy of both slots to the output */
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;  /* live: advance by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always reports 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this guard is always true */
+/* Aligned, contiguous loop: each npy_half input yields two 64-bit slots {converted bits, 0}. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the bit-pattern type selection below is in effect */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64  /* selected: destination slots hold raw 64-bit patterns */
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* true: the first group of definitions applies */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)  /* selected; helper is defined elsewhere */
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Processes dimensions[0] elements from args[0] into args[1]; `context` and `strides` are unused; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_half_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* No src_value local exists in this variant: both declarations below are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* live: two output slots per element */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_half_to_cdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+/* Nothing is emitted by the load section above; the source is dereferenced directly below. */
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* live: typed read straight from src */
+#    endif
+    dst_value[1] = 0;  /* second slot is always zero (presumably the imaginary part) */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* live: typed stores of both slots */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cdouble);  /* live: fixed element-size steps instead of strides */
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always reports 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* true only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+/* Contiguous loop using memmove for loads/stores: each npy_half input yields two 64-bit slots {converted bits, 0}. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the bit-pattern type selection below is in effect */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64  /* selected: destination slots hold raw 64-bit patterns */
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* true: the first group of definitions applies */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)  /* selected; helper is defined elsewhere */
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Processes dimensions[0] elements from args[0] into args[1]; `context` and `strides` are unused; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_half_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* live: local copy of one source element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* live: two output slots per element */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+/* The alignment asserts above are compiled out in this variant. */
+    /*printf("_contig_cast_half_to_cdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* live: byte copy of one source element into the local */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* live: convert the copied source value */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second slot is always zero (presumably the imaginary part) */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* live: byte copy of both slots to the output */
+#endif
+
+#if 1
+        dst += sizeof(npy_cdouble);  /* live: fixed element-size steps instead of strides */
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always reports 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this guard is always true */
+/* Aligned, strided loop: each npy_half input yields two npy_longdouble slots {converted value, 0}. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the type selection below is in effect */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble  /* selected: destination slot type */
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* true: the first group of definitions applies */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))  /* selected: via npy_half_to_float, then cast to _TYPE2 */
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Processes dimensions[0] elements from args[0] into args[1]; `context` is unused; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_half_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* live: per-element byte steps */
+#endif
+/* No src_value local exists in this variant: both declarations below are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* live: two output slots per element */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_half_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+/* Nothing is emitted by the load section above; the source is dereferenced directly below. */
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* live: typed read straight from src */
+#    endif
+    dst_value[1] = 0;  /* second slot is always zero (presumably the imaginary part) */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* live: typed stores of both slots */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;  /* live: advance by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always reports 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* true only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+/* Strided loop using memmove for loads/stores: each npy_half input yields two npy_longdouble slots {converted value, 0}. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the type selection below is in effect */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble  /* selected: destination slot type */
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* true: the first group of definitions applies */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))  /* selected: via npy_half_to_float, then cast to _TYPE2 */
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Processes dimensions[0] elements from args[0] into args[1]; `context` is unused; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_half_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* live: per-element byte steps */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* live: local copy of one source element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* live: two output slots per element */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+/* The alignment asserts above are compiled out in this variant. */
+    /*printf("_cast_half_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* live: byte copy of one source element into the local */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* live: convert the copied source value */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second slot is always zero (presumably the imaginary part) */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* live: byte copy of both slots to the output */
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;  /* live: advance by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always reports 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this guard is always true */
+/* Aligned, contiguous loop: each npy_half input yields two npy_longdouble slots {converted value, 0}. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the type selection below is in effect */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble  /* selected: destination slot type */
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* true: the first group of definitions applies */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))  /* selected: via npy_half_to_float, then cast to _TYPE2 */
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Processes dimensions[0] elements from args[0] into args[1]; `context` and `strides` are unused; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_half_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* No src_value local exists in this variant: both declarations below are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* live: two output slots per element */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_half_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+/* Nothing is emitted by the load section above; the source is dereferenced directly below. */
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* live: typed read straight from src */
+#    endif
+    dst_value[1] = 0;  /* second slot is always zero (presumably the imaginary part) */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* live: typed stores of both slots */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_clongdouble);  /* live: fixed element-size steps instead of strides */
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always reports 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* true only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+/* Contiguous loop using memmove for loads/stores: each npy_half input yields two npy_longdouble slots {converted value, 0}. */
+/* For half types, don't use actual double/float types in conversion */
+#if 1 || 0  /* true: the type selection below is in effect */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_half  /* selected: source element type */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble  /* selected: destination slot type */
+#  endif
+
+#else
+
+#define _TYPE1 npy_half
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 1  /* true: the first group of definitions applies */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))  /* selected: via npy_half_to_float, then cast to _TYPE2 */
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Processes dimensions[0] elements from args[0] into args[1]; `context` and `strides` are unused; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_half_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* live: local copy of one source element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* live: two output slots per element */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+/* The alignment asserts above are compiled out in this variant. */
+    /*printf("_contig_cast_half_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* live: byte copy of one source element into the local */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* live: convert the copied source value */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second slot is always zero (presumably the imaginary part) */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* live: byte copy of both slots to the output */
+#endif
+
+#if 1
+        dst += sizeof(npy_clongdouble);  /* live: fixed element-size steps instead of strides */
+        src += sizeof(npy_half);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always reports 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this guard is always true */
+/* Aligned, strided loop: each npy_float input is stored as one npy_bool holding (value != 0). */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the plain definitions in the #else branch are used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* selected: source element type */
+#define _TYPE2 npy_bool  /* selected: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false, as is the #elif below: the final #else group applies */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))  /* selected: nonzero test cast to npy_bool */
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Processes dimensions[0] elements from args[0] into args[1]; `context` is unused; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_float_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte pointers */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* live: per-element byte steps */
+#endif
+/* No src_value or dst_value local exists in this variant: all four declarations below are compiled out. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_float_to_bool\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+/* Nothing is emitted by the load section above; the cast below reads and writes memory directly. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* live: typed read, convert, typed store in one statement */
+#  endif
+#endif
+/* The store section below emits nothing here: the result was already written by the cast. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;  /* live: advance by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path: always reports 0 */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * npy_float -> npy_bool, strided, no alignment assumption.
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant
+ * is compiled only when that setting evaluates to zero.
+ */
+
+/* Element types read from the source and written to the destination. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_bool
+
+/* Conversion to bool: 1 when the value compares unequal to 0, else 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Converts dimensions[0] elements from args[0] to args[1], stepping the
+ * two pointers by strides[0] and strides[1] bytes per element.  Every
+ * element is copied into and out of typed locals with memmove instead of
+ * being dereferenced in place.  `context` is not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_float_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+
+    for (; remaining-- != 0; in += in_step, out += out_step) {
+        _TYPE1 loaded;
+        _TYPE2 converted;
+
+        memmove(&loaded, in, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out, &converted, sizeof(converted));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * npy_float -> npy_bool, contiguous, aligned.
+ * `!1` is 0, so the guard above is true whatever the value of
+ * NPY_USE_UNALIGNED_ACCESS.
+ */
+
+/* Element types read from the source and written to the destination. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_bool
+
+/* Conversion to bool: 1 when the value compares unequal to 0, else 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Converts dimensions[0] elements from args[0] to args[1].  Elements are
+ * accessed directly through typed pointers and both buffers are walked
+ * one element size at a time; `strides` and `context` are not used.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_float_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check: both pointers are dereferenced directly below */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N-- != 0; src += sizeof(npy_float), dst += sizeof(npy_bool)) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * npy_float -> npy_bool, contiguous, no alignment assumption.
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant
+ * is compiled only when that setting evaluates to zero.
+ */
+
+/* Element types read from the source and written to the destination. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_bool
+
+/* Conversion to bool: 1 when the value compares unequal to 0, else 0. */
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Converts dimensions[0] elements from args[0] to args[1], walking both
+ * buffers one element size at a time.  Every element is copied into and
+ * out of typed locals with memmove instead of being dereferenced in
+ * place.  `strides` and `context` are not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_float_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+
+    for (; remaining-- != 0;
+            in += sizeof(npy_float), out += sizeof(npy_bool)) {
+        _TYPE1 loaded;
+        _TYPE2 converted;
+
+        memmove(&loaded, in, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out, &converted, sizeof(converted));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * npy_float -> npy_ubyte, strided, aligned.
+ * `!1` is 0, so the guard above is true whatever the value of
+ * NPY_USE_UNALIGNED_ACCESS.
+ */
+
+/* Element types read from the source and written to the destination. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ubyte
+
+/* Conversion is a plain C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements from args[0] to args[1], stepping the
+ * two pointers by strides[0] and strides[1] bytes per element.  Elements
+ * are accessed directly through typed pointers.  `context` is not used.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_float_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check: both pointers are dereferenced directly below */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N-- != 0; src += src_step, dst += dst_step) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * npy_float -> npy_ubyte, strided, no alignment assumption.
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant
+ * is compiled only when that setting evaluates to zero.
+ */
+
+/* Element types read from the source and written to the destination. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ubyte
+
+/* Conversion is a plain C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements from args[0] to args[1], stepping the
+ * two pointers by strides[0] and strides[1] bytes per element.  Every
+ * element is copied into and out of typed locals with memmove instead of
+ * being dereferenced in place.  `context` is not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_float_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+
+    for (; remaining-- != 0; in += in_step, out += out_step) {
+        _TYPE1 loaded;
+        _TYPE2 converted;
+
+        memmove(&loaded, in, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out, &converted, sizeof(converted));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * npy_float -> npy_ubyte, contiguous, aligned.
+ * `!1` is 0, so the guard above is true whatever the value of
+ * NPY_USE_UNALIGNED_ACCESS.
+ */
+
+/* Element types read from the source and written to the destination. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ubyte
+
+/* Conversion is a plain C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements from args[0] to args[1].  Elements are
+ * accessed directly through typed pointers and both buffers are walked
+ * one element size at a time; `strides` and `context` are not used.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_float_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check: both pointers are dereferenced directly below */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N-- != 0; src += sizeof(npy_float), dst += sizeof(npy_ubyte)) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * npy_float -> npy_ubyte, contiguous, no alignment assumption.
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant
+ * is compiled only when that setting evaluates to zero.
+ */
+
+/* Element types read from the source and written to the destination. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ubyte
+
+/* Conversion is a plain C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements from args[0] to args[1], walking both
+ * buffers one element size at a time.  Every element is copied into and
+ * out of typed locals with memmove instead of being dereferenced in
+ * place.  `strides` and `context` are not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_float_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+
+    for (; remaining-- != 0;
+            in += sizeof(npy_float), out += sizeof(npy_ubyte)) {
+        _TYPE1 loaded;
+        _TYPE2 converted;
+
+        memmove(&loaded, in, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out, &converted, sizeof(converted));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * npy_float -> npy_ushort, strided, aligned.
+ * `!1` is 0, so the guard above is true whatever the value of
+ * NPY_USE_UNALIGNED_ACCESS.
+ */
+
+/* Element types read from the source and written to the destination. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ushort
+
+/* Conversion is a plain C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements from args[0] to args[1], stepping the
+ * two pointers by strides[0] and strides[1] bytes per element.  Elements
+ * are accessed directly through typed pointers.  `context` is not used.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_float_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check: both pointers are dereferenced directly below */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N-- != 0; src += src_step, dst += dst_step) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * npy_float -> npy_ushort, strided, no alignment assumption.
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant
+ * is compiled only when that setting evaluates to zero.
+ */
+
+/* Element types read from the source and written to the destination. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ushort
+
+/* Conversion is a plain C cast to the destination type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements from args[0] to args[1], stepping the
+ * two pointers by strides[0] and strides[1] bytes per element.  Every
+ * element is copied into and out of typed locals with memmove instead of
+ * being dereferenced in place.  `context` is not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_float_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp remaining = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+
+    for (; remaining-- != 0; in += in_step, out += out_step) {
+        _TYPE1 loaded;
+        _TYPE2 converted;
+
+        memmove(&loaded, in, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out, &converted, sizeof(converted));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* float -> ushort, aligned contiguous loop (always built). Half casts use raw-bit ints, not float/double: off here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Pick the conversion macro: half-specific chains and the bool form are off, so _CONVERT_FN(x) is the C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_float_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 respectively */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_contig_cast_float_to_ushort\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the final #else -- read *(_TYPE1 *)src, convert, write *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* float -> ushort, contiguous memmove loop (built only if NPY_USE_UNALIGNED_ACCESS is 0). Half casts use raw-bit ints, not float/double: off here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Pick the conversion macro: half-specific chains and the bool form are off, so _CONVERT_FN(x) is the C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_float_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: alignment asserts, compiled out (#if 0) in this memmove-based variant */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_contig_cast_float_to_ushort\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the "#elif !0" under the outer #else -- dst_value = _CONVERT_FN(src_value) on local copies */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* float -> uint, aligned strided loop (always built). Half casts use raw-bit ints, not float/double: off here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Pick the conversion macro: half-specific chains and the bool form are off, so _CONVERT_FN(x) is the C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_float_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 respectively */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_cast_float_to_uint\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the final #else -- read *(_TYPE1 *)src, convert, write *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* float -> uint, strided memmove loop (built only if NPY_USE_UNALIGNED_ACCESS is 0). Half casts use raw-bit ints, not float/double: off here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Pick the conversion macro: half-specific chains and the bool form are off, so _CONVERT_FN(x) is the C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_float_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: alignment asserts, compiled out (#if 0) in this memmove-based variant */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_cast_float_to_uint\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the "#elif !0" under the outer #else -- dst_value = _CONVERT_FN(src_value) on local copies */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* float -> uint, aligned contiguous loop (always built). Half casts use raw-bit ints, not float/double: off here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Pick the conversion macro: half-specific chains and the bool form are off, so _CONVERT_FN(x) is the C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_float_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 respectively */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_contig_cast_float_to_uint\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the final #else -- read *(_TYPE1 *)src, convert, write *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* float -> uint, contiguous memmove loop (built only if NPY_USE_UNALIGNED_ACCESS is 0). Half casts use raw-bit ints, not float/double: off here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Pick the conversion macro: half-specific chains and the bool form are off, so _CONVERT_FN(x) is the C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_float_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: alignment asserts, compiled out (#if 0) in this memmove-based variant */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_contig_cast_float_to_uint\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the "#elif !0" under the outer #else -- dst_value = _CONVERT_FN(src_value) on local copies */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* float -> ulong, aligned strided loop (always built). Half casts use raw-bit ints, not float/double: off here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Pick the conversion macro: half-specific chains and the bool form are off, so _CONVERT_FN(x) is the C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_float_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 respectively */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_cast_float_to_ulong\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the final #else -- read *(_TYPE1 *)src, convert, write *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* float -> ulong, strided memmove loop (built only if NPY_USE_UNALIGNED_ACCESS is 0). Half casts use raw-bit ints, not float/double: off here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Pick the conversion macro: half-specific chains and the bool form are off, so _CONVERT_FN(x) is the C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_float_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: alignment asserts, compiled out (#if 0) in this memmove-based variant */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_cast_float_to_ulong\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the "#elif !0" under the outer #else -- dst_value = _CONVERT_FN(src_value) on local copies */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* float -> ulong, aligned contiguous loop (always built). Half casts use raw-bit ints, not float/double: off here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Pick the conversion macro: half-specific chains and the bool form are off, so _CONVERT_FN(x) is the C cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_float_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 respectively */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_contig_cast_float_to_ulong\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live arm is the final #else -- read *(_TYPE1 *)src, convert, write *(_TYPE2 *)dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: built only when NPY_USE_UNALIGNED_ACCESS is false */
+
+/* For half types, don't use actual double/float types in conversion (disabled here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else /* active branch: source is npy_float, destination is npy_ulong */
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two branches are disabled (#if 0 / #elif 0) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cast loop: converts dimensions[0] npy_float items at args[0] to npy_ulong at args[1] via memmove temporaries; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_float_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous loop: strides[] is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: scalar temporary filled by memmove */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: scalar temporary drained by memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out of this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_float_to_ulong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-copy the source item into the temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* NOTE(review): a C float-to-integer cast is undefined when the truncated value does not fit -- confirm callers accept this */
+/* Do the cast (the only active statement below is: dst_value = _CONVERT_FN(src_value)) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-copy the converted value out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous: advance each pointer by its element size */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true: the aligned variant is built unconditionally */
+
+/* For half types, don't use actual double/float types in conversion (disabled here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else /* active branch: source is npy_float, destination is npy_ulonglong */
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two branches are disabled (#if 0 / #elif 0) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided cast loop: converts dimensions[0] npy_float items at args[0] to npy_ulonglong at args[1] through typed pointers, stepping by strides[0]/strides[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_float_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided loop: read the per-item byte strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* inactive: the aligned variant uses no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* inactive: the aligned variant uses no destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* aligned variant: alignment asserts enabled */
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_float_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1 /* aligned: no preload step (the inner branch is disabled) */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* NOTE(review): a C float-to-integer cast is undefined when the truncated value does not fit -- confirm callers accept this */
+/* Do the cast (the only active statement below is the final typed-pointer load, convert, store) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned: nothing to copy back, the cast stored directly into dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_float);
+#else /* active: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: built only when NPY_USE_UNALIGNED_ACCESS is false */
+
+/* For half types, don't use actual double/float types in conversion (disabled here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else /* active branch: source is npy_float, destination is npy_ulonglong */
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two branches are disabled (#if 0 / #elif 0) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cast loop: converts dimensions[0] npy_float items at args[0] to npy_ulonglong at args[1] via memmove temporaries, stepping by strides[0]/strides[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_float_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided loop: read the per-item byte strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: scalar temporary filled by memmove */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: scalar temporary drained by memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out of this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_float_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-copy the source item into the temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* NOTE(review): a C float-to-integer cast is undefined when the truncated value does not fit -- confirm callers accept this */
+/* Do the cast (the only active statement below is: dst_value = _CONVERT_FN(src_value)) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-copy the converted value out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_float);
+#else /* active: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true: the aligned variant is built unconditionally */
+
+/* For half types, don't use actual double/float types in conversion (disabled here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else /* active branch: source is npy_float, destination is npy_ulonglong */
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two branches are disabled (#if 0 / #elif 0) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous cast loop: converts dimensions[0] npy_float items at args[0] to npy_ulonglong at args[1] through typed pointers; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_float_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous loop: strides[] is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* inactive: the aligned variant uses no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* inactive: the aligned variant uses no destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* aligned variant: alignment asserts enabled */
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_float_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1 /* aligned: no preload step (the inner branch is disabled) */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* NOTE(review): a C float-to-integer cast is undefined when the truncated value does not fit -- confirm callers accept this */
+/* Do the cast (the only active statement below is the final typed-pointer load, convert, store) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned: nothing to copy back, the cast stored directly into dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous: advance each pointer by its element size */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: built only when NPY_USE_UNALIGNED_ACCESS is false */
+
+/* For half types, don't use actual double/float types in conversion (disabled here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else /* active branch: source is npy_float, destination is npy_ulonglong */
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two branches are disabled (#if 0 / #elif 0) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cast loop: converts dimensions[0] npy_float items at args[0] to npy_ulonglong at args[1] via memmove temporaries; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_float_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous loop: strides[] is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: scalar temporary filled by memmove */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: scalar temporary drained by memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out of this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_float_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-copy the source item into the temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* NOTE(review): a C float-to-integer cast is undefined when the truncated value does not fit -- confirm callers accept this */
+/* Do the cast (the only active statement below is: dst_value = _CONVERT_FN(src_value)) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-copy the converted value out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous: advance each pointer by its element size */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true: the aligned variant is built unconditionally */
+
+/* For half types, don't use actual double/float types in conversion (disabled here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else /* active branch: source is npy_float, destination is npy_byte */
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two branches are disabled (#if 0 / #elif 0) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided cast loop: converts dimensions[0] npy_float items at args[0] to npy_byte at args[1] through typed pointers, stepping by strides[0]/strides[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_float_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided loop: read the per-item byte strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* inactive: the aligned variant uses no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* inactive: the aligned variant uses no destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* aligned variant: alignment asserts enabled */
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_float_to_byte\n");*/
+
+    while (N--) {
+#if 1 /* aligned: no preload step (the inner branch is disabled) */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* NOTE(review): a C float-to-integer cast is undefined when the truncated value does not fit -- confirm callers accept this */
+/* Do the cast (the only active statement below is the final typed-pointer load, convert, store) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned: nothing to copy back, the cast stored directly into dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_float);
+#else /* active: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: built only when NPY_USE_UNALIGNED_ACCESS is false */
+
+/* For half types, don't use actual double/float types in conversion (disabled here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else /* active branch: source is npy_float, destination is npy_byte */
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two branches are disabled (#if 0 / #elif 0) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cast loop: converts dimensions[0] npy_float items at args[0] to npy_byte at args[1] via memmove temporaries, stepping by strides[0]/strides[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_float_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided loop: read the per-item byte strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: scalar temporary filled by memmove */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: scalar temporary drained by memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out of this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_float_to_byte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-copy the source item into the temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* NOTE(review): a C float-to-integer cast is undefined when the truncated value does not fit -- confirm callers accept this */
+/* Do the cast (the only active statement below is: dst_value = _CONVERT_FN(src_value)) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-copy the converted value out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_float);
+#else /* active: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true: the aligned variant is built unconditionally */
+
+/* For half types, don't use actual double/float types in conversion (disabled here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else /* active branch: source is npy_float, destination is npy_byte */
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two branches are disabled (#if 0 / #elif 0) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous cast loop: converts dimensions[0] npy_float items at args[0] to npy_byte at args[1] through typed pointers; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_float_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous loop: strides[] is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* inactive: the aligned variant uses no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* inactive: the aligned variant uses no destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* aligned variant: alignment asserts enabled */
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_float_to_byte\n");*/
+
+    while (N--) {
+#if 1 /* aligned: no preload step (the inner branch is disabled) */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* NOTE(review): a C float-to-integer cast is undefined when the truncated value does not fit -- confirm callers accept this */
+/* Do the cast (the only active statement below is the final typed-pointer load, convert, store) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned: nothing to copy back, the cast stored directly into dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous: advance each pointer by its element size */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: built only when NPY_USE_UNALIGNED_ACCESS is false */
+
+/* For half types, don't use actual double/float types in conversion (disabled here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else /* active branch: source is npy_float, destination is npy_byte */
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function; the first two branches are disabled (#if 0 / #elif 0) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cast loop: converts dimensions[0] npy_float items at args[0] to npy_byte at args[1] via memmove temporaries; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_float_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous loop: strides[] is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: scalar temporary filled by memmove */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: scalar temporary drained by memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out of this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_float_to_byte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-copy the source item into the temporary */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* NOTE(review): a C float-to-integer cast is undefined when the truncated value does not fit -- confirm callers accept this */
+/* Do the cast (the only active statement below is: dst_value = _CONVERT_FN(src_value)) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-copy the converted value out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous: advance each pointer by its element size */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* strided npy_float -> npy_short cast loop for aligned operands; always returns 0 */
+_aligned_cast_float_to_short(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/[1] = src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over the input and the output */
+#if !0  /* enabled: this variant steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* disabled: src is dereferenced in place, so no temporary is declared */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* disabled: results are written straight through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* enabled: assert operand alignment before the loop */
+   /* Sanity check: src and dst must be aligned for their element types unless N == 0. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_float_to_short\n");*/
+
+    while (N--) {
+#if 1  /* enabled, but its nested block is off: no separate load step is compiled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one assignment in this #if/#else tree survives preprocessing; the rest is dead. */
+#if 0  /* disabled: conversions that read a two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* enabled: single-value source */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* live branch: read *src as _TYPE1, convert, store through dst as _TYPE2 */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* enabled, but its nested block is off: no separate store step is compiled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* disabled: fixed element-size steps */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_float);
+#else  /* live branch: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* strided npy_float -> npy_short cast loop that moves each element through temporaries via memmove; always returns 0 */
+_cast_float_to_short(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/[1] = src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over the input and the output */
+#if !0  /* enabled: this variant steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* enabled: scalar temporary each source element is copied into */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* enabled: scalar temporary holding the converted value until it is copied out */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* disabled: this variant makes no alignment assertions */
+   /* Sanity check (compiled out here): src/dst alignment unless N == 0. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_float_to_short\n");*/
+
+    while (N--) {
+#if 0  /* disabled: direct typed loads */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* live branch: copy the element bytes into src_value rather than dereferencing src as _TYPE1 */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one assignment in this #if/#else tree survives preprocessing; the rest is dead. */
+#if 0  /* disabled: conversions that read a two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* enabled: single-value source */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* live branch: convert the loaded temporary into dst_value */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* disabled: direct typed stores */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* live branch: copy the converted bytes out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* disabled: fixed element-size steps */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_float);
+#else  /* live branch: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* npy_float -> npy_short cast loop for aligned operands that steps by element size; always returns 0 */
+_aligned_contig_cast_float_to_short(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over the input and the output */
+#if !1  /* disabled: strides are not read; the loop steps by element size */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* disabled: src is dereferenced in place, so no temporary is declared */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* disabled: results are written straight through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* enabled: assert operand alignment before the loop */
+   /* Sanity check: src and dst must be aligned for their element types unless N == 0. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_float_to_short\n");*/
+
+    while (N--) {
+#if 1  /* enabled, but its nested block is off: no separate load step is compiled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one assignment in this #if/#else tree survives preprocessing; the rest is dead. */
+#if 0  /* disabled: conversions that read a two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* enabled: single-value source */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* live branch: read *src as _TYPE1, convert, store through dst as _TYPE2 */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* enabled, but its nested block is off: no separate store step is compiled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* live branch: step each cursor by its element size */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_float);
+#else  /* disabled: stride-based steps */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* npy_float -> npy_short cast loop that steps by element size and moves each element through temporaries via memmove; always returns 0 */
+_contig_cast_float_to_short(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over the input and the output */
+#if !1  /* disabled: strides are not read; the loop steps by element size */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* enabled: scalar temporary each source element is copied into */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* enabled: scalar temporary holding the converted value until it is copied out */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* disabled: this variant makes no alignment assertions */
+   /* Sanity check (compiled out here): src/dst alignment unless N == 0. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_float_to_short\n");*/
+
+    while (N--) {
+#if 0  /* disabled: direct typed loads */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* live branch: copy the element bytes into src_value rather than dereferencing src as _TYPE1 */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one assignment in this #if/#else tree survives preprocessing; the rest is dead. */
+#if 0  /* disabled: conversions that read a two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* enabled: single-value source */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* live branch: convert the loaded temporary into dst_value */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* disabled: direct typed stores */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* live branch: copy the converted bytes out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* live branch: step each cursor by its element size */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_float);
+#else  /* disabled: stride-based steps */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* strided npy_float -> npy_int cast loop for aligned operands; always returns 0 */
+_aligned_cast_float_to_int(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/[1] = src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over the input and the output */
+#if !0  /* enabled: this variant steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* disabled: src is dereferenced in place, so no temporary is declared */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* disabled: results are written straight through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* enabled: assert operand alignment before the loop */
+   /* Sanity check: src and dst must be aligned for their element types unless N == 0. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_float_to_int\n");*/
+
+    while (N--) {
+#if 1  /* enabled, but its nested block is off: no separate load step is compiled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one assignment in this #if/#else tree survives preprocessing; the rest is dead. */
+#if 0  /* disabled: conversions that read a two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* enabled: single-value source */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* live branch: read *src as _TYPE1, convert, store through dst as _TYPE2 */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* enabled, but its nested block is off: no separate store step is compiled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* disabled: fixed element-size steps */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_float);
+#else  /* live branch: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* strided npy_float -> npy_int cast loop that moves each element through temporaries via memmove; always returns 0 */
+_cast_float_to_int(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/[1] = src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over the input and the output */
+#if !0  /* enabled: this variant steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* enabled: scalar temporary each source element is copied into */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* enabled: scalar temporary holding the converted value until it is copied out */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* disabled: this variant makes no alignment assertions */
+   /* Sanity check (compiled out here): src/dst alignment unless N == 0. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_float_to_int\n");*/
+
+    while (N--) {
+#if 0  /* disabled: direct typed loads */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* live branch: copy the element bytes into src_value rather than dereferencing src as _TYPE1 */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one assignment in this #if/#else tree survives preprocessing; the rest is dead. */
+#if 0  /* disabled: conversions that read a two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* enabled: single-value source */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* live branch: convert the loaded temporary into dst_value */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* disabled: direct typed stores */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* live branch: copy the converted bytes out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* disabled: fixed element-size steps */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_float);
+#else  /* live branch: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* npy_float -> npy_int cast loop for aligned operands that steps by element size; always returns 0 */
+_aligned_contig_cast_float_to_int(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over the input and the output */
+#if !1  /* disabled: strides are not read; the loop steps by element size */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* disabled: src is dereferenced in place, so no temporary is declared */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* disabled: results are written straight through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* enabled: assert operand alignment before the loop */
+   /* Sanity check: src and dst must be aligned for their element types unless N == 0. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_float_to_int\n");*/
+
+    while (N--) {
+#if 1  /* enabled, but its nested block is off: no separate load step is compiled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one assignment in this #if/#else tree survives preprocessing; the rest is dead. */
+#if 0  /* disabled: conversions that read a two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* enabled: single-value source */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* live branch: read *src as _TYPE1, convert, store through dst as _TYPE2 */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* enabled, but its nested block is off: no separate store step is compiled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* live branch: step each cursor by its element size */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_float);
+#else  /* disabled: stride-based steps */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* npy_float -> npy_int cast loop that steps by element size and moves each element through temporaries via memmove; always returns 0 */
+_contig_cast_float_to_int(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over the input and the output */
+#if !1  /* disabled: strides are not read; the loop steps by element size */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !0  /* enabled: scalar temporary each source element is copied into */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !0  /* enabled: scalar temporary holding the converted value until it is copied out */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* disabled: this variant makes no alignment assertions */
+   /* Sanity check (compiled out here): src/dst alignment unless N == 0. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_float_to_int\n");*/
+
+    while (N--) {
+#if 0  /* disabled: direct typed loads */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* live branch: copy the element bytes into src_value rather than dereferencing src as _TYPE1 */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one assignment in this #if/#else tree survives preprocessing; the rest is dead. */
+#if 0  /* disabled: conversions that read a two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* enabled: single-value source */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* live branch: convert the loaded temporary into dst_value */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* disabled: direct typed stores */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* live branch: copy the converted bytes out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* live branch: step each cursor by its element size */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_float);
+#else  /* disabled: stride-based steps */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* strided npy_float -> npy_long cast loop for aligned operands; always returns 0 */
+_aligned_cast_float_to_long(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read here; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/[1] = src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors over the input and the output */
+#if !0  /* enabled: this variant steps by the caller-supplied strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0  /* disabled: two-element source temporary */
+    _TYPE1 src_value[2];
+#elif !1  /* disabled: src is dereferenced in place, so no temporary is declared */
+    _TYPE1 src_value;
+#endif
+#if 0  /* disabled: two-element destination temporary */
+    _TYPE2 dst_value[2];
+#elif !1  /* disabled: results are written straight through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* enabled: assert operand alignment before the loop */
+   /* Sanity check: src and dst must be aligned for their element types unless N == 0. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_float_to_long\n");*/
+
+    while (N--) {
+#if 1  /* enabled, but its nested block is off: no separate load step is compiled */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: exactly one assignment in this #if/#else tree survives preprocessing; the rest is dead. */
+#if 0  /* disabled: conversions that read a two-element source */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* enabled: single-value source */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* live branch: read *src as _TYPE1, convert, store through dst as _TYPE2 */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* enabled, but its nested block is off: no separate store step is compiled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* disabled: fixed element-size steps */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_float);
+#else  /* live branch: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* equivalent to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this pair: the two defines in the #else branch are used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* active */
+
+#define _TYPE1 npy_float  /* type used to read each source element */
+#define _TYPE2 npy_long  /* type used to write each destination element */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: every definition in this group is skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: every definition in this group is skipped, including the "#  if 1" one */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active group */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active */
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* conversion is a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_float_to_long(  /* strided loop; each element is moved with memmove and converted with _CONVERT_FN */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] is the input, args[1] the output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides[0]/strides[1] are byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !0  /* true: this variant reads the strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active */
+    _TYPE1 src_value;  /* local copy of one input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active */
+    _TYPE2 dst_value;  /* converted element before it is copied out */
+#endif
+
+#if 0  /* false: no alignment asserts in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_float_to_long\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active */
+        memmove(&src_value, src, sizeof(src_value));  /* copy one input element's bytes into the local */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active group */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active */
+        memmove(dst, &dst_value, sizeof(dst_value));  /* copy the converted element's bytes to the output */
+#endif
+
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_float);
+#else  /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are removed once the function is defined */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this pair: the two defines in the #else branch are used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* active */
+
+#define _TYPE1 npy_float  /* type used to read each source element */
+#define _TYPE2 npy_long  /* type used to write each destination element */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: every definition in this group is skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: every definition in this group is skipped, including the "#  if 1" one */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active group */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active */
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* conversion is a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_contig_cast_float_to_long(  /* element-size-stepped loop; reads and writes through typed pointers */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] is the input, args[1] the output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))  /* not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !1  /* false: no stride variables in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no local input copy in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no local output copy in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: alignment checks are compiled in (they only fire if assert is enabled) */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_float_to_long\n");*/
+
+    while (N--) {
+#if 1  /* true, but the nested #if 0 removes the body: nothing is pre-loaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active group */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* read, convert and write one element through typed pointers */
+#  endif
+#endif
+
+#if 1  /* true, but the nested #if 0 removes the body: the result was already written above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance each cursor by its element size */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are removed once the function is defined */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* equivalent to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this pair: the two defines in the #else branch are used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* active */
+
+#define _TYPE1 npy_float  /* type used to read each source element */
+#define _TYPE2 npy_long  /* type used to write each destination element */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: every definition in this group is skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: every definition in this group is skipped, including the "#  if 1" one */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active group */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active */
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* conversion is a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_contig_cast_float_to_long(  /* element-size-stepped loop; each element is moved with memmove and converted with _CONVERT_FN */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] is the input, args[1] the output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))  /* not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !1  /* false: no stride variables in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active */
+    _TYPE1 src_value;  /* local copy of one input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active */
+    _TYPE2 dst_value;  /* converted element before it is copied out */
+#endif
+
+#if 0  /* false: no alignment asserts in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_float_to_long\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active */
+        memmove(&src_value, src, sizeof(src_value));  /* copy one input element's bytes into the local */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active group */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active */
+        memmove(dst, &dst_value, sizeof(dst_value));  /* copy the converted element's bytes to the output */
+#endif
+
+#if 1  /* active: advance each cursor by its element size */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are removed once the function is defined */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this pair: the two defines in the #else branch are used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* active */
+
+#define _TYPE1 npy_float  /* type used to read each source element */
+#define _TYPE2 npy_longlong  /* type used to write each destination element */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: every definition in this group is skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: every definition in this group is skipped, including the "#  if 1" one */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active group */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active */
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* conversion is a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_cast_float_to_longlong(  /* strided loop; reads and writes through typed pointers */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] is the input, args[1] the output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides[0]/strides[1] are byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !0  /* true: this variant reads the strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no local input copy in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no local output copy in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: alignment checks are compiled in (they only fire if assert is enabled) */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_float_to_longlong\n");*/
+
+    while (N--) {
+#if 1  /* true, but the nested #if 0 removes the body: nothing is pre-loaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active group */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* read, convert and write one element through typed pointers */
+#  endif
+#endif
+
+#if 1  /* true, but the nested #if 0 removes the body: the result was already written above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_float);
+#else  /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are removed once the function is defined */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* equivalent to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this pair: the two defines in the #else branch are used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* active */
+
+#define _TYPE1 npy_float  /* type used to read each source element */
+#define _TYPE2 npy_longlong  /* type used to write each destination element */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: every definition in this group is skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: every definition in this group is skipped, including the "#  if 1" one */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active group */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active */
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* conversion is a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_float_to_longlong(  /* strided loop; each element is moved with memmove and converted with _CONVERT_FN */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] is the input, args[1] the output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides[0]/strides[1] are byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !0  /* true: this variant reads the strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active */
+    _TYPE1 src_value;  /* local copy of one input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active */
+    _TYPE2 dst_value;  /* converted element before it is copied out */
+#endif
+
+#if 0  /* false: no alignment asserts in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_float_to_longlong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active */
+        memmove(&src_value, src, sizeof(src_value));  /* copy one input element's bytes into the local */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active group */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active */
+        memmove(dst, &dst_value, sizeof(dst_value));  /* copy the converted element's bytes to the output */
+#endif
+
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_float);
+#else  /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are removed once the function is defined */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this pair: the two defines in the #else branch are used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* active */
+
+#define _TYPE1 npy_float  /* type used to read each source element */
+#define _TYPE2 npy_longlong  /* type used to write each destination element */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: every definition in this group is skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: every definition in this group is skipped, including the "#  if 1" one */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active group */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active */
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* conversion is a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_contig_cast_float_to_longlong(  /* element-size-stepped loop; reads and writes through typed pointers */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] is the input, args[1] the output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))  /* not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !1  /* false: no stride variables in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no local input copy in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no local output copy in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: alignment checks are compiled in (they only fire if assert is enabled) */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_float_to_longlong\n");*/
+
+    while (N--) {
+#if 1  /* true, but the nested #if 0 removes the body: nothing is pre-loaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active group */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* read, convert and write one element through typed pointers */
+#  endif
+#endif
+
+#if 1  /* true, but the nested #if 0 removes the body: the result was already written above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance each cursor by its element size */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are removed once the function is defined */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* equivalent to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this pair: the two defines in the #else branch are used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* active */
+
+#define _TYPE1 npy_float  /* type used to read each source element */
+#define _TYPE2 npy_longlong  /* type used to write each destination element */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: every definition in this group is skipped */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* false: every definition in this group is skipped, including the "#  if 1" one */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active group */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active */
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* conversion is a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_contig_cast_float_to_longlong(  /* element-size-stepped loop; each element is moved with memmove and converted with _CONVERT_FN */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] is the input, args[1] the output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))  /* not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !1  /* false: no stride variables in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active */
+    _TYPE1 src_value;  /* local copy of one input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active */
+    _TYPE2 dst_value;  /* converted element before it is copied out */
+#endif
+
+#if 0  /* false: no alignment asserts in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_float_to_longlong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active */
+        memmove(&src_value, src, sizeof(src_value));  /* copy one input element's bytes into the local */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active group */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active */
+        memmove(dst, &dst_value, sizeof(dst_value));  /* copy the converted element's bytes to the output */
+#endif
+
+#if 1  /* active: advance each cursor by its element size */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are removed once the function is defined */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1  /* true for this pair: the nested selections below are used, not the #else branch */
+
+#  if 1  /* active */
+#    define _TYPE1 npy_uint32  /* each source element is read as a 32-bit unsigned value */
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* active */
+#    define _TYPE2 npy_half  /* type used to write each destination element */
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: every definition in this group is skipped, including the "#  elif 1" one */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1  /* active group */
+
+#  if 1  /* active */
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)  /* conversion is delegated to this helper, which is defined elsewhere */
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_cast_float_to_half(  /* strided loop; reads npy_uint32 and writes npy_half through typed pointers */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] is the input, args[1] the output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides[0]/strides[1] are byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !0  /* true: this variant reads the strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* false: no local input copy in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no local output copy in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* true: alignment checks are compiled in (they only fire if assert is enabled) */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_float_to_half\n");*/
+
+    while (N--) {
+#if 1  /* true, but the nested #if 0 removes the body: nothing is pre-loaded */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active group */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* read, convert and write one element through typed pointers */
+#  endif
+#endif
+
+#if 1  /* true, but the nested #if 0 removes the body: the result was already written above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_float);
+#else  /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are removed once the function is defined */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* equivalent to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1  /* true for this pair: the nested selections below are used, not the #else branch */
+
+#  if 1  /* active */
+#    define _TYPE1 npy_uint32  /* each source element is read as a 32-bit unsigned value */
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* active */
+#    define _TYPE2 npy_half  /* type used to write each destination element */
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* false: every definition in this group is skipped, including the "#  elif 1" one */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1  /* active group */
+
+#  if 1  /* active */
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)  /* conversion is delegated to this helper, which is defined elsewhere */
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_float_to_half(  /* strided loop; each element is moved with memmove and converted with _CONVERT_FN */
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced; args[0] is the input, args[1] the output */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] is the element count; strides[0]/strides[1] are byte steps */
+        NpyAuxData *NPY_UNUSED(data))  /* not referenced in the body */
+{
+    npy_intp N = dimensions[0];  /* elements left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into the input and output */
+#if !0  /* true: this variant reads the strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active */
+    _TYPE1 src_value;  /* local copy of one input element (npy_uint32 here) */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active */
+    _TYPE2 dst_value;  /* converted element before it is copied out */
+#endif
+
+#if 0  /* false: no alignment asserts in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_float_to_half\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active */
+        memmove(&src_value, src, sizeof(src_value));  /* copy one input element's bytes into the local */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active group */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active */
+        memmove(dst, &dst_value, sizeof(dst_value));  /* copy the converted element's bytes to the output */
+#endif
+
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_float);
+#else  /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN  /* the three helper macros are removed once the function is defined */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* "!1" is 0, so the guard above is always true: this variant is compiled unconditionally. */
+/* For half types, don't use actual double/float types; this test is true: npy_uint32 in, npy_half out */
+#if 0 || 1
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine the conversion function; the "#elif 1" group is live: npy_floatbits_to_halfbits(x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements, args[0] -> args[1], stepping by element size (strides unused); returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_float_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: asserts npy_is_aligned() on src and dst (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_float_to_half\n");*/
+    /* Each pass converts one element straight from src into dst; no temporaries are used. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the last #else below is live (direct load, convert, store through dst) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Live branch: step both pointers by their element size. */
+#if 1
+        dst += sizeof(npy_half);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+/* closes the NPY_USE_UNALIGNED_ACCESS guard opened before the _TYPE1/_TYPE2 setup */
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* "!0" is 1, so the guard above compiles this variant only when NPY_USE_UNALIGNED_ACCESS is 0 (or undefined). */
+/* For half types, don't use actual double/float types; this test is true: npy_uint32 in, npy_half out */
+#if 0 || 1
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine the conversion function; the "#elif 1" group is live: npy_floatbits_to_halfbits(x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements, args[0] -> args[1], stepping by element size (strides unused); returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_float_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out here ("#if 0"); NOTE(review): presumably src/dst may be unaligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_float_to_half\n");*/
+    /* Each pass: memmove one element into src_value, convert into dst_value, memmove that to dst. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only "dst_value = _CONVERT_FN(src_value);" below is live */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Live branch: step both pointers by their element size. */
+#if 1
+        dst += sizeof(npy_half);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+/* closes the NPY_USE_UNALIGNED_ACCESS guard opened before the _TYPE1/_TYPE2 setup */
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* "!1" is 0, so the guard above is always true: this variant is compiled unconditionally. */
+/* For half types, don't use actual double/float types; this test is false: both types are npy_float */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine the conversion function; only the last #else is live: a plain (_TYPE2) cast */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements, args[0] -> args[1], stepping by strides[0] and strides[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_float_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: asserts npy_is_aligned() on src and dst (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_float_to_float\n");*/
+    /* Each pass converts one element straight from src into dst; no temporaries are used. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the last #else below is live (direct load, convert, store through dst) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Live branch: step both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+/* closes the NPY_USE_UNALIGNED_ACCESS guard opened before the _TYPE1/_TYPE2 setup */
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* "!0" is 1, so the guard above compiles this variant only when NPY_USE_UNALIGNED_ACCESS is 0 (or undefined). */
+/* For half types, don't use actual double/float types; this test is false: both types are npy_float */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine the conversion function; only the last #else is live: a plain (_TYPE2) cast */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements, args[0] -> args[1], stepping by strides[0] and strides[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_float_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out here ("#if 0"); NOTE(review): presumably src/dst may be unaligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_float_to_float\n");*/
+    /* Each pass: memmove one element into src_value, convert into dst_value, memmove that to dst. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only "dst_value = _CONVERT_FN(src_value);" below is live */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Live branch: step both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+/* closes the NPY_USE_UNALIGNED_ACCESS guard opened before the _TYPE1/_TYPE2 setup */
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* "!1" is 0, so the guard above is always true: this variant is compiled unconditionally. */
+/* For half types, don't use actual double/float types; this test is false: both types are npy_float */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine the conversion function; only the last #else is live: a plain (_TYPE2) cast */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements, args[0] -> args[1], stepping by element size (strides unused); returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_float_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: asserts npy_is_aligned() on src and dst (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_float_to_float\n");*/
+    /* Each pass converts one element straight from src into dst; no temporaries are used. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the last #else below is live (direct load, convert, store through dst) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Live branch: step both pointers by their element size. */
+#if 1
+        dst += sizeof(npy_float);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+/* closes the NPY_USE_UNALIGNED_ACCESS guard opened before the _TYPE1/_TYPE2 setup */
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* "!0" is 1, so the guard above compiles this variant only when NPY_USE_UNALIGNED_ACCESS is 0 (or undefined). */
+/* For half types, don't use actual double/float types; this test is false: both types are npy_float */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine the conversion function; only the last #else is live: a plain (_TYPE2) cast */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements, args[0] -> args[1], stepping by element size (strides unused); returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_float_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out here ("#if 0"); NOTE(review): presumably src/dst may be unaligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_float_to_float\n");*/
+    /* Each pass: memmove one element into src_value, convert into dst_value, memmove that to dst. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only "dst_value = _CONVERT_FN(src_value);" below is live */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Live branch: step both pointers by their element size. */
+#if 1
+        dst += sizeof(npy_float);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+/* closes the NPY_USE_UNALIGNED_ACCESS guard opened before the _TYPE1/_TYPE2 setup */
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* "!1" is 0, so the guard above is always true: this variant is compiled unconditionally. */
+/* For half types, don't use actual double/float types; this test is false: npy_float in, npy_double out */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine the conversion function; only the last #else is live: a plain (_TYPE2) cast */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements, args[0] -> args[1], stepping by strides[0] and strides[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_float_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: asserts npy_is_aligned() on src and dst (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_float_to_double\n");*/
+    /* Each pass converts one element straight from src into dst; no temporaries are used. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the last #else below is live (direct load, convert, store through dst) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Live branch: step both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+/* closes the NPY_USE_UNALIGNED_ACCESS guard opened before the _TYPE1/_TYPE2 setup */
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* "!0" is 1, so the guard above compiles this variant only when NPY_USE_UNALIGNED_ACCESS is 0 (or undefined). */
+/* For half types, don't use actual double/float types; this test is false: npy_float in, npy_double out */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine the conversion function; only the last #else is live: a plain (_TYPE2) cast */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements, args[0] -> args[1], stepping by strides[0] and strides[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_float_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check: compiled out here ("#if 0"); NOTE(review): presumably src/dst may be unaligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_float_to_double\n");*/
+    /* Each pass: memmove one element into src_value, convert into dst_value, memmove that to dst. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only "dst_value = _CONVERT_FN(src_value);" below is live */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Live branch: step both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+/* closes the NPY_USE_UNALIGNED_ACCESS guard opened before the _TYPE1/_TYPE2 setup */
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* "!1" is 0, so the guard above is always true: this variant is compiled unconditionally. */
+/* For half types, don't use actual double/float types; this test is false: npy_float in, npy_double out */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine the conversion function; only the last #else is live: a plain (_TYPE2) cast */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements, args[0] -> args[1], stepping by element size (strides unused); returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_float_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: asserts npy_is_aligned() on src and dst (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_float_to_double\n");*/
+    /* Each pass converts one element straight from src into dst; no temporaries are used. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the last #else below is live (direct load, convert, store through dst) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Live branch: step both pointers by their element size. */
+#if 1
+        dst += sizeof(npy_double);
+        src += sizeof(npy_float);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+/* closes the NPY_USE_UNALIGNED_ACCESS guard opened before the _TYPE1/_TYPE2 setup */
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous cast loop, npy_float -> npy_double, with no alignment
+ * assumption on either buffer.
+ *
+ * Converts dimensions[0] items read from args[0] and stores the results
+ * at args[1]. Both pointers advance by the size of their item type, so
+ * `strides` is not read (neither is `context`). Every item is copied
+ * into and out of a local with memmove() instead of being accessed
+ * through a typed pointer.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_float_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp count = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    npy_float in_item;
+    npy_double out_item;
+
+    for (; count != 0; --count) {
+        /* Load the input bytes, widen, then store the output bytes. */
+        memmove(&in_item, in, sizeof(in_item));
+        out_item = (npy_double)in_item;
+        memmove(out, &out_item, sizeof(out_item));
+
+        out += sizeof(npy_double);
+        in += sizeof(npy_float);
+    }
+    return 0;
+}
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Input and output item types; the assertions below name them this way. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_longdouble
+
+/*
+ * Strided cast loop, npy_float -> npy_longdouble, for aligned buffers.
+ *
+ * Converts dimensions[0] items. The input pointer starts at args[0] and
+ * moves by strides[0] bytes per item; the output pointer starts at
+ * args[1] and moves by strides[1] bytes per item. Items are read and
+ * written directly through typed pointers. `context` is not read.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_float_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* When there is work to do, both start pointers must be aligned. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        *(_TYPE2 *)dst = (_TYPE2)*(_TYPE1 *)src;
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided cast loop, npy_float -> npy_longdouble, with no alignment
+ * assumption on either buffer.
+ *
+ * Converts dimensions[0] items. The input pointer starts at args[0] and
+ * moves by strides[0] bytes per item; the output pointer starts at
+ * args[1] and moves by strides[1] bytes per item. Every item is copied
+ * into and out of a local with memmove() instead of being accessed
+ * through a typed pointer. `context` is not read.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_float_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp count = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_float in_item;
+    npy_longdouble out_item;
+
+    for (; count != 0; --count) {
+        /* Load the input bytes, widen, then store the output bytes. */
+        memmove(&in_item, in, sizeof(in_item));
+        out_item = (npy_longdouble)in_item;
+        memmove(out, &out_item, sizeof(out_item));
+
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* Input and output item types; the assertions below name them this way. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_longdouble
+
+/*
+ * Contiguous cast loop, npy_float -> npy_longdouble, for aligned buffers.
+ *
+ * Converts dimensions[0] items read from args[0] and stores the results
+ * at args[1]. Both pointers advance by the size of their item type, so
+ * `strides` is not read (neither is `context`). Items are read and
+ * written directly through typed pointers.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_float_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+
+    /* When there is work to do, both start pointers must be aligned. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        *(_TYPE2 *)dst = (_TYPE2)*(_TYPE1 *)src;
+
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_float);
+    }
+    return 0;
+}
+
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous cast loop, npy_float -> npy_longdouble, with no alignment
+ * assumption on either buffer.
+ *
+ * Converts dimensions[0] items read from args[0] and stores the results
+ * at args[1]. Both pointers advance by the size of their item type, so
+ * `strides` is not read (neither is `context`). Every item is copied
+ * into and out of a local with memmove() instead of being accessed
+ * through a typed pointer.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_float_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp count = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    npy_float in_item;
+    npy_longdouble out_item;
+
+    for (; count != 0; --count) {
+        /* Load the input bytes, widen, then store the output bytes. */
+        memmove(&in_item, in, sizeof(in_item));
+        out_item = (npy_longdouble)in_item;
+        memmove(out, &out_item, sizeof(out_item));
+
+        out += sizeof(npy_longdouble);
+        in += sizeof(npy_float);
+    }
+    return 0;
+}
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Input item type and output component type; the assertions below name
+ * them this way.
+ */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_float
+
+/*
+ * Strided cast loop, npy_float -> npy_cfloat, for aligned buffers.
+ *
+ * Converts dimensions[0] items. The input pointer starts at args[0] and
+ * moves by strides[0] bytes per item; the output pointer starts at
+ * args[1] and moves by strides[1] bytes per item. Each output item is
+ * written as two consecutive npy_float values: the input value followed
+ * by 0 (presumably the real and imaginary parts -- confirm against the
+ * npy_cfloat layout). `context` is not read.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_float_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE2 pair[2];
+
+    /* When there is work to do, both start pointers must be aligned. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Build the pair from the input before touching the output. */
+        pair[0] = (_TYPE2)*(_TYPE1 *)src;
+        pair[1] = 0;
+
+        ((_TYPE2 *)dst)[0] = pair[0];
+        ((_TYPE2 *)dst)[1] = pair[1];
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided cast loop, npy_float -> npy_cfloat, with no alignment
+ * assumption on either buffer.
+ *
+ * Converts dimensions[0] items. The input pointer starts at args[0] and
+ * moves by strides[0] bytes per item; the output pointer starts at
+ * args[1] and moves by strides[1] bytes per item. Each output item is
+ * written as two consecutive npy_float values: the input value followed
+ * by 0 (presumably the real and imaginary parts -- confirm against the
+ * npy_cfloat layout). Data is moved with memmove() instead of typed
+ * pointer accesses. `context` is not read.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_float_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp count = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_float in_item;
+    npy_float out_pair[2];
+
+    for (; count != 0; --count) {
+        memmove(&in_item, in, sizeof(in_item));
+
+        out_pair[0] = (npy_float)in_item;
+        out_pair[1] = 0;
+
+        /* One copy covers both components of the output item. */
+        memmove(out, out_pair, sizeof(out_pair));
+
+        out += out_step;
+        in += in_step;
+    }
+    return 0;
+}
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Input item type and output component type; the assertions below name
+ * them this way.
+ */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_float
+
+/*
+ * Contiguous cast loop, npy_float -> npy_cfloat, for aligned buffers.
+ *
+ * Converts dimensions[0] items read from args[0] and stores the results
+ * at args[1]. The input pointer advances by sizeof(npy_float) and the
+ * output pointer by sizeof(npy_cfloat), so `strides` is not read
+ * (neither is `context`). Each output item is written as two
+ * consecutive npy_float values: the input value followed by 0
+ * (presumably the real and imaginary parts -- confirm against the
+ * npy_cfloat layout).
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_float_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+    _TYPE2 pair[2];
+
+    /* When there is work to do, both start pointers must be aligned. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Build the pair from the input before touching the output. */
+        pair[0] = (_TYPE2)*(_TYPE1 *)src;
+        pair[1] = 0;
+
+        ((_TYPE2 *)dst)[0] = pair[0];
+        ((_TYPE2 *)dst)[1] = pair[1];
+
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_float);
+    }
+    return 0;
+}
+
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous cast loop, npy_float -> npy_cfloat, with no alignment
+ * assumption on either buffer.
+ *
+ * Converts dimensions[0] items read from args[0] and stores the results
+ * at args[1]. The input pointer advances by sizeof(npy_float) and the
+ * output pointer by sizeof(npy_cfloat), so `strides` is not read
+ * (neither is `context`). Each output item is written as two
+ * consecutive npy_float values: the input value followed by 0
+ * (presumably the real and imaginary parts -- confirm against the
+ * npy_cfloat layout). Data is moved with memmove() instead of typed
+ * pointer accesses.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_float_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp count = dimensions[0];
+    char *in = args[0];
+    char *out = args[1];
+    npy_float in_item;
+    npy_float out_pair[2];
+
+    for (; count != 0; --count) {
+        memmove(&in_item, in, sizeof(in_item));
+
+        out_pair[0] = (npy_float)in_item;
+        out_pair[1] = 0;
+
+        /* One copy covers both components of the output item. */
+        memmove(out, out_pair, sizeof(out_pair));
+
+        out += sizeof(npy_cfloat);
+        in += sizeof(npy_float);
+    }
+    return 0;
+}
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Input item type and output component type; the assertions below name
+ * them this way.
+ */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_double
+
+/*
+ * Strided cast loop, npy_float -> npy_cdouble, for aligned buffers.
+ *
+ * Converts dimensions[0] items. The input pointer starts at args[0] and
+ * moves by strides[0] bytes per item; the output pointer starts at
+ * args[1] and moves by strides[1] bytes per item. Each output item is
+ * written as two consecutive npy_double values: the widened input value
+ * followed by 0 (presumably the real and imaginary parts -- confirm
+ * against the npy_cdouble layout). `context` is not read.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_float_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE2 pair[2];
+
+    /* When there is work to do, both start pointers must be aligned. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Build the pair from the input before touching the output. */
+        pair[0] = (_TYPE2)*(_TYPE1 *)src;
+        pair[1] = 0;
+
+        ((_TYPE2 *)dst)[0] = pair[0];
+        ((_TYPE2 *)dst)[1] = pair[1];
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Element types and conversion used by this specialization: the source is
+ * read as npy_float and each destination slot is written as npy_double,
+ * converted with a plain C cast.
+ */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast loop that moves data with memmove instead of typed
+ * loads/stores, so it does not dereference src or dst as _TYPE1/_TYPE2.
+ *
+ * For each of the dimensions[0] elements, copies one _TYPE1 value out of
+ * args[0], converts it, and copies two _TYPE2 values into args[1]: the
+ * converted value followed by 0.  The pointers advance by strides[0] and
+ * strides[1] bytes respectively.  `context` and `data` are not used.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_float_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    _TYPE1 in_val;
+    _TYPE2 out_pair[2];
+
+    for (; N != 0; --N, src += in_step, dst += out_step) {
+        /* Stage the element in a local so the conversion sees a real _TYPE1. */
+        memmove(&in_val, src, sizeof(in_val));
+
+        out_pair[0] = _CONVERT_FN(in_val);
+        out_pair[1] = 0;
+
+        /* Both output slots are written with a single byte copy. */
+        memmove(dst, out_pair, sizeof(out_pair));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Element types and conversion used by this specialization: the source is
+ * read as npy_float and each destination slot is written as npy_double,
+ * converted with a plain C cast.
+ */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Fixed-step cast loop using typed (aligned) loads and stores.
+ *
+ * For each of the dimensions[0] elements, reads one _TYPE1 value from
+ * args[0] and writes two _TYPE2 values to args[1]: the converted source
+ * value followed by 0.  `strides` is ignored; the pointers advance by
+ * sizeof(npy_float) and sizeof(npy_cdouble) bytes per element.
+ * `context` and `data` are not used.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_float_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+
+    /* Alignment is only checked when there is at least one element. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Read the source once, before anything is stored to dst. */
+        const _TYPE2 head = _CONVERT_FN(*(_TYPE1 *)src);
+
+        ((_TYPE2 *)dst)[0] = head;
+        ((_TYPE2 *)dst)[1] = 0;
+
+        src += sizeof(npy_float);
+        dst += sizeof(npy_cdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Element types and conversion used by this specialization: the source is
+ * read as npy_float and each destination slot is written as npy_double,
+ * converted with a plain C cast.
+ */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Fixed-step cast loop that moves data with memmove instead of typed
+ * loads/stores, so it does not dereference src or dst as _TYPE1/_TYPE2.
+ *
+ * For each of the dimensions[0] elements, copies one _TYPE1 value out of
+ * args[0], converts it, and copies two _TYPE2 values into args[1]: the
+ * converted value followed by 0.  `strides` is ignored; the pointers
+ * advance by sizeof(npy_float) and sizeof(npy_cdouble) bytes per element.
+ * `context` and `data` are not used.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_float_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+    _TYPE1 in_val;
+    _TYPE2 out_pair[2];
+
+    for (; N != 0; --N) {
+        /* Stage the element in a local so the conversion sees a real _TYPE1. */
+        memmove(&in_val, src, sizeof(in_val));
+
+        out_pair[0] = _CONVERT_FN(in_val);
+        out_pair[1] = 0;
+
+        /* Both output slots are written with a single byte copy. */
+        memmove(dst, out_pair, sizeof(out_pair));
+
+        src += sizeof(npy_float);
+        dst += sizeof(npy_cdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Element types and conversion used by this specialization: the source is
+ * read as npy_float and each destination slot is written as
+ * npy_longdouble, converted with a plain C cast.
+ */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast loop using typed (aligned) loads and stores.
+ *
+ * For each of the dimensions[0] elements, reads one _TYPE1 value from
+ * args[0] and writes two _TYPE2 values to args[1]: the converted source
+ * value followed by 0.  The pointers advance by strides[0] and strides[1]
+ * bytes respectively.  `context` and `data` are not used.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_float_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+
+    /* Alignment is only checked when there is at least one element. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N, src += in_step, dst += out_step) {
+        /* Read the source once, before anything is stored to dst. */
+        const _TYPE2 head = _CONVERT_FN(*(_TYPE1 *)src);
+
+        ((_TYPE2 *)dst)[0] = head;
+        ((_TYPE2 *)dst)[1] = 0;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Element types and conversion used by this specialization: the source is
+ * read as npy_float and each destination slot is written as
+ * npy_longdouble, converted with a plain C cast.
+ */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast loop that moves data with memmove instead of typed
+ * loads/stores, so it does not dereference src or dst as _TYPE1/_TYPE2.
+ *
+ * For each of the dimensions[0] elements, copies one _TYPE1 value out of
+ * args[0], converts it, and copies two _TYPE2 values into args[1]: the
+ * converted value followed by 0.  The pointers advance by strides[0] and
+ * strides[1] bytes respectively.  `context` and `data` are not used.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_float_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    _TYPE1 in_val;
+    _TYPE2 out_pair[2];
+
+    for (; N != 0; --N, src += in_step, dst += out_step) {
+        /* Stage the element in a local so the conversion sees a real _TYPE1. */
+        memmove(&in_val, src, sizeof(in_val));
+
+        out_pair[0] = _CONVERT_FN(in_val);
+        out_pair[1] = 0;
+
+        /* Both output slots are written with a single byte copy. */
+        memmove(dst, out_pair, sizeof(out_pair));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Element types and conversion used by this specialization: the source is
+ * read as npy_float and each destination slot is written as
+ * npy_longdouble, converted with a plain C cast.
+ */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Fixed-step cast loop using typed (aligned) loads and stores.
+ *
+ * For each of the dimensions[0] elements, reads one _TYPE1 value from
+ * args[0] and writes two _TYPE2 values to args[1]: the converted source
+ * value followed by 0.  `strides` is ignored; the pointers advance by
+ * sizeof(npy_float) and sizeof(npy_clongdouble) bytes per element.
+ * `context` and `data` are not used.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_float_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+
+    /* Alignment is only checked when there is at least one element. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* Read the source once, before anything is stored to dst. */
+        const _TYPE2 head = _CONVERT_FN(*(_TYPE1 *)src);
+
+        ((_TYPE2 *)dst)[0] = head;
+        ((_TYPE2 *)dst)[1] = 0;
+
+        src += sizeof(npy_float);
+        dst += sizeof(npy_clongdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Element types and conversion used by this specialization: the source is
+ * read as npy_float and each destination slot is written as
+ * npy_longdouble, converted with a plain C cast.
+ */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_longdouble
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Fixed-step cast loop that moves data with memmove instead of typed
+ * loads/stores, so it does not dereference src or dst as _TYPE1/_TYPE2.
+ *
+ * For each of the dimensions[0] elements, copies one _TYPE1 value out of
+ * args[0], converts it, and copies two _TYPE2 values into args[1]: the
+ * converted value followed by 0.  `strides` is ignored; the pointers
+ * advance by sizeof(npy_float) and sizeof(npy_clongdouble) bytes per
+ * element.  `context` and `data` are not used.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_float_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+    _TYPE1 in_val;
+    _TYPE2 out_pair[2];
+
+    for (; N != 0; --N) {
+        /* Stage the element in a local so the conversion sees a real _TYPE1. */
+        memmove(&in_val, src, sizeof(in_val));
+
+        out_pair[0] = _CONVERT_FN(in_val);
+        out_pair[1] = 0;
+
+        /* Both output slots are written with a single byte copy. */
+        memmove(dst, out_pair, sizeof(out_pair));
+
+        src += sizeof(npy_float);
+        dst += sizeof(npy_clongdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Element types and conversion used by this specialization: the source is
+ * read as npy_double and the destination is written as npy_bool.  The
+ * conversion yields 1 when the value compares unequal to 0 and 0 otherwise.
+ */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_bool
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Strided cast loop using typed (aligned) loads and stores.
+ *
+ * For each of the dimensions[0] elements, reads one _TYPE1 value from
+ * args[0] and stores its truth value as one _TYPE2 in args[1].  The
+ * pointers advance by strides[0] and strides[1] bytes respectively.
+ * `context` and `data` are not used.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_double_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+
+    /* Alignment is only checked when there is at least one element. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N, src += in_step, dst += out_step) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Element types and conversion used by this specialization: the source is
+ * read as npy_double and the destination is written as npy_bool.  The
+ * conversion yields 1 when the value compares unequal to 0 and 0 otherwise.
+ */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_bool
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Strided cast loop that moves data with memmove instead of typed
+ * loads/stores, so it does not dereference src or dst as _TYPE1/_TYPE2.
+ *
+ * For each of the dimensions[0] elements, copies one _TYPE1 value out of
+ * args[0], computes its truth value, and copies the resulting _TYPE2 into
+ * args[1].  The pointers advance by strides[0] and strides[1] bytes
+ * respectively.  `context` and `data` are not used.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_double_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    _TYPE1 in_val;
+    _TYPE2 out_val;
+
+    for (; N != 0; --N, src += in_step, dst += out_step) {
+        /* Stage the element in a local so the comparison sees a real _TYPE1. */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, because !1 is 0 */
+
+/* Element types: the half-float branch below is disabled, so the plain C types are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_bool  /* destination element type */
+
+#endif
+
+/* Conversion macro: only the final #else group is live; it yields (x != 0) as npy_bool */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_double values to npy_bool; contiguous layout, direct aligned access. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_double_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, both pointers must be aligned for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_contig_cast_double_to_bool\n"); */
+
+    while (N--) {  /* one element per pass */
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement converts *src straight into *dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* equivalent to !NPY_USE_UNALIGNED_ACCESS */
+
+/* Element types: the half-float branch below is disabled, so the plain C types are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_bool  /* destination element type */
+
+#endif
+
+/* Conversion macro: only the final #else group is live; it yields (x != 0) as npy_bool */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_double values to npy_bool; contiguous layout, memmove-based element access. Returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_double_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_contig_cast_double_to_bool\n"); */
+
+    while (N--) {  /* one element per pass */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, because !1 is 0 */
+
+/* Element types: the half-float branch below is disabled, so the plain C types are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_ubyte  /* destination element type */
+
+#endif
+
+/* Conversion macro: only the final #else group is live, a plain C cast to _TYPE2 (NOTE(review): out-of-range input is undefined in C) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_double values to npy_ubyte; byte strides, direct aligned access. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_double_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, both pointers must be aligned for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_cast_double_to_ubyte\n"); */
+
+    while (N--) {  /* one element per pass */
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement converts *src straight into *dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* equivalent to !NPY_USE_UNALIGNED_ACCESS */
+
+/* Element types: the half-float branch below is disabled, so the plain C types are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_ubyte  /* destination element type */
+
+#endif
+
+/* Conversion macro: only the final #else group is live, a plain C cast to _TYPE2 (NOTE(review): out-of-range input is undefined in C) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_double values to npy_ubyte; byte strides, memmove-based element access. Returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_double_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_cast_double_to_ubyte\n"); */
+
+    while (N--) {  /* one element per pass */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, because !1 is 0 */
+
+/* Element types: the half-float branch below is disabled, so the plain C types are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_ubyte  /* destination element type */
+
+#endif
+
+/* Conversion macro: only the final #else group is live, a plain C cast to _TYPE2 (NOTE(review): out-of-range input is undefined in C) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_double values to npy_ubyte; contiguous layout, direct aligned access. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_double_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, both pointers must be aligned for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_contig_cast_double_to_ubyte\n"); */
+
+    while (N--) {  /* one element per pass */
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement converts *src straight into *dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* equivalent to !NPY_USE_UNALIGNED_ACCESS */
+
+/* Element types: the half-float branch below is disabled, so the plain C types are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_ubyte  /* destination element type */
+
+#endif
+
+/* Conversion macro: only the final #else group is live, a plain C cast to _TYPE2 (NOTE(review): out-of-range input is undefined in C) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_double values to npy_ubyte; contiguous layout, memmove-based element access. Returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_double_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_contig_cast_double_to_ubyte\n"); */
+
+    while (N--) {  /* one element per pass */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, because !1 is 0 */
+
+/* Element types: the half-float branch below is disabled, so the plain C types are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_ushort  /* destination element type */
+
+#endif
+
+/* Conversion macro: only the final #else group is live, a plain C cast to _TYPE2 (NOTE(review): out-of-range input is undefined in C) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_double values to npy_ushort; byte strides, direct aligned access. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_double_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, both pointers must be aligned for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_cast_double_to_ushort\n"); */
+
+    while (N--) {  /* one element per pass */
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement converts *src straight into *dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* equivalent to !NPY_USE_UNALIGNED_ACCESS */
+
+/* Element types: the half-float branch below is disabled, so the plain C types are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_ushort  /* destination element type */
+
+#endif
+
+/* Conversion macro: only the final #else group is live, a plain C cast to _TYPE2 (NOTE(review): out-of-range input is undefined in C) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_double values to npy_ushort; byte strides, memmove-based element access. Returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_double_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_cast_double_to_ushort\n"); */
+
+    while (N--) {  /* one element per pass */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement is dst_value = _CONVERT_FN(src_value) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true, because !1 is 0 */
+
+/* Element types: the half-float branch below is disabled, so the plain C types are used */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_ushort  /* destination element type */
+
+#endif
+
+/* Conversion macro: only the final #else group is live, a plain C cast to _TYPE2 (NOTE(review): out-of-range input is undefined in C) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] npy_double values to npy_ushort; contiguous layout, direct aligned access. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_double_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte cursors */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N is 0, both pointers must be aligned for their element types */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_contig_cast_double_to_ushort\n"); */
+
+    while (N--) {  /* one element per pass */
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live statement converts *src straight into *dst */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+/* Cast loop npy_double -> npy_ushort: contiguous variant that copies elements with memmove. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_ushort  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled branch built on npy_halfbits_to_floatbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled branch built on npy_doublebits_to_halfbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* compiled branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] items: npy_double at args[0] -> npy_ushort at args[1], densely packed; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_double_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte pointers to input and output */
+#if !1  /* disabled: strides are not read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* enabled: temporary filled by memmove in the loop */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* enabled: temporary copied out by memmove in the loop */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_double_to_ushort\n");*/
+
+    while (N--) {  /* one element per pass */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: copy the source bytes into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* this branch is compiled */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert through the temporaries */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: copy dst_value out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance by the element sizes */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove this block's helper macros. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this block is always compiled */
+/* Cast loop npy_double -> npy_uint: aligned, strided variant. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_uint  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled branch built on npy_halfbits_to_floatbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled branch built on npy_doublebits_to_halfbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* compiled branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] items: npy_double at args[0] -> npy_uint at args[1], stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_double_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte pointers to input and output */
+#if !0  /* enabled: byte steps between consecutive elements */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* disabled: no source temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* disabled: no destination temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for their types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_double_to_uint\n");*/
+
+    while (N--) {  /* one element per pass */
+#if 1  /* nothing to load: the cast below reads src directly */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* this branch is compiled */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: convert directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to store: the cast above already wrote dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_double);
+#else  /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove this block's helper macros. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+/* Cast loop npy_double -> npy_uint: strided variant that copies elements with memmove. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_uint  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled branch built on npy_halfbits_to_floatbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled branch built on npy_doublebits_to_halfbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* compiled branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] items: npy_double at args[0] -> npy_uint at args[1], stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_double_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte pointers to input and output */
+#if !0  /* enabled: byte steps between consecutive elements */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* enabled: temporary filled by memmove in the loop */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* enabled: temporary copied out by memmove in the loop */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_double_to_uint\n");*/
+
+    while (N--) {  /* one element per pass */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: copy the source bytes into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* this branch is compiled */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert through the temporaries */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: copy dst_value out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_double);
+#else  /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove this block's helper macros. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this block is always compiled */
+/* Cast loop npy_double -> npy_uint: aligned, contiguous variant. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_uint  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled branch built on npy_halfbits_to_floatbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled branch built on npy_doublebits_to_halfbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* compiled branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] items: npy_double at args[0] -> npy_uint at args[1], densely packed; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_double_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte pointers to input and output */
+#if !1  /* disabled: strides are not read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* disabled: no source temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* disabled: no destination temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for their types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_double_to_uint\n");*/
+
+    while (N--) {  /* one element per pass */
+#if 1  /* nothing to load: the cast below reads src directly */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* this branch is compiled */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: convert directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to store: the cast above already wrote dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance by the element sizes */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove this block's helper macros. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+/* Cast loop npy_double -> npy_uint: contiguous variant that copies elements with memmove. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_uint  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled branch built on npy_halfbits_to_floatbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled branch built on npy_doublebits_to_halfbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* compiled branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] items: npy_double at args[0] -> npy_uint at args[1], densely packed; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_double_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte pointers to input and output */
+#if !1  /* disabled: strides are not read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* enabled: temporary filled by memmove in the loop */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* enabled: temporary copied out by memmove in the loop */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_double_to_uint\n");*/
+
+    while (N--) {  /* one element per pass */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: copy the source bytes into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* this branch is compiled */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert through the temporaries */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: copy dst_value out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance by the element sizes */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove this block's helper macros. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this block is always compiled */
+/* Cast loop npy_double -> npy_ulong: aligned, strided variant. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_ulong  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled branch built on npy_halfbits_to_floatbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled branch built on npy_doublebits_to_halfbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* compiled branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] items: npy_double at args[0] -> npy_ulong at args[1], stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_double_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte pointers to input and output */
+#if !0  /* enabled: byte steps between consecutive elements */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* disabled: no source temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* disabled: no destination temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for their types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_double_to_ulong\n");*/
+
+    while (N--) {  /* one element per pass */
+#if 1  /* nothing to load: the cast below reads src directly */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* this branch is compiled */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: convert directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to store: the cast above already wrote dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_double);
+#else  /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove this block's helper macros. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+/* Cast loop npy_double -> npy_ulong: strided variant that copies elements with memmove. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_ulong  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled branch built on npy_halfbits_to_floatbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled branch built on npy_doublebits_to_halfbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* compiled branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] items: npy_double at args[0] -> npy_ulong at args[1], stepping by strides[0]/strides[1] bytes; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_double_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte pointers to input and output */
+#if !0  /* enabled: byte steps between consecutive elements */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* enabled: temporary filled by memmove in the loop */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* enabled: temporary copied out by memmove in the loop */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_double_to_ulong\n");*/
+
+    while (N--) {  /* one element per pass */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: copy the source bytes into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* this branch is compiled */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert through the temporaries */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: copy dst_value out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_double);
+#else  /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove this block's helper macros. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this block is always compiled */
+/* Cast loop npy_double -> npy_ulong: aligned, contiguous variant. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_ulong  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled branch built on npy_halfbits_to_floatbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled branch built on npy_doublebits_to_halfbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* compiled branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] items: npy_double at args[0] -> npy_ulong at args[1], densely packed; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_double_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte pointers to input and output */
+#if !1  /* disabled: strides are not read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* disabled: no source temporary in this variant */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* disabled: no destination temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for their types unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_double_to_ulong\n");*/
+
+    while (N--) {  /* one element per pass */
+#if 1  /* nothing to load: the cast below reads src directly */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* this branch is compiled */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: convert directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to store: the cast above already wrote dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance by the element sizes */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove this block's helper macros. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+/* Cast loop npy_double -> npy_ulong: contiguous variant that copies elements with memmove. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false here, so the #else definitions below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* source element type */
+#define _TYPE2 npy_ulong  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* disabled branch built on npy_halfbits_to_floatbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* disabled branch built on npy_doublebits_to_halfbits and related helpers */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* compiled branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] items: npy_double at args[0] -> npy_ulong at args[1], densely packed; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_double_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte pointers to input and output */
+#if !1  /* disabled: strides are not read in this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* enabled: temporary filled by memmove in the loop */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* enabled: temporary copied out by memmove in the loop */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_double_to_ulong\n");*/
+
+    while (N--) {  /* one element per pass */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: copy the source bytes into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* this branch is compiled */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert through the temporaries */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: copy dst_value out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance by the element sizes */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Remove this block's helper macros. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "!1" is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this type pair: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* element type read from src */
+#define _TYPE2 npy_ulonglong /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* definition in effect: a plain C cast; NOTE(review): out-of-range inputs get whatever the C cast yields */
+#  endif
+
+#endif
+/* Strided loop for aligned operands: converts dimensions[0] npy_double values at args[0] into npy_ulonglong values at args[1]. */
+static NPY_GCC_OPT_3 int /* always returns 0 */
+_aligned_cast_double_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced; args[0] = src, args[1] = dst */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0]/[1] = src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: keep the per-element byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: the aligned variant declares no temporaries */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false, as above */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* alignment is only asserted when there is at least one element */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_double_to_ulonglong\n");*/
+
+    while (N--) { /* one element per iteration */
+#if 1 /* aligned variant: nothing is staged here; the cast below reads src directly */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src); /* the statement in effect: typed load, cast, typed store */
+#  endif
+#endif
+
+#if 1 /* aligned variant: the store already happened in the cast statement */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_double);
+#else /* strided variant: advance by the caller-supplied byte steps */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this type pair: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* element type read from src */
+#define _TYPE2 npy_ulonglong /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* definition in effect: a plain C cast; NOTE(review): out-of-range inputs get whatever the C cast yields */
+#  endif
+
+#endif
+/* Strided loop without alignment asserts: converts dimensions[0] npy_double values at args[0] into npy_ulonglong values at args[1], staging each through memmove. */
+static NPY_GCC_OPT_3 int /* always returns 0 */
+_cast_double_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced; args[0] = src, args[1] = dst */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0]/[1] = src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: keep the per-element byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: scalar temporary filled by memmove below */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: scalar temporary holding the converted value */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_double_to_ulonglong\n");*/
+
+    while (N--) { /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* byte-wise load instead of a typed read of src */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value); /* the statement in effect: convert between the two temporaries */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* byte-wise store of the converted value */
+#endif
+
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_double);
+#else /* strided variant: advance by the caller-supplied byte steps */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "!1" is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this type pair: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* element type read from src */
+#define _TYPE2 npy_ulonglong /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* definition in effect: a plain C cast; NOTE(review): out-of-range inputs get whatever the C cast yields */
+#  endif
+
+#endif
+/* Contiguous loop for aligned operands: converts dimensions[0] npy_double values at args[0] into npy_ulonglong values at args[1]. */
+static NPY_GCC_OPT_3 int /* always returns 0 */
+_aligned_contig_cast_double_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced; args[0] = src, args[1] = dst */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not read here */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1 /* false: the contiguous variant does not read strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: the aligned variant declares no temporaries */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false, as above */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* alignment is only asserted when there is at least one element */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_double_to_ulonglong\n");*/
+
+    while (N--) { /* one element per iteration */
+#if 1 /* aligned variant: nothing is staged here; the cast below reads src directly */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src); /* the statement in effect: typed load, cast, typed store */
+#  endif
+#endif
+
+#if 1 /* aligned variant: the store already happened in the cast statement */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous variant: advance by one element size */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this type pair: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* element type read from src */
+#define _TYPE2 npy_ulonglong /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* definition in effect: a plain C cast; NOTE(review): out-of-range inputs get whatever the C cast yields */
+#  endif
+
+#endif
+/* Contiguous loop without alignment asserts: converts dimensions[0] npy_double values at args[0] into npy_ulonglong values at args[1], staging each through memmove. */
+static NPY_GCC_OPT_3 int /* always returns 0 */
+_contig_cast_double_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced; args[0] = src, args[1] = dst */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not read here */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1 /* false: the contiguous variant does not read strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: scalar temporary filled by memmove below */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: scalar temporary holding the converted value */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_double_to_ulonglong\n");*/
+
+    while (N--) { /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* byte-wise load instead of a typed read of src */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value); /* the statement in effect: convert between the two temporaries */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* byte-wise store of the converted value */
+#endif
+
+#if 1 /* contiguous variant: advance by one element size */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "!1" is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this type pair: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* element type read from src */
+#define _TYPE2 npy_byte /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* definition in effect: a plain C cast; NOTE(review): out-of-range inputs get whatever the C cast yields */
+#  endif
+
+#endif
+/* Strided loop for aligned operands: converts dimensions[0] npy_double values at args[0] into npy_byte values at args[1]. */
+static NPY_GCC_OPT_3 int /* always returns 0 */
+_aligned_cast_double_to_byte(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced; args[0] = src, args[1] = dst */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0]/[1] = src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: keep the per-element byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: the aligned variant declares no temporaries */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false, as above */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* alignment is only asserted when there is at least one element */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_double_to_byte\n");*/
+
+    while (N--) { /* one element per iteration */
+#if 1 /* aligned variant: nothing is staged here; the cast below reads src directly */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src); /* the statement in effect: typed load, cast, typed store */
+#  endif
+#endif
+
+#if 1 /* aligned variant: the store already happened in the cast statement */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_double);
+#else /* strided variant: advance by the caller-supplied byte steps */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this type pair: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* element type read from src */
+#define _TYPE2 npy_byte /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* definition in effect: a plain C cast; NOTE(review): out-of-range inputs get whatever the C cast yields */
+#  endif
+
+#endif
+/* Strided loop without alignment asserts: converts dimensions[0] npy_double values at args[0] into npy_byte values at args[1], staging each through memmove. */
+static NPY_GCC_OPT_3 int /* always returns 0 */
+_cast_double_to_byte(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced; args[0] = src, args[1] = dst */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0]/[1] = src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: keep the per-element byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: scalar temporary filled by memmove below */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: scalar temporary holding the converted value */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_double_to_byte\n");*/
+
+    while (N--) { /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* byte-wise load instead of a typed read of src */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value); /* the statement in effect: convert between the two temporaries */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* byte-wise store of the converted value */
+#endif
+
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_double);
+#else /* strided variant: advance by the caller-supplied byte steps */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "!1" is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this type pair: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* element type read from src */
+#define _TYPE2 npy_byte /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* definition in effect: a plain C cast; NOTE(review): out-of-range inputs get whatever the C cast yields */
+#  endif
+
+#endif
+/* Contiguous loop for aligned operands: converts dimensions[0] npy_double values at args[0] into npy_byte values at args[1]. */
+static NPY_GCC_OPT_3 int /* always returns 0 */
+_aligned_contig_cast_double_to_byte(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced; args[0] = src, args[1] = dst */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not read here */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1 /* false: the contiguous variant does not read strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: the aligned variant declares no temporaries */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false, as above */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* alignment is only asserted when there is at least one element */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_double_to_byte\n");*/
+
+    while (N--) { /* one element per iteration */
+#if 1 /* aligned variant: nothing is staged here; the cast below reads src directly */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src); /* the statement in effect: typed load, cast, typed store */
+#  endif
+#endif
+
+#if 1 /* aligned variant: the store already happened in the cast statement */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous variant: advance by one element size */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this type pair: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* element type read from src */
+#define _TYPE2 npy_byte /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* definition in effect: a plain C cast; NOTE(review): out-of-range inputs get whatever the C cast yields */
+#  endif
+
+#endif
+/* Contiguous loop without alignment asserts: converts dimensions[0] npy_double values at args[0] into npy_byte values at args[1], staging each through memmove. */
+static NPY_GCC_OPT_3 int /* always returns 0 */
+_contig_cast_double_to_byte(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced; args[0] = src, args[1] = dst */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not read here */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1 /* false: the contiguous variant does not read strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* true: scalar temporary filled by memmove below */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* true: scalar temporary holding the converted value */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_double_to_byte\n");*/
+
+    while (N--) { /* one element per iteration */
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* byte-wise load instead of a typed read of src */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value); /* the statement in effect: convert between the two temporaries */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* byte-wise store of the converted value */
+#endif
+
+#if 1 /* contiguous variant: advance by one element size */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "!1" is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this type pair: the #else branch below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* element type read from src */
+#define _TYPE2 npy_short /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* definition in effect: a plain C cast; NOTE(review): out-of-range inputs get whatever the C cast yields */
+#  endif
+
+#endif
+/* Strided loop for aligned operands: converts dimensions[0] npy_double values at args[0] into npy_short values at args[1]. */
+static NPY_GCC_OPT_3 int /* always returns 0 */
+_aligned_cast_double_to_short(
+        PyArrayMethod_Context *context, char *const *args, /* context is not referenced; args[0] = src, args[1] = dst */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0]/[1] = src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: keep the per-element byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* false: the aligned variant declares no temporaries */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* false, as above */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* alignment is only asserted when there is at least one element */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_double_to_short\n");*/
+
+    while (N--) { /* one element per iteration */
+#if 1 /* aligned variant: nothing is staged here; the cast below reads src directly */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src); /* the statement in effect: typed load, cast, typed store */
+#  endif
+#endif
+
+#if 1 /* aligned variant: the store already happened in the cast statement */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_double);
+#else /* strided variant: advance by the caller-supplied byte steps */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned-safe variant: the guard above compiles it only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] items: npy_double at args[0] -> npy_short at args[1], stepped by strides[]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_double_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries let each element be copied in and out with memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out here: this variant makes no alignment assumption) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_double_to_short\n");*/
+    /* Convert one element per pass; N counts the elements still to do. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch is dst_value = _CONVERT_FN(src_value), a plain (_TYPE2) cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value out; memmove imposes no alignment requirement on dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: the guard above reduces to !(... && 0), so this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] packed items: npy_double at args[0] -> npy_short at args[1]; strides is not read; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_double_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries are declared: both "#elif !1" arms are false in this aligned variant. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1/_TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_double_to_short\n");*/
+    /* Convert one element per pass; N counts the elements still to do. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch reads src and writes dst through typed pointers, using a plain (_TYPE2) cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Nothing to store here: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step each pointer by its element size. */
+#if 1
+        dst += sizeof(npy_short);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned-safe variant: the guard above compiles it only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] packed items: npy_double at args[0] -> npy_short at args[1]; strides is not read; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_double_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries let each element be copied in and out with memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out here: this variant makes no alignment assumption) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_double_to_short\n");*/
+    /* Convert one element per pass; N counts the elements still to do. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch is dst_value = _CONVERT_FN(src_value), a plain (_TYPE2) cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value out; memmove imposes no alignment requirement on dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step each pointer by its element size. */
+#if 1
+        dst += sizeof(npy_short);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: the guard above reduces to !(... && 0), so this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] items: npy_double at args[0] -> npy_int at args[1], stepped by strides[]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_double_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries are declared: both "#elif !1" arms are false in this aligned variant. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1/_TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_double_to_int\n");*/
+    /* Convert one element per pass; N counts the elements still to do. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch reads src and writes dst through typed pointers, using a plain (_TYPE2) cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Nothing to store here: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned-safe variant: the guard above compiles it only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] items: npy_double at args[0] -> npy_int at args[1], stepped by strides[]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_double_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries let each element be copied in and out with memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out here: this variant makes no alignment assumption) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_double_to_int\n");*/
+    /* Convert one element per pass; N counts the elements still to do. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch is dst_value = _CONVERT_FN(src_value), a plain (_TYPE2) cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value out; memmove imposes no alignment requirement on dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: the guard above reduces to !(... && 0), so this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] packed items: npy_double at args[0] -> npy_int at args[1]; strides is not read; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_double_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries are declared: both "#elif !1" arms are false in this aligned variant. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1/_TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_double_to_int\n");*/
+    /* Convert one element per pass; N counts the elements still to do. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch reads src and writes dst through typed pointers, using a plain (_TYPE2) cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Nothing to store here: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step each pointer by its element size. */
+#if 1
+        dst += sizeof(npy_int);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned-safe variant: the guard above compiles it only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] packed items: npy_double at args[0] -> npy_int at args[1]; strides is not read; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_double_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries let each element be copied in and out with memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out here: this variant makes no alignment assumption) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_double_to_int\n");*/
+    /* Convert one element per pass; N counts the elements still to do. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch is dst_value = _CONVERT_FN(src_value), a plain (_TYPE2) cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value out; memmove imposes no alignment requirement on dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step each pointer by its element size. */
+#if 1
+        dst += sizeof(npy_int);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: the guard above reduces to !(... && 0), so this block is always compiled. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] items: npy_double at args[0] -> npy_long at args[1], stepped by strides[]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_double_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* No temporaries are declared: both "#elif !1" arms are false in this aligned variant. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1/_TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_double_to_long\n");*/
+    /* Convert one element per pass; N counts the elements still to do. */
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch reads src and writes dst through typed pointers, using a plain (_TYPE2) cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Nothing to store here: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned-safe variant: the guard above compiles it only when NPY_USE_UNALIGNED_ACCESS is 0. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] items: npy_double at args[0] -> npy_long at args[1], stepped by strides[]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_double_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scalar temporaries let each element be copied in and out with memmove. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out here: this variant makes no alignment assumption) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_double_to_long\n");*/
+    /* Convert one element per pass; N counts the elements still to do. */
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the live branch is dst_value = _CONVERT_FN(src_value), a plain (_TYPE2) cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value out; memmove imposes no alignment requirement on dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this unit is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* not taken for this instantiation */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* active branch: use the C types directly */
+
+#define _TYPE1 npy_double  /* type each source element is read as */
+#define _TYPE2 npy_long  /* type each destination element is written as */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 (C leaves NaN/out-of-range double -> integer conversion undefined) */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], dereferencing both pointers directly (alignment is asserted) and stepping by the element sizes; `strides` is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_double_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* not taken: this variant never reads `strides` */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* not taken: no source temporary is needed */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* not taken: no destination temporary is needed */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when N > 0, src must be aligned for _TYPE1 and dst for _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_double_to_long\n");*/
+
+    while (N--) {
+#if 1  /* taken, but the inner block is disabled: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: read *src, convert, and write *dst in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but the inner block is disabled: the result was already written to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance each pointer by one element */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop this unit's helper macros; each unit defines its own */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* not taken for this instantiation */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* active branch: use the C types directly */
+
+#define _TYPE1 npy_double  /* type each source element is read as */
+#define _TYPE2 npy_long  /* type each destination element is written as */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 (C leaves NaN/out-of-range double -> integer conversion undefined) */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], moving each element through local temporaries with memmove and stepping by the element sizes; `strides` is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_double_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* not taken: this variant never reads `strides` */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* taken: one local temporary holds the source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* taken: one local temporary holds the converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not taken: this variant does not assert pointer alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_double_to_long\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: copy the source bytes into the temporary instead of dereferencing src */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: copy the converted value's bytes out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance each pointer by one element */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop this unit's helper macros; each unit defines its own */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this unit is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* not taken for this instantiation */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* active branch: use the C types directly */
+
+#define _TYPE1 npy_double  /* type each source element is read as */
+#define _TYPE2 npy_longlong  /* type each destination element is written as */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 (C leaves NaN/out-of-range double -> integer conversion undefined) */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], dereferencing both pointers directly (alignment is asserted) and stepping by strides[0]/strides[1] bytes. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_double_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* taken: the per-element byte steps come from `strides` */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* not taken: no source temporary is needed */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* not taken: no destination temporary is needed */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when N > 0, src must be aligned for _TYPE1 and dst for _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_double_to_longlong\n");*/
+
+    while (N--) {
+#if 1  /* taken, but the inner block is disabled: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: read *src, convert, and write *dst in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but the inner block is disabled: the result was already written to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_double);
+#else  /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop this unit's helper macros; each unit defines its own */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* not taken for this instantiation */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* active branch: use the C types directly */
+
+#define _TYPE1 npy_double  /* type each source element is read as */
+#define _TYPE2 npy_longlong  /* type each destination element is written as */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 (C leaves NaN/out-of-range double -> integer conversion undefined) */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], moving each element through local temporaries with memmove and stepping by strides[0]/strides[1] bytes. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_double_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* taken: the per-element byte steps come from `strides` */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* taken: one local temporary holds the source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* taken: one local temporary holds the converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not taken: this variant does not assert pointer alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_double_to_longlong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: copy the source bytes into the temporary instead of dereferencing src */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: copy the converted value's bytes out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_double);
+#else  /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop this unit's helper macros; each unit defines its own */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this unit is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* not taken for this instantiation */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* active branch: use the C types directly */
+
+#define _TYPE1 npy_double  /* type each source element is read as */
+#define _TYPE2 npy_longlong  /* type each destination element is written as */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 (C leaves NaN/out-of-range double -> integer conversion undefined) */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], dereferencing both pointers directly (alignment is asserted) and stepping by the element sizes; `strides` is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_double_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* not taken: this variant never reads `strides` */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* not taken: no source temporary is needed */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* not taken: no destination temporary is needed */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when N > 0, src must be aligned for _TYPE1 and dst for _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_double_to_longlong\n");*/
+
+    while (N--) {
+#if 1  /* taken, but the inner block is disabled: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: read *src, convert, and write *dst in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but the inner block is disabled: the result was already written to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance each pointer by one element */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop this unit's helper macros; each unit defines its own */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* not taken for this instantiation */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* active branch: use the C types directly */
+
+#define _TYPE1 npy_double  /* type each source element is read as */
+#define _TYPE2 npy_longlong  /* type each destination element is written as */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active branch */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 (C leaves NaN/out-of-range double -> integer conversion undefined) */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], moving each element through local temporaries with memmove and stepping by the element sizes; `strides` is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_double_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* not taken: this variant never reads `strides` */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* taken: one local temporary holds the source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* taken: one local temporary holds the converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not taken: this variant does not assert pointer alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_double_to_longlong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: copy the source bytes into the temporary instead of dereferencing src */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: copy the converted value's bytes out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance each pointer by one element */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop this unit's helper macros; each unit defines its own */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this unit is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1  /* taken for this instantiation */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1  /* active: source elements are read as 64-bit unsigned integers */
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* active: destination elements are written as npy_half */
+#    define _TYPE2 npy_half
+#  endif
+
+#else  /* not taken */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1  /* active branch */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1  /* active: the 64-bit pattern is passed to npy_doublebits_to_halfbits */
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not taken */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], dereferencing both pointers directly (alignment is asserted) and stepping by strides[0]/strides[1] bytes. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_double_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* taken: the per-element byte steps come from `strides` */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* not taken: no source temporary is needed */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* not taken: no destination temporary is needed */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when N > 0, src must be aligned for _TYPE1 and dst for _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_double_to_half\n");*/
+
+    while (N--) {
+#if 1  /* taken, but the inner block is disabled: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: read *src, convert, and write *dst in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but the inner block is disabled: the result was already written to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_double);
+#else  /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop this unit's helper macros; each unit defines its own */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1  /* taken for this instantiation */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1  /* active: source elements are read as 64-bit unsigned integers */
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* active: destination elements are written as npy_half */
+#    define _TYPE2 npy_half
+#  endif
+
+#else  /* not taken */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1  /* active branch */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1  /* active: the 64-bit pattern is passed to npy_doublebits_to_halfbits */
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not taken */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], moving each element through local temporaries with memmove and stepping by strides[0]/strides[1] bytes. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_double_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0  /* taken: the per-element byte steps come from `strides` */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* taken: one local temporary holds the source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* taken: one local temporary holds the converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* not taken: this variant does not assert pointer alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_double_to_half\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: copy the source bytes into the temporary instead of dereferencing src */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active: convert from one temporary into the other */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: copy the converted value's bytes out to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_double);
+#else  /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop this unit's helper macros; each unit defines its own */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* `!1` is 0, so this unit is always compiled */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1  /* taken for this instantiation */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1  /* active: source elements are read as 64-bit unsigned integers */
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* active: destination elements are written as npy_half */
+#    define _TYPE2 npy_half
+#  endif
+
+#else  /* not taken */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* not taken */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1  /* active branch */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1  /* active: the 64-bit pattern is passed to npy_doublebits_to_halfbits */
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* not taken */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from args[0] to args[1], dereferencing both pointers directly (alignment is asserted) and stepping by the element sizes; `strides` is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_double_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1  /* not taken: this variant never reads `strides` */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* not taken: no source temporary is needed */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* not taken: no destination temporary is needed */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: when N > 0, src must be aligned for _TYPE1 and dst for _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_double_to_half\n");*/
+
+    while (N--) {
+#if 1  /* taken, but the inner block is disabled: nothing is loaded into a temporary */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0  /* not taken */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* active branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active: read *src, convert, and write *dst in one statement */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* taken, but the inner block is disabled: the result was already written to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: advance each pointer by one element */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN  /* drop this unit's helper macros; each unit defines its own */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion: raw bit patterns held in unsigned integers are converted instead */
+#if 0 || 1  /* true in this section: the bit-pattern types below are selected */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64  /* selected source item type */
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half  /* selected destination item type */
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1  /* the only group compiled in this section */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)  /* selected conversion */
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Convert dimensions[0] packed double-sized items at args[0] into half-sized items at args[1] via memmove; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_double_to_half(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !1  /* compiled out: this variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active: local copy of one source item */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local holder for one converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: alignment is not asserted in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_double_to_half\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load into the local copy */
+#endif
+
+/* Do the cast (a single statement survives preprocessing; it is annotated below) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* compiled statement: convert the local copy */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the converted item */
+#endif
+
+#if 1  /* advance both cursors by one packed item */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard opened at the top of this section */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard always holds */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this section: the plain types in the #else group are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* selected source item type */
+#define _TYPE2 npy_float  /* selected destination item type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* the only group compiled in this section */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected conversion: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_double items at args[0] to npy_float at args[1], stepping by strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_double_to_float(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !0  /* active: per-item byte steps for each cursor */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* compiled out: no local copy of the source item is made */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* compiled out: the result is stored straight through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: both buffers must be aligned for typed access (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_double_to_float\n");*/
+
+    while (N--) {
+#if 1  /* active branch is empty: the item is read directly in the cast below */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (a single statement survives preprocessing; it is annotated below) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* compiled statement: typed load, convert, typed store */
+#  endif
+#endif
+
+#if 1  /* active branch is empty: the store already happened in the cast above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_double);
+#else  /* active: advance each cursor by its own byte stride */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard opened at the top of this section */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this section: the plain types in the #else group are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* selected source item type */
+#define _TYPE2 npy_float  /* selected destination item type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* the only group compiled in this section */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected conversion: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_double items at args[0] to npy_float at args[1] via memmove, stepping by strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_double_to_float(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !0  /* active: per-item byte steps for each cursor */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active: local copy of one source item */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local holder for one converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: alignment is not asserted in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_double_to_float\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load into the local copy */
+#endif
+
+/* Do the cast (a single statement survives preprocessing; it is annotated below) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* compiled statement: convert the local copy */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the converted item */
+#endif
+
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_double);
+#else  /* active: advance each cursor by its own byte stride */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard opened at the top of this section */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard always holds */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this section: the plain types in the #else group are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* selected source item type */
+#define _TYPE2 npy_float  /* selected destination item type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* the only group compiled in this section */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected conversion: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] packed npy_double items at args[0] to packed npy_float items at args[1] with direct typed access; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_double_to_float(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !1  /* compiled out: this variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* compiled out: no local copy of the source item is made */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* compiled out: the result is stored straight through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: both buffers must be aligned for typed access (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_double_to_float\n");*/
+
+    while (N--) {
+#if 1  /* active branch is empty: the item is read directly in the cast below */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (a single statement survives preprocessing; it is annotated below) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* compiled statement: typed load, convert, typed store */
+#  endif
+#endif
+
+#if 1  /* active branch is empty: the store already happened in the cast above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* advance both cursors by one packed item */
+        dst += sizeof(npy_float);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard opened at the top of this section */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this section: the plain types in the #else group are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* selected source item type */
+#define _TYPE2 npy_float  /* selected destination item type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* the only group compiled in this section */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected conversion: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Cast dimensions[0] packed npy_double items at args[0] to packed npy_float items at args[1] via memmove; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_double_to_float(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !1  /* compiled out: this variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active: local copy of one source item */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local holder for one converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: alignment is not asserted in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_double_to_float\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load into the local copy */
+#endif
+
+/* Do the cast (a single statement survives preprocessing; it is annotated below) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* compiled statement: convert the local copy */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the converted item */
+#endif
+
+#if 1  /* advance both cursors by one packed item */
+        dst += sizeof(npy_float);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard opened at the top of this section */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard always holds */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this section: the plain types in the #else group are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* selected source item type */
+#define _TYPE2 npy_double  /* selected destination item type (same as the source) */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* the only group compiled in this section */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected conversion: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Copy-cast dimensions[0] npy_double items from args[0] to args[1], stepping by strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_double_to_double(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !0  /* active: per-item byte steps for each cursor */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* compiled out: no local copy of the source item is made */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* compiled out: the result is stored straight through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: both buffers must be aligned for typed access (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_double_to_double\n");*/
+
+    while (N--) {
+#if 1  /* active branch is empty: the item is read directly in the cast below */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (a single statement survives preprocessing; it is annotated below) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* compiled statement: typed load, convert, typed store */
+#  endif
+#endif
+
+#if 1  /* active branch is empty: the store already happened in the cast above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_double);
+#else  /* active: advance each cursor by its own byte stride */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard opened at the top of this section */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this section: the plain types in the #else group are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* selected source item type */
+#define _TYPE2 npy_double  /* selected destination item type (same as the source) */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* the only group compiled in this section */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected conversion: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Copy-cast dimensions[0] npy_double items from args[0] to args[1] via memmove, stepping by strides[0]/strides[1] bytes; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_double_to_double(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !0  /* active: per-item byte steps for each cursor */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active: local copy of one source item */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local holder for one converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: alignment is not asserted in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_double_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load into the local copy */
+#endif
+
+/* Do the cast (a single statement survives preprocessing; it is annotated below) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* compiled statement: convert the local copy */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the converted item */
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_double);
+#else  /* active: advance each cursor by its own byte stride */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard opened at the top of this section */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this guard always holds */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this section: the plain types in the #else group are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* selected source item type */
+#define _TYPE2 npy_double  /* selected destination item type (same as the source) */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* the only group compiled in this section */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected conversion: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Copy-cast dimensions[0] packed npy_double items from args[0] to args[1] with direct typed access; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_double_to_double(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !1  /* compiled out: this variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* compiled out: no local copy of the source item is made */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* compiled out: the result is stored straight through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: both buffers must be aligned for typed access (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_double_to_double\n");*/
+
+    while (N--) {
+#if 1  /* active branch is empty: the item is read directly in the cast below */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (a single statement survives preprocessing; it is annotated below) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* compiled statement: typed load, convert, typed store */
+#  endif
+#endif
+
+#if 1  /* active branch is empty: the store already happened in the cast above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* advance both cursors by one packed item */
+        dst += sizeof(npy_double);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard opened at the top of this section */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* "!0" is 1, so this reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false in this section: the plain types in the #else group are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* selected source item type */
+#define _TYPE2 npy_double  /* selected destination item type (same as the source) */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* the only group compiled in this section */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* selected conversion: a plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Copy-cast dimensions[0] packed npy_double items from args[0] to args[1] via memmove; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_double_to_double(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not referenced in the body */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items left to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into source and destination */
+#if !1  /* compiled out: this variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0  /* active: local copy of one source item */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local holder for one converted item */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* compiled out: alignment is not asserted in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_double_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load into the local copy */
+#endif
+
+/* Do the cast (a single statement survives preprocessing; it is annotated below) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* compiled statement: convert the local copy */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the converted item */
+#endif
+
+#if 1  /* advance both cursors by one packed item */
+        dst += sizeof(npy_double);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* closes the NPY_USE_UNALIGNED_ACCESS guard opened at the top of this section */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: `&& !1` makes this condition always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else  /* branch taken here (`0 || 0` above is false): use the real C types */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken here: both half-precision groups above are disabled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* aligned, strided npy_double -> npy_longdouble cast loop; always returns 0 */
+_aligned_cast_double_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* `context` is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read, args[1] is written */
+#if !0  /* strided variant: per-element byte steps come from `strides` */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* aligned variant: no staging copy of the source is declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* aligned variant: no staging copy of the result is declared */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced through typed pointers below, so both must be aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_double_to_longdouble\n");*/
+
+    while (N--) {
+#if 1  /* aligned: nothing is loaded up front (the inner block is disabled) */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer `#if 0` disables the two-component-source half; the `#else` half is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active statement: typed load, cast, typed store in one step */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* aligned: the result was already stored by the cast above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_double);
+#else  /* strided: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else  /* branch taken here (`0 || 0` above is false): use the real C types */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken here: both half-precision groups above are disabled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* strided npy_double -> npy_longdouble cast loop with no alignment requirement; always returns 0 */
+_cast_double_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* `context` is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read, args[1] is written */
+#if !0  /* strided variant: per-element byte steps come from `strides` */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local staging copy of one source element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* local staging copy of one converted element */
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant copies through memmove instead of typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_double_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* unaligned: load the element bytewise into the staging copy */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer `#if 0` disables the two-component-source half; the `#else` half is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active statement: convert the staged copy */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* unaligned: store the staged result bytewise */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_double);
+#else  /* strided: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: `&& !1` makes this condition always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else  /* branch taken here (`0 || 0` above is false): use the real C types */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken here: both half-precision groups above are disabled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* aligned, contiguous npy_double -> npy_longdouble cast loop; always returns 0 */
+_aligned_contig_cast_double_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* `context` is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read, args[1] is written */
+#if !1  /* contiguous variant: `strides` is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* aligned variant: no staging copy of the source is declared */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* aligned variant: no staging copy of the result is declared */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced through typed pointers below, so both must be aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_double_to_longdouble\n");*/
+
+    while (N--) {
+#if 1  /* aligned: nothing is loaded up front (the inner block is disabled) */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer `#if 0` disables the two-component-source half; the `#else` half is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else  /* active statement: typed load, cast, typed store in one step */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* aligned: the result was already stored by the cast above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous: step each pointer by its element size */
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else  /* branch taken here (`0 || 0` above is false): use the real C types */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken here: both half-precision groups above are disabled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* contiguous npy_double -> npy_longdouble cast loop with no alignment requirement; always returns 0 */
+_contig_cast_double_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* `context` is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read, args[1] is written */
+#if !1  /* contiguous variant: `strides` is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local staging copy of one source element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* local staging copy of one converted element */
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant copies through memmove instead of typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_double_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* unaligned: load the element bytewise into the staging copy */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer `#if 0` disables the two-component-source half; the `#else` half is compiled) */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0  /* active statement: convert the staged copy */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* unaligned: store the staged result bytewise */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous: step each pointer by its element size */
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: `&& !1` makes this condition always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else  /* branch taken here (`0 || 0` above is false); _TYPE2 is the type of ONE destination component */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken here: both half-precision groups above are disabled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* aligned, strided npy_double -> npy_cfloat cast loop; always returns 0 */
+_aligned_cast_double_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,  /* `context` is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read, args[1] is written */
+#if !0  /* strided variant: per-element byte steps come from `strides` */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* aligned variant: no staging copy of the source is declared */
+    _TYPE1 src_value;
+#endif
+#if 1  /* two-component destination: both components are staged here before being stored */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced through typed pointers below, so both must be aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_double_to_cfloat\n");*/
+
+    while (N--) {
+#if 1  /* aligned: nothing is loaded up front (the inner block is disabled) */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer `#if 0` disables the two-component-source half; the `#else` half is compiled) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else  /* aligned: read the source through a typed pointer */
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is zeroed: the source supplies only one component */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1  /* aligned: store both staged components through a typed pointer */
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_double);
+#else  /* strided: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else  /* branch taken here (`0 || 0` above is false); _TYPE2 is the type of ONE destination component */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken here: both half-precision groups above are disabled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* strided npy_double -> npy_cfloat cast loop with no alignment requirement; always returns 0 */
+_cast_double_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,  /* `context` is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read, args[1] is written */
+#if !0  /* strided variant: per-element byte steps come from `strides` */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local staging copy of one source element */
+#endif
+#if 1  /* two-component destination: both components are staged here before being stored */
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant copies through memmove instead of typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_double_to_cfloat\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* unaligned: load the element bytewise into the staging copy */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer `#if 0` disables the two-component-source half; the `#else` half is compiled) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0  /* unaligned: convert the staged copy of the source */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is zeroed: the source supplies only one component */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* unaligned: store both staged components bytewise */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_double);
+#else  /* strided: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: `&& !1` makes this condition always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else  /* branch taken here (`0 || 0` above is false); _TYPE2 is the type of ONE destination component */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken here: both half-precision groups above are disabled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* aligned, contiguous npy_double -> npy_cfloat cast loop; always returns 0 */
+_aligned_contig_cast_double_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,  /* `context` is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read, args[1] is written */
+#if !1  /* contiguous variant: `strides` is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* aligned variant: no staging copy of the source is declared */
+    _TYPE1 src_value;
+#endif
+#if 1  /* two-component destination: both components are staged here before being stored */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced through typed pointers below, so both must be aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_double_to_cfloat\n");*/
+
+    while (N--) {
+#if 1  /* aligned: nothing is loaded up front (the inner block is disabled) */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer `#if 0` disables the two-component-source half; the `#else` half is compiled) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else  /* aligned: read the source through a typed pointer */
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is zeroed: the source supplies only one component */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1  /* aligned: store both staged components through a typed pointer */
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous: step each pointer by its element size */
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else  /* branch taken here (`0 || 0` above is false); _TYPE2 is the type of ONE destination component */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken here: both half-precision groups above are disabled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* contiguous npy_double -> npy_cfloat cast loop with no alignment requirement; always returns 0 */
+_contig_cast_double_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,  /* `context` is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read, args[1] is written */
+#if !1  /* contiguous variant: `strides` is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local staging copy of one source element */
+#endif
+#if 1  /* two-component destination: both components are staged here before being stored */
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant copies through memmove instead of typed pointers) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_double_to_cfloat\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* unaligned: load the element bytewise into the staging copy */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer `#if 0` disables the two-component-source half; the `#else` half is compiled) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0  /* unaligned: convert the staged copy of the source */
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is zeroed: the source supplies only one component */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* unaligned: store both staged components bytewise */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous: step each pointer by its element size */
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: `&& !1` makes this condition always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else  /* branch taken here (`0 || 0` above is false); _TYPE2 is the type of ONE destination component */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken here: both half-precision groups above are disabled */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* selected: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* aligned, strided npy_double -> npy_cdouble cast loop; always returns 0 */
+_aligned_cast_double_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* `context` is not referenced in this body */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is read, args[1] is written */
+#if !0  /* strided variant: per-element byte steps come from `strides` */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1  /* aligned variant: no staging copy of the source is declared */
+    _TYPE1 src_value;
+#endif
+#if 1  /* two-component destination: both components are staged here before being stored */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced through typed pointers below, so both must be aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_double_to_cdouble\n");*/
+
+    while (N--) {
+#if 1  /* aligned: nothing is loaded up front (the inner block is disabled) */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (the outer `#if 0` disables the two-component-source half; the `#else` half is compiled) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else  /* aligned: read the source through a typed pointer */
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component is zeroed: the source supplies only one component */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1  /* aligned: store both staged components through a typed pointer */
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_double);
+#else  /* strided: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* strided double -> cdouble cast loop using byte copies; always returns 0 */
+_cast_double_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte cursors */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* bytes between elements */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of one input element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* one output element built as a two-component pair */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_double_to_cdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy instead of a typed load from src */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* convert the local copy */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component of the pair is always zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy of both components to dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;  /* advance by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* aligned, contiguous double -> cdouble cast loop; always returns 0 */
+_aligned_contig_cast_double_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte cursors */
+#if !1  /* contiguous variant: strides are not read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* one output element built as a two-component pair */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the typed loads/stores below need aligned src and dst */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_double_to_cdouble\n");*/
+
+    while (N--) {
+#if 1  /* aligned variant: no staging copy of the source element is made */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* typed read straight from src */
+#    endif
+    dst_value[1] = 0;  /* second component of the pair is always zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* typed writes straight to dst */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cdouble);  /* contiguous: advance by one element size */
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* contiguous double -> cdouble cast loop using byte copies; always returns 0 */
+_contig_cast_double_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte cursors */
+#if !1  /* contiguous variant: strides are not read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of one input element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* one output element built as a two-component pair */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_double_to_cdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy instead of a typed load from src */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* convert the local copy */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component of the pair is always zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy of both components to dst */
+#endif
+
+#if 1
+        dst += sizeof(npy_cdouble);  /* contiguous: advance by one element size */
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* aligned, strided double -> clongdouble cast loop; always returns 0 */
+_aligned_cast_double_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte cursors */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* bytes between elements */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* one output element built as a two-component pair */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the typed loads/stores below need aligned src and dst */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_double_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1  /* aligned variant: no staging copy of the source element is made */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* typed read straight from src */
+#    endif
+    dst_value[1] = 0;  /* second component of the pair is always zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* typed writes straight to dst */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;  /* advance by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* strided double -> clongdouble cast loop using byte copies; always returns 0 */
+_cast_double_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte cursors */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* bytes between elements */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of one input element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* one output element built as a two-component pair */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_double_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy instead of a typed load from src */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* convert the local copy */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component of the pair is always zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy of both components to dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;  /* advance by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* aligned, contiguous double -> clongdouble cast loop; always returns 0 */
+_aligned_contig_cast_double_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte cursors */
+#if !1  /* contiguous variant: strides are not read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* one output element built as a two-component pair */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the typed loads/stores below need aligned src and dst */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_double_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1  /* aligned variant: no staging copy of the source element is made */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);  /* typed read straight from src */
+#    endif
+    dst_value[1] = 0;  /* second component of the pair is always zero */
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* typed writes straight to dst */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_clongdouble);  /* contiguous: advance by one element size */
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* contiguous double -> clongdouble cast loop using byte copies; always returns 0 */
+_contig_cast_double_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte cursors */
+#if !1  /* contiguous variant: strides are not read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of one input element */
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* one output element built as a two-component pair */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_double_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy instead of a typed load from src */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);  /* convert the local copy */
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;  /* second component of the pair is always zero */
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy of both components to dst */
+#endif
+
+#if 1
+        dst += sizeof(npy_clongdouble);  /* contiguous: advance by one element size */
+        src += sizeof(npy_double);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* aligned, strided longdouble -> bool cast loop; always returns 0 */
+_aligned_cast_longdouble_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte cursors */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* bytes between elements */
+#endif
+
+#if 0  /* no src_value/dst_value temporaries are declared in this variant */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the typed load/store below needs aligned src and dst */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longdouble_to_bool\n");*/
+
+    while (N--) {
+#if 1  /* aligned variant: no staging copy of the source element is made */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);  /* (x != 0) test, read from src and written to dst in place */
+#  endif
+#endif
+
+#if 1  /* nothing left to store: the cast above already wrote dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;  /* advance by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* strided longdouble -> bool cast loop using byte copies; always returns 0 */
+_cast_longdouble_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* input / output byte cursors */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* bytes between elements */
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;  /* local copy of one input element */
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* converted result before it is copied out */
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longdouble_to_bool\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy instead of a typed load from src */
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);  /* (x != 0) test on the local copy */
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy of the result to dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;  /* advance by the caller-supplied strides */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Contiguous cast loop from npy_longdouble to npy_bool that dereferences
+ * both buffers directly as typed data.
+ *
+ * Converts dimensions[0] elements from args[0] into args[1]; each output is
+ * (npy_bool)(value != 0).  The pointers advance by sizeof(npy_longdouble)
+ * and sizeof(npy_bool) respectively, so `strides` is never read.  The
+ * assert() calls check the alignment of both pointers whenever there is at
+ * least one element.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += sizeof(npy_longdouble);
+        dst += sizeof(npy_bool);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Contiguous cast loop from npy_longdouble to npy_bool that goes through
+ * local temporaries.
+ *
+ * Converts dimensions[0] elements from args[0] into args[1]; each output is
+ * (npy_bool)(value != 0).  Elements are moved in and out with memmove(), and
+ * the pointers advance by sizeof(npy_longdouble) and sizeof(npy_bool), so
+ * `strides` is never read.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    for (; remaining != 0; --remaining) {
+        /* Bytewise copies: the buffers are never dereferenced as typed data */
+        memmove(&loaded, in, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out, &converted, sizeof(converted));
+
+        in += sizeof(npy_longdouble);
+        out += sizeof(npy_bool);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Strided cast loop from npy_longdouble to npy_ubyte that dereferences both
+ * buffers directly as typed data.
+ *
+ * Converts dimensions[0] elements from args[0] into args[1] with a plain C
+ * cast to npy_ubyte.  strides[0] and strides[1] are added to the (char *)
+ * source and destination pointers after each element, so they are byte
+ * steps.  The assert() calls check the alignment of both start pointers
+ * whenever there is at least one element.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N, src += src_step, dst += dst_step) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Strided cast loop from npy_longdouble to npy_ubyte that goes through
+ * local temporaries.
+ *
+ * Converts dimensions[0] elements from args[0] into args[1] with a plain C
+ * cast to npy_ubyte.  Elements are moved in and out with memmove().
+ * strides[0] and strides[1] are added to the (char *) source and
+ * destination pointers after each element, so they are byte steps.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    for (; remaining != 0; --remaining) {
+        /* Bytewise copies: the buffers are never dereferenced as typed data */
+        memmove(&loaded, in, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out, &converted, sizeof(converted));
+
+        in += in_step;
+        out += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Contiguous cast loop from npy_longdouble to npy_ubyte that dereferences
+ * both buffers directly as typed data.
+ *
+ * Converts dimensions[0] elements from args[0] into args[1] with a plain C
+ * cast to npy_ubyte.  The pointers advance by sizeof(npy_longdouble) and
+ * sizeof(npy_ubyte) respectively, so `strides` is never read.  The assert()
+ * calls check the alignment of both pointers whenever there is at least one
+ * element.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += sizeof(npy_longdouble);
+        dst += sizeof(npy_ubyte);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ubyte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Contiguous cast loop from npy_longdouble to npy_ubyte that goes through
+ * local temporaries.
+ *
+ * Converts dimensions[0] elements from args[0] into args[1] with a plain C
+ * cast to npy_ubyte.  Elements are moved in and out with memmove(), and the
+ * pointers advance by sizeof(npy_longdouble) and sizeof(npy_ubyte), so
+ * `strides` is never read.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    for (; remaining != 0; --remaining) {
+        /* Bytewise copies: the buffers are never dereferenced as typed data */
+        memmove(&loaded, in, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out, &converted, sizeof(converted));
+
+        in += sizeof(npy_longdouble);
+        out += sizeof(npy_ubyte);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Strided cast loop from npy_longdouble to npy_ushort that dereferences
+ * both buffers directly as typed data.
+ *
+ * Converts dimensions[0] elements from args[0] into args[1] with a plain C
+ * cast to npy_ushort.  strides[0] and strides[1] are added to the (char *)
+ * source and destination pointers after each element, so they are byte
+ * steps.  The assert() calls check the alignment of both start pointers
+ * whenever there is at least one element.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N, src += src_step, dst += dst_step) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Strided cast loop from npy_longdouble to npy_ushort that goes through
+ * local temporaries.
+ *
+ * Converts dimensions[0] elements from args[0] into args[1] with a plain C
+ * cast to npy_ushort.  Elements are moved in and out with memmove().
+ * strides[0] and strides[1] are added to the (char *) source and
+ * destination pointers after each element, so they are byte steps.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    char *in = args[0];
+    char *out = args[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 loaded;
+    _TYPE2 converted;
+
+    for (; remaining != 0; --remaining) {
+        /* Bytewise copies: the buffers are never dereferenced as typed data */
+        memmove(&loaded, in, sizeof(loaded));
+        converted = _CONVERT_FN(loaded);
+        memmove(out, &converted, sizeof(converted));
+
+        in += in_step;
+        out += out_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Contiguous cast loop from npy_longdouble to npy_ushort that dereferences
+ * both buffers directly as typed data.
+ *
+ * Converts dimensions[0] elements from args[0] into args[1] with a plain C
+ * cast to npy_ushort.  The pointers advance by sizeof(npy_longdouble) and
+ * sizeof(npy_ushort) respectively, so `strides` is never read.  The assert()
+ * calls check the alignment of both pointers whenever there is at least one
+ * element.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+
+        src += sizeof(npy_longdouble);
+        dst += sizeof(npy_ushort);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* generated constant, false here: the #else arm below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else /* active arm: source and destination element types */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ushort
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active arm: _CONVERT_FN resolves to a plain C cast to _TYPE2 */
+/* NOTE(review): C leaves a floating-to-integer cast undefined when the value does not fit the target type; confirm NaN/out-of-range input is handled upstream */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] packed npy_longdouble items at args[0] to npy_ushort at args[1]; neither buffer needs to be aligned. Returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_ushort(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not read here */
+        NpyAuxData *NPY_UNUSED(data)) /* auxiliary data is ignored */
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !1 /* compiled out: the contiguous loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: local copy of one input element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: local copy of one converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* compiled out: the unaligned variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longdouble_to_ushort\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy in, so src is never dereferenced as _TYPE1 */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active arm */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the local copy */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy out, so dst is never dereferenced as _TYPE2 */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: contiguous layout, advance by one element */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* unconditional: this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* aligned variant: !1 is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* generated constant, false here: the #else arm below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* active arm: source and destination element types */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active arm: _CONVERT_FN resolves to a plain C cast to _TYPE2 */
+/* NOTE(review): C leaves a floating-to-integer cast undefined when the value does not fit the target type; confirm NaN/out-of-range input is handled upstream */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] aligned npy_longdouble items at args[0] to npy_uint at args[1], stepping by strides[0]/strides[1] bytes. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_uint(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0]/[1] = input/output byte steps */
+        NpyAuxData *NPY_UNUSED(data)) /* auxiliary data is ignored */
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !0 /* active: per-element byte steps for input and output */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* compiled out: the aligned variant needs no local copy */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* compiled out: the aligned variant needs no local copy */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* active: assert both pointers satisfy the element alignment (skipped when N == 0) */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longdouble_to_uint\n");*/
+
+    while (N--) {
+#if 1 /* aligned variant: no separate load step; the nested block is compiled out */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active arm */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: read, convert and store directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned variant: no separate store step; the nested block is compiled out */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_longdouble);
+#else /* active: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* unconditional: this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* generated constant, false here: the #else arm below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* active arm: source and destination element types */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active arm: _CONVERT_FN resolves to a plain C cast to _TYPE2 */
+/* NOTE(review): C leaves a floating-to-integer cast undefined when the value does not fit the target type; confirm NaN/out-of-range input is handled upstream */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_longdouble items at args[0] to npy_uint at args[1], stepping by strides[0]/strides[1] bytes; buffers may be unaligned. Returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_uint(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0]/[1] = input/output byte steps */
+        NpyAuxData *NPY_UNUSED(data)) /* auxiliary data is ignored */
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !0 /* active: per-element byte steps for input and output */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: local copy of one input element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: local copy of one converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* compiled out: the unaligned variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longdouble_to_uint\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy in, so src is never dereferenced as _TYPE1 */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active arm */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the local copy */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy out, so dst is never dereferenced as _TYPE2 */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_longdouble);
+#else /* active: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* unconditional: this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* aligned variant: !1 is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* generated constant, false here: the #else arm below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* active arm: source and destination element types */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active arm: _CONVERT_FN resolves to a plain C cast to _TYPE2 */
+/* NOTE(review): C leaves a floating-to-integer cast undefined when the value does not fit the target type; confirm NaN/out-of-range input is handled upstream */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] packed, aligned npy_longdouble items at args[0] to npy_uint at args[1]. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_uint(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not read here */
+        NpyAuxData *NPY_UNUSED(data)) /* auxiliary data is ignored */
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !1 /* compiled out: the contiguous loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* compiled out: the aligned variant needs no local copy */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* compiled out: the aligned variant needs no local copy */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* active: assert both pointers satisfy the element alignment (skipped when N == 0) */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longdouble_to_uint\n");*/
+
+    while (N--) {
+#if 1 /* aligned variant: no separate load step; the nested block is compiled out */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active arm */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: read, convert and store directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned variant: no separate store step; the nested block is compiled out */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: contiguous layout, advance by one element */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* unconditional: this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* generated constant, false here: the #else arm below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else /* active arm: source and destination element types */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active arm: _CONVERT_FN resolves to a plain C cast to _TYPE2 */
+/* NOTE(review): C leaves a floating-to-integer cast undefined when the value does not fit the target type; confirm NaN/out-of-range input is handled upstream */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] packed npy_longdouble items at args[0] to npy_uint at args[1]; neither buffer needs to be aligned. Returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_uint(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not read here */
+        NpyAuxData *NPY_UNUSED(data)) /* auxiliary data is ignored */
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !1 /* compiled out: the contiguous loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: local copy of one input element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: local copy of one converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* compiled out: the unaligned variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longdouble_to_uint\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy in, so src is never dereferenced as _TYPE1 */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active arm */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the local copy */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy out, so dst is never dereferenced as _TYPE2 */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: contiguous layout, advance by one element */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* unconditional: this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* aligned variant: !1 is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* generated constant, false here: the #else arm below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else /* active arm: source and destination element types */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active arm: _CONVERT_FN resolves to a plain C cast to _TYPE2 */
+/* NOTE(review): C leaves a floating-to-integer cast undefined when the value does not fit the target type; confirm NaN/out-of-range input is handled upstream */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] aligned npy_longdouble items at args[0] to npy_ulong at args[1], stepping by strides[0]/strides[1] bytes. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_ulong(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0]/[1] = input/output byte steps */
+        NpyAuxData *NPY_UNUSED(data)) /* auxiliary data is ignored */
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !0 /* active: per-element byte steps for input and output */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* compiled out: the aligned variant needs no local copy */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* compiled out: the aligned variant needs no local copy */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* active: assert both pointers satisfy the element alignment (skipped when N == 0) */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longdouble_to_ulong\n");*/
+
+    while (N--) {
+#if 1 /* aligned variant: no separate load step; the nested block is compiled out */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active arm */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: read, convert and store directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned variant: no separate store step; the nested block is compiled out */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_longdouble);
+#else /* active: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* unconditional: this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* generated constant, false here: the #else arm below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else /* active arm: source and destination element types */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active arm: _CONVERT_FN resolves to a plain C cast to _TYPE2 */
+/* NOTE(review): C leaves a floating-to-integer cast undefined when the value does not fit the target type; confirm NaN/out-of-range input is handled upstream */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] npy_longdouble items at args[0] to npy_ulong at args[1], stepping by strides[0]/strides[1] bytes; buffers may be unaligned. Returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_ulong(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides[0]/[1] = input/output byte steps */
+        NpyAuxData *NPY_UNUSED(data)) /* auxiliary data is ignored */
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !0 /* active: per-element byte steps for input and output */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: local copy of one input element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: local copy of one converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* compiled out: the unaligned variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longdouble_to_ulong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy in, so src is never dereferenced as _TYPE1 */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active arm */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the local copy */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy out, so dst is never dereferenced as _TYPE2 */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_longdouble);
+#else /* active: advance both cursors by their byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* unconditional: this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* aligned variant: !1 is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* generated constant, false here: the #else arm below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else /* active arm: source and destination element types */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active arm: _CONVERT_FN resolves to a plain C cast to _TYPE2 */
+/* NOTE(review): C leaves a floating-to-integer cast undefined when the value does not fit the target type; confirm NaN/out-of-range input is handled upstream */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] packed, aligned npy_longdouble items at args[0] to npy_ulong at args[1]. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_ulong(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not read here */
+        NpyAuxData *NPY_UNUSED(data)) /* auxiliary data is ignored */
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !1 /* compiled out: the contiguous loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* compiled out: the aligned variant needs no local copy */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* compiled out: the aligned variant needs no local copy */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* active: assert both pointers satisfy the element alignment (skipped when N == 0) */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longdouble_to_ulong\n");*/
+
+    while (N--) {
+#if 1 /* aligned variant: no separate load step; the nested block is compiled out */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active arm */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* active: read, convert and store directly through typed pointers */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned variant: no separate store step; the nested block is compiled out */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: contiguous layout, advance by one element */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* unconditional: this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* generated constant, false here: the #else arm below supplies the types */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else /* active arm: source and destination element types */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* compiled out */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active arm: _CONVERT_FN resolves to a plain C cast to _TYPE2 */
+/* NOTE(review): C leaves a floating-to-integer cast undefined when the value does not fit the target type; confirm NaN/out-of-range input is handled upstream */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Converts dimensions[0] packed npy_longdouble items at args[0] to npy_ulong at args[1]; neither buffer needs to be aligned. Returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_ulong(
+        PyArrayMethod_Context *context, char *const *args, /* context is not read; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0] = element count; strides is not read here */
+        NpyAuxData *NPY_UNUSED(data)) /* auxiliary data is ignored */
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1]; /* byte cursors into the input and output buffers */
+#if !1 /* compiled out: the contiguous loop steps by element size instead */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* active: local copy of one input element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: local copy of one converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* compiled out: the unaligned variant makes no alignment assertions */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longdouble_to_ulong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy in, so src is never dereferenced as _TYPE1 */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* active arm */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* active: convert the local copy */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy out, so dst is never dereferenced as _TYPE2 */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: contiguous layout, advance by one element */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* unconditional: this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* This guard is always true: !1 is 0, so the && term is 0 and the outer ! yields 1. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+/* Active branch (the condition above is constant 0): use the real source and destination C types. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch (both conditions above are constant 0): _CONVERT_FN ends up as a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, strided loop: casts dimensions[0] items from args[0] (npy_longdouble) into args[1] (npy_ulonglong); returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* src and dst are char pointers, so the strides are applied as byte offsets in the loop. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No src_value/dst_value temporaries are declared in this variant: every condition above is 0. */
+#if 1
+   /* sanity check: unless N == 0, both pointers must pass npy_is_aligned for their element type */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longdouble_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing is staged here (inner #if 0); the element is read directly by the cast below. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* The cast statement already stored through dst; the block below expands to nothing. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* This guard reduces to !NPY_USE_UNALIGNED_ACCESS (!0 is 1), so the loop is skipped when that macro is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+/* Active branch (the condition above is constant 0): use the real source and destination C types. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch (both conditions above are constant 0): _CONVERT_FN ends up as a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop using memmove temporaries: casts dimensions[0] items from args[0] (npy_longdouble) into args[1] (npy_ulonglong); returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* src and dst are char pointers, so the strides are applied as byte offsets in the loop. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* Scalar src_value/dst_value temporaries are declared (#elif !0 is 1); the asserts below are compiled out. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longdouble_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* src_value now holds a byte copy of the current source element. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* dst_value holds the converted element; the memmove below writes it out to dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* This guard is always true: !1 is 0, so the && term is 0 and the outer ! yields 1. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+/* Active branch (the condition above is constant 0): use the real source and destination C types. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch (both conditions above are constant 0): _CONVERT_FN ends up as a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned loop stepping by element size: casts dimensions[0] items from args[0] (npy_longdouble) into args[1] (npy_ulonglong); returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides[] is not read in this variant (#if !1 is 0); the loop steps by sizeof of each element type. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No src_value/dst_value temporaries are declared in this variant: every condition above is 0. */
+#if 1
+   /* sanity check: unless N == 0, both pointers must pass npy_is_aligned for their element type */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longdouble_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing is staged here (inner #if 0); the element is read directly by the cast below. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* The cast statement already stored through dst; the block below expands to nothing. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance each pointer by the size of its own element type. */
+#if 1
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* This guard reduces to !NPY_USE_UNALIGNED_ACCESS (!0 is 1), so the loop is skipped when that macro is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+/* Active branch (the condition above is constant 0): use the real source and destination C types. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch (both conditions above are constant 0): _CONVERT_FN ends up as a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Element-size-step loop using memmove temporaries: casts dimensions[0] items from args[0] (npy_longdouble) into args[1] (npy_ulonglong); returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides[] is not read in this variant (#if !1 is 0); the loop steps by sizeof of each element type. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* Scalar src_value/dst_value temporaries are declared (#elif !0 is 1); the asserts below are compiled out. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longdouble_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* src_value now holds a byte copy of the current source element. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* dst_value holds the converted element; the memmove below writes it out to dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance each pointer by the size of its own element type. */
+#if 1
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* This guard is always true: !1 is 0, so the && term is 0 and the outer ! yields 1. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Active branch (the condition above is constant 0): use the real source and destination C types. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch (both conditions above are constant 0): _CONVERT_FN ends up as a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, strided loop: casts dimensions[0] items from args[0] (npy_longdouble) into args[1] (npy_byte); returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* src and dst are char pointers, so the strides are applied as byte offsets in the loop. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No src_value/dst_value temporaries are declared in this variant: every condition above is 0. */
+#if 1
+   /* sanity check: unless N == 0, both pointers must pass npy_is_aligned for their element type */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longdouble_to_byte\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing is staged here (inner #if 0); the element is read directly by the cast below. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* The cast statement already stored through dst; the block below expands to nothing. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* This guard reduces to !NPY_USE_UNALIGNED_ACCESS (!0 is 1), so the loop is skipped when that macro is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Active branch (the condition above is constant 0): use the real source and destination C types. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch (both conditions above are constant 0): _CONVERT_FN ends up as a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop using memmove temporaries: casts dimensions[0] items from args[0] (npy_longdouble) into args[1] (npy_byte); returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* src and dst are char pointers, so the strides are applied as byte offsets in the loop. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* Scalar src_value/dst_value temporaries are declared (#elif !0 is 1); the asserts below are compiled out. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longdouble_to_byte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* src_value now holds a byte copy of the current source element. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* dst_value holds the converted element; the memmove below writes it out to dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* This guard is always true: !1 is 0, so the && term is 0 and the outer ! yields 1. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Active branch (the condition above is constant 0): use the real source and destination C types. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch (both conditions above are constant 0): _CONVERT_FN ends up as a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned loop stepping by element size: casts dimensions[0] items from args[0] (npy_longdouble) into args[1] (npy_byte); returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides[] is not read in this variant (#if !1 is 0); the loop steps by sizeof of each element type. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No src_value/dst_value temporaries are declared in this variant: every condition above is 0. */
+#if 1
+   /* sanity check: unless N == 0, both pointers must pass npy_is_aligned for their element type */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longdouble_to_byte\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing is staged here (inner #if 0); the element is read directly by the cast below. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* The cast statement already stored through dst; the block below expands to nothing. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance each pointer by the size of its own element type. */
+#if 1
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* This guard reduces to !NPY_USE_UNALIGNED_ACCESS (!0 is 1), so the loop is skipped when that macro is nonzero. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+/* Active branch (the condition above is constant 0): use the real source and destination C types. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch (both conditions above are constant 0): _CONVERT_FN ends up as a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Element-size-step loop using memmove temporaries: casts dimensions[0] items from args[0] (npy_longdouble) into args[1] (npy_byte); returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides[] is not read in this variant (#if !1 is 0); the loop steps by sizeof of each element type. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* Scalar src_value/dst_value temporaries are declared (#elif !0 is 1); the asserts below are compiled out. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longdouble_to_byte\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* src_value now holds a byte copy of the current source element. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* dst_value holds the converted element; the memmove below writes it out to dst. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance each pointer by the size of its own element type. */
+#if 1
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* This guard is always true: !1 is 0, so the && term is 0 and the outer ! yields 1. */
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+/* Active branch (the condition above is constant 0): use the real source and destination C types. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active branch (both conditions above are constant 0): _CONVERT_FN ends up as a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, strided loop: casts dimensions[0] items from args[0] (npy_longdouble) into args[1] (npy_short); returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* src and dst are char pointers, so the strides are applied as byte offsets in the loop. */
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No src_value/dst_value temporaries are declared in this variant: every condition above is 0. */
+#if 1
+   /* sanity check: unless N == 0, both pointers must pass npy_is_aligned for their element type */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longdouble_to_short\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+        /* Nothing is staged here (inner #if 0); the element is read directly by the cast below. */
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* The cast statement already stored through dst; the block below expands to nothing. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instantiation: the plain #else definitions apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble /* element type read from src */
+#define _TYPE2 npy_short /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* live branch: both conditions above are 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* live definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from npy_longdouble (args[0]) to npy_short (args[1]); strided, copies through memmove temporaries. Returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !0 /* strided variant: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* live: scalar temporary that receives each source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* live: scalar temporary that holds each converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longdouble_to_short\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* live: byte-wise load of one element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* live outer branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* live: convert temporary to temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* live: byte-wise store of dst_value to the output */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_longdouble);
+#else /* live: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* end of the NPY_USE_UNALIGNED_ACCESS guard opened above */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* aligned variant: this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instantiation: the plain #else definitions apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble /* element type read from src */
+#define _TYPE2 npy_short /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* live branch: both conditions above are 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* live definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] packed elements from npy_longdouble (args[0]) to npy_short (args[1]) through typed pointers; asserts alignment. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !1 /* contiguous variant: strides[] is not read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* dead: this variant declares no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* dead: this variant declares no destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* alignment asserts are compiled in for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longdouble_to_short\n");*/
+
+    while (N--) {
+#if 1 /* no separate load step is emitted; the cast below reads src directly */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* live outer branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* live: typed load from src, cast, typed store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* no separate store step is emitted; the cast above wrote dst directly */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* live: packed layout, step each cursor by its element size */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* end of the NPY_USE_UNALIGNED_ACCESS guard opened above */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instantiation: the plain #else definitions apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble /* element type read from src */
+#define _TYPE2 npy_short /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* live branch: both conditions above are 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* live definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] packed elements from npy_longdouble (args[0]) to npy_short (args[1]); copies through memmove temporaries. Returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !1 /* contiguous variant: strides[] is not read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* live: scalar temporary that receives each source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* live: scalar temporary that holds each converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longdouble_to_short\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* live: byte-wise load of one element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* live outer branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* live: convert temporary to temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* live: byte-wise store of dst_value to the output */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* live: packed layout, step each cursor by its element size */
+        dst += sizeof(npy_short);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* end of the NPY_USE_UNALIGNED_ACCESS guard opened above */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* aligned variant: this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instantiation: the plain #else definitions apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble /* element type read from src */
+#define _TYPE2 npy_int /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* live branch: both conditions above are 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* live definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from npy_longdouble (args[0]) to npy_int (args[1]); strided, typed-pointer access, asserts alignment. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !0 /* strided variant: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* dead: this variant declares no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* dead: this variant declares no destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* alignment asserts are compiled in for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longdouble_to_int\n");*/
+
+    while (N--) {
+#if 1 /* no separate load step is emitted; the cast below reads src directly */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* live outer branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* live: typed load from src, cast, typed store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* no separate store step is emitted; the cast above wrote dst directly */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_longdouble);
+#else /* live: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* end of the NPY_USE_UNALIGNED_ACCESS guard opened above */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instantiation: the plain #else definitions apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble /* element type read from src */
+#define _TYPE2 npy_int /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* live branch: both conditions above are 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* live definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from npy_longdouble (args[0]) to npy_int (args[1]); strided, copies through memmove temporaries. Returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !0 /* strided variant: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* live: scalar temporary that receives each source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* live: scalar temporary that holds each converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longdouble_to_int\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* live: byte-wise load of one element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* live outer branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* live: convert temporary to temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* live: byte-wise store of dst_value to the output */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_longdouble);
+#else /* live: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* end of the NPY_USE_UNALIGNED_ACCESS guard opened above */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* aligned variant: this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instantiation: the plain #else definitions apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble /* element type read from src */
+#define _TYPE2 npy_int /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* live branch: both conditions above are 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* live definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] packed elements from npy_longdouble (args[0]) to npy_int (args[1]) through typed pointers; asserts alignment. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !1 /* contiguous variant: strides[] is not read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* dead: this variant declares no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* dead: this variant declares no destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* alignment asserts are compiled in for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longdouble_to_int\n");*/
+
+    while (N--) {
+#if 1 /* no separate load step is emitted; the cast below reads src directly */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* live outer branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* live: typed load from src, cast, typed store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* no separate store step is emitted; the cast above wrote dst directly */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* live: packed layout, step each cursor by its element size */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* end of the NPY_USE_UNALIGNED_ACCESS guard opened above */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instantiation: the plain #else definitions apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble /* element type read from src */
+#define _TYPE2 npy_int /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* live branch: both conditions above are 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* live definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] packed elements from npy_longdouble (args[0]) to npy_int (args[1]); copies through memmove temporaries. Returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !1 /* contiguous variant: strides[] is not read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* live: scalar temporary that receives each source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* live: scalar temporary that holds each converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longdouble_to_int\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* live: byte-wise load of one element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* live outer branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* live: convert temporary to temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* live: byte-wise store of dst_value to the output */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* live: packed layout, step each cursor by its element size */
+        dst += sizeof(npy_int);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* end of the NPY_USE_UNALIGNED_ACCESS guard opened above */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* aligned variant: this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instantiation: the plain #else definitions apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble /* element type read from src */
+#define _TYPE2 npy_long /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* live branch: both conditions above are 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* live definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from npy_longdouble (args[0]) to npy_long (args[1]); strided, typed-pointer access, asserts alignment. Returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !0 /* strided variant: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1 /* dead: this variant declares no source temporary */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* dead: this variant declares no destination temporary */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* alignment asserts are compiled in for this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longdouble_to_long\n");*/
+
+    while (N--) {
+#if 1 /* no separate load step is emitted; the cast below reads src directly */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* live outer branch */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else /* live: typed load from src, cast, typed store to dst */
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* no separate store step is emitted; the cast above wrote dst directly */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_longdouble);
+#else /* live: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* end of the NPY_USE_UNALIGNED_ACCESS guard opened above */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false in this instantiation: the plain #else definitions apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble /* element type read from src */
+#define _TYPE2 npy_long /* element type written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* live branch: both conditions above are 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* live definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Casts dimensions[0] elements from npy_longdouble (args[0]) to npy_long (args[1]); strided, copies through memmove temporaries. Returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1]; /* byte cursors into input and output */
+#if !0 /* strided variant: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0 /* live: scalar temporary that receives each source element */
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* live: scalar temporary that holds each converted element */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longdouble_to_long\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* live: byte-wise load of one element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else /* live outer branch */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0 /* live: convert temporary to temporary */
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* live: byte-wise store of dst_value to the output */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_longdouble);
+#else /* live: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* the only return; this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* end of the NPY_USE_UNALIGNED_ACCESS guard opened above */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard: !1 is 0, so the condition is always true and this aligned variant is always built. */
+/* For half types, don't use actual double/float types in conversion (guard is 0 || 0 here, so the #else arm applies) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] aligned, contiguous npy_longdouble items at args[0] to npy_long at args[1]; strides unused; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the loop dereferences src/dst as typed pointers, so both must be aligned (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_contig_cast_longdouble_to_long\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the final #else arm (direct typed load from src, store into dst) is compiled here */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_long);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: !0 is 1, so this variant is built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (guard is 0 || 0 here, so the #else arm applies) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] contiguous npy_longdouble items at args[0] to npy_long at args[1] via memmove temporaries; strides unused; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant copies through memmove instead of typed pointer access) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_contig_cast_longdouble_to_long\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the dst_value = _CONVERT_FN(src_value) arm is compiled here */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_long);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard: !1 is 0, so the condition is always true and this aligned variant is always built. */
+/* For half types, don't use actual double/float types in conversion (guard is 0 || 0 here, so the #else arm applies) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] aligned npy_longdouble items at args[0] to npy_longlong at args[1], stepping by strides[0]/strides[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the loop dereferences src/dst as typed pointers, so both must be aligned (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_cast_longdouble_to_longlong\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the final #else arm (direct typed load from src, store into dst) is compiled here */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: !0 is 1, so this variant is built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (guard is 0 || 0 here, so the #else arm applies) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] npy_longdouble items at args[0] to npy_longlong at args[1] via memmove temporaries, stepping by strides[0]/strides[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant copies through memmove instead of typed pointer access) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_cast_longdouble_to_longlong\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the dst_value = _CONVERT_FN(src_value) arm is compiled here */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard: !1 is 0, so the condition is always true and this aligned variant is always built. */
+/* For half types, don't use actual double/float types in conversion (guard is 0 || 0 here, so the #else arm applies) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] aligned, contiguous npy_longdouble items at args[0] to npy_longlong at args[1]; strides unused; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the loop dereferences src/dst as typed pointers, so both must be aligned (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_contig_cast_longdouble_to_longlong\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the final #else arm (direct typed load from src, store into dst) is compiled here */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: !0 is 1, so this variant is built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (guard is 0 || 0 here, so the #else arm applies) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longlong
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast dimensions[0] contiguous npy_longdouble items at args[0] to npy_longlong at args[1] via memmove temporaries; strides unused; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant copies through memmove instead of typed pointer access) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_contig_cast_longdouble_to_longlong\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the dst_value = _CONVERT_FN(src_value) arm is compiled here */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard: !1 is 0, so the condition is always true and this aligned variant is always built. */
+/* For half types, don't use actual double/float types in conversion (guard is 0 || 1 here; both inner chains end in #else) */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the #elif 1 chain: npy_float_to_half((float)x)) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Convert dimensions[0] aligned npy_longdouble items at args[0] to npy_half at args[1] (through float), stepping by strides[0]/strides[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the loop dereferences src/dst as typed pointers, so both must be aligned (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_cast_longdouble_to_half\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the final #else arm (direct typed load from src, store into dst) is compiled here */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard: !0 is 1, so this variant is built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (guard is 0 || 1 here; both inner chains end in #else) */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the #elif 1 chain: npy_float_to_half((float)x)) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Convert dimensions[0] npy_longdouble items at args[0] to npy_half at args[1] (through float) via memmove temporaries, stepping by strides[0]/strides[1]; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant copies through memmove instead of typed pointer access) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_cast_longdouble_to_half\n"); */
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the dst_value = _CONVERT_FN(src_value) arm is compiled here */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard: !1 is 0, so the condition is always true and this aligned variant is always built. */
+/* For half types, don't use actual double/float types in conversion (guard is 0 || 1 here; both inner chains end in #else) */
+#if 0 || 1
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the #elif 1 chain: npy_float_to_half((float)x)) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Convert dimensions[0] aligned, contiguous npy_longdouble items at args[0] to npy_half at args[1] (through float); strides unused; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: the loop dereferences src/dst as typed pointers, so both must be aligned (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /* disabled debug trace: printf("_aligned_contig_cast_longdouble_to_half\n"); */
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only the final #else arm (direct typed load from src, store into dst) is compiled here */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_half);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous cast loop without alignment checks: npy_longdouble -> npy_half.
+ *
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant is
+ * only compiled when that setting is zero.
+ *
+ * Half values are produced by narrowing to float first and then calling
+ * npy_float_to_half().
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_half
+#define _CONVERT_FN(x) npy_float_to_half((float)x)
+
+/*
+ * Converts dimensions[0] elements from args[0] into args[1].
+ * Both buffers are walked with fixed element-size steps; `strides` is not
+ * read. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_half(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N;
+
+    for (N = dimensions[0]; N != 0; --N) {
+        _TYPE1 in_val;
+        _TYPE2 out_val;
+
+        /* go through locals with memmove: no alignment is asserted here */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        src += sizeof(npy_longdouble);
+        dst += sizeof(npy_half);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, strided cast loop: npy_longdouble -> npy_float.
+ *
+ * `!1` is 0, so the guard above is always true and this variant is always
+ * compiled. The conversion is a plain C cast to the destination type.
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements from args[0] into args[1], advancing the
+ * source by strides[0] bytes and the destination by strides[1] bytes per
+ * element. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* typed load/store straight from the buffers, no temporaries */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+        src += src_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided cast loop without alignment checks: npy_longdouble -> npy_float.
+ *
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant is
+ * only compiled when that setting is zero. The conversion is a plain C cast
+ * to the destination type.
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements from args[0] into args[1], advancing the
+ * source by strides[0] bytes and the destination by strides[1] bytes per
+ * element. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    npy_intp N;
+
+    for (N = dimensions[0]; N != 0; --N) {
+        _TYPE1 in_val;
+        _TYPE2 out_val;
+
+        /* go through locals with memmove: no alignment is asserted here */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        src += src_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, contiguous cast loop: npy_longdouble -> npy_float.
+ *
+ * `!1` is 0, so the guard above is always true and this variant is always
+ * compiled. The conversion is a plain C cast to the destination type.
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements from args[0] into args[1].
+ * Both buffers are walked with fixed element-size steps; `strides` is not
+ * read. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* typed load/store straight from the buffers, no temporaries */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+        src += sizeof(npy_longdouble);
+        dst += sizeof(npy_float);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous cast loop without alignment checks: npy_longdouble -> npy_float.
+ *
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant is
+ * only compiled when that setting is zero. The conversion is a plain C cast
+ * to the destination type.
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_float
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements from args[0] into args[1].
+ * Both buffers are walked with fixed element-size steps; `strides` is not
+ * read. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N;
+
+    for (N = dimensions[0]; N != 0; --N) {
+        _TYPE1 in_val;
+        _TYPE2 out_val;
+
+        /* go through locals with memmove: no alignment is asserted here */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        src += sizeof(npy_longdouble);
+        dst += sizeof(npy_float);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, strided cast loop: npy_longdouble -> npy_double.
+ *
+ * `!1` is 0, so the guard above is always true and this variant is always
+ * compiled. The conversion is a plain C cast to the destination type.
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements from args[0] into args[1], advancing the
+ * source by strides[0] bytes and the destination by strides[1] bytes per
+ * element. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* typed load/store straight from the buffers, no temporaries */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+        src += src_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided cast loop without alignment checks: npy_longdouble -> npy_double.
+ *
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant is
+ * only compiled when that setting is zero. The conversion is a plain C cast
+ * to the destination type.
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements from args[0] into args[1], advancing the
+ * source by strides[0] bytes and the destination by strides[1] bytes per
+ * element. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    npy_intp N;
+
+    for (N = dimensions[0]; N != 0; --N) {
+        _TYPE1 in_val;
+        _TYPE2 out_val;
+
+        /* go through locals with memmove: no alignment is asserted here */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        src += src_step;
+        dst += dst_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, contiguous cast loop: npy_longdouble -> npy_double.
+ *
+ * `!1` is 0, so the guard above is always true and this variant is always
+ * compiled. The conversion is a plain C cast to the destination type.
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements from args[0] into args[1].
+ * Both buffers are walked with fixed element-size steps; `strides` is not
+ * read. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* typed load/store straight from the buffers, no temporaries */
+        *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+        src += sizeof(npy_longdouble);
+        dst += sizeof(npy_double);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous cast loop without alignment checks: npy_longdouble -> npy_double.
+ *
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant is
+ * only compiled when that setting is zero. The conversion is a plain C cast
+ * to the destination type.
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_double
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Converts dimensions[0] elements from args[0] into args[1].
+ * Both buffers are walked with fixed element-size steps; `strides` is not
+ * read. Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N;
+
+    for (N = dimensions[0]; N != 0; --N) {
+        _TYPE1 in_val;
+        _TYPE2 out_val;
+
+        /* go through locals with memmove: no alignment is asserted here */
+        memmove(&in_val, src, sizeof(in_val));
+        out_val = _CONVERT_FN(in_val);
+        memmove(dst, &out_val, sizeof(out_val));
+
+        src += sizeof(npy_longdouble);
+        dst += sizeof(npy_double);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "!1" is 0: this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion.
+ * Both flags in the "#if" below are 0 in this expansion, so the plain
+ * definitions in its "#else" branch are the ones in effect.
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function.
+ * Both outer selectors are 0 here, so _CONVERT_FN is the plain C cast
+ * ((_TYPE2)x) taken from the last "#else" branch.
+ */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Strided longdouble -> longdouble cast loop, aligned variant.
+ *
+ * Processes dimensions[0] elements, reading from args[0] and writing to
+ * args[1]; both pointers are char pointers and advance by the byte
+ * strides strides[0] (source) and strides[1] (destination) after every
+ * element.  Each element is loaded and stored directly through typed
+ * pointers, so the asserts below require both pointers to be aligned
+ * whenever N != 0.  No local temporaries are declared in this variant.
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longdouble_to_longdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast.
+ * With the constants in this expansion only the last "#  else" line of
+ * the outer "#else" branch is compiled: a direct typed load from src,
+ * _CONVERT_FN, and a typed store to dst.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* true only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion.
+ * Both flags in the "#if" below are 0 in this expansion, so the plain
+ * definitions in its "#else" branch are the ones in effect.
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function.
+ * Both outer selectors are 0 here, so _CONVERT_FN is the plain C cast
+ * ((_TYPE2)x) taken from the last "#else" branch.
+ */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Strided longdouble -> longdouble cast loop, variant without alignment
+ * asserts.
+ *
+ * Processes dimensions[0] elements from args[0] into args[1]; both are
+ * char pointers advanced by the byte strides strides[0] (source) and
+ * strides[1] (destination) after every element.  src and dst are never
+ * dereferenced as typed pointers here: each element is memmove'd into a
+ * local npy_longdouble, cast, and memmove'd out again.
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longdouble_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast.
+ * In this expansion only "dst_value = _CONVERT_FN(src_value);" is
+ * compiled; the memmove calls around it move the value between the
+ * buffers and the local temporaries.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "!1" is 0: this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion.
+ * Both flags in the "#if" below are 0 in this expansion, so the plain
+ * definitions in its "#else" branch are the ones in effect.
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function.
+ * Both outer selectors are 0 here, so _CONVERT_FN is the plain C cast
+ * ((_TYPE2)x) taken from the last "#else" branch.
+ */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Contiguous longdouble -> longdouble cast loop, aligned variant.
+ *
+ * Processes dimensions[0] elements from args[0] into args[1].  Both
+ * pointers advance by sizeof(npy_longdouble) per element; the `strides`
+ * argument is accepted but never read in this variant.  Each element is
+ * loaded and stored directly through typed pointers, so the asserts
+ * below require both pointers to be aligned whenever N != 0.
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longdouble_to_longdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast.
+ * With the constants in this expansion only the last "#  else" line of
+ * the outer "#else" branch is compiled: a direct typed load from src,
+ * _CONVERT_FN, and a typed store to dst.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* true only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion.
+ * Both flags in the "#if" below are 0 in this expansion, so the plain
+ * definitions in its "#else" branch are the ones in effect.
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function.
+ * Both outer selectors are 0 here, so _CONVERT_FN is the plain C cast
+ * ((_TYPE2)x) taken from the last "#else" branch.
+ */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Contiguous longdouble -> longdouble cast loop, variant without
+ * alignment asserts.
+ *
+ * Processes dimensions[0] elements from args[0] into args[1].  Both
+ * pointers advance by sizeof(npy_longdouble) per element; the `strides`
+ * argument is accepted but never read in this variant.  src and dst are
+ * never dereferenced as typed pointers here: each element is memmove'd
+ * into a local npy_longdouble, cast, and memmove'd out again.
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longdouble_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast.
+ * In this expansion only "dst_value = _CONVERT_FN(src_value);" is
+ * compiled; the memmove calls around it move the value between the
+ * buffers and the local temporaries.
+ */
+#if 0
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "!1" is 0: this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion.
+ * Both flags in the "#if" below are 0 in this expansion, so the plain
+ * definitions in its "#else" branch (npy_longdouble / npy_float) are
+ * the ones in effect.
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function.
+ * Both outer selectors are 0 here, so _CONVERT_FN is the plain C cast
+ * ((_TYPE2)x) taken from the last "#else" branch.
+ */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Strided longdouble -> cfloat cast loop, aligned variant.
+ *
+ * Processes dimensions[0] elements from args[0] into args[1]; both are
+ * char pointers advanced by the byte strides strides[0] (source) and
+ * strides[1] (destination) after every element.  For every element the
+ * source npy_longdouble is read through a typed pointer, cast to
+ * npy_float and written as the first of two consecutive npy_float
+ * values at dst; the second one is set to 0 (presumably the real and
+ * imaginary parts of an npy_cfloat -- the struct layout is not visible
+ * here).  Note that the dst assert checks npy_float alignment, not
+ * npy_cfloat alignment.
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longdouble_to_cfloat\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast.
+ * In this expansion the compiled statements are the typed load of src
+ * into dst_value[0] (through _CONVERT_FN) and "dst_value[1] = 0;".
+ */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* true only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion.
+ * Both flags in the "#if" below are 0 in this expansion, so the plain
+ * definitions in its "#else" branch (npy_longdouble / npy_float) are
+ * the ones in effect.
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function.
+ * Both outer selectors are 0 here, so _CONVERT_FN is the plain C cast
+ * ((_TYPE2)x) taken from the last "#else" branch.
+ */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Strided longdouble -> cfloat cast loop, variant without alignment
+ * asserts.
+ *
+ * Processes dimensions[0] elements from args[0] into args[1]; both are
+ * char pointers advanced by the byte strides strides[0] (source) and
+ * strides[1] (destination) after every element.  src and dst are never
+ * dereferenced as typed pointers here: the source is memmove'd into a
+ * local npy_longdouble, cast to npy_float into dst_value[0], with
+ * dst_value[1] set to 0, and the two-float pair is memmove'd to dst
+ * (presumably the real and imaginary parts of an npy_cfloat -- the
+ * struct layout is not visible here).
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longdouble_to_cfloat\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast.
+ * In this expansion the compiled statements are
+ * "dst_value[0] = _CONVERT_FN(src_value);" and "dst_value[1] = 0;";
+ * the memmove calls around them move data between the buffers and the
+ * local temporaries.
+ */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "!1" is 0: this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion.
+ * Both flags in the "#if" below are 0 in this expansion, so the plain
+ * definitions in its "#else" branch (npy_longdouble / npy_float) are
+ * the ones in effect.
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function.
+ * Both outer selectors are 0 here, so _CONVERT_FN is the plain C cast
+ * ((_TYPE2)x) taken from the last "#else" branch.
+ */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Contiguous longdouble -> cfloat cast loop, aligned variant.
+ *
+ * Processes dimensions[0] elements from args[0] into args[1].  src
+ * advances by sizeof(npy_longdouble) and dst by sizeof(npy_cfloat) per
+ * element; the `strides` argument is accepted but never read in this
+ * variant.  For every element the source npy_longdouble is read through
+ * a typed pointer, cast to npy_float and written as the first of two
+ * consecutive npy_float values at dst; the second one is set to 0
+ * (presumably the real and imaginary parts of an npy_cfloat -- the
+ * struct layout is not visible here).  Note that the dst assert checks
+ * npy_float alignment, not npy_cfloat alignment.
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longdouble_to_cfloat\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast.
+ * In this expansion the compiled statements are the typed load of src
+ * into dst_value[0] (through _CONVERT_FN) and "dst_value[1] = 0;".
+ */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* true only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion.
+ * Both flags in the "#if" below are 0 in this expansion, so the plain
+ * definitions in its "#else" branch (npy_longdouble / npy_float) are
+ * the ones in effect.
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function.
+ * Both outer selectors are 0 here, so _CONVERT_FN is the plain C cast
+ * ((_TYPE2)x) taken from the last "#else" branch.
+ */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Contiguous longdouble -> cfloat cast loop, variant without alignment
+ * asserts.
+ *
+ * Processes dimensions[0] elements from args[0] into args[1].  src
+ * advances by sizeof(npy_longdouble) and dst by sizeof(npy_cfloat) per
+ * element; the `strides` argument is accepted but never read in this
+ * variant.  src and dst are never dereferenced as typed pointers here:
+ * the source is memmove'd into a local npy_longdouble, cast to
+ * npy_float into dst_value[0], with dst_value[1] set to 0, and the
+ * two-float pair is memmove'd to dst (presumably the real and imaginary
+ * parts of an npy_cfloat -- the struct layout is not visible here).
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longdouble_to_cfloat\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast.
+ * In this expansion the compiled statements are
+ * "dst_value[0] = _CONVERT_FN(src_value);" and "dst_value[1] = 0;";
+ * the memmove calls around them move data between the buffers and the
+ * local temporaries.
+ */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "!1" is 0: this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion.
+ * Both flags in the "#if" below are 0 in this expansion, so the plain
+ * definitions in its "#else" branch (npy_longdouble / npy_double) are
+ * the ones in effect.
+ */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function.
+ * Both outer selectors are 0 here, so _CONVERT_FN is the plain C cast
+ * ((_TYPE2)x) taken from the last "#else" branch.
+ */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/*
+ * Strided longdouble -> cdouble cast loop, aligned variant.
+ *
+ * Processes dimensions[0] elements from args[0] into args[1]; both are
+ * char pointers advanced by the byte strides strides[0] (source) and
+ * strides[1] (destination) after every element.  For every element the
+ * source npy_longdouble is read through a typed pointer, cast to
+ * npy_double and written as the first of two consecutive npy_double
+ * values at dst; the second one is set to 0 (presumably the real and
+ * imaginary parts of an npy_cdouble -- the struct layout is not visible
+ * here).  Note that the dst assert checks npy_double alignment, not
+ * npy_cdouble alignment.
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longdouble_to_cdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast.
+ * In this expansion the compiled statements are the typed load of src
+ * into dst_value[0] (through _CONVERT_FN) and "dst_value[1] = 0;".
+ */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (the guard below is `0 || 0` here, so the #else arm picks the types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function (active here: the last #else arm, a plain cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided, alignment-agnostic cast loop: for each of dimensions[0] elements, copies one npy_longdouble from args[0], writes the pair (value cast to npy_double, 0) to args[1] (presumably real part and zero imaginary part, judging by the function name), then steps by strides[0]/strides[1] bytes. `context` is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: per-element byte steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this unaligned variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longdouble_to_cdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy of the source element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (active arm: dst_value[0] = converted src_value, dst_value[1] = 0) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy of both output components to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_longdouble);
+#else /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* aligned variant: `!1` is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion (the guard below is `0 || 0` here, so the #else arm picks the types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function (active here: the last #else arm, a plain cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, contiguous cast loop: for each of dimensions[0] elements, reads one npy_longdouble directly from args[0] and stores the pair (value cast to npy_double, 0) to args[1] (presumably real part and zero imaginary part, judging by the function name), stepping by sizeof(npy_longdouble)/sizeof(npy_cdouble). `context` and `strides` are not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous variant: stride locals are compiled out */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check (compiled in for this aligned variant): both pointers must be aligned for their element types unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longdouble_to_cdouble\n");*/
+
+    while (N--) {
+#if 1 /* aligned variant with scalar source: no staging load, src is dereferenced in the cast below */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (active arm: dst_value[0] = converted *(_TYPE1 *)src, dst_value[1] = 0) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* active: typed stores of both output components */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: contiguous layout, advance by the element sizes */
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (the guard below is `0 || 0` here, so the #else arm picks the types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function (active here: the last #else arm, a plain cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous, alignment-agnostic cast loop: for each of dimensions[0] elements, copies one npy_longdouble from args[0], writes the pair (value cast to npy_double, 0) to args[1] (presumably real part and zero imaginary part, judging by the function name), stepping by sizeof(npy_longdouble)/sizeof(npy_cdouble). `context` and `strides` are not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous variant: stride locals are compiled out */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this unaligned variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longdouble_to_cdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy of the source element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (active arm: dst_value[0] = converted src_value, dst_value[1] = 0) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy of both output components to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: contiguous layout, advance by the element sizes */
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* aligned variant: `!1` is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion (the guard below is `0 || 0` here, so the #else arm picks the types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function (active here: the last #else arm, a plain cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, strided cast loop: for each of dimensions[0] elements, reads one npy_longdouble directly from args[0] and stores the pair (value, 0) of npy_longdouble to args[1] (presumably real part and zero imaginary part, judging by the function name), then steps by strides[0]/strides[1] bytes. `context` is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_longdouble_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: per-element byte steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check (compiled in for this aligned variant): both pointers must be aligned for their element types unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_longdouble_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1 /* aligned variant with scalar source: no staging load, src is dereferenced in the cast below */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (active arm: dst_value[0] = converted *(_TYPE1 *)src, dst_value[1] = 0) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* active: typed stores of both output components */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_longdouble);
+#else /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (the guard below is `0 || 0` here, so the #else arm picks the types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function (active here: the last #else arm, a plain cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided, alignment-agnostic cast loop: for each of dimensions[0] elements, copies one npy_longdouble from args[0], writes the pair (value, 0) of npy_longdouble to args[1] (presumably real part and zero imaginary part, judging by the function name), then steps by strides[0]/strides[1] bytes. `context` is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_longdouble_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: per-element byte steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this unaligned variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_longdouble_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy of the source element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (active arm: dst_value[0] = converted src_value, dst_value[1] = 0) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy of both output components to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_longdouble);
+#else /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* aligned variant: `!1` is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion (the guard below is `0 || 0` here, so the #else arm picks the types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function (active here: the last #else arm, a plain cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, contiguous cast loop: for each of dimensions[0] elements, reads one npy_longdouble directly from args[0] and stores the pair (value, 0) of npy_longdouble to args[1] (presumably real part and zero imaginary part, judging by the function name), stepping by sizeof(npy_longdouble)/sizeof(npy_clongdouble). `context` and `strides` are not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_longdouble_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous variant: stride locals are compiled out */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check (compiled in for this aligned variant): both pointers must be aligned for their element types unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_longdouble_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1 /* aligned variant with scalar source: no staging load, src is dereferenced in the cast below */
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (active arm: dst_value[0] = converted *(_TYPE1 *)src, dst_value[1] = 0) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* active: typed stores of both output components */
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: contiguous layout, advance by the element sizes */
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (the guard below is `0 || 0` here, so the #else arm picks the types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function (active here: the last #else arm, a plain cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous, alignment-agnostic cast loop: for each of dimensions[0] elements, copies one npy_longdouble from args[0], writes the pair (value, 0) of npy_longdouble to args[1] (presumably real part and zero imaginary part, judging by the function name), stepping by sizeof(npy_longdouble)/sizeof(npy_clongdouble). `context` and `strides` are not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_longdouble_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous variant: stride locals are compiled out */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 0
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this unaligned variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_longdouble_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 0
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy of the source element into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (active arm: dst_value[0] = converted src_value, dst_value[1] = 0) */
+#if 0
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy of both output components to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: contiguous layout, advance by the element sizes */
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_longdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* aligned variant: `!1` is 0, so this condition is always true */
+
+/* For half types, don't use actual double/float types in conversion (the guard below is `0 || 0` here, so the #else arm picks the types) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function (active here: the last #else arm, a `!= 0` test cast to npy_bool) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned, strided cast loop: for each of dimensions[0] elements, loads two npy_float components from args[0] (presumably the real and imaginary parts, judging by the function name) and stores to args[1] an npy_bool that is 1 if either component is != 0 and 0 otherwise, then steps by strides[0]/strides[1] bytes. `context` is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: per-element byte steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check (compiled in for this aligned variant): both pointers must be aligned for their element types unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cfloat_to_bool\n");*/
+
+    while (N--) {
+#if 1 /* active: typed loads of both source components */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (active arm: *(_TYPE2 *)dst = logical OR of the two converted components, written straight to dst) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* nothing is emitted here: the cast above already wrote the result to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_cfloat);
+#else /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* unaligned variant: built only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (the guard below is `0 || 0` here, so the #else arm picks the types) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function (active here: the last #else arm, a `!= 0` test cast to npy_bool) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided, alignment-agnostic cast loop: for each of dimensions[0] elements, copies two npy_float components from args[0] (presumably the real and imaginary parts, judging by the function name) and copies to args[1] an npy_bool that is 1 if either component is != 0 and 0 otherwise, then steps by strides[0]/strides[1] bytes. `context` is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: per-element byte steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this unaligned variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cfloat_to_bool\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: byte-wise copy of both source components into src_value */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (active arm: dst_value = logical OR of the two converted components) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: byte-wise copy of the single result to dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_cfloat);
+#else /* active: advance by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 evaluates to 0, so this aligned variant is always compiled */
+
+/* Aligned, contiguous loop casting npy_cfloat to npy_bool. Half casts would use the raw-bit types below; that branch is disabled here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each of the two values read per source element */
+#define _TYPE2 npy_bool  /* type of the single value written per destination element */
+
+#endif
+
+/* Select _CONVERT_FN: both half-precision branches are disabled, leaving the x != 0 truth test used for a bool destination */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is the source buffer, args[1] the destination */
+#if !1  /* contiguous variant: strides are never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for their element types unless there is nothing to convert */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cfloat_to_bool\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: true when either float of the pair is nonzero, written straight through dst */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to copy out: the cast above already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_bool);  /* contiguous: advance by one element */
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS: this variant is compiled only when that macro evaluates to 0 */
+
+/* Unaligned-safe, contiguous loop casting npy_cfloat to npy_bool. Half casts would use the raw-bit types below; that branch is disabled here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each of the two values read per source element */
+#define _TYPE2 npy_bool  /* type of the single value written per destination element */
+
+#endif
+
+/* Select _CONVERT_FN: both half-precision branches are disabled, leaving the x != 0 truth test used for a bool destination */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is the source buffer, args[1] the destination */
+#if !1  /* contiguous variant: strides are never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant does not assume alignment) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cfloat_to_bool\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy of both floats; src need not be aligned */
+#endif
+
+/* Do the cast: true when either float of the pair is nonzero, kept in dst_value until the copy-out below */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy; dst need not be aligned */
+#endif
+
+#if 1
+        dst += sizeof(npy_bool);  /* contiguous: advance by one element */
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 evaluates to 0, so this aligned variant is always compiled */
+
+/* Aligned, strided loop casting npy_cfloat to npy_ubyte. Half casts would use the raw-bit types below; that branch is disabled here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each of the two values read per source element */
+#define _TYPE2 npy_ubyte  /* type of the single value written per destination element */
+
+#endif
+
+/* Select _CONVERT_FN: the half-precision and bool branches are all disabled, leaving a plain C cast to _TYPE2. NOTE(review): C leaves a float-to-integer cast undefined when the value does not fit the target type. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is the source buffer, args[1] the destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps between consecutive elements */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for their element types unless there is nothing to convert */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cfloat_to_ubyte\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted and written straight through dst; src_value[1] is ignored (presumably the imaginary part; confirm against the npy_cfloat layout) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to copy out: the cast above already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;  /* strided: advance by the caller-supplied byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS: this variant is compiled only when that macro evaluates to 0 */
+
+/* Unaligned-safe, strided loop casting npy_cfloat to npy_ubyte. Half casts would use the raw-bit types below; that branch is disabled here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each of the two values read per source element */
+#define _TYPE2 npy_ubyte  /* type of the single value written per destination element */
+
+#endif
+
+/* Select _CONVERT_FN: the half-precision and bool branches are all disabled, leaving a plain C cast to _TYPE2. NOTE(review): C leaves a float-to-integer cast undefined when the value does not fit the target type. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is the source buffer, args[1] the destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps between consecutive elements */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant does not assume alignment) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cfloat_to_ubyte\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy of both floats; src need not be aligned */
+#endif
+
+/* Do the cast: only src_value[0] is converted into dst_value; src_value[1] is ignored (presumably the imaginary part; confirm against the npy_cfloat layout) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy; dst need not be aligned */
+#endif
+
+#if 0
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;  /* strided: advance by the caller-supplied byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 evaluates to 0, so this aligned variant is always compiled */
+
+/* Aligned, contiguous loop casting npy_cfloat to npy_ubyte. Half casts would use the raw-bit types below; that branch is disabled here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each of the two values read per source element */
+#define _TYPE2 npy_ubyte  /* type of the single value written per destination element */
+
+#endif
+
+/* Select _CONVERT_FN: the half-precision and bool branches are all disabled, leaving a plain C cast to _TYPE2. NOTE(review): C leaves a float-to-integer cast undefined when the value does not fit the target type. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is the source buffer, args[1] the destination */
+#if !1  /* contiguous variant: strides are never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for their element types unless there is nothing to convert */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cfloat_to_ubyte\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted and written straight through dst; src_value[1] is ignored (presumably the imaginary part; confirm against the npy_cfloat layout) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to copy out: the cast above already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ubyte);  /* contiguous: advance by one element */
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS: this variant is compiled only when that macro evaluates to 0 */
+
+/* Unaligned-safe, contiguous loop casting npy_cfloat to npy_ubyte. Half casts would use the raw-bit types below; that branch is disabled here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each of the two values read per source element */
+#define _TYPE2 npy_ubyte  /* type of the single value written per destination element */
+
+#endif
+
+/* Select _CONVERT_FN: the half-precision and bool branches are all disabled, leaving a plain C cast to _TYPE2. NOTE(review): C leaves a float-to-integer cast undefined when the value does not fit the target type. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is the source buffer, args[1] the destination */
+#if !1  /* contiguous variant: strides are never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant does not assume alignment) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cfloat_to_ubyte\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy of both floats; src need not be aligned */
+#endif
+
+/* Do the cast: only src_value[0] is converted into dst_value; src_value[1] is ignored (presumably the imaginary part; confirm against the npy_cfloat layout) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy; dst need not be aligned */
+#endif
+
+#if 1
+        dst += sizeof(npy_ubyte);  /* contiguous: advance by one element */
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 evaluates to 0, so this aligned variant is always compiled */
+
+/* Aligned, strided loop casting npy_cfloat to npy_ushort. Half casts would use the raw-bit types below; that branch is disabled here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each of the two values read per source element */
+#define _TYPE2 npy_ushort  /* type of the single value written per destination element */
+
+#endif
+
+/* Select _CONVERT_FN: the half-precision and bool branches are all disabled, leaving a plain C cast to _TYPE2. NOTE(review): C leaves a float-to-integer cast undefined when the value does not fit the target type. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is the source buffer, args[1] the destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps between consecutive elements */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for their element types unless there is nothing to convert */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cfloat_to_ushort\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted and written straight through dst; src_value[1] is ignored (presumably the imaginary part; confirm against the npy_cfloat layout) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to copy out: the cast above already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;  /* strided: advance by the caller-supplied byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS: this variant is compiled only when that macro evaluates to 0 */
+
+/* Unaligned-safe, strided loop casting npy_cfloat to npy_ushort. Half casts would use the raw-bit types below; that branch is disabled here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each of the two values read per source element */
+#define _TYPE2 npy_ushort  /* type of the single value written per destination element */
+
+#endif
+
+/* Select _CONVERT_FN: the half-precision and bool branches are all disabled, leaving a plain C cast to _TYPE2. NOTE(review): C leaves a float-to-integer cast undefined when the value does not fit the target type. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is the source buffer, args[1] the destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps between consecutive elements */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out: this variant does not assume alignment) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cfloat_to_ushort\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy of both floats; src need not be aligned */
+#endif
+
+/* Do the cast: only src_value[0] is converted into dst_value; src_value[1] is ignored (presumably the imaginary part; confirm against the npy_cfloat layout) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy; dst need not be aligned */
+#endif
+
+#if 0
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;  /* strided: advance by the caller-supplied byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* !1 evaluates to 0, so this aligned variant is always compiled */
+
+/* Aligned, contiguous loop casting npy_cfloat to npy_ushort. Half casts would use the raw-bit types below; that branch is disabled here. */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each of the two values read per source element */
+#define _TYPE2 npy_ushort  /* type of the single value written per destination element */
+
+#endif
+
+/* Select _CONVERT_FN: the half-precision and bool branches are all disabled, leaving a plain C cast to _TYPE2. NOTE(review): C leaves a float-to-integer cast undefined when the value does not fit the target type. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* args[0] is the source buffer, args[1] the destination */
+#if !1  /* contiguous variant: strides are never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for their element types unless there is nothing to convert */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cfloat_to_ushort\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted and written straight through dst; src_value[1] is ignored (presumably the imaginary part; confirm against the npy_cfloat layout) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to copy out: the cast above already stored through dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ushort);  /* contiguous: advance by one element */
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous cast loop npy_cfloat -> npy_ushort that makes no alignment
+ * assumptions.  The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so
+ * this unit is built only when that setting evaluates to zero.
+ */
+
+/* Component type read from the source / element type written out. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ushort
+
+/* Plain C conversion of one source component to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Convert dimensions[0] items from args[0] into args[1].
+ *
+ * Every source item is copied bytewise into a two-component npy_float
+ * buffer.  Only component 0 (presumably the real part of the complex
+ * value) is converted; component 1 is dropped.  The pointers advance by
+ * the item sizes, so `strides` is never consulted.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 parts[2];
+    _TYPE2 converted;
+
+    for (; N != 0; --N) {
+        /* bytewise copies: no alignment is required of src or dst */
+        memmove(&parts, src, sizeof(parts));
+        converted = _CONVERT_FN(parts[0]);
+        memmove(dst, &converted, sizeof(converted));
+
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_cfloat);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Strided cast loop npy_cfloat -> npy_uint for aligned operands.
+ * Since `!1` is 0, the guard above is always true and this unit is
+ * always built.
+ */
+
+/* Component type read from the source / element type written out. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_uint
+
+/* Plain C conversion of one source component to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Convert dimensions[0] items from args[0] into args[1], stepping the
+ * source by strides[0] bytes and the destination by strides[1] bytes.
+ *
+ * Both npy_float components of a source item are loaded through typed
+ * pointers, but only component 0 (presumably the real part of the
+ * complex value) is converted and stored.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 parts[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* direct typed loads of both components */
+        parts[0] = ((_TYPE1 *)src)[0];
+        parts[1] = ((_TYPE1 *)src)[1];
+
+        /* only component 0 contributes to the stored value */
+        *(_TYPE2 *)dst = _CONVERT_FN(parts[0]);
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided cast loop npy_cfloat -> npy_uint that makes no alignment
+ * assumptions.  The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so
+ * this unit is built only when that setting evaluates to zero.
+ */
+
+/* Component type read from the source / element type written out. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_uint
+
+/* Plain C conversion of one source component to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Convert dimensions[0] items from args[0] into args[1], stepping the
+ * source by strides[0] bytes and the destination by strides[1] bytes.
+ *
+ * Every source item is copied bytewise into a two-component npy_float
+ * buffer.  Only component 0 (presumably the real part of the complex
+ * value) is converted; component 1 is dropped.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 parts[2];
+    _TYPE2 converted;
+
+    for (; N != 0; --N) {
+        /* bytewise copies: no alignment is required of src or dst */
+        memmove(&parts, src, sizeof(parts));
+        converted = _CONVERT_FN(parts[0]);
+        memmove(dst, &converted, sizeof(converted));
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Contiguous cast loop npy_cfloat -> npy_uint for aligned operands.
+ * Since `!1` is 0, the guard above is always true and this unit is
+ * always built.
+ */
+
+/* Component type read from the source / element type written out. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_uint
+
+/* Plain C conversion of one source component to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Convert dimensions[0] items from args[0] into args[1].
+ *
+ * Both npy_float components of a source item are loaded through typed
+ * pointers, but only component 0 (presumably the real part of the
+ * complex value) is converted and stored.  The pointers advance by the
+ * item sizes, so `strides` is never consulted.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 parts[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* direct typed loads of both components */
+        parts[0] = ((_TYPE1 *)src)[0];
+        parts[1] = ((_TYPE1 *)src)[1];
+
+        /* only component 0 contributes to the stored value */
+        *(_TYPE2 *)dst = _CONVERT_FN(parts[0]);
+
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_cfloat);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous cast loop npy_cfloat -> npy_uint that makes no alignment
+ * assumptions.  The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so
+ * this unit is built only when that setting evaluates to zero.
+ */
+
+/* Component type read from the source / element type written out. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_uint
+
+/* Plain C conversion of one source component to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Convert dimensions[0] items from args[0] into args[1].
+ *
+ * Every source item is copied bytewise into a two-component npy_float
+ * buffer.  Only component 0 (presumably the real part of the complex
+ * value) is converted; component 1 is dropped.  The pointers advance by
+ * the item sizes, so `strides` is never consulted.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 parts[2];
+    _TYPE2 converted;
+
+    for (; N != 0; --N) {
+        /* bytewise copies: no alignment is required of src or dst */
+        memmove(&parts, src, sizeof(parts));
+        converted = _CONVERT_FN(parts[0]);
+        memmove(dst, &converted, sizeof(converted));
+
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_cfloat);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Strided cast loop npy_cfloat -> npy_ulong for aligned operands.
+ * Since `!1` is 0, the guard above is always true and this unit is
+ * always built.
+ */
+
+/* Component type read from the source / element type written out. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ulong
+
+/* Plain C conversion of one source component to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Convert dimensions[0] items from args[0] into args[1], stepping the
+ * source by strides[0] bytes and the destination by strides[1] bytes.
+ *
+ * Both npy_float components of a source item are loaded through typed
+ * pointers, but only component 0 (presumably the real part of the
+ * complex value) is converted and stored.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 parts[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* direct typed loads of both components */
+        parts[0] = ((_TYPE1 *)src)[0];
+        parts[1] = ((_TYPE1 *)src)[1];
+
+        /* only component 0 contributes to the stored value */
+        *(_TYPE2 *)dst = _CONVERT_FN(parts[0]);
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Strided cast loop npy_cfloat -> npy_ulong that makes no alignment
+ * assumptions.  The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so
+ * this unit is built only when that setting evaluates to zero.
+ */
+
+/* Component type read from the source / element type written out. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ulong
+
+/* Plain C conversion of one source component to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Convert dimensions[0] items from args[0] into args[1], stepping the
+ * source by strides[0] bytes and the destination by strides[1] bytes.
+ *
+ * Every source item is copied bytewise into a two-component npy_float
+ * buffer.  Only component 0 (presumably the real part of the complex
+ * value) is converted; component 1 is dropped.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp src_step = strides[0];
+    const npy_intp dst_step = strides[1];
+    _TYPE1 parts[2];
+    _TYPE2 converted;
+
+    for (; N != 0; --N) {
+        /* bytewise copies: no alignment is required of src or dst */
+        memmove(&parts, src, sizeof(parts));
+        converted = _CONVERT_FN(parts[0]);
+        memmove(dst, &converted, sizeof(converted));
+
+        dst += dst_step;
+        src += src_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Contiguous cast loop npy_cfloat -> npy_ulong for aligned operands.
+ * Since `!1` is 0, the guard above is always true and this unit is
+ * always built.
+ */
+
+/* Component type read from the source / element type written out. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ulong
+
+/* Plain C conversion of one source component to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Convert dimensions[0] items from args[0] into args[1].
+ *
+ * Both npy_float components of a source item are loaded through typed
+ * pointers, but only component 0 (presumably the real part of the
+ * complex value) is converted and stored.  The pointers advance by the
+ * item sizes, so `strides` is never consulted.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 parts[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        /* direct typed loads of both components */
+        parts[0] = ((_TYPE1 *)src)[0];
+        parts[1] = ((_TYPE1 *)src)[1];
+
+        /* only component 0 contributes to the stored value */
+        *(_TYPE2 *)dst = _CONVERT_FN(parts[0]);
+
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_cfloat);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Contiguous cast loop npy_cfloat -> npy_ulong that makes no alignment
+ * assumptions.  The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so
+ * this unit is built only when that setting evaluates to zero.
+ */
+
+/* Component type read from the source / element type written out. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ulong
+
+/* Plain C conversion of one source component to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)(x))
+
+/*
+ * Convert dimensions[0] items from args[0] into args[1].
+ *
+ * Every source item is copied bytewise into a two-component npy_float
+ * buffer.  Only component 0 (presumably the real part of the complex
+ * value) is converted; component 1 is dropped.  The pointers advance by
+ * the item sizes, so `strides` is never consulted.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 parts[2];
+    _TYPE2 converted;
+
+    for (; N != 0; --N) {
+        /* bytewise copies: no alignment is required of src or dst */
+        memmove(&parts, src, sizeof(parts));
+        converted = _CONVERT_FN(parts[0]);
+        memmove(dst, &converted, sizeof(converted));
+
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_cfloat);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Strided cast loop from npy_cfloat to npy_ulonglong, aligned variant.
+ *
+ * Processes dimensions[0] elements.  The source is read starting at args[0]
+ * and stepped by strides[0] bytes; the destination is written starting at
+ * args[1] and stepped by strides[1] bytes.  Each source element is read as
+ * two _TYPE1 values; only the first one is passed through _CONVERT_FN and
+ * stored, the second is loaded but not used (presumably the imaginary
+ * component -- confirm against the npy_cfloat layout).
+ *
+ * _TYPE1, _TYPE2 and _CONVERT_FN are the macros selected by the preprocessor
+ * block directly above this function.  The constant `#if 0` / `#if 1`
+ * conditions look like template-expansion output; only the branches that
+ * evaluate true are compiled.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    /* Both pointers must satisfy the alignment of their element type unless there is nothing to do. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cfloat_to_ulonglong\n");*/
+
+    while (N--) {
+        /* Aligned variant: both _TYPE1 components are loaded with direct pointer reads. */
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+        /* No separate store is compiled here: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+        /* Strided variant: advance by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Strided cast loop from npy_cfloat to npy_ulonglong, variant without
+ * alignment assumptions.
+ *
+ * Processes dimensions[0] elements.  The source is read starting at args[0]
+ * and stepped by strides[0] bytes; the destination is written starting at
+ * args[1] and stepped by strides[1] bytes.  Each source element is copied
+ * with memmove into a local _TYPE1[2] buffer; only the first entry is passed
+ * through _CONVERT_FN, the second is copied but not used (presumably the
+ * imaginary component -- confirm against the npy_cfloat layout).  The
+ * converted _TYPE2 value is copied to the destination with memmove.
+ *
+ * _TYPE1, _TYPE2 and _CONVERT_FN are the macros selected by the preprocessor
+ * block directly above this function.  The function sits inside
+ * `#if !(NPY_USE_UNALIGNED_ACCESS && !0)`, so it is only compiled when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to 0.  The constant `#if 0` / `#if 1`
+ * conditions look like template-expansion output; only the branches that
+ * evaluate true are compiled.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cfloat_to_ulonglong\n");*/
+
+    while (N--) {
+        /* The element is copied into the local buffer with memmove rather than read through a cast pointer. */
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+        /* The converted local value is copied out to dst with memmove. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+        /* Strided variant: advance by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Contiguous cast loop from npy_cfloat to npy_ulonglong, aligned variant.
+ *
+ * Processes dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  After each element the source pointer advances by
+ * sizeof(npy_cfloat) and the destination pointer by sizeof(npy_ulonglong);
+ * `strides` is never read in this variant.  Each source element is read as
+ * two _TYPE1 values; only the first one is passed through _CONVERT_FN and
+ * stored, the second is loaded but not used (presumably the imaginary
+ * component -- confirm against the npy_cfloat layout).
+ *
+ * _TYPE1, _TYPE2 and _CONVERT_FN are the macros selected by the preprocessor
+ * block directly above this function.  The constant `#if 0` / `#if 1`
+ * conditions look like template-expansion output; only the branches that
+ * evaluate true are compiled.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    /* Both pointers must satisfy the alignment of their element type unless there is nothing to do. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cfloat_to_ulonglong\n");*/
+
+    while (N--) {
+        /* Aligned variant: both _TYPE1 components are loaded with direct pointer reads. */
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+        /* No separate store is compiled here: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+        /* Contiguous variant: advance by the element sizes instead of strides. */
+#if 1
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Contiguous cast loop from npy_cfloat to npy_ulonglong, variant without
+ * alignment assumptions.
+ *
+ * Processes dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  After each element the source pointer advances by
+ * sizeof(npy_cfloat) and the destination pointer by sizeof(npy_ulonglong);
+ * `strides` is never read in this variant.  Each source element is copied
+ * with memmove into a local _TYPE1[2] buffer; only the first entry is passed
+ * through _CONVERT_FN, the second is copied but not used (presumably the
+ * imaginary component -- confirm against the npy_cfloat layout).  The
+ * converted _TYPE2 value is copied to the destination with memmove.
+ *
+ * _TYPE1, _TYPE2 and _CONVERT_FN are the macros selected by the preprocessor
+ * block directly above this function.  The function sits inside
+ * `#if !(NPY_USE_UNALIGNED_ACCESS && !0)`, so it is only compiled when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to 0.  The constant `#if 0` / `#if 1`
+ * conditions look like template-expansion output; only the branches that
+ * evaluate true are compiled.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cfloat_to_ulonglong\n");*/
+
+    while (N--) {
+        /* The element is copied into the local buffer with memmove rather than read through a cast pointer. */
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+        /* The converted local value is copied out to dst with memmove. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+        /* Contiguous variant: advance by the element sizes instead of strides. */
+#if 1
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Strided cast loop from npy_cfloat to npy_byte, aligned variant.
+ *
+ * Processes dimensions[0] elements.  The source is read starting at args[0]
+ * and stepped by strides[0] bytes; the destination is written starting at
+ * args[1] and stepped by strides[1] bytes.  Each source element is read as
+ * two _TYPE1 values; only the first one is passed through _CONVERT_FN and
+ * stored, the second is loaded but not used (presumably the imaginary
+ * component -- confirm against the npy_cfloat layout).
+ *
+ * _TYPE1, _TYPE2 and _CONVERT_FN are the macros selected by the preprocessor
+ * block directly above this function.  The constant `#if 0` / `#if 1`
+ * conditions look like template-expansion output; only the branches that
+ * evaluate true are compiled.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    /* Both pointers must satisfy the alignment of their element type unless there is nothing to do. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cfloat_to_byte\n");*/
+
+    while (N--) {
+        /* Aligned variant: both _TYPE1 components are loaded with direct pointer reads. */
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+        /* No separate store is compiled here: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+        /* Strided variant: advance by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Strided cast loop from npy_cfloat to npy_byte, variant without alignment
+ * assumptions.
+ *
+ * Processes dimensions[0] elements.  The source is read starting at args[0]
+ * and stepped by strides[0] bytes; the destination is written starting at
+ * args[1] and stepped by strides[1] bytes.  Each source element is copied
+ * with memmove into a local _TYPE1[2] buffer; only the first entry is passed
+ * through _CONVERT_FN, the second is copied but not used (presumably the
+ * imaginary component -- confirm against the npy_cfloat layout).  The
+ * converted _TYPE2 value is copied to the destination with memmove.
+ *
+ * _TYPE1, _TYPE2 and _CONVERT_FN are the macros selected by the preprocessor
+ * block directly above this function.  The function sits inside
+ * `#if !(NPY_USE_UNALIGNED_ACCESS && !0)`, so it is only compiled when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to 0.  The constant `#if 0` / `#if 1`
+ * conditions look like template-expansion output; only the branches that
+ * evaluate true are compiled.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cfloat_to_byte\n");*/
+
+    while (N--) {
+        /* The element is copied into the local buffer with memmove rather than read through a cast pointer. */
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+        /* The converted local value is copied out to dst with memmove. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+        /* Strided variant: advance by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Contiguous cast loop from npy_cfloat to npy_byte, aligned variant.
+ *
+ * Processes dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  After each element the source pointer advances by
+ * sizeof(npy_cfloat) and the destination pointer by sizeof(npy_byte);
+ * `strides` is never read in this variant.  Each source element is read as
+ * two _TYPE1 values; only the first one is passed through _CONVERT_FN and
+ * stored, the second is loaded but not used (presumably the imaginary
+ * component -- confirm against the npy_cfloat layout).
+ *
+ * _TYPE1, _TYPE2 and _CONVERT_FN are the macros selected by the preprocessor
+ * block directly above this function.  The constant `#if 0` / `#if 1`
+ * conditions look like template-expansion output; only the branches that
+ * evaluate true are compiled.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    /* Both pointers must satisfy the alignment of their element type unless there is nothing to do. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cfloat_to_byte\n");*/
+
+    while (N--) {
+        /* Aligned variant: both _TYPE1 components are loaded with direct pointer reads. */
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+        /* No separate store is compiled here: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+        /* Contiguous variant: advance by the element sizes instead of strides. */
+#if 1
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Contiguous cast loop from npy_cfloat to npy_byte, variant without
+ * alignment assumptions.
+ *
+ * Processes dimensions[0] elements, reading from args[0] and writing to
+ * args[1].  After each element the source pointer advances by
+ * sizeof(npy_cfloat) and the destination pointer by sizeof(npy_byte);
+ * `strides` is never read in this variant.  Each source element is copied
+ * with memmove into a local _TYPE1[2] buffer; only the first entry is passed
+ * through _CONVERT_FN, the second is copied but not used (presumably the
+ * imaginary component -- confirm against the npy_cfloat layout).  The
+ * converted _TYPE2 value is copied to the destination with memmove.
+ *
+ * _TYPE1, _TYPE2 and _CONVERT_FN are the macros selected by the preprocessor
+ * block directly above this function.  The function sits inside
+ * `#if !(NPY_USE_UNALIGNED_ACCESS && !0)`, so it is only compiled when
+ * NPY_USE_UNALIGNED_ACCESS evaluates to 0.  The constant `#if 0` / `#if 1`
+ * conditions look like template-expansion output; only the branches that
+ * evaluate true are compiled.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cfloat_to_byte\n");*/
+
+    while (N--) {
+        /* The element is copied into the local buffer with memmove rather than read through a cast pointer. */
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+        /* The converted local value is copied out to dst with memmove. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+        /* Contiguous variant: advance by the element sizes instead of strides. */
+#if 1
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+/*
+ * Strided cast loop from npy_cfloat to npy_short, aligned variant.
+ *
+ * Processes dimensions[0] elements.  The source is read starting at args[0]
+ * and stepped by strides[0] bytes; the destination is written starting at
+ * args[1] and stepped by strides[1] bytes.  Each source element is read as
+ * two _TYPE1 values; only the first one is passed through _CONVERT_FN and
+ * stored, the second is loaded but not used (presumably the imaginary
+ * component -- confirm against the npy_cfloat layout).
+ *
+ * _TYPE1, _TYPE2 and _CONVERT_FN are the macros selected by the preprocessor
+ * block directly above this function.  The constant `#if 0` / `#if 1`
+ * conditions look like template-expansion output; only the branches that
+ * evaluate true are compiled.
+ *
+ * `context` is not referenced in the body.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    /* Both pointers must satisfy the alignment of their element type unless there is nothing to do. */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cfloat_to_short\n");*/
+
+    while (N--) {
+        /* Aligned variant: both _TYPE1 components are loaded with direct pointer reads. */
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+        /* No separate store is compiled here: the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+        /* Strided variant: advance by the caller-supplied byte strides. */
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair; the #else arm below is used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each value read from src */
+#define _TYPE2 npy_short  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive arm: half -> other conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive arm: other -> half conversions */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active arm */
+
+#  if 0 || 0  /* inactive: conversion to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Strided, alignment-agnostic cast of dimensions[0] elements from cfloat (read as two npy_float) to npy_short; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_short(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not used; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element steps in bytes */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* scratch for the two npy_float values of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* scratch for the converted value before it is copied out */
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cfloat_to_short\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both floats, so src may be unaligned */
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active statement: result goes to scratch first */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* inactive arm (the outer condition is 1) */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store, so dst may be unaligned */
+#endif
+
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;  /* advance by the caller-supplied output stride */
+        src += src_stride;  /* advance by the caller-supplied input stride */
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: the `!1` makes this guard always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair; the #else arm below is used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each value read from src */
+#define _TYPE2 npy_short  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive arm: half -> other conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive arm: other -> half conversions */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active arm */
+
+#  if 0 || 0  /* inactive: conversion to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Contiguous cast of dimensions[0] elements from cfloat (read as two npy_float) to npy_short via direct pointer access; src and dst must be aligned (asserted); always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_short(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not used; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* false: this variant steps by element size, so strides is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* scratch for the two npy_float values of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: results are stored straight into dst, so no dst_value exists */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for the types they are accessed as (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cfloat_to_short\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* direct typed load; relies on the alignment asserted above */
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded but not used by the active cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active statement: writes the result directly into dst */
+#    endif
+#  endif
+#else  /* inactive arm (the outer condition is 1) */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* emits nothing: the nested block is disabled and dst was already written above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_short);  /* advance by one packed output element */
+        src += sizeof(npy_cfloat);  /* advance by one packed input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair; the #else arm below is used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each value read from src */
+#define _TYPE2 npy_short  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive arm: half -> other conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive arm: other -> half conversions */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active arm */
+
+#  if 0 || 0  /* inactive: conversion to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Contiguous, alignment-agnostic cast of dimensions[0] elements from cfloat (read as two npy_float) to npy_short; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_short(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not used; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* false: this variant steps by element size, so strides is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* scratch for the two npy_float values of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* scratch for the converted value before it is copied out */
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cfloat_to_short\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both floats, so src may be unaligned */
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active statement: result goes to scratch first */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* inactive arm (the outer condition is 1) */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store, so dst may be unaligned */
+#endif
+
+#if 1
+        dst += sizeof(npy_short);  /* advance by one packed output element */
+        src += sizeof(npy_cfloat);  /* advance by one packed input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: the `!1` makes this guard always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair; the #else arm below is used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each value read from src */
+#define _TYPE2 npy_int  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive arm: half -> other conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive arm: other -> half conversions */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active arm */
+
+#  if 0 || 0  /* inactive: conversion to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Strided cast of dimensions[0] elements from cfloat (read as two npy_float) to npy_int via direct pointer access; src and dst must be aligned (asserted); always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_int(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not used; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element steps in bytes */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* scratch for the two npy_float values of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: results are stored straight into dst, so no dst_value exists */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for the types they are accessed as (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cfloat_to_int\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* direct typed load; relies on the alignment asserted above */
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded but not used by the active cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active statement: writes the result directly into dst */
+#    endif
+#  endif
+#else  /* inactive arm (the outer condition is 1) */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* emits nothing: the nested block is disabled and dst was already written above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;  /* advance by the caller-supplied output stride */
+        src += src_stride;  /* advance by the caller-supplied input stride */
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair; the #else arm below is used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each value read from src */
+#define _TYPE2 npy_int  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive arm: half -> other conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive arm: other -> half conversions */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active arm */
+
+#  if 0 || 0  /* inactive: conversion to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Strided, alignment-agnostic cast of dimensions[0] elements from cfloat (read as two npy_float) to npy_int; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_int(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not used; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element steps in bytes */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* scratch for the two npy_float values of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* scratch for the converted value before it is copied out */
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cfloat_to_int\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both floats, so src may be unaligned */
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active statement: result goes to scratch first */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* inactive arm (the outer condition is 1) */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store, so dst may be unaligned */
+#endif
+
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;  /* advance by the caller-supplied output stride */
+        src += src_stride;  /* advance by the caller-supplied input stride */
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: the `!1` makes this guard always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair; the #else arm below is used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each value read from src */
+#define _TYPE2 npy_int  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive arm: half -> other conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive arm: other -> half conversions */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active arm */
+
+#  if 0 || 0  /* inactive: conversion to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Contiguous cast of dimensions[0] elements from cfloat (read as two npy_float) to npy_int via direct pointer access; src and dst must be aligned (asserted); always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_int(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not used; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* false: this variant steps by element size, so strides is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* scratch for the two npy_float values of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: results are stored straight into dst, so no dst_value exists */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for the types they are accessed as (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cfloat_to_int\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* direct typed load; relies on the alignment asserted above */
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded but not used by the active cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active statement: writes the result directly into dst */
+#    endif
+#  endif
+#else  /* inactive arm (the outer condition is 1) */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* emits nothing: the nested block is disabled and dst was already written above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_int);  /* advance by one packed output element */
+        src += sizeof(npy_cfloat);  /* advance by one packed input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair; the #else arm below is used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each value read from src */
+#define _TYPE2 npy_int  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive arm: half -> other conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive arm: other -> half conversions */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active arm */
+
+#  if 0 || 0  /* inactive: conversion to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Contiguous, alignment-agnostic cast of dimensions[0] elements from cfloat (read as two npy_float) to npy_int; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_int(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not used; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* false: this variant steps by element size, so strides is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* scratch for the two npy_float values of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* scratch for the converted value before it is copied out */
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cfloat_to_int\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both floats, so src may be unaligned */
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active statement: result goes to scratch first */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* inactive arm (the outer condition is 1) */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store, so dst may be unaligned */
+#endif
+
+#if 1
+        dst += sizeof(npy_int);  /* advance by one packed output element */
+        src += sizeof(npy_cfloat);  /* advance by one packed input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: the `!1` makes this guard always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair; the #else arm below is used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each value read from src */
+#define _TYPE2 npy_long  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive arm: half -> other conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive arm: other -> half conversions */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active arm */
+
+#  if 0 || 0  /* inactive: conversion to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Strided cast of dimensions[0] elements from cfloat (read as two npy_float) to npy_long via direct pointer access; src and dst must be aligned (asserted); always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_long(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not used; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element steps in bytes */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* scratch for the two npy_float values of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: results are stored straight into dst, so no dst_value exists */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for the types they are accessed as (not checked when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cfloat_to_long\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* direct typed load; relies on the alignment asserted above */
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded but not used by the active cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active statement: writes the result directly into dst */
+#    endif
+#  endif
+#else  /* inactive arm (the outer condition is 1) */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* emits nothing: the nested block is disabled and dst was already written above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;  /* advance by the caller-supplied output stride */
+        src += src_stride;  /* advance by the caller-supplied input stride */
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false for this type pair; the #else arm below is used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* type of each value read from src */
+#define _TYPE2 npy_long  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive arm: half -> other conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive arm: other -> half conversions */
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active arm */
+
+#  if 0 || 0  /* inactive: conversion to npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Strided, alignment-agnostic cast of dimensions[0] elements from cfloat (read as two npy_float) to npy_long; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_long(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not used; args[0] = input, args[1] = output */
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* per-element steps in bytes */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* scratch for the two npy_float values of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* scratch for the converted value before it is copied out */
+#endif
+
+#if 0  /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cfloat_to_long\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both floats, so src may be unaligned */
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active statement: result goes to scratch first */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* inactive arm (the outer condition is 1) */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store, so dst may be unaligned */
+#endif
+
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;  /* advance by the caller-supplied output stride */
+        src += src_stride;  /* advance by the caller-supplied input stride */
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this aligned variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion (neither flag is set here, so the #else branch applies) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* active: type of one source component */
+#define _TYPE2 npy_long  /* active: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else is reached: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition */
+#  endif
+
+#endif
+/* Cast loop, aligned + contiguous: converts the first npy_float of each npy_cfloat element (presumably the real part) to npy_long. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_long(
+        PyArrayMethod_Context *context, char *const *args,  /* args[0]: source, args[1]: destination; context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0]: element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* disabled: this contiguous variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* local pair of _TYPE1 values read from src on each iteration */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0  /* no dst_value is declared in this variant: the result is written straight to dst */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* alignment checks enabled for this variant */
+   /* sanity check: src and dst must be aligned for the typed accesses below (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cfloat_to_long\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 1  /* direct typed loads */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded, but the active cast below ignores it */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only src_value[0] is converted in the active branch) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: convert and store directly */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing is emitted here: the nested block is disabled and dst was already written */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous: advance by the element sizes */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* single exit point; always returns 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (neither flag is set here, so the #else branch applies) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* active: type of one source component */
+#define _TYPE2 npy_long  /* active: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else is reached: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition */
+#  endif
+
+#endif
+/* Cast loop, contiguous, no alignment checks: converts the first npy_float of each npy_cfloat element (presumably the real part) to npy_long. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_long(
+        PyArrayMethod_Context *context, char *const *args,  /* args[0]: source, args[1]: destination; context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0]: element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* disabled: this contiguous variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* local pair of _TYPE1 values read from src on each iteration */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0  /* the "#elif !0" branch below declares a scalar temporary for the result */
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* no alignment checks in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cfloat_to_long\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 0  /* direct loads disabled: the #else branch copies with memmove */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both components */
+#endif
+
+/* Do the cast (only src_value[0] is converted in the active branch) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch: convert into the temporary */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* typed store disabled: the #else branch writes the temporary with memmove */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the converted value */
+#endif
+
+#if 1  /* contiguous: advance by the element sizes */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* single exit point; always returns 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this aligned variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion (neither flag is set here, so the #else branch applies) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* active: type of one source component */
+#define _TYPE2 npy_longlong  /* active: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else is reached: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition */
+#  endif
+
+#endif
+/* Cast loop, aligned + strided: converts the first npy_float of each npy_cfloat element (presumably the real part) to npy_longlong. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,  /* args[0]: source, args[1]: destination; context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0]: element count; strides[0]/strides[1]: src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* enabled: byte steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* local pair of _TYPE1 values read from src on each iteration */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0  /* no dst_value is declared in this variant: the result is written straight to dst */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* alignment checks enabled for this variant */
+   /* sanity check: src and dst must be aligned for the typed accesses below (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cfloat_to_longlong\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 1  /* direct typed loads */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded, but the active cast below ignores it */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only src_value[0] is converted in the active branch) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: convert and store directly */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing is emitted here: the nested block is disabled and dst was already written */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* disabled: the #else branch advances by the byte strides */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* single exit point; always returns 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (neither flag is set here, so the #else branch applies) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* active: type of one source component */
+#define _TYPE2 npy_longlong  /* active: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else is reached: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition */
+#  endif
+
+#endif
+/* Cast loop, strided, no alignment checks: converts the first npy_float of each npy_cfloat element (presumably the real part) to npy_longlong. */
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,  /* args[0]: source, args[1]: destination; context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0]: element count; strides[0]/strides[1]: src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* enabled: byte steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* local pair of _TYPE1 values read from src on each iteration */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0  /* the "#elif !0" branch below declares a scalar temporary for the result */
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* no alignment checks in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cfloat_to_longlong\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 0  /* direct loads disabled: the #else branch copies with memmove */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both components */
+#endif
+
+/* Do the cast (only src_value[0] is converted in the active branch) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch: convert into the temporary */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* typed store disabled: the #else branch writes the temporary with memmove */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the converted value */
+#endif
+
+#if 0  /* disabled: the #else branch advances by the byte strides */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* single exit point; always returns 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this aligned variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion (neither flag is set here, so the #else branch applies) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* active: type of one source component */
+#define _TYPE2 npy_longlong  /* active: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else is reached: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition */
+#  endif
+
+#endif
+/* Cast loop, aligned + contiguous: converts the first npy_float of each npy_cfloat element (presumably the real part) to npy_longlong. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,  /* args[0]: source, args[1]: destination; context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0]: element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* disabled: this contiguous variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* local pair of _TYPE1 values read from src on each iteration */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0  /* no dst_value is declared in this variant: the result is written straight to dst */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* alignment checks enabled for this variant */
+   /* sanity check: src and dst must be aligned for the typed accesses below (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cfloat_to_longlong\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 1  /* direct typed loads */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded, but the active cast below ignores it */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only src_value[0] is converted in the active branch) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: convert and store directly */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing is emitted here: the nested block is disabled and dst was already written */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous: advance by the element sizes */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* single exit point; always returns 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (neither flag is set here, so the #else branch applies) */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_float  /* active: type of one source component */
+#define _TYPE2 npy_longlong  /* active: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final #else is reached: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition */
+#  endif
+
+#endif
+/* Cast loop, contiguous, no alignment checks: converts the first npy_float of each npy_cfloat element (presumably the real part) to npy_longlong. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_longlong(
+        PyArrayMethod_Context *context, char *const *args,  /* args[0]: source, args[1]: destination; context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0]: element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* disabled: this contiguous variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* local pair of _TYPE1 values read from src on each iteration */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0  /* the "#elif !0" branch below declares a scalar temporary for the result */
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* no alignment checks in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cfloat_to_longlong\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 0  /* direct loads disabled: the #else branch copies with memmove */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both components */
+#endif
+
+/* Do the cast (only src_value[0] is converted in the active branch) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch: convert into the temporary */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* typed store disabled: the #else branch writes the temporary with memmove */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the converted value */
+#endif
+
+#if 1  /* contiguous: advance by the element sizes */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* single exit point; always returns 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this aligned variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion (applies here: source components are handled as npy_uint32) */
+#if 0 || 1
+
+#  if 1
+#    define _TYPE1 npy_uint32  /* active: one source component, accessed as 32-bit data */
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half  /* active: destination element type */
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the "#elif 1" group selects npy_floatbits_to_halfbits) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)  /* active definition */
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast loop, aligned + strided: passes the first 32-bit component of each npy_cfloat element (presumably the real part) through npy_floatbits_to_halfbits and stores it as npy_half. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_half(
+        PyArrayMethod_Context *context, char *const *args,  /* args[0]: source, args[1]: destination; context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0]: element count; strides[0]/strides[1]: src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* enabled: byte steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* local pair of _TYPE1 values read from src on each iteration */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0  /* no dst_value is declared in this variant: the result is written straight to dst */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* alignment checks enabled for this variant */
+   /* sanity check: src and dst must be aligned for the typed accesses below (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cfloat_to_half\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 1  /* direct typed loads */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded, but the active cast below ignores it */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only src_value[0] is converted in the active branch) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: convert and store directly */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing is emitted here: the nested block is disabled and dst was already written */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* disabled: the #else branch advances by the byte strides */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* single exit point; always returns 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion (applies here: source components are handled as npy_uint32) */
+#if 0 || 1
+
+#  if 1
+#    define _TYPE1 npy_uint32  /* active: one source component, accessed as 32-bit data */
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half  /* active: destination element type */
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the "#elif 1" group selects npy_floatbits_to_halfbits) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)  /* active definition */
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast loop, strided, no alignment checks: passes the first 32-bit component of each npy_cfloat element (presumably the real part) through npy_floatbits_to_halfbits and stores it as npy_half. */
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_half(
+        PyArrayMethod_Context *context, char *const *args,  /* args[0]: source, args[1]: destination; context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0]: element count; strides[0]/strides[1]: src/dst byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* enabled: byte steps are taken from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* local pair of _TYPE1 values read from src on each iteration */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0  /* the "#elif !0" branch below declares a scalar temporary for the result */
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* no alignment checks in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cfloat_to_half\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 0  /* direct loads disabled: the #else branch copies with memmove */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both components */
+#endif
+
+/* Do the cast (only src_value[0] is converted in the active branch) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch: convert into the temporary */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* typed store disabled: the #else branch writes the temporary with memmove */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the converted value */
+#endif
+
+#if 0  /* disabled: the #else branch advances by the byte strides */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* single exit point; always returns 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "!1" is 0, so this aligned variant is always compiled */
+
+/* For half types, don't use actual double/float types in conversion (applies here: source components are handled as npy_uint32) */
+#if 0 || 1
+
+#  if 1
+#    define _TYPE1 npy_uint32  /* active: one source component, accessed as 32-bit data */
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half  /* active: destination element type */
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the "#elif 1" group selects npy_floatbits_to_halfbits) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)  /* active definition */
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Cast loop, aligned + contiguous: passes the first 32-bit component of each npy_cfloat element (presumably the real part) through npy_floatbits_to_halfbits and stores it as npy_half. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_half(
+        PyArrayMethod_Context *context, char *const *args,  /* args[0]: source, args[1]: destination; context is not referenced */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0]: element count; strides is not read in this variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* disabled: this contiguous variant never reads strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* local pair of _TYPE1 values read from src on each iteration */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0  /* no dst_value is declared in this variant: the result is written straight to dst */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* alignment checks enabled for this variant */
+   /* sanity check: src and dst must be aligned for the typed accesses below (skipped when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cfloat_to_half\n");*/
+
+    while (N--) {  /* one iteration per element */
+#if 1  /* direct typed loads */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded, but the active cast below ignores it */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only src_value[0] is converted in the active branch) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: convert and store directly */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing is emitted here: the nested block is disabled and dst was already written */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* contiguous: advance by the element sizes */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* single exit point; always returns 0 */
+}
+
+#undef _CONVERT_FN  /* drop the per-instantiation helper macros */
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* equivalent to !NPY_USE_UNALIGNED_ACCESS: this variant is dropped when that macro is nonzero */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1  /* true: the integer/half storage types below are used */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function (selected here: npy_floatbits_to_halfbits) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop, no alignment assumed: converts the first float of each cfloat element to half via its bit pattern. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_half(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* buffer for both components of one source element */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: scalar temporary, copied out with memmove below */
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cfloat_to_half\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active branch: byte-wise copy of both components */
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is ignored */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active branch: byte-wise store of the converted value */
+#endif
+
+#if 1
+        dst += sizeof(npy_half);  /* active branch: fixed element-size steps */
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "&& !1" is always false, so this guard always passes */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below supplies the types */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function (selected here: plain C cast to _TYPE2) */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop for aligned buffers: stores the first float of each cfloat element as npy_float. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_float(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* buffer for both components of one source element */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no dst_value temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 / _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cfloat_to_float\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* active branch: direct typed loads */
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded but not used by the cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is ignored */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: result is written straight to dst */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to store here: the cast above already wrote dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;  /* active branch: caller-supplied byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* equivalent to !NPY_USE_UNALIGNED_ACCESS: this variant is dropped when that macro is nonzero */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below supplies the types */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function (selected here: plain C cast to _TYPE2) */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop, no alignment assumed: copies the first float of each cfloat element out as npy_float. */
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_float(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* buffer for both components of one source element */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: scalar temporary, copied out with memmove below */
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cfloat_to_float\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active branch: byte-wise copy of both components */
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is ignored */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active branch: byte-wise store of the converted value */
+#endif
+
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;  /* active branch: caller-supplied byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "&& !1" is always false, so this guard always passes */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below supplies the types */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function (selected here: plain C cast to _TYPE2) */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop for aligned buffers: stores the first float of each cfloat element as npy_float. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_float(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* buffer for both components of one source element */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no dst_value temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 / _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cfloat_to_float\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* active branch: direct typed loads */
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded but not used by the cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is ignored */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: result is written straight to dst */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to store here: the cast above already wrote dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_float);  /* active branch: fixed element-size steps */
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* equivalent to !NPY_USE_UNALIGNED_ACCESS: this variant is dropped when that macro is nonzero */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below supplies the types */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function (selected here: plain C cast to _TYPE2) */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop, no alignment assumed: copies the first float of each cfloat element out as npy_float. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_float(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* buffer for both components of one source element */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: scalar temporary, copied out with memmove below */
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cfloat_to_float\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active branch: byte-wise copy of both components */
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is ignored */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active branch: byte-wise store of the converted value */
+#endif
+
+#if 1
+        dst += sizeof(npy_float);  /* active branch: fixed element-size steps */
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "&& !1" is always false, so this guard always passes */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below supplies the types */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function (selected here: plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop for aligned buffers: widens the first float of each cfloat element to npy_double. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_double(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* buffer for both components of one source element */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no dst_value temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 / _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cfloat_to_double\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* active branch: direct typed loads */
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded but not used by the cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is ignored */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: result is written straight to dst */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to store here: the cast above already wrote dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;  /* active branch: caller-supplied byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* equivalent to !NPY_USE_UNALIGNED_ACCESS: this variant is dropped when that macro is nonzero */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below supplies the types */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function (selected here: plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop, no alignment assumed: widens the first float of each cfloat element to npy_double. */
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_double(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* buffer for both components of one source element */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: scalar temporary, copied out with memmove below */
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cfloat_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active branch: byte-wise copy of both components */
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is ignored */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active branch: byte-wise store of the converted value */
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;  /* active branch: caller-supplied byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "&& !1" is always false, so this guard always passes */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below supplies the types */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function (selected here: plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop for aligned buffers: widens the first float of each cfloat element to npy_double. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_double(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* buffer for both components of one source element */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no dst_value temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 / _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cfloat_to_double\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* active branch: direct typed loads */
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded but not used by the cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is ignored */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: result is written straight to dst */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to store here: the cast above already wrote dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_double);  /* active branch: fixed element-size steps */
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* equivalent to !NPY_USE_UNALIGNED_ACCESS: this variant is dropped when that macro is nonzero */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below supplies the types */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function (selected here: plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop, no alignment assumed: widens the first float of each cfloat element to npy_double. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_double(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides is not read */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* buffer for both components of one source element */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* true: scalar temporary, copied out with memmove below */
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cfloat_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active branch: byte-wise copy of both components */
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is ignored */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active branch: byte-wise store of the converted value */
+#endif
+
+#if 1
+        dst += sizeof(npy_double);  /* active branch: fixed element-size steps */
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* "&& !1" is always false, so this guard always passes */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the #else branch below supplies the types */
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function (selected here: plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop for aligned buffers: widens the first float of each cfloat element to npy_longdouble. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,  /* context is not read; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* dimensions[0] = element count; strides[0]/strides[1] = source/destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* buffer for both components of one source element */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* false: no dst_value temporary in this variant */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 / _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cfloat_to_longdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* active branch: direct typed loads */
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded but not used by the cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is ignored */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: result is written straight to dst */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to store here: the cast above already wrote dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;  /* active branch: caller-supplied byte strides */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* This #else branch is the one compiled, because the 0 || 0 test above is false. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Compiled branch: both tests above are 0, and the 0 || 0 test below selects the plain cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop that loads and stores through memmove: for each of N elements, copies two npy_float components from src and stores (npy_longdouble) of component [0] at dst; component [1] is copied but unused. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Temporaries: a two-component source buffer and a scalar destination value. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cfloat_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only src_value[0] is converted, into the dst_value temporary) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Copy the converted value out with memmove instead of a typed store. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both char pointers by the caller-supplied strides (the #else branch is compiled). */
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* !1 is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* This #else branch is the one compiled, because the 0 || 0 test above is false. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Compiled branch: both tests above are 0, and the 0 || 0 test below selects the plain cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned fixed-step loop: for each of N elements, loads two npy_float components from src and stores (npy_longdouble) of component [0] at dst; component [1] is loaded but unused and the strides argument is never read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Only a source temporary is declared; neither dst_value declaration below is compiled. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must meet the alignment of _TYPE1 and _TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cfloat_to_longdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only src_value[0] is converted; the result is written straight through dst) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* The block below emits nothing in this variant: the cast above already stored the result. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance by fixed element sizes: sizeof(npy_longdouble) out, sizeof(npy_cfloat) in. */
+#if 1
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* This #else branch is the one compiled, because the 0 || 0 test above is false. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Compiled branch: both tests above are 0, and the 0 || 0 test below selects the plain cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Fixed-step loop that loads and stores through memmove: for each of N elements, copies two npy_float components from src and stores (npy_longdouble) of component [0] at dst; component [1] is copied but unused and the strides argument is never read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Temporaries: a two-component source buffer and a scalar destination value. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cfloat_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (only src_value[0] is converted, into the dst_value temporary) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Copy the converted value out with memmove instead of a typed store. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance by fixed element sizes: sizeof(npy_longdouble) out, sizeof(npy_cfloat) in. */
+#if 1
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* !1 is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+/* This #else branch is the one compiled, because the 0 || 0 test above is false. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Compiled branch: both tests above are 0, and the 0 || 0 test below selects the plain cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: for each of N elements, loads two npy_float components from src, casts each to npy_float (a same-type cast) and stores the pair at dst. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Temporaries: two-component buffers for both the source and the destination. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must meet the alignment of _TYPE1 and _TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cfloat_to_cfloat\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (each of the two components is converted separately) */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Typed stores of both converted components. */
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both char pointers by the caller-supplied strides (the #else branch is compiled). */
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+/* This #else branch is the one compiled, because the 0 || 0 test above is false. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Compiled branch: both tests above are 0, and the 0 || 0 test below selects the plain cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop that loads and stores through memmove: for each of N elements, copies two npy_float components from src, casts each to npy_float (a same-type cast) and copies the pair to dst. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Temporaries: two-component buffers for both the source and the destination. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cfloat_to_cfloat\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (each of the two components is converted separately) */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Copy both converted components out with memmove instead of typed stores. */
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both char pointers by the caller-supplied strides (the #else branch is compiled). */
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* !1 is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+/* This #else branch is the one compiled, because the 0 || 0 test above is false. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Compiled branch: both tests above are 0, and the 0 || 0 test below selects the plain cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned fixed-step loop: for each of N elements, loads two npy_float components from src, casts each to npy_float (a same-type cast) and stores the pair at dst; the strides argument is never read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Temporaries: two-component buffers for both the source and the destination. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must meet the alignment of _TYPE1 and _TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cfloat_to_cfloat\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (each of the two components is converted separately) */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Typed stores of both converted components. */
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both pointers by the fixed element size sizeof(npy_cfloat). */
+#if 1
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+/* This #else branch is the one compiled, because the 0 || 0 test above is false. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Compiled branch: both tests above are 0, and the 0 || 0 test below selects the plain cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Fixed-step loop that loads and stores through memmove: for each of N elements, copies two npy_float components from src, casts each to npy_float (a same-type cast) and copies the pair to dst; the strides argument is never read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Temporaries: two-component buffers for both the source and the destination. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cfloat_to_cfloat\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (each of the two components is converted separately) */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Copy both converted components out with memmove instead of typed stores. */
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both pointers by the fixed element size sizeof(npy_cfloat). */
+#if 1
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* !1 is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+/* This #else branch is the one compiled, because the 0 || 0 test above is false. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Compiled branch: both tests above are 0, and the 0 || 0 test below selects the plain cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: for each of N elements, loads two npy_float components from src, casts each to npy_double and stores the pair at dst. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Temporaries: two-component buffers for both the source and the destination. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must meet the alignment of _TYPE1 and _TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cfloat_to_cdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (each of the two components is converted separately) */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Typed stores of both converted components. */
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both char pointers by the caller-supplied strides (the #else branch is compiled). */
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+/* This #else branch is the one compiled, because the 0 || 0 test above is false. */
+#define _TYPE1 npy_float
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Compiled branch: both tests above are 0, and the 0 || 0 test below selects the plain cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop that loads and stores through memmove: for each of N elements, copies two npy_float components from src, casts each to npy_double and copies the pair to dst. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+/* Temporaries: two-component buffers for both the source and the destination. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cfloat_to_cdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (each of the two components is converted separately) */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+/* Copy both converted components out with memmove instead of typed stores. */
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+/* Advance both char pointers by the caller-supplied strides (the #else branch is compiled). */
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* closes the NPY_USE_UNALIGNED_ACCESS guard */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: (X && !1) is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else  /* branch taken: the condition above is 0 */
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken: both selectors above are 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cfloat -> cdouble cast loop using direct typed loads/stores. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1  /* compiled out: strides[] is not read here */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* both components of one converted element */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cfloat_to_cdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* typed load; alignment asserted above */
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);  /* each component is cast on its own */
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* typed store; alignment asserted above */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cdouble);  /* contiguous: step by one element size */
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else  /* branch taken: the condition above is 0 */
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken: both selectors above are 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cfloat -> cdouble cast loop; element I/O goes through memmove. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1  /* compiled out: strides[] is not read here */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* both components of one converted element */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cfloat_to_cdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load, no typed read of src */
+#endif
+
+/* Do the cast */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);  /* each component is cast on its own */
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store, no typed write to dst */
+#endif
+
+#if 1
+        dst += sizeof(npy_cdouble);  /* contiguous: step by one element size */
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: (X && !1) is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else  /* branch taken: the condition above is 0 */
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken: both selectors above are 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cfloat -> clongdouble cast loop using direct typed loads/stores. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cfloat_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* both components of one converted element */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cfloat_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* typed load; alignment asserted above */
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);  /* each component is cast on its own */
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* typed store; alignment asserted above */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;  /* advance by the strides read from strides[] */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else  /* branch taken: the condition above is 0 */
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken: both selectors above are 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cfloat -> clongdouble cast loop; element I/O goes through memmove. */
+static NPY_GCC_OPT_3 int
+_cast_cfloat_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* both components of one converted element */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cfloat_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load, no typed read of src */
+#endif
+
+/* Do the cast */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);  /* each component is cast on its own */
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store, no typed write to dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;  /* advance by the strides read from strides[] */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: (X && !1) is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else  /* branch taken: the condition above is 0 */
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken: both selectors above are 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cfloat -> clongdouble cast loop using direct typed loads/stores. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cfloat_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1  /* compiled out: strides[] is not read here */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* both components of one converted element */
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cfloat_to_clongdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* typed load; alignment asserted above */
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);  /* each component is cast on its own */
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* typed store; alignment asserted above */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_clongdouble);  /* contiguous: step by one element size */
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 1
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_float
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else  /* branch taken: the condition above is 0 */
+
+#define _TYPE1 npy_float
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken: both selectors above are 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cfloat -> clongdouble cast loop; element I/O goes through memmove. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cfloat_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1  /* compiled out: strides[] is not read here */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];  /* both components of one converted element */
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cfloat_to_clongdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load, no typed read of src */
+#endif
+
+/* Do the cast */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);  /* each component is cast on its own */
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store, no typed write to dst */
+#endif
+
+#if 1
+        dst += sizeof(npy_clongdouble);  /* contiguous: step by one element size */
+        src += sizeof(npy_cfloat);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: (X && !1) is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else  /* branch taken: the condition above is 0 */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken: both selectors above are 0 */
+
+#  if 1 || 0  /* active: nonzero test yields npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cdouble -> bool cast loop using direct typed loads/stores. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cdouble_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0  /* no dst_value is declared: the result is written straight to dst */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cdouble_to_bool\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* typed load; alignment asserted above */
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);  /* true if either component != 0 */
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0  /* nothing emitted: dst was already written by the cast above */
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;  /* advance by the strides read from strides[] */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* reduces to !NPY_USE_UNALIGNED_ACCESS */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else  /* branch taken: the condition above is 0 */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken: both selectors above are 0 */
+
+#  if 1 || 0  /* active: nonzero test yields npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cdouble -> bool cast loop; element I/O goes through memmove. */
+static NPY_GCC_OPT_3 int
+_cast_cdouble_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* single scalar result per element */
+#endif
+
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cdouble_to_bool\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise load, no typed read of src */
+#endif
+
+/* Do the cast */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);  /* true if either component != 0 */
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store, no typed write to dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;  /* advance by the strides read from strides[] */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: (X && !1) is 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else  /* branch taken: the condition above is 0 */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* branch taken: both selectors above are 0 */
+
+#  if 1 || 0  /* active: nonzero test yields npy_bool */
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cdouble -> bool cast loop using direct typed loads/stores. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cdouble_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte cursors: input, output */
+#if !1  /* compiled out: strides[] is not read here */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0  /* no dst_value is declared: the result is written straight to dst */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cdouble_to_bool\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* typed load; alignment asserted above */
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);  /* true if either component != 0 */
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0  /* nothing emitted: dst was already written by the cast above */
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_bool);  /* contiguous: step by one element size */
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+
+/* For half types, don't use actual double/float types in conversion. That branch is disabled here (0 || 0), so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* type of each value read from src */
+#define _TYPE2 npy_bool  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function. Both half-specific groups are disabled here, so the final #else group defines _CONVERT_FN(x) as ((npy_bool)(x != 0)). */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop without an alignment requirement: each of N = dimensions[0] elements is memmove'd from args[0] into two local npy_double values; 1 is stored at args[1] (via memmove) if either is != 0, else 0. strides is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cdouble_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* disabled: strides are not used by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* the two npy_double values of one source element (presumably real, imaginary -- confirm npy_cdouble layout) */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* local result, copied out with memmove */
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cdouble_to_bool\n");*/
+
+    while (N--) {
+#if 0  /* typed loads disabled; the memmove below is used */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch assigns _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]) to dst_value. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* typed stores disabled; the memmove below is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* step by one element of each type */
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: the && !1 term forces the inner test to 0 */
+
+/* For half types, don't use actual double/float types in conversion. That branch is disabled here (0 || 0), so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* type of each value read from src */
+#define _TYPE2 npy_ubyte  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function. Both half-specific groups are disabled here, so the final #else group defines _CONVERT_FN(x) as ((_TYPE2)x), a plain C cast. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: for each of N = dimensions[0] elements, read two npy_double values from args[0] and store the first, C-cast to npy_ubyte, at args[1]; the second value does not affect the output. Cursors step by strides[0]/strides[1] bytes. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cdouble_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0  /* enabled: per-element byte steps come from strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* the two npy_double values of one source element (presumably real, imaginary -- confirm npy_cdouble layout) */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0  /* neither dst_value form is compiled: results go straight to dst */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cdouble_to_ubyte\n");*/
+
+    while (N--) {
+#if 1  /* typed loads directly from src */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch stores _CONVERT_FN(src_value[0]) directly through dst; src_value[1] is unused. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to copy back: the inner group is disabled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* fixed-size stepping disabled; the stride steps below are used */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+
+/* For half types, don't use actual double/float types in conversion. That branch is disabled here (0 || 0), so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* type of each value read from src */
+#define _TYPE2 npy_ubyte  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function. Both half-specific groups are disabled here, so the final #else group defines _CONVERT_FN(x) as ((_TYPE2)x), a plain C cast. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop without an alignment requirement: each of N = dimensions[0] elements is memmove'd from args[0] into two local npy_double values; the first, C-cast to npy_ubyte, is memmove'd to args[1] and the second does not affect the output. Cursors step by strides[0]/strides[1] bytes. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_cdouble_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0  /* enabled: per-element byte steps come from strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* the two npy_double values of one source element (presumably real, imaginary -- confirm npy_cdouble layout) */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* local result, copied out with memmove */
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cdouble_to_ubyte\n");*/
+
+    while (N--) {
+#if 0  /* typed loads disabled; the memmove below is used */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch assigns _CONVERT_FN(src_value[0]) to dst_value; src_value[1] is unused. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* typed stores disabled; the memmove below is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* fixed-size stepping disabled; the stride steps below are used */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: the && !1 term forces the inner test to 0 */
+
+/* For half types, don't use actual double/float types in conversion. That branch is disabled here (0 || 0), so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* type of each value read from src */
+#define _TYPE2 npy_ubyte  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function. Both half-specific groups are disabled here, so the final #else group defines _CONVERT_FN(x) as ((_TYPE2)x), a plain C cast. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: for each of N = dimensions[0] elements, read two npy_double values from args[0] and store the first, C-cast to npy_ubyte, at args[1]; the second value does not affect the output. strides is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cdouble_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* disabled: strides are not used by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* the two npy_double values of one source element (presumably real, imaginary -- confirm npy_cdouble layout) */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0  /* neither dst_value form is compiled: results go straight to dst */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cdouble_to_ubyte\n");*/
+
+    while (N--) {
+#if 1  /* typed loads directly from src */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch stores _CONVERT_FN(src_value[0]) directly through dst; src_value[1] is unused. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to copy back: the inner group is disabled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* step by one element of each type */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+
+/* For half types, don't use actual double/float types in conversion. That branch is disabled here (0 || 0), so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ubyte
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* type of each value read from src */
+#define _TYPE2 npy_ubyte  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function. Both half-specific groups are disabled here, so the final #else group defines _CONVERT_FN(x) as ((_TYPE2)x), a plain C cast. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop without an alignment requirement: each of N = dimensions[0] elements is memmove'd from args[0] into two local npy_double values; the first, C-cast to npy_ubyte, is memmove'd to args[1] and the second does not affect the output. strides is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cdouble_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* disabled: strides are not used by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* the two npy_double values of one source element (presumably real, imaginary -- confirm npy_cdouble layout) */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* local result, copied out with memmove */
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cdouble_to_ubyte\n");*/
+
+    while (N--) {
+#if 0  /* typed loads disabled; the memmove below is used */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch assigns _CONVERT_FN(src_value[0]) to dst_value; src_value[1] is unused. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* typed stores disabled; the memmove below is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* step by one element of each type */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: the && !1 term forces the inner test to 0 */
+
+/* For half types, don't use actual double/float types in conversion. That branch is disabled here (0 || 0), so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* type of each value read from src */
+#define _TYPE2 npy_ushort  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function. Both half-specific groups are disabled here, so the final #else group defines _CONVERT_FN(x) as ((_TYPE2)x), a plain C cast. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: for each of N = dimensions[0] elements, read two npy_double values from args[0] and store the first, C-cast to npy_ushort, at args[1]; the second value does not affect the output. Cursors step by strides[0]/strides[1] bytes. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cdouble_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0  /* enabled: per-element byte steps come from strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* the two npy_double values of one source element (presumably real, imaginary -- confirm npy_cdouble layout) */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0  /* neither dst_value form is compiled: results go straight to dst */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cdouble_to_ushort\n");*/
+
+    while (N--) {
+#if 1  /* typed loads directly from src */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch stores _CONVERT_FN(src_value[0]) directly through dst; src_value[1] is unused. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to copy back: the inner group is disabled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* fixed-size stepping disabled; the stride steps below are used */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+
+/* For half types, don't use actual double/float types in conversion. That branch is disabled here (0 || 0), so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* type of each value read from src */
+#define _TYPE2 npy_ushort  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function. Both half-specific groups are disabled here, so the final #else group defines _CONVERT_FN(x) as ((_TYPE2)x), a plain C cast. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop without an alignment requirement: each of N = dimensions[0] elements is memmove'd from args[0] into two local npy_double values; the first, C-cast to npy_ushort, is memmove'd to args[1] and the second does not affect the output. Cursors step by strides[0]/strides[1] bytes. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_cdouble_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0  /* enabled: per-element byte steps come from strides */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* the two npy_double values of one source element (presumably real, imaginary -- confirm npy_cdouble layout) */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* local result, copied out with memmove */
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cdouble_to_ushort\n");*/
+
+    while (N--) {
+#if 0  /* typed loads disabled; the memmove below is used */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch assigns _CONVERT_FN(src_value[0]) to dst_value; src_value[1] is unused. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* typed stores disabled; the memmove below is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* fixed-size stepping disabled; the stride steps below are used */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* always true: the && !1 term forces the inner test to 0 */
+
+/* For half types, don't use actual double/float types in conversion. That branch is disabled here (0 || 0), so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* type of each value read from src */
+#define _TYPE2 npy_ushort  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function. Both half-specific groups are disabled here, so the final #else group defines _CONVERT_FN(x) as ((_TYPE2)x), a plain C cast. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: for each of N = dimensions[0] elements, read two npy_double values from args[0] and store the first, C-cast to npy_ushort, at args[1]; the second value does not affect the output. strides is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cdouble_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* disabled: strides are not used by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* the two npy_double values of one source element (presumably real, imaginary -- confirm npy_cdouble layout) */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0  /* neither dst_value form is compiled: results go straight to dst */
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: unless N == 0, src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cdouble_to_ushort\n");*/
+
+    while (N--) {
+#if 1  /* typed loads directly from src */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch stores _CONVERT_FN(src_value[0]) directly through dst; src_value[1] is unused. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* nothing to copy back: the inner group is disabled */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* step by one element of each type */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+
+/* For half types, don't use actual double/float types in conversion. That branch is disabled here (0 || 0), so the #else definitions apply. */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ushort
+#  endif
+
+#else
+
+#define _TYPE1 npy_double  /* type of each value read from src */
+#define _TYPE2 npy_ushort  /* type of the value written to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function. Both half-specific groups are disabled here, so the final #else group defines _CONVERT_FN(x) as ((_TYPE2)x), a plain C cast. */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop without an alignment requirement: each of N = dimensions[0] elements is memmove'd from args[0] into two local npy_double values; the first, C-cast to npy_ushort, is memmove'd to args[1] and the second does not affect the output. strides is not read. Always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cdouble_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* disabled: strides are not used by this variant */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* the two npy_double values of one source element (presumably real, imaginary -- confirm npy_cdouble layout) */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* local result, copied out with memmove */
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cdouble_to_ushort\n");*/
+
+    while (N--) {
+#if 0  /* typed loads disabled; the memmove below is used */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch assigns _CONVERT_FN(src_value[0]) to dst_value; src_value[1] is unused. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* typed stores disabled; the memmove below is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* step by one element of each type */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" is always 0, so this block is compiled unconditionally */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation: the #else defaults below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* active: type of each of the two source components */
+#define _TYPE2 npy_uint /* active: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive: half-as-source conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* inactive: half-as-destination conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: generic conversions */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Aligned, strided loop: converts N npy_cdouble items at src to npy_uint items at dst; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cdouble_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1 /* source element is held as two _TYPE1 components */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* aligned variant: no staging local; dst is written directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* aligned variant: the typed loads/stores below rely on these */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cdouble_to_uint\n");*/
+
+    while (N--) {
+#if 1 /* aligned: read both components through a typed pointer */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1]; /* loaded but not used by the cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1 /* two-component source path; the outer #else path is never compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]); /* active: converts component 0 only (presumably the real part) */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned: nothing is stored here; dst was already written by the cast */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0 /* fixed-size stepping disabled; the byte strides in #else are used */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation: the #else defaults below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* active: type of each of the two source components */
+#define _TYPE2 npy_uint /* active: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive: half-as-source conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* inactive: half-as-destination conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: generic conversions */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* memmove-based strided loop: converts N npy_cdouble items at src to npy_uint items at dst; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_cdouble_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1 /* source element is held as two _TYPE1 components */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* memmove variant: the result is staged in this local */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* memmove variant: alignment asserts are compiled out */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cdouble_to_uint\n");*/
+
+    while (N--) {
+#if 0 /* typed loads disabled; the memmove in #else is used */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* copies both components */
+#endif
+
+/* Do the cast */
+#if 1 /* two-component source path; the outer #else path is never compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]); /* active: converts component 0 only (presumably the real part) */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0 /* typed stores disabled; the memmove in #else is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0 /* fixed-size stepping disabled; the byte strides in #else are used */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" is always 0, so this block is compiled unconditionally */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation: the #else defaults below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* active: type of each of the two source components */
+#define _TYPE2 npy_uint /* active: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive: half-as-source conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* inactive: half-as-destination conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: generic conversions */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Aligned, contiguous loop: converts N npy_cdouble items at src to npy_uint items at dst; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cdouble_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous variant: strides[] is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1 /* source element is held as two _TYPE1 components */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* aligned variant: no staging local; dst is written directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* aligned variant: the typed loads/stores below rely on these */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cdouble_to_uint\n");*/
+
+    while (N--) {
+#if 1 /* aligned: read both components through a typed pointer */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1]; /* loaded but not used by the cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1 /* two-component source path; the outer #else path is never compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]); /* active: converts component 0 only (presumably the real part) */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned: nothing is stored here; dst was already written by the cast */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous: advance by the fixed element sizes */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation: the #else defaults below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* active: type of each of the two source components */
+#define _TYPE2 npy_uint /* active: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive: half-as-source conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* inactive: half-as-destination conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: generic conversions */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* memmove-based contiguous loop: converts N npy_cdouble items at src to npy_uint items at dst; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cdouble_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous variant: strides[] is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1 /* source element is held as two _TYPE1 components */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* memmove variant: the result is staged in this local */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* memmove variant: alignment asserts are compiled out */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cdouble_to_uint\n");*/
+
+    while (N--) {
+#if 0 /* typed loads disabled; the memmove in #else is used */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* copies both components */
+#endif
+
+/* Do the cast */
+#if 1 /* two-component source path; the outer #else path is never compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]); /* active: converts component 0 only (presumably the real part) */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0 /* typed stores disabled; the memmove in #else is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous: advance by the fixed element sizes */
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" is always 0, so this block is compiled unconditionally */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation: the #else defaults below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* active: type of each of the two source components */
+#define _TYPE2 npy_ulong /* active: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive: half-as-source conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* inactive: half-as-destination conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: generic conversions */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Aligned, strided loop: converts N npy_cdouble items at src to npy_ulong items at dst; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cdouble_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1 /* source element is held as two _TYPE1 components */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* aligned variant: no staging local; dst is written directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* aligned variant: the typed loads/stores below rely on these */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cdouble_to_ulong\n");*/
+
+    while (N--) {
+#if 1 /* aligned: read both components through a typed pointer */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1]; /* loaded but not used by the cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1 /* two-component source path; the outer #else path is never compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]); /* active: converts component 0 only (presumably the real part) */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned: nothing is stored here; dst was already written by the cast */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0 /* fixed-size stepping disabled; the byte strides in #else are used */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation: the #else defaults below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* active: type of each of the two source components */
+#define _TYPE2 npy_ulong /* active: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive: half-as-source conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* inactive: half-as-destination conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: generic conversions */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* memmove-based strided loop: converts N npy_cdouble items at src to npy_ulong items at dst; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_cdouble_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1 /* source element is held as two _TYPE1 components */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* memmove variant: the result is staged in this local */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* memmove variant: alignment asserts are compiled out */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cdouble_to_ulong\n");*/
+
+    while (N--) {
+#if 0 /* typed loads disabled; the memmove in #else is used */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* copies both components */
+#endif
+
+/* Do the cast */
+#if 1 /* two-component source path; the outer #else path is never compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]); /* active: converts component 0 only (presumably the real part) */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0 /* typed stores disabled; the memmove in #else is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0 /* fixed-size stepping disabled; the byte strides in #else are used */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" is always 0, so this block is compiled unconditionally */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation: the #else defaults below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* active: type of each of the two source components */
+#define _TYPE2 npy_ulong /* active: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive: half-as-source conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* inactive: half-as-destination conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: generic conversions */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Aligned, contiguous loop: converts N npy_cdouble items at src to npy_ulong items at dst; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cdouble_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous variant: strides[] is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1 /* source element is held as two _TYPE1 components */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* aligned variant: no staging local; dst is written directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* aligned variant: the typed loads/stores below rely on these */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cdouble_to_ulong\n");*/
+
+    while (N--) {
+#if 1 /* aligned: read both components through a typed pointer */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1]; /* loaded but not used by the cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1 /* two-component source path; the outer #else path is never compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]); /* active: converts component 0 only (presumably the real part) */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned: nothing is stored here; dst was already written by the cast */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous: advance by the fixed element sizes */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* compiled only when NPY_USE_UNALIGNED_ACCESS is 0 or undefined */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation: the #else defaults below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* active: type of each of the two source components */
+#define _TYPE2 npy_ulong /* active: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive: half-as-source conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* inactive: half-as-destination conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: generic conversions */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* memmove-based contiguous loop: converts N npy_cdouble items at src to npy_ulong items at dst; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cdouble_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous variant: strides[] is never read */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1 /* source element is held as two _TYPE1 components */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* memmove variant: the result is staged in this local */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* memmove variant: alignment asserts are compiled out */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cdouble_to_ulong\n");*/
+
+    while (N--) {
+#if 0 /* typed loads disabled; the memmove in #else is used */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* copies both components */
+#endif
+
+/* Do the cast */
+#if 1 /* two-component source path; the outer #else path is never compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]); /* active: converts component 0 only (presumably the real part) */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0 /* typed stores disabled; the memmove in #else is used */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous: advance by the fixed element sizes */
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* "&& !1" is always 0, so this block is compiled unconditionally */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* false for this instantiation: the #else defaults below apply */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_double /* active: type of each of the two source components */
+#define _TYPE2 npy_ulonglong /* active: destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0 /* inactive: half-as-source conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0 /* inactive: half-as-destination conversions */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: generic conversions */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Aligned, strided loop: converts N npy_cdouble items at src to npy_ulonglong items at dst; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cdouble_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* element count; counted down by the loop */
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: per-element byte steps come from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1 /* source element is held as two _TYPE1 components */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* aligned variant: no staging local; dst is written directly */
+    _TYPE2 dst_value;
+#endif
+
+#if 1 /* aligned variant: the typed loads/stores below rely on these */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cdouble_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1 /* aligned: read both components through a typed pointer */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1]; /* loaded but not used by the cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast */
+#if 1 /* two-component source path; the outer #else path is never compiled */
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]); /* active: converts component 0 only (presumably the real part) */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* aligned: nothing is stored here; dst was already written by the cast */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0 /* fixed-size stepping disabled; the byte strides in #else are used */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no failure path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant is compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the #else branch sets the types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function; here the last #else yields the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop, no alignment assumed: converts the first npy_double of each source element (presumably the real part) to npy_ulonglong; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_cdouble_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte pointers */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps per element */
+#endif
+    /* Local copies filled and drained with memmove: two _TYPE1 values in, one _TYPE2 value out. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the enclosing #if 0; this variant moves data with memmove instead) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cdouble_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts src_value[0] only; src_value[1] never reaches _CONVERT_FN */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value out bytewise, so dst needs no particular alignment. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard above is always true for this variant: "&& !1" forces the inner expression to 0. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the #else branch sets the types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function; here the last #else yields the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: converts the first npy_double of each source element (presumably the real part) to npy_ulonglong; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cdouble_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte pointers */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Only a two-element source buffer is declared; no dst_value exists because stores go straight to dst. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced as _TYPE1/_TYPE2 pointers below, so both must be suitably aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cdouble_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts src_value[0] only; src_value[1] never reaches _CONVERT_FN */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No store is compiled from the block below; the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by fixed element sizes; strides[] is never read in this variant. */
+#if 1
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant is compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the #else branch sets the types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function; here the last #else yields the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop, no alignment assumed: converts the first npy_double of each source element (presumably the real part) to npy_ulonglong; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cdouble_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte pointers */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Local copies filled and drained with memmove: two _TYPE1 values in, one _TYPE2 value out. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the enclosing #if 0; this variant moves data with memmove instead) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cdouble_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts src_value[0] only; src_value[1] never reaches _CONVERT_FN */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value out bytewise, so dst needs no particular alignment. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by fixed element sizes; strides[] is never read in this variant. */
+#if 1
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard above is always true for this variant: "&& !1" forces the inner expression to 0. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the #else branch sets the types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function; here the last #else yields the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: converts the first npy_double of each source element (presumably the real part) to npy_byte; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cdouble_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte pointers */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps per element */
+#endif
+    /* Only a two-element source buffer is declared; no dst_value exists because stores go straight to dst. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced as _TYPE1/_TYPE2 pointers below, so both must be suitably aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cdouble_to_byte\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts src_value[0] only; src_value[1] never reaches _CONVERT_FN */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No store is compiled from the block below; the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant is compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the #else branch sets the types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function; here the last #else yields the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop, no alignment assumed: converts the first npy_double of each source element (presumably the real part) to npy_byte; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_cdouble_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte pointers */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps per element */
+#endif
+    /* Local copies filled and drained with memmove: two _TYPE1 values in, one _TYPE2 value out. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the enclosing #if 0; this variant moves data with memmove instead) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cdouble_to_byte\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts src_value[0] only; src_value[1] never reaches _CONVERT_FN */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value out bytewise, so dst needs no particular alignment. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard above is always true for this variant: "&& !1" forces the inner expression to 0. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the #else branch sets the types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function; here the last #else yields the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: converts the first npy_double of each source element (presumably the real part) to npy_byte; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cdouble_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte pointers */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Only a two-element source buffer is declared; no dst_value exists because stores go straight to dst. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced as _TYPE1/_TYPE2 pointers below, so both must be suitably aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cdouble_to_byte\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts src_value[0] only; src_value[1] never reaches _CONVERT_FN */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No store is compiled from the block below; the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by fixed element sizes; strides[] is never read in this variant. */
+#if 1
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant is compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the #else branch sets the types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function; here the last #else yields the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop, no alignment assumed: converts the first npy_double of each source element (presumably the real part) to npy_byte; always returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cdouble_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte pointers */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Local copies filled and drained with memmove: two _TYPE1 values in, one _TYPE2 value out. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the enclosing #if 0; this variant moves data with memmove instead) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cdouble_to_byte\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts src_value[0] only; src_value[1] never reaches _CONVERT_FN */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value out bytewise, so dst needs no particular alignment. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance by fixed element sizes; strides[] is never read in this variant. */
+#if 1
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Guard above is always true for this variant: "&& !1" forces the inner expression to 0. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the #else branch sets the types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function; here the last #else yields the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: converts the first npy_double of each source element (presumably the real part) to npy_short; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cdouble_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte pointers */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps per element */
+#endif
+    /* Only a two-element source buffer is declared; no dst_value exists because stores go straight to dst. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst are dereferenced as _TYPE1/_TYPE2 pointers below, so both must be suitably aligned */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cdouble_to_short\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts src_value[0] only; src_value[1] never reaches _CONVERT_FN */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* No store is compiled from the block below; the cast above already wrote through dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant is compiled only when that macro evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (not the case here: the #else branch sets the types) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function; here the last #else yields the plain cast ((_TYPE2)x) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop, no alignment assumed: converts the first npy_double of each source element (presumably the real part) to npy_short; always returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_cdouble_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* input and output byte pointers */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps per element */
+#endif
+    /* Local copies filled and drained with memmove: two _TYPE1 values in, one _TYPE2 value out. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out by the enclosing #if 0; this variant moves data with memmove instead) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cdouble_to_short\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch converts src_value[0] only; src_value[1] never reaches _CONVERT_FN */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Copy the converted value out bytewise, so dst needs no particular alignment. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both pointers by the caller-supplied strides. */
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Constant-false in this instantiation: the fixed-width integer stand-ins below are not selected. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source element type and _TYPE2 the destination element type. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Not selected: branch built on the npy_halfbits_to_* / npy_half_to_float helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Not selected: branch built on the npy_*_to_halfbits / npy_float_to_half helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Selected: the inner test is constant-false, so _CONVERT_FN(x) is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop over aligned data: casts element [0] of each two-element _TYPE1 item to _TYPE2; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cdouble_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: source, destination */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides is not read in this variant; src_value buffers one source item, no dst_value is declared. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Typed pointer access is used below, so src and dst are asserted to be suitably aligned. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cdouble_to_short\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: src_value[0] is converted and stored directly through dst; src_value[1] is not read. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: nothing is emitted here; the cast above already stored into dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step by the fixed item sizes rather than by strides. */
+#if 1
+        dst += sizeof(npy_short);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Constant-false in this instantiation: the fixed-width integer stand-ins below are not selected. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source element type and _TYPE2 the destination element type. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Not selected: branch built on the npy_halfbits_to_* / npy_half_to_float helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Not selected: branch built on the npy_*_to_halfbits / npy_float_to_half helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Selected: the inner test is constant-false, so _CONVERT_FN(x) is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop, no alignment required: casts element [0] of each two-element _TYPE1 item to _TYPE2; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cdouble_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: source, destination */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides is not read in this variant; src_value buffers one source item, dst_value holds the result. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts below are compiled out in this variant. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cdouble_to_short\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is not read. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: the memmove branch is active and copies dst_value into dst byte-wise. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step by the fixed item sizes rather than by strides. */
+#if 1
+        dst += sizeof(npy_short);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Constant-false in this instantiation: the fixed-width integer stand-ins below are not selected. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source element type and _TYPE2 the destination element type. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Not selected: branch built on the npy_halfbits_to_* / npy_half_to_float helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Not selected: branch built on the npy_*_to_halfbits / npy_float_to_half helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Selected: the inner test is constant-false, so _CONVERT_FN(x) is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop over aligned data: casts element [0] of each two-element _TYPE1 item to _TYPE2; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cdouble_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: source, destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte step per item */
+#endif
+    /* Scratch: src_value buffers one source item; no dst_value is declared in this variant. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Typed pointer access is used below, so src and dst are asserted to be suitably aligned. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cdouble_to_int\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: src_value[0] is converted and stored directly through dst; src_value[1] is not read. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: nothing is emitted here; the cast above already stored into dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by their byte strides. */
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Constant-false in this instantiation: the fixed-width integer stand-ins below are not selected. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source element type and _TYPE2 the destination element type. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Not selected: branch built on the npy_halfbits_to_* / npy_half_to_float helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Not selected: branch built on the npy_*_to_halfbits / npy_float_to_half helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Selected: the inner test is constant-false, so _CONVERT_FN(x) is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop, no alignment required: casts element [0] of each two-element _TYPE1 item to _TYPE2; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_cdouble_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: source, destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte step per item */
+#endif
+    /* Scratch: src_value buffers one source item; dst_value holds the converted result. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts below are compiled out in this variant. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cdouble_to_int\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is not read. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: the memmove branch is active and copies dst_value into dst byte-wise. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by their byte strides. */
+#if 0
+        dst += sizeof(npy_int);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Constant-false in this instantiation: the fixed-width integer stand-ins below are not selected. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source element type and _TYPE2 the destination element type. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Not selected: branch built on the npy_halfbits_to_* / npy_half_to_float helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Not selected: branch built on the npy_*_to_halfbits / npy_float_to_half helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Selected: the inner test is constant-false, so _CONVERT_FN(x) is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop over aligned data: casts element [0] of each two-element _TYPE1 item to _TYPE2; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cdouble_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: source, destination */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides is not read in this variant; src_value buffers one source item, no dst_value is declared. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Typed pointer access is used below, so src and dst are asserted to be suitably aligned. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cdouble_to_int\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: src_value[0] is converted and stored directly through dst; src_value[1] is not read. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: nothing is emitted here; the cast above already stored into dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step by the fixed item sizes rather than by strides. */
+#if 1
+        dst += sizeof(npy_int);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Constant-false in this instantiation: the fixed-width integer stand-ins below are not selected. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_int
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source element type and _TYPE2 the destination element type. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_int
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Not selected: branch built on the npy_halfbits_to_* / npy_half_to_float helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Not selected: branch built on the npy_*_to_halfbits / npy_float_to_half helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Selected: the inner test is constant-false, so _CONVERT_FN(x) is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop, no alignment required: casts element [0] of each two-element _TYPE1 item to _TYPE2; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cdouble_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: source, destination */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides is not read in this variant; src_value buffers one source item, dst_value holds the result. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts below are compiled out in this variant. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cdouble_to_int\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is not read. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: the memmove branch is active and copies dst_value into dst byte-wise. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step by the fixed item sizes rather than by strides. */
+#if 1
+        dst += sizeof(npy_int);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Constant-false in this instantiation: the fixed-width integer stand-ins below are not selected. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source element type and _TYPE2 the destination element type. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Not selected: branch built on the npy_halfbits_to_* / npy_half_to_float helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Not selected: branch built on the npy_*_to_halfbits / npy_float_to_half helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Selected: the inner test is constant-false, so _CONVERT_FN(x) is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop over aligned data: casts element [0] of each two-element _TYPE1 item to _TYPE2; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cdouble_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: source, destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte step per item */
+#endif
+    /* Scratch: src_value buffers one source item; no dst_value is declared in this variant. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Typed pointer access is used below, so src and dst are asserted to be suitably aligned. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cdouble_to_long\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: src_value[0] is converted and stored directly through dst; src_value[1] is not read. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: nothing is emitted here; the cast above already stored into dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by their byte strides. */
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Constant-false in this instantiation: the fixed-width integer stand-ins below are not selected. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source element type and _TYPE2 the destination element type. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Not selected: branch built on the npy_halfbits_to_* / npy_half_to_float helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Not selected: branch built on the npy_*_to_halfbits / npy_float_to_half helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Selected: the inner test is constant-false, so _CONVERT_FN(x) is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop, no alignment required: casts element [0] of each two-element _TYPE1 item to _TYPE2; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_cdouble_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: source, destination */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte step per item */
+#endif
+    /* Scratch: src_value buffers one source item; dst_value holds the converted result. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* The alignment asserts below are compiled out in this variant. */
+#if 0
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cdouble_to_long\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is not read. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: the memmove branch is active and copies dst_value into dst byte-wise. */
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Step both pointers by their byte strides. */
+#if 0
+        dst += sizeof(npy_long);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+/* Constant-false in this instantiation: the fixed-width integer stand-ins below are not selected. */
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+/* Active branch: _TYPE1 is the source element type and _TYPE2 the destination element type. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_long
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+/* Not selected: branch built on the npy_halfbits_to_* / npy_half_to_float helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+/* Not selected: branch built on the npy_*_to_halfbits / npy_float_to_half helpers. */
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Selected: the inner test is constant-false, so _CONVERT_FN(x) is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop over aligned data: casts element [0] of each two-element _TYPE1 item to _TYPE2; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cdouble_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of items to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: source, destination */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides is not read in this variant; src_value buffers one source item, no dst_value is declared. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Typed pointer access is used below, so src and dst are asserted to be suitably aligned. */
+#if 1
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cdouble_to_long\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: src_value[0] is converted and stored directly through dst; src_value[1] is not read. */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Write-back: nothing is emitted here; the cast above already stored into dst. */
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step by the fixed item sizes rather than by strides. */
+#if 1
+        dst += sizeof(npy_long);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the plain C types in the #else branch are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else  /* active */
+
+#define _TYPE1 npy_double  /* type of each of the two components read per source element */
+#define _TYPE2 npy_long  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_contig_cast_cdouble_to_long(  /* casts dimensions[0] elements; only the first component of each source pair is converted */
+        PyArrayMethod_Context *context, char *const *args,  /* context is unused; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read in this contiguous variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* inactive: the contiguous variant keeps no stride variables */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* active: holds both components of one source element */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local staging value for the converted result */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* inactive: this variant makes no alignment assumption */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cdouble_to_long\n");*/
+
+    while (N--) {
+#if 0  /* inactive */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy of both components, no alignment required */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is discarded */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0  /* active: convert into the local dst_value */
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* inactive */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* inactive */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise copy of the result into dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: contiguous layout, step by the element sizes */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: !1 is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the plain C types in the #else branch are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* active */
+
+#define _TYPE1 npy_double  /* type of each of the two components read per source element */
+#define _TYPE2 npy_longlong  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_cast_cdouble_to_longlong(  /* casts dimensions[0] elements; only the first component of each source pair is converted */
+        PyArrayMethod_Context *context, char *const *args,  /* context is unused; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides[0] / strides[1] = source / destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* active: strided variant reads the byte steps from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* active: holds both components of one source element */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* inactive: this aligned variant stores through dst directly, so no dst_value exists */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active in this aligned variant */
+   /* sanity check: src and dst must be aligned for _TYPE1 / _TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cdouble_to_longlong\n");*/
+
+    while (N--) {
+#if 1  /* active: direct typed loads of both source components */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* inactive */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is discarded */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else  /* active: convert and write straight into dst */
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* inactive */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* emits nothing: the nested #if 0 is false and dst was already written above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* inactive: fixed element-size stepping is for the contiguous variants */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_cdouble);
+#else  /* active: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the plain C types in the #else branch are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* active */
+
+#define _TYPE1 npy_double  /* type of each of the two components read per source element */
+#define _TYPE2 npy_longlong  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_cdouble_to_longlong(  /* casts dimensions[0] elements; only the first component of each source pair is converted */
+        PyArrayMethod_Context *context, char *const *args,  /* context is unused; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides[0] / strides[1] = source / destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* active: strided variant reads the byte steps from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* active: holds both components of one source element */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local staging value for the converted result */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* inactive: this variant makes no alignment assumption */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cdouble_to_longlong\n");*/
+
+    while (N--) {
+#if 0  /* inactive */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy of both components, no alignment required */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is discarded */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0  /* active: convert into the local dst_value */
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* inactive */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* inactive */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise copy of the result into dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* inactive: fixed element-size stepping is for the contiguous variants */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_cdouble);
+#else  /* active: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: !1 is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the plain C types in the #else branch are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* active */
+
+#define _TYPE1 npy_double  /* type of each of the two components read per source element */
+#define _TYPE2 npy_longlong  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_contig_cast_cdouble_to_longlong(  /* casts dimensions[0] elements; only the first component of each source pair is converted */
+        PyArrayMethod_Context *context, char *const *args,  /* context is unused; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read in this contiguous variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* inactive: the contiguous variant keeps no stride variables */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* active: holds both components of one source element */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* inactive: this aligned variant stores through dst directly, so no dst_value exists */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active in this aligned variant */
+   /* sanity check: src and dst must be aligned for _TYPE1 / _TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cdouble_to_longlong\n");*/
+
+    while (N--) {
+#if 1  /* active: direct typed loads of both source components */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* inactive */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is discarded */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else  /* active: convert and write straight into dst */
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* inactive */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* emits nothing: the nested #if 0 is false and dst was already written above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: contiguous layout, step by the element sizes */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0  /* false: the plain C types in the #else branch are used */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else  /* active */
+
+#define _TYPE1 npy_double  /* type of each of the two components read per source element */
+#define _TYPE2 npy_longlong  /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* active */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else  /* active: plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_contig_cast_cdouble_to_longlong(  /* casts dimensions[0] elements; only the first component of each source pair is converted */
+        PyArrayMethod_Context *context, char *const *args,  /* context is unused; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read in this contiguous variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* inactive: the contiguous variant keeps no stride variables */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* active: holds both components of one source element */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local staging value for the converted result */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* inactive: this variant makes no alignment assumption */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cdouble_to_longlong\n");*/
+
+    while (N--) {
+#if 0  /* inactive */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy of both components, no alignment required */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is discarded */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0  /* active: convert into the local dst_value */
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* inactive */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* inactive */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise copy of the result into dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: contiguous layout, step by the element sizes */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: !1 is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1  /* true: the integer bit-pattern types below are selected */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1  /* active: each source component is handled as a 64-bit pattern */
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* active: destination element type */
+#    define _TYPE2 npy_half
+#  endif
+
+#else  /* inactive */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1  /* active */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1  /* active: conversion is done by npy_doublebits_to_halfbits */
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* inactive */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_cast_cdouble_to_half(  /* casts dimensions[0] elements; only the first component of each source pair is converted */
+        PyArrayMethod_Context *context, char *const *args,  /* context is unused; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides[0] / strides[1] = source / destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* active: strided variant reads the byte steps from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* active: holds both components of one source element */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* inactive: this aligned variant stores through dst directly, so no dst_value exists */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active in this aligned variant */
+   /* sanity check: src and dst must be aligned for _TYPE1 / _TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cdouble_to_half\n");*/
+
+    while (N--) {
+#if 1  /* active: direct typed loads of both source components */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* inactive */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is discarded */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else  /* active: convert and write straight into dst */
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* inactive */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* emits nothing: the nested #if 0 is false and dst was already written above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* inactive: fixed element-size stepping is for the contiguous variants */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_cdouble);
+#else  /* active: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1  /* true: the integer bit-pattern types below are selected */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1  /* active: each source component is handled as a 64-bit pattern */
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* active: destination element type */
+#    define _TYPE2 npy_half
+#  endif
+
+#else  /* inactive */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1  /* active */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1  /* active: conversion is done by npy_doublebits_to_halfbits */
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* inactive */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_cast_cdouble_to_half(  /* casts dimensions[0] elements; only the first component of each source pair is converted */
+        PyArrayMethod_Context *context, char *const *args,  /* context is unused; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides[0] / strides[1] = source / destination byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !0  /* active: strided variant reads the byte steps from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* active: holds both components of one source element */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local staging value for the converted result */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* inactive: this variant makes no alignment assumption */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cdouble_to_half\n");*/
+
+    while (N--) {
+#if 0  /* inactive */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy of both components, no alignment required */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is discarded */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0  /* active: convert into the local dst_value */
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* inactive */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* inactive */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise copy of the result into dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0  /* inactive: fixed element-size stepping is for the contiguous variants */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_cdouble);
+#else  /* active: advance by the caller-supplied byte strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)  /* aligned variant: !1 is 0, so this guard is always true */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1  /* true: the integer bit-pattern types below are selected */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1  /* active: each source component is handled as a 64-bit pattern */
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* active: destination element type */
+#    define _TYPE2 npy_half
+#  endif
+
+#else  /* inactive */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1  /* active */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1  /* active: conversion is done by npy_doublebits_to_halfbits */
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* inactive */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_aligned_contig_cast_cdouble_to_half(  /* casts dimensions[0] elements; only the first component of each source pair is converted */
+        PyArrayMethod_Context *context, char *const *args,  /* context is unused; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read in this contiguous variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* inactive: the contiguous variant keeps no stride variables */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* active: holds both components of one source element */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* inactive: this aligned variant stores through dst directly, so no dst_value exists */
+    _TYPE2 dst_value;
+#endif
+
+#if 1  /* active in this aligned variant */
+   /* sanity check: src and dst must be aligned for _TYPE1 / _TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cdouble_to_half\n");*/
+
+    while (N--) {
+#if 1  /* active: direct typed loads of both source components */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* inactive */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is discarded */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else  /* active: convert and write straight into dst */
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* inactive */
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* emits nothing: the nested #if 0 is false and dst was already written above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: contiguous layout, step by the element sizes */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)  /* unaligned variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1  /* true: the integer bit-pattern types below are selected */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1  /* active: each source component is handled as a 64-bit pattern */
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else  /* active: destination element type */
+#    define _TYPE2 npy_half
+#  endif
+
+#else  /* inactive */
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0  /* inactive */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1  /* active */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1  /* active: conversion is done by npy_doublebits_to_halfbits */
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else  /* inactive */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* always returns 0 */
+_contig_cast_cdouble_to_half(  /* casts dimensions[0] elements; only the first component of each source pair is converted */
+        PyArrayMethod_Context *context, char *const *args,  /* context is unused; args[0] = source, args[1] = destination */
+        const npy_intp *dimensions, const npy_intp *strides,  /* strides is not read in this contiguous variant */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];
+#if !1  /* inactive: the contiguous variant keeps no stride variables */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* active: holds both components of one source element */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: local staging value for the converted result */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* inactive: this variant makes no alignment assumption */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cdouble_to_half\n");*/
+
+    while (N--) {
+#if 0  /* inactive */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else  /* active: byte-wise copy of both components, no alignment required */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted, src_value[1] is discarded */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0  /* active: convert into the local dst_value */
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else  /* inactive */
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0  /* inactive */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else  /* active: byte-wise copy of the result into dst */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: contiguous layout, step by the element sizes */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif  /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: writes (npy_float) of the first component (presumably the real part) of each cdouble; the second component is dropped. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cdouble_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0  /* strided variant: byte steps are read from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* complex source: two _TYPE1 (npy_double) components per element */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* inactive: no dst_value temporary, results go straight to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: non-empty loops require src/dst aligned for _TYPE1/_TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cdouble_to_float\n");*/
+
+    while (N--) {
+#if 1  /* aligned path: direct typed loads */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded, but unused by the active cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement compiled here stores the converted first component directly into dst */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active statement: _CONVERT_FN is a plain cast to _TYPE2 */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* aligned path: nothing left to store, the cast already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_cdouble);
+#else  /* active: advance by the caller-provided strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop with no alignment assumption: converts the first component (presumably the real part) of each cdouble to npy_float; the second component is dropped. */
+static NPY_GCC_OPT_3 int
+_cast_cdouble_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0  /* strided variant: byte steps are read from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* complex source: two _TYPE1 (npy_double) components per element */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: scalar temporary for the converted value */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* disabled: this variant does not check alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cdouble_to_float\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active: copies both components without assuming alignment */
+#endif
+
+/* Do the cast: the only statement compiled here converts the first component into dst_value */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active statement: _CONVERT_FN is a plain cast to _TYPE2 */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active: store without assuming alignment */
+#endif
+
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_cdouble);
+#else  /* active: advance by the caller-provided strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: writes (npy_float) of the first component (presumably the real part) of each cdouble; the second component is dropped. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cdouble_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* contiguous variant: strides[] is not consulted */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* complex source: two _TYPE1 (npy_double) components per element */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* inactive: no dst_value temporary, results go straight to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: non-empty loops require src/dst aligned for _TYPE1/_TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cdouble_to_float\n");*/
+
+    while (N--) {
+#if 1  /* aligned path: direct typed loads */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded, but unused by the active cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement compiled here stores the converted first component directly into dst */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active statement: _CONVERT_FN is a plain cast to _TYPE2 */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* aligned path: nothing left to store, the cast already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: fixed element-size steps for contiguous data */
+        dst += sizeof(npy_float);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop with no alignment assumption: converts the first component (presumably the real part) of each cdouble to npy_float; the second component is dropped. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cdouble_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* contiguous variant: strides[] is not consulted */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* complex source: two _TYPE1 (npy_double) components per element */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: scalar temporary for the converted value */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* disabled: this variant does not check alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cdouble_to_float\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active: copies both components without assuming alignment */
+#endif
+
+/* Do the cast: the only statement compiled here converts the first component into dst_value */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active statement: _CONVERT_FN is a plain cast to _TYPE2 */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active: store without assuming alignment */
+#endif
+
+#if 1  /* active: fixed element-size steps for contiguous data */
+        dst += sizeof(npy_float);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: stores the first component (presumably the real part) of each cdouble as npy_double; the second component is dropped. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cdouble_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0  /* strided variant: byte steps are read from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* complex source: two _TYPE1 (npy_double) components per element */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* inactive: no dst_value temporary, results go straight to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: non-empty loops require src/dst aligned for _TYPE1/_TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cdouble_to_double\n");*/
+
+    while (N--) {
+#if 1  /* aligned path: direct typed loads */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded, but unused by the active cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement compiled here stores the converted first component directly into dst */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active statement: _CONVERT_FN is a plain cast to _TYPE2 */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* aligned path: nothing left to store, the cast already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_cdouble);
+#else  /* active: advance by the caller-provided strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop with no alignment assumption: stores the first component (presumably the real part) of each cdouble as npy_double; the second component is dropped. */
+static NPY_GCC_OPT_3 int
+_cast_cdouble_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0  /* strided variant: byte steps are read from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* complex source: two _TYPE1 (npy_double) components per element */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: scalar temporary for the converted value */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* disabled: this variant does not check alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cdouble_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active: copies both components without assuming alignment */
+#endif
+
+/* Do the cast: the only statement compiled here converts the first component into dst_value */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active statement: _CONVERT_FN is a plain cast to _TYPE2 */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active: store without assuming alignment */
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_cdouble);
+#else  /* active: advance by the caller-provided strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: stores the first component (presumably the real part) of each cdouble as npy_double; the second component is dropped. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cdouble_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* contiguous variant: strides[] is not consulted */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* complex source: two _TYPE1 (npy_double) components per element */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* inactive: no dst_value temporary, results go straight to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: non-empty loops require src/dst aligned for _TYPE1/_TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cdouble_to_double\n");*/
+
+    while (N--) {
+#if 1  /* aligned path: direct typed loads */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded, but unused by the active cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement compiled here stores the converted first component directly into dst */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active statement: _CONVERT_FN is a plain cast to _TYPE2 */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* aligned path: nothing left to store, the cast already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1  /* active: fixed element-size steps for contiguous data */
+        dst += sizeof(npy_double);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop with no alignment assumption: stores the first component (presumably the real part) of each cdouble as npy_double; the second component is dropped. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cdouble_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !1  /* contiguous variant: strides[] is not consulted */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* complex source: two _TYPE1 (npy_double) components per element */
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0  /* active: scalar temporary for the converted value */
+    _TYPE2 dst_value;
+#endif
+
+#if 0  /* disabled: this variant does not check alignment */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cdouble_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* active: copies both components without assuming alignment */
+#endif
+
+/* Do the cast: the only statement compiled here converts the first component into dst_value */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active statement: _CONVERT_FN is a plain cast to _TYPE2 */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* active: store without assuming alignment */
+#endif
+
+#if 1  /* active: fixed element-size steps for contiguous data */
+        dst += sizeof(npy_double);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: writes (npy_longdouble) of the first component (presumably the real part) of each cdouble; the second component is dropped. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cdouble_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte cursors into input and output */
+#if !0  /* strided variant: byte steps are read from strides[] */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1  /* complex source: two _TYPE1 (npy_double) components per element */
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1  /* inactive: no dst_value temporary, results go straight to dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: non-empty loops require src/dst aligned for _TYPE1/_TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cdouble_to_longdouble\n");*/
+
+    while (N--) {
+#if 1  /* aligned path: direct typed loads */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded, but unused by the active cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the only statement compiled here stores the converted first component directly into dst */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active statement: _CONVERT_FN is a plain cast to _TYPE2 */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1  /* aligned path: nothing left to store, the cast already wrote to dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_cdouble);
+#else  /* active: advance by the caller-provided strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Unaligned, strided cast: cdouble -> longdouble.
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant is
+ * compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to false.
+ */
+
+/* Component type read from the source / written to the destination. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_longdouble
+
+/* Neither side is a half type or bool here: a plain C cast is used. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * args[0] / args[1] are the source and destination buffers, dimensions[0]
+ * is the number of elements and strides[0] / strides[1] are the byte steps
+ * added to the source / destination pointer after every element.
+ *
+ * Two _TYPE1 values are copied out of every source element; only the first
+ * one is converted and stored (the second, presumably the imaginary part,
+ * is dropped).  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_cdouble_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 parts[2];
+    _TYPE2 converted;
+
+    for (; remaining != 0; --remaining) {
+        /* Byte-wise copies: no alignment is assumed for either buffer. */
+        memmove(&parts, in_ptr, sizeof(parts));
+        converted = _CONVERT_FN(parts[0]);
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        out_ptr += out_step;
+        in_ptr += in_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, contiguous cast: cdouble -> longdouble.
+ * The guard above is true for either value of NPY_USE_UNALIGNED_ACCESS, so
+ * this variant is always compiled.
+ */
+
+/* Component type read from the source / written to the destination. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_longdouble
+
+/* Neither side is a half type or bool here: a plain C cast is used. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * args[0] / args[1] are the source and destination buffers and
+ * dimensions[0] is the number of elements.  `strides` is not read: the
+ * pointers advance by sizeof(npy_cdouble) and sizeof(npy_longdouble).
+ *
+ * Each source element is read as two _TYPE1 values; only the first one is
+ * converted and stored, the second (presumably the imaginary part) is
+ * loaded but never used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cdouble_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 parts[2];
+
+    /* Alignment is only asserted when there is something to convert. */
+    assert(remaining == 0 || npy_is_aligned(in_ptr, NPY_ALIGNOF(_TYPE1)));
+    assert(remaining == 0 || npy_is_aligned(out_ptr, NPY_ALIGNOF(_TYPE2)));
+
+    for (; remaining != 0; --remaining) {
+        /* Read the complete source element before writing the output. */
+        parts[0] = ((_TYPE1 *)in_ptr)[0];
+        parts[1] = ((_TYPE1 *)in_ptr)[1];
+
+        *(_TYPE2 *)out_ptr = _CONVERT_FN(parts[0]);
+
+        /* Contiguous layout: step by the element sizes. */
+        out_ptr += sizeof(npy_longdouble);
+        in_ptr += sizeof(npy_cdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Unaligned, contiguous cast: cdouble -> longdouble.
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant is
+ * compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to false.
+ */
+
+/* Component type read from the source / written to the destination. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_longdouble
+
+/* Neither side is a half type or bool here: a plain C cast is used. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * args[0] / args[1] are the source and destination buffers and
+ * dimensions[0] is the number of elements.  `strides` is not read: the
+ * pointers advance by sizeof(npy_cdouble) and sizeof(npy_longdouble).
+ *
+ * Two _TYPE1 values are copied out of every source element; only the first
+ * one is converted and stored (the second, presumably the imaginary part,
+ * is dropped).  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_cdouble_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 parts[2];
+    _TYPE2 converted;
+
+    for (; remaining != 0; --remaining) {
+        /* Byte-wise copies: no alignment is assumed for either buffer. */
+        memmove(&parts, in_ptr, sizeof(parts));
+        converted = _CONVERT_FN(parts[0]);
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        /* Contiguous layout: step by the element sizes. */
+        out_ptr += sizeof(npy_longdouble);
+        in_ptr += sizeof(npy_cdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, strided cast: cdouble -> cfloat.
+ * The guard above is true for either value of NPY_USE_UNALIGNED_ACCESS, so
+ * this variant is always compiled.
+ */
+
+/* Component type read from the source / written to the destination. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_float
+
+/* Neither side is a half type or bool here: a plain C cast is used. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * args[0] / args[1] are the source and destination buffers, dimensions[0]
+ * is the number of elements and strides[0] / strides[1] are the byte steps
+ * added to the source / destination pointer after every element.
+ *
+ * Every element is handled as a pair of components: both _TYPE1 values are
+ * loaded, both are converted with _CONVERT_FN, and both results are stored.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cdouble_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 parts[2];
+    _TYPE2 converted[2];
+
+    /* Alignment is only asserted when there is something to convert. */
+    assert(remaining == 0 || npy_is_aligned(in_ptr, NPY_ALIGNOF(_TYPE1)));
+    assert(remaining == 0 || npy_is_aligned(out_ptr, NPY_ALIGNOF(_TYPE2)));
+
+    for (; remaining != 0; --remaining) {
+        /* Read the complete source element before writing the output. */
+        parts[0] = ((_TYPE1 *)in_ptr)[0];
+        parts[1] = ((_TYPE1 *)in_ptr)[1];
+
+        converted[0] = _CONVERT_FN(parts[0]);
+        converted[1] = _CONVERT_FN(parts[1]);
+
+        ((_TYPE2 *)out_ptr)[0] = converted[0];
+        ((_TYPE2 *)out_ptr)[1] = converted[1];
+
+        out_ptr += out_step;
+        in_ptr += in_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Unaligned, strided cast: cdouble -> cfloat.
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant is
+ * compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to false.
+ */
+
+/* Component type read from the source / written to the destination. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_float
+
+/* Neither side is a half type or bool here: a plain C cast is used. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * args[0] / args[1] are the source and destination buffers, dimensions[0]
+ * is the number of elements and strides[0] / strides[1] are the byte steps
+ * added to the source / destination pointer after every element.
+ *
+ * Every element is handled as a pair of components: both _TYPE1 values are
+ * copied in, both are converted with _CONVERT_FN, and the converted pair is
+ * copied out.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_cdouble_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 parts[2];
+    _TYPE2 converted[2];
+
+    for (; remaining != 0; --remaining) {
+        /* Byte-wise copies: no alignment is assumed for either buffer. */
+        memmove(&parts, in_ptr, sizeof(parts));
+
+        converted[0] = _CONVERT_FN(parts[0]);
+        converted[1] = _CONVERT_FN(parts[1]);
+
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        out_ptr += out_step;
+        in_ptr += in_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, contiguous cast: cdouble -> cfloat.
+ * The guard above is true for either value of NPY_USE_UNALIGNED_ACCESS, so
+ * this variant is always compiled.
+ */
+
+/* Component type read from the source / written to the destination. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_float
+
+/* Neither side is a half type or bool here: a plain C cast is used. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * args[0] / args[1] are the source and destination buffers and
+ * dimensions[0] is the number of elements.  `strides` is not read: the
+ * pointers advance by sizeof(npy_cdouble) and sizeof(npy_cfloat).
+ *
+ * Every element is handled as a pair of components: both _TYPE1 values are
+ * loaded, both are converted with _CONVERT_FN, and both results are stored.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cdouble_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 parts[2];
+    _TYPE2 converted[2];
+
+    /* Alignment is only asserted when there is something to convert. */
+    assert(remaining == 0 || npy_is_aligned(in_ptr, NPY_ALIGNOF(_TYPE1)));
+    assert(remaining == 0 || npy_is_aligned(out_ptr, NPY_ALIGNOF(_TYPE2)));
+
+    for (; remaining != 0; --remaining) {
+        /* Read the complete source element before writing the output. */
+        parts[0] = ((_TYPE1 *)in_ptr)[0];
+        parts[1] = ((_TYPE1 *)in_ptr)[1];
+
+        converted[0] = _CONVERT_FN(parts[0]);
+        converted[1] = _CONVERT_FN(parts[1]);
+
+        ((_TYPE2 *)out_ptr)[0] = converted[0];
+        ((_TYPE2 *)out_ptr)[1] = converted[1];
+
+        /* Contiguous layout: step by the element sizes. */
+        out_ptr += sizeof(npy_cfloat);
+        in_ptr += sizeof(npy_cdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Unaligned, contiguous cast: cdouble -> cfloat.
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant is
+ * compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to false.
+ */
+
+/* Component type read from the source / written to the destination. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_float
+
+/* Neither side is a half type or bool here: a plain C cast is used. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * args[0] / args[1] are the source and destination buffers and
+ * dimensions[0] is the number of elements.  `strides` is not read: the
+ * pointers advance by sizeof(npy_cdouble) and sizeof(npy_cfloat).
+ *
+ * Every element is handled as a pair of components: both _TYPE1 values are
+ * copied in, both are converted with _CONVERT_FN, and the converted pair is
+ * copied out.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_cdouble_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 parts[2];
+    _TYPE2 converted[2];
+
+    for (; remaining != 0; --remaining) {
+        /* Byte-wise copies: no alignment is assumed for either buffer. */
+        memmove(&parts, in_ptr, sizeof(parts));
+
+        converted[0] = _CONVERT_FN(parts[0]);
+        converted[1] = _CONVERT_FN(parts[1]);
+
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        /* Contiguous layout: step by the element sizes. */
+        out_ptr += sizeof(npy_cfloat);
+        in_ptr += sizeof(npy_cdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, strided cast: cdouble -> cdouble.
+ * The guard above is true for either value of NPY_USE_UNALIGNED_ACCESS, so
+ * this variant is always compiled.
+ */
+
+/* Source and destination components share the same type here. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_double
+
+/* Neither side is a half type or bool here: a plain C cast is used. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * args[0] / args[1] are the source and destination buffers, dimensions[0]
+ * is the number of elements and strides[0] / strides[1] are the byte steps
+ * added to the source / destination pointer after every element.
+ *
+ * Every element is handled as a pair of components: both _TYPE1 values are
+ * loaded, passed through _CONVERT_FN (a cast to the same type), and stored.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cdouble_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 parts[2];
+    _TYPE2 converted[2];
+
+    /* Alignment is only asserted when there is something to convert. */
+    assert(remaining == 0 || npy_is_aligned(in_ptr, NPY_ALIGNOF(_TYPE1)));
+    assert(remaining == 0 || npy_is_aligned(out_ptr, NPY_ALIGNOF(_TYPE2)));
+
+    for (; remaining != 0; --remaining) {
+        /* Read the complete source element before writing the output. */
+        parts[0] = ((_TYPE1 *)in_ptr)[0];
+        parts[1] = ((_TYPE1 *)in_ptr)[1];
+
+        converted[0] = _CONVERT_FN(parts[0]);
+        converted[1] = _CONVERT_FN(parts[1]);
+
+        ((_TYPE2 *)out_ptr)[0] = converted[0];
+        ((_TYPE2 *)out_ptr)[1] = converted[1];
+
+        out_ptr += out_step;
+        in_ptr += in_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Unaligned, strided cast: cdouble -> cdouble.
+ * The guard above reduces to !NPY_USE_UNALIGNED_ACCESS, so this variant is
+ * compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to false.
+ */
+
+/* Source and destination components share the same type here. */
+#define _TYPE1 npy_double
+#define _TYPE2 npy_double
+
+/* Neither side is a half type or bool here: a plain C cast is used. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * args[0] / args[1] are the source and destination buffers, dimensions[0]
+ * is the number of elements and strides[0] / strides[1] are the byte steps
+ * added to the source / destination pointer after every element.
+ *
+ * Every element is handled as a pair of components: both _TYPE1 values are
+ * copied in, passed through _CONVERT_FN (a cast to the same type), and the
+ * resulting pair is copied out.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_cdouble_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 parts[2];
+    _TYPE2 converted[2];
+
+    for (; remaining != 0; --remaining) {
+        /* Byte-wise copies: no alignment is assumed for either buffer. */
+        memmove(&parts, in_ptr, sizeof(parts));
+
+        converted[0] = _CONVERT_FN(parts[0]);
+        converted[1] = _CONVERT_FN(parts[1]);
+
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        out_ptr += out_step;
+        in_ptr += in_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous aligned cast loop: converts dimensions[0] elements of two npy_double components from args[0] to args[1]. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cdouble_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides is not read in this variant (#if !1 above); scratch buffers hold two components each. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cdouble_to_cdouble\n");*/
+    /* Per element: typed load, convert, typed store, advance by sizeof(npy_cdouble); always returns 0. */
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch applies _CONVERT_FN to each of the two components */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cast loop: converts dimensions[0] elements of two npy_double components from args[0] to args[1] via memmove copies. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cdouble_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides is not read in this variant (#if !1 above); scratch buffers hold two components each. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out here: the enclosing condition is #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cdouble_to_cdouble\n");*/
+    /* Per element: memmove load, convert, memmove store, advance by sizeof(npy_cdouble); always returns 0. */
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch applies _CONVERT_FN to each of the two components */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided aligned cast loop: converts dimensions[0] elements of two npy_double components into two npy_longdouble components. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_cdouble_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* One-element scratch buffers: two _TYPE1 source components and two _TYPE2 destination components. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_cdouble_to_clongdouble\n");*/
+    /* Per element: typed load, convert, typed store, advance by src_stride/dst_stride; always returns 0. */
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch applies _CONVERT_FN to each of the two components */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cast loop: converts dimensions[0] elements of two npy_double components into two npy_longdouble components via memmove copies. */
+static NPY_GCC_OPT_3 int
+_cast_cdouble_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* One-element scratch buffers: two _TYPE1 source components and two _TYPE2 destination components. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out here: the enclosing condition is #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_cdouble_to_clongdouble\n");*/
+    /* Per element: memmove load, convert, memmove store, advance by src_stride/dst_stride; always returns 0. */
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch applies _CONVERT_FN to each of the two components */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous aligned cast loop: converts dimensions[0] elements of two npy_double components into two npy_longdouble components. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_cdouble_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides is not read in this variant (#if !1 above); scratch buffers hold two components each. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_cdouble_to_clongdouble\n");*/
+    /* Per element: typed load, convert, typed store, advance src/dst by their element sizes; always returns 0. */
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch applies _CONVERT_FN to each of the two components */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 1
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_double
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_double
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous cast loop: converts dimensions[0] elements of two npy_double components into two npy_longdouble components via memmove copies. */
+static NPY_GCC_OPT_3 int
+_contig_cast_cdouble_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides is not read in this variant (#if !1 above); scratch buffers hold two components each. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out here: the enclosing condition is #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_cdouble_to_clongdouble\n");*/
+    /* Per element: memmove load, convert, memmove store, advance src/dst by their element sizes; always returns 0. */
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch applies _CONVERT_FN to each of the two components */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_cdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+
+
+#line 739
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided aligned cast loop: reduces each of dimensions[0] two-component npy_longdouble elements to one npy_bool. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_clongdouble_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Only the source scratch (two components) is declared; the result is written straight through dst. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_clongdouble_to_bool\n");*/
+    /* Per element: typed load, convert directly into dst, advance by src_stride/dst_stride; always returns 0. */
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch writes 1 to dst when either source component is nonzero, else 0 */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided cast loop: reduces each of dimensions[0] two-component npy_longdouble elements to one npy_bool via memmove copies. */
+static NPY_GCC_OPT_3 int
+_cast_clongdouble_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Scratch: two source components in src_value[2] and a single scalar result in dst_value. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out here: the enclosing condition is #if 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_clongdouble_to_bool\n");*/
+    /* Per element: memmove load, convert, memmove store, advance by src_stride/dst_stride; always returns 0. */
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch sets dst_value to 1 when either source component is nonzero, else 0 */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_bool
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_bool
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 1
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 1 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous aligned cast loop: reduces each of dimensions[0] two-component npy_longdouble elements to one npy_bool. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_clongdouble_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* strides is not read here (#if !1 above); only the two-component source scratch is declared. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src/dst must be aligned for _TYPE1/_TYPE2 unless N == 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_clongdouble_to_bool\n");*/
+    /* Per element: typed load, convert directly into dst, advance src/dst by their element sizes; always returns 0. */
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: the active branch writes 1 to dst when either source component is nonzero, else 0 */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 1
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 1
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * This variant is compiled only when NPY_USE_UNALIGNED_ACCESS is zero.
+ * _TYPE1 is the type of one source component, _TYPE2 the destination
+ * type, and _CONVERT_FN maps a component to "is non-zero".
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_bool
+#define _CONVERT_FN(x) ((npy_bool)(x != 0))
+
+/*
+ * Contiguous cast loop: complex long double -> bool, with no alignment
+ * checks.
+ *
+ * args[0] is the source buffer, args[1] the destination and
+ * dimensions[0] the number of elements.  `context`, `strides` and the
+ * aux data are not used.  Each element is moved through local
+ * temporaries with memmove; the result is true when either of the two
+ * source components is non-zero.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_clongdouble_to_bool(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 src_value[2];
+    _TYPE2 dst_value;
+
+    for (; N != 0; --N) {
+        /* Byte-wise copy in, convert, byte-wise copy out. */
+        memmove(&src_value, src, sizeof(src_value));
+        dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+        memmove(dst, &dst_value, sizeof(dst_value));
+
+        /* Packed layout: step by the element sizes. */
+        dst += sizeof(npy_bool);
+        src += sizeof(npy_clongdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * The guard above is true for any numeric NPY_USE_UNALIGNED_ACCESS.
+ * _TYPE1 is the type of one source component, _TYPE2 the destination
+ * type, and _CONVERT_FN is a plain C cast to _TYPE2.
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ubyte
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided, aligned cast loop: complex long double -> ubyte.
+ *
+ * args[0] is the source buffer, args[1] the destination and
+ * dimensions[0] the number of elements; strides[0] and strides[1] are
+ * the byte steps applied to the source and destination pointers.
+ * `context` and the aux data are not used.  Only component 0 of each
+ * source element is converted; component 1 is loaded but not used.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_clongdouble_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp src_stride = strides[0];
+    npy_intp dst_stride = strides[1];
+    _TYPE1 src_value[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 *parts = (const _TYPE1 *)src;
+
+        /* Both components are loaded before the destination is written. */
+        src_value[0] = parts[0];
+        src_value[1] = parts[1];
+        *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+
+        dst += dst_stride;
+        src += src_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * This variant is compiled only when NPY_USE_UNALIGNED_ACCESS is zero.
+ * _TYPE1 is the type of one source component, _TYPE2 the destination
+ * type, and _CONVERT_FN is a plain C cast to _TYPE2.
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ubyte
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast loop: complex long double -> ubyte, with no alignment
+ * checks.
+ *
+ * args[0] is the source buffer, args[1] the destination and
+ * dimensions[0] the number of elements; strides[0] and strides[1] are
+ * the byte steps applied to the source and destination pointers.
+ * `context` and the aux data are not used.  Each element is moved
+ * through local temporaries with memmove and only component 0 of the
+ * source is converted.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_clongdouble_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp src_stride = strides[0];
+    npy_intp dst_stride = strides[1];
+    _TYPE1 src_value[2];
+    _TYPE2 dst_value;
+
+    for (; N != 0; --N) {
+        /* Byte-wise copy in, convert, byte-wise copy out. */
+        memmove(&src_value, src, sizeof(src_value));
+        dst_value = _CONVERT_FN(src_value[0]);
+        memmove(dst, &dst_value, sizeof(dst_value));
+
+        dst += dst_stride;
+        src += src_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * The guard above is true for any numeric NPY_USE_UNALIGNED_ACCESS.
+ * _TYPE1 is the type of one source component, _TYPE2 the destination
+ * type, and _CONVERT_FN is a plain C cast to _TYPE2.
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ubyte
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Contiguous, aligned cast loop: complex long double -> ubyte.
+ *
+ * args[0] is the source buffer, args[1] the destination and
+ * dimensions[0] the number of elements.  `context`, `strides` and the
+ * aux data are not used.  Only component 0 of each source element is
+ * converted; component 1 is loaded but not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_clongdouble_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 src_value[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 *parts = (const _TYPE1 *)src;
+
+        /* Both components are loaded before the destination is written. */
+        src_value[0] = parts[0];
+        src_value[1] = parts[1];
+        *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+
+        /* Packed layout: step by the element sizes. */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_clongdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * This variant is compiled only when NPY_USE_UNALIGNED_ACCESS is zero.
+ * _TYPE1 is the type of one source component, _TYPE2 the destination
+ * type, and _CONVERT_FN is a plain C cast to _TYPE2.
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ubyte
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Contiguous cast loop: complex long double -> ubyte, with no alignment
+ * checks.
+ *
+ * args[0] is the source buffer, args[1] the destination and
+ * dimensions[0] the number of elements.  `context`, `strides` and the
+ * aux data are not used.  Each element is moved through local
+ * temporaries with memmove and only component 0 of the source is
+ * converted.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_clongdouble_to_ubyte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 src_value[2];
+    _TYPE2 dst_value;
+
+    for (; N != 0; --N) {
+        /* Byte-wise copy in, convert, byte-wise copy out. */
+        memmove(&src_value, src, sizeof(src_value));
+        dst_value = _CONVERT_FN(src_value[0]);
+        memmove(dst, &dst_value, sizeof(dst_value));
+
+        /* Packed layout: step by the element sizes. */
+        dst += sizeof(npy_ubyte);
+        src += sizeof(npy_clongdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * The guard above is true for any numeric NPY_USE_UNALIGNED_ACCESS.
+ * _TYPE1 is the type of one source component, _TYPE2 the destination
+ * type, and _CONVERT_FN is a plain C cast to _TYPE2.
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ushort
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided, aligned cast loop: complex long double -> ushort.
+ *
+ * args[0] is the source buffer, args[1] the destination and
+ * dimensions[0] the number of elements; strides[0] and strides[1] are
+ * the byte steps applied to the source and destination pointers.
+ * `context` and the aux data are not used.  Only component 0 of each
+ * source element is converted; component 1 is loaded but not used.
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_clongdouble_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp src_stride = strides[0];
+    npy_intp dst_stride = strides[1];
+    _TYPE1 src_value[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 *parts = (const _TYPE1 *)src;
+
+        /* Both components are loaded before the destination is written. */
+        src_value[0] = parts[0];
+        src_value[1] = parts[1];
+        *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+
+        dst += dst_stride;
+        src += src_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * This variant is compiled only when NPY_USE_UNALIGNED_ACCESS is zero.
+ * _TYPE1 is the type of one source component, _TYPE2 the destination
+ * type, and _CONVERT_FN is a plain C cast to _TYPE2.
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ushort
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Strided cast loop: complex long double -> ushort, with no alignment
+ * checks.
+ *
+ * args[0] is the source buffer, args[1] the destination and
+ * dimensions[0] the number of elements; strides[0] and strides[1] are
+ * the byte steps applied to the source and destination pointers.
+ * `context` and the aux data are not used.  Each element is moved
+ * through local temporaries with memmove and only component 0 of the
+ * source is converted.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_clongdouble_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp src_stride = strides[0];
+    npy_intp dst_stride = strides[1];
+    _TYPE1 src_value[2];
+    _TYPE2 dst_value;
+
+    for (; N != 0; --N) {
+        /* Byte-wise copy in, convert, byte-wise copy out. */
+        memmove(&src_value, src, sizeof(src_value));
+        dst_value = _CONVERT_FN(src_value[0]);
+        memmove(dst, &dst_value, sizeof(dst_value));
+
+        dst += dst_stride;
+        src += src_stride;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * The guard above is true for any numeric NPY_USE_UNALIGNED_ACCESS.
+ * _TYPE1 is the type of one source component, _TYPE2 the destination
+ * type, and _CONVERT_FN is a plain C cast to _TYPE2.
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ushort
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Contiguous, aligned cast loop: complex long double -> ushort.
+ *
+ * args[0] is the source buffer, args[1] the destination and
+ * dimensions[0] the number of elements.  `context`, `strides` and the
+ * aux data are not used.  Only component 0 of each source element is
+ * converted; component 1 is loaded but not used.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_clongdouble_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 src_value[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 *parts = (const _TYPE1 *)src;
+
+        /* Both components are loaded before the destination is written. */
+        src_value[0] = parts[0];
+        src_value[1] = parts[1];
+        *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+
+        /* Packed layout: step by the element sizes. */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_clongdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * This variant is compiled only when NPY_USE_UNALIGNED_ACCESS is zero.
+ * _TYPE1 is the type of one source component, _TYPE2 the destination
+ * type, and _CONVERT_FN is a plain C cast to _TYPE2.
+ */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ushort
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Contiguous cast loop: complex long double -> ushort, with no
+ * alignment checks.
+ *
+ * args[0] is the source buffer, args[1] the destination and
+ * dimensions[0] the number of elements.  `context`, `strides` and the
+ * aux data are not used.  Each element is moved through local
+ * temporaries with memmove and only component 0 of the source is
+ * converted.  Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_clongdouble_to_ushort(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0];
+    char *dst = args[1];
+    _TYPE1 src_value[2];
+    _TYPE2 dst_value;
+
+    for (; N != 0; --N) {
+        /* Byte-wise copy in, convert, byte-wise copy out. */
+        memmove(&src_value, src, sizeof(src_value));
+        dst_value = _CONVERT_FN(src_value[0]);
+        memmove(dst, &dst_value, sizeof(dst_value));
+
+        /* Packed layout: step by the element sizes. */
+        dst += sizeof(npy_ushort);
+        src += sizeof(npy_clongdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: (NPY_USE_UNALIGNED_ACCESS && !1) is always 0, so this section is always compiled. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the guard below is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final else branch: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: stores (npy_uint) of component [0] of each clongdouble element; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_clongdouble_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps per element */
+#endif
+    /* Note: context is not referenced anywhere in this function. */
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No dst_value temporary is compiled in here: the result is stored directly through dst. */
+#if 1
+   /* sanity check: both pointers must be suitably aligned for the typed accesses below */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_clongdouble_to_uint\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded, but not used by the active cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (this instantiation converts only src_value[0], presumably the real part; src_value[1] is dropped) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: direct typed store */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;  /* strided: advance by the caller-supplied byte steps */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no error path */
+}
+/* Remove the per-instantiation macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned-capable variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the guard below is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final else branch: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop with no alignment checks (memmove in/out): npy_uint from component [0] of each element; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_clongdouble_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps per element */
+#endif
+    /* Note: context is not referenced anywhere in this function. */
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* converted result, copied out with memmove below */
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_clongdouble_to_uint\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy; no typed load from src */
+#endif
+
+/* Do the cast (this instantiation converts only src_value[0], presumably the real part; src_value[1] is dropped) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy; no typed store to dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_uint);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;  /* strided: advance by the caller-supplied byte steps */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no error path */
+}
+/* Remove the per-instantiation macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: (NPY_USE_UNALIGNED_ACCESS && !1) is always 0, so this section is always compiled. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the guard below is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final else branch: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: stores (npy_uint) of component [0] of each clongdouble element; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_clongdouble_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: input, output */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Note: context and strides are not referenced by the code compiled for this contiguous variant. */
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No dst_value temporary is compiled in here: the result is stored directly through dst. */
+#if 1
+   /* sanity check: both pointers must be suitably aligned for the typed accesses below */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_clongdouble_to_uint\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded, but not used by the active cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (this instantiation converts only src_value[0], presumably the real part; src_value[1] is dropped) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: direct typed store */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_uint);  /* contiguous: step by one output element */
+        src += sizeof(npy_clongdouble);  /* contiguous: step by one input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no error path */
+}
+/* Remove the per-instantiation macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned-capable variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the guard below is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_uint
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_uint
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final else branch: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop with no alignment checks (memmove in/out): npy_uint from component [0] of each element; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_clongdouble_to_uint(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: input, output */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Note: context and strides are not referenced by the code compiled for this contiguous variant. */
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* converted result, copied out with memmove below */
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_clongdouble_to_uint\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy; no typed load from src */
+#endif
+
+/* Do the cast (this instantiation converts only src_value[0], presumably the real part; src_value[1] is dropped) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy; no typed store to dst */
+#endif
+
+#if 1
+        dst += sizeof(npy_uint);  /* contiguous: step by one output element */
+        src += sizeof(npy_clongdouble);  /* contiguous: step by one input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no error path */
+}
+/* Remove the per-instantiation macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: (NPY_USE_UNALIGNED_ACCESS && !1) is always 0, so this section is always compiled. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the guard below is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final else branch: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: stores (npy_ulong) of component [0] of each clongdouble element; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_clongdouble_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps per element */
+#endif
+    /* Note: context is not referenced anywhere in this function. */
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No dst_value temporary is compiled in here: the result is stored directly through dst. */
+#if 1
+   /* sanity check: both pointers must be suitably aligned for the typed accesses below */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_clongdouble_to_ulong\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded, but not used by the active cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (this instantiation converts only src_value[0], presumably the real part; src_value[1] is dropped) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: direct typed store */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;  /* strided: advance by the caller-supplied byte steps */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no error path */
+}
+/* Remove the per-instantiation macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned-capable variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the guard below is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final else branch: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop with no alignment checks (memmove in/out): npy_ulong from component [0] of each element; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_clongdouble_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps per element */
+#endif
+    /* Note: context is not referenced anywhere in this function. */
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* converted result, copied out with memmove below */
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_clongdouble_to_ulong\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy; no typed load from src */
+#endif
+
+/* Do the cast (this instantiation converts only src_value[0], presumably the real part; src_value[1] is dropped) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy; no typed store to dst */
+#endif
+
+#if 0
+        dst += sizeof(npy_ulong);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;  /* strided: advance by the caller-supplied byte steps */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no error path */
+}
+/* Remove the per-instantiation macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: (NPY_USE_UNALIGNED_ACCESS && !1) is always 0, so this section is always compiled. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the guard below is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final else branch: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned contiguous loop: stores (npy_ulong) of component [0] of each clongdouble element; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_clongdouble_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: input, output */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Note: context and strides are not referenced by the code compiled for this contiguous variant. */
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No dst_value temporary is compiled in here: the result is stored directly through dst. */
+#if 1
+   /* sanity check: both pointers must be suitably aligned for the typed accesses below */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_clongdouble_to_ulong\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded, but not used by the active cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (this instantiation converts only src_value[0], presumably the real part; src_value[1] is dropped) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: direct typed store */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_ulong);  /* contiguous: step by one output element */
+        src += sizeof(npy_clongdouble);  /* contiguous: step by one input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no error path */
+}
+/* Remove the per-instantiation macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+/* Unaligned-capable variant: compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the guard below is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ulong
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final else branch: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop with no alignment checks (memmove in/out): npy_ulong from component [0] of each element; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_clongdouble_to_ulong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: input, output */
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Note: context and strides are not referenced by the code compiled for this contiguous variant. */
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* converted result, copied out with memmove below */
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_clongdouble_to_ulong\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte copy; no typed load from src */
+#endif
+
+/* Do the cast (this instantiation converts only src_value[0], presumably the real part; src_value[1] is dropped) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte copy; no typed store to dst */
+#endif
+
+#if 1
+        dst += sizeof(npy_ulong);  /* contiguous: step by one output element */
+        src += sizeof(npy_clongdouble);  /* contiguous: step by one input element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no error path */
+}
+/* Remove the per-instantiation macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+/* Aligned variant: (NPY_USE_UNALIGNED_ACCESS && !1) is always 0, so this section is always compiled. */
+/* For half types, don't use actual double/float types in conversion (inactive here: the guard below is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function (here the final else branch: a plain C cast to _TYPE2) */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Aligned strided loop: stores (npy_ulonglong) of component [0] of each clongdouble element; always returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_clongdouble_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to convert */
+    char *src = args[0], *dst = args[1];  /* byte pointers: input, output */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps per element */
+#endif
+    /* Note: context is not referenced anywhere in this function. */
+#if 1
+    _TYPE1 src_value[2];  /* both components of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* No dst_value temporary is compiled in here: the result is stored directly through dst. */
+#if 1
+   /* sanity check: both pointers must be suitably aligned for the typed accesses below */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_clongdouble_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded, but not used by the active cast below */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast (this instantiation converts only src_value[0], presumably the real part; src_value[1] is dropped) */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: direct typed store */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;  /* strided: advance by the caller-supplied byte steps */
+        src += src_stride;
+#endif
+    }
+    return 0;  /* this loop has no error path */
+}
+/* Remove the per-instantiation macros; the next generated loop defines its own. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* equivalent to !NPY_USE_UNALIGNED_ACCESS, since !0 is 1 */
+
+/* For half types, don't use actual double/float types in conversion (not taken here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else /* active: _TYPE1 is the source component type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: both conditions above are the literal 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop, no alignment assumed: casts the first npy_longdouble of each 2-component source element to npy_ulonglong. */
+static NPY_GCC_OPT_3 int
+_cast_clongdouble_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count: the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps added after each element */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* scratch copy of both components of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: a single scalar result is staged here */
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+    /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_clongdouble_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: copy the element's bytes into the scratch array */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is never used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0 /* active: the result is staged in dst_value */
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else /* active */
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: copy the staged result into the destination bytes */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_clongdouble);
+#else /* active: advance both cursors by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only exit path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true, since !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion (not taken here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else /* active: _TYPE1 is the source component type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: both conditions above are the literal 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop for aligned data: casts the first npy_longdouble of each 2-component source element to npy_ulonglong. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_clongdouble_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count: the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !1 /* disabled: strides is not read here, the steps are fixed sizes */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* local copy of both components of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif /* neither branch is active: this variant declares no dst_value */
+
+#if 1
+    /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_clongdouble_to_ulonglong\n");*/
+
+    while (N--) {
+#if 1
+#  if 1 /* active: load both components through a typed pointer */
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is loaded but never used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else /* active: the result is written straight to dst */
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else /* active */
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* emits nothing: the inner block is disabled and dst was already written above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: step by one destination and one source element size */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only exit path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* equivalent to !NPY_USE_UNALIGNED_ACCESS, since !0 is 1 */
+
+/* For half types, don't use actual double/float types in conversion (not taken here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_ulonglong
+#  endif
+
+#else /* active: _TYPE1 is the source component type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_ulonglong
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: both conditions above are the literal 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop, no alignment assumed: casts the first npy_longdouble of each 2-component source element to npy_ulonglong. */
+static NPY_GCC_OPT_3 int
+_contig_cast_clongdouble_to_ulonglong(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count: the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !1 /* disabled: strides is not read here, the steps are fixed sizes */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* scratch copy of both components of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: a single scalar result is staged here */
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+    /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_clongdouble_to_ulonglong\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: copy the element's bytes into the scratch array */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is never used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0 /* active: the result is staged in dst_value */
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else /* active */
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: copy the staged result into the destination bytes */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: step by one destination and one source element size */
+        dst += sizeof(npy_ulonglong);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only exit path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true, since !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion (not taken here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else /* active: _TYPE1 is the source component type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: both conditions above are the literal 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop for aligned data: casts the first npy_longdouble of each 2-component source element to npy_byte. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_clongdouble_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count: the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps added after each element */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* local copy of both components of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif /* neither branch is active: this variant declares no dst_value */
+
+#if 1
+    /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_clongdouble_to_byte\n");*/
+
+    while (N--) {
+#if 1
+#  if 1 /* active: load both components through a typed pointer */
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is loaded but never used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else /* active: the result is written straight to dst */
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else /* active */
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* emits nothing: the inner block is disabled and dst was already written above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_clongdouble);
+#else /* active: advance both cursors by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only exit path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* equivalent to !NPY_USE_UNALIGNED_ACCESS, since !0 is 1 */
+
+/* For half types, don't use actual double/float types in conversion (not taken here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else /* active: _TYPE1 is the source component type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: both conditions above are the literal 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop, no alignment assumed: casts the first npy_longdouble of each 2-component source element to npy_byte. */
+static NPY_GCC_OPT_3 int
+_cast_clongdouble_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count: the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps added after each element */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* scratch copy of both components of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: a single scalar result is staged here */
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+    /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_clongdouble_to_byte\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: copy the element's bytes into the scratch array */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is never used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0 /* active: the result is staged in dst_value */
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else /* active */
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: copy the staged result into the destination bytes */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_clongdouble);
+#else /* active: advance both cursors by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only exit path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true, since !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion (not taken here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else /* active: _TYPE1 is the source component type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: both conditions above are the literal 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop for aligned data: casts the first npy_longdouble of each 2-component source element to npy_byte. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_clongdouble_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count: the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !1 /* disabled: strides is not read here, the steps are fixed sizes */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* local copy of both components of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif /* neither branch is active: this variant declares no dst_value */
+
+#if 1
+    /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_clongdouble_to_byte\n");*/
+
+    while (N--) {
+#if 1
+#  if 1 /* active: load both components through a typed pointer */
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is loaded but never used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else /* active: the result is written straight to dst */
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else /* active */
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* emits nothing: the inner block is disabled and dst was already written above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: step by one destination and one source element size */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only exit path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* equivalent to !NPY_USE_UNALIGNED_ACCESS, since !0 is 1 */
+
+/* For half types, don't use actual double/float types in conversion (not taken here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_byte
+#  endif
+
+#else /* active: _TYPE1 is the source component type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_byte
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: both conditions above are the literal 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Contiguous loop, no alignment assumed: casts the first npy_longdouble of each 2-component source element to npy_byte. */
+static NPY_GCC_OPT_3 int
+_contig_cast_clongdouble_to_byte(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count: the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !1 /* disabled: strides is not read here, the steps are fixed sizes */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* scratch copy of both components of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: a single scalar result is staged here */
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+    /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_clongdouble_to_byte\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: copy the element's bytes into the scratch array */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is never used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0 /* active: the result is staged in dst_value */
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else /* active */
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: copy the staged result into the destination bytes */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* active: step by one destination and one source element size */
+        dst += sizeof(npy_byte);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only exit path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true, since !1 is 0 */
+
+/* For half types, don't use actual double/float types in conversion (not taken here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else /* active: _TYPE1 is the source component type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: both conditions above are the literal 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop for aligned data: casts the first npy_longdouble of each 2-component source element to npy_short. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_clongdouble_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count: the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps added after each element */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* local copy of both components of one source element */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif /* neither branch is active: this variant declares no dst_value */
+
+#if 1
+    /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 unless N is 0 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_clongdouble_to_short\n");*/
+
+    while (N--) {
+#if 1
+#  if 1 /* active: load both components through a typed pointer */
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is loaded but never used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else /* active: the result is written straight to dst */
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else /* active */
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* emits nothing: the inner block is disabled and dst was already written above */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_clongdouble);
+#else /* active: advance both cursors by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only exit path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* equivalent to !NPY_USE_UNALIGNED_ACCESS, since !0 is 1 */
+
+/* For half types, don't use actual double/float types in conversion (not taken here: the condition is 0 || 0) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_short
+#  endif
+
+#else /* active: _TYPE1 is the source component type, _TYPE2 the destination element type */
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_short
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else /* active: both conditions above are the literal 0 */
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else /* active: _CONVERT_FN is a plain C cast to _TYPE2 */
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+/* Strided loop, no alignment assumed: casts the first npy_longdouble of each 2-component source element to npy_short. */
+static NPY_GCC_OPT_3 int
+_cast_clongdouble_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* element count: the loop below runs N times */
+    char *src = args[0], *dst = args[1];  /* byte cursors: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* byte steps added after each element */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* scratch copy of both components of one source element */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: a single scalar result is staged here */
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+    /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_clongdouble_to_short\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else /* active: copy the element's bytes into the scratch array */
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] (presumably the real part) is converted; src_value[1] is never used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0 /* active: the result is staged in dst_value */
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else /* active */
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else /* active: copy the staged result into the destination bytes */
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_short);
+        src += sizeof(npy_clongdouble);
+#else /* active: advance both cursors by the caller-supplied strides */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the only exit path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, contiguous variant.  Because `!1` is 0, the guard above is
+ * true whatever NPY_USE_UNALIGNED_ACCESS is.
+ */
+
+/* Component type read from the source and scalar type written out. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_short
+
+/* The conversion is a plain C cast to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast dimensions[0] elements from args[0] to args[1].  The pointers
+ * advance by sizeof(npy_clongdouble) and sizeof(npy_short); the strides
+ * argument is not read.
+ *
+ * Each input element is read as two _TYPE1 components (presumably the
+ * real and imaginary parts -- confirm against npy_clongdouble); only
+ * component 0 is converted and stored, component 1 is loaded but unused.
+ * Both buffers are accessed through typed pointers, and the asserts check
+ * their alignment whenever there is at least one element.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_clongdouble_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    _TYPE1 parts[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 *in = (const _TYPE1 *)src;
+
+        parts[0] = in[0];
+        parts[1] = in[1];
+
+        /* Do the cast: only the first component is kept. */
+        ((_TYPE2 *)dst)[0] = _CONVERT_FN(parts[0]);
+
+        dst += sizeof(npy_short);
+        src += sizeof(npy_clongdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Unaligned, contiguous variant.  The guard above reduces to
+ * !NPY_USE_UNALIGNED_ACCESS, so this loop is compiled only when that
+ * macro is zero (or undefined).
+ */
+
+/* Component type read from the source and scalar type written out. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_short
+
+/* The conversion is a plain C cast to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast dimensions[0] elements from args[0] to args[1].  The pointers
+ * advance by sizeof(npy_clongdouble) and sizeof(npy_short); the strides
+ * argument is not read.
+ *
+ * Each input element is copied out as two _TYPE1 components (presumably
+ * the real and imaginary parts -- confirm against npy_clongdouble); only
+ * component 0 is converted, component 1 is dropped.  memmove() is used
+ * for both the load and the store, so neither buffer is dereferenced
+ * through a typed pointer.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_clongdouble_to_short(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 parts[2];
+    _TYPE2 converted;
+
+    for (; remaining != 0; --remaining) {
+        /* Pull both components into a local buffer. */
+        memmove(&parts, in_ptr, sizeof(parts));
+
+        /* Do the cast: only the first component is kept. */
+        converted = _CONVERT_FN(parts[0]);
+
+        /* Write the scalar result back byte-wise. */
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        out_ptr += sizeof(npy_short);
+        in_ptr += sizeof(npy_clongdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, strided variant.  Because `!1` is 0, the guard above is true
+ * whatever NPY_USE_UNALIGNED_ACCESS is.
+ */
+
+/* Component type read from the source and scalar type written out. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_int
+
+/* The conversion is a plain C cast to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast dimensions[0] elements from args[0] to args[1], stepping the
+ * pointers by strides[0] and strides[1] bytes respectively.
+ *
+ * Each input element is read as two _TYPE1 components (presumably the
+ * real and imaginary parts -- confirm against npy_clongdouble); only
+ * component 0 is converted and stored, component 1 is loaded but unused.
+ * Both buffers are accessed through typed pointers, and the asserts check
+ * their alignment whenever there is at least one element.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_clongdouble_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp N = dimensions[0];
+    _TYPE1 parts[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 *in = (const _TYPE1 *)src;
+
+        parts[0] = in[0];
+        parts[1] = in[1];
+
+        /* Do the cast: only the first component is kept. */
+        ((_TYPE2 *)dst)[0] = _CONVERT_FN(parts[0]);
+
+        dst += out_step;
+        src += in_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Unaligned, strided variant.  The guard above reduces to
+ * !NPY_USE_UNALIGNED_ACCESS, so this loop is compiled only when that
+ * macro is zero (or undefined).
+ */
+
+/* Component type read from the source and scalar type written out. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_int
+
+/* The conversion is a plain C cast to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast dimensions[0] elements from args[0] to args[1], stepping the
+ * pointers by strides[0] and strides[1] bytes respectively.
+ *
+ * Each input element is copied out as two _TYPE1 components (presumably
+ * the real and imaginary parts -- confirm against npy_clongdouble); only
+ * component 0 is converted, component 1 is dropped.  memmove() is used
+ * for both the load and the store, so neither buffer is dereferenced
+ * through a typed pointer.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_clongdouble_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 parts[2];
+    _TYPE2 converted;
+
+    for (; remaining != 0; --remaining) {
+        /* Pull both components into a local buffer. */
+        memmove(&parts, in_ptr, sizeof(parts));
+
+        /* Do the cast: only the first component is kept. */
+        converted = _CONVERT_FN(parts[0]);
+
+        /* Write the scalar result back byte-wise. */
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        out_ptr += out_step;
+        in_ptr += in_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, contiguous variant.  Because `!1` is 0, the guard above is
+ * true whatever NPY_USE_UNALIGNED_ACCESS is.
+ */
+
+/* Component type read from the source and scalar type written out. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_int
+
+/* The conversion is a plain C cast to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast dimensions[0] elements from args[0] to args[1].  The pointers
+ * advance by sizeof(npy_clongdouble) and sizeof(npy_int); the strides
+ * argument is not read.
+ *
+ * Each input element is read as two _TYPE1 components (presumably the
+ * real and imaginary parts -- confirm against npy_clongdouble); only
+ * component 0 is converted and stored, component 1 is loaded but unused.
+ * Both buffers are accessed through typed pointers, and the asserts check
+ * their alignment whenever there is at least one element.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_clongdouble_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    _TYPE1 parts[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 *in = (const _TYPE1 *)src;
+
+        parts[0] = in[0];
+        parts[1] = in[1];
+
+        /* Do the cast: only the first component is kept. */
+        ((_TYPE2 *)dst)[0] = _CONVERT_FN(parts[0]);
+
+        dst += sizeof(npy_int);
+        src += sizeof(npy_clongdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Unaligned, contiguous variant.  The guard above reduces to
+ * !NPY_USE_UNALIGNED_ACCESS, so this loop is compiled only when that
+ * macro is zero (or undefined).
+ */
+
+/* Component type read from the source and scalar type written out. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_int
+
+/* The conversion is a plain C cast to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast dimensions[0] elements from args[0] to args[1].  The pointers
+ * advance by sizeof(npy_clongdouble) and sizeof(npy_int); the strides
+ * argument is not read.
+ *
+ * Each input element is copied out as two _TYPE1 components (presumably
+ * the real and imaginary parts -- confirm against npy_clongdouble); only
+ * component 0 is converted, component 1 is dropped.  memmove() is used
+ * for both the load and the store, so neither buffer is dereferenced
+ * through a typed pointer.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_contig_cast_clongdouble_to_int(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 parts[2];
+    _TYPE2 converted;
+
+    for (; remaining != 0; --remaining) {
+        /* Pull both components into a local buffer. */
+        memmove(&parts, in_ptr, sizeof(parts));
+
+        /* Do the cast: only the first component is kept. */
+        converted = _CONVERT_FN(parts[0]);
+
+        /* Write the scalar result back byte-wise. */
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        out_ptr += sizeof(npy_int);
+        in_ptr += sizeof(npy_clongdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, strided variant.  Because `!1` is 0, the guard above is true
+ * whatever NPY_USE_UNALIGNED_ACCESS is.
+ */
+
+/* Component type read from the source and scalar type written out. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_long
+
+/* The conversion is a plain C cast to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast dimensions[0] elements from args[0] to args[1], stepping the
+ * pointers by strides[0] and strides[1] bytes respectively.
+ *
+ * Each input element is read as two _TYPE1 components (presumably the
+ * real and imaginary parts -- confirm against npy_clongdouble); only
+ * component 0 is converted and stored, component 1 is loaded but unused.
+ * Both buffers are accessed through typed pointers, and the asserts check
+ * their alignment whenever there is at least one element.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_cast_clongdouble_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp N = dimensions[0];
+    _TYPE1 parts[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 *in = (const _TYPE1 *)src;
+
+        parts[0] = in[0];
+        parts[1] = in[1];
+
+        /* Do the cast: only the first component is kept. */
+        ((_TYPE2 *)dst)[0] = _CONVERT_FN(parts[0]);
+
+        dst += out_step;
+        src += in_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/*
+ * Unaligned, strided variant.  The guard above reduces to
+ * !NPY_USE_UNALIGNED_ACCESS, so this loop is compiled only when that
+ * macro is zero (or undefined).
+ */
+
+/* Component type read from the source and scalar type written out. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_long
+
+/* The conversion is a plain C cast to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast dimensions[0] elements from args[0] to args[1], stepping the
+ * pointers by strides[0] and strides[1] bytes respectively.
+ *
+ * Each input element is copied out as two _TYPE1 components (presumably
+ * the real and imaginary parts -- confirm against npy_clongdouble); only
+ * component 0 is converted, component 1 is dropped.  memmove() is used
+ * for both the load and the store, so neither buffer is dereferenced
+ * through a typed pointer.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_cast_clongdouble_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *in_ptr = args[0];
+    char *out_ptr = args[1];
+    const npy_intp in_step = strides[0];
+    const npy_intp out_step = strides[1];
+    npy_intp remaining = dimensions[0];
+    _TYPE1 parts[2];
+    _TYPE2 converted;
+
+    for (; remaining != 0; --remaining) {
+        /* Pull both components into a local buffer. */
+        memmove(&parts, in_ptr, sizeof(parts));
+
+        /* Do the cast: only the first component is kept. */
+        converted = _CONVERT_FN(parts[0]);
+
+        /* Write the scalar result back byte-wise. */
+        memmove(out_ptr, &converted, sizeof(converted));
+
+        out_ptr += out_step;
+        in_ptr += in_step;
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/*
+ * Aligned, contiguous variant.  Because `!1` is 0, the guard above is
+ * true whatever NPY_USE_UNALIGNED_ACCESS is.
+ */
+
+/* Component type read from the source and scalar type written out. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_long
+
+/* The conversion is a plain C cast to the output type. */
+#define _CONVERT_FN(x) ((_TYPE2)x)
+
+/*
+ * Cast dimensions[0] elements from args[0] to args[1].  The pointers
+ * advance by sizeof(npy_clongdouble) and sizeof(npy_long); the strides
+ * argument is not read.
+ *
+ * Each input element is read as two _TYPE1 components (presumably the
+ * real and imaginary parts -- confirm against npy_clongdouble); only
+ * component 0 is converted and stored, component 1 is loaded but unused.
+ * Both buffers are accessed through typed pointers, and the asserts check
+ * their alignment whenever there is at least one element.
+ *
+ * Always returns 0.
+ */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_clongdouble_to_long(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    char *src = args[0];
+    char *dst = args[1];
+    npy_intp N = dimensions[0];
+    _TYPE1 parts[2];
+
+    /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+
+    for (; N != 0; --N) {
+        const _TYPE1 *in = (const _TYPE1 *)src;
+
+        parts[0] = in[0];
+        parts[1] = in[1];
+
+        /* Do the cast: only the first component is kept. */
+        ((_TYPE2 *)dst)[0] = _CONVERT_FN(parts[0]);
+
+        dst += sizeof(npy_long);
+        src += sizeof(npy_clongdouble);
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* no half type involved here: the #else branch below is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_long
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble /* type of one component of the complex source */
+#define _TYPE2 npy_long /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active definition: plain C cast */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* contiguous cast loop: npy_clongdouble -> npy_long, any alignment */
+_contig_cast_clongdouble_to_long(
+        PyArrayMethod_Context *context, char *const *args, /* args: {src, dst} buffers; context is not read */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: count; strides not read */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous variant: no stride locals */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2]; /* local copy of both source components */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: the result is staged here, then copied out with memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_clongdouble_to_long\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* active: byte-wise load, no alignment needed */
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is not used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]); /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* active: byte-wise store of the staged result */
+#endif
+
+#if 1 /* contiguous: advance by the element sizes */
+        dst += sizeof(npy_long);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true: this variant is compiled unconditionally */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* no half type involved here: the #else branch below is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble /* type of one component of the complex source */
+#define _TYPE2 npy_longlong /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active definition: plain C cast */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* strided cast loop: npy_clongdouble -> npy_longlong, aligned pointers */
+_aligned_cast_clongdouble_to_longlong(
+        PyArrayMethod_Context *context, char *const *args, /* args: {src, dst} buffers; context is not read */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: count; strides: {src, dst} byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: per-element byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2]; /* local copy of both source components */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* inactive: the result is written straight through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: pointers must be aligned for the typed accesses below */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_clongdouble_to_longlong\n");*/
+
+    while (N--) {
+#if 1 /* aligned: direct typed loads */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is not used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]); /* active branch */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* nothing to copy out: the cast already stored into dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_clongdouble);
+#else /* strided: advance by the caller-supplied byte steps */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* no half type involved here: the #else branch below is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble /* type of one component of the complex source */
+#define _TYPE2 npy_longlong /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active definition: plain C cast */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* strided cast loop: npy_clongdouble -> npy_longlong, any alignment */
+_cast_clongdouble_to_longlong(
+        PyArrayMethod_Context *context, char *const *args, /* args: {src, dst} buffers; context is not read */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: count; strides: {src, dst} byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: per-element byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2]; /* local copy of both source components */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: the result is staged here, then copied out with memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_clongdouble_to_longlong\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* active: byte-wise load, no alignment needed */
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is not used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]); /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* active: byte-wise store of the staged result */
+#endif
+
+#if 0
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_clongdouble);
+#else /* strided: advance by the caller-supplied byte steps */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true: this variant is compiled unconditionally */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* no half type involved here: the #else branch below is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble /* type of one component of the complex source */
+#define _TYPE2 npy_longlong /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active definition: plain C cast */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* contiguous cast loop: npy_clongdouble -> npy_longlong, aligned pointers */
+_aligned_contig_cast_clongdouble_to_longlong(
+        PyArrayMethod_Context *context, char *const *args, /* args: {src, dst} buffers; context is not read */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: count; strides not read */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous variant: no stride locals */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2]; /* local copy of both source components */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* inactive: the result is written straight through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: pointers must be aligned for the typed accesses below */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_clongdouble_to_longlong\n");*/
+
+    while (N--) {
+#if 1 /* aligned: direct typed loads */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is not used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]); /* active branch */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* nothing to copy out: the cast already stored into dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous: advance by the element sizes */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0 /* no half type involved here: the #else branch below is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longlong
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble /* type of one component of the complex source */
+#define _TYPE2 npy_longlong /* destination element type */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x) /* active definition: plain C cast */
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* contiguous cast loop: npy_clongdouble -> npy_longlong, any alignment */
+_contig_cast_clongdouble_to_longlong(
+        PyArrayMethod_Context *context, char *const *args, /* args: {src, dst} buffers; context is not read */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: count; strides not read */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous variant: no stride locals */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2]; /* local copy of both source components */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: the result is staged here, then copied out with memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_clongdouble_to_longlong\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* active: byte-wise load, no alignment needed */
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is not used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]); /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* active: byte-wise store of the staged result */
+#endif
+
+#if 1 /* contiguous: advance by the element sizes */
+        dst += sizeof(npy_longlong);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true: this variant is compiled unconditionally */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1 /* a half type is involved: this branch is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble /* type of one component of the complex source */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half /* destination element type */
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1 /* active group: conversions that produce a half */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x) /* active definition: cast to float, then npy_float_to_half */
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* strided cast loop: npy_clongdouble -> npy_half, aligned pointers */
+_aligned_cast_clongdouble_to_half(
+        PyArrayMethod_Context *context, char *const *args, /* args: {src, dst} buffers; context is not read */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: count; strides: {src, dst} byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: per-element byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2]; /* local copy of both source components */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* inactive: the result is written straight through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: pointers must be aligned for the typed accesses below */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_clongdouble_to_half\n");*/
+
+    while (N--) {
+#if 1 /* aligned: direct typed loads */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is not used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]); /* active branch */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* nothing to copy out: the cast already stored into dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_clongdouble);
+#else /* strided: advance by the caller-supplied byte steps */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1 /* a half type is involved: this branch is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble /* type of one component of the complex source */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half /* destination element type */
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1 /* active group: conversions that produce a half */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x) /* active definition: cast to float, then npy_float_to_half */
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* strided cast loop: npy_clongdouble -> npy_half, any alignment */
+_cast_clongdouble_to_half(
+        PyArrayMethod_Context *context, char *const *args, /* args: {src, dst} buffers; context is not read */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: count; strides: {src, dst} byte steps */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !0 /* strided variant: per-element byte steps */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2]; /* local copy of both source components */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: the result is staged here, then copied out with memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_clongdouble_to_half\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* active: byte-wise load, no alignment needed */
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is not used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]); /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* active: byte-wise store of the staged result */
+#endif
+
+#if 0
+        dst += sizeof(npy_half);
+        src += sizeof(npy_clongdouble);
+#else /* strided: advance by the caller-supplied byte steps */
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1) /* always true: this variant is compiled unconditionally */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1 /* a half type is involved: this branch is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble /* type of one component of the complex source */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half /* destination element type */
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1 /* active group: conversions that produce a half */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x) /* active definition: cast to float, then npy_float_to_half */
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* contiguous cast loop: npy_clongdouble -> npy_half, aligned pointers */
+_aligned_contig_cast_clongdouble_to_half(
+        PyArrayMethod_Context *context, char *const *args, /* args: {src, dst} buffers; context is not read */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: count; strides not read */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous variant: no stride locals */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2]; /* local copy of both source components */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1 /* inactive: the result is written straight through dst */
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: pointers must be aligned for the typed accesses below */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_clongdouble_to_half\n");*/
+
+    while (N--) {
+#if 1 /* aligned: direct typed loads */
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is not used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]); /* active branch */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1 /* nothing to copy out: the cast already stored into dst */
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1 /* contiguous: advance by the element sizes */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !1) */
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0) /* compiled only when NPY_USE_UNALIGNED_ACCESS evaluates to 0 */
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 1 /* a half type is involved: this branch is active */
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble /* type of one component of the complex source */
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_half /* destination element type */
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_half
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 1
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 1 /* active group: conversions that produce a half */
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x) /* active definition: cast to float, then npy_float_to_half */
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int /* contiguous cast loop: npy_clongdouble -> npy_half, any alignment */
+_contig_cast_clongdouble_to_half(
+        PyArrayMethod_Context *context, char *const *args, /* args: {src, dst} buffers; context is not read */
+        const npy_intp *dimensions, const npy_intp *strides, /* dimensions[0]: count; strides not read */
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0]; /* elements left to convert */
+    char *src = args[0], *dst = args[1];
+#if !1 /* contiguous variant: no stride locals */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2]; /* local copy of both source components */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0 /* active: the result is staged here, then copied out with memmove */
+    _TYPE2 dst_value;
+#endif
+
+#if 0 /* alignment asserts are compiled out in this variant */
+   /* sanity check */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_clongdouble_to_half\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value)); /* active: byte-wise load, no alignment needed */
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is not used */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]); /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value)); /* active: byte-wise store of the staged result */
+#endif
+
+#if 1 /* contiguous: advance by the element sizes */
+        dst += sizeof(npy_half);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0; /* no error path in this loop */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif /* !(NPY_USE_UNALIGNED_ACCESS && !0) */
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble  /* active definition: component type read from src */
+#define _TYPE2 npy_float  /* active definition: type stored to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Strided loop over dimensions[0] items using direct typed loads/stores (alignment asserted); returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_clongdouble_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* two _TYPE1 components per source item */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_clongdouble_to_float\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* converted below */
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded but not used by the active cast branch */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: store directly to dst */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;  /* strides are byte offsets (char * arithmetic) */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble  /* active definition: component type read from src */
+#define _TYPE2 npy_float  /* active definition: type stored to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Strided loop over dimensions[0] items; data moves through memmove'd temporaries; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_clongdouble_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* two _TYPE1 components per source item */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* temporary for the converted value */
+#endif
+
+#if 0
+   /* sanity check (compiled out by the #if 0 above: no alignment is asserted here) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_clongdouble_to_float\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both components */
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the converted value */
+#endif
+
+#if 0
+        dst += sizeof(npy_float);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;  /* strides are byte offsets (char * arithmetic) */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble  /* active definition: component type read from src */
+#define _TYPE2 npy_float  /* active definition: type stored to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Element-size-stepped loop over dimensions[0] items; typed loads/stores, alignment asserted; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_clongdouble_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* two _TYPE1 components per source item */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_clongdouble_to_float\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* converted below */
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded but not used by the active cast branch */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: store directly to dst */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_float);  /* advance one destination element */
+        src += sizeof(npy_clongdouble);  /* advance one source element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble  /* active definition: component type read from src */
+#define _TYPE2 npy_float  /* active definition: type stored to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Element-size-stepped loop over dimensions[0] items; data moves through memmove'd temporaries; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_clongdouble_to_float(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* two _TYPE1 components per source item */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* temporary for the converted value */
+#endif
+
+#if 0
+   /* sanity check (compiled out by the #if 0 above: no alignment is asserted here) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_clongdouble_to_float\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both components */
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the converted value */
+#endif
+
+#if 1
+        dst += sizeof(npy_float);  /* advance one destination element */
+        src += sizeof(npy_clongdouble);  /* advance one source element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble  /* active definition: component type read from src */
+#define _TYPE2 npy_double  /* active definition: type stored to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Strided loop over dimensions[0] items using direct typed loads/stores (alignment asserted); returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_clongdouble_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* two _TYPE1 components per source item */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_clongdouble_to_double\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* converted below */
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded but not used by the active cast branch */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: store directly to dst */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;  /* strides are byte offsets (char * arithmetic) */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble  /* active definition: component type read from src */
+#define _TYPE2 npy_double  /* active definition: type stored to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Strided loop over dimensions[0] items; data moves through memmove'd temporaries; returns 0. */
+static NPY_GCC_OPT_3 int
+_cast_clongdouble_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* two _TYPE1 components per source item */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* temporary for the converted value */
+#endif
+
+#if 0
+   /* sanity check (compiled out by the #if 0 above: no alignment is asserted here) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_clongdouble_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both components */
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the converted value */
+#endif
+
+#if 0
+        dst += sizeof(npy_double);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;  /* strides are byte offsets (char * arithmetic) */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble  /* active definition: component type read from src */
+#define _TYPE2 npy_double  /* active definition: type stored to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Element-size-stepped loop over dimensions[0] items; typed loads/stores, alignment asserted; returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_clongdouble_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* two _TYPE1 components per source item */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_clongdouble_to_double\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* converted below */
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded but not used by the active cast branch */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: store directly to dst */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_double);  /* advance one destination element */
+        src += sizeof(npy_clongdouble);  /* advance one source element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble  /* active definition: component type read from src */
+#define _TYPE2 npy_double  /* active definition: type stored to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Element-size-stepped loop over dimensions[0] items; data moves through memmove'd temporaries; returns 0. */
+static NPY_GCC_OPT_3 int
+_contig_cast_clongdouble_to_double(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* two _TYPE1 components per source item */
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;  /* temporary for the converted value */
+#endif
+
+#if 0
+   /* sanity check (compiled out by the #if 0 above: no alignment is asserted here) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_clongdouble_to_double\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both components */
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);  /* active branch */
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of the converted value */
+#endif
+
+#if 1
+        dst += sizeof(npy_double);  /* advance one destination element */
+        src += sizeof(npy_clongdouble);  /* advance one source element */
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble  /* active definition: component type read from src */
+#define _TYPE2 npy_longdouble  /* active definition: type stored to dst */
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)  /* active definition: plain C cast to _TYPE2 */
+#  endif
+
+#endif
+/* Strided loop over dimensions[0] items using direct typed loads/stores (alignment asserted); returns 0. */
+static NPY_GCC_OPT_3 int
+_aligned_cast_clongdouble_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];  /* two _TYPE1 components per source item */
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for _TYPE1 and _TYPE2 */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_clongdouble_to_longdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* converted below */
+        src_value[1] = ((_TYPE1 *)src)[1];  /* loaded but not used by the active cast branch */
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: only src_value[0] is converted; src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);  /* active branch: store directly to dst */
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;  /* strides are byte offsets (char * arithmetic) */
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Strided loop, no alignment assumed: clongdouble -> longdouble, keeping only the first component. */
+_cast_clongdouble_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte pointers: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* added to the char pointers, i.e. byte steps */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant, which goes through memmove instead of typed access) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_clongdouble_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both components; src may be unaligned */
+#endif
+
+/* Do the cast: only src_value[0] is converted (presumably the real part); src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store; dst may be unaligned */
+#endif
+
+#if 0
+        dst += sizeof(npy_longdouble);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Contiguous loop on aligned data: clongdouble -> longdouble, keeping only the first component. */
+_aligned_contig_cast_clongdouble_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte pointers: args[0] is read, args[1] is written */
+#if !1  /* strides[] is not read here: the pointers advance by sizeof() below */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for their component types (vacuous when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_clongdouble_to_longdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* typed read straight from src; needs src to be aligned */
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: src_value[0] is converted and stored directly in dst (presumably the real part); src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_longdouble);  /* fixed element-size steps for both pointers */
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Contiguous loop, no alignment assumed: clongdouble -> longdouble, keeping only the first component. */
+_contig_cast_clongdouble_to_longdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte pointers: args[0] is read, args[1] is written */
+#if !1  /* strides[] is not read here: the pointers advance by sizeof() below */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 0
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant, which goes through memmove instead of typed access) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_clongdouble_to_longdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both components; src may be unaligned */
+#endif
+
+/* Do the cast: only src_value[0] is converted (presumably the real part); src_value[1] is dropped */
+#if 1
+#  if 0
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 0
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 0
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store; dst may be unaligned */
+#endif
+
+#if 1
+        dst += sizeof(npy_longdouble);  /* fixed element-size steps for both pointers */
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Strided loop on aligned data: clongdouble -> cfloat, converting both components. */
+_aligned_cast_clongdouble_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte pointers: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* added to the char pointers, i.e. byte steps */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for their component types (vacuous when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_clongdouble_to_cfloat\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* typed read straight from src; needs src to be aligned */
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: each of the two components goes through _CONVERT_FN separately */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* typed write straight to dst; needs dst to be aligned */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Strided loop, no alignment assumed: clongdouble -> cfloat, converting both components. */
+_cast_clongdouble_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte pointers: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* added to the char pointers, i.e. byte steps */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant, which goes through memmove instead of typed access) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_clongdouble_to_cfloat\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both components; src may be unaligned */
+#endif
+
+/* Do the cast: each of the two components goes through _CONVERT_FN separately */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of both components; dst may be unaligned */
+#endif
+
+#if 0
+        dst += sizeof(npy_cfloat);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Contiguous loop on aligned data: clongdouble -> cfloat, converting both components. */
+_aligned_contig_cast_clongdouble_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte pointers: args[0] is read, args[1] is written */
+#if !1  /* strides[] is not read here: the pointers advance by sizeof() below */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for their component types (vacuous when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_clongdouble_to_cfloat\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* typed read straight from src; needs src to be aligned */
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: each of the two components goes through _CONVERT_FN separately */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* typed write straight to dst; needs dst to be aligned */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 1
+        dst += sizeof(npy_cfloat);  /* fixed element-size steps for both pointers */
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 1
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_float
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_float
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 1
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Contiguous loop, no alignment assumed: clongdouble -> cfloat, converting both components. */
+_contig_cast_clongdouble_to_cfloat(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte pointers: args[0] is read, args[1] is written */
+#if !1  /* strides[] is not read here: the pointers advance by sizeof() below */
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant, which goes through memmove instead of typed access) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_clongdouble_to_cfloat\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both components; src may be unaligned */
+#endif
+
+/* Do the cast: each of the two components goes through _CONVERT_FN separately */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of both components; dst may be unaligned */
+#endif
+
+#if 1
+        dst += sizeof(npy_cfloat);  /* fixed element-size steps for both pointers */
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Strided loop on aligned data: clongdouble -> cdouble, converting both components. */
+_aligned_cast_clongdouble_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte pointers: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* added to the char pointers, i.e. byte steps */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+
+#if 1
+   /* sanity check: src and dst must be aligned for their component types (vacuous when N == 0) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_clongdouble_to_cdouble\n");*/
+
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];  /* typed read straight from src; needs src to be aligned */
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: each of the two components goes through _CONVERT_FN separately */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];  /* typed write straight to dst; needs dst to be aligned */
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int  /* Strided loop, no alignment assumed: clongdouble -> cdouble, converting both components. */
+_cast_clongdouble_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];  /* number of elements to process */
+    char *src = args[0], *dst = args[1];  /* byte pointers: args[0] is read, args[1] is written */
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];  /* added to the char pointers, i.e. byte steps */
+#endif
+
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+
+#if 0
+   /* sanity check (compiled out in this variant, which goes through memmove instead of typed access) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_clongdouble_to_cdouble\n");*/
+
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));  /* byte-wise copy of both components; src may be unaligned */
+#endif
+
+/* Do the cast: each of the two components goes through _CONVERT_FN separately */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));  /* byte-wise store of both components; dst may be unaligned */
+#endif
+
+#if 0
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;  /* the loop has no failure path */
+}
+
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion (that branch is disabled here) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+/* Active branch: plain C floating-point types for source and destination. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else branch is active here */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active definition: _CONVERT_FN(x) is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_clongdouble_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Aligned, contiguous variant: strides is not read; N elements go from args[0] to args[1]. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Each element is staged as a pair of components in the two-slot buffers above. */
+#if 1
+   /* sanity check: both pointers must be suitably aligned unless there is nothing to do */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_clongdouble_to_cdouble\n");*/
+    /* One element per iteration; context is not referenced and the function always returns 0. */
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: each of the two components goes through _CONVERT_FN */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store both components through a typed pointer (alignment was asserted above). */
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step each cursor by the size of one element of its type. */
+#if 1
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the helper macros used by the function above. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion (that branch is disabled here) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 1
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_double
+#  endif
+
+#else
+/* Active branch: plain C floating-point types for source and destination. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_double
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else branch is active here */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 1
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active definition: _CONVERT_FN(x) is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_clongdouble_to_cdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Contiguous variant with no alignment assumption: strides is not read; N elements go from args[0] to args[1]. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* Each element is staged as a pair of components in the two-slot buffers above. */
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_clongdouble_to_cdouble\n");*/
+    /* One element per iteration; context is not referenced and the function always returns 0. */
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: each of the two components goes through _CONVERT_FN */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store with memmove, which puts no alignment requirement on dst. */
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step each cursor by the size of one element of its type. */
+#if 1
+        dst += sizeof(npy_cdouble);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the helper macros used by the function above. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+#line 768
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion (that branch is disabled here) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* Active branch: source and destination share the same C floating-point type. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else branch is active here */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active definition: _CONVERT_FN(x) is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_cast_clongdouble_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Aligned, strided variant: N elements are read from args[0] and written to args[1]. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Each element is staged as a pair of components in the two-slot buffers above. */
+#if 1
+   /* sanity check: both pointers must be suitably aligned unless there is nothing to do */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_cast_clongdouble_to_clongdouble\n");*/
+    /* One element per iteration; context is not referenced and the function always returns 0. */
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: each of the two components goes through _CONVERT_FN */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store both components through a typed pointer (alignment was asserted above). */
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both cursors by the strides passed in (char * arithmetic, so byte counts). */
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the helper macros used by the function above. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion (that branch is disabled here) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* Active branch: source and destination share the same C floating-point type. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else branch is active here */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active definition: _CONVERT_FN(x) is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_cast_clongdouble_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !0
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Strided variant with no alignment assumption: N elements are read from args[0] and written to args[1]. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* Each element is staged as a pair of components in the two-slot buffers above. */
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_cast_clongdouble_to_clongdouble\n");*/
+    /* One element per iteration; context is not referenced and the function always returns 0. */
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: each of the two components goes through _CONVERT_FN */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store with memmove, which puts no alignment requirement on dst. */
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Advance both cursors by the strides passed in (char * arithmetic, so byte counts). */
+#if 0
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the helper macros used by the function above. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !1)
+
+/* For half types, don't use actual double/float types in conversion (that branch is disabled here) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* Active branch: source and destination share the same C floating-point type. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else branch is active here */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active definition: _CONVERT_FN(x) is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_aligned_contig_cast_clongdouble_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Aligned, contiguous variant: strides is not read; N elements go from args[0] to args[1]. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !1
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !1
+    _TYPE2 dst_value;
+#endif
+    /* Each element is staged as a pair of components in the two-slot buffers above. */
+#if 1
+   /* sanity check: both pointers must be suitably aligned unless there is nothing to do */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_aligned_contig_cast_clongdouble_to_clongdouble\n");*/
+    /* One element per iteration; context is not referenced and the function always returns 0. */
+    while (N--) {
+#if 1
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: each of the two components goes through _CONVERT_FN */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !1
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !1
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !1
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store both components through a typed pointer (alignment was asserted above). */
+#if 1
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step each cursor by the size of one element of its type. */
+#if 1
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the helper macros used by the function above. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+#line 774
+
+#if !(NPY_USE_UNALIGNED_ACCESS && !0)
+
+/* For half types, don't use actual double/float types in conversion (that branch is disabled here) */
+#if 0 || 0
+
+#  if 0
+#    define _TYPE1 npy_uint32
+#  elif 0
+#    define _TYPE1 npy_uint64
+#  else
+#    define _TYPE1 npy_longdouble
+#  endif
+
+#  if 0
+#    define _TYPE2 npy_uint32
+#  elif 0
+#    define _TYPE2 npy_uint64
+#  else
+#    define _TYPE2 npy_longdouble
+#  endif
+
+#else
+/* Active branch: source and destination share the same C floating-point type. */
+#define _TYPE1 npy_longdouble
+#define _TYPE2 npy_longdouble
+
+#endif
+
+/* Determine an appropriate casting conversion function; only the final #else branch is active here */
+#if 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_halfbits_to_floatbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_halfbits_to_doublebits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) ((npy_bool)!npy_half_iszero(x))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)npy_half_to_float(x))
+#  endif
+
+#elif 0
+
+#  if 0
+#    define _CONVERT_FN(x) npy_floatbits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_doublebits_to_halfbits(x)
+#  elif 0
+#    define _CONVERT_FN(x) (x)
+#  elif 0
+#    define _CONVERT_FN(x) npy_float_to_half((float)(x!=0))
+#  else
+#    define _CONVERT_FN(x) npy_float_to_half((float)x)
+#  endif
+
+#else
+/* Active definition: _CONVERT_FN(x) is a plain C cast to _TYPE2. */
+#  if 0 || 0
+#    define _CONVERT_FN(x) ((npy_bool)(x != 0))
+#  else
+#    define _CONVERT_FN(x) ((_TYPE2)x)
+#  endif
+
+#endif
+
+static NPY_GCC_OPT_3 int
+_contig_cast_clongdouble_to_clongdouble(
+        PyArrayMethod_Context *context, char *const *args,
+        const npy_intp *dimensions, const npy_intp *strides,
+        NpyAuxData *NPY_UNUSED(data))
+{
+    npy_intp N = dimensions[0];
+    char *src = args[0], *dst = args[1];
+#if !1
+    npy_intp src_stride = strides[0], dst_stride = strides[1];
+#endif
+    /* Contiguous variant with no alignment assumption: strides is not read; N elements go from args[0] to args[1]. */
+#if 1
+    _TYPE1 src_value[2];
+#elif !0
+    _TYPE1 src_value;
+#endif
+#if 1
+    _TYPE2 dst_value[2];
+#elif !0
+    _TYPE2 dst_value;
+#endif
+    /* Each element is staged as a pair of components in the two-slot buffers above. */
+#if 0
+   /* sanity check (compiled out in this variant) */
+    assert(N == 0 || npy_is_aligned(src, NPY_ALIGNOF(_TYPE1)));
+    assert(N == 0 || npy_is_aligned(dst, NPY_ALIGNOF(_TYPE2)));
+#endif
+
+    /*printf("_contig_cast_clongdouble_to_clongdouble\n");*/
+    /* One element per iteration; context is not referenced and the function always returns 0. */
+    while (N--) {
+#if 0
+#  if 1
+        src_value[0] = ((_TYPE1 *)src)[0];
+        src_value[1] = ((_TYPE1 *)src)[1];
+#  endif
+#else
+        memmove(&src_value, src, sizeof(src_value));
+#endif
+
+/* Do the cast: each of the two components goes through _CONVERT_FN */
+#if 1
+#  if 1
+    dst_value[0] = _CONVERT_FN(src_value[0]);
+    dst_value[1] = _CONVERT_FN(src_value[1]);
+#  elif !0
+#    if 0
+       dst_value = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       dst_value = _CONVERT_FN(src_value[0]);
+#    endif
+#  else
+#    if 0
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]) || _CONVERT_FN(src_value[1]);
+#    else
+       *(_TYPE2 *)dst = _CONVERT_FN(src_value[0]);
+#    endif
+#  endif
+#else
+#  if 1
+#    if !0
+    dst_value[0] = _CONVERT_FN(src_value);
+#    else
+    dst_value[0] = _CONVERT_FN(*(_TYPE1 *)src);
+#    endif
+    dst_value[1] = 0;
+#  elif !0
+    dst_value = _CONVERT_FN(src_value);
+#  else
+    *(_TYPE2 *)dst = _CONVERT_FN(*(_TYPE1 *)src);
+#  endif
+#endif
+        /* Store with memmove, which puts no alignment requirement on dst. */
+#if 0
+#  if 1
+        ((_TYPE2 *)dst)[0] = dst_value[0];
+        ((_TYPE2 *)dst)[1] = dst_value[1];
+#  endif
+#else
+        memmove(dst, &dst_value, sizeof(dst_value));
+#endif
+        /* Contiguous layout: step each cursor by the size of one element of its type. */
+#if 1
+        dst += sizeof(npy_clongdouble);
+        src += sizeof(npy_clongdouble);
+#else
+        dst += dst_stride;
+        src += src_stride;
+#endif
+    }
+    return 0;
+}
+/* Undefine the helper macros used by the function above. */
+#undef _CONVERT_FN
+#undef _TYPE2
+#undef _TYPE1
+
+#endif
+
+
+
+
+
+
+
+NPY_NO_EXPORT PyArrayMethod_StridedLoop *
+PyArray_GetStridedNumericCastFn(int aligned, npy_intp src_stride,
+                             npy_intp dst_stride,
+                             int src_type_num, int dst_type_num)
+{
+    switch (src_type_num) {
+#line 972
+
+        case NPY_BOOL:
+            /*printf("test fn %d - second %d\n", NPY_BOOL, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_bool_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_bool :
+                                    &_contig_cast_bool_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_bool :
+                                         &_cast_bool_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_bool_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_ubyte :
+                                    &_contig_cast_bool_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_ubyte :
+                                         &_cast_bool_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_bool_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_ushort :
+                                    &_contig_cast_bool_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_ushort :
+                                         &_cast_bool_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_bool_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_uint :
+                                    &_contig_cast_bool_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_uint :
+                                         &_cast_bool_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_bool_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_ulong :
+                                    &_contig_cast_bool_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_ulong :
+                                         &_cast_bool_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_bool_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_ulonglong :
+                                    &_contig_cast_bool_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_ulonglong :
+                                         &_cast_bool_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_bool_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_byte :
+                                    &_contig_cast_bool_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_byte :
+                                         &_cast_bool_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_bool_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_short :
+                                    &_contig_cast_bool_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_short :
+                                         &_cast_bool_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_bool_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_int :
+                                    &_contig_cast_bool_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_int :
+                                         &_cast_bool_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_bool_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_long :
+                                    &_contig_cast_bool_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_long :
+                                         &_cast_bool_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_bool_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_longlong :
+                                    &_contig_cast_bool_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_longlong :
+                                         &_cast_bool_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_bool_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_half :
+                                    &_contig_cast_bool_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_half :
+                                         &_cast_bool_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_bool_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_float :
+                                    &_contig_cast_bool_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_float :
+                                         &_cast_bool_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_bool_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_double :
+                                    &_contig_cast_bool_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_double :
+                                         &_cast_bool_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_bool_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_longdouble :
+                                    &_contig_cast_bool_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_longdouble :
+                                         &_cast_bool_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_bool_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_cfloat :
+                                    &_contig_cast_bool_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_cfloat :
+                                         &_cast_bool_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_bool_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_cdouble :
+                                    &_contig_cast_bool_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_cdouble :
+                                         &_cast_bool_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_BOOL, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_bool_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_bool_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_bool) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_bool_to_clongdouble :
+                                    &_contig_cast_bool_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_bool_to_clongdouble :
+                                         &_cast_bool_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_BOOL, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_UBYTE:
+            /*printf("test fn %d - second %d\n", NPY_UBYTE, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_ubyte_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_bool :
+                                    &_contig_cast_ubyte_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_bool :
+                                         &_cast_ubyte_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_ubyte_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_ubyte :
+                                    &_contig_cast_ubyte_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_ubyte :
+                                         &_cast_ubyte_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_ubyte_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_ushort :
+                                    &_contig_cast_ubyte_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_ushort :
+                                         &_cast_ubyte_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_ubyte_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_uint :
+                                    &_contig_cast_ubyte_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_uint :
+                                         &_cast_ubyte_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_ubyte_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_ulong :
+                                    &_contig_cast_ubyte_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_ulong :
+                                         &_cast_ubyte_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_ubyte_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_ulonglong :
+                                    &_contig_cast_ubyte_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_ulonglong :
+                                         &_cast_ubyte_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_ubyte_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_byte :
+                                    &_contig_cast_ubyte_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_byte :
+                                         &_cast_ubyte_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_ubyte_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_short :
+                                    &_contig_cast_ubyte_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_short :
+                                         &_cast_ubyte_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_ubyte_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_int :
+                                    &_contig_cast_ubyte_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_int :
+                                         &_cast_ubyte_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_ubyte_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_long :
+                                    &_contig_cast_ubyte_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_long :
+                                         &_cast_ubyte_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_ubyte_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_longlong :
+                                    &_contig_cast_ubyte_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_longlong :
+                                         &_cast_ubyte_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_ubyte_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_half :
+                                    &_contig_cast_ubyte_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_half :
+                                         &_cast_ubyte_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_ubyte_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_float :
+                                    &_contig_cast_ubyte_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_float :
+                                         &_cast_ubyte_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_ubyte_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_double :
+                                    &_contig_cast_ubyte_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_double :
+                                         &_cast_ubyte_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_ubyte_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_longdouble :
+                                    &_contig_cast_ubyte_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_longdouble :
+                                         &_cast_ubyte_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_ubyte_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_cfloat :
+                                    &_contig_cast_ubyte_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_cfloat :
+                                         &_cast_ubyte_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_ubyte_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_cdouble :
+                                    &_contig_cast_ubyte_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_cdouble :
+                                         &_cast_ubyte_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_UBYTE, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_ubyte_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_ubyte_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ubyte) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ubyte_to_clongdouble :
+                                    &_contig_cast_ubyte_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ubyte_to_clongdouble :
+                                         &_cast_ubyte_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_UBYTE, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_USHORT:
+            /*printf("test fn %d - second %d\n", NPY_USHORT, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_ushort_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_bool :
+                                    &_contig_cast_ushort_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_bool :
+                                         &_cast_ushort_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_ushort_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_ubyte :
+                                    &_contig_cast_ushort_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_ubyte :
+                                         &_cast_ushort_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_ushort_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_ushort :
+                                    &_contig_cast_ushort_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_ushort :
+                                         &_cast_ushort_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_ushort_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_uint :
+                                    &_contig_cast_ushort_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_uint :
+                                         &_cast_ushort_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_ushort_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_ulong :
+                                    &_contig_cast_ushort_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_ulong :
+                                         &_cast_ushort_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_ushort_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_ulonglong :
+                                    &_contig_cast_ushort_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_ulonglong :
+                                         &_cast_ushort_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_ushort_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_byte :
+                                    &_contig_cast_ushort_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_byte :
+                                         &_cast_ushort_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_ushort_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_short :
+                                    &_contig_cast_ushort_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_short :
+                                         &_cast_ushort_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_ushort_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_int :
+                                    &_contig_cast_ushort_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_int :
+                                         &_cast_ushort_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_ushort_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_long :
+                                    &_contig_cast_ushort_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_long :
+                                         &_cast_ushort_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_ushort_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_longlong :
+                                    &_contig_cast_ushort_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_longlong :
+                                         &_cast_ushort_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_ushort_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_half :
+                                    &_contig_cast_ushort_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_half :
+                                         &_cast_ushort_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_ushort_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_float :
+                                    &_contig_cast_ushort_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_float :
+                                         &_cast_ushort_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_ushort_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_double :
+                                    &_contig_cast_ushort_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_double :
+                                         &_cast_ushort_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_ushort_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_longdouble :
+                                    &_contig_cast_ushort_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_longdouble :
+                                         &_cast_ushort_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_ushort_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_cfloat :
+                                    &_contig_cast_ushort_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_cfloat :
+                                         &_cast_ushort_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_ushort_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_cdouble :
+                                    &_contig_cast_ushort_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_cdouble :
+                                         &_cast_ushort_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_USHORT, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_ushort_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_ushort_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ushort) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ushort_to_clongdouble :
+                                    &_contig_cast_ushort_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ushort_to_clongdouble :
+                                         &_cast_ushort_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_USHORT, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_UINT:
+            /*printf("test fn %d - second %d\n", NPY_UINT, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_uint_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_bool :
+                                    &_contig_cast_uint_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_bool :
+                                         &_cast_uint_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_uint_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_ubyte :
+                                    &_contig_cast_uint_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_ubyte :
+                                         &_cast_uint_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_uint_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_ushort :
+                                    &_contig_cast_uint_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_ushort :
+                                         &_cast_uint_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_uint_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_uint :
+                                    &_contig_cast_uint_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_uint :
+                                         &_cast_uint_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_uint_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_ulong :
+                                    &_contig_cast_uint_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_ulong :
+                                         &_cast_uint_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_uint_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_ulonglong :
+                                    &_contig_cast_uint_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_ulonglong :
+                                         &_cast_uint_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_uint_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_byte :
+                                    &_contig_cast_uint_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_byte :
+                                         &_cast_uint_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_uint_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_short :
+                                    &_contig_cast_uint_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_short :
+                                         &_cast_uint_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_uint_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_int :
+                                    &_contig_cast_uint_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_int :
+                                         &_cast_uint_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_uint_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_long :
+                                    &_contig_cast_uint_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_long :
+                                         &_cast_uint_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_uint_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_longlong :
+                                    &_contig_cast_uint_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_longlong :
+                                         &_cast_uint_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_uint_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_half :
+                                    &_contig_cast_uint_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_half :
+                                         &_cast_uint_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_uint_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_float :
+                                    &_contig_cast_uint_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_float :
+                                         &_cast_uint_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_uint_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_double :
+                                    &_contig_cast_uint_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_double :
+                                         &_cast_uint_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_uint_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_longdouble :
+                                    &_contig_cast_uint_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_longdouble :
+                                         &_cast_uint_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_uint_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_cfloat :
+                                    &_contig_cast_uint_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_cfloat :
+                                         &_cast_uint_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_uint_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_cdouble :
+                                    &_contig_cast_uint_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_cdouble :
+                                         &_cast_uint_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_UINT, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_uint_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_uint_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_uint) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_uint_to_clongdouble :
+                                    &_contig_cast_uint_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_uint_to_clongdouble :
+                                         &_cast_uint_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_UINT, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_ULONG:
+            /*printf("test fn %d - second %d\n", NPY_ULONG, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_ulong_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_bool :
+                                    &_contig_cast_ulong_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_bool :
+                                         &_cast_ulong_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_ulong_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_ubyte :
+                                    &_contig_cast_ulong_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_ubyte :
+                                         &_cast_ulong_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_ulong_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_ushort :
+                                    &_contig_cast_ulong_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_ushort :
+                                         &_cast_ulong_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_ulong_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_uint :
+                                    &_contig_cast_ulong_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_uint :
+                                         &_cast_ulong_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_ulong_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_ulong :
+                                    &_contig_cast_ulong_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_ulong :
+                                         &_cast_ulong_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_ulong_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_ulonglong :
+                                    &_contig_cast_ulong_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_ulonglong :
+                                         &_cast_ulong_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_ulong_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_byte :
+                                    &_contig_cast_ulong_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_byte :
+                                         &_cast_ulong_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_ulong_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_short :
+                                    &_contig_cast_ulong_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_short :
+                                         &_cast_ulong_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_ulong_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_int :
+                                    &_contig_cast_ulong_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_int :
+                                         &_cast_ulong_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_ulong_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_long :
+                                    &_contig_cast_ulong_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_long :
+                                         &_cast_ulong_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_ulong_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_longlong :
+                                    &_contig_cast_ulong_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_longlong :
+                                         &_cast_ulong_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_ulong_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_half :
+                                    &_contig_cast_ulong_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_half :
+                                         &_cast_ulong_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_ulong_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_float :
+                                    &_contig_cast_ulong_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_float :
+                                         &_cast_ulong_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_ulong_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_double :
+                                    &_contig_cast_ulong_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_double :
+                                         &_cast_ulong_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_ulong_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_longdouble :
+                                    &_contig_cast_ulong_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_longdouble :
+                                         &_cast_ulong_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_ulong_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_cfloat :
+                                    &_contig_cast_ulong_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_cfloat :
+                                         &_cast_ulong_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_ulong_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_cdouble :
+                                    &_contig_cast_ulong_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_cdouble :
+                                         &_cast_ulong_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_ULONG, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_ulong_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_ulong_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulong) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulong_to_clongdouble :
+                                    &_contig_cast_ulong_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulong_to_clongdouble :
+                                         &_cast_ulong_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_ULONG, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_ULONGLONG:
+            /*printf("test fn %d - second %d\n", NPY_ULONGLONG, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_ulonglong_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_bool :
+                                    &_contig_cast_ulonglong_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_bool :
+                                         &_cast_ulonglong_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_ulonglong_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_ubyte :
+                                    &_contig_cast_ulonglong_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_ubyte :
+                                         &_cast_ulonglong_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_ulonglong_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_ushort :
+                                    &_contig_cast_ulonglong_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_ushort :
+                                         &_cast_ulonglong_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_ulonglong_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_uint :
+                                    &_contig_cast_ulonglong_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_uint :
+                                         &_cast_ulonglong_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_ulonglong_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_ulong :
+                                    &_contig_cast_ulonglong_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_ulong :
+                                         &_cast_ulonglong_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_ulonglong_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_ulonglong :
+                                    &_contig_cast_ulonglong_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_ulonglong :
+                                         &_cast_ulonglong_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_ulonglong_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_byte :
+                                    &_contig_cast_ulonglong_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_byte :
+                                         &_cast_ulonglong_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_ulonglong_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_short :
+                                    &_contig_cast_ulonglong_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_short :
+                                         &_cast_ulonglong_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_ulonglong_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_int :
+                                    &_contig_cast_ulonglong_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_int :
+                                         &_cast_ulonglong_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_ulonglong_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_long :
+                                    &_contig_cast_ulonglong_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_long :
+                                         &_cast_ulonglong_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_ulonglong_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_longlong :
+                                    &_contig_cast_ulonglong_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_longlong :
+                                         &_cast_ulonglong_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_ulonglong_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_half :
+                                    &_contig_cast_ulonglong_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_half :
+                                         &_cast_ulonglong_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_ulonglong_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_float :
+                                    &_contig_cast_ulonglong_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_float :
+                                         &_cast_ulonglong_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_ulonglong_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_double :
+                                    &_contig_cast_ulonglong_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_double :
+                                         &_cast_ulonglong_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_ulonglong_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_longdouble :
+                                    &_contig_cast_ulonglong_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_longdouble :
+                                         &_cast_ulonglong_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_ulonglong_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_cfloat :
+                                    &_contig_cast_ulonglong_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_cfloat :
+                                         &_cast_ulonglong_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_ulonglong_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_cdouble :
+                                    &_contig_cast_ulonglong_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_cdouble :
+                                         &_cast_ulonglong_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_ULONGLONG, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_ulonglong_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_ulonglong_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_ulonglong) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_ulonglong_to_clongdouble :
+                                    &_contig_cast_ulonglong_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_ulonglong_to_clongdouble :
+                                         &_cast_ulonglong_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_ULONGLONG, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_BYTE:
+            /*printf("test fn %d - second %d\n", NPY_BYTE, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_byte_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_bool :
+                                    &_contig_cast_byte_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_bool :
+                                         &_cast_byte_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_byte_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_ubyte :
+                                    &_contig_cast_byte_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_ubyte :
+                                         &_cast_byte_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_byte_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_ushort :
+                                    &_contig_cast_byte_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_ushort :
+                                         &_cast_byte_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_byte_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_uint :
+                                    &_contig_cast_byte_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_uint :
+                                         &_cast_byte_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_byte_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_ulong :
+                                    &_contig_cast_byte_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_ulong :
+                                         &_cast_byte_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_byte_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_ulonglong :
+                                    &_contig_cast_byte_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_ulonglong :
+                                         &_cast_byte_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_byte_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_byte :
+                                    &_contig_cast_byte_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_byte :
+                                         &_cast_byte_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_byte_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_short :
+                                    &_contig_cast_byte_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_short :
+                                         &_cast_byte_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_byte_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_int :
+                                    &_contig_cast_byte_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_int :
+                                         &_cast_byte_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_byte_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_long :
+                                    &_contig_cast_byte_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_long :
+                                         &_cast_byte_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_byte_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_longlong :
+                                    &_contig_cast_byte_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_longlong :
+                                         &_cast_byte_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_byte_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_half :
+                                    &_contig_cast_byte_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_half :
+                                         &_cast_byte_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_byte_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_float :
+                                    &_contig_cast_byte_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_float :
+                                         &_cast_byte_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_byte_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_double :
+                                    &_contig_cast_byte_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_double :
+                                         &_cast_byte_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_byte_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_longdouble :
+                                    &_contig_cast_byte_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_longdouble :
+                                         &_cast_byte_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_byte_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_cfloat :
+                                    &_contig_cast_byte_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_cfloat :
+                                         &_cast_byte_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_byte_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_cdouble :
+                                    &_contig_cast_byte_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_cdouble :
+                                         &_cast_byte_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_BYTE, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_byte_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_byte_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_byte) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_byte_to_clongdouble :
+                                    &_contig_cast_byte_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_byte_to_clongdouble :
+                                         &_cast_byte_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_BYTE, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_SHORT:
+            /*printf("test fn %d - second %d\n", NPY_SHORT, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_short_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_bool :
+                                    &_contig_cast_short_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_bool :
+                                         &_cast_short_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_short_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_ubyte :
+                                    &_contig_cast_short_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_ubyte :
+                                         &_cast_short_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_short_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_ushort :
+                                    &_contig_cast_short_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_ushort :
+                                         &_cast_short_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_short_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_uint :
+                                    &_contig_cast_short_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_uint :
+                                         &_cast_short_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_short_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_ulong :
+                                    &_contig_cast_short_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_ulong :
+                                         &_cast_short_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_short_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_ulonglong :
+                                    &_contig_cast_short_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_ulonglong :
+                                         &_cast_short_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_short_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_byte :
+                                    &_contig_cast_short_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_byte :
+                                         &_cast_short_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_short_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_short :
+                                    &_contig_cast_short_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_short :
+                                         &_cast_short_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_short_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_int :
+                                    &_contig_cast_short_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_int :
+                                         &_cast_short_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_short_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_long :
+                                    &_contig_cast_short_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_long :
+                                         &_cast_short_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_short_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_longlong :
+                                    &_contig_cast_short_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_longlong :
+                                         &_cast_short_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_short_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_half :
+                                    &_contig_cast_short_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_half :
+                                         &_cast_short_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_short_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_float :
+                                    &_contig_cast_short_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_float :
+                                         &_cast_short_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_short_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_double :
+                                    &_contig_cast_short_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_double :
+                                         &_cast_short_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_short_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_longdouble :
+                                    &_contig_cast_short_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_longdouble :
+                                         &_cast_short_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_short_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_cfloat :
+                                    &_contig_cast_short_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_cfloat :
+                                         &_cast_short_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_short_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_cdouble :
+                                    &_contig_cast_short_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_cdouble :
+                                         &_cast_short_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_SHORT, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_short_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_short_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_short) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_short_to_clongdouble :
+                                    &_contig_cast_short_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_short_to_clongdouble :
+                                         &_cast_short_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_SHORT, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_INT:
+            /*printf("test fn %d - second %d\n", NPY_INT, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_int_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_bool :
+                                    &_contig_cast_int_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_bool :
+                                         &_cast_int_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_int_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_ubyte :
+                                    &_contig_cast_int_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_ubyte :
+                                         &_cast_int_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_int_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_ushort :
+                                    &_contig_cast_int_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_ushort :
+                                         &_cast_int_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_int_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_uint :
+                                    &_contig_cast_int_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_uint :
+                                         &_cast_int_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_int_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_ulong :
+                                    &_contig_cast_int_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_ulong :
+                                         &_cast_int_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_int_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_ulonglong :
+                                    &_contig_cast_int_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_ulonglong :
+                                         &_cast_int_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_int_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_byte :
+                                    &_contig_cast_int_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_byte :
+                                         &_cast_int_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_int_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_short :
+                                    &_contig_cast_int_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_short :
+                                         &_cast_int_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_int_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_int :
+                                    &_contig_cast_int_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_int :
+                                         &_cast_int_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_int_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_long :
+                                    &_contig_cast_int_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_long :
+                                         &_cast_int_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_int_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_longlong :
+                                    &_contig_cast_int_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_longlong :
+                                         &_cast_int_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_int_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_half :
+                                    &_contig_cast_int_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_half :
+                                         &_cast_int_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_int_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_float :
+                                    &_contig_cast_int_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_float :
+                                         &_cast_int_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_int_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_double :
+                                    &_contig_cast_int_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_double :
+                                         &_cast_int_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_int_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_longdouble :
+                                    &_contig_cast_int_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_longdouble :
+                                         &_cast_int_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_int_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_cfloat :
+                                    &_contig_cast_int_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_cfloat :
+                                         &_cast_int_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_int_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_cdouble :
+                                    &_contig_cast_int_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_cdouble :
+                                         &_cast_int_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_INT, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_int_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_int_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_int) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_int_to_clongdouble :
+                                    &_contig_cast_int_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_int_to_clongdouble :
+                                         &_cast_int_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_INT, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_LONG:
+            /*printf("test fn %d - second %d\n", NPY_LONG, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_long_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_bool :
+                                    &_contig_cast_long_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_bool :
+                                         &_cast_long_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_long_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_ubyte :
+                                    &_contig_cast_long_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_ubyte :
+                                         &_cast_long_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_long_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_ushort :
+                                    &_contig_cast_long_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_ushort :
+                                         &_cast_long_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_long_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_uint :
+                                    &_contig_cast_long_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_uint :
+                                         &_cast_long_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_long_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_ulong :
+                                    &_contig_cast_long_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_ulong :
+                                         &_cast_long_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_long_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_ulonglong :
+                                    &_contig_cast_long_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_ulonglong :
+                                         &_cast_long_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_long_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_byte :
+                                    &_contig_cast_long_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_byte :
+                                         &_cast_long_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_long_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_short :
+                                    &_contig_cast_long_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_short :
+                                         &_cast_long_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_long_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_int :
+                                    &_contig_cast_long_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_int :
+                                         &_cast_long_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_long_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_long :
+                                    &_contig_cast_long_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_long :
+                                         &_cast_long_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_long_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_longlong :
+                                    &_contig_cast_long_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_longlong :
+                                         &_cast_long_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_long_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_half :
+                                    &_contig_cast_long_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_half :
+                                         &_cast_long_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_long_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_float :
+                                    &_contig_cast_long_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_float :
+                                         &_cast_long_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_long_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_double :
+                                    &_contig_cast_long_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_double :
+                                         &_cast_long_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_long_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_longdouble :
+                                    &_contig_cast_long_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_longdouble :
+                                         &_cast_long_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_long_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_cfloat :
+                                    &_contig_cast_long_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_cfloat :
+                                         &_cast_long_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_long_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_cdouble :
+                                    &_contig_cast_long_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_cdouble :
+                                         &_cast_long_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_LONG, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_long_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_long_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_long) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_long_to_clongdouble :
+                                    &_contig_cast_long_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_long_to_clongdouble :
+                                         &_cast_long_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_LONG, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_LONGLONG:
+            /*printf("test fn %d - second %d\n", NPY_LONGLONG, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_longlong_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_bool :
+                                    &_contig_cast_longlong_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_bool :
+                                         &_cast_longlong_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_longlong_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_ubyte :
+                                    &_contig_cast_longlong_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_ubyte :
+                                         &_cast_longlong_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_longlong_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_ushort :
+                                    &_contig_cast_longlong_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_ushort :
+                                         &_cast_longlong_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_longlong_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_uint :
+                                    &_contig_cast_longlong_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_uint :
+                                         &_cast_longlong_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_longlong_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_ulong :
+                                    &_contig_cast_longlong_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_ulong :
+                                         &_cast_longlong_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_longlong_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_ulonglong :
+                                    &_contig_cast_longlong_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_ulonglong :
+                                         &_cast_longlong_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_longlong_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_byte :
+                                    &_contig_cast_longlong_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_byte :
+                                         &_cast_longlong_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_longlong_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_short :
+                                    &_contig_cast_longlong_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_short :
+                                         &_cast_longlong_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_longlong_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_int :
+                                    &_contig_cast_longlong_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_int :
+                                         &_cast_longlong_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_longlong_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_long :
+                                    &_contig_cast_longlong_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_long :
+                                         &_cast_longlong_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_longlong_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_longlong :
+                                    &_contig_cast_longlong_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_longlong :
+                                         &_cast_longlong_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_longlong_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_half :
+                                    &_contig_cast_longlong_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_half :
+                                         &_cast_longlong_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_longlong_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_float :
+                                    &_contig_cast_longlong_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_float :
+                                         &_cast_longlong_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_longlong_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_double :
+                                    &_contig_cast_longlong_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_double :
+                                         &_cast_longlong_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_longlong_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_longdouble :
+                                    &_contig_cast_longlong_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_longdouble :
+                                         &_cast_longlong_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_longlong_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_cfloat :
+                                    &_contig_cast_longlong_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_cfloat :
+                                         &_cast_longlong_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_longlong_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_cdouble :
+                                    &_contig_cast_longlong_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_cdouble :
+                                         &_cast_longlong_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_LONGLONG, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_longlong_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_longlong_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longlong) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longlong_to_clongdouble :
+                                    &_contig_cast_longlong_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longlong_to_clongdouble :
+                                         &_cast_longlong_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_LONGLONG, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_HALF:
+            /*printf("test fn %d - second %d\n", NPY_HALF, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_half_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_bool :
+                                    &_contig_cast_half_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_bool :
+                                         &_cast_half_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_half_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_ubyte :
+                                    &_contig_cast_half_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_ubyte :
+                                         &_cast_half_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_half_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_ushort :
+                                    &_contig_cast_half_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_ushort :
+                                         &_cast_half_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_half_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_uint :
+                                    &_contig_cast_half_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_uint :
+                                         &_cast_half_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_half_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_ulong :
+                                    &_contig_cast_half_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_ulong :
+                                         &_cast_half_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_half_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_ulonglong :
+                                    &_contig_cast_half_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_ulonglong :
+                                         &_cast_half_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_half_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_byte :
+                                    &_contig_cast_half_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_byte :
+                                         &_cast_half_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_half_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_short :
+                                    &_contig_cast_half_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_short :
+                                         &_cast_half_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_half_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_int :
+                                    &_contig_cast_half_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_int :
+                                         &_cast_half_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_half_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_long :
+                                    &_contig_cast_half_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_long :
+                                         &_cast_half_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_half_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_longlong :
+                                    &_contig_cast_half_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_longlong :
+                                         &_cast_half_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_half_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_half :
+                                    &_contig_cast_half_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_half :
+                                         &_cast_half_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_half_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_float :
+                                    &_contig_cast_half_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_float :
+                                         &_cast_half_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_half_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_double :
+                                    &_contig_cast_half_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_double :
+                                         &_cast_half_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_half_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_longdouble :
+                                    &_contig_cast_half_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_longdouble :
+                                         &_cast_half_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_half_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_cfloat :
+                                    &_contig_cast_half_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_cfloat :
+                                         &_cast_half_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_half_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_cdouble :
+                                    &_contig_cast_half_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_cdouble :
+                                         &_cast_half_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_HALF, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_half_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_half_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_half) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_half_to_clongdouble :
+                                    &_contig_cast_half_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_half_to_clongdouble :
+                                         &_cast_half_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_HALF, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_FLOAT:
+            /*printf("test fn %d - second %d\n", NPY_FLOAT, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_float_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_bool :
+                                    &_contig_cast_float_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_bool :
+                                         &_cast_float_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_float_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_ubyte :
+                                    &_contig_cast_float_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_ubyte :
+                                         &_cast_float_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_float_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_ushort :
+                                    &_contig_cast_float_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_ushort :
+                                         &_cast_float_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_float_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_uint :
+                                    &_contig_cast_float_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_uint :
+                                         &_cast_float_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_float_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_ulong :
+                                    &_contig_cast_float_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_ulong :
+                                         &_cast_float_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_float_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_ulonglong :
+                                    &_contig_cast_float_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_ulonglong :
+                                         &_cast_float_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_float_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_byte :
+                                    &_contig_cast_float_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_byte :
+                                         &_cast_float_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_float_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_short :
+                                    &_contig_cast_float_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_short :
+                                         &_cast_float_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_float_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_int :
+                                    &_contig_cast_float_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_int :
+                                         &_cast_float_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_float_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_long :
+                                    &_contig_cast_float_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_long :
+                                         &_cast_float_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_float_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_longlong :
+                                    &_contig_cast_float_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_longlong :
+                                         &_cast_float_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_float_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_half :
+                                    &_contig_cast_float_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_half :
+                                         &_cast_float_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_float_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_float :
+                                    &_contig_cast_float_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_float :
+                                         &_cast_float_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_float_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_double :
+                                    &_contig_cast_float_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_double :
+                                         &_cast_float_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_float_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_longdouble :
+                                    &_contig_cast_float_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_longdouble :
+                                         &_cast_float_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_float_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_cfloat :
+                                    &_contig_cast_float_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_cfloat :
+                                         &_cast_float_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_float_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_cdouble :
+                                    &_contig_cast_float_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_cdouble :
+                                         &_cast_float_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_FLOAT, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_float_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_float_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_float) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_float_to_clongdouble :
+                                    &_contig_cast_float_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_float_to_clongdouble :
+                                         &_cast_float_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_FLOAT, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_DOUBLE:
+            /*printf("test fn %d - second %d\n", NPY_DOUBLE, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_double_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_bool :
+                                    &_contig_cast_double_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_bool :
+                                         &_cast_double_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_double_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_ubyte :
+                                    &_contig_cast_double_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_ubyte :
+                                         &_cast_double_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_double_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_ushort :
+                                    &_contig_cast_double_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_ushort :
+                                         &_cast_double_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_double_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_uint :
+                                    &_contig_cast_double_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_uint :
+                                         &_cast_double_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_double_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_ulong :
+                                    &_contig_cast_double_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_ulong :
+                                         &_cast_double_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_double_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_ulonglong :
+                                    &_contig_cast_double_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_ulonglong :
+                                         &_cast_double_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_double_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_byte :
+                                    &_contig_cast_double_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_byte :
+                                         &_cast_double_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_double_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_short :
+                                    &_contig_cast_double_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_short :
+                                         &_cast_double_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_double_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_int :
+                                    &_contig_cast_double_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_int :
+                                         &_cast_double_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_double_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_long :
+                                    &_contig_cast_double_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_long :
+                                         &_cast_double_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_double_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_longlong :
+                                    &_contig_cast_double_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_longlong :
+                                         &_cast_double_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_double_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_half :
+                                    &_contig_cast_double_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_half :
+                                         &_cast_double_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_double_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_float :
+                                    &_contig_cast_double_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_float :
+                                         &_cast_double_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_double_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_double :
+                                    &_contig_cast_double_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_double :
+                                         &_cast_double_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_double_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_longdouble :
+                                    &_contig_cast_double_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_longdouble :
+                                         &_cast_double_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_double_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_cfloat :
+                                    &_contig_cast_double_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_cfloat :
+                                         &_cast_double_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_double_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_cdouble :
+                                    &_contig_cast_double_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_cdouble :
+                                         &_cast_double_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_DOUBLE, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_double_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_double_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_double) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_double_to_clongdouble :
+                                    &_contig_cast_double_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_double_to_clongdouble :
+                                         &_cast_double_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_DOUBLE, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_LONGDOUBLE:
+            /*printf("test fn %d - second %d\n", NPY_LONGDOUBLE, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_longdouble_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_bool :
+                                    &_contig_cast_longdouble_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_bool :
+                                         &_cast_longdouble_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_longdouble_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_ubyte :
+                                    &_contig_cast_longdouble_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_ubyte :
+                                         &_cast_longdouble_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_longdouble_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_ushort :
+                                    &_contig_cast_longdouble_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_ushort :
+                                         &_cast_longdouble_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_longdouble_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_uint :
+                                    &_contig_cast_longdouble_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_uint :
+                                         &_cast_longdouble_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_longdouble_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_ulong :
+                                    &_contig_cast_longdouble_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_ulong :
+                                         &_cast_longdouble_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_longdouble_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_ulonglong :
+                                    &_contig_cast_longdouble_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_ulonglong :
+                                         &_cast_longdouble_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_longdouble_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_byte :
+                                    &_contig_cast_longdouble_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_byte :
+                                         &_cast_longdouble_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_longdouble_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_short :
+                                    &_contig_cast_longdouble_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_short :
+                                         &_cast_longdouble_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_longdouble_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_int :
+                                    &_contig_cast_longdouble_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_int :
+                                         &_cast_longdouble_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_longdouble_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_long :
+                                    &_contig_cast_longdouble_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_long :
+                                         &_cast_longdouble_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_longdouble_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_longlong :
+                                    &_contig_cast_longdouble_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_longlong :
+                                         &_cast_longdouble_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_longdouble_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_half :
+                                    &_contig_cast_longdouble_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_half :
+                                         &_cast_longdouble_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_longdouble_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_float :
+                                    &_contig_cast_longdouble_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_float :
+                                         &_cast_longdouble_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_longdouble_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_double :
+                                    &_contig_cast_longdouble_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_double :
+                                         &_cast_longdouble_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_longdouble_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_longdouble :
+                                    &_contig_cast_longdouble_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_longdouble :
+                                         &_cast_longdouble_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_longdouble_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_cfloat :
+                                    &_contig_cast_longdouble_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_cfloat :
+                                         &_cast_longdouble_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_longdouble_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_cdouble :
+                                    &_contig_cast_longdouble_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_cdouble :
+                                         &_cast_longdouble_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_LONGDOUBLE, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_longdouble_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_longdouble_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_longdouble) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_longdouble_to_clongdouble :
+                                    &_contig_cast_longdouble_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_longdouble_to_clongdouble :
+                                         &_cast_longdouble_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_LONGDOUBLE, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_CFLOAT:
+            /*printf("test fn %d - second %d\n", NPY_CFLOAT, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_cfloat_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_bool :
+                                    &_contig_cast_cfloat_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_bool :
+                                         &_cast_cfloat_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_cfloat_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_ubyte :
+                                    &_contig_cast_cfloat_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_ubyte :
+                                         &_cast_cfloat_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_cfloat_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_ushort :
+                                    &_contig_cast_cfloat_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_ushort :
+                                         &_cast_cfloat_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_cfloat_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_uint :
+                                    &_contig_cast_cfloat_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_uint :
+                                         &_cast_cfloat_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_cfloat_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_ulong :
+                                    &_contig_cast_cfloat_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_ulong :
+                                         &_cast_cfloat_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_cfloat_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_ulonglong :
+                                    &_contig_cast_cfloat_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_ulonglong :
+                                         &_cast_cfloat_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_cfloat_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_byte :
+                                    &_contig_cast_cfloat_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_byte :
+                                         &_cast_cfloat_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_cfloat_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_short :
+                                    &_contig_cast_cfloat_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_short :
+                                         &_cast_cfloat_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_cfloat_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_int :
+                                    &_contig_cast_cfloat_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_int :
+                                         &_cast_cfloat_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_cfloat_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_long :
+                                    &_contig_cast_cfloat_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_long :
+                                         &_cast_cfloat_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_cfloat_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_longlong :
+                                    &_contig_cast_cfloat_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_longlong :
+                                         &_cast_cfloat_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_cfloat_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_half :
+                                    &_contig_cast_cfloat_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_half :
+                                         &_cast_cfloat_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_cfloat_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_float :
+                                    &_contig_cast_cfloat_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_float :
+                                         &_cast_cfloat_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_cfloat_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_double :
+                                    &_contig_cast_cfloat_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_double :
+                                         &_cast_cfloat_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_cfloat_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_longdouble :
+                                    &_contig_cast_cfloat_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_longdouble :
+                                         &_cast_cfloat_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_cfloat_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_cfloat :
+                                    &_contig_cast_cfloat_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_cfloat :
+                                         &_cast_cfloat_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_cfloat_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_cdouble :
+                                    &_contig_cast_cfloat_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_cdouble :
+                                         &_cast_cfloat_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_CFLOAT, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_cfloat_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_cfloat_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cfloat) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cfloat_to_clongdouble :
+                                    &_contig_cast_cfloat_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cfloat_to_clongdouble :
+                                         &_cast_cfloat_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_CFLOAT, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_CDOUBLE:
+            /*printf("test fn %d - second %d\n", NPY_CDOUBLE, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_cdouble_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_bool :
+                                    &_contig_cast_cdouble_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_bool :
+                                         &_cast_cdouble_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_cdouble_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_ubyte :
+                                    &_contig_cast_cdouble_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_ubyte :
+                                         &_cast_cdouble_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_cdouble_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_ushort :
+                                    &_contig_cast_cdouble_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_ushort :
+                                         &_cast_cdouble_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_cdouble_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_uint :
+                                    &_contig_cast_cdouble_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_uint :
+                                         &_cast_cdouble_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_cdouble_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_ulong :
+                                    &_contig_cast_cdouble_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_ulong :
+                                         &_cast_cdouble_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_cdouble_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_ulonglong :
+                                    &_contig_cast_cdouble_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_ulonglong :
+                                         &_cast_cdouble_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_cdouble_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_byte :
+                                    &_contig_cast_cdouble_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_byte :
+                                         &_cast_cdouble_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_cdouble_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_short :
+                                    &_contig_cast_cdouble_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_short :
+                                         &_cast_cdouble_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_cdouble_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_int :
+                                    &_contig_cast_cdouble_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_int :
+                                         &_cast_cdouble_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_cdouble_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_long :
+                                    &_contig_cast_cdouble_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_long :
+                                         &_cast_cdouble_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_cdouble_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_longlong :
+                                    &_contig_cast_cdouble_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_longlong :
+                                         &_cast_cdouble_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_cdouble_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_half :
+                                    &_contig_cast_cdouble_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_half :
+                                         &_cast_cdouble_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_cdouble_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_float :
+                                    &_contig_cast_cdouble_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_float :
+                                         &_cast_cdouble_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_cdouble_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_double :
+                                    &_contig_cast_cdouble_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_double :
+                                         &_cast_cdouble_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_cdouble_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_longdouble :
+                                    &_contig_cast_cdouble_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_longdouble :
+                                         &_cast_cdouble_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_cdouble_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_cfloat :
+                                    &_contig_cast_cdouble_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_cfloat :
+                                         &_cast_cdouble_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_cdouble_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_cdouble :
+                                    &_contig_cast_cdouble_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_cdouble :
+                                         &_cast_cdouble_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_CDOUBLE, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_cdouble_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_cdouble_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_cdouble) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_cdouble_to_clongdouble :
+                                    &_contig_cast_cdouble_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_cdouble_to_clongdouble :
+                                         &_cast_cdouble_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_CDOUBLE, dst_type_num);*/
+
+
+#line 972
+
+        case NPY_CLONGDOUBLE:
+            /*printf("test fn %d - second %d\n", NPY_CLONGDOUBLE, dst_type_num);*/
+            switch (dst_type_num) {
+#line 994
+
+                case NPY_BOOL:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_BOOL);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return &_aligned_contig_cast_clongdouble_to_bool;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_bool;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_bool)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_bool :
+                                    &_contig_cast_clongdouble_to_bool;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_bool :
+                                         &_cast_clongdouble_to_bool;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UBYTE:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_UBYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return &_aligned_contig_cast_clongdouble_to_ubyte;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_ubyte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_ubyte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_ubyte :
+                                    &_contig_cast_clongdouble_to_ubyte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_ubyte :
+                                         &_cast_clongdouble_to_ubyte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_USHORT:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_USHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return &_aligned_contig_cast_clongdouble_to_ushort;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_ushort;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_ushort)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_ushort :
+                                    &_contig_cast_clongdouble_to_ushort;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_ushort :
+                                         &_cast_clongdouble_to_ushort;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_UINT:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_UINT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return &_aligned_contig_cast_clongdouble_to_uint;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_uint;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_uint)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_uint :
+                                    &_contig_cast_clongdouble_to_uint;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_uint :
+                                         &_cast_clongdouble_to_uint;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONG:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_ULONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return &_aligned_contig_cast_clongdouble_to_ulong;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_ulong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_ulong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_ulong :
+                                    &_contig_cast_clongdouble_to_ulong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_ulong :
+                                         &_cast_clongdouble_to_ulong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_ULONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_ULONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return &_aligned_contig_cast_clongdouble_to_ulonglong;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_ulonglong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_ulonglong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_ulonglong :
+                                    &_contig_cast_clongdouble_to_ulonglong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_ulonglong :
+                                         &_cast_clongdouble_to_ulonglong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_BYTE:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_BYTE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return &_aligned_contig_cast_clongdouble_to_byte;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_byte;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_byte)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_byte :
+                                    &_contig_cast_clongdouble_to_byte;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_byte :
+                                         &_cast_clongdouble_to_byte;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_SHORT:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_SHORT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return &_aligned_contig_cast_clongdouble_to_short;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_short;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_short)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_short :
+                                    &_contig_cast_clongdouble_to_short;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_short :
+                                         &_cast_clongdouble_to_short;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_INT:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_INT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return &_aligned_contig_cast_clongdouble_to_int;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_int;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_int)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_int :
+                                    &_contig_cast_clongdouble_to_int;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_int :
+                                         &_cast_clongdouble_to_int;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONG:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_LONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return &_aligned_contig_cast_clongdouble_to_long;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_long;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_long)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_long :
+                                    &_contig_cast_clongdouble_to_long;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_long :
+                                         &_cast_clongdouble_to_long;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGLONG:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_LONGLONG);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return &_aligned_contig_cast_clongdouble_to_longlong;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_longlong;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_longlong)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_longlong :
+                                    &_contig_cast_clongdouble_to_longlong;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_longlong :
+                                         &_cast_clongdouble_to_longlong;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_HALF:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_HALF);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return &_aligned_contig_cast_clongdouble_to_half;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_half;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_half)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_half :
+                                    &_contig_cast_clongdouble_to_half;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_half :
+                                         &_cast_clongdouble_to_half;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_FLOAT:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_FLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return &_aligned_contig_cast_clongdouble_to_float;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_float;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_float)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_float :
+                                    &_contig_cast_clongdouble_to_float;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_float :
+                                         &_cast_clongdouble_to_float;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_DOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_DOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return &_aligned_contig_cast_clongdouble_to_double;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_double;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_double)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_double :
+                                    &_contig_cast_clongdouble_to_double;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_double :
+                                         &_cast_clongdouble_to_double;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_LONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_LONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return &_aligned_contig_cast_clongdouble_to_longdouble;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_longdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_longdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_longdouble :
+                                    &_contig_cast_clongdouble_to_longdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_longdouble :
+                                         &_cast_clongdouble_to_longdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CFLOAT:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_CFLOAT);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return &_aligned_contig_cast_clongdouble_to_cfloat;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_cfloat;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_cfloat)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_cfloat :
+                                    &_contig_cast_clongdouble_to_cfloat;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_cfloat :
+                                         &_cast_clongdouble_to_cfloat;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_CDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return &_aligned_contig_cast_clongdouble_to_cdouble;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_cdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_cdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_cdouble :
+                                    &_contig_cast_clongdouble_to_cdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_cdouble :
+                                         &_cast_clongdouble_to_cdouble;
+                    }
+#  endif
+
+
+#line 994
+
+                case NPY_CLONGDOUBLE:
+                    /*printf("ret fn %d %d\n", NPY_CLONGDOUBLE, NPY_CLONGDOUBLE);*/
+#  if NPY_USE_UNALIGNED_ACCESS
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return &_aligned_contig_cast_clongdouble_to_clongdouble;
+                    }
+                    else {
+                        return &_aligned_cast_clongdouble_to_clongdouble;
+                    }
+#  else
+                    if (src_stride == sizeof(npy_clongdouble) &&
+                                dst_stride == sizeof(npy_clongdouble)) {
+                        return aligned ?
+                                    &_aligned_contig_cast_clongdouble_to_clongdouble :
+                                    &_contig_cast_clongdouble_to_clongdouble;
+                    }
+                    else {
+                        return aligned ? &_aligned_cast_clongdouble_to_clongdouble :
+                                         &_cast_clongdouble_to_clongdouble;
+                    }
+#  endif
+
+
+            }
+            /*printf("switched test fn %d - second %d\n", NPY_CLONGDOUBLE, dst_type_num);*/
+
+
+    }
+
+    return NULL;
+}
+
+
+/****************** PRIMITIVE FLAT TO/FROM NDIM FUNCTIONS ******************/
+
+/*
+ * Transfer up to `count` elements from an N-dimensional source, starting
+ * at position `coords`, into a one-dimensional strided destination.
+ *
+ * See documentation of arguments in lowlevel_strided_loops.h.  What this
+ * implementation does with them:
+ *   - `coords`, `shape` and `src_strides` are read one dimension at a
+ *     time, stepping by `coords_inc`, `shape_inc` and `src_strides_inc`.
+ *   - every run along dimension 0 is handed to `cast_info->func` with the
+ *     stride pair {src_strides[0], dst_stride}.
+ *   - `src_itemsize` is not used by this function.
+ *
+ * Returns the result of `cast_info->func` when the remaining `count` fits
+ * into the current dimension-0 run, -1 as soon as `cast_info->func`
+ * reports a negative value for a full run, and otherwise the number of
+ * elements still outstanding once the iteration space is exhausted.
+ */
+NPY_NO_EXPORT npy_intp
+PyArray_TransferNDimToStrided(npy_intp ndim,
+                char *dst, npy_intp dst_stride,
+                char *src, npy_intp const *src_strides, npy_intp src_strides_inc,
+                npy_intp const *coords, npy_intp coords_inc,
+                npy_intp const *shape, npy_intp shape_inc,
+                npy_intp count, npy_intp src_itemsize,
+                NPY_cast_info *cast_info)
+{
+    npy_intp i, M, N, coord0, shape0, src_stride0, coord1, shape1, src_stride1;
+
+    /* Finish off dimension 0 */
+    coord0 = coords[0];
+    shape0 = shape[0];
+    src_stride0 = src_strides[0];
+    /* Elements left in the current dimension-0 run, counted from coord0 */
+    N = shape0 - coord0;
+
+    /* The same stride pair is passed to every transfer call below */
+    npy_intp strides[2] = {src_stride0, dst_stride};
+
+    char *args[2] = {src, dst};
+    if (N >= count) {
+        /* The whole request fits into this run: a single call finishes it */
+        return cast_info->func(&cast_info->context,
+                args, &count, strides, cast_info->auxdata);
+    }
+    int res = cast_info->func(&cast_info->context,
+            args, &N, strides, cast_info->auxdata);
+
+    if (res < 0) {
+        return -1;
+    }
+    count -= N;
+
+    /* If it's 1-dimensional, there's no more to copy */
+    if (ndim == 1) {
+        return count;
+    }
+
+    /*
+     * Adjust the src and dst pointers: src is moved back by coord0 steps
+     * along dimension 0 and forward by one step along dimension 1, dst
+     * moves past the N elements just transferred.
+     */
+    coord1 = (coords + coords_inc)[0];
+    shape1 = (shape + shape_inc)[0];
+    src_stride1 = (src_strides + src_strides_inc)[0];
+    src = src - coord0*src_stride0 + src_stride1;
+    dst += N*dst_stride;
+
+    /* Finish off dimension 1: M full dimension-0 runs remain in it */
+    M = (shape1 - coord1 - 1);
+    for (i = 0; i < M; ++i) {
+        args[0] = src; args[1] = dst;
+        if (shape0 >= count) {
+            return cast_info->func(&cast_info->context,
+                    args, &count, strides, cast_info->auxdata);
+        }
+        else {
+            res = cast_info->func(&cast_info->context,
+                    args, &shape0, strides, cast_info->auxdata);
+            if (res < 0) {
+                return -1;
+            }
+        }
+        count -= shape0;
+        src += src_stride1;
+        dst += shape0*dst_stride;
+    }
+
+    /* If it's 2-dimensional, there's no more to copy */
+    if (ndim == 2) {
+        return count;
+    }
+
+    /* General-case loop for everything else */
+    else {
+        /* Iteration structure for dimensions 2 and up */
+        struct {
+            npy_intp coord, shape, src_stride;
+        } it[NPY_MAXDIMS];
+
+        /* Copy the coordinates and shape */
+        coords += 2*coords_inc;
+        shape += 2*shape_inc;
+        src_strides += 2*src_strides_inc;
+        for (i = 0; i < ndim-2; ++i) {
+            it[i].coord = coords[0];
+            it[i].shape = shape[0];
+            it[i].src_stride = src_strides[0];
+            coords += coords_inc;
+            shape += shape_inc;
+            src_strides += src_strides_inc;
+        }
+
+        for (;;) {
+            /* Adjust the src pointer from the dimension 0 and 1 loop */
+            src = src - shape1*src_stride1;
+
+            /*
+             * Increment to the next coordinate; a dimension that rolls
+             * over is reset to 0 and the carry moves on to the next one.
+             */
+            for (i = 0; i < ndim-2; ++i) {
+                src += it[i].src_stride;
+                if (++it[i].coord >= it[i].shape) {
+                    it[i].coord = 0;
+                    src -= it[i].src_stride*it[i].shape;
+                }
+                else {
+                    break;
+                }
+            }
+            /* If the last dimension rolled over, we're done */
+            if (i == ndim-2) {
+                return count;
+            }
+
+            /* A loop for dimensions 0 and 1 */
+            for (i = 0; i < shape1; ++i) {
+                args[0] = src; args[1] = dst;
+                if (shape0 >= count) {
+                    return cast_info->func(&cast_info->context,
+                            args, &count, strides, cast_info->auxdata);
+                }
+                else {
+                    res = cast_info->func(&cast_info->context,
+                            args, &shape0, strides, cast_info->auxdata);
+                    if (res < 0) {
+                        return -1;
+                    }
+                }
+                count -= shape0;
+                src += src_stride1;
+                dst += shape0*dst_stride;
+            }
+        }
+    }
+}
+
+/*
+ * Transfer up to `count` elements from a one-dimensional strided source
+ * into an N-dimensional destination, starting at position `coords`.
+ *
+ * See documentation of arguments in lowlevel_strided_loops.h.  What this
+ * implementation does with them:
+ *   - `coords`, `shape` and `dst_strides` are read one dimension at a
+ *     time, stepping by `coords_inc`, `shape_inc` and `dst_strides_inc`.
+ *   - every run along dimension 0 is handed to `cast_info->func` with the
+ *     stride pair {src_stride, dst_strides[0]}.
+ *   - `src_itemsize` is not used by this function.
+ *
+ * Returns the result of `cast_info->func` when the remaining `count` fits
+ * into the current dimension-0 run, -1 as soon as `cast_info->func`
+ * reports a negative value for a full run, and otherwise the number of
+ * elements still outstanding once the iteration space is exhausted.
+ */
+NPY_NO_EXPORT npy_intp
+PyArray_TransferStridedToNDim(npy_intp ndim,
+                char *dst, npy_intp const *dst_strides, npy_intp dst_strides_inc,
+                char *src, npy_intp src_stride,
+                npy_intp const *coords, npy_intp coords_inc,
+                npy_intp const *shape, npy_intp shape_inc,
+                npy_intp count, npy_intp src_itemsize,
+                NPY_cast_info *cast_info)
+{
+    npy_intp i, M, N, coord0, shape0, dst_stride0, coord1, shape1, dst_stride1;
+
+    /* Finish off dimension 0 */
+    coord0 = coords[0];
+    shape0 = shape[0];
+    dst_stride0 = dst_strides[0];
+    /* Elements left in the current dimension-0 run, counted from coord0 */
+    N = shape0 - coord0;
+
+    /* The same stride pair is passed to every transfer call below */
+    npy_intp strides[2] = {src_stride, dst_stride0};
+
+    char *args[2] = {src, dst};
+    if (N >= count) {
+        /* The whole request fits into this run: a single call finishes it */
+        return cast_info->func(&cast_info->context,
+                args, &count, strides, cast_info->auxdata);
+    }
+    int res = cast_info->func(&cast_info->context,
+            args, &N, strides, cast_info->auxdata);
+    if (res < 0) {
+        return -1;
+    }
+    count -= N;
+
+    /* If it's 1-dimensional, there's no more to copy */
+    if (ndim == 1) {
+        return count;
+    }
+
+    /*
+     * Adjust the src and dst pointers: dst is moved back by coord0 steps
+     * along dimension 0 and forward by one step along dimension 1, src
+     * moves past the N elements just consumed.
+     */
+    coord1 = (coords + coords_inc)[0];
+    shape1 = (shape + shape_inc)[0];
+    dst_stride1 = (dst_strides + dst_strides_inc)[0];
+    dst = dst - coord0*dst_stride0 + dst_stride1;
+    src += N*src_stride;
+
+    /* Finish off dimension 1: M full dimension-0 runs remain in it */
+    M = (shape1 - coord1 - 1);
+    for (i = 0; i < M; ++i) {
+        args[0] = src; args[1] = dst;
+        if (shape0 >= count) {
+            return cast_info->func(&cast_info->context,
+                    args, &count, strides, cast_info->auxdata);
+        }
+        else {
+            res = cast_info->func(&cast_info->context,
+                    args, &shape0, strides, cast_info->auxdata);
+            if (res < 0) {
+                return -1;
+            }
+        }
+        count -= shape0;
+        dst += dst_stride1;
+        src += shape0*src_stride;
+    }
+
+    /* If it's 2-dimensional, there's no more to copy */
+    if (ndim == 2) {
+        return count;
+    }
+
+    /* General-case loop for everything else */
+    else {
+        /* Iteration structure for dimensions 2 and up */
+        struct {
+            npy_intp coord, shape, dst_stride;
+        } it[NPY_MAXDIMS];
+
+        /* Copy the coordinates and shape */
+        coords += 2*coords_inc;
+        shape += 2*shape_inc;
+        dst_strides += 2*dst_strides_inc;
+        for (i = 0; i < ndim-2; ++i) {
+            it[i].coord = coords[0];
+            it[i].shape = shape[0];
+            it[i].dst_stride = dst_strides[0];
+            coords += coords_inc;
+            shape += shape_inc;
+            dst_strides += dst_strides_inc;
+        }
+
+        for (;;) {
+            /* Adjust the dst pointer from the dimension 0 and 1 loop */
+            dst = dst - shape1*dst_stride1;
+
+            /*
+             * Increment to the next coordinate; a dimension that rolls
+             * over is reset to 0 and the carry moves on to the next one.
+             */
+            for (i = 0; i < ndim-2; ++i) {
+                dst += it[i].dst_stride;
+                if (++it[i].coord >= it[i].shape) {
+                    it[i].coord = 0;
+                    dst -= it[i].dst_stride*it[i].shape;
+                }
+                else {
+                    break;
+                }
+            }
+            /* If the last dimension rolled over, we're done */
+            if (i == ndim-2) {
+                return count;
+            }
+
+            /* A loop for dimensions 0 and 1 */
+            for (i = 0; i < shape1; ++i) {
+                args[0] = src; args[1] = dst;
+                if (shape0 >= count) {
+                    return cast_info->func(&cast_info->context,
+                            args, &count, strides, cast_info->auxdata);
+                }
+                else {
+                    res = cast_info->func(&cast_info->context,
+                            args, &shape0, strides, cast_info->auxdata);
+                    if (res < 0) {
+                        return -1;
+                    }
+                }
+                count -= shape0;
+                dst += dst_stride1;
+                src += shape0*src_stride;
+            }
+        }
+    }
+}
+
+/*
+ * Masked variant of the strided-to-N-dimensional transfer: moves up to
+ * `count` elements from a one-dimensional strided source into an
+ * N-dimensional destination, starting at position `coords`, passing a
+ * mask along with every transfer call.
+ *
+ * See documentation of arguments in lowlevel_strided_loops.h.  What this
+ * implementation does with them:
+ *   - `cast_info->func` is called through a PyArray_MaskedStridedUnaryOp
+ *     pointer, receiving `mask` and `mask_stride` in addition to the
+ *     usual arguments.
+ *   - `coords`, `shape` and `dst_strides` are read one dimension at a
+ *     time, stepping by `coords_inc`, `shape_inc` and `dst_strides_inc`.
+ *   - `mask` advances in lockstep with `src` (same element counts, scaled
+ *     by `mask_stride`).
+ *   - `src_itemsize` is not used by this function.
+ *
+ * Returns the result of the transfer function when the remaining `count`
+ * fits into the current dimension-0 run, -1 as soon as the transfer
+ * function reports a negative value for a full run, and otherwise the
+ * number of elements still outstanding once the iteration space is
+ * exhausted.
+ */
+NPY_NO_EXPORT npy_intp
+PyArray_TransferMaskedStridedToNDim(npy_intp ndim,
+                char *dst, npy_intp const *dst_strides, npy_intp dst_strides_inc,
+                char *src, npy_intp src_stride,
+                npy_uint8 *mask, npy_intp mask_stride,
+                npy_intp const *coords, npy_intp coords_inc,
+                npy_intp const *shape, npy_intp shape_inc,
+                npy_intp count, npy_intp src_itemsize,
+                NPY_cast_info *cast_info)
+{
+    npy_intp i, M, N, coord0, shape0, dst_stride0, coord1, shape1, dst_stride1;
+    PyArray_MaskedStridedUnaryOp *stransfer =
+            (PyArray_MaskedStridedUnaryOp*)cast_info->func;
+
+    /* Finish off dimension 0 */
+    coord0 = coords[0];
+    shape0 = shape[0];
+    dst_stride0 = dst_strides[0];
+    /* Elements left in the current dimension-0 run, counted from coord0 */
+    N = shape0 - coord0;
+
+    /* The same stride pair is passed to every transfer call below */
+    npy_intp strides[2] = {src_stride, dst_stride0};
+
+    char *args[2] = {src, dst};
+    if (N >= count) {
+        /* The whole request fits into this run: a single call finishes it */
+        return stransfer(&cast_info->context,
+                args, &count, strides, mask, mask_stride, cast_info->auxdata);
+    }
+    int res = stransfer(&cast_info->context,
+            args, &N, strides, mask, mask_stride, cast_info->auxdata);
+    if (res < 0) {
+        return -1;
+    }
+    count -= N;
+
+    /* If it's 1-dimensional, there's no more to copy */
+    if (ndim == 1) {
+        return count;
+    }
+
+    /*
+     * Adjust the src and dst pointers: dst is moved back by coord0 steps
+     * along dimension 0 and forward by one step along dimension 1, src
+     * and mask move past the N elements just consumed.
+     */
+    coord1 = (coords + coords_inc)[0];
+    shape1 = (shape + shape_inc)[0];
+    dst_stride1 = (dst_strides + dst_strides_inc)[0];
+    dst = dst - coord0*dst_stride0 + dst_stride1;
+    src += N*src_stride;
+    mask += N*mask_stride;
+
+    /* Finish off dimension 1: M full dimension-0 runs remain in it */
+    M = (shape1 - coord1 - 1);
+    for (i = 0; i < M; ++i) {
+        args[0] = src; args[1] = dst;
+        if (shape0 >= count) {
+            return stransfer(&cast_info->context,
+                    args, &count, strides,
+                    mask, mask_stride, cast_info->auxdata);
+        }
+        else {
+            res = stransfer(&cast_info->context,
+                    args, &shape0, strides,
+                    mask, mask_stride, cast_info->auxdata);
+            if (res < 0) {
+                return -1;
+            }
+        }
+        count -= shape0;
+        dst += dst_stride1;
+        src += shape0*src_stride;
+        mask += shape0*mask_stride;
+    }
+
+    /* If it's 2-dimensional, there's no more to copy */
+    if (ndim == 2) {
+        return count;
+    }
+
+    /* General-case loop for everything else */
+    else {
+        /* Iteration structure for dimensions 2 and up */
+        struct {
+            npy_intp coord, shape, dst_stride;
+        } it[NPY_MAXDIMS];
+
+        /* Copy the coordinates and shape */
+        coords += 2*coords_inc;
+        shape += 2*shape_inc;
+        dst_strides += 2*dst_strides_inc;
+        for (i = 0; i < ndim-2; ++i) {
+            it[i].coord = coords[0];
+            it[i].shape = shape[0];
+            it[i].dst_stride = dst_strides[0];
+            coords += coords_inc;
+            shape += shape_inc;
+            dst_strides += dst_strides_inc;
+        }
+
+        for (;;) {
+            /* Adjust the dst pointer from the dimension 0 and 1 loop */
+            dst = dst - shape1*dst_stride1;
+
+            /*
+             * Increment to the next coordinate; a dimension that rolls
+             * over is reset to 0 and the carry moves on to the next one.
+             */
+            for (i = 0; i < ndim-2; ++i) {
+                dst += it[i].dst_stride;
+                if (++it[i].coord >= it[i].shape) {
+                    it[i].coord = 0;
+                    dst -= it[i].dst_stride*it[i].shape;
+                }
+                else {
+                    break;
+                }
+            }
+            /* If the last dimension rolled over, we're done */
+            if (i == ndim-2) {
+                return count;
+            }
+
+            /* A loop for dimensions 0 and 1 */
+            for (i = 0; i < shape1; ++i) {
+                args[0] = src; args[1] = dst;
+                if (shape0 >= count) {
+                    return stransfer(&cast_info->context,
+                            args, &count, strides, mask,
+                            mask_stride, cast_info->auxdata);
+                }
+                else {
+                    res = stransfer(&cast_info->context,
+                            args, &shape0, strides,
+                            mask, mask_stride, cast_info->auxdata);
+                    if (res < 0) {
+                        return -1;
+                    }
+                }
+                count -= shape0;
+                dst += dst_stride1;
+                src += shape0*src_stride;
+                mask += shape0*mask_stride;
+            }
+        }
+    }
+}
+
+
+/***************************************************************************/
+/****************** MapIter (Advanced indexing) Get/Set ********************/
+/***************************************************************************/
+
+typedef struct {npy_uint64 a; npy_uint64 b;} copytype128;  /* pair of npy_uint64 words; presumably the copy unit for 16-byte items - TODO confirm at the use sites */
+
+#line 1449
+
+/*
+ * Advanced indexing iteration of arrays when there is a single indexing
+ * array which has the same memory order as the value array and both
+ * can be trivially iterated (single stride, aligned, no casting necessary).
+ */
+NPY_NO_EXPORT int
+mapiter_trivial_set(
+        PyArrayObject *self, PyArrayObject *ind, PyArrayObject *result,
+        int is_aligned, NPY_cast_info *cast_info)
+{
+    char *base_ptr, *ind_ptr, *result_ptr;
+    npy_intp self_stride, ind_stride, result_stride;
+    npy_intp fancy_dim = PyArray_DIM(self, 0);
+
+    npy_intp itersize;
+
+    /* copying between the same dtype, we can assume this is correct: */
+    int needs_api = PyDataType_REFCHK(PyArray_DESCR(self));
+    npy_intp itemsize = PyArray_ITEMSIZE(self);
+    npy_intp strides[2] = {itemsize, itemsize};
+    npy_intp one = 1;
+
+    NPY_BEGIN_THREADS_DEF;
+
+    base_ptr = PyArray_BYTES(self);
+    self_stride = PyArray_STRIDE(self, 0);
+
+    PyArray_PREPARE_TRIVIAL_PAIR_ITERATION(ind, result, itersize,
+                                           ind_ptr, result_ptr,
+                                           ind_stride, result_stride)
+
+    if (!needs_api) {
+        NPY_BEGIN_THREADS_THRESHOLDED(PyArray_SIZE(ind));
+    }
+#if !0
+    /* Check the indices beforehand */
+    while (itersize--) {
+        npy_intp indval = *((npy_intp*)ind_ptr);
+        if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
+            return -1;
+        }
+        ind_ptr += ind_stride;
+    }
+
+    /*
+     * Reset the ind_ptr and itersize, due to broadcasting it is always
+     * the size of ind.
+     */
+    ind_ptr = PyArray_BYTES(ind);
+    itersize = PyArray_SIZE(ind);
+#endif
+
+    /* Optimization for aligned types that do not need the api */
+    switch ((is_aligned && !needs_api) ? itemsize : 0) {
+
+#line 1509
+
+#if 1
+    case 1:
+#else
+    default:
+#endif
+        while (itersize--) {
+            char * self_ptr;
+            npy_intp indval = *((npy_intp*)ind_ptr);
+            assert(npy_is_aligned(ind_ptr, NPY_ALIGNOF_UINT(npy_intp)));
+#if 0
+            if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
+                return -1;
+            }
+#else
+            if (indval < 0) {
+                indval += fancy_dim;
+            }
+#endif
+            self_ptr = base_ptr + indval * self_stride;
+
+#if 0
+#if 1
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(npy_uint8)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(npy_uint8)));
+            *(npy_uint8 *)result_ptr = *(npy_uint8 *)self_ptr;
+#else
+            char *args[2] = {self_ptr, result_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+
+#else /* !0 */
+#if 1
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(npy_uint8)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(npy_uint8)));
+            *(npy_uint8 *)self_ptr = *(npy_uint8 *)result_ptr;
+#else
+            char *args[2] = {result_ptr, self_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+#endif
+
+            ind_ptr += ind_stride;
+            result_ptr += result_stride;
+        }
+        break;
+
+
+#line 1509
+
+#if 2
+    case 2:
+#else
+    default:
+#endif
+        while (itersize--) {
+            char * self_ptr;
+            npy_intp indval = *((npy_intp*)ind_ptr);
+            assert(npy_is_aligned(ind_ptr, NPY_ALIGNOF_UINT(npy_intp)));
+#if 0
+            if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
+                return -1;
+            }
+#else
+            if (indval < 0) {
+                indval += fancy_dim;
+            }
+#endif
+            self_ptr = base_ptr + indval * self_stride;
+
+#if 0
+#if 2
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(npy_uint16)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(npy_uint16)));
+            *(npy_uint16 *)result_ptr = *(npy_uint16 *)self_ptr;
+#else
+            char *args[2] = {self_ptr, result_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+
+#else /* !0 */
+#if 2
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(npy_uint16)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(npy_uint16)));
+            *(npy_uint16 *)self_ptr = *(npy_uint16 *)result_ptr;
+#else
+            char *args[2] = {result_ptr, self_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+#endif
+
+            ind_ptr += ind_stride;
+            result_ptr += result_stride;
+        }
+        break;
+
+
+#line 1509
+
+#if 4
+    case 4:
+#else
+    default:
+#endif
+        while (itersize--) {
+            char * self_ptr;
+            npy_intp indval = *((npy_intp*)ind_ptr);
+            assert(npy_is_aligned(ind_ptr, NPY_ALIGNOF_UINT(npy_intp)));
+#if 0
+            if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
+                return -1;
+            }
+#else
+            if (indval < 0) {
+                indval += fancy_dim;
+            }
+#endif
+            self_ptr = base_ptr + indval * self_stride;
+
+#if 0
+#if 4
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(npy_uint32)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(npy_uint32)));
+            *(npy_uint32 *)result_ptr = *(npy_uint32 *)self_ptr;
+#else
+            char *args[2] = {self_ptr, result_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+
+#else /* !0 */
+#if 4
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(npy_uint32)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(npy_uint32)));
+            *(npy_uint32 *)self_ptr = *(npy_uint32 *)result_ptr;
+#else
+            char *args[2] = {result_ptr, self_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+#endif
+
+            ind_ptr += ind_stride;
+            result_ptr += result_stride;
+        }
+        break;
+
+
+#line 1509
+
+#if 8
+    case 8:
+#else
+    default:
+#endif
+        while (itersize--) {
+            char * self_ptr;
+            npy_intp indval = *((npy_intp*)ind_ptr);
+            assert(npy_is_aligned(ind_ptr, NPY_ALIGNOF_UINT(npy_intp)));
+#if 0
+            if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
+                return -1;
+            }
+#else
+            if (indval < 0) {
+                indval += fancy_dim;
+            }
+#endif
+            self_ptr = base_ptr + indval * self_stride;
+
+#if 0
+#if 8
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(npy_uint64)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(npy_uint64)));
+            *(npy_uint64 *)result_ptr = *(npy_uint64 *)self_ptr;
+#else
+            char *args[2] = {self_ptr, result_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+
+#else /* !0 */
+#if 8
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(npy_uint64)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(npy_uint64)));
+            *(npy_uint64 *)self_ptr = *(npy_uint64 *)result_ptr;
+#else
+            char *args[2] = {result_ptr, self_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+#endif
+
+            ind_ptr += ind_stride;
+            result_ptr += result_stride;
+        }
+        break;
+
+
+#line 1509
+
+#if 16
+    case 16:
+#else
+    default:
+#endif
+        while (itersize--) {
+            char * self_ptr;
+            npy_intp indval = *((npy_intp*)ind_ptr);
+            assert(npy_is_aligned(ind_ptr, NPY_ALIGNOF_UINT(npy_intp)));
+#if 0
+            if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
+                return -1;
+            }
+#else
+            if (indval < 0) {
+                indval += fancy_dim;
+            }
+#endif
+            self_ptr = base_ptr + indval * self_stride;
+
+#if 0
+#if 16
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(copytype128)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(copytype128)));
+            *(copytype128 *)result_ptr = *(copytype128 *)self_ptr;
+#else
+            char *args[2] = {self_ptr, result_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+
+#else /* !0 */
+#if 16
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(copytype128)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(copytype128)));
+            *(copytype128 *)self_ptr = *(copytype128 *)result_ptr;
+#else
+            char *args[2] = {result_ptr, self_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+#endif
+
+            ind_ptr += ind_stride;
+            result_ptr += result_stride;
+        }
+        break;
+
+
+#line 1509
+
+#if 0
+    case 0:
+#else
+    default:
+#endif
+        while (itersize--) {
+            char * self_ptr;
+            npy_intp indval = *((npy_intp*)ind_ptr);
+            assert(npy_is_aligned(ind_ptr, NPY_ALIGNOF_UINT(npy_intp)));
+#if 0
+            if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
+                return -1;
+            }
+#else
+            if (indval < 0) {
+                indval += fancy_dim;
+            }
+#endif
+            self_ptr = base_ptr + indval * self_stride;
+
+#if 0
+#if 0
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(0)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(0)));
+            *(0 *)result_ptr = *(0 *)self_ptr;
+#else
+            char *args[2] = {self_ptr, result_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+
+#else /* !0 */
+#if 0
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(0)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(0)));
+            *(0 *)self_ptr = *(0 *)result_ptr;
+#else
+            char *args[2] = {result_ptr, self_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+#endif
+
+            ind_ptr += ind_stride;
+            result_ptr += result_stride;
+        }
+        break;
+
+
+    }
+
+    NPY_END_THREADS;
+
+    return 0;
+}
+
+
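+/*
+ * General advanced indexing iteration, assignment ("set") direction:
+ * the index operands are resolved to a location inside the indexed array
+ * (negative indices are wrapped by adding the dimension length) and the
+ * value of the extra operand is written to that location.
+ */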
+NPY_NO_EXPORT int
+mapiter_set(
+        PyArrayMapIterObject *mit, NPY_cast_info *cast_info,
+        NPY_ARRAYMETHOD_FLAGS flags, int is_aligned)
+{
+    npy_intp *counter, count;
+    int i;
+
+    /* Cached mit info */
+    int numiter = mit->numiter;
+    int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0;
+    /* Constant information */
+    npy_intp fancy_dims[NPY_MAXDIMS];
+    npy_intp fancy_strides[NPY_MAXDIMS];
+#if 0
+    int iteraxis;
+#endif
+
+    char *baseoffset = mit->baseoffset;
+    char **outer_ptrs = mit->outer_ptrs;
+    npy_intp *outer_strides = mit->outer_strides;
+    PyArrayObject *array= mit->array;
+
+    /* Fill constant information */
+#if 0
+    iteraxis = mit->iteraxes[0];
+#endif
+    for (i = 0; i < numiter; i++) {
+        fancy_dims[i] = mit->fancy_dims[i];
+        fancy_strides[i] = mit->fancy_strides[i];
+    }
+
+    if (mit->size == 0) {
+       return 0;
+    }
+
+    if (mit->subspace_iter == NULL) {
+        /*
+         * Item by item copy situation, the operand is buffered
+         * so use a cast to copy.  The iterator may not do any transfers, so may
+         * not have set `needs_api` yet, set it if necessary:
+         */
+        needs_api |= PyDataType_REFCHK(PyArray_DESCR(array));
+        npy_intp itemsize = PyArray_ITEMSIZE(array);
+        npy_intp strides[2] = {itemsize, itemsize};
+        npy_intp one = 1;
+
+        /* We have only one iterator handling everything */
+        counter = NpyIter_GetInnerLoopSizePtr(mit->outer);
+
+        /************ Optimized inner loops without subspace *************/
+
+#line 1634
+
+#if 1
+        if (numiter == 1) {
+#else
+        else {
+#endif
+            NPY_BEGIN_THREADS_DEF;
+            if (!needs_api) {
+                NPY_BEGIN_THREADS;
+            }
+
+            /* Optimization for aligned types that do not need the api */
+            switch ((is_aligned && !needs_api) ? itemsize : 0) {
+
+#line 1652
+
+#if 1
+            case 1:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !0
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < 1; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 0 && 1
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 0
+#if 1
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(npy_uint8)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(npy_uint8)));
+                        *(npy_uint8 *)(outer_ptrs[i]) = *(npy_uint8 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !0 */
+#if 1
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(npy_uint8)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(npy_uint8)));
+                        *(npy_uint8 *)self_ptr = *(npy_uint8 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 2
+            case 2:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !0
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < 1; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 0 && 1
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 0
+#if 2
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(npy_uint16)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(npy_uint16)));
+                        *(npy_uint16 *)(outer_ptrs[i]) = *(npy_uint16 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !0 */
+#if 2
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(npy_uint16)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(npy_uint16)));
+                        *(npy_uint16 *)self_ptr = *(npy_uint16 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 4
+            case 4:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !0
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < 1; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 0 && 1
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 0
+#if 4
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(npy_uint32)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(npy_uint32)));
+                        *(npy_uint32 *)(outer_ptrs[i]) = *(npy_uint32 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !0 */
+#if 4
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(npy_uint32)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(npy_uint32)));
+                        *(npy_uint32 *)self_ptr = *(npy_uint32 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 8
+            case 8:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !0
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < 1; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 0 && 1
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 0
+#if 8
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(npy_uint64)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(npy_uint64)));
+                        *(npy_uint64 *)(outer_ptrs[i]) = *(npy_uint64 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !0 */
+#if 8
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(npy_uint64)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(npy_uint64)));
+                        *(npy_uint64 *)self_ptr = *(npy_uint64 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 16
+            case 16:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !0
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < 1; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 0 && 1
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 0
+#if 16
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(copytype128)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(copytype128)));
+                        *(copytype128 *)(outer_ptrs[i]) = *(copytype128 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !0 */
+#if 16
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(copytype128)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(copytype128)));
+                        *(copytype128 *)self_ptr = *(copytype128 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 0
+            case 0:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !0
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < 1; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 0 && 1
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 0
+#if 0
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(0)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(0)));
+                        *(0 *)(outer_ptrs[i]) = *(0 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !0 */
+#if 0
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(0)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(0)));
+                        *(0 *)self_ptr = *(0 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+            }
+            NPY_END_THREADS;
+        }
+
+#line 1634
+
+#if 0
+        if (numiter == 1) {
+#else
+        else {
+#endif
+            NPY_BEGIN_THREADS_DEF;
+            if (!needs_api) {
+                NPY_BEGIN_THREADS;
+            }
+
+            /* Optimization for aligned types that do not need the api */
+            switch ((is_aligned && !needs_api) ? itemsize : 0) {
+
+#line 1652
+
+#if 1
+            case 1:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !0
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < numiter; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 0 && 0
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 0
+#if 1
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(npy_uint8)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(npy_uint8)));
+                        *(npy_uint8 *)(outer_ptrs[i]) = *(npy_uint8 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !0 */
+#if 1
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(npy_uint8)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(npy_uint8)));
+                        *(npy_uint8 *)self_ptr = *(npy_uint8 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 2
+            case 2:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !0
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < numiter; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 0 && 0
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 0
+#if 2
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(npy_uint16)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(npy_uint16)));
+                        *(npy_uint16 *)(outer_ptrs[i]) = *(npy_uint16 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !0 */
+#if 2
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(npy_uint16)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(npy_uint16)));
+                        *(npy_uint16 *)self_ptr = *(npy_uint16 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 4
+            case 4:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !0
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < numiter; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 0 && 0
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 0
+#if 4
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(npy_uint32)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(npy_uint32)));
+                        *(npy_uint32 *)(outer_ptrs[i]) = *(npy_uint32 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !0 */
+#if 4
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(npy_uint32)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(npy_uint32)));
+                        *(npy_uint32 *)self_ptr = *(npy_uint32 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 8
+            case 8:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !0
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < numiter; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 0 && 0
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 0
+#if 8
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(npy_uint64)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(npy_uint64)));
+                        *(npy_uint64 *)(outer_ptrs[i]) = *(npy_uint64 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !0 */
+#if 8
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(npy_uint64)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(npy_uint64)));
+                        *(npy_uint64 *)self_ptr = *(npy_uint64 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 16
+            case 16:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !0
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < numiter; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 0 && 0
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 0
+#if 16
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(copytype128)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(copytype128)));
+                        *(copytype128 *)(outer_ptrs[i]) = *(copytype128 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !0 */
+#if 16
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(copytype128)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(copytype128)));
+                        *(copytype128 *)self_ptr = *(copytype128 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 0
+            case 0:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !0
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < numiter; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 0 && 0
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 0
+#if 0
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(0)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(0)));
+                        *(0 *)(outer_ptrs[i]) = *(0 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !0 */
+#if 0
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(0)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(0)));
+                        *(0 *)self_ptr = *(0 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+            }
+            NPY_END_THREADS;
+        }
+
+    }
+
+    /******************* Nested Iteration Situation *******************/
+    else {
+        char *subspace_baseptrs[2];
+        char **subspace_ptrs = mit->subspace_ptrs;
+        npy_intp *subspace_strides = mit->subspace_strides;
+        int is_subiter_trivial = 0; /* has three states */
+        npy_intp reset_offsets[2] = {0, 0};
+
+        /* Note: it may make sense to refactor `needs_api` out in this branch */
+        if (flags & NPY_METH_REQUIRES_PYAPI) {
+            needs_api = 1;
+        }
+
+        counter = NpyIter_GetInnerLoopSizePtr(mit->subspace_iter);
+        if (*counter == PyArray_SIZE(mit->subspace)) {
+           /*
+            * subspace is trivially iterable.
+            * manipulate pointers to avoid expensive resetting
+            */
+            is_subiter_trivial = 1;
+        }
+#line 1765
+
+#if 1
+        if (numiter == 1) {
+#else
+        else {
+#endif
+            NPY_BEGIN_THREADS_DEF;
+            if (!needs_api) {
+                NPY_BEGIN_THREADS;
+            }
+
+            /* Outer iteration (safe because mit->size != 0) */
+            do {
+                char * self_ptr = baseoffset;
+                for (i=0; i < 1; i++) {
+                    npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+
+#if 0 && 1
+                    if (check_and_adjust_index(&indval, fancy_dims[i],
+                                               iteraxis, _save) < 0 ) {
+                        return -1;
+                    }
+#else
+                    if (indval < 0) {
+                        indval += fancy_dims[i];
+                    }
+#endif
+
+                    self_ptr += indval * fancy_strides[i];
+                }
+
+                /*
+                 * Resetting is slow, so try to avoid resetting
+                 * if subspace iteration is trivial.
+                 * Watch out: reset_offsets are kept outside of the loop,
+                 * assuming the subspaces of different external iterations
+                 * share the same structure.
+                 */
+                if (is_subiter_trivial <= 1) {
+                    /* slower resetting: first iteration or non-trivial subspace */
+
+                    char * errmsg = NULL;
+                    subspace_baseptrs[0] = self_ptr;
+                    subspace_baseptrs[1] = mit->extra_op_ptrs[0];
+
+                    /* (can't really fail, since no buffering necessary) */
+                    if (!NpyIter_ResetBasePointers(mit->subspace_iter,
+                                                   subspace_baseptrs,
+                                                   &errmsg)) {
+                        NPY_END_THREADS;
+                        PyErr_SetString(PyExc_ValueError, errmsg);
+                        return -1;
+                    }
+                    if (is_subiter_trivial != 0) {
+                        /* reset_offsets are nonzero for negative strides.*/
+                        reset_offsets[0] = subspace_ptrs[0] - self_ptr;
+                        reset_offsets[1] = subspace_ptrs[1] - mit->extra_op_ptrs[0];
+
+                        /* use the faster adjustment further on */
+                        is_subiter_trivial ++;
+                    }
+                }
+                else {
+                    /*
+                     * faster resetting if the subspace iteration is trivial.
+                     * reset_offsets are zero for positive strides,
+                     * for negative strides this shifts the pointer to the last
+                     * item.
+                     */
+                    subspace_ptrs[0] = self_ptr + reset_offsets[0];
+                    subspace_ptrs[1] = mit->extra_op_ptrs[0] + reset_offsets[1];
+                }
+
+#if !0
+                /*
+                 * When the API is needed the casting might fail
+                 * TODO: Could only check if casting is unsafe, or even just
+                 *       not at all...
+                 */
+                if (needs_api && PyErr_Occurred()) {
+                    return -1;
+                }
+#endif
+
+                do {
+
+#if 0
+                    if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                            subspace_ptrs, counter, subspace_strides,
+                            cast_info->auxdata) < 0)) {
+                        NPY_END_THREADS;
+                        return -1;
+                    }
+#else
+                    /* The operand order is reversed here */
+                    char *args[2] = {subspace_ptrs[1], subspace_ptrs[0]};
+                    npy_intp strides[2] = {subspace_strides[1], subspace_strides[0]};
+                    if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                            args, counter, strides, cast_info->auxdata) < 0)) {
+                        NPY_END_THREADS;
+                        return -1;
+                    }
+#endif
+                } while (mit->subspace_next(mit->subspace_iter));
+
+                mit->extra_op_next(mit->extra_op_iter);
+            } while (mit->outer_next(mit->outer));
+            NPY_END_THREADS;
+        }
+
+#line 1765
+
+#if 0
+        if (numiter == 1) {
+#else
+        else {
+#endif
+            NPY_BEGIN_THREADS_DEF;
+            if (!needs_api) {
+                NPY_BEGIN_THREADS;
+            }
+
+            /* Outer iteration (safe because mit->size != 0) */
+            do {
+                char * self_ptr = baseoffset;
+                for (i=0; i < numiter; i++) {
+                    npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+
+#if 0 && 0
+                    if (check_and_adjust_index(&indval, fancy_dims[i],
+                                               iteraxis, _save) < 0 ) {
+                        return -1;
+                    }
+#else
+                    if (indval < 0) {
+                        indval += fancy_dims[i];
+                    }
+#endif
+
+                    self_ptr += indval * fancy_strides[i];
+                }
+
+                /*
+                 * Resetting is slow, so try to avoid resetting
+                 * if subspace iteration is trivial.
+                 * Watch out: reset_offsets are kept outside of the loop,
+                 * assuming the subspaces of different external iterations
+                 * share the same structure.
+                 */
+                if (is_subiter_trivial <= 1) {
+                    /* slower resetting: first iteration or non-trivial subspace */
+
+                    char * errmsg = NULL;
+                    subspace_baseptrs[0] = self_ptr;
+                    subspace_baseptrs[1] = mit->extra_op_ptrs[0];
+
+                    /* (can't really fail, since no buffering necessary) */
+                    if (!NpyIter_ResetBasePointers(mit->subspace_iter,
+                                                   subspace_baseptrs,
+                                                   &errmsg)) {
+                        NPY_END_THREADS;
+                        PyErr_SetString(PyExc_ValueError, errmsg);
+                        return -1;
+                    }
+                    if (is_subiter_trivial != 0) {
+                        /* reset_offsets are nonzero for negative strides.*/
+                        reset_offsets[0] = subspace_ptrs[0] - self_ptr;
+                        reset_offsets[1] = subspace_ptrs[1] - mit->extra_op_ptrs[0];
+
+                        /* use the faster adjustment further on */
+                        is_subiter_trivial ++;
+                    }
+                }
+                else {
+                    /*
+                     * faster resetting if the subspace iteration is trivial.
+                     * reset_offsets are zero for positive strides,
+                     * for negative strides this shifts the pointer to the last
+                     * item.
+                     */
+                    subspace_ptrs[0] = self_ptr + reset_offsets[0];
+                    subspace_ptrs[1] = mit->extra_op_ptrs[0] + reset_offsets[1];
+                }
+
+#if !0
+                /*
+                 * When the API is needed the casting might fail
+                 * TODO: Could only check if casting is unsafe, or even just
+                 *       not at all...
+                 */
+                if (needs_api && PyErr_Occurred()) {
+                    return -1;
+                }
+#endif
+
+                do {
+
+#if 0
+                    if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                            subspace_ptrs, counter, subspace_strides,
+                            cast_info->auxdata) < 0)) {
+                        NPY_END_THREADS;
+                        return -1;
+                    }
+#else
+                    /* The operand order is reversed here */
+                    char *args[2] = {subspace_ptrs[1], subspace_ptrs[0]};
+                    npy_intp strides[2] = {subspace_strides[1], subspace_strides[0]};
+                    if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                            args, counter, strides, cast_info->auxdata) < 0)) {
+                        NPY_END_THREADS;
+                        return -1;
+                    }
+#endif
+                } while (mit->subspace_next(mit->subspace_iter));
+
+                mit->extra_op_next(mit->extra_op_iter);
+            } while (mit->outer_next(mit->outer));
+            NPY_END_THREADS;
+        }
+
+    }
+    return 0;
+}
+
+
+#line 1449
+
+/*
+ * Advanced-indexing iteration for the case of a single index array that
+ * has the same memory order as the value array, where both can be
+ * iterated trivially (single stride, aligned, no casting necessary).
+ */
+NPY_NO_EXPORT int
+mapiter_trivial_get(  /* gather: result[k] = self[ind[k]], indexing axis 0 of self */
+        PyArrayObject *self, PyArrayObject *ind, PyArrayObject *result,
+        int is_aligned, NPY_cast_info *cast_info)
+{   /* Returns 0 on success, -1 when an index check or the cast function fails. */
+    char *base_ptr, *ind_ptr, *result_ptr;
+    npy_intp self_stride, ind_stride, result_stride;
+    npy_intp fancy_dim = PyArray_DIM(self, 0);
+    /* fancy_dim: length of self along axis 0; every index is checked against it. */
+    npy_intp itersize;
+
+    /* copying between the same dtype, we can assume this is correct: */
+    int needs_api = PyDataType_REFCHK(PyArray_DESCR(self));
+    npy_intp itemsize = PyArray_ITEMSIZE(self);
+    npy_intp strides[2] = {itemsize, itemsize};
+    npy_intp one = 1;
+    /* `strides` and `one` are only consumed by the cast_info->func call in the default case. */
+    NPY_BEGIN_THREADS_DEF;
+    /* Base address and axis-0 stride of the array being indexed. */
+    base_ptr = PyArray_BYTES(self);
+    self_stride = PyArray_STRIDE(self, 0);
+    /* NOTE(review): the macro presumably sets itersize plus the ind/result pointers and strides - confirm. */
+    PyArray_PREPARE_TRIVIAL_PAIR_ITERATION(ind, result, itersize,
+                                           ind_ptr, result_ptr,
+                                           ind_stride, result_stride)
+    /* Threading macros are only started when the dtype needs no Python API (no references). */
+    if (!needs_api) {
+        NPY_BEGIN_THREADS_THRESHOLDED(PyArray_SIZE(ind));
+    }
+#if !1  /* evaluates to 0: the up-front index pre-check below is compiled out in this expansion */
+    /* Check the indices beforehand */
+    while (itersize--) {
+        npy_intp indval = *((npy_intp*)ind_ptr);
+        if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
+            return -1;
+        }
+        ind_ptr += ind_stride;
+    }
+
+    /*
+     * Reset the ind_ptr and itersize, due to broadcasting it is always
+     * the size of ind.
+     */
+    ind_ptr = PyArray_BYTES(ind);
+    itersize = PyArray_SIZE(ind);
+#endif
+
+    /* Optimization for aligned types that do not need the api */
+    switch ((is_aligned && !needs_api) ? itemsize : 0) {
+    /* Selector is itemsize when aligned and API-free, otherwise 0, which lands in `default`. */
+#line 1509
+    /* Item size 1: direct npy_uint8 load/store, the cast function is not used. */
+#if 1
+    case 1:
+#else
+    default:
+#endif
+        while (itersize--) {
+            char * self_ptr;
+            npy_intp indval = *((npy_intp*)ind_ptr);
+            assert(npy_is_aligned(ind_ptr, NPY_ALIGNOF_UINT(npy_intp)));
+#if 1  /* active: every index goes through check_and_adjust_index inside the copy loop */
+            if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
+                return -1;  /* NOTE(review): no NPY_END_THREADS here; presumably the helper handles _save - confirm */
+            }
+#else
+            if (indval < 0) {
+                indval += fancy_dim;
+            }
+#endif
+            self_ptr = base_ptr + indval * self_stride;  /* element `indval` along axis 0 of self */
+
+#if 1
+#if 1
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(npy_uint8)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(npy_uint8)));
+            *(npy_uint8 *)result_ptr = *(npy_uint8 *)self_ptr;
+#else
+            char *args[2] = {self_ptr, result_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+
+#else /* !1: inactive in this expansion; this branch copies the other way (result -> self) */
+#if 1
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(npy_uint8)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(npy_uint8)));
+            *(npy_uint8 *)self_ptr = *(npy_uint8 *)result_ptr;
+#else
+            char *args[2] = {result_ptr, self_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+#endif
+
+            ind_ptr += ind_stride;
+            result_ptr += result_stride;
+        }
+        break;
+
+
+#line 1509
+    /* Item size 2: same loop as above with a direct npy_uint16 load/store. */
+#if 2
+    case 2:
+#else
+    default:
+#endif
+        while (itersize--) {
+            char * self_ptr;
+            npy_intp indval = *((npy_intp*)ind_ptr);
+            assert(npy_is_aligned(ind_ptr, NPY_ALIGNOF_UINT(npy_intp)));
+#if 1
+            if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
+                return -1;
+            }
+#else
+            if (indval < 0) {
+                indval += fancy_dim;
+            }
+#endif
+            self_ptr = base_ptr + indval * self_stride;
+
+#if 1
+#if 2
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(npy_uint16)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(npy_uint16)));
+            *(npy_uint16 *)result_ptr = *(npy_uint16 *)self_ptr;
+#else
+            char *args[2] = {self_ptr, result_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+
+#else /* !1: inactive in this expansion; this branch copies the other way (result -> self) */
+#if 2
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(npy_uint16)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(npy_uint16)));
+            *(npy_uint16 *)self_ptr = *(npy_uint16 *)result_ptr;
+#else
+            char *args[2] = {result_ptr, self_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+#endif
+
+            ind_ptr += ind_stride;
+            result_ptr += result_stride;
+        }
+        break;
+
+
+#line 1509
+    /* Item size 4: same loop with a direct npy_uint32 load/store. */
+#if 4
+    case 4:
+#else
+    default:
+#endif
+        while (itersize--) {
+            char * self_ptr;
+            npy_intp indval = *((npy_intp*)ind_ptr);
+            assert(npy_is_aligned(ind_ptr, NPY_ALIGNOF_UINT(npy_intp)));
+#if 1
+            if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
+                return -1;
+            }
+#else
+            if (indval < 0) {
+                indval += fancy_dim;
+            }
+#endif
+            self_ptr = base_ptr + indval * self_stride;
+
+#if 1
+#if 4
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(npy_uint32)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(npy_uint32)));
+            *(npy_uint32 *)result_ptr = *(npy_uint32 *)self_ptr;
+#else
+            char *args[2] = {self_ptr, result_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+
+#else /* !1: inactive in this expansion; this branch copies the other way (result -> self) */
+#if 4
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(npy_uint32)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(npy_uint32)));
+            *(npy_uint32 *)self_ptr = *(npy_uint32 *)result_ptr;
+#else
+            char *args[2] = {result_ptr, self_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+#endif
+
+            ind_ptr += ind_stride;
+            result_ptr += result_stride;
+        }
+        break;
+
+
+#line 1509
+    /* Item size 8: same loop with a direct npy_uint64 load/store. */
+#if 8
+    case 8:
+#else
+    default:
+#endif
+        while (itersize--) {
+            char * self_ptr;
+            npy_intp indval = *((npy_intp*)ind_ptr);
+            assert(npy_is_aligned(ind_ptr, NPY_ALIGNOF_UINT(npy_intp)));
+#if 1
+            if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
+                return -1;
+            }
+#else
+            if (indval < 0) {
+                indval += fancy_dim;
+            }
+#endif
+            self_ptr = base_ptr + indval * self_stride;
+
+#if 1
+#if 8
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(npy_uint64)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(npy_uint64)));
+            *(npy_uint64 *)result_ptr = *(npy_uint64 *)self_ptr;
+#else
+            char *args[2] = {self_ptr, result_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+
+#else /* !1: inactive in this expansion; this branch copies the other way (result -> self) */
+#if 8
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(npy_uint64)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(npy_uint64)));
+            *(npy_uint64 *)self_ptr = *(npy_uint64 *)result_ptr;
+#else
+            char *args[2] = {result_ptr, self_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+#endif
+
+            ind_ptr += ind_stride;
+            result_ptr += result_stride;
+        }
+        break;
+
+
+#line 1509
+    /* Item size 16: same loop, copying through copytype128 (declared outside this function). */
+#if 16
+    case 16:
+#else
+    default:
+#endif
+        while (itersize--) {
+            char * self_ptr;
+            npy_intp indval = *((npy_intp*)ind_ptr);
+            assert(npy_is_aligned(ind_ptr, NPY_ALIGNOF_UINT(npy_intp)));
+#if 1
+            if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
+                return -1;
+            }
+#else
+            if (indval < 0) {
+                indval += fancy_dim;
+            }
+#endif
+            self_ptr = base_ptr + indval * self_stride;
+
+#if 1
+#if 16
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(copytype128)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(copytype128)));
+            *(copytype128 *)result_ptr = *(copytype128 *)self_ptr;
+#else
+            char *args[2] = {self_ptr, result_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+
+#else /* !1: inactive in this expansion; this branch copies the other way (result -> self) */
+#if 16
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(copytype128)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(copytype128)));
+            *(copytype128 *)self_ptr = *(copytype128 *)result_ptr;
+#else
+            char *args[2] = {result_ptr, self_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+#endif
+
+            ind_ptr += ind_stride;
+            result_ptr += result_stride;
+        }
+        break;
+
+
+#line 1509
+    /* Fallback (selector 0 or any size not listed above): one-element copy via cast_info->func. */
+#if 0  /* false: this section supplies the switch's `default` label */
+    case 0:
+#else
+    default:
+#endif
+        while (itersize--) {
+            char * self_ptr;
+            npy_intp indval = *((npy_intp*)ind_ptr);
+            assert(npy_is_aligned(ind_ptr, NPY_ALIGNOF_UINT(npy_intp)));
+#if 1
+            if (check_and_adjust_index(&indval, fancy_dim, 0, _save) < 0 ) {
+                return -1;
+            }
+#else
+            if (indval < 0) {
+                indval += fancy_dim;
+            }
+#endif
+            self_ptr = base_ptr + indval * self_stride;
+
+#if 1
+#if 0  /* false: the typed copy below (with the placeholder type `0`) is never compiled */
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(0)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(0)));
+            *(0 *)result_ptr = *(0 *)self_ptr;
+#else
+            char *args[2] = {self_ptr, result_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+
+#else /* !1: inactive in this expansion; this branch copies the other way (result -> self) */
+#if 0
+            assert(npy_is_aligned(result_ptr, NPY_ALIGNOF_UINT(0)));
+            assert(npy_is_aligned(self_ptr, NPY_ALIGNOF_UINT(0)));
+            *(0 *)self_ptr = *(0 *)result_ptr;
+#else
+            char *args[2] = {result_ptr, self_ptr};
+            if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                    args, &one, strides,
+                    cast_info->auxdata) < 0)) {
+                NPY_END_THREADS;
+                return -1;
+            }
+#endif
+#endif
+
+            ind_ptr += ind_stride;
+            result_ptr += result_stride;
+        }
+        break;
+
+
+    }
+    /* Every case leaves the switch via `break` and reaches this common success exit. */
+    NPY_END_THREADS;
+
+    return 0;
+}
+
+
+/*
+ * General advanced indexing iteration.
+ */
+NPY_NO_EXPORT int
+mapiter_get(
+        PyArrayMapIterObject *mit, NPY_cast_info *cast_info,
+        NPY_ARRAYMETHOD_FLAGS flags, int is_aligned)
+{
+    npy_intp *counter, count;
+    int i;
+
+    /* Cached mit info */
+    int numiter = mit->numiter;
+    int needs_api = (flags & NPY_METH_REQUIRES_PYAPI) != 0;
+    /* Constant information */
+    npy_intp fancy_dims[NPY_MAXDIMS];
+    npy_intp fancy_strides[NPY_MAXDIMS];
+#if 1
+    int iteraxis;
+#endif
+
+    char *baseoffset = mit->baseoffset;
+    char **outer_ptrs = mit->outer_ptrs;
+    npy_intp *outer_strides = mit->outer_strides;
+    PyArrayObject *array= mit->array;
+
+    /* Fill constant information */
+#if 1
+    iteraxis = mit->iteraxes[0];
+#endif
+    for (i = 0; i < numiter; i++) {
+        fancy_dims[i] = mit->fancy_dims[i];
+        fancy_strides[i] = mit->fancy_strides[i];
+    }
+
+    if (mit->size == 0) {
+       return 0;
+    }
+
+    if (mit->subspace_iter == NULL) {
+        /*
+         * Item by item copy situation, the operand is buffered
+         * so use a cast to copy.  The iterator may not do any transfers, so may
+         * not have set `needs_api` yet, set it if necessary:
+         */
+        needs_api |= PyDataType_REFCHK(PyArray_DESCR(array));
+        npy_intp itemsize = PyArray_ITEMSIZE(array);
+        npy_intp strides[2] = {itemsize, itemsize};
+        npy_intp one = 1;
+
+        /* We have only one iterator handling everything */
+        counter = NpyIter_GetInnerLoopSizePtr(mit->outer);
+
+        /************ Optimized inner loops without subspace *************/
+
+#line 1634
+
+#if 1
+        if (numiter == 1) {
+#else
+        else {
+#endif
+            NPY_BEGIN_THREADS_DEF;
+            if (!needs_api) {
+                NPY_BEGIN_THREADS;
+            }
+
+            /* Optimization for aligned types that do not need the api */
+            switch ((is_aligned && !needs_api) ? itemsize : 0) {
+
+#line 1652
+
+#if 1
+            case 1:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !1
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < 1; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 1 && 1
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 1
+#if 1
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(npy_uint8)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(npy_uint8)));
+                        *(npy_uint8 *)(outer_ptrs[i]) = *(npy_uint8 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !1 */
+#if 1
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(npy_uint8)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(npy_uint8)));
+                        *(npy_uint8 *)self_ptr = *(npy_uint8 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 2
+            case 2:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !1
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < 1; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 1 && 1
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 1
+#if 2
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(npy_uint16)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(npy_uint16)));
+                        *(npy_uint16 *)(outer_ptrs[i]) = *(npy_uint16 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !1 */
+#if 2
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(npy_uint16)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(npy_uint16)));
+                        *(npy_uint16 *)self_ptr = *(npy_uint16 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 4
+            case 4:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !1
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < 1; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 1 && 1
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 1
+#if 4
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(npy_uint32)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(npy_uint32)));
+                        *(npy_uint32 *)(outer_ptrs[i]) = *(npy_uint32 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !1 */
+#if 4
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(npy_uint32)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(npy_uint32)));
+                        *(npy_uint32 *)self_ptr = *(npy_uint32 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 8
+            case 8:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !1
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < 1; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 1 && 1
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 1
+#if 8
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(npy_uint64)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(npy_uint64)));
+                        *(npy_uint64 *)(outer_ptrs[i]) = *(npy_uint64 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !1 */
+#if 8
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(npy_uint64)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(npy_uint64)));
+                        *(npy_uint64 *)self_ptr = *(npy_uint64 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 16
+            case 16:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !1
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < 1; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 1 && 1
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 1
+#if 16
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(copytype128)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(copytype128)));
+                        *(copytype128 *)(outer_ptrs[i]) = *(copytype128 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !1 */
+#if 16
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(copytype128)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(copytype128)));
+                        *(copytype128 *)self_ptr = *(copytype128 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 0
+            case 0:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !1
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < 1; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 1 && 1
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 1
+#if 0
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(0)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(0)));
+                        *(0 *)(outer_ptrs[i]) = *(0 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !1 */
+#if 0
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(0)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(0)));
+                        *(0 *)self_ptr = *(0 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+            }
+            NPY_END_THREADS;
+        }
+
+#line 1634
+
+#if 0
+        if (numiter == 1) {
+#else
+        else {
+#endif
+            NPY_BEGIN_THREADS_DEF;
+            if (!needs_api) {
+                NPY_BEGIN_THREADS;
+            }
+
+            /* Optimization for aligned types that do not need the api */
+            switch ((is_aligned && !needs_api) ? itemsize : 0) {
+
+#line 1652
+
+#if 1
+            case 1:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !1
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < numiter; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 1 && 0
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 1
+#if 1
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(npy_uint8)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(npy_uint8)));
+                        *(npy_uint8 *)(outer_ptrs[i]) = *(npy_uint8 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !1 */
+#if 1
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(npy_uint8)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(npy_uint8)));
+                        *(npy_uint8 *)self_ptr = *(npy_uint8 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 2
+            case 2:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !1
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < numiter; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 1 && 0
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 1
+#if 2
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(npy_uint16)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(npy_uint16)));
+                        *(npy_uint16 *)(outer_ptrs[i]) = *(npy_uint16 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !1 */
+#if 2
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(npy_uint16)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(npy_uint16)));
+                        *(npy_uint16 *)self_ptr = *(npy_uint16 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 4
+            case 4:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !1
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < numiter; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 1 && 0
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 1
+#if 4
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(npy_uint32)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(npy_uint32)));
+                        *(npy_uint32 *)(outer_ptrs[i]) = *(npy_uint32 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !1 */
+#if 4
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(npy_uint32)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(npy_uint32)));
+                        *(npy_uint32 *)self_ptr = *(npy_uint32 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 8
+            case 8:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !1
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < numiter; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 1 && 0
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 1
+#if 8
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(npy_uint64)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(npy_uint64)));
+                        *(npy_uint64 *)(outer_ptrs[i]) = *(npy_uint64 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !1 */
+#if 8
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(npy_uint64)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(npy_uint64)));
+                        *(npy_uint64 *)self_ptr = *(npy_uint64 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 16
+            case 16:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !1
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < numiter; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 1 && 0
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 1
+#if 16
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(copytype128)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(copytype128)));
+                        *(copytype128 *)(outer_ptrs[i]) = *(copytype128 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !1 */
+#if 16
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(copytype128)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(copytype128)));
+                        *(copytype128 *)self_ptr = *(copytype128 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+#line 1652
+
+#if 0
+            case 0:
+#else
+            default:
+#endif
+                /* Outer iteration (safe because mit->size != 0) */
+                do {
+#if !1
+                    /*
+                     * When the API is needed the casting might fail
+                     * TODO: (only if buffering is enabled).
+                     */
+                    if (needs_api && PyErr_Occurred()) {
+                        return -1;
+                    }
+#endif
+                    count = *counter;
+                    while (count--) {
+                        char * self_ptr = baseoffset;
+                        for (i=0; i < numiter; i++) {
+                            npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+                            assert(npy_is_aligned(outer_ptrs[i],
+                                                  NPY_ALIGNOF_UINT(npy_intp)));
+
+#if 1 && 0
+                            if (check_and_adjust_index(&indval, fancy_dims[i],
+                                                       iteraxis, _save) < 0 ) {
+                                return -1;
+                            }
+#else
+                            if (indval < 0) {
+                                indval += fancy_dims[i];
+                            }
+#endif
+                            self_ptr += indval * fancy_strides[i];
+
+                            /* advance indexing arrays */
+                            outer_ptrs[i] += outer_strides[i];
+                        }
+
+#if 1
+#if 0
+                        assert(npy_is_aligned(outer_ptrs[i],
+                                              NPY_ALIGNOF_UINT(0)));
+                        assert(npy_is_aligned(self_ptr,
+                                              NPY_ALIGNOF_UINT(0)));
+                        *(0 *)(outer_ptrs[i]) = *(0 *)self_ptr;
+#else
+                        char *args[2] = {self_ptr, outer_ptrs[i]};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#else /* !1 */
+#if 0
+                        assert(npy_is_aligned(outer_ptrs[i],
+                               NPY_ALIGNOF_UINT(0)));
+                        assert(npy_is_aligned(self_ptr,
+                               NPY_ALIGNOF_UINT(0)));
+                        *(0 *)self_ptr = *(0 *)(outer_ptrs[i]);
+#else
+                        char *args[2] = {outer_ptrs[i], self_ptr};
+                        if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                                               args, &one, strides,
+                                               cast_info->auxdata) < 0)) {
+                                           NPY_END_THREADS;
+                                           return -1;
+                                       }
+#endif
+#endif
+                        /* advance extra operand */
+                        outer_ptrs[i] += outer_strides[i];
+                    }
+                } while (mit->outer_next(mit->outer));
+
+                break;
+
+
+            }
+            NPY_END_THREADS;
+        }
+
+    }
+
+    /******************* Nested Iteration Situation *******************/
+    else {
+        char *subspace_baseptrs[2];
+        char **subspace_ptrs = mit->subspace_ptrs;
+        npy_intp *subspace_strides = mit->subspace_strides;
+        int is_subiter_trivial = 0; /* has three states */
+        npy_intp reset_offsets[2] = {0, 0};
+
+        /* Note: it may make sense to refactor `needs_api` out in this branch */
+        if (flags & NPY_METH_REQUIRES_PYAPI) {
+            needs_api = 1;
+        }
+
+        counter = NpyIter_GetInnerLoopSizePtr(mit->subspace_iter);
+        if (*counter == PyArray_SIZE(mit->subspace)) {
+           /*
+            * subspace is trivially iterable.
+            * manipulate pointers to avoid expensive resetting
+            */
+            is_subiter_trivial = 1;
+        }
+#line 1765
+
+#if 1
+        if (numiter == 1) {
+#else
+        else {
+#endif
+            NPY_BEGIN_THREADS_DEF;
+            if (!needs_api) {
+                NPY_BEGIN_THREADS;
+            }
+
+            /* Outer iteration (safe because mit->size != 0) */
+            do {
+                char * self_ptr = baseoffset;
+                for (i=0; i < 1; i++) {
+                    npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+
+#if 1 && 1
+                    if (check_and_adjust_index(&indval, fancy_dims[i],
+                                               iteraxis, _save) < 0 ) {
+                        return -1;
+                    }
+#else
+                    if (indval < 0) {
+                        indval += fancy_dims[i];
+                    }
+#endif
+
+                    self_ptr += indval * fancy_strides[i];
+                }
+
+                /*
+                 * Resetting is slow, so try to avoid resetting
+                 * if subspace iteration is trivial.
+                 * Watch out: reset_offsets are kept outside of the loop,
+                 * assuming the subspaces of different external iterations
+                 * share the same structure.
+                 */
+                if (is_subiter_trivial <= 1) {
+                    /* slower resetting: first iteration or non-trivial subspace */
+
+                    char * errmsg = NULL;
+                    subspace_baseptrs[0] = self_ptr;
+                    subspace_baseptrs[1] = mit->extra_op_ptrs[0];
+
+                    /* (can't really fail, since no buffering necessary) */
+                    if (!NpyIter_ResetBasePointers(mit->subspace_iter,
+                                                   subspace_baseptrs,
+                                                   &errmsg)) {
+                        NPY_END_THREADS;
+                        PyErr_SetString(PyExc_ValueError, errmsg);
+                        return -1;
+                    }
+                    if (is_subiter_trivial != 0) {
+                        /* reset_offsets are nonzero for negative strides.*/
+                        reset_offsets[0] = subspace_ptrs[0] - self_ptr;
+                        reset_offsets[1] = subspace_ptrs[1] - mit->extra_op_ptrs[0];
+
+                        /* use the faster adjustment further on */
+                        is_subiter_trivial ++;
+                    }
+                }
+                else {
+                    /*
+                     * faster resetting if the subspace iteration is trivial.
+                     * reset_offsets are zero for positive strides,
+                     * for negative strides this shifts the pointer to the last
+                     * item.
+                     */
+                    subspace_ptrs[0] = self_ptr + reset_offsets[0];
+                    subspace_ptrs[1] = mit->extra_op_ptrs[0] + reset_offsets[1];
+                }
+
+#if !1
+                /*
+                 * When the API is needed the casting might fail
+                 * TODO: Could only check if casting is unsafe, or even just
+                 *       not at all...
+                 */
+                if (needs_api && PyErr_Occurred()) {
+                    return -1;
+                }
+#endif
+
+                do {
+
+#if 1
+                    if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                            subspace_ptrs, counter, subspace_strides,
+                            cast_info->auxdata) < 0)) {
+                        NPY_END_THREADS;
+                        return -1;
+                    }
+#else
+                    /* The operand order is reversed here */
+                    char *args[2] = {subspace_ptrs[1], subspace_ptrs[0]};
+                    npy_intp strides[2] = {subspace_strides[1], subspace_strides[0]};
+                    if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                            args, counter, strides, cast_info->auxdata) < 0)) {
+                        NPY_END_THREADS;
+                        return -1;
+                    }
+#endif
+                } while (mit->subspace_next(mit->subspace_iter));
+
+                mit->extra_op_next(mit->extra_op_iter);
+            } while (mit->outer_next(mit->outer));
+            NPY_END_THREADS;
+        }
+
+#line 1765
+
+#if 0
+        if (numiter == 1) {
+#else
+        else {
+#endif
+            NPY_BEGIN_THREADS_DEF;
+            if (!needs_api) {
+                NPY_BEGIN_THREADS;
+            }
+
+            /* Outer iteration (safe because mit->size != 0) */
+            do {
+                char * self_ptr = baseoffset;
+                for (i=0; i < numiter; i++) {
+                    npy_intp indval = *((npy_intp*)outer_ptrs[i]);
+
+#if 1 && 0
+                    if (check_and_adjust_index(&indval, fancy_dims[i],
+                                               iteraxis, _save) < 0 ) {
+                        return -1;
+                    }
+#else
+                    if (indval < 0) {
+                        indval += fancy_dims[i];
+                    }
+#endif
+
+                    self_ptr += indval * fancy_strides[i];
+                }
+
+                /*
+                 * Resetting is slow, so try to avoid resetting
+                 * if subspace iteration is trivial.
+                 * Watch out: reset_offsets are kept outside of the loop,
+                 * assuming the subspaces of different external iterations
+                 * share the same structure.
+                 */
+                if (is_subiter_trivial <= 1) {
+                    /* slower resetting: first iteration or non-trivial subspace */
+
+                    char * errmsg = NULL;
+                    subspace_baseptrs[0] = self_ptr;
+                    subspace_baseptrs[1] = mit->extra_op_ptrs[0];
+
+                    /* (can't really fail, since no buffering necessary) */
+                    if (!NpyIter_ResetBasePointers(mit->subspace_iter,
+                                                   subspace_baseptrs,
+                                                   &errmsg)) {
+                        NPY_END_THREADS;
+                        PyErr_SetString(PyExc_ValueError, errmsg);
+                        return -1;
+                    }
+                    if (is_subiter_trivial != 0) {
+                        /* reset_offsets are nonzero for negative strides.*/
+                        reset_offsets[0] = subspace_ptrs[0] - self_ptr;
+                        reset_offsets[1] = subspace_ptrs[1] - mit->extra_op_ptrs[0];
+
+                        /* use the faster adjustment further on */
+                        is_subiter_trivial ++;
+                    }
+                }
+                else {
+                    /*
+                     * faster resetting if the subspace iteration is trivial.
+                     * reset_offsets are zero for positive strides,
+                     * for negative strides this shifts the pointer to the last
+                     * item.
+                     */
+                    subspace_ptrs[0] = self_ptr + reset_offsets[0];
+                    subspace_ptrs[1] = mit->extra_op_ptrs[0] + reset_offsets[1];
+                }
+
+#if !1
+                /*
+                 * When the API is needed the casting might fail
+                 * TODO: Could only check if casting is unsafe, or even just
+                 *       not at all...
+                 */
+                if (needs_api && PyErr_Occurred()) {
+                    return -1;
+                }
+#endif
+
+                do {
+
+#if 1
+                    if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                            subspace_ptrs, counter, subspace_strides,
+                            cast_info->auxdata) < 0)) {
+                        NPY_END_THREADS;
+                        return -1;
+                    }
+#else
+                    /* The operand order is reversed here */
+                    char *args[2] = {subspace_ptrs[1], subspace_ptrs[0]};
+                    npy_intp strides[2] = {subspace_strides[1], subspace_strides[0]};
+                    if (NPY_UNLIKELY(cast_info->func(&cast_info->context,
+                            args, counter, strides, cast_info->auxdata) < 0)) {
+                        NPY_END_THREADS;
+                        return -1;
+                    }
+#endif
+                } while (mit->subspace_next(mit->subspace_iter));
+
+                mit->extra_op_next(mit->extra_op_iter);
+            } while (mit->outer_next(mit->outer));
+            NPY_END_THREADS;
+        }
+
+    }
+    return 0;
+}
+
+
+
diff --git a/numpy/core/src/_generated/matmul.c b/numpy/core/src/_generated/matmul.c
new file mode 100644
index 000000000000..71b0ed95589d
--- /dev/null
+++ b/numpy/core/src/_generated/matmul.c
@@ -0,0 +1,3926 @@
+#line 1 "numpy/core/src/umath/matmul.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/* -*- c -*- */
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "npy_config.h"
+#include "numpy/npy_common.h"
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/npy_math.h"
+#include "numpy/halffloat.h"
+#include "lowlevel_strided_loops.h"
+
+#include "npy_pycompat.h"
+
+#include "npy_cblas.h"
+#include "arraytypes.h" /* For TYPE_dot functions */
+
+#include <assert.h>
+
+/*
+ *****************************************************************************
+ **                            BASICS                                       **
+ *****************************************************************************
+ */
+
+#if defined(HAVE_CBLAS)
+/*
+ * -1 to be conservative, in case blas internally uses a for loop with an
+ * inclusive upper bound
+ */
+#ifndef HAVE_BLAS_ILP64
+#define BLAS_MAXSIZE (NPY_MAX_INT - 1)
+#else
+#define BLAS_MAXSIZE (NPY_MAX_INT64 - 1)
+#endif
+
+/*
+ * Determine if a 2d matrix can be used by BLAS
+ * 1. Strides must not alias or overlap
+ * 2. The faster (second) axis must be contiguous
+ * 3. The slower (first) axis stride, in unit steps, must be larger than
+ *    the faster axis dimension
+ */
+static inline npy_bool
+is_blasable2d(npy_intp byte_stride1, npy_intp byte_stride2,
+              npy_intp d1, npy_intp d2,  npy_intp itemsize)
+{
+    /*
+     * Report whether a 2d operand with these byte strides can be handed to
+     * BLAS using a single leading dimension.  d1 is part of the signature
+     * but is not consulted by the checks below.
+     */
+    const npy_intp unit_stride1 = byte_stride1 / itemsize;
+    /* The second axis has to advance by exactly one item per step. */
+    const int inner_is_packed = (byte_stride2 == itemsize);
+    /*
+     * The first axis has to step by a whole number of items, span at least
+     * d2 items, and must not exceed BLAS_MAXSIZE.
+     */
+    const int outer_is_valid = (byte_stride1 % itemsize == 0) &&
+                               (unit_stride1 >= d2) &&
+                               (unit_stride1 <= BLAS_MAXSIZE);
+
+    return (inner_is_packed && outer_is_valid) ? NPY_TRUE : NPY_FALSE;
+}
+
+static const npy_cdouble oneD = {1.0, 0.0}, zeroD = {0.0, 0.0};
+static const npy_cfloat  oneF = {1.0, 0.0}, zeroF = {0.0, 0.0};
+
+#line 77
+NPY_NO_EXPORT void
+FLOAT_gemv(void *ip1, npy_intp is1_m, npy_intp is1_n,
+            void *ip2, npy_intp is2_n, npy_intp NPY_UNUSED(is2_p),
+            void *op, npy_intp op_m, npy_intp NPY_UNUSED(op_p),
+            npy_intp m, npy_intp n, npy_intp NPY_UNUSED(p))
+{
+    /*
+     * Matrix times vector through level 2 BLAS (cblas_sgemv).
+     *   ip1: the (m, n) matrix; one of its two axes must qualify as the
+     *        unit-stride axis according to is_blasable2d
+     *   ip2: the vector of n items, byte stride is2_n
+     *   op:  the result of m items, byte stride op_m
+     */
+    enum CBLAS_ORDER layout;
+    CBLAS_INT n_rows, n_cols, lead_dim, step_in, step_out;
+
+    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE);
+    assert(is_blasable2d(is2_n, sizeof(npy_float), n, 1, sizeof(npy_float)));
+
+    n_rows = (CBLAS_INT)m;
+    n_cols = (CBLAS_INT)n;
+    /* BLAS increments are counted in items, the strides here are bytes. */
+    step_in = (CBLAS_INT)(is2_n / sizeof(npy_float));
+    step_out = (CBLAS_INT)(op_m / sizeof(npy_float));
+
+    if (!is_blasable2d(is1_m, is1_n, m, n, sizeof(npy_float))) {
+        /*
+         * The n axis is not the unit-stride one, so the caller is expected
+         * to have made the m axis unit-stride.  The assert vanishes in
+         * release builds.
+         */
+        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(npy_float)));
+        layout = CblasRowMajor;
+        lead_dim = (CBLAS_INT)(is1_n / sizeof(npy_float));
+    }
+    else {
+        layout = CblasColMajor;
+        lead_dim = (CBLAS_INT)(is1_m / sizeof(npy_float));
+    }
+
+    CBLAS_FUNC(cblas_sgemv)(layout, CblasTrans, n_cols, n_rows, 1.F,
+                            ip1, lead_dim, ip2, step_in, 0.F, op, step_out);
+}
+
+NPY_NO_EXPORT void
+FLOAT_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *op, npy_intp os_m, npy_intp os_p,
+                           npy_intp m, npy_intp n, npy_intp p)
+{
+    /*
+     * Matrix times matrix through level 3 BLAS:
+     *     op (m, p) = ip1 (m, n) @ ip2 (n, p)
+     * Everything is described to BLAS as row major; an input whose first
+     * axis is the unit-stride one is passed as a transposed matrix.
+     */
+    enum CBLAS_TRANSPOSE op_a, op_b;
+    CBLAS_INT n_m, n_n, n_p, ld_a, ld_b, ld_out;
+    int is_self_transpose_product;
+
+    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE && p <= BLAS_MAXSIZE);
+    n_m = (CBLAS_INT)m;
+    n_n = (CBLAS_INT)n;
+    n_p = (CBLAS_INT)p;
+
+    /* The output is asserted to have p as its unit-stride axis. */
+    assert(is_blasable2d(os_m, os_p, m, p, sizeof(npy_float)));
+    ld_out = (CBLAS_INT)(os_m / sizeof(npy_float));
+
+    if (!is_blasable2d(is1_m, is1_n, m, n, sizeof(npy_float))) {
+        /* Caller must supply the transposed layout; debug-only check. */
+        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(npy_float)));
+        op_a = CblasTrans;
+        ld_a = (CBLAS_INT)(is1_n / sizeof(npy_float));
+    }
+    else {
+        op_a = CblasNoTrans;
+        ld_a = (CBLAS_INT)(is1_m / sizeof(npy_float));
+    }
+
+    if (!is_blasable2d(is2_n, is2_p, n, p, sizeof(npy_float))) {
+        /* Caller must supply the transposed layout; debug-only check. */
+        assert(is_blasable2d(is2_p, is2_n, p, n, sizeof(npy_float)));
+        op_b = CblasTrans;
+        ld_b = (CBLAS_INT)(is2_p / sizeof(npy_float));
+    }
+    else {
+        op_b = CblasNoTrans;
+        ld_b = (CBLAS_INT)(is2_n / sizeof(npy_float));
+    }
+
+    /*
+     * Same buffer on both sides, square result, strides swapped between
+     * the operands and exactly one side transposed: this is a matrix
+     * multiplied by its own transpose.
+     */
+    is_self_transpose_product = (ip1 == ip2) && (m == p) &&
+                                (is1_m == is2_p) && (is1_n == is2_n) &&
+                                (op_a != op_b);
+
+    if (!is_self_transpose_product) {
+        /* General case. */
+        CBLAS_FUNC(cblas_sgemm)(
+            CblasRowMajor, op_a, op_b, n_m, n_p, n_n, 1.F, ip1, ld_a,
+            ip2, ld_b, 0.F, op, ld_out);
+    }
+    else {
+        /* syrk gets the leading dimension of the untransposed side. */
+        const CBLAS_INT ld_in = (op_a == CblasNoTrans) ? ld_a : ld_b;
+        npy_float *out = (npy_float *)op;
+        npy_intp row, col;
+
+        CBLAS_FUNC(cblas_ssyrk)(
+            CblasRowMajor, CblasUpper, op_a, n_p, n_n, 1.F,
+            ip1, ld_in, 0.F, op, ld_out);
+
+        /* Only the upper triangle was requested; mirror it downwards. */
+        for (row = 0; row < n_p; row++) {
+            for (col = row + 1; col < n_p; col++) {
+                out[col * ld_out + row] = out[row * ld_out + col];
+            }
+        }
+    }
+}
+
+
+#line 77
+NPY_NO_EXPORT void
+DOUBLE_gemv(void *ip1, npy_intp is1_m, npy_intp is1_n,
+            void *ip2, npy_intp is2_n, npy_intp NPY_UNUSED(is2_p),
+            void *op, npy_intp op_m, npy_intp NPY_UNUSED(op_p),
+            npy_intp m, npy_intp n, npy_intp NPY_UNUSED(p))
+{
+    /*
+     * Matrix times vector through level 2 BLAS (cblas_dgemv).
+     *   ip1: the (m, n) matrix; one of its two axes must qualify as the
+     *        unit-stride axis according to is_blasable2d
+     *   ip2: the vector of n items, byte stride is2_n
+     *   op:  the result of m items, byte stride op_m
+     */
+    enum CBLAS_ORDER layout;
+    CBLAS_INT n_rows, n_cols, lead_dim, step_in, step_out;
+
+    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE);
+    assert(is_blasable2d(is2_n, sizeof(npy_double), n, 1, sizeof(npy_double)));
+
+    n_rows = (CBLAS_INT)m;
+    n_cols = (CBLAS_INT)n;
+    /* BLAS increments are counted in items, the strides here are bytes. */
+    step_in = (CBLAS_INT)(is2_n / sizeof(npy_double));
+    step_out = (CBLAS_INT)(op_m / sizeof(npy_double));
+
+    if (!is_blasable2d(is1_m, is1_n, m, n, sizeof(npy_double))) {
+        /*
+         * The n axis is not the unit-stride one, so the caller is expected
+         * to have made the m axis unit-stride.  The assert vanishes in
+         * release builds.
+         */
+        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(npy_double)));
+        layout = CblasRowMajor;
+        lead_dim = (CBLAS_INT)(is1_n / sizeof(npy_double));
+    }
+    else {
+        layout = CblasColMajor;
+        lead_dim = (CBLAS_INT)(is1_m / sizeof(npy_double));
+    }
+
+    CBLAS_FUNC(cblas_dgemv)(layout, CblasTrans, n_cols, n_rows, 1.,
+                            ip1, lead_dim, ip2, step_in, 0., op, step_out);
+}
+
+NPY_NO_EXPORT void
+DOUBLE_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *op, npy_intp os_m, npy_intp os_p,
+                           npy_intp m, npy_intp n, npy_intp p)
+{
+    /*
+     * Matrix times matrix through level 3 BLAS:
+     *     op (m, p) = ip1 (m, n) @ ip2 (n, p)
+     * Everything is described to BLAS as row major; an input whose first
+     * axis is the unit-stride one is passed as a transposed matrix.
+     */
+    enum CBLAS_TRANSPOSE op_a, op_b;
+    CBLAS_INT n_m, n_n, n_p, ld_a, ld_b, ld_out;
+    int is_self_transpose_product;
+
+    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE && p <= BLAS_MAXSIZE);
+    n_m = (CBLAS_INT)m;
+    n_n = (CBLAS_INT)n;
+    n_p = (CBLAS_INT)p;
+
+    /* The output is asserted to have p as its unit-stride axis. */
+    assert(is_blasable2d(os_m, os_p, m, p, sizeof(npy_double)));
+    ld_out = (CBLAS_INT)(os_m / sizeof(npy_double));
+
+    if (!is_blasable2d(is1_m, is1_n, m, n, sizeof(npy_double))) {
+        /* Caller must supply the transposed layout; debug-only check. */
+        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(npy_double)));
+        op_a = CblasTrans;
+        ld_a = (CBLAS_INT)(is1_n / sizeof(npy_double));
+    }
+    else {
+        op_a = CblasNoTrans;
+        ld_a = (CBLAS_INT)(is1_m / sizeof(npy_double));
+    }
+
+    if (!is_blasable2d(is2_n, is2_p, n, p, sizeof(npy_double))) {
+        /* Caller must supply the transposed layout; debug-only check. */
+        assert(is_blasable2d(is2_p, is2_n, p, n, sizeof(npy_double)));
+        op_b = CblasTrans;
+        ld_b = (CBLAS_INT)(is2_p / sizeof(npy_double));
+    }
+    else {
+        op_b = CblasNoTrans;
+        ld_b = (CBLAS_INT)(is2_n / sizeof(npy_double));
+    }
+
+    /*
+     * Same buffer on both sides, square result, strides swapped between
+     * the operands and exactly one side transposed: this is a matrix
+     * multiplied by its own transpose.
+     */
+    is_self_transpose_product = (ip1 == ip2) && (m == p) &&
+                                (is1_m == is2_p) && (is1_n == is2_n) &&
+                                (op_a != op_b);
+
+    if (!is_self_transpose_product) {
+        /* General case. */
+        CBLAS_FUNC(cblas_dgemm)(
+            CblasRowMajor, op_a, op_b, n_m, n_p, n_n, 1., ip1, ld_a,
+            ip2, ld_b, 0., op, ld_out);
+    }
+    else {
+        /* syrk gets the leading dimension of the untransposed side. */
+        const CBLAS_INT ld_in = (op_a == CblasNoTrans) ? ld_a : ld_b;
+        npy_double *out = (npy_double *)op;
+        npy_intp row, col;
+
+        CBLAS_FUNC(cblas_dsyrk)(
+            CblasRowMajor, CblasUpper, op_a, n_p, n_n, 1.,
+            ip1, ld_in, 0., op, ld_out);
+
+        /* Only the upper triangle was requested; mirror it downwards. */
+        for (row = 0; row < n_p; row++) {
+            for (col = row + 1; col < n_p; col++) {
+                out[col * ld_out + row] = out[row * ld_out + col];
+            }
+        }
+    }
+}
+
+
+#line 77
+NPY_NO_EXPORT void
+CFLOAT_gemv(void *ip1, npy_intp is1_m, npy_intp is1_n,
+            void *ip2, npy_intp is2_n, npy_intp NPY_UNUSED(is2_p),
+            void *op, npy_intp op_m, npy_intp NPY_UNUSED(op_p),
+            npy_intp m, npy_intp n, npy_intp NPY_UNUSED(p))
+{
+    /*
+     * Matrix times vector through level 2 BLAS (cblas_cgemv).
+     *   ip1: the (m, n) matrix; one of its two axes must qualify as the
+     *        unit-stride axis according to is_blasable2d
+     *   ip2: the vector of n items, byte stride is2_n
+     *   op:  the result of m items, byte stride op_m
+     * The complex scale factors are passed by address (oneF / zeroF).
+     */
+    enum CBLAS_ORDER layout;
+    CBLAS_INT n_rows, n_cols, lead_dim, step_in, step_out;
+
+    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE);
+    assert(is_blasable2d(is2_n, sizeof(npy_cfloat), n, 1, sizeof(npy_cfloat)));
+
+    n_rows = (CBLAS_INT)m;
+    n_cols = (CBLAS_INT)n;
+    /* BLAS increments are counted in items, the strides here are bytes. */
+    step_in = (CBLAS_INT)(is2_n / sizeof(npy_cfloat));
+    step_out = (CBLAS_INT)(op_m / sizeof(npy_cfloat));
+
+    if (!is_blasable2d(is1_m, is1_n, m, n, sizeof(npy_cfloat))) {
+        /*
+         * The n axis is not the unit-stride one, so the caller is expected
+         * to have made the m axis unit-stride.  The assert vanishes in
+         * release builds.
+         */
+        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(npy_cfloat)));
+        layout = CblasRowMajor;
+        lead_dim = (CBLAS_INT)(is1_n / sizeof(npy_cfloat));
+    }
+    else {
+        layout = CblasColMajor;
+        lead_dim = (CBLAS_INT)(is1_m / sizeof(npy_cfloat));
+    }
+
+    CBLAS_FUNC(cblas_cgemv)(layout, CblasTrans, n_cols, n_rows, &oneF,
+                            ip1, lead_dim, ip2, step_in, &zeroF, op, step_out);
+}
+
+NPY_NO_EXPORT void
+CFLOAT_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *op, npy_intp os_m, npy_intp os_p,
+                           npy_intp m, npy_intp n, npy_intp p)
+{
+    /*
+     * Matrix times matrix through level 3 BLAS:
+     *     op (m, p) = ip1 (m, n) @ ip2 (n, p)
+     * Everything is described to BLAS as row major; an input whose first
+     * axis is the unit-stride one is passed as a transposed matrix.
+     * The complex scale factors are passed by address (oneF / zeroF).
+     */
+    enum CBLAS_TRANSPOSE op_a, op_b;
+    CBLAS_INT n_m, n_n, n_p, ld_a, ld_b, ld_out;
+    int is_self_transpose_product;
+
+    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE && p <= BLAS_MAXSIZE);
+    n_m = (CBLAS_INT)m;
+    n_n = (CBLAS_INT)n;
+    n_p = (CBLAS_INT)p;
+
+    /* The output is asserted to have p as its unit-stride axis. */
+    assert(is_blasable2d(os_m, os_p, m, p, sizeof(npy_cfloat)));
+    ld_out = (CBLAS_INT)(os_m / sizeof(npy_cfloat));
+
+    if (!is_blasable2d(is1_m, is1_n, m, n, sizeof(npy_cfloat))) {
+        /* Caller must supply the transposed layout; debug-only check. */
+        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(npy_cfloat)));
+        op_a = CblasTrans;
+        ld_a = (CBLAS_INT)(is1_n / sizeof(npy_cfloat));
+    }
+    else {
+        op_a = CblasNoTrans;
+        ld_a = (CBLAS_INT)(is1_m / sizeof(npy_cfloat));
+    }
+
+    if (!is_blasable2d(is2_n, is2_p, n, p, sizeof(npy_cfloat))) {
+        /* Caller must supply the transposed layout; debug-only check. */
+        assert(is_blasable2d(is2_p, is2_n, p, n, sizeof(npy_cfloat)));
+        op_b = CblasTrans;
+        ld_b = (CBLAS_INT)(is2_p / sizeof(npy_cfloat));
+    }
+    else {
+        op_b = CblasNoTrans;
+        ld_b = (CBLAS_INT)(is2_n / sizeof(npy_cfloat));
+    }
+
+    /*
+     * Same buffer on both sides, square result, strides swapped between
+     * the operands and exactly one side transposed: this is a matrix
+     * multiplied by its own transpose.
+     */
+    is_self_transpose_product = (ip1 == ip2) && (m == p) &&
+                                (is1_m == is2_p) && (is1_n == is2_n) &&
+                                (op_a != op_b);
+
+    if (!is_self_transpose_product) {
+        /* General case. */
+        CBLAS_FUNC(cblas_cgemm)(
+            CblasRowMajor, op_a, op_b, n_m, n_p, n_n, &oneF, ip1, ld_a,
+            ip2, ld_b, &zeroF, op, ld_out);
+    }
+    else {
+        /* syrk gets the leading dimension of the untransposed side. */
+        const CBLAS_INT ld_in = (op_a == CblasNoTrans) ? ld_a : ld_b;
+        npy_cfloat *out = (npy_cfloat *)op;
+        npy_intp row, col;
+
+        CBLAS_FUNC(cblas_csyrk)(
+            CblasRowMajor, CblasUpper, op_a, n_p, n_n, &oneF,
+            ip1, ld_in, &zeroF, op, ld_out);
+
+        /* Only the upper triangle was requested; mirror it downwards. */
+        for (row = 0; row < n_p; row++) {
+            for (col = row + 1; col < n_p; col++) {
+                out[col * ld_out + row] = out[row * ld_out + col];
+            }
+        }
+    }
+}
+
+
+#line 77
+NPY_NO_EXPORT void
+CDOUBLE_gemv(void *ip1, npy_intp is1_m, npy_intp is1_n,
+            void *ip2, npy_intp is2_n, npy_intp NPY_UNUSED(is2_p),
+            void *op, npy_intp op_m, npy_intp NPY_UNUSED(op_p),
+            npy_intp m, npy_intp n, npy_intp NPY_UNUSED(p))
+{
+    /*
+     * Matrix times vector through level 2 BLAS (cblas_zgemv).
+     *   ip1: the (m, n) matrix; one of its two axes must qualify as the
+     *        unit-stride axis according to is_blasable2d
+     *   ip2: the vector of n items, byte stride is2_n
+     *   op:  the result of m items, byte stride op_m
+     * The complex scale factors are passed by address (oneD / zeroD).
+     */
+    enum CBLAS_ORDER layout;
+    CBLAS_INT n_rows, n_cols, lead_dim, step_in, step_out;
+
+    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE);
+    assert(is_blasable2d(is2_n, sizeof(npy_cdouble), n, 1, sizeof(npy_cdouble)));
+
+    n_rows = (CBLAS_INT)m;
+    n_cols = (CBLAS_INT)n;
+    /* BLAS increments are counted in items, the strides here are bytes. */
+    step_in = (CBLAS_INT)(is2_n / sizeof(npy_cdouble));
+    step_out = (CBLAS_INT)(op_m / sizeof(npy_cdouble));
+
+    if (!is_blasable2d(is1_m, is1_n, m, n, sizeof(npy_cdouble))) {
+        /*
+         * The n axis is not the unit-stride one, so the caller is expected
+         * to have made the m axis unit-stride.  The assert vanishes in
+         * release builds.
+         */
+        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(npy_cdouble)));
+        layout = CblasRowMajor;
+        lead_dim = (CBLAS_INT)(is1_n / sizeof(npy_cdouble));
+    }
+    else {
+        layout = CblasColMajor;
+        lead_dim = (CBLAS_INT)(is1_m / sizeof(npy_cdouble));
+    }
+
+    CBLAS_FUNC(cblas_zgemv)(layout, CblasTrans, n_cols, n_rows, &oneD,
+                            ip1, lead_dim, ip2, step_in, &zeroD, op, step_out);
+}
+
+NPY_NO_EXPORT void
+CDOUBLE_matmul_matrixmatrix(void *ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *op, npy_intp os_m, npy_intp os_p,
+                           npy_intp m, npy_intp n, npy_intp p)
+{
+    /*
+     * Matrix times matrix through level 3 BLAS:
+     *     op (m, p) = ip1 (m, n) @ ip2 (n, p)
+     * Everything is described to BLAS as row major; an input whose first
+     * axis is the unit-stride one is passed as a transposed matrix.
+     * The complex scale factors are passed by address (oneD / zeroD).
+     */
+    enum CBLAS_TRANSPOSE op_a, op_b;
+    CBLAS_INT n_m, n_n, n_p, ld_a, ld_b, ld_out;
+    int is_self_transpose_product;
+
+    assert(m <= BLAS_MAXSIZE && n <= BLAS_MAXSIZE && p <= BLAS_MAXSIZE);
+    n_m = (CBLAS_INT)m;
+    n_n = (CBLAS_INT)n;
+    n_p = (CBLAS_INT)p;
+
+    /* The output is asserted to have p as its unit-stride axis. */
+    assert(is_blasable2d(os_m, os_p, m, p, sizeof(npy_cdouble)));
+    ld_out = (CBLAS_INT)(os_m / sizeof(npy_cdouble));
+
+    if (!is_blasable2d(is1_m, is1_n, m, n, sizeof(npy_cdouble))) {
+        /* Caller must supply the transposed layout; debug-only check. */
+        assert(is_blasable2d(is1_n, is1_m, n, m, sizeof(npy_cdouble)));
+        op_a = CblasTrans;
+        ld_a = (CBLAS_INT)(is1_n / sizeof(npy_cdouble));
+    }
+    else {
+        op_a = CblasNoTrans;
+        ld_a = (CBLAS_INT)(is1_m / sizeof(npy_cdouble));
+    }
+
+    if (!is_blasable2d(is2_n, is2_p, n, p, sizeof(npy_cdouble))) {
+        /* Caller must supply the transposed layout; debug-only check. */
+        assert(is_blasable2d(is2_p, is2_n, p, n, sizeof(npy_cdouble)));
+        op_b = CblasTrans;
+        ld_b = (CBLAS_INT)(is2_p / sizeof(npy_cdouble));
+    }
+    else {
+        op_b = CblasNoTrans;
+        ld_b = (CBLAS_INT)(is2_n / sizeof(npy_cdouble));
+    }
+
+    /*
+     * Same buffer on both sides, square result, strides swapped between
+     * the operands and exactly one side transposed: this is a matrix
+     * multiplied by its own transpose.
+     */
+    is_self_transpose_product = (ip1 == ip2) && (m == p) &&
+                                (is1_m == is2_p) && (is1_n == is2_n) &&
+                                (op_a != op_b);
+
+    if (!is_self_transpose_product) {
+        /* General case. */
+        CBLAS_FUNC(cblas_zgemm)(
+            CblasRowMajor, op_a, op_b, n_m, n_p, n_n, &oneD, ip1, ld_a,
+            ip2, ld_b, &zeroD, op, ld_out);
+    }
+    else {
+        /* syrk gets the leading dimension of the untransposed side. */
+        const CBLAS_INT ld_in = (op_a == CblasNoTrans) ? ld_a : ld_b;
+        npy_cdouble *out = (npy_cdouble *)op;
+        npy_intp row, col;
+
+        CBLAS_FUNC(cblas_zsyrk)(
+            CblasRowMajor, CblasUpper, op_a, n_p, n_n, &oneD,
+            ip1, ld_in, &zeroD, op, ld_out);
+
+        /* Only the upper triangle was requested; mirror it downwards. */
+        for (row = 0; row < n_p; row++) {
+            for (col = row + 1; col < n_p; col++) {
+                out[col * ld_out + row] = out[row * ld_out + col];
+            }
+        }
+    }
+}
+
+
+#endif
+
+/*
+ * matmul loops
+ * signature is (m?,n),(n,p?)->(m?,p?)
+ */
+
+#line 215
+
+NPY_NO_EXPORT void
+LONGDOUBLE_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    /*
+     * Plain triple loop: out[m, p] = sum over n of a[m, n] * b[n, p].
+     * Every stride is added to a char pointer, i.e. it is a byte stride.
+     * The sum is accumulated directly in the output element.
+     */
+    char *a_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp m, n, p;
+
+    for (m = 0; m < dm; m++, a_row += is1_m, out_row += os_m) {
+        char *b_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (p = 0; p < dp; p++, b_col += is2_p, out += os_p) {
+            char *a = a_row;
+            char *b = b_col;
+
+            *(npy_longdouble *)out = 0;
+            for (n = 0; n < dn; n++, a += is1_n, b += is2_n) {
+                npy_longdouble lhs = *(npy_longdouble *)a;
+                npy_longdouble rhs = *(npy_longdouble *)b;
+
+                *(npy_longdouble *)out += lhs * rhs;
+            }
+        }
+    }
+}
+
+
+#line 215
+
+NPY_NO_EXPORT void
+FLOAT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    /*
+     * Plain triple loop: out[m, p] = sum over n of a[m, n] * b[n, p].
+     * Every stride is added to a char pointer, i.e. it is a byte stride.
+     * The sum is accumulated directly in the output element.
+     */
+    char *a_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp m, n, p;
+
+    for (m = 0; m < dm; m++, a_row += is1_m, out_row += os_m) {
+        char *b_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (p = 0; p < dp; p++, b_col += is2_p, out += os_p) {
+            char *a = a_row;
+            char *b = b_col;
+
+            *(npy_float *)out = 0;
+            for (n = 0; n < dn; n++, a += is1_n, b += is2_n) {
+                npy_float lhs = *(npy_float *)a;
+                npy_float rhs = *(npy_float *)b;
+
+                *(npy_float *)out += lhs * rhs;
+            }
+        }
+    }
+}
+
+
+#line 215
+
+NPY_NO_EXPORT void
+DOUBLE_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    /*
+     * Plain triple loop: out[m, p] = sum over n of a[m, n] * b[n, p].
+     * Every stride is added to a char pointer, i.e. it is a byte stride.
+     * The sum is accumulated directly in the output element.
+     */
+    char *a_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp m, n, p;
+
+    for (m = 0; m < dm; m++, a_row += is1_m, out_row += os_m) {
+        char *b_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (p = 0; p < dp; p++, b_col += is2_p, out += os_p) {
+            char *a = a_row;
+            char *b = b_col;
+
+            *(npy_double *)out = 0;
+            for (n = 0; n < dn; n++, a += is1_n, b += is2_n) {
+                npy_double lhs = *(npy_double *)a;
+                npy_double rhs = *(npy_double *)b;
+
+                *(npy_double *)out += lhs * rhs;
+            }
+        }
+    }
+}
+
+
+#line 215
+
+NPY_NO_EXPORT void
+HALF_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    /*
+     * Plain triple loop: out[m, p] = sum over n of a[m, n] * b[n, p].
+     * Every stride is added to a char pointer, i.e. it is a byte stride.
+     * Each product is formed and summed as float; the output element is
+     * written once, after the n loop, converted back to half.
+     */
+    char *a_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp m, n, p;
+
+    for (m = 0; m < dm; m++, a_row += is1_m, out_row += os_m) {
+        char *b_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (p = 0; p < dp; p++, b_col += is2_p, out += os_p) {
+            char *a = a_row;
+            char *b = b_col;
+            float sum = 0;
+
+            for (n = 0; n < dn; n++, a += is1_n, b += is2_n) {
+                npy_half lhs = *(npy_half *)a;
+                npy_half rhs = *(npy_half *)b;
+
+                sum += npy_half_to_float(lhs) * npy_half_to_float(rhs);
+            }
+            *(npy_half *)out = npy_float_to_half(sum);
+        }
+    }
+}
+
+
+#line 215
+
+NPY_NO_EXPORT void
+CFLOAT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    /*
+     * Plain triple loop: out[m, p] = sum over n of a[m, n] * b[n, p],
+     * with the complex product spelled out on the real and imag fields.
+     * Every stride is added to a char pointer, i.e. it is a byte stride.
+     * The sum is accumulated directly in the output element.
+     */
+    char *a_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp m, n, p;
+
+    for (m = 0; m < dm; m++, a_row += is1_m, out_row += os_m) {
+        char *b_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (p = 0; p < dp; p++, b_col += is2_p, out += os_p) {
+            npy_cfloat *acc = (npy_cfloat *)out;
+            char *a = a_row;
+            char *b = b_col;
+
+            acc->real = 0;
+            acc->imag = 0;
+            for (n = 0; n < dn; n++, a += is1_n, b += is2_n) {
+                /* Copy both factors before touching the accumulator. */
+                npy_cfloat lhs = *(npy_cfloat *)a;
+                npy_cfloat rhs = *(npy_cfloat *)b;
+
+                acc->real += (lhs.real * rhs.real) -
+                             (lhs.imag * rhs.imag);
+                acc->imag += (lhs.real * rhs.imag) +
+                             (lhs.imag * rhs.real);
+            }
+        }
+    }
+}
+
+
+#line 215
+
+NPY_NO_EXPORT void
+CDOUBLE_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    /*
+     * Plain triple loop: out[m, p] = sum over n of a[m, n] * b[n, p],
+     * with the complex product spelled out on the real and imag fields.
+     * Every stride is added to a char pointer, i.e. it is a byte stride.
+     * The sum is accumulated directly in the output element.
+     */
+    char *a_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp m, n, p;
+
+    for (m = 0; m < dm; m++, a_row += is1_m, out_row += os_m) {
+        char *b_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (p = 0; p < dp; p++, b_col += is2_p, out += os_p) {
+            npy_cdouble *acc = (npy_cdouble *)out;
+            char *a = a_row;
+            char *b = b_col;
+
+            acc->real = 0;
+            acc->imag = 0;
+            for (n = 0; n < dn; n++, a += is1_n, b += is2_n) {
+                /* Copy both factors before touching the accumulator. */
+                npy_cdouble lhs = *(npy_cdouble *)a;
+                npy_cdouble rhs = *(npy_cdouble *)b;
+
+                acc->real += (lhs.real * rhs.real) -
+                             (lhs.imag * rhs.imag);
+                acc->imag += (lhs.real * rhs.imag) +
+                             (lhs.imag * rhs.real);
+            }
+        }
+    }
+}
+
+
+#line 215
+
+NPY_NO_EXPORT void
+CLONGDOUBLE_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    /*
+     * Plain triple loop: out[m, p] = sum over n of a[m, n] * b[n, p],
+     * with the complex product spelled out on the real and imag fields.
+     * Every stride is added to a char pointer, i.e. it is a byte stride.
+     * The sum is accumulated directly in the output element.
+     */
+    char *a_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp m, n, p;
+
+    for (m = 0; m < dm; m++, a_row += is1_m, out_row += os_m) {
+        char *b_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (p = 0; p < dp; p++, b_col += is2_p, out += os_p) {
+            npy_clongdouble *acc = (npy_clongdouble *)out;
+            char *a = a_row;
+            char *b = b_col;
+
+            acc->real = 0;
+            acc->imag = 0;
+            for (n = 0; n < dn; n++, a += is1_n, b += is2_n) {
+                /* Copy both factors before touching the accumulator. */
+                npy_clongdouble lhs = *(npy_clongdouble *)a;
+                npy_clongdouble rhs = *(npy_clongdouble *)b;
+
+                acc->real += (lhs.real * rhs.real) -
+                             (lhs.imag * rhs.imag);
+                acc->imag += (lhs.real * rhs.imag) +
+                             (lhs.imag * rhs.real);
+            }
+        }
+    }
+}
+
+
+#line 215
+
+NPY_NO_EXPORT void
+UBYTE_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    /*
+     * Plain triple loop: out[m, p] = sum over n of a[m, n] * b[n, p].
+     * Every stride is added to a char pointer, i.e. it is a byte stride.
+     * The sum is accumulated directly in the npy_ubyte output element.
+     */
+    char *a_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp m, n, p;
+
+    for (m = 0; m < dm; m++, a_row += is1_m, out_row += os_m) {
+        char *b_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (p = 0; p < dp; p++, b_col += is2_p, out += os_p) {
+            char *a = a_row;
+            char *b = b_col;
+
+            *(npy_ubyte *)out = 0;
+            for (n = 0; n < dn; n++, a += is1_n, b += is2_n) {
+                npy_ubyte lhs = *(npy_ubyte *)a;
+                npy_ubyte rhs = *(npy_ubyte *)b;
+
+                *(npy_ubyte *)out += lhs * rhs;
+            }
+        }
+    }
+}
+
+
+#line 215
+
+/*
+ * Strided matrix product for npy_ushort elements, computed without BLAS:
+ * op[m, p] = sum over n of ip1[m, n] * ip2[n, p].
+ *
+ * is1_*, is2_* and os_* are byte strides along the named axes of the two
+ * inputs and the output; dm, dn and dp are the lengths of the m, n and p
+ * axes. Each output element is zeroed and then accumulated in place, so
+ * an empty n axis yields zeros.
+ */
+NPY_NO_EXPORT void
+USHORT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    char *lhs_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp row, col, k;
+
+    for (row = 0; row < dm; row++, lhs_row += is1_m, out_row += os_m) {
+        /* every output row starts again at the first column of ip2 */
+        char *rhs_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (col = 0; col < dp; col++, rhs_col += is2_p, out += os_p) {
+            const char *lhs = lhs_row;
+            const char *rhs = rhs_col;
+
+            *(npy_ushort *)out = 0;
+            for (k = 0; k < dn; k++, lhs += is1_n, rhs += is2_n) {
+                const npy_ushort a = *(const npy_ushort *)lhs;
+                const npy_ushort b = *(const npy_ushort *)rhs;
+
+                /* add into the output slot itself rather than a local */
+                *(npy_ushort *)out += a * b;
+            }
+        }
+    }
+}
+
+
+#line 215
+
+/*
+ * Strided matrix product for npy_uint elements, computed without BLAS:
+ * op[m, p] = sum over n of ip1[m, n] * ip2[n, p].
+ *
+ * is1_*, is2_* and os_* are byte strides along the named axes of the two
+ * inputs and the output; dm, dn and dp are the lengths of the m, n and p
+ * axes. Each output element is zeroed and then accumulated in place, so
+ * an empty n axis yields zeros.
+ */
+NPY_NO_EXPORT void
+UINT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    char *lhs_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp row, col, k;
+
+    for (row = 0; row < dm; row++, lhs_row += is1_m, out_row += os_m) {
+        /* every output row starts again at the first column of ip2 */
+        char *rhs_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (col = 0; col < dp; col++, rhs_col += is2_p, out += os_p) {
+            const char *lhs = lhs_row;
+            const char *rhs = rhs_col;
+
+            *(npy_uint *)out = 0;
+            for (k = 0; k < dn; k++, lhs += is1_n, rhs += is2_n) {
+                const npy_uint a = *(const npy_uint *)lhs;
+                const npy_uint b = *(const npy_uint *)rhs;
+
+                /* add into the output slot itself rather than a local */
+                *(npy_uint *)out += a * b;
+            }
+        }
+    }
+}
+
+
+#line 215
+
+/*
+ * Strided matrix product for npy_ulong elements, computed without BLAS:
+ * op[m, p] = sum over n of ip1[m, n] * ip2[n, p].
+ *
+ * is1_*, is2_* and os_* are byte strides along the named axes of the two
+ * inputs and the output; dm, dn and dp are the lengths of the m, n and p
+ * axes. Each output element is zeroed and then accumulated in place, so
+ * an empty n axis yields zeros.
+ */
+NPY_NO_EXPORT void
+ULONG_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    char *lhs_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp row, col, k;
+
+    for (row = 0; row < dm; row++, lhs_row += is1_m, out_row += os_m) {
+        /* every output row starts again at the first column of ip2 */
+        char *rhs_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (col = 0; col < dp; col++, rhs_col += is2_p, out += os_p) {
+            const char *lhs = lhs_row;
+            const char *rhs = rhs_col;
+
+            *(npy_ulong *)out = 0;
+            for (k = 0; k < dn; k++, lhs += is1_n, rhs += is2_n) {
+                const npy_ulong a = *(const npy_ulong *)lhs;
+                const npy_ulong b = *(const npy_ulong *)rhs;
+
+                /* add into the output slot itself rather than a local */
+                *(npy_ulong *)out += a * b;
+            }
+        }
+    }
+}
+
+
+#line 215
+
+/*
+ * Strided matrix product for npy_ulonglong elements, computed without
+ * BLAS: op[m, p] = sum over n of ip1[m, n] * ip2[n, p].
+ *
+ * is1_*, is2_* and os_* are byte strides along the named axes of the two
+ * inputs and the output; dm, dn and dp are the lengths of the m, n and p
+ * axes. Each output element is zeroed and then accumulated in place, so
+ * an empty n axis yields zeros.
+ */
+NPY_NO_EXPORT void
+ULONGLONG_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    char *lhs_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp row, col, k;
+
+    for (row = 0; row < dm; row++, lhs_row += is1_m, out_row += os_m) {
+        /* every output row starts again at the first column of ip2 */
+        char *rhs_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (col = 0; col < dp; col++, rhs_col += is2_p, out += os_p) {
+            const char *lhs = lhs_row;
+            const char *rhs = rhs_col;
+
+            *(npy_ulonglong *)out = 0;
+            for (k = 0; k < dn; k++, lhs += is1_n, rhs += is2_n) {
+                const npy_ulonglong a = *(const npy_ulonglong *)lhs;
+                const npy_ulonglong b = *(const npy_ulonglong *)rhs;
+
+                /* add into the output slot itself rather than a local */
+                *(npy_ulonglong *)out += a * b;
+            }
+        }
+    }
+}
+
+
+#line 215
+
+/*
+ * Strided matrix product for npy_byte elements, computed without BLAS:
+ * op[m, p] = sum over n of ip1[m, n] * ip2[n, p].
+ *
+ * is1_*, is2_* and os_* are byte strides along the named axes of the two
+ * inputs and the output; dm, dn and dp are the lengths of the m, n and p
+ * axes. Each output element is zeroed and then accumulated in place, so
+ * an empty n axis yields zeros.
+ */
+NPY_NO_EXPORT void
+BYTE_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    char *lhs_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp row, col, k;
+
+    for (row = 0; row < dm; row++, lhs_row += is1_m, out_row += os_m) {
+        /* every output row starts again at the first column of ip2 */
+        char *rhs_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (col = 0; col < dp; col++, rhs_col += is2_p, out += os_p) {
+            const char *lhs = lhs_row;
+            const char *rhs = rhs_col;
+
+            *(npy_byte *)out = 0;
+            for (k = 0; k < dn; k++, lhs += is1_n, rhs += is2_n) {
+                const npy_byte a = *(const npy_byte *)lhs;
+                const npy_byte b = *(const npy_byte *)rhs;
+
+                /* add into the output slot itself rather than a local */
+                *(npy_byte *)out += a * b;
+            }
+        }
+    }
+}
+
+
+#line 215
+
+/*
+ * Strided matrix product for npy_short elements, computed without BLAS:
+ * op[m, p] = sum over n of ip1[m, n] * ip2[n, p].
+ *
+ * is1_*, is2_* and os_* are byte strides along the named axes of the two
+ * inputs and the output; dm, dn and dp are the lengths of the m, n and p
+ * axes. Each output element is zeroed and then accumulated in place, so
+ * an empty n axis yields zeros.
+ */
+NPY_NO_EXPORT void
+SHORT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    char *lhs_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp row, col, k;
+
+    for (row = 0; row < dm; row++, lhs_row += is1_m, out_row += os_m) {
+        /* every output row starts again at the first column of ip2 */
+        char *rhs_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (col = 0; col < dp; col++, rhs_col += is2_p, out += os_p) {
+            const char *lhs = lhs_row;
+            const char *rhs = rhs_col;
+
+            *(npy_short *)out = 0;
+            for (k = 0; k < dn; k++, lhs += is1_n, rhs += is2_n) {
+                const npy_short a = *(const npy_short *)lhs;
+                const npy_short b = *(const npy_short *)rhs;
+
+                /* add into the output slot itself rather than a local */
+                *(npy_short *)out += a * b;
+            }
+        }
+    }
+}
+
+
+#line 215
+
+/*
+ * Strided matrix product for npy_int elements, computed without BLAS:
+ * op[m, p] = sum over n of ip1[m, n] * ip2[n, p].
+ *
+ * is1_*, is2_* and os_* are byte strides along the named axes of the two
+ * inputs and the output; dm, dn and dp are the lengths of the m, n and p
+ * axes. Each output element is zeroed and then accumulated in place, so
+ * an empty n axis yields zeros.
+ */
+NPY_NO_EXPORT void
+INT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    char *lhs_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp row, col, k;
+
+    for (row = 0; row < dm; row++, lhs_row += is1_m, out_row += os_m) {
+        /* every output row starts again at the first column of ip2 */
+        char *rhs_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (col = 0; col < dp; col++, rhs_col += is2_p, out += os_p) {
+            const char *lhs = lhs_row;
+            const char *rhs = rhs_col;
+
+            *(npy_int *)out = 0;
+            for (k = 0; k < dn; k++, lhs += is1_n, rhs += is2_n) {
+                const npy_int a = *(const npy_int *)lhs;
+                const npy_int b = *(const npy_int *)rhs;
+
+                /* add into the output slot itself rather than a local */
+                *(npy_int *)out += a * b;
+            }
+        }
+    }
+}
+
+
+#line 215
+
+/*
+ * Strided matrix product for npy_long elements, computed without BLAS:
+ * op[m, p] = sum over n of ip1[m, n] * ip2[n, p].
+ *
+ * is1_*, is2_* and os_* are byte strides along the named axes of the two
+ * inputs and the output; dm, dn and dp are the lengths of the m, n and p
+ * axes. Each output element is zeroed and then accumulated in place, so
+ * an empty n axis yields zeros.
+ */
+NPY_NO_EXPORT void
+LONG_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    char *lhs_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp row, col, k;
+
+    for (row = 0; row < dm; row++, lhs_row += is1_m, out_row += os_m) {
+        /* every output row starts again at the first column of ip2 */
+        char *rhs_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (col = 0; col < dp; col++, rhs_col += is2_p, out += os_p) {
+            const char *lhs = lhs_row;
+            const char *rhs = rhs_col;
+
+            *(npy_long *)out = 0;
+            for (k = 0; k < dn; k++, lhs += is1_n, rhs += is2_n) {
+                const npy_long a = *(const npy_long *)lhs;
+                const npy_long b = *(const npy_long *)rhs;
+
+                /* add into the output slot itself rather than a local */
+                *(npy_long *)out += a * b;
+            }
+        }
+    }
+}
+
+
+#line 215
+
+/*
+ * Strided matrix product for npy_longlong elements, computed without
+ * BLAS: op[m, p] = sum over n of ip1[m, n] * ip2[n, p].
+ *
+ * is1_*, is2_* and os_* are byte strides along the named axes of the two
+ * inputs and the output; dm, dn and dp are the lengths of the m, n and p
+ * axes. Each output element is zeroed and then accumulated in place, so
+ * an empty n axis yields zeros.
+ */
+NPY_NO_EXPORT void
+LONGLONG_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    char *lhs_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp row, col, k;
+
+    for (row = 0; row < dm; row++, lhs_row += is1_m, out_row += os_m) {
+        /* every output row starts again at the first column of ip2 */
+        char *rhs_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (col = 0; col < dp; col++, rhs_col += is2_p, out += os_p) {
+            const char *lhs = lhs_row;
+            const char *rhs = rhs_col;
+
+            *(npy_longlong *)out = 0;
+            for (k = 0; k < dn; k++, lhs += is1_n, rhs += is2_n) {
+                const npy_longlong a = *(const npy_longlong *)lhs;
+                const npy_longlong b = *(const npy_longlong *)rhs;
+
+                /* add into the output slot itself rather than a local */
+                *(npy_longlong *)out += a * b;
+            }
+        }
+    }
+}
+
+
+/*
+ * Strided boolean matrix product, computed without BLAS: op[m, p] becomes
+ * NPY_TRUE when some n has both ip1[m, n] and ip2[n, p] non-zero, and
+ * NPY_FALSE otherwise.
+ *
+ * is1_*, is2_* and os_* are byte strides; dm, dn and dp are the lengths of
+ * the m, n and p axes. The scan over n stops at the first matching pair.
+ */
+NPY_NO_EXPORT void
+BOOL_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    char *lhs_row = (char *)_ip1;
+    char *out_row = (char *)_op;
+    npy_intp row, col;
+
+    for (row = 0; row < dm; row++, lhs_row += is1_m, out_row += os_m) {
+        /* every output row starts again at the first column of ip2 */
+        char *rhs_col = (char *)_ip2;
+        char *out = out_row;
+
+        for (col = 0; col < dp; col++, rhs_col += is2_p, out += os_p) {
+            const char *lhs = lhs_row;
+            const char *rhs = rhs_col;
+            npy_intp remaining = dn;
+
+            /* default result; overwritten below on the first hit */
+            *(npy_bool *)out = NPY_FALSE;
+            while (remaining-- > 0) {
+                const npy_bool a = *(const npy_bool *)lhs;
+                const npy_bool b = *(const npy_bool *)rhs;
+
+                if (a != 0 && b != 0) {
+                    *(npy_bool *)out = NPY_TRUE;
+                    break;
+                }
+                lhs += is1_n;
+                rhs += is2_n;
+            }
+        }
+    }
+}
+
+/*
+ * Strided matrix product over PyObject* elements:
+ * op[m, p] = sum over n of ip1[m, n] * ip2[n, p], evaluated with
+ * PyNumber_Multiply and PyNumber_Add.
+ *
+ * is1_*, is2_* and os_* are byte strides; dm, dn and dp are the lengths of
+ * the m, n and p axes. NULL input slots are treated as Py_None. When
+ * dn == 0 every output slot receives a new integer 0.
+ *
+ * If a Python call fails, the function returns immediately without
+ * clearing the error indicator; output slots not yet reached are left
+ * unwritten.
+ */
+NPY_NO_EXPORT void
+OBJECT_matmul_inner_noblas(void *_ip1, npy_intp is1_m, npy_intp is1_n,
+                           void *_ip2, npy_intp is2_n, npy_intp is2_p,
+                           void *_op, npy_intp os_m, npy_intp os_p,
+                           npy_intp dm, npy_intp dn, npy_intp dp)
+{
+    char *ip1 = (char *)_ip1, *ip2 = (char *)_ip2, *op = (char *)_op;
+
+    /* total byte distance walked along n / p, used to rewind the cursors */
+    npy_intp ib1_n = is1_n * dn;
+    npy_intp ib2_n = is2_n * dn;
+    npy_intp ib2_p = is2_p * dp;
+    npy_intp ob_p  = os_p * dp;
+
+    for (npy_intp m = 0; m < dm; m++) {
+        for (npy_intp p = 0; p < dp; p++) {
+            /*
+             * Owned reference to the running sum of this output element
+             * only. It starts out NULL for every element so that the
+             * error path below can never release a reference that has
+             * already been handed over to an earlier output slot.
+             */
+            PyObject *sum_of_products = NULL;
+
+            if ( 0 == dn ) {
+                sum_of_products = PyLong_FromLong(0);
+                if (sum_of_products == NULL) {
+                    return;
+                }
+            }
+
+            for (npy_intp n = 0; n < dn; n++) {
+                PyObject *obj1 = *(PyObject**)ip1, *obj2 = *(PyObject**)ip2;
+                if (obj1 == NULL) {
+                    obj1 = Py_None;
+                }
+                if (obj2 == NULL) {
+                    obj2 = Py_None;
+                }
+
+                PyObject *product = PyNumber_Multiply(obj1, obj2);
+                if (product == NULL) {
+                    /* drop the partial sum of this element, if any */
+                    Py_XDECREF(sum_of_products);
+                    return;
+                }
+
+                if (n == 0) {
+                    /* the first product seeds the sum and is kept as is */
+                    sum_of_products = product;
+                }
+                else {
+                    Py_SETREF(sum_of_products, PyNumber_Add(sum_of_products, product));
+                    Py_DECREF(product);
+                    if (sum_of_products == NULL) {
+                        return;
+                    }
+                }
+
+                ip2 += is2_n;
+                ip1 += is1_n;
+            }
+
+            /*
+             * The output slot takes over our reference.
+             * NOTE(review): the pointer previously stored in the slot is
+             * overwritten without a decref; presumably the slot holds no
+             * owned reference on entry -- confirm against the callers.
+             */
+            *((PyObject **)op) = sum_of_products;
+            ip1 -= ib1_n;
+            ip2 -= ib2_n;
+            op  +=  os_p;
+            ip2 += is2_p;
+        }
+        op -= ob_p;
+        ip2 -= ib2_p;
+        ip1 += is1_m;
+        op  +=  os_m;
+    }
+}
+
+
+#line 395
+
+
+/*
+ * matmul loop for npy_float operands.
+ *
+ * dimensions[0] is the number of stacked products and steps[0..2] are the
+ * byte offsets that move args[0], args[1] and args[2] from one product to
+ * the next. dimensions[1..3] hold dm, dn and dp of a (dm, dn) @ (dn, dp)
+ * product and steps[3..8] the matching core strides.
+ *
+ * When HAVE_CBLAS is defined, each product is handed to FLOAT_dot,
+ * FLOAT_gemv or FLOAT_matmul_matrixmatrix if its shape and strides
+ * qualify; everything else goes through FLOAT_matmul_inner_noblas.
+ */
+NPY_NO_EXPORT void
+FLOAT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    const npy_intp n_outer = dimensions[0];
+    const npy_intp outer_step_a = steps[0];
+    const npy_intp outer_step_b = steps[1];
+    const npy_intp outer_step_out = steps[2];
+    const npy_intp dm = dimensions[1];
+    const npy_intp dn = dimensions[2];
+    const npy_intp dp = dimensions[3];
+    const npy_intp is1_m = steps[3];
+    const npy_intp is1_n = steps[4];
+    const npy_intp is2_n = steps[5];
+    const npy_intp is2_p = steps[6];
+    const npy_intp os_m = steps[7];
+    const npy_intp os_p = steps[8];
+    npy_intp outer;
+#if defined(HAVE_CBLAS)
+    npy_intp sz = sizeof(npy_float);
+
+    /* classification by shape alone */
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+                                 dp > BLAS_MAXSIZE);
+
+    /* classification by memory layout, in C and Fortran order */
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+                              is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1)  && i1blasable &&
+                              is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+
+    for (outer = 0; outer < n_outer; outer++) {
+        void *ip1 = args[0];
+        void *ip2 = args[1];
+        void *op = args[2];
+        /* cleared only by the branches that hand the product to BLAS */
+        int use_fallback = 1;
+
+#if defined(HAVE_CBLAS)
+        /*
+         * TODO: refactor this out to a inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (!too_big_for_blas && !any_zero_dim) {
+            if (!special_case) {
+                /*
+                 * matrix @ matrix
+                 *
+                 * If parameters are castable to int and we copy the
+                 * non-blasable (or non-ccontiguous output)
+                 * we could still use BLAS, see gh-12365.
+                 */
+                if (i1blasable && i2blasable) {
+                    if (o_c_blasable) {
+                        FLOAT_matmul_matrixmatrix(ip1, is1_m, is1_n,
+                                                   ip2, is2_n, is2_p,
+                                                   op, os_m, os_p,
+                                                   dm, dn, dp);
+                        use_fallback = 0;
+                    }
+                    else if (o_f_blasable) {
+                        /*
+                         * Use transpose equivalence:
+                         * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+                         */
+                        FLOAT_matmul_matrixmatrix(ip2, is2_p, is2_n,
+                                                   ip1, is1_n, is1_m,
+                                                   op, os_p, os_m,
+                                                   dp, dn, dm);
+                        use_fallback = 0;
+                    }
+                }
+            }
+            /* Special case variants that have a 1 in the core dimensions */
+            else if (scalar_out) {
+                /* row @ column, 1,1 output */
+                FLOAT_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+                use_fallback = 0;
+            }
+            else if (scalar_vec) {
+                /*
+                 * 1,1d @ vector or vector @ 1,1d
+                 * could use cblas_Xaxy, but that requires 0ing output
+                 * and would not be faster (XXX prove it)
+                 */
+            }
+            else if (vector_matrix) {
+                /* vector @ matrix, switch ip1, ip2, p and m */
+                FLOAT_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                            op, os_p, os_m, dp, dn, dm);
+                use_fallback = 0;
+            }
+            else if (matrix_vector) {
+                /* matrix @ vector */
+                FLOAT_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                            op, os_m, os_p, dm, dn, dp);
+                use_fallback = 0;
+            }
+            /* else: column @ row, 2d output, or non-blas-able input */
+        }
+#endif
+        if (use_fallback) {
+            FLOAT_matmul_inner_noblas(ip1, is1_m, is1_n,
+                                       ip2, is2_n, is2_p,
+                                       op, os_m, os_p, dm, dn, dp);
+        }
+
+        args[0] += outer_step_a;
+        args[1] += outer_step_b;
+        args[2] += outer_step_out;
+    }
+}
+
+
+#line 395
+
+
+/*
+ * matmul loop for npy_double operands.
+ *
+ * dimensions[0] is the number of stacked products and steps[0..2] are the
+ * byte offsets that move args[0], args[1] and args[2] from one product to
+ * the next. dimensions[1..3] hold dm, dn and dp of a (dm, dn) @ (dn, dp)
+ * product and steps[3..8] the matching core strides.
+ *
+ * When HAVE_CBLAS is defined, each product is handed to DOUBLE_dot,
+ * DOUBLE_gemv or DOUBLE_matmul_matrixmatrix if its shape and strides
+ * qualify; everything else goes through DOUBLE_matmul_inner_noblas.
+ */
+NPY_NO_EXPORT void
+DOUBLE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    const npy_intp n_outer = dimensions[0];
+    const npy_intp outer_step_a = steps[0];
+    const npy_intp outer_step_b = steps[1];
+    const npy_intp outer_step_out = steps[2];
+    const npy_intp dm = dimensions[1];
+    const npy_intp dn = dimensions[2];
+    const npy_intp dp = dimensions[3];
+    const npy_intp is1_m = steps[3];
+    const npy_intp is1_n = steps[4];
+    const npy_intp is2_n = steps[5];
+    const npy_intp is2_p = steps[6];
+    const npy_intp os_m = steps[7];
+    const npy_intp os_p = steps[8];
+    npy_intp outer;
+#if defined(HAVE_CBLAS)
+    npy_intp sz = sizeof(npy_double);
+
+    /* classification by shape alone */
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+                                 dp > BLAS_MAXSIZE);
+
+    /* classification by memory layout, in C and Fortran order */
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+                              is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1)  && i1blasable &&
+                              is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+
+    for (outer = 0; outer < n_outer; outer++) {
+        void *ip1 = args[0];
+        void *ip2 = args[1];
+        void *op = args[2];
+        /* cleared only by the branches that hand the product to BLAS */
+        int use_fallback = 1;
+
+#if defined(HAVE_CBLAS)
+        /*
+         * TODO: refactor this out to a inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (!too_big_for_blas && !any_zero_dim) {
+            if (!special_case) {
+                /*
+                 * matrix @ matrix
+                 *
+                 * If parameters are castable to int and we copy the
+                 * non-blasable (or non-ccontiguous output)
+                 * we could still use BLAS, see gh-12365.
+                 */
+                if (i1blasable && i2blasable) {
+                    if (o_c_blasable) {
+                        DOUBLE_matmul_matrixmatrix(ip1, is1_m, is1_n,
+                                                   ip2, is2_n, is2_p,
+                                                   op, os_m, os_p,
+                                                   dm, dn, dp);
+                        use_fallback = 0;
+                    }
+                    else if (o_f_blasable) {
+                        /*
+                         * Use transpose equivalence:
+                         * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+                         */
+                        DOUBLE_matmul_matrixmatrix(ip2, is2_p, is2_n,
+                                                   ip1, is1_n, is1_m,
+                                                   op, os_p, os_m,
+                                                   dp, dn, dm);
+                        use_fallback = 0;
+                    }
+                }
+            }
+            /* Special case variants that have a 1 in the core dimensions */
+            else if (scalar_out) {
+                /* row @ column, 1,1 output */
+                DOUBLE_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+                use_fallback = 0;
+            }
+            else if (scalar_vec) {
+                /*
+                 * 1,1d @ vector or vector @ 1,1d
+                 * could use cblas_Xaxy, but that requires 0ing output
+                 * and would not be faster (XXX prove it)
+                 */
+            }
+            else if (vector_matrix) {
+                /* vector @ matrix, switch ip1, ip2, p and m */
+                DOUBLE_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                            op, os_p, os_m, dp, dn, dm);
+                use_fallback = 0;
+            }
+            else if (matrix_vector) {
+                /* matrix @ vector */
+                DOUBLE_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                            op, os_m, os_p, dm, dn, dp);
+                use_fallback = 0;
+            }
+            /* else: column @ row, 2d output, or non-blas-able input */
+        }
+#endif
+        if (use_fallback) {
+            DOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n,
+                                       ip2, is2_n, is2_p,
+                                       op, os_m, os_p, dm, dn, dp);
+        }
+
+        args[0] += outer_step_a;
+        args[1] += outer_step_b;
+        args[2] += outer_step_out;
+    }
+}
+
+
+#line 395
+
+
+NPY_NO_EXPORT void  /* LONGDOUBLE (npy_longdouble) matmul inner loop: one (dm x dn) @ (dn x dp) core product per outer iteration */
+LONGDOUBLE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* args[0] and args[1] are the two inputs, args[2] is the output; func is not used */
+    npy_intp dOuter = *dimensions++;  /* first entry: number of outer-loop iterations */
+    npy_intp iOuter;
+    npy_intp s0 = *steps++;  /* first three steps: per-iteration advance of args[0], args[1], args[2] */
+    npy_intp s1 = *steps++;
+    npy_intp s2 = *steps++;
+    npy_intp dm = dimensions[0];  /* core dimensions (the outer count was consumed above) */
+    npy_intp dn = dimensions[1];
+    npy_intp dp = dimensions[2];
+    npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+         os_m=steps[4], os_p=steps[5];  /* core strides: input 1 (m, n), input 2 (n, p), output (m, p) */
+#if 0 && defined(HAVE_CBLAS)  /* constant 0: this BLAS setup is never compiled for LONGDOUBLE */
+    npy_intp sz = sizeof(npy_longdouble);  /* element size in bytes */
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+                                 dp > BLAS_MAXSIZE);
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);  /* input 1 tested as laid out (m, n) */
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);  /* same test with the two axes swapped */
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;  /* either orientation is acceptable for an input */
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+                              is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1)  && i1blasable &&
+                              is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+    /* Each iteration handles one core product, then steps all three data pointers. */
+    for (iOuter = 0; iOuter < dOuter; iOuter++,
+                         args[0] += s0, args[1] += s1, args[2] += s2) {
+        void *ip1=args[0], *ip2=args[1], *op=args[2];
+#if 0 && defined(HAVE_CBLAS)  /* never compiled for LONGDOUBLE; the #else branch is the live code */
+        /*
+         * TODO: refactor this out to an inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {  /* oversized or empty operands bypass BLAS */
+            LONGDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                       ip2, is2_n, is2_p,
+                                       op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* Special case variants that have a 1 in the core dimensions */
+            if (scalar_out) {
+                /* row @ column, 1,1 output */
+                LONGDOUBLE_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+            } else if (scalar_vec){
+                /*
+                 * 1,1d @ vector or vector @ 1,1d
+                 * could use cblas_Xaxpy, but that requires 0ing output
+                 * and would not be faster (XXX prove it)
+                 */
+                LONGDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            } else if (vector_matrix) {
+                /* vector @ matrix, switch ip1, ip2, p and m */
+                LONGDOUBLE_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                            op, os_p, os_m, dp, dn, dm);
+            } else if  (matrix_vector) {
+                /* matrix @ vector */
+                LONGDOUBLE_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+
+                            op, os_m, os_p, dm, dn, dp);
+            } else {
+                /* column @ row, 2d output, no blas needed or non-blas-able input */
+                LONGDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        } else {
+            /* matrix @ matrix */
+            if (i1blasable && i2blasable && o_c_blasable) {
+                LONGDOUBLE_matmul_matrixmatrix(ip1, is1_m, is1_n,
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p,
+                                           dm, dn, dp);
+            } else if (i1blasable && i2blasable && o_f_blasable) {
+                /*
+                 * Use transpose equivalence:
+                 * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+                 */
+                LONGDOUBLE_matmul_matrixmatrix(ip2, is2_p, is2_n,
+                                           ip1, is1_n, is1_m,
+                                           op, os_p, os_m,
+                                           dp, dn, dm);
+            } else {
+                /*
+                 * If parameters are castable to int and we copy the
+                 * non-blasable (or non-C-contiguous output)
+                 * we could still use BLAS, see gh-12365.
+                 */
+                LONGDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        }
+#else  /* the only branch compiled for LONGDOUBLE */
+        LONGDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                   ip2, is2_n, is2_p,
+                                   op, os_m, os_p, dm, dn, dp);
+
+#endif
+    }
+}
+
+
+#line 395
+
+
+NPY_NO_EXPORT void  /* HALF (npy_half) matmul inner loop: one (dm x dn) @ (dn x dp) core product per outer iteration */
+HALF_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* args[0] and args[1] are the two inputs, args[2] is the output; func is not used */
+    npy_intp dOuter = *dimensions++;  /* first entry: number of outer-loop iterations */
+    npy_intp iOuter;
+    npy_intp s0 = *steps++;  /* first three steps: per-iteration advance of args[0], args[1], args[2] */
+    npy_intp s1 = *steps++;
+    npy_intp s2 = *steps++;
+    npy_intp dm = dimensions[0];  /* core dimensions (the outer count was consumed above) */
+    npy_intp dn = dimensions[1];
+    npy_intp dp = dimensions[2];
+    npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+         os_m=steps[4], os_p=steps[5];  /* core strides: input 1 (m, n), input 2 (n, p), output (m, p) */
+#if 0 && defined(HAVE_CBLAS)  /* constant 0: this BLAS setup is never compiled for HALF */
+    npy_intp sz = sizeof(npy_half);  /* element size in bytes */
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+                                 dp > BLAS_MAXSIZE);
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);  /* input 1 tested as laid out (m, n) */
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);  /* same test with the two axes swapped */
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;  /* either orientation is acceptable for an input */
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+                              is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1)  && i1blasable &&
+                              is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+    /* Each iteration handles one core product, then steps all three data pointers. */
+    for (iOuter = 0; iOuter < dOuter; iOuter++,
+                         args[0] += s0, args[1] += s1, args[2] += s2) {
+        void *ip1=args[0], *ip2=args[1], *op=args[2];
+#if 0 && defined(HAVE_CBLAS)  /* never compiled for HALF; the #else branch is the live code */
+        /*
+         * TODO: refactor this out to an inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {  /* oversized or empty operands bypass BLAS */
+            HALF_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                       ip2, is2_n, is2_p,
+                                       op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* Special case variants that have a 1 in the core dimensions */
+            if (scalar_out) {
+                /* row @ column, 1,1 output */
+                HALF_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+            } else if (scalar_vec){
+                /*
+                 * 1,1d @ vector or vector @ 1,1d
+                 * could use cblas_Xaxpy, but that requires 0ing output
+                 * and would not be faster (XXX prove it)
+                 */
+                HALF_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            } else if (vector_matrix) {
+                /* vector @ matrix, switch ip1, ip2, p and m */
+                HALF_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                            op, os_p, os_m, dp, dn, dm);
+            } else if  (matrix_vector) {
+                /* matrix @ vector */
+                HALF_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+
+                            op, os_m, os_p, dm, dn, dp);
+            } else {
+                /* column @ row, 2d output, no blas needed or non-blas-able input */
+                HALF_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        } else {
+            /* matrix @ matrix */
+            if (i1blasable && i2blasable && o_c_blasable) {
+                HALF_matmul_matrixmatrix(ip1, is1_m, is1_n,
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p,
+                                           dm, dn, dp);
+            } else if (i1blasable && i2blasable && o_f_blasable) {
+                /*
+                 * Use transpose equivalence:
+                 * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+                 */
+                HALF_matmul_matrixmatrix(ip2, is2_p, is2_n,
+                                           ip1, is1_n, is1_m,
+                                           op, os_p, os_m,
+                                           dp, dn, dm);
+            } else {
+                /*
+                 * If parameters are castable to int and we copy the
+                 * non-blasable (or non-C-contiguous output)
+                 * we could still use BLAS, see gh-12365.
+                 */
+                HALF_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        }
+#else  /* the only branch compiled for HALF */
+        HALF_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                   ip2, is2_n, is2_p,
+                                   op, os_m, os_p, dm, dn, dp);
+
+#endif
+    }
+}
+
+
+#line 395
+
+
+NPY_NO_EXPORT void  /* CFLOAT (npy_cfloat) matmul inner loop: one (dm x dn) @ (dn x dp) core product per outer iteration */
+CFLOAT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* args[0] and args[1] are the two inputs, args[2] is the output; func is not used */
+    npy_intp dOuter = *dimensions++;  /* first entry: number of outer-loop iterations */
+    npy_intp iOuter;
+    npy_intp s0 = *steps++;  /* first three steps: per-iteration advance of args[0], args[1], args[2] */
+    npy_intp s1 = *steps++;
+    npy_intp s2 = *steps++;
+    npy_intp dm = dimensions[0];  /* core dimensions (the outer count was consumed above) */
+    npy_intp dn = dimensions[1];
+    npy_intp dp = dimensions[2];
+    npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+         os_m=steps[4], os_p=steps[5];  /* core strides: input 1 (m, n), input 2 (n, p), output (m, p) */
+#if 1 && defined(HAVE_CBLAS)  /* BLAS dispatch flags, compiled only when HAVE_CBLAS is defined */
+    npy_intp sz = sizeof(npy_cfloat);  /* element size in bytes */
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+                                 dp > BLAS_MAXSIZE);
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);  /* input 1 tested as laid out (m, n) */
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);  /* same test with the two axes swapped */
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;  /* either orientation is acceptable for an input */
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+                              is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1)  && i1blasable &&
+                              is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+    /* Each iteration handles one core product, then steps all three data pointers. */
+    for (iOuter = 0; iOuter < dOuter; iOuter++,
+                         args[0] += s0, args[1] += s1, args[2] += s2) {
+        void *ip1=args[0], *ip2=args[1], *op=args[2];
+#if 1 && defined(HAVE_CBLAS)  /* pick a kernel from the flags computed before the loop */
+        /*
+         * TODO: refactor this out to an inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {  /* oversized or empty operands bypass BLAS */
+            CFLOAT_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                       ip2, is2_n, is2_p,
+                                       op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* Special case variants that have a 1 in the core dimensions */
+            if (scalar_out) {
+                /* row @ column, 1,1 output */
+                CFLOAT_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+            } else if (scalar_vec){
+                /*
+                 * 1,1d @ vector or vector @ 1,1d
+                 * could use cblas_Xaxpy, but that requires 0ing output
+                 * and would not be faster (XXX prove it)
+                 */
+                CFLOAT_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            } else if (vector_matrix) {
+                /* vector @ matrix, switch ip1, ip2, p and m */
+                CFLOAT_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                            op, os_p, os_m, dp, dn, dm);
+            } else if  (matrix_vector) {
+                /* matrix @ vector */
+                CFLOAT_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+
+                            op, os_m, os_p, dm, dn, dp);
+            } else {
+                /* column @ row, 2d output, no blas needed or non-blas-able input */
+                CFLOAT_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        } else {
+            /* matrix @ matrix */
+            if (i1blasable && i2blasable && o_c_blasable) {
+                CFLOAT_matmul_matrixmatrix(ip1, is1_m, is1_n,
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p,
+                                           dm, dn, dp);
+            } else if (i1blasable && i2blasable && o_f_blasable) {
+                /*
+                 * Use transpose equivalence:
+                 * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+                 */
+                CFLOAT_matmul_matrixmatrix(ip2, is2_p, is2_n,
+                                           ip1, is1_n, is1_m,
+                                           op, os_p, os_m,
+                                           dp, dn, dm);
+            } else {
+                /*
+                 * If parameters are castable to int and we copy the
+                 * non-blasable (or non-C-contiguous output)
+                 * we could still use BLAS, see gh-12365.
+                 */
+                CFLOAT_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        }
+#else  /* HAVE_CBLAS not defined: always use the plain loop */
+        CFLOAT_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                   ip2, is2_n, is2_p,
+                                   op, os_m, os_p, dm, dn, dp);
+
+#endif
+    }
+}
+
+
+#line 395
+
+
+NPY_NO_EXPORT void  /* CDOUBLE (npy_cdouble) matmul inner loop: one (dm x dn) @ (dn x dp) core product per outer iteration */
+CDOUBLE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* args[0] and args[1] are the two inputs, args[2] is the output; func is not used */
+    npy_intp dOuter = *dimensions++;  /* first entry: number of outer-loop iterations */
+    npy_intp iOuter;
+    npy_intp s0 = *steps++;  /* first three steps: per-iteration advance of args[0], args[1], args[2] */
+    npy_intp s1 = *steps++;
+    npy_intp s2 = *steps++;
+    npy_intp dm = dimensions[0];  /* core dimensions (the outer count was consumed above) */
+    npy_intp dn = dimensions[1];
+    npy_intp dp = dimensions[2];
+    npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+         os_m=steps[4], os_p=steps[5];  /* core strides: input 1 (m, n), input 2 (n, p), output (m, p) */
+#if 1 && defined(HAVE_CBLAS)  /* BLAS dispatch flags, compiled only when HAVE_CBLAS is defined */
+    npy_intp sz = sizeof(npy_cdouble);  /* element size in bytes */
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+                                 dp > BLAS_MAXSIZE);
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);  /* input 1 tested as laid out (m, n) */
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);  /* same test with the two axes swapped */
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;  /* either orientation is acceptable for an input */
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+                              is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1)  && i1blasable &&
+                              is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+    /* Each iteration handles one core product, then steps all three data pointers. */
+    for (iOuter = 0; iOuter < dOuter; iOuter++,
+                         args[0] += s0, args[1] += s1, args[2] += s2) {
+        void *ip1=args[0], *ip2=args[1], *op=args[2];
+#if 1 && defined(HAVE_CBLAS)  /* pick a kernel from the flags computed before the loop */
+        /*
+         * TODO: refactor this out to an inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {  /* oversized or empty operands bypass BLAS */
+            CDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                       ip2, is2_n, is2_p,
+                                       op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* Special case variants that have a 1 in the core dimensions */
+            if (scalar_out) {
+                /* row @ column, 1,1 output */
+                CDOUBLE_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+            } else if (scalar_vec){
+                /*
+                 * 1,1d @ vector or vector @ 1,1d
+                 * could use cblas_Xaxpy, but that requires 0ing output
+                 * and would not be faster (XXX prove it)
+                 */
+                CDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            } else if (vector_matrix) {
+                /* vector @ matrix, switch ip1, ip2, p and m */
+                CDOUBLE_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                            op, os_p, os_m, dp, dn, dm);
+            } else if  (matrix_vector) {
+                /* matrix @ vector */
+                CDOUBLE_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+
+                            op, os_m, os_p, dm, dn, dp);
+            } else {
+                /* column @ row, 2d output, no blas needed or non-blas-able input */
+                CDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        } else {
+            /* matrix @ matrix */
+            if (i1blasable && i2blasable && o_c_blasable) {
+                CDOUBLE_matmul_matrixmatrix(ip1, is1_m, is1_n,
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p,
+                                           dm, dn, dp);
+            } else if (i1blasable && i2blasable && o_f_blasable) {
+                /*
+                 * Use transpose equivalence:
+                 * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+                 */
+                CDOUBLE_matmul_matrixmatrix(ip2, is2_p, is2_n,
+                                           ip1, is1_n, is1_m,
+                                           op, os_p, os_m,
+                                           dp, dn, dm);
+            } else {
+                /*
+                 * If parameters are castable to int and we copy the
+                 * non-blasable (or non-C-contiguous output)
+                 * we could still use BLAS, see gh-12365.
+                 */
+                CDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        }
+#else  /* HAVE_CBLAS not defined: always use the plain loop */
+        CDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                   ip2, is2_n, is2_p,
+                                   op, os_m, os_p, dm, dn, dp);
+
+#endif
+    }
+}
+
+
+#line 395
+
+
+NPY_NO_EXPORT void  /* CLONGDOUBLE (npy_clongdouble) matmul inner loop: one (dm x dn) @ (dn x dp) core product per outer iteration */
+CLONGDOUBLE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* args[0] and args[1] are the two inputs, args[2] is the output; func is not used */
+    npy_intp dOuter = *dimensions++;  /* first entry: number of outer-loop iterations */
+    npy_intp iOuter;
+    npy_intp s0 = *steps++;  /* first three steps: per-iteration advance of args[0], args[1], args[2] */
+    npy_intp s1 = *steps++;
+    npy_intp s2 = *steps++;
+    npy_intp dm = dimensions[0];  /* core dimensions (the outer count was consumed above) */
+    npy_intp dn = dimensions[1];
+    npy_intp dp = dimensions[2];
+    npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+         os_m=steps[4], os_p=steps[5];  /* core strides: input 1 (m, n), input 2 (n, p), output (m, p) */
+#if 0 && defined(HAVE_CBLAS)  /* constant 0: this BLAS setup is never compiled for CLONGDOUBLE */
+    npy_intp sz = sizeof(npy_clongdouble);  /* element size in bytes */
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+                                 dp > BLAS_MAXSIZE);
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);  /* input 1 tested as laid out (m, n) */
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);  /* same test with the two axes swapped */
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;  /* either orientation is acceptable for an input */
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+                              is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1)  && i1blasable &&
+                              is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+    /* Each iteration handles one core product, then steps all three data pointers. */
+    for (iOuter = 0; iOuter < dOuter; iOuter++,
+                         args[0] += s0, args[1] += s1, args[2] += s2) {
+        void *ip1=args[0], *ip2=args[1], *op=args[2];
+#if 0 && defined(HAVE_CBLAS)  /* never compiled for CLONGDOUBLE; the #else branch is the live code */
+        /*
+         * TODO: refactor this out to an inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {  /* oversized or empty operands bypass BLAS */
+            CLONGDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                       ip2, is2_n, is2_p,
+                                       op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* Special case variants that have a 1 in the core dimensions */
+            if (scalar_out) {
+                /* row @ column, 1,1 output */
+                CLONGDOUBLE_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+            } else if (scalar_vec){
+                /*
+                 * 1,1d @ vector or vector @ 1,1d
+                 * could use cblas_Xaxpy, but that requires 0ing output
+                 * and would not be faster (XXX prove it)
+                 */
+                CLONGDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            } else if (vector_matrix) {
+                /* vector @ matrix, switch ip1, ip2, p and m */
+                CLONGDOUBLE_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                            op, os_p, os_m, dp, dn, dm);
+            } else if  (matrix_vector) {
+                /* matrix @ vector */
+                CLONGDOUBLE_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+
+                            op, os_m, os_p, dm, dn, dp);
+            } else {
+                /* column @ row, 2d output, no blas needed or non-blas-able input */
+                CLONGDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        } else {
+            /* matrix @ matrix */
+            if (i1blasable && i2blasable && o_c_blasable) {
+                CLONGDOUBLE_matmul_matrixmatrix(ip1, is1_m, is1_n,
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p,
+                                           dm, dn, dp);
+            } else if (i1blasable && i2blasable && o_f_blasable) {
+                /*
+                 * Use transpose equivalence:
+                 * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+                 */
+                CLONGDOUBLE_matmul_matrixmatrix(ip2, is2_p, is2_n,
+                                           ip1, is1_n, is1_m,
+                                           op, os_p, os_m,
+                                           dp, dn, dm);
+            } else {
+                /*
+                 * If parameters are castable to int and we copy the
+                 * non-blasable (or non-C-contiguous output)
+                 * we could still use BLAS, see gh-12365.
+                 */
+                CLONGDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        }
+#else  /* the only branch compiled for CLONGDOUBLE */
+        CLONGDOUBLE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                   ip2, is2_n, is2_p,
+                                   op, os_m, os_p, dm, dn, dp);
+
+#endif
+    }
+}
+
+
+#line 395
+
+
+NPY_NO_EXPORT void  /* UBYTE (npy_ubyte) matmul inner loop: one (dm x dn) @ (dn x dp) core product per outer iteration */
+UBYTE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{   /* args[0] and args[1] are the two inputs, args[2] is the output; func is not used */
+    npy_intp dOuter = *dimensions++;  /* first entry: number of outer-loop iterations */
+    npy_intp iOuter;
+    npy_intp s0 = *steps++;  /* first three steps: per-iteration advance of args[0], args[1], args[2] */
+    npy_intp s1 = *steps++;
+    npy_intp s2 = *steps++;
+    npy_intp dm = dimensions[0];  /* core dimensions (the outer count was consumed above) */
+    npy_intp dn = dimensions[1];
+    npy_intp dp = dimensions[2];
+    npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+         os_m=steps[4], os_p=steps[5];  /* core strides: input 1 (m, n), input 2 (n, p), output (m, p) */
+#if 0 && defined(HAVE_CBLAS)  /* constant 0: this BLAS setup is never compiled for UBYTE */
+    npy_intp sz = sizeof(npy_ubyte);  /* element size in bytes */
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+                                 dp > BLAS_MAXSIZE);
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);  /* input 1 tested as laid out (m, n) */
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);  /* same test with the two axes swapped */
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;  /* either orientation is acceptable for an input */
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+                              is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1)  && i1blasable &&
+                              is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+    /* Each iteration handles one core product, then steps all three data pointers. */
+    for (iOuter = 0; iOuter < dOuter; iOuter++,
+                         args[0] += s0, args[1] += s1, args[2] += s2) {
+        void *ip1=args[0], *ip2=args[1], *op=args[2];
+#if 0 && defined(HAVE_CBLAS)  /* never compiled for UBYTE; the #else branch is the live code */
+        /*
+         * TODO: refactor this out to an inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {  /* oversized or empty operands bypass BLAS */
+            UBYTE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                       ip2, is2_n, is2_p,
+                                       op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* Special case variants that have a 1 in the core dimensions */
+            if (scalar_out) {
+                /* row @ column, 1,1 output */
+                UBYTE_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+            } else if (scalar_vec){
+                /*
+                 * 1,1d @ vector or vector @ 1,1d
+                 * could use cblas_Xaxpy, but that requires 0ing output
+                 * and would not be faster (XXX prove it)
+                 */
+                UBYTE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            } else if (vector_matrix) {
+                /* vector @ matrix, switch ip1, ip2, p and m */
+                UBYTE_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                            op, os_p, os_m, dp, dn, dm);
+            } else if  (matrix_vector) {
+                /* matrix @ vector */
+                UBYTE_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+
+                            op, os_m, os_p, dm, dn, dp);
+            } else {
+                /* column @ row, 2d output, no blas needed or non-blas-able input */
+                UBYTE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        } else {
+            /* matrix @ matrix */
+            if (i1blasable && i2blasable && o_c_blasable) {
+                UBYTE_matmul_matrixmatrix(ip1, is1_m, is1_n,
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p,
+                                           dm, dn, dp);
+            } else if (i1blasable && i2blasable && o_f_blasable) {
+                /*
+                 * Use transpose equivalence:
+                 * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+                 */
+                UBYTE_matmul_matrixmatrix(ip2, is2_p, is2_n,
+                                           ip1, is1_n, is1_m,
+                                           op, os_p, os_m,
+                                           dp, dn, dm);
+            } else {
+                /*
+                 * If parameters are castable to int and we copy the
+                 * non-blasable (or non-C-contiguous output)
+                 * we could still use BLAS, see gh-12365.
+                 */
+                UBYTE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        }
+#else  /* the only branch compiled for UBYTE */
+        UBYTE_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                   ip2, is2_n, is2_p,
+                                   op, os_m, os_p, dm, dn, dp);
+
+#endif
+    }
+}
+
+
+#line 395
+
+
+/*
+ * Outer loop of matmul for USHORT (npy_ushort) operands.
+ *
+ * dimensions[0] is the outer iteration count and dimensions[1..3] are the
+ * core sizes m, n and p.  steps[0..2] are the byte offsets added to args[0],
+ * args[1] and args[2] after every outer iteration (the args[] pointers are
+ * advanced in place); steps[3..8] are the core strides is1_m, is1_n, is2_n,
+ * is2_p, os_m and os_p.  The last argument is unused.
+ *
+ * The CBLAS dispatch is disabled by `#if 0`, so each iteration is handled by
+ * USHORT_matmul_inner_noblas().
+ */
+NPY_NO_EXPORT void
+USHORT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Outer iteration count, then the core sizes m, n, p. */
+    npy_intp n_outer = dimensions[0];
+    npy_intp dm = dimensions[1];
+    npy_intp dn = dimensions[2];
+    npy_intp dp = dimensions[3];
+    /* Byte offsets applied to args[0], args[1], args[2] per iteration. */
+    npy_intp outer_step_a = steps[0];
+    npy_intp outer_step_b = steps[1];
+    npy_intp outer_step_o = steps[2];
+    /* Core strides of the first input, the second input and the output. */
+    npy_intp is1_m = steps[3];
+    npy_intp is1_n = steps[4];
+    npy_intp is2_n = steps[5];
+    npy_intp is2_p = steps[6];
+    npy_intp os_m = steps[7];
+    npy_intp os_p = steps[8];
+    npy_intp remaining;
+#if 0 && defined(HAVE_CBLAS)
+    /* Disabled: shape and stride predicates used to pick a BLAS routine. */
+    npy_intp sz = sizeof(npy_ushort);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+            dp > BLAS_MAXSIZE);
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+            is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1) && i1blasable &&
+            is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+
+    for (remaining = n_outer; remaining > 0; remaining--) {
+        void *ip1 = args[0];
+        void *ip2 = args[1];
+        void *op = args[2];
+
+#if 0 && defined(HAVE_CBLAS)
+        /*
+         * TODO: refactor this out to a inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {
+            USHORT_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case && scalar_out) {
+            /* row @ column, 1,1 output */
+            USHORT_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+        }
+        else if (special_case && scalar_vec) {
+            /*
+             * 1,1d @ vector or vector @ 1,1d
+             * could use cblas_Xaxy, but that requires 0ing output
+             * and would not be faster (XXX prove it)
+             */
+            USHORT_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case && vector_matrix) {
+            /* vector @ matrix, switch ip1, ip2, p and m */
+            USHORT_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                    op, os_p, os_m, dp, dn, dm);
+        }
+        else if (special_case && matrix_vector) {
+            /* matrix @ vector */
+            USHORT_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* column @ row, 2d output, no blas needed or non-blas-able input */
+            USHORT_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (i1blasable && i2blasable && o_c_blasable) {
+            /* matrix @ matrix */
+            USHORT_matmul_matrixmatrix(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (i1blasable && i2blasable && o_f_blasable) {
+            /*
+             * matrix @ matrix, using the transpose equivalence:
+             * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+             */
+            USHORT_matmul_matrixmatrix(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                    op, os_p, os_m, dp, dn, dm);
+        }
+        else {
+            /*
+             * If parameters are castable to int and we copy the
+             * non-blasable (or non-ccontiguous output)
+             * we could still use BLAS, see gh-12365.
+             */
+            USHORT_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+#else
+        USHORT_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                op, os_m, os_p, dm, dn, dp);
+#endif
+
+        /* Move all three operand pointers on to the next outer item. */
+        args[0] += outer_step_a;
+        args[1] += outer_step_b;
+        args[2] += outer_step_o;
+    }
+}
+
+
+#line 395
+
+
+/*
+ * Outer loop of matmul for UINT (npy_uint) operands.
+ *
+ * dimensions[0] is the outer iteration count and dimensions[1..3] are the
+ * core sizes m, n and p.  steps[0..2] are the byte offsets added to args[0],
+ * args[1] and args[2] after every outer iteration (the args[] pointers are
+ * advanced in place); steps[3..8] are the core strides is1_m, is1_n, is2_n,
+ * is2_p, os_m and os_p.  The last argument is unused.
+ *
+ * The CBLAS dispatch is disabled by `#if 0`, so each iteration is handled by
+ * UINT_matmul_inner_noblas().
+ */
+NPY_NO_EXPORT void
+UINT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Outer iteration count, then the core sizes m, n, p. */
+    npy_intp n_outer = dimensions[0];
+    npy_intp dm = dimensions[1];
+    npy_intp dn = dimensions[2];
+    npy_intp dp = dimensions[3];
+    /* Byte offsets applied to args[0], args[1], args[2] per iteration. */
+    npy_intp outer_step_a = steps[0];
+    npy_intp outer_step_b = steps[1];
+    npy_intp outer_step_o = steps[2];
+    /* Core strides of the first input, the second input and the output. */
+    npy_intp is1_m = steps[3];
+    npy_intp is1_n = steps[4];
+    npy_intp is2_n = steps[5];
+    npy_intp is2_p = steps[6];
+    npy_intp os_m = steps[7];
+    npy_intp os_p = steps[8];
+    npy_intp remaining;
+#if 0 && defined(HAVE_CBLAS)
+    /* Disabled: shape and stride predicates used to pick a BLAS routine. */
+    npy_intp sz = sizeof(npy_uint);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+            dp > BLAS_MAXSIZE);
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+            is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1) && i1blasable &&
+            is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+
+    for (remaining = n_outer; remaining > 0; remaining--) {
+        void *ip1 = args[0];
+        void *ip2 = args[1];
+        void *op = args[2];
+
+#if 0 && defined(HAVE_CBLAS)
+        /*
+         * TODO: refactor this out to a inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {
+            UINT_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case && scalar_out) {
+            /* row @ column, 1,1 output */
+            UINT_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+        }
+        else if (special_case && scalar_vec) {
+            /*
+             * 1,1d @ vector or vector @ 1,1d
+             * could use cblas_Xaxy, but that requires 0ing output
+             * and would not be faster (XXX prove it)
+             */
+            UINT_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case && vector_matrix) {
+            /* vector @ matrix, switch ip1, ip2, p and m */
+            UINT_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                    op, os_p, os_m, dp, dn, dm);
+        }
+        else if (special_case && matrix_vector) {
+            /* matrix @ vector */
+            UINT_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* column @ row, 2d output, no blas needed or non-blas-able input */
+            UINT_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (i1blasable && i2blasable && o_c_blasable) {
+            /* matrix @ matrix */
+            UINT_matmul_matrixmatrix(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (i1blasable && i2blasable && o_f_blasable) {
+            /*
+             * matrix @ matrix, using the transpose equivalence:
+             * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+             */
+            UINT_matmul_matrixmatrix(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                    op, os_p, os_m, dp, dn, dm);
+        }
+        else {
+            /*
+             * If parameters are castable to int and we copy the
+             * non-blasable (or non-ccontiguous output)
+             * we could still use BLAS, see gh-12365.
+             */
+            UINT_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+#else
+        UINT_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                op, os_m, os_p, dm, dn, dp);
+#endif
+
+        /* Move all three operand pointers on to the next outer item. */
+        args[0] += outer_step_a;
+        args[1] += outer_step_b;
+        args[2] += outer_step_o;
+    }
+}
+
+
+#line 395
+
+
+/*
+ * Outer loop of matmul for ULONG (npy_ulong) operands.
+ *
+ * dimensions[0] is the outer iteration count and dimensions[1..3] are the
+ * core sizes m, n and p.  steps[0..2] are the byte offsets added to args[0],
+ * args[1] and args[2] after every outer iteration (the args[] pointers are
+ * advanced in place); steps[3..8] are the core strides is1_m, is1_n, is2_n,
+ * is2_p, os_m and os_p.  The last argument is unused.
+ *
+ * The CBLAS dispatch is disabled by `#if 0`, so each iteration is handled by
+ * ULONG_matmul_inner_noblas().
+ */
+NPY_NO_EXPORT void
+ULONG_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Outer iteration count, then the core sizes m, n, p. */
+    npy_intp n_outer = dimensions[0];
+    npy_intp dm = dimensions[1];
+    npy_intp dn = dimensions[2];
+    npy_intp dp = dimensions[3];
+    /* Byte offsets applied to args[0], args[1], args[2] per iteration. */
+    npy_intp outer_step_a = steps[0];
+    npy_intp outer_step_b = steps[1];
+    npy_intp outer_step_o = steps[2];
+    /* Core strides of the first input, the second input and the output. */
+    npy_intp is1_m = steps[3];
+    npy_intp is1_n = steps[4];
+    npy_intp is2_n = steps[5];
+    npy_intp is2_p = steps[6];
+    npy_intp os_m = steps[7];
+    npy_intp os_p = steps[8];
+    npy_intp remaining;
+#if 0 && defined(HAVE_CBLAS)
+    /* Disabled: shape and stride predicates used to pick a BLAS routine. */
+    npy_intp sz = sizeof(npy_ulong);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+            dp > BLAS_MAXSIZE);
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+            is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1) && i1blasable &&
+            is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+
+    for (remaining = n_outer; remaining > 0; remaining--) {
+        void *ip1 = args[0];
+        void *ip2 = args[1];
+        void *op = args[2];
+
+#if 0 && defined(HAVE_CBLAS)
+        /*
+         * TODO: refactor this out to a inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {
+            ULONG_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case && scalar_out) {
+            /* row @ column, 1,1 output */
+            ULONG_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+        }
+        else if (special_case && scalar_vec) {
+            /*
+             * 1,1d @ vector or vector @ 1,1d
+             * could use cblas_Xaxy, but that requires 0ing output
+             * and would not be faster (XXX prove it)
+             */
+            ULONG_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case && vector_matrix) {
+            /* vector @ matrix, switch ip1, ip2, p and m */
+            ULONG_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                    op, os_p, os_m, dp, dn, dm);
+        }
+        else if (special_case && matrix_vector) {
+            /* matrix @ vector */
+            ULONG_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* column @ row, 2d output, no blas needed or non-blas-able input */
+            ULONG_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (i1blasable && i2blasable && o_c_blasable) {
+            /* matrix @ matrix */
+            ULONG_matmul_matrixmatrix(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (i1blasable && i2blasable && o_f_blasable) {
+            /*
+             * matrix @ matrix, using the transpose equivalence:
+             * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+             */
+            ULONG_matmul_matrixmatrix(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                    op, os_p, os_m, dp, dn, dm);
+        }
+        else {
+            /*
+             * If parameters are castable to int and we copy the
+             * non-blasable (or non-ccontiguous output)
+             * we could still use BLAS, see gh-12365.
+             */
+            ULONG_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+#else
+        ULONG_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                op, os_m, os_p, dm, dn, dp);
+#endif
+
+        /* Move all three operand pointers on to the next outer item. */
+        args[0] += outer_step_a;
+        args[1] += outer_step_b;
+        args[2] += outer_step_o;
+    }
+}
+
+
+#line 395
+
+
+/*
+ * Outer loop of matmul for ULONGLONG (npy_ulonglong) operands.
+ *
+ * dimensions[0] is the outer iteration count and dimensions[1..3] are the
+ * core sizes m, n and p.  steps[0..2] are the byte offsets added to args[0],
+ * args[1] and args[2] after every outer iteration (the args[] pointers are
+ * advanced in place); steps[3..8] are the core strides is1_m, is1_n, is2_n,
+ * is2_p, os_m and os_p.  The last argument is unused.
+ *
+ * The CBLAS dispatch is disabled by `#if 0`, so each iteration is handled by
+ * ULONGLONG_matmul_inner_noblas().
+ */
+NPY_NO_EXPORT void
+ULONGLONG_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Outer iteration count, then the core sizes m, n, p. */
+    npy_intp n_outer = dimensions[0];
+    npy_intp dm = dimensions[1];
+    npy_intp dn = dimensions[2];
+    npy_intp dp = dimensions[3];
+    /* Byte offsets applied to args[0], args[1], args[2] per iteration. */
+    npy_intp outer_step_a = steps[0];
+    npy_intp outer_step_b = steps[1];
+    npy_intp outer_step_o = steps[2];
+    /* Core strides of the first input, the second input and the output. */
+    npy_intp is1_m = steps[3];
+    npy_intp is1_n = steps[4];
+    npy_intp is2_n = steps[5];
+    npy_intp is2_p = steps[6];
+    npy_intp os_m = steps[7];
+    npy_intp os_p = steps[8];
+    npy_intp remaining;
+#if 0 && defined(HAVE_CBLAS)
+    /* Disabled: shape and stride predicates used to pick a BLAS routine. */
+    npy_intp sz = sizeof(npy_ulonglong);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+            dp > BLAS_MAXSIZE);
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+            is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1) && i1blasable &&
+            is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+
+    for (remaining = n_outer; remaining > 0; remaining--) {
+        void *ip1 = args[0];
+        void *ip2 = args[1];
+        void *op = args[2];
+
+#if 0 && defined(HAVE_CBLAS)
+        /*
+         * TODO: refactor this out to a inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {
+            ULONGLONG_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case && scalar_out) {
+            /* row @ column, 1,1 output */
+            ULONGLONG_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+        }
+        else if (special_case && scalar_vec) {
+            /*
+             * 1,1d @ vector or vector @ 1,1d
+             * could use cblas_Xaxy, but that requires 0ing output
+             * and would not be faster (XXX prove it)
+             */
+            ULONGLONG_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case && vector_matrix) {
+            /* vector @ matrix, switch ip1, ip2, p and m */
+            ULONGLONG_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                    op, os_p, os_m, dp, dn, dm);
+        }
+        else if (special_case && matrix_vector) {
+            /* matrix @ vector */
+            ULONGLONG_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* column @ row, 2d output, no blas needed or non-blas-able input */
+            ULONGLONG_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (i1blasable && i2blasable && o_c_blasable) {
+            /* matrix @ matrix */
+            ULONGLONG_matmul_matrixmatrix(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (i1blasable && i2blasable && o_f_blasable) {
+            /*
+             * matrix @ matrix, using the transpose equivalence:
+             * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+             */
+            ULONGLONG_matmul_matrixmatrix(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                    op, os_p, os_m, dp, dn, dm);
+        }
+        else {
+            /*
+             * If parameters are castable to int and we copy the
+             * non-blasable (or non-ccontiguous output)
+             * we could still use BLAS, see gh-12365.
+             */
+            ULONGLONG_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+#else
+        ULONGLONG_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                op, os_m, os_p, dm, dn, dp);
+#endif
+
+        /* Move all three operand pointers on to the next outer item. */
+        args[0] += outer_step_a;
+        args[1] += outer_step_b;
+        args[2] += outer_step_o;
+    }
+}
+
+
+#line 395
+
+
+/*
+ * Outer loop of matmul for BYTE (npy_byte) operands.
+ *
+ * dimensions[0] is the outer iteration count and dimensions[1..3] are the
+ * core sizes m, n and p.  steps[0..2] are the byte offsets added to args[0],
+ * args[1] and args[2] after every outer iteration (the args[] pointers are
+ * advanced in place); steps[3..8] are the core strides is1_m, is1_n, is2_n,
+ * is2_p, os_m and os_p.  The last argument is unused.
+ *
+ * The CBLAS dispatch is disabled by `#if 0`, so each iteration is handled by
+ * BYTE_matmul_inner_noblas().
+ */
+NPY_NO_EXPORT void
+BYTE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Outer iteration count, then the core sizes m, n, p. */
+    npy_intp n_outer = dimensions[0];
+    npy_intp dm = dimensions[1];
+    npy_intp dn = dimensions[2];
+    npy_intp dp = dimensions[3];
+    /* Byte offsets applied to args[0], args[1], args[2] per iteration. */
+    npy_intp outer_step_a = steps[0];
+    npy_intp outer_step_b = steps[1];
+    npy_intp outer_step_o = steps[2];
+    /* Core strides of the first input, the second input and the output. */
+    npy_intp is1_m = steps[3];
+    npy_intp is1_n = steps[4];
+    npy_intp is2_n = steps[5];
+    npy_intp is2_p = steps[6];
+    npy_intp os_m = steps[7];
+    npy_intp os_p = steps[8];
+    npy_intp remaining;
+#if 0 && defined(HAVE_CBLAS)
+    /* Disabled: shape and stride predicates used to pick a BLAS routine. */
+    npy_intp sz = sizeof(npy_byte);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+            dp > BLAS_MAXSIZE);
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+            is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1) && i1blasable &&
+            is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+
+    for (remaining = n_outer; remaining > 0; remaining--) {
+        void *ip1 = args[0];
+        void *ip2 = args[1];
+        void *op = args[2];
+
+#if 0 && defined(HAVE_CBLAS)
+        /*
+         * TODO: refactor this out to a inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {
+            BYTE_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case && scalar_out) {
+            /* row @ column, 1,1 output */
+            BYTE_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+        }
+        else if (special_case && scalar_vec) {
+            /*
+             * 1,1d @ vector or vector @ 1,1d
+             * could use cblas_Xaxy, but that requires 0ing output
+             * and would not be faster (XXX prove it)
+             */
+            BYTE_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case && vector_matrix) {
+            /* vector @ matrix, switch ip1, ip2, p and m */
+            BYTE_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                    op, os_p, os_m, dp, dn, dm);
+        }
+        else if (special_case && matrix_vector) {
+            /* matrix @ vector */
+            BYTE_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* column @ row, 2d output, no blas needed or non-blas-able input */
+            BYTE_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (i1blasable && i2blasable && o_c_blasable) {
+            /* matrix @ matrix */
+            BYTE_matmul_matrixmatrix(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (i1blasable && i2blasable && o_f_blasable) {
+            /*
+             * matrix @ matrix, using the transpose equivalence:
+             * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+             */
+            BYTE_matmul_matrixmatrix(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                    op, os_p, os_m, dp, dn, dm);
+        }
+        else {
+            /*
+             * If parameters are castable to int and we copy the
+             * non-blasable (or non-ccontiguous output)
+             * we could still use BLAS, see gh-12365.
+             */
+            BYTE_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+#else
+        BYTE_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                op, os_m, os_p, dm, dn, dp);
+#endif
+
+        /* Move all three operand pointers on to the next outer item. */
+        args[0] += outer_step_a;
+        args[1] += outer_step_b;
+        args[2] += outer_step_o;
+    }
+}
+
+
+#line 395
+
+
+/*
+ * Outer loop of matmul for SHORT (npy_short) operands.
+ *
+ * dimensions[0] is the outer iteration count and dimensions[1..3] are the
+ * core sizes m, n and p.  steps[0..2] are the byte offsets added to args[0],
+ * args[1] and args[2] after every outer iteration (the args[] pointers are
+ * advanced in place); steps[3..8] are the core strides is1_m, is1_n, is2_n,
+ * is2_p, os_m and os_p.  The last argument is unused.
+ *
+ * The CBLAS dispatch is disabled by `#if 0`, so each iteration is handled by
+ * SHORT_matmul_inner_noblas().
+ */
+NPY_NO_EXPORT void
+SHORT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Outer iteration count, then the core sizes m, n, p. */
+    npy_intp n_outer = dimensions[0];
+    npy_intp dm = dimensions[1];
+    npy_intp dn = dimensions[2];
+    npy_intp dp = dimensions[3];
+    /* Byte offsets applied to args[0], args[1], args[2] per iteration. */
+    npy_intp outer_step_a = steps[0];
+    npy_intp outer_step_b = steps[1];
+    npy_intp outer_step_o = steps[2];
+    /* Core strides of the first input, the second input and the output. */
+    npy_intp is1_m = steps[3];
+    npy_intp is1_n = steps[4];
+    npy_intp is2_n = steps[5];
+    npy_intp is2_p = steps[6];
+    npy_intp os_m = steps[7];
+    npy_intp os_p = steps[8];
+    npy_intp remaining;
+#if 0 && defined(HAVE_CBLAS)
+    /* Disabled: shape and stride predicates used to pick a BLAS routine. */
+    npy_intp sz = sizeof(npy_short);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+            dp > BLAS_MAXSIZE);
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+            is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1) && i1blasable &&
+            is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+
+    for (remaining = n_outer; remaining > 0; remaining--) {
+        void *ip1 = args[0];
+        void *ip2 = args[1];
+        void *op = args[2];
+
+#if 0 && defined(HAVE_CBLAS)
+        /*
+         * TODO: refactor this out to a inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {
+            SHORT_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case && scalar_out) {
+            /* row @ column, 1,1 output */
+            SHORT_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+        }
+        else if (special_case && scalar_vec) {
+            /*
+             * 1,1d @ vector or vector @ 1,1d
+             * could use cblas_Xaxy, but that requires 0ing output
+             * and would not be faster (XXX prove it)
+             */
+            SHORT_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case && vector_matrix) {
+            /* vector @ matrix, switch ip1, ip2, p and m */
+            SHORT_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                    op, os_p, os_m, dp, dn, dm);
+        }
+        else if (special_case && matrix_vector) {
+            /* matrix @ vector */
+            SHORT_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* column @ row, 2d output, no blas needed or non-blas-able input */
+            SHORT_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (i1blasable && i2blasable && o_c_blasable) {
+            /* matrix @ matrix */
+            SHORT_matmul_matrixmatrix(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+        else if (i1blasable && i2blasable && o_f_blasable) {
+            /*
+             * matrix @ matrix, using the transpose equivalence:
+             * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+             */
+            SHORT_matmul_matrixmatrix(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                    op, os_p, os_m, dp, dn, dm);
+        }
+        else {
+            /*
+             * If parameters are castable to int and we copy the
+             * non-blasable (or non-ccontiguous output)
+             * we could still use BLAS, see gh-12365.
+             */
+            SHORT_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                    op, os_m, os_p, dm, dn, dp);
+        }
+#else
+        SHORT_matmul_inner_noblas(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+                op, os_m, os_p, dm, dn, dp);
+#endif
+
+        /* Move all three operand pointers on to the next outer item. */
+        args[0] += outer_step_a;
+        args[1] += outer_step_b;
+        args[2] += outer_step_o;
+    }
+}
+
+
+#line 395
+
+
+/*
+ * matmul inner loop for the INT (npy_int) type instantiation.
+ *
+ * args       - args[0] and args[1] are read as the two inputs and args[2] as
+ *              the output; all three pointers are advanced in place once per
+ *              outer iteration.
+ * dimensions - dimensions[0] is the outer loop count; the next three entries
+ *              are the core dimensions m, n and p.
+ * steps      - steps[0..2] are the per-iteration byte offsets applied to
+ *              args[0..2]; the next six entries are the core strides
+ *              (is1_m, is1_n, is2_n, is2_p, os_m, os_p).
+ * func       - unused.
+ *
+ * Both `#if 0 && defined(HAVE_CBLAS)` sections are compiled out
+ * unconditionally, so every outer iteration calls INT_matmul_inner_noblas.
+ */
+NPY_NO_EXPORT void
+INT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Consume the outer count and the three outer steps, leaving the core values. */
+    npy_intp dOuter = *dimensions++;
+    npy_intp iOuter;
+    npy_intp s0 = *steps++;
+    npy_intp s1 = *steps++;
+    npy_intp s2 = *steps++;
+    npy_intp dm = dimensions[0];
+    npy_intp dn = dimensions[1];
+    npy_intp dp = dimensions[2];
+    npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+         os_m=steps[4], os_p=steps[5];
+    /* The leading `0 &&` makes this always false: the BLAS flags are never built. */
+#if 0 && defined(HAVE_CBLAS)
+    npy_intp sz = sizeof(npy_int);
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+                                 dp > BLAS_MAXSIZE);
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+                              is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1)  && i1blasable &&
+                              is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+
+    /* One core matmul per outer iteration; operand pointers advance by s0/s1/s2. */
+    for (iOuter = 0; iOuter < dOuter; iOuter++,
+                         args[0] += s0, args[1] += s1, args[2] += s2) {
+        void *ip1=args[0], *ip2=args[1], *op=args[2];
+        /* Disabled by `0 &&`: only the #else branch at the bottom is compiled. */
+#if 0 && defined(HAVE_CBLAS)
+        /*
+         * TODO: refactor this out to a inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {
+            INT_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                       ip2, is2_n, is2_p,
+                                       op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* Special case variants that have a 1 in the core dimensions */
+            if (scalar_out) {
+                /* row @ column, 1,1 output */
+                INT_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+            } else if (scalar_vec){
+                /*
+                 * 1,1d @ vector or vector @ 1,1d
+                 * could use cblas_Xaxy, but that requires 0ing output
+                 * and would not be faster (XXX prove it)
+                 */
+                INT_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            } else if (vector_matrix) {
+                /* vector @ matrix, switch ip1, ip2, p and m */
+                INT_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                            op, os_p, os_m, dp, dn, dm);
+            } else if  (matrix_vector) {
+                /* matrix @ vector */
+                INT_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+
+                            op, os_m, os_p, dm, dn, dp);
+            } else {
+                /* column @ row, 2d output, no blas needed or non-blas-able input */
+                INT_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        } else {
+            /* matrix @ matrix */
+            if (i1blasable && i2blasable && o_c_blasable) {
+                INT_matmul_matrixmatrix(ip1, is1_m, is1_n,
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p,
+                                           dm, dn, dp);
+            } else if (i1blasable && i2blasable && o_f_blasable) {
+                /*
+                 * Use transpose equivalence:
+                 * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+                 */
+                INT_matmul_matrixmatrix(ip2, is2_p, is2_n,
+                                           ip1, is1_n, is1_m,
+                                           op, os_p, os_m,
+                                           dp, dn, dm);
+            } else {
+                /*
+                 * If parameters are castable to int and we copy the
+                 * non-blasable (or non-ccontiguous output)
+                 * we could still use BLAS, see gh-12365.
+                 */
+                INT_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        }
+#else
+        INT_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                   ip2, is2_n, is2_p,
+                                   op, os_m, os_p, dm, dn, dp);
+
+#endif
+    }
+}
+
+
+#line 395
+
+
+/*
+ * matmul inner loop for the LONG (npy_long) type instantiation.
+ *
+ * args       - args[0] and args[1] are read as the two inputs and args[2] as
+ *              the output; all three pointers are advanced in place once per
+ *              outer iteration.
+ * dimensions - dimensions[0] is the outer loop count; the next three entries
+ *              are the core dimensions m, n and p.
+ * steps      - steps[0..2] are the per-iteration byte offsets applied to
+ *              args[0..2]; the next six entries are the core strides
+ *              (is1_m, is1_n, is2_n, is2_p, os_m, os_p).
+ * func       - unused.
+ *
+ * Both `#if 0 && defined(HAVE_CBLAS)` sections are compiled out
+ * unconditionally, so every outer iteration calls LONG_matmul_inner_noblas.
+ */
+NPY_NO_EXPORT void
+LONG_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Consume the outer count and the three outer steps, leaving the core values. */
+    npy_intp dOuter = *dimensions++;
+    npy_intp iOuter;
+    npy_intp s0 = *steps++;
+    npy_intp s1 = *steps++;
+    npy_intp s2 = *steps++;
+    npy_intp dm = dimensions[0];
+    npy_intp dn = dimensions[1];
+    npy_intp dp = dimensions[2];
+    npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+         os_m=steps[4], os_p=steps[5];
+    /* The leading `0 &&` makes this always false: the BLAS flags are never built. */
+#if 0 && defined(HAVE_CBLAS)
+    npy_intp sz = sizeof(npy_long);
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+                                 dp > BLAS_MAXSIZE);
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+                              is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1)  && i1blasable &&
+                              is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+
+    /* One core matmul per outer iteration; operand pointers advance by s0/s1/s2. */
+    for (iOuter = 0; iOuter < dOuter; iOuter++,
+                         args[0] += s0, args[1] += s1, args[2] += s2) {
+        void *ip1=args[0], *ip2=args[1], *op=args[2];
+        /* Disabled by `0 &&`: only the #else branch at the bottom is compiled. */
+#if 0 && defined(HAVE_CBLAS)
+        /*
+         * TODO: refactor this out to a inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {
+            LONG_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                       ip2, is2_n, is2_p,
+                                       op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* Special case variants that have a 1 in the core dimensions */
+            if (scalar_out) {
+                /* row @ column, 1,1 output */
+                LONG_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+            } else if (scalar_vec){
+                /*
+                 * 1,1d @ vector or vector @ 1,1d
+                 * could use cblas_Xaxy, but that requires 0ing output
+                 * and would not be faster (XXX prove it)
+                 */
+                LONG_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            } else if (vector_matrix) {
+                /* vector @ matrix, switch ip1, ip2, p and m */
+                LONG_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                            op, os_p, os_m, dp, dn, dm);
+            } else if  (matrix_vector) {
+                /* matrix @ vector */
+                LONG_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+
+                            op, os_m, os_p, dm, dn, dp);
+            } else {
+                /* column @ row, 2d output, no blas needed or non-blas-able input */
+                LONG_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        } else {
+            /* matrix @ matrix */
+            if (i1blasable && i2blasable && o_c_blasable) {
+                LONG_matmul_matrixmatrix(ip1, is1_m, is1_n,
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p,
+                                           dm, dn, dp);
+            } else if (i1blasable && i2blasable && o_f_blasable) {
+                /*
+                 * Use transpose equivalence:
+                 * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+                 */
+                LONG_matmul_matrixmatrix(ip2, is2_p, is2_n,
+                                           ip1, is1_n, is1_m,
+                                           op, os_p, os_m,
+                                           dp, dn, dm);
+            } else {
+                /*
+                 * If parameters are castable to int and we copy the
+                 * non-blasable (or non-ccontiguous output)
+                 * we could still use BLAS, see gh-12365.
+                 */
+                LONG_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        }
+#else
+        LONG_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                   ip2, is2_n, is2_p,
+                                   op, os_m, os_p, dm, dn, dp);
+
+#endif
+    }
+}
+
+
+#line 395
+
+
+/*
+ * matmul inner loop for the LONGLONG (npy_longlong) type instantiation.
+ *
+ * args       - args[0] and args[1] are read as the two inputs and args[2] as
+ *              the output; all three pointers are advanced in place once per
+ *              outer iteration.
+ * dimensions - dimensions[0] is the outer loop count; the next three entries
+ *              are the core dimensions m, n and p.
+ * steps      - steps[0..2] are the per-iteration byte offsets applied to
+ *              args[0..2]; the next six entries are the core strides
+ *              (is1_m, is1_n, is2_n, is2_p, os_m, os_p).
+ * func       - unused.
+ *
+ * Both `#if 0 && defined(HAVE_CBLAS)` sections are compiled out
+ * unconditionally, so every outer iteration calls
+ * LONGLONG_matmul_inner_noblas.
+ */
+NPY_NO_EXPORT void
+LONGLONG_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Consume the outer count and the three outer steps, leaving the core values. */
+    npy_intp dOuter = *dimensions++;
+    npy_intp iOuter;
+    npy_intp s0 = *steps++;
+    npy_intp s1 = *steps++;
+    npy_intp s2 = *steps++;
+    npy_intp dm = dimensions[0];
+    npy_intp dn = dimensions[1];
+    npy_intp dp = dimensions[2];
+    npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+         os_m=steps[4], os_p=steps[5];
+    /* The leading `0 &&` makes this always false: the BLAS flags are never built. */
+#if 0 && defined(HAVE_CBLAS)
+    npy_intp sz = sizeof(npy_longlong);
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+                                 dp > BLAS_MAXSIZE);
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+                              is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1)  && i1blasable &&
+                              is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+
+    /* One core matmul per outer iteration; operand pointers advance by s0/s1/s2. */
+    for (iOuter = 0; iOuter < dOuter; iOuter++,
+                         args[0] += s0, args[1] += s1, args[2] += s2) {
+        void *ip1=args[0], *ip2=args[1], *op=args[2];
+        /* Disabled by `0 &&`: only the #else branch at the bottom is compiled. */
+#if 0 && defined(HAVE_CBLAS)
+        /*
+         * TODO: refactor this out to a inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {
+            LONGLONG_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                       ip2, is2_n, is2_p,
+                                       op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* Special case variants that have a 1 in the core dimensions */
+            if (scalar_out) {
+                /* row @ column, 1,1 output */
+                LONGLONG_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+            } else if (scalar_vec){
+                /*
+                 * 1,1d @ vector or vector @ 1,1d
+                 * could use cblas_Xaxy, but that requires 0ing output
+                 * and would not be faster (XXX prove it)
+                 */
+                LONGLONG_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            } else if (vector_matrix) {
+                /* vector @ matrix, switch ip1, ip2, p and m */
+                LONGLONG_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                            op, os_p, os_m, dp, dn, dm);
+            } else if  (matrix_vector) {
+                /* matrix @ vector */
+                LONGLONG_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+
+                            op, os_m, os_p, dm, dn, dp);
+            } else {
+                /* column @ row, 2d output, no blas needed or non-blas-able input */
+                LONGLONG_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        } else {
+            /* matrix @ matrix */
+            if (i1blasable && i2blasable && o_c_blasable) {
+                LONGLONG_matmul_matrixmatrix(ip1, is1_m, is1_n,
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p,
+                                           dm, dn, dp);
+            } else if (i1blasable && i2blasable && o_f_blasable) {
+                /*
+                 * Use transpose equivalence:
+                 * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+                 */
+                LONGLONG_matmul_matrixmatrix(ip2, is2_p, is2_n,
+                                           ip1, is1_n, is1_m,
+                                           op, os_p, os_m,
+                                           dp, dn, dm);
+            } else {
+                /*
+                 * If parameters are castable to int and we copy the
+                 * non-blasable (or non-ccontiguous output)
+                 * we could still use BLAS, see gh-12365.
+                 */
+                LONGLONG_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        }
+#else
+        LONGLONG_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                   ip2, is2_n, is2_p,
+                                   op, os_m, os_p, dm, dn, dp);
+
+#endif
+    }
+}
+
+
+#line 395
+
+
+/*
+ * matmul inner loop for the BOOL (npy_bool) type instantiation.
+ *
+ * args       - args[0] and args[1] are read as the two inputs and args[2] as
+ *              the output; all three pointers are advanced in place once per
+ *              outer iteration.
+ * dimensions - dimensions[0] is the outer loop count; the next three entries
+ *              are the core dimensions m, n and p.
+ * steps      - steps[0..2] are the per-iteration byte offsets applied to
+ *              args[0..2]; the next six entries are the core strides
+ *              (is1_m, is1_n, is2_n, is2_p, os_m, os_p).
+ * func       - unused.
+ *
+ * Both `#if 0 && defined(HAVE_CBLAS)` sections are compiled out
+ * unconditionally, so every outer iteration calls BOOL_matmul_inner_noblas.
+ */
+NPY_NO_EXPORT void
+BOOL_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Consume the outer count and the three outer steps, leaving the core values. */
+    npy_intp dOuter = *dimensions++;
+    npy_intp iOuter;
+    npy_intp s0 = *steps++;
+    npy_intp s1 = *steps++;
+    npy_intp s2 = *steps++;
+    npy_intp dm = dimensions[0];
+    npy_intp dn = dimensions[1];
+    npy_intp dp = dimensions[2];
+    npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+         os_m=steps[4], os_p=steps[5];
+    /* The leading `0 &&` makes this always false: the BLAS flags are never built. */
+#if 0 && defined(HAVE_CBLAS)
+    npy_intp sz = sizeof(npy_bool);
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+                                 dp > BLAS_MAXSIZE);
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+                              is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1)  && i1blasable &&
+                              is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+
+    /* One core matmul per outer iteration; operand pointers advance by s0/s1/s2. */
+    for (iOuter = 0; iOuter < dOuter; iOuter++,
+                         args[0] += s0, args[1] += s1, args[2] += s2) {
+        void *ip1=args[0], *ip2=args[1], *op=args[2];
+        /* Disabled by `0 &&`: only the #else branch at the bottom is compiled. */
+#if 0 && defined(HAVE_CBLAS)
+        /*
+         * TODO: refactor this out to a inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {
+            BOOL_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                       ip2, is2_n, is2_p,
+                                       op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* Special case variants that have a 1 in the core dimensions */
+            if (scalar_out) {
+                /* row @ column, 1,1 output */
+                BOOL_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+            } else if (scalar_vec){
+                /*
+                 * 1,1d @ vector or vector @ 1,1d
+                 * could use cblas_Xaxy, but that requires 0ing output
+                 * and would not be faster (XXX prove it)
+                 */
+                BOOL_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            } else if (vector_matrix) {
+                /* vector @ matrix, switch ip1, ip2, p and m */
+                BOOL_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                            op, os_p, os_m, dp, dn, dm);
+            } else if  (matrix_vector) {
+                /* matrix @ vector */
+                BOOL_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+
+                            op, os_m, os_p, dm, dn, dp);
+            } else {
+                /* column @ row, 2d output, no blas needed or non-blas-able input */
+                BOOL_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        } else {
+            /* matrix @ matrix */
+            if (i1blasable && i2blasable && o_c_blasable) {
+                BOOL_matmul_matrixmatrix(ip1, is1_m, is1_n,
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p,
+                                           dm, dn, dp);
+            } else if (i1blasable && i2blasable && o_f_blasable) {
+                /*
+                 * Use transpose equivalence:
+                 * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+                 */
+                BOOL_matmul_matrixmatrix(ip2, is2_p, is2_n,
+                                           ip1, is1_n, is1_m,
+                                           op, os_p, os_m,
+                                           dp, dn, dm);
+            } else {
+                /*
+                 * If parameters are castable to int and we copy the
+                 * non-blasable (or non-ccontiguous output)
+                 * we could still use BLAS, see gh-12365.
+                 */
+                BOOL_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        }
+#else
+        BOOL_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                   ip2, is2_n, is2_p,
+                                   op, os_m, os_p, dm, dn, dp);
+
+#endif
+    }
+}
+
+
+#line 395
+
+
+/*
+ * matmul inner loop for the OBJECT type instantiation.
+ *
+ * args       - args[0] and args[1] are read as the two inputs and args[2] as
+ *              the output; all three pointers are advanced in place once per
+ *              outer iteration.
+ * dimensions - dimensions[0] is the outer loop count; the next three entries
+ *              are the core dimensions m, n and p.
+ * steps      - steps[0..2] are the per-iteration byte offsets applied to
+ *              args[0..2]; the next six entries are the core strides
+ *              (is1_m, is1_n, is2_n, is2_p, os_m, os_p).
+ * func       - unused.
+ *
+ * Both `#if 0 && defined(HAVE_CBLAS)` sections are compiled out
+ * unconditionally, so every outer iteration calls OBJECT_matmul_inner_noblas.
+ */
+NPY_NO_EXPORT void
+OBJECT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /* Consume the outer count and the three outer steps, leaving the core values. */
+    npy_intp dOuter = *dimensions++;
+    npy_intp iOuter;
+    npy_intp s0 = *steps++;
+    npy_intp s1 = *steps++;
+    npy_intp s2 = *steps++;
+    npy_intp dm = dimensions[0];
+    npy_intp dn = dimensions[1];
+    npy_intp dp = dimensions[2];
+    npy_intp is1_m=steps[0], is1_n=steps[1], is2_n=steps[2], is2_p=steps[3],
+         os_m=steps[4], os_p=steps[5];
+    /* The leading `0 &&` makes this always false: the BLAS flags are never built. */
+#if 0 && defined(HAVE_CBLAS)
+    /*
+     * NOTE(review): `npy_object` is presumably not a declared type; this only
+     * goes unnoticed because the section is compiled out. Confirm before
+     * ever enabling it.
+     */
+    npy_intp sz = sizeof(npy_object);
+    npy_bool special_case = (dm == 1 || dn == 1 || dp == 1);
+    npy_bool any_zero_dim = (dm == 0 || dn == 0 || dp == 0);
+    npy_bool scalar_out = (dm == 1 && dp == 1);
+    npy_bool scalar_vec = (dn == 1 && (dp == 1 || dm == 1));
+    npy_bool too_big_for_blas = (dm > BLAS_MAXSIZE || dn > BLAS_MAXSIZE ||
+                                 dp > BLAS_MAXSIZE);
+    npy_bool i1_c_blasable = is_blasable2d(is1_m, is1_n, dm, dn, sz);
+    npy_bool i2_c_blasable = is_blasable2d(is2_n, is2_p, dn, dp, sz);
+    npy_bool i1_f_blasable = is_blasable2d(is1_n, is1_m, dn, dm, sz);
+    npy_bool i2_f_blasable = is_blasable2d(is2_p, is2_n, dp, dn, sz);
+    npy_bool i1blasable = i1_c_blasable || i1_f_blasable;
+    npy_bool i2blasable = i2_c_blasable || i2_f_blasable;
+    npy_bool o_c_blasable = is_blasable2d(os_m, os_p, dm, dp, sz);
+    npy_bool o_f_blasable = is_blasable2d(os_p, os_m, dp, dm, sz);
+    npy_bool vector_matrix = ((dm == 1) && i2blasable &&
+                              is_blasable2d(is1_n, sz, dn, 1, sz));
+    npy_bool matrix_vector = ((dp == 1)  && i1blasable &&
+                              is_blasable2d(is2_n, sz, dn, 1, sz));
+#endif
+
+    /* One core matmul per outer iteration; operand pointers advance by s0/s1/s2. */
+    for (iOuter = 0; iOuter < dOuter; iOuter++,
+                         args[0] += s0, args[1] += s1, args[2] += s2) {
+        void *ip1=args[0], *ip2=args[1], *op=args[2];
+        /* Disabled by `0 &&`: only the #else branch at the bottom is compiled. */
+#if 0 && defined(HAVE_CBLAS)
+        /*
+         * TODO: refactor this out to a inner_loop_selector, in
+         * PyUFunc_MatmulLoopSelector. But that call does not have access to
+         * n, m, p and strides.
+         */
+        if (too_big_for_blas || any_zero_dim) {
+            OBJECT_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                       ip2, is2_n, is2_p,
+                                       op, os_m, os_p, dm, dn, dp);
+        }
+        else if (special_case) {
+            /* Special case variants that have a 1 in the core dimensions */
+            if (scalar_out) {
+                /* row @ column, 1,1 output */
+                OBJECT_dot(ip1, is1_n, ip2, is2_n, op, dn, NULL);
+            } else if (scalar_vec){
+                /*
+                 * 1,1d @ vector or vector @ 1,1d
+                 * could use cblas_Xaxy, but that requires 0ing output
+                 * and would not be faster (XXX prove it)
+                 */
+                OBJECT_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            } else if (vector_matrix) {
+                /* vector @ matrix, switch ip1, ip2, p and m */
+                OBJECT_gemv(ip2, is2_p, is2_n, ip1, is1_n, is1_m,
+                            op, os_p, os_m, dp, dn, dm);
+            } else if  (matrix_vector) {
+                /* matrix @ vector */
+                OBJECT_gemv(ip1, is1_m, is1_n, ip2, is2_n, is2_p,
+
+                            op, os_m, os_p, dm, dn, dp);
+            } else {
+                /* column @ row, 2d output, no blas needed or non-blas-able input */
+                OBJECT_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        } else {
+            /* matrix @ matrix */
+            if (i1blasable && i2blasable && o_c_blasable) {
+                OBJECT_matmul_matrixmatrix(ip1, is1_m, is1_n,
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p,
+                                           dm, dn, dp);
+            } else if (i1blasable && i2blasable && o_f_blasable) {
+                /*
+                 * Use transpose equivalence:
+                 * matmul(a, b, o) == matmul(b.T, a.T, o.T)
+                 */
+                OBJECT_matmul_matrixmatrix(ip2, is2_p, is2_n,
+                                           ip1, is1_n, is1_m,
+                                           op, os_p, os_m,
+                                           dp, dn, dm);
+            } else {
+                /*
+                 * If parameters are castable to int and we copy the
+                 * non-blasable (or non-ccontiguous output)
+                 * we could still use BLAS, see gh-12365.
+                 */
+                OBJECT_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                           ip2, is2_n, is2_p,
+                                           op, os_m, os_p, dm, dn, dp);
+            }
+        }
+#else
+        OBJECT_matmul_inner_noblas(ip1, is1_m, is1_n, 
+                                   ip2, is2_n, is2_p,
+                                   op, os_m, os_p, dm, dn, dp);
+
+#endif
+    }
+}
+
+
+
diff --git a/numpy/core/src/_generated/matmul.h b/numpy/core/src/_generated/matmul.h
new file mode 100644
index 000000000000..9caec1db40e4
--- /dev/null
+++ b/numpy/core/src/_generated/matmul.h
@@ -0,0 +1,89 @@
+#line 1 "numpy/core/src/umath/matmul.h.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+#line 8
+NPY_NO_EXPORT void
+FLOAT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+DOUBLE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+LONGDOUBLE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+HALF_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+CFLOAT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+CDOUBLE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+CLONGDOUBLE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+UBYTE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+USHORT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+UINT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+ULONG_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+ULONGLONG_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+BYTE_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+SHORT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+INT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+LONG_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+LONGLONG_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+BOOL_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+#line 8
+NPY_NO_EXPORT void
+OBJECT_matmul(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+
+
+
diff --git a/numpy/core/src/_generated/nditer_templ.c b/numpy/core/src/_generated/nditer_templ.c
new file mode 100644
index 000000000000..d5ac182d3374
--- /dev/null
+++ b/numpy/core/src/_generated/nditer_templ.c
@@ -0,0 +1,8182 @@
+#line 1 "numpy/core/src/multiarray/nditer_templ.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*
+ * This file implements the API functions for NumPy's nditer that
+ * are specialized using the templating system.
+ *
+ * Copyright (c) 2010-2011 by Mark Wiebe (mwwiebe@gmail.com)
+ * The University of British Columbia
+ *
+ * See LICENSE.txt for the license.
+ */
+
+/* Indicate that this .c file is allowed to include the header */
+#define NPY_ITERATOR_IMPLEMENTATION_CODE
+#include "nditer_impl.h"
+
+/* SPECIALIZED iternext functions that handle the non-buffering part */
+
+#line 25
+#line 29
+#line 33
+
+/* Specialized iternext (0,1,1) = (itflags, ndim, nop); returns 0 at end */
+static int
+npyiter_iternext_itflags0_dims1_iters1(
+                                                      NpyIter *iter)
+{
+#if !(0&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    const npy_uint32 itflags = 0;
+#  if 1 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 1 < NPY_MAXDIMS
+    const int nop = 1;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+    /* NAD_NSTRIDES() takes no args; presumably it reads nop/itflags above */
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 1 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 1 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (0&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 1 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(0&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 1 == 1
+    /* Single-axis specialization: only axis 0 decides the result */
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+    /* Multi-axis: when axis 0 is exhausted, carry into axes 1, 2, ... */
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 1 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+    /* Axes 3..ndim-1: axisdata2 appears to be reused as the moving cursor */
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (0,1,2) = (itflags, ndim, nop); returns 0 at end */
+static int
+npyiter_iternext_itflags0_dims1_iters2(
+                                                      NpyIter *iter)
+{
+#if !(0&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    const npy_uint32 itflags = 0;
+#  if 1 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 2 < NPY_MAXDIMS
+    const int nop = 2;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+    /* NAD_NSTRIDES() takes no args; presumably it reads nop/itflags above */
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 1 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 1 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (0&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 1 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(0&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 1 == 1
+    /* Single-axis specialization: only axis 0 decides the result */
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+    /* Multi-axis: when axis 0 is exhausted, carry into axes 1, 2, ... */
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 1 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+    /* Axes 3..ndim-1: axisdata2 appears to be reused as the moving cursor */
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (0,1,ANY) = (itflags, ndim, nop); returns 0 at end */
+static int
+npyiter_iternext_itflags0_dims1_itersANY(
+                                                      NpyIter *iter)
+{
+#if !(0&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    const npy_uint32 itflags = 0;
+#  if 1 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if NPY_MAXDIMS < NPY_MAXDIMS
+    const int nop = NPY_MAXDIMS;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+    /* NAD_NSTRIDES() takes no args; presumably it reads nop/itflags above */
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 1 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 1 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (0&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 1 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(0&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 1 == 1
+    /* Single-axis specialization: only axis 0 decides the result */
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+    /* Multi-axis: when axis 0 is exhausted, carry into axes 1, 2, ... */
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 1 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+    /* Axes 3..ndim-1: axisdata2 appears to be reused as the moving cursor */
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+
+#line 29
+#line 33
+
+/* Specialized iternext (0,2,1) = (itflags, ndim, nop); returns 0 at end */
+static int
+npyiter_iternext_itflags0_dims2_iters1(
+                                                      NpyIter *iter)
+{
+#if !(0&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    const npy_uint32 itflags = 0;
+#  if 2 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 1 < NPY_MAXDIMS
+    const int nop = 1;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+    /* NAD_NSTRIDES() takes no args; presumably it reads nop/itflags above */
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 2 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 2 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (0&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+    /* ndim exists only if 2 >= NPY_MAXDIMS; presumably the macro ignores it */
+#if 2 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(0&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 2 == 1
+    /* Single-axis specialization: only axis 0 decides the result */
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+    /* Multi-axis: when axis 0 is exhausted, carry into axes 1, 2, ... */
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 2 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+    /* Axes 3..ndim-1: axisdata2 appears to be reused as the moving cursor */
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (0,2,2) = (itflags, ndim, nop); returns 0 at end */
+static int
+npyiter_iternext_itflags0_dims2_iters2(
+                                                      NpyIter *iter)
+{
+#if !(0&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    const npy_uint32 itflags = 0;
+#  if 2 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 2 < NPY_MAXDIMS
+    const int nop = 2;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+    /* NAD_NSTRIDES() takes no args; presumably it reads nop/itflags above */
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 2 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 2 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (0&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+    /* ndim exists only if 2 >= NPY_MAXDIMS; presumably the macro ignores it */
+#if 2 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(0&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 2 == 1
+    /* Single-axis specialization: only axis 0 decides the result */
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+    /* Multi-axis: when axis 0 is exhausted, carry into axes 1, 2, ... */
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 2 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+    /* Axes 3..ndim-1: axisdata2 appears to be reused as the moving cursor */
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (0,2,ANY) = (itflags, ndim, nop); returns 0 at end */
+static int
+npyiter_iternext_itflags0_dims2_itersANY(
+                                                      NpyIter *iter)
+{
+#if !(0&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    const npy_uint32 itflags = 0;
+#  if 2 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if NPY_MAXDIMS < NPY_MAXDIMS
+    const int nop = NPY_MAXDIMS;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+    /* NAD_NSTRIDES() takes no args; presumably it reads nop/itflags above */
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 2 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 2 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (0&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+    /* ndim exists only if 2 >= NPY_MAXDIMS; presumably the macro ignores it */
+#if 2 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(0&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 2 == 1
+    /* Single-axis specialization: only axis 0 decides the result */
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+    /* Multi-axis: when axis 0 is exhausted, carry into axes 1, 2, ... */
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 2 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+    /* Axes 3..ndim-1: axisdata2 appears to be reused as the moving cursor */
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+
+#line 29
+#line 33
+
+/* Specialized iternext (0,ANY,1) = (itflags, ndim, nop); returns 0 at end */
+static int
+npyiter_iternext_itflags0_dimsANY_iters1(
+                                                      NpyIter *iter)
+{
+#if !(0&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    const npy_uint32 itflags = 0;
+#  if NPY_MAXDIMS >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 1 < NPY_MAXDIMS
+    const int nop = 1;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+    /* NAD_NSTRIDES() takes no args; presumably it reads nop/itflags above */
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if NPY_MAXDIMS > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if NPY_MAXDIMS > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (0&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if NPY_MAXDIMS > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(0&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if NPY_MAXDIMS == 1
+    /* Single-axis specialization: only axis 0 decides the result */
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+    /* Multi-axis: when axis 0 is exhausted, carry into axes 1, 2, ... */
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if NPY_MAXDIMS == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+    /* Axes 3..ndim-1: axisdata2 appears to be reused as the moving cursor */
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (0,ANY,2): steps the iterator forward by one element, carrying from axisdata 0 into higher axes whenever an index reaches its shape; returns 1 after a successful step and 0 when no axis could advance. Expanded for itflags 0, run-time ndim, and nop 2 (when 2 < NPY_MAXDIMS). */
+static int
+npyiter_iternext_itflags0_dimsANY_iters2(
+                                                      NpyIter *iter)
+{
+#if !(0&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1) /* always true: 0&flag is 0 */
+    const npy_uint32 itflags = 0;
+#  if NPY_MAXDIMS >= NPY_MAXDIMS /* trivially true: ndim is read at run time */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 2 < NPY_MAXDIMS /* nop is the constant 2 unless NPY_MAXDIMS <= 2 */
+    const int nop = 2;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES(); /* NOTE(review): argument-less macro; presumably derives the slot count from itflags/nop in scope - confirm */
+#endif
+#if NPY_MAXDIMS > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if NPY_MAXDIMS > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (0&NPY_ITFLAG_RANGE) /* 0 for itflags 0: the ranged check is compiled out */
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if NPY_MAXDIMS > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); /* NOTE(review): never read by name below; presumably the NIT_INDEX_AXISDATA / NIT_ADVANCE_AXISDATA macros expand to use it - confirm */
+#endif
+
+#  if !(0&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(0&NPY_ITFLAG_EXLOOP) /* always true for itflags 0 */
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0: each of the nstrides pointer slots moves by its own stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if NPY_MAXDIMS == 1 /* one-axis shortcut, compiled only if NPY_MAXDIMS is 1 */
+
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1; /* index 0 is still inside its axis: no carry needed */
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd, so axis 0 restarts from axis 1's new position */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if NPY_MAXDIMS == 2 /* compiled only if NPY_MAXDIMS is 2 */
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) { /* axes 3 .. ndim-1; axisdata2 is reused as the moving cursor */
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2; /* walk back from the cursor towards axisdata0 */
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0); /* axisdata0 is the last one reset */
+
+            return 1;
+        }
+    }
+
+    return 0; /* every axis visited had reached its shape */
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (0,ANY,ANY): steps the iterator forward by one element, carrying from axisdata 0 into higher axes whenever an index reaches its shape; returns 1 after a successful step and 0 when no axis could advance. Expanded for itflags 0, with both ndim and nop read from the iterator at run time. */
+static int
+npyiter_iternext_itflags0_dimsANY_itersANY(
+                                                      NpyIter *iter)
+{
+#if !(0&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1) /* always true: 0&flag is 0 */
+    const npy_uint32 itflags = 0;
+#  if NPY_MAXDIMS >= NPY_MAXDIMS /* trivially true: ndim is read at run time */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if NPY_MAXDIMS < NPY_MAXDIMS /* never true: nop always comes from NIT_NOP(iter) */
+    const int nop = NPY_MAXDIMS;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES(); /* NOTE(review): argument-less macro; presumably derives the slot count from itflags/nop in scope - confirm */
+#endif
+#if NPY_MAXDIMS > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if NPY_MAXDIMS > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (0&NPY_ITFLAG_RANGE) /* 0 for itflags 0: the ranged check is compiled out */
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if NPY_MAXDIMS > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); /* NOTE(review): never read by name below; presumably the NIT_INDEX_AXISDATA / NIT_ADVANCE_AXISDATA macros expand to use it - confirm */
+#endif
+
+#  if !(0&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(0&NPY_ITFLAG_EXLOOP) /* always true for itflags 0 */
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0: each of the nstrides pointer slots moves by its own stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if NPY_MAXDIMS == 1 /* one-axis shortcut, compiled only if NPY_MAXDIMS is 1 */
+
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !(0&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1; /* index 0 is still inside its axis: no carry needed */
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd, so axis 0 restarts from axis 1's new position */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if NPY_MAXDIMS == 2 /* compiled only if NPY_MAXDIMS is 2 */
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) { /* axes 3 .. ndim-1; axisdata2 is reused as the moving cursor */
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2; /* walk back from the cursor towards axisdata0 */
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0); /* axisdata0 is the last one reset */
+
+            return 1;
+        }
+    }
+
+    return 0; /* every axis visited had reached its shape */
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+
+
+#line 25
+#line 29
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_HASINDEX,1,1): one-axis expansion with itflags NPY_ITFLAG_HASINDEX and nop 1 (when 1 < NPY_MAXDIMS); on the non-EXLOOP path it steps axisdata 0 (index and every pointer slot) and returns nonzero while the index is still below the shape. */
+static int
+npyiter_iternext_itflagsIND_dims1_iters1(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (1 > 1) /* NOTE(review): holds only if the two flags share no bit - presumably distinct; confirm */
+    const npy_uint32 itflags = NPY_ITFLAG_HASINDEX;
+#  if 1 >= NPY_MAXDIMS /* idim/ndim are declared only if NPY_MAXDIMS <= 1; they are referenced solely in code that the 1 > 1 and 1 == 1 tests below preprocess away */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 1 < NPY_MAXDIMS /* nop is the constant 1 unless NPY_MAXDIMS <= 1 */
+    const int nop = 1;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES(); /* NOTE(review): argument-less macro; presumably derives the slot count from itflags/nop in scope - confirm */
+#endif
+#if 1 > 1 /* false: no axisdata1 in the one-axis expansion */
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 1 > 2 /* false: no axisdata2 either */
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_HASINDEX&NPY_ITFLAG_RANGE) /* NOTE(review): presumably 0 (distinct flag bits), which compiles this check out - confirm */
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 1 > 1 /* false: skipped */
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0: each of the nstrides pointer slots moves by its own stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 1 == 1 /* true: only this one-axis exit is compiled */
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else /* not compiled in this expansion: everything down to the matching #endif is skipped */
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 1 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_HASINDEX,1,2): one-axis expansion with itflags NPY_ITFLAG_HASINDEX and nop 2 (when 2 < NPY_MAXDIMS); on the non-EXLOOP path it steps axisdata 0 (index and every pointer slot) and returns nonzero while the index is still below the shape. */
+static int
+npyiter_iternext_itflagsIND_dims1_iters2(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (1 > 1) /* NOTE(review): holds only if the two flags share no bit - presumably distinct; confirm */
+    const npy_uint32 itflags = NPY_ITFLAG_HASINDEX;
+#  if 1 >= NPY_MAXDIMS /* idim/ndim are declared only if NPY_MAXDIMS <= 1; they are referenced solely in code that the 1 > 1 and 1 == 1 tests below preprocess away */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 2 < NPY_MAXDIMS /* nop is the constant 2 unless NPY_MAXDIMS <= 2 */
+    const int nop = 2;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES(); /* NOTE(review): argument-less macro; presumably derives the slot count from itflags/nop in scope - confirm */
+#endif
+#if 1 > 1 /* false: no axisdata1 in the one-axis expansion */
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 1 > 2 /* false: no axisdata2 either */
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_HASINDEX&NPY_ITFLAG_RANGE) /* NOTE(review): presumably 0 (distinct flag bits), which compiles this check out - confirm */
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 1 > 1 /* false: skipped */
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0: each of the nstrides pointer slots moves by its own stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 1 == 1 /* true: only this one-axis exit is compiled */
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else /* not compiled in this expansion: everything down to the matching #endif is skipped */
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 1 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_HASINDEX,1,ANY): one-axis expansion with itflags NPY_ITFLAG_HASINDEX and nop read from the iterator; on the non-EXLOOP path it steps axisdata 0 (index and every pointer slot) and returns nonzero while the index is still below the shape. */
+static int
+npyiter_iternext_itflagsIND_dims1_itersANY(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (1 > 1) /* NOTE(review): holds only if the two flags share no bit - presumably distinct; confirm */
+    const npy_uint32 itflags = NPY_ITFLAG_HASINDEX;
+#  if 1 >= NPY_MAXDIMS /* idim/ndim are declared only if NPY_MAXDIMS <= 1; they are referenced solely in code that the 1 > 1 and 1 == 1 tests below preprocess away */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if NPY_MAXDIMS < NPY_MAXDIMS /* never true: nop always comes from NIT_NOP(iter) */
+    const int nop = NPY_MAXDIMS;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES(); /* NOTE(review): argument-less macro; presumably derives the slot count from itflags/nop in scope - confirm */
+#endif
+#if 1 > 1 /* false: no axisdata1 in the one-axis expansion */
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 1 > 2 /* false: no axisdata2 either */
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_HASINDEX&NPY_ITFLAG_RANGE) /* NOTE(review): presumably 0 (distinct flag bits), which compiles this check out - confirm */
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 1 > 1 /* false: skipped */
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0: each of the nstrides pointer slots moves by its own stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 1 == 1 /* true: only this one-axis exit is compiled */
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else /* not compiled in this expansion: everything down to the matching #endif is skipped */
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 1 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+
+#line 29
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_HASINDEX,2,1): two-axis expansion with itflags NPY_ITFLAG_HASINDEX and nop 1 (when 1 < NPY_MAXDIMS); steps axisdata 0, carries into axisdata 1 when index 0 is no longer below its shape, and returns 0 once axis 1 has reached its shape as well. */
+static int
+npyiter_iternext_itflagsIND_dims2_iters1(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (2 > 1) /* always true because 2 > 1 */
+    const npy_uint32 itflags = NPY_ITFLAG_HASINDEX;
+#  if 2 >= NPY_MAXDIMS /* NOTE(review): if false, ndim is undeclared although it is passed to NIT_AXISDATA_SIZEOF below; that compiles only if the macro discards the argument - confirm */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 1 < NPY_MAXDIMS /* nop is the constant 1 unless NPY_MAXDIMS <= 1 */
+    const int nop = 1;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES(); /* NOTE(review): argument-less macro; presumably derives the slot count from itflags/nop in scope - confirm */
+#endif
+#if 2 > 1 /* true: the second axis is declared */
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 2 > 2 /* false: axisdata2 is not declared */
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_HASINDEX&NPY_ITFLAG_RANGE) /* NOTE(review): presumably 0 (distinct flag bits), which compiles this check out - confirm */
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 2 > 1 /* true */
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); /* NOTE(review): never read by name below; presumably the NIT_INDEX_AXISDATA macro expands to use it - confirm */
+#endif
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) /* NOTE(review): true only if the flags share no bit - presumably distinct; confirm */
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0: each of the nstrides pointer slots moves by its own stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 2 == 1 /* false: the #else branch is the one compiled */
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1; /* index 0 is still inside its axis: no carry needed */
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd, so axis 0 restarts from axis 1's new position */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 2 == 2 /* true: the two-axis expansion ends here */
+    return 0; /* axis 1 has reached its shape as well */
+# else /* not compiled in this expansion */
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_HASINDEX,2,2): two-axis expansion with itflags NPY_ITFLAG_HASINDEX and nop 2 (when 2 < NPY_MAXDIMS); steps axisdata 0, carries into axisdata 1 when index 0 is no longer below its shape, and returns 0 once axis 1 has reached its shape as well. */
+static int
+npyiter_iternext_itflagsIND_dims2_iters2(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (2 > 1) /* always true because 2 > 1 */
+    const npy_uint32 itflags = NPY_ITFLAG_HASINDEX;
+#  if 2 >= NPY_MAXDIMS /* NOTE(review): if false, ndim is undeclared although it is passed to NIT_AXISDATA_SIZEOF below; that compiles only if the macro discards the argument - confirm */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 2 < NPY_MAXDIMS /* nop is the constant 2 unless NPY_MAXDIMS <= 2 */
+    const int nop = 2;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES(); /* NOTE(review): argument-less macro; presumably derives the slot count from itflags/nop in scope - confirm */
+#endif
+#if 2 > 1 /* true: the second axis is declared */
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 2 > 2 /* false: axisdata2 is not declared */
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_HASINDEX&NPY_ITFLAG_RANGE) /* NOTE(review): presumably 0 (distinct flag bits), which compiles this check out - confirm */
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 2 > 1 /* true */
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); /* NOTE(review): never read by name below; presumably the NIT_INDEX_AXISDATA macro expands to use it - confirm */
+#endif
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) /* NOTE(review): true only if the flags share no bit - presumably distinct; confirm */
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0: each of the nstrides pointer slots moves by its own stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 2 == 1 /* false: the #else branch is the one compiled */
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1; /* index 0 is still inside its axis: no carry needed */
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd, so axis 0 restarts from axis 1's new position */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 2 == 2 /* true: the two-axis expansion ends here */
+    return 0; /* axis 1 has reached its shape as well */
+# else /* not compiled in this expansion */
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_HASINDEX,2,ANY): two-axis expansion with itflags NPY_ITFLAG_HASINDEX and nop read from the iterator; steps axisdata 0, carries into axisdata 1 when index 0 is no longer below its shape, and returns 0 once axis 1 has reached its shape as well. */
+static int
+npyiter_iternext_itflagsIND_dims2_itersANY(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (2 > 1) /* always true because 2 > 1 */
+    const npy_uint32 itflags = NPY_ITFLAG_HASINDEX;
+#  if 2 >= NPY_MAXDIMS /* NOTE(review): if false, ndim is undeclared although it is passed to NIT_AXISDATA_SIZEOF below; that compiles only if the macro discards the argument - confirm */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if NPY_MAXDIMS < NPY_MAXDIMS /* never true: nop always comes from NIT_NOP(iter) */
+    const int nop = NPY_MAXDIMS;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES(); /* NOTE(review): argument-less macro; presumably derives the slot count from itflags/nop in scope - confirm */
+#endif
+#if 2 > 1 /* true: the second axis is declared */
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 2 > 2 /* false: axisdata2 is not declared */
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_HASINDEX&NPY_ITFLAG_RANGE) /* NOTE(review): presumably 0 (distinct flag bits), which compiles this check out - confirm */
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 2 > 1 /* true */
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop); /* NOTE(review): never read by name below; presumably the NIT_INDEX_AXISDATA macro expands to use it - confirm */
+#endif
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) /* NOTE(review): true only if the flags share no bit - presumably distinct; confirm */
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0: each of the nstrides pointer slots moves by its own stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 2 == 1 /* false: the #else branch is the one compiled */
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1; /* index 0 is still inside its axis: no carry needed */
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd, so axis 0 restarts from axis 1's new position */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 2 == 2 /* true: the two-axis expansion ends here */
+    return 0; /* axis 1 has reached its shape as well */
+# else /* not compiled in this expansion */
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+
+#line 29
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_HASINDEX,ANY,1)
+ *
+ * Moves iter forward by one position and reports whether that was
+ * possible.  The triple above names the itflags, ndim and nop slots of
+ * this specialization: itflags is NPY_ITFLAG_HASINDEX, ndim is read at run
+ * time through NIT_NDIM(iter), and nop is the constant 1 as long as
+ * 1 < NPY_MAXDIMS (otherwise it is read through NIT_NOP(iter)).
+ *
+ * The step behaves like an odometer.  Axis 0 gets its index incremented
+ * and its pointers moved by the axis strides; if the index is still below
+ * the axis shape the function is done.  Otherwise axis 1, axis 2 and then
+ * axes 3..ndim-1 are tried in turn, and as soon as one of them stays
+ * below its shape every lower axis is rewound: index set to 0, pointers
+ * copied from the axis that advanced.
+ *
+ * Returns 1 if some axis could be stepped within its shape, 0 otherwise.
+ *
+ * NOTE(review): which preprocessor branches survive depends on values not
+ * visible here.  The axis-0 step needs NPY_ITFLAG_HASINDEX and
+ * NPY_ITFLAG_EXLOOP to share no bits, the ranged check needs
+ * NPY_ITFLAG_HASINDEX and NPY_ITFLAG_RANGE to share a bit, and the
+ * multi-axis code needs NPY_MAXDIMS > 2.  Presumably the flags are
+ * distinct bits and NPY_MAXDIMS is large -- confirm against their
+ * definitions.
+ *
+ * NOTE(review): axisdata1 and axisdata2 are touched before ndim is ever
+ * consulted, so this variant presumably must only be selected for
+ * iterators with at least three axes -- confirm at the dispatch site.
+ *
+ * NOTE(review): itflags, nop and sizeof_axisdata are not read by any
+ * statement spelled out below other than the NIT_AXISDATA_SIZEOF line;
+ * presumably NAD_NSTRIDES() and the NIT_INDEX_AXISDATA /
+ * NIT_ADVANCE_AXISDATA macros read them from this scope, so do not rename
+ * or drop them without checking the macros.
+ */
+static int
+npyiter_iternext_itflagsIND_dimsANY_iters1(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_HASINDEX;
+#  if NPY_MAXDIMS >= NPY_MAXDIMS /* always true: ndim is read at run time */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 1 < NPY_MAXDIMS
+    const int nop = 1;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if NPY_MAXDIMS > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if NPY_MAXDIMS > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_HASINDEX&NPY_ITFLAG_RANGE)
+    /*
+     * When ranged iteration is enabled, use the iterindex: it is
+     * incremented first and the function returns 0 once it reaches
+     * NIT_ITEREND(iter)
+     */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if NPY_MAXDIMS > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /*
+     * Increment pointer 0: each of the nstrides pointers moves by its own
+     * stride for this axis
+     */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if NPY_MAXDIMS == 1
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if NPY_MAXDIMS == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /*
+         * Increment the index (axisdata2 is reused as the cursor for the
+         * remaining axes and is advanced by 1 on every pass)
+         */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /*
+             * Reset the indices and pointers of all previous axisdatas:
+             * axisdata1 steps backwards from the axis that just advanced
+             * and the loop ends after axisdata0 has been processed
+             */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_HASINDEX,ANY,2)
+ *
+ * Moves iter forward by one position and reports whether that was
+ * possible.  The triple above names the itflags, ndim and nop slots of
+ * this specialization: itflags is NPY_ITFLAG_HASINDEX, ndim is read at run
+ * time through NIT_NDIM(iter), and nop is the constant 2 as long as
+ * 2 < NPY_MAXDIMS (otherwise it is read through NIT_NOP(iter)).
+ *
+ * The step behaves like an odometer.  Axis 0 gets its index incremented
+ * and its pointers moved by the axis strides; if the index is still below
+ * the axis shape the function is done.  Otherwise axis 1, axis 2 and then
+ * axes 3..ndim-1 are tried in turn, and as soon as one of them stays
+ * below its shape every lower axis is rewound: index set to 0, pointers
+ * copied from the axis that advanced.
+ *
+ * Returns 1 if some axis could be stepped within its shape, 0 otherwise.
+ *
+ * NOTE(review): which preprocessor branches survive depends on values not
+ * visible here.  The axis-0 step needs NPY_ITFLAG_HASINDEX and
+ * NPY_ITFLAG_EXLOOP to share no bits, the ranged check needs
+ * NPY_ITFLAG_HASINDEX and NPY_ITFLAG_RANGE to share a bit, and the
+ * multi-axis code needs NPY_MAXDIMS > 2.  Presumably the flags are
+ * distinct bits and NPY_MAXDIMS is large -- confirm against their
+ * definitions.
+ *
+ * NOTE(review): axisdata1 and axisdata2 are touched before ndim is ever
+ * consulted, so this variant presumably must only be selected for
+ * iterators with at least three axes -- confirm at the dispatch site.
+ *
+ * NOTE(review): itflags, nop and sizeof_axisdata are not read by any
+ * statement spelled out below other than the NIT_AXISDATA_SIZEOF line;
+ * presumably NAD_NSTRIDES() and the NIT_INDEX_AXISDATA /
+ * NIT_ADVANCE_AXISDATA macros read them from this scope, so do not rename
+ * or drop them without checking the macros.
+ */
+static int
+npyiter_iternext_itflagsIND_dimsANY_iters2(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_HASINDEX;
+#  if NPY_MAXDIMS >= NPY_MAXDIMS /* always true: ndim is read at run time */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 2 < NPY_MAXDIMS
+    const int nop = 2;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if NPY_MAXDIMS > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if NPY_MAXDIMS > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_HASINDEX&NPY_ITFLAG_RANGE)
+    /*
+     * When ranged iteration is enabled, use the iterindex: it is
+     * incremented first and the function returns 0 once it reaches
+     * NIT_ITEREND(iter)
+     */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if NPY_MAXDIMS > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /*
+     * Increment pointer 0: each of the nstrides pointers moves by its own
+     * stride for this axis
+     */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if NPY_MAXDIMS == 1
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if NPY_MAXDIMS == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /*
+         * Increment the index (axisdata2 is reused as the cursor for the
+         * remaining axes and is advanced by 1 on every pass)
+         */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /*
+             * Reset the indices and pointers of all previous axisdatas:
+             * axisdata1 steps backwards from the axis that just advanced
+             * and the loop ends after axisdata0 has been processed
+             */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_HASINDEX,ANY,ANY)
+ *
+ * Moves iter forward by one position and reports whether that was
+ * possible.  The triple above names the itflags, ndim and nop slots of
+ * this specialization: itflags is NPY_ITFLAG_HASINDEX, while both ndim and
+ * nop are read at run time, through NIT_NDIM(iter) and NIT_NOP(iter).
+ *
+ * The step behaves like an odometer.  Axis 0 gets its index incremented
+ * and its pointers moved by the axis strides; if the index is still below
+ * the axis shape the function is done.  Otherwise axis 1, axis 2 and then
+ * axes 3..ndim-1 are tried in turn, and as soon as one of them stays
+ * below its shape every lower axis is rewound: index set to 0, pointers
+ * copied from the axis that advanced.
+ *
+ * Returns 1 if some axis could be stepped within its shape, 0 otherwise.
+ *
+ * NOTE(review): which preprocessor branches survive depends on values not
+ * visible here.  The axis-0 step needs NPY_ITFLAG_HASINDEX and
+ * NPY_ITFLAG_EXLOOP to share no bits, the ranged check needs
+ * NPY_ITFLAG_HASINDEX and NPY_ITFLAG_RANGE to share a bit, and the
+ * multi-axis code needs NPY_MAXDIMS > 2.  Presumably the flags are
+ * distinct bits and NPY_MAXDIMS is large -- confirm against their
+ * definitions.
+ *
+ * NOTE(review): axisdata1 and axisdata2 are touched before ndim is ever
+ * consulted, so this variant presumably must only be selected for
+ * iterators with at least three axes -- confirm at the dispatch site.
+ *
+ * NOTE(review): itflags, nop and sizeof_axisdata are not read by any
+ * statement spelled out below other than the NIT_AXISDATA_SIZEOF line;
+ * presumably NAD_NSTRIDES() and the NIT_INDEX_AXISDATA /
+ * NIT_ADVANCE_AXISDATA macros read them from this scope, so do not rename
+ * or drop them without checking the macros.
+ */
+static int
+npyiter_iternext_itflagsIND_dimsANY_itersANY(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_HASINDEX;
+#  if NPY_MAXDIMS >= NPY_MAXDIMS /* always true: ndim is read at run time */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if NPY_MAXDIMS < NPY_MAXDIMS /* never true: nop is read at run time */
+    const int nop = NPY_MAXDIMS;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if NPY_MAXDIMS > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if NPY_MAXDIMS > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_HASINDEX&NPY_ITFLAG_RANGE)
+    /*
+     * When ranged iteration is enabled, use the iterindex: it is
+     * incremented first and the function returns 0 once it reaches
+     * NIT_ITEREND(iter)
+     */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if NPY_MAXDIMS > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /*
+     * Increment pointer 0: each of the nstrides pointers moves by its own
+     * stride for this axis
+     */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if NPY_MAXDIMS == 1
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !(NPY_ITFLAG_HASINDEX&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if NPY_MAXDIMS == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /*
+         * Increment the index (axisdata2 is reused as the cursor for the
+         * remaining axes and is advanced by 1 on every pass)
+         */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /*
+             * Reset the indices and pointers of all previous axisdatas:
+             * axisdata1 steps backwards from the axis that just advanced
+             * and the loop ends after axisdata0 has been processed
+             */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+
+
+#line 25
+#line 29
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_EXLOOP,1,1)
+ *
+ * Specialization for itflags NPY_ITFLAG_EXLOOP, ndim 1 and nop 1.
+ *
+ * After preprocessing almost nothing of the shared body is left.  The
+ * guards of the form !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (1 > 1)
+ * are false whenever NPY_ITFLAG_EXLOOP is nonzero (a value ANDed with
+ * itself is unchanged), which removes the local declarations, the
+ * axisdata0 lookup and the axis-0 step.  1 == 1 then selects the
+ * single-axis return, which in its EXLOOP form is a plain return 0.
+ * Everything in the #else part (axes 1 and up) is never compiled.  The
+ * nop slot only influences a declaration that is compiled out here.
+ *
+ * Returns 0 on every call.  The only possible side effect is the
+ * iterindex increment in the ranged check, which is compiled only if
+ * NPY_ITFLAG_EXLOOP&NPY_ITFLAG_RANGE is nonzero.
+ *
+ * NOTE(review): presumably NPY_ITFLAG_EXLOOP is a nonzero bit distinct
+ * from NPY_ITFLAG_RANGE, and presumably returning 0 is correct because the
+ * single axis is consumed by an inner loop outside this function --
+ * confirm against the flag definitions and the callers.
+ */
+static int
+npyiter_iternext_itflagsNOINN_dims1_iters1(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_EXLOOP;
+#  if 1 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 1 < NPY_MAXDIMS
+    const int nop = 1;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 1 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 1 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_EXLOOP&NPY_ITFLAG_RANGE)
+    /*
+     * When ranged iteration is enabled, use the iterindex: it is
+     * incremented first and the function returns 0 once it reaches
+     * NIT_ITEREND(iter)
+     */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 1 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 1 == 1 /* ndim == 1: this is the branch that gets compiled */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else /* ndim != 1: nothing below is compiled in this specialization */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 1 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_EXLOOP,1,2)
+ *
+ * Specialization for itflags NPY_ITFLAG_EXLOOP, ndim 1 and nop 2.
+ *
+ * After preprocessing almost nothing of the shared body is left.  The
+ * guards of the form !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (1 > 1)
+ * are false whenever NPY_ITFLAG_EXLOOP is nonzero (a value ANDed with
+ * itself is unchanged), which removes the local declarations, the
+ * axisdata0 lookup and the axis-0 step.  1 == 1 then selects the
+ * single-axis return, which in its EXLOOP form is a plain return 0.
+ * Everything in the #else part (axes 1 and up) is never compiled.  The
+ * nop slot only influences a declaration that is compiled out here.
+ *
+ * Returns 0 on every call.  The only possible side effect is the
+ * iterindex increment in the ranged check, which is compiled only if
+ * NPY_ITFLAG_EXLOOP&NPY_ITFLAG_RANGE is nonzero.
+ *
+ * NOTE(review): presumably NPY_ITFLAG_EXLOOP is a nonzero bit distinct
+ * from NPY_ITFLAG_RANGE, and presumably returning 0 is correct because the
+ * single axis is consumed by an inner loop outside this function --
+ * confirm against the flag definitions and the callers.
+ */
+static int
+npyiter_iternext_itflagsNOINN_dims1_iters2(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_EXLOOP;
+#  if 1 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 2 < NPY_MAXDIMS
+    const int nop = 2;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 1 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 1 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_EXLOOP&NPY_ITFLAG_RANGE)
+    /*
+     * When ranged iteration is enabled, use the iterindex: it is
+     * incremented first and the function returns 0 once it reaches
+     * NIT_ITEREND(iter)
+     */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 1 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 1 == 1 /* ndim == 1: this is the branch that gets compiled */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else /* ndim != 1: nothing below is compiled in this specialization */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 1 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_EXLOOP,1,ANY)
+ *
+ * Specialization for itflags NPY_ITFLAG_EXLOOP, ndim 1 and a nop that
+ * would be read at run time through NIT_NOP(iter).
+ *
+ * After preprocessing almost nothing of the shared body is left.  The
+ * guards of the form !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (1 > 1)
+ * are false whenever NPY_ITFLAG_EXLOOP is nonzero (a value ANDed with
+ * itself is unchanged), which removes the local declarations (including
+ * nop), the axisdata0 lookup and the axis-0 step.  1 == 1 then selects
+ * the single-axis return, which in its EXLOOP form is a plain return 0.
+ * Everything in the #else part (axes 1 and up) is never compiled.
+ *
+ * Returns 0 on every call.  The only possible side effect is the
+ * iterindex increment in the ranged check, which is compiled only if
+ * NPY_ITFLAG_EXLOOP&NPY_ITFLAG_RANGE is nonzero.
+ *
+ * NOTE(review): presumably NPY_ITFLAG_EXLOOP is a nonzero bit distinct
+ * from NPY_ITFLAG_RANGE, and presumably returning 0 is correct because the
+ * single axis is consumed by an inner loop outside this function --
+ * confirm against the flag definitions and the callers.
+ */
+static int
+npyiter_iternext_itflagsNOINN_dims1_itersANY(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_EXLOOP;
+#  if 1 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if NPY_MAXDIMS < NPY_MAXDIMS /* never true: nop is read at run time */
+    const int nop = NPY_MAXDIMS;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 1 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 1 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_EXLOOP&NPY_ITFLAG_RANGE)
+    /*
+     * When ranged iteration is enabled, use the iterindex: it is
+     * incremented first and the function returns 0 once it reaches
+     * NIT_ITEREND(iter)
+     */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 1 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 1 == 1 /* ndim == 1: this is the branch that gets compiled */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else /* ndim != 1: nothing below is compiled in this specialization */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 1 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+
+#line 29
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_EXLOOP,2,1)
+ *
+ * Specialization for itflags NPY_ITFLAG_EXLOOP, ndim 2 and nop 1 (nop is
+ * the constant 1 as long as 1 < NPY_MAXDIMS, otherwise it is read through
+ * NIT_NOP(iter)).
+ *
+ * Axis 0 is never stepped here: its increment and its bounds test sit
+ * behind !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP), which is false whenever
+ * NPY_ITFLAG_EXLOOP is nonzero.  Each call instead increments the index of
+ * axis 1 and moves its pointers by the axis-1 strides.  If the index is
+ * still below the axis-1 shape, axis 0 is rewound (index 0, pointers
+ * copied from axis 1) and 1 is returned.  Otherwise 0 is returned,
+ * because 2 == 2 ends the function before any third axis is looked at.
+ *
+ * NOTE(review): presumably NPY_ITFLAG_EXLOOP is a nonzero bit distinct
+ * from NPY_ITFLAG_RANGE (so the ranged check is compiled out), and
+ * presumably axis 0 is walked by an inner loop outside this function --
+ * confirm against the flag definitions and the callers.
+ *
+ * NOTE(review): ndim is declared only under 2 >= NPY_MAXDIMS but is passed
+ * to NIT_AXISDATA_SIZEOF whenever 2 > 1.  Unless NPY_MAXDIMS <= 2, this
+ * compiles only if that macro never expands its ndim argument -- confirm
+ * against the macro definition.
+ *
+ * NOTE(review): itflags, nop and sizeof_axisdata are not read by any
+ * statement spelled out below other than the NIT_AXISDATA_SIZEOF line;
+ * presumably NAD_NSTRIDES() and NIT_INDEX_AXISDATA read them from this
+ * scope, so do not rename or drop them without checking the macros.
+ */
+static int
+npyiter_iternext_itflagsNOINN_dims2_iters1(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_EXLOOP;
+#  if 2 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 1 < NPY_MAXDIMS
+    const int nop = 1;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 2 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 2 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_EXLOOP&NPY_ITFLAG_RANGE)
+    /*
+     * When ranged iteration is enabled, use the iterindex: it is
+     * incremented first and the function returns 0 once it reaches
+     * NIT_ITEREND(iter)
+     */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 2 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 2 == 1 /* false: ndim is 2 in this specialization */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 (the first axis this specialization steps) */
+    NAD_INDEX(axisdata1)++;
+    /*
+     * Increment pointer 1: each of the nstrides pointers moves by its own
+     * stride for this axis
+     */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 2 == 2 /* ndim == 2: there is no third axis to step */
+    return 0;
+# else /* ndim > 2: nothing below is compiled in this specialization */
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+/* Specialized iternext (itflags, ndim, nop) = (NPY_ITFLAG_EXLOOP,2,2) */
+/* Steps iter once: returns 1 at a new position, 0 when iteration is done */
+static int
+npyiter_iternext_itflagsNOINN_dims2_iters2(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_EXLOOP;
+#  if 2 >= NPY_MAXDIMS /* ndim is read from the iterator */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 2 < NPY_MAXDIMS /* nop is fixed at compile time */
+    const int nop = 2;
+#  else /* nop is read from the iterator */
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 2 > 1 /* needed from the second axis on */
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 2 > 2 /* needed from the third axis on */
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_EXLOOP&NPY_ITFLAG_RANGE)
+    /* Ranged iteration: bump iterindex and finish once it reaches iterend */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 2 > 1 /* NOTE(review): ndim is undeclared unless 2 >= NPY_MAXDIMS */
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif /* presumably the macro above does not expand ndim -- confirm */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) /* skipped under EXLOOP */
+    /* Advance the index of axis 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Move each of the nstrides pointers of axis 0 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 2 == 1 /* single-axis variant */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    /* Keep going while the index is still below the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else /* variants with two or more axes */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Carry into axis 1: advance its index */
+    NAD_INDEX(axisdata1)++;
+    /* Move each pointer of axis 1 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Axis 1 is still in range: restart axis 0 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Axis 0 resumes from the pointers now held by axis 1 */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 2 == 2 /* two-axis variant: iteration is finished */
+    return 0;
+# else /* variants with three or more axes */
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Carry into axis 2: advance its index */
+    NAD_INDEX(axisdata2)++;
+    /* Move each pointer of axis 2 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Axis 2 is still in range: restart axes 0 and 1 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Axes 0 and 1 resume from the pointers now held by axis 2 */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* axisdata2 presumably now refers to axis idim: advance its index */
+        NAD_INDEX(axisdata2)++;
+        /* Move each pointer of that axis by its stride */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Step axisdata1 back from this axis to axisdata0, resetting each */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Restart this lower axis at index 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* It resumes from the pointers of the axis that advanced */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+/* Specialized iternext (itflags, ndim, nop) = (NPY_ITFLAG_EXLOOP,2,ANY) */
+/* Steps iter once: returns 1 at a new position, 0 when iteration is done */
+static int
+npyiter_iternext_itflagsNOINN_dims2_itersANY(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_EXLOOP;
+#  if 2 >= NPY_MAXDIMS /* ndim is read from the iterator */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if NPY_MAXDIMS < NPY_MAXDIMS /* never true for the ANY-nop variant */
+    const int nop = NPY_MAXDIMS;
+#  else /* nop is read from the iterator */
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 2 > 1 /* needed from the second axis on */
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 2 > 2 /* needed from the third axis on */
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_EXLOOP&NPY_ITFLAG_RANGE)
+    /* Ranged iteration: bump iterindex and finish once it reaches iterend */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 2 > 1 /* NOTE(review): ndim is undeclared unless 2 >= NPY_MAXDIMS */
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif /* presumably the macro above does not expand ndim -- confirm */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) /* skipped under EXLOOP */
+    /* Advance the index of axis 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Move each of the nstrides pointers of axis 0 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 2 == 1 /* single-axis variant */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    /* Keep going while the index is still below the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else /* variants with two or more axes */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Carry into axis 1: advance its index */
+    NAD_INDEX(axisdata1)++;
+    /* Move each pointer of axis 1 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Axis 1 is still in range: restart axis 0 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Axis 0 resumes from the pointers now held by axis 1 */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 2 == 2 /* two-axis variant: iteration is finished */
+    return 0;
+# else /* variants with three or more axes */
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Carry into axis 2: advance its index */
+    NAD_INDEX(axisdata2)++;
+    /* Move each pointer of axis 2 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Axis 2 is still in range: restart axes 0 and 1 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Axes 0 and 1 resume from the pointers now held by axis 2 */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* axisdata2 presumably now refers to axis idim: advance its index */
+        NAD_INDEX(axisdata2)++;
+        /* Move each pointer of that axis by its stride */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Step axisdata1 back from this axis to axisdata0, resetting each */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Restart this lower axis at index 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* It resumes from the pointers of the axis that advanced */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+
+#line 29
+#line 33
+/* Specialized iternext (itflags, ndim, nop) = (NPY_ITFLAG_EXLOOP,ANY,1) */
+/* Steps iter once: returns 1 at a new position, 0 when iteration is done */
+static int
+npyiter_iternext_itflagsNOINN_dimsANY_iters1(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_EXLOOP;
+#  if NPY_MAXDIMS >= NPY_MAXDIMS /* always true: ndim read from iterator */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 1 < NPY_MAXDIMS /* nop is fixed at compile time */
+    const int nop = 1;
+#  else /* nop is read from the iterator */
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if NPY_MAXDIMS > 1 /* needed from the second axis on */
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if NPY_MAXDIMS > 2 /* needed from the third axis on */
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_EXLOOP&NPY_ITFLAG_RANGE)
+    /* Ranged iteration: bump iterindex and finish once it reaches iterend */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if NPY_MAXDIMS > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) /* skipped under EXLOOP */
+    /* Advance the index of axis 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Move each of the nstrides pointers of axis 0 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if NPY_MAXDIMS == 1 /* single-axis variant */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    /* Keep going while the index is still below the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else /* variants with two or more axes */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Carry into axis 1: advance its index */
+    NAD_INDEX(axisdata1)++;
+    /* Move each pointer of axis 1 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Axis 1 is still in range: restart axis 0 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Axis 0 resumes from the pointers now held by axis 1 */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if NPY_MAXDIMS == 2 /* two-axis variant: iteration is finished */
+    return 0;
+# else /* variants with three or more axes */
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Carry into axis 2: advance its index */
+    NAD_INDEX(axisdata2)++;
+    /* Move each pointer of axis 2 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Axis 2 is still in range: restart axes 0 and 1 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Axes 0 and 1 resume from the pointers now held by axis 2 */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* axisdata2 presumably now refers to axis idim: advance its index */
+        NAD_INDEX(axisdata2)++;
+        /* Move each pointer of that axis by its stride */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Step axisdata1 back from this axis to axisdata0, resetting each */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Restart this lower axis at index 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* It resumes from the pointers of the axis that advanced */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+/* Specialized iternext (itflags, ndim, nop) = (NPY_ITFLAG_EXLOOP,ANY,2) */
+/* Steps iter once: returns 1 at a new position, 0 when iteration is done */
+static int
+npyiter_iternext_itflagsNOINN_dimsANY_iters2(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_EXLOOP;
+#  if NPY_MAXDIMS >= NPY_MAXDIMS /* always true: ndim read from iterator */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 2 < NPY_MAXDIMS /* nop is fixed at compile time */
+    const int nop = 2;
+#  else /* nop is read from the iterator */
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if NPY_MAXDIMS > 1 /* needed from the second axis on */
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if NPY_MAXDIMS > 2 /* needed from the third axis on */
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_EXLOOP&NPY_ITFLAG_RANGE)
+    /* Ranged iteration: bump iterindex and finish once it reaches iterend */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if NPY_MAXDIMS > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) /* skipped under EXLOOP */
+    /* Advance the index of axis 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Move each of the nstrides pointers of axis 0 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if NPY_MAXDIMS == 1 /* single-axis variant */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    /* Keep going while the index is still below the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else /* variants with two or more axes */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Carry into axis 1: advance its index */
+    NAD_INDEX(axisdata1)++;
+    /* Move each pointer of axis 1 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Axis 1 is still in range: restart axis 0 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Axis 0 resumes from the pointers now held by axis 1 */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if NPY_MAXDIMS == 2 /* two-axis variant: iteration is finished */
+    return 0;
+# else /* variants with three or more axes */
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Carry into axis 2: advance its index */
+    NAD_INDEX(axisdata2)++;
+    /* Move each pointer of axis 2 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Axis 2 is still in range: restart axes 0 and 1 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Axes 0 and 1 resume from the pointers now held by axis 2 */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* axisdata2 presumably now refers to axis idim: advance its index */
+        NAD_INDEX(axisdata2)++;
+        /* Move each pointer of that axis by its stride */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Step axisdata1 back from this axis to axisdata0, resetting each */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Restart this lower axis at index 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* It resumes from the pointers of the axis that advanced */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+/* Specialized iternext (itflags, ndim, nop) = (NPY_ITFLAG_EXLOOP,ANY,ANY) */
+/* Steps iter once: returns 1 at a new position, 0 when iteration is done */
+static int
+npyiter_iternext_itflagsNOINN_dimsANY_itersANY(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_EXLOOP;
+#  if NPY_MAXDIMS >= NPY_MAXDIMS /* always true: ndim read from iterator */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if NPY_MAXDIMS < NPY_MAXDIMS /* never true for the ANY-nop variant */
+    const int nop = NPY_MAXDIMS;
+#  else /* nop is read from the iterator */
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if NPY_MAXDIMS > 1 /* needed from the second axis on */
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if NPY_MAXDIMS > 2 /* needed from the third axis on */
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_EXLOOP&NPY_ITFLAG_RANGE)
+    /* Ranged iteration: bump iterindex and finish once it reaches iterend */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if NPY_MAXDIMS > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP) /* skipped under EXLOOP */
+    /* Advance the index of axis 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Move each of the nstrides pointers of axis 0 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if NPY_MAXDIMS == 1 /* single-axis variant */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    /* Keep going while the index is still below the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else /* variants with two or more axes */
+
+#  if !(NPY_ITFLAG_EXLOOP&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Carry into axis 1: advance its index */
+    NAD_INDEX(axisdata1)++;
+    /* Move each pointer of axis 1 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Axis 1 is still in range: restart axis 0 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Axis 0 resumes from the pointers now held by axis 1 */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if NPY_MAXDIMS == 2 /* two-axis variant: iteration is finished */
+    return 0;
+# else /* variants with three or more axes */
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Carry into axis 2: advance its index */
+    NAD_INDEX(axisdata2)++;
+    /* Move each pointer of axis 2 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Axis 2 is still in range: restart axes 0 and 1 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Axes 0 and 1 resume from the pointers now held by axis 2 */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* axisdata2 presumably now refers to axis idim: advance its index */
+        NAD_INDEX(axisdata2)++;
+        /* Move each pointer of that axis by its stride */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Step axisdata1 back from this axis to axisdata0, resetting each */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Restart this lower axis at index 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* It resumes from the pointers of the axis that advanced */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+
+
+#line 25
+#line 29
+#line 33
+/* Specialized iternext (itflags, ndim, nop) = (NPY_ITFLAG_RANGE,1,1) */
+/* Steps iter once: returns 1 at a new position, 0 when iteration is done */
+static int
+npyiter_iternext_itflagsRNG_dims1_iters1(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE;
+#  if 1 >= NPY_MAXDIMS /* ndim is read from the iterator */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 1 < NPY_MAXDIMS /* nop is fixed at compile time */
+    const int nop = 1;
+#  else /* nop is read from the iterator */
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 1 > 1 /* needed from the second axis on */
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 1 > 2 /* needed from the third axis on */
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_RANGE&NPY_ITFLAG_RANGE)
+    /* Ranged iteration: bump iterindex and finish once it reaches iterend */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 1 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) /* skipped under EXLOOP */
+    /* Advance the index of axis 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Move each of the nstrides pointers of axis 0 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 1 == 1 /* single-axis variant */
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    /* Keep going while the index is still below the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else /* variants with two or more axes */
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Carry into axis 1: advance its index */
+    NAD_INDEX(axisdata1)++;
+    /* Move each pointer of axis 1 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Axis 1 is still in range: restart axis 0 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Axis 0 resumes from the pointers now held by axis 1 */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 1 == 2 /* two-axis variant: iteration is finished */
+    return 0;
+# else /* variants with three or more axes */
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Carry into axis 2: advance its index */
+    NAD_INDEX(axisdata2)++;
+    /* Move each pointer of axis 2 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Axis 2 is still in range: restart axes 0 and 1 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Axes 0 and 1 resume from the pointers now held by axis 2 */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* axisdata2 presumably now refers to axis idim: advance its index */
+        NAD_INDEX(axisdata2)++;
+        /* Move each pointer of that axis by its stride */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Step axisdata1 back from this axis to axisdata0, resetting each */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Restart this lower axis at index 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* It resumes from the pointers of the axis that advanced */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+/* Specialized iternext (itflags, ndim, nop) = (NPY_ITFLAG_RANGE,1,2) */
+/* Steps iter once: returns 1 at a new position, 0 when iteration is done */
+static int
+npyiter_iternext_itflagsRNG_dims1_iters2(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE;
+#  if 1 >= NPY_MAXDIMS /* ndim is read from the iterator */
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 2 < NPY_MAXDIMS /* nop is fixed at compile time */
+    const int nop = 2;
+#  else /* nop is read from the iterator */
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 1 > 1 /* needed from the second axis on */
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 1 > 2 /* needed from the third axis on */
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_RANGE&NPY_ITFLAG_RANGE)
+    /* Ranged iteration: bump iterindex and finish once it reaches iterend */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 1 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) /* skipped under EXLOOP */
+    /* Advance the index of axis 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Move each of the nstrides pointers of axis 0 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 1 == 1 /* single-axis variant */
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    /* Keep going while the index is still below the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else /* variants with two or more axes */
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Carry into axis 1: advance its index */
+    NAD_INDEX(axisdata1)++;
+    /* Move each pointer of axis 1 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Axis 1 is still in range: restart axis 0 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Axis 0 resumes from the pointers now held by axis 1 */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 1 == 2 /* two-axis variant: iteration is finished */
+    return 0;
+# else /* variants with three or more axes */
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Carry into axis 2: advance its index */
+    NAD_INDEX(axisdata2)++;
+    /* Move each pointer of axis 2 by its stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Axis 2 is still in range: restart axes 0 and 1 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Axes 0 and 1 resume from the pointers now held by axis 2 */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* axisdata2 presumably now refers to axis idim: advance its index */
+        NAD_INDEX(axisdata2)++;
+        /* Move each pointer of that axis by its stride */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Step axisdata1 back from this axis to axisdata0, resetting each */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Restart this lower axis at index 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* It resumes from the pointers of the axis that advanced */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/*
+ * Specialized iternext (NPY_ITFLAG_RANGE,1,ANY)
+ *
+ * Steps `iter` forward by one element for the template instance whose
+ * flags are NPY_ITFLAG_RANGE, whose dimension constant is 1, and whose
+ * operand count is read at run time with NIT_NOP(iter) (the guard
+ * `NPY_MAXDIMS < NPY_MAXDIMS` can never hold, so the constant-nop
+ * branch is never compiled).
+ *
+ * Returns 0 once the pre-incremented NIT_ITERINDEX(iter) reaches
+ * NIT_ITEREND(iter). Otherwise the index and every pointer of axis 0
+ * are advanced and the result says whether that index is still below
+ * the axis shape.
+ *
+ * Because the dimension constant is 1, `#if 1 == 1` selects the short
+ * return path and everything under the matching `#else` is compiled
+ * out. The axis 0 step itself is compiled only when NPY_ITFLAG_RANGE
+ * and NPY_ITFLAG_EXLOOP share no bits; both are defined outside this
+ * chunk (presumably distinct flag bits - TODO confirm).
+ *
+ * NOTE(review): the NAD_* and NIT_* macros are also defined elsewhere
+ * and presumably read the locals itflags, nop and sizeof_axisdata by
+ * name, so these locals should keep their names - confirm.
+ */
+static int
+npyiter_iternext_itflagsRNG_dims1_itersANY(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE;
+#  if 1 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if NPY_MAXDIMS < NPY_MAXDIMS
+    const int nop = NPY_MAXDIMS;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 1 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 1 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_RANGE&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex: it is
+     * incremented before the comparison, and reaching NIT_ITEREND(iter)
+     * ends the iteration */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 1 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0: each of the nstrides pointers moves by its
+     * own stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 1 == 1
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 1 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas:
+             * axisdata1 starts at axisdata2 and is moved with
+             * NIT_ADVANCE_AXISDATA(..., -1) until it equals axisdata0 */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+
+#line 29
+#line 33
+
+/*
+ * Specialized iternext (NPY_ITFLAG_RANGE,2,1)
+ *
+ * Steps `iter` forward by one element for the template instance whose
+ * flags are NPY_ITFLAG_RANGE, whose dimension constant is 2, and whose
+ * operand count is the constant 1 when 1 < NPY_MAXDIMS (otherwise it
+ * is read with NIT_NOP(iter)).
+ *
+ * Returns 0 once the pre-incremented NIT_ITERINDEX(iter) reaches
+ * NIT_ITEREND(iter). Otherwise axis 0 is advanced first; if its index
+ * is still below its shape the function returns 1. If not, axis 1 is
+ * advanced, and when it is still in range axis 0 is rewound (index 0,
+ * pointers copied from axis 1) before returning 1. With the dimension
+ * constant equal to 2, `# if 2 == 2` ends the function with 0 at that
+ * point and the axis 2 / idim-loop code below it is compiled out.
+ *
+ * The axis 0 step is compiled only when NPY_ITFLAG_RANGE and
+ * NPY_ITFLAG_EXLOOP share no bits; both are defined outside this chunk
+ * (presumably distinct flag bits - TODO confirm).
+ *
+ * NOTE(review): the NAD_* and NIT_* macros are also defined elsewhere
+ * and presumably read the locals itflags, nop and sizeof_axisdata by
+ * name, so these locals should keep their names - confirm.
+ */
+static int
+npyiter_iternext_itflagsRNG_dims2_iters1(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE;
+#  if 2 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 1 < NPY_MAXDIMS
+    const int nop = 1;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 2 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 2 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_RANGE&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex: it is
+     * incremented before the comparison, and reaching NIT_ITEREND(iter)
+     * ends the iteration */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 2 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0: each of the nstrides pointers moves by its
+     * own stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 2 == 1
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 2 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas:
+             * axisdata1 starts at axisdata2 and is moved with
+             * NIT_ADVANCE_AXISDATA(..., -1) until it equals axisdata0 */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/*
+ * Specialized iternext (NPY_ITFLAG_RANGE,2,2)
+ *
+ * Steps `iter` forward by one element for the template instance whose
+ * flags are NPY_ITFLAG_RANGE, whose dimension constant is 2, and whose
+ * operand count is the constant 2 when 2 < NPY_MAXDIMS (otherwise it
+ * is read with NIT_NOP(iter)).
+ *
+ * Returns 0 once the pre-incremented NIT_ITERINDEX(iter) reaches
+ * NIT_ITEREND(iter). Otherwise axis 0 is advanced first; if its index
+ * is still below its shape the function returns 1. If not, axis 1 is
+ * advanced, and when it is still in range axis 0 is rewound (index 0,
+ * pointers copied from axis 1) before returning 1. With the dimension
+ * constant equal to 2, `# if 2 == 2` ends the function with 0 at that
+ * point and the axis 2 / idim-loop code below it is compiled out.
+ *
+ * The axis 0 step is compiled only when NPY_ITFLAG_RANGE and
+ * NPY_ITFLAG_EXLOOP share no bits; both are defined outside this chunk
+ * (presumably distinct flag bits - TODO confirm).
+ *
+ * NOTE(review): the NAD_* and NIT_* macros are also defined elsewhere
+ * and presumably read the locals itflags, nop and sizeof_axisdata by
+ * name, so these locals should keep their names - confirm.
+ */
+static int
+npyiter_iternext_itflagsRNG_dims2_iters2(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE;
+#  if 2 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 2 < NPY_MAXDIMS
+    const int nop = 2;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 2 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 2 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_RANGE&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex: it is
+     * incremented before the comparison, and reaching NIT_ITEREND(iter)
+     * ends the iteration */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 2 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0: each of the nstrides pointers moves by its
+     * own stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 2 == 1
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 2 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas:
+             * axisdata1 starts at axisdata2 and is moved with
+             * NIT_ADVANCE_AXISDATA(..., -1) until it equals axisdata0 */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/*
+ * Specialized iternext (NPY_ITFLAG_RANGE,2,ANY)
+ *
+ * Steps `iter` forward by one element for the template instance whose
+ * flags are NPY_ITFLAG_RANGE, whose dimension constant is 2, and whose
+ * operand count is read at run time with NIT_NOP(iter) (the guard
+ * `NPY_MAXDIMS < NPY_MAXDIMS` can never hold, so the constant-nop
+ * branch is never compiled).
+ *
+ * Returns 0 once the pre-incremented NIT_ITERINDEX(iter) reaches
+ * NIT_ITEREND(iter). Otherwise axis 0 is advanced first; if its index
+ * is still below its shape the function returns 1. If not, axis 1 is
+ * advanced, and when it is still in range axis 0 is rewound (index 0,
+ * pointers copied from axis 1) before returning 1. With the dimension
+ * constant equal to 2, `# if 2 == 2` ends the function with 0 at that
+ * point and the axis 2 / idim-loop code below it is compiled out.
+ *
+ * The axis 0 step is compiled only when NPY_ITFLAG_RANGE and
+ * NPY_ITFLAG_EXLOOP share no bits; both are defined outside this chunk
+ * (presumably distinct flag bits - TODO confirm).
+ *
+ * NOTE(review): the NAD_* and NIT_* macros are also defined elsewhere
+ * and presumably read the locals itflags, nop and sizeof_axisdata by
+ * name, so these locals should keep their names - confirm.
+ */
+static int
+npyiter_iternext_itflagsRNG_dims2_itersANY(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE;
+#  if 2 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if NPY_MAXDIMS < NPY_MAXDIMS
+    const int nop = NPY_MAXDIMS;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 2 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 2 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_RANGE&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex: it is
+     * incremented before the comparison, and reaching NIT_ITEREND(iter)
+     * ends the iteration */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 2 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0: each of the nstrides pointers moves by its
+     * own stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 2 == 1
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 2 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas:
+             * axisdata1 starts at axisdata2 and is moved with
+             * NIT_ADVANCE_AXISDATA(..., -1) until it equals axisdata0 */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+
+#line 29
+#line 33
+
+/*
+ * Specialized iternext (NPY_ITFLAG_RANGE,ANY,1)
+ *
+ * Steps `iter` forward by one element for the template instance whose
+ * flags are NPY_ITFLAG_RANGE and whose dimension constant is
+ * NPY_MAXDIMS, so ndim is read at run time with NIT_NDIM(iter)
+ * (`NPY_MAXDIMS >= NPY_MAXDIMS` always holds). The operand count is
+ * the constant 1 when 1 < NPY_MAXDIMS, otherwise NIT_NOP(iter).
+ *
+ * Returns 0 once the pre-incremented NIT_ITERINDEX(iter) reaches
+ * NIT_ITEREND(iter). Otherwise the axes are advanced in order: axis 0
+ * first; when an axis index is no longer below its shape the next axis
+ * is advanced (axes 1 and 2 explicitly, then a loop over idim from 3
+ * up to ndim). As soon as an axis is still in range, all lower axes
+ * get index 0 and a copy of its pointers, and the function returns 1.
+ * If no visited axis is in range the function returns 0.
+ *
+ * Which of the `NPY_MAXDIMS == 1` / `NPY_MAXDIMS == 2` branches is
+ * compiled depends on NPY_MAXDIMS, which is defined outside this
+ * chunk. The axis 0 step is compiled only when NPY_ITFLAG_RANGE and
+ * NPY_ITFLAG_EXLOOP share no bits (presumably distinct flag bits -
+ * TODO confirm).
+ *
+ * NOTE(review): the NAD_* and NIT_* macros are also defined elsewhere
+ * and presumably read the locals itflags, nop and sizeof_axisdata by
+ * name, so these locals should keep their names - confirm.
+ */
+static int
+npyiter_iternext_itflagsRNG_dimsANY_iters1(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE;
+#  if NPY_MAXDIMS >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 1 < NPY_MAXDIMS
+    const int nop = 1;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if NPY_MAXDIMS > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if NPY_MAXDIMS > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_RANGE&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex: it is
+     * incremented before the comparison, and reaching NIT_ITEREND(iter)
+     * ends the iteration */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if NPY_MAXDIMS > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0: each of the nstrides pointers moves by its
+     * own stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if NPY_MAXDIMS == 1
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if NPY_MAXDIMS == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas:
+             * axisdata1 starts at axisdata2 and is moved with
+             * NIT_ADVANCE_AXISDATA(..., -1) until it equals axisdata0 */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/*
+ * Specialized iternext (NPY_ITFLAG_RANGE,ANY,2)
+ *
+ * Steps `iter` forward by one element for the template instance whose
+ * flags are NPY_ITFLAG_RANGE and whose dimension constant is
+ * NPY_MAXDIMS, so ndim is read at run time with NIT_NDIM(iter)
+ * (`NPY_MAXDIMS >= NPY_MAXDIMS` always holds). The operand count is
+ * the constant 2 when 2 < NPY_MAXDIMS, otherwise NIT_NOP(iter).
+ *
+ * Returns 0 once the pre-incremented NIT_ITERINDEX(iter) reaches
+ * NIT_ITEREND(iter). Otherwise the axes are advanced in order: axis 0
+ * first; when an axis index is no longer below its shape the next axis
+ * is advanced (axes 1 and 2 explicitly, then a loop over idim from 3
+ * up to ndim). As soon as an axis is still in range, all lower axes
+ * get index 0 and a copy of its pointers, and the function returns 1.
+ * If no visited axis is in range the function returns 0.
+ *
+ * Which of the `NPY_MAXDIMS == 1` / `NPY_MAXDIMS == 2` branches is
+ * compiled depends on NPY_MAXDIMS, which is defined outside this
+ * chunk. The axis 0 step is compiled only when NPY_ITFLAG_RANGE and
+ * NPY_ITFLAG_EXLOOP share no bits (presumably distinct flag bits -
+ * TODO confirm).
+ *
+ * NOTE(review): the NAD_* and NIT_* macros are also defined elsewhere
+ * and presumably read the locals itflags, nop and sizeof_axisdata by
+ * name, so these locals should keep their names - confirm.
+ */
+static int
+npyiter_iternext_itflagsRNG_dimsANY_iters2(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE;
+#  if NPY_MAXDIMS >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 2 < NPY_MAXDIMS
+    const int nop = 2;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if NPY_MAXDIMS > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if NPY_MAXDIMS > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_RANGE&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex: it is
+     * incremented before the comparison, and reaching NIT_ITEREND(iter)
+     * ends the iteration */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if NPY_MAXDIMS > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0: each of the nstrides pointers moves by its
+     * own stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if NPY_MAXDIMS == 1
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if NPY_MAXDIMS == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas:
+             * axisdata1 starts at axisdata2 and is moved with
+             * NIT_ADVANCE_AXISDATA(..., -1) until it equals axisdata0 */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/*
+ * Specialized iternext (NPY_ITFLAG_RANGE,ANY,ANY)
+ *
+ * Steps `iter` forward by one element for the template instance whose
+ * flags are NPY_ITFLAG_RANGE and whose dimension and operand constants
+ * are both NPY_MAXDIMS. ndim is read at run time with NIT_NDIM(iter)
+ * (`NPY_MAXDIMS >= NPY_MAXDIMS` always holds) and nop with
+ * NIT_NOP(iter) (`NPY_MAXDIMS < NPY_MAXDIMS` can never hold).
+ *
+ * Returns 0 once the pre-incremented NIT_ITERINDEX(iter) reaches
+ * NIT_ITEREND(iter). Otherwise the axes are advanced in order: axis 0
+ * first; when an axis index is no longer below its shape the next axis
+ * is advanced (axes 1 and 2 explicitly, then a loop over idim from 3
+ * up to ndim). As soon as an axis is still in range, all lower axes
+ * get index 0 and a copy of its pointers, and the function returns 1.
+ * If no visited axis is in range the function returns 0.
+ *
+ * Which of the `NPY_MAXDIMS == 1` / `NPY_MAXDIMS == 2` branches is
+ * compiled depends on NPY_MAXDIMS, which is defined outside this
+ * chunk. The axis 0 step is compiled only when NPY_ITFLAG_RANGE and
+ * NPY_ITFLAG_EXLOOP share no bits (presumably distinct flag bits -
+ * TODO confirm).
+ *
+ * NOTE(review): the NAD_* and NIT_* macros are also defined elsewhere
+ * and presumably read the locals itflags, nop and sizeof_axisdata by
+ * name, so these locals should keep their names - confirm.
+ */
+static int
+npyiter_iternext_itflagsRNG_dimsANY_itersANY(
+                                                      NpyIter *iter)
+{
+#if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE;
+#  if NPY_MAXDIMS >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if NPY_MAXDIMS < NPY_MAXDIMS
+    const int nop = NPY_MAXDIMS;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if NPY_MAXDIMS > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if NPY_MAXDIMS > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if (NPY_ITFLAG_RANGE&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex: it is
+     * incremented before the comparison, and reaching NIT_ITEREND(iter)
+     * ends the iteration */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if NPY_MAXDIMS > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0: each of the nstrides pointers moves by its
+     * own stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if NPY_MAXDIMS == 1
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !(NPY_ITFLAG_RANGE&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if NPY_MAXDIMS == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas:
+             * axisdata1 starts at axisdata2 and is moved with
+             * NIT_ADVANCE_AXISDATA(..., -1) until it equals axisdata0 */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+
+
+#line 25
+#line 29
+#line 33
+
+/*
+ * Specialized iternext (NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX,1,1)
+ *
+ * Steps `iter` forward by one element for the template instance whose
+ * flags are NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX, whose dimension
+ * constant is 1, and whose operand count is the constant 1 when
+ * 1 < NPY_MAXDIMS (otherwise it is read with NIT_NOP(iter)).
+ *
+ * Returns 0 once the pre-incremented NIT_ITERINDEX(iter) reaches
+ * NIT_ITEREND(iter). Otherwise the index and every pointer of axis 0
+ * are advanced and the result says whether that index is still below
+ * the axis shape.
+ *
+ * The flag expression is parenthesized in every preprocessor test.
+ * `&` binds tighter than `|`, so the unparenthesized form
+ * `RANGE|HASINDEX&EXLOOP` is evaluated as `RANGE|(HASINDEX&EXLOOP)`,
+ * which is non-zero whenever RANGE is. That made every
+ * `!(flags&NPY_ITFLAG_EXLOOP)` guard false, compiled out the axis 0
+ * step and left this function returning 0 unconditionally.
+ *
+ * Because the dimension constant is 1, `#if 1 == 1` selects the short
+ * return path and everything under the matching `#else` is compiled
+ * out.
+ *
+ * NOTE(review): the NAD_* and NIT_* macros are defined elsewhere and
+ * presumably read the locals itflags, nop and sizeof_axisdata by name,
+ * so these locals should keep their names - confirm.
+ */
+static int
+npyiter_iternext_itflagsRNGuIND_dims1_iters1(
+                                                      NpyIter *iter)
+{
+#if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX;
+#  if 1 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 1 < NPY_MAXDIMS
+    const int nop = 1;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 1 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 1 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if ((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex: it is
+     * incremented before the comparison, and reaching NIT_ITEREND(iter)
+     * ends the iteration */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 1 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0: each of the nstrides pointers moves by its
+     * own stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 1 == 1
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 1 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas:
+             * axisdata1 starts at axisdata2 and is moved with
+             * NIT_ADVANCE_AXISDATA(..., -1) until it equals axisdata0 */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX,1,2): advance one step; 1 = more, 0 = done */
+static int
+npyiter_iternext_itflagsRNGuIND_dims1_iters2(
+                                                      NpyIter *iter)
+{
+#if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (1 > 1) /* () needed: & binds tighter than | */
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX;
+#  if 1 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 2 < NPY_MAXDIMS
+    const int nop = 2;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 1 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 1 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if ((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_RANGE)
+    /* Ranged iteration: advance iterindex first; stop once it reaches iterend */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 1 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* EXLOOP is not in this flag set, so axis 0 is advanced here: its index ... */
+    NAD_INDEX(axisdata0)++;
+    /* ... and each of its nstrides pointers, by the matching stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 1 == 1
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* Single axis: keep going while index 0 is still below shape 0 */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Carry into axis 1: increment its index ... */
+    NAD_INDEX(axisdata1)++;
+    /* ... and advance its pointers by the axis 1 strides */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Axis 1 is still in range: restart axis 0 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Axis 0 pointers restart from the just-advanced axis 1 pointers */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 1 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Carry into axis 2: increment its index ... */
+    NAD_INDEX(axisdata2)++;
+    /* ... and advance its pointers by the axis 2 strides */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Axis 2 is still in range: restart axes 0 and 1 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Axes 0 and 1 pointers restart from the axis 2 pointers */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Carry into the next axis: increment its index ... */
+        NAD_INDEX(axisdata2)++;
+        /* ... and advance its pointers by its strides */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Walk back from this axis down to axis 0, restarting each one */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Restart the index at 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Restart the pointers from the axis that just advanced */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0; /* every axis is exhausted */
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX,1,ANY): advance one step; 1 = more, 0 = done */
+static int
+npyiter_iternext_itflagsRNGuIND_dims1_itersANY(
+                                                      NpyIter *iter)
+{
+#if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (1 > 1) /* () needed: & binds tighter than | */
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX;
+#  if 1 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if NPY_MAXDIMS < NPY_MAXDIMS
+    const int nop = NPY_MAXDIMS;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 1 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 1 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if ((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_RANGE)
+    /* Ranged iteration: advance iterindex first; stop once it reaches iterend */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 1 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (1 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* EXLOOP is not in this flag set, so axis 0 is advanced here: its index ... */
+    NAD_INDEX(axisdata0)++;
+    /* ... and each of its nstrides pointers, by the matching stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 1 == 1
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* Single axis: keep going while index 0 is still below shape 0 */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Carry into axis 1: increment its index ... */
+    NAD_INDEX(axisdata1)++;
+    /* ... and advance its pointers by the axis 1 strides */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Axis 1 is still in range: restart axis 0 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Axis 0 pointers restart from the just-advanced axis 1 pointers */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 1 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Carry into axis 2: increment its index ... */
+    NAD_INDEX(axisdata2)++;
+    /* ... and advance its pointers by the axis 2 strides */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Axis 2 is still in range: restart axes 0 and 1 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Axes 0 and 1 pointers restart from the axis 2 pointers */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Carry into the next axis: increment its index ... */
+        NAD_INDEX(axisdata2)++;
+        /* ... and advance its pointers by its strides */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Walk back from this axis down to axis 0, restarting each one */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Restart the index at 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Restart the pointers from the axis that just advanced */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0; /* every axis is exhausted */
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+
+#line 29
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX,2,1): advance one step; 1 = more, 0 = done */
+static int
+npyiter_iternext_itflagsRNGuIND_dims2_iters1(
+                                                      NpyIter *iter)
+{
+#if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (2 > 1) /* () needed: & binds tighter than | */
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX;
+#  if 2 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 1 < NPY_MAXDIMS
+    const int nop = 1;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 2 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 2 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if ((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_RANGE)
+    /* Ranged iteration: advance iterindex first; stop once it reaches iterend */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 2 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* EXLOOP is not in this flag set, so axis 0 is advanced here: its index ... */
+    NAD_INDEX(axisdata0)++;
+    /* ... and each of its nstrides pointers, by the matching stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 2 == 1
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* Single axis: keep going while index 0 is still below shape 0 */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Carry into axis 1: increment its index ... */
+    NAD_INDEX(axisdata1)++;
+    /* ... and advance its pointers by the axis 1 strides */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Axis 1 is still in range: restart axis 0 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Axis 0 pointers restart from the just-advanced axis 1 pointers */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 2 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Carry into axis 2: increment its index ... */
+    NAD_INDEX(axisdata2)++;
+    /* ... and advance its pointers by the axis 2 strides */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Axis 2 is still in range: restart axes 0 and 1 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Axes 0 and 1 pointers restart from the axis 2 pointers */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Carry into the next axis: increment its index ... */
+        NAD_INDEX(axisdata2)++;
+        /* ... and advance its pointers by its strides */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Walk back from this axis down to axis 0, restarting each one */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Restart the index at 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Restart the pointers from the axis that just advanced */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0; /* every axis is exhausted */
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX,2,2): advance one step; 1 = more, 0 = done */
+static int
+npyiter_iternext_itflagsRNGuIND_dims2_iters2(
+                                                      NpyIter *iter)
+{
+#if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (2 > 1) /* () needed: & binds tighter than | */
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX;
+#  if 2 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 2 < NPY_MAXDIMS
+    const int nop = 2;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 2 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 2 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if ((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_RANGE)
+    /* Ranged iteration: advance iterindex first; stop once it reaches iterend */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 2 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* EXLOOP is not in this flag set, so axis 0 is advanced here: its index ... */
+    NAD_INDEX(axisdata0)++;
+    /* ... and each of its nstrides pointers, by the matching stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 2 == 1
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* Single axis: keep going while index 0 is still below shape 0 */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Carry into axis 1: increment its index ... */
+    NAD_INDEX(axisdata1)++;
+    /* ... and advance its pointers by the axis 1 strides */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Axis 1 is still in range: restart axis 0 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Axis 0 pointers restart from the just-advanced axis 1 pointers */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 2 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Carry into axis 2: increment its index ... */
+    NAD_INDEX(axisdata2)++;
+    /* ... and advance its pointers by the axis 2 strides */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Axis 2 is still in range: restart axes 0 and 1 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Axes 0 and 1 pointers restart from the axis 2 pointers */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Carry into the next axis: increment its index ... */
+        NAD_INDEX(axisdata2)++;
+        /* ... and advance its pointers by its strides */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Walk back from this axis down to axis 0, restarting each one */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Restart the index at 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Restart the pointers from the axis that just advanced */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0; /* every axis is exhausted */
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX,2,ANY): advance one step; 1 = more, 0 = done */
+static int
+npyiter_iternext_itflagsRNGuIND_dims2_itersANY(
+                                                      NpyIter *iter)
+{
+#if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (2 > 1) /* () needed: & binds tighter than | */
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX;
+#  if 2 >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if NPY_MAXDIMS < NPY_MAXDIMS
+    const int nop = NPY_MAXDIMS;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if 2 > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if 2 > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if ((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_RANGE)
+    /* Ranged iteration: advance iterindex first; stop once it reaches iterend */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if 2 > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (2 > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* EXLOOP is not in this flag set, so axis 0 is advanced here: its index ... */
+    NAD_INDEX(axisdata0)++;
+    /* ... and each of its nstrides pointers, by the matching stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if 2 == 1
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* Single axis: keep going while index 0 is still below shape 0 */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Carry into axis 1: increment its index ... */
+    NAD_INDEX(axisdata1)++;
+    /* ... and advance its pointers by the axis 1 strides */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Axis 1 is still in range: restart axis 0 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Axis 0 pointers restart from the just-advanced axis 1 pointers */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if 2 == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Carry into axis 2: increment its index ... */
+    NAD_INDEX(axisdata2)++;
+    /* ... and advance its pointers by the axis 2 strides */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Axis 2 is still in range: restart axes 0 and 1 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Axes 0 and 1 pointers restart from the axis 2 pointers */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Carry into the next axis: increment its index ... */
+        NAD_INDEX(axisdata2)++;
+        /* ... and advance its pointers by its strides */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Walk back from this axis down to axis 0, restarting each one */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Restart the index at 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Restart the pointers from the axis that just advanced */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0; /* every axis is exhausted */
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+
+#line 29
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX,ANY,1): advance one step; 1 = more, 0 = done */
+static int
+npyiter_iternext_itflagsRNGuIND_dimsANY_iters1(
+                                                      NpyIter *iter)
+{
+#if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1) /* () needed: & binds tighter than | */
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX;
+#  if NPY_MAXDIMS >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 1 < NPY_MAXDIMS
+    const int nop = 1;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if NPY_MAXDIMS > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if NPY_MAXDIMS > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if ((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_RANGE)
+    /* Ranged iteration: advance iterindex first; stop once it reaches iterend */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if NPY_MAXDIMS > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* EXLOOP is not in this flag set, so axis 0 is advanced here: its index ... */
+    NAD_INDEX(axisdata0)++;
+    /* ... and each of its nstrides pointers, by the matching stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if NPY_MAXDIMS == 1
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* Single axis: keep going while index 0 is still below shape 0 */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Carry into axis 1: increment its index ... */
+    NAD_INDEX(axisdata1)++;
+    /* ... and advance its pointers by the axis 1 strides */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Axis 1 is still in range: restart axis 0 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Axis 0 pointers restart from the just-advanced axis 1 pointers */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if NPY_MAXDIMS == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Carry into axis 2: increment its index ... */
+    NAD_INDEX(axisdata2)++;
+    /* ... and advance its pointers by the axis 2 strides */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Axis 2 is still in range: restart axes 0 and 1 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Axes 0 and 1 pointers restart from the axis 2 pointers */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Carry into the next axis: increment its index ... */
+        NAD_INDEX(axisdata2)++;
+        /* ... and advance its pointers by its strides */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Walk back from this axis down to axis 0, restarting each one */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Restart the index at 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Restart the pointers from the axis that just advanced */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0; /* every axis is exhausted */
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+
+/* Specialized iternext (NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX,ANY,2): advance one step; 1 = more, 0 = done */
+static int
+npyiter_iternext_itflagsRNGuIND_dimsANY_iters2(
+                                                      NpyIter *iter)
+{
+#if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1) /* () needed: & binds tighter than | */
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX;
+#  if NPY_MAXDIMS >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);
+#  endif
+#  if 2 < NPY_MAXDIMS
+    const int nop = 2;
+#  else
+    int nop = NIT_NOP(iter);
+#  endif
+
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if NPY_MAXDIMS > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if NPY_MAXDIMS > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if ((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_RANGE)
+    /* Ranged iteration: advance iterindex first; stop once it reaches iterend */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if NPY_MAXDIMS > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* EXLOOP is not in this flag set, so axis 0 is advanced here: its index ... */
+    NAD_INDEX(axisdata0)++;
+    /* ... and each of its nstrides pointers, by the matching stride */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if NPY_MAXDIMS == 1
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* Single axis: keep going while index 0 is still below shape 0 */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Carry into axis 1: increment its index ... */
+    NAD_INDEX(axisdata1)++;
+    /* ... and advance its pointers by the axis 1 strides */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Axis 1 is still in range: restart axis 0 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Axis 0 pointers restart from the just-advanced axis 1 pointers */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if NPY_MAXDIMS == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Carry into axis 2: increment its index ... */
+    NAD_INDEX(axisdata2)++;
+    /* ... and advance its pointers by the axis 2 strides */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Axis 2 is still in range: restart axes 0 and 1 at index 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Axes 0 and 1 pointers restart from the axis 2 pointers */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Carry into the next axis: increment its index ... */
+        NAD_INDEX(axisdata2)++;
+        /* ... and advance its pointers by its strides */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Walk back from this axis down to axis 0, restarting each one */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Restart the index at 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Restart the pointers from the axis that just advanced */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0; /* every axis is exhausted */
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+#line 33
+/* Specialized iternext (NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX,ANY,ANY): advance
+ * `iter` by one element.  Returns 1 if another element exists, 0 when done. */
+static int
+npyiter_iternext_itflagsRNGuIND_dimsANY_itersANY(
+                                                      NpyIter *iter)
+{
+#if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    const npy_uint32 itflags = NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX;
+#  if NPY_MAXDIMS >= NPY_MAXDIMS
+    int idim, ndim = NIT_NDIM(iter);  /* "ANY" dims: ndim is read at run time */
+#  endif
+#  if NPY_MAXDIMS < NPY_MAXDIMS
+    const int nop = NPY_MAXDIMS;
+#  else
+    int nop = NIT_NOP(iter);  /* "ANY" operands: nop is read at run time */
+#  endif
+    /* NB: '&' binds tighter than '|', so the flag union is parenthesized in #if */
+    NpyIter_AxisData *axisdata0;
+    npy_intp istrides, nstrides = NAD_NSTRIDES();
+#endif
+#if NPY_MAXDIMS > 1
+    NpyIter_AxisData *axisdata1;
+    npy_intp sizeof_axisdata;
+#endif
+#if NPY_MAXDIMS > 2
+    NpyIter_AxisData *axisdata2;
+#endif
+
+#if ((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_RANGE)
+    /* When ranged iteration is enabled, use the iterindex */
+    if (++NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        return 0;
+    }
+#endif
+
+#if NPY_MAXDIMS > 1
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
+#endif
+    /* Unparenthesized, the guards below were !(RANGE|0) == 0: axis 0 never moved */
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP) || (NPY_MAXDIMS > 1)
+    axisdata0 = NIT_AXISDATA(iter);
+#  endif
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* Increment index 0 */
+    NAD_INDEX(axisdata0)++;
+    /* Increment pointer 0 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata0)[istrides] += NAD_STRIDES(axisdata0)[istrides];
+    }
+#  endif
+
+#if NPY_MAXDIMS == 1
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    /* Finished when the index equals the shape */
+    return NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0);
+#  else
+    return 0;
+#  endif
+
+#else
+
+#  if !((NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_EXLOOP)
+    if (NAD_INDEX(axisdata0) < NAD_SHAPE(axisdata0)) {
+        return 1;  /* still inside axis 0: nothing else to update */
+    }
+#  endif
+
+    axisdata1 = NIT_INDEX_AXISDATA(axisdata0, 1);
+    /* Increment index 1 */
+    NAD_INDEX(axisdata1)++;
+    /* Increment pointer 1 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata1)[istrides] += NAD_STRIDES(axisdata1)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata1) < NAD_SHAPE(axisdata1)) {
+        /* Reset the 1st index to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        /* Reset the 1st pointer to the value of the 2nd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata1)[istrides];
+        }
+        return 1;
+    }
+
+# if NPY_MAXDIMS == 2
+    return 0;
+# else
+
+    axisdata2 = NIT_INDEX_AXISDATA(axisdata1, 1);
+    /* Increment index 2 */
+    NAD_INDEX(axisdata2)++;
+    /* Increment pointer 2 */
+    for (istrides = 0; istrides < nstrides; ++istrides) {
+        NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+    }
+
+    if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+        /* Reset the 1st and 2nd indices to 0 */
+        NAD_INDEX(axisdata0) = 0;
+        NAD_INDEX(axisdata1) = 0;
+        /* Reset the 1st and 2nd pointers to the value of the 3rd */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata0)[istrides] = NAD_PTRS(axisdata2)[istrides];
+            NAD_PTRS(axisdata1)[istrides] = NAD_PTRS(axisdata2)[istrides];
+        }
+        return 1;
+    }
+
+    for (idim = 3; idim < ndim; ++idim) {  /* axes beyond the three unrolled above */
+        NIT_ADVANCE_AXISDATA(axisdata2, 1);
+        /* Increment the index */
+        NAD_INDEX(axisdata2)++;
+        /* Increment the pointer */
+        for (istrides = 0; istrides < nstrides; ++istrides) {
+            NAD_PTRS(axisdata2)[istrides] += NAD_STRIDES(axisdata2)[istrides];
+        }
+
+
+        if (NAD_INDEX(axisdata2) < NAD_SHAPE(axisdata2)) {
+            /* Reset the indices and pointers of all previous axisdatas */
+            axisdata1 = axisdata2;
+            do {
+                NIT_ADVANCE_AXISDATA(axisdata1, -1);
+                /* Reset the index to 0 */
+                NAD_INDEX(axisdata1) = 0;
+                /* Reset the pointer to the updated value */
+                for (istrides = 0; istrides < nstrides; ++istrides) {
+                    NAD_PTRS(axisdata1)[istrides] =
+                                        NAD_PTRS(axisdata2)[istrides];
+                }
+            } while (axisdata1 != axisdata0);
+
+            return 1;
+        }
+    }
+
+    return 0;  /* every axis wrapped around: iteration is finished */
+
+# endif /* ndim != 2 */
+
+#endif /* ndim != 1 */
+}
+
+
+
+
+
+
+#line 187
+
+/* Buffered-reduce iternext, nop == 1 unless 1 >= NPY_MAXDIMS (then run-time).
+ * Double loop avoids frequent re-buffering: inner steps within the buffer,
+ * outer steps the reduce outer pointers.  Returns 1 if moved, 0 at end/error.
+ */
+static int
+npyiter_buffered_reduce_iternext_iters1(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*int ndim = NIT_NDIM(iter);*/
+#if 1 >= NPY_MAXDIMS
+    int nop = NIT_NOP(iter);
+#else
+    const int nop = 1;  /* operand count fixed at compile time */
+#endif
+
+    int iop;
+
+    NpyIter_AxisData *axisdata;
+    NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+    char **ptrs;
+    char *prev_dataptrs[NPY_MAXARGS];  /* snapshot handed to npyiter_copy_to_buffers */
+
+    ptrs = NBF_PTRS(bufferdata);
+
+    /*
+     * If the iterator handles the inner loop, need to increment all
+     * the indices and pointers
+     */
+    if (!(itflags&NPY_ITFLAG_EXLOOP)) {
+        /* Increment within the buffer */
+        if (++NIT_ITERINDEX(iter) < NBF_BUFITEREND(bufferdata)) {
+            npy_intp *strides;
+
+            strides = NBF_STRIDES(bufferdata);
+            for (iop = 0; iop < nop; ++iop) {
+                ptrs[iop] += strides[iop];
+            }
+            return 1;
+        }
+    }
+    else {
+        NIT_ITERINDEX(iter) += NBF_SIZE(bufferdata);  /* EXLOOP: skip one buffer-full */
+    }
+
+    NPY_IT_DBG_PRINT1("Iterator: Finished iteration %d of outer reduce loop\n",
+                            (int)NBF_REDUCE_POS(bufferdata));
+    /* The outer increment for the reduce double loop */
+    if (++NBF_REDUCE_POS(bufferdata) < NBF_REDUCE_OUTERSIZE(bufferdata)) {
+        npy_intp *reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata);
+        char **reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata);
+        for (iop = 0; iop < nop; ++iop) {
+            char *ptr = reduce_outerptrs[iop] + reduce_outerstrides[iop];
+            ptrs[iop] = ptr;
+            reduce_outerptrs[iop] = ptr;  /* remember the new outer base pointer */
+        }
+        NBF_BUFITEREND(bufferdata) = NIT_ITERINDEX(iter) + NBF_SIZE(bufferdata);
+        return 1;
+    }
+
+    /* Save the old data pointers; assumes sizeof(char *) == NPY_SIZEOF_INTP */
+    axisdata = NIT_AXISDATA(iter);
+    memcpy(prev_dataptrs, NAD_PTRS(axisdata), NPY_SIZEOF_INTP*nop);
+
+    /* Write back to the arrays */
+    if (npyiter_copy_from_buffers(iter) < 0) {
+        npyiter_clear_buffers(iter);
+        return 0;  /* NOTE(review): failure and normal end both return 0 */
+    }
+
+    /* Check if we're past the end */
+    if (NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        NBF_SIZE(bufferdata) = 0;  /* record that nothing is buffered any more */
+        return 0;
+    }
+    /* Increment to the next buffer */
+    else {
+        npyiter_goto_iterindex(iter, NIT_ITERINDEX(iter));
+    }
+
+    /* Prepare the next buffers and set iterend/size */
+    if (npyiter_copy_to_buffers(iter, prev_dataptrs) < 0) {
+        npyiter_clear_buffers(iter);
+        return 0;
+    }
+
+    return 1;
+}
+
+
+#line 187
+
+/* Buffered-reduce iternext, nop == 2 unless 2 >= NPY_MAXDIMS (then run-time).
+ * Double loop avoids frequent re-buffering: inner steps within the buffer,
+ * outer steps the reduce outer pointers.  Returns 1 if moved, 0 at end/error.
+ */
+static int
+npyiter_buffered_reduce_iternext_iters2(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*int ndim = NIT_NDIM(iter);*/
+#if 2 >= NPY_MAXDIMS
+    int nop = NIT_NOP(iter);
+#else
+    const int nop = 2;  /* operand count fixed at compile time */
+#endif
+
+    int iop;
+
+    NpyIter_AxisData *axisdata;
+    NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+    char **ptrs;
+    char *prev_dataptrs[NPY_MAXARGS];  /* snapshot handed to npyiter_copy_to_buffers */
+
+    ptrs = NBF_PTRS(bufferdata);
+
+    /*
+     * If the iterator handles the inner loop, need to increment all
+     * the indices and pointers
+     */
+    if (!(itflags&NPY_ITFLAG_EXLOOP)) {
+        /* Increment within the buffer */
+        if (++NIT_ITERINDEX(iter) < NBF_BUFITEREND(bufferdata)) {
+            npy_intp *strides;
+
+            strides = NBF_STRIDES(bufferdata);
+            for (iop = 0; iop < nop; ++iop) {
+                ptrs[iop] += strides[iop];
+            }
+            return 1;
+        }
+    }
+    else {
+        NIT_ITERINDEX(iter) += NBF_SIZE(bufferdata);  /* EXLOOP: skip one buffer-full */
+    }
+
+    NPY_IT_DBG_PRINT1("Iterator: Finished iteration %d of outer reduce loop\n",
+                            (int)NBF_REDUCE_POS(bufferdata));
+    /* The outer increment for the reduce double loop */
+    if (++NBF_REDUCE_POS(bufferdata) < NBF_REDUCE_OUTERSIZE(bufferdata)) {
+        npy_intp *reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata);
+        char **reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata);
+        for (iop = 0; iop < nop; ++iop) {
+            char *ptr = reduce_outerptrs[iop] + reduce_outerstrides[iop];
+            ptrs[iop] = ptr;
+            reduce_outerptrs[iop] = ptr;  /* remember the new outer base pointer */
+        }
+        NBF_BUFITEREND(bufferdata) = NIT_ITERINDEX(iter) + NBF_SIZE(bufferdata);
+        return 1;
+    }
+
+    /* Save the old data pointers; assumes sizeof(char *) == NPY_SIZEOF_INTP */
+    axisdata = NIT_AXISDATA(iter);
+    memcpy(prev_dataptrs, NAD_PTRS(axisdata), NPY_SIZEOF_INTP*nop);
+
+    /* Write back to the arrays */
+    if (npyiter_copy_from_buffers(iter) < 0) {
+        npyiter_clear_buffers(iter);
+        return 0;  /* NOTE(review): failure and normal end both return 0 */
+    }
+
+    /* Check if we're past the end */
+    if (NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        NBF_SIZE(bufferdata) = 0;  /* record that nothing is buffered any more */
+        return 0;
+    }
+    /* Increment to the next buffer */
+    else {
+        npyiter_goto_iterindex(iter, NIT_ITERINDEX(iter));
+    }
+
+    /* Prepare the next buffers and set iterend/size */
+    if (npyiter_copy_to_buffers(iter, prev_dataptrs) < 0) {
+        npyiter_clear_buffers(iter);
+        return 0;
+    }
+
+    return 1;
+}
+
+
+#line 187
+
+/* Buffered-reduce iternext, nop == 3 unless 3 >= NPY_MAXDIMS (then run-time).
+ * Double loop avoids frequent re-buffering: inner steps within the buffer,
+ * outer steps the reduce outer pointers.  Returns 1 if moved, 0 at end/error.
+ */
+static int
+npyiter_buffered_reduce_iternext_iters3(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*int ndim = NIT_NDIM(iter);*/
+#if 3 >= NPY_MAXDIMS
+    int nop = NIT_NOP(iter);
+#else
+    const int nop = 3;  /* operand count fixed at compile time */
+#endif
+
+    int iop;
+
+    NpyIter_AxisData *axisdata;
+    NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+    char **ptrs;
+    char *prev_dataptrs[NPY_MAXARGS];  /* snapshot handed to npyiter_copy_to_buffers */
+
+    ptrs = NBF_PTRS(bufferdata);
+
+    /*
+     * If the iterator handles the inner loop, need to increment all
+     * the indices and pointers
+     */
+    if (!(itflags&NPY_ITFLAG_EXLOOP)) {
+        /* Increment within the buffer */
+        if (++NIT_ITERINDEX(iter) < NBF_BUFITEREND(bufferdata)) {
+            npy_intp *strides;
+
+            strides = NBF_STRIDES(bufferdata);
+            for (iop = 0; iop < nop; ++iop) {
+                ptrs[iop] += strides[iop];
+            }
+            return 1;
+        }
+    }
+    else {
+        NIT_ITERINDEX(iter) += NBF_SIZE(bufferdata);  /* EXLOOP: skip one buffer-full */
+    }
+
+    NPY_IT_DBG_PRINT1("Iterator: Finished iteration %d of outer reduce loop\n",
+                            (int)NBF_REDUCE_POS(bufferdata));
+    /* The outer increment for the reduce double loop */
+    if (++NBF_REDUCE_POS(bufferdata) < NBF_REDUCE_OUTERSIZE(bufferdata)) {
+        npy_intp *reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata);
+        char **reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata);
+        for (iop = 0; iop < nop; ++iop) {
+            char *ptr = reduce_outerptrs[iop] + reduce_outerstrides[iop];
+            ptrs[iop] = ptr;
+            reduce_outerptrs[iop] = ptr;  /* remember the new outer base pointer */
+        }
+        NBF_BUFITEREND(bufferdata) = NIT_ITERINDEX(iter) + NBF_SIZE(bufferdata);
+        return 1;
+    }
+
+    /* Save the old data pointers; assumes sizeof(char *) == NPY_SIZEOF_INTP */
+    axisdata = NIT_AXISDATA(iter);
+    memcpy(prev_dataptrs, NAD_PTRS(axisdata), NPY_SIZEOF_INTP*nop);
+
+    /* Write back to the arrays */
+    if (npyiter_copy_from_buffers(iter) < 0) {
+        npyiter_clear_buffers(iter);
+        return 0;  /* NOTE(review): failure and normal end both return 0 */
+    }
+
+    /* Check if we're past the end */
+    if (NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        NBF_SIZE(bufferdata) = 0;  /* record that nothing is buffered any more */
+        return 0;
+    }
+    /* Increment to the next buffer */
+    else {
+        npyiter_goto_iterindex(iter, NIT_ITERINDEX(iter));
+    }
+
+    /* Prepare the next buffers and set iterend/size */
+    if (npyiter_copy_to_buffers(iter, prev_dataptrs) < 0) {
+        npyiter_clear_buffers(iter);
+        return 0;
+    }
+
+    return 1;
+}
+
+
+#line 187
+
+/* Buffered-reduce iternext, nop == 4 unless 4 >= NPY_MAXDIMS (then run-time).
+ * Double loop avoids frequent re-buffering: inner steps within the buffer,
+ * outer steps the reduce outer pointers.  Returns 1 if moved, 0 at end/error.
+ */
+static int
+npyiter_buffered_reduce_iternext_iters4(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*int ndim = NIT_NDIM(iter);*/
+#if 4 >= NPY_MAXDIMS
+    int nop = NIT_NOP(iter);
+#else
+    const int nop = 4;  /* operand count fixed at compile time */
+#endif
+
+    int iop;
+
+    NpyIter_AxisData *axisdata;
+    NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+    char **ptrs;
+    char *prev_dataptrs[NPY_MAXARGS];  /* snapshot handed to npyiter_copy_to_buffers */
+
+    ptrs = NBF_PTRS(bufferdata);
+
+    /*
+     * If the iterator handles the inner loop, need to increment all
+     * the indices and pointers
+     */
+    if (!(itflags&NPY_ITFLAG_EXLOOP)) {
+        /* Increment within the buffer */
+        if (++NIT_ITERINDEX(iter) < NBF_BUFITEREND(bufferdata)) {
+            npy_intp *strides;
+
+            strides = NBF_STRIDES(bufferdata);
+            for (iop = 0; iop < nop; ++iop) {
+                ptrs[iop] += strides[iop];
+            }
+            return 1;
+        }
+    }
+    else {
+        NIT_ITERINDEX(iter) += NBF_SIZE(bufferdata);  /* EXLOOP: skip one buffer-full */
+    }
+
+    NPY_IT_DBG_PRINT1("Iterator: Finished iteration %d of outer reduce loop\n",
+                            (int)NBF_REDUCE_POS(bufferdata));
+    /* The outer increment for the reduce double loop */
+    if (++NBF_REDUCE_POS(bufferdata) < NBF_REDUCE_OUTERSIZE(bufferdata)) {
+        npy_intp *reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata);
+        char **reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata);
+        for (iop = 0; iop < nop; ++iop) {
+            char *ptr = reduce_outerptrs[iop] + reduce_outerstrides[iop];
+            ptrs[iop] = ptr;
+            reduce_outerptrs[iop] = ptr;  /* remember the new outer base pointer */
+        }
+        NBF_BUFITEREND(bufferdata) = NIT_ITERINDEX(iter) + NBF_SIZE(bufferdata);
+        return 1;
+    }
+
+    /* Save the old data pointers; assumes sizeof(char *) == NPY_SIZEOF_INTP */
+    axisdata = NIT_AXISDATA(iter);
+    memcpy(prev_dataptrs, NAD_PTRS(axisdata), NPY_SIZEOF_INTP*nop);
+
+    /* Write back to the arrays */
+    if (npyiter_copy_from_buffers(iter) < 0) {
+        npyiter_clear_buffers(iter);
+        return 0;  /* NOTE(review): failure and normal end both return 0 */
+    }
+
+    /* Check if we're past the end */
+    if (NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        NBF_SIZE(bufferdata) = 0;  /* record that nothing is buffered any more */
+        return 0;
+    }
+    /* Increment to the next buffer */
+    else {
+        npyiter_goto_iterindex(iter, NIT_ITERINDEX(iter));
+    }
+
+    /* Prepare the next buffers and set iterend/size */
+    if (npyiter_copy_to_buffers(iter, prev_dataptrs) < 0) {
+        npyiter_clear_buffers(iter);
+        return 0;
+    }
+
+    return 1;
+}
+
+
+#line 187
+
+/* Buffered-reduce iternext for any operand count (nop is read at run time).
+ * Double loop avoids frequent re-buffering: inner steps within the buffer,
+ * outer steps the reduce outer pointers.  Returns 1 if moved, 0 at end/error.
+ */
+static int
+npyiter_buffered_reduce_iternext_itersANY(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*int ndim = NIT_NDIM(iter);*/
+#if NPY_MAXDIMS >= NPY_MAXDIMS
+    int nop = NIT_NOP(iter);  /* condition is always true: this branch is compiled */
+#else
+    const int nop = NPY_MAXDIMS;
+#endif
+
+    int iop;
+
+    NpyIter_AxisData *axisdata;
+    NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+    char **ptrs;
+    char *prev_dataptrs[NPY_MAXARGS];  /* snapshot handed to npyiter_copy_to_buffers */
+
+    ptrs = NBF_PTRS(bufferdata);
+
+    /*
+     * If the iterator handles the inner loop, need to increment all
+     * the indices and pointers
+     */
+    if (!(itflags&NPY_ITFLAG_EXLOOP)) {
+        /* Increment within the buffer */
+        if (++NIT_ITERINDEX(iter) < NBF_BUFITEREND(bufferdata)) {
+            npy_intp *strides;
+
+            strides = NBF_STRIDES(bufferdata);
+            for (iop = 0; iop < nop; ++iop) {
+                ptrs[iop] += strides[iop];
+            }
+            return 1;
+        }
+    }
+    else {
+        NIT_ITERINDEX(iter) += NBF_SIZE(bufferdata);  /* EXLOOP: skip one buffer-full */
+    }
+
+    NPY_IT_DBG_PRINT1("Iterator: Finished iteration %d of outer reduce loop\n",
+                            (int)NBF_REDUCE_POS(bufferdata));
+    /* The outer increment for the reduce double loop */
+    if (++NBF_REDUCE_POS(bufferdata) < NBF_REDUCE_OUTERSIZE(bufferdata)) {
+        npy_intp *reduce_outerstrides = NBF_REDUCE_OUTERSTRIDES(bufferdata);
+        char **reduce_outerptrs = NBF_REDUCE_OUTERPTRS(bufferdata);
+        for (iop = 0; iop < nop; ++iop) {
+            char *ptr = reduce_outerptrs[iop] + reduce_outerstrides[iop];
+            ptrs[iop] = ptr;
+            reduce_outerptrs[iop] = ptr;  /* remember the new outer base pointer */
+        }
+        NBF_BUFITEREND(bufferdata) = NIT_ITERINDEX(iter) + NBF_SIZE(bufferdata);
+        return 1;
+    }
+
+    /* Save the old data pointers; assumes sizeof(char *) == NPY_SIZEOF_INTP */
+    axisdata = NIT_AXISDATA(iter);
+    memcpy(prev_dataptrs, NAD_PTRS(axisdata), NPY_SIZEOF_INTP*nop);
+
+    /* Write back to the arrays */
+    if (npyiter_copy_from_buffers(iter) < 0) {
+        npyiter_clear_buffers(iter);
+        return 0;  /* NOTE(review): failure and normal end both return 0 */
+    }
+
+    /* Check if we're past the end */
+    if (NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        NBF_SIZE(bufferdata) = 0;  /* record that nothing is buffered any more */
+        return 0;
+    }
+    /* Increment to the next buffer */
+    else {
+        npyiter_goto_iterindex(iter, NIT_ITERINDEX(iter));
+    }
+
+    /* Prepare the next buffers and set iterend/size */
+    if (npyiter_copy_to_buffers(iter, prev_dataptrs) < 0) {
+        npyiter_clear_buffers(iter);
+        return 0;
+    }
+
+    return 1;
+}
+
+
+
+/* Buffered (non-reduce) iternext: returns 1 if moved, 0 at end or copy error */
+static int
+npyiter_buffered_iternext(NpyIter *iter)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    /*int ndim = NIT_NDIM(iter);*/
+    int nop = NIT_NOP(iter);
+
+    NpyIter_BufferData *bufferdata = NIT_BUFFERDATA(iter);
+
+    /*
+     * If the iterator handles the inner loop, need to increment all
+     * the indices and pointers
+     */
+    if (!(itflags&NPY_ITFLAG_EXLOOP)) {
+        /* Increment within the buffer */
+        if (++NIT_ITERINDEX(iter) < NBF_BUFITEREND(bufferdata)) {
+            int iop;
+            npy_intp *strides;
+            char **ptrs;
+
+            strides = NBF_STRIDES(bufferdata);
+            ptrs = NBF_PTRS(bufferdata);
+            for (iop = 0; iop < nop; ++iop) {
+                ptrs[iop] += strides[iop];
+            }
+            return 1;
+        }
+    }
+    else {
+        NIT_ITERINDEX(iter) += NBF_SIZE(bufferdata);  /* EXLOOP: skip one buffer-full */
+    }
+
+    /* Write back to the arrays */
+    if (npyiter_copy_from_buffers(iter) < 0) {
+        npyiter_clear_buffers(iter);
+        return 0;  /* NOTE(review): failure and normal end both return 0 */
+    }
+
+    /* Check if we're past the end */
+    if (NIT_ITERINDEX(iter) >= NIT_ITEREND(iter)) {
+        NBF_SIZE(bufferdata) = 0;  /* record that nothing is buffered any more */
+        return 0;
+    }
+    /* Increment to the next buffer */
+    else {
+        npyiter_goto_iterindex(iter, NIT_ITERINDEX(iter));
+    }
+
+    /* Prepare the next buffers and set iterend/size */
+    if (npyiter_copy_to_buffers(iter, NULL) < 0) {  /* NULL: no saved pointers here */
+        npyiter_clear_buffers(iter);
+        return 0;
+    }
+
+    return 1;
+}
+
+/**end repeat2**/
+/**end repeat1**/
+/**end repeat**/
+
+/* Specialization of iternext for when the iteration size is 1 */
+static int
+npyiter_iternext_sizeone(NpyIter *iter)
+{
+    return 0;  /* always reports "no further element"; `iter` is not touched */
+}
+
+/*NUMPY_API
+ * Compute the specialized iteration function for an iterator.
+ * Returns NULL on failure (negative itersize or an unhandled flag combination).
+ * If errmsg is non-NULL, it should point to a variable which will
+ * receive the error message, and no Python exception will be set.
+ * This is so that the function can be called from code not holding
+ * the GIL.
+ */
+NPY_NO_EXPORT NpyIter_IterNextFunc *
+NpyIter_GetIterNext(NpyIter *iter, char **errmsg)
+{
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    int ndim = NIT_NDIM(iter);
+    int nop = NIT_NOP(iter);
+
+    if (NIT_ITERSIZE(iter) < 0) {
+        if (errmsg == NULL) {
+            PyErr_SetString(PyExc_ValueError, "iterator is too large");
+        }
+        else {
+            *errmsg = "iterator is too large";  /* string literal: not to be freed */
+        }
+        return NULL;
+    }
+
+    /*
+     * When there is just one iteration and buffering is disabled
+     * the iternext function is very simple.
+     */
+    if (itflags&NPY_ITFLAG_ONEITERATION) {  /* NOTE(review): BUFFER is not tested here */
+        return &npyiter_iternext_sizeone;
+    }
+
+    /*
+     * If buffering is enabled.
+     */
+    if (itflags&NPY_ITFLAG_BUFFER) {
+        if (itflags&NPY_ITFLAG_REDUCE) {
+            switch (nop) {  /* reduce variants are specialized on nop only */
+                case 1:
+                    return &npyiter_buffered_reduce_iternext_iters1;
+                case 2:
+                    return &npyiter_buffered_reduce_iternext_iters2;
+                case 3:
+                    return &npyiter_buffered_reduce_iternext_iters3;
+                case 4:
+                    return &npyiter_buffered_reduce_iternext_iters4;
+                default:
+                    return &npyiter_buffered_reduce_iternext_itersANY;
+            }
+        }
+        else {
+            return &npyiter_buffered_iternext;
+        }
+    }
+
+    /*
+     * Ignore all the flags that don't affect the iterator memory
+     * layout or the iternext function.  Currently only HASINDEX,
+     * EXLOOP, and RANGE affect them here.
+     */
+    itflags &= (NPY_ITFLAG_HASINDEX|NPY_ITFLAG_EXLOOP|NPY_ITFLAG_RANGE);
+
+    /* Switch statements let the compiler optimize this most effectively */
+    switch (itflags) {
+    /*
+     * The combinations HASINDEX|EXLOOP and RANGE|EXLOOP are excluded
+     * by the New functions
+     */
+#line 424
+        case 0:  /* none of HASINDEX, EXLOOP, RANGE */
+            switch (ndim) {
+#line 430
+                case 1:
+                    switch (nop) {
+#line 436
+                        case 1:
+                            return &npyiter_iternext_itflags0_dims1_iters1;
+
+#line 436
+                        case 2:
+                            return &npyiter_iternext_itflags0_dims1_iters2;
+
+                        /* Not specialized on nop */
+                        default:
+                            return &npyiter_iternext_itflags0_dims1_itersANY;
+                    }
+
+#line 430
+                case 2:
+                    switch (nop) {
+#line 436
+                        case 1:
+                            return &npyiter_iternext_itflags0_dims2_iters1;
+
+#line 436
+                        case 2:
+                            return &npyiter_iternext_itflags0_dims2_iters2;
+
+                        /* Not specialized on nop */
+                        default:
+                            return &npyiter_iternext_itflags0_dims2_itersANY;
+                    }
+
+                /* Not specialized on ndim */
+                default:
+                    switch (nop) {
+#line 451
+                        case 1:
+                            return &npyiter_iternext_itflags0_dimsANY_iters1;
+
+#line 451
+                        case 2:
+                            return &npyiter_iternext_itflags0_dimsANY_iters2;
+
+                        /* Not specialized on nop */
+                        default:
+                            return &npyiter_iternext_itflags0_dimsANY_itersANY;
+                    }
+            }
+
+#line 424
+        case NPY_ITFLAG_HASINDEX:
+            switch (ndim) {
+#line 430
+                case 1:
+                    switch (nop) {
+#line 436
+                        case 1:
+                            return &npyiter_iternext_itflagsIND_dims1_iters1;
+
+#line 436
+                        case 2:
+                            return &npyiter_iternext_itflagsIND_dims1_iters2;
+
+                        /* Not specialized on nop */
+                        default:
+                            return &npyiter_iternext_itflagsIND_dims1_itersANY;
+                    }
+
+#line 430
+                case 2:
+                    switch (nop) {
+#line 436
+                        case 1:
+                            return &npyiter_iternext_itflagsIND_dims2_iters1;
+
+#line 436
+                        case 2:
+                            return &npyiter_iternext_itflagsIND_dims2_iters2;
+
+                        /* Not specialized on nop */
+                        default:
+                            return &npyiter_iternext_itflagsIND_dims2_itersANY;
+                    }
+
+                /* Not specialized on ndim */
+                default:
+                    switch (nop) {
+#line 451
+                        case 1:
+                            return &npyiter_iternext_itflagsIND_dimsANY_iters1;
+
+#line 451
+                        case 2:
+                            return &npyiter_iternext_itflagsIND_dimsANY_iters2;
+
+                        /* Not specialized on nop */
+                        default:
+                            return &npyiter_iternext_itflagsIND_dimsANY_itersANY;
+                    }
+            }
+
+#line 424
+        case NPY_ITFLAG_EXLOOP:
+            switch (ndim) {
+#line 430
+                case 1:
+                    switch (nop) {
+#line 436
+                        case 1:
+                            return &npyiter_iternext_itflagsNOINN_dims1_iters1;
+
+#line 436
+                        case 2:
+                            return &npyiter_iternext_itflagsNOINN_dims1_iters2;
+
+                        /* Not specialized on nop */
+                        default:
+                            return &npyiter_iternext_itflagsNOINN_dims1_itersANY;
+                    }
+
+#line 430
+                case 2:
+                    switch (nop) {
+#line 436
+                        case 1:
+                            return &npyiter_iternext_itflagsNOINN_dims2_iters1;
+
+#line 436
+                        case 2:
+                            return &npyiter_iternext_itflagsNOINN_dims2_iters2;
+
+                        /* Not specialized on nop */
+                        default:
+                            return &npyiter_iternext_itflagsNOINN_dims2_itersANY;
+                    }
+
+                /* Not specialized on ndim */
+                default:
+                    switch (nop) {
+#line 451
+                        case 1:
+                            return &npyiter_iternext_itflagsNOINN_dimsANY_iters1;
+
+#line 451
+                        case 2:
+                            return &npyiter_iternext_itflagsNOINN_dimsANY_iters2;
+
+                        /* Not specialized on nop */
+                        default:
+                            return &npyiter_iternext_itflagsNOINN_dimsANY_itersANY;
+                    }
+            }
+
+#line 424
+        case NPY_ITFLAG_RANGE:
+            switch (ndim) {
+#line 430
+                case 1:
+                    switch (nop) {
+#line 436
+                        case 1:
+                            return &npyiter_iternext_itflagsRNG_dims1_iters1;
+
+#line 436
+                        case 2:
+                            return &npyiter_iternext_itflagsRNG_dims1_iters2;
+
+                        /* Not specialized on nop */
+                        default:
+                            return &npyiter_iternext_itflagsRNG_dims1_itersANY;
+                    }
+
+#line 430
+                case 2:
+                    switch (nop) {
+#line 436
+                        case 1:
+                            return &npyiter_iternext_itflagsRNG_dims2_iters1;
+
+#line 436
+                        case 2:
+                            return &npyiter_iternext_itflagsRNG_dims2_iters2;
+
+                        /* Not specialized on nop */
+                        default:
+                            return &npyiter_iternext_itflagsRNG_dims2_itersANY;
+                    }
+
+                /* Not specialized on ndim */
+                default:
+                    switch (nop) {
+#line 451
+                        case 1:
+                            return &npyiter_iternext_itflagsRNG_dimsANY_iters1;
+
+#line 451
+                        case 2:
+                            return &npyiter_iternext_itflagsRNG_dimsANY_iters2;
+
+                        /* Not specialized on nop */
+                        default:
+                            return &npyiter_iternext_itflagsRNG_dimsANY_itersANY;
+                    }
+            }
+
+#line 424
+        case NPY_ITFLAG_RANGE|NPY_ITFLAG_HASINDEX:
+            switch (ndim) {
+#line 430
+                case 1:
+                    switch (nop) {
+#line 436
+                        case 1:
+                            return &npyiter_iternext_itflagsRNGuIND_dims1_iters1;
+
+#line 436
+                        case 2:
+                            return &npyiter_iternext_itflagsRNGuIND_dims1_iters2;
+
+                        /* Not specialized on nop */
+                        default:
+                            return &npyiter_iternext_itflagsRNGuIND_dims1_itersANY;
+                    }
+
+#line 430
+                case 2:
+                    switch (nop) {
+#line 436
+                        case 1:
+                            return &npyiter_iternext_itflagsRNGuIND_dims2_iters1;
+
+#line 436
+                        case 2:
+                            return &npyiter_iternext_itflagsRNGuIND_dims2_iters2;
+
+                        /* Not specialized on nop */
+                        default:
+                            return &npyiter_iternext_itflagsRNGuIND_dims2_itersANY;
+                    }
+
+                /* Not specialized on ndim */
+                default:
+                    switch (nop) {
+#line 451
+                        case 1:
+                            return &npyiter_iternext_itflagsRNGuIND_dimsANY_iters1;
+
+#line 451
+                        case 2:
+                            return &npyiter_iternext_itflagsRNGuIND_dimsANY_iters2;
+
+                        /* Not specialized on nop */
+                        default:
+                            return &npyiter_iternext_itflagsRNGuIND_dimsANY_itersANY;
+                    }
+            }
+
+    }
+    /* Reached only when the masked itflags value has no case label above. */
+    if (errmsg == NULL) {
+        PyErr_Format(PyExc_ValueError,
+                "GetIterNext internal iterator error - unexpected "
+                "itflags/ndim/nop combination (%04x/%d/%d)",
+                (int)itflags, (int)ndim, (int)nop);
+    }
+    else {
+        *errmsg = "GetIterNext internal iterator error - unexpected "
+                  "itflags/ndim/nop combination";
+    }
+    return NULL;
+}
+
+
+/* SPECIALIZED getindex functions */
+
+#line 494
+/* Variant that NpyIter_GetGetMultiIndex returns for masked itflags == 0; writes one out_multi_index entry per axis. */
+static void
+npyiter_get_multi_index_itflags0(
+                        NpyIter *iter, npy_intp *out_multi_index)
+{
+    const npy_uint32 itflags = 0;
+    int idim, ndim = NIT_NDIM(iter);
+    int nop = NIT_NOP(iter);
+    npy_intp sizeof_axisdata;
+    NpyIter_AxisData *axisdata;
+#if !((0)&NPY_ITFLAG_IDENTPERM)
+    npy_int8 *perm = NIT_PERM(iter);
+#endif
+    /* itflags is 0, so neither IDENTPERM nor NEGPERM is set and only the #elif loop is compiled: axis idim goes to slot ndim-perm[idim]-1. */
+    axisdata = NIT_AXISDATA(iter);
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);  /* presumably read by NIT_ADVANCE_AXISDATA -- TODO confirm */
+#if ((0)&NPY_ITFLAG_IDENTPERM)
+    out_multi_index += ndim-1;
+    for(idim = 0; idim < ndim; ++idim, --out_multi_index,
+                                    NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        *out_multi_index = NAD_INDEX(axisdata);
+    }
+#elif !((0)&NPY_ITFLAG_NEGPERM)
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+    }
+#else
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        if (p < 0) {
+            /* If the perm entry is negative, reverse the index */
+            out_multi_index[ndim+p] = NAD_SHAPE(axisdata) - NAD_INDEX(axisdata) - 1;
+        }
+        else {
+            out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+        }
+    }
+#endif /* not ident perm */
+}
+
+#line 494
+/* Variant that NpyIter_GetGetMultiIndex returns for masked itflags == HASINDEX; writes one out_multi_index entry per axis. */
+static void
+npyiter_get_multi_index_itflagsIND(
+                        NpyIter *iter, npy_intp *out_multi_index)
+{
+    const npy_uint32 itflags = NPY_ITFLAG_HASINDEX;
+    int idim, ndim = NIT_NDIM(iter);
+    int nop = NIT_NOP(iter);
+    npy_intp sizeof_axisdata;
+    NpyIter_AxisData *axisdata;
+#if !((NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_IDENTPERM)
+    npy_int8 *perm = NIT_PERM(iter);
+#endif
+    /* Neither IDENTPERM nor NEGPERM is in this flag set, so (assuming distinct flag bits) only the #elif loop is compiled: axis idim goes to slot ndim-perm[idim]-1. */
+    axisdata = NIT_AXISDATA(iter);
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);  /* presumably read by NIT_ADVANCE_AXISDATA -- TODO confirm */
+#if ((NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_IDENTPERM)
+    out_multi_index += ndim-1;
+    for(idim = 0; idim < ndim; ++idim, --out_multi_index,
+                                    NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        *out_multi_index = NAD_INDEX(axisdata);
+    }
+#elif !((NPY_ITFLAG_HASINDEX)&NPY_ITFLAG_NEGPERM)
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+    }
+#else
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        if (p < 0) {
+            /* If the perm entry is negative, reverse the index */
+            out_multi_index[ndim+p] = NAD_SHAPE(axisdata) - NAD_INDEX(axisdata) - 1;
+        }
+        else {
+            out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+        }
+    }
+#endif /* not ident perm */
+}
+
+#line 494
+/* Variant that NpyIter_GetGetMultiIndex returns for masked itflags == IDENTPERM; writes one out_multi_index entry per axis. */
+static void
+npyiter_get_multi_index_itflagsIDP(
+                        NpyIter *iter, npy_intp *out_multi_index)
+{
+    const npy_uint32 itflags = NPY_ITFLAG_IDENTPERM;
+    int idim, ndim = NIT_NDIM(iter);
+    int nop = NIT_NOP(iter);
+    npy_intp sizeof_axisdata;
+    NpyIter_AxisData *axisdata;
+#if !((NPY_ITFLAG_IDENTPERM)&NPY_ITFLAG_IDENTPERM)
+    npy_int8 *perm = NIT_PERM(iter);
+#endif
+    /* IDENTPERM is in this flag set, so perm is not declared and only the first loop is compiled: axis idim goes to slot ndim-1-idim. */
+    axisdata = NIT_AXISDATA(iter);
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);  /* presumably read by NIT_ADVANCE_AXISDATA -- TODO confirm */
+#if ((NPY_ITFLAG_IDENTPERM)&NPY_ITFLAG_IDENTPERM)
+    out_multi_index += ndim-1;
+    for(idim = 0; idim < ndim; ++idim, --out_multi_index,
+                                    NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        *out_multi_index = NAD_INDEX(axisdata);
+    }
+#elif !((NPY_ITFLAG_IDENTPERM)&NPY_ITFLAG_NEGPERM)
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+    }
+#else
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        if (p < 0) {
+            /* If the perm entry is negative, reverse the index */
+            out_multi_index[ndim+p] = NAD_SHAPE(axisdata) - NAD_INDEX(axisdata) - 1;
+        }
+        else {
+            out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+        }
+    }
+#endif /* not ident perm */
+}
+
+#line 494
+/* Variant that NpyIter_GetGetMultiIndex returns for masked itflags == HASINDEX|IDENTPERM; writes one out_multi_index entry per axis. */
+static void
+npyiter_get_multi_index_itflagsINDuIDP(
+                        NpyIter *iter, npy_intp *out_multi_index)
+{
+    const npy_uint32 itflags = NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM;
+    int idim, ndim = NIT_NDIM(iter);
+    int nop = NIT_NOP(iter);
+    npy_intp sizeof_axisdata;
+    NpyIter_AxisData *axisdata;
+#if !((NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM)&NPY_ITFLAG_IDENTPERM)
+    npy_int8 *perm = NIT_PERM(iter);
+#endif
+    /* IDENTPERM is in this flag set, so perm is not declared and only the first loop is compiled: axis idim goes to slot ndim-1-idim. */
+    axisdata = NIT_AXISDATA(iter);
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);  /* presumably read by NIT_ADVANCE_AXISDATA -- TODO confirm */
+#if ((NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM)&NPY_ITFLAG_IDENTPERM)
+    out_multi_index += ndim-1;
+    for(idim = 0; idim < ndim; ++idim, --out_multi_index,
+                                    NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        *out_multi_index = NAD_INDEX(axisdata);
+    }
+#elif !((NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM)&NPY_ITFLAG_NEGPERM)
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+    }
+#else
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        if (p < 0) {
+            /* If the perm entry is negative, reverse the index */
+            out_multi_index[ndim+p] = NAD_SHAPE(axisdata) - NAD_INDEX(axisdata) - 1;
+        }
+        else {
+            out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+        }
+    }
+#endif /* not ident perm */
+}
+
+#line 494
+/* Variant that NpyIter_GetGetMultiIndex returns for masked itflags == NEGPERM; writes one out_multi_index entry per axis. */
+static void
+npyiter_get_multi_index_itflagsNEGP(
+                        NpyIter *iter, npy_intp *out_multi_index)
+{
+    const npy_uint32 itflags = NPY_ITFLAG_NEGPERM;
+    int idim, ndim = NIT_NDIM(iter);
+    int nop = NIT_NOP(iter);
+    npy_intp sizeof_axisdata;
+    NpyIter_AxisData *axisdata;
+#if !((NPY_ITFLAG_NEGPERM)&NPY_ITFLAG_IDENTPERM)
+    npy_int8 *perm = NIT_PERM(iter);
+#endif
+    /* NEGPERM is in this flag set and IDENTPERM is not, so (assuming distinct flag bits) only the #else loop is compiled; for negative perm entries the stored value is shape - index - 1. */
+    axisdata = NIT_AXISDATA(iter);
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);  /* presumably read by NIT_ADVANCE_AXISDATA -- TODO confirm */
+#if ((NPY_ITFLAG_NEGPERM)&NPY_ITFLAG_IDENTPERM)
+    out_multi_index += ndim-1;
+    for(idim = 0; idim < ndim; ++idim, --out_multi_index,
+                                    NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        *out_multi_index = NAD_INDEX(axisdata);
+    }
+#elif !((NPY_ITFLAG_NEGPERM)&NPY_ITFLAG_NEGPERM)
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+    }
+#else
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        if (p < 0) {
+            /* If the perm entry is negative, reverse the index */
+            out_multi_index[ndim+p] = NAD_SHAPE(axisdata) - NAD_INDEX(axisdata) - 1;
+        }
+        else {
+            out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+        }
+    }
+#endif /* not ident perm */
+}
+
+#line 494
+/* Variant that NpyIter_GetGetMultiIndex returns for masked itflags == HASINDEX|NEGPERM; writes one out_multi_index entry per axis. */
+static void
+npyiter_get_multi_index_itflagsINDuNEGP(
+                        NpyIter *iter, npy_intp *out_multi_index)
+{
+    const npy_uint32 itflags = NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM;
+    int idim, ndim = NIT_NDIM(iter);
+    int nop = NIT_NOP(iter);
+    npy_intp sizeof_axisdata;
+    NpyIter_AxisData *axisdata;
+#if !((NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM)&NPY_ITFLAG_IDENTPERM)
+    npy_int8 *perm = NIT_PERM(iter);
+#endif
+    /* NEGPERM is in this flag set and IDENTPERM is not, so (assuming distinct flag bits) only the #else loop is compiled; for negative perm entries the stored value is shape - index - 1. */
+    axisdata = NIT_AXISDATA(iter);
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);  /* presumably read by NIT_ADVANCE_AXISDATA -- TODO confirm */
+#if ((NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM)&NPY_ITFLAG_IDENTPERM)
+    out_multi_index += ndim-1;
+    for(idim = 0; idim < ndim; ++idim, --out_multi_index,
+                                    NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        *out_multi_index = NAD_INDEX(axisdata);
+    }
+#elif !((NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM)&NPY_ITFLAG_NEGPERM)
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+    }
+#else
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        if (p < 0) {
+            /* If the perm entry is negative, reverse the index */
+            out_multi_index[ndim+p] = NAD_SHAPE(axisdata) - NAD_INDEX(axisdata) - 1;
+        }
+        else {
+            out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+        }
+    }
+#endif /* not ident perm */
+}
+
+#line 494
+/* Variant that NpyIter_GetGetMultiIndex returns for masked itflags == BUFFER; writes one out_multi_index entry per axis. */
+static void
+npyiter_get_multi_index_itflagsBUF(
+                        NpyIter *iter, npy_intp *out_multi_index)
+{
+    const npy_uint32 itflags = NPY_ITFLAG_BUFFER;
+    int idim, ndim = NIT_NDIM(iter);
+    int nop = NIT_NOP(iter);
+    npy_intp sizeof_axisdata;
+    NpyIter_AxisData *axisdata;
+#if !((NPY_ITFLAG_BUFFER)&NPY_ITFLAG_IDENTPERM)
+    npy_int8 *perm = NIT_PERM(iter);
+#endif
+    /* Neither IDENTPERM nor NEGPERM is in this flag set, so (assuming distinct flag bits) only the #elif loop is compiled: axis idim goes to slot ndim-perm[idim]-1. */
+    axisdata = NIT_AXISDATA(iter);
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);  /* presumably read by NIT_ADVANCE_AXISDATA -- TODO confirm */
+#if ((NPY_ITFLAG_BUFFER)&NPY_ITFLAG_IDENTPERM)
+    out_multi_index += ndim-1;
+    for(idim = 0; idim < ndim; ++idim, --out_multi_index,
+                                    NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        *out_multi_index = NAD_INDEX(axisdata);
+    }
+#elif !((NPY_ITFLAG_BUFFER)&NPY_ITFLAG_NEGPERM)
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+    }
+#else
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        if (p < 0) {
+            /* If the perm entry is negative, reverse the index */
+            out_multi_index[ndim+p] = NAD_SHAPE(axisdata) - NAD_INDEX(axisdata) - 1;
+        }
+        else {
+            out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+        }
+    }
+#endif /* not ident perm */
+}
+
+#line 494
+/* Variant that NpyIter_GetGetMultiIndex returns for masked itflags == HASINDEX|BUFFER; writes one out_multi_index entry per axis. */
+static void
+npyiter_get_multi_index_itflagsINDuBUF(
+                        NpyIter *iter, npy_intp *out_multi_index)
+{
+    const npy_uint32 itflags = NPY_ITFLAG_HASINDEX|NPY_ITFLAG_BUFFER;
+    int idim, ndim = NIT_NDIM(iter);
+    int nop = NIT_NOP(iter);
+    npy_intp sizeof_axisdata;
+    NpyIter_AxisData *axisdata;
+#if !((NPY_ITFLAG_HASINDEX|NPY_ITFLAG_BUFFER)&NPY_ITFLAG_IDENTPERM)
+    npy_int8 *perm = NIT_PERM(iter);
+#endif
+    /* Neither IDENTPERM nor NEGPERM is in this flag set, so (assuming distinct flag bits) only the #elif loop is compiled: axis idim goes to slot ndim-perm[idim]-1. */
+    axisdata = NIT_AXISDATA(iter);
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);  /* presumably read by NIT_ADVANCE_AXISDATA -- TODO confirm */
+#if ((NPY_ITFLAG_HASINDEX|NPY_ITFLAG_BUFFER)&NPY_ITFLAG_IDENTPERM)
+    out_multi_index += ndim-1;
+    for(idim = 0; idim < ndim; ++idim, --out_multi_index,
+                                    NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        *out_multi_index = NAD_INDEX(axisdata);
+    }
+#elif !((NPY_ITFLAG_HASINDEX|NPY_ITFLAG_BUFFER)&NPY_ITFLAG_NEGPERM)
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+    }
+#else
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        if (p < 0) {
+            /* If the perm entry is negative, reverse the index */
+            out_multi_index[ndim+p] = NAD_SHAPE(axisdata) - NAD_INDEX(axisdata) - 1;
+        }
+        else {
+            out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+        }
+    }
+#endif /* not ident perm */
+}
+
+#line 494
+/* Variant that NpyIter_GetGetMultiIndex returns for masked itflags == IDENTPERM|BUFFER; writes one out_multi_index entry per axis. */
+static void
+npyiter_get_multi_index_itflagsIDPuBUF(
+                        NpyIter *iter, npy_intp *out_multi_index)
+{
+    const npy_uint32 itflags = NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER;
+    int idim, ndim = NIT_NDIM(iter);
+    int nop = NIT_NOP(iter);
+    npy_intp sizeof_axisdata;
+    NpyIter_AxisData *axisdata;
+#if !((NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER)&NPY_ITFLAG_IDENTPERM)
+    npy_int8 *perm = NIT_PERM(iter);
+#endif
+    /* IDENTPERM is in this flag set, so perm is not declared and only the first loop is compiled: axis idim goes to slot ndim-1-idim. */
+    axisdata = NIT_AXISDATA(iter);
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);  /* presumably read by NIT_ADVANCE_AXISDATA -- TODO confirm */
+#if ((NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER)&NPY_ITFLAG_IDENTPERM)
+    out_multi_index += ndim-1;
+    for(idim = 0; idim < ndim; ++idim, --out_multi_index,
+                                    NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        *out_multi_index = NAD_INDEX(axisdata);
+    }
+#elif !((NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER)&NPY_ITFLAG_NEGPERM)
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+    }
+#else
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        if (p < 0) {
+            /* If the perm entry is negative, reverse the index */
+            out_multi_index[ndim+p] = NAD_SHAPE(axisdata) - NAD_INDEX(axisdata) - 1;
+        }
+        else {
+            out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+        }
+    }
+#endif /* not ident perm */
+}
+
+#line 494
+/* Variant that NpyIter_GetGetMultiIndex returns for masked itflags == HASINDEX|IDENTPERM|BUFFER; writes one out_multi_index entry per axis. */
+static void
+npyiter_get_multi_index_itflagsINDuIDPuBUF(
+                        NpyIter *iter, npy_intp *out_multi_index)
+{
+    const npy_uint32 itflags = NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER;
+    int idim, ndim = NIT_NDIM(iter);
+    int nop = NIT_NOP(iter);
+    npy_intp sizeof_axisdata;
+    NpyIter_AxisData *axisdata;
+#if !((NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER)&NPY_ITFLAG_IDENTPERM)
+    npy_int8 *perm = NIT_PERM(iter);
+#endif
+    /* IDENTPERM is in this flag set, so perm is not declared and only the first loop is compiled: axis idim goes to slot ndim-1-idim. */
+    axisdata = NIT_AXISDATA(iter);
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);  /* presumably read by NIT_ADVANCE_AXISDATA -- TODO confirm */
+#if ((NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER)&NPY_ITFLAG_IDENTPERM)
+    out_multi_index += ndim-1;
+    for(idim = 0; idim < ndim; ++idim, --out_multi_index,
+                                    NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        *out_multi_index = NAD_INDEX(axisdata);
+    }
+#elif !((NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER)&NPY_ITFLAG_NEGPERM)
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+    }
+#else
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        if (p < 0) {
+            /* If the perm entry is negative, reverse the index */
+            out_multi_index[ndim+p] = NAD_SHAPE(axisdata) - NAD_INDEX(axisdata) - 1;
+        }
+        else {
+            out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+        }
+    }
+#endif /* not ident perm */
+}
+
+#line 494
+/* Variant that NpyIter_GetGetMultiIndex returns for masked itflags == NEGPERM|BUFFER; writes one out_multi_index entry per axis. */
+static void
+npyiter_get_multi_index_itflagsNEGPuBUF(
+                        NpyIter *iter, npy_intp *out_multi_index)
+{
+    const npy_uint32 itflags = NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER;
+    int idim, ndim = NIT_NDIM(iter);
+    int nop = NIT_NOP(iter);
+    npy_intp sizeof_axisdata;
+    NpyIter_AxisData *axisdata;
+#if !((NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER)&NPY_ITFLAG_IDENTPERM)
+    npy_int8 *perm = NIT_PERM(iter);
+#endif
+    /* NEGPERM is in this flag set and IDENTPERM is not, so (assuming distinct flag bits) only the #else loop is compiled; for negative perm entries the stored value is shape - index - 1. */
+    axisdata = NIT_AXISDATA(iter);
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);  /* presumably read by NIT_ADVANCE_AXISDATA -- TODO confirm */
+#if ((NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER)&NPY_ITFLAG_IDENTPERM)
+    out_multi_index += ndim-1;
+    for(idim = 0; idim < ndim; ++idim, --out_multi_index,
+                                    NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        *out_multi_index = NAD_INDEX(axisdata);
+    }
+#elif !((NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER)&NPY_ITFLAG_NEGPERM)
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+    }
+#else
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        if (p < 0) {
+            /* If the perm entry is negative, reverse the index */
+            out_multi_index[ndim+p] = NAD_SHAPE(axisdata) - NAD_INDEX(axisdata) - 1;
+        }
+        else {
+            out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+        }
+    }
+#endif /* not ident perm */
+}
+
+#line 494
+/* Variant that NpyIter_GetGetMultiIndex returns for masked itflags == HASINDEX|NEGPERM|BUFFER; writes one out_multi_index entry per axis. */
+static void
+npyiter_get_multi_index_itflagsINDuNEGPuBUF(
+                        NpyIter *iter, npy_intp *out_multi_index)
+{
+    const npy_uint32 itflags = NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER;
+    int idim, ndim = NIT_NDIM(iter);
+    int nop = NIT_NOP(iter);
+    npy_intp sizeof_axisdata;
+    NpyIter_AxisData *axisdata;
+#if !((NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER)&NPY_ITFLAG_IDENTPERM)
+    npy_int8 *perm = NIT_PERM(iter);
+#endif
+    /* NEGPERM is in this flag set and IDENTPERM is not, so (assuming distinct flag bits) only the #else loop is compiled; for negative perm entries the stored value is shape - index - 1. */
+    axisdata = NIT_AXISDATA(iter);
+    sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);  /* presumably read by NIT_ADVANCE_AXISDATA -- TODO confirm */
+#if ((NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER)&NPY_ITFLAG_IDENTPERM)
+    out_multi_index += ndim-1;
+    for(idim = 0; idim < ndim; ++idim, --out_multi_index,
+                                    NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        *out_multi_index = NAD_INDEX(axisdata);
+    }
+#elif !((NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER)&NPY_ITFLAG_NEGPERM)
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+    }
+#else
+    for(idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
+        npy_int8 p = perm[idim];
+        if (p < 0) {
+            /* If the perm entry is negative, reverse the index */
+            out_multi_index[ndim+p] = NAD_SHAPE(axisdata) - NAD_INDEX(axisdata) - 1;
+        }
+        else {
+            out_multi_index[ndim-p-1] = NAD_INDEX(axisdata);
+        }
+    }
+#endif /* not ident perm */
+}
+
+
+/*NUMPY_API
+ * Compute a specialized get_multi_index function for the iterator
+ *
+ * If errmsg is non-NULL, it should point to a variable which will
+ * receive the error message, and no Python exception will be set.
+ * This is so that the function can be called from code not holding
+ * the GIL.
+ */
+NPY_NO_EXPORT NpyIter_GetMultiIndexFunc *
+NpyIter_GetGetMultiIndex(NpyIter *iter, char **errmsg)
+{
+    /*
+     * Hands back the get_multi_index specialization matching iter's flags,
+     * or NULL after reporting why through *errmsg or a Python ValueError.
+     */
+    npy_uint32 itflags = NIT_ITFLAGS(iter);
+    int ndim = NIT_NDIM(iter);
+    int nop = NIT_NOP(iter);
+    /* Stays NULL unless the iterator state rules out returning a function. */
+    char *reason = NULL;
+
+    /*
+     * A multi-index must be tracked, and the DELAYBUF flag must be clear;
+     * the first check that fails chooses the message.
+     */
+    if (!(itflags&NPY_ITFLAG_HASMULTIINDEX)) {
+        reason = "Cannot retrieve a GetMultiIndex function for an "
+                 "iterator that doesn't track a multi-index.";
+    }
+    else if (itflags&NPY_ITFLAG_DELAYBUF) {
+        reason = "Cannot retrieve a GetMultiIndex function for an "
+                 "iterator that used DELAY_BUFALLOC before a Reset call";
+    }
+    if (reason != NULL) {
+        /* A non-NULL errmsg receives the text and no exception is set. */
+        if (errmsg != NULL) {
+            *errmsg = reason;
+        }
+        else {
+            PyErr_SetString(PyExc_ValueError, reason);
+        }
+        return NULL;
+    }
+
+    /*
+     * Reduce itflags to the four bits the specializations are keyed on.
+     * No case below combines IDENTPERM with NEGPERM, so twelve variants
+     * cover every expected value.
+     */
+    itflags &= NPY_ITFLAG_HASINDEX | NPY_ITFLAG_IDENTPERM |
+               NPY_ITFLAG_NEGPERM | NPY_ITFLAG_BUFFER;
+
+    switch (itflags) {
+#line 608
+        case 0:
+            return npyiter_get_multi_index_itflags0;
+
+#line 608
+        case NPY_ITFLAG_HASINDEX:
+            return npyiter_get_multi_index_itflagsIND;
+
+#line 608
+        case NPY_ITFLAG_IDENTPERM:
+            return npyiter_get_multi_index_itflagsIDP;
+
+#line 608
+        case NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM:
+            return npyiter_get_multi_index_itflagsINDuIDP;
+
+#line 608
+        case NPY_ITFLAG_NEGPERM:
+            return npyiter_get_multi_index_itflagsNEGP;
+
+#line 608
+        case NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM:
+            return npyiter_get_multi_index_itflagsINDuNEGP;
+
+#line 608
+        case NPY_ITFLAG_BUFFER:
+            return npyiter_get_multi_index_itflagsBUF;
+
+#line 608
+        case NPY_ITFLAG_HASINDEX|NPY_ITFLAG_BUFFER:
+            return npyiter_get_multi_index_itflagsINDuBUF;
+
+#line 608
+        case NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER:
+            return npyiter_get_multi_index_itflagsIDPuBUF;
+
+#line 608
+        case NPY_ITFLAG_HASINDEX|NPY_ITFLAG_IDENTPERM|NPY_ITFLAG_BUFFER:
+            return npyiter_get_multi_index_itflagsINDuIDPuBUF;
+
+#line 608
+        case NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER:
+            return npyiter_get_multi_index_itflagsNEGPuBUF;
+
+#line 608
+        case NPY_ITFLAG_HASINDEX|NPY_ITFLAG_NEGPERM|NPY_ITFLAG_BUFFER:
+            return npyiter_get_multi_index_itflagsINDuNEGPuBUF;
+
+    }
+
+    /*
+     * Falling out of the switch means itflags held a combination with no
+     * generated specialization; report it as an internal error.
+     */
+    if (errmsg != NULL) {
+        *errmsg = "GetGetMultiIndex internal iterator error - unexpected "
+                  "itflags/ndim/nop combination";
+    }
+    else {
+        PyErr_Format(PyExc_ValueError,
+                "GetGetMultiIndex internal iterator error - unexpected "
+                "itflags/ndim/nop combination (%04x/%d/%d)",
+                (int)itflags, (int)ndim, (int)nop);
+    }
+    return NULL;
+}
+
+#undef NPY_ITERATOR_IMPLEMENTATION_CODE
+
diff --git a/numpy/core/src/_generated/npy_math_complex.c b/numpy/core/src/_generated/npy_math_complex.c
new file mode 100644
index 000000000000..d412131260ec
--- /dev/null
+++ b/numpy/core/src/_generated/npy_math_complex.c
@@ -0,0 +1,5773 @@
+#line 1 "numpy/core/src/npymath/npy_math_complex.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*
+ * vim: syntax=c
+ *
+ * Implement some C99-compatible complex math functions
+ *
+ * Most of the code is taken from the msun library in FreeBSD (HEAD @ 4th
+ * October 2013), under the following license:
+ *
+ * Copyright (c) 2007, 2011 David Schultz <das@FreeBSD.ORG>
+ * Copyright (c) 2012 Stephen Montgomery-Smith <stephen@FreeBSD.ORG>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include "npy_math_common.h"
+#include "npy_math_private.h"
+#include <numpy/utils.h>
+
+/*
+ * Hack inherited from BSD, the intent is to set the FPU inexact
+ * flag in an efficient way. The flag is IEEE specific. See
+ * https://github.com/freebsd/freebsd/blob/4c6378299/lib/msun/src/catrig.c#L42
+ */
+#if !defined(HAVE_CACOSF) || !defined(HAVE_CACOSL) || !defined(HAVE_CASINHF) || !defined(HAVE_CASINHL)
+#define raise_inexact() do {                        \
+    volatile npy_float NPY_UNUSED(junk) = 1 + tiny; \
+} while (0)
+/* Addend used by raise_inexact(). Both it and junk are volatile, presumably so
+ * the 1 + tiny sum is actually evaluated at run time -- TODO confirm. */
+static const volatile npy_float tiny = 3.9443045e-31f;
+#endif
+
+#line 63
+
+/*==========================================================
+ * Constants
+ *=========================================================*/
+static const npy_cfloat c_1f = {1.0F, 0.0};  /* presumably 1 + 0i, if the initializers are {real, imag} -- TODO confirm */
+
+/*==========================================================
+ * Helper functions
+ *
+ * These are necessary because we do not count on using a
+ * C99 compiler.
+ *=========================================================*/
+/* Complex product a*b in single precision, expanded component-wise. */
+static inline npy_cfloat
+cmulf(npy_cfloat a, npy_cfloat b)
+{
+    const npy_float a_re = npy_crealf(a), a_im = npy_cimagf(a);
+    const npy_float b_re = npy_crealf(b), b_im = npy_cimagf(b);
+    const npy_float re = a_re*b_re - a_im*b_im;
+    const npy_float im = a_re*b_im + a_im*b_re;
+
+    return npy_cpackf(re, im);
+}
+
+/* Complex quotient a/b in single precision; divides through by whichever component of b has the larger magnitude. */
+static inline
+npy_cfloat
+cdivf(npy_cfloat a, npy_cfloat b)
+{
+    npy_float ar, ai, br, bi, abs_br, abs_bi;
+    ar = npy_crealf(a);
+    ai = npy_cimagf(a);
+    br = npy_crealf(b);
+    bi = npy_cimagf(b);
+    abs_br = npy_fabsf(br);
+    abs_bi = npy_fabsf(bi);
+    if (abs_br >= abs_bi) {
+        if (abs_br == 0 && abs_bi == 0) {
+            /* divide by zeros should yield a complex inf or nan */
+            return npy_cpackf(ar/abs_br, ai/abs_bi);
+        }
+        else {
+            npy_float rat = bi/br;  /* ratio of the smaller to the larger component of b */
+            npy_float scl = 1.0F/(br+bi*rat);
+            return npy_cpackf((ar + ai*rat)*scl, (ai - ar*rat)*scl);
+        }
+    }
+    else {
+        npy_float rat = br/bi;  /* same scheme with the roles of br and bi swapped */
+        npy_float scl = 1.0F/(bi + br*rat);
+        return npy_cpackf((ar*rat + ai)*scl, (ai*rat - ar)*scl);
+    }
+}
+
+/*==========================================================
+ * Custom implementation of missing complex C99 functions
+ *=========================================================*/
+
+#ifndef HAVE_CABSF  /* fallback definition: modulus of z via hypot */
+npy_float npy_cabsf(npy_cfloat z)
+{
+    const npy_float re = npy_crealf(z), im = npy_cimagf(z);
+    return npy_hypotf(re, im);
+}
+#endif  /* !HAVE_CABSF */
+
+#ifndef HAVE_CARGF  /* fallback definition: argument (phase angle) of z via atan2 */
+npy_float npy_cargf(npy_cfloat z)
+{
+    const npy_float im = npy_cimagf(z), re = npy_crealf(z);
+    return npy_atan2f(im, re);
+}
+#endif  /* !HAVE_CARGF */
+
+/*
+ * cexp and (ccos, csin)h functions need to calculate exp scaled by another
+ * number.  This can be difficult if exp(x) overflows.  By doing this way, we
+ * don't risk overflowing exp. This likely raises floating-point exceptions,
+ * if we decide that we care.
+ *
+ * This is only useful over a limited range (see below) and expects that the
+ * input values are in this range.
+ *
+ * This is based on the technique used in FreeBSD's __frexp_exp and
+ * __ldexp_(c)exp functions by David Schultz.
+ *
+ * SCALED_CEXP_LOWER = log(FLT_MAX)
+ * SCALED_CEXP_UPPER = log(2) + log(FLT_MAX) - log(FLT_TRUE_MIN),
+ * where FLT_TRUE_MIN is the smallest possible subnormal number.
+ */
+
+#define SCALED_CEXP_LOWERF 88.722839f  /* F pair: float literals */
+#define SCALED_CEXP_UPPERF 192.69492f
+#define SCALED_CEXP_LOWER 710.47586007394386  /* unsuffixed pair: double literals */
+#define SCALED_CEXP_UPPER 1454.9159319953251
+#define SCALED_CEXP_LOWERL 11357.216553474703895L  /* L pair: long double literals */
+#define SCALED_CEXP_UPPERL 22756.021937783004509L
+
+#if !defined(HAVE_CSINHF) || \
+    !defined(HAVE_CCOSHF) || \
+    !defined(HAVE_CEXPF)
+/* Computes exp(x)*(cos(y) + i*sin(y))*2**expt without forming exp(x) itself (assuming npy_frexpf/npy_ldexpf behave like frexpf/ldexpf). */
+static
+npy_cfloat
+_npy_scaled_cexpf(npy_float x, npy_float y, npy_int expt)
+{
+#if 1 == 1
+    const npy_int k = 235;
+#endif
+#if 1 == 2
+    const npy_int k = 1799;
+#endif
+#if 1 == 3
+    const npy_int k = 19547;
+#endif
+    const npy_float kln2 = k * NPY_LOGE2f;
+    npy_float mant, mantcos, mantsin;
+    npy_int ex, excos, exsin;
+    /* Split exp(x - k*ln2), cos(y) and sin(y) each into a mantissa and a binary exponent. */
+    mant = npy_frexpf(npy_expf(x - kln2), &ex);
+    mantcos = npy_frexpf(npy_cosf(y), &excos);
+    mantsin = npy_frexpf(npy_sinf(y), &exsin);
+    /* Fold k and exp's exponent into expt, then rebuild each component with ldexp. */
+    expt += ex + k;
+    return npy_cpackf( npy_ldexpf(mant * mantcos, expt + excos),
+                         npy_ldexpf(mant * mantsin, expt + exsin));
+}
+
+#endif
+
+#ifndef HAVE_CEXPF
+/* Complex exponential of z, compiled only when HAVE_CEXPF is not defined; branches on whether the real part is finite, NaN or infinite. */
+npy_cfloat
+npy_cexpf(npy_cfloat z)
+{
+    npy_float x, c, s;
+    npy_float r, i;
+    npy_cfloat ret;
+
+    r = npy_crealf(z);
+    i = npy_cimagf(z);
+    /* Finite r: inside [SCALED_CEXP_LOWERF, SCALED_CEXP_UPPERF] the scaled helper is used instead of expf(r). */
+    if (npy_isfinite(r)) {
+        if (r >= SCALED_CEXP_LOWERF && r <= SCALED_CEXP_UPPERF) {
+            ret = _npy_scaled_cexpf(r, i, 0);
+        }
+        else {
+            x = npy_expf(r);
+
+            c = npy_cosf(i);
+            s = npy_sinf(i);
+
+            if (npy_isfinite(i)) {
+                ret = npy_cpackf(x * c, x * s);
+            }
+            else {
+                ret = npy_cpackf(NPY_NANF, npy_copysignf(NPY_NANF, i));
+            }
+        }
+
+    }
+    else  if (npy_isnan(r)) {
+        /* r is nan: z is returned unchanged when i == 0, otherwise the imaginary part is a NaN carrying i's sign */
+        if (i == 0) {
+            ret = z;
+        }
+        else {
+            ret = npy_cpackf(r, npy_copysignf(NPY_NANF, i));
+        }
+    }
+    else {
+        /* r is +- inf */
+        if (r > 0) {
+            if (i == 0) {
+                ret = npy_cpackf(r, i);
+            }
+            else if (npy_isfinite(i)) {
+                c = npy_cosf(i);
+                s = npy_sinf(i);
+
+                ret = npy_cpackf(r * c, r * s);
+            }
+            else {
+                /* r = +inf, i = +-inf or nan: flag invalid, result is (+inf, NaN) */
+                npy_set_floatstatus_invalid();
+                ret = npy_cpackf(r, NPY_NANF);
+            }
+        }
+        else {
+            if (npy_isfinite(i)) {
+                x = npy_expf(r);
+                c = npy_cosf(i);
+                s = npy_sinf(i);
+
+                ret = npy_cpackf(x * c, x * s);
+            }
+            else {
+                /* r = -inf, i = nan or +-inf: result is 0 + 0i */
+                ret = npy_cpackf(0, 0);
+            }
+        }
+    }
+
+    return ret;
+}
+#endif
+
+#ifndef HAVE_CLOGF
+/* Complex natural log of z (float); algorithm from cpython, rev. d86f5686cef9
+ *
+ * The usual formula for the real part is log(hypot(z.real, z.imag)).
+ * There are four situations where this formula is potentially
+ * problematic:
+ *
+ * (1) the absolute value of z is subnormal.  Then hypot is subnormal,
+ * so has fewer than the usual number of bits of accuracy, hence may
+ * have large relative error.  This then gives a large absolute error
+ * in the log.  This can be solved by rescaling z by a suitable power
+ * of 2.
+ *
+ * (2) the absolute value of z is greater than DBL_MAX (e.g. when both
+ * z.real and z.imag are within a factor of 1/sqrt(2) of DBL_MAX; this
+ * float variant tests against FLT_MAX/4).  Again, rescaling solves this.
+ *
+ * (3) the absolute value of z is close to 1.  In this case it's
+ * difficult to achieve good accuracy, at least in part because a
+ * change of 1ulp in the real or imaginary part of z can result in a
+ * change of billions of ulps in the correctly rounded answer.
+ *
+ * (4) z = 0.  The simplest thing to do here is to call the
+ * floating-point log with an argument of 0, and let its behaviour
+ * (returning -infinity, signaling a floating-point exception, setting
+ * errno, or whatever) determine that of c_log.  So the usual formula
+ * is fine here.
+*/
+npy_cfloat
+npy_clogf(npy_cfloat z)
+{
+    npy_float ax = npy_fabsf(npy_crealf(z));
+    npy_float ay = npy_fabsf(npy_cimagf(z));
+    npy_float rr, ri;
+
+    if (ax > FLT_MAX/4 || ay > FLT_MAX/4) {  /* case (2): halve both parts, then add log(2) back */
+        rr = npy_logf(npy_hypotf(ax/2, ay/2)) + NPY_LOGE2f;
+    }
+    else if (ax < FLT_MIN && ay < FLT_MIN) {  /* cases (1) and (4) */
+        if (ax > 0  || ay > 0) {
+            /* catch cases where hypot(ax, ay) is subnormal */
+            rr = npy_logf(npy_hypotf(npy_ldexpf(ax, FLT_MANT_DIG),
+                 npy_ldexpf(ay, FLT_MANT_DIG))) - FLT_MANT_DIG*NPY_LOGE2f;
+        }
+        else {
+            /* log(+/-0 +/- 0i) */
+            /* raise divide-by-zero floating point exception */
+            rr = -1.0f / npy_crealf(z);
+            rr = npy_copysignf(rr, -1);  /* force the sign negative */
+            ri = npy_cargf(z);
+            return npy_cpackf(rr, ri);
+        }
+    }
+    else {
+        npy_float h = npy_hypotf(ax, ay);
+        if (0.71 <= h && h <= 1.73) {  /* case (3): |z| near 1 */
+            npy_float am = ax > ay ? ax : ay; /* max(ax, ay) */
+            npy_float an = ax > ay ? ay : ax; /* min(ax, ay) */
+            rr = npy_log1pf((am-1)*(am+1)+an*an)/2;  /* log1p(|z|^2 - 1)/2 */
+        }
+        else {
+            rr = npy_logf(h);
+        }
+    }
+    ri = npy_cargf(z);
+
+    return npy_cpackf(rr, ri);
+}
+#endif
+
+#ifndef HAVE_CSQRTF
+
+/* We risk spurious overflow for components >= FLT_MAX / (1 + sqrt(2)). */
+#define THRESH  (FLT_MAX / (1 + NPY_SQRT2f))
+
+/*
+ * Fallback complex square root for npy_cfloat (used when HAVE_CSQRTF is not
+ * defined).
+ *
+ * Special values (zero, infinities, NaN real part) are handled first; the
+ * remaining inputs go through Algorithm 312 (CACM vol 10, Oct 1967).  When a
+ * component is at or above THRESH, both components are pre-scaled by 1/4 so
+ * that the intermediate a + hypot(a, b) cannot overflow; because
+ * sqrt(z / 4) == sqrt(z) / 2, BOTH parts of the result are multiplied by 2
+ * afterwards.
+ */
+npy_cfloat
+npy_csqrtf(npy_cfloat z)
+{
+    npy_cfloat result;
+    npy_float a, b;
+    npy_float t;
+    int scale;
+
+    a = npy_crealf(z);
+    b = npy_cimagf(z);
+
+    /* Handle special cases. */
+    if (a == 0 && b == 0) {
+        /* csqrt(+-0 + I*+-0) = +0 + I*(b), keeping the sign of b's zero */
+        return (npy_cpackf(0, b));
+    }
+    if (npy_isinf(b)) {
+        /* an infinite imaginary part wins even when a is NaN */
+        return (npy_cpackf(NPY_INFINITYF, b));
+    }
+    if (npy_isnan(a)) {
+        t = (b - b) / (b - b);  /* raise invalid if b is not a NaN */
+        return (npy_cpackf(a, t));    /* return NaN + NaN i */
+    }
+    if (npy_isinf(a)) {
+        /*
+         * csqrt(inf + NaN i)  = inf +  NaN i
+         * csqrt(inf + y i)    = inf +  0 i
+         * csqrt(-inf + NaN i) = NaN +- inf i
+         * csqrt(-inf + y i)   = 0   +  inf i
+         */
+        if (npy_signbit(a)) {
+            return (npy_cpackf(npy_fabsf(b - b), npy_copysignf(a, b)));
+        }
+        else {
+            return (npy_cpackf(a, npy_copysignf(b - b, b)));
+        }
+    }
+    /*
+     * The remaining special case (b is NaN) is handled just fine by
+     * the normal code path below.
+     */
+
+    /* Scale to avoid overflow. */
+    if (npy_fabsf(a) >= THRESH || npy_fabsf(b) >= THRESH) {
+        a *= 0.25;
+        b *= 0.25;
+        scale = 1;
+    }
+    else {
+        scale = 0;
+    }
+
+    /* Algorithm 312, CACM vol 10, Oct 1967. */
+    if (a >= 0) {
+        t = npy_sqrtf((a + npy_hypotf(a, b)) * 0.5f);
+        result = npy_cpackf(t, b / (2 * t));
+    }
+    else {
+        t = npy_sqrtf((-a + npy_hypotf(a, b)) * 0.5f);
+        result = npy_cpackf(npy_fabsf(b) / (2 * t), npy_copysignf(t, b));
+    }
+
+    /*
+     * Rescale.  The input was divided by 4, so the square root is half of
+     * the true value in both components; undo that for the real AND the
+     * imaginary part (previously only the real part was doubled).
+     */
+    if (scale) {
+        return (npy_cpackf(npy_crealf(result) * 2, npy_cimagf(result) * 2));
+    }
+    else {
+        return (result);
+    }
+}
+#undef THRESH
+#endif
+
+/*
+ * Always use this function because of the multiplication for small
+ * integer powers, but in the body use cpow if it is available.
+ */
+
+/* private function for use in npy_cpow{f, ,l} */
+#ifdef HAVE_CPOWF
+static npy_cfloat
+sys_cpowf(npy_cfloat x, npy_cfloat y)
+{
+    /*
+     * Thin wrapper around the C library's cpowf(): each operand is moved
+     * through the npy_z / c99_z members of __npy_cfloat_to_c99_cast to get
+     * from the NumPy complex type to the C99 one and back.
+     */
+    __npy_cfloat_to_c99_cast base, exponent, power;
+
+    base.npy_z = x;
+    exponent.npy_z = y;
+    power.c99_z = cpowf(base.c99_z, exponent.c99_z);
+
+    return power.npy_z;
+}
+#endif
+
+
+npy_cfloat
+npy_cpowf (npy_cfloat a, npy_cfloat b)
+{
+    npy_intp n;
+    npy_float ar = npy_crealf(a);
+    npy_float br = npy_crealf(b);
+    npy_float ai = npy_cimagf(a);
+    npy_float bi = npy_cimagf(b);
+    npy_cfloat r;
+
+    /*
+     * Complex power a^b for npy_cfloat.  The checks run in this order:
+     *   1. b == 0             -> 1 + 0i
+     *   2. a == 0             -> 0 + 0i when br > 0, otherwise NaN + NaN i
+     *   3. b is a real integer n with |n| < 100 -> repeated multiplication
+     *   4. everything else    -> cpowf() when HAVE_CPOWF is defined,
+     *                            otherwise cexp(b * clog(a))
+     *
+     * Checking if in a^b, if b is zero.
+     * If a is not zero then by definition of logarithm a^0 is 1.
+     * If a is also zero then 0^0 is best defined as 1.
+     */
+    if (br == 0. && bi == 0.) {
+        return npy_cpackf(1., 0.);
+    }
+    /* case 0^b
+     * If a is a complex zero (ai=ar=0), then the result depends
+     * upon values of br and bi. The result is either:
+     * 0 (in magnitude), undefined or 1.
+     * The latter case is for br=bi=0 and independent of ar and ai
+     * (but is handled above).
+     */
+    else if (ar == 0. && ai == 0.) {
+        /*
+         * If the real part of b is positive (br>0) then this is
+         * the zero complex with positive sign on both the
+         * real and imaginary part.
+         */
+         if (br > 0) {
+             return npy_cpackf(0., 0.);
+         }
+        /* Otherwise the test br > 0 failed: br is negative, br is zero
+         * with bi != 0 (b == 0 was handled above), or br is NaN.
+         * Here we return a complex nan
+         * and raise FloatingPointError: invalid value...
+         */
+         
+         /* Raise invalid value by calling inf - inf*/
+          volatile npy_float tmp = NPY_INFINITYF;
+          tmp -= NPY_INFINITYF;
+          ar = tmp; /* reads the volatile; ar is not used again on this path */
+          
+          r = npy_cpackf(NPY_NANF, NPY_NANF);
+          return r;
+    }
+    if (bi == 0 && (n=(npy_intp)br) == br) { /* b is real and integer-valued */
+        if (n == 1) {
+            /* unroll: handle inf better */
+            return npy_cpackf(ar, ai);
+        }
+        else if (n == 2) {
+            /* unroll: handle inf better */
+            return cmulf(a, a);
+        }
+        else if (n == 3) {
+            /* unroll: handle inf better */
+            return cmulf(a, cmulf(a, a));
+        }
+        else if (n > -100 && n < 100) {
+            /*
+             * Binary exponentiation on |n|: aa accumulates the product of
+             * the squarings p = a^(2^k) selected by the set bits of n.
+             * Integers with |n| >= 100 fall through to the general path.
+             */
+            npy_cfloat p, aa;
+            npy_intp mask = 1;
+            if (n < 0) {
+                n = -n;
+            }
+            aa = c_1f;
+            p = npy_cpackf(ar, ai);
+            while (1) {
+                if (n & mask) {
+                    aa = cmulf(aa,p);
+                }
+                mask <<= 1;
+                if (n < mask || mask <= 0) { /* no higher bits left, or mask is no longer positive */
+                    break;
+                }
+                p = cmulf(p,p);
+            }
+            r = npy_cpackf(npy_crealf(aa), npy_cimagf(aa));
+            if (br < 0) { /* negative exponent: return the reciprocal */
+                r = cdivf(c_1f, r);
+            }
+            return r;
+        }
+    }
+
+#ifdef HAVE_CPOWF
+    return sys_cpowf(a, b);
+
+#else
+    {
+        /* a^b = exp(b * log(a)); the product b * log(a) is expanded by hand */
+        npy_cfloat loga = npy_clogf(a);
+
+        ar = npy_crealf(loga);
+        ai = npy_cimagf(loga);
+        return npy_cexpf(npy_cpackf(ar*br - ai*bi, ar*bi + ai*br));
+    }
+
+#endif
+}
+
+
+#ifndef HAVE_CCOSF
+npy_cfloat
+npy_ccosf(npy_cfloat z)
+{
+    /*
+     * ccos(z) = ccosh(I * z), and for z = x + I*y the product I * z is
+     * -y + I*x.
+     */
+    const npy_float re = npy_crealf(z);
+    const npy_float im = npy_cimagf(z);
+    npy_cfloat rotated = npy_cpackf(-im, re);
+
+    return npy_ccoshf(rotated);
+}
+#endif
+
+#ifndef HAVE_CSINF
+npy_cfloat
+npy_csinf(npy_cfloat z)
+{
+    /*
+     * csin(z) = -I * csinh(I * z).
+     * I * (x + I*y) = -y + I*x, and -I * (u + I*v) = v - I*u.
+     */
+    npy_cfloat rotated = npy_cpackf(-npy_cimagf(z), npy_crealf(z));
+    npy_cfloat w = npy_csinhf(rotated);
+
+    return npy_cpackf(npy_cimagf(w), -npy_crealf(w));
+}
+#endif
+
+#ifndef HAVE_CTANF
+npy_cfloat
+npy_ctanf(npy_cfloat z)
+{
+    /*
+     * ctan(z) = -I * ctanh(I * z).
+     * I * (x + I*y) = -y + I*x, and -I * (u + I*v) = v - I*u.
+     */
+    npy_cfloat rotated = npy_cpackf(-npy_cimagf(z), npy_crealf(z));
+    npy_cfloat w = npy_ctanhf(rotated);
+
+    return npy_cpackf(npy_cimagf(w), -npy_crealf(w));
+}
+#endif
+
+#ifndef HAVE_CCOSHF
+/*
+ * Taken from the msun library in FreeBSD, rev 226599.
+ *
+ * Hyperbolic cosine of a complex argument z = x + i y.
+ *
+ * cosh(z) = cosh(x+iy)
+ *         = cosh(x) cos(y) + i sinh(x) sin(y).
+ *
+ * Exceptional values are noted in the comments within the source code.
+ * These values and the return value were taken from n1124.pdf.
+ *
+ * CCOSH_BIG is chosen such that
+ * spacing(0.5 * exp(CCOSH_BIG)) > 0.5*exp(-CCOSH_BIG)
+ * although the exact value assigned to CCOSH_BIG is not so important
+ *
+ * This is the single-precision fallback, compiled only when HAVE_CCOSHF is
+ * not defined.  Finite inputs are split by |x| into four ranges
+ * (< CCOSH_BIG, < SCALED_CEXP_LOWERF, < SCALED_CEXP_UPPERF, and above);
+ * non-finite inputs are handled case by case afterwards.  Some numeric
+ * thresholds quoted in the inline comments (22, 710, 1455) come from the
+ * double-precision original; the code compares against the named constants.
+ */
+npy_cfloat
+npy_ccoshf(npy_cfloat z)
+{
+#if 1 == 1 /* template-expansion selector: only this branch is compiled */
+    const npy_float CCOSH_BIG = 9.0f;
+    const npy_float CCOSH_HUGE = 1.70141183e+38f;
+#endif
+#if 1 == 2 /* never true here: double-precision constants */
+    const npy_double CCOSH_BIG = 22.0;
+    const npy_double CCOSH_HUGE = 8.9884656743115795e+307;
+#endif
+#if 1 >= 3 /* never true here: long double constants */
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    const npy_longdouble CCOSH_BIG = 22.0L;
+    const npy_longdouble CCOSH_HUGE = 8.9884656743115795e+307L;
+#else
+    const npy_longdouble CCOSH_BIG = 24.0L;
+    const npy_longdouble CCOSH_HUGE = 5.94865747678615882543e+4931L;
+#endif
+#endif
+
+    npy_float  x, y, h, absx;
+    npy_int xfinite, yfinite;
+
+    x = npy_crealf(z);
+    y = npy_cimagf(z);
+
+    xfinite = npy_isfinite(x);
+    yfinite = npy_isfinite(y);
+
+    /* Handle the nearly-non-exceptional cases where x and y are finite. */
+    if (xfinite && yfinite) {
+        if (y == 0) {
+            return npy_cpackf(npy_coshf(x), x * y); /* x * y is a zero carrying the product's sign */
+        }
+        absx = npy_fabsf(x);
+        if (absx < CCOSH_BIG) {
+            /* small x: normal case */
+            return npy_cpackf(npy_coshf(x) * npy_cosf(y),
+                                npy_sinhf(x) * npy_sinf(y));
+        }
+
+        /* |x| >= CCOSH_BIG (22 in the double original), so cosh(x) ~= exp(|x|) */
+        if (absx < SCALED_CEXP_LOWERF) {
+            /* |x| < SCALED_CEXP_LOWERF (710 for doubles): exp(|x|) won't overflow */
+            h = npy_expf(absx) * 0.5f;
+            return npy_cpackf(h * npy_cosf(y),
+                                npy_copysignf(h, x) * npy_sinf(y));
+        }
+        else if (absx < SCALED_CEXP_UPPERF) {
+            /* |x| < SCALED_CEXP_UPPERF (1455 for doubles): scale to avoid overflow */
+            z = _npy_scaled_cexpf(absx, y, -1);
+            return npy_cpackf(npy_crealf(z),
+                                npy_cimagf(z) * npy_copysignf(1, x));
+        }
+        else {
+            /* |x| >= SCALED_CEXP_UPPERF (1455 for doubles): the result always overflows */
+            h = CCOSH_HUGE * x;
+            return npy_cpackf(h * h * npy_cosf(y), h * npy_sinf(y));
+        }
+    }
+
+    /*
+     * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0.
+     * The sign of 0 in the result is unspecified.  Choice = normally
+     * the same as dNaN.  Raise the invalid floating-point exception.
+     *
+     * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0.
+     * The sign of 0 in the result is unspecified.  Choice = normally
+     * the same as d(NaN).
+     */
+    if (x == 0 && !yfinite) {
+        return npy_cpackf(y - y, npy_copysignf(0, x * (y - y)));
+    }
+
+    /*
+     * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0.
+     *
+     * cosh(NaN +- I 0)   = d(NaN) + I sign(d(NaN, +-0))0.
+     * The sign of 0 in the result is unspecified.
+     */
+    if (y == 0 && !xfinite) {
+        return npy_cpackf(x * x, npy_copysignf(0, x) * y);
+    }
+
+    /*
+     * cosh(x +- I Inf) = dNaN + I dNaN.
+     * Raise the invalid floating-point exception for finite nonzero x.
+     *
+     * cosh(x + I NaN) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero x.  Choice = don't raise (except for signaling NaNs).
+     */
+    if (xfinite && !yfinite) {
+        return npy_cpackf(y - y, x * (y - y));
+    }
+
+    /*
+     * cosh(+-Inf + I NaN)  = +Inf + I d(NaN).
+     *
+     * cosh(+-Inf +- I Inf) = +Inf + I dNaN.
+     * The sign of Inf in the result is unspecified.  Choice = always +.
+     * Raise the invalid floating-point exception.
+     *
+     * cosh(+-Inf + I y)   = +Inf cos(y) +- I Inf sin(y)
+     */
+    if (npy_isinf(x)) {
+        if (!yfinite) {
+            return npy_cpackf(x * x, x * (y - y));
+        }
+        return npy_cpackf((x * x) * npy_cosf(y), x * npy_sinf(y));
+    }
+
+    /*
+     * Only x = NaN with y != 0 reaches this point.
+     *
+     * cosh(NaN + I NaN)  = d(NaN) + I d(NaN).
+     *
+     * cosh(NaN +- I Inf) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception.
+     * Choice = raise.
+     *
+     * cosh(NaN + I y)    = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero y.  Choice = don't raise (except for signaling NaNs).
+     */
+    return npy_cpackf((x * x) * (y - y), (x + x) * (y - y));
+}
+#undef CCOSH_BIG
+#undef CCOSH_HUGE
+#endif
+
+#ifndef HAVE_CSINHF
+/*
+ * Taken from the msun library in FreeBSD, rev 226599.
+ *
+ * Hyperbolic sine of a complex argument z = x + i y.
+ *
+ * sinh(z) = sinh(x+iy)
+ *         = sinh(x) cos(y) + i cosh(x) sin(y).
+ *
+ * Exceptional values are noted in the comments within the source code.
+ * These values and the return value were taken from n1124.pdf.
+ */
+/*
+ * Single-precision fallback for csinh(z), z = x + I*y, used when HAVE_CSINHF
+ * is not defined.
+ *
+ * Finite inputs are split by |x| into four ranges:
+ *   |x| <  CSINH_BIG            direct sinh/cosh formula
+ *   |x| <  SCALED_CEXP_LOWERF   sinh(x) ~= sign(x) * exp(|x|) / 2
+ *   |x| <  SCALED_CEXP_UPPERF   scaled complex exponential
+ *   otherwise                   result overflows; computed from CSINH_HUGE * x
+ * All range tests use |x| so that negative x takes the same path as
+ * positive x (matching npy_ccoshf).  Non-finite inputs are handled case by
+ * case afterwards.
+ */
+npy_cfloat
+npy_csinhf(npy_cfloat z)
+{
+    const npy_float CSINH_BIG = 9.0f;
+    const npy_float CSINH_HUGE = 1.70141183e+38f;
+
+    npy_float x, y, h, absx;
+    npy_int xfinite, yfinite;
+
+    x = npy_crealf(z);
+    y = npy_cimagf(z);
+
+    xfinite = npy_isfinite(x);
+    yfinite = npy_isfinite(y);
+
+    /* Handle the nearly-non-exceptional cases where x and y are finite. */
+    if (xfinite && yfinite) {
+        if (y == 0) {
+            return npy_cpackf(npy_sinhf(x), y);
+        }
+        absx = npy_fabsf(x);
+        if (absx < CSINH_BIG) {
+            /* small x: normal case */
+            return npy_cpackf(npy_sinhf(x) * npy_cosf(y),
+                                npy_coshf(x) * npy_sinf(y));
+        }
+
+        /* |x| >= CSINH_BIG, so cosh(x) ~= exp(|x|) */
+        if (absx < SCALED_CEXP_LOWERF) {
+            /* |x| < SCALED_CEXP_LOWERF: exp(|x|) won't overflow */
+            h = npy_expf(npy_fabsf(x)) * 0.5f;
+            return npy_cpackf(npy_copysignf(h, x) * npy_cosf(y),
+                                h * npy_sinf(y));
+        }
+        else if (absx < SCALED_CEXP_UPPERF) {
+            /*
+             * |x| < SCALED_CEXP_UPPERF: scale to avoid overflow.
+             * The test must be on |x|: testing x itself would send every
+             * large negative x here instead of to the overflow branch.
+             */
+            z = _npy_scaled_cexpf(absx, y, -1);
+            return npy_cpackf(npy_crealf(z) * npy_copysignf(1, x),
+                                npy_cimagf(z));
+        }
+        else {
+            /* |x| >= SCALED_CEXP_UPPERF: the result always overflows */
+            h = CSINH_HUGE * x;
+            return npy_cpackf(h * npy_cosf(y), h * h * npy_sinf(y));
+        }
+    }
+
+    /*
+     * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.
+     * The sign of 0 in the result is unspecified.  Choice = normally
+     * the same as dNaN.  Raise the invalid floating-point exception.
+     *
+     * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN).
+     * The sign of 0 in the result is unspecified.  Choice = normally
+     * the same as d(NaN).
+     */
+    if (x == 0 && !yfinite) {
+        return npy_cpackf(npy_copysignf(0, x * (y - y)), y - y);
+    }
+
+    /*
+     * sinh(+-Inf +- I 0) = +-Inf + I +-0.
+     *
+     * sinh(NaN +- I 0)   = d(NaN) + I +-0.
+     */
+    if (y == 0 && !xfinite) {
+        if (npy_isnan(x)) {
+            return z;
+        }
+        return npy_cpackf(x, npy_copysignf(0, y));
+    }
+
+    /*
+     * sinh(x +- I Inf) = dNaN + I dNaN.
+     * Raise the invalid floating-point exception for finite nonzero x.
+     *
+     * sinh(x + I NaN) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero x.  Choice = don't raise (except for signaling NaNs).
+     */
+    if (xfinite && !yfinite) {
+        return npy_cpackf(y - y, x * (y - y));
+    }
+
+    /*
+     * sinh(+-Inf + I NaN)  = +-Inf + I d(NaN).
+     * The sign of Inf in the result is unspecified.  Choice = normally
+     * the same as d(NaN).
+     *
+     * sinh(+-Inf +- I Inf) = +Inf + I dNaN.
+     * The sign of Inf in the result is unspecified.  Choice = always +.
+     * Raise the invalid floating-point exception.
+     *
+     * sinh(+-Inf + I y)   = +-Inf cos(y) + I Inf sin(y)
+     */
+    if (!xfinite && !npy_isnan(x)) {
+        if (!yfinite) {
+            return npy_cpackf(x * x, x * (y - y));
+        }
+        return npy_cpackf(x * npy_cosf(y),
+                            NPY_INFINITYF * npy_sinf(y));
+    }
+
+    /*
+     * Only x = NaN with y != 0 reaches this point.
+     *
+     * sinh(NaN + I NaN)  = d(NaN) + I d(NaN).
+     *
+     * sinh(NaN +- I Inf) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception.
+     * Choice = raise.
+     *
+     * sinh(NaN + I y)    = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero y.  Choice = don't raise (except for signaling NaNs).
+     */
+    return npy_cpackf((x * x) * (y - y), (x + x) * (y - y));
+}
+#undef CSINH_BIG
+#undef CSINH_HUGE
+#endif
+
+#ifndef HAVE_CTANHF
+/*
+ * Taken from the msun library in FreeBSD, rev 226600.
+ *
+ * Hyperbolic tangent of a complex argument z = x + i y.
+ *
+ * The algorithm is from:
+ *
+ *   W. Kahan.  Branch Cuts for Complex Elementary Functions or Much
+ *   Ado About Nothing's Sign Bit.  In The State of the Art in
+ *   Numerical Analysis, pp. 165 ff.  Iserles and Powell, eds., 1987.
+ *
+ * Method:
+ *
+ *   Let t    = tan(y)
+ *       beta = 1/cos^2(y)
+ *       s    = sinh(x)
+ *       rho  = cosh(x)
+ *
+ *   We have:
+ *
+ *   tanh(z) = sinh(z) / cosh(z)
+ *
+ *             sinh(x) cos(y) + i cosh(x) sin(y)
+ *           = ---------------------------------
+ *             cosh(x) cos(y) + i sinh(x) sin(y)
+ *
+ *             cosh(x) sinh(x) / cos^2(y) + i tan(y)
+ *           = -------------------------------------
+ *                    1 + sinh^2(x) / cos^2(y)
+ *
+ *             beta rho s + i t
+ *           = ----------------
+ *               1 + beta s^2
+ *
+ * Modifications:
+ *
+ *   I omitted the original algorithm's handling of overflow in tan(x) after
+ *   verifying with nearpi.c that this can't happen in IEEE single or double
+ *   precision.  I also handle large x differently.
+ */
+
+#define TANH_HUGE 22.0
+#define TANHF_HUGE 11.0F
+#define TANHL_HUGE 42.0L
+
+npy_cfloat
+npy_ctanhf(npy_cfloat z)
+{
+    const npy_float re = npy_crealf(z);
+    const npy_float im = npy_cimagf(z);
+
+    /*
+     * NaN real part:
+     *   ctanh(NaN + i 0) = NaN + i 0
+     *   ctanh(NaN + i y) = NaN + i NaN        for y != 0
+     */
+    if (npy_isnan(re)) {
+        return npy_cpackf(re, (im == 0 ? im : re * im));
+    }
+
+    /*
+     * Infinite real part:
+     *   ctanh(+-Inf +- i Inf) = +-1 +- 0
+     *   ctanh(+-Inf + i y)    = +-1 + 0 sin(2y)        for y finite
+     *
+     * The imaginary part has the sign of x*sin(2*y), but there's no
+     * special effort to get this right.  The sign of the imaginary part is
+     * unspecified; the separate treatment of infinite y only avoids a
+     * spurious invalid exception.
+     */
+    if (!npy_isfinite(re)) {
+        npy_float sign_source;
+
+        if (npy_isinf(im)) {
+            sign_source = im;
+        }
+        else {
+            sign_source = npy_sinf(im) * npy_cosf(im);
+        }
+        return npy_cpackf(npy_copysignf(1, re),
+                          npy_copysignf(0, sign_source));
+    }
+
+    /*
+     * Finite real part, non-finite imaginary part:
+     *   ctanh(x + i NAN)  = NaN + i NaN
+     *   ctanh(x +- i Inf) = NaN + i NaN
+     */
+    if (!npy_isfinite(im)) {
+        const npy_float bad = im - im;
+
+        return (npy_cpackf(bad, bad));
+    }
+
+    /*
+     * ctanh(+-huge + i +-y) ~= +-1 +- i 2sin(2y)/exp(2x), using the
+     * approximation sinh^2(huge) ~= exp(2*huge) / 4.
+     * A modified formula is used to avoid spurious overflow.
+     */
+    if (npy_fabsf(re) >= TANHF_HUGE) {
+        const npy_float exp_neg_abs = npy_expf(-npy_fabsf(re));
+
+        return npy_cpackf(npy_copysignf(1, re),
+                          4 * npy_sinf(im) * npy_cosf(im) *
+                              exp_neg_abs * exp_neg_abs);
+    }
+
+    /* Kahan's algorithm for the ordinary range. */
+    {
+        const npy_float tan_y = npy_tanf(im);
+        const npy_float beta = 1 + tan_y * tan_y;       /* = 1 / cos^2(y) */
+        const npy_float sinh_x = npy_sinhf(re);
+        const npy_float rho = npy_sqrtf(1 + sinh_x * sinh_x);    /* = cosh(x) */
+        const npy_float denom = 1 + beta * sinh_x * sinh_x;
+
+        return (npy_cpackf((beta * rho * sinh_x) / denom, tan_y / denom));
+    }
+}
+#undef TANH_HUGE
+#undef TANHF_HUGE
+#undef TANHL_HUGE
+#endif
+
+#if !defined (HAVE_CACOSF) || !defined (HAVE_CASINHF)
+/*
+ * Complex inverse trig functions taken from the msun library in FreeBSD
+ * revision 251404
+ *
+ * The algorithm is very close to that in "Implementing the complex arcsine
+ * and arccosine functions using exception handling" by T. E. Hull, Thomas F.
+ * Fairgrieve, and Ping Tak Peter Tang, published in ACM Transactions on
+ * Mathematical Software, Volume 23 Issue 3, 1997, Pages 299-335,
+ * http://dl.acm.org/citation.cfm?id=275324.
+ *
+ * Throughout we use the convention z = x + I*y.
+ *
+ * casinh(z) = sign(x)*log(A+sqrt(A*A-1)) + I*asin(B)
+ * where
+ * A = (|z+I| + |z-I|) / 2
+ * B = (|z+I| - |z-I|) / 2 = y/A
+ *
+ * These formulas become numerically unstable:
+ *   (a) for Re(casinh(z)) when z is close to the line segment [-I, I] (that
+ *       is, Re(casinh(z)) is close to 0);
+ *   (b) for Im(casinh(z)) when z is close to either of the intervals
+ *       [I, I*infinity) or (-I*infinity, -I] (that is, |Im(casinh(z))| is
+ *       close to PI/2).
+ *
+ * These numerical problems are overcome by defining
+ * f(a, b) = (hypot(a, b) - b) / 2 = a*a / (hypot(a, b) + b) / 2
+ * Then if A < A_crossover, we use
+ *   log(A + sqrt(A*A-1)) = log1p((A-1) + sqrt((A-1)*(A+1)))
+ *   A-1 = f(x, 1+y) + f(x, 1-y)
+ * and if B > B_crossover, we use
+ *   asin(B) = atan2(y, sqrt(A*A - y*y)) = atan2(y, sqrt((A+y)*(A-y)))
+ *   A-y = f(x, y+1) + f(x, y-1)
+ * where without loss of generality we have assumed that x and y are
+ * non-negative.
+ *
+ * Much of the difficulty comes because the intermediate computations may
+ * produce overflows or underflows.  This is dealt with in the paper by Hull
+ * et al by using exception handling.  We do this by detecting when
+ * computations risk underflow or overflow.  The hardest part is handling the
+ * underflows when computing f(a, b).
+ *
+ * Note that the function f(a, b) does not appear explicitly in the paper by
+ * Hull et al, but the idea may be found on pages 308 and 309.  Introducing the
+ * function f(a, b) allows us to concentrate many of the clever tricks in this
+ * paper into one function.
+ */
+
+/*
+ * Function f(a, b, hypot_a_b) = (hypot(a, b) - b) / 2.
+ * Pass hypot(a, b) as the third argument.
+ */
+static inline npy_float
+_ff(npy_float a, npy_float b, npy_float hypot_a_b)
+{
+    /*
+     * Evaluates (hypot(a, b) - b) / 2, with the caller supplying
+     * hypot(a, b) as hypot_a_b.  The form used depends on b:
+     *   b <  0 : the subtraction is used as written;
+     *   b == 0 : the value returned is a / 2;
+     *   else   : the rewritten form a*a / (hypot + b) / 2 is used.
+     */
+    npy_float half_diff;
+
+    if (b < 0) {
+        half_diff = (hypot_a_b - b) / 2;
+    }
+    else if (b == 0) {
+        half_diff = a / 2;
+    }
+    else {
+        half_diff = a * a / (hypot_a_b + b) / 2;
+    }
+    return half_diff;
+}
+
+/*
+ * All the hard work is contained in this function.
+ * x and y are assumed positive or zero, and less than RECIP_EPSILON.
+ * Upon return:
+ * rx = Re(casinh(z)) = -Im(cacos(y + I*x)).
+ * B_is_usable is set to 1 if the value of B is usable.
+ * If B_is_usable is set to 0, sqrt_A2my2 = sqrt(A*A - y*y), and new_y = y.
+ * If returning sqrt_A2my2 has potential to result in an underflow, it is
+ * rescaled, and new_y is similarly rescaled.
+ *
+ * Outputs, as written by the code below:
+ *   *rx          always written.
+ *   *new_y       always written (y, or a rescaled y).
+ *   *B_is_usable always written (0 or 1).
+ *   *B           written only when y >= FOUR_SQRT_MIN; on the early-return
+ *                path for tiny y it is left untouched.
+ *   *sqrt_A2my2  written only on the paths that set *B_is_usable to 0.
+ *
+ * The inline comments that mention DBL_EPSILON were written for the
+ * double-precision original; this variant uses FLT_EPSILON in the code.
+ */
+static inline void
+_do_hard_workf(npy_float x, npy_float y, npy_float *rx,
+    npy_int *B_is_usable, npy_float *B, npy_float *sqrt_A2my2, npy_float *new_y)
+{
+#if 1 == 1 /* template-expansion selector: only this branch is compiled */
+    const npy_float A_crossover = 10.0f;
+    const npy_float B_crossover = 0.6417f;
+    const npy_float FOUR_SQRT_MIN = 4.3368086899420177e-19f;
+#endif
+#if 1 == 2 /* never true here: double-precision constants */
+    const npy_double A_crossover = 10.0;
+    const npy_double B_crossover = 0.6417;
+    const npy_double FOUR_SQRT_MIN = 5.9666725849601654e-154;
+#endif
+#if 1 == 3 /* never true here: long double constants */
+    const npy_longdouble A_crossover = 10.0l;
+    const npy_longdouble B_crossover = 0.6417l;
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    const npy_longdouble FOUR_SQRT_MIN = 5.9666725849601654e-154;
+#else
+    const npy_longdouble FOUR_SQRT_MIN = 7.3344154702193886625e-2466l;
+#endif
+#endif
+    npy_float R, S, A; /* A, B, R, and S are as in Hull et al. */
+    npy_float Am1, Amy; /* A-1, A-y. */
+
+    R = npy_hypotf(x, y + 1);        /* |z+I| */
+    S = npy_hypotf(x, y - 1);        /* |z-I| */
+
+    /* A = (|z+I| + |z-I|) / 2 */
+    A = (R + S) / 2;
+    /*
+     * Mathematically A >= 1.  There is a small chance that this will not
+     * be so because of rounding errors.  So we will make certain it is
+     * so.
+     */
+    if (A < 1) {
+        A = 1;
+    }
+
+    /* First half: compute *rx, choosing a formula by the size of A. */
+    if (A < A_crossover) {
+        /*
+         * Am1 = fp + fm, where fp = f(x, 1+y), and fm = f(x, 1-y).
+         * rx = log1p(Am1 + sqrt(Am1*(A+1)))
+         */
+        if (y == 1 && x < FLT_EPSILON * FLT_EPSILON / 128) {
+            /*
+             * fp is of order x^2, and fm = x/2.
+             * A = 1 (inexactly).
+             */
+            *rx = npy_sqrtf(x);
+        }
+        else if (x >= FLT_EPSILON * npy_fabsf(y - 1)) {
+            /*
+             * Underflow will not occur because
+             * x >= DBL_EPSILON^2/128 >= FOUR_SQRT_MIN
+             * (bound as stated for the double original)
+             */
+            Am1 = _ff(x, 1 + y, R) + _ff(x, 1 - y, S);
+            *rx = npy_log1pf(Am1 + npy_sqrtf(Am1 * (A + 1)));
+        }
+        else if (y < 1) {
+            /*
+             * fp = x*x/(1+y)/4, fm = x*x/(1-y)/4, and
+             * A = 1 (inexactly).
+             */
+            *rx = x / npy_sqrtf((1 - y) * (1 + y));
+        }
+        else {        /* if (y > 1) */
+            /*
+             * A-1 = y-1 (inexactly).
+             */
+            *rx = npy_log1pf((y - 1) + npy_sqrtf((y - 1) * (y + 1)));
+        }
+    }
+    else {
+        *rx = npy_logf(A + npy_sqrtf(A * A - 1)); /* A >= A_crossover: direct formula */
+    }
+
+    /* Second half: produce either B, or the pair (sqrt_A2my2, new_y). */
+    *new_y = y;
+
+    if (y < FOUR_SQRT_MIN) {
+        /*
+         * Avoid a possible underflow caused by y/A.  For casinh this
+         * would be legitimate, but will be picked up by invoking atan2
+         * later on.  For cacos this would not be legitimate.
+         *
+         * Both outputs are scaled by the same factor 2 / FLT_EPSILON.
+         */
+        *B_is_usable = 0;
+        *sqrt_A2my2 = A * (2 / FLT_EPSILON);
+        *new_y = y * (2 / FLT_EPSILON);
+        return;
+    }
+
+    /* B = (|z+I| - |z-I|) / 2 = y/A */
+    *B = y / A;
+    *B_is_usable = 1;
+
+    if (*B > B_crossover) {
+        *B_is_usable = 0; /* B too large to use directly: return sqrt(A*A - y*y) instead */
+        /*
+         * Amy = fp + fm, where fp = f(x, y+1), and fm = f(x, y-1).
+         * sqrt_A2my2 = sqrt(Amy*(A+y))
+         */
+        if (y == 1 && x < FLT_EPSILON / 128) {
+            /*
+             * fp is of order x^2, and fm = x/2.
+             * A = 1 (inexactly).
+             */
+            *sqrt_A2my2 = npy_sqrtf(x) * npy_sqrtf((A + y) / 2);
+        }
+        else if (x >= FLT_EPSILON * npy_fabsf(y - 1)) {
+            /*
+             * Underflow will not occur because
+             * x >= DBL_EPSILON/128 >= FOUR_SQRT_MIN
+             * and
+             * x >= DBL_EPSILON^2 >= FOUR_SQRT_MIN
+             * (bounds as stated for the double original)
+             */
+            Amy = _ff(x, y + 1, R) + _ff(x, y - 1, S);
+            *sqrt_A2my2 = npy_sqrtf(Amy * (A + y));
+        }
+        else if (y > 1) {
+            /*
+             * fp = x*x/(y+1)/4, fm = x*x/(y-1)/4, and
+             * A = y (inexactly).
+             *
+             * y < RECIP_EPSILON.  So the following
+             * scaling should avoid any underflow problems.
+             * Both outputs are scaled by the same factor
+             * 4 / FLT_EPSILON / FLT_EPSILON.
+             */
+            *sqrt_A2my2 = x * (4 / FLT_EPSILON / FLT_EPSILON) * y /
+                npy_sqrtf((y + 1) * (y - 1));
+            *new_y = y * (4 / FLT_EPSILON / FLT_EPSILON);
+        }
+        else {        /* if (y < 1) */
+            /*
+             * fm = 1-y >= DBL_EPSILON, fp is of order x^2, and
+             * A = 1 (inexactly).
+             */
+            *sqrt_A2my2 = npy_sqrtf((1 - y) * (1 + y));
+        }
+    }
+}
+
+/*
+ * Optimized version of clog() for |z| finite and larger than ~RECIP_EPSILON.
+ */
+static inline void
+_clog_for_large_valuesf(npy_float x, npy_float y,
+    npy_float *rr, npy_float *ri)
+{
+    /*
+     * Writes the real part of log(x + I*y) to *rr and atan2(y, x) to *ri.
+     * Three formulas are used for the real part, picked by the larger
+     * (big) and smaller (small) of |x| and |y|.
+     */
+    const npy_float QUARTER_SQRT_MAX = 4.611685743549481e+18f;
+    const npy_float SQRT_MIN = 1.0842021724855044e-19f;
+    const npy_float abs_x = npy_fabsf(x);
+    const npy_float abs_y = npy_fabsf(y);
+    npy_float big = abs_x;
+    npy_float small = abs_y;
+
+    if (abs_x < abs_y) {
+        big = abs_y;
+        small = abs_x;
+    }
+
+    if (big > FLT_MAX / 2) {
+        /*
+         * Avoid overflow in hypot() when x and y are both very large:
+         * divide both by E and add 1 to the logarithm.  This relies on E
+         * being larger than sqrt(2).  The accuracy loss is insignificant,
+         * but the method is unnecessarily slow.
+         */
+        *rr = npy_logf(npy_hypotf(x / NPY_Ef, y / NPY_Ef)) + 1;
+    }
+    else if (big > QUARTER_SQRT_MAX || small < SQRT_MIN) {
+        /*
+         * Squaring could overflow (large component) or underflow (small
+         * component), so go through hypot().
+         */
+        *rr = npy_logf(npy_hypotf(x, y));
+    }
+    else {
+        /* Squares are safe: log(sqrt(s)) == log(s) / 2. */
+        *rr = npy_logf(big * big + small * small) / 2;
+    }
+
+    *ri = npy_atan2f(y, x);
+}
+#endif
+
+#ifndef HAVE_CACOSF
+npy_cfloat
+npy_cacosf(npy_cfloat z)
+{
+    /* this is sqrt(6*EPS) */
+    const npy_float SQRT_6_EPSILON = 8.4572793338e-4f;
+    /* chosen such that pio2_hi + pio2_lo == pio2_hi but causes FE_INEXACT. */
+    const volatile npy_float pio2_lo = 7.5497899549e-9f;
+    const npy_float RECIP_EPSILON = 1.0f / FLT_EPSILON;
+    const npy_float pio2_hi = NPY_PI_2f;
+
+    const npy_float re = npy_crealf(z);
+    const npy_float im = npy_cimagf(z);
+    const npy_int re_sign = npy_signbit(re);
+    const npy_int im_sign = npy_signbit(im);
+    const npy_float abs_re = npy_fabsf(re);
+    const npy_float abs_im = npy_fabsf(im);
+
+    npy_float log_re, log_im, out_re, out_im, B, sqrt_A2mx2, new_x;
+    npy_int B_is_usable;
+
+    /* Any NaN component. */
+    if (npy_isnan(re) || npy_isnan(im)) {
+        if (npy_isinf(re)) {
+            /* cacos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */
+            return npy_cpackf(im + im, -NPY_INFINITYF);
+        }
+        if (npy_isinf(im)) {
+            /* cacos(NaN + I*+-Inf) = NaN + I*-+Inf */
+            return npy_cpackf(re + re, -im);
+        }
+        if (re == 0) {
+            /* cacos(0 + I*NaN) = PI/2 + I*NaN with inexact */
+            return npy_cpackf(pio2_hi + pio2_lo, im + im);
+        }
+        /*
+         * Every other NaN combination gives NaN + I*NaN.  C99 makes raising
+         * invalid optional when only one argument is NaN; it is not raised.
+         */
+        return npy_cpackf(NPY_NANF, NPY_NANF);
+    }
+
+    /* Large magnitude: use the dedicated logarithm helper. */
+    if (abs_re > RECIP_EPSILON || abs_im > RECIP_EPSILON) {
+        /* clog...() will raise inexact unless x or y is infinite. */
+        _clog_for_large_valuesf(re, im, &log_re, &log_im);
+        out_re = npy_fabsf(log_im);
+        out_im = log_re + NPY_LOGE2f;
+        if (im_sign == 0) {
+            out_im = -out_im;
+        }
+        return npy_cpackf(out_re, out_im);
+    }
+
+    /* Avoid spuriously raising inexact for z = 1. */
+    if (re == 1 && im == 0) {
+        return npy_cpackf(0, -im);
+    }
+
+    /* All remaining cases are inexact. */
+    raise_inexact();
+
+    /* Tiny argument. */
+    if (abs_re < SQRT_6_EPSILON / 4 && abs_im < SQRT_6_EPSILON / 4) {
+        return npy_cpackf(pio2_hi - (re - pio2_lo), -im);
+    }
+
+    /* General case; note the swapped (|im|, |re|) argument order. */
+    _do_hard_workf(abs_im, abs_re, &out_im, &B_is_usable, &B, &sqrt_A2mx2,
+                   &new_x);
+    if (B_is_usable) {
+        out_re = npy_acosf(re_sign == 0 ? B : -B);
+    }
+    else {
+        out_re = npy_atan2f(sqrt_A2mx2, re_sign == 0 ? new_x : -new_x);
+    }
+    if (im_sign == 0) {
+        out_im = -out_im;
+    }
+    return npy_cpackf(out_re, out_im);
+}
+#endif
+
+#ifndef HAVE_CASINF
+npy_cfloat
+npy_casinf(npy_cfloat z)
+{
+    /*
+     * Identity used: casin(z) = I * conj( casinh(I * conj(z)) ).
+     * Each "I * conj(.)" step is just a swap of the real and imaginary
+     * parts, so the argument is swapped going in and the result is swapped
+     * coming out.
+     */
+    const npy_cfloat swapped = npy_cpackf(npy_cimagf(z), npy_crealf(z));
+    const npy_cfloat w = npy_casinhf(swapped);
+
+    return npy_cpackf(npy_cimagf(w), npy_crealf(w));
+}
+#endif
+
+#ifndef HAVE_CATANF
+npy_cfloat
+npy_catanf(npy_cfloat z)
+{
+    /*
+     * Identity used: catan(z) = I * conj( catanh(I * conj(z)) ).
+     * Each "I * conj(.)" step is just a swap of the real and imaginary
+     * parts, so the argument is swapped going in and the result is swapped
+     * coming out.
+     */
+    const npy_cfloat swapped = npy_cpackf(npy_cimagf(z), npy_crealf(z));
+    const npy_cfloat w = npy_catanhf(swapped);
+
+    return npy_cpackf(npy_cimagf(w), npy_crealf(w));
+}
+#endif
+
+#ifndef HAVE_CACOSHF
+npy_cfloat
+npy_cacoshf(npy_cfloat z)
+{
+    /*
+     * cacosh(z) equals either I*cacos(z) or -I*cacos(z); the sign is the
+     * one that makes Re(cacosh(z)) >= 0.  The NaN combinations coming back
+     * from cacos are mapped explicitly before the generic formula is used.
+     */
+    const npy_cfloat acos_z = npy_cacosf(z);
+    const npy_float re = npy_crealf(acos_z);
+    const npy_float im = npy_cimagf(acos_z);
+
+    if (npy_isnan(re)) {
+        if (npy_isnan(im)) {
+            /* cacosh(NaN + I*NaN) = NaN + I*NaN */
+            return npy_cpackf(im, re);
+        }
+        /* cacosh(NaN + I*+-Inf) = +Inf + I*NaN */
+        /* cacosh(+-Inf + I*NaN) = +Inf + I*NaN */
+        return npy_cpackf(npy_fabsf(im), re);
+    }
+    if (npy_isnan(im)) {
+        /* cacosh(0 + I*NaN) = NaN + I*NaN */
+        return npy_cpackf(im, im);
+    }
+    /* Real part is forced non-negative; imaginary part follows sign(Im(z)). */
+    return npy_cpackf(npy_fabsf(im), npy_copysignf(re, npy_cimagf(z)));
+}
+#endif
+
+#ifndef HAVE_CASINHF
+/*
+ * casinh(z) = z + O(z^3)   as z -> 0
+ *
+ * casinh(z) = sign(x)*clog(sign(x)*z) + O(1/z^2)   as z -> infinity
+ * The above formula works for the imaginary part as well, because
+ * Im(casinh(z)) = sign(x)*atan2(sign(x)*y, fabs(x)) + O(y/z^3)
+ *    as z -> infinity, uniformly in y
+ */
+npy_cfloat
+npy_casinhf(npy_cfloat z)
+{
+    /*
+     * Single-precision complex inverse hyperbolic sine.
+     *
+     * Order of handling: NaN inputs, then very large components (logarithm
+     * based asymptotic form), then exact zero, then tiny arguments (where
+     * z itself is returned), and finally the general case through
+     * _do_hard_workf().  In the large and general paths the real part of
+     * the result takes the sign of Re(z) and the imaginary part the sign of
+     * Im(z).
+     *
+     * Only the "1 == 1" preprocessor branch below is active in this
+     * expansion; the other two carry the double / long double constants.
+     */
+#if 1 == 1
+    /* this is sqrt(6*EPS) */
+    const npy_float SQRT_6_EPSILON = 8.4572793338e-4f;
+#endif
+#if 1 == 2
+    const npy_double SQRT_6_EPSILON = 3.65002414998885671e-08;
+#endif
+#if 1 == 3
+    const npy_longdouble SQRT_6_EPSILON = 8.0654900873493277169e-10l;
+#endif
+    /* Components above this magnitude take the large-argument path. */
+    const npy_float RECIP_EPSILON = 1.0f / FLT_EPSILON;
+    npy_float x, y, ax, ay, wx, wy, rx, ry, B, sqrt_A2my2, new_y;
+    npy_int B_is_usable;
+
+    x = npy_crealf(z);
+    y = npy_cimagf(z);
+    ax = npy_fabsf(x);
+    ay = npy_fabsf(y);
+
+    if (npy_isnan(x) || npy_isnan(y)) {
+        /* casinh(+-Inf + I*NaN) = +-Inf + I*NaN */
+        if (npy_isinf(x)) {
+            return npy_cpackf(x, y + y);
+        }
+        /* casinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */
+        if (npy_isinf(y)) {
+            return npy_cpackf(y, x + x);
+        }
+        /* casinh(NaN + I*0) = NaN + I*0 */
+        if (y == 0) {
+            return npy_cpackf(x + x, y);
+        }
+        /*
+         * All other cases involving NaN return NaN + I*NaN.
+         * C99 leaves it optional whether to raise invalid if one of
+         * the arguments is not NaN, so we opt not to raise it.
+         */
+        return npy_cpackf(NPY_NANF, NPY_NANF);
+    }
+
+    if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
+        /* clog...() will raise inexact unless x or y is infinite. */
+        /*
+         * The helper is always called with a non-negative real part: z is
+         * passed as-is when x is non-negative and negated otherwise.  The
+         * signs are restored by the copysign calls in the return.
+         */
+        if (npy_signbit(x) == 0) {
+            _clog_for_large_valuesf(x, y, &wx, &wy);
+            wx += NPY_LOGE2f;
+        }
+        else {
+            _clog_for_large_valuesf(-x, -y, &wx, &wy);
+            wx += NPY_LOGE2f;
+        }
+        return npy_cpackf(npy_copysignf(wx, x), npy_copysignf(wy, y));
+    }
+
+    /* Avoid spuriously raising inexact for z = 0. */
+    if (x == 0 && y == 0) {
+        return (z);
+    }
+
+    /* All remaining cases are inexact. */
+    raise_inexact();
+
+    /* Tiny argument: casinh(z) = z + O(z^3), so z itself is returned. */
+    if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4) {
+        return (z);
+    }
+
+    /*
+     * General case.  The helper works on |x| and |y| and reports through
+     * B_is_usable which of its outputs should be used for the imaginary
+     * part: asin(B) when set, otherwise atan2(new_y, sqrt_A2my2).
+     */
+    _do_hard_workf(ax, ay, &rx, &B_is_usable, &B, &sqrt_A2my2, &new_y);
+    if (B_is_usable) {
+        ry = npy_asinf(B);
+    }
+    else {
+        ry = npy_atan2f(new_y, sqrt_A2my2);
+    }
+    return npy_cpackf(npy_copysignf(rx, x), npy_copysignf(ry, y));
+}
+#endif
+
+#ifndef HAVE_CATANHF
+/*
+ * sum_squares(x,y) = x*x + y*y (or just x*x if y*y would underflow).
+ * Assumes x*x and y*y will not overflow.
+ * Assumes x and y are finite.
+ * Assumes y is non-negative.
+ * Assumes fabs(x) >= DBL_EPSILON.
+ */
+static inline npy_float
+_sum_squaresf(npy_float x, npy_float y)
+{
+    /*
+     * SQRT_MIN is the cut-off under which the y*y term is left out.  One
+     * definition per precision; only the "1 == 1" branch is active here.
+     */
+#if 1 == 1
+    const npy_float SQRT_MIN = 1.0842022e-19f;
+#endif
+#if 1 == 2
+    const npy_double SQRT_MIN = 1.4916681462400413e-154; /* sqrt(DBL_MIN) */
+#endif
+#if 1 == 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    const npy_longdouble SQRT_MIN = 1.4916681462400413e-154; /* sqrt(DBL_MIN) */
+#else
+    /* this is correct for 80 bit long doubles */
+    const npy_longdouble SQRT_MIN = 1.8336038675548471656e-2466l;
+#endif
+#endif
+
+    /* For small y return x*x alone so that y*y cannot underflow. */
+    return (y < SQRT_MIN) ? (x * x) : (x * x + y * y);
+}
+
+/*
+ * real_part_reciprocal(x, y) = Re(1/(x+I*y)) = x/(x*x + y*y).
+ * Assumes x and y are not NaN, and one of x and y is larger than
+ * RECIP_EPSILON.  We avoid unwarranted underflow.  It is important to not use
+ * the code creal(1/z), because the imaginary part may produce an unwanted
+ * underflow.
+ * This is only called in a context where inexact is always raised before
+ * the call, so no effort is made to avoid or force inexact.
+ */
+#if 1 == 1
+#define BIAS (FLT_MAX_EXP - 1)
+#define CUTOFF (FLT_MANT_DIG / 2 + 1)
+static inline npy_float
+_real_part_reciprocalf(npy_float x, npy_float y)
+{
+    /*
+     * Evaluates x / (x*x + y*y).  The branch is chosen by comparing the
+     * exponent fields of x and y so that a negligible term is dropped, or
+     * the operands are rescaled, instead of forming the squares directly.
+     */
+    npy_float scale;
+    npy_uint32 hx, hy;
+    npy_int32 ix, iy;
+
+    /*
+     * ix / iy: the words fetched for x and y masked with 0x7f800000,
+     * i.e. the exponent bits only (sign and mantissa cleared).
+     */
+    GET_FLOAT_WORD(hx, x);
+    ix = hx & 0x7f800000;
+    GET_FLOAT_WORD(hy, y);
+    iy = hy & 0x7f800000;
+    /* x's exponent exceeds y's by at least CUTOFF, or x is infinite. */
+    if (ix - iy >= CUTOFF << 23 || npy_isinf(x)) {
+        return (1 / x);
+    }
+    /* y's exponent exceeds x's by at least CUTOFF: the x*x term is dropped. */
+    if (iy - ix >= CUTOFF << 23) {
+        return (x / y / y);
+    }
+    /* Exponent of x is at or below the threshold: use the plain formula. */
+    if (ix <= (BIAS + FLT_MAX_EXP / 2 - CUTOFF) << 23) {
+        return (x / (x * x + y * y));
+    }
+    /*
+     * Otherwise build a scale factor from x's exponent bits, apply it to
+     * both operands, and apply it once more to the quotient.
+     */
+    SET_FLOAT_WORD(scale, 0x7f800000 - ix);
+    x *= scale;
+    y *= scale;
+    return (x / (x * x + y * y) * scale);
+}
+#undef BIAS
+#undef CUTOFF
+#endif
+
+#if 1 == 2
+#define BIAS (DBL_MAX_EXP - 1)
+/*  more guard digits are useful iff there is extra precision. */
+#define CUTOFF (DBL_MANT_DIG / 2 + 1)  /* just half or 1 guard digit */
+static inline npy_double
+_real_part_reciprocal(npy_double x, npy_double y)
+{
+    /*
+     * Double-precision variant of x / (x*x + y*y).  It sits under
+     * "#if 1 == 2" and is therefore not compiled in this expansion.
+     * The branch is selected from the exponent bits in the high words
+     * of x and y.
+     */
+    npy_double scale;
+    npy_uint32 hx, hy;
+    npy_int32 ix, iy;
+
+    /*
+     * This code is inspired by the C99 document n1124.pdf, Section G.5.1,
+     * example 2.
+     */
+    /* ix / iy: high words masked with 0x7ff00000 (exponent bits only). */
+    GET_HIGH_WORD(hx, x);
+    ix = hx & 0x7ff00000;
+    GET_HIGH_WORD(hy, y);
+    iy = hy & 0x7ff00000;
+    if (ix - iy >= CUTOFF << 20 || npy_isinf(x)) {
+        /* +-Inf -> +-0 is special */
+        return (1 / x);
+    }
+    if (iy - ix >= CUTOFF << 20) {
+        /* should avoid double div, but hard */
+        return (x / y / y);
+    }
+    /* Exponent of x is at or below the threshold: use the plain formula. */
+    if (ix <= (BIAS + DBL_MAX_EXP / 2 - CUTOFF) << 20) {
+        return (x / (x * x + y * y));
+    }
+    /* Rescale both operands, then apply the scale again to the quotient. */
+    scale = 1;
+    SET_HIGH_WORD(scale, 0x7ff00000 - ix);  /* 2**(1-ilogb(x)) */
+    x *= scale;
+    y *= scale;
+    return (x / (x * x + y * y) * scale);
+}
+#undef BIAS
+#undef CUTOFF
+#endif
+
+#if 1 == 3
+#if !defined(HAVE_LDOUBLE_DOUBLE_DOUBLE_BE) && \
+    !defined(HAVE_LDOUBLE_DOUBLE_DOUBLE_LE)
+
+#define BIAS (LDBL_MAX_EXP - 1)
+#define CUTOFF (LDBL_MANT_DIG / 2 + 1)
+static inline npy_longdouble
+_real_part_reciprocall(npy_longdouble x,
+    npy_longdouble y)
+{
+    /*
+     * Long double variant of x / (x*x + y*y) for formats other than
+     * double-double.  It sits under "#if 1 == 3" and is therefore not
+     * compiled in this expansion.  The branch is selected by comparing the
+     * exponents extracted with GET_LDOUBLE_EXP.
+     */
+    npy_longdouble scale;
+    union IEEEl2bitsrep ux, uy, us;
+    npy_int32 ix, iy;
+
+    ux.e = x;
+    ix = GET_LDOUBLE_EXP(ux);
+    uy.e = y;
+    iy = GET_LDOUBLE_EXP(uy);
+    /* x's exponent exceeds y's by at least CUTOFF, or x is infinite. */
+    if (ix - iy >= CUTOFF || npy_isinf(x)) {
+        return (1/x);
+    }
+    /* y's exponent exceeds x's by at least CUTOFF: the x*x term is dropped. */
+    if (iy - ix >= CUTOFF) {
+        return (x/y/y);
+    }
+    /* Exponent of x is at or below the threshold: use the plain formula. */
+    if (ix <= BIAS + LDBL_MAX_EXP / 2 - CUTOFF) {
+        return (x/(x*x + y*y));
+    }
+    /*
+     * Build a scale factor by overwriting the exponent of 1.0, apply it to
+     * both operands, and apply it once more to the quotient.
+     */
+    us.e = 1;
+    SET_LDOUBLE_EXP(us, 0x7fff - ix);
+    scale = us.e;
+    x *= scale;
+    y *= scale;
+    return (x/(x*x + y*y) * scale);
+}
+#undef BIAS
+#undef CUTOFF
+
+#else
+
+static inline npy_longdouble
+_real_part_reciprocall(npy_longdouble x, npy_longdouble y)
+{
+    /*
+     * Fallback used for the double-double long double layouts: the formula
+     * Re(1/(x + I*y)) is evaluated directly, without exponent inspection
+     * or rescaling.
+     */
+    return (x/(x*x + y*y));
+}
+
+#endif
+#endif
+
+npy_cfloat
+npy_catanhf(npy_cfloat z)
+{
+    /*
+     * Single-precision complex inverse hyperbolic tangent.
+     *
+     * Order of handling: purely real input with |x| <= 1 (delegated to
+     * atanhf), purely imaginary input (delegated to atanf), NaN inputs,
+     * very large components, tiny arguments (z itself is returned), and
+     * finally the general log1p / atan2 formulas.  In the general path the
+     * result's real part takes the sign of Re(z) and its imaginary part
+     * the sign of Im(z).
+     *
+     * Only the "1 == 1" preprocessor branch below is active in this
+     * expansion; the other two carry the double / long double constants.
+     */
+#if 1 == 1
+    /* this is sqrt(3*EPS) */
+    const npy_float SQRT_3_EPSILON = 5.9801995673e-4f;
+    /* chosen such that pio2_hi + pio2_lo == pio2_hi but causes FE_INEXACT. */
+    const volatile npy_float pio2_lo = 7.5497899549e-9f;
+#endif
+#if 1 == 2
+    const npy_double SQRT_3_EPSILON = 2.5809568279517849e-8;
+    const volatile npy_double pio2_lo = 6.1232339957367659e-17;
+#endif
+#if 1 == 3
+    const npy_longdouble SQRT_3_EPSILON = 5.70316273435758915310e-10l;
+    const volatile npy_longdouble pio2_lo = 2.710505431213761085e-20l;
+#endif
+    /* Components above this magnitude take the large-argument path. */
+    const npy_float RECIP_EPSILON = 1.0f / FLT_EPSILON;
+    const npy_float pio2_hi = NPY_PI_2f;
+    npy_float x, y, ax, ay, rx, ry;
+
+    x = npy_crealf(z);
+    y = npy_cimagf(z);
+    ax = npy_fabsf(x);
+    ay = npy_fabsf(y);
+
+    /* This helps handle many cases. */
+    if (y == 0 && ax <= 1) {
+        return npy_cpackf(npy_atanhf(x), y);
+    }
+
+    /* To ensure the same accuracy as atan(), and to filter out z = 0. */
+    if (x == 0) {
+        return npy_cpackf(x, npy_atanf(y));
+    }
+
+    if (npy_isnan(x) || npy_isnan(y)) {
+        /* catanh(+-Inf + I*NaN) = +-0 + I*NaN */
+        if (npy_isinf(x)) {
+            return npy_cpackf(npy_copysignf(0, x), y + y);
+        }
+        /* catanh(NaN + I*+-Inf) = sign(NaN)0 + I*+-PI/2 */
+        if (npy_isinf(y)) {
+            return npy_cpackf(npy_copysignf(0, x),
+                npy_copysignf(pio2_hi + pio2_lo, y));
+        }
+        /*
+         * All other cases involving NaN return NaN + I*NaN.
+         * C99 leaves it optional whether to raise invalid if one of
+         * the arguments is not NaN, so we opt not to raise it.
+         */
+        return npy_cpackf(NPY_NANF, NPY_NANF);
+    }
+
+    /*
+     * Large argument: the real part is Re(1/z) and the imaginary part is
+     * +-PI/2 with the sign of y.
+     */
+    if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
+        return npy_cpackf(_real_part_reciprocalf(x, y),
+            npy_copysignf(pio2_hi + pio2_lo, y));
+    }
+
+    if (ax < SQRT_3_EPSILON / 2 && ay < SQRT_3_EPSILON / 2) {
+        /*
+         * z = 0 was filtered out above.  All other cases must raise
+         * inexact, but this is the only one that needs to do it
+         * explicitly.
+         */
+        raise_inexact();
+        return (z);
+    }
+
+    /*
+     * Real part.  |x| == 1 with a very small |y| uses a dedicated
+     * logarithm form; everything else goes through log1p.
+     */
+    if (ax == 1 && ay < FLT_EPSILON) {
+        rx = (NPY_LOGE2f - npy_logf(ay)) / 2;
+    }
+    else {
+        rx = npy_log1pf(4 * ax / _sum_squaresf(ax - 1, ay)) / 4;
+    }
+
+    /*
+     * Imaginary part.  The ay*ay term is left out of the atan2 denominator
+     * when ay is below FLT_EPSILON; |x| == 1 has its own form.
+     */
+    if (ax == 1) {
+        ry = npy_atan2f(2, -ay) / 2;
+    }
+    else if (ay < FLT_EPSILON) {
+        ry = npy_atan2f(2 * ay, (1 - ax) * (1 + ax)) / 2;
+    }
+    else {
+        ry = npy_atan2f(2 * ay, (1 - ax) * (1 + ax) - ay * ay) / 2;
+    }
+
+    /* rx and ry were computed from |x| and |y|; restore the input signs. */
+    return npy_cpackf(npy_copysignf(rx, x), npy_copysignf(ry, y));
+}
+#endif
+
+#line 63
+
+/*==========================================================
+ * Constants
+ *=========================================================*/
+/* The complex value 1 + 0i: multiplication seed and reciprocal numerator in npy_cpow. */
+static const npy_cdouble c_1 = {1.0, 0.0};
+
+/*==========================================================
+ * Helper functions
+ *
+ * These are necessary because we do not count on using a
+ * C99 compiler.
+ *=========================================================*/
+static inline
+npy_cdouble
+cmul(npy_cdouble a, npy_cdouble b)
+{
+    /* Plain complex product: (ar + I*ai) * (br + I*bi). */
+    const npy_double a_re = npy_creal(a), a_im = npy_cimag(a);
+    const npy_double b_re = npy_creal(b), b_im = npy_cimag(b);
+
+    return npy_cpack(a_re*b_re - a_im*b_im, a_re*b_im + a_im*b_re);
+}
+
+static inline
+npy_cdouble
+cdiv(npy_cdouble a, npy_cdouble b)
+{
+    /*
+     * Complex quotient a / b.  The division is organised around whichever
+     * component of b has the larger magnitude, so that the ratio of the
+     * two components never exceeds 1 in absolute value.
+     */
+    const npy_double a_re = npy_creal(a), a_im = npy_cimag(a);
+    const npy_double b_re = npy_creal(b), b_im = npy_cimag(b);
+    const npy_double mag_re = npy_fabs(b_re);
+    const npy_double mag_im = npy_fabs(b_im);
+
+    if (mag_re == 0 && mag_im == 0) {
+        /* divide by zeros should yield a complex inf or nan */
+        return npy_cpack(a_re/mag_re, a_im/mag_im);
+    }
+
+    if (mag_re >= mag_im) {
+        /* Real component of b dominates. */
+        const npy_double ratio = b_im/b_re;
+        const npy_double inv = 1.0/(b_re+b_im*ratio);
+        return npy_cpack((a_re + a_im*ratio)*inv, (a_im - a_re*ratio)*inv);
+    }
+    else {
+        /* Imaginary component of b dominates (also reached when a NaN
+         * makes the comparison above false). */
+        const npy_double ratio = b_re/b_im;
+        const npy_double inv = 1.0/(b_im + b_re*ratio);
+        return npy_cpack((a_re*ratio + a_im)*inv, (a_im*ratio - a_re)*inv);
+    }
+}
+
+/*==========================================================
+ * Custom implementation of missing complex C99 functions
+ *=========================================================*/
+
+#ifndef HAVE_CABS
+npy_double
+npy_cabs(npy_cdouble z)
+{
+    /* Modulus of z, obtained as hypot(Re(z), Im(z)). */
+    const npy_double re = npy_creal(z);
+    const npy_double im = npy_cimag(z);
+
+    return npy_hypot(re, im);
+}
+#endif
+
+#ifndef HAVE_CARG
+npy_double
+npy_carg(npy_cdouble z)
+{
+    /* Argument (phase angle) of z, obtained as atan2(Im(z), Re(z)). */
+    const npy_double im = npy_cimag(z);
+    const npy_double re = npy_creal(z);
+
+    return npy_atan2(im, re);
+}
+#endif
+
+/*
+ * cexp and (ccos, csin)h functions need to calculate exp scaled by another
+ * number.  This can be difficult if exp(x) overflows.  By doing this way, we
+ * don't risk overflowing exp. This likely raises floating-point exceptions,
+ * if we decide that we care.
+ *
+ * This is only useful over a limited range (see below) and expects that the
+ * input values are in this range.
+ *
+ * This is based on the technique used in FreeBSD's __frexp_exp and
+ * __ldexp_(c)exp functions by David Schultz.
+ *
+ * SCALED_CEXP_LOWER = log(FLT_MAX)
+ * SCALED_CEXP_UPPER = log(2) + log(FLT_MAX) - log(FLT_TRUE_MIN),
+ * where FLT_TRUE_MIN is the smallest possible subnormal number.
+ */
+
+/*
+ * Lower/upper bounds of the range in which the scaled-exp helper is used.
+ * The F-suffixed pair are float literals, the unsuffixed pair double
+ * literals, and the L-suffixed pair long double literals.
+ */
+#define SCALED_CEXP_LOWERF 88.722839f
+#define SCALED_CEXP_UPPERF 192.69492f
+#define SCALED_CEXP_LOWER 710.47586007394386
+#define SCALED_CEXP_UPPER 1454.9159319953251
+#define SCALED_CEXP_LOWERL 11357.216553474703895L
+#define SCALED_CEXP_UPPERL 22756.021937783004509L
+
+#if !defined(HAVE_CSINH) || \
+    !defined(HAVE_CCOSH) || \
+    !defined(HAVE_CEXP)
+
+/*
+ * Computes exp(x) * (cos(y) + I*sin(y)) * 2**expt without evaluating exp(x)
+ * itself: x is reduced by k*ln(2) before the call to exp, and the factor
+ * 2**k is folded back in through the exponent passed to ldexp.
+ */
+static
+npy_cdouble
+_npy_scaled_cexp(npy_double x, npy_double y, npy_int expt)
+{
+    /* k is precision specific; only the "2 == 2" branch is active here. */
+#if 2 == 1
+    const npy_int k = 235;
+#endif
+#if 2 == 2
+    const npy_int k = 1799;
+#endif
+#if 2 == 3
+    const npy_int k = 19547;
+#endif
+    const npy_double kln2 = k * NPY_LOGE2;
+    npy_double mant, mantcos, mantsin;
+    npy_int ex, excos, exsin;
+
+    /* Split each factor into a mantissa and a binary exponent. */
+    mant = npy_frexp(npy_exp(x - kln2), &ex);
+    mantcos = npy_frexp(npy_cos(y), &excos);
+    mantsin = npy_frexp(npy_sin(y), &exsin);
+
+    /* Multiply the mantissas; add the exponents (plus k and expt) in ldexp. */
+    expt += ex + k;
+    return npy_cpack( npy_ldexp(mant * mantcos, expt + excos),
+                         npy_ldexp(mant * mantsin, expt + exsin));
+}
+
+#endif
+
+#ifndef HAVE_CEXP
+
+/*
+ * Complex exponential: exp(r + I*i) = exp(r) * (cos(i) + I*sin(i)).
+ * Finite, NaN and infinite real parts are handled in three separate
+ * branches; see the comments inside for the special values returned.
+ */
+npy_cdouble
+npy_cexp(npy_cdouble z)
+{
+    npy_double x, c, s;
+    npy_double r, i;
+    npy_cdouble ret;
+
+    r = npy_creal(z);
+    i = npy_cimag(z);
+
+    if (npy_isfinite(r)) {
+        /* Real part in the scaled range: delegate to the scaled helper. */
+        if (r >= SCALED_CEXP_LOWER && r <= SCALED_CEXP_UPPER) {
+            ret = _npy_scaled_cexp(r, i, 0);
+        }
+        else {
+            x = npy_exp(r);
+
+            c = npy_cos(i);
+            s = npy_sin(i);
+
+            if (npy_isfinite(i)) {
+                ret = npy_cpack(x * c, x * s);
+            }
+            else {
+                /* finite r, non-finite i: NaN + I*NaN (sign taken from i) */
+                ret = npy_cpack(NPY_NAN, npy_copysign(NPY_NAN, i));
+            }
+        }
+
+    }
+    else  if (npy_isnan(r)) {
+        /* r is nan */
+        if (i == 0) {
+            /* cexp(NaN + I*0) returns the input unchanged */
+            ret = z;
+        }
+        else {
+            ret = npy_cpack(r, npy_copysign(NPY_NAN, i));
+        }
+    }
+    else {
+        /* r is +- inf */
+        if (r > 0) {
+            if (i == 0) {
+                ret = npy_cpack(r, i);
+            }
+            else if (npy_isfinite(i)) {
+                c = npy_cos(i);
+                s = npy_sin(i);
+
+                ret = npy_cpack(r * c, r * s);
+            }
+            else {
+                /* x = +inf, y = +-inf | nan */
+                npy_set_floatstatus_invalid();
+                ret = npy_cpack(r, NPY_NAN);
+            }
+        }
+        else {
+            if (npy_isfinite(i)) {
+                x = npy_exp(r);
+                c = npy_cos(i);
+                s = npy_sin(i);
+
+                ret = npy_cpack(x * c, x * s);
+            }
+            else {
+                /* x = -inf, y = nan | +i inf */
+                ret = npy_cpack(0, 0);
+            }
+        }
+    }
+
+    return ret;
+}
+#endif
+
+#ifndef HAVE_CLOG
+/* algorithm from cpython, rev. d86f5686cef9
+ *
+ * The usual formula for the real part is log(hypot(z.real, z.imag)).
+ * There are four situations where this formula is potentially
+ * problematic:
+ *
+ * (1) the absolute value of z is subnormal.  Then hypot is subnormal,
+ * so has fewer than the usual number of bits of accuracy, hence may
+ * have large relative error.  This then gives a large absolute error
+ * in the log.  This can be solved by rescaling z by a suitable power
+ * of 2.
+ *
+ * (2) the absolute value of z is greater than DBL_MAX (e.g. when both
+ * z.real and z.imag are within a factor of 1/sqrt(2) of DBL_MAX)
+ * Again, rescaling solves this.
+ *
+ * (3) the absolute value of z is close to 1.  In this case it's
+ * difficult to achieve good accuracy, at least in part because a
+ * change of 1ulp in the real or imaginary part of z can result in a
+ * change of billions of ulps in the correctly rounded answer.
+ *
+ * (4) z = 0.  The simplest thing to do here is to call the
+ * floating-point log with an argument of 0, and let its behaviour
+ * (returning -infinity, signaling a floating-point exception, setting
+ * errno, or whatever) determine that of c_log.  So the usual formula
+ * is fine here.
+*/
+npy_cdouble
+npy_clog(npy_cdouble z)
+{
+    /*
+     * Complex natural logarithm.  The real part is log|z|, computed with
+     * one of four strategies depending on the magnitude of the components;
+     * the imaginary part is always npy_carg(z).
+     */
+    npy_double ax = npy_fabs(npy_creal(z));
+    npy_double ay = npy_fabs(npy_cimag(z));
+    npy_double rr, ri;
+
+    if (ax > DBL_MAX/4 || ay > DBL_MAX/4) {
+        /* Huge component: halve both before hypot, add log(2) back after. */
+        rr = npy_log(npy_hypot(ax/2, ay/2)) + NPY_LOGE2;
+    }
+    else if (ax < DBL_MIN && ay < DBL_MIN) {
+        if (ax > 0  || ay > 0) {
+            /* catch cases where hypot(ax, ay) is subnormal */
+            rr = npy_log(npy_hypot(npy_ldexp(ax, DBL_MANT_DIG),
+                 npy_ldexp(ay, DBL_MANT_DIG))) - DBL_MANT_DIG*NPY_LOGE2;
+        }
+        else {
+            /* log(+/-0 +/- 0i) */
+            /* raise divide-by-zero floating point exception */
+            rr = -1.0 / npy_creal(z);
+            /* force the sign so the real part is -inf for either zero */
+            rr = npy_copysign(rr, -1);
+            ri = npy_carg(z);
+            return npy_cpack(rr, ri);
+        }
+    }
+    else {
+        npy_double h = npy_hypot(ax, ay);
+        if (0.71 <= h && h <= 1.73) {
+            /* |z| close to 1: log1p(|z|^2 - 1) / 2 instead of log(h) */
+            npy_double am = ax > ay ? ax : ay; /* max(ax, ay) */
+            npy_double an = ax > ay ? ay : ax; /* min(ax, ay) */
+            rr = npy_log1p((am-1)*(am+1)+an*an)/2;
+        }
+        else {
+            rr = npy_log(h);
+        }
+    }
+    ri = npy_carg(z);
+
+    return npy_cpack(rr, ri);
+}
+#endif
+
+#ifndef HAVE_CSQRT
+
+/* We risk spurious overflow for components >= DBL_MAX / (1 + sqrt(2)). */
+#define THRESH  (DBL_MAX / (1 + NPY_SQRT2))
+
+/*
+ * Principal square root of a double-precision complex number.
+ *
+ * Special values (zero, infinities, NaN real part) are returned directly.
+ * Otherwise the root is computed with Algorithm 312 (CACM vol 10, Oct 1967).
+ * Inputs with a component at or above THRESH are scaled down by 1/4 first
+ * and the result is scaled back up by 2 afterwards.
+ */
+npy_cdouble
+npy_csqrt(npy_cdouble z)
+{
+    npy_cdouble result;
+    npy_double a, b;
+    npy_double t;
+    int scale;
+
+    a = npy_creal(z);
+    b = npy_cimag(z);
+
+    /* Handle special cases. */
+    if (a == 0 && b == 0) {
+        return (npy_cpack(0, b));
+    }
+    if (npy_isinf(b)) {
+        return (npy_cpack(NPY_INFINITY, b));
+    }
+    if (npy_isnan(a)) {
+        t = (b - b) / (b - b);  /* raise invalid if b is not a NaN */
+        return (npy_cpack(a, t));    /* return NaN + NaN i */
+    }
+    if (npy_isinf(a)) {
+        /*
+         * csqrt(inf + NaN i)  = inf +  NaN i
+         * csqrt(inf + y i)    = inf +  0 i
+         * csqrt(-inf + NaN i) = NaN +- inf i
+         * csqrt(-inf + y i)   = 0   +  inf i
+         */
+        if (npy_signbit(a)) {
+            return (npy_cpack(npy_fabs(b - b), npy_copysign(a, b)));
+        }
+        else {
+            return (npy_cpack(a, npy_copysign(b - b, b)));
+        }
+    }
+    /*
+     * The remaining special case (b is NaN) is handled just fine by
+     * the normal code path below.
+     */
+
+    /* Scale to avoid overflow. */
+    if (npy_fabs(a) >= THRESH || npy_fabs(b) >= THRESH) {
+        a *= 0.25;
+        b *= 0.25;
+        scale = 1;
+    }
+    else {
+        scale = 0;
+    }
+
+    /* Algorithm 312, CACM vol 10, Oct 1967. */
+    if (a >= 0) {
+        t = npy_sqrt((a + npy_hypot(a, b)) * 0.5);
+        result = npy_cpack(t, b / (2 * t));
+    }
+    else {
+        t = npy_sqrt((-a + npy_hypot(a, b)) * 0.5);
+        result = npy_cpack(npy_fabs(b) / (2 * t), npy_copysign(t, b));
+    }
+
+    /*
+     * Rescale.  The input was multiplied by 1/4, so the root computed above
+     * is half of the true root: sqrt(z/4) == sqrt(z)/2.  Both the real and
+     * the imaginary component therefore have to be doubled.
+     */
+    if (scale) {
+        return (npy_cpack(npy_creal(result) * 2, npy_cimag(result) * 2));
+    }
+    else {
+        return (result);
+    }
+}
+#undef THRESH
+#endif
+
+/*
+ * Always use this function because of the multiplication for small
+ * integer powers, but in the body use cpow if it is available.
+ */
+
+/* private function for use in npy_pow{f, ,l} */
+#ifdef HAVE_CPOW
+static npy_cdouble
+sys_cpow(npy_cdouble x, npy_cdouble y)
+{
+    /*
+     * Thin bridge to the C library's cpow(): the cast unions reinterpret
+     * the npy_cdouble arguments as the C99 complex type and the result
+     * back again.
+     */
+    __npy_cdouble_to_c99_cast base;
+    __npy_cdouble_to_c99_cast exponent;
+    __npy_cdouble_to_c99_cast power;
+
+    base.npy_z = x;
+    exponent.npy_z = y;
+    power.c99_z = cpow(base.c99_z, exponent.c99_z);
+
+    return power.npy_z;
+}
+#endif
+
+
+/*
+ * Complex power a**b.
+ *
+ * Handled in order:
+ *   1. b == 0             -> 1 + 0i (including 0**0).
+ *   2. a == 0             -> 0 + 0i when Re(b) > 0, otherwise NaN + NaN*i
+ *                            with the invalid floating-point flag raised.
+ *   3. b a real integer with |b| < 100
+ *                         -> repeated multiplication (and one reciprocal
+ *                            for negative exponents).
+ *   4. everything else    -> the system cpow() when available, otherwise
+ *                            exp(b * log(a)).
+ */
+npy_cdouble
+npy_cpow (npy_cdouble a, npy_cdouble b)
+{
+    npy_intp n;
+    npy_double ar = npy_creal(a);
+    npy_double br = npy_creal(b);
+    npy_double ai = npy_cimag(a);
+    npy_double bi = npy_cimag(b);
+    npy_cdouble r;
+
+    /*
+     * Checking if in a^b, if b is zero.
+     * If a is not zero then by definition of logarithm a^0 is 1.
+     * If a is also zero then 0^0 is best defined as 1.
+     */
+    if (br == 0. && bi == 0.) {
+        return npy_cpack(1., 0.);
+    }
+    /* case 0^b
+     * If a is a complex zero (ai=ar=0), then the result depends
+     * upon values of br and bi. The result is either:
+     * 0 (in magnitude), undefined or 1.
+     * The latter case is for br=bi=0 and independent of ar and ai
+     * (but is handled above).
+     */
+    else if (ar == 0. && ai == 0.) {
+        /*
+         * If the real part of b is positive (br>0) then this is
+         * the zero complex with positive sign on both the
+         * real and imaginary part.
+         */
+        if (br > 0) {
+            return npy_cpack(0., 0.);
+        }
+        /* else the real part of b is not positive (this also covers
+         * br == 0 with bi != 0, and a NaN br).
+         * Here we should return a complex nan
+         * and raise FloatingPointError: invalid value...
+         */
+
+        /* Raise invalid value by calling inf - inf */
+        volatile npy_double tmp = NPY_INFINITY;
+        tmp -= NPY_INFINITY;
+        /* the store keeps tmp "used"; ar is not read again on this path */
+        ar = tmp;
+
+        r = npy_cpack(NPY_NAN, NPY_NAN);
+        return r;
+    }
+    /*
+     * Small real integer exponents.  br is range-checked BEFORE it is
+     * converted to npy_intp: converting a NaN, an infinity or a value
+     * outside the integer range is undefined behaviour in C.  Only
+     * exponents with |n| < 100 are handled here anyway, so the check does
+     * not change which inputs take this path.
+     */
+    if (bi == 0 && br > -100 && br < 100 && (n=(npy_intp)br) == br) {
+        if (n == 1) {
+            /* unroll: handle inf better */
+            return npy_cpack(ar, ai);
+        }
+        else if (n == 2) {
+            /* unroll: handle inf better */
+            return cmul(a, a);
+        }
+        else if (n == 3) {
+            /* unroll: handle inf better */
+            return cmul(a, cmul(a, a));
+        }
+        else {
+            /*
+             * Binary exponentiation on |n|: p holds a**(2**k), aa collects
+             * the product for every bit set in |n|.
+             */
+            npy_cdouble p, aa;
+            npy_intp mask = 1;
+            if (n < 0) {
+                n = -n;
+            }
+            aa = c_1;
+            p = npy_cpack(ar, ai);
+            while (1) {
+                if (n & mask) {
+                    aa = cmul(aa,p);
+                }
+                mask <<= 1;
+                if (n < mask || mask <= 0) {
+                    break;
+                }
+                p = cmul(p,p);
+            }
+            r = npy_cpack(npy_creal(aa), npy_cimag(aa));
+            /* negative exponent: a**n == 1 / a**|n| */
+            if (br < 0) {
+                r = cdiv(c_1, r);
+            }
+            return r;
+        }
+    }
+
+#ifdef HAVE_CPOW
+    return sys_cpow(a, b);
+
+#else
+    {
+        /* a**b = exp(b * log(a)); the product is expanded by hand */
+        npy_cdouble loga = npy_clog(a);
+
+        ar = npy_creal(loga);
+        ai = npy_cimag(loga);
+        return npy_cexp(npy_cpack(ar*br - ai*bi, ar*bi + ai*br));
+    }
+
+#endif
+}
+
+
+#ifndef HAVE_CCOS
+npy_cdouble
+npy_ccos(npy_cdouble z)
+{
+    /*
+     * Identity used: ccos(z) = ccosh(I * z),
+     * where I * z = -Im(z) + I*Re(z).
+     */
+    const npy_cdouble iz = npy_cpack(-npy_cimag(z), npy_creal(z));
+
+    return npy_ccosh(iz);
+}
+#endif
+
+#ifndef HAVE_CSIN
+npy_cdouble
+npy_csin(npy_cdouble z)
+{
+    /*
+     * Identity used: csin(z) = -I * csinh(I * z),
+     * where I * z = -Im(z) + I*Re(z).
+     */
+    const npy_cdouble iz = npy_cpack(-npy_cimag(z), npy_creal(z));
+    const npy_cdouble w = npy_csinh(iz);
+
+    /* Multiplying by -I maps (u + I*v) to (v - I*u). */
+    return npy_cpack(npy_cimag(w), -npy_creal(w));
+}
+#endif
+
+#ifndef HAVE_CTAN
+npy_cdouble
+npy_ctan(npy_cdouble z)
+{
+    /*
+     * Identity used: ctan(z) = -I * ctanh(I * z),
+     * where I * z = -Im(z) + I*Re(z).
+     */
+    const npy_cdouble iz = npy_cpack(-npy_cimag(z), npy_creal(z));
+    const npy_cdouble w = npy_ctanh(iz);
+
+    /* Multiplying by -I maps (u + I*v) to (v - I*u). */
+    return npy_cpack(npy_cimag(w), -npy_creal(w));
+}
+#endif
+
+#ifndef HAVE_CCOSH
+/*
+ * Taken from the msun library in FreeBSD, rev 226599.
+ *
+ * Hyperbolic cosine of a complex argument z = x + i y.
+ *
+ * cosh(z) = cosh(x+iy)
+ *         = cosh(x) cos(y) + i sinh(x) sin(y).
+ *
+ * Exceptional values are noted in the comments within the source code.
+ * These values and the return value were taken from n1124.pdf.
+ *
+ * CCOSH_BIG is chosen such that
+ * spacing(0.5 * exp(CCOSH_BIG)) > 0.5*exp(-CCOSH_BIG)
+ * although the exact value assigned to CCOSH_BIG is not so important
+ */
+npy_cdouble
+npy_ccosh(npy_cdouble z)
+{
+    /*
+     * Double-precision complex hyperbolic cosine.
+     *
+     * Finite inputs are handled first, in four magnitude bands of |x|
+     * (direct formula, exp-based, scaled exp, guaranteed overflow).  The
+     * remaining blocks deal with infinite and NaN components one case at a
+     * time.
+     *
+     * Only the "2 == 2" preprocessor branch below is active in this
+     * expansion; the others carry the float / long double constants.
+     */
+#if 2 == 1
+    const npy_float CCOSH_BIG = 9.0f;
+    const npy_float CCOSH_HUGE = 1.70141183e+38f;
+#endif
+#if 2 == 2
+    const npy_double CCOSH_BIG = 22.0;
+    const npy_double CCOSH_HUGE = 8.9884656743115795e+307;
+#endif
+#if 2 >= 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    const npy_longdouble CCOSH_BIG = 22.0L;
+    const npy_longdouble CCOSH_HUGE = 8.9884656743115795e+307L;
+#else
+    const npy_longdouble CCOSH_BIG = 24.0L;
+    const npy_longdouble CCOSH_HUGE = 5.94865747678615882543e+4931L;
+#endif
+#endif
+
+    npy_double  x, y, h, absx;
+    npy_int xfinite, yfinite;
+
+    x = npy_creal(z);
+    y = npy_cimag(z);
+
+    xfinite = npy_isfinite(x);
+    yfinite = npy_isfinite(y);
+
+    /* Handle the nearly-non-exceptional cases where x and y are finite. */
+    if (xfinite && yfinite) {
+        /* Purely real input: the imaginary part is the product x * y. */
+        if (y == 0) {
+            return npy_cpack(npy_cosh(x), x * y);
+        }
+        absx = npy_fabs(x);
+        if (absx < CCOSH_BIG) {
+            /* small x: normal case */
+            return npy_cpack(npy_cosh(x) * npy_cos(y),
+                                npy_sinh(x) * npy_sin(y));
+        }
+
+        /* |x| >= 22, so cosh(x) ~= exp(|x|) */
+        if (absx < SCALED_CEXP_LOWER) {
+            /* x < 710: exp(|x|) won't overflow */
+            h = npy_exp(absx) * 0.5;
+            return npy_cpack(h * npy_cos(y),
+                                npy_copysign(h, x) * npy_sin(y));
+        }
+        else if (absx < SCALED_CEXP_UPPER) {
+            /* x < 1455: scale to avoid overflow */
+            /* expt = -1 supplies the factor 1/2; the sign of x is then
+             * reapplied to the imaginary part */
+            z = _npy_scaled_cexp(absx, y, -1);
+            return npy_cpack(npy_creal(z),
+                                npy_cimag(z) * npy_copysign(1, x));
+        }
+        else {
+            /* x >= 1455: the result always overflows */
+            /* h carries the sign of x; h * h is non-negative */
+            h = CCOSH_HUGE * x;
+            return npy_cpack(h * h * npy_cos(y), h * npy_sin(y));
+        }
+    }
+
+    /*
+     * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0.
+     * The sign of 0 in the result is unspecified.  Choice = normally
+     * the same as dNaN.  Raise the invalid floating-point exception.
+     *
+     * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0.
+     * The sign of 0 in the result is unspecified.  Choice = normally
+     * the same as d(NaN).
+     */
+    if (x == 0 && !yfinite) {
+        return npy_cpack(y - y, npy_copysign(0, x * (y - y)));
+    }
+
+    /*
+     * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0.
+     *
+     * cosh(NaN +- I 0)   = d(NaN) + I sign(d(NaN, +-0))0.
+     * The sign of 0 in the result is unspecified.
+     */
+    if (y == 0 && !xfinite) {
+        return npy_cpack(x * x, npy_copysign(0, x) * y);
+    }
+
+    /*
+     * cosh(x +- I Inf) = dNaN + I dNaN.
+     * Raise the invalid floating-point exception for finite nonzero x.
+     *
+     * cosh(x + I NaN) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero x.  Choice = don't raise (except for signaling NaNs).
+     */
+    if (xfinite && !yfinite) {
+        return npy_cpack(y - y, x * (y - y));
+    }
+
+    /*
+     * cosh(+-Inf + I NaN)  = +Inf + I d(NaN).
+     *
+     * cosh(+-Inf +- I Inf) = +Inf + I dNaN.
+     * The sign of Inf in the result is unspecified.  Choice = always +.
+     * Raise the invalid floating-point exception.
+     *
+     * cosh(+-Inf + I y)   = +Inf cos(y) +- I Inf sin(y)
+     */
+    if (npy_isinf(x)) {
+        if (!yfinite) {
+            return npy_cpack(x * x, x * (y - y));
+        }
+        return npy_cpack((x * x) * npy_cos(y), x * npy_sin(y));
+    }
+
+    /*
+     * cosh(NaN + I NaN)  = d(NaN) + I d(NaN).
+     *
+     * cosh(NaN +- I Inf) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception.
+     * Choice = raise.
+     *
+     * cosh(NaN + I y)    = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero y.  Choice = don't raise (except for signaling NaNs).
+     */
+    return npy_cpack((x * x) * (y - y), (x + x) * (y - y));
+}
+#undef CCOSH_BIG
+#undef CCOSH_HUGE
+#endif
+
+#ifndef HAVE_CSINH
+/*
+ * Taken from the msun library in FreeBSD, rev 226599.
+ *
+ * Hyperbolic sine of a complex argument z = x + i y.
+ *
+ * sinh(z) = sinh(x+iy)
+ *         = sinh(x) cos(y) + i cosh(x) sin(y).
+ *
+ * Exceptional values are noted in the comments within the source code.
+ * These values and the return value were taken from n1124.pdf.
+ */
+/*
+ * npy_csinh -- hyperbolic sine of the double-precision complex z = x + I*y.
+ *
+ * Returns sinh(x) cos(y) + I cosh(x) sin(y), packed with npy_cpack().
+ * Finite arguments are dispatched on |x| (direct formula, exp(|x|)/2,
+ * scaled exponential, guaranteed overflow).  The non-finite cases follow,
+ * each one described by the comment in front of it.
+ */
+npy_cdouble
+npy_csinh(npy_cdouble z)
+{
+#if 2 == 1
+    const npy_float CSINH_BIG = 9.0f;
+    const npy_float CSINH_HUGE = 1.70141183e+38f;
+#endif
+#if 2 == 2
+    const npy_double CSINH_BIG = 22.0;
+    const npy_double CSINH_HUGE = 8.9884656743115795e+307;
+#endif
+#if 2 >= 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    const npy_longdouble CSINH_BIG = 22.0L;
+    const npy_longdouble CSINH_HUGE = 8.9884656743115795e+307L;
+#else
+    const npy_longdouble CSINH_BIG = 24.0L;
+    const npy_longdouble CSINH_HUGE = 5.94865747678615882543e+4931L;
+#endif
+#endif
+
+    npy_double x, y, h, absx;
+    npy_int xfinite, yfinite;
+
+    x = npy_creal(z);
+    y = npy_cimag(z);
+
+    xfinite = npy_isfinite(x);
+    yfinite = npy_isfinite(y);
+
+    /* Handle the nearly-non-exceptional cases where x and y are finite. */
+    if (xfinite && yfinite) {
+        if (y == 0) {
+            /* Purely real argument: keep the (signed) zero imaginary part. */
+            return npy_cpack(npy_sinh(x), y);
+        }
+        absx = npy_fabs(x);
+        if (absx < CSINH_BIG) {
+            /* small x: normal case */
+            return npy_cpack(npy_sinh(x) * npy_cos(y),
+                                npy_cosh(x) * npy_sin(y));
+        }
+
+        /* |x| >= 22, so cosh(x) ~= exp(|x|) */
+        if (absx < SCALED_CEXP_LOWER) {
+            /* |x| < 710: exp(|x|) won't overflow */
+            h = npy_exp(npy_fabs(x)) * 0.5;
+            return npy_cpack(npy_copysign(h, x) * npy_cos(y),
+                                h * npy_sin(y));
+        }
+        else if (absx < SCALED_CEXP_UPPER) {
+            /*
+             * |x| < 1455: scale to avoid overflow.
+             *
+             * The test must use |x|, not x: a large negative x has to
+             * fall through to the overflow branch below instead of being
+             * handed to _npy_scaled_cexp() with |x| beyond the range the
+             * scaled exponential is meant for.
+             */
+            z = _npy_scaled_cexp(absx, y, -1);
+            return npy_cpack(npy_creal(z) * npy_copysign(1, x),
+                                npy_cimag(z));
+        }
+        else {
+            /*
+             * |x| >= 1455: the result always overflows.
+             * h carries the sign of x for the real part; h * h is
+             * positive, matching the sign of cosh(x).
+             */
+            h = CSINH_HUGE * x;
+            return npy_cpack(h * npy_cos(y), h * h * npy_sin(y));
+        }
+    }
+
+    /*
+     * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.
+     * The sign of 0 in the result is unspecified.  Choice = normally
+     * the same as dNaN.  Raise the invalid floating-point exception.
+     *
+     * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN).
+     * The sign of 0 in the result is unspecified.  Choice = normally
+     * the same as d(NaN).
+     */
+    if (x == 0 && !yfinite) {
+        return npy_cpack(npy_copysign(0, x * (y - y)), y - y);
+    }
+
+    /*
+     * sinh(+-Inf +- I 0) = +-Inf + I +-0.
+     *
+     * sinh(NaN +- I 0)   = d(NaN) + I +-0.
+     */
+    if (y == 0 && !xfinite) {
+        if (npy_isnan(x)) {
+            return z;
+        }
+        return npy_cpack(x, npy_copysign(0, y));
+    }
+
+    /*
+     * sinh(x +- I Inf) = dNaN + I dNaN.
+     * Raise the invalid floating-point exception for finite nonzero x.
+     *
+     * sinh(x + I NaN) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero x.  Choice = don't raise (except for signaling NaNs).
+     */
+    if (xfinite && !yfinite) {
+        return npy_cpack(y - y, x * (y - y));
+    }
+
+    /*
+     * sinh(+-Inf + I NaN)  = +-Inf + I d(NaN).
+     * The sign of Inf in the result is unspecified.  Choice = normally
+     * the same as d(NaN).
+     *
+     * sinh(+-Inf +- I Inf) = +Inf + I dNaN.
+     * The sign of Inf in the result is unspecified.  Choice = always +.
+     * Raise the invalid floating-point exception.
+     *
+     * sinh(+-Inf + I y)   = +-Inf cos(y) + I Inf sin(y)
+     */
+    if (!xfinite && !npy_isnan(x)) {
+        if (!yfinite) {
+            return npy_cpack(x * x, x * (y - y));
+        }
+        return npy_cpack(x * npy_cos(y),
+                            NPY_INFINITY * npy_sin(y));
+    }
+
+    /*
+     * sinh(NaN + I NaN)  = d(NaN) + I d(NaN).
+     *
+     * sinh(NaN +- I Inf) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception.
+     * Choice = raise.
+     *
+     * sinh(NaN + I y)    = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero y.  Choice = don't raise (except for signaling NaNs).
+     */
+    return npy_cpack((x * x) * (y - y), (x + x) * (y - y));
+}
+#undef CSINH_BIG
+#undef CSINH_HUGE
+#endif
+
+#ifndef HAVE_CTANH
+/*
+ * Taken from the msun library in FreeBSD, rev 226600.
+ *
+ * Hyperbolic tangent of a complex argument z = x + i y.
+ *
+ * The algorithm is from:
+ *
+ *   W. Kahan.  Branch Cuts for Complex Elementary Functions or Much
+ *   Ado About Nothing's Sign Bit.  In The State of the Art in
+ *   Numerical Analysis, pp. 165 ff.  Iserles and Powell, eds., 1987.
+ *
+ * Method:
+ *
+ *   Let t    = tan(y)
+ *       beta = 1/cos^2(y)
+ *       s    = sinh(x)
+ *       rho  = cosh(x)
+ *
+ *   We have:
+ *
+ *   tanh(z) = sinh(z) / cosh(z)
+ *
+ *             sinh(x) cos(y) + i cosh(x) sin(y)
+ *           = ---------------------------------
+ *             cosh(x) cos(y) + i sinh(x) sin(y)
+ *
+ *             cosh(x) sinh(x) / cos^2(y) + i tan(y)
+ *           = -------------------------------------
+ *                    1 + sinh^2(x) / cos^2(y)
+ *
+ *             beta rho s + i t
+ *           = ----------------
+ *               1 + beta s^2
+ *
+ * Modifications:
+ *
+ *   I omitted the original algorithm's handling of overflow in tan(y) after
+ *   verifying with nearpi.c that this can't happen in IEEE single or double
+ *   precision.  I also handle large x differently.
+ */
+
+#define TANH_HUGE 22.0
+#define TANHF_HUGE 11.0F
+#define TANHL_HUGE 42.0L
+
+/*
+ * npy_ctanh -- hyperbolic tangent of the double-precision complex
+ * z = x + I*y.
+ *
+ * Non-finite x is handled first, then non-finite y, then |x| >= TANH_HUGE
+ * through an exponential approximation; every remaining argument goes
+ * through Kahan's algorithm.  The result is packed with npy_cpack().
+ */
+npy_cdouble
+npy_ctanh(npy_cdouble z)
+{
+    npy_double x, y;
+    npy_double t, beta, s, rho, denom;
+
+    x = npy_creal(z);
+    y = npy_cimag(z);
+
+    /*
+     * ctanh(NaN + i 0) = NaN + i 0
+     *
+     * ctanh(NaN + i y) = NaN + i NaN        for y != 0
+     *
+     * The imaginary part has the sign of x*sin(2*y), but there's no
+     * special effort to get this right.
+     *
+     * ctanh(+-Inf +- i Inf) = +-1 +- 0
+     *
+     * ctanh(+-Inf + i y) = +-1 + 0 sin(2y)        for y finite
+     *
+     * The sign of the imaginary part is unspecified.  This special
+     * case is only needed to avoid a spurious invalid exception when
+     * y is infinite.
+     */
+        if (!npy_isfinite(x)) {
+            if (npy_isnan(x)) {
+                return npy_cpack(x, (y == 0 ? y : x * y));
+            }
+            /*
+             * x is +-Inf: the real part is +-1 and the imaginary part is
+             * a zero whose sign comes from y itself when y is infinite,
+             * otherwise from sin(y) * cos(y) (= sin(2y) / 2).
+             */
+            return npy_cpack(npy_copysign(1,x),
+                                npy_copysign(0,
+                                npy_isinf(y) ?
+                                    y : npy_sin(y) * npy_cos(y)));
+        }
+
+    /*
+     * ctanh(x + i NAN) = NaN + i NaN
+     * ctanh(x +- i Inf) = NaN + i NaN
+     */
+    if (!npy_isfinite(y)) {
+        return (npy_cpack(y - y, y - y));
+    }
+
+    /*
+     * ctanh(+-huge + i +-y) ~= +-1 +- i 2sin(2y)/exp(2x), using the
+     * approximation sinh^2(huge) ~= exp(2*huge) / 4.
+     * We use a modified formula to avoid spurious overflow.
+     */
+    if (npy_fabs(x) >= TANH_HUGE) {
+        /* exp_mx * exp_mx = exp(-2|x|); 4 sin(y) cos(y) = 2 sin(2y). */
+        npy_double exp_mx = npy_exp(-npy_fabs(x));
+        return npy_cpack(npy_copysign(1, x),
+                            4 * npy_sin(y) * npy_cos(y) *
+                                exp_mx * exp_mx);
+    }
+
+    /* Kahan's algorithm */
+    t = npy_tan(y);
+    beta = 1 + t * t;    /* = 1 / cos^2(y) */
+    s = npy_sinh(x);
+    rho = npy_sqrt(1 + s * s);    /* = cosh(x) */
+    denom = 1 + beta * s * s;
+    return (npy_cpack((beta * rho * s) / denom, t / denom));
+}
+#undef TANH_HUGE
+#undef TANHF_HUGE
+#undef TANHL_HUGE
+#endif
+
+#if !defined (HAVE_CACOS) || !defined (HAVE_CASINH)
+/*
+ * Complex inverse trig functions taken from the msun library in FreeBSD
+ * revision 251404
+ *
+ * The algorithm is very close to that in "Implementing the complex arcsine
+ * and arccosine functions using exception handling" by T. E. Hull, Thomas F.
+ * Fairgrieve, and Ping Tak Peter Tang, published in ACM Transactions on
+ * Mathematical Software, Volume 23 Issue 3, 1997, Pages 299-335,
+ * http://dl.acm.org/citation.cfm?id=275324.
+ *
+ * Throughout we use the convention z = x + I*y.
+ *
+ * casinh(z) = sign(x)*log(A+sqrt(A*A-1)) + I*asin(B)
+ * where
+ * A = (|z+I| + |z-I|) / 2
+ * B = (|z+I| - |z-I|) / 2 = y/A
+ *
+ * These formulas become numerically unstable:
+ *   (a) for Re(casinh(z)) when z is close to the line segment [-I, I] (that
+ *       is, Re(casinh(z)) is close to 0);
+ *   (b) for Im(casinh(z)) when z is close to either of the intervals
+ *       [I, I*infinity) or (-I*infinity, -I] (that is, |Im(casinh(z))| is
+ *       close to PI/2).
+ *
+ * These numerical problems are overcome by defining
+ * f(a, b) = (hypot(a, b) - b) / 2 = a*a / (hypot(a, b) + b) / 2
+ * Then if A < A_crossover, we use
+ *   log(A + sqrt(A*A-1)) = log1p((A-1) + sqrt((A-1)*(A+1)))
+ *   A-1 = f(x, 1+y) + f(x, 1-y)
+ * and if B > B_crossover, we use
+ *   asin(B) = atan2(y, sqrt(A*A - y*y)) = atan2(y, sqrt((A+y)*(A-y)))
+ *   A-y = f(x, y+1) + f(x, y-1)
+ * where without loss of generality we have assumed that x and y are
+ * non-negative.
+ *
+ * Much of the difficulty comes because the intermediate computations may
+ * produce overflows or underflows.  This is dealt with in the paper by Hull
+ * et al by using exception handling.  We do this by detecting when
+ * computations risk underflow or overflow.  The hardest part is handling the
+ * underflows when computing f(a, b).
+ *
+ * Note that the function f(a, b) does not appear explicitly in the paper by
+ * Hull et al, but the idea may be found on pages 308 and 309.  Introducing the
+ * function f(a, b) allows us to concentrate many of the clever tricks in this
+ * paper into one function.
+ */
+
+/*
+ * Helper f(a, b, hypot_a_b) = (hypot(a, b) - b) / 2.
+ * The caller supplies hypot(a, b) as the third argument.
+ *
+ * For b < 0 the defining difference is used directly, for b == 0 the
+ * result is a / 2, and otherwise the equivalent quotient form
+ * a*a / (hypot(a, b) + b) / 2 is evaluated.
+ */
+static inline npy_double
+_f(npy_double a, npy_double b, npy_double hypot_a_b)
+{
+    npy_double half_gap;
+
+    if (b < 0) {
+        half_gap = (hypot_a_b - b) / 2;
+    }
+    else if (b == 0) {
+        half_gap = a / 2;
+    }
+    else {
+        half_gap = a * a / (hypot_a_b + b) / 2;
+    }
+    return half_gap;
+}
+
+/*
+ * All the hard work is contained in this function.
+ * x and y are assumed positive or zero, and less than RECIP_EPSILON.
+ * Upon return:
+ * rx = Re(casinh(z)) = -Im(cacos(y + I*x)).
+ * B_is_usable is set to 1 if the value of B is usable.
+ * If B_is_usable is set to 0, sqrt_A2my2 = sqrt(A*A - y*y), and new_y = y.
+ * If returning sqrt_A2my2 has potential to result in an underflow, it is
+ * rescaled, and new_y is similarly rescaled.
+ */
+/*
+ * Parameters:
+ *   x, y        inputs (see the contract in the comment above).
+ *   rx          out: always written.
+ *   B_is_usable out: 1 when *B may be used by the caller, 0 otherwise.
+ *   B           out: y / A; not written when y < FOUR_SQRT_MIN.
+ *   sqrt_A2my2  out: written only on the paths that clear *B_is_usable.
+ *   new_y       out: y, possibly rescaled together with *sqrt_A2my2.
+ */
+static inline void
+_do_hard_work(npy_double x, npy_double y, npy_double *rx,
+    npy_int *B_is_usable, npy_double *B, npy_double *sqrt_A2my2, npy_double *new_y)
+{
+#if 2 == 1
+    const npy_float A_crossover = 10.0f;
+    const npy_float B_crossover = 0.6417f;
+    const npy_float FOUR_SQRT_MIN = 4.3368086899420177e-19f;
+#endif
+#if 2 == 2
+    const npy_double A_crossover = 10.0;
+    const npy_double B_crossover = 0.6417;
+    const npy_double FOUR_SQRT_MIN = 5.9666725849601654e-154;
+#endif
+#if 2 == 3
+    const npy_longdouble A_crossover = 10.0l;
+    const npy_longdouble B_crossover = 0.6417l;
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    const npy_longdouble FOUR_SQRT_MIN = 5.9666725849601654e-154;
+#else
+    const npy_longdouble FOUR_SQRT_MIN = 7.3344154702193886625e-2466l;
+#endif
+#endif
+    npy_double R, S, A; /* A, B, R, and S are as in Hull et al. */
+    npy_double Am1, Amy; /* A-1, A-y. */
+
+    R = npy_hypot(x, y + 1);        /* |z+I| */
+    S = npy_hypot(x, y - 1);        /* |z-I| */
+
+    /* A = (|z+I| + |z-I|) / 2 */
+    A = (R + S) / 2;
+    /*
+     * Mathematically A >= 1.  There is a small chance that this will not
+     * be so because of rounding errors.  So we will make certain it is
+     * so.
+     */
+    if (A < 1) {
+        A = 1;
+    }
+
+    /* First half: compute *rx = log(A + sqrt(A*A - 1)). */
+    if (A < A_crossover) {
+        /*
+         * Am1 = fp + fm, where fp = f(x, 1+y), and fm = f(x, 1-y).
+         * rx = log1p(Am1 + sqrt(Am1*(A+1)))
+         */
+        if (y == 1 && x < DBL_EPSILON * DBL_EPSILON / 128) {
+            /*
+             * fp is of order x^2, and fm = x/2.
+             * A = 1 (inexactly).
+             */
+            *rx = npy_sqrt(x);
+        }
+        else if (x >= DBL_EPSILON * npy_fabs(y - 1)) {
+            /*
+             * Underflow will not occur because
+             * x >= DBL_EPSILON^2/128 >= FOUR_SQRT_MIN
+             */
+            Am1 = _f(x, 1 + y, R) + _f(x, 1 - y, S);
+            *rx = npy_log1p(Am1 + npy_sqrt(Am1 * (A + 1)));
+        }
+        else if (y < 1) {
+            /*
+             * fp = x*x/(1+y)/4, fm = x*x/(1-y)/4, and
+             * A = 1 (inexactly).
+             */
+            *rx = x / npy_sqrt((1 - y) * (1 + y));
+        }
+        else {        /* if (y > 1) */
+            /*
+             * A-1 = y-1 (inexactly).
+             */
+            *rx = npy_log1p((y - 1) + npy_sqrt((y - 1) * (y + 1)));
+        }
+    }
+    else {
+        /* A is large enough for the direct formula. */
+        *rx = npy_log(A + npy_sqrt(A * A - 1));
+    }
+
+    /* Second half: decide how the caller obtains the angle part. */
+    *new_y = y;
+
+    if (y < FOUR_SQRT_MIN) {
+        /*
+         * Avoid a possible underflow caused by y/A.  For casinh this
+         * would be legitimate, but will be picked up by invoking atan2
+         * later on.  For cacos this would not be legitimate.
+         */
+        *B_is_usable = 0;
+        /* Both outputs are scaled by the same factor 2 / DBL_EPSILON. */
+        *sqrt_A2my2 = A * (2 / DBL_EPSILON);
+        *new_y = y * (2 / DBL_EPSILON);
+        return;
+    }
+
+    /* B = (|z+I| - |z-I|) / 2 = y/A */
+    *B = y / A;
+    *B_is_usable = 1;
+
+    if (*B > B_crossover) {
+        /* B is past the crossover: hand back sqrt(A*A - y*y) instead. */
+        *B_is_usable = 0;
+        /*
+         * Amy = fp + fm, where fp = f(x, y+1), and fm = f(x, y-1).
+         * sqrt_A2my2 = sqrt(Amy*(A+y))
+         */
+        if (y == 1 && x < DBL_EPSILON / 128) {
+            /*
+             * fp is of order x^2, and fm = x/2.
+             * A = 1 (inexactly).
+             */
+            *sqrt_A2my2 = npy_sqrt(x) * npy_sqrt((A + y) / 2);
+        }
+        else if (x >= DBL_EPSILON * npy_fabs(y - 1)) {
+            /*
+             * Underflow will not occur because
+             * x >= DBL_EPSILON/128 >= FOUR_SQRT_MIN
+             * and
+             * x >= DBL_EPSILON^2 >= FOUR_SQRT_MIN
+             */
+            Amy = _f(x, y + 1, R) + _f(x, y - 1, S);
+            *sqrt_A2my2 = npy_sqrt(Amy * (A + y));
+        }
+        else if (y > 1) {
+            /*
+             * fp = x*x/(y+1)/4, fm = x*x/(y-1)/4, and
+             * A = y (inexactly).
+             *
+             * y < RECIP_EPSILON.  So the following
+             * scaling should avoid any underflow problems.
+             */
+            *sqrt_A2my2 = x * (4 / DBL_EPSILON / DBL_EPSILON) * y /
+                npy_sqrt((y + 1) * (y - 1));
+            *new_y = y * (4 / DBL_EPSILON / DBL_EPSILON);
+        }
+        else {        /* if (y < 1) */
+            /*
+             * fm = 1-y >= DBL_EPSILON, fp is of order x^2, and
+             * A = 1 (inexactly).
+             */
+            *sqrt_A2my2 = npy_sqrt((1 - y) * (1 + y));
+        }
+    }
+}
+
+/*
+ * Specialised clog() for finite |z| larger than roughly RECIP_EPSILON.
+ *
+ * On return *rr holds the logarithm of the modulus of x + I*y and *ri
+ * holds atan2(y, x).
+ */
+static inline void
+_clog_for_large_values(npy_double x, npy_double y,
+    npy_double *rr, npy_double *ri)
+{
+#if 2 == 1
+    const npy_float QUARTER_SQRT_MAX = 4.611685743549481e+18f;
+    const npy_float SQRT_MIN = 1.0842021724855044e-19f;
+#endif
+#if 2 == 2
+    const npy_double QUARTER_SQRT_MAX = 3.3519519824856489e+153;
+    const npy_double SQRT_MIN = 1.4916681462400413e-154;
+#endif
+#if 2 == 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    const npy_longdouble QUARTER_SQRT_MAX = 3.3519519824856489e+153;
+    const npy_longdouble SQRT_MIN = 1.4916681462400413e-154;
+#else
+    const npy_longdouble QUARTER_SQRT_MAX = 2.7268703390485398235e+2465l;
+    const npy_longdouble SQRT_MIN = 1.8336038675548471656e-2466l;
+#endif
+#endif
+    npy_double larger, smaller, swap;
+
+    /* Order the two magnitudes so that larger >= smaller. */
+    larger = npy_fabs(x);
+    smaller = npy_fabs(y);
+    if (larger < smaller) {
+        swap = larger;
+        larger = smaller;
+        smaller = swap;
+    }
+
+    if (larger > DBL_MAX / 2) {
+        /*
+         * Both components may be so large that hypot() would overflow.
+         * Scale them down by E and add 1 to the logarithm afterwards;
+         * this relies on E exceeding sqrt(2).  The division costs a
+         * negligible amount of accuracy, but the path is slow.
+         */
+        *rr = npy_log(npy_hypot(x / NPY_E, y / NPY_E)) + 1;
+    }
+    else if (larger > QUARTER_SQRT_MAX || smaller < SQRT_MIN) {
+        /*
+         * Squaring could overflow (large component) or underflow
+         * (small component), so let hypot() do the work.
+         */
+        *rr = npy_log(npy_hypot(x, y));
+    }
+    else {
+        /* Squares are safe: log(sqrt(s)) = log(s) / 2. */
+        *rr = npy_log(larger * larger + smaller * smaller) / 2;
+    }
+    *ri = npy_atan2(y, x);
+}
+#endif
+
+#ifndef HAVE_CACOS
+/*
+ * npy_cacos -- arc cosine of the double-precision complex z = x + I*y.
+ *
+ * Order of the cases: NaN inputs, |x| or |y| above RECIP_EPSILON (via
+ * _clog_for_large_values), the exact case z = 1, tiny arguments, and
+ * finally the general case through _do_hard_work().
+ */
+npy_cdouble
+npy_cacos(npy_cdouble z)
+{
+#if 2 == 1
+    /* this is sqrt(6*EPS) */
+    const npy_float SQRT_6_EPSILON = 8.4572793338e-4f;
+    /* chosen such that pio2_hi + pio2_lo == pio2_hi but causes FE_INEXACT. */
+    const volatile npy_float pio2_lo = 7.5497899549e-9f;
+#endif
+#if 2 == 2
+    const npy_double SQRT_6_EPSILON = 3.65002414998885671e-08;
+    const volatile npy_double pio2_lo = 6.1232339957367659e-17;
+#endif
+#if 2 == 3
+    const npy_longdouble SQRT_6_EPSILON = 8.0654900873493277169e-10l;
+    const volatile npy_longdouble pio2_lo = 2.710505431213761085e-20l;
+#endif
+    const npy_double RECIP_EPSILON = 1.0 / DBL_EPSILON;
+    const npy_double pio2_hi = NPY_PI_2;
+    npy_double x, y, ax, ay, wx, wy, rx, ry, B, sqrt_A2mx2, new_x;
+    npy_int sx, sy;
+    npy_int B_is_usable;
+
+    x = npy_creal(z);
+    y = npy_cimag(z);
+    /* sx / sy are compared against 0 below: 0 means the sign bit is clear. */
+    sx = npy_signbit(x);
+    sy = npy_signbit(y);
+    ax = npy_fabs(x);
+    ay = npy_fabs(y);
+
+    if (npy_isnan(x) || npy_isnan(y)) {
+        /* cacos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */
+        if (npy_isinf(x)) {
+            return npy_cpack(y + y, -NPY_INFINITY);
+        }
+        /* cacos(NaN + I*+-Inf) = NaN + I*-+Inf */
+        if (npy_isinf(y)) {
+            return npy_cpack(x + x, -y);
+        }
+        /* cacos(0 + I*NaN) = PI/2 + I*NaN with inexact */
+        if (x == 0) {
+            return npy_cpack(pio2_hi + pio2_lo, y + y);
+        }
+        /*
+         * All other cases involving NaN return NaN + I*NaN.
+         * C99 leaves it optional whether to raise invalid if one of
+         * the arguments is not NaN, so we opt not to raise it.
+         */
+        return npy_cpack(NPY_NAN, NPY_NAN);
+    }
+
+    if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
+        /* clog...() will raise inexact unless x or y is infinite. */
+        _clog_for_large_values(x, y, &wx, &wy);
+        /* Real part: |arg(z)|; imaginary part: log|z| + log(2), signed. */
+        rx = npy_fabs(wy);
+        ry = wx + NPY_LOGE2;
+        if (sy == 0) {
+            ry = -ry;
+        }
+        return npy_cpack(rx, ry);
+    }
+
+    /* Avoid spuriously raising inexact for z = 1. */
+    if (x == 1 && y == 0) {
+        return npy_cpack(0, -y);
+    }
+
+    /* All remaining cases are inexact. */
+    raise_inexact();
+
+    if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4) {
+        /* Tiny z: the result is PI/2 - z, with PI/2 split as hi + lo. */
+        return npy_cpack(pio2_hi - (x - pio2_lo), -y);
+    }
+
+    /*
+     * The arguments are passed swapped (ay, ax): _do_hard_work() documents
+     * its rx output as -Im(cacos(y + I*x)), so here it lands in ry.
+     */
+    _do_hard_work(ay, ax, &ry, &B_is_usable, &B, &sqrt_A2mx2, &new_x);
+    if (B_is_usable) {
+        if (sx == 0) {
+            rx = npy_acos(B);
+        }
+        else {
+            rx = npy_acos(-B);
+        }
+    }
+    else {
+        if (sx == 0) {
+            rx = npy_atan2(sqrt_A2mx2, new_x);
+        }
+        else {
+            rx = npy_atan2(sqrt_A2mx2, -new_x);
+        }
+    }
+    /* The imaginary part is negated when the sign bit of y is clear. */
+    if (sy == 0) {
+        ry = -ry;
+    }
+    return npy_cpack(rx, ry);
+}
+#endif
+
+#ifndef HAVE_CASIN
+/*
+ * Complex arc sine, expressed through npy_casinh():
+ *
+ *   casin(z) = I * conj( casinh(I * conj(z)) )
+ *
+ * Both the inner and the outer transformation amount to exchanging the
+ * real and imaginary components.
+ */
+npy_cdouble
+npy_casin(npy_cdouble z)
+{
+    const npy_cdouble swapped_in = npy_cpack(npy_cimag(z), npy_creal(z));
+    const npy_cdouble swapped_out = npy_casinh(swapped_in);
+
+    return npy_cpack(npy_cimag(swapped_out), npy_creal(swapped_out));
+}
+#endif
+
+#ifndef HAVE_CATAN
+/*
+ * Complex arc tangent, expressed through npy_catanh():
+ *
+ *   catan(z) = I * conj( catanh(I * conj(z)) )
+ *
+ * Both the inner and the outer transformation amount to exchanging the
+ * real and imaginary components.
+ */
+npy_cdouble
+npy_catan(npy_cdouble z)
+{
+    const npy_cdouble swapped_in = npy_cpack(npy_cimag(z), npy_creal(z));
+    const npy_cdouble swapped_out = npy_catanh(swapped_in);
+
+    return npy_cpack(npy_cimag(swapped_out), npy_creal(swapped_out));
+}
+#endif
+
+#ifndef HAVE_CACOSH
+/*
+ * Complex inverse hyperbolic cosine, derived from npy_cacos():
+ *
+ *   cacosh(z) = I*cacos(z) or -I*cacos(z)
+ *
+ * with the sign picked so that Re(cacosh(z)) >= 0.
+ */
+npy_cdouble
+npy_cacosh(npy_cdouble z)
+{
+    const npy_cdouble acos_z = npy_cacos(z);
+    const npy_double acos_re = npy_creal(acos_z);
+    const npy_double acos_im = npy_cimag(acos_z);
+
+    if (npy_isnan(acos_re)) {
+        if (npy_isnan(acos_im)) {
+            /* cacosh(NaN + I*NaN) = NaN + I*NaN */
+            return npy_cpack(acos_im, acos_re);
+        }
+        /*
+         * cacosh(NaN + I*+-Inf) = +Inf + I*NaN
+         * cacosh(+-Inf + I*NaN) = +Inf + I*NaN
+         */
+        return npy_cpack(npy_fabs(acos_im), acos_re);
+    }
+    if (npy_isnan(acos_im)) {
+        /* cacosh(0 + I*NaN) = NaN + I*NaN */
+        return npy_cpack(acos_im, acos_im);
+    }
+    /* Regular case: non-negative real part, imaginary sign taken from z. */
+    return npy_cpack(npy_fabs(acos_im),
+                     npy_copysign(acos_re, npy_cimag(z)));
+}
+#endif
+
+#ifndef HAVE_CASINH
+/*
+ * casinh(z) = z + O(z^3)   as z -> 0
+ *
+ * casinh(z) = sign(x)*clog(sign(x)*z) + O(1/z^2)   as z -> infinity
+ * The above formula works for the imaginary part as well, because
+ * Im(casinh(z)) = sign(x)*atan2(sign(x)*y, fabs(x)) + O(y/z^3)
+ *    as z -> infinity, uniformly in y
+ */
+/*
+ * npy_casinh -- inverse hyperbolic sine of the double-precision complex
+ * z = x + I*y.
+ *
+ * Cases, in order: NaN inputs, |x| or |y| above RECIP_EPSILON (logarithm
+ * based), exact zero, tiny arguments (result is z itself) and the general
+ * case through _do_hard_work().
+ */
+npy_cdouble
+npy_casinh(npy_cdouble z)
+{
+#if 2 == 1
+    /* this is sqrt(6*EPS) */
+    const npy_float SQRT_6_EPSILON = 8.4572793338e-4f;
+#endif
+#if 2 == 2
+    const npy_double SQRT_6_EPSILON = 3.65002414998885671e-08;
+#endif
+#if 2 == 3
+    const npy_longdouble SQRT_6_EPSILON = 8.0654900873493277169e-10l;
+#endif
+    const npy_double RECIP_EPSILON = 1.0 / DBL_EPSILON;
+    npy_double x, y, ax, ay;
+    npy_double log_re, log_im;
+    npy_double res_re, res_im;
+    npy_double ratio, root_term, scaled_y;
+    npy_int ratio_ok;
+
+    x = npy_creal(z);
+    ax = npy_fabs(x);
+    y = npy_cimag(z);
+    ay = npy_fabs(y);
+
+    if (npy_isnan(x) || npy_isnan(y)) {
+        if (npy_isinf(x)) {
+            /* casinh(+-Inf + I*NaN) = +-Inf + I*NaN */
+            return npy_cpack(x, y + y);
+        }
+        else if (npy_isinf(y)) {
+            /* casinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */
+            return npy_cpack(y, x + x);
+        }
+        else if (y == 0) {
+            /* casinh(NaN + I*0) = NaN + I*0 */
+            return npy_cpack(x + x, y);
+        }
+        /*
+         * Every other NaN combination yields NaN + I*NaN.  C99 makes
+         * raising invalid optional when one argument is not NaN; it is
+         * deliberately not raised here.
+         */
+        return npy_cpack(NPY_NAN, NPY_NAN);
+    }
+
+    if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
+        /* clog...() will raise inexact unless x or y is infinite. */
+        if (npy_signbit(x) == 0) {
+            _clog_for_large_values(x, y, &log_re, &log_im);
+        }
+        else {
+            _clog_for_large_values(-x, -y, &log_re, &log_im);
+        }
+        log_re += NPY_LOGE2;
+        return npy_cpack(npy_copysign(log_re, x), npy_copysign(log_im, y));
+    }
+
+    /* z = 0 is returned as is, so that inexact is not raised for it. */
+    if (x == 0 && y == 0) {
+        return (z);
+    }
+
+    /* Everything from here on is inexact. */
+    raise_inexact();
+
+    /* casinh(z) = z + O(z^3): for tiny z the argument is the answer. */
+    if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4) {
+        return (z);
+    }
+
+    _do_hard_work(ax, ay, &res_re, &ratio_ok, &ratio, &root_term, &scaled_y);
+    res_im = ratio_ok ? npy_asin(ratio) : npy_atan2(scaled_y, root_term);
+
+    return npy_cpack(npy_copysign(res_re, x), npy_copysign(res_im, y));
+}
+#endif
+
+#ifndef HAVE_CATANH
+/*
+ * _sum_squares(x, y) returns x*x + y*y, or only x*x when y is below
+ * SQRT_MIN and y*y would therefore underflow.
+ *
+ * Preconditions stated by the original author:
+ *   - x*x and y*y do not overflow;
+ *   - x and y are finite;
+ *   - y is non-negative;
+ *   - fabs(x) >= DBL_EPSILON.
+ */
+static inline npy_double
+_sum_squares(npy_double x, npy_double y)
+{
+#if 2 == 1
+    const npy_float SQRT_MIN = 1.0842022e-19f;
+#endif
+#if 2 == 2
+    const npy_double SQRT_MIN = 1.4916681462400413e-154; /* sqrt(DBL_MIN) */
+#endif
+#if 2 == 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    const npy_longdouble SQRT_MIN = 1.4916681462400413e-154; /* sqrt(DBL_MIN) */
+#else
+    /* this is correct for 80 bit long doubles */
+    const npy_longdouble SQRT_MIN = 1.8336038675548471656e-2466l;
+#endif
+#endif
+
+    /* Drop the y*y term when it is too small to square safely. */
+    return (y < SQRT_MIN) ? (x * x) : (x * x + y * y);
+}
+
+/*
+ * real_part_reciprocal(x, y) = Re(1/(x+I*y)) = x/(x*x + y*y).
+ * Assumes x and y are not NaN, and one of x and y is larger than
+ * RECIP_EPSILON.  We avoid unwarranted underflow.  It is important to not use
+ * the code creal(1/z), because the imaginary part may produce an unwanted
+ * underflow.
+ * This is only called in a context where inexact is always raised before
+ * the call, so no effort is made to avoid or force inexact.
+ */
+#if 2 == 1
+#define BIAS (FLT_MAX_EXP - 1)
+#define CUTOFF (FLT_MANT_DIG / 2 + 1)
+/*
+ * Single-precision Re(1/(x + I*y)).  This variant sits inside
+ * "#if 2 == 1", so it is not compiled in this instantiation.
+ */
+static inline npy_float
+_real_part_reciprocalf(npy_float x, npy_float y)
+{
+    npy_float scale;
+    npy_uint32 hx, hy;
+    npy_int32 ix, iy;
+
+    /* ix / iy: the words of x / y masked with 0x7f800000 (exponent field). */
+    GET_FLOAT_WORD(hx, x);
+    ix = hx & 0x7f800000;
+    GET_FLOAT_WORD(hy, y);
+    iy = hy & 0x7f800000;
+    /* x dominates y (or is infinite): the y*y term is dropped. */
+    if (ix - iy >= CUTOFF << 23 || npy_isinf(x)) {
+        return (1 / x);
+    }
+    /* y dominates x: the x*x term is dropped. */
+    if (iy - ix >= CUTOFF << 23) {
+        return (x / y / y);
+    }
+    /* Exponent small enough to use the direct formula. */
+    if (ix <= (BIAS + FLT_MAX_EXP / 2 - CUTOFF) << 23) {
+        return (x / (x * x + y * y));
+    }
+    /* Otherwise rescale x and y by a power of two built from ix. */
+    SET_FLOAT_WORD(scale, 0x7f800000 - ix);
+    x *= scale;
+    y *= scale;
+    return (x / (x * x + y * y) * scale);
+}
+#undef BIAS
+#undef CUTOFF
+#endif
+
+#if 2 == 2
+#define BIAS (DBL_MAX_EXP - 1)
+/*  more guard digits are useful iff there is extra precision. */
+#define CUTOFF (DBL_MANT_DIG / 2 + 1)  /* just half or 1 guard digit */
+/*
+ * Double-precision Re(1/(x + I*y)) = x/(x*x + y*y), choosing between
+ * simplified and rescaled formulas according to the exponent fields of
+ * x and y.
+ */
+static inline npy_double
+_real_part_reciprocal(npy_double x, npy_double y)
+{
+    npy_double scale;
+    npy_uint32 hx, hy;
+    npy_int32 ix, iy;
+
+    /*
+     * This code is inspired by the C99 document n1124.pdf, Section G.5.1,
+     * example 2.
+     */
+    /* ix / iy: high words of x / y masked with 0x7ff00000 (exponent field). */
+    GET_HIGH_WORD(hx, x);
+    ix = hx & 0x7ff00000;
+    GET_HIGH_WORD(hy, y);
+    iy = hy & 0x7ff00000;
+    /* x dominates y (or is infinite): the y*y term is dropped. */
+    if (ix - iy >= CUTOFF << 20 || npy_isinf(x)) {
+        /* +-Inf -> +-0 is special */
+        return (1 / x);
+    }
+    /* y dominates x: the x*x term is dropped. */
+    if (iy - ix >= CUTOFF << 20) {
+        /* should avoid double div, but hard */
+        return (x / y / y);
+    }
+    /* Exponent small enough to use the direct formula. */
+    if (ix <= (BIAS + DBL_MAX_EXP / 2 - CUTOFF) << 20) {
+        return (x / (x * x + y * y));
+    }
+    /* Otherwise rescale x and y by a power of two before squaring. */
+    scale = 1;
+    SET_HIGH_WORD(scale, 0x7ff00000 - ix);  /* 2**(1-ilogb(x)) */
+    x *= scale;
+    y *= scale;
+    return (x / (x * x + y * y) * scale);
+}
+#undef BIAS
+#undef CUTOFF
+#endif
+
+#if 2 == 3
+#if !defined(HAVE_LDOUBLE_DOUBLE_DOUBLE_BE) && \
+    !defined(HAVE_LDOUBLE_DOUBLE_DOUBLE_LE)
+
+#define BIAS (LDBL_MAX_EXP - 1)
+#define CUTOFF (LDBL_MANT_DIG / 2 + 1)
+/*
+ * Long-double Re(1/(x + I*y)), used when long double is not a
+ * double-double format.  This variant sits inside "#if 2 == 3", so it is
+ * not compiled in this instantiation.
+ */
+static inline npy_longdouble
+_real_part_reciprocall(npy_longdouble x,
+    npy_longdouble y)
+{
+    npy_longdouble scale;
+    union IEEEl2bitsrep ux, uy, us;
+    npy_int32 ix, iy;
+
+    /* ix / iy: exponents of x / y as extracted by GET_LDOUBLE_EXP. */
+    ux.e = x;
+    ix = GET_LDOUBLE_EXP(ux);
+    uy.e = y;
+    iy = GET_LDOUBLE_EXP(uy);
+    /* x dominates y (or is infinite): the y*y term is dropped. */
+    if (ix - iy >= CUTOFF || npy_isinf(x)) {
+        return (1/x);
+    }
+    /* y dominates x: the x*x term is dropped. */
+    if (iy - ix >= CUTOFF) {
+        return (x/y/y);
+    }
+    /* Exponent small enough to use the direct formula. */
+    if (ix <= BIAS + LDBL_MAX_EXP / 2 - CUTOFF) {
+        return (x/(x*x + y*y));
+    }
+    /* Otherwise build a scale factor from 1 with exponent 0x7fff - ix. */
+    us.e = 1;
+    SET_LDOUBLE_EXP(us, 0x7fff - ix);
+    scale = us.e;
+    x *= scale;
+    y *= scale;
+    return (x/(x*x + y*y) * scale);
+}
+#undef BIAS
+#undef CUTOFF
+
+#else
+
+/*
+ * Fallback for double-double long double formats: evaluates
+ * Re(1/(x + I*y)) = x/(x*x + y*y) directly, without any rescaling.
+ */
+static inline npy_longdouble
+_real_part_reciprocall(npy_longdouble x, npy_longdouble y)
+{
+    const npy_longdouble norm_sq = x*x + y*y;
+
+    return x/norm_sq;
+}
+
+#endif
+#endif
+
+/*
+ * npy_catanh -- inverse hyperbolic tangent of the double-precision
+ * complex z = x + I*y.
+ *
+ * Cases, in order: real arguments with |x| <= 1, purely imaginary
+ * arguments, NaN inputs, |x| or |y| above RECIP_EPSILON, tiny arguments,
+ * and the general formulas for the real and imaginary parts.
+ */
+npy_cdouble
+npy_catanh(npy_cdouble z)
+{
+#if 2 == 1
+    /* this is sqrt(3*EPS) */
+    const npy_float SQRT_3_EPSILON = 5.9801995673e-4f;
+    /* chosen such that pio2_hi + pio2_lo == pio2_hi but causes FE_INEXACT. */
+    const volatile npy_float pio2_lo = 7.5497899549e-9f;
+#endif
+#if 2 == 2
+    const npy_double SQRT_3_EPSILON = 2.5809568279517849e-8;
+    const volatile npy_double pio2_lo = 6.1232339957367659e-17;
+#endif
+#if 2 == 3
+    const npy_longdouble SQRT_3_EPSILON = 5.70316273435758915310e-10l;
+    const volatile npy_longdouble pio2_lo = 2.710505431213761085e-20l;
+#endif
+    const npy_double RECIP_EPSILON = 1.0 / DBL_EPSILON;
+    const npy_double pio2_hi = NPY_PI_2;
+    npy_double x, y, ax, ay, rx, ry;
+
+    x = npy_creal(z);
+    y = npy_cimag(z);
+    ax = npy_fabs(x);
+    ay = npy_fabs(y);
+
+    /* This helps handle many cases. */
+    if (y == 0 && ax <= 1) {
+        return npy_cpack(npy_atanh(x), y);
+    }
+
+    /* To ensure the same accuracy as atan(), and to filter out z = 0. */
+    if (x == 0) {
+        return npy_cpack(x, npy_atan(y));
+    }
+
+    if (npy_isnan(x) || npy_isnan(y)) {
+        /* catanh(+-Inf + I*NaN) = +-0 + I*NaN */
+        if (npy_isinf(x)) {
+            return npy_cpack(npy_copysign(0, x), y + y);
+        }
+        /* catanh(NaN + I*+-Inf) = sign(NaN)0 + I*+-PI/2 */
+        if (npy_isinf(y)) {
+            return npy_cpack(npy_copysign(0, x),
+                npy_copysign(pio2_hi + pio2_lo, y));
+        }
+        /*
+         * All other cases involving NaN return NaN + I*NaN.
+         * C99 leaves it optional whether to raise invalid if one of
+         * the arguments is not NaN, so we opt not to raise it.
+         */
+        return npy_cpack(NPY_NAN, NPY_NAN);
+    }
+
+    /* Large |z|: real part is Re(1/z), imaginary part is +-PI/2. */
+    if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
+        return npy_cpack(_real_part_reciprocal(x, y),
+            npy_copysign(pio2_hi + pio2_lo, y));
+    }
+
+    if (ax < SQRT_3_EPSILON / 2 && ay < SQRT_3_EPSILON / 2) {
+        /*
+         * z = 0 was filtered out above.  All other cases must raise
+         * inexact, but this is the only one that needs to do it
+         * explicitly.
+         */
+        raise_inexact();
+        return (z);
+    }
+
+    /* Real part, computed from |x| and |y|; the sign is restored below. */
+    if (ax == 1 && ay < DBL_EPSILON) {
+        rx = (NPY_LOGE2 - npy_log(ay)) / 2;
+    }
+    else {
+        rx = npy_log1p(4 * ax / _sum_squares(ax - 1, ay)) / 4;
+    }
+
+    /* Imaginary part, likewise computed from the magnitudes. */
+    if (ax == 1) {
+        ry = npy_atan2(2, -ay) / 2;
+    }
+    else if (ay < DBL_EPSILON) {
+        /* ay * ay is left out of the second argument on this path. */
+        ry = npy_atan2(2 * ay, (1 - ax) * (1 + ax)) / 2;
+    }
+    else {
+        ry = npy_atan2(2 * ay, (1 - ax) * (1 + ax) - ay * ay) / 2;
+    }
+
+    return npy_cpack(npy_copysign(rx, x), npy_copysign(ry, y));
+}
+#endif
+
+#line 63
+
+/*==========================================================
+ * Constants
+ *=========================================================*/
+/* Long-double complex constant initialised from the pair {1.0L, 0.0}. */
+static const npy_clongdouble c_1l = {1.0L, 0.0};
+
+/*==========================================================
+ * Helper functions
+ *
+ * These are necessary because we do not count on using a
+ * C99 compiler.
+ *=========================================================*/
+/*
+ * Product of two long-double complex numbers using the textbook formula
+ * (ar + I*ai) * (br + I*bi) = (ar*br - ai*bi) + I*(ar*bi + ai*br).
+ */
+static inline
+npy_clongdouble
+cmull(npy_clongdouble a, npy_clongdouble b)
+{
+    const npy_longdouble ar = npy_creall(a);
+    const npy_longdouble ai = npy_cimagl(a);
+    const npy_longdouble br = npy_creall(b);
+    const npy_longdouble bi = npy_cimagl(b);
+    const npy_longdouble prod_re = ar*br - ai*bi;
+    const npy_longdouble prod_im = ar*bi + ai*br;
+
+    return npy_cpackl(prod_re, prod_im);
+}
+
+/*
+ * Quotient a / b of two long-double complex numbers.
+ *
+ * The division is organised around the component of b with the larger
+ * magnitude.  A divisor whose components are both zero is divided through
+ * component-wise so that the result is a complex inf or nan.
+ */
+static inline
+npy_clongdouble
+cdivl(npy_clongdouble a, npy_clongdouble b)
+{
+    const npy_longdouble ar = npy_creall(a);
+    const npy_longdouble ai = npy_cimagl(a);
+    const npy_longdouble br = npy_creall(b);
+    const npy_longdouble bi = npy_cimagl(b);
+    const npy_longdouble mag_br = npy_fabsl(br);
+    const npy_longdouble mag_bi = npy_fabsl(bi);
+    npy_longdouble ratio, inv_denom;
+
+    if (!(mag_br >= mag_bi)) {
+        /* Imaginary component of b is the pivot. */
+        ratio = br/bi;
+        inv_denom = 1.0L/(bi + br*ratio);
+        return npy_cpackl((ar*ratio + ai)*inv_denom,
+                          (ai*ratio - ar)*inv_denom);
+    }
+
+    if (mag_br == 0 && mag_bi == 0) {
+        /* divide by zeros should yield a complex inf or nan */
+        return npy_cpackl(ar/mag_br, ai/mag_bi);
+    }
+
+    /* Real component of b is the pivot. */
+    ratio = bi/br;
+    inv_denom = 1.0L/(br+bi*ratio);
+    return npy_cpackl((ar + ai*ratio)*inv_denom,
+                      (ai - ar*ratio)*inv_denom);
+}
+
+/*==========================================================
+ * Custom implementation of missing complex C99 functions
+ *=========================================================*/
+
+#ifndef HAVE_CABSL
+/* Modulus of z: hypot of its real and imaginary components. */
+npy_longdouble
+npy_cabsl(npy_clongdouble z)
+{
+    const npy_longdouble re = npy_creall(z);
+    const npy_longdouble im = npy_cimagl(z);
+
+    return npy_hypotl(re, im);
+}
+#endif
+
+#ifndef HAVE_CARGL
+/* Argument (phase angle) of z: atan2(imaginary part, real part). */
+npy_longdouble
+npy_cargl(npy_clongdouble z)
+{
+    const npy_longdouble im = npy_cimagl(z);
+    const npy_longdouble re = npy_creall(z);
+
+    return npy_atan2l(im, re);
+}
+#endif
+
+/*
+ * cexp and (ccos, csin)h functions need to calculate exp scaled by another
+ * number.  This can be difficult if exp(x) overflows.  By doing this way, we
+ * don't risk overflowing exp. This likely raises floating-point exceptions,
+ * if we decide that we care.
+ *
+ * This is only useful over a limited range (see below) and expects that the
+ * input values are in this range.
+ *
+ * This is based on the technique used in FreeBSD's __frexp_exp and
+ * __ldexp_(c)exp functions by David Schultz.
+ *
+ * SCALED_CEXP_LOWER = log(FLT_MAX)
+ * SCALED_CEXP_UPPER = log(2) + log(FLT_MAX) - log(FLT_TRUE_MIN),
+ * where FLT_TRUE_MIN is the smallest possible subnormal number.
+ */
+
+#define SCALED_CEXP_LOWERF 88.722839f
+#define SCALED_CEXP_UPPERF 192.69492f
+#define SCALED_CEXP_LOWER 710.47586007394386
+#define SCALED_CEXP_UPPER 1454.9159319953251
+#define SCALED_CEXP_LOWERL 11357.216553474703895L
+#define SCALED_CEXP_UPPERL 22756.021937783004509L
+
+#if !defined(HAVE_CSINHL) || \
+    !defined(HAVE_CCOSHL) || \
+    !defined(HAVE_CEXPL)
+
+/*
+ * Computes exp(x) * (cos(y) + I*sin(y)) scaled by 2**expt.
+ *
+ * exp(x - k*ln2), cos(y) and sin(y) are each split with frexp into a
+ * mantissa and a binary exponent; the mantissas are multiplied and the
+ * exponents (plus k and expt) are re-applied with a single ldexp per
+ * component.
+ */
+static
+npy_clongdouble
+_npy_scaled_cexpl(npy_longdouble x, npy_longdouble y, npy_int expt)
+{
+#if 3 == 1
+    const npy_int k = 235;
+#endif
+#if 3 == 2
+    const npy_int k = 1799;
+#endif
+#if 3 == 3
+    const npy_int k = 19547;
+#endif
+    const npy_longdouble kln2 = k * NPY_LOGE2l;
+    npy_longdouble frac_exp, frac_cos, frac_sin;
+    npy_longdouble out_re, out_im;
+    npy_int pow_exp, pow_cos, pow_sin;
+
+    frac_exp = npy_frexpl(npy_expl(x - kln2), &pow_exp);
+    frac_cos = npy_frexpl(npy_cosl(y), &pow_cos);
+    frac_sin = npy_frexpl(npy_sinl(y), &pow_sin);
+
+    /* Fold the exponent of exp(x - k*ln2) and k itself into expt. */
+    expt += pow_exp + k;
+
+    out_re = npy_ldexpl(frac_exp * frac_cos, expt + pow_cos);
+    out_im = npy_ldexpl(frac_exp * frac_sin, expt + pow_sin);
+    return npy_cpackl(out_re, out_im);
+}
+
+#endif
+
+#ifndef HAVE_CEXPL
+
+/*
+ * Complex exponential of z = re + I*im.
+ *
+ * Finite real parts use exp(re)*(cos(im) + I*sin(im)), switching to the
+ * scaled helper when re lies in [SCALED_CEXP_LOWERL, SCALED_CEXP_UPPERL].
+ * NaN and infinite real parts are dispatched to dedicated cases.
+ */
+npy_clongdouble
+npy_cexpl(npy_clongdouble z)
+{
+    npy_longdouble re = npy_creall(z);
+    npy_longdouble im = npy_cimagl(z);
+    npy_longdouble mag, cosv, sinv;
+
+    if (npy_isfinite(re)) {
+        if (re >= SCALED_CEXP_LOWERL && re <= SCALED_CEXP_UPPERL) {
+            return _npy_scaled_cexpl(re, im, 0);
+        }
+
+        /* cos/sin are evaluated before the finiteness test on im */
+        mag = npy_expl(re);
+        cosv = npy_cosl(im);
+        sinv = npy_sinl(im);
+
+        if (!npy_isfinite(im)) {
+            return npy_cpackl(NPY_NANL, npy_copysignl(NPY_NANL, im));
+        }
+        return npy_cpackl(mag * cosv, mag * sinv);
+    }
+
+    if (npy_isnan(re)) {
+        /* NaN real part: pass z through when the imaginary part is zero */
+        if (im == 0) {
+            return z;
+        }
+        return npy_cpackl(re, npy_copysignl(NPY_NANL, im));
+    }
+
+    /* re is +-inf from here on */
+    if (re > 0) {
+        if (im == 0) {
+            return npy_cpackl(re, im);
+        }
+        if (npy_isfinite(im)) {
+            cosv = npy_cosl(im);
+            sinv = npy_sinl(im);
+            return npy_cpackl(re * cosv, re * sinv);
+        }
+        /* re = +inf, im = +-inf or nan */
+        npy_set_floatstatus_invalid();
+        return npy_cpackl(re, NPY_NANL);
+    }
+
+    /* re = -inf */
+    if (npy_isfinite(im)) {
+        mag = npy_expl(re);
+        cosv = npy_cosl(im);
+        sinv = npy_sinl(im);
+        return npy_cpackl(mag * cosv, mag * sinv);
+    }
+    /* re = -inf, im = nan or +-inf */
+    return npy_cpackl(0, 0);
+}
+#endif
+
+#ifndef HAVE_CLOGL
+/* algorithm from cpython, rev. d86f5686cef9
+ *
+ * The usual formula for the real part is log(hypot(z.real, z.imag)).
+ * There are four situations where this formula is potentially
+ * problematic:
+ *
+ * (1) the absolute value of z is subnormal.  Then hypot is subnormal,
+ * so has fewer than the usual number of bits of accuracy, hence may
+ * have large relative error.  This then gives a large absolute error
+ * in the log.  This can be solved by rescaling z by a suitable power
+ * of 2.
+ *
+ * (2) the absolute value of z is greater than DBL_MAX (e.g. when both
+ * z.real and z.imag are within a factor of 1/sqrt(2) of DBL_MAX)
+ * Again, rescaling solves this.
+ *
+ * (3) the absolute value of z is close to 1.  In this case it's
+ * difficult to achieve good accuracy, at least in part because a
+ * change of 1ulp in the real or imaginary part of z can result in a
+ * change of billions of ulps in the correctly rounded answer.
+ *
+ * (4) z = 0.  The simplest thing to do here is to call the
+ * floating-point log with an argument of 0, and let its behaviour
+ * (returning -infinity, signaling a floating-point exception, setting
+ * errno, or whatever) determine that of c_log.  So the usual formula
+ * is fine here.
+*/
+/*
+ * Complex natural logarithm: real part log|z|, imaginary part carg(z).
+ * The real part is computed by one of several formulas chosen by the size
+ * of |Re z| and |Im z|.
+ */
+npy_clongdouble
+npy_clogl(npy_clongdouble z)
+{
+    npy_longdouble abs_re = npy_fabsl(npy_creall(z));
+    npy_longdouble abs_im = npy_fabsl(npy_cimagl(z));
+    npy_longdouble log_mag, arg;
+
+    if (abs_re > LDBL_MAX/4 || abs_im > LDBL_MAX/4) {
+        /* halve both parts before hypot, then add log(2) back */
+        log_mag = npy_logl(npy_hypotl(abs_re/2, abs_im/2)) + NPY_LOGE2l;
+    }
+    else if (abs_re < LDBL_MIN && abs_im < LDBL_MIN) {
+        if (!(abs_re > 0 || abs_im > 0)) {
+            /*
+             * log(+/-0 +/- 0i): dividing by the zero real part raises the
+             * divide-by-zero exception; the sign is then forced negative.
+             */
+            log_mag = -1.0l / npy_creall(z);
+            log_mag = npy_copysignl(log_mag, -1);
+            arg = npy_cargl(z);
+            return npy_cpackl(log_mag, arg);
+        }
+        /* scale up by 2**LDBL_MANT_DIG so hypot is not subnormal */
+        log_mag = npy_logl(npy_hypotl(npy_ldexpl(abs_re, LDBL_MANT_DIG),
+                 npy_ldexpl(abs_im, LDBL_MANT_DIG))) - LDBL_MANT_DIG*NPY_LOGE2l;
+    }
+    else {
+        npy_longdouble mag = npy_hypotl(abs_re, abs_im);
+
+        if (0.71 <= mag && mag <= 1.73) {
+            /* |z| near 1: use log1p on (|z|^2 - 1) */
+            npy_longdouble larger, smaller;
+
+            if (abs_re > abs_im) {
+                larger = abs_re;
+                smaller = abs_im;
+            }
+            else {
+                larger = abs_im;
+                smaller = abs_re;
+            }
+            log_mag = npy_log1pl((larger-1)*(larger+1)+smaller*smaller)/2;
+        }
+        else {
+            log_mag = npy_logl(mag);
+        }
+    }
+
+    arg = npy_cargl(z);
+    return npy_cpackl(log_mag, arg);
+}
+#endif
+
+#ifndef HAVE_CSQRTL
+
+/* We risk spurious overflow for components >= LDBL_MAX / (1 + sqrt(2)). */
+#define THRESH  (LDBL_MAX / (1 + NPY_SQRT2l))
+
+/*
+ * Principal square root of the long double complex number z = a + I*b.
+ *
+ * Zeros, infinities and NaN real parts are handled explicitly first.  The
+ * remaining inputs go through Algorithm 312 (CACM vol 10, Oct 1967).  When
+ * either component is at least THRESH in magnitude, both are multiplied by
+ * 1/4 beforehand to avoid overflow, and the computed root is doubled
+ * afterwards, since sqrt(z/4) == sqrt(z)/2.
+ */
+npy_clongdouble
+npy_csqrtl(npy_clongdouble z)
+{
+    npy_clongdouble result;
+    npy_longdouble a, b;
+    npy_longdouble t;
+    int scale;
+
+    a = npy_creall(z);
+    b = npy_cimagl(z);
+
+    /* Handle special cases. */
+    if (a == 0 && b == 0) {
+        /* keep the (possibly signed) zero imaginary part */
+        return (npy_cpackl(0, b));
+    }
+    if (npy_isinf(b)) {
+        return (npy_cpackl(NPY_INFINITYL, b));
+    }
+    if (npy_isnan(a)) {
+        t = (b - b) / (b - b);  /* raise invalid if b is not a NaN */
+        return (npy_cpackl(a, t));    /* return NaN + NaN i */
+    }
+    if (npy_isinf(a)) {
+        /*
+         * csqrt(inf + NaN i)  = inf +  NaN i
+         * csqrt(inf + y i)    = inf +  0 i
+         * csqrt(-inf + NaN i) = NaN +- inf i
+         * csqrt(-inf + y i)   = 0   +  inf i
+         */
+        if (npy_signbit(a)) {
+            return (npy_cpackl(npy_fabsl(b - b), npy_copysignl(a, b)));
+        }
+        else {
+            return (npy_cpackl(a, npy_copysignl(b - b, b)));
+        }
+    }
+    /*
+     * The remaining special case (b is NaN) is handled just fine by
+     * the normal code path below.
+     */
+
+    /* Scale to avoid overflow. */
+    if (npy_fabsl(a) >= THRESH || npy_fabsl(b) >= THRESH) {
+        a *= 0.25;
+        b *= 0.25;
+        scale = 1;
+    }
+    else {
+        scale = 0;
+    }
+
+    /* Algorithm 312, CACM vol 10, Oct 1967. */
+    if (a >= 0) {
+        t = npy_sqrtl((a + npy_hypotl(a, b)) * 0.5l);
+        result = npy_cpackl(t, b / (2 * t));
+    }
+    else {
+        t = npy_sqrtl((-a + npy_hypotl(a, b)) * 0.5l);
+        result = npy_cpackl(npy_fabsl(b) / (2 * t), npy_copysignl(t, b));
+    }
+
+    /*
+     * Rescale.  The input was multiplied by 1/4, so the root computed above
+     * is half of the true root: BOTH components must be doubled.
+     */
+    if (scale) {
+        return (npy_cpackl(npy_creall(result) * 2, npy_cimagl(result) * 2));
+    }
+    else {
+        return (result);
+    }
+}
+#undef THRESH
+#endif
+
+/*
+ * Always use this function because of the multiplication for small
+ * integer powers, but in the body use cpow if it is available.
+ */
+
+/* private function for use in npy_pow{f, ,l} */
+#ifdef HAVE_CPOWL
+/*
+ * Thin wrapper around the C library's cpowl: the operands and the result
+ * are passed through the npy_z / c99_z members of the cast helper type.
+ */
+static npy_clongdouble
+sys_cpowl(npy_clongdouble x, npy_clongdouble y)
+{
+    __npy_clongdouble_to_c99_cast base, expo, out;
+
+    base.npy_z = x;
+    expo.npy_z = y;
+    out.c99_z = cpowl(base.c99_z, expo.c99_z);
+
+    return out.npy_z;
+}
+#endif
+
+
+/*
+ * Complex power a**b for long double complex operands.
+ *
+ *  - b == 0 returns 1 (including 0**0).
+ *  - a == 0 returns 0 when Re(b) > 0; otherwise raises the invalid
+ *    floating-point exception and returns NaN + NaN i.
+ *  - Real integer exponents with |b| < 100 are evaluated by repeated
+ *    multiplication (binary exponentiation), inverting for negative b.
+ *  - Everything else goes to the system cpowl when HAVE_CPOWL is defined,
+ *    or to exp(b * log(a)) otherwise.
+ */
+npy_clongdouble
+npy_cpowl (npy_clongdouble a, npy_clongdouble b)
+{
+    npy_intp n;
+    npy_longdouble ar = npy_creall(a);
+    npy_longdouble br = npy_creall(b);
+    npy_longdouble ai = npy_cimagl(a);
+    npy_longdouble bi = npy_cimagl(b);
+    npy_clongdouble r;
+
+    /*
+     * Checking if in a^b, if b is zero.
+     * If a is not zero then by definition of logarithm a^0 is 1.
+     * If a is also zero then 0^0 is best defined as 1.
+     */
+    if (br == 0. && bi == 0.) {
+        return npy_cpackl(1., 0.);
+    }
+    /* case 0^b
+     * If a is a complex zero (ai=ar=0), then the result depends
+     * upon values of br and bi. The result is either:
+     * 0 (in magnitude), undefined or 1.
+     * The latter case is for br=bi=0 and independent of ar and ai
+     * (but is handled above).
+     */
+    else if (ar == 0. && ai == 0.) {
+        /*
+         * If the real part of b is positive (br>0) then this is
+         * the zero complex with positive sign on both the
+         * real and imaginary part.
+         */
+        if (br > 0) {
+            return npy_cpackl(0., 0.);
+        }
+        else {
+            /*
+             * Otherwise return a complex NaN and raise the invalid
+             * floating-point exception by computing inf - inf through a
+             * volatile, so the operation cannot be folded away.
+             */
+            volatile npy_longdouble tmp = NPY_INFINITYL;
+            tmp -= NPY_INFINITYL;
+            ar = tmp;
+
+            r = npy_cpackl(NPY_NANL, NPY_NANL);
+            return r;
+        }
+    }
+    /*
+     * Small real integer exponents.  The range test on br comes BEFORE the
+     * cast: converting a NaN, an infinity or an out-of-range value to
+     * npy_intp is undefined behaviour in C.  Only |n| < 100 was ever
+     * handled here, so results are unchanged for all defined inputs.
+     */
+    if (bi == 0 && br > -100 && br < 100 && (n = (npy_intp)br) == br) {
+        if (n == 1) {
+            /* unroll: handle inf better */
+            return npy_cpackl(ar, ai);
+        }
+        else if (n == 2) {
+            /* unroll: handle inf better */
+            return cmull(a, a);
+        }
+        else if (n == 3) {
+            /* unroll: handle inf better */
+            return cmull(a, cmull(a, a));
+        }
+        else {
+            /* binary exponentiation on |n|; invert at the end if b < 0 */
+            npy_clongdouble p, aa;
+            npy_intp mask = 1;
+            if (n < 0) {
+                n = -n;
+            }
+            aa = c_1l;
+            p = npy_cpackl(ar, ai);
+            while (1) {
+                if (n & mask) {
+                    aa = cmull(aa,p);
+                }
+                mask <<= 1;
+                if (n < mask || mask <= 0) {
+                    break;
+                }
+                p = cmull(p,p);
+            }
+            r = npy_cpackl(npy_creall(aa), npy_cimagl(aa));
+            if (br < 0) {
+                r = cdivl(c_1l, r);
+            }
+            return r;
+        }
+    }
+
+#ifdef HAVE_CPOWL
+    return sys_cpowl(a, b);
+
+#else
+    {
+        /* a**b = exp(b * log(a)) */
+        npy_clongdouble loga = npy_clogl(a);
+
+        ar = npy_creall(loga);
+        ai = npy_cimagl(loga);
+        return npy_cexpl(npy_cpackl(ar*br - ai*bi, ar*bi + ai*br));
+    }
+
+#endif
+}
+
+
+#ifndef HAVE_CCOSL
+/* Complex cosine via the identity ccos(z) = ccosh(I * z). */
+npy_clongdouble
+npy_ccosl(npy_clongdouble z)
+{
+    /* I * z = -Im(z) + I * Re(z) */
+    npy_clongdouble iz = npy_cpackl(-npy_cimagl(z), npy_creall(z));
+
+    return npy_ccoshl(iz);
+}
+#endif
+
+#ifndef HAVE_CSINL
+/* Complex sine via the identity csin(z) = -I * csinh(I * z). */
+npy_clongdouble
+npy_csinl(npy_clongdouble z)
+{
+    /* I * z = -Im(z) + I * Re(z) */
+    npy_clongdouble iz = npy_cpackl(-npy_cimagl(z), npy_creall(z));
+    npy_clongdouble w = npy_csinhl(iz);
+
+    /* -I * w = Im(w) - I * Re(w) */
+    return npy_cpackl(npy_cimagl(w), -npy_creall(w));
+}
+#endif
+
+#ifndef HAVE_CTANL
+/* Complex tangent via the identity ctan(z) = -I * ctanh(I * z). */
+npy_clongdouble
+npy_ctanl(npy_clongdouble z)
+{
+    /* I * z = -Im(z) + I * Re(z) */
+    npy_clongdouble iz = npy_cpackl(-npy_cimagl(z), npy_creall(z));
+    npy_clongdouble w = npy_ctanhl(iz);
+
+    /* -I * w = Im(w) - I * Re(w) */
+    return npy_cpackl(npy_cimagl(w), -npy_creall(w));
+}
+#endif
+
+#ifndef HAVE_CCOSHL
+/*
+ * Taken from the msun library in FreeBSD, rev 226599.
+ *
+ * Hyperbolic cosine of a complex argument z = x + i y.
+ *
+ * cosh(z) = cosh(x+iy)
+ *         = cosh(x) cos(y) + i sinh(x) sin(y).
+ *
+ * Exceptional values are noted in the comments within the source code.
+ * These values and the return value were taken from n1124.pdf.
+ *
+ * CCOSH_BIG is chosen such that
+ * spacing(0.5 * exp(CCOSH_BIG)) > 0.5*exp(-CCOSH_BIG)
+ * although the exact value assigned to CCOSH_BIG is not so important
+ */
+/*
+ * Long double complex hyperbolic cosine of z = x + I*y.
+ *
+ * Finite inputs are split by |x| into four ranges: the direct formula,
+ * exp(|x|)/2, the scaled exponential helper, and guaranteed overflow.
+ * Non-finite inputs are handled case by case below.
+ *
+ * NOTE(review): CCOSH_BIG / CCOSH_HUGE depend on NPY_SIZEOF_LONGDOUBLE, but
+ * SCALED_CEXP_LOWERL / SCALED_CEXP_UPPERL are fixed literals -- confirm the
+ * range thresholds are appropriate when long double has the size of double.
+ */
+npy_clongdouble
+npy_ccoshl(npy_clongdouble z)
+{
+#if 3 == 1
+    const npy_float CCOSH_BIG = 9.0f;
+    const npy_float CCOSH_HUGE = 1.70141183e+38f;
+#endif
+#if 3 == 2
+    const npy_double CCOSH_BIG = 22.0;
+    const npy_double CCOSH_HUGE = 8.9884656743115795e+307;
+#endif
+#if 3 >= 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    const npy_longdouble CCOSH_BIG = 22.0L;
+    const npy_longdouble CCOSH_HUGE = 8.9884656743115795e+307L;
+#else
+    const npy_longdouble CCOSH_BIG = 24.0L;
+    const npy_longdouble CCOSH_HUGE = 5.94865747678615882543e+4931L;
+#endif
+#endif
+
+    npy_longdouble  x, y, h, absx;
+    npy_int xfinite, yfinite;
+
+    x = npy_creall(z);
+    y = npy_cimagl(z);
+
+    xfinite = npy_isfinite(x);
+    yfinite = npy_isfinite(y);
+
+    /* Handle the nearly-non-exceptional cases where x and y are finite. */
+    if (xfinite && yfinite) {
+        if (y == 0) {
+            /* purely real argument; x * y yields the signed zero */
+            return npy_cpackl(npy_coshl(x), x * y);
+        }
+        absx = npy_fabsl(x);
+        if (absx < CCOSH_BIG) {
+            /* small x: normal case */
+            return npy_cpackl(npy_coshl(x) * npy_cosl(y),
+                                npy_sinhl(x) * npy_sinl(y));
+        }
+
+        /* |x| >= CCOSH_BIG, so cosh(x) ~= exp(|x|) / 2 */
+        if (absx < SCALED_CEXP_LOWERL) {
+            /* |x| < SCALED_CEXP_LOWERL: exp(|x|) won't overflow */
+            h = npy_expl(absx) * 0.5l;
+            return npy_cpackl(h * npy_cosl(y),
+                                npy_copysignl(h, x) * npy_sinl(y));
+        }
+        else if (absx < SCALED_CEXP_UPPERL) {
+            /* |x| < SCALED_CEXP_UPPERL: scale to avoid overflow */
+            z = _npy_scaled_cexpl(absx, y, -1);
+            return npy_cpackl(npy_creall(z),
+                                npy_cimagl(z) * npy_copysignl(1, x));
+        }
+        else {
+            /* |x| >= SCALED_CEXP_UPPERL: the result always overflows */
+            h = CCOSH_HUGE * x;
+            return npy_cpackl(h * h * npy_cosl(y), h * npy_sinl(y));
+        }
+    }
+
+    /*
+     * cosh(+-0 +- I Inf) = dNaN + I sign(d(+-0, dNaN))0.
+     * The sign of 0 in the result is unspecified.  Choice = normally
+     * the same as dNaN.  Raise the invalid floating-point exception.
+     *
+     * cosh(+-0 +- I NaN) = d(NaN) + I sign(d(+-0, NaN))0.
+     * The sign of 0 in the result is unspecified.  Choice = normally
+     * the same as d(NaN).
+     */
+    if (x == 0 && !yfinite) {
+        return npy_cpackl(y - y, npy_copysignl(0, x * (y - y)));
+    }
+
+    /*
+     * cosh(+-Inf +- I 0) = +Inf + I (+-)(+-)0.
+     *
+     * cosh(NaN +- I 0)   = d(NaN) + I sign(d(NaN, +-0))0.
+     * The sign of 0 in the result is unspecified.
+     */
+    if (y == 0 && !xfinite) {
+        return npy_cpackl(x * x, npy_copysignl(0, x) * y);
+    }
+
+    /*
+     * cosh(x +- I Inf) = dNaN + I dNaN.
+     * Raise the invalid floating-point exception for finite nonzero x.
+     *
+     * cosh(x + I NaN) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero x.  Choice = don't raise (except for signaling NaNs).
+     */
+    if (xfinite && !yfinite) {
+        return npy_cpackl(y - y, x * (y - y));
+    }
+
+    /*
+     * cosh(+-Inf + I NaN)  = +Inf + I d(NaN).
+     *
+     * cosh(+-Inf +- I Inf) = +Inf + I dNaN.
+     * The sign of Inf in the result is unspecified.  Choice = always +.
+     * Raise the invalid floating-point exception.
+     *
+     * cosh(+-Inf + I y)   = +Inf cos(y) +- I Inf sin(y)
+     */
+    if (npy_isinf(x)) {
+        if (!yfinite) {
+            return npy_cpackl(x * x, x * (y - y));
+        }
+        return npy_cpackl((x * x) * npy_cosl(y), x * npy_sinl(y));
+    }
+
+    /*
+     * cosh(NaN + I NaN)  = d(NaN) + I d(NaN).
+     *
+     * cosh(NaN +- I Inf) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception.
+     * Choice = raise.
+     *
+     * cosh(NaN + I y)    = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero y.  Choice = don't raise (except for signaling NaNs).
+     */
+    return npy_cpackl((x * x) * (y - y), (x + x) * (y - y));
+}
+#undef CCOSH_BIG
+#undef CCOSH_HUGE
+#endif
+
+#ifndef HAVE_CSINHL
+/*
+ * Taken from the msun library in FreeBSD, rev 226599.
+ *
+ * Hyperbolic sine of a complex argument z = x + i y.
+ *
+ * sinh(z) = sinh(x+iy)
+ *         = sinh(x) cos(y) + i cosh(x) sin(y).
+ *
+ * Exceptional values are noted in the comments within the source code.
+ * These values and the return value were taken from n1124.pdf.
+ */
+/*
+ * Long double complex hyperbolic sine of z = x + I*y.
+ *
+ * Finite inputs are split by |x| into four ranges: the direct formula,
+ * exp(|x|)/2, the scaled exponential helper, and guaranteed overflow.
+ * Every range test is made on |x| so that negative x takes the same path
+ * as positive x, with the sign re-applied through copysign.
+ * Non-finite inputs are handled case by case below.
+ */
+npy_clongdouble
+npy_csinhl(npy_clongdouble z)
+{
+#if 3 == 1
+    const npy_float CSINH_BIG = 9.0f;
+    const npy_float CSINH_HUGE = 1.70141183e+38f;
+#endif
+#if 3 == 2
+    const npy_double CSINH_BIG = 22.0;
+    const npy_double CSINH_HUGE = 8.9884656743115795e+307;
+#endif
+#if 3 >= 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    const npy_longdouble CSINH_BIG = 22.0L;
+    const npy_longdouble CSINH_HUGE = 8.9884656743115795e+307L;
+#else
+    const npy_longdouble CSINH_BIG = 24.0L;
+    const npy_longdouble CSINH_HUGE = 5.94865747678615882543e+4931L;
+#endif
+#endif
+
+    npy_longdouble x, y, h, absx;
+    npy_int xfinite, yfinite;
+
+    x = npy_creall(z);
+    y = npy_cimagl(z);
+
+    xfinite = npy_isfinite(x);
+    yfinite = npy_isfinite(y);
+
+    /* Handle the nearly-non-exceptional cases where x and y are finite. */
+    if (xfinite && yfinite) {
+        if (y == 0) {
+            /* purely real argument; the zero imaginary part is kept */
+            return npy_cpackl(npy_sinhl(x), y);
+        }
+        absx = npy_fabsl(x);
+        if (absx < CSINH_BIG) {
+            /* small x: normal case */
+            return npy_cpackl(npy_sinhl(x) * npy_cosl(y),
+                                npy_coshl(x) * npy_sinl(y));
+        }
+
+        /* |x| >= CSINH_BIG, so cosh(x) ~= exp(|x|) / 2 */
+        if (absx < SCALED_CEXP_LOWERL) {
+            /* |x| < SCALED_CEXP_LOWERL: exp(|x|) won't overflow */
+            h = npy_expl(npy_fabsl(x)) * 0.5l;
+            return npy_cpackl(npy_copysignl(h, x) * npy_cosl(y),
+                                h * npy_sinl(y));
+        }
+        else if (absx < SCALED_CEXP_UPPERL) {
+            /*
+             * |x| < SCALED_CEXP_UPPERL: scale to avoid overflow.
+             * The test must use |x|: a large negative x would otherwise
+             * always pass it and hand the helper an out-of-range argument.
+             */
+            z = _npy_scaled_cexpl(absx, y, -1);
+            return npy_cpackl(npy_creall(z) * npy_copysignl(1, x),
+                                npy_cimagl(z));
+        }
+        else {
+            /* |x| >= SCALED_CEXP_UPPERL: the result always overflows */
+            h = CSINH_HUGE * x;
+            return npy_cpackl(h * npy_cosl(y), h * h * npy_sinl(y));
+        }
+    }
+
+    /*
+     * sinh(+-0 +- I Inf) = sign(d(+-0, dNaN))0 + I dNaN.
+     * The sign of 0 in the result is unspecified.  Choice = normally
+     * the same as dNaN.  Raise the invalid floating-point exception.
+     *
+     * sinh(+-0 +- I NaN) = sign(d(+-0, NaN))0 + I d(NaN).
+     * The sign of 0 in the result is unspecified.  Choice = normally
+     * the same as d(NaN).
+     */
+    if (x == 0 && !yfinite) {
+        return npy_cpackl(npy_copysignl(0, x * (y - y)), y - y);
+    }
+
+    /*
+     * sinh(+-Inf +- I 0) = +-Inf + I +-0.
+     *
+     * sinh(NaN +- I 0)   = d(NaN) + I +-0.
+     */
+    if (y == 0 && !xfinite) {
+        if (npy_isnan(x)) {
+            return z;
+        }
+        return npy_cpackl(x, npy_copysignl(0, y));
+    }
+
+    /*
+     * sinh(x +- I Inf) = dNaN + I dNaN.
+     * Raise the invalid floating-point exception for finite nonzero x.
+     *
+     * sinh(x + I NaN) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero x.  Choice = don't raise (except for signaling NaNs).
+     */
+    if (xfinite && !yfinite) {
+        return npy_cpackl(y - y, x * (y - y));
+    }
+
+    /*
+     * sinh(+-Inf + I NaN)  = +-Inf + I d(NaN).
+     * The sign of Inf in the result is unspecified.  Choice = normally
+     * the same as d(NaN).
+     *
+     * sinh(+-Inf +- I Inf) = +Inf + I dNaN.
+     * The sign of Inf in the result is unspecified.  Choice = always +.
+     * Raise the invalid floating-point exception.
+     *
+     * sinh(+-Inf + I y)   = +-Inf cos(y) + I Inf sin(y)
+     */
+    if (!xfinite && !npy_isnan(x)) {
+        if (!yfinite) {
+            return npy_cpackl(x * x, x * (y - y));
+        }
+        return npy_cpackl(x * npy_cosl(y),
+                            NPY_INFINITYL * npy_sinl(y));
+    }
+
+    /*
+     * sinh(NaN + I NaN)  = d(NaN) + I d(NaN).
+     *
+     * sinh(NaN +- I Inf) = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception.
+     * Choice = raise.
+     *
+     * sinh(NaN + I y)    = d(NaN) + I d(NaN).
+     * Optionally raises the invalid floating-point exception for finite
+     * nonzero y.  Choice = don't raise (except for signaling NaNs).
+     */
+    return npy_cpackl((x * x) * (y - y), (x + x) * (y - y));
+}
+#undef CSINH_BIG
+#undef CSINH_HUGE
+#endif
+
+#ifndef HAVE_CTANHL
+/*
+ * Taken from the msun library in FreeBSD, rev 226600.
+ *
+ * Hyperbolic tangent of a complex argument z = x + i y.
+ *
+ * The algorithm is from:
+ *
+ *   W. Kahan.  Branch Cuts for Complex Elementary Functions or Much
+ *   Ado About Nothing's Sign Bit.  In The State of the Art in
+ *   Numerical Analysis, pp. 165 ff.  Iserles and Powell, eds., 1987.
+ *
+ * Method:
+ *
+ *   Let t    = tan(y)
+ *       beta = 1/cos^2(y)
+ *       s    = sinh(x)
+ *       rho  = cosh(x)
+ *
+ *   We have:
+ *
+ *   tanh(z) = sinh(z) / cosh(z)
+ *
+ *             sinh(x) cos(y) + i cosh(x) sin(y)
+ *           = ---------------------------------
+ *             cosh(x) cos(y) + i sinh(x) sin(y)
+ *
+ *             cosh(x) sinh(x) / cos^2(y) + i tan(y)
+ *           = -------------------------------------
+ *                    1 + sinh^2(x) / cos^2(y)
+ *
+ *             beta rho s + i t
+ *           = ----------------
+ *               1 + beta s^2
+ *
+ * Modifications:
+ *
+ *   I omitted the original algorithm's handling of overflow in tan(x) after
+ *   verifying with nearpi.c that this can't happen in IEEE single or double
+ *   precision.  I also handle large x differently.
+ */
+
+/*
+ * |x| thresholds at or above which ctanh switches to its large-|x|
+ * approximation.  The long double function below tests TANHL_HUGE only.
+ */
+#define TANH_HUGE 22.0
+#define TANHF_HUGE 11.0F
+#define TANHL_HUGE 42.0L
+
+/*
+ * Long double complex hyperbolic tangent of z = x + I*y.
+ *
+ * Non-finite x, non-finite y and |x| >= TANHL_HUGE are handled first;
+ * everything else uses Kahan's algorithm.
+ */
+npy_clongdouble
+npy_ctanhl(npy_clongdouble z)
+{
+    npy_longdouble x = npy_creall(z);
+    npy_longdouble y = npy_cimagl(z);
+    npy_longdouble tan_y, beta, sinh_x, cosh_x, denom;
+
+    /*
+     * ctanh(NaN + i 0) = NaN + i 0
+     * ctanh(NaN + i y) = NaN + i NaN        for y != 0
+     * ctanh(+-Inf +- i Inf) = +-1 +- 0
+     * ctanh(+-Inf + i y) = +-1 + 0 sin(2y)  for y finite
+     *
+     * The infinite-y case is special-cased only to avoid a spurious
+     * invalid exception from sin/cos.
+     */
+    if (!npy_isfinite(x)) {
+        npy_longdouble zero_sign;
+
+        if (npy_isnan(x)) {
+            if (y == 0) {
+                return npy_cpackl(x, y);
+            }
+            return npy_cpackl(x, x * y);
+        }
+
+        /* x is +-Inf: the imaginary part is a zero carrying a sign */
+        if (npy_isinf(y)) {
+            zero_sign = y;
+        }
+        else {
+            zero_sign = npy_sinl(y) * npy_cosl(y);
+        }
+        return npy_cpackl(npy_copysignl(1,x), npy_copysignl(0, zero_sign));
+    }
+
+    /*
+     * ctanh(x + i NAN) = NaN + i NaN
+     * ctanh(x +- i Inf) = NaN + i NaN
+     */
+    if (!npy_isfinite(y)) {
+        return npy_cpackl(y - y, y - y);
+    }
+
+    /*
+     * Large |x|: ctanh ~= +-1 +- i 2sin(2y)/exp(2x), written with
+     * exp(-|x|) squared to avoid spurious overflow.
+     */
+    if (npy_fabsl(x) >= TANHL_HUGE) {
+        npy_longdouble exp_mx = npy_expl(-npy_fabsl(x));
+
+        return npy_cpackl(npy_copysignl(1, x),
+                            4 * npy_sinl(y) * npy_cosl(y) *
+                                exp_mx * exp_mx);
+    }
+
+    /* Kahan's algorithm */
+    tan_y = npy_tanl(y);
+    beta = 1 + tan_y * tan_y;                 /* = 1 / cos^2(y) */
+    sinh_x = npy_sinhl(x);
+    cosh_x = npy_sqrtl(1 + sinh_x * sinh_x);  /* = cosh(x) */
+    denom = 1 + beta * sinh_x * sinh_x;
+
+    return npy_cpackl((beta * cosh_x * sinh_x) / denom, tan_y / denom);
+}
+#undef TANH_HUGE
+#undef TANHF_HUGE
+#undef TANHL_HUGE
+#endif
+
+#if !defined (HAVE_CACOSL) || !defined (HAVE_CASINHL)
+/*
+ * Complex inverse trig functions taken from the msun library in FreeBSD
+ * revision 251404
+ *
+ * The algorithm is very close to that in "Implementing the complex arcsine
+ * and arccosine functions using exception handling" by T. E. Hull, Thomas F.
+ * Fairgrieve, and Ping Tak Peter Tang, published in ACM Transactions on
+ * Mathematical Software, Volume 23 Issue 3, 1997, Pages 299-335,
+ * http://dl.acm.org/citation.cfm?id=275324.
+ *
+ * Throughout we use the convention z = x + I*y.
+ *
+ * casinh(z) = sign(x)*log(A+sqrt(A*A-1)) + I*asin(B)
+ * where
+ * A = (|z+I| + |z-I|) / 2
+ * B = (|z+I| - |z-I|) / 2 = y/A
+ *
+ * These formulas become numerically unstable:
+ *   (a) for Re(casinh(z)) when z is close to the line segment [-I, I] (that
+ *       is, Re(casinh(z)) is close to 0);
+ *   (b) for Im(casinh(z)) when z is close to either of the intervals
+ *       [I, I*infinity) or (-I*infinity, -I] (that is, |Im(casinh(z))| is
+ *       close to PI/2).
+ *
+ * These numerical problems are overcome by defining
+ * f(a, b) = (hypot(a, b) - b) / 2 = a*a / (hypot(a, b) + b) / 2
+ * Then if A < A_crossover, we use
+ *   log(A + sqrt(A*A-1)) = log1p((A-1) + sqrt((A-1)*(A+1)))
+ *   A-1 = f(x, 1+y) + f(x, 1-y)
+ * and if B > B_crossover, we use
+ *   asin(B) = atan2(y, sqrt(A*A - y*y)) = atan2(y, sqrt((A+y)*(A-y)))
+ *   A-y = f(x, y+1) + f(x, y-1)
+ * where without loss of generality we have assumed that x and y are
+ * non-negative.
+ *
+ * Much of the difficulty comes because the intermediate computations may
+ * produce overflows or underflows.  This is dealt with in the paper by Hull
+ * et al by using exception handling.  We do this by detecting when
+ * computations risk underflow or overflow.  The hardest part is handling the
+ * underflows when computing f(a, b).
+ *
+ * Note that the function f(a, b) does not appear explicitly in the paper by
+ * Hull et al, but the idea may be found on pages 308 and 309.  Introducing the
+ * function f(a, b) allows us to concentrate many of the clever tricks in this
+ * paper into one function.
+ */
+
+/*
+ * Function f(a, b, hypot_a_b) = (hypot(a, b) - b) / 2.
+ * Pass hypot(a, b) as the third argument.
+ */
+/*
+ * (hypot(a, b) - b) / 2, with the caller supplying hypot(a, b).
+ * The expression actually evaluated depends on the sign of b.
+ */
+static inline npy_longdouble
+_fl(npy_longdouble a, npy_longdouble b, npy_longdouble hypot_a_b)
+{
+    npy_longdouble half_diff;
+
+    if (b < 0) {
+        /* use the definition directly */
+        half_diff = (hypot_a_b - b) / 2;
+    }
+    else if (b == 0) {
+        half_diff = a / 2;
+    }
+    else {
+        /* rewritten as a*a / (hypot + b) to avoid subtracting */
+        half_diff = a * a / (hypot_a_b + b) / 2;
+    }
+    return half_diff;
+}
+
+/*
+ * All the hard work is contained in this function.
+ * x and y are assumed positive or zero, and less than RECIP_EPSILON.
+ * Upon return:
+ * rx = Re(casinh(z)) = -Im(cacos(y + I*x)).
+ * B_is_usable is set to 1 if the value of B is usable.
+ * If B_is_usable is set to 0, sqrt_A2my2 = sqrt(A*A - y*y), and new_y = y.
+ * If returning sqrt_A2my2 has potential to result in an underflow, it is
+ * rescaled, and new_y is similarly rescaled.
+ */
+/*
+ * Parameters:
+ *   x, y         inputs
+ *   rx           out: always written
+ *   B_is_usable  out: 1 when *B holds y/A and should be used, 0 otherwise
+ *   B            out: y/A; written only when y >= FOUR_SQRT_MIN
+ *   sqrt_A2my2   out: written only on the paths that clear *B_is_usable
+ *   new_y        out: y, or y multiplied by the same scale factor that was
+ *                applied to *sqrt_A2my2
+ */
+static inline void
+_do_hard_workl(npy_longdouble x, npy_longdouble y, npy_longdouble *rx,
+    npy_int *B_is_usable, npy_longdouble *B, npy_longdouble *sqrt_A2my2, npy_longdouble *new_y)
+{
+#if 3 == 1
+    const npy_float A_crossover = 10.0f;
+    const npy_float B_crossover = 0.6417f;
+    const npy_float FOUR_SQRT_MIN = 4.3368086899420177e-19f;
+#endif
+#if 3 == 2
+    const npy_double A_crossover = 10.0;
+    const npy_double B_crossover = 0.6417;
+    const npy_double FOUR_SQRT_MIN = 5.9666725849601654e-154;
+#endif
+#if 3 == 3
+    const npy_longdouble A_crossover = 10.0l;
+    const npy_longdouble B_crossover = 0.6417l;
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    const npy_longdouble FOUR_SQRT_MIN = 5.9666725849601654e-154;
+#else
+    const npy_longdouble FOUR_SQRT_MIN = 7.3344154702193886625e-2466l;
+#endif
+#endif
+    npy_longdouble R, S, A; /* A, B, R, and S are as in Hull et al. */
+    npy_longdouble Am1, Amy; /* A-1, A-y. */
+
+    R = npy_hypotl(x, y + 1);        /* |z+I| */
+    S = npy_hypotl(x, y - 1);        /* |z-I| */
+
+    /* A = (|z+I| + |z-I|) / 2 */
+    A = (R + S) / 2;
+    /*
+     * Mathematically A >= 1.  There is a small chance that this will not
+     * be so because of rounding errors.  So we will make certain it is
+     * so.
+     */
+    if (A < 1) {
+        A = 1;
+    }
+
+    /* ---- first output: *rx ---- */
+    if (A < A_crossover) {
+        /*
+         * Am1 = fp + fm, where fp = f(x, 1+y), and fm = f(x, 1-y).
+         * rx = log1p(Am1 + sqrt(Am1*(A+1)))
+         */
+        if (y == 1 && x < LDBL_EPSILON * LDBL_EPSILON / 128) {
+            /*
+             * fp is of order x^2, and fm = x/2.
+             * A = 1 (inexactly).
+             */
+            *rx = npy_sqrtl(x);
+        }
+        else if (x >= LDBL_EPSILON * npy_fabsl(y - 1)) {
+            /*
+             * Underflow will not occur because
+             * x >= DBL_EPSILON^2/128 >= FOUR_SQRT_MIN
+             */
+            Am1 = _fl(x, 1 + y, R) + _fl(x, 1 - y, S);
+            *rx = npy_log1pl(Am1 + npy_sqrtl(Am1 * (A + 1)));
+        }
+        else if (y < 1) {
+            /*
+             * fp = x*x/(1+y)/4, fm = x*x/(1-y)/4, and
+             * A = 1 (inexactly).
+             */
+            *rx = x / npy_sqrtl((1 - y) * (1 + y));
+        }
+        else {        /* if (y > 1) */
+            /*
+             * A-1 = y-1 (inexactly).
+             */
+            *rx = npy_log1pl((y - 1) + npy_sqrtl((y - 1) * (y + 1)));
+        }
+    }
+    else {
+        /* A >= A_crossover: the plain formula is used */
+        *rx = npy_logl(A + npy_sqrtl(A * A - 1));
+    }
+
+    /* ---- remaining outputs: B / sqrt_A2my2 / new_y ---- */
+    *new_y = y;
+
+    if (y < FOUR_SQRT_MIN) {
+        /*
+         * Avoid a possible underflow caused by y/A.  For casinh this
+         * would be legitimate, but will be picked up by invoking atan2
+         * later on.  For cacos this would not be legitimate.
+         */
+        *B_is_usable = 0;
+        /* both values are scaled by the same factor 2 / LDBL_EPSILON */
+        *sqrt_A2my2 = A * (2 / LDBL_EPSILON);
+        *new_y = y * (2 / LDBL_EPSILON);
+        return;
+    }
+
+    /* B = (|z+I| - |z-I|) / 2 = y/A */
+    *B = y / A;
+    *B_is_usable = 1;
+
+    if (*B > B_crossover) {
+        /* B too large: report sqrt(A*A - y*y) instead */
+        *B_is_usable = 0;
+        /*
+         * Amy = fp + fm, where fp = f(x, y+1), and fm = f(x, y-1).
+         * sqrt_A2my2 = sqrt(Amy*(A+y))
+         */
+        if (y == 1 && x < LDBL_EPSILON / 128) {
+            /*
+             * fp is of order x^2, and fm = x/2.
+             * A = 1 (inexactly).
+             */
+            *sqrt_A2my2 = npy_sqrtl(x) * npy_sqrtl((A + y) / 2);
+        }
+        else if (x >= LDBL_EPSILON * npy_fabsl(y - 1)) {
+            /*
+             * Underflow will not occur because
+             * x >= DBL_EPSILON/128 >= FOUR_SQRT_MIN
+             * and
+             * x >= DBL_EPSILON^2 >= FOUR_SQRT_MIN
+             */
+            Amy = _fl(x, y + 1, R) + _fl(x, y - 1, S);
+            *sqrt_A2my2 = npy_sqrtl(Amy * (A + y));
+        }
+        else if (y > 1) {
+            /*
+             * fp = x*x/(y+1)/4, fm = x*x/(y-1)/4, and
+             * A = y (inexactly).
+             *
+             * y < RECIP_EPSILON.  So the following
+             * scaling should avoid any underflow problems.
+             */
+            *sqrt_A2my2 = x * (4 / LDBL_EPSILON / LDBL_EPSILON) * y /
+                npy_sqrtl((y + 1) * (y - 1));
+            *new_y = y * (4 / LDBL_EPSILON / LDBL_EPSILON);
+        }
+        else {        /* if (y < 1) */
+            /*
+             * fm = 1-y >= DBL_EPSILON, fp is of order x^2, and
+             * A = 1 (inexactly).
+             */
+            *sqrt_A2my2 = npy_sqrtl((1 - y) * (1 + y));
+        }
+    }
+}
+
+/*
+ * Optimized version of clog() for |z| finite and larger than ~RECIP_EPSILON.
+ */
+/*
+ * Writes log(hypot(x, y)) to *rr and atan2(y, x) to *ri, picking the
+ * formula for the real part from the sizes of |x| and |y|.
+ */
+static inline void
+_clog_for_large_valuesl(npy_longdouble x, npy_longdouble y,
+    npy_longdouble *rr, npy_longdouble *ri)
+{
+#if 3 == 1
+    const npy_float QUARTER_SQRT_MAX = 4.611685743549481e+18f;
+    const npy_float SQRT_MIN = 1.0842021724855044e-19f;
+#endif
+#if 3 == 2
+    const npy_double QUARTER_SQRT_MAX = 3.3519519824856489e+153;
+    const npy_double SQRT_MIN = 1.4916681462400413e-154;
+#endif
+#if 3 == 3
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    const npy_longdouble QUARTER_SQRT_MAX = 3.3519519824856489e+153;
+    const npy_longdouble SQRT_MIN = 1.4916681462400413e-154;
+#else
+    const npy_longdouble QUARTER_SQRT_MAX = 2.7268703390485398235e+2465l;
+    const npy_longdouble SQRT_MIN = 1.8336038675548471656e-2466l;
+#endif
+#endif
+    npy_longdouble abs_x, abs_y;
+    npy_longdouble larger, smaller;
+
+    /* order the magnitudes */
+    abs_x = npy_fabsl(x);
+    abs_y = npy_fabsl(y);
+    if (abs_x < abs_y) {
+        larger = abs_y;
+        smaller = abs_x;
+    }
+    else {
+        larger = abs_x;
+        smaller = abs_y;
+    }
+
+    if (larger > LDBL_MAX / 2) {
+        /*
+         * hypot() itself could overflow: divide both parts by e first and
+         * add 1 to the logarithm afterwards.
+         */
+        *rr = npy_logl(npy_hypotl(x / NPY_El, y / NPY_El)) + 1;
+    }
+    else if (larger > QUARTER_SQRT_MAX || smaller < SQRT_MIN) {
+        /* squaring could overflow or underflow: go through hypot() */
+        *rr = npy_logl(npy_hypotl(x, y));
+    }
+    else {
+        /* squares are safe */
+        *rr = npy_logl(larger * larger + smaller * smaller) / 2;
+    }
+    *ri = npy_atan2l(y, x);
+}
+#endif
+
+#ifndef HAVE_CACOSL  /* compiled only when HAVE_CACOSL is undefined */
+npy_clongdouble
+npy_cacosl(npy_clongdouble z)  /* complex arc cosine, long double variant */
+{
+#if 3 == 1  /* npy_float constants; inactive in this expansion */
+    /* this is sqrt(6*EPS) */
+    const npy_float SQRT_6_EPSILON = 8.4572793338e-4f;
+    /* chosen such that pio2_hi + pio2_lo == pio2_hi but causes FE_INEXACT. */
+    const volatile npy_float pio2_lo = 7.5497899549e-9f;
+#endif
+#if 3 == 2  /* npy_double constants; inactive in this expansion */
+    const npy_double SQRT_6_EPSILON = 3.65002414998885671e-08;
+    const volatile npy_double pio2_lo = 6.1232339957367659e-17;
+#endif
+#if 3 == 3  /* npy_longdouble constants; the branch actually compiled */
+    const npy_longdouble SQRT_6_EPSILON = 8.0654900873493277169e-10l;
+    const volatile npy_longdouble pio2_lo = 2.710505431213761085e-20l;
+#endif
+    const npy_longdouble RECIP_EPSILON = 1.0l / LDBL_EPSILON;
+    const npy_longdouble pio2_hi = NPY_PI_2l;
+    npy_longdouble x, y, ax, ay, wx, wy, rx, ry, B, sqrt_A2mx2, new_x;
+    npy_int sx, sy;
+    npy_int B_is_usable;
+
+    x = npy_creall(z);
+    y = npy_cimagl(z);
+    sx = npy_signbit(x);  /* signs are remembered separately ... */
+    sy = npy_signbit(y);
+    ax = npy_fabsl(x);  /* ... because the helper below only gets magnitudes */
+    ay = npy_fabsl(y);
+
+    if (npy_isnan(x) || npy_isnan(y)) {
+        /* cacos(+-Inf + I*NaN) = NaN + I*opt(-)Inf */
+        if (npy_isinf(x)) {
+            return npy_cpackl(y + y, -NPY_INFINITYL);
+        }
+        /* cacos(NaN + I*+-Inf) = NaN + I*-+Inf */
+        if (npy_isinf(y)) {
+            return npy_cpackl(x + x, -y);
+        }
+        /* cacos(0 + I*NaN) = PI/2 + I*NaN with inexact */
+        if (x == 0) {
+            return npy_cpackl(pio2_hi + pio2_lo, y + y);
+        }
+        /*
+         * All other cases involving NaN return NaN + I*NaN.
+         * C99 leaves it optional whether to raise invalid if one of
+         * the arguments is not NaN, so we opt not to raise it.
+         */
+        return npy_cpackl(NPY_NANL, NPY_NANL);
+    }
+
+    if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
+        /* clog...() will raise inexact unless x or y is infinite. */
+        _clog_for_large_valuesl(x, y, &wx, &wy);
+        rx = npy_fabsl(wy);  /* real part: magnitude of atan2(y, x) */
+        ry = wx + NPY_LOGE2l;  /* log(hypot(x, y)) plus NPY_LOGE2l */
+        if (sy == 0) {
+            ry = -ry;  /* sign bit of y clear: imaginary part is negated */
+        }
+        return npy_cpackl(rx, ry);
+    }
+
+    /* Avoid spuriously raising inexact for z = 1. */
+    if (x == 1 && y == 0) {
+        return npy_cpackl(0, -y);
+    }
+
+    /* All remaining cases are inexact. */
+    raise_inexact();
+
+    if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4) {
+        return npy_cpackl(pio2_hi - (x - pio2_lo), -y);  /* tiny z: pi/2 - z */
+    }
+    /* General case; magnitudes go in as (ay, ax) - casinh passes (ax, ay). */
+    _do_hard_workl(ay, ax, &ry, &B_is_usable, &B, &sqrt_A2mx2, &new_x);
+    if (B_is_usable) {
+        if (sx == 0) {
+            rx = npy_acosl(B);
+        }
+        else {
+            rx = npy_acosl(-B);  /* sign bit of x set: mirror B */
+        }
+    }
+    else {
+        if (sx == 0) {
+            rx = npy_atan2l(sqrt_A2mx2, new_x);
+        }
+        else {
+            rx = npy_atan2l(sqrt_A2mx2, -new_x);  /* same mirroring */
+        }
+    }
+    if (sy == 0) {
+        ry = -ry;  /* same sign rule as in the large-|z| branch */
+    }
+    return npy_cpackl(rx, ry);
+}
+#endif
+
+#ifndef HAVE_CASINL
+npy_clongdouble
+npy_casinl(npy_clongdouble z)
+{
+    /* casin(z) = I*conj(casinh(I*conj(z))); I*conj() swaps re and im. */
+    npy_clongdouble w = npy_casinhl(npy_cpackl(npy_cimagl(z), npy_creall(z)));
+    return npy_cpackl(npy_cimagl(w), npy_creall(w));
+}
+#endif
+
+#ifndef HAVE_CATANL
+npy_clongdouble
+npy_catanl(npy_clongdouble z)
+{
+    /* catan(z) = I*conj(catanh(I*conj(z))); I*conj() swaps re and im. */
+    npy_clongdouble w = npy_catanhl(npy_cpackl(npy_cimagl(z), npy_creall(z)));
+    return npy_cpackl(npy_cimagl(w), npy_creall(w));
+}
+#endif
+
+#ifndef HAVE_CACOSHL
+npy_clongdouble
+npy_cacoshl(npy_clongdouble z)
+{
+    /*
+     * Built on cacos(): cacosh(z) is either I*cacos(z) or -I*cacos(z),
+     * with the sign picked so that the real part of the result is >= 0.
+     */
+    const npy_clongdouble acos_z = npy_cacosl(z);
+    const npy_longdouble re = npy_creall(acos_z);
+    const npy_longdouble im = npy_cimagl(acos_z);
+
+    if (npy_isnan(re)) {
+        /*
+         * Both parts NaN:  cacosh(NaN + I*NaN)   = NaN + I*NaN
+         * Real part only:  cacosh(NaN + I*+-Inf) = +Inf + I*NaN
+         *                  cacosh(+-Inf + I*NaN) = +Inf + I*NaN
+         */
+        return npy_isnan(im) ? npy_cpackl(im, re)
+                             : npy_cpackl(npy_fabsl(im), re);
+    }
+    if (npy_isnan(im)) {
+        /* cacosh(0 + I*NaN) = NaN + I*NaN */
+        return npy_cpackl(im, im);
+    }
+
+    /* Swap the parts: |Im(cacos)| is the real part, Re(cacos) gets y's sign. */
+    return npy_cpackl(npy_fabsl(im), npy_copysignl(re, npy_cimagl(z)));
+}
+#endif
+
+#ifndef HAVE_CASINHL  /* compiled only when HAVE_CASINHL is undefined */
+/*
+ * casinh(z) = z + O(z^3)   as z -> 0
+ *
+ * casinh(z) = sign(x)*clog(sign(x)*z) + O(1/z^2)   as z -> infinity
+ * The above formula works for the imaginary part as well, because
+ * Im(casinh(z)) = sign(x)*atan2(sign(x)*y, fabs(x)) + O(y/z^3)
+ *    as z -> infinity, uniformly in y
+ */
+npy_clongdouble
+npy_casinhl(npy_clongdouble z)
+{
+#if 3 == 1  /* npy_float constant; inactive in this expansion */
+    /* this is sqrt(6*EPS) */
+    const npy_float SQRT_6_EPSILON = 8.4572793338e-4f;
+#endif
+#if 3 == 2  /* npy_double constant; inactive in this expansion */
+    const npy_double SQRT_6_EPSILON = 3.65002414998885671e-08;
+#endif
+#if 3 == 3  /* npy_longdouble constant; the branch actually compiled */
+    const npy_longdouble SQRT_6_EPSILON = 8.0654900873493277169e-10l;
+#endif
+    const npy_longdouble RECIP_EPSILON = 1.0l / LDBL_EPSILON;
+    npy_longdouble x, y, ax, ay, wx, wy, rx, ry, B, sqrt_A2my2, new_y;
+    npy_int B_is_usable;
+
+    x = npy_creall(z);
+    y = npy_cimagl(z);
+    ax = npy_fabsl(x);  /* magnitudes drive the range checks below */
+    ay = npy_fabsl(y);
+
+    if (npy_isnan(x) || npy_isnan(y)) {
+        /* casinh(+-Inf + I*NaN) = +-Inf + I*NaN */
+        if (npy_isinf(x)) {
+            return npy_cpackl(x, y + y);
+        }
+        /* casinh(NaN + I*+-Inf) = opt(+-)Inf + I*NaN */
+        if (npy_isinf(y)) {
+            return npy_cpackl(y, x + x);
+        }
+        /* casinh(NaN + I*0) = NaN + I*0 */
+        if (y == 0) {
+            return npy_cpackl(x + x, y);
+        }
+        /*
+         * All other cases involving NaN return NaN + I*NaN.
+         * C99 leaves it optional whether to raise invalid if one of
+         * the arguments is not NaN, so we opt not to raise it.
+         */
+        return npy_cpackl(NPY_NANL, NPY_NANL);
+    }
+
+    if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {
+        /* clog...() will raise inexact unless x or y is infinite. */
+        if (npy_signbit(x) == 0) {
+            _clog_for_large_valuesl(x, y, &wx, &wy);
+            wx += NPY_LOGE2l;
+        }
+        else {
+            _clog_for_large_valuesl(-x, -y, &wx, &wy);  /* clog(sign(x)*z) */
+            wx += NPY_LOGE2l;  /* same adjustment as the other branch */
+        }
+        return npy_cpackl(npy_copysignl(wx, x), npy_copysignl(wy, y));
+    }
+
+    /* Avoid spuriously raising inexact for z = 0. */
+    if (x == 0 && y == 0) {
+        return (z);
+    }
+
+    /* All remaining cases are inexact. */
+    raise_inexact();
+
+    if (ax < SQRT_6_EPSILON / 4 && ay < SQRT_6_EPSILON / 4) {
+        return (z);  /* casinh(z) = z + O(z^3): z itself is returned */
+    }
+    /* General case: the helper receives the magnitudes in (ax, ay) order. */
+    _do_hard_workl(ax, ay, &rx, &B_is_usable, &B, &sqrt_A2my2, &new_y);
+    if (B_is_usable) {
+        ry = npy_asinl(B);
+    }
+    else {
+        ry = npy_atan2l(new_y, sqrt_A2my2);  /* used when B is flagged unusable */
+    }
+    return npy_cpackl(npy_copysignl(rx, x), npy_copysignl(ry, y));
+}
+#endif
+
+#ifndef HAVE_CATANHL
+/*
+ * sum_squares(x,y) = x*x + y*y (or just x*x if y*y would underflow).
+ * Assumes x*x and y*y will not overflow.
+ * Assumes x and y are finite.
+ * Assumes y is non-negative.
+ * Assumes fabs(x) >= DBL_EPSILON.
+ *
+ * x, y - the two finite operands (see the assumptions above).
+ * Returns x*x + y*y, with the y*y term dropped when it would underflow.
+ *
+ * The underflow cutoff sqrt(LDBL_MIN) is derived from <float.h> instead of
+ * being spelled as a per-format literal.  The 80-bit literal (1.8336e-2466)
+ * is out of range for long double formats with a smaller exponent range
+ * (e.g. double-double), where it cannot act as a guard.  For 64-bit and
+ * 80-bit long double, sqrt(LDBL_MIN) is a power of two (2**-511 and
+ * 2**-8191 respectively), i.e. the value the former literals stood for.
+ */
+static inline npy_longdouble
+_sum_squaresl(npy_longdouble x, npy_longdouble y)
+{
+    /* Below this magnitude y*y would underflow. */
+    const npy_longdouble SQRT_MIN = npy_sqrtl(LDBL_MIN);
+
+    /* Avoid underflow when y is small. */
+    if (y < SQRT_MIN) {
+        return (x * x);
+    }
+
+    /* y*y does not underflow here, so the plain sum is used. */
+    return (x * x + y * y);
+}
+
+/*
+ * real_part_reciprocal(x, y) = Re(1/(x+I*y)) = x/(x*x + y*y).
+ * Assumes x and y are not NaN, and one of x and y is larger than
+ * RECIP_EPSILON.  We avoid unwarranted underflow.  It is important to not use
+ * the code creal(1/z), because the imaginary part may produce an unwanted
+ * underflow.
+ * This is only called in a context where inexact is always raised before
+ * the call, so no effort is made to avoid or force inexact.
+ */
+#if 3 == 1  /* npy_float variant; inactive in this expansion */
+#define BIAS (FLT_MAX_EXP - 1)
+#define CUTOFF (FLT_MANT_DIG / 2 + 1)
+static inline npy_float
+_real_part_reciprocalf(npy_float x, npy_float y)
+{
+    npy_float scale;
+    npy_uint32 hx, hy;
+    npy_int32 ix, iy;
+
+    GET_FLOAT_WORD(hx, x);
+    ix = hx & 0x7f800000;  /* keep bits 23..30 only (presumably the exponent) */
+    GET_FLOAT_WORD(hy, y);
+    iy = hy & 0x7f800000;  /* same mask applied to y */
+    if (ix - iy >= CUTOFF << 23 || npy_isinf(x)) {
+        return (1 / x);  /* y's contribution is ignored (or x is Inf) */
+    }
+    if (iy - ix >= CUTOFF << 23) {
+        return (x / y / y);  /* x*x is ignored in the denominator */
+    }
+    if (ix <= (BIAS + FLT_MAX_EXP / 2 - CUTOFF) << 23) {
+        return (x / (x * x + y * y));  /* direct formula, no scaling */
+    }
+    SET_FLOAT_WORD(scale, 0x7f800000 - ix);  /* scale built from the masked bits */
+    x *= scale;
+    y *= scale;
+    return (x / (x * x + y * y) * scale);  /* scale is applied a second time */
+}
+#undef BIAS
+#undef CUTOFF
+#endif
+
+#if 3 == 2  /* npy_double variant; inactive in this expansion */
+#define BIAS (DBL_MAX_EXP - 1)
+/*  more guard digits are useful iff there is extra precision. */
+#define CUTOFF (DBL_MANT_DIG / 2 + 1)  /* just half or 1 guard digit */
+static inline npy_double
+_real_part_reciprocal(npy_double x, npy_double y)
+{
+    npy_double scale;
+    npy_uint32 hx, hy;
+    npy_int32 ix, iy;
+
+    /*
+     * This code is inspired by the C99 document n1124.pdf, Section G.5.1,
+     * example 2.
+     */
+    GET_HIGH_WORD(hx, x);
+    ix = hx & 0x7ff00000;  /* keep bits 20..30 only (presumably the exponent) */
+    GET_HIGH_WORD(hy, y);
+    iy = hy & 0x7ff00000;  /* same mask applied to y */
+    if (ix - iy >= CUTOFF << 20 || npy_isinf(x)) {
+        /* +-Inf -> +-0 is special */
+        return (1 / x);
+    }
+    if (iy - ix >= CUTOFF << 20) {
+        /* should avoid double div, but hard */
+        return (x / y / y);
+    }
+    if (ix <= (BIAS + DBL_MAX_EXP / 2 - CUTOFF) << 20) {
+        return (x / (x * x + y * y));  /* direct formula, no scaling */
+    }
+    scale = 1;
+    SET_HIGH_WORD(scale, 0x7ff00000 - ix);  /* 2**(1-ilogb(x)) */
+    x *= scale;
+    y *= scale;
+    return (x / (x * x + y * y) * scale);  /* scale is applied a second time */
+}
+#undef BIAS
+#undef CUTOFF
+#endif
+
+#if 3 == 3  /* npy_longdouble variant; the branch actually compiled */
+#if !defined(HAVE_LDOUBLE_DOUBLE_DOUBLE_BE) && \
+    !defined(HAVE_LDOUBLE_DOUBLE_DOUBLE_LE)
+
+#define BIAS (LDBL_MAX_EXP - 1)
+#define CUTOFF (LDBL_MANT_DIG / 2 + 1)
+static inline npy_longdouble
+_real_part_reciprocall(npy_longdouble x,
+    npy_longdouble y)
+{
+    npy_longdouble scale;
+    union IEEEl2bitsrep ux, uy, us;
+    npy_int32 ix, iy;
+
+    ux.e = x;
+    ix = GET_LDOUBLE_EXP(ux);  /* exponent field of x, via the bits union */
+    uy.e = y;
+    iy = GET_LDOUBLE_EXP(uy);  /* exponent field of y */
+    if (ix - iy >= CUTOFF || npy_isinf(x)) {
+        return (1/x);  /* y's contribution is ignored (or x is Inf) */
+    }
+    if (iy - ix >= CUTOFF) {
+        return (x/y/y);  /* x*x is ignored in the denominator */
+    }
+    if (ix <= BIAS + LDBL_MAX_EXP / 2 - CUTOFF) {
+        return (x/(x*x + y*y));  /* direct formula, no scaling */
+    }
+    us.e = 1;
+    SET_LDOUBLE_EXP(us, 0x7fff - ix);  /* overwrite the exponent field of 1.0 */
+    scale = us.e;
+    x *= scale;
+    y *= scale;
+    return (x/(x*x + y*y) * scale);  /* scale is applied a second time */
+}
+#undef BIAS
+#undef CUTOFF
+
+#else
+    /* HAVE_LDOUBLE_DOUBLE_DOUBLE_{BE,LE} builds: plain formula, no bit tricks. */
+static inline npy_longdouble
+_real_part_reciprocall(npy_longdouble x,
+    npy_longdouble y)
+{
+    return x/(x*x + y*y);  /* NOTE(review): no scaling here - x*x may overflow */
+}
+
+#endif
+#endif
+
+npy_clongdouble
+npy_catanhl(npy_clongdouble z)  /* complex inverse hyperbolic tangent */
+{
+#if 3 == 1  /* npy_float constants; inactive in this expansion */
+    /* this is sqrt(3*EPS) */
+    const npy_float SQRT_3_EPSILON = 5.9801995673e-4f;
+    /* chosen such that pio2_hi + pio2_lo == pio2_hi but causes FE_INEXACT. */
+    const volatile npy_float pio2_lo = 7.5497899549e-9f;
+#endif
+#if 3 == 2  /* npy_double constants; inactive in this expansion */
+    const npy_double SQRT_3_EPSILON = 2.5809568279517849e-8;
+    const volatile npy_double pio2_lo = 6.1232339957367659e-17;
+#endif
+#if 3 == 3  /* npy_longdouble constants; the branch actually compiled */
+    const npy_longdouble SQRT_3_EPSILON = 5.70316273435758915310e-10l;
+    const volatile npy_longdouble pio2_lo = 2.710505431213761085e-20l;
+#endif
+    const npy_longdouble RECIP_EPSILON = 1.0l / LDBL_EPSILON;
+    const npy_longdouble pio2_hi = NPY_PI_2l;
+    npy_longdouble x, y, ax, ay, rx, ry;
+
+    x = npy_creall(z);
+    y = npy_cimagl(z);
+    ax = npy_fabsl(x);  /* magnitudes drive the range checks below */
+    ay = npy_fabsl(y);
+
+    /* This helps handle many cases. */
+    if (y == 0 && ax <= 1) {
+        return npy_cpackl(npy_atanhl(x), y);  /* real atanh; y passed through */
+    }
+
+    /* To ensure the same accuracy as atan(), and to filter out z = 0. */
+    if (x == 0) {
+        return npy_cpackl(x, npy_atanl(y));
+    }
+
+    if (npy_isnan(x) || npy_isnan(y)) {
+        /* catanh(+-Inf + I*NaN) = +-0 + I*NaN */
+        if (npy_isinf(x)) {
+            return npy_cpackl(npy_copysignl(0, x), y + y);
+        }
+        /* catanh(NaN + I*+-Inf) = sign(NaN)0 + I*+-PI/2 */
+        if (npy_isinf(y)) {
+            return npy_cpackl(npy_copysignl(0, x),
+                npy_copysignl(pio2_hi + pio2_lo, y));
+        }
+        /*
+         * All other cases involving NaN return NaN + I*NaN.
+         * C99 leaves it optional whether to raise invalid if one of
+         * the arguments is not NaN, so we opt not to raise it.
+         */
+        return npy_cpackl(NPY_NANL, NPY_NANL);
+    }
+
+    if (ax > RECIP_EPSILON || ay > RECIP_EPSILON) {  /* huge |x| or |y| */
+        return npy_cpackl(_real_part_reciprocall(x, y),
+            npy_copysignl(pio2_hi + pio2_lo, y));
+    }
+
+    if (ax < SQRT_3_EPSILON / 2 && ay < SQRT_3_EPSILON / 2) {
+        /*
+         * z = 0 was filtered out above.  All other cases must raise
+         * inexact, but this is the only one that needs to do it
+         * explicitly.
+         */
+        raise_inexact();
+        return (z);  /* tiny z is returned unchanged */
+    }
+
+    if (ax == 1 && ay < LDBL_EPSILON) {
+        rx = (NPY_LOGE2l - npy_logl(ay)) / 2;  /* avoids _sum_squaresl(0, ay) */
+    }
+    else {
+        rx = npy_log1pl(4 * ax / _sum_squaresl(ax - 1, ay)) / 4;
+    }
+
+    if (ax == 1) {
+        ry = npy_atan2l(2, -ay) / 2;
+    }
+    else if (ay < LDBL_EPSILON) {
+        ry = npy_atan2l(2 * ay, (1 - ax) * (1 + ax)) / 2;  /* ay*ay term dropped */
+    }
+    else {
+        ry = npy_atan2l(2 * ay, (1 - ax) * (1 + ax) - ay * ay) / 2;
+    }
+    /* rx and ry were built from magnitudes; put the input signs back. */
+    return npy_cpackl(npy_copysignl(rx, x), npy_copysignl(ry, y));
+}
+#endif
+
+
+/*==========================================================
+ * Decorate all the functions which are available natively
+ *=========================================================*/
+
+#line 1752
+
+#line 1757
+#ifdef HAVE_CABSF
+npy_float
+npy_cabsf(npy_cfloat z)
+{
+    __npy_cfloat_to_c99_cast in;  /* npy <-> C99 cast helper */
+    in.npy_z = z;
+    return cabsf(in.c99_z);  /* the native cabsf() does the actual work */
+}
+#endif
+
+#line 1757
+#ifdef HAVE_CARGF
+npy_float
+npy_cargf(npy_cfloat z)
+{
+    __npy_cfloat_to_c99_cast in;  /* npy <-> C99 cast helper */
+    in.npy_z = z;
+    return cargf(in.c99_z);  /* the native cargf() does the actual work */
+}
+#endif
+
+
+#line 1774
+#ifdef HAVE_CEXPF
+npy_cfloat
+npy_cexpf(npy_cfloat z)
+{
+    /* Forward to the native cexpf() through the npy <-> C99 cast helper. */
+    __npy_cfloat_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = cexpf(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CLOGF
+npy_cfloat
+npy_clogf(npy_cfloat z)
+{
+    /* Forward to the native clogf() through the npy <-> C99 cast helper. */
+    __npy_cfloat_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = clogf(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CSQRTF
+npy_cfloat
+npy_csqrtf(npy_cfloat z)
+{
+    /* Forward to the native csqrtf() through the npy <-> C99 cast helper. */
+    __npy_cfloat_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = csqrtf(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CCOSF
+npy_cfloat
+npy_ccosf(npy_cfloat z)
+{
+    /* Forward to the native ccosf() through the npy <-> C99 cast helper. */
+    __npy_cfloat_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = ccosf(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CSINF
+npy_cfloat
+npy_csinf(npy_cfloat z)
+{
+    /* Forward to the native csinf() through the npy <-> C99 cast helper. */
+    __npy_cfloat_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = csinf(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CTANF
+npy_cfloat
+npy_ctanf(npy_cfloat z)
+{
+    /* Forward to the native ctanf() through the npy <-> C99 cast helper. */
+    __npy_cfloat_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = ctanf(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CCOSHF
+npy_cfloat
+npy_ccoshf(npy_cfloat z)
+{
+    /* Forward to the native ccoshf() through the npy <-> C99 cast helper. */
+    __npy_cfloat_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = ccoshf(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CSINHF
+npy_cfloat
+npy_csinhf(npy_cfloat z)
+{
+    /* Forward to the native csinhf() through the npy <-> C99 cast helper. */
+    __npy_cfloat_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = csinhf(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CTANHF
+npy_cfloat
+npy_ctanhf(npy_cfloat z)
+{
+    /* Forward to the native ctanhf() through the npy <-> C99 cast helper. */
+    __npy_cfloat_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = ctanhf(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CACOSF
+npy_cfloat
+npy_cacosf(npy_cfloat z)
+{
+    /* Forward to the native cacosf() through the npy <-> C99 cast helper. */
+    __npy_cfloat_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = cacosf(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CASINF
+npy_cfloat
+npy_casinf(npy_cfloat z)
+{
+    /* Forward to the native casinf() through the npy <-> C99 cast helper. */
+    __npy_cfloat_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = casinf(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CATANF
+npy_cfloat
+npy_catanf(npy_cfloat z)
+{
+    /* Forward to the native catanf() through the npy <-> C99 cast helper. */
+    __npy_cfloat_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = catanf(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CACOSHF
+npy_cfloat
+npy_cacoshf(npy_cfloat z)
+{
+    /* Forward to the native cacoshf() through the npy <-> C99 cast helper. */
+    __npy_cfloat_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = cacoshf(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CASINHF
+npy_cfloat
+npy_casinhf(npy_cfloat z)
+{
+    /* Forward to the native casinhf() through the npy <-> C99 cast helper. */
+    __npy_cfloat_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = casinhf(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CATANHF
+npy_cfloat
+npy_catanhf(npy_cfloat z)
+{
+    /* Forward to the native catanhf() through the npy <-> C99 cast helper. */
+    __npy_cfloat_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = catanhf(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+
+
+
+#line 1752
+
+#line 1757
+#ifdef HAVE_CABS
+npy_double
+npy_cabs(npy_cdouble z)
+{
+    __npy_cdouble_to_c99_cast in;  /* npy <-> C99 cast helper */
+    in.npy_z = z;
+    return cabs(in.c99_z);  /* the native cabs() does the actual work */
+}
+#endif
+
+#line 1757
+#ifdef HAVE_CARG
+npy_double
+npy_carg(npy_cdouble z)
+{
+    __npy_cdouble_to_c99_cast in;  /* npy <-> C99 cast helper */
+    in.npy_z = z;
+    return carg(in.c99_z);  /* the native carg() does the actual work */
+}
+#endif
+
+
+#line 1774
+#ifdef HAVE_CEXP
+npy_cdouble
+npy_cexp(npy_cdouble z)
+{
+    /* Forward to the native cexp() through the npy <-> C99 cast helper. */
+    __npy_cdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = cexp(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CLOG
+npy_cdouble
+npy_clog(npy_cdouble z)
+{
+    /* Forward to the native clog() through the npy <-> C99 cast helper. */
+    __npy_cdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = clog(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CSQRT
+npy_cdouble
+npy_csqrt(npy_cdouble z)
+{
+    /* Forward to the native csqrt() through the npy <-> C99 cast helper. */
+    __npy_cdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = csqrt(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CCOS
+npy_cdouble
+npy_ccos(npy_cdouble z)
+{
+    /* Forward to the native ccos() through the npy <-> C99 cast helper. */
+    __npy_cdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = ccos(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CSIN
+npy_cdouble
+npy_csin(npy_cdouble z)
+{
+    /* Forward to the native csin() through the npy <-> C99 cast helper. */
+    __npy_cdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = csin(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CTAN
+npy_cdouble
+npy_ctan(npy_cdouble z)
+{
+    /* Forward to the native ctan() through the npy <-> C99 cast helper. */
+    __npy_cdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = ctan(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CCOSH
+npy_cdouble
+npy_ccosh(npy_cdouble z)
+{
+    /* Forward to the native ccosh() through the npy <-> C99 cast helper. */
+    __npy_cdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = ccosh(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CSINH
+npy_cdouble
+npy_csinh(npy_cdouble z)
+{
+    /* Forward to the native csinh() through the npy <-> C99 cast helper. */
+    __npy_cdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = csinh(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CTANH
+npy_cdouble
+npy_ctanh(npy_cdouble z)
+{
+    /* Forward to the native ctanh() through the npy <-> C99 cast helper. */
+    __npy_cdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = ctanh(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CACOS
+npy_cdouble
+npy_cacos(npy_cdouble z)
+{
+    /* Forward to the native cacos() through the npy <-> C99 cast helper. */
+    __npy_cdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = cacos(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CASIN
+npy_cdouble
+npy_casin(npy_cdouble z)
+{
+    /* Forward to the native casin() through the npy <-> C99 cast helper. */
+    __npy_cdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = casin(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CATAN
+npy_cdouble
+npy_catan(npy_cdouble z)
+{
+    /* Forward to the native catan() through the npy <-> C99 cast helper. */
+    __npy_cdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = catan(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CACOSH
+npy_cdouble
+npy_cacosh(npy_cdouble z)
+{
+    /* Forward to the native cacosh() through the npy <-> C99 cast helper. */
+    __npy_cdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = cacosh(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CASINH
+npy_cdouble
+npy_casinh(npy_cdouble z)
+{
+    /* Forward to the native casinh() through the npy <-> C99 cast helper. */
+    __npy_cdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = casinh(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CATANH
+npy_cdouble
+npy_catanh(npy_cdouble z)
+{
+    /* Forward to the native catanh() through the npy <-> C99 cast helper. */
+    __npy_cdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = catanh(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+
+
+
+#line 1752
+
+#line 1757
+#ifdef HAVE_CABSL
+npy_longdouble
+npy_cabsl(npy_clongdouble z)
+{
+    __npy_clongdouble_to_c99_cast in;  /* npy <-> C99 cast helper */
+    in.npy_z = z;
+    return cabsl(in.c99_z);  /* the native cabsl() does the actual work */
+}
+#endif
+
+#line 1757
+#ifdef HAVE_CARGL
+npy_longdouble
+npy_cargl(npy_clongdouble z)
+{
+    __npy_clongdouble_to_c99_cast in;  /* npy <-> C99 cast helper */
+    in.npy_z = z;
+    return cargl(in.c99_z);  /* the native cargl() does the actual work */
+}
+#endif
+
+
+#line 1774
+#ifdef HAVE_CEXPL
+npy_clongdouble
+npy_cexpl(npy_clongdouble z)
+{
+    /* Forward to the native cexpl() through the npy <-> C99 cast helper. */
+    __npy_clongdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = cexpl(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CLOGL
+npy_clongdouble
+npy_clogl(npy_clongdouble z)
+{
+    /* Forward to the native clogl() through the npy <-> C99 cast helper. */
+    __npy_clongdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = clogl(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CSQRTL
+npy_clongdouble
+npy_csqrtl(npy_clongdouble z)
+{
+    /* Forward to the native csqrtl() through the npy <-> C99 cast helper. */
+    __npy_clongdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = csqrtl(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CCOSL
+npy_clongdouble
+npy_ccosl(npy_clongdouble z)
+{
+    /* Forward to the native ccosl() through the npy <-> C99 cast helper. */
+    __npy_clongdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = ccosl(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CSINL
+npy_clongdouble
+npy_csinl(npy_clongdouble z)
+{
+    /* Forward to the native csinl() through the npy <-> C99 cast helper. */
+    __npy_clongdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = csinl(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CTANL
+npy_clongdouble
+npy_ctanl(npy_clongdouble z)
+{
+    /* Forward to the native ctanl() through the npy <-> C99 cast helper. */
+    __npy_clongdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = ctanl(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CCOSHL
+npy_clongdouble
+npy_ccoshl(npy_clongdouble z)
+{
+    /* Forward to the native ccoshl() through the npy <-> C99 cast helper. */
+    __npy_clongdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = ccoshl(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CSINHL
+npy_clongdouble
+npy_csinhl(npy_clongdouble z)
+{
+    /* Forward to the native csinhl() through the npy <-> C99 cast helper. */
+    __npy_clongdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = csinhl(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CTANHL
+npy_clongdouble
+npy_ctanhl(npy_clongdouble z)
+{
+    /* Forward to the native ctanhl() through the npy <-> C99 cast helper. */
+    __npy_clongdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = ctanhl(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CACOSL
+npy_clongdouble
+npy_cacosl(npy_clongdouble z)
+{
+    /* Forward to the native cacosl() through the npy <-> C99 cast helper. */
+    __npy_clongdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = cacosl(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CASINL
+npy_clongdouble
+npy_casinl(npy_clongdouble z)
+{
+    /* Forward to the native casinl() through the npy <-> C99 cast helper. */
+    __npy_clongdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = casinl(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CATANL
+npy_clongdouble
+npy_catanl(npy_clongdouble z)
+{
+    /* Forward to the native catanl() through the npy <-> C99 cast helper. */
+    __npy_clongdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = catanl(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CACOSHL
+npy_clongdouble
+npy_cacoshl(npy_clongdouble z)
+{
+    /* Forward to the native cacoshl() through the npy <-> C99 cast helper. */
+    __npy_clongdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = cacoshl(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CASINHL
+npy_clongdouble
+npy_casinhl(npy_clongdouble z)
+{
+    /* Forward to the native casinhl() through the npy <-> C99 cast helper. */
+    __npy_clongdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = casinhl(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+#line 1774
+#ifdef HAVE_CATANHL
+npy_clongdouble
+npy_catanhl(npy_clongdouble z)
+{
+    /* Forward to the native catanhl() through the npy <-> C99 cast helper. */
+    __npy_clongdouble_to_c99_cast in, out;
+    in.npy_z = z;
+    out.c99_z = catanhl(in.c99_z);
+    return out.npy_z;
+}
+#endif
+
+
+
+
+
diff --git a/numpy/core/src/_generated/npy_math_internal.h b/numpy/core/src/_generated/npy_math_internal.h
new file mode 100644
index 000000000000..be4d99df476f
--- /dev/null
+++ b/numpy/core/src/_generated/npy_math_internal.h
@@ -0,0 +1,2005 @@
+#line 1 "numpy/core/src/npymath/npy_math_internal.h.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/*
+ * vim:syntax=c
+ * A small module to implement missing C99 math capabilities required by numpy
+ *
+ * Please keep this independent of python ! Only basic types (npy_longdouble)
+ * can be used, otherwise, pure C, without any use of Python facilities
+ *
+ * How to add a function to this section
+ * -------------------------------------
+ *
+ * Say you want to add `foo`, these are the steps and the reasons for them.
+ *
+ * 1) Add foo to the appropriate list in the configuration system. The
+ *    lists can be found in numpy/core/setup.py lines 63-105. Read the
+ *    comments that come with them, they are very helpful.
+ *
+ * 2) The configuration system will define a macro HAVE_FOO if your function
+ *    can be linked from the math library. The result can depend on the
+ *    optimization flags as well as the compiler, so can't be known ahead of
+ *    time. If the function can't be linked, then either it is absent, defined
+ *    as a macro, or is an intrinsic (hardware) function.
+ *
+ *    i) Undefine any possible macros:
+ *
+ *    #ifdef foo
+ *    #undef foo
+ *    #endif
+ *
+ *    ii) Avoid declaring any function here as far as possible. Declaring
+ *    functions is not portable: some platforms define some function inline
+ *    with a non standard identifier, for example, or may put another
+ *    identifier which changes the calling convention of the function. If you
+ *    really have to, ALWAYS declare it for the one platform you are dealing
+ *    with:
+ *
+ *    Not ok:
+ *        double exp(double a);
+ *
+ *    Ok:
+ *        #ifdef SYMBOL_DEFINED_WEIRD_PLATFORM
+ *        double exp(double);
+ *        #endif
+ *
+ * Some of the code is taken from msun library in FreeBSD, with the following
+ * notice:
+ *
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+#include "npy_math_private.h"
+#ifdef _MSC_VER
+#  include <intrin.h>   // for __popcnt
+#endif
+
+/* Magic binary numbers used by bit_count
+ * For type T, the magic numbers are computed as follows:
+ * Magic[0]: 01 01 01 01 01 01... = (T)~(T)0/3
+ * Magic[1]: 0011 0011 0011...    = (T)~(T)0/15  * 3
+ * Magic[2]: 00001111 00001111... = (T)~(T)0/255 * 15
+ * Magic[3]: 00000001 00000001... = (T)~(T)0/255
+ *
+ * Counting bits set, in parallel
+ * Based on: http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+ *
+ * Generic Algorithm for type T:
+ * a = a - ((a >> 1) & (T)~(T)0/3);
+ * a = (a & (T)~(T)0/15*3) + ((a >> 2) & (T)~(T)0/15*3);
+ * a = (a + (a >> 4)) & (T)~(T)0/255*15;
+ * c = (T)(a * ((T)~(T)0/255)) >> (sizeof(T) - 1) * CHAR_BIT;
+*/
+/* One table per unsigned width; entry i holds Magic[i] as described above. */
+static const npy_uint8  MAGIC8[]  = {0x55u,                 0x33u,                 0x0Fu,                 0x01u};
+static const npy_uint16 MAGIC16[] = {0x5555u,               0x3333u,               0x0F0Fu,               0x0101u};
+static const npy_uint32 MAGIC32[] = {0x55555555ul,          0x33333333ul,          0x0F0F0F0Ful,          0x01010101ul};
+static const npy_uint64 MAGIC64[] = {0x5555555555555555ull, 0x3333333333333333ull, 0x0F0F0F0F0F0F0F0Full, 0x0101010101010101ull};
+
+
+/*
+ *****************************************************************************
+ **                    BLOCKLIST-ABLE BASIC MATH FUNCTIONS                  **
+ *****************************************************************************
+ */
+
+/* The following functions can be blocked, even for doubles */
+
+/* Original code by Konrad Hinsen.  */
+/* Taken from FreeBSD mlib, adapted for numpy
+ *
+ * XXX: we could be a bit faster by reusing high/low words for inf/nan
+ * classification instead of calling npy_isinf/npy_isnan: we should have some
+ * macros for this, though, instead of doing it manually
+ */
+NPY_INPLACE double npy_log2(double x)
+{
+#ifndef NPY_BLOCK_LOG2
+    return log2(x);
+#else
+    int exponent;
+    double mantissa;
+
+    /*
+     * Non-finite or non-positive input: npy_log yields the special value.
+     */
+    if (!npy_isfinite(x) || x <= 0.) {
+        return npy_log(x);
+    }
+
+    /*
+     * Fallback copied from python3.4 math.log2; it provides
+     * int(log(2**i)) == i for i 1-64 in default rounding mode.
+     *
+     * The goal is log2(m * 2**e) == log(m) / log(2) + e.  When x is just
+     * greater than 1.0, e is 1 and log(m) is negative, so adding
+     * log(m) / log(2) to e suffers significant cancellation error; the
+     * x >= 1.0 form below is a slight rewrite that avoids this.
+     */
+    mantissa = frexp(x, &exponent);
+    if (x >= 1.0) {
+        return log(2.0 * mantissa) / log(2.0) + (exponent - 1);
+    }
+    return log(mantissa) / log(2.0) + exponent;
+#endif
+}
+
+/* Taken from FreeBSD mlib, adapted for numpy
+ *
+ * XXX: we could be a bit faster by reusing high/low words for inf/nan
+ * classification instead of calling npy_isinf/npy_isnan: we should have some
+ * macros for this, though, instead of doing it manually
+ */
+/* XXX: belongs in npy_math.h. NOTE(review): the value is the low part of pi (msun's pi_lo), not DBL_EPSILON */
+#define NPY_DBL_EPSILON 1.2246467991473531772E-16
+NPY_INPLACE double npy_atan2(double y, double x)
+{
+#ifndef NPY_BLOCK_ATAN2
+    return atan2(y, x);
+#else
+    npy_int32 k, m, iy, ix, hx, hy;
+    npy_uint32 lx,ly;
+    double z;
+
+    EXTRACT_WORDS(hx, lx, x);
+    ix = hx & 0x7fffffff;       /* hx with its top bit cleared */
+    EXTRACT_WORDS(hy, ly, y);
+    iy = hy & 0x7fffffff;       /* hy with its top bit cleared */
+
+    /* NaN in either argument propagates; x * y is not tested since 0 * inf is NaN too */
+    if (npy_isnan(x) || npy_isnan(y)) {
+        return x + y;
+    }
+
+    if (x == 1.0) {
+        return npy_atan(y);
+    }
+    /* m: bit 1 is set when x has its sign bit set, bit 0 when y has */
+    m = 2 * (npy_signbit((x)) != 0) + (npy_signbit((y)) != 0);
+    if (y == 0.0) {
+        switch(m) {
+        case 0:
+        case 1: return  y;  /* atan(+-0,+anything)=+-0 */
+        case 2: return  NPY_PI;/* atan(+0,-anything) = pi */
+        case 3: return -NPY_PI;/* atan(-0,-anything) =-pi */
+        }
+    }
+    /* y is nonzero from here on, so its sign alone picks +-pi/2 */
+    if (x == 0.0) {
+        return y > 0 ? NPY_PI_2 : -NPY_PI_2;
+    }
+
+    if (npy_isinf(x)) {
+        if (npy_isinf(y)) {
+            switch(m) {
+                case 0: return  NPY_PI_4;/* atan(+INF,+INF) */
+                case 1: return -NPY_PI_4;/* atan(-INF,+INF) */
+                case 2: return  3.0*NPY_PI_4;/*atan(+INF,-INF)*/
+                case 3: return -3.0*NPY_PI_4;/*atan(-INF,-INF)*/
+            }
+        } else {
+            switch(m) {
+                case 0: return  NPY_PZERO;  /* atan(+...,+INF) */
+                case 1: return  NPY_NZERO;  /* atan(-...,+INF) */
+                case 2: return  NPY_PI;  /* atan(+...,-INF) */
+                case 3: return -NPY_PI;  /* atan(-...,-INF) */
+            }
+        }
+    }
+    /* x is finite here, so an infinite y gives +-pi/2 */
+    if (npy_isinf(y)) {
+        return y > 0 ? NPY_PI_2 : -NPY_PI_2;
+    }
+
+    /* compute y/x */
+    k = (iy - ix) >> 20;
+    if (k > 60) {            /* |y/x| >  2**60 */
+        z = NPY_PI_2 + 0.5 * NPY_DBL_EPSILON;
+        m &= 1;              /* keep only the sign of y */
+    } else if (hx < 0 && k < -60) {
+        z = 0.0;    /* 0 > |y|/x > -2**-60 */
+    } else {
+        z = npy_atan(npy_fabs(y/x));        /* safe to do y/x */
+    }
+
+    switch (m) {
+        case 0: return  z  ;    /* atan(+,+) */
+        case 1: return -z  ;    /* atan(-,+) */
+        case 2: return  NPY_PI - (z - NPY_DBL_EPSILON);/* atan(+,-) */
+        default: /* case 3 */
+            return  (z - NPY_DBL_EPSILON) - NPY_PI;/* atan(-,-) */
+    }
+#endif
+}
+
+
+
+
+
+NPY_INPLACE double npy_hypot(double x, double y)
+{
+#ifndef NPY_BLOCK_HYPOT
+    return hypot(x, y);
+#else
+    double big, small, ratio;
+
+    /* An infinite argument is checked first, so it wins over a NaN. */
+    if (npy_isinf(x) || npy_isinf(y)) {
+        return NPY_INFINITY;
+    }
+
+    if (npy_isnan(x) || npy_isnan(y)) {
+        return NPY_NAN;
+    }
+
+    /* Order the magnitudes so that big >= small. */
+    x = npy_fabs(x);
+    y = npy_fabs(y);
+    big = (x < y) ? y : x;
+    small = (x < y) ? x : y;
+    if (big == 0.) {
+        return 0.;
+    }
+
+    /* Work with small/big (at most 1) and scale the square root */
+    /* back up by big. */
+    ratio = small/big;
+    return big*npy_sqrt(1.+ratio*ratio);
+#endif
+}
+
+/*
+ *
+ * sin, cos, tan
+ * sinh, cosh, tanh,
+ * fabs, floor, ceil, rint, trunc
+ * sqrt, log10, log, exp, expm1
+ * asin, acos, atan,
+ * asinh, acosh, atanh
+ *
+ * hypot, atan2, pow, fmod, modf
+ * ldexp, frexp, cbrt
+ *
+ * We assume the above are always available in their double versions.
+ *
+ * NOTE: some facilities may be available as macro only  instead of functions.
+ * For simplicity, we define our own functions and undef the macros. We could
+ * instead test for the macro, but I am too lazy to do that for now.
+ */
+
+
+/*
+ * Decorate all the math functions which are available on the current platform
+ */
+
+#line 285
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, l)
+#endif
+/*
+ * On arm64 macOS, there's a bug with sin, cos, and tan where they don't
+ * raise "invalid" when given INFINITY as input.
+ */
+#if defined(__APPLE__) && defined(__arm64__)
+#define WORKAROUND_APPLE_TRIG_BUG 1
+#else
+#define WORKAROUND_APPLE_TRIG_BUG 0
+#endif
+
+#line 305
+NPY_INPLACE npy_longdouble npy_sinl(npy_longdouble x)
+{
+    /* Workaround builds return x - x for non-finite x instead of calling sin. */
+#if WORKAROUND_APPLE_TRIG_BUG
+    return npy_isfinite(x) ? NPY__FP_SFX(sin)(x) : (x - x);
+#else
+    return NPY__FP_SFX(sin)(x);
+#endif
+}
+
+
+#line 305
+NPY_INPLACE npy_longdouble npy_cosl(npy_longdouble x)
+{
+    /* Workaround builds return x - x for non-finite x instead of calling cos. */
+#if WORKAROUND_APPLE_TRIG_BUG
+    return npy_isfinite(x) ? NPY__FP_SFX(cos)(x) : (x - x);
+#else
+    return NPY__FP_SFX(cos)(x);
+#endif
+}
+
+
+#line 305
+NPY_INPLACE npy_longdouble npy_tanl(npy_longdouble x)
+{
+    /* Workaround builds return x - x for non-finite x instead of calling tan. */
+#if WORKAROUND_APPLE_TRIG_BUG
+    return npy_isfinite(x) ? NPY__FP_SFX(tan)(x) : (x - x);
+#else
+    return NPY__FP_SFX(tan)(x);
+#endif
+}
+
+
+
+#undef WORKAROUND_APPLE_TRIG_BUG
+
+
+#line 285
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_DOUBLE == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, )
+#endif
+/*
+ * On arm64 macOS, there's a bug with sin, cos, and tan where they don't
+ * raise "invalid" when given INFINITY as input.
+ */
+#if defined(__APPLE__) && defined(__arm64__)
+#define WORKAROUND_APPLE_TRIG_BUG 1
+#else
+#define WORKAROUND_APPLE_TRIG_BUG 0
+#endif
+
+#line 305
+NPY_INPLACE npy_double npy_sin(npy_double x)
+{
+    /* Workaround builds return x - x for non-finite x instead of calling sin. */
+#if WORKAROUND_APPLE_TRIG_BUG
+    return npy_isfinite(x) ? NPY__FP_SFX(sin)(x) : (x - x);
+#else
+    return NPY__FP_SFX(sin)(x);
+#endif
+}
+
+
+#line 305
+NPY_INPLACE npy_double npy_cos(npy_double x)
+{
+    /* Workaround builds return x - x for non-finite x instead of calling cos. */
+#if WORKAROUND_APPLE_TRIG_BUG
+    return npy_isfinite(x) ? NPY__FP_SFX(cos)(x) : (x - x);
+#else
+    return NPY__FP_SFX(cos)(x);
+#endif
+}
+
+
+#line 305
+NPY_INPLACE npy_double npy_tan(npy_double x)
+{
+    /* Workaround builds return x - x for non-finite x instead of calling tan. */
+#if WORKAROUND_APPLE_TRIG_BUG
+    return npy_isfinite(x) ? NPY__FP_SFX(tan)(x) : (x - x);
+#else
+    return NPY__FP_SFX(tan)(x);
+#endif
+}
+
+
+
+#undef WORKAROUND_APPLE_TRIG_BUG
+
+
+#line 285
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_FLOAT == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, f)
+#endif
+/*
+ * On arm64 macOS, there's a bug with sin, cos, and tan where they don't
+ * raise "invalid" when given INFINITY as input.
+ */
+#if defined(__APPLE__) && defined(__arm64__)
+#define WORKAROUND_APPLE_TRIG_BUG 1
+#else
+#define WORKAROUND_APPLE_TRIG_BUG 0
+#endif
+
+#line 305
+NPY_INPLACE npy_float npy_sinf(npy_float x)
+{
+    /* Workaround builds return x - x for non-finite x instead of calling sin. */
+#if WORKAROUND_APPLE_TRIG_BUG
+    return npy_isfinite(x) ? NPY__FP_SFX(sin)(x) : (x - x);
+#else
+    return NPY__FP_SFX(sin)(x);
+#endif
+}
+
+
+#line 305
+NPY_INPLACE npy_float npy_cosf(npy_float x)
+{
+    /* Workaround builds return x - x for non-finite x instead of calling cos. */
+#if WORKAROUND_APPLE_TRIG_BUG
+    return npy_isfinite(x) ? NPY__FP_SFX(cos)(x) : (x - x);
+#else
+    return NPY__FP_SFX(cos)(x);
+#endif
+}
+
+
+#line 305
+NPY_INPLACE npy_float npy_tanf(npy_float x)
+{
+    /* Workaround builds return x - x for non-finite x instead of calling tan. */
+#if WORKAROUND_APPLE_TRIG_BUG
+    return npy_isfinite(x) ? NPY__FP_SFX(tan)(x) : (x - x);
+#else
+    return NPY__FP_SFX(tan)(x);
+#endif
+}
+
+
+
+#undef WORKAROUND_APPLE_TRIG_BUG
+
+
+
+/* Blocklist-able C99 functions */
+
+#line 329
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_FLOAT == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, f)
+#endif
+
+#line 340
+
+#ifdef expf
+#undef expf
+#endif
+
+/*
+ * npy_expf: with NPY_BLOCK_EXPF defined the value goes through the double
+ * routine npy_exp; otherwise the NPY__FP_SFX-selected exp routine is called.
+ */
+NPY_INPLACE npy_float npy_expf(npy_float x)
+{
+#ifdef NPY_BLOCK_EXPF
+    return (npy_float) npy_exp((double)x);
+#else
+    return NPY__FP_SFX(exp)(x);
+#endif
+}
+
+
+#line 340
+
+#ifdef log2f
+#undef log2f
+#endif
+
+/*
+ * npy_log2f: with NPY_BLOCK_LOG2F defined the value goes through the double
+ * routine npy_log2; otherwise the NPY__FP_SFX-selected log2 routine is called.
+ */
+NPY_INPLACE npy_float npy_log2f(npy_float x)
+{
+#ifdef NPY_BLOCK_LOG2F
+    return (npy_float) npy_log2((double)x);
+#else
+    return NPY__FP_SFX(log2)(x);
+#endif
+}
+
+
+#line 340
+
+#ifdef sqrtf
+#undef sqrtf
+#endif
+
+/*
+ * npy_sqrtf: with NPY_BLOCK_SQRTF defined the value goes through the double
+ * routine npy_sqrt; otherwise the NPY__FP_SFX-selected sqrt routine is called.
+ */
+NPY_INPLACE npy_float npy_sqrtf(npy_float x)
+{
+#ifdef NPY_BLOCK_SQRTF
+    return (npy_float) npy_sqrt((double)x);
+#else
+    return NPY__FP_SFX(sqrt)(x);
+#endif
+}
+
+
+
+
+#line 365
+#ifdef atan2f
+#undef atan2f
+#endif
+
+/*
+ * npy_atan2f: arguments are forwarded in the order given; with
+ * NPY_BLOCK_ATAN2F defined the double routine npy_atan2 does the work.
+ */
+NPY_INPLACE npy_float npy_atan2f(npy_float x, npy_float y)
+{
+#ifdef NPY_BLOCK_ATAN2F
+    return (npy_float) npy_atan2((double)x, (double) y);
+#else
+    return NPY__FP_SFX(atan2)(x, y);
+#endif
+}
+
+#line 365
+#ifdef hypotf
+#undef hypotf
+#endif
+
+/*
+ * npy_hypotf: arguments are forwarded in the order given; with
+ * NPY_BLOCK_HYPOTF defined the double routine npy_hypot does the work.
+ */
+NPY_INPLACE npy_float npy_hypotf(npy_float x, npy_float y)
+{
+#ifdef NPY_BLOCK_HYPOTF
+    return (npy_float) npy_hypot((double)x, (double) y);
+#else
+    return NPY__FP_SFX(hypot)(x, y);
+#endif
+}
+
+#line 365
+#ifdef powf
+#undef powf
+#endif
+
+/*
+ * npy_powf: arguments are forwarded in the order given; with
+ * NPY_BLOCK_POWF defined the double routine npy_pow does the work.
+ */
+NPY_INPLACE npy_float npy_powf(npy_float x, npy_float y)
+{
+#ifdef NPY_BLOCK_POWF
+    return (npy_float) npy_pow((double)x, (double) y);
+#else
+    return NPY__FP_SFX(pow)(x, y);
+#endif
+}
+
+
+#ifdef modff
+#undef modff
+#endif
+
+/*
+ * npy_modff: returns one part of x and stores the other through iptr; with
+ * NPY_BLOCK_MODFF defined the split is delegated to the double npy_modf.
+ */
+NPY_INPLACE npy_float npy_modff(npy_float x, npy_float *iptr)
+{
+#ifdef NPY_BLOCK_MODFF
+    double stored;
+    const double returned = npy_modf((double)x, &stored);
+    *iptr = (npy_float) stored;
+    return (npy_float) returned;
+#else
+    return NPY__FP_SFX(modf)(x, iptr);
+#endif
+}
+
+
+
+#line 329
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, l)
+#endif
+
+#line 340
+
+#ifdef expl
+#undef expl
+#endif
+
+/*
+ * npy_expl: with NPY_BLOCK_EXPL defined x is narrowed to double and npy_exp
+ * is used; otherwise the NPY__FP_SFX-selected exp routine is called.
+ */
+NPY_INPLACE npy_longdouble npy_expl(npy_longdouble x)
+{
+#ifdef NPY_BLOCK_EXPL
+    return (npy_longdouble) npy_exp((double)x);
+#else
+    return NPY__FP_SFX(exp)(x);
+#endif
+}
+
+
+#line 340
+
+#ifdef log2l
+#undef log2l
+#endif
+
+/*
+ * npy_log2l: with NPY_BLOCK_LOG2L defined x is narrowed to double and
+ * npy_log2 is used; otherwise the NPY__FP_SFX-selected log2 routine is called.
+ */
+NPY_INPLACE npy_longdouble npy_log2l(npy_longdouble x)
+{
+#ifdef NPY_BLOCK_LOG2L
+    return (npy_longdouble) npy_log2((double)x);
+#else
+    return NPY__FP_SFX(log2)(x);
+#endif
+}
+
+
+#line 340
+
+#ifdef sqrtl
+#undef sqrtl
+#endif
+
+/*
+ * npy_sqrtl: with NPY_BLOCK_SQRTL defined x is narrowed to double and
+ * npy_sqrt is used; otherwise the NPY__FP_SFX-selected sqrt routine is called.
+ */
+NPY_INPLACE npy_longdouble npy_sqrtl(npy_longdouble x)
+{
+#ifdef NPY_BLOCK_SQRTL
+    return (npy_longdouble) npy_sqrt((double)x);
+#else
+    return NPY__FP_SFX(sqrt)(x);
+#endif
+}
+
+
+
+
+#line 365
+#ifdef atan2l
+#undef atan2l
+#endif
+
+/*
+ * npy_atan2l: arguments are forwarded in the order given; with
+ * NPY_BLOCK_ATAN2L defined they are narrowed to double for npy_atan2.
+ */
+NPY_INPLACE npy_longdouble npy_atan2l(npy_longdouble x, npy_longdouble y)
+{
+#ifdef NPY_BLOCK_ATAN2L
+    return (npy_longdouble) npy_atan2((double)x, (double) y);
+#else
+    return NPY__FP_SFX(atan2)(x, y);
+#endif
+}
+
+#line 365
+#ifdef hypotl
+#undef hypotl
+#endif
+
+/*
+ * npy_hypotl: arguments are forwarded in the order given; with
+ * NPY_BLOCK_HYPOTL defined they are narrowed to double for npy_hypot.
+ */
+NPY_INPLACE npy_longdouble npy_hypotl(npy_longdouble x, npy_longdouble y)
+{
+#ifdef NPY_BLOCK_HYPOTL
+    return (npy_longdouble) npy_hypot((double)x, (double) y);
+#else
+    return NPY__FP_SFX(hypot)(x, y);
+#endif
+}
+
+#line 365
+#ifdef powl
+#undef powl
+#endif
+
+/*
+ * npy_powl: arguments are forwarded in the order given; with
+ * NPY_BLOCK_POWL defined they are narrowed to double for npy_pow.
+ */
+NPY_INPLACE npy_longdouble npy_powl(npy_longdouble x, npy_longdouble y)
+{
+#ifdef NPY_BLOCK_POWL
+    return (npy_longdouble) npy_pow((double)x, (double) y);
+#else
+    return NPY__FP_SFX(pow)(x, y);
+#endif
+}
+
+
+#ifdef modfl
+#undef modfl
+#endif
+
+/*
+ * npy_modfl: returns one part of x and stores the other through iptr; with
+ * NPY_BLOCK_MODFL defined x is narrowed to double and npy_modf is used.
+ */
+NPY_INPLACE npy_longdouble npy_modfl(npy_longdouble x, npy_longdouble *iptr)
+{
+#ifdef NPY_BLOCK_MODFL
+    double stored;
+    const double returned = npy_modf((double)x, &stored);
+    *iptr = (npy_longdouble) stored;
+    return (npy_longdouble) returned;
+#else
+    return NPY__FP_SFX(modf)(x, iptr);
+#endif
+}
+
+
+
+
+
+#undef NPY__FP_SFX
+
+
+/*
+ * Non standard functions
+ */
+
+#line 420
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_FLOAT == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, f)
+#endif
+/*
+ * Heaviside step function: NaN propagates, x == 0 yields the caller-supplied
+ * h0, negative x yields 0 and positive x yields 1.
+ */
+npy_float npy_heavisidef(npy_float x, npy_float h0)
+{
+    if (npy_isnan(x)) {
+        return (npy_float) NPY_NAN;
+    }
+    if (x == 0) {
+        return h0;
+    }
+
+    return (x < 0) ? (npy_float) 0.0 : (npy_float) 1.0;
+}
+
+#define LOGE2    NPY__FP_SFX(NPY_LOGE2)
+#define LOG2E    NPY__FP_SFX(NPY_LOG2E)
+#define RAD2DEG  (NPY__FP_SFX(180.0)/NPY__FP_SFX(NPY_PI))
+#define DEG2RAD  (NPY__FP_SFX(NPY_PI)/NPY__FP_SFX(180.0))
+
+NPY_INPLACE npy_float npy_rad2degf(npy_float x)
+{
+    return x*RAD2DEG;  /* radians -> degrees: RAD2DEG is 180.0 over NPY_PI */
+}
+
+NPY_INPLACE npy_float npy_deg2radf(npy_float x)
+{
+    return x*DEG2RAD;  /* degrees -> radians: DEG2RAD is NPY_PI over 180.0 */
+}
+
+NPY_INPLACE npy_float npy_log2_1pf(npy_float x)
+{
+    return LOG2E*npy_log1pf(x);  /* log1p(x) scaled by the LOG2E constant */
+}
+
+NPY_INPLACE npy_float npy_exp2_m1f(npy_float x)
+{
+    return npy_expm1f(LOGE2*x);  /* expm1 of x scaled by the LOGE2 constant */
+}
+
+NPY_INPLACE npy_float npy_logaddexpf(npy_float x, npy_float y)
+{
+    npy_float diff;
+
+    /* Equal inputs (incl. same-signed infinities): skip x - y so no warning. */
+    if (x == y) {
+        return x + LOGE2;
+    }
+
+    diff = x - y;
+    if (diff > 0) {
+        /* x is the larger input: add log1p(exp(y - x)) to it */
+        return x + npy_log1pf(npy_expf(-diff));
+    }
+    if (diff <= 0) {
+        return y + npy_log1pf(npy_expf(diff));
+    }
+    /* Neither comparison held: diff is NaN, propagate it. */
+    return diff;
+}
+
+NPY_INPLACE npy_float npy_logaddexp2f(npy_float x, npy_float y)
+{
+    npy_float diff;
+
+    /* Equal inputs (incl. same-signed infinities): skip x - y so no warning. */
+    if (x == y) {
+        return x + 1;
+    }
+
+    diff = x - y;
+    if (diff > 0) {
+        /* x is the larger input: add log2_1p(exp2(y - x)) to it */
+        return x + npy_log2_1pf(npy_exp2f(-diff));
+    }
+    if (diff <= 0) {
+        return y + npy_log2_1pf(npy_exp2f(diff));
+    }
+    /* Neither comparison held: diff is NaN, propagate it. */
+    return diff;
+}
+
+/*
+ * Wrapper function for remainder edge cases
+ * Internally calls npy_divmod*
+ */
+NPY_INPLACE npy_float
+npy_remainderf(npy_float a, npy_float b)
+{
+    npy_float rem;
+
+    /*
+     * b == 0 (and not NaN): plain fmod already gives the correct result
+     * (always NaN), whereas `divmod` may set an additional FPE for the
+     * division by zero creating an inf.
+     */
+    if (NPY_UNLIKELY(!b)) {
+        return npy_fmodf(a, b);
+    }
+
+    npy_divmodf(a, b, &rem);
+    return rem;
+}
+
+NPY_INPLACE npy_float
+npy_floor_dividef(npy_float a, npy_float b) {
+    npy_float unused_mod;
+
+    /*
+     * b == 0 (and not NaN): plain division already gives the correct
+     * result (Inf or NaN), whereas `divmod` may set an additional FPE for
+     * the modulo evaluating to NaN.
+     */
+    if (NPY_UNLIKELY(!b)) {
+        return a / b;
+    }
+
+    /* Only the quotient from divmod is needed; the modulus is discarded. */
+    return npy_divmodf(a, b, &unused_mod);
+}
+
+/*
+ * Python version of divmod: returns the floored quotient of a and b and
+ * stores the matching remainder through `modulus`.
+ * The implementation is mostly copied from cpython 3.5.
+ */
+NPY_INPLACE npy_float
+npy_divmodf(npy_float a, npy_float b, npy_float *modulus)
+{
+    npy_float div, mod, floordiv;
+
+    mod = npy_fmodf(a, b);
+    if (NPY_UNLIKELY(!b)) {
+        /* b == 0 (not NaN): store fmod's result (NaN under IEEE), return a / b */
+        *modulus = mod;
+        return a / b;
+    }
+
+    /* a - mod should be very nearly an integer multiple of b */
+    div = (a - mod) / b;
+
+    /* adjust fmod result to conform to Python convention of remainder */
+    if (mod) {
+        /* mod and b disagree in sign: shift mod by b, lower the quotient by 1 */
+        if (isless(b, (npy_float)0) != isless(mod, (npy_float)0)) {
+            mod += b;
+            div -= 1.0f;
+        }
+    }
+    else {
+        /* mod is zero: give the zero the sign of b */
+        mod = npy_copysignf(0, b);
+    }
+
+    /* snap quotient to nearest integral value: floor, then round up past .5 */
+    if (div) {
+        floordiv = npy_floorf(div);
+        if (isgreater(div - floordiv, 0.5f))
+            floordiv += 1.0f;
+    }
+    else {
+        /* div is zero: return a zero carrying the sign of a/b */
+        floordiv = npy_copysignf(0, a/b);
+    }
+
+    *modulus = mod;
+    return floordiv;
+}
+
+#undef LOGE2
+#undef LOG2E
+#undef RAD2DEG
+#undef DEG2RAD
+#undef NPY__FP_SFX
+
+#line 420
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_DOUBLE == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, )
+#endif
+/*
+ * Heaviside step function: NaN propagates, x == 0 yields the caller-supplied
+ * h0, negative x yields 0 and positive x yields 1.
+ */
+npy_double npy_heaviside(npy_double x, npy_double h0)
+{
+    if (npy_isnan(x)) {
+        return (npy_double) NPY_NAN;
+    }
+    if (x == 0) {
+        return h0;
+    }
+
+    return (x < 0) ? (npy_double) 0.0 : (npy_double) 1.0;
+}
+
+#define LOGE2    NPY__FP_SFX(NPY_LOGE2)
+#define LOG2E    NPY__FP_SFX(NPY_LOG2E)
+#define RAD2DEG  (NPY__FP_SFX(180.0)/NPY__FP_SFX(NPY_PI))
+#define DEG2RAD  (NPY__FP_SFX(NPY_PI)/NPY__FP_SFX(180.0))
+
+NPY_INPLACE npy_double npy_rad2deg(npy_double x)
+{
+    return x*RAD2DEG;  /* radians -> degrees: RAD2DEG is 180.0 over NPY_PI */
+}
+
+NPY_INPLACE npy_double npy_deg2rad(npy_double x)
+{
+    return x*DEG2RAD;  /* degrees -> radians: DEG2RAD is NPY_PI over 180.0 */
+}
+
+NPY_INPLACE npy_double npy_log2_1p(npy_double x)
+{
+    return LOG2E*npy_log1p(x);  /* log1p(x) scaled by the LOG2E constant */
+}
+
+NPY_INPLACE npy_double npy_exp2_m1(npy_double x)
+{
+    return npy_expm1(LOGE2*x);  /* expm1 of x scaled by the LOGE2 constant */
+}
+
+NPY_INPLACE npy_double npy_logaddexp(npy_double x, npy_double y)
+{
+    npy_double diff;
+
+    /* Equal inputs (incl. same-signed infinities): skip x - y so no warning. */
+    if (x == y) {
+        return x + LOGE2;
+    }
+
+    diff = x - y;
+    if (diff > 0) {
+        /* x is the larger input: add log1p(exp(y - x)) to it */
+        return x + npy_log1p(npy_exp(-diff));
+    }
+    if (diff <= 0) {
+        return y + npy_log1p(npy_exp(diff));
+    }
+    /* Neither comparison held: diff is NaN, propagate it. */
+    return diff;
+}
+
+NPY_INPLACE npy_double npy_logaddexp2(npy_double x, npy_double y)
+{
+    npy_double diff;
+
+    /* Equal inputs (incl. same-signed infinities): skip x - y so no warning. */
+    if (x == y) {
+        return x + 1;
+    }
+
+    diff = x - y;
+    if (diff > 0) {
+        /* x is the larger input: add log2_1p(exp2(y - x)) to it */
+        return x + npy_log2_1p(npy_exp2(-diff));
+    }
+    if (diff <= 0) {
+        return y + npy_log2_1p(npy_exp2(diff));
+    }
+    /* Neither comparison held: diff is NaN, propagate it. */
+    return diff;
+}
+
+/*
+ * Wrapper function for remainder edge cases
+ * Internally calls npy_divmod*
+ */
+NPY_INPLACE npy_double
+npy_remainder(npy_double a, npy_double b)
+{
+    npy_double rem;
+
+    /*
+     * b == 0 (and not NaN): plain fmod already gives the correct result
+     * (always NaN), whereas `divmod` may set an additional FPE for the
+     * division by zero creating an inf.
+     */
+    if (NPY_UNLIKELY(!b)) {
+        return npy_fmod(a, b);
+    }
+
+    npy_divmod(a, b, &rem);
+    return rem;
+}
+
+NPY_INPLACE npy_double
+npy_floor_divide(npy_double a, npy_double b) {
+    npy_double unused_mod;
+
+    /*
+     * b == 0 (and not NaN): plain division already gives the correct
+     * result (Inf or NaN), whereas `divmod` may set an additional FPE for
+     * the modulo evaluating to NaN.
+     */
+    if (NPY_UNLIKELY(!b)) {
+        return a / b;
+    }
+
+    /* Only the quotient from divmod is needed; the modulus is discarded. */
+    return npy_divmod(a, b, &unused_mod);
+}
+
+/*
+ * Python version of divmod: returns the floored quotient of a and b and
+ * stores the matching remainder through `modulus`.
+ * The implementation is mostly copied from cpython 3.5.
+ */
+NPY_INPLACE npy_double
+npy_divmod(npy_double a, npy_double b, npy_double *modulus)
+{
+    npy_double div, mod, floordiv;
+
+    mod = npy_fmod(a, b);
+    if (NPY_UNLIKELY(!b)) {
+        /* b == 0 (not NaN): store fmod's result (NaN under IEEE), return a / b */
+        *modulus = mod;
+        return a / b;
+    }
+
+    /* a - mod should be very nearly an integer multiple of b */
+    div = (a - mod) / b;
+
+    /* adjust fmod result to conform to Python convention of remainder */
+    if (mod) {
+        /* mod and b disagree in sign: shift mod by b, lower the quotient by 1 */
+        if (isless(b, (npy_double)0) != isless(mod, (npy_double)0)) {
+            mod += b;
+            div -= 1.0;
+        }
+    }
+    else {
+        /* mod is zero: give the zero the sign of b */
+        mod = npy_copysign(0, b);
+    }
+
+    /* snap quotient to nearest integral value: floor, then round up past .5 */
+    if (div) {
+        floordiv = npy_floor(div);
+        if (isgreater(div - floordiv, 0.5))
+            floordiv += 1.0;
+    }
+    else {
+        /* div is zero: return a zero carrying the sign of a/b */
+        floordiv = npy_copysign(0, a/b);
+    }
+
+    *modulus = mod;
+    return floordiv;
+}
+
+#undef LOGE2
+#undef LOG2E
+#undef RAD2DEG
+#undef DEG2RAD
+#undef NPY__FP_SFX
+
+#line 420
+#undef NPY__FP_SFX
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+    #define NPY__FP_SFX(X) X
+#else
+    #define NPY__FP_SFX(X) NPY_CAT(X, l)
+#endif
+/*
+ * Heaviside step function: NaN propagates, x == 0 yields the caller-supplied
+ * h0, negative x yields 0 and positive x yields 1.
+ */
+npy_longdouble npy_heavisidel(npy_longdouble x, npy_longdouble h0)
+{
+    if (npy_isnan(x)) {
+        return (npy_longdouble) NPY_NAN;
+    }
+    if (x == 0) {
+        return h0;
+    }
+
+    return (x < 0) ? (npy_longdouble) 0.0 : (npy_longdouble) 1.0;
+}
+
+#define LOGE2    NPY__FP_SFX(NPY_LOGE2)
+#define LOG2E    NPY__FP_SFX(NPY_LOG2E)
+#define RAD2DEG  (NPY__FP_SFX(180.0)/NPY__FP_SFX(NPY_PI))
+#define DEG2RAD  (NPY__FP_SFX(NPY_PI)/NPY__FP_SFX(180.0))
+
+NPY_INPLACE npy_longdouble npy_rad2degl(npy_longdouble x)
+{
+    return x*RAD2DEG;  /* radians -> degrees: RAD2DEG is 180.0 over NPY_PI */
+}
+
+NPY_INPLACE npy_longdouble npy_deg2radl(npy_longdouble x)
+{
+    return x*DEG2RAD;  /* degrees -> radians: DEG2RAD is NPY_PI over 180.0 */
+}
+
+NPY_INPLACE npy_longdouble npy_log2_1pl(npy_longdouble x)
+{
+    return LOG2E*npy_log1pl(x);  /* log1p(x) scaled by the LOG2E constant */
+}
+
+NPY_INPLACE npy_longdouble npy_exp2_m1l(npy_longdouble x)
+{
+    return npy_expm1l(LOGE2*x);  /* expm1 of x scaled by the LOGE2 constant */
+}
+
+NPY_INPLACE npy_longdouble npy_logaddexpl(npy_longdouble x, npy_longdouble y)
+{
+    npy_longdouble diff;
+
+    /* Equal inputs (incl. same-signed infinities): skip x - y so no warning. */
+    if (x == y) {
+        return x + LOGE2;
+    }
+
+    diff = x - y;
+    if (diff > 0) {
+        /* x is the larger input: add log1p(exp(y - x)) to it */
+        return x + npy_log1pl(npy_expl(-diff));
+    }
+    if (diff <= 0) {
+        return y + npy_log1pl(npy_expl(diff));
+    }
+    /* Neither comparison held: diff is NaN, propagate it. */
+    return diff;
+}
+
+NPY_INPLACE npy_longdouble npy_logaddexp2l(npy_longdouble x, npy_longdouble y)
+{
+    npy_longdouble diff;
+
+    /* Equal inputs (incl. same-signed infinities): skip x - y so no warning. */
+    if (x == y) {
+        return x + 1;
+    }
+
+    diff = x - y;
+    if (diff > 0) {
+        /* x is the larger input: add log2_1p(exp2(y - x)) to it */
+        return x + npy_log2_1pl(npy_exp2l(-diff));
+    }
+    if (diff <= 0) {
+        return y + npy_log2_1pl(npy_exp2l(diff));
+    }
+    /* Neither comparison held: diff is NaN, propagate it. */
+    return diff;
+}
+
+/*
+ * Wrapper function for remainder edge cases
+ * Internally calls npy_divmod*
+ */
+NPY_INPLACE npy_longdouble
+npy_remainderl(npy_longdouble a, npy_longdouble b)
+{
+    npy_longdouble rem;
+
+    /*
+     * b == 0 (and not NaN): plain fmod already gives the correct result
+     * (always NaN), whereas `divmod` may set an additional FPE for the
+     * division by zero creating an inf.
+     */
+    if (NPY_UNLIKELY(!b)) {
+        return npy_fmodl(a, b);
+    }
+
+    npy_divmodl(a, b, &rem);
+    return rem;
+}
+
+NPY_INPLACE npy_longdouble
+npy_floor_dividel(npy_longdouble a, npy_longdouble b) {
+    npy_longdouble unused_mod;
+
+    /*
+     * b == 0 (and not NaN): plain division already gives the correct
+     * result (Inf or NaN), whereas `divmod` may set an additional FPE for
+     * the modulo evaluating to NaN.
+     */
+    if (NPY_UNLIKELY(!b)) {
+        return a / b;
+    }
+
+    /* Only the quotient from divmod is needed; the modulus is discarded. */
+    return npy_divmodl(a, b, &unused_mod);
+}
+
+/*
+ * Python version of divmod: returns the floored quotient of a and b and
+ * stores the matching remainder through `modulus`.
+ * The implementation is mostly copied from cpython 3.5.
+ */
+NPY_INPLACE npy_longdouble
+npy_divmodl(npy_longdouble a, npy_longdouble b, npy_longdouble *modulus)
+{
+    npy_longdouble div, mod, floordiv;
+
+    mod = npy_fmodl(a, b);
+    if (NPY_UNLIKELY(!b)) {
+        /* b == 0 (not NaN): store fmod's result (NaN under IEEE), return a / b */
+        *modulus = mod;
+        return a / b;
+    }
+
+    /* a - mod should be very nearly an integer multiple of b */
+    div = (a - mod) / b;
+
+    /* adjust fmod result to conform to Python convention of remainder */
+    if (mod) {
+        /* mod and b disagree in sign: shift mod by b, lower the quotient by 1 */
+        if (isless(b, (npy_longdouble)0) != isless(mod, (npy_longdouble)0)) {
+            mod += b;
+            div -= 1.0l;
+        }
+    }
+    else {
+        /* mod is zero: give the zero the sign of b */
+        mod = npy_copysignl(0, b);
+    }
+
+    /* snap quotient to nearest integral value: floor, then round up past .5 */
+    if (div) {
+        floordiv = npy_floorl(div);
+        if (isgreater(div - floordiv, 0.5l))
+            floordiv += 1.0l;
+    }
+    else {
+        /* div is zero: return a zero carrying the sign of a/b */
+        floordiv = npy_copysignl(0, a/b);
+    }
+
+    *modulus = mod;
+    return floordiv;
+}
+
+#undef LOGE2
+#undef LOG2E
+#undef RAD2DEG
+#undef DEG2RAD
+#undef NPY__FP_SFX
+
+
+#line 607
+/* Euclid's algorithm by repeated remainder; a zero `a` returns b unchanged. */
+NPY_INPLACE npy_uint
+npy_gcdu(npy_uint a, npy_uint b)
+{
+    while (a) {
+        const npy_uint rem = b % a;
+        b = a;
+        a = rem;
+    }
+    return b;
+}
+
+NPY_INPLACE npy_uint
+npy_lcmu(npy_uint a, npy_uint b)
+{
+    const npy_uint g = npy_gcdu(a, b);
+    return g ? (a / g) * b : 0;  /* a zero gcd yields 0 rather than dividing */
+}
+
+#line 607
+/* Euclid's algorithm by repeated remainder; a zero `a` returns b unchanged. */
+NPY_INPLACE npy_ulong
+npy_gcdul(npy_ulong a, npy_ulong b)
+{
+    while (a) {
+        const npy_ulong rem = b % a;
+        b = a;
+        a = rem;
+    }
+    return b;
+}
+
+NPY_INPLACE npy_ulong
+npy_lcmul(npy_ulong a, npy_ulong b)
+{
+    const npy_ulong g = npy_gcdul(a, b);
+    return g ? (a / g) * b : 0;  /* a zero gcd yields 0 rather than dividing */
+}
+
+#line 607
+/* Euclid's algorithm by repeated remainder; a zero `a` returns b unchanged. */
+NPY_INPLACE npy_ulonglong
+npy_gcdull(npy_ulonglong a, npy_ulonglong b)
+{
+    while (a) {
+        const npy_ulonglong rem = b % a;
+        b = a;
+        a = rem;
+    }
+    return b;
+}
+
+NPY_INPLACE npy_ulonglong
+npy_lcmull(npy_ulonglong a, npy_ulonglong b)
+{
+    const npy_ulonglong g = npy_gcdull(a, b);
+    return g ? (a / g) * b : 0;  /* a zero gcd yields 0 rather than dividing */
+}
+
+
+#line 633
+NPY_INPLACE npy_int npy_gcd(npy_int a, npy_int b) {
+    /* Negate as unsigned: -a on the signed type overflows for its minimum. */
+    return npy_gcdu(a < 0 ? 0 - (npy_uint)a : (npy_uint)a,
+                    b < 0 ? 0 - (npy_uint)b : (npy_uint)b);
+}
+
+#line 633
+NPY_INPLACE npy_long
+npy_gcdl(npy_long a, npy_long b)
+{   /* gcd(|a|, |b|); |x| is formed in unsigned arithmetic because -x overflows for the minimum value */
+    return npy_gcdul(a < 0 ? 0 - (npy_ulong)a : (npy_ulong)a, b < 0 ? 0 - (npy_ulong)b : (npy_ulong)b);
+}
+
+#line 633
+NPY_INPLACE npy_longlong
+npy_gcdll(npy_longlong a, npy_longlong b)
+{   /* gcd(|a|, |b|); |x| is formed in unsigned arithmetic because -x overflows for the minimum value */
+    return npy_gcdull(a < 0 ? 0 - (npy_ulonglong)a : (npy_ulonglong)a, b < 0 ? 0 - (npy_ulonglong)b : (npy_ulonglong)b);
+}
+
+#line 633
+NPY_INPLACE npy_int
+npy_lcm(npy_int a, npy_int b)
+{   /* lcm(|a|, |b|); |x| is formed in unsigned arithmetic because -x overflows for the minimum value */
+    return npy_lcmu(a < 0 ? 0 - (npy_uint)a : (npy_uint)a, b < 0 ? 0 - (npy_uint)b : (npy_uint)b);
+}
+
+#line 633
+NPY_INPLACE npy_long
+npy_lcml(npy_long a, npy_long b)
+{   /* lcm(|a|, |b|); |x| is formed in unsigned arithmetic because -x overflows for the minimum value */
+    return npy_lcmul(a < 0 ? 0 - (npy_ulong)a : (npy_ulong)a, b < 0 ? 0 - (npy_ulong)b : (npy_ulong)b);
+}
+
+#line 633
+NPY_INPLACE npy_longlong
+npy_lcmll(npy_longlong a, npy_longlong b)
+{   /* lcm(|a|, |b|); |x| is formed in unsigned arithmetic because -x overflows for the minimum value */
+    return npy_lcmull(a < 0 ? 0 - (npy_ulonglong)a : (npy_ulonglong)a, b < 0 ? 0 - (npy_ulonglong)b : (npy_ulonglong)b);
+}
+
+
+/* Unlike LCM and GCD, we need byte and short variants for the shift operators,
+ * since the result is dependent on the width of the type
+ */
+#line 648
+#line 653
+NPY_INPLACE npy_ubyte
+npy_lshiftuhh(npy_ubyte a, npy_ubyte b)
+{
+    /* Counts at or beyond the bit width of the type give 0 and never reach <<. */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a << b;
+    }
+    return 0;
+}
+NPY_INPLACE npy_ubyte
+npy_rshiftuhh(npy_ubyte a, npy_ubyte b)
+{
+    /*
+     * Counts at or beyond the bit width of the type never reach >>.
+     * The operand is unsigned, so there is no sign to preserve:
+     * such a count always produces 0.
+     */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a >> b;
+    }
+
+    return 0;
+}
+
+#line 653
+NPY_INPLACE npy_byte
+npy_lshifthh(npy_byte a, npy_byte b)
+{
+    /* Out-of-range counts (negative ones wrap to huge size_t values) give 0, not <<. */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a << b;
+    }
+    return 0;
+}
+NPY_INPLACE npy_byte
+npy_rshifthh(npy_byte a, npy_byte b)
+{
+    /*
+     * Counts at or beyond the bit width of the type never reach >>; a
+     * negative count wraps to a huge size_t value and is treated the same.
+     * Such a count yields -1 for negative a and 0 otherwise.
+     */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a >> b;
+    }
+    /* preserve the sign bit */
+    return a < 0 ? (npy_byte)-1 : 0;
+}
+
+
+#line 648
+#line 653
+NPY_INPLACE npy_ushort
+npy_lshiftuh(npy_ushort a, npy_ushort b)
+{
+    /* Counts at or beyond the bit width of the type give 0 and never reach <<. */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a << b;
+    }
+    return 0;
+}
+NPY_INPLACE npy_ushort
+npy_rshiftuh(npy_ushort a, npy_ushort b)
+{
+    /*
+     * Counts at or beyond the bit width of the type never reach >>.
+     * The operand is unsigned, so there is no sign to preserve:
+     * such a count always produces 0.
+     */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a >> b;
+    }
+
+    return 0;
+}
+
+#line 653
+NPY_INPLACE npy_short
+npy_lshifth(npy_short a, npy_short b)
+{
+    /* Out-of-range counts (negative ones wrap to huge size_t values) give 0, not <<. */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a << b;
+    }
+    return 0;
+}
+NPY_INPLACE npy_short
+npy_rshifth(npy_short a, npy_short b)
+{
+    /*
+     * Counts at or beyond the bit width of the type never reach >>; a
+     * negative count wraps to a huge size_t value and is treated the same.
+     * Such a count yields -1 for negative a and 0 otherwise.
+     */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a >> b;
+    }
+    /* preserve the sign bit */
+    return a < 0 ? (npy_short)-1 : 0;
+}
+
+
+#line 648
+#line 653
+NPY_INPLACE npy_uint
+npy_lshiftu(npy_uint a, npy_uint b)
+{
+    /* Counts at or beyond the bit width of the type give 0 and never reach <<. */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a << b;
+    }
+    return 0;
+}
+NPY_INPLACE npy_uint
+npy_rshiftu(npy_uint a, npy_uint b)
+{
+    /*
+     * Counts at or beyond the bit width of the type never reach >>.
+     * The operand is unsigned, so there is no sign to preserve:
+     * such a count always produces 0.
+     */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a >> b;
+    }
+
+    return 0;
+}
+
+#line 653
+NPY_INPLACE npy_int
+npy_lshift(npy_int a, npy_int b)
+{
+    /* Out-of-range counts (negative ones wrap to huge size_t values) give 0, not <<. */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a << b;
+    }
+    return 0;
+}
+NPY_INPLACE npy_int
+npy_rshift(npy_int a, npy_int b)
+{
+    /*
+     * Counts at or beyond the bit width of the type never reach >>; a
+     * negative count wraps to a huge size_t value and is treated the same.
+     * Such a count yields -1 for negative a and 0 otherwise.
+     */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a >> b;
+    }
+    /* preserve the sign bit */
+    return a < 0 ? (npy_int)-1 : 0;
+}
+
+
+#line 648
+#line 653
+NPY_INPLACE npy_ulong
+npy_lshiftul(npy_ulong a, npy_ulong b)
+{
+    /* Counts at or beyond the bit width of the type give 0 and never reach <<. */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a << b;
+    }
+    return 0;
+}
+NPY_INPLACE npy_ulong
+npy_rshiftul(npy_ulong a, npy_ulong b)
+{
+    /*
+     * Counts at or beyond the bit width of the type never reach >>.
+     * The operand is unsigned, so there is no sign to preserve:
+     * such a count always produces 0.
+     */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a >> b;
+    }
+
+    return 0;
+}
+
+#line 653
+NPY_INPLACE npy_long
+npy_lshiftl(npy_long a, npy_long b)
+{
+    /* Out-of-range counts (negative ones wrap to huge size_t values) give 0, not <<. */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a << b;
+    }
+    return 0;
+}
+NPY_INPLACE npy_long
+npy_rshiftl(npy_long a, npy_long b)
+{
+    /*
+     * Counts at or beyond the bit width of the type never reach >>; a
+     * negative count wraps to a huge size_t value and is treated the same.
+     * Such a count yields -1 for negative a and 0 otherwise.
+     */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a >> b;
+    }
+    /* preserve the sign bit */
+    return a < 0 ? (npy_long)-1 : 0;
+}
+
+
+#line 648
+#line 653
+NPY_INPLACE npy_ulonglong
+npy_lshiftull(npy_ulonglong a, npy_ulonglong b)
+{
+    /* Counts at or beyond the bit width of the type give 0 and never reach <<. */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a << b;
+    }
+    return 0;
+}
+NPY_INPLACE npy_ulonglong
+npy_rshiftull(npy_ulonglong a, npy_ulonglong b)
+{
+    /*
+     * Counts at or beyond the bit width of the type never reach >>.
+     * The operand is unsigned, so there is no sign to preserve:
+     * such a count always produces 0.
+     */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a >> b;
+    }
+
+    return 0;
+}
+
+#line 653
+NPY_INPLACE npy_longlong
+npy_lshiftll(npy_longlong a, npy_longlong b)
+{
+    /* Out-of-range counts (negative ones wrap to huge size_t values) give 0, not <<. */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a << b;
+    }
+    return 0;
+}
+NPY_INPLACE npy_longlong
+npy_rshiftll(npy_longlong a, npy_longlong b)
+{
+    /*
+     * Counts at or beyond the bit width of the type never reach >>; a
+     * negative count wraps to a huge size_t value and is treated the same.
+     * Such a count yields -1 for negative a and 0 otherwise.
+     */
+    const size_t nbits = sizeof(a) * CHAR_BIT;
+    if (NPY_LIKELY((size_t)b < nbits)) {
+        return a >> b;
+    }
+    /* preserve the sign bit */
+    return a < 0 ? (npy_longlong)-1 : 0;
+}
+
+
+
+
+#define __popcnt32 __popcnt  /* gives the 32-bit spelling used below (TO_BITS_LEN(__popcnt), __popcnt32(...)) a target */
+#line 689
+#undef TO_BITS_LEN
+#if 0  /* placeholder so that every generated width test can be an #elif */
+#line 694
+#elif NPY_BITSOF_BYTE == 8
+    #define TO_BITS_LEN(X) X##8  /* pastes the bit width onto X */
+
+#line 694
+#elif NPY_BITSOF_BYTE == 16
+    #define TO_BITS_LEN(X) X##16
+
+#line 694
+#elif NPY_BITSOF_BYTE == 32
+    #define TO_BITS_LEN(X) X##32
+
+#line 694
+#elif NPY_BITSOF_BYTE == 64
+    #define TO_BITS_LEN(X) X##64
+
+#endif  /* TO_BITS_LEN stays undefined when none of the widths matched */
+
+
+NPY_INPLACE uint8_t
+npy_popcount_parallelhh(npy_ubyte a)
+{   /* Portable popcount built from shifts and masks; TO_BITS_LEN(MAGIC) names a table defined outside this chunk, indexed 0..3 here. */
+    a = a - ((a >> 1) & (npy_ubyte) TO_BITS_LEN(MAGIC)[0]);  /* NOTE(review): presumably folds bit pairs into 2-bit counts — confirm against the MAGIC table */
+    a = ((a & (npy_ubyte) TO_BITS_LEN(MAGIC)[1])) + ((a >> 2) & (npy_ubyte) TO_BITS_LEN(MAGIC)[1]);  /* same mask applied to a and to a >> 2, then summed */
+    a = (a + (a >> 4)) & (npy_ubyte) TO_BITS_LEN(MAGIC)[2];  /* adds a to a >> 4 and masks the sum */
+    return (npy_ubyte) (a * (npy_ubyte) TO_BITS_LEN(MAGIC)[3]) >> ((NPY_SIZEOF_BYTE - 1) * CHAR_BIT);  /* the cast covers only the product; the shift then brings the top byte down */
+}
+
+NPY_INPLACE uint8_t
+npy_popcountuhh(npy_ubyte a)
+{
+/* Count the set bits of a: use a compiler built-in when one of the branches below applies, else our own implementation */
+#if (defined(__clang__) || defined(__GNUC__)) && NPY_BITSOF_BYTE >= 32
+    return __builtin_popcounthh(a);  /* only compiled when the #if above holds, i.e. the type is at least 32 bits wide */
+#elif defined(_MSC_VER) && NPY_BITSOF_BYTE >= 16 && !defined(_M_ARM64) && !defined(_M_ARM)
+    /* no builtin __popcnt64 for 32 bits, so only 64-bit Windows or narrower operands call the width-suffixed intrinsic */
+    #if defined(_WIN64) || (defined(_WIN32) && NPY_BITSOF_BYTE != 64)
+        return TO_BITS_LEN(__popcnt)(a);
+    /* split 64 bit number into two 32 bit ints and return sum of counts */
+    #elif (defined(_WIN32) && NPY_BITSOF_BYTE == 64)
+        npy_uint32 left  = (npy_uint32) (a>>32);
+        npy_uint32 right = (npy_uint32) a;
+        return __popcnt32(left) + __popcnt32(right);
+    #endif
+#else  /* any other compiler or width: portable fallback */
+    return npy_popcount_parallelhh(a);
+#endif
+}
+
+#line 689
+#undef TO_BITS_LEN
+#if 0  /* placeholder so that every generated width test can be an #elif */
+#line 694
+#elif NPY_BITSOF_SHORT == 8
+    #define TO_BITS_LEN(X) X##8  /* pastes the bit width onto X */
+
+#line 694
+#elif NPY_BITSOF_SHORT == 16
+    #define TO_BITS_LEN(X) X##16
+
+#line 694
+#elif NPY_BITSOF_SHORT == 32
+    #define TO_BITS_LEN(X) X##32
+
+#line 694
+#elif NPY_BITSOF_SHORT == 64
+    #define TO_BITS_LEN(X) X##64
+
+#endif  /* TO_BITS_LEN stays undefined when none of the widths matched */
+
+
+NPY_INPLACE uint8_t
+npy_popcount_parallelh(npy_ushort a)
+{   /* Portable popcount built from shifts and masks; TO_BITS_LEN(MAGIC) names a table defined outside this chunk, indexed 0..3 here. */
+    a = a - ((a >> 1) & (npy_ushort) TO_BITS_LEN(MAGIC)[0]);  /* NOTE(review): presumably folds bit pairs into 2-bit counts — confirm against the MAGIC table */
+    a = ((a & (npy_ushort) TO_BITS_LEN(MAGIC)[1])) + ((a >> 2) & (npy_ushort) TO_BITS_LEN(MAGIC)[1]);  /* same mask applied to a and to a >> 2, then summed */
+    a = (a + (a >> 4)) & (npy_ushort) TO_BITS_LEN(MAGIC)[2];  /* adds a to a >> 4 and masks the sum */
+    return (npy_ushort) (a * (npy_ushort) TO_BITS_LEN(MAGIC)[3]) >> ((NPY_SIZEOF_SHORT - 1) * CHAR_BIT);  /* the cast covers only the product; the shift then brings the top byte down */
+}
+
+NPY_INPLACE uint8_t
+npy_popcountuh(npy_ushort a)
+{
+/* Count the set bits of a: use a compiler built-in when one of the branches below applies, else our own implementation */
+#if (defined(__clang__) || defined(__GNUC__)) && NPY_BITSOF_SHORT >= 32
+    return __builtin_popcounth(a);  /* only compiled when the #if above holds, i.e. the type is at least 32 bits wide */
+#elif defined(_MSC_VER) && NPY_BITSOF_SHORT >= 16 && !defined(_M_ARM64) && !defined(_M_ARM)
+    /* no builtin __popcnt64 for 32 bits, so only 64-bit Windows or narrower operands call the width-suffixed intrinsic */
+    #if defined(_WIN64) || (defined(_WIN32) && NPY_BITSOF_SHORT != 64)
+        return TO_BITS_LEN(__popcnt)(a);
+    /* split 64 bit number into two 32 bit ints and return sum of counts */
+    #elif (defined(_WIN32) && NPY_BITSOF_SHORT == 64)
+        npy_uint32 left  = (npy_uint32) (a>>32);
+        npy_uint32 right = (npy_uint32) a;
+        return __popcnt32(left) + __popcnt32(right);
+    #endif
+#else  /* any other compiler or width: portable fallback */
+    return npy_popcount_parallelh(a);
+#endif
+}
+
+#line 689
+#undef TO_BITS_LEN
+#if 0  /* placeholder so that every generated width test can be an #elif */
+#line 694
+#elif NPY_BITSOF_INT == 8
+    #define TO_BITS_LEN(X) X##8  /* pastes the bit width onto X */
+
+#line 694
+#elif NPY_BITSOF_INT == 16
+    #define TO_BITS_LEN(X) X##16
+
+#line 694
+#elif NPY_BITSOF_INT == 32
+    #define TO_BITS_LEN(X) X##32
+
+#line 694
+#elif NPY_BITSOF_INT == 64
+    #define TO_BITS_LEN(X) X##64
+
+#endif  /* TO_BITS_LEN stays undefined when none of the widths matched */
+
+
+NPY_INPLACE uint8_t
+npy_popcount_parallel(npy_uint a)
+{   /* Portable popcount built from shifts and masks; TO_BITS_LEN(MAGIC) names a table defined outside this chunk, indexed 0..3 here. */
+    a = a - ((a >> 1) & (npy_uint) TO_BITS_LEN(MAGIC)[0]);  /* NOTE(review): presumably folds bit pairs into 2-bit counts — confirm against the MAGIC table */
+    a = ((a & (npy_uint) TO_BITS_LEN(MAGIC)[1])) + ((a >> 2) & (npy_uint) TO_BITS_LEN(MAGIC)[1]);  /* same mask applied to a and to a >> 2, then summed */
+    a = (a + (a >> 4)) & (npy_uint) TO_BITS_LEN(MAGIC)[2];  /* adds a to a >> 4 and masks the sum */
+    return (npy_uint) (a * (npy_uint) TO_BITS_LEN(MAGIC)[3]) >> ((NPY_SIZEOF_INT - 1) * CHAR_BIT);  /* the cast covers only the product; the shift then brings the top byte down */
+}
+
+NPY_INPLACE uint8_t
+npy_popcountu(npy_uint a)
+{
+/* Count the set bits of a: use a compiler built-in when one of the branches below applies, else our own implementation */
+#if (defined(__clang__) || defined(__GNUC__)) && NPY_BITSOF_INT >= 32
+    return __builtin_popcount(a);  /* only compiled when the #if above holds, i.e. the type is at least 32 bits wide */
+#elif defined(_MSC_VER) && NPY_BITSOF_INT >= 16 && !defined(_M_ARM64) && !defined(_M_ARM)
+    /* no builtin __popcnt64 for 32 bits, so only 64-bit Windows or narrower operands call the width-suffixed intrinsic */
+    #if defined(_WIN64) || (defined(_WIN32) && NPY_BITSOF_INT != 64)
+        return TO_BITS_LEN(__popcnt)(a);
+    /* split 64 bit number into two 32 bit ints and return sum of counts */
+    #elif (defined(_WIN32) && NPY_BITSOF_INT == 64)
+        npy_uint32 left  = (npy_uint32) (a>>32);
+        npy_uint32 right = (npy_uint32) a;
+        return __popcnt32(left) + __popcnt32(right);
+    #endif
+#else  /* any other compiler or width: portable fallback */
+    return npy_popcount_parallel(a);
+#endif
+}
+
+#line 689
+#undef TO_BITS_LEN
+#if 0  /* placeholder so that every generated width test can be an #elif */
+#line 694
+#elif NPY_BITSOF_LONG == 8
+    #define TO_BITS_LEN(X) X##8  /* pastes the bit width onto X */
+
+#line 694
+#elif NPY_BITSOF_LONG == 16
+    #define TO_BITS_LEN(X) X##16
+
+#line 694
+#elif NPY_BITSOF_LONG == 32
+    #define TO_BITS_LEN(X) X##32
+
+#line 694
+#elif NPY_BITSOF_LONG == 64
+    #define TO_BITS_LEN(X) X##64
+
+#endif  /* TO_BITS_LEN stays undefined when none of the widths matched */
+
+
+NPY_INPLACE uint8_t
+npy_popcount_parallell(npy_ulong a)
+{   /* Portable popcount built from shifts and masks; TO_BITS_LEN(MAGIC) names a table defined outside this chunk, indexed 0..3 here. */
+    a = a - ((a >> 1) & (npy_ulong) TO_BITS_LEN(MAGIC)[0]);  /* NOTE(review): presumably folds bit pairs into 2-bit counts — confirm against the MAGIC table */
+    a = ((a & (npy_ulong) TO_BITS_LEN(MAGIC)[1])) + ((a >> 2) & (npy_ulong) TO_BITS_LEN(MAGIC)[1]);  /* same mask applied to a and to a >> 2, then summed */
+    a = (a + (a >> 4)) & (npy_ulong) TO_BITS_LEN(MAGIC)[2];  /* adds a to a >> 4 and masks the sum */
+    return (npy_ulong) (a * (npy_ulong) TO_BITS_LEN(MAGIC)[3]) >> ((NPY_SIZEOF_LONG - 1) * CHAR_BIT);  /* the cast covers only the product; the shift then brings the top byte down */
+}
+
+NPY_INPLACE uint8_t
+npy_popcountul(npy_ulong a)
+{
+/* Count the set bits of a: use a compiler built-in when one of the branches below applies, else our own implementation */
+#if (defined(__clang__) || defined(__GNUC__)) && NPY_BITSOF_LONG >= 32
+    return __builtin_popcountl(a);  /* only compiled when the #if above holds, i.e. the type is at least 32 bits wide */
+#elif defined(_MSC_VER) && NPY_BITSOF_LONG >= 16 && !defined(_M_ARM64) && !defined(_M_ARM)
+    /* no builtin __popcnt64 for 32 bits, so only 64-bit Windows or narrower operands call the width-suffixed intrinsic */
+    #if defined(_WIN64) || (defined(_WIN32) && NPY_BITSOF_LONG != 64)
+        return TO_BITS_LEN(__popcnt)(a);
+    /* split 64 bit number into two 32 bit ints and return sum of counts */
+    #elif (defined(_WIN32) && NPY_BITSOF_LONG == 64)
+        npy_uint32 left  = (npy_uint32) (a>>32);
+        npy_uint32 right = (npy_uint32) a;
+        return __popcnt32(left) + __popcnt32(right);
+    #endif
+#else  /* any other compiler or width: portable fallback */
+    return npy_popcount_parallell(a);
+#endif
+}
+
+#line 689
+#undef TO_BITS_LEN
+#if 0  /* placeholder so that every generated width test can be an #elif */
+#line 694
+#elif NPY_BITSOF_LONGLONG == 8
+    #define TO_BITS_LEN(X) X##8  /* pastes the bit width onto X */
+
+#line 694
+#elif NPY_BITSOF_LONGLONG == 16
+    #define TO_BITS_LEN(X) X##16
+
+#line 694
+#elif NPY_BITSOF_LONGLONG == 32
+    #define TO_BITS_LEN(X) X##32
+
+#line 694
+#elif NPY_BITSOF_LONGLONG == 64
+    #define TO_BITS_LEN(X) X##64
+
+#endif  /* TO_BITS_LEN stays undefined when none of the widths matched */
+
+
+NPY_INPLACE uint8_t
+npy_popcount_parallelll(npy_ulonglong a)
+{   /* Portable popcount built from shifts and masks; TO_BITS_LEN(MAGIC) names a table defined outside this chunk, indexed 0..3 here. */
+    a = a - ((a >> 1) & (npy_ulonglong) TO_BITS_LEN(MAGIC)[0]);  /* NOTE(review): presumably folds bit pairs into 2-bit counts — confirm against the MAGIC table */
+    a = ((a & (npy_ulonglong) TO_BITS_LEN(MAGIC)[1])) + ((a >> 2) & (npy_ulonglong) TO_BITS_LEN(MAGIC)[1]);  /* same mask applied to a and to a >> 2, then summed */
+    a = (a + (a >> 4)) & (npy_ulonglong) TO_BITS_LEN(MAGIC)[2];  /* adds a to a >> 4 and masks the sum */
+    return (npy_ulonglong) (a * (npy_ulonglong) TO_BITS_LEN(MAGIC)[3]) >> ((NPY_SIZEOF_LONGLONG - 1) * CHAR_BIT);  /* the cast covers only the product; the shift then brings the top byte down */
+}
+
+NPY_INPLACE uint8_t
+npy_popcountull(npy_ulonglong a)
+{
+/* Count the set bits of a: use a compiler built-in when one of the branches below applies, else our own implementation */
+#if (defined(__clang__) || defined(__GNUC__)) && NPY_BITSOF_LONGLONG >= 32
+    return __builtin_popcountll(a);  /* only compiled when the #if above holds, i.e. the type is at least 32 bits wide */
+#elif defined(_MSC_VER) && NPY_BITSOF_LONGLONG >= 16 && !defined(_M_ARM64) && !defined(_M_ARM)
+    /* no builtin __popcnt64 for 32 bits, so only 64-bit Windows or narrower operands call the width-suffixed intrinsic */
+    #if defined(_WIN64) || (defined(_WIN32) && NPY_BITSOF_LONGLONG != 64)
+        return TO_BITS_LEN(__popcnt)(a);
+    /* split 64 bit number into two 32 bit ints and return sum of counts */
+    #elif (defined(_WIN32) && NPY_BITSOF_LONGLONG == 64)
+        npy_uint32 left  = (npy_uint32) (a>>32);
+        npy_uint32 right = (npy_uint32) a;
+        return __popcnt32(left) + __popcnt32(right);
+    #endif
+#else  /* any other compiler or width: portable fallback */
+    return npy_popcount_parallelll(a);
+#endif
+}
+
+
+#line 736
+NPY_INPLACE uint8_t
+npy_popcounthh(npy_byte a)
+{
+    /* Bits are counted on the magnitude of a, so its sign does not affect the result. */
+    return npy_popcountuhh((a >= 0) ? a : -a);
+}
+
+#line 736
+NPY_INPLACE uint8_t
+npy_popcounth(npy_short a)
+{
+    /* Bits are counted on the magnitude of a, so its sign does not affect the result. */
+    return npy_popcountuh((a >= 0) ? a : -a);
+}
+
+#line 736
+NPY_INPLACE uint8_t
+npy_popcount(npy_int a)
+{
+    /* Return popcount of abs(a); the negation is done as unsigned because -a overflows for the minimum value */
+    return npy_popcountu(a < 0 ? 0 - (npy_uint)a : (npy_uint)a);
+}
+
+#line 736
+NPY_INPLACE uint8_t
+npy_popcountl(npy_long a)
+{
+    /* Return popcount of abs(a); the negation is done as unsigned because -a overflows for the minimum value */
+    return npy_popcountul(a < 0 ? 0 - (npy_ulong)a : (npy_ulong)a);
+}
+
+#line 736
+NPY_INPLACE uint8_t
+npy_popcountll(npy_longlong a)
+{
+    /* Return popcount of abs(a); the negation is done as unsigned because -a overflows for the minimum value */
+    return npy_popcountull(a < 0 ? 0 - (npy_ulonglong)a : (npy_ulonglong)a);
+}
+
+
+
diff --git a/numpy/core/src/_generated/npy_sort.h b/numpy/core/src/_generated/npy_sort.h
new file mode 100644
index 000000000000..a2231b4b3d01
--- /dev/null
+++ b/numpy/core/src/_generated/npy_sort.h
@@ -0,0 +1,463 @@
+#line 1 "numpy/core/src/common/npy_sort.h.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+#ifndef __NPY_SORT_H__
+#define __NPY_SORT_H__
+
+/* Python include is for future object sorts */
+#include <Python.h>
+#include <numpy/npy_common.h>
+#include <numpy/ndarraytypes.h>
+
+#define NPY_ENOMEM 1
+#define NPY_ECOMP 2
+
+static inline int npy_get_msb(npy_uintp unum)
+{
+    int msb = 0;  /* index of the highest set bit; stays 0 when unum is 0 or 1 */
+    for (unum >>= 1; unum != 0; unum >>= 1) {
+        ++msb;
+    }
+    return msb;
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+/*
+ *****************************************************************************
+ **                            NUMERIC SORTS                                **
+ *****************************************************************************
+ */
+
+
+#line 40
+/* bool sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_bool(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_bool(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_bool(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_bool(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_bool(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_bool(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_bool(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_bool(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* byte sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_byte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_byte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_byte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_byte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_byte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_byte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_byte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_byte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* ubyte sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_ubyte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_ubyte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_ubyte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_ubyte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_ubyte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_ubyte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_ubyte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_ubyte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* short sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_short(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_short(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_short(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_short(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_short(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_short(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_short(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_short(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* ushort sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_ushort(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_ushort(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_ushort(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_ushort(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_ushort(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_ushort(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_ushort(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_ushort(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* int sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_int(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_int(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_int(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_int(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_int(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_int(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_int(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_int(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* uint sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_uint(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_uint(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_uint(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_uint(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_uint(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_uint(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_uint(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_uint(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* long sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_long(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_long(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_long(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_long(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_long(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_long(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_long(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_long(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* ulong sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_ulong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_ulong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_ulong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_ulong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_ulong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_ulong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_ulong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_ulong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* longlong sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_longlong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_longlong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_longlong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_longlong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_longlong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_longlong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_longlong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_longlong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* ulonglong sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_ulonglong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_ulonglong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_ulonglong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_ulonglong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_ulonglong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_ulonglong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_ulonglong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_ulonglong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* half sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_half(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_half(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_half(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_half(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_half(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_half(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_half(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_half(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* float sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_float(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_float(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_float(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_float(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_float(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_float(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_float(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_float(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* double sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_double(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_double(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_double(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_double(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_double(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_double(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_double(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_double(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* longdouble sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_longdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_longdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_longdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_longdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_longdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_longdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_longdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_longdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* cfloat sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_cfloat(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_cfloat(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_cfloat(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_cfloat(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_cfloat(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_cfloat(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_cfloat(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_cfloat(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* cdouble sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_cdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_cdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_cdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_cdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_cdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_cdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_cdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_cdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* clongdouble sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_clongdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_clongdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_clongdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_clongdouble(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_clongdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_clongdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_clongdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_clongdouble(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* datetime sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_datetime(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_datetime(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_datetime(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_datetime(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_datetime(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_datetime(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_datetime(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_datetime(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+#line 40
+/* timedelta sorts: (vec, cnt, null); the a-prefixed variants also take an npy_intp *ind index array. */
+NPY_NO_EXPORT int quicksort_timedelta(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int heapsort_timedelta(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int mergesort_timedelta(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int timsort_timedelta(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aquicksort_timedelta(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aheapsort_timedelta(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int amergesort_timedelta(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+NPY_NO_EXPORT int atimsort_timedelta(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {  /* nested inside the header-wide extern "C" block; redundant but valid */
+#endif
+NPY_NO_EXPORT int radixsort_bool(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_bool(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {  /* nested inside the header-wide extern "C" block; redundant but valid */
+#endif
+NPY_NO_EXPORT int radixsort_byte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_byte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {  /* nested inside the header-wide extern "C" block; redundant but valid */
+#endif
+NPY_NO_EXPORT int radixsort_ubyte(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_ubyte(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {  /* nested inside the header-wide extern "C" block; redundant but valid */
+#endif
+NPY_NO_EXPORT int radixsort_short(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_short(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {  /* nested inside the header-wide extern "C" block; redundant but valid */
+#endif
+NPY_NO_EXPORT int radixsort_ushort(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_ushort(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {  /* nested inside the header-wide extern "C" block; redundant but valid */
+#endif
+NPY_NO_EXPORT int radixsort_int(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_int(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {  /* nested inside the header-wide extern "C" block; redundant but valid */
+#endif
+NPY_NO_EXPORT int radixsort_uint(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_uint(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {  /* nested inside the header-wide extern "C" block; redundant but valid */
+#endif
+NPY_NO_EXPORT int radixsort_long(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_long(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {  /* nested inside the header-wide extern "C" block; redundant but valid */
+#endif
+NPY_NO_EXPORT int radixsort_ulong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_ulong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {  /* nested inside the header-wide extern "C" block; redundant but valid */
+#endif
+NPY_NO_EXPORT int radixsort_longlong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_longlong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+
+#line 57
+#ifdef __cplusplus
+extern "C" {  /* nested inside the header-wide extern "C" block; redundant but valid */
+#endif
+NPY_NO_EXPORT int radixsort_ulonglong(void *vec, npy_intp cnt, void *null);
+NPY_NO_EXPORT int aradixsort_ulonglong(void *vec, npy_intp *ind, npy_intp cnt, void *null);
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+
+
+
+
+/*
+ *****************************************************************************
+ **                             STRING SORTS                                **
+ *****************************************************************************
+ */
+
+
+#line 81
+/* string sorts: the last parameter is named arr here (null in the numeric sorts); a-prefixed variants add ind. */
+NPY_NO_EXPORT int quicksort_string(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int heapsort_string(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int mergesort_string(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int timsort_string(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int aquicksort_string(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int aheapsort_string(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int amergesort_string(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int atimsort_string(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+
+
+#line 81
+/* unicode sorts: the last parameter is named arr here (null in the numeric sorts); a-prefixed variants add ind. */
+NPY_NO_EXPORT int quicksort_unicode(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int heapsort_unicode(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int mergesort_unicode(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int timsort_unicode(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int aquicksort_unicode(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int aheapsort_unicode(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int amergesort_unicode(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int atimsort_unicode(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+
+
+
+
+/*
+ *****************************************************************************
+ **                             GENERIC SORT                                **
+ *****************************************************************************
+ */
+
+/* Generic sorts, not suffixed by element type: (vec, cnt, arr); a-prefixed variants add an npy_intp *ind index array. */
+NPY_NO_EXPORT int npy_quicksort(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_heapsort(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_mergesort(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_timsort(void *vec, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_aquicksort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_aheapsort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_amergesort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+NPY_NO_EXPORT int npy_atimsort(void *vec, npy_intp *ind, npy_intp cnt, void *arr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/numpy/core/src/_generated/scalarmath.c b/numpy/core/src/_generated/scalarmath.c
new file mode 100644
index 000000000000..5eaf4c22b286
--- /dev/null
+++ b/numpy/core/src/_generated/scalarmath.c
@@ -0,0 +1,43813 @@
+#line 1 "numpy/core/src/umath/scalarmath.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/* -*- c -*- */
+
+/* The purpose of this module is to add faster math for array scalars
+   that does not go through the ufunc machinery but still supports
+   error modes.
+*/
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+#define _UMATHMODULE
+#define _MULTIARRAYMODULE
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+
+#include "npy_config.h"
+#include "numpy/arrayobject.h"
+#include "numpy/ufuncobject.h"
+#include "numpy/arrayscalars.h"
+
+#include "npy_import.h"
+#include "npy_pycompat.h"
+
+#include "numpy/halffloat.h"
+#include "templ_common.h"
+
+#include "binop_override.h"
+#include "npy_longdouble.h"
+
+#include "arraytypes.h"
+#include "array_coercion.h"
+#include "common.h"
+#include "can_cast_table.h"
+#include "umathmodule.h"
+
+#include "convert_datatype.h"
+
+
+/* TODO: Used for some functions, should possibly move these to npy_math.h */
+#include "loops.h"
+
+/* Basic operations:
+ *
+ *  BINARY:
+ *
+ * add, subtract, multiply, divide, remainder, divmod, power,
+ * floor_divide, true_divide
+ *
+ * lshift, rshift, and, or, xor (integers only)
+ *
+ * UNARY:
+ *
+ * negative, positive, absolute, nonzero, invert, int, long, float, oct, hex
+ *
+ */
+
+#line 60
+/*
+ * Add two npy_byte values and store the sum in *out.
+ * Returns NPY_FPE_OVERFLOW when the stored sum has a different sign
+ * from both operands, otherwise 0.
+ */
+static inline int
+byte_ctype_add(npy_byte a, npy_byte b, npy_byte *out) {
+    *out = a + b;
+    /* Both XORs negative <=> the sign bit of *out differs from a and b. */
+    if ((*out ^ a) < 0 && (*out ^ b) < 0) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+
+/*
+ * Subtract b from a (npy_byte) and store the difference in *out.
+ * Returns NPY_FPE_OVERFLOW when the stored difference has a different
+ * sign from a and the same sign as b, otherwise 0.
+ */
+static inline int
+byte_ctype_subtract(npy_byte a, npy_byte b, npy_byte *out) {
+    *out = a - b;
+    /* ~b flips b's sign bit, so this mirrors the addition test. */
+    if ((*out ^ a) < 0 && (*out ^ ~b) < 0) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+
+#line 60
+/*
+ * Add two npy_short values and store the sum in *out.
+ * Returns NPY_FPE_OVERFLOW when the stored sum has a different sign
+ * from both operands, otherwise 0.
+ */
+static inline int
+short_ctype_add(npy_short a, npy_short b, npy_short *out) {
+    *out = a + b;
+    /* Both XORs negative <=> the sign bit of *out differs from a and b. */
+    if ((*out ^ a) < 0 && (*out ^ b) < 0) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+
+/*
+ * Subtract b from a (npy_short) and store the difference in *out.
+ * Returns NPY_FPE_OVERFLOW when the stored difference has a different
+ * sign from a and the same sign as b, otherwise 0.
+ */
+static inline int
+short_ctype_subtract(npy_short a, npy_short b, npy_short *out) {
+    *out = a - b;
+    /* ~b flips b's sign bit, so this mirrors the addition test. */
+    if ((*out ^ a) < 0 && (*out ^ ~b) < 0) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+
+#line 60
+/*
+ * Add two npy_int values and store the wrapped sum in *out.
+ *
+ * The sum is formed in npy_uint: signed overflow is undefined behaviour
+ * in C, while unsigned arithmetic wraps.  Converting the result back to
+ * npy_int is implementation-defined; this assumes the usual
+ * two's-complement wrap (NOTE(review): confirm for all target compilers).
+ *
+ * Returns NPY_FPE_OVERFLOW when the stored sum has a different sign from
+ * both operands, otherwise 0.
+ */
+static inline int
+int_ctype_add(npy_int a, npy_int b, npy_int *out) {
+    *out = (npy_int)((npy_uint)a + (npy_uint)b);
+    if ((*out^a) >= 0 || (*out^b) >= 0) {
+        return 0;
+    }
+    return NPY_FPE_OVERFLOW;
+}
+
+/*
+ * Subtract b from a (npy_int) and store the wrapped difference in *out.
+ *
+ * The difference is formed in npy_uint: signed overflow is undefined
+ * behaviour in C, while unsigned arithmetic wraps.  Converting the result
+ * back to npy_int is implementation-defined; this assumes the usual
+ * two's-complement wrap (NOTE(review): confirm for all target compilers).
+ *
+ * Returns NPY_FPE_OVERFLOW when the stored difference has a different
+ * sign from a and the same sign as b, otherwise 0.
+ */
+static inline int
+int_ctype_subtract(npy_int a, npy_int b, npy_int *out) {
+    *out = (npy_int)((npy_uint)a - (npy_uint)b);
+    if ((*out^a) >= 0 || (*out^~b) >= 0) {
+        return 0;
+    }
+    return NPY_FPE_OVERFLOW;
+}
+
+#line 60
+/*
+ * Add two npy_long values and store the wrapped sum in *out.
+ *
+ * The sum is formed in npy_ulong: signed overflow is undefined behaviour
+ * in C, while unsigned arithmetic wraps.  Converting the result back to
+ * npy_long is implementation-defined; this assumes the usual
+ * two's-complement wrap (NOTE(review): confirm for all target compilers).
+ *
+ * Returns NPY_FPE_OVERFLOW when the stored sum has a different sign from
+ * both operands, otherwise 0.
+ */
+static inline int
+long_ctype_add(npy_long a, npy_long b, npy_long *out) {
+    *out = (npy_long)((npy_ulong)a + (npy_ulong)b);
+    if ((*out^a) >= 0 || (*out^b) >= 0) {
+        return 0;
+    }
+    return NPY_FPE_OVERFLOW;
+}
+
+/*
+ * Subtract b from a (npy_long) and store the wrapped difference in *out.
+ *
+ * The difference is formed in npy_ulong: signed overflow is undefined
+ * behaviour in C, while unsigned arithmetic wraps.  Converting the result
+ * back to npy_long is implementation-defined; this assumes the usual
+ * two's-complement wrap (NOTE(review): confirm for all target compilers).
+ *
+ * Returns NPY_FPE_OVERFLOW when the stored difference has a different
+ * sign from a and the same sign as b, otherwise 0.
+ */
+static inline int
+long_ctype_subtract(npy_long a, npy_long b, npy_long *out) {
+    *out = (npy_long)((npy_ulong)a - (npy_ulong)b);
+    if ((*out^a) >= 0 || (*out^~b) >= 0) {
+        return 0;
+    }
+    return NPY_FPE_OVERFLOW;
+}
+
+#line 60
+/*
+ * Add two npy_longlong values and store the wrapped sum in *out.
+ *
+ * The sum is formed in npy_ulonglong: signed overflow is undefined
+ * behaviour in C, while unsigned arithmetic wraps.  Converting the result
+ * back to npy_longlong is implementation-defined; this assumes the usual
+ * two's-complement wrap (NOTE(review): confirm for all target compilers).
+ *
+ * Returns NPY_FPE_OVERFLOW when the stored sum has a different sign from
+ * both operands, otherwise 0.
+ */
+static inline int
+longlong_ctype_add(npy_longlong a, npy_longlong b, npy_longlong *out) {
+    *out = (npy_longlong)((npy_ulonglong)a + (npy_ulonglong)b);
+    if ((*out^a) >= 0 || (*out^b) >= 0) {
+        return 0;
+    }
+    return NPY_FPE_OVERFLOW;
+}
+
+/*
+ * Subtract b from a (npy_longlong) and store the wrapped difference in
+ * *out.
+ *
+ * The difference is formed in npy_ulonglong: signed overflow is undefined
+ * behaviour in C, while unsigned arithmetic wraps.  Converting the result
+ * back to npy_longlong is implementation-defined; this assumes the usual
+ * two's-complement wrap (NOTE(review): confirm for all target compilers).
+ *
+ * Returns NPY_FPE_OVERFLOW when the stored difference has a different
+ * sign from a and the same sign as b, otherwise 0.
+ */
+static inline int
+longlong_ctype_subtract(npy_longlong a, npy_longlong b, npy_longlong *out) {
+    *out = (npy_longlong)((npy_ulonglong)a - (npy_ulonglong)b);
+    if ((*out^a) >= 0 || (*out^~b) >= 0) {
+        return 0;
+    }
+    return NPY_FPE_OVERFLOW;
+}
+
+
+#line 83
+/*
+ * Add two npy_ubyte values and store the sum in *out.
+ * Returns NPY_FPE_OVERFLOW when the stored sum is smaller than either
+ * operand, otherwise 0.
+ */
+static inline int
+ubyte_ctype_add(npy_ubyte a, npy_ubyte b, npy_ubyte *out) {
+    *out = a + b;
+    if (*out < a || *out < b) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+
+/*
+ * Subtract b from a (npy_ubyte) and store the difference in *out.
+ * Returns NPY_FPE_OVERFLOW when a < b, otherwise 0.
+ */
+static inline int
+ubyte_ctype_subtract(npy_ubyte a, npy_ubyte b, npy_ubyte *out) {
+    *out = a - b;
+    return (a < b) ? NPY_FPE_OVERFLOW : 0;
+}
+
+#line 83
+/*
+ * Add two npy_ushort values and store the sum in *out.
+ * Returns NPY_FPE_OVERFLOW when the stored sum is smaller than either
+ * operand, otherwise 0.
+ */
+static inline int
+ushort_ctype_add(npy_ushort a, npy_ushort b, npy_ushort *out) {
+    *out = a + b;
+    if (*out < a || *out < b) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+
+/*
+ * Subtract b from a (npy_ushort) and store the difference in *out.
+ * Returns NPY_FPE_OVERFLOW when a < b, otherwise 0.
+ */
+static inline int
+ushort_ctype_subtract(npy_ushort a, npy_ushort b, npy_ushort *out) {
+    *out = a - b;
+    return (a < b) ? NPY_FPE_OVERFLOW : 0;
+}
+
+#line 83
+/*
+ * Add two npy_uint values and store the sum in *out.
+ * Returns NPY_FPE_OVERFLOW when the stored sum is smaller than either
+ * operand, otherwise 0.
+ */
+static inline int
+uint_ctype_add(npy_uint a, npy_uint b, npy_uint *out) {
+    *out = a + b;
+    if (*out < a || *out < b) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+
+/*
+ * Subtract b from a (npy_uint) and store the difference in *out.
+ * Returns NPY_FPE_OVERFLOW when a < b, otherwise 0.
+ */
+static inline int
+uint_ctype_subtract(npy_uint a, npy_uint b, npy_uint *out) {
+    *out = a - b;
+    return (a < b) ? NPY_FPE_OVERFLOW : 0;
+}
+
+#line 83
+/*
+ * Add two npy_ulong values and store the sum in *out.
+ * Returns NPY_FPE_OVERFLOW when the stored sum is smaller than either
+ * operand, otherwise 0.
+ */
+static inline int
+ulong_ctype_add(npy_ulong a, npy_ulong b, npy_ulong *out) {
+    *out = a + b;
+    if (*out < a || *out < b) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+
+/*
+ * Subtract b from a (npy_ulong) and store the difference in *out.
+ * Returns NPY_FPE_OVERFLOW when a < b, otherwise 0.
+ */
+static inline int
+ulong_ctype_subtract(npy_ulong a, npy_ulong b, npy_ulong *out) {
+    *out = a - b;
+    return (a < b) ? NPY_FPE_OVERFLOW : 0;
+}
+
+#line 83
+/*
+ * Add two npy_ulonglong values and store the sum in *out.
+ * Returns NPY_FPE_OVERFLOW when the stored sum is smaller than either
+ * operand, otherwise 0.
+ */
+static inline int
+ulonglong_ctype_add(npy_ulonglong a, npy_ulonglong b, npy_ulonglong *out) {
+    *out = a + b;
+    if (*out < a || *out < b) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+
+/*
+ * Subtract b from a (npy_ulonglong) and store the difference in *out.
+ * Returns NPY_FPE_OVERFLOW when a < b, otherwise 0.
+ */
+static inline int
+ulonglong_ctype_subtract(npy_ulonglong a, npy_ulonglong b, npy_ulonglong *out) {
+    *out = a - b;
+    return (a < b) ? NPY_FPE_OVERFLOW : 0;
+}
+
+
+#ifndef NPY_SIZEOF_BYTE
+#define NPY_SIZEOF_BYTE 1
+#endif
+
+#line 120
+#if NPY_SIZEOF_INT > NPY_SIZEOF_BYTE
+/*
+ * Multiply two npy_byte values in npy_int and store the narrowed product
+ * in *out.  Returns NPY_FPE_OVERFLOW when the wide product lies outside
+ * [NPY_MIN_BYTE, NPY_MAX_BYTE], otherwise 0.
+ */
+static inline int
+byte_ctype_multiply(npy_byte a, npy_byte b, npy_byte *out) {
+    const npy_int wide = ((npy_int) a) * ((npy_int) b);
+
+    *out = (npy_byte) wide;
+    if (wide > NPY_MAX_BYTE || wide < NPY_MIN_BYTE) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+#endif
+
+#line 120
+#if NPY_SIZEOF_INT > NPY_SIZEOF_BYTE
+/*
+ * Multiply two npy_ubyte values in npy_uint and store the narrowed
+ * product in *out.  Returns NPY_FPE_OVERFLOW when the wide product
+ * exceeds NPY_MAX_UBYTE, otherwise 0.
+ */
+static inline int
+ubyte_ctype_multiply(npy_ubyte a, npy_ubyte b, npy_ubyte *out) {
+    const npy_uint wide = ((npy_uint) a) * ((npy_uint) b);
+
+    *out = (npy_ubyte) wide;
+    if (wide > NPY_MAX_UBYTE) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+#endif
+
+#line 120
+#if NPY_SIZEOF_INT > NPY_SIZEOF_SHORT
+/*
+ * Multiply two npy_short values in npy_int and store the narrowed product
+ * in *out.  Returns NPY_FPE_OVERFLOW when the wide product lies outside
+ * [NPY_MIN_SHORT, NPY_MAX_SHORT], otherwise 0.
+ */
+static inline int
+short_ctype_multiply(npy_short a, npy_short b, npy_short *out) {
+    const npy_int wide = ((npy_int) a) * ((npy_int) b);
+
+    *out = (npy_short) wide;
+    if (wide > NPY_MAX_SHORT || wide < NPY_MIN_SHORT) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+#endif
+
+#line 120
+#if NPY_SIZEOF_INT > NPY_SIZEOF_SHORT
+/*
+ * Multiply two npy_ushort values in npy_uint and store the narrowed
+ * product in *out.  Returns NPY_FPE_OVERFLOW when the wide product
+ * exceeds NPY_MAX_USHORT, otherwise 0.
+ */
+static inline int
+ushort_ctype_multiply(npy_ushort a, npy_ushort b, npy_ushort *out) {
+    const npy_uint wide = ((npy_uint) a) * ((npy_uint) b);
+
+    *out = (npy_ushort) wide;
+    if (wide > NPY_MAX_USHORT) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+#endif
+
+#line 120
+#if NPY_SIZEOF_LONGLONG > NPY_SIZEOF_INT
+/*
+ * Multiply two npy_int values in npy_longlong and store the narrowed
+ * product in *out.  Returns NPY_FPE_OVERFLOW when the wide product lies
+ * outside [NPY_MIN_INT, NPY_MAX_INT], otherwise 0.
+ */
+static inline int
+int_ctype_multiply(npy_int a, npy_int b, npy_int *out) {
+    const npy_longlong wide = ((npy_longlong) a) * ((npy_longlong) b);
+
+    *out = (npy_int) wide;
+    if (wide > NPY_MAX_INT || wide < NPY_MIN_INT) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+#endif
+
+#line 120
+#if NPY_SIZEOF_LONGLONG > NPY_SIZEOF_INT
+/*
+ * Multiply two npy_uint values in npy_ulonglong and store the narrowed
+ * product in *out.  Returns NPY_FPE_OVERFLOW when the wide product
+ * exceeds NPY_MAX_UINT, otherwise 0.
+ */
+static inline int
+uint_ctype_multiply(npy_uint a, npy_uint b, npy_uint *out) {
+    const npy_ulonglong wide = ((npy_ulonglong) a) * ((npy_ulonglong) b);
+
+    *out = (npy_uint) wide;
+    if (wide > NPY_MAX_UINT) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+#endif
+
+#line 120
+#if NPY_SIZEOF_LONGLONG > NPY_SIZEOF_LONG
+/*
+ * Multiply two npy_long values in npy_longlong and store the narrowed
+ * product in *out.  Returns NPY_FPE_OVERFLOW when the wide product lies
+ * outside [NPY_MIN_LONG, NPY_MAX_LONG], otherwise 0.
+ */
+static inline int
+long_ctype_multiply(npy_long a, npy_long b, npy_long *out) {
+    const npy_longlong wide = ((npy_longlong) a) * ((npy_longlong) b);
+
+    *out = (npy_long) wide;
+    if (wide > NPY_MAX_LONG || wide < NPY_MIN_LONG) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+#endif
+
+#line 120
+#if NPY_SIZEOF_LONGLONG > NPY_SIZEOF_LONG
+/*
+ * Multiply two npy_ulong values in npy_ulonglong and store the narrowed
+ * product in *out.  Returns NPY_FPE_OVERFLOW when the wide product
+ * exceeds NPY_MAX_ULONG, otherwise 0.
+ */
+static inline int
+ulong_ctype_multiply(npy_ulong a, npy_ulong b, npy_ulong *out) {
+    const npy_ulonglong wide = ((npy_ulonglong) a) * ((npy_ulonglong) b);
+
+    *out = (npy_ulong) wide;
+    if (wide > NPY_MAX_ULONG) {
+        return NPY_FPE_OVERFLOW;
+    }
+    return 0;
+}
+#endif
+
+
+#line 146
+#if NPY_SIZEOF_LONGLONG == NPY_SIZEOF_INT
+/*
+ * Multiply two npy_int values when no wider integer type is available.
+ * Delegates to npy_mul_with_overflow_int and returns NPY_FPE_OVERFLOW
+ * when that helper returns nonzero, otherwise 0.
+ */
+static inline int
+int_ctype_multiply(npy_int a, npy_int b, npy_int *out) {
+    return npy_mul_with_overflow_int(out, a, b) ? NPY_FPE_OVERFLOW : 0;
+}
+#endif
+
+#line 146
+#if NPY_SIZEOF_LONGLONG == NPY_SIZEOF_INT
+/*
+ * Multiply two npy_uint values when no wider integer type is available.
+ * Delegates to npy_mul_with_overflow_uint and returns NPY_FPE_OVERFLOW
+ * when that helper returns nonzero, otherwise 0.
+ */
+static inline int
+uint_ctype_multiply(npy_uint a, npy_uint b, npy_uint *out) {
+    return npy_mul_with_overflow_uint(out, a, b) ? NPY_FPE_OVERFLOW : 0;
+}
+#endif
+
+#line 146
+#if NPY_SIZEOF_LONGLONG == NPY_SIZEOF_LONG
+/*
+ * Multiply two npy_long values when no wider integer type is available.
+ * Delegates to npy_mul_with_overflow_long and returns NPY_FPE_OVERFLOW
+ * when that helper returns nonzero, otherwise 0.
+ */
+static inline int
+long_ctype_multiply(npy_long a, npy_long b, npy_long *out) {
+    return npy_mul_with_overflow_long(out, a, b) ? NPY_FPE_OVERFLOW : 0;
+}
+#endif
+
+#line 146
+#if NPY_SIZEOF_LONGLONG == NPY_SIZEOF_LONG
+/*
+ * Multiply two npy_ulong values when no wider integer type is available.
+ * Delegates to npy_mul_with_overflow_ulong and returns NPY_FPE_OVERFLOW
+ * when that helper returns nonzero, otherwise 0.
+ */
+static inline int
+ulong_ctype_multiply(npy_ulong a, npy_ulong b, npy_ulong *out) {
+    return npy_mul_with_overflow_ulong(out, a, b) ? NPY_FPE_OVERFLOW : 0;
+}
+#endif
+
+#line 146
+#if NPY_SIZEOF_LONGLONG == NPY_SIZEOF_LONGLONG
+/*
+ * Multiply two npy_longlong values.  Delegates to
+ * npy_mul_with_overflow_longlong and returns NPY_FPE_OVERFLOW when that
+ * helper returns nonzero, otherwise 0.
+ */
+static inline int
+longlong_ctype_multiply(npy_longlong a, npy_longlong b, npy_longlong *out) {
+    return npy_mul_with_overflow_longlong(out, a, b) ? NPY_FPE_OVERFLOW : 0;
+}
+#endif
+
+#line 146
+#if NPY_SIZEOF_LONGLONG == NPY_SIZEOF_LONGLONG
+/*
+ * Multiply two npy_ulonglong values.  Delegates to
+ * npy_mul_with_overflow_ulonglong and returns NPY_FPE_OVERFLOW when that
+ * helper returns nonzero, otherwise 0.
+ */
+static inline int
+ulonglong_ctype_multiply(npy_ulonglong a, npy_ulonglong b, npy_ulonglong *out) {
+    return npy_mul_with_overflow_ulonglong(out, a, b) ? NPY_FPE_OVERFLOW : 0;
+}
+#endif
+
+
+#line 167
+
+#if 1
+    #define DIVIDEBYZERO_CHECK (b == 0 || (a == NPY_MIN_BYTE && b == -1))
+#else
+    #define DIVIDEBYZERO_CHECK (b == 0)
+#endif
+
+/*
+ * Floor division of npy_byte values; the quotient is stored in *out.
+ *   b == 0                    -> *out = 0, returns NPY_FPE_DIVIDEBYZERO
+ *   a == NPY_MIN_BYTE, b == -1 -> *out = NPY_MIN_BYTE, returns NPY_FPE_OVERFLOW
+ *   otherwise                 -> returns 0
+ */
+static inline int
+byte_ctype_divide(npy_byte a, npy_byte b, npy_byte *out) {
+    npy_byte quot;
+
+    if (b == 0) {
+        *out = 0;
+        return NPY_FPE_DIVIDEBYZERO;
+    }
+    if (b == -1 && a == NPY_MIN_BYTE) {
+        *out = NPY_MIN_BYTE;
+        return NPY_FPE_OVERFLOW;
+    }
+
+    quot = a / b;
+    /* C truncates; step down when the division is inexact and exactly
+       one operand is positive. */
+    if ((a % b != 0) && ((a > 0) != (b > 0))) {
+        quot--;
+    }
+    *out = quot;
+    return 0;
+}
+
+#define byte_ctype_floor_divide byte_ctype_divide
+
+/*
+ * Remainder of npy_byte values, adjusted to follow Python's convention;
+ * the result is stored in *out.  When DIVIDEBYZERO_CHECK holds, *out is
+ * set to 0 and NPY_FPE_DIVIDEBYZERO is returned only if b == 0.
+ * Returns 0 in every other case.
+ */
+static inline int
+byte_ctype_remainder(npy_byte a, npy_byte b, npy_byte *out) {
+    npy_byte rem;
+
+    if (DIVIDEBYZERO_CHECK) {
+        *out = 0;
+        return (b == 0) ? NPY_FPE_DIVIDEBYZERO : 0;
+    }
+
+    rem = a % b;
+    /* handled like Python does: shift a nonzero remainder by b when
+       exactly one operand is positive */
+    if (((a > 0) != (b > 0)) && rem) {
+        rem += b;
+    }
+    *out = rem;
+    return 0;
+}
+#undef DIVIDEBYZERO_CHECK
+
+#line 167
+
+#if 0
+    #define DIVIDEBYZERO_CHECK (b == 0 || (a == NPY_MIN_UBYTE && b == -1))
+#else
+    #define DIVIDEBYZERO_CHECK (b == 0)
+#endif
+
+/*
+ * Division of npy_ubyte values; the quotient is stored in *out.
+ * For b == 0, *out is set to 0 and NPY_FPE_DIVIDEBYZERO is returned;
+ * otherwise returns 0.
+ */
+static inline int
+ubyte_ctype_divide(npy_ubyte a, npy_ubyte b, npy_ubyte *out) {
+    if (b != 0) {
+        *out = a / b;
+        return 0;
+    }
+    *out = 0;
+    return NPY_FPE_DIVIDEBYZERO;
+}
+
+#define ubyte_ctype_floor_divide ubyte_ctype_divide
+
+/*
+ * Remainder of npy_ubyte values; the result is stored in *out.
+ * When DIVIDEBYZERO_CHECK holds, *out is set to 0 and
+ * NPY_FPE_DIVIDEBYZERO is returned only if b == 0.  Returns 0 otherwise.
+ */
+static inline int
+ubyte_ctype_remainder(npy_ubyte a, npy_ubyte b, npy_ubyte *out) {
+    if (!(DIVIDEBYZERO_CHECK)) {
+        *out = a % b;
+        return 0;
+    }
+    *out = 0;
+    return (b == 0) ? NPY_FPE_DIVIDEBYZERO : 0;
+}
+#undef DIVIDEBYZERO_CHECK
+
+#line 167
+
+#if 1
+    #define DIVIDEBYZERO_CHECK (b == 0 || (a == NPY_MIN_SHORT && b == -1))
+#else
+    #define DIVIDEBYZERO_CHECK (b == 0)
+#endif
+
+/*
+ * Floor division of npy_short values; the quotient is stored in *out.
+ *   b == 0                     -> *out = 0, returns NPY_FPE_DIVIDEBYZERO
+ *   a == NPY_MIN_SHORT, b == -1 -> *out = NPY_MIN_SHORT, returns NPY_FPE_OVERFLOW
+ *   otherwise                  -> returns 0
+ */
+static inline int
+short_ctype_divide(npy_short a, npy_short b, npy_short *out) {
+    npy_short quot;
+
+    if (b == 0) {
+        *out = 0;
+        return NPY_FPE_DIVIDEBYZERO;
+    }
+    if (b == -1 && a == NPY_MIN_SHORT) {
+        *out = NPY_MIN_SHORT;
+        return NPY_FPE_OVERFLOW;
+    }
+
+    quot = a / b;
+    /* C truncates; step down when the division is inexact and exactly
+       one operand is positive. */
+    if ((a % b != 0) && ((a > 0) != (b > 0))) {
+        quot--;
+    }
+    *out = quot;
+    return 0;
+}
+
+#define short_ctype_floor_divide short_ctype_divide
+
+/*
+ * Remainder of npy_short values, adjusted to follow Python's convention;
+ * the result is stored in *out.  When DIVIDEBYZERO_CHECK holds, *out is
+ * set to 0 and NPY_FPE_DIVIDEBYZERO is returned only if b == 0.
+ * Returns 0 in every other case.
+ */
+static inline int
+short_ctype_remainder(npy_short a, npy_short b, npy_short *out) {
+    npy_short rem;
+
+    if (DIVIDEBYZERO_CHECK) {
+        *out = 0;
+        return (b == 0) ? NPY_FPE_DIVIDEBYZERO : 0;
+    }
+
+    rem = a % b;
+    /* handled like Python does: shift a nonzero remainder by b when
+       exactly one operand is positive */
+    if (((a > 0) != (b > 0)) && rem) {
+        rem += b;
+    }
+    *out = rem;
+    return 0;
+}
+#undef DIVIDEBYZERO_CHECK
+
+#line 167
+
+#if 0
+    #define DIVIDEBYZERO_CHECK (b == 0 || (a == NPY_MIN_USHORT && b == -1))
+#else
+    #define DIVIDEBYZERO_CHECK (b == 0)
+#endif
+
+/*
+ * Division of npy_ushort values; the quotient is stored in *out.
+ * For b == 0, *out is set to 0 and NPY_FPE_DIVIDEBYZERO is returned;
+ * otherwise returns 0.
+ */
+static inline int
+ushort_ctype_divide(npy_ushort a, npy_ushort b, npy_ushort *out) {
+    if (b != 0) {
+        *out = a / b;
+        return 0;
+    }
+    *out = 0;
+    return NPY_FPE_DIVIDEBYZERO;
+}
+
+#define ushort_ctype_floor_divide ushort_ctype_divide
+
+/*
+ * Remainder of npy_ushort values; the result is stored in *out.
+ * When DIVIDEBYZERO_CHECK holds, *out is set to 0 and
+ * NPY_FPE_DIVIDEBYZERO is returned only if b == 0.  Returns 0 otherwise.
+ */
+static inline int
+ushort_ctype_remainder(npy_ushort a, npy_ushort b, npy_ushort *out) {
+    if (!(DIVIDEBYZERO_CHECK)) {
+        *out = a % b;
+        return 0;
+    }
+    *out = 0;
+    return (b == 0) ? NPY_FPE_DIVIDEBYZERO : 0;
+}
+#undef DIVIDEBYZERO_CHECK
+
+#line 167
+
+#if 1
+    #define DIVIDEBYZERO_CHECK (b == 0 || (a == NPY_MIN_INT && b == -1))
+#else
+    #define DIVIDEBYZERO_CHECK (b == 0)
+#endif
+
+/*
+ * Floor division of npy_int values; the quotient is stored in *out.
+ *   b == 0                   -> *out = 0, returns NPY_FPE_DIVIDEBYZERO
+ *   a == NPY_MIN_INT, b == -1 -> *out = NPY_MIN_INT, returns NPY_FPE_OVERFLOW
+ *   otherwise                -> returns 0
+ */
+static inline int
+int_ctype_divide(npy_int a, npy_int b, npy_int *out) {
+    npy_int quot;
+
+    if (b == 0) {
+        *out = 0;
+        return NPY_FPE_DIVIDEBYZERO;
+    }
+    if (b == -1 && a == NPY_MIN_INT) {
+        *out = NPY_MIN_INT;
+        return NPY_FPE_OVERFLOW;
+    }
+
+    quot = a / b;
+    /* C truncates; step down when the division is inexact and exactly
+       one operand is positive. */
+    if ((a % b != 0) && ((a > 0) != (b > 0))) {
+        quot--;
+    }
+    *out = quot;
+    return 0;
+}
+
+#define int_ctype_floor_divide int_ctype_divide
+
+/*
+ * Remainder of npy_int values, adjusted to follow Python's convention;
+ * the result is stored in *out.  When DIVIDEBYZERO_CHECK holds, *out is
+ * set to 0 and NPY_FPE_DIVIDEBYZERO is returned only if b == 0.
+ * Returns 0 in every other case.
+ */
+static inline int
+int_ctype_remainder(npy_int a, npy_int b, npy_int *out) {
+    npy_int rem;
+
+    if (DIVIDEBYZERO_CHECK) {
+        *out = 0;
+        return (b == 0) ? NPY_FPE_DIVIDEBYZERO : 0;
+    }
+
+    rem = a % b;
+    /* handled like Python does: shift a nonzero remainder by b when
+       exactly one operand is positive */
+    if (((a > 0) != (b > 0)) && rem) {
+        rem += b;
+    }
+    *out = rem;
+    return 0;
+}
+#undef DIVIDEBYZERO_CHECK
+
+#line 167
+
+#if 0
+    #define DIVIDEBYZERO_CHECK (b == 0 || (a == NPY_MIN_UINT && b == -1))
+#else
+    #define DIVIDEBYZERO_CHECK (b == 0)
+#endif
+
+/*
+ * Division of npy_uint values; the quotient is stored in *out.
+ * For b == 0, *out is set to 0 and NPY_FPE_DIVIDEBYZERO is returned;
+ * otherwise returns 0.
+ */
+static inline int
+uint_ctype_divide(npy_uint a, npy_uint b, npy_uint *out) {
+    if (b != 0) {
+        *out = a / b;
+        return 0;
+    }
+    *out = 0;
+    return NPY_FPE_DIVIDEBYZERO;
+}
+
+#define uint_ctype_floor_divide uint_ctype_divide
+
+/*
+ * Remainder of npy_uint values; the result is stored in *out.
+ * When DIVIDEBYZERO_CHECK holds, *out is set to 0 and
+ * NPY_FPE_DIVIDEBYZERO is returned only if b == 0.  Returns 0 otherwise.
+ */
+static inline int
+uint_ctype_remainder(npy_uint a, npy_uint b, npy_uint *out) {
+    if (!(DIVIDEBYZERO_CHECK)) {
+        *out = a % b;
+        return 0;
+    }
+    *out = 0;
+    return (b == 0) ? NPY_FPE_DIVIDEBYZERO : 0;
+}
+#undef DIVIDEBYZERO_CHECK
+
+#line 167
+
+#if 1
+    #define DIVIDEBYZERO_CHECK (b == 0 || (a == NPY_MIN_LONG && b == -1))
+#else
+    #define DIVIDEBYZERO_CHECK (b == 0)
+#endif
+
+/*
+ * Floor division of npy_long values; the quotient is stored in *out.
+ *   b == 0                    -> *out = 0, returns NPY_FPE_DIVIDEBYZERO
+ *   a == NPY_MIN_LONG, b == -1 -> *out = NPY_MIN_LONG, returns NPY_FPE_OVERFLOW
+ *   otherwise                 -> returns 0
+ */
+static inline int
+long_ctype_divide(npy_long a, npy_long b, npy_long *out) {
+    npy_long quot;
+
+    if (b == 0) {
+        *out = 0;
+        return NPY_FPE_DIVIDEBYZERO;
+    }
+    if (b == -1 && a == NPY_MIN_LONG) {
+        *out = NPY_MIN_LONG;
+        return NPY_FPE_OVERFLOW;
+    }
+
+    quot = a / b;
+    /* C truncates; step down when the division is inexact and exactly
+       one operand is positive. */
+    if ((a % b != 0) && ((a > 0) != (b > 0))) {
+        quot--;
+    }
+    *out = quot;
+    return 0;
+}
+
+#define long_ctype_floor_divide long_ctype_divide
+
+/*
+ * Remainder of npy_long values, adjusted to follow Python's convention;
+ * the result is stored in *out.  When DIVIDEBYZERO_CHECK holds, *out is
+ * set to 0 and NPY_FPE_DIVIDEBYZERO is returned only if b == 0.
+ * Returns 0 in every other case.
+ */
+static inline int
+long_ctype_remainder(npy_long a, npy_long b, npy_long *out) {
+    npy_long rem;
+
+    if (DIVIDEBYZERO_CHECK) {
+        *out = 0;
+        return (b == 0) ? NPY_FPE_DIVIDEBYZERO : 0;
+    }
+
+    rem = a % b;
+    /* handled like Python does: shift a nonzero remainder by b when
+       exactly one operand is positive */
+    if (((a > 0) != (b > 0)) && rem) {
+        rem += b;
+    }
+    *out = rem;
+    return 0;
+}
+#undef DIVIDEBYZERO_CHECK
+
+#line 167
+
+#if 0
+    #define DIVIDEBYZERO_CHECK (b == 0 || (a == NPY_MIN_ULONG && b == -1))
+#else
+    #define DIVIDEBYZERO_CHECK (b == 0)
+#endif
+
+/*
+ * Division of npy_ulong values; the quotient is stored in *out.
+ * For b == 0, *out is set to 0 and NPY_FPE_DIVIDEBYZERO is returned;
+ * otherwise returns 0.
+ */
+static inline int
+ulong_ctype_divide(npy_ulong a, npy_ulong b, npy_ulong *out) {
+    if (b != 0) {
+        *out = a / b;
+        return 0;
+    }
+    *out = 0;
+    return NPY_FPE_DIVIDEBYZERO;
+}
+
+#define ulong_ctype_floor_divide ulong_ctype_divide
+
+/*
+ * Remainder of npy_ulong values; the result is stored in *out.
+ * When DIVIDEBYZERO_CHECK holds, *out is set to 0 and
+ * NPY_FPE_DIVIDEBYZERO is returned only if b == 0.  Returns 0 otherwise.
+ */
+static inline int
+ulong_ctype_remainder(npy_ulong a, npy_ulong b, npy_ulong *out) {
+    if (!(DIVIDEBYZERO_CHECK)) {
+        *out = a % b;
+        return 0;
+    }
+    *out = 0;
+    return (b == 0) ? NPY_FPE_DIVIDEBYZERO : 0;
+}
+#undef DIVIDEBYZERO_CHECK
+
+#line 167
+
+#if 1
+    #define DIVIDEBYZERO_CHECK (b == 0 || (a == NPY_MIN_LONGLONG && b == -1))
+#else
+    #define DIVIDEBYZERO_CHECK (b == 0)
+#endif
+
+/*
+ * Floor division of npy_longlong values; the quotient is stored in *out.
+ *   b == 0                        -> *out = 0, returns NPY_FPE_DIVIDEBYZERO
+ *   a == NPY_MIN_LONGLONG, b == -1 -> *out = NPY_MIN_LONGLONG,
+ *                                    returns NPY_FPE_OVERFLOW
+ *   otherwise                     -> returns 0
+ */
+static inline int
+longlong_ctype_divide(npy_longlong a, npy_longlong b, npy_longlong *out) {
+    npy_longlong quot;
+
+    if (b == 0) {
+        *out = 0;
+        return NPY_FPE_DIVIDEBYZERO;
+    }
+    if (b == -1 && a == NPY_MIN_LONGLONG) {
+        *out = NPY_MIN_LONGLONG;
+        return NPY_FPE_OVERFLOW;
+    }
+
+    quot = a / b;
+    /* C truncates; step down when the division is inexact and exactly
+       one operand is positive. */
+    if ((a % b != 0) && ((a > 0) != (b > 0))) {
+        quot--;
+    }
+    *out = quot;
+    return 0;
+}
+
+#define longlong_ctype_floor_divide longlong_ctype_divide
+
+/*
+ * Remainder of npy_longlong values, adjusted to follow Python's
+ * convention; the result is stored in *out.  When DIVIDEBYZERO_CHECK
+ * holds, *out is set to 0 and NPY_FPE_DIVIDEBYZERO is returned only if
+ * b == 0.  Returns 0 in every other case.
+ */
+static inline int
+longlong_ctype_remainder(npy_longlong a, npy_longlong b, npy_longlong *out) {
+    npy_longlong rem;
+
+    if (DIVIDEBYZERO_CHECK) {
+        *out = 0;
+        return (b == 0) ? NPY_FPE_DIVIDEBYZERO : 0;
+    }
+
+    rem = a % b;
+    /* handled like Python does: shift a nonzero remainder by b when
+       exactly one operand is positive */
+    if (((a > 0) != (b > 0)) && rem) {
+        rem += b;
+    }
+    *out = rem;
+    return 0;
+}
+#undef DIVIDEBYZERO_CHECK
+
+#line 167
+
+#if 0
+    #define DIVIDEBYZERO_CHECK (b == 0 || (a == NPY_MIN_ULONGLONG && b == -1))
+#else
+    #define DIVIDEBYZERO_CHECK (b == 0)
+#endif
+
+/*
+ * Division of npy_ulonglong values; the quotient is stored in *out.
+ * For b == 0, *out is set to 0 and NPY_FPE_DIVIDEBYZERO is returned;
+ * otherwise returns 0.
+ */
+static inline int
+ulonglong_ctype_divide(npy_ulonglong a, npy_ulonglong b, npy_ulonglong *out) {
+    if (b != 0) {
+        *out = a / b;
+        return 0;
+    }
+    *out = 0;
+    return NPY_FPE_DIVIDEBYZERO;
+}
+
+#define ulonglong_ctype_floor_divide ulonglong_ctype_divide
+
+/*
+ * Remainder of npy_ulonglong values; the result is stored in *out.
+ * When DIVIDEBYZERO_CHECK holds, *out is set to 0 and
+ * NPY_FPE_DIVIDEBYZERO is returned only if b == 0.  Returns 0 otherwise.
+ */
+static inline int
+ulonglong_ctype_remainder(npy_ulonglong a, npy_ulonglong b, npy_ulonglong *out) {
+    if (!(DIVIDEBYZERO_CHECK)) {
+        *out = a % b;
+        return 0;
+    }
+    *out = 0;
+    return (b == 0) ? NPY_FPE_DIVIDEBYZERO : 0;
+}
+#undef DIVIDEBYZERO_CHECK
+
+
+#line 234
+
+/*
+ * True division: both npy_byte operands are converted to npy_double and
+ * the quotient is stored in *out.  No zero check is made; always
+ * returns 0.
+ */
+static inline int
+byte_ctype_true_divide(npy_byte a, npy_byte b, npy_double *out)
+{
+    const npy_double num = (npy_double)a;
+    const npy_double den = (npy_double)b;
+
+    *out = num / den;
+    return 0;
+}
+
+
+#line 234
+
+/*
+ * True division: both npy_ubyte operands are converted to npy_double and
+ * the quotient is stored in *out.  No zero check is made; always
+ * returns 0.
+ */
+static inline int
+ubyte_ctype_true_divide(npy_ubyte a, npy_ubyte b, npy_double *out)
+{
+    const npy_double num = (npy_double)a;
+    const npy_double den = (npy_double)b;
+
+    *out = num / den;
+    return 0;
+}
+
+
+#line 234
+
+/*
+ * True division: both npy_short operands are converted to npy_double and
+ * the quotient is stored in *out.  No zero check is made; always
+ * returns 0.
+ */
+static inline int
+short_ctype_true_divide(npy_short a, npy_short b, npy_double *out)
+{
+    const npy_double num = (npy_double)a;
+    const npy_double den = (npy_double)b;
+
+    *out = num / den;
+    return 0;
+}
+
+
+#line 234
+
+/*
+ * True division: both npy_ushort operands are converted to npy_double
+ * and the quotient is stored in *out.  No zero check is made; always
+ * returns 0.
+ */
+static inline int
+ushort_ctype_true_divide(npy_ushort a, npy_ushort b, npy_double *out)
+{
+    const npy_double num = (npy_double)a;
+    const npy_double den = (npy_double)b;
+
+    *out = num / den;
+    return 0;
+}
+
+
+#line 234
+
+/*
+ * True division: both npy_int operands are converted to npy_double and
+ * the quotient is stored in *out.  No zero check is made; always
+ * returns 0.
+ */
+static inline int
+int_ctype_true_divide(npy_int a, npy_int b, npy_double *out)
+{
+    const npy_double num = (npy_double)a;
+    const npy_double den = (npy_double)b;
+
+    *out = num / den;
+    return 0;
+}
+
+
+#line 234
+
+/*
+ * True division: both npy_uint operands are converted to npy_double and
+ * the quotient is stored in *out.  No zero check is made; always
+ * returns 0.
+ */
+static inline int
+uint_ctype_true_divide(npy_uint a, npy_uint b, npy_double *out)
+{
+    const npy_double num = (npy_double)a;
+    const npy_double den = (npy_double)b;
+
+    *out = num / den;
+    return 0;
+}
+
+
+#line 234
+
+/*
+ * True division: both npy_long operands are converted to npy_double and
+ * the quotient is stored in *out.  No zero check is made; always
+ * returns 0.
+ */
+static inline int
+long_ctype_true_divide(npy_long a, npy_long b, npy_double *out)
+{
+    const npy_double num = (npy_double)a;
+    const npy_double den = (npy_double)b;
+
+    *out = num / den;
+    return 0;
+}
+
+
+#line 234
+
+/*
+ * True division: both npy_ulong operands are converted to npy_double and
+ * the quotient is stored in *out.  No zero check is made; always
+ * returns 0.
+ */
+static inline int
+ulong_ctype_true_divide(npy_ulong a, npy_ulong b, npy_double *out)
+{
+    const npy_double num = (npy_double)a;
+    const npy_double den = (npy_double)b;
+
+    *out = num / den;
+    return 0;
+}
+
+
+#line 234
+
+/*
+ * True division: both npy_longlong operands are converted to npy_double
+ * and the quotient is stored in *out.  No zero check is made; always
+ * returns 0.
+ */
+static inline int
+longlong_ctype_true_divide(npy_longlong a, npy_longlong b, npy_double *out)
+{
+    const npy_double num = (npy_double)a;
+    const npy_double den = (npy_double)b;
+
+    *out = num / den;
+    return 0;
+}
+
+
+#line 234
+
+/*
+ * True division: both npy_ulonglong operands are converted to npy_double
+ * and the quotient is stored in *out.  No zero check is made; always
+ * returns 0.
+ */
+static inline int
+ulonglong_ctype_true_divide(npy_ulonglong a, npy_ulonglong b, npy_double *out)
+{
+    const npy_double num = (npy_double)a;
+    const npy_double den = (npy_double)b;
+
+    *out = num / den;
+    return 0;
+}
+
+
+
+/* b will always be non-negative in this call (b == 0 is handled below) */
+#line 254
+/*
+ * Integer power a**b for npy_byte by repeated squaring; the result is
+ * stored in *out.  No overflow is reported: always returns 0.
+ */
+static inline int
+byte_ctype_power(npy_byte a, npy_byte b, npy_byte *out) {
+    npy_byte acc;
+
+    /* x**0 and 1**y are both 1 */
+    if (b == 0 || a == 1) {
+        *out = 1;
+        return 0;
+    }
+
+    /* consume the exponent one bit at a time, lowest bit first */
+    acc = (b & 1) ? a : 1;
+    for (b >>= 1; b > 0; b >>= 1) {
+        a *= a;
+        if (b & 1) {
+            acc *= a;
+        }
+    }
+    *out = acc;
+    return 0;
+}
+
+#line 254
+/*
+ * Integer power a**b for npy_ubyte by repeated squaring; the result is
+ * stored in *out.  No overflow is reported: always returns 0.
+ */
+static inline int
+ubyte_ctype_power(npy_ubyte a, npy_ubyte b, npy_ubyte *out) {
+    npy_ubyte acc;
+
+    /* x**0 and 1**y are both 1 */
+    if (b == 0 || a == 1) {
+        *out = 1;
+        return 0;
+    }
+
+    /* consume the exponent one bit at a time, lowest bit first */
+    acc = (b & 1) ? a : 1;
+    for (b >>= 1; b > 0; b >>= 1) {
+        a *= a;
+        if (b & 1) {
+            acc *= a;
+        }
+    }
+    *out = acc;
+    return 0;
+}
+
+#line 254
+/*
+ * Integer power a**b for npy_short by repeated squaring; the result is
+ * stored in *out.  No overflow is reported: always returns 0.
+ */
+static inline int
+short_ctype_power(npy_short a, npy_short b, npy_short *out) {
+    npy_short acc;
+
+    /* x**0 and 1**y are both 1 */
+    if (b == 0 || a == 1) {
+        *out = 1;
+        return 0;
+    }
+
+    /* consume the exponent one bit at a time, lowest bit first */
+    acc = (b & 1) ? a : 1;
+    for (b >>= 1; b > 0; b >>= 1) {
+        a *= a;
+        if (b & 1) {
+            acc *= a;
+        }
+    }
+    *out = acc;
+    return 0;
+}
+
+#line 254
+/*
+ * Integer power a**b for npy_ushort by repeated squaring; the wrapped
+ * result is stored in *out.  No overflow is reported: always returns 0.
+ *
+ * The products are formed in npy_uint.  A plain `a * a` would promote
+ * both npy_ushort operands to signed int, and 65535 * 65535 overflows a
+ * 32-bit int, which is undefined behaviour.  Unsigned arithmetic wraps,
+ * and narrowing back to npy_ushort keeps the low bits.
+ */
+static inline int
+ushort_ctype_power(npy_ushort a, npy_ushort b, npy_ushort *out) {
+    npy_ushort tmp;
+
+    if (b == 0) {
+        *out = 1;
+        return 0;
+    }
+    if (a == 1) {
+        *out = 1;
+        return 0;
+    }
+
+    tmp = b & 1 ? a : 1;
+    b >>= 1;
+    while (b > 0) {
+        a = (npy_ushort)((npy_uint)a * (npy_uint)a);
+        if (b & 1) {
+            tmp = (npy_ushort)((npy_uint)tmp * (npy_uint)a);
+        }
+        b >>= 1;
+    }
+    *out = tmp;
+    return 0;
+}
+
+#line 254
+/*
+ * Integer power a**b for npy_int by repeated squaring; the wrapped
+ * result is stored in *out.  No overflow is reported: always returns 0.
+ *
+ * The products are accumulated in npy_uint because signed multiplication
+ * overflow is undefined behaviour in C, while unsigned arithmetic wraps.
+ * Converting the result back to npy_int is implementation-defined; this
+ * assumes the usual two's-complement wrap (NOTE(review): confirm for all
+ * target compilers).
+ */
+static inline int
+int_ctype_power(npy_int a, npy_int b, npy_int *out) {
+    npy_uint base = (npy_uint)a;
+    npy_uint tmp;
+
+    if (b == 0) {
+        *out = 1;
+        return 0;
+    }
+    if (a == 1) {
+        *out = 1;
+        return 0;
+    }
+
+    tmp = b & 1 ? base : 1;
+    b >>= 1;
+    while (b > 0) {
+        base *= base;
+        if (b & 1) {
+            tmp *= base;
+        }
+        b >>= 1;
+    }
+    *out = (npy_int)tmp;
+    return 0;
+}
+
+#line 254
+/*
+ * Integer power a**b for npy_uint by repeated squaring; the result is
+ * stored in *out.  No overflow is reported: always returns 0.
+ */
+static inline int
+uint_ctype_power(npy_uint a, npy_uint b, npy_uint *out) {
+    npy_uint acc;
+
+    /* x**0 and 1**y are both 1 */
+    if (b == 0 || a == 1) {
+        *out = 1;
+        return 0;
+    }
+
+    /* consume the exponent one bit at a time, lowest bit first */
+    acc = (b & 1) ? a : 1;
+    for (b >>= 1; b > 0; b >>= 1) {
+        a *= a;
+        if (b & 1) {
+            acc *= a;
+        }
+    }
+    *out = acc;
+    return 0;
+}
+
+#line 254
+/*
+ * Integer power a**b for npy_long by repeated squaring; the wrapped
+ * result is stored in *out.  No overflow is reported: always returns 0.
+ *
+ * The products are accumulated in npy_ulong because signed multiplication
+ * overflow is undefined behaviour in C, while unsigned arithmetic wraps.
+ * Converting the result back to npy_long is implementation-defined; this
+ * assumes the usual two's-complement wrap (NOTE(review): confirm for all
+ * target compilers).
+ */
+static inline int
+long_ctype_power(npy_long a, npy_long b, npy_long *out) {
+    npy_ulong base = (npy_ulong)a;
+    npy_ulong tmp;
+
+    if (b == 0) {
+        *out = 1;
+        return 0;
+    }
+    if (a == 1) {
+        *out = 1;
+        return 0;
+    }
+
+    tmp = b & 1 ? base : 1;
+    b >>= 1;
+    while (b > 0) {
+        base *= base;
+        if (b & 1) {
+            tmp *= base;
+        }
+        b >>= 1;
+    }
+    *out = (npy_long)tmp;
+    return 0;
+}
+
+#line 254
+/*
+ * Integer power a**b for npy_ulong by repeated squaring; the result is
+ * stored in *out.  No overflow is reported: always returns 0.
+ */
+static inline int
+ulong_ctype_power(npy_ulong a, npy_ulong b, npy_ulong *out) {
+    npy_ulong acc;
+
+    /* x**0 and 1**y are both 1 */
+    if (b == 0 || a == 1) {
+        *out = 1;
+        return 0;
+    }
+
+    /* consume the exponent one bit at a time, lowest bit first */
+    acc = (b & 1) ? a : 1;
+    for (b >>= 1; b > 0; b >>= 1) {
+        a *= a;
+        if (b & 1) {
+            acc *= a;
+        }
+    }
+    *out = acc;
+    return 0;
+}
+
+#line 254
+/*
+ * Integer power a**b for npy_longlong by repeated squaring; the wrapped
+ * result is stored in *out.  No overflow is reported: always returns 0.
+ *
+ * The products are accumulated in npy_ulonglong because signed
+ * multiplication overflow is undefined behaviour in C, while unsigned
+ * arithmetic wraps.  Converting the result back to npy_longlong is
+ * implementation-defined; this assumes the usual two's-complement wrap
+ * (NOTE(review): confirm for all target compilers).
+ */
+static inline int
+longlong_ctype_power(npy_longlong a, npy_longlong b, npy_longlong *out) {
+    npy_ulonglong base = (npy_ulonglong)a;
+    npy_ulonglong tmp;
+
+    if (b == 0) {
+        *out = 1;
+        return 0;
+    }
+    if (a == 1) {
+        *out = 1;
+        return 0;
+    }
+
+    tmp = b & 1 ? base : 1;
+    b >>= 1;
+    while (b > 0) {
+        base *= base;
+        if (b & 1) {
+            tmp *= base;
+        }
+        b >>= 1;
+    }
+    *out = (npy_longlong)tmp;
+    return 0;
+}
+
+#line 254
+/*
+ * Integer power a**b for npy_ulonglong by repeated squaring; the result
+ * is stored in *out.  No overflow is reported: always returns 0.
+ */
+static inline int
+ulonglong_ctype_power(npy_ulonglong a, npy_ulonglong b, npy_ulonglong *out) {
+    npy_ulonglong acc;
+
+    /* x**0 and 1**y are both 1 */
+    if (b == 0 || a == 1) {
+        *out = 1;
+        return 0;
+    }
+
+    /* consume the exponent one bit at a time, lowest bit first */
+    acc = (b & 1) ? a : 1;
+    for (b >>= 1; b > 0; b >>= 1) {
+        a *= a;
+        if (b & 1) {
+            acc *= a;
+        }
+    }
+    *out = acc;
+    return 0;
+}
+
+
+
+#line 289
+
+#line 294
+
+/* Bitwise AND of two npy_byte values into *out; always returns 0. */
+static inline int
+byte_ctype_and(npy_byte arg1, npy_byte arg2, npy_byte *out)
+{
+    const npy_byte bits = arg1 & arg2;
+
+    *out = bits;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise XOR of two npy_byte values into *out; always returns 0. */
+static inline int
+byte_ctype_xor(npy_byte arg1, npy_byte arg2, npy_byte *out)
+{
+    const npy_byte bits = arg1 ^ arg2;
+
+    *out = bits;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise OR of two npy_byte values into *out; always returns 0. */
+static inline int
+byte_ctype_or(npy_byte arg1, npy_byte arg2, npy_byte *out)
+{
+    const npy_byte bits = arg1 | arg2;
+
+    *out = bits;
+    return 0;
+}
+
+
+
+/* Left shift delegated to npy_lshifthh; always returns 0. */
+static inline int
+byte_ctype_lshift(npy_byte arg1, npy_byte arg2, npy_byte *out)
+{
+    const npy_byte shifted = npy_lshifthh(arg1, arg2);
+
+    *out = shifted;
+    return 0;
+}
+
+/* Right shift delegated to npy_rshifthh; always returns 0. */
+static inline int
+byte_ctype_rshift(npy_byte arg1, npy_byte arg2, npy_byte *out)
+{
+    const npy_byte shifted = npy_rshifthh(arg1, arg2);
+
+    *out = shifted;
+    return 0;
+}
+
+
+#line 289
+
+#line 294
+
+/* Bitwise AND of two npy_ubyte values into *out; always returns 0. */
+static inline int
+ubyte_ctype_and(npy_ubyte arg1, npy_ubyte arg2, npy_ubyte *out)
+{
+    const npy_ubyte bits = arg1 & arg2;
+
+    *out = bits;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise XOR of two npy_ubyte values into *out; always returns 0. */
+static inline int
+ubyte_ctype_xor(npy_ubyte arg1, npy_ubyte arg2, npy_ubyte *out)
+{
+    const npy_ubyte bits = arg1 ^ arg2;
+
+    *out = bits;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise OR of two npy_ubyte values into *out; always returns 0. */
+static inline int
+ubyte_ctype_or(npy_ubyte arg1, npy_ubyte arg2, npy_ubyte *out)
+{
+    const npy_ubyte bits = arg1 | arg2;
+
+    *out = bits;
+    return 0;
+}
+
+
+
+/* Left shift delegated to npy_lshiftuhh; always returns 0. */
+static inline int
+ubyte_ctype_lshift(npy_ubyte arg1, npy_ubyte arg2, npy_ubyte *out)
+{
+    const npy_ubyte shifted = npy_lshiftuhh(arg1, arg2);
+
+    *out = shifted;
+    return 0;
+}
+
+/* Right shift delegated to npy_rshiftuhh; always returns 0. */
+static inline int
+ubyte_ctype_rshift(npy_ubyte arg1, npy_ubyte arg2, npy_ubyte *out)
+{
+    const npy_ubyte shifted = npy_rshiftuhh(arg1, arg2);
+
+    *out = shifted;
+    return 0;
+}
+
+
+#line 289
+
+#line 294
+
+/* Bitwise AND of two npy_short values into *out; always returns 0. */
+static inline int
+short_ctype_and(npy_short arg1, npy_short arg2, npy_short *out)
+{
+    const npy_short bits = arg1 & arg2;
+
+    *out = bits;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise XOR of two npy_short values into *out; always returns 0. */
+static inline int
+short_ctype_xor(npy_short arg1, npy_short arg2, npy_short *out)
+{
+    const npy_short bits = arg1 ^ arg2;
+
+    *out = bits;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise OR of two npy_short values into *out; always returns 0. */
+static inline int
+short_ctype_or(npy_short arg1, npy_short arg2, npy_short *out)
+{
+    const npy_short bits = arg1 | arg2;
+
+    *out = bits;
+    return 0;
+}
+
+
+
+/* Left shift delegated to npy_lshifth; always returns 0. */
+static inline int
+short_ctype_lshift(npy_short arg1, npy_short arg2, npy_short *out)
+{
+    const npy_short shifted = npy_lshifth(arg1, arg2);
+
+    *out = shifted;
+    return 0;
+}
+
+/* Right shift delegated to npy_rshifth; always returns 0. */
+static inline int
+short_ctype_rshift(npy_short arg1, npy_short arg2, npy_short *out)
+{
+    const npy_short shifted = npy_rshifth(arg1, arg2);
+
+    *out = shifted;
+    return 0;
+}
+
+
+#line 289
+
+#line 294
+
+static inline int
+ushort_ctype_and(npy_ushort arg1, npy_ushort arg2, npy_ushort *out)
+{
+    *out = arg1 & arg2;
+    return 0;
+}
+
+
+#line 294
+
+static inline int
+ushort_ctype_xor(npy_ushort arg1, npy_ushort arg2, npy_ushort *out)
+{
+    *out = arg1 ^ arg2;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise OR: stores arg1 | arg2 in *out and always returns 0. */
+static inline int
+ushort_ctype_or(npy_ushort arg1, npy_ushort arg2, npy_ushort *out)
+{
+    const npy_ushort merged = arg1 | arg2;
+    *out = merged;
+    return 0;
+}
+
+
+
+/* Left shift via npy_lshiftuh(); stores the result in *out, returns 0. */
+static inline int
+ushort_ctype_lshift(npy_ushort arg1, npy_ushort arg2, npy_ushort *out)
+{
+    const npy_ushort shifted = npy_lshiftuh(arg1, arg2);
+    *out = shifted;
+    return 0;
+}
+
+/* Right shift via npy_rshiftuh(); stores the result in *out, returns 0. */
+static inline int
+ushort_ctype_rshift(npy_ushort arg1, npy_ushort arg2, npy_ushort *out)
+{
+    const npy_ushort shifted = npy_rshiftuh(arg1, arg2);
+    *out = shifted;
+    return 0;
+}
+
+
+#line 289
+
+#line 294
+
+/* Bitwise AND: stores arg1 & arg2 in *out and always returns 0. */
+static inline int
+int_ctype_and(npy_int arg1, npy_int arg2, npy_int *out)
+{
+    const npy_int masked = arg1 & arg2;
+    *out = masked;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise XOR: stores arg1 ^ arg2 in *out and always returns 0. */
+static inline int
+int_ctype_xor(npy_int arg1, npy_int arg2, npy_int *out)
+{
+    const npy_int toggled = arg1 ^ arg2;
+    *out = toggled;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise OR: stores arg1 | arg2 in *out and always returns 0. */
+static inline int
+int_ctype_or(npy_int arg1, npy_int arg2, npy_int *out)
+{
+    const npy_int merged = arg1 | arg2;
+    *out = merged;
+    return 0;
+}
+
+
+
+/* Left shift via npy_lshift(); stores the result in *out, returns 0. */
+static inline int
+int_ctype_lshift(npy_int arg1, npy_int arg2, npy_int *out)
+{
+    const npy_int shifted = npy_lshift(arg1, arg2);
+    *out = shifted;
+    return 0;
+}
+
+/* Right shift via npy_rshift(); stores the result in *out, returns 0. */
+static inline int
+int_ctype_rshift(npy_int arg1, npy_int arg2, npy_int *out)
+{
+    const npy_int shifted = npy_rshift(arg1, arg2);
+    *out = shifted;
+    return 0;
+}
+
+
+#line 289
+
+#line 294
+
+/* Bitwise AND: stores arg1 & arg2 in *out and always returns 0. */
+static inline int
+uint_ctype_and(npy_uint arg1, npy_uint arg2, npy_uint *out)
+{
+    const npy_uint masked = arg1 & arg2;
+    *out = masked;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise XOR: stores arg1 ^ arg2 in *out and always returns 0. */
+static inline int
+uint_ctype_xor(npy_uint arg1, npy_uint arg2, npy_uint *out)
+{
+    const npy_uint toggled = arg1 ^ arg2;
+    *out = toggled;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise OR: stores arg1 | arg2 in *out and always returns 0. */
+static inline int
+uint_ctype_or(npy_uint arg1, npy_uint arg2, npy_uint *out)
+{
+    const npy_uint merged = arg1 | arg2;
+    *out = merged;
+    return 0;
+}
+
+
+
+/* Left shift via npy_lshiftu(); stores the result in *out, returns 0. */
+static inline int
+uint_ctype_lshift(npy_uint arg1, npy_uint arg2, npy_uint *out)
+{
+    const npy_uint shifted = npy_lshiftu(arg1, arg2);
+    *out = shifted;
+    return 0;
+}
+
+/* Right shift via npy_rshiftu(); stores the result in *out, returns 0. */
+static inline int
+uint_ctype_rshift(npy_uint arg1, npy_uint arg2, npy_uint *out)
+{
+    const npy_uint shifted = npy_rshiftu(arg1, arg2);
+    *out = shifted;
+    return 0;
+}
+
+
+#line 289
+
+#line 294
+
+/* Bitwise AND: stores arg1 & arg2 in *out and always returns 0. */
+static inline int
+long_ctype_and(npy_long arg1, npy_long arg2, npy_long *out)
+{
+    const npy_long masked = arg1 & arg2;
+    *out = masked;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise XOR: stores arg1 ^ arg2 in *out and always returns 0. */
+static inline int
+long_ctype_xor(npy_long arg1, npy_long arg2, npy_long *out)
+{
+    const npy_long toggled = arg1 ^ arg2;
+    *out = toggled;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise OR: stores arg1 | arg2 in *out and always returns 0. */
+static inline int
+long_ctype_or(npy_long arg1, npy_long arg2, npy_long *out)
+{
+    const npy_long merged = arg1 | arg2;
+    *out = merged;
+    return 0;
+}
+
+
+
+/* Left shift via npy_lshiftl(); stores the result in *out, returns 0. */
+static inline int
+long_ctype_lshift(npy_long arg1, npy_long arg2, npy_long *out)
+{
+    const npy_long shifted = npy_lshiftl(arg1, arg2);
+    *out = shifted;
+    return 0;
+}
+
+/* Right shift via npy_rshiftl(); stores the result in *out, returns 0. */
+static inline int
+long_ctype_rshift(npy_long arg1, npy_long arg2, npy_long *out)
+{
+    const npy_long shifted = npy_rshiftl(arg1, arg2);
+    *out = shifted;
+    return 0;
+}
+
+
+#line 289
+
+#line 294
+
+/* Bitwise AND: stores arg1 & arg2 in *out and always returns 0. */
+static inline int
+ulong_ctype_and(npy_ulong arg1, npy_ulong arg2, npy_ulong *out)
+{
+    const npy_ulong masked = arg1 & arg2;
+    *out = masked;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise XOR: stores arg1 ^ arg2 in *out and always returns 0. */
+static inline int
+ulong_ctype_xor(npy_ulong arg1, npy_ulong arg2, npy_ulong *out)
+{
+    const npy_ulong toggled = arg1 ^ arg2;
+    *out = toggled;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise OR: stores arg1 | arg2 in *out and always returns 0. */
+static inline int
+ulong_ctype_or(npy_ulong arg1, npy_ulong arg2, npy_ulong *out)
+{
+    const npy_ulong merged = arg1 | arg2;
+    *out = merged;
+    return 0;
+}
+
+
+
+/* Left shift via npy_lshiftul(); stores the result in *out, returns 0. */
+static inline int
+ulong_ctype_lshift(npy_ulong arg1, npy_ulong arg2, npy_ulong *out)
+{
+    const npy_ulong shifted = npy_lshiftul(arg1, arg2);
+    *out = shifted;
+    return 0;
+}
+
+/* Right shift via npy_rshiftul(); stores the result in *out, returns 0. */
+static inline int
+ulong_ctype_rshift(npy_ulong arg1, npy_ulong arg2, npy_ulong *out)
+{
+    const npy_ulong shifted = npy_rshiftul(arg1, arg2);
+    *out = shifted;
+    return 0;
+}
+
+
+#line 289
+
+#line 294
+
+/* Bitwise AND: stores arg1 & arg2 in *out and always returns 0. */
+static inline int
+longlong_ctype_and(npy_longlong arg1, npy_longlong arg2, npy_longlong *out)
+{
+    const npy_longlong masked = arg1 & arg2;
+    *out = masked;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise XOR: stores arg1 ^ arg2 in *out and always returns 0. */
+static inline int
+longlong_ctype_xor(npy_longlong arg1, npy_longlong arg2, npy_longlong *out)
+{
+    const npy_longlong toggled = arg1 ^ arg2;
+    *out = toggled;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise OR: stores arg1 | arg2 in *out and always returns 0. */
+static inline int
+longlong_ctype_or(npy_longlong arg1, npy_longlong arg2, npy_longlong *out)
+{
+    const npy_longlong merged = arg1 | arg2;
+    *out = merged;
+    return 0;
+}
+
+
+
+/* Left shift via npy_lshiftll(); stores the result in *out, returns 0. */
+static inline int
+longlong_ctype_lshift(npy_longlong arg1, npy_longlong arg2, npy_longlong *out)
+{
+    const npy_longlong shifted = npy_lshiftll(arg1, arg2);
+    *out = shifted;
+    return 0;
+}
+
+/* Right shift via npy_rshiftll(); stores the result in *out, returns 0. */
+static inline int
+longlong_ctype_rshift(npy_longlong arg1, npy_longlong arg2, npy_longlong *out)
+{
+    const npy_longlong shifted = npy_rshiftll(arg1, arg2);
+    *out = shifted;
+    return 0;
+}
+
+
+#line 289
+
+#line 294
+
+/* Bitwise AND: stores arg1 & arg2 in *out and always returns 0. */
+static inline int
+ulonglong_ctype_and(npy_ulonglong arg1, npy_ulonglong arg2, npy_ulonglong *out)
+{
+    const npy_ulonglong masked = arg1 & arg2;
+    *out = masked;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise XOR: stores arg1 ^ arg2 in *out and always returns 0. */
+static inline int
+ulonglong_ctype_xor(npy_ulonglong arg1, npy_ulonglong arg2, npy_ulonglong *out)
+{
+    const npy_ulonglong toggled = arg1 ^ arg2;
+    *out = toggled;
+    return 0;
+}
+
+
+#line 294
+
+/* Bitwise OR: stores arg1 | arg2 in *out and always returns 0. */
+static inline int
+ulonglong_ctype_or(npy_ulonglong arg1, npy_ulonglong arg2, npy_ulonglong *out)
+{
+    const npy_ulonglong merged = arg1 | arg2;
+    *out = merged;
+    return 0;
+}
+
+
+
+/* Left shift via npy_lshiftull(); stores the result in *out, returns 0. */
+static inline int
+ulonglong_ctype_lshift(npy_ulonglong arg1, npy_ulonglong arg2, npy_ulonglong *out)
+{
+    const npy_ulonglong shifted = npy_lshiftull(arg1, arg2);
+    *out = shifted;
+    return 0;
+}
+
+/* Right shift via npy_rshiftull(); stores the result in *out, returns 0. */
+static inline int
+ulonglong_ctype_rshift(npy_ulonglong arg1, npy_ulonglong arg2, npy_ulonglong *out)
+{
+    const npy_ulonglong shifted = npy_rshiftull(arg1, arg2);
+    *out = shifted;
+    return 0;
+}
+
+
+
+#line 325
+
+#line 330
+
+/* Stores the sum a + b in *out and always returns 0. */
+static inline int
+float_ctype_add(npy_float a, npy_float b, npy_float *out)
+{
+    const npy_float sum = a + b;
+    *out = sum;
+    return 0;
+}
+
+
+#line 330
+
+/* Stores the difference a - b in *out and always returns 0. */
+static inline int
+float_ctype_subtract(npy_float a, npy_float b, npy_float *out)
+{
+    const npy_float difference = a - b;
+    *out = difference;
+    return 0;
+}
+
+
+#line 330
+
+/* Stores the product a * b in *out and always returns 0. */
+static inline int
+float_ctype_multiply(npy_float a, npy_float b, npy_float *out)
+{
+    const npy_float product = a * b;
+    *out = product;
+    return 0;
+}
+
+
+#line 330
+
+/* Stores the quotient a / b in *out and always returns 0. */
+static inline int
+float_ctype_divide(npy_float a, npy_float b, npy_float *out)
+{
+    const npy_float quotient = a / b;
+    *out = quotient;
+    return 0;
+}
+
+
+
+#define float_ctype_true_divide float_ctype_divide
+
+
+/* Stores npy_floor_dividef(a, b) in *out and always returns 0. */
+static inline int
+float_ctype_floor_divide(npy_float a, npy_float b, npy_float *out)
+{
+    const npy_float floored = npy_floor_dividef(a, b);
+    *out = floored;
+    return 0;
+}
+
+
+/* Stores npy_remainderf(a, b) in *out and always returns 0. */
+static inline int
+float_ctype_remainder(npy_float a, npy_float b, npy_float *out)
+{
+    const npy_float rem = npy_remainderf(a, b);
+    *out = rem;
+    return 0;
+}
+
+
+/*
+ * Calls npy_divmodf(a, b, out2): the helper's return value is stored in
+ * *out1 while the helper writes *out2 itself.  Always returns 0.
+ */
+static inline int
+float_ctype_divmod(npy_float a, npy_float b, npy_float *out1, npy_float *out2)
+{
+    const npy_float first = npy_divmodf(a, b, out2);
+    *out1 = first;
+    return 0;
+}
+
+
+
+#line 325
+
+#line 330
+
+/* Stores the sum a + b in *out and always returns 0. */
+static inline int
+double_ctype_add(npy_double a, npy_double b, npy_double *out)
+{
+    const npy_double sum = a + b;
+    *out = sum;
+    return 0;
+}
+
+
+#line 330
+
+/* Stores the difference a - b in *out and always returns 0. */
+static inline int
+double_ctype_subtract(npy_double a, npy_double b, npy_double *out)
+{
+    const npy_double difference = a - b;
+    *out = difference;
+    return 0;
+}
+
+
+#line 330
+
+/* Stores the product a * b in *out and always returns 0. */
+static inline int
+double_ctype_multiply(npy_double a, npy_double b, npy_double *out)
+{
+    const npy_double product = a * b;
+    *out = product;
+    return 0;
+}
+
+
+#line 330
+
+/* Stores the quotient a / b in *out and always returns 0. */
+static inline int
+double_ctype_divide(npy_double a, npy_double b, npy_double *out)
+{
+    const npy_double quotient = a / b;
+    *out = quotient;
+    return 0;
+}
+
+
+
+#define double_ctype_true_divide double_ctype_divide
+
+
+/* Stores npy_floor_divide(a, b) in *out and always returns 0. */
+static inline int
+double_ctype_floor_divide(npy_double a, npy_double b, npy_double *out)
+{
+    const npy_double floored = npy_floor_divide(a, b);
+    *out = floored;
+    return 0;
+}
+
+
+/* Stores npy_remainder(a, b) in *out and always returns 0. */
+static inline int
+double_ctype_remainder(npy_double a, npy_double b, npy_double *out)
+{
+    const npy_double rem = npy_remainder(a, b);
+    *out = rem;
+    return 0;
+}
+
+
+/*
+ * Calls npy_divmod(a, b, out2): the helper's return value is stored in
+ * *out1 while the helper writes *out2 itself.  Always returns 0.
+ */
+static inline int
+double_ctype_divmod(npy_double a, npy_double b, npy_double *out1, npy_double *out2)
+{
+    const npy_double first = npy_divmod(a, b, out2);
+    *out1 = first;
+    return 0;
+}
+
+
+
+#line 325
+
+#line 330
+
+/* Stores the sum a + b in *out and always returns 0. */
+static inline int
+longdouble_ctype_add(npy_longdouble a, npy_longdouble b, npy_longdouble *out)
+{
+    const npy_longdouble sum = a + b;
+    *out = sum;
+    return 0;
+}
+
+
+#line 330
+
+/* Stores the difference a - b in *out and always returns 0. */
+static inline int
+longdouble_ctype_subtract(npy_longdouble a, npy_longdouble b, npy_longdouble *out)
+{
+    const npy_longdouble difference = a - b;
+    *out = difference;
+    return 0;
+}
+
+
+#line 330
+
+/* Stores the product a * b in *out and always returns 0. */
+static inline int
+longdouble_ctype_multiply(npy_longdouble a, npy_longdouble b, npy_longdouble *out)
+{
+    const npy_longdouble product = a * b;
+    *out = product;
+    return 0;
+}
+
+
+#line 330
+
+/* Stores the quotient a / b in *out and always returns 0. */
+static inline int
+longdouble_ctype_divide(npy_longdouble a, npy_longdouble b, npy_longdouble *out)
+{
+    const npy_longdouble quotient = a / b;
+    *out = quotient;
+    return 0;
+}
+
+
+
+#define longdouble_ctype_true_divide longdouble_ctype_divide
+
+
+/* Stores npy_floor_dividel(a, b) in *out and always returns 0. */
+static inline int
+longdouble_ctype_floor_divide(npy_longdouble a, npy_longdouble b, npy_longdouble *out)
+{
+    const npy_longdouble floored = npy_floor_dividel(a, b);
+    *out = floored;
+    return 0;
+}
+
+
+/* Stores npy_remainderl(a, b) in *out and always returns 0. */
+static inline int
+longdouble_ctype_remainder(npy_longdouble a, npy_longdouble b, npy_longdouble *out)
+{
+    const npy_longdouble rem = npy_remainderl(a, b);
+    *out = rem;
+    return 0;
+}
+
+
+/*
+ * Calls npy_divmodl(a, b, out2): the helper's return value is stored in
+ * *out1 while the helper writes *out2 itself.  Always returns 0.
+ */
+static inline int
+longdouble_ctype_divmod(npy_longdouble a, npy_longdouble b, npy_longdouble *out1, npy_longdouble *out2)
+{
+    const npy_longdouble first = npy_divmodl(a, b, out2);
+    *out1 = first;
+    return 0;
+}
+
+
+
+
+#line 370
+
+/*
+ * Adds two npy_half values: both are converted with npy_half_to_float(), the
+ * float sum is converted back with npy_float_to_half().  Always returns 0.
+ */
+static inline int
+half_ctype_add(npy_half a, npy_half b, npy_half *out)
+{
+    const float lhs = npy_half_to_float(a);
+    const float rhs = npy_half_to_float(b);
+    *out = npy_float_to_half(lhs + rhs);
+    return 0;
+}
+
+
+#line 370
+
+/*
+ * Subtracts two npy_half values: both are converted with npy_half_to_float(),
+ * the float difference is converted back with npy_float_to_half().
+ * Always returns 0.
+ */
+static inline int
+half_ctype_subtract(npy_half a, npy_half b, npy_half *out)
+{
+    const float lhs = npy_half_to_float(a);
+    const float rhs = npy_half_to_float(b);
+    *out = npy_float_to_half(lhs - rhs);
+    return 0;
+}
+
+
+#line 370
+
+/*
+ * Multiplies two npy_half values: both are converted with
+ * npy_half_to_float(), the float product is converted back with
+ * npy_float_to_half().  Always returns 0.
+ */
+static inline int
+half_ctype_multiply(npy_half a, npy_half b, npy_half *out)
+{
+    const float lhs = npy_half_to_float(a);
+    const float rhs = npy_half_to_float(b);
+    *out = npy_float_to_half(lhs * rhs);
+    return 0;
+}
+
+
+#line 370
+
+/*
+ * Divides two npy_half values: both are converted with npy_half_to_float(),
+ * the float quotient is converted back with npy_float_to_half().
+ * Always returns 0.
+ */
+static inline int
+half_ctype_divide(npy_half a, npy_half b, npy_half *out)
+{
+    const float lhs = npy_half_to_float(a);
+    const float rhs = npy_half_to_float(b);
+    *out = npy_float_to_half(lhs / rhs);
+    return 0;
+}
+
+
+#define half_ctype_true_divide half_ctype_divide
+
+
+/*
+ * Floor division for npy_half.  When b tests false (`!b`) the quotient is
+ * computed directly in float and converted back; otherwise the value returned
+ * by npy_half_divmod() is used and its third (output) argument is discarded.
+ * Always returns 0.
+ */
+static inline int
+half_ctype_floor_divide(npy_half a, npy_half b, npy_half *out)
+{
+    if (b) {
+        npy_half unused_mod;
+        *out = npy_half_divmod(a, b, &unused_mod);
+    }
+    else {
+        const float quotient = npy_half_to_float(a) / npy_half_to_float(b);
+        *out = npy_float_to_half(quotient);
+    }
+    return 0;
+}
+
+
+/*
+ * Remainder for npy_half: npy_half_divmod() fills *out through its third
+ * argument; the value it returns is ignored.  Always returns 0.
+ */
+static inline int
+half_ctype_remainder(npy_half a, npy_half b, npy_half *out)
+{
+    (void)npy_half_divmod(a, b, out);
+    return 0;
+}
+
+
+/*
+ * Calls npy_half_divmod(a, b, out2): the helper's return value is stored in
+ * *out1 while the helper writes *out2 itself.  Always returns 0.
+ */
+static inline int
+half_ctype_divmod(npy_half a, npy_half b, npy_half *out1, npy_half *out2)
+{
+    const npy_half first = npy_half_divmod(a, b, out2);
+    *out1 = first;
+    return 0;
+}
+
+#line 422
+/* Component-wise complex addition; writes both parts of *out, returns 0. */
+static inline int
+cfloat_ctype_add(npy_cfloat a, npy_cfloat b, npy_cfloat *out)
+{
+    npy_cfloat sum;
+    sum.real = a.real + b.real;
+    sum.imag = a.imag + b.imag;
+    *out = sum;
+    return 0;
+}
+
+/* Component-wise complex subtraction; writes both parts of *out, returns 0. */
+static inline int
+cfloat_ctype_subtract(npy_cfloat a, npy_cfloat b, npy_cfloat *out)
+{
+    npy_cfloat difference;
+    difference.real = a.real - b.real;
+    difference.imag = a.imag - b.imag;
+    *out = difference;
+    return 0;
+}
+
+
+/*
+ * Complex multiplication: writes the real and imaginary parts of a * b to
+ * *out and returns 0.
+ *
+ * TODO: Mark as [qualifier name missing from this comment] to work around
+ *       FPEs not being issued on clang 12.
+ *       This should be removed when possible.
+ */
+static inline int
+cfloat_ctype_multiply( npy_cfloat a, npy_cfloat b, npy_cfloat *out)
+{
+    /* (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br) */
+    out->real = a.real * b.real - a.imag * b.imag;
+    out->imag = a.real * b.imag + a.imag * b.real;
+    return 0;
+}
+
+/*
+ * Complex division.  Runs the CFLOAT_divide ufunc loop on a single element
+ * (count 1, all strides 0) to avoid duplicating its complicated logic.
+ * Always returns 0.
+ */
+static inline int
+cfloat_ctype_divide(npy_cfloat a, npy_cfloat b, npy_cfloat *out)
+{
+    npy_intp count = 1;
+    npy_intp strides[3] = {0, 0, 0};
+    char *operands[3] = {(char *)&a, (char *)&b, (char *)out};
+    CFLOAT_divide(operands, &count, strides, NULL);
+    return 0;
+}
+
+#define cfloat_ctype_true_divide cfloat_ctype_divide
+
+
+#line 422
+/* Component-wise complex addition; writes both parts of *out, returns 0. */
+static inline int
+cdouble_ctype_add(npy_cdouble a, npy_cdouble b, npy_cdouble *out)
+{
+    npy_cdouble sum;
+    sum.real = a.real + b.real;
+    sum.imag = a.imag + b.imag;
+    *out = sum;
+    return 0;
+}
+
+/* Component-wise complex subtraction; writes both parts of *out, returns 0. */
+static inline int
+cdouble_ctype_subtract(npy_cdouble a, npy_cdouble b, npy_cdouble *out)
+{
+    npy_cdouble difference;
+    difference.real = a.real - b.real;
+    difference.imag = a.imag - b.imag;
+    *out = difference;
+    return 0;
+}
+
+
+/*
+ * Complex multiplication: writes the real and imaginary parts of a * b to
+ * *out and returns 0.
+ *
+ * TODO: Mark as [qualifier name missing from this comment] to work around
+ *       FPEs not being issued on clang 12.
+ *       This should be removed when possible.
+ */
+static inline int
+cdouble_ctype_multiply( npy_cdouble a, npy_cdouble b, npy_cdouble *out)
+{
+    /* (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br) */
+    out->real = a.real * b.real - a.imag * b.imag;
+    out->imag = a.real * b.imag + a.imag * b.real;
+    return 0;
+}
+
+/*
+ * Complex division.  Runs the CDOUBLE_divide ufunc loop on a single element
+ * (count 1, all strides 0) to avoid duplicating its complicated logic.
+ * Always returns 0.
+ */
+static inline int
+cdouble_ctype_divide(npy_cdouble a, npy_cdouble b, npy_cdouble *out)
+{
+    npy_intp count = 1;
+    npy_intp strides[3] = {0, 0, 0};
+    char *operands[3] = {(char *)&a, (char *)&b, (char *)out};
+    CDOUBLE_divide(operands, &count, strides, NULL);
+    return 0;
+}
+
+#define cdouble_ctype_true_divide cdouble_ctype_divide
+
+
+#line 422
+/* Component-wise complex addition; writes both parts of *out, returns 0. */
+static inline int
+clongdouble_ctype_add(npy_clongdouble a, npy_clongdouble b, npy_clongdouble *out)
+{
+    npy_clongdouble sum;
+    sum.real = a.real + b.real;
+    sum.imag = a.imag + b.imag;
+    *out = sum;
+    return 0;
+}
+
+/* Component-wise complex subtraction; writes both parts of *out, returns 0. */
+static inline int
+clongdouble_ctype_subtract(npy_clongdouble a, npy_clongdouble b, npy_clongdouble *out)
+{
+    npy_clongdouble difference;
+    difference.real = a.real - b.real;
+    difference.imag = a.imag - b.imag;
+    *out = difference;
+    return 0;
+}
+
+
+/*
+ * Complex multiplication: writes the real and imaginary parts of a * b to
+ * *out and returns 0.
+ *
+ * TODO: Mark as [qualifier name missing from this comment] to work around
+ *       FPEs not being issued on clang 12.
+ *       This should be removed when possible.
+ */
+static inline int
+clongdouble_ctype_multiply( npy_clongdouble a, npy_clongdouble b, npy_clongdouble *out)
+{
+    /* (ar + i*ai) * (br + i*bi) = (ar*br - ai*bi) + i*(ar*bi + ai*br) */
+    out->real = a.real * b.real - a.imag * b.imag;
+    out->imag = a.real * b.imag + a.imag * b.real;
+    return 0;
+}
+
+/*
+ * Complex division.  Runs the CLONGDOUBLE_divide ufunc loop on a single
+ * element (count 1, all strides 0) to avoid duplicating its complicated
+ * logic.  Always returns 0.
+ */
+static inline int
+clongdouble_ctype_divide(npy_clongdouble a, npy_clongdouble b, npy_clongdouble *out)
+{
+    npy_intp count = 1;
+    npy_intp strides[3] = {0, 0, 0};
+    char *operands[3] = {(char *)&a, (char *)&b, (char *)out};
+    CLONGDOUBLE_divide(operands, &count, strides, NULL);
+    return 0;
+}
+
+#define clongdouble_ctype_true_divide clongdouble_ctype_divide
+
+
+
+
+
+#line 472
+
+/*
+ * Integer divmod built from byte_ctype_floor_divide() (result in *out) and
+ * byte_ctype_remainder() (result in *out2), called in that order.  Returns
+ * the bitwise OR of the two status values.
+ */
+static inline int
+byte_ctype_divmod(npy_byte a, npy_byte b, npy_byte *out, npy_byte *out2)
+{
+    const int quot_status = byte_ctype_floor_divide(a, b, out);
+    const int rem_status = byte_ctype_remainder(a, b, out2);
+    return quot_status | rem_status;
+}
+
+
+#line 472
+
+/*
+ * Integer divmod built from ubyte_ctype_floor_divide() (result in *out) and
+ * ubyte_ctype_remainder() (result in *out2), called in that order.  Returns
+ * the bitwise OR of the two status values.
+ */
+static inline int
+ubyte_ctype_divmod(npy_ubyte a, npy_ubyte b, npy_ubyte *out, npy_ubyte *out2)
+{
+    const int quot_status = ubyte_ctype_floor_divide(a, b, out);
+    const int rem_status = ubyte_ctype_remainder(a, b, out2);
+    return quot_status | rem_status;
+}
+
+
+#line 472
+
+/*
+ * Integer divmod built from short_ctype_floor_divide() (result in *out) and
+ * short_ctype_remainder() (result in *out2), called in that order.  Returns
+ * the bitwise OR of the two status values.
+ */
+static inline int
+short_ctype_divmod(npy_short a, npy_short b, npy_short *out, npy_short *out2)
+{
+    const int quot_status = short_ctype_floor_divide(a, b, out);
+    const int rem_status = short_ctype_remainder(a, b, out2);
+    return quot_status | rem_status;
+}
+
+
+#line 472
+
+/*
+ * Integer divmod built from ushort_ctype_floor_divide() (result in *out) and
+ * ushort_ctype_remainder() (result in *out2), called in that order.  Returns
+ * the bitwise OR of the two status values.
+ */
+static inline int
+ushort_ctype_divmod(npy_ushort a, npy_ushort b, npy_ushort *out, npy_ushort *out2)
+{
+    const int quot_status = ushort_ctype_floor_divide(a, b, out);
+    const int rem_status = ushort_ctype_remainder(a, b, out2);
+    return quot_status | rem_status;
+}
+
+
+#line 472
+
+/*
+ * Integer divmod built from int_ctype_floor_divide() (result in *out) and
+ * int_ctype_remainder() (result in *out2), called in that order.  Returns
+ * the bitwise OR of the two status values.
+ */
+static inline int
+int_ctype_divmod(npy_int a, npy_int b, npy_int *out, npy_int *out2)
+{
+    const int quot_status = int_ctype_floor_divide(a, b, out);
+    const int rem_status = int_ctype_remainder(a, b, out2);
+    return quot_status | rem_status;
+}
+
+
+#line 472
+
+/*
+ * Integer divmod built from uint_ctype_floor_divide() (result in *out) and
+ * uint_ctype_remainder() (result in *out2), called in that order.  Returns
+ * the bitwise OR of the two status values.
+ */
+static inline int
+uint_ctype_divmod(npy_uint a, npy_uint b, npy_uint *out, npy_uint *out2)
+{
+    const int quot_status = uint_ctype_floor_divide(a, b, out);
+    const int rem_status = uint_ctype_remainder(a, b, out2);
+    return quot_status | rem_status;
+}
+
+
+#line 472
+
+/*
+ * Integer divmod built from long_ctype_floor_divide() (result in *out) and
+ * long_ctype_remainder() (result in *out2), called in that order.  Returns
+ * the bitwise OR of the two status values.
+ */
+static inline int
+long_ctype_divmod(npy_long a, npy_long b, npy_long *out, npy_long *out2)
+{
+    const int quot_status = long_ctype_floor_divide(a, b, out);
+    const int rem_status = long_ctype_remainder(a, b, out2);
+    return quot_status | rem_status;
+}
+
+
+#line 472
+
+/*
+ * Integer divmod built from ulong_ctype_floor_divide() (result in *out) and
+ * ulong_ctype_remainder() (result in *out2), called in that order.  Returns
+ * the bitwise OR of the two status values.
+ */
+static inline int
+ulong_ctype_divmod(npy_ulong a, npy_ulong b, npy_ulong *out, npy_ulong *out2)
+{
+    const int quot_status = ulong_ctype_floor_divide(a, b, out);
+    const int rem_status = ulong_ctype_remainder(a, b, out2);
+    return quot_status | rem_status;
+}
+
+
+#line 472
+
+/*
+ * Integer divmod built from longlong_ctype_floor_divide() (result in *out)
+ * and longlong_ctype_remainder() (result in *out2), called in that order.
+ * Returns the bitwise OR of the two status values.
+ */
+static inline int
+longlong_ctype_divmod(npy_longlong a, npy_longlong b, npy_longlong *out, npy_longlong *out2)
+{
+    const int quot_status = longlong_ctype_floor_divide(a, b, out);
+    const int rem_status = longlong_ctype_remainder(a, b, out2);
+    return quot_status | rem_status;
+}
+
+
+#line 472
+
+/*
+ * Integer divmod built from ulonglong_ctype_floor_divide() (result in *out)
+ * and ulonglong_ctype_remainder() (result in *out2), called in that order.
+ * Returns the bitwise OR of the two status values.
+ */
+static inline int
+ulonglong_ctype_divmod(npy_ulonglong a, npy_ulonglong b, npy_ulonglong *out, npy_ulonglong *out2)
+{
+    const int quot_status = ulonglong_ctype_floor_divide(a, b, out);
+    const int rem_status = ulonglong_ctype_remainder(a, b, out2);
+    return quot_status | rem_status;
+}
+
+
+
+
+#line 489
+
+/* Stores npy_powf(a, b) in *out and always returns 0. */
+static inline int
+float_ctype_power(npy_float a, npy_float b, npy_float *out)
+{
+    const npy_float raised = npy_powf(a, b);
+    *out = raised;
+    return 0;
+}
+
+
+#line 489
+
+/* Stores npy_pow(a, b) in *out and always returns 0. */
+static inline int
+double_ctype_power(npy_double a, npy_double b, npy_double *out)
+{
+    const npy_double raised = npy_pow(a, b);
+    *out = raised;
+    return 0;
+}
+
+
+#line 489
+
+/* Stores npy_powl(a, b) in *out and always returns 0. */
+static inline int
+longdouble_ctype_power(npy_longdouble a, npy_longdouble b, npy_longdouble *out)
+{
+    const npy_longdouble raised = npy_powl(a, b);
+    *out = raised;
+    return 0;
+}
+
+
+/*
+ * Power for npy_half: both operands are converted with npy_half_to_float(),
+ * npy_powf() does the work, and the result is converted back with
+ * npy_float_to_half().  Always returns 0.
+ */
+static inline int
+half_ctype_power(npy_half a, npy_half b, npy_half *out)
+{
+    const npy_float base = npy_half_to_float(a);
+    const npy_float exponent = npy_half_to_float(b);
+    *out = npy_float_to_half(npy_powf(base, exponent));
+    return 0;
+}
+
+#line 521
+/*
+ * Negation for npy_byte.  NPY_MIN_BYTE is stored unchanged and reported with
+ * NPY_FPE_OVERFLOW; every other value is negated and 0 is returned.
+ */
+static inline int
+byte_ctype_negative(npy_byte a, npy_byte *out)
+{
+    int status = 0;
+    if (a != NPY_MIN_BYTE) {
+        *out = -a;
+    }
+    else {
+        *out = a;
+        status = NPY_FPE_OVERFLOW;
+    }
+    return status;
+}
+
+#line 521
+/*
+ * Negation for npy_ubyte.  *out always receives -a; the return value is 0
+ * only when a == 0 and NPY_FPE_OVERFLOW for any other input.
+ */
+static inline int
+ubyte_ctype_negative(npy_ubyte a, npy_ubyte *out)
+{
+    *out = -a;
+    return (a != 0) ? NPY_FPE_OVERFLOW : 0;
+}
+
+#line 521
+/*
+ * Negation for npy_short.  NPY_MIN_SHORT is stored unchanged and reported
+ * with NPY_FPE_OVERFLOW; every other value is negated and 0 is returned.
+ */
+static inline int
+short_ctype_negative(npy_short a, npy_short *out)
+{
+    int status = 0;
+    if (a != NPY_MIN_SHORT) {
+        *out = -a;
+    }
+    else {
+        *out = a;
+        status = NPY_FPE_OVERFLOW;
+    }
+    return status;
+}
+
+#line 521
+/*
+ * Negation for npy_ushort.  *out always receives -a; the return value is 0
+ * only when a == 0 and NPY_FPE_OVERFLOW for any other input.
+ */
+static inline int
+ushort_ctype_negative(npy_ushort a, npy_ushort *out)
+{
+    *out = -a;
+    return (a != 0) ? NPY_FPE_OVERFLOW : 0;
+}
+
+#line 521
+/*
+ * Negation for npy_int.  NPY_MIN_INT is stored unchanged and reported with
+ * NPY_FPE_OVERFLOW; every other value is negated and 0 is returned.
+ */
+static inline int
+int_ctype_negative(npy_int a, npy_int *out)
+{
+    int status = 0;
+    if (a != NPY_MIN_INT) {
+        *out = -a;
+    }
+    else {
+        *out = a;
+        status = NPY_FPE_OVERFLOW;
+    }
+    return status;
+}
+
+#line 521
+/*
+ * Negation for npy_uint.  *out always receives -a; the return value is 0
+ * only when a == 0 and NPY_FPE_OVERFLOW for any other input.
+ */
+static inline int
+uint_ctype_negative(npy_uint a, npy_uint *out)
+{
+    *out = -a;
+    return (a != 0) ? NPY_FPE_OVERFLOW : 0;
+}
+
+#line 521
+/*
+ * Negation for npy_long.  NPY_MIN_LONG is stored unchanged and reported with
+ * NPY_FPE_OVERFLOW; every other value is negated and 0 is returned.
+ */
+static inline int
+long_ctype_negative(npy_long a, npy_long *out)
+{
+    int status = 0;
+    if (a != NPY_MIN_LONG) {
+        *out = -a;
+    }
+    else {
+        *out = a;
+        status = NPY_FPE_OVERFLOW;
+    }
+    return status;
+}
+
+#line 521
+/*
+ * Negation for npy_ulong.  *out always receives -a; the return value is 0
+ * only when a == 0 and NPY_FPE_OVERFLOW for any other input.
+ */
+static inline int
+ulong_ctype_negative(npy_ulong a, npy_ulong *out)
+{
+    *out = -a;
+    return (a != 0) ? NPY_FPE_OVERFLOW : 0;
+}
+
+#line 521
+/*
+ * Negation for npy_longlong.  NPY_MIN_LONGLONG is stored unchanged and
+ * reported with NPY_FPE_OVERFLOW; every other value is negated and 0 is
+ * returned.
+ */
+static inline int
+longlong_ctype_negative(npy_longlong a, npy_longlong *out)
+{
+    int status = 0;
+    if (a != NPY_MIN_LONGLONG) {
+        *out = -a;
+    }
+    else {
+        *out = a;
+        status = NPY_FPE_OVERFLOW;
+    }
+    return status;
+}
+
+#line 521
+/*
+ * Negation for npy_ulonglong.  *out always receives -a; the return value is
+ * 0 only when a == 0 and NPY_FPE_OVERFLOW for any other input.
+ */
+static inline int
+ulonglong_ctype_negative(npy_ulonglong a, npy_ulonglong *out)
+{
+    *out = -a;
+    return (a != 0) ? NPY_FPE_OVERFLOW : 0;
+}
+
+#line 521
+/* Negation for npy_float: stores -a in *out and always returns 0. */
+static inline int
+float_ctype_negative(npy_float a, npy_float *out)
+{
+    const npy_float negated = -a;
+    *out = negated;
+    return 0;
+}
+
+#line 521
+/* Negation for npy_double: stores -a in *out and always returns 0. */
+static inline int
+double_ctype_negative(npy_double a, npy_double *out)
+{
+    const npy_double negated = -a;
+    *out = negated;
+    return 0;
+}
+
+#line 521
+/* Negation for npy_longdouble: stores -a in *out and always returns 0. */
+static inline int
+longdouble_ctype_negative(npy_longdouble a, npy_longdouble *out)
+{
+    const npy_longdouble negated = -a;
+    *out = negated;
+    return 0;
+}
+
+
+/* Negation for npy_half: XORs the value with 0x8000u; always returns 0. */
+static inline int
+half_ctype_negative(npy_half a, npy_half *out)
+{
+    const unsigned int flip_mask = 0x8000u;
+    *out = a ^ flip_mask;
+    return 0;
+}
+
+
+#line 556
+/* Negates both components of a complex value; always returns 0. */
+static inline int
+cfloat_ctype_negative(npy_cfloat a, npy_cfloat *out)
+{
+    npy_cfloat negated;
+    negated.real = -a.real;
+    negated.imag = -a.imag;
+    *out = negated;
+    return 0;
+}
+
+#line 556
+/* Negates both components of a complex value; always returns 0. */
+static inline int
+cdouble_ctype_negative(npy_cdouble a, npy_cdouble *out)
+{
+    npy_cdouble negated;
+    negated.real = -a.real;
+    negated.imag = -a.imag;
+    *out = negated;
+    return 0;
+}
+
+#line 556
+/* Negates both components of a complex value; always returns 0. */
+static inline int
+clongdouble_ctype_negative(npy_clongdouble a, npy_clongdouble *out)
+{
+    npy_clongdouble negated;
+    negated.real = -a.real;
+    negated.imag = -a.imag;
+    *out = negated;
+    return 0;
+}
+
+
+#line 573
+/* Unary plus: copies a to *out unchanged and always returns 0. */
+static inline int
+byte_ctype_positive(npy_byte a, npy_byte *out)
+{
+    out[0] = a;
+    return 0;
+}
+
+#line 573
+/* Unary plus: copies a to *out unchanged and always returns 0. */
+static inline int
+ubyte_ctype_positive(npy_ubyte a, npy_ubyte *out)
+{
+    out[0] = a;
+    return 0;
+}
+
+#line 573
+/* Unary plus: copies a to *out unchanged and always returns 0. */
+static inline int
+short_ctype_positive(npy_short a, npy_short *out)
+{
+    out[0] = a;
+    return 0;
+}
+
+#line 573
+/* Unary plus: copies a to *out unchanged and always returns 0. */
+static inline int
+ushort_ctype_positive(npy_ushort a, npy_ushort *out)
+{
+    out[0] = a;
+    return 0;
+}
+
+#line 573
+/* Unary plus: copies a to *out unchanged and always returns 0. */
+static inline int
+int_ctype_positive(npy_int a, npy_int *out)
+{
+    out[0] = a;
+    return 0;
+}
+
+#line 573
+/* Unary plus: copies a to *out unchanged and always returns 0. */
+static inline int
+uint_ctype_positive(npy_uint a, npy_uint *out)
+{
+    out[0] = a;
+    return 0;
+}
+
+#line 573
+/* Unary plus: copies a to *out unchanged and always returns 0. */
+static inline int
+long_ctype_positive(npy_long a, npy_long *out)
+{
+    out[0] = a;
+    return 0;
+}
+
+#line 573
+/* Unary plus: copies a to *out unchanged and always returns 0. */
+static inline int
+ulong_ctype_positive(npy_ulong a, npy_ulong *out)
+{
+    out[0] = a;
+    return 0;
+}
+
+#line 573
+/* Unary plus: copies a to *out unchanged and always returns 0. */
+static inline int
+longlong_ctype_positive(npy_longlong a, npy_longlong *out)
+{
+    out[0] = a;
+    return 0;
+}
+
+#line 573
+/* Unary plus: copies a to *out unchanged and always returns 0. */
+static inline int
+ulonglong_ctype_positive(npy_ulonglong a, npy_ulonglong *out)
+{
+    out[0] = a;
+    return 0;
+}
+
+#line 573
+/* Unary plus: copies a to *out unchanged and always returns 0. */
+static inline int
+half_ctype_positive(npy_half a, npy_half *out)
+{
+    out[0] = a;
+    return 0;
+}
+
+#line 573
+/* Unary plus: copies a to *out unchanged and always returns 0. */
+static inline int
+float_ctype_positive(npy_float a, npy_float *out)
+{
+    out[0] = a;
+    return 0;
+}
+
+#line 573
+/* Unary plus: copies a to *out unchanged and always returns 0. */
+static inline int
+double_ctype_positive(npy_double a, npy_double *out)
+{
+    out[0] = a;
+    return 0;
+}
+
+#line 573
+/* Unary plus: copies a to *out unchanged and always returns 0. */
+static inline int
+longdouble_ctype_positive(npy_longdouble a, npy_longdouble *out)
+{
+    out[0] = a;
+    return 0;
+}
+
+
+#line 586
+/* Unary plus for complex values: copies both components; returns 0. */
+static inline int
+cfloat_ctype_positive(npy_cfloat a, npy_cfloat *out)
+{
+    out->imag = a.imag;
+    out->real = a.real;
+    return 0;
+}
+
+/* Complex power: stores npy_cpowf(a, b) in *out and always returns 0. */
+static inline int
+cfloat_ctype_power(npy_cfloat a, npy_cfloat b, npy_cfloat *out)
+{
+    const npy_cfloat raised = npy_cpowf(a, b);
+    *out = raised;
+    return 0;
+}
+
+#line 586
+/* Unary plus for complex values: copies both components; returns 0. */
+static inline int
+cdouble_ctype_positive(npy_cdouble a, npy_cdouble *out)
+{
+    out->imag = a.imag;
+    out->real = a.real;
+    return 0;
+}
+
+/* Complex power: stores npy_cpow(a, b) in *out and always returns 0. */
+static inline int
+cdouble_ctype_power(npy_cdouble a, npy_cdouble b, npy_cdouble *out)
+{
+    const npy_cdouble raised = npy_cpow(a, b);
+    *out = raised;
+    return 0;
+}
+
+#line 586
+/* Unary plus for complex values: copies both components; returns 0. */
+static inline int
+clongdouble_ctype_positive(npy_clongdouble a, npy_clongdouble *out)
+{
+    out->imag = a.imag;
+    out->real = a.real;
+    return 0;
+}
+
+/* Complex power: stores npy_cpowl(a, b) in *out and always returns 0. */
+static inline int
+clongdouble_ctype_power(npy_clongdouble a, npy_clongdouble b, npy_clongdouble *out)
+{
+    const npy_clongdouble raised = npy_cpowl(a, b);
+    *out = raised;
+    return 0;
+}
+
+
+
+#line 606
+
+#define ubyte_ctype_absolute ubyte_ctype_positive
+
+
+#line 606
+
+#define ushort_ctype_absolute ushort_ctype_positive
+
+
+#line 606
+
+#define uint_ctype_absolute uint_ctype_positive
+
+
+#line 606
+
+#define ulong_ctype_absolute ulong_ctype_positive
+
+
+#line 606
+
+#define ulonglong_ctype_absolute ulonglong_ctype_positive
+
+
+
+
+#line 617
+/*
+ * Absolute value for npy_byte.  NPY_MIN_BYTE is stored unchanged and reported
+ * with NPY_FPE_OVERFLOW; otherwise negative inputs are negated, non-negative
+ * inputs are copied, and 0 is returned.
+ */
+static inline int
+byte_ctype_absolute(npy_byte a, npy_byte *out)
+{
+    int status = 0;
+    if (a == NPY_MIN_BYTE) {
+        *out = a;
+        status = NPY_FPE_OVERFLOW;
+    }
+    else if (a < 0) {
+        *out = -a;
+    }
+    else {
+        *out = a;
+    }
+    return status;
+}
+
+#line 617
+/*
+ * Absolute value for npy_short.  NPY_MIN_SHORT is stored unchanged and
+ * reported with NPY_FPE_OVERFLOW; otherwise negative inputs are negated,
+ * non-negative inputs are copied, and 0 is returned.
+ */
+static inline int
+short_ctype_absolute(npy_short a, npy_short *out)
+{
+    int status = 0;
+    if (a == NPY_MIN_SHORT) {
+        *out = a;
+        status = NPY_FPE_OVERFLOW;
+    }
+    else if (a < 0) {
+        *out = -a;
+    }
+    else {
+        *out = a;
+    }
+    return status;
+}
+
+#line 617
+/*
+ * Absolute value for npy_int.  NPY_MIN_INT is stored unchanged and reported
+ * with NPY_FPE_OVERFLOW; otherwise negative inputs are negated, non-negative
+ * inputs are copied, and 0 is returned.
+ */
+static inline int
+int_ctype_absolute(npy_int a, npy_int *out)
+{
+    int status = 0;
+    if (a == NPY_MIN_INT) {
+        *out = a;
+        status = NPY_FPE_OVERFLOW;
+    }
+    else if (a < 0) {
+        *out = -a;
+    }
+    else {
+        *out = a;
+    }
+    return status;
+}
+
+#line 617
+/*
+ * Absolute value for npy_long.  NPY_MIN_LONG is stored unchanged and reported
+ * with NPY_FPE_OVERFLOW; otherwise negative inputs are negated, non-negative
+ * inputs are copied, and 0 is returned.
+ */
+static inline int
+long_ctype_absolute(npy_long a, npy_long *out)
+{
+    int status = 0;
+    if (a == NPY_MIN_LONG) {
+        *out = a;
+        status = NPY_FPE_OVERFLOW;
+    }
+    else if (a < 0) {
+        *out = -a;
+    }
+    else {
+        *out = a;
+    }
+    return status;
+}
+
+#line 617
+/*
+ * Absolute value for npy_longlong.  NPY_MIN_LONGLONG is stored unchanged and
+ * reported with NPY_FPE_OVERFLOW; otherwise negative inputs are negated,
+ * non-negative inputs are copied, and 0 is returned.
+ */
+static inline int
+longlong_ctype_absolute(npy_longlong a, npy_longlong *out)
+{
+    int status = 0;
+    if (a == NPY_MIN_LONGLONG) {
+        *out = a;
+        status = NPY_FPE_OVERFLOW;
+    }
+    else if (a < 0) {
+        *out = -a;
+    }
+    else {
+        *out = a;
+    }
+    return status;
+}
+
+
+#line 634
+/* Stores npy_fabsf(a) in *out and always returns 0. */
+static inline int
+float_ctype_absolute(npy_float a, npy_float *out)
+{
+    const npy_float magnitude = npy_fabsf(a);
+    *out = magnitude;
+    return 0;
+}
+
+#line 634
+/* Stores npy_fabs(a) in *out and always returns 0. */
+static inline int
+double_ctype_absolute(npy_double a, npy_double *out)
+{
+    const npy_double magnitude = npy_fabs(a);
+    *out = magnitude;
+    return 0;
+}
+
+#line 634
+/* Stores npy_fabsl(a) in *out and always returns 0. */
+static inline int
+longdouble_ctype_absolute(npy_longdouble a, npy_longdouble *out)
+{
+    const npy_longdouble magnitude = npy_fabsl(a);
+    *out = magnitude;
+    return 0;
+}
+
+
+/* Absolute value for npy_half: ANDs the value with 0x7fffu; returns 0. */
+static inline int
+half_ctype_absolute(npy_half a, npy_half *out)
+{
+    const unsigned int keep_mask = 0x7fffu;
+    *out = a & keep_mask;
+    return 0;
+}
+
+#line 655
+/* Stores npy_cabsf(a) in the npy_float *out and always returns 0. */
+static inline int
+cfloat_ctype_absolute(npy_cfloat a, npy_float *out)
+{
+    const npy_float magnitude = npy_cabsf(a);
+    *out = magnitude;
+    return 0;
+}
+
+#line 655
+/* Stores npy_cabs(a) in the npy_double *out and always returns 0. */
+static inline int
+cdouble_ctype_absolute(npy_cdouble a, npy_double *out)
+{
+    const npy_double magnitude = npy_cabs(a);
+    *out = magnitude;
+    return 0;
+}
+
+#line 655
+/* Stores npy_cabsl(a) in the npy_longdouble *out and always returns 0. */
+static inline int
+clongdouble_ctype_absolute(npy_clongdouble a, npy_longdouble *out)
+{
+    const npy_longdouble magnitude = npy_cabsl(a);
+    *out = magnitude;
+    return 0;
+}
+
+
+#line 667
+
+/* Bitwise NOT: stores ~a in *out and always returns 0. */
+static inline int
+byte_ctype_invert(npy_byte a, npy_byte *out)
+{
+    const npy_byte flipped = ~a;
+    *out = flipped;
+    return 0;
+}
+
+
+#line 667
+
+/* Bitwise NOT: stores ~a in *out and always returns 0. */
+static inline int
+ubyte_ctype_invert(npy_ubyte a, npy_ubyte *out)
+{
+    const npy_ubyte flipped = ~a;
+    *out = flipped;
+    return 0;
+}
+
+
+#line 667
+
+/* Bitwise NOT: stores ~a in *out and always returns 0. */
+static inline int
+short_ctype_invert(npy_short a, npy_short *out)
+{
+    const npy_short flipped = ~a;
+    *out = flipped;
+    return 0;
+}
+
+
+#line 667
+
+/* Bitwise NOT: stores ~a in *out and always returns 0. */
+static inline int
+ushort_ctype_invert(npy_ushort a, npy_ushort *out)
+{
+    const npy_ushort flipped = ~a;
+    *out = flipped;
+    return 0;
+}
+
+
+#line 667
+
+/* Bitwise NOT: stores ~a in *out and always returns 0. */
+static inline int
+int_ctype_invert(npy_int a, npy_int *out)
+{
+    const npy_int flipped = ~a;
+    *out = flipped;
+    return 0;
+}
+
+
+#line 667
+
+/* Bitwise NOT: stores ~a in *out and always returns 0. */
+static inline int
+uint_ctype_invert(npy_uint a, npy_uint *out)
+{
+    const npy_uint flipped = ~a;
+    *out = flipped;
+    return 0;
+}
+
+
+#line 667
+
+/* Bitwise NOT: stores ~a in *out and always returns 0. */
+static inline int
+long_ctype_invert(npy_long a, npy_long *out)
+{
+    const npy_long flipped = ~a;
+    *out = flipped;
+    return 0;
+}
+
+
+#line 667
+
+/* Bitwise NOT: stores ~a in *out and always returns 0. */
+static inline int
+ulong_ctype_invert(npy_ulong a, npy_ulong *out)
+{
+    const npy_ulong flipped = ~a;
+    *out = flipped;
+    return 0;
+}
+
+
+#line 667
+
+/* Bitwise NOT: stores ~a in *out and always returns 0. */
+static inline int
+longlong_ctype_invert(npy_longlong a, npy_longlong *out)
+{
+    const npy_longlong flipped = ~a;
+    *out = flipped;
+    return 0;
+}
+
+
+#line 667
+
+/* Bitwise NOT: stores ~a in *out and always returns 0. */
+static inline int
+ulonglong_ctype_invert(npy_ulonglong a, npy_ulonglong *out)
+{
+    const npy_ulonglong flipped = ~a;
+    *out = flipped;
+    return 0;
+}
+
+
+
+/*** END OF BASIC CODE **/
+
+
+/*
+ * How binary operators work
+ * -------------------------
+ *
+ * All binary (numeric) operators use the larger of the two types, with the
+ * exception of unsigned int and signed int mixed cases which must promote
+ * to a larger type.
+ *
+ * The strategy employed for all binary operations is that we coerce the other
+ * scalar if it is safe to do so.  E.g. for `float64 + float32` the `float64`
+ * can convert the `float32` and do the operation as `float64 + float64`.
+ * OTOH, for `float32 + float64` it is not safe, and we should defer to
+ * `float64`.
+ *
+ * So we have multiple possible paths:
+ * - The other scalar is a subclass.  In principle *both* inputs could be
+ *   different subclasses.  In this case it would make sense to defer, but
+ *   Python's `int` does not try this either, so we do not here:
+ *
+ *      class A(int): pass
+ *      class B(int):
+ *          def __add__(self, other): return "b"
+ *          __radd__ = __add__
+ *
+ *      A(1) + B(1)  # return 2
+ *      B(1) + A(1)  # return "b"
+ *
+ * - The other scalar can be converted:  All is good, we do the operation
+ * - The other scalar cannot be converted, there are two possibilities:
+ *   - The reverse should work, so we return NotImplemented to defer.
+ *     (If self is a subclass, this will end up in the "unknown" path.)
+ *   - Neither works (e.g. `uint8 + int8`):  We currently use the array path.
+ * - The other object is unknown.  It could be either a scalar, an array,
+ *   or an array-like (including a list!).  Because NumPy scalars pretend to be
+ *   arrays we fall into the array fallback path here _normally_ (through
+ *   the generic scalar path).
+ *   First we check if we should defer, though.
+ *
+ * The last possibility is awkward and leads to very confusing situations.
+ * The problem is that usually we should defer (return NotImplemented)
+ * in that path.
+ * If the other object is a NumPy array (or array-like) it will know what to
+ * do.  If NumPy knows that it is a scalar (not generic `object`), then it
+ * would make sense to try and use the "array path" (i.e. deal with it
+ * using the ufunc machinery).
+ *
+ * But this overlooks two things that currently work:
+ *
+ * 1. `np.float64(3) * [1, 2, 3]`  happily returns an array result.
+ * 2. `np.int32(3) * decimal.Decimal(3)` works!  (see below)
+ *
+ * The first must work, because scalars pretend to be arrays.  Which means
+ * they inherit the greedy "convert the other object to an array" logic.
+ * This may be a questionable choice, but is fine.
+ * (As of now, it is not negotiable, since NumPy often converts 0-D arrays
+ * to scalars.)
+ *
+ * The second one is more confusing.  This works also by using the ufunc
+ * machinery (array path), but it works because:
+ *
+ *     np.add(np.int32(3), decimal.Decimal(3))
+ *
+ * Will convert the `int32` to an int32 array, and the decimal to an object
+ * array.  It then *casts* the `int32` array to an object array.
+ * The casting step CONVERTS the integer to a Python integer.  The ufunc object
+ * loop will then call back into Python scalar logic.
+ *
+ * The above would be recursive, if it was not for the conversion of the int32
+ * to a Python integer!
+ * This leads us to the EXCEEDINGLY IMPORTANT special case:
+ *
+ * WARNING: longdouble and clongdouble do NOT convert to a Python scalar
+ *          when cast to object.  Thus they MUST NEVER take the array-path.
+ *          However, they STILL should defer at least for
+ *          `np.longdouble(3) + array`.
+ *
+ *
+ * As a general note, in the above we defer exactly when we know that deferring
+ * will work.  `longdouble` uses the "simple" logic of generally deferring
+ * though, because it would otherwise easily run into an infinite recursion.
+ *
+ *
+ * The future?!
+ * ------------
+ *
+ * This is very tricky and it would be nice to formalize away the "recursive"
+ * path we currently use.  I (seberg) currently have no great idea for this;
+ * what follows is just brainstorming!
+ *
+ * If both are scalars (known to NumPy), they have a DType and we may be able
+ * to do the ufunc promotion to make sure there is no risk of recursion.
+ *
+ * In principle always deferring would probably be clean, but it is unclear
+ * that we can do that.  A further complication is that we want to allow adding
+ * a DType for an existing Python scalar type (which will not know about NumPy
+ * scalars).
+ * The DType/ufunc machinery teaches NumPy how arrays will work with that
+ * Python scalar, but the DType may need to help us decide whether we should
+ * defer (return NotImplemented) or try using the ufunc machinery (or a
+ * simplified ufunc-like machinery limited to scalars).
+ */
+
+
+/*
+ * Enum used to describe the space of possibilities when converting the second
+ * argument to a binary operation.
+ * Any of these results may be accompanied by the `may_need_deferring` output
+ * flag, which indicates that the other operand is some type of object that
+ * may e.g. define an `__array_priority__`.
+ */
+typedef enum {
+    /* An error occurred (should not really happen/be possible) */
+    CONVERSION_ERROR = -1,
+    /* A known NumPy scalar, but of higher precision: we defer */
+    DEFER_TO_OTHER_KNOWN_SCALAR,
+    /*
+     * Conversion was successful (known scalar of less precision).  Note that
+     * the other value may still be a subclass of such a scalar so even here
+     * we may have to check for deferring.
+     * More specialized subclass handling, which defers based on whether the
+     * subclass has an implementation, is plausible but complicated.
+     * We do not do it, as even CPython does not do it for the builtin `int`.
+     */
+    CONVERSION_SUCCESS,
+    /*
+     * We use the normal conversion (setitem) function when coercing from
+     * Python scalars.
+     */
+    CONVERT_PYSCALAR,
+    /*
+     * Other object is an unknown scalar or array-like, we (typically) use
+     * the generic path, which normally ends up in the ufunc machinery.
+     */
+    OTHER_IS_UNKNOWN_OBJECT,
+    /*
+     * Promotion necessary
+     */
+    PROMOTION_REQUIRED,
+} conversion_result;
+
+#line 837
+
+#define IS_BYTE 1
+
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]
+
+/*
+ * TODO: This whole thing is awkward, and we should create a helper header to
+ *       define inline functions that convert single elements for all numeric
+ *       types.  That could then also be used to define all cast loops.
+ *       (Even if that may get more complex for SIMD at some point.)
+ *       Until then, half casts go via `float` and could be optimized.
+ */
+
+#if defined(IS_HALF)
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+    #define CONVERT_TO_RESULT(value)  \
+        result->real = value;  \
+        result->imag = 0
+#else
+    #define CONVERT_TO_RESULT(value) *result = value
+#endif
+
+
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_BYTE)) {  \
+            CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_BYTE, NPY_##OTHER)) {  \
+            /*
+             * If self can cast safely to other, this is clear:
+             * we should definitely defer.
+             */  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* Neither direction is a safe cast: we must promote */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+/*
+ * Complex to complex (and rejecting complex to real) is a bit different:
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_BYTE)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_BYTE, NPY_##OTHER)) {  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Getting a complex value to real is never safe, so no value is extracted: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_BYTE, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Convert the value to our own type and store the result.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result The result value (output)
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.
+ * @result The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_byte(PyObject *value, npy_byte *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyByteArrType_Type) {
+        *result = PyArrayScalar_VAL(value, Byte);
+        return CONVERSION_SUCCESS;
+    }
+    /* Not the exact type (handled above): a subclass of the same scalar. */
+    if (PyArray_IsScalar(value, Byte)) {
+        *result = PyArrayScalar_VAL(value, Byte);
+        /*
+         * In principle special, asymmetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types bool, float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /* A NumPy double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_BYTE)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_BYTE) && !PyTypeNum_ISCOMPLEX(NPY_BYTE)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_BYTE)) {
+            /*
+             * long -> (c)longdouble is safe, so huge integers reach the overflow
+             * check below (`OTHER_IS_UNKNOWN_OBJECT`, or `CONVERT_PYSCALAR` if weak).
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /* A NumPy complex double is a complex subclass, but special. */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_BYTE)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used only when self is complex: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_BYTE)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * If we have a scalar that is not listed or not safe, we defer to it.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by every case below (macros, NPY_HALF, and default) */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_BYTE)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_BYTE, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);
+    return ret;
+}
+
+#undef IS_SAFE
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_BYTE
+
+
+#line 837
+
+#define IS_UBYTE 1
+
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]
+
+/*
+ * TODO: This whole thing is awkward, and we should create a helper header to
+ *       define inline functions that convert single elements for all numeric
+ *       types.  That could then also be used to define all cast loops.
+ *       (Even if that may get more complex for SIMD at some point.)
+ *       Until then, half casts go via `float` and could be optimized.
+ */
+
+#if defined(IS_HALF)
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+    #define CONVERT_TO_RESULT(value)  \
+        result->real = value;  \
+        result->imag = 0
+#else
+    #define CONVERT_TO_RESULT(value) *result = value
+#endif
+
+
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_UBYTE)) {  \
+            CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_UBYTE, NPY_##OTHER)) {  \
+            /*
+             * If self can cast safely to other, this is clear:
+             * we should definitely defer.
+             */  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* Neither direction is a safe cast: we must promote */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+/*
+ * Complex to complex (and rejecting complex to real) is a bit different:
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_UBYTE)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_UBYTE, NPY_##OTHER)) {  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Getting a complex value to real is never safe, so no value is extracted: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_UBYTE, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Convert the value to our own type and store the result.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result The result value (output)
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.
+ * @result The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_ubyte(PyObject *value, npy_ubyte *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyUByteArrType_Type) {
+        *result = PyArrayScalar_VAL(value, UByte);
+        return CONVERSION_SUCCESS;
+    }
+    /* Not the exact type (handled above): a subclass of the same scalar. */
+    if (PyArray_IsScalar(value, UByte)) {
+        *result = PyArrayScalar_VAL(value, UByte);
+        /*
+         * In principle special, asymmetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types bool, float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /* A NumPy double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_UBYTE)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_UBYTE) && !PyTypeNum_ISCOMPLEX(NPY_UBYTE)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_UBYTE)) {
+            /*
+             * long -> (c)longdouble is safe, so huge integers reach the overflow
+             * check below (`OTHER_IS_UNKNOWN_OBJECT`, or `CONVERT_PYSCALAR` if weak).
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /* A NumPy complex double is a complex subclass, but special. */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_UBYTE)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used only when self is complex: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_UBYTE)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * If we have a scalar that is not listed or not safe, we defer to it.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by every case below (macros, NPY_HALF, and default) */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_UBYTE)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_UBYTE, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);
+    return ret;
+}
+
+#undef IS_SAFE
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_UBYTE
+
+
+#line 837
+
+#define IS_SHORT 1
+
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]
+
+/*
+ * TODO: This whole thing is awkward, and we should create a helper header to
+ *       define inline functions that convert single elements for all numeric
+ *       types.  That could then also be used to define all cast loops.
+ *       (Even if that may get more complex for SIMD at some point.)
+ *       Until then, half casts go via `float` and could be optimized.
+ */
+
+#if defined(IS_HALF)
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+    #define CONVERT_TO_RESULT(value)  \
+        result->real = value;  \
+        result->imag = 0
+#else
+    #define CONVERT_TO_RESULT(value) *result = value
+#endif
+
+
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_SHORT)) {  \
+            CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_SHORT, NPY_##OTHER)) {  \
+            /*
+             * If self can cast safely to other, this is clear:
+             * we should definitely defer.
+             */  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* Neither direction is a safe cast: we must promote */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+/*
+ * Complex to complex (and rejecting complex to real) is a bit different:
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_SHORT)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_SHORT, NPY_##OTHER)) {  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Getting a complex value to real is never safe, so no value is extracted: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_SHORT, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Convert `value` to our own type (`npy_short`) and store it in `result`.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result Output location for the converted value
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.
+ * @result The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_short(PyObject *value, npy_short *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyShortArrType_Type) {
+        *result = PyArrayScalar_VAL(value, Short);
+        return CONVERSION_SUCCESS;
+    }
+    /* The exact type was handled above, so this matches subclasses only. */
+    if (PyArray_IsScalar(value, Short)) {
+        *result = PyArrayScalar_VAL(value, Short);
+        /*
+         * In principle special, asymmetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types bool, float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /* A NumPy double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_SHORT)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_SHORT) && !PyTypeNum_ISCOMPLEX(NPY_SHORT)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_SHORT)) {
+            /*
+             * long -> (c)longdouble is safe, so `OTHER_IS_UNKNOWN_OBJECT` will
+             * be returned below for huge integers.
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /* A NumPy complex double is a complex subclass, but special. */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_SHORT)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is only used when self is complex: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_SHORT)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * If we have a scalar that is not listed or not safe, we defer to it.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by every case of the switch below */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_SHORT)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_SHORT, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);
+    return ret;
+}
+
+#undef IS_SAFE
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_SHORT
+
+
+#line 837
+
+#define IS_USHORT 1
+
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]
+
+/*
+ * TODO: This whole thing is awkward, and we should create a helper header to
+ *       define inline functions that convert single elements for all numeric
+ *       types.  That could then also be used to define all cast loops.
+ *       (Even if that may get more complex for SIMD at some point.)
+ *       For now, half casts could be optimized because of that.
+ */
+
+#if defined(IS_HALF)
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+    #define CONVERT_TO_RESULT(value)  \
+        result->real = value;  \
+        result->imag = 0
+#else
+    #define CONVERT_TO_RESULT(value) *result = value
+#endif
+
+/* Emit the `case NPY_<OTHER>:` clause: extract if safe, else defer/promote. */
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_USHORT)) {  \
+            CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_USHORT, NPY_##OTHER)) {  \
+            /*
+             * If self can cast safely to other, this is clear:
+             * we should definitely defer.
+             */  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* Otherwise, we must promote */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+/*
+ * Complex to complex copies both parts when safe; complex to real is rejected:
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_USHORT)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_USHORT, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Getting a complex value to real is never safe: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_USHORT, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Convert `value` to our own type (`npy_ushort`) and store it in `result`.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result Output location for the converted value
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.
+ * @result The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_ushort(PyObject *value, npy_ushort *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyUShortArrType_Type) {
+        *result = PyArrayScalar_VAL(value, UShort);
+        return CONVERSION_SUCCESS;
+    }
+    /* The exact type was handled above, so this matches subclasses only. */
+    if (PyArray_IsScalar(value, UShort)) {
+        *result = PyArrayScalar_VAL(value, UShort);
+        /*
+         * In principle special, asymmetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types bool, float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /* A NumPy double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_USHORT)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_USHORT) && !PyTypeNum_ISCOMPLEX(NPY_USHORT)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_USHORT)) {
+            /*
+             * long -> (c)longdouble is safe, so `OTHER_IS_UNKNOWN_OBJECT` will
+             * be returned below for huge integers.
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /* A NumPy complex double is a complex subclass, but special. */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_USHORT)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is only used when self is complex: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_USHORT)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * If we have a scalar that is not listed or not safe, we defer to it.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by every case of the switch below */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_USHORT)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_USHORT, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);
+    return ret;
+}
+
+#undef IS_SAFE
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_USHORT
+
+
+#line 837
+
+#define IS_INT 1
+
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]
+
+/*
+ * TODO: This whole thing is awkward, and we should create a helper header to
+ *       define inline functions that convert single elements for all numeric
+ *       types.  That could then also be used to define all cast loops.
+ *       (Even if that may get more complex for SIMD at some point.)
+ *       For now, half casts could be optimized because of that.
+ */
+
+#if defined(IS_HALF)
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+    #define CONVERT_TO_RESULT(value)  \
+        result->real = value;  \
+        result->imag = 0
+#else
+    #define CONVERT_TO_RESULT(value) *result = value
+#endif
+
+/* Emit the `case NPY_<OTHER>:` clause: extract if safe, else defer/promote. */
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_INT)) {  \
+            CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_INT, NPY_##OTHER)) {  \
+            /*
+             * If self can cast safely to other, this is clear:
+             * we should definitely defer.
+             */  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* Otherwise, we must promote */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+/*
+ * Complex to complex copies both parts when safe; complex to real is rejected:
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_INT)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_INT, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Getting a complex value to real is never safe: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_INT, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Convert `value` to our own type (`npy_int`) and store it in `result`.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result Output location for the converted value
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.
+ * @result The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_int(PyObject *value, npy_int *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyIntArrType_Type) {
+        *result = PyArrayScalar_VAL(value, Int);
+        return CONVERSION_SUCCESS;
+    }
+    /* The exact type was handled above, so this matches subclasses only. */
+    if (PyArray_IsScalar(value, Int)) {
+        *result = PyArrayScalar_VAL(value, Int);
+        /*
+         * In principle special, asymmetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types bool, float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /* A NumPy double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_INT)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_INT) && !PyTypeNum_ISCOMPLEX(NPY_INT)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_INT)) {
+            /*
+             * long -> (c)longdouble is safe, so `OTHER_IS_UNKNOWN_OBJECT` will
+             * be returned below for huge integers.
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /* A NumPy complex double is a complex subclass, but special. */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_INT)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is only used when self is complex: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_INT)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * If we have a scalar that is not listed or not safe, we defer to it.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by every case of the switch below */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_INT)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_INT, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);
+    return ret;
+}
+
+#undef IS_SAFE
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_INT  /* the helper macros above are redefined per type below */
+
+
+#line 837
+
+#define IS_UINT 1  /* marks the current "self" type as npy_uint */
+
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]  /* lookup indexed [FROM][TO], used as a truth value */
+
+/*
+ * TODO: This whole thing is awkward, and we should create a helper header to
+ *       define inline functions that convert single elements for all numeric
+ *       types.  That could then also be used to define all cast loops.
+ *       (Even if that may get more complex for SIMD at some point.)
+ *       For now, half casts could be optimized because of that.
+ */
+
+#if defined(IS_HALF)  /* half self: store through a float -> half conversion */
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)  /* complex self: expands to TWO statements, use only inside braces */
+    #define CONVERT_TO_RESULT(value)  \
+        result->real = value;  \
+        result->imag = 0
+#else  /* any other self type: plain assignment into *result */
+    #define CONVERT_TO_RESULT(value) *result = value
+#endif
+
+
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  /* expands to one complete `case` of the type_num switch; sets `ret` */  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_UINT)) {  \
+            CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_UINT, NPY_##OTHER)) {  \
+            /*
+             * If self can cast safely to other, this is clear:
+             * we should definitely defer.
+             */  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* Otherwise, we must promote */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+/*
+ * Complex to complex (and rejecting complex to real) is a bit different:
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_UINT)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            ret = 1;  /* NOTE(review): literal 1, presumably CONVERSION_SUCCESS -- confirm against the enum */  \
+        }  \
+        else if (IS_SAFE(NPY_UINT, NPY_##OTHER)) {  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Getting a complex value to real is never safe: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_UINT, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Convert the value to our own type (`npy_uint`) and store the result.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result The result value (output)
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.
+ * @result The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_uint(PyObject *value, npy_uint *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyUIntArrType_Type) {  /* exact type: fast path */
+        *result = PyArrayScalar_VAL(value, UInt);
+        return CONVERSION_SUCCESS;
+    }
+    /* Identical scalar kind, but not the exact type: treat as a subclass. */
+    if (PyArray_IsScalar(value, UInt)) {
+        *result = PyArrayScalar_VAL(value, UInt);
+        /*
+         * In principle special, asymmetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /* A NumPy double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_UINT)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_UINT) && !PyTypeNum_ISCOMPLEX(NPY_UINT)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_UINT)) {
+            /*
+             * long -> (c)longdouble is safe, so `OTHER_IS_UNKNOWN_OBJECT` will
+             * be returned below for huge integers.
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /* A NumPy complex double is a complex subclass, but special. */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_UINT)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_UINT)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * If we have a scalar that is not listed or not safe, we defer to it.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by the GET_VALUE_OR_DEFER macro */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_UINT)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_UINT, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);  /* drop the descriptor reference on every switch outcome */
+    return ret;
+}
+
+#undef IS_SAFE
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_UINT  /* the helper macros above are redefined per type below */
+
+
+#line 837
+
+#define IS_LONG 1  /* marks the current "self" type as npy_long */
+
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]  /* lookup indexed [FROM][TO], used as a truth value */
+
+/*
+ * TODO: This whole thing is awkward, and we should create a helper header to
+ *       define inline functions that convert single elements for all numeric
+ *       types.  That could then also be used to define all cast loops.
+ *       (Even if that may get more complex for SIMD at some point.)
+ *       For now, half casts could be optimized because of that.
+ */
+
+#if defined(IS_HALF)  /* half self: store through a float -> half conversion */
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)  /* complex self: expands to TWO statements, use only inside braces */
+    #define CONVERT_TO_RESULT(value)  \
+        result->real = value;  \
+        result->imag = 0
+#else  /* any other self type: plain assignment into *result */
+    #define CONVERT_TO_RESULT(value) *result = value
+#endif
+
+
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  /* expands to one complete `case` of the type_num switch; sets `ret` */  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_LONG)) {  \
+            CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_LONG, NPY_##OTHER)) {  \
+            /*
+             * If self can cast safely to other, this is clear:
+             * we should definitely defer.
+             */  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* Otherwise, we must promote */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+/*
+ * Complex to complex (and rejecting complex to real) is a bit different:
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_LONG)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            ret = 1;  /* NOTE(review): literal 1, presumably CONVERSION_SUCCESS -- confirm against the enum */  \
+        }  \
+        else if (IS_SAFE(NPY_LONG, NPY_##OTHER)) {  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Getting a complex value to real is never safe: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_LONG, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Convert the value to our own type (`npy_long`) and store the result.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result The result value (output)
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.
+ * @result The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_long(PyObject *value, npy_long *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyLongArrType_Type) {  /* exact type: fast path */
+        *result = PyArrayScalar_VAL(value, Long);
+        return CONVERSION_SUCCESS;
+    }
+    /* Identical scalar kind, but not the exact type: treat as a subclass. */
+    if (PyArray_IsScalar(value, Long)) {
+        *result = PyArrayScalar_VAL(value, Long);
+        /*
+         * In principle special, asymmetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /* A NumPy double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_LONG)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_LONG) && !PyTypeNum_ISCOMPLEX(NPY_LONG)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_LONG)) {
+            /*
+             * long -> (c)longdouble is safe, so `OTHER_IS_UNKNOWN_OBJECT` will
+             * be returned below for huge integers.
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /* A NumPy complex double is a complex subclass, but special. */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_LONG)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_LONG)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * If we have a scalar that is not listed or not safe, we defer to it.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by the GET_VALUE_OR_DEFER macro */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_LONG)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_LONG, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);  /* drop the descriptor reference on every switch outcome */
+    return ret;
+}
+
+#undef IS_SAFE
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_LONG  /* the helper macros above are redefined per type below */
+
+
+#line 837
+
+#define IS_ULONG 1  /* marks the current "self" type as npy_ulong */
+
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]  /* lookup indexed [FROM][TO], used as a truth value */
+
+/*
+ * TODO: This whole thing is awkward, and we should create a helper header to
+ *       define inline functions that convert single elements for all numeric
+ *       types.  That could then also be used to define all cast loops.
+ *       (Even if that may get more complex for SIMD at some point.)
+ *       For now, half casts could be optimized because of that.
+ */
+
+#if defined(IS_HALF)  /* half self: store through a float -> half conversion */
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)  /* complex self: expands to TWO statements, use only inside braces */
+    #define CONVERT_TO_RESULT(value)  \
+        result->real = value;  \
+        result->imag = 0
+#else  /* any other self type: plain assignment into *result */
+    #define CONVERT_TO_RESULT(value) *result = value
+#endif
+
+
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  /* expands to one complete `case` of the type_num switch; sets `ret` */  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_ULONG)) {  \
+            CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_ULONG, NPY_##OTHER)) {  \
+            /*
+             * If self can cast safely to other, this is clear:
+             * we should definitely defer.
+             */  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* Otherwise, we must promote */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+/*
+ * Complex to complex (and rejecting complex to real) is a bit different:
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_ULONG)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            ret = 1;  /* NOTE(review): literal 1, presumably CONVERSION_SUCCESS -- confirm against the enum */  \
+        }  \
+        else if (IS_SAFE(NPY_ULONG, NPY_##OTHER)) {  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Getting a complex value to real is never safe: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_ULONG, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Convert the value to our own type (`npy_ulong`) and store the result.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result The result value (output)
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.
+ * @result The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_ulong(PyObject *value, npy_ulong *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyULongArrType_Type) {
+        *result = PyArrayScalar_VAL(value, ULong);
+        return CONVERSION_SUCCESS;
+    }
+    /* Optimize the identical scalar specifically. */
+    if (PyArray_IsScalar(value, ULong)) {
+        *result = PyArrayScalar_VAL(value, ULong);
+        /*
+         * In principle special, asymmetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /* A NumPy double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_ULONG)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_ULONG) && !PyTypeNum_ISCOMPLEX(NPY_ULONG)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_ULONG)) {
+            /*
+             * long -> (c)longdouble is safe, so `OTHER_IS_UNKNOWN_OBJECT` will
+             * be returned below for huge integers.
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /* A NumPy complex double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_ULONG)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_ULONG)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * If we have a scalar that is not listed or not safe, we defer to it.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by the GET_VALUE_OR_DEFER macro */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_ULONG)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_ULONG, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);
+    return ret;
+}
+
+#undef IS_SAFE
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_ULONG
+
+
+#line 837
+
+#define IS_LONGLONG 1
+
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]
+
+/*
+ * TODO: This whole thing is awkward, and we should create a helper header to
+ *       define inline functions that convert single elements for all numeric
+ *       types.  That could then also be used to define all cast loops.
+ *       (Even if that may get more complex for SIMD at some point.)
+ *       For now, half casts could be optimized because of that.
+ */
+
+#if defined(IS_HALF)
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+    #define CONVERT_TO_RESULT(value)  \
+        result->real = value;  \
+        result->imag = 0
+#else
+    #define CONVERT_TO_RESULT(value) *result = value
+#endif
+
+
+/* One switch `case`: convert when Other -> longlong is a safe cast. */
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (!IS_SAFE(NPY_##OTHER, NPY_LONGLONG)) {  \
+            /*
+             * Other does not fit into self.  When self fits into other we
+             * defer to it, when neither direction is safe we must promote.
+             */  \
+            if (IS_SAFE(NPY_LONGLONG, NPY_##OTHER)) {  \
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+                break;  \
+            }  \
+            ret = PROMOTION_REQUIRED;  \
+            break;  \
+        }  \
+        CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+        ret = CONVERSION_SUCCESS;  \
+        break;
+
+/*
+ * Complex to complex (and rejecting complex to real) is a bit different:
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+/* Self is complex: copy both components when the cast is safe. */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_LONGLONG)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_LONGLONG, NPY_##OTHER)) {  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Getting a complex value to real is never safe: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_LONGLONG, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Try to convert `value` to `npy_longlong` (the own type) and store it.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result Output; written only on the `CONVERSION_SUCCESS` paths.
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.
+ * @result The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_longlong(PyObject *value, npy_longlong *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;  /* released with Py_DECREF() after the switch */
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyLongLongArrType_Type) {  /* exact type: fast path */
+        *result = PyArrayScalar_VAL(value, LongLong);
+        return CONVERSION_SUCCESS;
+    }
+    /* Same scalar kind, but not the exact type (i.e. a subclass). */
+    if (PyArray_IsScalar(value, LongLong)) {
+        *result = PyArrayScalar_VAL(value, LongLong);
+        /*
+         * In principle special, asymmetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types bool, float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /* A NumPy double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_LONGLONG)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_LONGLONG) && !PyTypeNum_ISCOMPLEX(NPY_LONGLONG)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_LONGLONG)) {
+            /*
+             * long -> (c)longdouble is safe, so `OTHER_IS_UNKNOWN_OBJECT` will
+             * be returned below for huge integers.
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;  /* non-zero when the Python int does not fit a C long */
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /* A NumPy complex double is a complex subclass, but special. */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_LONGLONG)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_LONGLONG)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * If we have a scalar that is not listed or not safe, we defer to it.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by the GET_VALUE_OR_DEFER macro */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_LONGLONG)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_LONGLONG, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);  /* drop the descriptor obtained above */
+    return ret;
+}
+
+#undef IS_SAFE
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_LONGLONG
+
+
+#line 837
+
+#define IS_ULONGLONG 1
+
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]
+
+/*
+ * TODO: This whole thing is awkward, and we should create a helper header to
+ *       define inline functions that convert single elements for all numeric
+ *       types.  That could then also be used to define all cast loops.
+ *       (Even if that may get more complex for SIMD at some point.)
+ *       For now, half casts could be optimized because of that.
+ */
+
+#if defined(IS_HALF)
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+    #define CONVERT_TO_RESULT(value)  \
+        result->real = value;  \
+        result->imag = 0
+#else
+    #define CONVERT_TO_RESULT(value) *result = value
+#endif
+
+
+/* One switch `case`: convert when Other -> ulonglong is a safe cast. */
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (!IS_SAFE(NPY_##OTHER, NPY_ULONGLONG)) {  \
+            /*
+             * Other does not fit into self.  When self fits into other we
+             * defer to it, when neither direction is safe we must promote.
+             */  \
+            if (IS_SAFE(NPY_ULONGLONG, NPY_##OTHER)) {  \
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+                break;  \
+            }  \
+            ret = PROMOTION_REQUIRED;  \
+            break;  \
+        }  \
+        CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+        ret = CONVERSION_SUCCESS;  \
+        break;
+
+/*
+ * Complex to complex (and rejecting complex to real) is a bit different:
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+/* Self is complex: copy both components when the cast is safe. */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_ULONGLONG)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_ULONGLONG, NPY_##OTHER)) {  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Getting a complex value to real is never safe: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_ULONGLONG, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Try to convert `value` to `npy_ulonglong` (the own type) and store it.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result Output; written only on the `CONVERSION_SUCCESS` paths.
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.
+ * @result The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_ulonglong(PyObject *value, npy_ulonglong *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;  /* released with Py_DECREF() after the switch */
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyULongLongArrType_Type) {  /* exact type: fast path */
+        *result = PyArrayScalar_VAL(value, ULongLong);
+        return CONVERSION_SUCCESS;
+    }
+    /* Same scalar kind, but not the exact type (i.e. a subclass). */
+    if (PyArray_IsScalar(value, ULongLong)) {
+        *result = PyArrayScalar_VAL(value, ULongLong);
+        /*
+         * In principle special, asymmetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types bool, float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /* A NumPy double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_ULONGLONG)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_ULONGLONG) && !PyTypeNum_ISCOMPLEX(NPY_ULONGLONG)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_ULONGLONG)) {
+            /*
+             * long -> (c)longdouble is safe, so `OTHER_IS_UNKNOWN_OBJECT` will
+             * be returned below for huge integers.
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;  /* non-zero when the Python int does not fit a C long */
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /* A NumPy complex double is a complex subclass, but special. */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_ULONGLONG)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_ULONGLONG)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * If we have a scalar that is not listed or not safe, we defer to it.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by the GET_VALUE_OR_DEFER macro */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_ULONGLONG)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_ULONGLONG, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);  /* drop the descriptor obtained above */
+    return ret;
+}
+
+#undef IS_SAFE  /* per-type helpers; the next expansion redefines them */
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_ULONGLONG
+
+
+#line 837
+
+#define IS_HALF 1
+/* IS_SAFE(FROM, TO): safe-cast table lookup for a pair of type numbers. */
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]
+
+/*
+ * TODO: This whole thing is awkward, and we should create a helper header to
+ *       define inline functions that convert single elements for all numeric
+ *       types.  That could then also be used to define all cast loops.
+ *       (Even if that may get more complex for SIMD at some point.)
+ *       For now, half casts could be optimized because of that.
+ */
+/* CONVERT_TO_RESULT(value): store a real `value` through `result`. */
+#if defined(IS_HALF)
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+    #define CONVERT_TO_RESULT(value)  \
+        (result->real = (value),  \
+         result->imag = 0)
+#else
+    #define CONVERT_TO_RESULT(value) *result = (value)
+#endif
+
+/* Emits `case NPY_<OTHER>:`: take the value if safe, else defer or promote. */
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_HALF)) {  \
+            CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_HALF, NPY_##OTHER)) {  \
+            /*
+             * If self can cast safely to other, this is clear:
+             * we should definitely defer.
+             */  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* Otherwise, we must promote */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+/*
+ * Complex to complex (and rejecting complex to real) is a bit different:
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_HALF)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_HALF, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Getting a complex value to real is never safe: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_HALF, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Convert `value` to the own type (npy_half) and store it in `result`.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result The result value (output)
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.
+ * @result The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_half(PyObject *value, npy_half *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyHalfArrType_Type) {
+        *result = PyArrayScalar_VAL(value, Half);
+        return CONVERSION_SUCCESS;
+    }
+    /* Not the exact type above: a subclass still yields its value directly. */
+    if (PyArray_IsScalar(value, Half)) {
+        *result = PyArrayScalar_VAL(value, Half);
+        /*
+         * In principle special, asymmetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types bool, float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /* A NumPy double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_HALF)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_HALF) && !PyTypeNum_ISCOMPLEX(NPY_HALF)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_HALF)) {
+            /*
+             * long -> (c)longdouble is safe, so `OTHER_IS_UNKNOWN_OBJECT` will
+             * be returned below for huge integers.
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /* A NumPy complex double is a complex subclass, but special. */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_HALF)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* For a complex value, weak promotion needs self to be complex: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_HALF)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * A listed but unsafe scalar defers or promotes; others go generic.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by the GET_VALUE_OR_DEFER macro */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_HALF)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_HALF, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);
+    return ret;
+}
+
+#undef IS_SAFE  /* per-type helpers; the next expansion redefines them */
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_HALF
+
+
+#line 837
+
+#define IS_FLOAT 1
+/* IS_SAFE(FROM, TO): safe-cast table lookup for a pair of type numbers. */
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]
+
+/*
+ * TODO: This whole thing is awkward, and we should create a helper header to
+ *       define inline functions that convert single elements for all numeric
+ *       types.  That could then also be used to define all cast loops.
+ *       (Even if that may get more complex for SIMD at some point.)
+ *       For now, half casts could be optimized because of that.
+ */
+/* CONVERT_TO_RESULT(value): store a real `value` through `result`. */
+#if defined(IS_HALF)
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+    #define CONVERT_TO_RESULT(value)  \
+        (result->real = (value),  \
+         result->imag = 0)
+#else
+    #define CONVERT_TO_RESULT(value) *result = (value)
+#endif
+
+/* Emits `case NPY_<OTHER>:`: take the value if safe, else defer or promote. */
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_FLOAT)) {  \
+            CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_FLOAT, NPY_##OTHER)) {  \
+            /*
+             * If self can cast safely to other, this is clear:
+             * we should definitely defer.
+             */  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* Otherwise, we must promote */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+/*
+ * Complex to complex (and rejecting complex to real) is a bit different:
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_FLOAT)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_FLOAT, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Getting a complex value to real is never safe: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_FLOAT, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Convert `value` to the own type (npy_float) and store it in `result`.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result The result value (output)
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.
+ * @result The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_float(PyObject *value, npy_float *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyFloatArrType_Type) {
+        *result = PyArrayScalar_VAL(value, Float);
+        return CONVERSION_SUCCESS;
+    }
+    /* Not the exact type above: a subclass still yields its value directly. */
+    if (PyArray_IsScalar(value, Float)) {
+        *result = PyArrayScalar_VAL(value, Float);
+        /*
+         * In principle special, asymmetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types bool, float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /* A NumPy double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_FLOAT)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_FLOAT) && !PyTypeNum_ISCOMPLEX(NPY_FLOAT)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_FLOAT)) {
+            /*
+             * long -> (c)longdouble is safe, so `OTHER_IS_UNKNOWN_OBJECT` will
+             * be returned below for huge integers.
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /* A NumPy complex double is a complex subclass, but special. */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_FLOAT)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* For a complex value, weak promotion needs self to be complex: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_FLOAT)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * A listed but unsafe scalar defers or promotes; others go generic.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by the GET_VALUE_OR_DEFER macro */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_FLOAT)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_FLOAT, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);
+    return ret;
+}
+
+#undef IS_SAFE  /* per-type helpers; the next expansion redefines them */
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_FLOAT
+
+
+#line 837
+
+#define IS_DOUBLE 1
+/* IS_SAFE(FROM, TO): safe-cast table lookup for a pair of type numbers. */
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]
+
+/*
+ * TODO: This whole thing is awkward, and we should create a helper header to
+ *       define inline functions that convert single elements for all numeric
+ *       types.  That could then also be used to define all cast loops.
+ *       (Even if that may get more complex for SIMD at some point.)
+ *       For now, half casts could be optimized because of that.
+ */
+/* CONVERT_TO_RESULT(value): store a real `value` through `result`. */
+#if defined(IS_HALF)
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+    #define CONVERT_TO_RESULT(value)  \
+        (result->real = (value),  \
+         result->imag = 0)
+#else
+    #define CONVERT_TO_RESULT(value) *result = (value)
+#endif
+
+/* Emits `case NPY_<OTHER>:`: take the value if safe, else defer or promote. */
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_DOUBLE)) {  \
+            CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_DOUBLE, NPY_##OTHER)) {  \
+            /*
+             * If self can cast safely to other, this is clear:
+             * we should definitely defer.
+             */  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* Otherwise, we must promote */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+/*
+ * Complex to complex (and rejecting complex to real) is a bit different:
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_DOUBLE)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_DOUBLE, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Getting a complex value to real is never safe: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_DOUBLE, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Convert the value to the own type and and store the result.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result The result value (output)
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.
+ * @result The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_double(PyObject *value, npy_double *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyDoubleArrType_Type) {
+        *result = PyArrayScalar_VAL(value, Double);
+        return CONVERSION_SUCCESS;
+    }
+    /* Optimize the identical scalar specifically. */
+    if (PyArray_IsScalar(value, Double)) {
+        *result = PyArrayScalar_VAL(value, Double);
+        /*
+         * In principle special, assyemetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /* A NumPy double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_DOUBLE)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_DOUBLE) && !PyTypeNum_ISCOMPLEX(NPY_DOUBLE)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_DOUBLE)) {
+            /*
+             * long -> (c)longdouble is safe, so `OTHER_IS_UNKNOWN_OBJECT` will
+             * be returned below for huge integers.
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /* A NumPy complex double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_DOUBLE)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_DOUBLE)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * If we have a scalar that is not listed or not safe, we defer to it.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by the GET_VALUE_OR_DEFER macro */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_DOUBLE)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_DOUBLE, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);
+    return ret;
+}
+
+#undef IS_SAFE
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_DOUBLE
+
+
+#line 837
+
+#define IS_LONGDOUBLE 1
+
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]
+
+/*
+ * TODO: This whole thing is awkward, and we should create a helper header to
+ *       define inline functions that convert single elements for all numeric
+ *       types.  That could then also be used to define all cast loops.
+ *       (Even if that may get more complex for SIMD at some point.)
+ *       For now, half casts could be optimized because of that.
+ */
+
+#if defined(IS_HALF)
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+    #define CONVERT_TO_RESULT(value)  \
+        result->real = value;  \
+        result->imag = 0
+#else
+    #define CONVERT_TO_RESULT(value) *result = value
+#endif
+
+
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  /* expands to one `case` clause that sets `ret` */  \
+        if (IS_SAFE(NPY_##OTHER, NPY_LONGDOUBLE)) {  /* other -> self is safe */  \
+            CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_LONGDOUBLE, NPY_##OTHER)) {  \
+            /*
+             * If self can cast safely to other, this is clear:
+             * we should definitely defer.
+             * (The value is not extracted in this case.)
+             */  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* Otherwise (neither direction is a safe cast), we must promote */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+/*
+ * Complex to complex (and rejecting complex to real) is a bit different:
+ * `GET_CVALUE_OR_DEFER` expands to the `case` clause for a complex scalar
+ * and sets `ret`.  If self is complex, the real and imaginary parts are
+ * copied when the cast is safe.  If self is real, the value is never
+ * extracted: we either defer to the other scalar or require promotion.
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_LONGDOUBLE)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            /* Use the named result code, like GET_VALUE_OR_DEFER does */  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_LONGDOUBLE, NPY_##OTHER)) {  \
+            /* self -> other is safe: defer to the other scalar */  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* neither direction is safe: promotion is needed */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Getting a complex value to real is never safe: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_LONGDOUBLE, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Convert the value to the own type and store the result.
+ *
+ * The checks run in this order: the exact `npy_longdouble` scalar type, a
+ * subclass of it, Python bool, float, int and complex, and finally any other
+ * NumPy scalar, which is dispatched on `descr->type_num`.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result The result value (output)
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.  It is always reset to `NPY_FALSE` on entry.
+ * @result The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_longdouble(PyObject *value, npy_longdouble *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyLongDoubleArrType_Type) {  /* exact type match */
+        *result = PyArrayScalar_VAL(value, LongDouble);
+        return CONVERSION_SUCCESS;
+    }
+    /*
+     * Optimize the identical scalar specifically.  The exact type already
+     * returned above, so only subclasses are expected to reach this branch.
+     */
+    if (PyArray_IsScalar(value, LongDouble)) {
+        *result = PyArrayScalar_VAL(value, LongDouble);
+        /*
+         * In principle special, asymmetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     * bool is tested first (a Python bool would otherwise also pass
+     * `PyLong_Check`) and is always converted directly.
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /*
+             * A NumPy double is a float subclass, but special: it is
+             * handled as a NumPy scalar at `numpy_scalar` below.
+             */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_LONGDOUBLE)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_LONGDOUBLE) && !PyTypeNum_ISCOMPLEX(NPY_LONGDOUBLE)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_LONGDOUBLE)) {
+            /*
+             * long -> (c)longdouble is safe, so `OTHER_IS_UNKNOWN_OBJECT` will
+             * be returned below for huge integers (unless weak promotion is
+             * in use, in which case `CONVERT_PYSCALAR` is returned).
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* does not fit into a C long: handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /*
+             * A NumPy complex double is a complex subclass, but special: it
+             * is handled as a NumPy scalar at `numpy_scalar` below.
+             */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_LONGDOUBLE)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is only used when self is complex: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_LONGDOUBLE)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);  /* released at the end */
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * If we have a scalar that is not listed or not safe, we defer to it.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by the GET_VALUE_OR_DEFER macro */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats (half is spelled out since it goes via `npy_half_to_float`) */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_LONGDOUBLE)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_LONGDOUBLE, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);  /* every path through the switch ends up here */
+    return ret;
+}
+
+#undef IS_SAFE
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_LONGDOUBLE
+
+
+#line 837
+
+#define IS_CFLOAT 1
+
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]
+
+/*
+ * TODO: This whole thing is awkward, and we should create a helper header to
+ *       define inline functions that convert single elements for all numeric
+ *       types.  That could then also be used to define all cast loops.
+ *       (Even if that may get more complex for SIMD at some point.)
+ *       For now, half casts could be optimized because of that.
+ */
+
+#if defined(IS_HALF)
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+    #define CONVERT_TO_RESULT(value)  \
+        result->real = value;  \
+        result->imag = 0
+#else
+    #define CONVERT_TO_RESULT(value) *result = value
+#endif
+
+
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  /* expands to one `case` clause that sets `ret` */  \
+        if (IS_SAFE(NPY_##OTHER, NPY_CFLOAT)) {  /* other -> self is safe */  \
+            CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_CFLOAT, NPY_##OTHER)) {  \
+            /*
+             * If self can cast safely to other, this is clear:
+             * we should definitely defer.
+             * (The value is not extracted in this case.)
+             */  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* Otherwise (neither direction is a safe cast), we must promote */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+/*
+ * Complex to complex (and rejecting complex to real) is a bit different:
+ * `GET_CVALUE_OR_DEFER` expands to the `case` clause for a complex scalar
+ * and sets `ret`.  If self is complex, the real and imaginary parts are
+ * copied when the cast is safe.  If self is real, the value is never
+ * extracted: we either defer to the other scalar or require promotion.
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_CFLOAT)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            /* Use the named result code, like GET_VALUE_OR_DEFER does */  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_CFLOAT, NPY_##OTHER)) {  \
+            /* self -> other is safe: defer to the other scalar */  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* neither direction is safe: promotion is needed */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Getting a complex value to real is never safe: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_CFLOAT, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Convert the value to the own type and store the result.
+ *
+ * The checks run in this order: the exact `npy_cfloat` scalar type, a
+ * subclass of it, Python bool, float, int and complex, and finally any other
+ * NumPy scalar, which is dispatched on `descr->type_num`.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result The result value (output)
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.  It is always reset to `NPY_FALSE` on entry.
+ * @result The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_cfloat(PyObject *value, npy_cfloat *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyCFloatArrType_Type) {  /* exact type match */
+        *result = PyArrayScalar_VAL(value, CFloat);
+        return CONVERSION_SUCCESS;
+    }
+    /*
+     * Optimize the identical scalar specifically.  The exact type already
+     * returned above, so only subclasses are expected to reach this branch.
+     */
+    if (PyArray_IsScalar(value, CFloat)) {
+        *result = PyArrayScalar_VAL(value, CFloat);
+        /*
+         * In principle special, asymmetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     * bool is tested first (a Python bool would otherwise also pass
+     * `PyLong_Check`) and is always converted directly.
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /*
+             * A NumPy double is a float subclass, but special: it is
+             * handled as a NumPy scalar at `numpy_scalar` below.
+             */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_CFLOAT)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_CFLOAT) && !PyTypeNum_ISCOMPLEX(NPY_CFLOAT)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_CFLOAT)) {
+            /*
+             * long -> (c)longdouble is safe, so `OTHER_IS_UNKNOWN_OBJECT` will
+             * be returned below for huge integers (unless weak promotion is
+             * in use, in which case `CONVERT_PYSCALAR` is returned).
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* does not fit into a C long: handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /*
+             * A NumPy complex double is a complex subclass, but special: it
+             * is handled as a NumPy scalar at `numpy_scalar` below.
+             */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_CFLOAT)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is only used when self is complex: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_CFLOAT)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);  /* released at the end */
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * If we have a scalar that is not listed or not safe, we defer to it.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by the GET_VALUE_OR_DEFER macro */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats (half is spelled out since it goes via `npy_half_to_float`) */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_CFLOAT)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_CFLOAT, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);  /* every path through the switch ends up here */
+    return ret;
+}
+
+#undef IS_SAFE
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_CFLOAT
+
+
+#line 837
+
+#define IS_CDOUBLE 1
+
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]
+
+/*
+ * CONVERT_TO_RESULT(value) stores `value` into `*result`.  The complex form
+ * expands to two statements (imag is zeroed), so only use it inside braces.
+ * TODO: This is awkward; a helper header with inline functions converting
+ *       single elements for all numeric types would be nicer and could also
+ *       define all cast loops (SIMD aside).  Half casts could be optimized.
+ */
+
+#if defined(IS_HALF)
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+    #define CONVERT_TO_RESULT(value)  \
+        result->real = value;  \
+        result->imag = 0
+#else
+    #define CONVERT_TO_RESULT(value) *result = value
+#endif
+
+
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_CDOUBLE)) {  \
+            CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_CDOUBLE, NPY_##OTHER)) {  \
+            /*
+             * Self casts safely to other (but not the reverse):
+             * let the other scalar handle the operation.
+             */  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* Neither cast is safe: promotion is needed */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+/*
+ * Complex `other`: copy real/imag to `result` if safe; `ret` gets the outcome.
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_CDOUBLE)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_CDOUBLE, NPY_##OTHER)) {  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Real self: a complex value is never extracted, only defer or promote: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_CDOUBLE, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Try to convert `value` to an `npy_cdouble` and store it in `*result`.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result Output; left untouched unless the value could be extracted.
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.
+ * @return The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_cdouble(PyObject *value, npy_cdouble *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyCDoubleArrType_Type) {
+        *result = PyArrayScalar_VAL(value, CDouble);
+        return CONVERSION_SUCCESS;
+    }
+    /* The exact type was handled above; this catches subclasses of it. */
+    if (PyArray_IsScalar(value, CDouble)) {
+        *result = PyArrayScalar_VAL(value, CDouble);
+        /*
+         * In principle special, asymmetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types bool, float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /* A NumPy double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_CDOUBLE)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_CDOUBLE) && !PyTypeNum_ISCOMPLEX(NPY_CDOUBLE)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_CDOUBLE)) {
+            /*
+             * long -> (c)longdouble is safe, so `OTHER_IS_UNKNOWN_OBJECT` will
+             * be returned below for huge integers.
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /* A NumPy complex double is a complex subclass, but special. */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_CDOUBLE)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion of a Python complex requires a complex self: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_CDOUBLE)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * If we have a scalar that is not listed or not safe, we defer to it.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by every case below (GET_(C)VALUE_OR_DEFER or default) */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_CDOUBLE)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_CDOUBLE, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);
+    return ret;
+}
+
+#undef IS_SAFE
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_CDOUBLE
+
+
+#line 837
+
+#define IS_CLONGDOUBLE 1
+
+#define IS_SAFE(FROM, TO) _npy_can_cast_safely_table[FROM][TO]
+
+/*
+ * CONVERT_TO_RESULT(value) stores `value` into `*result`.  The complex form
+ * expands to two statements (imag is zeroed), so only use it inside braces.
+ * TODO: This is awkward; a helper header with inline functions converting
+ *       single elements for all numeric types would be nicer and could also
+ *       define all cast loops (SIMD aside).  Half casts could be optimized.
+ */
+
+#if defined(IS_HALF)
+    #define CONVERT_TO_RESULT(value)  \
+        *result = npy_float_to_half((float)(value))
+#elif defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+    #define CONVERT_TO_RESULT(value)  \
+        result->real = value;  \
+        result->imag = 0
+#else
+    #define CONVERT_TO_RESULT(value) *result = value
+#endif
+
+
+#define GET_VALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_CLONGDOUBLE)) {  \
+            CONVERT_TO_RESULT(PyArrayScalar_VAL(value, Other));  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_CLONGDOUBLE, NPY_##OTHER)) {  \
+            /*
+             * Self casts safely to other (but not the reverse):
+             * let the other scalar handle the operation.
+             */  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            /* Neither cast is safe: promotion is needed */  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+/*
+ * Complex `other`: copy real/imag to `result` if safe; `ret` gets the outcome.
+ */
+
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_##OTHER, NPY_CLONGDOUBLE)) {  \
+            assert(Py_TYPE(value) == &Py##Other##ArrType_Type);  \
+            result->real = PyArrayScalar_VAL(value, Other).real;  \
+            result->imag = PyArrayScalar_VAL(value, Other).imag;  \
+            ret = CONVERSION_SUCCESS;  \
+        }  \
+        else if (IS_SAFE(NPY_CLONGDOUBLE, NPY_##OTHER)) {  \
+             ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#else
+
+/* Real self: a complex value is never extracted, only defer or promote: */
+#define GET_CVALUE_OR_DEFER(OTHER, Other, value)  \
+    case NPY_##OTHER:  \
+        if (IS_SAFE(NPY_CLONGDOUBLE, NPY_##OTHER)) {  \
+            ret = DEFER_TO_OTHER_KNOWN_SCALAR;  \
+        }  \
+        else {  \
+            ret = PROMOTION_REQUIRED;  \
+        }  \
+        break;
+
+#endif
+
+
+/**
+ * Try to convert `value` to an `npy_clongdouble` and store it in `*result`.
+ *
+ * @param value The value to convert (if compatible)
+ * @param result Output; left untouched unless the value could be extracted.
+ * @param may_need_deferring Set to `NPY_TRUE` when the caller must check
+ *        `BINOP_GIVE_UP_IF_NEEDED` (or similar) due to possible implementation
+ *        of `__array_priority__` (or similar).
+ *        This is set for unknown objects and all subclasses even when they
+ *        can be handled.
+ * @return The result value indicating what we did with `value` or what type
+ *         of object it is (see `conversion_result`).
+ */
+static inline conversion_result
+convert_to_clongdouble(PyObject *value, npy_clongdouble *result, npy_bool *may_need_deferring)
+{
+    PyArray_Descr *descr;
+    *may_need_deferring = NPY_FALSE;
+
+    if (Py_TYPE(value) == &PyCLongDoubleArrType_Type) {
+        *result = PyArrayScalar_VAL(value, CLongDouble);
+        return CONVERSION_SUCCESS;
+    }
+    /* The exact type was handled above; this catches subclasses of it. */
+    if (PyArray_IsScalar(value, CLongDouble)) {
+        *result = PyArrayScalar_VAL(value, CLongDouble);
+        /*
+         * In principle special, asymmetric, handling could be possible for
+         * explicit subclasses.
+         * In practice, we just check the normal deferring logic.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return CONVERSION_SUCCESS;
+    }
+
+    /*
+     * Then we check for the basic Python types bool, float, int, and complex.
+     * (this is a bit tedious to do right for complex).
+     */
+    if (PyBool_Check(value)) {
+        CONVERT_TO_RESULT(value == Py_True);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyFloat_Check(value)) {
+        if (!PyFloat_CheckExact(value)) {
+            /* A NumPy double is a float subclass, but special. */
+            if (PyArray_IsScalar(value, Double)) {
+                descr = PyArray_DescrFromType(NPY_DOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_DOUBLE, NPY_CLONGDOUBLE)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion is used when self is float or complex: */
+            if (!PyTypeNum_ISFLOAT(NPY_CLONGDOUBLE) && !PyTypeNum_ISCOMPLEX(NPY_CLONGDOUBLE)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        CONVERT_TO_RESULT(PyFloat_AS_DOUBLE(value));
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyLong_Check(value)) {
+        if (!PyLong_CheckExact(value)) {
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_LONG, NPY_CLONGDOUBLE)) {
+            /*
+             * long -> (c)longdouble is safe, so `OTHER_IS_UNKNOWN_OBJECT` will
+             * be returned below for huge integers.
+             */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        int overflow;
+        long val = PyLong_AsLongAndOverflow(value, &overflow);
+        if (overflow) {
+            /* handle as if "unsafe" */
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                return OTHER_IS_UNKNOWN_OBJECT;
+            }
+            return CONVERT_PYSCALAR;
+        }
+        if (error_converting(val)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        CONVERT_TO_RESULT(val);
+        return CONVERSION_SUCCESS;
+    }
+
+    if (PyComplex_Check(value)) {
+        if (!PyComplex_CheckExact(value)) {
+            /* A NumPy complex double is a complex subclass, but special. */
+            if (PyArray_IsScalar(value, CDouble)) {
+                descr = PyArray_DescrFromType(NPY_CDOUBLE);
+                goto numpy_scalar;
+            }
+            *may_need_deferring = NPY_TRUE;
+        }
+        if (!IS_SAFE(NPY_CDOUBLE, NPY_CLONGDOUBLE)) {
+            if (npy_promotion_state != NPY_USE_WEAK_PROMOTION) {
+                /* Legacy promotion and weak-and-warn not handled here */
+                return PROMOTION_REQUIRED;
+            }
+            /* Weak promotion of a Python complex requires a complex self: */
+            if (!PyTypeNum_ISCOMPLEX(NPY_CLONGDOUBLE)) {
+                return PROMOTION_REQUIRED;
+            }
+            return CONVERT_PYSCALAR;
+        }
+#if defined(IS_CFLOAT) || defined(IS_CDOUBLE) || defined(IS_CLONGDOUBLE)
+        Py_complex val = PyComplex_AsCComplex(value);
+        if (error_converting(val.real)) {
+            return CONVERSION_ERROR;  /* should not be possible */
+        }
+        result->real = val.real;
+        result->imag = val.imag;
+        return CONVERSION_SUCCESS;
+#else
+        /* unreachable, always unsafe cast above; return to avoid warning */
+        assert(0);
+        return OTHER_IS_UNKNOWN_OBJECT;
+#endif  /* defined(IS_CFLOAT) || ... */
+    }
+
+    /*
+     * (seberg) It would be nice to use `PyArray_DiscoverDTypeFromScalarType`
+     * from array coercion here.  OTOH, the array coercion code also falls
+     * back to this code.  The issue is around how subclasses should work...
+     *
+     * It would be nice to try to fully align the paths again (they effectively
+     * are equivalent).  Proper support for subclasses is in general tricky,
+     * and it would make more sense to just _refuse_ to support them.
+     * However, it is unclear that this is a viable option...
+     */
+    if (!PyArray_IsScalar(value, Generic)) {
+        /*
+         * The input is an unknown python object.  This should probably defer
+         * but only does so for float128.
+         * For all other cases, we defer to the array logic.  If the object
+         * is indeed not an array-like, this will end up converting the NumPy
+         * scalar to a Python scalar and then try again.
+         * The logic is that the ufunc casts the input to object, which does
+         * the conversion.
+         * If the object is an array, deferring will always kick in.
+         */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+    descr = PyArray_DescrFromScalar(value);
+    if (descr == NULL) {
+        if (PyErr_Occurred()) {
+            return CONVERSION_ERROR;
+        }
+        /* Should not happen, but may be possible with bad user subclasses */
+        *may_need_deferring = NPY_TRUE;
+        return OTHER_IS_UNKNOWN_OBJECT;
+    }
+
+  numpy_scalar:
+    if (descr->typeobj != Py_TYPE(value)) {
+        /*
+         * This is a subclass of a builtin type, we may continue normally,
+         * but should check whether we need to defer.
+         */
+        *may_need_deferring = NPY_TRUE;
+    }
+
+    /*
+     * Otherwise, we have a clear NumPy scalar, find if it is a compatible
+     * builtin scalar.
+     * Each `GET_VALUE_OR_DEFER` represents a case clause for its type number,
+     * extracting the value if it is safe and otherwise deferring.
+     * (Safety is known at compile time, so the switch statement should be
+     * simplified by the compiler accordingly.)
+     * If we have a scalar that is not listed or not safe, we defer to it.
+     *
+     * We should probably defer more aggressively, but that is too big a change,
+     * since it would disable `np.float64(1.) * [1, 2, 3, 4]`.
+     */
+    int ret;  /* set by every case below (GET_(C)VALUE_OR_DEFER or default) */
+    switch (descr->type_num) {
+        GET_VALUE_OR_DEFER(BOOL, Bool, value);
+        /* UInts */
+        GET_VALUE_OR_DEFER(UBYTE, UByte, value);
+        GET_VALUE_OR_DEFER(USHORT, UShort, value);
+        GET_VALUE_OR_DEFER(UINT, UInt, value);
+        GET_VALUE_OR_DEFER(ULONG, ULong, value);
+        GET_VALUE_OR_DEFER(ULONGLONG, ULongLong, value);
+        /* Ints */
+        GET_VALUE_OR_DEFER(BYTE, Byte, value);
+        GET_VALUE_OR_DEFER(SHORT, Short, value);
+        GET_VALUE_OR_DEFER(INT, Int, value);
+        GET_VALUE_OR_DEFER(LONG, Long, value);
+        GET_VALUE_OR_DEFER(LONGLONG, LongLong, value);
+        /* Floats */
+        case NPY_HALF:
+            if (IS_SAFE(NPY_HALF, NPY_CLONGDOUBLE)) {
+                CONVERT_TO_RESULT(npy_half_to_float(PyArrayScalar_VAL(value, Half)));
+                ret = CONVERSION_SUCCESS;
+            }
+            else if (IS_SAFE(NPY_CLONGDOUBLE, NPY_HALF)) {
+                ret = DEFER_TO_OTHER_KNOWN_SCALAR;
+            }
+            else {
+                ret = PROMOTION_REQUIRED;
+            }
+            break;
+        GET_VALUE_OR_DEFER(FLOAT, Float, value);
+        GET_VALUE_OR_DEFER(DOUBLE, Double, value);
+        GET_VALUE_OR_DEFER(LONGDOUBLE, LongDouble, value);
+        /* Complex: We should still defer, but the code won't work... */
+        GET_CVALUE_OR_DEFER(CFLOAT, CFloat, value);
+        GET_CVALUE_OR_DEFER(CDOUBLE, CDouble, value);
+        GET_CVALUE_OR_DEFER(CLONGDOUBLE, CLongDouble, value);
+        default:
+            /*
+             * If there is no match, this is an unknown scalar object.  It
+             * would make sense to defer generously here, but it should also
+             * always be safe to use the array path.
+             * The issue is, that the other scalar may or may not be designed
+             * to deal with NumPy scalars.  Without knowing that, we cannot
+             * defer (which would be much faster potentially).
+             * TODO: We could add a DType flag to allow opting in to deferring!
+             */
+            *may_need_deferring = NPY_TRUE;
+            ret = OTHER_IS_UNKNOWN_OBJECT;
+    }
+    Py_DECREF(descr);
+    return ret;
+}
+
+#undef IS_SAFE
+#undef CONVERT_TO_RESULT
+#undef GET_VALUE_OR_DEFER
+#undef GET_CVALUE_OR_DEFER
+#undef IS_CLONGDOUBLE
+
+
+
+
+#line 1217
+#define IS_byte
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+byte_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_byte arg1, arg2, other_val;
+
+    /*
+     * byte_add: binary add for Byte scalars (either operand may be "self").
+     * `is_forward` means `a` is the Byte scalar this method belongs to; it
+     * does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Byte);
+        assert(is_forward || PyArray_IsScalar(b, Byte));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_byte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, byte_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Deliberate fall through to the array path below, except for
+             * (c)longdouble where that could recurse infinitely.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:
+            if (BYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Byte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Byte);
+    }
+
+    /*
+     * Prepare the actual calculation (the `#if 0` regions are compiled out).
+     */
+    npy_byte out;
+#if 0
+    npy_byte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = byte_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = byte_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Byte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Byte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_byte
+
+
+#line 1217
+#define IS_ubyte
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+ubyte_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ubyte arg1, arg2, other_val;
+
+    /*
+     * Binary add for ubyte scalars.  `is_forward` is set when `a`, the first
+     * operand, is the one this method is implemented for (else it is `b`).
+     * Note that `is_forward` does not imply we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand is the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UByte);
+        assert(is_forward || PyArray_IsScalar(b, UByte));
+    }
+
+    /*
+     * Try to extract the other operand as a C value (if it is compatible).
+     * Otherwise `res` tells us how to proceed; see the switch below.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ubyte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, ubyte_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * For example, the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, so we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Deliberate fallthrough to the array path below, except for
+             * (c)longdouble where that can lead to infinite recursions.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:
+            if (UBYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UByte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UByte);
+    }
+
+    /*
+     * Result storage for the actual calculation.
+     */
+    npy_ubyte out;
+#if 0
+    npy_ubyte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (compiled out here).
+     */
+#if 0
+    int retstatus = ubyte_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ubyte_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UByte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UByte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_ubyte
+
+
+#line 1217
+#define IS_short
+/* OP_NAME: drop "true_" from "true_divide" for floating point warnings */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+short_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_short arg1, arg2, other_val;
+
+    /*
+     * Binary add for short scalars.  `is_forward` is set when `a`, the first
+     * operand, is the one this method is implemented for (else it is `b`).
+     * Note that `is_forward` does not imply we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand is the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Short);
+        assert(is_forward || PyArray_IsScalar(b, Short));
+    }
+
+    /*
+     * Try to extract the other operand as a C value (if it is compatible).
+     * Otherwise `res` tells us how to proceed; see the switch below.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_short(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, short_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * For example, the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, so we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Deliberate fallthrough to the array path below, except for
+             * (c)longdouble where that can lead to infinite recursions.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:
+            if (SHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Short);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Short);
+    }
+
+    /*
+     * Result storage for the actual calculation.
+     */
+    npy_short out;
+#if 0
+    npy_short out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (compiled out here).
+     */
+#if 0
+    int retstatus = short_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = short_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Short);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Short, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_short
+
+
+#line 1217
+#define IS_ushort
+/* OP_NAME: drop "true_" from "true_divide" for floating point warnings */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+ushort_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ushort arg1, arg2, other_val;
+
+    /*
+     * Binary add for ushort scalars.  `is_forward` is set when `a`, the first
+     * operand, is the one this method is implemented for (else it is `b`).
+     * Note that `is_forward` does not imply we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand is the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UShort);
+        assert(is_forward || PyArray_IsScalar(b, UShort));
+    }
+
+    /*
+     * Try to extract the other operand as a C value (if it is compatible).
+     * Otherwise `res` tells us how to proceed; see the switch below.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ushort(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, ushort_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * For example, the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, so we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Deliberate fallthrough to the array path below, except for
+             * (c)longdouble where that can lead to infinite recursions.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:
+            if (USHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UShort);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UShort);
+    }
+
+    /*
+     * Result storage for the actual calculation.
+     */
+    npy_ushort out;
+#if 0
+    npy_ushort out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (compiled out here).
+     */
+#if 0
+    int retstatus = ushort_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ushort_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UShort);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UShort, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_ushort
+
+
+#line 1217
+#define IS_int
+/* OP_NAME: drop "true_" from "true_divide" for floating point warnings */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+int_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_int arg1, arg2, other_val;
+
+    /*
+     * Binary add for int scalars.  `is_forward` is set when `a`, the first
+     * operand, is the one this method is implemented for (else it is `b`).
+     * Note that `is_forward` does not imply we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand is the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Int);
+        assert(is_forward || PyArray_IsScalar(b, Int));
+    }
+
+    /*
+     * Try to extract the other operand as a C value (if it is compatible).
+     * Otherwise `res` tells us how to proceed; see the switch below.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_int(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, int_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * For example, the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, so we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Deliberate fallthrough to the array path below, except for
+             * (c)longdouble where that can lead to infinite recursions.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:
+            if (INT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Int);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Int);
+    }
+
+    /*
+     * Result storage for the actual calculation.
+     */
+    npy_int out;
+#if 0
+    npy_int out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (compiled out here).
+     */
+#if 0
+    int retstatus = int_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = int_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Int);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Int, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_int
+
+
+#line 1217
+#define IS_uint
+/* OP_NAME: drop "true_" from "true_divide" for floating point warnings */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+uint_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_uint arg1, arg2, other_val;
+
+    /*
+     * Binary add for uint scalars.  `is_forward` is set when `a`, the first
+     * operand, is the one this method is implemented for (else it is `b`).
+     * Note that `is_forward` does not imply we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand is the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UInt);
+        assert(is_forward || PyArray_IsScalar(b, UInt));
+    }
+
+    /*
+     * Try to extract the other operand as a C value (if it is compatible).
+     * Otherwise `res` tells us how to proceed; see the switch below.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_uint(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, uint_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * For example, the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, so we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Deliberate fallthrough to the array path below, except for
+             * (c)longdouble where that can lead to infinite recursions.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:
+            if (UINT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UInt);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UInt);
+    }
+
+    /*
+     * Result storage for the actual calculation.
+     */
+    npy_uint out;
+#if 0
+    npy_uint out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (compiled out here).
+     */
+#if 0
+    int retstatus = uint_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = uint_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UInt);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UInt, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_uint
+
+
+#line 1217
+#define IS_long
+/* OP_NAME: drop "true_" from "true_divide" for floating point warnings */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+long_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_long arg1, arg2, other_val;
+
+    /*
+     * Binary add for long scalars.  `is_forward` is set when `a`, the first
+     * operand, is the one this method is implemented for (else it is `b`).
+     * Note that `is_forward` does not imply we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand is the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Long);
+        assert(is_forward || PyArray_IsScalar(b, Long));
+    }
+
+    /*
+     * Try to extract the other operand as a C value (if it is compatible).
+     * Otherwise `res` tells us how to proceed; see the switch below.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_long(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, long_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * For example, the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, so we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Deliberate fallthrough to the array path below, except for
+             * (c)longdouble where that can lead to infinite recursions.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:
+            if (LONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Long);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Long);
+    }
+
+    /*
+     * Result storage for the actual calculation.
+     */
+    npy_long out;
+#if 0
+    npy_long out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (compiled out here).
+     */
+#if 0
+    int retstatus = long_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = long_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Long);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Long, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_long
+
+
+#line 1217
+#define IS_ulong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+ulong_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulong arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULong);
+        assert(is_forward || PyArray_IsScalar(b, ULong));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, ulong_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:
+            if (ULONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULong);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ulong out;
+#if 0
+    npy_ulong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ulong_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulong_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(ULong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_ulong
+
+
+#line 1217
+#define IS_longlong
+/* OP_NAME labels floating point errors; "true_divide" becomes "divide": */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+longlong_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_longlong arg1, arg2, other_val;
+
+    /*
+     * Scalar `add` for longlong.  Check if this operation may be considered
+     * forward.  Note `is_forward` does not imply that we can defer to a
+     * subclass `b`.  It just means that the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand is the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongLong);
+        assert(is_forward || PyArray_IsScalar(b, LongLong));
+    }
+
+    /*
+     * Extract the other operand's value into `other_val` (if it is
+     * compatible).  Otherwise, `res` says how to deal with it; see the
+     * switch below.  This is somewhat complicated.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_longlong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* flag filled in by convert_to_longlong */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, longlong_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* otherwise: intentional fall through to PROMOTION_REQUIRED */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val via LONGLONG_setitem */
+            if (LONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* disabled: floating point status is not cleared here */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, LongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongLong);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_longlong out;
+#if 0  /* disabled: locals only used by the two-output (tuple) path */
+    npy_longlong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* disabled: two-output call signature */
+    int retstatus = longlong_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longlong_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 0  /* disabled: floating point status is not polled here */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* a negative return is treated as an error */
+        }
+    }
+
+
+#if 0  /* disabled: return a 2-tuple holding `out` and `out2` */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active: store `out` in a newly created LongLong scalar */
+    ret = PyArrayScalar_New(LongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_longlong
+
+
+#line 1217
+#define IS_ulonglong
+/* OP_NAME labels floating point errors; "true_divide" becomes "divide": */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+ulonglong_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulonglong arg1, arg2, other_val;
+
+    /*
+     * Scalar `add` for ulonglong.  Check if this operation may be considered
+     * forward.  Note `is_forward` does not imply that we can defer to a
+     * subclass `b`.  It just means that the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand is the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULongLong);
+        assert(is_forward || PyArray_IsScalar(b, ULongLong));
+    }
+
+    /*
+     * Extract the other operand's value into `other_val` (if it is
+     * compatible).  Otherwise, `res` says how to deal with it; see the
+     * switch below.  This is somewhat complicated.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulonglong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* flag filled in by convert_to_ulonglong */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, ulonglong_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* otherwise: intentional fall through to PROMOTION_REQUIRED */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val via ULONGLONG_setitem */
+            if (ULONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* disabled: floating point status is not cleared here */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULongLong);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ulonglong out;
+#if 0  /* disabled: locals only used by the two-output (tuple) path */
+    npy_ulonglong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* disabled: two-output call signature */
+    int retstatus = ulonglong_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulonglong_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 0  /* disabled: floating point status is not polled here */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* a negative return is treated as an error */
+        }
+    }
+
+
+#if 0  /* disabled: return a 2-tuple holding `out` and `out2` */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active: store `out` in a newly created ULongLong scalar */
+    ret = PyArrayScalar_New(ULongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_ulonglong
+
+
+#line 1217
+#define IS_byte
+/* OP_NAME labels floating point errors; "true_divide" becomes "divide": */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+byte_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_byte arg1, arg2, other_val;
+
+    /*
+     * Scalar `subtract` for byte.  Check if this operation may be considered
+     * forward.  Note `is_forward` does not imply that we can defer to a
+     * subclass `b`.  It just means that the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand is the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Byte);
+        assert(is_forward || PyArray_IsScalar(b, Byte));
+    }
+
+    /*
+     * Extract the other operand's value into `other_val` (if it is
+     * compatible).  Otherwise, `res` says how to deal with it; see the
+     * switch below.  This is somewhat complicated.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_byte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* flag filled in by convert_to_byte */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, byte_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* otherwise: intentional fall through to PROMOTION_REQUIRED */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val via BYTE_setitem */
+            if (BYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* disabled: floating point status is not cleared here */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Byte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Byte);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_byte out;
+#if 0  /* disabled: locals only used by the two-output (tuple) path */
+    npy_byte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* disabled: two-output call signature */
+    int retstatus = byte_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = byte_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 0  /* disabled: floating point status is not polled here */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* a negative return is treated as an error */
+        }
+    }
+
+
+#if 0  /* disabled: return a 2-tuple holding `out` and `out2` */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active: store `out` in a newly created Byte scalar */
+    ret = PyArrayScalar_New(Byte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Byte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_byte
+
+
+#line 1217
+#define IS_ubyte
+/* OP_NAME labels floating point errors; "true_divide" becomes "divide": */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+ubyte_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ubyte arg1, arg2, other_val;
+
+    /*
+     * Scalar `subtract` for ubyte.  Check if this operation may be considered
+     * forward.  Note `is_forward` does not imply that we can defer to a
+     * subclass `b`.  It just means that the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand is the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UByte);
+        assert(is_forward || PyArray_IsScalar(b, UByte));
+    }
+
+    /*
+     * Extract the other operand's value into `other_val` (if it is
+     * compatible).  Otherwise, `res` says how to deal with it; see the
+     * switch below.  This is somewhat complicated.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ubyte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* flag filled in by convert_to_ubyte */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, ubyte_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* otherwise: intentional fall through to PROMOTION_REQUIRED */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val via UBYTE_setitem */
+            if (UBYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* disabled: floating point status is not cleared here */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UByte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UByte);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ubyte out;
+#if 0  /* disabled: locals only used by the two-output (tuple) path */
+    npy_ubyte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* disabled: two-output call signature */
+    int retstatus = ubyte_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ubyte_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 0  /* disabled: floating point status is not polled here */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* a negative return is treated as an error */
+        }
+    }
+
+
+#if 0  /* disabled: return a 2-tuple holding `out` and `out2` */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active: store `out` in a newly created UByte scalar */
+    ret = PyArrayScalar_New(UByte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UByte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_ubyte
+
+
+#line 1217
+#define IS_short
+/* OP_NAME labels floating point errors; "true_divide" becomes "divide": */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+short_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_short arg1, arg2, other_val;
+
+    /*
+     * Scalar `subtract` for short.  Check if this operation may be considered
+     * forward.  Note `is_forward` does not imply that we can defer to a
+     * subclass `b`.  It just means that the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand is the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Short);
+        assert(is_forward || PyArray_IsScalar(b, Short));
+    }
+
+    /*
+     * Extract the other operand's value into `other_val` (if it is
+     * compatible).  Otherwise, `res` says how to deal with it; see the
+     * switch below.  This is somewhat complicated.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_short(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* flag filled in by convert_to_short */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, short_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* otherwise: intentional fall through to PROMOTION_REQUIRED */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val via SHORT_setitem */
+            if (SHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* disabled: floating point status is not cleared here */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Short);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Short);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_short out;
+#if 0  /* disabled: locals only used by the two-output (tuple) path */
+    npy_short out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* disabled: two-output call signature */
+    int retstatus = short_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = short_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 0  /* disabled: floating point status is not polled here */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* a negative return is treated as an error */
+        }
+    }
+
+
+#if 0  /* disabled: return a 2-tuple holding `out` and `out2` */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active: store `out` in a newly created Short scalar */
+    ret = PyArrayScalar_New(Short);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Short, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_short
+
+
+#line 1217
+#define IS_ushort
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+ushort_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ushort arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UShort);
+        assert(is_forward || PyArray_IsScalar(b, UShort));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ushort(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, ushort_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:
+            if (USHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UShort);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UShort);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ushort out;
+#if 0
+    npy_ushort out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ushort_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ushort_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UShort);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UShort, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_ushort
+
+
+#line 1217
+#define IS_int
+/*
+ * OP_NAME is the operation name spliced into the "scalar " OP_NAME string
+ * that is handed to PyUFunc_GiveFloatingpointErrors below.  The "true_" is
+ * dropped from "true_divide" for floating point warnings.  This expansion
+ * defines IS_subtract, so OP_NAME is "subtract" as long as IS_true_divide
+ * is not defined at this point.
+ */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+int_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_int arg1, arg2, other_val;
+
+    /*
+     * Binary subtraction `a - b` where at least one operand is expected to
+     * be an Int scalar (or a subclass); see the assert below.
+     *
+     * Returns a new Int scalar holding the result, NotImplemented when the
+     * operation is deferred, whatever the generic scalar nb_subtract returns
+     * when `other` cannot be used as an npy_int here, or NULL on failure
+     * (presumably with a Python exception set by the failing call).
+     *
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.  Exact type matches on `a` and
+     * then on `b` are tested first; the PyArray_IsScalar check is only used
+     * when neither matches exactly, i.e. when subclasses are involved.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Int);
+        assert(is_forward || PyArray_IsScalar(b, Int));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * convert_to_int reports through its return value how `other` has to
+     * be treated, receives `other_val` as the destination for the extracted
+     * value, and sets `may_need_deferring` to request the
+     * BINOP_GIVE_UP_IF_NEEDED check (presumably that macro returns
+     * NotImplemented from this function when the other operand should
+     * handle the operation; confirm against the macro definition).
+     * For CONVERT_PYSCALAR the value is filled in afterwards through
+     * INT_setitem.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_int(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, int_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * The preamble of this expansion defines neither IS_longdouble
+             * nor IS_clongdouble, so (assuming nothing else defines them)
+             * the early return below is compiled out and control
+             * intentionally falls through to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * The original operands `a` and `b` are handed unchanged to the
+             * nb_subtract slot of the generic scalar type.
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:
+            if (INT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Int);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Int);
+    }
+
+    /*
+     * Prepare the actual calculation.  `arg1` and `arg2` were ordered above
+     * according to `is_forward` so that `arg1 - arg2` corresponds to
+     * `a - b`; `out` receives the result.  The sections of this function
+     * guarded by a constant 0 condition (second output `out2`, tuple result
+     * and floating point status handling) are compiled out.
+     */
+    npy_int out;
+#if 0
+    npy_int out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     *
+     * A nonzero `retstatus` is passed to PyUFunc_GiveFloatingpointErrors
+     * under the name "scalar " OP_NAME; a negative return from that call
+     * aborts with NULL, otherwise the result is still wrapped in a new
+     * Int scalar and returned.
+     */
+#if 0
+    int retstatus = int_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = int_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Int);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Int, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_int
+
+
+#line 1217
+#define IS_uint
+/*
+ * OP_NAME is the operation name spliced into the "scalar " OP_NAME string
+ * that is handed to PyUFunc_GiveFloatingpointErrors below.  The "true_" is
+ * dropped from "true_divide" for floating point warnings.  This expansion
+ * defines IS_subtract, so OP_NAME is "subtract" as long as IS_true_divide
+ * is not defined at this point.
+ */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+uint_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_uint arg1, arg2, other_val;
+
+    /*
+     * Binary subtraction `a - b` where at least one operand is expected to
+     * be a UInt scalar (or a subclass); see the assert below.
+     *
+     * Returns a new UInt scalar holding the result, NotImplemented when the
+     * operation is deferred, whatever the generic scalar nb_subtract returns
+     * when `other` cannot be used as an npy_uint here, or NULL on failure
+     * (presumably with a Python exception set by the failing call).
+     *
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.  Exact type matches on `a` and
+     * then on `b` are tested first; the PyArray_IsScalar check is only used
+     * when neither matches exactly, i.e. when subclasses are involved.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UInt);
+        assert(is_forward || PyArray_IsScalar(b, UInt));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * convert_to_uint reports through its return value how `other` has to
+     * be treated, receives `other_val` as the destination for the extracted
+     * value, and sets `may_need_deferring` to request the
+     * BINOP_GIVE_UP_IF_NEEDED check (presumably that macro returns
+     * NotImplemented from this function when the other operand should
+     * handle the operation; confirm against the macro definition).
+     * For CONVERT_PYSCALAR the value is filled in afterwards through
+     * UINT_setitem.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_uint(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, uint_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * The preamble of this expansion defines neither IS_longdouble
+             * nor IS_clongdouble, so (assuming nothing else defines them)
+             * the early return below is compiled out and control
+             * intentionally falls through to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * The original operands `a` and `b` are handed unchanged to the
+             * nb_subtract slot of the generic scalar type.
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:
+            if (UINT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UInt);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UInt);
+    }
+
+    /*
+     * Prepare the actual calculation.  `arg1` and `arg2` were ordered above
+     * according to `is_forward` so that `arg1 - arg2` corresponds to
+     * `a - b`; `out` receives the result.  The sections of this function
+     * guarded by a constant 0 condition (second output `out2`, tuple result
+     * and floating point status handling) are compiled out.
+     */
+    npy_uint out;
+#if 0
+    npy_uint out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     *
+     * A nonzero `retstatus` is passed to PyUFunc_GiveFloatingpointErrors
+     * under the name "scalar " OP_NAME; a negative return from that call
+     * aborts with NULL, otherwise the result is still wrapped in a new
+     * UInt scalar and returned.
+     */
+#if 0
+    int retstatus = uint_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = uint_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UInt);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UInt, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_uint
+
+
+#line 1217
+#define IS_long
+/*
+ * OP_NAME is the operation name spliced into the "scalar " OP_NAME string
+ * that is handed to PyUFunc_GiveFloatingpointErrors below.  The "true_" is
+ * dropped from "true_divide" for floating point warnings.  This expansion
+ * defines IS_subtract, so OP_NAME is "subtract" as long as IS_true_divide
+ * is not defined at this point.
+ */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+long_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_long arg1, arg2, other_val;
+
+    /*
+     * Binary subtraction `a - b` where at least one operand is expected to
+     * be a Long scalar (or a subclass); see the assert below.
+     *
+     * Returns a new Long scalar holding the result, NotImplemented when the
+     * operation is deferred, whatever the generic scalar nb_subtract returns
+     * when `other` cannot be used as an npy_long here, or NULL on failure
+     * (presumably with a Python exception set by the failing call).
+     *
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.  Exact type matches on `a` and
+     * then on `b` are tested first; the PyArray_IsScalar check is only used
+     * when neither matches exactly, i.e. when subclasses are involved.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Long);
+        assert(is_forward || PyArray_IsScalar(b, Long));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * convert_to_long reports through its return value how `other` has to
+     * be treated, receives `other_val` as the destination for the extracted
+     * value, and sets `may_need_deferring` to request the
+     * BINOP_GIVE_UP_IF_NEEDED check (presumably that macro returns
+     * NotImplemented from this function when the other operand should
+     * handle the operation; confirm against the macro definition).
+     * For CONVERT_PYSCALAR the value is filled in afterwards through
+     * LONG_setitem.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_long(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, long_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * The preamble of this expansion defines neither IS_longdouble
+             * nor IS_clongdouble, so (assuming nothing else defines them)
+             * the early return below is compiled out and control
+             * intentionally falls through to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * The original operands `a` and `b` are handed unchanged to the
+             * nb_subtract slot of the generic scalar type.
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:
+            if (LONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Long);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Long);
+    }
+
+    /*
+     * Prepare the actual calculation.  `arg1` and `arg2` were ordered above
+     * according to `is_forward` so that `arg1 - arg2` corresponds to
+     * `a - b`; `out` receives the result.  The sections of this function
+     * guarded by a constant 0 condition (second output `out2`, tuple result
+     * and floating point status handling) are compiled out.
+     */
+    npy_long out;
+#if 0
+    npy_long out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     *
+     * A nonzero `retstatus` is passed to PyUFunc_GiveFloatingpointErrors
+     * under the name "scalar " OP_NAME; a negative return from that call
+     * aborts with NULL, otherwise the result is still wrapped in a new
+     * Long scalar and returned.
+     */
+#if 0
+    int retstatus = long_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = long_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Long);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Long, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_long
+
+
+#line 1217
+#define IS_ulong
+/*
+ * OP_NAME is the operation name spliced into the "scalar " OP_NAME string
+ * that is handed to PyUFunc_GiveFloatingpointErrors below.  The "true_" is
+ * dropped from "true_divide" for floating point warnings.  This expansion
+ * defines IS_subtract, so OP_NAME is "subtract" as long as IS_true_divide
+ * is not defined at this point.
+ */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+ulong_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulong arg1, arg2, other_val;
+
+    /*
+     * Binary subtraction `a - b` where at least one operand is expected to
+     * be a ULong scalar (or a subclass); see the assert below.
+     *
+     * Returns a new ULong scalar holding the result, NotImplemented when the
+     * operation is deferred, whatever the generic scalar nb_subtract returns
+     * when `other` cannot be used as an npy_ulong here, or NULL on failure
+     * (presumably with a Python exception set by the failing call).
+     *
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.  Exact type matches on `a` and
+     * then on `b` are tested first; the PyArray_IsScalar check is only used
+     * when neither matches exactly, i.e. when subclasses are involved.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULong);
+        assert(is_forward || PyArray_IsScalar(b, ULong));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * convert_to_ulong reports through its return value how `other` has to
+     * be treated, receives `other_val` as the destination for the extracted
+     * value, and sets `may_need_deferring` to request the
+     * BINOP_GIVE_UP_IF_NEEDED check (presumably that macro returns
+     * NotImplemented from this function when the other operand should
+     * handle the operation; confirm against the macro definition).
+     * For CONVERT_PYSCALAR the value is filled in afterwards through
+     * ULONG_setitem.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, ulong_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * The preamble of this expansion defines neither IS_longdouble
+             * nor IS_clongdouble, so (assuming nothing else defines them)
+             * the early return below is compiled out and control
+             * intentionally falls through to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * The original operands `a` and `b` are handed unchanged to the
+             * nb_subtract slot of the generic scalar type.
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:
+            if (ULONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULong);
+    }
+
+    /*
+     * Prepare the actual calculation.  `arg1` and `arg2` were ordered above
+     * according to `is_forward` so that `arg1 - arg2` corresponds to
+     * `a - b`; `out` receives the result.  The sections of this function
+     * guarded by a constant 0 condition (second output `out2`, tuple result
+     * and floating point status handling) are compiled out.
+     */
+    npy_ulong out;
+#if 0
+    npy_ulong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     *
+     * A nonzero `retstatus` is passed to PyUFunc_GiveFloatingpointErrors
+     * under the name "scalar " OP_NAME; a negative return from that call
+     * aborts with NULL, otherwise the result is still wrapped in a new
+     * ULong scalar and returned.
+     */
+#if 0
+    int retstatus = ulong_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulong_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(ULong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_ulong
+
+
+#line 1217
+#define IS_longlong
+/*
+ * OP_NAME is the operation name spliced into the "scalar " OP_NAME string
+ * that is handed to PyUFunc_GiveFloatingpointErrors below.  The "true_" is
+ * dropped from "true_divide" for floating point warnings.  This expansion
+ * defines IS_subtract, so OP_NAME is "subtract" as long as IS_true_divide
+ * is not defined at this point.
+ */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+longlong_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_longlong arg1, arg2, other_val;
+
+    /*
+     * Binary subtraction `a - b` where at least one operand is expected to
+     * be a LongLong scalar (or a subclass); see the assert below.
+     *
+     * Returns a new LongLong scalar holding the result, NotImplemented when
+     * the operation is deferred, whatever the generic scalar nb_subtract
+     * returns when `other` cannot be used as an npy_longlong here, or NULL
+     * on failure (presumably with a Python exception set by the failing
+     * call).
+     *
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.  Exact type matches on `a` and
+     * then on `b` are tested first; the PyArray_IsScalar check is only used
+     * when neither matches exactly, i.e. when subclasses are involved.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongLong);
+        assert(is_forward || PyArray_IsScalar(b, LongLong));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * convert_to_longlong reports through its return value how `other` has
+     * to be treated, receives `other_val` as the destination for the
+     * extracted value, and sets `may_need_deferring` to request the
+     * BINOP_GIVE_UP_IF_NEEDED check (presumably that macro returns
+     * NotImplemented from this function when the other operand should
+     * handle the operation; confirm against the macro definition).
+     * For CONVERT_PYSCALAR the value is filled in afterwards through
+     * LONGLONG_setitem.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_longlong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, longlong_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * The preamble of this expansion defines neither IS_longdouble
+             * nor IS_clongdouble, so (assuming nothing else defines them)
+             * the early return below is compiled out and control
+             * intentionally falls through to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * The original operands `a` and `b` are handed unchanged to the
+             * nb_subtract slot of the generic scalar type.
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:
+            if (LONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, LongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongLong);
+    }
+
+    /*
+     * Prepare the actual calculation.  `arg1` and `arg2` were ordered above
+     * according to `is_forward` so that `arg1 - arg2` corresponds to
+     * `a - b`; `out` receives the result.  The sections of this function
+     * guarded by a constant 0 condition (second output `out2`, tuple result
+     * and floating point status handling) are compiled out.
+     */
+    npy_longlong out;
+#if 0
+    npy_longlong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     *
+     * A nonzero `retstatus` is passed to PyUFunc_GiveFloatingpointErrors
+     * under the name "scalar " OP_NAME; a negative return from that call
+     * aborts with NULL, otherwise the result is still wrapped in a new
+     * LongLong scalar and returned.
+     */
+#if 0
+    int retstatus = longlong_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longlong_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(LongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_longlong
+
+
+#line 1217
+#define IS_ulonglong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+ulonglong_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulonglong arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULongLong);
+        assert(is_forward || PyArray_IsScalar(b, ULongLong));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulonglong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, ulonglong_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:
+            if (ULONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULongLong);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ulonglong out;
+#if 0
+    npy_ulonglong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ulonglong_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulonglong_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(ULongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_ulonglong
+
+
+#line 1217
+#define IS_byte
+/* OP_NAME (below): name for floating point warnings; "true_" is dropped: */
+#define IS_multiply
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+
+static PyObject *
+byte_multiply(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_byte arg1, arg2, other_val;
+
+    /*
+     * Scalar multiply; `a` or `b` is a `PyByteArrType_Type` instance (or a
+     * subclass instance).  `is_forward` is 1 when `a` is taken as that
+     * operand, 0 when `b` is.  It does not imply we can defer to subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Byte);
+        assert(is_forward || PyArray_IsScalar(b, Byte));
+    }
+
+    /*
+     * Try to extract the other operand as a C `npy_byte` into `other_val`.
+     * `res` reports whether that worked or how to proceed instead (see the
+     * switch below); `may_need_deferring` requests the give-up check first.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_byte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, byte_multiply);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* otherwise falls through to the case below */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a,b);
+        case CONVERT_PYSCALAR:  /* presumably fills `other_val` from `other` */
+            if (BYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* disabled: floating point status is not cleared here */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* either way arg1 comes from `a` and arg2 from `b` */
+        arg1 = PyArrayScalar_VAL(a, Byte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Byte);
+    }
+
+    /*
+     * Storage for the result of the C-level calculation.
+     */
+    npy_byte out;
+#if 0  /* disabled: extra locals for a second result */
+    npy_byte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation on `arg1` and `arg2` as a function call
+     * that is handed `&out` for its result.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* disabled: call variant with two results */
+    int retstatus = byte_ctype_multiply(arg1, arg2, &out, &out2);
+#else
+    int retstatus = byte_ctype_multiply(arg1, arg2, &out);
+#endif
+
+#if 0  /* disabled: floating point status check (see the note above) */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* report the status; give up if reporting fails */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* disabled: return a 2-tuple of scalars holding out and out2 */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* wrap `out` in a newly created Byte scalar object */
+    ret = PyArrayScalar_New(Byte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Byte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_byte
+
+
+#line 1217
+#define IS_ubyte
+/* OP_NAME (below): name for floating point warnings; "true_" is dropped: */
+#define IS_multiply
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+
+static PyObject *
+ubyte_multiply(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ubyte arg1, arg2, other_val;
+
+    /*
+     * Scalar multiply; `a` or `b` is a `PyUByteArrType_Type` instance (or a
+     * subclass instance).  `is_forward` is 1 when `a` is taken as that
+     * operand, 0 when `b` is.  It does not imply we can defer to subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UByte);
+        assert(is_forward || PyArray_IsScalar(b, UByte));
+    }
+
+    /*
+     * Try to extract the other operand as a C `npy_ubyte` into `other_val`.
+     * `res` reports whether that worked or how to proceed instead (see the
+     * switch below); `may_need_deferring` requests the give-up check first.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ubyte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, ubyte_multiply);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* otherwise falls through to the case below */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a,b);
+        case CONVERT_PYSCALAR:  /* presumably fills `other_val` from `other` */
+            if (UBYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* disabled: floating point status is not cleared here */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* either way arg1 comes from `a` and arg2 from `b` */
+        arg1 = PyArrayScalar_VAL(a, UByte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UByte);
+    }
+
+    /*
+     * Storage for the result of the C-level calculation.
+     */
+    npy_ubyte out;
+#if 0  /* disabled: extra locals for a second result */
+    npy_ubyte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation on `arg1` and `arg2` as a function call
+     * that is handed `&out` for its result.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* disabled: call variant with two results */
+    int retstatus = ubyte_ctype_multiply(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ubyte_ctype_multiply(arg1, arg2, &out);
+#endif
+
+#if 0  /* disabled: floating point status check (see the note above) */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* report the status; give up if reporting fails */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* disabled: return a 2-tuple of scalars holding out and out2 */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* wrap `out` in a newly created UByte scalar object */
+    ret = PyArrayScalar_New(UByte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UByte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_ubyte
+
+
+#line 1217
+#define IS_short
+/* OP_NAME (below): name for floating point warnings; "true_" is dropped: */
+#define IS_multiply
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+
+static PyObject *
+short_multiply(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_short arg1, arg2, other_val;
+
+    /*
+     * Scalar multiply; `a` or `b` is a `PyShortArrType_Type` instance (or a
+     * subclass instance).  `is_forward` is 1 when `a` is taken as that
+     * operand, 0 when `b` is.  It does not imply we can defer to subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Short);
+        assert(is_forward || PyArray_IsScalar(b, Short));
+    }
+
+    /*
+     * Try to extract the other operand as a C `npy_short` into `other_val`.
+     * `res` reports whether that worked or how to proceed instead (see the
+     * switch below); `may_need_deferring` requests the give-up check first.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_short(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, short_multiply);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* otherwise falls through to the case below */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a,b);
+        case CONVERT_PYSCALAR:  /* presumably fills `other_val` from `other` */
+            if (SHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* disabled: floating point status is not cleared here */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* either way arg1 comes from `a` and arg2 from `b` */
+        arg1 = PyArrayScalar_VAL(a, Short);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Short);
+    }
+
+    /*
+     * Storage for the result of the C-level calculation.
+     */
+    npy_short out;
+#if 0  /* disabled: extra locals for a second result */
+    npy_short out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation on `arg1` and `arg2` as a function call
+     * that is handed `&out` for its result.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* disabled: call variant with two results */
+    int retstatus = short_ctype_multiply(arg1, arg2, &out, &out2);
+#else
+    int retstatus = short_ctype_multiply(arg1, arg2, &out);
+#endif
+
+#if 0  /* disabled: floating point status check (see the note above) */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* report the status; give up if reporting fails */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* disabled: return a 2-tuple of scalars holding out and out2 */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* wrap `out` in a newly created Short scalar object */
+    ret = PyArrayScalar_New(Short);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Short, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_short
+
+
+#line 1217
+#define IS_ushort
+/* OP_NAME (below): name for floating point warnings; "true_" is dropped: */
+#define IS_multiply
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+
+static PyObject *
+ushort_multiply(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ushort arg1, arg2, other_val;
+
+    /*
+     * Scalar multiply; `a` or `b` is a `PyUShortArrType_Type` instance (or a
+     * subclass instance).  `is_forward` is 1 when `a` is taken as that
+     * operand, 0 when `b` is.  It does not imply we can defer to subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UShort);
+        assert(is_forward || PyArray_IsScalar(b, UShort));
+    }
+
+    /*
+     * Try to extract the other operand as a C `npy_ushort` into `other_val`.
+     * `res` reports whether that worked or how to proceed instead (see the
+     * switch below); `may_need_deferring` requests the give-up check first.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ushort(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, ushort_multiply);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* otherwise falls through to the case below */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a,b);
+        case CONVERT_PYSCALAR:  /* presumably fills `other_val` from `other` */
+            if (USHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* disabled: floating point status is not cleared here */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* either way arg1 comes from `a` and arg2 from `b` */
+        arg1 = PyArrayScalar_VAL(a, UShort);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UShort);
+    }
+
+    /*
+     * Storage for the result of the C-level calculation.
+     */
+    npy_ushort out;
+#if 0  /* disabled: extra locals for a second result */
+    npy_ushort out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation on `arg1` and `arg2` as a function call
+     * that is handed `&out` for its result.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* disabled: call variant with two results */
+    int retstatus = ushort_ctype_multiply(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ushort_ctype_multiply(arg1, arg2, &out);
+#endif
+
+#if 0  /* disabled: floating point status check (see the note above) */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* report the status; give up if reporting fails */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* disabled: return a 2-tuple of scalars holding out and out2 */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* wrap `out` in a newly created UShort scalar object */
+    ret = PyArrayScalar_New(UShort);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UShort, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_ushort
+
+
+#line 1217
+#define IS_int
+/* OP_NAME (below): name for floating point warnings; "true_" is dropped: */
+#define IS_multiply
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+
+static PyObject *
+int_multiply(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_int arg1, arg2, other_val;
+
+    /*
+     * Scalar multiply; `a` or `b` is a `PyIntArrType_Type` instance (or a
+     * subclass instance).  `is_forward` is 1 when `a` is taken as that
+     * operand, 0 when `b` is.  It does not imply we can defer to subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Int);
+        assert(is_forward || PyArray_IsScalar(b, Int));
+    }
+
+    /*
+     * Try to extract the other operand as a C `npy_int` into `other_val`.
+     * `res` reports whether that worked or how to proceed instead (see the
+     * switch below); `may_need_deferring` requests the give-up check first.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_int(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, int_multiply);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* otherwise falls through to the case below */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a,b);
+        case CONVERT_PYSCALAR:  /* presumably fills `other_val` from `other` */
+            if (INT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* disabled: floating point status is not cleared here */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* either way arg1 comes from `a` and arg2 from `b` */
+        arg1 = PyArrayScalar_VAL(a, Int);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Int);
+    }
+
+    /*
+     * Storage for the result of the C-level calculation.
+     */
+    npy_int out;
+#if 0  /* disabled: extra locals for a second result */
+    npy_int out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation on `arg1` and `arg2` as a function call
+     * that is handed `&out` for its result.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* disabled: call variant with two results */
+    int retstatus = int_ctype_multiply(arg1, arg2, &out, &out2);
+#else
+    int retstatus = int_ctype_multiply(arg1, arg2, &out);
+#endif
+
+#if 0  /* disabled: floating point status check (see the note above) */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* report the status; give up if reporting fails */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* disabled: return a 2-tuple of scalars holding out and out2 */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* wrap `out` in a newly created Int scalar object */
+    ret = PyArrayScalar_New(Int);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Int, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_int
+
+
+#line 1217
+#define IS_uint
+/* OP_NAME (below): name for floating point warnings; "true_" is dropped: */
+#define IS_multiply
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+
+static PyObject *
+uint_multiply(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_uint arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UInt);
+        assert(is_forward || PyArray_IsScalar(b, UInt));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_uint(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, uint_multiply);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a,b);
+        case CONVERT_PYSCALAR:
+            if (UINT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UInt);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UInt);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_uint out;
+#if 0
+    npy_uint out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = uint_ctype_multiply(arg1, arg2, &out, &out2);
+#else
+    int retstatus = uint_ctype_multiply(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UInt);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UInt, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_uint
+
+
+#line 1217
+#define IS_long
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_multiply
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+
+static PyObject *
+long_multiply(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_long arg1, arg2, other_val;
+
+    /*
+     * Binary multiply where `a` or `b` is a Long scalar.
+     * `is_forward` means the first operand `a` fits to the method; it does
+     * not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type is exact, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Long);
+        assert(is_forward || PyArray_IsScalar(b, Long));
+    }
+
+    /*
+     * Try to convert the other operand to npy_long, filling `other_val`.
+     * `res` says how the operand must be handled (see the switch below);
+     * `may_need_deferring` requests the BINOP_GIVE_UP_IF_NEEDED check.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_long(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, long_multiply);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other (return NotImplemented).  This is normally a
+             * forward operation.  However, it could be backward if an
+             * operation is undefined forward.  E.g. the complex remainder
+             * `complex % bool` defers although it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Deliberately falls through to PROMOTION_REQUIRED (generic path)
+             * unless built for (c)longdouble, where that could recurse.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by the nb_multiply slot of PyGenericArrType_Type.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a,b);
+        case CONVERT_PYSCALAR:
+            if (LONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Long);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Long);
+    }
+
+    /*
+     * Result storage.  The `#if 0` branches are compiled-out two-output code.
+     */
+    npy_long out;
+#if 0
+    npy_long out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation with arg1 and arg2 as a function call;
+     * the result is written to `out`.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (compiled out here).
+     */
+#if 0
+    int retstatus = long_ctype_multiply(arg1, arg2, &out, &out2);
+#else
+    int retstatus = long_ctype_multiply(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Long);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Long, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_long
+
+
+#line 1217
+#define IS_ulong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_multiply
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+
+static PyObject *
+ulong_multiply(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulong arg1, arg2, other_val;
+
+    /*
+     * Binary multiply where `a` or `b` is a ULong scalar.
+     * `is_forward` means the first operand `a` fits to the method; it does
+     * not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type is exact, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULong);
+        assert(is_forward || PyArray_IsScalar(b, ULong));
+    }
+
+    /*
+     * Try to convert the other operand to npy_ulong, filling `other_val`.
+     * `res` says how the operand must be handled (see the switch below);
+     * `may_need_deferring` requests the BINOP_GIVE_UP_IF_NEEDED check.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, ulong_multiply);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other (return NotImplemented).  This is normally a
+             * forward operation.  However, it could be backward if an
+             * operation is undefined forward.  E.g. the complex remainder
+             * `complex % bool` defers although it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Deliberately falls through to PROMOTION_REQUIRED (generic path)
+             * unless built for (c)longdouble, where that could recurse.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by the nb_multiply slot of PyGenericArrType_Type.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a,b);
+        case CONVERT_PYSCALAR:
+            if (ULONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULong);
+    }
+
+    /*
+     * Result storage.  The `#if 0` branches are compiled-out two-output code.
+     */
+    npy_ulong out;
+#if 0
+    npy_ulong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation with arg1 and arg2 as a function call;
+     * the result is written to `out`.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (compiled out here).
+     */
+#if 0
+    int retstatus = ulong_ctype_multiply(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulong_ctype_multiply(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(ULong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_ulong
+
+
+#line 1217
+#define IS_longlong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_multiply
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+
+static PyObject *
+longlong_multiply(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_longlong arg1, arg2, other_val;
+
+    /*
+     * Binary multiply where `a` or `b` is a LongLong scalar.
+     * `is_forward` means the first operand `a` fits to the method; it does
+     * not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type is exact, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongLong);
+        assert(is_forward || PyArray_IsScalar(b, LongLong));
+    }
+
+    /*
+     * Try to convert the other operand to npy_longlong, filling `other_val`.
+     * `res` says how the operand must be handled (see the switch below);
+     * `may_need_deferring` requests the BINOP_GIVE_UP_IF_NEEDED check.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_longlong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, longlong_multiply);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other (return NotImplemented).  This is normally a
+             * forward operation.  However, it could be backward if an
+             * operation is undefined forward.  E.g. the complex remainder
+             * `complex % bool` defers although it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Deliberately falls through to PROMOTION_REQUIRED (generic path)
+             * unless built for (c)longdouble, where that could recurse.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by the nb_multiply slot of PyGenericArrType_Type.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a,b);
+        case CONVERT_PYSCALAR:
+            if (LONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, LongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongLong);
+    }
+
+    /*
+     * Result storage.  The `#if 0` branches are compiled-out two-output code.
+     */
+    npy_longlong out;
+#if 0
+    npy_longlong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation with arg1 and arg2 as a function call;
+     * the result is written to `out`.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (compiled out here).
+     */
+#if 0
+    int retstatus = longlong_ctype_multiply(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longlong_ctype_multiply(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(LongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_longlong
+
+
+#line 1217
+#define IS_ulonglong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_multiply
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+
+static PyObject *
+ulonglong_multiply(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulonglong arg1, arg2, other_val;
+
+    /*
+     * Binary multiply where `a` or `b` is a ULongLong scalar.
+     * `is_forward` means the first operand `a` fits to the method; it does
+     * not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type is exact, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULongLong);
+        assert(is_forward || PyArray_IsScalar(b, ULongLong));
+    }
+
+    /*
+     * Try to convert the other operand to npy_ulonglong, filling `other_val`.
+     * `res` says how the operand must be handled (see the switch below);
+     * `may_need_deferring` requests the BINOP_GIVE_UP_IF_NEEDED check.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulonglong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, ulonglong_multiply);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other (return NotImplemented).  This is normally a
+             * forward operation.  However, it could be backward if an
+             * operation is undefined forward.  E.g. the complex remainder
+             * `complex % bool` defers although it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Deliberately falls through to PROMOTION_REQUIRED (generic path)
+             * unless built for (c)longdouble, where that could recurse.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by the nb_multiply slot of PyGenericArrType_Type.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a,b);
+        case CONVERT_PYSCALAR:
+            if (ULONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULongLong);
+    }
+
+    /*
+     * Result storage.  The `#if 0` branches are compiled-out two-output code.
+     */
+    npy_ulonglong out;
+#if 0
+    npy_ulonglong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation with arg1 and arg2 as a function call;
+     * the result is written to `out`.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (compiled out here).
+     */
+#if 0
+    int retstatus = ulonglong_ctype_multiply(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulonglong_ctype_multiply(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(ULongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_ulonglong
+
+
+#line 1217
+#define IS_byte
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_remainder
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "remainder"
+#endif
+
+static PyObject *
+byte_remainder(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_byte arg1, arg2, other_val;
+
+    /*
+     * Binary remainder where `a` or `b` is a Byte scalar.
+     * `is_forward` means the first operand `a` fits to the method; it does
+     * not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type is exact, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Byte);
+        assert(is_forward || PyArray_IsScalar(b, Byte));
+    }
+
+    /*
+     * Try to convert the other operand to npy_byte, filling `other_val`.
+     * `res` says how the operand must be handled (see the switch below);
+     * `may_need_deferring` requests the BINOP_GIVE_UP_IF_NEEDED check.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_byte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_remainder, byte_remainder);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other (return NotImplemented).  This is normally a
+             * forward operation.  However, it could be backward if an
+             * operation is undefined forward.  E.g. the complex remainder
+             * `complex % bool` defers although it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Deliberately falls through to PROMOTION_REQUIRED (generic path)
+             * unless built for (c)longdouble, where that could recurse.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by the nb_remainder slot of PyGenericArrType_Type.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_remainder(a,b);
+        case CONVERT_PYSCALAR:
+            if (BYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Byte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Byte);
+    }
+
+    /*
+     * Result storage.  The `#if 0` branches are compiled-out two-output code.
+     */
+    npy_byte out;
+#if 0
+    npy_byte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation with arg1 and arg2 as a function call;
+     * the result is written to `out`.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (compiled out here).
+     */
+#if 0
+    int retstatus = byte_ctype_remainder(arg1, arg2, &out, &out2);
+#else
+    int retstatus = byte_ctype_remainder(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Byte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Byte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_remainder
+#undef IS_byte
+
+
+#line 1217
+#define IS_ubyte
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_remainder
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "remainder"
+#endif
+
+static PyObject *
+ubyte_remainder(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ubyte arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UByte);
+        assert(is_forward || PyArray_IsScalar(b, UByte));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ubyte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_remainder, ubyte_remainder);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_remainder(a,b);
+        case CONVERT_PYSCALAR:
+            if (UBYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UByte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UByte);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ubyte out;
+#if 0
+    npy_ubyte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ubyte_ctype_remainder(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ubyte_ctype_remainder(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UByte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UByte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_remainder
+#undef IS_ubyte
+
+
+#line 1217
+#define IS_short
+/*
+ * IS_short and IS_remainder record which type and operation this expansion
+ * is for; both are #undef'd again after the function.  OP_NAME is the
+ * operation name appended to "scalar " when floating point errors are
+ * reported below.  Drop the "true_" from "true_divide" for those warnings:
+ */
+#define IS_remainder
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "remainder"
+#endif
+/*
+ * Remainder of two operands of which at least one is a NumPy Short scalar
+ * (value type `npy_short`).
+ *
+ * Parameters:
+ *   a, b  The first and second operand.  One of them is expected to be a
+ *         Short scalar (see the assert below); the other one is converted
+ *         with `convert_to_short`.
+ *
+ * Returns (NULL signals failure):
+ *   - a Short scalar created with PyArrayScalar_New that holds the result
+ *     of `short_ctype_remainder`,
+ *   - `Py_NotImplemented` when the conversion result asks to defer to the
+ *     other operand,
+ *   - whatever the generic scalar `nb_remainder` returns when the other
+ *     operand is unknown or promotion is required.
+ *
+ * NOTE(review): BINOP_GIVE_UP_IF_NEEDED is a macro defined elsewhere; it
+ * presumably can also return from this function -- confirm at its definition.
+ */
+static PyObject *
+short_remainder(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_short arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward, i.e. whether `a`
+     * (is_forward == 1) or `b` (is_forward == 0) is the Short operand.  The
+     * exact type checks come first; PyArray_IsScalar is only consulted when
+     * neither operand has exactly the Short scalar type.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Short);
+        assert(is_forward || PyArray_IsScalar(b, Short));
+    }
+
+    /*
+     * Extract the other value (if it is compatible) into `other_val`.
+     * Otherwise, decide how to deal with it based on `res` in the switch
+     * below.  This is somewhat complicated:
+     *   - when `may_need_deferring` was set, BINOP_GIVE_UP_IF_NEEDED is run
+     *     first (defined elsewhere; presumably it may return early -- TODO
+     *     confirm),
+     *   - for CONVERT_PYSCALAR, SHORT_setitem is called on `other` with
+     *     `other_val` as the destination; a negative result is an error.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_short(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_remainder, short_remainder);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * When neither IS_longdouble nor IS_clongdouble is defined there
+             * is no return here and control deliberately falls through into
+             * the PROMOTION_REQUIRED case.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Both operands are handed unchanged to the generic scalar
+             * `nb_remainder` slot.
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_remainder(a,b);
+        case CONVERT_PYSCALAR:
+            if (SHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Short);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Short);
+    }
+
+    /*
+     * Prepare the actual calculation.  `out` receives the result; the
+     * declarations inside `#if 0` are compiled out and are only used by the
+     * (equally compiled out) two-result code further down.
+     */
+    npy_short out;
+#if 0
+    npy_short out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     * Here only the `#else` call with a single output is compiled, and the
+     * `npy_get_floatstatus_barrier` check is inside `#if 0`, so `retstatus`
+     * comes from `short_ctype_remainder` alone.  A non-zero value is passed
+     * on to PyUFunc_GiveFloatingpointErrors, whose negative return makes
+     * this function return NULL.
+     */
+#if 0
+    int retstatus = short_ctype_remainder(arg1, arg2, &out, &out2);
+#else
+    int retstatus = short_ctype_remainder(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Short);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Short, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_remainder
+#undef IS_short
+
+
+#line 1217
+#define IS_ushort
+/*
+ * IS_ushort and IS_remainder record which type and operation this expansion
+ * is for; both are #undef'd again after the function.  OP_NAME is the
+ * operation name appended to "scalar " when floating point errors are
+ * reported below.  Drop the "true_" from "true_divide" for those warnings:
+ */
+#define IS_remainder
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "remainder"
+#endif
+/*
+ * Remainder of two operands of which at least one is a NumPy UShort scalar
+ * (value type `npy_ushort`).
+ *
+ * Parameters:
+ *   a, b  The first and second operand.  One of them is expected to be a
+ *         UShort scalar (see the assert below); the other one is converted
+ *         with `convert_to_ushort`.
+ *
+ * Returns (NULL signals failure):
+ *   - a UShort scalar created with PyArrayScalar_New that holds the result
+ *     of `ushort_ctype_remainder`,
+ *   - `Py_NotImplemented` when the conversion result asks to defer to the
+ *     other operand,
+ *   - whatever the generic scalar `nb_remainder` returns when the other
+ *     operand is unknown or promotion is required.
+ *
+ * NOTE(review): BINOP_GIVE_UP_IF_NEEDED is a macro defined elsewhere; it
+ * presumably can also return from this function -- confirm at its definition.
+ */
+static PyObject *
+ushort_remainder(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ushort arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward, i.e. whether `a`
+     * (is_forward == 1) or `b` (is_forward == 0) is the UShort operand.  The
+     * exact type checks come first; PyArray_IsScalar is only consulted when
+     * neither operand has exactly the UShort scalar type.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UShort);
+        assert(is_forward || PyArray_IsScalar(b, UShort));
+    }
+
+    /*
+     * Extract the other value (if it is compatible) into `other_val`.
+     * Otherwise, decide how to deal with it based on `res` in the switch
+     * below.  This is somewhat complicated:
+     *   - when `may_need_deferring` was set, BINOP_GIVE_UP_IF_NEEDED is run
+     *     first (defined elsewhere; presumably it may return early -- TODO
+     *     confirm),
+     *   - for CONVERT_PYSCALAR, USHORT_setitem is called on `other` with
+     *     `other_val` as the destination; a negative result is an error.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ushort(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_remainder, ushort_remainder);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * When neither IS_longdouble nor IS_clongdouble is defined there
+             * is no return here and control deliberately falls through into
+             * the PROMOTION_REQUIRED case.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Both operands are handed unchanged to the generic scalar
+             * `nb_remainder` slot.
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_remainder(a,b);
+        case CONVERT_PYSCALAR:
+            if (USHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UShort);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UShort);
+    }
+
+    /*
+     * Prepare the actual calculation.  `out` receives the result; the
+     * declarations inside `#if 0` are compiled out and are only used by the
+     * (equally compiled out) two-result code further down.
+     */
+    npy_ushort out;
+#if 0
+    npy_ushort out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     * Here only the `#else` call with a single output is compiled, and the
+     * `npy_get_floatstatus_barrier` check is inside `#if 0`, so `retstatus`
+     * comes from `ushort_ctype_remainder` alone.  A non-zero value is passed
+     * on to PyUFunc_GiveFloatingpointErrors, whose negative return makes
+     * this function return NULL.
+     */
+#if 0
+    int retstatus = ushort_ctype_remainder(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ushort_ctype_remainder(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UShort);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UShort, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_remainder
+#undef IS_ushort
+
+
+#line 1217
+#define IS_int
+/*
+ * IS_int and IS_remainder record which type and operation this expansion
+ * is for; both are #undef'd again after the function.  OP_NAME is the
+ * operation name appended to "scalar " when floating point errors are
+ * reported below.  Drop the "true_" from "true_divide" for those warnings:
+ */
+#define IS_remainder
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "remainder"
+#endif
+/*
+ * Remainder of two operands of which at least one is a NumPy Int scalar
+ * (value type `npy_int`).
+ *
+ * Parameters:
+ *   a, b  The first and second operand.  One of them is expected to be an
+ *         Int scalar (see the assert below); the other one is converted
+ *         with `convert_to_int`.
+ *
+ * Returns (NULL signals failure):
+ *   - an Int scalar created with PyArrayScalar_New that holds the result
+ *     of `int_ctype_remainder`,
+ *   - `Py_NotImplemented` when the conversion result asks to defer to the
+ *     other operand,
+ *   - whatever the generic scalar `nb_remainder` returns when the other
+ *     operand is unknown or promotion is required.
+ *
+ * NOTE(review): BINOP_GIVE_UP_IF_NEEDED is a macro defined elsewhere; it
+ * presumably can also return from this function -- confirm at its definition.
+ */
+static PyObject *
+int_remainder(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_int arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward, i.e. whether `a`
+     * (is_forward == 1) or `b` (is_forward == 0) is the Int operand.  The
+     * exact type checks come first; PyArray_IsScalar is only consulted when
+     * neither operand has exactly the Int scalar type.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Int);
+        assert(is_forward || PyArray_IsScalar(b, Int));
+    }
+
+    /*
+     * Extract the other value (if it is compatible) into `other_val`.
+     * Otherwise, decide how to deal with it based on `res` in the switch
+     * below.  This is somewhat complicated:
+     *   - when `may_need_deferring` was set, BINOP_GIVE_UP_IF_NEEDED is run
+     *     first (defined elsewhere; presumably it may return early -- TODO
+     *     confirm),
+     *   - for CONVERT_PYSCALAR, INT_setitem is called on `other` with
+     *     `other_val` as the destination; a negative result is an error.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_int(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_remainder, int_remainder);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * When neither IS_longdouble nor IS_clongdouble is defined there
+             * is no return here and control deliberately falls through into
+             * the PROMOTION_REQUIRED case.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Both operands are handed unchanged to the generic scalar
+             * `nb_remainder` slot.
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_remainder(a,b);
+        case CONVERT_PYSCALAR:
+            if (INT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Int);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Int);
+    }
+
+    /*
+     * Prepare the actual calculation.  `out` receives the result; the
+     * declarations inside `#if 0` are compiled out and are only used by the
+     * (equally compiled out) two-result code further down.
+     */
+    npy_int out;
+#if 0
+    npy_int out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     * Here only the `#else` call with a single output is compiled, and the
+     * `npy_get_floatstatus_barrier` check is inside `#if 0`, so `retstatus`
+     * comes from `int_ctype_remainder` alone.  A non-zero value is passed
+     * on to PyUFunc_GiveFloatingpointErrors, whose negative return makes
+     * this function return NULL.
+     */
+#if 0
+    int retstatus = int_ctype_remainder(arg1, arg2, &out, &out2);
+#else
+    int retstatus = int_ctype_remainder(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Int);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Int, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_remainder
+#undef IS_int
+
+
+#line 1217
+#define IS_uint
+/*
+ * IS_uint and IS_remainder record which type and operation this expansion
+ * is for; both are #undef'd again after the function.  OP_NAME is the
+ * operation name appended to "scalar " when floating point errors are
+ * reported below.  Drop the "true_" from "true_divide" for those warnings:
+ */
+#define IS_remainder
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "remainder"
+#endif
+/*
+ * Remainder of two operands of which at least one is a NumPy UInt scalar
+ * (value type `npy_uint`).
+ *
+ * Parameters:
+ *   a, b  The first and second operand.  One of them is expected to be a
+ *         UInt scalar (see the assert below); the other one is converted
+ *         with `convert_to_uint`.
+ *
+ * Returns (NULL signals failure):
+ *   - a UInt scalar created with PyArrayScalar_New that holds the result
+ *     of `uint_ctype_remainder`,
+ *   - `Py_NotImplemented` when the conversion result asks to defer to the
+ *     other operand,
+ *   - whatever the generic scalar `nb_remainder` returns when the other
+ *     operand is unknown or promotion is required.
+ *
+ * NOTE(review): BINOP_GIVE_UP_IF_NEEDED is a macro defined elsewhere; it
+ * presumably can also return from this function -- confirm at its definition.
+ */
+static PyObject *
+uint_remainder(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_uint arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward, i.e. whether `a`
+     * (is_forward == 1) or `b` (is_forward == 0) is the UInt operand.  The
+     * exact type checks come first; PyArray_IsScalar is only consulted when
+     * neither operand has exactly the UInt scalar type.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UInt);
+        assert(is_forward || PyArray_IsScalar(b, UInt));
+    }
+
+    /*
+     * Extract the other value (if it is compatible) into `other_val`.
+     * Otherwise, decide how to deal with it based on `res` in the switch
+     * below.  This is somewhat complicated:
+     *   - when `may_need_deferring` was set, BINOP_GIVE_UP_IF_NEEDED is run
+     *     first (defined elsewhere; presumably it may return early -- TODO
+     *     confirm),
+     *   - for CONVERT_PYSCALAR, UINT_setitem is called on `other` with
+     *     `other_val` as the destination; a negative result is an error.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_uint(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_remainder, uint_remainder);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * When neither IS_longdouble nor IS_clongdouble is defined there
+             * is no return here and control deliberately falls through into
+             * the PROMOTION_REQUIRED case.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Both operands are handed unchanged to the generic scalar
+             * `nb_remainder` slot.
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_remainder(a,b);
+        case CONVERT_PYSCALAR:
+            if (UINT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UInt);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UInt);
+    }
+
+    /*
+     * Prepare the actual calculation.  `out` receives the result; the
+     * declarations inside `#if 0` are compiled out and are only used by the
+     * (equally compiled out) two-result code further down.
+     */
+    npy_uint out;
+#if 0
+    npy_uint out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     * Here only the `#else` call with a single output is compiled, and the
+     * `npy_get_floatstatus_barrier` check is inside `#if 0`, so `retstatus`
+     * comes from `uint_ctype_remainder` alone.  A non-zero value is passed
+     * on to PyUFunc_GiveFloatingpointErrors, whose negative return makes
+     * this function return NULL.
+     */
+#if 0
+    int retstatus = uint_ctype_remainder(arg1, arg2, &out, &out2);
+#else
+    int retstatus = uint_ctype_remainder(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UInt);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UInt, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_remainder
+#undef IS_uint
+
+
+#line 1217
+#define IS_long
+/*
+ * IS_long and IS_remainder record which type and operation this expansion
+ * is for; both are #undef'd again after the function.  OP_NAME is the
+ * operation name appended to "scalar " when floating point errors are
+ * reported below.  Drop the "true_" from "true_divide" for those warnings:
+ */
+#define IS_remainder
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "remainder"
+#endif
+/*
+ * Remainder of two operands of which at least one is a NumPy Long scalar
+ * (value type `npy_long`).
+ *
+ * Parameters:
+ *   a, b  The first and second operand.  One of them is expected to be a
+ *         Long scalar (see the assert below); the other one is converted
+ *         with `convert_to_long`.
+ *
+ * Returns (NULL signals failure):
+ *   - a Long scalar created with PyArrayScalar_New that holds the result
+ *     of `long_ctype_remainder`,
+ *   - `Py_NotImplemented` when the conversion result asks to defer to the
+ *     other operand,
+ *   - whatever the generic scalar `nb_remainder` returns when the other
+ *     operand is unknown or promotion is required.
+ *
+ * NOTE(review): BINOP_GIVE_UP_IF_NEEDED is a macro defined elsewhere; it
+ * presumably can also return from this function -- confirm at its definition.
+ */
+static PyObject *
+long_remainder(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_long arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward, i.e. whether `a`
+     * (is_forward == 1) or `b` (is_forward == 0) is the Long operand.  The
+     * exact type checks come first; PyArray_IsScalar is only consulted when
+     * neither operand has exactly the Long scalar type.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Long);
+        assert(is_forward || PyArray_IsScalar(b, Long));
+    }
+
+    /*
+     * Extract the other value (if it is compatible) into `other_val`.
+     * Otherwise, decide how to deal with it based on `res` in the switch
+     * below.  This is somewhat complicated:
+     *   - when `may_need_deferring` was set, BINOP_GIVE_UP_IF_NEEDED is run
+     *     first (defined elsewhere; presumably it may return early -- TODO
+     *     confirm),
+     *   - for CONVERT_PYSCALAR, LONG_setitem is called on `other` with
+     *     `other_val` as the destination; a negative result is an error.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_long(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_remainder, long_remainder);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * When neither IS_longdouble nor IS_clongdouble is defined there
+             * is no return here and control deliberately falls through into
+             * the PROMOTION_REQUIRED case.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Both operands are handed unchanged to the generic scalar
+             * `nb_remainder` slot.
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_remainder(a,b);
+        case CONVERT_PYSCALAR:
+            if (LONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Long);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Long);
+    }
+
+    /*
+     * Prepare the actual calculation.  `out` receives the result; the
+     * declarations inside `#if 0` are compiled out and are only used by the
+     * (equally compiled out) two-result code further down.
+     */
+    npy_long out;
+#if 0
+    npy_long out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     * Here only the `#else` call with a single output is compiled, and the
+     * `npy_get_floatstatus_barrier` check is inside `#if 0`, so `retstatus`
+     * comes from `long_ctype_remainder` alone.  A non-zero value is passed
+     * on to PyUFunc_GiveFloatingpointErrors, whose negative return makes
+     * this function return NULL.
+     */
+#if 0
+    int retstatus = long_ctype_remainder(arg1, arg2, &out, &out2);
+#else
+    int retstatus = long_ctype_remainder(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Long);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Long, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_remainder
+#undef IS_long
+
+
+#line 1217
+#define IS_ulong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_remainder
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "remainder"
+#endif
+
+static PyObject *
+ulong_remainder(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulong arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULong);
+        assert(is_forward || PyArray_IsScalar(b, ULong));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_remainder, ulong_remainder);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_remainder(a,b);
+        case CONVERT_PYSCALAR:
+            if (ULONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULong);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ulong out;
+#if 0
+    npy_ulong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ulong_ctype_remainder(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulong_ctype_remainder(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(ULong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_remainder
+#undef IS_ulong
+
+
+#line 1217
+#define IS_longlong
+/* OP_NAME names the op in FP warnings ("true_divide" becomes "divide"): */
+#define IS_remainder
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "remainder"
+#endif
+
+static PyObject *
+longlong_remainder(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_longlong arg1, arg2, other_val;
+
+    /*
+     * Scalar remainder where at least one operand is a longlong scalar.
+     * `is_forward` is 1 when `a` is that operand and 0 when `b` is.  It does
+     * not imply that we can defer to a subclass `b`, only that `a` fits.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither is the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongLong);
+        assert(is_forward || PyArray_IsScalar(b, LongLong));
+    }
+
+    /*
+     * Try to extract the other operand as a C value into `other_val`.
+     * Otherwise `res` tells the switch below how to deal with it.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_longlong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro may return; confirm */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_remainder, longlong_remainder);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to `other`.  This is normally a forward operation, but
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will
+             * defer even though complex would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Falls through to the generic path below; only (c)longdouble
+             * returns NotImplemented instead, to avoid infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_remainder(a,b);
+        case CONVERT_PYSCALAR:  /* fill `other_val` from `other` via setitem */
+            if (LONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, LongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongLong);
+    }
+
+    /*
+     * Output storage: `out` receives the result of the ctype helper.
+     */
+    npy_longlong out;
+#if 0
+    npy_longlong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation on arg1 and arg2 as a function call.
+     * For integer functions such as this one, `retstatus` carries the
+     * "floating point error" value directly.  Float functions should
+     * always return 0 and rely on the `npy_get_floatstatus_barrier`
+     * check instead, which is compiled out (`#if 0`) here.
+     */
+#if 0
+    int retstatus = longlong_ctype_remainder(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longlong_ctype_remainder(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(LongLong);  /* new longlong scalar for `out` */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_remainder
+#undef IS_longlong
+
+
+#line 1217
+#define IS_ulonglong
+/* OP_NAME names the op in FP warnings ("true_divide" becomes "divide"): */
+#define IS_remainder
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "remainder"
+#endif
+
+static PyObject *
+ulonglong_remainder(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulonglong arg1, arg2, other_val;
+
+    /*
+     * Scalar remainder where at least one operand is a ulonglong scalar.
+     * `is_forward` is 1 when `a` is that operand and 0 when `b` is.  It does
+     * not imply that we can defer to a subclass `b`, only that `a` fits.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither is the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULongLong);
+        assert(is_forward || PyArray_IsScalar(b, ULongLong));
+    }
+
+    /*
+     * Try to extract the other operand as a C value into `other_val`.
+     * Otherwise `res` tells the switch below how to deal with it.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulonglong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro may return; confirm */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_remainder, ulonglong_remainder);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to `other`.  This is normally a forward operation, but
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will
+             * defer even though complex would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Falls through to the generic path below; only (c)longdouble
+             * returns NotImplemented instead, to avoid infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_remainder(a,b);
+        case CONVERT_PYSCALAR:  /* fill `other_val` from `other` via setitem */
+            if (ULONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULongLong);
+    }
+
+    /*
+     * Output storage: `out` receives the result of the ctype helper.
+     */
+    npy_ulonglong out;
+#if 0
+    npy_ulonglong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation on arg1 and arg2 as a function call.
+     * For integer functions such as this one, `retstatus` carries the
+     * "floating point error" value directly.  Float functions should
+     * always return 0 and rely on the `npy_get_floatstatus_barrier`
+     * check instead, which is compiled out (`#if 0`) here.
+     */
+#if 0
+    int retstatus = ulonglong_ctype_remainder(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulonglong_ctype_remainder(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(ULongLong);  /* new ulonglong scalar for `out` */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_remainder
+#undef IS_ulonglong
+
+
+#line 1217
+#define IS_byte
+/* OP_NAME names the op in FP warnings ("true_divide" becomes "divide"): */
+#define IS_divmod
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "divmod"
+#endif
+
+static PyObject *
+byte_divmod(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_byte arg1, arg2, other_val;
+
+    /*
+     * Scalar divmod where at least one operand is a byte scalar.
+     * `is_forward` is 1 when `a` is that operand and 0 when `b` is.  It does
+     * not imply that we can defer to a subclass `b`, only that `a` fits.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither is the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Byte);
+        assert(is_forward || PyArray_IsScalar(b, Byte));
+    }
+
+    /*
+     * Try to extract the other operand as a C value into `other_val`.
+     * Otherwise `res` tells the switch below how to deal with it.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_byte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro may return; confirm */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_divmod, byte_divmod);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to `other`.  This is normally a forward operation, but
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will
+             * defer even though complex would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Falls through to the generic path below; only (c)longdouble
+             * returns NotImplemented instead, to avoid infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_divmod(a,b);
+        case CONVERT_PYSCALAR:  /* fill `other_val` from `other` via setitem */
+            if (BYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Byte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Byte);
+    }
+
+    /*
+     * Output storage: `out` and `out2` receive the helper's two results.
+     */
+    npy_byte out;
+#if 1
+    npy_byte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation on arg1 and arg2 as a function call.
+     * For integer functions such as this one, `retstatus` carries the
+     * "floating point error" value directly.  Float functions should
+     * always return 0 and rely on the `npy_get_floatstatus_barrier`
+     * check instead, which is compiled out (`#if 0`) here.
+     */
+#if 1
+    int retstatus = byte_ctype_divmod(arg1, arg2, &out, &out2);
+#else
+    int retstatus = byte_ctype_divmod(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 1
+    ret = PyTuple_New(2);  /* result is the 2-tuple (out, out2) */
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);  /* steals the reference to obj */
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);  /* steals the reference to obj */
+#else
+    ret = PyArrayScalar_New(Byte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Byte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_divmod
+#undef IS_byte
+
+
+#line 1217
+#define IS_ubyte
+/* OP_NAME names the op in FP warnings ("true_divide" becomes "divide"): */
+#define IS_divmod
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "divmod"
+#endif
+
+static PyObject *
+ubyte_divmod(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ubyte arg1, arg2, other_val;
+
+    /*
+     * Scalar divmod where at least one operand is a ubyte scalar.
+     * `is_forward` is 1 when `a` is that operand and 0 when `b` is.  It does
+     * not imply that we can defer to a subclass `b`, only that `a` fits.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither is the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UByte);
+        assert(is_forward || PyArray_IsScalar(b, UByte));
+    }
+
+    /*
+     * Try to extract the other operand as a C value into `other_val`.
+     * Otherwise `res` tells the switch below how to deal with it.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ubyte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro may return; confirm */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_divmod, ubyte_divmod);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to `other`.  This is normally a forward operation, but
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will
+             * defer even though complex would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Falls through to the generic path below; only (c)longdouble
+             * returns NotImplemented instead, to avoid infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_divmod(a,b);
+        case CONVERT_PYSCALAR:  /* fill `other_val` from `other` via setitem */
+            if (UBYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UByte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UByte);
+    }
+
+    /*
+     * Output storage: `out` and `out2` receive the helper's two results.
+     */
+    npy_ubyte out;
+#if 1
+    npy_ubyte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation on arg1 and arg2 as a function call.
+     * For integer functions such as this one, `retstatus` carries the
+     * "floating point error" value directly.  Float functions should
+     * always return 0 and rely on the `npy_get_floatstatus_barrier`
+     * check instead, which is compiled out (`#if 0`) here.
+     */
+#if 1
+    int retstatus = ubyte_ctype_divmod(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ubyte_ctype_divmod(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 1
+    ret = PyTuple_New(2);  /* result is the 2-tuple (out, out2) */
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);  /* steals the reference to obj */
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);  /* steals the reference to obj */
+#else
+    ret = PyArrayScalar_New(UByte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UByte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_divmod
+#undef IS_ubyte
+
+
+#line 1217
+#define IS_short
+/* OP_NAME names the op in FP warnings ("true_divide" becomes "divide"): */
+#define IS_divmod
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "divmod"
+#endif
+
+static PyObject *
+short_divmod(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_short arg1, arg2, other_val;
+
+    /*
+     * Scalar divmod where at least one operand is a short scalar.
+     * `is_forward` is 1 when `a` is that operand and 0 when `b` is.  It does
+     * not imply that we can defer to a subclass `b`, only that `a` fits.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither is the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Short);
+        assert(is_forward || PyArray_IsScalar(b, Short));
+    }
+
+    /*
+     * Try to extract the other operand as a C value into `other_val`.
+     * Otherwise `res` tells the switch below how to deal with it.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_short(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro may return; confirm */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_divmod, short_divmod);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to `other`.  This is normally a forward operation, but
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will
+             * defer even though complex would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Falls through to the generic path below; only (c)longdouble
+             * returns NotImplemented instead, to avoid infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_divmod(a,b);
+        case CONVERT_PYSCALAR:  /* fill `other_val` from `other` via setitem */
+            if (SHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Short);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Short);
+    }
+
+    /*
+     * Output storage: `out` and `out2` receive the helper's two results.
+     */
+    npy_short out;
+#if 1
+    npy_short out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation on arg1 and arg2 as a function call.
+     * For integer functions such as this one, `retstatus` carries the
+     * "floating point error" value directly.  Float functions should
+     * always return 0 and rely on the `npy_get_floatstatus_barrier`
+     * check instead, which is compiled out (`#if 0`) here.
+     */
+#if 1
+    int retstatus = short_ctype_divmod(arg1, arg2, &out, &out2);
+#else
+    int retstatus = short_ctype_divmod(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 1
+    ret = PyTuple_New(2);  /* result is the 2-tuple (out, out2) */
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out);
+    PyTuple_SET_ITEM(ret, 0, obj);  /* steals the reference to obj */
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);  /* steals the reference to obj */
+#else
+    ret = PyArrayScalar_New(Short);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Short, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_divmod
+#undef IS_short
+
+
+#line 1217
+#define IS_ushort
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_divmod
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "divmod"
+#endif
+
+static PyObject *
+ushort_divmod(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ushort arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UShort);
+        assert(is_forward || PyArray_IsScalar(b, UShort));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ushort(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_divmod, ushort_divmod);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_divmod(a,b);
+        case CONVERT_PYSCALAR:
+            if (USHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UShort);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UShort);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ushort out;
+#if 1
+    npy_ushort out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 1
+    int retstatus = ushort_ctype_divmod(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ushort_ctype_divmod(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 1
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UShort);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UShort, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_divmod
+#undef IS_ushort
+
+
+#line 1217
+#define IS_int
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_divmod
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "divmod"
+#endif
+
+static PyObject *
+int_divmod(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_int arg1, arg2, other_val;  /* (a, b) operands; `other` value */
+
+    /*
+     * nb_divmod for the Int scalar.  Returns a 2-tuple of Int scalars, or
+     * NotImplemented / the generic scalar result if `other` is not usable.
+     * `is_forward`: `a` is the Int operand; no deferral to `b` is implied.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type is exactly Int, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Int);
+        assert(is_forward || PyArray_IsScalar(b, Int));
+    }
+
+    /*
+     * `other` is the operand that is not known to be Int.  Try to extract
+     * it into `other_val`; `res` reports whether that worked or which
+     * fallback applies, and `may_need_deferring` requests a check (via
+     * BINOP_GIVE_UP_IF_NEEDED) before any of the fallbacks is taken.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_int(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_divmod, int_divmod);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation, but it
+             * can be a backward one if the operation is undefined forward.
+             * For example, the complex remainder `complex % bool` defers
+             * even though complex would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value extracted successfully; proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object,
+             * including integers too large to convert to `long`), or even
+             * a subclass of a NumPy scalar (currently).
+             *
+             * Falls through to the generic (array) path in the next case.
+             * Only (c)longdouble returns early: it could recurse infinitely.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Either way, delegate to the generic scalar type's nb_divmod.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_divmod(a,b);
+        case CONVERT_PYSCALAR:  /* store `other` into other_val */
+            if (INT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Int);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Int);
+    }
+
+    /*
+     * Result storage (`#if 1`: the two-output variant is the one compiled).
+     */
+    npy_int out;
+#if 1
+    npy_int out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call.
+     * For integer functions `retstatus` is the "floating point error"
+     * value; float functions should always return 0 and then use the
+     * following `npy_get_floatstatus_barrier`.  A nonzero status goes to
+     * PyUFunc_GiveFloatingpointErrors, which may make us return NULL.
+     */
+#if 1
+    int retstatus = int_ctype_divmod(arg1, arg2, &out, &out2);
+#else
+    int retstatus = int_ctype_divmod(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Compiled out: would merge the float status flags into retstatus */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 1
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out);
+    PyTuple_SET_ITEM(ret, 0, obj);  /* steals the reference to obj */
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);  /* also releases item 0 stored above */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);  /* steals the reference to obj */
+#else
+    ret = PyArrayScalar_New(Int);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Int, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_divmod
+#undef IS_int
+
+
+#line 1217
+#define IS_uint
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_divmod
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "divmod"
+#endif
+
+static PyObject *
+uint_divmod(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_uint arg1, arg2, other_val;  /* (a, b) operands; `other` value */
+
+    /*
+     * nb_divmod for the UInt scalar.  Returns a 2-tuple of UInt scalars, or
+     * NotImplemented / the generic scalar result if `other` is not usable.
+     * `is_forward`: `a` is the UInt operand; no deferral to `b` is implied.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type is exactly UInt, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UInt);
+        assert(is_forward || PyArray_IsScalar(b, UInt));
+    }
+
+    /*
+     * `other` is the operand that is not known to be UInt.  Try to extract
+     * it into `other_val`; `res` reports whether that worked or which
+     * fallback applies, and `may_need_deferring` requests a check (via
+     * BINOP_GIVE_UP_IF_NEEDED) before any of the fallbacks is taken.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_uint(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_divmod, uint_divmod);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation, but it
+             * can be a backward one if the operation is undefined forward.
+             * For example, the complex remainder `complex % bool` defers
+             * even though complex would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value extracted successfully; proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object,
+             * including integers too large to convert to `long`), or even
+             * a subclass of a NumPy scalar (currently).
+             *
+             * Falls through to the generic (array) path in the next case.
+             * Only (c)longdouble returns early: it could recurse infinitely.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Either way, delegate to the generic scalar type's nb_divmod.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_divmod(a,b);
+        case CONVERT_PYSCALAR:  /* store `other` into other_val */
+            if (UINT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UInt);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UInt);
+    }
+
+    /*
+     * Result storage (`#if 1`: the two-output variant is the one compiled).
+     */
+    npy_uint out;
+#if 1
+    npy_uint out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call.
+     * For integer functions `retstatus` is the "floating point error"
+     * value; float functions should always return 0 and then use the
+     * following `npy_get_floatstatus_barrier`.  A nonzero status goes to
+     * PyUFunc_GiveFloatingpointErrors, which may make us return NULL.
+     */
+#if 1
+    int retstatus = uint_ctype_divmod(arg1, arg2, &out, &out2);
+#else
+    int retstatus = uint_ctype_divmod(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Compiled out: would merge the float status flags into retstatus */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 1
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out);
+    PyTuple_SET_ITEM(ret, 0, obj);  /* steals the reference to obj */
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);  /* also releases item 0 stored above */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);  /* steals the reference to obj */
+#else
+    ret = PyArrayScalar_New(UInt);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UInt, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_divmod
+#undef IS_uint
+
+
+#line 1217
+#define IS_long
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_divmod
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "divmod"
+#endif
+
+static PyObject *
+long_divmod(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_long arg1, arg2, other_val;  /* (a, b) operands; `other` value */
+
+    /*
+     * nb_divmod for the Long scalar.  Returns a 2-tuple of Long scalars, or
+     * NotImplemented / the generic scalar result if `other` is not usable.
+     * `is_forward`: `a` is the Long operand; no deferral to `b` is implied.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type is exactly Long, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Long);
+        assert(is_forward || PyArray_IsScalar(b, Long));
+    }
+
+    /*
+     * `other` is the operand that is not known to be Long.  Try to extract
+     * it into `other_val`; `res` reports whether that worked or which
+     * fallback applies, and `may_need_deferring` requests a check (via
+     * BINOP_GIVE_UP_IF_NEEDED) before any of the fallbacks is taken.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_long(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_divmod, long_divmod);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation, but it
+             * can be a backward one if the operation is undefined forward.
+             * For example, the complex remainder `complex % bool` defers
+             * even though complex would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value extracted successfully; proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object,
+             * including integers too large to convert to `long`), or even
+             * a subclass of a NumPy scalar (currently).
+             *
+             * Falls through to the generic (array) path in the next case.
+             * Only (c)longdouble returns early: it could recurse infinitely.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Either way, delegate to the generic scalar type's nb_divmod.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_divmod(a,b);
+        case CONVERT_PYSCALAR:  /* store `other` into other_val */
+            if (LONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Long);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Long);
+    }
+
+    /*
+     * Result storage (`#if 1`: the two-output variant is the one compiled).
+     */
+    npy_long out;
+#if 1
+    npy_long out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call.
+     * For integer functions `retstatus` is the "floating point error"
+     * value; float functions should always return 0 and then use the
+     * following `npy_get_floatstatus_barrier`.  A nonzero status goes to
+     * PyUFunc_GiveFloatingpointErrors, which may make us return NULL.
+     */
+#if 1
+    int retstatus = long_ctype_divmod(arg1, arg2, &out, &out2);
+#else
+    int retstatus = long_ctype_divmod(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Compiled out: would merge the float status flags into retstatus */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 1
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out);
+    PyTuple_SET_ITEM(ret, 0, obj);  /* steals the reference to obj */
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);  /* also releases item 0 stored above */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);  /* steals the reference to obj */
+#else
+    ret = PyArrayScalar_New(Long);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Long, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_divmod
+#undef IS_long
+
+
+#line 1217
+#define IS_ulong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_divmod
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "divmod"
+#endif
+
+static PyObject *
+ulong_divmod(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulong arg1, arg2, other_val;  /* (a, b) operands; `other` value */
+
+    /*
+     * nb_divmod for the ULong scalar.  Returns a 2-tuple of ULong scalars, or
+     * NotImplemented / the generic scalar result if `other` is not usable.
+     * `is_forward`: `a` is the ULong operand; no deferral to `b` is implied.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type is exactly ULong, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULong);
+        assert(is_forward || PyArray_IsScalar(b, ULong));
+    }
+
+    /*
+     * `other` is the operand that is not known to be ULong.  Try to extract
+     * it into `other_val`; `res` reports whether that worked or which
+     * fallback applies, and `may_need_deferring` requests a check (via
+     * BINOP_GIVE_UP_IF_NEEDED) before any of the fallbacks is taken.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_divmod, ulong_divmod);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation, but it
+             * can be a backward one if the operation is undefined forward.
+             * For example, the complex remainder `complex % bool` defers
+             * even though complex would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value extracted successfully; proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object,
+             * including integers too large to convert to `long`), or even
+             * a subclass of a NumPy scalar (currently).
+             *
+             * Falls through to the generic (array) path in the next case.
+             * Only (c)longdouble returns early: it could recurse infinitely.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Either way, delegate to the generic scalar type's nb_divmod.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_divmod(a,b);
+        case CONVERT_PYSCALAR:  /* store `other` into other_val */
+            if (ULONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULong);
+    }
+
+    /*
+     * Result storage (`#if 1`: the two-output variant is the one compiled).
+     */
+    npy_ulong out;
+#if 1
+    npy_ulong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call.
+     * For integer functions `retstatus` is the "floating point error"
+     * value; float functions should always return 0 and then use the
+     * following `npy_get_floatstatus_barrier`.  A nonzero status goes to
+     * PyUFunc_GiveFloatingpointErrors, which may make us return NULL.
+     */
+#if 1
+    int retstatus = ulong_ctype_divmod(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulong_ctype_divmod(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Compiled out: would merge the float status flags into retstatus */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 1
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);  /* steals the reference to obj */
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);  /* also releases item 0 stored above */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);  /* steals the reference to obj */
+#else
+    ret = PyArrayScalar_New(ULong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_divmod
+#undef IS_ulong
+
+
+#line 1217
+#define IS_longlong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_divmod
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "divmod"
+#endif
+
+static PyObject *
+longlong_divmod(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_longlong arg1, arg2, other_val;  /* (a, b) operands; `other` value */
+
+    /*
+     * nb_divmod for the LongLong scalar.  Returns a 2-tuple of LongLong
+     * scalars, or NotImplemented / the generic result if `other` is unusable.
+     * `is_forward`: `a` is the LongLong operand; no deferral to `b` implied.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type is exactly LongLong, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongLong);
+        assert(is_forward || PyArray_IsScalar(b, LongLong));
+    }
+
+    /*
+     * `other` is the operand that is not known to be LongLong.  Try to
+     * extract it into `other_val`; `res` reports whether that worked or which
+     * fallback applies, and `may_need_deferring` requests a check (via
+     * BINOP_GIVE_UP_IF_NEEDED) before any of the fallbacks is taken.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_longlong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_divmod, longlong_divmod);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation, but it
+             * can be a backward one if the operation is undefined forward.
+             * For example, the complex remainder `complex % bool` defers
+             * even though complex would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value extracted successfully; proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object,
+             * including integers too large to convert to `long`), or even
+             * a subclass of a NumPy scalar (currently).
+             *
+             * Falls through to the generic (array) path in the next case.
+             * Only (c)longdouble returns early: it could recurse infinitely.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Either way, delegate to the generic scalar type's nb_divmod.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_divmod(a,b);
+        case CONVERT_PYSCALAR:  /* store `other` into other_val */
+            if (LONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, LongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongLong);
+    }
+
+    /*
+     * Result storage (`#if 1`: the two-output variant is the one compiled).
+     */
+    npy_longlong out;
+#if 1
+    npy_longlong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call.
+     * For integer functions `retstatus` is the "floating point error"
+     * value; float functions should always return 0 and then use the
+     * following `npy_get_floatstatus_barrier`.  A nonzero status goes to
+     * PyUFunc_GiveFloatingpointErrors, which may make us return NULL.
+     */
+#if 1
+    int retstatus = longlong_ctype_divmod(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longlong_ctype_divmod(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Compiled out: would merge the float status flags into retstatus */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 1
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);  /* steals the reference to obj */
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);  /* also releases item 0 stored above */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);  /* steals the reference to obj */
+#else
+    ret = PyArrayScalar_New(LongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_divmod
+#undef IS_longlong
+
+
+#line 1217
+#define IS_ulonglong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_divmod
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "divmod"
+#endif
+
+static PyObject *
+ulonglong_divmod(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulonglong arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULongLong);
+        assert(is_forward || PyArray_IsScalar(b, ULongLong));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulonglong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_divmod, ulonglong_divmod);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_divmod(a,b);
+        case CONVERT_PYSCALAR:
+            if (ULONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULongLong);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ulonglong out;
+#if 1
+    npy_ulonglong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 1
+    int retstatus = ulonglong_ctype_divmod(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulonglong_ctype_divmod(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 1
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(ULongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_divmod
+#undef IS_ulonglong
+
+
+#line 1217
+#define IS_byte
+/* OP_NAME labels FP-error reports; "true_divide" is reported as "divide": */
+#define IS_floor_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "floor_divide"
+#endif
+
+static PyObject *
+byte_floor_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_byte arg1, arg2, other_val;
+
+    /*
+     * Floor division for two operands where at least one is a Byte scalar.
+     * `is_forward` is 1 when `a` is the Byte operand and 0 when only `b` is;
+     * it does not by itself mean that we may defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type matches exactly, so a subclass is involved */
+        is_forward = PyArray_IsScalar(a, Byte);
+        assert(is_forward || PyArray_IsScalar(b, Byte));
+    }
+
+    /*
+     * `other` is the operand not already known to be a Byte.  Try to convert
+     * it into `other_val`; if that is not directly possible, the returned
+     * `conversion_result` tells the switch below how to proceed.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_byte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_floor_divide, byte_floor_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to `other` (return NotImplemented).  Normally this is a
+             * forward operation, but it may be a backward one if the operation
+             * is undefined forward: e.g. the complex remainder `complex % bool`
+             * defers even though complex would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value extracted successfully, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally we fall through to the array path below on purpose;
+             * only (c)longdouble returns early, to avoid infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by the generic scalar type's nb_floor_divide slot.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_floor_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val via BYTE_setitem */
+            if (BYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Byte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Byte);
+    }
+
+    /*
+     * Result storage; `out2`/`obj` belong to the disabled two-result variant.
+     */
+    npy_byte out;
+#if 0
+    npy_byte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value reported by integer
+     * functions; a nonzero value is handed to
+     * PyUFunc_GiveFloatingpointErrors below.  Float functions should always
+     * return 0 and use the (here disabled) `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = byte_ctype_floor_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = byte_ctype_floor_divide(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Byte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Byte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_floor_divide
+#undef IS_byte
+
+
+#line 1217
+#define IS_ubyte
+/* OP_NAME labels FP-error reports; "true_divide" is reported as "divide": */
+#define IS_floor_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "floor_divide"
+#endif
+
+static PyObject *
+ubyte_floor_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ubyte arg1, arg2, other_val;
+
+    /*
+     * Floor division for two operands where at least one is a UByte scalar.
+     * `is_forward` is 1 when `a` is the UByte operand and 0 when only `b` is;
+     * it does not by itself mean that we may defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type matches exactly, so a subclass is involved */
+        is_forward = PyArray_IsScalar(a, UByte);
+        assert(is_forward || PyArray_IsScalar(b, UByte));
+    }
+
+    /*
+     * `other` is the operand not already known to be a UByte.  Try to convert
+     * it into `other_val`; if that is not directly possible, the returned
+     * `conversion_result` tells the switch below how to proceed.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ubyte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_floor_divide, ubyte_floor_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to `other` (return NotImplemented).  Normally this is a
+             * forward operation, but it may be a backward one if the operation
+             * is undefined forward: e.g. the complex remainder `complex % bool`
+             * defers even though complex would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value extracted successfully, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally we fall through to the array path below on purpose;
+             * only (c)longdouble returns early, to avoid infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by the generic scalar type's nb_floor_divide slot.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_floor_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val via UBYTE_setitem */
+            if (UBYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UByte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UByte);
+    }
+
+    /*
+     * Result storage; `out2`/`obj` belong to the disabled two-result variant.
+     */
+    npy_ubyte out;
+#if 0
+    npy_ubyte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value reported by integer
+     * functions; a nonzero value is handed to
+     * PyUFunc_GiveFloatingpointErrors below.  Float functions should always
+     * return 0 and use the (here disabled) `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ubyte_ctype_floor_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ubyte_ctype_floor_divide(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UByte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UByte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_floor_divide
+#undef IS_ubyte
+
+
+#line 1217
+#define IS_short
+/* OP_NAME labels FP-error reports; "true_divide" is reported as "divide": */
+#define IS_floor_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "floor_divide"
+#endif
+
+static PyObject *
+short_floor_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_short arg1, arg2, other_val;
+
+    /*
+     * Floor division for two operands where at least one is a Short scalar.
+     * `is_forward` is 1 when `a` is the Short operand and 0 when only `b` is;
+     * it does not by itself mean that we may defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type matches exactly, so a subclass is involved */
+        is_forward = PyArray_IsScalar(a, Short);
+        assert(is_forward || PyArray_IsScalar(b, Short));
+    }
+
+    /*
+     * `other` is the operand not already known to be a Short.  Try to convert
+     * it into `other_val`; if that is not directly possible, the returned
+     * `conversion_result` tells the switch below how to proceed.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_short(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_floor_divide, short_floor_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to `other` (return NotImplemented).  Normally this is a
+             * forward operation, but it may be a backward one if the operation
+             * is undefined forward: e.g. the complex remainder `complex % bool`
+             * defers even though complex would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value extracted successfully, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally we fall through to the array path below on purpose;
+             * only (c)longdouble returns early, to avoid infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by the generic scalar type's nb_floor_divide slot.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_floor_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val via SHORT_setitem */
+            if (SHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Short);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Short);
+    }
+
+    /*
+     * Result storage; `out2`/`obj` belong to the disabled two-result variant.
+     */
+    npy_short out;
+#if 0
+    npy_short out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value reported by integer
+     * functions; a nonzero value is handed to
+     * PyUFunc_GiveFloatingpointErrors below.  Float functions should always
+     * return 0 and use the (here disabled) `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = short_ctype_floor_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = short_ctype_floor_divide(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Short);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Short, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_floor_divide
+#undef IS_short
+
+
+#line 1217
+#define IS_ushort
+/* OP_NAME labels FP-error reports; "true_divide" is reported as "divide": */
+#define IS_floor_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "floor_divide"
+#endif
+
+static PyObject *
+ushort_floor_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ushort arg1, arg2, other_val;
+
+    /*
+     * Floor division for two operands where at least one is a UShort scalar.
+     * `is_forward` is 1 when `a` is the UShort operand and 0 when only `b` is;
+     * it does not by itself mean that we may defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type matches exactly, so a subclass is involved */
+        is_forward = PyArray_IsScalar(a, UShort);
+        assert(is_forward || PyArray_IsScalar(b, UShort));
+    }
+
+    /*
+     * `other` is the operand not already known to be a UShort.  Try to convert
+     * it into `other_val`; if that is not directly possible, the returned
+     * `conversion_result` tells the switch below how to proceed.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ushort(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_floor_divide, ushort_floor_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to `other` (return NotImplemented).  Normally this is a
+             * forward operation, but it may be a backward one if the operation
+             * is undefined forward: e.g. the complex remainder `complex % bool`
+             * defers even though complex would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value extracted successfully, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally we fall through to the array path below on purpose;
+             * only (c)longdouble returns early, to avoid infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by the generic scalar type's nb_floor_divide slot.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_floor_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val via USHORT_setitem */
+            if (USHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UShort);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UShort);
+    }
+
+    /*
+     * Result storage; `out2`/`obj` belong to the disabled two-result variant.
+     */
+    npy_ushort out;
+#if 0
+    npy_ushort out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value reported by integer
+     * functions; a nonzero value is handed to
+     * PyUFunc_GiveFloatingpointErrors below.  Float functions should always
+     * return 0 and use the (here disabled) `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ushort_ctype_floor_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ushort_ctype_floor_divide(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UShort);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UShort, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_floor_divide
+#undef IS_ushort
+
+
+#line 1217
+#define IS_int
+/* OP_NAME labels FP-error reports; "true_divide" is reported as "divide": */
+#define IS_floor_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "floor_divide"
+#endif
+
+static PyObject *
+int_floor_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_int arg1, arg2, other_val;
+
+    /*
+     * Floor division for two operands where at least one is an Int scalar.
+     * `is_forward` is 1 when `a` is the Int operand and 0 when only `b` is;
+     * it does not by itself mean that we may defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type matches exactly, so a subclass is involved */
+        is_forward = PyArray_IsScalar(a, Int);
+        assert(is_forward || PyArray_IsScalar(b, Int));
+    }
+
+    /*
+     * `other` is the operand not already known to be an Int.  Try to convert
+     * it into `other_val`; if that is not directly possible, the returned
+     * `conversion_result` tells the switch below how to proceed.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_int(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_floor_divide, int_floor_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to `other` (return NotImplemented).  Normally this is a
+             * forward operation, but it may be a backward one if the operation
+             * is undefined forward: e.g. the complex remainder `complex % bool`
+             * defers even though complex would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value extracted successfully, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally we fall through to the array path below on purpose;
+             * only (c)longdouble returns early, to avoid infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by the generic scalar type's nb_floor_divide slot.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_floor_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val via INT_setitem */
+            if (INT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Int);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Int);
+    }
+
+    /*
+     * Result storage; `out2`/`obj` belong to the disabled two-result variant.
+     */
+    npy_int out;
+#if 0
+    npy_int out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value reported by integer
+     * functions; a nonzero value is handed to
+     * PyUFunc_GiveFloatingpointErrors below.  Float functions should always
+     * return 0 and use the (here disabled) `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = int_ctype_floor_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = int_ctype_floor_divide(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Int);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Int, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_floor_divide
+#undef IS_int
+
+
+#line 1217
+#define IS_uint
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_floor_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "floor_divide"
+#endif
+
+static PyObject *
+uint_floor_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_uint arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UInt);
+        assert(is_forward || PyArray_IsScalar(b, UInt));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_uint(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_floor_divide, uint_floor_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_floor_divide(a,b);
+        case CONVERT_PYSCALAR:
+            if (UINT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UInt);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UInt);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_uint out;
+#if 0
+    npy_uint out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = uint_ctype_floor_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = uint_ctype_floor_divide(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UInt);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UInt, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_floor_divide
+#undef IS_uint
+
+
+#line 1217
+#define IS_long
+/*
+ * Mark the operation being defined, then pick OP_NAME, the name used in
+ * floating point warnings (drops the "true_" from "true_divide"):
+ */
+#define IS_floor_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "floor_divide"
+#endif
+
+static PyObject *
+long_floor_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_long arg1, arg2, other_val;  /* other_val: converted other operand */
+
+    /*
+     * Binary `floor_divide` for the Long scalar type.  One of `a`/`b` is
+     * expected to be a Long scalar (asserted below); the other one is
+     * converted with `convert_to_long`.  Depending on that conversion this
+     * returns a new Long scalar, NotImplemented, the result of the generic
+     * scalar slot, or NULL.
+     *
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;  /* 1: `a` supplies the Long value, 0: `b` does */
+    if (Py_TYPE(a) == &PyLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Long);
+        assert(is_forward || PyArray_IsScalar(b, Long));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * `may_need_deferring` is filled in by the converter; when set,
+     * BINOP_GIVE_UP_IF_NEEDED runs before the result is acted on
+     * (NOTE(review): macro defined elsewhere; presumably it can return
+     * NotImplemented from this function -- confirm).
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_long(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_floor_divide, long_floor_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * When neither IS_longdouble nor IS_clongdouble is defined, the
+             * `#if` below is empty and this case deliberately falls through
+             * to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * The original operands are handed to the generic scalar type's
+             * `nb_floor_divide` slot and its result is returned as-is.
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_floor_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val from `other` via LONG_setitem */
+            if (LONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* float status clearing is compiled out here */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* a OP other */
+        arg1 = PyArrayScalar_VAL(a, Long);
+        arg2 = other_val;
+    }
+    else {  /* other OP b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Long);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_long out;
+#if 0  /* second output, compiled out */
+    npy_long out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* two-output call, compiled out */
+    int retstatus = long_ctype_floor_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = long_ctype_floor_divide(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: report it; a negative reply aborts */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* tuple-of-two result, compiled out */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active path: wrap `out` in a new Long scalar */
+    ret = PyArrayScalar_New(Long);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Long, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_floor_divide
+#undef IS_long
+
+
+#line 1217
+#define IS_ulong
+/*
+ * Mark the operation being defined, then pick OP_NAME, the name used in
+ * floating point warnings (drops the "true_" from "true_divide"):
+ */
+#define IS_floor_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "floor_divide"
+#endif
+
+static PyObject *
+ulong_floor_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulong arg1, arg2, other_val;  /* other_val: converted other operand */
+
+    /*
+     * Binary `floor_divide` for the ULong scalar type.  One of `a`/`b` is
+     * expected to be a ULong scalar (asserted below); the other one is
+     * converted with `convert_to_ulong`.  Depending on that conversion this
+     * returns a new ULong scalar, NotImplemented, the result of the generic
+     * scalar slot, or NULL.
+     *
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;  /* 1: `a` supplies the ULong value, 0: `b` does */
+    if (Py_TYPE(a) == &PyULongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULong);
+        assert(is_forward || PyArray_IsScalar(b, ULong));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * `may_need_deferring` is filled in by the converter; when set,
+     * BINOP_GIVE_UP_IF_NEEDED runs before the result is acted on
+     * (NOTE(review): macro defined elsewhere; presumably it can return
+     * NotImplemented from this function -- confirm).
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_floor_divide, ulong_floor_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * When neither IS_longdouble nor IS_clongdouble is defined, the
+             * `#if` below is empty and this case deliberately falls through
+             * to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * The original operands are handed to the generic scalar type's
+             * `nb_floor_divide` slot and its result is returned as-is.
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_floor_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val from `other` via ULONG_setitem */
+            if (ULONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* float status clearing is compiled out here */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* a OP other */
+        arg1 = PyArrayScalar_VAL(a, ULong);
+        arg2 = other_val;
+    }
+    else {  /* other OP b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULong);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ulong out;
+#if 0  /* second output, compiled out */
+    npy_ulong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* two-output call, compiled out */
+    int retstatus = ulong_ctype_floor_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulong_ctype_floor_divide(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: report it; a negative reply aborts */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* tuple-of-two result, compiled out */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active path: wrap `out` in a new ULong scalar */
+    ret = PyArrayScalar_New(ULong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_floor_divide
+#undef IS_ulong
+
+
+#line 1217
+#define IS_longlong
+/*
+ * Mark the operation being defined, then pick OP_NAME, the name used in
+ * floating point warnings (drops the "true_" from "true_divide"):
+ */
+#define IS_floor_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "floor_divide"
+#endif
+
+static PyObject *
+longlong_floor_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_longlong arg1, arg2, other_val;  /* other_val: converted other operand */
+
+    /*
+     * Binary `floor_divide` for the LongLong scalar type.  One of `a`/`b`
+     * is expected to be a LongLong scalar (asserted below); the other one
+     * is converted with `convert_to_longlong`.  Depending on that conversion
+     * this returns a new LongLong scalar, NotImplemented, the result of the
+     * generic scalar slot, or NULL.
+     *
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;  /* 1: `a` supplies the LongLong value, 0: `b` does */
+    if (Py_TYPE(a) == &PyLongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongLong);
+        assert(is_forward || PyArray_IsScalar(b, LongLong));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * `may_need_deferring` is filled in by the converter; when set,
+     * BINOP_GIVE_UP_IF_NEEDED runs before the result is acted on
+     * (NOTE(review): macro defined elsewhere; presumably it can return
+     * NotImplemented from this function -- confirm).
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_longlong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_floor_divide, longlong_floor_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * When neither IS_longdouble nor IS_clongdouble is defined, the
+             * `#if` below is empty and this case deliberately falls through
+             * to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * The original operands are handed to the generic scalar type's
+             * `nb_floor_divide` slot and its result is returned as-is.
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_floor_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val from `other` via LONGLONG_setitem */
+            if (LONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* float status clearing is compiled out here */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* a OP other */
+        arg1 = PyArrayScalar_VAL(a, LongLong);
+        arg2 = other_val;
+    }
+    else {  /* other OP b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongLong);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_longlong out;
+#if 0  /* second output, compiled out */
+    npy_longlong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* two-output call, compiled out */
+    int retstatus = longlong_ctype_floor_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longlong_ctype_floor_divide(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: report it; a negative reply aborts */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* tuple-of-two result, compiled out */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active path: wrap `out` in a new LongLong scalar */
+    ret = PyArrayScalar_New(LongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_floor_divide
+#undef IS_longlong
+
+
+#line 1217
+#define IS_ulonglong
+/*
+ * Mark the operation being defined, then pick OP_NAME, the name used in
+ * floating point warnings (drops the "true_" from "true_divide"):
+ */
+#define IS_floor_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "floor_divide"
+#endif
+
+static PyObject *
+ulonglong_floor_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulonglong arg1, arg2, other_val;  /* other_val: converted other operand */
+
+    /*
+     * Binary `floor_divide` for the ULongLong scalar type.  One of `a`/`b`
+     * is expected to be a ULongLong scalar (asserted below); the other one
+     * is converted with `convert_to_ulonglong`.  Depending on that
+     * conversion this returns a new ULongLong scalar, NotImplemented, the
+     * result of the generic scalar slot, or NULL.
+     *
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;  /* 1: `a` supplies the ULongLong value, 0: `b` does */
+    if (Py_TYPE(a) == &PyULongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULongLong);
+        assert(is_forward || PyArray_IsScalar(b, ULongLong));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * `may_need_deferring` is filled in by the converter; when set,
+     * BINOP_GIVE_UP_IF_NEEDED runs before the result is acted on
+     * (NOTE(review): macro defined elsewhere; presumably it can return
+     * NotImplemented from this function -- confirm).
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulonglong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_floor_divide, ulonglong_floor_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * When neither IS_longdouble nor IS_clongdouble is defined, the
+             * `#if` below is empty and this case deliberately falls through
+             * to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * The original operands are handed to the generic scalar type's
+             * `nb_floor_divide` slot and its result is returned as-is.
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_floor_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val from `other` via ULONGLONG_setitem */
+            if (ULONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* float status clearing is compiled out here */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* a OP other */
+        arg1 = PyArrayScalar_VAL(a, ULongLong);
+        arg2 = other_val;
+    }
+    else {  /* other OP b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULongLong);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ulonglong out;
+#if 0  /* second output, compiled out */
+    npy_ulonglong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* two-output call, compiled out */
+    int retstatus = ulonglong_ctype_floor_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulonglong_ctype_floor_divide(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: report it; a negative reply aborts */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* tuple-of-two result, compiled out */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active path: wrap `out` in a new ULongLong scalar */
+    ret = PyArrayScalar_New(ULongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_floor_divide
+#undef IS_ulonglong
+
+
+#line 1217
+#define IS_byte
+/*
+ * Mark the operation being defined, then pick OP_NAME, the name used in
+ * floating point warnings (drops the "true_" from "true_divide"):
+ */
+#define IS_lshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "lshift"
+#endif
+
+static PyObject *
+byte_lshift(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_byte arg1, arg2, other_val;  /* other_val: converted other operand */
+
+    /*
+     * Binary `lshift` for the Byte scalar type.  One of `a`/`b` is
+     * expected to be a Byte scalar (asserted below); the other one is
+     * converted with `convert_to_byte`.  Depending on that conversion this
+     * returns a new Byte scalar, NotImplemented, the result of the generic
+     * scalar slot, or NULL.
+     *
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;  /* 1: `a` supplies the Byte value, 0: `b` does */
+    if (Py_TYPE(a) == &PyByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Byte);
+        assert(is_forward || PyArray_IsScalar(b, Byte));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * `may_need_deferring` is filled in by the converter; when set,
+     * BINOP_GIVE_UP_IF_NEEDED runs before the result is acted on
+     * (NOTE(review): macro defined elsewhere; presumably it can return
+     * NotImplemented from this function -- confirm).
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_byte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_lshift, byte_lshift);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * When neither IS_longdouble nor IS_clongdouble is defined, the
+             * `#if` below is empty and this case deliberately falls through
+             * to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * The original operands are handed to the generic scalar type's
+             * `nb_lshift` slot and its result is returned as-is.
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_lshift(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val from `other` via BYTE_setitem */
+            if (BYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* float status clearing is compiled out here */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* a OP other */
+        arg1 = PyArrayScalar_VAL(a, Byte);
+        arg2 = other_val;
+    }
+    else {  /* other OP b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Byte);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_byte out;
+#if 0  /* second output, compiled out */
+    npy_byte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* two-output call, compiled out */
+    int retstatus = byte_ctype_lshift(arg1, arg2, &out, &out2);
+#else
+    int retstatus = byte_ctype_lshift(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: report it; a negative reply aborts */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* tuple-of-two result, compiled out */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active path: wrap `out` in a new Byte scalar */
+    ret = PyArrayScalar_New(Byte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Byte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_lshift
+#undef IS_byte
+
+
+#line 1217
+#define IS_ubyte
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_lshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "lshift"
+#endif
+
+static PyObject *
+ubyte_lshift(PyObject *a, PyObject *b)
+{
+    /*
+     * `nb_lshift` handler for `ubyte` scalars.
+     *
+     * Returns a new UByte scalar holding the value produced by
+     * `ubyte_ctype_lshift`, the result of the generic scalar `nb_lshift`
+     * when the operands cannot be handled here, `Py_NotImplemented` when
+     * deferring to the other operand, or NULL when a callee reports failure.
+     */
+
+    /*
+     * Work out which operand this method was selected for.  An exact type
+     * match on `a` makes the call forward, an exact match on `b` makes it
+     * reflected.  `forward` only says that `a` fits this method; it does
+     * not imply that we can defer to a subclass `b`.
+     */
+    int forward = (Py_TYPE(a) == &PyUByteArrType_Type);
+    if (!forward && Py_TYPE(b) != &PyUByteArrType_Type) {
+        /* neither operand is exactly UByte, so subclasses are involved */
+        forward = PyArray_IsScalar(a, UByte);
+        assert(forward || PyArray_IsScalar(b, UByte));
+    }
+    PyObject *operand = forward ? b : a;
+
+    /*
+     * Try to extract the other operand as a C value of our own type.  If
+     * that is not directly possible, `how` tells us what to do instead.
+     */
+    npy_ubyte operand_val;
+    npy_bool defer_possible;
+    conversion_result how = convert_to_ubyte(
+            operand, &operand_val, &defer_possible);
+    if (how == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (defer_possible) {
+        /* may return NotImplemented from this function */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_lshift, ubyte_lshift);
+    }
+
+    if (how == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        /*
+         * Hand over to the other scalar.  Usually this is a forward call,
+         * but it can be a reflected one when the operation is undefined in
+         * the forward direction (e.g. `complex % bool` defers although it
+         * would normally handle the operation).
+         */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+    if (how == OTHER_IS_UNKNOWN_OBJECT) {
+        /* the array path can recurse infinitely for (c)longdouble */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+#endif
+    if (how == OTHER_IS_UNKNOWN_OBJECT || how == PROMOTION_REQUIRED) {
+        /*
+         * Unknown object: an array-like, an unknown scalar (any Python
+         * object, including integers too large for `long`), or currently
+         * even a subclass of a NumPy scalar.
+         * Promotion: a Python scalar larger than this type, or two NumPy
+         * scalars that promote to a third (uint16 + int16 -> int32).
+         *
+         * Both go through the generic (array) path.
+         *
+         * TODO: Promotion could be special cased here for much better
+         *       speed and correct integer overflow warnings
+         *       (e.g. `uint8 * int8` cannot warn).
+         */
+        return PyGenericArrType_Type.tp_as_number->nb_lshift(a, b);
+    }
+    if (how == CONVERT_PYSCALAR) {
+        if (UBYTE_setitem(operand, (char *)&operand_val, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (how != CONVERSION_SUCCESS) {
+        assert(0);  /* error was checked already, impossible to reach */
+        return NULL;
+    }
+
+    /* Put the two values back into `a << b` order. */
+    npy_ubyte lhs = forward ? PyArrayScalar_VAL(a, UByte) : operand_val;
+    npy_ubyte rhs = forward ? operand_val : PyArrayScalar_VAL(b, UByte);
+
+    /*
+     * Do the actual calculation.  For integer functions the return value
+     * is the "floating point error" status; non-zero means we have to look
+     * up how the error should be handled.
+     */
+    npy_ubyte shifted;
+    int status = ubyte_ctype_lshift(lhs, rhs, &shifted);
+    if (status && PyUFunc_GiveFloatingpointErrors(
+                "scalar " OP_NAME, status) < 0) {
+        return NULL;
+    }
+
+    /* Wrap the C value in a freshly allocated scalar (NULL on failure). */
+    PyObject *result = PyArrayScalar_New(UByte);
+    if (result != NULL) {
+        PyArrayScalar_ASSIGN(result, UByte, shifted);
+    }
+    return result;
+}
+
+
+#undef OP_NAME
+#undef IS_lshift
+#undef IS_ubyte
+
+
+#line 1217
+#define IS_short
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_lshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "lshift"
+#endif
+
+static PyObject *
+short_lshift(PyObject *a, PyObject *b)
+{
+    /*
+     * `nb_lshift` handler for `short` scalars.
+     *
+     * Returns a new Short scalar holding the value produced by
+     * `short_ctype_lshift`, the result of the generic scalar `nb_lshift`
+     * when the operands cannot be handled here, `Py_NotImplemented` when
+     * deferring to the other operand, or NULL when a callee reports failure.
+     */
+
+    /*
+     * Work out which operand this method was selected for.  An exact type
+     * match on `a` makes the call forward, an exact match on `b` makes it
+     * reflected.  `forward` only says that `a` fits this method; it does
+     * not imply that we can defer to a subclass `b`.
+     */
+    int forward = (Py_TYPE(a) == &PyShortArrType_Type);
+    if (!forward && Py_TYPE(b) != &PyShortArrType_Type) {
+        /* neither operand is exactly Short, so subclasses are involved */
+        forward = PyArray_IsScalar(a, Short);
+        assert(forward || PyArray_IsScalar(b, Short));
+    }
+    PyObject *operand = forward ? b : a;
+
+    /*
+     * Try to extract the other operand as a C value of our own type.  If
+     * that is not directly possible, `how` tells us what to do instead.
+     */
+    npy_short operand_val;
+    npy_bool defer_possible;
+    conversion_result how = convert_to_short(
+            operand, &operand_val, &defer_possible);
+    if (how == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (defer_possible) {
+        /* may return NotImplemented from this function */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_lshift, short_lshift);
+    }
+
+    if (how == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        /*
+         * Hand over to the other scalar.  Usually this is a forward call,
+         * but it can be a reflected one when the operation is undefined in
+         * the forward direction (e.g. `complex % bool` defers although it
+         * would normally handle the operation).
+         */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+    if (how == OTHER_IS_UNKNOWN_OBJECT) {
+        /* the array path can recurse infinitely for (c)longdouble */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+#endif
+    if (how == OTHER_IS_UNKNOWN_OBJECT || how == PROMOTION_REQUIRED) {
+        /*
+         * Unknown object: an array-like, an unknown scalar (any Python
+         * object, including integers too large for `long`), or currently
+         * even a subclass of a NumPy scalar.
+         * Promotion: a Python scalar larger than this type, or two NumPy
+         * scalars that promote to a third (uint16 + int16 -> int32).
+         *
+         * Both go through the generic (array) path.
+         *
+         * TODO: Promotion could be special cased here for much better
+         *       speed and correct integer overflow warnings
+         *       (e.g. `uint8 * int8` cannot warn).
+         */
+        return PyGenericArrType_Type.tp_as_number->nb_lshift(a, b);
+    }
+    if (how == CONVERT_PYSCALAR) {
+        if (SHORT_setitem(operand, (char *)&operand_val, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (how != CONVERSION_SUCCESS) {
+        assert(0);  /* error was checked already, impossible to reach */
+        return NULL;
+    }
+
+    /* Put the two values back into `a << b` order. */
+    npy_short lhs = forward ? PyArrayScalar_VAL(a, Short) : operand_val;
+    npy_short rhs = forward ? operand_val : PyArrayScalar_VAL(b, Short);
+
+    /*
+     * Do the actual calculation.  For integer functions the return value
+     * is the "floating point error" status; non-zero means we have to look
+     * up how the error should be handled.
+     */
+    npy_short shifted;
+    int status = short_ctype_lshift(lhs, rhs, &shifted);
+    if (status && PyUFunc_GiveFloatingpointErrors(
+                "scalar " OP_NAME, status) < 0) {
+        return NULL;
+    }
+
+    /* Wrap the C value in a freshly allocated scalar (NULL on failure). */
+    PyObject *result = PyArrayScalar_New(Short);
+    if (result != NULL) {
+        PyArrayScalar_ASSIGN(result, Short, shifted);
+    }
+    return result;
+}
+
+
+#undef OP_NAME
+#undef IS_lshift
+#undef IS_short
+
+
+#line 1217
+#define IS_ushort
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_lshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "lshift"
+#endif
+
+static PyObject *
+ushort_lshift(PyObject *a, PyObject *b)
+{
+    /*
+     * `nb_lshift` handler for `ushort` scalars.
+     *
+     * Returns a new UShort scalar holding the value produced by
+     * `ushort_ctype_lshift`, the result of the generic scalar `nb_lshift`
+     * when the operands cannot be handled here, `Py_NotImplemented` when
+     * deferring to the other operand, or NULL when a callee reports failure.
+     */
+
+    /*
+     * Work out which operand this method was selected for.  An exact type
+     * match on `a` makes the call forward, an exact match on `b` makes it
+     * reflected.  `forward` only says that `a` fits this method; it does
+     * not imply that we can defer to a subclass `b`.
+     */
+    int forward = (Py_TYPE(a) == &PyUShortArrType_Type);
+    if (!forward && Py_TYPE(b) != &PyUShortArrType_Type) {
+        /* neither operand is exactly UShort, so subclasses are involved */
+        forward = PyArray_IsScalar(a, UShort);
+        assert(forward || PyArray_IsScalar(b, UShort));
+    }
+    PyObject *operand = forward ? b : a;
+
+    /*
+     * Try to extract the other operand as a C value of our own type.  If
+     * that is not directly possible, `how` tells us what to do instead.
+     */
+    npy_ushort operand_val;
+    npy_bool defer_possible;
+    conversion_result how = convert_to_ushort(
+            operand, &operand_val, &defer_possible);
+    if (how == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (defer_possible) {
+        /* may return NotImplemented from this function */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_lshift, ushort_lshift);
+    }
+
+    if (how == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        /*
+         * Hand over to the other scalar.  Usually this is a forward call,
+         * but it can be a reflected one when the operation is undefined in
+         * the forward direction (e.g. `complex % bool` defers although it
+         * would normally handle the operation).
+         */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+    if (how == OTHER_IS_UNKNOWN_OBJECT) {
+        /* the array path can recurse infinitely for (c)longdouble */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+#endif
+    if (how == OTHER_IS_UNKNOWN_OBJECT || how == PROMOTION_REQUIRED) {
+        /*
+         * Unknown object: an array-like, an unknown scalar (any Python
+         * object, including integers too large for `long`), or currently
+         * even a subclass of a NumPy scalar.
+         * Promotion: a Python scalar larger than this type, or two NumPy
+         * scalars that promote to a third (uint16 + int16 -> int32).
+         *
+         * Both go through the generic (array) path.
+         *
+         * TODO: Promotion could be special cased here for much better
+         *       speed and correct integer overflow warnings
+         *       (e.g. `uint8 * int8` cannot warn).
+         */
+        return PyGenericArrType_Type.tp_as_number->nb_lshift(a, b);
+    }
+    if (how == CONVERT_PYSCALAR) {
+        if (USHORT_setitem(operand, (char *)&operand_val, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (how != CONVERSION_SUCCESS) {
+        assert(0);  /* error was checked already, impossible to reach */
+        return NULL;
+    }
+
+    /* Put the two values back into `a << b` order. */
+    npy_ushort lhs = forward ? PyArrayScalar_VAL(a, UShort) : operand_val;
+    npy_ushort rhs = forward ? operand_val : PyArrayScalar_VAL(b, UShort);
+
+    /*
+     * Do the actual calculation.  For integer functions the return value
+     * is the "floating point error" status; non-zero means we have to look
+     * up how the error should be handled.
+     */
+    npy_ushort shifted;
+    int status = ushort_ctype_lshift(lhs, rhs, &shifted);
+    if (status && PyUFunc_GiveFloatingpointErrors(
+                "scalar " OP_NAME, status) < 0) {
+        return NULL;
+    }
+
+    /* Wrap the C value in a freshly allocated scalar (NULL on failure). */
+    PyObject *result = PyArrayScalar_New(UShort);
+    if (result != NULL) {
+        PyArrayScalar_ASSIGN(result, UShort, shifted);
+    }
+    return result;
+}
+
+
+#undef OP_NAME
+#undef IS_lshift
+#undef IS_ushort
+
+
+#line 1217
+#define IS_int
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_lshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "lshift"
+#endif
+
+static PyObject *
+int_lshift(PyObject *a, PyObject *b)
+{
+    /*
+     * `nb_lshift` handler for `int` scalars.
+     *
+     * Returns a new Int scalar holding the value produced by
+     * `int_ctype_lshift`, the result of the generic scalar `nb_lshift`
+     * when the operands cannot be handled here, `Py_NotImplemented` when
+     * deferring to the other operand, or NULL when a callee reports failure.
+     */
+
+    /*
+     * Work out which operand this method was selected for.  An exact type
+     * match on `a` makes the call forward, an exact match on `b` makes it
+     * reflected.  `forward` only says that `a` fits this method; it does
+     * not imply that we can defer to a subclass `b`.
+     */
+    int forward = (Py_TYPE(a) == &PyIntArrType_Type);
+    if (!forward && Py_TYPE(b) != &PyIntArrType_Type) {
+        /* neither operand is exactly Int, so subclasses are involved */
+        forward = PyArray_IsScalar(a, Int);
+        assert(forward || PyArray_IsScalar(b, Int));
+    }
+    PyObject *operand = forward ? b : a;
+
+    /*
+     * Try to extract the other operand as a C value of our own type.  If
+     * that is not directly possible, `how` tells us what to do instead.
+     */
+    npy_int operand_val;
+    npy_bool defer_possible;
+    conversion_result how = convert_to_int(
+            operand, &operand_val, &defer_possible);
+    if (how == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (defer_possible) {
+        /* may return NotImplemented from this function */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_lshift, int_lshift);
+    }
+
+    if (how == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        /*
+         * Hand over to the other scalar.  Usually this is a forward call,
+         * but it can be a reflected one when the operation is undefined in
+         * the forward direction (e.g. `complex % bool` defers although it
+         * would normally handle the operation).
+         */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+    if (how == OTHER_IS_UNKNOWN_OBJECT) {
+        /* the array path can recurse infinitely for (c)longdouble */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+#endif
+    if (how == OTHER_IS_UNKNOWN_OBJECT || how == PROMOTION_REQUIRED) {
+        /*
+         * Unknown object: an array-like, an unknown scalar (any Python
+         * object, including integers too large for `long`), or currently
+         * even a subclass of a NumPy scalar.
+         * Promotion: a Python scalar larger than this type, or two NumPy
+         * scalars that promote to a third (uint16 + int16 -> int32).
+         *
+         * Both go through the generic (array) path.
+         *
+         * TODO: Promotion could be special cased here for much better
+         *       speed and correct integer overflow warnings
+         *       (e.g. `uint8 * int8` cannot warn).
+         */
+        return PyGenericArrType_Type.tp_as_number->nb_lshift(a, b);
+    }
+    if (how == CONVERT_PYSCALAR) {
+        if (INT_setitem(operand, (char *)&operand_val, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (how != CONVERSION_SUCCESS) {
+        assert(0);  /* error was checked already, impossible to reach */
+        return NULL;
+    }
+
+    /* Put the two values back into `a << b` order. */
+    npy_int lhs = forward ? PyArrayScalar_VAL(a, Int) : operand_val;
+    npy_int rhs = forward ? operand_val : PyArrayScalar_VAL(b, Int);
+
+    /*
+     * Do the actual calculation.  For integer functions the return value
+     * is the "floating point error" status; non-zero means we have to look
+     * up how the error should be handled.
+     */
+    npy_int shifted;
+    int status = int_ctype_lshift(lhs, rhs, &shifted);
+    if (status && PyUFunc_GiveFloatingpointErrors(
+                "scalar " OP_NAME, status) < 0) {
+        return NULL;
+    }
+
+    /* Wrap the C value in a freshly allocated scalar (NULL on failure). */
+    PyObject *result = PyArrayScalar_New(Int);
+    if (result != NULL) {
+        PyArrayScalar_ASSIGN(result, Int, shifted);
+    }
+    return result;
+}
+
+
+#undef OP_NAME
+#undef IS_lshift
+#undef IS_int
+
+
+#line 1217
+#define IS_uint
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_lshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "lshift"
+#endif
+
+static PyObject *
+uint_lshift(PyObject *a, PyObject *b)
+{
+    /*
+     * `nb_lshift` handler for `uint` scalars.
+     *
+     * Returns a new UInt scalar holding the value produced by
+     * `uint_ctype_lshift`, the result of the generic scalar `nb_lshift`
+     * when the operands cannot be handled here, `Py_NotImplemented` when
+     * deferring to the other operand, or NULL when a callee reports failure.
+     */
+
+    /*
+     * Work out which operand this method was selected for.  An exact type
+     * match on `a` makes the call forward, an exact match on `b` makes it
+     * reflected.  `forward` only says that `a` fits this method; it does
+     * not imply that we can defer to a subclass `b`.
+     */
+    int forward = (Py_TYPE(a) == &PyUIntArrType_Type);
+    if (!forward && Py_TYPE(b) != &PyUIntArrType_Type) {
+        /* neither operand is exactly UInt, so subclasses are involved */
+        forward = PyArray_IsScalar(a, UInt);
+        assert(forward || PyArray_IsScalar(b, UInt));
+    }
+    PyObject *operand = forward ? b : a;
+
+    /*
+     * Try to extract the other operand as a C value of our own type.  If
+     * that is not directly possible, `how` tells us what to do instead.
+     */
+    npy_uint operand_val;
+    npy_bool defer_possible;
+    conversion_result how = convert_to_uint(
+            operand, &operand_val, &defer_possible);
+    if (how == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (defer_possible) {
+        /* may return NotImplemented from this function */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_lshift, uint_lshift);
+    }
+
+    if (how == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        /*
+         * Hand over to the other scalar.  Usually this is a forward call,
+         * but it can be a reflected one when the operation is undefined in
+         * the forward direction (e.g. `complex % bool` defers although it
+         * would normally handle the operation).
+         */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+    if (how == OTHER_IS_UNKNOWN_OBJECT) {
+        /* the array path can recurse infinitely for (c)longdouble */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+#endif
+    if (how == OTHER_IS_UNKNOWN_OBJECT || how == PROMOTION_REQUIRED) {
+        /*
+         * Unknown object: an array-like, an unknown scalar (any Python
+         * object, including integers too large for `long`), or currently
+         * even a subclass of a NumPy scalar.
+         * Promotion: a Python scalar larger than this type, or two NumPy
+         * scalars that promote to a third (uint16 + int16 -> int32).
+         *
+         * Both go through the generic (array) path.
+         *
+         * TODO: Promotion could be special cased here for much better
+         *       speed and correct integer overflow warnings
+         *       (e.g. `uint8 * int8` cannot warn).
+         */
+        return PyGenericArrType_Type.tp_as_number->nb_lshift(a, b);
+    }
+    if (how == CONVERT_PYSCALAR) {
+        if (UINT_setitem(operand, (char *)&operand_val, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (how != CONVERSION_SUCCESS) {
+        assert(0);  /* error was checked already, impossible to reach */
+        return NULL;
+    }
+
+    /* Put the two values back into `a << b` order. */
+    npy_uint lhs = forward ? PyArrayScalar_VAL(a, UInt) : operand_val;
+    npy_uint rhs = forward ? operand_val : PyArrayScalar_VAL(b, UInt);
+
+    /*
+     * Do the actual calculation.  For integer functions the return value
+     * is the "floating point error" status; non-zero means we have to look
+     * up how the error should be handled.
+     */
+    npy_uint shifted;
+    int status = uint_ctype_lshift(lhs, rhs, &shifted);
+    if (status && PyUFunc_GiveFloatingpointErrors(
+                "scalar " OP_NAME, status) < 0) {
+        return NULL;
+    }
+
+    /* Wrap the C value in a freshly allocated scalar (NULL on failure). */
+    PyObject *result = PyArrayScalar_New(UInt);
+    if (result != NULL) {
+        PyArrayScalar_ASSIGN(result, UInt, shifted);
+    }
+    return result;
+}
+
+
+#undef OP_NAME
+#undef IS_lshift
+#undef IS_uint
+
+
+#line 1217
+#define IS_long
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_lshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "lshift"
+#endif
+
+static PyObject *
+long_lshift(PyObject *a, PyObject *b)
+{
+    /*
+     * `nb_lshift` handler for `long` scalars.
+     *
+     * Returns a new Long scalar holding the value produced by
+     * `long_ctype_lshift`, the result of the generic scalar `nb_lshift`
+     * when the operands cannot be handled here, `Py_NotImplemented` when
+     * deferring to the other operand, or NULL when a callee reports failure.
+     */
+
+    /*
+     * Work out which operand this method was selected for.  An exact type
+     * match on `a` makes the call forward, an exact match on `b` makes it
+     * reflected.  `forward` only says that `a` fits this method; it does
+     * not imply that we can defer to a subclass `b`.
+     */
+    int forward = (Py_TYPE(a) == &PyLongArrType_Type);
+    if (!forward && Py_TYPE(b) != &PyLongArrType_Type) {
+        /* neither operand is exactly Long, so subclasses are involved */
+        forward = PyArray_IsScalar(a, Long);
+        assert(forward || PyArray_IsScalar(b, Long));
+    }
+    PyObject *operand = forward ? b : a;
+
+    /*
+     * Try to extract the other operand as a C value of our own type.  If
+     * that is not directly possible, `how` tells us what to do instead.
+     */
+    npy_long operand_val;
+    npy_bool defer_possible;
+    conversion_result how = convert_to_long(
+            operand, &operand_val, &defer_possible);
+    if (how == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (defer_possible) {
+        /* may return NotImplemented from this function */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_lshift, long_lshift);
+    }
+
+    if (how == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        /*
+         * Hand over to the other scalar.  Usually this is a forward call,
+         * but it can be a reflected one when the operation is undefined in
+         * the forward direction (e.g. `complex % bool` defers although it
+         * would normally handle the operation).
+         */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+    if (how == OTHER_IS_UNKNOWN_OBJECT) {
+        /* the array path can recurse infinitely for (c)longdouble */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+#endif
+    if (how == OTHER_IS_UNKNOWN_OBJECT || how == PROMOTION_REQUIRED) {
+        /*
+         * Unknown object: an array-like, an unknown scalar (any Python
+         * object, including integers too large for `long`), or currently
+         * even a subclass of a NumPy scalar.
+         * Promotion: a Python scalar larger than this type, or two NumPy
+         * scalars that promote to a third (uint16 + int16 -> int32).
+         *
+         * Both go through the generic (array) path.
+         *
+         * TODO: Promotion could be special cased here for much better
+         *       speed and correct integer overflow warnings
+         *       (e.g. `uint8 * int8` cannot warn).
+         */
+        return PyGenericArrType_Type.tp_as_number->nb_lshift(a, b);
+    }
+    if (how == CONVERT_PYSCALAR) {
+        if (LONG_setitem(operand, (char *)&operand_val, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (how != CONVERSION_SUCCESS) {
+        assert(0);  /* error was checked already, impossible to reach */
+        return NULL;
+    }
+
+    /* Put the two values back into `a << b` order. */
+    npy_long lhs = forward ? PyArrayScalar_VAL(a, Long) : operand_val;
+    npy_long rhs = forward ? operand_val : PyArrayScalar_VAL(b, Long);
+
+    /*
+     * Do the actual calculation.  For integer functions the return value
+     * is the "floating point error" status; non-zero means we have to look
+     * up how the error should be handled.
+     */
+    npy_long shifted;
+    int status = long_ctype_lshift(lhs, rhs, &shifted);
+    if (status && PyUFunc_GiveFloatingpointErrors(
+                "scalar " OP_NAME, status) < 0) {
+        return NULL;
+    }
+
+    /* Wrap the C value in a freshly allocated scalar (NULL on failure). */
+    PyObject *result = PyArrayScalar_New(Long);
+    if (result != NULL) {
+        PyArrayScalar_ASSIGN(result, Long, shifted);
+    }
+    return result;
+}
+
+
+#undef OP_NAME
+#undef IS_lshift
+#undef IS_long
+
+
+#line 1217
+#define IS_ulong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_lshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "lshift"
+#endif
+
+static PyObject *
+ulong_lshift(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulong arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULong);
+        assert(is_forward || PyArray_IsScalar(b, ULong));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_lshift, ulong_lshift);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_lshift(a,b);
+        case CONVERT_PYSCALAR:
+            if (ULONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULong);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ulong out;
+#if 0
+    npy_ulong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ulong_ctype_lshift(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulong_ctype_lshift(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(ULong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_lshift
+#undef IS_ulong
+
+
+#line 1217
+#define IS_longlong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_lshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "lshift"
+#endif
+
+static PyObject *
+longlong_lshift(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* new LongLong scalar that is returned on success */
+    npy_longlong arg1, arg2, other_val;  /* call operands; value of `other` */
+
+    /*
+     * Scalar `lshift` for longlong: one operand is a LongLong scalar (or a
+     * subclass of it), the other one is converted via `convert_to_longlong`.
+     * Returns a new LongLong scalar, NotImplemented when deferring, the
+     * result of the generic `nb_lshift` path when the other operand cannot
+     * be used directly, or NULL on error.
+     *
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongLong);
+        assert(is_forward || PyArray_IsScalar(b, LongLong));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * `other` is the operand that was not identified as the LongLong scalar
+     * above.  `convert_to_longlong` reports through `res` how it can be
+     * used and through `may_need_deferring` whether the deferral check
+     * `BINOP_GIVE_UP_IF_NEEDED` has to run before `res` is acted on.
+     * NOTE(review): that macro is defined outside this function; it
+     * presumably returns NotImplemented when `b` should handle the
+     * operation -- confirm against its definition.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_longlong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_lshift, longlong_lshift);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * Unless the guard below is active, there is deliberately no
+             * `break` here: control falls through to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Hand both operands to the generic scalar type's `nb_lshift`
+             * slot and return whatever it produces.  This is used for a
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_lshift(a,b);
+        case CONVERT_PYSCALAR:  /* LONGLONG_setitem stores `other` into `other_val` */
+            if (LONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;  /* setitem reported a failure */
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, LongLong);  /* `a` is the LongLong scalar */
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;  /* reflected: `other` is `a` and stays first */
+        arg2 = PyArrayScalar_VAL(b, LongLong);
+    }
+
+    /*
+     * Prepare the actual calculation.  `out` receives the value written by
+     * `longlong_ctype_lshift` below.
+     */
+    npy_longlong out;
+#if 0
+    npy_longlong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     *
+     * Only the single-output call in the `#else` branch is compiled; the
+     * two-output variant is disabled by `#if 0`.
+     */
+#if 0
+    int retstatus = longlong_ctype_lshift(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longlong_ctype_lshift(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* non-zero status is passed on as "scalar lshift" */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* a negative return is treated as an error */
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(LongLong);  /* allocate the result scalar */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongLong, out);  /* store the computed value */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_lshift
+#undef IS_longlong
+
+
+#line 1217
+#define IS_ulonglong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_lshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "lshift"
+#endif
+
+static PyObject *
+ulonglong_lshift(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* new ULongLong scalar that is returned on success */
+    npy_ulonglong arg1, arg2, other_val;  /* call operands; value of `other` */
+
+    /*
+     * Scalar `lshift` for ulonglong: one operand is a ULongLong scalar (or
+     * a subclass of it), the other one is converted via
+     * `convert_to_ulonglong`.  Returns a new ULongLong scalar,
+     * NotImplemented when deferring, the result of the generic `nb_lshift`
+     * path when the other operand cannot be used directly, or NULL on error.
+     *
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULongLong);
+        assert(is_forward || PyArray_IsScalar(b, ULongLong));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * `other` is the operand that was not identified as the ULongLong
+     * scalar above.  `convert_to_ulonglong` reports through `res` how it
+     * can be used and through `may_need_deferring` whether the deferral
+     * check `BINOP_GIVE_UP_IF_NEEDED` has to run before `res` is acted on.
+     * NOTE(review): that macro is defined outside this function; it
+     * presumably returns NotImplemented when `b` should handle the
+     * operation -- confirm against its definition.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulonglong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_lshift, ulonglong_lshift);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * Unless the guard below is active, there is deliberately no
+             * `break` here: control falls through to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Hand both operands to the generic scalar type's `nb_lshift`
+             * slot and return whatever it produces.  This is used for a
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_lshift(a,b);
+        case CONVERT_PYSCALAR:  /* ULONGLONG_setitem stores `other` into `other_val` */
+            if (ULONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;  /* setitem reported a failure */
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULongLong);  /* `a` is the ULongLong scalar */
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;  /* reflected: `other` is `a` and stays first */
+        arg2 = PyArrayScalar_VAL(b, ULongLong);
+    }
+
+    /*
+     * Prepare the actual calculation.  `out` receives the value written by
+     * `ulonglong_ctype_lshift` below.
+     */
+    npy_ulonglong out;
+#if 0
+    npy_ulonglong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     *
+     * Only the single-output call in the `#else` branch is compiled; the
+     * two-output variant is disabled by `#if 0`.
+     */
+#if 0
+    int retstatus = ulonglong_ctype_lshift(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulonglong_ctype_lshift(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* non-zero status is passed on as "scalar lshift" */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* a negative return is treated as an error */
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(ULongLong);  /* allocate the result scalar */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULongLong, out);  /* store the computed value */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_lshift
+#undef IS_ulonglong
+
+
+#line 1217
+#define IS_byte
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_rshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "rshift"
+#endif
+
+static PyObject *
+byte_rshift(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* new Byte scalar that is returned on success */
+    npy_byte arg1, arg2, other_val;  /* call operands; value of `other` */
+
+    /*
+     * Scalar `rshift` for byte: one operand is a Byte scalar (or a subclass
+     * of it), the other one is converted via `convert_to_byte`.
+     * Returns a new Byte scalar, NotImplemented when deferring, the
+     * result of the generic `nb_rshift` path when the other operand cannot
+     * be used directly, or NULL on error.
+     *
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Byte);
+        assert(is_forward || PyArray_IsScalar(b, Byte));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * `other` is the operand that was not identified as the Byte scalar
+     * above.  `convert_to_byte` reports through `res` how it can be
+     * used and through `may_need_deferring` whether the deferral check
+     * `BINOP_GIVE_UP_IF_NEEDED` has to run before `res` is acted on.
+     * NOTE(review): that macro is defined outside this function; it
+     * presumably returns NotImplemented when `b` should handle the
+     * operation -- confirm against its definition.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_byte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_rshift, byte_rshift);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * Unless the guard below is active, there is deliberately no
+             * `break` here: control falls through to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Hand both operands to the generic scalar type's `nb_rshift`
+             * slot and return whatever it produces.  This is used for a
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_rshift(a,b);
+        case CONVERT_PYSCALAR:  /* BYTE_setitem stores `other` into `other_val` */
+            if (BYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;  /* setitem reported a failure */
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Byte);  /* `a` is the Byte scalar */
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;  /* reflected: `other` is `a` and stays first */
+        arg2 = PyArrayScalar_VAL(b, Byte);
+    }
+
+    /*
+     * Prepare the actual calculation.  `out` receives the value written by
+     * `byte_ctype_rshift` below.
+     */
+    npy_byte out;
+#if 0
+    npy_byte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     *
+     * Only the single-output call in the `#else` branch is compiled; the
+     * two-output variant is disabled by `#if 0`.
+     */
+#if 0
+    int retstatus = byte_ctype_rshift(arg1, arg2, &out, &out2);
+#else
+    int retstatus = byte_ctype_rshift(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* non-zero status is passed on as "scalar rshift" */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* a negative return is treated as an error */
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Byte);  /* allocate the result scalar */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Byte, out);  /* store the computed value */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_rshift
+#undef IS_byte
+
+
+#line 1217
+#define IS_ubyte
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_rshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "rshift"
+#endif
+
+static PyObject *
+ubyte_rshift(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* new UByte scalar that is returned on success */
+    npy_ubyte arg1, arg2, other_val;  /* call operands; value of `other` */
+
+    /*
+     * Scalar `rshift` for ubyte: one operand is a UByte scalar (or a
+     * subclass of it), the other one is converted via `convert_to_ubyte`.
+     * Returns a new UByte scalar, NotImplemented when deferring, the
+     * result of the generic `nb_rshift` path when the other operand cannot
+     * be used directly, or NULL on error.
+     *
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UByte);
+        assert(is_forward || PyArray_IsScalar(b, UByte));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * `other` is the operand that was not identified as the UByte scalar
+     * above.  `convert_to_ubyte` reports through `res` how it can be
+     * used and through `may_need_deferring` whether the deferral check
+     * `BINOP_GIVE_UP_IF_NEEDED` has to run before `res` is acted on.
+     * NOTE(review): that macro is defined outside this function; it
+     * presumably returns NotImplemented when `b` should handle the
+     * operation -- confirm against its definition.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ubyte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_rshift, ubyte_rshift);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * Unless the guard below is active, there is deliberately no
+             * `break` here: control falls through to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Hand both operands to the generic scalar type's `nb_rshift`
+             * slot and return whatever it produces.  This is used for a
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_rshift(a,b);
+        case CONVERT_PYSCALAR:  /* UBYTE_setitem stores `other` into `other_val` */
+            if (UBYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;  /* setitem reported a failure */
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UByte);  /* `a` is the UByte scalar */
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;  /* reflected: `other` is `a` and stays first */
+        arg2 = PyArrayScalar_VAL(b, UByte);
+    }
+
+    /*
+     * Prepare the actual calculation.  `out` receives the value written by
+     * `ubyte_ctype_rshift` below.
+     */
+    npy_ubyte out;
+#if 0
+    npy_ubyte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     *
+     * Only the single-output call in the `#else` branch is compiled; the
+     * two-output variant is disabled by `#if 0`.
+     */
+#if 0
+    int retstatus = ubyte_ctype_rshift(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ubyte_ctype_rshift(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* non-zero status is passed on as "scalar rshift" */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* a negative return is treated as an error */
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UByte);  /* allocate the result scalar */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UByte, out);  /* store the computed value */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_rshift
+#undef IS_ubyte
+
+
+#line 1217
+#define IS_short
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_rshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "rshift"
+#endif
+
+static PyObject *
+short_rshift(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* new Short scalar that is returned on success */
+    npy_short arg1, arg2, other_val;  /* call operands; value of `other` */
+
+    /*
+     * Scalar `rshift` for short: one operand is a Short scalar (or a
+     * subclass of it), the other one is converted via `convert_to_short`.
+     * Returns a new Short scalar, NotImplemented when deferring, the
+     * result of the generic `nb_rshift` path when the other operand cannot
+     * be used directly, or NULL on error.
+     *
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Short);
+        assert(is_forward || PyArray_IsScalar(b, Short));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * `other` is the operand that was not identified as the Short scalar
+     * above.  `convert_to_short` reports through `res` how it can be
+     * used and through `may_need_deferring` whether the deferral check
+     * `BINOP_GIVE_UP_IF_NEEDED` has to run before `res` is acted on.
+     * NOTE(review): that macro is defined outside this function; it
+     * presumably returns NotImplemented when `b` should handle the
+     * operation -- confirm against its definition.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_short(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_rshift, short_rshift);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * Unless the guard below is active, there is deliberately no
+             * `break` here: control falls through to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Hand both operands to the generic scalar type's `nb_rshift`
+             * slot and return whatever it produces.  This is used for a
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_rshift(a,b);
+        case CONVERT_PYSCALAR:  /* SHORT_setitem stores `other` into `other_val` */
+            if (SHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;  /* setitem reported a failure */
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Short);  /* `a` is the Short scalar */
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;  /* reflected: `other` is `a` and stays first */
+        arg2 = PyArrayScalar_VAL(b, Short);
+    }
+
+    /*
+     * Prepare the actual calculation.  `out` receives the value written by
+     * `short_ctype_rshift` below.
+     */
+    npy_short out;
+#if 0
+    npy_short out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     *
+     * Only the single-output call in the `#else` branch is compiled; the
+     * two-output variant is disabled by `#if 0`.
+     */
+#if 0
+    int retstatus = short_ctype_rshift(arg1, arg2, &out, &out2);
+#else
+    int retstatus = short_ctype_rshift(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* non-zero status is passed on as "scalar rshift" */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* a negative return is treated as an error */
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Short);  /* allocate the result scalar */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Short, out);  /* store the computed value */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_rshift
+#undef IS_short
+
+
+#line 1217
+#define IS_ushort
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_rshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "rshift"
+#endif
+
+static PyObject *
+ushort_rshift(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ushort arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UShort);
+        assert(is_forward || PyArray_IsScalar(b, UShort));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ushort(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_rshift, ushort_rshift);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_rshift(a,b);
+        case CONVERT_PYSCALAR:
+            if (USHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UShort);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UShort);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ushort out;
+#if 0
+    npy_ushort out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ushort_ctype_rshift(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ushort_ctype_rshift(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UShort);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UShort, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_rshift
+#undef IS_ushort
+
+
+#line 1217
+#define IS_int
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_rshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "rshift"
+#endif
+
+static PyObject *
+int_rshift(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_int arg1, arg2, other_val;
+
+    /*
+     * Binary `>>` (rshift) operator for Int scalars; the shift itself is
+     * done by int_ctype_rshift().  `is_forward` only says that the first
+     * operand fits this method, not that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Int);
+        assert(is_forward || PyArray_IsScalar(b, Int));
+    }
+
+    /*
+     * `other` is the operand that is not the one this method was matched on.
+     * convert_to_int() either stores its value in `other_val` or reports
+     * through `res` how the operation must be handled (see the switch below).
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_int(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_rshift, int_rshift);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Unless the (c)longdouble guard below is compiled in (it avoids
+             * infinite recursion), we drop through to PROMOTION_REQUIRED below.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by delegating to the generic scalar type's nb_rshift.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_rshift(a,b);
+        case CONVERT_PYSCALAR:
+            if (INT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Int);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Int);
+    }
+
+    /*
+     * Result storage; `out2` and `obj` belong to the disabled two-output branch.
+     */
+    npy_int out;
+#if 0
+    npy_int out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call.
+     * A nonzero `retstatus` is handed to PyUFunc_GiveFloatingpointErrors below.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = int_ctype_rshift(arg1, arg2, &out, &out2);
+#else
+    int retstatus = int_ctype_rshift(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Int);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Int, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_rshift
+#undef IS_int
+
+
+#line 1217
+#define IS_uint
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_rshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "rshift"
+#endif
+
+static PyObject *
+uint_rshift(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_uint arg1, arg2, other_val;
+
+    /*
+     * Binary `>>` (rshift) operator for UInt scalars; the shift itself is
+     * done by uint_ctype_rshift().  `is_forward` only says that the first
+     * operand fits this method, not that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UInt);
+        assert(is_forward || PyArray_IsScalar(b, UInt));
+    }
+
+    /*
+     * `other` is the operand that is not the one this method was matched on.
+     * convert_to_uint() either stores its value in `other_val` or reports
+     * through `res` how the operation must be handled (see the switch below).
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_uint(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_rshift, uint_rshift);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Unless the (c)longdouble guard below is compiled in (it avoids
+             * infinite recursion), we drop through to PROMOTION_REQUIRED below.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by delegating to the generic scalar type's nb_rshift.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_rshift(a,b);
+        case CONVERT_PYSCALAR:
+            if (UINT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UInt);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UInt);
+    }
+
+    /*
+     * Result storage; `out2` and `obj` belong to the disabled two-output branch.
+     */
+    npy_uint out;
+#if 0
+    npy_uint out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call.
+     * A nonzero `retstatus` is handed to PyUFunc_GiveFloatingpointErrors below.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = uint_ctype_rshift(arg1, arg2, &out, &out2);
+#else
+    int retstatus = uint_ctype_rshift(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UInt);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UInt, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_rshift
+#undef IS_uint
+
+
+#line 1217
+#define IS_long
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_rshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "rshift"
+#endif
+
+static PyObject *
+long_rshift(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_long arg1, arg2, other_val;
+
+    /*
+     * Binary `>>` (rshift) operator for Long scalars; the shift itself is
+     * done by long_ctype_rshift().  `is_forward` only says that the first
+     * operand fits this method, not that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Long);
+        assert(is_forward || PyArray_IsScalar(b, Long));
+    }
+
+    /*
+     * `other` is the operand that is not the one this method was matched on.
+     * convert_to_long() either stores its value in `other_val` or reports
+     * through `res` how the operation must be handled (see the switch below).
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_long(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_rshift, long_rshift);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Unless the (c)longdouble guard below is compiled in (it avoids
+             * infinite recursion), we drop through to PROMOTION_REQUIRED below.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by delegating to the generic scalar type's nb_rshift.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_rshift(a,b);
+        case CONVERT_PYSCALAR:
+            if (LONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Long);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Long);
+    }
+
+    /*
+     * Result storage; `out2` and `obj` belong to the disabled two-output branch.
+     */
+    npy_long out;
+#if 0
+    npy_long out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call.
+     * A nonzero `retstatus` is handed to PyUFunc_GiveFloatingpointErrors below.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = long_ctype_rshift(arg1, arg2, &out, &out2);
+#else
+    int retstatus = long_ctype_rshift(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Long);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Long, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_rshift
+#undef IS_long
+
+
+#line 1217
+#define IS_ulong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_rshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "rshift"
+#endif
+
+static PyObject *
+ulong_rshift(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulong arg1, arg2, other_val;
+
+    /*
+     * Binary `>>` (rshift) operator for ULong scalars; the shift itself is
+     * done by ulong_ctype_rshift().  `is_forward` only says that the first
+     * operand fits this method, not that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULong);
+        assert(is_forward || PyArray_IsScalar(b, ULong));
+    }
+
+    /*
+     * `other` is the operand that is not the one this method was matched on.
+     * convert_to_ulong() either stores its value in `other_val` or reports
+     * through `res` how the operation must be handled (see the switch below).
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_rshift, ulong_rshift);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Unless the (c)longdouble guard below is compiled in (it avoids
+             * infinite recursion), we drop through to PROMOTION_REQUIRED below.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by delegating to the generic scalar type's nb_rshift.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_rshift(a,b);
+        case CONVERT_PYSCALAR:
+            if (ULONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULong);
+    }
+
+    /*
+     * Result storage; `out2` and `obj` belong to the disabled two-output branch.
+     */
+    npy_ulong out;
+#if 0
+    npy_ulong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call.
+     * A nonzero `retstatus` is handed to PyUFunc_GiveFloatingpointErrors below.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ulong_ctype_rshift(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulong_ctype_rshift(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(ULong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_rshift
+#undef IS_ulong
+
+
+#line 1217
+#define IS_longlong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_rshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "rshift"
+#endif
+
+static PyObject *
+longlong_rshift(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_longlong arg1, arg2, other_val;
+
+    /*
+     * Binary `>>` (rshift) operator for LongLong scalars; the shift itself is
+     * done by longlong_ctype_rshift().  `is_forward` only says that the first
+     * operand fits this method, not that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongLong);
+        assert(is_forward || PyArray_IsScalar(b, LongLong));
+    }
+
+    /*
+     * `other` is the operand that is not the one this method was matched on.
+     * convert_to_longlong() either stores its value in `other_val` or reports
+     * through `res` how the operation must be handled (see the switch below).
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_longlong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_rshift, longlong_rshift);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Unless the (c)longdouble guard below is compiled in (it avoids
+             * infinite recursion), we drop through to PROMOTION_REQUIRED below.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by delegating to the generic scalar type's nb_rshift.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_rshift(a,b);
+        case CONVERT_PYSCALAR:
+            if (LONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, LongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongLong);
+    }
+
+    /*
+     * Result storage; `out2` and `obj` belong to the disabled two-output branch.
+     */
+    npy_longlong out;
+#if 0
+    npy_longlong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call.
+     * A nonzero `retstatus` is handed to PyUFunc_GiveFloatingpointErrors below.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = longlong_ctype_rshift(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longlong_ctype_rshift(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(LongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_rshift
+#undef IS_longlong
+
+
+#line 1217
+#define IS_ulonglong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_rshift
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "rshift"
+#endif
+
+static PyObject *
+ulonglong_rshift(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulonglong arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULongLong);
+        assert(is_forward || PyArray_IsScalar(b, ULongLong));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulonglong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_rshift, ulonglong_rshift);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_rshift(a,b);
+        case CONVERT_PYSCALAR:
+            if (ULONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULongLong);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ulonglong out;
+#if 0
+    npy_ulonglong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ulonglong_ctype_rshift(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulonglong_ctype_rshift(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(ULongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_rshift
+#undef IS_ulonglong
+
+
+#line 1217
+#define IS_byte
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_and
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "and"
+#endif
+
+static PyObject *
+byte_and(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_byte arg1, arg2, other_val;
+
+    /*
+     * Scalar `and` for Byte (used with the `nb_and` slot below).  `is_forward`
+     * records whether the first operand is the one that fits this method;
+     * it does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved; one operand must still be a Byte scalar */
+        is_forward = PyArray_IsScalar(a, Byte);
+        assert(is_forward || PyArray_IsScalar(b, Byte));
+    }
+
+    /*
+     * Extract the other operand's value into `other_val` (if it is
+     * compatible).  Otherwise `res` tells us how to deal with it in the
+     * switch below.  This is somewhat complicated.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_byte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_and, byte_and);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * Generally, we try dropping through to the array path here
+             * (i.e. into the PROMOTION_REQUIRED case below), but this can
+             * lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_and(a,b);
+        case CONVERT_PYSCALAR:  /* fill `other_val` from `other` via setitem */
+            if (BYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Byte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Byte);
+    }
+
+    /*
+     * Storage for the result of the calculation.
+     */
+    npy_byte out;
+#if 0
+    npy_byte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (compiled out here).
+     */
+#if 0
+    int retstatus = byte_ctype_and(arg1, arg2, &out, &out2);
+#else
+    int retstatus = byte_ctype_and(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Byte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Byte, out);  /* store `out` in the new scalar */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_and
+#undef IS_byte
+
+
+#line 1217
+#define IS_ubyte
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_and
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "and"
+#endif
+
+static PyObject *
+ubyte_and(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ubyte arg1, arg2, other_val;
+
+    /*
+     * Scalar `and` for UByte (used with the `nb_and` slot below).  `is_forward`
+     * records whether the first operand is the one that fits this method;
+     * it does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved; one operand must still be a UByte scalar */
+        is_forward = PyArray_IsScalar(a, UByte);
+        assert(is_forward || PyArray_IsScalar(b, UByte));
+    }
+
+    /*
+     * Extract the other operand's value into `other_val` (if it is
+     * compatible).  Otherwise `res` tells us how to deal with it in the
+     * switch below.  This is somewhat complicated.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ubyte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_and, ubyte_and);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * Generally, we try dropping through to the array path here
+             * (i.e. into the PROMOTION_REQUIRED case below), but this can
+             * lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_and(a,b);
+        case CONVERT_PYSCALAR:  /* fill `other_val` from `other` via setitem */
+            if (UBYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UByte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UByte);
+    }
+
+    /*
+     * Storage for the result of the calculation.
+     */
+    npy_ubyte out;
+#if 0
+    npy_ubyte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (compiled out here).
+     */
+#if 0
+    int retstatus = ubyte_ctype_and(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ubyte_ctype_and(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UByte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UByte, out);  /* store `out` in the new scalar */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_and
+#undef IS_ubyte
+
+
+#line 1217
+#define IS_short
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_and
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "and"
+#endif
+
+static PyObject *
+short_and(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_short arg1, arg2, other_val;
+
+    /*
+     * Scalar `and` for Short (used with the `nb_and` slot below).  `is_forward`
+     * records whether the first operand is the one that fits this method;
+     * it does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved; one operand must still be a Short scalar */
+        is_forward = PyArray_IsScalar(a, Short);
+        assert(is_forward || PyArray_IsScalar(b, Short));
+    }
+
+    /*
+     * Extract the other operand's value into `other_val` (if it is
+     * compatible).  Otherwise `res` tells us how to deal with it in the
+     * switch below.  This is somewhat complicated.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_short(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_and, short_and);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * Generally, we try dropping through to the array path here
+             * (i.e. into the PROMOTION_REQUIRED case below), but this can
+             * lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_and(a,b);
+        case CONVERT_PYSCALAR:  /* fill `other_val` from `other` via setitem */
+            if (SHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Short);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Short);
+    }
+
+    /*
+     * Storage for the result of the calculation.
+     */
+    npy_short out;
+#if 0
+    npy_short out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (compiled out here).
+     */
+#if 0
+    int retstatus = short_ctype_and(arg1, arg2, &out, &out2);
+#else
+    int retstatus = short_ctype_and(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Short);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Short, out);  /* store `out` in the new scalar */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_and
+#undef IS_short
+
+
+#line 1217
+#define IS_ushort
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_and
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "and"
+#endif
+
+static PyObject *
+ushort_and(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ushort arg1, arg2, other_val;
+
+    /*
+     * Scalar `and` for UShort (used with the `nb_and` slot below).  `is_forward`
+     * records whether the first operand is the one that fits this method;
+     * it does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved; one operand must still be a UShort scalar */
+        is_forward = PyArray_IsScalar(a, UShort);
+        assert(is_forward || PyArray_IsScalar(b, UShort));
+    }
+
+    /*
+     * Extract the other operand's value into `other_val` (if it is
+     * compatible).  Otherwise `res` tells us how to deal with it in the
+     * switch below.  This is somewhat complicated.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ushort(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_and, ushort_and);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * Generally, we try dropping through to the array path here
+             * (i.e. into the PROMOTION_REQUIRED case below), but this can
+             * lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_and(a,b);
+        case CONVERT_PYSCALAR:  /* fill `other_val` from `other` via setitem */
+            if (USHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UShort);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UShort);
+    }
+
+    /*
+     * Storage for the result of the calculation.
+     */
+    npy_ushort out;
+#if 0
+    npy_ushort out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (compiled out here).
+     */
+#if 0
+    int retstatus = ushort_ctype_and(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ushort_ctype_and(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UShort);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UShort, out);  /* store `out` in the new scalar */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_and
+#undef IS_ushort
+
+
+#line 1217
+#define IS_int
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_and
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "and"
+#endif
+
+static PyObject *
+int_and(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_int arg1, arg2, other_val;
+
+    /*
+     * Scalar `and` for Int (used with the `nb_and` slot below).  `is_forward`
+     * records whether the first operand is the one that fits this method;
+     * it does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved; one operand must still be an Int scalar */
+        is_forward = PyArray_IsScalar(a, Int);
+        assert(is_forward || PyArray_IsScalar(b, Int));
+    }
+
+    /*
+     * Extract the other operand's value into `other_val` (if it is
+     * compatible).  Otherwise `res` tells us how to deal with it in the
+     * switch below.  This is somewhat complicated.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_int(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_and, int_and);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * Generally, we try dropping through to the array path here
+             * (i.e. into the PROMOTION_REQUIRED case below), but this can
+             * lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_and(a,b);
+        case CONVERT_PYSCALAR:  /* fill `other_val` from `other` via setitem */
+            if (INT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Int);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Int);
+    }
+
+    /*
+     * Storage for the result of the calculation.
+     */
+    npy_int out;
+#if 0
+    npy_int out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (compiled out here).
+     */
+#if 0
+    int retstatus = int_ctype_and(arg1, arg2, &out, &out2);
+#else
+    int retstatus = int_ctype_and(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Int);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Int, out);  /* store `out` in the new scalar */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_and
+#undef IS_int
+
+
+#line 1217
+#define IS_uint
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_and
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "and"
+#endif
+
+static PyObject *
+uint_and(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_uint arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UInt);
+        assert(is_forward || PyArray_IsScalar(b, UInt));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_uint(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_and, uint_and);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_and(a,b);
+        case CONVERT_PYSCALAR:
+            if (UINT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UInt);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UInt);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_uint out;
+#if 0
+    npy_uint out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = uint_ctype_and(arg1, arg2, &out, &out2);
+#else
+    int retstatus = uint_ctype_and(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UInt);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UInt, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_and
+#undef IS_uint
+
+
+#line 1217
+#define IS_long
+/* OP_NAME names the operator in floating point errors ("true_" dropped): */
+#define IS_and
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "and"
+#endif
+
+static PyObject *
+long_and(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_long arg1, arg2, other_val;
+
+    /*
+     * Decide which operand this function operates on: `is_forward` is set
+     * when `a` is a Long scalar, otherwise `b` must be one.  Note that
+     * `is_forward` does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Long);
+        assert(is_forward || PyArray_IsScalar(b, Long));
+    }
+
+    /*
+     * Try to extract the other operand as `npy_long`.  If that is not
+     * possible, `res` says how to proceed and `may_need_deferring` whether
+     * BINOP_GIVE_UP_IF_NEEDED has to be consulted.
+     * Note: The same pattern is used by the other scalar operators.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_long(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* conversion failed (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_and, long_and);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand by returning NotImplemented.  This
+             * is normally a forward operation, but it could be backward if an
+             * operation is undefined forward: e.g. the complex remainder
+             * `complex % bool` defers although it would normally handle it.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value was extracted into `other_val`, proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * (currently) even a subclass of a NumPy scalar.
+             * Unless IS_longdouble or IS_clongdouble is defined, this case
+             * intentionally falls through to the generic path below; for
+             * (c)longdouble that can lead to infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Hand the operation to the generic scalar type's `nb_and`.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_and(a,b);
+        case CONVERT_PYSCALAR:
+            if (LONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* CONVERSION_ERROR was handled above, so unreachable */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Long);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Long);
+    }
+
+    /*
+     * Storage for the result (`out2`/`obj` belong to a disabled variant).
+     */
+    npy_long out;
+#if 0
+    npy_long out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with `arg1` and `arg2` as a
+     * function call that stores its result in `out`.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (disabled here).
+     */
+#if 0
+    int retstatus = long_ctype_and(arg1, arg2, &out, &out2);
+#else
+    int retstatus = long_ctype_and(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Long);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Long, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_and
+#undef IS_long
+
+
+#line 1217
+#define IS_ulong
+/* OP_NAME names the operator in floating point errors ("true_" dropped): */
+#define IS_and
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "and"
+#endif
+
+static PyObject *
+ulong_and(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulong arg1, arg2, other_val;
+
+    /*
+     * Decide which operand this function operates on: `is_forward` is set
+     * when `a` is a ULong scalar, otherwise `b` must be one.  Note that
+     * `is_forward` does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULong);
+        assert(is_forward || PyArray_IsScalar(b, ULong));
+    }
+
+    /*
+     * Try to extract the other operand as `npy_ulong`.  If that is not
+     * possible, `res` says how to proceed and `may_need_deferring` whether
+     * BINOP_GIVE_UP_IF_NEEDED has to be consulted.
+     * Note: The same pattern is used by the other scalar operators.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* conversion failed (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_and, ulong_and);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand by returning NotImplemented.  This
+             * is normally a forward operation, but it could be backward if an
+             * operation is undefined forward: e.g. the complex remainder
+             * `complex % bool` defers although it would normally handle it.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value was extracted into `other_val`, proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * (currently) even a subclass of a NumPy scalar.
+             * Unless IS_longdouble or IS_clongdouble is defined, this case
+             * intentionally falls through to the generic path below; for
+             * (c)longdouble that can lead to infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Hand the operation to the generic scalar type's `nb_and`.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_and(a,b);
+        case CONVERT_PYSCALAR:
+            if (ULONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* CONVERSION_ERROR was handled above, so unreachable */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULong);
+    }
+
+    /*
+     * Storage for the result (`out2`/`obj` belong to a disabled variant).
+     */
+    npy_ulong out;
+#if 0
+    npy_ulong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with `arg1` and `arg2` as a
+     * function call that stores its result in `out`.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (disabled here).
+     */
+#if 0
+    int retstatus = ulong_ctype_and(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulong_ctype_and(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(ULong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_and
+#undef IS_ulong
+
+
+#line 1217
+#define IS_longlong
+/* OP_NAME names the operator in floating point errors ("true_" dropped): */
+#define IS_and
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "and"
+#endif
+
+static PyObject *
+longlong_and(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_longlong arg1, arg2, other_val;
+
+    /*
+     * Decide which operand this function operates on: `is_forward` is set
+     * when `a` is a LongLong scalar, otherwise `b` must be one.  Note that
+     * `is_forward` does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongLong);
+        assert(is_forward || PyArray_IsScalar(b, LongLong));
+    }
+
+    /*
+     * Try to extract the other operand as `npy_longlong`.  If that is not
+     * possible, `res` says how to proceed and `may_need_deferring` whether
+     * BINOP_GIVE_UP_IF_NEEDED has to be consulted.
+     * Note: The same pattern is used by the other scalar operators.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_longlong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* conversion failed (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_and, longlong_and);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand by returning NotImplemented.  This
+             * is normally a forward operation, but it could be backward if an
+             * operation is undefined forward: e.g. the complex remainder
+             * `complex % bool` defers although it would normally handle it.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value was extracted into `other_val`, proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * (currently) even a subclass of a NumPy scalar.
+             * Unless IS_longdouble or IS_clongdouble is defined, this case
+             * intentionally falls through to the generic path below; for
+             * (c)longdouble that can lead to infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Hand the operation to the generic scalar type's `nb_and`.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_and(a,b);
+        case CONVERT_PYSCALAR:
+            if (LONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* CONVERSION_ERROR was handled above, so unreachable */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, LongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongLong);
+    }
+
+    /*
+     * Storage for the result (`out2`/`obj` belong to a disabled variant).
+     */
+    npy_longlong out;
+#if 0
+    npy_longlong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with `arg1` and `arg2` as a
+     * function call that stores its result in `out`.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (disabled here).
+     */
+#if 0
+    int retstatus = longlong_ctype_and(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longlong_ctype_and(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(LongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_and
+#undef IS_longlong
+
+
+#line 1217
+#define IS_ulonglong
+/* OP_NAME names the operator in floating point errors ("true_" dropped): */
+#define IS_and
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "and"
+#endif
+
+static PyObject *
+ulonglong_and(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulonglong arg1, arg2, other_val;
+
+    /*
+     * Decide which operand this function operates on: `is_forward` is set
+     * when `a` is a ULongLong scalar, otherwise `b` must be one.  Note that
+     * `is_forward` does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULongLong);
+        assert(is_forward || PyArray_IsScalar(b, ULongLong));
+    }
+
+    /*
+     * Try to extract the other operand as `npy_ulonglong`.  If that is not
+     * possible, `res` says how to proceed and `may_need_deferring` whether
+     * BINOP_GIVE_UP_IF_NEEDED has to be consulted.
+     * Note: The same pattern is used by the other scalar operators.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulonglong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* conversion failed (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_and, ulonglong_and);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand by returning NotImplemented.  This
+             * is normally a forward operation, but it could be backward if an
+             * operation is undefined forward: e.g. the complex remainder
+             * `complex % bool` defers although it would normally handle it.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value was extracted into `other_val`, proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * (currently) even a subclass of a NumPy scalar.
+             * Unless IS_longdouble or IS_clongdouble is defined, this case
+             * intentionally falls through to the generic path below; for
+             * (c)longdouble that can lead to infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Hand the operation to the generic scalar type's `nb_and`.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_and(a,b);
+        case CONVERT_PYSCALAR:
+            if (ULONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* CONVERSION_ERROR was handled above, so unreachable */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULongLong);
+    }
+
+    /*
+     * Storage for the result (`out2`/`obj` belong to a disabled variant).
+     */
+    npy_ulonglong out;
+#if 0
+    npy_ulonglong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with `arg1` and `arg2` as a
+     * function call that stores its result in `out`.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (disabled here).
+     */
+#if 0
+    int retstatus = ulonglong_ctype_and(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulonglong_ctype_and(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(ULongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_and
+#undef IS_ulonglong
+
+
+#line 1217
+#define IS_byte
+/* OP_NAME names the operator in floating point errors ("true_" dropped): */
+#define IS_or
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "or"
+#endif
+
+static PyObject *
+byte_or(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_byte arg1, arg2, other_val;
+
+    /*
+     * Decide which operand this function operates on: `is_forward` is set
+     * when `a` is a Byte scalar, otherwise `b` must be one.  Note that
+     * `is_forward` does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Byte);
+        assert(is_forward || PyArray_IsScalar(b, Byte));
+    }
+
+    /*
+     * Try to extract the other operand as `npy_byte`.  If that is not
+     * possible, `res` says how to proceed and `may_need_deferring` whether
+     * BINOP_GIVE_UP_IF_NEEDED has to be consulted.
+     * Note: The same pattern is used by the other scalar operators.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_byte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* conversion failed (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_or, byte_or);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand by returning NotImplemented.  This
+             * is normally a forward operation, but it could be backward if an
+             * operation is undefined forward: e.g. the complex remainder
+             * `complex % bool` defers although it would normally handle it.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value was extracted into `other_val`, proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * (currently) even a subclass of a NumPy scalar.
+             * Unless IS_longdouble or IS_clongdouble is defined, this case
+             * intentionally falls through to the generic path below; for
+             * (c)longdouble that can lead to infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Hand the operation to the generic scalar type's `nb_or`.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_or(a,b);
+        case CONVERT_PYSCALAR:
+            if (BYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* CONVERSION_ERROR was handled above, so unreachable */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Byte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Byte);
+    }
+
+    /*
+     * Storage for the result (`out2`/`obj` belong to a disabled variant).
+     */
+    npy_byte out;
+#if 0
+    npy_byte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with `arg1` and `arg2` as a
+     * function call that stores its result in `out`.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier` (disabled here).
+     */
+#if 0
+    int retstatus = byte_ctype_or(arg1, arg2, &out, &out2);
+#else
+    int retstatus = byte_ctype_or(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Byte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Byte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_or
+#undef IS_byte
+
+
+#line 1217
+#define IS_ubyte
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_or
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "or"
+#endif
+
+static PyObject *
+ubyte_or(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ubyte arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UByte);
+        assert(is_forward || PyArray_IsScalar(b, UByte));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ubyte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_or, ubyte_or);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_or(a,b);
+        case CONVERT_PYSCALAR:
+            if (UBYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UByte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UByte);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ubyte out;
+#if 0
+    npy_ubyte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ubyte_ctype_or(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ubyte_ctype_or(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UByte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UByte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_or
+#undef IS_ubyte
+
+
+#line 1217
+#define IS_short
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_or
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "or"
+#endif
+/*
+ * Binary `or` for the Short scalar type (this function is handed to
+ * BINOP_GIVE_UP_IF_NEEDED together with the `nb_or` slot name below).
+ *
+ * Only one operand is converted: the one not taken as the Short scalar is
+ * run through `convert_to_short`, and the conversion result decides between
+ * computing directly, deferring, or delegating to the generic scalar type.
+ *
+ * Returns one of:
+ *   - a scalar created by `PyArrayScalar_New(Short)` that holds the value
+ *     produced by `short_ctype_or`;
+ *   - `NotImplemented` (via Py_RETURN_NOTIMPLEMENTED) when deferring;
+ *   - whatever `PyGenericArrType_Type.tp_as_number->nb_or(a, b)` returns on
+ *     the generic path;
+ *   - NULL when conversion, error reporting or allocation fails.
+ */
+static PyObject *
+short_or(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_short arg1, arg2, other_val;  /* other_val: converted `other` */
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     *
+     * Exact type matches are tested first; `PyArray_IsScalar` is only
+     * consulted when neither operand is exactly a Short scalar.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Short);
+        assert(is_forward || PyArray_IsScalar(b, Short));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     *
+     * `convert_to_short` reports its verdict through `res` and is expected
+     * to fill in `may_need_deferring`; when that flag is set,
+     * BINOP_GIVE_UP_IF_NEEDED runs before `res` is acted upon.
+     * NOTE(review): that macro is defined outside this view; presumably it
+     * can return early from this function -- confirm.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_short(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_or, short_or);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * When neither macro tested below is defined, this case has no
+             * statement of its own and control intentionally falls through
+             * to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             *
+             * Both original operands are handed, unchanged, to the generic
+             * scalar type's `nb_or` slot and its result is returned as is.
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_or(a,b);
+        case CONVERT_PYSCALAR:  /* SHORT_setitem is given &other_val as target */
+            if (SHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* compiled out in this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep operand order: arg1 stands for a, arg2 for b */
+        arg1 = PyArrayScalar_VAL(a, Short);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Short);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_short out;
+#if 0  /* second output and temporary object: compiled out here */
+    npy_short out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     *
+     * A nonzero `retstatus` is forwarded to PyUFunc_GiveFloatingpointErrors
+     * under the name `"scalar " OP_NAME`; a negative return from that call
+     * makes this function return NULL.
+     */
+#if 0  /* two-output call: compiled out here */
+    int retstatus = short_ctype_or(arg1, arg2, &out, &out2);
+#else
+    int retstatus = short_ctype_or(arg1, arg2, &out);
+#endif
+
+#if 0  /* float status lookup: compiled out here */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* two-element tuple result: compiled out here */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active path: wrap `out` in a single new Short scalar */
+    ret = PyArrayScalar_New(Short);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Short, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_or
+#undef IS_short
+
+
+#line 1217
+#define IS_ushort
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_or
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "or"
+#endif
+/*
+ * Binary `or` for the UShort scalar type (this function is handed to
+ * BINOP_GIVE_UP_IF_NEEDED together with the `nb_or` slot name below).
+ *
+ * Only one operand is converted: the one not taken as the UShort scalar is
+ * run through `convert_to_ushort`, and the conversion result decides between
+ * computing directly, deferring, or delegating to the generic scalar type.
+ *
+ * Returns one of:
+ *   - a scalar created by `PyArrayScalar_New(UShort)` that holds the value
+ *     produced by `ushort_ctype_or`;
+ *   - `NotImplemented` (via Py_RETURN_NOTIMPLEMENTED) when deferring;
+ *   - whatever `PyGenericArrType_Type.tp_as_number->nb_or(a, b)` returns on
+ *     the generic path;
+ *   - NULL when conversion, error reporting or allocation fails.
+ */
+static PyObject *
+ushort_or(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ushort arg1, arg2, other_val;  /* other_val: converted `other` */
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     *
+     * Exact type matches are tested first; `PyArray_IsScalar` is only
+     * consulted when neither operand is exactly a UShort scalar.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UShort);
+        assert(is_forward || PyArray_IsScalar(b, UShort));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     *
+     * `convert_to_ushort` reports its verdict through `res` and is expected
+     * to fill in `may_need_deferring`; when that flag is set,
+     * BINOP_GIVE_UP_IF_NEEDED runs before `res` is acted upon.
+     * NOTE(review): that macro is defined outside this view; presumably it
+     * can return early from this function -- confirm.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ushort(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_or, ushort_or);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * When neither macro tested below is defined, this case has no
+             * statement of its own and control intentionally falls through
+             * to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             *
+             * Both original operands are handed, unchanged, to the generic
+             * scalar type's `nb_or` slot and its result is returned as is.
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_or(a,b);
+        case CONVERT_PYSCALAR:  /* USHORT_setitem is given &other_val as target */
+            if (USHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* compiled out in this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep operand order: arg1 stands for a, arg2 for b */
+        arg1 = PyArrayScalar_VAL(a, UShort);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UShort);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ushort out;
+#if 0  /* second output and temporary object: compiled out here */
+    npy_ushort out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     *
+     * A nonzero `retstatus` is forwarded to PyUFunc_GiveFloatingpointErrors
+     * under the name `"scalar " OP_NAME`; a negative return from that call
+     * makes this function return NULL.
+     */
+#if 0  /* two-output call: compiled out here */
+    int retstatus = ushort_ctype_or(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ushort_ctype_or(arg1, arg2, &out);
+#endif
+
+#if 0  /* float status lookup: compiled out here */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* two-element tuple result: compiled out here */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active path: wrap `out` in a single new UShort scalar */
+    ret = PyArrayScalar_New(UShort);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UShort, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_or
+#undef IS_ushort
+
+
+#line 1217
+#define IS_int
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_or
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "or"
+#endif
+/*
+ * Binary `or` for the Int scalar type (this function is handed to
+ * BINOP_GIVE_UP_IF_NEEDED together with the `nb_or` slot name below).
+ *
+ * Only one operand is converted: the one not taken as the Int scalar is
+ * run through `convert_to_int`, and the conversion result decides between
+ * computing directly, deferring, or delegating to the generic scalar type.
+ *
+ * Returns one of:
+ *   - a scalar created by `PyArrayScalar_New(Int)` that holds the value
+ *     produced by `int_ctype_or`;
+ *   - `NotImplemented` (via Py_RETURN_NOTIMPLEMENTED) when deferring;
+ *   - whatever `PyGenericArrType_Type.tp_as_number->nb_or(a, b)` returns on
+ *     the generic path;
+ *   - NULL when conversion, error reporting or allocation fails.
+ */
+static PyObject *
+int_or(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_int arg1, arg2, other_val;  /* other_val: converted `other` */
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     *
+     * Exact type matches are tested first; `PyArray_IsScalar` is only
+     * consulted when neither operand is exactly an Int scalar.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Int);
+        assert(is_forward || PyArray_IsScalar(b, Int));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     *
+     * `convert_to_int` reports its verdict through `res` and is expected
+     * to fill in `may_need_deferring`; when that flag is set,
+     * BINOP_GIVE_UP_IF_NEEDED runs before `res` is acted upon.
+     * NOTE(review): that macro is defined outside this view; presumably it
+     * can return early from this function -- confirm.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_int(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_or, int_or);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * When neither macro tested below is defined, this case has no
+             * statement of its own and control intentionally falls through
+             * to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             *
+             * Both original operands are handed, unchanged, to the generic
+             * scalar type's `nb_or` slot and its result is returned as is.
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_or(a,b);
+        case CONVERT_PYSCALAR:  /* INT_setitem is given &other_val as target */
+            if (INT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* compiled out in this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep operand order: arg1 stands for a, arg2 for b */
+        arg1 = PyArrayScalar_VAL(a, Int);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Int);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_int out;
+#if 0  /* second output and temporary object: compiled out here */
+    npy_int out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     *
+     * A nonzero `retstatus` is forwarded to PyUFunc_GiveFloatingpointErrors
+     * under the name `"scalar " OP_NAME`; a negative return from that call
+     * makes this function return NULL.
+     */
+#if 0  /* two-output call: compiled out here */
+    int retstatus = int_ctype_or(arg1, arg2, &out, &out2);
+#else
+    int retstatus = int_ctype_or(arg1, arg2, &out);
+#endif
+
+#if 0  /* float status lookup: compiled out here */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* two-element tuple result: compiled out here */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active path: wrap `out` in a single new Int scalar */
+    ret = PyArrayScalar_New(Int);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Int, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_or
+#undef IS_int
+
+
+#line 1217
+#define IS_uint
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_or
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "or"
+#endif
+/*
+ * Binary `or` for the UInt scalar type (this function is handed to
+ * BINOP_GIVE_UP_IF_NEEDED together with the `nb_or` slot name below).
+ *
+ * Only one operand is converted: the one not taken as the UInt scalar is
+ * run through `convert_to_uint`, and the conversion result decides between
+ * computing directly, deferring, or delegating to the generic scalar type.
+ *
+ * Returns one of:
+ *   - a scalar created by `PyArrayScalar_New(UInt)` that holds the value
+ *     produced by `uint_ctype_or`;
+ *   - `NotImplemented` (via Py_RETURN_NOTIMPLEMENTED) when deferring;
+ *   - whatever `PyGenericArrType_Type.tp_as_number->nb_or(a, b)` returns on
+ *     the generic path;
+ *   - NULL when conversion, error reporting or allocation fails.
+ */
+static PyObject *
+uint_or(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_uint arg1, arg2, other_val;  /* other_val: converted `other` */
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     *
+     * Exact type matches are tested first; `PyArray_IsScalar` is only
+     * consulted when neither operand is exactly a UInt scalar.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UInt);
+        assert(is_forward || PyArray_IsScalar(b, UInt));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     *
+     * `convert_to_uint` reports its verdict through `res` and is expected
+     * to fill in `may_need_deferring`; when that flag is set,
+     * BINOP_GIVE_UP_IF_NEEDED runs before `res` is acted upon.
+     * NOTE(review): that macro is defined outside this view; presumably it
+     * can return early from this function -- confirm.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_uint(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_or, uint_or);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * When neither macro tested below is defined, this case has no
+             * statement of its own and control intentionally falls through
+             * to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             *
+             * Both original operands are handed, unchanged, to the generic
+             * scalar type's `nb_or` slot and its result is returned as is.
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_or(a,b);
+        case CONVERT_PYSCALAR:  /* UINT_setitem is given &other_val as target */
+            if (UINT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* compiled out in this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep operand order: arg1 stands for a, arg2 for b */
+        arg1 = PyArrayScalar_VAL(a, UInt);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UInt);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_uint out;
+#if 0  /* second output and temporary object: compiled out here */
+    npy_uint out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     *
+     * A nonzero `retstatus` is forwarded to PyUFunc_GiveFloatingpointErrors
+     * under the name `"scalar " OP_NAME`; a negative return from that call
+     * makes this function return NULL.
+     */
+#if 0  /* two-output call: compiled out here */
+    int retstatus = uint_ctype_or(arg1, arg2, &out, &out2);
+#else
+    int retstatus = uint_ctype_or(arg1, arg2, &out);
+#endif
+
+#if 0  /* float status lookup: compiled out here */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* two-element tuple result: compiled out here */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active path: wrap `out` in a single new UInt scalar */
+    ret = PyArrayScalar_New(UInt);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UInt, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_or
+#undef IS_uint
+
+
+#line 1217
+#define IS_long
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_or
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "or"
+#endif
+/*
+ * Binary `or` for the Long scalar type (this function is handed to
+ * BINOP_GIVE_UP_IF_NEEDED together with the `nb_or` slot name below).
+ *
+ * Only one operand is converted: the one not taken as the Long scalar is
+ * run through `convert_to_long`, and the conversion result decides between
+ * computing directly, deferring, or delegating to the generic scalar type.
+ *
+ * Returns one of:
+ *   - a scalar created by `PyArrayScalar_New(Long)` that holds the value
+ *     produced by `long_ctype_or`;
+ *   - `NotImplemented` (via Py_RETURN_NOTIMPLEMENTED) when deferring;
+ *   - whatever `PyGenericArrType_Type.tp_as_number->nb_or(a, b)` returns on
+ *     the generic path;
+ *   - NULL when conversion, error reporting or allocation fails.
+ */
+static PyObject *
+long_or(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_long arg1, arg2, other_val;  /* other_val: converted `other` */
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     *
+     * Exact type matches are tested first; `PyArray_IsScalar` is only
+     * consulted when neither operand is exactly a Long scalar.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Long);
+        assert(is_forward || PyArray_IsScalar(b, Long));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     *
+     * `convert_to_long` reports its verdict through `res` and is expected
+     * to fill in `may_need_deferring`; when that flag is set,
+     * BINOP_GIVE_UP_IF_NEEDED runs before `res` is acted upon.
+     * NOTE(review): that macro is defined outside this view; presumably it
+     * can return early from this function -- confirm.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_long(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_or, long_or);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * When neither macro tested below is defined, this case has no
+             * statement of its own and control intentionally falls through
+             * to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             *
+             * Both original operands are handed, unchanged, to the generic
+             * scalar type's `nb_or` slot and its result is returned as is.
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_or(a,b);
+        case CONVERT_PYSCALAR:  /* LONG_setitem is given &other_val as target */
+            if (LONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* compiled out in this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep operand order: arg1 stands for a, arg2 for b */
+        arg1 = PyArrayScalar_VAL(a, Long);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Long);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_long out;
+#if 0  /* second output and temporary object: compiled out here */
+    npy_long out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     *
+     * A nonzero `retstatus` is forwarded to PyUFunc_GiveFloatingpointErrors
+     * under the name `"scalar " OP_NAME`; a negative return from that call
+     * makes this function return NULL.
+     */
+#if 0  /* two-output call: compiled out here */
+    int retstatus = long_ctype_or(arg1, arg2, &out, &out2);
+#else
+    int retstatus = long_ctype_or(arg1, arg2, &out);
+#endif
+
+#if 0  /* float status lookup: compiled out here */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* two-element tuple result: compiled out here */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active path: wrap `out` in a single new Long scalar */
+    ret = PyArrayScalar_New(Long);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Long, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_or
+#undef IS_long
+
+
+#line 1217
+#define IS_ulong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_or
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "or"
+#endif
+/*
+ * Binary `or` for the ULong scalar type (this function is handed to
+ * BINOP_GIVE_UP_IF_NEEDED together with the `nb_or` slot name below).
+ *
+ * Only one operand is converted: the one not taken as the ULong scalar is
+ * run through `convert_to_ulong`, and the conversion result decides between
+ * computing directly, deferring, or delegating to the generic scalar type.
+ *
+ * Returns one of:
+ *   - a scalar created by `PyArrayScalar_New(ULong)` that holds the value
+ *     produced by `ulong_ctype_or`;
+ *   - `NotImplemented` (via Py_RETURN_NOTIMPLEMENTED) when deferring;
+ *   - whatever `PyGenericArrType_Type.tp_as_number->nb_or(a, b)` returns on
+ *     the generic path;
+ *   - NULL when conversion, error reporting or allocation fails.
+ */
+static PyObject *
+ulong_or(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulong arg1, arg2, other_val;  /* other_val: converted `other` */
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     *
+     * Exact type matches are tested first; `PyArray_IsScalar` is only
+     * consulted when neither operand is exactly a ULong scalar.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULong);
+        assert(is_forward || PyArray_IsScalar(b, ULong));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     *
+     * `convert_to_ulong` reports its verdict through `res` and is expected
+     * to fill in `may_need_deferring`; when that flag is set,
+     * BINOP_GIVE_UP_IF_NEEDED runs before `res` is acted upon.
+     * NOTE(review): that macro is defined outside this view; presumably it
+     * can return early from this function -- confirm.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_or, ulong_or);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             *
+             * When neither macro tested below is defined, this case has no
+             * statement of its own and control intentionally falls through
+             * to PROMOTION_REQUIRED.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             *
+             * Both original operands are handed, unchanged, to the generic
+             * scalar type's `nb_or` slot and its result is returned as is.
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_or(a,b);
+        case CONVERT_PYSCALAR:  /* ULONG_setitem is given &other_val as target */
+            if (ULONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* compiled out in this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep operand order: arg1 stands for a, arg2 for b */
+        arg1 = PyArrayScalar_VAL(a, ULong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULong);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ulong out;
+#if 0  /* second output and temporary object: compiled out here */
+    npy_ulong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     *
+     * A nonzero `retstatus` is forwarded to PyUFunc_GiveFloatingpointErrors
+     * under the name `"scalar " OP_NAME`; a negative return from that call
+     * makes this function return NULL.
+     */
+#if 0  /* two-output call: compiled out here */
+    int retstatus = ulong_ctype_or(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulong_ctype_or(arg1, arg2, &out);
+#endif
+
+#if 0  /* float status lookup: compiled out here */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* two-element tuple result: compiled out here */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active path: wrap `out` in a single new ULong scalar */
+    ret = PyArrayScalar_New(ULong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_or
+#undef IS_ulong
+
+
+#line 1217
+#define IS_longlong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_or
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "or"
+#endif
+
+static PyObject *
+longlong_or(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_longlong arg1, arg2, other_val;
+
+    /*
+     * `or` of two scalars where at least one is (a subclass of) LongLong.
+     * `is_forward` only records that `a` is the operand matching this
+     * method; it does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongLong);
+        assert(is_forward || PyArray_IsScalar(b, LongLong));
+    }
+
+    /*
+     * Try to extract the other operand into `other_val`.  `res` reports
+     * whether that worked or how to proceed instead; `may_need_deferring`
+     * gates the BINOP_GIVE_UP_IF_NEEDED check below.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_longlong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_or, longlong_or);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * Unless the (c)longdouble guard below is compiled in, this case
+             * deliberately falls through to the generic array path, which
+             * could otherwise recurse infinitely for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Hand the whole operation to the generic scalar type's nb_or.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_or(a,b);
+        case CONVERT_PYSCALAR:  /* let LONGLONG_setitem fill other_val */
+            if (LONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep the original operand order (a, b) */
+        arg1 = PyArrayScalar_VAL(a, LongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongLong);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_longlong out;
+#if 0
+    npy_longlong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value for integer
+     * functions; a nonzero value is passed on to
+     * PyUFunc_GiveFloatingpointErrors below.  Float functions should
+     * always return 0 and then use `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = longlong_ctype_or(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longlong_ctype_or(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(LongLong);  /* new scalar that will hold `out` */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_or
+#undef IS_longlong
+
+
+#line 1217
+#define IS_ulonglong
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_or
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "or"
+#endif
+
+static PyObject *
+ulonglong_or(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulonglong arg1, arg2, other_val;
+
+    /*
+     * `or` of two scalars where at least one is (a subclass of) ULongLong.
+     * `is_forward` only records that `a` is the operand matching this
+     * method; it does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULongLong);
+        assert(is_forward || PyArray_IsScalar(b, ULongLong));
+    }
+
+    /*
+     * Try to extract the other operand into `other_val`.  `res` reports
+     * whether that worked or how to proceed instead; `may_need_deferring`
+     * gates the BINOP_GIVE_UP_IF_NEEDED check below.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulonglong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_or, ulonglong_or);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * Unless the (c)longdouble guard below is compiled in, this case
+             * deliberately falls through to the generic array path, which
+             * could otherwise recurse infinitely for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Hand the whole operation to the generic scalar type's nb_or.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_or(a,b);
+        case CONVERT_PYSCALAR:  /* let ULONGLONG_setitem fill other_val */
+            if (ULONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep the original operand order (a, b) */
+        arg1 = PyArrayScalar_VAL(a, ULongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULongLong);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ulonglong out;
+#if 0
+    npy_ulonglong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value for integer
+     * functions; a nonzero value is passed on to
+     * PyUFunc_GiveFloatingpointErrors below.  Float functions should
+     * always return 0 and then use `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ulonglong_ctype_or(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulonglong_ctype_or(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(ULongLong);  /* new scalar that will hold `out` */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_or
+#undef IS_ulonglong
+
+
+#line 1217
+#define IS_byte
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_xor
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "xor"
+#endif
+
+static PyObject *
+byte_xor(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_byte arg1, arg2, other_val;
+
+    /*
+     * `xor` of two scalars where at least one is (a subclass of) Byte.
+     * `is_forward` only records that `a` is the operand matching this
+     * method; it does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Byte);
+        assert(is_forward || PyArray_IsScalar(b, Byte));
+    }
+
+    /*
+     * Try to extract the other operand into `other_val`.  `res` reports
+     * whether that worked or how to proceed instead; `may_need_deferring`
+     * gates the BINOP_GIVE_UP_IF_NEEDED check below.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_byte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_xor, byte_xor);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * Unless the (c)longdouble guard below is compiled in, this case
+             * deliberately falls through to the generic array path, which
+             * could otherwise recurse infinitely for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Hand the whole operation to the generic scalar type's nb_xor.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_xor(a,b);
+        case CONVERT_PYSCALAR:  /* let BYTE_setitem fill other_val */
+            if (BYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep the original operand order (a, b) */
+        arg1 = PyArrayScalar_VAL(a, Byte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Byte);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_byte out;
+#if 0
+    npy_byte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value for integer
+     * functions; a nonzero value is passed on to
+     * PyUFunc_GiveFloatingpointErrors below.  Float functions should
+     * always return 0 and then use `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = byte_ctype_xor(arg1, arg2, &out, &out2);
+#else
+    int retstatus = byte_ctype_xor(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Byte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Byte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Byte);  /* new scalar that will hold `out` */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Byte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_xor
+#undef IS_byte
+
+
+#line 1217
+#define IS_ubyte
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_xor
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "xor"
+#endif
+
+static PyObject *
+ubyte_xor(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ubyte arg1, arg2, other_val;
+
+    /*
+     * `xor` of two scalars where at least one is (a subclass of) UByte.
+     * `is_forward` only records that `a` is the operand matching this
+     * method; it does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UByte);
+        assert(is_forward || PyArray_IsScalar(b, UByte));
+    }
+
+    /*
+     * Try to extract the other operand into `other_val`.  `res` reports
+     * whether that worked or how to proceed instead; `may_need_deferring`
+     * gates the BINOP_GIVE_UP_IF_NEEDED check below.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ubyte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_xor, ubyte_xor);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * Unless the (c)longdouble guard below is compiled in, this case
+             * deliberately falls through to the generic array path, which
+             * could otherwise recurse infinitely for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Hand the whole operation to the generic scalar type's nb_xor.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_xor(a,b);
+        case CONVERT_PYSCALAR:  /* let UBYTE_setitem fill other_val */
+            if (UBYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep the original operand order (a, b) */
+        arg1 = PyArrayScalar_VAL(a, UByte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UByte);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ubyte out;
+#if 0
+    npy_ubyte out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value for integer
+     * functions; a nonzero value is passed on to
+     * PyUFunc_GiveFloatingpointErrors below.  Float functions should
+     * always return 0 and then use `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ubyte_ctype_xor(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ubyte_ctype_xor(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UByte);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UByte, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UByte);  /* new scalar that will hold `out` */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UByte, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_xor
+#undef IS_ubyte
+
+
+#line 1217
+#define IS_short
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_xor
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "xor"
+#endif
+
+static PyObject *
+short_xor(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_short arg1, arg2, other_val;
+
+    /*
+     * `xor` of two scalars where at least one is (a subclass of) Short.
+     * `is_forward` only records that `a` is the operand matching this
+     * method; it does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Short);
+        assert(is_forward || PyArray_IsScalar(b, Short));
+    }
+
+    /*
+     * Try to extract the other operand into `other_val`.  `res` reports
+     * whether that worked or how to proceed instead; `may_need_deferring`
+     * gates the BINOP_GIVE_UP_IF_NEEDED check below.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_short(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_xor, short_xor);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * Unless the (c)longdouble guard below is compiled in, this case
+             * deliberately falls through to the generic array path, which
+             * could otherwise recurse infinitely for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Hand the whole operation to the generic scalar type's nb_xor.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_xor(a,b);
+        case CONVERT_PYSCALAR:  /* let SHORT_setitem fill other_val */
+            if (SHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep the original operand order (a, b) */
+        arg1 = PyArrayScalar_VAL(a, Short);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Short);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_short out;
+#if 0
+    npy_short out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value for integer
+     * functions; a nonzero value is passed on to
+     * PyUFunc_GiveFloatingpointErrors below.  Float functions should
+     * always return 0 and then use `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = short_ctype_xor(arg1, arg2, &out, &out2);
+#else
+    int retstatus = short_ctype_xor(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Short);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Short, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Short);  /* new scalar that will hold `out` */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Short, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_xor
+#undef IS_short
+
+
+#line 1217
+#define IS_ushort
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_xor
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "xor"
+#endif
+
+static PyObject *
+ushort_xor(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ushort arg1, arg2, other_val;
+
+    /*
+     * `xor` of two scalars where at least one is (a subclass of) UShort.
+     * `is_forward` only records that `a` is the operand matching this
+     * method; it does not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UShort);
+        assert(is_forward || PyArray_IsScalar(b, UShort));
+    }
+
+    /*
+     * Try to extract the other operand into `other_val`.  `res` reports
+     * whether that worked or how to proceed instead; `may_need_deferring`
+     * gates the BINOP_GIVE_UP_IF_NEEDED check below.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ushort(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_xor, ushort_xor);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * Unless the (c)longdouble guard below is compiled in, this case
+             * deliberately falls through to the generic array path, which
+             * could otherwise recurse infinitely for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Hand the whole operation to the generic scalar type's nb_xor.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_xor(a,b);
+        case CONVERT_PYSCALAR:  /* let USHORT_setitem fill other_val */
+            if (USHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep the original operand order (a, b) */
+        arg1 = PyArrayScalar_VAL(a, UShort);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UShort);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_ushort out;
+#if 0
+    npy_ushort out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value for integer
+     * functions; a nonzero value is passed on to
+     * PyUFunc_GiveFloatingpointErrors below.  Float functions should
+     * always return 0 and then use `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ushort_ctype_xor(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ushort_ctype_xor(arg1, arg2, &out);
+#endif
+
+#if 0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UShort);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UShort, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(UShort);  /* new scalar that will hold `out` */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UShort, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_xor
+#undef IS_ushort
+
+
+#line 1217
+#define IS_int
+/* OP_NAME labels FP error reports below; "true_divide" drops its "true_": */
+#define IS_xor
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "xor"
+#endif
+
+static PyObject *
+int_xor(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* new scalar object returned to the caller */
+    npy_int arg1, arg2, other_val;  /* operands; converted `other` */
+
+    /*
+     * Decide whether `a` (forward) or `b` (reflected) is the scalar of this
+     * function's own type.  Note `is_forward` does not imply that we can
+     * defer to a subclass `b`; it only says which operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyIntArrType_Type) {
+        is_forward = 1;  /* `a` is exactly the Int scalar type */
+    }
+    else if (Py_TYPE(b) == &PyIntArrType_Type) {
+        is_forward = 0;  /* only `b` is exactly the Int scalar type */
+    }
+    else {
+        /* neither is the exact type: subclasses are involved, so test `a` */
+        is_forward = PyArray_IsScalar(a, Int);
+        assert(is_forward || PyArray_IsScalar(b, Int));
+    }
+
+    /*
+     * Try to extract the other operand into `other_val`.  `res` tells
+     * whether that worked or how the operand has to be handled instead.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* out-parameter of convert_to_int() */
+    conversion_result res = convert_to_int(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* macro presumably returns early to defer */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_xor, int_xor);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be backward if the operation is undefined
+             * forward.  An example is the complex remainder: `complex % bool`
+             * defers even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* without those macros this case falls through on purpose */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_xor(a,b);
+        case CONVERT_PYSCALAR:  /* store `other` via INT_setitem() */
+            if (INT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* never compiled */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep the operands in their original order */
+        arg1 = PyArrayScalar_VAL(a, Int);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Int);
+    }
+
+    /*
+     * Storage for the result of the calculation.
+     */
+    npy_int out;
+#if 0  /* never compiled: locals of the two-result variant */
+    npy_int out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 through the ctype
+     * helper, which receives `&out` for the result.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* never compiled: two-result call signature */
+    int retstatus = int_ctype_xor(arg1, arg2, &out, &out2);
+#else
+    int retstatus = int_ctype_xor(arg1, arg2, &out);
+#endif
+
+#if 0  /* never compiled */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: report it, bail out on failure */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* never compiled: two-result tuple variant */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Int);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Int, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* live branch: wrap `out` in a new Int scalar */
+    ret = PyArrayScalar_New(Int);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Int, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_xor
+#undef IS_int
+
+
+#line 1217
+#define IS_uint
+/* OP_NAME labels FP error reports below; "true_divide" drops its "true_": */
+#define IS_xor
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "xor"
+#endif
+
+static PyObject *
+uint_xor(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* new scalar object returned to the caller */
+    npy_uint arg1, arg2, other_val;  /* operands; converted `other` */
+
+    /*
+     * Decide whether `a` (forward) or `b` (reflected) is the scalar of this
+     * function's own type.  Note `is_forward` does not imply that we can
+     * defer to a subclass `b`; it only says which operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUIntArrType_Type) {
+        is_forward = 1;  /* `a` is exactly the UInt scalar type */
+    }
+    else if (Py_TYPE(b) == &PyUIntArrType_Type) {
+        is_forward = 0;  /* only `b` is exactly the UInt scalar type */
+    }
+    else {
+        /* neither is the exact type: subclasses are involved, so test `a` */
+        is_forward = PyArray_IsScalar(a, UInt);
+        assert(is_forward || PyArray_IsScalar(b, UInt));
+    }
+
+    /*
+     * Try to extract the other operand into `other_val`.  `res` tells
+     * whether that worked or how the operand has to be handled instead.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* out-parameter of convert_to_uint() */
+    conversion_result res = convert_to_uint(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* macro presumably returns early to defer */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_xor, uint_xor);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be backward if the operation is undefined
+             * forward.  An example is the complex remainder: `complex % bool`
+             * defers even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* without those macros this case falls through on purpose */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_xor(a,b);
+        case CONVERT_PYSCALAR:  /* store `other` via UINT_setitem() */
+            if (UINT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* never compiled */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep the operands in their original order */
+        arg1 = PyArrayScalar_VAL(a, UInt);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UInt);
+    }
+
+    /*
+     * Storage for the result of the calculation.
+     */
+    npy_uint out;
+#if 0  /* never compiled: locals of the two-result variant */
+    npy_uint out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 through the ctype
+     * helper, which receives `&out` for the result.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* never compiled: two-result call signature */
+    int retstatus = uint_ctype_xor(arg1, arg2, &out, &out2);
+#else
+    int retstatus = uint_ctype_xor(arg1, arg2, &out);
+#endif
+
+#if 0  /* never compiled */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: report it, bail out on failure */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* never compiled: two-result tuple variant */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(UInt);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, UInt, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* live branch: wrap `out` in a new UInt scalar */
+    ret = PyArrayScalar_New(UInt);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UInt, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_xor
+#undef IS_uint
+
+
+#line 1217
+#define IS_long
+/* OP_NAME labels FP error reports below; "true_divide" drops its "true_": */
+#define IS_xor
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "xor"
+#endif
+
+static PyObject *
+long_xor(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* new scalar object returned to the caller */
+    npy_long arg1, arg2, other_val;  /* operands; converted `other` */
+
+    /*
+     * Decide whether `a` (forward) or `b` (reflected) is the scalar of this
+     * function's own type.  Note `is_forward` does not imply that we can
+     * defer to a subclass `b`; it only says which operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongArrType_Type) {
+        is_forward = 1;  /* `a` is exactly the Long scalar type */
+    }
+    else if (Py_TYPE(b) == &PyLongArrType_Type) {
+        is_forward = 0;  /* only `b` is exactly the Long scalar type */
+    }
+    else {
+        /* neither is the exact type: subclasses are involved, so test `a` */
+        is_forward = PyArray_IsScalar(a, Long);
+        assert(is_forward || PyArray_IsScalar(b, Long));
+    }
+
+    /*
+     * Try to extract the other operand into `other_val`.  `res` tells
+     * whether that worked or how the operand has to be handled instead.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* out-parameter of convert_to_long() */
+    conversion_result res = convert_to_long(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* macro presumably returns early to defer */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_xor, long_xor);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be backward if the operation is undefined
+             * forward.  An example is the complex remainder: `complex % bool`
+             * defers even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* without those macros this case falls through on purpose */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_xor(a,b);
+        case CONVERT_PYSCALAR:  /* store `other` via LONG_setitem() */
+            if (LONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* never compiled */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep the operands in their original order */
+        arg1 = PyArrayScalar_VAL(a, Long);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Long);
+    }
+
+    /*
+     * Storage for the result of the calculation.
+     */
+    npy_long out;
+#if 0  /* never compiled: locals of the two-result variant */
+    npy_long out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 through the ctype
+     * helper, which receives `&out` for the result.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* never compiled: two-result call signature */
+    int retstatus = long_ctype_xor(arg1, arg2, &out, &out2);
+#else
+    int retstatus = long_ctype_xor(arg1, arg2, &out);
+#endif
+
+#if 0  /* never compiled */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: report it, bail out on failure */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* never compiled: two-result tuple variant */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Long);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Long, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* live branch: wrap `out` in a new Long scalar */
+    ret = PyArrayScalar_New(Long);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Long, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_xor
+#undef IS_long
+
+
+#line 1217
+#define IS_ulong
+/* OP_NAME labels FP error reports below; "true_divide" drops its "true_": */
+#define IS_xor
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "xor"
+#endif
+
+static PyObject *
+ulong_xor(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* new scalar object returned to the caller */
+    npy_ulong arg1, arg2, other_val;  /* operands; converted `other` */
+
+    /*
+     * Decide whether `a` (forward) or `b` (reflected) is the scalar of this
+     * function's own type.  Note `is_forward` does not imply that we can
+     * defer to a subclass `b`; it only says which operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongArrType_Type) {
+        is_forward = 1;  /* `a` is exactly the ULong scalar type */
+    }
+    else if (Py_TYPE(b) == &PyULongArrType_Type) {
+        is_forward = 0;  /* only `b` is exactly the ULong scalar type */
+    }
+    else {
+        /* neither is the exact type: subclasses are involved, so test `a` */
+        is_forward = PyArray_IsScalar(a, ULong);
+        assert(is_forward || PyArray_IsScalar(b, ULong));
+    }
+
+    /*
+     * Try to extract the other operand into `other_val`.  `res` tells
+     * whether that worked or how the operand has to be handled instead.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* out-parameter of convert_to_ulong() */
+    conversion_result res = convert_to_ulong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* macro presumably returns early to defer */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_xor, ulong_xor);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be backward if the operation is undefined
+             * forward.  An example is the complex remainder: `complex % bool`
+             * defers even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* without those macros this case falls through on purpose */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_xor(a,b);
+        case CONVERT_PYSCALAR:  /* store `other` via ULONG_setitem() */
+            if (ULONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* never compiled */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep the operands in their original order */
+        arg1 = PyArrayScalar_VAL(a, ULong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULong);
+    }
+
+    /*
+     * Storage for the result of the calculation.
+     */
+    npy_ulong out;
+#if 0  /* never compiled: locals of the two-result variant */
+    npy_ulong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 through the ctype
+     * helper, which receives `&out` for the result.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* never compiled: two-result call signature */
+    int retstatus = ulong_ctype_xor(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulong_ctype_xor(arg1, arg2, &out);
+#endif
+
+#if 0  /* never compiled */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: report it, bail out on failure */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* never compiled: two-result tuple variant */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* live branch: wrap `out` in a new ULong scalar */
+    ret = PyArrayScalar_New(ULong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_xor
+#undef IS_ulong
+
+
+#line 1217
+#define IS_longlong
+/* OP_NAME labels FP error reports below; "true_divide" drops its "true_": */
+#define IS_xor
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "xor"
+#endif
+
+static PyObject *
+longlong_xor(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* new scalar object returned to the caller */
+    npy_longlong arg1, arg2, other_val;  /* operands; converted `other` */
+
+    /*
+     * Decide whether `a` (forward) or `b` (reflected) is the scalar of this
+     * function's own type.  Note `is_forward` does not imply that we can
+     * defer to a subclass `b`; it only says which operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongLongArrType_Type) {
+        is_forward = 1;  /* `a` is exactly the LongLong scalar type */
+    }
+    else if (Py_TYPE(b) == &PyLongLongArrType_Type) {
+        is_forward = 0;  /* only `b` is exactly the LongLong scalar type */
+    }
+    else {
+        /* neither is the exact type: subclasses are involved, so test `a` */
+        is_forward = PyArray_IsScalar(a, LongLong);
+        assert(is_forward || PyArray_IsScalar(b, LongLong));
+    }
+
+    /*
+     * Try to extract the other operand into `other_val`.  `res` tells
+     * whether that worked or how the operand has to be handled instead.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* out-parameter of convert_to_longlong() */
+    conversion_result res = convert_to_longlong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* macro presumably returns early to defer */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_xor, longlong_xor);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be backward if the operation is undefined
+             * forward.  An example is the complex remainder: `complex % bool`
+             * defers even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* without those macros this case falls through on purpose */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_xor(a,b);
+        case CONVERT_PYSCALAR:  /* store `other` via LONGLONG_setitem() */
+            if (LONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* never compiled */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep the operands in their original order */
+        arg1 = PyArrayScalar_VAL(a, LongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongLong);
+    }
+
+    /*
+     * Storage for the result of the calculation.
+     */
+    npy_longlong out;
+#if 0  /* never compiled: locals of the two-result variant */
+    npy_longlong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 through the ctype
+     * helper, which receives `&out` for the result.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* never compiled: two-result call signature */
+    int retstatus = longlong_ctype_xor(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longlong_ctype_xor(arg1, arg2, &out);
+#endif
+
+#if 0  /* never compiled */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: report it, bail out on failure */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* never compiled: two-result tuple variant */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(LongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* live branch: wrap `out` in a new LongLong scalar */
+    ret = PyArrayScalar_New(LongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_xor
+#undef IS_longlong
+
+
+#line 1217
+#define IS_ulonglong
+/* OP_NAME labels FP error reports below; "true_divide" drops its "true_": */
+#define IS_xor
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "xor"
+#endif
+
+static PyObject *
+ulonglong_xor(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* new scalar object returned to the caller */
+    npy_ulonglong arg1, arg2, other_val;  /* operands; converted `other` */
+
+    /*
+     * Decide whether `a` (forward) or `b` (reflected) is the scalar of this
+     * function's own type.  Note `is_forward` does not imply that we can
+     * defer to a subclass `b`; it only says which operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongLongArrType_Type) {
+        is_forward = 1;  /* `a` is exactly the ULongLong scalar type */
+    }
+    else if (Py_TYPE(b) == &PyULongLongArrType_Type) {
+        is_forward = 0;  /* only `b` is exactly the ULongLong scalar type */
+    }
+    else {
+        /* neither is the exact type: subclasses are involved, so test `a` */
+        is_forward = PyArray_IsScalar(a, ULongLong);
+        assert(is_forward || PyArray_IsScalar(b, ULongLong));
+    }
+
+    /*
+     * Try to extract the other operand into `other_val`.  `res` tells
+     * whether that worked or how the operand has to be handled instead.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* out-parameter of convert_to_ulonglong() */
+    conversion_result res = convert_to_ulonglong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* macro presumably returns early to defer */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_xor, ulonglong_xor);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be backward if the operation is undefined
+             * forward.  An example is the complex remainder: `complex % bool`
+             * defers even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* without those macros this case falls through on purpose */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_xor(a,b);
+        case CONVERT_PYSCALAR:  /* store `other` via ULONGLONG_setitem() */
+            if (ULONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 0  /* never compiled */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* keep the operands in their original order */
+        arg1 = PyArrayScalar_VAL(a, ULongLong);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULongLong);
+    }
+
+    /*
+     * Storage for the result of the calculation.
+     */
+    npy_ulonglong out;
+#if 0  /* never compiled: locals of the two-result variant */
+    npy_ulonglong out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 through the ctype
+     * helper, which receives `&out` for the result.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* never compiled: two-result call signature */
+    int retstatus = ulonglong_ctype_xor(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulonglong_ctype_xor(arg1, arg2, &out);
+#endif
+
+#if 0  /* never compiled */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: report it, bail out on failure */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* never compiled: two-result tuple variant */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(ULongLong);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, ULongLong, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* live branch: wrap `out` in a new ULongLong scalar */
+    ret = PyArrayScalar_New(ULongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULongLong, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_xor
+#undef IS_ulonglong
+
+
+#line 1217
+#define IS_byte
+/* Drop the "true_" from "true_divide" in floating point warnings (OP_NAME): */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+
+static PyObject *
+byte_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_byte arg1, arg2, other_val;
+
+    /*
+     * True division `a / b`; expects at least one byte scalar operand.
+     * `is_forward` is set when `a` is the byte operand.  It does not imply
+     * that we can defer to a subclass `b`, only that `a` fits this method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Byte);
+        assert(is_forward || PyArray_IsScalar(b, Byte));
+    }
+
+    /*
+     * Try to convert the other operand to npy_byte.  `res` reports whether
+     * that worked or which fallback the switch below has to take, and
+     * `may_need_deferring` gates the BINOP_GIVE_UP_IF_NEEDED check.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_byte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, byte_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one when the operation is
+             * undefined forward.  For example, the complex remainder
+             * `complex % bool` defers even though it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value extracted successfully, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * We intentionally fall through to the generic path below; only
+             * (c)longdouble returns early, to avoid infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Delegate to the nb_true_divide slot of the generic scalar type.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:
+            if (BYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* CONVERSION_ERROR was handled above; unreachable */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Byte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Byte);
+    }
+
+    /*
+     * Result is a double; the `#if 0` regions are a disabled two-output path.
+     */
+    npy_double out;
+#if 0
+    npy_double out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation on arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0 and rely on the
+     * `npy_get_floatstatus_barrier` call that follows; here that call is
+     * compiled in (`#if 1`) and its result is OR-ed into `retstatus`.
+     */
+#if 0
+    int retstatus = byte_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = byte_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_byte
+
+
+#line 1217
+#define IS_ubyte
+/* Drop the "true_" from "true_divide" in floating point warnings (OP_NAME): */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+
+static PyObject *
+ubyte_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ubyte arg1, arg2, other_val;
+
+    /*
+     * True division `a / b`; expects at least one ubyte scalar operand.
+     * `is_forward` is set when `a` is the ubyte operand.  It does not imply
+     * that we can defer to a subclass `b`, only that `a` fits this method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UByte);
+        assert(is_forward || PyArray_IsScalar(b, UByte));
+    }
+
+    /*
+     * Try to convert the other operand to npy_ubyte.  `res` reports whether
+     * that worked or which fallback the switch below has to take, and
+     * `may_need_deferring` gates the BINOP_GIVE_UP_IF_NEEDED check.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ubyte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, ubyte_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one when the operation is
+             * undefined forward.  For example, the complex remainder
+             * `complex % bool` defers even though it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value extracted successfully, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * We intentionally fall through to the generic path below; only
+             * (c)longdouble returns early, to avoid infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Delegate to the nb_true_divide slot of the generic scalar type.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:
+            if (UBYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* CONVERSION_ERROR was handled above; unreachable */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UByte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UByte);
+    }
+
+    /*
+     * Result is a double; the `#if 0` regions are a disabled two-output path.
+     */
+    npy_double out;
+#if 0
+    npy_double out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation on arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0 and rely on the
+     * `npy_get_floatstatus_barrier` call that follows; here that call is
+     * compiled in (`#if 1`) and its result is OR-ed into `retstatus`.
+     */
+#if 0
+    int retstatus = ubyte_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ubyte_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_ubyte
+
+
+#line 1217
+#define IS_short
+/* Drop the "true_" from "true_divide" in floating point warnings (OP_NAME): */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+
+static PyObject *
+short_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_short arg1, arg2, other_val;
+
+    /*
+     * True division `a / b`; expects at least one short scalar operand.
+     * `is_forward` is set when `a` is the short operand.  It does not imply
+     * that we can defer to a subclass `b`, only that `a` fits this method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Short);
+        assert(is_forward || PyArray_IsScalar(b, Short));
+    }
+
+    /*
+     * Try to convert the other operand to npy_short.  `res` reports whether
+     * that worked or which fallback the switch below has to take, and
+     * `may_need_deferring` gates the BINOP_GIVE_UP_IF_NEEDED check.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_short(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, short_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one when the operation is
+             * undefined forward.  For example, the complex remainder
+             * `complex % bool` defers even though it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value extracted successfully, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * We intentionally fall through to the generic path below; only
+             * (c)longdouble returns early, to avoid infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Delegate to the nb_true_divide slot of the generic scalar type.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:
+            if (SHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* CONVERSION_ERROR was handled above; unreachable */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Short);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Short);
+    }
+
+    /*
+     * Result is a double; the `#if 0` regions are a disabled two-output path.
+     */
+    npy_double out;
+#if 0
+    npy_double out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation on arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0 and rely on the
+     * `npy_get_floatstatus_barrier` call that follows; here that call is
+     * compiled in (`#if 1`) and its result is OR-ed into `retstatus`.
+     */
+#if 0
+    int retstatus = short_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = short_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_short
+
+
+#line 1217
+#define IS_ushort
+/* Drop the "true_" from "true_divide" in floating point warnings (OP_NAME): */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+
+static PyObject *
+ushort_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ushort arg1, arg2, other_val;
+
+    /*
+     * True division `a / b`; expects at least one ushort scalar operand.
+     * `is_forward` is set when `a` is the ushort operand.  It does not imply
+     * that we can defer to a subclass `b`, only that `a` fits this method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UShort);
+        assert(is_forward || PyArray_IsScalar(b, UShort));
+    }
+
+    /*
+     * Try to convert the other operand to npy_ushort.  `res` reports whether
+     * that worked or which fallback the switch below has to take, and
+     * `may_need_deferring` gates the BINOP_GIVE_UP_IF_NEEDED check.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ushort(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, ushort_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one when the operation is
+             * undefined forward.  For example, the complex remainder
+             * `complex % bool` defers even though it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value extracted successfully, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * We intentionally fall through to the generic path below; only
+             * (c)longdouble returns early, to avoid infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Delegate to the nb_true_divide slot of the generic scalar type.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:
+            if (USHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* CONVERSION_ERROR was handled above; unreachable */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UShort);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UShort);
+    }
+
+    /*
+     * Result is a double; the `#if 0` regions are a disabled two-output path.
+     */
+    npy_double out;
+#if 0
+    npy_double out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation on arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0 and rely on the
+     * `npy_get_floatstatus_barrier` call that follows; here that call is
+     * compiled in (`#if 1`) and its result is OR-ed into `retstatus`.
+     */
+#if 0
+    int retstatus = ushort_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ushort_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_ushort
+
+
+#line 1217
+#define IS_int
+/* Drop the "true_" from "true_divide" in floating point warnings (OP_NAME): */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+
+static PyObject *
+int_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_int arg1, arg2, other_val;
+
+    /*
+     * True division `a / b`; expects at least one int scalar operand.
+     * `is_forward` is set when `a` is the int operand.  It does not imply
+     * that we can defer to a subclass `b`, only that `a` fits this method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Int);
+        assert(is_forward || PyArray_IsScalar(b, Int));
+    }
+
+    /*
+     * Try to convert the other operand to npy_int.  `res` reports whether
+     * that worked or which fallback the switch below has to take, and
+     * `may_need_deferring` gates the BINOP_GIVE_UP_IF_NEEDED check.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_int(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, int_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one when the operation is
+             * undefined forward.  For example, the complex remainder
+             * `complex % bool` defers even though it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value extracted successfully, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * We intentionally fall through to the generic path below; only
+             * (c)longdouble returns early, to avoid infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Delegate to the nb_true_divide slot of the generic scalar type.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:
+            if (INT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* CONVERSION_ERROR was handled above; unreachable */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Int);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Int);
+    }
+
+    /*
+     * Result is a double; the `#if 0` regions are a disabled two-output path.
+     */
+    npy_double out;
+#if 0
+    npy_double out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation on arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0 and rely on the
+     * `npy_get_floatstatus_barrier` call that follows; here that call is
+     * compiled in (`#if 1`) and its result is OR-ed into `retstatus`.
+     */
+#if 0
+    int retstatus = int_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = int_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_int
+
+
+#line 1217
+#define IS_uint
+/* Drop the "true_" from "true_divide" in floating point warnings (OP_NAME): */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+
+static PyObject *
+uint_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_uint arg1, arg2, other_val;
+
+    /*
+     * True division `a / b`; expects at least one uint scalar operand.
+     * `is_forward` is set when `a` is the uint operand.  It does not imply
+     * that we can defer to a subclass `b`, only that `a` fits this method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UInt);
+        assert(is_forward || PyArray_IsScalar(b, UInt));
+    }
+
+    /*
+     * Try to convert the other operand to npy_uint.  `res` reports whether
+     * that worked or which fallback the switch below has to take, and
+     * `may_need_deferring` gates the BINOP_GIVE_UP_IF_NEEDED check.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_uint(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, uint_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one when the operation is
+             * undefined forward.  For example, the complex remainder
+             * `complex % bool` defers even though it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value extracted successfully, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * We intentionally fall through to the generic path below; only
+             * (c)longdouble returns early, to avoid infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Delegate to the nb_true_divide slot of the generic scalar type.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:
+            if (UINT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* CONVERSION_ERROR was handled above; unreachable */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UInt);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UInt);
+    }
+
+    /*
+     * Result is a double; the `#if 0` regions are a disabled two-output path.
+     */
+    npy_double out;
+#if 0
+    npy_double out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation on arg1 and arg2 as a function call.
+     * `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0 and rely on the
+     * `npy_get_floatstatus_barrier` call that follows; here that call is
+     * compiled in (`#if 1`) and its result is OR-ed into `retstatus`.
+     */
+#if 0
+    int retstatus = uint_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = uint_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_uint
+
+
+#line 1217
+#define IS_long
+/* name reported in floating point errors; "true_divide" becomes "divide": */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+
+static PyObject *
+long_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_long arg1, arg2, other_val;
+
+    /*
+     * True division with a `long` scalar operand; the direct path below
+     * returns a Double scalar.  `is_forward` means the first operand fits
+     * this method; it does NOT imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Long);
+        assert(is_forward || PyArray_IsScalar(b, Long));
+    }
+
+    /*
+     * Convert the other operand to a C `npy_long` when it is compatible;
+     * otherwise `res` selects one of the fallbacks in the switch below.
+     *
+     * Note: This same pattern recurs in the sibling scalar functions.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_long(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* presumably the macro may return early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, long_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to `other`.  This is normally a forward operation;
+             * however, it could be backward if an operation is undefined
+             * forward.  For example, the complex remainder `complex % bool`
+             * will defer even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* `other_val` now holds the operand; proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * (c)longdouble returns NotImplemented here, because the array
+             * path can recurse infinitely for it; every other type (such as
+             * this one) intentionally FALLS THROUGH to the next case.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Both are delegated to the generic scalar type's implementation.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val via the setitem function */
+            if (LONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* clear FP status flags */
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Long);
+        arg2 = other_val;
+    }
+    else {  /* `other` is the left operand */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Long);
+    }
+
+    /*
+     * Result storage; `out2`/`obj` are for a disabled two-output variant.
+     */
+    npy_double out;
+#if 0
+    npy_double out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = long_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = long_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Fold the current floating point status into `retstatus`. */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* some floating point flag or error is set */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Double);  /* wrap `out` in a Double scalar */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_long
+
+
+#line 1217
+#define IS_ulong
+/* name reported in floating point errors; "true_divide" becomes "divide": */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+
+static PyObject *
+ulong_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulong arg1, arg2, other_val;
+
+    /*
+     * True division with a `ulong` scalar operand; the direct path below
+     * returns a Double scalar.  `is_forward` means the first operand fits
+     * this method; it does NOT imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULong);
+        assert(is_forward || PyArray_IsScalar(b, ULong));
+    }
+
+    /*
+     * Convert the other operand to a C `npy_ulong` when it is compatible;
+     * otherwise `res` selects one of the fallbacks in the switch below.
+     *
+     * Note: This same pattern recurs in the sibling scalar functions.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* presumably the macro may return early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, ulong_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to `other`.  This is normally a forward operation;
+             * however, it could be backward if an operation is undefined
+             * forward.  For example, the complex remainder `complex % bool`
+             * will defer even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* `other_val` now holds the operand; proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * (c)longdouble returns NotImplemented here, because the array
+             * path can recurse infinitely for it; every other type (such as
+             * this one) intentionally FALLS THROUGH to the next case.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Both are delegated to the generic scalar type's implementation.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val via the setitem function */
+            if (ULONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* clear FP status flags */
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULong);
+        arg2 = other_val;
+    }
+    else {  /* `other` is the left operand */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULong);
+    }
+
+    /*
+     * Result storage; `out2`/`obj` are for a disabled two-output variant.
+     */
+    npy_double out;
+#if 0
+    npy_double out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ulong_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulong_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Fold the current floating point status into `retstatus`. */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* some floating point flag or error is set */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Double);  /* wrap `out` in a Double scalar */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_ulong
+
+
+#line 1217
+#define IS_longlong
+/* name reported in floating point errors; "true_divide" becomes "divide": */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+
+static PyObject *
+longlong_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_longlong arg1, arg2, other_val;
+
+    /*
+     * True division with a `longlong` scalar operand; the direct path below
+     * returns a Double scalar.  `is_forward` means the first operand fits
+     * this method; it does NOT imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongLong);
+        assert(is_forward || PyArray_IsScalar(b, LongLong));
+    }
+
+    /*
+     * Convert the other operand to a C `npy_longlong` when it is compatible;
+     * otherwise `res` selects one of the fallbacks in the switch below.
+     *
+     * Note: This same pattern recurs in the sibling scalar functions.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_longlong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* presumably the macro may return early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, longlong_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to `other`.  This is normally a forward operation;
+             * however, it could be backward if an operation is undefined
+             * forward.  For example, the complex remainder `complex % bool`
+             * will defer even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* `other_val` now holds the operand; proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * (c)longdouble returns NotImplemented here, because the array
+             * path can recurse infinitely for it; every other type (such as
+             * this one) intentionally FALLS THROUGH to the next case.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Both are delegated to the generic scalar type's implementation.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val via the setitem function */
+            if (LONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* clear FP status flags */
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, LongLong);
+        arg2 = other_val;
+    }
+    else {  /* `other` is the left operand */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongLong);
+    }
+
+    /*
+     * Result storage; `out2`/`obj` are for a disabled two-output variant.
+     */
+    npy_double out;
+#if 0
+    npy_double out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = longlong_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longlong_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Fold the current floating point status into `retstatus`. */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* some floating point flag or error is set */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Double);  /* wrap `out` in a Double scalar */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_longlong
+
+
+#line 1217
+#define IS_ulonglong
+/* name reported in floating point errors; "true_divide" becomes "divide": */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+
+static PyObject *
+ulonglong_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_ulonglong arg1, arg2, other_val;
+
+    /*
+     * True division with a `ulonglong` scalar operand; the direct path below
+     * returns a Double scalar.  `is_forward` means the first operand fits
+     * this method; it does NOT imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyULongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULongLong);
+        assert(is_forward || PyArray_IsScalar(b, ULongLong));
+    }
+
+    /*
+     * Convert the other operand to a C `npy_ulonglong` when it is compatible;
+     * otherwise `res` selects one of the fallbacks in the switch below.
+     *
+     * Note: This same pattern recurs in the sibling scalar functions.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_ulonglong(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* presumably the macro may return early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, ulonglong_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to `other`.  This is normally a forward operation;
+             * however, it could be backward if an operation is undefined
+             * forward.  For example, the complex remainder `complex % bool`
+             * will defer even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* `other_val` now holds the operand; proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * (c)longdouble returns NotImplemented here, because the array
+             * path can recurse infinitely for it; every other type (such as
+             * this one) intentionally FALLS THROUGH to the next case.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Both are delegated to the generic scalar type's implementation.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val via the setitem function */
+            if (ULONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* clear FP status flags */
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, ULongLong);
+        arg2 = other_val;
+    }
+    else {  /* `other` is the left operand */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULongLong);
+    }
+
+    /*
+     * Result storage; `out2`/`obj` are for a disabled two-output variant.
+     */
+    npy_double out;
+#if 0
+    npy_double out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = ulonglong_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = ulonglong_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Fold the current floating point status into `retstatus`. */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* some floating point flag or error is set */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Double);  /* wrap `out` in a Double scalar */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_ulonglong
+
+
+#line 1217
+#define IS_half
+/* name reported in floating point errors (only "true_divide" is renamed): */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+half_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_half arg1, arg2, other_val;
+
+    /*
+     * Addition with a `half` scalar operand; the direct path below
+     * returns a Half scalar.  `is_forward` means the first operand fits
+     * this method; it does NOT imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyHalfArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyHalfArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Half);
+        assert(is_forward || PyArray_IsScalar(b, Half));
+    }
+
+    /*
+     * Convert the other operand to a C `npy_half` when it is compatible;
+     * otherwise `res` selects one of the fallbacks in the switch below.
+     *
+     * Note: This same pattern recurs in the sibling scalar functions.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_half(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* presumably the macro may return early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, half_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to `other`.  This is normally a forward operation;
+             * however, it could be backward if an operation is undefined
+             * forward.  For example, the complex remainder `complex % bool`
+             * will defer even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* `other_val` now holds the operand; proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * (c)longdouble returns NotImplemented here, because the array
+             * path can recurse infinitely for it; every other type (such as
+             * this one) intentionally FALLS THROUGH to the next case.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Both are delegated to the generic scalar type's implementation.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val via the setitem function */
+            if (HALF_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* clear FP status flags */
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Half);
+        arg2 = other_val;
+    }
+    else {  /* `other` is the left operand */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Half);
+    }
+
+    /*
+     * Result storage; `out2`/`obj` are for a disabled two-output variant.
+     */
+    npy_half out;
+#if 0
+    npy_half out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = half_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = half_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Fold the current floating point status into `retstatus`. */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* some floating point flag or error is set */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Half);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Half, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Half);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Half, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Half);  /* wrap `out` in a Half scalar */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Half, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_half
+
+
+#line 1217
+#define IS_float
+/* name reported in floating point errors (only "true_divide" is renamed): */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+float_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_float arg1, arg2, other_val;
+
+    /*
+     * Addition with a `float` scalar operand; the direct path below
+     * returns a Float scalar.  `is_forward` means the first operand fits
+     * this method; it does NOT imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyFloatArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyFloatArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Float);
+        assert(is_forward || PyArray_IsScalar(b, Float));
+    }
+
+    /*
+     * Convert the other operand to a C `npy_float` when it is compatible;
+     * otherwise `res` selects one of the fallbacks in the switch below.
+     *
+     * Note: This same pattern recurs in the sibling scalar functions.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_float(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* presumably the macro may return early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, float_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to `other`.  This is normally a forward operation;
+             * however, it could be backward if an operation is undefined
+             * forward.  For example, the complex remainder `complex % bool`
+             * will defer even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* `other_val` now holds the operand; proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             * (c)longdouble returns NotImplemented here, because the array
+             * path can recurse infinitely for it; every other type (such as
+             * this one) intentionally FALLS THROUGH to the next case.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Both are delegated to the generic scalar type's implementation.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val via the setitem function */
+            if (FLOAT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* clear FP status flags */
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Float);
+        arg2 = other_val;
+    }
+    else {  /* `other` is the left operand */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Float);
+    }
+
+    /*
+     * Result storage; `out2`/`obj` are for a disabled two-output variant.
+     */
+    npy_float out;
+#if 0
+    npy_float out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = float_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = float_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Fold the current floating point status into `retstatus`. */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* some floating point flag or error is set */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Float);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Float, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Float);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Float, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Float);  /* wrap `out` in a Float scalar */
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Float, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_float
+
+
+#line 1217
+#define IS_double
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+double_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* result scalar returned on success */
+    npy_double arg1, arg2, other_val;
+
+    /*
+     * Scalar "add" for Double operands.  `is_forward` only means that the
+     * first operand fits this method; it does not imply that we can defer
+     * to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type is exactly Double: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Double);
+        assert(is_forward || PyArray_IsScalar(b, Double));
+    }
+
+    /*
+     * Convert the other operand to a C value if it is compatible;
+     * otherwise `res` tells the switch below how to deal with it.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* passed to the conversion call below */
+    conversion_result res = convert_to_double(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, double_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* if compiled out, falls through to PROMOTION_REQUIRED below */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:
+            if (DOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1  /* always compiled in for this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Double);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Double);
+    }
+
+    /*
+     * Prepare the actual calculation: `out` receives the result value.
+     */
+    npy_double out;
+#if 0  /* compiled out: locals for the two-output variant */
+    npy_double out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* compiled out: two-output call form */
+    int retstatus = double_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = double_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags; if any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* compiled out: would return a 2-tuple holding out and out2 */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active branch: store `out` in a new Double scalar */
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_double
+
+
+#line 1217
+#define IS_longdouble
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+longdouble_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* result scalar returned on success */
+    npy_longdouble arg1, arg2, other_val;
+
+    /*
+     * Scalar "add" for LongDouble operands.  `is_forward` only means that
+     * the first operand fits this method; it does not imply that we can
+     * defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type is exactly LongDouble: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongDouble);
+        assert(is_forward || PyArray_IsScalar(b, LongDouble));
+    }
+
+    /*
+     * Convert the other operand to a C value if it is compatible;
+     * otherwise `res` tells the switch below how to deal with it.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* passed to the conversion call below */
+    conversion_result res = convert_to_longdouble(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, longdouble_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* if compiled out, falls through to PROMOTION_REQUIRED below */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:
+            if (LONGDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1  /* always compiled in for this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, LongDouble);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongDouble);
+    }
+
+    /*
+     * Prepare the actual calculation: `out` receives the result value.
+     */
+    npy_longdouble out;
+#if 0  /* compiled out: locals for the two-output variant */
+    npy_longdouble out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* compiled out: two-output call form */
+    int retstatus = longdouble_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longdouble_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags; if any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* compiled out: would return a 2-tuple holding out and out2 */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongDouble, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(LongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongDouble, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active branch: store `out` in a new LongDouble scalar */
+    ret = PyArrayScalar_New(LongDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongDouble, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_longdouble
+
+
+#line 1217
+#define IS_cfloat
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+cfloat_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* result scalar returned on success */
+    npy_cfloat arg1, arg2, other_val;
+
+    /*
+     * Scalar "add" for CFloat operands.  `is_forward` only means that the
+     * first operand fits this method; it does not imply that we can defer
+     * to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyCFloatArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyCFloatArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type is exactly CFloat: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, CFloat);
+        assert(is_forward || PyArray_IsScalar(b, CFloat));
+    }
+
+    /*
+     * Convert the other operand to a C value if it is compatible;
+     * otherwise `res` tells the switch below how to deal with it.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* passed to the conversion call below */
+    conversion_result res = convert_to_cfloat(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, cfloat_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* if compiled out, falls through to PROMOTION_REQUIRED below */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:
+            if (CFLOAT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1  /* always compiled in for this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, CFloat);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, CFloat);
+    }
+
+    /*
+     * Prepare the actual calculation: `out` receives the result value.
+     */
+    npy_cfloat out;
+#if 0  /* compiled out: locals for the two-output variant */
+    npy_cfloat out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* compiled out: two-output call form */
+    int retstatus = cfloat_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = cfloat_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags; if any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* compiled out: would return a 2-tuple holding out and out2 */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(CFloat);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CFloat, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(CFloat);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CFloat, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active branch: store `out` in a new CFloat scalar */
+    ret = PyArrayScalar_New(CFloat);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, CFloat, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_cfloat
+
+
+#line 1217
+#define IS_cdouble
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+cdouble_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* result scalar returned on success */
+    npy_cdouble arg1, arg2, other_val;
+
+    /*
+     * Scalar "add" for CDouble operands.  `is_forward` only means that the
+     * first operand fits this method; it does not imply that we can defer
+     * to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyCDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyCDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type is exactly CDouble: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, CDouble);
+        assert(is_forward || PyArray_IsScalar(b, CDouble));
+    }
+
+    /*
+     * Convert the other operand to a C value if it is compatible;
+     * otherwise `res` tells the switch below how to deal with it.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* passed to the conversion call below */
+    conversion_result res = convert_to_cdouble(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, cdouble_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* if compiled out, falls through to PROMOTION_REQUIRED below */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:
+            if (CDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1  /* always compiled in for this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, CDouble);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, CDouble);
+    }
+
+    /*
+     * Prepare the actual calculation: `out` receives the result value.
+     */
+    npy_cdouble out;
+#if 0  /* compiled out: locals for the two-output variant */
+    npy_cdouble out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* compiled out: two-output call form */
+    int retstatus = cdouble_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = cdouble_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags; if any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* compiled out: would return a 2-tuple holding out and out2 */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(CDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CDouble, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(CDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CDouble, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active branch: store `out` in a new CDouble scalar */
+    ret = PyArrayScalar_New(CDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, CDouble, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_cdouble
+
+
+#line 1217
+#define IS_clongdouble
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_add
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "add"
+#endif
+
+static PyObject *
+clongdouble_add(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* result scalar returned on success */
+    npy_clongdouble arg1, arg2, other_val;
+
+    /*
+     * Scalar "add" for CLongDouble operands.  `is_forward` only means that
+     * the first operand fits this method; it does not imply that we can
+     * defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyCLongDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyCLongDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type is exactly CLongDouble: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, CLongDouble);
+        assert(is_forward || PyArray_IsScalar(b, CLongDouble));
+    }
+
+    /*
+     * Convert the other operand to a C value if it is compatible;
+     * otherwise `res` tells the switch below how to deal with it.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* passed to the conversion call below */
+    conversion_result res = convert_to_clongdouble(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_add, clongdouble_add);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* if compiled out, falls through to PROMOTION_REQUIRED below */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_add(a,b);
+        case CONVERT_PYSCALAR:
+            if (CLONGDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1  /* always compiled in for this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, CLongDouble);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, CLongDouble);
+    }
+
+    /*
+     * Prepare the actual calculation: `out` receives the result value.
+     */
+    npy_clongdouble out;
+#if 0  /* compiled out: locals for the two-output variant */
+    npy_clongdouble out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* compiled out: two-output call form */
+    int retstatus = clongdouble_ctype_add(arg1, arg2, &out, &out2);
+#else
+    int retstatus = clongdouble_ctype_add(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags; if any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* compiled out: would return a 2-tuple holding out and out2 */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(CLongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CLongDouble, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(CLongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CLongDouble, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active branch: store `out` in a new CLongDouble scalar */
+    ret = PyArrayScalar_New(CLongDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, CLongDouble, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_add
+#undef IS_clongdouble
+
+
+#line 1217
+#define IS_half
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+half_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* result scalar returned on success */
+    npy_half arg1, arg2, other_val;
+
+    /*
+     * Scalar "subtract" for Half operands.  `is_forward` only means that
+     * the first operand fits this method; it does not imply that we can
+     * defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyHalfArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyHalfArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither type is exactly Half: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Half);
+        assert(is_forward || PyArray_IsScalar(b, Half));
+    }
+
+    /*
+     * Convert the other operand to a C value if it is compatible;
+     * otherwise `res` tells the switch below how to deal with it.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* passed to the conversion call below */
+    conversion_result res = convert_to_half(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, half_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* if compiled out, falls through to PROMOTION_REQUIRED below */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:
+            if (HALF_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1  /* always compiled in for this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Half);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Half);
+    }
+
+    /*
+     * Prepare the actual calculation: `out` receives the result value.
+     */
+    npy_half out;
+#if 0  /* compiled out: locals for the two-output variant */
+    npy_half out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* compiled out: two-output call form */
+    int retstatus = half_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = half_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags; if any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0  /* compiled out: would return a 2-tuple holding out and out2 */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Half);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Half, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Half);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Half, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* active branch: store `out` in a new Half scalar */
+    ret = PyArrayScalar_New(Half);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Half, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_half
+
+
+#line 1217
+#define IS_float
+/* OP_NAME names the op in FP warnings ("true_divide" becomes "divide"): */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+float_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* result object, only created after the calculation */
+    npy_float arg1, arg2, other_val;
+
+    /*
+     * Binary subtract where at least one of `a`, `b` is a Float scalar.
+     * `is_forward` records that `a` is the operand this method fits; it does
+     * not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyFloatArrType_Type) {
+        is_forward = 1;  /* exact type match on the first operand */
+    }
+    else if (Py_TYPE(b) == &PyFloatArrType_Type) {
+        is_forward = 0;  /* exact type match only on the second operand */
+    }
+    else {
+        /* no exact type match, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Float);
+        assert(is_forward || PyArray_IsScalar(b, Float));
+    }
+
+    /*
+     * Try to convert the other operand to npy_float.  `res` says whether
+     * `other_val` can be used or which fallback path to take, and
+     * `may_need_deferring` whether the give-up check below must run.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;  /* the operand to convert */
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_float(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, float_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* inactive for float: falls through to PROMOTION_REQUIRED */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val from `other` via setitem */
+            if (FLOAT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1  /* enabled in this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* only &arg1 is passed */
+#endif
+    if (is_forward) {  /* a supplies the first argument */
+        arg1 = PyArrayScalar_VAL(a, Float);
+        arg2 = other_val;
+    }
+    else {  /* reflected: b supplies the second argument */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Float);
+    }
+
+    /*
+     * Storage for the result of the calculation.
+     */
+    npy_float out;
+#if 0  /* second output and helper object: disabled in this expansion */
+    npy_float out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* two-output call signature: disabled in this expansion */
+    int retstatus = float_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = float_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Fold the current floating point status into retstatus */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* negative result: propagate as failure */
+        }
+    }
+
+
+#if 0  /* tuple-of-two result: disabled in this expansion */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Float);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Float, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Float);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Float, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Float);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Float, out);  /* store the computed value */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_float
+
+
+#line 1217
+#define IS_double
+/* OP_NAME names the op in FP warnings ("true_divide" becomes "divide"): */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+double_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* result object, only created after the calculation */
+    npy_double arg1, arg2, other_val;
+
+    /*
+     * Binary subtract where at least one of `a`, `b` is a Double scalar.
+     * `is_forward` records that `a` is the operand this method fits; it does
+     * not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyDoubleArrType_Type) {
+        is_forward = 1;  /* exact type match on the first operand */
+    }
+    else if (Py_TYPE(b) == &PyDoubleArrType_Type) {
+        is_forward = 0;  /* exact type match only on the second operand */
+    }
+    else {
+        /* no exact type match, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Double);
+        assert(is_forward || PyArray_IsScalar(b, Double));
+    }
+
+    /*
+     * Try to convert the other operand to npy_double.  `res` says whether
+     * `other_val` can be used or which fallback path to take, and
+     * `may_need_deferring` whether the give-up check below must run.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;  /* the operand to convert */
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_double(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, double_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* inactive for double: falls through to PROMOTION_REQUIRED */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val from `other` via setitem */
+            if (DOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1  /* enabled in this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* only &arg1 is passed */
+#endif
+    if (is_forward) {  /* a supplies the first argument */
+        arg1 = PyArrayScalar_VAL(a, Double);
+        arg2 = other_val;
+    }
+    else {  /* reflected: b supplies the second argument */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Double);
+    }
+
+    /*
+     * Storage for the result of the calculation.
+     */
+    npy_double out;
+#if 0  /* second output and helper object: disabled in this expansion */
+    npy_double out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* two-output call signature: disabled in this expansion */
+    int retstatus = double_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = double_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Fold the current floating point status into retstatus */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* negative result: propagate as failure */
+        }
+    }
+
+
+#if 0  /* tuple-of-two result: disabled in this expansion */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);  /* store the computed value */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_double
+
+
+#line 1217
+#define IS_longdouble
+/* OP_NAME names the op in FP warnings ("true_divide" becomes "divide"): */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+longdouble_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* result object, only created after the calculation */
+    npy_longdouble arg1, arg2, other_val;
+
+    /*
+     * Binary subtract where at least one of `a`, `b` is a LongDouble scalar.
+     * `is_forward` records that `a` is the operand this method fits; it does
+     * not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongDoubleArrType_Type) {
+        is_forward = 1;  /* exact type match on the first operand */
+    }
+    else if (Py_TYPE(b) == &PyLongDoubleArrType_Type) {
+        is_forward = 0;  /* exact type match only on the second operand */
+    }
+    else {
+        /* no exact type match, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongDouble);
+        assert(is_forward || PyArray_IsScalar(b, LongDouble));
+    }
+
+    /*
+     * Try to convert the other operand to npy_longdouble.  `res` says whether
+     * `other_val` can be used or which fallback path to take, and
+     * `may_need_deferring` whether the give-up check below must run.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;  /* the operand to convert */
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_longdouble(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, longdouble_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* active for longdouble: no fall through to the array path */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val from `other` via setitem */
+            if (LONGDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1  /* enabled in this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* only &arg1 is passed */
+#endif
+    if (is_forward) {  /* a supplies the first argument */
+        arg1 = PyArrayScalar_VAL(a, LongDouble);
+        arg2 = other_val;
+    }
+    else {  /* reflected: b supplies the second argument */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongDouble);
+    }
+
+    /*
+     * Storage for the result of the calculation.
+     */
+    npy_longdouble out;
+#if 0  /* second output and helper object: disabled in this expansion */
+    npy_longdouble out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* two-output call signature: disabled in this expansion */
+    int retstatus = longdouble_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longdouble_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Fold the current floating point status into retstatus */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* negative result: propagate as failure */
+        }
+    }
+
+
+#if 0  /* tuple-of-two result: disabled in this expansion */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongDouble, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(LongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongDouble, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(LongDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongDouble, out);  /* store the computed value */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_longdouble
+
+
+#line 1217
+#define IS_cfloat
+/* OP_NAME names the op in FP warnings ("true_divide" becomes "divide"): */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+cfloat_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* result object, only created after the calculation */
+    npy_cfloat arg1, arg2, other_val;
+
+    /*
+     * Binary subtract where at least one of `a`, `b` is a CFloat scalar.
+     * `is_forward` records that `a` is the operand this method fits; it does
+     * not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyCFloatArrType_Type) {
+        is_forward = 1;  /* exact type match on the first operand */
+    }
+    else if (Py_TYPE(b) == &PyCFloatArrType_Type) {
+        is_forward = 0;  /* exact type match only on the second operand */
+    }
+    else {
+        /* no exact type match, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, CFloat);
+        assert(is_forward || PyArray_IsScalar(b, CFloat));
+    }
+
+    /*
+     * Try to convert the other operand to npy_cfloat.  `res` says whether
+     * `other_val` can be used or which fallback path to take, and
+     * `may_need_deferring` whether the give-up check below must run.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;  /* the operand to convert */
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_cfloat(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, cfloat_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* inactive for cfloat: falls through to PROMOTION_REQUIRED */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val from `other` via setitem */
+            if (CFLOAT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1  /* enabled in this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* only &arg1 is passed */
+#endif
+    if (is_forward) {  /* a supplies the first argument */
+        arg1 = PyArrayScalar_VAL(a, CFloat);
+        arg2 = other_val;
+    }
+    else {  /* reflected: b supplies the second argument */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, CFloat);
+    }
+
+    /*
+     * Storage for the result of the calculation.
+     */
+    npy_cfloat out;
+#if 0  /* second output and helper object: disabled in this expansion */
+    npy_cfloat out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* two-output call signature: disabled in this expansion */
+    int retstatus = cfloat_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = cfloat_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Fold the current floating point status into retstatus */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* negative result: propagate as failure */
+        }
+    }
+
+
+#if 0  /* tuple-of-two result: disabled in this expansion */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(CFloat);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CFloat, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(CFloat);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CFloat, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(CFloat);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, CFloat, out);  /* store the computed value */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_cfloat
+
+
+#line 1217
+#define IS_cdouble
+/* OP_NAME names the op in FP warnings ("true_divide" becomes "divide"): */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+cdouble_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* result object, only created after the calculation */
+    npy_cdouble arg1, arg2, other_val;
+
+    /*
+     * Binary subtract where at least one of `a`, `b` is a CDouble scalar.
+     * `is_forward` records that `a` is the operand this method fits; it does
+     * not imply that we can defer to a subclass `b`.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyCDoubleArrType_Type) {
+        is_forward = 1;  /* exact type match on the first operand */
+    }
+    else if (Py_TYPE(b) == &PyCDoubleArrType_Type) {
+        is_forward = 0;  /* exact type match only on the second operand */
+    }
+    else {
+        /* no exact type match, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, CDouble);
+        assert(is_forward || PyArray_IsScalar(b, CDouble));
+    }
+
+    /*
+     * Try to convert the other operand to npy_cdouble.  `res` says whether
+     * `other_val` can be used or which fallback path to take, and
+     * `may_need_deferring` whether the give-up check below must run.
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;  /* the operand to convert */
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_cdouble(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, cdouble_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to other.  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder: `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* inactive for cdouble: falls through to PROMOTION_REQUIRED */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val from `other` via setitem */
+            if (CDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1  /* enabled in this expansion */
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* only &arg1 is passed */
+#endif
+    if (is_forward) {  /* a supplies the first argument */
+        arg1 = PyArrayScalar_VAL(a, CDouble);
+        arg2 = other_val;
+    }
+    else {  /* reflected: b supplies the second argument */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, CDouble);
+    }
+
+    /*
+     * Storage for the result of the calculation.
+     */
+    npy_cdouble out;
+#if 0  /* second output and helper object: disabled in this expansion */
+    npy_cdouble out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0  /* two-output call signature: disabled in this expansion */
+    int retstatus = cdouble_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = cdouble_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Fold the current floating point status into retstatus */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* negative result: propagate as failure */
+        }
+    }
+
+
+#if 0  /* tuple-of-two result: disabled in this expansion */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(CDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CDouble, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(CDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CDouble, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(CDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, CDouble, out);  /* store the computed value */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_cdouble
+
+
+#line 1217
+#define IS_clongdouble
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_subtract
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "subtract"
+#endif
+
+static PyObject *
+clongdouble_subtract(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_clongdouble arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyCLongDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyCLongDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, CLongDouble);
+        assert(is_forward || PyArray_IsScalar(b, CLongDouble));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_clongdouble(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_subtract, clongdouble_subtract);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_subtract(a,b);
+        case CONVERT_PYSCALAR:
+            if (CLONGDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, CLongDouble);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, CLongDouble);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_clongdouble out;
+#if 0
+    npy_clongdouble out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = clongdouble_ctype_subtract(arg1, arg2, &out, &out2);
+#else
+    int retstatus = clongdouble_ctype_subtract(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(CLongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CLongDouble, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(CLongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CLongDouble, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(CLongDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, CLongDouble, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_subtract
+#undef IS_clongdouble
+
+
+#line 1217
+#define IS_half
+#define IS_multiply
+/* Floating point warnings report "divide" instead of "true_divide". */
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+
+/*
+ * Multiply two operands of which at least one is a NumPy half scalar.
+ *
+ * The operand that is not taken directly from a half scalar is run through
+ * `convert_to_half`.  Depending on that result the call continues with the
+ * C-level multiplication, hands back `NotImplemented`, or delegates to the
+ * generic scalar `nb_multiply`.
+ *
+ * Returns a new half scalar holding the product, `NotImplemented` when
+ * deferring to the other operand, or NULL on failure.
+ */
+static PyObject *
+half_multiply(PyObject *a, PyObject *b)
+{
+    /*
+     * A "forward" call only means that `a` fits this method; it does not by
+     * itself allow deferring to a subclass `b`.  Exact type matches settle
+     * the question immediately, the instance check is needed only when
+     * neither operand is exactly a half scalar (subclasses are involved).
+     */
+    int is_forward = (Py_TYPE(a) == &PyHalfArrType_Type);
+    if (!is_forward && Py_TYPE(b) != &PyHalfArrType_Type) {
+        is_forward = PyArray_IsScalar(a, Half);
+        assert(is_forward || PyArray_IsScalar(b, Half));
+    }
+
+    /* The operand whose value still has to be extracted (if compatible). */
+    PyObject *foreign = is_forward ? b : a;
+    npy_half foreign_val;
+    npy_bool check_deferral;
+
+    conversion_result how = convert_to_half(
+            foreign, &foreign_val, &check_deferral);
+    if (how == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (check_deferral) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, half_multiply);
+    }
+
+    switch (how) {
+        case CONVERSION_SUCCESS:
+            /* the value was extracted, nothing else to prepare */
+            break;
+        case CONVERT_PYSCALAR:
+            /* fill in the value through the dtype's setitem */
+            if (HALF_setitem(foreign, (char *)&foreign_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Let the other operand handle it.  Usually this happens for a
+             * forward operation, but a backward one is possible when the
+             * operation is undefined forward (e.g. `complex % bool`).
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * An array-like, an unknown scalar (any Python object, including
+             * integers too large for `long`), or (currently) a subclass of
+             * a NumPy scalar.  Normally this drops through to the array
+             * path; for (c)longdouble that can recurse infinitely, so those
+             * types return NotImplemented instead.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+            /* fall through */
+        case PROMOTION_REQUIRED:
+            /*
+             * A Python scalar larger than this type, or two NumPy scalars
+             * promoting to a third type (uint16 + int16 -> int32).
+             *
+             * TODO: Special casing promotion here could be much faster and
+             *       would handle integer overflow warnings correctly
+             *       (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a, b);
+        default:
+            /* the error result was handled above, so this is unreachable */
+            assert(0);
+            return NULL;
+    }
+
+    npy_half lhs, rhs, product;
+
+    /* Reset the floating point status before touching the operands. */
+    npy_clear_floatstatus_barrier((char*)&lhs);
+    if (!is_forward) {
+        lhs = foreign_val;
+        rhs = PyArrayScalar_VAL(b, Half);
+    }
+    else {
+        lhs = PyArrayScalar_VAL(a, Half);
+        rhs = foreign_val;
+    }
+
+    /*
+     * Do the calculation as a function call.  The return value is the
+     * "floating point error" value used by integer functions; float
+     * functions should return 0 and rely on the status flags read next.
+     */
+    int fpe = half_ctype_multiply(lhs, rhs, &product);
+    fpe |= npy_get_floatstatus_barrier((char*)&product);
+    if (fpe && PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, fpe) < 0) {
+        return NULL;
+    }
+
+    /* Wrap the result; an allocation failure is passed on as NULL. */
+    PyObject *ret = PyArrayScalar_New(Half);
+    if (ret != NULL) {
+        PyArrayScalar_ASSIGN(ret, Half, product);
+    }
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_half
+
+
+#line 1217
+#define IS_float
+#define IS_multiply
+/* Floating point warnings report "divide" instead of "true_divide". */
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+
+/*
+ * Multiply two operands of which at least one is a NumPy float scalar.
+ *
+ * The operand that is not taken directly from a float scalar is run through
+ * `convert_to_float`.  Depending on that result the call continues with the
+ * C-level multiplication, hands back `NotImplemented`, or delegates to the
+ * generic scalar `nb_multiply`.
+ *
+ * Returns a new float scalar holding the product, `NotImplemented` when
+ * deferring to the other operand, or NULL on failure.
+ */
+static PyObject *
+float_multiply(PyObject *a, PyObject *b)
+{
+    /*
+     * A "forward" call only means that `a` fits this method; it does not by
+     * itself allow deferring to a subclass `b`.  Exact type matches settle
+     * the question immediately, the instance check is needed only when
+     * neither operand is exactly a float scalar (subclasses are involved).
+     */
+    int is_forward = (Py_TYPE(a) == &PyFloatArrType_Type);
+    if (!is_forward && Py_TYPE(b) != &PyFloatArrType_Type) {
+        is_forward = PyArray_IsScalar(a, Float);
+        assert(is_forward || PyArray_IsScalar(b, Float));
+    }
+
+    /* The operand whose value still has to be extracted (if compatible). */
+    PyObject *foreign = is_forward ? b : a;
+    npy_float foreign_val;
+    npy_bool check_deferral;
+
+    conversion_result how = convert_to_float(
+            foreign, &foreign_val, &check_deferral);
+    if (how == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (check_deferral) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, float_multiply);
+    }
+
+    switch (how) {
+        case CONVERSION_SUCCESS:
+            /* the value was extracted, nothing else to prepare */
+            break;
+        case CONVERT_PYSCALAR:
+            /* fill in the value through the dtype's setitem */
+            if (FLOAT_setitem(foreign, (char *)&foreign_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Let the other operand handle it.  Usually this happens for a
+             * forward operation, but a backward one is possible when the
+             * operation is undefined forward (e.g. `complex % bool`).
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * An array-like, an unknown scalar (any Python object, including
+             * integers too large for `long`), or (currently) a subclass of
+             * a NumPy scalar.  Normally this drops through to the array
+             * path; for (c)longdouble that can recurse infinitely, so those
+             * types return NotImplemented instead.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+            /* fall through */
+        case PROMOTION_REQUIRED:
+            /*
+             * A Python scalar larger than this type, or two NumPy scalars
+             * promoting to a third type (uint16 + int16 -> int32).
+             *
+             * TODO: Special casing promotion here could be much faster and
+             *       would handle integer overflow warnings correctly
+             *       (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a, b);
+        default:
+            /* the error result was handled above, so this is unreachable */
+            assert(0);
+            return NULL;
+    }
+
+    npy_float lhs, rhs, product;
+
+    /* Reset the floating point status before touching the operands. */
+    npy_clear_floatstatus_barrier((char*)&lhs);
+    if (!is_forward) {
+        lhs = foreign_val;
+        rhs = PyArrayScalar_VAL(b, Float);
+    }
+    else {
+        lhs = PyArrayScalar_VAL(a, Float);
+        rhs = foreign_val;
+    }
+
+    /*
+     * Do the calculation as a function call.  The return value is the
+     * "floating point error" value used by integer functions; float
+     * functions should return 0 and rely on the status flags read next.
+     */
+    int fpe = float_ctype_multiply(lhs, rhs, &product);
+    fpe |= npy_get_floatstatus_barrier((char*)&product);
+    if (fpe && PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, fpe) < 0) {
+        return NULL;
+    }
+
+    /* Wrap the result; an allocation failure is passed on as NULL. */
+    PyObject *ret = PyArrayScalar_New(Float);
+    if (ret != NULL) {
+        PyArrayScalar_ASSIGN(ret, Float, product);
+    }
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_float
+
+
+#line 1217
+#define IS_double
+#define IS_multiply
+/* Floating point warnings report "divide" instead of "true_divide". */
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+
+/*
+ * Multiply two operands of which at least one is a NumPy double scalar.
+ *
+ * The operand that is not taken directly from a double scalar is run through
+ * `convert_to_double`.  Depending on that result the call continues with the
+ * C-level multiplication, hands back `NotImplemented`, or delegates to the
+ * generic scalar `nb_multiply`.
+ *
+ * Returns a new double scalar holding the product, `NotImplemented` when
+ * deferring to the other operand, or NULL on failure.
+ */
+static PyObject *
+double_multiply(PyObject *a, PyObject *b)
+{
+    /*
+     * A "forward" call only means that `a` fits this method; it does not by
+     * itself allow deferring to a subclass `b`.  Exact type matches settle
+     * the question immediately, the instance check is needed only when
+     * neither operand is exactly a double scalar (subclasses are involved).
+     */
+    int is_forward = (Py_TYPE(a) == &PyDoubleArrType_Type);
+    if (!is_forward && Py_TYPE(b) != &PyDoubleArrType_Type) {
+        is_forward = PyArray_IsScalar(a, Double);
+        assert(is_forward || PyArray_IsScalar(b, Double));
+    }
+
+    /* The operand whose value still has to be extracted (if compatible). */
+    PyObject *foreign = is_forward ? b : a;
+    npy_double foreign_val;
+    npy_bool check_deferral;
+
+    conversion_result how = convert_to_double(
+            foreign, &foreign_val, &check_deferral);
+    if (how == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (check_deferral) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, double_multiply);
+    }
+
+    switch (how) {
+        case CONVERSION_SUCCESS:
+            /* the value was extracted, nothing else to prepare */
+            break;
+        case CONVERT_PYSCALAR:
+            /* fill in the value through the dtype's setitem */
+            if (DOUBLE_setitem(foreign, (char *)&foreign_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Let the other operand handle it.  Usually this happens for a
+             * forward operation, but a backward one is possible when the
+             * operation is undefined forward (e.g. `complex % bool`).
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * An array-like, an unknown scalar (any Python object, including
+             * integers too large for `long`), or (currently) a subclass of
+             * a NumPy scalar.  Normally this drops through to the array
+             * path; for (c)longdouble that can recurse infinitely, so those
+             * types return NotImplemented instead.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+            /* fall through */
+        case PROMOTION_REQUIRED:
+            /*
+             * A Python scalar larger than this type, or two NumPy scalars
+             * promoting to a third type (uint16 + int16 -> int32).
+             *
+             * TODO: Special casing promotion here could be much faster and
+             *       would handle integer overflow warnings correctly
+             *       (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a, b);
+        default:
+            /* the error result was handled above, so this is unreachable */
+            assert(0);
+            return NULL;
+    }
+
+    npy_double lhs, rhs, product;
+
+    /* Reset the floating point status before touching the operands. */
+    npy_clear_floatstatus_barrier((char*)&lhs);
+    if (!is_forward) {
+        lhs = foreign_val;
+        rhs = PyArrayScalar_VAL(b, Double);
+    }
+    else {
+        lhs = PyArrayScalar_VAL(a, Double);
+        rhs = foreign_val;
+    }
+
+    /*
+     * Do the calculation as a function call.  The return value is the
+     * "floating point error" value used by integer functions; float
+     * functions should return 0 and rely on the status flags read next.
+     */
+    int fpe = double_ctype_multiply(lhs, rhs, &product);
+    fpe |= npy_get_floatstatus_barrier((char*)&product);
+    if (fpe && PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, fpe) < 0) {
+        return NULL;
+    }
+
+    /* Wrap the result; an allocation failure is passed on as NULL. */
+    PyObject *ret = PyArrayScalar_New(Double);
+    if (ret != NULL) {
+        PyArrayScalar_ASSIGN(ret, Double, product);
+    }
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_double
+
+
+#line 1217
+#define IS_longdouble
+#define IS_multiply
+/* Floating point warnings report "divide" instead of "true_divide". */
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+
+/*
+ * Multiply two operands of which at least one is a NumPy longdouble scalar.
+ *
+ * The operand that is not taken directly from a longdouble scalar is run
+ * through `convert_to_longdouble`.  Depending on that result the call
+ * continues with the C-level multiplication, hands back `NotImplemented`,
+ * or delegates to the generic scalar `nb_multiply`.
+ *
+ * Returns a new longdouble scalar holding the product, `NotImplemented`
+ * when deferring to the other operand, or NULL on failure.
+ */
+static PyObject *
+longdouble_multiply(PyObject *a, PyObject *b)
+{
+    /*
+     * A "forward" call only means that `a` fits this method; it does not by
+     * itself allow deferring to a subclass `b`.  Exact type matches settle
+     * the question immediately, the instance check is needed only when
+     * neither operand is exactly a longdouble scalar (subclasses are
+     * involved).
+     */
+    int is_forward = (Py_TYPE(a) == &PyLongDoubleArrType_Type);
+    if (!is_forward && Py_TYPE(b) != &PyLongDoubleArrType_Type) {
+        is_forward = PyArray_IsScalar(a, LongDouble);
+        assert(is_forward || PyArray_IsScalar(b, LongDouble));
+    }
+
+    /* The operand whose value still has to be extracted (if compatible). */
+    PyObject *foreign = is_forward ? b : a;
+    npy_longdouble foreign_val;
+    npy_bool check_deferral;
+
+    conversion_result how = convert_to_longdouble(
+            foreign, &foreign_val, &check_deferral);
+    if (how == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (check_deferral) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, longdouble_multiply);
+    }
+
+    switch (how) {
+        case CONVERSION_SUCCESS:
+            /* the value was extracted, nothing else to prepare */
+            break;
+        case CONVERT_PYSCALAR:
+            /* fill in the value through the dtype's setitem */
+            if (LONGDOUBLE_setitem(foreign, (char *)&foreign_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Let the other operand handle it.  Usually this happens for a
+             * forward operation, but a backward one is possible when the
+             * operation is undefined forward (e.g. `complex % bool`).
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * An array-like, an unknown scalar (any Python object, including
+             * integers too large for `long`), or (currently) a subclass of
+             * a NumPy scalar.  Normally this drops through to the array
+             * path; for (c)longdouble that can recurse infinitely, so those
+             * types return NotImplemented instead.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+            /* fall through */
+        case PROMOTION_REQUIRED:
+            /*
+             * A Python scalar larger than this type, or two NumPy scalars
+             * promoting to a third type (uint16 + int16 -> int32).
+             *
+             * TODO: Special casing promotion here could be much faster and
+             *       would handle integer overflow warnings correctly
+             *       (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a, b);
+        default:
+            /* the error result was handled above, so this is unreachable */
+            assert(0);
+            return NULL;
+    }
+
+    npy_longdouble lhs, rhs, product;
+
+    /* Reset the floating point status before touching the operands. */
+    npy_clear_floatstatus_barrier((char*)&lhs);
+    if (!is_forward) {
+        lhs = foreign_val;
+        rhs = PyArrayScalar_VAL(b, LongDouble);
+    }
+    else {
+        lhs = PyArrayScalar_VAL(a, LongDouble);
+        rhs = foreign_val;
+    }
+
+    /*
+     * Do the calculation as a function call.  The return value is the
+     * "floating point error" value used by integer functions; float
+     * functions should return 0 and rely on the status flags read next.
+     */
+    int fpe = longdouble_ctype_multiply(lhs, rhs, &product);
+    fpe |= npy_get_floatstatus_barrier((char*)&product);
+    if (fpe && PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, fpe) < 0) {
+        return NULL;
+    }
+
+    /* Wrap the result; an allocation failure is passed on as NULL. */
+    PyObject *ret = PyArrayScalar_New(LongDouble);
+    if (ret != NULL) {
+        PyArrayScalar_ASSIGN(ret, LongDouble, product);
+    }
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_longdouble
+
+
+#line 1217
+#define IS_cfloat
+#define IS_multiply
+/* Floating point warnings report "divide" instead of "true_divide". */
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+
+/*
+ * Multiply two operands of which at least one is a NumPy cfloat scalar.
+ *
+ * The operand that is not taken directly from a cfloat scalar is run through
+ * `convert_to_cfloat`.  Depending on that result the call continues with the
+ * C-level multiplication, hands back `NotImplemented`, or delegates to the
+ * generic scalar `nb_multiply`.
+ *
+ * Returns a new cfloat scalar holding the product, `NotImplemented` when
+ * deferring to the other operand, or NULL on failure.
+ */
+static PyObject *
+cfloat_multiply(PyObject *a, PyObject *b)
+{
+    /*
+     * A "forward" call only means that `a` fits this method; it does not by
+     * itself allow deferring to a subclass `b`.  Exact type matches settle
+     * the question immediately, the instance check is needed only when
+     * neither operand is exactly a cfloat scalar (subclasses are involved).
+     */
+    int is_forward = (Py_TYPE(a) == &PyCFloatArrType_Type);
+    if (!is_forward && Py_TYPE(b) != &PyCFloatArrType_Type) {
+        is_forward = PyArray_IsScalar(a, CFloat);
+        assert(is_forward || PyArray_IsScalar(b, CFloat));
+    }
+
+    /* The operand whose value still has to be extracted (if compatible). */
+    PyObject *foreign = is_forward ? b : a;
+    npy_cfloat foreign_val;
+    npy_bool check_deferral;
+
+    conversion_result how = convert_to_cfloat(
+            foreign, &foreign_val, &check_deferral);
+    if (how == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (check_deferral) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, cfloat_multiply);
+    }
+
+    switch (how) {
+        case CONVERSION_SUCCESS:
+            /* the value was extracted, nothing else to prepare */
+            break;
+        case CONVERT_PYSCALAR:
+            /* fill in the value through the dtype's setitem */
+            if (CFLOAT_setitem(foreign, (char *)&foreign_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Let the other operand handle it.  Usually this happens for a
+             * forward operation, but a backward one is possible when the
+             * operation is undefined forward (e.g. `complex % bool`).
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * An array-like, an unknown scalar (any Python object, including
+             * integers too large for `long`), or (currently) a subclass of
+             * a NumPy scalar.  Normally this drops through to the array
+             * path; for (c)longdouble that can recurse infinitely, so those
+             * types return NotImplemented instead.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+            /* fall through */
+        case PROMOTION_REQUIRED:
+            /*
+             * A Python scalar larger than this type, or two NumPy scalars
+             * promoting to a third type (uint16 + int16 -> int32).
+             *
+             * TODO: Special casing promotion here could be much faster and
+             *       would handle integer overflow warnings correctly
+             *       (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a, b);
+        default:
+            /* the error result was handled above, so this is unreachable */
+            assert(0);
+            return NULL;
+    }
+
+    npy_cfloat lhs, rhs, product;
+
+    /* Reset the floating point status before touching the operands. */
+    npy_clear_floatstatus_barrier((char*)&lhs);
+    if (!is_forward) {
+        lhs = foreign_val;
+        rhs = PyArrayScalar_VAL(b, CFloat);
+    }
+    else {
+        lhs = PyArrayScalar_VAL(a, CFloat);
+        rhs = foreign_val;
+    }
+
+    /*
+     * Do the calculation as a function call.  The return value is the
+     * "floating point error" value used by integer functions; float
+     * functions should return 0 and rely on the status flags read next.
+     */
+    int fpe = cfloat_ctype_multiply(lhs, rhs, &product);
+    fpe |= npy_get_floatstatus_barrier((char*)&product);
+    if (fpe && PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, fpe) < 0) {
+        return NULL;
+    }
+
+    /* Wrap the result; an allocation failure is passed on as NULL. */
+    PyObject *ret = PyArrayScalar_New(CFloat);
+    if (ret != NULL) {
+        PyArrayScalar_ASSIGN(ret, CFloat, product);
+    }
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_cfloat
+
+
+#line 1217
+#define IS_cdouble
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_multiply
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+
+static PyObject *
+cdouble_multiply(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_cdouble arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyCDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyCDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, CDouble);
+        assert(is_forward || PyArray_IsScalar(b, CDouble));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_cdouble(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, cdouble_multiply);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a,b);
+        case CONVERT_PYSCALAR:
+            if (CDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, CDouble);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, CDouble);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_cdouble out;
+#if 0
+    npy_cdouble out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = cdouble_ctype_multiply(arg1, arg2, &out, &out2);
+#else
+    int retstatus = cdouble_ctype_multiply(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(CDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CDouble, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(CDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CDouble, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(CDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, CDouble, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_cdouble
+
+
+#line 1217
+#define IS_clongdouble
+/* OP_NAME: operation name for FP warnings ("true_divide" is shortened to "divide") */
+#define IS_multiply
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "multiply"
+#endif
+/* Scalar multiply; expects `a` or `b` to be a clongdouble scalar.  Returns NULL on error. */
+static PyObject *
+clongdouble_multiply(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* object returned on the regular computation path */
+    npy_clongdouble arg1, arg2, other_val;
+
+    /*
+     * Pick the operand this method belongs to: `a` (forward) or `b`.  Note
+     * `is_forward` does not imply that we can defer to a subclass `b`.  It
+     * just means that the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyCLongDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyCLongDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, CLongDouble);
+        assert(is_forward || PyArray_IsScalar(b, CLongDouble));
+    }
+
+    /*
+     * Extract the other value into `other_val` (if it is compatible).
+     * Otherwise, `res` tells the switch below how to deal with it.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* output flag of the conversion call */
+    conversion_result res = convert_to_clongdouble(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro below may return early; confirm */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_multiply, clongdouble_multiply);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * For example, the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value successfully extracted, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we fall through to the array path below (no break),
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;  /* active: IS_clongdouble is defined */
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Delegated to PyGenericArrType_Type's nb_multiply slot.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_multiply(a,b);
+        case CONVERT_PYSCALAR:  /* value not extracted yet; negative setitem result is an error */
+            if (CLONGDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1  /* arg1 is not assigned yet; only its address is passed */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* either way arg1 comes from `a` and arg2 from `b` */
+        arg1 = PyArrayScalar_VAL(a, CLongDouble);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, CLongDouble);
+    }
+
+    /*
+     * Result storage (`out2`/`obj` belong to the disabled `#if 0` variant).
+     */
+    npy_clongdouble out;
+#if 0
+    npy_clongdouble out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 via a function call;
+     * only the single-output (`#else`) variant is compiled.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = clongdouble_ctype_multiply(arg1, arg2, &out, &out2);
+#else
+    int retstatus = clongdouble_ctype_multiply(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags.  If any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* Box `out` into a new CLongDouble scalar (the `#else` branch is compiled). */
+#if 0  /* two-result tuple variant, never compiled */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(CLongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CLongDouble, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(CLongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CLongDouble, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(CLongDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, CLongDouble, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_multiply
+#undef IS_clongdouble
+
+
+#line 1217
+#define IS_half
+/* OP_NAME: operation name for FP warnings ("true_divide" is shortened to "divide") */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+/* Scalar true division; expects `a` or `b` to be a half scalar.  Returns NULL on error. */
+static PyObject *
+half_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* object returned on the regular computation path */
+    npy_half arg1, arg2, other_val;
+
+    /*
+     * Pick the operand this method belongs to: `a` (forward) or `b`.  Note
+     * `is_forward` does not imply that we can defer to a subclass `b`.  It
+     * just means that the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyHalfArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyHalfArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Half);
+        assert(is_forward || PyArray_IsScalar(b, Half));
+    }
+
+    /*
+     * Extract the other value into `other_val` (if it is compatible).
+     * Otherwise, `res` tells the switch below how to deal with it.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* output flag of the conversion call */
+    conversion_result res = convert_to_half(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro below may return early; confirm */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, half_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * For example, the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value successfully extracted, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we fall through to the array path below (no break),
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;  /* compiled only when one of these is defined */
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Delegated to PyGenericArrType_Type's nb_true_divide slot.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:  /* value not extracted yet; negative setitem result is an error */
+            if (HALF_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1  /* arg1 is not assigned yet; only its address is passed */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* either way arg1 comes from `a` and arg2 from `b` */
+        arg1 = PyArrayScalar_VAL(a, Half);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Half);
+    }
+
+    /*
+     * Result storage (`out2`/`obj` belong to the disabled `#if 0` variant).
+     */
+    npy_half out;
+#if 0
+    npy_half out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 via a function call;
+     * only the single-output (`#else`) variant is compiled.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = half_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = half_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags.  If any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* Box `out` into a new Half scalar (the `#else` branch is compiled). */
+#if 0  /* two-result tuple variant, never compiled */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Half);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Half, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Half);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Half, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Half);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Half, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_half
+
+
+#line 1217
+#define IS_float
+/* OP_NAME: operation name for FP warnings ("true_divide" is shortened to "divide") */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+/* Scalar true division; expects `a` or `b` to be a float scalar.  Returns NULL on error. */
+static PyObject *
+float_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* object returned on the regular computation path */
+    npy_float arg1, arg2, other_val;
+
+    /*
+     * Pick the operand this method belongs to: `a` (forward) or `b`.  Note
+     * `is_forward` does not imply that we can defer to a subclass `b`.  It
+     * just means that the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyFloatArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyFloatArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Float);
+        assert(is_forward || PyArray_IsScalar(b, Float));
+    }
+
+    /*
+     * Extract the other value into `other_val` (if it is compatible).
+     * Otherwise, `res` tells the switch below how to deal with it.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* output flag of the conversion call */
+    conversion_result res = convert_to_float(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro below may return early; confirm */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, float_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * For example, the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value successfully extracted, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we fall through to the array path below (no break),
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;  /* compiled only when one of these is defined */
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Delegated to PyGenericArrType_Type's nb_true_divide slot.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:  /* value not extracted yet; negative setitem result is an error */
+            if (FLOAT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1  /* arg1 is not assigned yet; only its address is passed */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* either way arg1 comes from `a` and arg2 from `b` */
+        arg1 = PyArrayScalar_VAL(a, Float);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Float);
+    }
+
+    /*
+     * Result storage (`out2`/`obj` belong to the disabled `#if 0` variant).
+     */
+    npy_float out;
+#if 0
+    npy_float out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 via a function call;
+     * only the single-output (`#else`) variant is compiled.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = float_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = float_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags.  If any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* Box `out` into a new Float scalar (the `#else` branch is compiled). */
+#if 0  /* two-result tuple variant, never compiled */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Float);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Float, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Float);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Float, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Float);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Float, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_float
+
+
+#line 1217
+#define IS_double
+/* OP_NAME: operation name for FP warnings ("true_divide" is shortened to "divide") */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+/* Scalar true division; expects `a` or `b` to be a double scalar.  Returns NULL on error. */
+static PyObject *
+double_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* object returned on the regular computation path */
+    npy_double arg1, arg2, other_val;
+
+    /*
+     * Pick the operand this method belongs to: `a` (forward) or `b`.  Note
+     * `is_forward` does not imply that we can defer to a subclass `b`.  It
+     * just means that the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Double);
+        assert(is_forward || PyArray_IsScalar(b, Double));
+    }
+
+    /*
+     * Extract the other value into `other_val` (if it is compatible).
+     * Otherwise, `res` tells the switch below how to deal with it.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* output flag of the conversion call */
+    conversion_result res = convert_to_double(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro below may return early; confirm */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, double_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * For example, the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value successfully extracted, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we fall through to the array path below (no break),
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;  /* compiled only when one of these is defined */
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Delegated to PyGenericArrType_Type's nb_true_divide slot.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:  /* value not extracted yet; negative setitem result is an error */
+            if (DOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1  /* arg1 is not assigned yet; only its address is passed */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* either way arg1 comes from `a` and arg2 from `b` */
+        arg1 = PyArrayScalar_VAL(a, Double);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Double);
+    }
+
+    /*
+     * Result storage (`out2`/`obj` belong to the disabled `#if 0` variant).
+     */
+    npy_double out;
+#if 0
+    npy_double out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 via a function call;
+     * only the single-output (`#else`) variant is compiled.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = double_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = double_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags.  If any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* Box `out` into a new Double scalar (the `#else` branch is compiled). */
+#if 0  /* two-result tuple variant, never compiled */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_double
+
+
+#line 1217
+#define IS_longdouble
+/* OP_NAME: operation name for FP warnings ("true_divide" is shortened to "divide") */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+/* Scalar true division; expects `a` or `b` to be a longdouble scalar.  Returns NULL on error. */
+static PyObject *
+longdouble_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;  /* object returned on the regular computation path */
+    npy_longdouble arg1, arg2, other_val;
+
+    /*
+     * Pick the operand this method belongs to: `a` (forward) or `b`.  Note
+     * `is_forward` does not imply that we can defer to a subclass `b`.  It
+     * just means that the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither operand has the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongDouble);
+        assert(is_forward || PyArray_IsScalar(b, LongDouble));
+    }
+
+    /*
+     * Extract the other value into `other_val` (if it is compatible).
+     * Otherwise, `res` tells the switch below how to deal with it.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* output flag of the conversion call */
+    conversion_result res = convert_to_longdouble(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro below may return early; confirm */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, longdouble_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * For example, the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value successfully extracted, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we fall through to the array path below (no break),
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;  /* active: IS_longdouble is defined */
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Delegated to PyGenericArrType_Type's nb_true_divide slot.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:  /* value not extracted yet; negative setitem result is an error */
+            if (LONGDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1  /* arg1 is not assigned yet; only its address is passed */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {  /* either way arg1 comes from `a` and arg2 from `b` */
+        arg1 = PyArrayScalar_VAL(a, LongDouble);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongDouble);
+    }
+
+    /*
+     * Result storage (`out2`/`obj` belong to the disabled `#if 0` variant).
+     */
+    npy_longdouble out;
+#if 0
+    npy_longdouble out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 via a function call;
+     * only the single-output (`#else`) variant is compiled.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = longdouble_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longdouble_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags.  If any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* Box `out` into a new LongDouble scalar (the `#else` branch is compiled). */
+#if 0  /* two-result tuple variant, never compiled */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongDouble, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(LongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongDouble, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(LongDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongDouble, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_longdouble
+
+
+#line 1217
+#define IS_cfloat
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+
+static PyObject *
+cfloat_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_cfloat arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyCFloatArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyCFloatArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, CFloat);
+        assert(is_forward || PyArray_IsScalar(b, CFloat));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_cfloat(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, cfloat_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:
+            if (CFLOAT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, CFloat);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, CFloat);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_cfloat out;
+#if 0
+    npy_cfloat out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = cfloat_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = cfloat_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(CFloat);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CFloat, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(CFloat);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CFloat, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(CFloat);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, CFloat, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_cfloat
+
+
+#line 1217
+#define IS_cdouble
+/* OP_NAME labels floating point warnings ("true_divide" -> "divide"): */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+/* True division of `a` by `b`; one operand is expected to be a CDouble. */
+static PyObject *
+cdouble_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_cdouble arg1, arg2, other_val;
+
+    /*
+     * Decide whether `a` is the operand this method is specialized for
+     * ("forward").  Note `is_forward` does not imply that we can defer to
+     * a subclass `b`.  It just means that the first operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyCDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyCDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, CDouble);
+        assert(is_forward || PyArray_IsScalar(b, CDouble));
+    }
+
+    /*
+     * Try to extract the other operand as a C value into `other_val`
+     * (if it is compatible).  Otherwise `res` tells the switch below how
+     * to deal with it.  `may_need_deferring` is filled in by the conversion
+     * helper and gates the give-up check.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_cdouble(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro may return; confirm */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, cdouble_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one if the operation is
+             * undefined forward.  For example, the complex remainder
+             * `complex % bool` defers although it would normally handle it.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we drop through to the generic (array) path below,
+             * but (c)longdouble must not: it can lead to infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by delegating to the generic scalar type's slot.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill `other_val` via CDOUBLE_setitem */
+            if (CDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+    /* Clear the floating point status flags; they are read back below. */
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, CDouble);
+        arg2 = other_val;
+    }
+    else {  /* `other` (taken from `a`) is the left-hand operand */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, CDouble);
+    }
+
+    /*
+     * Result storage; the `#if 0` extras serve a disabled two-output variant.
+     */
+    npy_cdouble out;
+#if 0
+    npy_cdouble out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call
+     * (only the single-output `#else` branch is compiled here).
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = cdouble_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = cdouble_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the status flags.  If any is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* Box `out` in a new CDouble scalar (the `#else` branch is the live one). */
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(CDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CDouble, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(CDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CDouble, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(CDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, CDouble, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_cdouble
+
+
+#line 1217
+#define IS_clongdouble
+/* OP_NAME labels floating point warnings ("true_divide" -> "divide"): */
+#define IS_true_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "true_divide"
+#endif
+/* True division of `a` by `b`; one operand is expected to be a CLongDouble. */
+static PyObject *
+clongdouble_true_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_clongdouble arg1, arg2, other_val;
+
+    /*
+     * Decide whether `a` is the operand this method is specialized for
+     * ("forward").  Note `is_forward` does not imply that we can defer to
+     * a subclass `b`.  It just means that the first operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyCLongDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyCLongDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, CLongDouble);
+        assert(is_forward || PyArray_IsScalar(b, CLongDouble));
+    }
+
+    /*
+     * Try to extract the other operand as a C value into `other_val`
+     * (if it is compatible).  Otherwise `res` tells the switch below how
+     * to deal with it.  `may_need_deferring` is filled in by the conversion
+     * helper and gates the give-up check.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_clongdouble(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro may return; confirm */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_true_divide, clongdouble_true_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one if the operation is
+             * undefined forward.  For example, the complex remainder
+             * `complex % bool` defers although it would normally handle it.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we would drop through to the generic (array) path,
+             * but for (c)longdouble, as here, that can recurse infinitely.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by delegating to the generic scalar type's slot.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_true_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill `other_val` via CLONGDOUBLE_setitem */
+            if (CLONGDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+    /* Clear the floating point status flags; they are read back below. */
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, CLongDouble);
+        arg2 = other_val;
+    }
+    else {  /* `other` (taken from `a`) is the left-hand operand */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, CLongDouble);
+    }
+
+    /*
+     * Result storage; the `#if 0` extras serve a disabled two-output variant.
+     */
+    npy_clongdouble out;
+#if 0
+    npy_clongdouble out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call
+     * (only the single-output `#else` branch is compiled here).
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = clongdouble_ctype_true_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = clongdouble_ctype_true_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the status flags.  If any is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* Box `out` in a new CLongDouble scalar (the `#else` branch is live). */
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(CLongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CLongDouble, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(CLongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, CLongDouble, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(CLongDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, CLongDouble, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_true_divide
+#undef IS_clongdouble
+
+
+#line 1217
+#define IS_half
+/* OP_NAME labels floating point warnings ("true_divide" -> "divide"): */
+#define IS_floor_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "floor_divide"
+#endif
+/* Floor division of `a` by `b`; one operand is expected to be a Half. */
+static PyObject *
+half_floor_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_half arg1, arg2, other_val;
+
+    /*
+     * Decide whether `a` is the operand this method is specialized for
+     * ("forward").  Note `is_forward` does not imply that we can defer to
+     * a subclass `b`.  It just means that the first operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyHalfArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyHalfArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Half);
+        assert(is_forward || PyArray_IsScalar(b, Half));
+    }
+
+    /*
+     * Try to extract the other operand as a C value into `other_val`
+     * (if it is compatible).  Otherwise `res` tells the switch below how
+     * to deal with it.  `may_need_deferring` is filled in by the conversion
+     * helper and gates the give-up check.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_half(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro may return; confirm */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_floor_divide, half_floor_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one if the operation is
+             * undefined forward.  For example, the complex remainder
+             * `complex % bool` defers although it would normally handle it.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we drop through to the generic (array) path below,
+             * but (c)longdouble must not: it can lead to infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by delegating to the generic scalar type's slot.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_floor_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill `other_val` via HALF_setitem */
+            if (HALF_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+    /* Clear the floating point status flags; they are read back below. */
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Half);
+        arg2 = other_val;
+    }
+    else {  /* `other` (taken from `a`) is the left-hand operand */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Half);
+    }
+
+    /*
+     * Result storage; the `#if 0` extras serve a disabled two-output variant.
+     */
+    npy_half out;
+#if 0
+    npy_half out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call
+     * (only the single-output `#else` branch is compiled here).
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = half_ctype_floor_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = half_ctype_floor_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the status flags.  If any is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* Box `out` in a new Half scalar (the `#else` branch is the live one). */
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Half);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Half, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Half);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Half, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Half);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Half, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_floor_divide
+#undef IS_half
+
+
+#line 1217
+#define IS_float
+/* OP_NAME labels floating point warnings ("true_divide" -> "divide"): */
+#define IS_floor_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "floor_divide"
+#endif
+/* Floor division of `a` by `b`; one operand is expected to be a Float. */
+static PyObject *
+float_floor_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_float arg1, arg2, other_val;
+
+    /*
+     * Decide whether `a` is the operand this method is specialized for
+     * ("forward").  Note `is_forward` does not imply that we can defer to
+     * a subclass `b`.  It just means that the first operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyFloatArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyFloatArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Float);
+        assert(is_forward || PyArray_IsScalar(b, Float));
+    }
+
+    /*
+     * Try to extract the other operand as a C value into `other_val`
+     * (if it is compatible).  Otherwise `res` tells the switch below how
+     * to deal with it.  `may_need_deferring` is filled in by the conversion
+     * helper and gates the give-up check.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_float(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro may return; confirm */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_floor_divide, float_floor_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one if the operation is
+             * undefined forward.  For example, the complex remainder
+             * `complex % bool` defers although it would normally handle it.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we drop through to the generic (array) path below,
+             * but (c)longdouble must not: it can lead to infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by delegating to the generic scalar type's slot.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_floor_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill `other_val` via FLOAT_setitem */
+            if (FLOAT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+    /* Clear the floating point status flags; they are read back below. */
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Float);
+        arg2 = other_val;
+    }
+    else {  /* `other` (taken from `a`) is the left-hand operand */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Float);
+    }
+
+    /*
+     * Result storage; the `#if 0` extras serve a disabled two-output variant.
+     */
+    npy_float out;
+#if 0
+    npy_float out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call
+     * (only the single-output `#else` branch is compiled here).
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = float_ctype_floor_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = float_ctype_floor_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the status flags.  If any is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* Box `out` in a new Float scalar (the `#else` branch is the live one). */
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Float);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Float, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Float);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Float, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Float);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Float, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_floor_divide
+#undef IS_float
+
+
+#line 1217
+#define IS_double
+/* OP_NAME labels floating point warnings ("true_divide" -> "divide"): */
+#define IS_floor_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "floor_divide"
+#endif
+/* Floor division of `a` by `b`; one operand is expected to be a Double. */
+static PyObject *
+double_floor_divide(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_double arg1, arg2, other_val;
+
+    /*
+     * Decide whether `a` is the operand this method is specialized for
+     * ("forward").  Note `is_forward` does not imply that we can defer to
+     * a subclass `b`.  It just means that the first operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither has the exact type, so subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Double);
+        assert(is_forward || PyArray_IsScalar(b, Double));
+    }
+
+    /*
+     * Try to extract the other operand as a C value into `other_val`
+     * (if it is compatible).  Otherwise `res` tells the switch below how
+     * to deal with it.  `may_need_deferring` is filled in by the conversion
+     * helper and gates the give-up check.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_double(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro may return; confirm */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_floor_divide, double_floor_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one if the operation is
+             * undefined forward.  For example, the complex remainder
+             * `complex % bool` defers although it would normally handle it.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we drop through to the generic (array) path below,
+             * but (c)longdouble must not: it can lead to infinite recursion.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by delegating to the generic scalar type's slot.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_floor_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill `other_val` via DOUBLE_setitem */
+            if (DOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+    /* Clear the floating point status flags; they are read back below. */
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Double);
+        arg2 = other_val;
+    }
+    else {  /* `other` (taken from `a`) is the left-hand operand */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Double);
+    }
+
+    /*
+     * Result storage; the `#if 0` extras serve a disabled two-output variant.
+     */
+    npy_double out;
+#if 0
+    npy_double out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Do the actual calculation with arg1 and arg2 as a function call
+     * (only the single-output `#else` branch is compiled here).
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = double_ctype_floor_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = double_ctype_floor_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the status flags.  If any is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* Box `out` in a new Double scalar (the `#else` branch is the live one). */
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_floor_divide
+#undef IS_double
+
+
+#line 1217
+#define IS_longdouble
+/* OP_NAME labels floating point errors; drop the "true_" from "true_divide": */
+#define IS_floor_divide
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "floor_divide"
+#endif
+
+static PyObject *  /* result scalar, Py_NotImplemented, or NULL on failure */
+longdouble_floor_divide(PyObject *a, PyObject *b)  /* binary floor_divide */
+{
+    PyObject *ret;  /* scalar object returned on success */
+    npy_longdouble arg1, arg2, other_val;  /* operands; other_val = converted `other` */
+
+    /*
+     * Decide whether the call is "forward", i.e. whether `a` is the operand
+     * this method handles.  Note `is_forward` does not imply that we can defer
+     * to a subclass `b`.  It just means that the first operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongDoubleArrType_Type) {  /* exact type match on a */
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongDoubleArrType_Type) {  /* exact type match on b */
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved: neither operand has the exact type */
+        is_forward = PyArray_IsScalar(a, LongDouble);
+        assert(is_forward || PyArray_IsScalar(b, LongDouble));
+    }
+
+    /*
+     * Convert the operand that is not "ours" into `other_val` if it is
+     * compatible.  Otherwise `res` tells us how to deal with it, which is
+     * somewhat complicated (see the switch below).
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_longdouble(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* macro defined elsewhere; presumably may return early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_floor_divide, longdouble_floor_divide);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one when the operation is
+             * undefined forward.  For example the complex remainder
+             * `complex % bool` defers even though it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)  /* (c)longdouble stops here */
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* other types fall through to PROMOTION_REQUIRED */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by delegating to the generic scalar slot below.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_floor_divide(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val from `other` via setitem */
+            if (LONGDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* clear FP status before computing */
+#endif
+    if (is_forward) {  /* computes: a OP other */
+        arg1 = PyArrayScalar_VAL(a, LongDouble);
+        arg2 = other_val;
+    }
+    else {  /* computes: other OP b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongDouble);
+    }
+
+    /*
+     * Output storage for the actual calculation.
+     */
+    npy_longdouble out;
+#if 0  /* compiled out: this operation has a single output */
+    npy_longdouble out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 through the
+     * type-specific ctype helper.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0 and rely on the
+     * `npy_get_floatstatus_barrier` call that follows.
+     */
+#if 0  /* two-output call, compiled out */
+    int retstatus = longdouble_ctype_floor_divide(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longdouble_ctype_floor_divide(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags; if any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* at least one status bit is set */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* negative result from the error handler: give up */
+        }
+    }
+
+
+#if 0  /* tuple result, compiled out */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongDouble, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(LongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongDouble, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* single scalar result */
+    ret = PyArrayScalar_New(LongDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongDouble, out);  /* store `out` in the new scalar */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_floor_divide
+#undef IS_longdouble
+
+
+#line 1217
+#define IS_half
+/* OP_NAME labels floating point errors; drop the "true_" from "true_divide": */
+#define IS_divmod
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "divmod"
+#endif
+
+static PyObject *  /* result tuple, Py_NotImplemented, or NULL on failure */
+half_divmod(PyObject *a, PyObject *b)  /* binary divmod */
+{
+    PyObject *ret;  /* 2-tuple returned on success */
+    npy_half arg1, arg2, other_val;  /* operands; other_val = converted `other` */
+
+    /*
+     * Decide whether the call is "forward", i.e. whether `a` is the operand
+     * this method handles.  Note `is_forward` does not imply that we can defer
+     * to a subclass `b`.  It just means that the first operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyHalfArrType_Type) {  /* exact type match on a */
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyHalfArrType_Type) {  /* exact type match on b */
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved: neither operand has the exact type */
+        is_forward = PyArray_IsScalar(a, Half);
+        assert(is_forward || PyArray_IsScalar(b, Half));
+    }
+
+    /*
+     * Convert the operand that is not "ours" into `other_val` if it is
+     * compatible.  Otherwise `res` tells us how to deal with it, which is
+     * somewhat complicated (see the switch below).
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_half(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* macro defined elsewhere; presumably may return early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_divmod, half_divmod);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one when the operation is
+             * undefined forward.  For example the complex remainder
+             * `complex % bool` defers even though it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)  /* (c)longdouble stops here */
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* other types fall through to PROMOTION_REQUIRED */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by delegating to the generic scalar slot below.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_divmod(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val from `other` via setitem */
+            if (HALF_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* clear FP status before computing */
+#endif
+    if (is_forward) {  /* computes: a OP other */
+        arg1 = PyArrayScalar_VAL(a, Half);
+        arg2 = other_val;
+    }
+    else {  /* computes: other OP b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Half);
+    }
+
+    /*
+     * Output storage for the actual calculation.
+     */
+    npy_half out;
+#if 1  /* second output plus a temporary for the tuple items */
+    npy_half out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 through the
+     * type-specific ctype helper.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0 and rely on the
+     * `npy_get_floatstatus_barrier` call that follows.
+     */
+#if 1  /* two-output call is the one compiled here */
+    int retstatus = half_ctype_divmod(arg1, arg2, &out, &out2);
+#else
+    int retstatus = half_ctype_divmod(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags; if any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* at least one status bit is set */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* negative result from the error handler: give up */
+        }
+    }
+
+
+#if 1  /* build the (out, out2) result tuple */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Half);
+    if (obj == NULL) {
+        Py_DECREF(ret);  /* release the partially built tuple */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Half, out);
+    PyTuple_SET_ITEM(ret, 0, obj);  /* item 0 holds `out` */
+    obj = PyArrayScalar_New(Half);
+    if (obj == NULL) {
+        Py_DECREF(ret);  /* release the partially built tuple */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Half, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);  /* item 1 holds `out2` */
+#else  /* single-result variant, compiled out */
+    ret = PyArrayScalar_New(Half);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Half, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_divmod
+#undef IS_half
+
+
+#line 1217
+#define IS_float
+/* OP_NAME labels floating point errors; drop the "true_" from "true_divide": */
+#define IS_divmod
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "divmod"
+#endif
+
+static PyObject *  /* result tuple, Py_NotImplemented, or NULL on failure */
+float_divmod(PyObject *a, PyObject *b)  /* binary divmod */
+{
+    PyObject *ret;  /* 2-tuple returned on success */
+    npy_float arg1, arg2, other_val;  /* operands; other_val = converted `other` */
+
+    /*
+     * Decide whether the call is "forward", i.e. whether `a` is the operand
+     * this method handles.  Note `is_forward` does not imply that we can defer
+     * to a subclass `b`.  It just means that the first operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyFloatArrType_Type) {  /* exact type match on a */
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyFloatArrType_Type) {  /* exact type match on b */
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved: neither operand has the exact type */
+        is_forward = PyArray_IsScalar(a, Float);
+        assert(is_forward || PyArray_IsScalar(b, Float));
+    }
+
+    /*
+     * Convert the operand that is not "ours" into `other_val` if it is
+     * compatible.  Otherwise `res` tells us how to deal with it, which is
+     * somewhat complicated (see the switch below).
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_float(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* macro defined elsewhere; presumably may return early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_divmod, float_divmod);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one when the operation is
+             * undefined forward.  For example the complex remainder
+             * `complex % bool` defers even though it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)  /* (c)longdouble stops here */
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* other types fall through to PROMOTION_REQUIRED */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by delegating to the generic scalar slot below.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_divmod(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val from `other` via setitem */
+            if (FLOAT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* clear FP status before computing */
+#endif
+    if (is_forward) {  /* computes: a OP other */
+        arg1 = PyArrayScalar_VAL(a, Float);
+        arg2 = other_val;
+    }
+    else {  /* computes: other OP b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Float);
+    }
+
+    /*
+     * Output storage for the actual calculation.
+     */
+    npy_float out;
+#if 1  /* second output plus a temporary for the tuple items */
+    npy_float out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 through the
+     * type-specific ctype helper.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0 and rely on the
+     * `npy_get_floatstatus_barrier` call that follows.
+     */
+#if 1  /* two-output call is the one compiled here */
+    int retstatus = float_ctype_divmod(arg1, arg2, &out, &out2);
+#else
+    int retstatus = float_ctype_divmod(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags; if any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* at least one status bit is set */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* negative result from the error handler: give up */
+        }
+    }
+
+
+#if 1  /* build the (out, out2) result tuple */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Float);
+    if (obj == NULL) {
+        Py_DECREF(ret);  /* release the partially built tuple */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Float, out);
+    PyTuple_SET_ITEM(ret, 0, obj);  /* item 0 holds `out` */
+    obj = PyArrayScalar_New(Float);
+    if (obj == NULL) {
+        Py_DECREF(ret);  /* release the partially built tuple */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Float, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);  /* item 1 holds `out2` */
+#else  /* single-result variant, compiled out */
+    ret = PyArrayScalar_New(Float);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Float, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_divmod
+#undef IS_float
+
+
+#line 1217
+#define IS_double
+/* OP_NAME labels floating point errors; drop the "true_" from "true_divide": */
+#define IS_divmod
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "divmod"
+#endif
+
+static PyObject *  /* result tuple, Py_NotImplemented, or NULL on failure */
+double_divmod(PyObject *a, PyObject *b)  /* binary divmod */
+{
+    PyObject *ret;  /* 2-tuple returned on success */
+    npy_double arg1, arg2, other_val;  /* operands; other_val = converted `other` */
+
+    /*
+     * Decide whether the call is "forward", i.e. whether `a` is the operand
+     * this method handles.  Note `is_forward` does not imply that we can defer
+     * to a subclass `b`.  It just means that the first operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyDoubleArrType_Type) {  /* exact type match on a */
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyDoubleArrType_Type) {  /* exact type match on b */
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved: neither operand has the exact type */
+        is_forward = PyArray_IsScalar(a, Double);
+        assert(is_forward || PyArray_IsScalar(b, Double));
+    }
+
+    /*
+     * Convert the operand that is not "ours" into `other_val` if it is
+     * compatible.  Otherwise `res` tells us how to deal with it, which is
+     * somewhat complicated (see the switch below).
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_double(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* macro defined elsewhere; presumably may return early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_divmod, double_divmod);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one when the operation is
+             * undefined forward.  For example the complex remainder
+             * `complex % bool` defers even though it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)  /* (c)longdouble stops here */
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* other types fall through to PROMOTION_REQUIRED */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by delegating to the generic scalar slot below.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_divmod(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val from `other` via setitem */
+            if (DOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* clear FP status before computing */
+#endif
+    if (is_forward) {  /* computes: a OP other */
+        arg1 = PyArrayScalar_VAL(a, Double);
+        arg2 = other_val;
+    }
+    else {  /* computes: other OP b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Double);
+    }
+
+    /*
+     * Output storage for the actual calculation.
+     */
+    npy_double out;
+#if 1  /* second output plus a temporary for the tuple items */
+    npy_double out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 through the
+     * type-specific ctype helper.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0 and rely on the
+     * `npy_get_floatstatus_barrier` call that follows.
+     */
+#if 1  /* two-output call is the one compiled here */
+    int retstatus = double_ctype_divmod(arg1, arg2, &out, &out2);
+#else
+    int retstatus = double_ctype_divmod(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags; if any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* at least one status bit is set */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* negative result from the error handler: give up */
+        }
+    }
+
+
+#if 1  /* build the (out, out2) result tuple */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);  /* release the partially built tuple */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out);
+    PyTuple_SET_ITEM(ret, 0, obj);  /* item 0 holds `out` */
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);  /* release the partially built tuple */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);  /* item 1 holds `out2` */
+#else  /* single-result variant, compiled out */
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_divmod
+#undef IS_double
+
+
+#line 1217
+#define IS_longdouble
+/* OP_NAME labels floating point errors; drop the "true_" from "true_divide": */
+#define IS_divmod
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "divmod"
+#endif
+
+static PyObject *  /* result tuple, Py_NotImplemented, or NULL on failure */
+longdouble_divmod(PyObject *a, PyObject *b)  /* binary divmod */
+{
+    PyObject *ret;  /* 2-tuple returned on success */
+    npy_longdouble arg1, arg2, other_val;  /* operands; other_val = converted `other` */
+
+    /*
+     * Decide whether the call is "forward", i.e. whether `a` is the operand
+     * this method handles.  Note `is_forward` does not imply that we can defer
+     * to a subclass `b`.  It just means that the first operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongDoubleArrType_Type) {  /* exact type match on a */
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongDoubleArrType_Type) {  /* exact type match on b */
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved: neither operand has the exact type */
+        is_forward = PyArray_IsScalar(a, LongDouble);
+        assert(is_forward || PyArray_IsScalar(b, LongDouble));
+    }
+
+    /*
+     * Convert the operand that is not "ours" into `other_val` if it is
+     * compatible.  Otherwise `res` tells us how to deal with it, which is
+     * somewhat complicated (see the switch below).
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_longdouble(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* macro defined elsewhere; presumably may return early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_divmod, longdouble_divmod);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one when the operation is
+             * undefined forward.  For example the complex remainder
+             * `complex % bool` defers even though it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)  /* (c)longdouble stops here */
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* other types fall through to PROMOTION_REQUIRED */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by delegating to the generic scalar slot below.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_divmod(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val from `other` via setitem */
+            if (LONGDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* clear FP status before computing */
+#endif
+    if (is_forward) {  /* computes: a OP other */
+        arg1 = PyArrayScalar_VAL(a, LongDouble);
+        arg2 = other_val;
+    }
+    else {  /* computes: other OP b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongDouble);
+    }
+
+    /*
+     * Output storage for the actual calculation.
+     */
+    npy_longdouble out;
+#if 1  /* second output plus a temporary for the tuple items */
+    npy_longdouble out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 through the
+     * type-specific ctype helper.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0 and rely on the
+     * `npy_get_floatstatus_barrier` call that follows.
+     */
+#if 1  /* two-output call is the one compiled here */
+    int retstatus = longdouble_ctype_divmod(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longdouble_ctype_divmod(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags; if any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* at least one status bit is set */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* negative result from the error handler: give up */
+        }
+    }
+
+
+#if 1  /* build the (out, out2) result tuple */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);  /* release the partially built tuple */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongDouble, out);
+    PyTuple_SET_ITEM(ret, 0, obj);  /* item 0 holds `out` */
+    obj = PyArrayScalar_New(LongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);  /* release the partially built tuple */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongDouble, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);  /* item 1 holds `out2` */
+#else  /* single-result variant, compiled out */
+    ret = PyArrayScalar_New(LongDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongDouble, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_divmod
+#undef IS_longdouble
+
+
+#line 1217
+#define IS_half
+/* OP_NAME labels floating point errors; drop the "true_" from "true_divide": */
+#define IS_remainder
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "remainder"
+#endif
+
+static PyObject *  /* result scalar, Py_NotImplemented, or NULL on failure */
+half_remainder(PyObject *a, PyObject *b)  /* binary remainder */
+{
+    PyObject *ret;  /* scalar object returned on success */
+    npy_half arg1, arg2, other_val;  /* operands; other_val = converted `other` */
+
+    /*
+     * Decide whether the call is "forward", i.e. whether `a` is the operand
+     * this method handles.  Note `is_forward` does not imply that we can defer
+     * to a subclass `b`.  It just means that the first operand fits the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyHalfArrType_Type) {  /* exact type match on a */
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyHalfArrType_Type) {  /* exact type match on b */
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved: neither operand has the exact type */
+        is_forward = PyArray_IsScalar(a, Half);
+        assert(is_forward || PyArray_IsScalar(b, Half));
+    }
+
+    /*
+     * Convert the operand that is not "ours" into `other_val` if it is
+     * compatible.  Otherwise `res` tells us how to deal with it, which is
+     * somewhat complicated (see the switch below).
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_half(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* macro defined elsewhere; presumably may return early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_remainder, half_remainder);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * Defer to the other operand.  This is normally a forward
+             * operation, but it can be a backward one when the operation is
+             * undefined forward.  For example the complex remainder
+             * `complex % bool` defers even though it would normally be handled.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted the value, we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, an unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)  /* (c)longdouble stops here */
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* other types fall through to PROMOTION_REQUIRED */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             * Handled by delegating to the generic scalar slot below.
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_remainder(a,b);
+        case CONVERT_PYSCALAR:  /* fill other_val from `other` via setitem */
+            if (HALF_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);  /* clear FP status before computing */
+#endif
+    if (is_forward) {  /* computes: a OP other */
+        arg1 = PyArrayScalar_VAL(a, Half);
+        arg2 = other_val;
+    }
+    else {  /* computes: other OP b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Half);
+    }
+
+    /*
+     * Output storage for the actual calculation.
+     */
+    npy_half out;
+#if 0  /* compiled out: this operation has a single output */
+    npy_half out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * Perform the actual calculation on arg1 and arg2 through the
+     * type-specific ctype helper.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0 and rely on the
+     * `npy_get_floatstatus_barrier` call that follows.
+     */
+#if 0  /* two-output call, compiled out */
+    int retstatus = half_ctype_remainder(arg1, arg2, &out, &out2);
+#else
+    int retstatus = half_ctype_remainder(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Merge in the FP status flags; if any is set, look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* at least one status bit is set */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;  /* negative result from the error handler: give up */
+        }
+    }
+
+
+#if 0  /* tuple result, compiled out */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Half);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Half, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Half);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Half, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else  /* single scalar result */
+    ret = PyArrayScalar_New(Half);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Half, out);  /* store `out` in the new scalar */
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_remainder
+#undef IS_half
+
+
+#line 1217
+#define IS_float
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_remainder
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "remainder"
+#endif
+
+/*
+ * Binary remainder operation for the NumPy Float scalar type (the slot name
+ * handed to BINOP_GIVE_UP_IF_NEEDED below is `nb_remainder`).
+ *
+ * One of `a`/`b` is expected to be a Float scalar (or subclass); the other
+ * operand is converted with convert_to_float().  Depending on the conversion
+ * result the function either:
+ *   - computes the result itself via float_ctype_remainder(),
+ *   - returns NotImplemented, or
+ *   - forwards the call to the generic scalar `nb_remainder`.
+ *
+ * Returns a new Float scalar on success, NotImplemented when deferring, or
+ * NULL on failure (the failing callee is presumed to have set the Python
+ * error -- not visible from this function).
+ */
+static PyObject *
+float_remainder(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_float arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyFloatArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyFloatArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Float);
+        assert(is_forward || PyArray_IsScalar(b, Float));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_float(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        /* The macro may return from this function (see its definition). */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_remainder, float_remainder);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+            /*
+             * Neither macro is defined in this expansion (only IS_float is),
+             * so control intentionally falls through to PROMOTION_REQUIRED.
+             */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_remainder(a,b);
+        case CONVERT_PYSCALAR:
+            /* Fill `other_val` from the Python object via the dtype setitem. */
+            if (FLOAT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+    /*
+     * Enabled for this (floating point) expansion: reset the floating point
+     * status before the computation so the check further down only reports
+     * flags from this operation.  Only the address of `arg1` is passed;
+     * `arg1` itself has not been assigned yet at this point.
+     */
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    /* Put the operands back into their original (a, b) order. */
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Float);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Float);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_float out;
+    /* Two-output variant of the template; compiled out in this expansion. */
+#if 0
+    npy_float out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = float_ctype_remainder(arg1, arg2, &out, &out2);
+#else
+    int retstatus = float_ctype_remainder(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        /* The operation name reported is "scalar " followed by OP_NAME. */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+    /* Result boxing: the tuple-returning branch is compiled out here. */
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Float);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Float, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Float);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Float, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    /* Allocate a new Float scalar and store the computed value in it. */
+    ret = PyArrayScalar_New(Float);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Float, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_remainder
+#undef IS_float
+
+
+#line 1217
+#define IS_double
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_remainder
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "remainder"
+#endif
+
+/*
+ * Binary remainder operation for the NumPy Double scalar type (the slot name
+ * handed to BINOP_GIVE_UP_IF_NEEDED below is `nb_remainder`).
+ *
+ * One of `a`/`b` is expected to be a Double scalar (or subclass); the other
+ * operand is converted with convert_to_double().  Depending on the
+ * conversion result the function either:
+ *   - computes the result itself via double_ctype_remainder(),
+ *   - returns NotImplemented, or
+ *   - forwards the call to the generic scalar `nb_remainder`.
+ *
+ * Returns a new Double scalar on success, NotImplemented when deferring, or
+ * NULL on failure (the failing callee is presumed to have set the Python
+ * error -- not visible from this function).
+ */
+static PyObject *
+double_remainder(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_double arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Double);
+        assert(is_forward || PyArray_IsScalar(b, Double));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_double(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        /* The macro may return from this function (see its definition). */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_remainder, double_remainder);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+            /*
+             * Neither macro is defined in this expansion (only IS_double is),
+             * so control intentionally falls through to PROMOTION_REQUIRED.
+             */
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_remainder(a,b);
+        case CONVERT_PYSCALAR:
+            /* Fill `other_val` from the Python object via the dtype setitem. */
+            if (DOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+    /*
+     * Enabled for this (floating point) expansion: reset the floating point
+     * status before the computation so the check further down only reports
+     * flags from this operation.  Only the address of `arg1` is passed;
+     * `arg1` itself has not been assigned yet at this point.
+     */
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    /* Put the operands back into their original (a, b) order. */
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Double);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Double);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_double out;
+    /* Two-output variant of the template; compiled out in this expansion. */
+#if 0
+    npy_double out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = double_ctype_remainder(arg1, arg2, &out, &out2);
+#else
+    int retstatus = double_ctype_remainder(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        /* The operation name reported is "scalar " followed by OP_NAME. */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+    /* Result boxing: the tuple-returning branch is compiled out here. */
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(Double);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, Double, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    /* Allocate a new Double scalar and store the computed value in it. */
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_remainder
+#undef IS_double
+
+
+#line 1217
+#define IS_longdouble
+/* drop the "true_" from "true_divide" for floating point warnings: */
+#define IS_remainder
+#ifdef IS_true_divide
+    #define OP_NAME "divide"
+#else
+    #define OP_NAME "remainder"
+#endif
+
+/*
+ * Binary remainder operation for the NumPy LongDouble scalar type (the slot
+ * name handed to BINOP_GIVE_UP_IF_NEEDED below is `nb_remainder`).
+ *
+ * One of `a`/`b` is expected to be a LongDouble scalar (or subclass); the
+ * other operand is converted with convert_to_longdouble().  Depending on the
+ * conversion result the function either:
+ *   - computes the result itself via longdouble_ctype_remainder(),
+ *   - returns NotImplemented (this includes unknown objects, because
+ *     IS_longdouble is defined for this expansion), or
+ *   - forwards the call to the generic scalar `nb_remainder`.
+ *
+ * Returns a new LongDouble scalar on success, NotImplemented when deferring,
+ * or NULL on failure (the failing callee is presumed to have set the Python
+ * error -- not visible from this function).
+ */
+static PyObject *
+longdouble_remainder(PyObject *a, PyObject *b)
+{
+    PyObject *ret;
+    npy_longdouble arg1, arg2, other_val;
+
+    /*
+     * Check if this operation may be considered forward.  Note `is_forward`
+     * does not imply that we can defer to a subclass `b`.  It just means that
+     * the first operand fits to the method.
+     */
+    int is_forward;
+    if (Py_TYPE(a) == &PyLongDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongDouble);
+        assert(is_forward || PyArray_IsScalar(b, LongDouble));
+    }
+
+    /*
+     * Extract the other value (if it is compatible).  Otherwise, decide
+     * how to deal with it.  This is somewhat complicated.
+     *
+     * Note: This pattern is used multiple times below.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    conversion_result res = convert_to_longdouble(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        /* The macro may return from this function (see its definition). */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_remainder, longdouble_remainder);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            /*
+             * defer to other;  This is normally a forward operation.  However,
+             * it could be backward if an operation is undefined forward.
+             * An example is the complex remainder `complex % bool` will defer
+             * even though it would normally handle the operation.
+             */
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * Either an array-like, unknown scalar (any Python object, but
+             * also integers that are too large to convert to `long`), or
+             * even a subclass of a NumPy scalar (currently).
+             *
+             * Generally, we try dropping through to the array path here,
+             * but this can lead to infinite recursions for (c)longdouble.
+             */
+            /*
+             * IS_longdouble is defined for this expansion, so the return
+             * below is active and this case does NOT fall through.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            /*
+             * Python scalar that is larger than the current one, or two
+             * NumPy scalars that promote to a third (uint16 + int16 -> int32).
+             *
+             * TODO: We could special case the promotion case here for much
+             *       better speed and to deal with integer overflow warnings
+             *       correctly.  (e.g. `uint8 * int8` cannot warn).
+             */
+            return PyGenericArrType_Type.tp_as_number->nb_remainder(a,b);
+        case CONVERT_PYSCALAR:
+            /* Fill `other_val` from the Python object via the dtype setitem. */
+            if (LONGDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+    /*
+     * Enabled for this (floating point) expansion: reset the floating point
+     * status before the computation so the check further down only reports
+     * flags from this operation.  Only the address of `arg1` is passed;
+     * `arg1` itself has not been assigned yet at this point.
+     */
+#if 1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+    /* Put the operands back into their original (a, b) order. */
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, LongDouble);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongDouble);
+    }
+
+    /*
+     * Prepare the actual calculation.
+     */
+    npy_longdouble out;
+    /* Two-output variant of the template; compiled out in this expansion. */
+#if 0
+    npy_longdouble out2;
+    PyObject *obj;
+#endif
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     * Note that `retstatus` is the "floating point error" value for integer
+     * functions.  Float functions should always return 0, and then use
+     * the following `npy_get_floatstatus_barrier`.
+     */
+#if 0
+    int retstatus = longdouble_ctype_remainder(arg1, arg2, &out, &out2);
+#else
+    int retstatus = longdouble_ctype_remainder(arg1, arg2, &out);
+#endif
+
+#if 1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        /* The operation name reported is "scalar " followed by OP_NAME. */
+        if (PyUFunc_GiveFloatingpointErrors("scalar " OP_NAME, retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+
+    /* Result boxing: the tuple-returning branch is compiled out here. */
+#if 0
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+    obj = PyArrayScalar_New(LongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongDouble, out);
+    PyTuple_SET_ITEM(ret, 0, obj);
+    obj = PyArrayScalar_New(LongDouble);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(obj, LongDouble, out2);
+    PyTuple_SET_ITEM(ret, 1, obj);
+#else
+    /* Allocate a new LongDouble scalar and store the computed value in it. */
+    ret = PyArrayScalar_New(LongDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongDouble, out);
+#endif
+    return ret;
+}
+
+
+#undef OP_NAME
+#undef IS_remainder
+#undef IS_longdouble
+
+
+
+/*
+ * Evaluates to non-zero when `x` compares equal to 0.  The argument is
+ * parenthesized so that an expression containing an operator of lower
+ * precedence than `==` (e.g. `a & b`) is evaluated as a whole before the
+ * comparison instead of being split by the macro expansion.
+ */
+#define _IS_ZERO(x) ((x) == 0)
+
+#line 1421
+#define IS_byte
+
+/*
+ * Power operation (`nb_power` signature) for the NumPy Byte scalar type.
+ *
+ * A `modulo` other than None is not supported and yields NotImplemented.
+ * Otherwise the operand that is not the Byte scalar is converted with
+ * convert_to_byte(); depending on the result the function computes the
+ * power itself via byte_ctype_power(), returns NotImplemented, or forwards
+ * the call to the generic scalar `nb_power`.  A negative exponent raises
+ * ValueError.
+ *
+ * Returns a new Byte scalar on success, NotImplemented when deferring, or
+ * NULL on failure.
+ */
+static PyObject *
+byte_power(PyObject *a, PyObject *b, PyObject *modulo)
+{
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+
+    PyObject *ret;
+    npy_byte arg1, arg2, other_val;
+
+    /* `is_forward` is true when the first operand is the Byte scalar. */
+    int is_forward;
+    if (Py_TYPE(a) == &PyByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Byte);
+        assert(is_forward || PyArray_IsScalar(b, Byte));
+    }
+    /*
+     * Extract the other value (if it is compatible). See the generic
+     * (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    /* Use the enum type, matching the non-power binary operations. */
+    conversion_result res = convert_to_byte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        /* The macro may return from this function (see its definition). */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, byte_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+            /*
+             * Neither macro is defined in this expansion (only IS_byte is),
+             * so control intentionally falls through to PROMOTION_REQUIRED.
+             */
+        case PROMOTION_REQUIRED:
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:
+            /* Fill `other_val` from the Python object via the dtype setitem. */
+            if (BYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+    /* Floating point status clearing is compiled out for this type. */
+#if !1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    /* Put the operands back into their original (a, b) order. */
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Byte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Byte);
+    }
+
+    /*
+     * Prepare the actual calculation:
+     */
+    npy_byte out;
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     */
+    /* Active for this signed type: reject negative exponents up front. */
+#if 1 && !0
+    if (arg2 < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = byte_ctype_power(arg1, arg2, &out);
+
+    /* Floating point status check is compiled out for this type. */
+#if !1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* Allocate a new Byte scalar and store the computed value in it. */
+    ret = PyArrayScalar_New(Byte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Byte, out);
+
+    return ret;
+}
+
+
+#undef IS_byte
+
+#line 1421
+#define IS_ubyte
+
+/*
+ * Power operation (`nb_power` signature) for the NumPy UByte scalar type.
+ *
+ * A `modulo` other than None is not supported and yields NotImplemented.
+ * Otherwise the operand that is not the UByte scalar is converted with
+ * convert_to_ubyte(); depending on the result the function computes the
+ * power itself via ubyte_ctype_power(), returns NotImplemented, or forwards
+ * the call to the generic scalar `nb_power`.  The negative-exponent check is
+ * compiled out for this unsigned type.
+ *
+ * Returns a new UByte scalar on success, NotImplemented when deferring, or
+ * NULL on failure.
+ */
+static PyObject *
+ubyte_power(PyObject *a, PyObject *b, PyObject *modulo)
+{
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+
+    PyObject *ret;
+    npy_ubyte arg1, arg2, other_val;
+
+    /* `is_forward` is true when the first operand is the UByte scalar. */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUByteArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUByteArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UByte);
+        assert(is_forward || PyArray_IsScalar(b, UByte));
+    }
+    /*
+     * Extract the other value (if it is compatible). See the generic
+     * (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    /* Use the enum type, matching the non-power binary operations. */
+    conversion_result res = convert_to_ubyte(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        /* The macro may return from this function (see its definition). */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, ubyte_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+            /*
+             * Neither macro is defined in this expansion (only IS_ubyte is),
+             * so control intentionally falls through to PROMOTION_REQUIRED.
+             */
+        case PROMOTION_REQUIRED:
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:
+            /* Fill `other_val` from the Python object via the dtype setitem. */
+            if (UBYTE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+    /* Floating point status clearing is compiled out for this type. */
+#if !1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    /* Put the operands back into their original (a, b) order. */
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UByte);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UByte);
+    }
+
+    /*
+     * Prepare the actual calculation:
+     */
+    npy_ubyte out;
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     */
+    /* Compiled out for this unsigned type (`1 && !1` is false). */
+#if 1 && !1
+    if (arg2 < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = ubyte_ctype_power(arg1, arg2, &out);
+
+    /* Floating point status check is compiled out for this type. */
+#if !1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* Allocate a new UByte scalar and store the computed value in it. */
+    ret = PyArrayScalar_New(UByte);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UByte, out);
+
+    return ret;
+}
+
+
+#undef IS_ubyte
+
+#line 1421
+#define IS_short
+
+/*
+ * Power operation (`nb_power` signature) for the NumPy Short scalar type.
+ *
+ * A `modulo` other than None is not supported and yields NotImplemented.
+ * Otherwise the operand that is not the Short scalar is converted with
+ * convert_to_short(); depending on the result the function computes the
+ * power itself via short_ctype_power(), returns NotImplemented, or forwards
+ * the call to the generic scalar `nb_power`.  A negative exponent raises
+ * ValueError.
+ *
+ * Returns a new Short scalar on success, NotImplemented when deferring, or
+ * NULL on failure.
+ */
+static PyObject *
+short_power(PyObject *a, PyObject *b, PyObject *modulo)
+{
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+
+    PyObject *ret;
+    npy_short arg1, arg2, other_val;
+
+    /* `is_forward` is true when the first operand is the Short scalar. */
+    int is_forward;
+    if (Py_TYPE(a) == &PyShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Short);
+        assert(is_forward || PyArray_IsScalar(b, Short));
+    }
+    /*
+     * Extract the other value (if it is compatible). See the generic
+     * (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    /* Use the enum type, matching the non-power binary operations. */
+    conversion_result res = convert_to_short(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        /* The macro may return from this function (see its definition). */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, short_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+            /*
+             * Neither macro is defined in this expansion (only IS_short is),
+             * so control intentionally falls through to PROMOTION_REQUIRED.
+             */
+        case PROMOTION_REQUIRED:
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:
+            /* Fill `other_val` from the Python object via the dtype setitem. */
+            if (SHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+    /* Floating point status clearing is compiled out for this type. */
+#if !1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    /* Put the operands back into their original (a, b) order. */
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Short);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Short);
+    }
+
+    /*
+     * Prepare the actual calculation:
+     */
+    npy_short out;
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     */
+    /* Active for this signed type: reject negative exponents up front. */
+#if 1 && !0
+    if (arg2 < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = short_ctype_power(arg1, arg2, &out);
+
+    /* Floating point status check is compiled out for this type. */
+#if !1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* Allocate a new Short scalar and store the computed value in it. */
+    ret = PyArrayScalar_New(Short);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Short, out);
+
+    return ret;
+}
+
+
+#undef IS_short
+
+#line 1421
+#define IS_ushort
+
+/*
+ * Power operation (`nb_power` signature) for the NumPy UShort scalar type.
+ *
+ * A `modulo` other than None is not supported and yields NotImplemented.
+ * Otherwise the operand that is not the UShort scalar is converted with
+ * convert_to_ushort(); depending on the result the function computes the
+ * power itself via ushort_ctype_power(), returns NotImplemented, or forwards
+ * the call to the generic scalar `nb_power`.  The negative-exponent check is
+ * compiled out for this unsigned type.
+ *
+ * Returns a new UShort scalar on success, NotImplemented when deferring, or
+ * NULL on failure.
+ */
+static PyObject *
+ushort_power(PyObject *a, PyObject *b, PyObject *modulo)
+{
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+
+    PyObject *ret;
+    npy_ushort arg1, arg2, other_val;
+
+    /* `is_forward` is true when the first operand is the UShort scalar. */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUShortArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUShortArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UShort);
+        assert(is_forward || PyArray_IsScalar(b, UShort));
+    }
+    /*
+     * Extract the other value (if it is compatible). See the generic
+     * (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    /* Use the enum type, matching the non-power binary operations. */
+    conversion_result res = convert_to_ushort(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        /* The macro may return from this function (see its definition). */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, ushort_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+            /*
+             * Neither macro is defined in this expansion (only IS_ushort is),
+             * so control intentionally falls through to PROMOTION_REQUIRED.
+             */
+        case PROMOTION_REQUIRED:
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:
+            /* Fill `other_val` from the Python object via the dtype setitem. */
+            if (USHORT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+    /* Floating point status clearing is compiled out for this type. */
+#if !1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    /* Put the operands back into their original (a, b) order. */
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UShort);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UShort);
+    }
+
+    /*
+     * Prepare the actual calculation:
+     */
+    npy_ushort out;
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     */
+    /* Compiled out for this unsigned type (`1 && !1` is false). */
+#if 1 && !1
+    if (arg2 < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = ushort_ctype_power(arg1, arg2, &out);
+
+    /* Floating point status check is compiled out for this type. */
+#if !1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* Allocate a new UShort scalar and store the computed value in it. */
+    ret = PyArrayScalar_New(UShort);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UShort, out);
+
+    return ret;
+}
+
+
+#undef IS_ushort
+
+#line 1421
+#define IS_int
+
+/*
+ * Power operation (`nb_power` signature) for the NumPy Int scalar type.
+ *
+ * A `modulo` other than None is not supported and yields NotImplemented.
+ * Otherwise the operand that is not the Int scalar is converted with
+ * convert_to_int(); depending on the result the function computes the
+ * power itself via int_ctype_power(), returns NotImplemented, or forwards
+ * the call to the generic scalar `nb_power`.  A negative exponent raises
+ * ValueError.
+ *
+ * Returns a new Int scalar on success, NotImplemented when deferring, or
+ * NULL on failure.
+ */
+static PyObject *
+int_power(PyObject *a, PyObject *b, PyObject *modulo)
+{
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+
+    PyObject *ret;
+    npy_int arg1, arg2, other_val;
+
+    /* `is_forward` is true when the first operand is the Int scalar. */
+    int is_forward;
+    if (Py_TYPE(a) == &PyIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Int);
+        assert(is_forward || PyArray_IsScalar(b, Int));
+    }
+    /*
+     * Extract the other value (if it is compatible). See the generic
+     * (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    /* Use the enum type, matching the non-power binary operations. */
+    conversion_result res = convert_to_int(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        /* The macro may return from this function (see its definition). */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, int_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+            /*
+             * Neither macro is defined in this expansion (only IS_int is),
+             * so control intentionally falls through to PROMOTION_REQUIRED.
+             */
+        case PROMOTION_REQUIRED:
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:
+            /* Fill `other_val` from the Python object via the dtype setitem. */
+            if (INT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+    /* Floating point status clearing is compiled out for this type. */
+#if !1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    /* Put the operands back into their original (a, b) order. */
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, Int);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Int);
+    }
+
+    /*
+     * Prepare the actual calculation:
+     */
+    npy_int out;
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     */
+    /* Active for this signed type: reject negative exponents up front. */
+#if 1 && !0
+    if (arg2 < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = int_ctype_power(arg1, arg2, &out);
+
+    /* Floating point status check is compiled out for this type. */
+#if !1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* Allocate a new Int scalar and store the computed value in it. */
+    ret = PyArrayScalar_New(Int);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Int, out);
+
+    return ret;
+}
+
+
+#undef IS_int
+
+#line 1421
+#define IS_uint
+
+/*
+ * Power operation (`nb_power` signature) for the NumPy UInt scalar type.
+ *
+ * A `modulo` other than None is not supported and yields NotImplemented.
+ * Otherwise the operand that is not the UInt scalar is converted with
+ * convert_to_uint(); depending on the result the function computes the
+ * power itself via uint_ctype_power(), returns NotImplemented, or forwards
+ * the call to the generic scalar `nb_power`.  The negative-exponent check is
+ * compiled out for this unsigned type.
+ *
+ * Returns a new UInt scalar on success, NotImplemented when deferring, or
+ * NULL on failure.
+ */
+static PyObject *
+uint_power(PyObject *a, PyObject *b, PyObject *modulo)
+{
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+
+    PyObject *ret;
+    npy_uint arg1, arg2, other_val;
+
+    /* `is_forward` is true when the first operand is the UInt scalar. */
+    int is_forward;
+    if (Py_TYPE(a) == &PyUIntArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyUIntArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, UInt);
+        assert(is_forward || PyArray_IsScalar(b, UInt));
+    }
+    /*
+     * Extract the other value (if it is compatible). See the generic
+     * (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    /* Use the enum type, matching the non-power binary operations. */
+    conversion_result res = convert_to_uint(
+            other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        /* The macro may return from this function (see its definition). */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, uint_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+            /*
+             * Neither macro is defined in this expansion (only IS_uint is),
+             * so control intentionally falls through to PROMOTION_REQUIRED.
+             */
+        case PROMOTION_REQUIRED:
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:
+            /* Fill `other_val` from the Python object via the dtype setitem. */
+            if (UINT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+    /* Floating point status clearing is compiled out for this type. */
+#if !1
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    /* Put the operands back into their original (a, b) order. */
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, UInt);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, UInt);
+    }
+
+    /*
+     * Prepare the actual calculation:
+     */
+    npy_uint out;
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     */
+    /* Compiled out for this unsigned type (`1 && !1` is false). */
+#if 1 && !1
+    if (arg2 < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = uint_ctype_power(arg1, arg2, &out);
+
+    /* Floating point status check is compiled out for this type. */
+#if !1
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* Allocate a new UInt scalar and store the computed value in it. */
+    ret = PyArrayScalar_New(UInt);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UInt, out);
+
+    return ret;
+}
+
+
+#undef IS_uint
+
+#line 1421
+#define IS_long
+
+static PyObject *
+long_power(PyObject *a, PyObject *b, PyObject *modulo)
+{   /* Long scalar a ** b; can return NotImplemented; NULL on error */
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_INCREF(Py_NotImplemented);  /* caller gets its own reference */
+        return Py_NotImplemented;
+    }
+
+    PyObject *ret;
+    npy_long arg1, arg2, other_val;  /* base, exponent, converted operand */
+
+    int is_forward;  /* nonzero: `a` is the Long scalar; zero: `b` is */
+    if (Py_TYPE(a) == &PyLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither is the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Long);
+        assert(is_forward || PyArray_IsScalar(b, Long));
+    }
+    /*
+     * Extract the value of the other operand (if it is compatible). See the
+     * generic (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* filled in by convert_to_long() */
+    int res = convert_to_long(other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, long_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value was extracted successfully; we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* if neither macro is defined, falls through to the next case */
+        case PROMOTION_REQUIRED:  /* delegate to the generic scalar power */
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:  /* fill other_val via LONG_setitem() */
+            if (LONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if !1  /* constant false: the float-status clear is compiled out */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    if (is_forward) {  /* a ** other */
+        arg1 = PyArrayScalar_VAL(a, Long);
+        arg2 = other_val;
+    }
+    else {  /* other ** b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Long);
+    }
+
+    /*
+     * Storage for the result, written by long_ctype_power():
+     */
+    npy_long out;
+
+    /*
+     * Do the actual calculation with arg1 (base) and arg2 (exponent)
+     * as a function call.
+     */
+#if 1 && !0  /* constant true: the negative-exponent check is compiled in */
+    if (arg2 < 0) {  /* negative exponent: raise ValueError */
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = long_ctype_power(arg1, arg2, &out);
+
+#if !1  /* constant false: float-status flags are not merged in */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: let the error handling decide */
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    ret = PyArrayScalar_New(Long);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Long, out);  /* store the result in ret */
+
+    return ret;
+}
+
+
+#undef IS_long
+
+#line 1421
+#define IS_ulong
+
+static PyObject *
+ulong_power(PyObject *a, PyObject *b, PyObject *modulo)
+{   /* ULong scalar a ** b; can return NotImplemented; NULL on error */
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_INCREF(Py_NotImplemented);  /* caller gets its own reference */
+        return Py_NotImplemented;
+    }
+
+    PyObject *ret;
+    npy_ulong arg1, arg2, other_val;  /* base, exponent, converted operand */
+
+    int is_forward;  /* nonzero: `a` is the ULong scalar; zero: `b` is */
+    if (Py_TYPE(a) == &PyULongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither is the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULong);
+        assert(is_forward || PyArray_IsScalar(b, ULong));
+    }
+    /*
+     * Extract the value of the other operand (if it is compatible). See the
+     * generic (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* filled in by convert_to_ulong() */
+    int res = convert_to_ulong(other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, ulong_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value was extracted successfully; we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* if neither macro is defined, falls through to the next case */
+        case PROMOTION_REQUIRED:  /* delegate to the generic scalar power */
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:  /* fill other_val via ULONG_setitem() */
+            if (ULONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if !1  /* constant false: the float-status clear is compiled out */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    if (is_forward) {  /* a ** other */
+        arg1 = PyArrayScalar_VAL(a, ULong);
+        arg2 = other_val;
+    }
+    else {  /* other ** b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULong);
+    }
+
+    /*
+     * Storage for the result, written by ulong_ctype_power():
+     */
+    npy_ulong out;
+
+    /*
+     * Do the actual calculation with arg1 (base) and arg2 (exponent)
+     * as a function call.
+     */
+#if 1 && !1  /* constant false: negative-exponent check is compiled out */
+    if (arg2 < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = ulong_ctype_power(arg1, arg2, &out);
+
+#if !1  /* constant false: float-status flags are not merged in */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: let the error handling decide */
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    ret = PyArrayScalar_New(ULong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULong, out);  /* store the result in ret */
+
+    return ret;
+}
+
+
+#undef IS_ulong
+
+#line 1421
+#define IS_longlong
+
+static PyObject *
+longlong_power(PyObject *a, PyObject *b, PyObject *modulo)
+{   /* LongLong scalar a ** b; can return NotImplemented; NULL on error */
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_INCREF(Py_NotImplemented);  /* caller gets its own reference */
+        return Py_NotImplemented;
+    }
+
+    PyObject *ret;
+    npy_longlong arg1, arg2, other_val;  /* base, exponent, converted operand */
+
+    int is_forward;  /* nonzero: `a` is the LongLong scalar; zero: `b` is */
+    if (Py_TYPE(a) == &PyLongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither is the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongLong);
+        assert(is_forward || PyArray_IsScalar(b, LongLong));
+    }
+    /*
+     * Extract the value of the other operand (if it is compatible). See the
+     * generic (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* filled in by convert_to_longlong() */
+    int res = convert_to_longlong(other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, longlong_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value was extracted successfully; we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* if neither macro is defined, falls through to the next case */
+        case PROMOTION_REQUIRED:  /* delegate to the generic scalar power */
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:  /* fill other_val via LONGLONG_setitem() */
+            if (LONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if !1  /* constant false: the float-status clear is compiled out */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    if (is_forward) {  /* a ** other */
+        arg1 = PyArrayScalar_VAL(a, LongLong);
+        arg2 = other_val;
+    }
+    else {  /* other ** b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongLong);
+    }
+
+    /*
+     * Storage for the result, written by longlong_ctype_power():
+     */
+    npy_longlong out;
+
+    /*
+     * Do the actual calculation with arg1 (base) and arg2 (exponent)
+     * as a function call.
+     */
+#if 1 && !0  /* constant true: the negative-exponent check is compiled in */
+    if (arg2 < 0) {  /* negative exponent: raise ValueError */
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = longlong_ctype_power(arg1, arg2, &out);
+
+#if !1  /* constant false: float-status flags are not merged in */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: let the error handling decide */
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    ret = PyArrayScalar_New(LongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongLong, out);  /* store the result in ret */
+
+    return ret;
+}
+
+
+#undef IS_longlong
+
+#line 1421
+#define IS_ulonglong
+
+static PyObject *
+ulonglong_power(PyObject *a, PyObject *b, PyObject *modulo)
+{   /* ULongLong scalar a ** b; can return NotImplemented; NULL on error */
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_INCREF(Py_NotImplemented);  /* caller gets its own reference */
+        return Py_NotImplemented;
+    }
+
+    PyObject *ret;
+    npy_ulonglong arg1, arg2, other_val;  /* base, exponent, other operand */
+
+    int is_forward;  /* nonzero: `a` is the ULongLong scalar; zero: `b` is */
+    if (Py_TYPE(a) == &PyULongLongArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyULongLongArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither is the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, ULongLong);
+        assert(is_forward || PyArray_IsScalar(b, ULongLong));
+    }
+    /*
+     * Extract the value of the other operand (if it is compatible). See the
+     * generic (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* filled in by convert_to_ulonglong() */
+    int res = convert_to_ulonglong(other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, ulonglong_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value was extracted successfully; we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* if neither macro is defined, falls through to the next case */
+        case PROMOTION_REQUIRED:  /* delegate to the generic scalar power */
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:  /* fill other_val via ULONGLONG_setitem() */
+            if (ULONGLONG_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if !1  /* constant false: the float-status clear is compiled out */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    if (is_forward) {  /* a ** other */
+        arg1 = PyArrayScalar_VAL(a, ULongLong);
+        arg2 = other_val;
+    }
+    else {  /* other ** b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, ULongLong);
+    }
+
+    /*
+     * Storage for the result, written by ulonglong_ctype_power():
+     */
+    npy_ulonglong out;
+
+    /*
+     * Do the actual calculation with arg1 (base) and arg2 (exponent)
+     * as a function call.
+     */
+#if 1 && !1  /* constant false: negative-exponent check is compiled out */
+    if (arg2 < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = ulonglong_ctype_power(arg1, arg2, &out);
+
+#if !1  /* constant false: float-status flags are not merged in */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: let the error handling decide */
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    ret = PyArrayScalar_New(ULongLong);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULongLong, out);  /* store the result in ret */
+
+    return ret;
+}
+
+
+#undef IS_ulonglong
+
+#line 1421
+#define IS_half
+
+static PyObject *
+half_power(PyObject *a, PyObject *b, PyObject *modulo)
+{   /* Half scalar a ** b; can return NotImplemented; NULL on error */
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_INCREF(Py_NotImplemented);  /* caller gets its own reference */
+        return Py_NotImplemented;
+    }
+
+    PyObject *ret;
+    npy_half arg1, arg2, other_val;  /* base, exponent, converted operand */
+
+    int is_forward;  /* nonzero: `a` is the Half scalar; zero: `b` is */
+    if (Py_TYPE(a) == &PyHalfArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyHalfArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither is the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Half);
+        assert(is_forward || PyArray_IsScalar(b, Half));
+    }
+    /*
+     * Extract the value of the other operand (if it is compatible). See the
+     * generic (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* filled in by convert_to_half() */
+    int res = convert_to_half(other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, half_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value was extracted successfully; we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* if neither macro is defined, falls through to the next case */
+        case PROMOTION_REQUIRED:  /* delegate to the generic scalar power */
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:  /* fill other_val via HALF_setitem() */
+            if (HALF_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if !0  /* constant true: the float-status clear is compiled in */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    if (is_forward) {  /* a ** other */
+        arg1 = PyArrayScalar_VAL(a, Half);
+        arg2 = other_val;
+    }
+    else {  /* other ** b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Half);
+    }
+
+    /*
+     * Storage for the result, written by half_ctype_power():
+     */
+    npy_half out;
+
+    /*
+     * Do the actual calculation with arg1 (base) and arg2 (exponent)
+     * as a function call.
+     */
+#if 0 && !0  /* constant false: negative-exponent check is compiled out */
+    if (arg2 < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = half_ctype_power(arg1, arg2, &out);
+
+#if !0  /* constant true: float-status flags are merged into retstatus */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: let the error handling decide */
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    ret = PyArrayScalar_New(Half);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Half, out);  /* store the result in ret */
+
+    return ret;
+}
+
+
+#undef IS_half
+
+#line 1421
+#define IS_float
+
+static PyObject *
+float_power(PyObject *a, PyObject *b, PyObject *modulo)
+{   /* Float scalar a ** b; can return NotImplemented; NULL on error */
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_INCREF(Py_NotImplemented);  /* caller gets its own reference */
+        return Py_NotImplemented;
+    }
+
+    PyObject *ret;
+    npy_float arg1, arg2, other_val;  /* base, exponent, converted operand */
+
+    int is_forward;  /* nonzero: `a` is the Float scalar; zero: `b` is */
+    if (Py_TYPE(a) == &PyFloatArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyFloatArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither is the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Float);
+        assert(is_forward || PyArray_IsScalar(b, Float));
+    }
+    /*
+     * Extract the value of the other operand (if it is compatible). See the
+     * generic (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* filled in by convert_to_float() */
+    int res = convert_to_float(other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, float_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value was extracted successfully; we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* if neither macro is defined, falls through to the next case */
+        case PROMOTION_REQUIRED:  /* delegate to the generic scalar power */
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:  /* fill other_val via FLOAT_setitem() */
+            if (FLOAT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if !0  /* constant true: the float-status clear is compiled in */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    if (is_forward) {  /* a ** other */
+        arg1 = PyArrayScalar_VAL(a, Float);
+        arg2 = other_val;
+    }
+    else {  /* other ** b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Float);
+    }
+
+    /*
+     * Storage for the result, written by float_ctype_power():
+     */
+    npy_float out;
+
+    /*
+     * Do the actual calculation with arg1 (base) and arg2 (exponent)
+     * as a function call.
+     */
+#if 0 && !0  /* constant false: negative-exponent check is compiled out */
+    if (arg2 < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = float_ctype_power(arg1, arg2, &out);
+
+#if !0  /* constant true: float-status flags are merged into retstatus */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: let the error handling decide */
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    ret = PyArrayScalar_New(Float);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Float, out);  /* store the result in ret */
+
+    return ret;
+}
+
+
+#undef IS_float
+
+#line 1421
+#define IS_double
+
+static PyObject *
+double_power(PyObject *a, PyObject *b, PyObject *modulo)
+{   /* Double scalar a ** b; can return NotImplemented; NULL on error */
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_INCREF(Py_NotImplemented);  /* caller gets its own reference */
+        return Py_NotImplemented;
+    }
+
+    PyObject *ret;
+    npy_double arg1, arg2, other_val;  /* base, exponent, converted operand */
+
+    int is_forward;  /* nonzero: `a` is the Double scalar; zero: `b` is */
+    if (Py_TYPE(a) == &PyDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither is the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, Double);
+        assert(is_forward || PyArray_IsScalar(b, Double));
+    }
+    /*
+     * Extract the value of the other operand (if it is compatible). See the
+     * generic (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* filled in by convert_to_double() */
+    int res = convert_to_double(other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, double_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value was extracted successfully; we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* if neither macro is defined, falls through to the next case */
+        case PROMOTION_REQUIRED:  /* delegate to the generic scalar power */
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:  /* fill other_val via DOUBLE_setitem() */
+            if (DOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if !0  /* constant true: the float-status clear is compiled in */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    if (is_forward) {  /* a ** other */
+        arg1 = PyArrayScalar_VAL(a, Double);
+        arg2 = other_val;
+    }
+    else {  /* other ** b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, Double);
+    }
+
+    /*
+     * Storage for the result, written by double_ctype_power():
+     */
+    npy_double out;
+
+    /*
+     * Do the actual calculation with arg1 (base) and arg2 (exponent)
+     * as a function call.
+     */
+#if 0 && !0  /* constant false: negative-exponent check is compiled out */
+    if (arg2 < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = double_ctype_power(arg1, arg2, &out);
+
+#if !0  /* constant true: float-status flags are merged into retstatus */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: let the error handling decide */
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);  /* store the result in ret */
+
+    return ret;
+}
+
+
+#undef IS_double
+
+#line 1421
+#define IS_longdouble
+
+static PyObject *
+longdouble_power(PyObject *a, PyObject *b, PyObject *modulo)
+{   /* LongDouble scalar a ** b; can return NotImplemented; NULL on error */
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_INCREF(Py_NotImplemented);  /* caller gets its own reference */
+        return Py_NotImplemented;
+    }
+
+    PyObject *ret;
+    npy_longdouble arg1, arg2, other_val;  /* base, exponent, other operand */
+
+    int is_forward;  /* nonzero: `a` is the LongDouble scalar; zero: `b` is */
+    if (Py_TYPE(a) == &PyLongDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyLongDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither is the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, LongDouble);
+        assert(is_forward || PyArray_IsScalar(b, LongDouble));
+    }
+    /*
+     * Extract the value of the other operand (if it is compatible). See the
+     * generic (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* filled in by convert_to_longdouble() */
+    int res = convert_to_longdouble(other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, longdouble_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value was extracted successfully; we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* IS_longdouble is defined above: no fall-through happens */
+        case PROMOTION_REQUIRED:  /* delegate to the generic scalar power */
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:  /* fill other_val via LONGDOUBLE_setitem() */
+            if (LONGDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if !0  /* constant true: the float-status clear is compiled in */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    if (is_forward) {  /* a ** other */
+        arg1 = PyArrayScalar_VAL(a, LongDouble);
+        arg2 = other_val;
+    }
+    else {  /* other ** b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, LongDouble);
+    }
+
+    /*
+     * Storage for the result, written by longdouble_ctype_power():
+     */
+    npy_longdouble out;
+
+    /*
+     * Do the actual calculation with arg1 (base) and arg2 (exponent)
+     * as a function call.
+     */
+#if 0 && !0  /* constant false: negative-exponent check is compiled out */
+    if (arg2 < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = longdouble_ctype_power(arg1, arg2, &out);
+
+#if !0  /* constant true: float-status flags are merged into retstatus */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: let the error handling decide */
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    ret = PyArrayScalar_New(LongDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongDouble, out);  /* store the result in ret */
+
+    return ret;
+}
+
+
+#undef IS_longdouble
+
+#line 1421
+#define IS_cfloat
+
+static PyObject *
+cfloat_power(PyObject *a, PyObject *b, PyObject *modulo)
+{   /* CFloat scalar a ** b; can return NotImplemented; NULL on error */
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_INCREF(Py_NotImplemented);  /* caller gets its own reference */
+        return Py_NotImplemented;
+    }
+
+    PyObject *ret;
+    npy_cfloat arg1, arg2, other_val;  /* base, exponent, converted operand */
+
+    int is_forward;  /* nonzero: `a` is the CFloat scalar; zero: `b` is */
+    if (Py_TYPE(a) == &PyCFloatArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyCFloatArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither is the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, CFloat);
+        assert(is_forward || PyArray_IsScalar(b, CFloat));
+    }
+    /*
+     * Extract the value of the other operand (if it is compatible). See the
+     * generic (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* filled in by convert_to_cfloat() */
+    int res = convert_to_cfloat(other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, cfloat_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value was extracted successfully; we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* if neither macro is defined, falls through to the next case */
+        case PROMOTION_REQUIRED:  /* delegate to the generic scalar power */
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:  /* fill other_val via CFLOAT_setitem() */
+            if (CFLOAT_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if !0  /* constant true: the float-status clear is compiled in */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    if (is_forward) {  /* a ** other */
+        arg1 = PyArrayScalar_VAL(a, CFloat);
+        arg2 = other_val;
+    }
+    else {  /* other ** b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, CFloat);
+    }
+
+    /*
+     * Storage for the result, written by cfloat_ctype_power():
+     */
+    npy_cfloat out;
+
+    /*
+     * Do the actual calculation with arg1 (base) and arg2 (exponent)
+     * as a function call.
+     */
+#if 0 && !0  /* constant false: negative-exponent check is compiled out */
+    if (arg2 < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = cfloat_ctype_power(arg1, arg2, &out);
+
+#if !0  /* constant true: float-status flags are merged into retstatus */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: let the error handling decide */
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    ret = PyArrayScalar_New(CFloat);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, CFloat, out);  /* store the result in ret */
+
+    return ret;
+}
+
+
+#undef IS_cfloat
+
+#line 1421
+#define IS_cdouble
+
+static PyObject *
+cdouble_power(PyObject *a, PyObject *b, PyObject *modulo)
+{   /* CDouble scalar a ** b; can return NotImplemented; NULL on error */
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_INCREF(Py_NotImplemented);  /* caller gets its own reference */
+        return Py_NotImplemented;
+    }
+
+    PyObject *ret;
+    npy_cdouble arg1, arg2, other_val;  /* base, exponent, converted operand */
+
+    int is_forward;  /* nonzero: `a` is the CDouble scalar; zero: `b` is */
+    if (Py_TYPE(a) == &PyCDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyCDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* neither is the exact type: subclasses are involved */
+        is_forward = PyArray_IsScalar(a, CDouble);
+        assert(is_forward || PyArray_IsScalar(b, CDouble));
+    }
+    /*
+     * Extract the value of the other operand (if it is compatible). See the
+     * generic (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;  /* filled in by convert_to_cdouble() */
+    int res = convert_to_cdouble(other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {  /* NOTE(review): macro presumably returns early */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, cdouble_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value was extracted successfully; we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif  /* if neither macro is defined, falls through to the next case */
+        case PROMOTION_REQUIRED:  /* delegate to the generic scalar power */
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:  /* fill other_val via CDOUBLE_setitem() */
+            if (CDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+#if !0  /* constant true: the float-status clear is compiled in */
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    if (is_forward) {  /* a ** other */
+        arg1 = PyArrayScalar_VAL(a, CDouble);
+        arg2 = other_val;
+    }
+    else {  /* other ** b */
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, CDouble);
+    }
+
+    /*
+     * Storage for the result, written by cdouble_ctype_power():
+     */
+    npy_cdouble out;
+
+    /*
+     * Do the actual calculation with arg1 (base) and arg2 (exponent)
+     * as a function call.
+     */
+#if 0 && !0  /* constant false: negative-exponent check is compiled out */
+    if (arg2 < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = cdouble_ctype_power(arg1, arg2, &out);
+
+#if !0  /* constant true: float-status flags are merged into retstatus */
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {  /* nonzero status: let the error handling decide */
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    ret = PyArrayScalar_New(CDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, CDouble, out);  /* store the result in ret */
+
+    return ret;
+}
+
+
+#undef IS_cdouble
+
+#line 1421
+#define IS_clongdouble
+
+/*
+ * nb_power implementation for clongdouble scalars.
+ *
+ * Returns a new CLongDouble scalar holding the power, NotImplemented when
+ * a modulo is given or the conversion step says to defer / the other
+ * operand is an unknown object, and NULL on error.  Operands that require
+ * promotion are forwarded to the generic scalar type's nb_power.
+ */
+static PyObject *
+clongdouble_power(PyObject *a, PyObject *b, PyObject *modulo)
+{
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_INCREF(Py_NotImplemented);
+        return Py_NotImplemented;
+    }
+
+    PyObject *ret;
+    npy_clongdouble arg1, arg2, other_val;
+
+    /* is_forward: 1 when `a` is the clongdouble operand, 0 when `b` is. */
+    int is_forward;
+    if (Py_TYPE(a) == &PyCLongDoubleArrType_Type) {
+        is_forward = 1;
+    }
+    else if (Py_TYPE(b) == &PyCLongDoubleArrType_Type) {
+        is_forward = 0;
+    }
+    else {
+        /* subclasses are involved */
+        is_forward = PyArray_IsScalar(a, CLongDouble);
+        assert(is_forward || PyArray_IsScalar(b, CLongDouble));
+    }
+    /*
+     * Extract the other value (if it is compatible). See the generic
+     * (non power) version above for detailed notes.
+     */
+    PyObject *other = is_forward ? b : a;
+
+    npy_bool may_need_deferring;
+    int res = convert_to_clongdouble(other, &other_val, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        /*
+         * NOTE(review): this macro presumably returns from the function
+         * when the other operand should handle the operation -- confirm
+         * against its definition.
+         */
+        BINOP_GIVE_UP_IF_NEEDED(a, b, nb_power, clongdouble_power);
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* successfully extracted value we can proceed */
+        case OTHER_IS_UNKNOWN_OBJECT:
+            /*
+             * IS_clongdouble is defined around this function, so the return
+             * below is compiled in and this case does not fall through.
+             */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            return PyGenericArrType_Type.tp_as_number->nb_power(a, b, modulo);
+        case CONVERT_PYSCALAR:
+            /* Fill other_val from the Python object via the setitem helper. */
+            if (CLONGDOUBLE_setitem(other, (char *)&other_val, NULL) < 0) {
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+    /*
+     * `!0` is always true, so the status is always cleared here.  arg1 is
+     * still unassigned at this point; only its address is passed.
+     */
+#if !0
+    npy_clear_floatstatus_barrier((char*)&arg1);
+#endif
+
+    /* Order the operands as base (arg1) and exponent (arg2). */
+    if (is_forward) {
+        arg1 = PyArrayScalar_VAL(a, CLongDouble);
+        arg2 = other_val;
+    }
+    else {
+        arg1 = other_val;
+        arg2 = PyArrayScalar_VAL(b, CLongDouble);
+    }
+
+    /*
+     * Prepare the actual calculation:
+     */
+    npy_clongdouble out;
+
+    /*
+     * here we do the actual calculation with arg1 and arg2
+     * as a function call.
+     */
+    /* `0 && !0` is always false: the check below is never compiled here. */
+#if 0 && !0
+    if (arg2 < 0) {
+        PyErr_SetString(PyExc_ValueError,
+                "Integers to negative integer powers are not allowed.");
+        return NULL;
+    }
+#endif
+    int retstatus = clongdouble_ctype_power(arg1, arg2, &out);
+
+#if !0
+    /* Check status flag.  If it is set, then look up what to do */
+    retstatus |= npy_get_floatstatus_barrier((char*)&out);
+#endif
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar power", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    ret = PyArrayScalar_New(CLongDouble);
+    if (ret == NULL) {
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, CLongDouble, out);
+
+    return ret;
+}
+
+
+#undef IS_clongdouble
+
+#undef _IS_ZERO
+
+
+#line 1544
+
+/*
+ * cfloat floor_divide slot.  Complex numbers do not support remainder, so
+ * the operation is deliberately left undefined: the slot always returns
+ * NotImplemented.  It is defined explicitly to avoid inheriting it; this
+ * is/was especially important for longdoubles, which tend to recurse for
+ * some operations (see gh-18548).
+ */
+static PyObject *
+cfloat_floor_divide(PyObject *NPY_UNUSED(a), PyObject *NPY_UNUSED(b))
+{
+    /* Both operands are ignored. */
+    Py_INCREF(Py_NotImplemented);
+    return Py_NotImplemented;
+}
+
+
+#line 1544
+
+/*
+ * cdouble floor_divide slot.  Complex numbers do not support remainder, so
+ * the operation is deliberately left undefined: the slot always returns
+ * NotImplemented.  It is defined explicitly to avoid inheriting it; this
+ * is/was especially important for longdoubles, which tend to recurse for
+ * some operations (see gh-18548).
+ */
+static PyObject *
+cdouble_floor_divide(PyObject *NPY_UNUSED(a), PyObject *NPY_UNUSED(b))
+{
+    /* Both operands are ignored. */
+    Py_INCREF(Py_NotImplemented);
+    return Py_NotImplemented;
+}
+
+
+#line 1544
+
+/*
+ * clongdouble floor_divide slot.  Complex numbers do not support
+ * remainder, so the operation is deliberately left undefined: the slot
+ * always returns NotImplemented.  It is defined explicitly to avoid
+ * inheriting it; this is/was especially important for longdoubles, which
+ * tend to recurse for some operations (see gh-18548).
+ */
+static PyObject *
+clongdouble_floor_divide(PyObject *NPY_UNUSED(a), PyObject *NPY_UNUSED(b))
+{
+    /* Both operands are ignored. */
+    Py_INCREF(Py_NotImplemented);
+    return Py_NotImplemented;
+}
+
+
+#line 1544
+
+/*
+ * cfloat divmod slot.  Complex numbers do not support remainder, so the
+ * operation is deliberately left undefined: the slot always returns
+ * NotImplemented.  It is defined explicitly to avoid inheriting it; this
+ * is/was especially important for longdoubles, which tend to recurse for
+ * some operations (see gh-18548).
+ */
+static PyObject *
+cfloat_divmod(PyObject *NPY_UNUSED(a), PyObject *NPY_UNUSED(b))
+{
+    /* Both operands are ignored. */
+    Py_INCREF(Py_NotImplemented);
+    return Py_NotImplemented;
+}
+
+
+#line 1544
+
+/*
+ * cdouble divmod slot.  Complex numbers do not support remainder, so the
+ * operation is deliberately left undefined: the slot always returns
+ * NotImplemented.  It is defined explicitly to avoid inheriting it; this
+ * is/was especially important for longdoubles, which tend to recurse for
+ * some operations (see gh-18548).
+ */
+static PyObject *
+cdouble_divmod(PyObject *NPY_UNUSED(a), PyObject *NPY_UNUSED(b))
+{
+    /* Both operands are ignored. */
+    Py_INCREF(Py_NotImplemented);
+    return Py_NotImplemented;
+}
+
+
+#line 1544
+
+/*
+ * clongdouble divmod slot.  Complex numbers do not support remainder, so
+ * the operation is deliberately left undefined: the slot always returns
+ * NotImplemented.  It is defined explicitly to avoid inheriting it; this
+ * is/was especially important for longdoubles, which tend to recurse for
+ * some operations (see gh-18548).
+ */
+static PyObject *
+clongdouble_divmod(PyObject *NPY_UNUSED(a), PyObject *NPY_UNUSED(b))
+{
+    /* Both operands are ignored. */
+    Py_INCREF(Py_NotImplemented);
+    return Py_NotImplemented;
+}
+
+
+#line 1544
+
+/*
+ * cfloat remainder slot.  Complex numbers do not support remainder, so
+ * the operation is deliberately left undefined: the slot always returns
+ * NotImplemented.  It is defined explicitly to avoid inheriting it; this
+ * is/was especially important for longdoubles, which tend to recurse for
+ * some operations (see gh-18548).
+ */
+static PyObject *
+cfloat_remainder(PyObject *NPY_UNUSED(a), PyObject *NPY_UNUSED(b))
+{
+    /* Both operands are ignored. */
+    Py_INCREF(Py_NotImplemented);
+    return Py_NotImplemented;
+}
+
+
+#line 1544
+
+/*
+ * cdouble remainder slot.  Complex numbers do not support remainder, so
+ * the operation is deliberately left undefined: the slot always returns
+ * NotImplemented.  It is defined explicitly to avoid inheriting it; this
+ * is/was especially important for longdoubles, which tend to recurse for
+ * some operations (see gh-18548).
+ */
+static PyObject *
+cdouble_remainder(PyObject *NPY_UNUSED(a), PyObject *NPY_UNUSED(b))
+{
+    /* Both operands are ignored. */
+    Py_INCREF(Py_NotImplemented);
+    return Py_NotImplemented;
+}
+
+
+#line 1544
+
+/*
+ * clongdouble remainder slot.  Complex numbers do not support remainder,
+ * so the operation is deliberately left undefined: the slot always returns
+ * NotImplemented.  It is defined explicitly to avoid inheriting it; this
+ * is/was especially important for longdoubles, which tend to recurse for
+ * some operations (see gh-18548).
+ */
+static PyObject *
+clongdouble_remainder(PyObject *NPY_UNUSED(a), PyObject *NPY_UNUSED(b))
+{
+    /* Both operands are ignored. */
+    Py_INCREF(Py_NotImplemented);
+    return Py_NotImplemented;
+}
+
+
+
+#line 1564
+
+#line 1570
+
+/*
+ * half scalars define no shift or bitwise (and/or/xor) functions: each of
+ * the five names below is simply NULL.
+ * NOTE(review): presumably these names fill number-protocol slots
+ * elsewhere in the file -- confirm at the point of use.
+ */
+#define half_lshift NULL
+
+
+#line 1570
+
+#define half_rshift NULL
+
+
+#line 1570
+
+#define half_and NULL
+
+
+#line 1570
+
+#define half_or NULL
+
+
+#line 1570
+
+#define half_xor NULL
+
+
+
+
+#line 1564
+
+#line 1570
+
+/*
+ * float scalars define no shift or bitwise (and/or/xor) functions: each of
+ * the five names below is simply NULL.
+ * NOTE(review): presumably these names fill number-protocol slots
+ * elsewhere in the file -- confirm at the point of use.
+ */
+#define float_lshift NULL
+
+
+#line 1570
+
+#define float_rshift NULL
+
+
+#line 1570
+
+#define float_and NULL
+
+
+#line 1570
+
+#define float_or NULL
+
+
+#line 1570
+
+#define float_xor NULL
+
+
+
+
+#line 1564
+
+#line 1570
+
+/*
+ * double scalars define no shift or bitwise (and/or/xor) functions: each
+ * of the five names below is simply NULL.
+ * NOTE(review): presumably these names fill number-protocol slots
+ * elsewhere in the file -- confirm at the point of use.
+ */
+#define double_lshift NULL
+
+
+#line 1570
+
+#define double_rshift NULL
+
+
+#line 1570
+
+#define double_and NULL
+
+
+#line 1570
+
+#define double_or NULL
+
+
+#line 1570
+
+#define double_xor NULL
+
+
+
+
+#line 1564
+
+#line 1570
+
+/*
+ * longdouble scalars define no shift or bitwise (and/or/xor) functions:
+ * each of the five names below is simply NULL.
+ * NOTE(review): presumably these names fill number-protocol slots
+ * elsewhere in the file -- confirm at the point of use.
+ */
+#define longdouble_lshift NULL
+
+
+#line 1570
+
+#define longdouble_rshift NULL
+
+
+#line 1570
+
+#define longdouble_and NULL
+
+
+#line 1570
+
+#define longdouble_or NULL
+
+
+#line 1570
+
+#define longdouble_xor NULL
+
+
+
+
+#line 1564
+
+#line 1570
+
+/*
+ * cfloat scalars define no shift or bitwise (and/or/xor) functions: each
+ * of the five names below is simply NULL.
+ * NOTE(review): presumably these names fill number-protocol slots
+ * elsewhere in the file -- confirm at the point of use.
+ */
+#define cfloat_lshift NULL
+
+
+#line 1570
+
+#define cfloat_rshift NULL
+
+
+#line 1570
+
+#define cfloat_and NULL
+
+
+#line 1570
+
+#define cfloat_or NULL
+
+
+#line 1570
+
+#define cfloat_xor NULL
+
+
+
+
+#line 1564
+
+#line 1570
+
+/*
+ * cdouble scalars define no shift or bitwise (and/or/xor) functions: each
+ * of the five names below is simply NULL.
+ * NOTE(review): presumably these names fill number-protocol slots
+ * elsewhere in the file -- confirm at the point of use.
+ */
+#define cdouble_lshift NULL
+
+
+#line 1570
+
+#define cdouble_rshift NULL
+
+
+#line 1570
+
+#define cdouble_and NULL
+
+
+#line 1570
+
+#define cdouble_or NULL
+
+
+#line 1570
+
+#define cdouble_xor NULL
+
+
+
+
+#line 1564
+
+#line 1570
+
+/*
+ * clongdouble scalars define no shift or bitwise (and/or/xor) functions:
+ * each of the five names below is simply NULL.
+ * NOTE(review): presumably these names fill number-protocol slots
+ * elsewhere in the file -- confirm at the point of use.
+ */
+#define clongdouble_lshift NULL
+
+
+#line 1570
+
+#define clongdouble_rshift NULL
+
+
+#line 1570
+
+#define clongdouble_and NULL
+
+
+#line 1570
+
+#define clongdouble_or NULL
+
+
+#line 1570
+
+#define clongdouble_xor NULL
+
+
+
+
+
+#line 1628
+/*
+ * Scalar negative for npy_byte: unpack `a`, apply
+ * byte_ctype_negative, and box the result in a new Byte scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+byte_negative(PyObject *a)
+{
+    npy_byte val;
+    npy_byte out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Byte);
+    int retstatus = byte_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Byte);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Byte, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar negative for npy_ubyte: unpack `a`, apply
+ * ubyte_ctype_negative, and box the result in a new UByte scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+ubyte_negative(PyObject *a)
+{
+    npy_ubyte val;
+    npy_ubyte out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, UByte);
+    int retstatus = ubyte_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(UByte);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, UByte, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar negative for npy_short: unpack `a`, apply
+ * short_ctype_negative, and box the result in a new Short scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+short_negative(PyObject *a)
+{
+    npy_short val;
+    npy_short out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Short);
+    int retstatus = short_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Short);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Short, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar negative for npy_ushort: unpack `a`, apply
+ * ushort_ctype_negative, and box the result in a new UShort scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+ushort_negative(PyObject *a)
+{
+    npy_ushort val;
+    npy_ushort out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, UShort);
+    int retstatus = ushort_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(UShort);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, UShort, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar negative for npy_int: unpack `a`, apply
+ * int_ctype_negative, and box the result in a new Int scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+int_negative(PyObject *a)
+{
+    npy_int val;
+    npy_int out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Int);
+    int retstatus = int_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Int);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Int, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar negative for npy_uint: unpack `a`, apply
+ * uint_ctype_negative, and box the result in a new UInt scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+uint_negative(PyObject *a)
+{
+    npy_uint val;
+    npy_uint out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, UInt);
+    int retstatus = uint_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(UInt);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, UInt, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar negative for npy_long: unpack `a`, apply
+ * long_ctype_negative, and box the result in a new Long scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+long_negative(PyObject *a)
+{
+    npy_long val;
+    npy_long out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Long);
+    int retstatus = long_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Long);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Long, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar negative for npy_ulong: unpack `a`, apply
+ * ulong_ctype_negative, and box the result in a new ULong scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+ulong_negative(PyObject *a)
+{
+    npy_ulong val;
+    npy_ulong out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, ULong);
+    int retstatus = ulong_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(ULong);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, ULong, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar negative for npy_longlong: unpack `a`, apply
+ * longlong_ctype_negative, and box the result in a new LongLong scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+longlong_negative(PyObject *a)
+{
+    npy_longlong val;
+    npy_longlong out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, LongLong);
+    int retstatus = longlong_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(LongLong);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, LongLong, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar negative for npy_ulonglong: unpack `a`, apply
+ * ulonglong_ctype_negative, and box the result in a new ULongLong scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+ulonglong_negative(PyObject *a)
+{
+    npy_ulonglong val;
+    npy_ulonglong out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, ULongLong);
+    int retstatus = ulonglong_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(ULongLong);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, ULongLong, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar negative for npy_half: unpack `a`, apply
+ * half_ctype_negative, and box the result in a new Half scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+half_negative(PyObject *a)
+{
+    npy_half val;
+    npy_half out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Half);
+    int retstatus = half_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Half);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Half, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar negative for npy_float: unpack `a`, apply
+ * float_ctype_negative, and box the result in a new Float scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+float_negative(PyObject *a)
+{
+    npy_float val;
+    npy_float out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Float);
+    int retstatus = float_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Float);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Float, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar negative for npy_double: unpack `a`, apply
+ * double_ctype_negative, and box the result in a new Double scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+double_negative(PyObject *a)
+{
+    npy_double val;
+    npy_double out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Double);
+    int retstatus = double_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar negative for npy_longdouble: unpack `a`, apply
+ * longdouble_ctype_negative, and box the result in a new LongDouble scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+longdouble_negative(PyObject *a)
+{
+    npy_longdouble val;
+    npy_longdouble out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, LongDouble);
+    int retstatus = longdouble_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(LongDouble);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, LongDouble, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar negative for npy_cfloat: unpack `a`, apply
+ * cfloat_ctype_negative, and box the result in a new CFloat scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+cfloat_negative(PyObject *a)
+{
+    npy_cfloat val;
+    npy_cfloat out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, CFloat);
+    int retstatus = cfloat_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(CFloat);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, CFloat, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar negative for npy_cdouble: unpack `a`, apply
+ * cdouble_ctype_negative, and box the result in a new CDouble scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+cdouble_negative(PyObject *a)
+{
+    npy_cdouble val;
+    npy_cdouble out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, CDouble);
+    int retstatus = cdouble_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(CDouble);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, CDouble, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar negative for npy_clongdouble: unpack `a`, apply
+ * clongdouble_ctype_negative, and box the result in a new CLongDouble
+ * scalar.  Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+clongdouble_negative(PyObject *a)
+{
+    npy_clongdouble val;
+    npy_clongdouble out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, CLongDouble);
+    int retstatus = clongdouble_ctype_negative(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar negative", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(CLongDouble);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, CLongDouble, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_byte: unpack `a`, apply
+ * byte_ctype_positive, and box the result in a new Byte scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+byte_positive(PyObject *a)
+{
+    npy_byte val;
+    npy_byte out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Byte);
+    int retstatus = byte_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Byte);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Byte, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_ubyte: unpack `a`, apply
+ * ubyte_ctype_positive, and box the result in a new UByte scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+ubyte_positive(PyObject *a)
+{
+    npy_ubyte val;
+    npy_ubyte out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, UByte);
+    int retstatus = ubyte_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(UByte);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, UByte, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_short: unpack `a`, apply
+ * short_ctype_positive, and box the result in a new Short scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+short_positive(PyObject *a)
+{
+    npy_short val;
+    npy_short out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Short);
+    int retstatus = short_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Short);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Short, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_ushort: unpack `a`, apply
+ * ushort_ctype_positive, and box the result in a new UShort scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+ushort_positive(PyObject *a)
+{
+    npy_ushort val;
+    npy_ushort out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, UShort);
+    int retstatus = ushort_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(UShort);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, UShort, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_int: unpack `a`, apply
+ * int_ctype_positive, and box the result in a new Int scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+int_positive(PyObject *a)
+{
+    npy_int val;
+    npy_int out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Int);
+    int retstatus = int_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Int);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Int, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_uint: unpack `a`, apply
+ * uint_ctype_positive, and box the result in a new UInt scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+uint_positive(PyObject *a)
+{
+    npy_uint val;
+    npy_uint out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, UInt);
+    int retstatus = uint_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(UInt);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, UInt, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_long: unpack `a`, apply
+ * long_ctype_positive, and box the result in a new Long scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+long_positive(PyObject *a)
+{
+    npy_long val;
+    npy_long out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Long);
+    int retstatus = long_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Long);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Long, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_ulong: unpack `a`, apply
+ * ulong_ctype_positive, and box the result in a new ULong scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+ulong_positive(PyObject *a)
+{
+    npy_ulong val;
+    npy_ulong out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, ULong);
+    int retstatus = ulong_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(ULong);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, ULong, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_longlong: unpack `a`, apply
+ * longlong_ctype_positive, and box the result in a new LongLong scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+longlong_positive(PyObject *a)
+{
+    npy_longlong val;
+    npy_longlong out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, LongLong);
+    int retstatus = longlong_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(LongLong);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, LongLong, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_ulonglong: unpack `a`, apply
+ * ulonglong_ctype_positive, and box the result in a new ULongLong scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+ulonglong_positive(PyObject *a)
+{
+    npy_ulonglong val;
+    npy_ulonglong out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, ULongLong);
+    int retstatus = ulonglong_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(ULongLong);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, ULongLong, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_half: unpack `a`, apply
+ * half_ctype_positive, and box the result in a new Half scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+half_positive(PyObject *a)
+{
+    npy_half val;
+    npy_half out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Half);
+    int retstatus = half_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Half);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Half, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_float: unpack `a`, apply
+ * float_ctype_positive, and box the result in a new Float scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+float_positive(PyObject *a)
+{
+    npy_float val;
+    npy_float out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Float);
+    int retstatus = float_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Float);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Float, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_double: unpack `a`, apply
+ * double_ctype_positive, and box the result in a new Double scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+double_positive(PyObject *a)
+{
+    npy_double val;
+    npy_double out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Double);
+    int retstatus = double_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_longdouble: unpack `a`, apply
+ * longdouble_ctype_positive, and box the result in a new LongDouble scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+longdouble_positive(PyObject *a)
+{
+    npy_longdouble val;
+    npy_longdouble out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, LongDouble);
+    int retstatus = longdouble_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(LongDouble);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, LongDouble, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_cfloat: unpack `a`, apply
+ * cfloat_ctype_positive, and box the result in a new CFloat scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+cfloat_positive(PyObject *a)
+{
+    npy_cfloat val;
+    npy_cfloat out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, CFloat);
+    int retstatus = cfloat_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(CFloat);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, CFloat, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_cdouble: unpack `a`, apply
+ * cdouble_ctype_positive, and box the result in a new CDouble scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+cdouble_positive(PyObject *a)
+{
+    npy_cdouble val;
+    npy_cdouble out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, CDouble);
+    int retstatus = cdouble_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(CDouble);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, CDouble, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar positive for npy_clongdouble: unpack `a`, apply
+ * clongdouble_ctype_positive, and box the result in a new CLongDouble
+ * scalar.  Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+clongdouble_positive(PyObject *a)
+{
+    npy_clongdouble val;
+    npy_clongdouble out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, CLongDouble);
+    int retstatus = clongdouble_ctype_positive(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar positive", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(CLongDouble);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, CLongDouble, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar absolute for npy_byte: unpack `a`, apply
+ * byte_ctype_absolute, and box the result in a new Byte scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+byte_absolute(PyObject *a)
+{
+    npy_byte val;
+    npy_byte out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Byte);
+    int retstatus = byte_ctype_absolute(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Byte);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Byte, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar absolute for npy_ubyte: unpack `a`, apply
+ * ubyte_ctype_absolute, and box the result in a new UByte scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+ubyte_absolute(PyObject *a)
+{
+    npy_ubyte val;
+    npy_ubyte out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, UByte);
+    int retstatus = ubyte_ctype_absolute(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(UByte);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, UByte, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar absolute for npy_short: unpack `a`, apply
+ * short_ctype_absolute, and box the result in a new Short scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+short_absolute(PyObject *a)
+{
+    npy_short val;
+    npy_short out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Short);
+    int retstatus = short_ctype_absolute(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Short);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Short, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar absolute for npy_ushort: unpack `a`, apply
+ * ushort_ctype_absolute, and box the result in a new UShort scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+ushort_absolute(PyObject *a)
+{
+    npy_ushort val;
+    npy_ushort out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, UShort);
+    int retstatus = ushort_ctype_absolute(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(UShort);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, UShort, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar absolute for npy_int: unpack `a`, apply
+ * int_ctype_absolute, and box the result in a new Int scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+int_absolute(PyObject *a)
+{
+    npy_int val;
+    npy_int out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, Int);
+    int retstatus = int_ctype_absolute(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(Int);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, Int, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Scalar absolute for npy_uint: unpack `a`, apply
+ * uint_ctype_absolute, and box the result in a new UInt scalar.
+ * Returns NULL when the status report or the allocation fails.
+ */
+static PyObject *
+uint_absolute(PyObject *a)
+{
+    npy_uint val;
+    npy_uint out;
+    PyObject *ret;
+
+    val = PyArrayScalar_VAL(a, UInt);
+    int retstatus = uint_ctype_absolute(val, &out);
+
+    if (retstatus) {
+        if (PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+            return NULL;
+        }
+    }
+
+    /* TODO: Complex absolute should check floating point flags. */
+
+    ret = PyArrayScalar_New(UInt);
+    if (ret == NULL) {
+        return NULL;  /* allocation failed; nothing to assign into */
+    }
+    PyArrayScalar_ASSIGN(ret, UInt, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "absolute" for a Long scalar: runs long_ctype_absolute on the
+ * stored value and returns the result boxed in a new Long scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+long_absolute(PyObject *a)
+{
+    npy_long val = PyArrayScalar_VAL(a, Long);
+    npy_long out;
+    PyObject *ret;
+
+    int retstatus = long_ctype_absolute(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(Long);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Long, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "absolute" for a ULong scalar: runs ulong_ctype_absolute on the
+ * stored value and returns the result boxed in a new ULong scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+ulong_absolute(PyObject *a)
+{
+    npy_ulong val = PyArrayScalar_VAL(a, ULong);
+    npy_ulong out;
+    PyObject *ret;
+
+    int retstatus = ulong_ctype_absolute(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(ULong);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULong, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "absolute" for a LongLong scalar: runs longlong_ctype_absolute on
+ * the stored value and returns the result boxed in a new LongLong scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+longlong_absolute(PyObject *a)
+{
+    npy_longlong val = PyArrayScalar_VAL(a, LongLong);
+    npy_longlong out;
+    PyObject *ret;
+
+    int retstatus = longlong_ctype_absolute(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(LongLong);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongLong, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "absolute" for a ULongLong scalar: runs ulonglong_ctype_absolute on
+ * the stored value and returns the result boxed in a new ULongLong scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+ulonglong_absolute(PyObject *a)
+{
+    npy_ulonglong val = PyArrayScalar_VAL(a, ULongLong);
+    npy_ulonglong out;
+    PyObject *ret;
+
+    int retstatus = ulonglong_ctype_absolute(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(ULongLong);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULongLong, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "absolute" for a Half scalar: runs half_ctype_absolute on the
+ * stored value and returns the result boxed in a new Half scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+half_absolute(PyObject *a)
+{
+    npy_half val = PyArrayScalar_VAL(a, Half);
+    npy_half out;
+    PyObject *ret;
+
+    int retstatus = half_ctype_absolute(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(Half);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Half, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "absolute" for a Float scalar: runs float_ctype_absolute on the
+ * stored value and returns the result boxed in a new Float scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+float_absolute(PyObject *a)
+{
+    npy_float val = PyArrayScalar_VAL(a, Float);
+    npy_float out;
+    PyObject *ret;
+
+    int retstatus = float_ctype_absolute(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(Float);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Float, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "absolute" for a Double scalar: runs double_ctype_absolute on the
+ * stored value and returns the result boxed in a new Double scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+double_absolute(PyObject *a)
+{
+    npy_double val = PyArrayScalar_VAL(a, Double);
+    npy_double out;
+    PyObject *ret;
+
+    int retstatus = double_ctype_absolute(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "absolute" for a LongDouble scalar: runs longdouble_ctype_absolute
+ * on the stored value and returns the result boxed in a new LongDouble
+ * scalar.  Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+longdouble_absolute(PyObject *a)
+{
+    npy_longdouble val = PyArrayScalar_VAL(a, LongDouble);
+    npy_longdouble out;
+    PyObject *ret;
+
+    int retstatus = longdouble_ctype_absolute(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(LongDouble);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongDouble, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "absolute" for a CFloat scalar: runs cfloat_ctype_absolute on the
+ * stored complex value and returns the npy_float result boxed in a new
+ * Float scalar.  Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+cfloat_absolute(PyObject *a)
+{
+    npy_cfloat val = PyArrayScalar_VAL(a, CFloat);
+    npy_float out;
+    PyObject *ret;
+
+    int retstatus = cfloat_ctype_absolute(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+        return NULL;
+    }
+
+    /*
+     * TODO: Complex absolute should check floating point flags.
+     */
+
+    ret = PyArrayScalar_New(Float);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Float, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "absolute" for a CDouble scalar: runs cdouble_ctype_absolute on the
+ * stored complex value and returns the npy_double result boxed in a new
+ * Double scalar.  Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+cdouble_absolute(PyObject *a)
+{
+    npy_cdouble val = PyArrayScalar_VAL(a, CDouble);
+    npy_double out;
+    PyObject *ret;
+
+    int retstatus = cdouble_ctype_absolute(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+        return NULL;
+    }
+
+    /*
+     * TODO: Complex absolute should check floating point flags.
+     */
+
+    ret = PyArrayScalar_New(Double);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Double, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "absolute" for a CLongDouble scalar: runs
+ * clongdouble_ctype_absolute on the stored complex value and returns the
+ * npy_longdouble result boxed in a new LongDouble scalar.  Returns NULL
+ * when error reporting or the allocation fails.
+ */
+static PyObject *
+clongdouble_absolute(PyObject *a)
+{
+    npy_clongdouble val = PyArrayScalar_VAL(a, CLongDouble);
+    npy_longdouble out;
+    PyObject *ret;
+
+    int retstatus = clongdouble_ctype_absolute(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar absolute", retstatus) < 0) {
+        return NULL;
+    }
+
+    /*
+     * TODO: Complex absolute should check floating point flags.
+     */
+
+    ret = PyArrayScalar_New(LongDouble);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongDouble, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "invert" for a Byte scalar: runs byte_ctype_invert on the stored
+ * value and returns the result boxed in a new Byte scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+byte_invert(PyObject *a)
+{
+    npy_byte val = PyArrayScalar_VAL(a, Byte);
+    npy_byte out;
+    PyObject *ret;
+
+    int retstatus = byte_ctype_invert(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar invert", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(Byte);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Byte, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "invert" for a UByte scalar: runs ubyte_ctype_invert on the stored
+ * value and returns the result boxed in a new UByte scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+ubyte_invert(PyObject *a)
+{
+    npy_ubyte val = PyArrayScalar_VAL(a, UByte);
+    npy_ubyte out;
+    PyObject *ret;
+
+    int retstatus = ubyte_ctype_invert(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar invert", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(UByte);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UByte, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "invert" for a Short scalar: runs short_ctype_invert on the stored
+ * value and returns the result boxed in a new Short scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+short_invert(PyObject *a)
+{
+    npy_short val = PyArrayScalar_VAL(a, Short);
+    npy_short out;
+    PyObject *ret;
+
+    int retstatus = short_ctype_invert(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar invert", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(Short);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Short, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "invert" for a UShort scalar: runs ushort_ctype_invert on the
+ * stored value and returns the result boxed in a new UShort scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+ushort_invert(PyObject *a)
+{
+    npy_ushort val = PyArrayScalar_VAL(a, UShort);
+    npy_ushort out;
+    PyObject *ret;
+
+    int retstatus = ushort_ctype_invert(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar invert", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(UShort);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UShort, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "invert" for an Int scalar: runs int_ctype_invert on the stored
+ * value and returns the result boxed in a new Int scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+int_invert(PyObject *a)
+{
+    npy_int val = PyArrayScalar_VAL(a, Int);
+    npy_int out;
+    PyObject *ret;
+
+    int retstatus = int_ctype_invert(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar invert", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(Int);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Int, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "invert" for a UInt scalar: runs uint_ctype_invert on the stored
+ * value and returns the result boxed in a new UInt scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+uint_invert(PyObject *a)
+{
+    npy_uint val = PyArrayScalar_VAL(a, UInt);
+    npy_uint out;
+    PyObject *ret;
+
+    int retstatus = uint_ctype_invert(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar invert", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(UInt);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, UInt, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "invert" for a Long scalar: runs long_ctype_invert on the stored
+ * value and returns the result boxed in a new Long scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+long_invert(PyObject *a)
+{
+    npy_long val = PyArrayScalar_VAL(a, Long);
+    npy_long out;
+    PyObject *ret;
+
+    int retstatus = long_ctype_invert(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar invert", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(Long);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, Long, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "invert" for a ULong scalar: runs ulong_ctype_invert on the stored
+ * value and returns the result boxed in a new ULong scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+ulong_invert(PyObject *a)
+{
+    npy_ulong val = PyArrayScalar_VAL(a, ULong);
+    npy_ulong out;
+    PyObject *ret;
+
+    int retstatus = ulong_ctype_invert(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar invert", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(ULong);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULong, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "invert" for a LongLong scalar: runs longlong_ctype_invert on the
+ * stored value and returns the result boxed in a new LongLong scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+longlong_invert(PyObject *a)
+{
+    npy_longlong val = PyArrayScalar_VAL(a, LongLong);
+    npy_longlong out;
+    PyObject *ret;
+
+    int retstatus = longlong_ctype_invert(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar invert", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(LongLong);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, LongLong, out);
+
+    return ret;
+}
+
+#line 1628
+/*
+ * Unary "invert" for a ULongLong scalar: runs ulonglong_ctype_invert on the
+ * stored value and returns the result boxed in a new ULongLong scalar.
+ * Returns NULL when error reporting or the allocation fails.
+ */
+static PyObject *
+ulonglong_invert(PyObject *a)
+{
+    npy_ulonglong val = PyArrayScalar_VAL(a, ULongLong);
+    npy_ulonglong out;
+    PyObject *ret;
+
+    int retstatus = ulonglong_ctype_invert(val, &out);
+    if (retstatus
+            && PyUFunc_GiveFloatingpointErrors("scalar invert", retstatus) < 0) {
+        return NULL;
+    }
+
+    ret = PyArrayScalar_New(ULongLong);
+    if (ret == NULL) {
+        /* Allocation failed: never store through a NULL scalar. */
+        return NULL;
+    }
+    PyArrayScalar_ASSIGN(ret, ULongLong, out);
+
+    return ret;
+}
+
+
+#line 1660
+
+#define half_invert NULL  /* no invert function is generated for Half; the name expands to NULL (presumably read by a slot table -- confirm) */
+
+
+#line 1660
+
+#define float_invert NULL  /* no invert function is generated for Float; the name expands to NULL (presumably read by a slot table -- confirm) */
+
+
+#line 1660
+
+#define double_invert NULL  /* no invert function is generated for Double; the name expands to NULL (presumably read by a slot table -- confirm) */
+
+
+#line 1660
+
+#define longdouble_invert NULL  /* no invert function is generated for LongDouble; the name expands to NULL (presumably read by a slot table -- confirm) */
+
+
+#line 1660
+
+#define cfloat_invert NULL  /* no invert function is generated for CFloat; the name expands to NULL (presumably read by a slot table -- confirm) */
+
+
+#line 1660
+
+#define cdouble_invert NULL  /* no invert function is generated for CDouble; the name expands to NULL (presumably read by a slot table -- confirm) */
+
+
+#line 1660
+
+#define clongdouble_invert NULL  /* no invert function is generated for CLongDouble; the name expands to NULL (presumably read by a slot table -- confirm) */
+
+
+
+#define _IS_NONZERO(x) (x != 0)  /* truth test used by the *_bool functions below; x is pasted unparenthesized, so pass only simple operands */
+#line 1683
+/* Truth value of a Byte scalar: 1 when the stored value is nonzero, else 0. */
+static int
+byte_bool(PyObject *a)
+{
+    const npy_byte value = PyArrayScalar_VAL(a, Byte);
+
+    return value != 0;
+}
+
+#line 1683
+/* Truth value of a UByte scalar: 1 when the stored value is nonzero, else 0. */
+static int
+ubyte_bool(PyObject *a)
+{
+    const npy_ubyte value = PyArrayScalar_VAL(a, UByte);
+
+    return value != 0;
+}
+
+#line 1683
+/* Truth value of a Short scalar: 1 when the stored value is nonzero, else 0. */
+static int
+short_bool(PyObject *a)
+{
+    const npy_short value = PyArrayScalar_VAL(a, Short);
+
+    return value != 0;
+}
+
+#line 1683
+/* Truth value of a UShort scalar: 1 when the stored value is nonzero, else 0. */
+static int
+ushort_bool(PyObject *a)
+{
+    const npy_ushort value = PyArrayScalar_VAL(a, UShort);
+
+    return value != 0;
+}
+
+#line 1683
+/* Truth value of an Int scalar: 1 when the stored value is nonzero, else 0. */
+static int
+int_bool(PyObject *a)
+{
+    const npy_int value = PyArrayScalar_VAL(a, Int);
+
+    return value != 0;
+}
+
+#line 1683
+/* Truth value of a UInt scalar: 1 when the stored value is nonzero, else 0. */
+static int
+uint_bool(PyObject *a)
+{
+    const npy_uint value = PyArrayScalar_VAL(a, UInt);
+
+    return value != 0;
+}
+
+#line 1683
+/* Truth value of a Long scalar: 1 when the stored value is nonzero, else 0. */
+static int
+long_bool(PyObject *a)
+{
+    const npy_long value = PyArrayScalar_VAL(a, Long);
+
+    return value != 0;
+}
+
+#line 1683
+/* Truth value of a ULong scalar: 1 when the stored value is nonzero, else 0. */
+static int
+ulong_bool(PyObject *a)
+{
+    const npy_ulong value = PyArrayScalar_VAL(a, ULong);
+
+    return value != 0;
+}
+
+#line 1683
+/* Truth value of a LongLong scalar: 1 when the stored value is nonzero, else 0. */
+static int
+longlong_bool(PyObject *a)
+{
+    const npy_longlong value = PyArrayScalar_VAL(a, LongLong);
+
+    return value != 0;
+}
+
+#line 1683
+/* Truth value of a ULongLong scalar: 1 when the stored value is nonzero, else 0. */
+static int
+ulonglong_bool(PyObject *a)
+{
+    const npy_ulonglong value = PyArrayScalar_VAL(a, ULongLong);
+
+    return value != 0;
+}
+
+#line 1683
+/* Truth value of a Half scalar: the negation of npy_half_iszero on the stored value. */
+static int
+half_bool(PyObject *a)
+{
+    const npy_half value = PyArrayScalar_VAL(a, Half);
+
+    return !npy_half_iszero(value);
+}
+
+#line 1683
+/* Truth value of a Float scalar: 1 when the stored value compares != 0, else 0. */
+static int
+float_bool(PyObject *a)
+{
+    const npy_float value = PyArrayScalar_VAL(a, Float);
+
+    return value != 0;
+}
+
+#line 1683
+/* Truth value of a Double scalar: 1 when the stored value compares != 0, else 0. */
+static int
+double_bool(PyObject *a)
+{
+    const npy_double value = PyArrayScalar_VAL(a, Double);
+
+    return value != 0;
+}
+
+#line 1683
+/* Truth value of a LongDouble scalar: 1 when the stored value compares != 0, else 0. */
+static int
+longdouble_bool(PyObject *a)
+{
+    const npy_longdouble value = PyArrayScalar_VAL(a, LongDouble);
+
+    return value != 0;
+}
+
+#line 1683
+/* Truth value of a CFloat scalar: 1 when either .real or .imag compares != 0. */
+static int
+cfloat_bool(PyObject *a)
+{
+    const npy_cfloat value = PyArrayScalar_VAL(a, CFloat);
+
+    if (value.real != 0) {
+        return 1;
+    }
+    return value.imag != 0;
+}
+
+#line 1683
+/* Truth value of a CDouble scalar: 1 when either .real or .imag compares != 0. */
+static int
+cdouble_bool(PyObject *a)
+{
+    const npy_cdouble value = PyArrayScalar_VAL(a, CDouble);
+
+    if (value.real != 0) {
+        return 1;
+    }
+    return value.imag != 0;
+}
+
+#line 1683
+/* Truth value of a CLongDouble scalar: 1 when either .real or .imag compares != 0. */
+static int
+clongdouble_bool(PyObject *a)
+{
+    const npy_clongdouble value = PyArrayScalar_VAL(a, CLongDouble);
+
+    if (value.real != 0) {
+        return 1;
+    }
+    return value.imag != 0;
+}
+
+#undef _IS_NONZERO
+
+
+/*
+ * Issue numpy.exceptions.ComplexWarning for a complex -> real conversion.
+ * The class is looked up through npy_cache_import into a function-local
+ * static.  Returns -1 when the class is unavailable, otherwise whatever
+ * PyErr_WarnEx returns.
+ */
+static int
+emit_complexwarning(void)
+{
+    static PyObject *warning_cls = NULL;
+
+    npy_cache_import("numpy.exceptions", "ComplexWarning", &warning_cls);
+    if (warning_cls != NULL) {
+        return PyErr_WarnEx(warning_cls,
+                "Casting complex values to real discards the imaginary part", 1);
+    }
+    return -1;
+}
+
+#line 1737
+/* Convert a Byte scalar to a Python int; NULL from PyLong_FromLong is passed through. */
+static PyObject *
+byte_int(PyObject *obj)
+{
+    const signed long value = PyArrayScalar_VAL(obj, Byte);
+
+    return PyLong_FromLong(value);
+}
+
+#line 1737
+/* Convert a UByte scalar to a Python int; NULL from PyLong_FromUnsignedLong is passed through. */
+static PyObject *
+ubyte_int(PyObject *obj)
+{
+    const unsigned long value = PyArrayScalar_VAL(obj, UByte);
+
+    return PyLong_FromUnsignedLong(value);
+}
+
+#line 1737
+/* Convert a Short scalar to a Python int; NULL from PyLong_FromLong is passed through. */
+static PyObject *
+short_int(PyObject *obj)
+{
+    const signed long value = PyArrayScalar_VAL(obj, Short);
+
+    return PyLong_FromLong(value);
+}
+
+#line 1737
+/* Convert a UShort scalar to a Python int; NULL from PyLong_FromUnsignedLong is passed through. */
+static PyObject *
+ushort_int(PyObject *obj)
+{
+    const unsigned long value = PyArrayScalar_VAL(obj, UShort);
+
+    return PyLong_FromUnsignedLong(value);
+}
+
+#line 1737
+/* Convert an Int scalar to a Python int; NULL from PyLong_FromLong is passed through. */
+static PyObject *
+int_int(PyObject *obj)
+{
+    const signed long value = PyArrayScalar_VAL(obj, Int);
+
+    return PyLong_FromLong(value);
+}
+
+#line 1737
+/* Convert a UInt scalar to a Python int; NULL from PyLong_FromUnsignedLong is passed through. */
+static PyObject *
+uint_int(PyObject *obj)
+{
+    const unsigned long value = PyArrayScalar_VAL(obj, UInt);
+
+    return PyLong_FromUnsignedLong(value);
+}
+
+#line 1737
+/* Convert a Long scalar to a Python int; NULL from PyLong_FromLong is passed through. */
+static PyObject *
+long_int(PyObject *obj)
+{
+    const signed long value = PyArrayScalar_VAL(obj, Long);
+
+    return PyLong_FromLong(value);
+}
+
+#line 1737
+/* Convert a ULong scalar to a Python int; NULL from PyLong_FromUnsignedLong is passed through. */
+static PyObject *
+ulong_int(PyObject *obj)
+{
+    const unsigned long value = PyArrayScalar_VAL(obj, ULong);
+
+    return PyLong_FromUnsignedLong(value);
+}
+
+#line 1737
+/* Convert a LongLong scalar to a Python int; NULL from PyLong_FromLongLong is passed through. */
+static PyObject *
+longlong_int(PyObject *obj)
+{
+    const signed PY_LONG_LONG value = PyArrayScalar_VAL(obj, LongLong);
+
+    return PyLong_FromLongLong(value);
+}
+
+#line 1737
+/* Convert a ULongLong scalar to a Python int; NULL from PyLong_FromUnsignedLongLong is passed through. */
+static PyObject *
+ulonglong_int(PyObject *obj)
+{
+    const unsigned PY_LONG_LONG value = PyArrayScalar_VAL(obj, ULongLong);
+
+    return PyLong_FromUnsignedLongLong(value);
+}
+
+#line 1737
+/*
+ * Convert a Half scalar to a Python int: widen with npy_half_to_double,
+ * then hand the double to PyLong_FromDouble (its NULL is passed through).
+ */
+static PyObject *
+half_int(PyObject *obj)
+{
+    const double value = npy_half_to_double(PyArrayScalar_VAL(obj, Half));
+
+    return PyLong_FromDouble(value);
+}
+
+#line 1737
+/* Convert a Float scalar to a Python int via PyLong_FromDouble; its NULL is passed through. */
+static PyObject *
+float_int(PyObject *obj)
+{
+    const double value = PyArrayScalar_VAL(obj, Float);
+
+    return PyLong_FromDouble(value);
+}
+
+#line 1737
+/* Convert a Double scalar to a Python int via PyLong_FromDouble; its NULL is passed through. */
+static PyObject *
+double_int(PyObject *obj)
+{
+    const double value = PyArrayScalar_VAL(obj, Double);
+
+    return PyLong_FromDouble(value);
+}
+
+#line 1737
+/* Convert a LongDouble scalar to a Python int via npy_longdouble_to_PyLong; its NULL is passed through. */
+static PyObject *
+longdouble_int(PyObject *obj)
+{
+    const npy_longdouble value = PyArrayScalar_VAL(obj, LongDouble);
+
+    return npy_longdouble_to_PyLong(value);
+}
+
+#line 1737
+/*
+ * Convert a CFloat scalar to a Python int using only its real part.
+ * emit_complexwarning() runs first; a negative result aborts with NULL.
+ */
+static PyObject *
+cfloat_int(PyObject *obj)
+{
+    /* The real part is read before the warning, as in the generated original. */
+    const double real_part = PyArrayScalar_VAL(obj, CFloat).real;
+
+    if (emit_complexwarning() < 0) {
+        return NULL;
+    }
+    return PyLong_FromDouble(real_part);
+}
+
+#line 1737
+/*
+ * Convert a CDouble scalar to a Python int using only its real part.
+ * emit_complexwarning() runs first; a negative result aborts with NULL.
+ */
+static PyObject *
+cdouble_int(PyObject *obj)
+{
+    /* The real part is read before the warning, as in the generated original. */
+    const double real_part = PyArrayScalar_VAL(obj, CDouble).real;
+
+    if (emit_complexwarning() < 0) {
+        return NULL;
+    }
+    return PyLong_FromDouble(real_part);
+}
+
+#line 1737
+/*
+ * Convert a CLongDouble scalar to a Python int using only its real part.
+ * emit_complexwarning() runs first; a negative result aborts with NULL.
+ */
+static PyObject *
+clongdouble_int(PyObject *obj)
+{
+    /* The real part is read before the warning, as in the generated original. */
+    const npy_longdouble real_part = PyArrayScalar_VAL(obj, CLongDouble).real;
+
+    if (emit_complexwarning() < 0) {
+        return NULL;
+    }
+    return npy_longdouble_to_PyLong(real_part);
+}
+
+
+#line 1777
+/* Convert a Byte scalar to a Python float via PyFloat_FromDouble. */
+static PyObject *
+byte_float(PyObject *obj)
+{
+    const double value = PyArrayScalar_VAL(obj, Byte);
+
+    return PyFloat_FromDouble(value);
+}
+
+#line 1777
+/* Convert a UByte scalar to a Python float via PyFloat_FromDouble. */
+static PyObject *
+ubyte_float(PyObject *obj)
+{
+    const double value = PyArrayScalar_VAL(obj, UByte);
+
+    return PyFloat_FromDouble(value);
+}
+
+#line 1777
+/* Convert a Short scalar to a Python float via PyFloat_FromDouble. */
+static PyObject *
+short_float(PyObject *obj)
+{
+    const double value = PyArrayScalar_VAL(obj, Short);
+
+    return PyFloat_FromDouble(value);
+}
+
+#line 1777
+/* Convert a UShort scalar to a Python float via PyFloat_FromDouble. */
+static PyObject *
+ushort_float(PyObject *obj)
+{
+    const double value = PyArrayScalar_VAL(obj, UShort);
+
+    return PyFloat_FromDouble(value);
+}
+
+#line 1777
+/* Convert an Int scalar to a Python float via PyFloat_FromDouble. */
+static PyObject *
+int_float(PyObject *obj)
+{
+    const double value = PyArrayScalar_VAL(obj, Int);
+
+    return PyFloat_FromDouble(value);
+}
+
+#line 1777
+/* Convert a UInt scalar to a Python float via PyFloat_FromDouble. */
+static PyObject *
+uint_float(PyObject *obj)
+{
+    const double value = PyArrayScalar_VAL(obj, UInt);
+
+    return PyFloat_FromDouble(value);
+}
+
+#line 1777
+/* Convert a Long scalar to a Python float via PyFloat_FromDouble. */
+static PyObject *
+long_float(PyObject *obj)
+{
+    const double value = PyArrayScalar_VAL(obj, Long);
+
+    return PyFloat_FromDouble(value);
+}
+
+#line 1777
+/* Convert a ULong scalar to a Python float via PyFloat_FromDouble. */
+static PyObject *
+ulong_float(PyObject *obj)
+{
+    const double value = PyArrayScalar_VAL(obj, ULong);
+
+    return PyFloat_FromDouble(value);
+}
+
+#line 1777
+/* Convert a LongLong scalar to a Python float via PyFloat_FromDouble. */
+static PyObject *
+longlong_float(PyObject *obj)
+{
+    const double value = PyArrayScalar_VAL(obj, LongLong);
+
+    return PyFloat_FromDouble(value);
+}
+
+#line 1777
+/* Convert a ULongLong scalar to a Python float via PyFloat_FromDouble. */
+static PyObject *
+ulonglong_float(PyObject *obj)
+{
+    const double value = PyArrayScalar_VAL(obj, ULongLong);
+
+    return PyFloat_FromDouble(value);
+}
+
+#line 1777
+/* Convert a Half scalar to a Python float: widen with npy_half_to_double first. */
+static PyObject *
+half_float(PyObject *obj)
+{
+    const double value = npy_half_to_double(PyArrayScalar_VAL(obj, Half));
+
+    return PyFloat_FromDouble(value);
+}
+
+#line 1777
+/* Convert a Float scalar to a Python float via PyFloat_FromDouble. */
+static PyObject *
+float_float(PyObject *obj)
+{
+    const double value = PyArrayScalar_VAL(obj, Float);
+
+    return PyFloat_FromDouble(value);
+}
+
+#line 1777
+/* Convert a Double scalar to a Python float via PyFloat_FromDouble. */
+static PyObject *
+double_float(PyObject *obj)
+{
+    const double value = PyArrayScalar_VAL(obj, Double);
+
+    return PyFloat_FromDouble(value);
+}
+
+#line 1777
+/* Convert a LongDouble scalar to a Python float; the value is narrowed to double. */
+static PyObject *
+longdouble_float(PyObject *obj)
+{
+    const double value = PyArrayScalar_VAL(obj, LongDouble);
+
+    return PyFloat_FromDouble(value);
+}
+
+#line 1777
+/*
+ * Convert a CFloat scalar to a Python float using only its real part.
+ * emit_complexwarning() runs first; a negative result aborts with NULL.
+ */
+static PyObject *
+cfloat_float(PyObject *obj)
+{
+    if (emit_complexwarning() >= 0) {
+        const double real_part = PyArrayScalar_VAL(obj, CFloat).real;
+        return PyFloat_FromDouble(real_part);
+    }
+    return NULL;
+}
+
+#line 1777
+/*
+ * Convert a CDouble scalar to a Python float using only its real part.
+ * emit_complexwarning() runs first; a negative result aborts with NULL.
+ */
+static PyObject *
+cdouble_float(PyObject *obj)
+{
+    if (emit_complexwarning() >= 0) {
+        const double real_part = PyArrayScalar_VAL(obj, CDouble).real;
+        return PyFloat_FromDouble(real_part);
+    }
+    return NULL;
+}
+
+#line 1777
+/*
+ * Convert a CLongDouble scalar to a Python float using only its real part
+ * (narrowed to double).  emit_complexwarning() runs first; a negative
+ * result aborts with NULL.
+ */
+static PyObject *
+clongdouble_float(PyObject *obj)
+{
+    if (emit_complexwarning() >= 0) {
+        const double real_part = PyArrayScalar_VAL(obj, CLongDouble).real;
+        return PyFloat_FromDouble(real_part);
+    }
+    return NULL;
+}
+
+
+#if __GNUC__ < 10
+    /* At least GCC 9.2 issues spurious warnings for arg2 below. */
+    #pragma GCC diagnostic push  /* matching pop after function and repeat */
+    #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
+#line 1803
+#define def_cmp_le(arg1, arg2) (arg1 <= arg2)  /* plain `<=` on the two operands */
+#define cmplx_cmp_le(arg1, arg2) ((arg1.real == arg2.real) ?        \
+                                      arg1.imag <= arg2.imag :        \
+                                      arg1.real <= arg2.real)  /* compares .real; .imag decides when the real parts are equal */
+#define def_half_cmp_le(arg1, arg2) npy_half_le(arg1, arg2)  /* forwards to npy_half_le */
+
+#line 1803
+#define def_cmp_ge(arg1, arg2) (arg1 >= arg2)  /* plain `>=` on the two operands */
+#define cmplx_cmp_ge(arg1, arg2) ((arg1.real == arg2.real) ?        \
+                                      arg1.imag >= arg2.imag :        \
+                                      arg1.real >= arg2.real)  /* compares .real; .imag decides when the real parts are equal */
+#define def_half_cmp_ge(arg1, arg2) npy_half_ge(arg1, arg2)  /* forwards to npy_half_ge */
+
+#line 1803
+#define def_cmp_lt(arg1, arg2) (arg1 < arg2)  /* plain `<` on the two operands */
+#define cmplx_cmp_lt(arg1, arg2) ((arg1.real == arg2.real) ?        \
+                                      arg1.imag < arg2.imag :        \
+                                      arg1.real < arg2.real)  /* compares .real; .imag decides when the real parts are equal */
+#define def_half_cmp_lt(arg1, arg2) npy_half_lt(arg1, arg2)  /* forwards to npy_half_lt */
+
+#line 1803
+#define def_cmp_gt(arg1, arg2) (arg1 > arg2)  /* plain `>` on the two operands */
+#define cmplx_cmp_gt(arg1, arg2) ((arg1.real == arg2.real) ?        \
+                                      arg1.imag > arg2.imag :        \
+                                      arg1.real > arg2.real)  /* compares .real; .imag decides when the real parts are equal */
+#define def_half_cmp_gt(arg1, arg2) npy_half_gt(arg1, arg2)  /* forwards to npy_half_gt */
+
+#line 1803
+#define def_cmp_eq(arg1, arg2) (arg1 == arg2)  /* plain `==` on the two operands */
+#define cmplx_cmp_eq(arg1, arg2) ((arg1.real == arg2.real) ?        \
+                                      arg1.imag == arg2.imag :        \
+                                      arg1.real == arg2.real)  /* true only when both .real and .imag compare equal; the last arm is reached only when the real parts differ */
+#define def_half_cmp_eq(arg1, arg2) npy_half_eq(arg1, arg2)  /* forwards to npy_half_eq */
+
+#line 1803
+#define def_cmp_ne(arg1, arg2) (arg1 != arg2)
+#define cmplx_cmp_ne(arg1, arg2) ((arg1.real == arg2.real) ?        \
+                                      arg1.imag != arg2.imag :        \
+                                      arg1.real != arg2.real)
+#define def_half_cmp_ne(arg1, arg2) npy_half_ne(arg1, arg2)
+
+
+#line 1825
+#define IS_byte
+
+/*
+ * Rich comparison for the byte scalar: evaluates `self <cmp_op> other`.
+ *
+ * `other` is converted with convert_to_byte(); based on the reported
+ * status this returns NULL (conversion error or BYTE_setitem failure),
+ * returns NotImplemented, hands the comparison to
+ * PyGenericArrType_Type.tp_richcompare, or compares the two C values and
+ * finishes through PyArrayScalar_RETURN_TRUE / PyArrayScalar_RETURN_FALSE.
+ * A `cmp_op` outside the six Py_* comparison codes yields the false result.
+ */
+static PyObject*
+byte_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_byte arg1, arg2;
+    npy_bool defer_possible;
+    int is_true = 0;
+
+    /* Try to obtain the C value of `other` in arg2. */
+    int status = convert_to_byte(other, &arg2, &defer_possible);
+    if (status == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (defer_possible) {
+        /* macro defined elsewhere; presumably returns early when deferring */
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+    }
+
+    if (status == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    else if (status == OTHER_IS_UNKNOWN_OBJECT
+             || status == PROMOTION_REQUIRED) {
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+        if (status == OTHER_IS_UNKNOWN_OBJECT) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+#endif
+        return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+    }
+    else if (status == CONVERT_PYSCALAR) {
+        if (BYTE_setitem(other, (char *)&arg2, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (status != CONVERSION_SUCCESS) {
+        /* error was checked already, impossible to reach */
+        assert(0);
+        return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, Byte);
+
+    /* Evaluate the requested operator on the two C values. */
+    switch (cmp_op) {
+    case Py_LT: is_true = def_cmp_lt(arg1, arg2); break;
+    case Py_LE: is_true = def_cmp_le(arg1, arg2); break;
+    case Py_EQ: is_true = def_cmp_eq(arg1, arg2); break;
+    case Py_NE: is_true = def_cmp_ne(arg1, arg2); break;
+    case Py_GT: is_true = def_cmp_gt(arg1, arg2); break;
+    case Py_GE: is_true = def_cmp_ge(arg1, arg2); break;
+    default: break;
+    }
+
+    if (!is_true) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    PyArrayScalar_RETURN_TRUE;
+}
+
+#undef IS_byte
+
+#line 1825
+#define IS_ubyte
+
+/*
+ * Rich comparison for the ubyte scalar: evaluates `self <cmp_op> other`.
+ *
+ * `other` is converted with convert_to_ubyte(); based on the reported
+ * status this returns NULL (conversion error or UBYTE_setitem failure),
+ * returns NotImplemented, hands the comparison to
+ * PyGenericArrType_Type.tp_richcompare, or compares the two C values and
+ * finishes through PyArrayScalar_RETURN_TRUE / PyArrayScalar_RETURN_FALSE.
+ * A `cmp_op` outside the six Py_* comparison codes yields the false result.
+ */
+static PyObject*
+ubyte_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_ubyte arg1, arg2;
+    npy_bool defer_possible;
+    int is_true = 0;
+
+    /* Try to obtain the C value of `other` in arg2. */
+    int status = convert_to_ubyte(other, &arg2, &defer_possible);
+    if (status == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (defer_possible) {
+        /* macro defined elsewhere; presumably returns early when deferring */
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+    }
+
+    if (status == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    else if (status == OTHER_IS_UNKNOWN_OBJECT
+             || status == PROMOTION_REQUIRED) {
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+        if (status == OTHER_IS_UNKNOWN_OBJECT) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+#endif
+        return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+    }
+    else if (status == CONVERT_PYSCALAR) {
+        if (UBYTE_setitem(other, (char *)&arg2, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (status != CONVERSION_SUCCESS) {
+        /* error was checked already, impossible to reach */
+        assert(0);
+        return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, UByte);
+
+    /* Evaluate the requested operator on the two C values. */
+    switch (cmp_op) {
+    case Py_LT: is_true = def_cmp_lt(arg1, arg2); break;
+    case Py_LE: is_true = def_cmp_le(arg1, arg2); break;
+    case Py_EQ: is_true = def_cmp_eq(arg1, arg2); break;
+    case Py_NE: is_true = def_cmp_ne(arg1, arg2); break;
+    case Py_GT: is_true = def_cmp_gt(arg1, arg2); break;
+    case Py_GE: is_true = def_cmp_ge(arg1, arg2); break;
+    default: break;
+    }
+
+    if (!is_true) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    PyArrayScalar_RETURN_TRUE;
+}
+
+#undef IS_ubyte
+
+#line 1825
+#define IS_short
+
+/*
+ * Rich comparison for the short scalar: evaluates `self <cmp_op> other`.
+ *
+ * `other` is converted with convert_to_short(); based on the reported
+ * status this returns NULL (conversion error or SHORT_setitem failure),
+ * returns NotImplemented, hands the comparison to
+ * PyGenericArrType_Type.tp_richcompare, or compares the two C values and
+ * finishes through PyArrayScalar_RETURN_TRUE / PyArrayScalar_RETURN_FALSE.
+ * A `cmp_op` outside the six Py_* comparison codes yields the false result.
+ */
+static PyObject*
+short_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_short arg1, arg2;
+    npy_bool defer_possible;
+    int is_true = 0;
+
+    /* Try to obtain the C value of `other` in arg2. */
+    int status = convert_to_short(other, &arg2, &defer_possible);
+    if (status == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (defer_possible) {
+        /* macro defined elsewhere; presumably returns early when deferring */
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+    }
+
+    if (status == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    else if (status == OTHER_IS_UNKNOWN_OBJECT
+             || status == PROMOTION_REQUIRED) {
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+        if (status == OTHER_IS_UNKNOWN_OBJECT) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+#endif
+        return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+    }
+    else if (status == CONVERT_PYSCALAR) {
+        if (SHORT_setitem(other, (char *)&arg2, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (status != CONVERSION_SUCCESS) {
+        /* error was checked already, impossible to reach */
+        assert(0);
+        return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, Short);
+
+    /* Evaluate the requested operator on the two C values. */
+    switch (cmp_op) {
+    case Py_LT: is_true = def_cmp_lt(arg1, arg2); break;
+    case Py_LE: is_true = def_cmp_le(arg1, arg2); break;
+    case Py_EQ: is_true = def_cmp_eq(arg1, arg2); break;
+    case Py_NE: is_true = def_cmp_ne(arg1, arg2); break;
+    case Py_GT: is_true = def_cmp_gt(arg1, arg2); break;
+    case Py_GE: is_true = def_cmp_ge(arg1, arg2); break;
+    default: break;
+    }
+
+    if (!is_true) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    PyArrayScalar_RETURN_TRUE;
+}
+
+#undef IS_short
+
+#line 1825
+#define IS_ushort
+
+/*
+ * Rich comparison for the ushort scalar: evaluates `self <cmp_op> other`.
+ *
+ * `other` is converted with convert_to_ushort(); based on the reported
+ * status this returns NULL (conversion error or USHORT_setitem failure),
+ * returns NotImplemented, hands the comparison to
+ * PyGenericArrType_Type.tp_richcompare, or compares the two C values and
+ * finishes through PyArrayScalar_RETURN_TRUE / PyArrayScalar_RETURN_FALSE.
+ * A `cmp_op` outside the six Py_* comparison codes yields the false result.
+ */
+static PyObject*
+ushort_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_ushort arg1, arg2;
+    npy_bool defer_possible;
+    int is_true = 0;
+
+    /* Try to obtain the C value of `other` in arg2. */
+    int status = convert_to_ushort(other, &arg2, &defer_possible);
+    if (status == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (defer_possible) {
+        /* macro defined elsewhere; presumably returns early when deferring */
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+    }
+
+    if (status == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    else if (status == OTHER_IS_UNKNOWN_OBJECT
+             || status == PROMOTION_REQUIRED) {
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+        if (status == OTHER_IS_UNKNOWN_OBJECT) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+#endif
+        return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+    }
+    else if (status == CONVERT_PYSCALAR) {
+        if (USHORT_setitem(other, (char *)&arg2, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (status != CONVERSION_SUCCESS) {
+        /* error was checked already, impossible to reach */
+        assert(0);
+        return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, UShort);
+
+    /* Evaluate the requested operator on the two C values. */
+    switch (cmp_op) {
+    case Py_LT: is_true = def_cmp_lt(arg1, arg2); break;
+    case Py_LE: is_true = def_cmp_le(arg1, arg2); break;
+    case Py_EQ: is_true = def_cmp_eq(arg1, arg2); break;
+    case Py_NE: is_true = def_cmp_ne(arg1, arg2); break;
+    case Py_GT: is_true = def_cmp_gt(arg1, arg2); break;
+    case Py_GE: is_true = def_cmp_ge(arg1, arg2); break;
+    default: break;
+    }
+
+    if (!is_true) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    PyArrayScalar_RETURN_TRUE;
+}
+
+#undef IS_ushort
+
+#line 1825
+#define IS_int
+
+/*
+ * Rich comparison for the int scalar: evaluates `self <cmp_op> other`.
+ *
+ * `other` is converted with convert_to_int(); based on the reported
+ * status this returns NULL (conversion error or INT_setitem failure),
+ * returns NotImplemented, hands the comparison to
+ * PyGenericArrType_Type.tp_richcompare, or compares the two C values and
+ * finishes through PyArrayScalar_RETURN_TRUE / PyArrayScalar_RETURN_FALSE.
+ * A `cmp_op` outside the six Py_* comparison codes yields the false result.
+ */
+static PyObject*
+int_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_int arg1, arg2;
+    npy_bool defer_possible;
+    int is_true = 0;
+
+    /* Try to obtain the C value of `other` in arg2. */
+    int status = convert_to_int(other, &arg2, &defer_possible);
+    if (status == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (defer_possible) {
+        /* macro defined elsewhere; presumably returns early when deferring */
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+    }
+
+    if (status == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    else if (status == OTHER_IS_UNKNOWN_OBJECT
+             || status == PROMOTION_REQUIRED) {
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+        if (status == OTHER_IS_UNKNOWN_OBJECT) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+#endif
+        return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+    }
+    else if (status == CONVERT_PYSCALAR) {
+        if (INT_setitem(other, (char *)&arg2, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (status != CONVERSION_SUCCESS) {
+        /* error was checked already, impossible to reach */
+        assert(0);
+        return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, Int);
+
+    /* Evaluate the requested operator on the two C values. */
+    switch (cmp_op) {
+    case Py_LT: is_true = def_cmp_lt(arg1, arg2); break;
+    case Py_LE: is_true = def_cmp_le(arg1, arg2); break;
+    case Py_EQ: is_true = def_cmp_eq(arg1, arg2); break;
+    case Py_NE: is_true = def_cmp_ne(arg1, arg2); break;
+    case Py_GT: is_true = def_cmp_gt(arg1, arg2); break;
+    case Py_GE: is_true = def_cmp_ge(arg1, arg2); break;
+    default: break;
+    }
+
+    if (!is_true) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    PyArrayScalar_RETURN_TRUE;
+}
+
+#undef IS_int
+
+#line 1825
+#define IS_uint
+
+/*
+ * Rich comparison for the uint scalar: evaluates `self <cmp_op> other`.
+ *
+ * `other` is converted with convert_to_uint(); based on the reported
+ * status this returns NULL (conversion error or UINT_setitem failure),
+ * returns NotImplemented, hands the comparison to
+ * PyGenericArrType_Type.tp_richcompare, or compares the two C values and
+ * finishes through PyArrayScalar_RETURN_TRUE / PyArrayScalar_RETURN_FALSE.
+ * A `cmp_op` outside the six Py_* comparison codes yields the false result.
+ */
+static PyObject*
+uint_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_uint arg1, arg2;
+    npy_bool defer_possible;
+    int is_true = 0;
+
+    /* Try to obtain the C value of `other` in arg2. */
+    int status = convert_to_uint(other, &arg2, &defer_possible);
+    if (status == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (defer_possible) {
+        /* macro defined elsewhere; presumably returns early when deferring */
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+    }
+
+    if (status == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    else if (status == OTHER_IS_UNKNOWN_OBJECT
+             || status == PROMOTION_REQUIRED) {
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+        if (status == OTHER_IS_UNKNOWN_OBJECT) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+#endif
+        return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+    }
+    else if (status == CONVERT_PYSCALAR) {
+        if (UINT_setitem(other, (char *)&arg2, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (status != CONVERSION_SUCCESS) {
+        /* error was checked already, impossible to reach */
+        assert(0);
+        return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, UInt);
+
+    /* Evaluate the requested operator on the two C values. */
+    switch (cmp_op) {
+    case Py_LT: is_true = def_cmp_lt(arg1, arg2); break;
+    case Py_LE: is_true = def_cmp_le(arg1, arg2); break;
+    case Py_EQ: is_true = def_cmp_eq(arg1, arg2); break;
+    case Py_NE: is_true = def_cmp_ne(arg1, arg2); break;
+    case Py_GT: is_true = def_cmp_gt(arg1, arg2); break;
+    case Py_GE: is_true = def_cmp_ge(arg1, arg2); break;
+    default: break;
+    }
+
+    if (!is_true) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    PyArrayScalar_RETURN_TRUE;
+}
+
+#undef IS_uint
+
+#line 1825
+#define IS_long
+
+/*
+ * Rich comparison for the long scalar: evaluates `self <cmp_op> other`.
+ *
+ * `other` is converted with convert_to_long(); based on the reported
+ * status this returns NULL (conversion error or LONG_setitem failure),
+ * returns NotImplemented, hands the comparison to
+ * PyGenericArrType_Type.tp_richcompare, or compares the two C values and
+ * finishes through PyArrayScalar_RETURN_TRUE / PyArrayScalar_RETURN_FALSE.
+ * A `cmp_op` outside the six Py_* comparison codes yields the false result.
+ */
+static PyObject*
+long_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_long arg1, arg2;
+    npy_bool defer_possible;
+    int is_true = 0;
+
+    /* Try to obtain the C value of `other` in arg2. */
+    int status = convert_to_long(other, &arg2, &defer_possible);
+    if (status == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (defer_possible) {
+        /* macro defined elsewhere; presumably returns early when deferring */
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+    }
+
+    if (status == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    else if (status == OTHER_IS_UNKNOWN_OBJECT
+             || status == PROMOTION_REQUIRED) {
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+        if (status == OTHER_IS_UNKNOWN_OBJECT) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+#endif
+        return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+    }
+    else if (status == CONVERT_PYSCALAR) {
+        if (LONG_setitem(other, (char *)&arg2, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (status != CONVERSION_SUCCESS) {
+        /* error was checked already, impossible to reach */
+        assert(0);
+        return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, Long);
+
+    /* Evaluate the requested operator on the two C values. */
+    switch (cmp_op) {
+    case Py_LT: is_true = def_cmp_lt(arg1, arg2); break;
+    case Py_LE: is_true = def_cmp_le(arg1, arg2); break;
+    case Py_EQ: is_true = def_cmp_eq(arg1, arg2); break;
+    case Py_NE: is_true = def_cmp_ne(arg1, arg2); break;
+    case Py_GT: is_true = def_cmp_gt(arg1, arg2); break;
+    case Py_GE: is_true = def_cmp_ge(arg1, arg2); break;
+    default: break;
+    }
+
+    if (!is_true) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    PyArrayScalar_RETURN_TRUE;
+}
+
+#undef IS_long
+
+#line 1825
+#define IS_ulong
+
+/*
+ * Rich comparison for the ulong scalar: evaluates `self <cmp_op> other`.
+ *
+ * `other` is converted with convert_to_ulong(); based on the reported
+ * status this returns NULL (conversion error or ULONG_setitem failure),
+ * returns NotImplemented, hands the comparison to
+ * PyGenericArrType_Type.tp_richcompare, or compares the two C values and
+ * finishes through PyArrayScalar_RETURN_TRUE / PyArrayScalar_RETURN_FALSE.
+ * A `cmp_op` outside the six Py_* comparison codes yields the false result.
+ */
+static PyObject*
+ulong_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_ulong arg1, arg2;
+    npy_bool defer_possible;
+    int is_true = 0;
+
+    /* Try to obtain the C value of `other` in arg2. */
+    int status = convert_to_ulong(other, &arg2, &defer_possible);
+    if (status == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (defer_possible) {
+        /* macro defined elsewhere; presumably returns early when deferring */
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+    }
+
+    if (status == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    else if (status == OTHER_IS_UNKNOWN_OBJECT
+             || status == PROMOTION_REQUIRED) {
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+        if (status == OTHER_IS_UNKNOWN_OBJECT) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+#endif
+        return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+    }
+    else if (status == CONVERT_PYSCALAR) {
+        if (ULONG_setitem(other, (char *)&arg2, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (status != CONVERSION_SUCCESS) {
+        /* error was checked already, impossible to reach */
+        assert(0);
+        return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, ULong);
+
+    /* Evaluate the requested operator on the two C values. */
+    switch (cmp_op) {
+    case Py_LT: is_true = def_cmp_lt(arg1, arg2); break;
+    case Py_LE: is_true = def_cmp_le(arg1, arg2); break;
+    case Py_EQ: is_true = def_cmp_eq(arg1, arg2); break;
+    case Py_NE: is_true = def_cmp_ne(arg1, arg2); break;
+    case Py_GT: is_true = def_cmp_gt(arg1, arg2); break;
+    case Py_GE: is_true = def_cmp_ge(arg1, arg2); break;
+    default: break;
+    }
+
+    if (!is_true) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    PyArrayScalar_RETURN_TRUE;
+}
+
+#undef IS_ulong
+
+#line 1825
+#define IS_longlong
+
+/*
+ * Rich comparison for the longlong scalar: evaluates `self <cmp_op> other`.
+ *
+ * `other` is converted with convert_to_longlong(); based on the reported
+ * status this returns NULL (conversion error or LONGLONG_setitem failure),
+ * returns NotImplemented, hands the comparison to
+ * PyGenericArrType_Type.tp_richcompare, or compares the two C values and
+ * finishes through PyArrayScalar_RETURN_TRUE / PyArrayScalar_RETURN_FALSE.
+ * A `cmp_op` outside the six Py_* comparison codes yields the false result.
+ */
+static PyObject*
+longlong_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_longlong arg1, arg2;
+    npy_bool defer_possible;
+    int is_true = 0;
+
+    /* Try to obtain the C value of `other` in arg2. */
+    int status = convert_to_longlong(other, &arg2, &defer_possible);
+    if (status == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (defer_possible) {
+        /* macro defined elsewhere; presumably returns early when deferring */
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+    }
+
+    if (status == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    else if (status == OTHER_IS_UNKNOWN_OBJECT
+             || status == PROMOTION_REQUIRED) {
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+        if (status == OTHER_IS_UNKNOWN_OBJECT) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+#endif
+        return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+    }
+    else if (status == CONVERT_PYSCALAR) {
+        if (LONGLONG_setitem(other, (char *)&arg2, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (status != CONVERSION_SUCCESS) {
+        /* error was checked already, impossible to reach */
+        assert(0);
+        return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, LongLong);
+
+    /* Evaluate the requested operator on the two C values. */
+    switch (cmp_op) {
+    case Py_LT: is_true = def_cmp_lt(arg1, arg2); break;
+    case Py_LE: is_true = def_cmp_le(arg1, arg2); break;
+    case Py_EQ: is_true = def_cmp_eq(arg1, arg2); break;
+    case Py_NE: is_true = def_cmp_ne(arg1, arg2); break;
+    case Py_GT: is_true = def_cmp_gt(arg1, arg2); break;
+    case Py_GE: is_true = def_cmp_ge(arg1, arg2); break;
+    default: break;
+    }
+
+    if (!is_true) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    PyArrayScalar_RETURN_TRUE;
+}
+
+#undef IS_longlong
+
+#line 1825
+#define IS_ulonglong
+
+/*
+ * Rich comparison for the ulonglong scalar: evaluates `self <cmp_op> other`.
+ *
+ * `other` is converted with convert_to_ulonglong(); based on the reported
+ * status this returns NULL (conversion error or ULONGLONG_setitem failure),
+ * returns NotImplemented, hands the comparison to
+ * PyGenericArrType_Type.tp_richcompare, or compares the two C values and
+ * finishes through PyArrayScalar_RETURN_TRUE / PyArrayScalar_RETURN_FALSE.
+ * A `cmp_op` outside the six Py_* comparison codes yields the false result.
+ */
+static PyObject*
+ulonglong_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_ulonglong arg1, arg2;
+    npy_bool defer_possible;
+    int is_true = 0;
+
+    /* Try to obtain the C value of `other` in arg2. */
+    int status = convert_to_ulonglong(other, &arg2, &defer_possible);
+    if (status == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (defer_possible) {
+        /* macro defined elsewhere; presumably returns early when deferring */
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+    }
+
+    if (status == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    else if (status == OTHER_IS_UNKNOWN_OBJECT
+             || status == PROMOTION_REQUIRED) {
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+        if (status == OTHER_IS_UNKNOWN_OBJECT) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+#endif
+        return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+    }
+    else if (status == CONVERT_PYSCALAR) {
+        if (ULONGLONG_setitem(other, (char *)&arg2, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (status != CONVERSION_SUCCESS) {
+        /* error was checked already, impossible to reach */
+        assert(0);
+        return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, ULongLong);
+
+    /* Evaluate the requested operator on the two C values. */
+    switch (cmp_op) {
+    case Py_LT: is_true = def_cmp_lt(arg1, arg2); break;
+    case Py_LE: is_true = def_cmp_le(arg1, arg2); break;
+    case Py_EQ: is_true = def_cmp_eq(arg1, arg2); break;
+    case Py_NE: is_true = def_cmp_ne(arg1, arg2); break;
+    case Py_GT: is_true = def_cmp_gt(arg1, arg2); break;
+    case Py_GE: is_true = def_cmp_ge(arg1, arg2); break;
+    default: break;
+    }
+
+    if (!is_true) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    PyArrayScalar_RETURN_TRUE;
+}
+
+#undef IS_ulonglong
+
+#line 1825
+#define IS_half
+
+/*
+ * Rich comparison for the half scalar: evaluates `self <cmp_op> other`.
+ *
+ * `other` is converted with convert_to_half(); based on the reported
+ * status this returns NULL (conversion error or HALF_setitem failure),
+ * returns NotImplemented, hands the comparison to
+ * PyGenericArrType_Type.tp_richcompare, or compares the two npy_half values
+ * via the def_half_cmp_* macros (which call npy_half_<op>()) and finishes
+ * through PyArrayScalar_RETURN_TRUE / PyArrayScalar_RETURN_FALSE.
+ * A `cmp_op` outside the six Py_* comparison codes yields the false result.
+ */
+static PyObject*
+half_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_half arg1, arg2;
+    npy_bool defer_possible;
+    int is_true = 0;
+
+    /* Try to obtain the C value of `other` in arg2. */
+    int status = convert_to_half(other, &arg2, &defer_possible);
+    if (status == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (defer_possible) {
+        /* macro defined elsewhere; presumably returns early when deferring */
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+    }
+
+    if (status == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    else if (status == OTHER_IS_UNKNOWN_OBJECT
+             || status == PROMOTION_REQUIRED) {
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+        if (status == OTHER_IS_UNKNOWN_OBJECT) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+#endif
+        return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+    }
+    else if (status == CONVERT_PYSCALAR) {
+        if (HALF_setitem(other, (char *)&arg2, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (status != CONVERSION_SUCCESS) {
+        /* error was checked already, impossible to reach */
+        assert(0);
+        return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, Half);
+
+    /* Evaluate the requested operator on the two half values. */
+    switch (cmp_op) {
+    case Py_LT: is_true = def_half_cmp_lt(arg1, arg2); break;
+    case Py_LE: is_true = def_half_cmp_le(arg1, arg2); break;
+    case Py_EQ: is_true = def_half_cmp_eq(arg1, arg2); break;
+    case Py_NE: is_true = def_half_cmp_ne(arg1, arg2); break;
+    case Py_GT: is_true = def_half_cmp_gt(arg1, arg2); break;
+    case Py_GE: is_true = def_half_cmp_ge(arg1, arg2); break;
+    default: break;
+    }
+
+    if (!is_true) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    PyArrayScalar_RETURN_TRUE;
+}
+
+#undef IS_half
+
+#line 1825
+#define IS_float
+
+/*
+ * Rich comparison for the float scalar: evaluates `self <cmp_op> other`.
+ *
+ * `other` is converted with convert_to_float(); based on the reported
+ * status this returns NULL (conversion error or FLOAT_setitem failure),
+ * returns NotImplemented, hands the comparison to
+ * PyGenericArrType_Type.tp_richcompare, or compares the two C values and
+ * finishes through PyArrayScalar_RETURN_TRUE / PyArrayScalar_RETURN_FALSE.
+ * A `cmp_op` outside the six Py_* comparison codes yields the false result.
+ */
+static PyObject*
+float_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_float arg1, arg2;
+    npy_bool defer_possible;
+    int is_true = 0;
+
+    /* Try to obtain the C value of `other` in arg2. */
+    int status = convert_to_float(other, &arg2, &defer_possible);
+    if (status == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (defer_possible) {
+        /* macro defined elsewhere; presumably returns early when deferring */
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+    }
+
+    if (status == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    else if (status == OTHER_IS_UNKNOWN_OBJECT
+             || status == PROMOTION_REQUIRED) {
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+        if (status == OTHER_IS_UNKNOWN_OBJECT) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+#endif
+        return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+    }
+    else if (status == CONVERT_PYSCALAR) {
+        if (FLOAT_setitem(other, (char *)&arg2, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (status != CONVERSION_SUCCESS) {
+        /* error was checked already, impossible to reach */
+        assert(0);
+        return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, Float);
+
+    /* Evaluate the requested operator on the two C values. */
+    switch (cmp_op) {
+    case Py_LT: is_true = def_cmp_lt(arg1, arg2); break;
+    case Py_LE: is_true = def_cmp_le(arg1, arg2); break;
+    case Py_EQ: is_true = def_cmp_eq(arg1, arg2); break;
+    case Py_NE: is_true = def_cmp_ne(arg1, arg2); break;
+    case Py_GT: is_true = def_cmp_gt(arg1, arg2); break;
+    case Py_GE: is_true = def_cmp_ge(arg1, arg2); break;
+    default: break;
+    }
+
+    if (!is_true) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    PyArrayScalar_RETURN_TRUE;
+}
+
+#undef IS_float
+
+#line 1825
+#define IS_double
+
+/*
+ * Rich comparison for the double scalar: evaluates `self <cmp_op> other`.
+ *
+ * `other` is converted with convert_to_double(); based on the reported
+ * status this returns NULL (conversion error or DOUBLE_setitem failure),
+ * returns NotImplemented, hands the comparison to
+ * PyGenericArrType_Type.tp_richcompare, or compares the two C values and
+ * finishes through PyArrayScalar_RETURN_TRUE / PyArrayScalar_RETURN_FALSE.
+ * A `cmp_op` outside the six Py_* comparison codes yields the false result.
+ */
+static PyObject*
+double_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_double arg1, arg2;
+    npy_bool defer_possible;
+    int is_true = 0;
+
+    /* Try to obtain the C value of `other` in arg2. */
+    int status = convert_to_double(other, &arg2, &defer_possible);
+    if (status == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (defer_possible) {
+        /* macro defined elsewhere; presumably returns early when deferring */
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+    }
+
+    if (status == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    else if (status == OTHER_IS_UNKNOWN_OBJECT
+             || status == PROMOTION_REQUIRED) {
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+        if (status == OTHER_IS_UNKNOWN_OBJECT) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+#endif
+        return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+    }
+    else if (status == CONVERT_PYSCALAR) {
+        if (DOUBLE_setitem(other, (char *)&arg2, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (status != CONVERSION_SUCCESS) {
+        /* error was checked already, impossible to reach */
+        assert(0);
+        return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, Double);
+
+    /* Evaluate the requested operator on the two C values. */
+    switch (cmp_op) {
+    case Py_LT: is_true = def_cmp_lt(arg1, arg2); break;
+    case Py_LE: is_true = def_cmp_le(arg1, arg2); break;
+    case Py_EQ: is_true = def_cmp_eq(arg1, arg2); break;
+    case Py_NE: is_true = def_cmp_ne(arg1, arg2); break;
+    case Py_GT: is_true = def_cmp_gt(arg1, arg2); break;
+    case Py_GE: is_true = def_cmp_ge(arg1, arg2); break;
+    default: break;
+    }
+
+    if (!is_true) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    PyArrayScalar_RETURN_TRUE;
+}
+
+#undef IS_double
+
+#line 1825
+#define IS_longdouble
+
+/*
+ * Rich comparison for the longdouble scalar: evaluates
+ * `self <cmp_op> other`.
+ *
+ * `other` is converted with convert_to_longdouble(); based on the reported
+ * status this returns NULL (conversion error or LONGDOUBLE_setitem
+ * failure), returns NotImplemented, hands the comparison to
+ * PyGenericArrType_Type.tp_richcompare, or compares the two C values and
+ * finishes through PyArrayScalar_RETURN_TRUE / PyArrayScalar_RETURN_FALSE.
+ * IS_longdouble is defined around this function, so the
+ * OTHER_IS_UNKNOWN_OBJECT status answers NotImplemented here.
+ * A `cmp_op` outside the six Py_* comparison codes yields the false result.
+ */
+static PyObject*
+longdouble_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_longdouble arg1, arg2;
+    npy_bool defer_possible;
+    int is_true = 0;
+
+    /* Try to obtain the C value of `other` in arg2. */
+    int status = convert_to_longdouble(other, &arg2, &defer_possible);
+    if (status == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (defer_possible) {
+        /* macro defined elsewhere; presumably returns early when deferring */
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+    }
+
+    if (status == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    else if (status == OTHER_IS_UNKNOWN_OBJECT
+             || status == PROMOTION_REQUIRED) {
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+        if (status == OTHER_IS_UNKNOWN_OBJECT) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+#endif
+        return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+    }
+    else if (status == CONVERT_PYSCALAR) {
+        if (LONGDOUBLE_setitem(other, (char *)&arg2, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (status != CONVERSION_SUCCESS) {
+        /* error was checked already, impossible to reach */
+        assert(0);
+        return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, LongDouble);
+
+    /* Evaluate the requested operator on the two C values. */
+    switch (cmp_op) {
+    case Py_LT: is_true = def_cmp_lt(arg1, arg2); break;
+    case Py_LE: is_true = def_cmp_le(arg1, arg2); break;
+    case Py_EQ: is_true = def_cmp_eq(arg1, arg2); break;
+    case Py_NE: is_true = def_cmp_ne(arg1, arg2); break;
+    case Py_GT: is_true = def_cmp_gt(arg1, arg2); break;
+    case Py_GE: is_true = def_cmp_ge(arg1, arg2); break;
+    default: break;
+    }
+
+    if (!is_true) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    PyArrayScalar_RETURN_TRUE;
+}
+
+#undef IS_longdouble
+
+#line 1825
+#define IS_cfloat
+
+/*
+ * Rich comparison for the cfloat scalar: evaluates `self <cmp_op> other`.
+ *
+ * `other` is converted with convert_to_cfloat(); based on the reported
+ * status this returns NULL (conversion error or CFLOAT_setitem failure),
+ * returns NotImplemented, hands the comparison to
+ * PyGenericArrType_Type.tp_richcompare, or compares the two complex values
+ * with the cmplx_cmp_* macros (imaginary parts decide when the real parts
+ * are equal, otherwise the real parts decide) and finishes through
+ * PyArrayScalar_RETURN_TRUE / PyArrayScalar_RETURN_FALSE.
+ * A `cmp_op` outside the six Py_* comparison codes yields the false result.
+ */
+static PyObject*
+cfloat_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_cfloat arg1, arg2;
+    npy_bool defer_possible;
+    int is_true = 0;
+
+    /* Try to obtain the C value of `other` in arg2. */
+    int status = convert_to_cfloat(other, &arg2, &defer_possible);
+    if (status == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (defer_possible) {
+        /* macro defined elsewhere; presumably returns early when deferring */
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+    }
+
+    if (status == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    else if (status == OTHER_IS_UNKNOWN_OBJECT
+             || status == PROMOTION_REQUIRED) {
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+        if (status == OTHER_IS_UNKNOWN_OBJECT) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+#endif
+        return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+    }
+    else if (status == CONVERT_PYSCALAR) {
+        if (CFLOAT_setitem(other, (char *)&arg2, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (status != CONVERSION_SUCCESS) {
+        /* error was checked already, impossible to reach */
+        assert(0);
+        return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, CFloat);
+
+    /* Evaluate the requested operator on the two complex values. */
+    switch (cmp_op) {
+    case Py_LT: is_true = cmplx_cmp_lt(arg1, arg2); break;
+    case Py_LE: is_true = cmplx_cmp_le(arg1, arg2); break;
+    case Py_EQ: is_true = cmplx_cmp_eq(arg1, arg2); break;
+    case Py_NE: is_true = cmplx_cmp_ne(arg1, arg2); break;
+    case Py_GT: is_true = cmplx_cmp_gt(arg1, arg2); break;
+    case Py_GE: is_true = cmplx_cmp_ge(arg1, arg2); break;
+    default: break;
+    }
+
+    if (!is_true) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    PyArrayScalar_RETURN_TRUE;
+}
+
+#undef IS_cfloat
+
+#line 1825
+#define IS_cdouble
+
+/*
+ * Rich comparison for the cdouble scalar: evaluates `self <cmp_op> other`.
+ *
+ * `other` is converted with convert_to_cdouble(); based on the reported
+ * status this returns NULL (conversion error or CDOUBLE_setitem failure),
+ * returns NotImplemented, hands the comparison to
+ * PyGenericArrType_Type.tp_richcompare, or compares the two complex values
+ * with the cmplx_cmp_* macros (imaginary parts decide when the real parts
+ * are equal, otherwise the real parts decide) and finishes through
+ * PyArrayScalar_RETURN_TRUE / PyArrayScalar_RETURN_FALSE.
+ * A `cmp_op` outside the six Py_* comparison codes yields the false result.
+ */
+static PyObject*
+cdouble_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_cdouble arg1, arg2;
+    npy_bool defer_possible;
+    int is_true = 0;
+
+    /* Try to obtain the C value of `other` in arg2. */
+    int status = convert_to_cdouble(other, &arg2, &defer_possible);
+    if (status == CONVERSION_ERROR) {
+        /* an error occurred (should never happen) */
+        return NULL;
+    }
+    if (defer_possible) {
+        /* macro defined elsewhere; presumably returns early when deferring */
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+    }
+
+    if (status == DEFER_TO_OTHER_KNOWN_SCALAR) {
+        Py_RETURN_NOTIMPLEMENTED;
+    }
+    else if (status == OTHER_IS_UNKNOWN_OBJECT
+             || status == PROMOTION_REQUIRED) {
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+        if (status == OTHER_IS_UNKNOWN_OBJECT) {
+            Py_RETURN_NOTIMPLEMENTED;
+        }
+#endif
+        return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);
+    }
+    else if (status == CONVERT_PYSCALAR) {
+        if (CDOUBLE_setitem(other, (char *)&arg2, NULL) < 0) {
+            return NULL;
+        }
+    }
+    else if (status != CONVERSION_SUCCESS) {
+        /* error was checked already, impossible to reach */
+        assert(0);
+        return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, CDouble);
+
+    /* Evaluate the requested operator on the two complex values. */
+    switch (cmp_op) {
+    case Py_LT: is_true = cmplx_cmp_lt(arg1, arg2); break;
+    case Py_LE: is_true = cmplx_cmp_le(arg1, arg2); break;
+    case Py_EQ: is_true = cmplx_cmp_eq(arg1, arg2); break;
+    case Py_NE: is_true = cmplx_cmp_ne(arg1, arg2); break;
+    case Py_GT: is_true = cmplx_cmp_gt(arg1, arg2); break;
+    case Py_GE: is_true = cmplx_cmp_ge(arg1, arg2); break;
+    default: break;
+    }
+
+    if (!is_true) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    PyArrayScalar_RETURN_TRUE;
+}
+
+#undef IS_cdouble
+
+#line 1825
+#define IS_clongdouble
+
+static PyObject*
+clongdouble_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{
+    npy_clongdouble arg1, arg2;  /* arg1: value of self; arg2: value extracted from other */
+    int out = 0;  /* comparison result; remains 0 if cmp_op matches none of the cases below */
+
+    /*
+     * Rich comparison of a clongdouble scalar (self) with other.  First extract
+     * the other value into arg2 (if it is compatible); res says how to proceed.
+     */
+    npy_bool may_need_deferring;
+    int res = convert_to_clongdouble(other, &arg2, &may_need_deferring);
+    if (res == CONVERSION_ERROR) {
+        return NULL;  /* an error occurred (should never happen) */
+    }
+    if (may_need_deferring) {
+        RICHCMP_GIVE_UP_IF_NEEDED(self, other);  /* macro defined outside this chunk; presumably may return early -- confirm */
+    }
+    switch (res) {
+        case DEFER_TO_OTHER_KNOWN_SCALAR:
+            Py_RETURN_NOTIMPLEMENTED;
+        case CONVERSION_SUCCESS:
+            break;  /* value successfully extracted into arg2; proceed to the comparison */
+        case OTHER_IS_UNKNOWN_OBJECT:  /* IS_clongdouble is defined just above this function, so the #if branch below applies */
+#if defined(IS_longdouble) || defined(IS_clongdouble)
+            Py_RETURN_NOTIMPLEMENTED;
+#endif
+        case PROMOTION_REQUIRED:
+            return PyGenericArrType_Type.tp_richcompare(self, other, cmp_op);  /* delegate to the generic scalar type's comparison */
+        case CONVERT_PYSCALAR:
+            if (CLONGDOUBLE_setitem(other, (char *)&arg2, NULL) < 0) {  /* store other into arg2; a negative return is treated as failure */
+                return NULL;
+            }
+            break;
+        default:
+            assert(0);  /* error was checked already, impossible to reach */
+            return NULL;
+    }
+
+    arg1 = PyArrayScalar_VAL(self, CLongDouble);
+
+    /* Perform the requested comparison on arg1 and arg2 using the cmplx_cmp_* helpers. */
+    switch (cmp_op) {
+    case Py_EQ:
+        out = cmplx_cmp_eq(arg1, arg2);
+        break;
+    case Py_NE:
+        out = cmplx_cmp_ne(arg1, arg2);
+        break;
+    case Py_LE:
+        out = cmplx_cmp_le(arg1, arg2);
+        break;
+    case Py_GE:
+        out = cmplx_cmp_ge(arg1, arg2);
+        break;
+    case Py_LT:
+        out = cmplx_cmp_lt(arg1, arg2);
+        break;
+    case Py_GT:
+        out = cmplx_cmp_gt(arg1, arg2);
+        break;
+    }
+
+    if (out) {
+        PyArrayScalar_RETURN_TRUE;
+    }
+    else {
+        PyArrayScalar_RETURN_FALSE;
+    }
+}
+
+#undef IS_clongdouble
+
+
+#if __GNUC__ < 10
+    #pragma GCC diagnostic pop
+#endif
+
+
+#line 1911
+static PyNumberMethods byte_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyByteArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)byte_add,
+    .nb_subtract = (binaryfunc)byte_subtract,
+    .nb_multiply = (binaryfunc)byte_multiply,
+    .nb_remainder = (binaryfunc)byte_remainder,
+    .nb_divmod = (binaryfunc)byte_divmod,
+    .nb_power = (ternaryfunc)byte_power,
+    .nb_negative = (unaryfunc)byte_negative,
+    .nb_positive = (unaryfunc)byte_positive,
+    .nb_absolute = (unaryfunc)byte_absolute,
+    .nb_bool = (inquiry)byte_bool,
+    .nb_invert = (unaryfunc)byte_invert,
+    .nb_lshift = (binaryfunc)byte_lshift,
+    .nb_rshift = (binaryfunc)byte_rshift,
+    .nb_and = (binaryfunc)byte_and,
+    .nb_xor = (binaryfunc)byte_xor,
+    .nb_or = (binaryfunc)byte_or,
+    .nb_int = (unaryfunc)byte_int,
+    .nb_float = (unaryfunc)byte_float,
+    .nb_floor_divide = (binaryfunc)byte_floor_divide,
+    .nb_true_divide = (binaryfunc)byte_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyByteArrType_Type */
+};
+
+#line 1911
+static PyNumberMethods ubyte_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyUByteArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)ubyte_add,
+    .nb_subtract = (binaryfunc)ubyte_subtract,
+    .nb_multiply = (binaryfunc)ubyte_multiply,
+    .nb_remainder = (binaryfunc)ubyte_remainder,
+    .nb_divmod = (binaryfunc)ubyte_divmod,
+    .nb_power = (ternaryfunc)ubyte_power,
+    .nb_negative = (unaryfunc)ubyte_negative,
+    .nb_positive = (unaryfunc)ubyte_positive,
+    .nb_absolute = (unaryfunc)ubyte_absolute,
+    .nb_bool = (inquiry)ubyte_bool,
+    .nb_invert = (unaryfunc)ubyte_invert,
+    .nb_lshift = (binaryfunc)ubyte_lshift,
+    .nb_rshift = (binaryfunc)ubyte_rshift,
+    .nb_and = (binaryfunc)ubyte_and,
+    .nb_xor = (binaryfunc)ubyte_xor,
+    .nb_or = (binaryfunc)ubyte_or,
+    .nb_int = (unaryfunc)ubyte_int,
+    .nb_float = (unaryfunc)ubyte_float,
+    .nb_floor_divide = (binaryfunc)ubyte_floor_divide,
+    .nb_true_divide = (binaryfunc)ubyte_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyUByteArrType_Type */
+};
+
+#line 1911
+static PyNumberMethods short_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyShortArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)short_add,
+    .nb_subtract = (binaryfunc)short_subtract,
+    .nb_multiply = (binaryfunc)short_multiply,
+    .nb_remainder = (binaryfunc)short_remainder,
+    .nb_divmod = (binaryfunc)short_divmod,
+    .nb_power = (ternaryfunc)short_power,
+    .nb_negative = (unaryfunc)short_negative,
+    .nb_positive = (unaryfunc)short_positive,
+    .nb_absolute = (unaryfunc)short_absolute,
+    .nb_bool = (inquiry)short_bool,
+    .nb_invert = (unaryfunc)short_invert,
+    .nb_lshift = (binaryfunc)short_lshift,
+    .nb_rshift = (binaryfunc)short_rshift,
+    .nb_and = (binaryfunc)short_and,
+    .nb_xor = (binaryfunc)short_xor,
+    .nb_or = (binaryfunc)short_or,
+    .nb_int = (unaryfunc)short_int,
+    .nb_float = (unaryfunc)short_float,
+    .nb_floor_divide = (binaryfunc)short_floor_divide,
+    .nb_true_divide = (binaryfunc)short_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyShortArrType_Type */
+};
+
+#line 1911
+static PyNumberMethods ushort_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyUShortArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)ushort_add,
+    .nb_subtract = (binaryfunc)ushort_subtract,
+    .nb_multiply = (binaryfunc)ushort_multiply,
+    .nb_remainder = (binaryfunc)ushort_remainder,
+    .nb_divmod = (binaryfunc)ushort_divmod,
+    .nb_power = (ternaryfunc)ushort_power,
+    .nb_negative = (unaryfunc)ushort_negative,
+    .nb_positive = (unaryfunc)ushort_positive,
+    .nb_absolute = (unaryfunc)ushort_absolute,
+    .nb_bool = (inquiry)ushort_bool,
+    .nb_invert = (unaryfunc)ushort_invert,
+    .nb_lshift = (binaryfunc)ushort_lshift,
+    .nb_rshift = (binaryfunc)ushort_rshift,
+    .nb_and = (binaryfunc)ushort_and,
+    .nb_xor = (binaryfunc)ushort_xor,
+    .nb_or = (binaryfunc)ushort_or,
+    .nb_int = (unaryfunc)ushort_int,
+    .nb_float = (unaryfunc)ushort_float,
+    .nb_floor_divide = (binaryfunc)ushort_floor_divide,
+    .nb_true_divide = (binaryfunc)ushort_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyUShortArrType_Type */
+};
+
+#line 1911
+static PyNumberMethods int_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyIntArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)int_add,
+    .nb_subtract = (binaryfunc)int_subtract,
+    .nb_multiply = (binaryfunc)int_multiply,
+    .nb_remainder = (binaryfunc)int_remainder,
+    .nb_divmod = (binaryfunc)int_divmod,
+    .nb_power = (ternaryfunc)int_power,
+    .nb_negative = (unaryfunc)int_negative,
+    .nb_positive = (unaryfunc)int_positive,
+    .nb_absolute = (unaryfunc)int_absolute,
+    .nb_bool = (inquiry)int_bool,
+    .nb_invert = (unaryfunc)int_invert,
+    .nb_lshift = (binaryfunc)int_lshift,
+    .nb_rshift = (binaryfunc)int_rshift,
+    .nb_and = (binaryfunc)int_and,
+    .nb_xor = (binaryfunc)int_xor,
+    .nb_or = (binaryfunc)int_or,
+    .nb_int = (unaryfunc)int_int,
+    .nb_float = (unaryfunc)int_float,
+    .nb_floor_divide = (binaryfunc)int_floor_divide,
+    .nb_true_divide = (binaryfunc)int_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyIntArrType_Type */
+};
+
+#line 1911
+static PyNumberMethods uint_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyUIntArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)uint_add,
+    .nb_subtract = (binaryfunc)uint_subtract,
+    .nb_multiply = (binaryfunc)uint_multiply,
+    .nb_remainder = (binaryfunc)uint_remainder,
+    .nb_divmod = (binaryfunc)uint_divmod,
+    .nb_power = (ternaryfunc)uint_power,
+    .nb_negative = (unaryfunc)uint_negative,
+    .nb_positive = (unaryfunc)uint_positive,
+    .nb_absolute = (unaryfunc)uint_absolute,
+    .nb_bool = (inquiry)uint_bool,
+    .nb_invert = (unaryfunc)uint_invert,
+    .nb_lshift = (binaryfunc)uint_lshift,
+    .nb_rshift = (binaryfunc)uint_rshift,
+    .nb_and = (binaryfunc)uint_and,
+    .nb_xor = (binaryfunc)uint_xor,
+    .nb_or = (binaryfunc)uint_or,
+    .nb_int = (unaryfunc)uint_int,
+    .nb_float = (unaryfunc)uint_float,
+    .nb_floor_divide = (binaryfunc)uint_floor_divide,
+    .nb_true_divide = (binaryfunc)uint_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyUIntArrType_Type */
+};
+
+#line 1911
+static PyNumberMethods long_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyLongArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)long_add,
+    .nb_subtract = (binaryfunc)long_subtract,
+    .nb_multiply = (binaryfunc)long_multiply,
+    .nb_remainder = (binaryfunc)long_remainder,
+    .nb_divmod = (binaryfunc)long_divmod,
+    .nb_power = (ternaryfunc)long_power,
+    .nb_negative = (unaryfunc)long_negative,
+    .nb_positive = (unaryfunc)long_positive,
+    .nb_absolute = (unaryfunc)long_absolute,
+    .nb_bool = (inquiry)long_bool,
+    .nb_invert = (unaryfunc)long_invert,
+    .nb_lshift = (binaryfunc)long_lshift,
+    .nb_rshift = (binaryfunc)long_rshift,
+    .nb_and = (binaryfunc)long_and,
+    .nb_xor = (binaryfunc)long_xor,
+    .nb_or = (binaryfunc)long_or,
+    .nb_int = (unaryfunc)long_int,
+    .nb_float = (unaryfunc)long_float,
+    .nb_floor_divide = (binaryfunc)long_floor_divide,
+    .nb_true_divide = (binaryfunc)long_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyLongArrType_Type */
+};
+
+#line 1911
+static PyNumberMethods ulong_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyULongArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)ulong_add,
+    .nb_subtract = (binaryfunc)ulong_subtract,
+    .nb_multiply = (binaryfunc)ulong_multiply,
+    .nb_remainder = (binaryfunc)ulong_remainder,
+    .nb_divmod = (binaryfunc)ulong_divmod,
+    .nb_power = (ternaryfunc)ulong_power,
+    .nb_negative = (unaryfunc)ulong_negative,
+    .nb_positive = (unaryfunc)ulong_positive,
+    .nb_absolute = (unaryfunc)ulong_absolute,
+    .nb_bool = (inquiry)ulong_bool,
+    .nb_invert = (unaryfunc)ulong_invert,
+    .nb_lshift = (binaryfunc)ulong_lshift,
+    .nb_rshift = (binaryfunc)ulong_rshift,
+    .nb_and = (binaryfunc)ulong_and,
+    .nb_xor = (binaryfunc)ulong_xor,
+    .nb_or = (binaryfunc)ulong_or,
+    .nb_int = (unaryfunc)ulong_int,
+    .nb_float = (unaryfunc)ulong_float,
+    .nb_floor_divide = (binaryfunc)ulong_floor_divide,
+    .nb_true_divide = (binaryfunc)ulong_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyULongArrType_Type */
+};
+
+#line 1911
+static PyNumberMethods longlong_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyLongLongArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)longlong_add,
+    .nb_subtract = (binaryfunc)longlong_subtract,
+    .nb_multiply = (binaryfunc)longlong_multiply,
+    .nb_remainder = (binaryfunc)longlong_remainder,
+    .nb_divmod = (binaryfunc)longlong_divmod,
+    .nb_power = (ternaryfunc)longlong_power,
+    .nb_negative = (unaryfunc)longlong_negative,
+    .nb_positive = (unaryfunc)longlong_positive,
+    .nb_absolute = (unaryfunc)longlong_absolute,
+    .nb_bool = (inquiry)longlong_bool,
+    .nb_invert = (unaryfunc)longlong_invert,
+    .nb_lshift = (binaryfunc)longlong_lshift,
+    .nb_rshift = (binaryfunc)longlong_rshift,
+    .nb_and = (binaryfunc)longlong_and,
+    .nb_xor = (binaryfunc)longlong_xor,
+    .nb_or = (binaryfunc)longlong_or,
+    .nb_int = (unaryfunc)longlong_int,
+    .nb_float = (unaryfunc)longlong_float,
+    .nb_floor_divide = (binaryfunc)longlong_floor_divide,
+    .nb_true_divide = (binaryfunc)longlong_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyLongLongArrType_Type */
+};
+
+#line 1911
+static PyNumberMethods ulonglong_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyULongLongArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)ulonglong_add,
+    .nb_subtract = (binaryfunc)ulonglong_subtract,
+    .nb_multiply = (binaryfunc)ulonglong_multiply,
+    .nb_remainder = (binaryfunc)ulonglong_remainder,
+    .nb_divmod = (binaryfunc)ulonglong_divmod,
+    .nb_power = (ternaryfunc)ulonglong_power,
+    .nb_negative = (unaryfunc)ulonglong_negative,
+    .nb_positive = (unaryfunc)ulonglong_positive,
+    .nb_absolute = (unaryfunc)ulonglong_absolute,
+    .nb_bool = (inquiry)ulonglong_bool,
+    .nb_invert = (unaryfunc)ulonglong_invert,
+    .nb_lshift = (binaryfunc)ulonglong_lshift,
+    .nb_rshift = (binaryfunc)ulonglong_rshift,
+    .nb_and = (binaryfunc)ulonglong_and,
+    .nb_xor = (binaryfunc)ulonglong_xor,
+    .nb_or = (binaryfunc)ulonglong_or,
+    .nb_int = (unaryfunc)ulonglong_int,
+    .nb_float = (unaryfunc)ulonglong_float,
+    .nb_floor_divide = (binaryfunc)ulonglong_floor_divide,
+    .nb_true_divide = (binaryfunc)ulonglong_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyULongLongArrType_Type */
+};
+
+#line 1911
+static PyNumberMethods half_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyHalfArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)half_add,
+    .nb_subtract = (binaryfunc)half_subtract,
+    .nb_multiply = (binaryfunc)half_multiply,
+    .nb_remainder = (binaryfunc)half_remainder,
+    .nb_divmod = (binaryfunc)half_divmod,
+    .nb_power = (ternaryfunc)half_power,
+    .nb_negative = (unaryfunc)half_negative,
+    .nb_positive = (unaryfunc)half_positive,
+    .nb_absolute = (unaryfunc)half_absolute,
+    .nb_bool = (inquiry)half_bool,
+    .nb_invert = (unaryfunc)half_invert,
+    .nb_lshift = (binaryfunc)half_lshift,
+    .nb_rshift = (binaryfunc)half_rshift,
+    .nb_and = (binaryfunc)half_and,
+    .nb_xor = (binaryfunc)half_xor,
+    .nb_or = (binaryfunc)half_or,
+    .nb_int = (unaryfunc)half_int,
+    .nb_float = (unaryfunc)half_float,
+    .nb_floor_divide = (binaryfunc)half_floor_divide,
+    .nb_true_divide = (binaryfunc)half_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyHalfArrType_Type */
+};
+
+#line 1911
+static PyNumberMethods float_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyFloatArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)float_add,
+    .nb_subtract = (binaryfunc)float_subtract,
+    .nb_multiply = (binaryfunc)float_multiply,
+    .nb_remainder = (binaryfunc)float_remainder,
+    .nb_divmod = (binaryfunc)float_divmod,
+    .nb_power = (ternaryfunc)float_power,
+    .nb_negative = (unaryfunc)float_negative,
+    .nb_positive = (unaryfunc)float_positive,
+    .nb_absolute = (unaryfunc)float_absolute,
+    .nb_bool = (inquiry)float_bool,
+    .nb_invert = (unaryfunc)float_invert,
+    .nb_lshift = (binaryfunc)float_lshift,
+    .nb_rshift = (binaryfunc)float_rshift,
+    .nb_and = (binaryfunc)float_and,
+    .nb_xor = (binaryfunc)float_xor,
+    .nb_or = (binaryfunc)float_or,
+    .nb_int = (unaryfunc)float_int,
+    .nb_float = (unaryfunc)float_float,
+    .nb_floor_divide = (binaryfunc)float_floor_divide,
+    .nb_true_divide = (binaryfunc)float_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyFloatArrType_Type */
+};
+
+#line 1911
+static PyNumberMethods double_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyDoubleArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)double_add,
+    .nb_subtract = (binaryfunc)double_subtract,
+    .nb_multiply = (binaryfunc)double_multiply,
+    .nb_remainder = (binaryfunc)double_remainder,
+    .nb_divmod = (binaryfunc)double_divmod,
+    .nb_power = (ternaryfunc)double_power,
+    .nb_negative = (unaryfunc)double_negative,
+    .nb_positive = (unaryfunc)double_positive,
+    .nb_absolute = (unaryfunc)double_absolute,
+    .nb_bool = (inquiry)double_bool,
+    .nb_invert = (unaryfunc)double_invert,
+    .nb_lshift = (binaryfunc)double_lshift,
+    .nb_rshift = (binaryfunc)double_rshift,
+    .nb_and = (binaryfunc)double_and,
+    .nb_xor = (binaryfunc)double_xor,
+    .nb_or = (binaryfunc)double_or,
+    .nb_int = (unaryfunc)double_int,
+    .nb_float = (unaryfunc)double_float,
+    .nb_floor_divide = (binaryfunc)double_floor_divide,
+    .nb_true_divide = (binaryfunc)double_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyDoubleArrType_Type */
+};
+
+#line 1911
+static PyNumberMethods longdouble_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyLongDoubleArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)longdouble_add,
+    .nb_subtract = (binaryfunc)longdouble_subtract,
+    .nb_multiply = (binaryfunc)longdouble_multiply,
+    .nb_remainder = (binaryfunc)longdouble_remainder,
+    .nb_divmod = (binaryfunc)longdouble_divmod,
+    .nb_power = (ternaryfunc)longdouble_power,
+    .nb_negative = (unaryfunc)longdouble_negative,
+    .nb_positive = (unaryfunc)longdouble_positive,
+    .nb_absolute = (unaryfunc)longdouble_absolute,
+    .nb_bool = (inquiry)longdouble_bool,
+    .nb_invert = (unaryfunc)longdouble_invert,
+    .nb_lshift = (binaryfunc)longdouble_lshift,
+    .nb_rshift = (binaryfunc)longdouble_rshift,
+    .nb_and = (binaryfunc)longdouble_and,
+    .nb_xor = (binaryfunc)longdouble_xor,
+    .nb_or = (binaryfunc)longdouble_or,
+    .nb_int = (unaryfunc)longdouble_int,
+    .nb_float = (unaryfunc)longdouble_float,
+    .nb_floor_divide = (binaryfunc)longdouble_floor_divide,
+    .nb_true_divide = (binaryfunc)longdouble_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyLongDoubleArrType_Type */
+};
+
+#line 1911
+static PyNumberMethods cfloat_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyCFloatArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)cfloat_add,
+    .nb_subtract = (binaryfunc)cfloat_subtract,
+    .nb_multiply = (binaryfunc)cfloat_multiply,
+    .nb_remainder = (binaryfunc)cfloat_remainder,
+    .nb_divmod = (binaryfunc)cfloat_divmod,
+    .nb_power = (ternaryfunc)cfloat_power,
+    .nb_negative = (unaryfunc)cfloat_negative,
+    .nb_positive = (unaryfunc)cfloat_positive,
+    .nb_absolute = (unaryfunc)cfloat_absolute,
+    .nb_bool = (inquiry)cfloat_bool,
+    .nb_invert = (unaryfunc)cfloat_invert,
+    .nb_lshift = (binaryfunc)cfloat_lshift,
+    .nb_rshift = (binaryfunc)cfloat_rshift,
+    .nb_and = (binaryfunc)cfloat_and,
+    .nb_xor = (binaryfunc)cfloat_xor,
+    .nb_or = (binaryfunc)cfloat_or,
+    .nb_int = (unaryfunc)cfloat_int,
+    .nb_float = (unaryfunc)cfloat_float,
+    .nb_floor_divide = (binaryfunc)cfloat_floor_divide,
+    .nb_true_divide = (binaryfunc)cfloat_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyCFloatArrType_Type */
+};
+
+#line 1911
+static PyNumberMethods cdouble_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyCDoubleArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)cdouble_add,
+    .nb_subtract = (binaryfunc)cdouble_subtract,
+    .nb_multiply = (binaryfunc)cdouble_multiply,
+    .nb_remainder = (binaryfunc)cdouble_remainder,
+    .nb_divmod = (binaryfunc)cdouble_divmod,
+    .nb_power = (ternaryfunc)cdouble_power,
+    .nb_negative = (unaryfunc)cdouble_negative,
+    .nb_positive = (unaryfunc)cdouble_positive,
+    .nb_absolute = (unaryfunc)cdouble_absolute,
+    .nb_bool = (inquiry)cdouble_bool,
+    .nb_invert = (unaryfunc)cdouble_invert,
+    .nb_lshift = (binaryfunc)cdouble_lshift,
+    .nb_rshift = (binaryfunc)cdouble_rshift,
+    .nb_and = (binaryfunc)cdouble_and,
+    .nb_xor = (binaryfunc)cdouble_xor,
+    .nb_or = (binaryfunc)cdouble_or,
+    .nb_int = (unaryfunc)cdouble_int,
+    .nb_float = (unaryfunc)cdouble_float,
+    .nb_floor_divide = (binaryfunc)cdouble_floor_divide,
+    .nb_true_divide = (binaryfunc)cdouble_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyCDoubleArrType_Type */
+};
+
+#line 1911
+static PyNumberMethods clongdouble_as_number = {  /* number-protocol slots; add_scalarmath() installs this as PyCLongDoubleArrType_Type.tp_as_number */
+    .nb_add = (binaryfunc)clongdouble_add,
+    .nb_subtract = (binaryfunc)clongdouble_subtract,
+    .nb_multiply = (binaryfunc)clongdouble_multiply,
+    .nb_remainder = (binaryfunc)clongdouble_remainder,
+    .nb_divmod = (binaryfunc)clongdouble_divmod,
+    .nb_power = (ternaryfunc)clongdouble_power,
+    .nb_negative = (unaryfunc)clongdouble_negative,
+    .nb_positive = (unaryfunc)clongdouble_positive,
+    .nb_absolute = (unaryfunc)clongdouble_absolute,
+    .nb_bool = (inquiry)clongdouble_bool,
+    .nb_invert = (unaryfunc)clongdouble_invert,
+    .nb_lshift = (binaryfunc)clongdouble_lshift,
+    .nb_rshift = (binaryfunc)clongdouble_rshift,
+    .nb_and = (binaryfunc)clongdouble_and,
+    .nb_xor = (binaryfunc)clongdouble_xor,
+    .nb_or = (binaryfunc)clongdouble_or,
+    .nb_int = (unaryfunc)clongdouble_int,
+    .nb_float = (unaryfunc)clongdouble_float,
+    .nb_floor_divide = (binaryfunc)clongdouble_floor_divide,
+    .nb_true_divide = (binaryfunc)clongdouble_true_divide,
+    /* TODO: This struct/initialization should not be split between files */
+    .nb_index = (unaryfunc)NULL,  /* placeholder; add_scalarmath() copies in the nb_index already on PyCLongDoubleArrType_Type */
+};
+
+
+NPY_NO_EXPORT void
+add_scalarmath(void)
+{  /* For each scalar type below: install this file's number-method table and rich comparison function. */
+    #line 1950
+    byte_as_number.nb_index = PyByteArrType_Type.tp_as_number->nb_index;  /* carry over the nb_index the type already has */
+    PyByteArrType_Type.tp_as_number = &(byte_as_number);  /* then replace the type's whole number-method table */
+    PyByteArrType_Type.tp_richcompare = byte_richcompare;  /* and its rich comparison slot */
+    
+#line 1950
+    ubyte_as_number.nb_index = PyUByteArrType_Type.tp_as_number->nb_index;
+    PyUByteArrType_Type.tp_as_number = &(ubyte_as_number);
+    PyUByteArrType_Type.tp_richcompare = ubyte_richcompare;
+    
+#line 1950
+    short_as_number.nb_index = PyShortArrType_Type.tp_as_number->nb_index;
+    PyShortArrType_Type.tp_as_number = &(short_as_number);
+    PyShortArrType_Type.tp_richcompare = short_richcompare;
+    
+#line 1950
+    ushort_as_number.nb_index = PyUShortArrType_Type.tp_as_number->nb_index;
+    PyUShortArrType_Type.tp_as_number = &(ushort_as_number);
+    PyUShortArrType_Type.tp_richcompare = ushort_richcompare;
+    
+#line 1950
+    int_as_number.nb_index = PyIntArrType_Type.tp_as_number->nb_index;
+    PyIntArrType_Type.tp_as_number = &(int_as_number);
+    PyIntArrType_Type.tp_richcompare = int_richcompare;
+    
+#line 1950
+    uint_as_number.nb_index = PyUIntArrType_Type.tp_as_number->nb_index;
+    PyUIntArrType_Type.tp_as_number = &(uint_as_number);
+    PyUIntArrType_Type.tp_richcompare = uint_richcompare;
+    
+#line 1950
+    long_as_number.nb_index = PyLongArrType_Type.tp_as_number->nb_index;
+    PyLongArrType_Type.tp_as_number = &(long_as_number);
+    PyLongArrType_Type.tp_richcompare = long_richcompare;
+    
+#line 1950
+    ulong_as_number.nb_index = PyULongArrType_Type.tp_as_number->nb_index;
+    PyULongArrType_Type.tp_as_number = &(ulong_as_number);
+    PyULongArrType_Type.tp_richcompare = ulong_richcompare;
+    
+#line 1950
+    longlong_as_number.nb_index = PyLongLongArrType_Type.tp_as_number->nb_index;
+    PyLongLongArrType_Type.tp_as_number = &(longlong_as_number);
+    PyLongLongArrType_Type.tp_richcompare = longlong_richcompare;
+    
+#line 1950
+    ulonglong_as_number.nb_index = PyULongLongArrType_Type.tp_as_number->nb_index;
+    PyULongLongArrType_Type.tp_as_number = &(ulonglong_as_number);
+    PyULongLongArrType_Type.tp_richcompare = ulonglong_richcompare;
+    
+#line 1950
+    half_as_number.nb_index = PyHalfArrType_Type.tp_as_number->nb_index;
+    PyHalfArrType_Type.tp_as_number = &(half_as_number);
+    PyHalfArrType_Type.tp_richcompare = half_richcompare;
+    
+#line 1950
+    float_as_number.nb_index = PyFloatArrType_Type.tp_as_number->nb_index;
+    PyFloatArrType_Type.tp_as_number = &(float_as_number);
+    PyFloatArrType_Type.tp_richcompare = float_richcompare;
+    
+#line 1950
+    double_as_number.nb_index = PyDoubleArrType_Type.tp_as_number->nb_index;
+    PyDoubleArrType_Type.tp_as_number = &(double_as_number);
+    PyDoubleArrType_Type.tp_richcompare = double_richcompare;
+    
+#line 1950
+    longdouble_as_number.nb_index = PyLongDoubleArrType_Type.tp_as_number->nb_index;
+    PyLongDoubleArrType_Type.tp_as_number = &(longdouble_as_number);
+    PyLongDoubleArrType_Type.tp_richcompare = longdouble_richcompare;
+    
+#line 1950
+    cfloat_as_number.nb_index = PyCFloatArrType_Type.tp_as_number->nb_index;
+    PyCFloatArrType_Type.tp_as_number = &(cfloat_as_number);
+    PyCFloatArrType_Type.tp_richcompare = cfloat_richcompare;
+    
+#line 1950
+    cdouble_as_number.nb_index = PyCDoubleArrType_Type.tp_as_number->nb_index;
+    PyCDoubleArrType_Type.tp_as_number = &(cdouble_as_number);
+    PyCDoubleArrType_Type.tp_richcompare = cdouble_richcompare;
+    
+#line 1950
+    clongdouble_as_number.nb_index = PyCLongDoubleArrType_Type.tp_as_number->nb_index;
+    PyCLongDoubleArrType_Type.tp_as_number = &(clongdouble_as_number);
+    PyCLongDoubleArrType_Type.tp_richcompare = clongdouble_richcompare;
+    
+}
+
+
+NPY_NO_EXPORT int initscalarmath(PyObject * m)
+{  /* Runs add_scalarmath(); the argument m is not used in this function. */
+    add_scalarmath();
+
+    return 0;  /* unconditionally returns 0 */
+}
+
diff --git a/numpy/core/src/_generated/scalartypes.c b/numpy/core/src/_generated/scalartypes.c
new file mode 100644
index 000000000000..f1ebd93c83cf
--- /dev/null
+++ b/numpy/core/src/_generated/scalartypes.c
@@ -0,0 +1,10557 @@
+#line 1 "numpy/core/src/multiarray/scalartypes.c.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+/* -*- c -*- */
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <structmember.h>
+
+#define NPY_NO_DEPRECATED_API NPY_API_VERSION
+#ifndef _MULTIARRAYMODULE
+#define _MULTIARRAYMODULE
+#endif
+
+#include "numpy/arrayobject.h"
+#include "numpy/npy_math.h"
+#include "numpy/halffloat.h"
+#include "numpy/arrayscalars.h"
+
+#include "npy_pycompat.h"
+
+#include "arraytypes.h"
+#include "npy_config.h"
+#include "mapping.h"
+#include "ctors.h"
+#include "usertypes.h"
+#include "numpyos.h"
+#include "can_cast_table.h"
+#include "common.h"
+#include "scalartypes.h"
+#include "_datetime.h"
+#include "datetime_strings.h"
+#include "alloc.h"
+#include "npy_import.h"
+#include "dragon4.h"
+#include "npy_longdouble.h"
+#include "npy_buffer.h"
+
+#include <stdlib.h>
+
+#include "binop_override.h"
+
+/*
+ * Used for allocating a single scalar, so use the default numpy
+ * memory allocators instead of the (maybe) user overrides.
+ */
+NPY_NO_EXPORT void *
+npy_alloc_cache_zero(size_t nmemb, size_t size);  /* prototype only; the definition is not in this chunk */
+
+NPY_NO_EXPORT void
+npy_free_cache(void * p, npy_uintp sz);  /* prototype only; the definition is not in this chunk */
+
+NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[] = {  /* two statically initialized PyBoolArrType_Type objects */
+    {PyObject_HEAD_INIT(&PyBoolArrType_Type) 0},  /* index 0: initialized with the value 0 */
+    {PyObject_HEAD_INIT(&PyBoolArrType_Type) 1},  /* index 1: initialized with the value 1 */
+};
+
+/* TimeInteger is deleted, but the type object is still declared here to fill the API slot. */
+NPY_NO_EXPORT PyTypeObject PyTimeIntegerArrType_Type;
+
+/*
+ * The type objects below statically set only tp_name and tp_basicsize.
+ * Inheritance is established later, when tp_bases is set (or tp_base for single inheritance).
+ */
+
+#line 68
+NPY_NO_EXPORT PyTypeObject PyNumberArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy.number",
+    .tp_basicsize = sizeof(PyObject),
+};
+
+#line 68
+NPY_NO_EXPORT PyTypeObject PyIntegerArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy.integer",
+    .tp_basicsize = sizeof(PyObject),
+};
+
+#line 68
+NPY_NO_EXPORT PyTypeObject PySignedIntegerArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy.signedinteger",
+    .tp_basicsize = sizeof(PyObject),
+};
+
+#line 68
+NPY_NO_EXPORT PyTypeObject PyUnsignedIntegerArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy.unsignedinteger",
+    .tp_basicsize = sizeof(PyObject),
+};
+
+#line 68
+NPY_NO_EXPORT PyTypeObject PyInexactArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy.inexact",
+    .tp_basicsize = sizeof(PyObject),
+};
+
+#line 68
+NPY_NO_EXPORT PyTypeObject PyFloatingArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy.floating",
+    .tp_basicsize = sizeof(PyObject),
+};
+
+#line 68
+NPY_NO_EXPORT PyTypeObject PyComplexFloatingArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy.complexfloating",
+    .tp_basicsize = sizeof(PyObject),
+};
+
+#line 68
+NPY_NO_EXPORT PyTypeObject PyFlexibleArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy.flexible",
+    .tp_basicsize = sizeof(PyObject),
+};
+
+#line 68
+NPY_NO_EXPORT PyTypeObject PyCharacterArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy.character",
+    .tp_basicsize = sizeof(PyObject),
+};
+
+
+static PyObject *
+gentype_alloc(PyTypeObject *type, Py_ssize_t nitems)
+{  /* Allocate a zero-filled object of `type` and initialize its header; returns NULL if the allocation fails. */
+    PyObject *obj;
+    const size_t size = _PyObject_VAR_SIZE(type, nitems + 1);  /* size is computed for nitems + 1 items */
+
+    obj = (PyObject *)PyObject_Malloc(size);
+    if (obj == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    /*
+     * If we didn't need to zero the memory, we could use
+     * PyObject_{New, NewVar} for this whole function.
+     */
+    memset(obj, 0, size);
+    if (type->tp_itemsize == 0) {  /* type declares no per-item size: plain object init */
+        PyObject_Init(obj, type);
+    }
+    else {  /* variable-size type: initialize as a var-object with nitems */
+        (void) PyObject_InitVar((PyVarObject *)obj, type, nitems);
+    }
+    return obj;
+}
+
+static void
+gentype_dealloc(PyObject *v)
+{
+    Py_TYPE(v)->tp_free(v);  /* release v through the tp_free slot of its own type */
+}
+
+static void
+gentype_free(PyObject *v)
+{
+    /*
+     * Have an explicit tp_free to enforce inheritance from it.
+     * PyObject_Free is also the tp_free of PyBaseObject, so Python does not
+     * COPYSLOT it; instead it takes the next parent, PyInt, which has a
+     * different allocator.
+     */
+    PyObject_Free(v);
+}
+
+
+static PyObject *
+gentype_power(PyObject *m1, PyObject *m2, PyObject *modulo)
+{  /* Power: returns NotImplemented when a modulus is given, otherwise forwards to PyArray_Type's nb_power. */
+    if (modulo != Py_None) {
+        /* modular exponentiation is not implemented (gh-8804) */
+        Py_INCREF(Py_NotImplemented);
+        return Py_NotImplemented;
+    }
+
+    BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_power, gentype_power);  /* macro not visible in this chunk; presumably may return early to defer to the other operand -- confirm */
+    return PyArray_Type.tp_as_number->nb_power(m1, m2, Py_None);  /* the modulus passed on is always Py_None */
+}
+
+static PyObject *
+gentype_generic_method(PyObject *self, PyObject *args, PyObject *kwds,
+        char *str)
+{  /* Look up the attribute named str on an array built from self and call it with args/kwds. */
+    PyObject *arr, *meth, *ret;
+
+    arr = PyArray_FromScalar(self, NULL);  /* array object built from the scalar self */
+    if (arr == NULL) {
+        return NULL;
+    }
+    meth = PyObject_GetAttrString(arr, str);
+    if (meth == NULL) {
+        Py_DECREF(arr);
+        return NULL;
+    }
+    if (kwds == NULL) {
+        ret = PyObject_CallObject(meth, args);  /* positional arguments only */
+    }
+    else {
+        ret = PyObject_Call(meth, args, kwds);
+    }
+    Py_DECREF(meth);  /* meth and arr are released whether or not the call succeeded */
+    Py_DECREF(arr);
+    if (ret && PyArray_Check(ret)) {
+        return PyArray_Return((PyArrayObject *)ret);  /* array results are passed through PyArray_Return */
+    }
+    else {
+        return ret;  /* a non-array result, or NULL if the call failed */
+    }
+}
+
+static PyObject *
+gentype_add(PyObject *m1, PyObject* m2)
+{  /* Addition: returns NotImplemented when m1 is bytes/str, otherwise forwards to PyArray_Type's nb_add. */
+    /* special case str.__radd__, which should not call array_add */
+    if (PyBytes_Check(m1) || PyUnicode_Check(m1)) {
+        Py_INCREF(Py_NotImplemented);
+        return Py_NotImplemented;
+    }
+    BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_add, gentype_add);  /* macro not visible in this chunk; presumably may return early to defer to the other operand -- confirm */
+    return PyArray_Type.tp_as_number->nb_add(m1, m2);
+}
+
+#line 180
+static PyObject *
+gentype_subtract(PyObject *m1, PyObject *m2)
+{
+    BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_subtract, gentype_subtract);  /* macro not visible in this chunk; presumably may return early to defer to the other operand -- confirm */
+    return PyArray_Type.tp_as_number->nb_subtract(m1, m2);  /* forward to PyArray_Type's nb_subtract slot */
+}
+
+
+#line 180
+static PyObject *
+gentype_remainder(PyObject *m1, PyObject *m2)
+{
+    BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_remainder, gentype_remainder);  /* macro not visible in this chunk; presumably may return early to defer to the other operand -- confirm */
+    return PyArray_Type.tp_as_number->nb_remainder(m1, m2);  /* forward to PyArray_Type's nb_remainder slot */
+}
+
+
+#line 180
+static PyObject *
+gentype_divmod(PyObject *m1, PyObject *m2)
+{
+    PyObject *res;
+
+    /* Give-up check first (macro defined elsewhere), then ndarray's slot. */
+    BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_divmod, gentype_divmod);
+    res = PyArray_Type.tp_as_number->nb_divmod(m1, m2);
+    return res;
+}
+
+
+#line 180
+static PyObject *
+gentype_lshift(PyObject *m1, PyObject *m2)
+{
+    PyObject *res;
+
+    /* Give-up check first (macro defined elsewhere), then ndarray's slot. */
+    BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_lshift, gentype_lshift);
+    res = PyArray_Type.tp_as_number->nb_lshift(m1, m2);
+    return res;
+}
+
+
+#line 180
+static PyObject *
+gentype_rshift(PyObject *m1, PyObject *m2)
+{
+    PyObject *res;
+
+    /* Give-up check first (macro defined elsewhere), then ndarray's slot. */
+    BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_rshift, gentype_rshift);
+    res = PyArray_Type.tp_as_number->nb_rshift(m1, m2);
+    return res;
+}
+
+
+#line 180
+static PyObject *
+gentype_and(PyObject *m1, PyObject *m2)
+{
+    PyObject *res;
+
+    /* Give-up check first (macro defined elsewhere), then ndarray's slot. */
+    BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_and, gentype_and);
+    res = PyArray_Type.tp_as_number->nb_and(m1, m2);
+    return res;
+}
+
+
+#line 180
+static PyObject *
+gentype_xor(PyObject *m1, PyObject *m2)
+{
+    PyObject *res;
+
+    /* Give-up check first (macro defined elsewhere), then ndarray's slot. */
+    BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_xor, gentype_xor);
+    res = PyArray_Type.tp_as_number->nb_xor(m1, m2);
+    return res;
+}
+
+
+#line 180
+static PyObject *
+gentype_or(PyObject *m1, PyObject *m2)
+{
+    PyObject *res;
+
+    /* Give-up check first (macro defined elsewhere), then ndarray's slot. */
+    BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_or, gentype_or);
+    res = PyArray_Type.tp_as_number->nb_or(m1, m2);
+    return res;
+}
+
+
+#line 180
+static PyObject *
+gentype_floor_divide(PyObject *m1, PyObject *m2)
+{
+    PyObject *res;
+
+    /* Give-up check first (macro defined elsewhere), then ndarray's slot. */
+    BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_floor_divide, gentype_floor_divide);
+    res = PyArray_Type.tp_as_number->nb_floor_divide(m1, m2);
+    return res;
+}
+
+
+#line 180
+static PyObject *
+gentype_true_divide(PyObject *m1, PyObject *m2)
+{
+    PyObject *res;
+
+    /* Give-up check first (macro defined elsewhere), then ndarray's slot. */
+    BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_true_divide, gentype_true_divide);
+    res = PyArray_Type.tp_as_number->nb_true_divide(m1, m2);
+    return res;
+}
+
+
+
+/* Fetch (type)->group->slot, yielding NULL when the group pointer is NULL */
+#define GET_NESTED_SLOT(type, group, slot) \
+    ((type)->group != NULL ? (type)->group->slot : NULL)
+
+static PyObject *
+gentype_multiply(PyObject *m1, PyObject *m2)
+{
+    /*
+     * An operand that is not a NumPy scalar and that supports sequence
+     * repeat but not number multiply is left to the Python builtin, which
+     * invokes the sequence repeat, instead of promoting both arguments to
+     * ndarray.  This covers a list repeated by a NumPy scalar.
+     * A Python-defined class will always only have the nb_multiply slot and
+     * some classes may have neither defined.  For the latter we need to
+     * give the normal case a chance to convert the object to ndarray.
+     * Probably no class has both defined, but if one does, prefer number.
+     */
+    PyObject *operands[2];
+    int k;
+
+    operands[0] = m1;
+    operands[1] = m2;
+    for (k = 0; k < 2; k++) {
+        PyTypeObject *tp = Py_TYPE(operands[k]);
+
+        if (PyArray_IsScalar(operands[k], Generic)) {
+            continue;
+        }
+        if (GET_NESTED_SLOT(tp, tp_as_sequence, sq_repeat) == NULL) {
+            continue;
+        }
+        if (GET_NESTED_SLOT(tp, tp_as_number, nb_multiply) != NULL) {
+            continue;
+        }
+        Py_INCREF(Py_NotImplemented);
+        return Py_NotImplemented;
+    }
+
+    /* All normal cases are handled by PyArray's multiply */
+    BINOP_GIVE_UP_IF_NEEDED(m1, m2, nb_multiply, gentype_multiply);
+    return PyArray_Type.tp_as_number->nb_multiply(m1, m2);
+}
+
+#line 233
+static PyObject *
+npy_byte_bit_count(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    /* Result of npy_popcounthh on the stored value, as a Python int. */
+    const npy_byte value = PyArrayScalar_VAL(self, Byte);
+    const uint8_t nbits = npy_popcounthh(value);
+
+    return PyLong_FromLong(nbits);
+}
+
+#line 233
+static PyObject *
+npy_ubyte_bit_count(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    /* Result of npy_popcountuhh on the stored value, as a Python int. */
+    const npy_ubyte value = PyArrayScalar_VAL(self, UByte);
+    const uint8_t nbits = npy_popcountuhh(value);
+
+    return PyLong_FromLong(nbits);
+}
+
+#line 233
+static PyObject *
+npy_short_bit_count(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    /* Result of npy_popcounth on the stored value, as a Python int. */
+    const npy_short value = PyArrayScalar_VAL(self, Short);
+    const uint8_t nbits = npy_popcounth(value);
+
+    return PyLong_FromLong(nbits);
+}
+
+#line 233
+static PyObject *
+npy_ushort_bit_count(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    /* Result of npy_popcountuh on the stored value, as a Python int. */
+    const npy_ushort value = PyArrayScalar_VAL(self, UShort);
+    const uint8_t nbits = npy_popcountuh(value);
+
+    return PyLong_FromLong(nbits);
+}
+
+#line 233
+static PyObject *
+npy_int_bit_count(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    /* Result of npy_popcount on the stored value, as a Python int. */
+    const npy_int value = PyArrayScalar_VAL(self, Int);
+    const uint8_t nbits = npy_popcount(value);
+
+    return PyLong_FromLong(nbits);
+}
+
+#line 233
+static PyObject *
+npy_uint_bit_count(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    /* Result of npy_popcountu on the stored value, as a Python int. */
+    const npy_uint value = PyArrayScalar_VAL(self, UInt);
+    const uint8_t nbits = npy_popcountu(value);
+
+    return PyLong_FromLong(nbits);
+}
+
+#line 233
+static PyObject *
+npy_long_bit_count(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    /* Result of npy_popcountl on the stored value, as a Python int. */
+    const npy_long value = PyArrayScalar_VAL(self, Long);
+    const uint8_t nbits = npy_popcountl(value);
+
+    return PyLong_FromLong(nbits);
+}
+
+#line 233
+static PyObject *
+npy_ulong_bit_count(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    /* Result of npy_popcountul on the stored value, as a Python int. */
+    const npy_ulong value = PyArrayScalar_VAL(self, ULong);
+    const uint8_t nbits = npy_popcountul(value);
+
+    return PyLong_FromLong(nbits);
+}
+
+#line 233
+static PyObject *
+npy_longlong_bit_count(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    /* Result of npy_popcountll on the stored value, as a Python int. */
+    const npy_longlong value = PyArrayScalar_VAL(self, LongLong);
+    const uint8_t nbits = npy_popcountll(value);
+
+    return PyLong_FromLongLong(nbits);
+}
+
+#line 233
+static PyObject *
+npy_ulonglong_bit_count(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    /* Result of npy_popcountull on the stored value, as a Python int. */
+    const npy_ulonglong value = PyArrayScalar_VAL(self, ULongLong);
+    const uint8_t nbits = npy_popcountull(value);
+
+    return PyLong_FromLongLong(nbits);
+}
+
+
+#line 248
+static PyObject *
+gentype_positive(PyObject *m1)
+{
+    /* Build an array from the scalar and delegate to its nb_positive slot. */
+    PyObject *result = NULL;
+    PyObject *as_array = PyArray_FromScalar(m1, NULL);
+
+    if (as_array != NULL) {
+        result = Py_TYPE(as_array)->tp_as_number->nb_positive(as_array);
+        Py_DECREF(as_array);
+    }
+    return result;
+}
+
+#line 248
+static PyObject *
+gentype_negative(PyObject *m1)
+{
+    /* Build an array from the scalar and delegate to its nb_negative slot. */
+    PyObject *result = NULL;
+    PyObject *as_array = PyArray_FromScalar(m1, NULL);
+
+    if (as_array != NULL) {
+        result = Py_TYPE(as_array)->tp_as_number->nb_negative(as_array);
+        Py_DECREF(as_array);
+    }
+    return result;
+}
+
+#line 248
+static PyObject *
+gentype_absolute(PyObject *m1)
+{
+    /* Build an array from the scalar and delegate to its nb_absolute slot. */
+    PyObject *result = NULL;
+    PyObject *as_array = PyArray_FromScalar(m1, NULL);
+
+    if (as_array != NULL) {
+        result = Py_TYPE(as_array)->tp_as_number->nb_absolute(as_array);
+        Py_DECREF(as_array);
+    }
+    return result;
+}
+
+#line 248
+static PyObject *
+gentype_invert(PyObject *m1)
+{
+    /* Build an array from the scalar and delegate to its nb_invert slot. */
+    PyObject *result = NULL;
+    PyObject *as_array = PyArray_FromScalar(m1, NULL);
+
+    if (as_array != NULL) {
+        result = Py_TYPE(as_array)->tp_as_number->nb_invert(as_array);
+        Py_DECREF(as_array);
+    }
+    return result;
+}
+
+#line 248
+static PyObject *
+gentype_int(PyObject *m1)
+{
+    /* Build an array from the scalar and delegate to its nb_int slot. */
+    PyObject *result = NULL;
+    PyObject *as_array = PyArray_FromScalar(m1, NULL);
+
+    if (as_array != NULL) {
+        result = Py_TYPE(as_array)->tp_as_number->nb_int(as_array);
+        Py_DECREF(as_array);
+    }
+    return result;
+}
+
+#line 248
+static PyObject *
+gentype_float(PyObject *m1)
+{
+    /* Build an array from the scalar and delegate to its nb_float slot. */
+    PyObject *result = NULL;
+    PyObject *as_array = PyArray_FromScalar(m1, NULL);
+
+    if (as_array != NULL) {
+        result = Py_TYPE(as_array)->tp_as_number->nb_float(as_array);
+        Py_DECREF(as_array);
+    }
+    return result;
+}
+
+
+static int
+gentype_nonzero_number(PyObject *m1)
+{
+    /*
+     * Truth value of a scalar: build an array from it and return whatever
+     * the array's nb_bool slot reports, or -1 if the array cannot be built.
+     */
+    int truth = -1;
+    PyObject *as_array = PyArray_FromScalar(m1, NULL);
+
+    if (as_array != NULL) {
+        truth = Py_TYPE(as_array)->tp_as_number->nb_bool(as_array);
+        Py_DECREF(as_array);
+    }
+    return truth;
+}
+
+/*
+ * str() for integer scalars.
+ *
+ * The stored C value is converted to a Python int according to the type
+ * number of the scalar's descriptor; any other type number falls back to
+ * the scalar's `item` method.  The result is str() of that object.
+ * Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+genint_type_str(PyObject *self)
+{
+    PyObject  *item, *item_str;
+    PyArray_Descr *descr = PyArray_DescrFromTypeObject((PyObject *)Py_TYPE(self));
+    void *val;
+
+    /*
+     * The descriptor is dereferenced below, so a failed lookup must be
+     * caught here (assumes the lookup has set an exception -- TODO confirm).
+     */
+    if (descr == NULL) {
+        return NULL;
+    }
+    val = scalar_value(self, descr);
+    switch (descr->type_num) {
+        case NPY_BYTE:
+            item = PyLong_FromLong(*(npy_byte *)val);
+            break;
+        case NPY_UBYTE:
+            item = PyLong_FromUnsignedLong(*(npy_ubyte *)val);
+            break;
+        case NPY_SHORT:
+            item = PyLong_FromLong(*(npy_short *)val);
+            break;
+        case NPY_USHORT:
+            item = PyLong_FromUnsignedLong(*(npy_ushort *)val);
+            break;
+        case NPY_INT:
+            item = PyLong_FromLong(*(npy_int *)val);
+            break;
+        case NPY_UINT:
+            item = PyLong_FromUnsignedLong(*(npy_uint *)val);
+            break;
+        case NPY_LONG:
+            item = PyLong_FromLong(*(npy_long *)val);
+            break;
+        case NPY_ULONG:
+            item = PyLong_FromUnsignedLong(*(npy_ulong *)val);
+            break;
+        case NPY_LONGLONG:
+            item = PyLong_FromLongLong(*(npy_longlong *)val);
+            break;
+        case NPY_ULONGLONG:
+            item = PyLong_FromUnsignedLongLong(*(npy_ulonglong *)val);
+            break;
+        default:
+            /* Not one of the listed integer types: use self.item() */
+            item = gentype_generic_method(self, NULL, NULL, "item");
+            break;
+    }
+    /* The descriptor is only needed for the dispatch above */
+    Py_DECREF(descr);
+    if (item == NULL) {
+        return NULL;
+    }
+
+    item_str = PyObject_Str(item);
+    Py_DECREF(item);
+    return item_str;
+}
+
+/*
+ * __format__ implementation (PEP 3101): takes a single str format spec,
+ * converts the scalar to a Python object and formats that object.
+ */
+static PyObject *
+gentype_format(PyObject *self, PyObject *args)
+{
+    PyObject *spec;
+    PyObject *py_value;
+    PyObject *formatted;
+
+    if (!PyArg_ParseTuple(args, "U:__format__", &spec)) {
+        return NULL;
+    }
+
+    /*
+     * Convert to an appropriate Python type and call its format.
+     * TODO: For some types, like long double, this isn't right,
+     *       because it throws away precision.
+     */
+    if (Py_TYPE(self) == &PyBoolArrType_Type) {
+        py_value = PyBool_FromLong(PyArrayScalar_VAL(self, Bool));
+    }
+    else if (PyArray_IsScalar(self, Integer)
+             && !PyArray_IsScalar(self, Timedelta)) {
+        py_value = Py_TYPE(self)->tp_as_number->nb_int(self);
+    }
+    else if (PyArray_IsScalar(self, Floating)) {
+        py_value = Py_TYPE(self)->tp_as_number->nb_float(self);
+    }
+    else if (PyArray_IsScalar(self, ComplexFloating)) {
+        /* Cast to a C double pair, then build a Python complex from it */
+        double parts[2];
+        int status;
+        PyArray_Descr *descr = PyArray_DescrFromScalar(self);
+
+        if (descr == NULL) {
+            return NULL;
+        }
+        status = PyArray_CastScalarDirect(self, descr, &parts[0], NPY_CDOUBLE);
+        Py_DECREF(descr);
+        if (status < 0) {
+            return NULL;
+        }
+        py_value = PyComplex_FromDoubles(parts[0], parts[1]);
+    }
+    else {
+        /* Everything else is formatted through its str() */
+        py_value = PyObject_Str(self);
+    }
+
+    if (py_value == NULL) {
+        return NULL;
+    }
+
+    formatted = PyObject_Format(py_value, spec);
+    Py_DECREF(py_value);
+    return formatted;
+}
+
+#ifdef FORCE_NO_LONG_DOUBLE_FORMATTING
+#undef NPY_LONGDOUBLE_FMT
+#define NPY_LONGDOUBLE_FMT NPY_DOUBLE_FMT
+#endif
+
+#line 396
+
+NPY_NO_EXPORT PyObject *
+format_half(npy_half val, npy_bool scientific,
+              int precision, int sign, TrimMode trim,
+              int pad_left, int pad_right, int exp_digits)
+{
+    /* Positional Dragon4 output unless scientific notation was requested. */
+    if (!scientific) {
+        return Dragon4_Positional_Half(&val,
+                        DigitMode_Unique, CutoffMode_TotalLength, precision,
+                        -1, sign, trim, pad_left, pad_right);
+    }
+    return Dragon4_Scientific_Half(&val,
+                    DigitMode_Unique, precision, -1,
+                    sign, trim, pad_left, exp_digits);
+}
+
+
+
+#line 396
+
+NPY_NO_EXPORT PyObject *
+format_float(npy_float val, npy_bool scientific,
+              int precision, int sign, TrimMode trim,
+              int pad_left, int pad_right, int exp_digits)
+{
+    /* Positional Dragon4 output unless scientific notation was requested. */
+    if (!scientific) {
+        return Dragon4_Positional_Float(&val,
+                        DigitMode_Unique, CutoffMode_TotalLength, precision,
+                        -1, sign, trim, pad_left, pad_right);
+    }
+    return Dragon4_Scientific_Float(&val,
+                    DigitMode_Unique, precision, -1,
+                    sign, trim, pad_left, exp_digits);
+}
+
+
+
+#line 396
+
+NPY_NO_EXPORT PyObject *
+format_double(npy_double val, npy_bool scientific,
+              int precision, int sign, TrimMode trim,
+              int pad_left, int pad_right, int exp_digits)
+{
+    /* Positional Dragon4 output unless scientific notation was requested. */
+    if (!scientific) {
+        return Dragon4_Positional_Double(&val,
+                        DigitMode_Unique, CutoffMode_TotalLength, precision,
+                        -1, sign, trim, pad_left, pad_right);
+    }
+    return Dragon4_Scientific_Double(&val,
+                    DigitMode_Unique, precision, -1,
+                    sign, trim, pad_left, exp_digits);
+}
+
+
+
+#line 396
+
+NPY_NO_EXPORT PyObject *
+format_longdouble(npy_longdouble val, npy_bool scientific,
+              int precision, int sign, TrimMode trim,
+              int pad_left, int pad_right, int exp_digits)
+{
+    /* Positional Dragon4 output unless scientific notation was requested. */
+    if (!scientific) {
+        return Dragon4_Positional_LongDouble(&val,
+                        DigitMode_Unique, CutoffMode_TotalLength, precision,
+                        -1, sign, trim, pad_left, pad_right);
+    }
+    return Dragon4_Scientific_LongDouble(&val,
+                    DigitMode_Unique, precision, -1,
+                    sign, trim, pad_left, exp_digits);
+}
+
+
+
+
+/*
+ * Override repr and str of array-scalar byte strings to strip trailing NUL
+ * bytes and then call the corresponding function of PyBytes_Type to generate
+ * the string.
+ */
+
+#line 425
+/*
+ * repr() of a bytes scalar: the bytes with trailing zero bytes removed are
+ * copied into a temporary bytes object and PyBytes_Type.tp_repr is applied
+ * to it.  Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+stringtype_repr(PyObject *self)
+{
+    const npy_char *data = PyBytes_AS_STRING(self);
+    Py_ssize_t len = PyBytes_GET_SIZE(self);
+    PyObject *trimmed;
+    PyObject *ret;
+
+    /*
+     * Trim by index: stepping a pointer backwards would form an address
+     * before the start of the buffer for empty or all-zero input.
+     */
+    while (len > 0 && data[len - 1] == 0) {
+        len--;
+    }
+    trimmed = PyBytes_FromStringAndSize(data, len);
+    if (trimmed == NULL) {
+        return NULL;
+    }
+    ret = PyBytes_Type.tp_repr(trimmed);
+    Py_DECREF(trimmed);
+    return ret;
+}
+
+#line 425
+/*
+ * str() of a bytes scalar: the bytes with trailing zero bytes removed are
+ * copied into a temporary bytes object and PyBytes_Type.tp_str is applied
+ * to it.  Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+stringtype_str(PyObject *self)
+{
+    const npy_char *data = PyBytes_AS_STRING(self);
+    Py_ssize_t len = PyBytes_GET_SIZE(self);
+    PyObject *trimmed;
+    PyObject *ret;
+
+    /*
+     * Trim by index: stepping a pointer backwards would form an address
+     * before the start of the buffer for empty or all-zero input.
+     */
+    while (len > 0 && data[len - 1] == 0) {
+        len--;
+    }
+    trimmed = PyBytes_FromStringAndSize(data, len);
+    if (trimmed == NULL) {
+        return NULL;
+    }
+    ret = PyBytes_Type.tp_str(trimmed);
+    Py_DECREF(trimmed);
+    return ret;
+}
+
+
+/*
+ * Override repr and str of array-scalar strings to strip trailing NUL code
+ * points and then call the corresponding function of PyUnicode_Type to
+ * generate the string.
+ */
+
+#line 454
+/*
+ * repr() of a unicode scalar: the code points with trailing zeros removed
+ * are turned into a temporary str and PyUnicode_Type.tp_repr is applied to
+ * it.  Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+unicodetype_repr(PyObject *self)
+{
+    Py_UCS4 *codepoints;
+    Py_ssize_t len;
+    PyObject *trimmed;
+    PyObject *ret = NULL;
+
+    /* PyUnicode_READY is called by PyUnicode_GetLength */
+    len = PyUnicode_GetLength(self);
+    codepoints = PyUnicode_AsUCS4Copy(self);
+    if (codepoints == NULL) {
+        return NULL;
+    }
+    /*
+     * Trim by index: stepping a pointer backwards would form an address
+     * before the start of the buffer for empty or all-zero input.
+     */
+    while (len > 0 && codepoints[len - 1] == 0) {
+        len--;
+    }
+    trimmed = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, codepoints, len);
+    if (trimmed != NULL) {
+        ret = PyUnicode_Type.tp_repr(trimmed);
+        Py_DECREF(trimmed);
+    }
+    /* The UCS4 copy is ours to release on every path */
+    PyMem_Free(codepoints);
+    return ret;
+}
+
+#line 454
+/*
+ * str() of a unicode scalar: the code points with trailing zeros removed
+ * are turned into a temporary str and PyUnicode_Type.tp_str is applied to
+ * it.  Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+unicodetype_str(PyObject *self)
+{
+    Py_UCS4 *codepoints;
+    Py_ssize_t len;
+    PyObject *trimmed;
+    PyObject *ret = NULL;
+
+    /* PyUnicode_READY is called by PyUnicode_GetLength */
+    len = PyUnicode_GetLength(self);
+    codepoints = PyUnicode_AsUCS4Copy(self);
+    if (codepoints == NULL) {
+        return NULL;
+    }
+    /*
+     * Trim by index: stepping a pointer backwards would form an address
+     * before the start of the buffer for empty or all-zero input.
+     */
+    while (len > 0 && codepoints[len - 1] == 0) {
+        len--;
+    }
+    trimmed = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, codepoints, len);
+    if (trimmed != NULL) {
+        ret = PyUnicode_Type.tp_str(trimmed);
+        Py_DECREF(trimmed);
+    }
+    /* The UCS4 copy is ours to release on every path */
+    PyMem_Free(codepoints);
+    return ret;
+}
+
+
+/*
+ * Convert array of bytes to a string representation much like bytes.__repr__,
+ * but convert all bytes (including ASCII) to the `\x00` notation with
+ * uppercase hex codes (FF not ff).
+ *
+ * Largely copied from _Py_strhex_impl in CPython implementation
+ *
+ * argbuf/arglen: the bytes to render.
+ * schars:        text emitted once before the bytes.
+ * bprefix:       text emitted before each two-digit hex code.
+ * echars:        text emitted once after the bytes.
+ *
+ * Returns a new str, or NULL with MemoryError set when the output would be
+ * too large or the scratch buffer cannot be allocated.
+ */
+static inline PyObject *
+_void_to_hex(const char* argbuf, const Py_ssize_t arglen,
+             const char *schars, const char *bprefix, const char *echars)
+{
+    static const char hexdigits[] = "0123456789ABCDEF";
+    const size_t start_len = strlen(schars);
+    const size_t prefix_len = strlen(bprefix);
+    const size_t end_len = strlen(echars);
+    const size_t extrachars = start_len + end_len;
+    /* every input byte becomes the prefix plus two hex digits */
+    const size_t per_byte = prefix_len + 2;
+    size_t total;
+    PyObject *retval;
+    char *retbuf, *out;
+    Py_ssize_t i;
+
+    /*
+     * Validate the size before computing it, in a type wide enough to hold
+     * it, so that the total can neither wrap nor exceed PY_SSIZE_T_MAX.
+     */
+    if (arglen < 0 || extrachars > (size_t)PY_SSIZE_T_MAX ||
+            (size_t)arglen > ((size_t)PY_SSIZE_T_MAX - extrachars) / per_byte) {
+        return PyErr_NoMemory();
+    }
+    total = extrachars + (size_t)arglen * per_byte;
+
+    retbuf = (char *)PyMem_Malloc(total);
+    if (!retbuf) {
+        return PyErr_NoMemory();
+    }
+
+    out = retbuf;
+    memcpy(out, schars, start_len);
+    out += start_len;
+
+    for (i = 0; i < arglen; i++) {
+        const unsigned char byte = (unsigned char)argbuf[i];
+
+        memcpy(out, bprefix, prefix_len);
+        out += prefix_len;
+        *out++ = hexdigits[byte >> 4];
+        *out++ = hexdigits[byte & 0xf];
+    }
+    memcpy(out, echars, end_len);
+
+    /* The buffer is not NUL-terminated, so the length is passed explicitly */
+    retval = PyUnicode_FromStringAndSize(retbuf, (Py_ssize_t)total);
+    PyMem_Free(retbuf);
+
+    return retval;
+}
+
+static PyObject *
+_void_scalar_repr(PyObject *obj) {
+    /*
+     * Delegate to numpy.core.arrayprint._void_scalar_repr.  The callable is
+     * kept in a function-local static that npy_cache_import fills in.
+     */
+    static PyObject *reprfunc = NULL;
+
+    npy_cache_import("numpy.core.arrayprint",
+                     "_void_scalar_repr", &reprfunc);
+    if (reprfunc != NULL) {
+        return PyObject_CallFunction(reprfunc, "O", obj);
+    }
+    return NULL;
+}
+
+static PyObject *
+voidtype_repr(PyObject *self)
+{
+    PyVoidScalarObject *scalar = (PyVoidScalarObject *)self;
+
+    /* No fields: hex dump of the raw bytes; with fields: Python helper. */
+    if (!PyDataType_HASFIELDS(scalar->descr)) {
+        return _void_to_hex(scalar->obval, scalar->descr->elsize,
+                            "void(b'", "\\x", "')");
+    }
+    return _void_scalar_repr(self);
+}
+
+static PyObject *
+voidtype_str(PyObject *self)
+{
+    PyVoidScalarObject *scalar = (PyVoidScalarObject *)self;
+
+    /* No fields: hex dump of the raw bytes; with fields: Python helper. */
+    if (!PyDataType_HASFIELDS(scalar->descr)) {
+        return _void_to_hex(scalar->obval, scalar->descr->elsize,
+                            "b'", "\\x", "'");
+    }
+    return _void_scalar_repr(self);
+}
+
+/*
+ * repr() of a datetime64 scalar: "numpy.datetime64('<iso>')", with the unit
+ * metadata appended as a second argument when it is not implied by the
+ * ISO 8601 string.  Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+datetimetype_repr(PyObject *self)
+{
+    char iso[NPY_DATETIME_MAX_ISO8601_STRLEN];
+    npy_datetimestruct dts;
+    PyDatetimeScalarObject *scal;
+    PyObject *meta;
+    PyObject *ret;
+    int unit_is_implied;
+
+    if (!PyArray_IsScalar(self, Datetime)) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "Called NumPy datetime repr on a non-datetime type");
+        return NULL;
+    }
+
+    scal = (PyDatetimeScalarObject *)self;
+
+    if (convert_datetime_to_datetimestruct(&scal->obmeta,
+                scal->obval, &dts) < 0) {
+        return NULL;
+    }
+    if (make_iso_8601_datetime(&dts, iso, sizeof(iso), 0, 0,
+                            scal->obmeta.base, -1, NPY_SAFE_CASTING) < 0) {
+        return NULL;
+    }
+
+    /*
+     * For straight units or generic units, the unit will be deduced
+     * from the string, so it's not necessary to specify it.
+     */
+    unit_is_implied = scal->obmeta.base == NPY_FR_GENERIC ||
+            (scal->obmeta.num == 1 && scal->obmeta.base != NPY_FR_h);
+    if (unit_is_implied) {
+        return PyUnicode_FromFormat("numpy.datetime64('%s')", iso);
+    }
+
+    meta = metastr_to_unicode(&scal->obmeta, 1);
+    if (meta == NULL) {
+        return NULL;
+    }
+    ret = PyUnicode_FromFormat("numpy.datetime64('%s','%S')", iso, meta);
+    Py_DECREF(meta);
+    return ret;
+}
+
+/*
+ * repr() of a timedelta64 scalar: "numpy.timedelta64(<value>)" for generic
+ * units, otherwise "numpy.timedelta64(<value>,'<unit>')".  NaT is rendered
+ * as the quoted string 'NaT'.  Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+timedeltatype_repr(PyObject *self)
+{
+    PyTimedeltaScalarObject *scal;
+    PyObject *value_str;
+    PyObject *unit_str;
+    PyObject *ret;
+
+    if (!PyArray_IsScalar(self, Timedelta)) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "Called NumPy timedelta repr on a non-datetime type");
+        return NULL;
+    }
+
+    scal = (PyTimedeltaScalarObject *)self;
+
+    /* The value */
+    if (scal->obval != NPY_DATETIME_NAT) {
+        /* Can't use "%lld" if HAVE_LONG_LONG is not defined */
+#if defined(HAVE_LONG_LONG)
+        value_str = PyUnicode_FromFormat("%lld", (long long)scal->obval);
+#else
+        value_str = PyUnicode_FromFormat("%ld", (long)scal->obval);
+#endif
+    }
+    else {
+        value_str = PyUnicode_FromString("'NaT'");
+    }
+    if (value_str == NULL) {
+        return NULL;
+    }
+
+    /* Generic units carry no unit argument */
+    if (scal->obmeta.base == NPY_FR_GENERIC) {
+        ret = PyUnicode_FromFormat("numpy.timedelta64(%S)", value_str);
+        Py_DECREF(value_str);
+        return ret;
+    }
+
+    /* The metadata unit */
+    unit_str = metastr_to_unicode(&scal->obmeta, 1);
+    if (unit_str == NULL) {
+        Py_DECREF(value_str);
+        return NULL;
+    }
+    ret = PyUnicode_FromFormat("numpy.timedelta64(%S,'%S')",
+                               value_str, unit_str);
+    Py_DECREF(unit_str);
+    Py_DECREF(value_str);
+    return ret;
+}
+
+/*
+ * str() of a datetime64 scalar: the ISO 8601 string produced by
+ * make_iso_8601_datetime for the scalar's value and base unit.
+ * Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+datetimetype_str(PyObject *self)
+{
+    char iso[NPY_DATETIME_MAX_ISO8601_STRLEN];
+    npy_datetimestruct dts;
+    PyDatetimeScalarObject *scal;
+
+    if (!PyArray_IsScalar(self, Datetime)) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "Called NumPy datetime str on a non-datetime type");
+        return NULL;
+    }
+
+    scal = (PyDatetimeScalarObject *)self;
+
+    /* Break the stored value into calendar fields ... */
+    if (convert_datetime_to_datetimestruct(&scal->obmeta, scal->obval,
+                                           &dts) < 0) {
+        return NULL;
+    }
+    /* ... and render those fields at the scalar's base unit. */
+    if (make_iso_8601_datetime(&dts, iso, sizeof(iso), 0, 0,
+                            scal->obmeta.base, -1, NPY_SAFE_CASTING) < 0) {
+        return NULL;
+    }
+
+    return PyUnicode_FromString(iso);
+}
+
+static char *_datetime_verbose_strings[NPY_DATETIME_NUMUNITS] = {  /* plural unit names, indexed by obmeta.base in timedeltatype_str */
+    "years",  /* index 0 */
+    "months",  /* index 1 */
+    "weeks",  /* index 2 */
+    "<invalid>",  /* index 3: placeholder entry, no unit name */
+    "days",  /* index 4 */
+    "hours",  /* index 5 */
+    "minutes",  /* index 6 */
+    "seconds",  /* index 7 */
+    "milliseconds",  /* index 8 */
+    "microseconds",  /* index 9 */
+    "nanoseconds",  /* index 10 */
+    "picoseconds",  /* index 11 */
+    "femtoseconds",  /* index 12 */
+    "attoseconds",  /* index 13 */
+    "generic time units"  /* index 14 */
+};
+
+/*
+ * str() of a timedelta64 scalar: "NaT", or "<count> <unit name>" where the
+ * count is obval * obmeta.num and the unit name comes from
+ * _datetime_verbose_strings.  Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+timedeltatype_str(PyObject *self)
+{
+    PyTimedeltaScalarObject *scal;
+    char *unit_name;
+
+    if (!PyArray_IsScalar(self, Timedelta)) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "Called NumPy timedelta str on a non-datetime type");
+        return NULL;
+    }
+
+    scal = (PyTimedeltaScalarObject *)self;
+
+    /* The base unit indexes the name table, so reject anything outside it */
+    if (scal->obmeta.base < 0 || scal->obmeta.base >= NPY_DATETIME_NUMUNITS) {
+        PyErr_SetString(PyExc_RuntimeError,
+                "NumPy datetime metadata is corrupted");
+        return NULL;
+    }
+    unit_name = _datetime_verbose_strings[scal->obmeta.base];
+
+    if (scal->obval == NPY_DATETIME_NAT) {
+        return PyUnicode_FromString("NaT");
+    }
+
+    /*
+     * Can't use "%lld" if HAVE_LONG_LONG is not defined
+     */
+#if defined(HAVE_LONG_LONG)
+    return PyUnicode_FromFormat("%lld %s",
+        (long long)(scal->obval * scal->obmeta.num), unit_name);
+#else
+    return PyUnicode_FromFormat("%ld %s",
+        (long)(scal->obval * scal->obmeta.num), unit_name);
+#endif
+}
+
+/*
+ * float type str and repr
+ *
+ * These functions will return NULL if PyString creation fails.
+ */
+
+
+/*
+ *               *** BEGIN LEGACY PRINTING MODE CODE ***
+ *
+ * This code is legacy code needed to reproduce the printing behavior of
+ * scalars in numpy 1.13. One day we hope to remove it.
+ */
+
+/* determines if legacy mode is enabled, global set in multiarraymodule.c */
+extern int npy_legacy_print_mode;
+
+#define HALFPREC_REPR 5
+#define HALFPREC_STR 5
+#define FLOATPREC_REPR 8
+#define FLOATPREC_STR 6
+#define DOUBLEPREC_REPR 17
+#define DOUBLEPREC_STR 12
+#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE
+#define LONGDOUBLEPREC_REPR DOUBLEPREC_REPR
+#define LONGDOUBLEPREC_STR DOUBLEPREC_STR
+#else /* More than probably needed on Intel FP */
+#define LONGDOUBLEPREC_REPR 20
+#define LONGDOUBLEPREC_STR 12
+#endif
+
+#line 783
+
+#line 790
+
+#define _FMT1 "%%.%i" NPY_FLOAT_FMT
+#define _FMT2 "%%+.%i" NPY_FLOAT_FMT
+
+/*
+ * Legacy-mode str() text for a complex float value.  A real part of +0.0
+ * gives just the imaginary part followed by "j"; otherwise the result is
+ * "(<real><signed imag>j)".  A non-finite imaginary part gets a "*" suffix.
+ */
+static PyObject*
+legacy_cfloat_formatstr(npy_cfloat val)
+{
+    /* XXX: Find a correct size here for format string */
+    char format[64], buf[100];
+    char re[64], im[64];
+
+    /*
+     * Ideally, we should handle this nan/inf stuff in NumpyOS_ascii_format*
+     */
+    if (val.real == 0.0 && npy_signbit(val.real) == 0) {
+        PyOS_snprintf(format, sizeof(format), _FMT1, FLOATPREC_STR);
+        if (NumPyOS_ascii_formatf(buf, sizeof(buf) - 1, format,
+                                  val.imag, 0) == NULL) {
+            PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+            return NULL;
+        }
+        if (!npy_isfinite(val.imag)) {
+            strncat(buf, "*", sizeof(buf) - strlen(buf) - 1);
+        }
+        strncat(buf, "j", sizeof(buf) - strlen(buf) - 1);
+        return PyUnicode_FromString(buf);
+    }
+
+    /* Real part: fixed spellings for nan/inf, formatted otherwise */
+    if (!npy_isfinite(val.real)) {
+        if (npy_isnan(val.real)) {
+            strcpy(re, "nan");
+        }
+        else if (val.real > 0){
+            strcpy(re, "inf");
+        }
+        else {
+            strcpy(re, "-inf");
+        }
+    }
+    else {
+        PyOS_snprintf(format, sizeof(format), _FMT1, FLOATPREC_STR);
+        if (NumPyOS_ascii_formatf(re, sizeof(re), format,
+                                  val.real, 0) == NULL) {
+            PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+            return NULL;
+        }
+    }
+
+    /* Imaginary part: always signed; non-finite values end in "*" */
+    if (!npy_isfinite(val.imag)) {
+        if (npy_isnan(val.imag)) {
+            strcpy(im, "+nan*");
+        }
+        else if (val.imag > 0){
+            strcpy(im, "+inf*");
+        }
+        else {
+            strcpy(im, "-inf*");
+        }
+    }
+    else {
+        PyOS_snprintf(format, sizeof(format), _FMT2, FLOATPREC_STR);
+        if (NumPyOS_ascii_formatf(im, sizeof(im), format,
+                                  val.imag, 0) == NULL) {
+            PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+            return NULL;
+        }
+    }
+
+    PyOS_snprintf(buf, sizeof(buf), "(%s%sj)", re, im);
+    return PyUnicode_FromString(buf);
+}
+
+#undef _FMT1
+#undef _FMT2
+
+
+#line 790
+
+#define _FMT1 "%%.%i" NPY_DOUBLE_FMT
+#define _FMT2 "%%+.%i" NPY_DOUBLE_FMT
+
+/*
+ * Legacy-mode str() text for a complex double value.  A real part of +0.0
+ * gives just the imaginary part followed by "j"; otherwise the result is
+ * "(<real><signed imag>j)".  A non-finite imaginary part gets a "*" suffix.
+ */
+static PyObject*
+legacy_cdouble_formatstr(npy_cdouble val)
+{
+    /* XXX: Find a correct size here for format string */
+    char format[64], buf[100];
+    char re[64], im[64];
+
+    /*
+     * Ideally, we should handle this nan/inf stuff in NumpyOS_ascii_format*
+     */
+    if (val.real == 0.0 && npy_signbit(val.real) == 0) {
+        PyOS_snprintf(format, sizeof(format), _FMT1, DOUBLEPREC_STR);
+        if (NumPyOS_ascii_formatd(buf, sizeof(buf) - 1, format,
+                                  val.imag, 0) == NULL) {
+            PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+            return NULL;
+        }
+        if (!npy_isfinite(val.imag)) {
+            strncat(buf, "*", sizeof(buf) - strlen(buf) - 1);
+        }
+        strncat(buf, "j", sizeof(buf) - strlen(buf) - 1);
+        return PyUnicode_FromString(buf);
+    }
+
+    /* Real part: fixed spellings for nan/inf, formatted otherwise */
+    if (!npy_isfinite(val.real)) {
+        if (npy_isnan(val.real)) {
+            strcpy(re, "nan");
+        }
+        else if (val.real > 0){
+            strcpy(re, "inf");
+        }
+        else {
+            strcpy(re, "-inf");
+        }
+    }
+    else {
+        PyOS_snprintf(format, sizeof(format), _FMT1, DOUBLEPREC_STR);
+        if (NumPyOS_ascii_formatd(re, sizeof(re), format,
+                                  val.real, 0) == NULL) {
+            PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+            return NULL;
+        }
+    }
+
+    /* Imaginary part: always signed; non-finite values end in "*" */
+    if (!npy_isfinite(val.imag)) {
+        if (npy_isnan(val.imag)) {
+            strcpy(im, "+nan*");
+        }
+        else if (val.imag > 0){
+            strcpy(im, "+inf*");
+        }
+        else {
+            strcpy(im, "-inf*");
+        }
+    }
+    else {
+        PyOS_snprintf(format, sizeof(format), _FMT2, DOUBLEPREC_STR);
+        if (NumPyOS_ascii_formatd(im, sizeof(im), format,
+                                  val.imag, 0) == NULL) {
+            PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+            return NULL;
+        }
+    }
+
+    PyOS_snprintf(buf, sizeof(buf), "(%s%sj)", re, im);
+    return PyUnicode_FromString(buf);
+}
+
+#undef _FMT1
+#undef _FMT2
+
+
+#line 790
+
+#define _FMT1 "%%.%i" NPY_LONGDOUBLE_FMT
+#define _FMT2 "%%+.%i" NPY_LONGDOUBLE_FMT
+
+/*
+ * Legacy-mode str() text for a complex long double value.  A real part of
+ * +0.0 gives just the imaginary part followed by "j"; otherwise the result
+ * is "(<real><signed imag>j)".  A non-finite imaginary part gets a "*"
+ * suffix.
+ */
+static PyObject*
+legacy_clongdouble_formatstr(npy_clongdouble val)
+{
+    /* XXX: Find a correct size here for format string */
+    char format[64], buf[100];
+    char re[64], im[64];
+
+    /*
+     * Ideally, we should handle this nan/inf stuff in NumpyOS_ascii_format*
+     */
+    if (val.real == 0.0 && npy_signbit(val.real) == 0) {
+        PyOS_snprintf(format, sizeof(format), _FMT1, LONGDOUBLEPREC_STR);
+        if (NumPyOS_ascii_formatl(buf, sizeof(buf) - 1, format,
+                                  val.imag, 0) == NULL) {
+            PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+            return NULL;
+        }
+        if (!npy_isfinite(val.imag)) {
+            strncat(buf, "*", sizeof(buf) - strlen(buf) - 1);
+        }
+        strncat(buf, "j", sizeof(buf) - strlen(buf) - 1);
+        return PyUnicode_FromString(buf);
+    }
+
+    /* Real part: fixed spellings for nan/inf, formatted otherwise */
+    if (!npy_isfinite(val.real)) {
+        if (npy_isnan(val.real)) {
+            strcpy(re, "nan");
+        }
+        else if (val.real > 0){
+            strcpy(re, "inf");
+        }
+        else {
+            strcpy(re, "-inf");
+        }
+    }
+    else {
+        PyOS_snprintf(format, sizeof(format), _FMT1, LONGDOUBLEPREC_STR);
+        if (NumPyOS_ascii_formatl(re, sizeof(re), format,
+                                  val.real, 0) == NULL) {
+            PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+            return NULL;
+        }
+    }
+
+    /* Imaginary part: always signed; non-finite values end in "*" */
+    if (!npy_isfinite(val.imag)) {
+        if (npy_isnan(val.imag)) {
+            strcpy(im, "+nan*");
+        }
+        else if (val.imag > 0){
+            strcpy(im, "+inf*");
+        }
+        else {
+            strcpy(im, "-inf*");
+        }
+    }
+    else {
+        PyOS_snprintf(format, sizeof(format), _FMT2, LONGDOUBLEPREC_STR);
+        if (NumPyOS_ascii_formatl(im, sizeof(im), format,
+                                  val.imag, 0) == NULL) {
+            PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+            return NULL;
+        }
+    }
+
+    PyOS_snprintf(buf, sizeof(buf), "(%s%sj)", re, im);
+    return PyUnicode_FromString(buf);
+}
+
+#undef _FMT1
+#undef _FMT2
+
+
+
+#line 880
+
+#define _FMT1 "%%.%i" NPY_FLOAT_FMT
+/* Legacy str() text of a float: "%.<FLOATPREC_STR>" NPY_FLOAT_FMT, plus ".0" for bare integers. */
+static PyObject *
+legacy_float_formatstr(npy_float val){
+    /* XXX: Find a correct size here for format string */
+    char format[64], buf[100], *res;
+    size_t i, cnt;
+
+    PyOS_snprintf(format, sizeof(format), _FMT1, FLOATPREC_STR);  /* build the printf format */
+    res = NumPyOS_ascii_formatf(buf, sizeof(buf), format, val, 0);
+    if (res == NULL) {
+        PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+        return NULL;
+    }
+
+    /* If nothing but digits after sign, append ".0" */
+    cnt = strlen(buf);
+    for (i = (buf[0] == '-') ? 1 : 0; i < cnt; ++i) {
+        if (!isdigit(Py_CHARMASK(buf[i]))) {
+            break;  /* a non-digit is present: leave the text alone */
+        }
+    }
+    if (i == cnt && sizeof(buf) >= cnt + 3) {  /* room for '.', '0' and the NUL */
+        strcpy(&buf[cnt],".0");
+    }
+
+    return PyUnicode_FromString(buf);
+}
+
+#undef _FMT1
+
+
+#line 880
+
+#define _FMT1 "%%.%i" NPY_DOUBLE_FMT
+/* Legacy str() text of a double: "%.<DOUBLEPREC_STR>" NPY_DOUBLE_FMT, plus ".0" for bare integers. */
+static PyObject *
+legacy_double_formatstr(npy_double val){
+    /* XXX: Find a correct size here for format string */
+    char format[64], buf[100], *res;
+    size_t i, cnt;
+
+    PyOS_snprintf(format, sizeof(format), _FMT1, DOUBLEPREC_STR);  /* build the printf format */
+    res = NumPyOS_ascii_formatd(buf, sizeof(buf), format, val, 0);
+    if (res == NULL) {
+        PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+        return NULL;
+    }
+
+    /* If nothing but digits after sign, append ".0" */
+    cnt = strlen(buf);
+    for (i = (buf[0] == '-') ? 1 : 0; i < cnt; ++i) {
+        if (!isdigit(Py_CHARMASK(buf[i]))) {
+            break;  /* a non-digit is present: leave the text alone */
+        }
+    }
+    if (i == cnt && sizeof(buf) >= cnt + 3) {  /* room for '.', '0' and the NUL */
+        strcpy(&buf[cnt],".0");
+    }
+
+    return PyUnicode_FromString(buf);
+}
+
+#undef _FMT1
+
+
+#line 880
+
+#define _FMT1 "%%.%i" NPY_LONGDOUBLE_FMT
+/* Legacy str() text of a long double: "%.<LONGDOUBLEPREC_STR>" NPY_LONGDOUBLE_FMT, plus ".0" for bare integers. */
+static PyObject *
+legacy_longdouble_formatstr(npy_longdouble val){
+    /* XXX: Find a correct size here for format string */
+    char format[64], buf[100], *res;
+    size_t i, cnt;
+
+    PyOS_snprintf(format, sizeof(format), _FMT1, LONGDOUBLEPREC_STR);  /* build the printf format */
+    res = NumPyOS_ascii_formatl(buf, sizeof(buf), format, val, 0);
+    if (res == NULL) {
+        PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+        return NULL;
+    }
+
+    /* If nothing but digits after sign, append ".0" */
+    cnt = strlen(buf);
+    for (i = (buf[0] == '-') ? 1 : 0; i < cnt; ++i) {
+        if (!isdigit(Py_CHARMASK(buf[i]))) {
+            break;  /* a non-digit is present: leave the text alone */
+        }
+    }
+    if (i == cnt && sizeof(buf) >= cnt + 3) {  /* room for '.', '0' and the NUL */
+        strcpy(&buf[cnt],".0");
+    }
+
+    return PyUnicode_FromString(buf);
+}
+
+#undef _FMT1
+
+
+
+
+#line 783
+
+#line 790
+
+#define _FMT1 "%%.%i" NPY_FLOAT_FMT
+#define _FMT2 "%%+.%i" NPY_FLOAT_FMT
+/* Legacy repr() text of a complex float: "<imag>j" when real is +0.0, else "(<real><signed imag>j)". */
+static PyObject*
+legacy_cfloat_formatrepr(npy_cfloat val)
+{
+    /* XXX: Find a correct size here for format string */
+    char format[64], buf[100], *res;
+
+    /*
+     * Ideally, we should handle this nan/inf stuff in NumpyOS_ascii_format*
+     */
+    if (val.real == 0.0 && npy_signbit(val.real) == 0) {  /* real is +0.0: imaginary part only */
+        PyOS_snprintf(format, sizeof(format), _FMT1, FLOATPREC_REPR);
+        res = NumPyOS_ascii_formatf(buf, sizeof(buf) - 1, format, val.imag, 0);
+        if (res == NULL) {
+            PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+            return NULL;
+        }
+        if (!npy_isfinite(val.imag)) {
+            strncat(buf, "*", sizeof(buf) - strlen(buf) - 1);  /* non-finite imag gets '*' before 'j' */
+        }
+        strncat(buf, "j", sizeof(buf) - strlen(buf) - 1);
+    }
+    else {
+        char re[64], im[64];  /* text of the real and imaginary parts */
+
+        if (npy_isfinite(val.real)) {
+            PyOS_snprintf(format, sizeof(format), _FMT1, FLOATPREC_REPR);
+            res = NumPyOS_ascii_formatf(re, sizeof(re), format,
+                                             val.real, 0);
+            if (res == NULL) {
+                PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+                return NULL;
+            }
+        }
+        else {  /* non-finite real part is spelled out by hand */
+            if (npy_isnan(val.real)) {
+                strcpy(re, "nan");
+            }
+            else if (val.real > 0){
+                strcpy(re, "inf");
+            }
+            else {
+                strcpy(re, "-inf");
+            }
+        }
+
+
+        if (npy_isfinite(val.imag)) {
+            PyOS_snprintf(format, sizeof(format), _FMT2, FLOATPREC_REPR);  /* _FMT2 carries the '+' flag */
+            res = NumPyOS_ascii_formatf(im, sizeof(im), format,
+                                             val.imag, 0);
+            if (res == NULL) {
+                PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+                return NULL;
+            }
+        }
+        else {  /* non-finite imaginary part: explicit sign, then '*' */
+            if (npy_isnan(val.imag)) {
+                strcpy(im, "+nan");
+            }
+            else if (val.imag > 0){
+                strcpy(im, "+inf");
+            }
+            else {
+                strcpy(im, "-inf");
+            }
+            if (!npy_isfinite(val.imag)) {  /* always true in this else-branch */
+                strncat(im, "*", sizeof(im) - strlen(im) - 1);
+            }
+        }
+        PyOS_snprintf(buf, sizeof(buf), "(%s%sj)", re, im);
+    }
+
+    return PyUnicode_FromString(buf);
+}
+
+#undef _FMT1
+#undef _FMT2
+
+
+#line 790
+
+#define _FMT1 "%%.%i" NPY_DOUBLE_FMT
+#define _FMT2 "%%+.%i" NPY_DOUBLE_FMT
+/* Legacy repr() text of a complex double: "<imag>j" when real is +0.0, else "(<real><signed imag>j)". */
+static PyObject*
+legacy_cdouble_formatrepr(npy_cdouble val)
+{
+    /* XXX: Find a correct size here for format string */
+    char format[64], buf[100], *res;
+
+    /*
+     * Ideally, we should handle this nan/inf stuff in NumpyOS_ascii_format*
+     */
+    if (val.real == 0.0 && npy_signbit(val.real) == 0) {  /* real is +0.0: imaginary part only */
+        PyOS_snprintf(format, sizeof(format), _FMT1, DOUBLEPREC_REPR);
+        res = NumPyOS_ascii_formatd(buf, sizeof(buf) - 1, format, val.imag, 0);
+        if (res == NULL) {
+            PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+            return NULL;
+        }
+        if (!npy_isfinite(val.imag)) {
+            strncat(buf, "*", sizeof(buf) - strlen(buf) - 1);  /* non-finite imag gets '*' before 'j' */
+        }
+        strncat(buf, "j", sizeof(buf) - strlen(buf) - 1);
+    }
+    else {
+        char re[64], im[64];  /* text of the real and imaginary parts */
+
+        if (npy_isfinite(val.real)) {
+            PyOS_snprintf(format, sizeof(format), _FMT1, DOUBLEPREC_REPR);
+            res = NumPyOS_ascii_formatd(re, sizeof(re), format,
+                                             val.real, 0);
+            if (res == NULL) {
+                PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+                return NULL;
+            }
+        }
+        else {  /* non-finite real part is spelled out by hand */
+            if (npy_isnan(val.real)) {
+                strcpy(re, "nan");
+            }
+            else if (val.real > 0){
+                strcpy(re, "inf");
+            }
+            else {
+                strcpy(re, "-inf");
+            }
+        }
+
+
+        if (npy_isfinite(val.imag)) {
+            PyOS_snprintf(format, sizeof(format), _FMT2, DOUBLEPREC_REPR);  /* _FMT2 carries the '+' flag */
+            res = NumPyOS_ascii_formatd(im, sizeof(im), format,
+                                             val.imag, 0);
+            if (res == NULL) {
+                PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+                return NULL;
+            }
+        }
+        else {  /* non-finite imaginary part: explicit sign, then '*' */
+            if (npy_isnan(val.imag)) {
+                strcpy(im, "+nan");
+            }
+            else if (val.imag > 0){
+                strcpy(im, "+inf");
+            }
+            else {
+                strcpy(im, "-inf");
+            }
+            if (!npy_isfinite(val.imag)) {  /* always true in this else-branch */
+                strncat(im, "*", sizeof(im) - strlen(im) - 1);
+            }
+        }
+        PyOS_snprintf(buf, sizeof(buf), "(%s%sj)", re, im);
+    }
+
+    return PyUnicode_FromString(buf);
+}
+
+#undef _FMT1
+#undef _FMT2
+
+
+#line 790
+
+#define _FMT1 "%%.%i" NPY_LONGDOUBLE_FMT
+#define _FMT2 "%%+.%i" NPY_LONGDOUBLE_FMT
+/* Legacy repr() text of a complex long double: "<imag>j" when real is +0.0, else "(<real><signed imag>j)". */
+static PyObject*
+legacy_clongdouble_formatrepr(npy_clongdouble val)
+{
+    /* XXX: Find a correct size here for format string */
+    char format[64], buf[100], *res;
+
+    /*
+     * Ideally, we should handle this nan/inf stuff in NumpyOS_ascii_format*
+     */
+    if (val.real == 0.0 && npy_signbit(val.real) == 0) {  /* real is +0.0: imaginary part only */
+        PyOS_snprintf(format, sizeof(format), _FMT1, LONGDOUBLEPREC_REPR);
+        res = NumPyOS_ascii_formatl(buf, sizeof(buf) - 1, format, val.imag, 0);
+        if (res == NULL) {
+            PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+            return NULL;
+        }
+        if (!npy_isfinite(val.imag)) {
+            strncat(buf, "*", sizeof(buf) - strlen(buf) - 1);  /* non-finite imag gets '*' before 'j' */
+        }
+        strncat(buf, "j", sizeof(buf) - strlen(buf) - 1);
+    }
+    else {
+        char re[64], im[64];  /* text of the real and imaginary parts */
+
+        if (npy_isfinite(val.real)) {
+            PyOS_snprintf(format, sizeof(format), _FMT1, LONGDOUBLEPREC_REPR);
+            res = NumPyOS_ascii_formatl(re, sizeof(re), format,
+                                             val.real, 0);
+            if (res == NULL) {
+                PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+                return NULL;
+            }
+        }
+        else {  /* non-finite real part is spelled out by hand */
+            if (npy_isnan(val.real)) {
+                strcpy(re, "nan");
+            }
+            else if (val.real > 0){
+                strcpy(re, "inf");
+            }
+            else {
+                strcpy(re, "-inf");
+            }
+        }
+
+
+        if (npy_isfinite(val.imag)) {
+            PyOS_snprintf(format, sizeof(format), _FMT2, LONGDOUBLEPREC_REPR);  /* _FMT2 carries the '+' flag */
+            res = NumPyOS_ascii_formatl(im, sizeof(im), format,
+                                             val.imag, 0);
+            if (res == NULL) {
+                PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+                return NULL;
+            }
+        }
+        else {  /* non-finite imaginary part: explicit sign, then '*' */
+            if (npy_isnan(val.imag)) {
+                strcpy(im, "+nan");
+            }
+            else if (val.imag > 0){
+                strcpy(im, "+inf");
+            }
+            else {
+                strcpy(im, "-inf");
+            }
+            if (!npy_isfinite(val.imag)) {  /* always true in this else-branch */
+                strncat(im, "*", sizeof(im) - strlen(im) - 1);
+            }
+        }
+        PyOS_snprintf(buf, sizeof(buf), "(%s%sj)", re, im);
+    }
+
+    return PyUnicode_FromString(buf);
+}
+
+#undef _FMT1
+#undef _FMT2
+
+
+
+#line 880
+
+#define _FMT1 "%%.%i" NPY_FLOAT_FMT
+/* Legacy repr() text of a float: "%.<FLOATPREC_REPR>" NPY_FLOAT_FMT, plus ".0" for bare integers. */
+static PyObject *
+legacy_float_formatrepr(npy_float val){
+    /* XXX: Find a correct size here for format string */
+    char format[64], buf[100], *res;
+    size_t i, cnt;
+
+    PyOS_snprintf(format, sizeof(format), _FMT1, FLOATPREC_REPR);  /* build the printf format */
+    res = NumPyOS_ascii_formatf(buf, sizeof(buf), format, val, 0);
+    if (res == NULL) {
+        PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+        return NULL;
+    }
+
+    /* If nothing but digits after sign, append ".0" */
+    cnt = strlen(buf);
+    for (i = (buf[0] == '-') ? 1 : 0; i < cnt; ++i) {
+        if (!isdigit(Py_CHARMASK(buf[i]))) {
+            break;  /* a non-digit is present: leave the text alone */
+        }
+    }
+    if (i == cnt && sizeof(buf) >= cnt + 3) {  /* room for '.', '0' and the NUL */
+        strcpy(&buf[cnt],".0");
+    }
+
+    return PyUnicode_FromString(buf);
+}
+
+#undef _FMT1
+
+
+#line 880
+
+#define _FMT1 "%%.%i" NPY_DOUBLE_FMT
+/* Legacy repr() text of a double: "%.<DOUBLEPREC_REPR>" NPY_DOUBLE_FMT, plus ".0" for bare integers. */
+static PyObject *
+legacy_double_formatrepr(npy_double val){
+    /* XXX: Find a correct size here for format string */
+    char format[64], buf[100], *res;
+    size_t i, cnt;
+
+    PyOS_snprintf(format, sizeof(format), _FMT1, DOUBLEPREC_REPR);  /* build the printf format */
+    res = NumPyOS_ascii_formatd(buf, sizeof(buf), format, val, 0);
+    if (res == NULL) {
+        PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+        return NULL;
+    }
+
+    /* If nothing but digits after sign, append ".0" */
+    cnt = strlen(buf);
+    for (i = (buf[0] == '-') ? 1 : 0; i < cnt; ++i) {
+        if (!isdigit(Py_CHARMASK(buf[i]))) {
+            break;  /* a non-digit is present: leave the text alone */
+        }
+    }
+    if (i == cnt && sizeof(buf) >= cnt + 3) {  /* room for '.', '0' and the NUL */
+        strcpy(&buf[cnt],".0");
+    }
+
+    return PyUnicode_FromString(buf);
+}
+
+#undef _FMT1
+
+
+#line 880
+
+#define _FMT1 "%%.%i" NPY_LONGDOUBLE_FMT
+/* Legacy repr() text of a long double: "%.<LONGDOUBLEPREC_REPR>" NPY_LONGDOUBLE_FMT, plus ".0" for bare integers. */
+static PyObject *
+legacy_longdouble_formatrepr(npy_longdouble val){
+    /* XXX: Find a correct size here for format string */
+    char format[64], buf[100], *res;
+    size_t i, cnt;
+
+    PyOS_snprintf(format, sizeof(format), _FMT1, LONGDOUBLEPREC_REPR);  /* build the printf format */
+    res = NumPyOS_ascii_formatl(buf, sizeof(buf), format, val, 0);
+    if (res == NULL) {
+        PyErr_SetString(PyExc_RuntimeError, "Error while formatting");
+        return NULL;
+    }
+
+    /* If nothing but digits after sign, append ".0" */
+    cnt = strlen(buf);
+    for (i = (buf[0] == '-') ? 1 : 0; i < cnt; ++i) {
+        if (!isdigit(Py_CHARMASK(buf[i]))) {
+            break;  /* a non-digit is present: leave the text alone */
+        }
+    }
+    if (i == cnt && sizeof(buf) >= cnt + 3) {  /* room for '.', '0' and the NUL */
+        strcpy(&buf[cnt],".0");
+    }
+
+    return PyUnicode_FromString(buf);
+}
+
+#undef _FMT1
+
+
+
+
+
+
+/*
+ *               *** END LEGACY PRINTING MODE CODE ***
+ */
+
+
+#line 925
+
+#line 931
+
+/* Helper: choose scientific or positional (fractional) output, based on a magnitude cutoff. */
+static PyObject *
+floattype_str_either(npy_float val, TrimMode trim_pos, TrimMode trim_sci,
+                         npy_bool sign)
+{
+    /* Legacy print mode (<= 113) bypasses the cutoff; the trim modes and sign are not used there. */
+    if (npy_legacy_print_mode <= 113) {
+        return legacy_float_formatstr(val);
+    }
+
+    int use_positional;
+    if (npy_isnan(val) || val == 0) {  /* NaN and zero always take the positional call */
+         use_positional = 1;
+    }
+    else {
+        npy_float absval = val < 0 ? -val : val;
+        use_positional = absval < 1.e16L && absval >= 1.e-4L;  /* 1e-4 <= |val| < 1e16 */
+    }
+
+    if (use_positional) {
+        return format_float(val, 0, -1, sign, trim_pos, -1, -1, -1);
+    }
+    return format_float(val, 1, -1, sign, trim_sci, -1, -1, -1);  /* 2nd arg 1: presumably "scientific" - confirm */
+}
+
+static PyObject *
+floattype_str(PyObject *self)
+{   /* str() of a Float scalar: LeaveOneZero trim when positional, DptZeros otherwise, sign argument 0 */
+    return floattype_str_either(PyArrayScalar_VAL(self, Float),
+                                  TrimMode_LeaveOneZero, TrimMode_DptZeros, 0);
+}
+
+static PyObject *
+cfloattype_str(PyObject *self)
+{   /* str() of a CFloat scalar: "<imag>j" when real is +0.0, else "(<real><signed imag>j)" */
+    PyObject *rstr, *istr;
+    npy_cfloat val = PyArrayScalar_VAL(self, CFloat);
+    TrimMode trim = TrimMode_DptZeros;
+
+    if (npy_legacy_print_mode <= 113) {
+        return legacy_cfloat_formatstr(val);
+    }
+
+    if (val.real == 0.0 && npy_signbit(val.real) == 0) {  /* real is +0.0: imaginary part only */
+        istr = floattype_str_either(val.imag, trim, trim, 0);
+        if (istr == NULL) {
+            return NULL;
+        }
+        PyObject *ret = PyUnicode_FromFormat("%Sj", istr);
+        Py_DECREF(istr);
+        return ret;
+    }
+
+    if (npy_isfinite(val.real)) {
+        rstr = floattype_str_either(val.real, trim, trim, 0);
+    }
+    else if (npy_isnan(val.real)) {  /* non-finite real part is spelled out literally */
+        rstr = PyUnicode_FromString("nan");
+    }
+    else if (val.real > 0){
+        rstr = PyUnicode_FromString("inf");
+    }
+    else {
+        rstr = PyUnicode_FromString("-inf");
+    }
+    if (rstr == NULL) {
+        return NULL;
+    }
+
+    if (npy_isfinite(val.imag)) {
+        istr = floattype_str_either(val.imag, trim, trim, 1);  /* sign argument 1 for the imaginary part */
+    }
+    else if (npy_isnan(val.imag)) {  /* non-finite imaginary part carries an explicit sign */
+        istr = PyUnicode_FromString("+nan");
+    }
+    else if (val.imag > 0){
+        istr = PyUnicode_FromString("+inf");
+    }
+    else {
+        istr = PyUnicode_FromString("-inf");
+    }
+    if (istr == NULL) {
+        Py_DECREF(rstr);  /* release the real-part string before bailing out */
+        return NULL;
+    }
+
+    PyObject *ret = PyUnicode_FromFormat("(%S%Sj)", rstr, istr);
+    Py_DECREF(rstr);
+    Py_DECREF(istr);
+    return ret;
+}
+
+#undef PREC
+
+
+#line 931
+
+/* Helper: choose scientific or positional (fractional) output, based on a magnitude cutoff. */
+static PyObject *
+doubletype_str_either(npy_double val, TrimMode trim_pos, TrimMode trim_sci,
+                         npy_bool sign)
+{
+    /* Legacy print mode (<= 113) bypasses the cutoff; the trim modes and sign are not used there. */
+    if (npy_legacy_print_mode <= 113) {
+        return legacy_double_formatstr(val);
+    }
+
+    int use_positional;
+    if (npy_isnan(val) || val == 0) {  /* NaN and zero always take the positional call */
+         use_positional = 1;
+    }
+    else {
+        npy_double absval = val < 0 ? -val : val;
+        use_positional = absval < 1.e16L && absval >= 1.e-4L;  /* 1e-4 <= |val| < 1e16 */
+    }
+
+    if (use_positional) {
+        return format_double(val, 0, -1, sign, trim_pos, -1, -1, -1);
+    }
+    return format_double(val, 1, -1, sign, trim_sci, -1, -1, -1);  /* 2nd arg 1: presumably "scientific" - confirm */
+}
+
+static PyObject *
+doubletype_str(PyObject *self)
+{   /* str() of a Double scalar: LeaveOneZero trim when positional, DptZeros otherwise, sign argument 0 */
+    return doubletype_str_either(PyArrayScalar_VAL(self, Double),
+                                  TrimMode_LeaveOneZero, TrimMode_DptZeros, 0);
+}
+
+static PyObject *
+cdoubletype_str(PyObject *self)
+{   /* str() of a CDouble scalar: "<imag>j" when real is +0.0, else "(<real><signed imag>j)" */
+    PyObject *rstr, *istr;
+    npy_cdouble val = PyArrayScalar_VAL(self, CDouble);
+    TrimMode trim = TrimMode_DptZeros;
+
+    if (npy_legacy_print_mode <= 113) {
+        return legacy_cdouble_formatstr(val);
+    }
+
+    if (val.real == 0.0 && npy_signbit(val.real) == 0) {  /* real is +0.0: imaginary part only */
+        istr = doubletype_str_either(val.imag, trim, trim, 0);
+        if (istr == NULL) {
+            return NULL;
+        }
+        PyObject *ret = PyUnicode_FromFormat("%Sj", istr);
+        Py_DECREF(istr);
+        return ret;
+    }
+
+    if (npy_isfinite(val.real)) {
+        rstr = doubletype_str_either(val.real, trim, trim, 0);
+    }
+    else if (npy_isnan(val.real)) {  /* non-finite real part is spelled out literally */
+        rstr = PyUnicode_FromString("nan");
+    }
+    else if (val.real > 0){
+        rstr = PyUnicode_FromString("inf");
+    }
+    else {
+        rstr = PyUnicode_FromString("-inf");
+    }
+    if (rstr == NULL) {
+        return NULL;
+    }
+
+    if (npy_isfinite(val.imag)) {
+        istr = doubletype_str_either(val.imag, trim, trim, 1);  /* sign argument 1 for the imaginary part */
+    }
+    else if (npy_isnan(val.imag)) {  /* non-finite imaginary part carries an explicit sign */
+        istr = PyUnicode_FromString("+nan");
+    }
+    else if (val.imag > 0){
+        istr = PyUnicode_FromString("+inf");
+    }
+    else {
+        istr = PyUnicode_FromString("-inf");
+    }
+    if (istr == NULL) {
+        Py_DECREF(rstr);  /* release the real-part string before bailing out */
+        return NULL;
+    }
+
+    PyObject *ret = PyUnicode_FromFormat("(%S%Sj)", rstr, istr);
+    Py_DECREF(rstr);
+    Py_DECREF(istr);
+    return ret;
+}
+
+#undef PREC
+
+
+#line 931
+
+/* Helper: choose scientific or positional (fractional) output, based on a magnitude cutoff. */
+static PyObject *
+longdoubletype_str_either(npy_longdouble val, TrimMode trim_pos, TrimMode trim_sci,
+                         npy_bool sign)
+{
+    /* Legacy print mode (<= 113) bypasses the cutoff; the trim modes and sign are not used there. */
+    if (npy_legacy_print_mode <= 113) {
+        return legacy_longdouble_formatstr(val);
+    }
+
+    int use_positional;
+    if (npy_isnan(val) || val == 0) {  /* NaN and zero always take the positional call */
+         use_positional = 1;
+    }
+    else {
+        npy_longdouble absval = val < 0 ? -val : val;
+        use_positional = absval < 1.e16L && absval >= 1.e-4L;  /* 1e-4 <= |val| < 1e16 */
+    }
+
+    if (use_positional) {
+        return format_longdouble(val, 0, -1, sign, trim_pos, -1, -1, -1);
+    }
+    return format_longdouble(val, 1, -1, sign, trim_sci, -1, -1, -1);  /* 2nd arg 1: presumably "scientific" - confirm */
+}
+
+static PyObject *
+longdoubletype_str(PyObject *self)
+{   /* str() of a LongDouble scalar: LeaveOneZero trim when positional, DptZeros otherwise, sign argument 0 */
+    return longdoubletype_str_either(PyArrayScalar_VAL(self, LongDouble),
+                                  TrimMode_LeaveOneZero, TrimMode_DptZeros, 0);
+}
+
+static PyObject *
+clongdoubletype_str(PyObject *self)
+{   /* str() of a CLongDouble scalar: "<imag>j" when real is +0.0, else "(<real><signed imag>j)" */
+    PyObject *rstr, *istr;
+    npy_clongdouble val = PyArrayScalar_VAL(self, CLongDouble);
+    TrimMode trim = TrimMode_DptZeros;
+
+    if (npy_legacy_print_mode <= 113) {
+        return legacy_clongdouble_formatstr(val);
+    }
+
+    if (val.real == 0.0 && npy_signbit(val.real) == 0) {  /* real is +0.0: imaginary part only */
+        istr = longdoubletype_str_either(val.imag, trim, trim, 0);
+        if (istr == NULL) {
+            return NULL;
+        }
+        PyObject *ret = PyUnicode_FromFormat("%Sj", istr);
+        Py_DECREF(istr);
+        return ret;
+    }
+
+    if (npy_isfinite(val.real)) {
+        rstr = longdoubletype_str_either(val.real, trim, trim, 0);
+    }
+    else if (npy_isnan(val.real)) {  /* non-finite real part is spelled out literally */
+        rstr = PyUnicode_FromString("nan");
+    }
+    else if (val.real > 0){
+        rstr = PyUnicode_FromString("inf");
+    }
+    else {
+        rstr = PyUnicode_FromString("-inf");
+    }
+    if (rstr == NULL) {
+        return NULL;
+    }
+
+    if (npy_isfinite(val.imag)) {
+        istr = longdoubletype_str_either(val.imag, trim, trim, 1);  /* sign argument 1 for the imaginary part */
+    }
+    else if (npy_isnan(val.imag)) {  /* non-finite imaginary part carries an explicit sign */
+        istr = PyUnicode_FromString("+nan");
+    }
+    else if (val.imag > 0){
+        istr = PyUnicode_FromString("+inf");
+    }
+    else {
+        istr = PyUnicode_FromString("-inf");
+    }
+    if (istr == NULL) {
+        Py_DECREF(rstr);  /* release the real-part string before bailing out */
+        return NULL;
+    }
+
+    PyObject *ret = PyUnicode_FromFormat("(%S%Sj)", rstr, istr);
+    Py_DECREF(rstr);
+    Py_DECREF(istr);
+    return ret;
+}
+
+#undef PREC
+
+
+
+
+static PyObject *
+halftype_str(PyObject *self)
+{   /* str() of a Half scalar: the float conversion drives the legacy path and the cutoff test */
+    npy_half val = PyArrayScalar_VAL(self, Half);
+    float floatval = npy_half_to_float(val);
+    float absval;
+
+    if (npy_legacy_print_mode <= 113) {
+        return legacy_float_formatstr(floatval);  /* legacy mode formats the converted float */
+    }
+
+    absval = floatval < 0 ? -floatval : floatval;
+
+    if (absval == 0 || (absval < 1.e16 && absval >= 1.e-4) ) {  /* zero, or 1e-4 <= |val| < 1e16 */
+        return format_half(val, 0, -1, 0, TrimMode_LeaveOneZero, -1, -1, -1);
+    }
+    return format_half(val, 1, -1, 0, TrimMode_DptZeros, -1, -1, -1);  /* also reached for NaN: every comparison above is false */
+}
+
+
+
+#line 925
+
+#line 931
+
+/* Helper: choose scientific or positional (fractional) output, based on a magnitude cutoff. */
+static PyObject *
+floattype_repr_either(npy_float val, TrimMode trim_pos, TrimMode trim_sci,
+                         npy_bool sign)
+{
+    /* Legacy print mode (<= 113) bypasses the cutoff; the trim modes and sign are not used there. */
+    if (npy_legacy_print_mode <= 113) {
+        return legacy_float_formatrepr(val);
+    }
+
+    int use_positional;
+    if (npy_isnan(val) || val == 0) {  /* NaN and zero always take the positional call */
+         use_positional = 1;
+    }
+    else {
+        npy_float absval = val < 0 ? -val : val;
+        use_positional = absval < 1.e16L && absval >= 1.e-4L;  /* 1e-4 <= |val| < 1e16 */
+    }
+
+    if (use_positional) {
+        return format_float(val, 0, -1, sign, trim_pos, -1, -1, -1);
+    }
+    return format_float(val, 1, -1, sign, trim_sci, -1, -1, -1);  /* 2nd arg 1: presumably "scientific" - confirm */
+}
+
+static PyObject *
+floattype_repr(PyObject *self)
+{   /* repr() of a Float scalar: LeaveOneZero trim when positional, DptZeros otherwise, sign argument 0 */
+    return floattype_repr_either(PyArrayScalar_VAL(self, Float),
+                                  TrimMode_LeaveOneZero, TrimMode_DptZeros, 0);
+}
+
+static PyObject *
+cfloattype_repr(PyObject *self)
+{   /* repr() of a CFloat scalar: "<imag>j" when real is +0.0, else "(<real><signed imag>j)" */
+    PyObject *rstr, *istr;
+    npy_cfloat val = PyArrayScalar_VAL(self, CFloat);
+    TrimMode trim = TrimMode_DptZeros;
+
+    if (npy_legacy_print_mode <= 113) {
+        return legacy_cfloat_formatrepr(val);
+    }
+
+    if (val.real == 0.0 && npy_signbit(val.real) == 0) {  /* real is +0.0: imaginary part only */
+        istr = floattype_repr_either(val.imag, trim, trim, 0);
+        if (istr == NULL) {
+            return NULL;
+        }
+        PyObject *ret = PyUnicode_FromFormat("%Sj", istr);
+        Py_DECREF(istr);
+        return ret;
+    }
+
+    if (npy_isfinite(val.real)) {
+        rstr = floattype_repr_either(val.real, trim, trim, 0);
+    }
+    else if (npy_isnan(val.real)) {  /* non-finite real part is spelled out literally */
+        rstr = PyUnicode_FromString("nan");
+    }
+    else if (val.real > 0){
+        rstr = PyUnicode_FromString("inf");
+    }
+    else {
+        rstr = PyUnicode_FromString("-inf");
+    }
+    if (rstr == NULL) {
+        return NULL;
+    }
+
+    if (npy_isfinite(val.imag)) {
+        istr = floattype_repr_either(val.imag, trim, trim, 1);  /* sign argument 1 for the imaginary part */
+    }
+    else if (npy_isnan(val.imag)) {  /* non-finite imaginary part carries an explicit sign */
+        istr = PyUnicode_FromString("+nan");
+    }
+    else if (val.imag > 0){
+        istr = PyUnicode_FromString("+inf");
+    }
+    else {
+        istr = PyUnicode_FromString("-inf");
+    }
+    if (istr == NULL) {
+        Py_DECREF(rstr);  /* release the real-part string before bailing out */
+        return NULL;
+    }
+
+    PyObject *ret = PyUnicode_FromFormat("(%S%Sj)", rstr, istr);
+    Py_DECREF(rstr);
+    Py_DECREF(istr);
+    return ret;
+}
+
+#undef PREC
+
+
+#line 931
+
+/* Helper: choose scientific or positional (fractional) output, based on a magnitude cutoff. */
+static PyObject *
+doubletype_repr_either(npy_double val, TrimMode trim_pos, TrimMode trim_sci,
+                         npy_bool sign)
+{
+    /* Legacy print mode (<= 113) bypasses the cutoff; the trim modes and sign are not used there. */
+    if (npy_legacy_print_mode <= 113) {
+        return legacy_double_formatrepr(val);
+    }
+
+    int use_positional;
+    if (npy_isnan(val) || val == 0) {  /* NaN and zero always take the positional call */
+         use_positional = 1;
+    }
+    else {
+        npy_double absval = val < 0 ? -val : val;
+        use_positional = absval < 1.e16L && absval >= 1.e-4L;  /* 1e-4 <= |val| < 1e16 */
+    }
+
+    if (use_positional) {
+        return format_double(val, 0, -1, sign, trim_pos, -1, -1, -1);
+    }
+    return format_double(val, 1, -1, sign, trim_sci, -1, -1, -1);  /* 2nd arg 1: presumably "scientific" - confirm */
+}
+
+static PyObject *
+doubletype_repr(PyObject *self)
+{   /* repr() of a Double scalar: LeaveOneZero trim when positional, DptZeros otherwise, sign argument 0 */
+    return doubletype_repr_either(PyArrayScalar_VAL(self, Double),
+                                  TrimMode_LeaveOneZero, TrimMode_DptZeros, 0);
+}
+
+static PyObject *
+cdoubletype_repr(PyObject *self)
+{   /* repr() of a CDouble scalar: "<imag>j" when real is +0.0, else "(<real><signed imag>j)" */
+    PyObject *rstr, *istr;
+    npy_cdouble val = PyArrayScalar_VAL(self, CDouble);
+    TrimMode trim = TrimMode_DptZeros;
+
+    if (npy_legacy_print_mode <= 113) {
+        return legacy_cdouble_formatrepr(val);
+    }
+
+    if (val.real == 0.0 && npy_signbit(val.real) == 0) {  /* real is +0.0: imaginary part only */
+        istr = doubletype_repr_either(val.imag, trim, trim, 0);
+        if (istr == NULL) {
+            return NULL;
+        }
+        PyObject *ret = PyUnicode_FromFormat("%Sj", istr);
+        Py_DECREF(istr);
+        return ret;
+    }
+
+    if (npy_isfinite(val.real)) {
+        rstr = doubletype_repr_either(val.real, trim, trim, 0);
+    }
+    else if (npy_isnan(val.real)) {  /* non-finite real part is spelled out literally */
+        rstr = PyUnicode_FromString("nan");
+    }
+    else if (val.real > 0){
+        rstr = PyUnicode_FromString("inf");
+    }
+    else {
+        rstr = PyUnicode_FromString("-inf");
+    }
+    if (rstr == NULL) {
+        return NULL;
+    }
+
+    if (npy_isfinite(val.imag)) {
+        istr = doubletype_repr_either(val.imag, trim, trim, 1);  /* sign argument 1 for the imaginary part */
+    }
+    else if (npy_isnan(val.imag)) {  /* non-finite imaginary part carries an explicit sign */
+        istr = PyUnicode_FromString("+nan");
+    }
+    else if (val.imag > 0){
+        istr = PyUnicode_FromString("+inf");
+    }
+    else {
+        istr = PyUnicode_FromString("-inf");
+    }
+    if (istr == NULL) {
+        Py_DECREF(rstr);  /* release the real-part string before bailing out */
+        return NULL;
+    }
+
+    PyObject *ret = PyUnicode_FromFormat("(%S%Sj)", rstr, istr);
+    Py_DECREF(rstr);
+    Py_DECREF(istr);
+    return ret;
+}
+
+#undef PREC
+
+
+#line 931
+
+/* Helper: choose scientific or positional (fractional) output, based on a magnitude cutoff. */
+static PyObject *
+longdoubletype_repr_either(npy_longdouble val, TrimMode trim_pos, TrimMode trim_sci,
+                         npy_bool sign)
+{
+    /* Legacy print mode (<= 113) bypasses the cutoff; the trim modes and sign are not used there. */
+    if (npy_legacy_print_mode <= 113) {
+        return legacy_longdouble_formatrepr(val);
+    }
+
+    int use_positional;
+    if (npy_isnan(val) || val == 0) {  /* NaN and zero always take the positional call */
+         use_positional = 1;
+    }
+    else {
+        npy_longdouble absval = val < 0 ? -val : val;
+        use_positional = absval < 1.e16L && absval >= 1.e-4L;  /* 1e-4 <= |val| < 1e16 */
+    }
+
+    if (use_positional) {
+        return format_longdouble(val, 0, -1, sign, trim_pos, -1, -1, -1);
+    }
+    return format_longdouble(val, 1, -1, sign, trim_sci, -1, -1, -1);  /* 2nd arg 1: presumably "scientific" - confirm */
+}
+
+static PyObject *
+longdoubletype_repr(PyObject *self)
+{   /* repr() of a LongDouble scalar: LeaveOneZero trim when positional, DptZeros otherwise, sign argument 0 */
+    return longdoubletype_repr_either(PyArrayScalar_VAL(self, LongDouble),
+                                  TrimMode_LeaveOneZero, TrimMode_DptZeros, 0);
+}
+
+static PyObject *
+clongdoubletype_repr(PyObject *self)
+{   /* repr() of a CLongDouble scalar: "<imag>j" when real is +0.0, else "(<real><signed imag>j)" */
+    PyObject *rstr, *istr;
+    npy_clongdouble val = PyArrayScalar_VAL(self, CLongDouble);
+    TrimMode trim = TrimMode_DptZeros;
+
+    if (npy_legacy_print_mode <= 113) {
+        return legacy_clongdouble_formatrepr(val);
+    }
+
+    if (val.real == 0.0 && npy_signbit(val.real) == 0) {  /* real is +0.0: imaginary part only */
+        istr = longdoubletype_repr_either(val.imag, trim, trim, 0);
+        if (istr == NULL) {
+            return NULL;
+        }
+        PyObject *ret = PyUnicode_FromFormat("%Sj", istr);
+        Py_DECREF(istr);
+        return ret;
+    }
+
+    if (npy_isfinite(val.real)) {
+        rstr = longdoubletype_repr_either(val.real, trim, trim, 0);
+    }
+    else if (npy_isnan(val.real)) {  /* non-finite real part is spelled out literally */
+        rstr = PyUnicode_FromString("nan");
+    }
+    else if (val.real > 0){
+        rstr = PyUnicode_FromString("inf");
+    }
+    else {
+        rstr = PyUnicode_FromString("-inf");
+    }
+    if (rstr == NULL) {
+        return NULL;
+    }
+
+    if (npy_isfinite(val.imag)) {
+        istr = longdoubletype_repr_either(val.imag, trim, trim, 1);  /* sign argument 1 for the imaginary part */
+    }
+    else if (npy_isnan(val.imag)) {  /* non-finite imaginary part carries an explicit sign */
+        istr = PyUnicode_FromString("+nan");
+    }
+    else if (val.imag > 0){
+        istr = PyUnicode_FromString("+inf");
+    }
+    else {
+        istr = PyUnicode_FromString("-inf");
+    }
+    if (istr == NULL) {
+        Py_DECREF(rstr);  /* release the real-part string before bailing out */
+        return NULL;
+    }
+
+    PyObject *ret = PyUnicode_FromFormat("(%S%Sj)", rstr, istr);
+    Py_DECREF(rstr);
+    Py_DECREF(istr);
+    return ret;
+}
+
+#undef PREC
+
+
+
+
+static PyObject *
+halftype_repr(PyObject *self)
+{   /* repr() of a Half scalar: the float conversion drives the legacy path and the cutoff test */
+    npy_half val = PyArrayScalar_VAL(self, Half);
+    float floatval = npy_half_to_float(val);
+    float absval;
+
+    if (npy_legacy_print_mode <= 113) {
+        return legacy_float_formatrepr(floatval);  /* legacy mode formats the converted float */
+    }
+
+    absval = floatval < 0 ? -floatval : floatval;
+
+    if (absval == 0 || (absval < 1.e16 && absval >= 1.e-4) ) {  /* zero, or 1e-4 <= |val| < 1e16 */
+        return format_half(val, 0, -1, 0, TrimMode_LeaveOneZero, -1, -1, -1);
+    }
+    return format_half(val, 1, -1, 0, TrimMode_DptZeros, -1, -1, -1);  /* also reached for NaN: every comparison above is false */
+}
+
+
+
+
+#line 1056
+static PyObject *
+longdoubletype_float(PyObject *self)
+{   /* float(): the long double value is cast to C double and wrapped in a Python float */
+    npy_longdouble val = PyArrayScalar_VAL(self, LongDouble);
+    return PyFloat_FromDouble((double) val);
+}
+
+static PyObject *
+longdoubletype_long(PyObject *self)
+{   /* int(): conversion is delegated to npy_longdouble_to_PyLong */
+    npy_longdouble val = PyArrayScalar_VAL(self, LongDouble);
+    return npy_longdouble_to_PyLong(val);
+}
+
+
+#line 1056
+static PyObject *
+clongdoubletype_float(PyObject *self)
+{   /* float(): only the real part is read (cast to C double); the imaginary part is not consulted */
+    npy_longdouble val = PyArrayScalar_VAL(self, CLongDouble).real;
+    return PyFloat_FromDouble((double) val);
+}
+
+static PyObject *
+clongdoubletype_long(PyObject *self)
+{   /* int(): only the real part is read and handed to npy_longdouble_to_PyLong */
+    npy_longdouble val = PyArrayScalar_VAL(self, CLongDouble).real;
+    return npy_longdouble_to_PyLong(val);
+}
+
+
+
+static PyNumberMethods gentype_as_number = {  /* number-protocol slots, each bound to a gentype_* function */
+    .nb_add = (binaryfunc)gentype_add,
+    .nb_subtract = (binaryfunc)gentype_subtract,
+    .nb_multiply = (binaryfunc)gentype_multiply,
+    .nb_remainder = (binaryfunc)gentype_remainder,
+    .nb_divmod = (binaryfunc)gentype_divmod,
+    .nb_power = (ternaryfunc)gentype_power,
+    .nb_negative = (unaryfunc)gentype_negative,
+    .nb_positive = (unaryfunc)gentype_positive,
+    .nb_absolute = (unaryfunc)gentype_absolute,
+    .nb_bool = (inquiry)gentype_nonzero_number,
+    .nb_invert = (unaryfunc)gentype_invert,
+    .nb_lshift = (binaryfunc)gentype_lshift,
+    .nb_rshift = (binaryfunc)gentype_rshift,
+    .nb_and = (binaryfunc)gentype_and,
+    .nb_xor = (binaryfunc)gentype_xor,
+    .nb_or = (binaryfunc)gentype_or,
+    .nb_int = (unaryfunc)gentype_int,
+    .nb_float = (unaryfunc)gentype_float,
+    .nb_floor_divide = (binaryfunc)gentype_floor_divide,
+    .nb_true_divide = (binaryfunc)gentype_true_divide,
+};  /* slots not named above are left zero-initialized */
+
+
+static PyObject *
+gentype_richcompare(PyObject *self, PyObject *other, int cmp_op)
+{   /* Rich comparison for scalars: converts self to an array and compares that with other */
+    PyObject *arr, *ret;
+
+    /*
+     * If the other object is None, == is always False and != always True.
+     * This avoids the array-vs-None comparison, at least until its
+     * deprecation is finished; after that this may be removed and a
+     * NumPy False would be returned instead.
+     * NOTE: np.equal(NaT, None) evaluates to TRUE! This is an
+     *       inconsistency, which may have to be considered
+     *       when the deprecation is finished.
+     */
+    if (other == Py_None) {
+        if (cmp_op == Py_EQ) {
+            Py_RETURN_FALSE;
+        }
+        if (cmp_op == Py_NE) {
+            Py_RETURN_TRUE;
+        }
+    }  /* other operators against None fall through to the generic path */
+
+   RICHCMP_GIVE_UP_IF_NEEDED(self, other);
+
+    arr = PyArray_FromScalar(self, NULL);
+    if (arr == NULL) {
+        return NULL;
+    }
+    /*
+     * Call via PyObject_RichCompare to ensure that other.__eq__
+     * has a chance to run when necessary
+     */
+    ret = PyObject_RichCompare(arr, other, cmp_op);
+    Py_DECREF(arr);  /* the temporary array is no longer needed */
+    return ret;
+}
+
+/* Getter for `ndim`: always returns the Python int 0, whatever the scalar. */
+static PyObject *
+gentype_ndim_get(PyObject *NPY_UNUSED(self), void *NPY_UNUSED(ignored))
+{
+    return PyLong_FromLong(0);
+}
+
+/*
+ * Getter for `flags`: returns whatever PyArray_NewFlagsObject produces when
+ * it is given NULL instead of an array; the scalar itself is not consulted.
+ */
+static PyObject *
+gentype_flags_get(PyObject *NPY_UNUSED(self), void *NPY_UNUSED(ignored))
+{
+    return PyArray_NewFlagsObject(NULL);
+}
+
+static PyObject *
+voidtype_flags_get(PyVoidScalarObject *self, void *NPY_UNUSED(ignored))
+{
+    /*
+     * Build a bare flags object: it refers to no array and simply carries
+     * a copy of the flag bits stored on the void scalar.
+     */
+    PyArrayFlagsObject *flags_obj =
+        (PyArrayFlagsObject *)PyArrayFlags_Type.tp_alloc(&PyArrayFlags_Type, 0);
+
+    if (flags_obj == NULL) {
+        return NULL;
+    }
+    flags_obj->flags = self->flags;
+    flags_obj->arr = NULL;
+    return (PyObject *)flags_obj;
+}
+
+static PyObject *
+voidtype_dtypedescr_get(PyVoidScalarObject *self, void *NPY_UNUSED(ignored))
+{
+    /* Hand out a new reference to the descriptor stored on the scalar. */
+    PyObject *descr = (PyObject *)self->descr;
+
+    Py_INCREF(descr);
+    return descr;
+}
+
+
+/* Numerator getter: returns `self` itself as a new reference. */
+static PyObject *
+inttype_numerator_get(PyObject *self, void *NPY_UNUSED(ignored))
+{
+    Py_INCREF(self);
+    return self;
+}
+
+
+/* Denominator getter: always returns the Python int 1. */
+static PyObject *
+inttype_denominator_get(PyObject *self, void *NPY_UNUSED(ignored))
+{
+    return PyLong_FromLong(1);
+}
+
+
+/* Getter for `data`: a memoryview created from the scalar object itself. */
+static PyObject *
+gentype_data_get(PyObject *self, void *NPY_UNUSED(ignored))
+{
+    return PyMemoryView_FromObject(self);
+}
+
+
+/*
+ * Getter registered for both `itemsize` and `nbytes`: the `elsize` of the
+ * scalar's descriptor as a Python int.
+ *
+ * Returns NULL when the descriptor cannot be obtained (the failure reported
+ * by PyArray_DescrFromScalar is propagated) or when the int cannot be built.
+ */
+static PyObject *
+gentype_itemsize_get(PyObject *self, void *NPY_UNUSED(ignored))
+{
+    PyArray_Descr *typecode;
+    PyObject *ret;
+
+    typecode = PyArray_DescrFromScalar(self);
+    if (typecode == NULL) {
+        /* Must not dereference a failed lookup. */
+        return NULL;
+    }
+    ret = PyLong_FromLong((long)typecode->elsize);
+    Py_DECREF(typecode);
+    return ret;
+}
+
+/* Getter for `size`: always returns the Python int 1. */
+static PyObject *
+gentype_size_get(PyObject *NPY_UNUSED(self), void *NPY_UNUSED(ignored))
+{
+    return PyLong_FromLong(1);
+}
+
+static PyObject *
+gentype_sizeof(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    /*
+     * Reported size = item size of the dtype + the type's basic size
+     * + (variable-size count * per-item size of the type).
+     */
+    PyObject *itemsize = gentype_itemsize_get(self, NULL);
+    if (itemsize == NULL) {
+        return NULL;
+    }
+
+    PyTypeObject *tp = Py_TYPE(self);
+    Py_ssize_t total = PyLong_AsLong(itemsize);
+    Py_DECREF(itemsize);
+
+    total += tp->tp_basicsize;
+    total += Py_SIZE(self) * tp->tp_itemsize;
+    return PyLong_FromSsize_t(total);
+}
+
+/*
+ * Capsule destructor; gentype_struct_get passes it when building the
+ * `__array_struct__` capsule.  Releases the capsule's context object, the
+ * interface's `descr` reference, its `shape` buffer and the interface
+ * struct itself.  Errors cannot be raised from here, so they are reported
+ * through PyErr_WriteUnraisable.
+ */
+NPY_NO_EXPORT void
+gentype_struct_free(PyObject *ptr)
+{
+    PyArrayInterface *arrif = (PyArrayInterface*)PyCapsule_GetPointer(ptr, NULL);
+    if (arrif == NULL) {
+        /* Nothing can be freed without the interface pointer. */
+        PyErr_WriteUnraisable(ptr);
+        return;
+    }
+    PyObject *context = (PyObject *)PyCapsule_GetContext(ptr);
+    if (context == NULL && PyErr_Occurred()) {
+        /* A NULL context is only an error if an exception is pending. */
+        PyErr_WriteUnraisable(ptr);
+    }
+    /* Cleanup proceeds even after a reported context error. */
+    Py_XDECREF(context);
+    Py_XDECREF(arrif->descr);
+    PyArray_free(arrif->shape);
+    PyArray_free(arrif);
+}
+
+/*
+ * Getter for `__array_struct__`: converts the scalar to an array, fills a
+ * freshly allocated PyArrayInterface describing it (nd == 0, no shape or
+ * strides, no descr) and wraps it in a capsule whose context is the array
+ * and whose destructor is gentype_struct_free.
+ *
+ * Returns NULL if the array conversion or the interface allocation fails
+ * (MemoryError for the latter), or whatever NpyCapsule_FromVoidPtrAndDesc
+ * returns on its own failure.
+ */
+static PyObject *
+gentype_struct_get(PyObject *self, void *NPY_UNUSED(ignored))
+{
+    PyArrayObject *arr;
+    PyArrayInterface *inter;
+
+    arr = (PyArrayObject *)PyArray_FromScalar(self, NULL);
+    if (arr == NULL) {
+        return NULL;
+    }
+    inter = (PyArrayInterface *)PyArray_malloc(sizeof(PyArrayInterface));
+    if (inter == NULL) {
+        Py_DECREF(arr);
+        return PyErr_NoMemory();
+    }
+    inter->two = 2;
+    inter->nd = 0;
+    inter->flags = PyArray_FLAGS(arr);
+    /* The capsule neither owns the data nor needs a write-back. */
+    inter->flags &= ~(NPY_ARRAY_WRITEBACKIFCOPY | NPY_ARRAY_OWNDATA);
+    inter->flags |= NPY_ARRAY_NOTSWAPPED;
+    inter->typekind = PyArray_DESCR(arr)->kind;
+    inter->itemsize = PyArray_DESCR(arr)->elsize;
+    inter->strides = NULL;
+    inter->shape = NULL;
+    inter->data = PyArray_DATA(arr);
+    inter->descr = NULL;
+
+    /* `arr` becomes the capsule context; gentype_struct_free releases it. */
+    return NpyCapsule_FromVoidPtrAndDesc(inter, arr, gentype_struct_free);
+}
+
+/* Getter for `__array_priority__`: NPY_SCALAR_PRIORITY as a Python float. */
+static PyObject *
+gentype_priority_get(PyObject *NPY_UNUSED(self), void *NPY_UNUSED(ignored))
+{
+    return PyFloat_FromDouble(NPY_SCALAR_PRIORITY);
+}
+
+/* Getter registered for both `shape` and `strides`: a new empty tuple. */
+static PyObject *
+gentype_shape_get(PyObject *NPY_UNUSED(self), void *NPY_UNUSED(ignored))
+{
+    return PyTuple_New(0);
+}
+
+
+/*
+ * Getter for `__array_interface__`: converts the scalar to an array, takes
+ * that array's `__array_interface__` and stores the array in it under the
+ * key "__ref".
+ *
+ * Returns NULL with an exception set if the conversion, the attribute
+ * lookup or the dictionary insertion fails.
+ */
+static PyObject *
+gentype_interface_get(PyObject *self, void *NPY_UNUSED(ignored))
+{
+    PyArrayObject *arr;
+    PyObject *inter;
+
+    arr = (PyArrayObject *)PyArray_FromScalar(self, NULL);
+    if (arr == NULL) {
+        return NULL;
+    }
+    inter = PyObject_GetAttrString((PyObject *)arr, "__array_interface__");
+    if (inter != NULL &&
+            PyDict_SetItemString(inter, "__ref", (PyObject *)arr) < 0) {
+        /* Never return a result while an exception is pending. */
+        Py_CLEAR(inter);
+    }
+    Py_DECREF(arr);
+    return inter;
+}
+
+
+
+/* Getter for `dtype`: the result of PyArray_DescrFromScalar, returned as is. */
+static PyObject *
+gentype_typedescr_get(PyObject *self, void *NPY_UNUSED(ignored))
+{
+    return (PyObject *)PyArray_DescrFromScalar(self);
+}
+
+
+/* Getter for `base` in the generic getset table: always None. */
+static PyObject *
+gentype_base_get(PyObject *NPY_UNUSED(self), void *NPY_UNUSED(ignored))
+{
+    Py_RETURN_NONE;
+}
+
+static PyObject *
+voidtype_base_get(PyVoidScalarObject *self, void *NPY_UNUSED(ignored))
+{
+    /* A void scalar without a base reports None; otherwise its base. */
+    PyObject *owner = (self->base != NULL) ? self->base : Py_None;
+
+    Py_INCREF(owner);
+    return owner;
+}
+
+
+/*
+ * Map a complex scalar to the descriptor of its real component type and
+ * store the complex type number in *typenum.  Returns NULL (leaving
+ * *typenum untouched) when `self` is none of CDouble, CFloat, CLongDouble.
+ */
+static PyArray_Descr *
+_realdescr_fromcomplexscalar(PyObject *self, int *typenum)
+{
+    int real_type;
+
+    if (PyArray_IsScalar(self, CDouble)) {
+        *typenum = NPY_CDOUBLE;
+        real_type = NPY_DOUBLE;
+    }
+    else if (PyArray_IsScalar(self, CFloat)) {
+        *typenum = NPY_CFLOAT;
+        real_type = NPY_FLOAT;
+    }
+    else if (PyArray_IsScalar(self, CLongDouble)) {
+        *typenum = NPY_CLONGDOUBLE;
+        real_type = NPY_LONGDOUBLE;
+    }
+    else {
+        return NULL;
+    }
+    return PyArray_DescrFromType(real_type);
+}
+
+/*
+ * Getter for `real`.
+ *
+ *  - complex scalars: a new scalar of the matching real type built from the
+ *    start of the scalar's value (the real component);
+ *  - object scalars: the wrapped object's `real` attribute when it has one;
+ *  - everything else (and object scalars without `real`): `self`.
+ *
+ * Returns NULL with an exception set on failure.
+ */
+static PyObject *
+gentype_real_get(PyObject *self, void *NPY_UNUSED(ignored))
+{
+    PyArray_Descr *typecode;
+    PyObject *ret;
+    int typenum;
+
+    if (PyArray_IsScalar(self, ComplexFloating)) {
+        void *ptr;
+        typecode = _realdescr_fromcomplexscalar(self, &typenum);
+        if (typecode == NULL) {
+            /* The helper returns NULL without an error for unknown types. */
+            if (!PyErr_Occurred()) {
+                PyErr_SetString(PyExc_TypeError,
+                        "unsupported complex scalar type");
+            }
+            return NULL;
+        }
+        ptr = scalar_value(self, NULL);
+        ret = PyArray_Scalar(ptr, typecode, NULL);
+        Py_DECREF(typecode);
+        return ret;
+    }
+    else if (PyArray_IsScalar(self, Object)) {
+        PyObject *obj = PyArrayScalar_VAL(self, Object);
+        ret = PyObject_GetAttrString(obj, "real");
+        if (ret != NULL) {
+            return ret;
+        }
+        /* No usable `real` attribute: fall back to returning self. */
+        PyErr_Clear();
+    }
+    Py_INCREF(self);
+    return (PyObject *)self;
+}
+
+/*
+ * Getter for `imag`.
+ *
+ *  - complex scalars: a new scalar of the matching real type built from the
+ *    second half of the value (offset by the real type's element size);
+ *  - object scalars: the wrapped object's `imag` attribute, or an object
+ *    scalar wrapping the Python int 0 when there is no such attribute;
+ *  - everything else: a scalar of the same dtype built from zeroed memory.
+ *
+ * Returns NULL with an exception set on failure.
+ */
+static PyObject *
+gentype_imag_get(PyObject *self, void *NPY_UNUSED(ignored))
+{
+    PyArray_Descr *typecode=NULL;
+    PyObject *ret;
+    int typenum;
+
+    if (PyArray_IsScalar(self, ComplexFloating)) {
+        char *ptr;
+        typecode = _realdescr_fromcomplexscalar(self, &typenum);
+        if (typecode == NULL) {
+            /* The helper returns NULL without an error for unknown types. */
+            if (!PyErr_Occurred()) {
+                PyErr_SetString(PyExc_TypeError,
+                        "unsupported complex scalar type");
+            }
+            return NULL;
+        }
+        ptr = (char *)scalar_value(self, NULL);
+        ret = PyArray_Scalar(ptr + typecode->elsize, typecode, NULL);
+    }
+    else if (PyArray_IsScalar(self, Object)) {
+        PyObject *obj = PyArrayScalar_VAL(self, Object);
+        PyArray_Descr *newtype;
+        ret = PyObject_GetAttrString(obj, "imag");
+        if (ret == NULL) {
+            PyErr_Clear();
+            obj = PyLong_FromLong(0);
+            if (obj == NULL) {
+                return NULL;
+            }
+            newtype = PyArray_DescrFromType(NPY_OBJECT);
+            if (newtype == NULL) {
+                Py_DECREF(obj);
+                return NULL;
+            }
+            ret = PyArray_Scalar((char *)&obj, newtype, NULL);
+            Py_DECREF(newtype);
+            Py_DECREF(obj);
+        }
+    }
+    else {
+        char *temp;
+        int elsize;
+        typecode = PyArray_DescrFromScalar(self);
+        if (typecode == NULL) {
+            return NULL;
+        }
+        elsize = typecode->elsize;
+        temp = npy_alloc_cache_zero(1, elsize);
+        if (temp == NULL) {
+            Py_DECREF(typecode);
+            return PyErr_NoMemory();
+        }
+        ret = PyArray_Scalar(temp, typecode, NULL);
+        npy_free_cache(temp, elsize);
+    }
+
+    Py_XDECREF(typecode);
+    return ret;
+}
+
+static PyObject *
+gentype_flat_get(PyObject *self, void *NPY_UNUSED(ignored))
+{
+    /* `flat` is an iterator created over the scalar's array form. */
+    PyObject *as_array = PyArray_FromScalar(self, NULL);
+    if (as_array == NULL) {
+        return NULL;
+    }
+
+    PyObject *iterator = PyArray_IterNew(as_array);
+    Py_DECREF(as_array);
+    return iterator;
+}
+
+
+/* Getter for `T`: returns `self` itself as a new reference. */
+static PyObject *
+gentype_transpose_get(PyObject *self, void *NPY_UNUSED(ignored))
+{
+    Py_INCREF(self);
+    return self;
+}
+
+
+/*
+ * Attribute table for the generic scalar type.  No entry has a setter, so
+ * every attribute listed here is read-only.  Two getters are shared:
+ * `strides` reuses the `shape` getter and `nbytes` reuses the `itemsize`
+ * getter.
+ */
+static PyGetSetDef gentype_getsets[] = {
+    {"ndim",
+        (getter)gentype_ndim_get,
+        (setter) 0, NULL, NULL},
+    {"flags",
+        (getter)gentype_flags_get,
+        (setter)0, NULL, NULL},
+    {"shape",
+        (getter)gentype_shape_get,
+        (setter)0, NULL, NULL},
+    {"strides",
+        (getter)gentype_shape_get,  /* same getter as "shape" */
+        (setter) 0, NULL, NULL},
+    {"data",
+        (getter)gentype_data_get,
+        (setter) 0, NULL, NULL},
+    {"itemsize",
+        (getter)gentype_itemsize_get,
+        (setter)0, NULL, NULL},
+    {"size",
+        (getter)gentype_size_get,
+        (setter)0, NULL, NULL},
+    {"nbytes",
+        (getter)gentype_itemsize_get,  /* same getter as "itemsize" */
+        (setter)0, NULL, NULL},
+    {"base",
+        (getter)gentype_base_get,
+        (setter)0, NULL, NULL},
+    {"dtype",
+        (getter)gentype_typedescr_get,
+        NULL, NULL, NULL},
+    {"real",
+        (getter)gentype_real_get,
+        (setter)0, NULL, NULL},
+    {"imag",
+        (getter)gentype_imag_get,
+        (setter)0, NULL, NULL},
+    {"flat",
+        (getter)gentype_flat_get,
+        (setter)0, NULL, NULL},
+    {"T",
+        (getter)gentype_transpose_get,
+        (setter)0, NULL, NULL},
+    {"__array_interface__",
+        (getter)gentype_interface_get,
+        NULL,
+        "Array protocol: Python side",
+        NULL},
+    {"__array_struct__",
+        (getter)gentype_struct_get,
+        NULL,
+        "Array protocol: struct",
+        NULL},
+    {"__array_priority__",
+        (getter)gentype_priority_get,
+        NULL,
+        "Array priority.",
+        NULL},
+    {NULL, NULL, NULL, NULL, NULL}  /* Sentinel */
+};
+
+
+/* 0-dim array from scalar object */
+
+static char doc_getarray[] = "sc.__array__(dtype) return 0-dim array from "
+                             "scalar with specified dtype";
+
+static PyObject *
+gentype_getarray(PyObject *scalar, PyObject *args)
+{
+    PyArray_Descr *requested = NULL;
+
+    if (PyArg_ParseTuple(args, "|O&:__array__", &PyArray_DescrConverter,
+                &requested)) {
+        /*
+         * NOTE(review): `requested` is not released on this path,
+         * presumably because PyArray_FromScalar takes ownership of it --
+         * confirm against its documentation.
+         */
+        return PyArray_FromScalar(scalar, requested);
+    }
+
+    /* Parsing failed; drop whatever the converter may have produced. */
+    Py_XDECREF(requested);
+    return NULL;
+}
+
+static char doc_sc_wraparray[] = "sc.__array_wrap__(obj) return scalar from array";
+
+static PyObject *
+gentype_wraparray(PyObject *NPY_UNUSED(scalar), PyObject *args)
+{
+    /* At least one positional argument is required. */
+    if (PyTuple_Size(args) < 1) {
+        PyErr_SetString(PyExc_TypeError,
+                "only accepts 1 argument.");
+        return NULL;
+    }
+
+    PyObject *candidate = PyTuple_GET_ITEM(args, 0);
+    if (PyArray_Check(candidate)) {
+        /* Build the scalar from the array's data and descriptor. */
+        PyArrayObject *arr = (PyArrayObject *)candidate;
+        return PyArray_Scalar(PyArray_DATA(arr),
+                        PyArray_DESCR(arr), candidate);
+    }
+
+    PyErr_SetString(PyExc_TypeError,
+            "can only be called with ndarray object");
+    return NULL;
+}
+
+/*
+ * These gentype_* functions do not take keyword arguments.
+ * The proper flag is METH_VARARGS.
+ */
+#line 1550
+/*
+ * Template-generated forwarders (tolist ... newbyteorder).  Each one hands
+ * `self` and the positional `args` to gentype_generic_method with no
+ * keyword dict and the method name given as the last argument.
+ * NOTE(review): gentype_generic_method is defined outside this section;
+ * presumably it calls the named ndarray method on the scalar's array
+ * form -- confirm there.
+ */
+static PyObject *
+gentype_tolist(PyObject *self, PyObject *args)
+{
+    return gentype_generic_method(self, args, NULL, "tolist");
+}
+
+#line 1550
+static PyObject *
+gentype_item(PyObject *self, PyObject *args)
+{
+    return gentype_generic_method(self, args, NULL, "item");
+}
+
+#line 1550
+static PyObject *
+gentype___deepcopy__(PyObject *self, PyObject *args)
+{
+    return gentype_generic_method(self, args, NULL, "__deepcopy__");
+}
+
+#line 1550
+static PyObject *
+gentype___copy__(PyObject *self, PyObject *args)
+{
+    return gentype_generic_method(self, args, NULL, "__copy__");
+}
+
+#line 1550
+static PyObject *
+gentype_swapaxes(PyObject *self, PyObject *args)
+{
+    return gentype_generic_method(self, args, NULL, "swapaxes");
+}
+
+#line 1550
+static PyObject *
+gentype_conj(PyObject *self, PyObject *args)
+{
+    return gentype_generic_method(self, args, NULL, "conj");
+}
+
+#line 1550
+static PyObject *
+gentype_conjugate(PyObject *self, PyObject *args)
+{
+    return gentype_generic_method(self, args, NULL, "conjugate");
+}
+
+#line 1550
+static PyObject *
+gentype_nonzero(PyObject *self, PyObject *args)
+{
+    return gentype_generic_method(self, args, NULL, "nonzero");
+}
+
+#line 1550
+static PyObject *
+gentype_fill(PyObject *self, PyObject *args)
+{
+    return gentype_generic_method(self, args, NULL, "fill");
+}
+
+#line 1550
+static PyObject *
+gentype_transpose(PyObject *self, PyObject *args)
+{
+    return gentype_generic_method(self, args, NULL, "transpose");
+}
+
+#line 1550
+static PyObject *
+gentype_newbyteorder(PyObject *self, PyObject *args)
+{
+    return gentype_generic_method(self, args, NULL, "newbyteorder");
+}
+
+
+/* Always fails: sets ValueError("array-scalars are immutable"). */
+static PyObject *
+gentype_itemset(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args))
+{
+    PyErr_SetString(PyExc_ValueError, "array-scalars are immutable");
+    return NULL;
+}
+
+/*
+ * scalar.byteswap(inplace=False): return a new scalar whose value is the
+ * byte-swapped copy of this one.
+ *
+ * Raises ValueError when `inplace` is true and MemoryError when the scratch
+ * buffer cannot be allocated; returns NULL with the pending exception when
+ * argument parsing or the descriptor lookup fails.
+ */
+static PyObject *
+gentype_byteswap(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    npy_bool inplace = NPY_FALSE;
+    static char *kwlist[] = {"inplace", NULL};
+    PyArray_Descr *descr;
+    PyObject *new;
+    char *data;
+    char *newmem;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O&:byteswap", kwlist,
+                                     PyArray_BoolConverter, &inplace)) {
+        return NULL;
+    }
+    if (inplace) {
+        PyErr_SetString(PyExc_ValueError,
+                "cannot byteswap a scalar in-place");
+        return NULL;
+    }
+
+    /* get the data, copyswap it and pass it to a new Array scalar */
+    descr = PyArray_DescrFromScalar(self);
+    if (descr == NULL) {
+        /* Must not dereference a failed descriptor lookup. */
+        return NULL;
+    }
+    data = (void *)scalar_value(self, descr);
+
+    newmem = PyObject_Malloc(descr->elsize);
+    if (newmem == NULL) {
+        Py_DECREF(descr);
+        return PyErr_NoMemory();
+    }
+    descr->f->copyswap(newmem, data, 1, NULL);
+
+    new = PyArray_Scalar(newmem, descr, NULL);
+    PyObject_Free(newmem);
+    Py_DECREF(descr);
+    return new;
+}
+
+
+/*
+ * These gentype_* functions take keyword arguments.
+ * The proper flag is METH_VARARGS | METH_KEYWORDS.
+ */
+#line 1617
+/*
+ * Template-generated forwarders (take ... squeeze).  Each one hands `self`,
+ * the positional `args` and the keyword dict `kwds` to
+ * gentype_generic_method together with the method name given as the last
+ * argument.
+ * NOTE(review): gentype_generic_method is defined outside this section;
+ * presumably it calls the named ndarray method on the scalar's array
+ * form -- confirm there.
+ */
+static PyObject *
+gentype_take(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "take");
+}
+
+#line 1617
+static PyObject *
+gentype_getfield(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "getfield");
+}
+
+#line 1617
+static PyObject *
+gentype_put(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "put");
+}
+
+#line 1617
+static PyObject *
+gentype_repeat(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "repeat");
+}
+
+#line 1617
+static PyObject *
+gentype_tofile(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "tofile");
+}
+
+#line 1617
+static PyObject *
+gentype_mean(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "mean");
+}
+
+#line 1617
+static PyObject *
+gentype_trace(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "trace");
+}
+
+#line 1617
+static PyObject *
+gentype_diagonal(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "diagonal");
+}
+
+#line 1617
+static PyObject *
+gentype_clip(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "clip");
+}
+
+#line 1617
+static PyObject *
+gentype_std(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "std");
+}
+
+#line 1617
+static PyObject *
+gentype_var(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "var");
+}
+
+#line 1617
+static PyObject *
+gentype_sum(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "sum");
+}
+
+#line 1617
+static PyObject *
+gentype_cumsum(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "cumsum");
+}
+
+#line 1617
+static PyObject *
+gentype_prod(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "prod");
+}
+
+#line 1617
+static PyObject *
+gentype_cumprod(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "cumprod");
+}
+
+#line 1617
+static PyObject *
+gentype_compress(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "compress");
+}
+
+#line 1617
+static PyObject *
+gentype_sort(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "sort");
+}
+
+#line 1617
+static PyObject *
+gentype_argsort(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "argsort");
+}
+
+#line 1617
+static PyObject *
+gentype_round(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "round");
+}
+
+#line 1617
+static PyObject *
+gentype_argmax(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "argmax");
+}
+
+#line 1617
+static PyObject *
+gentype_argmin(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "argmin");
+}
+
+#line 1617
+static PyObject *
+gentype_max(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "max");
+}
+
+#line 1617
+static PyObject *
+gentype_min(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "min");
+}
+
+#line 1617
+static PyObject *
+gentype_ptp(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "ptp");
+}
+
+#line 1617
+static PyObject *
+gentype_any(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "any");
+}
+
+#line 1617
+static PyObject *
+gentype_all(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "all");
+}
+
+#line 1617
+static PyObject *
+gentype_astype(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "astype");
+}
+
+#line 1617
+static PyObject *
+gentype_resize(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "resize");
+}
+
+#line 1617
+static PyObject *
+gentype_reshape(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "reshape");
+}
+
+#line 1617
+static PyObject *
+gentype_choose(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "choose");
+}
+
+#line 1617
+static PyObject *
+gentype_tostring(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "tostring");
+}
+
+#line 1617
+static PyObject *
+gentype_tobytes(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "tobytes");
+}
+
+#line 1617
+static PyObject *
+gentype_copy(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "copy");
+}
+
+#line 1617
+static PyObject *
+gentype_searchsorted(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "searchsorted");
+}
+
+#line 1617
+static PyObject *
+gentype_view(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "view");
+}
+
+#line 1617
+static PyObject *
+gentype_flatten(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "flatten");
+}
+
+#line 1617
+static PyObject *
+gentype_ravel(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "ravel");
+}
+
+#line 1617
+static PyObject *
+gentype_squeeze(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    return gentype_generic_method(self, args, kwds, "squeeze");
+}
+
+
+
+#line 1629
+static PyObject *
+integertype_dunder_round(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    static char *kwlist[] = {"ndigits", NULL};
+    PyObject *ndigits = Py_None;
+    PyObject *round_args, *rounded, *as_int;
+    int without_digits;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O:__round__", kwlist, &ndigits)) {
+        return NULL;
+    }
+
+    /* Omitting ndigits (or passing None) means "round() with no arguments". */
+    without_digits = (ndigits == Py_None);
+    round_args = without_digits ? PyTuple_Pack(0) : PyTuple_Pack(1, ndigits);
+    if (round_args == NULL) {
+        return NULL;
+    }
+
+    rounded = gentype_round(self, round_args, NULL);
+    Py_DECREF(round_args);
+    if (rounded == NULL || !without_digits) {
+        /* Failure, or an explicit ndigits: hand the result back unchanged. */
+        return rounded;
+    }
+
+    /* Without ndigits the result is converted to a Python int. */
+    as_int = PyNumber_Long(rounded);
+    Py_DECREF(rounded);
+    return as_int;
+}
+
+#line 1629
+static PyObject *
+floatingtype_dunder_round(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    static char *kwlist[] = {"ndigits", NULL};
+    PyObject *ndigits = Py_None;
+    PyObject *round_args, *rounded, *as_int;
+    int without_digits;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O:__round__", kwlist, &ndigits)) {
+        return NULL;
+    }
+
+    /* Omitting ndigits (or passing None) means "round() with no arguments". */
+    without_digits = (ndigits == Py_None);
+    round_args = without_digits ? PyTuple_Pack(0) : PyTuple_Pack(1, ndigits);
+    if (round_args == NULL) {
+        return NULL;
+    }
+
+    rounded = gentype_round(self, round_args, NULL);
+    Py_DECREF(round_args);
+    if (rounded == NULL || !without_digits) {
+        /* Failure, or an explicit ndigits: hand the result back unchanged. */
+        return rounded;
+    }
+
+    /* Without ndigits the result is converted to a Python int. */
+    as_int = PyNumber_Long(rounded);
+    Py_DECREF(rounded);
+    return as_int;
+}
+
+#line 1629
+static PyObject *
+complexfloatingtype_dunder_round(PyObject *self, PyObject *args, PyObject *kwds)
+{
+    static char *kwlist[] = {"ndigits", NULL};
+    PyObject *ndigits = Py_None;
+    PyObject *round_args, *rounded;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O:__round__", kwlist, &ndigits)) {
+        return NULL;
+    }
+
+    /* The complex variant warns first and never converts to a Python int. */
+    if (DEPRECATE("The Python built-in `round` is deprecated for complex "
+                  "scalars, and will raise a `TypeError` in a future release. "
+                  "Use `np.round` or `scalar.round` instead.") < 0) {
+        return NULL;
+    }
+
+    round_args = (ndigits == Py_None) ? PyTuple_Pack(0)
+                                      : PyTuple_Pack(1, ndigits);
+    if (round_args == NULL) {
+        return NULL;
+    }
+
+    rounded = gentype_round(self, round_args, NULL);
+    Py_DECREF(round_args);
+    return rounded;
+}
+
+
+/*
+ * getfield for void scalars: forwards `args`/`kwds` unchanged to
+ * gentype_generic_method under the method name "getfield".
+ */
+static PyObject *
+voidtype_getfield(PyVoidScalarObject *self, PyObject *args, PyObject *kwds)
+{
+    /* Use ndarray's getfield to obtain the field safely */
+    return gentype_generic_method((PyObject *)self, args, kwds, "getfield");
+}
+
+/* Always fails with TypeError; all arguments are ignored. */
+static PyObject *
+gentype_setfield(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args),
+                 PyObject *NPY_UNUSED(kwds))
+{
+    PyErr_SetString(PyExc_TypeError,
+            "Can't set fields in a non-void array scalar.");
+    return NULL;
+}
+
+/*
+ * setfield for void scalars.  args[0] is the value to store; args[1:3]
+ * (together with `kwds`) are passed on to ndarray.getfield to select the
+ * field.  Returns None on success, NULL with an exception set on failure.
+ */
+static PyObject *
+voidtype_setfield(PyVoidScalarObject *self, PyObject *args, PyObject *kwds)
+{
+    /*
+     * We would like to use ndarray's setfield because it performs safety
+     * checks on the field datatypes and because it broadcasts properly.
+     * However, as a special case, void-scalar assignment broadcasts
+     * differently from ndarrays when assigning to an object field: Assignment
+     * to an ndarray object field broadcasts, but assignment to a void-scalar
+     * object-field should not, in order to allow nested ndarrays.
+     * These lines should then behave identically:
+     *
+     *     b = np.zeros(1, dtype=[('x', 'O')])
+     *     b[0]['x'] = arange(3)  # uses voidtype_setfield
+     *     b['x'][0] = arange(3)  # uses ndarray setitem
+     *
+     * Ndarray's setfield would try to broadcast the lhs. Instead we use
+     * ndarray getfield to get the field safely, then setitem with an empty
+     * tuple to set the value without broadcast. Note we also want subarrays to
+     * be set properly, ie
+     *
+     *     a = np.zeros(1, dtype=[('x', 'i', 5)])
+     *     a[0]['x'] = 1
+     *
+     * sets all values to 1. "getfield + setitem with empty tuple" takes
+     * care of both object arrays and subarrays.
+     */
+    PyObject *getfield_args, *value, *arr, *meth, *arr_field, *emptytuple;
+    int status;
+
+    value = PyTuple_GetItem(args, 0);
+    if (value == NULL) {
+        return NULL;
+    }
+    getfield_args = PyTuple_GetSlice(args, 1, 3);
+    if (getfield_args == NULL) {
+        return NULL;
+    }
+
+    /* 1. Convert to 0-d array and use getfield */
+    arr = PyArray_FromScalar((PyObject*)self, NULL);
+    if (arr == NULL) {
+        Py_DECREF(getfield_args);
+        return NULL;
+    }
+    meth = PyObject_GetAttrString(arr, "getfield");
+    if (meth == NULL) {
+        Py_DECREF(getfield_args);
+        Py_DECREF(arr);
+        return NULL;
+    }
+    if (kwds == NULL) {
+        arr_field = PyObject_CallObject(meth, getfield_args);
+    }
+    else {
+        arr_field = PyObject_Call(meth, getfield_args, kwds);
+    }
+    Py_DECREF(getfield_args);
+    Py_DECREF(meth);
+    Py_DECREF(arr);
+
+    if (arr_field == NULL) {
+        return NULL;
+    }
+
+    /* 2. Assign the value using setitem with empty tuple. */
+    emptytuple = PyTuple_New(0);
+    if (emptytuple == NULL) {
+        /* Without this check the Py_DECREF below would hit NULL. */
+        Py_DECREF(arr_field);
+        return NULL;
+    }
+    status = PyObject_SetItem(arr_field, emptytuple, value);
+    Py_DECREF(emptytuple);
+    Py_DECREF(arr_field);
+    if (status < 0) {
+        return NULL;
+    }
+
+    Py_RETURN_NONE;
+}
+
+
+/*
+ * __reduce__ for scalars: returns the 2-tuple (callable, arguments) where
+ * the callable is `numpy.core._multiarray_umath.scalar` and the arguments
+ * are (dtype, payload).  The payload is
+ *   - the wrapped Python object for object scalars,
+ *   - the scalar's array form when the dtype has NPY_LIST_PICKLE set,
+ *   - otherwise the raw bytes exposed through the buffer protocol.
+ *
+ * Returns NULL with an exception set on any failure; no partially filled
+ * tuple is ever returned.
+ */
+static PyObject *
+gentype_reduce(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    PyObject *ret = NULL, *obj = NULL, *mod = NULL, *state = NULL;
+    Py_buffer view;
+    const char *buffer;
+    Py_ssize_t buflen;
+
+    /* Return a tuple of (callable object, arguments) */
+    ret = PyTuple_New(2);
+    if (ret == NULL) {
+        return NULL;
+    }
+
+    if (PyObject_GetBuffer(self, &view, PyBUF_SIMPLE) < 0) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    buffer = view.buf;
+    buflen = view.len;
+    /*
+     * In Python 3 both of the deprecated functions PyObject_AsWriteBuffer and
+     * PyObject_AsReadBuffer that this code replaces release the buffer. It is
+     * up to the object that supplies the buffer to guarantee that the buffer
+     * sticks around after the release.
+     */
+    PyBuffer_Release(&view);
+
+    mod = PyImport_ImportModule("numpy.core._multiarray_umath");
+    if (mod == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    obj = PyObject_GetAttrString(mod, "scalar");
+    Py_DECREF(mod);
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyTuple_SET_ITEM(ret, 0, obj);
+
+    obj = PyObject_GetAttrString((PyObject *)self, "dtype");
+    if (obj == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+
+    /* From here on `obj` (the dtype) is handed over via the "N" format. */
+    if (PyArray_IsScalar(self, Object)) {
+        PyObject *val = PyArrayScalar_VAL(self, Object);
+        state = Py_BuildValue("NO", obj, val);
+    }
+    else if (PyDataType_FLAGCHK((PyArray_Descr *)obj, NPY_LIST_PICKLE)) {
+        /* a structured dtype with an object in a field */
+        PyObject *arr = PyArray_FromScalar(self, NULL);
+        if (arr == NULL) {
+            Py_DECREF(obj);
+            Py_DECREF(ret);
+            return NULL;
+        }
+        /* Use the whole array which handles structured void correctly */
+        state = Py_BuildValue("NN", obj, arr);
+    }
+    else {
+        PyObject *bytes = PyBytes_FromStringAndSize(buffer, buflen);
+        if (bytes == NULL) {
+            Py_DECREF(obj);
+            Py_DECREF(ret);
+            return NULL;
+        }
+        state = Py_BuildValue("NN", obj, bytes);
+    }
+
+    if (state == NULL) {
+        Py_DECREF(ret);
+        return NULL;
+    }
+    PyTuple_SET_ITEM(ret, 1, state);
+    return ret;
+}
+
+/* ignores everything */
+/* Returns None without looking at `self` or `args`. */
+static PyObject *
+gentype_setstate(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args))
+{
+    Py_RETURN_NONE;
+}
+
+static PyObject *
+gentype_dump(PyObject *self, PyObject *args)
+{
+    PyObject *target = NULL;
+
+    /* Exactly one argument: the object PyArray_Dump should write to. */
+    if (!PyArg_ParseTuple(args, "O:dump", &target)) {
+        return NULL;
+    }
+    /* NOTE(review): the 2 is presumably the pickle protocol -- confirm. */
+    if (PyArray_Dump(self, target, 2) < 0) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+static PyObject *
+gentype_dumps(PyObject *self, PyObject *args)
+{
+    /* Takes no arguments; anything else is rejected by the parser. */
+    if (PyArg_ParseTuple(args, "")) {
+        return PyArray_Dumps(self, 2);
+    }
+    return NULL;
+}
+
+
+/* setting flags cannot be done for scalars */
+/* Accepts and ignores every argument; always returns None. */
+static PyObject *
+gentype_setflags(PyObject *NPY_UNUSED(self), PyObject *NPY_UNUSED(args),
+        PyObject *NPY_UNUSED(kwds))
+{
+    Py_RETURN_NONE;
+}
+
+static PyObject *
+numbertype_class_getitem_abc(PyObject *cls, PyObject *args)
+{
+    PyTypeObject *type = (PyTypeObject *)cls;
+    const char *problem = NULL;
+    Py_ssize_t given = 1;
+    int allowed;
+
+    /* A non-tuple subscript counts as a single parameter. */
+    if (PyTuple_Check(args)) {
+        given = PyTuple_Size(args);
+    }
+
+    /* complexfloating should take 2 parameters, all others take 1 */
+    allowed = PyType_IsSubtype(type, &PyComplexFloatingArrType_Type) ? 2 : 1;
+
+    if (given > allowed) {
+        problem = "many";
+    }
+    else if (given == 0) {
+        problem = "few";
+    }
+
+    if (problem != NULL) {
+        return PyErr_Format(PyExc_TypeError,
+                            "Too %s arguments for %s",
+                            problem, type->tp_name);
+    }
+    return Py_GenericAlias(cls, args);
+}
+
+/*
+ * Use for concrete np.number subclasses, making them act as if they
+ * were subtyped from e.g. np.signedinteger[object], thus lacking any
+ * free subscription parameters. Requires python >= 3.9.
+ */
+static PyObject *
+numbertype_class_getitem(PyObject *cls, PyObject *args)
+{
+    /* Subscripting always fails; PyErr_Format returns NULL after raising. */
+    const char *type_name = ((PyTypeObject *)cls)->tp_name;
+
+    return PyErr_Format(PyExc_TypeError,
+                        "There are no type variables left in %s",
+                        type_name);
+}
+
+/*
+ * casting complex numbers (that don't inherit from Python complex)
+ * to Python complex
+ */
+
+#line 1937
+static PyObject *
+cfloat_complex(PyObject *self, PyObject *NPY_UNUSED(args),
+               PyObject *NPY_UNUSED(kwds))
+{
+    /* Widen both single-precision components to double. */
+    double re = PyArrayScalar_VAL(self, CFloat).real;
+    double im = PyArrayScalar_VAL(self, CFloat).imag;
+
+    return PyComplex_FromDoubles(re, im);
+}
+
+#line 1937
+static PyObject *
+clongdouble_complex(PyObject *self, PyObject *NPY_UNUSED(args),
+               PyObject *NPY_UNUSED(kwds))
+{
+    /* Both long-double components are narrowed to double. */
+    double re = PyArrayScalar_VAL(self, CLongDouble).real;
+    double im = PyArrayScalar_VAL(self, CLongDouble).imag;
+
+    return PyComplex_FromDoubles(re, im);
+}
+
+
+#line 1955
+/*
+ * as_integer_ratio() for the half scalar.  Heavily copied from the builtin
+ * float.as_integer_ratio.
+ *
+ * Returns a 2-tuple (numerator, denominator) of Python ints.  Returns NULL
+ * with ValueError set for NaN, OverflowError set for infinities, or with
+ * the error left behind by a failing Python C-API call.
+ */
+static PyObject *
+half_as_integer_ratio(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    PyNumberMethods *long_methods = PyLong_Type.tp_as_number;
+    PyObject *pair = NULL;
+    PyObject *num = NULL;
+    PyObject *den = NULL;
+    PyObject *shift = NULL;
+    PyObject *shifted;
+    PyObject **target;
+    int exponent;
+    /* the half value is processed after conversion to double */
+    npy_double val = npy_half_to_double(PyArrayScalar_VAL(self, Half));
+    npy_double frac;
+
+    if (npy_isnan(val)) {
+        PyErr_SetString(PyExc_ValueError,
+                        "cannot convert NaN to integer ratio");
+        return NULL;
+    }
+    if (!npy_isfinite(val)) {
+        PyErr_SetString(PyExc_OverflowError,
+                        "cannot convert Infinity to integer ratio");
+        return NULL;
+    }
+
+    /* val == frac * 2**exponent exactly */
+    frac = npy_frexpf(val, &exponent);
+
+    /* This relies on the floating point type being base 2 to converge */
+    while (frac != npy_floorf(frac)) {
+        frac *= 2.0;
+        exponent--;
+    }
+
+    /* self == frac * 2**exponent exactly and frac is integral. */
+    num = PyLong_FromDouble(frac);
+    if (num == NULL) {
+        goto finish;
+    }
+    den = PyLong_FromLong(1);
+    if (den == NULL) {
+        goto finish;
+    }
+    shift = PyLong_FromLong(exponent < 0 ? -exponent : exponent);
+    if (shift == NULL) {
+        goto finish;
+    }
+
+    /*
+     * fold in 2**exponent: a positive exponent scales the numerator,
+     * anything else scales the denominator
+     */
+    target = (exponent > 0) ? &num : &den;
+    shifted = long_methods->nb_lshift(*target, shift);
+    if (shifted == NULL) {
+        goto finish;
+    }
+    Py_DECREF(*target);
+    *target = shifted;
+
+    pair = PyTuple_Pack(2, num, den);
+
+finish:
+    /* shared exit for success and failure; pair stays NULL on failure */
+    Py_XDECREF(shift);
+    Py_XDECREF(den);
+    Py_XDECREF(num);
+    return pair;
+}
+
+#line 1955
+/*
+ * as_integer_ratio() for the float scalar.  Heavily copied from the
+ * builtin float.as_integer_ratio.
+ *
+ * Returns a 2-tuple (numerator, denominator) of Python ints.  Returns NULL
+ * with ValueError set for NaN, OverflowError set for infinities, or with
+ * the error left behind by a failing Python C-API call.
+ */
+static PyObject *
+float_as_integer_ratio(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    PyNumberMethods *long_methods = PyLong_Type.tp_as_number;
+    PyObject *pair = NULL;
+    PyObject *num = NULL;
+    PyObject *den = NULL;
+    PyObject *shift = NULL;
+    PyObject *shifted;
+    PyObject **target;
+    int exponent;
+    npy_float val = PyArrayScalar_VAL(self, Float);
+    npy_float frac;
+
+    if (npy_isnan(val)) {
+        PyErr_SetString(PyExc_ValueError,
+                        "cannot convert NaN to integer ratio");
+        return NULL;
+    }
+    if (!npy_isfinite(val)) {
+        PyErr_SetString(PyExc_OverflowError,
+                        "cannot convert Infinity to integer ratio");
+        return NULL;
+    }
+
+    /* val == frac * 2**exponent exactly */
+    frac = npy_frexpf(val, &exponent);
+
+    /* This relies on the floating point type being base 2 to converge */
+    while (frac != npy_floorf(frac)) {
+        frac *= 2.0;
+        exponent--;
+    }
+
+    /* self == frac * 2**exponent exactly and frac is integral. */
+    num = PyLong_FromDouble(frac);
+    if (num == NULL) {
+        goto finish;
+    }
+    den = PyLong_FromLong(1);
+    if (den == NULL) {
+        goto finish;
+    }
+    shift = PyLong_FromLong(exponent < 0 ? -exponent : exponent);
+    if (shift == NULL) {
+        goto finish;
+    }
+
+    /*
+     * fold in 2**exponent: a positive exponent scales the numerator,
+     * anything else scales the denominator
+     */
+    target = (exponent > 0) ? &num : &den;
+    shifted = long_methods->nb_lshift(*target, shift);
+    if (shifted == NULL) {
+        goto finish;
+    }
+    Py_DECREF(*target);
+    *target = shifted;
+
+    pair = PyTuple_Pack(2, num, den);
+
+finish:
+    /* shared exit for success and failure; pair stays NULL on failure */
+    Py_XDECREF(shift);
+    Py_XDECREF(den);
+    Py_XDECREF(num);
+    return pair;
+}
+
+#line 1955
+/*
+ * as_integer_ratio() for the double scalar.  Heavily copied from the
+ * builtin float.as_integer_ratio.
+ *
+ * Returns a 2-tuple (numerator, denominator) of Python ints.  Returns NULL
+ * with ValueError set for NaN, OverflowError set for infinities, or with
+ * the error left behind by a failing Python C-API call.
+ */
+static PyObject *
+double_as_integer_ratio(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    PyNumberMethods *long_methods = PyLong_Type.tp_as_number;
+    PyObject *pair = NULL;
+    PyObject *num = NULL;
+    PyObject *den = NULL;
+    PyObject *shift = NULL;
+    PyObject *shifted;
+    PyObject **target;
+    int exponent;
+    npy_double val = PyArrayScalar_VAL(self, Double);
+    npy_double frac;
+
+    if (npy_isnan(val)) {
+        PyErr_SetString(PyExc_ValueError,
+                        "cannot convert NaN to integer ratio");
+        return NULL;
+    }
+    if (!npy_isfinite(val)) {
+        PyErr_SetString(PyExc_OverflowError,
+                        "cannot convert Infinity to integer ratio");
+        return NULL;
+    }
+
+    /* val == frac * 2**exponent exactly */
+    frac = npy_frexp(val, &exponent);
+
+    /* This relies on the floating point type being base 2 to converge */
+    while (frac != npy_floor(frac)) {
+        frac *= 2.0;
+        exponent--;
+    }
+
+    /* self == frac * 2**exponent exactly and frac is integral. */
+    num = PyLong_FromDouble(frac);
+    if (num == NULL) {
+        goto finish;
+    }
+    den = PyLong_FromLong(1);
+    if (den == NULL) {
+        goto finish;
+    }
+    shift = PyLong_FromLong(exponent < 0 ? -exponent : exponent);
+    if (shift == NULL) {
+        goto finish;
+    }
+
+    /*
+     * fold in 2**exponent: a positive exponent scales the numerator,
+     * anything else scales the denominator
+     */
+    target = (exponent > 0) ? &num : &den;
+    shifted = long_methods->nb_lshift(*target, shift);
+    if (shifted == NULL) {
+        goto finish;
+    }
+    Py_DECREF(*target);
+    *target = shifted;
+
+    pair = PyTuple_Pack(2, num, den);
+
+finish:
+    /* shared exit for success and failure; pair stays NULL on failure */
+    Py_XDECREF(shift);
+    Py_XDECREF(den);
+    Py_XDECREF(num);
+    return pair;
+}
+
+#line 1955
+/*
+ * as_integer_ratio() for the longdouble scalar.  Heavily copied from the
+ * builtin float.as_integer_ratio.
+ *
+ * Returns a 2-tuple (numerator, denominator) of Python ints.  Returns NULL
+ * with ValueError set for NaN, OverflowError set for infinities, or with
+ * the error left behind by a failing conversion or Python C-API call.
+ */
+static PyObject *
+longdouble_as_integer_ratio(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    PyNumberMethods *long_methods = PyLong_Type.tp_as_number;
+    PyObject *pair = NULL;
+    PyObject *num = NULL;
+    PyObject *den = NULL;
+    PyObject *shift = NULL;
+    PyObject *shifted;
+    PyObject **target;
+    int exponent;
+    npy_longdouble val = PyArrayScalar_VAL(self, LongDouble);
+    npy_longdouble frac;
+
+    if (npy_isnan(val)) {
+        PyErr_SetString(PyExc_ValueError,
+                        "cannot convert NaN to integer ratio");
+        return NULL;
+    }
+    if (!npy_isfinite(val)) {
+        PyErr_SetString(PyExc_OverflowError,
+                        "cannot convert Infinity to integer ratio");
+        return NULL;
+    }
+
+    /* val == frac * 2**exponent exactly */
+    frac = npy_frexpl(val, &exponent);
+
+    /* This relies on the floating point type being base 2 to converge */
+    while (frac != npy_floorl(frac)) {
+        frac *= 2.0;
+        exponent--;
+    }
+
+    /*
+     * self == frac * 2**exponent exactly and frac is integral.  The
+     * mantissa goes through the long double specific conversion helper
+     * rather than PyLong_FromDouble.
+     */
+    num = npy_longdouble_to_PyLong(frac);
+    if (num == NULL) {
+        goto finish;
+    }
+    den = PyLong_FromLong(1);
+    if (den == NULL) {
+        goto finish;
+    }
+    shift = PyLong_FromLong(exponent < 0 ? -exponent : exponent);
+    if (shift == NULL) {
+        goto finish;
+    }
+
+    /*
+     * fold in 2**exponent: a positive exponent scales the numerator,
+     * anything else scales the denominator
+     */
+    target = (exponent > 0) ? &num : &den;
+    shifted = long_methods->nb_lshift(*target, shift);
+    if (shifted == NULL) {
+        goto finish;
+    }
+    Py_DECREF(*target);
+    *target = shifted;
+
+    pair = PyTuple_Pack(2, num, den);
+
+finish:
+    /* shared exit for success and failure; pair stays NULL on failure */
+    Py_XDECREF(shift);
+    Py_XDECREF(den);
+    Py_XDECREF(num);
+    return pair;
+}
+
+
+#line 2037
+/*
+ * is_integer() for the half scalar: True when the value is finite and
+ * equal to its own floor, False otherwise (including NaN and infinities).
+ */
+static PyObject *
+half_is_integer(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    /* the half value is examined after conversion to double */
+    npy_double val = npy_half_to_double(PyArrayScalar_VAL(self, Half));
+
+    if (npy_isnan(val) || !npy_isfinite(val)) {
+        Py_RETURN_FALSE;
+    }
+    if (npy_floorf(val) == val) {
+        Py_RETURN_TRUE;
+    }
+    Py_RETURN_FALSE;
+}
+
+#line 2037
+/*
+ * is_integer() for the float scalar: True when the value is finite and
+ * equal to its own floor, False otherwise (including NaN and infinities).
+ */
+static PyObject *
+float_is_integer(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    npy_float val = PyArrayScalar_VAL(self, Float);
+
+    if (npy_isnan(val) || !npy_isfinite(val)) {
+        Py_RETURN_FALSE;
+    }
+    if (npy_floorf(val) == val) {
+        Py_RETURN_TRUE;
+    }
+    Py_RETURN_FALSE;
+}
+
+#line 2037
+/*
+ * is_integer() for the double scalar: True when the value is finite and
+ * equal to its own floor, False otherwise (including NaN and infinities).
+ */
+static PyObject *
+double_is_integer(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    npy_double val = PyArrayScalar_VAL(self, Double);
+
+    if (npy_isnan(val) || !npy_isfinite(val)) {
+        Py_RETURN_FALSE;
+    }
+    if (npy_floor(val) == val) {
+        Py_RETURN_TRUE;
+    }
+    Py_RETURN_FALSE;
+}
+
+#line 2037
+/*
+ * is_integer() for the longdouble scalar: True when the value is finite
+ * and equal to its own floor, False otherwise (including NaN and
+ * infinities).
+ */
+static PyObject *
+longdouble_is_integer(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    npy_longdouble val = PyArrayScalar_VAL(self, LongDouble);
+
+    if (npy_isnan(val) || !npy_isfinite(val)) {
+        Py_RETURN_FALSE;
+    }
+    if (npy_floorl(val) == val) {
+        Py_RETURN_TRUE;
+    }
+    Py_RETURN_FALSE;
+}
+
+
+/* is_integer() for integer scalars: unconditionally True. */
+static PyObject *
+integer_is_integer(PyObject *self, PyObject *NPY_UNUSED(args))
+{
+    Py_INCREF(Py_True);
+    return Py_True;
+}
+
+/*
+ * Method table for the generic scalar type.  Entries whose doc slot is
+ * NULL need their doc-strings filled in on import -- copy from the array
+ * docstrings.
+ */
+static PyMethodDef gentype_methods[] = {
+    {"tolist", (PyCFunction)gentype_tolist, METH_VARARGS, NULL},
+    {"item", (PyCFunction)gentype_item, METH_VARARGS, NULL},
+    {"itemset", (PyCFunction)gentype_itemset, METH_VARARGS, NULL},
+    {"tobytes", (PyCFunction)gentype_tobytes,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"tofile", (PyCFunction)gentype_tofile,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"tostring", (PyCFunction)gentype_tostring,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"byteswap", (PyCFunction)gentype_byteswap,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"astype", (PyCFunction)gentype_astype,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"getfield", (PyCFunction)gentype_getfield,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"setfield", (PyCFunction)gentype_setfield,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"copy", (PyCFunction)gentype_copy,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"resize", (PyCFunction)gentype_resize,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"__array__", (PyCFunction)gentype_getarray,
+     METH_VARARGS, doc_getarray},
+    {"__array_wrap__", (PyCFunction)gentype_wraparray,
+     METH_VARARGS, doc_sc_wraparray},
+
+    /* for the sys module */
+    {"__sizeof__", (PyCFunction)gentype_sizeof, METH_NOARGS, NULL},
+
+    /* for the copy module */
+    {"__copy__", (PyCFunction)gentype___copy__, METH_VARARGS, NULL},
+    {"__deepcopy__", (PyCFunction)gentype___deepcopy__,
+     METH_VARARGS, NULL},
+
+    {"__reduce__", (PyCFunction) gentype_reduce, METH_VARARGS, NULL},
+    /* For consistency does nothing */
+    {"__setstate__", (PyCFunction) gentype_setstate, METH_VARARGS, NULL},
+
+    {"dumps", (PyCFunction) gentype_dumps, METH_VARARGS, NULL},
+    {"dump", (PyCFunction) gentype_dump, METH_VARARGS, NULL},
+
+    /* Methods for array */
+    {"fill", (PyCFunction)gentype_fill, METH_VARARGS, NULL},
+    {"transpose", (PyCFunction)gentype_transpose, METH_VARARGS, NULL},
+    {"take", (PyCFunction)gentype_take,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"put", (PyCFunction)gentype_put,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"repeat", (PyCFunction)gentype_repeat,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"choose", (PyCFunction)gentype_choose,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"sort", (PyCFunction)gentype_sort,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"argsort", (PyCFunction)gentype_argsort,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"searchsorted", (PyCFunction)gentype_searchsorted,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"argmax", (PyCFunction)gentype_argmax,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"argmin", (PyCFunction)gentype_argmin,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"reshape", (PyCFunction)gentype_reshape,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"squeeze", (PyCFunction)gentype_squeeze,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"view", (PyCFunction)gentype_view,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"swapaxes", (PyCFunction)gentype_swapaxes, METH_VARARGS, NULL},
+    {"max", (PyCFunction)gentype_max,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"min", (PyCFunction)gentype_min,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"ptp", (PyCFunction)gentype_ptp,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"mean", (PyCFunction)gentype_mean,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"trace", (PyCFunction)gentype_trace,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"diagonal", (PyCFunction)gentype_diagonal,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"clip", (PyCFunction)gentype_clip,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"conj", (PyCFunction)gentype_conj, METH_VARARGS, NULL},
+    {"conjugate", (PyCFunction)gentype_conjugate, METH_VARARGS, NULL},
+    {"nonzero", (PyCFunction)gentype_nonzero, METH_VARARGS, NULL},
+    {"std", (PyCFunction)gentype_std,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"var", (PyCFunction)gentype_var,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"sum", (PyCFunction)gentype_sum,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"cumsum", (PyCFunction)gentype_cumsum,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"prod", (PyCFunction)gentype_prod,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"cumprod", (PyCFunction)gentype_cumprod,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"all", (PyCFunction)gentype_all,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"any", (PyCFunction)gentype_any,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"compress", (PyCFunction)gentype_compress,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"flatten", (PyCFunction)gentype_flatten,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"ravel", (PyCFunction)gentype_ravel,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"round", (PyCFunction)gentype_round,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    /* For the format function */
+    {"__format__", gentype_format, METH_VARARGS,
+     "NumPy array scalar formatter"},
+    {"setflags", (PyCFunction)gentype_setflags,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"newbyteorder", (PyCFunction)gentype_newbyteorder,
+     METH_VARARGS, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+
+/* Read-only attributes (no setter is registered) of the void scalar. */
+static PyGetSetDef voidtype_getsets[] = {
+    {"flags", (getter)voidtype_flags_get, (setter)0,
+     "integer value of flags", NULL},
+    {"dtype", (getter)voidtype_dtypedescr_get, (setter)0,
+     "dtype object", NULL},
+    {"base", (getter)voidtype_base_get, (setter)0,
+     "base object", NULL},
+    {NULL, NULL, NULL, NULL, NULL} /* sentinel */
+};
+
+/* Field access methods specific to the void scalar. */
+static PyMethodDef voidtype_methods[] = {
+    {"getfield", (PyCFunction)voidtype_getfield,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"setfield", (PyCFunction)voidtype_setfield,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+/* Read-only rational-number style attributes of integer scalars. */
+static PyGetSetDef inttype_getsets[] = {
+    {"numerator", (getter)inttype_numerator_get, (setter)0,
+     "numerator of value (the value itself)", NULL},
+    {"denominator", (getter)inttype_denominator_get, (setter)0,
+     "denominator of value (1)", NULL},
+    {NULL, NULL, NULL, NULL, NULL} /* sentinel */
+};
+
+/* Only the argument-count-checking subscription hook is registered here. */
+static PyMethodDef numbertype_methods[] = {
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem_abc,
+     METH_CLASS | METH_O, NULL},
+    {NULL, NULL, 0, NULL}  /* sentinel */
+};
+
+#line 2322
+/* Methods of the cfloat scalar: __complex__ plus the typing hook. */
+static PyMethodDef cfloattype_methods[] = {
+    {"__complex__", (PyCFunction)cfloat_complex,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+#line 2322
+/* Methods of the clongdouble scalar: __complex__ plus the typing hook. */
+static PyMethodDef clongdoubletype_methods[] = {
+    {"__complex__", (PyCFunction)clongdouble_complex,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+
+#line 2337
+/* Methods registered for the floating scalar type. */
+static PyMethodDef floatingtype_methods[] = {
+    /* Hook for the round() builtin */
+    {"__round__", (PyCFunction)floatingtype_dunder_round,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+#line 2337
+/* Methods registered for the complexfloating scalar type. */
+static PyMethodDef complexfloatingtype_methods[] = {
+    /* Hook for the round() builtin */
+    {"__round__", (PyCFunction)complexfloatingtype_dunder_round,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+
+/* Methods registered for the integer scalar type. */
+static PyMethodDef integertype_methods[] = {
+    /* Hook for the round() builtin */
+    {"__round__", (PyCFunction)integertype_dunder_round,
+     METH_VARARGS | METH_KEYWORDS, NULL},
+    {"is_integer", (PyCFunction)integer_is_integer, METH_NOARGS, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+#line 2360
+/* Methods of the half scalar: ratio/integer queries and the typing hook. */
+static PyMethodDef halftype_methods[] = {
+    {"as_integer_ratio", (PyCFunction)half_as_integer_ratio,
+     METH_NOARGS, NULL},
+    {"is_integer", (PyCFunction)half_is_integer, METH_NOARGS, NULL},
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+#line 2360
+/* Methods of the float scalar: ratio/integer queries and the typing hook. */
+static PyMethodDef floattype_methods[] = {
+    {"as_integer_ratio", (PyCFunction)float_as_integer_ratio,
+     METH_NOARGS, NULL},
+    {"is_integer", (PyCFunction)float_is_integer, METH_NOARGS, NULL},
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+#line 2360
+/* Methods of the double scalar: ratio/integer queries and the typing hook. */
+static PyMethodDef doubletype_methods[] = {
+    {"as_integer_ratio", (PyCFunction)double_as_integer_ratio,
+     METH_NOARGS, NULL},
+    {"is_integer", (PyCFunction)double_is_integer, METH_NOARGS, NULL},
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+#line 2360
+/*
+ * Methods of the longdouble scalar: ratio/integer queries and the typing
+ * hook.
+ */
+static PyMethodDef longdoubletype_methods[] = {
+    {"as_integer_ratio", (PyCFunction)longdouble_as_integer_ratio,
+     METH_NOARGS, NULL},
+    {"is_integer", (PyCFunction)longdouble_is_integer, METH_NOARGS, NULL},
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+
+#line 2378
+/* The timedelta scalar only registers the typing hook. */
+static PyMethodDef timedeltatype_methods[] = {
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+#line 2378
+/* The cdouble scalar only registers the typing hook. */
+static PyMethodDef cdoubletype_methods[] = {
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+
+#line 2391
+/* Methods of the byte scalar: the typing hook and bit_count. */
+static PyMethodDef bytetype_methods[] = {
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {"bit_count", (PyCFunction)npy_byte_bit_count, METH_NOARGS, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+#line 2391
+/* Methods of the ubyte scalar: the typing hook and bit_count. */
+static PyMethodDef ubytetype_methods[] = {
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {"bit_count", (PyCFunction)npy_ubyte_bit_count, METH_NOARGS, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+#line 2391
+/* Methods of the short scalar: the typing hook and bit_count. */
+static PyMethodDef shorttype_methods[] = {
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {"bit_count", (PyCFunction)npy_short_bit_count, METH_NOARGS, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+#line 2391
+/* Methods of the ushort scalar: the typing hook and bit_count. */
+static PyMethodDef ushorttype_methods[] = {
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {"bit_count", (PyCFunction)npy_ushort_bit_count, METH_NOARGS, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+#line 2391
+/* Methods of the int scalar: the typing hook and bit_count. */
+static PyMethodDef inttype_methods[] = {
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {"bit_count", (PyCFunction)npy_int_bit_count, METH_NOARGS, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+#line 2391
+/* Methods of the uint scalar: the typing hook and bit_count. */
+static PyMethodDef uinttype_methods[] = {
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {"bit_count", (PyCFunction)npy_uint_bit_count, METH_NOARGS, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+#line 2391
+/* Methods of the long scalar: the typing hook and bit_count. */
+static PyMethodDef longtype_methods[] = {
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {"bit_count", (PyCFunction)npy_long_bit_count, METH_NOARGS, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+#line 2391
+/* Methods of the ulong scalar: the typing hook and bit_count. */
+static PyMethodDef ulongtype_methods[] = {
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {"bit_count", (PyCFunction)npy_ulong_bit_count, METH_NOARGS, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+#line 2391
+/* Methods of the longlong scalar: the typing hook and bit_count. */
+static PyMethodDef longlongtype_methods[] = {
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {"bit_count", (PyCFunction)npy_longlong_bit_count, METH_NOARGS, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+#line 2391
+/* Methods of the ulonglong scalar: the typing hook and bit_count. */
+static PyMethodDef ulonglongtype_methods[] = {
+    /* for typing */
+    {"__class_getitem__", (PyCFunction)numbertype_class_getitem,
+     METH_CLASS | METH_O, NULL},
+    {"bit_count", (PyCFunction)npy_ulonglong_bit_count,
+     METH_NOARGS, NULL},
+    {NULL, NULL, 0, NULL} /* sentinel */
+};
+
+
+
+/************* As_mapping functions for void array scalar ************/
+
+/*
+ * Length of a void scalar: the number of fields of its descriptor, or 0
+ * when the descriptor has no fields.
+ */
+static Py_ssize_t
+voidtype_length(PyVoidScalarObject *self)
+{
+    if (PyDataType_HASFIELDS(self->descr)) {
+        /* one entry in `names` per field */
+        return (Py_ssize_t) PyTuple_GET_SIZE(self->descr->names);
+    }
+    return 0;
+}
+
+static PyObject *
+voidtype_subscript(PyVoidScalarObject *self, PyObject *ind);
+
+/*
+ * Integer item access for structured void scalars.
+ *
+ * A negative `n` is interpreted relative to the number of fields.  The
+ * name of field number `n` is then looked up and the access is delegated
+ * to voidtype_subscript.
+ *
+ * Returns the result of voidtype_subscript, or NULL with IndexError set
+ * when the scalar has no fields or `n` is out of range.
+ */
+static PyObject *
+voidtype_item(PyVoidScalarObject *self, Py_ssize_t n)
+{
+    npy_intp m;
+    PyObject *flist=NULL;
+
+    if (!(PyDataType_HASFIELDS(self->descr))) {
+        PyErr_SetString(PyExc_IndexError,
+                "can't index void scalar without fields");
+        return NULL;
+    }
+    flist = self->descr->names;
+    m = PyTuple_GET_SIZE(flist);
+    if (n < 0) {
+        n += m;
+    }
+    if (n < 0 || n >= m) {
+        /*
+         * Py_ssize_t can be wider than int: format it with %zd instead of
+         * truncating it through an (int) cast, which misreports large
+         * indices.
+         */
+        PyErr_Format(PyExc_IndexError, "invalid index (%zd)", n);
+        return NULL;
+    }
+
+    /* the range check above guarantees PyTuple_GetItem succeeds */
+    return voidtype_subscript(self, PyTuple_GetItem(flist, n));
+}
+
+/*
+ * get field by name or number
+ *
+ * For scalars with fields, an index convertible to an integer is handled
+ * by voidtype_item.  Every other index is applied to a 0-d array created
+ * from the scalar: Ellipsis returns that array itself, anything else goes
+ * through ndarray subscripting and PyArray_Return.
+ *
+ * Returns NULL with an exception set on failure.
+ */
+static PyObject *
+voidtype_subscript(PyVoidScalarObject *self, PyObject *ind)
+{
+    npy_intp n;
+    PyObject *ret, *res;
+
+    /* structured voids will accept an integer index */
+    if (PyDataType_HASFIELDS(self->descr)) {
+        n = PyArray_PyIntAsIntp(ind);
+        if (!error_converting(n)) {
+            return voidtype_item(self, (Py_ssize_t)n);
+        }
+        /* not an integer: discard the conversion error and fall through */
+        PyErr_Clear();
+    }
+
+    res = PyArray_FromScalar((PyObject*)self, NULL);
+    if (res == NULL) {
+        /*
+         * Without this check a failed conversion would be passed on to
+         * array_subscript and Py_DECREF below, dereferencing NULL.
+         */
+        return NULL;
+    }
+
+    /* ellipsis should return 0d array */
+    if(ind == Py_Ellipsis){
+        return res;
+    }
+
+    /*
+     * other cases (field names, empty tuple) will return either
+     * scalar or non-0d array. Compute this using ndarray subscript.
+     */
+    ret = array_subscript((PyArrayObject *)res, ind);
+    Py_DECREF(res);
+    return PyArray_Return((PyArrayObject*)ret);
+}
+
+static int
+voidtype_ass_subscript(PyVoidScalarObject *self, PyObject *ind, PyObject *val);
+
+/*
+ * Integer item assignment for structured void scalars.
+ *
+ * A negative `n` is interpreted relative to the number of fields.  The
+ * name of field number `n` is then looked up and the assignment is
+ * delegated to voidtype_ass_subscript.
+ *
+ * Returns the result of voidtype_ass_subscript, or -1 with IndexError set
+ * when the scalar has no fields or `n` is out of range.
+ */
+static int
+voidtype_ass_item(PyVoidScalarObject *self, Py_ssize_t n, PyObject *val)
+{
+    npy_intp m;
+    PyObject *flist=NULL;
+
+    if (!(PyDataType_HASFIELDS(self->descr))) {
+        PyErr_SetString(PyExc_IndexError,
+                "can't index void scalar without fields");
+        return -1;
+    }
+
+    flist = self->descr->names;
+    m = PyTuple_GET_SIZE(flist);
+    if (n < 0) {
+        n += m;
+    }
+    if (n < 0 || n >= m) {
+        /*
+         * Py_ssize_t can be wider than int: format it with %zd instead of
+         * truncating it through an (int) cast, which misreports large
+         * indices.
+         */
+        PyErr_Format(PyExc_IndexError, "invalid index (%zd)", n);
+        return -1;
+    }
+
+    /* the range check above guarantees PyTuple_GetItem succeeds */
+    return voidtype_ass_subscript(self, PyTuple_GetItem(flist, n), val);
+}
+
+/*
+ * Assign `val` to the field of a structured void scalar selected by `ind`.
+ *
+ * `ind` may be a field name (str) or something convertible to an integer
+ * field position, which is forwarded to voidtype_ass_item.
+ *
+ * Returns 0 on success and -1 with an exception set on failure:
+ * IndexError when the scalar has no fields or `ind` is not usable as an
+ * index, ValueError when `val` is NULL (deletion), or the error raised by
+ * one of the underlying calls.
+ */
+static int
+voidtype_ass_subscript(PyVoidScalarObject *self, PyObject *ind, PyObject *val)
+{
+    npy_intp n;
+    char *msg = "invalid index";
+    PyObject *args;
+
+    if (!PyDataType_HASFIELDS(self->descr)) {
+        PyErr_SetString(PyExc_IndexError,
+                "can't index void scalar without fields");
+        return -1;
+    }
+
+    if (!val) {
+        PyErr_SetString(PyExc_ValueError,
+                "cannot delete scalar field");
+        return -1;
+    }
+
+    if (PyUnicode_Check(ind)) {
+        /*
+         * Much like in voidtype_setfield, we cannot simply use ndarray's
+         * __setitem__ since assignment to void scalars should not broadcast
+         * the lhs. Instead we get a view through __getitem__ and then assign
+         * the value using setitem with an empty tuple (which treats both
+         * object arrays and subarrays properly).
+         *
+         * Also we do not want to use voidtype_setfield here, since we do
+         * not need to do the (slow) view safety checks, since we already
+         * know the dtype/offset are safe.
+         */
+
+        PyObject *arr, *arr_field, *meth, *emptytuple;
+
+        /* 1. Convert to 0-d array and use getitem */
+        arr = PyArray_FromScalar((PyObject*)self, NULL);
+        if (arr == NULL) {
+            return -1;
+        }
+        meth = PyObject_GetAttrString(arr, "__getitem__");
+        if (meth == NULL) {
+            Py_DECREF(arr);
+            return -1;
+        }
+        args = Py_BuildValue("(O)", ind);
+        if (args == NULL) {
+            /*
+             * A NULL args would make PyObject_CallObject call
+             * __getitem__ without arguments and crash Py_DECREF below.
+             */
+            Py_DECREF(meth);
+            Py_DECREF(arr);
+            return -1;
+        }
+        arr_field = PyObject_CallObject(meth, args);
+        Py_DECREF(meth);
+        Py_DECREF(arr);
+        Py_DECREF(args);
+
+        if(arr_field == NULL){
+            return -1;
+        }
+
+        /* 2. Assign the value using setitem with empty tuple. */
+        emptytuple = PyTuple_New(0);
+        if (emptytuple == NULL) {
+            /* do not hand a NULL key to PyObject_SetItem / Py_DECREF */
+            Py_DECREF(arr_field);
+            return -1;
+        }
+        if (PyObject_SetItem(arr_field, emptytuple, val) < 0) {
+            Py_DECREF(arr_field);
+            Py_DECREF(emptytuple);
+            return -1;
+        }
+        Py_DECREF(emptytuple);
+        Py_DECREF(arr_field);
+        return 0;
+    }
+
+    /* try to convert it to a number */
+    n = PyArray_PyIntAsIntp(ind);
+    if (error_converting(n)) {
+        goto fail;
+    }
+    return voidtype_ass_item(self, (Py_ssize_t)n, val);
+
+fail:
+    /* replaces the conversion error with a generic IndexError */
+    PyErr_SetString(PyExc_IndexError, msg);
+    return -1;
+}
+
+/*
+ * Mapping protocol slots for the void scalar: length, item lookup and
+ * item assignment are served by the voidtype_* functions named below.
+ */
+static PyMappingMethods voidtype_as_mapping = {
+    .mp_length = (lenfunc)voidtype_length,
+    .mp_subscript = (binaryfunc)voidtype_subscript,
+    .mp_ass_subscript = (objobjargproc)voidtype_ass_subscript,
+};
+
+
+/*
+ * Sequence protocol slots for the void scalar: length plus integer item
+ * access and assignment.  Slots that are not named here stay
+ * zero-initialized.
+ */
+static PySequenceMethods voidtype_as_sequence = {
+    .sq_length = (lenfunc)voidtype_length,
+    .sq_item = (ssizeargfunc)voidtype_item,
+    .sq_ass_item = (ssizeobjargproc)voidtype_ass_item,
+};
+
+
+/*
+ * Simple buffer export for user defined subclasses of `np.generic`.  All
+ * other scalar types override the buffer export.
+ *
+ * Requests asking for a format or for a writable buffer are rejected, and
+ * so are scalars whose descriptor is not user defined.  Returns 0 with
+ * `view` filled in on success and -1 with an exception set on failure.
+ */
+static int
+gentype_arrtype_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    PyArray_Descr *descr;
+
+    if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) {
+        PyErr_Format(PyExc_TypeError,
+                "NumPy scalar %R can only exported as a buffer without format.",
+                self);
+        return -1;
+    }
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+
+    descr = PyArray_DescrFromScalar(self);
+    if (descr == NULL) {
+        return -1;
+    }
+
+    if (PyDataType_ISUSERDEF(descr)) {
+        /* expose the scalar's value as a zero-dimensional buffer */
+        view->buf = scalar_value(self, descr);
+        view->len = descr->elsize;
+        view->itemsize = descr->elsize;
+        view->ndim = 0;
+        view->shape = NULL;
+        view->strides = NULL;
+        view->suboffsets = NULL;
+        view->format = NULL;
+        /* assume general (user) scalars are readonly. */
+        view->readonly = 1;
+        /* the view holds its own reference to the exporting scalar */
+        Py_INCREF(self);
+        view->obj = self;
+        Py_DECREF(descr);
+        return 0;
+    }
+
+    /* This path would also reject the (hopefully) impossible "object" */
+    PyErr_Format(PyExc_TypeError,
+            "user-defined scalar %R registered for built-in dtype %S? "
+            "This should be impossible.",
+            self, descr);
+    Py_DECREF(descr);
+    return -1;
+}
+
+
+/* Buffer protocol slot table using gentype_arrtype_getbuffer as exporter. */
+static PyBufferProcs gentype_arrtype_as_buffer = {
+    .bf_getbuffer = (getbufferproc)gentype_arrtype_getbuffer,
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyBoolScalarObject`: exposes `obval` as a read-only,
+ * 0-d buffer.  The format string ("?") is only provided when PyBUF_FORMAT
+ * is requested.  Returns 0 on success; a writable request fails with
+ * BufferError and -1, leaving `view->obj` NULL as the buffer protocol
+ * requires of a failing exporter.
+ */
+static int
+bool_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyBoolScalarObject *scalar = (PyBoolScalarObject *)self;
+
+    static char fmt[3] = "?";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using bool_getbuffer as exporter. */
+static PyBufferProcs bool_arrtype_as_buffer = {
+    .bf_getbuffer = bool_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyByteScalarObject`: exposes `obval` as a read-only,
+ * 0-d buffer.  The format string ("b") is only provided when PyBUF_FORMAT
+ * is requested.  Returns 0 on success; a writable request fails with
+ * BufferError and -1, leaving `view->obj` NULL as the buffer protocol
+ * requires of a failing exporter.
+ */
+static int
+byte_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyByteScalarObject *scalar = (PyByteScalarObject *)self;
+
+    static char fmt[3] = "b";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using byte_getbuffer as exporter. */
+static PyBufferProcs byte_arrtype_as_buffer = {
+    .bf_getbuffer = byte_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyShortScalarObject`: exposes `obval` as a read-only,
+ * 0-d buffer.  The format string ("h") is only provided when PyBUF_FORMAT
+ * is requested.  Returns 0 on success; a writable request fails with
+ * BufferError and -1, leaving `view->obj` NULL as the buffer protocol
+ * requires of a failing exporter.
+ */
+static int
+short_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyShortScalarObject *scalar = (PyShortScalarObject *)self;
+
+    static char fmt[3] = "h";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using short_getbuffer as exporter. */
+static PyBufferProcs short_arrtype_as_buffer = {
+    .bf_getbuffer = short_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyIntScalarObject`: exposes `obval` as a read-only,
+ * 0-d buffer.  The format string ("i") is only provided when PyBUF_FORMAT
+ * is requested.  Returns 0 on success; a writable request fails with
+ * BufferError and -1, leaving `view->obj` NULL as the buffer protocol
+ * requires of a failing exporter.
+ */
+static int
+int_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyIntScalarObject *scalar = (PyIntScalarObject *)self;
+
+    static char fmt[3] = "i";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using int_getbuffer as exporter. */
+static PyBufferProcs int_arrtype_as_buffer = {
+    .bf_getbuffer = int_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyLongScalarObject`: exposes `obval` as a read-only,
+ * 0-d buffer.  The format string ("l") is only provided when PyBUF_FORMAT
+ * is requested.  Returns 0 on success; a writable request fails with
+ * BufferError and -1, leaving `view->obj` NULL as the buffer protocol
+ * requires of a failing exporter.
+ */
+static int
+long_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyLongScalarObject *scalar = (PyLongScalarObject *)self;
+
+    static char fmt[3] = "l";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using long_getbuffer as exporter. */
+static PyBufferProcs long_arrtype_as_buffer = {
+    .bf_getbuffer = long_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyLongLongScalarObject`: exposes `obval` as a
+ * read-only, 0-d buffer.  The format string ("q") is only provided when
+ * PyBUF_FORMAT is requested.  Returns 0 on success; a writable request
+ * fails with BufferError and -1, leaving `view->obj` NULL as the buffer
+ * protocol requires of a failing exporter.
+ */
+static int
+longlong_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyLongLongScalarObject *scalar = (PyLongLongScalarObject *)self;
+
+    static char fmt[3] = "q";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using longlong_getbuffer as exporter. */
+static PyBufferProcs longlong_arrtype_as_buffer = {
+    .bf_getbuffer = longlong_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyUByteScalarObject`: exposes `obval` as a read-only,
+ * 0-d buffer.  The format string ("B") is only provided when PyBUF_FORMAT
+ * is requested.  Returns 0 on success; a writable request fails with
+ * BufferError and -1, leaving `view->obj` NULL as the buffer protocol
+ * requires of a failing exporter.
+ */
+static int
+ubyte_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyUByteScalarObject *scalar = (PyUByteScalarObject *)self;
+
+    static char fmt[3] = "B";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using ubyte_getbuffer as exporter. */
+static PyBufferProcs ubyte_arrtype_as_buffer = {
+    .bf_getbuffer = ubyte_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyUShortScalarObject`: exposes `obval` as a read-only,
+ * 0-d buffer.  The format string ("H") is only provided when PyBUF_FORMAT
+ * is requested.  Returns 0 on success; a writable request fails with
+ * BufferError and -1, leaving `view->obj` NULL as the buffer protocol
+ * requires of a failing exporter.
+ */
+static int
+ushort_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyUShortScalarObject *scalar = (PyUShortScalarObject *)self;
+
+    static char fmt[3] = "H";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using ushort_getbuffer as exporter. */
+static PyBufferProcs ushort_arrtype_as_buffer = {
+    .bf_getbuffer = ushort_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyUIntScalarObject`: exposes `obval` as a read-only,
+ * 0-d buffer.  The format string ("I") is only provided when PyBUF_FORMAT
+ * is requested.  Returns 0 on success; a writable request fails with
+ * BufferError and -1, leaving `view->obj` NULL as the buffer protocol
+ * requires of a failing exporter.
+ */
+static int
+uint_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyUIntScalarObject *scalar = (PyUIntScalarObject *)self;
+
+    static char fmt[3] = "I";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using uint_getbuffer as exporter. */
+static PyBufferProcs uint_arrtype_as_buffer = {
+    .bf_getbuffer = uint_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyULongScalarObject`: exposes `obval` as a read-only,
+ * 0-d buffer.  The format string ("L") is only provided when PyBUF_FORMAT
+ * is requested.  Returns 0 on success; a writable request fails with
+ * BufferError and -1, leaving `view->obj` NULL as the buffer protocol
+ * requires of a failing exporter.
+ */
+static int
+ulong_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyULongScalarObject *scalar = (PyULongScalarObject *)self;
+
+    static char fmt[3] = "L";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using ulong_getbuffer as exporter. */
+static PyBufferProcs ulong_arrtype_as_buffer = {
+    .bf_getbuffer = ulong_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyULongLongScalarObject`: exposes `obval` as a
+ * read-only, 0-d buffer.  The format string ("Q") is only provided when
+ * PyBUF_FORMAT is requested.  Returns 0 on success; a writable request
+ * fails with BufferError and -1, leaving `view->obj` NULL as the buffer
+ * protocol requires of a failing exporter.
+ */
+static int
+ulonglong_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyULongLongScalarObject *scalar = (PyULongLongScalarObject *)self;
+
+    static char fmt[3] = "Q";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using ulonglong_getbuffer as exporter. */
+static PyBufferProcs ulonglong_arrtype_as_buffer = {
+    .bf_getbuffer = ulonglong_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyHalfScalarObject`: exposes `obval` as a read-only,
+ * 0-d buffer.  The format string ("e") is only provided when PyBUF_FORMAT
+ * is requested.  Returns 0 on success; a writable request fails with
+ * BufferError and -1, leaving `view->obj` NULL as the buffer protocol
+ * requires of a failing exporter.
+ */
+static int
+half_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyHalfScalarObject *scalar = (PyHalfScalarObject *)self;
+
+    static char fmt[3] = "e";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using half_getbuffer as exporter. */
+static PyBufferProcs half_arrtype_as_buffer = {
+    .bf_getbuffer = half_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyFloatScalarObject`: exposes `obval` as a read-only,
+ * 0-d buffer.  The format string ("f") is only provided when PyBUF_FORMAT
+ * is requested.  Returns 0 on success; a writable request fails with
+ * BufferError and -1, leaving `view->obj` NULL as the buffer protocol
+ * requires of a failing exporter.
+ */
+static int
+float_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyFloatScalarObject *scalar = (PyFloatScalarObject *)self;
+
+    static char fmt[3] = "f";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using float_getbuffer as exporter. */
+static PyBufferProcs float_arrtype_as_buffer = {
+    .bf_getbuffer = float_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyDoubleScalarObject`: exposes `obval` as a read-only,
+ * 0-d buffer.  The format string ("d") is only provided when PyBUF_FORMAT
+ * is requested.  Returns 0 on success; a writable request fails with
+ * BufferError and -1, leaving `view->obj` NULL as the buffer protocol
+ * requires of a failing exporter.
+ */
+static int
+double_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyDoubleScalarObject *scalar = (PyDoubleScalarObject *)self;
+
+    static char fmt[3] = "d";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using double_getbuffer as exporter. */
+static PyBufferProcs double_arrtype_as_buffer = {
+    .bf_getbuffer = double_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyLongDoubleScalarObject`: exposes `obval` as a
+ * read-only, 0-d buffer.  The format string ("g") is only provided when
+ * PyBUF_FORMAT is requested.  Returns 0 on success; a writable request
+ * fails with BufferError and -1, leaving `view->obj` NULL as the buffer
+ * protocol requires of a failing exporter.
+ */
+static int
+longdouble_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyLongDoubleScalarObject *scalar = (PyLongDoubleScalarObject *)self;
+
+    static char fmt[3] = "g";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using longdouble_getbuffer as exporter. */
+static PyBufferProcs longdouble_arrtype_as_buffer = {
+    .bf_getbuffer = longdouble_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyCFloatScalarObject`: exposes `obval` as a read-only,
+ * 0-d buffer.  The format string ("Zf") is only provided when PyBUF_FORMAT
+ * is requested.  Returns 0 on success; a writable request fails with
+ * BufferError and -1, leaving `view->obj` NULL as the buffer protocol
+ * requires of a failing exporter.
+ */
+static int
+cfloat_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyCFloatScalarObject *scalar = (PyCFloatScalarObject *)self;
+
+    static char fmt[3] = "Zf";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using cfloat_getbuffer as exporter. */
+static PyBufferProcs cfloat_arrtype_as_buffer = {
+    .bf_getbuffer = cfloat_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyCDoubleScalarObject`: exposes `obval` as a
+ * read-only, 0-d buffer.  The format string ("Zd") is only provided when
+ * PyBUF_FORMAT is requested.  Returns 0 on success; a writable request
+ * fails with BufferError and -1, leaving `view->obj` NULL as the buffer
+ * protocol requires of a failing exporter.
+ */
+static int
+cdouble_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyCDoubleScalarObject *scalar = (PyCDoubleScalarObject *)self;
+
+    static char fmt[3] = "Zd";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using cdouble_getbuffer as exporter. */
+static PyBufferProcs cdouble_arrtype_as_buffer = {
+    .bf_getbuffer = cdouble_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2660
+
+/*
+ * Buffer export for `PyCLongDoubleScalarObject`: exposes `obval` as a
+ * read-only, 0-d buffer.  The format string ("Zg") is only provided when
+ * PyBUF_FORMAT is requested.  Returns 0 on success; a writable request
+ * fails with BufferError and -1, leaving `view->obj` NULL as the buffer
+ * protocol requires of a failing exporter.
+ */
+static int
+clongdouble_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyCLongDoubleScalarObject *scalar = (PyCLongDoubleScalarObject *)self;
+
+    static char fmt[3] = "Zg";
+
+    view->ndim = 0;
+    view->len = sizeof(scalar->obval);
+    view->itemsize = sizeof(scalar->obval);
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+    view->buf = &(scalar->obval);
+
+    /* The format is only handed out when the consumer asked for it. */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? fmt : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using clongdouble_getbuffer as exporter. */
+static PyBufferProcs clongdouble_arrtype_as_buffer = {
+    .bf_getbuffer = clongdouble_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+
+/*
+ * Buffer export for `PyUnicodeScalarObject`.
+ *
+ * Exposes `scalar->obval` as a read-only, 0-d buffer whose single item is
+ * `4 * length` bytes, with `length` taken from PyUnicode_GetLength.  When
+ * PyBUF_FORMAT is requested the format is "<length>w"; the string is built
+ * once and cached in `scalar->buffer_fmt`.
+ *
+ * Returns 0 on success and -1 on failure.  Every failure path leaves
+ * `view->obj` NULL, as the buffer protocol requires, and has an exception
+ * set (the format allocation failure now raises MemoryError explicitly,
+ * since PyMem_Malloc does not set one itself).
+ */
+static int
+unicode_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyUnicodeScalarObject *scalar = (PyUnicodeScalarObject *)self;
+    Py_ssize_t length = PyUnicode_GetLength(self);
+
+    view->ndim = 0;
+    view->len = length * 4;
+    view->itemsize = length * 4;
+    view->shape = NULL;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+
+    if (scalar->obval == NULL) {
+        /*
+         * Unicode may not have the representation available, `scalar_value`
+         * ensures materialization.
+         */
+        PyArray_Descr *descr = PyArray_DescrFromType(NPY_UNICODE);
+        if (descr == NULL) {
+            /* Do not hand a NULL descriptor to scalar_value/Py_DECREF. */
+            Py_SETREF(view->obj, NULL);
+            return -1;
+        }
+        scalar_value(self, descr);
+        Py_DECREF(descr);
+        if (scalar->obval == NULL) {
+            /* allocating memory failed */
+            Py_SETREF(view->obj, NULL);
+            return -1;
+        }
+    }
+    view->buf = scalar->obval;
+
+    if ((flags & PyBUF_FORMAT) != PyBUF_FORMAT) {
+        /* It is unnecessary to find the correct format */
+        view->format = NULL;
+        return 0;
+    }
+
+    if (scalar->buffer_fmt != NULL) {
+        /* Reuse the format string cached by an earlier export. */
+        view->format = scalar->buffer_fmt;
+    }
+    else {
+        scalar->buffer_fmt = PyMem_Malloc(22);
+        if (scalar->buffer_fmt == NULL) {
+            /* PyMem_Malloc does not set an exception on failure. */
+            PyErr_NoMemory();
+            Py_SETREF(view->obj, NULL);
+            return -1;
+        }
+        PyOS_snprintf(scalar->buffer_fmt, 22, "%" NPY_INTP_FMT "w", length);
+        view->format = scalar->buffer_fmt;
+    }
+
+    return 0;
+}
+
+/* Buffer protocol slot table using unicode_getbuffer as exporter. */
+static PyBufferProcs unicode_arrtype_as_buffer = {
+    .bf_getbuffer = unicode_getbuffer,
+    /* No need to release the buffer */
+};
+
+
+#line 2769
+
+/*
+ * Buffer export for `PyDatetimeScalarObject`: exposes `obval` as a
+ * read-only, 1-d buffer of 8 single-byte items.  The format ("B") is only
+ * provided when PyBUF_FORMAT is requested.  Returns 0 on success; a
+ * writable request fails with BufferError and -1, leaving `view->obj` NULL
+ * as the buffer protocol requires of a failing exporter.
+ */
+static int
+datetime_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyDatetimeScalarObject *scalar = (PyDatetimeScalarObject *)self;
+
+    view->ndim = 1;
+    view->len = 8;
+    view->itemsize = 1;
+    /* Static storage so `view->shape` stays valid after we return. */
+    static Py_ssize_t length = 8;
+    view->shape = &length;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+
+    view->buf = &(scalar->obval);
+
+    /* export datetime scalars as bytes (although arrays are not exported) */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? "B" : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using datetime_getbuffer as exporter. */
+static PyBufferProcs datetime_arrtype_as_buffer = {
+        .bf_getbuffer = datetime_getbuffer,
+        /* No need to release the buffer */
+};
+
+
+#line 2769
+
+/*
+ * Buffer export for `PyTimedeltaScalarObject`: exposes `obval` as a
+ * read-only, 1-d buffer of 8 single-byte items.  The format ("B") is only
+ * provided when PyBUF_FORMAT is requested.  Returns 0 on success; a
+ * writable request fails with BufferError and -1, leaving `view->obj` NULL
+ * as the buffer protocol requires of a failing exporter.
+ */
+static int
+timedelta_getbuffer(PyObject *self, Py_buffer *view, int flags)
+{
+    if ((flags & PyBUF_WRITEABLE) == PyBUF_WRITEABLE) {
+        view->obj = NULL;
+        PyErr_SetString(PyExc_BufferError, "scalar buffer is readonly");
+        return -1;
+    }
+    PyTimedeltaScalarObject *scalar = (PyTimedeltaScalarObject *)self;
+
+    view->ndim = 1;
+    view->len = 8;
+    view->itemsize = 1;
+    /* Static storage so `view->shape` stays valid after we return. */
+    static Py_ssize_t length = 8;
+    view->shape = &length;
+    view->strides = NULL;
+    view->suboffsets = NULL;
+    view->readonly = 1;
+    /* The view holds its own reference to the exporting scalar. */
+    Py_INCREF(self);
+    view->obj = self;
+
+    view->buf = &(scalar->obval);
+
+    /* export timedelta scalars as bytes, same layout as datetime scalars */
+    view->format = ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) ? "B" : NULL;
+
+    return 0;
+}
+
+/* Buffer protocol slot table using timedelta_getbuffer as exporter. */
+static PyBufferProcs timedelta_arrtype_as_buffer = {
+        .bf_getbuffer = timedelta_getbuffer,
+        /* No need to release the buffer */
+};
+
+
+
+/*
+ * Buffer protocol slot table using void_getbuffer as exporter; unlike the
+ * other exporters in this file it is implemented in buffer.c.
+ */
+static PyBufferProcs void_arrtype_as_buffer = {
+        .bf_getbuffer = void_getbuffer,  /* defined in buffer.c */
+        /* No need to release the buffer */
+};
+
+
+/*
+ * Type flag sets: BASEFLAGS additionally allows subclassing, LEAFFLAGS is
+ * the plain default.  The expansions are parenthesized so the macros stay
+ * correct inside larger expressions (e.g. `BASEFLAGS & x`).
+ */
+#define BASEFLAGS (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE)
+#define LEAFFLAGS (Py_TPFLAGS_DEFAULT)
+
+/*
+ * Type object for "numpy.generic".  Only the name and the basic instance
+ * size are set statically here.
+ * NOTE(review): the remaining slots are presumably filled in elsewhere
+ * before the type is readied; confirm against the initialization code.
+ */
+NPY_NO_EXPORT PyTypeObject PyGenericArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy.generic",
+    .tp_basicsize = sizeof(PyObject),
+};
+
+
+/*
+ * Deallocator for void scalars: frees the data buffer when the scalar owns
+ * it, drops the descriptor and base references, releases any buffer-export
+ * bookkeeping and finally frees the object through its type's tp_free.
+ */
+static void
+void_dealloc(PyVoidScalarObject *v)
+{
+    PyObject *as_object = (PyObject *)v;
+
+    /* The data is only ours to free when OWNDATA is set. */
+    if ((v->flags & NPY_ARRAY_OWNDATA) != 0) {
+        npy_free_cache(v->obval, Py_SIZE(v));
+    }
+
+    Py_XDECREF(v->descr);
+    Py_XDECREF(v->base);
+
+    /* A deallocator cannot propagate errors, so report and carry on. */
+    if (_buffer_info_free(v->_buffer_info, as_object) < 0) {
+        PyErr_WriteUnraisable(NULL);
+    }
+
+    Py_TYPE(v)->tp_free(v);
+}
+
+
+/*
+ * Allocation hook for object scalars.  Object scalars are not supposed to
+ * exist, so every allocation first emits a VisibleDeprecationWarning and
+ * only then defers to gentype_alloc.  Returns NULL if the warning class
+ * cannot be imported or if issuing the warning fails.
+ */
+static PyObject *
+object_arrtype_alloc(PyTypeObject *type, Py_ssize_t items)
+{
+    /* Static so the imported warning class persists across calls. */
+    static PyObject *warning_cls = NULL;
+
+    npy_cache_import("numpy", "VisibleDeprecationWarning", &warning_cls);
+    if (warning_cls == NULL) {
+        return NULL;
+    }
+
+    const char *message =
+            "Creating a NumPy object scalar.  NumPy object scalars should "
+            "never be created.  If you see this message please inform the "
+            "NumPy developers.  Since this message should never be shown "
+            "this will raise a TypeError in the future.";
+    int status = PyErr_WarnEx(warning_cls, message, 1);
+    if (status < 0) {
+        return NULL;
+    }
+
+    return gentype_alloc(type, items);
+}
+
+
+/*
+ * Deallocator for object scalars: drops the reference to the wrapped value
+ * (which may be NULL) and frees the scalar through its type's tp_free.
+ */
+static void
+object_arrtype_dealloc(PyObject *v)
+{
+    PyTypeObject *scalar_type = Py_TYPE(v);
+
+    Py_XDECREF(PyArrayScalar_VAL(v, Object));
+    scalar_type->tp_free(v);
+}
+
+/*
+ * Deallocator for unicode scalars: frees the two lazily created side
+ * buffers (the scalar value and the cached buffer format string) and then
+ * lets the unicode base type tear down the rest of the object.
+ */
+static void
+unicode_arrtype_dealloc(PyObject *v)
+{
+    void *cached_format = ((PyUnicodeScalarObject *)v)->buffer_fmt;
+
+    /* Either pointer may be NULL if it was never requested. */
+    PyMem_Free(PyArrayScalar_VAL(v, Unicode));
+    PyMem_Free(cached_format);
+
+    /* The base class releases the object itself. */
+    PyUnicode_Type.tp_dealloc(v);
+}
+
+#line 2894
+
+/* used as a pattern for testing token equality */
+#define _BYTE_IS_BYTE
+
+/*
+ * Constructor (tp_new signature) for the byte scalar type.
+ *
+ * Accepts at most one positional-only argument:
+ *   - no argument: returns a new scalar whose value is zeroed;
+ *   - one argument: converts it with PyArray_FromAny using the NPY_BYTE
+ *     descriptor and NPY_ARRAY_FORCECAST.  A result with one or more
+ *     dimensions is returned as that array; a 0-d result is turned into a
+ *     scalar.  If that scalar's type is not exactly `type`, a new object is
+ *     allocated via `type->tp_alloc` and the value is copied over.
+ *
+ * Returns a new reference, or NULL on failure.
+ *
+ * This block only defines `_BYTE_IS_BYTE`, so the sections guarded by
+ * `_BYTE_IS_UNICODE`, `_BYTE_IS_STRING` and `_BYTE_IS_DOUBLE` are compiled
+ * out unless one of those macros is defined elsewhere.
+ */
+static PyObject *
+byte_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    /* allow base-class (if any) to do conversion */
+#if defined(_BYTE_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_BYTE_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_BYTE_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_BYTE_IS_UNICODE) || defined(_BYTE_IS_STRING) || defined(_BYTE_IS_DOUBLE)
+    if (from_superclass == NULL) {
+        /* don't clear the exception unless numpy can handle the arguments */
+        if (PyTuple_GET_SIZE(args) != 1 || (kwds && PyDict_Size(kwds) != 0)) {
+            return NULL;
+        }
+        PyErr_Clear();
+    }
+    else {
+#if defined(_BYTE_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+#endif
+
+    /* TODO: include type name in error message, which is not byte */
+    PyObject *obj = NULL;
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    /* Every exit path below releases `typecode` exactly once. */
+    PyArray_Descr *typecode = PyArray_DescrFromType(NPY_BYTE);
+    if (typecode == NULL) {
+        return NULL;
+    }
+    if (obj == NULL) {
+        /* No argument given: build a scalar and zero its value. */
+        PyObject *robj = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+        if (robj == NULL) {
+            return NULL;
+        }
+#if !defined(_BYTE_IS_STRING) && !defined(_BYTE_IS_UNICODE)
+        memset(&PyArrayScalar_VAL(robj, Byte), 0, sizeof(npy_byte));
+#endif
+        return robj;
+    }
+
+    /* PyArray_FromAny steals a reference, reclaim it before it's gone */
+    Py_INCREF(typecode);
+    PyArrayObject *arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    /* Input with dimensions is handed back as an array, not a scalar. */
+    if (PyArray_NDIM(arr) > 0) {
+        Py_DECREF(typecode);
+        return (PyObject *)arr;
+    }
+
+    /* Convert the 0-d array to a scalar*/
+    PyObject *robj = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+
+    /* Done on failure, or when the scalar already has the requested type. */
+    if (robj == NULL || Py_TYPE(robj) == type) {
+        Py_DECREF(typecode);
+        return robj;
+    }
+
+    /*
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - we need to add this back.
+     *
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+
+    /* Need to allocate new type and copy data-area over */
+    int itemsize;
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(robj);
+    }
+    else {
+        itemsize = 0;
+    }
+    PyObject *new_obj = type->tp_alloc(type, itemsize);
+    if (new_obj == NULL) {
+        Py_DECREF(robj);
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    /* Locate the value storage of both scalars before dropping typecode. */
+    void *dest = scalar_value(new_obj, typecode);
+    void *src = scalar_value(robj, typecode);
+    Py_DECREF(typecode);
+#if defined(_BYTE_IS_STRING) || defined(_BYTE_IS_UNICODE)
+    if (itemsize == 0) { /* unicode */
+        itemsize = PyUnicode_GetLength(robj) * PyUnicode_KIND(robj);
+    }
+    memcpy(dest, src, itemsize);
+#else
+    *((npy_byte *)dest) = *((npy_byte *)src);
+#endif
+    Py_DECREF(robj);
+    return new_obj;
+}
+#undef _BYTE_IS_BYTE
+
+
+#line 2894
+
+/* used as a pattern for testing token equality */
+#define _SHORT_IS_SHORT
+
+/*
+ * Constructor (tp_new signature) for the short scalar type.
+ *
+ * Accepts at most one positional-only argument:
+ *   - no argument: returns a new scalar whose value is zeroed;
+ *   - one argument: converts it with PyArray_FromAny using the NPY_SHORT
+ *     descriptor and NPY_ARRAY_FORCECAST.  A result with one or more
+ *     dimensions is returned as that array; a 0-d result is turned into a
+ *     scalar.  If that scalar's type is not exactly `type`, a new object is
+ *     allocated via `type->tp_alloc` and the value is copied over.
+ *
+ * Returns a new reference, or NULL on failure.
+ *
+ * This block only defines `_SHORT_IS_SHORT`, so the sections guarded by
+ * `_SHORT_IS_UNICODE`, `_SHORT_IS_STRING` and `_SHORT_IS_DOUBLE` are
+ * compiled out unless one of those macros is defined elsewhere.
+ */
+static PyObject *
+short_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    /* allow base-class (if any) to do conversion */
+#if defined(_SHORT_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_SHORT_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_SHORT_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_SHORT_IS_UNICODE) || defined(_SHORT_IS_STRING) || defined(_SHORT_IS_DOUBLE)
+    if (from_superclass == NULL) {
+        /* don't clear the exception unless numpy can handle the arguments */
+        if (PyTuple_GET_SIZE(args) != 1 || (kwds && PyDict_Size(kwds) != 0)) {
+            return NULL;
+        }
+        PyErr_Clear();
+    }
+    else {
+#if defined(_SHORT_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+#endif
+
+    /* TODO: include type name in error message, which is not short */
+    PyObject *obj = NULL;
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    /* Every exit path below releases `typecode` exactly once. */
+    PyArray_Descr *typecode = PyArray_DescrFromType(NPY_SHORT);
+    if (typecode == NULL) {
+        return NULL;
+    }
+    if (obj == NULL) {
+        /* No argument given: build a scalar and zero its value. */
+        PyObject *robj = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+        if (robj == NULL) {
+            return NULL;
+        }
+#if !defined(_SHORT_IS_STRING) && !defined(_SHORT_IS_UNICODE)
+        memset(&PyArrayScalar_VAL(robj, Short), 0, sizeof(npy_short));
+#endif
+        return robj;
+    }
+
+    /* PyArray_FromAny steals a reference, reclaim it before it's gone */
+    Py_INCREF(typecode);
+    PyArrayObject *arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    /* Input with dimensions is handed back as an array, not a scalar. */
+    if (PyArray_NDIM(arr) > 0) {
+        Py_DECREF(typecode);
+        return (PyObject *)arr;
+    }
+
+    /* Convert the 0-d array to a scalar*/
+    PyObject *robj = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+
+    /* Done on failure, or when the scalar already has the requested type. */
+    if (robj == NULL || Py_TYPE(robj) == type) {
+        Py_DECREF(typecode);
+        return robj;
+    }
+
+    /*
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - we need to add this back.
+     *
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+
+    /* Need to allocate new type and copy data-area over */
+    int itemsize;
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(robj);
+    }
+    else {
+        itemsize = 0;
+    }
+    PyObject *new_obj = type->tp_alloc(type, itemsize);
+    if (new_obj == NULL) {
+        Py_DECREF(robj);
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    /* Locate the value storage of both scalars before dropping typecode. */
+    void *dest = scalar_value(new_obj, typecode);
+    void *src = scalar_value(robj, typecode);
+    Py_DECREF(typecode);
+#if defined(_SHORT_IS_STRING) || defined(_SHORT_IS_UNICODE)
+    if (itemsize == 0) { /* unicode */
+        itemsize = PyUnicode_GetLength(robj) * PyUnicode_KIND(robj);
+    }
+    memcpy(dest, src, itemsize);
+#else
+    *((npy_short *)dest) = *((npy_short *)src);
+#endif
+    Py_DECREF(robj);
+    return new_obj;
+}
+#undef _SHORT_IS_SHORT
+
+
+#line 2894
+
+/* marker for token-equality #if tests in int_arrtype_new, which creates an int scalar from an optional argument */
+#define _INT_IS_INT
+
+static PyObject *
+int_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    /* allow base-class (str/bytes/float, if any) to do conversion; none of the tests below names INT */
+#if defined(_INT_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_INT_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_INT_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_INT_IS_UNICODE) || defined(_INT_IS_STRING) || defined(_INT_IS_DOUBLE)
+    if (from_superclass == NULL) {
+        /* don't clear the exception unless numpy can handle the arguments */
+        if (PyTuple_GET_SIZE(args) != 1 || (kwds && PyDict_Size(kwds) != 0)) {
+            return NULL;
+        }
+        PyErr_Clear();
+    }
+    else {
+#if defined(_INT_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+#endif
+
+    /* TODO: include type name in error message (the type name is not int); the |O format below carries no name */
+    PyObject *obj = NULL;
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    PyArray_Descr *typecode = PyArray_DescrFromType(NPY_INT);
+    if (typecode == NULL) {
+        return NULL;
+    }
+    if (obj == NULL) {  /* no argument: build a scalar and zero its value */
+        PyObject *robj = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+        if (robj == NULL) {
+            return NULL;
+        }
+#if !defined(_INT_IS_STRING) && !defined(_INT_IS_UNICODE)
+        memset(&PyArrayScalar_VAL(robj, Int), 0, sizeof(npy_int));
+#endif
+        return robj;
+    }
+
+    /* PyArray_FromAny steals a reference to typecode; take an extra one so it can still be used below */
+    Py_INCREF(typecode);
+    PyArrayObject *arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    if (PyArray_NDIM(arr) > 0) {  /* not 0-d: the array itself is the result */
+        Py_DECREF(typecode);
+        return (PyObject *)arr;
+    }
+
+    /* arr is 0-d here (ndim > 0 was returned above as an array): convert it to a scalar */
+    PyObject *robj = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+
+    if (robj == NULL || Py_TYPE(robj) == type) {  /* error, or already exactly `type` */
+        Py_DECREF(typecode);
+        return robj;
+    }
+
+    /*
+     * Reaching here means the exact type of robj differs from the requested `type`.
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - we need to add this back.
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+
+    /* allocate a `type` instance and copy the data area over; itemsize is 0 unless tp_itemsize is set */
+    int itemsize;
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(robj);
+    }
+    else {
+        itemsize = 0;
+    }
+    PyObject *new_obj = type->tp_alloc(type, itemsize);
+    if (new_obj == NULL) {
+        Py_DECREF(robj);
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    void *dest = scalar_value(new_obj, typecode);
+    void *src = scalar_value(robj, typecode);
+    Py_DECREF(typecode);
+#if defined(_INT_IS_STRING) || defined(_INT_IS_UNICODE)
+    if (itemsize == 0) { /* unicode */
+        itemsize = PyUnicode_GetLength(robj) * PyUnicode_KIND(robj);
+    }
+    memcpy(dest, src, itemsize);
+#else
+    *((npy_int *)dest) = *((npy_int *)src);
+#endif
+    Py_DECREF(robj);
+    return new_obj;
+}
+#undef _INT_IS_INT
+
+
+#line 2894
+
+/* marker for token-equality #if tests in long_arrtype_new, which creates a long scalar from an optional argument */
+#define _LONG_IS_LONG
+
+static PyObject *
+long_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    /* allow base-class (str/bytes/float, if any) to do conversion; none of the tests below names LONG */
+#if defined(_LONG_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_LONG_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_LONG_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_LONG_IS_UNICODE) || defined(_LONG_IS_STRING) || defined(_LONG_IS_DOUBLE)
+    if (from_superclass == NULL) {
+        /* don't clear the exception unless numpy can handle the arguments */
+        if (PyTuple_GET_SIZE(args) != 1 || (kwds && PyDict_Size(kwds) != 0)) {
+            return NULL;
+        }
+        PyErr_Clear();
+    }
+    else {
+#if defined(_LONG_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+#endif
+
+    /* TODO: include type name in error message (the type name is not long); the |O format below carries no name */
+    PyObject *obj = NULL;
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    PyArray_Descr *typecode = PyArray_DescrFromType(NPY_LONG);
+    if (typecode == NULL) {
+        return NULL;
+    }
+    if (obj == NULL) {  /* no argument: build a scalar and zero its value */
+        PyObject *robj = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+        if (robj == NULL) {
+            return NULL;
+        }
+#if !defined(_LONG_IS_STRING) && !defined(_LONG_IS_UNICODE)
+        memset(&PyArrayScalar_VAL(robj, Long), 0, sizeof(npy_long));
+#endif
+        return robj;
+    }
+
+    /* PyArray_FromAny steals a reference to typecode; take an extra one so it can still be used below */
+    Py_INCREF(typecode);
+    PyArrayObject *arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    if (PyArray_NDIM(arr) > 0) {  /* not 0-d: the array itself is the result */
+        Py_DECREF(typecode);
+        return (PyObject *)arr;
+    }
+
+    /* arr is 0-d here (ndim > 0 was returned above as an array): convert it to a scalar */
+    PyObject *robj = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+
+    if (robj == NULL || Py_TYPE(robj) == type) {  /* error, or already exactly `type` */
+        Py_DECREF(typecode);
+        return robj;
+    }
+
+    /*
+     * Reaching here means the exact type of robj differs from the requested `type`.
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - we need to add this back.
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+
+    /* allocate a `type` instance and copy the data area over; itemsize is 0 unless tp_itemsize is set */
+    int itemsize;
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(robj);
+    }
+    else {
+        itemsize = 0;
+    }
+    PyObject *new_obj = type->tp_alloc(type, itemsize);
+    if (new_obj == NULL) {
+        Py_DECREF(robj);
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    void *dest = scalar_value(new_obj, typecode);
+    void *src = scalar_value(robj, typecode);
+    Py_DECREF(typecode);
+#if defined(_LONG_IS_STRING) || defined(_LONG_IS_UNICODE)
+    if (itemsize == 0) { /* unicode */
+        itemsize = PyUnicode_GetLength(robj) * PyUnicode_KIND(robj);
+    }
+    memcpy(dest, src, itemsize);
+#else
+    *((npy_long *)dest) = *((npy_long *)src);
+#endif
+    Py_DECREF(robj);
+    return new_obj;
+}
+#undef _LONG_IS_LONG
+
+
+#line 2894
+
+/* marker for token-equality #if tests in longlong_arrtype_new, which creates a longlong scalar from an optional argument */
+#define _LONGLONG_IS_LONGLONG
+
+static PyObject *
+longlong_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    /* allow base-class (str/bytes/float, if any) to do conversion; none of the tests below names LONGLONG */
+#if defined(_LONGLONG_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_LONGLONG_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_LONGLONG_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_LONGLONG_IS_UNICODE) || defined(_LONGLONG_IS_STRING) || defined(_LONGLONG_IS_DOUBLE)
+    if (from_superclass == NULL) {
+        /* don't clear the exception unless numpy can handle the arguments */
+        if (PyTuple_GET_SIZE(args) != 1 || (kwds && PyDict_Size(kwds) != 0)) {
+            return NULL;
+        }
+        PyErr_Clear();
+    }
+    else {
+#if defined(_LONGLONG_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+#endif
+
+    /* TODO: include type name in error message (the type name is not longlong); the |O format below carries no name */
+    PyObject *obj = NULL;
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    PyArray_Descr *typecode = PyArray_DescrFromType(NPY_LONGLONG);
+    if (typecode == NULL) {
+        return NULL;
+    }
+    if (obj == NULL) {  /* no argument: build a scalar and zero its value */
+        PyObject *robj = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+        if (robj == NULL) {
+            return NULL;
+        }
+#if !defined(_LONGLONG_IS_STRING) && !defined(_LONGLONG_IS_UNICODE)
+        memset(&PyArrayScalar_VAL(robj, LongLong), 0, sizeof(npy_longlong));
+#endif
+        return robj;
+    }
+
+    /* PyArray_FromAny steals a reference to typecode; take an extra one so it can still be used below */
+    Py_INCREF(typecode);
+    PyArrayObject *arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    if (PyArray_NDIM(arr) > 0) {  /* not 0-d: the array itself is the result */
+        Py_DECREF(typecode);
+        return (PyObject *)arr;
+    }
+
+    /* arr is 0-d here (ndim > 0 was returned above as an array): convert it to a scalar */
+    PyObject *robj = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+
+    if (robj == NULL || Py_TYPE(robj) == type) {  /* error, or already exactly `type` */
+        Py_DECREF(typecode);
+        return robj;
+    }
+
+    /*
+     * Reaching here means the exact type of robj differs from the requested `type`.
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - we need to add this back.
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+
+    /* allocate a `type` instance and copy the data area over; itemsize is 0 unless tp_itemsize is set */
+    int itemsize;
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(robj);
+    }
+    else {
+        itemsize = 0;
+    }
+    PyObject *new_obj = type->tp_alloc(type, itemsize);
+    if (new_obj == NULL) {
+        Py_DECREF(robj);
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    void *dest = scalar_value(new_obj, typecode);
+    void *src = scalar_value(robj, typecode);
+    Py_DECREF(typecode);
+#if defined(_LONGLONG_IS_STRING) || defined(_LONGLONG_IS_UNICODE)
+    if (itemsize == 0) { /* unicode */
+        itemsize = PyUnicode_GetLength(robj) * PyUnicode_KIND(robj);
+    }
+    memcpy(dest, src, itemsize);
+#else
+    *((npy_longlong *)dest) = *((npy_longlong *)src);
+#endif
+    Py_DECREF(robj);
+    return new_obj;
+}
+#undef _LONGLONG_IS_LONGLONG
+
+
+#line 2894
+
+/* marker for token-equality #if tests in ubyte_arrtype_new, which creates a ubyte scalar from an optional argument */
+#define _UBYTE_IS_UBYTE
+
+static PyObject *
+ubyte_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    /* allow base-class (str/bytes/float, if any) to do conversion; none of the tests below names UBYTE */
+#if defined(_UBYTE_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_UBYTE_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_UBYTE_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_UBYTE_IS_UNICODE) || defined(_UBYTE_IS_STRING) || defined(_UBYTE_IS_DOUBLE)
+    if (from_superclass == NULL) {
+        /* don't clear the exception unless numpy can handle the arguments */
+        if (PyTuple_GET_SIZE(args) != 1 || (kwds && PyDict_Size(kwds) != 0)) {
+            return NULL;
+        }
+        PyErr_Clear();
+    }
+    else {
+#if defined(_UBYTE_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+#endif
+
+    /* TODO: include type name in error message (the type name is not ubyte); the |O format below carries no name */
+    PyObject *obj = NULL;
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    PyArray_Descr *typecode = PyArray_DescrFromType(NPY_UBYTE);
+    if (typecode == NULL) {
+        return NULL;
+    }
+    if (obj == NULL) {  /* no argument: build a scalar and zero its value */
+        PyObject *robj = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+        if (robj == NULL) {
+            return NULL;
+        }
+#if !defined(_UBYTE_IS_STRING) && !defined(_UBYTE_IS_UNICODE)
+        memset(&PyArrayScalar_VAL(robj, UByte), 0, sizeof(npy_ubyte));
+#endif
+        return robj;
+    }
+
+    /* PyArray_FromAny steals a reference to typecode; take an extra one so it can still be used below */
+    Py_INCREF(typecode);
+    PyArrayObject *arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    if (PyArray_NDIM(arr) > 0) {  /* not 0-d: the array itself is the result */
+        Py_DECREF(typecode);
+        return (PyObject *)arr;
+    }
+
+    /* arr is 0-d here (ndim > 0 was returned above as an array): convert it to a scalar */
+    PyObject *robj = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+
+    if (robj == NULL || Py_TYPE(robj) == type) {  /* error, or already exactly `type` */
+        Py_DECREF(typecode);
+        return robj;
+    }
+
+    /*
+     * Reaching here means the exact type of robj differs from the requested `type`.
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - we need to add this back.
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+
+    /* allocate a `type` instance and copy the data area over; itemsize is 0 unless tp_itemsize is set */
+    int itemsize;
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(robj);
+    }
+    else {
+        itemsize = 0;
+    }
+    PyObject *new_obj = type->tp_alloc(type, itemsize);
+    if (new_obj == NULL) {
+        Py_DECREF(robj);
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    void *dest = scalar_value(new_obj, typecode);
+    void *src = scalar_value(robj, typecode);
+    Py_DECREF(typecode);
+#if defined(_UBYTE_IS_STRING) || defined(_UBYTE_IS_UNICODE)
+    if (itemsize == 0) { /* unicode */
+        itemsize = PyUnicode_GetLength(robj) * PyUnicode_KIND(robj);
+    }
+    memcpy(dest, src, itemsize);
+#else
+    *((npy_ubyte *)dest) = *((npy_ubyte *)src);
+#endif
+    Py_DECREF(robj);
+    return new_obj;
+}
+#undef _UBYTE_IS_UBYTE
+
+
+#line 2894
+
+/* marker for token-equality #if tests in ushort_arrtype_new, which creates a ushort scalar from an optional argument */
+#define _USHORT_IS_USHORT
+
+static PyObject *
+ushort_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    /* allow base-class (str/bytes/float, if any) to do conversion; none of the tests below names USHORT */
+#if defined(_USHORT_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_USHORT_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_USHORT_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_USHORT_IS_UNICODE) || defined(_USHORT_IS_STRING) || defined(_USHORT_IS_DOUBLE)
+    if (from_superclass == NULL) {
+        /* don't clear the exception unless numpy can handle the arguments */
+        if (PyTuple_GET_SIZE(args) != 1 || (kwds && PyDict_Size(kwds) != 0)) {
+            return NULL;
+        }
+        PyErr_Clear();
+    }
+    else {
+#if defined(_USHORT_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+#endif
+
+    /* TODO: include type name in error message (the type name is not ushort); the |O format below carries no name */
+    PyObject *obj = NULL;
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    PyArray_Descr *typecode = PyArray_DescrFromType(NPY_USHORT);
+    if (typecode == NULL) {
+        return NULL;
+    }
+    if (obj == NULL) {  /* no argument: build a scalar and zero its value */
+        PyObject *robj = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+        if (robj == NULL) {
+            return NULL;
+        }
+#if !defined(_USHORT_IS_STRING) && !defined(_USHORT_IS_UNICODE)
+        memset(&PyArrayScalar_VAL(robj, UShort), 0, sizeof(npy_ushort));
+#endif
+        return robj;
+    }
+
+    /* PyArray_FromAny steals a reference to typecode; take an extra one so it can still be used below */
+    Py_INCREF(typecode);
+    PyArrayObject *arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    if (PyArray_NDIM(arr) > 0) {  /* not 0-d: the array itself is the result */
+        Py_DECREF(typecode);
+        return (PyObject *)arr;
+    }
+
+    /* arr is 0-d here (ndim > 0 was returned above as an array): convert it to a scalar */
+    PyObject *robj = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+
+    if (robj == NULL || Py_TYPE(robj) == type) {  /* error, or already exactly `type` */
+        Py_DECREF(typecode);
+        return robj;
+    }
+
+    /*
+     * Reaching here means the exact type of robj differs from the requested `type`.
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - we need to add this back.
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+
+    /* allocate a `type` instance and copy the data area over; itemsize is 0 unless tp_itemsize is set */
+    int itemsize;
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(robj);
+    }
+    else {
+        itemsize = 0;
+    }
+    PyObject *new_obj = type->tp_alloc(type, itemsize);
+    if (new_obj == NULL) {
+        Py_DECREF(robj);
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    void *dest = scalar_value(new_obj, typecode);
+    void *src = scalar_value(robj, typecode);
+    Py_DECREF(typecode);
+#if defined(_USHORT_IS_STRING) || defined(_USHORT_IS_UNICODE)
+    if (itemsize == 0) { /* unicode */
+        itemsize = PyUnicode_GetLength(robj) * PyUnicode_KIND(robj);
+    }
+    memcpy(dest, src, itemsize);
+#else
+    *((npy_ushort *)dest) = *((npy_ushort *)src);
+#endif
+    Py_DECREF(robj);
+    return new_obj;
+}
+#undef _USHORT_IS_USHORT
+
+
+#line 2894
+
+/* marker for token-equality #if tests in uint_arrtype_new, which creates a uint scalar from an optional argument */
+#define _UINT_IS_UINT
+
+static PyObject *
+uint_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    /* allow base-class (str/bytes/float, if any) to do conversion; none of the tests below names UINT */
+#if defined(_UINT_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_UINT_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_UINT_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_UINT_IS_UNICODE) || defined(_UINT_IS_STRING) || defined(_UINT_IS_DOUBLE)
+    if (from_superclass == NULL) {
+        /* don't clear the exception unless numpy can handle the arguments */
+        if (PyTuple_GET_SIZE(args) != 1 || (kwds && PyDict_Size(kwds) != 0)) {
+            return NULL;
+        }
+        PyErr_Clear();
+    }
+    else {
+#if defined(_UINT_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+#endif
+
+    /* TODO: include type name in error message (the type name is not uint); the |O format below carries no name */
+    PyObject *obj = NULL;
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    PyArray_Descr *typecode = PyArray_DescrFromType(NPY_UINT);
+    if (typecode == NULL) {
+        return NULL;
+    }
+    if (obj == NULL) {  /* no argument: build a scalar and zero its value */
+        PyObject *robj = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+        if (robj == NULL) {
+            return NULL;
+        }
+#if !defined(_UINT_IS_STRING) && !defined(_UINT_IS_UNICODE)
+        memset(&PyArrayScalar_VAL(robj, UInt), 0, sizeof(npy_uint));
+#endif
+        return robj;
+    }
+
+    /* PyArray_FromAny steals a reference to typecode; take an extra one so it can still be used below */
+    Py_INCREF(typecode);
+    PyArrayObject *arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    if (PyArray_NDIM(arr) > 0) {  /* not 0-d: the array itself is the result */
+        Py_DECREF(typecode);
+        return (PyObject *)arr;
+    }
+
+    /* arr is 0-d here (ndim > 0 was returned above as an array): convert it to a scalar */
+    PyObject *robj = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+
+    if (robj == NULL || Py_TYPE(robj) == type) {  /* error, or already exactly `type` */
+        Py_DECREF(typecode);
+        return robj;
+    }
+
+    /*
+     * Reaching here means the exact type of robj differs from the requested `type`.
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - we need to add this back.
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+
+    /* allocate a `type` instance and copy the data area over; itemsize is 0 unless tp_itemsize is set */
+    int itemsize;
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(robj);
+    }
+    else {
+        itemsize = 0;
+    }
+    PyObject *new_obj = type->tp_alloc(type, itemsize);
+    if (new_obj == NULL) {
+        Py_DECREF(robj);
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    void *dest = scalar_value(new_obj, typecode);
+    void *src = scalar_value(robj, typecode);
+    Py_DECREF(typecode);
+#if defined(_UINT_IS_STRING) || defined(_UINT_IS_UNICODE)
+    if (itemsize == 0) { /* unicode */
+        itemsize = PyUnicode_GetLength(robj) * PyUnicode_KIND(robj);
+    }
+    memcpy(dest, src, itemsize);
+#else
+    *((npy_uint *)dest) = *((npy_uint *)src);
+#endif
+    Py_DECREF(robj);
+    return new_obj;
+}
+#undef _UINT_IS_UINT
+
+
+#line 2894
+
+/* marker for token-equality #if tests in ulong_arrtype_new, which creates a ulong scalar from an optional argument */
+#define _ULONG_IS_ULONG
+
+static PyObject *
+ulong_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    /* allow base-class (str/bytes/float, if any) to do conversion; none of the tests below names ULONG */
+#if defined(_ULONG_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_ULONG_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_ULONG_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_ULONG_IS_UNICODE) || defined(_ULONG_IS_STRING) || defined(_ULONG_IS_DOUBLE)
+    if (from_superclass == NULL) {
+        /* don't clear the exception unless numpy can handle the arguments */
+        if (PyTuple_GET_SIZE(args) != 1 || (kwds && PyDict_Size(kwds) != 0)) {
+            return NULL;
+        }
+        PyErr_Clear();
+    }
+    else {
+#if defined(_ULONG_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+#endif
+
+    /* TODO: include type name in error message (the type name is not ulong); the |O format below carries no name */
+    PyObject *obj = NULL;
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    PyArray_Descr *typecode = PyArray_DescrFromType(NPY_ULONG);
+    if (typecode == NULL) {
+        return NULL;
+    }
+    if (obj == NULL) {  /* no argument: build a scalar and zero its value */
+        PyObject *robj = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+        if (robj == NULL) {
+            return NULL;
+        }
+#if !defined(_ULONG_IS_STRING) && !defined(_ULONG_IS_UNICODE)
+        memset(&PyArrayScalar_VAL(robj, ULong), 0, sizeof(npy_ulong));
+#endif
+        return robj;
+    }
+
+    /* PyArray_FromAny steals a reference to typecode; take an extra one so it can still be used below */
+    Py_INCREF(typecode);
+    PyArrayObject *arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    if (PyArray_NDIM(arr) > 0) {  /* not 0-d: the array itself is the result */
+        Py_DECREF(typecode);
+        return (PyObject *)arr;
+    }
+
+    /* arr is 0-d here (ndim > 0 was returned above as an array): convert it to a scalar */
+    PyObject *robj = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+
+    if (robj == NULL || Py_TYPE(robj) == type) {  /* error, or already exactly `type` */
+        Py_DECREF(typecode);
+        return robj;
+    }
+
+    /*
+     * Reaching here means the exact type of robj differs from the requested `type`.
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - we need to add this back.
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+
+    /* allocate a `type` instance and copy the data area over; itemsize is 0 unless tp_itemsize is set */
+    int itemsize;
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(robj);
+    }
+    else {
+        itemsize = 0;
+    }
+    PyObject *new_obj = type->tp_alloc(type, itemsize);
+    if (new_obj == NULL) {
+        Py_DECREF(robj);
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    void *dest = scalar_value(new_obj, typecode);
+    void *src = scalar_value(robj, typecode);
+    Py_DECREF(typecode);
+#if defined(_ULONG_IS_STRING) || defined(_ULONG_IS_UNICODE)
+    if (itemsize == 0) { /* unicode */
+        itemsize = PyUnicode_GetLength(robj) * PyUnicode_KIND(robj);
+    }
+    memcpy(dest, src, itemsize);
+#else
+    *((npy_ulong *)dest) = *((npy_ulong *)src);
+#endif
+    Py_DECREF(robj);
+    return new_obj;
+}
+#undef _ULONG_IS_ULONG
+
+
+#line 2894
+
+/* marker for token-equality #if tests in ulonglong_arrtype_new, which creates a ulonglong scalar from an optional argument */
+#define _ULONGLONG_IS_ULONGLONG
+
+static PyObject *
+ulonglong_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    /* allow base-class (str/bytes/float, if any) to do conversion; none of the tests below names ULONGLONG */
+#if defined(_ULONGLONG_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_ULONGLONG_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_ULONGLONG_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_ULONGLONG_IS_UNICODE) || defined(_ULONGLONG_IS_STRING) || defined(_ULONGLONG_IS_DOUBLE)
+    if (from_superclass == NULL) {
+        /* don't clear the exception unless numpy can handle the arguments */
+        if (PyTuple_GET_SIZE(args) != 1 || (kwds && PyDict_Size(kwds) != 0)) {
+            return NULL;
+        }
+        PyErr_Clear();
+    }
+    else {
+#if defined(_ULONGLONG_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+#endif
+
+    /* TODO: include type name in error message (the type name is not ulonglong); the |O format below carries no name */
+    PyObject *obj = NULL;
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    PyArray_Descr *typecode = PyArray_DescrFromType(NPY_ULONGLONG);
+    if (typecode == NULL) {
+        return NULL;
+    }
+    if (obj == NULL) {  /* no argument: build a scalar and zero its value */
+        PyObject *robj = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+        if (robj == NULL) {
+            return NULL;
+        }
+#if !defined(_ULONGLONG_IS_STRING) && !defined(_ULONGLONG_IS_UNICODE)
+        memset(&PyArrayScalar_VAL(robj, ULongLong), 0, sizeof(npy_ulonglong));
+#endif
+        return robj;
+    }
+
+    /* PyArray_FromAny steals a reference to typecode; take an extra one so it can still be used below */
+    Py_INCREF(typecode);
+    PyArrayObject *arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    if (PyArray_NDIM(arr) > 0) {  /* not 0-d: the array itself is the result */
+        Py_DECREF(typecode);
+        return (PyObject *)arr;
+    }
+
+    /* arr is 0-d here (ndim > 0 was returned above as an array): convert it to a scalar */
+    PyObject *robj = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+
+    if (robj == NULL || Py_TYPE(robj) == type) {  /* error, or already exactly `type` */
+        Py_DECREF(typecode);
+        return robj;
+    }
+
+    /*
+     * Reaching here means the exact type of robj differs from the requested `type`.
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - we need to add this back.
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+
+    /* allocate a `type` instance and copy the data area over; itemsize is 0 unless tp_itemsize is set */
+    int itemsize;
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(robj);
+    }
+    else {
+        itemsize = 0;
+    }
+    PyObject *new_obj = type->tp_alloc(type, itemsize);
+    if (new_obj == NULL) {
+        Py_DECREF(robj);
+        Py_DECREF(typecode);
+        return NULL;
+    }
+    void *dest = scalar_value(new_obj, typecode);
+    void *src = scalar_value(robj, typecode);
+    Py_DECREF(typecode);
+#if defined(_ULONGLONG_IS_STRING) || defined(_ULONGLONG_IS_UNICODE)
+    if (itemsize == 0) { /* unicode */
+        itemsize = PyUnicode_GetLength(robj) * PyUnicode_KIND(robj);
+    }
+    memcpy(dest, src, itemsize);
+#else
+    *((npy_ulonglong *)dest) = *((npy_ulonglong *)src);
+#endif
+    Py_DECREF(robj);
+    return new_obj;
+}
+#undef _ULONGLONG_IS_ULONGLONG
+
+
+#line 2894
+
+/* used as a pattern for testing token equality */
+#define _HALF_IS_HALF
+
+/*
+ * tp_new slot for the `half` scalar type.
+ *
+ * If one of `_HALF_IS_UNICODE`, `_HALF_IS_STRING` or `_HALF_IS_DOUBLE` is
+ * defined, the matching builtin constructor is tried first and its result
+ * is returned on success.
+ *
+ * Otherwise at most one positional argument is accepted:
+ *   - no argument: a new scalar is created (and zeroed unless this is the
+ *     string/unicode instantiation);
+ *   - one argument: it is converted with PyArray_FromAny using the NPY_HALF
+ *     descriptor and NPY_ARRAY_FORCECAST. A result with one or more
+ *     dimensions is returned as an array; a 0-d result is turned into a
+ *     scalar. When that scalar's type is not exactly `type`, a new `type`
+ *     instance is allocated and the value is copied into it.
+ *
+ * Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+half_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    PyObject *obj = NULL;
+    PyObject *scalar = NULL;
+    PyObject *result = NULL;
+    PyArray_Descr *typecode = NULL;
+    PyArrayObject *arr = NULL;
+    int itemsize = 0;
+
+    /* give the builtin base class (if there is one) the first attempt */
+#if defined(_HALF_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_HALF_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_HALF_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_HALF_IS_UNICODE) || defined(_HALF_IS_STRING) || defined(_HALF_IS_DOUBLE)
+    if (from_superclass != NULL) {
+#if defined(_HALF_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+    /* keep the base-class error unless numpy can handle the arguments */
+    if (PyTuple_GET_SIZE(args) != 1 || (kwds != NULL && PyDict_Size(kwds) != 0)) {
+        return NULL;
+    }
+    PyErr_Clear();
+#endif
+
+    /* TODO: include type name in error message, which is not half */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    typecode = PyArray_DescrFromType(NPY_HALF);
+    if (typecode == NULL) {
+        return NULL;
+    }
+
+    if (obj == NULL) {
+        /* called without a value: hand back a freshly created scalar */
+        result = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+#if !defined(_HALF_IS_STRING) && !defined(_HALF_IS_UNICODE)
+        if (result != NULL) {
+            memset(&PyArrayScalar_VAL(result, Half), 0, sizeof(npy_half));
+        }
+#endif
+        return result;
+    }
+
+    /*
+     * PyArray_FromAny steals a reference to the descriptor; take an extra
+     * one first so `typecode` stays usable until the common exit below.
+     */
+    Py_INCREF(typecode);
+    arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        goto finish;
+    }
+    if (PyArray_NDIM(arr) > 0) {
+        /* not a 0-d result: the array itself is the return value */
+        result = (PyObject *)arr;
+        goto finish;
+    }
+
+    /* Convert the 0-d array to a scalar */
+    scalar = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+    if (scalar == NULL || Py_TYPE(scalar) == type) {
+        result = scalar;
+        goto finish;
+    }
+
+    /*
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - it is added back here by
+     * allocating an instance of `type` and copying the data area over.
+     *
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(scalar);
+    }
+    result = type->tp_alloc(type, itemsize);
+    if (result != NULL) {
+        void *dest = scalar_value(result, typecode);
+        void *src = scalar_value(scalar, typecode);
+#if defined(_HALF_IS_STRING) || defined(_HALF_IS_UNICODE)
+        if (itemsize == 0) { /* unicode */
+            itemsize = PyUnicode_GetLength(scalar) * PyUnicode_KIND(scalar);
+        }
+        memcpy(dest, src, itemsize);
+#else
+        *((npy_half *)dest) = *((npy_half *)src);
+#endif
+    }
+    Py_DECREF(scalar);
+
+finish:
+    /* single release point for the reference taken before PyArray_FromAny */
+    Py_DECREF(typecode);
+    return result;
+}
+#undef _HALF_IS_HALF
+
+
+#line 2894
+
+/* used as a pattern for testing token equality */
+#define _FLOAT_IS_FLOAT
+
+/*
+ * tp_new slot for the `float` scalar type.
+ *
+ * If one of `_FLOAT_IS_UNICODE`, `_FLOAT_IS_STRING` or `_FLOAT_IS_DOUBLE` is
+ * defined, the matching builtin constructor is tried first and its result
+ * is returned on success.
+ *
+ * Otherwise at most one positional argument is accepted:
+ *   - no argument: a new scalar is created (and zeroed unless this is the
+ *     string/unicode instantiation);
+ *   - one argument: it is converted with PyArray_FromAny using the NPY_FLOAT
+ *     descriptor and NPY_ARRAY_FORCECAST. A result with one or more
+ *     dimensions is returned as an array; a 0-d result is turned into a
+ *     scalar. When that scalar's type is not exactly `type`, a new `type`
+ *     instance is allocated and the value is copied into it.
+ *
+ * Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+float_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    PyObject *obj = NULL;
+    PyObject *scalar = NULL;
+    PyObject *result = NULL;
+    PyArray_Descr *typecode = NULL;
+    PyArrayObject *arr = NULL;
+    int itemsize = 0;
+
+    /* give the builtin base class (if there is one) the first attempt */
+#if defined(_FLOAT_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_FLOAT_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_FLOAT_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_FLOAT_IS_UNICODE) || defined(_FLOAT_IS_STRING) || defined(_FLOAT_IS_DOUBLE)
+    if (from_superclass != NULL) {
+#if defined(_FLOAT_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+    /* keep the base-class error unless numpy can handle the arguments */
+    if (PyTuple_GET_SIZE(args) != 1 || (kwds != NULL && PyDict_Size(kwds) != 0)) {
+        return NULL;
+    }
+    PyErr_Clear();
+#endif
+
+    /* TODO: include type name in error message, which is not float */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    typecode = PyArray_DescrFromType(NPY_FLOAT);
+    if (typecode == NULL) {
+        return NULL;
+    }
+
+    if (obj == NULL) {
+        /* called without a value: hand back a freshly created scalar */
+        result = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+#if !defined(_FLOAT_IS_STRING) && !defined(_FLOAT_IS_UNICODE)
+        if (result != NULL) {
+            memset(&PyArrayScalar_VAL(result, Float), 0, sizeof(npy_float));
+        }
+#endif
+        return result;
+    }
+
+    /*
+     * PyArray_FromAny steals a reference to the descriptor; take an extra
+     * one first so `typecode` stays usable until the common exit below.
+     */
+    Py_INCREF(typecode);
+    arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        goto finish;
+    }
+    if (PyArray_NDIM(arr) > 0) {
+        /* not a 0-d result: the array itself is the return value */
+        result = (PyObject *)arr;
+        goto finish;
+    }
+
+    /* Convert the 0-d array to a scalar */
+    scalar = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+    if (scalar == NULL || Py_TYPE(scalar) == type) {
+        result = scalar;
+        goto finish;
+    }
+
+    /*
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - it is added back here by
+     * allocating an instance of `type` and copying the data area over.
+     *
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(scalar);
+    }
+    result = type->tp_alloc(type, itemsize);
+    if (result != NULL) {
+        void *dest = scalar_value(result, typecode);
+        void *src = scalar_value(scalar, typecode);
+#if defined(_FLOAT_IS_STRING) || defined(_FLOAT_IS_UNICODE)
+        if (itemsize == 0) { /* unicode */
+            itemsize = PyUnicode_GetLength(scalar) * PyUnicode_KIND(scalar);
+        }
+        memcpy(dest, src, itemsize);
+#else
+        *((npy_float *)dest) = *((npy_float *)src);
+#endif
+    }
+    Py_DECREF(scalar);
+
+finish:
+    /* single release point for the reference taken before PyArray_FromAny */
+    Py_DECREF(typecode);
+    return result;
+}
+#undef _FLOAT_IS_FLOAT
+
+
+#line 2894
+
+/* used as a pattern for testing token equality */
+#define _DOUBLE_IS_DOUBLE
+
+/*
+ * tp_new slot for the `double` scalar type.
+ *
+ * If one of `_DOUBLE_IS_UNICODE`, `_DOUBLE_IS_STRING` or `_DOUBLE_IS_DOUBLE`
+ * is defined, the matching builtin constructor (PyFloat_Type.tp_new for
+ * `_DOUBLE_IS_DOUBLE`) is tried first and its result is returned on success.
+ * On failure the error is only discarded when exactly one positional
+ * argument and no keywords were given, so the numpy conversion can be tried.
+ *
+ * The numpy path accepts at most one positional argument:
+ *   - no argument: a new scalar is created (and zeroed unless this is the
+ *     string/unicode instantiation);
+ *   - one argument: it is converted with PyArray_FromAny using the
+ *     NPY_DOUBLE descriptor and NPY_ARRAY_FORCECAST. A result with one or
+ *     more dimensions is returned as an array; a 0-d result is turned into
+ *     a scalar. When that scalar's type is not exactly `type`, a new `type`
+ *     instance is allocated and the value is copied into it.
+ *
+ * Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+double_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    PyObject *obj = NULL;
+    PyObject *scalar = NULL;
+    PyObject *result = NULL;
+    PyArray_Descr *typecode = NULL;
+    PyArrayObject *arr = NULL;
+    int itemsize = 0;
+
+    /* give the builtin base class (if there is one) the first attempt */
+#if defined(_DOUBLE_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_DOUBLE_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_DOUBLE_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_DOUBLE_IS_UNICODE) || defined(_DOUBLE_IS_STRING) || defined(_DOUBLE_IS_DOUBLE)
+    if (from_superclass != NULL) {
+#if defined(_DOUBLE_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+    /* keep the base-class error unless numpy can handle the arguments */
+    if (PyTuple_GET_SIZE(args) != 1 || (kwds != NULL && PyDict_Size(kwds) != 0)) {
+        return NULL;
+    }
+    PyErr_Clear();
+#endif
+
+    /* TODO: include type name in error message, which is not double */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    typecode = PyArray_DescrFromType(NPY_DOUBLE);
+    if (typecode == NULL) {
+        return NULL;
+    }
+
+    if (obj == NULL) {
+        /* called without a value: hand back a freshly created scalar */
+        result = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+#if !defined(_DOUBLE_IS_STRING) && !defined(_DOUBLE_IS_UNICODE)
+        if (result != NULL) {
+            memset(&PyArrayScalar_VAL(result, Double), 0, sizeof(npy_double));
+        }
+#endif
+        return result;
+    }
+
+    /*
+     * PyArray_FromAny steals a reference to the descriptor; take an extra
+     * one first so `typecode` stays usable until the common exit below.
+     */
+    Py_INCREF(typecode);
+    arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        goto finish;
+    }
+    if (PyArray_NDIM(arr) > 0) {
+        /* not a 0-d result: the array itself is the return value */
+        result = (PyObject *)arr;
+        goto finish;
+    }
+
+    /* Convert the 0-d array to a scalar */
+    scalar = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+    if (scalar == NULL || Py_TYPE(scalar) == type) {
+        result = scalar;
+        goto finish;
+    }
+
+    /*
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - it is added back here by
+     * allocating an instance of `type` and copying the data area over.
+     *
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(scalar);
+    }
+    result = type->tp_alloc(type, itemsize);
+    if (result != NULL) {
+        void *dest = scalar_value(result, typecode);
+        void *src = scalar_value(scalar, typecode);
+#if defined(_DOUBLE_IS_STRING) || defined(_DOUBLE_IS_UNICODE)
+        if (itemsize == 0) { /* unicode */
+            itemsize = PyUnicode_GetLength(scalar) * PyUnicode_KIND(scalar);
+        }
+        memcpy(dest, src, itemsize);
+#else
+        *((npy_double *)dest) = *((npy_double *)src);
+#endif
+    }
+    Py_DECREF(scalar);
+
+finish:
+    /* single release point for the reference taken before PyArray_FromAny */
+    Py_DECREF(typecode);
+    return result;
+}
+#undef _DOUBLE_IS_DOUBLE
+
+
+#line 2894
+
+/* used as a pattern for testing token equality */
+#define _LONGDOUBLE_IS_LONGDOUBLE
+
+/*
+ * tp_new slot for the `longdouble` scalar type.
+ *
+ * If one of `_LONGDOUBLE_IS_UNICODE`, `_LONGDOUBLE_IS_STRING` or
+ * `_LONGDOUBLE_IS_DOUBLE` is defined, the matching builtin constructor is
+ * tried first and its result is returned on success.
+ *
+ * Otherwise at most one positional argument is accepted:
+ *   - no argument: a new scalar is created (and zeroed unless this is the
+ *     string/unicode instantiation);
+ *   - one argument: it is converted with PyArray_FromAny using the
+ *     NPY_LONGDOUBLE descriptor and NPY_ARRAY_FORCECAST. A result with one
+ *     or more dimensions is returned as an array; a 0-d result is turned
+ *     into a scalar. When that scalar's type is not exactly `type`, a new
+ *     `type` instance is allocated and the value is copied into it.
+ *
+ * Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+longdouble_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    PyObject *obj = NULL;
+    PyObject *scalar = NULL;
+    PyObject *result = NULL;
+    PyArray_Descr *typecode = NULL;
+    PyArrayObject *arr = NULL;
+    int itemsize = 0;
+
+    /* give the builtin base class (if there is one) the first attempt */
+#if defined(_LONGDOUBLE_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_LONGDOUBLE_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_LONGDOUBLE_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_LONGDOUBLE_IS_UNICODE) || defined(_LONGDOUBLE_IS_STRING) || defined(_LONGDOUBLE_IS_DOUBLE)
+    if (from_superclass != NULL) {
+#if defined(_LONGDOUBLE_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+    /* keep the base-class error unless numpy can handle the arguments */
+    if (PyTuple_GET_SIZE(args) != 1 || (kwds != NULL && PyDict_Size(kwds) != 0)) {
+        return NULL;
+    }
+    PyErr_Clear();
+#endif
+
+    /* TODO: include type name in error message, which is not longdouble */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    typecode = PyArray_DescrFromType(NPY_LONGDOUBLE);
+    if (typecode == NULL) {
+        return NULL;
+    }
+
+    if (obj == NULL) {
+        /* called without a value: hand back a freshly created scalar */
+        result = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+#if !defined(_LONGDOUBLE_IS_STRING) && !defined(_LONGDOUBLE_IS_UNICODE)
+        if (result != NULL) {
+            memset(&PyArrayScalar_VAL(result, LongDouble), 0, sizeof(npy_longdouble));
+        }
+#endif
+        return result;
+    }
+
+    /*
+     * PyArray_FromAny steals a reference to the descriptor; take an extra
+     * one first so `typecode` stays usable until the common exit below.
+     */
+    Py_INCREF(typecode);
+    arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        goto finish;
+    }
+    if (PyArray_NDIM(arr) > 0) {
+        /* not a 0-d result: the array itself is the return value */
+        result = (PyObject *)arr;
+        goto finish;
+    }
+
+    /* Convert the 0-d array to a scalar */
+    scalar = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+    if (scalar == NULL || Py_TYPE(scalar) == type) {
+        result = scalar;
+        goto finish;
+    }
+
+    /*
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - it is added back here by
+     * allocating an instance of `type` and copying the data area over.
+     *
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(scalar);
+    }
+    result = type->tp_alloc(type, itemsize);
+    if (result != NULL) {
+        void *dest = scalar_value(result, typecode);
+        void *src = scalar_value(scalar, typecode);
+#if defined(_LONGDOUBLE_IS_STRING) || defined(_LONGDOUBLE_IS_UNICODE)
+        if (itemsize == 0) { /* unicode */
+            itemsize = PyUnicode_GetLength(scalar) * PyUnicode_KIND(scalar);
+        }
+        memcpy(dest, src, itemsize);
+#else
+        *((npy_longdouble *)dest) = *((npy_longdouble *)src);
+#endif
+    }
+    Py_DECREF(scalar);
+
+finish:
+    /* single release point for the reference taken before PyArray_FromAny */
+    Py_DECREF(typecode);
+    return result;
+}
+#undef _LONGDOUBLE_IS_LONGDOUBLE
+
+
+#line 2894
+
+/* used as a pattern for testing token equality */
+#define _CFLOAT_IS_CFLOAT
+
+/*
+ * tp_new slot for the `cfloat` scalar type.
+ *
+ * If one of `_CFLOAT_IS_UNICODE`, `_CFLOAT_IS_STRING` or `_CFLOAT_IS_DOUBLE`
+ * is defined, the matching builtin constructor is tried first and its
+ * result is returned on success.
+ *
+ * Otherwise at most one positional argument is accepted:
+ *   - no argument: a new scalar is created (and zeroed unless this is the
+ *     string/unicode instantiation);
+ *   - one argument: it is converted with PyArray_FromAny using the
+ *     NPY_CFLOAT descriptor and NPY_ARRAY_FORCECAST. A result with one or
+ *     more dimensions is returned as an array; a 0-d result is turned into
+ *     a scalar. When that scalar's type is not exactly `type`, a new `type`
+ *     instance is allocated and the value is copied into it.
+ *
+ * Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+cfloat_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    PyObject *obj = NULL;
+    PyObject *scalar = NULL;
+    PyObject *result = NULL;
+    PyArray_Descr *typecode = NULL;
+    PyArrayObject *arr = NULL;
+    int itemsize = 0;
+
+    /* give the builtin base class (if there is one) the first attempt */
+#if defined(_CFLOAT_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_CFLOAT_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_CFLOAT_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_CFLOAT_IS_UNICODE) || defined(_CFLOAT_IS_STRING) || defined(_CFLOAT_IS_DOUBLE)
+    if (from_superclass != NULL) {
+#if defined(_CFLOAT_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+    /* keep the base-class error unless numpy can handle the arguments */
+    if (PyTuple_GET_SIZE(args) != 1 || (kwds != NULL && PyDict_Size(kwds) != 0)) {
+        return NULL;
+    }
+    PyErr_Clear();
+#endif
+
+    /* TODO: include type name in error message, which is not cfloat */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    typecode = PyArray_DescrFromType(NPY_CFLOAT);
+    if (typecode == NULL) {
+        return NULL;
+    }
+
+    if (obj == NULL) {
+        /* called without a value: hand back a freshly created scalar */
+        result = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+#if !defined(_CFLOAT_IS_STRING) && !defined(_CFLOAT_IS_UNICODE)
+        if (result != NULL) {
+            memset(&PyArrayScalar_VAL(result, CFloat), 0, sizeof(npy_cfloat));
+        }
+#endif
+        return result;
+    }
+
+    /*
+     * PyArray_FromAny steals a reference to the descriptor; take an extra
+     * one first so `typecode` stays usable until the common exit below.
+     */
+    Py_INCREF(typecode);
+    arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        goto finish;
+    }
+    if (PyArray_NDIM(arr) > 0) {
+        /* not a 0-d result: the array itself is the return value */
+        result = (PyObject *)arr;
+        goto finish;
+    }
+
+    /* Convert the 0-d array to a scalar */
+    scalar = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+    if (scalar == NULL || Py_TYPE(scalar) == type) {
+        result = scalar;
+        goto finish;
+    }
+
+    /*
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - it is added back here by
+     * allocating an instance of `type` and copying the data area over.
+     *
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(scalar);
+    }
+    result = type->tp_alloc(type, itemsize);
+    if (result != NULL) {
+        void *dest = scalar_value(result, typecode);
+        void *src = scalar_value(scalar, typecode);
+#if defined(_CFLOAT_IS_STRING) || defined(_CFLOAT_IS_UNICODE)
+        if (itemsize == 0) { /* unicode */
+            itemsize = PyUnicode_GetLength(scalar) * PyUnicode_KIND(scalar);
+        }
+        memcpy(dest, src, itemsize);
+#else
+        *((npy_cfloat *)dest) = *((npy_cfloat *)src);
+#endif
+    }
+    Py_DECREF(scalar);
+
+finish:
+    /* single release point for the reference taken before PyArray_FromAny */
+    Py_DECREF(typecode);
+    return result;
+}
+#undef _CFLOAT_IS_CFLOAT
+
+
+#line 2894
+
+/* used as a pattern for testing token equality */
+#define _CDOUBLE_IS_CDOUBLE
+
+/*
+ * tp_new slot for the `cdouble` scalar type.
+ *
+ * If one of `_CDOUBLE_IS_UNICODE`, `_CDOUBLE_IS_STRING` or
+ * `_CDOUBLE_IS_DOUBLE` is defined, the matching builtin constructor is
+ * tried first and its result is returned on success.
+ *
+ * Otherwise at most one positional argument is accepted:
+ *   - no argument: a new scalar is created (and zeroed unless this is the
+ *     string/unicode instantiation);
+ *   - one argument: it is converted with PyArray_FromAny using the
+ *     NPY_CDOUBLE descriptor and NPY_ARRAY_FORCECAST. A result with one or
+ *     more dimensions is returned as an array; a 0-d result is turned into
+ *     a scalar. When that scalar's type is not exactly `type`, a new `type`
+ *     instance is allocated and the value is copied into it.
+ *
+ * Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+cdouble_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    PyObject *obj = NULL;
+    PyObject *scalar = NULL;
+    PyObject *result = NULL;
+    PyArray_Descr *typecode = NULL;
+    PyArrayObject *arr = NULL;
+    int itemsize = 0;
+
+    /* give the builtin base class (if there is one) the first attempt */
+#if defined(_CDOUBLE_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_CDOUBLE_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_CDOUBLE_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_CDOUBLE_IS_UNICODE) || defined(_CDOUBLE_IS_STRING) || defined(_CDOUBLE_IS_DOUBLE)
+    if (from_superclass != NULL) {
+#if defined(_CDOUBLE_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+    /* keep the base-class error unless numpy can handle the arguments */
+    if (PyTuple_GET_SIZE(args) != 1 || (kwds != NULL && PyDict_Size(kwds) != 0)) {
+        return NULL;
+    }
+    PyErr_Clear();
+#endif
+
+    /* TODO: include type name in error message, which is not cdouble */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    typecode = PyArray_DescrFromType(NPY_CDOUBLE);
+    if (typecode == NULL) {
+        return NULL;
+    }
+
+    if (obj == NULL) {
+        /* called without a value: hand back a freshly created scalar */
+        result = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+#if !defined(_CDOUBLE_IS_STRING) && !defined(_CDOUBLE_IS_UNICODE)
+        if (result != NULL) {
+            memset(&PyArrayScalar_VAL(result, CDouble), 0, sizeof(npy_cdouble));
+        }
+#endif
+        return result;
+    }
+
+    /*
+     * PyArray_FromAny steals a reference to the descriptor; take an extra
+     * one first so `typecode` stays usable until the common exit below.
+     */
+    Py_INCREF(typecode);
+    arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        goto finish;
+    }
+    if (PyArray_NDIM(arr) > 0) {
+        /* not a 0-d result: the array itself is the return value */
+        result = (PyObject *)arr;
+        goto finish;
+    }
+
+    /* Convert the 0-d array to a scalar */
+    scalar = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+    if (scalar == NULL || Py_TYPE(scalar) == type) {
+        result = scalar;
+        goto finish;
+    }
+
+    /*
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - it is added back here by
+     * allocating an instance of `type` and copying the data area over.
+     *
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(scalar);
+    }
+    result = type->tp_alloc(type, itemsize);
+    if (result != NULL) {
+        void *dest = scalar_value(result, typecode);
+        void *src = scalar_value(scalar, typecode);
+#if defined(_CDOUBLE_IS_STRING) || defined(_CDOUBLE_IS_UNICODE)
+        if (itemsize == 0) { /* unicode */
+            itemsize = PyUnicode_GetLength(scalar) * PyUnicode_KIND(scalar);
+        }
+        memcpy(dest, src, itemsize);
+#else
+        *((npy_cdouble *)dest) = *((npy_cdouble *)src);
+#endif
+    }
+    Py_DECREF(scalar);
+
+finish:
+    /* single release point for the reference taken before PyArray_FromAny */
+    Py_DECREF(typecode);
+    return result;
+}
+#undef _CDOUBLE_IS_CDOUBLE
+
+
+#line 2894
+
+/* used as a pattern for testing token equality */
+#define _CLONGDOUBLE_IS_CLONGDOUBLE
+
+/*
+ * tp_new slot for the `clongdouble` scalar type.
+ *
+ * If one of `_CLONGDOUBLE_IS_UNICODE`, `_CLONGDOUBLE_IS_STRING` or
+ * `_CLONGDOUBLE_IS_DOUBLE` is defined, the matching builtin constructor is
+ * tried first and its result is returned on success.
+ *
+ * Otherwise at most one positional argument is accepted:
+ *   - no argument: a new scalar is created (and zeroed unless this is the
+ *     string/unicode instantiation);
+ *   - one argument: it is converted with PyArray_FromAny using the
+ *     NPY_CLONGDOUBLE descriptor and NPY_ARRAY_FORCECAST. A result with one
+ *     or more dimensions is returned as an array; a 0-d result is turned
+ *     into a scalar. When that scalar's type is not exactly `type`, a new
+ *     `type` instance is allocated and the value is copied into it.
+ *
+ * Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+clongdouble_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    PyObject *obj = NULL;
+    PyObject *scalar = NULL;
+    PyObject *result = NULL;
+    PyArray_Descr *typecode = NULL;
+    PyArrayObject *arr = NULL;
+    int itemsize = 0;
+
+    /* give the builtin base class (if there is one) the first attempt */
+#if defined(_CLONGDOUBLE_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_CLONGDOUBLE_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_CLONGDOUBLE_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_CLONGDOUBLE_IS_UNICODE) || defined(_CLONGDOUBLE_IS_STRING) || defined(_CLONGDOUBLE_IS_DOUBLE)
+    if (from_superclass != NULL) {
+#if defined(_CLONGDOUBLE_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+    /* keep the base-class error unless numpy can handle the arguments */
+    if (PyTuple_GET_SIZE(args) != 1 || (kwds != NULL && PyDict_Size(kwds) != 0)) {
+        return NULL;
+    }
+    PyErr_Clear();
+#endif
+
+    /* TODO: include type name in error message, which is not clongdouble */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    typecode = PyArray_DescrFromType(NPY_CLONGDOUBLE);
+    if (typecode == NULL) {
+        return NULL;
+    }
+
+    if (obj == NULL) {
+        /* called without a value: hand back a freshly created scalar */
+        result = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+#if !defined(_CLONGDOUBLE_IS_STRING) && !defined(_CLONGDOUBLE_IS_UNICODE)
+        if (result != NULL) {
+            memset(&PyArrayScalar_VAL(result, CLongDouble), 0, sizeof(npy_clongdouble));
+        }
+#endif
+        return result;
+    }
+
+    /*
+     * PyArray_FromAny steals a reference to the descriptor; take an extra
+     * one first so `typecode` stays usable until the common exit below.
+     */
+    Py_INCREF(typecode);
+    arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        goto finish;
+    }
+    if (PyArray_NDIM(arr) > 0) {
+        /* not a 0-d result: the array itself is the return value */
+        result = (PyObject *)arr;
+        goto finish;
+    }
+
+    /* Convert the 0-d array to a scalar */
+    scalar = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+    if (scalar == NULL || Py_TYPE(scalar) == type) {
+        result = scalar;
+        goto finish;
+    }
+
+    /*
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - it is added back here by
+     * allocating an instance of `type` and copying the data area over.
+     *
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(scalar);
+    }
+    result = type->tp_alloc(type, itemsize);
+    if (result != NULL) {
+        void *dest = scalar_value(result, typecode);
+        void *src = scalar_value(scalar, typecode);
+#if defined(_CLONGDOUBLE_IS_STRING) || defined(_CLONGDOUBLE_IS_UNICODE)
+        if (itemsize == 0) { /* unicode */
+            itemsize = PyUnicode_GetLength(scalar) * PyUnicode_KIND(scalar);
+        }
+        memcpy(dest, src, itemsize);
+#else
+        *((npy_clongdouble *)dest) = *((npy_clongdouble *)src);
+#endif
+    }
+    Py_DECREF(scalar);
+
+finish:
+    /* single release point for the reference taken before PyArray_FromAny */
+    Py_DECREF(typecode);
+    return result;
+}
+#undef _CLONGDOUBLE_IS_CLONGDOUBLE
+
+
+#line 2894
+
+/* used as a pattern for testing token equality */
+#define _STRING_IS_STRING
+
+/*
+ * tp_new slot for the `string` (bytes) scalar type.
+ *
+ * The builtin constructor selected by the `_STRING_IS_*` macros
+ * (PyBytes_Type.tp_new for `_STRING_IS_STRING`) is tried first and its
+ * result is returned on success. On failure the error is only discarded
+ * when exactly one positional argument and no keywords were given, so the
+ * numpy conversion can be tried.
+ *
+ * The numpy path accepts at most one positional argument:
+ *   - no argument: a new scalar is created via PyArray_Scalar;
+ *   - one argument: it is converted with PyArray_FromAny using the
+ *     NPY_STRING descriptor and NPY_ARRAY_FORCECAST. A result with one or
+ *     more dimensions is returned as an array; a 0-d result is turned into
+ *     a scalar. When that scalar's type is not exactly `type`, a new `type`
+ *     instance is allocated and the payload is copied into it.
+ *
+ * Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+string_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    PyObject *obj = NULL;
+    PyObject *scalar = NULL;
+    PyObject *result = NULL;
+    PyArray_Descr *typecode = NULL;
+    PyArrayObject *arr = NULL;
+    /*
+     * Py_ssize_t rather than int: this holds the value of PyBytes_GET_SIZE
+     * and is passed to tp_alloc/memcpy, so it must not be narrowed.
+     */
+    Py_ssize_t itemsize = 0;
+
+    /* allow base-class (if any) to do conversion */
+#if defined(_STRING_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_STRING_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_STRING_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_STRING_IS_UNICODE) || defined(_STRING_IS_STRING) || defined(_STRING_IS_DOUBLE)
+    if (from_superclass != NULL) {
+#if defined(_STRING_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+    /* don't clear the exception unless numpy can handle the arguments */
+    if (PyTuple_GET_SIZE(args) != 1 || (kwds != NULL && PyDict_Size(kwds) != 0)) {
+        return NULL;
+    }
+    PyErr_Clear();
+#endif
+
+    /* TODO: include type name in error message, which is not string */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    typecode = PyArray_DescrFromType(NPY_STRING);
+    if (typecode == NULL) {
+        return NULL;
+    }
+
+    if (obj == NULL) {
+        /* called without a value: hand back a freshly created scalar */
+        result = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+#if !defined(_STRING_IS_STRING) && !defined(_STRING_IS_UNICODE)
+        if (result != NULL) {
+            memset(&PyArrayScalar_VAL(result, String), 0, sizeof(npy_string));
+        }
+#endif
+        return result;
+    }
+
+    /*
+     * PyArray_FromAny steals a reference to the descriptor; take an extra
+     * one first so `typecode` stays usable until the common exit below.
+     */
+    Py_INCREF(typecode);
+    arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        goto finish;
+    }
+    if (PyArray_NDIM(arr) > 0) {
+        /* not a 0-d result: the array itself is the return value */
+        result = (PyObject *)arr;
+        goto finish;
+    }
+
+    /* Convert the 0-d array to a scalar */
+    scalar = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+    if (scalar == NULL || Py_TYPE(scalar) == type) {
+        result = scalar;
+        goto finish;
+    }
+
+    /*
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - it is added back here by
+     * allocating an instance of `type` and copying the data area over.
+     *
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(scalar);
+    }
+    result = type->tp_alloc(type, itemsize);
+    if (result != NULL) {
+        void *dest = scalar_value(result, typecode);
+        void *src = scalar_value(scalar, typecode);
+#if defined(_STRING_IS_UNICODE)
+        /*
+         * Only the unicode instantiation derives the size from the unicode
+         * object. For bytes, a size of 0 simply means an empty payload and
+         * the PyUnicode_* accessors must not be applied to it.
+         */
+        if (itemsize == 0) {
+            itemsize = PyUnicode_GetLength(scalar) * PyUnicode_KIND(scalar);
+        }
+#endif
+#if defined(_STRING_IS_STRING) || defined(_STRING_IS_UNICODE)
+        memcpy(dest, src, itemsize);
+#else
+        *((npy_string *)dest) = *((npy_string *)src);
+#endif
+    }
+    Py_DECREF(scalar);
+
+finish:
+    /* single release point for the reference taken before PyArray_FromAny */
+    Py_DECREF(typecode);
+    return result;
+}
+#undef _STRING_IS_STRING
+
+
+#line 2894
+
+/* used as a pattern for testing token equality */
+#define _UNICODE_IS_UNICODE
+
+/*
+ * tp_new slot for the `unicode` scalar type.
+ *
+ * The builtin constructor selected by the `_UNICODE_IS_*` macros
+ * (PyUnicode_Type.tp_new for `_UNICODE_IS_UNICODE`) is tried first; on
+ * success the scalar's value slot is reset to NULL and the object is
+ * returned. On failure the error is only discarded when exactly one
+ * positional argument and no keywords were given, so the numpy conversion
+ * can be tried.
+ *
+ * The numpy path accepts at most one positional argument:
+ *   - no argument: a new scalar is created via PyArray_Scalar;
+ *   - one argument: it is converted with PyArray_FromAny using the
+ *     NPY_UNICODE descriptor and NPY_ARRAY_FORCECAST. A result with one or
+ *     more dimensions is returned as an array; a 0-d result is turned into
+ *     a scalar. When that scalar's type is not exactly `type`, a new `type`
+ *     instance is allocated and the payload is copied into it.
+ *
+ * Returns a new reference, or NULL on failure.
+ */
+static PyObject *
+unicode_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    PyObject *obj = NULL;
+    PyObject *scalar = NULL;
+    PyObject *result = NULL;
+    PyArray_Descr *typecode = NULL;
+    PyArrayObject *arr = NULL;
+    /*
+     * Py_ssize_t rather than int: the byte count below is a product of
+     * PyUnicode_GetLength (a Py_ssize_t) and must not be narrowed before
+     * it reaches memcpy.
+     */
+    Py_ssize_t itemsize = 0;
+
+    /* allow base-class (if any) to do conversion */
+#if defined(_UNICODE_IS_UNICODE)
+    PyObject *from_superclass = PyUnicode_Type.tp_new(type, args, kwds);
+#elif defined(_UNICODE_IS_STRING)
+    PyObject *from_superclass = PyBytes_Type.tp_new(type, args, kwds);
+#elif defined(_UNICODE_IS_DOUBLE)
+    PyObject *from_superclass = PyFloat_Type.tp_new(type, args, kwds);
+#endif
+#if defined(_UNICODE_IS_UNICODE) || defined(_UNICODE_IS_STRING) || defined(_UNICODE_IS_DOUBLE)
+    if (from_superclass != NULL) {
+#if defined(_UNICODE_IS_UNICODE)
+        PyArrayScalar_VAL(from_superclass, Unicode) = NULL;
+#endif
+        return from_superclass;
+    }
+    /* don't clear the exception unless numpy can handle the arguments */
+    if (PyTuple_GET_SIZE(args) != 1 || (kwds != NULL && PyDict_Size(kwds) != 0)) {
+        return NULL;
+    }
+    PyErr_Clear();
+#endif
+
+    /* TODO: include type name in error message, which is not unicode */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O", kwnames, &obj)) {
+        return NULL;
+    }
+    typecode = PyArray_DescrFromType(NPY_UNICODE);
+    if (typecode == NULL) {
+        return NULL;
+    }
+
+    if (obj == NULL) {
+        /* called without a value: hand back a freshly created scalar */
+        result = PyArray_Scalar(NULL, typecode, NULL);
+        Py_DECREF(typecode);
+#if !defined(_UNICODE_IS_STRING) && !defined(_UNICODE_IS_UNICODE)
+        if (result != NULL) {
+            memset(&PyArrayScalar_VAL(result, Unicode), 0, sizeof(npy_unicode));
+        }
+#endif
+        return result;
+    }
+
+    /*
+     * PyArray_FromAny steals a reference to the descriptor; take an extra
+     * one first so `typecode` stays usable until the common exit below.
+     */
+    Py_INCREF(typecode);
+    arr = (PyArrayObject *)PyArray_FromAny(
+            obj, typecode, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    if (arr == NULL) {
+        goto finish;
+    }
+    if (PyArray_NDIM(arr) > 0) {
+        /* not a 0-d result: the array itself is the return value */
+        result = (PyObject *)arr;
+        goto finish;
+    }
+
+    /* Convert the 0-d array to a scalar */
+    scalar = PyArray_ToScalar(PyArray_DATA(arr), arr);
+    Py_DECREF(arr);
+    if (scalar == NULL || Py_TYPE(scalar) == type) {
+        result = scalar;
+        goto finish;
+    }
+
+    /*
+     * `typecode` does not contain any subclass information, as it was thrown
+     * out by the call to `PyArray_DescrFromType` - it is added back here by
+     * allocating an instance of `type` and copying the data area over.
+     *
+     * FIXME[gh-15467]: This branch is also hit for the "shadowed" builtin
+     * types like `longdouble` (which on platforms where they are the same size
+     * is shadowed by `double`), because `PyArray_FromAny` returns the
+     * shadowing type rather than the requested one.
+     */
+    if (type->tp_itemsize) {
+        itemsize = PyBytes_GET_SIZE(scalar);
+    }
+    result = type->tp_alloc(type, itemsize);
+    if (result != NULL) {
+        void *dest = scalar_value(result, typecode);
+        void *src = scalar_value(scalar, typecode);
+#if defined(_UNICODE_IS_STRING) || defined(_UNICODE_IS_UNICODE)
+        if (itemsize == 0) { /* unicode */
+            itemsize = PyUnicode_GetLength(scalar) * PyUnicode_KIND(scalar);
+        }
+        memcpy(dest, src, itemsize);
+#else
+        *((npy_unicode *)dest) = *((npy_unicode *)src);
+#endif
+    }
+    Py_DECREF(scalar);
+
+finish:
+    /* single release point for the reference taken before PyArray_FromAny */
+    Py_DECREF(typecode);
+    return result;
+}
+#undef _UNICODE_IS_UNICODE
+
+
+
+/*
+ * tp_new for the object scalar type.
+ *
+ * Accepts at most one positional-only argument (defaulting to None),
+ * converts it using the NPY_OBJECT descriptor and hands the result to
+ * PyArray_Return. The requested `type` is not consulted.
+ * Returns NULL if argument parsing or descriptor lookup fails.
+ */
+static PyObject *
+object_arrtype_new(PyTypeObject *NPY_UNUSED(type), PyObject *args, PyObject *kwds)
+{
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    PyObject *value = Py_None;
+    PyArray_Descr *object_descr;
+    PyArrayObject *converted;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O:object_", kwnames, &value)) {
+        return NULL;
+    }
+
+    object_descr = PyArray_DescrFromType(NPY_OBJECT);
+    if (object_descr == NULL) {
+        return NULL;
+    }
+
+    /* PyArray_FromAny steals the descriptor reference, so no DECREF here. */
+    converted = (PyArrayObject *)PyArray_FromAny(value, object_descr,
+                                    0, 0, NPY_ARRAY_FORCECAST, NULL);
+    return PyArray_Return(converted);
+}
+
+#line 3034
+
+/*
+ * tp_new for the Datetime scalar.
+ *
+ * Accepts up to two positional-only arguments: the value to convert (`obj`)
+ * and an optional metadata object (`meta_obj`).
+ *
+ * NOTE: the scalar is always allocated through PyDatetimeArrType_Type; the
+ * `type` argument is not consulted in this function.
+ *
+ * Returns a new scalar, or NULL if parsing, allocation or one of the
+ * conversion helpers fails.
+ */
+static PyObject *
+datetime_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    PyObject *obj = NULL, *meta_obj = NULL;
+    PyDatetimeScalarObject *ret;
+
+    static char *kwnames[] = {"", "", NULL};  /* positional-only */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OO", kwnames, &obj, &meta_obj)) {
+        return NULL;
+    }
+
+    /* Allocate the return scalar */
+    ret = (PyDatetimeScalarObject *)PyDatetimeArrType_Type.tp_alloc(
+                                            &PyDatetimeArrType_Type, 0);
+    if (ret == NULL) {
+        return NULL;
+    }
+
+    /* Incorporate the metadata if it is provided */
+    if (meta_obj != NULL) {
+        /* Parse the provided metadata input */
+        if (convert_pyobject_to_datetime_metadata(meta_obj, &ret->obmeta)
+                                                                    < 0) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+    }
+    else {
+        /*
+         * A unit of -1 signals that convert_pyobject_to_datetime
+         * should populate.
+         */
+        ret->obmeta.base = -1;
+    }
+
+    if (obj == NULL) {
+        /* No value given: fall back to the default unit if none was set */
+        if (ret->obmeta.base == -1) {
+            ret->obmeta.base = NPY_DATETIME_DEFAULTUNIT;
+            ret->obmeta.num = 1;
+        }
+
+        /* Make datetime default to NaT, timedelta default to zero */
+        /* This instantiation has the NaT branch enabled. */
+#if 1
+        ret->obval = NPY_DATETIME_NAT;
+#else
+        ret->obval = 0;
+#endif
+    }
+    else if (convert_pyobject_to_datetime(&ret->obmeta, obj,
+                            NPY_SAME_KIND_CASTING, &ret->obval) < 0) {
+        /* Conversion failed: release the partially built scalar */
+        Py_DECREF(ret);
+        return NULL;
+    }
+
+    return (PyObject *)ret;
+}
+
+#line 3034
+
+/*
+ * tp_new for the Timedelta scalar.
+ *
+ * Accepts up to two positional-only arguments: the value to convert (`obj`)
+ * and an optional metadata object (`meta_obj`).
+ *
+ * NOTE: the scalar is always allocated through PyTimedeltaArrType_Type; the
+ * `type` argument is not consulted in this function.
+ *
+ * Returns a new scalar, or NULL if parsing, allocation or one of the
+ * conversion helpers fails.
+ */
+static PyObject *
+timedelta_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    PyObject *obj = NULL, *meta_obj = NULL;
+    PyTimedeltaScalarObject *ret;
+
+    static char *kwnames[] = {"", "", NULL};  /* positional-only */
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|OO", kwnames, &obj, &meta_obj)) {
+        return NULL;
+    }
+
+    /* Allocate the return scalar */
+    ret = (PyTimedeltaScalarObject *)PyTimedeltaArrType_Type.tp_alloc(
+                                            &PyTimedeltaArrType_Type, 0);
+    if (ret == NULL) {
+        return NULL;
+    }
+
+    /* Incorporate the metadata if it is provided */
+    if (meta_obj != NULL) {
+        /* Parse the provided metadata input */
+        if (convert_pyobject_to_datetime_metadata(meta_obj, &ret->obmeta)
+                                                                    < 0) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+    }
+    else {
+        /*
+         * A unit of -1 signals that convert_pyobject_to_datetime
+         * should populate.
+         */
+        ret->obmeta.base = -1;
+    }
+
+    if (obj == NULL) {
+        /* No value given: fall back to the default unit if none was set */
+        if (ret->obmeta.base == -1) {
+            ret->obmeta.base = NPY_DATETIME_DEFAULTUNIT;
+            ret->obmeta.num = 1;
+        }
+
+        /* Make datetime default to NaT, timedelta default to zero */
+        /* This instantiation has the zero branch enabled. */
+#if 0
+        ret->obval = NPY_DATETIME_NAT;
+#else
+        ret->obval = 0;
+#endif
+    }
+    else if (convert_pyobject_to_timedelta(&ret->obmeta, obj,
+                            NPY_SAME_KIND_CASTING, &ret->obval) < 0) {
+        /* Conversion failed: release the partially built scalar */
+        Py_DECREF(ret);
+        return NULL;
+    }
+
+    return (PyObject *)ret;
+}
+
+
+/*
+ * tp_new for the bool scalar type.
+ *
+ * Takes at most one positional-only argument. A missing argument, Py_False
+ * and Py_True are answered directly through the PyArrayScalar_RETURN_*
+ * macros. Anything else is converted to a bool array; a 0-d result is
+ * reduced to a bool scalar, otherwise the array (or NULL on failure) is
+ * passed through PyArray_Return.
+ */
+static PyObject *
+bool_arrtype_new(PyTypeObject *NPY_UNUSED(type), PyObject *args, PyObject *kwds)
+{
+    static char *kwnames[] = {"", NULL};  /* positional-only */
+    PyObject *value = NULL;
+    PyArrayObject *converted;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|O:bool_", kwnames, &value)) {
+        return NULL;
+    }
+
+    /* Fast paths that need no array conversion. */
+    if (value == NULL || value == Py_False) {
+        PyArrayScalar_RETURN_FALSE;
+    }
+    if (value == Py_True) {
+        PyArrayScalar_RETURN_TRUE;
+    }
+
+    converted = (PyArrayObject *)PyArray_FROM_OTF(value,
+                                NPY_BOOL, NPY_ARRAY_FORCECAST);
+    if (converted != NULL && PyArray_NDIM(converted) == 0) {
+        npy_bool truth = *((npy_bool *)PyArray_DATA(converted));
+        Py_DECREF(converted);
+        PyArrayScalar_RETURN_BOOL_FROM_LONG(truth);
+    }
+    /* Either the conversion failed (NULL) or the result has dimensions. */
+    return PyArray_Return(converted);
+}
+
+/*
+ * `&` for bool scalars: when both operands are Bool scalars the result is
+ * computed by identity comparison against PyArrayScalar_True; any other
+ * combination is delegated to the generic scalar type's nb_and.
+ */
+static PyObject *
+bool_arrtype_and(PyObject *a, PyObject *b)
+{
+    if (PyArray_IsScalar(a, Bool) && PyArray_IsScalar(b, Bool)) {
+        const int lhs = (a == PyArrayScalar_True);
+        const int rhs = (b == PyArrayScalar_True);
+        PyArrayScalar_RETURN_BOOL_FROM_LONG(lhs && rhs);
+    }
+    return PyGenericArrType_Type.tp_as_number->nb_and(a, b);
+}
+
+/*
+ * `|` for bool scalars: when both operands are Bool scalars the result is
+ * computed by identity comparison against PyArrayScalar_True; any other
+ * combination is delegated to the generic scalar type's nb_or.
+ */
+static PyObject *
+bool_arrtype_or(PyObject *a, PyObject *b)
+{
+    if (PyArray_IsScalar(a, Bool) && PyArray_IsScalar(b, Bool)) {
+        const int lhs = (a == PyArrayScalar_True);
+        const int rhs = (b == PyArrayScalar_True);
+        PyArrayScalar_RETURN_BOOL_FROM_LONG(lhs || rhs);
+    }
+    return PyGenericArrType_Type.tp_as_number->nb_or(a, b);
+}
+
+/*
+ * `^` for bool scalars: when both operands are Bool scalars the result is
+ * computed by identity comparison against PyArrayScalar_True; any other
+ * combination is delegated to the generic scalar type's nb_xor.
+ */
+static PyObject *
+bool_arrtype_xor(PyObject *a, PyObject *b)
+{
+    if (PyArray_IsScalar(a, Bool) && PyArray_IsScalar(b, Bool)) {
+        const int lhs = (a == PyArrayScalar_True);
+        const int rhs = (b == PyArrayScalar_True);
+        PyArrayScalar_RETURN_BOOL_FROM_LONG(lhs != rhs);
+    }
+    return PyGenericArrType_Type.tp_as_number->nb_xor(a, b);
+}
+
+/* Truth value of a bool scalar: 1 only when `a` is PyArrayScalar_True. */
+static int
+bool_arrtype_nonzero(PyObject *a)
+{
+    if (a == PyArrayScalar_True) {
+        return 1;
+    }
+    return 0;
+}
+
+#line 3167
+static PyNumberMethods byte_arrtype_as_number;
+/* Return the value stored in a Byte scalar as a Python int. */
+static PyObject *
+byte_index(PyObject *self)
+{
+    long value = PyArrayScalar_VAL(self, Byte);
+    return PyLong_FromLong(value);
+}
+
+#line 3167
+static PyNumberMethods short_arrtype_as_number;
+/* Return the value stored in a Short scalar as a Python int. */
+static PyObject *
+short_index(PyObject *self)
+{
+    long value = PyArrayScalar_VAL(self, Short);
+    return PyLong_FromLong(value);
+}
+
+#line 3167
+static PyNumberMethods int_arrtype_as_number;
+/* Return the value stored in an Int scalar as a Python int. */
+static PyObject *
+int_index(PyObject *self)
+{
+    long value = PyArrayScalar_VAL(self, Int);
+    return PyLong_FromLong(value);
+}
+
+#line 3167
+static PyNumberMethods long_arrtype_as_number;
+/* Return the value stored in a Long scalar as a Python int. */
+static PyObject *
+long_index(PyObject *self)
+{
+    long value = PyArrayScalar_VAL(self, Long);
+    return PyLong_FromLong(value);
+}
+
+#line 3167
+static PyNumberMethods ubyte_arrtype_as_number;
+/* Return the value stored in a UByte scalar as a Python int. */
+static PyObject *
+ubyte_index(PyObject *self)
+{
+    long value = PyArrayScalar_VAL(self, UByte);
+    return PyLong_FromLong(value);
+}
+
+#line 3167
+static PyNumberMethods ushort_arrtype_as_number;
+/* Return the value stored in a UShort scalar as a Python int. */
+static PyObject *
+ushort_index(PyObject *self)
+{
+    long value = PyArrayScalar_VAL(self, UShort);
+    return PyLong_FromLong(value);
+}
+
+#line 3167
+static PyNumberMethods longlong_arrtype_as_number;
+/* Return the value stored in a LongLong scalar as a Python int. */
+static PyObject *
+longlong_index(PyObject *self)
+{
+    npy_longlong value = PyArrayScalar_VAL(self, LongLong);
+    return PyLong_FromLongLong(value);
+}
+
+#line 3167
+static PyNumberMethods uint_arrtype_as_number;
+/* Return the value stored in a UInt scalar as a Python int. */
+static PyObject *
+uint_index(PyObject *self)
+{
+    unsigned long value = PyArrayScalar_VAL(self, UInt);
+    return PyLong_FromUnsignedLong(value);
+}
+
+#line 3167
+static PyNumberMethods ulong_arrtype_as_number;
+/* Return the value stored in a ULong scalar as a Python int. */
+static PyObject *
+ulong_index(PyObject *self)
+{
+    unsigned long value = PyArrayScalar_VAL(self, ULong);
+    return PyLong_FromUnsignedLong(value);
+}
+
+#line 3167
+static PyNumberMethods ulonglong_arrtype_as_number;
+/* Return the value stored in a ULongLong scalar as a Python int. */
+static PyObject *
+ulonglong_index(PyObject *self)
+{
+    npy_ulonglong value = PyArrayScalar_VAL(self, ULongLong);
+    return PyLong_FromUnsignedLongLong(value);
+}
+
+
+#line 3181
+static PyNumberMethods half_arrtype_as_number;
+
+#line 3181
+static PyNumberMethods float_arrtype_as_number;
+
+#line 3181
+static PyNumberMethods double_arrtype_as_number;
+
+#line 3181
+static PyNumberMethods longdouble_arrtype_as_number;
+
+#line 3181
+static PyNumberMethods cfloat_arrtype_as_number;
+
+#line 3181
+static PyNumberMethods cdouble_arrtype_as_number;
+
+#line 3181
+static PyNumberMethods clongdouble_arrtype_as_number;
+
+
+/*
+ * Index conversion for bool scalars. Emits a deprecation warning first and
+ * returns NULL if the DEPRECATE call reports a negative status; otherwise
+ * returns the stored value as a Python int.
+ */
+static PyObject *
+bool_index(PyObject *a)
+{
+    int warn_status = DEPRECATE(
+            "In future, it will be an error for 'np.bool_' scalars to be "
+            "interpreted as an index");
+
+    if (warn_status < 0) {
+        return NULL;
+    }
+    return PyLong_FromLong(PyArrayScalar_VAL(a, Bool));
+}
+
+/*
+ * Number protocol table for the bool scalar. Only the truth-value slot and
+ * the bitwise operators &, ^, | are overridden here; every other slot is
+ * left zero-initialized by the designated initializer.
+ */
+NPY_NO_EXPORT PyNumberMethods bool_arrtype_as_number = {
+    .nb_bool = (inquiry)bool_arrtype_nonzero,
+    .nb_and = (binaryfunc)bool_arrtype_and,
+    .nb_xor = (binaryfunc)bool_arrtype_xor,
+    .nb_or = (binaryfunc)bool_arrtype_or,
+};
+
+/*
+ * tp_new for the void scalar type.
+ *
+ * Arguments: one required positional-only object and an optional `dtype`
+ * (converted through PyArray_DescrConverter2).
+ *
+ * Two paths:
+ *  - No dtype and an integer-like `obj` (Python int, integer scalar, or 0-d
+ *    integer array): allocate a zero-filled buffer of that many bytes and
+ *    wrap it in a new void scalar that owns the buffer.
+ *  - Otherwise: convert `obj` through PyArray_FromAny with either the given
+ *    void dtype or a fresh NPY_VOID descriptor, and return the result of
+ *    PyArray_Return.
+ *
+ * Returns NULL with an exception set on failure.
+ */
+static PyObject *
+void_arrtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
+{
+    PyObject *obj, *arr;
+    PyArray_Descr *descr = NULL;
+
+    static char *kwnames[] = {"", "dtype", NULL};
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|O&:void", kwnames,
+            &obj, &PyArray_DescrConverter2, &descr)) {
+        return NULL;
+    }
+    /*
+     * For a VOID scalar first see if obj is an integer or long
+     * and create new memory of that size (filled with 0) for the scalar
+     */
+    if (descr == NULL && (
+            PyLong_Check(obj) ||
+            PyArray_IsScalar(obj, Integer) ||
+            (PyArray_Check(obj) &&
+                     PyArray_NDIM((PyArrayObject *)obj)==0 &&
+                     PyArray_ISINTEGER((PyArrayObject *)obj)))) {
+
+        /* Normalize the size argument to a Python int via its nb_int slot */
+        PyObject *length = Py_TYPE(obj)->tp_as_number->nb_int(obj);
+        if (length == NULL) {
+            return NULL;
+        }
+
+        PyObject *ret;
+        char *destptr;
+        npy_ulonglong memu = PyLong_AsUnsignedLongLong(length);
+        Py_DECREF(length);
+        /*
+         * Any conversion error (e.g. a negative value) and any size above
+         * NPY_MAX_INT are both reported as the same OverflowError.
+         */
+        if (PyErr_Occurred() || (memu > NPY_MAX_INT)) {
+            PyErr_Clear();
+            PyErr_Format(PyExc_OverflowError,
+                    "size must be non-negative and not greater than %d",
+                    (int) NPY_MAX_INT);
+            return NULL;
+        }
+        /* A requested size of 0 is bumped to 1 byte */
+        if (memu == 0) {
+            memu = 1;
+        }
+        destptr = npy_alloc_cache_zero(memu, 1);
+        if (destptr == NULL) {
+            return PyErr_NoMemory();
+        }
+        ret = type->tp_alloc(type, 0);
+        if (ret == NULL) {
+            /* The buffer is not yet attached to a scalar: free it here */
+            npy_free_cache(destptr, memu);
+            return PyErr_NoMemory();
+        }
+        /* From here on the scalar holds the buffer (OWNDATA flag is set) */
+        ((PyVoidScalarObject *)ret)->obval = destptr;
+        Py_SET_SIZE((PyVoidScalarObject *)ret, (int) memu);
+        ((PyVoidScalarObject *)ret)->flags = NPY_ARRAY_BEHAVED |
+                                             NPY_ARRAY_OWNDATA;
+        ((PyVoidScalarObject *)ret)->base = NULL;
+        ((PyVoidScalarObject *)ret)->descr =
+            PyArray_DescrNewFromType(NPY_VOID);
+        if (((PyVoidScalarObject *)ret)->descr == NULL) {
+            Py_DECREF(ret);
+            return NULL;
+        }
+        /* Record the buffer size as the element size of the new descriptor */
+        ((PyVoidScalarObject *)ret)->descr->elsize = (int) memu;
+        return ret;
+    }
+
+    if (descr == NULL) {
+        /* Use the "size-less" void dtype to discover the size. */
+        descr = PyArray_DescrNewFromType(NPY_VOID);
+        if (descr == NULL) {
+            return NULL;
+        }
+    }
+    else if (descr->type_num != NPY_VOID || PyDataType_HASSUBARRAY(descr)) {
+        /* we reject subarrays, since subarray scalars do not exist. */
+        PyErr_Format(PyExc_TypeError,
+                "void: descr must be a `void` dtype that is not "
+                "a subarray dtype (structured or unstructured). "
+                "Got '%.100R'.", descr);
+        Py_DECREF(descr);
+        return NULL;
+    }
+
+    /* descr is handed to PyArray_FromAny and not released afterwards here */
+    arr = PyArray_FromAny(obj, descr, 0, 0, NPY_ARRAY_FORCECAST, NULL);
+    return PyArray_Return((PyArrayObject *)arr);
+}
+
+
+/****************  Define Hash functions ********************/
+
+#line 3298
+/* Hash of a Bool scalar: the stored value cast to npy_hash_t. */
+static npy_hash_t
+bool_arrtype_hash(PyObject *obj)
+{
+    npy_hash_t hashed = (npy_hash_t)(PyArrayScalar_VAL(obj, Bool));
+    return hashed;
+}
+
+#line 3298
+/* Hash of a UByte scalar: the stored value cast to npy_hash_t. */
+static npy_hash_t
+ubyte_arrtype_hash(PyObject *obj)
+{
+    npy_hash_t hashed = (npy_hash_t)(PyArrayScalar_VAL(obj, UByte));
+    return hashed;
+}
+
+#line 3298
+/* Hash of a UShort scalar: the stored value cast to npy_hash_t. */
+static npy_hash_t
+ushort_arrtype_hash(PyObject *obj)
+{
+    npy_hash_t hashed = (npy_hash_t)(PyArrayScalar_VAL(obj, UShort));
+    return hashed;
+}
+
+
+#line 3309
+/*
+ * Hash of a Byte scalar: the stored value cast to npy_hash_t, with -1
+ * remapped to -2 so the result is never -1.
+ */
+static npy_hash_t
+byte_arrtype_hash(PyObject *obj)
+{
+    const npy_hash_t raw = (npy_hash_t)(PyArrayScalar_VAL(obj, Byte));
+    return (raw == -1) ? -2 : raw;
+}
+
+#line 3309
+/*
+ * Hash of a Short scalar: the stored value cast to npy_hash_t, with -1
+ * remapped to -2 so the result is never -1.
+ */
+static npy_hash_t
+short_arrtype_hash(PyObject *obj)
+{
+    const npy_hash_t raw = (npy_hash_t)(PyArrayScalar_VAL(obj, Short));
+    return (raw == -1) ? -2 : raw;
+}
+
+#line 3309
+/*
+ * Hash of a UInt scalar: the stored value cast to npy_hash_t, with -1
+ * remapped to -2 so the result is never -1.
+ */
+static npy_hash_t
+uint_arrtype_hash(PyObject *obj)
+{
+    const npy_hash_t raw = (npy_hash_t)(PyArrayScalar_VAL(obj, UInt));
+    return (raw == -1) ? -2 : raw;
+}
+
+
+/*
+ * Hash of a ULong scalar, computed by building a temporary Python int and
+ * hashing that.
+ *
+ * Returns -1 with an exception set if the temporary int cannot be created
+ * or if PyObject_Hash fails.
+ */
+static npy_hash_t
+ulong_arrtype_hash(PyObject *obj)
+{
+    PyObject * l = PyLong_FromUnsignedLong(PyArrayScalar_VAL(obj, ULong));
+    if (l == NULL) {
+        /* Allocation failed; hashing or DECREF-ing NULL would crash. */
+        return -1;
+    }
+    npy_hash_t x = PyObject_Hash(l);
+    Py_DECREF(l);
+    return x;
+}
+
+/*
+ * Hash of an Int scalar: the stored value cast to npy_hash_t, with -1
+ * remapped to -2 so the result is never -1.
+ */
+static npy_hash_t
+int_arrtype_hash(PyObject *obj)
+{
+    const npy_hash_t raw = (npy_hash_t)(PyArrayScalar_VAL(obj, Int));
+    return (raw == -1) ? -2 : raw;
+}
+
+/*
+ * Hash of a Long scalar, computed by building a temporary Python int and
+ * hashing that.
+ *
+ * Returns -1 with an exception set if the temporary int cannot be created
+ * or if PyObject_Hash fails.
+ */
+static npy_hash_t
+long_arrtype_hash(PyObject *obj)
+{
+    PyObject * l = PyLong_FromLong(PyArrayScalar_VAL(obj, Long));
+    if (l == NULL) {
+        /* Allocation failed; hashing or DECREF-ing NULL would crash. */
+        return -1;
+    }
+    npy_hash_t x = PyObject_Hash(l);
+    Py_DECREF(l);
+    return x;
+}
+
+#line 3353
+/*
+ * Hash of a LongLong scalar, computed by building a temporary Python int
+ * and hashing that.
+ *
+ * Returns -1 with an exception set if the temporary int cannot be created
+ * or if PyObject_Hash fails.
+ */
+static inline npy_hash_t
+longlong_arrtype_hash(PyObject *obj)
+{
+    PyObject * l = PyLong_FromLongLong(
+                                 PyArrayScalar_VAL(obj, LongLong));
+    if (l == NULL) {
+        /* Allocation failed; hashing or DECREF-ing NULL would crash. */
+        return -1;
+    }
+    npy_hash_t x = PyObject_Hash(l);
+    Py_DECREF(l);
+    return x;
+}
+
+#line 3353
+/*
+ * Hash of a ULongLong scalar, computed by building a temporary Python int
+ * and hashing that.
+ *
+ * Returns -1 with an exception set if the temporary int cannot be created
+ * or if PyObject_Hash fails.
+ */
+static inline npy_hash_t
+ulonglong_arrtype_hash(PyObject *obj)
+{
+    PyObject * l = PyLong_FromUnsignedLongLong(
+                                 PyArrayScalar_VAL(obj, ULongLong));
+    if (l == NULL) {
+        /* Allocation failed; hashing or DECREF-ing NULL would crash. */
+        return -1;
+    }
+    npy_hash_t x = PyObject_Hash(l);
+    Py_DECREF(l);
+    return x;
+}
+
+
+
+#line 3369
+/*
+ * Hash of a Datetime scalar. Two compile-time variants exist; in both, a
+ * result of -1 is remapped to -2.
+ */
+#if NPY_SIZEOF_HASH_T==NPY_SIZEOF_DATETIME
+/* Variant 1: the hash type is as wide as the value, so cast it directly. */
+static npy_hash_t
+datetime_arrtype_hash(PyObject *obj)
+{
+    npy_hash_t x = (npy_hash_t)(PyArrayScalar_VAL(obj, Datetime));
+    if (x == -1) {
+        x = -2;
+    }
+    return x;
+}
+#elif NPY_SIZEOF_LONGLONG==NPY_SIZEOF_DATETIME
+/*
+ * Variant 2: the value is as wide as npy_longlong. Values up to LONG_MAX
+ * are cast directly; larger ones are split into two `long` words through a
+ * union and combined.
+ */
+static npy_hash_t
+datetime_arrtype_hash(PyObject *obj)
+{
+    npy_hash_t y;
+    npy_longlong x = (PyArrayScalar_VAL(obj, Datetime));
+
+    if ((x <= LONG_MAX)) {
+        y = (npy_hash_t) x;
+    }
+    else {
+        /* Reinterpret the 64-bit value as two long words */
+        union Mask {
+            long hashvals[2];
+            npy_longlong v;
+        } both;
+
+        both.v = x;
+        y = both.hashvals[0] + (1000003)*both.hashvals[1];
+    }
+    if (y == -1) {
+        y = -2;
+    }
+    return y;
+}
+#endif
+
+#line 3369
+/*
+ * Hash of a Timedelta scalar. Two compile-time variants exist; in both, a
+ * result of -1 is remapped to -2.
+ */
+#if NPY_SIZEOF_HASH_T==NPY_SIZEOF_DATETIME
+/* Variant 1: the hash type is as wide as the value, so cast it directly. */
+static npy_hash_t
+timedelta_arrtype_hash(PyObject *obj)
+{
+    npy_hash_t x = (npy_hash_t)(PyArrayScalar_VAL(obj, Timedelta));
+    if (x == -1) {
+        x = -2;
+    }
+    return x;
+}
+#elif NPY_SIZEOF_LONGLONG==NPY_SIZEOF_DATETIME
+/*
+ * Variant 2: the value is as wide as npy_longlong. Values up to LONG_MAX
+ * are cast directly; larger ones are split into two `long` words through a
+ * union and combined.
+ */
+static npy_hash_t
+timedelta_arrtype_hash(PyObject *obj)
+{
+    npy_hash_t y;
+    npy_longlong x = (PyArrayScalar_VAL(obj, Timedelta));
+
+    if ((x <= LONG_MAX)) {
+        y = (npy_hash_t) x;
+    }
+    else {
+        /* Reinterpret the 64-bit value as two long words */
+        union Mask {
+            long hashvals[2];
+            npy_longlong v;
+        } both;
+
+        both.v = x;
+        y = both.hashvals[0] + (1000003)*both.hashvals[1];
+    }
+    if (y == -1) {
+        y = -2;
+    }
+    return y;
+}
+#endif
+
+
+
+
+/* Hashing via a cast to double is the wrong thing to do for longdouble, but... */
+
+#line 3414
+/* Hash of a Float scalar: widen the value to double and use Npy_HashDouble. */
+static npy_hash_t
+float_arrtype_hash(PyObject *obj)
+{
+    double as_double = (double)PyArrayScalar_VAL(obj, Float);
+    return Npy_HashDouble(obj, as_double);
+}
+
+/*
+ * Hash of a CFloat scalar (scheme borrowed from complex_hash).
+ *
+ * The real and imaginary parts are hashed separately as doubles and then
+ * combined as hashreal + 1000003 * hashimag. A combined value of -1 is
+ * remapped to -2. Returns -1 as soon as either part hashes to -1.
+ */
+static npy_hash_t
+cfloat_arrtype_hash(PyObject *obj)
+{
+    npy_hash_t hashreal, hashimag;
+    Py_uhash_t combined;
+    hashreal = Npy_HashDouble(
+            obj, (double)PyArrayScalar_VAL(obj, CFloat).real);
+
+    if (hashreal == -1) {
+        return -1;
+    }
+    hashimag = Npy_HashDouble(
+            obj, (double)PyArrayScalar_VAL(obj, CFloat).imag);
+    if (hashimag == -1) {
+        return -1;
+    }
+    /*
+     * Combine in unsigned arithmetic: the product can exceed the range of
+     * the signed hash type, and signed overflow is undefined behaviour in C.
+     * Unsigned arithmetic wraps and yields the same bit pattern.
+     */
+    combined = (Py_uhash_t)hashreal + 1000003U * (Py_uhash_t)hashimag;
+    if (combined == (Py_uhash_t)-1) {
+        combined = (Py_uhash_t)-2;
+    }
+    return (npy_hash_t)combined;
+}
+
+#line 3414
+/*
+ * Hash of a LongDouble scalar: the value is cast to double first and then
+ * passed to Npy_HashDouble.
+ */
+static npy_hash_t
+longdouble_arrtype_hash(PyObject *obj)
+{
+    double as_double = (double)PyArrayScalar_VAL(obj, LongDouble);
+    return Npy_HashDouble(obj, as_double);
+}
+
+/*
+ * Hash of a CLongDouble scalar (scheme borrowed from complex_hash).
+ *
+ * The real and imaginary parts are cast to double, hashed separately and
+ * then combined as hashreal + 1000003 * hashimag. A combined value of -1 is
+ * remapped to -2. Returns -1 as soon as either part hashes to -1.
+ */
+static npy_hash_t
+clongdouble_arrtype_hash(PyObject *obj)
+{
+    npy_hash_t hashreal, hashimag;
+    Py_uhash_t combined;
+    hashreal = Npy_HashDouble(
+            obj, (double)PyArrayScalar_VAL(obj, CLongDouble).real);
+
+    if (hashreal == -1) {
+        return -1;
+    }
+    hashimag = Npy_HashDouble(
+            obj, (double)PyArrayScalar_VAL(obj, CLongDouble).imag);
+    if (hashimag == -1) {
+        return -1;
+    }
+    /*
+     * Combine in unsigned arithmetic: the product can exceed the range of
+     * the signed hash type, and signed overflow is undefined behaviour in C.
+     * Unsigned arithmetic wraps and yields the same bit pattern.
+     */
+    combined = (Py_uhash_t)hashreal + 1000003U * (Py_uhash_t)hashimag;
+    if (combined == (Py_uhash_t)-1) {
+        combined = (Py_uhash_t)-2;
+    }
+    return (npy_hash_t)combined;
+}
+
+
+/*
+ * Hash of a Half scalar: convert the value with npy_half_to_double and
+ * pass it to Npy_HashDouble.
+ */
+static npy_hash_t
+half_arrtype_hash(PyObject *obj)
+{
+    double widened = npy_half_to_double(PyArrayScalar_VAL(obj, Half));
+    return Npy_HashDouble(obj, widened);
+}
+
+/* Hash of an Object scalar: the hash of the object it stores. */
+static npy_hash_t
+object_arrtype_hash(PyObject *obj)
+{
+    PyObject *wrapped = PyArrayScalar_VAL(obj, Object);
+    return PyObject_Hash(wrapped);
+}
+
+/*
+ * Hash of a void scalar.
+ *
+ * This used to hash the pointer; it now combines the hashes of the items
+ * returned by voidtype_item with a tuplehash-style multiply/xor scheme.
+ *
+ * Writeable void scalars are rejected with TypeError because they are
+ * mutable. Returns -1 with an exception set if fetching or hashing an item
+ * fails; a computed value of -1 is remapped to -2.
+ */
+static npy_hash_t
+void_arrtype_hash(PyObject *obj)
+{
+    /*
+     * Accumulate in unsigned arithmetic: the multiplications are expected
+     * to wrap, and signed overflow is undefined behaviour in C.
+     */
+    Py_uhash_t x, mult;
+    npy_hash_t y;
+    Py_ssize_t len, n;
+    PyVoidScalarObject *p;
+    PyObject *element;
+    mult = 1000003UL;
+    x = 0x345678UL;
+    p = (PyVoidScalarObject *)obj;
+    /* Cannot hash mutable void scalars */
+    if (p->flags & NPY_ARRAY_WRITEABLE) {
+       PyErr_SetString(PyExc_TypeError, "unhashable type: 'writeable void-scalar'");
+       return -1;
+    }
+    len = voidtype_length(p);
+    for (n=0; n < len; n++) {
+        element = voidtype_item(p, n);
+        if (element == NULL) {
+            /* Item lookup failed; hashing or DECREF-ing NULL would crash. */
+            return -1;
+        }
+        y = PyObject_Hash(element);
+        Py_DECREF(element);
+        if (y == -1) {
+            return -1;
+        }
+        x = (x ^ (Py_uhash_t)y) * mult;
+        mult += (Py_uhash_t)(82520L + len + len);
+    }
+    x += 97531UL;
+    if (x == (Py_uhash_t)-1) {
+        x = (Py_uhash_t)-2;
+    }
+    return (npy_hash_t)x;
+}
+
+/*
+ * Attribute lookup for the object scalar: try the wrapped object first and,
+ * if that fails, clear the error and look the attribute up on the scalar
+ * itself.
+ */
+static PyObject *
+object_arrtype_getattro(PyObjectScalarObject *obj, PyObject *attr) {
+    PyObject *found = PyObject_GenericGetAttr(obj->obval, attr);
+
+    if (found == NULL) {
+        /* Not available on the wrapped object: fall back to the scalar. */
+        PyErr_Clear();
+        found = PyObject_GenericGetAttr((PyObject *)obj, attr);
+    }
+    return found;
+}
+
+/*
+ * Attribute assignment for the object scalar: try the wrapped object first
+ * and, if that fails, clear the error and set the attribute on the scalar
+ * itself.
+ */
+static int
+object_arrtype_setattro(PyObjectScalarObject *obj, PyObject *attr, PyObject *val) {
+    int status = PyObject_GenericSetAttr(obj->obval, attr, val);
+
+    if (status < 0) {
+        /* The wrapped object refused: fall back to the scalar. */
+        PyErr_Clear();
+        status = PyObject_GenericSetAttr((PyObject *)obj, attr, val);
+    }
+    return status;
+}
+
+/* Sequence concatenation, forwarded to the wrapped object. */
+static PyObject *
+object_arrtype_concat(PyObjectScalarObject *self, PyObject *other)
+{
+    PyObject *wrapped = self->obval;
+    return PySequence_Concat(wrapped, other);
+}
+
+/* Length query, forwarded to the wrapped object. */
+static Py_ssize_t
+object_arrtype_length(PyObjectScalarObject *self)
+{
+    PyObject *wrapped = self->obval;
+    return PyObject_Length(wrapped);
+}
+
+/* Sequence repetition, forwarded to the wrapped object. */
+static PyObject *
+object_arrtype_repeat(PyObjectScalarObject *self, Py_ssize_t count)
+{
+    PyObject *wrapped = self->obval;
+    return PySequence_Repeat(wrapped, count);
+}
+
+/* Item lookup (obj[key]), forwarded to the wrapped object. */
+static PyObject *
+object_arrtype_subscript(PyObjectScalarObject *self, PyObject *key)
+{
+    PyObject *wrapped = self->obval;
+    return PyObject_GetItem(wrapped, key);
+}
+
+/* Item assignment (obj[key] = value), forwarded to the wrapped object. */
+static int
+object_arrtype_ass_subscript(PyObjectScalarObject *self, PyObject *key,
+                             PyObject *value)
+{
+    PyObject *wrapped = self->obval;
+    return PyObject_SetItem(wrapped, key, value);
+}
+
+/* Membership test (ob in obj), forwarded to the wrapped object. */
+static int
+object_arrtype_contains(PyObjectScalarObject *self, PyObject *ob)
+{
+    PyObject *wrapped = self->obval;
+    return PySequence_Contains(wrapped, ob);
+}
+
+/* In-place concatenation, forwarded to the wrapped object. */
+static PyObject *
+object_arrtype_inplace_concat(PyObjectScalarObject *self, PyObject *o)
+{
+    PyObject *wrapped = self->obval;
+    return PySequence_InPlaceConcat(wrapped, o);
+}
+
+/* In-place repetition, forwarded to the wrapped object. */
+static PyObject *
+object_arrtype_inplace_repeat(PyObjectScalarObject *self, Py_ssize_t count)
+{
+    PyObject *wrapped = self->obval;
+    return PySequence_InPlaceRepeat(wrapped, count);
+}
+
+/*
+ * Sequence protocol table for the object scalar. Slots not listed here are
+ * left zero-initialized by the designated initializer.
+ */
+static PySequenceMethods object_arrtype_as_sequence = {
+    .sq_length = (lenfunc)object_arrtype_length,
+    .sq_concat = (binaryfunc)object_arrtype_concat,
+    .sq_repeat = (ssizeargfunc)object_arrtype_repeat,
+    .sq_contains = (objobjproc)object_arrtype_contains,
+    .sq_inplace_concat = (binaryfunc)object_arrtype_inplace_concat,
+    .sq_inplace_repeat = (ssizeargfunc)object_arrtype_inplace_repeat,
+};
+
+/*
+ * Mapping protocol table for the object scalar: length, item lookup and
+ * item assignment.
+ */
+static PyMappingMethods object_arrtype_as_mapping = {
+    .mp_length = (lenfunc)object_arrtype_length,
+    .mp_subscript = (binaryfunc)object_arrtype_subscript,
+    .mp_ass_subscript = (objobjargproc)object_arrtype_ass_subscript,
+};
+
+/*
+ * Buffer acquisition for the object scalar: delegate to the wrapped
+ * object's bf_getbuffer. Raises TypeError and returns -1 when the wrapped
+ * object's type has no buffer procs or no bf_getbuffer.
+ */
+static int
+object_arrtype_getbuffer(PyObjectScalarObject *self, Py_buffer *view, int flags)
+{
+    PyBufferProcs *procs = Py_TYPE(self->obval)->tp_as_buffer;
+
+    if (procs != NULL && procs->bf_getbuffer != NULL) {
+        return procs->bf_getbuffer(self->obval, view, flags);
+    }
+    PyErr_SetString(PyExc_TypeError,
+                    "expected a readable buffer object");
+    return -1;
+}
+
+/*
+ * Buffer release for the object scalar: delegate to the wrapped object's
+ * bf_releasebuffer when it has one. Sets TypeError when the wrapped
+ * object's type has no buffer procs at all.
+ */
+static void
+object_arrtype_releasebuffer(PyObjectScalarObject *self, Py_buffer *view)
+{
+    PyBufferProcs *procs = Py_TYPE(self->obval)->tp_as_buffer;
+
+    if (procs == NULL) {
+        PyErr_SetString(PyExc_TypeError,
+                        "expected a readable buffer object");
+        return;
+    }
+    if (procs->bf_releasebuffer == NULL) {
+        /* Nothing to release for this type. */
+        return;
+    }
+    procs->bf_releasebuffer(self->obval, view);
+}
+
+/* Buffer protocol table for the object scalar. */
+static PyBufferProcs object_arrtype_as_buffer = {
+    .bf_getbuffer = (getbufferproc)object_arrtype_getbuffer,
+    .bf_releasebuffer = (releasebufferproc)object_arrtype_releasebuffer,
+};
+
+/* Calling an object scalar calls the wrapped object with the same arguments. */
+static PyObject *
+object_arrtype_call(PyObjectScalarObject *obj, PyObject *args, PyObject *kwds)
+{
+    PyObject *wrapped = obj->obval;
+    return PyObject_Call(wrapped, args, kwds);
+}
+
+/*
+ * Type object for the object scalar. It wires in the sequence, mapping,
+ * buffer, call and attribute-access handlers defined for this type, plus a
+ * custom dealloc and alloc. Slots not listed are zero-initialized here.
+ */
+NPY_NO_EXPORT PyTypeObject PyObjectArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_OBJECT_name,
+    .tp_basicsize = sizeof(PyObjectScalarObject),
+    .tp_dealloc = (destructor)object_arrtype_dealloc,
+    .tp_as_sequence = &object_arrtype_as_sequence,
+    .tp_as_mapping = &object_arrtype_as_mapping,
+    .tp_call = (ternaryfunc)object_arrtype_call,
+    .tp_getattro = (getattrofunc)object_arrtype_getattro,
+    .tp_setattro = (setattrofunc)object_arrtype_setattro,
+    .tp_as_buffer = &object_arrtype_as_buffer,
+    .tp_alloc = object_arrtype_alloc,
+};
+
+/*
+ * Subscript handler for generic scalars.
+ *
+ * The scalar is converted to an array with PyArray_FromScalar and the key
+ * is applied to that array through array_subscript.
+ *
+ * Returns NULL with the conversion error set if the scalar cannot be turned
+ * into an array, or NULL with IndexError set if the subscript fails.
+ */
+static PyObject *
+gen_arrtype_subscript(PyObject *self, PyObject *key)
+{
+    /*
+     * Only [...], [...,<???>], [<???>, ...],
+     * is allowed for indexing a scalar
+     *
+     * These return a new N-d array with a copy of
+     * the data where N is the number of None's in <???>.
+     */
+    PyObject *res, *ret;
+
+    res = PyArray_FromScalar(self, NULL);
+    if (res == NULL) {
+        /* Conversion failed; subscripting or DECREF-ing NULL would crash. */
+        return NULL;
+    }
+
+    ret = array_subscript((PyArrayObject *)res, key);
+    Py_DECREF(res);
+    if (ret == NULL) {
+        PyErr_SetString(PyExc_IndexError,
+                        "invalid index to scalar variable.");
+    }
+    return ret;
+}
+
+
+#line 3674
+
+/* Type object for Bool scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyBoolArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_BOOL_name,
+    .tp_basicsize = sizeof(PyBoolScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for Byte scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyByteArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_BYTE_name,
+    .tp_basicsize = sizeof(PyByteScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for Short scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyShortArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_SHORT_name,
+    .tp_basicsize = sizeof(PyShortScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for Int scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyIntArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_INT_name,
+    .tp_basicsize = sizeof(PyIntScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for Long scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyLongArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_LONG_name,
+    .tp_basicsize = sizeof(PyLongScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for LongLong scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyLongLongArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_LONGLONG_name,
+    .tp_basicsize = sizeof(PyLongLongScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for UByte scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyUByteArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_UBYTE_name,
+    .tp_basicsize = sizeof(PyUByteScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for UShort scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyUShortArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_USHORT_name,
+    .tp_basicsize = sizeof(PyUShortScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for UInt scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyUIntArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_UINT_name,
+    .tp_basicsize = sizeof(PyUIntScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for ULong scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyULongArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_ULONG_name,
+    .tp_basicsize = sizeof(PyULongScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for ULongLong scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyULongLongArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_ULONGLONG_name,
+    .tp_basicsize = sizeof(PyULongLongScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for Half scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyHalfArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_HALF_name,
+    .tp_basicsize = sizeof(PyHalfScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for Float scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyFloatArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_FLOAT_name,
+    .tp_basicsize = sizeof(PyFloatScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for Double scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyDoubleArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_DOUBLE_name,
+    .tp_basicsize = sizeof(PyDoubleScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for LongDouble scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyLongDoubleArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_LONGDOUBLE_name,
+    .tp_basicsize = sizeof(PyLongDoubleScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for CFloat scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyCFloatArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_CFLOAT_name,
+    .tp_basicsize = sizeof(PyCFloatScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for CDouble scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyCDoubleArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_CDOUBLE_name,
+    .tp_basicsize = sizeof(PyCDoubleScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for CLongDouble scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyCLongDoubleArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_CLONGDOUBLE_name,
+    .tp_basicsize = sizeof(PyCLongDoubleScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for String scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyStringArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_STRING_name,
+    .tp_basicsize = sizeof(PyStringScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for Unicode scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyUnicodeArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_UNICODE_name,
+    .tp_basicsize = sizeof(PyUnicodeScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for Void scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyVoidArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_VOID_name,
+    .tp_basicsize = sizeof(PyVoidScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for Datetime scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyDatetimeArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_DATETIME_name,
+    .tp_basicsize = sizeof(PyDatetimeScalarObject),
+};
+
+
+#line 3674
+
+/* Type object for Timedelta scalars; only tp_name and tp_basicsize are set here. */
+NPY_NO_EXPORT PyTypeObject PyTimedeltaArrType_Type = {
+    PyVarObject_HEAD_INIT(NULL, 0)
+    .tp_name = "numpy." NPY_TIMEDELTA_name,
+    .tp_basicsize = sizeof(PyTimedeltaScalarObject),
+};
+
+
+
+
+/*
+ * Mapping protocol table for generic scalars: only item lookup is
+ * provided, via gen_arrtype_subscript.
+ */
+static PyMappingMethods gentype_as_mapping = {
+    .mp_subscript = (binaryfunc)gen_arrtype_subscript,
+};
+
+
+/*
+ * This table maps the built-in type numbers to their scalar
+ * type numbers.  Note that signed integers are mapped to INTNEG_SCALAR,
+ * which is different than what PyArray_ScalarKind returns.
+ */
+NPY_NO_EXPORT signed char
+_npy_scalar_kinds_table[NPY_NTYPES];
+
+/*
+ * This table maps a scalar kind (excluding NPY_NOSCALAR)
+ * to the smallest type number of that kind.
+ */
+NPY_NO_EXPORT signed char
+_npy_smallest_type_of_kind_table[NPY_NSCALARKINDS];
+
+/*
+ * This table gives the type of the same kind, but next in the sequence
+ * of sizes.
+ */
+NPY_NO_EXPORT signed char
+_npy_next_larger_type_table[NPY_NTYPES];
+
+/*
+ * This table gives the smallest-size and smallest-kind type to which
+ * the input types may be safely cast, according to _npy_can_cast_safely.
+ */
+NPY_NO_EXPORT signed char
+_npy_type_promotion_table[NPY_NTYPES][NPY_NTYPES];
+
+NPY_NO_EXPORT void
+initialize_casting_tables(void)
+{
+    int i, j;
+    /* Fills the kind, next-larger-type and promotion tables declared above. */
+    _npy_smallest_type_of_kind_table[NPY_BOOL_SCALAR] = NPY_BOOL;
+    _npy_smallest_type_of_kind_table[NPY_INTPOS_SCALAR] = NPY_UBYTE;
+    _npy_smallest_type_of_kind_table[NPY_INTNEG_SCALAR] = NPY_BYTE;
+    _npy_smallest_type_of_kind_table[NPY_FLOAT_SCALAR] = NPY_HALF;
+    _npy_smallest_type_of_kind_table[NPY_COMPLEX_SCALAR] = NPY_CFLOAT;
+    _npy_smallest_type_of_kind_table[NPY_OBJECT_SCALAR] = NPY_OBJECT;
+
+    /* Default for built-in types is object scalar */
+    memset(_npy_scalar_kinds_table, NPY_OBJECT_SCALAR,
+                                        sizeof(_npy_scalar_kinds_table));
+    /* Default for next largest type is -1, signalling no bigger */
+    memset(_npy_next_larger_type_table, -1,
+                                        sizeof(_npy_next_larger_type_table));
+
+    /* Compile-time loop of scalar kinds */
+
+    #line 3755
+
+    _npy_scalar_kinds_table[NPY_BOOL] = NPY_BOOL_SCALAR;
+    _npy_next_larger_type_table[NPY_BOOL] = -1;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_BYTE] = NPY_INTNEG_SCALAR;
+    _npy_next_larger_type_table[NPY_BYTE] = NPY_SHORT;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_UBYTE] = NPY_INTPOS_SCALAR;
+    _npy_next_larger_type_table[NPY_UBYTE] = NPY_USHORT;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_SHORT] = NPY_INTNEG_SCALAR;
+    _npy_next_larger_type_table[NPY_SHORT] = NPY_INT;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_USHORT] = NPY_INTPOS_SCALAR;
+    _npy_next_larger_type_table[NPY_USHORT] = NPY_UINT;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_INT] = NPY_INTNEG_SCALAR;
+    _npy_next_larger_type_table[NPY_INT] = NPY_LONG;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_UINT] = NPY_INTPOS_SCALAR;
+    _npy_next_larger_type_table[NPY_UINT] = NPY_ULONG;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_LONG] = NPY_INTNEG_SCALAR;
+    _npy_next_larger_type_table[NPY_LONG] = NPY_LONGLONG;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_ULONG] = NPY_INTPOS_SCALAR;
+    _npy_next_larger_type_table[NPY_ULONG] = NPY_ULONGLONG;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_LONGLONG] = NPY_INTNEG_SCALAR;
+    _npy_next_larger_type_table[NPY_LONGLONG] = -1;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_ULONGLONG] = NPY_INTPOS_SCALAR;
+    _npy_next_larger_type_table[NPY_ULONGLONG] = -1;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_HALF] = NPY_FLOAT_SCALAR;
+    _npy_next_larger_type_table[NPY_HALF] = NPY_FLOAT;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_FLOAT] = NPY_FLOAT_SCALAR;
+    _npy_next_larger_type_table[NPY_FLOAT] = NPY_DOUBLE;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_DOUBLE] = NPY_FLOAT_SCALAR;
+    _npy_next_larger_type_table[NPY_DOUBLE] = NPY_LONGDOUBLE;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_LONGDOUBLE] = NPY_FLOAT_SCALAR;
+    _npy_next_larger_type_table[NPY_LONGDOUBLE] = -1;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_CFLOAT] = NPY_COMPLEX_SCALAR;
+    _npy_next_larger_type_table[NPY_CFLOAT] = NPY_CDOUBLE;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_CDOUBLE] = NPY_COMPLEX_SCALAR;
+    _npy_next_larger_type_table[NPY_CDOUBLE] = NPY_CLONGDOUBLE;
+
+    
+#line 3755
+
+    _npy_scalar_kinds_table[NPY_CLONGDOUBLE] = NPY_COMPLEX_SCALAR;
+    _npy_next_larger_type_table[NPY_CLONGDOUBLE] = -1;
+
+    
+
+#undef _TO_NUM
+#undef _TO_BSIZE
+
+/**end repeat1**/
+
+#undef _FROM_NUM
+#undef _FROM_BSIZE
+
+/**end repeat**/
+
+    /*
+     * Now that the _can_cast_safely table is finished, we can
+     * use it to build the _type_promotion table
+     */
+    for (i = 0; i < NPY_NTYPES; ++i) {
+        _npy_type_promotion_table[i][i] = i;
+        /* Don't let number promote to string/unicode/void/datetime/timedelta */
+        if (i == NPY_STRING || i == NPY_UNICODE || i == NPY_VOID ||
+                i == NPY_DATETIME || i == NPY_TIMEDELTA) {
+            /* Promoting these types requires examining their contents */
+            _npy_type_promotion_table[i][i] = -1;
+            for (j = i + 1; j < NPY_NTYPES; ++j) {
+                _npy_type_promotion_table[i][j] = -1;
+                _npy_type_promotion_table[j][i] = -1;
+            }
+            /* Except they can convert to OBJECT */
+            _npy_type_promotion_table[i][NPY_OBJECT] = NPY_OBJECT;
+            _npy_type_promotion_table[NPY_OBJECT][i] = NPY_OBJECT;
+        }
+        else {
+            for (j = i + 1; j < NPY_NTYPES; ++j) {
+                /* Don't let number promote to string/unicode/void */
+                if (j == NPY_STRING || j == NPY_UNICODE || j == NPY_VOID) {
+                    _npy_type_promotion_table[i][j] = -1;
+                    _npy_type_promotion_table[j][i] = -1;
+                }
+                else if (_npy_can_cast_safely_table[i][j]) {
+                    _npy_type_promotion_table[i][j] = j;
+                    _npy_type_promotion_table[j][i] = j;
+                }
+                else if (_npy_can_cast_safely_table[j][i]) {
+                    _npy_type_promotion_table[i][j] = i;
+                    _npy_type_promotion_table[j][i] = i;
+                }
+                else {
+                    int k, iskind, jskind, skind;
+                    iskind = _npy_scalar_kinds_table[i];
+                    jskind = _npy_scalar_kinds_table[j];
+                    /* If there's no kind (void/string/etc) */
+                    if (iskind == NPY_NOSCALAR || jskind == NPY_NOSCALAR) {
+                        k = -1;
+                    }
+                    else {
+                        /* Start with the type of larger kind */
+                        if (iskind > jskind) {
+                            skind = iskind;
+                            k = i;
+                        }
+                        else {
+                            skind = jskind;
+                            k = j;
+                        }
+                        for (;;) {
+                            /* Try the next larger type of this kind */
+                            k = _npy_next_larger_type_table[k];
+
+                            /* If there is no larger, try a larger kind */
+                            if (k < 0) {
+                                ++skind;
+                                /* Use -1 to signal no promoted type found */
+                                if (skind < NPY_NSCALARKINDS) {
+                                    k = _npy_smallest_type_of_kind_table[skind];
+                                }
+                                else {
+                                    k = -1;
+                                    break;
+                                }
+                            }
+                            /* Done once both inputs cast safely to k */
+                            if (_npy_can_cast_safely_table[i][k] &&
+                                            _npy_can_cast_safely_table[j][k]) {
+                                break;
+                            }
+                        }
+                    }
+                    _npy_type_promotion_table[i][j] = k;
+                    _npy_type_promotion_table[j][i] = k;
+                }
+            }
+        }
+    }
+}
+
+static PyNumberMethods longdoubletype_as_number;   /* longdouble slots */
+static PyNumberMethods clongdoubletype_as_number;  /* clongdouble slots */
+static void init_basetypes(void);  /* defined below; sorts typeobjects[] */
+
+/* Sorts typeobjects[] and fills in tp_* slots of the scalar type objects. */
+NPY_NO_EXPORT void
+initialize_numeric_types(void)
+{
+    init_basetypes();
+    PyGenericArrType_Type.tp_dealloc = (destructor)gentype_dealloc;
+    PyGenericArrType_Type.tp_as_number = &gentype_as_number;
+    PyGenericArrType_Type.tp_as_mapping = &gentype_as_mapping;
+    PyGenericArrType_Type.tp_flags = BASEFLAGS;
+    PyGenericArrType_Type.tp_methods = gentype_methods;
+    PyGenericArrType_Type.tp_getset = gentype_getsets;
+    PyGenericArrType_Type.tp_new = NULL;
+    PyGenericArrType_Type.tp_alloc = gentype_alloc;
+    PyGenericArrType_Type.tp_free = (freefunc)gentype_free;
+    PyGenericArrType_Type.tp_richcompare = gentype_richcompare;
+    PyGenericArrType_Type.tp_as_buffer = &gentype_arrtype_as_buffer;
+
+    PyBoolArrType_Type.tp_as_number = &bool_arrtype_as_number;
+    /*
+     * need to add dummy versions with filled-in nb_index
+     * in-order for PyType_Ready to fill in .__index__() method
+     * also fill array_type_as_number struct with reasonable defaults
+     */
+
+    #line 3888
+    byte_arrtype_as_number = gentype_as_number;
+    PyByteArrType_Type.tp_as_number = &byte_arrtype_as_number;
+    PyByteArrType_Type.tp_as_number->nb_index = (unaryfunc)byte_index;
+
+    
+#line 3888
+    short_arrtype_as_number = gentype_as_number;
+    PyShortArrType_Type.tp_as_number = &short_arrtype_as_number;
+    PyShortArrType_Type.tp_as_number->nb_index = (unaryfunc)short_index;
+
+    
+#line 3888
+    int_arrtype_as_number = gentype_as_number;
+    PyIntArrType_Type.tp_as_number = &int_arrtype_as_number;
+    PyIntArrType_Type.tp_as_number->nb_index = (unaryfunc)int_index;
+
+    
+#line 3888
+    long_arrtype_as_number = gentype_as_number;
+    PyLongArrType_Type.tp_as_number = &long_arrtype_as_number;
+    PyLongArrType_Type.tp_as_number->nb_index = (unaryfunc)long_index;
+
+    
+#line 3888
+    longlong_arrtype_as_number = gentype_as_number;
+    PyLongLongArrType_Type.tp_as_number = &longlong_arrtype_as_number;
+    PyLongLongArrType_Type.tp_as_number->nb_index = (unaryfunc)longlong_index;
+
+    
+#line 3888
+    ubyte_arrtype_as_number = gentype_as_number;
+    PyUByteArrType_Type.tp_as_number = &ubyte_arrtype_as_number;
+    PyUByteArrType_Type.tp_as_number->nb_index = (unaryfunc)ubyte_index;
+
+    
+#line 3888
+    ushort_arrtype_as_number = gentype_as_number;
+    PyUShortArrType_Type.tp_as_number = &ushort_arrtype_as_number;
+    PyUShortArrType_Type.tp_as_number->nb_index = (unaryfunc)ushort_index;
+
+    
+#line 3888
+    uint_arrtype_as_number = gentype_as_number;
+    PyUIntArrType_Type.tp_as_number = &uint_arrtype_as_number;
+    PyUIntArrType_Type.tp_as_number->nb_index = (unaryfunc)uint_index;
+
+    
+#line 3888
+    ulong_arrtype_as_number = gentype_as_number;
+    PyULongArrType_Type.tp_as_number = &ulong_arrtype_as_number;
+    PyULongArrType_Type.tp_as_number->nb_index = (unaryfunc)ulong_index;
+
+    
+#line 3888
+    ulonglong_arrtype_as_number = gentype_as_number;
+    PyULongLongArrType_Type.tp_as_number = &ulonglong_arrtype_as_number;
+    PyULongLongArrType_Type.tp_as_number->nb_index = (unaryfunc)ulonglong_index;
+
+    
+    /* Floating and complex types copy the generic slots but get no nb_index */
+    #line 3900
+    half_arrtype_as_number = gentype_as_number;
+    PyHalfArrType_Type.tp_as_number = &half_arrtype_as_number;
+
+    
+#line 3900
+    float_arrtype_as_number = gentype_as_number;
+    PyFloatArrType_Type.tp_as_number = &float_arrtype_as_number;
+
+    
+#line 3900
+    double_arrtype_as_number = gentype_as_number;
+    PyDoubleArrType_Type.tp_as_number = &double_arrtype_as_number;
+
+    
+#line 3900
+    longdouble_arrtype_as_number = gentype_as_number;
+    PyLongDoubleArrType_Type.tp_as_number = &longdouble_arrtype_as_number;
+
+    
+#line 3900
+    cfloat_arrtype_as_number = gentype_as_number;
+    PyCFloatArrType_Type.tp_as_number = &cfloat_arrtype_as_number;
+
+    
+#line 3900
+    cdouble_arrtype_as_number = gentype_as_number;
+    PyCDoubleArrType_Type.tp_as_number = &cdouble_arrtype_as_number;
+
+    
+#line 3900
+    clongdouble_arrtype_as_number = gentype_as_number;
+    PyCLongDoubleArrType_Type.tp_as_number = &clongdouble_arrtype_as_number;
+
+    
+
+    PyBoolArrType_Type.tp_as_number->nb_index = (unaryfunc)bool_index;
+
+    PyStringArrType_Type.tp_alloc = NULL;
+    PyStringArrType_Type.tp_free = NULL;
+
+    PyStringArrType_Type.tp_repr = stringtype_repr;
+    PyStringArrType_Type.tp_str = stringtype_str;
+
+    PyUnicodeArrType_Type.tp_repr = unicodetype_repr;
+    PyUnicodeArrType_Type.tp_str = unicodetype_str;
+
+    PyVoidArrType_Type.tp_methods = voidtype_methods;
+    PyVoidArrType_Type.tp_getset = voidtype_getsets;
+    PyVoidArrType_Type.tp_as_mapping = &voidtype_as_mapping;
+    PyVoidArrType_Type.tp_as_sequence = &voidtype_as_sequence;
+    PyVoidArrType_Type.tp_repr = voidtype_repr;
+    PyVoidArrType_Type.tp_str = voidtype_str;
+
+    PyIntegerArrType_Type.tp_getset = inttype_getsets;
+
+    PyNumberArrType_Type.tp_methods = numbertype_methods;
+    /* BASEFLAGS for the Number, Integer, ..., Flexible and Character types */
+    #line 3931
+
+    PyNumberArrType_Type.tp_flags = BASEFLAGS;
+
+    
+#line 3931
+
+    PyIntegerArrType_Type.tp_flags = BASEFLAGS;
+
+    
+#line 3931
+
+    PySignedIntegerArrType_Type.tp_flags = BASEFLAGS;
+
+    
+#line 3931
+
+    PyUnsignedIntegerArrType_Type.tp_flags = BASEFLAGS;
+
+    
+#line 3931
+
+    PyInexactArrType_Type.tp_flags = BASEFLAGS;
+
+    
+#line 3931
+
+    PyFloatingArrType_Type.tp_flags = BASEFLAGS;
+
+    
+#line 3931
+
+    PyComplexFloatingArrType_Type.tp_flags = BASEFLAGS;
+
+    
+#line 3931
+
+    PyFlexibleArrType_Type.tp_flags = BASEFLAGS;
+
+    
+#line 3931
+
+    PyCharacterArrType_Type.tp_flags = BASEFLAGS;
+
+    
+    /* Per type: flags, tp_new, richcompare; tp_as_buffer unless String */
+    #line 3946
+
+    PyBoolArrType_Type.tp_flags = BASEFLAGS;
+    PyBoolArrType_Type.tp_new = bool_arrtype_new;
+    PyBoolArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_Bool  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyBoolArrType_Type.tp_as_buffer = &bool_arrtype_as_buffer;
+#endif
+#undef _IS_Bool
+
+    
+#line 3946
+
+    PyByteArrType_Type.tp_flags = BASEFLAGS;
+    PyByteArrType_Type.tp_new = byte_arrtype_new;
+    PyByteArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_Byte  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyByteArrType_Type.tp_as_buffer = &byte_arrtype_as_buffer;
+#endif
+#undef _IS_Byte
+
+    
+#line 3946
+
+    PyShortArrType_Type.tp_flags = BASEFLAGS;
+    PyShortArrType_Type.tp_new = short_arrtype_new;
+    PyShortArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_Short  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyShortArrType_Type.tp_as_buffer = &short_arrtype_as_buffer;
+#endif
+#undef _IS_Short
+
+    
+#line 3946
+
+    PyIntArrType_Type.tp_flags = BASEFLAGS;
+    PyIntArrType_Type.tp_new = int_arrtype_new;
+    PyIntArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_Int  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyIntArrType_Type.tp_as_buffer = &int_arrtype_as_buffer;
+#endif
+#undef _IS_Int
+
+    
+#line 3946
+
+    PyLongArrType_Type.tp_flags = BASEFLAGS;
+    PyLongArrType_Type.tp_new = long_arrtype_new;
+    PyLongArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_Long  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyLongArrType_Type.tp_as_buffer = &long_arrtype_as_buffer;
+#endif
+#undef _IS_Long
+
+    
+#line 3946
+
+    PyLongLongArrType_Type.tp_flags = BASEFLAGS;
+    PyLongLongArrType_Type.tp_new = longlong_arrtype_new;
+    PyLongLongArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_LongLong  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyLongLongArrType_Type.tp_as_buffer = &longlong_arrtype_as_buffer;
+#endif
+#undef _IS_LongLong
+
+    
+#line 3946
+
+    PyUByteArrType_Type.tp_flags = BASEFLAGS;
+    PyUByteArrType_Type.tp_new = ubyte_arrtype_new;
+    PyUByteArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_UByte  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyUByteArrType_Type.tp_as_buffer = &ubyte_arrtype_as_buffer;
+#endif
+#undef _IS_UByte
+
+    
+#line 3946
+
+    PyUShortArrType_Type.tp_flags = BASEFLAGS;
+    PyUShortArrType_Type.tp_new = ushort_arrtype_new;
+    PyUShortArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_UShort  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyUShortArrType_Type.tp_as_buffer = &ushort_arrtype_as_buffer;
+#endif
+#undef _IS_UShort
+
+    
+#line 3946
+
+    PyUIntArrType_Type.tp_flags = BASEFLAGS;
+    PyUIntArrType_Type.tp_new = uint_arrtype_new;
+    PyUIntArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_UInt  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyUIntArrType_Type.tp_as_buffer = &uint_arrtype_as_buffer;
+#endif
+#undef _IS_UInt
+
+    
+#line 3946
+
+    PyULongArrType_Type.tp_flags = BASEFLAGS;
+    PyULongArrType_Type.tp_new = ulong_arrtype_new;
+    PyULongArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_ULong  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyULongArrType_Type.tp_as_buffer = &ulong_arrtype_as_buffer;
+#endif
+#undef _IS_ULong
+
+    
+#line 3946
+
+    PyULongLongArrType_Type.tp_flags = BASEFLAGS;
+    PyULongLongArrType_Type.tp_new = ulonglong_arrtype_new;
+    PyULongLongArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_ULongLong  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyULongLongArrType_Type.tp_as_buffer = &ulonglong_arrtype_as_buffer;
+#endif
+#undef _IS_ULongLong
+
+    
+#line 3946
+
+    PyHalfArrType_Type.tp_flags = BASEFLAGS;
+    PyHalfArrType_Type.tp_new = half_arrtype_new;
+    PyHalfArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_Half  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyHalfArrType_Type.tp_as_buffer = &half_arrtype_as_buffer;
+#endif
+#undef _IS_Half
+
+    
+#line 3946
+
+    PyFloatArrType_Type.tp_flags = BASEFLAGS;
+    PyFloatArrType_Type.tp_new = float_arrtype_new;
+    PyFloatArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_Float  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyFloatArrType_Type.tp_as_buffer = &float_arrtype_as_buffer;
+#endif
+#undef _IS_Float
+
+    
+#line 3946
+
+    PyDoubleArrType_Type.tp_flags = BASEFLAGS;
+    PyDoubleArrType_Type.tp_new = double_arrtype_new;
+    PyDoubleArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_Double  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyDoubleArrType_Type.tp_as_buffer = &double_arrtype_as_buffer;
+#endif
+#undef _IS_Double
+
+    
+#line 3946
+
+    PyLongDoubleArrType_Type.tp_flags = BASEFLAGS;
+    PyLongDoubleArrType_Type.tp_new = longdouble_arrtype_new;
+    PyLongDoubleArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_LongDouble  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyLongDoubleArrType_Type.tp_as_buffer = &longdouble_arrtype_as_buffer;
+#endif
+#undef _IS_LongDouble
+
+    
+#line 3946
+
+    PyCFloatArrType_Type.tp_flags = BASEFLAGS;
+    PyCFloatArrType_Type.tp_new = cfloat_arrtype_new;
+    PyCFloatArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_CFloat  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyCFloatArrType_Type.tp_as_buffer = &cfloat_arrtype_as_buffer;
+#endif
+#undef _IS_CFloat
+
+    
+#line 3946
+
+    PyCDoubleArrType_Type.tp_flags = BASEFLAGS;
+    PyCDoubleArrType_Type.tp_new = cdouble_arrtype_new;
+    PyCDoubleArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_CDouble  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyCDoubleArrType_Type.tp_as_buffer = &cdouble_arrtype_as_buffer;
+#endif
+#undef _IS_CDouble
+
+    
+#line 3946
+
+    PyCLongDoubleArrType_Type.tp_flags = BASEFLAGS;
+    PyCLongDoubleArrType_Type.tp_new = clongdouble_arrtype_new;
+    PyCLongDoubleArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_CLongDouble  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyCLongDoubleArrType_Type.tp_as_buffer = &clongdouble_arrtype_as_buffer;
+#endif
+#undef _IS_CLongDouble
+
+    
+#line 3946
+
+    PyStringArrType_Type.tp_flags = BASEFLAGS;
+    PyStringArrType_Type.tp_new = string_arrtype_new;
+    PyStringArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_String  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyStringArrType_Type.tp_as_buffer = &string_arrtype_as_buffer;
+#endif
+#undef _IS_String
+
+    
+#line 3946
+
+    PyUnicodeArrType_Type.tp_flags = BASEFLAGS;
+    PyUnicodeArrType_Type.tp_new = unicode_arrtype_new;
+    PyUnicodeArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_Unicode  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyUnicodeArrType_Type.tp_as_buffer = &unicode_arrtype_as_buffer;
+#endif
+#undef _IS_Unicode
+
+    
+#line 3946
+
+    PyVoidArrType_Type.tp_flags = BASEFLAGS;
+    PyVoidArrType_Type.tp_new = void_arrtype_new;
+    PyVoidArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_Void  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyVoidArrType_Type.tp_as_buffer = &void_arrtype_as_buffer;
+#endif
+#undef _IS_Void
+
+    
+#line 3946
+
+    PyObjectArrType_Type.tp_flags = BASEFLAGS;
+    PyObjectArrType_Type.tp_new = object_arrtype_new;
+    PyObjectArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_Object  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyObjectArrType_Type.tp_as_buffer = &object_arrtype_as_buffer;
+#endif
+#undef _IS_Object
+
+    
+#line 3946
+
+    PyDatetimeArrType_Type.tp_flags = BASEFLAGS;
+    PyDatetimeArrType_Type.tp_new = datetime_arrtype_new;
+    PyDatetimeArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_Datetime  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyDatetimeArrType_Type.tp_as_buffer = &datetime_arrtype_as_buffer;
+#endif
+#undef _IS_Datetime
+
+    
+#line 3946
+
+    PyTimedeltaArrType_Type.tp_flags = BASEFLAGS;
+    PyTimedeltaArrType_Type.tp_new = timedelta_arrtype_new;
+    PyTimedeltaArrType_Type.tp_richcompare = gentype_richcompare;
+
+#define _IS_Timedelta  /* inherit string buffer */
+#if !defined(_IS_String)
+    PyTimedeltaArrType_Type.tp_as_buffer = &timedelta_arrtype_as_buffer;
+#endif
+#undef _IS_Timedelta
+
+    
+
+    PyUnicodeArrType_Type.tp_dealloc = unicode_arrtype_dealloc;
+    /* Per-type hash functions */
+    #line 3969
+
+    PyBoolArrType_Type.tp_hash = bool_arrtype_hash;
+
+    
+#line 3969
+
+    PyByteArrType_Type.tp_hash = byte_arrtype_hash;
+
+    
+#line 3969
+
+    PyShortArrType_Type.tp_hash = short_arrtype_hash;
+
+    
+#line 3969
+
+    PyUByteArrType_Type.tp_hash = ubyte_arrtype_hash;
+
+    
+#line 3969
+
+    PyUShortArrType_Type.tp_hash = ushort_arrtype_hash;
+
+    
+#line 3969
+
+    PyUIntArrType_Type.tp_hash = uint_arrtype_hash;
+
+    
+#line 3969
+
+    PyULongArrType_Type.tp_hash = ulong_arrtype_hash;
+
+    
+#line 3969
+
+    PyULongLongArrType_Type.tp_hash = ulonglong_arrtype_hash;
+
+    
+#line 3969
+
+    PyHalfArrType_Type.tp_hash = half_arrtype_hash;
+
+    
+#line 3969
+
+    PyFloatArrType_Type.tp_hash = float_arrtype_hash;
+
+    
+#line 3969
+
+    PyLongDoubleArrType_Type.tp_hash = longdouble_arrtype_hash;
+
+    
+#line 3969
+
+    PyCFloatArrType_Type.tp_hash = cfloat_arrtype_hash;
+
+    
+#line 3969
+
+    PyCLongDoubleArrType_Type.tp_hash = clongdouble_arrtype_hash;
+
+    
+#line 3969
+
+    PyVoidArrType_Type.tp_hash = void_arrtype_hash;
+
+    
+#line 3969
+
+    PyObjectArrType_Type.tp_hash = object_arrtype_hash;
+
+    
+#line 3969
+
+    PyDatetimeArrType_Type.tp_hash = datetime_arrtype_hash;
+
+    
+#line 3969
+
+    PyTimedeltaArrType_Type.tp_hash = timedelta_arrtype_hash;
+
+    
+    /* Per-type method tables */
+    #line 3978
+
+    PyCFloatArrType_Type.tp_methods = cfloattype_methods;
+
+    
+#line 3978
+
+    PyCLongDoubleArrType_Type.tp_methods = clongdoubletype_methods;
+
+    
+#line 3978
+
+    PyFloatingArrType_Type.tp_methods = floatingtype_methods;
+
+    
+#line 3978
+
+    PyIntegerArrType_Type.tp_methods = integertype_methods;
+
+    
+#line 3978
+
+    PyComplexFloatingArrType_Type.tp_methods = complexfloatingtype_methods;
+
+    
+
+    #line 3989
+
+    PyByteArrType_Type.tp_methods = bytetype_methods;
+
+    
+#line 3989
+
+    PyShortArrType_Type.tp_methods = shorttype_methods;
+
+    
+#line 3989
+
+    PyIntArrType_Type.tp_methods = inttype_methods;
+
+    
+#line 3989
+
+    PyLongArrType_Type.tp_methods = longtype_methods;
+
+    
+#line 3989
+
+    PyLongLongArrType_Type.tp_methods = longlongtype_methods;
+
+    
+#line 3989
+
+    PyUByteArrType_Type.tp_methods = ubytetype_methods;
+
+    
+#line 3989
+
+    PyUShortArrType_Type.tp_methods = ushorttype_methods;
+
+    
+#line 3989
+
+    PyUIntArrType_Type.tp_methods = uinttype_methods;
+
+    
+#line 3989
+
+    PyULongArrType_Type.tp_methods = ulongtype_methods;
+
+    
+#line 3989
+
+    PyULongLongArrType_Type.tp_methods = ulonglongtype_methods;
+
+    
+
+    #line 3998
+
+    PyHalfArrType_Type.tp_methods = halftype_methods;
+
+    
+#line 3998
+
+    PyFloatArrType_Type.tp_methods = floattype_methods;
+
+    
+#line 3998
+
+    PyDoubleArrType_Type.tp_methods = doubletype_methods;
+
+    
+#line 3998
+
+    PyLongDoubleArrType_Type.tp_methods = longdoubletype_methods;
+
+    
+    /* Repeats the integer tp_methods above; adds Timedelta and CDouble */
+    #line 4009
+
+    PyByteArrType_Type.tp_methods = bytetype_methods;
+
+    
+#line 4009
+
+    PyShortArrType_Type.tp_methods = shorttype_methods;
+
+    
+#line 4009
+
+    PyIntArrType_Type.tp_methods = inttype_methods;
+
+    
+#line 4009
+
+    PyLongArrType_Type.tp_methods = longtype_methods;
+
+    
+#line 4009
+
+    PyLongLongArrType_Type.tp_methods = longlongtype_methods;
+
+    
+#line 4009
+
+    PyUByteArrType_Type.tp_methods = ubytetype_methods;
+
+    
+#line 4009
+
+    PyUShortArrType_Type.tp_methods = ushorttype_methods;
+
+    
+#line 4009
+
+    PyUIntArrType_Type.tp_methods = uinttype_methods;
+
+    
+#line 4009
+
+    PyULongArrType_Type.tp_methods = ulongtype_methods;
+
+    
+#line 4009
+
+    PyULongLongArrType_Type.tp_methods = ulonglongtype_methods;
+
+    
+#line 4009
+
+    PyTimedeltaArrType_Type.tp_methods = timedeltatype_methods;
+
+    
+#line 4009
+
+    PyCDoubleArrType_Type.tp_methods = cdoubletype_methods;
+
+    
+
+    /* We won't be inheriting from Python Int type. */
+    PyIntArrType_Type.tp_hash = int_arrtype_hash;
+
+    /* We won't be inheriting from Python Int type. */
+    PyLongArrType_Type.tp_hash = long_arrtype_hash;
+
+    /* We won't be inheriting from Python Int type. */
+    PyLongLongArrType_Type.tp_hash = longlong_arrtype_hash;
+    /* tp_repr first, then the same set of types again for tp_str */
+    #line 4026
+
+    PyHalfArrType_Type.tp_repr = halftype_repr;
+
+    PyFloatArrType_Type.tp_repr = floattype_repr;
+    PyCFloatArrType_Type.tp_repr = cfloattype_repr;
+
+    PyDoubleArrType_Type.tp_repr = doubletype_repr;
+    PyCDoubleArrType_Type.tp_repr  = cdoubletype_repr;
+
+    PyDatetimeArrType_Type.tp_repr = datetimetype_repr;
+    PyTimedeltaArrType_Type.tp_repr = timedeltatype_repr;
+
+    
+#line 4026
+
+    PyHalfArrType_Type.tp_str = halftype_str;
+
+    PyFloatArrType_Type.tp_str = floattype_str;
+    PyCFloatArrType_Type.tp_str = cfloattype_str;
+
+    PyDoubleArrType_Type.tp_str = doubletype_str;
+    PyCDoubleArrType_Type.tp_str  = cdoubletype_str;
+
+    PyDatetimeArrType_Type.tp_str = datetimetype_str;
+    PyTimedeltaArrType_Type.tp_str = timedeltatype_str;
+
+    
+
+
+    #line 4045
+
+    /* both str/repr use genint_type_str to avoid trailing "L" of longs */
+    PyBoolArrType_Type.tp_str = genint_type_str;
+    PyBoolArrType_Type.tp_repr = genint_type_str;
+
+    
+#line 4045
+
+    /* both str/repr use genint_type_str to avoid trailing "L" of longs */
+    PyByteArrType_Type.tp_str = genint_type_str;
+    PyByteArrType_Type.tp_repr = genint_type_str;
+
+    
+#line 4045
+
+    /* both str/repr use genint_type_str to avoid trailing "L" of longs */
+    PyUByteArrType_Type.tp_str = genint_type_str;
+    PyUByteArrType_Type.tp_repr = genint_type_str;
+
+    
+#line 4045
+
+    /* both str/repr use genint_type_str to avoid trailing "L" of longs */
+    PyShortArrType_Type.tp_str = genint_type_str;
+    PyShortArrType_Type.tp_repr = genint_type_str;
+
+    
+#line 4045
+
+    /* both str/repr use genint_type_str to avoid trailing "L" of longs */
+    PyUShortArrType_Type.tp_str = genint_type_str;
+    PyUShortArrType_Type.tp_repr = genint_type_str;
+
+    
+#line 4045
+
+    /* both str/repr use genint_type_str to avoid trailing "L" of longs */
+    PyIntArrType_Type.tp_str = genint_type_str;
+    PyIntArrType_Type.tp_repr = genint_type_str;
+
+    
+#line 4045
+
+    /* both str/repr use genint_type_str to avoid trailing "L" of longs */
+    PyUIntArrType_Type.tp_str = genint_type_str;
+    PyUIntArrType_Type.tp_repr = genint_type_str;
+
+    
+#line 4045
+
+    /* both str/repr use genint_type_str to avoid trailing "L" of longs */
+    PyLongArrType_Type.tp_str = genint_type_str;
+    PyLongArrType_Type.tp_repr = genint_type_str;
+
+    
+#line 4045
+
+    /* both str/repr use genint_type_str to avoid trailing "L" of longs */
+    PyULongArrType_Type.tp_str = genint_type_str;
+    PyULongArrType_Type.tp_repr = genint_type_str;
+
+    
+#line 4045
+
+    /* both str/repr use genint_type_str to avoid trailing "L" of longs */
+    PyLongLongArrType_Type.tp_str = genint_type_str;
+    PyLongLongArrType_Type.tp_repr = genint_type_str;
+
+    
+#line 4045
+
+    /* both str/repr use genint_type_str to avoid trailing "L" of longs */
+    PyULongLongArrType_Type.tp_str = genint_type_str;
+    PyULongLongArrType_Type.tp_repr = genint_type_str;
+
+    
+
+
+
+    #line 4058
+
+    /*
+     * These need to be coded specially because longdouble/clongdouble getitem
+     * does not return a normal Python type
+     */
+    longdoubletype_as_number.nb_float = longdoubletype_float;
+    longdoubletype_as_number.nb_int  = longdoubletype_long;
+    /* This replaces the tp_as_number assigned to the type earlier on */
+    PyLongDoubleArrType_Type.tp_as_number = &longdoubletype_as_number;
+    PyLongDoubleArrType_Type.tp_repr = longdoubletype_repr;
+    PyLongDoubleArrType_Type.tp_str = longdoubletype_str;
+
+    
+#line 4058
+
+    /*
+     * These need to be coded specially because longdouble/clongdouble getitem
+     * does not return a normal Python type
+     */
+    clongdoubletype_as_number.nb_float = clongdoubletype_float;
+    clongdoubletype_as_number.nb_int  = clongdoubletype_long;
+    /* This replaces the tp_as_number assigned to the type earlier on */
+    PyCLongDoubleArrType_Type.tp_as_number = &clongdoubletype_as_number;
+    PyCLongDoubleArrType_Type.tp_repr = clongdoubletype_repr;
+    PyCLongDoubleArrType_Type.tp_str = clongdoubletype_str;
+
+    
+
+    PyStringArrType_Type.tp_itemsize = sizeof(char);
+    PyVoidArrType_Type.tp_dealloc = (destructor) void_dealloc;
+
+    PyArrayIter_Type.tp_iter = PyObject_SelfIter;
+    PyArrayMapIter_Type.tp_iter = PyObject_SelfIter;
+}
+
+typedef struct {
+    PyTypeObject * type;    /* scalar type object */
+    int typenum;            /* type number paired with it in typeobjects[] */
+} scalar_type;
+
+static scalar_type typeobjects[] = {  /* init_basetypes() sorts by address */
+    {&PyBoolArrType_Type, NPY_BOOL},
+    {&PyByteArrType_Type, NPY_BYTE},
+    {&PyUByteArrType_Type, NPY_UBYTE},
+    {&PyShortArrType_Type, NPY_SHORT},
+    {&PyUShortArrType_Type, NPY_USHORT},
+    {&PyIntArrType_Type, NPY_INT},
+    {&PyUIntArrType_Type, NPY_UINT},
+    {&PyLongArrType_Type, NPY_LONG},
+    {&PyULongArrType_Type, NPY_ULONG},
+    {&PyLongLongArrType_Type, NPY_LONGLONG},
+    {&PyULongLongArrType_Type, NPY_ULONGLONG},
+    {&PyFloatArrType_Type, NPY_FLOAT},
+    {&PyDoubleArrType_Type, NPY_DOUBLE},
+    {&PyLongDoubleArrType_Type, NPY_LONGDOUBLE},
+    {&PyCFloatArrType_Type, NPY_CFLOAT},
+    {&PyCDoubleArrType_Type, NPY_CDOUBLE},
+    {&PyCLongDoubleArrType_Type, NPY_CLONGDOUBLE},
+    {&PyObjectArrType_Type, NPY_OBJECT},
+    {&PyStringArrType_Type, NPY_STRING},
+    {&PyUnicodeArrType_Type, NPY_UNICODE},
+    {&PyVoidArrType_Type, NPY_VOID},
+    {&PyDatetimeArrType_Type, NPY_DATETIME},
+    {&PyTimedeltaArrType_Type, NPY_TIMEDELTA},
+    {&PyHalfArrType_Type, NPY_HALF}
+};  /* get_typeobj_idx() binary-searches this once sorted */
+
+/* qsort comparator: orders scalar_type entries by type-object address. */
+static int compare_types(const void * a_, const void * b_)
+{
+    const scalar_type *lhs = (const scalar_type *)a_;
+    const scalar_type *rhs = (const scalar_type *)b_;
+
+    if (lhs->type == rhs->type) {
+        return 0;
+    }
+    /* Addresses are only compared, never subtracted. */
+    return (lhs->type < rhs->type) ? -1 : 1;
+}
+
+static void init_basetypes(void)
+{
+    /* Sort by type-object address so get_typeobj_idx() can bisect. */
+    const size_t count = sizeof(typeobjects) / sizeof(typeobjects[0]);
+    qsort(typeobjects, count, sizeof(typeobjects[0]), compare_types);
+}
+
+
+/* Binary-search the address-sorted typeobjects[]; return index or -1. */
+NPY_NO_EXPORT int
+get_typeobj_idx(PyTypeObject * obj)
+{
+    npy_intp lo = 0;
+    npy_intp hi = sizeof(typeobjects) / sizeof(typeobjects[0]) - 1;
+    while (lo <= hi) {
+        const npy_intp mid = lo + (hi - lo) / 2;
+        PyTypeObject *candidate = typeobjects[mid].type;
+        if (candidate == obj) {
+            return mid;
+        }
+        if (candidate < obj) {
+            lo = mid + 1;
+            continue;
+        }
+        hi = mid - 1;
+    }
+    return -1;
+}
+
+NPY_NO_EXPORT int is_anyscalar_exact(PyObject *obj)
+{
+    /* Exact match only: the lookup compares type-object addresses. */
+    return get_typeobj_idx(Py_TYPE(obj)) >= 0;
+}
+
+/*
+ * Return the type number whose scalar type object is exactly `type`, or
+ * NPY_NOTYPE if there is none.  When `user` is nonzero the registered user
+ * types are scanned as well, and a match there takes precedence.
+ */
+NPY_NO_EXPORT int
+_typenum_fromtypeobj(PyObject *type, int user)
+{
+    int k;
+    const int builtin_idx = get_typeobj_idx((PyTypeObject*)type);
+    const int builtin_num =
+        (builtin_idx >= 0) ? typeobjects[builtin_idx].typenum : NPY_NOTYPE;
+
+    if (!user) {
+        return builtin_num;
+    }
+
+    /* First matching user descriptor wins over the built-in result. */
+    for (k = 0; k < NPY_NUMUSERTYPES; k++) {
+        if (type == (PyObject *)(userdescrs[k]->typeobj)) {
+            return k + NPY_USERDEF;
+        }
+    }
+    return builtin_num;
+}
+
diff --git a/numpy/core/src/_generated/templ_common.h b/numpy/core/src/_generated/templ_common.h
new file mode 100644
index 000000000000..c62cae0868e0
--- /dev/null
+++ b/numpy/core/src/_generated/templ_common.h
@@ -0,0 +1,253 @@
+#line 1 "numpy/core/src/common/templ_common.h.src"
+
+/*
+ *****************************************************************************
+ **       This file was autogenerated from a template  DO NOT EDIT!!!!      **
+ **       Changes should be made to the original source (.src) file         **
+ *****************************************************************************
+ */
+
+#line 1
+#ifndef __NPY_TYPED_COMMON_INC
+#define __NPY_TYPED_COMMON_INC
+
+/* utility functions that profit from templates */
+
+#include "numpy/npy_common.h"
+#include <assert.h>
+
+#line 20
+/*
+ * Stores a * b in *r and returns 1 if the multiplication overflowed,
+ * 0 otherwise.
+ *
+ * Not designed for negative a or b, and that is not checked.  Without
+ * __builtin_mul_overflow this signed variant compares magnitudes via abs();
+ * NOTE(review): abs() of the most negative value is undefined -- confirm.
+ */
+static inline int
+npy_mul_with_overflow_int(npy_int * r, npy_int a, npy_int b)
+{
+#ifdef HAVE___BUILTIN_MUL_OVERFLOW
+    return __builtin_mul_overflow(a, b, r);
+#else
+    const npy_int half_sz = ((npy_int)1 << ((sizeof(a) * 8 - 1 ) / 2));
+
+    *r = a * b;
+    /*
+     * skip the costly division unless an operand is negative or >= half_sz
+     */
+    if ((NPY_UNLIKELY((a | b) >= half_sz) || (a | b) < 0) &&
+            a != 0 && 
+#if 1
+            abs(b) > abs(NPY_MAX_INT / a)
+#else
+            b > NPY_MAX_INT / a
+#endif
+            ) {
+        return 1;
+    }
+    return 0;
+#endif
+}
+
+#line 20
+/*
+ * Stores a * b in *r and returns 1 if the multiplication overflowed,
+ * 0 otherwise.
+ *
+ * The template text about negative operands does not matter here as long as
+ * npy_uint is unsigned: the `(a | b) < 0` test below can then never be true
+ * and the plain `b > NPY_MAX_UINT / a` comparison is the one compiled in.
+ */
+static inline int
+npy_mul_with_overflow_uint(npy_uint * r, npy_uint a, npy_uint b)
+{
+#ifdef HAVE___BUILTIN_MUL_OVERFLOW
+    return __builtin_mul_overflow(a, b, r);
+#else
+    const npy_uint half_sz = ((npy_uint)1 << ((sizeof(a) * 8 - 1 ) / 2));
+
+    *r = a * b;
+    /*
+     * skip the costly division unless an operand is >= half_sz
+     */
+    if ((NPY_UNLIKELY((a | b) >= half_sz) || (a | b) < 0) &&
+            a != 0 && 
+#if 0
+            abs(b) > abs(NPY_MAX_UINT / a)
+#else
+            b > NPY_MAX_UINT / a
+#endif
+            ) {
+        return 1;
+    }
+    return 0;
+#endif
+}
+
+#line 20
+/*
+ * Stores a * b in *r and returns 1 if the multiplication overflowed,
+ * 0 otherwise.
+ *
+ * Not designed for negative a or b, and that is not checked.  Without
+ * __builtin_mul_overflow this signed variant compares magnitudes via labs();
+ * NOTE(review): labs() of the most negative value is undefined -- confirm.
+ */
+static inline int
+npy_mul_with_overflow_long(npy_long * r, npy_long a, npy_long b)
+{
+#ifdef HAVE___BUILTIN_MUL_OVERFLOW
+    return __builtin_mul_overflow(a, b, r);
+#else
+    const npy_long half_sz = ((npy_long)1 << ((sizeof(a) * 8 - 1 ) / 2));
+
+    *r = a * b;
+    /*
+     * skip the costly division unless an operand is negative or >= half_sz
+     */
+    if ((NPY_UNLIKELY((a | b) >= half_sz) || (a | b) < 0) &&
+            a != 0 && 
+#if 1
+            labs(b) > labs(NPY_MAX_LONG / a)
+#else
+            b > NPY_MAX_LONG / a
+#endif
+            ) {
+        return 1;
+    }
+    return 0;
+#endif
+}
+
+#line 20
+/*
+ * Stores a * b in *r and returns 1 if the multiplication overflowed,
+ * 0 otherwise.
+ *
+ * The template text about negative operands does not matter here as long as
+ * npy_ulong is unsigned: the `(a | b) < 0` test below can then never be true
+ * and the plain `b > NPY_MAX_ULONG / a` comparison is the one compiled in.
+ */
+static inline int
+npy_mul_with_overflow_ulong(npy_ulong * r, npy_ulong a, npy_ulong b)
+{
+#ifdef HAVE___BUILTIN_MUL_OVERFLOW
+    return __builtin_mul_overflow(a, b, r);
+#else
+    const npy_ulong half_sz = ((npy_ulong)1 << ((sizeof(a) * 8 - 1 ) / 2));
+
+    *r = a * b;
+    /*
+     * skip the costly division unless an operand is >= half_sz
+     */
+    if ((NPY_UNLIKELY((a | b) >= half_sz) || (a | b) < 0) &&
+            a != 0 && 
+#if 0
+            labs(b) > labs(NPY_MAX_ULONG / a)
+#else
+            b > NPY_MAX_ULONG / a
+#endif
+            ) {
+        return 1;
+    }
+    return 0;
+#endif
+}
+
+#line 20
+/*
+ * Stores a * b in *r and returns 1 if the multiplication overflowed,
+ * 0 otherwise.
+ *
+ * Not designed for negative a or b, and that is not checked.  Without
+ * __builtin_mul_overflow this signed variant compares magnitudes via llabs();
+ * NOTE(review): llabs() of the most negative value is undefined -- confirm.
+ */
+static inline int
+npy_mul_with_overflow_longlong(npy_longlong * r, npy_longlong a, npy_longlong b)
+{
+#ifdef HAVE___BUILTIN_MUL_OVERFLOW
+    return __builtin_mul_overflow(a, b, r);
+#else
+    const npy_longlong half_sz = ((npy_longlong)1 << ((sizeof(a) * 8 - 1 ) / 2));
+
+    *r = a * b;
+    /*
+     * skip the costly division unless an operand is negative or >= half_sz
+     */
+    if ((NPY_UNLIKELY((a | b) >= half_sz) || (a | b) < 0) &&
+            a != 0 && 
+#if 1
+            llabs(b) > llabs(NPY_MAX_LONGLONG / a)
+#else
+            b > NPY_MAX_LONGLONG / a
+#endif
+            ) {
+        return 1;
+    }
+    return 0;
+#endif
+}
+
+#line 20
+/*
+ * Stores a * b in *r and returns 1 if the multiplication overflowed,
+ * 0 otherwise.
+ *
+ * The template text about negative operands does not matter here as long as
+ * npy_ulonglong is unsigned: `(a | b) < 0` below can then never be true and
+ * the plain `b > NPY_MAX_ULONGLONG / a` comparison is the one compiled in.
+ */
+static inline int
+npy_mul_with_overflow_ulonglong(npy_ulonglong * r, npy_ulonglong a, npy_ulonglong b)
+{
+#ifdef HAVE___BUILTIN_MUL_OVERFLOW
+    return __builtin_mul_overflow(a, b, r);
+#else
+    const npy_ulonglong half_sz = ((npy_ulonglong)1 << ((sizeof(a) * 8 - 1 ) / 2));
+
+    *r = a * b;
+    /*
+     * skip the costly division unless an operand is >= half_sz
+     */
+    if ((NPY_UNLIKELY((a | b) >= half_sz) || (a | b) < 0) &&
+            a != 0 && 
+#if 0
+            llabs(b) > llabs(NPY_MAX_ULONGLONG / a)
+#else
+            b > NPY_MAX_ULONGLONG / a
+#endif
+            ) {
+        return 1;
+    }
+    return 0;
+#endif
+}
+
+/* Stores a * b in *r; returns 1 if the product overflowed, otherwise 0. */
+static inline int
+npy_mul_sizes_with_overflow (npy_intp * r, npy_intp a, npy_intp b)
+{
+#ifdef HAVE___BUILTIN_MUL_OVERFLOW
+    return __builtin_mul_overflow(a, b, r);
+#else
+    /* this fallback path only supports non-negative a and b (asserted) */
+    assert(a >= 0 && b >= 0);
+    const npy_intp half_sz = ((npy_intp)1 << ((sizeof(a) * 8 - 1 ) / 2));
+
+    *r = a * b;
+    /*
+     * divide only when an operand reaches half_sz; smaller ones cannot overflow
+     */
+    if (NPY_UNLIKELY((a | b) >= half_sz)
+        && a != 0 && b > NPY_MAX_INTP / a) {
+        return 1;
+    }
+    return 0;
+#endif
+}
+
+#endif
+
diff --git a/z b/z
new file mode 100755
index 000000000000..7da7ec8905ec
--- /dev/null
+++ b/z
@@ -0,0 +1,20 @@
+#!/usr/bin/env bash
+# Copyright(c) The Maintainers of Nanvix.
+# Licensed under the MIT License.
+
+# Unified entry point: delegates to z.sh (Linux/macOS) or z.ps1 (Windows).
+# Requires nanvix-zutil (pip install nanvix-zutil); z.sh bootstraps it into .nanvix/venv/ if absent.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+case "$(uname -s)" in
+    CYGWIN* | MINGW* | MSYS*)
+        exec powershell.exe -NoProfile -ExecutionPolicy Bypass \
+            -File "$SCRIPT_DIR/z.ps1" "$@"
+        ;;
+    *)
+        exec "$SCRIPT_DIR/z.sh" "$@"
+        ;;
+esac
diff --git a/z.sh b/z.sh
new file mode 100755
index 000000000000..2a5bc0dd3280
--- /dev/null
+++ b/z.sh
@@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+# Copyright(c) The Maintainers of Nanvix.
+# Licensed under the MIT License.
+
+# Thin wrapper that delegates to the nanvix-zutil CLI.
+# Self-bootstraps nanvix-zutil into .nanvix/venv/ if it is not already installed.
+
+set -euo pipefail
+# NANVIX_ZUTIL_VERSION overrides the pinned release; a leading "v" is dropped.
+PINNED_VERSION="0.8.2"
+RAW_ZUTIL_VERSION="${NANVIX_ZUTIL_VERSION:-$PINNED_VERSION}"
+ZUTIL_VERSION="${RAW_ZUTIL_VERSION#v}"
+REPO_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd -P)"
+VENV="$REPO_ROOT/.nanvix/venv"  # virtualenv that bootstrap creates and distclean removes
+
+function _resolve_venv_paths() {
+    if [ -d "$VENV/Scripts" ]; then
+        VENV_BIN="$VENV/Scripts/nanvix-zutil.exe"
+        VENV_PYTHON="$VENV/Scripts/python.exe"
+    else
+        VENV_BIN="$VENV/bin/nanvix-zutil"
+        VENV_PYTHON="$VENV/bin/python"
+    fi
+}
+_resolve_venv_paths  # first pass; bootstrap repeats this once the venv exists
+ZUTIL_GLOBAL_VERSION="$(nanvix-zutil --version 2>/dev/null || true)"  # empty when not on PATH
+
+function bootstrap() {
+    echo "nanvix-zutil not found -- bootstrapping nanvix-zutil==${ZUTIL_VERSION}..." >&2
+
+    if ! command -v python3 &>/dev/null; then
+        echo "Error: python3 not found. Install Python 3 and ensure python3 is on PATH." >&2
+        exit 1
+    fi
+
+    WHEEL_URL="https://github.com/nanvix/zutils/releases/download/v${ZUTIL_VERSION}/nanvix_zutil-${ZUTIL_VERSION}-py3-none-any.whl"
+    if [ -d "$VENV" ]; then
+        python3 -m venv --clear "$VENV"
+    else
+        python3 -m venv "$VENV"
+    fi
+    _resolve_venv_paths
+    "$VENV_PYTHON" -m pip install --quiet "nanvix-zutil[lint] @ ${WHEEL_URL}"
+}
+
+BIN=""
+if [ ! -d "$VENV" ] && [ -z "$ZUTIL_GLOBAL_VERSION" ]; then
+    bootstrap
+    BIN="$VENV_BIN"
+elif [ -x "$VENV_BIN" ]; then
+    VENV_VERSION="$("$VENV_BIN" --version 2>/dev/null || true)"
+    if [ "$VENV_VERSION" != "nanvix-zutil ${ZUTIL_VERSION}" ]; then
+        echo "Warning: venv nanvix-zutil version mismatch. Re-bootstrapping..." >&2
+        bootstrap
+    fi
+    BIN="$VENV_BIN"
+elif [ -d "$VENV" ] && ! command -v nanvix-zutil &>/dev/null; then
+    bootstrap
+    BIN="$VENV_BIN"
+else
+    BIN="nanvix-zutil"
+fi
+
+# Export WITH_NANVIX as the absolute path of directory $1; exit 1 if cd fails.
+_export_with_nanvix() {
+    WITH_NANVIX="$(cd -- "$1" 2>/dev/null && pwd -P)" || {
+        echo "Error: --with-nanvix: cannot access directory '$1'." >&2
+        exit 1
+    }
+    export WITH_NANVIX
+}
+# Consume --with-nanvix[=]DIR here; every other argument is forwarded as-is.
+ARGS=()
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --with-nanvix=*) _export_with_nanvix "${1#--with-nanvix=}"; shift ;;
+        --with-nanvix)
+            [[ $# -ge 2 ]] || { echo "Error: --with-nanvix requires a directory." >&2; exit 1; }
+            _export_with_nanvix "$2"; shift 2 ;;
+        *) ARGS+=("$1"); shift ;;
+    esac
+done
+
+# "|| EC=$?" keeps `set -e` from aborting before the venv cleanup below.
+if [[ "${ARGS[0]:-}" == "distclean" ]]; then
+    EC=0; "$BIN" "${ARGS[@]}" || EC=$?
+    if [ -d "$VENV" ]; then rm -rf "$VENV" 2>/dev/null || true; fi
+    exit "$EC"
+fi
+# ${ARGS[@]+...}: empty arrays trip `set -u` on bash < 4.4 (e.g. macOS 3.2).
+exec "$BIN" ${ARGS[@]+"${ARGS[@]}"}